Example #1
def _make_package(args):  # pragma: no cover
    """Prepare transcriptiondata from the transcription sources."""
    from lingpy.sequence.sound_classes import token2class
    from lingpy.data import Model

    columns = ['LATEX', 'FEATURES', 'SOUND', 'IMAGE', 'COUNT', 'NOTE']
    bipa = TranscriptionSystem('bipa')
    for src, rows in args.repos.iter_sources(type='td'):
        args.log.info('TranscriptionData {0} ...'.format(src['NAME']))
        uritemplate = URITemplate(
            src['URITEMPLATE']) if src['URITEMPLATE'] else None
        out = [[
            'BIPA_GRAPHEME', 'CLTS_NAME', 'GENERATED', 'EXPLICIT', 'GRAPHEME',
            'URL'
        ] + columns]
        graphemes = set()
        for row in rows:
            if row['GRAPHEME'] in graphemes:
                args.log.warn('skipping duplicate grapheme: {0}'.format(
                    row['GRAPHEME']))
                continue
            graphemes.add(row['GRAPHEME'])
            if not row['BIPA']:
                bipa_sound = bipa[row['GRAPHEME']]
                explicit = ''
            else:
                bipa_sound = bipa[row['BIPA']]
                explicit = '+'
            generated = '+' if bipa_sound.generated else ''
            if is_valid_sound(bipa_sound, bipa):
                bipa_grapheme = bipa_sound.s
                bipa_name = bipa_sound.name
            else:
                bipa_grapheme, bipa_name = '<NA>', '<NA>'
            url = uritemplate.expand(
                **row) if uritemplate else row.get('URL', '')
            out.append([
                bipa_grapheme, bipa_name, generated, explicit, row['GRAPHEME'],
                url
            ] + [row.get(c, '') for c in columns])
        found = len([o for o in out if o[0] != '<NA>'])
        args.log.info('... {0} of {1} graphemes found ({2:.0f}%)'.format(
            found, len(out), found / len(out) * 100))
        with UnicodeWriter(pkg_path('transcriptiondata',
                                    '{0}.tsv'.format(src['NAME'])),
                           delimiter='\t') as writer:
            writer.writerows(out)

    count = 0
    with UnicodeWriter(pkg_path('soundclasses', 'lingpy.tsv'),
                       delimiter='\t') as writer:
        writer.writerow(['CLTS_NAME', 'BIPA_GRAPHEME'] + SOUNDCLASS_SYSTEMS)
        for grapheme, sound in sorted(bipa.sounds.items()):
            if not sound.alias:
                writer.writerow([sound.name, grapheme] + [
                    token2class(grapheme, Model(cls))
                    for cls in SOUNDCLASS_SYSTEMS
                ])
                count += 1
    args.log.info('SoundClasses: {0} written to file.'.format(count))
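In brief, the pattern above: open a UnicodeWriter on the target path as a context manager, then feed it rows of strings. A minimal sketch, assuming clldutils is installed (path and rows are made up for illustration):

from clldutils.dsv import UnicodeWriter

# delimiter is passed through to the underlying csv.writer
rows = [
    ['GRAPHEME', 'COUNT'],
    ['a', '3'],
]
with UnicodeWriter('example.tsv', delimiter='\t') as writer:
    writer.writerows(rows)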
Example #2
 def before(self, req, fp):
     self.writer = UnicodeWriter(fp)
     self.writer.__enter__()
     self.writer.writerow([
         f if isinstance(f, string_types) else f[1]
         for f in self.get_fields(req)
     ])
Example #3
 def writerow(self, row):
     if not self.header_written:
         UnicodeWriter.writerow(
             self, [col.name for col in self.table.schema.columns.values()])
         self.header_written = True
     if not isinstance(row, Row):
         row = Row.from_list(self.table.schema, row)
     else:
         assert row.schema == self.table.schema
     UnicodeWriter.writerow(self, row.to_list())
Example #4
    def __init__(self, table, container=None, **kw):
        self.table, kw['delimiter'] = _table_and_delimiter(table)

        if isinstance(container, Archive):
            f = None
        elif isinstance(container, Path):
            f = container.joinpath(self.table.url)
        else:
            f = self.table.url  # pragma: no cover

        self.container = container
        self.header_written = not self.table.dialect.header
        UnicodeWriter.__init__(self, f, **kw)
Example #5
def iso2codes(args):
    from clldutils.dsv import UnicodeWriter

    nodes = list(args.repos.languoids())

    res = {}
    for node in nodes:
        if node.iso:
            res[node.id] = (node.iso, set())

    for node in nodes:
        if node.level == Level.family or node.id in res:
            continue
        for nid in res:
            matched = False
            for l in node.lineage:
                if l[1] == nid:
                    res[nid][1].add(node.id)
                    matched = True
                    break
            if matched:
                break

    with UnicodeWriter('iso2glottocodes.csv') as writer:
        writer.writerow(['iso', 'glottocodes'])
        for gc, (iso, gcs) in res.items():
            writer.writerow([iso, ';'.join([gc] + list(gcs))])
Example #6
    def get_values(self, p, language_url_pattern):
        q = DBSession.query(Value).join(Value.valueset)\
            .filter(ValueSet.parameter_pk == p.pk)\
            .options(
            joinedload(Value.valueset, ValueSet.language),
            joinedload(Value.valueset, ValueSet.contribution),
            joinedload(Value.domainelement),
            joinedload_all(Value.valueset, ValueSet.references, ValueSetReference.source)
        ).order_by(ValueSet.parameter_pk, ValueSet.language_pk, Value.pk)

        with UnicodeWriter() as writer:
            writer.writerow([
                'ID',
                'Language_ID',
                'Parameter_ID',
                'Contribution_ID',
                'Value',
                'Source',
                'Comment',
            ])
            for v in page_query(q):
                writer.writerow([
                    v.id,
                    language_url_pattern.format(v.valueset.language.id),
                    p.id,
                    v.valueset.contribution.id,
                    v.domainelement.name if v.domainelement else v.name,
                    ';'.join(self.format_sources(v)),
                    getattr(v, 'comment', v.valueset.source) or '',
                ])

        return writer.read()
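Constructed without a file argument, UnicodeWriter buffers rows in memory, and read() returns the serialized CSV after writing; in the clldutils versions these examples target, read() returns bytes, which is why other examples call .decode('utf-8') on the result. A minimal sketch under that assumption:

from clldutils.dsv import UnicodeWriter

with UnicodeWriter() as writer:  # no target file: write to an in-memory buffer
    writer.writerow(['ID', 'Value'])
    writer.writerow(['1', 'foo'])
data = writer.read()  # serialized CSV, still available after the with block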
Example #7
def iso2codes(args):
    """
    Map ISO codes to the list of all Glottolog languages and dialects subsumed "under" it.
    """
    from clldutils.dsv import UnicodeWriter

    nodes = list(args.repos.languoids())

    res = {}
    for node in nodes:
        if node.iso:
            res[node.id] = (node.iso, set())

    for node in nodes:
        if node.level == Level.family or node.id in res:
            continue
        for nid in res:
            matched = False
            for l in node.lineage:
                if l[1] == nid:
                    res[nid][1].add(node.id)
                    matched = True
                    break
            if matched:
                break

    outdir = Path('.') if not args.args else Path(args.args[0])
    with UnicodeWriter(outdir / 'iso2glottocodes.csv') as writer:
        writer.writerow(['iso', 'glottocodes'])
        for gc, (iso, gcs) in res.items():
            writer.writerow([iso, ';'.join([gc] + list(gcs))])
Example #8
    def cmd_download(self, **kw):
        # download source
        self.raw.write('sources.bib', getEvoBibAsBibtex(SOURCE, **kw))

        # download data
        all_records = []
        for i in pb(list(range(1, 20 * self.pages + 1, 20))):
            with self.raw.temp_download(self._url(i),
                                        'file-{0}'.format(i),
                                        log=self.log) as fname:
                soup = BeautifulSoup(
                    fname.open(encoding='utf8').read(), 'html.parser')
                for record in soup.findAll(name='div',
                                           attrs={"class": "results_record"}):
                    if isinstance(record, bs4.element.Tag):
                        children = list(record.children)
                        number = children[0].findAll('span')[1].text.strip()
                        concept = children[1].findAll('span')[1].text
                        for child in children[2:]:
                            if isinstance(child, bs4.element.Tag):
                                dpoints = child.findAll('span')
                                if len(dpoints) >= 3:
                                    lname = dpoints[1].text
                                    glottolog = re.findall(
                                        'Glottolog: (........)',
                                        str(dpoints[1]))[0]
                                    entry = dpoints[2].text
                                    cogid = list(
                                        child.children)[4].text.strip()
                                    all_records.append(
                                        (number, concept, lname, glottolog,
                                         entry, cogid))
        with UnicodeWriter(self.raw.posix('output.csv')) as f:
            f.writerows(all_records)
Example #9
def geo(args):
    with_session(args)
    fname = args.pkg_dir.joinpath('static', 'download',
                                  'languages-and-dialects-geo.csv')
    with transaction.manager, UnicodeWriter(fname) as writer:
        writer.writerow([
            'glottocode', 'name', 'isocodes', 'level', 'macroarea', 'latitude',
            'longitude'
        ])
        for l in DBSession.query(models.Languoid)\
                .filter(or_(
                    models.Languoid.level == models.LanguoidLevel.dialect,
                    models.Languoid.level == models.LanguoidLevel.language))\
                .options(
                    joinedload(models.Languoid.macroareas),
                    joinedload_all(
                        common.Language.languageidentifier,
                        common.LanguageIdentifier.identifier))\
                .order_by(common.Language.name):
            writer.writerow([
                l.id, l.name, ' '.join(
                    i.name
                    for i in l.get_identifier_objs(common.IdentifierType.iso)),
                l.level, l.macroareas[0].name if l.macroareas else '',
                l.latitude if l.latitude is not None else '',
                l.longitude if l.longitude is not None else ''
            ])

    args.log.info('{0} written'.format(fname))
Example #10
def word_length(args):
    from pyconcepticon.api import Concepticon

    c = Concepticon(args.concepticon_repos)
    res = defaultdict(lambda: defaultdict(list))

    def _word_length(ds, **kw):
        ds.word_length(res)

    with_dataset(args, _word_length)
    concepts = c.conceptsets
    languoids = {l.id: l for l in Glottolog(args.glottolog_repos).languoids()}

    with UnicodeWriter('wordlength.csv') as writer:
        writer.writerow([
            'Concepticon_ID', 'Gloss', 'Semanticfield', 'Category',
            'Glottocode', 'Variety', 'Family', 'Form', 'Length'
        ])
        for pid, langs in res.items():
            if len(langs) >= 500:
                for (lang, variety), forms in langs.items():
                    if lang in languoids:
                        lengths = [len(f.split()) for f in forms]
                        lang = languoids[lang]
                        family = lang.lineage[0][0] if lang.lineage else ''
                        c = concepts[pid]
                        writer.writerow([
                            pid, c['GLOSS'], c['SEMANTICFIELD'],
                            c['ONTOLOGICAL_CATEGORY'], lang.id, variety,
                            family, forms[0],
                            sum(lengths) / len(lengths)
                        ])
Example #11
def markconservative(m, trigs, ref, hht, outfn, verbose=True, rank=None):
    blamefield = "hhtype"
    mafter = markall(m, trigs, verbose=verbose, rank=rank)
    ls = lstat(ref, hht)
    lsafter = lstat_witness(mafter, hht)
    log = []
    for (lg, (stat, wits)) in lsafter.items():
        if not ls.get(lg):
            if verbose:
                print(lg, "lacks status", [mafter[k][1]['srctrickle'] for k in wits])
            continue
        if hht[stat] > hht[ls[lg]]:
            log = log + [
                (lg, [(mafter[k][1].get(blamefield, "No %s" % blamefield),
                       k,
                       mafter[k][1].get('title', 'no title'),
                       mafter[k][1].get('srctrickle', 'no srctrickle')) for k in wits], ls[lg])]
            for k in wits:
                (t, f) = mafter[k]
                if blamefield in f:
                    del f[blamefield]
                mafter[k] = (t, f)
    with UnicodeWriter(outfn, dialect='excel-tab') as writer:
        writer.writerows(((lg, was) + mis for (lg, miss, was) in log for mis in miss))
    return mafter
Example #12
def markconservative(m, trigs, ref, hht, outfn, verbose=True, rank=None):
    blamefield = "hhtype"
    mafter = markall(m, trigs, verbose=verbose, rank=rank)
    ls = lstat(ref, hht)
    lsafter = lstat_witness(mafter, hht)
    log = []
    no_status = defaultdict(set)
    for (lg, (stat, wits)) in lsafter.items():
        if not ls.get(lg):
            srctrickles = [mafter[k][1]['srctrickle'] for k in wits]
            for t in srctrickles:
                if not t.startswith('iso6393'):
                    no_status[lg].add(t)
            continue
        if hht[stat] > hht[ls[lg]]:
            log = log + [(lg, [(mafter[k][1].get(
                blamefield, "No %s" % blamefield), k, mafter[k][1].get(
                    'title', 'no title'), mafter[k][1].get(
                        'srctrickle', 'no srctrickle'))
                               for k in wits], ls[lg])]
            for k in wits:
                (t, f) = mafter[k]
                if blamefield in f:
                    del f[blamefield]
                mafter[k] = (t, f)
    for lg in no_status:
        print('{0} lacks status'.format(lg))
    with UnicodeWriter(outfn, dialect='excel-tab') as writer:
        writer.writerows(
            ((lg, was) + mis for (lg, miss, was) in log for mis in miss))
    return mafter
Example #13
 def write(self, path, sep="\t"):
     with UnicodeWriter(path, delimiter=sep) as writer:
         for i, item in enumerate(self):
             if i == 0:
                 writer.writerow(list(item.keys()))
             writer.writerow(list(item.values()))
     if path is None:
         return writer.read()
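This example combines both modes: a real path writes the file to disk, while path=None buffers the rows so that writer.read() after the with block returns the serialized table instead.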
Example #14
    def render(self, ctx, req):
        with UnicodeWriter() as writer:
            writer.writerow(self.header(ctx, req))

            for item in ctx.get_query(limit=csv.QUERY_LIMIT):
                writer.writerow(self.row(ctx, req, item))

            return writer.read()
Example #15
def languoids(langs, outdir):
    with UnicodeWriter(outdir.joinpath('csv', 'glottolog.csv')) as writer:
        writer.writerow(['id', 'name', 'family_id', 'family_name', 'iso_code'])
        for lang in sorted(langs):
            writer.writerow([
                lang.id, lang.name, lang.lineage[0][1] if lang.lineage else '',
                lang.lineage[0][0] if lang.lineage else '', lang.iso or ''
            ])
Example #16
def locations(glottolog, fid, outpath):
    with UnicodeWriter(outpath) as writer:
        writer.writerow(['name', 'glottocode', 'latitude', 'longitude'])
        for lang in glottolog.languoids():
            if lang.level == Level.language and lang.latitude is not None:
                if fid in [l[1] for l in lang.lineage]:
                    writer.writerow(
                        [lang.name, lang.id, lang.latitude, lang.longitude])
Example #17
 def render(self, ctx, req):
     with UnicodeWriter() as writer:
         rows = iter(ctx.get_query(limit=QUERY_LIMIT))
         first = next(rows, None)
         if first is not None:
             cols = first.csv_head()
             writer.writerow(cols)
             for item in chain([first], rows):
                 writer.writerow(item.to_csv(ctx=ctx, req=req, cols=cols))
         return writer.read()
Example #18
 def to_csvfile(self, filename):
     """Write a CSV file with one row for each entry in each bibfile."""
     with self.connect() as conn:
         cursor = conn.execute(
             'SELECT filename, bibkey, hash, cast(id AS text) AS id '
             'FROM entry ORDER BY lower(filename), lower(bibkey), hash, id')
         with UnicodeWriter(filename) as writer:
             writer.writerow([col[0] for col in cursor.description])
             for row in cursor:
                 writer.writerow(row)
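The header row comes straight from the DB-API here: per PEP 249, cursor.description is a sequence of 7-tuples whose first element is the column name, so the CSV header always mirrors the SELECT list.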
Example #19
 def render(self, data, accepted_media_type=None, renderer_context=None):
     "Renders a list of SocietyResultSets to CSV"
     if data is None:
         return ''
     results = DPLACECSVResults(data)
     with UnicodeWriter() as writer:
         writer.writerow([CSV_PREAMBLE])
         writer.writerow(results.field_names)
         for row in results:
             writer.writerow(row)
     return writer.read()
Example #20
def xls2csv(fname, outdir=None):
    res = {}
    outdir = outdir or fname.parent
    wb = xlrd.open_workbook(as_posix(fname))
    for sname in wb.sheet_names():
        sheet = wb.sheet_by_name(sname)
        if sheet.nrows:
            path = outdir.joinpath(fname.stem + '.' +
                                   slug(sname, lowercase=False) + '.csv')
            with UnicodeWriter(path) as writer:
                for i in range(sheet.nrows):
                    writer.writerow([col.value for col in sheet.row(i)])
            res[sname] = path
    return res
Example #21
 def render(self, ctx, req):
     fid = req.route_url('parameter', id='xxx').replace('xxx', '{0}')
     lid = req.route_url('language', id='xxx').replace('xxx', '{0}')
     with UnicodeWriter() as writer:
         writer.writerow(['Language_ID', 'Feature_ID', 'Value'])
         for _lid, _fid, v in DBSession.query(
                     Language.id, Parameter.id, Value.name)\
                 .filter(Language.pk == ValueSet.language_pk)\
                 .filter(Parameter.pk == ValueSet.parameter_pk)\
                 .filter(Value.valueset_pk == ValueSet.pk)\
                 .order_by(Parameter.pk, Language.id):
             if v:
                 writer.writerow([lid.format(_lid), fid.format(_fid), v])
         return writer.read()
Example #22
def download(dataset, **kw):
    def rp(*names):
        return dataset.raw.joinpath(*names).as_posix()

    download_and_unpack_zipfiles(URL, dataset, FNAME)
    check_call('libreoffice --headless --convert-to docx %s --outdir %s' %
               (rp(FNAME), rp()),
               shell=True)

    doc = Document(rp(Path(FNAME).stem + '.docx'))
    for i, table in enumerate(doc.tables):
        with UnicodeWriter(rp('%s.csv' % (i + 1, ))) as writer:
            for row in table.rows:
                writer.writerow(map(text_and_color, row.cells))
Example #23
def lookup(args):
    """
    Looks up a single gloss from the commandline.

    concepticon lookup <gloss1 gloss2 ... glossN>
    """
    api = Concepticon()
    found = api.lookup(args.args,
                       language=args.language,
                       full_search=args.full_search,
                       similarity_level=args.similarity)
    with UnicodeWriter(None, delimiter='\t') as writer:
        writer.writerow(
            ['GLOSS', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS', 'SIMILARITY'])
        for f in found:
            writer.writerow(f)
        print(writer.read().decode('utf-8'))
Example #24
def write_tree(tree, fname, taxa_in_dplace, societies_by_glottocode):
    if not fname.exists():
        fname.mkdir()
    tree.prune([n.encode('ascii') for n in taxa_in_dplace])

    with fname.joinpath('summary.trees').open('w', encoding="utf-8") as handle:
        handle.write(
            NEXUS_TEMPLATE.format(tree.name if tree.name else 'UNTITLED',
                                  tree.write(format=9)))

    with UnicodeWriter(fname.joinpath('taxa.csv')) as writer:
        writer.writerow(['taxon', 'glottocode', 'xd_ids', 'soc_ids'])
        for gc in sorted(taxa_in_dplace):
            socs = societies_by_glottocode[gc]
            writer.writerow([
                gc, gc, ', '.join(set(s.xd_id for s in socs)),
                ', '.join(s.id for s in socs)
            ])
    return tree
Example #25
def orthography(args):  # pragma: no cover
    ds = get_dataset(args)
    out = ds.dir.joinpath('orthography.tsv')
    if out.exists():
        if not confirm(
                'There already is an orthography profile for this dataset. Overwrite?',
                default=False):
            return

    graphemes = Counter()
    for line in ds.iter_raw_lexemes():
        graphemes.update(grapheme_pattern.findall(line))

    with UnicodeWriter(out, delimiter='\t') as writer:
        writer.writerow(['graphemes', 'frequency', 'IPA'])
        for grapheme, frequency in graphemes.most_common():
            writer.writerow([grapheme, '{0}'.format(frequency), grapheme])

    log_dump(out, log=args.log)
Example #26
 def create(self, req, filename=None, verbose=True):  # pragma: no cover
     meanings = [(p.name, p.id)
                 for p in DBSession.query(Parameter).order_by(Parameter.pk)]
     tmp = mkdtemp()
     path = os.path.join(tmp, 'asjp.tab')
     with UnicodeWriter(f=path, delimiter=binary_type("\t")) as writer:
         writer.writerow([f[0]
                          for f in self.fields] + [m[0] for m in meanings])
         for lang in DBSession.query(Doculect).order_by(
                 Doculect.pk).options(
                     joinedload_all(Language.valuesets, ValueSet.values),
                     joinedload_all(Language.valuesets,
                                    ValueSet.parameter)).limit(10000):
             row = [f[1](lang) for f in self.fields]
             vss = {vs.parameter.id: vs for vs in lang.valuesets}
             row.extend(
                 [Doculect.format_words(vss.get(m[1])) for m in meanings])
             writer.writerow(row)
     Download.create(self, req, filename=path)
     rmtree(tmp)
Example #27
def _freeze(table, fpath):
    def conv(v, col):
        if v is None:
            return ''
        if isinstance(col.type, DeclEnumType):  # pragma: no cover
            return v.value
        if isinstance(col.type, JSONEncodedDict):
            return json.dumps(v)
        if isinstance(v, (datetime, date)):
            return v.isoformat()
        return v

    keys = [col.name for col in table.columns]
    cols = {col.name: col for col in table.columns}
    rows = [keys]

    for row in DBSession.execute(select([table])):
        rows.append([conv(row[key], cols[key]) for key in keys])

    if len(rows) > 1:
        with UnicodeWriter(fpath) as writer:
            writer.writerows(rows)
Example #28
class CsvDump(Download):
    """Download of a resource type as csv."""

    ext = 'csv'

    def __init__(self, model, pkg, fields=None, **kw):
        """Initialize.

        fields can be a list of column names or a dictionary mapping model attribute
        names to csv column names.
        """
        super(CsvDump, self).__init__(model, pkg, **kw)
        self.fields = fields
        self.writer = None

    def get_stream(self):
        return StringIO(newline='') if PY3 else BytesIO()

    def read_stream(self, fp):
        res = Download.read_stream(self, fp)
        if PY3:  # pragma: no cover
            res = res.encode('utf8')
        return res

    def get_fields(self, req):
        if not self.fields:
            self.fields = ['id', 'name']
        return self.fields

    def before(self, req, fp):
        self.writer = UnicodeWriter(fp)
        self.writer.__enter__()
        self.writer.writerow([
            f if isinstance(f, string_types) else f[1]
            for f in self.get_fields(req)
        ])

    def row(self, req, fp, item, index):
        return [
            getattr(item, f if isinstance(f, string_types) else f[0])
            for f in self.get_fields(req)
        ]

    def dump(self, req, fp, item, index):
        self.writer.writerow(self.row(req, fp, item, index))
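before() drives the writer's context-manager protocol by hand: __enter__ is called once, then dump() appends one row per item. A sketch of the same idea in isolation, with the balancing __exit__ made explicit (the file name is made up for illustration):

from clldutils.dsv import UnicodeWriter

writer = UnicodeWriter('dump.csv')
writer.__enter__()  # what before() does once per download
try:
    writer.writerow(['id', 'name'])  # what dump() does per item
finally:
    writer.__exit__(None, None, None)  # flush and close the target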
Example #29
def write_conceptlist(clist, filename, header=False):
    """
    Write conceptlist to file.
    """
    def natural_sort(l):
        """
        Code-piece from
        http://stackoverflow.com/questions/4836710/does-python-have-a-built-in-function-for-string-natural-sort
        """
        convert = lambda text: int(text) if text.isdigit() else text.lower()
        alphanum_key = lambda key: [
            convert(c) for c in re.split('([0-9]+)', key)
        ]
        return sorted(l, key=alphanum_key)

    header = header or clist['header']
    keys = natural_sort(list(clist.keys()))
    with UnicodeWriter(filename, delimiter='\t') as writer:
        writer.writerow(header)
        for k in keys:
            v = clist[k]
            if k not in ['splits', 'mergers', 'header']:
                writer.writerow([v[h] for h in header])
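The nested natural_sort helper splits digit runs out of each key and compares them as integers, so '2' sorts before '10'. Out of context it behaves like this (hypothetical keys):

natural_sort(['ID10', 'ID2', 'ID1'])  # -> ['ID1', 'ID2', 'ID10']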
Example #30
class CsvDump(Download):

    """Download of a resource type as csv."""

    ext = 'csv'

    def __init__(self, model, pkg, fields=None, **kw):
        """Initialize.

        fields can be a list of column names or a dictionary mapping model attribute
        names to csv column names.
        """
        super(CsvDump, self).__init__(model, pkg, **kw)
        self.fields = fields
        self.writer = None

    def get_stream(self):
        return StringIO(newline='') if PY3 else BytesIO()

    def read_stream(self, fp):
        res = Download.read_stream(self, fp)
        if PY3:  # pragma: no cover
            res = res.encode('utf8')
        return res

    def get_fields(self, req):
        if not self.fields:
            self.fields = ['id', 'name']
        return self.fields

    def before(self, req, fp):
        self.writer = UnicodeWriter(fp)
        self.writer.__enter__()
        self.writer.writerow(
            [f if isinstance(f, string_types) else f[1] for f in self.get_fields(req)])

    def row(self, req, fp, item, index):
        return [getattr(item, f if isinstance(f, string_types) else f[0])
                for f in self.get_fields(req)]

    def dump(self, req, fp, item, index):
        self.writer.writerow(self.row(req, fp, item, index))
Example #31
def main():
    socs = read_win1252(
        'ALL_soc_ids_to_lang_wAltNames_sources_5Sept2017_win1252.csv')
    links = {
        r['soc_id']: r
        for r in read_win1252(
            'ALL_soc_links_to_other_databases_30Aug2017_win1252.csv')
    }
    locations = {
        'SCCS' + r['soc_id']: r
        for r in reader('../../legacy/LatLong_data.csv', dicts=True)
    }
    for row in reader(
            '../WNAI/DPLACE_RevisedLatLong_27April2017_inclWNAI_SCCS.csv',
            dicts=True):
        if row['Dataset'] == 'SCCS':
            locations[row['soc_id']]['Lat'] = row['soc.latitude']
            locations[row['soc_id']]['Long'] = row['soc.longitude']

    with UnicodeWriter('societies.csv') as w:
        w.writerow([f.name for f in attr.fields(Society)])
        for soc in socs:
            kw = {
                'id': soc['soc_id'],
                'glottocode': soc['glottolog_id'],
                'glottocode_comment': soc['Lang_assignment_change_notes']
            }
            for col in [
                    'xd_id',
                    'pref_name_for_society',
                    'ORIG_name_and_ID_in_this_dataset',
                    'alt_names_by_society',
                    'main_focal_year',
            ]:
                kw[col] = soc[col]

            for col in ['Lat', 'Long', 'origLat', 'origLong', 'Comment']:
                kw[col] = locations[soc['soc_id']][col]

            kw['HRAF_name_ID'] = links[soc['soc_id']]['HRAF_name_ID']
            kw['HRAF_link'] = links[soc['soc_id']]['HRAF_link']
            w.writerow(attr.astuple(Society(**kw)))

    with UnicodeWriter('societies_mapping.csv') as w:
        w.writerow(['id', 'related'])
        for sid, l in links.items():
            rels = []
            for dsid, suffix in [
                ('EA', '1'),
                ('EA', '2'),
                ('Binford', '1'),
                ('Binford', '2'),
                ('Binford', '3'),
                ('SCCS', ''),
                ('WNAI', '1'),
                ('WNAI', '2'),
                ('WNAI', '3'),
                ('WNAI', '4'),
                ('WNAI', '5'),
            ]:
                if dsid == 'SCCS':
                    label = l['{0}_society_equivalent{1}'.format(dsid, suffix)]
                else:
                    label = l['{0}_label_society_equivalent{1}'.format(
                        dsid, suffix)]
                id = l['{0}_id_society_equivalent{1}'.format(dsid, suffix)]
                if label and id:
                    rels.append('{0}: {1} [{2}]'.format(dsid, label, id))
            w.writerow([sid, '; '.join(rels)])

    var_info = {
        r['source']: r['APA_reference']
        for r in read_win1252('SCCS_variable_sources_bibtex_to_APA.csv',
                              ignore_dataset=True)
    }

    with UnicodeWriter('variables.csv') as w:
        fm = OrderedDict([
            ('VarID', 'id'),
            ('Category', 'category'),
            ('VarTitle', 'title'),
            ('VarDefinition', 'definition'),
            ('VarType', 'type'),
            ('UserNotes', 'notes'),
            ('source', 'source'),
            ('VarTitleShort', 'changes'),
            ('Unit', 'units'),
        ])
        w.writerow(fm.values())
        for row in read_win1252(
                'SCCS_Full_VariableList_12Sept2017_win1252.csv'):
            row['VarID'] = 'SCCS' + row['VarID']
            row['VarType'] = row['VarType'].capitalize()
            if row['VarDefinition']:
                row['VarDefinition'] += '\n\n'
            row['VarDefinition'] += var_info.get(row['source'], row['source'])
            w.writerow([row[f] for f in fm.keys()])

    with UnicodeWriter('codes.csv') as w:
        fm = OrderedDict([
            ('VarID', 'var_id'),
            ('Code', 'code'),
            ('CodeDescription', 'description'),
            ('ShortName', 'name'),
        ])
        w.writerow(fm.values())
        for row in read_win1252(
                'SCCS_CodeDescriptions_12Sept2017_win1252.csv'):
            row['VarID'] = 'SCCS' + row['VarID']
            w.writerow([row[f] for f in fm.keys()])

    with UnicodeWriter('data.csv') as w:
        fm = OrderedDict([
            ('soc_id', 'soc_id'),
            ('SubCase', 'sub_case'),
            ('Year', 'year'),
            ('VarID', 'var_id'),
            ('Code', 'code'),
            ('EthnoReferences', 'references'),
            ('AdminComment', 'admin_comment'),
            ('UserComment', 'comment'),
            ('SourceCodedData', 'source_coded_data'),
        ])
        w.writerow(fm.values())
        for row in read_win1252(
                'Full_SCCS_data_12Sept2017_FINAL_329451rows_win1252.csv'):
            row['VarID'] = 'SCCS' + row['VarID']
            w.writerow([row[f] for f in fm.keys()])
Example #32
def extract(args):
    import argparse
    usage = """
    dplace %(prog)s - extracts subsets of data for further processing.
    
    To filter societies:
    
    > dplace %(prog)s --society Cj4,Cj5,Cj6 output.csv

    To filter societies on a given tree:
    
    > dplace %(prog)s --tree gray_et_al2009 output.csv
    
    To filter societies only from a given dataset:
    
    > dplace %(prog)s --dataset EA output.csv
    """
    parser = argparse.ArgumentParser(prog='extract', usage=usage)
    parser.add_argument('filename', help='filename', default=None)
    parser.add_argument('--society',
                        help='restrict to these society ids (x,y,z)',
                        default=None)
    parser.add_argument('--tree', help='restrict to this tree', default=None)
    parser.add_argument('--dataset',
                        help='restrict to these datasets (x,y,z)',
                        default=None)
    parser.add_argument('--variable',
                        help='restrict to these variables (x,y,z)',
                        default=None)
    xargs = parser.parse_args(args.args)

    datasets = xargs.dataset.split(",") if xargs.dataset else None
    variables = xargs.variable.split(",") if xargs.variable else None
    societies = xargs.society.split(",") if xargs.society else None

    # get tree if given
    if xargs.tree:
        # get trees
        trees = {t.id: t for t in args.repos.phylogenies}
        try:
            tree = trees[xargs.tree]
        except KeyError:
            raise SystemExit("Failed to find Tree %s" % xargs.tree)
        societies = [
            s for sublist in [t.soc_ids for t in tree.taxa] for s in sublist
        ]

    with UnicodeWriter(f=xargs.filename) as out:
        header = [
            'ID', 'XD_ID', 'Glottocode', 'Name', 'OriginalName', 'FocalYear',
            'Latitude', 'Longitude', 'Variable', 'Value'
        ]
        out.writerow(header)

        for record in args.repos.iter_data(datasets=datasets,
                                           variables=variables,
                                           societies=societies):

            s = args.repos.societies.get(record.soc_id, None)
            if s is None:
                # we get these warnings as we are currently missing the SCCS
                # and WNAI data
                args.log.warn("Missing society definition for %s" %
                              record.soc_id)
                continue

            row = [
                s.id, s.xd_id, s.glottocode, s.pref_name_for_society,
                s.ORIG_name_and_ID_in_this_dataset, s.main_focal_year, s.Lat,
                s.Long, record.var_id, record.code
            ]
            out.writerow(row)
Example #33
    def map(self,
            clist,
            otherlist=None,
            out=None,
            full_search=False,
            similarity_level=5,
            language='en'):
        assert clist.exists(), "File %s does not exist" % clist
        from_ = []
        for item in read_dicts(clist):
            from_.append((item.get('ID', item.get('NUMBER')),
                          item.get('GLOSS', item.get('ENGLISH'))))

        to = self._get_map_for_language(language, otherlist)

        if not full_search:
            cmap = concept_map2([i[1] for i in from_], [i[1] for i in to],
                                similarity_level=similarity_level,
                                freqs=self.frequencies,
                                language=language)
            good_matches = 0
            with UnicodeWriter(out, delimiter='\t') as writer:
                writer.writerow([
                    'ID', 'GLOSS', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS',
                    'SIMILARITY'
                ])
                for i, (fid, fgloss) in enumerate(from_):
                    row = [fid, fgloss]
                    matches, sim = cmap.get(i, ([], 10))
                    if sim <= 5:
                        good_matches += 1
                    if not matches:
                        writer.writerow(row + ['', '???', ''])
                    elif len(matches) == 1:
                        row.extend([
                            to[matches[0]][0],
                            to[matches[0]][1].split('///')[0], sim
                        ])
                        writer.writerow(row)
                    else:
                        # we need a list to retain the order by frequency
                        visited = []
                        for j in matches:
                            gls, cid = to[j][0], to[j][1].split('///')[0]
                            if (gls, cid) not in visited:
                                visited += [(gls, cid)]
                        if len(visited) > 1:
                            writer.writerow(['<<<', '', '', ''])
                            for gls, cid in visited:
                                writer.writerow(row + [gls, cid, sim])
                            writer.writerow(['>>>', '', '', ''])
                        else:
                            row.extend([visited[0][0], visited[0][1], sim])
                            writer.writerow(row)
                writer.writerow([
                    '#', good_matches,
                    len(from_), '{0:.2f}'.format(good_matches / len(from_))
                ])
        else:
            cmap = concept_map([i[1] for i in from_], [
                i[1] for i in self._get_map_for_language(language, otherlist)
            ],
                               similarity_level=similarity_level)
            with UnicodeWriter(out, delimiter='\t') as writer:
                writer.writerow(
                    ['ID', 'GLOSS', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS'])
                for i, (fid, fgloss) in enumerate(from_):
                    row = [fid, fgloss]
                    match = cmap.get(i)
                    row.extend(list(to[match[0]]) if match else ['', ''])
                    writer.writerow(row)

        if out is None:
            print(writer.read().decode('utf-8'))
Example #34
 def __exit__(self, type, value, traceback):
     UnicodeWriter.__exit__(self, type, value, traceback)
     if isinstance(self.container, Archive):
         self.container.write_text(self.read(), self.table.url)
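Read together with Example #4: when the container is an Archive, the writer is constructed with f=None, so rows are buffered in memory, and this __exit__ then writes the serialized table into the archive under the table's url.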
Example #35
 def before(self, req, fp):
     self.writer = UnicodeWriter(fp)
     self.writer.__enter__()
     self.writer.writerow(
         [f if isinstance(f, string_types) else f[1] for f in self.get_fields(req)])