Example #1
    def cmd_download(self, **kw):
        # download source
        self.raw.write('sources.bib', getEvoBibAsBibtex(SOURCE, **kw))

        # download data
        all_records = []
        for i in pb(list(range(1, 20 * self.pages + 1, 20))):
            with self.raw.temp_download(self._url(i),
                                        'file-{0}'.format(i),
                                        log=self.log) as fname:
                soup = BeautifulSoup(
                    fname.open(encoding='utf8').read(), 'html.parser')
                for record in soup.findAll(name='div',
                                           attrs={"class": "results_record"}):
                    if isinstance(record, bs4.element.Tag):
                        children = list(record.children)
                        number = children[0].findAll('span')[1].text.strip()
                        concept = children[1].findAll('span')[1].text
                        for child in children[2:]:
                            if isinstance(child, bs4.element.Tag):
                                dpoints = child.findAll('span')
                                if len(dpoints) >= 3:
                                    lname = dpoints[1].text
                                    glottolog = re.findall(
                                        'Glottolog: (........)',
                                        str(dpoints[1]))[0]
                                    entry = dpoints[2].text
                                    cogid = list(
                                        child.children)[4].text.strip()
                                    all_records.append(
                                        (number, concept, lname, glottolog,
                                         entry, cogid))
        with UnicodeWriter(self.raw.posix('output.csv')) as f:
            f.writerows(all_records)
Example #2
def geo(args):
    with_session(args)
    fname = args.pkg_dir.joinpath('static', 'download',
                                  'languages-and-dialects-geo.csv')
    with transaction.manager, UnicodeWriter(fname) as writer:
        writer.writerow([
            'glottocode', 'name', 'isocodes', 'level', 'macroarea', 'latitude',
            'longitude'
        ])
        for l in DBSession.query(models.Languoid)\
                .filter(or_(
                    models.Languoid.level == models.LanguoidLevel.dialect,
                    models.Languoid.level == models.LanguoidLevel.language))\
                .options(
                    joinedload(models.Languoid.macroareas),
                    joinedload_all(
                        common.Language.languageidentifier,
                        common.LanguageIdentifier.identifier))\
                .order_by(common.Language.name):
            writer.writerow([
                l.id, l.name, ' '.join(
                    i.name
                    for i in l.get_identifier_objs(common.IdentifierType.iso)),
                l.level, l.macroareas[0].name if l.macroareas else '',
                l.latitude if l.latitude is not None else '',
                l.longitude if l.longitude is not None else ''
            ])

    args.log.info('{0} written'.format(fname))
Example #3
def iso2codes(args):
    from clldutils.dsv import UnicodeWriter

    nodes = list(args.repos.languoids())

    res = {}
    for node in nodes:
        if node.iso:
            res[node.id] = (node.iso, set())

    for node in nodes:
        if node.level == Level.family or node.id in res:
            continue
        for nid in res:
            matched = False
            for l in node.lineage:
                if l[1] == nid:
                    res[nid][1].add(node.id)
                    matched = True
                    break
            if matched:
                break

    with UnicodeWriter('iso2glottocodes.csv') as writer:
        writer.writerow(['iso', 'glottocodes'])
        for gc, (iso, gcs) in res.items():
            writer.writerow([iso, ';'.join([gc] + list(gcs))])
Example #4
def markconservative(m, trigs, ref, hht, outfn, verbose=True, rank=None):
    blamefield = "hhtype"
    mafter = markall(m, trigs, verbose=verbose, rank=rank)
    ls = lstat(ref, hht)
    lsafter = lstat_witness(mafter, hht)
    log = []
    for (lg, (stat, wits)) in lsafter.items():
        if not ls.get(lg):
            if verbose:
                print(lg, "lacks status", [mafter[k][1]['srctrickle'] for k in wits])
            continue
        if hht[stat] > hht[ls[lg]]:
            log = log + [
                (lg, [(mafter[k][1].get(blamefield, "No %s" % blamefield),
                       k,
                       mafter[k][1].get('title', 'no title'),
                       mafter[k][1].get('srctrickle', 'no srctrickle')) for k in wits], ls[lg])]
            for k in wits:
                (t, f) = mafter[k]
                if blamefield in f:
                    del f[blamefield]
                mafter[k] = (t, f)
    with UnicodeWriter(outfn, dialect='excel-tab') as writer:
        writer.writerows(((lg, was) + mis for (lg, miss, was) in log for mis in miss))
    return mafter
Example #5
    def get_values(self, p, language_url_pattern):
        q = DBSession.query(Value).join(Value.valueset)\
            .filter(ValueSet.parameter_pk == p.pk)\
            .options(
            joinedload(Value.valueset, ValueSet.language),
            joinedload(Value.valueset, ValueSet.contribution),
            joinedload(Value.domainelement),
            joinedload_all(Value.valueset, ValueSet.references, ValueSetReference.source)
        ).order_by(ValueSet.parameter_pk, ValueSet.language_pk, Value.pk)

        with UnicodeWriter() as writer:
            writer.writerow([
                'ID',
                'Language_ID',
                'Parameter_ID',
                'Contribution_ID',
                'Value',
                'Source',
                'Comment',
            ])
            for v in page_query(q):
                writer.writerow([
                    v.id,
                    language_url_pattern.format(v.valueset.language.id),
                    p.id,
                    v.valueset.contribution.id,
                    v.domainelement.name if v.domainelement else v.name,
                    ';'.join(self.format_sources(v)),
                    getattr(v, 'comment', v.valueset.source) or '',
                ])

        return writer.read()
Example #6
def locations(glottolog, fid, outpath):
    with UnicodeWriter(outpath) as writer:
        writer.writerow(['name', 'glottocode', 'latitude', 'longitude'])
        for lang in glottolog.languoids():
            if lang.level == Level.language and lang.latitude is not None:
                if fid in [l[1] for l in lang.lineage]:
                    writer.writerow(
                        [lang.name, lang.id, lang.latitude, lang.longitude])
Example #7
def languoids(langs, outdir):
    with UnicodeWriter(outdir.joinpath('csv', 'glottolog.csv')) as writer:
        writer.writerow(['id', 'name', 'family_id', 'family_name', 'iso_code'])
        for lang in sorted(langs):
            writer.writerow([
                lang.id, lang.name, lang.lineage[0][1] if lang.lineage else '',
                lang.lineage[0][0] if lang.lineage else '', lang.iso or ''
            ])
Example #8
    def render(self, ctx, req):
        with UnicodeWriter() as writer:
            writer.writerow(self.header(ctx, req))

            for item in ctx.get_query(limit=csv.QUERY_LIMIT):
                writer.writerow(self.row(ctx, req, item))

            return writer.read()
Example #9
 def write(self, path, sep="\t"):
     with UnicodeWriter(path, delimiter=sep) as writer:
         for i, item in enumerate(self):
             if i == 0:
                 writer.writerow(list(item.keys()))
             writer.writerow(list(item.values()))
     if path is None:
         return writer.read()
Example #10
 def render(self, ctx, req):
     with UnicodeWriter() as writer:
         rows = iter(ctx.get_query(limit=QUERY_LIMIT))
         first = next(rows, None)
         if first is not None:
             cols = first.csv_head()
             writer.writerow(cols)
             for item in chain([first], rows):
                 writer.writerow(item.to_csv(ctx=ctx, req=req, cols=cols))
         return writer.read()
Example #11
 def to_csvfile(self, filename):
     """Write a CSV file with one row for each entry in each bibfile."""
     with self.connect() as conn:
         cursor = conn.execute(
             'SELECT filename, bibkey, hash, cast(id AS text) AS id '
             'FROM entry ORDER BY lower(filename), lower(bibkey), hash, id')
         with UnicodeWriter(filename) as writer:
             writer.writerow([col[0] for col in cursor.description])
             for row in cursor:
                 writer.writerow(row)
Example #12
 def render(self, data, accepted_media_type=None, renderer_context=None):
     "Renders a list of SocietyResultSets to CSV"
     if data is None:
         return ''
     results = DPLACECSVResults(data)
     with UnicodeWriter() as writer:
         writer.writerow([CSV_PREAMBLE])
         writer.writerow(results.field_names)
         for row in results:
             writer.writerow(row)
     return writer.read()
Example #13
def xls2csv(fname, outdir=None):
    res = {}
    outdir = outdir or fname.parent
    wb = xlrd.open_workbook(as_posix(fname))
    for sname in wb.sheet_names():
        sheet = wb.sheet_by_name(sname)
        if sheet.nrows:
            path = outdir.joinpath(fname.stem + '.' +
                                   slug(sname, lowercase=False) + '.csv')
            with UnicodeWriter(path) as writer:
                for i in range(sheet.nrows):
                    writer.writerow([col.value for col in sheet.row(i)])
            res[sname] = path
    return res
Example #14
 def render(self, ctx, req):
     fid = req.route_url('parameter', id='xxx').replace('xxx', '{0}')
     lid = req.route_url('language', id='xxx').replace('xxx', '{0}')
     with UnicodeWriter() as writer:
         writer.writerow(['Language_ID', 'Feature_ID', 'Value'])
         for _lid, _fid, v in DBSession.query(
                     Language.id, Parameter.id, Value.name)\
                 .filter(Language.pk == ValueSet.language_pk)\
                 .filter(Parameter.pk == ValueSet.parameter_pk)\
                 .filter(Value.valueset_pk == ValueSet.pk)\
                 .order_by(Parameter.pk, Language.id):
             if v:
                 writer.writerow([lid.format(_lid), fid.format(_fid), v])
         return writer.read()
Example #15
def download(dataset, **kw):
    def rp(*names):
        return dataset.raw.joinpath(*names).as_posix()

    download_and_unpack_zipfiles(URL, dataset, FNAME)
    check_call('libreoffice --headless --convert-to docx %s --outdir %s' %
               (rp(FNAME), rp()),
               shell=True)

    doc = Document(rp(Path(FNAME).stem + '.docx'))
    for i, table in enumerate(doc.tables):
        with UnicodeWriter(rp('%s.csv' % (i + 1, ))) as writer:
            for row in table.rows:
                writer.writerow(map(text_and_color, row.cells))
Example #16
def lookup(args):
    """
    Looks up one or more glosses from the command line.

    concepticon lookup <gloss1 gloss2 ... glossN>
    """
    api = Concepticon()
    found = api.lookup(args.args,
                       language=args.language,
                       full_search=args.full_search,
                       similarity_level=args.similarity)
    with UnicodeWriter(None, delimiter='\t') as writer:
        writer.writerow(
            ['GLOSS', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS', 'SIMILARITY'])
        for f in found:
            writer.writerow(f)
        print(writer.read().decode('utf-8'))
Example #17
def write_tree(tree, fname, taxa_in_dplace, societies_by_glottocode):
    if not fname.exists():
        fname.mkdir()
    tree.prune([n.encode('ascii') for n in taxa_in_dplace])

    with fname.joinpath('summary.trees').open('w', encoding="utf-8") as handle:
        handle.write(
            NEXUS_TEMPLATE.format(tree.name if tree.name else 'UNTITLED',
                                  tree.write(format=9)))

    with UnicodeWriter(fname.joinpath('taxa.csv')) as writer:
        writer.writerow(['taxon', 'glottocode', 'xd_ids', 'soc_ids'])
        for gc in sorted(taxa_in_dplace):
            socs = societies_by_glottocode[gc]
            writer.writerow([
                gc, gc, ', '.join(set(s.xd_id for s in socs)),
                ', '.join(s.id for s in socs)
            ])
    return tree
Example #18
def orthography(args):  # pragma: no cover
    ds = get_dataset(args)
    out = ds.dir.joinpath('orthography.tsv')
    if out.exists():
        if not confirm(
                'There already is an orthography profile for this dataset. Overwrite?',
                default=False):
            return

    graphemes = Counter()
    for line in ds.iter_raw_lexemes():
        graphemes.update(grapheme_pattern.findall(line))

    with UnicodeWriter(out, delimiter='\t') as writer:
        writer.writerow(['graphemes', 'frequency', 'IPA'])
        for grapheme, frequency in graphemes.most_common():
            writer.writerow([grapheme, '{0}'.format(frequency), grapheme])

    log_dump(out, log=args.log)
Example #19
 def create(self, req, filename=None, verbose=True):  # pragma: no cover
     meanings = [(p.name, p.id)
                 for p in DBSession.query(Parameter).order_by(Parameter.pk)]
     tmp = mkdtemp()
     path = os.path.join(tmp, 'asjp.tab')
     with UnicodeWriter(f=path, delimiter="\t") as writer:
         writer.writerow([f[0]
                          for f in self.fields] + [m[0] for m in meanings])
         for lang in DBSession.query(Doculect).order_by(
                 Doculect.pk).options(
                     joinedload_all(Language.valuesets, ValueSet.values),
                     joinedload_all(Language.valuesets,
                                    ValueSet.parameter)).limit(10000):
             row = [f[1](lang) for f in self.fields]
             vss = {vs.parameter.id: vs for vs in lang.valuesets}
             row.extend(
                 [Doculect.format_words(vss.get(m[1])) for m in meanings])
             writer.writerow(row)
     Download.create(self, req, filename=path)
     rmtree(tmp)
Example #20
def _freeze(table, fpath):
    def conv(v, col):
        if v is None:
            return ''
        if isinstance(col.type, DeclEnumType):  # pragma: no cover
            return v.value
        if isinstance(col.type, JSONEncodedDict):
            return json.dumps(v)
        if isinstance(v, (datetime, date)):
            return v.isoformat()
        return v

    keys = [col.name for col in table.columns]
    cols = {col.name: col for col in table.columns}
    rows = [keys]

    for row in DBSession.execute(select([table])):
        rows.append([conv(row[key], cols[key]) for key in keys])

    if len(rows) > 1:
        with UnicodeWriter(fpath) as writer:
            writer.writerows(rows)
Example #21
def write_conceptlist(clist, filename, header=False):
    """
    Write conceptlist to file.
    """
    def natural_sort(l):
        """
        Code-piece from
        http://stackoverflow.com/questions/4836710/does-python-have-a-built-in-function-for-string-natural-sort
        """
        convert = lambda text: int(text) if text.isdigit() else text.lower()
        alphanum_key = lambda key: [
            convert(c) for c in re.split('([0-9]+)', key)
        ]
        return sorted(l, key=alphanum_key)

    header = header or clist['header']
    keys = natural_sort(list(clist.keys()))
    with UnicodeWriter(filename, delimiter='\t') as writer:
        writer.writerow(header)
        for k in keys:
            v = clist[k]
            if k not in ['splits', 'mergers', 'header']:
                writer.writerow([v[h] for h in header])
Example #22
def to_csv(args):
    """
    Parses the data file 'org_data/records.tsv' into individual csv files in 'raw'.
    In addition, it prints warnings encountered while parsing. If you only want to
    check the data integrity of 'org_data/records.tsv', pass the argument
    'check' -> amsd to_csv check.
    """

    raw_path = Path(__file__).resolve().parent.parent.parent / 'raw'
    if not raw_path.exists():
        raw_path.mkdir()

    csv_dataframe = {
        'sticks': [],
        'keywords': {},
        'sem_domain': {},
        'linked_filenames': {},
        'item_type': {},
        'material': {},
        'technique': {},
        'ling_area': {},
        'source_citation': {},
        'source_type': {},
        'holder_file': {},
        'data_entry': {}
    }

    datafile = Path(
        __file__).resolve().parent.parent.parent / 'org_data' / 'records.tsv'

    with UnicodeReader(datafile, delimiter='\t') as reader:
        for i, row in enumerate(reader):
            data = []
            if i == 0:  #header
                data.append('pk')  # add pk
                for j, col in enumerate(row):
                    data.append(fields[j][2].strip())
            else:
                data.append(i)  # add id
                for j, col_ in enumerate(row):
                    if j > 41 and len(col_):
                        print('Error: too many filled columns for line %i' %
                              (i + 1))
                        continue
                    if re.sub(r'[ ]+', '', col_) == '':
                        data.append('')
                    else:
                        col = col_.strip()
                        if fields[j][2] in fields_not_in_sticks \
                                and fields[j][2] not in ['linked_filenames', 'source_citation']:
                            col = col.lower()
                        if fields[j][0] == 0:
                            if fields[j][2] in ['lat', 'long']:
                                try:
                                    data.append(dms2dec(col))
                                except:
                                    print(
                                        'Error: check lat/long notation in line %i for "%s"'
                                        % (i + 1, col))
                                    data.append(col)
                            else:
                                data.append(col)
                        elif fields[j][0] == 1 and len(fields[j][3]) == 0:
                            if col not in csv_dataframe[fields[j][2]]:
                                csv_dataframe[fields[j][2]][col] = len(
                                    csv_dataframe[fields[j][2]]) + 1
                            data.append(csv_dataframe[fields[j][2]][col])
                        elif fields[j][0] == 1 and len(fields[j][3]) > 1:
                            ref_data = []
                            if re.match(r'^ling_area_\d+$', fields[j][2]):
                                try:
                                    data_array = [
                                        "|".join([
                                            i.strip() for i in list(
                                                re.findall(fields[j][3], col)
                                                [0])
                                        ])
                                    ]
                                except:
                                    print(
                                        'Error: %s in line %i has wrong structure: %s'
                                        % (fields[j][2], i + 1, col))
                                    data_array = []
                            else:
                                data_array = re.split(fields[j][3], col)
                            for item_ in data_array:
                                item = item_.strip()
                                col_name = fields[j][2]
                                if re.match(r'^ling_area_\d+$', col_name):
                                    col_name = 'ling_area'
                                    if item not in csv_dataframe[col_name]:
                                        csv_dataframe[col_name][item] = len(
                                            csv_dataframe[col_name]) + 1
                                    ref_data.append(
                                        csv_dataframe[col_name][item])
                                elif col_name in ['holder_file']:
                                    if item not in csv_dataframe[col_name]:
                                        csv_dataframe[col_name][item] = len(
                                            csv_dataframe[col_name]) + 1
                                    ref_data.append(
                                        csv_dataframe[col_name][item])
                                else:
                                    dfkey = 'x_sticks_' + col_name
                                    if item not in csv_dataframe[col_name]:
                                        csv_dataframe[col_name][item] = len(
                                            csv_dataframe[col_name]) + 1
                                    if not csv_dataframe[col_name][
                                            item] in ref_data:
                                        ref_data.append(
                                            csv_dataframe[col_name][item])
                                        if dfkey not in csv_dataframe:  # header
                                            csv_dataframe[dfkey] = []
                                            csv_dataframe[dfkey].append(
                                                ['stick_pk', col_name + '_pk'])
                                        csv_dataframe[dfkey].append(
                                            [i, csv_dataframe[col_name][item]])
                            # save ids to related table as semicolon separated lists of ids
                            data.append(';'.join(map(str, ref_data)))
            csv_dataframe['sticks'].append(data)

    with get_catalog() as cat:
        images_objs = {obj.metadata['name']: obj for obj in cat}

    # look for similar entries
    for t, k in [('source_citation', 5), ('holder_file', 4), ('ling_area', 10),
                 ('material', 1)]:
        check_sim = list(csv_dataframe[t].keys())
        for i in range(len(check_sim)):
            for j in range(i + 1, len(check_sim)):
                if sim(check_sim[i], check_sim[j]) < k:
                    print('sim check: %s\n%s\n%s\n' %
                          (t, check_sim[i], check_sim[j]))

    if not args.args or args.args[0].lower() != 'check':
        for filename, data in csv_dataframe.items():
            with UnicodeWriter(raw_path.joinpath(filename + '.csv')) as writer:
                if type(data) is list:
                    for item in data:
                        writer.writerow(item)
                else:
                    d = []
                    if filename == 'ling_area':
                        d.append([
                            'pk', 'chirila_name', 'austlang_code',
                            'austlang_name', 'glottolog_code'
                        ])
                        for k, v in data.items():
                            c, ac, an, g = re.split(r'\|', k)
                            if g == 'no code':
                                g = ''
                            d.append([v, c, ac, an, g])
                    elif filename == 'linked_filenames':
                        d.append(['pk', 'name', 'oid', 'path'])
                        for k, v in data.items():
                            k_ = os.path.splitext(k)[0]
                            if k_ in images_objs:
                                url_path = ''
                                for o in images_objs[k_].bitstreams:
                                    if o.id not in [
                                            'thumbnail.jpg', 'web.jpg'
                                    ]:
                                        url_path = o.id
                                        break
                                if url_path == '':
                                    print("no path found for %s" % (k_))
                                d.append([v, k, images_objs[k_].id, url_path])
                            else:
                                print("no image match for '%s'" % (k))
                                d.append([v, k, ''])
                    else:
                        d.append(['pk', 'name'])
                        for k, v in data.items():
                            d.append([v, k])
                    for item in d:
                        writer.writerow(item)
Example #23
def extract(args):
    import argparse
    usage = """
    dplace %(prog)s - extracts subsets of data for further processing.
    
    To filter societies:
    
    > dplace %(prog)s --society Cj4,Cj5,Cj6 output.csv

    To filter societies on a given tree:
    
    > dplace %(prog)s --tree gray_et_al2009 output.csv
    
    To filter societies only from a given dataset:
    
    > dplace %(prog)s --dataset EA output.csv
    """
    parser = argparse.ArgumentParser(prog='extract', usage=usage)
    parser.add_argument('filename', help='filename', default=None)
    parser.add_argument('--society',
                        help='restrict to these society ids (x,y,z)',
                        default=None)
    parser.add_argument('--tree', help='restrict to this tree', default=None)
    parser.add_argument('--dataset',
                        help='restrict to these datasets (x,y,z)',
                        default=None)
    parser.add_argument('--variable',
                        help='restrict to these variables (x,y,z)',
                        default=None)
    xargs = parser.parse_args(args.args)

    datasets = xargs.dataset.split(",") if xargs.dataset else None
    variables = xargs.variable.split(",") if xargs.variable else None
    societies = xargs.society.split(",") if xargs.society else None

    # get tree if given
    if xargs.tree:
        # get trees
        trees = {t.id: t for t in args.repos.phylogenies}
        tree = trees.get(xargs.tree)
        if tree is None:
            raise SystemExit("Failed to find Tree %s" % xargs.tree)
        societies = [
            s for sublist in [t.soc_ids for t in tree.taxa] for s in sublist
        ]

    with UnicodeWriter(f=xargs.filename) as out:
        header = [
            'ID', 'XD_ID', 'Glottocode', 'Name', 'OriginalName', 'FocalYear',
            'Latitude', 'Longitude', 'Variable', 'Value'
        ]
        out.writerow(header)

        for record in args.repos.iter_data(datasets=datasets,
                                           variables=variables,
                                           societies=societies):

            s = args.repos.societies.get(record.soc_id, None)
            if s is None:
                # we get these warnings as we are currently missing the SCCS
                # and WNAI data
                args.log.warn("Missing society definition for %s" %
                              record.soc_id)
                continue

            row = [
                s.id, s.xd_id, s.glottocode, s.pref_name_for_society,
                s.ORIG_name_and_ID_in_this_dataset, s.main_focal_year, s.Lat,
                s.Long, record.var_id, record.code
            ]
            out.writerow(row)
Example #24
    def map(self,
            clist,
            otherlist=None,
            out=None,
            full_search=False,
            similarity_level=5,
            language='en'):
        assert clist.exists(), "File %s does not exist" % clist
        from_ = []
        for item in read_dicts(clist):
            from_.append((item.get('ID', item.get('NUMBER')),
                          item.get('GLOSS', item.get('ENGLISH'))))

        to = self._get_map_for_language(language, otherlist)

        if not full_search:
            cmap = concept_map2([i[1] for i in from_], [i[1] for i in to],
                                similarity_level=similarity_level,
                                freqs=self.frequencies,
                                language=language)
            good_matches = 0
            with UnicodeWriter(out, delimiter='\t') as writer:
                writer.writerow([
                    'ID', 'GLOSS', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS',
                    'SIMILARITY'
                ])
                for i, (fid, fgloss) in enumerate(from_):
                    row = [fid, fgloss]
                    matches, sim = cmap.get(i, ([], 10))
                    if sim <= 5:
                        good_matches += 1
                    if not matches:
                        writer.writerow(row + ['', '???', ''])
                    elif len(matches) == 1:
                        row.extend([
                            to[matches[0]][0],
                            to[matches[0]][1].split('///')[0], sim
                        ])
                        writer.writerow(row)
                    else:
                        # we need a list to retain the order by frequency
                        visited = []
                        for j in matches:
                            gls, cid = to[j][0], to[j][1].split('///')[0]
                            if (gls, cid) not in visited:
                                visited += [(gls, cid)]
                        if len(visited) > 1:
                            writer.writerow(['<<<', '', '', ''])
                            for gls, cid in visited:
                                writer.writerow(row + [gls, cid, sim])
                            writer.writerow(['>>>', '', '', ''])
                        else:
                            row.extend([visited[0][0], visited[0][1], sim])
                            writer.writerow(row)
                writer.writerow([
                    '#', good_matches,
                    len(from_), '{0:.2f}'.format(good_matches / len(from_))
                ])
        else:
            cmap = concept_map([i[1] for i in from_], [
                i[1] for i in self._get_map_for_language(language, otherlist)
            ],
                               similarity_level=similarity_level)
            with UnicodeWriter(out, delimiter='\t') as writer:
                writer.writerow(
                    ['ID', 'GLOSS', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS'])
                for i, (fid, fgloss) in enumerate(from_):
                    row = [fid, fgloss]
                    match = cmap.get(i)
                    row.extend(list(to[match[0]]) if match else ['', ''])
                    writer.writerow(row)

        if out is None:
            print(writer.read().decode('utf-8'))
Example #25
def csv(wl, delimiter='\t', **kw):
    with UnicodeWriter(f=None, delimiter=delimiter, **kw) as writer:
        writer.writerow(wl.header)
        for row in wl:
            writer.writerow(row)
    return writer.read().decode('utf8')
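
This helper leans on UnicodeWriter's in-memory mode: with no file argument, rows are buffered and writer.read() returns the serialized CSV after the with block (as bytes in the clldutils versions these examples target, hence the .decode('utf8')); Examples #5, #8, #10, #12, #14 and #16 build their output the same way. A minimal sketch of calling the csv() helper above, with a hypothetical stand-in for the wordlist object:

class FakeWordlist(list):
    # hypothetical stand-in: anything exposing a `header` attribute and
    # yielding rows on iteration will do
    header = ['ID', 'FORM']

wl = FakeWordlist([['1', 'hand'], ['2', 'foot']])
print(csv(wl))  # tab-separated text: header row followed by the two entries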
Example #26
 def csv_writer(self, comp, name, delimiter=',', suffix='csv'):
     p = self.existing_dir(comp).joinpath('{0}.{1}'.format(name, suffix))
     self.file_written(p)
     return UnicodeWriter(p, delimiter=delimiter)
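
csv_writer returns the UnicodeWriter unopened, so callers are expected to use it as a context manager. A hypothetical usage sketch (`builder`, the component name and the column values are illustrative, assuming an object that provides existing_dir() and file_written() as above):

with builder.csv_writer('cldf', 'values', delimiter='\t', suffix='tsv') as writer:
    writer.writerow(['ID', 'Value'])
    writer.writerow(['1', 'foo'])
# rows end up in <existing_dir('cldf')>/values.tsv, registered via file_written()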
Example #27
def report(args):
    """
    clpa report <FILE> [rules=FILE] [format=md|csv|cldf] [outfile=FILENAME]

    Note
    ----

    * Rules point to a tab-separated value file in which a source and a target
      segment are given; conversions are applied on a dataset-specific basis,
      since they may vary from dataset to dataset and can thus not be included
      as standard clpa behaviour.
    * The input file needs to be in csv format, with tab as separator, and it
      needs to contain one column named "TOKENS".
    * format allows for md (Markdown), csv (CSV with tab as separator), or cldf
      (not pure CLDF but rather the current lingpy csv format). The cldf format
      means that the original file will be extended by two columns, one called
      CLPA_TOKENS, one called CLPA_IDS.
    * If an outfile is specified, the data will be written to that file instead
      of being shown on the screen.

    """
    if len(args.args) < 1:
        raise ParserError('not enough arguments')

    # get keywords from arguments @xrotwang: is there any better way to do so?
    settings = defaultdict(str)
    settings['format'] = 'md'
    fname = None
    for arg in args.args:
        if '=' in arg:
            key, val = arg.split('=')
            settings[key] = val
        else:
            fname = arg

    if not fname:
        raise ParserError('no filename passed as argument')

    wordlist = Wordlist.from_file(fname)
    sounds, errors = wordlist.check(rules=settings['rules'])

    if settings['format'] not in ['md', 'csv']:
        text = wordlist.write(settings['outfile'] or None)
        if not settings['outfile']:
            print(text)
        return

    segments = OrderedDict([('existing', []), ('missing', []),
                            ('convertible', [])])
    for k in sorted(sounds,
                    key=lambda x: (sounds[x]['frequency'], sounds[x]['id']),
                    reverse=True):
        type_, symbol = None, None
        if k == sounds[k]['clpa']:
            type_, symbol = 'existing', k
        elif sounds[k]['clpa'] == '?':
            type_, symbol = 'missing', k
        else:
            check = sounds[k]['clpa']
            if k != check != '?':
                type_, symbol = 'convertible', k + ' >> ' + sounds[k]['clpa']
        if type_ and symbol:
            segments[type_].append(
                [symbol, sounds[k]['id'], sounds[k]['frequency']])

    if settings['format'] == 'csv':
        with UnicodeWriter(settings['outfile'] or None,
                           delimiter='\t') as writer:
            for key, items in segments.items():
                for i, item in enumerate(items):
                    writer.writerow([i + 1] + item + [key])
        if not settings['outfile']:
            print(writer.read())
        return

    text = []
    header_template = """
# {0} sounds

| number | sound | clpa | frequency |
| ------:| ----- | ---- | ---------:|"""

    for key, items in segments.items():
        text.append(header_template.format(key.capitalize()))
        for i, item in enumerate(items):
            text.append("| {0} | {1[0]} | {1[1]} | {1[2]} |".format(
                i + 1, item))

    text = '\n'.join(text)
    if settings['outfile']:
        with Path(settings['outfile']).open('w', encoding='utf8') as fp:
            fp.write(text)
    else:
        print(text)
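
The docstring above only says that a rules file is tab-separated, with a source and a target segment per row; a minimal sketch of producing such a file with UnicodeWriter (the example mappings and the two-column layout are assumptions, not prescribed by clpa):

from clldutils.dsv import UnicodeWriter

# hypothetical segment conversions, one source/target pair per row
rules = [('ts', 'ʦ'), ('tsh', 'ʦʰ')]

with UnicodeWriter('rules.tsv', delimiter='\t') as writer:
    for source, target in rules:
        writer.writerow([source, target])

The resulting file could then be passed as rules=rules.tsv to clpa report.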
Example #28
def main():
    socs = read_win1252(
        'ALL_soc_ids_to_lang_wAltNames_sources_5Sept2017_win1252.csv')
    links = {
        r['soc_id']: r
        for r in read_win1252(
            'ALL_soc_links_to_other_databases_30Aug2017_win1252.csv')
    }
    locations = {
        'SCCS' + r['soc_id']: r
        for r in reader('../../legacy/LatLong_data.csv', dicts=True)
    }
    for row in reader(
            '../WNAI/DPLACE_RevisedLatLong_27April2017_inclWNAI_SCCS.csv',
            dicts=True):
        if row['Dataset'] == 'SCCS':
            locations[row['soc_id']]['Lat'] = row['soc.latitude']
            locations[row['soc_id']]['Long'] = row['soc.longitude']

    with UnicodeWriter('societies.csv') as w:
        w.writerow([f.name for f in attr.fields(Society)])
        for soc in socs:
            kw = {
                'id': soc['soc_id'],
                'glottocode': soc['glottolog_id'],
                'glottocode_comment': 'Lang_assignment_change_notes'
            }
            for col in [
                    'xd_id',
                    'pref_name_for_society',
                    'ORIG_name_and_ID_in_this_dataset',
                    'alt_names_by_society',
                    'main_focal_year',
            ]:
                kw[col] = soc[col]

            for col in ['Lat', 'Long', 'origLat', 'origLong', 'Comment']:
                kw[col] = locations[soc['soc_id']][col]

            kw['HRAF_name_ID'] = links[soc['soc_id']]['HRAF_name_ID']
            kw['HRAF_link'] = links[soc['soc_id']]['HRAF_link']
            w.writerow(attr.astuple(Society(**kw)))

    with UnicodeWriter('societies_mapping.csv') as w:
        w.writerow(['id', 'related'])
        for sid, l in links.items():
            rels = []
            for dsid, suffix in [
                ('EA', '1'),
                ('EA', '2'),
                ('Binford', '1'),
                ('Binford', '2'),
                ('Binford', '3'),
                ('SCCS', ''),
                ('WNAI', '1'),
                ('WNAI', '2'),
                ('WNAI', '3'),
                ('WNAI', '4'),
                ('WNAI', '5'),
            ]:
                if dsid == 'SCCS':
                    label = l['{0}_society_equivalent{1}'.format(dsid, suffix)]
                else:
                    label = l['{0}_label_society_equivalent{1}'.format(
                        dsid, suffix)]
                id = l['{0}_id_society_equivalent{1}'.format(dsid, suffix)]
                if label and id:
                    rels.append('{0}: {1} [{2}]'.format(dsid, label, id))
            w.writerow([sid, '; '.join(rels)])

    var_info = {
        r['source']: r['APA_reference']
        for r in read_win1252('SCCS_variable_sources_bibtex_to_APA.csv',
                              ignore_dataset=True)
    }

    with UnicodeWriter('variables.csv') as w:
        fm = OrderedDict([
            ('VarID', 'id'),
            ('Category', 'category'),
            ('VarTitle', 'title'),
            ('VarDefinition', 'definition'),
            ('VarType', 'type'),
            ('UserNotes', 'notes'),
            ('source', 'source'),
            ('VarTitleShort', 'changes'),
            ('Unit', 'units'),
        ])
        w.writerow(fm.values())
        for row in read_win1252(
                'SCCS_Full_VariableList_12Sept2017_win1252.csv'):
            row['VarID'] = 'SCCS' + row['VarID']
            row['VarType'] = row['VarType'].capitalize()
            if row['VarDefinition']:
                row['VarDefinition'] += '\n\n'
            row['VarDefinition'] += var_info.get(row['source'], row['source'])
            w.writerow([row[f] for f in fm.keys()])

    with UnicodeWriter('codes.csv') as w:
        fm = OrderedDict([
            ('VarID', 'var_id'),
            ('Code', 'code'),
            ('CodeDescription', 'description'),
            ('ShortName', 'name'),
        ])
        w.writerow(fm.values())
        for row in read_win1252(
                'SCCS_CodeDescriptions_12Sept2017_win1252.csv'):
            row['VarID'] = 'SCCS' + row['VarID']
            w.writerow([row[f] for f in fm.keys()])

    with UnicodeWriter('data.csv') as w:
        fm = OrderedDict([
            ('soc_id', 'soc_id'),
            ('SubCase', 'sub_case'),
            ('Year', 'year'),
            ('VarID', 'var_id'),
            ('Code', 'code'),
            ('EthnoReferences', 'references'),
            ('AdminComment', 'admin_comment'),
            ('UserComment', 'comment'),
            ('SourceCodedData', 'source_coded_data'),
        ])
        w.writerow(fm.values())
        for row in read_win1252(
                'Full_SCCS_data_12Sept2017_FINAL_329451rows_win1252.csv'):
            row['VarID'] = 'SCCS' + row['VarID']
            w.writerow([row[f] for f in fm.keys()])
Example #29
def dump(args, test=False):
    sounds = defaultdict(dict)
    data = []
    bipa = TranscriptionSystem('bipa')
    # start from assembling bipa-sounds
    for grapheme, sound in sorted(bipa.sounds.items(),
                                  key=lambda p: p[1].alias
                                  if p[1].alias else False):
        if sound.type not in ['marker']:
            if sound.alias:
                assert sound.name in sounds
                sounds[sound.name]['aliases'].add(grapheme)
            else:
                assert sound.name not in sounds
                sounds[sound.name] = {
                    'grapheme': grapheme,
                    'unicode': sound.uname or '',
                    'generated': '',
                    'note': sound.note or '',
                    'type': sound.type,
                    'aliases': set(),
                    'normalized': '+' if sound.normalized else ''
                }
            data.append(
                Grapheme(grapheme, sound.name, '+', '', 'bipa', '0', '', '',
                         '', '', sound.note or ''))

    # add sounds systematically by their alias
    for td in args.repos.iter_transcriptiondata():
        for name in td.names:
            bipa_sound = bipa[name]
            # check for consistency of mapping here
            if not is_valid_sound(bipa_sound, bipa):
                continue

            sound = sounds.get(name)
            if not sound:
                sound = sounds[name] = {
                    'grapheme': bipa_sound.s,
                    'aliases': {bipa_sound.s},
                    'generated': '+',
                    'unicode': bipa_sound.uname or '',
                    'note': '',
                    'type': bipa_sound.type,
                    'alias': '+' if bipa_sound.alias else '',
                    'normalized': '+' if bipa_sound.normalized else ''
                }

            for item in td.data[name]:
                sound['aliases'].add(item['grapheme'])
                # add the values here
                data.append(
                    Grapheme(
                        item['grapheme'],
                        name,
                        item['explicit'],
                        '',  # sounds[name]['alias'],
                        td.id,
                        item.get('frequency', ''),
                        item.get('url', ''),
                        item.get('features', ''),
                        item.get('image', ''),
                        item.get('sound', ''),
                    ))
        if test:
            break

    # sound classes have a generative component, so we need to treat them
    # separately
    for sc in args.repos.iter_soundclass():
        for name in sounds:
            try:
                grapheme = sc[name]
                data.append(
                    Grapheme(
                        grapheme,
                        name,
                        '+' if name in sc.data else '',
                        '',
                        sc.id,
                    ))
            except KeyError:  # pragma: no cover
                args.log.debug(name, sounds[name]['grapheme'])
        if test:
            break

    # last run, check again for each of the remaining transcription systems,
    # whether we can translate the sound
    for ts in args.repos.iter_transcriptionsystem(exclude=['bipa']):
        for name in sounds:
            try:
                ts_sound = ts[name]
                if is_valid_sound(ts_sound, ts):
                    sounds[name]['aliases'].add(ts_sound.s)
                    data.append(
                        Grapheme(
                            ts_sound.s,
                            name,
                            '' if sounds[name]['generated'] else '+',
                            '',  # sounds[name]['alias'],
                            ts.id,
                        ))
            except ValueError:
                pass
            except TypeError:
                args.log.debug('{0}: {1}'.format(ts.id, name))
        if test:
            break

    with UnicodeWriter(args.repos.data_path('sounds.tsv'),
                       delimiter='\t') as writer:
        writer.writerow(
            ['NAME', 'TYPE', 'GRAPHEME', 'UNICODE', 'GENERATED', 'NOTE'])
        for k, v in sorted(sounds.items(), reverse=True):
            writer.writerow([
                k, v['type'], v['grapheme'], v['unicode'], v['generated'],
                v['note']
            ])

    with UnicodeWriter(args.repos.data_path('graphemes.tsv'),
                       delimiter='\t') as writer:
        writer.writerow([f.name for f in attr.fields(Grapheme)])
        for row in data:
            writer.writerow(attr.astuple(row))
Example #30
def trees(societies_by_glottocode, langs, outdir, year, title):
    label_pattern = re.compile(r"'[^\[]+\[([a-z0-9]{4}[0-9]{4})[^']*'")

    def rename(n):
        n.name = label_pattern.match(n.name).groups()[0]
        n.length = 1

    glottocodes = set(societies_by_glottocode.keys())
    glottocodes_in_global_tree = set()
    index = {}
    outdir = outdir.joinpath('phylogenies')
    languoids = {}
    families = []
    for lang in langs:
        if not lang.lineage:  # a top-level node
            if not lang.category.startswith('Pseudo '):
                families.append(lang)
        languoids[lang.id] = lang

    glob = Tree()
    glob.name = 'glottolog_global'

    for family in sorted(families):
        node = family.newick_node(nodes=languoids)
        node.visit(rename)
        taxa_in_tree = set(n.name for n in node.walk())
        taxa_in_dplace = glottocodes.intersection(taxa_in_tree)
        if not taxa_in_dplace:
            continue

        tree = Tree("({0});".format(node.newick), format=3)
        tree.name = 'glottolog_{0}'.format(family.id)
        if family.level.name == 'family':
            tree = write_tree(tree, outdir.joinpath(tree.name), taxa_in_dplace,
                              societies_by_glottocode)
            glottocodes_in_global_tree = glottocodes_in_global_tree.union(
                set(n.name for n in tree.traverse()))
            index[tree.name] = dict(
                id=tree.name,
                name='{0} ({1})'.format(family.name, title),
                author='{0} ({1})'.format(title, family.name),
                year=year,
                scaling='',
                reference=reference(title, year),
                url='http://glottolog.org/resource/languoid/id/{}'.format(
                    family.id))

        else:
            glottocodes_in_global_tree = glottocodes_in_global_tree.union(
                taxa_in_tree)
        glob.add_child(tree)

    # global
    write_tree(glob, outdir.joinpath(glob.name),
               glottocodes_in_global_tree.intersection(glottocodes),
               societies_by_glottocode)
    index[glob.name] = dict(id=glob.name,
                            name='Global Classification ({0})'.format(title),
                            author=title,
                            year=year,
                            scaling='',
                            reference=reference(title, year),
                            url='http://glottolog.org/')

    index_path = outdir.joinpath('index.csv')
    phylos = list(reader(index_path, dicts=True))
    with UnicodeWriter(index_path) as writer:
        header = list(phylos[0].keys())
        writer.writerow(header)
        for phylo in sorted(phylos, key=lambda p: p['id']):
            if phylo['id'] in index:
                writer.writerow([index[phylo['id']][k] for k in header])
                del index[phylo['id']]
            else:
                writer.writerow(phylo.values())

        for id_, spec in sorted(index.items()):
            writer.writerow([spec[k] for k in header])