Example #1
def test_create(api, wiki, capsys, tmp_path):
    cldf_repos = tmp_path
    cldf.create(StructureDataset.in_dir(cldf_repos / 'cldf'), api,
                Path(__file__).parent / 'glottolog')
    #captured = capsys.readouterr()
    #assert 'inconsistent' in captured.out
    ds = StructureDataset.from_metadata(cldf_repos / 'cldf' /
                                        'StructureDataset-metadata.json')
    assert len(list(ds['ValueTable'])) == 1
    assert ds['contributors.csv', 'Photo'].valueUrl.expand(list(ds['contributors.csv'])[0]) == \
           'https://glottobank.org/photos/abc'
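
The test exercises the standard pycldf round trip. A minimal sketch of that round trip on its own, outside the test harness (the `cldf` output directory and the single value row are illustrative assumptions, not part of the project under test):

from pycldf import StructureDataset

ds = StructureDataset.in_dir('cldf')
ds.write(ValueTable=[dict(ID='1', Language_ID='abcd1234', Parameter_ID='feat1', Value='yes')])

reread = StructureDataset.from_metadata('cldf/StructureDataset-metadata.json')
assert len(list(reread['ValueTable'])) == 1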
Example #2
def run(args):
    cldf = StructureDataset.in_dir(args.cldf_repos / 'cldf')
    if args.glottolog_version != 'test':  # pragma: no cover
        with Catalog(args.glottolog, args.glottolog_version) as glottolog:
            write_metadata(cldf, args, glottolog)
    else:
        write_metadata(cldf, args, None)
    write_schema(cldf)
    cldf.write(**get_data(cldf, args))
    shutil.copy(str(args.repos.path('LICENSE.txt')), str(args.cldf_repos))
    if not args.dev:
        cldf.validate(log=args.log)
Example #3
def make_cldf(db, out, fid):
    # Initialize a CLDF dataset in the output directory, using the appropriate module:
    ds = StructureDataset.in_dir(out)

    # We add the WALS language metadata:
    ds.add_component('LanguageTable', 'Genus', 'Family')

    # And some metadata about the feature:
    ds.add_component('ParameterTable', 'Authors', 'Url', 'Area')
    ds.add_component('CodeTable')

    # Now we collect the data by querying the database:
    values, languages = [], []

    lids = defaultdict(dict)
    for lpk, ids in groupby(db.execute(SQL_IDENTIFIERS), lambda r: r[0]):
        for itype, names in groupby(ids, lambda rr: rr[1]):
            names = [n[2] for n in names]
            if len(names) == 1:
                # only add identifiers for equivalent languoids, ignore partial matches.
                lids[lpk][itype] = names[0]

    # We store the sources and references per datapoint:
    sources, refs = defaultdict(list), defaultdict(list)
    for vspk, rs in groupby(db.execute(SQL_SOURCES), lambda r: r[0]):
        for r in rs:
            ref = r[2]
            if r[1]:
                ref += '[{0}]'.format(r[1])  # add the page info in the correct format.
            refs[vspk].append(ref)
            sources[vspk].append(Source(r[3], r[2], author=r[4], year=r[5], title=r[6]))

    codes = {}
    for row in db.execute(SQL_VALUES.format(fid)):
        lpk, lid, lname, vsid, denumber, dename, lat, lon, vspk, gname, fname = row
        ids = lids[lpk]
        if vspk in sources:
            ds.sources.add(*sources[vspk])
        languages.append(dict(
            ID=lid,
            Name=lname,
            Latitude=lat,
            Longitude=lon,
            Glottocode=ids.get('glottolog'),
            ISO639P3code=ids.get('iso639-3'),
            Genus=gname,
            Family=fname,
        ))
        values.append(dict(
            ID=vsid,
            Language_ID=lid,
            Parameter_ID=fid,
            Value=denumber,
            Code_ID='{0}-{1}'.format(fid, denumber),
            Source=refs.get(vspk, []),
        ))
        codes[denumber] = {
            'ID': '{0}-{1}'.format(fid, denumber),
            'Name': dename,
            'Parameter_ID': fid,
        }

    fname, fauthors, aname = list(db.execute(SQL_FEATURE.format(fid)))[0]
    ds.write(
        ValueTable=values,
        LanguageTable=languages,
        ParameterTable=[{
            'ID': fid,
            'Name': fname,
            'Area': aname,
            'Authors': fauthors,
            'Url': 'http://wals.info/feature/' + fid}],
        CodeTable=codes.values(),
    )
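
A hedged sketch of consuming the dataset written by make_cldf above (assuming `out` is the same output directory, here as a pathlib.Path):

from pycldf import StructureDataset

ds = StructureDataset.from_metadata(out / 'StructureDataset-metadata.json')
languages = {r['ID']: r for r in ds['LanguageTable']}
codes = {c['ID']: c['Name'] for c in ds['CodeTable']}
for v in ds['ValueTable']:
    lang = languages[v['Language_ID']]
    print(lang['Name'], lang['Genus'], '->', codes[v['Code_ID']])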
Example #4
def main(args):  # pragma: no cover
    ds = StructureDataset.from_metadata(DS)
    data = Data()
    for source in ds.sources:
        data.add(common.Source, source.id, _obj=bibtex2source(source))

    ext = [
        Record.from_string('@' + s, lowercase=True)
        for s in nfilter(BIB.split('@'))
    ]
    for rec in ext:
        if rec.id not in data['Source']:
            data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    for contrib in ds['contributors.csv']:
        o = data.add(
            common.Contributor,
            contrib['ID'],
            id=contrib['ID'].upper(),
            name=contrib['Name'],
            description=contrib['Description'],
            url=contrib['URL'],
            jsondata={
                'readme': contrib['Readme'],
                'contents': contrib['Contents']
            },
        )
        for src in contrib['Source']:
            DBSession.add(
                models.ContributorReference(source=data['Source'][src],
                                            contributor=o))

    dataset = data.add(
        common.Dataset,
        'phoible',
        id='phoible',
        name='PHOIBLE 2.0',
        description='PHOIBLE 2.0',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        domain='phoible.org',
        license='https://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'https://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
            'Creative Commons Attribution-ShareAlike 3.0 Unported License'
        })

    for i, (cid, name) in enumerate(
            [('UZ', "Steven Moran"), ('mccloy', "Daniel McCloy")], start=1):
        contrib = data['Contributor'].get(cid)
        if not contrib:
            contrib = common.Contributor(id=cid, name=name)
        DBSession.add(
            common.Editor(dataset=dataset, ord=i, contributor=contrib))

    glottolog = Glottolog(
        Path(phoible.__file__).parent.parent.parent.parent.joinpath(
            'glottolog', 'glottolog'))

    for lang in ds['LanguageTable']:
        l = data.add(
            models.Variety,
            lang['ID'],
            id=lang['ID'],
            name=lang['Name'],
        )

    load_families(
        data,
        [(l.id, l) for l in data['Variety'].values() if len(l.id) == 8],
        glottolog.repos)
    DBSession.flush()

    # assign color codes:
    families = defaultdict(list)
    for l in data['Variety'].values():
        families[l.family_pk].append(l)

    colors = color.qualitative_colors(len(families))
    for i, langs in enumerate(sorted(families.values(),
                                     key=lambda v: -len(v))):
        for l in langs:
            l.jsondata = {'color': colors[i]}

    for segment in ds['ParameterTable']:
        # strip combining and modifier characters to get the plain base segment:
        equivalence_class = ''.join([
            t[0] for t in [(c, unicodedata.name(c)) for c in segment['Name']]
            if t[1].split()[0] not in ['COMBINING', 'MODIFIER']
        ])
        data.add(models.Segment,
                 segment['ID'],
                 id=segment['ID'],
                 name=segment['Name'],
                 description=segment['Description'],
                 segment_class=segment['SegmentClass'],
                 equivalence_class=equivalence_class)
    DBSession.flush()

    # Add redirects for old language pages! get relevant ISO codes and map to Glottocode!
    for model, repls in load(
            Path(phoible.__file__).parent.parent /
            'replacements.json').items():
        if model == 'Language':
            languoids = {l.id: l for l in glottolog.languoids()}
            iso_languoids = {l.iso: l for l in languoids.values() if l.iso}
            gl_in_phoible = set(data['Variety'].keys())
            for oid, nid in repls.items():
                gls = descendants_from_nodemap(
                    iso_languoids.get(oid),
                    languoids).intersection(gl_in_phoible)
                if gls:
                    nid = gls.pop()
                    if len(gls) > 1:
                        print('+++', oid, gls)
                else:
                    print('---', oid)
                common.Config.add_replacement(oid, nid, common.Language)
        elif model == 'Parameter':
            segments_in_phoible = set(data['Segment'].keys())
            for oid, nid in repls.items():
                id_ = nid if nid in segments_in_phoible else None
                common.Config.add_replacement(oid, id_, common.Parameter)

    for segment in ds['ParameterTable']:
        for i, (k, v) in enumerate(sorted(segment.items())):
            if k not in ['ID', 'Name', 'Description', 'SegmentClass']:
                DBSession.add(
                    common.Parameter_data(
                        key=feature_name(k),
                        value=v,
                        ord=i,
                        object_pk=data['Segment'][segment['ID']].pk))

    for inventory in ds['contributions.csv']:
        inv = data.add(
            models.Inventory,
            inventory['ID'],
            id=inventory['ID'],
            name='{0} ({1} {2})'.format(
                inventory['Name'],
                inventory['Contributor_ID'].upper(),
                inventory['ID'],
            ),
            source_url=inventory['URL'],
            count_tone=inventory['count_tones'],
            count_vowel=inventory['count_vowels'],
            count_consonant=inventory['count_consonants'],
        )
        DBSession.add(
            common.ContributionContributor(
                contribution=inv,
                contributor=data['Contributor'][
                    inventory['Contributor_ID'].upper()]))
        for src in inventory['Source']:
            DBSession.add(
                common.ContributionReference(contribution=inv,
                                             source=data['Source'][src]))

    for phoneme in ds['ValueTable']:
        lang = data['Variety'][phoneme['Language_ID']]
        inv = data['Inventory'][phoneme['Contribution_ID']]
        if not inv.language:
            inv.language = lang
        vs = common.ValueSet(
            id=phoneme['ID'],
            contribution=inv,
            language=lang,
            parameter=data['Segment'][phoneme['Parameter_ID']])

        for ref in phoneme['Source']:
            DBSession.add(
                common.ValueSetReference(source=data['Source'][ref],
                                         valueset=vs))

        DBSession.add(
            models.Phoneme(
                id=phoneme['ID'],
                name='%s %s' % (phoneme['Value'], inv.name),
                allophones=' '.join(phoneme['Allophones']),
                marginal=phoneme['Marginal'],
                valueset=vs))

    return
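
The loader above consumes a CLDF StructureDataset; a hedged sketch of inspecting that input directly with pycldf (DS is the metadata path used in main() above):

from pycldf import StructureDataset

ds = StructureDataset.from_metadata(DS)
print(len(list(ds['LanguageTable'])), 'languages,',
      len(list(ds['ParameterTable'])), 'segments,',
      len(list(ds['ValueTable'])), 'phoneme values')
for contrib in ds['contributors.csv']:
    print(contrib['ID'], contrib['Name'])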
Example #5
def main(scripts, dev, glr):
    cldf_dir = Path('cldf')
    bib = parse_string(read_text(cldf_dir / 'sources.bib'),
                       bib_format='bibtex')
    write_text(cldf_dir / 'sources.bib', bib.lower().to_string('bibtex'))

    glottolog = Glottolog(glr)
    ds = StructureDataset.in_dir(cldf_dir)
    ds.tablegroup.notes.append(
        OrderedDict([('dc:title', 'environment'),
                     ('properties',
                      OrderedDict([
                          ('glottolog_version', git_describe(glottolog.repos)),
                      ]))]))
    ds.add_columns('ValueTable', {
        'name': 'Marginal',
        'datatype': 'boolean'
    }, {
        'name': 'Allophones',
        'separator': ' '
    }, 'Contribution_ID')
    features = [
        "tone", "stress", "syllabic", "short", "long", "consonantal",
        "sonorant", "continuant", "delayedRelease", "approximant", "tap",
        "trill", "nasal", "lateral", "labial", "round", "labiodental",
        "coronal", "anterior", "distributed", "strident", "dorsal", "high",
        "low", "front", "back", "tense", "retractedTongueRoot",
        "advancedTongueRoot", "periodicGlottalSource", "epilaryngealSource",
        "spreadGlottis", "constrictedGlottis", "fortis",
        "raisedLarynxEjective", "loweredLarynxImplosive", "click"
    ]
    ds.add_component('ParameterTable', 'SegmentClass', *features)
    ds.add_component('LanguageTable')
    ds.add_table(
        'contributions.csv', 'ID', 'Name', 'Contributor_ID', {
            'name': 'Source',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source',
            'separator': ';'
        }, 'URL')
    ds.add_table(
        'contributors.csv',
        'ID',
        'Name',
        'Description',
        'Readme',
        'Contents',
        {
            'name': 'Source',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source',
            'separator': ';'
        },
        'URL',
    )

    def read(what):
        return reader(scripts / 'to_cldf' / 'cldf' / what, namedtuples=True)

    languoids = {l.id: l for l in glottolog.languoids()}

    values, segments, languages, inventories, sources = [], [], {}, {}, []
    for contrib in read('contributors.csv'):
        sources.append(
            dict(
                ID=contrib.Name,
                Name=contrib.Contributor,
                Description=contrib.Description,
                Readme=desc(dev, contrib.Name),
                Contents=contrib.Contents,
                Source=[
                    c.strip().lower() for c in contrib.Citation.split(';')
                ],
                URL=contrib.SourceURL if contrib.SourceURL != 'NA' else '',
            ))

    pid_map = {}
    for row in read('parameters.csv'):
        pid = md5(row.Description.encode('utf8')).hexdigest().upper()
        pid_map[row.ID] = pid
        segments.append(
            dict(ID=pid,
                 Name=row.Name,
                 Description=row.Description,
                 SegmentClass=row.SegmentClass,
                 **{f: getattr(row, f)
                    for f in features}))

    src = {}
    for row in read('contributions.csv'):
        src[row.ID] = row.References.split(
            ';') if row.References != 'no source given' else []
        src[row.ID] = [sid.lower() for sid in src[row.ID]]
        inventories[row.ID] = dict(ID=row.ID,
                                   Name=row.Name,
                                   Contributor_ID=row.Contributor_ID,
                                   URL=row.URI if row.URI != 'NA' else '',
                                   Source=src[row.ID])

    uniq = set()
    for row in read('values.csv'):
        pk = (row.Language_ID, row.Parameter_ID, row.Contribution_ID)
        if pk in uniq:
            print('skipping duplicate phoneme {0}'.format(pk))
            continue
        uniq.add(pk)
        lid = row.Language_ID if row.Language_ID in languoids else slug(
            inventories[row.Contribution_ID]['Name'])
        if lid not in languages:
            #
            # FIXME: Language_ID == 'NA' for three inventories! This must be mapped!
            #
            lang = languoids.get(lid)
            languages[lid] = dict(
                ID=lid,
                Name=lang.name if lang else None,
                Glottocode=lang.id if lang else None,
                ISO639P3code=row.ISO639P3code
                if row.ISO639P3code != 'NA' else None,
            )
        values.append(
            dict(
                ID=row.ID,
                Language_ID=lid,
                Parameter_ID=pid_map[row.Parameter_ID],
                Contribution_ID=row.Contribution_ID,
                Value=row.Name,
                Marginal=None if row.Marginal == 'NA' else eval(
                    row.Marginal.lower().capitalize()),  # FALSE|TRUE|NA
                Allophones=row.Allophones.split()
                if row.Allophones != 'NA' else [],
                Source=src[row.Contribution_ID],
            ))

    ds.write(
        **{
            'ValueTable': values,
            'LanguageTable': languages.values(),
            'ParameterTable': segments,
            'contributions.csv': inventories.values(),
            'contributors.csv': sources
        })
    ds.validate(logging.getLogger(__name__))
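
Since the Marginal column is declared as boolean, it reads back as a Python bool (or None for missing values). A hedged sketch of re-reading the dataset written above, assuming the script ran in the current directory:

from pycldf import StructureDataset

ds = StructureDataset.from_metadata('cldf/StructureDataset-metadata.json')
values = list(ds['ValueTable'])
marginal = [v for v in values if v['Marginal'] is True]
print(len(marginal), 'marginal phonemes out of', len(values))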
Example #6
    def cmd_makecldf(self, args):
        #
        # Augment the schema of the rather simplistic CLDF download:
        #
        ds = args.writer.cldf
        # Add tables for controlled vocabularies:
        ds.add_table('regions.csv', 'ID', 'Name')
        ds.add_table('varietytypes.csv', 'ID', 'Name', 'Description')
        ds.add_table('featurecategories.csv', 'ID', 'Name', 'Description')
        ds.add_table('contributors.csv', 'ID', 'Name', 'URL', 'Address',
                     'Email')

        # We merge the data from contributions.csv into languages.csv for simplicity:
        ds.remove_table('contributions.csv')

        # Varieties have a region, a type, an abbreviation and contributors.
        ds.add_columns('LanguageTable', 'Description', 'Region_ID', 'Type_ID',
                       'abbr', {
                           'name': 'Contributor_ID',
                           'separator': ' '
                       })
        ds['LanguageTable'].add_foreign_key('Region_ID', 'regions.csv', 'ID')
        ds['LanguageTable'].add_foreign_key('Type_ID', 'varietytypes.csv',
                                            'ID')
        ds['LanguageTable'].add_foreign_key('Contributor_ID',
                                            'contributors.csv', 'ID')

        # Features have a category and a typical example, with source.
        ds.add_columns(
            'ParameterTable',
            'Category_ID',
            'Example_Source',
            {
                'name': 'Attestation',
                'datatype': 'float',
                'dc:description':
                    "Attestation is a relative measure of how widespread a feature is in the set "
                    "of eWAVE varieties. It is expressed as a percentage and is calculated as the "
                    "sum of all A-, B- and C-ratings for a feature, divided by the number of "
                    "varieties in the eWAVE dataset. The closer the value to 100%, the more "
                    "widespread the feature is.",
            },
            {
                'name': 'Pervasiveness',
                'datatype': 'float',
                'dc:description': """\
Pervasiveness provides a measure of how pervasive a feature is on average in the varieties in 
which it is attested. Pervasiveness is calculated as all A-ratings for a feature plus 0.6 times 
the B-ratings for the same feature plus 0.3 times the C-ratings, divided by the sum of all 
A-, B- and C-ratings for the feature. This value is then multiplied by 100 and expressed as a 
percentage. A Pervasiveness value of 100% or close to 100% thus indicates that the feature is 
highly pervasive (rated A) in all or most of the varieties for which it is attested, while a 
value close to 30% (the lowest possible value) indicates that the feature is extremely rare 
(rated C) in most or all of the varieties for which it is attested. Intermediate values are less 
easy to interpret – here one has to look more closely at the ratio of A- to B- to C-values. 
Two more things should also be noted here:

- The Pervasiveness value does not provide information on how widespread a feature is in the entire 
  eWAVE dataset, i.e. for how many varieties the feature is actually attested.
- Since the eWAVE contributors did not all use exactly the same strategies in deciding when to 
  give a feature an A- vs. a B- or a C- vs. a B- rating, it is very difficult to translate the 
  ratings into numerical values that adequately reflect the differences between A-, B- and 
  C-ratings. The choice made here (1 for A, 0.6 for B and 0.3 for C) is certainly only one of 
  many, and further testing is required to see how adequate this model is.
""",
            },
        )
        ds['ParameterTable'].add_foreign_key('Category_ID',
                                             'featurecategories.csv', 'ID')

        # Values may have (many) examples:
        ds.add_columns(
            'ValueTable', {
                'name': 'Example_ID',
                'propertyUrl':
                'http://cldf.clld.org/v1.0/terms.rdf#exampleReference',
                'separator': ' ',
            })
        # ... but no Contribution_ID anymore:
        ds.remove_columns('ValueTable', 'Contribution_ID')

        # Examples may have sources:
        ds.add_columns(
            'ExampleTable', {
                'name': 'Source',
                'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source',
                'separator': ';',
            })

        history = ds.add_table('history.csv', 'Version', 'Language_ID',
                               'Parameter_ID', 'Code_ID')
        history.add_foreign_key('Language_ID', 'languages.csv', 'ID')
        history.add_foreign_key('Parameter_ID', 'parameters.csv', 'ID')
        history.add_foreign_key('Code_ID', 'codes.csv', 'ID')

        #
        # Now add the data:
        #
        ds.add_sources(self.raw_dir.read('sources.bib'))

        args.writer.objects['varietytypes.csv'] = [{
            'ID': r[0],
            'Name': r[1],
            'Description': r[2]
        } for r in self.raw_dir.read_csv('varietytype.psv', delimiter='|')]
        args.writer.objects['featurecategories.csv'] = [{
            'ID': r[0],
            'Name': r[1],
            'Description': r[2]
        } for r in self.raw_dir.read_csv('featurecategory.psv', delimiter='|')]
        args.writer.objects['regions.csv'] = [{
            'ID': r[0],
            'Name': r[1]
        } for r in self.raw_dir.read_csv('region.psv', delimiter='|')]

        for lid, pid, cid, _ in self.raw_dir.read_json('changes.json')['2013']:
            args.writer.objects['history.csv'].append({
                'Version': '1.0',
                'Language_ID': lid,
                'Parameter_ID': pid,
                'Code_ID': '{0}-{1}'.format(pid, cid.replace('?', 'NA')),
            })

        for row in self.raw_dir.read_csv('contributors.csv', dicts=True):
            # columns: id, name, url, email, address
            args.writer.objects['contributors.csv'].append({
                'ID': row['id'],
                'Name': row['name'],
                'URL': row['url'],
                'Email': row['email'],
                'Address': row['address'],
            })

        # We read the bulk of the data from the CLDF export of the website:
        raw_ds = StructureDataset.from_metadata(
            self.raw_dir / 'StructureDataset-metadata.json')

        cc = {
            cid: [r[1] for r in rows]
            for cid, rows in itertools.groupby(
                sorted(self.raw_dir.read_csv('cc.csv'),
                       key=lambda r: (int(r[0]), int(r[2]), int(r[1]))),
                lambda r: r[0],
            )
        }
        desc = {
            r['ID']: r['Description']
            for r in self.raw_dir.read_csv('contributions.csv', dicts=True)
        }
        data = {r[0]: r[1:] for r in self.raw_dir.read_csv('variety.csv')}
        for row in raw_ds['LanguageTable']:
            row['Region_ID'] = data[row['ID']][0]
            row['Type_ID'] = data[row['ID']][1]
            row['abbr'] = data[row['ID']][2]
            row['Description'] = desc[row['ID']]
            row['Contributor_ID'] = cc[row['ID']]
            args.writer.objects['LanguageTable'].append(row)

        data = {r[0]: r[1:] for r in self.raw_dir.read_csv('feature.csv')}
        for row in raw_ds['ParameterTable']:
            row['Example_Source'] = data[row['ID']][0]
            row['Category_ID'] = data[row['ID']][1]
            row['Attestation'] = data[row['ID']][2]
            row['Pervasiveness'] = data[row['ID']][3]
            args.writer.objects['ParameterTable'].append(row)

        # Augment examples.csv
        def ref(r):
            return str(Reference(
                r['source'],
                r['description'].replace('[', '(').replace(']', ')')))

        examplesource = {
            eid: [ref(r) for r in rows]
            for eid, rows in itertools.groupby(
                sorted(self.raw_dir.read_csv('examplesource.csv', dicts=True),
                       key=lambda d: (int(d['example']), d['source'])),
                lambda d: d['example'])
        }
        for row in raw_ds['ExampleTable']:
            row['Source'] = examplesource.get(row['ID'], [])
            args.writer.objects['ExampleTable'].append(row)

        # Renumber codes and values!
        for row in raw_ds['CodeTable']:
            row['ID'] = '{0}-{1}'.format(row['Parameter_ID'],
                                         row['Name'].replace('?', 'NA'))
            args.writer.objects['CodeTable'].append(row)

        valuesentence = {
            vid: [r['sentence'] for r in rows]
            for vid, rows in itertools.groupby(
                sorted(self.raw_dir.read_csv('valueexample.csv', dicts=True),
                       key=lambda d: (int(d['value']), int(d['sentence']))),
                lambda d: d['value'])
        }

        for row in raw_ds['ValueTable']:
            row['Example_ID'] = valuesentence.get(row['ID'], [])
            row['ID'] = '{0}-{1}'.format(row['Language_ID'], row['Parameter_ID'])
            row['Code_ID'] = '{0}-{1}'.format(row['Parameter_ID'], row['Value'] or 'NA')
            args.writer.objects['ValueTable'].append(row)
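
Once the cldfbench build has run, the custom history table can be read back like any other CLDF table. A hedged sketch (the metadata path is an assumption about where the build writes the dataset):

from pycldf import StructureDataset

ds = StructureDataset.from_metadata('cldf/StructureDataset-metadata.json')
for row in ds['history.csv']:
    print(row['Version'], row['Language_ID'], row['Parameter_ID'], row['Code_ID'])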
Example #7
# `forms`, `parametertable` and `languagetable` are assumed to have been built
# earlier in the script from the Szeto et al. (2018) source data.
from pycldf import StructureDataset, Source

formtable = []
idx = 1
for line in forms:
    data = line.strip().split()
    lid = data[0]
    for i, p in enumerate(data[1:]):
        pid = str(i + 1)
        formtable.append({
            "ID": '{0}-{1}-{2}'.format(lid, pid, idx),
            "Value": p,
            "Language_ID": lid,
            "Parameter_ID": pid,
            "Source": ["Szeto2018"],
        })
        idx += 1

ds = StructureDataset.in_dir('cldf')
ds.add_sources(Source(
    'article', 'Szeto2018',
    author='Szeto, Pui Yiu and Ansaldo, Umberto and Matthews, Steven',
    journal='Linguistic Typology',
    pages='233-275',
    title='Typological variation across Mandarin dialects: An areal perspective with a quantitative approach',
    doi='10.1515/lingty-2018-0009',
))

ds.add_component('ParameterTable')
ds.add_component('LanguageTable')
ds.write(ValueTable=formtable, ParameterTable=parametertable,
        LanguageTable=languagetable)

ds.write_metadata()
ds.write_sources()
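
A hedged follow-up: once written, the dataset can be re-loaded and validated (this assumes the script above has been run in the current working directory):

import logging

from pycldf import StructureDataset

ds = StructureDataset.from_metadata('cldf/StructureDataset-metadata.json')
ds.validate(log=logging.getLogger(__name__))
print(len(list(ds['ValueTable'])), 'values for', len(list(ds['LanguageTable'])), 'languages')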
Example #8
File: wals2cldf.py Project: afcarl/pycldf
def make_cldf(db, out, fid):
    # Initialize a CLDF dataset in the output directory, using the appropriate module:
    ds = StructureDataset.in_dir(out)

    # We add the WALS language metadata:
    ds.add_component('LanguageTable', 'Genus', 'Family')

    # And some metadata about the feature:
    ds.add_component('ParameterTable', 'Authors', 'Url', 'Area')
    ds.add_component('CodeTable')

    # Now we collect the data by querying the database:
    values, languages = [], []

    lids = defaultdict(dict)
    for lpk, ids in groupby(db.execute(SQL_IDENTIFIERS), lambda r: r[0]):
        for itype, names in groupby(ids, lambda rr: rr[1]):
            names = [n[2] for n in names]
            if len(names) == 1:
                # only add identifiers for equivalent languoids, ignore partial matches.
                lids[lpk][itype] = names[0]

    # We store the sources and references per datapoint:
    sources, refs = defaultdict(list), defaultdict(list)
    for vspk, rs in groupby(db.execute(SQL_SOURCES), lambda r: r[0]):
        for r in rs:
            ref = r[2]
            if r[1]:
                ref += '[{0}]'.format(
                    r[1])  # add the page info in the correct format.
            refs[vspk].append(ref)
            sources[vspk].append(
                Source(r[3], r[2], author=r[4], year=r[5], title=r[6]))

    codes = {}
    for row in db.execute(SQL_VALUES.format(fid)):
        lpk, lid, lname, vsid, denumber, dename, lat, lon, vspk, gname, fname = row
        ids = lids[lpk]
        if vspk in sources:
            ds.sources.add(*sources[vspk])
        languages.append(
            dict(
                ID=lid,
                Name=lname,
                Latitude=lat,
                Longitude=lon,
                Glottocode=ids.get('glottolog'),
                ISO639P3code=ids.get('iso639-3'),
                Genus=gname,
                Family=fname,
            ))
        values.append(
            dict(
                ID=vsid,
                Language_ID=lid,
                Parameter_ID=fid,
                Value=denumber,
                Code_ID='{0}-{1}'.format(fid, denumber),
                Source=refs.get(vspk, []),
            ))
        codes[denumber] = {
            'ID': '{0}-{1}'.format(fid, denumber),
            'Name': dename,
            'Parameter_ID': fid,
        }

    fname, fauthors, aname = list(db.execute(SQL_FEATURE.format(fid)))[0]
    ds.write(
        ValueTable=values,
        LanguageTable=languages,
        ParameterTable=[{
            'ID': fid,
            'Name': fname,
            'Area': aname,
            'Authors': fauthors,
            'Url': 'http://wals.info/feature/' + fid
        }],
        CodeTable=codes.values(),
    )
Example #9
File: create.py Project: marctang/phoible
def main(scripts, dev, glr):
    cldf_dir = Path('cldf')
    bib = parse_string(read_text(cldf_dir / 'sources.bib'), bib_format='bibtex')
    for _, e in bib.entries.items():
        for field in e.fields:
            e.fields[field] = e.fields[field].replace('\\', '')
    write_text(cldf_dir / 'sources.bib', bib.lower().to_string('bibtex'))

    glottolog = Glottolog(glr)

    ds = StructureDataset.in_dir(cldf_dir)

    def describe_repos(r, org, name=None):
        return OrderedDict([
            ('dc:title', '{0}/{1}'.format(org, name or r.name)),
            ('dc:description', git_describe(r))])

    ds.tablegroup.common_props['prov:wasDerivedFrom'] = [
        describe_repos(dev, 'phoible'),
        describe_repos(scripts, 'bambooforest'),
        describe_repos(glottolog.repos, 'clld'),
    ]
    ds.tablegroup.common_props['prov:wasGeneratedBy'] = describe_repos(
        Path(__file__).parent, 'cldf-datasets', name='phoible')

    ds.add_columns(
        'ValueTable',
        {'name': 'Marginal', 'datatype': 'boolean'},
        {'name': 'Allophones', 'separator': ' '},
        'Contribution_ID')
    features = ["tone","stress","syllabic","short","long","consonantal","sonorant","continuant","delayedRelease","approximant","tap","trill","nasal","lateral","labial","round","labiodental","coronal","anterior","distributed","strident","dorsal","high","low","front","back","tense","retractedTongueRoot","advancedTongueRoot","periodicGlottalSource","epilaryngealSource","spreadGlottis","constrictedGlottis","fortis","raisedLarynxEjective","loweredLarynxImplosive","click"]
    ds.add_component('ParameterTable', 'SegmentClass', *features)
    ds.add_component('LanguageTable', 'Family_Glottocode', 'Family_Name')
    table = ds.add_table(
        'contributions.csv', 
        'ID', 
        'Name', 
        'Contributor_ID', 
        {'name': 'Source', 'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source', 'separator': ';'},
        'URL',
        {'name': 'count_phonemes', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_consonants', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_vowels', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_tones', 'datatype': {'base': 'integer', 'minimum': 0}, 'null': 'NA'},
    )
    table.tableSchema.primaryKey = ['ID']
    table.tableSchema.foreignKeys.append(ForeignKey.fromdict(dict(
        columnReference='Contributor_ID',
        reference=dict(resource='contributors.csv', columnReference='ID'))))
    table.common_props['dc:conformsTo'] = None
    table = ds.add_table(
        'contributors.csv',
        'ID',
        'Name',
        'Description',
        'Readme',
        'Contents',
        {'name': 'Source', 'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source', 'separator': ';'},
        'URL',
        {'name': 'with_tones', 'datatype': {'base': 'boolean', 'format': '1|0'}},
    )
    table.tableSchema.primaryKey = ['ID']
    table.common_props['dc:conformsTo'] = None

    def read(what):
        return reader(scripts / 'to_cldf' / 'cldf' / what, namedtuples=True)

    languoids = {l.id: l for l in glottolog.languoids()}

    values, segments, languages, inventories, sources = [], [], OrderedDict(), OrderedDict(), []
    with_tones = {}
    for contrib in read('contributors.csv'):
        sources.append(dict(
            ID=contrib.Name,
            Name=contrib.Contributor,
            Description=contrib.Description,
            Readme=desc(dev, contrib.Name),
            Contents=contrib.Contents,
            Source=[c.strip().lower() for c in contrib.Citation.split(';')],
            URL=contrib.SourceURL if contrib.SourceURL != 'NA' else '',
            with_tones=contrib.with_tones == '1',
        ))
        with_tones[contrib.Name] = contrib.with_tones == '1'

    pid_map = {}
    for row in read('parameters.csv'):
        pid = md5(row.Description.encode('utf8')).hexdigest().upper()
        pid_map[row.ID] = (pid, row.SegmentClass)
        segments.append(dict(
            ID=pid,
            Name=row.Name,
            Description=row.Description,
            SegmentClass=row.SegmentClass,
            **{f: getattr(row, f) for f in features}
        ))

    src = {}
    for row in read('contributions.csv'):
        src[row.ID] = row.References.split(';') if row.References != 'no source given' else []
        src[row.ID] = [sid.lower() for sid in src[row.ID]]
        inventories[row.ID] = dict(
            ID=row.ID, 
            Name=row.Name, 
            Contributor_ID=row.Contributor_ID.upper(), 
            URL=row.URI if row.URI != 'NA' else '',
            Source=src[row.ID],
            count_phonemes=0,
            count_consonants=0,
            count_vowels=0,
            count_tones=0,
        )

    uniq, counts = set(), Counter()
    for row in read('values.csv'):
        pk = (row.Language_ID, row.Parameter_ID, row.Contribution_ID)
        if pk in uniq:
            print('skipping duplicate phoneme {0}'.format(pk))
            continue
        uniq.add(pk)
        lid = row.Language_ID if row.Language_ID in languoids else slug(inventories[row.Contribution_ID]['Name'])
        if lid not in languages:
            #
            # FIXME: Language_ID == 'NA' for three inventories! This must be mapped!
            #
            lang = languoids.get(lid)
            fam = lang.lineage[0] if lang and lang.lineage else None
            languages[lid] = dict(
                ID=lid,
                Name=lang.name if lang else None,
                Glottocode=lang.id if lang else None,
                ISO639P3code=row.ISO639P3code if row.ISO639P3code != 'NA' else None,
                Macroarea=lang.macroareas[0].value if lang and lang.macroareas else None,
                Latitude=lang.latitude if lang else None,
                Longitude=lang.longitude if lang else None,
                Family_Glottocode=fam[1] if fam else None,
                Family_Name=fam[0] if fam else None,
            )
        pid, sc = pid_map[row.Parameter_ID]
        counts.update([(row.Contribution_ID, sc)])
        values.append(dict(
            ID=row.ID,
            Language_ID=lid,
            Parameter_ID=pid,
            Contribution_ID=row.Contribution_ID,
            Value=row.Name,
            Marginal=None if row.Marginal == 'NA' else eval(row.Marginal.lower().capitalize()),  # FALSE|TRUE|NA
            Allophones=row.Allophones.split() if row.Allophones != 'NA' else [],
            Source=src[row.Contribution_ID],
        ))
    for key, count in counts.items():
        inventories[key[0]]['count_{0}s'.format(key[1])] = count
        inventories[key[0]]['count_phonemes'] += count

    for inv in inventories.values():
        if not with_tones[inv['Contributor_ID']]:
            assert inv['count_tones'] == 0
            inv['count_tones'] = 'NA'

    ds.write(**{
        'ValueTable': values,
        'LanguageTable': languages.values(),
        'ParameterTable': segments,
        'contributions.csv': inventories.values(),
        'contributors.csv': sources
    })
    ds.validate(logging.getLogger(__name__))
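
A hedged sanity check of the per-inventory counts computed above, reading the written dataset back (assumes the script ran in the current directory; count_phonemes should equal the number of values per contribution):

from collections import Counter

from pycldf import StructureDataset

ds = StructureDataset.from_metadata('cldf/StructureDataset-metadata.json')
per_inventory = Counter(v['Contribution_ID'] for v in ds['ValueTable'])
for inv in ds['contributions.csv']:
    if inv['count_phonemes'] != per_inventory[inv['ID']]:
        print('count mismatch for inventory', inv['ID'])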
Example #10
def cldf(api, outdir, log):
    if not outdir.exists():
        outdir.mkdir()
    for p in outdir.iterdir():
        if p.suffix in ['.bib', '.csv', '.json']:
            p.unlink()
    ds = StructureDataset.in_dir(outdir)
    ds.add_provenance(
        wasDerivedFrom=repos('glottolog', clone=api.repos),
        wasGeneratedBy=repos('pyglottolog', version=pyglottolog.__version__),
    )
    ds.add_component('ParameterTable', {'name': 'type', 'default': None})
    ds.add_component('CodeTable', 'numerical_value')
    ds.add_columns('ValueTable', 'codeReference')
    ds.add_component(
        'LanguageTable',
        dict(name='Countries', separator=';'),
        {
            'name': 'Family_ID',
            'dc:description': 'Glottocode of the top-level genetic unit, the '
            'languoid belongs to'},
        {
            'name': 'Language_ID',
            'dc:description': 'Glottocode of the language-level languoid, the '
            'languoid belongs to (in case of dialects)'},
    )
    ds.add_foreign_key('LanguageTable', 'Family_ID', 'LanguageTable', 'ID')
    ds.add_foreign_key('LanguageTable', 'Language_ID', 'LanguageTable', 'ID')

    ds['LanguageTable', 'Macroarea'].separator = ';'
    ds['ValueTable', 'Value'].null = ['<NA>']

    data = collections.defaultdict(list)
    data['ParameterTable'].extend([
        dict(ID='level', Name='Level', type='categorical'),
        dict(ID='category', Name='Category', type='categorical'),
        dict(ID='classification', Name='Classification'),
        dict(ID='subclassification', Name='Subclassification'),
        dict(ID='aes', Name='Agglomerated Endangerment Status', type='sequential'),
        dict(ID='med', Name='Most Extensive Description', type='sequential'),
    ])
    for level in api.languoid_levels.values():
        data['CodeTable'].append(dict(
            ID='level-{0}'.format(level.name),
            Parameter_ID='level',
            Name=level.name,
            Description=level.description,
            numerical_value=level.ordinal))
        data['CodeTable'].append(dict(
            ID='category-{0}'.format(level.name.capitalize()),
            Parameter_ID='category',
            Name=level.name.capitalize()))
    for el in sorted(api.language_types.values()):
        data['CodeTable'].append(dict(
            ID='category-{0}'.format(el.category.replace(' ', '_')),
            Parameter_ID='category',
            Name=el.category))
    for el in sorted(api.aes_status.values()):
        data['CodeTable'].append(dict(
            ID='aes-{0}'.format(el.name.replace(' ', '_')),
            Parameter_ID='aes',
            Name=el.name,
            numerical_value=el.ordinal))
    for el in sorted(api.med_types.values()):
        data['CodeTable'].append(dict(
            ID='med-{0}'.format(el.id),
            Parameter_ID='med',
            Name=el.name,
            Description=el.description,
            numerical_value=el.rank))
    languoids = collections.OrderedDict((l.id, l) for l in api.languoids())
    refs_by_languoid, refs = api.refs_by_languoid(languoids)

    def get_language_id(l):
        if l.level == api.languoid_levels.dialect:
            for _, lid, _ in reversed(l.lineage):
                if languoids[lid].level == api.languoid_levels.language:
                    return lid

    def format_ref(ref):
        return '{0}[{1}]'.format(ref.key, ref.pages.replace(';', ',')) if ref.pages else ref.key

    for l in languoids.values():
        data['LanguageTable'].append(dict(
            ID=l.id,
            Name=l.name,
            Glottocode=l.id,
            ISO639P3code=l.iso,
            Latitude=l.latitude,
            Longitude=l.longitude,
            Macroarea=[ma.name for ma in l.macroareas],
            Countries=[c.id for c in l.countries],
            Family_ID=l.lineage[0][1] if l.lineage else None,
            Language_ID=get_language_id(l),
        ))
        med = sorted(refs_by_languoid[l.id], reverse=True)[0] if l.id in refs_by_languoid else None
        if med:
            ds.add_sources(Source(med.type, med.id, _check_id=False, **med.fields))
        clf = l.classification_comment
        if clf:
            for ref in clf.merged_refs('family') + clf.merged_refs('sub'):
                if ref.key not in refs:
                    log.warning('missing reference in classification comment: {0}'.format(ref))
                    continue
                e = refs[ref.key]
                ds.add_sources(Source(e.type, ref.key, _check_id=False, **e.fields))

        aes_src = l.endangerment.source.reference_id if l.endangerment else None
        if aes_src:
            e = refs[aes_src]
            ds.add_sources(Source(e.type, aes_src, _check_id=False, **e.fields))

        data['ValueTable'].extend([
            value(
                l.id,
                'level',
                l.level.name,
                Code_ID='level-{0}'.format(l.level.name)),
            value(l.id, 'category', l.category.replace(' ', '_')),
            value(
                l.id,
                'classification',
                '/'.join(l[1] for l in l.lineage),
                Source=[format_ref(ref) for ref in clf.merged_refs('family')] if clf else [],
                Comment=clf.family if clf else None,
            ),
            value(
                l.id,
                'subclassification',
                l.newick_node(nodes=languoids, template="{l.id}").newick,
                Source=[format_ref(ref) for ref in clf.merged_refs('sub')] if clf else [],
                Comment=clf.sub if clf else None,
            ),
            value(
                l.id,
                'aes',
                l.endangerment.status.name if l.endangerment else None,
                Comment=l.endangerment.comment if l.endangerment else None,
                Source=[aes_src] if aes_src else [],
                Code_ID='aes-{0}'.format(
                    l.endangerment.status.name.replace(' ', '_')) if l.endangerment else None),
            value(
                l.id,
                'med',
                med.med_type.name if med else None,
                Source=[med.id] if med else [],
                Code_ID='med-{0}'.format(med.med_type.id) if med else None),
        ])

    ds.write(outdir / 'cldf-metadata.json', **data)
    ds.validate(log=log)
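
A hedged sketch of reading the Glottolog CLDF dump written above, mapping AES values to their code labels (assumes the same `outdir` as in the function above):

from pycldf import StructureDataset

ds = StructureDataset.from_metadata(outdir / 'cldf-metadata.json')
codes = {c['ID']: c['Name'] for c in ds['CodeTable']}
for v in ds['ValueTable']:
    if v['Parameter_ID'] == 'aes' and v['Code_ID']:
        print(v['Language_ID'], codes[v['Code_ID']])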