def test_create(api, wiki, capsys, tmp_path):
    cldf_repos = tmp_path
    cldf.create(StructureDataset.in_dir(cldf_repos / 'cldf'), api, Path(__file__).parent / 'glottolog')
    #captured = capsys.readouterr()
    #assert 'inconsistent' in captured.out
    ds = StructureDataset.from_metadata(cldf_repos / 'cldf' / 'StructureDataset-metadata.json')
    assert len(list(ds['ValueTable'])) == 1
    assert ds['contributors.csv', 'Photo'].valueUrl.expand(list(ds['contributors.csv'])[0]) == \
        'https://glottobank.org/photos/abc'
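# Reading a written dataset back, as the test above does: from_metadata() loads
# the dataset via the metadata file that StructureDataset.in_dir() created, and
# tables can be iterated as dicts keyed by column name.
from pycldf import StructureDataset

ds = StructureDataset.from_metadata('cldf/StructureDataset-metadata.json')
for row in ds['ValueTable']:
    print(row['Language_ID'], row['Value'])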
def run(args):
    cldf = StructureDataset.in_dir(args.cldf_repos / 'cldf')
    if args.glottolog_version != 'test':  # pragma: no cover
        with Catalog(args.glottolog, args.glottolog_version) as glottolog:
            write_metadata(cldf, args, glottolog)
    else:
        write_metadata(cldf, args, None)
    write_schema(cldf)
    cldf.write(**get_data(cldf, args))
    shutil.copy(str(args.repos.path('LICENSE.txt')), str(args.cldf_repos))
    if not args.dev:
        cldf.validate(log=args.log)
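# A minimal sketch of the catalog checkout pattern used in run(): entering the
# with-block checks out the given tag of a repository clone, and the previous
# state is restored on exit. The clone path and tag here are hypothetical.
from pathlib import Path
from cldfcatalog import Catalog

with Catalog(Path('~/repos/glottolog').expanduser(), 'v4.4') as glottolog:
    ...  # within this block, the clone is checked out at tag v4.4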
def make_cldf(db, out, fid):
    # Initialize a CLDF dataset in the output directory, using the appropriate module:
    ds = StructureDataset.in_dir(out)
    # We add the WALS language metadata:
    ds.add_component('LanguageTable', 'Genus', 'Family')
    # And some metadata about the feature:
    ds.add_component('ParameterTable', 'Authors', 'Url', 'Area')
    ds.add_component('CodeTable')

    # Now we collect the data by querying the database:
    values, languages = [], []
    lids = defaultdict(dict)
    for lpk, ids in groupby(db.execute(SQL_IDENTIFIERS), lambda r: r[0]):
        for itype, names in groupby(ids, lambda rr: rr[1]):
            names = [n[2] for n in names]
            if len(names) == 1:
                # only add identifiers for equivalent languoids, ignore partial matches.
                lids[lpk][itype] = names[0]

    # We store the sources and references per datapoint:
    sources, refs = defaultdict(list), defaultdict(list)
    for vspk, rs in groupby(db.execute(SQL_SOURCES), lambda r: r[0]):
        for r in rs:
            ref = r[2]
            if r[1]:
                ref += '[{0}]'.format(r[1])  # add the page info in the correct format.
            refs[vspk].append(ref)
            sources[vspk].append(Source(r[3], r[2], author=r[4], year=r[5], title=r[6]))

    codes = {}
    for row in db.execute(SQL_VALUES.format(fid)):
        lpk, lid, lname, vsid, denumber, dename, lat, lon, vspk, gname, fname = row
        ids = lids[lpk]
        if vspk in sources:
            ds.sources.add(*sources[vspk])
        languages.append(dict(
            ID=lid,
            Name=lname,
            Latitude=lat,
            Longitude=lon,
            Glottocode=ids.get('glottolog'),
            ISO639P3code=ids.get('iso639-3'),
            Genus=gname,
            Family=fname,
        ))
        values.append(dict(
            ID=vsid,
            Language_ID=lid,
            Parameter_ID=fid,
            Value=denumber,
            Code_ID='{0}-{1}'.format(fid, denumber),
            Source=refs.get(vspk, []),
        ))
        codes[denumber] = {
            'ID': '{0}-{1}'.format(fid, denumber),
            'Name': dename,
            'Parameter_ID': fid,
        }

    fname, fauthors, aname = list(db.execute(SQL_FEATURE.format(fid)))[0]
    ds.write(
        ValueTable=values,
        LanguageTable=languages,
        ParameterTable=[{
            'ID': fid,
            'Name': fname,
            'Area': aname,
            'Authors': fauthors,
            'Url': 'http://wals.info/feature/' + fid}],
        CodeTable=codes.values(),
    )
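# itertools.groupby, used above to bucket query rows, only merges *consecutive*
# rows with equal keys, so the queries behind SQL_IDENTIFIERS and SQL_SOURCES
# are expected to ORDER BY the grouping column. A minimal illustration:
from itertools import groupby

rows = [(1, 'a'), (1, 'b'), (2, 'c')]  # already sorted by the first field
for key, group in groupby(rows, lambda r: r[0]):
    print(key, [g[1] for g in group])
# 1 ['a', 'b']
# 2 ['c']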
def main(scripts, dev, glr):
    cldf_dir = Path('cldf')
    bib = parse_string(read_text(cldf_dir / 'sources.bib'), bib_format='bibtex')
    write_text(cldf_dir / 'sources.bib', bib.lower().to_string('bibtex'))
    glottolog = Glottolog(glr)
    ds = StructureDataset.in_dir(cldf_dir)
    ds.tablegroup.notes.append(OrderedDict([
        ('dc:title', 'environment'),
        ('properties', OrderedDict([
            ('glottolog_version', git_describe(glottolog.repos)),
        ]))]))
    ds.add_columns(
        'ValueTable',
        {'name': 'Marginal', 'datatype': 'boolean'},
        {'name': 'Allophones', 'separator': ' '},
        'Contribution_ID')
    features = [
        "tone", "stress", "syllabic", "short", "long", "consonantal", "sonorant",
        "continuant", "delayedRelease", "approximant", "tap", "trill", "nasal",
        "lateral", "labial", "round", "labiodental", "coronal", "anterior",
        "distributed", "strident", "dorsal", "high", "low", "front", "back",
        "tense", "retractedTongueRoot", "advancedTongueRoot",
        "periodicGlottalSource", "epilaryngealSource", "spreadGlottis",
        "constrictedGlottis", "fortis", "raisedLarynxEjective",
        "loweredLarynxImplosive", "click",
    ]
    ds.add_component('ParameterTable', 'SegmentClass', *features)
    ds.add_component('LanguageTable')
    ds.add_table(
        'contributions.csv',
        'ID', 'Name', 'Contributor_ID',
        {'name': 'Source', 'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source', 'separator': ';'},
        'URL')
    ds.add_table(
        'contributors.csv',
        'ID', 'Name', 'Description', 'Readme', 'Contents',
        {'name': 'Source', 'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source', 'separator': ';'},
        'URL',
    )

    def read(what):
        return reader(scripts / 'to_cldf' / 'cldf' / what, namedtuples=True)

    languoids = {l.id: l for l in glottolog.languoids()}

    values, segments, languages, inventories, sources = [], [], {}, {}, []
    for contrib in read('contributors.csv'):
        sources.append(dict(
            ID=contrib.Name,
            Name=contrib.Contributor,
            Description=contrib.Description,
            Readme=desc(dev, contrib.Name),
            Contents=contrib.Contents,
            Source=[c.strip().lower() for c in contrib.Citation.split(';')],
            URL=contrib.SourceURL if contrib.SourceURL != 'NA' else '',
        ))

    pid_map = {}
    for row in read('parameters.csv'):
        pid = md5(row.Description.encode('utf8')).hexdigest().upper()
        pid_map[row.ID] = pid
        segments.append(dict(
            ID=pid,
            Name=row.Name,
            Description=row.Description,
            SegmentClass=row.SegmentClass,
            **{f: getattr(row, f) for f in features}))

    src = {}
    for row in read('contributions.csv'):
        src[row.ID] = row.References.split(';') if row.References != 'no source given' else []
        src[row.ID] = [sid.lower() for sid in src[row.ID]]
        inventories[row.ID] = dict(
            ID=row.ID,
            Name=row.Name,
            Contributor_ID=row.Contributor_ID,
            URL=row.URI if row.URI != 'NA' else '',
            Source=src[row.ID])

    uniq = set()
    for row in read('values.csv'):
        pk = (row.Language_ID, row.Parameter_ID, row.Contribution_ID)
        if pk in uniq:
            print('skipping duplicate phoneme {0}'.format(pk))
            continue
        uniq.add(pk)
        lid = row.Language_ID if row.Language_ID in languoids else slug(
            inventories[row.Contribution_ID]['Name'])
        if lid not in languages:
            #
            # FIXME: Language_ID == 'NA' for three inventories! This must be mapped!
            #
            lang = languoids.get(lid)
            languages[lid] = dict(
                ID=lid,
                Name=lang.name if lang else None,
                Glottocode=lang.id if lang else None,
                ISO639P3code=row.ISO639P3code if row.ISO639P3code != 'NA' else None,
            )
        values.append(dict(
            ID=row.ID,
            Language_ID=lid,
            Parameter_ID=pid_map[row.Parameter_ID],
            Contribution_ID=row.Contribution_ID,
            Value=row.Name,
            Marginal=None if row.Marginal == 'NA' else eval(row.Marginal.lower().capitalize()),  # FALSE|TRUE|NA
            Allophones=row.Allophones.split() if row.Allophones != 'NA' else [],
            Source=src[row.Contribution_ID],
        ))

    ds.write(**{
        'ValueTable': values,
        'LanguageTable': languages.values(),
        'ParameterTable': segments,
        'contributions.csv': inventories.values(),
        'contributors.csv': sources})
    ds.validate(logging.getLogger(__name__))
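# The Marginal column above is parsed via eval() over the strings
# FALSE|TRUE|NA. An equivalent, eval-free formulation of that expression:
def parse_marginal(s):
    return None if s == 'NA' else s.lower() == 'true'

assert parse_marginal('TRUE') is True
assert parse_marginal('FALSE') is False
assert parse_marginal('NA') is None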
# `forms`, `parametertable` and `languagetable` are assumed to be populated
# earlier in the script; only `formtable` is built here.
formtable = []
idx = 1
for line in forms:
    data = line.strip().split()
    lid = data[0]
    for i, p in enumerate(data[1:]):
        pid = str(i + 1)
        formtable += [{
            "ID": '{0}-{1}-{2}'.format(lid, pid, idx),
            "Value": p,
            "Language_ID": lid,
            "Parameter_ID": pid,
            "Source": ["Szeto2018"],
        }]
        idx += 1

ds = StructureDataset.in_dir('cldf')
ds.add_sources(Source(
    'article', 'Szeto2018',
    author='Szeto, Pui Yiu and Ansaldo, Umberto and Matthews, Steven',
    journal='Linguistic Typology',
    pages='233-275',
    title='Typological variation across Mandarin dialects: An areal perspective with a quantitative approach',
    doi='10.1515/lingty-2018-0009',
))
ds.add_component('ParameterTable')
ds.add_component('LanguageTable')
ds.write(ValueTable=formtable, ParameterTable=parametertable, LanguageTable=languagetable)
ds.write_metadata()
ds.write_sources()
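# Hypothetical minimal rows for the two tables referenced above, matching the
# default ParameterTable/LanguageTable schemas; the IDs, names and the
# glottocode are invented for illustration:
parametertable = [{'ID': '1', 'Name': 'feature-1'}]
languagetable = [{'ID': 'pek', 'Name': 'Beijing Mandarin', 'Glottocode': 'beij1234'}]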
def main(scripts, dev, glr):
    cldf_dir = Path('cldf')
    bib = parse_string(read_text(cldf_dir / 'sources.bib'), bib_format='bibtex')
    for _, e in bib.entries.items():
        for field in e.fields:
            e.fields[field] = e.fields[field].replace('\\', '')
    write_text(cldf_dir / 'sources.bib', bib.lower().to_string('bibtex'))
    glottolog = Glottolog(glr)
    ds = StructureDataset.in_dir(cldf_dir)

    def describe_repos(r, org, name=None):
        return OrderedDict([
            ('dc:title', '{0}/{1}'.format(org, name or r.name)),
            ('dc:description', git_describe(r))])

    ds.tablegroup.common_props['prov:wasDerivedFrom'] = [
        describe_repos(dev, 'phoible'),
        describe_repos(scripts, 'bambooforest'),
        describe_repos(glottolog.repos, 'clld'),
    ]
    ds.tablegroup.common_props['prov:wasGeneratedBy'] = describe_repos(
        Path(__file__).parent, 'cldf-datasets', name='phoible')
    ds.add_columns(
        'ValueTable',
        {'name': 'Marginal', 'datatype': 'boolean'},
        {'name': 'Allophones', 'separator': ' '},
        'Contribution_ID')
    features = [
        "tone", "stress", "syllabic", "short", "long", "consonantal", "sonorant",
        "continuant", "delayedRelease", "approximant", "tap", "trill", "nasal",
        "lateral", "labial", "round", "labiodental", "coronal", "anterior",
        "distributed", "strident", "dorsal", "high", "low", "front", "back",
        "tense", "retractedTongueRoot", "advancedTongueRoot",
        "periodicGlottalSource", "epilaryngealSource", "spreadGlottis",
        "constrictedGlottis", "fortis", "raisedLarynxEjective",
        "loweredLarynxImplosive", "click",
    ]
    ds.add_component('ParameterTable', 'SegmentClass', *features)
    ds.add_component('LanguageTable', 'Family_Glottocode', 'Family_Name')
    table = ds.add_table(
        'contributions.csv',
        'ID', 'Name', 'Contributor_ID',
        {'name': 'Source', 'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source', 'separator': ';'},
        'URL',
        {'name': 'count_phonemes', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_consonants', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_vowels', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_tones', 'datatype': {'base': 'integer', 'minimum': 0}, 'null': 'NA'},
    )
    table.tableSchema.primaryKey = ['ID']
    table.tableSchema.foreignKeys.append(ForeignKey.fromdict(dict(
        columnReference='Contributor_ID',
        reference=dict(resource='contributors.csv', columnReference='ID'))))
    table.common_props['dc:conformsTo'] = None
    table = ds.add_table(
        'contributors.csv',
        'ID', 'Name', 'Description', 'Readme', 'Contents',
        {'name': 'Source', 'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source', 'separator': ';'},
        'URL',
        {'name': 'with_tones', 'datatype': {'base': 'boolean', 'format': '1|0'}},
    )
    table.tableSchema.primaryKey = ['ID']
    table.common_props['dc:conformsTo'] = None

    def read(what):
        return reader(scripts / 'to_cldf' / 'cldf' / what, namedtuples=True)

    languoids = {l.id: l for l in glottolog.languoids()}

    values, segments, languages, inventories, sources = [], [], OrderedDict(), OrderedDict(), []
    with_tones = {}
    for contrib in read('contributors.csv'):
        sources.append(dict(
            ID=contrib.Name,
            Name=contrib.Contributor,
            Description=contrib.Description,
            Readme=desc(dev, contrib.Name),
            Contents=contrib.Contents,
            Source=[c.strip().lower() for c in contrib.Citation.split(';')],
            URL=contrib.SourceURL if contrib.SourceURL != 'NA' else '',
            with_tones=contrib.with_tones == '1',
        ))
        with_tones[contrib.Name] = contrib.with_tones == '1'

    pid_map = {}
    for row in read('parameters.csv'):
        pid = md5(row.Description.encode('utf8')).hexdigest().upper()
        pid_map[row.ID] = (pid, row.SegmentClass)
        segments.append(dict(
            ID=pid,
            Name=row.Name,
            Description=row.Description,
            SegmentClass=row.SegmentClass,
            **{f: getattr(row, f) for f in features}
        ))

    src = {}
    for row in read('contributions.csv'):
        src[row.ID] = row.References.split(';') if row.References != 'no source given' else []
        src[row.ID] = [sid.lower() for sid in src[row.ID]]
        inventories[row.ID] = dict(
            ID=row.ID,
            Name=row.Name,
            Contributor_ID=row.Contributor_ID.upper(),
            URL=row.URI if row.URI != 'NA' else '',
            Source=src[row.ID],
            count_phonemes=0,
            count_consonants=0,
            count_vowels=0,
            count_tones=0,
        )

    uniq, counts = set(), Counter()
    for row in read('values.csv'):
        pk = (row.Language_ID, row.Parameter_ID, row.Contribution_ID)
        if pk in uniq:
            print('skipping duplicate phoneme {0}'.format(pk))
            continue
        uniq.add(pk)
        lid = row.Language_ID if row.Language_ID in languoids else slug(
            inventories[row.Contribution_ID]['Name'])
        if lid not in languages:
            #
            # FIXME: Language_ID == 'NA' for three inventories! This must be mapped!
            #
            lang = languoids.get(lid)
            fam = lang.lineage[0] if lang and lang.lineage else None
            languages[lid] = dict(
                ID=lid,
                Name=lang.name if lang else None,
                Glottocode=lang.id if lang else None,
                ISO639P3code=row.ISO639P3code if row.ISO639P3code != 'NA' else None,
                Macroarea=lang.macroareas[0].value if lang and lang.macroareas else None,
                Latitude=lang.latitude if lang else None,
                Longitude=lang.longitude if lang else None,
                Family_Glottocode=fam[1] if fam else None,
                Family_Name=fam[0] if fam else None,
            )
        pid, sc = pid_map[row.Parameter_ID]
        counts.update([(row.Contribution_ID, sc)])
        values.append(dict(
            ID=row.ID,
            Language_ID=lid,
            Parameter_ID=pid,
            Contribution_ID=row.Contribution_ID,
            Value=row.Name,
            Marginal=None if row.Marginal == 'NA' else eval(row.Marginal.lower().capitalize()),  # FALSE|TRUE|NA
            Allophones=row.Allophones.split() if row.Allophones != 'NA' else [],
            Source=src[row.Contribution_ID],
        ))

    for key, count in counts.items():
        inventories[key[0]]['count_{0}s'.format(key[1])] = count
        inventories[key[0]]['count_phonemes'] += count

    for inv in inventories.values():
        if not with_tones[inv['Contributor_ID']]:
            assert inv['count_tones'] == 0
            inv['count_tones'] = 'NA'

    ds.write(**{
        'ValueTable': values,
        'LanguageTable': languages.values(),
        'ParameterTable': segments,
        'contributions.csv': inventories.values(),
        'contributors.csv': sources})
    ds.validate(logging.getLogger(__name__))
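# The Counter above keys on (Contribution_ID, SegmentClass) pairs; assuming the
# SegmentClass values are 'consonant', 'vowel' and 'tone', the
# 'count_{0}s'.format(...) lookup hits the count_* columns declared for
# contributions.csv. A minimal illustration:
from collections import Counter

counts = Counter()
for segment_class in ['consonant', 'consonant', 'vowel', 'tone']:
    counts.update([('inv1', segment_class)])
print(counts[('inv1', 'consonant')])  # 2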
def cldf(api, outdir, log):
    if not outdir.exists():
        outdir.mkdir()
    for p in outdir.iterdir():
        if p.suffix in ['.bib', '.csv', '.json']:
            p.unlink()

    ds = StructureDataset.in_dir(outdir)
    ds.add_provenance(
        wasDerivedFrom=repos('glottolog', clone=api.repos),
        wasGeneratedBy=repos('pyglottolog', version=pyglottolog.__version__),
    )
    ds.add_component('ParameterTable', {'name': 'type', 'default': None})
    ds.add_component('CodeTable', 'numerical_value')
    ds.add_columns('ValueTable', 'codeReference')
    ds.add_component(
        'LanguageTable',
        dict(name='Countries', separator=';'),
        {
            'name': 'Family_ID',
            'dc:description': 'Glottocode of the top-level genetic unit, the languoid belongs to'},
        {
            'name': 'Language_ID',
            'dc:description': 'Glottocode of the language-level languoid, the languoid belongs to (in case of dialects)'},
    )
    ds.add_foreign_key('LanguageTable', 'Family_ID', 'LanguageTable', 'ID')
    ds.add_foreign_key('LanguageTable', 'Language_ID', 'LanguageTable', 'ID')
    ds['LanguageTable', 'Macroarea'].separator = ';'
    ds['ValueTable', 'Value'].null = ['<NA>']

    data = collections.defaultdict(list)
    data['ParameterTable'].extend([
        dict(ID='level', Name='Level', type='categorical'),
        dict(ID='category', Name='Category', type='categorical'),
        dict(ID='classification', Name='Classification'),
        dict(ID='subclassification', Name='Subclassification'),
        dict(ID='aes', Name='Agglomerated Endangerment Status', type='sequential'),
        dict(ID='med', Name='Most Extensive Description', type='sequential'),
    ])
    for level in api.languoid_levels.values():
        data['CodeTable'].append(dict(
            ID='level-{0}'.format(level.name),
            Parameter_ID='level',
            Name=level.name,
            Description=level.description,
            numerical_value=level.ordinal))
        data['CodeTable'].append(dict(
            ID='category-{0}'.format(level.name.capitalize()),
            Parameter_ID='category',
            Name=level.name.capitalize()))
    for el in sorted(api.language_types.values()):
        data['CodeTable'].append(dict(
            ID='category-{0}'.format(el.category.replace(' ', '_')),
            Parameter_ID='category',
            Name=el.category))
    for el in sorted(api.aes_status.values()):
        data['CodeTable'].append(dict(
            ID='aes-{0}'.format(el.name.replace(' ', '_')),
            Parameter_ID='aes',
            Name=el.name,
            numerical_value=el.ordinal))
    for el in sorted(api.med_types.values()):
        data['CodeTable'].append(dict(
            ID='med-{0}'.format(el.id),
            Parameter_ID='med',
            Name=el.name,
            Description=el.description,
            numerical_value=el.rank))

    languoids = collections.OrderedDict((l.id, l) for l in api.languoids())
    refs_by_languoid, refs = api.refs_by_languoid(languoids)

    def get_language_id(l):
        if l.level == api.languoid_levels.dialect:
            for _, lid, _ in reversed(l.lineage):
                if languoids[lid].level == api.languoid_levels.language:
                    return lid

    def format_ref(ref):
        return '{0}[{1}]'.format(ref.key, ref.pages.replace(';', ',')) if ref.pages else ref.key

    for l in languoids.values():
        data['LanguageTable'].append(dict(
            ID=l.id,
            Name=l.name,
            Glottocode=l.id,
            ISO639P3code=l.iso,
            Latitude=l.latitude,
            Longitude=l.longitude,
            Macroarea=[ma.name for ma in l.macroareas],
            Countries=[c.id for c in l.countries],
            Family_ID=l.lineage[0][1] if l.lineage else None,
            Language_ID=get_language_id(l),
        ))
        med = sorted(refs_by_languoid[l.id], reverse=True)[0] if l.id in refs_by_languoid else None
        if med:
            ds.add_sources(Source(med.type, med.id, _check_id=False, **med.fields))
        clf = l.classification_comment
        if clf:
            for ref in clf.merged_refs('family') + clf.merged_refs('sub'):
                if ref.key not in refs:
                    log.warning('missing reference in classification comment: {0}'.format(ref))
                    continue
                e = refs[ref.key]
                ds.add_sources(Source(e.type, ref.key, _check_id=False, **e.fields))
        aes_src = l.endangerment.source.reference_id if l.endangerment else None
        if aes_src:
            e = refs[aes_src]
            ds.add_sources(Source(e.type, aes_src, _check_id=False, **e.fields))
        data['ValueTable'].extend([
            value(
                l.id,
                'level',
                l.level.name,
                Code_ID='level-{0}'.format(l.level.name)),
            value(l.id, 'category', l.category.replace(' ', '_')),
            value(
                l.id,
                'classification',
                '/'.join(l[1] for l in l.lineage),
                Source=[format_ref(ref) for ref in clf.merged_refs('family')] if clf else [],
                Comment=clf.family if clf else None,
            ),
            value(
                l.id,
                'subclassification',
                l.newick_node(nodes=languoids, template="{l.id}").newick,
                Source=[format_ref(ref) for ref in clf.merged_refs('sub')] if clf else [],
                Comment=clf.sub if clf else None,
            ),
            value(
                l.id,
                'aes',
                l.endangerment.status.name if l.endangerment else None,
                Comment=l.endangerment.comment if l.endangerment else None,
                Source=[aes_src] if aes_src else [],
                Code_ID='aes-{0}'.format(
                    l.endangerment.status.name.replace(' ', '_')) if l.endangerment else None),
            value(
                l.id,
                'med',
                med.med_type.name if med else None,
                Source=[med.id] if med else [],
                Code_ID='med-{0}'.format(med.med_type.id) if med else None),
        ])

    ds.write(outdir / 'cldf-metadata.json', **data)
    ds.validate(log=log)
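# The value() helper used above is defined elsewhere in pyglottolog; a sketch
# of what it presumably constructs. The ID scheme here is an assumption, viable
# because each (languoid, parameter) pair occurs at most once in this export:
def value(lid, pid, val, **kw):
    return dict(
        ID='{0}-{1}'.format(lid, pid),
        Language_ID=lid,
        Parameter_ID=pid,
        Value=val,
        **kw)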