def test_BibFile(self):
    bibfile = self.api.bibfiles['a.bib']
    self.assertEqual(bibfile['a:key'].type, 'misc')
    self.assertEqual(bibfile['s:Andalusi:Turk'].key, 's:Andalusi:Turk')

    for entry in bibfile.iterentries():
        if entry.key == 'key':
            self.assertEqual(len(list(entry.languoids({'abc': 1})[0])), 1)

    with self.assertRaises(KeyError):
        bibfile['xyz']

    self.assertEqual(len(list(bibfile.iterentries())), 3)

    lines = [line for line in read_text(bibfile.fname).split('\n')
             if not line.strip().startswith('glottolog_ref_id')]
    write_text(self.tmp_path('a.bib'), '\n'.join(lines))
    bibfile.update(self.tmp_path('a.bib'))
    self.assertEqual(len(list(bibfile.iterentries())), 3)

    bibfile.update(self.api.bibfiles['b.bib'].fname)
    self.assertEqual(len(list(bibfile.iterentries())), 1)

    def visitor(entry):
        entry.fields['new_field'] = 'a'

    bibfile.visit(visitor=visitor)
    for entry in bibfile.iterentries():
        self.assertIn('new_field', entry.fields)

    bibfile.visit(visitor=lambda e: True)
    self.assertEqual(len(bibfile.keys()), 0)
def app(args):  # pragma: no cover
    """
    Dumps Concepticon's contents for English, German, Chinese, French,
    Russian, Spanish, and Portuguese.

    Notes
    -----
    Data are by default dumped into a structured JSON file in html/data.js.

    Examples
    --------
    $ concepticon html
    """
    data = defaultdict(list)

    def key(g, l):
        return '{0}---{1}'.format(g, l)

    for lang in ['en', 'de', 'zh', 'fr', 'ru', 'es', 'pt']:
        for cidx, gloss in args.api._get_map_for_language(lang):
            g0, _, g1 = gloss.partition('///')
            csspec = (
                cidx,
                args.api.conceptsets[cidx].gloss,
                args.api.conceptsets[cidx].definition,
                args.api.conceptsets[cidx].ontological_category)
            data[key(g1, lang)].append(csspec)
            if lang == 'en':
                data[key(g0, lang)].append(csspec)
                data[key(g0.lower(), lang)].append(csspec)
    data['language'] = 'en'
    write_text(
        args.api.appdatadir.joinpath('data.js'),
        'var Concepticon = {0};\n'.format(json.dumps(data, indent=2)))
    args.log.info('app data recreated')
def test_Dataset_from_scratch(tmpdir, data):
    # An unknown file name cannot be used with Dataset.from_data:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'xyz.csv'))
    with pytest.raises(ValueError):
        Dataset.from_data(str(tmpdir / 'xyz.csv'))

    # Known file name, but non-standard column name:
    write_text(
        str(tmpdir / 'values.csv'),
        "IDX,Language_ID,Parameter_ID,Value\n1,1,1,1")
    with pytest.raises(ValueError, match='missing columns'):
        ds = Dataset.from_data(str(tmpdir / 'values.csv'))

    # A known file name will determine the CLDF module of the dataset:
    copy(str(data / 'ds1.csv'), str(tmpdir / 'values.csv'))
    ds = Dataset.from_data(str(tmpdir / 'values.csv'))
    assert ds.module == 'StructureDataset'

    assert len(list(ds['ValueTable'])) == 2
    ds.validate()

    ds['ValueTable'].write(2 * list(ds['ValueTable']))
    with pytest.raises(ValueError):
        ds.validate()
    md = ds.write_metadata()
    Dataset.from_metadata(md)
    repr(ds)
    del ds.tablegroup.common_props['dc:conformsTo']
    Dataset.from_metadata(ds.write_metadata())
    assert len(ds.stats()) == 1

    ds.add_table('extra.csv', 'ID')
    ds.write(**{'ValueTable': [], 'extra.csv': []})
    counts = {r[0]: r[2] for r in ds.stats()}
    assert counts['extra.csv'] == 0
def test_encoding(tmppath):
    ini = tmppath / 'test.ini'
    write_text(ini, '[äöü]\näöü = äöü', encoding='cp1252')
    with pytest.raises(UnicodeDecodeError):
        INI.from_file(ini)
    assert INI.from_file(ini, encoding='cp1252')['äöü']['äöü'] == 'äöü'
def new_dataset(args):
    """
    lexibank new-dataset OUTDIR [ID]
    """
    if not args.args:
        raise ParserError('you must specify an output directory')
    outdir = Path(args.args.pop(0))
    if not outdir.exists():
        raise ParserError('you must specify an existing directory')

    id_pattern = re.compile('[a-z_0-9]+$')
    md = {}
    if args.args:
        md['id'] = args.args.pop(0)
    else:
        md['id'] = input('Dataset ID: ')

    while not id_pattern.match(md['id']):
        print('dataset id must only consist of lowercase ascii letters, digits and _ (underscore)!')
        md['id'] = input('Dataset ID: ')

    outdir = outdir / md['id']
    if not outdir.exists():
        outdir.mkdir()

    for key in ['title', 'url', 'license', 'conceptlist', 'citation']:
        md[key] = input('Dataset {0}: '.format(key))

    # check license!
    # check conceptlist!

    for path in Path(pylexibank.__file__).parent.joinpath('dataset_template').iterdir():
        if path.is_file():
            if path.suffix in ['.pyc']:
                continue  # pragma: no cover
            target = path.name
            content = read_text(path)
            if '+' in path.name:
                target = re.sub(
                    r'\+([a-z]+)\+',
                    lambda m: '{' + m.groups()[0] + '}',
                    path.name).format(**md)
            if target.endswith('_tmpl'):
                target = target[:-5]
                content = content.format(**md)
            write_text(outdir / target, content)
        else:
            target = outdir / path.name
            if target.exists():
                shutil.rmtree(str(target))
            shutil.copytree(str(path), str(target))
    del md['id']
    jsonlib.dump(md, outdir / 'metadata.json', indent=4)
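# A minimal sketch (not part of the dataset template) of the '+key+' file name
# convention handled above: '+id+' in a template file name is rewritten to
# '{id}' and then filled from the metadata dict. `expand_template_name` is a
# hypothetical helper for illustration only.
import re

def expand_template_name(name, md):
    """E.g. 'test_+id+.py' with md={'id': 'abc'} yields 'test_abc.py'."""
    return re.sub(r'\+([a-z]+)\+', lambda m: '{' + m.group(1) + '}', name).format(**md)

assert expand_template_name('test_+id+.py', {'id': 'abc'}) == 'test_abc.py'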
def readme(args):
    md = ['# Sources', '']
    for datatype in ['datasets', 'phylogenies']:
        md.append('\n## {0}\n'.format(datatype.capitalize()))
        t = Table('Name', 'Reference')
        for obj in getattr(args.repos, datatype):
            if not obj.id.startswith('glottolog_') or obj.id == 'glottolog_global':
                t.append([
                    '[{0}]({1}/{2})'.format(obj.name, datatype, obj.id),
                    obj.reference])
        md.append(t.render(condensed=False))
    write_text(args.repos.dir.joinpath('SOURCES.md'), '\n'.join(md))
def dump(args, version, all_langs, identifiers):
    # `args` added to the signature: the original referenced an undefined
    # `args` name for data_file().
    out = args.data_file('files', 'glottolog-{0}'.format(version))
    if out.exists():
        for p in out.iterdir():
            remove(p)
    else:
        out.mkdir()

    langs = all_langs[version].values()
    langs_by_pk = {l.pk: l for l in langs}
    children = {
        pk: list(c) for pk, c in
        groupby(sorted(langs, key=lambda l: l.fpk or 0), lambda l: l.fpk)}

    for lang in langs:
        ancestors, fpk = [], lang.fpk
        while fpk and fpk in langs_by_pk:
            ancestors.append(langs_by_pk[fpk])
            fpk = langs_by_pk[fpk].fpk

        versions = [
            '<strong><a href="http://glottolog.org/resource/languoid/id/{0}">'
            '[{0}] in current Glottolog</a></strong>'.format(lang.id)]
        for v in sorted(all_langs.keys()):
            if v != version:
                if lang.id in all_langs[v]:
                    versions.append(all_langs[v][lang.id].cross_version_link)

        clf = [link_list(children.get(lang.pk, []))]
        clf.append(lang.text)
        clf.extend(a.link for a in ancestors)

        write_text(
            out.joinpath('{0}.html'.format(lang.id)),
            T.render_unicode(
                version=version,
                lang=lang,
                clf=reduce(wrap, clf) if not lang.replacements else '',
                versions=versions,
                identifiers=identifiers.get(lang.pk, []),
                replacements=[
                    all_langs[version][lid].link for lid in lang.replacements
                    if lid in all_langs[version]],
                wrap=wrap,
                link_list=link_list,
            ))
def newick(args):
    from pyglottolog.languoids import Level

    nodes = collections.OrderedDict((l.id, l) for l in args.repos.languoids())
    trees = []
    for lang in nodes.values():
        if not lang.lineage and not lang.category.startswith('Pseudo '):
            ns = lang.newick_node(nodes=nodes).newick
            if lang.level == Level.language and not ns.startswith('('):
                # an isolate without dialects: we wrap it in a pseudo-family with the
                # same name and ID.
                ns = '({0}){0}'.format(ns)
            trees.append('{0};'.format(ns))

    fname = args.pkg_dir.joinpath('static', 'download', 'tree-glottolog-newick.txt')
    write_text(fname, '\n'.join(trees))
    args.log.info('{0} written'.format(fname))
def test_read_write(tmppath):
    from clldutils.path import read_text, write_text

    text = 'äöüß'
    p = tmppath / 'test'
    assert write_text(p, text) == len(text)
    assert read_text(p) == text
def write_valid_soundfilepaths(args):
    """
    Creates the file 'valid_soundfilepaths.txt' containing all valid
    sound file paths based on database data.
    """
    db = _db(args)
    api = _api(args)
    # make sure all data will be concatenated
    db("SET @@group_concat_max_len = 4096")
    query = """
        SELECT concat(L.FilePathPart,"/",L.FilePathPart,
                      W.SoundFileWordIdentifierText) as P
        FROM Words AS W, Languages AS L
        WHERE L.study <> 'Europe'
          AND W.study <> 'Europe'
          AND L.study = W.study
        UNION
        SELECT concat(
            L.FilePathPart,"/",L.FilePathPart,
            W.SoundFileWordIdentifierText,
            case
                when T.AlternativeLexemIx > 1
                     and T.AlternativePhoneticRealisationIx = 0
                    then concat("_lex", T.AlternativeLexemIx)
                when T.AlternativeLexemIx = 0
                     and T.AlternativePhoneticRealisationIx > 1
                    then concat("_pron", T.AlternativePhoneticRealisationIx)
                when T.AlternativeLexemIx > 1
                     and T.AlternativePhoneticRealisationIx > 1
                    then concat("_lex", T.AlternativeLexemIx,
                                "_pron", T.AlternativePhoneticRealisationIx)
                else ""
            end) as P
        FROM Transcriptions AS T, Words AS W, Languages AS L
        WHERE L.study <> 'Europe'
          AND T.study <> 'Europe'
          AND W.study <> 'Europe'
          AND L.`LanguageIx` = T.`LanguageIx`
          AND W.`IxElicitation` = T.`IxElicitation`
          AND W.IxMorphologicalInstance = T.IxMorphologicalInstance
          AND L.study = W.study
        ORDER BY 1 ASC
    """
    data = list(db(query))
    valid_snd_file_names = set()
    for row in data:
        valid_snd_file_names.add(row['P'])
    write_text(
        api.repos / 'soundfiles' / 'valid_soundfilepaths.txt',
        '\n'.join(sorted(valid_snd_file_names, key=lambda s: s.lower())))
def read_url(path, cache_dir=None, log=None):
    """
    Delegate scraping to clldutils, since nowadays this requires tweaking the
    user agent as well.
    """
    if cache_dir:
        cache_dir = Path(cache_dir)
        if log:  # pragma: no cover
            log.debug('retrieving {0} ...'.format(path))
        fpath = cache_dir / hashlib.md5(path.encode('utf8')).hexdigest()
        if not fpath.exists():
            with iso_639_3._open(path) as fp:
                write_text(fpath, fp.read().decode('utf8'))
        else:  # pragma: no cover
            if log:
                log.debug('... from cache {0}'.format(fpath))
        return read_text(fpath)

    with iso_639_3._open(path) as fp:
        return fp.read().decode('utf8')
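# A hedged sketch of the cache layout read_url relies on: the cache file name
# is simply the md5 hex digest of the requested path. `cache_file` is a
# hypothetical helper mirroring the `fpath` computation above.
import hashlib
from pathlib import Path

def cache_file(cache_dir, path):
    # same key scheme as: fpath = cache_dir / md5(path.encode('utf8')).hexdigest()
    return Path(cache_dir) / hashlib.md5(path.encode('utf8')).hexdigest()

assert cache_file('/tmp/cache', 'tables/iso-639-3.tab').name == \
    hashlib.md5(b'tables/iso-639-3.tab').hexdigest()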
def dump(out, version, all_langs, identifiers):
    if out.exists():
        for p in out.iterdir():
            remove(p)
    else:
        out.mkdir()

    langs = all_langs[version].values()
    langs_by_pk = {l.pk: l for l in langs}
    children = {
        pk: list(c) for pk, c in
        groupby(sorted(langs, key=lambda l: l.fpk or 0), lambda l: l.fpk)}

    for lang in langs:
        ancestors, fpk = [], lang.fpk
        while fpk and fpk in langs_by_pk:
            ancestors.append(langs_by_pk[fpk])
            fpk = langs_by_pk[fpk].fpk

        versions = [
            '<strong><a href="http://glottolog.org/resource/languoid/id/{0}">'
            '[{0}] in current Glottolog</a></strong>'.format(lang.id)]
        for v in sorted(all_langs.keys()):
            if v != version:
                if lang.id in all_langs[v]:
                    versions.append(all_langs[v][lang.id].cross_version_link)

        clf = [link_list(children.get(lang.pk, []))]
        clf.append(lang.text)
        clf.extend(a.link for a in ancestors)

        write_text(
            out.joinpath('{0}.html'.format(lang.id)),
            T.render_unicode(
                version=version,
                lang=lang,
                clf=reduce(wrap, clf) if not lang.replacements else '',
                versions=versions,
                identifiers=identifiers.get(lang.pk, []),
                replacements=[
                    all_langs[version][lid].link for lid in lang.replacements
                    if lid in all_langs[version]],
                wrap=wrap,
                link_list=link_list,
            ))
def test_BibFile(tmpdir, bibfiles):
    bf = bibfiles['a.bib']
    assert bf['a:key'].type == 'misc'
    assert bf['s:Andalusi:Turk'].key == 's:Andalusi:Turk'

    for entry in bf.iterentries():
        if entry.key == 'key':
            assert len(list(entry.languoids({'abc': 1})[0])) == 1

    with pytest.raises(KeyError):
        bf['xyz']

    assert len(list(bf.iterentries())) == 3

    lines = [line for line in read_text(bf.fname).split('\n')
             if not line.strip().startswith('glottolog_ref_id')]
    write_text(str(tmpdir / 'a.bib'), '\n'.join(lines))

    entries = bf.load()  # FIXME
    bf.fname = str(tmpdir / 'newa.bib')
    bf.save(entries)

    bf.update(str(tmpdir / 'a.bib'))
    assert len(list(bf.iterentries())) == 3

    bf.update(bibfiles['b.bib'].fname)
    assert len(list(bf.iterentries())) == 1

    def visitor(entry):
        entry.fields['new_field'] = 'a'

    bf.visit(visitor=visitor)
    for entry in bf.iterentries():
        assert 'new_field' in entry.fields

    bf.visit(visitor=lambda e: True)
    assert len(bf.keys()) == 0
def _install(self, **kw):
    self.log = kw.get('log', self.log)
    self.unmapped.clear()
    for p in self.cldf_dir.iterdir():
        if p.name not in ['README.md', '.gitattributes']:
            p.unlink()
    self.tr_analyses = {}
    self.tr_bad_words = []
    self.tr_invalid_words = []

    if len(self.metadata.conceptlist):
        self.conceptlist = self.concepticon.conceptlists[self.metadata.conceptlist[0]]

    if self.cmd_install(**kw) == NOOP:
        return

    if self.metadata.known_license:
        legalcode = self.metadata.known_license.legalcode
        if legalcode:
            write_text(self.dir / 'LICENSE', legalcode)

    gitattributes = self.cldf_dir / '.gitattributes'
    if not gitattributes.exists():
        with gitattributes.open('wt') as fp:
            fp.write('*.csv text eol=crlf')

    if kw.get('verbose'):
        self.unmapped.pprint()
    self.cldf.validate(kw['log'])

    stats = transcription.Stats(
        bad_words=sorted(self.tr_bad_words[:100], key=lambda x: x['ID']),
        bad_words_count=len(self.tr_bad_words),
        invalid_words=sorted(self.tr_invalid_words[:100], key=lambda x: x['ID']),
        invalid_words_count=len(self.tr_invalid_words))

    for lid, analysis in self.tr_analyses.items():
        for attribute in ['segments', 'bipa_errors', 'sclass_errors', 'replacements']:
            getattr(stats, attribute).update(getattr(analysis, attribute))
        stats.general_errors += analysis.general_errors
        stats.inventory_size += len(analysis.segments) / len(self.tr_analyses)

    error_segments = stats.bipa_errors.union(stats.sclass_errors)
    for i, row in enumerate(stats.bad_words):
        analyzed_segments = []
        for s in row['Segments']:
            analyzed_segments.append('<s> %s </s>' % s if s in error_segments else s)
        stats.bad_words[i] = [
            row['ID'],
            row['Language_ID'],
            row['Parameter_ID'],
            row['Form'],
            ' '.join(analyzed_segments)]

    for i, row in enumerate(stats.invalid_words):
        stats.invalid_words[i] = [
            row['ID'], row['Language_ID'], row['Parameter_ID'], row['Form']]

    # Aggregate transcription analysis results ...
    tr = dict(
        by_language={k: attr.asdict(v) for k, v in self.tr_analyses.items()},
        stats=attr.asdict(stats))

    # ... and write a report:
    for text, fname in [
        (transcription.report(tr), 'TRANSCRIPTION.md'),
        (self.report(tr, log=kw.get('log')), 'README.md'),
    ]:
        textdump(text, self.dir / fname, log=kw.get('log'))
def main(scripts, dev, glr):
    cldf_dir = Path('cldf')
    bib = parse_string(read_text(cldf_dir / 'sources.bib'), bib_format='bibtex')
    write_text(cldf_dir / 'sources.bib', bib.lower().to_string('bibtex'))

    glottolog = Glottolog(glr)
    ds = StructureDataset.in_dir(cldf_dir)
    ds.tablegroup.notes.append(OrderedDict([
        ('dc:title', 'environment'),
        ('properties', OrderedDict([
            ('glottolog_version', git_describe(glottolog.repos)),
        ]))
    ]))
    ds.add_columns(
        'ValueTable',
        {'name': 'Marginal', 'datatype': 'boolean'},
        {'name': 'Allophones', 'separator': ' '},
        'Contribution_ID')
    features = [
        "tone", "stress", "syllabic", "short", "long", "consonantal", "sonorant",
        "continuant", "delayedRelease", "approximant", "tap", "trill", "nasal",
        "lateral", "labial", "round", "labiodental", "coronal", "anterior",
        "distributed", "strident", "dorsal", "high", "low", "front", "back",
        "tense", "retractedTongueRoot", "advancedTongueRoot",
        "periodicGlottalSource", "epilaryngealSource", "spreadGlottis",
        "constrictedGlottis", "fortis", "raisedLarynxEjective",
        "loweredLarynxImplosive", "click"]
    ds.add_component('ParameterTable', 'SegmentClass', *features)
    ds.add_component('LanguageTable')
    ds.add_table(
        'contributions.csv', 'ID', 'Name', 'Contributor_ID',
        {'name': 'Source',
         'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source',
         'separator': ';'},
        'URL')
    ds.add_table(
        'contributors.csv', 'ID', 'Name', 'Description', 'Readme', 'Contents',
        {'name': 'Source',
         'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source',
         'separator': ';'},
        'URL',
    )

    def read(what):
        return reader(scripts / 'to_cldf' / 'cldf' / what, namedtuples=True)

    languoids = {l.id: l for l in glottolog.languoids()}

    values, segments, languages, inventories, sources = [], [], {}, {}, []
    for contrib in read('contributors.csv'):
        sources.append(dict(
            ID=contrib.Name,
            Name=contrib.Contributor,
            Description=contrib.Description,
            Readme=desc(dev, contrib.Name),
            Contents=contrib.Contents,
            Source=[c.strip().lower() for c in contrib.Citation.split(';')],
            URL=contrib.SourceURL if contrib.SourceURL != 'NA' else '',
        ))

    pid_map = {}
    for row in read('parameters.csv'):
        pid = md5(row.Description.encode('utf8')).hexdigest().upper()
        pid_map[row.ID] = pid
        segments.append(dict(
            ID=pid,
            Name=row.Name,
            Description=row.Description,
            SegmentClass=row.SegmentClass,
            **{f: getattr(row, f) for f in features}))

    src = {}
    for row in read('contributions.csv'):
        src[row.ID] = row.References.split(';') \
            if row.References != 'no source given' else []
        src[row.ID] = [sid.lower() for sid in src[row.ID]]
        inventories[row.ID] = dict(
            ID=row.ID,
            Name=row.Name,
            Contributor_ID=row.Contributor_ID,
            URL=row.URI if row.URI != 'NA' else '',
            Source=src[row.ID])

    uniq = set()
    for row in read('values.csv'):
        pk = (row.Language_ID, row.Parameter_ID, row.Contribution_ID)
        if pk in uniq:
            print('skipping duplicate phoneme {0}'.format(pk))
            continue
        uniq.add(pk)
        lid = row.Language_ID if row.Language_ID in languoids \
            else slug(inventories[row.Contribution_ID]['Name'])
        if lid not in languages:
            #
            # FIXME: Language_ID == 'NA' for three inventories! This must be mapped!
            #
            lang = languoids.get(lid)
            languages[lid] = dict(
                ID=lid,
                Name=lang.name if lang else None,
                Glottocode=lang.id if lang else None,
                ISO639P3code=row.ISO639P3code if row.ISO639P3code != 'NA' else None,
            )
        values.append(dict(
            ID=row.ID,
            Language_ID=lid,
            Parameter_ID=pid_map[row.Parameter_ID],
            Contribution_ID=row.Contribution_ID,
            Value=row.Name,
            Marginal=None if row.Marginal == 'NA'
                else eval(row.Marginal.lower().capitalize()),  # FALSE|TRUE|NA
            Allophones=row.Allophones.split() if row.Allophones != 'NA' else [],
            Source=src[row.Contribution_ID],
        ))

    ds.write(**{
        'ValueTable': values,
        'LanguageTable': languages.values(),
        'ParameterTable': segments,
        'contributions.csv': inventories.values(),
        'contributors.csv': sources})
    ds.validate(logging.getLogger(__name__))
def test_Dataset_from_data_empty_file(tmpdir):
    write_text(str(tmpdir / 'values.csv'), '')
    with pytest.raises(ValueError, match='empty data file'):
        Dataset.from_data(str(tmpdir / 'values.csv'))
def main(scripts, dev, glr):
    cldf_dir = Path('cldf')
    bib = parse_string(read_text(cldf_dir / 'sources.bib'), bib_format='bibtex')
    for _, e in bib.entries.items():
        for field in e.fields:
            e.fields[field] = e.fields[field].replace('\\', '')
    write_text(cldf_dir / 'sources.bib', bib.lower().to_string('bibtex'))

    glottolog = Glottolog(glr)
    ds = StructureDataset.in_dir(cldf_dir)

    def describe_repos(r, org, name=None):
        return OrderedDict([
            ('dc:title', '{0}/{1}'.format(org, name or r.name)),
            ('dc:description', git_describe(r))])

    ds.tablegroup.common_props['prov:wasDerivedFrom'] = [
        describe_repos(dev, 'phoible'),
        describe_repos(scripts, 'bambooforest'),
        describe_repos(glottolog.repos, 'clld'),
    ]
    ds.tablegroup.common_props['prov:wasGeneratedBy'] = describe_repos(
        Path(__file__).parent, 'cldf-datasets', name='phoible')

    ds.add_columns(
        'ValueTable',
        {'name': 'Marginal', 'datatype': 'boolean'},
        {'name': 'Allophones', 'separator': ' '},
        'Contribution_ID')
    features = [
        "tone", "stress", "syllabic", "short", "long", "consonantal", "sonorant",
        "continuant", "delayedRelease", "approximant", "tap", "trill", "nasal",
        "lateral", "labial", "round", "labiodental", "coronal", "anterior",
        "distributed", "strident", "dorsal", "high", "low", "front", "back",
        "tense", "retractedTongueRoot", "advancedTongueRoot",
        "periodicGlottalSource", "epilaryngealSource", "spreadGlottis",
        "constrictedGlottis", "fortis", "raisedLarynxEjective",
        "loweredLarynxImplosive", "click"]
    ds.add_component('ParameterTable', 'SegmentClass', *features)
    ds.add_component('LanguageTable', 'Family_Glottocode', 'Family_Name')
    table = ds.add_table(
        'contributions.csv', 'ID', 'Name', 'Contributor_ID',
        {'name': 'Source',
         'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source',
         'separator': ';'},
        'URL',
        {'name': 'count_phonemes', 'required': True,
         'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_consonants', 'required': True,
         'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_vowels', 'required': True,
         'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_tones',
         'datatype': {'base': 'integer', 'minimum': 0}, 'null': 'NA'},
    )
    table.tableSchema.primaryKey = ['ID']
    table.tableSchema.foreignKeys.append(ForeignKey.fromdict(dict(
        columnReference='Contributor_ID',
        reference=dict(resource='contributors.csv', columnReference='ID'))))
    table.common_props['dc:conformsTo'] = None
    table = ds.add_table(
        'contributors.csv', 'ID', 'Name', 'Description', 'Readme', 'Contents',
        {'name': 'Source',
         'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source',
         'separator': ';'},
        'URL',
        {'name': 'with_tones', 'datatype': {'base': 'boolean', 'format': '1|0'}},
    )
    table.tableSchema.primaryKey = ['ID']
    table.common_props['dc:conformsTo'] = None

    def read(what):
        return reader(scripts / 'to_cldf' / 'cldf' / what, namedtuples=True)

    languoids = {l.id: l for l in glottolog.languoids()}

    values, segments, languages, inventories, sources = \
        [], [], OrderedDict(), OrderedDict(), []
    with_tones = {}
    for contrib in read('contributors.csv'):
        sources.append(dict(
            ID=contrib.Name,
            Name=contrib.Contributor,
            Description=contrib.Description,
            Readme=desc(dev, contrib.Name),
            Contents=contrib.Contents,
            Source=[c.strip().lower() for c in contrib.Citation.split(';')],
            URL=contrib.SourceURL if contrib.SourceURL != 'NA' else '',
            with_tones=contrib.with_tones == '1',
        ))
        with_tones[contrib.Name] = contrib.with_tones == '1'

    pid_map = {}
    for row in read('parameters.csv'):
        pid = md5(row.Description.encode('utf8')).hexdigest().upper()
        pid_map[row.ID] = (pid, row.SegmentClass)
        segments.append(dict(
            ID=pid,
            Name=row.Name,
            Description=row.Description,
            SegmentClass=row.SegmentClass,
            **{f: getattr(row, f) for f in features}))

    src = {}
    for row in read('contributions.csv'):
        src[row.ID] = row.References.split(';') \
            if row.References != 'no source given' else []
        src[row.ID] = [sid.lower() for sid in src[row.ID]]
        inventories[row.ID] = dict(
            ID=row.ID,
            Name=row.Name,
            Contributor_ID=row.Contributor_ID.upper(),
            URL=row.URI if row.URI != 'NA' else '',
            Source=src[row.ID],
            count_phonemes=0,
            count_consonants=0,
            count_vowels=0,
            count_tones=0,
        )

    uniq, counts = set(), Counter()
    for row in read('values.csv'):
        pk = (row.Language_ID, row.Parameter_ID, row.Contribution_ID)
        if pk in uniq:
            print('skipping duplicate phoneme {0}'.format(pk))
            continue
        uniq.add(pk)
        lid = row.Language_ID if row.Language_ID in languoids \
            else slug(inventories[row.Contribution_ID]['Name'])
        if lid not in languages:
            #
            # FIXME: Language_ID == 'NA' for three inventories! This must be mapped!
            #
            lang = languoids.get(lid)
            fam = lang.lineage[0] if lang and lang.lineage else None
            languages[lid] = dict(
                ID=lid,
                Name=lang.name if lang else None,
                Glottocode=lang.id if lang else None,
                ISO639P3code=row.ISO639P3code if row.ISO639P3code != 'NA' else None,
                Macroarea=lang.macroareas[0].value if lang and lang.macroareas else None,
                Latitude=lang.latitude if lang else None,
                Longitude=lang.longitude if lang else None,
                Family_Glottocode=fam[1] if fam else None,
                Family_Name=fam[0] if fam else None,
            )
        pid, sc = pid_map[row.Parameter_ID]
        counts.update([(row.Contribution_ID, sc)])
        values.append(dict(
            ID=row.ID,
            Language_ID=lid,
            Parameter_ID=pid,
            Contribution_ID=row.Contribution_ID,
            Value=row.Name,
            Marginal=None if row.Marginal == 'NA'
                else eval(row.Marginal.lower().capitalize()),  # FALSE|TRUE|NA
            Allophones=row.Allophones.split() if row.Allophones != 'NA' else [],
            Source=src[row.Contribution_ID],
        ))

    for key, count in counts.items():
        inventories[key[0]]['count_{0}s'.format(key[1])] = count
        inventories[key[0]]['count_phonemes'] += count

    for inv in inventories.values():
        if not with_tones[inv['Contributor_ID']]:
            assert inv['count_tones'] == 0
            inv['count_tones'] = 'NA'

    ds.write(**{
        'ValueTable': values,
        'LanguageTable': languages.values(),
        'ParameterTable': segments,
        'contributions.csv': inventories.values(),
        'contributors.csv': sources})
    ds.validate(logging.getLogger(__name__))
def write_js_var(self, var_name, var, *path):
    p = self.path(*path)
    write_text(p, 'var ' + var_name + ' = ' + json.dumps(var, indent=2) + ';')
    self.file_written(p)
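# A usage sketch for write_js_var, assuming a receiver with path() and
# file_written() as above; the call and file name are illustrative:
#
#   api.write_js_var('Concepticon', {'language': 'en'}, 'html', 'data.js')
#
# would write html/data.js containing:
#
#   var Concepticon = {
#     "language": "en"
#   };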
def htmlmap(args, min_langs_for_legend_item=10):
    """
    glottolog --repos=. htmlmap [OUTDIR] [GLOTTOCODES]
    """
    nodes = {n.id: n for n in args.repos.languoids()}
    legend = Counter()

    glottocodes = None
    if len(args.args) > 1:
        glottocodes = read_text(args.args[1]).split()

    langs = []
    for n in nodes.values():
        if ((glottocodes is None and n.level == args.repos.languoid_levels.language)
                or (glottocodes and n.id in glottocodes)) and n.latitude is not None:
            fid = n.lineage[0][1] if n.lineage else n.id
            if (not nodes[fid].category.startswith('Pseudo')) or fid == n.id:
                langs.append((n, fid))
                legend.update([fid])

    color_map = [fid for fid, _ in legend.most_common()]
    color_map = dict(zip(color_map, qualitative_colors(len(color_map))))
    print(color_map)

    def l2f(t):
        n, fid = t
        lon, lat = n.longitude, n.latitude
        if lon <= -26:
            lon += 360  # make the map pacific-centered.
        return {
            "geometry": {"coordinates": [lon, lat], "type": "Point"},
            "id": n.id,
            "properties": {
                "name": n.name,
                "color": color_map[fid],
                "family": nodes[fid].name,
                "family_id": fid,
            },
            "type": "Feature",
        }

    def legend_item(fid, c):
        return \
            '<span style="background-color: {0}; border: 1px solid black;">'\
            ' </span> '\
            '<a href="https://glottolog.org/resource/languoid/id/{1}">{2}</a> ({3})'.format(
                color_map[fid], fid, nodes[fid].name, c)

    geojson = {
        "features": list(map(l2f, langs)),
        "properties": {
            "legend": {
                fid: legend_item(fid, c)
                for fid, c in legend.most_common()
                if c >= min_langs_for_legend_item},
        },
        "type": "FeatureCollection",
    }

    def rendered_template(name, **kw):
        return Template(
            read_text(
                Path(pyglottolog.__file__).parent.joinpath(
                    'templates', 'htmlmap', name))).substitute(**kw)

    jsname = 'glottolog_map.json'
    outdir = Path('.') if not args.args else Path(args.args[0])
    write_text(
        outdir.joinpath(jsname),
        rendered_template('htmlmap.js', geojson=dumps(geojson, indent=4)))
    html = outdir.joinpath('glottolog_map.html')
    write_text(
        html,
        rendered_template(
            'htmlmap.html',
            version=git_describe(args.repos.repos),
            jsname=jsname,
            nlangs=len(langs)))
    print(html.resolve().as_uri())
def write(self, fname, text, encoding='utf8'):
    write_text(self.joinpath(fname), text, encoding=encoding)
    return fname
def _download(self, **kw):
    self.cmd_download(**kw)
    write_text(
        self.raw / 'README.md',
        'Raw data downloaded {0}'.format(datetime.utcnow().isoformat()))
def htmlmap(args):
    """
    glottolog htmlmap [OUTDIR]
    """
    nodes = {n.id: n for n in args.repos.languoids()}
    legend = Counter()

    langs = []
    for n in nodes.values():
        if n.level == Level.language and n.latitude is not None:
            fid = n.lineage[0][1] if n.lineage else n.id
            if not nodes[fid].category.startswith('Pseudo'):
                langs.append((n, fid))
                legend.update([fid])

    color_map = {
        fid: "{0:0{1}X}".format((i + 1) * 10, 3)
        for i, fid in enumerate(sorted(legend.keys()))}

    def l2f(t):
        n, fid = t
        lon, lat = n.longitude, n.latitude
        if lon <= -26:
            lon += 360
        return {
            "geometry": {"coordinates": [lon, lat], "type": "Point"},
            "id": n.id,
            "properties": {
                "name": n.name,
                "color": color_map[fid],
                "family": nodes[fid].name,
                "family_id": fid,
            },
            "type": "Feature",
        }

    def legend_item(fid, c):
        return \
            '<span style="background-color: #{0}; border: 1px solid black;">'\
            ' </span> '\
            '<a href="http://glottolog.org/resource/languoid/id/{1}">{2}</a> ({3})'.format(
                color_map[fid], fid, nodes[fid].name, c)

    geojson = {
        # list(...) so the features survive JSON serialization under Python 3,
        # where map() returns an iterator:
        "features": list(map(l2f, langs)),
        "properties": {
            "legend": {
                fid: legend_item(fid, c) for fid, c in legend.most_common() if c > 10},
        },
        "type": "FeatureCollection",
    }

    def rendered_template(name, **kw):
        return Template(
            read_text(
                Path(pyglottolog.__file__).parent.joinpath(
                    'templates', 'htmlmap', name))).substitute(**kw)

    jsname = 'glottolog_map.json'
    outdir = Path('.') if not args.args else Path(args.args[0])
    write_text(
        outdir.joinpath(jsname),
        rendered_template('htmlmap.js', geojson=dumps(geojson, indent=4)))
    html = outdir.joinpath('glottolog_map.html')
    write_text(
        html,
        rendered_template(
            'htmlmap.html',
            version=git_describe(args.repos.repos),
            jsname=jsname,
            nlangs=len(langs)))
    print(html.resolve().as_uri())