def word_length(args):
    from pyconcepticon.api import Concepticon
    c = Concepticon(args.concepticon_repos)
    res = defaultdict(lambda: defaultdict(list))

    def _word_length(ds, **kw):
        ds.word_length(res)

    with_dataset(args, _word_length)
    concepts = c.conceptsets
    languoids = {l.id: l for l in Glottolog(args.glottolog_repos).languoids()}
    with UnicodeWriter('wordlength.csv') as writer:
        writer.writerow([
            'Concepticon_ID', 'Gloss', 'Semanticfield', 'Category',
            'Glottocode', 'Variety', 'Family', 'Form', 'Length'])
        for pid, langs in res.items():
            if len(langs) >= 500:
                for (lang, variety), forms in langs.items():
                    if lang in languoids:
                        lengths = [len(f.split()) for f in forms]
                        lang = languoids[lang]
                        family = lang.lineage[0][0] if lang.lineage else ''
                        c = concepts[pid]
                        writer.writerow([
                            pid, c['GLOSS'], c['SEMANTICFIELD'],
                            c['ONTOLOGICAL_CATEGORY'], lang.id, variety,
                            family, forms[0], sum(lengths) / len(lengths)])
def main():  # pragma: no cover
    parser = ArgumentParserWithLogging('pyglottolog')
    parser.add_argument(
        '--repos',
        help="path to glottolog data repository",
        type=Glottolog,
        default=Glottolog())
    sys.exit(parser.main())
def cldf(args):
    """
    Create CLDF datasets from the raw data for a dataset.

    lexibank --glottolog-repos PATH --concepticon-repos PATH cldf [DATASET_ID]
    """
    if not args.glottolog_repos or not Path(args.glottolog_repos).exists():
        raise ParserError('Invalid glottolog repository path given')

    if not args.concepticon_repos or not Path(args.concepticon_repos).exists():
        raise ParserError('Invalid concepticon repository path given')

    # FIXME: get dict of all glottolog langs right here, and attach to datasets!
    try:
        languoids = load('glottolog')
    except ValueError:
        languoids = {
            l.id: l for l in Glottolog(args.glottolog_repos).languoids()}
        dump(languoids, 'glottolog')

    def _cldf(ds, **kw):
        ds.glottolog_languoids = languoids
        ds.cldf(**kw)
        ds.write_cognates()

    with_dataset(args, _cldf)
def iter_languages():
    ldstatus = load(
        GLOTTOLOG_VENV.joinpath('glottolog3/glottolog3/static/ldstatus.json'))

    for l in Glottolog(GLOTTOLOG_VENV.joinpath('glottolog')).languoids():
        if l.level == Level.language and not l.category.startswith('Pseudo'):
            yield Language(
                l, ((ldstatus.get(l.id) or [[0, None]])[0] or [0, None])[1])
def load(args):
    """
    clics load /path/to/concepticon-data /path/to/glottolog
    """
    if len(args.args) != 2:
        raise ParserError(
            'concepticon and glottolog repos locations must be specified!')

    concepticon = Path(args.args[0])
    if not concepticon.exists():
        raise ParserError('concepticon repository does not exist')

    glottolog = Path(args.args[1])
    if not glottolog.exists():
        raise ParserError('glottolog repository does not exist')

    args.api.db.create(exists_ok=True)
    args.log.info('loading datasets into {0}'.format(args.api.db.fname))
    in_db = args.api.db.datasets
    for ds in iter_datasets():
        if args.unloaded and ds.id in in_db:
            args.log.info('skipping {0} - already loaded'.format(ds.id))
            continue
        args.log.info('loading {0}'.format(ds.id))
        args.api.db.load(ds)
    args.log.info('loading Concepticon data')
    args.api.db.load_concepticon_data(Concepticon(str(concepticon)))
    args.log.info('loading Glottolog data')
    args.api.db.load_glottolog_data(Glottolog(str(glottolog)))
def update(repos, gl_repos, year, title):
    societies_by_glottocode = {
        gc: list(socs) for gc, socs in groupby(
            sorted(repos.societies.values(), key=lambda s: s.glottocode),
            lambda s: s.glottocode)}
    langs = list(Glottolog(gl_repos).languoids())
    languoids(langs, repos.dir)
    trees(societies_by_glottocode, langs, repos.dir, year, title)
def update(repos, gl_repos, year, title):
    societies_by_glottocode = {
        gc: list(socs) for gc, socs in itertools.groupby(
            sorted(repos.societies.values(), key=lambda s: s.glottocode),
            lambda s: s.glottocode)}
    api = Glottolog(gl_repos)
    langs = list(api.languoids())
    languoids(api, langs, repos.repos)
    trees(societies_by_glottocode, langs, repos.repos, year, title)
def main():  # pragma: no cover
    pkg_dir = Path(glottolog3.__file__).parent
    parser = ArgumentParserWithLogging('glottolog3')
    parser.add_argument(
        '--repos',
        help="path to glottolog data repository",
        type=Glottolog,
        default=Glottolog(
            Path(glottolog3.__file__).parent.parent.parent.joinpath('glottolog')))
    parser.add_argument('--pkg-dir', help=argparse.SUPPRESS, default=pkg_dir)
    sys.exit(parser.main())
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    repos = Path(
        os.path.expanduser('~')).joinpath('venvs/lexirumah/lexirumah-data')
    with transaction.manager:
        dataset = common.Dataset(
            id=lexirumah.__name__,
            name="lexirumah",
            publisher_name="Max Planck Institute for the Science of Human History",
            publisher_place="Jena",
            publisher_url="http://shh.mpg.de",
            license="http://creativecommons.org/licenses/by/4.0/",
            domain='lexirumah.model-ling.eu',
            contact='*****@*****.**',
            jsondata={
                'license_icon': 'cc-by.png',
                'license_name':
                    'Creative Commons Attribution 4.0 International License'})
        DBSession.add(dataset)

        glottolog_repos = Path(
            lexirumah.__file__).parent.parent.parent.parent.joinpath(
                'glottolog3', 'glottolog')
        languoids = {l.id: l for l in Glottolog(glottolog_repos).languoids()}
        concepticon = Concepticon(
            Path(lexirumah.__file__).parent.parent.parent.parent.joinpath(
                'concepticon', 'concepticon-data'))
        conceptsets = {c.id: c for c in concepticon.conceptsets.values()}

        skip = True
        for dname in sorted(repos.joinpath('datasets').iterdir(),
                            key=lambda p: p.name):
            #if dname.name == 'benuecongo':
            #    skip = False
            #if skip:
            #    continue
            if dname.is_dir() and dname.name != '_template':
                mdpath = dname.joinpath('cldf', 'metadata.json')
                if mdpath.exists():
                    print(dname.name)
                    import_cldf(dname, load(mdpath), languoids, conceptsets)

    with transaction.manager:
        load_families(
            Data(),
            DBSession.query(LexiRumahLanguage),
            glottolog_repos=glottolog_repos,
            isolates_icon='tcccccc')
def tree(glottocodes, gl_repos):
    label_pattern = re.compile(r"'[^\[]+\[([a-z0-9]{4}[0-9]{4})[^']*'")

    def rename(n):
        n.name = label_pattern.match(n.name).groups()[0]
        n.length = 1

    glottocodes = set(glottocodes)
    glottocodes_in_global_tree = set()
    languoids = {}
    families = []
    for lang in Glottolog(gl_repos).languoids():
        if not lang.lineage:  # a top-level node
            if not lang.category.startswith('Pseudo '):
                families.append(lang)
        languoids[lang.id] = lang

    glob = Tree()
    glob.name = 'glottolog_global'

    for family in families:
        node = family.newick_node(nodes=languoids)
        node.visit(rename)
        langs_in_tree = set(n.name for n in node.walk())
        langs_selected = glottocodes.intersection(langs_in_tree)
        if not langs_selected:
            continue

        tree = Tree("({0});".format(node.newick), format=3)
        tree.name = 'glottolog_{0}'.format(family.id)
        if family.level.name == 'family':
            tree.prune([n for n in langs_selected])
            glottocodes_in_global_tree = glottocodes_in_global_tree.union(
                set(n.name for n in tree.traverse()))
        else:
            glottocodes_in_global_tree = glottocodes_in_global_tree.union(
                langs_in_tree)
        glob.add_child(tree)

    # global
    nodes = glottocodes_in_global_tree.intersection(glottocodes)
    glob.prune([n for n in nodes])
    return glob.write(format=9), nodes
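A minimal usage sketch for the function above. The glottocodes and repository path are placeholders, and `Tree` is assumed to be `ete3.Tree`, as suggested by the `format=`, `prune()` and `traverse()` calls:

# Hypothetical call: prune the global Glottolog tree down to two languages.
newick, kept = tree(['stan1295', 'russ1263'], '/path/to/glottolog')
print(kept)    # the subset of requested glottocodes actually found in the tree
print(newick)  # topology-only Newick string (ete3 format=9, leaf names only)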
def make_tree(*taxa):
    # We create a dict to lookup Glottolog languoids by name, ISO- or Glottocode.
    langs = {}
    for lang in Glottolog().languoids():
        if lang.iso:
            langs[lang.iso] = lang
        langs[lang.name] = lang
        langs[lang.id] = lang

    t = TreeMaker()
    for taxon in taxa:
        if taxon not in langs:
            print('unknown taxon: {0}'.format(taxon))
            continue
        t.add(taxon, ', '.join(l[1] for l in langs[taxon].lineage))
    return t
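A usage sketch, assuming `TreeMaker` comes from the `treemaker` package (whose `write()` method returns a Newick string); the taxa below are placeholders:

# Taxa may be given as Glottocodes, ISO codes, or names interchangeably.
t = make_tree('stan1295', 'rus', 'French')
print(t.write())  # classification tree built from the Glottolog lineages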
def prime_cache(args):  # pragma: no cover
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    if 1:
        langs = {l.pk: l for l in DBSession.query(models.GrambankLanguage)}
        features = {f.pk: f for f in DBSession.query(models.Feature)}

        for lpk, nf in DBSession.query(
                common.ValueSet.language_pk, func.count(common.ValueSet.pk)) \
                .join(common.Value, common.Value.valueset_pk == common.ValueSet.pk) \
                .group_by(common.ValueSet.language_pk):
            langs[lpk].representation = nf

        for fpk, nl in DBSession.query(
                common.ValueSet.parameter_pk, func.count(common.ValueSet.pk)) \
                .join(common.Value, common.Value.valueset_pk == common.ValueSet.pk) \
                .group_by(common.ValueSet.parameter_pk):
            features[fpk].representation = nl

        compute_language_sources()

    get_repos()
    for obj in DBSession.query(LanguageTreeLabel).all():
        DBSession.delete(obj)
    for obj in DBSession.query(TreeLabel).all():
        DBSession.delete(obj)
    for obj in DBSession.query(Phylogeny).all():
        DBSession.delete(obj)
    DBSession.flush()

    for tree in tqdm(iter_trees(
            [l.id for l in DBSession.query(common.Language)],
            Glottolog(REPOS['glottolog']))):
        nodes = set(n.name for n in tree.traverse())
        phylo = Phylogeny(
            id=tree.name.split('_')[1],
            name=tree.name,
            newick=tree.write(format=9))
        for l in DBSession.query(common.Language).filter(
                common.Language.id.in_(nodes)):
            LanguageTreeLabel(
                language=l,
                treelabel=TreeLabel(id=l.id, name=l.id, phylogeny=phylo))
        DBSession.add(phylo)
def main(args):  # pragma: no cover
    ds = StructureDataset.from_metadata(DS)
    data = Data()

    for source in ds.sources:
        data.add(common.Source, source.id, _obj=bibtex2source(source))
    ext = [
        Record.from_string('@' + s, lowercase=True)
        for s in nfilter(BIB.split('@'))]
    for rec in ext:
        if rec.id not in data['Source']:
            data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    for contrib in ds['contributors.csv']:
        o = data.add(
            common.Contributor,
            contrib['ID'],
            id=contrib['ID'].upper(),
            name=contrib['Name'],
            description=contrib['Description'],
            url=contrib['URL'],
            jsondata={
                'readme': contrib['Readme'],
                'contents': contrib['Contents']},
        )
        for src in contrib['Source']:
            DBSession.add(models.ContributorReference(
                source=data['Source'][src], contributor=o))

    dataset = data.add(
        common.Dataset,
        'phoible',
        id='phoible',
        name='PHOIBLE 2.0',
        description='PHOIBLE 2.0',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        domain='phoible.org',
        license='https://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'https://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
                'Creative Commons Attribution-ShareAlike 3.0 Unported License'})

    for i, (cid, name) in enumerate([
            ('UZ', "Steven Moran"),
            ('mccloy', "Daniel McCloy"),
    ], start=1):
        contrib = data['Contributor'].get(cid)
        if not contrib:
            contrib = common.Contributor(id=cid, name=name)
        DBSession.add(common.Editor(dataset=dataset, ord=i, contributor=contrib))

    glottolog = Glottolog(
        Path(phoible.__file__).parent.parent.parent.parent.joinpath(
            'glottolog', 'glottolog'))

    for lang in ds['LanguageTable']:
        l = data.add(
            models.Variety,
            lang['ID'],
            id=lang['ID'],
            name=lang['Name'],
        )

    load_families(
        data,
        [(l.id, l) for l in data['Variety'].values() if len(l.id) == 8],
        glottolog.repos)
    DBSession.flush()

    # assign color codes:
    families = defaultdict(list)
    for l in data['Variety'].values():
        families[l.family_pk].append(l)
    colors = color.qualitative_colors(len(families))
    for i, langs in enumerate(sorted(families.values(), key=lambda v: -len(v))):
        for l in langs:
            l.jsondata = {'color': colors[i]}

    for segment in ds['ParameterTable']:
        # Strip combining/modifier characters to compute the equivalence class.
        equivalence_class = ''.join([
            t[0] for t in [(c, unicodedata.name(c)) for c in segment['Name']]
            if t[1].split()[0] not in ['COMBINING', 'MODIFIER']])
        data.add(
            models.Segment,
            segment['ID'],
            id=segment['ID'],
            name=segment['Name'],
            description=segment['Description'],
            segment_class=segment['SegmentClass'],
            equivalence_class=equivalence_class)
    DBSession.flush()

    # Add redirects for old language pages! get relevant ISO codes and map to Glottocode!
    for model, repls in load(
            Path(phoible.__file__).parent.parent / 'replacements.json').items():
        if model == 'Language':
            languoids = {l.id: l for l in glottolog.languoids()}
            iso_languoids = {l.iso: l for l in languoids.values() if l.iso}
            gl_in_phoible = set(data['Variety'].keys())
            for oid, nid in repls.items():
                gls = descendants_from_nodemap(
                    iso_languoids.get(oid), languoids).intersection(gl_in_phoible)
                if gls:
                    nid = gls.pop()
                    if len(gls) > 1:
                        print('+++', oid, gls)
                else:
                    print('---', oid)
                common.Config.add_replacement(oid, nid, common.Language)
        elif model == 'Parameter':
            segments_in_phoible = set(data['Segment'].keys())
            for oid, nid in repls.items():
                id_ = nid if nid in segments_in_phoible else None
                common.Config.add_replacement(oid, id_, common.Parameter)

    for segment in ds['ParameterTable']:
        for i, (k, v) in enumerate(sorted(segment.items())):
            if k not in ['ID', 'Name', 'Description', 'SegmentClass']:
                DBSession.add(common.Parameter_data(
                    key=feature_name(k),
                    value=v,
                    ord=i,
                    object_pk=data['Segment'][segment['ID']].pk))

    for inventory in ds['contributions.csv']:
        inv = data.add(
            models.Inventory,
            inventory['ID'],
            id=inventory['ID'],
            name='{0} ({1} {2})'.format(
                inventory['Name'],
                inventory['Contributor_ID'].upper(),
                inventory['ID'],
            ),
            source_url=inventory['URL'],
            count_tone=inventory['count_tones'],
            count_vowel=inventory['count_vowels'],
            count_consonant=inventory['count_consonants'],
        )
        DBSession.add(common.ContributionContributor(
            contribution=inv,
            contributor=data['Contributor'][inventory['Contributor_ID'].upper()]))
        for src in inventory['Source']:
            DBSession.add(common.ContributionReference(
                contribution=inv, source=data['Source'][src]))

    for phoneme in ds['ValueTable']:
        lang = data['Variety'][phoneme['Language_ID']]
        inv = data['Inventory'][phoneme['Contribution_ID']]
        if not inv.language:
            inv.language = lang
        vs = common.ValueSet(
            id=phoneme['ID'],
            contribution=inv,
            language=lang,
            parameter=data['Segment'][phoneme['Parameter_ID']])
        for ref in phoneme['Source']:
            DBSession.add(common.ValueSetReference(
                source=data['Source'][ref], valueset=vs))
        DBSession.add(models.Phoneme(
            id=phoneme['ID'],
            name='%s %s' % (
                phoneme['Value'],
                data['Inventory'][phoneme['Contribution_ID']].name),
            allophones=' '.join(phoneme['Allophones']),
            marginal=phoneme['Marginal'],
            valueset=vs))
def setUp(self):
    WithTempDir.setUp(self)
    self.repos = self.tmp_path('repos')
    copytree(Path(__file__).parent.joinpath('data'), self.repos)
    self.api = Glottolog(self.repos)
def main(args):  # pragma: no cover
    get_repos()
    api = Grambank(REPOS['Grambank'])
    cldf = args.cldf
    data = Data()

    dataset = models.Grambank(
        id=grambank.__name__,
        name="Grambank",
        description="Grambank",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='grambank.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name':
                'Creative Commons Attribution 4.0 International License'})

    contributors = {}
    for i, contrib in enumerate(api.contributors):
        contrib = common.Contributor(contrib.id, id=contrib.id, name=contrib.name)
        common.Editor(dataset=dataset, contributor=contrib, ord=i)
        DBSession.add(contrib)
        DBSession.flush()
        contributors[contrib.id] = contrib.pk

    contributions = {r['ID']: r for r in cldf['LanguageTable']}

    DBSession.add(dataset)

    for rec in tqdm(list(Database.from_file(cldf.bibpath, lowercase=True)),
                    desc='sources'):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    DBSession.flush()
    sources = {k: v.pk for k, v in data['Source'].items()}

    features, codes = import_features(cldf, contributors)
    transaction.commit()

    values_by_sheet = [
        (lid, list(v)) for lid, v in itertools.groupby(
            sorted(cldf['ValueTable'], key=lambda r: r['Language_ID']),
            lambda r: r['Language_ID'],
        )]
    for lid, values in tqdm(values_by_sheet, desc='loading values'):
        transaction.begin()
        import_values(values, contributions[lid], features, codes,
                      contributors, sources)
        transaction.commit()

    transaction.begin()

    glottolog = Glottolog(REPOS['glottolog'])
    languoids = {l.id: l for l in glottolog.languoids()}

    gblangs = DBSession.query(models.GrambankLanguage).all()
    load_families(
        data, gblangs, glottolog_repos=REPOS['glottolog'], isolates_icon='dcccccc')

    # Add isolates
    for lg in gblangs:
        gl_language = languoids.get(lg.id)
        if not gl_language.family:
            family = data.add(
                Family,
                gl_language.id,
                id=gl_language.id,
                name=gl_language.name,
                description=common.Identifier(
                    name=gl_language.id,
                    type=common.IdentifierType.glottolog.value).url(),
                jsondata={"icon": 'tcccccc'})
            lg.family = family
    coverage.main(glottolog)
import pandas as pd
from fuzzywuzzy import fuzz
from pyglottolog.api import Glottolog

api = Glottolog("/Users/stiv/Github/glottolog")


def matching_dialect(glottocode, name):
    print(type(glottocode), glottocode, type(name), name)
    if glottocode == "NA":
        return glottocode
    lang = api.languoid(glottocode)
    if lang is None:
        # print("glottocode has been updated:", glottocode)
        return glottocode
    if fuzz.ratio(lang.name, name) < 95:
        ratios = []
        for dialect in lang.children:
            ratios.append(
                (dialect.id, dialect.name, fuzz.ratio(dialect.name, name)))
        if ratios and max(r[2] for r in ratios) >= 95:
            return sorted(ratios, key=lambda r: r[2], reverse=True)[0]
        else:
            return glottocode
    else:
        return glottocode


def get_code(glottocode, name):
""" Small script from xrotwang to get Glottolog code to ISO 639-3 code mappings """ import csv from pyglottolog.api import Glottolog api = Glottolog('/Users/stiv/Github/glottolog/') gc2iso = {l.id: l.iso for l in api.languoids() if l.iso} with open('gc2iso.csv', 'wb') as csv_file: writer = csv.writer(csv_file) for key, value in gc2iso.items(): writer.writerow([key, value])
def get_names():
    return {l.id: l.name for l in Glottolog(GLOTTOLOG_REPOS).languoids()}
def main(args):
    data = Data()
    glottocodes, bibtex_keys = {}, defaultdict(set)
    for d in reader(args.data_file(
            'repos', 'mappings', 'InventoryID-ISO-gcode-Bibkey-Source.tsv')):
        glottocodes[d['InventoryID']] = d['Glottocode']
        bibtex_keys[d['InventoryID']].add(d['BibtexKey'])

    glottolog = Glottolog(
        Path(phoible.__file__).parent.parent.parent.parent.joinpath(
            'glottolog3', 'glottolog'))
    languoids = {l.id: l for l in glottolog.languoids()}

    phonemes = sorted(
        list(reader(args.data_file('repos', 'data', 'phoible-by-phoneme.tsv'))),
        key=lambda r: (r['InventoryID'], r['GlyphID']))

    inventories = defaultdict(set)
    for p in phonemes:
        if p['InventoryID'] in glottocodes:
            inventories[(
                languoids[glottocodes[p['InventoryID']]].name,
                p['SpecificDialect'],
                p['Source'].upper())].add((p['InventoryID'], p['LanguageName']))

    inventory_names = {}
    for (glname, dname, source), invids in inventories.items():
        if len(invids) == 1:
            invid, lname = invids.pop()
            inventory_names[invid] = name_in_source(glname, dname) + ' [%s]' % source
        else:
            use_lname = len(set(r[1] for r in invids)) == len(invids)
            for i, (invid, lname) in enumerate(
                    sorted(invids, key=lambda j: int(j[0]))):
                disambiguation = ' %s' % (i + 1,)
                if use_lname:
                    disambiguation = ' (%s)' % lname
                inventory_names[invid] = name_in_source(glname, dname) + \
                    '%s [%s]' % (disambiguation, source)

    for (invid, lname, dname, source), ps in groupby(
            phonemes,
            lambda p: (p['InventoryID'], p['LanguageName'],
                       p['SpecificDialect'], p['Source'])):
        if invid not in glottocodes:
            continue

        ps = list(ps)
        gc = glottocodes[invid]
        lang = data['Variety'].get(gc)
        if not lang:
            languoid = languoids[gc]
            lang = data.add(
                models.Variety,
                gc,
                id=gc,
                language_code=ps[0]['LanguageCode'],
                name=languoid.name,
                level=text_type(languoid.level.name),
                latitude=languoid.latitude,
                longitude=languoid.longitude,
            )
            if lang.latitude is None and languoid.level == Level.dialect:
                ll = get_language(languoid)
                lang.latitude = ll.latitude
                lang.longitude = ll.longitude

        contrib = data.add(
            models.Inventory,
            invid,
            id=invid,
            #language=lang,
            source=source,
            #source_url=source_urls.get(row.InventoryID),
            #internetarchive_url=ia_urls.get(row.InventoryID),
            name=inventory_names[invid],
            description=name_in_source(lname, dname))

    return
    # FIXME: read from mappings file!
    refs = defaultdict(list)
    for row in get_rows(args, 'BibtexKey'):
        if row[1] == 'NO SOURCE GIVEN':
            refs[row[0]] = []
        else:
            refs[row[0]].append(row[1])
    add_sources(args, data)

    dataset = data.add(
        common.Dataset,
        'phoible',
        id='phoible',
        name='PHOIBLE Online',
        description='PHOIBLE Online',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        domain='phoible.org',
        license='http://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
                'Creative Commons Attribution-ShareAlike 3.0 Unported License'})

    for i, spec in enumerate([
            ('moran', "Steven Moran"),
            ('mccloy', "Daniel McCloy"),
            ('wright', "Richard Wright"),
    ]):
        DBSession.add(common.Editor(
            dataset=dataset,
            ord=i + 1,
            contributor=common.Contributor(id=spec[0], name=spec[1])))

    #squibs = defaultdict(list)
    #for row in get_rows(args, 'Squib'):
    #    squibs[row[0]].append(row[1])

    source_urls = dict(get_rows(args, 'URL'))
    ia_urls = dict(get_rows(args, 'InternetArchive'))

    # FIXME: group phoible-by-phoneme by LanguageCode, Source (make sure this is unique!)
    aggregated = list(reader(
        args.data_file('phoible-aggregated.tsv'),
        delimiter='\t',
        namedtuples=True))
    inventory_names = {}
    for key, items in groupby(
            sorted(aggregated, key=lambda t: (t.LanguageCode, t.Source)),
            key=lambda t: (t.LanguageCode, t.Source)):
        items = list(items)

        lname = lnames.get(key[0])
        if not lname:
            lname = items[0].LanguageName
            lnames[key[0]] = lname

        if len(items) == 1:
            inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1])
        else:
            for i, item in enumerate(items):
                inventory_names[item.InventoryID] = '%s %s (%s)' % (
                    lname, i + 1, key[1])

    # pull in Glottolog families instead? or in addition?
    family_map = {
        ("Arawakan", "arwk"): "Arawakan",
        ("Trans-New Guinea", "trng"): "Trans-New Guinea",
        ("Moklen", "anes"): "Austronesian",
        ("Oko", "ncon"): "Niger-Congo",
        ("Muniche", "saso"): "Muniche",
        ("Tinigua", "saso"): "Tinigua",
        ("Vilela", "luvi"): "Vilela",
        ("Ofayé", "macg"): "Kamakanan",
        ("Purian", "macg"): "PurianPrint",
        ("Mixed language", "saml"): "Mixed language",
        ("Tupian", "tupi"): "Tupian",
        ("Yuwana", "saun"): "YuwanaPrint",
    }
    family_code_map = {k[1]: v for k, v in family_map.items()}

    for row in aggregated:
        lang = data['Variety'].get(row.LanguageCode)
        if not lang:
            if row.LanguageFamilyGenus == 'UNCLASSIFIED':
                genus = None
            else:
                genus_id = slug(strip_quotes(row.LanguageFamilyGenus))
                genus = genera.get(genus_id)
                if not genus:
                    genus = genera.get(row.LanguageCode)
                if not genus:
                    #print(row.LanguageFamilyGenus, row.LanguageFamilyRoot)
                    family = family_map.get(
                        (row.LanguageFamilyGenus, row.LanguageFamilyRoot))
                    genus = genera[genus_id] = data.add(
                        models.Genus,
                        genus_id,
                        id=genus_id,
                        name=row.LanguageFamilyGenus,
                        description=family or row.LanguageFamilyRoot,
                        active=False,
                        root=row.LanguageFamilyRoot)

                if not genus.root:
                    genus.root = row.LanguageFamilyRoot
                if genus.description in family_code_map:
                    genus.description = family_code_map[genus.description]

            if row.LanguageCode in geocoords:
                coords = geocoords[row.LanguageCode]
            elif row.Latitude != 'NULL' and row.Longitude != 'NULL':
                coords = (float(row.Latitude), float(row.Longitude))

            lang = data.add(
                models.Variety,
                row.LanguageCode,
                id=row.LanguageCode,
                name=lnames[row.LanguageCode],
                genus=genus,
                country=strip_quotes(row.Country),
                area=strip_quotes(row.Area),
                latitude=coords[0],
                longitude=coords[1],
                jsondata=dict(inventory_id=row.InventoryID))
            add_language_codes(
                data, lang, row.LanguageCode, glottocodes=glottocodes)

        contributor = data['Contributor'].get(row.Source)
        if not contributor:
            contributor = data.add(
                common.Contributor,
                row.Source,
                id=row.Source,
                name=SOURCES[row.Source][0],
                description=SOURCES[row.Source][2])
            for ref in SOURCES[row.Source][1]:
                DBSession.add(models.ContributorReference(
                    source=data['Source'][ref], contributor=contributor))

        contrib = data.add(
            models.Inventory,
            row.InventoryID,
            id=row.InventoryID,
            language=lang,
            source=row.Source,
            source_url=source_urls.get(row.InventoryID),
            internetarchive_url=ia_urls.get(row.InventoryID),
            name=inventory_names[row.InventoryID],
            description=row.LanguageName)

        DBSession.add(common.ContributionContributor(
            contribution=contrib, contributor=contributor))

        #for j, squib in enumerate(squibs.get(row.InventoryID, [])):
        #    f = common.Contribution_files(
        #        object=contrib,
        #        id='squib-%s-%s.pdf' % (contrib.id, j + 1),
        #        name='Phonological squib',
        #        description=squib,
        #        mime_type='application/pdf')
        #    assert f
        #    # f.create(files_dir, file(args.data_file('phonological_squibs', src)).read())

    DBSession.flush()
    unknown_refs = {}
    for row in reader(args.data_file('phoible-phonemes.tsv'),
                      namedtuples=True):
        inventory = data['Inventory'][row.InventoryID]
        segment = data['Segment'].get(row.Phoneme)
        if not segment:
            unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme]
            description = ' - '.join([t[1] for t in unicode_desc])
            segment = data.add(
                models.Segment,
                row.Phoneme,
                id=b16encode(md5(description).digest()),
                name=row.Phoneme,
                description=description,
                equivalence_class=''.join([
                    t[0] for t in unicode_desc
                    if t[1].split()[0] not in ['COMBINING', 'MODIFIER']]),
                segment_class=row.Class,
                combined_class=row.CombinedClass)
            DBSession.flush()

        vs = common.ValueSet(
            id=row.PhonemeID,
            contribution=inventory,
            language=inventory.language,
            parameter=segment)

        for ref in refs.get(row.InventoryID, []):
            if ref not in data['Source']:
                if ref not in unknown_refs:
                    print('-------', ref)
                unknown_refs[ref] = 1
                continue
            DBSession.add(common.ValueSetReference(
                source=data['Source'][ref], valueset=vs))

        DBSession.add(common.Value(
            id=row.PhonemeID,
            name='%s %s' % (row.Phoneme, data['Inventory'][row.InventoryID].name),
            valueset=vs))

    DBSession.flush()

    for inventory_id in refs:
        for ref in refs[inventory_id]:
            if ref not in data['Source']:
                continue
            data.add(
                common.ContributionReference,
                '%s-%s' % (inventory_id, ref),
                source=data['Source'][ref],
                contribution=data['Inventory'][inventory_id])

    for i, row in enumerate(
            reader(args.data_file('phoible-segments-features.tsv'))):
        if i == 0:
            features = list(map(feature_name, row))
            continue

        if row[0] not in data['Segment']:
            # print('skipping feature vector:', row)
            continue
        for j, value in enumerate(row):
            if j and value != '0':
                DBSession.add(common.Parameter_data(
                    key=features[j],
                    value=value,
                    ord=j,
                    object_pk=data['Segment'][row[0]].pk))

    # FIXME: add allophones!

    DBSession.flush()
def get_clf_paths(lgs):
    glottolog = Glottolog(GLOTTOLOG_REPOS)
    return [
        tuple([ll.id for ll in l.ancestors] + [l.id])
        for l in glottolog.languoids(lgs)]
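Each returned tuple is a root-to-language classification path of glottocodes. A usage sketch (the code below is illustrative):

# Each tuple runs from the top-level family down to the language itself,
# e.g. roughly ('indo1319', ..., 'stan1293') for English.
paths = get_clf_paths(['stan1293'])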
`pip install --upgrade --force-reinstall pyglottolog`
`pip install fuzzywuzzy`

Usage:
$ python match.py nyun1247 Bibbulman
(u'bibb1234', u'Bibbulman', 100)
$ python match.py nyun1247 Balardung
None
"""
from fuzzywuzzy import fuzz


def matching_dialect(glottolog, glottocode, name):
    lang = glottolog.languoid(glottocode)
    if fuzz.ratio(lang.name, name) < 95:
        ratios = []
        for dialect in lang.children:
            ratios.append(
                (dialect.id, dialect.name, fuzz.ratio(dialect.name, name)))
        if ratios and max(r[2] for r in ratios) >= 95:
            return sorted(ratios, key=lambda r: r[2], reverse=True)[0]


if __name__ == "__main__":
    import sys
    from pyglottolog.api import Glottolog
    print(matching_dialect(
        Glottolog("/Users/stiv/Github/glottolog"), *sys.argv[1:]))
from clldutils.dsv import UnicodeWriter
from pyglottolog.api import Glottolog
from pyglottolog.objects import Level


def locations(glottolog, fid, outpath):
    with UnicodeWriter(outpath) as writer:
        writer.writerow(['name', 'glottocode', 'latitude', 'longitude'])
        for lang in glottolog.languoids():
            if lang.level == Level.language and lang.latitude is not None:
                if fid in [l[1] for l in lang.lineage]:
                    writer.writerow(
                        [lang.name, lang.id, lang.latitude, lang.longitude])


if __name__ == '__main__':
    import sys
    locations(Glottolog(sys.argv[1]), sys.argv[2], sys.argv[3])
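Invocation sketch for the script above; the family glottocode and output path are placeholders:

$ python locations.py /path/to/glottolog atla1278 atlantic_congo_locations.csv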
def main(args=sys.argv):
    """The main CLI"""
    # Parse options
    parser = argparse.ArgumentParser(description=__doc__.split("\n")[0])
    parser.add_argument(
        'dataset',
        type=Path,
        help="Path to the CLDF dataset's JSON description")
    parser.add_argument(
        "output",
        help="File name to write output to")
    parser.add_argument(
        "--glottolog-repos",
        default=None,
        help="Path to local clone or export of clld/glottolog")
    parser.add_argument(
        "--cmap",
        type=plt.get_cmap,
        default=plt.get_cmap("magma_r"),
        help="Colormap to be used for the parameter counts")
    options = parser.parse_args()

    dataset = pycldf.Dataset.from_metadata(options.dataset)

    # Try to load language locations from the dataset
    locations = {}
    try:
        idcol = dataset["LanguageTable", "id"].name
        latcol = dataset["LanguageTable", "latitude"].name
        loncol = dataset["LanguageTable", "longitude"].name
        for row in dataset["LanguageTable"]:
            if row[latcol] is not None:
                locations[row[idcol]] = row[latcol], row[loncol]
    except ValueError:
        # No language table
        pass

    for lang in Glottolog(options.glottolog_repos).languoids():
        if lang.latitude is not None:
            if lang.id not in locations:
                locations[lang.id] = (lang.latitude, lang.longitude)
            if lang.iso and lang.iso not in locations:
                locations[lang.iso] = (lang.latitude, lang.longitude)

    # Aggregate the data
    lats, lons, sizes = [], [], []
    for language, sample_size in parameters_sampled(dataset).items():
        if language in locations:
            lat, lon = locations[language]
            lats.append(float(lat))
            lons.append(float(lon))
            sizes.append(sample_size)
    assert len(sizes) == len(lats) == len(lons)

    # Calculate coordinate boundaries
    min_lat, max_lat = min(lats), max(lats)
    d_lat = max_lat - min_lat
    min_lat = max(-90, min_lat - 0.1 * d_lat)
    max_lat = min(90, max_lat + 0.1 * d_lat)
    min_lon, max_lon = min(lons), max(lons)
    d_lon = max_lon - min_lon
    min_lon = max(-180, min_lon - 0.1 * d_lon)
    max_lon = min(180, max_lon + 0.1 * d_lon)

    # Draw the base map
    # TODO: Get coordinates from commandline, fallback to bounding box of data
    # TODO: Give more control over map drawing to user (projection, level of
    # detail, drawing other patterns (countries, eg.) instead of just coast
    # lines, continent color) – What is a good way to do that?
    map = Basemap(
        llcrnrlat=min_lat, llcrnrlon=min_lon,
        urcrnrlat=max_lat, urcrnrlon=max_lon,
        # projection='lcc',
        resolution='h', area_thresh=10)
    map.drawcoastlines()
    map.fillcontinents(color='#fff7ee', zorder=0)

    # Plot the sample sizes
    map.scatter(lons, lats, c=sizes, cmap=options.cmap, latlon=True)
    # TODO: Improve shape of components: Colorbar is very huge, margins are quite large
    plt.colorbar()
    plt.gcf().set_size_inches(12, 9)
    plt.savefig(options.output)
    return 0
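A hypothetical invocation, assuming the script is saved as coverage_map.py and the dataset ships a standard CLDF metadata file (both file names are placeholders):

$ python coverage_map.py Wordlist-metadata.json coverage.png \
    --glottolog-repos /path/to/glottolog --cmap viridis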
'''Imports Glottolog data needed for Pshrimp to a Postgres database.'''

from db_postgres import init_db
from pyglottolog.api import Glottolog
from import_postgres import insert, get_id

from collections import OrderedDict
import csv
from os.path import expanduser

GLOTTOLOG_LOCATION = expanduser('~/Documents/glottolog-3.4')
api = Glottolog(GLOTTOLOG_LOCATION)


def language(glottocode):
    '''Dialects don't have most information defined, so go upstairs to a language.'''
    # Ideally there would be error handling here in case there's a family.
    # In practice, it just crashed and I edited the csv file.
    languoid = api.languoid(glottocode)
    if languoid.level.name == 'dialect':
        while languoid.level.name == 'dialect':
            languoid = languoid.parent
    return languoid


def data(glottocode):
    languoid = language(glottocode)
    print(languoid)
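A usage sketch of the dialect-to-language promotion above; the glottocode is a hypothetical placeholder:

# A dialect glottocode is walked up to its enclosing language before lookup.
lg = language('xxxx1234')   # hypothetical dialect code
print(lg.level.name)        # 'language'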