def main(args):
    data = Data()
    dataset = common.Dataset(
        id=cdk.__name__,
        name="CDK",
        description="Comprehensive Dictionary of Ket",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='cdk.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    contrib = common.Contribution(id='ket', name=dataset.name)
    DBSession.add(contrib)

    for i, (id, name) in enumerate([
        ('kotorov', 'E.G. Kotorova'),
        ('nefedov', 'A.V. Nefedov'),
    ]):
        dataset.editors.append(
            common.Editor(contributor=common.Contributor(id=id, name=name), ord=i))

    ket = data.add(
        common.Language, 'ket',
        id='ket',
        name='Ket',
        latitude=63.76,
        longitude=87.55)
    add_language_codes(data, ket, 'ket', glottocode='kett1243')
    for abbr, name in DIALECTS.items():
        data.add(common.Language, abbr, id=abbr, name=name)

    with args.data_file('sources.txt').open(encoding='utf8') as fp:
        for i, chunk in enumerate(fp.read().split('\n\n\n')):
            try:
                id_, year, author, desc = chunk.split('\n')
            except:
                print(chunk)
                raise
            data.add(
                common.Source, id_,
                id=str(i + 1),
                name=id_,
                author=author,
                year=year,
                description=desc)

    with UnicodeReader(args.data_file('Ket_nouns_and_other_pos_table.docx.csv')) as reader:
        load(data, reader, ket, contrib, verbs=False)

    with UnicodeReader(args.data_file('Ket_verbs_table.docx.csv')) as reader:
        load(data, reader, ket, contrib)

    print('parsing examples problematic in %s cases' % len(PROBLEMS))
def languoid_visitor(lang, row, _):
    try:
        add_language_codes(
            data, lang, lang.id.split('-')[0], glottolog, glottocode=row[2] or None)
    except:
        print(row)
        raise
    second_languages[row[0]] = row[8]
def test_add_language_codes(env):
    from clld.db.models.common import Language
    from clld.scripts.util import Data, add_language_codes
    add_language_codes(Data(), Language(), 'iso', glottocodes=dict(iso='glot1234'))
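# A minimal usage sketch, not taken from any of the projects below. It assumes
# the signature add_language_codes(data, lang, isocode, glottocodes=None,
# glottocode=None) implied by the calls in this collection, and that a DB
# session is configured (as inside an initializedb main). `data` and the two
# Language objects are placeholders.
from clld.db.models.common import Language
from clld.scripts.util import Data, add_language_codes

data = Data()
# variant 1: pass the glottocode for this one language directly
add_language_codes(data, Language(id='abc'), 'abc', glottocode='abcd1234')
# variant 2: look the glottocode up in a prebuilt {isocode: glottocode} mapping
add_language_codes(data, Language(id='xyz'), 'xyz', glottocodes={'xyz': 'xyzz1234'})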
def main(args):
    glottocodes = {}
    if getuser() == "robert":
        glottocodes = glottocodes_by_isocode("postgresql://robert@/glottolog3")

    data = Data()
    dataset = common.Dataset(
        id=autotyp.__name__,
        name="AUTOTYP",
        description="AUTOTYP",
        domain="autotyp.clld.org")
    DBSession.add(dataset)

    bib = Database.from_file(args.data_file("LenaBib.bib"), lowercase=True)

    for i, spec in enumerate([
        ("bickel", "Balthasar Bickel", "University of Zurich"),
        ("nichols", "Johanna Nichols", "University of California, Berkeley"),
    ]):
        contributor = data.add(common.Contributor, spec[0], id=spec[0], name=spec[1])
        DBSession.add(common.Editor(dataset=dataset, ord=i + 1, contributor=contributor))

    for l in rows(
            args.data_file("backbone_09Jan2014_directexport.tab"),
            newline="\r",
            encoding="macroman",
            namedtuples=True):
        # LID  language  ISO639.3.2013  stock  continent  area  latitude  longitude
        if l.stock not in data["Stock"]:
            stock = data.add(models.Stock, l.stock, id=slug(l.stock), name=l.stock)
        else:
            stock = data["Stock"][l.stock]

        if l.continent not in data["Continent"]:
            continent = data.add(
                models.Continent, l.continent, id=slug(l.continent), name=l.continent)
        else:
            continent = data["Continent"][l.continent]

        if l.area not in data["Area"]:
            area = data.add(
                models.Area, l.area, id=slug(l.area), name=l.area, continent=continent)
        else:
            area = data["Area"][l.area]

        lang = data.add(
            models.Languoid,
            l.LID,
            id=l.LID,
            name=l.language,
            latitude=coord(l.latitude),
            longitude=coord(l.longitude),
            stock=stock,
            area=area)
        add_language_codes(data, lang, l.ISO639_3_2013, glottocodes=glottocodes)

    loader.case_alignment(args, data, bib)
    loader.inclusive_excusive(args, data, bib)
def load_families(data, languages, glottolog=None, icons=ORDERED_ICONS,
                  isolates_icon=ISOLATES_ICON):
    """Add Family objects to a database and update Language objects from Glottolog.

    Family information is retrieved from Glottolog based on the id attribute of a
    language. This id must be either a glottocode or an ISO 639-3 code.

    :param data:
    :return:
    """
    icons = cycle([
        getattr(i, 'name', i) for i in icons
        if getattr(i, 'name', i) != isolates_icon])
    glottolog = glottolog or Glottolog()
    for language in languages:
        if isinstance(language, (tuple, list)) and len(language) == 2:
            code, language = language
        else:
            code = language.id
        if code != '-':
            gl_language = glottolog.languoid(code)
            if gl_language:
                gl_family = gl_language.family
                if gl_family:
                    family = data['Family'].get(gl_family.id)
                    #print family
                    if not family:
                        family = data.add(
                            Family,
                            gl_family.id,
                            id=gl_family.id,
                            name=gl_family.name,
                            description=Identifier(
                                name=gl_family.id,
                                type=IdentifierType.glottolog.value).url(),
                            jsondata=dict(icon=next(icons)))
                    language.family = family
                language.macroarea = gl_language.macroareas[0]
                add_language_codes(
                    data, language, gl_language.iso_code, glottocode=gl_language.id)
                for attr in 'latitude', 'longitude', 'name':
                    if getattr(language, attr) is None:
                        setattr(language, attr, getattr(gl_language, attr))
        else:
            language.macroarea = None
def load_families(
        data, languages, glottolog=None, icons=ORDERED_ICONS,
        isolates_icon=ISOLATES_ICON):
    """Add Family objects to a database and update Language objects from Glottolog.

    Family information is retrieved from Glottolog based on the id attribute of a
    language. This id must be either a glottocode or an ISO 639-3 code.

    :param data:
    :return:
    """
    icons = cycle([getattr(i, 'name', i) for i in icons
                   if getattr(i, 'name', i) != isolates_icon])
    glottolog = glottolog or Glottolog()
    for language in languages:
        if isinstance(language, (tuple, list)) and len(language) == 2:
            code, language = language
        else:
            code = language.id
        gl_language = glottolog.get(code) \
            if isinstance(glottolog, dict) else glottolog.languoid(code)
        if gl_language:
            gl_family = gl_language.family
            if not gl_family and getattr(gl_language, 'level', None) == 'family':
                # Make sure top-level families are not treated as isolates!
                gl_family = gl_language
            if gl_family:
                family = data['Family'].get(gl_family.id)
                if not family:
                    family = data.add(
                        Family,
                        gl_family.id,
                        id=gl_family.id,
                        name=gl_family.name,
                        description=Identifier(
                            name=gl_family.id,
                            type=IdentifierType.glottolog.value).url(),
                        jsondata=dict(icon=next(icons)))
                language.family = family
            language.macroarea = \
                gl_language.macroareas[0] if gl_language.macroareas else None
            add_language_codes(
                data, language, gl_language.iso_code, glottocode=gl_language.id)
            for attr in 'latitude', 'longitude', 'name':
                if getattr(language, attr) is None:
                    setattr(language, attr, getattr(gl_language, attr))
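# A minimal call sketch for the variant above, not from any of these projects.
# Per the function body, `languages` may mix bare Language objects (whose `id`
# must be a glottocode or ISO 639-3 code) with (code, language) pairs, and
# `glottolog` may be either an API object or a prebuilt {code: languoid} dict.
# `glottolog_api` is a placeholder for a pyglottolog.Glottolog instance.
from clld.db.models.common import Language
from clld.scripts.util import Data

data = Data()
languages = [
    ('stan1295', Language(id='deu', name='German')),  # explicit (code, language) pair
    Language(id='fran1269', name='French'),           # code read from language.id
]
load_families(data, languages, glottolog=glottolog_api, isolates_icon='tcccccc')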
def languoid_visitor(lang, row, _):
    add_language_codes(
        data, lang, lang.id.split('-')[0], None, glottocode=row[2] or None)
    second_languages[row[0]] = row[8]
def main(args):
    # determine if we run on a machine where other databases are available for
    # lookup locally:
    data = Data()
    genera = get_genera(data) if astroman else {}

    glottocodes, lnames, geocoords = {}, {}, {}
    if astroman:
        for k, v in glottocodes_by_isocode(
                'postgresql://robert@/glottolog3',
                cols=['id', 'name', 'latitude', 'longitude']).items():
            glottocodes[k] = v[0]
            lnames[k] = v[1]
            geocoords[k] = (v[2], v[3])

    refs = defaultdict(list)
    for row in get_rows(args, 'BibtexKey'):
        if row[1] == 'NO SOURCE GIVEN':
            refs[row[0]] = []
        else:
            refs[row[0]].append(row[1])
    add_sources(args, data)

    dataset = data.add(
        common.Dataset, 'phoible',
        id='phoible',
        name='PHOIBLE Online',
        description='PHOIBLE Online',
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        domain='phoible.org',
        license='http://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
                'Creative Commons Attribution-ShareAlike 3.0 Unported License'})

    for i, spec in enumerate([
        ('moran', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
        ('wright', "Richard Wright"),
    ]):
        DBSession.add(common.Editor(
            dataset=dataset,
            ord=i + 1,
            contributor=common.Contributor(id=spec[0], name=spec[1])))

    squibs = defaultdict(list)
    for row in get_rows(args, 'Squib'):
        squibs[row[0]].append(row[1])

    source_urls = dict(get_rows(args, 'URL'))
    ia_urls = dict(get_rows(args, 'InternetArchive'))

    aggregated = list(reader(args.data_file('phoible-aggregated.tsv'), namedtuples=True))
    inventory_names = {}
    for key, items in groupby(
            sorted(aggregated, key=lambda t: (t.LanguageCode, t.Source)),
            key=lambda t: (t.LanguageCode, t.Source)):
        items = list(items)

        lname = lnames.get(key[0])
        if not lname:
            lname = items[0].LanguageName
            lnames[key[0]] = lname

        if len(items) == 1:
            inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1])
        else:
            for i, item in enumerate(items):
                inventory_names[item.InventoryID] = '%s %s (%s)' % (lname, i + 1, key[1])

    family_map = {
        ("Arawakan", "arwk"): "Arawakan",
        ("Trans-New Guinea", "trng"): "Trans-New Guinea",
        ("Moklen", "anes"): "Austronesian",
        ("Oko", "ncon"): "Niger-Congo",
        ("Muniche", "saso"): "Muniche",
        ("Tinigua", "saso"): "Tinigua",
        ("Vilela", "luvi"): "Vilela",
        ("Ofayé", "macg"): "Kamakanan",
        ("Purian", "macg"): "PurianPrint",
        ("Mixed language", "saml"): "Mixed language",
        ("Tupian", "tupi"): "Tupian",
        ("Yuwana", "saun"): "YuwanaPrint",
    }
    family_code_map = {k[1]: v for k, v in family_map.items()}

    for row in aggregated:
        lang = data['Variety'].get(row.LanguageCode)
        if not lang:
            if row.LanguageFamilyGenus == 'UNCLASSIFIED':
                genus = None
            else:
                genus_id = slug(strip_quotes(row.LanguageFamilyGenus))
                genus = genera.get(genus_id)
                if not genus:
                    genus = genera.get(row.LanguageCode)
                if not genus:
                    #print(row.LanguageFamilyGenus, row.LanguageFamilyRoot)
                    family = family_map.get(
                        (row.LanguageFamilyGenus, row.LanguageFamilyRoot))
                    genus = genera[genus_id] = data.add(
                        models.Genus, genus_id,
                        id=genus_id,
                        name=row.LanguageFamilyGenus,
                        description=family or row.LanguageFamilyRoot,
                        active=False,
                        root=row.LanguageFamilyRoot)

                if not genus.root:
                    genus.root = row.LanguageFamilyRoot
                if genus.description in family_code_map:
                    genus.description = family_code_map[genus.description]

            if row.LanguageCode in geocoords:
                coords = geocoords[row.LanguageCode]
            elif row.Latitude != 'NULL' and row.Longitude != 'NULL':
                coords = (float(row.Latitude), float(row.Longitude))
            lang = data.add(
                models.Variety, row.LanguageCode,
                id=row.LanguageCode,
                name=lnames[row.LanguageCode],
                genus=genus,
                country=strip_quotes(row.Country),
                area=strip_quotes(row.Area),
                latitude=coords[0],
                longitude=coords[1],
                jsondata=dict(inventory_id=row.InventoryID))
            add_language_codes(data, lang, row.LanguageCode, glottocodes=glottocodes)

        contributor = data['Contributor'].get(row.Source)
        if not contributor:
            contributor = data.add(
                common.Contributor, row.Source,
                id=row.Source,
                name=SOURCES[row.Source][0],
                description=SOURCES[row.Source][2])
            for ref in SOURCES[row.Source][1]:
                DBSession.add(models.ContributorReference(
                    source=data['Source'][ref], contributor=contributor))

        contrib = data.add(
            models.Inventory, row.InventoryID,
            id=row.InventoryID,
            language=lang,
            source=row.Source,
            source_url=source_urls.get(row.InventoryID),
            internetarchive_url=ia_urls.get(row.InventoryID),
            name=inventory_names[row.InventoryID],
            description=row.LanguageName)
        DBSession.add(common.ContributionContributor(
            contribution=contrib, contributor=contributor))

        for j, squib in enumerate(squibs.get(row.InventoryID, [])):
            f = common.Contribution_files(
                object=contrib,
                id='squib-%s-%s.pdf' % (contrib.id, j + 1),
                name='Phonological squib',
                description=squib,
                mime_type='application/pdf')
            assert f
            # f.create(files_dir, file(args.data_file('phonological_squibs', src)).read())

    DBSession.flush()
    unknown_refs = {}

    for row in reader(args.data_file('phoible-phonemes.tsv'), namedtuples=True):
        inventory = data['Inventory'][row.InventoryID]
        segment = data['Segment'].get(row.Phoneme)
        if not segment:
            unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme]
            description = ' - '.join([t[1] for t in unicode_desc])
            segment = data.add(
                models.Segment, row.Phoneme,
                id=b16encode(md5(description).digest()),
                name=row.Phoneme,
                description=description,
                equivalence_class=''.join(
                    [t[0] for t in unicode_desc
                     if t[1].split()[0] not in ['COMBINING', 'MODIFIER']]),
                segment_class=row.Class,
                combined_class=row.CombinedClass)
            DBSession.flush()

        vs = common.ValueSet(
            id=row.PhonemeID,
            contribution=inventory,
            language=inventory.language,
            parameter=segment)

        for ref in refs.get(row.InventoryID, []):
            if ref not in data['Source']:
                if ref not in unknown_refs:
                    print('-------', ref)
                unknown_refs[ref] = 1
                continue
            DBSession.add(common.ValueSetReference(
                source=data['Source'][ref], valueset=vs))

        DBSession.add(common.Value(
            id=row.PhonemeID,
            name='%s %s' % (row.Phoneme, data['Inventory'][row.InventoryID].name),
            valueset=vs))

    DBSession.flush()

    for inventory_id in refs:
        for ref in refs[inventory_id]:
            if ref not in data['Source']:
                continue
            data.add(
                common.ContributionReference, '%s-%s' % (inventory_id, ref),
                source=data['Source'][ref],
                contribution=data['Inventory'][inventory_id])

    for i, row in enumerate(reader(args.data_file('phoible-segments-features.tsv'))):
        if i == 0:
            features = list(map(feature_name, row))
            continue
        if row[0] not in data['Segment']:
            # print('skipping feature vector:', row)
            continue
        for j, value in enumerate(row):
            if j and value != '0':
                DBSession.add(common.Parameter_data(
                    key=features[j],
                    value=value,
                    ord=j,
                    object_pk=data['Segment'][row[0]].pk))
    DBSession.flush()
def main(args):
    data = Data()

    glottocodes, bibtex_keys = {}, defaultdict(set)
    for d in reader(
            args.data_file('repos', 'mappings', 'InventoryID-ISO-gcode-Bibkey-Source.tsv')):
        glottocodes[d['InventoryID']] = d['Glottocode']
        bibtex_keys[d['InventoryID']].add(d['BibtexKey'])

    glottolog = Glottolog(
        Path(phoible.__file__).parent.parent.parent.parent.joinpath(
            'glottolog3', 'glottolog'))
    languoids = {l.id: l for l in glottolog.languoids()}

    phonemes = sorted(
        list(reader(args.data_file('repos', 'data', 'phoible-by-phoneme.tsv'))),
        key=lambda r: (r['InventoryID'], r['GlyphID']))

    inventories = defaultdict(set)
    for p in phonemes:
        if p['InventoryID'] in glottocodes:
            inventories[(
                languoids[glottocodes[p['InventoryID']]].name,
                p['SpecificDialect'],
                p['Source'].upper())].add((p['InventoryID'], p['LanguageName']))

    inventory_names = {}
    for (glname, dname, source), invids in inventories.items():
        if len(invids) == 1:
            invid, lname = invids.pop()
            inventory_names[invid] = name_in_source(glname, dname) + ' [%s]' % source
        else:
            use_lname = len(set(r[1] for r in invids)) == len(invids)
            for i, (invid, lname) in enumerate(sorted(invids, key=lambda j: int(j[0]))):
                disambiguation = ' %s' % (i + 1,)
                if use_lname:
                    disambiguation = ' (%s)' % lname
                inventory_names[invid] = name_in_source(glname, dname) \
                    + '%s [%s]' % (disambiguation, source)

    for (invid, lname, dname, source), ps in groupby(
            phonemes,
            lambda p: (p['InventoryID'], p['LanguageName'],
                       p['SpecificDialect'], p['Source'])):
        if invid not in glottocodes:
            continue
        ps = list(ps)
        gc = glottocodes[invid]
        lang = data['Variety'].get(gc)
        if not lang:
            languoid = languoids[gc]
            lang = data.add(
                models.Variety, gc,
                id=gc,
                language_code=ps[0]['LanguageCode'],
                name=languoid.name,
                level=text_type(languoid.level.name),
                latitude=languoid.latitude,
                longitude=languoid.longitude,
            )
            if lang.latitude is None and languoid.level == Level.dialect:
                ll = get_language(languoid)
                lang.latitude = ll.latitude
                lang.longitude = ll.longitude

        contrib = data.add(
            models.Inventory, invid,
            id=invid,
            #language=lang,
            source=source,
            #source_url=source_urls.get(row.InventoryID),
            #internetarchive_url=ia_urls.get(row.InventoryID),
            name=inventory_names[invid],
            description=name_in_source(lname, dname))

    return

    # NB: everything below this `return` is unreachable legacy code, kept
    # verbatim in the original source during the migration to the new workflow.
    # FIXME: read from mappings file!
    refs = defaultdict(list)
    for row in get_rows(args, 'BibtexKey'):
        if row[1] == 'NO SOURCE GIVEN':
            refs[row[0]] = []
        else:
            refs[row[0]].append(row[1])
    add_sources(args, data)

    dataset = data.add(
        common.Dataset, 'phoible',
        id='phoible',
        name='PHOIBLE Online',
        description='PHOIBLE Online',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        domain='phoible.org',
        license='http://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
                'Creative Commons Attribution-ShareAlike 3.0 Unported License'})

    for i, spec in enumerate([
        ('moran', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
        ('wright', "Richard Wright"),
    ]):
        DBSession.add(common.Editor(
            dataset=dataset,
            ord=i + 1,
            contributor=common.Contributor(id=spec[0], name=spec[1])))

    #squibs = defaultdict(list)
    #for row in get_rows(args, 'Squib'):
    #    squibs[row[0]].append(row[1])

    source_urls = dict(get_rows(args, 'URL'))
    ia_urls = dict(get_rows(args, 'InternetArchive'))

    # FIXME: group phoible-by-phoneme by LanguageCode, Source (make sure this
    # is unique!)
    aggregated = list(reader(
        args.data_file('phoible-aggregated.tsv'), delimiter='\t', namedtuples=True))
    inventory_names = {}
    for key, items in groupby(
            sorted(aggregated, key=lambda t: (t.LanguageCode, t.Source)),
            key=lambda t: (t.LanguageCode, t.Source)):
        items = list(items)

        lname = lnames.get(key[0])
        if not lname:
            lname = items[0].LanguageName
            lnames[key[0]] = lname

        if len(items) == 1:
            inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1])
        else:
            for i, item in enumerate(items):
                inventory_names[item.InventoryID] = '%s %s (%s)' % (lname, i + 1, key[1])

    # pull in Glottolog families instead? or in addition?
    family_map = {
        ("Arawakan", "arwk"): "Arawakan",
        ("Trans-New Guinea", "trng"): "Trans-New Guinea",
        ("Moklen", "anes"): "Austronesian",
        ("Oko", "ncon"): "Niger-Congo",
        ("Muniche", "saso"): "Muniche",
        ("Tinigua", "saso"): "Tinigua",
        ("Vilela", "luvi"): "Vilela",
        ("Ofayé", "macg"): "Kamakanan",
        ("Purian", "macg"): "PurianPrint",
        ("Mixed language", "saml"): "Mixed language",
        ("Tupian", "tupi"): "Tupian",
        ("Yuwana", "saun"): "YuwanaPrint",
    }
    family_code_map = {k[1]: v for k, v in family_map.items()}

    for row in aggregated:
        lang = data['Variety'].get(row.LanguageCode)
        if not lang:
            if row.LanguageFamilyGenus == 'UNCLASSIFIED':
                genus = None
            else:
                genus_id = slug(strip_quotes(row.LanguageFamilyGenus))
                genus = genera.get(genus_id)
                if not genus:
                    genus = genera.get(row.LanguageCode)
                if not genus:
                    #print(row.LanguageFamilyGenus, row.LanguageFamilyRoot)
                    family = family_map.get(
                        (row.LanguageFamilyGenus, row.LanguageFamilyRoot))
                    genus = genera[genus_id] = data.add(
                        models.Genus, genus_id,
                        id=genus_id,
                        name=row.LanguageFamilyGenus,
                        description=family or row.LanguageFamilyRoot,
                        active=False,
                        root=row.LanguageFamilyRoot)

                if not genus.root:
                    genus.root = row.LanguageFamilyRoot
                if genus.description in family_code_map:
                    genus.description = family_code_map[genus.description]

            if row.LanguageCode in geocoords:
                coords = geocoords[row.LanguageCode]
            elif row.Latitude != 'NULL' and row.Longitude != 'NULL':
                coords = (float(row.Latitude), float(row.Longitude))
            lang = data.add(
                models.Variety, row.LanguageCode,
                id=row.LanguageCode,
                name=lnames[row.LanguageCode],
                genus=genus,
                country=strip_quotes(row.Country),
                area=strip_quotes(row.Area),
                latitude=coords[0],
                longitude=coords[1],
                jsondata=dict(inventory_id=row.InventoryID))
            add_language_codes(data, lang, row.LanguageCode, glottocodes=glottocodes)

        contributor = data['Contributor'].get(row.Source)
        if not contributor:
            contributor = data.add(
                common.Contributor, row.Source,
                id=row.Source,
                name=SOURCES[row.Source][0],
                description=SOURCES[row.Source][2])
            for ref in SOURCES[row.Source][1]:
                DBSession.add(models.ContributorReference(
                    source=data['Source'][ref], contributor=contributor))

        contrib = data.add(
            models.Inventory, row.InventoryID,
            id=row.InventoryID,
            language=lang,
            source=row.Source,
            source_url=source_urls.get(row.InventoryID),
            internetarchive_url=ia_urls.get(row.InventoryID),
            name=inventory_names[row.InventoryID],
            description=row.LanguageName)
        DBSession.add(common.ContributionContributor(
            contribution=contrib, contributor=contributor))

        #for j, squib in enumerate(squibs.get(row.InventoryID, [])):
        #    f = common.Contribution_files(
        #        object=contrib,
        #        id='squib-%s-%s.pdf' % (contrib.id, j + 1),
        #        name='Phonological squib',
        #        description=squib,
        #        mime_type='application/pdf')
        #    assert f
        #    # f.create(files_dir, file(args.data_file('phonological_squibs', src)).read())

    DBSession.flush()
    unknown_refs = {}

    for row in reader(args.data_file('phoible-phonemes.tsv'), namedtuples=True):
        inventory = data['Inventory'][row.InventoryID]
        segment = data['Segment'].get(row.Phoneme)
        if not segment:
            unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme]
            description = ' - '.join([t[1] for t in unicode_desc])
            segment = data.add(
                models.Segment, row.Phoneme,
                id=b16encode(md5(description).digest()),
                name=row.Phoneme,
                description=description,
                equivalence_class=''.join(
                    [t[0] for t in unicode_desc
                     if t[1].split()[0] not in ['COMBINING', 'MODIFIER']]),
                segment_class=row.Class,
                combined_class=row.CombinedClass)
            DBSession.flush()

        vs = common.ValueSet(
            id=row.PhonemeID,
            contribution=inventory,
            language=inventory.language,
            parameter=segment)

        for ref in refs.get(row.InventoryID, []):
            if ref not in data['Source']:
                if ref not in unknown_refs:
                    print('-------', ref)
                unknown_refs[ref] = 1
                continue
            DBSession.add(common.ValueSetReference(
                source=data['Source'][ref], valueset=vs))

        DBSession.add(common.Value(
            id=row.PhonemeID,
            name='%s %s' % (row.Phoneme, data['Inventory'][row.InventoryID].name),
            valueset=vs))

    DBSession.flush()

    for inventory_id in refs:
        for ref in refs[inventory_id]:
            if ref not in data['Source']:
                continue
            data.add(
                common.ContributionReference, '%s-%s' % (inventory_id, ref),
                source=data['Source'][ref],
                contribution=data['Inventory'][inventory_id])

    for i, row in enumerate(reader(args.data_file('phoible-segments-features.tsv'))):
        if i == 0:
            features = list(map(feature_name, row))
            continue
        if row[0] not in data['Segment']:
            # print('skipping feature vector:', row)
            continue
        for j, value in enumerate(row):
            if j and value != '0':
                DBSession.add(common.Parameter_data(
                    key=features[j],
                    value=value,
                    ord=j,
                    object_pk=data['Segment'][row[0]].pk))

    # FIXME: add allophones!
    DBSession.flush()
def main(args):
    # order of init:
    # - villages
    # - files
    # - movies
    videos = defaultdict(list)
    for f in util.iter_files(args):
        obj = models.File(**attr.asdict(f))
        if obj.mime_type.startswith('video'):
            videos[slug(obj.name.split('.')[0])].append(obj)
        DBSession.add(obj)

    lexicon = list(util.iter_lexicon(args))
    villages = util.get_villages(args)
    ff_images = list(util.ff_images(args))
    bib = list(util.get_bib(args))

    data = Data()
    dataset = common.Dataset(
        id=dogonlanguages.__name__,
        name="Dogon and Bangime Linguistics",
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='dogonlanguages.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    if Glottolog:
        if socket.gethostname() == 'dlt5502178l':
            glottolog = Glottolog(
                Path(dogonlanguages.__file__).parent.parent.parent.parent.joinpath(
                    'glottolog3', 'glottolog'))
        else:
            glottolog = Glottolog(
                Path(dogonlanguages.__file__).parent.parent.parent.parent.joinpath(
                    'glottolog'))
        languoids = {l.id: l for l in glottolog.languoids()}
    else:
        languoids = {}
    print('got glottolog')

    for c in util.CONTRIBUTORS:
        id_ = slug(c.name.split()[-1])
        data.add(models.Member, id_, id=id_, **attr.asdict(c))
    data.add(
        models.Member, 'forkel',
        id='forkel',
        name='Robert Forkel',
        email='*****@*****.**',
        in_project=False)

    for i, id_ in enumerate(['moran', 'forkel', 'heath']):
        DBSession.add(common.Editor(
            dataset=dataset, ord=i + 1, contributor=data['Member'][id_]))

    contrib = data.add(common.Contribution, 'd', id='d', name='Dogon Languages')
    for doc in bib:
        obj = data.add(
            models.Document,
            doc.rec.id,
            _obj=bibtex2source(doc.rec, cls=models.Document))
        keywords = nfilter([s.strip() for s in doc.rec.get('keywords', '').split(',')])
        for dt in 'grammar lexicon typology texts'.split():
            if dt in keywords:
                obj.doctype = dt
                break
        obj.project_doc = ('DLP' in keywords) or bool(doc.files)
        if obj.project_doc:
            for i, cid in enumerate(util.get_contributors(doc.rec, data)):
                models.DocumentContributor(
                    document=obj, contributor=data['Member'][cid], ord=i)
        for i, (path, cdstar) in enumerate(doc.files):
            common.Source_files(
                id='%s-%s' % (obj.id, i + 1),
                name=path,
                object=obj,
                mime_type=guess_type(path)[0],
                jsondata=cdstar,
            )
    print('got bib')

    for name, (gc, desc) in LANGUAGES.items():
        gl_lang = languoids[gc]
        lat, lon = gl_lang.latitude, gl_lang.longitude
        lang = data.add(
            models.Languoid, gc,
            id=gc,
            name=name,
            description=desc,
            latitude=lat,
            longitude=lon,
            family=gl_lang.family.name if gl_lang and gl_lang.family else name,
        )
        if name == 'Penange' and lang.longitude > 0:
            lang.longitude = -lang.longitude
        if name == 'Bankan Tey':
            lang.latitude, lang.longitude = 15.07, -2.91
        if name == 'Ben Tey':
            lang.latitude, lang.longitude = 14.85, -2.95
        if name == 'Togo Kan':
            lang.latitude, lang.longitude = 14.00, -3.25
        add_language_codes(data, lang, gl_lang.iso, glottocode=gc)

    villages_by_name = defaultdict(list)
    contrib_by_initial = {c.abbr: c for c in data['Member'].values()}
    for i, village in enumerate(villages):
        lang = None
        if village.glottocode:
            lang = data['Languoid'].get(village.glottocode)
            if not lang:
                gl_lang = languoids[village.glottocode]
                lang = data.add(
                    models.Languoid, gl_lang.id,
                    id=gl_lang.id,
                    name=gl_lang.name,
                    in_project=False,
                    family=gl_lang.family.name if gl_lang.family else gl_lang.name)
        v = data.add(
            models.Village, str(i + 1),
            id=str(i + 1),
            name=village.name,
            description=village.data.pop('social info'),
            surnames=village.data.pop('surnames'),
            major_city=village.data['MajorCity'] == 'Y',
            transcribed_name=village.data.pop('Transcribed Village Name'),
            source_of_coordinates=village.data.pop('sourceOfCoordinates'),
            latitude=village.lat,
            longitude=village.lon,
            languoid=lang,
            jsondata=village.data,
        )
        villages_by_name[village.name].append(v)
        for img in village.images:
            mimetype = guess_type(img.name)[0]
            if mimetype:
                f = models.Village_files(
                    id=img.id,
                    name=img.name,
                    description=img.description,
                    date_created=img.date,
                    latitude=img.coords[0] if img.coords else None,
                    longitude=-img.coords[1] if img.coords else None,
                    object=v,
                    mime_type=mimetype,
                    jsondata=img.cdstar,
                )
                for initial in img.creators:
                    if initial in contrib_by_initial:
                        models.Fotographer(
                            foto=f, contributor=contrib_by_initial[initial])

    for cat, desc, place, name in MOVIES:
        s = slug(name)
        m = models.Movie(
            id=s,
            name=desc,
            description=cat,
            place=place,
        )
        if place in villages_by_name and len(villages_by_name[place]) == 1:
            m.village = villages_by_name[place][0]
            #print('found village: %s' % name)
        for v in videos[s]:
            #print('found video: %s' % name)
            v.movie = m
            m.duration = v.duration

    names = defaultdict(int)
    for concept in lexicon:
        add(concept, data, names, contrib)

    count = set()
    for img in ff_images:
        if img.id in count:
            continue
        count.add(img.id)
        if img.ref:
            if img.ref in data['Concept']:
                concept = data['Concept'][img.ref]
                if img.tsammalex_taxon and not concept.tsammalex_taxon:
                    concept.tsammalex_taxon = img.tsammalex_taxon
                    #print(concept.tsammalex_taxon)
                common.Parameter_files(
                    object=concept,
                    id=img.id,
                    name=img.name.decode('utf8'),
                    mime_type=guess_type(img.name)[0],
                    jsondata=img.cdstar)
            else:
                print('missing ref: %s' % img.ref)
def main(args):  # pragma: no cover
    wl = Wordlist.from_metadata(args.data_file('cldf', 'cldf-metadata.json'))
    data = Data()

    data.add(
        common.Contributor, 'barthwolfgang',
        id='barthwolfgang',
        name="Wolfgang Barth",
        url="http://www.dynamicsoflanguage.edu.au/")
    #
    # FIXME: get dataset attributes from CLDF metadata!
    #
    dataset = common.Dataset(
        id='parabank',
        name='Parabank Pronouns',
        description='Database of pronouns',
        domain='parabank.clld.org',
        publisher_name="CoEDL Centre of Excellence for the Dynamics of Language",
        publisher_place="Canberra, Australia",
        publisher_url="http://www.dynamicsoflanguage.edu.au/",
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0'})
    DBSession.add(dataset)

    for i, editor in enumerate(['barthwolfgang']):
        common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1)

    contrib = common.Contribution(id='contrib', name='the contribution')

    for l in wl['LanguageTable']:
        lang = data.add(
            models.ParabankLanguage, l['ID'],
            id=l['ID'],
            name=l['Name'],
            description=l['Notes'],
            source=l['Source_Citation'],
            classification=l['Classification'],
        )
        add_language_codes(data, lang, None, glottocode=l['Glottocode'])

    for p in wl['ParameterTable']:
        data.add(
            common.Parameter, p['ID'],
            id=p['ID'],
            name='{0} ({1})'.format(p['Name'], p['ID']),
            #description=p['Description'],
        )

    for f in wl['FormTable']:
        vsid = '{0}-{1}'.format(f['Parameter_ID'], f['Language_ID'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet, vsid,
                id=vsid,
                language=data['ParabankLanguage'][f['Language_ID']],
                parameter=data['Parameter'][f['Parameter_ID']],
                contribution=contrib)
        DBSession.add(models.Word(
            id=f['ID'],
            name=f['Form'],
            comment=f.get('Comment'),
            original=f['Original_parameter'],
            valueset=vs))

    load_families(
        data,
        [(l.glottocode, l) for l in data['ParabankLanguage'].values()],
        glottolog_repos=args.data_file('glottolog'),
        isolates_icon='tcccccc')
def load_families(data, languages, glottolog=None, icons=ORDERED_ICONS,
                  isolates_icon=ISOLATES_ICON):
    """Add Family objects to a database and update Language objects from Glottolog.

    Family information is retrieved from Glottolog based on the id attribute of a
    language. This id must be either a glottocode or an ISO 639-3 code.

    :param data:
    :return:
    """
    icons = cycle([
        getattr(i, 'name', i) for i in icons
        if getattr(i, 'name', i) != isolates_icon])
    glottolog = glottolog or Glottolog()
    print len(languages), languages
    for language in languages:
        if isinstance(language, (tuple, list)) and len(language) == 2:
            code, language = language
        else:
            code = language.id
        print language, code
        if code != '-':
            gl_language = glottolog.languoid(code)
            if gl_language:
                gl_family = gl_language.family
                if gl_family and gl_family.name in [
                        'Sino-Tibetan', 'Dravidian', 'Indo-European',
                        'Austroasiatic']:  # the second condition is added by me (shafqat)
                    family = data['Family'].get(gl_family.id)
                    #print 'this one'
                    #print gl_family.name
                    if not family:
                        family = data.add(
                            Family,
                            gl_family.id,
                            id=gl_family.id,
                            name=gl_family.name,
                            description=Identifier(
                                name=gl_family.id,
                                type=IdentifierType.glottolog.value).url(),
                            ##jsondata=dict(icon=next(icons)))
                            jsondata=dict(icon=custom_icons[gl_family.name]),
                        )  ## based on family, we can use different icons if we like as needed in case of LSI
                    language.family = family
                language.macroarea = gl_language.macroareas[0]
                add_language_codes(
                    data, language, gl_language.iso_code, glottocode=gl_language.id)
                for attr in 'latitude', 'longitude', 'name':
                    if getattr(language, attr) is None:
                        setattr(language, attr, getattr(gl_language, attr))
        else:
            language.macroarea = None
def main(args):
    data = Data()
    dataset = common.Dataset(
        id=abvd.__name__,
        name='ABVD',
        description='',
        domain='abvd.clld.org',
        published=date.today(),
        license='https://creativecommons.org/licenses/by/4.0/',
        contact='',
        jsondata={
            'doi': args.doi,
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    for name in ['Simon Greenhill', 'Robert Blust', 'Russell Gray']:
        common.Editor(contributor=contributor(data, name), dataset=dataset)

    cnames = Counter()
    families = Counter([l['Family'] for l in args.cldf['LanguageTable']])
    colors = dict(zip(
        [i[0] for i in families.most_common()],
        color.qualitative_colors(len(families))))
    cid2l = {}
    for lang in args.cldf['LanguageTable']:
        lid = (lang['Name'], lang['Glottocode'])
        l = data['Language'].get(lid)
        if not l:
            l = data.add(
                common.Language, lid,
                id=lang['ID'],
                name=lang['Name'],
                latitude=lang['Latitude'],
                longitude=lang['Longitude'],
                jsondata=dict(
                    family=lang['Family'],
                    icon='{0}{1}'.format(
                        'c' if lang['Family'] else 't', colors[lang['Family']]),
                ),
            )
            if lang['Glottocode'] or lang['ISO639P3code']:
                add_language_codes(
                    data, l, isocode=lang['ISO639P3code'], glottocode=lang['Glottocode'])
        cid2l[lang['ID']] = l
        cname = '{0} ({1})'.format(lang['Name'], lang['author'])
        cnames.update([cname])
        if cnames[cname] > 1:
            cname += ' {0}'.format(cnames[cname])
        c = data.add(
            models.Wordlist, lang['ID'],
            id=lang['ID'],
            name=cname,
            description=lang['author'],
            language=l,
            notes=lang['notes'],
        )
        i = 0
        typers = (lang['typedby'] or '').split(' and ')
        checkers = (lang['checkedby'] or '').split(' and ')
        for name in typers:
            i += 1
            DBSession.add(common.ContributionContributor(
                contribution=c,
                contributor=contributor(data, name),
                ord=i,
                jsondata=dict(
                    type='typedby and checkedby' if name in checkers else 'typedby'),
            ))
        for name in checkers:
            if name in typers:
                continue
            i += 1
            DBSession.add(common.ContributionContributor(
                contribution=c,
                contributor=contributor(data, name),
                ord=i,
                jsondata=dict(type='checkedby'),
            ))

    for param in args.cldf['ParameterTable']:
        data.add(
            common.Parameter, param['ID'],
            id=param['ID'],
            name=param['Name'],
        )

    #
    # FIXME: add sources!
    #
    vsrs = set()
    for row in args.cldf['FormTable']:
        vs = data['ValueSet'].get((row['Language_ID'], row['Parameter_ID']))
        if not vs:
            vs = data.add(
                common.ValueSet, (row['Language_ID'], row['Parameter_ID']),
                id='{0}-{1}'.format(row['Language_ID'], row['Parameter_ID']),
                language=cid2l[row['Language_ID']],
                parameter=data['Parameter'][row['Parameter_ID']],
                contribution=data['Wordlist'][row['Language_ID']],
            )
        v = data.add(
            common.Value, row['ID'],
            id=row['ID'],
            name=row['Form'],
            valueset=vs)

    for row in args.cldf['CognateTable']:
        cc = data['Cognateset'].get(row['Cognateset_ID'])
        if not cc:
            cc = data.add(Cognateset, row['Cognateset_ID'], id=row['Cognateset_ID'])
        data.add(
            Cognate, row['ID'],
            cognateset=cc,
            counterpart=data['Value'][row['Form_ID']],
            doubt=row['Doubt'],
        )
def load_languoid(data, lang, nodemap):
    dblang = data.add(
        models.Languoid, lang.id,
        id=lang.id,
        hid=lang.hid,
        name=lang.name,
        bookkeeping=lang.category == models.BOOKKEEPING,
        newick=lang.newick_node(nodemap).newick,
        latitude=lang.latitude,
        longitude=lang.longitude,
        #
        # TODO: switch to using the AES labels, i.e. lang.endangerment.description!
        #
        status=models.LanguoidStatus.get(
            lang.endangerment.name if lang.endangerment else 'safe'),
        level=models.LanguoidLevel.from_string(lang.level.name),
        father=data['Languoid'][lang.lineage[-1][1]] if lang.lineage else None)
    if lang.iso:
        add_language_codes(data, dblang, lang.iso)

    for prov, names in lang.names.items():
        for name in names:
            l = 'en'
            if '[' in name and name.endswith(']'):
                name, l = [s.strip() for s in name[:-1].split('[', 1)]
            add_identifier(dblang, data, name, 'name', prov, lang=l)

    for prov, ids in lang.identifier.items():
        for id_ in split_text(ids, separators=',;'):
            add_identifier(dblang, data, id_, prov, None)

    if not dblang.bookkeeping:
        # Languages in Bookkeeping do not have a meaningful classification!
        clf = lang.classification_comment
        if clf:
            for attr, pid in [('sub', 'sc'), ('family', 'fc')]:
                val = getattr(clf, attr)
                if attr == 'sub' and not val:
                    # Handle cases with subrefs but no sub comment.
                    val = getattr(clf, 'subrefs')
                    if val:
                        val = ', '.join('{0}'.format(r) for r in val)
                if attr == 'family' and not val:
                    # Handle cases with familyrefs but no family comment.
                    val = getattr(clf, 'familyrefs')
                    if val:
                        val = ', '.join('{0}'.format(r) for r in val)
                if not val:
                    continue
                vs = common.ValueSet(
                    id='%s-%s' % (pid, lang.id),
                    description=val,
                    language=dblang,
                    parameter=data['Parameter'][pid],
                    contribution=data['Contribution']['clf'])
                DBSession.add(common.Value(id='%s-%s' % (pid, lang.id), valueset=vs))

    iso_ret = lang.iso_retirement
    if iso_ret:
        DBSession.add(models.ISORetirement(
            id=iso_ret.code,
            name=iso_ret.name,
            description=iso_ret.comment,
            effective=iso_ret.effective,
            reason=iso_ret.reason,
            remedy=iso_ret.remedy,
            change_request=iso_ret.change_request,
            languoid=dblang))

    eth_cmt = lang.ethnologue_comment
    if eth_cmt:
        DBSession.add(models.EthnologueComment(
            comment=eth_cmt.comment,
            code=eth_cmt.isohid,
            type=eth_cmt.comment_type,
            affected=eth_cmt.ethnologue_versions,
            languoid=dblang))
def load_languoid(glottolog, data, lang, nodemap):
    """
    Load data from one Languoid object.

    :param glottolog: A `pyglottolog.Glottolog` instance.
    :param data: A `dict` providing access to previously loaded data.
    :param lang: The `pyglottolog.languoids.Languoid` object.
    :param nodemap: A `dict` mapping glottocodes to `pyglottolog.languoids.Languoid`s.
    :return:
    """
    dblang = data.add(
        models.Languoid, lang.id,
        id=lang.id,
        hid=lang.hid,
        name=lang.name,
        bookkeeping=lang.category == glottolog.language_types.bookkeeping.category,
        category=lang.category,
        newick=lang.newick_node(nodemap).newick,
        latitude=lang.latitude,
        longitude=lang.longitude,
        level=models.LanguoidLevel.from_string(lang.level.name),
        father=data['Languoid'][lang.lineage[-1][1]] if lang.lineage else None,
        jsondata=dict(
            iso_retirement=lang.iso_retirement.__json__()
            if lang.iso_retirement else None,
            ethnologue_comment=lang.ethnologue_comment.__json__()
            if lang.ethnologue_comment else None,
            links=[l.__json__() for l in lang.links],
        )
    )
    if lang.iso:
        add_language_codes(data, dblang, lang.iso)

    add_identifiers(data, dblang, lang.names, name_type=True)
    add_identifiers(data, dblang, lang.identifier, name_type=False)

    add = functools.partial(add_values, data, dblang)
    add('macroarea', [(m.id, m.name) for m in lang.macroareas])
    add('country', [(c.id, c.name) for c in lang.countries])
    if lang.endangerment:
        add(
            'aes',
            [(lang.endangerment.status.id, lang.endangerment.status.name)],
            source=lang.endangerment.source.name,
            jsondata=attr.asdict(lang.endangerment.source),
            description=lang.endangerment.comment,
        )
    if lang.level == glottolog.languoid_levels.language:
        add('ltype', [(lang.category, lang.category)])

    if not dblang.bookkeeping:
        # Languages in Bookkeeping do not have a meaningful classification!
        clf = lang.classification_comment
        if clf:
            for attr_, pid in [('sub', 'sc'), ('family', 'fc')]:
                val = getattr(clf, attr_)
                if not val:
                    val = getattr(clf, attr_ + 'refs')
                    if val:
                        val = ', '.join('{0}'.format(r) for r in val)
                if val:
                    add(pid, [('1', '')], with_de=False, description=val)
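# A minimal driver sketch for the variant above, not taken from the examples:
# it assumes a glottolog data checkout at the placeholder path REPOS and a
# `data` dict as used throughout this collection. NB: in practice languoids
# must be loaded parents-first so the data['Languoid'] lookup for `father`
# succeeds; that ordering is glossed over here.
from pyglottolog import Glottolog

glottolog = Glottolog(REPOS)
nodemap = {l.id: l for l in glottolog.languoids()}
for lang in nodemap.values():
    load_languoid(glottolog, data, lang, nodemap)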
def main(args):
    """Fill the database with data retrieved from tabular files.

    'filltables()' iterates over each row of each table. 'typ' is the name of
    the table, 'name' a key column, and 'tupl' the other columns as a dict
    {column_name: cell_value}.
    """
    data = Data()
    count = 0

    # dataset
    dataset = _addDataset(data)

    # load languages
    for typ, name, tupl in filltables():
        if not name or name == "na":
            continue
        # TODO: we exclude non-core languages
        if typ == "languages" and tupl.get('Id').startswith('L_'):
            lang = _addLang([
                name,
                tupl.get('Language', "na"),
                tupl.get('Family', "na"),
                tupl.get('fam_glottocode', ""),
                tupl.get('Area', "na"),
                tupl.get('Creator', "na"),
                tupl.get('Date', "na"),
                tupl.get('Archive', "na"),
                tupl.get('Archive_link', "na"),
                tupl.get('Translation', "na"),
                tupl.get('License', "na"),
                tupl.get('Audio license', "na"),
                tupl.get('NAKALA', "na"),
                tupl.get('Gloss', "na"),
                tupl.get('Words', 0),
                tupl.get('Speakers', 0),
                tupl.get('Texts', 0),
                tupl.get('Core words', 0),
                tupl.get('Core speakers', 0),
                tupl.get('Core texts', 0),
                tupl.get('Latitude', 0.0),
                tupl.get('Longitude', 0.0),
                tupl.get('Extended', "no"),
            ])
            add_language_codes(data, lang, tupl.get('iso-639-3'), glottocode=name)
        elif typ == "editors":
            dataset, count = _addEditor(dataset, count, [
                name,
                tupl.get('url', "na"),
                tupl.get('email', "na"),
                tupl.get('address', "na"),
                tupl.get('team', "na"),
                tupl.get('function', "na"),
            ])
        elif typ == "sources":
            _addSource([
                name,
                tupl.get('bibtex_type', "na"),
                tupl.get('author', "na"),
                tupl.get('year', "na"),
                tupl.get('title', "na"),
                tupl.get('url', "na"),
                tupl.get('note', "na"),
            ])
        else:
            # TODO: for texts, we exclude 'delete' and the like in column 'extended'
            if tupl.get('extended') in ['no', 'yes']:
                _addText([
                    typ,
                    name,
                    tupl.get('name', "na"),
                    tupl.get('spk_code', "na"),
                    tupl.get('spk_age', '0'),
                    tupl.get('spk_age_c', "na"),
                    tupl.get('spk_sex', "na"),
                    tupl.get('rec_date', "na"),
                    tupl.get('rec_date_c', "na"),
                    tupl.get('genre', "na"),
                    tupl.get('genre_stim', "na"),
                    tupl.get('gloss', "na"),
                    tupl.get('transl', "na"),
                    tupl.get('sound', "na"),
                    tupl.get('overlap', "na"),
                    tupl.get('processed', "na"),
                    tupl.get('nakala', "na"),
                    tupl.get('words', 0),
                    tupl.get('extended', "no"),
                ])

    # Note: needs to run after loading (for editors)
    DBSession.add(dataset)
    DBSession.flush()
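# Hypothetical rows of the shape consumed by the loop above, for illustration
# only: each item yielded by filltables() is a (typ, name, tupl) triple, where
# `typ` selects the branch ("languages", "editors", "sources", or a text table
# id) and `tupl` maps column names to cell values. All concrete values here
# are invented.
rows = [
    ("languages", "dolg1241",
     {"Id": "L_DOLG", "Language": "Dolgan", "iso-639-3": "dlg"}),
    ("editors", "Some Editor",
     {"email": "na", "team": "yes", "function": "developer"}),
    ("dolg1241", "text_01",
     {"name": "The fox", "extended": "no", "words": 120}),
]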
def main(args):
    sources = get_sources(args)
    Index('ducet1', collkey(common.Value.name)).create(DBSession.bind)
    Index('ducet2', collkey(models.Counterpart.phonetic)).create(DBSession.bind)
    data = Data()

    glottocodes, geocoords = {}, defaultdict(lambda: (None, None))
    for k, v in glottocodes_by_isocode(
            'postgresql://robert@/glottolog3',
            cols=['id', 'latitude', 'longitude']).items():
        glottocodes[k] = v[0]
        geocoords[k] = (v[1], v[2])
    geocoords['win'] = (43.50, -88.50)

    dataset = common.Dataset(
        id=csd.__name__,
        name="Comparative Siouan Dictionary",
        description="Comparative Siouan Dictionary",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='csd.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    contrib = common.Contribution(id='csd', name=dataset.name)

    for i, spec in enumerate([
        ('Robert L. Rankin', True),
        ('Richard T. Carter', True),
        ('A. Wesley Jones', True),
        ('John E. Koontz', True),
        ('David S. Rood', True),
        ('Iren Hartmann', True),
    ]):
        name, primary = spec
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i, primary=primary))

    d = Dictionary(
        args.data_file(TXT),
        entry_impl=CsdEntry,
        entry_sep='\\lx ')
    d.entries = list(filter(lambda r: r.get('lx'), d.entries))[1:]
    print(len(d.entries))

    for i, v in enumerate(_LANGUAGES):
        l = data.add(
            models.Languoid, v[0],
            id=v[0],
            name=v[1],
            ord=i,
            color=v[4].lower(),
            proto=v[0].startswith('p') and len(v[0]) == 3,
            latitude=geocoords[v[2]][0],
            longitude=geocoords[v[2]][1],
            parent=data['Languoid'].get(v[5]))
        if v[2]:
            add_language_codes(data, l, v[2], glottocodes=glottocodes)
        if l.id == 'pn':
            l.latitude, l.longitude = (42.75, -98.03)
        if l.id == 'op':
            l.latitude, l.longitude = (43.5, -96.6)
        if l.id == 'mo':
            l.latitude, l.longitude = (40.05, -95.52)

    pnames = set()

    def _get(d, marker):
        _l = set(nfilter(d.get(marker, [])))
        if _l:
            _l = list(_l)
            if marker not in ['oo', 'or']:
                assert len(_l) == 1
                _l = _l[0]
            return _l

    def add_counterpart(d, vs, id,
                        phonetic,  # forms
                        cognate,   # oo
                        me, cm, so, org):
        assert phonetic or cognate
        if not cognate:
            if vs.language.proto:
                cognate = phonetic
                phonetic = None
            else:
                cognate = '[%s]' % phonetic
        m = models.Counterpart(
            id=id,
            name=cognate,
            phonetic=phonetic,
            description=me or '[%s]' % vs.parameter.name,
            comment=cm,
            original_entry=org,
            other_reconstructions='; '.join(_get(d, 'or') or [])
            if vs.language.id == 'psi' else None,
            valueset=vs)
        if so:
            for sid in nfilter([s.strip() for s in SEP_PATTERN.split(so or '')]):
                match = SID_PATTERN.match(sid)
                if not match:
                    continue
                name = sid
                sid = normalize_sid(match.group('key'))
                source = data['Source'].get(sid)
                if not source:
                    if sid in sources:
                        s = sources[sid]
                        source = data.add(
                            common.Source, sid,
                            id=sid,
                            name=s['Name'].upper() if len(s['Name']) <= 3 else s['Name'],
                            description=s.get('Title', s['citation']),
                            author=s.get('Author'),
                            title=s.get('Title'),
                            year=s.get('Year'),
                        )
                    else:
                        source = data.add(
                            common.Source, sid,
                            id=sid,
                            name=name.upper() if len(name) <= 3 else name)
                m.references.append(models.ValueReference(
                    source=source, description=match.group('pages')))

    for i, entry in enumerate(sorted(d.entries, key=lambda d: d.get('lx'), reverse=True)):
        lemma = entry.get('lx')
        if not lemma or not lemma.strip():
            continue
        pname = lemma
        j = 1
        while pname in pnames:
            pname = '%s (%s)' % (lemma, j)
            j += 1
        pnames.add(pname)

        contrib = data.add(
            common.Contribution, pname,
            id=str(i + 1),
            name='Entry "%s"' % pname)
        meaning = data.add(
            models.Entry, pname,
            id=str(i + 1),
            name=pname,
            contribution=contrib,
            description=entry.get('com'),
            psi_reconstruction_with_root_extension_code=entry.get('lxcm'),
            sd=normalize_comma_separated(entry.get('sd'), SD, lower=True),
            ps=normalize_comma_separated(entry.get('ps'), PS),
            othlgs='\n---\n'.join(entry.getall('othlgs')))
        if meaning.description:
            meaning.description = meaning.description.replace('.\n', '.\n\n')

        for lid, words in entry.get_words().items():
            vsid = '%s-%s' % (lid, meaning.id)
            vs = data.add(
                common.ValueSet, vsid,
                id=vsid,
                parameter=meaning,
                contribution=contrib,
                language=data['Languoid'][lid])
            for j, d in enumerate(words):
                looped = False
                for k, (oo, me, so, cm, org) in enumerate(izip_longest(
                        *[d.get(_m, []) for _m in 'oo me so cm _org'.split()])):
                    if not oo:
                        continue
                    looped = True
                    add_counterpart(
                        d, vs,
                        '%s-%s-%s' % (vsid, j + 1, k + 1),
                        d['forms'], oo, me, cm, so, org)
                if not looped:  # not oo
                    if not d['forms']:
                        print '--->', d
                        continue
                    add_counterpart(
                        d, vs,
                        '%s-%s-%s' % (vsid, j + 1, 1),
                        d['forms'],
                        '; '.join(_get(d, 'oo') or []),
                        _get(d, 'me'),
                        _get(d, 'cm'),
                        _get(d, 'so'),
                        _get(d, '_org'))