def add_sources(sources_file_path, session):
    """
    Creates and adds to the given SQLAlchemy session the common.Source model
    instances that comprise the project's references.

    Expects the path to a BibTeX file as its first argument. Returns a dict
    containing the added model instances with the BibTeX IDs being the keys.
    Helper for the main function.
    """
    d = {}
    bibtex_db = bibtex.Database.from_file(sources_file_path, encoding='utf-8')
    seen = set()
    for record in bibtex_db:
        if record.id in seen:
            continue
        d[record.id] = bibtex2source(record)
        session.add(d[record.id])
        seen.add(record.id)
    session.flush()
    return d
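# Usage sketch for add_sources (illustrative, not part of the original script).
# It assumes an initialized clld app environment; the BibTeX file name and the
# source key are hypothetical.
from clld.db.meta import DBSession

sources = add_sources('sources.bib', DBSession)
# The returned dict is keyed by BibTeX ID, so later loading steps can attach
# references without re-querying the database, e.g.
# common.ValueSetReference(valueset=vs, source=sources['meier2004']).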
def main(args): assert args.glottolog, 'The --glottolog option is required!' clts = CLTS(input('Path to cldf-clts/clts:') or '../../cldf-clts/clts') data = Data() ds = data.add( common.Dataset, lsi.__name__, id=lsi.__name__, name= 'The Comparative Vocabularies of the "Linguistic Survey of India" Online', domain='lsi.clld.org', publisher_name="Max Planck Institute for the Science of Human History", publisher_place="Jena", publisher_url="http://www.shh.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License' }, ) for i, name in enumerate( ['Taraka Rama', 'Robert Forkel', 'Johann-Mattis List']): common.Editor(dataset=ds, ord=i, contributor=common.Contributor(id=slug( HumanName(name).last), name=name)) contrib = data.add( common.Contribution, None, id='cldf', name=args.cldf.properties.get('dc:title'), description=args.cldf.properties.get('dc:bibliographicCitation'), ) for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'): data.add( models.Variety, lang['id'], id=lang['id'], name=lang['name'], latitude=lang['latitude'], longitude=lang['longitude'], glottocode=lang['glottocode'], order=int(lang['Order']), number=lang['NumberInSource'], family_in_source=lang['FamilyInSource'], ) for rec in bibtex.Database.from_file(args.cldf.bibpath): data.add(common.Source, rec.id, _obj=bibtex2source(rec)) refs = collections.defaultdict(list) for param in iteritems(args.cldf, 'ParameterTable', 'id', 'concepticonReference', 'name'): data.add( models.Concept, param['id'], id=param['id'], name='{} [{}]'.format(param['name'], param['id']), description=param['Concepticon_Gloss'], concepticon_id=param['concepticonReference'], pages=param['PageNumber'], ) inventories = collections.defaultdict(set) for form in iteritems(args.cldf, 'FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'): inventories[form['languageReference']] = inventories[ form['languageReference']].union(form['Segments']) vsid = (form['languageReference'], form['parameterReference']) vs = data['ValueSet'].get(vsid) if not vs: vs = data.add( common.ValueSet, vsid, id='-'.join(vsid), language=data['Variety'][form['languageReference']], parameter=data['Concept'][form['parameterReference']], contribution=contrib, ) for ref in form.get('source', []): sid, pages = Sources.parse(ref) refs[(vsid, sid)].append(pages) data.add( models.Form, form['id'], id=form['id'], name=form['form'], description=''.join(form['Segments']).replace('+', ' '), segments=' '.join(form['Segments']), valueset=vs, ) for lid, inv in inventories.items(): inv = [clts.bipa[c] for c in inv] data['Variety'][lid].update_jsondata(inventory=[(str(c), c.name) for c in inv if hasattr(c, 'name')]) for (vsid, sid), pages in refs.items(): DBSession.add( common.ValueSetReference(valueset=data['ValueSet'][vsid], source=data['Source'][sid], description='; '.join(nfilter(pages)))) load_families( Data(), [(l.glottocode, l) for l in data['Variety'].values()], glottolog_repos=args.glottolog, isolates_icon='tcccccc', strict=False, )
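# Illustrative sketch (not part of the original loader) of the pyclts lookup
# used above to turn collected segments into (grapheme, sound name) pairs.
# It assumes a local checkout of cldf-clts/clts; the path and the example
# inventory are made up.
from pyclts import CLTS

clts = CLTS('../../cldf-clts/clts')
inventory = {'t', 'a', 'u'}
sounds = [clts.bipa[c] for c in inventory]
# Graphemes that cannot be resolved to a BIPA sound may come back without a
# `name` attribute, which is why the loader filters with hasattr(c, 'name').
pairs = [(str(s), s.name) for s in sounds if hasattr(s, 'name')]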
def main(args): # pragma: no cover data = Data() clts_repos = Path(__file__).parent.parent.parent.parent.resolve() / 'clts-data' clts_repos = CLTS(clts_repos) print(clts_repos.repos) version = 'v2.1.0' # assert_release(clts_repos.repos) for rec in Database.from_file(args.cldf.bibpath, lowercase=True): data.add(common.Source, rec.id, _obj=bibtex2source(rec)) dataset = common.Dataset( id='clts', name="CLTS {0}".format(version), publisher_name="Max Planck Institute for Evolutionary Anthropology", publisher_place="Leipzig", publisher_url="http://www.eva.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", contact='*****@*****.**', domain='clts.clld.org', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}) DBSession.add(dataset) for i, name in enumerate([ 'Johann-Mattis List', 'Cormac Anderson', 'Tiago Tresoldi', 'Robert Forkel', ]): c = common.Contributor(id=slug(name), name=name) dataset.editors.append(common.Editor(contributor=c, ord=i)) for line in args.cldf['data/features.tsv']: data.add( models.Feature, line['ID'], id=line['ID'], name='{} {}: {}'.format(line['TYPE'], line['FEATURE'], line['VALUE']), sound_type=line['TYPE'], feature=line['FEATURE'], value=line['VALUE'], ) DBSession.add(models.SoundSegment( id='NA', name='<NA>', description='<NA>', type='marker', generated=True, unicode='', color='#bbbbbb', )) for line in args.cldf['data/sounds.tsv']: s = data.add( models.SoundSegment, line['ID'], id=line['ID'], name=line['GRAPHEME'], description=line['NAME'], type=line['TYPE'], generated=line['GENERATED'], unicode=' / '.join(line['UNICODE']), color=clts_repos.soundclass('color').resolve_sound(line['GRAPHEME']), ) if s.color == '0': s.color = '#bbbbbb' assert s.color in LEGEND DBSession.flush() seen = set() for line in args.cldf['data/sounds.tsv']: for fid in line['FEATURES']: spk, fpk = data['SoundSegment'][line['ID']].pk, data['Feature'][fid].pk if (spk, fpk) not in seen: DBSession.add(models.SoundSegmentFeature(soundsegment_pk=spk, feature_pk=fpk)) seen.add((spk, fpk)) english = data.add( common.Language, 'eng', id='eng', name='English') for line in args.cldf['sources/index.tsv']: c = data.add( models.Transcription, line['NAME'], id=line['NAME'], name=line['NAME'], description=line['DESCRIPTION'].replace(':bib:', '/sources/'), datatype=getattr(models.Datatype, line['TYPE']) ) for ref in line.get('REFS', []): common.ContributionReference(source=data['Source'][ref], contribution=c) sound_url_template = args.cldf['data/graphemes.tsv', 'SOUND'].valueUrl image_url_template = args.cldf['data/graphemes.tsv', 'IMAGE'].valueUrl for line in args.cldf['data/graphemes.tsv']: key = line['DATASET'] + ':' + line['NAME'] + ':' + line['GRAPHEME'] if key not in data['Grapheme']: sound_id = line['NAME'].replace(' ', '_') vs = data['ValueSet'].get((line['DATASET'], line['NAME'])) if not vs: try: vs = data.add( common.ValueSet, (line['DATASET'], line['NAME']), id=key, description=line['NAME'], language=english, contribution=data['Transcription'][line['DATASET']], parameter=data['SoundSegment'][sound_id] ) except: print(line) raise data.add( models.Grapheme, key, id=key, name=line['GRAPHEME'], description=line['NAME'], url=line['URL'].unsplit() if line['URL'] else None, audio=sound_url_template.expand(line) if line['SOUND'] else None, image=image_url_template.expand(line) if line['IMAGE'] else None, valueset=vs )
def main(args): # pragma: no cover data = Data() clts = CLTS(input('Path to cldf-clts/clts:') or '../../cldf-clts/clts') ds = data.add( common.Dataset, tppsr.__name__, id=tppsr.__name__, name='Tableaux phonétiques des patois suisses romands Online', domain='tppsr.clld.org', contact="*****@*****.**", publisher_name="Max Planck Institute for Evolutionary Anthropology", publisher_place="Leipzig", publisher_url="https://www.eva.mpg.de", license="https://creativecommons.org/licenses/by/4.0/", jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License'}, ) for i, name in enumerate(['Hans Geisler', 'Robert Forkel', 'Johann-Mattis List']): common.Editor( dataset=ds, ord=i, contributor=common.Contributor(id=slug(HumanName(name).last), name=name) ) contrib = data.add( common.Contribution, None, id='cldf', name=args.cldf.properties.get('dc:title'), description=args.cldf.properties.get('dc:bibliographicCitation'), ) for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'name', 'latitude', 'longitude'): data.add( models.Variety, lang['id'], id=lang['Number'], name=lang['name'], description=lang['FullName'], latitude=lang['latitude'], longitude=lang['longitude'], canton=lang['Canton'], group=lang['DialectGroup'], recorded=lang['DateOfRecording'], population=int(lang['Population']) if lang['Population'] else None, speaker_age=int(lang['SpeakerAge']) if lang['SpeakerAge'] else None, speaker_proficiency=lang['SpeakerProficiency'], speaker_language_use=lang['SpeakerLanguageUse'], speaker_gender=lang['SpeakerGender'], investigators=lang['Investigators'], ) colors = qualitative_colors(len(set(l.canton for l in data['Variety'].values())), set='tol') for i, (_, langs) in enumerate(itertools.groupby( sorted(data['Variety'].values(), key=lambda l: l.canton), lambda l: l.canton, )): for lang in langs: lang.update_jsondata(color=colors[i]) for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True): data.add(common.Source, rec.id, _obj=bibtex2source(rec)) refs = collections.defaultdict(list) for param in iteritems(args.cldf, 'ParameterTable', 'id', 'concepticonReference', 'name'): data.add( models.Concept, param['id'], id=param['Number'], number=int(param['Number']), name='{} [{}]'.format(param['name'], param['Number']), latin_gloss=param['Latin_Gloss'], french_gloss=param['French_Gloss'], concepticon_id=param['concepticonReference'], concepticon_gloss=param['Concepticon_Gloss'], concepticon_concept_id=param['id'].split('_')[0], ) inventories = collections.defaultdict(set) scan_url_template = args.cldf['FormTable', 'Scan'].valueUrl for form in iteritems(args.cldf, 'FormTable', 'id', 'value', 'form', 'languageReference', 'parameterReference', 'source'): if not form['form']: continue inventories[form['languageReference']] = inventories[form['languageReference']].union(form['Segments']) vsid = (form['languageReference'], form['parameterReference']) vs = data['ValueSet'].get(vsid) if not vs: vs = data.add( common.ValueSet, vsid, id='-'.join(vsid), language=data['Variety'][form['languageReference']], parameter=data['Concept'][form['parameterReference']], contribution=contrib, ) for ref in form.get('source', []): sid, pages = Sources.parse(ref) refs[(vsid, sid)].append(pages) f = data.add( models.Form, form['id'], # Gauchat-1925-480-1_ id=form['id'], name=form['form'].replace('+', ' '), description=form['value'], segments=' '.join(form['Segments']), valueset=vs, scan=scan_url_template.expand(**form), prosodic_structure=form['ProsodicStructure'], ) for 
example in args.cldf['ExampleTable']: sentence = models.Phrase( id=example['ID'], language=data['Variety'][example['Language_ID']], name=example['Primary_Text'], description=example['Translated_Text'], original_script=example['Alt_Transcription'], ) for cid in example['Concept_ID']: DBSession.add(models.ConceptSentence(concept=data['Concept'][cid], sentence=sentence)) for fid in example['Form_ID']: DBSession.add(common.ValueSentence(value=data['Form'][fid], sentence=sentence)) for lid, inv in inventories.items(): inv = [clts.bipa[c] for c in inv] data['Variety'][lid].update_jsondata( inventory=[(str(c), c.name) for c in inv if hasattr(c, 'name')]) for (vsid, sid), pages in refs.items(): DBSession.add(common.ValueSetReference( valueset=data['ValueSet'][vsid], source=data['Source'][sid], description='; '.join(nfilter(pages)) ))
def main(args): # pragma: no cover license = licenses.find(args.cldf.properties['dc:license']) assert license and license.id.startswith('CC-') clts = CLTS( input('Path to cldf-clts/clts:') or '../../cldf-clts/clts-data') data = Data() ds = data.add( common.Dataset, vanuatuvoices.__name__, id=vanuatuvoices.__name__, name='Vanuatu Voices', domain='vanuatuvoices.clld.org', contact="*****@*****.**", publisher_name="Max Planck Institute for Evolutionary Anthropology", publisher_place="Leipzig", publisher_url="https://www.eva.mpg.de", license=license.url, jsondata={ 'license_icon': '{}.png'.format('-'.join( [p.lower() for p in license.id.split('-')[:-1]])), 'license_name': license.name }, ) form2audio = audioutil.form2audio(args.cldf, 'audio/mpeg') r = get_dataset('vanuatuvoices', ep='lexibank.dataset') authors, _ = r.get_creators_and_contributors() for ord, author in enumerate(authors): cid = slug(HumanName(author['name']).last) img = pathlib.Path( vanuatuvoices.__file__).parent / 'static' / '{}.jpg'.format(cid) c = data.add( common.Contributor, cid, id=cid, name=author['name'], description=author.get('description'), jsondata=dict(img=img.name if img.exists() else None), ) data.add( common.Contributor, 'forkel', id='forkel', name='Robert Forkel', description='Data curation and website implementation', jsondata=dict(img=None), ) for ord, cid in enumerate(['walworth', 'forkel', 'gray']): DBSession.add( common.Editor(ord=ord, dataset=ds, contributor=data['Contributor'][cid])) contribs = collections.defaultdict(lambda: collections.defaultdict(list)) for c in args.cldf.iter_rows('contributions.csv'): for role in ['phonetic_transcriptions', 'recording', 'sound_editing']: for name in c[role].split(' and '): if name: cid = slug(HumanName(name).last) contribs[c['Language_ID']][cid].append(role) for lang in args.cldf.iter_rows('LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'): contrib = data.add( common.Contribution, lang['id'], id=lang['id'], name='Wordlist for {}'.format(lang['name']), ) if lang['id'] in contribs: for cid, roles in contribs[lang['id']].items(): DBSession.add( common.ContributionContributor( contribution=contrib, contributor=data['Contributor'][cid], jsondata=dict(roles=roles), )) data.add( models.Variety, lang['id'], id=lang['id'], name=lang['name'], latitude=lang['latitude'], longitude=lang['longitude'], glottocode=lang['glottocode'], description=lang['LongName'], contribution=contrib, island=lang['Island'], ) for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True): data.add(common.Source, rec.id, _obj=bibtex2source(rec)) refs = collections.defaultdict(list) for param in args.cldf.iter_rows('ParameterTable', 'id', 'concepticonReference', 'name'): data.add( models.Concept, param['id'], id=param['id'], name='{} [{}]'.format(param['name'], param['id'].split('_')[0]), description=param['Bislama_Gloss'], concepticon_id=param['concepticonReference'], concepticon_gloss=param['Concepticon_Gloss'], ) inventories = collections.defaultdict(collections.Counter) for form in args.cldf.iter_rows('FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'): inventories[form['languageReference']].update(form['Segments']) vsid = (form['languageReference'], form['parameterReference']) vs = data['ValueSet'].get(vsid) if not vs: vs = data.add( common.ValueSet, vsid, id='-'.join(vsid), language=data['Variety'][form['languageReference']], parameter=data['Concept'][form['parameterReference']], contribution=data['Contribution'][form['languageReference']], 
) for ref in form.get('source', []): sid, pages = Sources.parse(ref) refs[(vsid, sid)].append(pages) data.add(Counterpart, form['id'], id=form['id'], name=form['form'], valueset=vs, audio=form2audio.get(form['id'])) for (vsid, sid), pages in refs.items(): DBSession.add( common.ValueSetReference(valueset=data['ValueSet'][vsid], source=data['Source'][sid], description='; '.join(nfilter(pages)))) for lid, inv in inventories.items(): inv = [clts.bipa[c] for c in inv] data['Variety'][lid].update_jsondata( inventory=[(str(c), c.name) for c in inv if getattr(c, 'name', None)])
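# Illustrative sketch of the pycldf source-reference parsing used above.
# Sources.parse splits a CLDF source string of the form "key[pages]" into the
# BibTeX key and the page specification; the key below is made up.
from pycldf import Sources

sid, pages = Sources.parse('meier2004[12-15]')
# sid == 'meier2004', pages == '12-15'; the loaders accumulate pages per
# (valueset, source) pair and later join them into one ValueSetReference.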
def main(args): # pragma: no cover data = Data() print("Setting up dataset…") dataset = common.Dataset( id=cariban.__name__, domain="cariban.clld.org", name="Comparative Cariban Database", description="Comparative Cariban Database", publisher_name="Max Planck Institute for Evolutionary Anthropology", publisher_url="https://www.eva.mpg.de", publisher_place="Leipzig", license="https://creativecommons.org/licenses/by/4.0/", contact="*****@*****.**", jsondata={'function_paradigms': []}, ) fps = [] for morph_func in args.cldf["ValueTable"]: for function in morph_func["Function"]: for cons in morph_func["Construction"]: fps.append({ 'Function': function, 'Construction': cons, 'Morpheme': morph_func['Morpheme']}) dataset.update_jsondata(function_paradigms=fps) DBSession.add(dataset) DBSession.flush() print("Adding contributors…") c = common.Contributor(id="fm",name="Florian Matter", email="*****@*****.**", url="https://florianmatter.gitlab.io/") dataset.editors.append(common.Editor(contributor=c, ord=1, primary=True)) print("Adding languages…") dialect_mapping = {} lang_shorthands = {} glottocodes = {} lang_ids = {} for lang in args.cldf["LanguageTable"]: if lang["Sampled"] == "y": language = data.add( common.Language, lang["ID"], id=lang["ID"], name=lang["Name"], latitude=float(lang["Latitude"]) if lang["Latitude"] is not None else None, longitude=float(lang["Longitude"]) if lang["Longitude"] is not None else None, jsondata={'Shorthand': lang['Shorthand'], 'Glottocode': lang['Glottocode']}, ) add_language_codes(data, language, isocode=lang["ISO"], glottocode = lang["Glottocode"]) if lang["Dialect_Of"] not in [None, "y"]: dialect_mapping[lang["ID"]] = lang["Dialect_Of"] lang_shorthands[lang["Shorthand"]] = {"ID": lang["ID"], "Name": lang["Name"]} glottocodes[lang["Glottocode"]] = {"ID": lang["ID"], "Name": lang["Name"], "Shorthand": lang["Shorthand"]} lang_ids[lang["ID"]] = {"Glottocode": lang["Glottocode"], "Name": lang["Name"], "Shorthand": lang["Shorthand"]} def get_lang_id(key): if key in lang_shorthands: lang_id = lang_shorthands[key]["ID"] elif key in glottocodes: lang_id = glottocodes[key]["ID"] elif key in lang_ids: lang_id = key else: print("Could not identify language %s" % key) return None if lang_id in dialect_mapping: lang_id = dialect_mapping[lang_id] return lang_id def get_key_and_page(source_string): if len(source_string.split("[")) > 1: bib_key = source_string.split("[")[0] pages = source_string.split("[")[1].split("]")[0] else: bib_key = source_string pages = "" return bib_key, pages print("Adding sources…") for rec in bibtex.Database.from_file(args.cldf.bibpath): data.add(common.Source, rec.id, _obj=bibtex2source(rec)) print("Adding language sources…") DBSession.flush() for rec in bibtex.Database.from_file(args.cldf.bibpath): if "keywords" in rec: for keyword in rec["keywords"].split(","): if keyword in lang_shorthands: lang_id = get_lang_id(keyword.strip(" ")) if lang_id in data["Language"]: data.add(common.LanguageSource, rec.id+lang_id, language_pk=data["Language"][lang_id].pk, source_pk=data["Source"][rec.id].pk ) data.add( common.Source, "pc", id="pc", name="Personal communication", description="Placeholder for data obtained from personal communication.", bibtex_type=bibtex.EntryType.misc ) # print("Adding glossing abbreviations…") # length = len(pynterlinear.get_all_abbrevs().keys()) # for i, (key, name) in enumerate(pynterlinear.get_all_abbrevs().items()): # print("%s/%s" % (i+1, length), end="\r") # DBSession.add(common.GlossAbbreviation(id=key, name=name)) # print("") 
# print("Adding examples…") gloss_replacements = { "S_A_": "Sa", "S_P_": "Sp" } def clldify_glosses(gloss_line): for orig, new in gloss_replacements.items(): gloss_line = gloss_line.replace(orig,new) gloss_line = re.sub(r"(\d)([A-Z])", r"\1.\2", gloss_line) return gloss_line for ex in args.cldf["ExampleTable"]: lang_id = get_lang_id(ex["Language_ID"]) new_ex = data.add(common.Sentence, ex["ID"], id=ex["ID"], name=ex["Name"], description=ex["Translated_Text"], analyzed="\t".join(ex["Analyzed_Word"]), gloss=clldify_glosses("\t".join(ex["Gloss"])), language=data["Language"][lang_id], comment=ex["Comment"], markup_gloss="\t".join(ex["Morpheme_IDs"]) ) if ex["Source"]: bib_key, pages = get_key_and_page(ex["Source"]) if bib_key in data["Source"]: source = data["Source"][bib_key] DBSession.add(common.SentenceReference( sentence=new_ex, source=source, key=source.id, description=pages.replace("--","–")) ) def add_morpheme_reference(morpheme, source_string): bib_key, pages = get_key_and_page(source_string) if bib_key in data["Source"]: source = data["Source"][bib_key] DBSession.add(models.MorphemeReference( morpheme=morpheme, source=source, key=source.id, description=pages.replace("--","–") ) ) print("Adding morphemes…") for morph in args.cldf["FormTable"]: lang_id = get_lang_id(morph["Language_ID"]) form = util.merge_allomorphs("; ".join(morph["Form"])).split("; ") new_morph = data.add(models.Morpheme, morph["ID"], morpheme_type="grammatical", language=data["Language"][lang_id], name="/".join(form), id=morph["ID"], ) if morph["Source"]: add_morpheme_reference(new_morph, morph["Source"][0]) print("Adding constructions…") data.add(models.DeclarativeType, "imp", id="imp", name="imperative") data.add(models.DeclarativeType, "decl", id="decl", name="declarative") data.add(models.MainClauseVerb, "y", id="y", name="main clause construction") data.add(models.MainClauseVerb, "n", id="n", name="subordinate clause construction") for cons in args.cldf["ParameterTable"]: lang_id = get_lang_id(cons["Language_ID"]) new_construction = data.add( models.Construction, cons["ID"], id=cons["ID"], language=data["Language"][lang_id], name=cons["Description"], mainclauseverb=data["MainClauseVerb"][cons["MainClauseVerb"]], ) if cons["DeclarativeType"]: new_construction.declarativetype = data["DeclarativeType"][cons["DeclarativeType"]] def add_morph_func(morpheme, func_key, construction): data.add(models.MorphemeFunction, "%s:%s" % (morpheme, function), id="%s:%s" % (morpheme, func_key), name="MorphemeFunction %s:%s"% (morpheme, func_key), unit=data["Morpheme"][morpheme], unitparameter=data["Meaning"][function], construction=construction ) print("Adding morpheme functions…") for morph_func in args.cldf["ValueTable"]: for function in morph_func["Function"]: func_key = function.replace(".","_") if ">" in function or function == "LK" or bool(re.search(r"\d[SP]$", function) or function == "3"): meaning_type="inflectional" else: meaning_type="derivational" if function not in data["Meaning"]: data.add(models.Meaning, function, id=func_key, name=function, meaning_type=meaning_type ) #Only some morpheme functions are specified as occurring in specific constructions if len(morph_func["Construction"]) == 0: for morpheme in morph_func["Morpheme"]: add_morph_func(morpheme, func_key, None) else: for construction in morph_func["Construction"]: if len(morph_func["Morpheme"]) == 1 and morph_func["Morpheme"][0] != "?": for morpheme in morph_func["Morpheme"]: if data["Morpheme"][morpheme].language != 
data["Construction"][construction].language: print("Warning: the %s Morpheme %s is stated to occur in the %s construction %s!" % ( data["Morpheme"][morpheme].language, data["Morpheme"][morpheme], data["Construction"][construction].language, data["Construction"][construction] ) ) cons_func_key = func_key + ":" + construction add_morph_func(morpheme, cons_func_key, data["Construction"][construction]) print("Checking examples for illustrated morphemes…") proto_languages = ["pc"] is_illustrated = {} for key, row in data["MorphemeFunction"].items(): if row.unit.language.id in proto_languages: continue is_illustrated["%s:%s" % (row.unit.id, row.unitparameter.id)] = False for row in args.cldf["ExampleTable"]: for word in row["Morpheme_IDs"]: morph_ids = util.split_word(word) for unit_value in morph_ids: if unit_value in ["X","-","=", "~"]: continue unitvaluesentence_key = "{0}-{1}".format(unit_value.replace(".","-"),row["ID"]) if unitvaluesentence_key in data["UnitValueSentence"].keys(): continue is_illustrated[unit_value] = True morph_id = unit_value.split(":")[0] if morph_id not in data["Morpheme"].keys(): print("Warning: Example %s illustrates unknown morpheme %s" % (row["ID"], morph_id)) elif data["Morpheme"][morph_id].language != data["Sentence"][row["ID"]].language: print("Warning: The %s example %s claims to contain the %s morpheme %s." % ( data["Sentence"][row["ID"]].language, row["ID"], data["Morpheme"][morph_id].language, data["Morpheme"][morph_id] ) ) if ":" not in unit_value: print("%s in %s contains no defined function!" % (unit_value, row["ID"])) function = unit_value.split(":")[1] morph_function_id = "%s:%s" % (morph_id, function) if morph_function_id not in data["MorphemeFunction"].keys(): print("Warning: Example %s tries to illustrate inexistent morpheme function %s!" % (row["ID"], unit_value.replace(".","-"))) continue data.add(models.UnitValueSentence, unitvaluesentence_key, sentence=data["Sentence"][row["ID"]], unitvalue=data["MorphemeFunction"][morph_function_id], ) # see how many morpheme functions are illustrated with example sentences good_ill = [key for key, value in is_illustrated.items() if value] not_ill = [key for key, value in is_illustrated.items() if not value] not_ill.sort() cov = len(good_ill)/len(is_illustrated)*100 print("Morpheme exemplification coverage is at %s%%. 
List of unillustrated morphemes saved to unillustrated_morphemes.txt" % str(round(cov, 2))) f = open("../unillustrated_morphemes.txt", "w") for morph in not_ill: f.write(morph+"\n") f.close() print("Adding cognate sets…") for cogset in args.cldf["CognatesetTable"]: new_cset = data.add(models.Cognateset, cogset["ID"], id=cogset["ID"], name=cogset["Name"], description=cogset["Function"], cogset_type="grammatical" ) if cogset["Source"]: for source in cogset["Source"]: bib_key, pages = get_key_and_page(source) if bib_key in data["Source"]: source = data["Source"][bib_key] DBSession.add(models.CognatesetReference( cognateset=new_cset, source=source, key=source.id, description=pages) ) print("Adding cognates…") for morph in args.cldf["FormTable"]: for cognate_ID in morph["Cognateset_ID"]: DBSession.add(models.Cognate( cognateset=data["Cognateset"][cognate_ID], counterpart=data["Morpheme"][morph["ID"]] ) ) print("Adding morpheme comments…") for row in args.cldf["FormTable"]: data["Morpheme"][row["ID"]].markup_description=util.generate_markup(row["Comment"]) print("Adding construction descriptions…") for cons in args.cldf["ParameterTable"]: if cons["Comment"] is None: description = "" else: description = util.generate_markup(cons["Comment"]) description += "\n" + util.generate_markup(util.transitive_construction_paradigm(cons["ID"])) description += util.generate_markup(util.intransitive_construction_paradigm(cons["ID"])) data["Construction"][cons["ID"]].markup_description = description print("Adding cognate set descriptions…") for cogset in args.cldf["CognatesetTable"]: data["Cognateset"][cogset["ID"]].markup_description = util.generate_markup(cogset["Description"]) # if cogset["ID"] == "13pro": # data["Cognateset"][cogset["ID"]].markup_description += util.generate_markup( # util.comparative_function_paradigm( # ["apa_main", "tri_main", "way_main", "mak_main", "kar_main", "hix_main", "wai_main", "ara_main", "ikp_main", "wmr_main", "pan_old", "kax_main"], # "1+3 scenarios", # ["1+3S", "1+3>3", "3>1+3", "2>1+3", "1+3>2"])) def add_tree_labels(phylo): uncertain_nodes = [] for node in phylo.find_clades(): if node.name == None or not node.is_terminal(): continue plain_name = node.name.replace("?","") if "?" in node.name: uncertain_nodes.append(plain_name) if plain_name in lang_ids: node.name = lang_ids[plain_name]["Name"].replace("'", "’") if plain_name in uncertain_nodes: node.name += "?" return phylo, uncertain_nodes print("Adding trees…") own_trees = ["matter"] tree_path = str(args.cldf.tablegroup._fname.parent / '..' / 'raw') newick_files = {} for tree in args.cldf["cariban_trees.csv"]: if tree["ID"] in own_trees: continue newick_files[tree["ID"]] = { "orig": tree["ID"]+"_orig.newick", "norm": tree["ID"]+"_norm.newick", "source": tree["Source"], "comment": tree["Comment"], "o_comment": tree["Orig_Comment"] } #adding my own trees separately. for my_tree_count, tree_id in enumerate(own_trees): my_tree = Phylo.read(tree_path+"/"+"%s.newick" % tree_id, "newick") my_tree, uncertain_nodes = add_tree_labels(my_tree) edited_tree = io.StringIO() Phylo.write(my_tree, edited_tree, "newick") tree = edited_tree.getvalue().replace(":0.00000","") my_phylo = Phylogeny( tree_id, id=tree_id, name="Matter (2020)",# % str(my_tree_count+1), newick=tree, markup_description="My own, conservative, classification." ) for l in DBSession.query(common.Language): lname = l.name.replace("'", "’") if l.id in uncertain_nodes: lname += "?" 
new_label = LanguageTreeLabel( language=l, treelabel=TreeLabel( id="%s_%s" % (tree_id, l.id), name=lname, phylogeny=my_phylo ) ) DBSession.add(my_phylo) #adding the other trees for tree_id, values in newick_files.items(): norm_biotree = Phylo.read(tree_path+"/"+values["norm"], "newick") orig_biotree = Phylo.read(tree_path+"/"+values["orig"], "newick") norm_biotree, uncertain_nodes = add_tree_labels(norm_biotree) edited_tree = io.StringIO() Phylo.write(norm_biotree, edited_tree, "newick") norm_tree = edited_tree.getvalue().replace(":0.00000","") edited_tree = io.StringIO() Phylo.write(orig_biotree, edited_tree, "newick") orig_tree = edited_tree.getvalue().replace(":0.00000","") norm_phylo = Phylogeny( id=tree_id+"_norm", name=str(data["Source"][values["source"]]) + " (Normalized)", markup_description=util.generate_markup("Source: src:"+values["source"])+ "<br>This is a normalized version of <a href='/phylogeny/%s_orig'>this original tree</a>." % tree_id + util.generate_markup( "<br>Comments: %s" % values["comment"] ), newick=norm_tree ) if values["o_comment"] == None: o_comment = "" else: o_comment = values["o_comment"] orig_phylo = Phylogeny( id=tree_id+"_orig", name=str(data["Source"][values["source"]]) + " (Original)", markup_description=util.generate_markup("Source: src:"+values["source"])+ "<br>This is a representation of the original classification. A normalized version can be found <a href='/phylogeny/%s_norm'>here</a>." % tree_id + util.generate_markup( "<br>Comments: %s" % values["comment"] + " " + o_comment ), newick=orig_tree ) for l in DBSession.query(common.Language): lname = l.name.replace("'", "’") if l.id in uncertain_nodes: lname += "?" new_label = LanguageTreeLabel( language=l, treelabel=TreeLabel( id="%s_%s" % (tree_id, l.id), name=lname, phylogeny=norm_phylo ) ) DBSession.add(norm_phylo) DBSession.add(orig_phylo) print("Adding t-adding verb cognate sets…") for t_verb_set in args.cldf["cariban_t_cognates.csv"]: cognate_ID = "t"+t_verb_set["ID"] rec_t_form = "*[%s]%s" % (t_prefix_form(t_verb_set["Form"]), t_verb_set["Form"]) t_cogset = data.add(models.Cognateset, cognate_ID, id=cognate_ID, name=rec_t_form, description="‘%s’ (*t-adding verb)" % t_verb_set["Parameter_ID"], cogset_type="t_adding" ) if t_verb_set["Source"]: bib_key = t_verb_set["Source"].split("[")[0] if len(t_verb_set["Source"].split("[")) > 1: pages = t_verb_set["Source"].split("[")[1].split("]")[0] else: pages = " " if bib_key in data["Source"]: source = data["Source"][bib_key] DBSession.add(models.CognatesetReference( cognateset=t_cogset, source=source, key=source.id, description=pages) ) print("Adding t-adding verbs…") t_langs = {} t_verbs = {} non_t_adding_lgs = ["ing","mac","kar","wmr","pan"] data.add(models.Meaning, "t_verb", id="t-verb", name="t-adding verb", ) for t_verb_entry in args.cldf["cariban_t_verbs.csv"]: if t_verb_entry["Language_ID"] == "cari1283": continue cognate_ID = "t"+t_verb_entry["Cognateset_ID"] lang_id = get_lang_id(t_verb_entry["Language_ID"]) morph_id = lang_id+"_"+cognate_ID if morph_id in data["Morpheme"].keys(): if morph_id + "_2" in data["Morpheme"].keys(): morph_id += "_3" else: morph_id += "_2" t_verb = data.add(models.Morpheme, morph_id, id=morph_id, morpheme_type="t_adding", name=t_verb_entry["Form"], language=data["Language"][lang_id], ) DBSession.add(models.Cognate( cognateset=data["Cognateset"][cognate_ID], counterpart=t_verb ) ) if t_verb_entry["t"] == "y": t_verb.name = "[%s]%s" % (t_prefix_form(t_verb.name), t_verb.name) t_verb.markup_description = 
util.generate_markup("Shows cogset:t") if t_verb_entry["t"] == "?" and lang_id not in non_t_adding_lgs: t_verb.name = "[t-?]"+t_verb.name t_verb.markup_description = util.generate_markup("It is not known if this verb shows cogset:t") if t_verb_entry["t"] == "n": t_verb.markup_description = util.generate_markup("Does not show cogset:t") if lang_id not in t_langs.keys(): t_langs[lang_id] = {"y": 0, "n": 0, "?": 0} if cognate_ID not in t_verbs.keys(): t_verbs[cognate_ID] = {"y": 0, "n": 0, "?": 0} t_langs[lang_id][t_verb_entry["t"]] += 1 if lang_id not in non_t_adding_lgs: t_verbs[cognate_ID][t_verb_entry["t"]] += 1 if t_verb_entry["Source"]: add_morpheme_reference(t_verb, t_verb_entry["Source"]) data.add(models.MorphemeFunction, "t_"+t_verb_entry["ID"], id="t_"+t_verb_entry["ID"], name="t-Verb %s" % t_verb_entry["Parameter_ID"], unit=t_verb, unitparameter=data["Meaning"]["t_verb"], construction=None ) for lang, values in t_langs.items(): data["Language"][lang].update_jsondata(t_values=values) for verb, values in t_verbs.items(): # data["Cognateset"][verb].description += " (%s/%s)" % (str(values["y"]), str(values["n"]+values["y"]+values["?"])) data["Cognateset"][verb].markup_description = util.generate_markup("This verb occurs with obj:t- in %s of %s languages which show reflexes of cogset:t." % (str(values["y"]), str(values["n"]+values["y"]+values["?"]))) print("Adding reconstructed lexemes…") proto_forms = {} for cogset in args.cldf["cariban_lexical_reconstructions.csv"]: proto_forms[cogset["ID"]] = cogset["Form"] first_found = [] for entry in args.cldf["cariban_swadesh_list.csv"]: cognateset_ID = entry["Parameter_ID"].replace("/","_")+"-"+entry["Cognateset_ID"] if cognateset_ID not in data["Cognateset"]: if cognateset_ID in proto_forms: form = "*" + proto_forms[cognateset_ID].replace("; ", " / ") # else: # form = "" data.add(models.Cognateset, cognateset_ID, id=cognateset_ID, name=form, description=cognateset_ID, cogset_type="lexical" ) lang_id = get_lang_id(entry["Language_ID"]) if lang_id not in data["Language"]: continue function = entry["Parameter_ID"].replace(".","_") morph_id = entry["Language_ID"] + "_" + function if morph_id in first_found: continue first_found.append(morph_id) if function not in data["Meaning"].keys(): data.add(models.Meaning, function, id=function, name=function, meaning_type="lexical" ) morpheme = data.add(models.Morpheme, morph_id, id=morph_id, morpheme_type="lexical", name=entry["Value"][0], language=data["Language"][lang_id], ) data.add(models.MorphemeFunction, "%s:%s" % (morph_id, function), id="%s:%s" % (morph_id, function), name="MorphemeFunction %s:%s"% (morph_id, function), unit=data["Morpheme"][morph_id], unitparameter=data["Meaning"][function], construction=None ) if entry["Source"]: add_morpheme_reference(morpheme, entry["Source"]) if cognateset_ID in proto_forms: DBSession.add(models.Cognate( cognateset=data["Cognateset"][cognateset_ID], counterpart=morpheme ) )
def main(args):
    assert args.glottolog, 'The --glottolog option is required!'
    data = Data()
    data.add(
        common.Dataset,
        polyglottaafricana.__name__,
        id=polyglottaafricana.__name__,
        domain='',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'},
    )
    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )
    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
        )
    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for param in iteritems(args.cldf, 'ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id']),
        )
    for form in iteritems(args.cldf, 'FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'):
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            common.Value,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
        )
    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(
                valueset=data['ValueSet'][vsid],
                source=data['Source'][sid],
                description='; '.join(nfilter(pages))))
    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )
def main(args): license = licenses.find(args.cldf.properties['dc:license']) assert license and license.id.startswith('CC-') assert args.glottolog, 'The --glottolog option is required!' data = Data() ds = data.add( common.Dataset, papuanvoices.__name__, id=papuanvoices.__name__, domain='papuanvoices.clld.org', name="Papuan Voices", publisher_name="Max Planck Institute for Evolutionary Anthropology", publisher_place="Leipzig", publisher_url="http://www.eva.mpg.de", license=license.url, jsondata={ 'license_icon': '{}.png'.format('-'.join( [p.lower() for p in license.id.split('-')[:-1]])), 'license_name': license.name }, ) contrib = data.add( common.Contribution, None, id='cldf', name=args.cldf.properties.get('dc:title'), description=args.cldf.properties.get('dc:bibliographicCitation'), ) data.add(common.Contributor, 'gray', id='gray', name='Russell Gray') for i, ed in enumerate(['gray']): data.add(common.Editor, ed, dataset=ds, contributor=data['Contributor'][ed], ord=i) for lang in args.cldf.iter_rows('LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'): data.add( models.Variety, lang['id'], id=lang['id'], name=lang['name'], description=lang['LongName'], latitude=lang['latitude'], longitude=lang['longitude'], glottocode=lang['glottocode'], ) for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True): data.add(common.Source, rec.id, _obj=bibtex2source(rec)) refs = collections.defaultdict(list) for param in args.cldf.iter_rows('ParameterTable', 'id', 'concepticonReference', 'name'): data.add( models.Concept, param['id'], id=param['id'], name='{} [{}]'.format(param['name'], param['id']), concepticon_id=param['concepticonReference'], concepticon_gloss=param['Concepticon_Gloss'], ) f2a = form2audio(args.cldf) for form in args.cldf.iter_rows('FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'): vsid = (form['languageReference'], form['parameterReference']) vs = data['ValueSet'].get(vsid) if not vs: vs = data.add( common.ValueSet, vsid, id='-'.join(vsid), language=data['Variety'][form['languageReference']], parameter=data['Concept'][form['parameterReference']], contribution=contrib, ) for ref in form.get('source', []): sid, pages = Sources.parse(ref) refs[(vsid, sid)].append(pages) data.add( Counterpart, form['id'], id=form['id'], name=form['form'], valueset=vs, audio=f2a.get(form['id']), ) for (vsid, sid), pages in refs.items(): DBSession.add( common.ValueSetReference(valueset=data['ValueSet'][vsid], source=data['Source'][sid], description='; '.join(nfilter(pages)))) load_families( Data(), [(l.glottocode, l) for l in data['Variety'].values()], glottolog_repos=args.glottolog, isolates_icon='tcccccc', strict=False, )
def add_sources(sources_file_path, session):
    """
    Generator variant of add_sources: adds a common.Source instance to the
    given SQLAlchemy session for each record in the BibTeX file and yields the
    record's BibTeX ID. The session is flushed once all records have been
    consumed.
    """
    bibtex_db = bibtex.Database.from_file(sources_file_path, encoding='utf-8')
    for record in bibtex_db:
        session.add(bibtex2source(record))
        yield record.id
    session.flush()
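# Usage sketch for the generator variant (illustrative; the file name is
# hypothetical). Because this is a generator, nothing is added to the session
# until it is iterated, and the final session.flush() only runs once the
# iteration is exhausted.
from clld.db.meta import DBSession

added_ids = list(add_sources('sources.bib', DBSession))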
def main(args): data = Data() icons = cycle(ORDERED_ICONS) dataset = common.Dataset( id=gelato.__name__, name="GeLaTo", description="Genes and Languages together", publisher_name="Max Planck Institute for Evolutionary Anthropology", publisher_place="Leipzig", publisher_url="https://www.eva.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", domain='gelato.clld.org', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License' }) for i, (id_, name) in enumerate([('barbierichiara', 'Chiara Barbieri'), ('blasidamian', 'Damián Blasi'), ('forkelrobert', 'Robert Forkel')]): ed = data.add(common.Contributor, id_, id=id_, name=name) common.Editor(dataset=dataset, contributor=ed, ord=i + 1) families = {} for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True): data.add(common.Source, rec.id, _obj=bibtex2source(rec)) for r in args.cldf.iter_rows('ContributionTable', 'id', 'name', 'description'): ds = data.add(models.Panel, r['id'], id=r['id'], name=r['name'], description=r['description']) for row in args.cldf.iter_rows('LanguageTable', 'id', 'name', 'contributionReference'): icon = families.get(row['LanguageFamily_Glottocode']) if not icon: families[row['LanguageFamily_Glottocode']] = icon = next(icons) lang = data['Languoid'].get(row['Glottocode']) if not lang: lang = data.add( models.Languoid, row['Glottocode'], id=row['Glottocode'], name=row['Language_Name'], family_id=row['LanguageFamily_Glottocode'], family_name=row['LanguageFamily'], jsondata=dict(icon=icon.name), ) s = data.add( models.Sample, row['id'], id=row['id'], name=row['Name'], panel=data['Panel'][row['contributionReference']], languoid=lang, latitude=row['Latitude'], longitude=row['Longitude'], samplesize=int(row['samplesize']), #source=row.get('dataSet.of.origin'), region=row['geographicRegion'], #location=row['location'], jsondata=dict(color=REGIONS[row['geographicRegion']]), ) DBSession.flush() for bibkey in row['Source']: DBSession.add( common.LanguageSource(language_pk=s.pk, source_pk=data['Source'][bibkey].pk)) types = {} for row in args.cldf.iter_rows('ParameterTable', 'id', 'name', 'description', 'contributionReference'): types[row['id']] = Datatype.fromvalue(row['datatype']) data.add(models.Measure, row['id'], id=row['id'], name=row['name'], description=row['description'], panel=data['Panel'][row['contributionReference']]) for row in args.cldf.iter_rows('ValueTable', 'id', 'parameterReference', 'languageReference'): v = types[row['parameterReference']].read(row['Value']) if isinstance(v, float): vs = data.add( common.ValueSet, row['id'], id=row['id'], language=data['Sample'][row['languageReference']], parameter=data['Measure'][row['parameterReference']], #contribution=ds, #jsondata=dict(color=REGIONS[sample.region]), ) data.add(models.Measurement, row['id'], id=row['id'], valueset=vs, name=row['Value'], value=v)
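# Illustrative sketch of the csvw datatype handling used above to parse
# measurement values; the datatype name and raw value are made up, and the
# import path assumes the csvw package.
from csvw import Datatype

dt = Datatype.fromvalue('float')
v = dt.read('0.25')
# The loader keeps one Datatype per parameter and only stores measurements
# whose parsed value is a float.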
def main(args): # pragma: no cover get_repos() api = Grambank(REPOS['Grambank']) cldf = args.cldf data = Data() dataset = models.Grambank( id=grambank.__name__, name="Grambank", description="Grambank", publisher_name="Max Planck Institute for Evolutionary Anthropology", publisher_place="Leipzig", publisher_url="https://www.eva.mpg.de", license="http://creativecommons.org/licenses/by/4.0/", domain='grambank.clld.org', contact='*****@*****.**', jsondata={ 'license_icon': 'cc-by.png', 'license_name': 'Creative Commons Attribution 4.0 International License' }) contributors = {} for i, contrib in enumerate(api.contributors): contrib = common.Contributor( contrib.id, id=contrib.id, name=contrib.name, ) common.Editor(dataset=dataset, contributor=contrib, ord=i) DBSession.add(contrib) DBSession.flush() contributors[contrib.id] = contrib.pk contributions = {r['ID']: r for r in cldf['LanguageTable']} DBSession.add(dataset) for rec in tqdm(list(Database.from_file(cldf.bibpath, lowercase=True)), desc='sources'): data.add(common.Source, rec.id, _obj=bibtex2source(rec)) DBSession.flush() sources = {k: v.pk for k, v in data['Source'].items()} features, codes = import_features(cldf, contributors) transaction.commit() values_by_sheet = [(lid, list(v)) for lid, v in itertools.groupby( sorted(cldf['ValueTable'], key=lambda r: r['Language_ID']), lambda r: r['Language_ID'], )] for lid, values in tqdm(values_by_sheet, desc='loading values'): transaction.begin() import_values(values, contributions[lid], features, codes, contributors, sources) transaction.commit() transaction.begin() glottolog = Glottolog(REPOS['glottolog']) languoids = {l.id: l for l in glottolog.languoids()} gblangs = DBSession.query(models.GrambankLanguage).all() load_families(data, gblangs, glottolog_repos=REPOS['glottolog'], isolates_icon='dcccccc') # Add isolates for lg in gblangs: gl_language = languoids.get(lg.id) if not gl_language.family: family = data.add( Family, gl_language.id, id=gl_language.id, name=gl_language.name, description=common.Identifier( name=gl_language.id, type=common.IdentifierType.glottolog.value).url(), jsondata={"icon": 'tcccccc'}) lg.family = family coverage.main(glottolog) return
def main(args): assert args.glottolog, 'The --glottolog option is required!' license = licenses.find(args.cldf.properties['dc:license']) assert license and license.id.startswith('CC-') data = Data() ds = data.add( common.Dataset, mixezoqueanvoices.__name__, id=mixezoqueanvoices.__name__, name="Mixe-Zoquean Voices", domain='mixezoqueanvoices.clld.org', publisher_name="Max Planck Institute for Evolutionary Anthropology", publisher_place="Leipzig", publisher_url="http://www.eva.mpg.de", license=license.url, jsondata={ 'license_icon': '{}.png'.format('-'.join( [p.lower() for p in license.id.split('-')[:-1]])), 'license_name': license.name }, ) contrib = data.add( common.Contribution, None, id='cldf', name=args.cldf.properties.get('dc:title'), description=args.cldf.properties.get('dc:bibliographicCitation'), ) data.add(common.Contributor, 'kondic', id='kondic', name='Ana Kondic') data.add(common.Contributor, 'gray', id='gray', name='Russell Gray') DBSession.add( common.ContributionContributor( contribution=contrib, contributor=data['Contributor']['kondic'], )) for i, ed in enumerate(['kondic', 'gray']): data.add(common.Editor, ed, dataset=ds, contributor=data['Contributor'][ed], ord=i) ancestors = collections.defaultdict(list) gl = Glottolog(args.glottolog) lnames = {} for lang in args.cldf.iter_rows('LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'): lnames[lang['id']] = lang['name'] glang = None if lang['glottocode']: glang = gl.languoid(lang['glottocode']) lineage = [i[0] for i in glang.lineage] if 'Mixe-Zoque' in lineage: ancestors[lang['id']].append('Protomixezoque') if 'Mixe' in lineage: ancestors[lang['id']].append('Protomixe') if 'Oaxaca Mixe' in lineage: ancestors[lang['id']].append('Protooaxacamixe') if not glang: assert lang['name'] == 'Nizaviguiti' data.add( models.Variety, lang['id'], id=lang['id'], name=lang['name'], latitude=lang['latitude'], longitude=lang['longitude'], glottocode=lang['glottocode'], description=lang['LongName'], subgroup=glang.lineage[1][0] if glang and len(glang.lineage) > 1 else None, ) colors = dict( zip( set(l.subgroup for l in data['Variety'].values()), qualitative_colors( len(set(l.subgroup for l in data['Variety'].values()))))) for l in data['Variety'].values(): l.jsondata = dict(color=colors[l.subgroup].replace('#', '')) for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True): data.add(common.Source, rec.id, _obj=bibtex2source(rec)) refs = collections.defaultdict(list) # Store proto-forms for later lookup: proto_forms = collections.defaultdict( lambda: collections.defaultdict(list)) for form in args.cldf.iter_rows('FormTable', 'id', 'form', 'languageReference', 'parameterReference'): if form['languageReference'].startswith('Proto'): proto_forms[form['languageReference']][ form['parameterReference']].append(form['form']) for param in args.cldf.iter_rows('ParameterTable', 'id', 'concepticonReference', 'name'): proto = collections.OrderedDict() for lid, forms in proto_forms.items(): f = forms.get(param['id']) if f: proto[lnames[lid]] = f data.add( models.Concept, param['id'], id=param['id'], name='{} [{}]'.format(param['name'], param['id'].split('_')[0]), concepticon_id=param['concepticonReference'], concepticon_gloss=param['Concepticon_Gloss'], description=param['Spanish_Gloss'], jsondata=dict(reconstructions=proto), ) f2a = form2audio(args.cldf) for form in args.cldf.iter_rows('FormTable', 'id', 'form', 'languageReference', 'parameterReference', 'source'): assert not (form['form'] == '►' and not f2a.get(form['id'])) 
vsid = (form['languageReference'], form['parameterReference']) vs = data['ValueSet'].get(vsid) if not vs: vs = data.add( common.ValueSet, vsid, id='-'.join(vsid), language=data['Variety'][form['languageReference']], parameter=data['Concept'][form['parameterReference']], contribution=contrib, ) for ref in form.get('source', []): sid, pages = Sources.parse(ref) refs[(vsid, sid)].append(pages) proto = collections.OrderedDict() for lid in ancestors.get(form['languageReference'], []): f = proto_forms[lid].get(form['parameterReference']) if f: proto[lnames[lid]] = f data.add( Counterpart, form['id'], id=form['id'], name=form['form'], valueset=vs, audio=f2a.get(form['id']), jsondata=dict(reconstructions=proto), ) for (vsid, sid), pages in refs.items(): DBSession.add( common.ValueSetReference(valueset=data['ValueSet'][vsid], source=data['Source'][sid], description='; '.join(nfilter(pages))))
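# Illustrative sketch of the pyglottolog lineage lookup used above; it assumes
# a local clone of glottolog/glottolog, and the glottocode is made up.
from pyglottolog import Glottolog

gl = Glottolog('../../glottolog')
languoid = gl.languoid('abcd1234')
# languoid.lineage lists (name, glottocode, level) tuples from the top of the
# classification down to the languoid's parent:
ancestor_names = [name for name, gc, level in languoid.lineage]
subgroup = languoid.lineage[1][0] if len(languoid.lineage) > 1 else None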
def add_to_database(self, data, language_id_map, contrib): used_languages = { row['Language_ID'] for row in chain( self.cldf.get('ValueTable') or (), self.cldf.get('ExampleTable') or (), self.cldf.get('constructions.csv') or ()) if row.get('Language_ID') } local_lang_ids = set() for language_row in self.cldf['LanguageTable']: old_id = language_row.get('ID') if not old_id or old_id not in used_languages: continue # Apparently some datasets contain multiple languages sharing the # same Glottocode... So try and use the name to distinguish them id_candidate = language_row.get('Glottocode') or old_id number = 1 new_id = id_candidate lang = data['Variety'].get(new_id) while (lang and new_id in local_lang_ids and slug(lang.name) != slug(language_row.get('Name'))): number += 1 new_id = '{}-{}'.format(id_candidate, number) lang = data['Variety'].get(new_id) local_lang_ids.add(new_id) language_id_map[old_id] = new_id if not lang: lang = data.add(Variety, new_id, id=new_id, **map_cols(LANG_MAP, language_row)) DBSession.flush() # TODO add glottocode, iso code, and wals code if available DBSession.add( ContributionLanguage(language_pk=lang.pk, contribution_pk=contrib.pk)) DBSession.flush() for i, spec in enumerate(self.authors): if not isinstance(spec, dict): spec = {'name': spec} name = spec.get('name', '') parsed_name = HumanName(name) author_id = slug('{}{}'.format(parsed_name.last, parsed_name.first)) author = data['Contributor'].get(author_id) if not author: author = data.add(Contributor, author_id, id=author_id, name=parsed_name.full_name, address=spec.get('affiliation'), url=spec.get('url'), email=spec.get('email')) DBSession.flush() DBSession.add( ContributionContributor(ord=i + 1, primary=spec.get('primary', True), contribution=contrib, contributor=author)) biblio_map = {} if self.sources: for bibrecord in self.sources.records: source = bibtex2source(bibrecord, CrossgramDataSource) old_id = bibrecord.id new_id = '{}-{}'.format(contrib.id, old_id) source.id = new_id source.contribution = contrib biblio_map[old_id] = source cparam_ids = { row['Parameter_ID'] for row in self.cldf.get('cvalues.csv', ()) if 'Parameter_ID' in row } if self.cldf.get('ParameterTable'): for param_row in self.cldf.get('ParameterTable', ()): old_id = param_row.get('ID') if not old_id: continue new_id = '{}-{}'.format(contrib.id, old_id) data.add(CParameter if old_id in cparam_ids else LParameter, old_id, contribution=contrib, id=new_id, **map_cols(PARAM_MAP, param_row)) else: # If there is no parameter table fall back to Parameter_ID's in the # value tables for lvalue_row in self.cldf.get('ValueTable', ()): old_id = lvalue_row.get('Parameter_ID') if not old_id or old_id in data['LParameter']: continue new_id = '{}-{}'.format(contrib.id, old_id) data.add(LParameter, old_id, contribution=contrib, id=new_id, name=old_id) for cvalue_row in self.cldf.get('cvalues.csv', ()): old_id = lvalue_row.get('Parameter_ID') if not old_id or old_id in data['CParameter']: continue new_id = '{}-{}'.format(contrib.id, old_id) data.add(LParameter, old_id, contribution=contrib, id=new_id, name=old_id) DBSession.flush() for code_row in self.cldf.get('CodeTable', ()): old_id = code_row.get('ID') param_id = code_row.get('Parameter_ID') if not old_id or not param_id: continue new_id = '{}-{}'.format(contrib.id, old_id) if param_id in cparam_ids: param = data['CParameter'].get(param_id) data.add(UnitDomainElement, old_id, parameter=param, id=new_id, **map_cols(CCODE_MAP, code_row)) else: param = data['LParameter'].get(param_id) 
data.add(DomainElement, old_id, parameter=param, id=new_id, **map_cols(LCODE_MAP, code_row)) for index, example_row in enumerate(self.cldf.get('ExampleTable', ())): old_id = example_row.get('ID') lang_new_id = language_id_map.get(example_row['Language_ID']) lang = data['Variety'].get(lang_new_id) if not old_id or not lang: continue new_id = '{}-{}'.format(contrib.number or contrib.id, index + 1) example_row = _merge_glosses(example_row) example = data.add(Example, old_id, language=lang, contribution=contrib, id=new_id, **map_cols(EXAMPLE_MAP, example_row)) DBSession.flush() st = parse_source(biblio_map, example_row.get('Source') or '') if st and st.source_pk is not None: DBSession.add( SentenceReference(key=st.bibkey, description=st.pages, sentence_pk=example.pk, source_pk=st.source_pk)) DBSession.flush() for constr_row in self.cldf.get('constructions.csv', ()): old_id = constr_row.get('ID') if not old_id: continue new_id = '{}-{}'.format(contrib.id, old_id) lang_new_id = language_id_map.get(constr_row['Language_ID']) lang = data['Variety'].get(lang_new_id) constr = data.add(Construction, old_id, language=lang, contribution=contrib, id=new_id, **map_cols(CONSTR_MAP, constr_row)) DBSession.flush() for source_string in sorted(set(constr_row.get('Source') or ())): st = parse_source(biblio_map, source_string) if st and st.source_pk is not None: DBSession.add( UnitReference(key=st.bibkey, description=st.pages, unit_pk=constr.pk, source_pk=st.source_pk)) for ex_id in sorted(set(constr_row.get('Example_IDs', ()))): example = data['Example'].get(ex_id) if example: DBSession.add(UnitSentence(unit=constr, sentence=example)) DBSession.flush() valueset_refs = OrderedDict() for value_row in self.cldf.get('ValueTable', ()): old_id = value_row.get('ID') lang_new_id = language_id_map.get(value_row['Language_ID']) lang = data['Variety'].get(lang_new_id) param = data['LParameter'].get(value_row['Parameter_ID']) code = data['DomainElement'].get(value_row['Code_ID']) value_name = code.name if code and code.name else value_row['Value'] if not old_id or not lang or not param or not value_name: continue new_id = '{}-{}'.format(contrib.id, old_id) valueset = data['ValueSet'].get((lang.pk, param.pk)) if not valueset: valueset = data.add(ValueSet, (lang.pk, param.pk), id=new_id, language=lang, parameter=param, contribution=contrib) DBSession.flush() lvalue = data['Value'].get((valueset.pk, value_name)) if not lvalue: lvalue = data.add(Value, (valueset.pk, value_name), id=new_id, name=value_name, valueset=valueset, domainelement=code) for source_string in sorted(set(value_row.get('Source') or ())): st = parse_source(biblio_map, source_string) if st and st.source_pk is not None: # collect sources for all values in the same value set if valueset.pk not in valueset_refs: valueset_refs[valueset.pk] = list() valueset_refs[valueset.pk].append(st) DBSession.flush() for ex_id in sorted(set(value_row.get('Example_IDs', ()))): example = data['Example'].get(ex_id) if example: DBSession.add(ValueSentence(value=lvalue, sentence=example)) # attach collected sources from values to the value set valuesets = DBSession.query(ValueSet)\ .filter(ValueSet.contribution == contrib) for valueset in valuesets: source_tuples = sorted(set(valueset_refs.get(valueset.pk, ()))) for st in source_tuples: DBSession.add( ValueSetReference(key=st.bibkey, description=st.pages or None, valueset_pk=valueset.pk, source_pk=st.source_pk)) valueset.source = ';'.join(st[2] for st in source_tuples) for cvalue_row in self.cldf.get('cvalues.csv', ()): old_id = 
cvalue_row.get('ID') constr = data['Construction'].get(cvalue_row['Construction_ID']) param = data['CParameter'].get(cvalue_row['Parameter_ID']) code = data['UnitDomainElement'].get(cvalue_row['Code_ID']) value_name = code.name if code else cvalue_row['Value'] if not old_id or not constr or not param or not value_name: continue new_id = '{}-{}'.format(contrib.id, old_id) cvalue = data.add(UnitValue, old_id, id=new_id, name=value_name, contribution=contrib, unit=constr, unitparameter=param, unitdomainelement=code) DBSession.flush() for ex_id in sorted(set(cvalue_row.get('Example_IDs') or ())): example = data['Example'].get(ex_id) if example: DBSession.add( UnitValueSentence(unitvalue=cvalue, sentence=example)) for source_string in sorted(set(cvalue_row.get('Source') or ())): st = parse_source(biblio_map, source_string) if st and st.source_pk is not None: DBSession.add( UnitValueReference(key=st.bibkey, description=st.pages or None, unitvalue=cvalue, source_pk=st.source_pk))
def main(args):
    data = Data()
    doi = input('DOI of the released dataset: ')
    dataset = common.Dataset(
        id=ewave.__name__,
        name='eWAVE',
        description='The Electronic World Atlas of Varieties of English',
        domain='ewave-atlas.org',
        published=date.today(),
        license='http://creativecommons.org/licenses/by/3.0/',
        contact='*****@*****.**',
        jsondata={
            'doi': doi,
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'})
    DBSession.add(dataset)

    ed_pattern = re.compile('ed(?P<ord>[0-9]+)$')
    for c in args.cldf['contributors.csv']:
        contrib = data.add(
            models.WaveContributor, c['ID'],
            id=c['ID'], name=c['Name'], email=c['Email'], url=c['URL'],
            address=c['Address'], sortkey=HumanName(c['Name']).last,
        )
        m = ed_pattern.match(c['ID'])
        if m:
            common.Editor(dataset=dataset, contributor=contrib, ord=int(m.group('ord')))

    for fc in args.cldf['featurecategories.csv']:
        data.add(
            models.FeatureCategory, fc['ID'],
            id=fc['ID'], name=fc['Name'], description=fc['Description'])

    for vt in args.cldf['varietytypes.csv']:
        data.add(
            models.VarietyType, vt['ID'],
            id=vt['ID'], name=vt['Name'], description=vt['Description'],
            jsondata=VARIETY_TYPE_ICONS[vt['ID']],
        )

    for vt in args.cldf['regions.csv']:
        data.add(models.Region, vt['ID'], id=vt['ID'], name=vt['Name'])

    for lang in args.cldf['LanguageTable']:
        l = data.add(
            models.Variety, lang['ID'],
            id=lang['ID'], name=lang['Name'],
            latitude=lang['Latitude'], longitude=lang['Longitude'],
            abbr=lang['abbr'],
            region=data['Region'][lang['Region_ID']],
            type=data['VarietyType'][lang['Type_ID']],
        )
        if lang['Glottocode']:
            add_language_codes(data, l, None, glottocode=lang['Glottocode'])
        c = data.add(
            models.WaveContribution, lang['ID'],
            id=str(lang['ID']), name=lang['Name'],
            description=lang['Description'], variety=l)
        for i, cid in enumerate(lang['Contributor_ID']):
            DBSession.add(common.ContributionContributor(
                contribution=c,
                contributor=data['WaveContributor'][cid],
                ord=i + 1,
            ))

    for param in args.cldf['ParameterTable']:
        data.add(
            models.Feature, param['ID'],
            id=param['ID'],
            category=data['FeatureCategory'][param['Category_ID']],
            name=param['Name'], description=param['Description'],
            jsondata={'example_source': param['Example_Source']})

    for de in args.cldf['CodeTable']:
        data.add(
            common.DomainElement, de['ID'],
            id=de['ID'],
            parameter=data['Feature'][de['Parameter_ID']],
            name=de['Name'], description=de['Description'],
            jsondata={'color': CODE_COLORS[de['Name']]},
            number=de['Number'])

    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        data.add(common.Source, slug(rec.id), _obj=bibtex2source(rec))

    for example in args.cldf['ExampleTable']:
        s = data.add(
            common.Sentence, example['ID'],
            id=example['ID'],
            name=example['Primary_Text'],
            gloss='\t'.join(example['Gloss']) if example['Gloss'] else None,
            comment=example['Comment'] or None,
            description=example['Translated_Text'] or None,
            language=data['Variety'][example['Language_ID']])
        for ref in example['Source']:
            sid, pages = Sources.parse(ref)
            DBSession.add(common.SentenceReference(
                sentence=s, source=data['Source'][sid], description=pages, key=sid))

    for value in args.cldf['ValueTable']:
        de = data['DomainElement'][value['Code_ID']]
        vs = data.add(
            common.ValueSet, value['ID'],
            id=value['ID'],
            contribution=data['WaveContribution'][value['Language_ID']],
            parameter=data['Feature'][value['Parameter_ID']],
            jsondata=de.jsondata,
            language=data['Variety'][value['Language_ID']])
        v = data.add(
            common.Value, value['ID'],
            id=value['ID'], domainelement=de, valueset=vs)
        for eid in value['Example_ID']:
            DBSession.add(common.ValueSentence(sentence=data['Sentence'][eid], value=v))
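# The reference loops above rely on the CLDF source-reference syntax: a source ID
# optionally followed by pages in square brackets, split by pycldf's Sources.parse.
# A simplified, self-contained re-implementation of that split (illustration only,
# not the pycldf code):
import re

def split_ref(ref):
    m = re.fullmatch(r'(?P<sid>[^[]+)(\[(?P<pages>[^\]]*)\])?', ref.strip())
    return m.group('sid'), m.group('pages')

assert split_ref('kortmann2004[20-23]') == ('kortmann2004', '20-23')
assert split_ref('kortmann2004') == ('kortmann2004', None)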
def main(args):
    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    ds = data.add(
        common.Dataset, jambu.__name__,
        id=jambu.__name__,
        name='Jambu',
        domain='jambu-clld.herokuapp.com',
        publisher_name="Georgetown University",
        publisher_place="Washington",
        publisher_url="http://gucl.georgetown.edu/",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'},
    )
    for i, name in enumerate(['Aryaman Arora']):
        common.Editor(
            dataset=ds, ord=i,
            contributor=common.Contributor(id=slug(HumanName(name).last), name=name))

    contrib = data.add(
        common.Contribution, None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    print("Languages...")
    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'name', 'glottocode',
                          'longitude', 'latitude', 'Clade'):
        data.add(
            models.Variety, lang['id'],
            id=lang['id'], name=lang['name'],
            latitude=lang['latitude'], longitude=lang['longitude'],
            glottocode=lang['glottocode'], family=lang['Clade'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    refs = collections.defaultdict(list)

    print("Cognates...")
    for cognate in iteritems(args.cldf, 'CognateTable'):
        # print(cognate)
        data.add(
            models.Cognate_, cognate['Cognateset_ID'],
            name=cognate['Form'],
            language=cognate['Language_ID'],
            description=cognate['Description'])

    # First pass over the forms: count the attesting languages per concept.
    counts = collections.defaultdict(set)
    print("Forms...")
    for form in tqdm(iteritems(args.cldf, 'FormTable', 'id', 'form', 'languageReference',
                               'parameterReference', 'source')):
        counts[form['parameterReference']].add(form['languageReference'])

    print("Params...")
    for param in tqdm(iteritems(args.cldf, 'ParameterTable', 'ID', 'Name',
                                'Concepticon_ID', 'Description')):
        data.add(
            models.Concept, param['ID'],
            id=param['ID'],
            name='{} [{}]'.format(param['Name'], param['ID']),
            description=param['Description'],
            count=len(counts[param['ID']]))

    # Second pass: create value sets and lexemes.
    print("Forms...")
    for form in tqdm(iteritems(args.cldf, 'FormTable', 'id', 'form', 'languageReference',
                               'parameterReference', 'source')):
        l = re.split(r";|\+", form['parameterReference'])
        for i, paramref in enumerate(l):
            if paramref == '?':
                continue
            vsid = (form['languageReference'], paramref)
            vs = data['ValueSet'].get(vsid)
            if not vs:
                vs = data.add(
                    common.ValueSet, vsid,
                    id='-'.join(vsid),
                    language=data['Variety'][form['languageReference']],
                    parameter=data['Concept'][paramref],
                    contribution=contrib,
                )
            for ref in form.get('source', []):
                sid, pages = Sources.parse(ref)
                refs[(vsid, sid)].append(pages)
            data.add(
                models.Lexeme,
                form['id'] + '-' + str(i) if len(l) > 1 else form['id'],
                id=form['id'] + '-' + str(i) if len(l) > 1 else form['id'],
                name=form['form'],
                gloss=form['Gloss'],
                native=form['Native'],
                phonemic='/' + form['Phonemic'] + '/' if form['Phonemic'] else None,
                description=form['Description'],
                cognateset=form['Cognateset'],
                valueset=vs,
            )

    print("Refs...")
    for (vsid, sid), pages in tqdm(refs.items()):
        DBSession.add(common.ValueSetReference(
            valueset=data['ValueSet'][vsid],
            source=data['Source'][sid],
            description='; '.join(nfilter(pages))))
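# Jambu forms may be linked to several concepts at once; the second pass above
# splits 'parameterReference' on ';' or '+' and creates one lexeme per concept,
# suffixing the form ID only when the split yields more than one part. Isolated
# illustration of that splitting logic (helper name is ours, not the project's):
import re

def split_parameter_refs(form_id, parameter_reference):
    parts = re.split(r";|\+", parameter_reference)
    return [
        (form_id + '-' + str(i) if len(parts) > 1 else form_id, p)
        for i, p in enumerate(parts) if p != '?']

# split_parameter_refs('f1', 'water;river') -> [('f1-0', 'water'), ('f1-1', 'river')]
# split_parameter_refs('f2', 'water')       -> [('f2', 'water')]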
def main(args):
    def data_file(*comps):
        return Path(args.data_repos).joinpath('tsammalexdata', 'data', *comps)

    data = Data()
    data.add(
        common.Dataset, 'tsammalex',
        id="tsammalex",
        name="Tsammalex",
        description="Tsammalex: A lexical database on plants and animals",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        domain='tsammalex.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    data.add(common.Contribution, 'tsammalex', name="Tsammalex", id="tsammalex")

    for rec in Database.from_file(data_file('sources.bib'), lowercase=True):
        data.add(models.Bibrec, rec.id, _obj=bibtex2source(rec, cls=models.Bibrec))

    load_ecoregions(data_file, data)
    load_countries(data)

    second_languages = {}

    def languoid_visitor(lang, row, _):
        add_language_codes(
            data, lang, lang.id.split('-')[0], None, glottocode=row[2] or None)
        second_languages[row[0]] = row[8]

    def habitat_visitor(cat, *_):
        cat.is_habitat = True

    def taxon_visitor(auto, taxon, *_):
        if auto.get(taxon.id):
            update_taxon_data(taxon, auto[taxon.id], data)
        else:
            print('--> missing in taxa.json:', taxon.id, taxon.name)
        taxon.countries_str = ' '.join([e.id for e in taxon.countries])
        taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions])

    auto = {s['id']: s for s in jsonload(data_file('taxa.json'))}
    for model, kw in [
        (models.Lineage, {}),
        (models.Use, {}),
        (models.TsammalexContributor, {}),
        (models.Languoid, dict(visitor=languoid_visitor)),
        (models.Category, dict(name='categories')),
        (models.Category, dict(name='habitats', visitor=habitat_visitor)),
        (models.Taxon, dict(visitor=partial(taxon_visitor, auto))),
        (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])),
    ]:
        from_csv(data_file, model, data, **kw)

    for key, ids in second_languages.items():
        target = data['Languoid'][key]
        for lid in models.split_ids(ids):
            if lid in data['Languoid']:
                # we ignore 2nd languages which are not yet in Tsammalex.
                target.second_languages.append(data['Languoid'][lid])

    def image_url(source_url, type_):
        return re.sub(r'\.[a-zA-Z]+$', '.jpg', source_url).replace(
            '/original/', '/%s/' % type_)

    for fname in data_files(data_file, 'images.csv'):
        for image in reader(fname, namedtuples=True, delimiter=","):
            if image.taxa__id not in data['Taxon']:
                continue
            url = URL(image.source_url)
            if url.host() != 'edmond.mpdl.mpg.de':
                continue
            jsondata = dict(
                url=image.source_url,
                thumbnail=image_url(image.source_url, 'thumbnail'),
                web=image_url(image.source_url, 'web'))
            f = common.Parameter_files(
                object=data['Taxon'][image.taxa__id],
                id=image.id,
                name=image.tags,
                jsondata=jsondata,
                mime_type=image.mime_type)
            for k in 'source creator date place comments permission'.split():
                v = getattr(image, k)
                if v:
                    models.ImageData(key=k, value=v, image=f)
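# The image handling above derives the thumbnail and web variants from the
# original Edmond URL by swapping the file extension for '.jpg' and the
# '/original/' path segment for the requested size. A worked example with a
# hypothetical URL (the real paths on edmond.mpdl.mpg.de may differ):
#
#   image_url('https://edmond.mpdl.mpg.de/files/original/lion.png', 'thumbnail')
#   -> 'https://edmond.mpdl.mpg.de/files/thumbnail/lion.jpg'
#   image_url('https://edmond.mpdl.mpg.de/files/original/lion.png', 'web')
#   -> 'https://edmond.mpdl.mpg.de/files/web/lion.jpg'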
def main(args):
    data = Data()
    ds = Pofatu(pathlib.Path(pofatu.__file__).parent.parent.parent / 'pofatu-data')

    dataset = common.Dataset(
        id=pofatu.__name__,
        name="POFATU",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="https://creativecommons.org/licenses/by/4.0/",
        domain='pofatu.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    for i, (id_, name) in enumerate([
        ('hermannaymeric', 'Aymeric Hermann'),
        ('forkelrobert', 'Robert Forkel'),
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    DBSession.add(dataset)

    for rec in ds.iterbib():
        rec.genre = bibtex.EntryType.from_string(ENTRY_TYPES.get(rec.genre, rec.genre))
        if 'date' in rec:
            rec['year'] = rec.pop('date')
        data.add(common.Source, rec.id, _obj=bibtex2source(rec, lowercase_id=False))

    analyses = list(ds.iterdata())

    def midpoint(coords):
        # shift negative longitudes by +360 so point clouds straddling the
        # antimeridian stay contiguous, take the convex hull centroid, undo the shift
        p = MultiPoint(
            [(lat, lon + 360 if lon < 0 else lon) for lat, lon in coords]).convex_hull
        #geojson = {
        #    'type': 'Feature',
        #    'properties': {},
        #    'geometry': mapping(p)}
        c = p.centroid
        return c.x, (c.y - 360) if c.y > 180 else c.y

    artefacts = collections.defaultdict(dict)
    midpoints = {}
    for a in analyses:
        l = a.sample.location
        lid = l.id
        if lid not in midpoints:
            midpoints[lid] = set()
        if l.latitude is not None and l.longitude is not None:
            midpoints[lid].add((l.latitude, l.longitude))
        art = a.sample.artefact
        for attr_ in ['name', 'category', 'collection_type']:
            if not artefacts[slug(art.id)].get(attr_):
                artefacts[slug(art.id)][attr_] = getattr(art, attr_)

    midpoints = {k: midpoint(v) if v else (None, None) for k, v in midpoints.items()}

    for analysis in analyses:
        loc = analysis.sample.location
        if loc.id not in data['Location']:
            data.add(
                models.Location, loc.id,
                id=valid_id(loc.id),
                name=loc.label,
                latitude=midpoints[loc.id][0],
                longitude=midpoints[loc.id][1],
                region=loc.region.replace('_', ' '),
                subregion=loc.subregion,
                location=loc.locality,
            )

    # Add contributions
    for contrib in ds.itercontributions():
        contribution = data.add(
            common.Contribution, contrib.id,
            id=valid_id(contrib.id),
            name=contrib.label,
            description=contrib.description,
        )
        DBSession.flush()
        for i, name in enumerate(contrib.contributors):
            cid = slug(name)
            co = data['Contributor'].get(cid)
            if not co:
                co = data.add(common.Contributor, cid, id=cid, name=name)
            common.ContributionContributor(
                ord=i, contribution=contribution, contributor=co)
        for ref in contrib.source_ids:
            DBSession.add(common.ContributionReference(
                contribution=contribution,
                source=data['Source'][ref],
            ))
            data['Contribution'][ref] = contribution

    methods = collections.defaultdict(list)
    for method in ds.itermethods():
        m = data.add(
            models.Method, method.id,
            id=valid_id(method.id),
            name=method.label,
            code=method.code,
            parameter=method.parameter.strip(),
            instrument=method.instrument,
            number_of_replicates=method.number_of_replicates,
            date=method.date,
            comment=method.comment,
            detection_limit=method.detection_limit,
            detection_limit_unit=method.detection_limit_unit,
            total_procedural_blank_value=method.total_procedural_blank_value,
            total_procedural_unit=method.total_procedural_unit,
        )
        methods[(m.code.lower(), m.parameter.lower())].append(m)
        for ref in method.references:
            DBSession.add(models.MethodReference(
                method=m,
                sample_name=ref.sample_name,
                sample_measured_value=ref.sample_measured_value,
                uncertainty=ref.uncertainty,
                uncertainty_unit=ref.uncertainty_unit,
                number_of_measurements=ref.number_of_measurements,
            ))
        for ref in method.normalizations:
            DBSession.add(models.Normalization(
                method=m,
                reference_sample_name=ref.reference_sample_name,
                reference_sample_accepted_value=ref.reference_sample_accepted_value,
                citation=ref.citation,
            ))

    parameter = data.add(common.Parameter, 'c', id='category', name='Sample category')
    for i, opt in enumerate(
            attr.fields_dict(pypofatu.models.Sample)['sample_category'].validator.options,
            start=1):
        data.add(common.DomainElement, opt, parameter=parameter, id=str(i), name=opt)

    DBSession.flush()
    assert parameter.pk

    # Add Samples and UnitParameters and Measurements
    for analysis in analyses:
        sample = analysis.sample
        vsid = '{0}-{1}'.format(
            sample.location.id, data['Contribution'][sample.source_id].id)
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet, vsid,
                id=valid_id(vsid),
                language_pk=data['Location'][sample.location.id].pk,
                parameter_pk=parameter.pk,
                contribution_pk=data['Contribution'][sample.source_id].pk,
            )
        v = data['Sample'].get(sample.id)
        if not v:
            v = data.add(
                models.Sample, sample.id,
                id=valid_id(sample.id),
                name=sample.id,
                sample_name=sample.sample_name,
                sample_comment=sample.sample_comment,
                petrography=sample.petrography,
                latitude=sample.location.latitude,
                longitude=sample.location.longitude,
                elevation=sample.location.elevation,
                location_comment=sample.location.comment,
                site_name=sample.site.name,
                site_code=sample.site.code,
                site_context=sample.site.context,
                site_comment=sample.site.comment,
                site_stratigraphic_position=sample.site.stratigraphic_position,
                site_stratigraphy_comment=sample.site.stratigraphy_comment,
                domainelement=data['DomainElement'][sample.sample_category],
                valueset=vs,
                artefact_id=sample.artefact.id,
                artefact_name=sample.artefact.name,
                artefact_category=sample.artefact.category,
                artefact_comment=sample.artefact.comment,
                artefact_attributes=sample.artefact.attributes,
                artefact_collector=sample.artefact.collector,
                artefact_collection_type=sample.artefact.collection_type,
                artefact_collection_location=sample.artefact.collection_location,
                artefact_collection_comment=sample.artefact.collection_comment,
                artefact_fieldwork_date=sample.artefact.fieldwork_date,
            )
            DBSession.add(models.SampleReference(
                description='sample', sample=v, source=data['Source'][sample.source_id]))
            for ref in sample.artefact.source_ids:
                DBSession.add(models.SampleReference(
                    description='artefact', sample=v, source=data['Source'][ref]))
            for ref in sample.site.source_ids:
                DBSession.add(models.SampleReference(
                    description='site', sample=v, source=data['Source'][ref]))

        a = data.add(
            models.Analysis, analysis.id,
            id=better_slug(analysis.id),
            name=analysis.id,
            sample=v,
        )
        for i, measurement in enumerate(analysis.measurements):
            if i == 0:
                method = measurement.method
                if method:
                    # Note: the flattened original had trailing commas on these
                    # assignments, which would have assigned one-element tuples
                    # instead of the plain values.
                    a.analyzed_material_1 = method.analyzed_material_1
                    a.analyzed_material_2 = method.analyzed_material_2
                    a.sample_preparation = method.sample_preparation
                    a.chemical_treatment = method.chemical_treatment
                    a.technique = method.technique
                    a.laboratory = method.laboratory
                    a.analyst = method.analyst
            pid = slug(measurement.parameter, lowercase=False)
            p = data['Param'].get(pid)
            if not p:
                p = data.add(models.Param, pid, id=pid, name=measurement.parameter)
            data.add(
                models.Measurement, None,
                id='{0}-{1}'.format(a.id, p.id),
                analysis=a,
                method=data['Method'].get(measurement.method.id)
                if measurement.method else None,
                value=measurement.value,
                less=measurement.less,
                precision=measurement.value_sd,
                sigma=measurement.sd_sigma,
                unitparameter=p,
            )
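# The midpoint() helper above reduces all sample coordinates of a location to a
# single representative point. A self-contained sketch with made-up coordinates
# (requires shapely; mirrors the logic shown, not an exact copy of POFATU's data):
from shapely.geometry import MultiPoint

def midpoint_sketch(coords):
    # coords are (lat, lon) pairs; shift western longitudes by +360 so pairs on
    # both sides of the antimeridian stay contiguous, then take the hull centroid
    p = MultiPoint(
        [(lat, lon + 360 if lon < 0 else lon) for lat, lon in coords]).convex_hull
    c = p.centroid
    return c.x, (c.y - 360) if c.y > 180 else c.y

# Two points on either side of the date line yield a longitude near 180 rather
# than near 0:
# midpoint_sketch([(-17.5, 179.5), (-17.7, -179.9)]) -> (-17.6, 179.8)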
def main(args):  # pragma: no cover
    #
    # FIXME: more generic:
    # - run iter_datasets(args.cldf) -> assuming args.cldf is a directory! -> must go in clld!
    # - Store datasets in defaultdict(list) keyed with module
    #
    datasets = {}
    for ds in iter_datasets(args.cldf.directory):
        datasets[ds.module] = ds

    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    thedataset = data.add(
        common.Dataset, hindukush.__name__,
        id=hindukush.__name__,
        name='Hindu Kush Areal Typology',
        domain='hindukush.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'},
    )
    for i, name in enumerate(
            ['Henrik Liljegren', 'Robert Forkel', 'Nina Knobloch', 'Noa Lange']):
        common.Editor(
            dataset=thedataset, ord=i,
            contributor=common.Contributor(id=slug(HumanName(name).last), name=name))

    for rec in bibtex.Database.from_file(
            pathlib.Path(__file__).parent / 'HK_website.bib', lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    for module, ds in sorted(datasets.items(), key=lambda i: i[0]):
        for lang in ds.iter_rows(
                'LanguageTable', 'id', 'glottocode', 'name', 'latitude', 'longitude'):
            if lang['id'] not in data['Variety']:
                data.add(
                    models.Variety, lang['id'],
                    id=lang['id'], name=lang['name'],
                    latitude=lang['latitude'], longitude=lang['longitude'],
                    glottocode=lang['glottocode'],
                    subgroup=lang['SubGroup'],
                    location=lang['Location'],
                    elicitation=lang['Elicitation'],
                    jsondata=dict(shape=subgroup_shapes.get(lang['SubGroup'])),
                )

        contrib = data.add(
            models.CLDFDataset, module,
            id=module,
            name='{} [{}]'.format(ds.properties.get('dc:title'), module),
            description=ds.properties.get('dc:bibliographicCitation'),
            module=module,
        )

        if module == 'Wordlist':
            for param in ds.iter_rows(
                    'ParameterTable', 'id', 'concepticonReference', 'name'):
                data.add(
                    models.Param, param['id'],
                    id=param['id'],
                    name='{} [{}]'.format(param['name'], param['id']),
                    sortkey=param['id'] if not param['id'].startswith('Numerals')
                    else 'Numerals-{0:04d}'.format(int(param['id'].split('-')[1])),
                    concepticon_id=param['concepticonReference'],
                    contribution=contrib,
                    category=param['domain'] or 'ASJPlist',
                )
            audio = {
                r['ID']: r for r in ds.iter_rows('media.csv')
                if r['mimetype'] == 'audio/mpeg'}
            for form in ds.iter_rows(
                    'FormTable', 'id', 'form', 'languageReference',
                    'parameterReference', 'source'):
                vsid = (form['languageReference'], form['parameterReference'])
                vs = data['ValueSet'].get(vsid)
                if not vs:
                    vs = data.add(
                        common.ValueSet, vsid,
                        id='-'.join(vsid),
                        language=data['Variety'][form['languageReference']],
                        parameter=data['Param'][form['parameterReference']],
                        contribution=contrib,
                    )
                for ref in form.get('source', []):
                    sid, pages = Sources.parse(ref)
                    refs[(vsid, sid)].append(pages)
                mp3 = next(
                    iter([audio[aid] for aid in form['Audio_Files'] if aid in audio]),
                    None)
                data.add(
                    common.Value, form['id'],
                    id=form['id'],
                    name=form['form'],
                    valueset=vs,
                    jsondata=dict(
                        audio=ds.get_row_url('media.csv', mp3) if mp3 else None),
                )
        elif module == 'StructureDataset':
            for param in ds.iter_rows('ParameterTable', 'id', 'name', 'description'):
                data.add(
                    models.Param, param['id'],
                    id=param['id'],
                    name=param['name'],
                    description=html(param['description'])
                    if param['description'] else None,
                    category=param['Category'],
                    contribution=contrib,
                )
            for code in ds.iter_rows(
                    'CodeTable', 'id', 'name', 'description', 'parameterReference'):
                data.add(
                    common.DomainElement, code['id'],
                    id=code['id'],
                    name=code['name'],
                    description=code['description'],
                    parameter=data['Param'][code['parameterReference']],
                    jsondata={'color': {
                        'absent': 'ff0000',
                        'present': '0000ff',
                        'indeterminate': 'cccccc',
                    }.get(code['description'])})
            #
            # FIXME: read CodeTable!
            #
            for form in ds.iter_rows(
                    'ValueTable', 'id', 'value', 'languageReference',
                    'parameterReference', 'codeReference', 'source'):
                vsid = (form['languageReference'], form['parameterReference'])
                vs = data['ValueSet'].get(vsid)
                if not vs:
                    vs = data.add(
                        common.ValueSet, vsid,
                        id='-'.join(vsid),
                        language=data['Variety'][form['languageReference']],
                        parameter=data['Param'][form['parameterReference']],
                        contribution=contrib,
                    )
                for ref in form.get('source', []):
                    sid, pages = Sources.parse(ref)
                    refs[(vsid, sid)].append(pages)
                data.add(
                    common.Value, form['id'],
                    id=form['id'],
                    name=form['value'],
                    valueset=vs,
                    domainelement=data['DomainElement'][form['codeReference']])

    for (vsid, sid), pages in refs.items():
        DBSession.add(common.ValueSetReference(
            valueset=data['ValueSet'][vsid],
            source=data['Source'][sid],
            description='; '.join(nfilter(pages))))

    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )
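# All of the loaders above use the same get-or-create idiom for value sets, keyed
# by (language, parameter) so that repeated forms share one ValueSet. A stripped-down
# version of that pattern (the helper name is ours; 'data' is the clld Data cache
# used throughout, 'contrib' the current contribution):
def get_or_create_valueset(data, contrib, lang_id, param_id):
    vsid = (lang_id, param_id)
    vs = data['ValueSet'].get(vsid)
    if not vs:
        vs = data.add(
            common.ValueSet, vsid,
            id='-'.join(vsid),
            language=data['Variety'][lang_id],
            parameter=data['Param'][param_id],
            contribution=contrib)
    return vs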