def testapp():
    """Yield a ``webtest.TestApp`` wrapping a throw-away clld test app.

    Builds an in-memory SQLite database, creates all tables, and seeds it
    with one dataset plus a minimal language/parameter/valueset/value and
    a cognate set, so the cognacy plugin's views have data to render.
    """
    from webtest import TestApp
    from clld.db.meta import DBSession, VersionedDBSession, Base
    from clld.db.models import common
    from clld_cognacy_plugin.models import Cognateset, Cognate

    def build_app():
        # Minimal clld configuration with the cognacy plugin included.
        # NOTE(review): relies on a module-level `config` (presumably
        # pyramid.config) imported elsewhere in this file — confirm.
        cfg = config.Configurator(settings={
            'sqlalchemy.url': 'sqlite://',
            'mako.directories': [
                'clld:web/templates',
                'clld_cognacy_plugin:templates',
            ]})
        cfg.include('clld.web.app')
        cfg.include('clld_cognacy_plugin')
        return cfg.make_wsgi_app()

    # Discard any session state left over from a previous test run.
    DBSession.remove()
    VersionedDBSession.remove()

    app = build_app()
    Base.metadata.bind = DBSession.bind
    Base.metadata.create_all()

    DBSession.add(common.Dataset(id='1', name='test app', domain='example.org'))
    cognateset = Cognateset(id='1', name='cs: test')
    language = common.Language(id='l', latitude=2, longitude=2)
    parameter = common.Parameter(id='l')
    valueset = common.ValueSet(id='vs', language=language, parameter=parameter)
    value = common.Value(id='v', name='abc', valueset=valueset)
    DBSession.add(Cognate(cognateset=cognateset, counterpart=value))

    yield TestApp(app)
def add_values(data, dblang, pid, values, with_de=True, **vskw):
    """Add one Value per ``(vid, vname)`` pair for parameter *pid* and *dblang*.

    A single shared ValueSet is created lazily on the first value; extra
    keyword arguments are forwarded to the ValueSet constructor. When
    *with_de* is true, each value is linked to the matching DomainElement
    looked up in *data*.
    """
    vs = None
    for vid, vname in values:
        if vs is None:
            # First value: create the ValueSet all values will share.
            vs = common.ValueSet(
                id=idjoin(pid, dblang.id),
                language=dblang,
                parameter=data['Parameter'][pid],
                contribution=data['Contribution']['glottolog'],
                **vskw)
        value_kw = {
            'id': idjoin(pid, slug(vid), dblang.id),
            'name': vname,
            'valueset': vs,
        }
        if with_de:
            value_kw['domainelement'] = data['DomainElement'][pid, vid]
        DBSession.add(common.Value(**value_kw))
def vs_switch_lang(session, timestamp, vs, lang):  # pragma: no cover
    """Move ValueSet *vs* (with its single value and references) to *lang*.

    If a ValueSet for the same parameter already exists for *lang*, it is
    reused (and its domain element must match); otherwise a copy of *vs*
    is created under the new language. The original value and valueset
    are deleted afterwards.

    NOTE(review): `basestring` implies this helper targets Python 2.
    """
    if isinstance(lang, basestring):
        # Allow passing a language id instead of a Language instance.
        lang = common.Language.get(lang, session=session)
    vs1 = get_vs(session, vs)
    # ValueSet ids follow the '<parameter id>-<language id>' convention.
    pid, lid = vs1.id.split('-')
    id_ = '-'.join([pid, lang.id])
    try:
        # Target valueset already exists: just touch its timestamp.
        vs2 = get_vs(session, id_)
        vs2.updated = timestamp
    except NoResultFound:
        # No valueset for (parameter, new language) yet: clone vs1's metadata.
        vs2 = common.ValueSet(id=id_,
                              description=vs1.description,
                              language=lang,
                              parameter=vs1.parameter,
                              contribution=vs1.contribution,
                              updated=timestamp,
                              created=timestamp,
                              source=vs1.source)
        session.add(vs2)
    # Assumes vs1 holds exactly one value — TODO confirm with callers.
    v1 = vs1.values[0]
    if vs2.values:
        # Reused target must agree on the coded domain element.
        assert v1.domainelement == vs2.values[0].domainelement
    else:
        session.add(
            common.Value(id=vs2.id,
                         valueset=vs2,
                         domainelement=v1.domainelement,
                         created=timestamp,
                         updated=timestamp))
    delete(session, v1)
    # Re-point references before deleting the old valueset.
    for ref in vs1.references:
        ref.valueset = vs2
    delete(session, vs1)
def vs_copy_lang(session, timestamp, vs, lang):  # pragma: no cover
    """Copy ValueSet *vs* — value and references included — to language *lang*.

    Unlike ``vs_switch_lang`` the original valueset is kept. A valueset
    for the target language must not exist yet; if one is found, an
    ``AssertionError`` is raised (it escapes the ``except NoResultFound``
    handler on purpose).
    """
    if isinstance(lang, basestring):
        # A plain id was passed; resolve it to a Language instance.
        lang = common.Language.get(lang, session=session)
    src = get_vs(session, vs)
    # ValueSet ids follow the '<parameter id>-<language id>' convention.
    pid, lid = src.id.split('-')
    target_id = '-'.join([pid, lang.id])
    try:
        dst = get_vs(session, target_id)
        dst.updated = timestamp
        raise AssertionError
    except NoResultFound:
        dst = common.ValueSet(id=target_id,
                              description=src.description,
                              language=lang,
                              parameter=src.parameter,
                              contribution=src.contribution,
                              updated=timestamp,
                              created=timestamp,
                              source=src.source)
        session.add(dst)
    # copy values and references:
    session.add(
        common.Value(id=dst.id,
                     valueset=dst,
                     domainelement=src.values[0].domainelement,
                     created=timestamp,
                     updated=timestamp))
    for ref in src.references:
        session.add(
            common.ValueSetReference(valueset=dst,
                                     source=ref.source,
                                     key=ref.key,
                                     description=ref.description))
def setUp(self):
    """Populate the test database with one instance of nearly every core model.

    Creates a dataset, sources, contributors and a contribution, ~101
    languages (one fully linked, 100 bulk ones with ISO identifiers),
    parameters with and without a domain, valuesets/values, units and
    unit values, a glossed sentence, and a config entry.
    """
    TestWithDb.setUp(self)
    DBSession.add(
        common.Dataset(id='dataset', name='dataset', description='desc', domain='clld'))
    source = common.Source(id='source')
    contributors = {
        'contributor': 'A Name',
        'b': 'b Name',
        'c': 'c Name',
        'd': 'd Name'
    }
    for id_, name in contributors.items():
        # Replace each plain name with the corresponding Contributor
        # instance (values are replaced in-place; no keys are added).
        contributors[id_] = common.Contributor(id=id_, name=name)
    contribution = common.Contribution(id='contribution', name='Contribution')
    cr = common.ContributionReference(contribution=contribution, source=source)
    # The asserts only guard against constructor failures; the association
    # objects are persisted via the contribution relationship.
    assert common.ContributionContributor(
        contribution=contribution, primary=True, contributor=contributors['contributor'])
    assert common.ContributionContributor(
        contribution=contribution, primary=False, contributor=contributors['b'])
    assert common.ContributionContributor(
        contribution=contribution, primary=True, contributor=contributors['c'])
    assert common.ContributionContributor(
        contribution=contribution, primary=False, contributor=contributors['d'])
    DBSession.add(contribution)
    language = common.Language(id='language', name='Language 1', latitude=10.5, longitude=0.3)
    language.sources.append(source)
    identifier = common.Identifier(type='iso639-3', id='iso')
    li = common.LanguageIdentifier(language=language, identifier=identifier)
    # 100 additional languages to exercise paginated views.
    for i in range(2, 102):
        _l = common.Language(id='l%s' % i, name='Language %s' % i)
        _i = common.Identifier(type='iso639-3', id='%.3i' % i, name='%.3i' % i)
        _li = common.LanguageIdentifier(language=_l, identifier=_i)
        DBSession.add(_l)
    # A parameter with a two-element domain, plus a valueset/value pair.
    param = common.Parameter(id='parameter', name='Parameter')
    de = common.DomainElement(id='de', name='DomainElement', parameter=param)
    de2 = common.DomainElement(id='de2', name='DomainElement2', parameter=param)
    valueset = common.ValueSet(
        id='valueset', language=language, parameter=param, contribution=contribution)
    value = common.Value(
        id='value', domainelement=de, valueset=valueset, frequency=50, confidence='high')
    DBSession.add(value)
    # A parameter without a domain.
    paramnd = common.Parameter(id='no-domain', name='Parameter without domain')
    valueset = common.ValueSet(
        id='vs2', language=language, parameter=paramnd, contribution=contribution)
    vr = common.ValueSetReference(valueset=valueset, source=source)
    value = common.Value(id='v2', valueset=valueset, frequency=50, confidence='high')
    DBSession.add(value)
    # Units and unit-level parameters/values.
    unit = common.Unit(id='unit', name='Unit', language=language)
    up = common.UnitParameter(id='unitparameter', name='UnitParameter')
    DBSession.add(unit)
    DBSession.add(
        common.UnitValue(id='unitvalue', name='UnitValue', unit=unit, unitparameter=up))
    up2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
    # Note: id 'de' is reused here; UnitDomainElement is a different table,
    # so it does not clash with the DomainElement above.
    de = common.UnitDomainElement(id='de', name='de', parameter=up2)
    DBSession.add(
        common.UnitValue(
            id='uv2', name='UnitValue2', unit=unit, unitparameter=up2, unitdomainelement=de))
    DBSession.add(common.Source(id='s'))
    # A glossed example sentence (IGT: analyzed/gloss are tab-aligned).
    sentence = common.Sentence(
        id='sentence',
        name='sentence name',
        description='sentence description',
        analyzed='a\tmorpheme\tdoes\tdo',
        gloss='a\tmorpheme\t1SG\tdo.SG2',
        source='own',
        comment='comment',
        original_script='a morpheme',
        language=language)
    sr = common.SentenceReference(sentence=sentence, source=source)
    DBSession.add(common.Config(key='key', value='value'))
    DBSession.flush()
def add_cultural_data(questionnaire_file_name, parameters, language):
    """Parse the typological questionnaire into the database.

    :param questionnaire_file_name: name (relative to DBPATH) of the \
        culture questionnaire file to parse.
    :param parameters: mutable mapping ``pid -> (Parameter, domain dict)``, \
        shared across calls so each parameter/domain is created only once.
    :param language: ``common.Language`` the answers belong to.
    """
    contribution_text, parameter_descriptions, answers = parse_culture_questionnaire(
        os.path.join(DBPATH, questionnaire_file_name))

    # All ValueSets must be related to a contribution, so generate one
    # from the metadata.
    contrib = common.Contribution(
        id='contrib' + newid(), name=contribution_text + newid())

    for p, parameter in parameter_descriptions.iterrows():
        # First, make sure that this parameter exists – either look it up
        # or create it.
        pid = p.replace(".", "-")
        try:
            param, domain = parameters[pid]
        except KeyError:
            param = common.Parameter(
                id='culture' + pid,
                name=p,
                description=parameter['Question_text_English'],
                markup_description=parameter['Question_text_English'])
            domain = {}
            parameters[pid] = (param, domain)

        # Secondly, check whether we are aware that this answer is valid
        # already – otherwise we add its value to the domain, and use that.
        # Note: Once we have a database, we can do better filtering and
        # constraining, and don't need to rely on reasonable data.
        answer = str(answers["Answer"][p])
        try:
            domain_element = domain[slug(answer)]
        except KeyError:
            try:
                numerical_value = int(answer)
            except ValueError:
                # Map yes/no style answers onto 1/0; anything else gets
                # no numerical interpretation.
                numerical_value = (
                    1 if answer == "Y" or answer == 'True'
                    else 0 if answer == "N" or answer == 'False'
                    else None)
            domain_element = common.DomainElement(
                id=param.id + slug(answer),
                description=answer,
                number=numerical_value,
                name=answer,
                parameter=param,
                abbr=answer,
                jsondata={'color': color(numerical_value)})
            DBSession.add(domain_element)
            try:
                DBSession.flush()
            except Exception as err:
                # Was a bare `except:` — keep the best-effort behavior,
                # but narrow the catch and show what actually failed.
                print(domain, domain_element, language.name, pid, param.name, err)
            domain[slug(answer)] = domain_element

        # Now create the ValueSet, representing all values the language
        # has for this parameter
        vs = common.ValueSet(
            id='vs' + newid(),
            language=language,
            parameter=param,
            jsondata=domain_element.jsondata,
            contribution=contrib)
        # and fill in the actual values, which in this case is only one.
        # This object, and all objects it depends on, are then scheduled
        # for writing into the database.
        DBSession.add(
            common.Value(
                id='v' + newid(),
                valueset=vs,
                frequency=float(100),
                jsondata=domain_element.jsondata,
                domainelement=domain_element))

    # Execute all scheduled database updates.
    DBSession.flush()
def main(args):
    """Populate the lotw_dev database from the legacy ``lotw_base.sqlite`` dump.

    Reads features, languages, and binary feature values from the SQLite
    database and creates the corresponding clld objects. The connection is
    now closed via try/finally so it is released even if the import fails
    part-way.
    """
    data = Data()
    lotw_conn = sqlite3.connect("lotw_base.sqlite")
    try:
        lotw_base = lotw_conn.cursor()
        contrib = common.Contribution(id="initial_contrib", name="Initial contribution")
        dataset = common.Dataset(
            id=lotw_dev.__name__,
            domain='lotw_dev.clld.org',
            name="Languages of the World",
            publisher_name="IL RAS",
            publisher_place="Moscow",
            publisher_url="http://iling-ran.ru/main/",
            jsondata={
                'license_name': 'Creative Commons Attribution 4.0 International License'}
        )
        DBSession.add(dataset)

        feature_dict = {}
        features = lotw_base.execute("SELECT * FROM Feature").fetchall()
        names = [y[2] for y in features]
        # For duplicated feature names keep [total count, running index] so
        # each duplicate gets a distinct "_<n>" suffix below.
        feat_name_counts = {x[2]: [names.count(x[2]), 0]
                            for x in features if names.count(x[2]) > 1}
        for feature in features:
            name = feature[2]
            if name in feat_name_counts:  # membership test, not .keys()
                temp_name = name
                name += "_" + str(feat_name_counts[name][1])
                feat_name_counts[temp_name][1] += 1
            feature_dict[feature[0]] = TreeFeature(
                pk=feature[0], id=feature[0], name=name, father_pk=feature[5])
            print("Added feature %s" % feature[2])

        langs = lotw_base.execute("SELECT * FROM Language").fetchall()
        # Language ids must be unique.
        assert len(set(lang[0] for lang in langs)) == len(langs)
        for language in langs:
            geodata = lotw_base.execute(
                "SELECT * FROM Geographical_index WHERE Lang_id=?",
                (str(language[0]), )).fetchone()
            famdata = lotw_base.execute(
                "SELECT * FROM Genealogical_index WHERE Lang_id=?",
                (str(language[0]), )).fetchone()
            # NOTE(review): famdata is assumed to always exist — a language
            # missing from Genealogical_index would crash here; confirm.
            famname = lotw_base.execute(
                "SELECT * FROM Family where Id=?", (famdata[2], )).fetchone()[1]
            branchname = lotw_base.execute(
                "SELECT * FROM Branch where Id=?", (famdata[3], )).fetchone()[1]
            if not geodata:
                # No coordinates on record: default lat/lon (and the rest
                # of the row) to 0.0.
                geodata = [0.0 for _ in range(7)]
            data.add(lotw_devLanguage, language[0],
                     id=str(language[0]),
                     iso=language[3],
                     family=famname,
                     branch=branchname,
                     name=language[1],
                     latitude=geodata[5],
                     longitude=geodata[6])
            print("Added language %s" % language[3])

            # One Value(Set) per positive binary feature of this language.
            language_features = lotw_base.execute(
                "SELECT * FROM Binary_data WHERE Lang_id=? AND Feature_value=1",
                (str(language[0]), ))
            for l_feat in language_features.fetchall():
                feat_id = l_feat[0]
                try:
                    feat_name = feature_dict[l_feat[2]].name
                except KeyError:
                    # Feature not in the Feature table: skip this value.
                    continue
                vs = common.ValueSet(id=feat_id,
                                     language=data["lotw_devLanguage"][language[0]],
                                     parameter=feature_dict[l_feat[2]],
                                     contribution=contrib)
                DBSession.add(common.Value(id=feat_id, name=feat_name, valueset=vs))
                print("Added value %s" % feat_id)
    finally:
        # Release the SQLite handle even when the import fails part-way.
        lotw_conn.close()
def prime(args):
    """If data needs to be denormalized for lookup, do that here.

    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    #
    # Now that we loaded all languoids and refs, we can compute the MED values.
    #
    meds = defaultdict(list)
    for lpk, spk, sid, sname, med_type, year, pages in DBSession.execute("""\
select l.pk, r.pk, s.id, s.name, r.med_type, s.year_int, r.med_pages
from languagesource as ls, language as l, source as s, ref as r
where ls.active = TRUE and l.pk = ls.language_pk and s.pk = ls.source_pk and s.pk = r.pk
order by l.id, r.med_index desc, r.med_pages, coalesce(s.year_int, 0), s.pk
"""):
        meds[lpk].append((spk, sid, sname, med_type, year, pages))  # The last one is the overall MED

    # Now weed out the "newer but worse" sources:
    for lpk, sources in {k: reversed(v) for k, v in meds.items()}.items():
        relevant, lastyear = [], 10000
        for spk, sid, sname, med_type, year, pages in sources:
            if year and year < lastyear:  # If year is more recent, this is a "newer but worse" item
                relevant.append((spk, sid, sname, med_type, year, pages))
                lastyear = year
        meds[lpk] = relevant

    med_param = common.Parameter.get('med')
    med_domain = {de.id: de for de in med_param.domain}
    contrib = common.Contribution.get('glottolog')

    # Store the MED chain in each language's jsondata and record the overall
    # MED as a Value of the 'med' parameter.
    for l in DBSession.query(common.Language) \
            .filter(common.Language.pk.in_(list(meds.keys()))):
        l.update_jsondata(meds=[
            (sid, med_type, year, pages, sname)
            for spk, sid, sname, med_type, year, pages in meds[l.pk]])
        if not meds[l.pk]:
            continue

        med = meds[l.pk][0]
        # Record the overall MED as value for the 'med' Parameter:
        vs = common.ValueSet(
            id=idjoin('med', l.id),
            contribution=contrib,
            parameter=med_param,
            language=l,
        )
        DBSession.add(common.Value(
            id=idjoin('med', l.id),
            name=getattr(args.repos.med_types, med[3]).name,
            domainelement=med_domain[idjoin('med', med[3])],
            valueset=vs,
        ))
        # Flush so vs.pk is assigned before it is referenced below.
        DBSession.flush()
        DBSession.add(common.ValueSetReference(source_pk=med[0], valueset_pk=vs.pk))

    recreate_treeclosure()

    # pk -> (id, name) for all macroarea domain elements.
    macroareas = {r[0]: (r[1], r[2]) for r in DBSession.execute("""\
select de.pk, de.id, de.name
from domainelement as de, parameter as p
where de.parameter_pk = p.pk and p.id = 'macroarea'
""")}

    # Propagate macroarea values up the tree: languages without their own
    # macroarea valueset inherit the distinct values of their descendants.
    for lid, lpk, cpk, ppk, mas in DBSession.execute("""\
select
  l.id, l.pk, vs.contribution_pk, vs.parameter_pk, array_agg(distinct v.domainelement_pk)
from
  language as l,
  treeclosuretable as t,
  parameter as p,
  valueset as vs,
  value as v
where
  l.pk = t.parent_pk and t.child_pk = vs.language_pk
  and vs.parameter_pk = p.pk and p.id = 'macroarea'
  and v.valueset_pk = vs.pk
  and l.pk not in (
    select language_pk from valueset as _vs, parameter as _p
    where _vs.parameter_pk = _p.pk and _p.id = 'macroarea'
  )
group by l.id, l.pk, vs.contribution_pk, vs.parameter_pk"""):
        for i, mapk in enumerate(mas):
            if i == 0:
                # One shared ValueSet per language; values are attached to it.
                vs = common.ValueSet(
                    id=idjoin('macroarea', lid),
                    language_pk=lpk,
                    parameter_pk=ppk,
                    contribution_pk=cpk)
            DBSession.add(common.Value(
                id=idjoin(macroareas[mapk][0], lid),
                name=macroareas[mapk][1],
                domainelement_pk=mapk,
                valueset=vs))

    # Denormalize the macroarea names onto each language.
    for vs in DBSession.query(common.ValueSet) \
            .join(common.Language) \
            .join(common.Parameter) \
            .filter(common.Parameter.id == 'macroarea') \
            .options(joinedload(common.ValueSet.values), joinedload(common.ValueSet.language)):
        vs.language.macroareas = ', '.join(
            [macroareas[v.domainelement_pk][1] for v in vs.values])

    # Sanity check: negative page counts indicate broken source records.
    for row in list(DBSession.execute(
            "select pk, pages, pages_int, startpage_int from source where pages_int < 0")):
        raise ValueError(row)

    # Mark languages that are new in this release in the legacy mapping.
    version = assert_release(args.repos.repos)
    with jsonlib.update(gc2version(args), indent=4) as legacy:
        for lang in DBSession.query(common.Language):
            if lang.id not in legacy:
                lang.update_jsondata(new=True)
                legacy[lang.id] = version

    valuesets = {
        r[0]: r[1] for r in DBSession.query(common.ValueSet.id, common.ValueSet.pk)}
    refs = {
        r[0]: r[1]
        for r in DBSession.query(models.Refprovider.id, models.Refprovider.ref_pk)}

    # Attach the canonical macroarea reference to every macroarea valueset.
    for vsid, vspk in valuesets.items():
        if vsid.startswith('macroarea-'):
            DBSession.add(common.ValueSetReference(
                source_pk=refs[args.repos.macroareas.__defaults__['reference_id']],
                valueset_pk=vspk))

    # Attach the reference recorded in each AES valueset's jsondata.
    for vs in DBSession.query(common.ValueSet) \
            .join(common.Parameter) \
            .filter(common.Parameter.id == 'aes'):
        if vs.jsondata['reference_id']:
            DBSession.add(common.ValueSetReference(
                source_pk=refs[vs.jsondata['reference_id']], valueset_pk=vs.pk))

    # Attach sub-/family-classification references from the languoid configs.
    for lang in args.repos.languoids():
        if lang.category == args.repos.language_types.bookkeeping.category:
            continue
        clf = lang.classification_comment
        if clf:
            for pid, attr_ in [('sc', 'sub'), ('fc', 'family')]:
                if getattr(clf, attr_ + 'refs'):
                    if split_items(lang.cfg['classification'][attr_ + 'refs']) != \
                            split_items(lang.cfg['classification'].get(attr_)):
                        vspk = valuesets['{0}-{1}'.format(pid, lang.id)]
                        for ref in getattr(clf, attr_ + 'refs'):
                            spk = refs.get(ref.key)
                            if spk:
                                DBSession.add(
                                    common.ValueSetReference(source_pk=spk, valueset_pk=vspk))
def main(args):
    """Populate the Numeralbank database from the curated CLDF datasets.

    Creates the dataset/editors, parameters (one per numeral concept plus
    a synthetic 'Base' parameter), varieties, base-system valuesets, and
    one NumberLexeme per CLDF form; finally assigns family/base colors.
    """
    # Collation index over Value.name with stress/length marks stripped.
    Index('ducet', collkey(func.translate(common.Value.name, 'ˈ,ː,ˌ', ''))) \
        .create(DBSession.bind)
    data = Data()

    dataset = common.Dataset(
        id=numerals.__name__,
        name="Numeralbank",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain="numerals.clld.org",
        jsondata={
            "license_icon": "cc-by.png",
            "license_name": "Creative Commons Attribution 4.0 International License",
        },
    )
    DBSession.add(dataset)

    for i, (id_, name) in enumerate(
        [("verkerkannemarie", "Annemarie Verkerk"), ("rzymskichristoph", "Christoph Rzymski")]
    ):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)

    # NOTE(review): dataset was already added above; this second add is
    # redundant (harmless with SQLAlchemy's identity map).
    DBSession.add(dataset)

    # Take meta data from curated CLDF data set
    ds = Wordlist.from_metadata(
        data_repos[1]['data_path'] / 'cldf' / 'cldf-metadata.json')

    # Parameters:
    for parameter in ds["ParameterTable"]:
        data.add(
            models.NumberParameter,
            parameter["ID"],
            id=parameter["ID"],
            name="{0}".format(parameter["ID"]),
            concepticon_id=parameter['Concepticon_ID'],
        )

    # Synthetic parameter holding each language's numeral base.
    basis_parameter = data.add(
        models.NumberParameter,
        "0",
        id="0",
        name="Base",
    )

    load_family_langs = []
    for language in ds["LanguageTable"]:
        lang = data.add(
            models.Variety,
            language["ID"],
            id=language["ID"],
            name=language["Name"],
            latitude=language["Latitude"],
            longitude=language["Longitude"],
            creator=language["Contributor"],
            comment=language["Comment"],
            # NOTE(review): 'url_soure_name' (sic) matches the model
            # attribute's spelling — do not "fix" without migrating the model.
            url_soure_name=language["SourceFile"],
        )
        if language["Glottocode"]:
            load_family_langs.append((language["Glottocode"], lang))

    # get orginal forms
    ds = Wordlist.from_metadata(
        data_repos[0]['data_path'] / 'cldf' / 'cldf-metadata.json')
    org_forms = {f["ID"]: f for f in ds["FormTable"]}

    d = data_repos[1]
    contrib = data.add(
        common.Contribution, d['id'], id=d['id'], name=d['name']
    )

    # process curated forms
    ds = Wordlist.from_metadata(
        data_repos[1]['data_path'] / 'cldf' / 'cldf-metadata.json')

    # Add Base info if given
    for language in ds["LanguageTable"]:
        if language["Base"]:
            basis = language["Base"]
            de = data["DomainElement"].get(basis)
            if not de:
                de = data.add(
                    common.DomainElement,
                    basis,
                    id=text_type(basis),
                    name=text_type(basis),
                    parameter=basis_parameter,
                )
            vs = data.add(
                common.ValueSet,
                data["Variety"][language["ID"]].id,
                id=data["Variety"][language["ID"]].id,
                language=data["Variety"][language["ID"]],
                parameter=basis_parameter,
                contribution=contrib,
            )
            common.Value(
                id=data["Variety"][language["ID"]].id,
                valueset=vs,
                domainelement=de
            )

    # Forms:
    for form in ds["FormTable"]:
        valueset_id = "{0}-{1}".format(form["Parameter_ID"], form["Language_ID"])
        valueset = data["ValueSet"].get(valueset_id)

        # Unless we already have something in the VS:
        if not valueset:
            if form["Language_ID"] in data["Variety"]:
                vs = data.add(
                    common.ValueSet,
                    valueset_id,
                    id=valueset_id,
                    language=data["Variety"][form["Language_ID"]],
                    parameter=data["NumberParameter"][form["Parameter_ID"]],
                    contribution=contrib,
                )
                # Keep the original (pre-curation) form if it differs from
                # the curated one after NFC normalization.
                org_form = ""
                if form["ID"] in org_forms:
                    if unicodedata.normalize(
                            'NFC', org_forms[form["ID"]]["Form"].strip()) != form["Form"]:
                        org_form = org_forms[form["ID"]]["Form"]
                else:
                    org_form = "no original form"

                DBSession.add(
                    models.NumberLexeme(
                        id=form["ID"],
                        name=form["Form"],
                        comment=form["Comment"],
                        is_loan=form["Loan"],
                        other_form=form["Other_Form"],
                        org_form=org_form,
                        is_problematic=form["Problematic"],
                        valueset=vs,
                    )
                )

    load_families(
        Data(),
        load_family_langs,
        glottolog_repos=gl_repos,
        strict=False,
    )

    # Assign one qualitative color per family ...
    distinct_varieties = DBSession.query(models.Variety.family_pk).distinct().all()
    families = dict(
        zip([r[0] for r in distinct_varieties],
            color.qualitative_colors(len(distinct_varieties)))
    )
    for l in DBSession.query(models.Variety):
        l.jsondata = {"color": families[l.family_pk]}

    # ... and one per numeral base.
    p = common.Parameter.get("0")
    colors = color.qualitative_colors(len(p.domain))
    for i, de in enumerate(p.domain):
        de.jsondata = {"color": colors[i]}
def main(args):
    """Populate the PHOIBLE database from the repos TSV files.

    NOTE(review): there is a bare ``return`` right after the inventory
    loop below — everything following it is UNREACHABLE dead code
    (apparently retained from an older version of this importer; it
    references names like ``lnames``, ``genera``, ``geocoords`` and
    ``coords`` that are never defined in this function). Confirm intent
    before deleting or re-enabling.
    """
    data = Data()

    # InventoryID -> glottocode, and InventoryID -> set of bibtex keys.
    glottocodes, bibtex_keys = {}, defaultdict(set)
    for d in reader(
            args.data_file('repos', 'mappings', 'InventoryID-ISO-gcode-Bibkey-Source.tsv')):
        glottocodes[d['InventoryID']] = d['Glottocode']
        bibtex_keys[d['InventoryID']].add(d['BibtexKey'])

    glottolog = Glottolog(
        Path(phoible.__file__).parent.parent.parent.parent.joinpath(
            'glottolog3', 'glottolog'))
    languoids = {l.id: l for l in glottolog.languoids()}

    phonemes = sorted(
        list(reader(args.data_file('repos', 'data', 'phoible-by-phoneme.tsv'))),
        key=lambda r: (r['InventoryID'], r['GlyphID']))

    # Group inventories by (glottolog name, dialect, source) to detect
    # name collisions that need disambiguation.
    inventories = defaultdict(set)
    for p in phonemes:
        if p['InventoryID'] in glottocodes:
            inventories[
                (languoids[glottocodes[p['InventoryID']]].name,
                 p['SpecificDialect'],
                 p['Source'].upper())].add((p['InventoryID'], p['LanguageName']))

    inventory_names = {}
    for (glname, dname, source), invids in inventories.items():
        if len(invids) == 1:
            invid, lname = invids.pop()
            inventory_names[invid] = name_in_source(glname, dname) + ' [%s]' % source
        else:
            # Disambiguate by language name when names are distinct,
            # otherwise by a running number.
            use_lname = len(set(r[1] for r in invids)) == len(invids)
            for i, (invid, lname) in enumerate(
                    sorted(invids, key=lambda j: int(j[0]))):
                disambiguation = ' %s' % (i + 1, )
                if use_lname:
                    disambiguation = ' (%s)' % lname
                inventory_names[invid] = name_in_source(
                    glname, dname) + '%s [%s]' % (disambiguation, source)

    # One Variety per glottocode and one Inventory per InventoryID.
    for (invid, lname, dname, source), ps in groupby(
            phonemes,
            lambda p: (p['InventoryID'], p['LanguageName'], p[
                'SpecificDialect'], p['Source'])):
        if invid not in glottocodes:
            continue
        ps = list(ps)
        gc = glottocodes[invid]
        lang = data['Variety'].get(gc)
        if not lang:
            languoid = languoids[gc]
            lang = data.add(
                models.Variety, gc,
                id=gc,
                language_code=ps[0]['LanguageCode'],
                name=languoid.name,
                level=text_type(languoid.level.name),
                latitude=languoid.latitude,
                longitude=languoid.longitude,
            )
            # Dialects without coordinates inherit them from an ancestor.
            if lang.latitude is None and languoid.level == Level.dialect:
                ll = get_language(languoid)
                lang.latitude = ll.latitude
                lang.longitude = ll.longitude

        contrib = data.add(
            models.Inventory, invid,
            id=invid,
            #language=lang,
            source=source,
            #source_url=source_urls.get(row.InventoryID),
            #internetarchive_url=ia_urls.get(row.InventoryID),
            name=inventory_names[invid],
            description=name_in_source(lname, dname))

    # NOTE(review): everything below this return is dead code.
    return
    # FIXME: read from mappings file!
    refs = defaultdict(list)
    for row in get_rows(args, 'BibtexKey'):
        if row[1] == 'NO SOURCE GIVEN':
            refs[row[0]] = []
        else:
            refs[row[0]].append(row[1])
    add_sources(args, data)

    dataset = data.add(
        common.Dataset, 'phoible',
        id='phoible',
        name='PHOIBLE Online',
        description='PHOIBLE Online',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        domain='phoible.org',
        license='http://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
                'Creative Commons Attribution-ShareAlike 3.0 Unported License'})

    for i, spec in enumerate([
        ('moran', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
        ('wright', "Richard Wright"),
    ]):
        DBSession.add(
            common.Editor(
                dataset=dataset,
                ord=i + 1,
                contributor=common.Contributor(id=spec[0], name=spec[1])))

    #squibs = defaultdict(list)
    #for row in get_rows(args, 'Squib'):
    #    squibs[row[0]].append(row[1])

    source_urls = dict(get_rows(args, 'URL'))
    ia_urls = dict(get_rows(args, 'InternetArchive'))

    # FIXME: group phoible-by-phoneme by LanguageCode, Source (make sure this is unique!)
    aggregated = list(
        reader(args.data_file('phoible-aggregated.tsv'), delimiter='\t', namedtuples=True))
    inventory_names = {}
    for key, items in groupby(
            sorted(aggregated, key=lambda t: (t.LanguageCode, t.Source)),
            key=lambda t: (t.LanguageCode, t.Source)):
        items = list(items)
        # NOTE(review): `lnames` is not defined in this function (dead code).
        lname = lnames.get(key[0])
        if not lname:
            lname = items[0].LanguageName
            lnames[key[0]] = lname
        if len(items) == 1:
            inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1])
        else:
            for i, item in enumerate(items):
                inventory_names[item.InventoryID] = '%s %s (%s)' % (lname, i + 1, key[1])

    # pull in Glottolog families instead? or in addition?
    family_map = {
        ("Arawakan", "arwk"): "Arawakan",
        ("Trans-New Guinea", "trng"): "Trans-New Guinea",
        ("Moklen", "anes"): "Austronesian",
        ("Oko", "ncon"): "Niger-Congo",
        ("Muniche", "saso"): "Muniche",
        ("Tinigua", "saso"): "Tinigua",
        ("Vilela", "luvi"): "Vilela",
        ("Ofayé", "macg"): "Kamakanan",
        ("Purian", "macg"): "PurianPrint",
        ("Mixed language", "saml"): "Mixed language",
        ("Tupian", "tupi"): "Tupian",
        ("Yuwana", "saun"): "YuwanaPrint",
    }
    family_code_map = {k[1]: v for k, v in family_map.items()}

    for row in aggregated:
        lang = data['Variety'].get(row.LanguageCode)
        if not lang:
            if row.LanguageFamilyGenus == 'UNCLASSIFIED':
                genus = None
            else:
                genus_id = slug(strip_quotes(row.LanguageFamilyGenus))
                # NOTE(review): `genera` is not defined here (dead code).
                genus = genera.get(genus_id)
                if not genus:
                    genus = genera.get(row.LanguageCode)
                if not genus:
                    #print(row.LanguageFamilyGenus, row.LanguageFamilyRoot)
                    family = family_map.get(
                        (row.LanguageFamilyGenus, row.LanguageFamilyRoot))
                    genus = genera[genus_id] = data.add(
                        models.Genus, genus_id,
                        id=genus_id,
                        name=row.LanguageFamilyGenus,
                        description=family or row.LanguageFamilyRoot,
                        active=False,
                        root=row.LanguageFamilyRoot)
                if not genus.root:
                    genus.root = row.LanguageFamilyRoot
                if genus.description in family_code_map:
                    genus.description = family_code_map[genus.description]

            # NOTE(review): `geocoords` is not defined here, and `coords`
            # may be unbound when both branches fail (dead code).
            if row.LanguageCode in geocoords:
                coords = geocoords[row.LanguageCode]
            elif row.Latitude != 'NULL' and row.Longitude != 'NULL':
                coords = (float(row.Latitude), float(row.Longitude))

            lang = data.add(
                models.Variety, row.LanguageCode,
                id=row.LanguageCode,
                name=lnames[row.LanguageCode],
                genus=genus,
                country=strip_quotes(row.Country),
                area=strip_quotes(row.Area),
                latitude=coords[0],
                longitude=coords[1],
                jsondata=dict(inventory_id=row.InventoryID))
            add_language_codes(data, lang, row.LanguageCode, glottocodes=glottocodes)

        contributor = data['Contributor'].get(row.Source)
        if not contributor:
            contributor = data.add(
                common.Contributor, row.Source,
                id=row.Source,
                name=SOURCES[row.Source][0],
                description=SOURCES[row.Source][2])
            for ref in SOURCES[row.Source][1]:
                DBSession.add(
                    models.ContributorReference(
                        source=data['Source'][ref], contributor=contributor))

        contrib = data.add(
            models.Inventory, row.InventoryID,
            id=row.InventoryID,
            language=lang,
            source=row.Source,
            source_url=source_urls.get(row.InventoryID),
            internetarchive_url=ia_urls.get(row.InventoryID),
            name=inventory_names[row.InventoryID],
            description=row.LanguageName)

        DBSession.add(
            common.ContributionContributor(
                contribution=contrib, contributor=contributor))

        #for j, squib in enumerate(squibs.get(row.InventoryID, [])):
        #    f = common.Contribution_files(
        #        object=contrib,
        #        id='squib-%s-%s.pdf' % (contrib.id, j + 1),
        #        name='Phonological squib',
        #        description=squib,
        #        mime_type='application/pdf')
        #    assert f
        #    # f.create(files_dir, file(args.data_file('phonological_squibs', src)).read())

    DBSession.flush()
    unknown_refs = {}

    for row in reader(args.data_file('phoible-phonemes.tsv'), namedtuples=True):
        inventory = data['Inventory'][row.InventoryID]
        segment = data['Segment'].get(row.Phoneme)
        if not segment:
            # Segment id is a hash of the Unicode character names.
            unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme]
            description = ' - '.join([t[1] for t in unicode_desc])
            segment = data.add(
                models.Segment, row.Phoneme,
                id=b16encode(md5(description).digest()),
                name=row.Phoneme,
                description=description,
                equivalence_class=''.join([
                    t[0] for t in unicode_desc
                    if t[1].split()[0] not in ['COMBINING', 'MODIFIER']
                ]),
                segment_class=row.Class,
                combined_class=row.CombinedClass)
            DBSession.flush()

        vs = common.ValueSet(
            id=row.PhonemeID,
            contribution=inventory,
            language=inventory.language,
            parameter=segment)

        for ref in refs.get(row.InventoryID, []):
            if ref not in data['Source']:
                if ref not in unknown_refs:
                    print('-------', ref)
                unknown_refs[ref] = 1
                continue
            DBSession.add(
                common.ValueSetReference(source=data['Source'][ref], valueset=vs))

        DBSession.add(
            common.Value(
                id=row.PhonemeID,
                name='%s %s' % (row.Phoneme, data['Inventory'][row.InventoryID].name),
                valueset=vs))

    DBSession.flush()

    for inventory_id in refs:
        for ref in refs[inventory_id]:
            if ref not in data['Source']:
                continue
            data.add(
                common.ContributionReference, '%s-%s' % (inventory_id, ref),
                source=data['Source'][ref],
                contribution=data['Inventory'][inventory_id])

    for i, row in enumerate(
            reader(args.data_file('phoible-segments-features.tsv'))):
        if i == 0:
            # Header row carries the feature names.
            features = list(map(feature_name, row))
            continue
        if row[0] not in data['Segment']:
            # print('skipping feature vector:', row)
            continue
        for j, value in enumerate(row):
            if j and value != '0':
                DBSession.add(
                    common.Parameter_data(
                        key=features[j],
                        value=value,
                        ord=j,
                        object_pk=data['Segment'][row[0]].pk))

    # FIXME: add allophones!
    DBSession.flush()
def main(args):
    """Populate the PHOIBLE database from the TSV dumps referenced by ``args``.

    Loads, in order: glottolog lookup data (if available locally), bibliographic
    refs, the dataset/editor records, language varieties with genus/family info,
    inventories (contributions), phoneme segments and their values, and finally
    segment feature vectors.
    """
    # determine if we run on a machine where other databases are available for lookup
    # locally:
    data = Data()
    genera = get_genera(data) if astroman else {}

    # Lookup tables keyed by ISO code, filled only when the local glottolog
    # postgres database is reachable (astroman flag).
    glottocodes, lnames, geocoords = {}, {}, {}
    if astroman:
        for k, v in glottocodes_by_isocode(
                'postgresql://robert@/glottolog3',
                cols=['id', 'name', 'latitude', 'longitude']).items():
            glottocodes[k] = v[0]
            lnames[k] = v[1]
            geocoords[k] = (v[2], v[3])

    # Map InventoryID -> list of bibtex keys; 'NO SOURCE GIVEN' means an
    # explicitly empty reference list.
    refs = defaultdict(list)
    for row in get_rows(args, 'BibtexKey'):
        if row[1] == 'NO SOURCE GIVEN':
            refs[row[0]] = []
        else:
            refs[row[0]].append(row[1])
    add_sources(args, data)

    dataset = data.add(
        common.Dataset, 'phoible',
        id='phoible',
        name='PHOIBLE Online',
        description='PHOIBLE Online',
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        domain='phoible.org',
        license='http://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
            'Creative Commons Attribution-ShareAlike 3.0 Unported License'
        })

    for i, spec in enumerate([
        ('moran', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
        ('wright', "Richard Wright"),
    ]):
        DBSession.add(
            common.Editor(dataset=dataset,
                          ord=i + 1,
                          contributor=common.Contributor(id=spec[0], name=spec[1])))

    # InventoryID -> list of squib descriptions.
    squibs = defaultdict(list)
    for row in get_rows(args, 'Squib'):
        squibs[row[0]].append(row[1])

    source_urls = dict(get_rows(args, 'URL'))
    ia_urls = dict(get_rows(args, 'InternetArchive'))

    aggregated = list(
        reader(args.data_file('phoible-aggregated.tsv'), namedtuples=True))

    # Disambiguate inventory names: when a (language, source) pair has several
    # inventories, number them ("Name 1 (src)", "Name 2 (src)", ...).
    inventory_names = {}
    for key, items in groupby(
            sorted(aggregated, key=lambda t: (t.LanguageCode, t.Source)),
            key=lambda t: (t.LanguageCode, t.Source)):
        items = list(items)
        lname = lnames.get(key[0])
        if not lname:
            lname = items[0].LanguageName
            lnames[key[0]] = lname
        if len(items) == 1:
            inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1])
        else:
            for i, item in enumerate(items):
                inventory_names[item.InventoryID] = '%s %s (%s)' % (lname, i + 1, key[1])

    # Hand-curated fixes mapping (genus, family-root) pairs to family names.
    family_map = {
        ("Arawakan", "arwk"): "Arawakan",
        ("Trans-New Guinea", "trng"): "Trans-New Guinea",
        ("Moklen", "anes"): "Austronesian",
        ("Oko", "ncon"): "Niger-Congo",
        ("Muniche", "saso"): "Muniche",
        ("Tinigua", "saso"): "Tinigua",
        ("Vilela", "luvi"): "Vilela",
        ("Ofayé", "macg"): "Kamakanan",
        ("Purian", "macg"): "PurianPrint",
        ("Mixed language", "saml"): "Mixed language",
        ("Tupian", "tupi"): "Tupian",
        ("Yuwana", "saun"): "YuwanaPrint",
    }
    family_code_map = {k[1]: v for k, v in family_map.items()}

    for row in aggregated:
        lang = data['Variety'].get(row.LanguageCode)
        if not lang:
            if row.LanguageFamilyGenus == 'UNCLASSIFIED':
                genus = None
            else:
                genus_id = slug(strip_quotes(row.LanguageFamilyGenus))
                genus = genera.get(genus_id)
                if not genus:
                    genus = genera.get(row.LanguageCode)
                if not genus:
                    #print(row.LanguageFamilyGenus, row.LanguageFamilyRoot)
                    family = family_map.get(
                        (row.LanguageFamilyGenus, row.LanguageFamilyRoot))
                    genus = genera[genus_id] = data.add(
                        models.Genus, genus_id,
                        id=genus_id,
                        name=row.LanguageFamilyGenus,
                        description=family or row.LanguageFamilyRoot,
                        active=False,
                        root=row.LanguageFamilyRoot)
                if not genus.root:
                    genus.root = row.LanguageFamilyRoot
                if genus.description in family_code_map:
                    genus.description = family_code_map[genus.description]

            # NOTE(review): if the language is neither in geocoords nor has
            # non-'NULL' Latitude/Longitude, `coords` stays unbound and the
            # data.add() call below raises NameError (or reuses coords from a
            # previous iteration) — presumably the data guarantees coordinates;
            # verify against the TSV.
            if row.LanguageCode in geocoords:
                coords = geocoords[row.LanguageCode]
            elif row.Latitude != 'NULL' and row.Longitude != 'NULL':
                coords = (float(row.Latitude), float(row.Longitude))
            lang = data.add(
                models.Variety, row.LanguageCode,
                id=row.LanguageCode,
                name=lnames[row.LanguageCode],
                genus=genus,
                country=strip_quotes(row.Country),
                area=strip_quotes(row.Area),
                latitude=coords[0],
                longitude=coords[1],
                jsondata=dict(inventory_id=row.InventoryID))
            add_language_codes(data, lang, row.LanguageCode, glottocodes=glottocodes)

        # Get-or-create the contributor for this inventory's source, linking
        # its bibliography entries on first creation.
        contributor = data['Contributor'].get(row.Source)
        if not contributor:
            contributor = data.add(
                common.Contributor, row.Source,
                id=row.Source,
                name=SOURCES[row.Source][0],
                description=SOURCES[row.Source][2])
            for ref in SOURCES[row.Source][1]:
                DBSession.add(
                    models.ContributorReference(source=data['Source'][ref],
                                                contributor=contributor))

        contrib = data.add(
            models.Inventory, row.InventoryID,
            id=row.InventoryID,
            language=lang,
            source=row.Source,
            source_url=source_urls.get(row.InventoryID),
            internetarchive_url=ia_urls.get(row.InventoryID),
            name=inventory_names[row.InventoryID],
            description=row.LanguageName)
        DBSession.add(
            common.ContributionContributor(contribution=contrib,
                                           contributor=contributor))

        for j, squib in enumerate(squibs.get(row.InventoryID, [])):
            f = common.Contribution_files(object=contrib,
                                          id='squib-%s-%s.pdf' % (contrib.id, j + 1),
                                          name='Phonological squib',
                                          description=squib,
                                          mime_type='application/pdf')
            assert f
            # f.create(files_dir, file(args.data_file('phonological_squibs', src)).read())
    DBSession.flush()

    unknown_refs = {}
    for row in reader(args.data_file('phoible-phonemes.tsv'), namedtuples=True):
        inventory = data['Inventory'][row.InventoryID]
        segment = data['Segment'].get(row.Phoneme)
        if not segment:
            # Derive a stable id and a human-readable description from the
            # Unicode names of the phoneme's codepoints.
            unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme]
            description = ' - '.join([t[1] for t in unicode_desc])
            # NOTE(review): md5() requires bytes on Python 3; this call assumes
            # Python 2 str semantics — confirm the target interpreter.
            segment = data.add(
                models.Segment, row.Phoneme,
                id=b16encode(md5(description).digest()),
                name=row.Phoneme,
                description=description,
                equivalence_class=''.join([
                    t[0] for t in unicode_desc
                    if t[1].split()[0] not in ['COMBINING', 'MODIFIER']
                ]),
                segment_class=row.Class,
                combined_class=row.CombinedClass)
            DBSession.flush()

        vs = common.ValueSet(id=row.PhonemeID,
                             contribution=inventory,
                             language=inventory.language,
                             parameter=segment)
        for ref in refs.get(row.InventoryID, []):
            if ref not in data['Source']:
                # Report each unknown bibtex key only once.
                if ref not in unknown_refs:
                    print('-------', ref)
                unknown_refs[ref] = 1
                continue
            DBSession.add(
                common.ValueSetReference(source=data['Source'][ref], valueset=vs))
        DBSession.add(
            common.Value(
                id=row.PhonemeID,
                name='%s %s' % (row.Phoneme,
                                data['Inventory'][row.InventoryID].name),
                valueset=vs))
    DBSession.flush()

    # Link sources to inventories (contribution-level references).
    for inventory_id in refs:
        for ref in refs[inventory_id]:
            if ref not in data['Source']:
                continue
            data.add(common.ContributionReference,
                     '%s-%s' % (inventory_id, ref),
                     source=data['Source'][ref],
                     contribution=data['Inventory'][inventory_id])

    # Feature vectors: first row is the header; '0' means feature not set.
    for i, row in enumerate(
            reader(args.data_file('phoible-segments-features.tsv'))):
        if i == 0:
            features = list(map(feature_name, row))
            continue
        if row[0] not in data['Segment']:
            # print('skipping feature vector:', row)
            continue
        for j, value in enumerate(row):
            if j and value != '0':
                DBSession.add(
                    common.Parameter_data(
                        key=features[j],
                        value=value,
                        ord=j,
                        object_pk=data['Segment'][row[0]].pk))
    DBSession.flush()
def update(args): pid, cid = 'vitality', 'unesco' count = 0 notfound = {} contrib = common.Contribution.get(cid, default=None) if not contrib: contrib = common.Contribution( id=cid, name='Atlas of the World’s Languages in Danger', description= 'Atlas of the World’s Languages in Danger, © UNESCO, http://www.unesco.org/culture/languages-atlas' ) param = common.Parameter.get(pid, default=None) if param is None: param = common.Parameter(id=pid, name='Degree of endangerment') domain = {de.name: de for de in param.domain} for i, spec in enumerate(VITALITY_VALUES): name, desc = spec if name not in domain: number = i + 1 domain[name] = common.DomainElement(id='%s-%s' % (pid, number), name=name, description=desc, number=number, parameter=param) valuesets = {vs.id: vs for vs in param.valuesets} for record in et.parse(args.data_file(DATA_FILE)).findall('.//RECORD'): item = {} for attr in [ 'ID', 'Name in English', 'Name in French', 'Name in Spanish', 'Countries', 'Country codes alpha 3', 'ISO639-3 codes', 'Degree of endangerment' ]: item[attr] = record.find(attr.replace(' ', '_')).text if item['ISO639-3 codes']: for code in item['ISO639-3 codes'].split(','): code = code.strip() lang = Languoid.get(code, key='hid', default=None) if lang: count += 1 item[ 'url'] = 'http://www.unesco.org/culture/languages-atlas/en/atlasmap/language-iso-%s.html' % code lang.update_jsondata(unesco=item) de = domain[item['Degree of endangerment']] vsid = '%s-%s' % (pid, lang.id) vs = valuesets.get(vsid) if not vs: vs = common.ValueSet(id='vitality-%s' % lang.id, parameter=param, contribution=contrib, language=lang) DBSession.add( common.Value(valueset=vs, name=de.name, domainelement=de)) valuesets[vsid] = vs else: vs.values[0].domainelement = de else: notfound[code] = 1 print 'assigned', count, 'unesco urls' print 'missing iso codes:', notfound
def load_languoid(data, lang, nodemap):
    """Create a ``models.Languoid`` DB record from the pyglottolog object ``lang``.

    Also adds ISO codes, name/identifier records, classification-comment
    valuesets, ISO retirement info and the ethnologue comment, when present.

    :param data: the Data dict caching already-created DB objects.
    :param lang: a languoid object from the glottolog API.
    :param nodemap: mapping used to compute the languoid's newick subtree.
    """
    dblang = data.add(
        models.Languoid,
        lang.id,
        id=lang.id,
        hid=lang.hid,
        name=lang.name,
        bookkeeping=lang.category == models.BOOKKEEPING,
        newick=lang.newick_node(nodemap).newick,
        latitude=lang.latitude,
        longitude=lang.longitude,
        status=models.LanguoidStatus.get(
            lang.endangerment.name if lang.endangerment else 'safe'),
        level=models.LanguoidLevel.from_string(lang.level.name),
        # Father is the last element of the lineage, looked up among already
        # loaded languoids — so ancestors must be loaded before descendants.
        father=data['Languoid'][lang.lineage[-1][1]] if lang.lineage else None)
    if lang.iso:
        add_language_codes(data, dblang, lang.iso)

    # Names may carry a language tag in square brackets, e.g. "Name [de]";
    # default to English.
    for prov, names in lang.names.items():
        for name in names:
            l = 'en'
            if '[' in name and name.endswith(']'):
                name, l = [s.strip() for s in name[:-1].split('[', 1)]
            add_identifier(dblang, data, name, 'name', prov, lang=l)

    for prov, ids in lang.identifier.items():
        for id_ in split_text(ids, separators=',;'):
            add_identifier(dblang, data, id_, prov, None)

    if not dblang.bookkeeping:
        # Languages in Bookkeeping do not have a meaningful classification!
        clf = lang.classification_comment
        if clf:
            # 'sub'/'family' comments become valuesets on the 'sc'/'fc'
            # parameters of the 'clf' contribution.
            for attr, pid in [('sub', 'sc'), ('family', 'fc')]:
                val = getattr(clf, attr)
                if attr == 'sub' and not val:
                    # Handle cases with subrefs but no sub comment.
                    val = getattr(clf, 'subrefs')
                    if val:
                        val = ', '.join('{0}'.format(r) for r in val)
                if not val:
                    continue
                vs = common.ValueSet(
                    id='%s-%s' % (pid, lang.id),
                    description=val,
                    language=dblang,
                    parameter=data['Parameter'][pid],
                    contribution=data['Contribution']['clf'])
                DBSession.add(common.Value(id='%s-%s' % (pid, lang.id), valueset=vs))

    iso_ret = lang.iso_retirement
    if iso_ret:
        DBSession.add(models.ISORetirement(
            id=iso_ret.code,
            name=iso_ret.name,
            description=iso_ret.comment,
            effective=iso_ret.effective,
            reason=iso_ret.reason,
            remedy=iso_ret.remedy,
            change_request=iso_ret.change_request,
            languoid=dblang))

    eth_cmt = lang.ethnologue_comment
    if eth_cmt:
        DBSession.add(models.EthnologueComment(
            comment=eth_cmt.comment,
            code=eth_cmt.isohid,
            type=eth_cmt.comment_type,
            affected=eth_cmt.ethnologue_versions,
            languoid=dblang))
def load():
    """Load the Typological Database of Intensifiers and Reflexives (TDIR).

    Reads glosses, parameters, languages, references and examples from the
    ``read(...)`` tables, looking up coordinates in a local WALS postgres
    database by ISO code (falling back to language name), and populates the
    clld session. Python 2 code (uses ``unicode``).
    """
    wals = create_engine('postgresql://robert@/wals3')

    contributor = common.Contributor(id='gastvolker', name='Volker Gast')
    contribution = common.Contribution(
        id='tdir', name='Typological Database of Intensifiers and Reflexives')
    cc = common.ContributionContributor(
        contribution=contribution, contributor=contributor)
    DBSession.add(cc)

    for row in read('glosses'):
        DBSession.add(common.GlossAbbreviation(id=row['gloss'],
                                               name=row['explanation']))

    params = {}
    for id_, name in PARAMS.items():
        params[id_] = common.Parameter(id='tdir-' + id_, name=name)
        DBSession.add(params[id_])
        #
        # TODO: domain for sortal restrictions!
        #

    # values maps '<pid>-<language name>' -> Value, used to attach example
    # sentences below.
    values = {}
    languages = {}
    for row in read('languages'):
        # 'adn' cells may contain extra intensifier info after a <br>; merge
        # that into 'otherint'.
        if row['adn'] and '<br>' in row['adn']:
            row['adn'], other = row['adn'].split('<br>', 1)
            if not row['otherint']:
                row['otherint'] = ''
            row['otherint'] = '\n'.join(
                filter(None,
                       row['otherint'].split('<br>') + other.split('<br>')))

        # Fix known-wrong SIL codes to their ISO 639-3 equivalents.
        row['sil'] = row['sil'].lower()
        row['sil'] = {
            'arm': 'hye',
            'vmn': 'mig',
            'gli': 'gle',
            'grk': 'ell',
            'hbr': 'heb',
            'ltn': 'lat',
            'chn': 'cmn',
            'ota': 'ote',
            'pnj': 'pan',
            'pba': 'rap',
            'esg': 'kal',
            'vla': 'zea',
            'lat': 'lav',
        }.get(row['sil'], row['sil'])

        l = common.Language(id=row['sil'].lower(), name=row['language'])
        languages[row['language']] = l

        # Look up coordinates in WALS by ISO code, falling back to name.
        # NOTE(review): SQL is built via string interpolation — safe only
        # because the input is a trusted local table, but parameterized
        # queries would be preferable.
        res = wals.execute("select l.latitude, l.longitude from language as l, languageidentifier as li, identifier as i where l.pk = li.language_pk and li.identifier_pk = i.pk and i.id = '%s' and i.type = 'iso639-3';" \
                           % row['sil']).fetchone()
        if not res:
            res = wals.execute("select latitude, longitude from language where name = '%s';" % row['language']).fetchone()
        if res:
            l.latitude, l.longitude = res
        else:
            print(row['language'], row['sil'])
            #(u'Classical Nahuatl', u'nci') ???
            #(u'Ancient Greek', u'gko')

        for pid in params.keys():
            value = row[pid]
            if value:
                value = common.Value(
                    id='tdir-%s-%s' % (pid, l.id),
                    name=unicode(bs(value)),
                    contribution=contribution,
                    parameter=params[pid],
                    language=l)
                values['%s-%s' % (pid, row['language'])] = value
                DBSession.add(value)

    def normalize_ref(ref):
        # Collapse whitespace and replace <i> markup with quotes.
        ref = re.sub('\s+', ' ', ref).strip()
        return unicode(bs(ref)).replace('<i>', '"').replace('</i>', '"')

    """
    Ogawa, A. (1998)
    Wali, K. et al. (2000)

    Lyutikova. -> Lyutikova,
    se-Bertit -> se-Berit

    missing refs:
    Sengupta, G. (2000). Lexical anaphors and pronouns in Bangla. In Lust et al. (eds.),
    <i>Lexical Anaphors and Pronouns in Selected South Asian Languages</i>. Berlin:
    Mouton de Gruyter.
    Davison, A. Mistry (2000). Lexical anaphors and pronouns in Hindi/Urdu. In Lust et al.
    (eds.), <i>Lexical Anaphors and Pronouns in Selected South Asian Languages</i>.
    Berlin: Mouton de Gruyter.
    """
    # refs maps the short citation (text up to the first ').') to the Source.
    refs = {}
    for row in read('references'):
        name = re.sub('\s+', ' ', row['entry'].split(').')[0].strip()) + ')'
        src = common.Source(
            id=row['ref'].strip(),
            name=name,
            description=normalize_ref(row['entry']))
        refs[name] = src
        DBSession.add(src)

    for row in read('examples'):
        if row['language'] not in languages:
            print('example for unknown language "%s"' % row['language'])
            continue
        s = common.Sentence(
            id=row['Nr'].strip(),
            name=fix_example(row['original'], repl=' '),
            language=languages[row['language']],
            analyzed=fix_example(row['original']),
            gloss=fix_example(row['gloss']),
            description=row['translation'],
            source=row['source'],
            comment=row['comments'])

        # Link any known reference whose short citation occurs in the example
        # source; report mismatching/unknown citations for manual cleanup.
        has_refs = False
        for ref in refs:
            if ref in row['source']:
                if normalize_ref(row['source']) != refs[ref].description:
                    print('-->')
                    print(row['source'])
                has_refs = True
                common.SentenceReference(sentence=s, source=refs[ref])
        if not has_refs:
            print('+++++')
            print(row['source'])

        pid = EXAMPLE_MAP[row['pov']]
        if pid:
            # associate with value!
            o = common.ValueSentence(value=values['%s-%s' % (pid, row['language'])],
                                     sentence=s)
        DBSession.add(s)
def populate_test_db(engine):
    """Fill a test database with one instance of every core clld model.

    Objects not explicitly added to ``DBSession`` are persisted via the
    relationship cascades of the objects that are.

    :param engine: SQLAlchemy engine the alembic version is stamped on.
    """
    set_alembic_version(engine, '58559d4eea0d')

    data = TestData()
    data.add_default(common.Dataset,
                     domain='clld',
                     jsondata={
                         'license_icon': 'cc-by',
                         'license_url': 'http://example.org'
                     })

    data.add_default(common.Contributor, name='A Name', email='*****@*****.**')
    for id_, name in {
            'b': 'b Name',
            'c': 'c Name',
            'd': 'd Name',
    }.items():
        data.add(common.Contributor, id_, id=id_, name=name,
                 url='http://example.org')

    DBSession.add(
        common.Editor(dataset=data[common.Dataset],
                      contributor=data[common.Contributor]))

    data.add_default(common.Source)
    # A deactivated source whose jsondata points at its replacement.
    data.add(common.Source,
             'replaced',
             id='replaced',
             active=False,
             jsondata={'__replacement_id__': 'source'})
    data.add_default(common.Contribution)
    common.ContributionReference(contribution=data[common.Contribution],
                                 source=data[common.Source])

    for primary, c in [(True, 'contributor'), (False, 'b'), (True, 'c'),
                       (False, 'd')]:
        common.ContributionContributor(contribution=data[common.Contribution],
                                       primary=primary,
                                       contributor=data['Contributor'][c])

    data.add_default(common.Language, latitude=10.5, longitude=0.3)
    data[common.Language].sources.append(data[common.Source])

    # One identifier per identifier type, plus a plain 'name' identifier.
    for i, type_ in enumerate(common.IdentifierType):
        common.LanguageIdentifier(
            language=data[common.Language],
            identifier=common.Identifier(
                type=type_.value,
                id=type_.value + str(i),
                name='abc' if type_.name == 'iso' else 'glot1234'))
    common.LanguageIdentifier(language=data[common.Language],
                              identifier=common.Identifier(type='name',
                                                           id='name',
                                                           name='a'))

    # 100 extra languages sharing one ISO name, e.g. for pagination tests.
    for i in range(2, 102):
        _l = common.Language(id='l%s' % i, name='Language %s' % i)
        _i = common.Identifier(type='iso639-3', id='%.3i' % i, name='abc')
        common.LanguageIdentifier(language=_l, identifier=_i)
        DBSession.add(_l)

    # A parameter with a two-element domain and two values...
    param = data.add_default(common.Parameter)
    de = common.DomainElement(id='de', name='DomainElement', parameter=param)
    de2 = common.DomainElement(id='de2', name='DomainElement2', parameter=param)
    valueset = data.add_default(common.ValueSet,
                                language=data[common.Language],
                                parameter=param,
                                contribution=data[common.Contribution])
    common.ValueSetReference(valueset=valueset,
                             source=data[common.Source],
                             description='10-20')
    data.add_default(common.Value,
                     domainelement=de,
                     valueset=valueset,
                     frequency=50,
                     confidence='high')
    data.add(common.Value,
             'value2',
             id='value2',
             domainelement=de2,
             valueset=valueset,
             frequency=50,
             confidence='high')

    # ... and a parameter without a domain.
    paramnd = data.add(common.Parameter,
                       'no-domain',
                       id='no-domain',
                       name='Parameter without domain')
    valueset = common.ValueSet(id='vs2',
                               language=data[common.Language],
                               parameter=paramnd,
                               contribution=data[common.Contribution])
    common.ValueSetReference(valueset=valueset,
                             source=data[common.Source],
                             description='10-20')
    common.Value(id='v2', valueset=valueset, frequency=50, confidence='high')

    # Unit parameters: one without and one with a domain.
    unit = data.add_default(common.Unit, language=data[common.Language])
    up = data.add_default(common.UnitParameter)
    common.UnitValue(id='unitvalue', name='UnitValue', unit=unit,
                     unitparameter=up)

    up2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
    de = common.UnitDomainElement(id='de', name='de', parameter=up2)
    DBSession.add(
        common.UnitValue(id='uv2',
                         name='UnitValue2',
                         unit=unit,
                         unitparameter=up2,
                         unitdomainelement=de))

    DBSession.add(common.Source(id='s'))

    sentence = data.add_default(common.Sentence,
                                description='sentence description',
                                analyzed='a\tmorpheme\tdoes\tdo',
                                gloss='a\tmorpheme\t1SG\tdo.SG2',
                                source='own',
                                comment='comment',
                                original_script='a morpheme',
                                language=data[common.Language],
                                jsondata={'alt_translation': 'Spanish: ...'})
    common.SentenceReference(sentence=sentence, source=data[common.Source])

    DBSession.add(common.Config(key='key', value='value'))
    common.Config.add_replacement('replaced', 'language', model=common.Language)
    common.Config.add_replacement('gone', None, model=common.Language)
    DBSession.flush()
    def setUp(self):
        """Create the standard test fixture: one instance of every core model.

        Mirrors ``populate_test_db`` but as a unittest-style setUp on top of
        ``TestWithDb``; objects not added to ``DBSession`` directly are
        persisted through relationship cascades.
        """
        TestWithDb.setUp(self)

        DBSession.add(
            common.Dataset(id='dataset',
                           name='dataset',
                           description='desc',
                           domain='clld',
                           jsondata={'license_icon': 'cc-by'}))
        # A deactivated source whose jsondata points at its replacement.
        DBSession.add(
            common.Source(id='replaced',
                          active=False,
                          jsondata={'__replacement_id__': 'source'}))
        source = common.Source(id='source')

        contributors = {
            'contributor': 'A Name',
            'b': 'b Name',
            'c': 'c Name',
            'd': 'd Name'
        }
        for id_, name in contributors.items():
            contributors[id_] = common.Contributor(id=id_,
                                                   name=name,
                                                   url='http://example.org')

        contribution = common.Contribution(id='contribution',
                                           name='Contribution')
        common.ContributionReference(contribution=contribution, source=source)
        # The asserts only guard against a falsy constructor result; the
        # association objects attach themselves to `contribution` via the ORM.
        assert common.ContributionContributor(
            contribution=contribution,
            primary=True,
            contributor=contributors['contributor'])
        assert common.ContributionContributor(contribution=contribution,
                                              primary=False,
                                              contributor=contributors['b'])
        assert common.ContributionContributor(contribution=contribution,
                                              primary=True,
                                              contributor=contributors['c'])
        assert common.ContributionContributor(contribution=contribution,
                                              primary=False,
                                              contributor=contributors['d'])
        DBSession.add(contribution)

        language = common.Language(id='language',
                                   name='Language 1',
                                   latitude=10.5,
                                   longitude=0.3)
        language.sources.append(source)
        # One identifier per identifier type.
        for i, type_ in enumerate(common.IdentifierType):
            id_ = common.Identifier(type=type_.value,
                                    id=type_.value + str(i),
                                    name='abc')
            common.LanguageIdentifier(language=language, identifier=id_)

        # 100 extra languages, e.g. for pagination tests.
        for i in range(2, 102):
            _l = common.Language(id='l%s' % i, name='Language %s' % i)
            _i = common.Identifier(type='iso639-3',
                                   id='%.3i' % i,
                                   name='%.3i' % i)
            common.LanguageIdentifier(language=_l, identifier=_i)
            DBSession.add(_l)

        # A parameter with a two-element domain and two values...
        param = common.Parameter(id='parameter', name='Parameter')
        de = common.DomainElement(id='de', name='DomainElement',
                                  parameter=param)
        de2 = common.DomainElement(id='de2', name='DomainElement2',
                                   parameter=param)
        valueset = common.ValueSet(id='valueset',
                                   language=language,
                                   parameter=param,
                                   contribution=contribution)
        value = common.Value(id='value',
                             domainelement=de,
                             valueset=valueset,
                             frequency=50,
                             confidence='high')
        DBSession.add(value)
        value2 = common.Value(id='value2',
                              domainelement=de2,
                              valueset=valueset,
                              frequency=50,
                              confidence='high')
        DBSession.add(value2)

        # ... and a parameter without a domain.
        paramnd = common.Parameter(id='no-domain',
                                   name='Parameter without domain')
        valueset = common.ValueSet(id='vs2',
                                   language=language,
                                   parameter=paramnd,
                                   contribution=contribution)
        common.ValueSetReference(valueset=valueset, source=source)
        value = common.Value(id='v2',
                             valueset=valueset,
                             frequency=50,
                             confidence='high')
        DBSession.add(value)

        # Unit parameters: one without and one with a domain.
        unit = common.Unit(id='unit', name='Unit', language=language)
        up = common.UnitParameter(id='unitparameter', name='UnitParameter')
        DBSession.add(unit)
        DBSession.add(
            common.UnitValue(id='unitvalue',
                             name='UnitValue',
                             unit=unit,
                             unitparameter=up))

        up2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
        de = common.UnitDomainElement(id='de', name='de', parameter=up2)
        DBSession.add(
            common.UnitValue(id='uv2',
                             name='UnitValue2',
                             unit=unit,
                             unitparameter=up2,
                             unitdomainelement=de))

        DBSession.add(common.Source(id='s'))

        sentence = common.Sentence(
            id='sentence',
            name='sentence name',
            description='sentence description',
            analyzed='a\tmorpheme\tdoes\tdo',
            gloss='a\tmorpheme\t1SG\tdo.SG2',
            source='own',
            comment='comment',
            original_script='a morpheme',
            language=language,
            jsondata={'alt_translation': 'Spanish: ...'})
        common.SentenceReference(sentence=sentence, source=source)

        DBSession.add(common.Config(key='key', value='value'))
        common.Config.add_replacement('replaced', 'language',
                                      model=common.Language)
        common.Config.add_replacement('gone', None, model=common.Language)
        DBSession.flush()