def setUp(self):
    """Populate the test database with a minimal, fixed set of fixture objects."""
    TestWithDb.setUp(self)
    DBSession.add(common.Dataset(
        id='dataset', name='dataset', description='desc', domain='clld'))
    src = common.Source(id='source')

    # Build the contributor objects keyed by their id.
    people = {}
    for key, full_name in [('contributor', 'A Name'), ('b', 'b Name'),
                           ('c', 'c Name'), ('d', 'd Name')]:
        people[key] = common.Contributor(id=key, name=full_name)

    contrib = common.Contribution(id='contribution', name='Contribution')
    common.ContributionReference(contribution=contrib, source=src)
    # Instantiating the association objects links them to the contribution;
    # the assert just guards against a falsy result.
    for key, is_primary in [('contributor', True), ('b', False),
                            ('c', True), ('d', False)]:
        assert common.ContributionContributor(
            contribution=contrib, primary=is_primary, contributor=people[key])
    DBSession.add(contrib)

    lang = common.Language(id='language', name='Language 1', latitude=10.5, longitude=0.3)
    lang.sources.append(src)
    common.LanguageIdentifier(
        language=lang, identifier=common.Identifier(type='iso639-3', id='iso'))

    # A batch of extra languages, each with an iso639-3 identifier.
    for num in range(2, 102):
        extra = common.Language(id='l%s' % num, name='Language %s' % num)
        common.LanguageIdentifier(
            language=extra,
            identifier=common.Identifier(
                type='iso639-3', id='%.3i' % num, name='%.3i' % num))
        DBSession.add(extra)

    # Parameter with a two-element domain, plus one valueset/value.
    param = common.Parameter(id='parameter', name='Parameter')
    first_de = common.DomainElement(id='de', name='DomainElement', parameter=param)
    common.DomainElement(id='de2', name='DomainElement2', parameter=param)
    vs = common.ValueSet(
        id='valueset', language=lang, parameter=param, contribution=contrib)
    DBSession.add(common.Value(
        id='value', domainelement=first_de, valueset=vs, frequency=50, confidence='high'))

    # Parameter without a domain, with a referenced valueset.
    no_domain = common.Parameter(id='no-domain', name='Parameter without domain')
    vs = common.ValueSet(
        id='vs2', language=lang, parameter=no_domain, contribution=contrib)
    common.ValueSetReference(valueset=vs, source=src)
    DBSession.add(common.Value(id='v2', valueset=vs, frequency=50, confidence='high'))

    # Units and unit parameters, one of which carries a domain element.
    unit = common.Unit(id='unit', name='Unit', language=lang)
    uparam = common.UnitParameter(id='unitparameter', name='UnitParameter')
    DBSession.add(unit)
    DBSession.add(common.UnitValue(
        id='unitvalue', name='UnitValue', unit=unit, unitparameter=uparam))
    uparam2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
    ude = common.UnitDomainElement(id='de', name='de', parameter=uparam2)
    DBSession.add(common.UnitValue(
        id='uv2', name='UnitValue2',
        unit=unit, unitparameter=uparam2, unitdomainelement=ude))

    DBSession.add(common.Source(id='s'))

    sentence = common.Sentence(
        id='sentence', name='sentence name', description='sentence description',
        analyzed='a\tmorpheme\tdoes\tdo', gloss='a\tmorpheme\t1SG\tdo.SG2',
        source='own', comment='comment', original_script='a morpheme',
        language=lang)
    common.SentenceReference(sentence=sentence, source=src)
    DBSession.add(common.Config(key='key', value='value'))
    DBSession.flush()
def setUp(self):
    """Populate the test database, including jsondata and replacement fixtures."""
    TestWithDb.setUp(self)
    DBSession.add(common.Dataset(
        id='dataset', name='dataset', description='desc', domain='clld',
        jsondata={'license_icon': 'cc-by'}))
    # An inactive source pointing at its replacement.
    DBSession.add(common.Source(
        id='replaced', active=False, jsondata={'__replacement_id__': 'source'}))
    src = common.Source(id='source')

    # Build the contributor objects keyed by their id.
    people = {}
    for key, full_name in [('contributor', 'A Name'), ('b', 'b Name'),
                           ('c', 'c Name'), ('d', 'd Name')]:
        people[key] = common.Contributor(id=key, name=full_name, url='http://example.org')

    contrib = common.Contribution(id='contribution', name='Contribution')
    common.ContributionReference(contribution=contrib, source=src)
    # Instantiating the association objects links them to the contribution;
    # the assert just guards against a falsy result.
    for key, is_primary in [('contributor', True), ('b', False),
                            ('c', True), ('d', False)]:
        assert common.ContributionContributor(
            contribution=contrib, primary=is_primary, contributor=people[key])
    DBSession.add(contrib)

    lang = common.Language(id='language', name='Language 1', latitude=10.5, longitude=0.3)
    lang.sources.append(src)
    # One identifier per known identifier type.
    for pos, id_type in enumerate(common.IdentifierType):
        common.LanguageIdentifier(
            language=lang,
            identifier=common.Identifier(
                type=id_type.value, id=id_type.value + str(pos), name='abc'))

    # A batch of extra languages, each with an iso639-3 identifier.
    for num in range(2, 102):
        extra = common.Language(id='l%s' % num, name='Language %s' % num)
        common.LanguageIdentifier(
            language=extra,
            identifier=common.Identifier(
                type='iso639-3', id='%.3i' % num, name='%.3i' % num))
        DBSession.add(extra)

    # Parameter with a two-element domain and two values in one valueset.
    param = common.Parameter(id='parameter', name='Parameter')
    first_de = common.DomainElement(id='de', name='DomainElement', parameter=param)
    second_de = common.DomainElement(id='de2', name='DomainElement2', parameter=param)
    vs = common.ValueSet(
        id='valueset', language=lang, parameter=param, contribution=contrib)
    DBSession.add(common.Value(
        id='value', domainelement=first_de, valueset=vs, frequency=50, confidence='high'))
    DBSession.add(common.Value(
        id='value2', domainelement=second_de, valueset=vs, frequency=50, confidence='high'))

    # Parameter without a domain, with a referenced valueset.
    no_domain = common.Parameter(id='no-domain', name='Parameter without domain')
    vs = common.ValueSet(
        id='vs2', language=lang, parameter=no_domain, contribution=contrib)
    common.ValueSetReference(valueset=vs, source=src)
    DBSession.add(common.Value(id='v2', valueset=vs, frequency=50, confidence='high'))

    # Units and unit parameters, one of which carries a domain element.
    unit = common.Unit(id='unit', name='Unit', language=lang)
    uparam = common.UnitParameter(id='unitparameter', name='UnitParameter')
    DBSession.add(unit)
    DBSession.add(common.UnitValue(
        id='unitvalue', name='UnitValue', unit=unit, unitparameter=uparam))
    uparam2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
    ude = common.UnitDomainElement(id='de', name='de', parameter=uparam2)
    DBSession.add(common.UnitValue(
        id='uv2', name='UnitValue2',
        unit=unit, unitparameter=uparam2, unitdomainelement=ude))

    DBSession.add(common.Source(id='s'))

    sentence = common.Sentence(
        id='sentence', name='sentence name', description='sentence description',
        analyzed='a\tmorpheme\tdoes\tdo', gloss='a\tmorpheme\t1SG\tdo.SG2',
        source='own', comment='comment', original_script='a morpheme',
        language=lang, jsondata={'alt_translation': 'Spanish: ...'})
    common.SentenceReference(sentence=sentence, source=src)

    DBSession.add(common.Config(key='key', value='value'))
    # Config-level replacement records for gone/replaced languages.
    common.Config.add_replacement('replaced', 'language', model=common.Language)
    common.Config.add_replacement('gone', None, model=common.Language)
    DBSession.flush()
def populate_test_db(engine):
    """Fill a freshly created test database with the standard fixture data.

    Stamps the schema with the expected alembic revision, then creates the
    fixture objects through a ``TestData`` registry and flushes the session.
    """
    set_alembic_version(engine, '58559d4eea0d')
    data = TestData()
    data.add_default(
        common.Dataset,
        domain='clld',
        jsondata={'license_icon': 'cc-by', 'license_url': 'http://example.org'})

    # Default contributor plus three extra named ones.
    data.add_default(common.Contributor, name='A Name', email='*****@*****.**')
    for key, full_name in [('b', 'b Name'), ('c', 'c Name'), ('d', 'd Name')]:
        data.add(common.Contributor, key, id=key, name=full_name, url='http://example.org')
    DBSession.add(common.Editor(
        dataset=data[common.Dataset], contributor=data[common.Contributor]))

    data.add_default(common.Source)
    # Inactive source pointing at its replacement.
    data.add(common.Source, 'replaced',
             id='replaced', active=False, jsondata={'__replacement_id__': 'source'})

    data.add_default(common.Contribution)
    common.ContributionReference(
        contribution=data[common.Contribution], source=data[common.Source])
    for is_primary, key in [(True, 'contributor'), (False, 'b'), (True, 'c'), (False, 'd')]:
        common.ContributionContributor(
            contribution=data[common.Contribution],
            primary=is_primary,
            contributor=data['Contributor'][key])

    data.add_default(common.Language, latitude=10.5, longitude=0.3)
    data[common.Language].sources.append(data[common.Source])
    # One identifier per known identifier type, plus a plain name identifier.
    for pos, id_type in enumerate(common.IdentifierType):
        common.LanguageIdentifier(
            language=data[common.Language],
            identifier=common.Identifier(
                type=id_type.value,
                id=id_type.value + str(pos),
                name='abc' if id_type.name == 'iso' else 'glot1234'))
    common.LanguageIdentifier(
        language=data[common.Language],
        identifier=common.Identifier(type='name', id='name', name='a'))

    # A batch of extra languages, each with an iso639-3 identifier.
    for num in range(2, 102):
        extra = common.Language(id='l%s' % num, name='Language %s' % num)
        common.LanguageIdentifier(
            language=extra,
            identifier=common.Identifier(type='iso639-3', id='%.3i' % num, name='abc'))
        DBSession.add(extra)

    # Parameter with a two-element domain and two values in one valueset.
    param = data.add_default(common.Parameter)
    first_de = common.DomainElement(id='de', name='DomainElement', parameter=param)
    second_de = common.DomainElement(id='de2', name='DomainElement2', parameter=param)
    vs = data.add_default(
        common.ValueSet,
        language=data[common.Language],
        parameter=param,
        contribution=data[common.Contribution])
    common.ValueSetReference(valueset=vs, source=data[common.Source], description='10-20')
    data.add_default(
        common.Value,
        domainelement=first_de, valueset=vs, frequency=50, confidence='high')
    data.add(common.Value, 'value2',
             id='value2', domainelement=second_de, valueset=vs,
             frequency=50, confidence='high')

    # Parameter without a domain, with a referenced valueset.
    no_domain = data.add(
        common.Parameter, 'no-domain', id='no-domain', name='Parameter without domain')
    vs = common.ValueSet(
        id='vs2',
        language=data[common.Language],
        parameter=no_domain,
        contribution=data[common.Contribution])
    common.ValueSetReference(valueset=vs, source=data[common.Source], description='10-20')
    common.Value(id='v2', valueset=vs, frequency=50, confidence='high')

    # Units and unit parameters, one of which carries a domain element.
    unit = data.add_default(common.Unit, language=data[common.Language])
    uparam = data.add_default(common.UnitParameter)
    common.UnitValue(id='unitvalue', name='UnitValue', unit=unit, unitparameter=uparam)
    uparam2 = common.UnitParameter(id='up2', name='UnitParameter with domain')
    ude = common.UnitDomainElement(id='de', name='de', parameter=uparam2)
    DBSession.add(common.UnitValue(
        id='uv2', name='UnitValue2',
        unit=unit, unitparameter=uparam2, unitdomainelement=ude))

    DBSession.add(common.Source(id='s'))

    sentence = data.add_default(
        common.Sentence,
        description='sentence description',
        analyzed='a\tmorpheme\tdoes\tdo',
        gloss='a\tmorpheme\t1SG\tdo.SG2',
        source='own',
        comment='comment',
        original_script='a morpheme',
        language=data[common.Language],
        jsondata={'alt_translation': 'Spanish: ...'})
    common.SentenceReference(sentence=sentence, source=data[common.Source])

    DBSession.add(common.Config(key='key', value='value'))
    # Config-level replacement records for gone/replaced languages.
    common.Config.add_replacement('replaced', 'language', model=common.Language)
    common.Config.add_replacement('gone', None, model=common.Language)
    DBSession.flush()
def main(args):
    """Migrate the legacy WOLD database (at ``DB``) into the clld schema.

    Reads table by table from the old engine and creates the corresponding
    clld/``models`` objects via the ``data`` registry.  The interleaved
    ``DBSession.flush()`` calls are required: later steps read primary keys
    (``.pk``) assigned only at flush time.

    NOTE(review): ``args`` is not used by this function.
    """
    old_db = create_engine(DB)
    data = Data()

    #
    # migrate contributor table: complete
    #
    for row in old_db.execute("select * from contributor"):
        data.add(common.Contributor, row['id'],
                 id=row['id'],
                 name='%(firstname)s %(lastname)s' % row,
                 url=row['homepage'],
                 description=row['note'],
                 email=row['email'],
                 address=row['address'])
    # Editor not present in the legacy contributor table; added by hand.
    data.add(common.Contributor, 'haspelmathmartin',
             id='haspelmathmartin',
             name="Martin Haspelmath",
             url="http://email.eva.mpg.de/~haspelmt/")
    DBSession.flush()

    dataset = common.Dataset(
        id='wold',
        name='WOLD',
        description='World Loanword Database',
        domain='wold.clld.org',
        published=date(2009, 8, 15),
        license='http://creativecommons.org/licenses/by/3.0/de/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'http://i.creativecommons.org/l/by/3.0/de/88x31.png',
            'license_name': 'Creative Commons Attribution 3.0 Germany License'
        })
    DBSession.add(dataset)

    # Dataset editors, in fixed order.
    for i, editor in enumerate(['haspelmathmartin', 'tadmoruri']):
        common.Editor(dataset=dataset,
                      contributor=data['Contributor'][editor],
                      ord=i + 1)

    #
    # migrate semantic_field table: complete
    #
    # NOTE(review): semantic field 25 is deliberately skipped — presumably
    # obsolete in the source data; confirm against the legacy DB.
    for row in old_db.execute("select * from semantic_field"):
        if row['id'] != 25:
            kw = dict((key, row[key]) for key in ['id', 'name', 'description'])
            data.add(models.SemanticField, row['id'], **kw)

    #
    # migrate language table: complete
    #
    # recipient flag is replaced by vocabulary_pk!
    #
    for row in old_db.execute("select * from language order by id"):
        kw = dict((key, row[key]) for key in [
            'fm_dl_id', 'name', 'latitude', 'longitude', 'wals_equivalent',
            'affiliation', 'family', 'genus', 'countries'
        ])
        data.add(models.WoldLanguage, row['id'], id=str(row['id']), **kw)

    #
    # migrate language_code table: complete
    #
    # Identifiers are registered under a "<type>-<code>" key; iso codes with a
    # known glottocode additionally get a glottolog identifier.
    for row in old_db.execute("select * from language_code"):
        _id = '%(type)s-%(code)s' % row
        data.add(common.Identifier, _id,
                 id=_id, type=row['type'], name=row['code'])
        if row['type'] == 'iso639-3' and row['code'] in glottocodes:
            gc = glottocodes[row['code']]
            data.add(common.Identifier, gc,
                     id=gc, type=common.IdentifierType.glottolog.value, name=gc)
    DBSession.flush()

    #
    # migrate language_code_language table: complete
    #
    # Links languages to identifiers by pk, hence the flush above.
    for row in old_db.execute("select * from language_code_language"):
        _id = '%(type)s-%(code)s' % row
        data.add(common.LanguageIdentifier, '%s-%s' % (_id, row['language_id']),
                 identifier_pk=data['Identifier'][_id].pk,
                 language_pk=data['WoldLanguage'][row['language_id']].pk)
        if row['type'] == 'iso639-3' and row['code'] in glottocodes:
            gc = glottocodes[row['code']]
            data.add(common.LanguageIdentifier, '%s-%s' % (gc, row['language_id']),
                     identifier_pk=data['Identifier'][gc].pk,
                     language_pk=data['WoldLanguage'][row['language_id']].pk)
    DBSession.flush()

    #
    # migrate vocabulary table: complete
    #
    for row in old_db.execute("select * from vocabulary order by id"):
        # Collect the form-description ("fd_") and free-text columns as jsondata.
        jsondata = {}
        for key in row.keys():
            if key.startswith('fd_') or key in [
                    'other_information', 'abbreviations'
            ]:
                jsondata[key] = row[key]
        vocab = data.add(models.Vocabulary, row['id'],
                         id=str(row['id']), name=row['name'],
                         color=row['color'], jsondata=jsondata)
        # Flush per row so vocab.pk exists for the back-link below.
        DBSession.flush()
        data['WoldLanguage'][row['language_id']].vocabulary_pk = vocab.pk
    DBSession.flush()

    #
    # migrate contact_situation and age tables: complete
    #
    # contact situations and ages are unitdomainelements!
    #
    contact_situation = common.UnitParameter(id='cs', name='Contact Situation')
    age = common.UnitParameter(id='a', name='Age')
    DBSession.add(contact_situation)
    DBSession.add(age)
    DBSession.flush()

    for row in old_db.execute("select * from contact_situation"):
        # Skip contact situations not attached to a vocabulary.
        if row['vocabulary_id'] is None:
            continue
        kw = dict((key, row[key]) for key in ['description', 'id', 'name'])
        kw['id'] = 'cs-%s' % kw['id']
        p = data.add(models.WoldUnitDomainElement, row['id'], **kw)
        p.vocabulary = data['Vocabulary'][row['vocabulary_id']]
        p.unitparameter_pk = contact_situation.pk

    for row in old_db.execute("select * from age"):
        # Ages are keyed per vocabulary: "<vocabulary_id>-<label>".
        id_ = '%(vocabulary_id)s-%(label)s' % row
        kw = dict((key, row[key]) for key in ['start_year', 'end_year'])
        p = data.add(models.WoldUnitDomainElement, id_,
                     id='a-%s' % id_, name=row['label'],
                     description=row['description'], jsondata=kw)
        p.vocabulary = data['Vocabulary'][row['vocabulary_id']]
        p.unitparameter_pk = age.pk

    #
    # migrate meaning table: complete
    #
    for row in old_db.execute("select * from meaning"):
        kw = dict((key, row[key]) for key in [
            'description', 'core_list', 'ids_code', 'typical_context',
            'semantic_category'
        ])
        # Legacy meaning ids look like "<field>.<sub>"; '.' is not URL-safe,
        # so it is replaced by '-' in the clld id.
        p = data.add(
            models.Meaning, row['id'],
            id=row['id'].replace('.', '-'),
            name=row['label'],
            sub_code=row['id'].split('.')[1] if '.' in row['id'] else '',
            semantic_field=data['SemanticField'][row['semantic_field_id']],
            **kw)
        DBSession.flush()

        for field in ['french', 'spanish', 'german', 'russian']:
            DBSession.add(
                models.Translation(name=row[field], lang=field, meaning=p))

        # One (empty) valueset per language/meaning pair, to be filled by the
        # word_meaning migration below.
        for key in data['WoldLanguage']:
            lang = data['WoldLanguage'][key]
            data.add(common.ValueSet, '%s-%s' % (key, row['id']),
                     id='%s-%s' % (key, row['id'].replace('.', '-')),
                     language=lang,
                     contribution=lang.vocabulary,
                     parameter=p)
    DBSession.flush()

    #
    # migrate word table:
    #
    # TODO: all the other word properties!!
    #
    # Legacy word columns stored verbatim in Word.jsondata.
    fields = [
        'age_label', 'original_script', 'grammatical_info',
        'comment_on_word_form', 'gloss', "comment_on_borrowed", "calqued",
        "borrowed_base", "numeric_frequency", "relative_frequency", "effect",
        "integration", "salience", "reference", "other_comments", "register",
        "loan_history", 'colonial_word', 'paraphrase_in_dutch', 'word_source',
        'paraphrase_in_german', 'lexical_stratum', 'comparison_with_mandarin',
        'year', 'comparison_with_korean', 'czech_translation',
        'hungarian_translation', 'early_romani_reconstruction',
        'etymological_note', 'boretzky_and_igla_etymology',
        'manuss_et_al_etymology', 'vekerdi_etymology', 'turner_etymology',
        'other_etymologies', 'mayrhofer_etymology',
    ]
    # word id -> vocabulary id, needed by the word_meaning migration below.
    word_to_vocab = {}
    for row in old_db.execute("select * from word"):
        word_to_vocab[row['id']] = row['vocabulary_id']
        kw = dict((key, row[key]) for key in [
            'id', 'age_score', 'borrowed', 'borrowed_score', 'analyzability',
            'simplicity_score'
        ])
        w = data.add(models.Word, row['id'],
                     name=row['form'],
                     description=row['free_meaning'],
                     jsondata={k: row[k] for k in fields},
                     **kw)
        w.language = data['Vocabulary'][row['vocabulary_id']].language

        # Age and contact-situation become unit values on the word.
        if row['age_label']:
            DBSession.add(
                common.UnitValue(
                    id='%(id)s-a' % row,
                    unit=w,
                    unitparameter=age,
                    unitdomainelement=data['WoldUnitDomainElement'][
                        '%(vocabulary_id)s-%(age_label)s' % row],
                    contribution=data['Vocabulary'][row['vocabulary_id']]))

        # NOTE(review): the excluded contact_situation_id looks like a known
        # bogus record in the legacy data — confirm against the source DB.
        if row['contact_situation_id'] and row[
                'contact_situation_id'] != '9129144185487768':
            DBSession.add(
                common.UnitValue(
                    id='%(id)s-cs' % row,
                    unit=w,
                    unitparameter=contact_situation,
                    unitdomainelement=data['WoldUnitDomainElement'][
                        row['contact_situation_id']],
                    contribution=data['Vocabulary'][row['vocabulary_id']]))
    DBSession.flush()

    #
    # migrate word_meaning table: complete
    #
    # Counterparts get synthetic integer ids from the enumeration.
    for i, row in enumerate(old_db.execute("select * from word_meaning")):
        data.add(
            models.Counterpart, i,
            id=i,
            description='%(relationship)s (%(comment_on_relationship)s)' % row,
            name=data['Word'][row['word_id']].name,
            valueset=data['ValueSet']['%s-%s' % (
                word_to_vocab[row['word_id']], row['meaning_id'])],
            word=data['Word'][row['word_id']])
    DBSession.flush()

    #
    # migrate vocabulary_contributor table: complete
    #
    for row in old_db.execute("select * from vocabulary_contributor"):
        DBSession.add(
            common.ContributionContributor(
                ord=row['ordinal'],
                primary=row['primary'],
                contributor_pk=data['Contributor'][row['contributor_id']].pk,
                contribution_pk=data['Vocabulary'][row['vocabulary_id']].pk))
    DBSession.flush()

    #
    # source words: we have to make sure a word does only belong to one language.
    # thus, we have to reassign identifier!
    #
    # loop over source_word, source_word_donor_language pairs keeping track of
    # source_word ids: each (source_word, donor language) pair becomes its own
    # Word with id "<source_word_id>-<n>".
    known_ids = {}
    for row in old_db.execute(
            "select sw.id, sw.meaning, sw.form, dl.language_id from source_word as sw, source_word_donor_language as dl where sw.id = dl.source_word_id"
    ):
        if row['id'] in known_ids:
            # source_word was already seen associated to a different donor language!
            assert row['language_id'] not in known_ids[row['id']]
            known_ids[row['id']].append(row['language_id'])
            id_ = '%s-%s' % (row['id'], len(known_ids[row['id']]))
        else:
            id_ = '%s-%s' % (row['id'], 1)
            known_ids[row['id']] = [row['language_id']]
        new = data.add(models.Word, id_,
                       id=id_, name=row['form'], description=row['meaning'])
        new.language = data['WoldLanguage'][row['language_id']]

    # source words may end up as words without language!
    for row in old_db.execute(
            "select id, meaning, form from source_word where id not in (select source_word_id from source_word_donor_language)"
    ):
        id_ = '%s-%s' % (row['id'], 1)
        new = data.add(models.Word, id_,
                       id=id_, name=row['form'], description=row['meaning'])
    DBSession.flush()

    #
    # migrate word_source_word relations
    #
    # TODO: should be modelled as UnitParameter!
    #
    # j counts source words that could not be linked to any donor language.
    j = 0
    for row in old_db.execute("select * from word_source_word"):
        # there may be more than one word associated with a source_word_id (see above)
        source_words = []
        for i in range(4):  # but we guess no more than 4 :)
            id_ = '%s-%s' % (row['source_word_id'], i + 1)
            if id_ in data['Word']:
                source_words.append(data['Word'][id_])
        if not source_words:
            j += 1
            #print(row['source_word_id'])
            #raise ValueError(row['source_word_id'])
        for sw in source_words:
            DBSession.add(
                models.Loan(
                    source_word=sw,
                    target_word=data['Word'][row['word_id']],
                    relation=row['relationship'],
                    # a loan is "certain" only if exactly one source word matched
                    certain=len(source_words) == 1))

    print('%s source words not migrated because they have no donor language!' % j)