def add_pronouns(ctx):
    """Load pronoun stems and their inflected forms from the `PRONOUNS` CSV.

    One `PronounStem` is created per distinct (stem, gender group) pair; every
    CSV row then adds one inflected form that points at that stem. `ctx` must
    provide a database `session` and a `config` mapping with a `PRONOUNS` path.
    The session is committed and closed before returning.
    """
    db = ctx.session
    gender_groups = ENUM['gender_group']
    genders = ENUM['gender']
    cases = ENUM['case']
    numbers = ENUM['number']

    # (stem name, gender group id) -> id of the stored `PronounStem`
    stem_ids = {}
    for row in util.read_csv(ctx.config['PRONOUNS']):
        name = row['stem']
        key = (name, gender_groups[row['stem_genders']])

        if key in stem_ids:
            stem_id = stem_ids[key]
        else:
            record = PronounStem(name=name, genders_id=key[1])
            db.add(record)
            # Flush before reading `record.id` below.
            db.flush()
            stem_id = stem_ids[key] = record.id
            util.tick(name)

        # NOTE(review): forms are stored as `Nominal` rows although the
        # original docstring mentions `Pronoun` -- confirm this is intended.
        form = Nominal(
            stem_id=stem_id,
            name=row['form'],
            gender_id=genders[row['form_gender']],
            case_id=cases[row['case']],
            number_id=numbers[row['number']])
        db.add(form)
        db.flush()

    db.commit()
    db.close()
def add_nominal_endings(ctx):
    """Load `NominalEnding` rows from the two nominal-ending CSV files.

    Rows from `COMPOUNDED_NOMINAL_ENDINGS` are stored with `compounded=True`
    and no case or number; rows from `INFLECTED_NOMINAL_ENDINGS` are stored
    with `compounded=False` and their case and number filled in. The session
    is committed and closed before returning.
    """
    db = ctx.session
    genders = ENUM['gender']
    cases = ENUM['case']
    numbers = ENUM['number']

    # Compounded endings: only the gender is recorded.
    for row in util.read_csv(ctx.config['COMPOUNDED_NOMINAL_ENDINGS']):
        db.add(NominalEnding(
            name=row['ending'],
            stem_type=row['stem_type'],
            gender_id=genders[row['form_gender']],
            case_id=None,
            number_id=None,
            compounded=True))

    # Inflected endings: report progress once per stem type.
    reported = set()
    for row in util.read_csv(ctx.config['INFLECTED_NOMINAL_ENDINGS']):
        db.add(NominalEnding(
            name=row['ending'],
            stem_type=row['stem_type'],
            gender_id=genders[row['form_gender']],
            case_id=cases[row['case']],
            number_id=numbers[row['number']],
            compounded=False))

        stem_type = row['stem_type']
        if stem_type in reported:
            continue
        util.tick(stem_type)
        reported.add(stem_type)

    db.commit()
    db.close()
def add_verbs(ctx, root_map):
    """Load inflected verb forms from the `VERBS` CSV into the database.

    `root_map` maps `(root, hom)` pairs to root ids. Rows whose pair is not
    in `root_map` are skipped, and the number of distinct skipped pairs is
    printed at the end. Work is committed after every 1000 stored verbs and
    once more when the file is exhausted; the session is then closed.
    """
    db = ctx.session
    vclasses = ENUM['vclass']
    persons = ENUM['person']
    numbers = ENUM['number']
    modes = ENUM['mode']
    voices = ENUM['voice']

    missing = set()
    stored = 0
    for row in util.read_csv(ctx.config['VERBS']):
        key = (row['root'], row['hom'])
        try:
            root_id = root_map[key]
        except KeyError:
            missing.add(key)
            continue

        form = row['form']
        # An empty `class` column means the verb has no class.
        if row['class']:
            vclass_id = vclasses[row['class']]
        else:
            vclass_id = None

        db.add(Verb(
            name=form,
            root_id=root_id,
            vclass_id=vclass_id,
            person_id=persons[row['person']],
            number_id=numbers[row['number']],
            mode_id=modes[row['mode']],
            voice_id=voices[row['voice']]))

        # Only stored rows count towards the commit interval.
        stored += 1
        if stored % 1000 == 0:
            util.tick(row['form'])
            db.commit()

    db.commit()
    db.close()
    print('Skipped', len(missing), 'roots.')
def add_irregular_adjectives(ctx):
    """Add irregular adjectives to the database.

    Reads the multi-document YAML file at `ctx.config['IRREGULAR_ADJECTIVES']`.
    Each document must provide:

    - `name`: the stem name, stored as an `AdjectiveStem`
    - `complete`: stored as `StemIrregularity.fully_described`
    - `forms`: a list of mappings with `name`, `gender`, `case` and `number`,
      each stored as an `Adjective` attached to the stem

    Empty YAML documents are ignored. The session is committed and closed
    before returning.
    """
    session = ctx.session
    gender = ENUM['gender']
    case = ENUM['case']
    number = ENUM['number']

    with open(ctx.config['IRREGULAR_ADJECTIVES']) as f:
        # NOTE(review): this used to be `yaml.load_all(f)`. Without an
        # explicit Loader that call raises TypeError on PyYAML >= 6 and can
        # construct arbitrary Python objects on older versions. The data only
        # needs plain mappings, lists and scalars, so the safe loader is used.
        # The documents are parsed lazily, so the file must stay open while
        # iterating.
        for adj in yaml.safe_load_all(f):
            if adj is None:
                # An empty document (e.g. a stray `---`) carries no adjective.
                continue

            stem = AdjectiveStem(name=adj['name'])
            session.add(stem)
            session.flush()

            # Mark the stem as irregular
            irreg = StemIrregularity(stem=stem,
                                     fully_described=adj['complete'])
            session.add(irreg)
            session.flush()

            util.tick(stem.name)

            for form in adj['forms']:
                session.add(Adjective(stem=stem,
                                      name=form['name'],
                                      gender_id=gender[form['gender']],
                                      case_id=case[form['case']],
                                      number_id=number[form['number']]))

    session.commit()
    session.close()
def add_participle_stems(ctx):
    """Load `ParticipleStem` rows from the `PARTICIPLE_STEMS` CSV.

    Only the part of the `stem` column before the first `#` is stored as the
    stem name. Work is committed after every 100 rows and once more at the
    end; the session is then closed.
    """
    db = ctx.session
    modes = ENUM['mode']
    voices = ENUM['voice']

    count = 0
    for row in util.read_csv(ctx.config['PARTICIPLE_STEMS']):
        # Keep only the text before the first '#'.
        name, _, _ = row['stem'].partition("#")
        db.add(ParticipleStem(
            name=name,
            mode_id=modes[row['mode']],
            voice_id=voices[row['voice']]))

        count += 1
        if count % 100 == 0:
            util.tick(row['stem'])
            db.commit()

    db.commit()
    db.close()
def add_enums(ctx):
    """Load enumerated values from the `ENUMS` CSV and index them in `ENUM`.

    This covers small, closed sets of values such as persons, numbers, modes,
    voices, genders and cases. Every stored value is also recorded as
    `ENUM[<table name>][<abbreviation>] = <row id>`.

    The CSV is read twice: ordinary enums first, then gender groups, because
    gender groups look their members up in `ENUM['gender']`.
    """
    db = ctx.session
    enum_classes = {
        'case': Case,
        'class': VClass,
        'gender': Gender,
        'gender_group': GenderGroup,
        'modification': Modification,
        'mode': Mode,
        'number': Number,
        'person': Person,
        'sandhi_rule_type': SandhiType,
        'voice': Voice,
    }

    def store(cls, row):
        """Create and flush one enum row; return (table, abbr, instance)."""
        table = cls.__tablename__
        if table not in ENUM:
            # First value of this enum type: report progress, open its index.
            util.tick(cls.__name__)
            ENUM[table] = {}
        abbr = row['abbreviation']
        entry = cls(name=row['human_readable_value'], abbr=abbr)
        db.add(entry)
        # Flush before callers read `entry.id`.
        db.flush()
        return table, abbr, entry

    # First pass: ordinary enums
    for row in util.read_csv(ctx.config['ENUMS']):
        kind = row['enum_type']
        if kind == 'gender_group':
            continue
        cls = enum_classes.get(kind)
        # TODO: always non-None?
        if cls is None:
            continue
        table, abbr, entry = store(cls, row)
        ENUM[table][abbr] = entry.id

    db.commit()

    # Second pass: gender groups
    for row in util.read_csv(ctx.config['ENUMS']):
        kind = row['enum_type']
        if kind != 'gender_group':
            continue
        table, abbr, entry = store(enum_classes.get(kind), row)
        # Abbreviations made up solely of the letters m/f/n list their member
        # genders one character at a time.
        if set(abbr) <= set('mfn'):
            entry.members = [ENUM['gender'][letter] for letter in abbr]
        ENUM[table][abbr] = entry.id

    db.commit()
    db.close()