def test_enumerate_variations_all_db(self):
    gitdb = GitInterface(origin='https://github.com/plevyieml/ieml-language.git')
    gitdb.pull()
    db = IEMLDatabase(folder=gitdb.folder)

    usls = db.list(type=Word, parse=True) + \
           db.list(type=PolyMorpheme, parse=True) + \
           db.list(type=Lexeme, parse=True)

    for u in tqdm.tqdm(usls):
        dim, partitions = enumerate_partitions(u)
def test_expand_compose_into_paths(self):
    # parser = IEMLParser().parse
    gitdb = GitInterface(origin='https://github.com/plevyieml/ieml-language.git')
    gitdb.pull()
    db = IEMLDatabase(folder=gitdb.folder)

    usls = db.list(type=Word, parse=True) + \
           db.list(type=PolyMorpheme, parse=True) + \
           db.list(type=Lexeme, parse=True)

    for u in tqdm.tqdm(usls):
        p_u = list(u.iter_structure_path_by_script_ss())
        res = usl_from_path_values(p_u)
        self.assertEqual(str(u), str(res),
                         "expand_compose_into_paths failed on: " + str(u))
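# A minimal sketch of the round-trip the two tests above rely on, run on a single
# entry instead of the whole database. It assumes a db built exactly as in the
# tests; the GitInterface/IEMLDatabase import path is an assumption based on the
# ieml package layout, and PolyMorpheme / usl_from_path_values are assumed to be
# in scope as in the test module.
def example_roundtrip_single_usl():
    from ieml.ieml_database import GitInterface, IEMLDatabase

    gitdb = GitInterface(origin='https://github.com/plevyieml/ieml-language.git')
    gitdb.pull()
    db = IEMLDatabase(folder=gitdb.folder)

    # take one parsed polymorpheme, decompose it into (path, value) pairs,
    # then rebuild it and check the round-trip is the identity
    u = db.list(type=PolyMorpheme, parse=True)[0]
    paths = list(u.iter_structure_path_by_script_ss())
    rebuilt = usl_from_path_values(paths)
    assert str(rebuilt) == str(u)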
def update_all_ieml(self, f, message: str):
    db = IEMLDatabase(folder=self.gitdb.folder,
                      use_cache=self.use_cache,
                      cache_folder=self.cache_folder)
    desc = db.get_descriptors()

    with self.gitdb.commit(self.signature,
                           '[IEML migration] Update all ieml in db: {}'.format(message)):
        for old_ieml in tqdm.tqdm(db.list(parse=True), "Migrate all usls"):
            new_ieml = f(old_ieml)
            value = desc.get_values_partial(old_ieml)
            db.remove_descriptor(old_ieml, None, None)

            # re-attach every descriptor of the old USL to the migrated USL
            for l in LANGUAGES:
                for d in value[l]:
                    for e in value[l][d]:
                        db.add_descriptor(new_ieml, l, d, e)
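# A hedged usage sketch for update_all_ieml above: `f` receives each parsed USL
# and must return the migrated USL. Both the instance name `migrator` and the
# `normalize` helper are hypothetical, shown only to illustrate the expected
# call shape (USL in, USL out).
def normalize(old_ieml):
    # identity migration: return the USL unchanged (re-serialisation only)
    return old_ieml

# migrator is assumed to be an instance of the class that defines update_all_ieml
# migrator.update_all_ieml(normalize, "re-serialise all USLs")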
# instantiate an ieml.ieml_database.IEMLDatabase from the downloaded git repository
db = IEMLDatabase(folder=gitdb.folder)

# usls = db.list(parse=True, type='word')
#
# parsed_usls = list()
# for e in tqdm(usls):
#     parsed_usls.append(get_word_structure(e))
#
# with bz2.open(WORDS_FILENAME + ".bz2", "wt") as fout:
#     json.dump(parsed_usls, fout, indent=2)

descriptors = db.get_descriptors()
usls = db.list()

translations = list()
for e in tqdm(usls):
    assert (e not in translations)
    tr_dict = dict()
    values = descriptors.get_values_partial(e)
    for (usl, lang, label), tr_list in values.items():
        assert (usl == e)
        if label == "translations":
            assert (lang not in tr_dict)
            tr_dict[lang] = tr_list
    translations.append({"usl": e, "translations": tr_dict})

with bz2.open(DICTIONARY_FILENAME + ".bz2", "wt") as fout:
    json.dump(translations, fout, indent=2)
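# A small sketch for reading the dump written above back into memory, using the
# same DICTIONARY_FILENAME constant; handy as a quick sanity check of the export.
import bz2
import json

with bz2.open(DICTIONARY_FILENAME + ".bz2", "rt") as fin:
    entries = json.load(fin)

# each entry has the shape {"usl": <str>, "translations": {<lang>: [<str>, ...]}}
print(len(entries), "usls exported")
print(entries[0]["usl"], entries[0]["translations"].get("en", []))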
gitdb.pull()

signature = pygit2.Signature("Louis van Beurden", "*****@*****.**")

db = IEMLDatabase(folder=folder, use_cache=False)
desc = db.get_descriptors()
struct = db.get_structure()

to_migrate = {}
to_remove = []

parser = IEMLParser(dictionary=db.get_dictionary())
all_db = db.list()
# assert "[E:.b.E:B:.- E:S:. ()(a.T:.-) > ! E:.l.- ()(d.i.-l.i.-')]" in all_db

for s in TO_REMOVE:
    to_pass = True
    try:
        _s = parser.parse(s)
    except CannotParse as e:
        print(str(e))
        print("\t", str(s))
        to_pass = False
    else:
        if s not in all_db:
            print("{} not in database".format(s))
        else:
# gitdb.pull()

signature = pygit2.Signature("Louis van Beurden", "*****@*****.**")

db = IEMLDatabase(folder=folder, use_cache=False)
desc = db.get_descriptors()
struct = db.get_structure()

to_migrate = {}
to_remove = []

parser = IEMLParser(dictionary=db.get_dictionary())
all_db = db.list(type="word")
# assert "[E:.b.E:B:.- E:S:. ()(a.T:.-) > ! E:.l.- ()(d.i.-l.i.-')]" in all_db

for s in all_db:
    to_pass = True
    try:
        _s = parser.parse(s)
    except CannotParse as e:
        print(str(e))
        print("\t", str(s))
        to_pass = False
    else:
        if not isinstance(_s, Word):
            print("!!! Not a word", _s)
            continue
class IemlData:
    """ Modify the ieml data to output a vector representation """

    def __init__(self, input_database_folder_path, out_file_path=None):
        from ieml.ieml_database import IEMLDatabase

        # input database
        self.database = IEMLDatabase(folder=input_database_folder_path)

        # output files (only opened when an output path is given)
        if out_file_path is not None:
            with open(out_file_path, "w") as output_file:
                output_file.write("")
            self.vocab_file_path = "{0}.vocab".format(
                out_file_path.replace(".tsv", "").replace(".csv", ""))
            self.vocab_file = open(self.vocab_file_path, "a")
            self.out_file_path = out_file_path
            self.out_file = open(out_file_path, "a")

    def close_all(self):
        try:
            self.vocab_file.close()
        except AttributeError:
            pass
        try:
            self.out_file.close()
        except AttributeError:
            pass

    def get_word_objects(self):
        return self.database.list(parse=False, type='word')

    def list_polymorpheme_of_word(self, w):
        # WORKAROUND (to be solved, then removed): skip this malformed USL string
        if w == "[! E:B:. ()(k.a.-k.a.-' l.o.-k.o.-') > E:.f.- ()(p.E:A:T:.-)] [>role>E:B:.>content>constant>k.a.-k.a.-'":
            return []

        w = usl(w)
        assert isinstance(w, Word)
        polyList = []
        for sfun in w.syntagmatic_fun.actors.values():
            if sfun.actor is not None:
                polyList.append((sfun.actor.pm_content, sfun.actor.pm_flexion))
        # return list(chain.from_iterable((sfun.actor.pm_content, sfun.actor.pm_flexion)
        #                                 for sfun in w.syntagmatic_fun.actors.values()))
        # encounters AttributeError: 'NoneType' object has no attribute 'pm_content'
        # since sfun.actor can be None
        return polyList

    def get_natural_lang_meanings(self, lang="en"):
        nl_meanings = []
        descriptors = self.database.get_descriptors()
        for word in self.get_word_objects():
            word_nl_meanings = []

            # get the meaning of the word
            desc_w_vals = descriptors.get_values_partial(word)
            for (usl_w, language_w, label_w), tr_w_list in desc_w_vals.items():
                if language_w == lang and label_w == "translations":
                    word_nl_meanings.append([" , ".join(tr_w_list)])

            # separate the word from its polymorphemes
            word_nl_meanings.append([" : "])

            # get the meaning of each polymorpheme
            polymorphemes = self.list_polymorpheme_of_word(word)
            for polymorph in polymorphemes:
                for poly in polymorph:
                    desc_p_vals = descriptors.get_values_partial(poly)
                    for (usl_p, language_p, label_p), tr_p_list in desc_p_vals.items():
                        if language_p == lang and label_p == "translations":
                            word_nl_meanings.append(tr_p_list)

            nl_meanings.append(word_nl_meanings)
        return nl_meanings

    def get_bert_emb(self, string, bert_class):
        bert_class = bert_class if bert_class is not None else BertEmbedd()
        return bert_class.bert([string])

    def make_bert_emb_list(self, lang="en", bert_class=None, dump=False):
        bert_class = bert_class if bert_class is not None else BertEmbedd()
        bert_embeddings = []
        for ieml_pm_in_nl in self.get_natural_lang_meanings(lang):
            ieml_w_pm_sent = " ".join(
                [" ".join(pm) for pm in ieml_pm_in_nl if len(pm) != 0])
            # yield self.get_bert_emb(ieml_pm_in_nl, bert_class)
            bert_embeddings.append(self.get_bert_emb(ieml_w_pm_sent, bert_class))

        # dump the ieml sentence embeddings
        if dump is not False:
            for bert_emb in bert_embeddings:
                for (bert_vocab, bert_vect) in bert_emb:
                    bert_vect = np.array(bert_vect)
                    self.vocab_file.write("{0}\n".format(json.dumps(bert_vocab)))
                    try:
                        self.out_file.write("{0}\n".format(json.dumps(bert_vect)))
                    except TypeError:
                        self.out_file.write("{0}\n".format(bert_vect.dumps()))
                    # numpy.save(self.out_file, sent_emb)
            self.close_all()

        return bert_embeddings
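# A hedged usage sketch for IemlData, assuming a local clone of the ieml-language
# repository; the folder path and output filename are placeholders, and BertEmbedd
# is the external embedding wrapper already referenced in make_bert_emb_list.
#
# data = IemlData("/path/to/ieml-language", out_file_path="ieml_word_emb.tsv")
# meanings = data.get_natural_lang_meanings(lang="en")        # nested lists of translations
# embeddings = data.make_bert_emb_list(lang="en", dump=True)  # also writes the .tsv and .vocab files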