Exemplo n.º 1
0
    def test_enumerate_variations_all_db(self):
        """Run enumerate_partitions over every Word, PolyMorpheme and Lexeme
        of the live ieml-language database (smoke test: must not raise)."""
        git = GitInterface(
            origin='https://github.com/plevyieml/ieml-language.git')
        git.pull()
        database = IEMLDatabase(folder=git.folder)

        # Collect all USLs of the three types, in the same order as before.
        all_usls = []
        for usl_type in (Word, PolyMorpheme, Lexeme):
            all_usls.extend(database.list(type=usl_type, parse=True))

        for item in tqdm.tqdm(all_usls):
            dim, partitions = enumerate_partitions(item)
Exemplo n.º 2
0
    def test_expand_compose_into_paths(self):
        """Round-trip check: decompose each USL into structure paths, rebuild
        it with usl_from_path_values, and require an identical string form."""
        git = GitInterface(
            origin='https://github.com/plevyieml/ieml-language.git')
        git.pull()
        database = IEMLDatabase(folder=git.folder)

        # Same three USL types, same concatenation order as the original.
        all_usls = []
        for usl_type in (Word, PolyMorpheme, Lexeme):
            all_usls.extend(database.list(type=usl_type, parse=True))

        for item in tqdm.tqdm(all_usls):
            paths = list(item.iter_structure_path_by_script_ss())
            rebuilt = usl_from_path_values(paths)
            self.assertEqual(str(item), str(rebuilt),
                             "expand_compose_into_paths failed on: " + str(item))
Exemplo n.º 3
0
    def update_all_ieml(self, f, message: str):
        """Apply *f* to every USL in the database and migrate its descriptors.

        For each USL: compute the replacement ``f(old_ieml)``, remove every
        descriptor attached to the old USL, then re-attach the saved
        descriptor values to the new USL. Everything happens inside a single
        git commit on ``self.gitdb``.

        :param f: callable mapping an old USL object to its replacement.
        :param message: text appended to the migration commit message.
        """
        db = IEMLDatabase(folder=self.gitdb.folder,
                          use_cache=self.use_cache,
                          cache_folder=self.cache_folder)
        desc = db.get_descriptors()

        with self.gitdb.commit(
                self.signature,
                '[IEML migration] Update all ieml in db: {}'.format(message)):

            for old_ieml in tqdm.tqdm(db.list(parse=True), "Migrate all usls"):
                new_ieml = f(old_ieml)

                # Snapshot the descriptors before wiping them from the old USL.
                value = desc.get_values_partial(old_ieml)

                # None/None acts as a wildcard: drop all languages/descriptors.
                db.remove_descriptor(old_ieml, None, None)

                for l in LANGUAGES:
                    for d in value[l]:
                        # BUG FIX: the original read `value[l][e]`, but `e`
                        # was not yet bound at that point (NameError on the
                        # first iteration). The values belonging to the
                        # current descriptor key `d` are `value[l][d]`.
                        for e in value[l][d]:
                            db.add_descriptor(new_ieml, l, d, e)
Exemplo n.º 4
0
# instanciate a ieml.ieml_database.IEMLDatabase from the downloaded git repository
db = IEMLDatabase(folder=gitdb.folder)

# usls = db.list(parse=True, type='word')
#
# parsed_usls = list()
# for e in tqdm(usls):
#     parsed_usls.append(get_word_structure(e))
#
# with bz2.open(WORDS_FILENAME + ".bz2", "wt") as fout:
#     json.dump(parsed_usls, fout, indent=2)

descriptors = db.get_descriptors()

usls = db.list()

# Collect, for every USL, its per-language "translations" descriptor values.
translations = list()
seen_usls = set()  # duplicate guard over the raw USLs
for e in tqdm(usls):
    # BUG FIX: the original asserted `e not in translations`, but
    # `translations` holds dicts, so a USL could never compare equal to an
    # entry — the duplicate check was vacuous (and O(n) per item). Track the
    # USLs themselves in a set instead.
    assert e not in seen_usls
    seen_usls.add(e)
    tr_dict = dict()
    values = descriptors.get_values_partial(e)
    for (usl, lang, label), tr_list in values.items():
        assert usl == e
        if label == "translations":
            # Each language must appear at most once per USL.
            assert lang not in tr_dict
            tr_dict[lang] = tr_list
    translations.append({"usl": e, "translations": tr_dict})

with bz2.open(DICTIONARY_FILENAME + ".bz2", "wt") as fout:
    json.dump(translations, fout, indent=2)
Exemplo n.º 5
0
    gitdb.pull()

    # NOTE(review): fragment — the enclosing function's `def` and the tail of
    # the `else:` branch below are not visible in this chunk.
    signature = pygit2.Signature("Louis van Beurden",
                                 "*****@*****.**")

    # use_cache=False: always read the checkout fresh during migration.
    db = IEMLDatabase(folder=folder, use_cache=False)

    desc = db.get_descriptors()
    struct = db.get_structure()

    to_migrate = {}
    to_remove = []

    parser = IEMLParser(dictionary=db.get_dictionary())

    all_db = db.list()
    # assert "[E:.b.E:B:.- E:S:. ()(a.T:.-) > ! E:.l.- ()(d.i.-l.i.-')]" in all_db
    for s in TO_REMOVE:
        to_pass = True

        try:
            _s = parser.parse(s)
        except CannotParse as e:
            print(str(e))
            print("\t", str(s))
            to_pass = False
        else:
            if s not in all_db:
                # NOTE(review): `repr(...)` discards its result — this was
                # presumably meant to be `print(...)`; confirm intent.
                repr("{} not in database".format(s))

            else:
Exemplo n.º 6
0
    # gitdb.pull()

    # NOTE(review): fragment — the enclosing function's `def` and the rest of
    # the loop body after `continue` are not visible in this chunk.
    signature = pygit2.Signature("Louis van Beurden",
                                 "*****@*****.**")

    # use_cache=False: always read the checkout fresh during migration.
    db = IEMLDatabase(folder=folder, use_cache=False)

    desc = db.get_descriptors()
    struct = db.get_structure()

    to_migrate = {}
    to_remove = []

    parser = IEMLParser(dictionary=db.get_dictionary())

    all_db = db.list(type="word")
    # assert "[E:.b.E:B:.- E:S:. ()(a.T:.-) > ! E:.l.- ()(d.i.-l.i.-')]" in all_db
    for s in all_db:
        to_pass = True

        try:
            _s = parser.parse(s)
        except CannotParse as e:
            print(str(e))
            print("\t", str(s))
            to_pass = False
        else:
            # Only Word instances are migrated; anything else is reported
            # and skipped.
            if not isinstance(_s, Word):
                print("!!! Not a word", _s)
                continue
class IemlData:
    """Turn IEML database entries into vector representations.

    Reads words from an IEMLDatabase checkout, resolves their natural-language
    translations, and (optionally) dumps BERT embeddings of those translations
    to a pair of output files (``.vocab`` + the embeddings file).
    """
    def __init__(self, input_database_folder_path, out_file_path=None):
        """Open the database and, if requested, the two output files.

        :param input_database_folder_path: folder of a local ieml-language
            checkout, passed straight to IEMLDatabase.
        :param out_file_path: embeddings output path; when given the file is
            truncated and a sibling ``<base>.vocab`` file is opened. When
            None, no output attributes are created (close_all tolerates this).
        """
        from ieml.ieml_database import IEMLDatabase
        # input file
        self.database = IEMLDatabase(folder=input_database_folder_path)
        # output file
        if out_file_path is not None:
            # Truncate any previous content, then re-open in append mode so
            # later writes accumulate across calls.
            with open(out_file_path, "w") as output_file:
                output_file.write("")
            # Vocab file shares the base name with .tsv/.csv stripped.
            self.vocab_file_path = "{0}.vocab".format(
                out_file_path.replace(".tsv", "").replace(".csv", ""))
            self.vocab_file = open(self.vocab_file_path, "a")
            self.out_file_path = out_file_path
            self.out_file = open(out_file_path, "a")

    def close_all(self):
        """Close both output files; a no-op when they were never opened
        (out_file_path=None leaves the attributes undefined)."""
        try:
            self.vocab_file.close()
        except AttributeError:
            pass
        try:
            self.out_file.close()
        except AttributeError:
            pass

    def get_word_objects(self):
        """Return the database's word entries, unparsed (raw strings)."""
        return self.database.list(parse=False, type='word')

    def list_polymorpheme_of_word(self, w):
        """Return [(pm_content, pm_flexion), ...] for each bound actor of *w*.

        :param w: a word USL (string form accepted; parsed via ``usl``).
        :raises AssertionError: if *w* does not parse to a Word.
        """
        # HACK: this one malformed USL breaks parsing — skip it outright.
        # TODO: remove once the entry is fixed upstream.
        if w == "[! E:B:. ()(k.a.-k.a.-' l.o.-k.o.-') > E:.f.- ()(p.E:A:T:.-)] [>role>E:B:.>content>constant>k.a.-k.a.-'":
            return []
        ##################################################################
        w = usl(w)
        assert isinstance(w, Word)
        polyList = []
        # Actors may be None; a flat comprehension over all actors would hit
        # AttributeError, hence the explicit guard (see note below).
        for sfun in w.syntagmatic_fun.actors.values():
            if sfun.actor is not None:
                polyList.append((sfun.actor.pm_content, sfun.actor.pm_flexion))
        # return list(chain.from_iterable((sfun.actor.pm_content, sfun.actor.pm_flexion)
        #                                 for sfun in w.syntagmatic_fun.actors.values())) # encounteres AttributeError: 'NoneType' object has no attribute 'pm_content' since sfun.actor can be None
        return polyList

    def get_natural_lang_meanings(self, lang="en"):
        """Build, per word, a list of natural-language meaning fragments.

        Each element is: the word's own translations, a " : " separator,
        then the translations of each of its polymorphemes.

        :param lang: descriptor language code to keep (default "en").
        """
        nl_meanings = []
        descriptors = self.database.get_descriptors()
        for word in self.get_word_objects():
            word_nl_meanings = []
            # get meaning of word
            desc_w_vals = descriptors.get_values_partial(word)
            for (usl_w, language_w, label_w), tr_w_list in desc_w_vals.items():
                if language_w == lang and label_w == "translations":
                    word_nl_meanings.append([" , ".join(tr_w_list)])
            # divide the words form the polymorphemes
            word_nl_meanings.append([" : "])
            # get meaning of polymorpheme
            polymorphemes = self.list_polymorpheme_of_word(word)
            for polymorph in polymorphemes:
                # polymorph is a (pm_content, pm_flexion) pair; look up
                # translations for each half independently.
                for poly in polymorph:
                    desc_p_vals = descriptors.get_values_partial(poly)
                    for (usl_p, language_p,
                         label_p), tr_p_list in desc_p_vals.items():
                        if language_p == lang and label_p == "translations":
                            word_nl_meanings.append(tr_p_list)
            nl_meanings.append(word_nl_meanings)
        return nl_meanings

    def get_bert_emb(self, string, bert_class):
        """Embed *string* with *bert_class* (a fresh BertEmbedd if None)."""
        bert_class = bert_class if bert_class is not None else BertEmbedd()
        return bert_class.bert([string])

    def make_bert_emb_list(self, lang="en", bert_class=None, dump=False):
        """Embed every word's natural-language meaning sentence with BERT.

        :param lang: descriptor language to use for the meanings.
        :param bert_class: reusable embedder; a new BertEmbedd when None.
        :param dump: when truthy, write vocab/vectors to the output files.
        :returns: the list of per-sentence embeddings.
        """
        bert_class = bert_class if bert_class is not None else BertEmbedd()
        bert_embeddings = []
        for ieml_pm_in_nl in self.get_natural_lang_meanings(lang):
            # Flatten the meaning fragments into one space-joined sentence,
            # dropping empty fragments.
            ieml_w_pm_sent = " ".join(
                [" ".join(pm) for pm in ieml_pm_in_nl if len(pm) != 0])
            # yield self.get_bert_emb(ieml_pm_in_nl, bert_class)
            bert_embeddings.append(
                self.get_bert_emb(ieml_w_pm_sent, bert_class))
        # dump ieml the sentence embeddings
        if dump is not False:
            # dump the embeddings
            for bert_emb in bert_embeddings:
                for (bert_vocab, bert_vect) in bert_emb:
                    bert_vect = np.array(bert_vect)
                    self.vocab_file.write("{0}\n".format(
                        json.dumps(bert_vocab)))
                    try:
                        # ndarray is not JSON-serializable, so this raises
                        # TypeError and falls back to the pickle-bytes form
                        # below — NOTE(review): confirm readers expect that.
                        self.out_file.write("{0}\n".format(
                            json.dumps(bert_vect)))
                    except TypeError:
                        self.out_file.write("{0}\n".format(bert_vect.dumps()))
            # numpy.save(self.out_file, sent_emb)
        self.close_all()
        return bert_embeddings