def main():
    """Build hypernym gold-standard TSV datasets from two WordNet versions.

    Command-line interface (positional):
        1. WordNet path  — directory containing 'WN<version>' subdirectories
        2. output path   — directory where the TSV files are written
        3. old version   — e.g. "2.0"
        4. new version   — optional, defaults to "3.0"

    Raises:
        Exception: if fewer than three positional arguments are supplied.
    """
    # python en_dataset_creation.py ../../datasets/WNs ../../datasets/en/ 2.0 3.0

    # sys.argv[3] is read unconditionally below, so argv must hold at least
    # 4 entries (program name + 3 args); the original `< 3` check let an
    # IndexError through when the third argument was missing.
    if len(sys.argv) < 4:
        raise Exception(
            "The following arguments are required:<WordNet path> <output_path> <old_version_float> <new_version_float>"
        )

    path = sys.argv[1]
    out_path = sys.argv[2]
    old_version = sys.argv[3]

    # Optional fourth argument; `>= 5` (rather than `== 5`) so extra trailing
    # arguments do not silently revert the version to the default.
    new_version = sys.argv[4] if len(sys.argv) >= 5 else "3.0"

    wn2 = WordNetCorpusReader(os.path.join(path, 'WN' + old_version), None)
    wn3 = WordNetCorpusReader(os.path.join(path, 'WN' + new_version), None)

    for pos in ['nouns', 'verbs']:
        tag = pos[0]  # POS tag expected by the readers: 'n' or 'v'
        synsets_old = set(wn2.all_synsets(tag))
        synsets_new = set(wn3.all_synsets(tag))

        # Synsets present in both versions serve as the reference inventory;
        # synsets only in the new version contribute the novel lemmas.
        reference = synsets_new.intersection(synsets_old)
        new_lemmas = extract_new_lemmas(synsets_new.difference(synsets_old),
                                        wn2, tag)
        hypernyms = generate_gold(new_lemmas, wn3, reference, tag)

        print(f"Len {pos} {len(hypernyms)}")
        save(dict(hypernyms), out_path,
             f"{pos}_en.{old_version}-{new_version}.tsv")
Exemplo n.º 2
0
class TestTransform(unittest.TestCase):
    """Smoke tests for WordNetCorpusReader over several extracted wordnets.

    Each test passes simply by completing without raising an exception.
    """

    @classmethod
    def setUpClass(cls):
        # Unpack one wordnet tarball per language into a hidden directory.
        cls.languages = ["cat", "eng", "eus", "glg", "spa"]
        cls.wn_names = {lang: '.wordnet_' + lang for lang in cls.languages}
        for lang, target in cls.wn_names.items():
            with tarfile.open('wordnet_' + lang + '.tar.gz') as archive:
                archive.extractall(target)

    def test_all_synsets(self):
        # Iterating every synset also verifies that all synsets in the data
        # files are present in the index files.
        self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
        for _synset in self.wncr.all_synsets():
            pass
        # success if there is no error

    def test_invalid_literal_for_int_16(self):
        # Regression test: looking up "agudeza" used to raise
        # "invalid literal for int() with base 16".
        self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
        for _synset in self.wncr.synsets("agudeza"):
            pass
        # success if there is no error

    def test_key_error(self):
        # Regression test: antonym lookup used to raise KeyError.
        self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
        self.wncr.lemma("menor.a.09.menor").antonyms()
        # success if there is no error

    def test_load_wordnet(self):
        # Constructing a reader for every language must not raise.
        for lang in self.languages:
            self.wncr = WordNetCorpusReader(self.wn_names[lang], None)

    @classmethod
    def tearDownClass(cls):
        # Remove every extraction directory created in setUpClass.
        for target in cls.wn_names.values():
            shutil.rmtree(target)
Exemplo n.º 3
0
class TestTransform(unittest.TestCase):
    """Exception-free smoke tests against locally extracted wordnet corpora.

    A test succeeds when the exercised reader operation raises nothing.
    """

    @classmethod
    def setUpClass(cls):
        # Extract each language's tarball into its own hidden directory.
        cls.languages = ["cat", "eng", "eus", "glg", "spa"]
        cls.wn_names = {}
        for lang in cls.languages:
            destination = '.wordnet_' + lang
            cls.wn_names[lang] = destination
            with tarfile.open('wordnet_' + lang + '.tar.gz') as archive:
                archive.extractall(destination)

    def test_all_synsets(self):
        # Walking the full synset inventory also checks that every synset in
        # the data files appears in the index files.
        self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
        for _ in self.wncr.all_synsets():
            pass
        # success if there is no error

    def test_invalid_literal_for_int_16(self):
        # Regression check for a historical hex-parsing failure on lookup.
        self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
        for _ in self.wncr.synsets("agudeza"):
            pass
        # success if there is no error

    def test_key_error(self):
        # Regression check: antonym resolution must not raise KeyError.
        self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
        self.wncr.lemma("menor.a.09.menor").antonyms()
        # success if there is no error

    def test_load_wordnet(self):
        # Reader construction must succeed for every supported language.
        for lang in self.languages:
            self.wncr = WordNetCorpusReader(self.wn_names[lang], None)

    @classmethod
    def tearDownClass(cls):
        # Clean up the directories produced by setUpClass.
        for lang in cls.languages:
            shutil.rmtree(cls.wn_names[lang])
Exemplo n.º 4
0
#!/usr/bin/env python
"""Print (lemma name, lexicographer file name) for every synset in a wordnet.

Usage: script.py <wordnet_dict_dir>
"""

import sys

from nltk.corpus import WordNetCorpusReader

dict_dir = sys.argv[1]

# Pass None as the omw_reader argument, matching how WordNetCorpusReader is
# constructed in the other snippets of this file (the modern NLTK constructor
# takes two arguments).
wn = WordNetCorpusReader(dict_dir, None)

for synset in wn.all_synsets():
    # NOTE(review): converted from Python 2 `print` statement; lemmas(),
    # name() and lexname() are methods in NLTK >= 3.0, consistent with the
    # method-call style used elsewhere in this file.
    for lem in synset.lemmas():
        print(lem.name(), synset.lexname())
Exemplo n.º 5
0
    # with open('gloss17_idfs.pickle', 'wb') as f:
    #     pickle.dump(dic, f)

    with open('gloss_idfs.pickle', 'rb') as f:
        dic = pickle.load(f)
    print(dic['to'])
    with open('gloss17_idfs.pickle', 'rb') as f:
        idfs = pickle.load(f)

    model = word2vec.Word2Vec.load_word2vec_format(
        "../word2vec/models/GoogleNews-vectors-negative300.bin", binary=True)

    vec_dict = {}
    index = 0

    for synset in WN17.all_synsets():
        vec = 0
        gloss = synset.definition()
        gloss = gloss.replace(";", "").replace("(", "").replace(
            ")", "").replace(":", "").replace('"', "").replace("'",
                                                               "").lower()

        gloss_words = gloss.split(" ")
        for gw in gloss_words:
            if gw in model.wv.vocab:
                if vec is 0:
                    vec = idfs[gw] * model[gw]
                else:
                    vec += idfs[gw] * model[gw]

        vec_dict[synset.name()] = vec