def main():
    # python en_dataset_creation.py ../../datasets/WNs ../../datasets/en/ 2.0 3.0
    """Build English hypernym-discovery gold datasets by diffing two WordNet versions.

    Command-line arguments:
        sys.argv[1]: root directory containing the WordNet releases (WN<version>/).
        sys.argv[2]: output directory for the generated .tsv files.
        sys.argv[3]: old WordNet version (e.g. "2.0").
        sys.argv[4]: (optional) new WordNet version; defaults to "3.0".

    Raises:
        Exception: if fewer than three positional arguments are supplied.
    """
    # BUG FIX: the guard used to be `< 3`, but sys.argv[3] is read
    # unconditionally below, so two-argument invocations crashed with
    # IndexError instead of this usage message. Three args are mandatory.
    if len(sys.argv) < 4:
        raise Exception(
            "The following arguments are required:<WordNet path> <output_path> <old_version_float> <new_version_float>"
        )
    path = sys.argv[1]
    out_path = sys.argv[2]
    old_version = sys.argv[3]
    # The new version is optional and defaults to 3.0.
    new_version = sys.argv[4] if len(sys.argv) == 5 else "3.0"

    # Releases are expected in subdirectories named WN<version>, e.g. WN2.0.
    wn2 = WordNetCorpusReader(os.path.join(path, 'WN' + old_version), None)
    wn3 = WordNetCorpusReader(os.path.join(path, 'WN' + new_version), None)
    for pos in ['nouns', 'verbs']:
        # pos[0] gives the single-letter POS tag ('n' / 'v') the reader expects.
        synsets_2n = set(wn2.all_synsets(pos[0]))
        synsets_3n = set(wn3.all_synsets(pos[0]))
        # Synsets present in both versions anchor the gold hypernyms;
        # synsets only in the new version contribute the novel lemmas.
        reference_nouns = synsets_3n.intersection(synsets_2n)
        new = extract_new_lemmas(synsets_3n.difference(synsets_2n), wn2, pos[0])
        hypernyms = generate_gold(new, wn3, reference_nouns, pos[0])
        print(f"Len {pos} {len(hypernyms)}")
        save(dict(hypernyms), out_path, f"{pos}_en.{old_version}-{new_version}.tsv")
class TestTransform(unittest.TestCase):
    """Smoke tests: the WordNet corpus reader loads every bundled language."""

    @classmethod
    def setUpClass(cls):
        # Unpack each language's tarball into its own hidden work directory.
        cls.languages = ["cat", "eng", "eus", "glg", "spa"]
        cls.wn_names = {lang: '.wordnet_' + lang for lang in cls.languages}
        for lang, dest in cls.wn_names.items():
            with tarfile.open('wordnet_' + lang + '.tar.gz') as archive:
                archive.extractall(dest)

    def test_all_synsets(self):
        # Walking every synset also checks that each synset in the data
        # files is present in the index files. Success = no exception.
        self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
        for synset in self.wncr.all_synsets():
            a = synset

    def test_invalid_literal_for_int_16(self):
        # Regression test named after an "invalid literal for int() with
        # base 16" failure previously seen while parsing this entry
        # (a Spanish data line with lemma count 0x0a). Success = no exception.
        self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
        for synset in self.wncr.synsets("agudeza"):
            a = synset

    def test_key_error(self):
        # Regression: antonym lookup on this lemma used to raise KeyError.
        # Success = no exception.
        self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
        self.wncr.lemma("menor.a.09.menor").antonyms()

    def test_load_wordnet(self):
        # Constructing the reader must succeed for every bundled language.
        for lang in self.languages:
            self.wncr = WordNetCorpusReader(self.wn_names[lang], None)

    @classmethod
    def tearDownClass(cls):
        # Remove the directories extracted in setUpClass.
        for lang in cls.languages:
            shutil.rmtree(cls.wn_names[lang])
class TestTransform(unittest.TestCase):
    """Load-tests for the multilingual WordNet corpus reader."""

    @classmethod
    def setUpClass(cls):
        # Extract the per-language WordNet archives into hidden directories
        # so each test can point the reader at a plain dict tree.
        cls.languages = ["cat", "eng", "eus", "glg", "spa"]
        cls.wn_names = {}
        for language in cls.languages:
            extract_dir = '.wordnet_' + language
            cls.wn_names[language] = extract_dir
            with tarfile.open('wordnet_' + language + '.tar.gz') as tar:
                tar.extractall(extract_dir)

    def test_all_synsets(self):
        # Iterating every synset doubles as a consistency check that all
        # synsets in the data files appear in the index files.
        # Passing = no exception raised.
        self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
        for synset in self.wncr.all_synsets():
            a = synset

    def test_invalid_literal_for_int_16(self):
        # Regression for a historical "invalid literal for int() with
        # base 16" parse failure on this Spanish entry ("agudeza", whose
        # data line carries a hex lemma count of 0a). Passing = no exception.
        self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
        for synset in self.wncr.synsets("agudeza"):
            a = synset

    def test_key_error(self):
        # Regression: this antonym lookup previously raised KeyError.
        # Passing = no exception.
        self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
        self.wncr.lemma("menor.a.09.menor").antonyms()

    def test_load_wordnet(self):
        # Every bundled language must load without error.
        for language in self.languages:
            self.wncr = WordNetCorpusReader(self.wn_names[language], None)

    @classmethod
    def tearDownClass(cls):
        # Clean up the trees unpacked during setUpClass.
        for language in cls.languages:
            shutil.rmtree(cls.wn_names[language])
#!/usr/bin/env python
"""Dump every lemma name with its synset's lexicographer-file name.

Usage: script.py <wordnet_dict_dir>
"""
import sys

from nltk.corpus import WordNetCorpusReader

dict_dir = sys.argv[1]
# NOTE(fix): pass None for the second (omw) reader argument, matching how
# WordNetCorpusReader is constructed everywhere else in this project.
wn = WordNetCorpusReader(dict_dir, None)
for synset in wn.all_synsets():
    # NOTE(fix): modernized from Python 2 — `print` statement replaced with
    # print(); in NLTK 3.x lemmas/name/lexname are methods, not attributes.
    for lem in synset.lemmas():
        print(lem.name(), synset.lexname())
# One-off step: build an IDF-weighted bag-of-words embedding for every
# WordNet 1.7 synset gloss using pretrained word2vec vectors.
# (The idf pickle below was produced by an earlier dump step:
#   with open('gloss17_idfs.pickle', 'wb') as f: pickle.dump(dic, f)  )
# NOTE(review): pickle.load on these local cache files is fine only as long
# as they are trusted, locally-generated artifacts.
with open('gloss_idfs.pickle', 'rb') as f:
    dic = pickle.load(f)
print(dic['to'])
with open('gloss17_idfs.pickle', 'rb') as f:
    idfs = pickle.load(f)

model = word2vec.Word2Vec.load_word2vec_format(
    "../word2vec/models/GoogleNews-vectors-negative300.bin", binary=True)

# Strip gloss punctuation in a single C-level pass instead of five chained
# str.replace() calls. Removes the same characters as the original: ; ( ) : " '
_PUNCT_TABLE = str.maketrans("", "", ";():\"'")

vec_dict = {}
for synset in WN17.all_synsets():
    gloss = synset.definition().translate(_PUNCT_TABLE).lower()
    # BUG FIX: the original tested `if vec is 0`, an identity comparison with
    # an int literal that only works by accident of CPython's small-int
    # caching. Use None as the explicit "nothing accumulated yet" sentinel.
    vec = None
    for gw in gloss.split(" "):
        if gw in model.wv.vocab:
            weighted = idfs[gw] * model[gw]
            vec = weighted if vec is None else vec + weighted
    # Preserve original behavior: synsets whose gloss has no in-vocabulary
    # words are stored with the integer 0, not a zero vector.
    vec_dict[synset.name()] = 0 if vec is None else vec