def test_conversion(self):
    """Check that the files written by ``word2vec2tensor`` match the source model.

    Converts ``self.datapath`` and then verifies that:

    * the metadata file and the tensor file both have as many rows as the
      word count parsed from the first line of ``self.datapath``;
    * every tensor row has as many components as the vector size parsed
      from that same line;
    * every tensor row equals, to 5 decimals, the vector that the original
      ``KeyedVectors`` model stores for the word on the matching metadata row.
    """
    word2vec2tensor(word2vec_model_path=self.datapath, tensor_filename=self.output_folder)

    with utils.open(self.metadata_file, 'rb') as f:
        metadata = f.readlines()

    with utils.open(self.tensor_file, 'rb') as f:
        vectors = f.readlines()

    # The first line of the word2vec file is parsed as "<number of words> <vector size>".
    with utils.open(self.datapath, 'rb') as f:
        first_line = f.readline().strip()

    number_words, vector_size = map(int, first_line.split(b' '))
    self.assertTrue(
        len(metadata) == len(vectors) == number_words,
        ('Metadata file %s and tensor file %s imply different number of rows.'
         % (self.metadata_file, self.tensor_file)),
    )

    # Normalise the rows read from the written files: drop surrounding
    # whitespace from the words, and make the vector rows space-separated.
    metadata = [word.strip() for word in metadata]
    vectors = [vector.replace(b'\t', b' ') for vector in vectors]

    # Load the original vectors as a KeyedVectors model.
    orig_model = KeyedVectors.load_word2vec_format(self.datapath, binary=False)

    # Check that the KV model and the tensor file hold the same values key-wise.
    for word, vector in zip(metadata, vectors):
        word_string = word.decode("utf8")
        vector_string = vector.decode("utf8")
        vector_array = np.array(list(map(float, vector_string.split())))
        # Make the dimensionality check explicit instead of relying on the
        # shape comparison hidden inside assert_almost_equal.
        self.assertEqual(
            len(vector_array), vector_size,
            'Tensor row for word %r has %d components, expected %d.'
            % (word_string, len(vector_array), vector_size),
        )
        np.testing.assert_almost_equal(orig_model[word_string], vector_array, decimal=5)
def testConversion(self):
    """Verify that ``word2vec2tensor`` reproduces the source word2vec model.

    After converting ``self.datapath`` the test asserts that the metadata and
    tensor files contain exactly the number of rows given on the first line
    of ``self.datapath``, that each tensor row has the vector size given on
    that line, and that each row matches (to 5 decimals) the vector the
    original ``KeyedVectors`` model returns for the corresponding word.
    """
    word2vec2tensor(word2vec_model_path=self.datapath, tensor_filename=self.output_folder)

    with smart_open(self.metadata_file, 'rb') as f:
        metadata = f.readlines()

    with smart_open(self.tensor_file, 'rb') as f:
        vectors = f.readlines()

    # The first line of the word2vec file is parsed as "<number of words> <vector size>".
    with smart_open(self.datapath, 'rb') as f:
        first_line = f.readline().strip()

    number_words, vector_size = map(int, first_line.split(b' '))
    self.assertTrue(
        len(metadata) == len(vectors) == number_words,
        ('Metadata file %s and tensor file %s imply different number of rows.'
         % (self.metadata_file, self.tensor_file)),
    )

    # Normalise the rows read from the written files: drop surrounding
    # whitespace from the words, and make the vector rows space-separated.
    metadata = [word.strip() for word in metadata]
    vectors = [vector.replace(b'\t', b' ') for vector in vectors]

    # Load the original vectors as a KeyedVectors model.
    orig_model = KeyedVectors.load_word2vec_format(self.datapath, binary=False)

    # Check that the KV model and the tensor file hold the same values key-wise.
    for word, vector in zip(metadata, vectors):
        word_string = word.decode("utf8")
        vector_string = vector.decode("utf8")
        vector_array = np.array(list(map(float, vector_string.split())))
        # The header's vector size was previously parsed but never checked;
        # assert it explicitly so a dimensionality mismatch is reported clearly.
        self.assertEqual(
            len(vector_array), vector_size,
            'Tensor row for word %r has %d components, expected %d.'
            % (word_string, len(vector_array), vector_size),
        )
        np.testing.assert_almost_equal(orig_model[word_string], vector_array, decimal=5)
def main():
    """Train a Word2Vec model on the cleaned wiki corpus and export its vectors.

    Steps:

    1. Obtain the sentence corpus from ``Wiki().clean_corpora(should_save=True)``.
    2. Build a ``Word2Vec`` model (size 150, window 5, min_count 5) and call
       ``train`` for 50 epochs.
    3. Save the word vectors to ``word2vec.model`` in binary word2vec format.
    4. Convert that file with ``word2vec2tensor`` using the output name
       ``fungi_w2v.tsv``.
    """
    save_name = "word2vec.model"

    wiki_parser = Wiki()
    # Only the first returned value (the sentence corpus) is used here.
    sentence_corpus_, _, _ = wiki_parser.clean_corpora(should_save=True)

    # NOTE(review): the corpus is handed both to the constructor and to
    # train() below; the constructor presumably already builds the vocabulary
    # and runs a first training pass -- confirm the extra 50 epochs are intended.
    model = Word2Vec(sentence_corpus_, size=150, window=5, min_count=5)

    print('Training')
    model.train(
        sentences=sentence_corpus_,
        total_examples=model.corpus_count,
        epochs=50,
        total_words=model.corpus_total_words,
    )

    model.wv.save_word2vec_format(save_name, binary=True)

    print('Plotting')
    print(save_name)

    # The vectors were just written with binary=True, so the converter has to
    # be told to read the file as binary as well.
    word2vec2tensor.word2vec2tensor(save_name, "fungi_w2v.tsv", binary=True)
def main():
    """Train a 50-dimensional Doc2Vec model on an XML corpus and export it.

    Reads the XML path from the required ``-f/--data-file`` argument, collects
    records from it through ``fast_iter`` with ``get_text_and_metadata``,
    trains a ``Doc2Vec`` model on the ``'text'`` field of those records, saves
    the model in word2vec format, converts that file with ``word2vec2tensor``
    under the name ``"chemistry"``, and finally writes
    ``chemistry_metadata.tsv`` with one ``article_title``/``target`` row per
    record below a ``Title``/``Indexed`` header.
    """
    globals.SPLIT_WITH_DATE = False
    globals.VOCAB_LOWERCASE = True
    # Declared but not assigned anywhere in this function.
    global NUM_POS
    global NUM_NEG

    # Command-line handling: the data file is the only option.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "-f", "--data-file",
        help="location of data file",
        required=True,
    )
    cli_args = arg_parser.parse_args()
    globals.XML_FILE = cli_args.data_file
    source_path = globals.XML_FILE
    print("All arguments: ", cli_args)

    # Stream the XML and let the callback append one record per article.
    records = []
    with open(source_path, "rb") as xml_handle:
        xml_events = etree.iterparse(
            xml_handle, events=('start', 'end'), encoding='utf-8')
        fast_iter(xml_events, get_text_and_metadata, records)

    # Only the raw text of each record is fed to the model.
    raw_texts = [str(record['text']) for record in records]
    train_corpus = list(read_corpus(raw_texts))
    print(train_corpus[:2])

    model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=2, iter=3)
    model.build_vocab(train_corpus)
    model.train(
        train_corpus,
        total_examples=model.corpus_count,
        epochs=model.iter,
    )
    print("Training done!")

    # Output name: the data file's base name without extension, plus a suffix.
    base_name = os.path.basename(globals.XML_FILE)
    stem = os.path.splitext(base_name)[0]
    output_file = stem + "_doc2vec_50dim.w2v"
    model.save_word2vec_format(output_file)
    print("Model saved!")

    word2vec2tensor.word2vec2tensor(output_file, "chemistry")

    # NOTE(review): presumably this replaces the metadata file produced by the
    # converter above with per-document rows -- confirm the row count matches
    # the vectors that were exported.
    with open("chemistry_metadata.tsv", "wb") as metadata_out:
        metadata_out.write("Title\tIndexed\n".encode("utf8"))
        for record in records:
            row = record['article_title'] + "\t" + record['target'] + "\n"
            metadata_out.write(row.encode("utf8"))