def build_trained_embeddings(self):
    helper._print_header('Getting word2vec trained on Enron corpus...')
    if not os.path.isdir(directories.WORD2VEC_DIR):
        os.makedirs(directories.WORD2VEC_DIR)
    # Materialise the sentence generator so it can be iterated more than once
    # and len() can be taken for total_examples.
    sentences = list(self.get_enron_sentences())
    model_logger = Word2VecLogger()
    path = directories.WORD2VEC_DIR + 'trained_word2vec.model'
    if os.path.isfile(path):
        helper._print('Loading previously trained model...')
        word2vec_model = KeyedVectors.load(path)
    else:
        helper._print_subheader('Building model...')
        word2vec_model = gensim.models.Word2Vec(
            sentences,
            size=FLAGS.word_embedding_size,
            sg=1,  # Use Skip-Gram (0 for CBOW)
            hs=0,  # Use negative sampling (1 for hierarchical softmax)
            window=FLAGS.word2vec_window,
            min_count=FLAGS.word2vec_min_count,
            workers=10,
            iter=1
        )
        word2vec_model.train(sentences, total_examples=len(sentences),
                             epochs=FLAGS.word2vec_epochs, callbacks=[model_logger])
        helper._print(f'Saving model to {path}')
        word2vec_model.save(path)
    vocab = self.build_vocab(sentences)
    return self.word2vec_index_keyed_vector(keyed_vector=word2vec_model.wv, vocab=vocab)
def glove_finetuned_embeddings(self):
    helper._print_header('Getting fine-tuned GloVe embeddings')
    self.glove_download_pretrained_model()
    sentences = self.get_enron_sentences()
    vocab = helper.get_or_build(FLAGS.enron_emails_vocab_path, self.build_vocab, sentences)
    # idx2word = {i: word for word, i in word2idx.items()}
    print(len(vocab))
    cooccur = helper.get_or_build(FLAGS.enron_emails_cooccur_path, self.build_cooccur,
                                  vocab, sentences, type='numpy')
    print(np.shape(cooccur))
    pretrained_embeddings = self.glove2dict(self.word_embed_file_path)
    helper._print_subheader('Starting Mittens model...')
    mittens_model = Mittens(n=self.dimensions, max_iter=1000, display_progress=1,
                            log_dir=FLAGS.glove_dir + 'mittens/')
    finetuned_embeddings = mittens_model.fit(
        cooccur, vocab=vocab, initial_embedding_dict=pretrained_embeddings)
    print(finetuned_embeddings)
    return 'test', 'test', 'test'  # placeholder return values
def get_TSNE_plot(self, embeddings, vocab, words=None):
    helper._print_subheader('Plotting embeddings')
    # Fit a 2D t-SNE projection (PCA-initialised) to the embedding vectors
    tsne = TSNE(perplexity=30, n_components=2, verbose=2, init='pca', n_iter=5000, method='exact')
    result = tsne.fit_transform(embeddings)
    # Create a scatter plot of the projection
    if words is not None:
        result = np.array(
            [[x, y, i] for i, (x, y) in enumerate(result) if vocab[i] in words],
            dtype=np.float64)
        pyplot.scatter(result[:, 0], result[:, 1])
        for r in result:
            pyplot.annotate(vocab[int(r[2])], xy=(r[0], r[1]))
    else:
        pyplot.scatter(result[:, 0], result[:, 1])
        for i, word in enumerate(vocab):
            pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
    pyplot.show()
def word2vec_index_keyed_vector(self, keyed_vector, vocab):
    helper._print_subheader('Creating index files!')
    vocab_keys = keyed_vector.vocab.keys()
    ZERO_TOKEN = 0
    word2idx = {'ZERO': ZERO_TOKEN}
    idx2word = ['ZERO']
    weights = [np.zeros(self.dimensions)]
    pbar = tqdm(
        bar_format='Indexing keyed_vector |{bar}| Elapsed: {elapsed} | ({n_fmt}/{total_fmt})',
        total=len(vocab_keys))
    i = 0
    for word in vocab_keys:
        if word in vocab.keys():
            i += 1
            word2idx[word] = i
            idx2word.append(word)
            weights.append(keyed_vector[word])
        pbar.update(1)
    pbar.close()
    print()
    UNKNOWN_TOKEN = len(weights)
    word2idx['UNK'] = UNKNOWN_TOKEN
    idx2word.append('UNK')  # keep idx2word aligned with word2idx
    np.random.seed(240993)
    weights.append(np.random.randn(self.dimensions))
    helper._print('Index files ready!')
    # self.get_TSNE_plot(weights, [key for key in word2idx.keys()])
    return np.array(weights, dtype=np.float32), word2idx, idx2word
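# Minimal usage sketch (not from the original code) of the (weights, word2idx, idx2word)
# triple returned by word2vec_index_keyed_vector. The toy vocabulary, dimensions and
# token list below are invented; only the ZERO/UNK layout mirrors the function above.
def _example_index_lookup():
    import numpy as np
    dims = 4
    weights = np.array([np.zeros(dims),          # index 0: ZERO padding token
                        np.ones(dims),           # index 1: 'enron'
                        np.full(dims, 2.0),      # index 2: 'email'
                        np.random.randn(dims)],  # last index: UNK, random init
                       dtype=np.float32)
    word2idx = {'ZERO': 0, 'enron': 1, 'email': 2, 'UNK': 3}
    # Out-of-vocabulary words fall back to the UNK row.
    tokens = ['enron', 'email', 'blockchain']
    ids = [word2idx.get(t, word2idx['UNK']) for t in tokens]
    return weights[ids]  # shape (3, dims)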
def build_cooccur(self, vocab, corpus, window=10):
    helper._print_subheader("Building cooccurrence matrix")
    vocab_size = len(vocab)
    idx2word = {i: word for word, i in vocab.items()}
    cooccurrences = np.zeros((vocab_size, vocab_size), dtype=np.float64)
    helper._print('Enumerating through the corpus...')
    for i, sent in enumerate(corpus):
        if i % 10000 == 0 and i != 0:
            helper._print(f"{i}/{len(corpus)} sentences processed")
        if i == 500000:
            break
        token_ids = [vocab[word] for word in sent if word in vocab.keys()]
        for center_i, center_id in enumerate(token_ids):
            # Collect all word IDs in the left window of the center word
            context_ids = token_ids[max(0, center_i - window):center_i]
            contexts_len = len(context_ids)
            for left_i, left_id in enumerate(context_ids):
                # Distance from the center word
                distance = contexts_len - left_i
                # Weight by inverse of distance between words
                increment = 1.0 / float(distance)
                # Build the co-occurrence matrix symmetrically (pretend we
                # are calculating right contexts as well)
                cooccurrences[center_id, left_id] += increment
                cooccurrences[left_id, center_id] += increment
    return cooccurrences
def train_and_save_embeddings(self):
    # Materialise the sentence generator so it can be passed over more than once.
    sentences = list(self.get_enron_sentences())
    vocab = self.build_vocab(sentences)
    if not os.path.isfile(directories.TRAINED_GLOVE_EMBEDDING_FILE_PATH):
        cooccur = self.build_cooccur(vocab, sentences)
        helper._print_subheader('Building model...')
        glove_model = mittens_glove(n=300, xmax=100, max_iter=20000, learning_rate=0.01,
                                    alpha=0.75, tol=1e-4, display_progress=10,
                                    log_dir=directories.GLOVE_DIR + 'mittens/')
        helper._print_subheader('Training GloVe model...')
        trained_embeddings = glove_model.fit(cooccur)
        resulting_embeddings = {}
        for word, weights in zip(vocab.keys(), trained_embeddings):
            resulting_embeddings[word] = weights
        self.dict2glove(resulting_embeddings, directories.TRAINED_GLOVE_EMBEDDING_FILE_PATH)
        return vocab, cooccur, resulting_embeddings
    return vocab, None, None
def glove_generate_indexes(self):
    helper._print_subheader('Generating indexes for embeddings')
    ZERO_TOKEN = 0
    word2idx = {'ZERO': ZERO_TOKEN}
    idx2word = {ZERO_TOKEN: 'ZERO'}
    weights = [np.zeros(self.dimensions)]
    with open(self.word_embed_file_path, 'r', encoding="utf8") as file:
        for index, line in enumerate(file):
            values = line.split()  # Word and weights separated by space
            word = values[0]  # Word is first symbol on each line
            word_weights = np.asarray(values[1:], dtype=np.float32)  # Remainder of line is weights for word
            word2idx[word] = index + 1  # ZERO is our zeroth index so shift by one
            idx2word[index + 1] = word
            weights.append(word_weights)
            if index % FLAGS.word_embed_subset_size == 0 and index != 0:
                helper._print(f'{index} words indexed')
                if FLAGS.word_embed_subset:
                    break
    UNKNOWN_TOKEN = len(weights)
    word2idx['UNK'] = UNKNOWN_TOKEN
    idx2word[UNKNOWN_TOKEN] = 'UNK'  # keep idx2word aligned with word2idx
    np.random.seed(240993)
    weights.append(np.random.randn(self.dimensions))
    helper._print_subheader('Indexes done!')
    return np.array(weights, dtype=np.float32), word2idx, idx2word
def glove2dict(self, glove_filename):
    helper._print_subheader('Generating dict from pretrained GloVe embeddings')
    with open(glove_filename, 'r', encoding="utf8") as file:
        embed = {}
        for index, line in enumerate(file):
            values = line.split()
            embed[values[0]] = np.asarray(values[1:], dtype=np.float32)
        return embed
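# Small self-contained illustration (not part of the original pipeline) of the line
# format glove2dict expects: one word followed by its vector components, separated by
# single spaces. The words and numbers below are invented.
def _example_glove2dict():
    import io
    import numpy as np
    fake_file = io.StringIO('the 0.1 0.2 0.3\nenron -0.5 0.0 0.25\n')
    embed = {}
    for line in fake_file:
        values = line.split()
        embed[values[0]] = np.asarray(values[1:], dtype=np.float32)
    return embed  # {'the': array([0.1, 0.2, 0.3]), 'enron': array([-0.5, 0.0, 0.25])}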
def predict_and_label(self, data, sess):
    helper._print_subheader("Predicting")
    prob, labels = [], []
    batches = helper.batches(data, batch_size=500 if FLAGS.use_gpu else 2,
                             use_tail=True, perm=False)
    for batch in batches:
        feed_dict, _ = self.build_feed_dict(batch)
        p, l = sess.run([self.p, self.labels], feed_dict=feed_dict)
        prob.extend(p)
        labels.extend(l)
    return prob, labels
def word2vec_trained_embeddings(self):
    helper._print_header('Getting word2vec trained on Enron corpus...')
    if not os.path.isdir(FLAGS.word2vec_dir):
        os.makedirs(FLAGS.word2vec_dir)
    # Materialise the sentence generator so it can be iterated more than once
    # and len() can be taken for total_examples.
    documents = list(self.get_enron_sentences())
    model_logger = Word2VecLogger()
    if os.path.isfile(FLAGS.word2vec_dir + 'word2vec.model'):
        helper._print_subheader('Loading previously trained model...')
        model = KeyedVectors.load(FLAGS.word2vec_dir + 'word2vec.model')
    else:
        helper._print_subheader('Building model...')
        model = Word2Vec(
            documents,
            size=300,
            sg=1,  # Use Skip-Gram (0 for CBOW)
            hs=0,  # Use negative sampling (1 for hierarchical softmax)
            window=10,
            min_count=3,
            workers=10,
            iter=1)
        helper._print_subheader('Saving untrained model...')
        model.save(FLAGS.word2vec_dir + 'word2vec.model')
    model.train(documents, total_examples=len(documents),
                epochs=FLAGS.word2vec_trained_mode_epochs, callbacks=[model_logger])
    helper._print_subheader('Saving model...')
    model.save(FLAGS.word2vec_dir + 'trained_word2vec.model')
    return self.word2vec_index_keyed_vector(model.wv)
def get_enron_sentences(self):
    helper._print_subheader('Reading ' + FLAGS.enron_emails_txt_path + '...')
    if not os.path.isfile(FLAGS.enron_emails_txt_path):
        self.load_enron_txt_data()
    with open(FLAGS.enron_emails_txt_path, 'r', encoding='utf-8') as txt_file:
        for index, line in enumerate(txt_file):
            if index % 1000000 == 0 and index != 0:
                helper._print(f'{index} sentences read')
                break
            preprocessed_line = simple_preprocess(line)
            if preprocessed_line != []:
                yield preprocessed_line
    helper._print(f'{index} sentences read')
    helper._print_subheader('Done reading Enron email data!')
def dict2glove(self, embeddings_dict, path):
    helper._print_subheader('Saving to glove format...')
    with open(path, 'w', encoding="utf8") as file:
        pbar = tqdm(
            bar_format='{percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} ({n_fmt}/{total_fmt}) ',
            total=len(embeddings_dict))
        for index, (word, weights) in enumerate(embeddings_dict.items()):
            if index % 1000 == 0 and index != 0:
                pbar.update(1000)
            embeddings_string = word
            for weight in weights:
                embeddings_string += ' ' + str(weight)
            file.write(embeddings_string + '\n')
        pbar.update(len(embeddings_dict) % 1000)
        pbar.close()
        print()
def build_pretrained_embeddings(self):
    helper._print_header('Getting pretrained word2vec embeddings')
    path = directories.WORD2VEC_EMBEDDINGS_FILE_PATH
    sentences = self.get_enron_sentences()
    if not os.path.isdir(directories.WORD2VEC_DIR):
        os.makedirs(directories.WORD2VEC_DIR)
    if self.dimensions != 300:
        helper._print('Only support word2vec with vectors of size 300')
    if not os.path.isfile(path):
        helper._print('Binary file not there. Download from: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM')
        sys.exit()
    else:
        helper._print_subheader('Unpacking ' + path)
        model = KeyedVectors.load_word2vec_format(path, binary=True)
        helper._print_subheader('Done unpacking!')
        vocab = self.build_vocab(sentences)
        return self.word2vec_index_keyed_vector(keyed_vector=model, vocab=vocab)
def glove2dict(self, glove_filename):
    helper._print_subheader('Generating dict from pretrained GloVe embeddings')
    with open(glove_filename, 'r', encoding="utf8") as file:
        embed = {}
        lines = file.readlines()
        pbar = tqdm(
            bar_format='{percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} ({n_fmt}/{total_fmt}) ',
            total=len(lines))
        for index, line in enumerate(lines):
            if index % 10000 == 0 and index != 0:
                pbar.update(10000)
            values = line.split()
            embed[values[0]] = np.asarray(values[1:], dtype=np.float32)
        pbar.update(len(lines) % 10000)
        pbar.close()
        print()
    return embed
def get_enron_sentences(self):
    """ Generator for getting the Enron data as individual sentences. """
    helper._print_subheader('Reading ' + directories.ENRON_TRAIN_SENTENCES_TXT_PATH + '...')
    with open(directories.ENRON_TRAIN_SENTENCES_TXT_PATH, 'r', encoding='utf-8') as txt_file:
        for index, line in enumerate(txt_file):
            if index % 1000000 == 0 and index != 0:
                helper._print(f'{index} sentences read')
                break
            preprocessed_line = simple_preprocess(line)
            if preprocessed_line != []:
                yield preprocessed_line
    helper._print(f'{index} sentences read')
    helper._print_subheader('Done reading Enron email data!')
def generate_indexes(self, vocab, file):
    helper._print_subheader('Generating indexes for embeddings')
    weights = [np.zeros(self.dimensions)]
    ZERO_TOKEN = 0
    word2idx = {'ZERO': ZERO_TOKEN}
    idx2word = ['ZERO']
    i = 0
    with open(file, 'r', encoding='utf-8', newline='\n', errors='ignore') as embed_file:
        if FLAGS.word_embed_model == 'fasttext':
            # fastText .vec files start with a "vocab_size dimensions" header line
            n, d = map(int, embed_file.readline().split())
        lines = embed_file.readlines()
        pbar = tqdm(
            bar_format='{percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} ({n_fmt}/{total_fmt}) ',
            total=len(lines))
        for index, line in enumerate(lines):
            values = line.split()  # Word and weights separated by space
            word = values[0]  # Word is first symbol on each line
            if word in vocab.keys() and helper.is_float(values[1]):
                i += 1
                word_weights = np.asarray(values[1:], dtype=np.float32)  # Remainder of line is weights for word
                word2idx[word] = i
                idx2word.append(word)
                weights.append(word_weights)
            pbar.update(1)
        pbar.close()
    UNKNOWN_TOKEN = len(weights)
    word2idx['UNK'] = UNKNOWN_TOKEN
    idx2word.append('UNK')
    np.random.seed(240993)
    weights.append(np.random.randn(self.dimensions))
    # self.get_TSNE_plot(weights, [key for key in word2idx.keys()])
    helper._print_subheader(f'Indexes done! {len(weights) - 2} word embeddings!')
    return np.array(weights, dtype=np.float32), word2idx, idx2word
def get_enron_sentences(self, kaggle=True, all=True):
    if kaggle:
        path = directories.ENRON_EMAILS_TXT_PATH
        if not os.path.isfile(path):
            self.load_enron_txt_data()
    else:
        if all:
            path = directories.TREE_ALL_SENTENCES_TXT_PATH
        else:
            path = directories.TREE_SENTENCES_TXT_PATH
    helper._print_subheader('Reading ' + path + '...')
    with open(path, 'r', encoding='utf-8') as txt_file:
        for index, line in enumerate(txt_file):
            if index % 1000000 == 0 and index != 0:
                helper._print(f'{index} sentences read')
                break
            preprocessed_line = simple_preprocess(line)
            if preprocessed_line != []:
                yield preprocessed_line
    helper._print(f'{index} sentences read')
    helper._print_subheader('Done reading Enron email data!')
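# Hedged sketch: get_enron_sentences is a generator, so it can only be consumed once,
# while gensim's Word2Vec and the vocab/co-occurrence builders each need a full pass.
# One common workaround (assumed here, not taken from the original code) is a small
# iterable wrapper that restarts the file on every iteration.
class RestartableSentences:
    def __init__(self, path):
        self.path = path

    def __iter__(self):
        from gensim.utils import simple_preprocess
        with open(self.path, 'r', encoding='utf-8') as txt_file:
            for line in txt_file:
                tokens = simple_preprocess(line)
                if tokens:
                    yield tokens

# Usage sketch: sentences = RestartableSentences(directories.ENRON_EMAILS_TXT_PATH)
# can then be iterated by build_vocab, build_cooccur and Word2Vec in turn.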
def word2vec_pretrained_embeddings(self):
    helper._print_header('Getting pretrained word2vec embeddings')
    if not os.path.isdir(FLAGS.word2vec_dir):
        os.makedirs(FLAGS.word2vec_dir)
    self.word_embed_file_path = FLAGS.word2vec_dir + self.embedding_file + '.txt'
    if self.dimensions != 300:
        helper._print('Only support word2vec with vectors of size 300')
    if not os.path.isfile(self.word_embed_file_path):
        binary_file_path = FLAGS.word2vec_dir + self.embedding_file + '.bin'
        if not os.path.isfile(binary_file_path):
            helper._print('Binary file not there. Download from: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM')
            sys.exit()
        else:
            helper._print_subheader('Unpacking ' + binary_file_path)
            model = KeyedVectors.load_word2vec_format(binary_file_path, binary=True)
            helper._print_subheader('Done unpacking!')
            return self.word2vec_index_keyed_vector(model)
def build_vocab(self, corpus, min_count=FLAGS.glove_min_count):
    """
    Credit to https://github.com/hans/glove.py/blob/master/glove.py

    Returns a dictionary mapping word strings to word IDs, keeping only
    words that occur at least `min_count` times in the corpus.
    """
    helper._print_subheader('Building vocabulary from corpus')
    vocab = Counter()
    for i, doc in enumerate(corpus):
        if i % 100000 == 0 and i != 0:
            helper._print(f"{i}/{len(corpus)} sentences processed")
            break
        vocab.update(doc)
    helper._print_subheader('Done building vocabulary')
    i = 0
    word2index = {}
    for word, freq in vocab.items():
        if freq >= min_count:
            word2index[word] = i
            i += 1
    return word2index
def word2vec_index_keyed_vector(self, keyed_vector):
    helper._print_subheader('Creating index files!')
    vocab_keys = keyed_vector.vocab.keys()
    ZERO_TOKEN = 0
    word2idx = {'ZERO': ZERO_TOKEN}
    idx2word = {ZERO_TOKEN: 'ZERO'}
    weights = [np.zeros(self.dimensions)]
    for index, word in enumerate(vocab_keys):
        word2idx[word] = index + 1
        idx2word[index + 1] = word
        weights.append(keyed_vector[word])
        if index % FLAGS.word_embed_subset_size == 0 and index != 0:
            helper._print(f'{index} words indexed')
            if FLAGS.word_embed_subset:
                break
    UNKNOWN_TOKEN = len(weights)
    word2idx['UNK'] = UNKNOWN_TOKEN
    idx2word[UNKNOWN_TOKEN] = 'UNK'  # keep idx2word aligned with word2idx
    np.random.seed(240993)
    weights.append(np.random.randn(self.dimensions))
    helper._print_subheader('Index files ready!')
    return np.array(weights, dtype=np.float32), word2idx, idx2word
def build_vocab(self, corpus, min_count=FLAGS.word_min_count):
    helper._print_subheader('Building vocabulary from corpus')
    vocab = Counter()
    pbar = tqdm(
        bar_format='{percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} ({n_fmt}/{total_fmt}) ',
        total=len(corpus))
    for i, doc in enumerate(corpus):
        if (i + 1) % 1000 == 0 and i != 0:
            pbar.update(1000)
        vocab.update(doc)
    pbar.update(len(corpus) % 1000)
    pbar.close()
    print()
    i = 0
    word2index = {}
    for word, freq in vocab.items():
        if freq >= min_count:
            word2index[word] = i
            i += 1
    helper._print(f'Done building vocabulary. Length: {len(word2index)}')
    return word2index
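# Toy example (invented sentences, not part of the pipeline) of what build_vocab
# produces: words below min_count are dropped and the rest get consecutive ids in
# first-seen order.
def _example_build_vocab(min_count=2):
    from collections import Counter
    corpus = [['meeting', 'tomorrow'], ['meeting', 'agenda'], ['agenda', 'meeting']]
    vocab = Counter()
    for doc in corpus:
        vocab.update(doc)
    word2index = {}
    i = 0
    for word, freq in vocab.items():
        if freq >= min_count:
            word2index[word] = i
            i += 1
    return word2index  # {'meeting': 0, 'agenda': 1} for min_count=2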
def build_cooccur(self, vocab, corpus, window=10):
    helper._print_subheader("Building cooccurrence matrix")
    vocab_size = len(vocab)
    cooccurrences = np.zeros((vocab_size, vocab_size), dtype=np.float64)
    pbar = tqdm(
        bar_format='{percentage:.0f}%|{bar}| Elapsed: {elapsed}, Remaining: {remaining} ({n_fmt}/{total_fmt}) ',
        total=len(corpus))
    for i, sent in enumerate(corpus):
        if (i + 1) % 10000 == 0 and i != 0:
            pbar.update(10000)
        token_ids = [vocab[word] for word in sent if word in vocab.keys()]
        for center_i, center_id in enumerate(token_ids):
            # Collect all word IDs in the left window of the center word
            context_ids = token_ids[max(0, center_i - window):center_i]
            contexts_len = len(context_ids)
            for left_i, left_id in enumerate(context_ids):
                # Distance from the center word
                distance = contexts_len - left_i
                # Weight by inverse of distance between words
                increment = 1.0 / float(distance)
                # Build the co-occurrence matrix symmetrically (pretend we
                # are calculating right contexts as well)
                cooccurrences[center_id, left_id] += increment
                cooccurrences[left_id, center_id] += increment
    pbar.update(len(corpus) % 10000)
    pbar.close()
    print()
    helper._print(f'Done building cooccurrence matrix. Shape: {np.shape(cooccurrences)}')
    return cooccurrences
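# Worked toy example (not from the original code) of the inverse-distance weighting
# used in build_cooccur: for the single invented sentence below with window=10, the
# pair (report, send) is two positions apart and adds 1/2 to both symmetric cells,
# while adjacent pairs add 1.
def _example_build_cooccur(window=10):
    import numpy as np
    vocab = {'send': 0, 'the': 1, 'report': 2}
    sent = ['send', 'the', 'report']
    cooccurrences = np.zeros((len(vocab), len(vocab)), dtype=np.float64)
    token_ids = [vocab[w] for w in sent]
    for center_i, center_id in enumerate(token_ids):
        context_ids = token_ids[max(0, center_i - window):center_i]
        for left_i, left_id in enumerate(context_ids):
            distance = len(context_ids) - left_i
            increment = 1.0 / float(distance)
            cooccurrences[center_id, left_id] += increment
            cooccurrences[left_id, center_id] += increment
    # cooccurrences[2, 0] == cooccurrences[0, 2] == 0.5  ('send' ... 'report')
    # cooccurrences[1, 0] == cooccurrences[0, 1] == 1.0  ('send' 'the')
    return cooccurrences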
def load_enron_txt_data(self):
    helper._print_header("Loading Enron emails")
    try:
        if os.name == 'nt':
            # Using sys.maxsize throws an OverflowError on Windows 64-bit platforms,
            # since the internal representation of 'int'/'long' on Win64 is only
            # 32 bits wide. Ideally the limit on Win64 should not exceed ((2**31)-1)
            # as long as the internal representation uses 'int' and/or 'long'.
            csv.field_size_limit((2**31) - 1)
        else:
            csv.field_size_limit(sys.maxsize)
    except OverflowError:
        # Skip setting the limit for now
        pass
    if not os.path.isfile(directories.ENRON_EMAILS_CSV_PATH):
        data = 'wcukierski/enron-email-dataset'
        helper._print_subheader('Downloading Enron emails from Kaggle')
        helper.download_from_kaggle(data, directories.ENRON_DIR)
        helper._print_subheader('Download finished! Unzipping...')
        with zipfile.ZipFile(directories.ENRON_EMAILS_ZIP_PATH, 'r') as zip:
            zip.extractall(path=directories.ENRON_DIR)
    if not os.path.isfile(directories.ENRON_EMAILS_TXT_PATH):
        helper._print_subheader('Processing emails into .txt file!')
        with open(directories.ENRON_EMAILS_CSV_PATH, 'r', encoding='utf-8') as emails_csv:
            with open(directories.ENRON_EMAILS_TXT_PATH, 'w', encoding='utf-8') as text_file:
                email_reader = csv.reader(emails_csv, delimiter=",")
                for index, row in enumerate(email_reader):
                    if index == 0:
                        continue  # skip the CSV header row
                    sentences = nltk.sent_tokenize(self.format_email_body(row))
                    for sent in sentences:
                        if len(sent.split(' ')) > 2:
                            text_file.write(sent + '\n')
                    if index % 100000 == 0 and index != 0:
                        helper._print(f'{index} emails processed')
    helper._print_subheader('Enron email data loaded!')
def train_and_save_finetuned_embeddings(self):
    # Materialise the sentence generator so it can be passed over more than once.
    sentences = list(self.get_enron_sentences())
    vocab = self.build_vocab(sentences)
    if not os.path.isfile(directories.FINETUNED_GLOVE_EMBEDDING_FILE_PATH):
        # idx2word = {i: word for word, i in word2idx.items()}
        cooccur = self.build_cooccur(vocab, sentences)
        pretrained_embeddings = self.glove2dict(directories.GLOVE_EMBEDDING_FILE_PATH)
        helper._print(
            f'{len([v for v in vocab.keys() if v in pretrained_embeddings.keys()])} words in common with the pretrained set')
        helper._print_subheader('Building model...')
        mittens_dir = directories.GLOVE_DIR + 'mittens/'
        if not os.path.isdir(mittens_dir):
            os.makedirs(mittens_dir)
        mittens_model = Mittens(n=self.dimensions, xmax=100, max_iter=10000,
                                display_progress=10, learning_rate=0.05, alpha=0.75,
                                tol=1e-4, log_dir=mittens_dir, mittens=0.1)
        helper._print_subheader('Training Mittens model...')
        finetuned_embeddings = mittens_model.fit(
            cooccur, vocab=vocab, initial_embedding_dict=pretrained_embeddings)
        print()
        helper._print_subheader('Done training finetuned embeddings! Merging with pre-trained embeddings...')
        resulting_embeddings = pretrained_embeddings
        for word, weights in zip(vocab.keys(), finetuned_embeddings):
            resulting_embeddings[word] = weights
        self.dict2glove(resulting_embeddings, directories.FINETUNED_GLOVE_EMBEDDING_FILE_PATH)
        return vocab, cooccur, resulting_embeddings
    return vocab, None, None
def on_train_end(self, model):
    helper._print_subheader('Training ended!')

def on_train_begin(self, model):
    helper._print_subheader(f'Training started! Going through {model.iter} epochs...')

def on_train_begin(self, model):
    helper._print_subheader(f'Training Model ({model.iter} epochs)...')

def on_train_end(self, model):
    # self.pbar.close()
    helper._print_subheader('Training ended!')
def word2vec_finetuned_embeddings(self):
    helper._print_header('Getting fine-tuned word2vec embeddings')
    if not os.path.isdir(FLAGS.word2vec_dir):
        os.makedirs(FLAGS.word2vec_dir)
    if os.path.isfile(FLAGS.word2vec_dir + 'finetuned_word2vec.model'):
        helper._print_subheader('Loading previously fine-tuned model...')
        # Load the saved fine-tuned model directly; a plain dict cannot hold a .wv attribute.
        finetuned_model = Word2Vec.load(FLAGS.word2vec_dir + 'finetuned_word2vec.model')
    else:
        if not self.dimensions == 300:
            helper._print('Only support word2vec with vectors of size 300')
            sys.exit()
        binary_file_path = FLAGS.word2vec_dir + self.embedding_file + '.bin'
        if not os.path.isfile(binary_file_path):
            helper._print('Binary file not there. Download from: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM')
            sys.exit()
        helper._print_subheader('Unpacking ' + binary_file_path)
        model = KeyedVectors.load_word2vec_format(binary_file_path, binary=True)
        helper._print_subheader('Done unpacking!')
        # Materialise the sentence generator so it can be iterated more than once.
        sentences = list(self.get_enron_sentences())
        finetuned_model = Word2Vec(size=300, min_count=3)
        helper._print_subheader('Building fine-tuned model vocab...')
        finetuned_model.build_vocab(sentences)
        helper._print_subheader('Updating with pretrained model vocab...')
        finetuned_model.build_vocab([list(model.vocab.keys())], update=True)
        helper._print_subheader('Intersection with pretrained vectors...')
        finetuned_model.intersect_word2vec_format(binary_file_path, binary=True, lockf=1.0)
        model_logger = Word2VecLogger()
        finetuned_model.train(sentences, total_examples=len(sentences),
                              epochs=FLAGS.word2vec_finetuned_mode_epochs,
                              callbacks=[model_logger])
        helper._print_subheader('Saving model...')
        finetuned_model.save(FLAGS.word2vec_dir + 'finetuned_word2vec.model')
    return self.word2vec_index_keyed_vector(finetuned_model.wv)
def build_finetuned_embeddings(self):
    helper._print_header('Getting fine-tuned word2vec embeddings')
    path = directories.WORD2VEC_DIR + 'finetuned_word2vec.model'
    pretrained_path = directories.WORD2VEC_EMBEDDINGS_FILE_PATH
    # Materialise the sentence generator so it can be iterated more than once
    # and len() can be taken for total_examples.
    sentences = list(self.get_enron_sentences())
    if not os.path.isdir(directories.WORD2VEC_DIR):
        os.makedirs(directories.WORD2VEC_DIR)
    if os.path.isfile(path):
        helper._print_subheader('Loading previously fine-tuned model...')
        # Load the saved fine-tuned model directly; a plain dict cannot hold a .wv attribute.
        finetuned_model = gensim.models.Word2Vec.load(path)
    else:
        if not self.dimensions == 300:
            helper._print('Only support word2vec with vectors of size 300')
            sys.exit()
        if not os.path.isfile(pretrained_path):
            helper._print('Binary file not there. Download from: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM')
            sys.exit()
        helper._print_subheader('Unpacking ' + pretrained_path)
        model = KeyedVectors.load_word2vec_format(pretrained_path, binary=True)
        helper._print_subheader('Done unpacking!')
        finetuned_model = gensim.models.Word2Vec(
            size=FLAGS.word_embedding_size,
            sg=1,  # Use Skip-Gram (0 for CBOW)
            hs=0,  # Use negative sampling (1 for hierarchical softmax)
            window=FLAGS.word2vec_window,
            min_count=FLAGS.word2vec_min_count,
            workers=10,
            iter=1
        )
        helper._print_subheader('Building fine-tuned model vocab...')
        finetuned_model.build_vocab(sentences)
        helper._print_subheader('Updating with pretrained model vocab...')
        finetuned_model.build_vocab([list(model.vocab.keys())], update=True)
        helper._print_subheader('Intersection with pretrained vectors...')
        finetuned_model.intersect_word2vec_format(pretrained_path, binary=True, lockf=1.0)
        model_logger = Word2VecLogger()
        finetuned_model.train(sentences, total_examples=len(sentences),
                              epochs=FLAGS.word2vec_epochs, callbacks=[model_logger])
        helper._print_subheader('Saving model...')
        finetuned_model.save(path)
    vocab = self.build_vocab(sentences)
    return self.word2vec_index_keyed_vector(keyed_vector=finetuned_model.wv, vocab=vocab)