def run(args):
    doc = read_txt(args.path_to_doc)
    doc_tokens = [
        process_text(entry,
                     lower=not args.cased,
                     remove_stopwords=args.remove_stopwords,
                     remove_punctuation=args.remove_punctuation)
        for entry in doc
    ]
    all_tokens = []
    for entry_tokens in doc_tokens:
        all_tokens += entry_tokens
    rare_tokens, selected_tokens = get_rare_tokens(
        all_tokens, args.min_freq, args.max_tokens, return_non_rare=True)
    if args.remove_rare:
        doc_tokens = [
            filter_tokens(entry_tokens, set(rare_tokens))
            for entry_tokens in doc_tokens
        ]
    gu = GloVeUtility(args.path_to_glove)

    # Build the co-occurrence matrix from document-level counts:
    # X.T * X gives token-token co-occurrence counts, and the diagonal
    # (a token with itself) is zeroed out before fitting.
    vectorizer = CountVectorizer(ngram_range=(args.ngram_lower, args.ngram_upper),
                                 vocabulary=selected_tokens)
    count_vector = vectorizer.fit_transform(
        [" ".join(entry_tokens) for entry_tokens in doc_tokens])
    csr_mat = count_vector.T * count_vector
    csr_mat.setdiag(0)
    cooccur_ar = csr_mat.toarray()

    # Fine-tune the pre-trained GloVe vectors with Mittens.
    mittens_model = Mittens(n=gu.d, max_iter=args.iter)
    embeddings = mittens_model.fit(cooccur_ar,
                                   vocab=selected_tokens,
                                   initial_embedding_dict=gu.vector_dict)

    filename = args.path_to_glove.split(os.path.sep)[-1]
    os.makedirs(args.output, exist_ok=True)
    embeddings_dict = dict(zip(selected_tokens, embeddings))
    progress_bar.std_print("\nTrained on {} tokens.".format(len(embeddings_dict)))

    if args.save_new_only:
        # Save only the newly trained vectors.
        savepath = os.path.join(args.output, "new_" + filename)
        embeddings_list = [
            " ".join([key] + [str(val) for val in embeddings_dict[key]])
            for key in embeddings_dict
        ]
        write_txt(savepath, embeddings_list)
    else:
        # Merge the new vectors into the original GloVe vectors and save.
        savepath = os.path.join(args.output, filename)
        gu.add_replace_vectors(embeddings_dict)
        gu.save_vectors(savepath)
def glove_finetuned_embeddings(self):
    helper._print_header('Getting fine-tuned GloVe embeddings')
    self.glove_download_pretrained_model()
    sentences = self.get_enron_sentences()
    vocab = helper.get_or_build(FLAGS.enron_emails_vocab_path, self.build_vocab, sentences)
    # idx2word = {i: word for word, i in word2idx.items()}
    print(len(vocab))
    cooccur = helper.get_or_build(FLAGS.enron_emails_cooccur_path,
                                  self.build_cooccur, vocab, sentences,
                                  type='numpy')
    print(np.shape(cooccur))
    pretrained_embeddings = self.glove2dict(self.word_embed_file_path)
    helper._print_subheader('Starting Mittens model...')
    mittens_model = Mittens(n=self.dimensions, max_iter=1000, display_progress=1,
                            log_dir=FLAGS.glove_dir + 'mittens/')
    finetuned_embeddings = mittens_model.fit(
        cooccur, vocab=vocab, initial_embedding_dict=pretrained_embeddings)
    print(finetuned_embeddings)
    return 'test', 'test', 'test'
def train_mittens(coocc_ar, oov_vocabs, pre_glove,
                  emb_dim=cfg['embeddings']['emb_dim'], max_iter=200,
                  glove_oov_save_path=None, dataset_dir=dataset_dir,
                  embedding_file=cfg["embeddings"]["embedding_file"],
                  dataset_name=cfg['data']['train'] + cfg['data']['test']):
    """Fine-tune GloVe vectors for OOV tokens with Mittens.

    :param coocc_ar: dense co-occurrence matrix for the OOV vocabulary
    :param oov_vocabs: OOV tokens, in the same order as the matrix rows
    :param pre_glove: dict mapping token -> pre-trained GloVe vector
    :param emb_dim: embedding dimensionality
    :param max_iter: number of Mittens training iterations
    :param glove_oov_save_path: where to pickle the fine-tuned vectors
    :param dataset_dir: directory used to build the default save path
    :param embedding_file: embedding file name used in the default save path
    :param dataset_name: dataset identifier used in the default save path
    :return: dict mapping OOV token -> fine-tuned vector
    """
    mittens_model = Mittens(n=emb_dim, max_iter=max_iter)
    new_embeddings = mittens_model.fit(
        coocc_ar, vocab=oov_vocabs, initial_embedding_dict=pre_glove)
    newglove = dict(zip(oov_vocabs, new_embeddings))
    if glove_oov_save_path is None:
        glove_oov_save_path = join(dataset_dir,
                                   embedding_file + dataset_name + '_oov.pkl')
    with open(glove_oov_save_path, "wb") as f:
        pickle.dump(newglove, f)
    return newglove
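# Hedged usage sketch (not part of the original source): one way the pickled OOV
# vectors written by train_mittens might be loaded back and merged with the
# pre-trained GloVe dict. The helper name and paths are illustrative assumptions.
import pickle

def load_and_merge_oov(pre_glove, oov_pickle_path):
    # pre_glove: dict token -> vector from the original GloVe file
    # oov_pickle_path: path written by train_mittens (e.g. "..._oov.pkl")
    with open(oov_pickle_path, "rb") as f:
        oov_vectors = pickle.load(f)
    merged = dict(pre_glove)
    merged.update(oov_vectors)  # fine-tuned OOV vectors take precedence
    return merged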
def create_monthly_glove_models(begin_month=None):
    model = Mittens(n=300, max_iter=1000)
    vocab, embedding = glove2dict(TRUNCATED_GLOVE_EMBEDDING)
    months = arrow.Arrow.span_range('month', arrow.get(START_MONTH), arrow.get(END_MONTH))
    for begin, end in months:
        print("Training mittens model for {}".format(begin.format("YYYY-MM")))
        print(" loading cooccurrence matrix")
        coo_matrix = np.load(get_month_cooccurrence_matrix_filepath(begin.year, begin.month))
        print(" training")
        embedding = model.fit(coo_matrix, vocab=vocab, initial_embedding_dict=embedding)
        print(" saving")
        np.save(get_month_glove_embedding_filepath(begin.year, begin.month), embedding)
def loadGloVe(co_occur, dim_embed, vocab, is_fine):
    if is_fine:
        glove_original = glove2dict(glove_filename)
        mittens_model = Mittens(n=dim_embed, max_iter=5000)
        embeddings = mittens_model.fit(np.asarray(co_occur), vocab=vocab,
                                       initial_embedding_dict=glove_original)
    else:
        glove_model = GloVe(n=dim_embed, max_iter=5000)
        embeddings = glove_model.fit(np.asarray(co_occur))
    return embeddings
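# Minimal, self-contained sketch (an illustration with made-up toy data, not code
# from the snippets above) of the mittens API that loadGloVe relies on: Mittens.fit
# takes a dense co-occurrence matrix, the vocabulary in matrix order, and a dict of
# pre-trained vectors; GloVe.fit only needs the matrix.
import numpy as np
from mittens import GloVe, Mittens

toy_vocab = ["wine", "red", "white", "dry"]
# Symmetric toy co-occurrence counts with a zero diagonal.
toy_cooccur = np.array([[0, 4, 3, 1],
                        [4, 0, 1, 0],
                        [3, 1, 0, 2],
                        [1, 0, 2, 0]], dtype=float)
# Pretend pre-trained vectors; real code would read them from a GloVe file.
toy_pretrained = {w: np.random.normal(size=5) for w in toy_vocab[:3]}  # "dry" is OOV

fine_tuned = Mittens(n=5, max_iter=50).fit(
    toy_cooccur, vocab=toy_vocab, initial_embedding_dict=toy_pretrained)
from_scratch = GloVe(n=5, max_iter=50).fit(toy_cooccur)
print(fine_tuned.shape, from_scratch.shape)  # both (4, 5)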
def glove_embedding(filename, vocab_file, cooccurence_file, domain):
    gv = Glove()
    out_dir = './preprocessed_data/' + domain
    if vocab_file and cooccurence_file:
        vocab = gv.load_vocab_in_order(vocab_file)
        cooccurence = gv.load_cooccurence_matrix(cooccurence_file)
        logger.info('get pre-trained glove embedding')
        original_embedding = gv.get_original_embedding(
            './pretrained_embeddings/glove.6B/glove.6B.300d.txt')
        mittens_model = Mittens(n=300, max_iter=1000)
        logger.info('Start fine tuning...')
        new_embeddings = mittens_model.fit(
            cooccurence, vocab=vocab, initial_embedding_dict=original_embedding)
        fin = open(out_dir + '/fine_tuned_glove_300', 'wb')
        pickle.dump(new_embeddings, fin)
        fin.close()
        logger.info('Fine tuning complete')
    else:
        logger.info('Load english data')
        fin = codecs.open(filename, 'r', 'utf-8')
        corpus = []
        for line in fin:
            corpus.append(line)
        vocab = gv.build_vocab(corpus)
        vocab_file = out_dir + '/vocab.pkl'
        createPath(vocab_file)
        outfile = open(vocab_file, 'wb')
        pickle.dump(vocab, outfile)
        outfile.close()
        logger.info("Fetching cooccurrence list..")
        cooccurrences = gv.build_cooccur(vocab, corpus)
        cooccurrences = gv.convert_cooccurence_matrix(cooccurrences, len(vocab))
        cooccurrence_file = out_dir + '/cooccurrence.pkl'
        outfile = open(cooccurrence_file, 'wb')
        pickle.dump(cooccurrences, outfile)
        outfile.close()
        logger.info("Cooccurrence list fetch complete (%i pairs).\n",
                    cooccurrences.shape[0])
def batch_finetune(finetune_glove, batch_word, dimension):
    oov = [token for token in batch_word if token not in finetune_glove.keys()]
    en_doc = [' '.join(batch_word)]
    corp_vocab = list(set(oov))
    cv = CountVectorizer(ngram_range=(1, 1), vocabulary=corp_vocab)
    X = cv.fit_transform(en_doc)
    Xc = (X.T * X)
    Xc.setdiag(0)
    coocc_ar = Xc.toarray()
    mittens_model = Mittens(n=dimension, max_iter=1800)
    new_embeddings = mittens_model.fit(
        coocc_ar, vocab=corp_vocab, initial_embedding_dict=finetune_glove)
    newglove = dict(zip(corp_vocab, new_embeddings))
    finetune_glove.update(newglove)
    return finetune_glove
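# Hedged usage sketch for batch_finetune (illustrative only; the pre-trained dict
# and tokens below are made up). Each call fine-tunes vectors for tokens missing
# from `finetune_glove`, so the dict grows batch by batch.
import numpy as np

glove = {"movie": np.random.normal(size=50), "great": np.random.normal(size=50)}
batch = ["movie", "great", "rewatchable", "slowburn"]  # last two are OOV
glove = batch_finetune(glove, batch, dimension=50)
print("rewatchable" in glove)  # True after fine-tuning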
def train_and_save_finetuned_embeddings(self):
    sentences = self.get_enron_sentences()
    vocab = self.build_vocab(sentences)
    if not os.path.isfile(directories.FINETUNED_GLOVE_EMBEDDING_FILE_PATH):
        # idx2word = {i: word for word, i in word2idx.items()}
        cooccur = self.build_cooccur(vocab, sentences)
        pretrained_embeddings = self.glove2dict(directories.GLOVE_EMBEDDING_FILE_PATH)
        helper._print(
            f'{len([v for v in vocab.keys() if v in pretrained_embeddings.keys()])} words in common with the pretrained set'
        )
        helper._print_subheader('Building model...')
        mittens_dir = directories.GLOVE_DIR + 'mittens/'
        if not os.path.isdir(mittens_dir):
            os.makedirs(mittens_dir)
        mittens_model = Mittens(n=self.dimensions, xmax=100, max_iter=10000,
                                display_progress=10, learning_rate=0.05, alpha=0.75,
                                tol=1e-4, log_dir=mittens_dir, mittens=0.1)
        helper._print_subheader('Training Mittens model...')
        finetuned_embeddings = mittens_model.fit(
            cooccur, vocab=vocab, initial_embedding_dict=pretrained_embeddings)
        print()
        helper._print_subheader(
            'Done training finetuned embeddings! Merging with pre-trained embeddings...')
        resulting_embeddings = pretrained_embeddings
        for word, weights in zip(vocab.keys(), finetuned_embeddings):
            resulting_embeddings[word] = weights
        self.dict2glove(resulting_embeddings,
                        directories.FINETUNED_GLOVE_EMBEDDING_FILE_PATH)
        return vocab, cooccur, resulting_embeddings
    return vocab, None, None
def glove_embedding(filename, vocab_file, cooccurence_file, lang):
    gv = Glove()
    if vocab_file and cooccurence_file:
        vocab = gv.load_vocab_in_order(vocab_file)
        cooccurence = gv.load_cooccurence_matrix(cooccurence_file)
        logger.info('get pre-trained glove embedding')
        original_embedding = gv.get_original_embedding(config.glove_pretrained_emb[lang])
        mittens_model = Mittens(n=300, max_iter=1000)
        logger.info('Start fine tuning...')
        new_embeddings = mittens_model.fit(cooccurence, vocab=vocab,
                                           initial_embedding_dict=original_embedding)
        fin = open(config.glove_fine_tuned_emb[lang], 'wb')
        pickle.dump(new_embeddings, fin)
        fin.close()
        logger.info('Fine tuning complete')
    else:
        if lang == 'de':
            logger.info('Load german data')
        elif lang == 'en':
            logger.info('Load english data')
        fin = codecs.open(filename, 'r', 'utf-8')
        corpus = []
        for line in fin:
            corpus.append(line)
        vocab = gv.build_vocab(corpus)
        vocab_file = config.glove_fine_tuned_vocab[lang]
        createPath(vocab_file)
        outfile = open(vocab_file, 'wb')
        pickle.dump(vocab, outfile)
        outfile.close()
        logger.info("Fetching cooccurrence list..")
        cooccurrences = gv.build_cooccur(vocab, corpus)
        cooccurrences = gv.convert_cooccurence_matrix(cooccurrences, len(vocab))
        cooccurrence_file = config.glove_fine_tuned_cooccurance[lang]
        # outfile = open(cooccurrence_file, 'wb')
        joblib.dump(cooccurrences, cooccurrence_file)
        # outfile.close()
        logger.info("Cooccurrence list fetch complete (%i pairs).\n",
                    cooccurrences.shape[0])
print("Creating co-occurance matrix") co_matrix = np.zeros((5000, 5000)) for word1, word2 in co_dict.keys(): co_matrix[top_5k[word1], top_5k[word2]] = co_dict[(word1, word2)] def glove2dict(glove_filename): with open(glove_filename) as f: reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE) embed = { line[0]: np.array(list(map(float, line[1:]))) for line in reader } return embed print("Training GloVe") original_embeddings = glove2dict("glove.6B/glove.6B.200d.txt") vocab_array = vocab.keys() mittens_model = Mittens(n=200, max_iter=2000) new_embeddings = mittens_model.fit(co_matrix, vocab=top_5k.keys(), initial_embedding_dict=original_embeddings) np.save('GloVe_wine_5k.npy', new_embeddings) print("Done")
# corp_vocab = list(set(oov) - set(oov_rare))
# corp_vocab = get_freqw(all_texts_tokenized_clean, 10000)
# pickle.dump(corp_vocab, open("vocab_clpsych_10000.pkl", "wb+"))
corp_vocab = pickle.load(open("all_vocab_clpsych_erisk_stop_40000.pkl", "rb"))
original_glove = {k: v for k, v in pre_glove.items() if k in corp_vocab}
pickle.dump(original_glove, open("original_glove_clpsych_erisk_stop_40000.pkl", "wb+"))

# Train with mittens
print("Computing cooccurrence matrix...")
# cv = CountVectorizer(ngram_range=(1, 1), vocabulary=corp_vocab)
# X = cv.fit_transform([all_texts])
# Xc = (X.T * X)
# Xc.setdiag(0)
# coocc_ar = Xc.toarray()
# pickle.dump(coocc_ar, open("coocc_mat_clpsych_erisk_stop_40000.pkl", "wb+"), protocol=4)
coocc_ar = pickle.load(open("coocc_mat_clpsych_erisk_stop_40000.pkl", "rb"))
# coocc_ar = pickle.load(open("coocc_mat_clpsych_oov2.pkl", "rb"))

print("Training with mittens...")
mittens_model = Mittens(n=100, max_iter=1000, mittens=0.2)
new_embeddings = mittens_model.fit(coocc_ar, vocab=corp_vocab,
                                   initial_embedding_dict=pre_glove)

print("Serializing embeddings...")
newglove = dict(zip(corp_vocab, new_embeddings))
f = open("finetuned_glove_clpsych_erisk_stop_40000.pkl", "wb")
pickle.dump(newglove, f)
f.close()
def fine_tune_glove(ID, train_type,
                    doc_name="data/fine_tune_docs/pro_from_collection",
                    glove_file="glove.6B.50d.txt", iteration=2000, glove_dim=50,
                    restrict=0, normal=True, stop_word_list='english'):
    """The wrapper function for fine-tuning GloVe.

    ID: identifier for the experiment
    train_type: one of "pro", "con", "all"
    doc_name: doc_name for the training reviews
    glove_file: public GloVe file to use
    iteration: how many iterations to train
    glove_dim: dimension of the GloVe embedding
    restrict: restrict the number of documents to be read; reads all if restrict is 0
    normal: whether to normalize the co-occurrence matrix or not
    stop_word_list: stop-word list passed to the co-occurrence model

    return: nothing, saves the embedding in three files
    """
    assert train_type in ["pro", "con", "all"]

    # read sentences
    print("reading training file")
    docs = read_doc(doc_name, restrict=restrict)

    # create co-occurrence matrix
    if stop_word_list != 'english':
        stop_word_file = "data/fine_tune_docs/" + train_type + "_stop_words"
        stop_word_list = read_stop_word(stop_word_file)
    coocur_model = Cooccurrence(ngram_range=(1, 1), stop_words=stop_word_list,
                                normalize=normal)
    Xc = coocur_model.fit_transform(docs)  # co-occurrence matrix
    Xc = np.squeeze(np.asarray(Xc.todense()))
    print(Xc.shape)

    # read public GloVe embedding
    print("reading glove original embedding")
    original_embedding = simple_glove2dict(glove_file)

    # create vocab
    print("creating vocabulary")
    vocab = create_word_list(coocur_model.vocabulary_)
    print("vocab_size:", len(vocab))

    # prepare for fine-tuning
    mittens_model = Mittens(n=glove_dim, max_iter=iteration)

    # fine-tune GloVe!
    print("training started")
    new_embeddings = mittens_model.fit(
        Xc, vocab=vocab, initial_embedding_dict=original_embedding)
    print("training finished")

    # store it in a way that can be used at https://projector.tensorflow.org/
    with open("result/" + ID + "_" + train_type + "_" + str(iteration) +
              "_embedding.tsv", "w") as f:
        for array in new_embeddings:
            for number in array:
                f.write(str(number) + "\t")
            f.write("\n")
    with open("result/" + ID + "_" + train_type + "_" + str(iteration) +
              "_vocab.tsv", "w") as f2:
        for word in vocab:
            f2.write(word + "\n")

    # store it in a way for the common GloVe readers; this should be ready to be
    # read by simple_glove2dict above and the glove2dict function in /utils/vec_function
    with open("result/" + ID + "_" + train_type + "_" + str(iteration) +
              "_word2vectorGloVe." + str(glove_dim) + "d.txt", "w") as f3:
        for index, word in enumerate(vocab):
            f3.write(word + " ")
            for number in new_embeddings[index]:
                f3.write(str(number) + " ")
            f3.write("\n")
    print("file written")
pragmatic_vocab = intersection(pragmatic_list, vocab_total)
union_vocab = union(vocab_top_5000, pragmatic_vocab)
pd.DataFrame(union_vocab).to_csv('union_vocab.csv')
cooccurence_matrix = get_cooccurence_matrix(vocab_path, text_data_path, stock_list_path)
df = pd.DataFrame(cooccurence_matrix)
df.to_csv('coocur_union.csv')

vocab = pd.read_csv('union_vocab.csv')
vocabulary = dict(zip(vocab.iloc[:, 1], range(0, len(vocab))))
vocab = vocabulary.keys()
cooccurrence = pd.read_csv('coocur_union.csv').iloc[:, 1:].to_numpy()

mittens_model = Mittens(n=300, max_iter=1000)
original_embedding = glove2dict('../../data/glove.6B.300d.txt')
new_embeddings = mittens_model.fit(cooccurrence, vocab=vocab,
                                   initial_embedding_dict=original_embedding)
print("MITTENS TRAINED")

filename_train = "../../data/train_pk.json"
filename_test = "../../data/test_pk.json"
filename_val = "../../data/val_pk.json"

maxlen = 0
maxlen = max(maxlen, get_max_len(filename_train))
maxlen = max(maxlen, get_max_len(filename_val))
maxlen = max(maxlen, get_max_len(filename_test))