def __init__(self):
    """Create the co-occurrence corpus and the untrained GloVe model.

    The corpus is seeded with the ``stoi`` mapping of the ``'tgt'`` entry
    found in the saved dialogue vocabulary file. The GloVe hyper-parameters
    are read from the ``args`` object visible in the enclosing scope.
    """
    # Corpus model
    vocab_fields = dict(torch.load("../data/dialogue.vocab.pt", "text"))
    target_stoi = vocab_fields['tgt'].stoi
    self.corpus_model = Corpus(dictionary=target_stoi)

    # Model
    self.glove = Glove(
        no_components=args.no_components,
        learning_rate=args.learning_rate,
    )
def train_glove_fashionrec(dimensionality, context, epochs):
    """Train GloVe on the IG corpora and persist every artefact.

    Args:
        dimensionality: number of embedding components.
        context: co-occurrence window size.
        epochs: number of training epochs (converted with ``int``).

    Writes the fitted corpus model, the GloVe model, a text note with the
    training time and a ``.vec`` export. Returns nothing.
    """
    total_count, vocab_size = corpus_stats("data/clean2_corpus.txt")
    print("total word count: {}, vocabulary size: {}".format(
        total_count, vocab_size))

    # Every output path shares the same "<epochs>_d<dim>_c<context>" tag.
    run_tag = "glove_fashion_epochs{}_d{}_c{}".format(
        epochs, dimensionality, context)
    fileName = "results/training/" + run_tag + "_" + ".txt"

    raw_text = readCorpus()
    tokenised = (line.split(" ") for line in raw_text.split("\n"))

    corpus_model = Corpus()
    start_time = datetime.now()
    corpus_model.fit(tokenised, window=context)
    corpus_model.save("trained/" + run_tag + "_corpus" + ".model")

    glove = Glove(no_components=dimensionality, learning_rate=0.05)
    glove.fit(corpus_model.matrix,
              epochs=int(epochs),
              no_threads=8,
              verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    time_elapsed = datetime.now() - start_time

    glove.save("trained/" + run_tag + "_vecs" + ".model")

    notes = ("Glove Fashion Data,{} dim, {} context, {} epochs \n"
             "Training time: {}").format(dimensionality, context, epochs,
                                         time_elapsed)
    save_to_file(fileName, notes)

    save_glove_bin_to_vec(glove, "trained/" + run_tag + "_vecs" + ".vec")
def train(path, freq, window, dim, lr, epochs):
    """Train a GloVe model on every file in ``path`` and save it.

    Each file becomes one token list. Words whose total count over all
    files is not strictly greater than ``freq`` are dropped before the
    co-occurrence matrix is built.

    Args:
        path: directory containing the pre-processed text files.
        freq: frequency threshold; only words with count > ``freq`` are kept.
        window: co-occurrence window size.
        dim: number of embedding components.
        lr: GloVe learning rate.
        epochs: number of training epochs.

    The model is written to ``glove.model``. Any failure is reported on
    stdout (best-effort behaviour preserved) instead of being raised.
    """
    from collections import Counter

    print("Start of train method")
    try:
        # Read and tokenise each file once. str.split() with no argument
        # already splits on newlines, so no prior substitution is needed.
        documents = []
        for name in os.listdir(path):
            with open(os.path.join(path, name), 'r') as handle:
                documents.append(handle.read().split())

        counts = Counter()
        for tokens in documents:
            counts.update(tokens)
        print("Created Dictionary for frequencies of words.")

        lines = [[word for word in tokens if counts[word] > freq]
                 for tokens in documents]
        print(
            "Converted preprocessed text data in input format of array of array of words."
        )

        corpus = Corpus()
        corpus.fit(lines, window=window)
        glove = Glove(no_components=dim, learning_rate=lr)
        glove.fit(corpus.matrix, epochs=epochs, verbose=True)
        glove.add_dictionary(corpus.dictionary)
        glove.save('glove.model')
        print("Saved the trained model to glove.model.")
    except Exception as exc:
        # Keep the original best-effort contract, but do not hide the cause
        # and do not swallow KeyboardInterrupt/SystemExit.
        print("Error occured in training glove model")
        print(repr(exc))
def parse_Word2Vec(full_content):
    """Train GloVe on ``full_content`` and record the top match per article type.

    For every row of ``/home/ubuntu/corpus/article_types.csv`` (tab
    separated, first column used) the article type is stripped of
    punctuation, lemmatised and looked up with ``Glove.most_similar``.
    The original article type and its best match are written on
    consecutive lines of ``/home/ubuntu/corpus/results.txt``.

    Args:
        full_content: iterable of token lists passed to ``Corpus.fit``.
    """
    corpus = Corpus()
    corpus.fit(full_content, window=10)
    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)

    # Loop invariants: build the translation table and lemmatiser once.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    lemmatizer = WordNetLemmatizer()

    # Context managers guarantee both files are closed even on error.
    with open('/home/ubuntu/corpus/results.txt', 'w') as results, \
            open('/home/ubuntu/corpus/article_types.csv', 'r') as types_file:
        for row in csv.reader(types_file, delimiter="\t"):
            if not row:
                # csv.reader yields [] for blank lines; nothing to look up.
                continue
            article_type = row[0]
            article_type_clean = lemmatizer.lemmatize(
                article_type.translate(strip_punctuation))
            try:
                match = glove.most_similar(article_type_clean, number=10)
                matched_item = match[0][0]
            except Exception:
                # A failed lookup (e.g. a word missing from the dictionary)
                # or an empty result list skips this row.
                print('failed for: ' + article_type)
                continue
            print(article_type_clean + ' -> ' + str(matched_item))
            results.write(article_type + '\n')
            results.write(str(matched_item) + '\n')
def Myself_Model(self,
                 cropus_path,
                 save=None,
                 back_corpus=None,
                 epochs=10,
                 no_threads=8,
                 no_components=100,
                 learning_rate=0.05):
    """Train a GloVe model on the corpus read from ``cropus_path``.

    NOTE(review): the body contains ``yield``, so this is a generator
    function. Calling it executes nothing until the result is iterated,
    and ``return self.glove`` is only visible as ``StopIteration.value``.
    This looks unintended; confirm with the callers before changing it,
    because replacing the ``yield`` would alter the return type.

    Args:
        cropus_path: location handed to ``self.read_corpus``.
        save: optional path; when given the trained model is saved there.
        back_corpus: when not ``None`` the fitted corpus model is yielded
            before GloVe training starts.
        epochs: number of training epochs.
        no_threads: number of training threads.
        no_components: number of embedding components.
        learning_rate: GloVe learning rate.

    Side effects: sets ``self.get_data``, ``self.glove`` and ``self.model``.
    """
    self.get_data = self.read_corpus(cropus_path)
    corpus_model = Corpus()
    corpus_model.fit(self.get_data, window=10)
    if back_corpus is not None:
        yield corpus_model

    self.glove = Glove(no_components=no_components,
                       learning_rate=learning_rate)
    self.glove.fit(corpus_model.matrix,
                   epochs=epochs,
                   no_threads=no_threads,
                   verbose=True)
    self.glove.add_dictionary(corpus_model.dictionary)
    if save is not None:
        # e.g. save = 'model/articles_glove.model'
        self.glove.save(save)
    self.model = self.glove
    return self.glove
def glove_embed(data, embed_dim, window_size, epochs_, step_size):
    '''
    DESCRIPTION :
        Train Global Vectors (GloVe) word embeddings for the tokens in data
    INPUT:
        |--- data: input handed to get_tokens to obtain the token lists
        |--- embed_dim: [int] embedding dimension
        |--- window_size: [int] size of the token window considered for
             each token when counting co-occurrences
        |--- epochs_: [int] number of epochs for GloVe training
        |--- step_size: [float] learning rate for GloVe training
    OUTPUT:
        |--- embeddings: [np.ndarray] array of shape (vocabulary size,
             embed_dim); row i is the vector of the token with index i
        |--- vocab: [dict] stripped tokens as keys and the row index of
             each token in embeddings as values
        |--- glove: [Global Vectors Model] GloVe model trained on data
    '''
    sentences = get_tokens(data)
    model = Corpus()
    model.fit(sentences, window=window_size)
    glove = Glove(no_components=embed_dim, learning_rate=step_size)
    glove.fit(model.matrix, epochs=epochs_, no_threads=1, verbose=True)
    glove.add_dictionary(model.dictionary)

    embeddings = np.zeros((len(glove.dictionary), embed_dim))
    vocab = dict()
    for token, id_ in glove.dictionary.items():
        embeddings[id_, :] = glove.word_vectors[id_]
        # Use the real GloVe id rather than the enumeration position, so
        # vocab always points at the matching embeddings row even when
        # dict iteration order differs from id order.
        vocab[token.strip()] = id_
    return embeddings, vocab, glove
def feature_extract(path_dataset):
    """Extract graph and LSA features for every speech in a ``|``-separated CSV.

    For each value of the ``Fala`` column a co-occurrence matrix is built
    (window 79), turned into a graph, and reduced with a one-component
    truncated SVD.

    Args:
        path_dataset: path of the CSV file; the columns ``Fala``,
            ``Classe`` and ``comprimento`` are read.

    Returns:
        A tuple ``(features, labels)``: a DataFrame with one row per speech
        (average clustering, average shortest path length, ``comprimento``,
        mean and standard deviation of the SVD component, token count) and
        the ``Classe`` values with class 0 relabelled as -1.
    """
    feature_extract_dataset = []
    speeches = read_csv(path_dataset, sep="|")
    # Mixed speeches from Estamira and her family: class 1 is kept as is,
    # class 0 becomes -1.
    speeches['Classe'] = speeches['Classe'].replace(0, -1)

    # For each speech
    for indice, fala in enumerate(speeches.Fala):
        # Fresh co-occurrence counter and SVD for every speech.
        dataset = Corpus()
        lsa = TruncatedSVD(n_components=1)
        tokenized = [simple_preprocess(str(fala), deacc=True)]
        word_count = len(tokenized[0])
        dataset.fit(tokenized, window=79)
        graph = Graph(dataset.matrix)
        values_lsa = lsa.fit_transform(dataset.matrix)
        values_mean = mean(values_lsa, axis=0)
        values_std = std(values_lsa, axis=0)
        feature_extract_dataset.append([
            average_clustering(G=graph),
            average_shortest_path_length(G=graph),
            speeches.comprimento[indice],
            values_mean.item(),
            values_std.item(),
            word_count,
        ])
    return DataFrame(feature_extract_dataset), speeches['Classe'].values
def build_model_glove(args):
    """Return a GloVe model trained on a fresh or cached corpus model.

    The corpus model at ``args.corpus_model`` is rebuilt when the file is
    missing or when any path in ``args.input`` has a modification time at
    least as recent as it; otherwise it is loaded from disk.

    Args:
        args: object providing ``corpus_model``, ``input``, ``workers`` and
            ``verbose``; it is also forwarded to ``get_sentences``.

    Returns:
        The trained ``Glove`` instance with its dictionary attached.
    """
    settings = CONFIG['glove']

    # `or` short-circuits, so getmtime is never called on a missing file.
    rebuild = not os.path.exists(args.corpus_model) or \
        max(map(os.path.getmtime, args.input)) >= os.path.getmtime(args.corpus_model)

    if rebuild:
        # Build the corpus dictionary and the cooccurrence matrix.
        logging.info('Pre-processing corpus')
        corpus_model = Corpus()
        corpus_model.fit(get_sentences(args), window=settings['window'])
        corpus_model.save(args.corpus_model)
    else:
        # Try to load a corpus from disk.
        logging.info('Reading corpus statistics')
        corpus_model = Corpus.load(args.corpus_model)

    logging.info('Dict size: %s' % len(corpus_model.dictionary))
    logging.info('Collocations: %s' % corpus_model.matrix.nnz)

    # Train the GloVe model.
    logging.info('Training the GloVe model')
    glove = Glove(no_components=settings['size'],
                  learning_rate=settings['learning_rate'])
    glove.fit(corpus_model.matrix,
              epochs=settings['epochs'],
              no_threads=args.workers,
              verbose=args.verbose)
    glove.add_dictionary(corpus_model.dictionary)
    return glove
def get_embeddings(prepared_input):
    """Fit a 5-component GloVe model and save it to ``glove.model``.

    Args:
        prepared_input: iterable of token lists passed to ``Corpus.fit``.

    Nothing is returned; the only result is the saved model file.
    """
    cooccurrence = Corpus()
    cooccurrence.fit(prepared_input, window=10)

    fit_options = dict(epochs=30, no_threads=4, verbose=True)
    model = Glove(no_components=5, learning_rate=0.05)
    model.fit(cooccurrence.matrix, **fit_options)
    model.add_dictionary(cooccurrence.dictionary)
    model.save('glove.model')
def train_glove(save_dir, size):
    """Train GloVe on the full corpus, pickle the vectors, then start a REPL.

    Args:
        save_dir: path of the file that receives the pickled
            ``{word: vector}`` dictionary (despite the name it is opened
            as a file).
        size: number of embedding components.

    After saving, an interactive loop prints the nearest neighbours and the
    vector of each word typed by the user. The loop ends on end-of-input or
    Ctrl-C; a failed lookup is skipped.
    """
    print('START')
    f_corpus = get_full_corpus()
    corpus = Corpus()
    print('CREATE CORPUS')
    corpus.fit(f_corpus, window=10)
    glove = Glove(no_components=size, learning_rate=0.05)
    print('START LEARNING')
    glove.fit(corpus.matrix, epochs=60, no_threads=8, verbose=True)
    glove.add_dictionary(corpus.dictionary)

    print('START SAVE')
    dict_in_bin = {
        word: glove.word_vectors[word_indx]
        for word, word_indx in glove.dictionary.items()
    }
    with open(save_dir, "wb") as file:
        pickle.dump(dict_in_bin, file)

    print('COMMON TEST')
    while True:
        try:
            s = input("Введите строку: ")
        except (EOFError, KeyboardInterrupt):
            # The former bare `except: continue` swallowed these, so the
            # loop could not be left (and span forever on a closed stdin).
            break
        try:
            print(glove.most_similar(s, number=10))
            print(glove.word_vectors[glove.dictionary[s]])
        except Exception:
            # A failed lookup: ask again.
            continue
def train_and_save_model(data_dir,
                         model_name='LeGlove',
                         num_epochs=10,
                         parallel_threads=1):
    '''
    Build a training corpus from the data directory, fit a GloVe model to
    it and write the model to "[model_name].model" in the current directory.

    Parameters:
        data_dir (string): master directory containing all
            jurisdiction-level directories
        model_name (string): name of model to be used for output
        num_epochs (int): number of epochs for which to train model
        parallel_threads (int): number of parallel threads to use for
            training
    '''
    cooccurrence = Corpus()
    cooccurrence.fit(read_corpus(data_dir), window=CONTEXT_WINDOW)

    model = Glove(no_components=NUM_COMPONENTS, learning_rate=LEARNING_RATE)
    model.fit(
        cooccurrence.matrix,
        epochs=num_epochs,
        no_threads=parallel_threads,
        verbose=True,
    )
    model.add_dictionary(cooccurrence.dictionary)

    output_file = model_name + '.model'
    model.save(output_file)
def build_glove_embeddings(corpus):
    """
    DESCRIPTION:
        Applies the Glove python SGD algorithm given by the glove_python
        library and builds the word embeddings from our training set.
        Embeddings already stored in MY_GLOVE_PYTHON_EMBEDDINGS_TXT_FILE
        are returned directly without retraining.
    INPUT:
        corpus: a list of lists where each sub-list represents a tweet.
                The outer list represents the whole training dataset.
    OUTPUT:
        words: python dictionary of the form (word, [vector of embeddings])
    """
    words = load_glove_embeddings_from_txt_file(
        MY_GLOVE_PYTHON_EMBEDDINGS_TXT_FILE)
    if words is not None:
        return words

    options = algorithm['options']['WE']
    model = Corpus()
    model.fit(corpus, window=options['window_size'])
    glove = Glove(no_components=options['we_features'],
                  learning_rate=options['learning_rate'])
    print('\nFitting Glove Python Embeddings')
    glove.fit(model.matrix, epochs=options['epochs'])
    glove.add_dictionary(model.dictionary)

    words = {
        w: np.array(glove.word_vectors[id_])
        for w, id_ in glove.dictionary.items()
    }
    # Cache the result so the next call can skip training.
    store_embeddings_to_txt_file(words, MY_GLOVE_PYTHON_EMBEDDINGS_TXT_FILE)
    return words
def generate_glove_corpus():
    """Build a co-occurrence corpus from article headers and save it.

    Reads the JSON file at the module-level ``article_info_path``, turns the
    ``sentence_header`` sentences of every article into one word list, fits
    a ``Corpus`` with window 10 and saves it to the module-level
    ``output_path``. Progress is reported through ``write_log``.
    """
    write_log('GloVe Load article info : Start')
    with open(article_info_path, 'r') as f_art:
        article_info = json.load(f_art)
    write_log('GloVe Load article info : End')

    write_log('GloVe Generate sentences : Start')
    sentences = []
    for dict_info in article_info.values():
        sentence_header = dict_info.get('sentence_header')
        sentence_body = dict_info.get('sentence_body')

        # Articles missing either field are skipped, although only the
        # header sentences are used below.
        if sentence_header is None or sentence_body is None:
            continue

        words = []
        for sentence in sentence_header:
            words.extend(sentence.split(' '))
        sentences.append(words)
    write_log('GloVe Generate sentences : End')

    write_log('GloVe Generate corpus : Start')
    corpus = Corpus()
    corpus.fit(sentences, window=10)
    write_log('GloVe Generate corpus : End')

    corpus.save(output_path)
def main(args):
    """Accumulate per-document co-occurrence counts, weighted per document.

    ``global_vocab.pkl`` is expected to hold the dictionary of a corpus
    model fitted beforehand on the whole corpus. Each document of
    ``args.corpus`` is fitted on its own with that fixed dictionary, its
    co-occurrence counts are multiplied by the weight in column 2 of the
    ``args.diff_bias`` CSV (row i belongs to document i) and summed into a
    single sparse matrix, which is pickled to ``doc_matrices_weighted.pkl``.

    Args:
        args: object providing ``corpus`` and ``diff_bias`` paths.
    """
    # Opening vocab to create the corpus object.
    # NOTE: pickle.load can execute arbitrary code; only load a vocabulary
    # file that this project produced itself.
    with open('global_vocab.pkl', 'rb') as f:
        vocab_dict = pickle.load(f)

    doc_model = Corpus(dictionary=vocab_dict)
    texts = list(read_corpus(args.corpus))

    # Opening weight csv.
    diff_bias = pd.read_csv(args.diff_bias, header=0)
    # col 2 is science/arts, col 3 is weapons/instruments

    # Sum into a plain dict. `Counter + Counter` silently drops every entry
    # whose sum is not positive (wrong for negative weights) and rebuilds
    # the whole accumulator on every document.
    total = {}
    for i, text in enumerate(tqdm(texts)):
        doc_model.fit([text])
        weight = diff_bias.iloc[i, 2]
        for cell, count in doc_model.matrix.todok().items():
            total[cell] = total.get(cell, 0) + weight * count

    # The matrix is vocabulary x vocabulary. The shape used to be derived
    # from the number of non-zero cells, which has no relation to the index
    # range and fails whenever an index exceeds that count.
    vocab_size = len(vocab_dict)
    if total:
        rows, cols = zip(*total)
        csr = sp.csr_matrix(
            (list(total.values()), (list(rows), list(cols))),
            shape=(vocab_size, vocab_size))
        total = csr.tocoo()
    else:
        # No documents (or no co-occurrences): an empty matrix, not a crash.
        total = sp.coo_matrix((vocab_size, vocab_size))

    print(total.get_shape())
    with open('doc_matrices_weighted.pkl', 'wb') as handle:
        pickle.dump(total, handle, protocol=pickle.HIGHEST_PROTOCOL)
def initiate_model(self, input_corpus):
    """Fit the corpus model and a 100-component GloVe model.

    Args:
        input_corpus: value handed to the private ``__read_corpus`` reader.

    Side effects: sets ``self.corpus_model`` and ``self.glove``.
    """
    sentences = self.__read_corpus(input_corpus)

    cooccurrence = Corpus()
    self.corpus_model = cooccurrence
    cooccurrence.fit(sentences, window=10)

    embedder = Glove(no_components=100, learning_rate=0.05)
    self.glove = embedder
    embedder.fit(cooccurrence.matrix, epochs=200)
    embedder.add_dictionary(cooccurrence.dictionary)
def train_glove(target_group, glove_para, src_file, save_model_name):
    """
    example: train_glove(target_group='words', glove_para=glove_para_word)
    after saving the model, you can use it by:
        glove_ana = Glove.load('glove_words.model')

    :param target_group: 'words' or 'chars'
    :param glove_para: e.g. glove_para_word = {'window_size': 4,
        'no_components': 300, 'learning_rate': 0.05, 'no_epochs': 2,
        'parallelism': 4}
    :param src_file: source file handed to read_corpus
    :param save_model_name: path the trained GloVe model is saved to
    :return: None
    """
    corpus_model = Corpus()
    corpus_model.fit(
        read_corpus(src_file=src_file, words_or_chars=target_group),
        window=glove_para['window_size'],
    )  # avg word size is 6 for each sentence
    corpus_model.save('corpus_model_{}.model'.format(target_group))

    # print() with a single argument behaves the same on Python 2 and 3;
    # the former `print target_group` statement is a SyntaxError on Python 3.
    print(target_group)
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

    print('Training the GloVe model')
    glove = Glove(no_components=glove_para['no_components'],
                  learning_rate=glove_para['learning_rate'])
    glove.fit(corpus_model.matrix,
              epochs=glove_para['no_epochs'],
              no_threads=glove_para['parallelism'],
              verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save(save_model_name)
def test_fitting():
    """
    Check that training reduces the squared reconstruction error.
    """
    num_sentences = 5000
    seed = 10

    corpus = Corpus()
    corpus.fit(generate_training_corpus(num_sentences,
                                        vocabulary_size=50,
                                        seed=seed))

    # Dense log co-occurrence matrix: the target the model should reproduce.
    target = corpus.matrix.copy()
    target.data = np.log(target.data)
    target = np.asarray(target.todense())

    def squared_error(epochs):
        # Fit a fresh model for `epochs` epochs and measure the total
        # squared difference between its reconstruction and the target.
        model = Glove(no_components=100, learning_rate=0.05)
        model.fit(corpus.matrix, epochs=epochs, no_threads=2)
        reconstruction = _reproduce_input_matrix(model)
        return ((reconstruction - target) ** 2).sum()

    # Check that the performance is poor without fitting
    assert squared_error(0) > 30000.0

    # Check that it is good with fitting
    assert squared_error(500) < 1500.0
def train_glove(src_filename, dim=100):
    """Train a GloVe model on ``src_filename`` and save it under DATA_DIR.

    Args:
        src_filename: source passed to ``get_lines``.
        dim: number of embedding components; also part of the file name
            ``glove.<dim>d.model``.
    """
    cooccurrence = Corpus()
    cooccurrence.fit(get_lines(src_filename), window=10)

    model = Glove(no_components=dim, learning_rate=0.001)
    model.fit(
        cooccurrence.matrix,
        epochs=100,
        no_threads=20,
        verbose=True,
    )
    model.add_dictionary(cooccurrence.dictionary)

    target = DATA_DIR + 'glove.{}d.model'.format(dim)
    model.save(target)
def main():
    """Train GloVe on the saved ``bioc-corpus-AZ2.model`` corpus.

    Loads the pre-computed corpus model, fits a 100-component GloVe model
    for 10 epochs on 16 threads and saves it to ``bioc-glove-AZ2.model``.
    """
    # Corpus.load is called on the class, so no throw-away instance is
    # needed first.
    corpus_model = Corpus.load('bioc-corpus-AZ2.model')

    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=10, no_threads=16, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save('bioc-glove-AZ2.model')
def getWordEmbeddings(processed_text):
    """Train GloVe on ``processed_text`` and print the neighbours of 'price'.

    Args:
        processed_text: iterable of token lists passed to ``Corpus.fit``.

    Nothing is returned; the result of ``most_similar('price')`` is printed.
    """
    cooccurrence = Corpus()
    cooccurrence.fit(processed_text, window=3)

    model = Glove(no_components=500, learning_rate=0.001)
    model.fit(
        cooccurrence.matrix,
        epochs=300000,
        no_threads=4,
        verbose=True,
    )
    model.add_dictionary(cooccurrence.dictionary)

    neighbours = model.most_similar('price')
    print(neighbours)
def glove_vectors(x,
                  embedding_size,
                  epochs=50,
                  lr=0.05,
                  alpha=0.75,
                  max_count=100,
                  tmp_loc='glove.w2vmodel'):
    """Train GloVe on categorical rows and return the vectors as KeyedVectors.

    Args:
        x: data convertible to a DataFrame; every row is one "sentence".
        embedding_size: number of embedding components.
        epochs: training epochs (GloVe paper: 50 for dimensionality < 300,
            100 otherwise).
        lr: learning rate.
        alpha: exponent of the loss weighting based on how likely a
            co-occurrence is (less likely = less weight).
        max_count: co-occurrence count at which the weighting saturates.
        tmp_loc: temporary word2vec-format file used to hand the vectors
            to gensim; it is removed before returning.

    Returns:
        The ``KeyedVectors`` model loaded from the temporary file.
    """
    # Create the dict ourselves so that the ids correspond to their location
    # in the df, starting to count from the first column downwards.
    df = pd.DataFrame(x)
    word_id_dict = create_vocab_dict(df)

    corpus = Corpus(dictionary=word_id_dict)
    # Standard GloVe scales a co-occurrence by the distance between context
    # and focus word. Distance has no meaning for purely categorical
    # variables, so scaling is disabled and the window spans the whole row.
    corpus.fit(df.values.tolist(),
               window=len(df.columns),
               distance_scaling=False)

    glove = Glove(no_components=embedding_size,
                  learning_rate=lr,
                  alpha=alpha,
                  max_count=max_count)
    glove.fit(corpus.matrix, epochs=epochs, no_threads=1, verbose=True)
    glove.add_dictionary(corpus.dictionary)

    try:
        glove.save_word2vec_format(tmp_loc)
        model = KeyedVectors.load_word2vec_format(tmp_loc)
    finally:
        # Remove the temporary file even when saving or loading failed.
        if os.path.exists(tmp_loc):
            os.remove(tmp_loc)
    return model
def train_glove(corpus,
                vocabulary,
                zero_init_indices=0,
                rand_init_indices=1,
                embedding_dim=300):
    """Train GloVe on a corpus and return the filtered embedding matrix.

    A python implementation of Glove is used here; the official C
    implementation is also highly recommended:
    https://github.com/stanfordnlp/GloVe/blob/master/demo.sh

    Args:
        corpus: list of tokenized texts, corpus to train on
        vocabulary: dict, a mapping of words to indices
        zero_init_indices: int or a list, the indices which use
            zero-initialization. These indices usually represent the
            padding token.
        rand_init_indices: int or a list, the indices which use random
            initialization. These indices usually represent other special
            tokens, such as the "unk" token.
        embedding_dim: int, dimensionality of embedding

    Returns:
        np.array, a word embedding matrix.
    """
    cooccurrence = Corpus()
    cooccurrence.fit(corpus, window=10)

    model = Glove(no_components=embedding_dim, learning_rate=0.05)
    model.fit(cooccurrence.matrix, epochs=10, no_threads=4, verbose=True)
    model.add_dictionary(cooccurrence.dictionary)

    # Map every trained word to its vector before filtering.
    word_vectors = {}
    for word in model.dictionary:
        word_vectors[word] = model.word_vectors[model.dictionary[word]]

    return filter_embeddings(word_vectors, embedding_dim, vocabulary,
                             zero_init_indices, rand_init_indices)
def train_glove(path):
    """Train a 300-component GloVe model on a Text8-style corpus file.

    Args:
        path: corpus file read through gensim's ``Text8Corpus``.

    Returns:
        The trained ``Glove`` model. It is also saved to
        ``embeddings_models/model_glove_<TRAINING_SENTENCES>`` and that file
        is passed to ``glove2word2vec`` to produce ``<file>_modified``.
    """
    from gensim.models.word2vec import Text8Corpus
    from gensim.scripts.glove2word2vec import glove2word2vec
    from glove import Corpus, Glove

    sentences = list(Text8Corpus(path))

    corpus = Corpus()
    corpus.fit(sentences, window=10)
    glove = Glove(no_components=300, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)

    file_name = 'embeddings_models/model_glove_' + str(TRAINING_SENTENCES)
    glove.save(file_name)
    # NOTE(review): glove2word2vec presumably expects a text GloVe vector
    # file, whereas Glove.save appears to write a serialized model; verify
    # that this conversion actually works on the saved file.
    glove2word2vec(file_name, file_name + '_modified')

    # print() with one argument works on both Python 2 and Python 3.
    print('Finished')
    return glove
def train_glove(corpus, params, exp_id, save_dir, save_dict=False):
    """Train a 100-component GloVe model, reusing a cached corpus model.

    Args:
        corpus: iterable of token lists.
        params: dict providing ``'window'``, ``'alpha'`` (used as the
            learning rate) and ``'workers'``.
        exp_id: experiment id; part of the cached corpus file name.
        save_dir: directory of the cached corpus model.
        save_dict: when true, a newly fitted corpus model is saved.

    Returns:
        The trained ``Glove`` instance with its dictionary attached.
    """
    dictionary = load_glove_dictionary(exp_id, save_dir)

    # Build the corpus dictionary and the cooccurrence matrix.
    print('Pre-processing corpus')
    cache_name = 'glove_dict_{}.model'.format(exp_id)
    dict_path = os.path.join(save_dir, cache_name)

    if not os.path.exists(dict_path):
        # Nothing cached: count co-occurrences with the preloaded
        # dictionary, ignoring words that are missing from it.
        corpus_model = Corpus(dictionary)
        corpus_model.fit(corpus,
                         window=params['window'] * 2,
                         ignore_missing=True)
        if save_dict:
            corpus_model.save(dict_path)
    else:
        corpus_model = Corpus.load(dict_path)

    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

    glove = Glove(no_components=100, learning_rate=params['alpha'])
    glove.fit(corpus_model.matrix,
              epochs=50,
              no_threads=params['workers'],
              verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    return glove
def glove(eng_vect, L1_eng, hin_vect, L1_hin):
    """Train separate English and Hindi GloVe models and print diagnostics.

    Args:
        eng_vect: tokenised English sentences (must be re-iterable).
        L1_eng: English words whose nearest neighbours are printed.
        hin_vect: tokenised Hindi sentences (must be re-iterable).
        L1_hin: Hindi words whose nearest neighbours are printed.

    Saves the models to ``gloveE.model`` and ``gloveH.model``, prints the
    first 50 token embeddings of each language and then the most similar
    words for every query word. Query words for which the lookup fails
    are skipped.
    """
    eng_cor = Corpus()  # english co-occurrence corpus
    hin_cor = Corpus()  # hindi co-occurrence corpus
    eng_cor.fit(eng_vect, window=10)
    hin_cor.fit(hin_vect, window=10)

    gloveE = Glove(no_components=4, learning_rate=0.04)
    gloveH = Glove(no_components=4, learning_rate=0.04)
    gloveE.fit(eng_cor.matrix, epochs=400, no_threads=5, verbose=False)
    gloveH.fit(hin_cor.matrix, epochs=400, no_threads=5, verbose=False)

    # Attach the word -> id dictionaries so words can be looked up.
    gloveE.add_dictionary(eng_cor.dictionary)
    gloveE.save('gloveE.model')
    gloveH.add_dictionary(hin_cor.dictionary)
    gloveH.save('gloveH.model')

    # printing the embeddings for english words
    eng_emb = [
        gloveE.word_vectors[gloveE.dictionary[word]]
        for sentence in eng_vect for word in sentence
    ]
    print(eng_emb[:50])

    # printing the embeddings for hindi words
    hin_emb = [
        gloveH.word_vectors[gloveH.dictionary[word]]
        for sentence in hin_vect for word in sentence
    ]
    print(hin_emb[:50])

    # printing the most similar words for the L1 words of each language.
    # The former `count1 = count1+1` used a never-initialised local: it
    # raised UnboundLocalError on every iteration, hidden by a bare except.
    for model, queries in ((gloveE, L1_eng), (gloveH, L1_hin)):
        for query in queries:
            try:
                neighbours = model.most_similar(query)
            except Exception:
                # Lookup failed for this word; skip it.
                continue
            print(neighbours)
def buildCorpus(data_path=None, context_window=5):
    """Fit and return a co-occurrence corpus model.

    Args:
        data_path: location handed to ``textGenerator``.
        context_window: co-occurrence window size.

    Returns:
        The fitted ``Corpus`` instance.
    """
    print('Fitting data...')
    # initialise, then fit on the generated text
    fitted = Corpus()
    text_stream = textGenerator(data_path)
    fitted.fit(text_stream, window=context_window)
    return fitted
def build_glove_word_vectors(data_frame, vec_dim, vectorizer, window_size,
                             niter):
    """Train GloVe on ``data_frame.post`` using the vectorizer's vocabulary.

    Args:
        data_frame: object whose ``post`` attribute holds the documents.
        vec_dim: number of embedding components.
        vectorizer: fitted vectorizer; its ``vocabulary_`` is used as the
            corpus dictionary.
        window_size: co-occurrence window size.
        niter: number of training epochs.

    Returns:
        The trained ``Glove`` instance with its dictionary attached.
    """
    cooccurrence = Corpus(vectorizer.vocabulary_)
    # Words missing from the supplied vocabulary are ignored.
    cooccurrence.fit(data_frame.post, window=window_size, ignore_missing=True)

    model = Glove(no_components=vec_dim, learning_rate=0.01)
    model.fit(
        cooccurrence.matrix,
        epochs=niter,
        no_threads=4,
        verbose=True,
    )
    model.add_dictionary(cooccurrence.dictionary)
    return model
def trainShake2(self):
    """Train ``self.glove`` on the lines from ``self.shakespeare_lines()``.

    Fits a 100-component GloVe model for 30 epochs on 4 threads and stores
    it, with its dictionary attached, in ``self.glove``.
    """
    cooccurrence = Corpus()
    cooccurrence.fit(self.shakespeare_lines(), window=10)

    model = Glove(no_components=100, learning_rate=0.05)
    self.glove = model
    model.fit(
        cooccurrence.matrix,
        epochs=30,
        no_threads=4,
        verbose=True,
    )
    model.add_dictionary(cooccurrence.dictionary)
def getGloveEmbedding(seqs, size=300, window=10, epochs=20):
    """Train GloVe on ``seqs`` and return the dictionary and the vectors.

    Args:
        seqs: iterable of token sequences.
        size: number of embedding components.
        window: co-occurrence window size.
        epochs: number of training epochs.

    Returns:
        A tuple ``(dictionary, word_vectors)``: the corpus dictionary and
        the trained model's ``word_vectors``.
    """
    cooccurrence = Corpus()
    cooccurrence.fit(seqs, window=window)

    model = Glove(no_components=size, learning_rate=0.05)
    model.fit(cooccurrence.matrix, epochs=epochs, verbose=True)

    vocabulary = cooccurrence.dictionary
    vectors = model.word_vectors
    return vocabulary, vectors
def train_model(line):
    """Train a 5-component GloVe model, save it and return it.

    Args:
        line: iterable of token lists passed to ``Corpus.fit`` (default
            window).

    Returns:
        The trained ``Glove`` instance, also saved to ``glove.model``.
    """
    cooccurrence = Corpus()
    cooccurrence.fit(line)

    model = Glove(no_components=5, learning_rate=0.05, random_state=0)
    fit_options = dict(epochs=10, no_threads=100, verbose=True)
    model.fit(cooccurrence.matrix, **fit_options)
    model.add_dictionary(cooccurrence.dictionary)
    model.save('glove.model')
    return model