def load_vocab_all(load=True):
    if not load:
        vocab_dict = {}
        reverse_vocab_dict = {}
        embedding = glove.Glove()
        vocabs = build_vocab_all()
        vocab_tokens = []
        for i, word in enumerate(vocabs):
            vocab_dict[word] = i
            reverse_vocab_dict[i] = word
            vocab_tokens.append([word])
        np.save('data/vocab_dict_all.npy', vocab_dict)
        np.save('data/reverse_vocab_dict_all.npy', reverse_vocab_dict)
        vocab_emb = embedding.embedding(vocab_tokens, maxlen=1)
        vocab_emb = vocab_emb[:, 0]  # retrieve the single embedding per token
        np.save('data/vocab_emb_all.npy', vocab_emb)
    else:
        vocab_emb = np.load('data/vocab_emb_all.npy')
        # allow_pickle is required to load dict payloads on NumPy >= 1.16.4
        vocab_dict = np.load('data/vocab_dict_all.npy', allow_pickle=True).item()
        reverse_vocab_dict = np.load('data/reverse_vocab_dict_all.npy', allow_pickle=True).item()
    print('Vocab shape:')
    print(vocab_emb.shape)
    return vocab_dict, reverse_vocab_dict, vocab_emb
def _do_glove(self, package, cooccurrence_dict, dimensions, alpha, x_max, vocab):
    glove_start = time.time()
    model = glove.Glove(cooccurrence_dict, d=dimensions, alpha=alpha, x_max=x_max)
    glove_time = time.time() - glove_start
    log.getLogger().info("glove_time " + str(glove_time))
    glove_train_start = time.time()
    model.train(batch_size=200, workers=9)
    glove_train_time = time.time() - glove_train_start
    log.getLogger().info("glove_train_time " + str(glove_train_time))
    glove_list = self.output_format(model.W, vocab)
    glove_output_key = str(dimensions) + "d_" + str(x_max) + "_" + str(alpha) + "_glove_output"
    # Accumulate the generated output keys under the literal "glove_output_key" entry.
    if "glove_output_key" in package.any_inputs_dict:
        package.any_inputs_dict["glove_output_key"] += "," + glove_output_key
    else:
        package.any_inputs_dict["glove_output_key"] = glove_output_key
    package.any_analysis_dict[glove_output_key] = glove_list
    package.any_analysis_dict["glove_vocab"] = vocab
def build_gensim(self, docs, model=None):
    dp = DocumentPreprocessor()
    docs_tokenized = (dp.tokenizer(doc) for doc in docs)

    # Get the word co-occurrence matrix -- needs lots of RAM!!
    cooccur = glove.Corpus()
    cooccur.fit(docs_tokenized, window=10)
    # wiki_generator = lambda: (filter_text(text) for text in wiki)
    # cooccur.fit(wiki_generator(), window=10)

    # ... and train the GloVe model itself, using 10 epochs
    if model is None:
        model = glove.Glove(no_components=600, learning_rate=0.05)
    model.fit(cooccur.matrix, epochs=10)
    model.add_dictionary(cooccur.dictionary)  # needed for the word -> row lookups below

    doc_vectors = []
    docs_tokenized = (dp.tokenizer(doc) for doc in docs)
    for doc in docs_tokenized:
        doc_vector = np.zeros(len(model.word_vectors[0]), dtype=float)
        if len(doc):
            for word in doc:
                try:
                    doc_vector += model.word_vectors[model.dictionary[word]]
                except KeyError:
                    log.debug("Word: {} doesn't appear in model.".format(word))
        else:
            log.debug('Empty document in data')
        doc_vectors.append(doc_vector)
    return np.array(doc_vectors), model
def _embed_list(ls, g=None, maxlen_p=20, maxlen_q=2):
    if g is None:
        g = glove.Glove()
    # Validate that every line has exactly a question and a column field.
    for line in ls:
        assert len(line.split('\t')) == 2
    questions = [nltk.word_tokenize(line.split('\t')[0]) for line in ls]
    cols = [nltk.word_tokenize(line.split('\t')[1]) for line in ls]
    # maxlen-1 since a <bos> symbol is added to each sequence
    return (g.embedding(questions, maxlen=maxlen_p - 1),
            g.embedding(cols, maxlen=maxlen_q - 1))
def train_glove(dic_comtx, dimension=100, alpha=0.75, x_max=100.0, epoch=20, batch=200):
    model = glove.Glove(dic_comtx, d=dimension, alpha=alpha, x_max=x_max)
    # Loop variable renamed so it does not shadow the epoch-count argument.
    for ep in range(epoch):
        err = model.train(batch_size=batch, workers=4)
        print("epoch %d, error %.3f" % (ep, err), flush=True)
    wordvectors = model.W  # extract the learned word vectors
    return wordvectors
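# A minimal usage sketch for train_glove above; the toy co-occurrence dict
# mirrors the {row: {col: count}} format used in the standalone example near
# the end of this section (the counts here are illustrative, not real data).
toy_cooccur = {0: {0: 1.0, 2: 3.5}, 1: {2: 0.5}, 2: {0: 3.5, 1: 0.5, 2: 1.2}}
vectors = train_glove(toy_cooccur, dimension=50, epoch=5, batch=10)
print(vectors.shape)  # (vocab_size, dimension)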
def train_glove(self):
    processed_sentence = self.preprocessing(self._filename)
    model = glove.Glove(processed_sentence, d=self._n_dim, alpha=0.75, x_max=100.0)
    for epoch in range(150):
        err = model.train(batch_size=150, workers=3)
    X = model.W  # learned word vectors
    return model, X
def train_glove(self):
    processed_sentence = self.preprocessing(self._filename)
    model = glove.Glove(processed_sentence, d=300, alpha=0.75, x_max=100.0)
    for epoch in range(150):
        err = model.train(batch_size=150, workers=3)
    X = model.W  # learned word vectors
    self.save_embed_file(
        'embed_{}/{}.embd'.format(self._embed_dir, self.class_name),
        X, self._labels)
    return X
def train_all(self):
    """Build the vocabulary and train the model."""
    documents = list(self.read_input())
    corpus = glove.Corpus()
    corpus.fit(documents, window=self.window_size)
    self.model = glove.Glove(no_components=self.vector_size,
                             learning_rate=self.learning_rate,
                             alpha=self.alpha)
    self.model.fit(corpus.matrix, epochs=self.iterations,
                   no_threads=self.workers, verbose=True)
    self.dictionary = corpus.dictionary
    self.model.add_dictionary(corpus.dictionary)
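# For reference, a self-contained sketch of the Corpus -> Glove -> add_dictionary
# pipeline that train_all wraps (glove_python API; the toy documents and
# hyperparameters are illustrative assumptions, not taken from the class above).
import glove

docs = [['the', 'cat', 'sat'], ['the', 'dog', 'sat'], ['a', 'cat', 'ran']]

corpus = glove.Corpus()
corpus.fit(docs, window=10)                 # accumulate co-occurrence counts

model = glove.Glove(no_components=50, learning_rate=0.05)
model.fit(corpus.matrix, epochs=10, no_threads=2, verbose=False)
model.add_dictionary(corpus.dictionary)     # enables word-level lookups

print(model.most_similar('cat', number=3))  # nearest neighbours by cosine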
def train(word2id, id2word, corpus, win, dim):
    cooccur = glove.Corpus(dictionary=word2id)
    cooccur.fit(corpus(), window=win)
    logger.info("glove model creating")
    logger.info('Dict size: %s' % len(cooccur.dictionary))
    logger.info('Collocations: %s' % cooccur.matrix.nnz)
    model = glove.Glove(no_components=dim, learning_rate=0.05)
    model.fit(cooccur.matrix, epochs=10, no_threads=5, verbose=True)
    model.add_dictionary(cooccur.dictionary)
    model.word2id = dict((utils.to_unicode(w), id)
                         for w, id in model.dictionary.items())
    model.id2word = gensim.utils.revdict(model.word2id)
    utils.pickle(model, './model/glove.model')
def train_glove(self):
    cleaned_text = self.text_preprocessing()
    dic = self.count_vectorize(cleaned_text)
    model = glove.Glove(dic, d=self._n_dim, alpha=0.75, x_max=100.0)
    for epoch in range(150):
        err = model.train(batch_size=150, workers=3)
        # print("epoch %d, error %.3f" % (epoch, err), flush=True)
    X = model.W  # learned word vectors
    # self.save_embed_file('embed_{}/{}.embd'.format(self._embed_dir, self.class_name), X, self._labels)
    return model, X
def generate_embedding_dictionary(docs_tokens, embedding_dim, iters, window=2,
                                  learning_rate=0.05):
    time_start = time()
    corpus_model = glove.Corpus()
    corpus_model.fit(docs_tokens, window=window)
    glove_model = glove.Glove(no_components=embedding_dim,
                              learning_rate=learning_rate)
    glove_model.fit(corpus_model.matrix, epochs=iters, no_threads=4)
    end_time = time()
    glove_model.add_dictionary(corpus_model.dictionary)
    word_to_index = glove_model.dictionary
    index_word = glove_model.inverse_dictionary
    embedding_dictionary = {index_word[i]: vector
                            for i, vector in enumerate(glove_model.word_vectors)}
    # embedding_dictionary["<<UNKNOWN>>"] = np.zeros(embedding_dim)
    return embedding_dictionary, embedding_dim, word_to_index, end_time - time_start
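# An illustrative call of generate_embedding_dictionary above, with made-up
# token lists; the returned dict maps each vocabulary word to its trained vector.
docs_tokens = [['error', 'disk', 'full'], ['disk', 'ok'], ['error', 'timeout']]
emb_dict, dim, word_to_index, elapsed = generate_embedding_dictionary(
    docs_tokens, embedding_dim=16, iters=5)
print(len(emb_dict), dim, word_to_index['disk'])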
def glove2vec(text_sentence, win=10, noc=1, lr=0.05, epochs=10, nothr=1,
              verbose=True):
    corpus_model = glove.Corpus()
    corpus_model.fit(text_sentence, window=win)
    word_list = glove.Glove(no_components=noc, learning_rate=lr)
    word_list.fit(corpus_model.matrix, epochs=epochs, no_threads=nothr,
                  verbose=verbose)
    word_list.add_dictionary(corpus_model.dictionary)
    return word_list
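# Despite its name, the word_list returned by glove2vec is the fitted Glove
# model itself; a usage sketch with toy sentences and illustrative settings.
sentences = [['glove', 'embeds', 'words'], ['glove', 'counts', 'cooccurrences']]
model = glove2vec(sentences, win=5, noc=8, epochs=5)
vec = model.word_vectors[model.dictionary['glove']]  # dictionary lookup -> row
print(vec.shape)  # (8,)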
def load_data(filepath='imdb/imdb.npz', rawpath='~/data/aclImdb', maxlen=400,
              embedding=None):
    filepath = os.path.expanduser(os.path.join('~/data', filepath))
    datapath = os.path.expanduser('~/data/imdb')
    rawpath = os.path.expanduser(rawpath)
    if os.path.exists(filepath):
        data = np.load(filepath)
        X_train, y_train = data['X_train'], data['y_train']
        X_test, y_test = data['X_test'], data['y_test']
    else:
        g = embedding
        if g is None:
            import glove
            g = glove.Glove()
        _collect_reviews()
        import nltk

        def _embedding(fpath):
            reviews = [nltk.word_tokenize(line) for line in open(fpath, 'r')]
            # maxlen-1 since we add a <bos> symbol to each sentence
            return g.embedding(reviews, maxlen=maxlen - 1)

        print('\nGenerating training data')
        X_train_pos = _embedding(os.path.join(datapath, 'train-pos.txt'))
        X_train_neg = _embedding(os.path.join(datapath, 'train-neg.txt'))
        X_train = np.vstack((X_train_pos, X_train_neg))
        y_train = np.append(np.zeros(X_train_pos.shape[0]),
                            np.ones(X_train_neg.shape[0]))
        y_train = np.reshape(y_train, [-1, 1])

        print('\nGenerating testing data')
        X_test_pos = _embedding(os.path.join(datapath, 'test-pos.txt'))
        X_test_neg = _embedding(os.path.join(datapath, 'test-neg.txt'))
        X_test = np.vstack((X_test_pos, X_test_neg))
        y_test = np.append(np.zeros(X_test_pos.shape[0]),
                           np.ones(X_test_neg.shape[0]))
        y_test = np.reshape(y_test, [-1, 1])

        print('\nSaving {}'.format(filepath))
        np.savez(filepath, X_train=X_train, y_train=y_train,
                 X_test=X_test, y_test=y_test)
    return X_train, y_train, X_test, y_test
def main():
    if os.path.exists(FITTED_MODEL_FILENAME):
        glove_model = glove.Glove.load(FITTED_MODEL_FILENAME)
    else:
        matrix, dictionary = load_graph_adj_matrix()
        glove_model = glove.Glove(2)
        glove_model.fit(matrix, epochs=10)
        glove_model.add_dictionary(dictionary=dictionary)
        glove_model.save(FITTED_MODEL_FILENAME)
    graph_positions = {}
    for vertex_idx, vertex_name in glove_model.inverse_dictionary.items():
        vertex_pos = tuple(glove_model.word_vectors[vertex_idx])
        graph_positions[vertex_name] = vertex_pos
def run_multi(d, size):
    cores = multiprocessing.cpu_count()
    sentiment_doc2vec_amazon_cv(
        base_path='/datasets/amazon-data/csv/nan-removed',
        # base_path='/nfs/amazon/csv/nan-removed',
        dataset_filter=d,
        # stars=[1, 5],
        stars=[1, 2, 3, 4, 5],
        n_cv=1,
        model=glove.Glove(no_components=size, learning_rate=0.05),
        d2v_size=size,
        save_model='/models/gensim/domains',
        # save_model='/nfs/amazon/doc2vec/models',
        output_folder='/models/gensim/domains/results'
        # output_folder='/nfs/amazon/doc2vec/results'
        # n_max_unsupervised=100000
    )
def main():
    wheel_graph = networkx.generators.classic.wheel_graph(10)
    model = glove.Glove(2, learning_rate=0.01, alpha=0.2, max_count=1000)
    adj_matrix = networkx.adjacency_matrix(wheel_graph)
    adj_matrix = adj_matrix.toarray().astype('d')
    # Row-normalize so each vertex's co-occurrence weights sum to 1
    # (plain division replaces the scipy.divide alias removed in recent SciPy).
    normalized_adj_matrix = adj_matrix / adj_matrix.sum(1)[:, None]
    model.fit(scipy.sparse.coo_matrix(normalized_adj_matrix), epochs=1000)
    vertex_positions = {
        vertex_idx: tuple(model.word_vectors[vertex_idx])
        for vertex_idx in range(wheel_graph.order())
    }
    networkx.drawing.draw(wheel_graph, pos=vertex_positions)
    plt.savefig("asdf.png")
def embed_data(maxlen_p=maxlen0, maxlen_q=maxlen1, embedding=None, save=False,
               datapath=None, savepath=None):
    if not datapath:
        datapath = os.path.dirname(path).replace('/data', '')
    if not savepath:
        savepath = os.path.dirname(path).replace('/data', '')
    filepath_X = os.path.expanduser(os.path.join(savepath, 'bc.npz'))
    filepath_y = os.path.expanduser(os.path.join(savepath, 'bc_label.npz'))
    g = embedding
    if g is None:
        g = glove.Glove()

    def _embedding(fpath):
        for line in codecs.open(fpath, 'r', 'utf-8-sig'):
            # assert len(line.split('\t')) == 3 or line.startswith('#')
            assert len(line.split('\t')) == 3
        questions = [nltk.word_tokenize(line.split('\t')[0])
                     for line in codecs.open(fpath, 'r', 'utf-8-sig')
                     if not line.startswith('#')]
        cols = [nltk.word_tokenize(line.split('\t')[1])
                for line in codecs.open(fpath, 'r', 'utf-8-sig')
                if not line.startswith('#')]
        # labels are read separately in _read_label
        return (g.embedding(questions, maxlen=maxlen_p - 1),
                g.embedding(cols, maxlen=maxlen_q - 1))

    def _read_label(fpath):
        # labels = [line.split('\t')[2] for line in codecs.open(fpath, 'r', 'utf-8-sig') if not line.startswith('#')]
        labels = [int(line.split('\t')[2].strip('\n'))
                  for line in codecs.open(fpath, 'r', 'utf-8-sig')]
        return labels

    print('\nGenerating training/test data')
    X_train_p, X_train_q = _embedding(os.path.join(datapath, 'train_model_const.txt'))
    X_test_p, X_test_q = _embedding(os.path.join(datapath, 'test_model_const.txt'))
    X_dev_p, X_dev_q = _embedding(os.path.join(datapath, 'dev_model_const.txt'))
    X_train_ans = _read_label(os.path.join(datapath, 'train_model_const.txt'))
    X_test_ans = _read_label(os.path.join(datapath, 'test_model_const.txt'))
    X_dev_ans = _read_label(os.path.join(datapath, 'dev_model_const.txt'))
    if save:
        print('\nSaving')
        np.savez(filepath_y, y_train=X_train_ans, y_test=X_test_ans,
                 y_dev=X_dev_ans)
        np.savez(filepath_X, X_train_qu=X_train_p, X_train_col=X_train_q,
                 X_test_qu=X_test_p, X_test_col=X_test_q,
                 X_dev_qu=X_dev_p, X_dev_col=X_dev_q)
        print('\nSaved!')
def glove(windows, num_components=16, glove_window=10, epochs=20, verbose=False):
    import glove
    import hdbscan
    import multiprocessing

    ws = [list(w) for w in windows]
    corpus = glove.Corpus()
    corpus.fit(ws, window=glove_window)
    # TODO: Explore reasonable glove defaults
    glove_model = glove.Glove(no_components=num_components, learning_rate=0.05)
    glove_model.fit(corpus.matrix, epochs=epochs,
                    no_threads=multiprocessing.cpu_count(), verbose=verbose)
    glove_model.add_dictionary(corpus.dictionary)

    labels = []
    vectors = []
    # TODO: Explore how to pull data more nicely from glove
    for key in glove_model.dictionary:
        word_vector_index = glove_model.dictionary[key]
        labels.append(key)
        vectors.append(list(glove_model.word_vectors[word_vector_index]))

    # Clustering
    output_events = defaultdict(list)
    for i, val in enumerate(
            hdbscan.HDBSCAN(min_cluster_size=2).fit_predict(vectors)):
        output_events[val].append(labels[i])

    # Create event objects
    events = []
    for item in output_events:
        # list(...) so len() works on Python 3, where map() is lazy.
        event = Event(id=str(uuid.uuid4()),
                      template_ids=list(map(int, output_events[item])))
        if len(event.template_ids) > 0:
            events.append(event)
    return events
def __init__(self, docs_tokens, emb_dim, iters, window, learn_rate):
    self.time = time()
    corpus_model = glove.Corpus()
    corpus_model.fit(docs_tokens, window=window)
    glove_model = glove.Glove(no_components=emb_dim, learning_rate=learn_rate)
    glove_model.fit(corpus_model.matrix, epochs=iters, no_threads=4)
    glove_model.add_dictionary(corpus_model.dictionary)
    self.time = time() - self.time
    word_to_index = glove_model.dictionary
    index_word = glove_model.inverse_dictionary
    embedding_dictionary = {index_word[i]: vector
                            for i, vector in enumerate(glove_model.word_vectors)}
    super(EmbeddingModel, self).get_from_data(embedding_dictionary, emb_dim,
                                              word_to_index, self)
    self.name = 'glove'
def get_glove(X):
    print("Building X_cooc...")
    X_cooc = X.T @ X  # word-word co-occurrence counts from the doc-term matrix
    print("Finished building X_cooc")
    X_cooc.setdiag(0)
    result = X_cooc.toarray()
    # Convert the dense matrix into the dict-of-dicts format glove expects.
    dic = {}
    for idx1, doc in enumerate(result):
        tmpdic = {}
        for idx2, word2 in enumerate(doc):
            if word2 > 0:
                tmpdic[idx2] = word2
        dic[idx1] = tmpdic
    model = glove.Glove(dic, d=200, alpha=0.75, x_max=5.0)
    for epoch in range(150):
        err = model.train(batch_size=100, workers=4, step_size=0.05)
        print("epoch %d, error %.3f" % (epoch, err), flush=True)
    gloveVectors = model.W
    print("glove vectors shape: " + str(gloveVectors.shape))
    return gloveVectors
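# The toarray() conversion in get_glove materializes the full dense matrix.
# If memory is a concern, here is a sketch of the same dict-of-dicts conversion
# driven directly by the sparse rows (assuming X_cooc is a scipy.sparse matrix,
# as the setdiag()/toarray() calls suggest).
import scipy.sparse as sp

def cooc_to_dict(X_cooc):
    """Convert a scipy.sparse co-occurrence matrix to glove's dict-of-dicts
    format without densifying the whole matrix."""
    X_csr = sp.csr_matrix(X_cooc)
    dic = {}
    for i in range(X_csr.shape[0]):
        row = X_csr.getrow(i)
        # Stored (non-zero) entries only, matching the word2 > 0 filter above.
        dic[i] = {int(j): float(v) for j, v in zip(row.indices, row.data)}
    return dic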
def glove_pemb(self):
    """Compute GloVe embeddings from the co-occurrence matrix and return
    patient embeddings.

    Returns
    -------
    list
        pids list
    list
        matrix of patient embeddings
    array
        word embeddings
    """
    corpus = self.__build_corpus()
    coocc_dict = self.__build_cooccur(corpus, window_size=10)
    model = glove.Glove(coocc_dict, alpha=0.75, x_max=10.0, d=ut.n_dim_glove,
                        seed=1234)
    logging.info("\nTraining Glove embeddings...")
    for epoch in range(ut.n_epoch_glove):
        err = model.train(batch_size=ut.batch_size_glove,
                          step_size=ut.learning_rate_glove)
        if epoch % 10 == 0:
            logging.info("epoch %d, error %.3f" % (epoch, err))
    logging.info("epoch %d, error %.3f" % (epoch, err))
    wemb = model.W + model.ContextW  # as suggested in Pennington et al.
    p_emb = []
    pid_list = []
    for pid, term in corpus.items():
        if len(term) != 0:
            pid_list.append(pid)
            p_emb.append(np.mean([wemb[int(t)].tolist() for t in term],
                                 axis=0).tolist())
    return pid_list, p_emb, wemb
def glovex_model(filepath, argstring, cooccurrence, dims=100, alpha=0.75,
                 x_max=100, force_overwrite=False, suffix=".glovex"):
    model_path = filepath + argstring
    model_files = glob.glob(model_path + "_epochs*" + suffix)
    if not len(model_files) or force_overwrite:
        model = glove.Glove(cooccurrence, d=dims, alpha=alpha, x_max=x_max)
    else:
        highest_epochs = max(
            int(f.split("epochs")[1].split(".")[0]) for f in model_files)
        logger.info(" ** Existing model file found. Re-run with "
                    "--overwrite_model if you did not intend to reuse it.")
        with open(model_path + "_epochs" + str(highest_epochs) + suffix,
                  "rb") as pro_f:
            model = pickle.load(pro_f)
    return model
def train_glove(sentences=None, nr_feature=None, save_name=None):
    verify_cwd()
    if sentences is None:
        print("preprocessing sentences...")
        sentences = list(
            itertools.islice(word2vec.Text8Corpus('./data/text8'), None))
        print("{} sentences found.".format(len(sentences)))
    if save_name is None:
        save_name = "./data/glove.model"
    if nr_feature is None:
        nr_feature = 200
    corpus = glove.Corpus()
    print("start fitting sentences...")
    corpus.fit(sentences, window=10)
    gl = glove.Glove(no_components=nr_feature, learning_rate=0.05)
    print("start training glove...")
    gl.fit(corpus.matrix, epochs=10,
           no_threads=multiprocessing.cpu_count(), verbose=True)
    corpus.save("./data/corpus.model")
    gl.save(save_name)  # honor the save_name argument rather than a hardcoded path
def __init__(self):
    self.options = {
        '1': self.key, '2': self.key, '3': self.key, '4': self.key,
        '5': self.key, '6': self.key, '7': self.key, '8': self.key,
        '\x80': self.key,
        'R': self.release,
        'H': self.hold,
        'O': self.on
    }
    self.index_array = ['P1', 'P2', 'R1', 'R2', 'M1', 'M2', 'I1', 'I2',
                        '3', '1', '2', '4', '8', '5', '8', '7']
    self.hold_flag = False
    self.glove = glove.Glove('11')
    self.on_increment = 0
    self.last_read_byte = '0'
    self.increment = 0
def __init__(self, parse=PARSE_PATH, split='train'):
    p_length = self.phrase_length
    remove_stop = self.remove_stop
    add_stop = self.add_stop
    parse = os.path.expanduser(parse)
    rawfile = '%s_org.qu' % split
    rawfile = os.path.join(parse, rawfile)
    self.split = split
    (self.key_w, self.human_info, self.street, self.city, self.county,
     self.region, self.rest, self.foodtype, self.rating) = read_word_rest()
    self.g = glove.Glove()
    self.embed()
    with open(rawfile, 'r') as f:
        res = []
        res_pair = []
        for i, line in enumerate(f):
            self.qu_pairs = []  # word pairs
            self.count = 0      # number of <f>
            self.count_c = 0    # number of <c> words
            words = word_tokenize(line)
            self.w_filter = words  # line with stop words filtered out later
            self.qu_annot = [''] * len(words)  # final result
            line = ' '.join(line.strip('\n').split(' '))
            self.stop_words = set(stopwords.words('english'))
            for rw in remove_stop:
                self.stop_words.remove(rw)
            for aw in add_stop:
                self.stop_words.add(aw)
            for k, w in enumerate(words):
                if w not in self.stop_words:
                    self.w_filter[k] = w
                else:
                    self.w_filter[k] = ''
                    if self.qu_annot[k] == '':
                        self.qu_annot[k] = w  # keep stop words in the final result
            self.special_w_1(line)
            for le in range(p_length - 1, -1, -1):
                for idx in range(len(self.w_filter) - le):
                    word = self.check_phrase(idx, le)
                    if word is not None:
                        word = ' '.join(word)
                        self.find_const_w(word, idx, le, line)
            # Find the human-knowledge words before removing stop words: 'of'
            # has to be removed, but afterwards phrases such as 'number of
            # citizens' could no longer be found.
            for le in range(p_length - 1, -1, -1):
                for idx in range(len(self.w_filter) - le):
                    word = self.check_phrase(idx, le)
                    if word is not None:
                        word = ' '.join(word)
                        self.find_human_w(word, idx, le, line)
            # Words that exactly match a key word.
            for le in range(p_length - 1, -1, -1):
                for idx in range(len(self.w_filter) - le):
                    word = self.check_phrase(idx, le)
                    if word is not None:
                        word = ' '.join(word)
                        self.exact_match(word, idx, le, line)
            self.special_w_2(line)
            if len(self.qu_annot) == len(self.w_filter):
                for aw_idx, aw in enumerate(self.qu_annot):
                    if aw == '' and '<>' not in self.w_filter[aw_idx]:
                        self.qu_annot[aw_idx] = self.w_filter[aw_idx]
            else:
                print('-------------wrong length for result--------------')
                print(self.qu_annot)
                print(self.w_filter)
                print('----------------------------------------------------')
            qu_annot = [item for item in self.qu_annot if item != '']
            qu_annot = ' '.join(qu_annot)
            qu_pairs = ''.join(self.qu_pairs)
            # All key words have been picked up and labeled, but their index
            # numbers in <f+num> and <c+num> are not in order, so reorder them.
            qu_pairs, qu_annot = self.reorder(qu_pairs, qu_annot)
            res.append(qu_annot)
            res_pair.append(qu_pairs)
    print('\nSaving questions')
    with open(os.path.join(parse, '%s.qu' % self.split), 'w') as f:
        f.write('\n'.join(res))
    print('\nSaving pairs')
    with open(os.path.join(parse, '%s_sym_pairs.txt' % self.split), 'w') as f:
        f.write('\n'.join(res_pair))
    generate_lon(parse, split, 8)
USE_I18N = True
USE_L10N = True
USE_TZ = True

# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/1.10/howto/static-files/
STATIC_ROOT = os.path.join(BASE_DIR, 'static')
STATIC_URL = '/static/'
STATICFILES_DIRS = (
    os.path.join(BASE_DIR, 'staticfiles'),  # if your static files folder is named "staticfiles"
)

glove_instance = glove.Glove()

print("Loading tags and images dictionary")
IMAGE_RETRIEVED_TAGS_FILE_PATH = 'tags_images.txt'
IMAGE_RETRIEVED_TAGS = dict()
with open("contentBasedSearch/" + IMAGE_RETRIEVED_TAGS_FILE_PATH, "rb") as f:
    IMAGE_RETRIEVED_TAGS = pickle.load(f)
print("Finished loading dictionary")
print(IMAGE_RETRIEVED_TAGS)

print("Loading Glove vectors")
TERM_SEARCH_ENGINE = glove_instance.load_stanford(filename="vectorsGloveLight.txt")
print("Finished Loading vectors")
def glove_pro(df_raw, sentence_id, word_id, emb_size=128, window=50,
              dropna=False, n_jobs=16, learning_rate=0.05, epoch=8,
              return_model=False):
    """Train GloVe embeddings for word_id sequences grouped by sentence_id.

    Environment setup:
        conda create -y -n TF1.14 python=3.6
        pip install glove_python

    Example
    -------
    test_glove = datalog.head(10000)
    sentence_id = 'user_id'
    word_id = 'industry'
    res = glove_pro(test_glove, sentence_id, word_id, emb_size=32, window=20,
                    dropna=False, n_jobs=16, learning_rate=0.05, epoch=8,
                    return_model=True)
    res.keys()
    res['model'].most_similar("6", number=10)
    """
    list_col_nm = f'{sentence_id}__{word_id}_list'
    if (n_jobs is None) or (n_jobs <= 0):
        n_jobs = multiprocessing.cpu_count()
    logger.info(f"========== GloVe: {sentence_id} {word_id} ==========")
    df = df_raw[[sentence_id, word_id]].copy()
    if df[sentence_id].isnull().sum() > 0:
        logger.warning("NaNs exist in sentence_id column!!")
    if dropna:
        df = df.dropna(subset=[sentence_id, word_id])
    else:
        df = df.fillna('NULL_zhangqibot')
    df = df.astype(str)
    tmp = df.groupby(sentence_id, as_index=False)[word_id].agg({list_col_nm: list})
    sentences = tmp[list_col_nm].values.tolist()
    all_words_vocabulary = df[word_id].unique().tolist()
    del tmp[list_col_nm]
    gc.collect()
    matrix = glv.Corpus()
    matrix.fit(corpus=sentences, window=window)
    model = glv.Glove(no_components=emb_size, learning_rate=learning_rate,
                      alpha=0.75, max_count=100, max_loss=10.0,
                      random_state=666)
    model.fit(matrix.matrix, epochs=epoch, no_threads=n_jobs, verbose=1)
    model.add_dictionary(matrix.dictionary)
    # Build the word embedding matrix; out-of-dictionary words get zero vectors.
    emb_dict = {}
    for word_i in all_words_vocabulary:
        if word_i in model.dictionary:
            emb_dict[word_i] = model.word_vectors[model.dictionary[word_i]]
        else:
            emb_dict[word_i] = np.zeros(emb_size, dtype="float32")
    res = {"word_emb_dict": emb_dict}
    if return_model:
        res["model"] = model  # honor return_model, as the docstring example expects
    return res
import glove

cooccur = {0: {0: 1.0, 2: 3.5}, 1: {2: 0.5}, 2: {0: 3.5, 1: 0.5, 2: 1.2}}
model = glove.Glove(cooccur, vocab_size=3, d=50, alpha=0.75, x_max=100.0)
for epoch in range(25):
    err = model.train(batch_size=200, workers=9)
    print("epoch %d, error %.3f" % (epoch, err), flush=True)
import glove

# Target format: {row_id: {col_id: count, ...}, ...}
cooccur = {0: {0: 1.0, 2: 3.5}, 1: {2: 0.5}, 2: {0: 3.5, 1: 0.5, 2: 1.2}}

cooccur_mat = [[1, 2, 3], [2, 1, 2], [2, 1, 3]]

# Convert the matrix to a dict of dicts.
keys = range(len(cooccur_mat))
lines = []
for i in range(len(cooccur_mat)):
    line = dict(zip(keys, cooccur_mat[i]))
    lines.append(line)
c_c_mat = dict(zip(keys, lines))

model = glove.Glove(c_c_mat, d=50, alpha=0.75, x_max=100.0)
for epoch in range(25):
    err = model.train(step_size=0.05, workers=9, batch_size=50)
    print(err)
print(model.W)
print(model.b)
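# For a square matrix like cooccur_mat, the loop-based conversion above
# collapses to a single dict comprehension with identical output.
c_c_mat = {i: dict(enumerate(row)) for i, row in enumerate(cooccur_mat)}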
def run_emb(datadir, level=None):
    outdir = datadir + '/level-' + level

    # Load vocabulary and behrs (ID_SUBJ:[terms]; ID_SUBJ:Fn:[terms]).
    bt_to_idx, idx_to_bt = _load_vocab(outdir, ut.file_names['vocab'])
    behrs, behrs_tf = _load_data(outdir, ut.file_names['behr'])

    terms = []
    for vec in behrs.values():
        terms.extend(vec)
    count = 0
    list_count = {}
    for idx, lab in idx_to_bt.items():
        co = terms.count(str(idx))
        list_count[lab] = co
        if co > 1:
            count += 1
    print("Number of repeated terms: {0} -- Terms with one occurrence: {1}\n".format(
        count, len(bt_to_idx) - count))

    print('Most frequent terms (TF>20)')
    x = []
    y = []
    for lab, co in list_count.items():
        if co > 20:
            x.append(lab)
            y.append(co)
            print('%s, %d' % (lab, co))
        else:
            x.append('TF<20')
            y.append(co)

    # Save plots of the term distribution.
    plt.figure(figsize=(30, 20))
    plt.bar(x, y)
    plt.tick_params(axis='x', rotation=90, labelsize=10)
    plt.savefig(os.path.join(outdir, 'term20-distribution.png'))

    plt.figure(figsize=(20, 10))
    plt.bar(range(len(list_count.values())), list(list_count.values()))
    plt.tick_params(axis='x', rotation=90, labelsize=10)
    plt.savefig(os.path.join(outdir, 'term-distribution.png'))
    print('\n')

    # TF-IDF
    print('Computing TF-IDF matrix...')
    doc_list = list(map(lambda x: ' '.join(x), list(behrs.values())))
    id_subj = [id_lab for id_lab in behrs]
    vectorizer = TfidfVectorizer(norm='l2')
    tfidf_mtx = vectorizer.fit_transform(doc_list)

    print('Performing SVD on the TF-IDF matrix...')
    reducer = TruncatedSVD(n_components=ut.n_dim, random_state=123)
    svd_mtx = reducer.fit_transform(tfidf_mtx)

    # Save the SVD matrix.
    with open(os.path.join(outdir, 'svd-mtx.csv'), 'w') as f:
        wr = csv.writer(f)
        for idx, lab in enumerate(id_subj):
            wr.writerow([lab] + list(svd_mtx[idx]))
    print('\n\n')

    # GloVe embeddings
    print('Starting computing GloVe embeddings for {0} epochs'.format(ut.n_epoch))
    corpus = _build_corpus(behrs_tf)
    coocc_dict = build_cooccur(idx_to_bt, corpus, window_size=20)
    model = glove.Glove(coocc_dict, alpha=0.75, x_max=100.0, d=ut.n_dim)
    for epoch in range(ut.n_epoch):
        err = model.train(batch_size=ut.batch_size)
        print("epoch %d, error %.3f" % (epoch, err), flush=True)
    Wemb = model.W + model.ContextW  # as suggested in Pennington et al.

    # Average each subject's term embeddings to get subject embeddings.
    p_emb = []
    id_list = []
    for id_subj, term in corpus.items():
        if len(term) != 0:
            id_list.append(id_subj)
            p_emb.append(np.mean([Wemb[int(t)].tolist() for t in term],
                                 axis=0).tolist())

    # Save subject embeddings.
    with open(os.path.join(outdir, 'glove-mtx.csv'), 'w') as f:
        wr = csv.writer(f)
        for id_p, pe in zip(id_list, p_emb):
            wr.writerow([id_p] + list(pe))