def training_callback(autoencoder, epoch, lr, loss, perplexity):
    """Record one epoch's training metrics on the TensorBoard ``writer``.

    Relies on names from the enclosing scope: ``verbose_mode``,
    ``top_words``, ``reverse_vocab``, ``corpus`` and ``writer``.

    When ``verbose_mode`` is truthy, the ``top_words`` highest-weighted
    entries of every decoder column are mapped through ``reverse_vocab``,
    scored with a ``u_mass`` CoherenceModel and printed (one line per
    topic, then the overall coherence). Otherwise coherence is logged as 0.
    """
    coherence = 0
    if verbose_mode:
        weights = autoencoder.decoder.linear.weight.detach().cpu()
        # topk(...)[1] holds the indices; transpose so each row is one topic.
        top_indices = weights.topk(top_words, dim=0)[1].t()
        topics = [
            [reverse_vocab[entry.item()] for entry in row]
            for row in top_indices
        ]
        coherence_model = CoherenceModel(
            topics=topics,
            corpus=corpus,
            dictionary=Dictionary.from_corpus(corpus, reverse_vocab),
            coherence="u_mass",
        )
        coherence = coherence_model.get_coherence()
        per_topic = coherence_model.get_coherence_per_topic()
        for position, words in enumerate(topics):
            print(str(position) + ":" + str(per_topic[position]) + ":" +
                  ",".join(words))
        print(coherence)

    scalars = {
        "lr": lr,
        "loss": loss,
        "perplexity": perplexity,
        "coherence": coherence,
    }
    writer.add_scalars("data/autoencoder", scalars, global_step=epoch)
def vect2gensim(self, vectorizer, dtmatrix):
    """Convert a vectorizer's document-term matrix into gensim objects.

    Args:
        vectorizer: fitted vectorizer exposing ``vocabulary_``
            (a word -> column index mapping).
        dtmatrix: sparse matrix passed to ``Sparse2Corpus`` with
            ``documents_columns=False``.

    Returns:
        A ``(corpus, dictionary)`` tuple: the ``Sparse2Corpus`` wrapper and
        the ``Dictionary`` built from it.
    """
    gensim_corpus = Sparse2Corpus(dtmatrix, documents_columns=False)
    # vocabulary_ is word -> index; gensim wants index -> word.
    index_to_word = {
        index: term for term, index in vectorizer.vocabulary_.items()
    }
    gensim_dictionary = Dictionary.from_corpus(gensim_corpus,
                                               id2word=index_to_word)
    return (gensim_corpus, gensim_dictionary)
def test_hdp_model(self):
    """Fit an HDP model on ``self.k_corpus`` and save it to disk.

    The dictionary is derived from the corpus itself (no explicit id2word).
    The model is written to ``"kmer_hdp.k<self.ks>"`` with its ``corpus``
    attribute excluded from the saved file.
    """
    # A parenthesised single-argument print behaves identically on
    # Python 2 and Python 3; the bare print statement is a SyntaxError on 3.
    print("Building model")
    d = Dictionary.from_corpus(self.k_corpus)
    model = models.hdpmodel.HdpModel(self.k_corpus, d, T=500)
    # Alternative models that were tried here previously:
    # model = models.ldamulticore.LdaMulticore(self.k_corpus, num_topics=479, workers=self.t-1)
    # model = models.ldamodel.LdaModel(self.k_corpus, num_topics=479, passes=100)
    model.save("kmer_hdp.k%s" % self.ks, ignore=['corpus'])
    print("Done")
class tip_rec:
    """Topic-affinity recommender built on an LDA model.

    ``train`` fits a count vectorizer and an LDA model on ``df['context']``
    and accumulates, per sender, a topic -> weight table. ``predict``
    scores a text against those tables by Euclidean distance.
    """

    def __init__(self, num_topics=15):
        """Create an untrained recommender with ``num_topics`` LDA topics."""
        self.numtopics = num_topics
        # Template {topic_id: 0.0}; copied for every user in train().
        self.topic_dict = dict(enumerate(np.zeros(num_topics)))
        self.user_dict = {}
        self.model = None
        self.worddict = {}
        self.mydict = None

    def train(self, df):
        """Fit the LDA model and build per-sender topic affinities.

        Args:
            df: DataFrame with ``sender``, ``context`` and ``amt`` columns.
                Rows with an empty ``context`` are skipped; an empty ``amt``
                skips the weighted update for that row.
        """
        self.user_dict = {
            sender: self.topic_dict.copy() for sender in df.sender.unique()
        }
        cv = CV(stop_words='english')
        X = cv.fit_transform(df['context'])
        # vocabulary_ maps word -> column index. Enumerating its keys (as the
        # code did before) yields ids that do not match the matrix columns.
        self.worddict = {
            index: word for word, index in cv.vocabulary_.items()
        }
        corpus = matutils.Sparse2Corpus(X, documents_columns=False)
        self.mydict = Dictionary.from_corpus(corpus, id2word=self.worddict)
        self.model = LatentDA.LdaModel(corpus,
                                       num_topics=self.numtopics,
                                       passes=20,
                                       id2word=self.worddict)

        for _, row in df.iterrows():
            if row['context'] == '':
                continue
            user_topics = self.user_dict[row.sender]
            bow = self.mydict.doc2bow(row['context'].split())
            for topic_id, weight in self.model[bow]:
                if topic_id in user_topics:
                    if row.amt == '':
                        continue
                    user_topics[topic_id] += weight * float(row.amt)
                else:
                    user_topics[topic_id] = weight

        # Normalise each user's affinities to sum to one. Users whose total
        # is zero are left untouched instead of dividing by zero.
        for affinities in self.user_dict.values():
            total = sum(affinities.values())
            if not total:
                continue
            for topic_id in affinities:
                affinities[topic_id] = affinities[topic_id] / total

    def _user_affinity(self, user):
        """Return ``user``'s affinities as a vector indexed by topic id."""
        scores = self.user_dict[user]
        return np.array(
            [scores.get(topic_id, 0.0) for topic_id in range(self.numtopics)])

    def predict(self, text, username=''):
        """Score ``text`` against stored user affinities.

        Args:
            text: whitespace-separated document to score.
            username: if empty, every known user is scored.

        Returns:
            ``{user: distance}`` for all users when ``username`` is empty,
            otherwise the tuple ``(username, distance)``. Distance is the
            Euclidean norm between the user vector and the document vector.
        """
        topics = self.model[self.mydict.doc2bow(text.split())]
        doc_aff = np.zeros(self.numtopics)
        for topic_id, weight in topics:
            doc_aff[topic_id] = weight

        if username == '':
            return {
                user: np.linalg.norm(self._user_affinity(user) - doc_aff)
                for user in self.user_dict
            }
        score = np.linalg.norm(self._user_affinity(username) - doc_aff)
        return (username, score)
def start(self, verbose=True):
    """Preprocess ``self.data`` and vectorize it for LDA.

    Steps: tokenize and remove stopwords, lemmatize, optionally build
    n-grams (when ``self.bigrams`` is truthy), fit a ``CountVectorizer``,
    and wrap the result as a gensim corpus plus dictionary.

    Sets ``self.lemmatized_text``, ``self.token_text``, ``self.vectorizer``,
    ``self.lda_input_data``, ``self.gensim_corpus_vect`` and
    ``self.id2word``.

    Args:
        verbose: when true, print progress, timings and matrix sparsity.
    """
    if verbose:
        print("Preprocessing dataset...")
        print('Creating Tokens For Lemmetization...')
    t0 = time()
    # Tokenize
    data_tokens = self.remove_stopwords_(
        list(self.preprocess_(self.data[:])))
    if verbose:
        print("done in %0.3fs." % (time() - t0))
        print('Lemmetization in progress using spaCy...')

    t0 = time()
    # Lemmatization function using spaCy
    self.lemmatized_text = self.lemmatization_(data_tokens)
    token_text = [doc.split() for doc in self.lemmatized_text]
    if self.bigrams:
        self.token_text = self.make_ngrams_(token_text)
        input_text = [" ".join(doc) for doc in self.token_text]
    else:
        self.token_text = token_text
        input_text = self.lemmatized_text
    if verbose:
        print("done in %0.3fs." % (time() - t0))

    # Vectorizing dataset for use with LDA algorithm
    if verbose:
        print('Vectorizing dataset...')
    t0 = time()
    self.vectorizer = CountVectorizer(
        analyzer='word',
        min_df=self.min_df,
        stop_words=self.stopwords,
        lowercase=True,
        token_pattern=self.token_pattern,
        ngram_range=self.ngram_range,
        #max_features=50000,
        max_df=self.max_df)
    self.lda_input_data = self.vectorizer.fit_transform(input_text)
    self.gensim_corpus_vect = gensim.matutils.Sparse2Corpus(
        self.lda_input_data, documents_columns=False)
    self.id2word = Dictionary.from_corpus(
        self.gensim_corpus_vect,
        id2word={
            idn: word for word, idn in self.vectorizer.vocabulary_.items()
        })

    if verbose:
        print(self.vectorizer)
        # Share of positive cells, computed on the sparse matrix directly.
        # The earlier .todense() allocated rows*cols cells just for this
        # diagnostic, and did so even when verbose was off.
        n_rows, n_cols = self.lda_input_data.shape
        positive_cells = (self.lda_input_data > 0).sum()
        print('Sparcity: ', (positive_cells / (n_rows * n_cols)) * 100, '%')
        print("done in %0.3fs." % (time() - t0))
def vect2gensim(vectorizer, dtmatrix):
    """Convert a vectorizer's document-term matrix into gensim objects.

    Prints how long the conversion took, in milliseconds.

    Args:
        vectorizer: fitted vectorizer exposing ``vocabulary_``
            (a word -> column index mapping).
        dtmatrix: sparse matrix passed to ``Sparse2Corpus`` with
            ``documents_columns=False``.

    Returns:
        A ``(corpus, dictionary)`` tuple.
    """
    started = time()
    gensim_corpus = gensim.matutils.Sparse2Corpus(dtmatrix,
                                                  documents_columns=False)
    # Invert word -> index into the index -> word form gensim expects.
    index_to_word = {
        index: term for term, index in vectorizer.vocabulary_.items()
    }
    gensim_dictionary = Dictionary.from_corpus(gensim_corpus,
                                               id2word=index_to_word)
    finished = time()
    elapsed_ms = (finished - started) * 1000
    print(
        "Transform vector model to gensim format ... done in {0:0.3f} miliseconds"
        .format(elapsed_ms))
    return (gensim_corpus, gensim_dictionary)
def score(self, X, y=None, sample_weight=None) -> float:
    """Return the ``u_mass`` topic coherence of the fitted autoencoder on X.

    Feature indices double as the vocabulary: word ``i`` is named ``str(i)``.
    Each topic is the ``min(self.score_num, X.shape[1])`` highest-weighted
    entries of a decoder column. ``y`` and ``sample_weight`` are not used.

    Raises:
        NotFittedError: if ``self.autoencoder`` is ``None``.
    """
    # TODO this needs further testing for correctness, WIP
    if self.autoencoder is None:
        raise NotFittedError
    self.autoencoder.eval()

    n_features = X.shape[1]
    corpus = Sparse2Corpus(X, documents_columns=False)
    weights = self.autoencoder.decoder.linear.weight.detach().cpu()
    id2word = {index: str(index) for index in range(n_features)}

    top_k = min(self.score_num, n_features)
    # topk(...)[1] holds the indices; transpose so each row is one topic.
    top_indices = weights.topk(top_k, dim=0)[1].t()
    topics = [[str(entry.item()) for entry in row] for row in top_indices]

    coherence_model = CoherenceModel(
        topics=topics,
        corpus=corpus,
        dictionary=Dictionary.from_corpus(corpus, id2word),
        coherence='u_mass')
    return coherence_model.get_coherence()
def vectorize_text(df):
    """Vectorize ``df['all_text']`` and wrap the result for gensim.

    Uses the module-level ``preprocessor`` and ``stop`` objects to configure
    the ``CountVectorizer`` (uni- and bi-grams, at most 10000 features).

    Args:
        df: DataFrame with an ``all_text`` column.

    Returns:
        ``(gensim_df, id2word, dictionary)``: the ``Sparse2Corpus`` over the
        transposed count matrix, the column index -> term mapping, and the
        gensim ``Dictionary`` built from both (for coherence models).
    """
    cv = CountVectorizer(preprocessor=preprocessor,
                         stop_words=stop,
                         lowercase=True,
                         decode_error='replace',
                         ngram_range=(1, 2),
                         max_df=0.9,
                         min_df=10,
                         max_features=10000)
    sparse_df = cv.fit_transform(df['all_text'])
    # Transposed so documents are columns, the Sparse2Corpus default layout.
    gensim_df = matutils.Sparse2Corpus(sparse_df.T)
    # get_feature_names() was removed in scikit-learn 1.2. Prefer its
    # replacement and fall back on releases that predate it.
    if hasattr(cv, 'get_feature_names_out'):
        vocab = cv.get_feature_names_out()
    else:
        vocab = cv.get_feature_names()
    id2word = dict(enumerate(vocab))
    dictionary = Dictionary.from_corpus(gensim_df, id2word)  # for coherence model
    return gensim_df, id2word, dictionary
def filterDS(data, featureNames, no_below=5, no_above=0.3, keep_n=1200):
    """Drop feature columns that gensim's ``filter_extremes`` would remove.

    Filtering is applied only when there are more than ``keep_n`` features;
    otherwise every column is kept.

    Args:
        data: 2-D array indexed as ``data[:, columns]``; passed to
            ``Dense2Corpus`` with ``documents_columns=False``.
        featureNames: sequence of feature names, one per column of ``data``.
        no_below, no_above, keep_n: forwarded to
            ``Dictionary.filter_extremes``.

    Returns:
        ``(filtered_data, words)``: the surviving columns of ``data`` and
        their names, in original column order.
    """
    corpus = gensim.matutils.Dense2Corpus(data, documents_columns=False)
    dictionary = Dictionary.from_corpus(corpus, dict(enumerate(featureNames)))
    if len(featureNames) > keep_n:
        dictionary.filter_extremes(no_below=no_below,
                                   no_above=no_above,
                                   keep_n=keep_n)
    # Materialise the surviving tokens once. Testing membership against the
    # values() view made the selection below quadratic in the vocabulary size.
    kept_words = set(dictionary.values())
    kept_columns = [
        index for index, word in enumerate(featureNames) if word in kept_words
    ]
    words = [featureNames[index] for index in kept_columns]
    # The trailing compactify() was dropped: the dictionary is local and is
    # never used after this point.
    data = data[:, kept_columns]
    return data, words
def get_topics(cv, train_data):
    """
    Fit a 20-topic gensim LDA model and return its top topics.

    Parameters
    ---------
    cv: A fitted vectorizer exposing ``vocabulary_`` (word -> column index).
    train_data: A scipy csr_matrix; passed to ``Sparse2Corpus`` with
        ``documents_columns=False``.

    Returns
    -------
    The result of ``LdaModel.top_topics(corpus=..., num_words=5)``.
    """
    corpus = Sparse2Corpus(train_data, documents_columns=False)
    # Invert word -> index into the index -> word form gensim expects.
    index_to_word = {index: term for term, index in cv.vocabulary_.items()}
    vocabulary = Dictionary.from_corpus(corpus, id2word=index_to_word)
    model = LdaModel(corpus=corpus, id2word=vocabulary, num_topics=20)
    return model.top_topics(corpus=corpus, num_words=5)
def ldaperplexity(train, test, topics):
    """Train LDA on ``train`` and estimate perplexity on ``test``.

    Both matrices are cast to ``int`` and wrapped with ``Dense2Corpus``
    (``documents_columns=False``). Training is timed with ``Chrono`` and
    warnings are suppressed during training and evaluation. A one-line
    summary is printed.

    Returns:
        ``(perplexity, elapsed)`` where ``perplexity`` is
        ``2 ** -per_word_bound`` and ``elapsed`` is ``Chrono.elapsed()``
        for the training call.
    """
    train_corpus = gensim.matutils.Dense2Corpus(train.astype(int),
                                                documents_columns=False)
    heldout_corpus = gensim.matutils.Dense2Corpus(test.astype(int),
                                                  documents_columns=False)
    vocabulary = Dictionary.from_corpus(train_corpus)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        timer = Chrono().start()
        lda = runLda(train_corpus, vocabulary, topics=topics)
        timer.end()

    # Total token count of the held-out corpus, for the report only.
    heldout_words = sum(
        count for doc in heldout_corpus for _, count in doc)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        bound = lda.log_perplexity(heldout_corpus)

    perplexity = numpy.exp2(-bound)
    print(
        "LDA %.3f per-word bound, %.1f perplexity estimate based on a held-out corpus of %i documents with %i words"
        % (bound, perplexity, len(heldout_corpus), heldout_words))
    return perplexity, timer.elapsed()
def get_topics(cv, train_data):
    '''
    Uses gensim to perform topic modeling.

    Parameters
    ---------
    cv: A TfidfVectorizer instance (fitted; its ``vocabulary_`` maps
        word -> column index).
    train_data: A scipy csr_matrix.
        NOTE(review): assumed to be documents x terms, as produced by the
        vectorizer - confirm against the caller.

    Returns
    -------
    The result of ``LdaModel.top_topics(corpus=..., num_words=5)`` for a
    20-topic model.
    '''
    # Create the gensim corpus from train data. Rows are documents, so
    # documents_columns must be False: the default (True) would treat each
    # term column as a document and misalign ids with cv.vocabulary_.
    td_gensim = Sparse2Corpus(train_data, documents_columns=False)
    # Create vocab dictionary (column index -> word)
    tmp_dct = {idv: word for word, idv in cv.vocabulary_.items()}
    dct = Dictionary.from_corpus(td_gensim, id2word=tmp_dct)
    # Create LDA model with specified parameters
    lda_gs = LdaModel(corpus=td_gensim, id2word=dct, num_topics=20)
    topics = lda_gs.top_topics(corpus=td_gensim, num_words=5)
    return topics
def get_topics(cv, train_data):
    """
    Run gensim LDA (20 topics) over a vectorized corpus.

    Parameters
    ---------
    cv: A fitted vectorizer; its ``vocabulary_`` (word -> column index)
        supplies the id -> word mapping.
    train_data: A scipy csr_matrix, wrapped by ``Sparse2Corpus`` with
        ``documents_columns=False``.

    Returns
    -------
    Whatever ``LdaModel.top_topics`` yields for ``num_words=5``.
    """
    gensim_corpus = Sparse2Corpus(train_data, documents_columns=False)
    id_to_word = {}
    for term, column in cv.vocabulary_.items():
        id_to_word[column] = term
    gensim_dictionary = Dictionary.from_corpus(gensim_corpus,
                                               id2word=id_to_word)
    lda_model = LdaModel(corpus=gensim_corpus,
                         id2word=gensim_dictionary,
                         num_topics=20)
    return lda_model.top_topics(corpus=gensim_corpus, num_words=5)
def training_callback(autoencoder, epoch, lr, loss, perplexity):
    """Print topic coherence and log one epoch's metrics to ``writer``.

    Relies on names from the enclosing scope: ``top_words``,
    ``reverse_vocab``, ``corpus`` and ``writer``.

    The ``top_words`` highest-weighted entries of every decoder column are
    mapped through ``reverse_vocab`` and scored with a ``u_mass``
    CoherenceModel. One line per topic is printed, then the overall value.
    """
    weights = autoencoder.decoder.linear.weight.detach().cpu()
    # topk(...)[1] holds the indices; transpose so each row is one topic.
    top_indices = weights.topk(top_words, dim=0)[1].t()
    topics = []
    for row in top_indices:
        topics.append([reverse_vocab[entry.item()] for entry in row])

    coherence_model = CoherenceModel(
        topics=topics,
        corpus=corpus,
        dictionary=Dictionary.from_corpus(corpus, reverse_vocab),
        coherence='u_mass',
    )
    coherence = coherence_model.get_coherence()
    per_topic = coherence_model.get_coherence_per_topic()
    for position, words in enumerate(topics):
        print(str(position) + ':' + str(per_topic[position]) + ':' +
              ','.join(words))
    print(coherence)

    scalars = {
        'lr': lr,
        'loss': loss,
        'perplexity': perplexity,
        'coherence': coherence,
    }
    writer.add_scalars('data/autoencoder', scalars, global_step=epoch)
def main(cuda, batch_size, epochs, top_words, testing_mode, verbose_mode):
    """Train a ProdLDA autoencoder on the sparse count data and report topics.

    Loads ``data/train.txt.npz``, ``data/test.txt.npz`` and
    ``data/vocab.pkl``, trains for ``epochs`` epochs, prints per-topic and
    overall ``u_mass`` coherence, and logs to TensorBoard.

    Args:
        cuda: move the model to the GPU when truthy.
        batch_size: forwarded to ``train``.
        epochs: forwarded to ``train``.
        top_words: number of highest-weighted words that define a topic.
        testing_mode: when truthy, skip writing the feature embedding.
        verbose_mode: when truthy, compute and print coherence every epoch;
            otherwise 0 is logged as the per-epoch coherence.
    """
    print("Loading input data")
    # TODO fix relative paths
    data_train = load_npz("data/train.txt.npz")
    data_val = load_npz("data/test.txt.npz")
    corpus = Sparse2Corpus(data_train, documents_columns=False)
    # NOTE: pickle.load can execute arbitrary code; only use a trusted file.
    with open("data/vocab.pkl", "rb") as f:
        vocab = pickle.load(f)
    reverse_vocab = {index: word for word, index in vocab.items()}
    indexed_vocab = [
        reverse_vocab[index] for index in range(len(reverse_vocab))
    ]
    # Built once: it depends only on the corpus and vocabulary, so rebuilding
    # it on every epoch repeated a full pass over the corpus for nothing.
    dictionary = Dictionary.from_corpus(corpus, reverse_vocab)

    def report_coherence(model):
        """Print per-topic and overall u_mass coherence; return the latter."""
        decoder_weight = model.decoder.linear.weight.detach().cpu()
        topics = [[reverse_vocab[item.item()] for item in topic]
                  for topic in decoder_weight.topk(top_words, dim=0)[1].t()]
        cm = CoherenceModel(
            topics=topics,
            corpus=corpus,
            dictionary=dictionary,
            coherence="u_mass",
        )
        coherence = cm.get_coherence()
        coherences = cm.get_coherence_per_topic()
        for index, topic in enumerate(topics):
            print(
                str(index) + ":" + str(coherences[index]) + ":" +
                ",".join(topic))
        print(coherence)
        return coherence

    writer = SummaryWriter()  # create the TensorBoard object
    try:
        # callback function to call during training, uses writer from the scope
        def training_callback(autoencoder, epoch, lr, loss, perplexity):
            coherence = report_coherence(autoencoder) if verbose_mode else 0
            writer.add_scalars(
                "data/autoencoder",
                {
                    "lr": lr,
                    "loss": loss,
                    "perplexity": perplexity,
                    "coherence": coherence,
                },
                global_step=epoch,
            )

        ds_train = CountTensorDataset(data_train)
        ds_val = CountTensorDataset(data_val)
        autoencoder = ProdLDA(in_dimension=len(vocab),
                              hidden1_dimension=100,
                              hidden2_dimension=100,
                              topics=50)
        if cuda:
            autoencoder.cuda()
        print("Training stage.")
        ae_optimizer = Adam(autoencoder.parameters(),
                            0.0001,
                            betas=(0.99, 0.999))
        train(
            ds_train,
            autoencoder,
            cuda=cuda,
            validation=ds_val,
            epochs=epochs,
            batch_size=batch_size,
            optimizer=ae_optimizer,
            update_callback=training_callback,
            sampler=WeightedRandomSampler(torch.ones(data_train.shape[0]),
                                          20000),
            num_workers=4,
        )
        autoencoder.eval()
        report_coherence(autoencoder)
        if not testing_mode:
            writer.add_embedding(
                autoencoder.encoder.linear1.weight.detach().cpu().t(),
                metadata=indexed_vocab,
                tag="feature_embeddings",
            )
    finally:
        # Flush and release the event file even if training fails.
        writer.close()
class tip_rec:
    """Topic-affinity recommender built on an LDA model.

    ``train`` fits a count vectorizer and an LDA model on ``df['context']``
    and accumulates, per sender, a topic -> weight table. ``predict``
    scores a text against those tables by Euclidean distance.
    """

    def __init__(self, num_topics=15):
        """Create an untrained recommender with ``num_topics`` LDA topics."""
        self.numtopics = num_topics
        # Template {topic_id: 0.0}; copied for every user in train().
        self.topic_dict = dict(enumerate(np.zeros(num_topics)))
        self.user_dict = {}
        self.model = None
        self.worddict = {}
        self.mydict = None

    def train(self, df):
        """Fit the LDA model and build per-sender topic affinities.

        Args:
            df: DataFrame with ``sender``, ``context`` and ``amt`` columns.
                Rows with an empty ``context`` are skipped; an empty ``amt``
                skips the weighted update for that row.
        """
        self.user_dict = {
            el: self.topic_dict.copy() for el in df.sender.unique()
        }
        cv = CV(stop_words='english')
        X = cv.fit_transform(df['context'])
        # vocabulary_ maps word -> column index. The previous
        # enumerate(vocabulary_.keys()) numbered words by insertion order,
        # which does not correspond to the matrix columns.
        self.worddict = {
            index: word for word, index in cv.vocabulary_.items()
        }
        corpus = matutils.Sparse2Corpus(X, documents_columns=False)
        self.mydict = Dictionary.from_corpus(corpus, id2word=self.worddict)
        self.model = LatentDA.LdaModel(corpus,
                                       num_topics=self.numtopics,
                                       passes=20,
                                       id2word=self.worddict)

        # The loop previously used the undefined names new_model, mydict and
        # user_dict; it now reads the instance attributes.
        for _, row in df.iterrows():
            if row['context'] == '':
                continue
            user_topics = self.user_dict[row.sender]
            bow = self.mydict.doc2bow(row['context'].split())
            for topic_id, weight in self.model[bow]:
                if topic_id in user_topics:
                    if row.amt == '':
                        continue
                    user_topics[topic_id] += weight * float(row.amt)
                else:
                    user_topics[topic_id] = weight

        # Normalise each user's affinities to sum to one. Users whose total
        # is zero are left untouched instead of dividing by zero.
        for affinities in self.user_dict.values():
            total = sum(affinities.values())
            if not total:
                continue
            for topic_id in affinities:
                affinities[topic_id] = affinities[topic_id] / total

    def _user_affinity(self, user):
        """Return ``user``'s affinities as a vector indexed by topic id."""
        scores = self.user_dict[user]
        return np.array(
            [scores.get(topic_id, 0.0) for topic_id in range(self.numtopics)])

    def predict(self, text, username=''):
        """Score ``text`` against stored user affinities.

        Args:
            text: whitespace-separated document to score.
            username: if empty, every known user is scored.

        Returns:
            ``{user: distance}`` for all users when ``username`` is empty,
            otherwise the tuple ``(username, distance)``. Distance is the
            Euclidean norm between the user vector and the document vector.
        """
        topics = self.model[self.mydict.doc2bow(text.split())]
        doc_aff = np.zeros(self.numtopics)
        for topic_id, weight in topics:
            doc_aff[topic_id] = weight

        if username == '':
            returndict = {}
            for user in self.user_dict:
                returndict[user] = np.linalg.norm(
                    self._user_affinity(user) - doc_aff)
            return returndict
        score = np.linalg.norm(self._user_affinity(username) - doc_aff)
        return (username, score)
def main(
    cuda,
    batch_size,
    epochs,
    top_words,
    testing_mode,
):
    """Train a ProdLDA autoencoder on dense count data and report topics.

    Loads ``data/train.txt.npy``, ``data/test.txt.npy`` and
    ``data/vocab.pkl``, converts each non-empty document into a
    bag-of-words count vector, trains for ``epochs`` epochs, prints
    per-topic and overall ``u_mass`` coherence, and logs to TensorBoard.

    Args:
        cuda: move the model to the GPU when truthy.
        batch_size: forwarded to ``train``.
        epochs: forwarded to ``train``.
        top_words: number of highest-weighted words that define a topic.
        testing_mode: when truthy, skip writing the feature embedding.
    """
    print('Loading input data')
    # TODO fix relative paths
    # The loop below treats each document as its own array, i.e. the files
    # appear to hold pickled object arrays. NumPy >= 1.16.3 refuses those
    # unless allow_pickle=True is given.
    # NOTE: unpickling can execute arbitrary code; only use trusted files.
    input_train = np.load('data/train.txt.npy',
                          encoding='bytes',
                          allow_pickle=True)
    input_val = np.load('data/test.txt.npy',
                        encoding='bytes',
                        allow_pickle=True)
    with open('data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
    reverse_vocab = {index: word for word, index in vocab.items()}
    indexed_vocab = [
        reverse_vocab[index] for index in range(len(reverse_vocab))
    ]

    def to_counts(documents):
        """Turn word-id documents into count rows, dropping zero-sum ones."""
        return np.array([
            np.bincount(doc.astype('int'), minlength=len(vocab))
            for doc in documents
            if doc.sum() > 0
        ])

    data_train = to_counts(input_train)
    data_val = to_counts(input_val)

    corpus = Dense2Corpus(data_train, documents_columns=False)
    # Built once: it depends only on the corpus and vocabulary, so rebuilding
    # it on every epoch repeated a full pass over the corpus for nothing.
    dictionary = Dictionary.from_corpus(corpus, reverse_vocab)

    def report_coherence(model):
        """Print per-topic and overall u_mass coherence; return the latter."""
        decoder_weight = model.decoder.linear.weight.detach().cpu()
        topics = [
            [reverse_vocab[item.item()] for item in topic]
            for topic in decoder_weight.topk(top_words, dim=0)[1].t()
        ]
        cm = CoherenceModel(
            topics=topics,
            corpus=corpus,
            dictionary=dictionary,
            coherence='u_mass'
        )
        coherence = cm.get_coherence()
        coherences = cm.get_coherence_per_topic()
        for index, topic in enumerate(topics):
            print(str(index) + ':' + str(coherences[index]) + ':' +
                  ','.join(topic))
        print(coherence)
        return coherence

    writer = SummaryWriter()  # create the TensorBoard object
    try:
        # callback function to call during training, uses writer from the scope
        def training_callback(autoencoder, epoch, lr, loss, perplexity):
            coherence = report_coherence(autoencoder)
            writer.add_scalars('data/autoencoder', {
                'lr': lr,
                'loss': loss,
                'perplexity': perplexity,
                'coherence': coherence,
            }, global_step=epoch)

        ds_train = TensorDataset(torch.from_numpy(data_train).float())
        ds_val = TensorDataset(torch.from_numpy(data_val).float())
        autoencoder = ProdLDA(
            in_dimension=len(vocab),
            hidden1_dimension=100,
            hidden2_dimension=100,
            topics=50
        )
        if cuda:
            autoencoder.cuda()
        print('Training stage.')
        ae_optimizer = Adam(autoencoder.parameters(), 0.001,
                            betas=(0.99, 0.999))
        train(
            ds_train,
            autoencoder,
            cuda=cuda,
            validation=ds_val,
            epochs=epochs,
            batch_size=batch_size,
            optimizer=ae_optimizer,
            update_callback=training_callback
        )
        autoencoder.eval()
        report_coherence(autoencoder)
        if not testing_mode:
            writer.add_embedding(
                autoencoder.encoder.linear1.weight.detach().cpu().t(),
                metadata=indexed_vocab,
                tag='feature_embeddings',
            )
    finally:
        # Flush and release the event file even if training fails.
        writer.close()
# NOTE(review): this conditional reads `row` and `docs`, neither of which is
# defined in the visible chunk; it presumably sits inside a loop over rows
# whose header is out of view - confirm its enclosing indentation.
# Non-empty 'Content' values are stringified, UTF-8 encoded and collected.
if row['Content']:
    docs.append(
        str(row['Content']).encode(encoding='UTF-8', errors='strict'))


def vect2gensim(vectorizer, dtmatrix):
    """Convert a vectorizer's document-term matrix into gensim objects.

    Args:
        vectorizer: fitted vectorizer exposing ``vocabulary_``
            (a word -> column index mapping).
        dtmatrix: sparse matrix passed to ``Sparse2Corpus`` with
            ``documents_columns=False``.

    Returns:
        A ``(corpus, dictionary)`` tuple.
    """
    # transform sparse matrix into gensim corpus and dictionary
    corpus_vect_gensim = gensim.matutils.Sparse2Corpus(dtmatrix,
                                                       documents_columns=False)
    # vocabulary_ is word -> index; it is inverted here to index -> word.
    dictionary = Dictionary.from_corpus(
        corpus_vect_gensim,
        id2word=dict(
            (id, word) for word, id in vectorizer.vocabulary_.items()))
    return (corpus_vect_gensim, dictionary)


# compute vector space with sklearn
vect = CountVectorizer(min_df=1, ngram_range=(1, 1), max_features=30000)
corpus_vect = vect.fit_transform(docs)
# transport to gensim
(corpus_vect_gensim, gensim_dict) = vect2gensim(vect, corpus_vect)
# NOTE(review): this rebuilds a dictionary from the same corpus and
# vocabulary that vect2gensim just used for `gensim_dict`; the two look
# redundant - confirm whether `gensim_dict` could be reused instead.
dictionary = Dictionary.from_corpus(
    corpus_vect_gensim,
    id2word=dict((id, word) for word, id in vect.vocabulary_.items()))
# Persist the fitted vectorizer; `filename` is defined outside this chunk.
pd.to_pickle(vect, 'pickles/vocab_' + filename + '.pkl')
print(dictionary)