def calc_lda(tf_matrix, n_topics=10, max_iter=20):
    # the n_topics constructor argument was renamed to n_components in scikit-learn
    lda = decomposition.LatentDirichletAllocation(n_components=n_topics,
                                                  max_iter=max_iter,
                                                  learning_method='online',
                                                  learning_offset=50.)
    doctopic = lda.fit_transform(tf_matrix)
    return doctopic, lda
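# Hedged usage sketch (added for illustration, not from the original source): shows
# how calc_lda might be called on a small hypothetical corpus. It assumes the module
# already does `from sklearn import decomposition`, as calc_lda requires.
def _example_calc_lda():
    from sklearn.feature_extraction.text import CountVectorizer

    docs = ["the cat sat on the mat",
            "dogs and cats play outside",
            "stock prices fell on market news"]
    tf_matrix = CountVectorizer(stop_words='english').fit_transform(docs)
    doctopic, lda = calc_lda(tf_matrix, n_topics=2, max_iter=10)
    # doctopic has one row per document and one column per topic,
    # each row summing (approximately) to 1.
    return doctopic.shape  # (3, 2)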
def summary():
    if 'show_id' in request.args:
        showrun = request.args.get('show_id')
        states = State.query.filter_by(showrun=int(showrun)).order_by(
            State.created_date.desc()).all()
        texts = [s.text for s in states]
    else:
        texts = []
        show_ids = request.args.get('show_ids')
        show_ids = [int(x) for x in show_ids.split(',')]
        for show_id in show_ids:
            states = State.query.filter_by(showrun=show_id).order_by(
                State.created_date.desc()).all()
            t = [s.text for s in states]
            texts.extend(t)

    vectorizer = TfidfVectorizer(stop_words='english', min_df=1,
                                 tokenizer=tokenize_nltk)
    dtm = vectorizer.fit_transform(texts).toarray()
    vocab = np.array(vectorizer.get_feature_names())

    # Define topic model: LatentDirichletAllocation (LDA)
    clf = decomposition.LatentDirichletAllocation(n_components=5, random_state=3)
    num_top_words = 3
    doctopic = clf.fit_transform(dtm)

    topic_words = []
    for topic in clf.components_:
        word_idx = np.argsort(topic)[::-1][0:num_top_words]
        topic_words.append([vocab[i] for i in word_idx])

    # use a set to keep only unique topic words
    merged = list(set(itertools.chain.from_iterable(topic_words)))
    return jsonify({'data': {'topics': merged}})
def runLDA(train, test, dev, numComponents, applianceName):
    # Runs Latent Dirichlet Allocation on train/test/dev datasets of frequency
    # distributions. numComponents specifies the number of clusters (topics)
    # for the model. Returns the perplexity score on the dev set and the
    # cluster assignments for the train, dev and test sets.
    model = skld.LatentDirichletAllocation(n_components=numComponents, verbose=0)
    model.fit(train)

    predictions = model.transform(test)
    test_classification = np.argmax(predictions, 1).astype(int)

    predictions2 = model.transform(dev)
    dev_classification = np.argmax(predictions2, 1).astype(int)

    predictions3 = model.transform(train)
    train_classification = np.argmax(predictions3, 1).astype(int)

    if numComponents > 10:
        nRows = 3
    else:
        nRows = 2

    perplex = model.perplexity(dev)
    #compare_distributions(test, dev_classification, numComponents, nRows, applianceName)
    #compare_distributions(dev, dev_classification, numComponents, nRows, applianceName)
    return perplex, train_classification, dev_classification, test_classification
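# Hedged usage sketch (illustrative, not from the source): runLDA only needs
# non-negative count/frequency matrices, so random Poisson counts stand in for the
# appliance frequency distributions described above; "kettle" is a hypothetical
# appliance name, and the module is assumed to import sklearn.decomposition as skld
# and numpy as np.
def _example_runLDA():
    import numpy as np

    rng = np.random.RandomState(0)
    train = rng.poisson(2.0, size=(40, 12))
    dev = rng.poisson(2.0, size=(10, 12))
    test = rng.poisson(2.0, size=(10, 12))
    perplex, train_cls, dev_cls, test_cls = runLDA(
        train, test, dev, numComponents=4, applianceName="kettle")
    # perplex: dev-set perplexity; *_cls: per-sample topic (cluster) assignments
    return perplex, train_cls[:5]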
def latent_dirichlet():
    '''
    Fits a latent dirichlet allocation model to the corpus of organization
    descriptions and/or self-reported IRS designations. Then prints the top
    10 words in each component. Does NOT return the model for further use.
    '''
    n_top_words = 10
    connect = sql.connect("myform/with_coords")
    db = connect.cursor()
    query = '''SELECT text_dump, pp_text FROM mcp'''

    corpus = []
    for all_text in db.execute(query).fetchall():
        writing = ""
        for alpha in all_text:
            if alpha:
                writing += alpha + " "
        corpus.append(writing)

    new_stops = ["chicago", "illinois", "founded", "year"]
    vect = sktext.CountVectorizer(stop_words=STOP_WORDS + new_stops)
    dtm = vect.fit_transform(corpus)

    model = skdecomp.LatentDirichletAllocation(
        n_components=N_CLUST, max_iter=5,
        learning_method='online', learning_offset=50)
    model.fit(dtm)

    tf_feature_names = vect.get_feature_names()
    print_top_words(model, tf_feature_names, n_top_words)
def __init__(self, config):
    self.topics_name = 'X_topics'
    self.name = 'lda_model'
    self.model = decomposition.LatentDirichletAllocation(
        n_components=config['n_components'],
        learning_method=config['learning_method'],
        max_iter=config['max_iter'])
def train_lda_model(x_train_count, count_vector):
    """ Train an LDA model """
    lda_model = decomposition.LatentDirichletAllocation(
        n_components=20, learning_method='online', max_iter=20)
    x_topics = lda_model.fit_transform(x_train_count)
    topic_word = lda_model.components_
    vocab = count_vector.get_feature_names()
    return x_topics, topic_word, vocab
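# Hedged usage sketch (added for illustration, not from the original source):
# train_lda_model expects a term-count matrix plus the fitted CountVectorizer that
# produced it. The corpus below is hypothetical, and the module is assumed to import
# sklearn's decomposition as the function above requires.
def _example_train_lda_model():
    from sklearn.feature_extraction.text import CountVectorizer

    texts = ["machine learning models need data",
             "topic models group words into themes",
             "football players scored two goals"]
    count_vector = CountVectorizer()
    x_train_count = count_vector.fit_transform(texts)
    x_topics, topic_word, vocab = train_lda_model(x_train_count, count_vector)
    # x_topics: (n_docs, 20) document-topic weights
    # topic_word: (20, n_vocab) topic-word weights
    return x_topics.shape, topic_word.shape, len(vocab)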
def modeling(docs, alpha, beta, num_words, num_topics):
    # Pass hyperparameters by keyword: alpha is the document-topic prior and
    # beta the topic-word prior.
    learning = sk.LatentDirichletAllocation(n_components=num_topics,
                                            doc_topic_prior=alpha,
                                            topic_word_prior=beta,
                                            learning_method='batch',
                                            max_iter=5000)
    learning.fit(docs)
    return (learning.transform(docs), learning.components_)
def Linear_discriminant_analysis(self, source):
    min_max_scaler = preprocessing.MinMaxScaler()
    data_source = min_max_scaler.fit_transform(source)
    # Note: despite the function name, this uses LatentDirichletAllocation;
    # original comment: "LDA (linear discriminant) apparently isn't available???"
    lda = decomposition.LatentDirichletAllocation(n_components=2)
    result = {}
    result['data'] = lda.fit_transform(data_source)
    result['params'] = 0
    return result
def lda(
    n_topics: int,
    name: Optional[str] = "lda",
) -> TopicModelingOperation:
    model = skdecomp.LatentDirichletAllocation(n_components=n_topics)
    return TopicModelingOperation(
        model=model,
        name=name,
    )
def extract_topics_lda(data_samples, preprocessor, n_features, n_topics,
                       n_top_words, n_gram_range=(1, 1), more_stopwords=None):
    lda = decomposer.LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                               learning_method='online',
                                               learning_offset=50., random_state=0)
    topics_words = _extract_topics_decomposer(data_samples, preprocessor, lda,
                                               n_features, n_topics, n_top_words,
                                               n_gram_range, more_stopwords)
    return topics_words
def run_latent_dirichlet_allocation(X=None):
    lda = decomposition.LatentDirichletAllocation()
    if X is None:
        X = users_as_real_vectors(users)
    p = lda.fit_transform(X)
    # keep only points whose first two topic weights fall inside the cutoff disc
    mask = (p[:, 0] * p[:, 0] + p[:, 1] * p[:, 1] < 12)
    p = p[mask]
    plt.figure()
    plt.scatter(p[:, 0], p[:, 1])
    plt.show()
def extract_topics(data_samples, lang, n_features, n_topics, n_top_words,
                   more_stopwords=None):
    n_gram_range = (2, 2)

    t0 = time()
    preprocessor = None
    if lang in ["en", "english"]:
        preprocessor = TokenHandler.EnTokenHandler(stemming=True, stopword=True)
    if lang in ["tr", "turkish"]:
        preprocessor = TokenHandler.TrTokenHandler(stopword=True,
                                                   more_stopwords=more_stopwords,
                                                   stemming=False,
                                                   remove_numbers=True,
                                                   deasciify=True,
                                                   remove_punkt=True)
    '''
    tf_vectorizer = txtfeatext.CountVectorizer(tokenizer=preprocessor,
                                               ngram_range=(1, 2),
                                               max_features=n_features)
    tf_matrix = tf_vectorizer.fit_transform(data_samples)
    '''
    tfidf_vectorizer = txtfeatext.TfidfVectorizer(tokenizer=preprocessor,
                                                  ngram_range=n_gram_range,
                                                  max_features=n_features)
    tfidf_matrix = tfidf_vectorizer.fit_transform(data_samples)
    t1 = time()
    print("1- Vectorizing took ", (t1 - t0), "sec.")

    # apply NMF
    '''
    print("Applying NMF on tf*idf weighted terms, n_samples=%d and n_features=%d..."
          % (n_samples, n_features))
    '''
    nmf = decomposer.NMF(n_components=n_topics, random_state=1, alpha=.1,
                         l1_ratio=.5).fit(tfidf_matrix)
    print("\nTopics in NMF model:")
    print_topic_words(nmf, tfidf_vectorizer, n_top_words)
    #nmf_topics = get_topic_words(model, vectorizer, n_top_words)
    t2 = time()
    print("NMF took ", t2 - t1, "sec.")

    #print("Applying LDA on tf weighted terms, n_samples=%d and n_features=%d..." % (n_samples, n_features))
    lda = decomposer.LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                               learning_method='online',
                                               learning_offset=50., random_state=0)
    lda.fit(tfidf_matrix)
    print("\nTopics in LDA model:")
    print_topic_words(lda, tfidf_vectorizer, n_top_words)
    t3 = time()
    print("LDA took ", t3 - t2, "sec.")
def perform_lda(max_df, min_df, topics, ngram):
    vectorizer = CountVectorizer(stop_words='english', max_df=max_df,
                                 min_df=min_df, ngram_range=ngram)
    matrixX = vectorizer.fit_transform(wordX)
    lda = d.LatentDirichletAllocation(n_components=topics, max_iter=10, verbose=1)
    lda.fit(matrixX)
    return [lda, vectorizer, max_df, min_df, topics, ngram]
def LDA(num_topics, num_top_words, deck):
    vectorizer = CountVectorizer(tokenizer=word_tokenize)
    X = vectorizer.fit_transform(deck)
    X_vocab = np.array(vectorizer.get_feature_names())

    lda = decomposition.LatentDirichletAllocation(n_components=num_topics,
                                                  learning_method='online')
    lda.fit_transform(X)

    lda_topic_words = []
    for topic in lda.components_:
        word_idx = np.argsort(topic)[::-1][0:num_top_words]
        lda_topic_words.append([X_vocab[i] for i in word_idx])
    return lda_topic_words
def topic_modelling(flag):
    # Tokenization, vectorization and LDA training for the reviews dataset.
    pd.set_option('display.max_colwidth', None)
    X_train, x_test = train_test_split(reviews_datasets, test_size=0.9,
                                       random_state=111)

    vectorizer_tf = TfidfVectorizer(tokenizer=tokenize, stop_words='english',
                                    max_df=0.75, min_df=50, max_features=10000,
                                    use_idf=False, norm=None)
    tf_vectors = vectorizer_tf.fit_transform(X_train.text)

    if flag == 1:
        lda = decomposition.LatentDirichletAllocation(n_components=10, max_iter=3,
                                                      learning_method='online',
                                                      learning_offset=50,
                                                      n_jobs=-1, random_state=111)
    else:
        with open("lda_model.pk", "rb") as f:
            lda = pickle.load(f)

    W1 = lda.fit_transform(tf_vectors)
    H1 = lda.components_

    if flag == 1:
        # persist the fitted model (dumping before fitting would save an
        # untrained estimator)
        with open("lda_model.pk", "wb") as f:
            pickle.dump(lda, f)

    num_words = 15
    vocab = np.array(vectorizer_tf.get_feature_names())
    top_words = lambda t: [vocab[i] for i in np.argsort(t)[:-num_words - 1:-1]]
    topic_words = [top_words(t) for t in H1]
    topics = [' '.join(t) for t in topic_words]

    colnames = ["Topic" + str(i) for i in range(lda.n_components)]
    docnames = ["Doc" + str(i) for i in range(len(X_train.text))]
    df_doc_topic = pd.DataFrame(np.round(W1, 2), columns=colnames, index=docnames)
    topic_important = np.argmax(df_doc_topic.values, axis=1)
    df_doc_topic['most_matched_topic'] = topic_important

    print("Log Likelihood: ", lda.score(tf_vectors))
    print("Perplexity: ", lda.perplexity(tf_vectors))
    return lda, vectorizer_tf, topics
def _topics_extraction_with_lda(X):
    tf_vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2),
                                    stop_words=get_stop_words('nl'))
    data = tf_vectorizer.fit_transform(X)
    #print('Before LDA, sample size: {}'.format(data.shape))
    best_params_ = {
        'n_components': 4,
        'max_iter': 100,
        'n_jobs': -1,
        'learning_method': 'batch'
    }
    lda = decomposition.LatentDirichletAllocation(**best_params_)
    data = lda.fit_transform(data)
    #print('After LDA, sample size: {}'.format(data.shape))
    return data
def construct_model(self):
    '''
    Learn a 10-topic LDA model on the wine descriptions provided in the
    database.
    '''
    df = self.df
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=2000,
                                    stop_words='english')
    tf_names = tf_vectorizer.fit_transform(df['description'].dropna())
    model = decomposition.LatentDirichletAllocation()
    model.fit(tf_names)
    self.model = model
    self.tf_vectorizer = tf_vectorizer
def get_top_k_latent_semantics(k, model, gesture_word_matrix):
    # use the k argument rather than a module-level top_k_input
    if model == "PCA":
        model = decomposition.PCA(n_components=k)
    elif model == "SVD":
        model = decomposition.TruncatedSVD(n_components=k)
    elif model == "NMF":
        model = decomposition.NMF(n_components=k, max_iter=10000)
    elif model == "LDA":
        model = decomposition.LatentDirichletAllocation(n_components=k)
    model.fit(gesture_word_matrix)
    # take the top-k latent semantics
    top_k_matrix = model.components_
    return top_k_matrix
def get_lda_df():
    lda = decomp.LatentDirichletAllocation(
        n_components=data_config.lda_num_topics,
        doc_topic_prior=1. / 200,
        topic_word_prior=1. / 200,
    )
    print("getting sample 1")
    lda_w_2_idx, lda_sample = models.create_lda_sample(p=.1)
    print("partial fit 1")
    lda.partial_fit(lda_sample)
    print("getting sample 2")
    _, lda_sample = models.create_lda_sample(p=.1)
    print("partial fit 2")
    lda.partial_fit(lda_sample)

    lda_mean = {}
    lda_mean_mean = {}
    lda_max = {}  # needed below for df_lda_max and the returned frame
    print("labeling corpus")
    for day, comments in corpus.get_day_preprocessed_comments(p=.25,
                                                              include_oos=True):
        print(day)
        coo_dict = coll.defaultdict(int)
        for doc_idx, sent_tokens in enumerate(comments):
            for sent in sent_tokens:
                for token in sent:
                    if token not in lda_w_2_idx:
                        continue
                    word_idx = lda_w_2_idx[token]
                    coo_dict[doc_idx, word_idx] += 1
        lda_day = lda.transform(util.d_to_sparse(
            coo_dict, shape=(doc_idx + 1, lda.components_.shape[1])))
        lda_day_mean = (lda_day.T / lda_day.sum(axis=1)).T
        lda_mean[day] = lda_day.mean(axis=0)
        lda_mean_mean[day] = lda_day_mean.mean(axis=0)
        lda_max[day] = lda_day.max(axis=0)

    lda_idx_2_w = {idx: w for w, idx in lda_w_2_idx.items()}

    days, lda_reps = zip(*sorted(lda_mean.items(), key=lambda x: x[0]))
    df_lda_mean = pd.DataFrame(list(map(tuple, lda_reps)), index=days)
    days, lda_reps = zip(*sorted(lda_mean_mean.items(), key=lambda x: x[0]))
    df_lda_mean_mean = pd.DataFrame(list(map(tuple, lda_reps)), index=days)
    days, lda_reps = zip(*sorted(lda_max.items(), key=lambda x: x[0]))
    df_lda_max = pd.DataFrame(list(map(tuple, lda_reps)), index=days)

    return lda, lda_w_2_idx, pd.DataFrame(list(map(tuple, lda_reps)), index=days)
def Dirichlet(documents, vectorizer):
    dtm = vectorizer.fit_transform(documents).toarray()
    lda = decomposition.LatentDirichletAllocation(n_components=num_topics,
                                                  max_iter=50,
                                                  learning_method='online',
                                                  learning_offset=50.,
                                                  random_state=0,
                                                  verbose=1,
                                                  evaluate_every=1,
                                                  doc_topic_prior=0.2,
                                                  topic_word_prior=0.6)
    lda.fit(dtm)
    print_top_words(lda, vectorizer)
    return lda
def generate_topic(data, num_topic=10):
    vector = CountVectorizer()
    vector.fit(data)
    vocab = vector.vocabulary_
    vector = CountVectorizer(stop_words="english", vocabulary=vocab.keys())
    X = vector.fit_transform(data)

    lda = decomposition.LatentDirichletAllocation(n_components=num_topic,
                                                  learning_method="online")
    # fit one day at a time (newest to oldest), updating the priors from the
    # day's mean document-topic distribution
    for day in range(X.shape[0] - 1, -1, -1):
        lda.partial_fit(X[day, :])
        doc_topic = lda.transform(X[day, :])
        alpha = sum(doc_topic) / len(doc_topic)
        eta = sum(doc_topic) / len(doc_topic)
        lda.set_params(doc_topic_prior=alpha, topic_word_prior=eta)

    doc_topic = lda.transform(X)
    doc_topic = pandas.DataFrame(doc_topic)
    return doc_topic
def fit_and_predict_LDA(num_topics, num_top_words, vocab, dtm_train, dtm_test):
    """ Fit the LDA topic model to the training document-term matrix. Using the
    generated topics, map the test document-term matrix to a document-topic
    matrix. Also return the topic words.

    Parameters
    ----------
    num_topics: int
        number of topics the LDA decomposition should generate
    num_top_words: int
        number of topic words stored in the topic_words list
    vocab: sequence
        indexable sequence of the unique terms in the reviews
    dtm_train: scipy sparse matrix
        Data for training (matrix with features, e.g. BOW or tf-idf)
    dtm_test: scipy sparse matrix
        Data for testing and used for 'prediction' (matrix with features,
        e.g. BOW or tf-idf)

    Returns
    -------
    Tuple(numpy.ndarray, list)
        Returns doctopic, topic_words as a tuple
    """
    lda = decomposition.LatentDirichletAllocation(n_components=num_topics,
                                                  random_state=1)
    lda.fit(dtm_train)
    doctopic = lda.transform(dtm_test)
    # scale the document-topic matrix so the topic weights of each document sum to one
    doctopic = doctopic / np.sum(doctopic, axis=1, keepdims=True)

    topic_words = []
    for topic in lda.components_:  # components_ is the topic-term matrix
        word_idx = np.argsort(topic)[::-1][0:num_top_words]
        topic_words.append([vocab[i] for i in word_idx])

    print_top_5_topics(doctopic, len(lda.components_), topic_words)
    # for t in range(len(topic_words)):
    #     print("Topic {}: {}".format(t + 1, ' '.join(topic_words[t][:10])))

    # The tokenization type is hard coded to keep this function's arguments simple.
    pickle.dump((doctopic, topic_words),
                open("pickles/lda-np-" + str(num_topics) + "-doctopic-topic_words.p", "wb"))
    return (doctopic, topic_words)
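# Hedged usage sketch (illustrative only): builds a tiny bag-of-words train/test
# split and feeds it to fit_and_predict_LDA. The corpus is hypothetical, and the
# call assumes the module's print_top_5_topics helper plus an existing pickles/
# directory, both of which the function above relies on.
def _example_fit_and_predict_lda():
    from sklearn.feature_extraction.text import CountVectorizer

    train_docs = ["great food and friendly service",
                  "the pasta was cold and bland",
                  "lovely atmosphere, great wine list"]
    test_docs = ["cold food but friendly staff"]
    vectorizer = CountVectorizer()
    dtm_train = vectorizer.fit_transform(train_docs)
    dtm_test = vectorizer.transform(test_docs)
    vocab = vectorizer.get_feature_names()  # indexable list of terms
    doctopic, topic_words = fit_and_predict_LDA(
        num_topics=2, num_top_words=5, vocab=vocab,
        dtm_train=dtm_train, dtm_test=dtm_test)
    return doctopic, topic_words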
def process_lda():
    # inputs
    with codecs.open(sys.argv[1], encoding='UTF-8') as f:
        file_lines = f.read().splitlines()
    #lines = cleanup(file_lines)
    lines = file_lines

    # params for LDA
    n_feats = 250  # 1000
    n_topics = 20
    n_top_words = 50

    # getting a custom stop-words list
    en_stop_words = stop_words()

    # use tf (raw term count) features for LDA
    tf_vectorizer = text.CountVectorizer(
        max_df=0.95, min_df=2,
        #max_features=n_feats, stop_words='english')
        max_features=n_feats, stop_words=en_stop_words)
    tf = tf_vectorizer.fit_transform(lines)
    tf_feature_names = tf_vectorizer.get_feature_names()

    # fit an LDA model to the tf feats of the textual data
    lda = decomposition.LatentDirichletAllocation(n_components=n_topics,
                                                  max_iter=10,
                                                  learning_method='online',
                                                  learning_offset=50.,
                                                  random_state=0,
                                                  verbose=1).fit(tf)

    # outputs
    out_path = sys.argv[2] + "/lda." + sys.argv[1].split("/")[-1:].pop() + ".out"
    with codecs.open(out_path, "w", encoding='UTF-8') as f:
        messages = print_top_words(lda, tf_feature_names, n_top_words)
        for m in messages:
            # write the text itself rather than the bytes repr produced by encode()
            f.write("{}\n".format(m))
def extract_topic(str_arg, num_topics=1, num_top_words=3):
    vectorizer = text.CountVectorizer(input='content', analyzer='word',
                                      lowercase=True, stop_words='english')
    dtm = vectorizer.fit_transform(str_arg.split())
    vocab = np.array(vectorizer.get_feature_names())

    #clf = decomposition.NMF(n_components=num_topics, random_state=1)

    # topic extraction
    clf = decomposition.LatentDirichletAllocation(n_components=num_topics,
                                                  learning_method='online')
    clf.fit_transform(dtm)

    topic_words = []
    for topic in clf.components_:
        word_idx = np.argsort(topic)[::-1][0:num_top_words]  # [::-1] reverses the list
        topic_words.append([vocab[i] for i in word_idx])
    return topic_words
def lda(df, cat=('Name', 'Platform', 'Genre', 'Developer', 'Year_of_Release',
                 'Rating')):
    for v in tqdm(itertools.permutations(cat, 2)):
        # co-occurrence matrix
        n_comp = 3
        if (v[0] in ['Publisher', 'Name', 'Developer']) & (
                v[1] in ['Publisher', 'Name', 'Developer', 'Rating']):
            continue
        if (v[0] == 'Platform_Genre') & (v[1] in ['Platform', 'Genre']):
            continue
        if (v[1] == 'Platform_Genre') & (v[0] in ['Platform', 'Genre']):
            continue
        if f'lda1_{v[0]}_{v[1]}' not in df.columns.values.tolist():
            print(f'{v[0]} vs {v[1]}')
            agg_df = pd.crosstab(df[v[0]], df[v[1]])
            # lda / nmf / pca decompositions of the co-occurrence counts
            trans = decomposition.LatentDirichletAllocation(
                n_components=n_comp, random_state=42)
            trans2 = decomposition.NMF(n_components=n_comp, max_iter=8000,
                                       random_state=42)
            trans3 = decomposition.PCA(n_components=n_comp, random_state=42)
            lda_df = add_transformed(agg_df, trans, v, method='lda')
            nmf_df = add_transformed(agg_df, trans2, v, method='nmf')
            pca_df = add_transformed(agg_df, trans3, v, method='pca')
            # merge
            df = df.merge(lda_df, how='left', on=v[0])
            df = df.merge(nmf_df, how='left', on=v[0])
            df = df.merge(pca_df, how='left', on=v[0])
    return df
def get_lda_features(features, xtrain_count):
    # create a count vectorizer object
    count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
    count_vect.fit(features)

    # train an LDA model
    lda_model = decomposition.LatentDirichletAllocation(
        n_components=20, learning_method='online', max_iter=20)
    X_topics = lda_model.fit_transform(xtrain_count)
    topic_word = lda_model.components_
    vocab = count_vect.get_feature_names()

    # view the topic models
    n_top_words = 10
    topic_summaries = []
    for i, topic_dist in enumerate(topic_word):
        topic_words = numpy.array(vocab)[numpy.argsort(
            topic_dist)][:-(n_top_words + 1):-1]
        topic_summaries.append(' '.join(topic_words))
    return topic_summaries
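# Hedged usage sketch (illustrative, not from the source): get_lda_features expects
# the raw texts used to fit its internal vectorizer plus a count matrix built from
# the same vocabulary. The corpus is hypothetical, and the module-level numpy,
# CountVectorizer and decomposition imports the function relies on are assumed.
def _example_get_lda_features():
    from sklearn.feature_extraction.text import CountVectorizer

    features = ["cheap flights to rome", "hotel deals in paris",
                "train tickets and airport transfers"]
    count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
    xtrain_count = count_vect.fit_transform(features)
    topic_summaries = get_lda_features(features, xtrain_count)
    return topic_summaries[:3]  # top words joined per topic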
def clustervis_pipelines(visdim):
    return {
        'PCA': Pipeline([
            ('sca', preprocessing.MaxAbsScaler()),
            ('clu', decomposition.PCA(n_components=visdim)),
            ('sca2', preprocessing.MaxAbsScaler()),
        ]),
        'NMF': Pipeline([
            ('sca', preprocessing.MaxAbsScaler()),
            ('clu', decomposition.NMF(n_components=visdim, random_state=1,
                                      alpha=.1, l1_ratio=.5)),
            ('sca2', preprocessing.MaxAbsScaler()),
        ]),
        'LDA': Pipeline([
            ('sca', preprocessing.MaxAbsScaler()),
            ('clu', decomposition.LatentDirichletAllocation(
                n_components=visdim, learning_method='online')),
            ('sca2', preprocessing.MaxAbsScaler()),
        ]),
        'SVD': Pipeline([
            ('sca', preprocessing.MaxAbsScaler()),
            ('clu', decomposition.TruncatedSVD(n_components=visdim)),
            ('sca2', preprocessing.MaxAbsScaler()),
        ]),
    }
def perform(self):
    lda = skd.LatentDirichletAllocation(n_components=self.n_components,
                                        random_state=self.random_state)
    transform = lda.fit_transform(self.data)
    return transform
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print
print("\n")

print("Prediction")
prediction = model.predict(valid_x)
print(prediction)'''

# train an LDA model
lda_model = decomposition.LatentDirichletAllocation(n_components=20,
                                                    learning_method='online',
                                                    max_iter=20)
X_topics = lda_model.fit_transform(xtrain_count)
topic_word = lda_model.components_
vocab = count_vect.get_feature_names()

# view the topic models
n_top_words = 10
topic_summaries = []
for i, topic_dist in enumerate(topic_word):
    topic_words = numpy.array(vocab)[numpy.argsort(topic_dist)][:-(n_top_words + 1):-1]
    topic_summaries.append(' '.join(topic_words))


def train_model(classifier, feature_vector_train, label, feature_vector_valid,
                is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
#dtm = extract_tfidf_dtm(documents, advanced_parsing.extract_np_tokens)
count_vect = pickle.load(open("pickles/np-30000-count-vect.p", "rb"))
tfidf_transformer = pickle.load(open("pickles/np-30000-tfidf-trans.p", "rb"))
dtm = pickle.load(open("pickles/np-30000-dtm.p", "rb"))

import numpy as np  # a conventional alias
import sklearn.feature_extraction.text as text
from sklearn import decomposition

num_topics = 60
num_top_words = 20
lda = decomposition.LatentDirichletAllocation(n_components=num_topics,
                                              random_state=1)

# this next step may take some time
doctopic = lda.fit_transform(dtm)
doctopic = doctopic / np.sum(doctopic, axis=1, keepdims=True)

# print words associated with topics
vocab = count_vect.get_feature_names()  # terms from the pickled count vectorizer
topic_words = []
for topic in lda.components_:
    word_idx = np.argsort(topic)[::-1][0:num_top_words]
    topic_words.append([vocab[i] for i in word_idx])

print(topic_words)
print(word_idx)