vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english',
                             lowercase=True,
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data)

# Build a Latent Dirichlet Allocation model
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10,
                                      learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
# print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Build a Non-Negative Matrix Factorization model
nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(data_vectorized)
# print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Build a Latent Semantic Indexing model
lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(data_vectorized)
# print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Let's see what the first document in the corpus looks like in the different topic spaces
# print(lda_Z[0])
# print(nmf_Z[0])
# print(lsi_Z[0])

def print_topics(model, vectorizer, top_n=10):
    # Print the top_n highest-weighted terms for each topic/component
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % idx)
        print([(vectorizer.get_feature_names()[i], topic[i])
               for i in topic.argsort()[:-top_n - 1:-1]])
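# A minimal usage sketch (an assumption, not part of the original snippet): apply the
# completed print_topics helper to each of the three fitted models in turn.
print("LDA topics:")
print_topics(lda_model, vectorizer)
print("NMF topics:")
print_topics(nmf_model, vectorizer)
print("LSI topics:")
print_topics(lsi_model, vectorizer)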
# R is an m*n matrix: m = items, n = users
RATE_MATRIX = np.array(
    [[5, 5, 3, 0, 5, 5, 4, 3, 2, 1, 4, 1, 3, 4, 5],
     [5, 0, 4, 0, 4, 4, 3, 2, 1, 2, 4, 4, 3, 4, 0],
     [0, 3, 0, 5, 4, 5, 0, 4, 4, 5, 3, 0, 0, 0, 0],
     [5, 4, 3, 3, 5, 5, 0, 1, 1, 3, 4, 5, 0, 2, 4],
     [5, 4, 3, 3, 5, 5, 3, 3, 3, 4, 5, 0, 5, 2, 4],
     [5, 4, 2, 2, 0, 5, 3, 3, 3, 4, 4, 4, 5, 2, 5],
     [5, 4, 3, 3, 2, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0],
     [5, 4, 3, 3, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
     [5, 4, 3, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2],
     [5, 4, 3, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]]
)

nmf_model = NMF(n_components=2)  # assume 2 latent topics
item_dis = nmf_model.fit_transform(RATE_MATRIX)
user_dis = nmf_model.components_

print('User topic distribution: ' + str(user_dis.shape))
print(user_dis)
print('Movie (item) topic distribution: ' + str(item_dis.shape))
print(item_dis)

plt1 = plt
plt1.plot(item_dis[:, 0], item_dis[:, 1], 'ro')
plt1.xlim((-1, 3))
plt1.ylim((-1, 3))
plt1.title(u'Item Distribution')  # set the plot title
count = 1
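# A hedged follow-up sketch (not in the original snippet): the factorization can be
# used to fill in the zero entries of RATE_MATRIX by reconstructing it as the product
# of the item and user factor matrices.
reconstructed = np.dot(item_dis, user_dis)
# Entries that were 0 in RATE_MATRIX can then be read off as predicted ratings.
print(np.round(reconstructed, 2))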
def test_n_components_greater_n_features():
    # Smoke test for the case of more components than features.
    rng = np.random.mtrand.RandomState(42)
    A = np.abs(rng.randn(30, 10))
    NMF(n_components=15, random_state=0, tol=1e-2).fit(A)
def test_custom_nmf(self):
    mat = np.array([[1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0],
                    [1, 0, 0, 0], [1, 0, 0, 0]], dtype=np.float64)
    mat[:mat.shape[1], :] += np.identity(mat.shape[1])

    mod = NMF(n_components=2)
    W = mod.fit_transform(mat)
    H = mod.components_

    def predict(W, H, row_index, col_index):
        return np.dot(W[row_index, :], H[:, col_index])

    pred = mod.inverse_transform(W)
    exp = []
    got = []
    for i in range(mat.shape[0]):
        for j in range(mat.shape[1]):
            exp.append((i, j, pred[i, j]))
            got.append((i, j, predict(W, H, i, j)))
    max_diff = max(abs(e[2] - o[2]) for e, o in zip(exp, got))
    assert max_diff <= 1e-5

    def nmf_to_onnx(W, H):
        """
        Converts an NMF described by matrices *W* and *H* (*WH* approximates the
        training data *M*) into an ONNX function which takes two indices *(i, j)*
        and returns the prediction for that cell. It assumes these indices refer
        to the training data.
        """
        col = OnnxArrayFeatureExtractor(H, 'col')
        row = OnnxArrayFeatureExtractor(W.T, 'row')
        dot = OnnxMul(col, row, op_version=TARGET_OPSET)
        res = OnnxReduceSum(dot, output_names="rec", op_version=TARGET_OPSET)
        indices_type = np.array([0], dtype=np.int64)
        onx = res.to_onnx(inputs={'col': indices_type, 'row': indices_type},
                          outputs=[('rec', FloatTensorType((None, 1)))])
        return onx

    model_onnx = nmf_to_onnx(W, H)
    sess = InferenceSession(model_onnx.SerializeToString())

    def predict_onnx(sess, row_indices, col_indices):
        res = sess.run(None, {'col': col_indices, 'row': row_indices})
        return res

    onnx_preds = []
    for i in range(mat.shape[0]):
        for j in range(mat.shape[1]):
            row_indices = np.array([i], dtype=np.int64)
            col_indices = np.array([j], dtype=np.int64)
            pred = predict_onnx(sess, row_indices, col_indices)[0]
            onnx_preds.append((i, j, pred[0, 0]))
    max_diff = max(abs(e[2] - o[2]) for e, o in zip(exp, onnx_preds))
    assert max_diff <= 1e-5
start_time = time.time()
U_50, sigma_50, Vt_50 = svds(demeaned_input, k=50)
sigma_50 = np.diag(sigma_50)
svd_50_prediction = np.dot(np.dot(U_50, sigma_50), Vt_50) + user_mean
end_time = time.time()

svd_50_HR10 = test.hit_rate(svd_50_prediction[len(train_data):], last_item, 10)
svd_50_HR25 = test.hit_rate(svd_50_prediction[len(train_data):], last_item, 25)
svd_50_arhr = test.arhr(svd_50_prediction[len(train_data):], last_item)
svd_50_time = end_time - start_time

# NMF
start_time = time.time()
nmf = NMF(2)
W = nmf.fit_transform(entire_data)
H = nmf.components_
nmf_prediction = np.dot(W, H)
end_time = time.time()

nmf_HR10 = test.hit_rate(nmf_prediction[len(train_data):], last_item, 10)
nmf_HR25 = test.hit_rate(nmf_prediction[len(train_data):], last_item, 25)
nmf_arhr = test.arhr(nmf_prediction[len(train_data):], last_item)
nmf_time = end_time - start_time

# print tabulated result
table = tabulate(
    [['HR10', dhrbm_HR10, itempop_HR10, itempop_cluster_HR10, svd_10_HR10,
      svd_50_HR10, nmf_HR10
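# A hedged sketch of what the test.hit_rate / test.arhr helpers used above might
# compute (those helpers are assumed by the snippet and not defined in it; the exact
# signatures here are illustrative): HR@k checks whether each user's held-out item
# appears among the top-k scored items, and ARHR weights a hit by the reciprocal of
# its rank.
def hit_rate(predictions, last_item, k):
    hits = 0
    for u, scores in enumerate(predictions):
        top_k = np.argsort(scores)[::-1][:k]
        if last_item[u] in top_k:
            hits += 1
    return hits / len(predictions)

def arhr(predictions, last_item):
    total = 0.0
    for u, scores in enumerate(predictions):
        ranking = np.argsort(scores)[::-1]
        rank = np.where(ranking == last_item[u])[0][0] + 1
        total += 1.0 / rank
    return total / len(predictions)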
def log_stdvar_NMF_L2(X):
    X = log_stdvar(X)
    k = compute_pcs_needed_to_explain_variance(X, 50)
    nmf = NMF(n_components=k)
    Xrd = nmf.fit_transform(X)
    return pairwise_distances(Xrd)
print "Extracting tf-idf features for NMF..." tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english') tfidf = tfidf_vectorizer.fit_transform(posts) print "Extracting tf features for LDA..." tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english') tf = tf_vectorizer.fit_transform(posts) # cell 3 - Using NMF to get top topics print "Fitting the NMF model with tf-idf features," "n_samples=%d and n_features=%d..." % ( n_samples, n_features) nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf) print "\nTopics in NMF model:" tfidf_feature_names = tfidf_vectorizer.get_feature_names() print_top_words(nmf, tfidf_feature_names, n_top_words) # cell 4 - Using LDA to get top topics print "Fitting LDA models with tf features, n_samples=%d and n_features=%d..." % ( n_samples, n_features) lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0) lda.fit(tf)
E_symbol = np.asarray(E_symbol)
P_symbol = np.asarray(P_symbol)
E = pd.DataFrame(E)
PeakO = pd.DataFrame(PeakO)
E = quantileNormalize(E)
PeakO = quantileNormalize(PeakO)

print("Initializing non-negative matrix factorization for E...")
E[E > 10000] = 10000
X = np.log(1 + E)

err1 = np.zeros(rep)
for i in range(0, rep):
    model = NMF(n_components=K, init='random', random_state=i, solver='cd',
                max_iter=50)
    W20 = model.fit_transform(X)
    H20 = model.components_
    err1[i] = LA.norm(X - np.dot(W20, H20), ord='fro')

model = NMF(n_components=K, init='random', random_state=np.argmin(err1),
            solver='cd', max_iter=1000)
W20 = model.fit_transform(X)
H20 = model.components_
S20 = np.argmax(H20, 0)
def ldatopicmodeling(sentencetuples, searchobject):
    """
    see: http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py

    CountVectorizer:
    max_df : float in range [0.0, 1.0] or int, default=1.0
        When building the vocabulary ignore terms that have a document frequency
        strictly higher than the given threshold (corpus-specific stop words).
    min_df : float in range [0.0, 1.0] or int, default=1
        When building the vocabulary ignore terms that have a document frequency
        strictly lower than the given threshold. This value is also called
        cut-off in the literature.

    see sample results at end of file

    :param sentencetuples:
    :param searchobject:
    :return:
    """
    maxfeatures = 2000
    components = 15
    topwords = 15
    maxfreq = .60
    minfreq = 5
    iterations = 12
    mustbelongerthan = 2

    sentencetuples = [s for s in sentencetuples
                      if len(s[1].strip().split(' ')) > mustbelongerthan]
    sentences = [s[1] for s in sentencetuples]
    sentences = [s.split(' ') for s in sentences]
    allwordsinorder = [item for sublist in sentences for item in sublist if item]

    morphdict = getrequiredmorphobjects(set(allwordsinorder))
    morphdict = convertmophdicttodict(morphdict)

    bagsofwords = buildwordbags(searchobject, morphdict, sentences)
    bagsofsentences = [' '.join(b) for b in bagsofwords]

    # Use tf (raw term count) features for LDA.
    ldavectorizer = CountVectorizer(max_df=maxfreq, min_df=minfreq,
                                    max_features=maxfeatures)
    ldavectorized = ldavectorizer.fit_transform(bagsofsentences)

    lda = LatentDirichletAllocation(n_components=components, max_iter=iterations,
                                    learning_method='online',
                                    learning_offset=50., random_state=0)
    lda.fit(ldavectorized)

    print("\nTopics in LDA model:")
    tf_feature_names = ldavectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, topwords)

    # Use tf-idf features for NMF.
    tfidfvectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                      max_features=maxfeatures)
    tfidf = tfidfvectorizer.fit_transform(bagsofsentences)

    # Fit the NMF model (Frobenius norm)
    nmf = NMF(n_components=components, random_state=1, alpha=.1,
              l1_ratio=.5).fit(tfidf)

    print("\nTopics in NMF model (Frobenius norm):")
    tfidffeaturenames = tfidfvectorizer.get_feature_names()
    print_top_words(nmf, tfidffeaturenames, topwords)

    # Fit the NMF model (generalized Kullback-Leibler divergence)
    print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
          "tf-idf features, n_samples=%d and n_features=%d..."
          % (len(sentences), maxfeatures))
    nmf = NMF(n_components=components, random_state=1,
              beta_loss='kullback-leibler', solver='mu', max_iter=1000,
              alpha=.1, l1_ratio=.5).fit(tfidf)

    print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
    tfidffeaturenames = tfidfvectorizer.get_feature_names()
    print_top_words(nmf, tfidffeaturenames, topwords)

    return
y = x_p[:, 1]
plt.figure()
plt.title('after the PCA method')
plt.scatter(x, y, c=label)
plt.xlabel('dimension 1')
plt.ylabel('dimension 2')

# <p style="color:green">So the yellow colour represents the people who died</p>
# <i style="color:blue">We can also use the NMF method</i>

# In[31]:

from sklearn.decomposition import NMF

nmf = NMF(n_components=2)
x_n = nmf.fit(data).transform(data)
print(x_n)

x = x_n[:, 0]
y = x_n[:, 1]
plt.figure()
plt.title('after the NMF method')
plt.scatter(x, y, c=label)
plt.xlabel('dimension 1')
plt.ylabel('dimension 2')

# <h3 style="color:#8080C0">
# Next, we use a machine-learning method to predict the class: patients either
# "died" or were "discharged" from the hospital. You can use K-Nearest Neighbours
# (K-NN) classification, a decision tree, or the Bayes classifier.</h3>

# In[42]:
def lanchNMF(self):
    model = NMF(n_components=3, init='random', random_state=0)
    self.nmf_ = model.fit_transform(self.img)
# Challenge 1
#%%
import numpy as np
np.set_printoptions(threshold=np.inf)
from sklearn.decomposition import NMF

M = [[4, 4, 2, 2, 3, 1, 1],
     [1, 5, 5, 2, 1, 4, 5],
     [1, 5, 1, 1, 4, 1, 4],
     [5, 4, 3, 1, 1, 1, 2],
     [1, 4, 4, 1, 1, 5, 5],
     [5, 5, 3, 5, 5, 1, 2],
     [1, 5, 3, 5, None, 5, 5]]

M1 = [[4, 4, 2, 2, 3, 1, 1],
      [1, 5, 5, 2, 1, 4, 5],
      [1, 5, 1, 1, 4, 1, 4],
      [5, 4, 3, 1, 1, 1, 2],
      [1, 4, 4, 1, 1, 5, 5],
      [5, 5, 3, 5, 5, 1, 2]]

M2 = [[4, 4, 2, 2, 1, 1],
      [1, 5, 5, 2, 4, 5],
      [1, 5, 1, 1, 1, 4],
      [5, 4, 3, 1, 1, 2],
      [1, 4, 4, 1, 5, 5],
      [5, 5, 3, 5, 1, 2],
      [1, 5, 3, 5, 5, 5]]

model1 = NMF(n_components=3)
model1.fit(M1)
W2 = model1.fit_transform(M2)
H2 = model1.components_
W1 = model1.fit_transform(M1)
H1 = model1.components_
print(np.matmul(W2, H1))

# Challenge 2
#%%
# Lloyd's algorithm
import random
import matplotlib.pyplot as plt


class lloyds(object):
def handle(self, *args, **options):
    parent_run_id = options['run_id']
    K = options['K']
    nWords = 50  # options['nWords']
    fileDest = ""  # options['fileDest']

    parent_stat = RunStats.objects.get(pk=parent_run_id)
    n_features = parent_stat.max_features

    if fileDest == '':
        run_id = init(n_features)
        stat = RunStats.objects.get(run_id=run_id)
        stat.query = Query.objects.get(pk=parent_stat.query.id)
        stat.method = "DT"
        stat.parent_run_id = parent_run_id
        stat.save()
        for tp in parent_stat.periods.all():
            stat.periods.add(tp)

    tops = Topic.objects.filter(run_id=parent_run_id,
                                topicterm__isnull=False).distinct()
    terms = Term.objects.all()

    B = np.zeros((tops.count(), terms.count()))

    wt = 0
    for topic in tops:
        tts = TopicTerm.objects.filter(
            topic=topic).order_by('-score')[:nWords]
        if len(tts) == 0:
            if fileDest != '':
                print(wt)
            continue
        print(topic)
        for tt in tts:
            B[wt, tt.term.id] = tt.score * np.log1p(topic.score)
        wt += 1

    col_sum = np.sum(B, axis=0)
    vocab_ids = np.flatnonzero(col_sum)
    row_sum = np.sum(B, axis=1)
    top_ids = np.flatnonzero(row_sum)
    print(np.where(~B.any(axis=1)))

    # we only want the columns where there are at least some
    # topic-term values
    B = B[:, vocab_ids]
    print(B.shape)
    print(np.where(~B.any(axis=1)))

    if fileDest != '':
        np.save(fileDest, B)
        sys.exit()

    nmf = NMF(n_components=K, random_state=1, alpha=.1, l1_ratio=.5).fit(B)

    ## Add dynamic topics
    dtopics = []
    for k in range(K):
        dtopic = DynamicTopic(run_id=RunStats.objects.get(pk=run_id))
        dtopic.save()
        dtopics.append(dtopic)

    dtopic_ids = list(
        DynamicTopic.objects.filter(run_id=run_id).values_list('id', flat=True))

    print(dtopic_ids)

    ##################
    ## Add the dtopic*term matrix to the db
    print("Adding topicterms to db")
    t0 = time()
    ldalambda = find(csr_matrix(nmf.components_))
    topics = range(len(ldalambda[0]))
    tts = []
    pool = Pool(processes=8)
    tts.append(
        pool.map(
            partial(f_dlambda, m=ldalambda, v_ids=vocab_ids,
                    t_ids=dtopic_ids, run_id=run_id),
            topics))
    pool.terminate()

    tts = flatten(tts)
    gc.collect()
    sys.stdout.flush()
    django.db.connections.close_all()
    DynamicTopicTerm.objects.bulk_create(tts)
    print("done in %0.3fs." % (time() - t0))

    ## Add the wtopic*dtopic matrix to the database
    gamma = nmf.transform(B)
    for topic in range(len(gamma)):
        for dtopic in range(len(gamma[topic])):
            if gamma[topic][dtopic] > 0:
                tdt = TopicDTopic(topic=tops[topic],
                                  dynamictopic_id=dtopic_ids[dtopic],
                                  score=gamma[topic][dtopic])
                tdt.save()

    ## Calculate the primary dtopic for each topic
    for t in tops:
        try:
            t.primary_dtopic = TopicDTopic.objects.filter(
                topic=t).order_by('-score').first().dynamictopic
            t.save()
        except:
            pass

    stat.error = parent_stat.error + nmf.reconstruction_err_
    stat.errortype = "Frobenius"
    stat.last_update = timezone.now()
    stat.save()
    print("updating and summarising run, {}".format(run_id))
    management.call_command('update_run', run_id)
    management.call_command('update_run', run_id)
def plot_optimal_k(docs, document_term_mat, vectorizer, kmin=3, kmax=15,
                   num_top_terms=15, alpha=.1, l1_ratio=.5, dim_size=500,
                   min_df=20, max_vocab_size=5000, model_file_path='./data/',
                   model_file_name='w2v-model.bin'):
    '''
    Run NMF for each k between min and max and plot to assess optimal k.

    Input
        docs - corpus of documents as a list
        document_term_mat - TFIDF matrix from the vectorizer
        vectorizer - scikit-learn TFIDF vectorizer (trained in TopicModeller)
    Returns:
        Int - optimal k number
    '''
    topic_models = []

    # Run NMF for each value of k
    for k in range(kmin, kmax + 1):
        t1 = time.time()
        # Run NMF
        model = NMF(n_components=k, init='nndsvd', alpha=alpha,
                    l1_ratio=l1_ratio)
        W = model.fit_transform(document_term_mat)
        H = model.components_
        # Store for iterating over all the models (of each k size)
        topic_models.append((k, W, H))
        print("Processed NMF for k=%d of %d - Time: %0.3fs."
              % (k, kmax, (time.time() - t1)), end='\r', flush=True)
    print()

    # If the model is already built get it from disk, otherwise
    # build a Skipgram Word2Vec model from all documents
    # in the input file using Gensim:
    model_path = model_file_path + model_file_name
    if not os.path.exists(model_file_path):
        os.makedirs(model_file_path)
    w2v_model = None
    try:
        w2v_model = gensim.models.Word2Vec.load(model_path)
    except Exception as e:
        print('No existing word2vec model found to load. Exception: %s.\n'
              'Building it...' % (e))
        # w2v_model = None - uncomment to force rebuild every time
    if w2v_model:
        print('Existing word2vec Model loaded from \'%s\'' % model_path)
    else:
        docgen = nlp_utils.TokenGenerator(docs)
        # Process w2v with model of n dimensions and min doc-term freq as min_df
        t1 = time.time()
        w2v_model = gensim.models.Word2Vec(docgen, sg=1, size=dim_size,
                                           max_vocab_size=max_vocab_size,
                                           min_count=min_df)
        print("- Time: %0.3fs." % (time.time() - t1))
        # Save for later use, so that we do not need to rebuild it:
        print('Saving it...')
        w2v_model.save(model_path)
    print(('word2vec model has %d terms' % len(w2v_model.wv.vocab)))

    # Implement TC-W2V coherence score measure
    def calculate_coherence(w2v_model, term_rankings):
        overall_coherence = 0.0
        for topic_index in range(len(term_rankings)):
            # check each pair of terms
            pair_scores = []
            # print 'Topic %s: %s top words: %s' % (topic_index,
            #                                       len(term_rankings[topic_index]),
            #                                       term_rankings[topic_index])
            for pair in combinations(term_rankings[topic_index], 2):
                pair_scores.append(w2v_model.similarity(pair[0], pair[1]))
            # get the mean for all pairs in this topic
            topic_score = sum(pair_scores) / len(pair_scores)
            overall_coherence += topic_score
        # get the mean score across all topics
        return overall_coherence / len(term_rankings)

    # Function to get the topic descriptor (i.e. list of top terms) for each topic:
    def get_descriptor(all_terms, H, topic_index, num_top_terms):
        # reverse sort the values to sort the indices
        top_indices = np.argsort(H[topic_index, :])[::-1]
        # now get the terms corresponding to the top-ranked indices
        top_terms = []
        for term_index in top_indices[0:num_top_terms]:
            top_terms.append(all_terms[term_index])
        return top_terms

    # Process each of the models for different values of k:
    vocab = vectorizer.get_feature_names()
    # vocab = w2v_model.wv.vocab

    k_values = []
    coherences = []
    print('Calculating coherence scores...')
    for (k, W, H) in topic_models:
        # Get all topic descriptors - the term_rankings, based on top n terms
        term_rankings = []
        for topic_index in range(k):
            # term_rankings.append(get_descriptor(vocab, H, topic_index, num_top_terms))
            top_words = [vocab[i]
                         for i in H[topic_index, :].argsort()[:-num_top_terms - 1:-1]]
            top_words = [x for x in top_words if x in w2v_model.wv.vocab]
            term_rankings.append(top_words)
        # Calculate the coherence based on our Word2vec model
        k_values.append(k)
        coherences.append(calculate_coherence(w2v_model, term_rankings))
        # print(('K=%02d: Coherence=%.4f' % (k, coherences[-1])))

    # Plot a line of coherence scores to identify an appropriate k value.
    plt.style.use("ggplot")
    matplotlib.rcParams.update({"font.size": 14})
    fig = plt.figure(figsize=(13, 7))
    # Create the line plot
    ax = plt.plot(k_values, coherences)
    plt.xticks(k_values)
    plt.xlabel("Number of Topics")
    plt.ylabel("Mean Coherence")
    # Add the points
    plt.scatter(k_values, coherences, s=120)
    # Find and annotate the maximum point on the plot
    ymax = max(coherences)
    xpos = coherences.index(ymax)
    best_k = k_values[xpos]
    plt.annotate('k=%d' % best_k, xy=(best_k, ymax), xytext=(best_k, ymax),
                 textcoords="offset points", fontsize=16)
    print('Optimal number of k topics: %s' % best_k)
    # Show the plot
    plt.show()

    k = best_k
    # Get the model that we generated earlier.
    W = topic_models[k - kmin][1]
    H = topic_models[k - kmin][2]

    # Display the topics and descriptor words for the best k model
    for topic_index in range(k):
        descriptor = get_descriptor(vectorizer.get_feature_names(), H,
                                    topic_index, num_top_terms)
        str_descriptor = ", ".join(descriptor)
        print(("Topic %02d: %s" % (topic_index, str_descriptor)))

    return int(k)
def topics(df, model="lda", stopwords=None): """ Either executes LDA or NMF on a dutch document. This is a simple implementation and only used for "fun" purposes. It is not so much to find the very best topics, but topics that are good enough. Parameters: ----------- df : pandas dataframe Pandas dataframe that contains the raw messages mode : str, default "lda" Which model to use for topic modelling. Either "lda" or "nmf" works for now stopwords : str, default None If you want to remove stopwords, provide a local link to the text file (that includes a list of words) including the extension. """ # Prepare stopwords if stopwords: with open(stopwords) as stopwords_list: stopwords_list = stopwords_list.readlines() stopwords_list = [word[:-1] for word in stopwords_list] else: stopwords_list = [] # Create Topics for user in df.User.unique(): print("#" * len(user) + "########") print("### " + user + " ###") print("#" * len(user) + "########\n") data_samples = df[df.User == user].Message_Prepared data_samples = data_samples.tolist() if model == "lda": # Extracting Features tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=stopwords_list) tf = tf_vectorizer.fit_transform(data_samples) # Fitting LDA topic_model = LatentDirichletAllocation(n_components=5, max_iter=5, learning_method='online', learning_offset=50., random_state=0) topic_model.fit(tf) feature_names = tf_vectorizer.get_feature_names() else: # MNF uses tfidf tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words=stopwords_list) tfidf = tfidf_vectorizer.fit_transform(data_samples) feature_names = tfidf_vectorizer.get_feature_names() # Run NMF topic_model = NMF(n_components=5, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd') topic_model.fit(tfidf) print("\nTopics in {} model:".format(model)) print_top_words(topic_model, feature_names, 7)
def gen_decomposition_stats_vector_ftr51(stats_name, size='7d', non_zero=False,
                                         decomp_method='lda', n_components=5):
    """
    :param stats_name: str, name of the statistic computed over drug quantities
    :param size: str, time granularity of the statistic: 1d, 4d, 7d, 15d, 30d, 45d
    :param non_zero: bool, whether the statistic counts non-zero entries
    :param decomp_method: str, decomposition method
    :param n_components: int, dimensionality after decomposition
    :return:
    """
    assert decomp_method in ['svd', 'nmf', 'lda']
    mask = (stats_name in ['sum', 'max', 'sum_ratio', 'max_ratio']) & non_zero
    assert not mask
    matrix_name = '{}_vector_ftr51_by_{}_{}'.format(stats_name, size, non_zero)

    # 0 load the data
    ftr51_stats_sparse_matrix = sparse.load_npz(
        get_path() + 'Data/Feature/{}.npz'.format(matrix_name)).toarray()

    if decomp_method == 'svd':
        print(' svd decomposition...')
        svd = TruncatedSVD(n_components=n_components, n_iter=50, random_state=42)
        ftr51_stats_matrix_decomp = svd.fit_transform(ftr51_stats_sparse_matrix)
    if decomp_method == 'nmf':
        print(' nmf decomposition...')
        nmf = NMF(n_components=n_components, init='random', random_state=0,
                  max_iter=200)
        ftr51_stats_matrix_decomp = nmf.fit_transform(ftr51_stats_sparse_matrix)
    if decomp_method == 'lda':
        print(' lda decomposition...')
        lda = LatentDirichletAllocation(n_components=n_components, max_iter=50,
                                        learning_method='online',
                                        learning_offset=50., random_state=0,
                                        n_jobs=1)
        ftr51_stats_matrix_decomp = lda.fit_transform(ftr51_stats_sparse_matrix)
        joblib.dump(lda, "lda_{}_{}.m".format(stats_name, size))

    columns = [
        '{}_{}_vector_by_{}_{}_{}_{}'.format(decomp_method, stats_name, size,
                                             non_zero, n_components, j)
        for j in range(ftr51_stats_matrix_decomp.shape[1])
    ]
    stats_df = pd.DataFrame(data=ftr51_stats_matrix_decomp, columns=columns)
    train = stats_df[:15000].reset_index(drop=True)
    test = stats_df[15000:].reset_index(drop=True)
    for feature in columns:
        SaveFeature(train, test, feature)
    return columns, 'gen_decomposition_stats_vector_ftr51("{}", "{}", {}, "{}", {})'.format(
        stats_name, size, non_zero, decomp_method, n_components)
# 1 Define the pipeline ------------------------------------------------------------------------------

# Pipeline definition
# --- dimensionality reduction
# --- SVM classifier
pipe = Pipeline([
    ('reduce_dim', PCA()),
    ('classify', SVC())
])

# Parameter settings
params_grid = [
    {
        'reduce_dim': [PCA(), NMF(), Isomap(), TruncatedSVD()],
        'reduce_dim__n_components': [2, 3],
        'classify': [SVC(), LinearSVC()],
        'classify__C': [1, 10, 100, 1000]
    }
]

# Check
print(params_grid)


# 2 Run the parameter tuning -----------------------------------------------------------------------
# <Point>
# - Use grid search to tune the hyperparameters
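# A hedged sketch of the tuning step announced above (X_train and y_train are assumed
# to exist and to be non-negative, since NMF cannot handle negative inputs):
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(pipe, param_grid=params_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)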
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("tf feature extraction done in %0.3fs." % (time() - t0))
print()

# Fit the NMF model (Frobenius norm)
print("Training the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..." % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("Training done in %0.3fs." % (time() - t0))

print("\nTopics in the NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

# Fit the NMF model (generalized Kullback-Leibler divergence)
print("Training the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000,
# plot the mean cross-validation scores
mglearn.tools.heatmap(scores, xlabel='svm__C',
                      xticklabels=param_grid['svm__C'],
                      ylabel='svm__gamma',
                      yticklabels=param_grid['svm__gamma'],
                      cmap="viridis")

"""-----------------------------------------------------------------------------"""
"""==========================================================================================="""
"""-----------------------------------------------------------------------------"""

"""NMF pre-processing with SVC algorithm """
## Pipelines in Grid Searches
pipe = Pipeline([("scaler", NMF()), ("svm", SVC())])

param_grid = {'scaler__n_components': [5],
              'svm__C': [0.00001, 0.1],
              'svm__gamma': [0.00001, 0.1]}

grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)
pred = grid.predict(X_test)

print("NMF pre-processing with SVC algorithm")
print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_))
print("Test set accuracy: {:.2f}".format(grid.score(X_test, y_test)))
print("f1 score: {:.2f}".format(f1_score(y_test, pred)))
print("Best parameters: {}".format(grid.best_params_))
print(classification_report(y_test, pred, target_names=["mol", "no_mol"]))

scores = grid.cv_results_['mean_test_score'].reshape(2, 2)

# plot the mean cross-validation scores
mglearn.tools.heatmap(scores, xlabel='svm__C',
                             stop_words='english', lowercase=True,
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(wines["processed_description"])

NUM_TOPICS = 10

# Latent Dirichlet Allocation Model
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10,
                                learning_method='online', verbose=True)
data_lda = lda.fit_transform(data_vectorized)

# Non-Negative Matrix Factorization Model
nmf = NMF(n_components=NUM_TOPICS)
data_nmf = nmf.fit_transform(data_vectorized)

# Latent Semantic Indexing Model using Truncated SVD
lsi = TruncatedSVD(n_components=NUM_TOPICS)
data_lsi = lsi.fit_transform(data_vectorized)

# Function for printing keywords for each topic
def selected_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
               for i in topic.argsort()[:-top_n - 1:-1]])
model.add(e)
model.add(Flatten())
model.add(Dense(10173, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(13, activation='softmax'))
model.compile(optimizer='Adadelta', loss='categorical_crossentropy',
              metrics=['acc'])
history = model.fit(tfidf, y_label, epochs=20, verbose=1, validation_split=0.3)

# Run NMF
from sklearn.decomposition import NMF, LatentDirichletAllocation

no_topics = 13

nmf = NMF(n_components=no_topics, init='nndsvd').fit(tfidf)
W = nmf.fit_transform(tfidf)
H = nmf.components_

# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, learning_method='online',
                                learning_offset=50.).fit(tf)
W_lda = lda.fit_transform(tf)
H_lda = lda.components_


def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
pipe = Pipeline([('reduce_dim', PCA()), ('classify', LinearSVC())])

N_EXPERIMENTS = 5
N_FEATURES_OPTIONS = [4]
C_OPTIONS = [1, 10, 100, 1000]
reducer_labels = ['PCA', 'NMF', 'KBest(chi2)']

non_nested_scores = np.zeros(N_EXPERIMENTS)
nested_scores = np.zeros(N_EXPERIMENTS)

############################################################

param_grid = [
    {
        'reduce_dim': [PCA(iterated_power=7), NMF()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'classify__C': C_OPTIONS
    },
    {
        'reduce_dim': [SelectKBest(chi2)],
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'classify__C': C_OPTIONS
    },
]

print('Grid Search experiments... ')
start = time()
for ith_exp in range(N_EXPERIMENTS):
    # CV technique
import numpy as np
from sklearn.decomposition import NMF, TruncatedSVD, ProjectedGradientNMF

model = NMF(n_components=2, alpha=0.01)

# Store AD
ad_ID_dict = {}
# ad_list = []
# ad_list = list(ad_list)

# Assign ID number
ad_ID = 0
user_ID = 0
max_feature = 0

# ad_ID for ad_nmu
adID_for_num = {}
with open('ad_ID.dat') as file:
    for line in file:
        data = line.strip('\n').split(' ')
        # print(data)
        adID_for_num[int(data[1])] = int(data[0])
file.close()
# tf-idf
for max_fq in df_gradients:
    tweetImport = codecs.open(importfilename, 'r', 'utf-8')

    # NMF can use tf-idf
    # lowercase=False
    tfidf_vectorizer = TfidfVectorizer(strip_accents='ascii',
                                       ngram_range=(ngram_min, ngram_max),
                                       max_df=max_fq, min_df=1,
                                       max_features=num_features,
                                       stop_words=stop_words, analyzer='word',
                                       token_pattern='[a-zA-Z]+')
    tfidf_matrix = tfidf_vectorizer.fit_transform(tweetImport)
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    stop_words.extend(tfidf_feature_names)
    tweetImport.close()

    # save the terms ranked by tfidf scores into a list, to be used for wordcloud plotting
    version = 3
    saveTerms_sortedTFIDFscores(outputPath, max_fq, num_features, version,
                                tfidf_feature_names, tfidf_matrix)

    # Run NMF (results not as good as LDA)
    nmf = NMF(n_components=num_topics, random_state=1, alpha=0,
              init='random').fit(tfidf_matrix)
    display_topics(nmf, tfidf_feature_names, num_top_words)

# plot all wordclouds in one figure
# wordcloud_in_one_figure(outputPath, num_features, df_gradients)

# plot individual wordclouds:
plt.rcParams['figure.figsize'] = (10.0, 7.0)
for max_fq in df_gradients:
    tfidffilename = (outputPath + 'tweet_keyword_tradewar_tfidf_features_'
                     + str(max_fq) + '_' + str(num_features) + '_v3.csv')
    tfidffile = open(tfidffilename, 'r')
    word_text = tfidffile.read()
    wordcloud = WordCloud(colormap='hsv', max_words=1000, width=3000,
                          height=2000, margin=3,
                          collocations=False).generate(word_text)
image_shape = people.images[0].shape
mask = np.zeros(people.target.shape, dtype=np.bool)
for target in np.unique(people.target):
    mask[np.where(people.target == target)[0][:50]] = 1

X_people = people.data[mask]
y_people = people.target[mask]
X_people = X_people / 255.

X_train, X_test, y_train, y_test = train_test_split(
    X_people, y_people, stratify=y_people, random_state=0)

mglearn.plots.plot_nmf_illustration()
mglearn.plots.plot_nmf_faces(X_train, X_test, image_shape)

from sklearn.decomposition import NMF
nmf = NMF(n_components=15, random_state=0)
nmf.fit(X_train)
X_train_nmf = nmf.transform(X_train)
X_test_nmf = nmf.transform(X_test)

fix, axes = plt.subplots(3, 5, figsize=(15, 12),
                         subplot_kw={'xticks': (), 'yticks': ()})
for i, (component, ax) in enumerate(zip(nmf.components_, axes.ravel())):
    ax.imshow(component.reshape(image_shape))
    ax.set_title("{}. component".format(i))

# display the data that has large weighting for comp
compn = 11
inds = np.argsort(X_train_nmf[:, compn])[::-1]
fix, axes = plt.subplots(2, 5, figsize=(15, 8),
                         subplot_kw={'xticks': (), 'yticks': ()})
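# A hedged continuation sketch (an assumption about what follows the truncated snippet
# above): show the training faces that weight most heavily on component compn.
for i, (ind, ax) in enumerate(zip(inds, axes.ravel())):
    ax.imshow(X_train[ind].reshape(image_shape))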
start_time = time.time()

# vectorize documents by using the tfidf vectorizer
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenization,
                                   max_features=n_features, max_df=0.9,
                                   min_df=2)
docs_tfidf = tfidf_vectorizer.fit_transform(doc_set)
termid_word_list = tfidf_vectorizer.get_feature_names()  # word = termid_word_list[indx]

print("Fitting the NMF model...")
# solver: coordinate descent; regularization: alpha = 0.1;
# l1_ratio 0: L2 regularization, NO L1 regularization
nmf_model = NMF(n_components=n_factors, random_state=1, solver='cd', alpha=.1,
                l1_ratio=.0)

# generate latent factors for documents based on the NMF model
docs_lf = nmf_model.fit_transform(docs_tfidf)

for qIndex in range(0, len(queryID_list)):
    # for qIndex in range(0, 2):
    print(str(qIndex) + "/" + str(len(queryID_list)))
    query_str = queries_dict[queryID_list[qIndex]]
    query = [query_str]

    # generate tfidf vector for the query
    query_tfidf = tfidf_vectorizer.transform(query)

    # generate latent factor for the query based on the NMF model
    query_lf = nmf_model.transform(query_tfidf)
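    # A hedged sketch of the retrieval step that would naturally follow (assumed, not
    # part of the original loop): rank documents by cosine similarity between the
    # query's latent factors and each document's latent factors.
    from sklearn.metrics.pairwise import cosine_similarity
    sims = cosine_similarity(query_lf, docs_lf)[0]
    ranked_doc_indices = sims.argsort()[::-1]
    print("Top 10 documents for query %s:" % queryID_list[qIndex],
          ranked_doc_indices[:10])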
def test_nmf_fit_close(solver):
    rng = np.random.mtrand.RandomState(42)
    # Test that the fit is not too far away
    pnmf = NMF(5, solver=solver, init='nndsvdar', random_state=0, max_iter=600)
    X = np.abs(rng.randn(6, 5))
    assert pnmf.fit(X).reconstruction_err_ < 0.1
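# A hedged note (an assumption, not shown in the snippet): the solver argument above
# suggests the test is driven by pytest parametrization over NMF's solvers, e.g.:
import pytest

@pytest.mark.parametrize('solver', ('cd', 'mu'))
def test_nmf_fit_close_param(solver):
    test_nmf_fit_close(solver)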
def update_nmf_graph1(no_topics, nmf_components_value, nmf_alpha_value,
                      nmf_l1ratio_value, min_df_value, max_df_value,
                      ngram_range_value, num_clicks):
    if num_clicks > 0:
        # Getting the filenames
        matrix_filename = 'temp_data/' + temporary_key + '_output_matrix.csv'
        processed_docs_filename = 'temp_data/' + temporary_key + '_processed_docs.csv'
        features_list_filename = 'temp_data/' + temporary_key + '_features_list.csv'
        tfidf_fit_filename = 'temp_data/' + temporary_key + '_vectorizer_model.pickle'

        print('loading nmf input objects')

        # Read in tfidf
        dense_tfidf_matrix = pd.read_csv(matrix_filename)
        print('The shape of the tfidf_matrix is: {}.'.format(dense_tfidf_matrix.shape))

        # Reading in the processed documents
        processed_docs = pd.read_csv(processed_docs_filename, encoding='latin1')
        processed_docs = processed_docs['processed_doc'].tolist()
        print(processed_docs[0])

        features_df = pd.read_csv(features_list_filename)
        features_list = features_df['feature_list'].tolist()
        print('The first five token features are: {}.'.format(features_list[:5]))

        sparse_tfidf_matrix = scipy.sparse.csr_matrix(dense_tfidf_matrix.values)
        # print(sparse_tfidf_matrix)
        print('the sparse tfidf matrix is loaded')

        # Defining the NMF object
        nmf = NMF(n_components=no_topics, random_state=42, alpha=0.1, l1_ratio=.2,
                  max_iter=500, verbose=False, shuffle=True, init='nndsvd',
                  solver='cd')

        print('Computing the NMF for the sparse tfidf matrix')
        nmf_model = nmf.fit(sparse_tfidf_matrix)
        print(nmf_model)

        #--------------------------------------------------------------------------------------------------
        #--------------------------------------------------------------------------------------------------
        def generate_topic_table(model, feature_names, n_top_words):
            topics = {}
            for topic_idx, topic in enumerate(model.components_):
                t = ("topic_%d" % topic_idx)
                topics[t] = [feature_names[i] for i in top_words(topic, n_top_words)]
            out_df = pd.DataFrame(topics)
            out_df = out_df[list(topics.keys())]
            return out_df
        #--------------------------------------------------------------------------------------------------
        #--------------------------------------------------------------------------------------------------

        print(processed_docs[0])
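# top_words is called inside generate_topic_table above but is not defined in that
# snippet; a minimal sketch of such a helper (an assumption) that returns the indices
# of the n highest-weighted terms for a topic:
def top_words(topic, n_top_words):
    return topic.argsort()[:-n_top_words - 1:-1]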
def extract_components(mov_tot, n_components=6, normalize_std=True,
                       max_iter_DL=-30, method_factorization='nmf', **kwargs):
    """
    From optical flow images, extract spatial and temporal components.

    Parameters:
    ----------
    mov_tot: ndarray (can be 3 or 4D)
        contains the optical flow values, either in cartesian or polar,
        either one (3D) or both (4D) coordinates;
        the input is generated by the compute_optical_flow function

    n_components: int
        number of components to look for

    normalize_std: bool
        whether to normalize each of the optical flow components

    normalize_output_traces: boolean
        whether to normalize the behavioral traces so that they match
        the units in the movie

    Returns:
    -------
    spatial_filter: ndarray
        set of spatial inferred filters

    time_trace: ndarray
        set of time components

    norm_fact: ndarray
        normalization factors used
    """
    if mov_tot.ndim == 4:
        if normalize_std:
            norm_fact = np.nanstd(mov_tot, axis=(1, 2, 3))
            mov_tot = old_div(mov_tot,
                              norm_fact[:, np.newaxis, np.newaxis, np.newaxis])
        else:
            norm_fact = np.array([1., 1.])
        c, T, d1, d2 = np.shape(mov_tot)
    else:
        norm_fact = 1
        T, d1, d2 = np.shape(mov_tot)
        c = 1

    tt = time.time()
    newm = np.reshape(mov_tot, (c * T, d1 * d2))

    if method_factorization == 'nmf':
        nmf = NMF(n_components=n_components, **kwargs)
        time_trace = nmf.fit_transform(newm)
        spatial_filter = nmf.components_
        spatial_filter = np.concatenate([
            np.reshape(sp, (d1, d2))[np.newaxis, :, :] for sp in spatial_filter
        ], axis=0)

    elif method_factorization == 'dict_learn':
        import spams
        newm = np.asfortranarray(newm, dtype=np.float32)
        time_trace = spams.trainDL(newm, K=n_components, mode=0, lambda1=1,
                                   posAlpha=True, iter=max_iter_DL)
        spatial_filter = spams.lasso(newm, D=time_trace, return_reg_path=False,
                                     lambda1=0.01,
                                     mode=spams.spams_wrap.PENALTY, pos=True)
        spatial_filter = np.concatenate([
            np.reshape(sp, (d1, d2))[np.newaxis, :, :]
            for sp in spatial_filter.toarray()
        ], axis=0)
        time_trace = [np.reshape(ttr, (c, T)).T for ttr in time_trace.T]

    el_t = time.time() - tt
    print(el_t)

    return spatial_filter, time_trace, norm_fact
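# A hedged usage sketch (synthetic data, assumed shapes): apply extract_components to a
# small 3D stack of non-negative "optical flow" frames and inspect the output shapes.
import numpy as np

dummy_flow = np.abs(np.random.rand(100, 16, 16))  # T=100 frames of 16x16 values
spatial_filter, time_trace, norm_fact = extract_components(dummy_flow,
                                                           n_components=3,
                                                           method_factorization='nmf')
print(spatial_filter.shape)  # (3, 16, 16)
print(time_trace.shape)      # (100, 3)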
# Again, we add the vectorizer restrictions found previously, plus others observed
# during the LDA run that do not help to define a type of corporate purpose
stop_w = ['de', 'la', 'a', 'el', 'que', 'en', 'los', 'las', 'con', 'al', 'sus', 'del',
          'por', 'como', 'para', 'toda', 'todo', 'servicios', 'cualquier', 'otros',
          'general', 'tipo', 'tipos', 'actividades', 'ya', 'similares', 'objeto', 'no',
          'actividad', 'otra', 'terceros', 'cuenta', 'propia', 'bienes', 'clase',
          'ajena', 'act', 'propios', 'sociedad', 'sociedades', 'socios', 'su', 'sea',
          'relacionadas', 'otras', 'relacionados', 'especializado', 'especializados',
          'nuevos', 'empleadores']

tfidf = TfidfVectorizer(max_df=0.9, min_df=2, stop_words=stop_w)
mtx = tfidf.fit_transform(lines)

# Now we import the NMF (Non-negative Matrix Factorization) class
from sklearn.decomposition import NMF

# As with the LDA method, we will try to define 15 purpose types
k = 15
nmf_model = NMF(n_components=k, random_state=7)
nmf_model.fit(mtx)

# Look at the 10 most-used words for each purpose type
for i, tema in enumerate(nmf_model.components_):
    print(f"Topic {i}:")
    print([tfidf.get_feature_names()[index] for index in tema.argsort()[-10:]])
    print("\n")

# Assign the purpose types to each entry
import pandas as pd

df = pd.DataFrame()
temas_resultantes = nmf_model.transform(mtx)
df['Texto'] = lines
df['Grupo'] = temas_resultantes.argmax(axis=1)
df.head()