vectorizer = CountVectorizer(min_df=5,
                             max_df=0.9,
                             stop_words='english',
                             lowercase=True,
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data)

# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS,
                                      max_iter=10,
                                      learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
#print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(data_vectorized)
#print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(data_vectorized)
#print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Let's see what the first document in the corpus looks like in the different topic spaces
#print(lda_Z[0])
#print(nmf_Z[0])
#print(lsi_Z[0])


def print_topics(model, vectorizer, top_n=10):
    # Print the top-n weighted terms for each topic (mirrors selected_topics() later in this file)
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
               for i in topic.argsort()[:-top_n - 1:-1]])
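
# A minimal usage sketch, assuming the three models and vectorizer fitted above:
print("LDA Model:")
print_topics(lda_model, vectorizer)
print("NMF Model:")
print_topics(nmf_model, vectorizer)
print("LSI Model:")
print_topics(lsi_model, vectorizer)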
Example no. 2
# R is an m x n rating matrix: m = items (movie rows), n = users (columns)
RATE_MATRIX = np.array(
    [[5, 5, 3, 0, 5, 5, 4, 3, 2, 1, 4, 1, 3, 4, 5],
     [5, 0, 4, 0, 4, 4, 3, 2, 1, 2, 4, 4, 3, 4, 0],
     [0, 3, 0, 5, 4, 5, 0, 4, 4, 5, 3, 0, 0, 0, 0],
     [5, 4, 3, 3, 5, 5, 0, 1, 1, 3, 4, 5, 0, 2, 4],
     [5, 4, 3, 3, 5, 5, 3, 3, 3, 4, 5, 0, 5, 2, 4],
     [5, 4, 2, 2, 0, 5, 3, 3, 3, 4, 4, 4, 5, 2, 5],
     [5, 4, 3, 3, 2, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0],
     [5, 4, 3, 3, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
     [5, 4, 3, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2],
     [5, 4, 3, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]]
)

nmf_model = NMF(n_components=2)  # assume 2 latent topics
item_dis = nmf_model.fit_transform(RATE_MATRIX)
user_dis = nmf_model.components_

print('User topic distribution: ' + str(user_dis.shape))
print(user_dis)
print('Movie (item) topic distribution: ' + str(item_dis.shape))
print(item_dis)

plt1 = plt
plt1.plot(item_dis[:, 0], item_dis[:, 1], 'ro')
plt1.xlim((-1, 3))
plt1.ylim((-1, 3))
plt1.title(u'Item Distribution')  # set the plot title
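
# A minimal follow-up sketch: W (item_dis) times H (user_dis) approximates
# RATE_MATRIX, so their product can be read as predicted ratings, including
# for the 0 ("unrated") cells.
predicted_ratings = np.dot(item_dis, user_dis)
print('Reconstruction error: %.4f' % np.linalg.norm(RATE_MATRIX - predicted_ratings))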

count = 1
Example no. 3
def test_n_components_greater_n_features():
    # Smoke test for the case of more components than features.
    rng = np.random.mtrand.RandomState(42)
    A = np.abs(rng.randn(30, 10))
    NMF(n_components=15, random_state=0, tol=1e-2).fit(A)

    # A separate test method from a unittest-style test class (note the `self` argument):
    def test_custom_nmf(self):

        mat = np.array([[1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0],
                        [1, 0, 0, 0], [1, 0, 0, 0]], dtype=np.float64)
        mat[:mat.shape[1], :] += np.identity(mat.shape[1])

        mod = NMF(n_components=2)
        W = mod.fit_transform(mat)
        H = mod.components_

        def predict(W, H, row_index, col_index):
            return np.dot(W[row_index, :], H[:, col_index])

        pred = mod.inverse_transform(W)

        exp = []
        got = []
        for i in range(mat.shape[0]):
            for j in range(mat.shape[1]):
                exp.append((i, j, pred[i, j]))
                got.append((i, j, predict(W, H, i, j)))

        max_diff = max(abs(e[2] - o[2]) for e, o in zip(exp, got))
        assert max_diff <= 1e-5

        def nmf_to_onnx(W, H):
            """
            The function converts a NMF described by matrices
            *W*, *H* (*WH* approximate training data *M*).
            into a function which takes two indices *(i, j)*
            and returns the predictions for it. It assumes
            these indices applies on the training data.
            """
            col = OnnxArrayFeatureExtractor(H, 'col')
            row = OnnxArrayFeatureExtractor(W.T, 'row')
            dot = OnnxMul(col, row, op_version=TARGET_OPSET)
            res = OnnxReduceSum(dot, output_names="rec",
                                op_version=TARGET_OPSET)
            indices_type = np.array([0], dtype=np.int64)
            onx = res.to_onnx(inputs={'col': indices_type,
                                      'row': indices_type},
                              outputs=[('rec', FloatTensorType((None, 1)))])
            return onx

        model_onnx = nmf_to_onnx(W, H)
        sess = InferenceSession(model_onnx.SerializeToString())

        def predict_onnx(sess, row_indices, col_indices):
            res = sess.run(None,
                           {'col': col_indices,
                            'row': row_indices})
            return res

        onnx_preds = []
        for i in range(mat.shape[0]):
            for j in range(mat.shape[1]):
                row_indices = np.array([i], dtype=np.int64)
                col_indices = np.array([j], dtype=np.int64)
                pred = predict_onnx(sess, row_indices, col_indices)[0]
                onnx_preds.append((i, j, pred[0, 0]))

        max_diff = max(abs(e[2] - o[2]) for e, o in zip(exp, onnx_preds))
        assert max_diff <= 1e-5
Example no. 5
    start_time = time.time()
    U_50, sigma_50, Vt_50 = svds(demeaned_input, k=50)
    sigma_50 = np.diag(sigma_50)
    svd_50_prediction = np.dot(np.dot(U_50, sigma_50), Vt_50) + user_mean
    end_time = time.time()

    svd_50_HR10 = test.hit_rate(svd_50_prediction[len(train_data):], last_item,
                                10)
    svd_50_HR25 = test.hit_rate(svd_50_prediction[len(train_data):], last_item,
                                25)
    svd_50_arhr = test.arhr(svd_50_prediction[len(train_data):], last_item)
    svd_50_time = end_time - start_time

    # NMF
    start_time = time.time()
    nmf = NMF(2)
    W = nmf.fit_transform(entire_data)
    H = nmf.components_
    nmf_prediction = np.dot(W, H)
    end_time = time.time()

    nmf_HR10 = test.hit_rate(nmf_prediction[len(train_data):], last_item, 10)
    nmf_HR25 = test.hit_rate(nmf_prediction[len(train_data):], last_item, 25)
    nmf_arhr = test.arhr(nmf_prediction[len(train_data):], last_item)
    nmf_time = end_time - start_time

    # print tabulated result
    table = tabulate(
        [[
            'HR10', dhrbm_HR10, itempop_HR10, itempop_cluster_HR10,
            svd_10_HR10, svd_50_HR10, nmf_HR10
Example no. 6
def log_stdvar_NMF_L2(X):
    X = log_stdvar(X)
    k = compute_pcs_needed_to_explain_variance(X,50)
    nmf = NMF(n_components=k)
    Xrd = nmf.fit_transform(X)
    return pairwise_distances(Xrd)
Example no. 7
print "Extracting tf-idf features for NMF..."
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(posts)

print "Extracting tf features for LDA..."
tf_vectorizer = CountVectorizer(max_df=0.95,
                                min_df=2,
                                max_features=n_features,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(posts)

# cell 3 - Using NMF to get top topics
print "Fitting the NMF model with tf-idf features," "n_samples=%d and n_features=%d..." % (
    n_samples, n_features)
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1,
          l1_ratio=.5).fit(tfidf)

print "\nTopics in NMF model:"
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

# cell 4 - Using LDA to get top topics
print "Fitting LDA models with tf features, n_samples=%d and n_features=%d..." % (
    n_samples, n_features)
lda = LatentDirichletAllocation(n_topics=n_topics,
                                max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(tf)
Example no. 8
E_symbol = np.asarray(E_symbol)
P_symbol = np.asarray(P_symbol)
E = pd.DataFrame(E)
PeakO = pd.DataFrame(PeakO)
E = quantileNormalize(E)
PeakO = quantileNormalize(PeakO)

print("Initializing non-negative matrix factorization for E...")
E[E > 10000] = 10000
X = np.log(1 + E)

err1 = np.zeros(rep)
for i in range(0, rep):
    model = NMF(n_components=K,
                init='random',
                random_state=i,
                solver='cd',
                max_iter=50)
    W20 = model.fit_transform(X)
    H20 = model.components_
    err1[i] = LA.norm(X - np.dot(W20, H20), ord='fro')

model = NMF(n_components=K,
            init='random',
            random_state=np.argmin(err1),
            solver='cd',
            max_iter=1000)
W20 = model.fit_transform(X)
H20 = model.components_
S20 = np.argmax(H20, 0)
Example no. 9
def ldatopicmodeling(sentencetuples, searchobject):
    """

	see:
		http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py

	CountVectorizer:
	max_df : float in range [0.0, 1.0] or int, default=1.0
		When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words).

	min_df : float in range [0.0, 1.0] or int, default=1
		When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature.

	see sample results at end of file

	:param sentencetuples:
	:param searchobject:
	:return:
	"""

    maxfeatures = 2000
    components = 15
    topwords = 15

    maxfreq = .60
    minfreq = 5
    iterations = 12

    mustbelongerthan = 2

    sentencetuples = [
        s for s in sentencetuples
        if len(s[1].strip().split(' ')) > mustbelongerthan
    ]
    sentences = [s[1] for s in sentencetuples]

    sentences = [s.split(' ') for s in sentences]
    allwordsinorder = [
        item for sublist in sentences for item in sublist if item
    ]

    morphdict = getrequiredmorphobjects(set(allwordsinorder))
    morphdict = convertmophdicttodict(morphdict)

    bagsofwords = buildwordbags(searchobject, morphdict, sentences)

    bagsofsentences = [' '.join(b) for b in bagsofwords]

    # Use tf (raw term count) features for LDA.
    ldavectorizer = CountVectorizer(max_df=maxfreq,
                                    min_df=minfreq,
                                    max_features=maxfeatures)

    ldavectorized = ldavectorizer.fit_transform(bagsofsentences)

    lda = LatentDirichletAllocation(n_components=components,
                                    max_iter=iterations,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)

    lda.fit(ldavectorized)

    print("\nTopics in LDA model:")
    tf_feature_names = ldavectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, topwords)

    # Use tf-idf features for NMF.
    tfidfvectorizer = TfidfVectorizer(max_df=0.95,
                                      min_df=2,
                                      max_features=maxfeatures)

    tfidf = tfidfvectorizer.fit_transform(bagsofsentences)

    # Fit the NMF model
    nmf = NMF(n_components=components, random_state=1, alpha=.1,
              l1_ratio=.5).fit(tfidf)

    print("\nTopics in NMF model (Frobenius norm):")
    tfidffeaturenames = tfidfvectorizer.get_feature_names()
    print_top_words(nmf, tfidffeaturenames, topwords)

    # Fit the NMF model
    print(
        "Fitting the NMF model (generalized Kullback-Leibler divergence) with "
        "tf-idf features, n_samples=%d and n_features=%d..." %
        (len(sentences), maxfeatures))

    nmf = NMF(n_components=components,
              random_state=1,
              beta_loss='kullback-leibler',
              solver='mu',
              max_iter=1000,
              alpha=.1,
              l1_ratio=.5).fit(tfidf)

    print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
    tfidffeaturenames = tfidfvectorizer.get_feature_names()
    print_top_words(nmf, tfidffeaturenames, topwords)

    return
Example no. 10
y = x_p[:, 1]

plt.figure()
plt.title('after the PCA method')
plt.scatter(x, y, c=label)
plt.xlabel('dimension 1')
plt.ylabel('dimension 2')

# <p style="color:green">Donc la couleur jaune représente les personnes mort </p>
# <i style="color:blue">On peut aussi utiliser la méthode NMF</i>

# In[31]:

from sklearn.decomposition import NMF

nmf = NMF(n_components=2)
x_n = nmf.fit(data).transform(data)
print(x_n)
x = x_n[:, 0]
y = x_n[:, 1]

plt.figure()
plt.title('after the NMF method')
plt.scatter(x, y, c=label)
plt.xlabel('dimension 1')
plt.ylabel('dimension 2')

# <h3 style="color:#8080C0">
# Next, we use a machine-learning method to predict the class: patients are either "died" or "discharged" from the hospital. You can use K-Nearest Neighbours (K-NN) classification, a decision tree, or a Bayes classifier.</h3>

# In[42]:
Example no. 11
    def lanchNMF(self):
        model = NMF(n_components=3, init='random', random_state=0)
        self.nmf_ = model.fit_transform(self.img)
Example no. 12
# Challenge 1
#%%
import numpy as np
np.set_printoptions(threshold=np.inf)
from sklearn.decomposition import NMF

M = [[4, 4, 2, 2, 3, 1, 1], [1, 5, 5, 2, 1, 4, 5], [1, 5, 1, 1, 4, 1, 4],
     [5, 4, 3, 1, 1, 1, 2], [1, 4, 4, 1, 1, 5, 5], [5, 5, 3, 5, 5, 1, 2],
     [1, 5, 3, 5, None, 5, 5]]

M1 = [[4, 4, 2, 2, 3, 1, 1], [1, 5, 5, 2, 1, 4, 5], [1, 5, 1, 1, 4, 1, 4],
      [5, 4, 3, 1, 1, 1, 2], [1, 4, 4, 1, 1, 5, 5], [5, 5, 3, 5, 5, 1, 2]]
M2 = [[4, 4, 2, 2, 1, 1], [1, 5, 5, 2, 4, 5], [1, 5, 1, 1, 1, 4],
      [5, 4, 3, 1, 1, 2], [1, 4, 4, 1, 5, 5], [5, 5, 3, 5, 1, 2],
      [1, 5, 3, 5, 5, 5]]
model1 = NMF(n_components=3)
model1.fit(M1)
W2 = model1.fit_transform(M2)
H2 = model1.components_
W1 = model1.fit_transform(M1)
H1 = model1.components_
print(np.matmul(W2, H1))
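
# A hedged reading of the challenge: M has one missing rating (the None at row
# index 6, column index 4). M1 drops that row and M2 drops that column, so the
# 7x7 product of W2 (factors fit on M2) and H1 (factors fit on M1) yields a
# value at [6, 4] that can be read as a prediction for the missing entry. The
# two factorizations come from separate fits and do not share a latent basis,
# so this is only a rough approximation.
print(np.matmul(W2, H1)[6, 4])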
# Challenge 2
#%%

# Lloyd’s algorithm
import random
import matplotlib.pyplot as plt


class lloyds(object):
Example no. 13
    def handle(self, *args, **options):
        parent_run_id = options['run_id']
        K = options['K']
        nWords = 50  #options['nWords']
        fileDest = ""  #options['fileDest']

        parent_stat = RunStats.objects.get(pk=parent_run_id)

        n_features = parent_stat.max_features

        if fileDest == '':

            run_id = init(n_features)
            stat = RunStats.objects.get(run_id=run_id)
            stat.query = Query.objects.get(pk=parent_stat.query.id)
            stat.method = "DT"
            stat.parent_run_id = parent_run_id
            stat.save()

        for tp in parent_stat.periods.all():
            stat.periods.add(tp)

        tops = Topic.objects.filter(run_id=parent_run_id,
                                    topicterm__isnull=False).distinct()
        terms = Term.objects.all()

        B = np.zeros((tops.count(), terms.count()))

        wt = 0
        for topic in tops:
            tts = TopicTerm.objects.filter(
                topic=topic).order_by('-score')[:nWords]
            if len(tts) == 0:
                if fileDest != '':
                    print(wt)
                    continue
                print(topic)
            for tt in tts:
                B[wt, tt.term.id] = tt.score * np.log1p(topic.score)
            wt += 1

        col_sum = np.sum(B, axis=0)
        vocab_ids = np.flatnonzero(col_sum)

        row_sum = np.sum(B, axis=1)
        top_ids = np.flatnonzero(row_sum)

        print(np.where(~B.any(axis=1)))

        # we only want the columns where there are at least some
        # topic-term values
        B = B[:, vocab_ids]

        print(B.shape)

        print(np.where(~B.any(axis=1)))

        if fileDest != '':
            np.save(fileDest, B)
            sys.exit()

        nmf = NMF(n_components=K, random_state=1, alpha=.1, l1_ratio=.5).fit(B)

        ## Add dynamic topics
        dtopics = []
        for k in range(K):
            dtopic = DynamicTopic(run_id=RunStats.objects.get(pk=run_id))
            dtopic.save()
            dtopics.append(dtopic)

        dtopic_ids = list(
            DynamicTopic.objects.filter(run_id=run_id).values_list('id',
                                                                   flat=True))

        print(dtopic_ids)

        ##################
        ## Add the dtopic*term matrix to the db
        print("Adding topicterms to db")
        t0 = time()
        ldalambda = find(csr_matrix(nmf.components_))
        topics = range(len(ldalambda[0]))
        tts = []
        pool = Pool(processes=8)
        tts.append(
            pool.map(
                partial(f_dlambda,
                        m=ldalambda,
                        v_ids=vocab_ids,
                        t_ids=dtopic_ids,
                        run_id=run_id), topics))
        pool.terminate()
        tts = flatten(tts)
        gc.collect()
        sys.stdout.flush()
        django.db.connections.close_all()
        DynamicTopicTerm.objects.bulk_create(tts)
        print("done in %0.3fs." % (time() - t0))

        ## Add the wtopic*dtopic matrix to the database
        gamma = nmf.transform(B)

        for topic in range(len(gamma)):
            for dtopic in range(len(gamma[topic])):
                if gamma[topic][dtopic] > 0:
                    tdt = TopicDTopic(topic=tops[topic],
                                      dynamictopic_id=dtopic_ids[dtopic],
                                      score=gamma[topic][dtopic])
                    tdt.save()

        ## Calculate the primary dtopic for each topic
        for t in tops:
            try:
                t.primary_dtopic = TopicDTopic.objects.filter(
                    topic=t).order_by('-score').first().dynamictopic
                t.save()
            except:
                pass

        stat.error = parent_stat.error + nmf.reconstruction_err_
        stat.errortype = "Frobenius"
        stat.last_update = timezone.now()
        stat.save()
        print("updating and summarising run, {}".format(run_id))
        management.call_command('update_run', run_id)

        management.call_command('update_run', run_id)
Example no. 14
def plot_optimal_k(docs, document_term_mat, vectorizer,
                   kmin=3, kmax=15, num_top_terms=15,
                   alpha=.1, l1_ratio=.5,
                   dim_size=500, min_df=20, max_vocab_size=5000,
                   model_file_path='./data/',
                   model_file_name='w2v-model.bin'):
    '''
    Run NMF for each k between min and max and plot to assess optimal k.

    Input
        docs - corpus of documents as a list
        document_term_mat - TFIDF matrix from the vectorizer
        vectorizer - scikit-learn TFIDF vectorizer (trained in TopicModeller)

    Returns:
        Int - optimal k number
    '''
    topic_models = []

    # Run NMF for each value of k
    for k in range(kmin, kmax+1):
        t1 = time.time()

        # Run NMF
        model = NMF(n_components=k, init='nndsvd',
                    alpha=alpha, l1_ratio=l1_ratio)

        W = model.fit_transform(document_term_mat)
        H = model.components_

        # Store for iterating over all the models (of each k size)
        topic_models.append((k, W, H))

        print("Processed NMF for k=%d of %d - Time: %0.3fs." % (k, kmax, (time.time() - t1)), end='\r', flush=True)
    print()

    # If the model is already built get it from disk, otherwise
    # build a Skipgram Word2Vec model from all documents
    # in the input file using Gensim:
    model_path = model_file_path + model_file_name
    if not os.path.exists(model_file_path):
        os.makedirs(model_file_path)

    w2v_model = None
    try:
        w2v_model = gensim.models.Word2Vec.load(model_path)
    except Exception as e:
        print('No existing word2vec model found to load. Exception: %s.\n'
              'Building it...' % (e))

    # w2v_model = None - uncomment to force rebuild every time
    if w2v_model:
        print('Existing word2vec Model loaded from \'%s\'' % model_path)
    else:
        docgen = nlp_utils.TokenGenerator(docs)
        # Process w2v with model of n dimensions and min doc-term freq as min_df
        t1 = time.time()
        w2v_model = gensim.models.Word2Vec(docgen, sg=1, size=dim_size,
                                           max_vocab_size=max_vocab_size,
                                           min_count=min_df)
        print("- Time: %0.3fs." % (time.time() - t1))
        # Save for later use, so that we do not need to rebuild it:
        print('Saving it...')
        w2v_model.save(model_path)

    print(('word2vec model has %d terms' % len(w2v_model.wv.vocab)))

    # Implement TC-W2V coherence score measure
    def calculate_coherence(w2v_model, term_rankings):
        overall_coherence = 0.0
        for topic_index in range(len(term_rankings)):
            # check each pair of terms
            pair_scores = []
            # print 'Topic %s: %s top words: %s' % (topic_index,
            #                                       len(term_rankings[topic_index]),
            #                                       term_rankings[topic_index])
            for pair in combinations(term_rankings[topic_index], 2):
                pair_scores.append(w2v_model.similarity(pair[0], pair[1]))
            # get the mean for all pairs in this topic
            topic_score = sum(pair_scores) / len(pair_scores)
            overall_coherence += topic_score
        # get the mean score across all topics
        return overall_coherence / len(term_rankings)

    # Function to get the topic descriptor
    # (i.e. list of top terms) for each topic:
    def get_descriptor(all_terms, H, topic_index, num_top_terms):
        # reverse sort the values to sort the indices
        top_indices = np.argsort(H[topic_index, :])[::-1]
        # now get the terms corresponding to the top-ranked indices
        top_terms = []
        for term_index in top_indices[0:num_top_terms]:
            top_terms.append(all_terms[term_index])
        return top_terms

    # Process each of the models for different values of k:
    vocab = vectorizer.get_feature_names()
    # vocab = w2v_model.wv.vocab

    # Process each of the models for different values of k:
    k_values = []
    coherences = []
    print('Calculating coherence scores...')
    for (k, W, H) in topic_models:
        # Get all topic descriptors - the term_rankings, based on top n terms
        term_rankings = []
        for topic_index in range(k):
            # term_rankings.append(get_descriptor(vocab, H, topic_index, num_top_terms))
            top_words = [vocab[i] for i in H[topic_index, :].argsort()[:-num_top_terms - 1:-1]]
            top_words = [x for x in top_words if x in w2v_model.wv.vocab]
            term_rankings.append(top_words)
        # Calculate the coherence based on our Word2vec model
        k_values.append(k)
        coherences.append(calculate_coherence(w2v_model, term_rankings))
        # print(('K=%02d: Coherence=%.4f' % (k, coherences[-1])))

    # Plot a line of coherence scores to identify an appropriate k value.
    plt.style.use("ggplot")
    matplotlib.rcParams.update({"font.size": 14})
    fig = plt.figure(figsize=(13, 7))
    # Create the line plot
    ax = plt.plot(k_values, coherences)
    plt.xticks(k_values)
    plt.xlabel("Number of Topics")
    plt.ylabel("Mean Coherence")
    # Add the points
    plt.scatter(k_values, coherences, s=120)
    # Find and annotate the maximum point on the plot
    ymax = max(coherences)
    xpos = coherences.index(ymax)
    best_k = k_values[xpos]
    plt.annotate('k=%d' % best_k, xy=(best_k, ymax), xytext=(best_k, ymax),
                 textcoords="offset points", fontsize=16)
    print('Optimal number of k topics: %s' % best_k)
    # Show the plot
    plt.show()

    k = best_k
    # Get the model that we generated earlier.
    W = topic_models[k-kmin][1]
    H = topic_models[k-kmin][2]

    # Display the topics and descriptor words for the best k model
    for topic_index in range(k):
        descriptor = get_descriptor(vectorizer.get_feature_names(),
                                    H, topic_index, num_top_terms)
        str_descriptor = ", ".join(descriptor)
        print(("Topic %02d: %s" % (topic_index, str_descriptor)))

    return int(k)
Example no. 15
def topics(df, model="lda", stopwords=None):
    """ Either executes LDA or NMF on a dutch document.
    This is a simple implementation and only used for
    "fun" purposes. It is not so much to find the very
    best topics, but topics that are good enough. 
    
    
    Parameters:
    -----------
    df : pandas dataframe
        Pandas dataframe that contains the raw messages
    model : str, default "lda"
        Which model to use for topic modelling. 
        Either "lda" or "nmf" works for now
    stopwords : str, default None
        If you want to remove stopwords, provide a local 
        link to the text file (that includes a list of words)
        including the extension. 
    
    """
    # Prepare stopwords
    if stopwords:
        with open(stopwords) as stopwords_list:
            stopwords_list = stopwords_list.readlines()
            stopwords_list = [word[:-1] for word in stopwords_list]
    else:
        stopwords_list = []

    # Create Topics
    for user in df.User.unique():
        print("#" * len(user) + "########")
        print("### " + user + " ###")
        print("#" * len(user) + "########\n")

        data_samples = df[df.User == user].Message_Prepared
        data_samples = data_samples.tolist()

        if model == "lda":
            # Extracting Features
            tf_vectorizer = CountVectorizer(max_df=0.95,
                                            min_df=2,
                                            stop_words=stopwords_list)
            tf = tf_vectorizer.fit_transform(data_samples)

            # Fitting LDA
            topic_model = LatentDirichletAllocation(n_components=5,
                                                    max_iter=5,
                                                    learning_method='online',
                                                    learning_offset=50.,
                                                    random_state=0)
            topic_model.fit(tf)
            feature_names = tf_vectorizer.get_feature_names()
        else:
            # NMF uses tf-idf features
            tfidf_vectorizer = TfidfVectorizer(max_df=0.95,
                                               min_df=2,
                                               stop_words=stopwords_list)
            tfidf = tfidf_vectorizer.fit_transform(data_samples)
            feature_names = tfidf_vectorizer.get_feature_names()

            # Run NMF
            topic_model = NMF(n_components=5,
                              random_state=1,
                              alpha=.1,
                              l1_ratio=.5,
                              init='nndsvd')
            topic_model.fit(tfidf)

        print("\nTopics in {} model:".format(model))
        print_top_words(topic_model, feature_names, 7)
Example no. 16
def gen_decomposition_stats_vector_ftr51(stats_name,
                                         size='7d',
                                         non_zero=False,
                                         decomp_method='lda',
                                         n_components=5):
    """
    :param stats_name: str, name of the statistic computed over the drug counts
    :param size: str, time granularity of the statistic: 1d, 4d, 7d, 15d, 30d, 45d
    :param non_zero: bool, whether the statistic only counts non-zero entries
    :param decomp_method: str, decomposition method
    :param n_components: int, dimensionality after decomposition
    :return:
    """
    assert decomp_method in ['svd', 'nmf', 'lda']
    mask = (stats_name in ['sum', 'max', 'sum_ratio', 'max_ratio']) & non_zero
    assert not mask
    matrix_name = '{}_vector_ftr51_by_{}_{}'.format(stats_name, size, non_zero)
    # 0 Load the data

    ftr51_stats_sparse_matrix = sparse.load_npz(
        get_path() + 'Data/Feature/{}.npz'.format(matrix_name)).toarray()

    if decomp_method == 'svd':
        print(' svd decomposition...')
        svd = TruncatedSVD(n_components=n_components,
                           n_iter=50,
                           random_state=42)
        ftr51_stats_matrix_decomp = svd.fit_transform(
            ftr51_stats_sparse_matrix)

    if decomp_method == 'nmf':
        print(' nmf decomposition...')
        nmf = NMF(n_components=n_components,
                  init='random',
                  random_state=0,
                  max_iter=200)
        ftr51_stats_matrix_decomp = nmf.fit_transform(
            ftr51_stats_sparse_matrix)

    if decomp_method == 'lda':
        print(' lda decomposition...')
        lda = LatentDirichletAllocation(n_components=n_components,
                                        max_iter=50,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=0,
                                        n_jobs=1)
        ftr51_stats_matrix_decomp = lda.fit_transform(
            ftr51_stats_sparse_matrix)
        joblib.dump(lda, "lda_{}_{}.m".format(stats_name, size))

    columns = [
        '{}_{}_vector_by_{}_{}_{}_{}'.format(decomp_method, stats_name, size,
                                             non_zero, n_components, j)
        for j in range(ftr51_stats_matrix_decomp.shape[1])
    ]
    stats_df = pd.DataFrame(data=ftr51_stats_matrix_decomp, columns=columns)
    train = stats_df[:15000].reset_index(drop=True)
    test = stats_df[15000:].reset_index(drop=True)
    for feature in columns:
        SaveFeature(train, test, feature)

    return columns, 'gen_decomposition_stats_vector_ftr51("{}", "{}", {}, "{}", {})'.format(
        stats_name, size, non_zero, decomp_method, n_components)

# 1 Define the pipeline ------------------------------------------------------------------------------

# Define the pipeline
# --- dimensionality reduction
# --- SVM classifier
pipe = Pipeline([
    ('reduce_dim', PCA()),
    ('classify', SVC())
])

# Parameter settings
params_grid = [
    {
        'reduce_dim': [PCA(), NMF(), Isomap(), TruncatedSVD()],
        'reduce_dim__n_components': [2, 3],
        'classify': [SVC(), LinearSVC()],
        'classify__C': [1, 10, 100, 1000]
    }
]

# Check the grid
print(params_grid)


# 2 Run the parameter tuning -----------------------------------------------------------------------

# <Key points>
# - Tune the hyperparameters with a grid search
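
# A minimal sketch of the tuning step, assuming training data X_train / y_train
# are available:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(pipe, param_grid=params_grid, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)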
Example no. 18
# Use tf (raw term count) features for LDA.
print("抽取 tf 特征,用于LDA")
tf_vectorizer = CountVectorizer(max_df=0.95,
                                min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("抽取 tf 特征完成 in %0.3fs." % (time() - t0))
print()

# Fit the NMF model
print("用tf-idf特征训练NMF模型(范数),, "
      "文章个数=%d and 特征个数=%d..." % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("训练完成。done in %0.3fs." % (time() - t0))

print("\n在非负的矩阵分解模型(范数)的主题:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

# Fit the NMF model
print("用ft-idf特征训练非负的矩阵分解模型(普通的KL散度), 文章个数=%d and 特征个数=%d..." %
      (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components,
          random_state=1,
          beta_loss='kullback-leibler',
          solver='mu',
          max_iter=1000,
Example no. 19
# plot the mean cross-validation scores
mglearn.tools.heatmap(scores, xlabel='svm__C',
                      xticklabels=param_grid['svm__C'],
                      ylabel='svm__gamma',
                      yticklabels=param_grid['svm__gamma'], cmap="viridis")


"""-----------------------------------------------------------------------------"""

"""==========================================================================================="""

"""-----------------------------------------------------------------------------"""

"""NMF pre-processing with SVC algorithm """
##Pipelines in Grid Searches
pipe = Pipeline([("scaler", NMF()), ("svm", SVC())])
param_grid = {'scaler__n_components': [5],
              'svm__C': [0.00001, 0.1],
              'svm__gamma': [0.00001, 0.1]}
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)
pred = grid.predict(X_test)
print("NMF pre-processing with SVC algorithm")
print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_))
print("Test set accuracy: {:.2f}".format(grid.score(X_test, y_test)))
print("f1 score: {:.2f}".format(f1_score(y_test, pred)))
print("Best parameters: {}".format(grid.best_params_))
print(classification_report(y_test, pred, target_names=["mol", "no_mol"]))
scores = grid.cv_results_['mean_test_score'].reshape(2, 2)
# plot the mean cross-validation scores
mglearn.tools.heatmap(scores, xlabel='svm__C',
Example no. 20
                             stop_words='english',
                             lowercase=True,
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(wines["processed_description"])

NUM_TOPICS = 10

# Latent Dirichlet Allocation Model
lda = LatentDirichletAllocation(n_components=NUM_TOPICS,
                                max_iter=10,
                                learning_method='online',
                                verbose=True)
data_lda = lda.fit_transform(data_vectorized)

# Non-Negative Matrix Factorization Model
nmf = NMF(n_components=NUM_TOPICS)
data_nmf = nmf.fit_transform(data_vectorized)

# Latent Semantic Indexing Model using Truncated SVD
lsi = TruncatedSVD(n_components=NUM_TOPICS)
data_lsi = lsi.fit_transform(data_vectorized)


# Functions for printing keywords for each topic
def selected_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
               for i in topic.argsort()[:-top_n - 1:-1]])
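
# A minimal usage sketch, printing the keywords for the three models fitted above:
print("LDA Model:")
selected_topics(lda, vectorizer)
print("NMF Model:")
selected_topics(nmf, vectorizer)
print("LSI Model:")
selected_topics(lsi, vectorizer)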

Example no. 21
model.add(e)
model.add(Flatten())
model.add(Dense(10173, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(13, activation='softmax'))

model.compile(optimizer='Adadelta', loss='categorical_crossentropy', metrics=['acc'])

history = model.fit(tfidf, y_label, epochs=20, verbose=1,validation_split=0.3)



# Run NMF
from sklearn.decomposition import NMF, LatentDirichletAllocation
no_topics = 13
nmf = NMF(n_components=no_topics, init='nndsvd')
W = nmf.fit_transform(tfidf)
H = nmf.components_

# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, learning_method='online', learning_offset=50.)
W_lda = lda.fit_transform(tf)
H_lda = lda.components_


def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
Example no. 22
    pipe = Pipeline([('reduce_dim', PCA()), ('classify', LinearSVC())])

    N_EXPERIMENTS = 5
    N_FEATURES_OPTIONS = [4]
    C_OPTIONS = [1, 10, 100, 1000]

    reducer_labels = ['PCA', 'NMF', 'KBest(chi2)']

    non_nested_scores = np.zeros(N_EXPERIMENTS)
    nested_scores = np.zeros(N_EXPERIMENTS)

    ############################################################

    param_grid = [
        {
            'reduce_dim': [PCA(iterated_power=7), NMF()],
            'reduce_dim__n_components': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS
        },
        {
            'reduce_dim': [SelectKBest(chi2)],
            'reduce_dim__k': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS
        },
    ]

    print('Grid Search experiments... ')
    start = time()
    for ith_exp in range(N_EXPERIMENTS):

        # CV technique
Example no. 23
import numpy as np
from sklearn.decomposition import NMF, TruncatedSVD  # ProjectedGradientNMF was removed from scikit-learn
model = NMF(n_components=2, alpha=0.01)

#Store AD
ad_ID_dict   = {}
#ad_list = []
#ad_list = list(ad_list)


#Assign ID number
ad_ID = 0
user_ID = 0

max_feature = 0


#ad_ID for ad_nmu
adID_for_num = {}


with open ('ad_ID.dat') as file:
    for line in file:
        data = line.strip('\n').split('   ')
        #print(data)
        adID_for_num[int(data[1])] = int(data[0])

file.close()


        # tf-idf
        for max_fq in df_gradients:
            tweetImport = codecs.open(importfilename, 'r', 'utf-8')
            # NMF can use tf-idf # lowercase=False
            tfidf_vectorizer = TfidfVectorizer(strip_accents='ascii', ngram_range=(ngram_min, ngram_max), max_df=max_fq, min_df=1, max_features=num_features, stop_words=stop_words, analyzer='word', token_pattern='[a-zA-Z]+')
            tfidf_matrix = tfidf_vectorizer.fit_transform(tweetImport)
            tfidf_feature_names = tfidf_vectorizer.get_feature_names()
            stop_words.extend(tfidf_feature_names)
            tweetImport.close()

            # save the terms ranked by tfidf scores into a list, to be used for wordcloud plotting
            version = 3
            saveTerms_sortedTFIDFscores(outputPath, max_fq, num_features, version, tfidf_feature_names, tfidf_matrix)

            # Run NMF (results not as good as LDA)
            nmf = NMF(n_components=num_topics, random_state=1, alpha=0, init='random').fit(tfidf_matrix)
            display_topics(nmf, tfidf_feature_names, num_top_words)


        # plot all wordclouds in one figure
        # wordcloud_in_one_figure(outputPath, num_features, df_gradients)

        # plot individual wordclouds:
        plt.rcParams['figure.figsize'] = (10.0, 7.0)
        
        for max_fq in df_gradients:
            tfidffilename = outputPath + 'tweet_keyword_tradewar_tfidf_features_' + str(max_fq) + '_' + str(num_features) + '_v3.csv'
            tfidffile = open(tfidffilename, 'r')
            word_text = tfidffile.read()

            wordcloud = WordCloud(colormap='hsv', max_words=1000, width=3000, height=2000, margin=3, collocations=False).generate(word_text)
image_shape = people.images[0].shape

mask = np.zeros(people.target.shape, dtype=bool)
for target in np.unique(people.target):
    mask[np.where(people.target == target)[0][:50]] = 1
X_people = people.data[mask]
y_people = people.target[mask]
X_people = X_people / 255.
X_train, X_test, y_train, y_test = train_test_split( \
        X_people, y_people, stratify=y_people, random_state=0)

mglearn.plots.plot_nmf_illustration()
mglearn.plots.plot_nmf_faces(X_train, X_test, image_shape)

from sklearn.decomposition import NMF
nmf = NMF(n_components=15, random_state=0)
nmf.fit(X_train)
X_train_nmf = nmf.transform(X_train)
X_test_nmf = nmf.transform(X_test)
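
# A minimal reconstruction sketch: images can be approximately rebuilt from the
# 15 NMF components as X_nmf @ nmf.components_ and reshaped back to image_shape.
X_test_rebuilt = np.dot(X_test_nmf, nmf.components_)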

fix, axes = plt.subplots(3, 5, figsize=(15, 12), \
            subplot_kw={'xticks': (), 'yticks': ()})
for i, (component, ax) in enumerate(zip(nmf.components_, axes.ravel())):
    ax.imshow(component.reshape(image_shape))
    ax.set_title("{}. component".format(i))

# display the data that has large weighting for comp
compn = 11
inds = np.argsort(X_train_nmf[:, compn])[::-1]
fix, axes = plt.subplots(2, 5, figsize=(15, 8), \
            subplot_kw={'xticks': (), 'yticks': ()})
Example no. 26
start_time = time.time()
# vectorize documents by using tfidf vectorizer
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenization,
                                   max_features=n_features,
                                   max_df=0.9,
                                   min_df=2)
docs_tfidf = tfidf_vectorizer.fit_transform(doc_set)
termid_word_list = tfidf_vectorizer.get_feature_names()  # word = termid_word_list[indx]

print("Fitting the NMF model...")
# solver: coordinate descent; learning rate: alpha = 0.1;
#l1_ratio 0: L2 regularization, NO L1 regularization
nmf_model = NMF(n_components=n_factors,
                random_state=1,
                solver='cd',
                alpha=.1,
                l1_ratio=.0)
# generate latent factors for documents based on NMF model
docs_lf = nmf_model.fit_transform(docs_tfidf)

for qIndex in range(0, len(queryID_list)):
    #for qIndex in range(0, 2):
    print(str(qIndex) + "/" + str(len(queryID_list)))
    query_str = queries_dict[queryID_list[qIndex]]
    query = [query_str]
    # generate tfidf vector for the query
    query_tfidf = tfidf_vectorizer.transform(query)
    # generate latent factor for the query based on NMF model
    query_lf = nmf_model.transform(query_tfidf)
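
    # A plausible next step (sketch): rank documents for this query by cosine
    # similarity between the query latent factor and the document latent factors
    # (assumes sklearn.metrics.pairwise.cosine_similarity has been imported).
    similarity_scores = cosine_similarity(query_lf, docs_lf).ravel()
    ranked_doc_indices = similarity_scores.argsort()[::-1]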
Example no. 27
def test_nmf_fit_close(solver):
    rng = np.random.mtrand.RandomState(42)
    # Test that the fit is not too far away
    pnmf = NMF(5, solver=solver, init='nndsvdar', random_state=0, max_iter=600)
    X = np.abs(rng.randn(6, 5))
    assert pnmf.fit(X).reconstruction_err_ < 0.1
Example no. 28
def update_nmf_graph1(no_topics, nmf_components_value, nmf_alpha_value, nmf_l1ratio_value, min_df_value, max_df_value, ngram_range_value, num_clicks):


	if num_clicks > 0:

		# Getting the filenames
		matrix_filename = 'temp_data/' + temporary_key + '_output_matrix.csv'
		processed_docs_filename = 'temp_data/' + temporary_key + '_processed_docs.csv'
		features_list_filename = 'temp_data/' + temporary_key + '_features_list.csv'
		tfidf_fit_filename = 'temp_data/' + temporary_key + '_vectorizer_model.pickle'

		print('loading nmf input objects')
		# Read in tfidf

		dense_tfidf_matrix = pd.read_csv(matrix_filename)
		print('The shape of the tfidf_matrix is: {}.'.format(dense_tfidf_matrix.shape))

		# Reading in the processed documents
		processed_docs = pd.read_csv(processed_docs_filename, encoding = 'latin1')
		processed_docs = processed_docs['processed_doc'].tolist() 

		print(processed_docs[0])


		features_df = pd.read_csv(features_list_filename)
		features_list = features_df['feature_list'].tolist()
		print('The first five token features are: {}.'.format(features_list[:5]))


		sparse_tfidf_matrix = scipy.sparse.csr_matrix(dense_tfidf_matrix.values)

		# print(sparse_tfidf_matrix)
		print('the sparse tfidf matrix is loaded')

		# Defining the NMF object
		nmf = NMF(n_components=no_topics, random_state=42, alpha=0.1, l1_ratio=.2, \
          max_iter = 500, verbose = False, shuffle = True, init='nndsvd', solver = 'cd')


		print('Computing the NMF for the sparse tfidf matrix')
		nmf_model = nmf.fit(sparse_tfidf_matrix)


		print(nmf_model)
		#--------------------------------------------------------------------------------------------------
		#--------------------------------------------------------------------------------------------------
		def generate_topic_table(model, feature_names, n_top_words):
		    topics = {}
		    for topic_idx, topic in enumerate(model.components_):
		        t = ("topic_%d" % topic_idx)
		        topics[t] = [feature_names[i] for i in top_words(topic, n_top_words)]
		        
		    out_df = pd.DataFrame(topics)
		    out_df = out_df[list(topics.keys())]
		    
		    return out_df
		#--------------------------------------------------------------------------------------------------
		#--------------------------------------------------------------------------------------------------



		print(processed_docs[0])
Example no. 29
def extract_components(mov_tot,
                       n_components=6,
                       normalize_std=True,
                       max_iter_DL=-30,
                       method_factorization='nmf',
                       **kwargs):
    """
    From optical flow images can extract spatial and temporal components

    Parameters:
    ----------
    mov_tot: ndarray (can be 3 or 4D)
        contains the optical flow values, either in cartesian or polar, either one (3D) or both (4D coordinates)
        the input is generated by the compute_optical_flow function    

    n_components: int
        number of components to look for

    normalize_std: bool
        whether to normalize each of the optical flow components

    normalize_output_traces: boolean
        whether to normalize the behavioral traces so that they match the units in the movie

    Returns:
    -------
    spatial_filter: ndarray
        set of spatial inferred filters     

    time_trace:ndarray
        set of time components

    norm_fact: ndarray
        normalization factors used

    """

    if mov_tot.ndim == 4:
        if normalize_std:
            norm_fact = np.nanstd(mov_tot, axis=(1, 2, 3))
            mov_tot = old_div(mov_tot, norm_fact[:, np.newaxis, np.newaxis,
                                                 np.newaxis])
        else:
            norm_fact = np.array([1., 1.])
        c, T, d1, d2 = np.shape(mov_tot)

    else:
        norm_fact = 1
        T, d1, d2 = np.shape(mov_tot)
        c = 1

    tt = time.time()
    newm = np.reshape(mov_tot, (c * T, d1 * d2))

    if method_factorization == 'nmf':
        nmf = NMF(n_components=n_components, **kwargs)

        time_trace = nmf.fit_transform(newm)
        spatial_filter = nmf.components_
        spatial_filter = np.concatenate([
            np.reshape(sp, (d1, d2))[np.newaxis, :, :] for sp in spatial_filter
        ],
                                        axis=0)

    elif method_factorization == 'dict_learn':

        import spams
        newm = np.asfortranarray(newm, dtype=np.float32)
        time_trace = spams.trainDL(newm,
                                   K=n_components,
                                   mode=0,
                                   lambda1=1,
                                   posAlpha=True,
                                   iter=max_iter_DL)

        spatial_filter = spams.lasso(newm,
                                     D=time_trace,
                                     return_reg_path=False,
                                     lambda1=0.01,
                                     mode=spams.spams_wrap.PENALTY,
                                     pos=True)

        spatial_filter = np.concatenate([
            np.reshape(sp, (d1, d2))[np.newaxis, :, :]
            for sp in spatial_filter.toarray()
        ],
                                        axis=0)

    time_trace = [np.reshape(ttr, (c, T)).T for ttr in time_trace.T]

    el_t = time.time() - tt
    print(el_t)
    return spatial_filter, time_trace, norm_fact
# Again, we add the vectorizer constraints found earlier, plus other terms observed
# during the LDA run that do not help to define a type of company purpose
stop_w = ['de','la','a','el','que','en','los','las','con','al','sus','del','por','como','para','toda','todo','servicios',
         'cualquier','otros','general','tipo','tipos','actividades','ya','similares','objeto','no','actividad','otra',
         'terceros','cuenta','propia','bienes','clase','ajena','act','propios','sociedad','sociedades','socios','su','sea',
         'relacionadas','otras','relacionados','especializado','especializados','nuevos','empleadores']

tfidf = TfidfVectorizer(max_df=0.9,min_df=2,stop_words=stop_w)
mtx = tfidf.fit_transform(lines)
# Now we import the NMF (Non-negative Matrix Factorization) class
from sklearn.decomposition import NMF

# As mentioned for the LDA method, we will try to define 15 purpose types
k = 15
nmf_model = NMF(n_components=k,random_state=7)
nmf_model.fit(mtx)

# Look at the 10 most frequent words for each purpose type
for i, tema in enumerate(nmf_model.components_):
    print(f"Tema {i}:")
    print([tfidf.get_feature_names()[index] for index in tema.argsort()[-10:]])
    print("\n")

# Assign a purpose type to each entry
import pandas as pd
df = pd.DataFrame()
temas_resultantes = nmf_model.transform(mtx)
df['Texto'] = lines
df['Grupo'] = temas_resultantes.argmax(axis=1)
df.head()
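
# A quick follow-up sketch: count how many entries fall into each discovered purpose type.
print(df['Grupo'].value_counts())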