def nmf_analysis_comparison(self):
        print('Running NMF component comparisons')
        fig = plt.figure(figsize=(10, 2 * len(self.values)))
        columns = self.n_classes+1
        rows = len(self.values)
        for indx, val in enumerate(self.values):
            self.nmf = decomposition.NMF(n_components=val)
            self.nmf.fit(self.x_train_flat)

            fig.add_subplot(rows, columns, 1+(indx*columns))
            plt.imshow(self.nmf.components_.mean(0).reshape(48, 48),
                       cmap=plt.cm.bone)
            plt.gca().set_title('Overall')
            plt.axis('off')
            for emo_val, i in enumerate(range(2, columns+1)):
                t1 = self.df[self.df['emotion']==emo_val]
                temp_x_train = np.stack(t1.pop('img_array').values)
                temp_x_train_flat = temp_x_train.reshape(temp_x_train.shape[0],-1)
                # print(temp_x_train_flat.shape) 
                fig.add_subplot(rows, columns, i+(indx*columns))
                nmf = decomposition.NMF(n_components=val)
                nmf.fit(temp_x_train_flat)
                plt.imshow(nmf.components_.mean(0).reshape(48, 48),
                           cmap=plt.cm.bone)
                plt.gca().set_title(self.emo_list[emo_val])
                plt.axis('off')
        plt.savefig('../images/nmf_images_comparison.png')
        # plt.show()
        plt.close()
Example #2
def build_embedding(path, embedding=None):
    """
    Build the desired embedding and save to specified path.

    Available embeddings :
        * Interaction Matrix
        * t-SNE (unused)
        * Spectral Embedding
        * Locally Linear Embedding
        * Non-negative Matrix Factorisation
        * Factor Analysis
    """
    if embedding == 'spectral':
        mat = np.load(path)
        u_spectral = manifold.SpectralEmbedding(n_components=64, random_state=0, n_jobs=8).fit_transform(mat)
        i_spectral = manifold.SpectralEmbedding(n_components=64, random_state=0, n_jobs=8).fit_transform(mat.T)
        return u_spectral, i_spectral
    elif embedding == 'lle':
        mat = np.load(path)
        u_lle = manifold.LocallyLinearEmbedding(n_components=64, random_state=0, n_jobs=8).fit_transform(mat)
        i_lle = manifold.LocallyLinearEmbedding(n_components=64, random_state=0, n_jobs=8).fit_transform(mat.T)
        return u_lle, i_lle
    elif embedding == 'fa':
        mat = np.load(path)
        u_fa = decomposition.FactorAnalysis(n_components=64, random_state=0).fit_transform(mat)
        i_fa = decomposition.FactorAnalysis(n_components=64, random_state=0).fit_transform(mat.T)
        return u_fa, i_fa
    elif embedding == 'nmf':
        mat = np.load(path)
        u_nmf = decomposition.NMF(n_components=64, random_state=0).fit_transform(mat)
        i_nmf = decomposition.NMF(n_components=64, random_state=0).fit_transform(mat.T)
        return u_nmf, i_nmf
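# Usage sketch (not part of the original example): build the NMF embeddings from a
# small synthetic, non-negative interaction matrix saved to disk, since
# build_embedding() expects a path to a .npy file. The file name is hypothetical.
import numpy as np
demo_interactions = np.random.RandomState(0).rand(80, 70)   # 80 users x 70 items
np.save('demo_interactions.npy', demo_interactions)
u_emb, i_emb = build_embedding('demo_interactions.npy', embedding='nmf')
print(u_emb.shape, i_emb.shape)   # (80, 64), (70, 64)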
def nmf_step(spectra,
             n_components,
             sparsity='components',
             beta=1e-5,
             **kwargs):
    """
    Performs the non-negative matrix factorization of the spectra into a
    partial spectra aka components matrix, and a mixing coefficients
    matrix.
    kwargs are passed to decomposition.NMF.

    Parameters
    ----------
    spectra : numpy.ndarray, n_spectra * n_features
        Clean input spectra
    n_components : int
        Number of significant components
    sparsity : {'data', 'components', None},  default 'components'
        Where to enforce sparsity in the model.
    beta : double, default 1e-5
        Degree of sparseness, if sparseness is not None. Larger values mean
        more sparseness

    Returns
    -------
    components : numpy.ndarray, n_components * n_features
        Resulting components, aka H matrix
    mixing_matrix : numpy.ndarray, n_samples * n_components
        Resulting mixing coefficients, aka W matrix
    reconstruction_error : float
        Frobenius norm of (S - WH)
    """
    init = kwargs.pop('init', 'nndsvda')
    max_iter = kwargs.pop('max_iter', 1000)
    nls_max_iter = kwargs.pop('nls_max_iter', 10000)

    try:
        # Older versions of sklearn.decomposition.NMF accept the sparsity arguments.
        nmf = decomposition.NMF(n_components=n_components,
                                init=init,
                                sparseness=sparsity,
                                beta=beta,
                                tol=1e-5,
                                max_iter=max_iter,
                                nls_max_iter=nls_max_iter,
                                **kwargs)
    except TypeError:
        # Newer versions of sklearn.decomposition.NMF no longer accept the
        # 'sparseness', 'beta' and 'nls_max_iter' arguments.
        nmf = decomposition.NMF(n_components=n_components,
                                init=init,
                                tol=1e-5,
                                max_iter=max_iter,
                                **kwargs)

    mix = nmf.fit_transform(spectra)
    components = nmf.components_
    print('Reconstruction error: {:.3e}'.format(nmf.reconstruction_err_))
    return components, mix, nmf.reconstruction_err_
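# Usage sketch (not part of the original example): factorize a small synthetic,
# non-negative spectra matrix into three components.
import numpy as np
demo_spectra = np.random.RandomState(0).rand(20, 100)   # 20 spectra x 100 features
components, mix, err = nmf_step(demo_spectra, n_components=3)
print(components.shape, mix.shape)   # (3, 100), (20, 3)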
Example #4
def get_matrix_nmf(features, num_topics):
    """
    Performs non-negative matrix factorization.

    Parameters
    ----------
    features: pandas.DataFrame of features.
    num_topics: number of topics to return.

    Returns
    -------
    W: pandas.DataFrame (shape= # participants by num_topics)... relates the participants to the topics.
    H: pandas.DataFrame (shape= num_topics by # questions)... relates the topics to the questions.
    """

    mat = pd.DataFrame(features)
    nmf = decomposition.NMF(n_components=num_topics)
    nmf.fit(mat)

    W = nmf.transform(mat)
    H = nmf.components_

    W = pd.DataFrame(W)
    H = pd.DataFrame(H)

    W, H = (np.around(x, 2) for x in (W, H))

    # this shows the components
    print(W.head(30), '\n\n', H.head(num_topics))
    return (W, H)
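# Usage sketch (not part of the original example): factor a small random feature
# table (participants x questions) into two topics.
import numpy as np
import pandas as pd
demo_features = pd.DataFrame(np.random.RandomState(0).rand(10, 6))
W_demo, H_demo = get_matrix_nmf(demo_features, num_topics=2)
print(W_demo.shape, H_demo.shape)   # (10, 2), (2, 6)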
Example #5
    def nmf_topic_modeling(self):
        corpus = []
        client = MongoDB()
        tweetsCollection = client.get_collection('illinois')
        findCorpusQuery = tweetsCollection.find()

        for fileid in findCorpusQuery:
            corpus.append(fileid['text'])

        cachedStopWords = set(STOPWORDS)
        cachedStopWords.update(
            ('well', 'say', 'New', 'RT', 'now', 'https', 'via', 'CIA', 'make', 'says', 'new', 'will', 'said',
             'take', 'amp', 'one', 'go', 'know', 'day', 'look', 'think', 'gt', 'lt', 'co', 'rt', 'zbvnkkvl48', 'gtwkxla6bn',
             'cia', 'elizasoul80', '5febckimwt', 'af', 'ex', 'ahca', 'heioaednwr', 'realdonaldtrump', 'mitchellvii',
             'f0mei9xcwv', 're', '63', 'went', 'still', 'thanks', 'vp', 'olson_micki', 'going', 'il', 'potus', 'housegop'))

        vectorizer = TfidfVectorizer(stop_words=cachedStopWords, min_df=2)
        dtm = vectorizer.fit_transform(corpus)
        vocab = vectorizer.get_feature_names()

        num_topics = 5

        clf = decomposition.NMF(n_components=num_topics, random_state=1)
        doctopic = clf.fit_transform(dtm)

        topic_words = []
        num_top_words = 5
        for topic in clf.components_:
            word_idx = np.argsort(topic)[::-1][0:num_top_words]  # get indexes with highest weights
            topic_words.append([vocab[i] for i in word_idx])

        for t in range(len(topic_words)):
            print "Topic {}: {}".format(t, ' '.join(topic_words[t][:15]))
Example #6
def topic_model_nmf(chunks,
                    num_of_topics=20,
                    topic_size=10,
                    max_iterations=200):

    from sklearn.feature_extraction import DictVectorizer
    from sklearn import decomposition
    import numpy as np

    v = DictVectorizer(sparse=True)
    D = chunks
    X = v.fit_transform(D)

    num_terms = len(v.vocabulary_)
    terms = [""] * num_terms
    for term in v.vocabulary_.keys():
        terms[v.vocabulary_[term]] = term

    model = decomposition.NMF(init="nndsvd",
                              n_components=num_of_topics,
                              max_iter=max_iterations)
    W = model.fit_transform(X)
    H = model.components_
    topics = []
    for topic_index in range(H.shape[0]):
        top_indices = np.argsort(H[topic_index, :])[::-1][0:topic_size]
        term_ranking = [terms[i] for i in top_indices]
        #print(term_ranking)
        topics += [", ".join(term_ranking)]
    return topics
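# Usage sketch (not part of the original example): 'chunks' is expected to be a list
# of term-count dicts, one per document, as consumed by DictVectorizer.
demo_chunks = [{'cat': 2, 'purr': 1}, {'dog': 3, 'bark': 1}, {'cat': 1, 'dog': 2}]
for demo_topic in topic_model_nmf(demo_chunks, num_of_topics=2, topic_size=2):
    print(demo_topic)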
Example #7
    def build_stable_topic_model(self):

        matrices = []
        for i in range(Constants.TOPIC_MODEL_PASSES):
            topic_term_matrix = self.build_single_topic_model().transpose()
            matrices.append(topic_term_matrix)

        stack_matrix = numpy.hstack(matrices)
        stack_matrix = normalize(stack_matrix, axis=0)
        stack_matrix = stack_matrix.transpose()

        print "Stack matrix M of size %s" % str(stack_matrix.shape)

        self.topic_model = decomposition.NMF(
            init="nndsvd",
            n_components=self.num_topics,
            max_iter=Constants.TOPIC_MODEL_ITERATIONS,
            alpha=Constants.NMF_REGULARIZATION,
            l1_ratio=Constants.NMF_REGULARIZATION_RATIO)

        self.document_topic_matrix = \
            self.topic_model.fit_transform(stack_matrix)
        self.topic_term_matrix = self.topic_model.components_

        row_sums = self.topic_term_matrix.sum(axis=1)
        self.topic_term_matrix /= row_sums[:, numpy.newaxis]

        print "Generated factor W of size %s and factor H of size %s" % (str(
            self.document_topic_matrix.shape), str(
                self.topic_term_matrix.shape))
Example #8
def NMF(arquivo, nt):
	# 'input' must be 'filename', 'file' or 'content'; 'content' matches how fit_transform is called below
	vector = text.CountVectorizer(input='content', stop_words='english', min_df=1, strip_accents='unicode')
	arqArray = vector.fit_transform(arquivo).toarray()
	vocabulario = np.array(vector.get_feature_names())
	
	ntw = arqArray.shape[0]
	
	#NMF
	num_topics = nt
	num_top_words = ntw
	# Decomposition
	clf = decomposition.NMF(n_components=num_topics, random_state=1)
	doctopic = clf.fit_transform(arqArray)

	topic_words = []
	for topic in clf.components_:
		word_idx = np.argsort(topic)[::-1][0:num_top_words]
		topic_words.append([vocabulario[i] for i in word_idx])
		
	with np.errstate(invalid='ignore'):
		doctopic = doctopic / np.sum(doctopic, axis=1, keepdims=True)
	
	
	#print(doctopic)

	FechaDocumento(arquivo)

	return(doctopic)
Example #9
 def NMF(self, N_component):  # Non-negative Matrix Factorization
     NMF_calculator = skdecomp.NMF(N_component, max_iter=1500, tol=0.0001)
     self.NMFed_data = NMF_calculator.fit(self.vector)
     all_NMFs = self.NMFed_data.components_
     pp.save_variable(all_NMFs,save_folder+r'\\NMFed_Data.pkl')
     print('NMF calculation done, generating graphs')
     self.cell_graph_plot('NMF',all_NMFs)
Example #10
def fast_ai_nlp_nmf(content):
    nltk.download('stopwords')
    stop_words = stopwords.words('russian')

    # class LemmaTokenizer(object):
    #     def __init__(self):
    #         self.wnl = stem.WordNetLemmatizer()
    #
    #     def __call__(self, doc):
    #         return [self.wnl.lemmatize(word) for word in word_tokenize(doc)]

    vectorizer = CountVectorizer(stop_words=stop_words) #, tokenizer=LemmaTokenizer())
    vectors = vectorizer.fit_transform(content).todense()  # (documents, vocab)
    vocab = np.array(vectorizer.get_feature_names())
    m, n = vectors.shape
    d = 5  # num topics
    clf = decomposition.NMF(n_components=d, random_state=1)
    W1 = clf.fit_transform(vectors)
    H1 = clf.components_
    print(show_topics(H1, vocab))
    vectorizer_tfidf = TfidfVectorizer(stop_words=stop_words)
    vectors_tfidf = vectorizer_tfidf.fit_transform(content)  # (documents, vocab)
    W1 = clf.fit_transform(vectors_tfidf)
    H1 = clf.components_
    print(show_topics(H1, vocab))
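# Assumption: show_topics() is called above but not defined in this excerpt. A
# plausible stand-in matching its call sites (H: topic-term weight matrix,
# vocab: array of terms) would be:
def show_topics(H, vocab, num_top_words=8):
    return [' '.join(vocab[i] for i in np.argsort(topic)[::-1][:num_top_words])
            for topic in H]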
Example #11
    def apply(self, k=-1, alpha=1.0, l1=0.75, max_iter=100, rel_err=1e-3):
        if k == -1:
            k = self.num_cluster
        X = self.pre_processing()

        fixed_W = pd.get_dummies(self.labels)
        # sklearn can only hold H fixed while optimizing W, so we transpose the
        # problem and let the fixed W play the role of H (the factors swap roles).
        fixed_W_t = fixed_W.T
        learned_H_t, fixed_W_t_same, n_iter = decomp.non_negative_factorization(
            X.astype(float), n_components=k, init='custom', random_state=0,
            update_H=False, H=fixed_W_t.astype(float), alpha=alpha, l1_ratio=l1,
            max_iter=max_iter, shuffle=True, solver='cd', tol=rel_err, verbose=0)

        init_W = fixed_W_t_same.T
        init_H = learned_H_t.T

        nmf = decomp.NMF(alpha=alpha, init='custom', l1_ratio=l1, max_iter=max_iter,
                         n_components=k, random_state=0, shuffle=True, solver='cd',
                         tol=rel_err, verbose=0)
        W = nmf.fit_transform(X.T, W=init_W, H=init_H)
        H = nmf.components_
        self.cluster_labels = np.argmax(W, axis=1)

        if np.any(np.isnan(H)):
            raise Exception('H contains NaNs (alpha={0}, k={1}, l1={2}, data={3}x{4}'.format(
                alpha, k, l1, X.shape[0], X.shape[1]))
        if np.any(np.isnan(W)):
            raise Exception('W contains NaNs (alpha={0}, k={1}, l1={2}, data={3}x{4}'.format(
                alpha, k, l1, X.shape[0], X.shape[1]))

        # self.print_reconstruction_error(X, W, H)
        self.dictionary = H.T
        self.data_matrix = W.T
Example #12
 def __get_implementation(self, method: Method, n_components):
     # decomposition
     if method == self.Method.PCA:
         return self.Implementation(
             'pca',
             decomposition.PCA(n_components=n_components,
                               svd_solver='randomized',
                               whiten=True))
     elif method == self.Method.ICA:
         return self.Implementation(
             'ica',
             decomposition.FastICA(n_components=n_components,
                                   whiten=True,
                                   max_iter=1000))
     elif method == self.Method.FA:
         return self.Implementation(
             'fa', decomposition.FactorAnalysis(n_components=n_components))
     elif method == self.Method.TSVD:
         return self.Implementation(
             'tsvd', decomposition.TruncatedSVD(n_components=n_components))
     elif method == self.Method.NMF:
         return self.Implementation(
             'nmf', decomposition.NMF(n_components=n_components))
     #clustering
     elif method == self.Method.KMEANS:
         return self.Implementation(
             'kmeans',
             cluster.MiniBatchKMeans(n_clusters=n_components, tol=1e-3))
     else:
         raise Exception(
             'Error creating estimator. Invalid Type specified.')
Example #13
 def apply(self,
           k=-1,
           mix=0.0,
           reject_ratio=0.,
           alpha=1.0,
           l1=0.75,
           max_iter=100,
           rel_err=1e-3,
           calc_transferability=False):
     if k == -1:
         k = self.num_cluster
     mixed_data, new_trg_data, trg_data = self.get_mixed_data(
         mix=mix,
         reject_ratio=reject_ratio,
         max_iter=max_iter,
         rel_err=rel_err,
         calc_transferability=calc_transferability)
     nmf = decomp.NMF(alpha=alpha,
                      init='nndsvdar',
                      l1_ratio=l1,
                      max_iter=max_iter,
                      n_components=k,
                      random_state=0,
                      shuffle=True,
                      solver='cd',
                      tol=1e-6,
                      verbose=0)
     W = nmf.fit_transform(mixed_data)
     H = nmf.components_
     self.dictionary = W
     self.data_matrix = H
     self.cluster_labels = np.argmax(nmf.components_, axis=0)
     self.mixed_data = mixed_data
Example #14
 def create_model(self, k):
     A, terms = self.vectorize()
     model = decomposition.NMF(init="nndsvd", n_components=k)
     # apply the model and extract the two factor matrices
     W = model.fit_transform(A)
     H = model.components_
     return W, H
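 # Follow-up sketch (not part of the original example): a helper, assuming 'terms'
 # returned by vectorize() is aligned with the columns of H, that lists the
 # top-ranked terms of each topic.
 def top_terms(self, H, terms, n=10):
     import numpy as np
     return [[terms[i] for i in np.argsort(row)[::-1][:n]] for row in H]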
Example #15
def write_topics(ftopics, fwords, ftopics_words, poem_words, n_topic,
                 n_topic_words):
    """
    Extract all topics from the Tang poetry corpus.
    :param ftopics: output text file for the topic word lists
    :param fwords: output pickle file for the vocabulary
    :param ftopics_words: output pickle file for the topic-word matrix
    :param poem_words: corpus of poem texts
    :param n_topic: number of topics
    :param n_topic_words: number of top words to keep per topic
    :return:
    """
    count_matrix = count_vect.fit_transform(poem_words)
    tfidf = TfidfTransformer().fit_transform(count_matrix)
    nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
    feature_names = count_vect.get_feature_names()
    fw = codecs.open(ftopics, 'w', 'utf-8')
    for topic in nmf.components_:
        fw.write(' '.join([
            feature_names[i] for i in topic.argsort()[:-n_topic_words - 1:-1]
        ]) + '\n')
    fw.close()
    print('Write topics done.')
    fw = codecs.open(fwords, 'wb')
    pickle.dump(feature_names, fw)
    fw.close()
    print('Write words done.')
    fw = codecs.open(ftopics_words, 'wb')
    pickle.dump(nmf.components_, fw)
    fw.close()
    print('Write topic_words done.')
Example #16
    def nmf_topic_modeling(self):
        corpus = []
        client = MongoDB()
        tweetsCollection = client.get_collection('florida')
        findCorpusQuery = tweetsCollection.find()

        for fileid in findCorpusQuery:
            corpus.append(fileid['text'])

        cachedStopWords = set(STOPWORDS)
        cachedStopWords.update(
            ('well', 'say', 'New', 'RT', 'now', 'https', 'via', 'CIA', 'make', 'says', 'new', 'will', 'said',
             'take', 'amp', 'one', 'go', 'know', 'day', 'look', 'think', 'gt', 'lt', 'co', 'rt', 'zbvnkkvl48', 'gtwkxla6bn',
             'cia', 'elizasoul80', '5febckimwt', 'af', 'ex', 'realdonaldtrump', 'judgejeanine', 'epucokw6b3', 'vqpgqgfwye',
             'mr', '12', '61m', 'putin', 're', 'donthecon', 'let', 'ass', 'rep', 'still', 'much', 'steph93065', 'dems'))

        vectorizer = TfidfVectorizer(stop_words=cachedStopWords, min_df=2)
        dtm = vectorizer.fit_transform(corpus)
        vocab = vectorizer.get_feature_names()

        num_topics = 5

        clf = decomposition.NMF(n_components=num_topics, random_state=1)
        doctopic = clf.fit_transform(dtm)

        topic_words = []
        num_top_words = 5
        for topic in clf.components_:
            word_idx = np.argsort(topic)[::-1][0:num_top_words]  # get indexes with highest weights
            topic_words.append([vocab[i] for i in word_idx])

        for t in range(len(topic_words)):
            print "Topic {}: {}".format(t, ' '.join(topic_words[t][:15]))
Example #17
    def apply(self, k=-1, alpha=1.0, l1=0.75, max_iter=4000, rel_err=1e-3):
        if k == -1:
            k = self.num_cluster
        X = self.pre_processing()

        nmf = decomp.NMF(alpha=alpha,
                         init='nndsvdar',
                         l1_ratio=l1,
                         max_iter=1000,
                         n_components=k,
                         random_state=0,
                         shuffle=True,
                         solver='cd',
                         tol=0.00001,
                         verbose=0)

        W = nmf.fit_transform(X)
        H = nmf.components_
        self.cluster_labels = np.argmax(nmf.components_, axis=0)

        if np.any(np.isnan(H)):
            raise Exception(
                'H contains NaNs (alpha={0}, k={1}, l1={2}, data={3}x{4}'.
                format(alpha, k, l1, X.shape[0], X.shape[1]))
        if np.any(np.isnan(W)):
            raise Exception(
                'W contains NaNs (alpha={0}, k={1}, l1={2}, data={3}x{4}'.
                format(alpha, k, l1, X.shape[0], X.shape[1]))

        self.print_reconstruction_error(X, W, H)
        self.dictionary = W
        self.data_matrix = H
Example #18
def code_sklearn(arr,
                 method=None,
                 init=None,
                 c=None,
                 params=None,
                 transform=True):
    """Non-negative matrix factorization using scikits-learn. 

     - arr(``path``) input array (loci x scaled features) see: ``scale_features``.
     - c(``int``) number of expected histone codes (factorization rank).
     - init(``str``) matrix factorization initialization method.

    """
    chk_exit(*inp_file(path(arr)))
    with open(arr) as fh:
        bam_names = fh.readline().strip().split("\t")
        bam_scaled = np.loadtxt(fh, delimiter="\t")
    kwargs = parse_params(params, {"max_iter": 1000})
    nmf = decomposition.NMF(n_components=c,
                            init=init,
                            sparseness='components',
                            **kwargs)
    nmf.fit(bam_scaled)
    ofn_epi = arr.replace(
        ".arr", "_%s-c#%s-i#%s-p#%s.epi" % ("pgnmf", c, init, (params or "")))
    ofn_arr = arr.replace(
        ".arr", "_%s-c#%s-i#%s-p#%s.arr" % ("pgnmf", c, init, (params or "")))
    write_codes(ofn_epi, nmf.components_, bam_names)
    if transform:
        bam_transformed = nmf.transform(bam_scaled)
        write_values(ofn_arr, bam_transformed, c)
    return ofn_epi, ofn_arr
Example #19
    def nmf(self,
            n_components,
            transform_df=False,
            transform_test_df=False,
            seed=1,
            init='nndsvdar',
            tol=5e-3):
        self.nmf_fit = decomposition.NMF(n_components=n_components,
                                         init=init,
                                         tol=tol,
                                         random_state=seed)
        self.nmf_df = self.nmf_fit.fit_transform(self.df)
        colnames = ['nmf_{}'.format(x) for x in range(n_components)]

        self.nmf_df = pd.DataFrame(self.nmf_df,
                                   index=self.df.index,
                                   columns=colnames)
        self.nmf_weights = pd.DataFrame(self.nmf_fit.components_,
                                        columns=self.df.columns,
                                        index=colnames)
        if transform_df:
            out_df = self.nmf_fit.transform(self.df)
            return out_df

        if transform_test_df:
            self.nmf_test_df = self.nmf_fit.transform(self.test_df)
Example #20
    def extractTopic(self):
        """	* Tokenize the all words
			* Eliminates any word with less than two letters
			* Forms term frequency–inverse document frequency for each word
			* Generate Document-term_matrix
			* Classify topics based on Document-term_matrix and frequency–inverse document frequency 
			* Gathers first "N" numbers from each topic 
		"""

        self.vectorizer = text.CountVectorizer(input='filename',
                                               stop_words='english',
                                               min_df=2)

        for x in range(len(self.fileNames)):
            temp = self.fileNames[x]
            self.fileNames[x] = self.baseDirectory + temp

        self.dtm = self.vectorizer.fit_transform(self.fileNames).toarray()
        self.vocab = np.array(self.vectorizer.get_feature_names())
        self.clf = decomposition.NMF(n_components=self.num_topics,
                                     random_state=1)

        self.doctopic = self.clf.fit_transform(self.dtm)

        for topic in self.clf.components_:
            word_idx = np.argsort(topic)[::-1][0:self.num_top_words]
            self.topic_words.append([self.vocab[i] for i in word_idx])

        for t in range(len(self.topic_words)):
            print("Topic {}: {}".format(t, ' '.join(self.topic_words[t][:15])))
    def transform(self, df, y=None):
        pca = decomposition.PCA(n_components=self.n_components,
                                random_state=self.random_state)
        pca_train = pca.fit_transform(df)

        ica = decomposition.FastICA(n_components=self.n_components,
                                    random_state=self.random_state)
        ica_train = ica.fit_transform(df)

        tsvd = decomposition.TruncatedSVD(n_components=self.n_components,
                                          random_state=self.random_state)
        tsvd_train = tsvd.fit_transform(df)

        nmf = decomposition.NMF(n_components=self.n_components,
                                random_state=self.random_state)
        nmf_train = nmf.fit_transform(df)

        for i in range(1, self.n_components + 1):
            df['pca_' + str(i)] = pca_train[:, i - 1]

            df['ica_' + str(i)] = ica_train[:, i - 1]

            df['tsvd_' + str(i)] = tsvd_train[:, i - 1]

            df['nmf_' + str(i)] = nmf_train[:, i - 1]

        return df
Example #22
    def find_NMF_topics(self):
        """
        :param num_topics:
        :param num_top_words: number of top words to keep for each topic
        :return:
        """

        vectorizer = text.CountVectorizer(input='filename',
                                          stop_words='english',
                                          min_df=self.min_df,
                                          max_df=self.max_df)
        dtm = vectorizer.fit_transform(self.all_documents).toarray()

        vocab = np.array(vectorizer.get_feature_names())
        clf = decomposition.NMF(n_components=self.num_topics, random_state=1)

        # document-topic matrix: how strongly each document is associated with each topic
        self.doctopic = clf.fit_transform(dtm)

        self.topic_words = []
        for topic in clf.components_:
            word_idx = np.argsort(topic)[::-1][0:self.num_top_words]
            self.topic_words.append([vocab[i] for i in word_idx])

        return
def extract_topics_nmf(data_samples, preprocessor, n_features, n_topics, n_top_words, n_gram_range=(1,1), more_stopwords=None):
     
    nmf = decomposer.NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5)
    
    topics_words = _extract_topics_decomposer(data_samples, preprocessor, nmf, n_features, n_topics, n_top_words, n_gram_range, more_stopwords)

    return topics_words
Example #24
    def get_cluster_membership(self):
        """ Determine the cluster number that each sample is associated with. """

        model = skld.NMF(n_components=self._num_clusters,
                         init='random',
                         beta=.3,
                         eta=.5,
                         max_iter=5000,
                         nls_max_iter=10000)

        w = model.fit_transform(self._matrix)
        h = model.components_

        # convert the 'H' matrix, which represents weights for our data matrix W, into
        # an array representing cluster membership. Index of biggest value in each
        # col of matrix H is the cluster
        clusters = []
        model_width = len(h[0])

        for col_idx in range(model_width):
            # convert column into an array
            col_vals = h[:, col_idx]

            # determine the biggest row index and its value from the array
            (row_idx, max_val) = max(enumerate(col_vals), key=lambda x: x[1])

            clusters.append(row_idx)

        # clusters array, w, h
        return (clusters, w, h)
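        # Note (not part of the original example): the column-wise loop above is
        # equivalent to the vectorized form clusters = np.argmax(h, axis=0).tolist(),
        # which also picks the row (cluster) with the largest weight in each column.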
Example #25
def init_params(X, S, Kpart, nmf_noise=1e-2):
    F, N, J = S.shape
    I = X.shape[2]
    W = []
    H = []
    for j in range(J):
        model = skd.NMF(n_components=Kpart[j], init='random', random_state=0)
        s_2 = squared_module(S[:, :, j])
        W.append(
            model.fit_transform(s_2) +
            np.random.uniform(0, nmf_noise, (F, Kpart[j])))
        H.append(model.components_ +
                 np.random.uniform(0, nmf_noise, (Kpart[j], N)))

    W = np.concatenate(tuple(W), axis=1).astype(complex)
    H = np.concatenate(tuple(H), axis=0).astype(complex)
    A = np.zeros((F, I, J), dtype=complex)
    sigma_b = np.zeros(F, dtype=complex)

    Rxx = np.zeros((F, I, I), dtype=complex)
    Rxs = np.zeros((F, I, J), dtype=complex)
    Rss = np.zeros((F, J, J), dtype=complex)

    for f in range(F):
        Rxx[f] = r_hat(X[f])
        Rxs[f] = r_hat(X[f], S[f])
        Rss[f] = r_hat(S[f])
        A[f] = Rxs[f].dot(np.linalg.inv(Rss[f]))
        sigma_b[f] = np.mean(
            np.real(
                np.diagonal(Rxx[f] - A[f].dot(np.matrix(Rxs[f]).getH()) -
                            Rxs[f].dot(np.matrix(A[f]).getH()) +
                            A[f].dot(Rss[f].dot(np.matrix(A[f]).getH())))))
    return A, W, H, sigma_b, Rxx, Rxs, Rss
Example #26
def nmf(bag_of_words, vocab, topics, top_words):
    dtm = bag_of_words.toarray()

    from sklearn import decomposition
    num_topics = topics
    num_top_words = top_words
    clf = decomposition.NMF(n_components=num_topics, random_state=1)

    doctopic = clf.fit_transform(dtm)

    #print words associated with topics
    topic_words = []

    for topic in clf.components_:
        word_idx = np.argsort(topic)[::-1][0:num_top_words]
        topic_words.append([vocab[i] for i in word_idx])

    #make visualization easier
    #doctopic = doctopic/np.sum(doctopic, axis = 1, keepdims=True)
    #print "doctopic matrix"
    #print doctopic[0:5]


    print "show top 15 words"
    for t in range(len(topic_words)):
        print ("topic {}: {}".format(t, ' '.join(topic_words[t][:15])))

    return doctopic
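# Usage sketch (not part of the original example): build a tiny bag-of-words matrix
# and run the function; 'vocab' must be indexable by the column indices of the matrix.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
demo_docs = ['cats purr softly', 'dogs bark loudly', 'cats chase dogs daily']
cv = CountVectorizer()
demo_bow = cv.fit_transform(demo_docs)
demo_doctopic = nmf(demo_bow, cv.get_feature_names(), topics=2, top_words=3)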
Example #27
def generate_topics_NMF(pickle_file, state, no_of_topics):
    corpus = []
    tweet_list = pickle.load(open(pickle_file, 'rb'))
    tweet_corpus = []

    # Removing Stopwords
    for tweet in tweet_list:
        cln_tweet = ''
        for wrd in tweet.split():
            if wrd not in stopwords:
                cln_tweet += ' ' + wrd
        tweet_corpus.append(cln_tweet)

    # NMF
    vectorizer = TfidfVectorizer(stop_words='english', min_df=2)
    dtm = vectorizer.fit_transform(tweet_corpus)
    vocab = vectorizer.get_feature_names()
    num_topics = no_of_topics
    clf = decomposition.NMF(n_components=num_topics, random_state=1)
    doctopic = clf.fit_transform(dtm)

    topic_words = []
    num_top_words = 6
    for topic in clf.components_:
        word_idx = np.argsort(topic)[::-1][0:num_top_words]
        topic_words.append([vocab[i] for i in word_idx])

    # Saving data to a text file
    with open('op4_topics_{}_{}.txt'.format(pickle_file.split('.')[0], state),
              'w') as outfile:
        outfile.write('{}\nPossible topics in the given data are\n{}\n'.format(
            '==' * 50, '==' * 50))
        for t in range(len(topic_words)):
            outfile.write('Topic {}: {}\n'.format(
                t + 1, ' '.join(topic_words[t][:15])))
    def nmf_topic_modeling(self):
        corpus = []
        client = MongoDB()
        tweetsCollection = client.get_collection('newmexico')
        findCorpusQuery = tweetsCollection.find()

        for fileid in findCorpusQuery:
            corpus.append(fileid['text'])

        cachedStopWords = set(STOPWORDS)
        cachedStopWords.update(
            ('well', 'say', 'New', 'RT', 'now', 'https', 'via', 'CIA', 'make', 'says', 'new', 'will', 'said',
             'take', 'amp', 'one', 'go', 'know', 'day', 'look', 'think', 'gt', 'lt', 'co', 'rt', 'zbvnkkvl48', 'gtwkxla6bn',
             'cia', 'elizasoul80', '5febckimwt', 'af', 'ex', 'putin', 'realdonaldtrump', 'msm', 'dworkinreport',
             'knightstrumplar', 'vp', 'ur', 'us', 'join', 'trumppence', 'thanx', 'pvtjokerus', 're', 'see', 'mayflowerscandal', 'agree',
             'maga', 'wh', 'yeah'))

        vectorizer = TfidfVectorizer(stop_words=cachedStopWords, min_df=2)
        dtm = vectorizer.fit_transform(corpus)
        vocab = vectorizer.get_feature_names()

        num_topics = 5

        clf = decomposition.NMF(n_components=num_topics, random_state=1)
        doctopic = clf.fit_transform(dtm)

        topic_words = []
        num_top_words = 5
        for topic in clf.components_:
            word_idx = np.argsort(topic)[::-1][0:num_top_words]  # get indexes with highest weights
            topic_words.append([vocab[i] for i in word_idx])

        for t in range(len(topic_words)):
            print "Topic {}: {}".format(t, ' '.join(topic_words[t][:15]))
Example #29
def topic_extractions(df):
    vectorizer = CountVectorizer(stop_words='english')
    X_train_dtm = vectorizer.fit_transform(df['article'])
    vocab = np.array(vectorizer.get_feature_names())

    # Generating Decomposition Model to extract topics
    clf = decomposition.NMF(n_components=NUM_TOPICS, random_state=1)
    doctopic = clf.fit_transform(X_train_dtm)

    # Generating dominant topics for each words
    topic_words = []
    for topic in clf.components_:
        word_idx = np.argsort(topic)[::-1][0:NUM_TOP_WORDS]
        topic_words.append([vocab[i] for i in word_idx])

    # Making DataFrame that gets the doctopic (values of topics for each text)
    dftopic = pd.DataFrame(doctopic, columns=topic_words)
    dftopicinv = dftopic.T

    # Getting the dominant topic
    topic_series = []
    for i in np.arange(dftopic.shape[0]):
        topic_series.append(dftopicinv[i].idxmax())

    df['toptopic'] = topic_series

    return df
Example #30
def write_topics(ftopics, fwords, ftopics_words, poem_words, n_topic, n_topic_words):
    count_matrix = count_vect.fit_transform(poem_words)
    tfidf = TfidfTransformer().fit_transform(count_matrix)
    nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
    feature_names = count_vect.get_feature_names()
    fw = codecs.open(ftopics, 'w', 'utf-8')
    for topic in nmf.components_:
        fw.write(' '.join([feature_names[i] for i in topic.argsort()[:-n_topic_words - 1:-1]]) + '\n')
    fw.close()
    print('Write topics done.')
    fw = codecs.open(fwords, 'wb')
    flen = len(feature_names)
    #for i in range(flen):
        #fw.write(feature_names[i])
        #fw.write('\n')
    pickle.dump(feature_names, fw)
    #feature_names = feature_names.encode('gbk')
    fw.close()
    print('Write words done.')
    fw = codecs.open(ftopics_words, 'wb')
    nmf.components_.tolist()
    nmflen = len(nmf.components_)
    #for i in range(nmflen):
        #print(nmf.components_[i])
        #print(type(nmf.components_[i]))
        #fw.write(nmf.components_[i])
        #fw.write('\n')    
    pickle.dump(nmf.components_, fw)
    print((nmf.components_))
    fw.close()
    print('Write topic_words done.')