Example #1
 def train(self, df):
     # One topic-score dict per message sender.
     self.user_dict = {
         el: self.topic_dict.copy()
         for el in df.sender.unique()
     }
     cv = CV(stop_words='english')
     X = cv.fit_transform(df['context'])
     # Map feature indices to words (vocabulary_ maps word -> index).
     self.worddict = {v: k for k, v in cv.vocabulary_.items()}
     self.mydict = Dictionary.from_corpus(
         matutils.Sparse2Corpus(X, documents_columns=False),
         id2word=self.worddict)
     self.model = LatentDA.LdaModel(
         matutils.Sparse2Corpus(X, documents_columns=False),
         num_topics=self.numtopics,
         passes=20,
         id2word=self.worddict)
     for _, row in df.iterrows():
         if row['context'] == '':
             continue
         values = self.model[self.mydict.doc2bow(row['context'].split())]
         for topic_id, weight in values:
             if topic_id in self.user_dict[row.sender]:
                 if row.amt == '':
                     continue
                 self.user_dict[row.sender][topic_id] += weight * float(row.amt)
             else:
                 self.user_dict[row.sender][topic_id] = weight
     # Normalize each sender's topic scores to sum to 1.
     for user in self.user_dict:
         norm_const = sum(self.user_dict[user].values())
         for topic in self.user_dict[user]:
             self.user_dict[user][topic] /= norm_const
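A minimal sketch of the imports and aliases this snippet appears to assume; `CV`, `LatentDA`, and `Dictionary` are not defined in the excerpt, so these are inferred from usage:

# Assumed imports for the snippet above (aliases inferred from usage).
from sklearn.feature_extraction.text import CountVectorizer as CV
from gensim import matutils
from gensim.corpora import Dictionary
from gensim.models import ldamodel as LatentDA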
Example #2
    def prepare_corpus(self, doc, gram=(1, 2), option='c'):

        if option == 'c':
            # Create a new document-term matrix using only nouns and adjectives, also remove common words with max_df
            cvna = CountVectorizer(tokenizer=self.parse_text,
                                   ngram_range=gram,
                                   stop_words=stop_fr,
                                   strip_accents='ascii',
                                   max_df=.8)
            data_cvna = cvna.fit_transform(doc)
            data_dtmna = pd.DataFrame(data_cvna.toarray(),
                                      columns=cvna.get_feature_names())
            # Create the gensim corpus (term_document matrix)
            doc_term_matrix = matutils.Sparse2Corpus(
                scipy.sparse.csr_matrix(data_dtmna.transpose()))
            # Create the vocabulary dictionary
            dictionary = dict((v, k) for k, v in cvna.vocabulary_.items())
        elif option == 'tf':
            tfna = TfidfVectorizer(tokenizer=self.parse_text,
                                   ngram_range=gram,
                                   stop_words=stop_fr,
                                   strip_accents='ascii',
                                   max_df=.8)
            data_tfna = tfna.fit_transform(doc)
            data_dtmna = pd.DataFrame(data_tfna.toarray(),
                                      columns=tfna.get_feature_names())
            # Create the gensim corpus (term_document matrix)
            doc_term_matrix = matutils.Sparse2Corpus(
                scipy.sparse.csr_matrix(data_dtmna.transpose()))
            # Create the vocabulary dictionary
            dictionary = dict((v, k) for k, v in tfna.vocabulary_.items())

        else:
            raise ValueError("option must be 'c' (counts) or 'tf' (tf-idf)")

        return dictionary, doc_term_matrix
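A hypothetical call, assuming an instance `prep` of the surrounding class and a list of raw documents `docs`; the returned pair feeds directly into gensim:

from gensim.models import LdaModel

dictionary, doc_term_matrix = prep.prepare_corpus(docs, gram=(1, 2), option='c')
lda = LdaModel(doc_term_matrix, num_topics=10, passes=20, id2word=dictionary)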
Example #3
def cluster(sentences):

    my_stop_words = {'okay', 'don', 've', 'didn', 'know', 'think', 'really'}

    corpus = [c['text'].replace("%hesitation", "").lower() for c in sentences]

    corpus = np.array(corpus)
    tf_vectorizer = TfidfVectorizer(decode_error='ignore',
                                    max_df=0.7,
                                    stop_words=my_stop_words.union(stop_words),
                                    ngram_range=(1, 1))

    tf_mat = tf_vectorizer.fit_transform(corpus)
    id2word = {i: s for i, s in enumerate(tf_vectorizer.get_feature_names())}
    n_topics = 5

    lsi = LsiModel(matutils.Sparse2Corpus(tf_mat.T),
                   num_topics=n_topics,
                   id2word=id2word,
                   onepass=False)
    gs_lsi_mat = lsi[matutils.Sparse2Corpus(tf_mat.T)]
    lsi_mat = matutils.corpus2dense(gs_lsi_mat, n_topics).T
    norm = Normalizer(copy=False)
    lsi_mat = norm.fit_transform(lsi_mat)

    valid_indices = np.where(lsi_mat.any(axis=1))[0]
    valid_sent = lsi_mat[valid_indices]

    n_clusters = 7

    cluster = KMeans(n_clusters, n_init=100)
    cluster.fit(valid_sent)

    clusters = {}
    for i in range(n_clusters):
        clusters[i] = np.where(cluster.labels_ == i)[0]

    # Drop clusters whose total scatter exceeds the average per-cluster inertia.
    # Iterate over a snapshot of the keys so deletion is safe in Python 3.
    for i in list(clusters.keys()):
        if np.sum(
                np.square(valid_sent[clusters[i]] -
                          cluster.cluster_centers_[i])) > cluster.inertia_ / n_clusters:
            del clusters[i]

    last_cluster = [
        valid_indices[clusters[i][np.where(
            np.sum(np.square(valid_sent[clusters[i]] -
                             cluster.cluster_centers_[i]),
                   axis=1) < cluster.inertia_ / len(corpus))]].tolist()
        for i in clusters
    ]
    return last_cluster
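A hypothetical call: the function expects a list of dicts carrying a 'text' field, and `stop_words` (referenced above) is assumed to be a stop-word collection imported elsewhere in the module. KMeans needs at least `n_clusters` (7) sentences to fit:

sentences = [{'text': line.strip()} for line in open('transcript.txt')]
groups = cluster(sentences)  # list of index lists, one per retained cluster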
Example #4
    def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : {iterable of list of (int, number), scipy.sparse matrix}
            A collection of documents in BOW format used for training the model.

        Returns
        -------
        :class:`~gensim.sklearn_api.hdp.HdpTransformer`
            The trained model.

        """
        if sparse.issparse(X):
            corpus = matutils.Sparse2Corpus(sparse=X, documents_columns=False)
        else:
            corpus = X

        self.gensim_model = models.HdpModel(
            corpus=corpus, id2word=self.id2word, max_chunks=self.max_chunks,
            max_time=self.max_time, chunksize=self.chunksize, kappa=self.kappa, tau=self.tau,
            K=self.K, T=self.T, alpha=self.alpha, gamma=self.gamma, eta=self.eta, scale=self.scale,
            var_converge=self.var_converge, outputdir=self.outputdir, random_state=self.random_state
        )
        return self
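A minimal sketch of calling this `fit` with a sparse document-term matrix (documents as rows, hence `documents_columns=False` above); it assumes gensim 3.x, where `gensim.sklearn_api` ships `HdpTransformer`:

from sklearn.feature_extraction.text import CountVectorizer
from gensim.sklearn_api import HdpTransformer

texts = ["human machine interface", "graph of trees", "machine learning on graphs"]
vec = CountVectorizer()
X = vec.fit_transform(texts)                         # docs x terms, CSR
id2word = {i: w for w, i in vec.vocabulary_.items()}
hdp = HdpTransformer(id2word=id2word).fit(X)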
Example #5
    def transform(self, docs):
        """
        Takes documents in BOW format ('docs') and returns a matrix of topic
        distributions, where entry (i, j) is the probability of topic j in
        document i.

        `docs` can be a list of documents like
        [[(4, 1), (7, 1)], [(9, 1), (13, 1)], [(2, 1), (6, 1)]],
        a single document like [(4, 1), (7, 1)], or a scipy sparse matrix.
        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        if sparse.issparse(docs):
            docs = matutils.Sparse2Corpus(docs, documents_columns=False)
        elif docs and isinstance(docs[0], tuple):
            # A single BOW document was passed; wrap it in a list.
            docs = [docs]
        X = [[] for _ in range(len(docs))]
        for k, v in enumerate(docs):
            doc_topics = self.gensim_model[v]
            probs_docs = [prob for _, prob in doc_topics]
            # Pad with near-zero probabilities so every row has num_topics entries.
            if len(probs_docs) != self.num_topics:
                probs_docs.extend([1e-12] * (self.num_topics - len(probs_docs)))
            X[k] = probs_docs
        return np.reshape(np.array(X), (len(docs), self.num_topics))

    def predict_On_Unseen_Corpus(self, new_dataFrameWithText):

        # Take raw text, keep only nouns and adjectives, convert to BOW:
        new_data_dtmna = clean_df_text_from_nounsAndAdj(new_dataFrameWithText, self.textColumnName, self.applyStemming,
                                                        minCountThreshold=self.minCountThreshold, maxCountThreshold=self.maxCountThreshold)

        # Drop features unseen in the main corpus, then transpose to a term-document matrix.
        print("Shape of new dataset: %s" % str(new_data_dtmna.shape))
        new_termDocumentMatric = new_data_dtmna.filter(items=self.mainCorpusFeatures).T
        print("Shape of filtered term-document matrix: %s" % str(new_termDocumentMatric.shape))

        self.new_dataFrameWithText = new_dataFrameWithText
        self.new_termDocumentMatric = new_termDocumentMatric

        new_Corpus = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(new_termDocumentMatric))
        self.new_Corpus = new_Corpus

        topicDict_new = assign_Topic_To_document(self.lda_nounAdj,  # previously trained LDA model
                                                 new_Corpus,  # the new corpus
                                                 self.id2word_nounAdj,  # existing id2word dictionary
                                                 new_dataFrameWithText,
                                                 self.textColumnName)

        self.topicDict_new = topicDict_new

        return topicDict_new, new_Corpus
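For instance, a sketch of calling `transform` above on a fitted wrapper; `model` stands in for a fitted instance:

docs = [[(4, 1), (7, 1)], [(9, 1), (13, 1)]]
topic_matrix = model.transform(docs)          # shape (len(docs), num_topics)
single = model.transform([(4, 1), (7, 1)])    # a lone BOW document also works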
Example #7
    def add_features(corpus,
                     X,
                     dictionary,
                     compute_values=None,
                     var_attrs=None):
        order = np.argsort([dictionary[i] for i in range(len(dictionary))])
        if compute_values is not None:
            compute_values = np.array(compute_values)[order]

        variable_attrs = {
            'hidden': True,
            'skip-normalization': True,
        }
        if isinstance(var_attrs, dict):
            variable_attrs.update(var_attrs)

        feature_names = [dictionary[i] for i in order]
        corpus = corpus.extend_attributes(X[:, order],
                                          feature_names=feature_names,
                                          var_attrs=variable_attrs,
                                          compute_values=compute_values,
                                          sparse=True,
                                          rename_existing=True)
        corpus.ngrams_corpus = matutils.Sparse2Corpus(X.T)
        return corpus

    def fit(self, X):
        """
        Fit the corpus X to this model instance.
        Calls gensim.models.LdaModel:
        >>> gensim.models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=id2word,
        ...                        passes=passes, update_every=update_every, alpha=alpha,
        ...                        iterations=iterations, eta=eta, random_state=random_state)
        """
        if sparse.issparse(X):
            self.corpus = matutils.Sparse2Corpus(X)
        else:
            self.corpus = X

        models.LdaModel.__init__(self,
                                 corpus=self.corpus,
                                 num_topics=self.num_topics,
                                 id2word=self.id2word,
                                 chunksize=self.chunksize,
                                 passes=self.passes,
                                 update_every=self.update_every,
                                 alpha=self.alpha,
                                 eta=self.eta,
                                 decay=self.decay,
                                 offset=self.offset,
                                 eval_every=self.eval_every,
                                 iterations=self.iterations,
                                 gamma_threshold=self.gamma_threshold,
                                 minimum_probability=self.minimum_probability,
                                 random_state=self.random_state)
        return self
Example #9
def reduce_nlp_data(vectorizer, data, n_components, reducer):

    transformed_data = vectorizer.fit_transform(data)
    id2word = {
        identifier: word
        for word, identifier in vectorizer.vocabulary_.items()
    }

    if reducer == 'lda':
        corpus = matutils.Sparse2Corpus(transformed_data.transpose())
        lda = models.LdaModel(corpus=corpus,
                              num_topics=n_components,
                              minimum_probability=0.03,
                              id2word=id2word,
                              passes=10,
                              random_state=42)
        print(lda.print_topics())
        lda_corpus = lda[corpus]
        return lda, matutils.corpus2csc(lda_corpus).toarray().transpose()
    elif reducer == 'svd':
        SVD = TruncatedSVD(n_components, n_iter=10, random_state=42)
        svd_data = SVD.fit_transform(transformed_data)
        get_eigenvectors(SVD, id2word)
        return SVD, svd_data
    elif reducer == 'nmf':
        nmf = NMF(n_components, random_state=42)
        nmf_data = nmf.fit_transform(transformed_data)
        get_eigenvectors(nmf, id2word)
        return nmf, nmf_data

    else:
        return None, None
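A hypothetical call, reducing a few raw documents to five LDA topic dimensions; `get_eigenvectors` in the 'svd' and 'nmf' branches is an external helper of the original module, so the 'lda' path is the self-contained one:

from sklearn.feature_extraction.text import CountVectorizer

docs = ["trees and graphs", "graphs of machines", "machine learning on trees"]
lda_model, topic_matrix = reduce_nlp_data(CountVectorizer(stop_words='english'),
                                          docs, 5, 'lda')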
Example #10
    def partial_fit(self, X):
        """Train model over a potentially incomplete set of documents.

        Uses the parameters set in the constructor.
        This method can be used in two ways:
        * On an unfitted model in which case the model is initialized and trained on `X`.
        * On an already fitted model in which case the model is **updated** by `X`.

        Parameters
        ----------
        X : {iterable of list of (int, number), scipy.sparse matrix}
            A collection of documents in BOW format used for training the model.

        Returns
        -------
        :class:`~gensim.sklearn_api.hdp.HdpTransformer`
            The trained model.

        """
        if sparse.issparse(X):
            X = matutils.Sparse2Corpus(sparse=X, documents_columns=False)

        if self.gensim_model is None:
            self.gensim_model = models.HdpModel(
                id2word=self.id2word, max_chunks=self.max_chunks,
                max_time=self.max_time, chunksize=self.chunksize, kappa=self.kappa, tau=self.tau,
                K=self.K, T=self.T, alpha=self.alpha, gamma=self.gamma, eta=self.eta, scale=self.scale,
                var_converge=self.var_converge, outputdir=self.outputdir, random_state=self.random_state
            )

        self.gensim_model.update(corpus=X)
        return self
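A sketch of the update path described in the docstring: initialize on one batch, then stream a later batch through `partial_fit`, which converts sparse rows exactly as above (gensim 3.x `sklearn_api` assumed):

from sklearn.feature_extraction.text import CountVectorizer
from gensim.sklearn_api import HdpTransformer

vec = CountVectorizer()
X1 = vec.fit_transform(["graph of trees", "trees and nodes"])
X2 = vec.transform(["nodes in a graph"])
id2word = {i: w for w, i in vec.vocabulary_.items()}
hdp = HdpTransformer(id2word=id2word).fit(X1)  # initialize on the first batch
hdp.partial_fit(X2)                            # already fitted: updates in place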
Example #11
    def fit(self, X, y=None):
        """
        Fit the model according to the given training data.
        Calls gensim.models.LdaModel
        """
        if sparse.issparse(X):
            corpus = matutils.Sparse2Corpus(X)
        else:
            corpus = X

        self.gensim_model = models.LdaModel(
            corpus=corpus,
            num_topics=self.num_topics,
            id2word=self.id2word,
            chunksize=self.chunksize,
            passes=self.passes,
            update_every=self.update_every,
            alpha=self.alpha,
            eta=self.eta,
            decay=self.decay,
            offset=self.offset,
            eval_every=self.eval_every,
            iterations=self.iterations,
            gamma_threshold=self.gamma_threshold,
            minimum_probability=self.minimum_probability,
            random_state=self.random_state)
        return self
Example #12
def evaluate_improved_cllsi(x_train1_in, x_test1_in, x_train2_in, x_test2_in,
                            dimensions, evaluation_function):
    scores = []

    for k in dimensions:
        x_train1, x_test1 = tfidf(data=(x_train1_in, x_test1_in))
        x_train2, x_test2 = tfidf(data=(x_train2_in, x_test2_in))

        n_train, n_test = len(x_train1), len(x_test1)

        X1 = matutils.corpus2csc(list(x_train1) + list(x_test1))
        X2 = matutils.corpus2csc(list(x_train2) + list(x_test2))

        x_train1, x_train2 = X1[:, :n_train], X2[:, :n_train]
        x_test1, x_test2 = X1[:, n_train:], X2[:, n_train:]

        x = sp.sparse.vstack([x_train1, x_train2])
        x = matutils.Sparse2Corpus(x)

        lsa = models.LsiModel(x, num_topics=k)
        n = x_train1.shape[0]
        U = lsa.projection.u
        U1, U2 = U[:n, :], U[n:, :]
        p1, p2 = sp.sparse.csr_matrix(
            np.linalg.pinv(U1)), sp.sparse.csr_matrix(np.linalg.pinv(U2))
        a1, a2 = np.dot(x_test1.T, p1.T).todense(), np.dot(x_test2.T,
                                                           p2.T).todense()

        score = evaluation_function(a1, a2)
        scores.append(score)
    return scores
Example #13
    def fit(self, X, y=None):
        """
        Fit the model according to the given training data.
        Calls gensim.models.HdpModel
        """
        if sparse.issparse(X):
            corpus = matutils.Sparse2Corpus(X)
        else:
            corpus = X

        self.gensim_model = models.HdpModel(corpus=corpus,
                                            id2word=self.id2word,
                                            max_chunks=self.max_chunks,
                                            max_time=self.max_time,
                                            chunksize=self.chunksize,
                                            kappa=self.kappa,
                                            tau=self.tau,
                                            K=self.K,
                                            T=self.T,
                                            alpha=self.alpha,
                                            gamma=self.gamma,
                                            eta=self.eta,
                                            scale=self.scale,
                                            var_converge=self.var_converge,
                                            outputdir=self.outputdir,
                                            random_state=self.random_state)
        return self
Example #14
 def partial_fit(self, X):
     """
     Train model over X.
     """
     if sparse.issparse(X):
         X = matutils.Sparse2Corpus(X)
     self.add_documents(corpus=X)
Example #15
    def partial_fit(self, X):
        """
        Train model over X.
        """
        if sparse.issparse(X):
            X = matutils.Sparse2Corpus(X)

        if self.gensim_model is None:
            self.gensim_model = models.HdpModel(id2word=self.id2word,
                                                max_chunks=self.max_chunks,
                                                max_time=self.max_time,
                                                chunksize=self.chunksize,
                                                kappa=self.kappa,
                                                tau=self.tau,
                                                K=self.K,
                                                T=self.T,
                                                alpha=self.alpha,
                                                gamma=self.gamma,
                                                eta=self.eta,
                                                scale=self.scale,
                                                var_converge=self.var_converge,
                                                outputdir=self.outputdir,
                                                random_state=self.random_state)

        self.gensim_model.update(corpus=X)
        return self
Example #16
def fit_lda(X, vocab, num_topics=5, passes=20):
    """ Fit LDA from a scipy CSR matrix (X). """
    print('fitting lda...')
    return LdaModel(matutils.Sparse2Corpus(X.T),
                    num_topics=num_topics,
                    passes=passes,
                    id2word=dict([(i, s) for i, s in enumerate(vocab)]))
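A sketch of producing `X` and `vocab` with scikit-learn for this helper; the transpose matters because `Sparse2Corpus` treats columns as documents by default, which `X.T` satisfies:

from sklearn.feature_extraction.text import CountVectorizer

docs = ["the graph of trees", "human machine interface", "trees and graph theory"]
cv = CountVectorizer()
X = cv.fit_transform(docs)            # docs x terms; fit_lda transposes it
vocab = cv.get_feature_names_out()    # index-aligned vocabulary (sklearn >= 1.0)
lda = fit_lda(X, vocab, num_topics=2, passes=5)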
Example #17
def create_corpus(dtm):
    """Creates a word corpus from a document term matrix.
    """
    tdm = dtm.transpose()
    sparse_counts = scipy.sparse.csr_matrix(tdm)
    corpus = matutils.Sparse2Corpus(sparse_counts)
    return corpus
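For example, `dtm` could be a pandas document-term DataFrame (documents as rows), which is why the function transposes before wrapping:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

docs = ["apples and pears", "pears and plums"]
cv = CountVectorizer()
dtm = pd.DataFrame(cv.fit_transform(docs).toarray(),
                   columns=cv.get_feature_names_out())
corpus = create_corpus(dtm)   # documents end up as columns internally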
Example #18
    def partial_fit(self, X):
        """Train model over a potentially incomplete set of documents.

        This method can be used in two ways:
            1. On an unfitted model in which case the model is initialized and trained on `X`.
            2. On an already fitted model in which case the model is **further** trained on `X`.

        Parameters
        ----------
        X : {iterable of list of (int, number), scipy.sparse matrix}
            Stream of document vectors or sparse matrix of shape: [`num_terms`, `num_documents`].

        Returns
        -------
        :class:`~gensim.sklearn_api.lsimodel.LsiTransformer`
            The trained model.

        """
        if sparse.issparse(X):
            X = matutils.Sparse2Corpus(sparse=X, documents_columns=False)

        if self.gensim_model is None:
            self.gensim_model = models.LsiModel(
                num_topics=self.num_topics,
                id2word=self.id2word,
                chunksize=self.chunksize,
                decay=self.decay,
                onepass=self.onepass,
                power_iters=self.power_iters,
                extra_samples=self.extra_samples)

        self.gensim_model.add_documents(corpus=X)
        return self
Example #19
    def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : {iterable of iterable of (int, int), scipy.sparse matrix}
            A collection of documents in BOW format used for training the model.

        Returns
        -------
        :class:`~gensim.sklearn_api.ldamodel.LdaTransformer`
            The trained model.

        """
        if sparse.issparse(X):
            corpus = matutils.Sparse2Corpus(sparse=X, documents_columns=False)
        else:
            corpus = X

        self.gensim_model = models.LdaModel(
            corpus=corpus, num_topics=self.num_topics, id2word=self.id2word,
            chunksize=self.chunksize, passes=self.passes, update_every=self.update_every,
            alpha=self.alpha, eta=self.eta, decay=self.decay, offset=self.offset,
            eval_every=self.eval_every, iterations=self.iterations,
            gamma_threshold=self.gamma_threshold, minimum_probability=self.minimum_probability,
            random_state=self.random_state, dtype=self.dtype
        )
        return self
Example #20
    def partial_fit(self, X):
        """Train model over a potentially incomplete set of documents.

        Uses the parameters set in the constructor.
        This method can be used in two ways:
        * On an unfitted model in which case the model is initialized and trained on `X`.
        * On an already fitted model in which case the model is **updated** by `X`.

        Parameters
        ----------
        X : {iterable of iterable of (int, int), scipy.sparse matrix}
            A collection of documents in BOW format used for training the model.

        Returns
        -------
        :class:`~gensim.sklearn_api.ldamodel.LdaTransformer`
            The trained model.

        """
        if sparse.issparse(X):
            X = matutils.Sparse2Corpus(sparse=X, documents_columns=False)

        if self.gensim_model is None:
            self.gensim_model = models.LdaModel(
                num_topics=self.num_topics, id2word=self.id2word,
                chunksize=self.chunksize, passes=self.passes, update_every=self.update_every,
                alpha=self.alpha, eta=self.eta, decay=self.decay, offset=self.offset,
                eval_every=self.eval_every, iterations=self.iterations, gamma_threshold=self.gamma_threshold,
                minimum_probability=self.minimum_probability, random_state=self.random_state,
                dtype=self.dtype
            )

        self.gensim_model.update(corpus=X)
        return self
Example #21
 def infer_topics(self, num_topics=10):
     if self.corpus.gensim_vector_space is None:
         self.corpus.gensim_vector_space = matutils.Sparse2Corpus(self.corpus.sklearn_vector_space,
                                                                  documents_columns=False)
     self.nb_topics = num_topics
     lsa = models.LsiModel(corpus=self.corpus.gensim_vector_space,
                           id2word=self.corpus.vocabulary,
                           num_topics=num_topics)
     tmp_topic_word_matrix = list(lsa.show_topics(num_topics=num_topics,
                                                  num_words=len(self.corpus.vocabulary),
                                                  formatted=False))
     row = []
     col = []
     data = []
     # show_topics(formatted=False) returns (topic_id, [(word, weight), ...])
     # pairs; map each word back to its integer id (this assumes
     # self.corpus.vocabulary maps ids to words).
     word2id = {w: w_id for w_id, w in self.corpus.vocabulary.items()}
     for topic_id in range(self.nb_topics):
         topic_description = tmp_topic_word_matrix[topic_id][1]
         for word, weight in topic_description:
             row.append(topic_id)
             col.append(word2id[word])
             data.append(weight)
     self.topic_word_matrix = coo_matrix((data, (row, col)),
                                         shape=(self.nb_topics, len(self.corpus.vocabulary))).tocsr()
     self.document_topic_matrix = np.transpose(matutils.corpus2dense(lsa[self.corpus.gensim_vector_space],
                                                                     num_topics,
                                                                     self.corpus.size))
     self.corpus.gensim_vector_space = None
Example #22
 def infer_topics(self, num_topics=10):
     if self.corpus.gensim_vector_space is None:
         self.corpus.gensim_vector_space = matutils.Sparse2Corpus(self.corpus.sklearn_vector_space,
                                                                  documents_columns=False)
     self.nb_topics = num_topics
     lda = models.LdaModel(corpus=self.corpus.gensim_vector_space,
                           iterations=10000,
                           num_topics=num_topics)
     tmp_topic_word_matrix = list(lda.show_topics(num_topics=num_topics,
                                                  num_words=len(self.corpus.vocabulary),
                                                  formatted=False))
     row = []
     col = []
     data = []
     for topic_id in range(self.nb_topics):
         topic_description = tmp_topic_word_matrix[topic_id]
         for word_id, probability in topic_description[1]:
             row.append(topic_id)
             col.append(int(word_id))
             data.append(probability)
     self.topic_word_matrix = coo_matrix((data, (row, col)),
                                         shape=(self.nb_topics, len(self.corpus.vocabulary))).tocsr()
     self.document_topic_matrix = sparse.csr_matrix(
         np.transpose(matutils.corpus2dense(lda[self.corpus.gensim_vector_space],
                                            num_topics,
                                            self.corpus.size)))
     self.corpus.gensim_vector_space = None
Example #23
def main(K, numfeatures, sample_file, num_display_words, outputfile):
	
	K_clusters = K
	vectorizer = idfVectorizer(sample_file, numfeatures)

	t0 = time()
	print("Applying topic modeling, using LDA")
	print(str(K_clusters) + " topics")
	corpus = matutils.Sparse2Corpus(vectorizer.X, documents_columns=False)
	lda = models.ldamodel.LdaModel(corpus, num_topics=K_clusters, id2word=vectorizer.id2words)
	print("done in %fs" % (time() - t0))

	output_text = []
	# for each topic, return the num_display_words most significant words,
	# as a list of strings if formatted is True, or (word, probability) 2-tuples if False
	for i, item in enumerate(lda.show_topics(num_topics=K_clusters, num_words=num_display_words, formatted=False)):
		output_text.append("Topic: " + str(i))
		for term, weight in item[1]:
			output_text.append(term + ":" + str(weight))

	print "writing topics to file: ", outputfile
	with open (outputfile, 'w') as f:
		f.write('\n'.join(output_text))

	output_json = []
	for i, item in enumerate(lda.show_topics(num_topics=K_clusters, num_words=num_display_words, formatted=False)):
		topic_terms = {term: str(weight) for term, weight in item[1]}
		output_json.append(topic_terms)
Example #24
 def createModel(self, corpus, dictionary, info):
     logging.basicConfig(format='%(asctime)s: %(levelname)s : %(message)s',
                         level=logging.INFO)
     path = 'TopicModel/' + info.data + '_' + info.identifier
     if not type(corpus) == list:
         corpus = matutils.Sparse2Corpus(corpus, documents_columns=False)
     if not os.path.exists(path):
         if self.name == 'LDA':
             if info.multicore:
                 self.model = models.LdaMulticore(
                     corpus,
                     num_topics=info.numberTopics,
                     id2word=dictionary,
                     passes=info.passes,
                     iterations=info.iterations,
                     batch=0)
             else:
                 self.model = models.LdaModel(corpus,
                                              num_topics=info.numberTopics,
                                              id2word=dictionary,
                                              passes=info.passes,
                                              iterations=info.iterations,
                                              update_every=info.online,
                                              chunksize=info.chunksize)
         elif self.name == 'LSI':
             self.model = models.LsiModel(corpus, info.numberTopics,
                                          dictionary)
             self.info = str(self.model)
         else:
             print('Unknown model type')
         print('Save model')
         self.model.save(path)
     else:
         print('Load model')
         self.model = models.LdaModel.load(path)
Example #25
    def partial_fit(self, X):
        """
        Train model over X.
        By default, 'online (single-pass)' mode is used for training the LDA model.
        Configure `passes` and `update_every` params at init to choose the mode among :

            - online (single-pass): update_every != None and passes == 1
            - online (multi-pass): update_every != None and passes > 1
            - batch: update_every == None

        """
        if sparse.issparse(X):
            X = matutils.Sparse2Corpus(X)

        if self.gensim_model is None:
            self.gensim_model = models.LdaModel(
                num_topics=self.num_topics,
                id2word=self.id2word,
                chunksize=self.chunksize,
                passes=self.passes,
                update_every=self.update_every,
                alpha=self.alpha,
                eta=self.eta,
                decay=self.decay,
                offset=self.offset,
                eval_every=self.eval_every,
                iterations=self.iterations,
                gamma_threshold=self.gamma_threshold,
                minimum_probability=self.minimum_probability,
                random_state=self.random_state)

        self.gensim_model.update(corpus=X)
        return self
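A sketch of selecting the batch mode described in the docstring (gensim 3.x `LdaTransformer` from `gensim.sklearn_api` assumed):

from gensim.sklearn_api import LdaTransformer

id2word = {0: 'graph', 1: 'tree', 2: 'node'}
bow_chunk = [[(0, 2), (1, 1)], [(1, 1), (2, 3)]]

# update_every=None selects batch mode; the defaults (update_every=1,
# passes=1) give the online single-pass mode.
lda = LdaTransformer(num_topics=2, id2word=id2word, update_every=None)
lda.partial_fit(bow_chunk)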
Example #26
    def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : {iterable of list of (int, number), scipy.sparse matrix}
            A collection of documents in BOW format to be transformed.

        Returns
        -------
        :class:`~gensim.sklearn_api.lsimodel.LsiTransformer`
            The trained model.

        """
        if sparse.issparse(X):
            corpus = matutils.Sparse2Corpus(sparse=X, documents_columns=False)
        else:
            corpus = X

        self.gensim_model = models.LsiModel(corpus=corpus,
                                            num_topics=self.num_topics,
                                            id2word=self.id2word,
                                            chunksize=self.chunksize,
                                            decay=self.decay,
                                            onepass=self.onepass,
                                            power_iters=self.power_iters,
                                            extra_samples=self.extra_samples)
        return self
Example #27
def _extract_data(topic_model, corpus, dictionary, doc_topic_dists=None):
    if not matutils.ismatrix(corpus):
        corpus_csc = matutils.corpus2csc(corpus, num_terms=len(dictionary))
    else:
        corpus_csc = corpus
        # Need corpus to be a streaming gensim list corpus for len and inference functions below:
        corpus = matutils.Sparse2Corpus(corpus_csc)

    beta = 0.01
    fnames_argsort = np.asarray(list(dictionary.token2id.values()),
                                dtype=np.int_)
    term_freqs = corpus_csc.sum(axis=1).A.ravel()[fnames_argsort]
    term_freqs[term_freqs == 0] = beta
    doc_lengths = corpus_csc.sum(axis=0).A.ravel()

    assert term_freqs.shape[0] == len(
        dictionary
    ), 'Term frequencies and dictionary have different shape {} != {}'.format(
        term_freqs.shape[0], len(dictionary))
    assert doc_lengths.shape[0] == len(
        corpus
    ), 'Document lengths and corpus have different sizes {} != {}'.format(
        doc_lengths.shape[0], len(corpus))

    if hasattr(topic_model, 'lda_alpha'):
        num_topics = len(topic_model.lda_alpha)
    else:
        num_topics = topic_model.num_topics

    if doc_topic_dists is None:
        # If it's an HDP model.
        if hasattr(topic_model, 'lda_beta'):
            gamma = topic_model.inference(corpus)
        else:
            gamma, _ = topic_model.inference(corpus)
        doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
    else:
        if isinstance(doc_topic_dists, list):
            doc_topic_dists = matutils.corpus2dense(doc_topic_dists,
                                                    num_topics).T
        elif issparse(doc_topic_dists):
            doc_topic_dists = doc_topic_dists.T.todense()
        doc_topic_dists = doc_topic_dists / doc_topic_dists.sum(axis=1)

    assert doc_topic_dists.shape[
        1] == num_topics, 'Document topics and number of topics do not match {} != {}'.format(
            doc_topic_dists.shape[1], num_topics)

    # get the topic-term distribution straight from gensim without
    # iterating over tuples
    if hasattr(topic_model, 'lda_beta'):
        topic = topic_model.lda_beta
    else:
        topic = topic_model.state.get_lambda()
    topic = topic / topic.sum(axis=1)[:, None]
    topic_term_dists = topic[:, fnames_argsort]

    assert topic_term_dists.shape[0] == doc_topic_dists.shape[1]
    return doc_topic_dists

    def partial_fit(self, X):
        """
        Train model over X.
        """
        if sparse.issparse(X):
            X = matutils.Sparse2Corpus(X)

        self.update(corpus=X)
Example #29
def ns(numpy_matrix, number_of_corpus_features, scipy_sparse_matrix):
    # Dense numpy matrix -> streamed gensim corpus, and back to dense.
    corpus = matutils.Dense2Corpus(numpy_matrix)

    numpy_matrix = matutils.corpus2dense(corpus,
                                         num_terms=number_of_corpus_features)

    # Scipy sparse matrix -> streamed gensim corpus, and back to sparse CSC.
    corpus = matutils.Sparse2Corpus(scipy_sparse_matrix)
    scipy_csc_matrix = matutils.corpus2csc(corpus)
Example #30
def fit_lda(X, vocab, num_topics=10, passes=20, alpha=0.001):
    ''' fit LDA from a scipy CSR matrix (X). '''
    print("fitting lda...")
    return LdaModel(matutils.Sparse2Corpus(X),
                    num_topics=num_topics,
                    passes=passes,
                    alpha=alpha,
                    id2word=dict([(i, s) for i, s in enumerate(vocab)]))
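One caveat: `Sparse2Corpus` treats columns as documents by default, so this variant expects `X` already oriented terms x documents (compare Example #16, which passes `X.T`). A sketch with a scikit-learn matrix:

from sklearn.feature_extraction.text import CountVectorizer

docs = ["trees and graphs", "graphs of machines"]
cv = CountVectorizer()
X = cv.fit_transform(docs).T    # transpose so documents become columns
lda = fit_lda(X, cv.get_feature_names_out(), num_topics=2, passes=5)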