Example #1
 def transform(self, X, y=None):
     corpus = Sparse2Corpus(X, documents_columns=False)
     topics = np.array([
         [prob for _, prob in self.lda.__getitem__(c, eps=0)] for c in corpus
     ])
     print(topics.shape)
     return topics
Example #2
def lda_model():

    # Load the list of documents
    with open('newsgroups', 'rb') as f:
        newsgroup_data = pickle.load(f)

    # Use CountVectorizer to find tokens of three or more characters, remove stop words,
    # remove tokens that don't appear in at least 20 documents,
    # and remove tokens that appear in more than 20% of the documents
    vect = CountVectorizer(min_df=20,
                           max_df=0.2,
                           stop_words='english',
                           token_pattern='(?u)\\b\\w\\w\\w+\\b')
    # Fit and transform
    X = vect.fit_transform(newsgroup_data)

    # Convert sparse matrix to gensim corpus.
    corpus = Sparse2Corpus(X, documents_columns=False)

    # Mapping from word IDs to words (To be used in LdaModel's id2word parameter)
    id_map = dict((v, k) for k, v in vect.vocabulary_.items())

    # Use the gensim.models.ldamodel.LdaModel constructor to estimate
    # LDA model parameters on the corpus, and save to the variable `ldamodel`

    return vect, LdaModel(corpus,
                          num_topics=10,
                          id2word=id_map,
                          passes=25,
                          random_state=34)
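
A hedged usage sketch for lda_model(): load the model and print its topics. It assumes the 'newsgroups' pickle from the snippet is present; the call below is illustrative, not part of the original project.

vect, ldamodel = lda_model()
for topic_id, words in ldamodel.print_topics(num_topics=10, num_words=5):
    print(topic_id, words)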
Example #3
    def finalize(self):
        if self.model_exist:
            return

        if self.num_of_scans == 1:
            print "Loaded the model from file."
        else:
            print "Performing SVD..."

            # svd = SVD(n_components=self.num_of_features, random_state=42)
            # x = svd.fit_transform(self.vectors)
            # self.vectors = x

            x = Sparse2Corpus(self.vectors)
            lsi = lsimodel.LsiModel(corpus=x,
                                    id2word=None,
                                    num_topics=self.num_of_features)
            lsi.save(self.model_file_name)
            self.vectors = lsi.projection.u

            print "done."

        if self.n <= 1:
            self.n = 2.0
        self.mean = self.sum / self.n
        self.var = (self.sum_sq -
                    (self.sum * self.sum) / self.n) / (self.n - 1)
        # note: after the square root, self.var holds the standard deviation
        self.var = math.sqrt(self.var)

        f = open(self.stat_filename, 'a')
        lang_pair = self.src_language + self.trg_language
        f.write("\n" + lang_pair + "\n")
        f.write("stats\t" + str(self.mean) + "\t" + str(self.var) + "\n")
        f.close()
Example #4
def getLDAvis(topics, min_df, max_features):
    ldavis_key = f'{int(min_df*1000):d}_{max_features}_{topics}'
    ldavis_path = Path('./pyldavis') / f'{ldavis_key}_tsne.html'
    if not ldavis_path.exists():
        key = f'{max_features}'
        dtm_path = corpus_path / f'dtm_{key}.npz'
        dtm = sparse.load_npz(dtm_path)
        token_path = corpus_path / f'tokens_{key}.csv'
        tokens = pd.read_csv(token_path,
                             header=None,
                             squeeze=True,
                             na_values=[],
                             keep_default_na=False)
        model_file = datapath(
            (experiment_path / 'models' / f'{key}_{topics}').resolve())
        lda_model = LdaModel.load(model_file)
        id2word = tokens.to_dict()
        corpus = Sparse2Corpus(dtm, documents_columns=False)
        dictionary = Dictionary.from_corpus(corpus, id2word)
        vis = prepare(lda_model, corpus, dictionary, mds='tsne')
        kwargs = {"ldavis_url": "/static/ldavis.js"}
        pyLDAvis.save_html(vis, str(ldavis_path), **kwargs)
    with open(str(ldavis_path), 'r') as myfile:
        data = myfile.read()
    return data


#getLDAvis(5, 0.001, 10000)
#getLDAvis(5, 0.001, 25000)
#getLDAvis(10, 0.001, 10000)
#getLDAvis(10, 0.001, 25000)
#getLDAvis(20, 0.001, 10000)
#getLDAvis(20, 0.001, 25000)
Example #5
def graphLDA(name):

    embedFile = 'backendOutput/embeddings-' + name + '.pkl'
    bow, tfidf, _, id2word = loadData(embedFile)

    for (docRep, docRepName) in [(bow, 'bow'), (tfidf, 'tfidf')]:
        ldamodel = loadData('backendOutput/ldamodel-' + name + "-" +
                            docRepName + '.pkl')
        corpus = Sparse2Corpus(docRep, documents_columns=False)
        dictionary = Dictionary.from_corpus(corpus, id2word)
        #This could be more descriptive if we wanted
        document_labels = ["Document " + str(i) for i in range(len(corpus))]

        grapher = LDAGrapher(docRepName, corpus, dictionary, ldamodel,
                             document_labels, name)

        print("Graphing t-SNE for " + docRepName + "...")
        grapher.graphTSNE(perplexity=30)
        print("Graphing pyLDAvis for " + docRepName + "...")
        grapher.graphPyLDAvis()
        print("Creating word cloud for " + docRepName + "...")
        grapher.graphWordCloud()
        print("Graphing word weights for " + docRepName + "...")
        grapher.graphWordWeight()

    print("Done graphing!")
Example #6
File: cluster.py Project: PaulHuygen/xtas
def lda(docs, k):
    """Latent Dirichlet allocation topic model.

    Uses Gensim's LdaModel after tokenizing using scikit-learn's
    TfidfVectorizer.

    Parameters
    ----------
    k : integer
        Number of topics.
    """
    from gensim.matutils import Sparse2Corpus
    from gensim.models import LdaModel

    # Use a scikit-learn vectorizer rather than Gensim's equivalent
    # for speed and consistency with LSA and k-means.
    vect = _vectorizer()
    corpus = vect.fit_transform(fetch(d) for d in docs)
    corpus = Sparse2Corpus(corpus)

    model = LdaModel(corpus=corpus, num_topics=k)

    topics = model.show_topics(formatted=False)
    vocab = vect.get_feature_names()
    #return [(vocab[int(idx)], w) for topic in topics for w, idx in topic]
    return [[(vocab[int(idx)], w) for w, idx in topic] for topic in topics]
Example #7
File: main_v2.py Project: YMMS/Models
 def load_topic_model(self):
     if not hasattr(self, "word2id"):
         self.load_globel_vocab()
     self.vectorizer = CountVectorizer(vocabulary=self.word2id,
                                       tokenizer=lambda x: x,
                                       preprocessor=lambda x: x)
     file_path = "./preproc_data/topic_model.pkl"
     if os.path.exists(file_path):
         self.topic_model = LdaModel.load(file_path)
     else:
         texts = []
         if not hasattr(self, "domain2data"):
             self.load_domain2data()
         for domain in self.domain2data:
             texts.extend(self.domain2data[domain]["labeled"])
             texts.extend(self.domain2data[domain]["unlabeled"])
         corpus = self.vectorizer.fit_transform(texts)
         corpus = Sparse2Corpus(corpus, documents_columns=False)
         self.topic_model = LdaMulticore(
             corpus=corpus,
             num_topics=self.num_topics,
             id2word=self.id2word,
             iterations=self.num_topic_iterations,
             passes=self.num_topic_passes)
         self.topic_model.save(file_path)
Example #8
 def transform(self, X):
     corpus = Sparse2Corpus(X, documents_columns=False)
     doc_topic = self.model[corpus]
     mat = np.zeros((X.shape[0], self.n_components), dtype=np.float64)
     for did, doc in enumerate(doc_topic):
         for topic in doc:
             mat[did][topic[0]] = topic[1]
     return mat
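
For reference, gensim's matutils.corpus2dense can build the same dense document-topic matrix in one call; this is a sketch assuming the same self.model and self.n_components attributes as in the snippet above, not the original project's code.

 def transform(self, X):
     from gensim.matutils import corpus2dense
     corpus = Sparse2Corpus(X, documents_columns=False)
     doc_topic = self.model[corpus]
     # corpus2dense returns shape (n_components, n_docs); transpose to (n_docs, n_components)
     return corpus2dense(doc_topic, num_terms=self.n_components).T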
Example #9
 def vect2gensim(self, vectorizer, dtmatrix):
     # transform sparse matrix into gensim corpus and dictionary
     corpus_vect_gensim = Sparse2Corpus(dtmatrix, documents_columns=False)
     dictionary = Dictionary.from_corpus(
         corpus_vect_gensim,
         id2word=dict(
             (id, word) for word, id in vectorizer.vocabulary_.items()))
     return (corpus_vect_gensim, dictionary)
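
A small usage sketch for vect2gensim; the sample documents and the name obj for the enclosing instance are illustrative assumptions.

from sklearn.feature_extraction.text import CountVectorizer

docs = ["the cat sat on the mat", "the dog ate my homework"]
vectorizer = CountVectorizer()
dtm = vectorizer.fit_transform(docs)                     # documents as rows
corpus, dictionary = obj.vect2gensim(vectorizer, dtm)    # gensim corpus + Dictionary
print(len(dictionary), [doc for doc in corpus])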
Example #10
def trainLDA(docRep, dictionary, save=False, name=""):
    ''' Function to train and return an ldamodel. Expects a sparse matrix as input '''
    corpus = Sparse2Corpus(docRep, documents_columns=False)
    ldamodel = ldamulticore.LdaMulticore(
        corpus, num_topics=20, id2word=dictionary, workers=4, passes=4)
    if save:
        saveData(ldamodel, 'ldamodel-' + name)
    return ldamodel
Example #11
 def train(self, corpus, project):
     self.info('creating similarity index')
     veccorpus = project.vectorizer.transform(
         (subj.text for subj in corpus.subjects))
     gscorpus = Sparse2Corpus(veccorpus, documents_columns=False)
     self._index = gensim.similarities.SparseMatrixSimilarity(
         gscorpus, num_features=len(project.vectorizer.vocabulary_))
     annif.util.atomic_save(self._index, self.datadir, self.INDEX_FILE)
Example #12
File: tfidf.py Project: juhoinkinen/Annif
 def _create_index(self, veccorpus):
     self.info('creating similarity index')
     gscorpus = Sparse2Corpus(veccorpus, documents_columns=False)
     self._index = gensim.similarities.SparseMatrixSimilarity(
         gscorpus, num_features=len(self.vectorizer.vocabulary_))
     annif.util.atomic_save(
         self._index,
         self.datadir,
         self.INDEX_FILE)
Example #13
def topic_distribution(vect: CountVectorizer, model: LdaModel):
    new_doc = [
        "\n\nIt's my understanding that the freezing will start to occur because \
    of the\ngrowing distance of Pluto and Charon from the Sun, due to it's\nelliptical orbit. \
    It is not due to shadowing effects. \n\n\nPluto can shadow Charon, and vice-versa.\n\nGeorge \
    Krumins\n-- "
    ]
    bow = Sparse2Corpus(vect.transform(new_doc), documents_columns=False)
    return next(iter(model[bow]), None)
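
This pairs naturally with the lda_model() helper in Example #2; a hedged usage sketch (both functions are defined in this listing):

vect, model = lda_model()
print(topic_distribution(vect, model))   # [(topic_id, probability), ...] for the sample document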
Example #14
def getLDARep(ldaModel, docRep, save=False, name=""):
    ''' Convert doc representation to lda output '''
    corpus = Sparse2Corpus(docRep, documents_columns=False)
    converted = ldaModel.get_document_topics(corpus, minimum_probability=0.0)
    rep = [list(map(lambda topic: topic[1], converted[i]))
           for i in range(len(corpus))]
    if save:
        saveData(rep, name)
    return rep
Example #15
def getLsiModel(tfidfModel) -> LsiModel:
    modelPath = os.path.join('.cache', 'lsi.gensim_model')
    try:
        lsiModel = LsiModel.load(modelPath)
    except FileNotFoundError:
        corpus = Sparse2Corpus(tfidfModel.vectors, documents_columns=False)
        lsiModel = LsiModel(corpus, num_topics=200)
        lsiModel.save(modelPath)

    return lsiModel
Example #16
def get_similar_topics_distribution(abst_to_match):
    topics_array = np.zeros(n_topics)

    trans = cvec.transform(list([abst_to_match]))
    corpus = Sparse2Corpus(trans, documents_columns=False)
    results = list(ldamodel.get_document_topics(bow=corpus))[0]

    for items in results:
        topics_array[items[0]] = items[1]
    return topics_array
Example #17
File: features.py Project: ligege12/dighub
    def __init__(self, dataset, n_topics, vocabulary_size=None):
        super(DescriptionLDA, self).__init__(dataset,
                                             vocabulary_size,
                                             tfidf=False)
        self.n_topics = n_topics

        id2word = {i: w for w, i in self.vectorizer.vocabulary_.items()}
        corpus = Sparse2Corpus(self.features, documents_columns=False)
        self.transformer = LdaModel(corpus=corpus,
                                    id2word=id2word,
                                    num_topics=n_topics)
Example #18
 def _run(self, info):
     nbprint('Running LDA')
     vocab = data.load_vocab(info)
     id2word = {e['id']: e['token'] for e in vocab}
     corpus = Sparse2Corpus(self.input_mat)
     lda = LdaModel(corpus, id2word=id2word, num_topics=info["num_topics"])
     self.W = lda.get_topics().T
     self.H = np.zeros((info["num_topics"], self.input_mat.shape[1]))
     for idx, doc in enumerate(corpus):
         weights = lda[doc]
         for topic, value in weights:
             self.H[topic, idx] = value
Example #19
def sum_weighted_term_lists(wtlist, dictionary):
    if len(wtlist) == 0:
        return []
    term_vecs = []
    for weight, terms in wtlist:
        term_vec_raw = dictionary.doc2bow(terms)
        term_vec = [(term_id, weight * val) for term_id, val in term_vec_raw]
        term_vecs.append(term_vec)
    # make into numpy matrix for convenience
    term_matrix = corpus2csc(term_vecs)
    # calculate sum
    sum_vec = Sparse2Corpus(csc_matrix(term_matrix.sum(1)))[0]
    return sum_vec
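
A usage sketch for sum_weighted_term_lists; the toy dictionary and weights below are assumptions for illustration only.

from gensim.corpora import Dictionary

dictionary = Dictionary([["apple", "banana", "cherry"]])
wtlist = [(0.5, ["apple", "banana"]), (2.0, ["banana", "cherry"])]
print(sum_weighted_term_lists(wtlist, dictionary))
# -> a single bag-of-words vector, e.g. [(0, 0.5), (1, 2.5), (2, 2.0)] depending on token ids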
Example #20
def getMatrixSimilarity(tfidfModel, lsiModel=None) -> SparseMatrixSimilarity:
    similarityPath = os.path.join('.cache', 'sim_mat.gensim_sim')
    try:
        sim = MatrixSimilarity.load(similarityPath)
    except FileNotFoundError:
        corpus = Sparse2Corpus(tfidfModel.vectors, documents_columns=False)
        if lsiModel is None:
            lsiModel = getLsiModel(tfidfModel)
        sim = SparseMatrixSimilarity(lsiModel[corpus],
                                     num_best=21,
                                     num_features=tfidfModel.vectors.shape[0])
        sim.save(similarityPath)
    return sim
Example #21
File: main_v2.py Project: YMMS/Models
 def get_texts_topic_distribution(self, texts):
     if not hasattr(self, "vectorizer"):
         self.load_topic_model()
     vectorized_corpus = self.vectorizer.transform(texts)
     gensim_corpus = Sparse2Corpus(vectorized_corpus,
                                   documents_columns=False)
     topic_representations = []
     for doc in gensim_corpus:
         topic_representations.append([
             topic_prob
             for (_, topic_prob) in self.topic_model.get_document_topics(
                 doc, minimum_probability=0.)
         ])
     return np.array(topic_representations)
Example #22
 def score(self, X, y=None, sample_weight=None) -> float:
     # TODO this needs further testing for correctness, WIP
     if self.autoencoder is None:
         raise NotFittedError
     self.autoencoder.eval()
     corpus = Sparse2Corpus(X, documents_columns=False)
     decoder_weight = self.autoencoder.decoder.linear.weight.detach().cpu()
     id2word = {index: str(index) for index in range(X.shape[1])}
     topics = [[str(item.item()) for item in topic]
               for topic in decoder_weight.topk(
                   min(self.score_num, X.shape[1]), dim=0)[1].t()]
     cm = CoherenceModel(topics=topics,
                         corpus=corpus,
                         dictionary=Dictionary.from_corpus(corpus, id2word),
                         coherence='u_mass')
     return cm.get_coherence()
Example #23
def from_stream_of_tokens_to_sparse2corpus(
        source: Any, vocabulary: Dictionary | dict) -> Sparse2Corpus:

    if not hasattr(vocabulary, 'doc2bow'):
        vocabulary: Dictionary = _from_token2id_to_dictionary(vocabulary)

    bow_corpus: GensimBowCorpus = [
        vocabulary.doc2bow(tokens) for _, tokens in source
    ]
    csc_matrix: sp.csc_matrix = corpus2csc(
        bow_corpus,
        num_terms=len(vocabulary),
        num_docs=len(bow_corpus),
        num_nnz=sum(map(len, bow_corpus)),
    )
    corpus: Sparse2Corpus = Sparse2Corpus(csc_matrix, documents_columns=True)
    return corpus
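
A usage sketch; passing a gensim Dictionary directly avoids the module's private _from_token2id_to_dictionary helper. The sample stream is an assumption.

from gensim.corpora import Dictionary

source = [("d1", ["apple", "banana"]), ("d2", ["banana", "cherry", "banana"])]
vocabulary = Dictionary([["apple", "banana", "cherry"]])
corpus = from_stream_of_tokens_to_sparse2corpus(source, vocabulary)
print([doc for doc in corpus])   # one [(token_id, count), ...] list per document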
Example #24
    def fit(self, X, y=None):
        """
        Parameter
        ---------
        X : [sp.csr_matrix]

        Returns
        -------
        self
        """
        if self.lda is None:
            self.lda = LdaMulticore(id2word=self.id2word,
                                    num_topics=self.num_topics,
                                    passes=self.passes)
        X_flat = sp.vstack(X)
        self.lda.update(Sparse2Corpus(X_flat, documents_columns=False))
        return self
Example #25
def merge_corpus(corpus, dictionary, docs_ids):
    '''
    Merge documents with the same doc_id in the corpus according to docs_ids.
    The function returns the merged corpus and the merged docs_ids. Note that
    the merged corpus is a scipy sparse matrix.
    '''
    # Deprecated version
    # # References:
    # # - How to sort a list and reorder a list according to indices?
    # #   https://stackoverflow.com/questions/6422700/how-to-get-indices-of-a-sorted-array-in-python/6423325
    # #   https://stackoverflow.com/questions/2177590/how-can-i-reorder-a-list
    # # - How to group by data by their keys?
    # #   https://docs.python.org/2/library/itertools.html#itertools.groupby
    # dense_corpus   = corpus2dense(corpus, num_terms=len(dictionary)).transpose()
    # ordered_indice     = np.argsort(docs_ids)
    # reordered_docs_ids = [ docs_ids[index] for index in ordered_indice ]
    # reordered_dense_corpus = [ dense_corpus[index] for index in ordered_indice ]
    # reordered_key_values   = zip(reordered_docs_ids, reordered_dense_corpus)
    # merged_key_values      = [
    # 	[ key_value[0], np.array(list(zip(*list(key_value[1])))[1]).sum(axis=0) ]
    # 	for key_value in itertools.groupby(reordered_key_values, lambda x: x[0]) ]
    # merged_key_values   = list(zip(*merged_key_values))
    # merged_docs_ids     = merged_key_values[0]
    # merged_dense_corpus = np.array(merged_key_values[1])
    # merged_corpus       = Dense2Corpus(merged_dense_corpus, documents_columns=False)
    # return merged_docs_ids, merged_corpus

    # TODO: Make this function more efficient in the future.
    # Convert corpus to sparse matrix in Scipy
    sparse_corpus = corpus2csc(corpus, num_terms=len(dictionary)).transpose()
    # Get corpus groups with respect to the indices according to their doc_ids
    groups = defaultdict(list)
    for index, doc_id in enumerate(docs_ids):
        groups[doc_id].append(index)
    # Merge corpus with same doc_id
    id_corpus_obj = {
        doc_id: sparse_corpus[indices, :].sum(axis=0)
        for doc_id, indices in groups.items()
    }
    merged_doc_ids = list(id_corpus_obj.keys())
    merged_corpus = vstack(
        [csc_matrix(doc) for doc in list(id_corpus_obj.values())])
    merged_corpus = Sparse2Corpus(merged_corpus, documents_columns=False)
    return merged_doc_ids, merged_corpus
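
A usage sketch for merge_corpus, assuming the snippet's own imports (corpus2csc, csc_matrix, vstack, defaultdict, Sparse2Corpus) are in scope; the toy corpus below is illustrative.

from gensim.corpora import Dictionary

dictionary = Dictionary([["a", "b", "c"]])
corpus = [dictionary.doc2bow(doc) for doc in (["a", "b"], ["b", "c"], ["c"])]
docs_ids = ["x", "x", "y"]                    # the first two documents share an id
merged_ids, merged_corpus = merge_corpus(corpus, dictionary, docs_ids)
print(merged_ids)                             # ['x', 'y']
print([doc for doc in merged_corpus])         # 'x' now holds the summed counts of docs 0 and 1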
Example #26
    def transform(self, X):
        """
        Parameter
        ---------
        X : [sp.csr_matrix]

        Returns
        -------
        topic_vectors : [np.ndarray]
            each matrix is of shape (sent_count, topic_count)
        """
        topic_vectors = []
        for doc in X:
            sents_bow = Sparse2Corpus(doc, documents_columns=False)
            gamma, _ = self.lda.inference(sents_bow)
            # divide row by row sum
            topic_dist = (gamma.T / np.sum(gamma, axis=1)).T
            topic_vectors.append(topic_dist)
        return topic_vectors
Example #27
def compete_number_of_words(detoken_data, token_data, min_num, max_num, step, random_state=None):

  '''
  Function for finding number_of_words (swept here via the CountVectorizer min_df parameter).

  Parameters :
  -------------
  detoken_data : de-tokenized (re-joined) data as a list
  token_data : tokenized data used to compute the coherence value
  min_num : minimum of the number-of-words range; the sweep starts at min_num
  max_num : maximum of the number-of-words range; the sweep runs up to max_num
  step : step size used to go from min_num to max_num
  random_state : set for reproducibility, default = None

  Output :
  -------------
  coherence_value : returns a DataFrame with the swept values and the corresponding Perplexity and Coherence values

  '''

  coherence_value = pd.DataFrame(columns=['min_df', 'Perplexity Value','Coherence Value'])

  i = 0
  min_df = list(np.arange(min_num,max_num,step))
  for m in min_df:
    print("Run {}: min_df = {}".format(i + 1, m))

    vectorizer = CountVectorizer(min_df=m)  # create CountVectorizer
    cv = vectorizer.fit_transform(detoken_data) # fit and transform

    dictionary = corpora.Dictionary([vectorizer.get_feature_names()])

    corpus = Sparse2Corpus(cv.T)

    lda_model = LdaModel(corpus=corpus, id2word=dictionary, random_state=random_state)

    coherence_lda = CoherenceModel(model=lda_model, texts=token_data, dictionary=dictionary, coherence='c_v')

    coherence_value.loc[i] = [m, lda_model.log_perplexity(corpus),coherence_lda.get_coherence()]
    i += 1

  return coherence_value
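
A hedged usage sketch; detoken_data and token_data are assumed to be the pre-built de-tokenized strings and token lists described in the docstring, and the range below is illustrative.

result = compete_number_of_words(detoken_data, token_data,
                                 min_num=1, max_num=10, step=3, random_state=42)
print(result)   # DataFrame of min_df vs. perplexity and coherence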
Example #28
def infer_ngrams_corpus(corpus, return_dict=False):

    bow_features = [(i, attribute.name)
                    for i, attribute in enumerate(corpus.domain.attributes)
                    if 'bow-feature' in attribute.attributes]
    if len(bow_features) == 0:
        corpus = BowVectorizer().transform(corpus)
        bow_features = [(i, attribute.name)
                        for i, attribute in enumerate(corpus.domain.attributes)
                        if 'bow-feature' in attribute.attributes]

    feature_presence = corpus.X.sum(axis=0)
    keep = [(i, a) for i, a in bow_features if feature_presence[0, i] > 0]
    # sort features by the order in the dictionary
    dictionary = Dictionary(corpus.ngrams_iterator(include_postags=True),
                            prune_at=None)
    idx_of_keep = np.argsort([dictionary.token2id[a] for _, a in keep])
    keep = [keep[i][0] for i in idx_of_keep]
    result = Sparse2Corpus(corpus.X[:, keep].T)

    return (result, dictionary) if return_dict else result
Example #29
File: WE_Average.py Project: Andhs/TMOP
    def do_after_a_full_scan(self, num_of_finished_scans):
        # First iteration of a normal run (collecting the vocabulary)
        if num_of_finished_scans == 1 and self.num_of_scans == 3:
            self.vocab = Counter(self.all_words)

            self.all_words = {}
            for word in self.vocab:
                if self.vocab[word] >= self.min_count:
                    self.all_words[word] = len(self.all_words)

            self.vectors = lil_matrix(
                (len(self.all_words), self.number_of_tus), dtype=np.int8)

            print("-#-#-#-#-#-#-#-#-#-#-#-")
            print("size of vocab:", len(self.vocab))
            print("size of common words:", len(self.all_words))
            print("number of TUs:", self.number_of_tus)
            self.number_of_tus = 0

            f = open(self.dict_file_name, "a+")

            for w in self.all_words:
                f.write(w)
                f.write("\t" + str(self.all_words[w]) + "\n")
            f.close()

        # Second iteration of a normal run (making the tu-word matrix)
        elif num_of_finished_scans == 2:
            print("Performing SVD...")

            x = Sparse2Corpus(self.vectors)
            lsi = lsimodel.LsiModel(corpus=x,
                                    id2word=None,
                                    num_topics=self.num_of_features)
            lsi.save(self.model_file_name)
            self.vectors = lsi.projection.u

            print("done.")
        else:
            print("-#-#-#-#-#-#-#-#-#-#-#-")
Example #30
def get_topics(cv, train_data):
    """
    Uses gensim to perform topic modeling.

    Parameters
    ---------
    cv: A TfidfVectorizer instance.
    train_data: A scipy csr_matrix.

    Returns
    -------
    A list of topics, each a list of (weight, term) pairs for the top terms,
    together with that topic's coherence score.
    """

    td_gensim = Sparse2Corpus(train_data, documents_columns=False)
    tmp_dct = dict((idv, word) for word, idv in cv.vocabulary_.items())
    dct = Dictionary.from_corpus(td_gensim, id2word=tmp_dct)

    lda = LdaModel(corpus=td_gensim, id2word=dct, num_topics=20)
    topics = lda.top_topics(corpus=td_gensim, num_words=5)

    return topics
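
A usage sketch for get_topics with a toy corpus; real inputs would be the fitted vectorizer and training matrix described in the docstring, and the documents below are assumptions.

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["space shuttle launch orbit", "hockey game goal team score",
        "orbit satellite launch rocket", "team players season game"]
cv = TfidfVectorizer()
train_data = cv.fit_transform(docs)
topics = get_topics(cv, train_data)
for terms, coherence in topics[:3]:
    print(round(coherence, 3), [word for _, word in terms])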