Example #1
def extractTopicModelData(articleList, commentList, commentCount, set_tag, tag):
    processed_comment_list = extract_global_bag_of_words_processed(commentList)       
    print len(processed_comment_list)
    
    train_v, test_v = np.load('train_vect.npy'), np.load('test_vect.npy')
    train_list = []
    test_list = []
    for v in train_v:
        train_list.append(processed_comment_list[v])
    for v in test_v:
        test_list.append(processed_comment_list[v])    
        
    lda = models.LdaModel.load(model_path + set_tag.replace("_","") + "_lda_model")
    
    dictionary = corpora.Dictionary.load(model_path + set_tag.replace("_","") + "_dictionary")
    train = [dictionary.doc2bow(text) for text in train_list]
    test = [dictionary.doc2bow(text) for text in test_list]
    
    
    
    docTopicProbMat_train = lda[train]
    docTopicProbMat_test = lda[test]
    
    
    # corpus2dense requires num_terms; use the model's topic count here
    train_lda = matutils.corpus2dense(docTopicProbMat_train, num_terms=lda.num_topics, num_docs=len(train))
    test_lda = matutils.corpus2dense(docTopicProbMat_test, num_terms=lda.num_topics, num_docs=len(test))
      
    print train_lda.shape
    print test_lda.shape
    
    save_sparse_csr(feature_set_path + set_tag + "lda" + tag + "_train", train_lda) 
    save_sparse_csr(feature_set_path + set_tag + "lda" + tag + "_test", test_lda) 
    
    print "DONE LDA"
Example #2
 def transformed_corpus():
     for doc in input_data:
         if numpy_output:
             yield self._get_hidden_representations(matutils.corpus2dense(doc, self.input_dimensionality))
         else:
             yield matutils.any2sparse(
                 self._get_hidden_representations(matutils.corpus2dense(doc, self.input_dimensionality)))
Example #3
def get_BoW_vectors(contents_lines, synopsis_lines):
    """
    文のリストから各文のBoWベクトルのリストを返す
    :param contents_lines: list
    本文の各文を要素とするリスト
    :param synopsis_lines: list
    あらすじの各文を要素とするリスト
    :return: ([np.array], [np.array])
    """
    print('creating BoW vectors...')
    removed_contents_lines = [
        remove_stop_word(cleaning(line)) for line in contents_lines
    ]
    removed_synopsis_lines = [
        remove_stop_word(cleaning(line)) for line in synopsis_lines
    ]
    all_lines = removed_contents_lines + removed_synopsis_lines
    vocaburaly = corpora.Dictionary(all_lines)
    contents_BoWs = [
        vocaburaly.doc2bow(line) for line in removed_contents_lines
    ]
    synopsis_BoWs = [
        vocaburaly.doc2bow(line) for line in removed_synopsis_lines
    ]
    contents_vectors = [
        np.array(matutils.corpus2dense([bow], num_terms=len(vocaburaly)).T[0])
        for bow in contents_BoWs
    ]
    synopsis_vectors = [
        np.array(matutils.corpus2dense([bow], num_terms=len(vocaburaly)).T[0])
        for bow in synopsis_BoWs
    ]
    return contents_vectors, synopsis_vectors
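A side note on the `corpus2dense([bow], num_terms=...).T[0]` idiom used above and in many of the examples below: for a single BoW document, `matutils.sparse2full` produces the same 1-D vector more directly. A minimal sketch, with a toy dictionary and tokens that are illustrative only, not from the original project:

import numpy as np
from gensim import corpora, matutils

# Toy data for illustration only
dictionary = corpora.Dictionary([["cat", "dog", "fish"], ["dog", "bird"]])
bow = dictionary.doc2bow(["dog", "dog", "cat"])

dense_a = matutils.corpus2dense([bow], num_terms=len(dictionary)).T[0]  # wrap in a list, take column 0
dense_b = matutils.sparse2full(bow, len(dictionary))                    # direct single-document conversion
assert np.allclose(dense_a, dense_b)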
Example #4
 def calculate_similarities(self, corpus):
     num_terms = len(self.dictionary)
     if self.simdict is None:
         return matutils.corpus2dense(corpus, num_terms=num_terms).T
     similarities = np.array([[calculate_similarity(doc1, doc2, self.simdict) for doc1 in self.bow_corpus] for doc2 in corpus])
     print similarities.shape
     print matutils.corpus2dense(corpus, num_terms=num_terms).shape
     return np.concatenate((matutils.corpus2dense(corpus, num_terms=num_terms).T, similarities), axis=1)
Example #5
    def pre_process(self, x):

        bow_corpus = [self.dictionary.doc2bow(text) for text in x]
        data_tfidf = matutils.corpus2dense(self.tfidf_model[bow_corpus], num_terms=len(self.dictionary)).T
        if self.model is None:
            return data_tfidf
        else:
            data_lda = matutils.corpus2dense(self.model[bow_corpus], num_terms=len(self.dictionary)).T
            x_data = np.concatenate((data_tfidf, data_lda), axis=1)

        return x_data
Example #6
    def handle(self, *args, **options):
        """
		tws = Timeline.objects.all().order_by('?')[:50000]
		for t in tws:
			self.__tokenize(t.body)

		dictionary = corpora.Dictionary(self.__words)
		dictionary.filter_extremes(no_below=2)
		dictionary.save_as_text('tw_dic.txt')
		"""

        dictionary = corpora.Dictionary.load_from_text('tw_dic.txt')

        favs = Favorite.objects.all()[:1000]
        for f in favs:
            words = self.__train_tokenize(f.body)
            tmp = dictionary.doc2bow(words)
            dense = list(
                matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])

            self.__train_data.append(dense)
            self.__train_label.append(1)

        ptws = PublicTimeline.objects.all()[:1000]
        for p in ptws:
            words = self.__train_tokenize(p.body)
            tmp = dictionary.doc2bow(words)
            dense = list(
                matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])

            self.__train_data.append(dense)
            self.__train_label.append(0)

        estimator = RandomForestClassifier()

        # Train
        estimator.fit(self.__train_data, self.__train_label)

        # Predict
        tws = Timeline.objects.all().order_by('-ts')[:100]
        for t in tws:
            words = self.__train_tokenize(t.body)
            tmp = dictionary.doc2bow(words)
            dense = list(
                matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])

            if 1 in estimator.predict(dense):
                print t.body
Example #7
def _doc_term_mtx(table, model, input_col, result_type='doc_to_bow_token'):
    corpus = table[input_col].tolist()

    dictionary = model['dictionary']

    bow_corpus = []
    for doc in corpus:
        bow_corpus.append(dictionary.doc2bow(doc))

    doc_to_bow = []
    for i in range(len(corpus)):
        token_cnt = []
        for j in range(len(bow_corpus[i])):
            token_cnt.append('({token}, {cnt})'.format(
                token=dictionary[bow_corpus[i][j][0]],
                cnt=bow_corpus[i][j][1]))
        doc_to_bow.append(token_cnt)
    doc_to_bow_list = []
    for doc in doc_to_bow:
        doc_to_bow_list.append('{}'.format(list(doc)))

    doc_idx = ['doc_{}'.format(i) for i in range(len(corpus))]
    terms = [term for term in dictionary.token2id.keys()]

    if result_type == 'doc_to_bow_token':
        out_table = pd.DataFrame(data=doc_to_bow_list, columns=['doc_to_bow'])
        out_table.insert(loc=0, column='doc_idx', value=doc_idx)
    elif result_type == 'doc_term_mtx':
        out_table = pd.DataFrame(
            matutils.corpus2dense(bow_corpus,
                                  num_terms=len(dictionary.token2id)).T)
        out_table.insert(loc=0, column=' ', value=doc_idx)
        out_table.columns = np.append('', terms)
    elif result_type == 'term_doc_mtx':
        out_table = pd.DataFrame(
            matutils.corpus2dense(bow_corpus,
                                  num_terms=len(dictionary.token2id)))
        out_table.insert(loc=0, column=' ', value=terms)
        out_table.columns = np.append('', doc_idx)
    else:
        raise_runtime_error("Please check 'result_type'.")

    rb = BrtcReprBuilder()
    model = _model_dict('doc_term_mtx')
    model['bow_corpus'] = bow_corpus
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table}
Example #8
    def get_dense(self, text):
        dict = self.__load_dictionary()
        words = tp.TextPreprocessing(text).get_words_feature()

        vec = dict.doc2bow(words)
        dense = list(matutils.corpus2dense([vec], num_terms=len(dict)).T[0])
        return dense
Example #9
def load_data(fname):

    source = []
    target = []
    f = open(fname, "r")

    document_list = [] # one document per line; the document's elements are words
    for l in f.readlines():
        sample = l.strip().split(" ", 1)        # split the label from the word sequence
        label = int(sample[0])                  # label
        target.append(label)
        document_list.append(sample[1].split()) # split into words and append to the document list

    # Build the word dictionary
    dictionary = corpora.Dictionary(document_list)
    dictionary.filter_extremes(no_below=5, no_above=0.8)
    # no_below: ignore words that appear in no_below or fewer documents
    # no_above: ignore words that appear in more than no_above (fraction) of the documents

    # Vectorize the documents
    for document in document_list:
        tmp = dictionary.doc2bow(document) # BoW representation of the document
        vec = list(matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])
        source.append(vec)

    dataset = {}
    dataset['target'] = np.array(target)    
    dataset['source'] = np.array(source)    
    print "vocab size:", len(dictionary.items())

    return dataset, dictionary
Example #10
    def log_perplexity(self, corpus):
        """Calculate perplexity bound on the specified corpus.

        Perplexity = e^(-bound).

        Parameters
        ----------
        corpus : iterable of list of (int, float)
            The corpus on which the perplexity bound is computed: an iterable of
            documents, each a list of `(word_id, word_count)` tuples.

        Returns
        -------
        float
            The perplexity bound.

        """
        W = self.get_topics().T

        H = np.zeros((W.shape[1], len(corpus)))
        for bow_id, bow in enumerate(corpus):
            for topic_id, factor in self[bow]:
                H[topic_id, bow_id] = factor

        dense_corpus = matutils.corpus2dense(corpus, W.shape[0])

        pred_factors = W.dot(H)
        pred_factors /= pred_factors.sum(axis=0)

        return (np.log(pred_factors, where=pred_factors > 0) * dense_corpus).sum() / dense_corpus.sum()
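A minimal usage sketch for the method above, assuming `nmf` is a trained gensim `Nmf` model and `corpus` is a list of BoW documents (hypothetical names); per the docstring, perplexity is recovered as e^(-bound):

import numpy as np

# Hypothetical names: `nmf` is a trained gensim Nmf model, `corpus` a list of BoW documents.
bound = nmf.log_perplexity(corpus)
perplexity = np.exp(-bound)  # Perplexity = e^(-bound), as documented above
print(perplexity)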
Example #11
 def transformed_corpus():
     for chunk_no, doc_chunk in utils.grouper(bow, chunksize):
         chunk = matutils.corpus2dense(doc_chunk,
                                       self.input_dimensionality)
         hidden = self._get_hidden_representations(chunk)
         for column in hidden.T:
             yield matutils.any2sparse(column)
Example #12
def calculate_embedding(corpus: Corpus,
                        *,
                        rank=2,
                        svd_dims=50,
                        perplexity=30,
                        seed=0):
    """ Calculate a document embedding that assigns each document in the
    corpus a N-d position based on the word usage.

    :returns: A list of N-d tuples for the documents in the corpus.
    """
    from gensim.models.tfidfmodel import TfidfModel
    from sklearn.decomposition import TruncatedSVD
    from sklearn.manifold import TSNE

    dic = corpus.dictionary
    freqs = corpus.frequencies
    tfidf = corpus2dense(TfidfModel(dictionary=dic)[freqs], len(dic)).T

    if svd_dims is not None:
        svd = TruncatedSVD(n_components=svd_dims, random_state=seed)
        components = svd.fit_transform(tfidf)
    else:
        components = tfidf

    model = TSNE(rank,
                 metric='cosine',
                 square_distances=True,
                 perplexity=perplexity,
                 random_state=seed)
    return model.fit_transform(components)
Example #13
 def transform(self, X):
     X = [text.words for text in X]
     x_tfidf = self.tfidf[[self.dictionary.doc2bow(text) for text in X]]
     x_data = matutils.corpus2dense(x_tfidf, num_terms=len(self.dictionary)).T
     logging.info("Returning data of shape %s " % (x_data.shape,))
     print x_data
     return x_data
Example #14
def getRank(fname):
    fname = path[0]
    twitter_stoplist = ["what's", "it's", "they'd"]
    stoplist = nltk.corpus.stopwords.words('english') + twitter_stoplist
    document = []
    with open(fname) as f:
        for line in f:
            tl = []
            tl.append(line[:32])  # append the line ID
            line = re.sub(r"(http:.*?\s)", "url", line[33:])
            d = re.sub(r'\W|\d', ' ', line)
            d = re.sub(r'\s+', ' ', d)  # collapse extra whitespace
            for word in d.split():
                word = word.lower()
                if word not in stoplist and len(word) < 15 and len(word) > 2:
                    word = nltk.PorterStemmer().stem(word)
                    tl.append(word)
                    document.append(tl)
        return document

    print fname + " read complate"
    dc = [dictionary.doc2bow(t) for t in document]
    d = matutils.corpus2dense(dc, dimension)
    # for i in range(len(d[0])):
    #     tsum = sum(d[:,i])
    #     if tsum != 0:
    #         for j in range(len(d)):
    #             if d[j][i]/tsum > 0.5:
    #                 d[j][i] = 2
    r2 = numpy.dot(numpy.transpose(u), d)
    score2 = numpy.array([sum(r2[:, i]) for i in range(len(r2[0]))])
    x = [document[i][0] for i in range(len(document))]
    l2 = zip(score2, x)
    print fname + " process complate"
    return l2
Example #15
def Tf_idf(sentences):
    dictionary = corpora.Dictionary(sentences)  # build the dictionary
    corpus = [dictionary.doc2bow(sentence) for sentence in sentences]  # build document term-frequency vectors
    tfidf_model = models.TfidfModel(corpus)  # compute tf-idf values
    corpus_tfidf = tfidf_model[corpus]
    corpus_matrix = corpus2dense(corpus_tfidf, len(dictionary))  # convert to a dense matrix
    return corpus_matrix
Example #16
 def vectorize(self, docs, vocab_size):
     '''
     Args:
         docs: bag-of-words format, iterable of iterable of (int, number)
         vocab_size (int) – Number of terms in the dictionary. X-axis of the resulting matrix.
     '''
     return matutils.corpus2dense(docs, vocab_size)
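As the docstring notes, `num_terms` sets the rows of the resulting matrix, so `corpus2dense` returns one column per document; the result is usually transposed before being handed to estimators that expect one row per document. A minimal sketch with toy data (not from the original project):

from gensim import corpora, matutils

# Toy data for illustration only
texts = [["apple", "banana"], ["banana", "cherry", "cherry"]]
dictionary = corpora.Dictionary(texts)
bows = [dictionary.doc2bow(t) for t in texts]

dense = matutils.corpus2dense(bows, num_terms=len(dictionary))
print(dense.shape)    # (vocab_size, n_docs) == (3, 2)
print(dense.T.shape)  # (n_docs, vocab_size): the usual scikit-learn orientation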
Example #17
def get_Xy(corpus, labels, ndims):
    # Term frequency inverse document frequency (tfidf) weighting:
    # reflects how important a word is to a document in a corpus
    tfidf = models.TfidfModel(corpus)
    tfidf_corpus = tfidf[corpus]
    docs_tfidf = [doc for doc in tfidf_corpus]
    ##scipy_csc_matrix = matutils.corpus2csc(tfidf_corpus).toarray().transpose()
    # Builds an LSI space from the input TFIDF matrix, it uses SVD
    # for dimensionality reduction with num_topics = dimensions
    lsi = models.LsiModel(tfidf_corpus, num_topics=ndims)
    lsi_corpus = lsi[tfidf_corpus]
    docs_lsi = [doc for doc in lsi_corpus]
    X = matutils.corpus2dense(lsi_corpus, num_terms=ndims).transpose()
    # Convert labels to: promoter: 0, enhancer: 1
    y = []
    error_ind = []
    for i in range(len(labels)):
        if labels[i] == 'promoter':
            y.append(0)
        elif labels[i] == 'enhancer':
            y.append(1)
        else:
            print "Promoter / enhancer not recorded at index", i
            error_ind.append(i)
    y = np.asarray(y)
    return (X, y)
Example #18
def ns(numpy_matrix, number_of_corpus_features, scipy_sparse_matrix):
    corpus = matutils.Dense2Corpus(numpy_matrix)

    numpy_matrix = matutils.corpus2dense(corpus, num_terms=number_of_corpus_features)

    corpus = matutils.Sparse2Corpus(scipy_sparse_matrix)
    scipy_csc_matrix = matutils.corpus2csc(corpus)
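The snippet above builds the conversions but returns nothing; a minimal round-trip sketch with concrete toy inputs (illustrative only) shows that `Dense2Corpus`/`corpus2dense` and `Sparse2Corpus`/`corpus2csc` are inverse pairs:

import numpy as np
import scipy.sparse
from gensim import matutils

numpy_matrix = np.array([[1.0, 0.0], [0.0, 2.0], [3.0, 0.0]])   # terms x documents
corpus = matutils.Dense2Corpus(numpy_matrix)                     # columns become documents
restored = matutils.corpus2dense(corpus, num_terms=numpy_matrix.shape[0])
assert np.allclose(numpy_matrix, restored)

scipy_sparse_matrix = scipy.sparse.random(3, 2, density=0.5, format='csc')
corpus = matutils.Sparse2Corpus(scipy_sparse_matrix)
scipy_csc_matrix = matutils.corpus2csc(corpus, num_terms=3)
assert np.allclose(scipy_sparse_matrix.toarray(), scipy_csc_matrix.toarray())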
Example #19
def main():
    document_words = list()
    document_labels = list()

    count = 0
    with open('tweets_label.csv', 'r') as f:
        reader = csv.reader(f)
        header = next(reader)

        for row in reader:
            document_words.append(parse_word_list(row[0]))
            document_labels.append(row[1])
            count += 1
            if count == 2000:
                break

    dictionary = corpora.Dictionary(document_words)
    dictionary.filter_extremes(no_below=3, no_above=0.4)

    vecs = list()
    for wordlist in document_words:
        bow = dictionary.doc2bow(wordlist)
        dense = list(matutils.corpus2dense([bow], num_terms=len(dictionary)).T[0])
        vecs.append(dense)

    normal_fit_predict(vecs, document_labels)
Example #20
def get_vector(dictionary, content):
    tmp = dictionary.doc2bow(get_words_main(content))
    # print(tmp)
    # dense = tfidf_model[tmp]
    # print(dense)
    dense = list(matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])
    return dense
Example #21
def train_nmf_model(corpus: Corpus,
                    num_topics: int,
                    seed=0,
                    max_iter=500) -> TopicModel:
    """ Train a topic model using NMF.

    :param num_topics: The number of topics to train.
    :param seed: The seed used for random number generation.
    :param max_iter: The maximum number of iterations to use for training.
                     More iterations mean better results, but longer training
                     times.
    """
    import gensim.models.nmf

    dic = corpus.dictionary
    freqs = corpus.frequencies

    tfidf = gensim.models.tfidfmodel.TfidfModel(dictionary=dic)
    model = gensim.models.nmf.Nmf(list(tfidf[freqs]),
                                  num_topics=num_topics,
                                  passes=max_iter,
                                  random_state=seed,
                                  w_stop_condition=1e-9,
                                  h_stop_condition=1e-9,
                                  w_max_iter=50,
                                  h_max_iter=50)

    doc2topic = corpus2dense(model[freqs], num_topics).T
    topic2token = model.get_topics()

    return TopicModel(dic, doc2topic, topic2token)
Example #22
def get_vector(dictionary, content):
    '''
    Count the feature words of an article
    '''
    tmp = dictionary.doc2bow(get_words_main(content))
    dense = list(matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])
    return dense
Example #23
def arun(corpus, dictionary, min_topics=10, max_topics=100, step=10):
    l = np.array([sum(cnt for _, cnt in doc) for doc in corpus])
    
    kl = []
    for n in range(min_topics, max_topics+step, step):
        print("starting multicore LDA for num_topics={}".format(n))
        st = time.clock()
        lda = LdaMulticore(corpus=corpus,
                           id2word=dictionary,
                           num_topics=n,
                           passes=20,
                           workers=mp.cpu_count()-1)
        el = time.clock()-st
        print("multicore LDA finished in {:.2f}s!".format(el))
        
        m1 = lda.expElogbeta
        _, cm1, _ = np.linalg.svd(m1)
        
        lda_topics = lda[corpus]
        m2 = matutils.corpus2dense(lda_topics, lda.num_topics).transpose()
        cm2 = l.dot(m2)
        cm2 = cm2 + 0.0001
        cm2norm = np.linalg.norm(l)
        cm2 = cm2/cm2norm
        kl.append(sym_kl(cm1, cm2))
        
    return kl
Example #24
def load_data(fname):
    
    print 'input file name:', fname

    target = [] # labels
    source = [] # document vectors

    # Build the document list
    document_list = []
    word_list = []
    for l in open(fname, 'r').readlines():
        sample = l.strip().split(' ',  1)
        label = sample[0]
        target.append([label]) # label
        word_list = preprocess_string(sample[1]) # stop-word removal and stemming
        document_list.append(word_list) # word list for this document

    # Build the dictionary,
    # excluding very rare and very frequent words
    dct = Dictionary(document_list)
    dct.filter_extremes(no_below=3, no_above=0.6)

    # Vectorize each document as BoW
    for doc in document_list:
        tmp = dct.doc2bow(doc) # ex.[(4, 1), (23,1),..., (119,2)] 
        dense = list(matutils.corpus2dense([tmp], num_terms=len(dct)).T[0])
        source.append(dense)

    dataset = {}
    dataset['target'] = np.array(target)    
    dataset['source'] = np.array(source)    

    return dataset #, max_len, width
Example #25
    def train(self):
        self.process_dataset(self.training_path, True)
        self.process_dataset(self.training_path, False)

        self.training_sources_length = len(self.sources)
        self.logger.debug(
            f'After train set processing: sources len {len(self.sources)}, labels len {len(self.labels)}'
        )

        self.process_dataset(self.test_path, True)
        self.process_dataset(self.test_path, False)

        self.logger.debug(
            f'After full processing: sources len {len(self.sources)}, labels len {len(self.labels)}'
        )

        corpus = Texts(self.sources).to_vector()
        dictionary = corpora.Dictionary(corpus)
        corpus = [dictionary.doc2bow(text) for text in corpus]
        model = TfidfModel(corpus)
        corpus = [text for text in model[corpus]]
        self.training_text_matrix = corpus2dense(corpus,
                                                 num_terms=len(
                                                     dictionary.token2id)).T

        if self.pca:
            self.training_text_matrix = self.pca.fit_transform(
                self.training_text_matrix)

        self.classifier.fit(
            self.training_text_matrix[:self.training_sources_length],
            self.labels[:self.training_sources_length])

        self.is_trained = True
Example #26
 def get_dense(self, text):
     self.__load_dictionary()
     words = NLP(text).get_words_feature()
     # Bag of words
     vec = self.dictionary.doc2bow(words)
     dense = list(matutils.corpus2dense([vec], num_terms=len(self.dictionary)).T[0])
     return dense
Example #27
 def infer_topics(self, num_topics=10):
     self.nb_topics = num_topics
     lda = models.LdaModel(corpus=self.corpus.gensim_vector_space,
                           iterations=10000,
                           num_topics=num_topics)
     tmp_topic_word_matrix = list(
         lda.show_topics(num_topics=num_topics,
                         num_words=len(self.corpus.vocabulary),
                         formatted=False))
     row = []
     col = []
     data = []
     for topic_id in range(self.nb_topics):
         topic_description = tmp_topic_word_matrix[topic_id]
         for probability, word_id in topic_description:
             row.append(topic_id)
             col.append(word_id)
             data.append(probability)
     self.topic_word_matrix = coo_matrix(
         (data, (row, col)),
         shape=(self.nb_topics, len(self.corpus.vocabulary))).tocsr()
     self.document_topic_matrix = sparse.csr_matrix(
         np.transpose(
             matutils.corpus2dense(lda[self.corpus.gensim_vector_space],
                                   num_topics, self.corpus.size)))
Example #28
 def infer_topics(self, num_topics=10):
     if self.corpus.gensim_vector_space is None:
         self.corpus.gensim_vector_space = matutils.Sparse2Corpus(self.corpus.sklearn_vector_space,
                                                                  documents_columns=False)
     self.nb_topics = num_topics
     lda = models.LdaModel(corpus=self.corpus.gensim_vector_space,
                           iterations=10000,
                           num_topics=num_topics)
     tmp_topic_word_matrix = list(lda.show_topics(num_topics=num_topics,
                                                  num_words=len(self.corpus.vocabulary),
                                                  formatted=False))
     row = []
     col = []
     data = []
     for topic_id in range(self.nb_topics):
         topic_description = tmp_topic_word_matrix[topic_id]
         for word_id, probability in topic_description[1]:
             row.append(topic_id)
             col.append(int(word_id))
             data.append(probability)
     self.topic_word_matrix = coo_matrix((data, (row, col)),
                                         shape=(self.nb_topics, len(self.corpus.vocabulary))).tocsr()
     self.document_topic_matrix = sparse.csr_matrix(
         np.transpose(matutils.corpus2dense(lda[self.corpus.gensim_vector_space],
                                            num_topics,
                                            self.corpus.size)))
     self.corpus.gensim_vector_space = None
Example #29
def get_Xy(corpus, labels, ndims):
    # Term frequency inverse document frequency (tfidf) weighting:
    # reflects how important a word is to a document in a corpus
    tfidf = models.TfidfModel(corpus)
    tfidf_corpus = tfidf[corpus]
    docs_tfidf = [doc for doc in tfidf_corpus]
    ##scipy_csc_matrix = matutils.corpus2csc(tfidf_corpus).toarray().transpose()
    # Builds an LSI space from the input TFIDF matrix, it uses SVD
    # for dimensionality reduction with num_topics = dimensions
    lsi = models.LsiModel(tfidf_corpus, num_topics=ndims)
    lsi_corpus = lsi[tfidf_corpus]
    docs_lsi = [doc for doc in lsi_corpus]
    X = matutils.corpus2dense(lsi_corpus, num_terms=ndims).transpose()
    # Convert labels to: promoter: 0, enhancer: 1
    y = []
    error_ind = []
    for i in range(len(labels)):
        if labels[i] == 'promoter':
            y.append(0)
        elif labels[i] == 'enhancer':
            y.append(1)
        else:
            print "Promoter / enhancer not recorded at index", i
            error_ind.append(i)
    y = np.asarray(y)
    return (X, y)
Example #30
def jaccard_wrapper(doclist):
    """Wrapper function, that 1) calculates jaccard similarities and b) extracts all distances once.

    Args:
        doclist (list of strings): document list

    Returns:
        np.array: flat array of occurring jaccard values
    """
    # Transform into sparse document-word-matrix
    #vectorizer = CountVectorizer(token_pattern="(?u)[\w.!?\\/-]+")
    #vocab_matrix = vectorizer.fit_transform(doclist).todense()

    
    all_words = [x.split(" ") for x in doclist]
    lexicon = corpora.Dictionary(all_words)
    bow_x = []
    for t in all_words:
        bow_x.append(lexicon.doc2bow(t))
    vocab_matrix = matutils.corpus2dense(bow_x, num_terms=len(lexicon.token2id)).T.astype(bool)
    
    # Calculate jaccard similarities
    sim_matrix =  1 - pairwise_distances(vocab_matrix, metric='jaccard')
    
    # Extract values from upper triangle in matrix
    size = len(doclist)
    indices = np.triu_indices(size, k=1)
    flat_values = sim_matrix[indices]
    return flat_values
Example #31
def get_vector(dictionary, content):
    '''
    Count the feature words of an article
    '''
    tmp = dictionary.doc2bow(get_words_main(content))
    dense = list(matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])
    return dense
Example #32
def predictData(strDictName, strModelName, sentence, mecab):

    words = []

    # Load the model
    with open(strModelName, 'rb') as f:
        model = cPickle.load(f)

    # Tokenize the input string argument and append it
    words.append(make_random_forest.makeWakatiData(mecab, sentence))

    #print(words)
    dictionary = corpora.Dictionary.load_from_text(strDictName)

    # BoW
    corpus = [dictionary.doc2bow(text) for text in words]

    aryDense = []

    # Build the vectors
    for c in corpus:
        dense = list(
            matutils.corpus2dense([c], num_terms=len(dictionary)).T[0])
        print(dense)
        aryDense.append(dense)

    result = model.predict(aryDense)

    return result
Example #33
def loadData(features=100):  
    lmtzr = WordNetLemmatizer()
    
    with open('/home/molina/Dropbox/Datasets/VisualGenome/objects.txt') as f:
        images = f.readlines()
        
        stopw = set(stopwords.words('english'))
        
        texts = [[lmtzr.lemmatize(word) for word in document.lower().split() if word not in stopw] for document in images]
        
        print("documents", len(texts))
        
        dictionary = corpora.Dictionary(texts)
        
        print("dict before filtering", dictionary)
        
        dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=features)
        dictionary.compactify()
        
        print("dict after filtering", dictionary)
        
        
        corpus = []
        
        for text in texts:
            corpus.append(dictionary.doc2bow(text))
        
        denseCorpus = corpus2dense(corpus, num_terms=len(dictionary.keys()))
        
        print(corpus[0])
        print(denseCorpus[0])
        
        
        return corpus, denseCorpus, dictionary
Example #34
def arun_metric(corpus, dictionary, min_topics=1, max_topics=1, iteration=1):
    """ Caluculates Arun et al metric..
    """
    result = [];
    for i in range(min_topics, max_topics, iteration):
        # Instanciates LDA.
        lda = models.ldamodel.LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=i
        )
        # Caluculates raw LDA matrix.
        matrix = lda.expElogbeta
        # Caluculates SVD for LDA matris.
        U, document_word_vector, V = numpy.linalg.svd(matrix)
        # Gets LDA topics.
        lda_topics = lda[my_corpus]
        # Caluculates document-topic matrix.
        term_document_matrix = matutils.corpus2dense(
            lda_topics, lda.num_topics
        ).transpose()
        document_topic_vector = corpus_length_vector.dot(term_document_matrix)
        document_topic_vector = document_topic_vector + 0.0001
        document_topic_norm   = numpy.linalg.norm(corpus_length_vector)
        document_topic_vector = document_topic_vector / document_topic_norm
        result.append(symmetric_kl_divergence(
            document_word_vector,
            document_topic_vector
        ))
    return result
Example #35
 def get_dense_bow(self, text):
     words = text.split()
     self.__load_dictionary()
     vec = self.dictionary.doc2bow(words)
     dense = list(
         matutils.corpus2dense([vec], num_terms=len(self.dictionary)).T[0])
     return dense
Example #36
 def infer_topics(self, num_topics=10):
     if self.corpus.gensim_vector_space is None:
         self.corpus.gensim_vector_space = matutils.Sparse2Corpus(self.corpus.sklearn_vector_space,
                                                                  documents_columns=False)
     self.nb_topics = num_topics
     lsa = models.LsiModel(corpus=self.corpus.gensim_vector_space,
                           id2word=self.corpus.vocabulary,
                           num_topics=num_topics)
     tmp_topic_word_matrix = list(lsa.show_topics(num_topics=num_topics,
                                                  num_words=len(self.corpus.vocabulary),
                                                  formatted=False))
     row = []
     col = []
     data = []
     for topic_id in range(self.nb_topics):
         topic_description = tmp_topic_word_matrix[topic_id]
         for weight, word_id in topic_description:
             row.append(topic_id)
             col.append(word_id)
             data.append(weight)
     self.topic_word_matrix = coo_matrix((data, (row, col)),
                                         shape=(self.nb_topics, len(self.corpus.vocabulary))).tocsr()
     self.document_topic_matrix = np.transpose(matutils.corpus2dense(lsa[self.corpus.gensim_vector_space],
                                                                     num_topics,
                                                                     self.corpus.size))
     self.corpus.gensim_vector_space = None
Example #37
def arun(corpus, dictionary, min_topics=10, max_topics=21, step=5):
    print "Arun runing"
    output = []
    for i in range(min_topics, max_topics, step):
        lda = LDA(dictionary, corpus, i, "lda20/lda_training_" + str(i))
        print "Модель построена/загружена"
        m1 = lda.expElogbeta
        # U, cm1, V = np.linalg.svd(m1)
        smat = scipy.sparse.csc_matrix(m1)  # convert to sparse CSC format
        U, cm1, V = sparsesvd(smat, i + 30)  # do SVD, asking for 100 factors
        print "sparsesvd сделано"
        #Document-topic matrix
        lda_topics = lda[my_corpus]
        m2 = matutils.corpus2dense(lda_topics, lda.num_topics).transpose()
        cm2 = l.dot(m2)
        cm2 = cm2 + 0.0001
        print "cm2norm begin"
        cm2norm = np.linalg.norm(l)
        print "cm2norm end"
        cm2 = cm2/cm2norm
        print len(cm1), len(cm2)
        kl = sym_kl(cm1, cm2)
        output.append((i, kl))
        print i, kl
    print output
    return output
Example #38
    def log_perplexity(self, corpus):
        """Calculate perplexity bound on the specified corpus.

        Perplexity = e^(-bound).

        Parameters
        ----------
        corpus : list of list of (int, float)
            The corpus on which the perplexity is computed.

        Returns
        -------
        float
            The perplexity bound.

        """
        W = self.get_topics().T

        H = np.zeros((W.shape[1], len(corpus)))
        for bow_id, bow in enumerate(corpus):
            for topic_id, factor in self[bow]:
                H[topic_id, bow_id] = factor

        dense_corpus = matutils.corpus2dense(corpus, W.shape[0])

        pred_factors = W.dot(H)
        pred_factors /= pred_factors.sum(axis=0)

        return (np.log(pred_factors, where=pred_factors > 0) * dense_corpus).sum() / dense_corpus.sum()
Example #39
def getRank(fname):
    fname = path[0]
    twitter_stoplist = ["what's","it's","they'd"]
    stoplist = nltk.corpus.stopwords.words('english')+twitter_stoplist
    document = []
    with open(fname) as f:
        for line in f:
            tl = []
            tl.append(line[:32])  # append the line ID
            line=re.sub(r"(http:.*?\s)","url" ,line[33:])
            d = re.sub(r'\W|\d', ' ', line)
            d = re.sub(r'\s+',' ',d)  # collapse extra whitespace
            for word in d.split():
                word = word.lower()
                if word not in stoplist and len(word)<15 and len(word)>2:
                    word = nltk.PorterStemmer().stem(word)
                    tl.append(word)
                    document.append(tl)
        return document

    print fname+" read complate"
    dc = [dictionary.doc2bow(t) for t in document]
    d = matutils.corpus2dense(dc,dimension)
    # for i in range(len(d[0])):
    #     tsum = sum(d[:,i])
    #     if tsum != 0:
    #         for j in range(len(d)):
    #             if d[j][i]/tsum > 0.5:
    #                 d[j][i] = 2
    r2 = numpy.dot(numpy.transpose(u),d)
    score2 =numpy.array([sum(r2[:,i]) for i in range(len(r2[0]))])
    x = [document[i][0] for i in range(len(document))]
    l2 = zip(score2,x)
    print fname+" process complate"
    return l2
Example #40
 def mm(self, dictionary):
     values_set = set(dictionary.values())
     self.texts = [[token for token in text if token in values_set]
                   for text in self.texts]
     # print(self.texts[0])
     self.corpus = [dictionary.doc2bow(text) for text in self.texts]
     self.dense = matutils.corpus2dense(self.corpus, len(dictionary)).T
Example #41
def text_from_clusters(posts, cluster_labels, threshold=0.7, top_n=10):
    ''' extract high tfidf tokens and artist names from each cluster

    unusual_tokens, cluster_tokens_cleaned = text_from_clusters(posts, cluster_labels, threshold=0.7, top_n=10)

    :param posts: DataFrame with 'text' column
    :param cluster_labels: array of cluster_labels of len=post.shape[0]
    :param threshold=0.7: tfidf threshold above which to return "unusual" tokens
    :param top_n: [deprecated] top # tokens to return
    :return: unusual_tokens, cluster_tokens_cleaned
    '''

    # Get text from each cluster
    from nltk.tokenize import word_tokenize
    from gensim import corpora, matutils
    import string

    # n_clust = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
    n_clust = max(cluster_labels)+1

    docs = posts['text'].values
    tokened_docs = [word_tokenize(doc) if doc is not None else ['#'] for doc in docs]

    cluster_tokens = [[]] *  n_clust # only includes clusters that are labeled (not "noise")
    for ind, doc in enumerate(tokened_docs):
        if cluster_labels[ind] == -1:
            pass # ignore points not considered to be in a cluster
        else:
            cluster_tokens[cluster_labels[ind]] = cluster_tokens[cluster_labels[ind]] + doc

    # remove funny characters and spaces
    bad_words = [' ', 'san', 'in', 'the']
    chars = string.punctuation + ' '
    temp_cleaned = [[''.join(ch for ch in word.lower() if ch not in chars) for word in doc] for doc in cluster_tokens]
    temp_cleaned = [[word for word in doc if len(word) > 1] for doc in temp_cleaned]
    cluster_tokens_cleaned = [[word for word in doc if word not in bad_words] for doc in temp_cleaned]

    dictionary = corpora.dictionary.Dictionary(cluster_tokens_cleaned) # indexing: dictionary.token2id['streetart']
    bow_corp = [dictionary.doc2bow(doc) for doc in cluster_tokens_cleaned]
    token_freq = matutils.corpus2dense(bow_corp, len(dictionary.token2id.keys()))

    # normalize words by occurrence
    from sklearn.feature_extraction.text import TfidfTransformer
    transformer = TfidfTransformer()

    tfidf = transformer.fit_transform(token_freq)
    norm_token_freq = tfidf.toarray()

    words = dictionary.token2id.keys()

    # Pick out unusual words
    unusual_tokens = [[]] * n_clust #[[x] for x in range(0,n_clust)] #* n_clust
    for word in words:
        for ind_cluster, p in enumerate(norm_token_freq[dictionary.token2id[word],:]):
            if p > threshold:
                if np.sum(token_freq[dictionary.token2id[word]]) > 1: # check appear more than once in entire corpus
                    unusual_tokens[ind_cluster] = unusual_tokens[ind_cluster] + [(str(word), token_freq[dictionary.token2id[word], ind_cluster])]

    return unusual_tokens, cluster_tokens_cleaned
Example #42
def main():
    contents = {}
    data_train = []
    # Ground-truth labels: 0 = Dokujo Tsushin, 1 = IT Life Hack, ...
    label_train = []

    directories = dir_list()
    for directory in directories:
        files = file_list(directory)
        for file in files:
            content = read_data(data_path(directory) + file)
            contents[file] = content
            label_train.append(class_id(file))

    # Build the dictionary, with duplicate words removed
    words = get_words(contents)
    wordbook = corpora.Dictionary(words)

    # These values need tuning:
    # no_below: ignore words that appear in no_below or fewer documents
    # no_above: ignore words that appear in more than no_above (fraction) of the documents
    wordbook.filter_extremes(no_below=20, no_above=0.2)

    # Save the dictionary as a .txt file
    wordbook.save_as_text(SAVE_FILE_NAME)

    # Reload the saved dictionary file to create the dictionary object (wordbook)
    # wordbook = corpora.Dictionary.load_from_text(SAVE_FILE_NAME)

    # BoW: each document becomes (word id, count) pairs
    for w in words:
        vector = wordbook.doc2bow(w)
        # Get the feature vector
        dense = list(
            matutils.corpus2dense([vector], num_terms=len(wordbook)).T[0])
        data_train.append(dense)

    # Create a random forest classifier
    estimator = RandomForestClassifier()

    # Train
    estimator.fit(data_train, label_train)

    # Predict
    # label_predict = estimator.predict(data_train)

    # Prediction score
    print(estimator.score(data_train, label_train))

    # Split into training and test data
    data_train_s, data_test_s, label_train_s, label_test_s = train_test_split(
        data_train, label_train, test_size=0.5)

    # Validate once more with another random forest
    estimator2 = RandomForestClassifier()

    estimator2.fit(data_train_s, label_train_s)

    print(estimator2.score(data_train_s, label_train_s))
Example #43
def _extract_data(topic_model, corpus, dictionary, doc_topic_dists=None):
    if not matutils.ismatrix(corpus):
        corpus_csc = matutils.corpus2csc(corpus, num_terms=len(dictionary))
    else:
        corpus_csc = corpus
        # Need corpus to be a streaming gensim list corpus for len and inference functions below:
        corpus = matutils.Sparse2Corpus(corpus_csc)

    beta = 0.01
    fnames_argsort = np.asarray(list(dictionary.token2id.values()),
                                dtype=np.int_)
    term_freqs = corpus_csc.sum(axis=1).A.ravel()[fnames_argsort]
    term_freqs[term_freqs == 0] = beta
    doc_lengths = corpus_csc.sum(axis=0).A.ravel()

    assert term_freqs.shape[0] == len(
        dictionary
    ), 'Term frequencies and dictionary have different shape {} != {}'.format(
        term_freqs.shape[0], len(dictionary))
    assert doc_lengths.shape[0] == len(
        corpus
    ), 'Document lengths and corpus have different sizes {} != {}'.format(
        doc_lengths.shape[0], len(corpus))

    if hasattr(topic_model, 'lda_alpha'):
        num_topics = len(topic_model.lda_alpha)
    else:
        num_topics = topic_model.num_topics

    if doc_topic_dists is None:
        # If its an HDP model.
        if hasattr(topic_model, 'lda_beta'):
            gamma = topic_model.inference(corpus)
        else:
            gamma, _ = topic_model.inference(corpus)
        doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
    else:
        if isinstance(doc_topic_dists, list):
            doc_topic_dists = matutils.corpus2dense(doc_topic_dists,
                                                    num_topics).T
        elif issparse(doc_topic_dists):
            doc_topic_dists = doc_topic_dists.T.todense()
        doc_topic_dists = doc_topic_dists / doc_topic_dists.sum(axis=1)

    assert doc_topic_dists.shape[
        1] == num_topics, 'Document topics and number of topics do not match {} != {}'.format(
            doc_topic_dists.shape[1], num_topics)

    # get the topic-term distribution straight from gensim without
    # iterating over tuples
    if hasattr(topic_model, 'lda_beta'):
        topic = topic_model.lda_beta
    else:
        topic = topic_model.state.get_lambda()
    topic = topic / topic.sum(axis=1)[:, None]
    topic_term_dists = topic[:, fnames_argsort]

    assert topic_term_dists.shape[0] == doc_topic_dists.shape[1]
    return doc_topic_dists
Example #44
def predict(input_text):
    dictionary = corpora.Dictionary.load_from_text('%s/dictionary.txt' % (os.environ["INQUIRY_BOT_PROJECT_DIR"]))
    words = morphological_analyze(input_text)
    bow = dictionary.doc2bow(words)
    dense = list(matutils.corpus2dense([bow], num_terms=len(dictionary)).T[0])

    estimator = joblib.load('%s/linear_svm.pkl' % (os.environ["INQUIRY_BOT_PROJECT_DIR"]))
    return json.dumps({"answer_id": estimator.predict([dense])[0]})
Example #45
def test_lda(mlda_model=None, tfidf_model=None, corpus=None, dictionary=None, n_topics=2):
    data_tfidf = matutils.corpus2dense(tfidf_model[corpus], num_terms=len(dictionary)).T
    data_mlda = mlda_model.corpus2dense_lda(bow_corpus=corpus, dictionary=dictionary, n_topics=n_topics)
    if data_mlda is None:
        return data_tfidf
    else:
        x_data = np.concatenate((data_tfidf, data_mlda), axis=1)
    return x_data
Example #46
 def calculate_lsi_transformed_corpus_matrix(self):
     """Find the documents represented as vectors in LSI space"""
     self.transformed = matutils.corpus2dense(self.lsi[self.corpus_tfidf], len(self.lsi.projection.s)).T
     # normalize the vectors because only the vector orientation represents semantics
     transformed_norms = np.sum(self.transformed**2,axis=-1)**(1./2)
     # avoid dividing by zero
     transformed_norms[ transformed_norms==0] = 1
     self.transformed = self.transformed / transformed_norms.reshape(len(transformed_norms),1)
Example #47
 def load_test_category(self, dir):
     texts, categories = self.load_data(dir,getTokens=self.getTokensForCategory)
     # Build the corpus and the LDA test set
     corpus = [self.dictionary.doc2bow(text) for text in texts]
     test_cat = self.lda[corpus]
     mat = matutils.corpus2dense(test_cat, num_terms = self.lda.num_topics, dtype='float64').T
     self.cat_result = self.test(mat)
     return self.cat_result
Example #48
def estimate(news_title):
    # Predict on unseen data
    vec = dictionary.doc2bow(M.isMecab(news_title))
    print(vec)
    pre = list(matutils.corpus2dense([vec], num_terms=len(dictionary)).T[0])
    print(pre)
    label_predict = estimator.predict(pre)
    print (label_predict)
Example #49
 def get_dense(self, text):
     #remove stopword
     words = TextPreprocess(text).get_words_split()
     # Bag of words
     self.load_dictionary()
     vec = self.dictionary.doc2bow(words)
     dense = list(matutils.corpus2dense([vec], num_terms=len(self.dictionary)).T[0])
     return dense
Example #50
    def transform(self, corpus):
        """ Create a table with topics representation. """
        topics = self.model[corpus.ngrams_corpus]
        matrix = matutils.corpus2dense(topics, num_docs=len(corpus),
                                       num_terms=self.num_topics).T

        corpus.extend_attributes(matrix[:, :len(self.topic_names)], self.topic_names)
        return corpus
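Note that in the snippet above `num_terms` is the number of topics, not the vocabulary size, because `self.model[corpus.ngrams_corpus]` yields per-document topic distributions rather than term counts. A standalone sketch of the same pattern, assuming a trained gensim `LdaModel` named `lda` and a BoW corpus `bow_corpus` (hypothetical names):

from gensim import matutils

# Hypothetical names: `lda` is a trained gensim LdaModel, `bow_corpus` a list of BoW documents.
doc_topics = lda[bow_corpus]                           # stream of (topic_id, probability) lists
dense = matutils.corpus2dense(doc_topics,
                              num_terms=lda.num_topics,    # rows are topics here, not vocabulary terms
                              num_docs=len(bow_corpus)).T  # shape: (n_docs, n_topics)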
Example #51
def get_vector(dictionary, content):
    '''
    Analyze content and return a vector of feature using dictionary.
    @param  gensim_dict, str
    @return vector
    '''
    tmp = dictionary.doc2bow(get_words_main(content))
    dense = list(matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])
    return dense
Example #52
    def load_test_polarity(self, dir):

        texts, polarities = self.load_data(dir,getTokens=self.getTokensForPolarity)
        # Build the corpus and the tf-idf test set
        corpus = [self.dictionary.doc2bow(text) for text in texts]
        test_pol = self.tfidf[corpus]
        mat = matutils.corpus2dense(test_pol, num_terms=len(self.dictionary),dtype='float64').T
        self.pol_result = self.test(mat)
        return self.pol_result
Example #53
def LDA_process(dataset):
    fea, link, label = load_dataset(dataset)
    corpus = matutils.Dense2Corpus(fea, documents_columns=False)
    num_topics = 100
    print 'performing lda...'
    model = models.LdaModel(corpus, num_topics=num_topics, passes=10)
    topic_fea = matutils.corpus2dense(model[corpus], num_topics)
    topic_fea = topic_fea.transpose()
    np.save('dataset/'+dataset+'/lda_fea', topic_fea)
Example #54
def extractTopicModelData(df_comments, set_tag, tag):
    corpus = []
    i = 0
    for _, row in df_comments.iterrows():   
        comm = row['comment_content']
        corpus.append(comment_to_words_for_topics(comm))
        i += 1
        if i % 100 == 0:
            print i,datetime.datetime.now().time()
    
         
    print len(corpus)
    
    train_v, test_v = np.load('train_vect.npy'), np.load('test_vect.npy')
    train_list = []
    test_list = []
    for v in train_v:
        train_list.append(corpus[v])
    for v in test_v:
        test_list.append(corpus[v])    
        
    lda = models.LdaModel.load(model_path + set_tag.replace("_","") + "_lda_model")
    
    dictionary = corpora.Dictionary.load(model_path + set_tag.replace("_","") + "_dictionary")
    train = [dictionary.doc2bow(text) for text in train_list]
    test = [dictionary.doc2bow(text) for text in test_list]
    
    lda.print_topics(20, 5)
    
    docTopicProbMat_train = lda[train]
    docTopicProbMat_test = lda[test]
    
    #print lda.top_topics(docTopicProbMat_train, 10)
    
    train_lda=matutils.corpus2dense(docTopicProbMat_train, 100, num_docs=len(train)).transpose()
    test_lda=matutils.corpus2dense(docTopicProbMat_test, 100, num_docs=len(test)).transpose()
      
    print train_lda.shape
    print test_lda.shape
    
    save_sparse_csr(feature_set_path + set_tag + "lda" + tag + "_train", sparse.csr_matrix(train_lda)) 
    save_sparse_csr(feature_set_path + set_tag + "lda" + tag + "_test",  sparse.csr_matrix(test_lda)) 
    
    print "DONE LDA"
Example #55
def gensim2ndarray(corpus, dim, num_docs=None, dtype=numpy.float32):
    """Convert a gensim-style list of list of tuples into a numpy ndarray
    with documents as rows. Can now also deal with a single sparse vector.

    Mirror function to ``ndarray2gensim``."""
    # Checking for single-vector.
    # print 'Corpus: {0}'.format(corpus)
    if isinstance(corpus[0], tuple):
        return sparse2full(corpus, dim)
    return corpus2dense(corpus, dim, num_docs=num_docs, dtype=dtype).T
Example #56
	def transform(self, X):
		corpus = [self.dictionary.doc2bow(text) for text in X]

		if self.use_tfidf:
			corpus = self.tfidf[corpus]

		corpus_lsi = self.model[corpus]
		corpus_lsi_dense = matutils.corpus2dense(corpus_lsi, self.n_latent_topics).T

		return corpus_lsi_dense
Example #57
 def vec_to_data(self,txtName):
   dictionary = self.makeDict(txtName)
   corpus = [dictionary.doc2bow(word) for word in self.dict_from_nouncsv]
   lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=10)
   data = []
   for words in self.dict_from_nouncsv:
     vec = dictionary.doc2bow(words)
     vec2lsi = lsi[vec]
     dense = list(matutils.corpus2dense([vec2lsi], num_terms=len(lsi.projection.s)).T[0])
     data.append(dense)
   return data
Example #58
def get_vector2(dictionary, content):
    '''
    Convert the test data into vectors
    '''
    test_dense_list=[]
    for item in content:
        tmp = dictionary.doc2bow(item)
        dense = list(matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])
        test_dense_list.append(dense)
    
    return test_dense_list