def extractTopicModelData(articleList, commentList, commentCount, set_tag, tag):
    processed_comment_list = extract_global_bag_of_words_processed(commentList)
    print len(processed_comment_list)
    train_v, test_v = np.load('train_vect.npy'), np.load('test_vect.npy')

    train_list = []
    test_list = []
    for v in train_v:
        train_list.append(processed_comment_list[v])
    for v in test_v:
        test_list.append(processed_comment_list[v])

    lda = models.LdaModel.load(model_path + set_tag.replace("_", "") + "_lda_model")
    dictionary = corpora.Dictionary.load(model_path + set_tag.replace("_", "") + "_dictionary")

    train = [dictionary.doc2bow(text) for text in train_list]
    test = [dictionary.doc2bow(text) for text in test_list]

    docTopicProbMat_train = lda[train]
    docTopicProbMat_test = lda[test]

    # corpus2dense needs the number of terms (here: topics); transpose so rows are documents.
    train_lda = matutils.corpus2dense(docTopicProbMat_train, num_terms=lda.num_topics, num_docs=len(train)).T
    test_lda = matutils.corpus2dense(docTopicProbMat_test, num_terms=lda.num_topics, num_docs=len(test)).T

    print train_lda.shape
    print test_lda.shape

    save_sparse_csr(feature_set_path + set_tag + "lda" + tag + "_train", sparse.csr_matrix(train_lda))
    save_sparse_csr(feature_set_path + set_tag + "lda" + tag + "_test", sparse.csr_matrix(test_lda))

    print "DONE LDA"

def transformed_corpus():
    for doc in input_data:
        if numpy_output:
            yield self._get_hidden_representations(
                matutils.corpus2dense(doc, self.input_dimensionality))
        else:
            yield matutils.any2sparse(
                self._get_hidden_representations(
                    matutils.corpus2dense(doc, self.input_dimensionality)))

def get_BoW_vectors(contents_lines, synopsis_lines):
    """
    Return a list of BoW vectors, one per sentence, for each list of sentences.
    :param contents_lines: list of sentences from the body text
    :param synopsis_lines: list of sentences from the synopsis
    :return: ([np.array], [np.array])
    """
    print('creating BoW vectors...')
    removed_contents_lines = [remove_stop_word(cleaning(line)) for line in contents_lines]
    removed_synopsis_lines = [remove_stop_word(cleaning(line)) for line in synopsis_lines]
    all_lines = removed_contents_lines + removed_synopsis_lines
    vocaburaly = corpora.Dictionary(all_lines)
    contents_BoWs = [vocaburaly.doc2bow(line) for line in removed_contents_lines]
    synopsis_BoWs = [vocaburaly.doc2bow(line) for line in removed_synopsis_lines]
    contents_vectors = [np.array(matutils.corpus2dense([bow], num_terms=len(vocaburaly)).T[0])
                        for bow in contents_BoWs]
    synopsis_vectors = [np.array(matutils.corpus2dense([bow], num_terms=len(vocaburaly)).T[0])
                        for bow in synopsis_BoWs]
    return contents_vectors, synopsis_vectors

def calculate_similarities(self, corpus):
    num_terms = len(self.dictionary)
    if self.simdict is None:
        return matutils.corpus2dense(corpus, num_terms=num_terms).T
    similarities = np.array([[calculate_similarity(doc1, doc2, self.simdict)
                              for doc1 in self.bow_corpus]
                             for doc2 in corpus])
    print similarities.shape
    print matutils.corpus2dense(corpus, num_terms=num_terms).shape
    return np.concatenate((matutils.corpus2dense(corpus, num_terms=num_terms).T, similarities), axis=1)

def pre_process(self, x):
    bow_corpus = [self.dictionary.doc2bow(text) for text in x]
    data_tfidf = matutils.corpus2dense(self.tfidf_model[bow_corpus],
                                       num_terms=len(self.dictionary)).T
    if self.model is None:
        return data_tfidf
    else:
        data_lda = matutils.corpus2dense(self.model[bow_corpus],
                                         num_terms=len(self.dictionary)).T
        x_data = np.concatenate((data_tfidf, data_lda), axis=1)
        return x_data

def handle(self, *args, **options):
    """
    tws = Timeline.objects.all().order_by('?')[:50000]
    for t in tws:
        self.__tokenize(t.body)
    dictionary = corpora.Dictionary(self.__words)
    dictionary.filter_extremes(no_below=2)
    dictionary.save_as_text('tw_dic.txt')
    """
    dictionary = corpora.Dictionary.load_from_text('tw_dic.txt')

    favs = Favorite.objects.all()[:1000]
    for f in favs:
        words = self.__train_tokenize(f.body)
        tmp = dictionary.doc2bow(words)
        dense = list(matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])
        self.__train_data.append(dense)
        self.__train_label.append(1)

    ptws = PublicTimeline.objects.all()[:1000]
    for p in ptws:
        words = self.__train_tokenize(p.body)
        tmp = dictionary.doc2bow(words)
        dense = list(matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])
        self.__train_data.append(dense)
        self.__train_label.append(0)

    estimator = RandomForestClassifier()

    # train
    estimator.fit(self.__train_data, self.__train_label)

    # predict
    tws = Timeline.objects.all().order_by('-ts')[:100]
    for t in tws:
        words = self.__train_tokenize(t.body)
        tmp = dictionary.doc2bow(words)
        dense = list(matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])
        if 1 in estimator.predict([dense]):
            print t.body

def _doc_term_mtx(table, model, input_col, result_type='doc_to_bow_token'):
    corpus = table[input_col].tolist()
    dictionary = model['dictionary']

    bow_corpus = []
    for doc in corpus:
        bow_corpus.append(dictionary.doc2bow(doc))

    doc_to_bow = []
    for i in range(len(corpus)):
        token_cnt = []
        for j in range(len(bow_corpus[i])):
            token_cnt.append('({token}, {cnt})'.format(
                token=dictionary[bow_corpus[i][j][0]], cnt=bow_corpus[i][j][1]))
        doc_to_bow.append(token_cnt)
    doc_to_bow_list = []
    for doc in doc_to_bow:
        doc_to_bow_list.append('{}'.format(list(doc)))

    doc_idx = ['doc_{}'.format(i) for i in range(len(corpus))]
    terms = [term for term in dictionary.token2id.keys()]

    if result_type == 'doc_to_bow_token':
        out_table = pd.DataFrame(data=doc_to_bow_list, columns=['doc_to_bow'])
        out_table.insert(loc=0, column='doc_idx', value=doc_idx)
    elif result_type == 'doc_term_mtx':
        out_table = pd.DataFrame(matutils.corpus2dense(
            bow_corpus, num_terms=len(dictionary.token2id)).T)
        out_table.insert(loc=0, column=' ', value=doc_idx)
        out_table.columns = np.append('', terms)
    elif result_type == 'term_doc_mtx':
        out_table = pd.DataFrame(matutils.corpus2dense(
            bow_corpus, num_terms=len(dictionary.token2id)))
        out_table.insert(loc=0, column=' ', value=terms)
        out_table.columns = np.append('', doc_idx)
    else:
        raise_runtime_error("Please check 'result_type'.")

    rb = BrtcReprBuilder()
    model = _model_dict('doc_term_mtx')
    model['bow_corpus'] = bow_corpus
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table}

def get_dense(self, text):
    dict = self.__load_dictionary()
    words = tp.TextPreprocessing(text).get_words_feature()
    vec = dict.doc2bow(words)
    dense = list(matutils.corpus2dense([vec], num_terms=len(dict)).T[0])
    return dense

def load_data(fname):
    source = []
    target = []
    f = open(fname, "r")
    document_list = []  # one document per line; each document is a list of words
    for l in f.readlines():
        sample = l.strip().split(" ", 1)  # separate the label from the word sequence
        label = int(sample[0])  # label
        target.append(label)
        document_list.append(sample[1].split())  # split into words and add to the document list

    # build the word dictionary
    dictionary = corpora.Dictionary(document_list)
    dictionary.filter_extremes(no_below=5, no_above=0.8)
    # no_below: drop words that appear in fewer than no_below documents
    # no_above: drop words that appear in more than no_above (fraction) of the documents

    # vectorize the documents
    for document in document_list:
        tmp = dictionary.doc2bow(document)  # BoW representation of the document
        vec = list(matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])
        source.append(vec)

    dataset = {}
    dataset['target'] = np.array(target)
    dataset['source'] = np.array(source)
    print "vocab size:", len(dictionary.items())

    return dataset, dictionary

def log_perplexity(self, corpus): """Calculate perplexity bound on the specified corpus. Perplexity = e^(-bound). Parameters ---------- corpus : iterable of list of (int, float), optional Training corpus. Can be either iterable of documents, which are lists of `(word_id, word_count)`, or a sparse csc matrix of BOWs for each document. If not specified, the model is left uninitialized (presumably, to be trained later with `self.train()`). Returns ------- float The perplexity bound. """ W = self.get_topics().T H = np.zeros((W.shape[1], len(corpus))) for bow_id, bow in enumerate(corpus): for topic_id, factor in self[bow]: H[topic_id, bow_id] = factor dense_corpus = matutils.corpus2dense(corpus, W.shape[0]) pred_factors = W.dot(H) pred_factors /= pred_factors.sum(axis=0) return (np.log(pred_factors, where=pred_factors > 0) * dense_corpus).sum() / dense_corpus.sum()
def transformed_corpus():
    for chunk_no, doc_chunk in utils.grouper(bow, chunksize):
        chunk = matutils.corpus2dense(doc_chunk, self.input_dimensionality)
        hidden = self._get_hidden_representations(chunk)
        for column in hidden.T:
            yield matutils.any2sparse(column)

def calculate_embedding(corpus: Corpus, *, rank=2, svd_dims=50, perplexity=30, seed=0):
    """
    Calculate a document embedding that assigns each document in the corpus an
    N-d position based on its word usage.

    :returns: A list of N-d tuples for the documents in the corpus.
    """
    from gensim.models.tfidfmodel import TfidfModel
    from sklearn.decomposition import TruncatedSVD
    from sklearn.manifold import TSNE

    dic = corpus.dictionary
    freqs = corpus.frequencies
    tfidf = corpus2dense(TfidfModel(dictionary=dic)[freqs], len(dic)).T

    if svd_dims is not None:
        svd = TruncatedSVD(n_components=svd_dims, random_state=seed)
        components = svd.fit_transform(tfidf)
    else:
        components = tfidf

    model = TSNE(rank, metric='cosine', square_distances=True,
                 perplexity=perplexity, random_state=seed)
    return model.fit_transform(components)

def transform(self, X):
    X = [text.words for text in X]
    x_tfidf = self.tfidf[[self.dictionary.doc2bow(text) for text in X]]
    x_data = matutils.corpus2dense(x_tfidf, num_terms=len(self.dictionary)).T
    logging.info("Returning data of shape %s " % (x_data.shape,))
    print x_data
    return x_data

def getRank(fname): fname = path[0] twitter_stoplist = ["what's", "it's", "they'd"] stoplist = nltk.corpus.stopwords.words('english') + twitter_stoplist document = [] with open(fname) as f: for line in f: tl = [] tl.append(line[:32]) #添加标号 line = re.sub(r"(http:.*?\s)", "url", line[33:]) d = re.sub(r'\W|\d', ' ', line) d = re.sub(r'\s+', ' ', d) #合并多余空格 for word in d.split(): word = word.lower() if word not in stoplist and len(word) < 15 and len(word) > 2: word = nltk.PorterStemmer().stem(word) tl.append(word) document.append(tl) return document print fname + " read complate" dc = [dictionary.doc2bow(t) for t in document] d = matutils.corpus2dense(dc, dimension) # for i in range(len(d[0])): # tsum = sum(d[:,i]) # if tsum != 0: # for j in range(len(d)): # if d[j][i]/tsum > 0.5: # d[j][i] = 2 r2 = numpy.dot(numpy.transpose(u), d) score2 = numpy.array([sum(r2[:, i]) for i in range(len(r2[0]))]) x = [document[i][0] for i in range(len(document))] l2 = zip(score2, x) print fname + " process complate" return l2
def Tf_idf(sentences):
    dictionary = corpora.Dictionary(sentences)  # build the dictionary
    corpus = [dictionary.doc2bow(sentence) for sentence in sentences]  # build document term-frequency vectors
    tfidf_model = models.TfidfModel(corpus)  # compute tf-idf weights
    corpus_tfidf = tfidf_model[corpus]
    corpus_matrix = corpus2dense(corpus_tfidf, len(dictionary))  # convert to a dense matrix
    return corpus_matrix

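# Hedged usage sketch for Tf_idf above; the sample sentences are illustrative,
# and the imports below are the ones the function itself relies on.
from gensim import corpora, models
from gensim.matutils import corpus2dense

sentences = [["machine", "learning"], ["deep", "learning", "learning"], ["graph", "theory"]]
weights = Tf_idf(sentences)
print(weights.shape)  # (vocabulary size, number of sentences)
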
def vectorize(self, docs, vocab_size):
    '''
    Args:
        docs: bag-of-words format, iterable of iterable of (int, number)
        vocab_size (int): number of terms in the dictionary; determines the
            number of rows of the resulting (terms x documents) matrix.
    '''
    return matutils.corpus2dense(docs, vocab_size)

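# Hedged sketch (not from the original sources) showing the shape convention
# behind the snippets in this file: corpus2dense returns a (num_terms x num_docs)
# matrix, which is why documents-as-rows code transposes with .T.
from gensim import corpora, matutils

texts = [["cat", "dog"], ["dog", "bird", "bird"]]
dictionary = corpora.Dictionary(texts)
bow = [dictionary.doc2bow(t) for t in texts]
dense = matutils.corpus2dense(bow, num_terms=len(dictionary))
assert dense.shape == (len(dictionary), len(texts))     # terms x docs
assert dense.T.shape == (len(texts), len(dictionary))   # docs x terms
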
def get_Xy(corpus, labels, ndims):
    # Term frequency-inverse document frequency (tfidf) weighting:
    # reflects how important a word is to a document in a corpus
    tfidf = models.TfidfModel(corpus)
    tfidf_corpus = tfidf[corpus]
    docs_tfidf = [doc for doc in tfidf_corpus]
    ## scipy_csc_matrix = matutils.corpus2csc(tfidf_corpus).toarray().transpose()

    # Builds an LSI space from the input TFIDF matrix; it uses SVD
    # for dimensionality reduction with num_topics = dimensions
    lsi = models.LsiModel(tfidf_corpus, num_topics=ndims)
    lsi_corpus = lsi[tfidf_corpus]
    docs_lsi = [doc for doc in lsi_corpus]
    X = matutils.corpus2dense(lsi_corpus, num_terms=ndims).transpose()

    # Convert labels to: promoter: 0, enhancer: 1
    y = []
    error_ind = []
    for i in range(len(labels)):
        if labels[i] == 'promoter':
            y.append(0)
        elif labels[i] == 'enhancer':
            y.append(1)
        else:
            print "Promoter / enhancer not recorded at index", i
            error_ind.append(i)
    y = np.asarray(y)

    return (X, y)

def ns(numpy_matrix, number_of_corpus_features, scipy_sparse_matrix):
    # numpy ndarray (terms x documents) -> streamed gensim corpus and back
    corpus = matutils.Dense2Corpus(numpy_matrix)
    numpy_matrix = matutils.corpus2dense(corpus, num_terms=number_of_corpus_features)
    # scipy sparse matrix -> streamed gensim corpus and back
    corpus = matutils.Sparse2Corpus(scipy_sparse_matrix)
    scipy_csc_matrix = matutils.corpus2csc(corpus)

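# Hedged round-trip sketch of the conversions above, with concrete inputs
# (the example matrices are illustrative only).
import numpy as np
import scipy.sparse
from gensim import matutils

dense = np.array([[1.0, 0.0], [0.0, 2.0], [3.0, 0.0]])  # terms x docs
back = matutils.corpus2dense(matutils.Dense2Corpus(dense), num_terms=dense.shape[0])
assert np.allclose(dense, back)

sparse_mat = scipy.sparse.csc_matrix(dense)
csc_back = matutils.corpus2csc(matutils.Sparse2Corpus(sparse_mat))
assert (sparse_mat != csc_back).nnz == 0
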
def main():
    document_words = list()
    document_labels = list()

    count = 0
    with open('tweets_label.csv', 'r') as f:
        reader = csv.reader(f)
        header = next(reader)
        for row in reader:
            document_words.append(parse_word_list(row[0]))
            document_labels.append(row[1])
            count += 1
            if count == 2000:
                break

    dictionary = corpora.Dictionary(document_words)
    dictionary.filter_extremes(no_below=3, no_above=0.4)

    vecs = list()
    for wordlist in document_words:
        bow = dictionary.doc2bow(wordlist)
        dense = list(matutils.corpus2dense([bow], num_terms=len(dictionary)).T[0])
        vecs.append(dense)

    normal_fit_predict(vecs, document_labels)

def get_vector(dictionary, content):
    tmp = dictionary.doc2bow(get_words_main(content))
    # print(tmp)
    # dense = tfidf_model[tmp]
    # print(dense)
    dense = list(matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])
    return dense

def train_nmf_model(corpus: Corpus, num_topics: int, seed=0, max_iter=500) -> TopicModel:
    """ Train a topic model using NMF.

    :param num_topics: The number of topics to train.
    :param seed: The seed used for random number generation.
    :param max_iter: The maximum number of iterations to use for training.
                     More iterations mean better results, but longer training times.
    """
    import gensim.models.nmf

    dic = corpus.dictionary
    freqs = corpus.frequencies
    tfidf = gensim.models.tfidfmodel.TfidfModel(dictionary=dic)

    model = gensim.models.nmf.Nmf(
        list(tfidf[freqs]),
        num_topics=num_topics,
        passes=max_iter,
        random_state=seed,
        w_stop_condition=1e-9,
        h_stop_condition=1e-9,
        w_max_iter=50,
        h_max_iter=50,
    )

    doc2topic = corpus2dense(model[freqs], num_topics).T
    topic2token = model.get_topics()

    return TopicModel(dic, doc2topic, topic2token)

def get_vector(dictionary, content):
    '''
    Count the feature words of an article.
    '''
    tmp = dictionary.doc2bow(get_words_main(content))
    dense = list(matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])
    return dense

def arun(corpus, dictionary, min_topics=10, max_topics=100, step=10):
    l = np.array([sum(cnt for _, cnt in doc) for doc in corpus])
    kl = []
    for n in range(min_topics, max_topics + step, step):
        print("starting multicore LDA for num_topics={}".format(n))
        st = time.clock()
        lda = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=n,
                           passes=20, workers=mp.cpu_count() - 1)
        el = time.clock() - st
        print("multicore LDA finished in {:.2f}s!".format(el))
        m1 = lda.expElogbeta
        _, cm1, _ = np.linalg.svd(m1)
        lda_topics = lda[corpus]
        m2 = matutils.corpus2dense(lda_topics, lda.num_topics).transpose()
        cm2 = l.dot(m2)
        cm2 = cm2 + 0.0001
        cm2norm = np.linalg.norm(l)
        cm2 = cm2 / cm2norm
        kl.append(sym_kl(cm1, cm2))
    return kl

def load_data(fname):
    print 'input file name:', fname

    target = []  # labels
    source = []  # document vectors

    # build the document list
    document_list = []
    word_list = []
    for l in open(fname, 'r').readlines():
        sample = l.strip().split(' ', 1)
        label = sample[0]
        target.append([label])  # label
        word_list = preprocess_string(sample[1])  # stop-word removal, stemming
        document_list.append(word_list)  # per-document word list

    # build the dictionary, excluding very rare and very frequent words
    dct = Dictionary(document_list)
    dct.filter_extremes(no_below=3, no_above=0.6)

    # vectorize each document as a BoW vector
    for doc in document_list:
        tmp = dct.doc2bow(doc)  # e.g. [(4, 1), (23, 1), ..., (119, 2)]
        dense = list(matutils.corpus2dense([tmp], num_terms=len(dct)).T[0])
        source.append(dense)

    dataset = {}
    dataset['target'] = np.array(target)
    dataset['source'] = np.array(source)

    return dataset  # , max_len, width

def train(self):
    self.process_dataset(self.training_path, True)
    self.process_dataset(self.training_path, False)
    self.training_sources_length = len(self.sources)
    self.logger.debug(
        f'After train set processing: sources len {len(self.sources)}, labels len {len(self.labels)}'
    )

    self.process_dataset(self.test_path, True)
    self.process_dataset(self.test_path, False)
    self.logger.debug(
        f'After full processing: sources len {len(self.sources)}, labels len {len(self.labels)}'
    )

    corpus = Texts(self.sources).to_vector()
    dictionary = corpora.Dictionary(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]
    model = TfidfModel(corpus)
    corpus = [text for text in model[corpus]]
    self.training_text_matrix = corpus2dense(corpus, num_terms=len(dictionary.token2id)).T

    if self.pca:
        self.training_text_matrix = self.pca.fit_transform(self.training_text_matrix)

    self.classifier.fit(
        self.training_text_matrix[:self.training_sources_length],
        self.labels[:self.training_sources_length])
    self.is_trained = True

def get_dense(self, text):
    self.__load_dictionary()
    words = NLP(text).get_words_feature()
    # Bag of words
    vec = self.dictionary.doc2bow(words)
    dense = list(matutils.corpus2dense([vec], num_terms=len(self.dictionary)).T[0])
    return dense

def infer_topics(self, num_topics=10):
    self.nb_topics = num_topics
    lda = models.LdaModel(corpus=self.corpus.gensim_vector_space,
                          iterations=10000,
                          num_topics=num_topics)
    tmp_topic_word_matrix = list(
        lda.show_topics(num_topics=num_topics,
                        num_words=len(self.corpus.vocabulary),
                        formatted=False))
    row = []
    col = []
    data = []
    for topic_id in range(self.nb_topics):
        topic_description = tmp_topic_word_matrix[topic_id]
        for probability, word_id in topic_description:
            row.append(topic_id)
            col.append(word_id)
            data.append(probability)
    self.topic_word_matrix = coo_matrix(
        (data, (row, col)),
        shape=(self.nb_topics, len(self.corpus.vocabulary))).tocsr()
    self.document_topic_matrix = sparse.csr_matrix(
        np.transpose(
            matutils.corpus2dense(lda[self.corpus.gensim_vector_space],
                                  num_topics,
                                  self.corpus.size)))

def infer_topics(self, num_topics=10):
    if self.corpus.gensim_vector_space is None:
        self.corpus.gensim_vector_space = matutils.Sparse2Corpus(self.corpus.sklearn_vector_space,
                                                                 documents_columns=False)
    self.nb_topics = num_topics
    lda = models.LdaModel(corpus=self.corpus.gensim_vector_space,
                          iterations=10000,
                          num_topics=num_topics)
    tmp_topic_word_matrix = list(lda.show_topics(num_topics=num_topics,
                                                 num_words=len(self.corpus.vocabulary),
                                                 formatted=False))
    row = []
    col = []
    data = []
    for topic_id in range(self.nb_topics):
        topic_description = tmp_topic_word_matrix[topic_id]
        for word_id, probability in topic_description[1]:
            row.append(topic_id)
            col.append(int(word_id))
            data.append(probability)
    self.topic_word_matrix = coo_matrix((data, (row, col)),
                                        shape=(self.nb_topics, len(self.corpus.vocabulary))).tocsr()
    self.document_topic_matrix = sparse.csr_matrix(
        np.transpose(matutils.corpus2dense(lda[self.corpus.gensim_vector_space],
                                           num_topics, self.corpus.size)))
    self.corpus.gensim_vector_space = None

def get_Xy(corpus, labels, ndims):
    # Term frequency-inverse document frequency (tfidf) weighting:
    # reflects how important a word is to a document in a corpus
    tfidf = models.TfidfModel(corpus)
    tfidf_corpus = tfidf[corpus]
    docs_tfidf = [doc for doc in tfidf_corpus]
    ## scipy_csc_matrix = matutils.corpus2csc(tfidf_corpus).toarray().transpose()

    # Builds an LSI space from the input TFIDF matrix; it uses SVD
    # for dimensionality reduction with num_topics = dimensions
    lsi = models.LsiModel(tfidf_corpus, num_topics=ndims)
    lsi_corpus = lsi[tfidf_corpus]
    docs_lsi = [doc for doc in lsi_corpus]
    X = matutils.corpus2dense(lsi_corpus, num_terms=ndims).transpose()

    # Convert labels to: promoter: 0, enhancer: 1
    y = []
    error_ind = []
    for i in range(len(labels)):
        if labels[i] == 'promoter':
            y.append(0)
        elif labels[i] == 'enhancer':
            y.append(1)
        else:
            print "Promoter / enhancer not recorded at index", i
            error_ind.append(i)
    y = np.asarray(y)

    return (X, y)

def jaccard_wrapper(doclist):
    """Wrapper function that 1) calculates Jaccard similarities and 2) extracts
    all pairwise values once.

    Args:
        doclist (list of strings): document list

    Returns:
        np.array: flat array of occurring Jaccard values
    """
    # Transform into a sparse document-word matrix
    # vectorizer = CountVectorizer(token_pattern="(?u)[\w.!?\\/-]+")
    # vocab_matrix = vectorizer.fit_transform(doclist).todense()
    all_words = [x.split(" ") for x in doclist]
    lexicon = corpora.Dictionary(all_words)
    bow_x = []
    for t in all_words:
        bow_x.append(lexicon.doc2bow(t))
    vocab_matrix = matutils.corpus2dense(bow_x, num_terms=len(lexicon.token2id)).T.astype(bool)

    # Calculate Jaccard similarities
    sim_matrix = 1 - pairwise_distances(vocab_matrix, metric='jaccard')

    # Extract values from the upper triangle of the matrix
    size = len(doclist)
    indices = np.triu_indices(size, k=1)
    flat_values = sim_matrix[indices]
    return flat_values

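# Hedged usage sketch for jaccard_wrapper above; the sample documents are
# illustrative, and the imports are the ones the function itself relies on.
import numpy as np
from gensim import corpora, matutils
from sklearn.metrics import pairwise_distances

docs = ["the cat sat", "the cat ran", "a dog barked"]
print(jaccard_wrapper(docs))  # flattened upper triangle of pairwise Jaccard similarities
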
def predictData(strDictName, strModelName, sentence, mecab):
    words = []

    # load the model
    with open(strModelName, 'rb') as f:
        model = cPickle.load(f)

    # tokenize the input string and append it
    words.append(make_random_forest.makeWakatiData(mecab, sentence))
    # print(words)

    dictionary = corpora.Dictionary.load_from_text(strDictName)

    # BoW
    corpus = [dictionary.doc2bow(text) for text in words]

    aryDense = []
    # build the feature vectors
    for c in corpus:
        dense = list(matutils.corpus2dense([c], num_terms=len(dictionary)).T[0])
        print(dense)
        aryDense.append(dense)

    result = model.predict(aryDense)
    return result

def loadData(features=100):
    lmtzr = WordNetLemmatizer()

    with open('/home/molina/Dropbox/Datasets/VisualGenome/objects.txt') as f:
        images = f.readlines()

    stopw = set(stopwords.words('english'))

    texts = [[lmtzr.lemmatize(word) for word in document.lower().split() if word not in stopw]
             for document in images]
    print("documents", len(texts))

    dictionary = corpora.Dictionary(texts)
    print("dict before filtering", dictionary)
    dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=features)
    dictionary.compactify()
    print("dict after filtering", dictionary)

    corpus = []
    for text in texts:
        corpus.append(dictionary.doc2bow(text))

    denseCorpus = corpus2dense(corpus, num_terms=len(dictionary.keys()))

    print(corpus[0])
    print(denseCorpus[0])

    return corpus, denseCorpus, dictionary

def arun_metric(corpus, dictionary, min_topics=1, max_topics=1, iteration=1):
    """ Calculates the Arun et al. metric. """
    corpus_length_vector = numpy.array([sum(cnt for _, cnt in doc) for doc in corpus])
    result = []
    for i in range(min_topics, max_topics, iteration):
        # Instantiate LDA.
        lda = models.ldamodel.LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=i
        )
        # Calculate the raw LDA matrix.
        matrix = lda.expElogbeta
        # Calculate the SVD of the LDA matrix.
        U, document_word_vector, V = numpy.linalg.svd(matrix)
        # Get LDA topics.
        lda_topics = lda[corpus]
        # Calculate the document-topic matrix.
        term_document_matrix = matutils.corpus2dense(
            lda_topics, lda.num_topics
        ).transpose()
        document_topic_vector = corpus_length_vector.dot(term_document_matrix)
        document_topic_vector = document_topic_vector + 0.0001
        document_topic_norm = numpy.linalg.norm(corpus_length_vector)
        document_topic_vector = document_topic_vector / document_topic_norm
        result.append(symmetric_kl_divergence(
            document_word_vector,
            document_topic_vector
        ))
    return result

def get_dense_bow(self, text):
    words = text.split()
    self.__load_dictionary()
    vec = self.dictionary.doc2bow(words)
    dense = list(matutils.corpus2dense([vec], num_terms=len(self.dictionary)).T[0])
    return dense

def infer_topics(self, num_topics=10):
    if self.corpus.gensim_vector_space is None:
        self.corpus.gensim_vector_space = matutils.Sparse2Corpus(self.corpus.sklearn_vector_space,
                                                                 documents_columns=False)
    self.nb_topics = num_topics
    lsa = models.LsiModel(corpus=self.corpus.gensim_vector_space,
                          id2word=self.corpus.vocabulary,
                          num_topics=num_topics)
    tmp_topic_word_matrix = list(lsa.show_topics(num_topics=num_topics,
                                                 num_words=len(self.corpus.vocabulary),
                                                 formatted=False))
    row = []
    col = []
    data = []
    for topic_id in range(self.nb_topics):
        topic_description = tmp_topic_word_matrix[topic_id]
        for weight, word_id in topic_description:
            row.append(topic_id)
            col.append(word_id)
            data.append(weight)
    self.topic_word_matrix = coo_matrix((data, (row, col)),
                                        shape=(self.nb_topics, len(self.corpus.vocabulary))).tocsr()
    self.document_topic_matrix = np.transpose(
        matutils.corpus2dense(lsa[self.corpus.gensim_vector_space], num_topics, self.corpus.size))
    self.corpus.gensim_vector_space = None

def arun(corpus, dictionary, min_topics=10, max_topics=21, step=5):
    print "Arun running"
    l = np.array([sum(cnt for _, cnt in doc) for doc in corpus])
    output = []
    for i in range(min_topics, max_topics, step):
        lda = LDA(dictionary, corpus, i, "lda20/lda_training_" + str(i))
        print "Model built/loaded"
        m1 = lda.expElogbeta
        # U, cm1, V = np.linalg.svd(m1)
        smat = scipy.sparse.csc_matrix(m1)  # convert to sparse CSC format
        U, cm1, V = sparsesvd(smat, i + 30)  # do SVD, asking for i + 30 factors
        print "sparsesvd done"
        # Document-topic matrix
        lda_topics = lda[corpus]
        m2 = matutils.corpus2dense(lda_topics, lda.num_topics).transpose()
        cm2 = l.dot(m2)
        cm2 = cm2 + 0.0001
        print "cm2norm begin"
        cm2norm = np.linalg.norm(l)
        print "cm2norm end"
        cm2 = cm2 / cm2norm
        print len(cm1), len(cm2)
        kl = sym_kl(cm1, cm2)
        output.append((i, kl))
        print i, kl
    print output
    return output

def log_perplexity(self, corpus): """Calculate perplexity bound on the specified corpus. Perplexity = e^(-bound). Parameters ---------- corpus : list of list of (int, float) The corpus on which the perplexity is computed. Returns ------- float The perplexity bound. """ W = self.get_topics().T H = np.zeros((W.shape[1], len(corpus))) for bow_id, bow in enumerate(corpus): for topic_id, factor in self[bow]: H[topic_id, bow_id] = factor dense_corpus = matutils.corpus2dense(corpus, W.shape[0]) pred_factors = W.dot(H) pred_factors /= pred_factors.sum(axis=0) return (np.log(pred_factors, where=pred_factors > 0) * dense_corpus).sum() / dense_corpus.sum()
def getRank(fname): fname = path[0] twitter_stoplist = ["what's","it's","they'd"] stoplist = nltk.corpus.stopwords.words('english')+twitter_stoplist document = [] with open(fname) as f: for line in f: tl = [] tl.append(line[:32])#添加标号 line=re.sub(r"(http:.*?\s)","url" ,line[33:]) d = re.sub(r'\W|\d', ' ', line) d = re.sub(r'\s+',' ',d)#合并多余空格 for word in d.split(): word = word.lower() if word not in stoplist and len(word)<15 and len(word)>2: word = nltk.PorterStemmer().stem(word) tl.append(word) document.append(tl) return document print fname+" read complate" dc = [dictionary.doc2bow(t) for t in document] d = matutils.corpus2dense(dc,dimension) # for i in range(len(d[0])): # tsum = sum(d[:,i]) # if tsum != 0: # for j in range(len(d)): # if d[j][i]/tsum > 0.5: # d[j][i] = 2 r2 = numpy.dot(numpy.transpose(u),d) score2 =numpy.array([sum(r2[:,i]) for i in range(len(r2[0]))]) x = [document[i][0] for i in range(len(document))] l2 = zip(score2,x) print fname+" process complate" return l2
def mm(self, dictionary):
    values_set = set(dictionary.values())
    self.texts = [[token for token in text if token in values_set] for text in self.texts]
    # print(self.texts[0])
    self.corpus = [dictionary.doc2bow(text) for text in self.texts]
    self.dense = matutils.corpus2dense(self.corpus, len(dictionary)).T

def text_from_clusters(posts, cluster_labels, threshold=0.7, top_n=10):
    '''
    Extract high-tfidf tokens and artist names from each cluster.

    unusual_tokens, cluster_tokens_cleaned = text_from_clusters(posts, cluster_labels, threshold=0.7, top_n=10)

    :param posts: DataFrame with 'text' column
    :param cluster_labels: array of cluster_labels of len=post.shape[0]
    :param threshold=0.7: tfidf threshold above which to return "unusual" tokens
    :param top_n: [deprecated] top # tokens to return
    :return: unusual_tokens, cluster_tokens_cleaned
    '''
    # Get text from each cluster
    from nltk.tokenize import word_tokenize
    from gensim import corpora, matutils
    import string

    # n_clust = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
    n_clust = max(cluster_labels) + 1
    docs = posts['text'].values
    tokened_docs = [word_tokenize(doc) if doc is not None else ['#'] for doc in docs]

    cluster_tokens = [[]] * n_clust  # only includes clusters that are labeled (not "noise")
    for ind, doc in enumerate(tokened_docs):
        if cluster_labels[ind] == -1:
            pass  # ignore points not considered to be in a cluster
        else:
            cluster_tokens[cluster_labels[ind]] = cluster_tokens[cluster_labels[ind]] + doc

    # remove funny characters and spaces
    bad_words = [' ', 'san', 'in', 'the']
    chars = string.punctuation + ' '
    temp_cleaned = [[''.join(ch for ch in word.lower() if ch not in chars) for word in doc]
                    for doc in cluster_tokens]
    temp_cleaned = [[word for word in doc if len(word) > 1] for doc in temp_cleaned]
    cluster_tokens_cleaned = [[word for word in doc if word not in bad_words] for doc in temp_cleaned]

    dictionary = corpora.dictionary.Dictionary(cluster_tokens_cleaned)
    # indexing: dictionary.token2id['streetart']
    bow_corp = [dictionary.doc2bow(doc) for doc in cluster_tokens_cleaned]
    token_freq = matutils.corpus2dense(bow_corp, len(dictionary.token2id.keys()))

    # normalize words by occurrence
    from sklearn.feature_extraction.text import TfidfTransformer
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(token_freq)
    norm_token_freq = tfidf.toarray()
    words = dictionary.token2id.keys()

    # Pick out unusual words
    unusual_tokens = [[]] * n_clust  # [[x] for x in range(0, n_clust)]
    for word in words:
        for ind_cluster, p in enumerate(norm_token_freq[dictionary.token2id[word], :]):
            if p > threshold:
                if np.sum(token_freq[dictionary.token2id[word]]) > 1:  # check it appears more than once in the entire corpus
                    unusual_tokens[ind_cluster] = unusual_tokens[ind_cluster] + \
                        [(str(word), token_freq[dictionary.token2id[word], ind_cluster])]

    return unusual_tokens, cluster_tokens_cleaned

def main():
    contents = {}
    data_train = []
    # correct labels: 0: Dokujo Tsushin, 1: IT Life Hack, ...
    label_train = []

    directories = dir_list()
    for directory in directories:
        files = file_list(directory)
        for file in files:
            content = read_data(data_path(directory) + file)
            contents[file] = content
            label_train.append(class_id(file))

    # build the dictionary list from words with duplicates removed
    words = get_words(contents)
    wordbook = corpora.Dictionary(words)

    # these need tuning:
    # no_below: ignore words used in no more than no_below documents
    # no_above: ignore words used in more than no_above (fraction) of the documents
    wordbook.filter_extremes(no_below=20, no_above=0.2)

    # save the dictionary list as .txt
    wordbook.save_as_text(SAVE_FILE_NAME)

    # load the saved dictionary file to create the dictionary object (wordbook)
    # wordbook = corpora.Dictionary.load_from_text(SAVE_FILE_NAME)

    # BoW, represented as (word id, count) pairs
    for w in words:
        vector = wordbook.doc2bow(w)
        # get the feature vector
        dense = list(matutils.corpus2dense([vector], num_terms=len(wordbook)).T[0])
        data_train.append(dense)

    # create a random forest object
    estimator = RandomForestClassifier()

    # train it
    estimator.fit(data_train, label_train)

    # predict
    # label_predict = estimator.predict(data_train)

    # prediction result
    print(estimator.score(data_train, label_train))

    # split into training and test data
    data_train_s, data_test_s, label_train_s, label_test_s = train_test_split(
        data_train, label_train, test_size=0.5)

    # validate once more with a random forest
    estimator2 = RandomForestClassifier()
    estimator2.fit(data_train_s, label_train_s)
    print(estimator2.score(data_train_s, label_train_s))

def _extract_data(topic_model, corpus, dictionary, doc_topic_dists=None):
    if not matutils.ismatrix(corpus):
        corpus_csc = matutils.corpus2csc(corpus, num_terms=len(dictionary))
    else:
        corpus_csc = corpus
        # Need corpus to be a streaming gensim list corpus for len and inference functions below:
        corpus = matutils.Sparse2Corpus(corpus_csc)

    beta = 0.01
    fnames_argsort = np.asarray(list(dictionary.token2id.values()), dtype=np.int_)
    term_freqs = corpus_csc.sum(axis=1).A.ravel()[fnames_argsort]
    term_freqs[term_freqs == 0] = beta
    doc_lengths = corpus_csc.sum(axis=0).A.ravel()

    assert term_freqs.shape[0] == len(dictionary), \
        'Term frequencies and dictionary have different shape {} != {}'.format(
            term_freqs.shape[0], len(dictionary))
    assert doc_lengths.shape[0] == len(corpus), \
        'Document lengths and corpus have different sizes {} != {}'.format(
            doc_lengths.shape[0], len(corpus))

    if hasattr(topic_model, 'lda_alpha'):
        num_topics = len(topic_model.lda_alpha)
    else:
        num_topics = topic_model.num_topics

    if doc_topic_dists is None:
        # If it's an HDP model.
        if hasattr(topic_model, 'lda_beta'):
            gamma = topic_model.inference(corpus)
        else:
            gamma, _ = topic_model.inference(corpus)
        doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
    else:
        if isinstance(doc_topic_dists, list):
            doc_topic_dists = matutils.corpus2dense(doc_topic_dists, num_topics).T
        elif issparse(doc_topic_dists):
            doc_topic_dists = doc_topic_dists.T.todense()
        doc_topic_dists = doc_topic_dists / doc_topic_dists.sum(axis=1)

    assert doc_topic_dists.shape[1] == num_topics, \
        'Document topics and number of topics do not match {} != {}'.format(
            doc_topic_dists.shape[1], num_topics)

    # get the topic-term distribution straight from gensim without
    # iterating over tuples
    if hasattr(topic_model, 'lda_beta'):
        topic = topic_model.lda_beta
    else:
        topic = topic_model.state.get_lambda()
    topic = topic / topic.sum(axis=1)[:, None]
    topic_term_dists = topic[:, fnames_argsort]

    assert topic_term_dists.shape[0] == doc_topic_dists.shape[1]
    return doc_topic_dists

def predict(input_text):
    dictionary = corpora.Dictionary.load_from_text(
        '%s/dictionary.txt' % (os.environ["INQUIRY_BOT_PROJECT_DIR"]))
    words = morphological_analyze(input_text)
    bow = dictionary.doc2bow(words)
    dense = list(matutils.corpus2dense([bow], num_terms=len(dictionary)).T[0])
    estimator = joblib.load('%s/linear_svm.pkl' % (os.environ["INQUIRY_BOT_PROJECT_DIR"]))
    return json.dumps({"answer_id": estimator.predict([dense])[0]})

def test_lda(mlda_model=None, tfidf_model=None, corpus=None, dictionary=None, n_topics=2):
    data_tfidf = matutils.corpus2dense(tfidf_model[corpus], num_terms=len(dictionary)).T
    data_mlda = mlda_model.corpus2dense_lda(bow_corpus=corpus, dictionary=dictionary, n_topics=n_topics)
    if data_mlda is None:
        return data_tfidf
    else:
        x_data = np.concatenate((data_tfidf, data_mlda), axis=1)
        return x_data

def calculate_lsi_transformed_corpus_matrix(self):
    """Find the documents represented as vectors in LSI space."""
    self.transformed = matutils.corpus2dense(self.lsi[self.corpus_tfidf],
                                             len(self.lsi.projection.s)).T
    # normalize the vectors because only the vector orientation represents semantics
    transformed_norms = np.sum(self.transformed**2, axis=-1)**(1. / 2)
    # avoid dividing by zero
    transformed_norms[transformed_norms == 0] = 1
    self.transformed = self.transformed / transformed_norms.reshape(len(transformed_norms), 1)

def load_test_category(self, dir):
    texts, categories = self.load_data(dir, getTokens=self.getTokensForCategory)
    # build the corpus and the LDA test set
    corpus = [self.dictionary.doc2bow(text) for text in texts]
    test_cat = self.lda[corpus]
    mat = matutils.corpus2dense(test_cat, num_terms=self.lda.num_topics, dtype='float64').T
    self.cat_result = self.test(mat)
    return self.cat_result

def estimate(news_title):
    # predict on unseen data
    vec = dictionary.doc2bow(M.isMecab(news_title))
    print(vec)
    pre = list(matutils.corpus2dense([vec], num_terms=len(dictionary)).T[0])
    print(pre)
    label_predict = estimator.predict([pre])
    print(label_predict)

def get_dense(self, text):
    # remove stopwords
    words = TextPreprocess(text).get_words_split()
    # Bag of words
    self.load_dictionary()
    vec = self.dictionary.doc2bow(words)
    dense = list(matutils.corpus2dense([vec], num_terms=len(self.dictionary)).T[0])
    return dense

def transform(self, corpus): """ Create a table with topics representation. """ topics = self.model[corpus.ngrams_corpus] matrix = matutils.corpus2dense(topics, num_docs=len(corpus), num_terms=self.num_topics).T corpus.extend_attributes(matrix[:, :len(self.topic_names)], self.topic_names) return corpus
def get_vector(dictionary, content):
    '''
    Analyze content and return a vector of features using the dictionary.

    @param gensim_dict, str
    @return vector
    '''
    tmp = dictionary.doc2bow(get_words_main(content))
    dense = list(matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])
    return dense

def load_test_polarity(self, dir):
    texts, polarities = self.load_data(dir, getTokens=self.getTokensForPolarity)
    # build the corpus and the tf-idf model
    corpus = [self.dictionary.doc2bow(text) for text in texts]
    test_pol = self.tfidf[corpus]
    mat = matutils.corpus2dense(test_pol, num_terms=len(self.dictionary), dtype='float64').T
    self.pol_result = self.test(mat)
    return self.pol_result

def LDA_process(dataset):
    fea, link, label = load_dataset(dataset)
    corpus = matutils.Dense2Corpus(fea, documents_columns=False)
    num_topics = 100
    print 'performing lda...'
    model = models.LdaModel(corpus, num_topics=num_topics, passes=10)
    topic_fea = matutils.corpus2dense(model[corpus], num_topics)
    topic_fea = topic_fea.transpose()
    np.save('dataset/' + dataset + '/lda_fea', topic_fea)

def extractTopicModelData(df_comments, set_tag, tag):
    corpus = []
    i = 0
    for _, row in df_comments.iterrows():
        comm = row['comment_content']
        corpus.append(comment_to_words_for_topics(comm))
        i += 1
        if i % 100 == 0:
            print i, datetime.datetime.now().time()

    print len(corpus)
    train_v, test_v = np.load('train_vect.npy'), np.load('test_vect.npy')

    train_list = []
    test_list = []
    for v in train_v:
        train_list.append(corpus[v])
    for v in test_v:
        test_list.append(corpus[v])

    lda = models.LdaModel.load(model_path + set_tag.replace("_", "") + "_lda_model")
    dictionary = corpora.Dictionary.load(model_path + set_tag.replace("_", "") + "_dictionary")

    train = [dictionary.doc2bow(text) for text in train_list]
    test = [dictionary.doc2bow(text) for text in test_list]

    lda.print_topics(20, 5)

    docTopicProbMat_train = lda[train]
    docTopicProbMat_test = lda[test]

    # print lda.top_topics(docTopicProbMat_train, 10)

    train_lda = matutils.corpus2dense(docTopicProbMat_train, 100, num_docs=len(train)).transpose()
    test_lda = matutils.corpus2dense(docTopicProbMat_test, 100, num_docs=len(test)).transpose()

    print train_lda.shape
    print test_lda.shape

    save_sparse_csr(feature_set_path + set_tag + "lda" + tag + "_train", sparse.csr_matrix(train_lda))
    save_sparse_csr(feature_set_path + set_tag + "lda" + tag + "_test", sparse.csr_matrix(test_lda))

    print "DONE LDA"

def gensim2ndarray(corpus, dim, num_docs=None, dtype=numpy.float32):
    """Convert a gensim-style list of list of tuples into a numpy ndarray
    with documents as rows. Can now also deal with a single sparse vector.

    Mirror function to ``ndarray2gensim``."""
    # Checking for single-vector input.
    # print 'Corpus: {0}'.format(corpus)
    if isinstance(corpus[0], tuple):
        return sparse2full(corpus, dim)
    return corpus2dense(corpus, dim, num_docs=num_docs, dtype=dtype).T

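# Hedged usage sketch for gensim2ndarray above; it assumes `import numpy` and
# `from gensim.matutils import sparse2full, corpus2dense` in the surrounding
# module, and the toy vectors are illustrative only.
single_vec = [(0, 1.0), (2, 3.0)]             # one sparse document
toy_corpus = [[(0, 1.0)], [(1, 2.0), (2, 1.0)]]  # two sparse documents
print(gensim2ndarray(single_vec, 3))          # 1-D array of length 3
print(gensim2ndarray(toy_corpus, 3).shape)    # (2, 3): documents as rows
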
def transform(self, X):
    corpus = [self.dictionary.doc2bow(text) for text in X]
    if self.use_tfidf:
        corpus = self.tfidf[corpus]
    corpus_lsi = self.model[corpus]
    corpus_lsi_dense = matutils.corpus2dense(corpus_lsi, self.n_latent_topics).T
    return corpus_lsi_dense

def vec_to_data(self, txtName):
    dictionary = self.makeDict(txtName)
    corpus = [dictionary.doc2bow(word) for word in self.dict_from_nouncsv]
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=10)
    data = []
    for words in self.dict_from_nouncsv:
        vec = dictionary.doc2bow(words)
        vec2lsi = lsi[vec]
        dense = list(matutils.corpus2dense([vec2lsi], num_terms=len(lsi.projection.s)).T[0])
        data.append(dense)
    return data

def get_vector2(dictionary, content):
    '''
    Convert the test data into vectors.
    '''
    test_dense_list = []
    for item in content:
        tmp = dictionary.doc2bow(item)
        dense = list(matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])
        test_dense_list.append(dense)
    return test_dense_list