from sklearn.decomposition import PCA
from gensim import corpora, models
import pickle
import numpy as np

tokens = pickle.load(open('tag_list.pkl', 'rb'))
dictionary = corpora.Dictionary(tokens)
texts = [dictionary.doc2bow(text) for text in tokens]
tfidf_model = models.TfidfModel(texts, normalize=False)
tfidf = np.zeros([len(tokens), 1386], np.float32)
for i in range(len(tokens)):
    temp = tfidf_model[texts[i]]
    for topic in temp:
        tfidf[i, topic[0]] = topic[1]
np.save('tf_idf.npy', tfidf)
# :ref:`core_concepts_corpus`.
#
# One simple example of a model is `tf-idf
# <https://en.wikipedia.org/wiki/Tf%E2%80%93idf>`_.  The tf-idf model
# transforms vectors from the bag-of-words representation to a vector space
# where the frequency counts are weighted according to the relative rarity of
# each word in the corpus.
#
# Here's a simple example.  Let's initialize the tf-idf model, training it on
# our corpus and transforming the string "system minors":
#
from gensim import models

# train the model
tfidf = models.TfidfModel(bow_corpus)

# transform the "system minors" string
words = "system minors".lower().split()
print(tfidf[dictionary.doc2bow(words)])

###############################################################################
# The ``tfidf`` model again returns a list of tuples, where the first entry is
# the token ID and the second entry is the tf-idf weighting. Note that the ID
# corresponding to "system" (which occurred 4 times in the original corpus) has
# been weighted lower than the ID corresponding to "minors" (which only
# occurred twice).
#
# You can save trained models to disk and later load them back, either to
# continue training on new training documents or to transform new documents.
#
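###############################################################################
# A minimal sketch of that save/load round trip, assuming the ``tfidf``,
# ``dictionary`` and ``words`` objects from the example above are still in
# scope (the temporary file name here is purely illustrative, not part of the
# original tutorial):
#
import os
import tempfile

with tempfile.TemporaryDirectory() as tmp_dir:
    tmp_path = os.path.join(tmp_dir, 'model.tfidf')
    tfidf.save(tmp_path)  # persist the trained idf weights to disk
    loaded_tfidf = models.TfidfModel.load(tmp_path)  # load them back later
    # the reloaded model transforms documents exactly like the original one
    print(loaded_tfidf[dictionary.doc2bow(words)])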
with open(e) as f:
    str = ""
    for line in f:
        str += clean(line)
    raw_corpus.append(str)

stoplist = set(stopwords.words('english')).union(set(stopwords.words('french')))
texts = [[word for word in document.split() if word not in stoplist]
         for document in raw_corpus]

# Count word frequencies
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Only keep words that appear more than once
processed_corpus = [[token for token in text if frequency[token] > 1]
                    for text in texts]

dictionary = corpora.Dictionary(processed_corpus)
dictionary.save('simul.dict')
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('simul.mm', corpus)

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)
corpus_lsi = lsi[corpus_tfidf]
lsi.save('model.lsi')
def __init__(self, data_list):
    data_list = self._check(data_list)
    self.dictionary = corpora.Dictionary(data_list)
    corpus = [self.dictionary.doc2bow(doc) for doc in data_list]
    self.tfidf = models.TfidfModel(corpus)  # build a tf-idf model over the documents
def create_tfidf_model(corpus, tfidf_file_dir='data/tfidf.tfidf_model'):
    tfidf = models.TfidfModel(corpus)
    tfidf.save(tfidf_file_dir)
    return tfidf
# In[12]:

sims = index[corpus[1]]
print(list(enumerate(sims)))
print(sims.argsort())

# Thus _document 15_ is the most similar document to _document 1_. As can easily be verified, both documents refer to the same topic (the crisis in Ukraine).

# ## TF-IDF representation

# So far the BoW representation of the documents has used the _term frequency (tf)_, which measures how often a term (word) appears in the document. If document similarity is calculated on such a tf-based BoW representation, common words that appear in many documents but carry little semantic focus have a strong impact on the similarity value. In most cases this is a drawback, since similarity should be based on terms with high semantic focus, and such semantically meaningful words usually appear in only a few documents. The _term frequency-inverse document frequency (tf-idf)_ measure therefore does not only count the frequency of a term in a document, but also weighs terms more strongly if they occur in only a few documents of the corpus.
#
# In _gensim_ the _tfidf_ model of a corpus can be calculated as follows:

# In[13]:

tfidf = models.TfidfModel(corpus)

# The _tf-idf_ representation of the first 3 documents in the corpus is:

# In[14]:

idx = 0
for d in corpus[:3]:
    print("-------------tf-idf BoW of document %d ---------------" % idx)
    print(tfidf[d])
    idx += 1

# In this representation the second element in the tuples is not the term frequency but the _tf-idf_ value. Note that the default configuration of [tf-idf in gensim](http://radimrehurek.com/gensim/models/tfidfmodel.html) calculates tf-idf values such that each document vector has a norm of _1_. A tfidf model without normalization is generated at the end of this notebook.
#
# Question: Find the maximum tf-idf value in these 3 documents. To which word does this maximum value belong? How often does this word occur in the document?
#
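# One possible (hedged) sketch of how the question above could be approached, assuming the gensim `dictionary` used to build `corpus` is still in scope; this cell is not part of the original notebook.

# In[ ]:

best_value, best_word, best_doc = 0.0, None, None
for doc_idx, doc in enumerate(corpus[:3]):
    for token_id, weight in tfidf[doc]:
        if weight > best_value:
            # remember the largest tf-idf weight and the word it belongs to
            best_value, best_word, best_doc = weight, dictionary[token_id], doc_idx
print("max tf-idf %.4f for word '%s' in document %s" % (best_value, best_word, best_doc))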
# read data from mongoDB
# Get Ideas
db = mongohq.Data_Utility(mongohq.fac_exp)
ideas = db.get_data('ideas')

#### tokenize ####
# get stopwords
stopWords = nlp.get_stopwords()
# get bag of words
data, expandedText = nlp.bag_of_words(ideas, stopWords)

# convert tokenized documents to a corpus of vectors
corpus = [dictionary.doc2bow(text) for text in expandedText]

# convert raw vectors to tfidf vectors
tfidf = models.TfidfModel(corpus)  # initialize model
corpus_tfidf = tfidf[corpus]  # apply tfidf model to whole corpus

# make lsa space
if len(data) > 300:
    dim = 300  # default is 300 dimensions
else:
    dim = len(data)  # otherwise fall back to the number of documents
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=dim)  # create the space

# output the matrix V so we can use it to get pairwise cosines
# https://github.com/piskvorky/gensim/wiki/Recipes-&-FAQ#q3-how-do-you-calculate-the-matrix-v-in-lsi-space
vMatrix = matutils.corpus2dense(lsi[corpus_tfidf], len(lsi.projection.s)).T / lsi.projection.s
# Data preprocessing
contents = read_file()
stopwords = load_stopwords()
seg = seg_file(contents[0], stopwords)

# Build the dictionary
dictionary = corpora.Dictionary(seg)
V = len(dictionary)
print(V)

# Document term-frequency (bag-of-words) vectors
text = [dictionary.doc2bow(text, allow_update=True) for text in seg]
# print(text[0])  # sparse vector

# Compute the tf-idf matrix
text_tfidf = models.TfidfModel(text)[text]

# Build the LDA model and show the first ten topics
lda = models.LdaModel(text_tfidf, id2word=dictionary, num_topics=200, iterations=100)

# Show topics
for k, v in lda.print_topics(num_topics=10):
    print(k, v)

# Topic distribution of all documents
doc_topic = lda.get_document_topics(text_tfidf)
print(len(doc_topic))
for dt in doc_topic:
    print(dt)
def main(): binsize = 1 # days on which pages are grouped on -- coverage of a news within the same bin of 1 day is taken as a single item smooth = 7 # smooth time-dependent results taking a moving average of 7 days, to avoid fluctuations # first of all transform HTML into plain text # to remove all the HTML tags # then use the clean() function of the Source class # to clean up rubbish (ie: navigation bars, advertisement) # the document is stored in data/[date]/[source tag]/frond_clean.txt # the document's clean text is in doc_set and its date and source are kept with # the same index in doc_id dirlist = os.listdir('data/') dirlist.sort() doc_set = [] doc_id = [] for s in utils.sources: source = utils.sources[s] doc = "" rebin_count = 0 for day in dirlist: output = 'data/%s/%s' % (day, s) os.system('lynx -dump -nolist %s/front.html > %s/front.txt' % (output, output)) source.clean('%s/front.txt' %output, '%s/front_clean.txt' % output) f = open("%s/front_clean.txt" % output) for i in f.readlines(): doc += i rebin_count += 1 if rebin_count % binsize == 0: doc_set.append(doc) doc_id.append('%s/%s' % (day, s)) doc = "" # now we need to split it into words # and remove the word ending, so that only the stem of the word remains # this removes differences between masculin and feminin and plural/singular # we also remove common articles, prepositions, etc, which happen too often and carry no # meaning in the "bad-of-words" approach tokenizer = RegexpTokenizer(r'\w+') # Create p_stemmer of class PorterStemmer p_stemmer = PorterStemmer() # stop words lang_stop = (stopwords.words('portuguese')) # get set of stop words for portuguese lang_stop.extend(['08', 'achaque', 'lico', 'r', '1', 'pra', 'bbc', 'globo', 'foto', 'agencia', 'photo', '01', '00', 'folha', 'folhapress']) texts = [] for doc in doc_set: raw = doc.lower() # to lower case tokens = tokenizer.tokenize(raw) # make word tokens and save it in a list stopped_tokens = [i for i in tokens if not i in lang_stop] # remove stop words # stem token text = [p_stemmer.stem(i) for i in stopped_tokens] texts.append(text) # texts keeps a list of list of words (same indexing as doc_set and doc_id) # now make a dictionary of words found # this assigns a unique integer to each word dictionary = corpora.Dictionary(texts) # we can use dictionary.token2id to get the list of word-id mapping # doc2bow counts how many times a word appears in the text and makes a list of counts of words # this is now closer to a vector interpretation of each document corpus = [dictionary.doc2bow(text) for text in texts] # we would now like to use the tf-idf transformation for each document representation # this weights more words that appear very often, but normalises it by the size of the document # to avoid biases to large documents # it also underweights terms that appears to often in many documents # this avoids the appearance of wors such as "say", which often appears in newspapers tfidf = models.TfidfModel(corpus, normalize = True) corpus_tfidf = tfidf[corpus] # apply the trained transformation to the corpus # now make the model, which can be LSI for an SVD transformation of the # term-document matrix # or LDA for a probabilistic model if useLDA: myModel = models.ldamodel.LdaModel(corpus_tfidf, num_topics=ntopics, id2word = dictionary, random_state=123) else: myModel = models.lsimodel.LsiModel(corpus_tfidf, num_topics=ntopics, id2word = dictionary) print "<!DOCTYPE html>" print "<html lang=\"en\"><head>" print """ <meta charset="utf-8"> <link rel="stylesheet" 
href="https://cdn.pydata.org/bokeh/release/bokeh-0.12.4.min.css" type="text/css" /> <script type="text/javascript" src="https://cdn.pydata.org/bokeh/release/bokeh-0.12.4.min.js"></script> <script type="text/javascript"> Bokeh.set_log_level("info"); </script> <style> body { margin: auto; text-align: left; text-weight: bold; font-size: 1.2em; } table, th, td { padding: 0.5em; text-align: center; } th { height: 2em; font-size: 1.4em; } th, td { border-bottom: 1px solid #ddd; } tr:hover { background-color: #f5f5f5; } table { padding-right: 1em; padding-left: 1em; border-collapse: collapse; width: 100%; } </style> """ print "<title>Results of text mining Brazilian newspapers front page</title></head><body><h3>Results of text mining Brazilian newspapers front page</h3>" # now print the topics that appear often topics = myModel.show_topics(num_topics=ntopics, num_words=nwords, formatted=False) for i in range(0, len(topics)): print "<table>" print "<tr><th colspan=\"2\">Words within topic '%d':</th></tr>" % i print "<tr><th>Contribution</th><th>Word</th></tr>" for v in topics[i][1]: print "<tr><td>%6.4f</td><td>%10s</td></tr>" % (v[1], utils.showWord(v[0])) print "</table>" # make a graph showing this topic connected to its words, with the length # of the edge being the weight of the word in that topic utils.save_fulltopic_graph([ myModel.show_topics(ntopics, num_words=nwords, formatted=False)[i] ], [i], "_only_%d.html" % i) # same as before, but put all topics and words in the same graph script, div = utils.save_fulltopic_graph(myModel.show_topics(ntopics, num_words =nwords, formatted=False), range(0, len(topics))) print "<h4>Graph showing words in each topic</h4>" print script print div # Try now projecting the document in the topics set # this tells us how much each topic contributes in a document print "Topics per document:" topic_per_doc = {} for did in range(0, len(texts)): print "<table>" date = doc_id[did].split('/')[0] dt = datetime.datetime(int(date[0:4]), int(date[4:6]), int(date[6:8])) print "<tr><th colspan=\"2\">Topics within document '%s' of '%s':</th></tr>" % (utils.showWord(doc_id[did].split('/')[-1]), dt.date()) print "<tr><th>Relevance</th><th>Topic</th></tr>" if useLDA: topics = myModel.get_document_topics(tfidf[dictionary.doc2bow(texts[did])]) else: topics = myModel[tfidf[dictionary.doc2bow(texts[did])]] for k,v in topics: print "<tr><td>%6.4f</td><td>%d</td></tr>" % (v, k) print "</table>" date = doc_id[did].split('/')[0] d = doc_id[did].split('/')[-1] if not date in topic_per_doc: topic_per_doc[date] = {} topic_per_doc[date][d] = topics # now make a graph of it # connecting the documents to topics # this is done for each document in a specific day #for date in topic_per_doc: # # for all documents in this date # script, div = utils.save_doctopic_graph(topic_per_doc[date], "topic_per_doc_%s.html" % date) # dt = datetime.datetime(int(date[0:4]), int(date[4:6]), int(date[6:8])) # print "<h4>Graph showing topics in each document at %s</h4>" % dt.date() # print script # print div # now do a similarity query if useLDA: corpus_projection = myModel.get_document_topics(corpus_tfidf) else: corpus_projection = myModel[corpus_tfidf] index = similarities.MatrixSimilarity(corpus_projection) # we can now use index[input], where input = myModel[tfidf[dictionary.doc2bow(newDocument.lower().split())]] # this compares a new document with what is in the corpus # we can compare the documents in the corpus with each other similar = {} for item in sim_query: similar[item] = [] if useLDA: result = 
myModel.get_document_topics(tfidf[ dictionary.doc2bow([p_stemmer.stem(i) for i in tokenizer.tokenize(item.lower()) if not i in lang_stop]) ]) else: result = myModel[ tfidf[ dictionary.doc2bow([p_stemmer.stem(i) for i in tokenizer.tokenize(item.lower()) if not i in lang_stop]) ] ] for did2, weight in list(enumerate( index[ result ] )): similar[item].append((weight, doc_id[did2])) for item in sim_query: print "<table>" print "<tr><th colspan=\"3\">Documents matching '%s':</th></tr>" % item print "<tr><th>Similarity (%)</th><th>Source</th><th>Date</th></tr>" for k,v in sorted(similar[item], key=lambda val: -val[0]): date = v.split('/')[0] dt = datetime.datetime(int(date[0:4]), int(date[4:6]), int(date[6:8])) print "<tr><td>%5.2f</td><td>%20s</td><td>%20s</td></tr>" % (k*100, utils.showWord(v.split('/')[-1]), dt.date()) print "</table>" print "</body></html>" utils.save_query_time(similar, ".html", smooth) utils.save_query_time_conditional(similar, ".html", smooth)
def lda(export_perplexity=False): np.set_printoptions(linewidth=300) data = pd.read_csv('QQ_chat_result.csv', header=0, encoding='utf-8') texts = [] for info in data['Info']: texts.append(info.split(' ')) M = len(texts) print('文档数目:%d个' % M) # pprint(texts) print('正在建立词典 --') dictionary = corpora.Dictionary(texts) V = len(dictionary) print('正在计算文本向量 --') corpus = [dictionary.doc2bow(text) for text in texts] print('正在计算文档TF-IDF --') t_start = time.time() corpus_tfidf = models.TfidfModel(corpus)[corpus] print('建立文档TF-IDF完成,用时%.3f秒' % (time.time() - t_start)) print('LDA模型拟合推断 --') num_topics = 20 t_start = time.time() lda = models.LdaModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary, alpha=0.001, eta=0.02, minimum_probability=0, update_every=1, chunksize=1000, passes=20) print('LDA模型完成,训练时间为\t%.3f秒' % (time.time() - t_start)) if export_perplexity: export_perplexity1(corpus_tfidf, dictionary, corpus) # export_perplexity2(corpus_tfidf, dictionary, corpus) # # 所有文档的主题 # doc_topic = [a for a in lda[corpus_tfidf]] # print 'Document-Topic:\n' # pprint(doc_topic) num_show_term = 7 # 每个主题显示几个词 print('每个主题的词分布:') for topic_id in range(num_topics): print('主题#%d:\t' % topic_id, end=' ') term_distribute_all = lda.get_topic_terms(topicid=topic_id) term_distribute = term_distribute_all[:num_show_term] term_distribute = np.array(term_distribute) term_id = term_distribute[:, 0].astype(np.int) for t in term_id: print(dictionary.id2token[t], end=' ') print('\n概率:\t', term_distribute[:, 1]) # 随机打印某10个文档的主题 np.set_printoptions(linewidth=200, suppress=True) num_show_topic = 10 # 每个文档显示前几个主题 print('10个用户的主题分布:') doc_topics = lda.get_document_topics(corpus_tfidf) # 所有文档的主题分布 idx = np.arange(M) np.random.shuffle(idx) idx = idx[:10] for i in idx: topic = np.array(doc_topics[i]) topic_distribute = np.array(topic[:, 1]) # print topic_distribute topic_idx = topic_distribute.argsort()[:-num_show_topic - 1:-1] print(('第%d个用户的前%d个主题:' % (i, num_show_topic)), topic_idx) print(topic_distribute[topic_idx]) # 显示着10个文档的主题 mpl.rcParams['font.sans-serif'] = ['SimHei'] mpl.rcParams['axes.unicode_minus'] = False plt.figure(figsize=(8, 7), facecolor='w') for i, k in enumerate(idx): ax = plt.subplot(5, 2, i + 1) topic = np.array(doc_topics[i]) topic_distribute = np.array(topic[:, 1]) ax.stem(topic_distribute, linefmt='g-', markerfmt='ro') ax.set_xlim(-1, num_topics + 1) ax.set_ylim(0, 1) ax.set_ylabel("概率") ax.set_title("用户 {}".format(k)) plt.grid(b=True, axis='both', ls=':', color='#606060') plt.xlabel("主题", fontsize=13) plt.suptitle('用户的主题分布', fontsize=15) plt.tight_layout(1, rect=(0, 0, 1, 0.95)) plt.show() # 计算各个主题的强度 print('\n各个主题的强度:\n') topic_all = np.zeros(num_topics) doc_topics = lda.get_document_topics(corpus_tfidf) # 所有文档的主题分布 for i in np.arange(M): # 遍历所有文档 topic = np.array(doc_topics[i]) topic_distribute = np.array(topic[:, 1]) topic_all += topic_distribute topic_all /= M # 平均 idx = topic_all.argsort() topic_sort = topic_all[idx] print(topic_sort) plt.figure(facecolor='w') plt.stem(topic_sort, linefmt='g-', markerfmt='ro') plt.xticks(np.arange(idx.size), idx) plt.xlabel("主题", fontsize=13) plt.ylabel("主题出现概率", fontsize=13) plt.title('主题强度', fontsize=15) plt.grid(b=True, axis='both', ls=':', color='#606060') plt.show()
# We already have a document for Arthur, but let's grab the text from someone else to compare it with.

# In[42]:

p = re.compile(r'(?:GALAHAD: )(.+)')
galahad = ' '.join(re.findall(p, document))
arthur_tokens = tokens
galahad_tokens = word_tokenize(galahad)

# Now, we use gensim to create vectors from these tokenized documents:

# In[43]:

dictionary = corpora.Dictionary([arthur_tokens, galahad_tokens])
corpus = [dictionary.doc2bow(doc) for doc in [arthur_tokens, galahad_tokens]]
tfidf = models.TfidfModel(corpus, id2word=dictionary)

# Then, we create matrix models of our corpus and query

# In[44]:

query = tfidf[dictionary.doc2bow(['peasant'])]
index = similarities.MatrixSimilarity(tfidf[corpus])

# And finally, we can test our query, "peasant", on the two documents in our corpus

# In[45]:

list(enumerate(index[query]))

# So we see here that "peasant" does not match Galahad very well (a really bad match would have a similarity close to zero), and is more similar to the kind of speech output that we see from King Arthur.
def tfidf_train():
    dictionary = corpora.Dictionary.load('../dictionary/new_dict_filter.dict')
    for index in range(0, 1):
        corpus = corpora.MmCorpus('../corpus_mm/corpus_{}.mm'.format(index))
        tfidf_model = models.TfidfModel(corpus=corpus, dictionary=dictionary)
        corpus_tfidf = np.array([tfidf_model[doc] for doc in corpus])
def get_tfidf(bow_corpus):
    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]
    return corpus_tfidf
def construct_lsi_sim_graph(corpus, args):
    """
    compute lsi vector similarity between paragraphs
    :param corpus:
    :param args:
    :return:
    """
    sim_graph = []
    raw_corpus = [' '.join(para) for para in corpus]

    # create English stop words list
    stoplist = set(stopwords.words('english'))
    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    # Lowercase each document, split it by white space and filter out stopwords
    texts = [[word for word in para.lower().split() if word not in stoplist]
             for para in raw_corpus]

    # Create a set of frequent words
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    # stem each word
    processed_corpus = [[p_stemmer.stem(token) for token in text] for text in texts]
    dictionary = corpora.Dictionary(processed_corpus)
    bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

    # train the tf-idf model and transform the whole bag-of-words corpus
    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]

    if args.find_opt_num:
        lsi = get_optimal_lsimodel_by_coherence_values(corpus=corpus_tfidf,
                                                       texts=processed_corpus,
                                                       dictionary=dictionary)
    else:
        # initialize an LSI transformation
        lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=args.num_topics)

    # create a double wrapper over the original corpus: bow -> tfidf -> fold-in-lsi
    corpus_lsi = lsi[corpus_tfidf]

    # for i, doc in enumerate(corpus_lsi):
    #     if len(doc) == 0:
    #         print("The lsi is empty: %s" % raw_corpus[i])

    index = similarities.MatrixSimilarity(corpus_lsi, num_features=len(dictionary))

    total = 0.
    count_large = 0.
    for i in range(len(corpus_lsi)):
        sim = index[corpus_lsi[i]]
        assert len(sim) == len(corpus_lsi), "the lsi sim is not correct!"
        sim_graph.append(sim)
        for s in sim:
            total += 1
            if s > args.sim_threshold:
                count_large += 1

    print("sim_graph[0]: %s" % str(sim_graph[0]))
    return sim_graph, count_large, total
# Build the dictionary and the vector corpus. The dictionary records how many documents
# each word appears in (useful later for computing idf) and assigns every word a unique id.
dictionary = corpora.Dictionary(corpora_documents)
print(dictionary.dfs)
# dictionary.save('dict.txt')  # save the generated dictionary
# dictionary = Dictionary.load('dict.txt')  # load it back

# The next line produces the sparse (bag-of-words) vector for every document in the corpus
corpus = [dictionary.doc2bow(text) for text in corpora_documents]
# each element of a vector is the count of one word in that document
print(corpus)
# corpora.MmCorpus.serialize('corpuse.mm', corpus)  # save the generated corpus
# corpus = corpora.MmCorpus('corpuse.mm')  # load it back

# corpus is an iterator over bow vectors. The code below computes the IDF value of
# every feature that appears in corpus
print('corpus: {}'.format(corpus))
tfidf_model = models.TfidfModel(corpus)
corpus_tfidf = tfidf_model[corpus]
# the output here is the normalized tf-idf values
print('corpus tfidf: {}'.format(corpus_tfidf))
# inspect the contents of the model
for item in corpus_tfidf:
    print(item)
# tfidf.save("data.tfidf")
# tfidf = models.TfidfModel.load("data.tfidf")
# print(tfidf_model.dfs)

similarity = similarities.Similarity(None, corpus_tfidf, num_features=600)
test_data_1 = '北京雾霾红色预警'
test_cut_raw_1 = list(jieba.cut(test_data_1))  # ['北京', '雾', '霾', '红色', '预警']
test_corpus_1 = dictionary.doc2bow(
def do_gensim(self): logging.info("Starting GENSIM code") documents = [] logging.info("INCOMMING TWEET CORPUS SIZE: " + str(len(self.social_data.posts))) for tweet in self.social_data.posts: #tweet = str(curr.fetchone()[0]) #print("doc:%s" %tweet.text) documents.append(' '.join( re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|(gt)", " ", tweet.text).split()).lower()) logging.info("CORPUS SIZE AFTER REGEX: " + str(len(documents))) #stoplist work s = "" for w in self.stopwords: s += w + " " stoplist = set(s.split()) #print("stoplist %s" % str(stoplist)) #logging.info("s:\n\t"+s) #logging.info("stopwords"+str([i for i in self.builder.get_object('TopicStopwords_Listbox').get(0,tk.END)])) #logging.info(stoplist) #tokenize texts = [[ word for word in document.lower().split() if word not in stoplist ] for document in documents] logging.info("CORPUS SIZE AFTER STOPLIST: " + str(len(texts))) #singles reduction all_tokens = sum(texts, []) logging.info("beginning tokenization") tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1) logging.info( "words tokenized, starting single mentioned word reduction") texts = [[word for word in text if word not in tokens_once] for text in texts] logging.info("words mentioned only once removed") #remove nulls texts = filter(None, texts) logging.info("CORPUS SIZE AFTER EMPTY ROWS REMOVED: " + str(len(texts))) dictionary = corpora.Dictionary(texts) #create corpus, tfidf, set up model corpus = [dictionary.doc2bow(text) for text in texts] tfidf = models.TfidfModel(corpus) #step 1. --initialize(train) a model corpus_tfidf = tfidf[corpus] # Apply TFIDF transform to entire corpus logging.info("starting LDA model") #run model model = models.ldamodel.LdaModel(corpus_tfidf, id2word=dictionary, alpha=self.num_alpha, num_topics=self.num_topics, update_every=self.num_update, passes=self.num_passes) return model
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1] for text in texts]

dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/deerwester4.dict')

## SENTENCE VECTORS
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/deerwester4.mm', corpus)  # store to disk, for later use

from gensim import corpora, models, similarities

tfidf = models.TfidfModel(corpus)  # step 1 -- initialize a model
corpus_tfidf = tfidf[corpus]
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
corpus_lsi = lsi[corpus_tfidf]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi

## COORDINATES OF THE TEXTS
todas = []
for doc in corpus_lsi:  # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
    todas.append(doc)

from gensim import corpora, models, similarities

dictionary = corpora.Dictionary.load('/tmp/deerwester4.dict')
corpus = corpora.MmCorpus('/tmp/deerwester4.mm'
         for document in documents]

frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

new_text = [[token for token in text if frequency[token] > 10] for text in texts]
dictionary = corpora.Dictionary(new_text)
dictionary.save("dict2.txt")

# Now process the test set
testing = "novel/鬼吹灯.txt"
testContent = open(testing, 'rb').read().decode("utf-8", 'ignore')
testWords = jieba.cut(testContent)
testList = ""
for word in testWords:
    testList += word + " "
test_bow = dictionary.doc2bow(testList.split())
train_bow = [dictionary.doc2bow(text) for text in texts]

# Build the model
tfidf = models.TfidfModel(train_bow)
featureNum = len(dictionary.token2id.keys())
index = similarities.SparseMatrixSimilarity(tfidf[train_bow], num_features=featureNum)
similary = index[tfidf[test_bow]]
string_tfidf = tfidf[test_bow]
print(similary)
if __name__ == '__main__':
    # Build the dictionary
    rawdata = get_rawdata()
    # print(rawdata)
    dictionary = corpora.Dictionary(rawdata)  # rawdata is a list of tokenized sentences (a list of token lists)
    # for i in dictionary:
    #     print(i, '------', dictionary[i])

    # Get the bag-of-words vectors
    docbow = [dictionary.doc2bow(text) for text in rawdata]
    # print(docbow)
    # for i in docbow:
    #     print(i)

    # tf-idf model
    tfidf_model = models.TfidfModel(docbow)  # the argument is the bag-of-words corpus
    tfidf = tfidf_model[docbow]
    # print(tfidf)
    for i in tfidf:
        print(i)

    # test
    # raw_documents = [
    #     '0无偿居间介绍买卖毒品的行为应如何定性',
    #     '1吸毒男动态持有大量毒品的行为该如何认定',
    #     '2如何区分是非法种植毒品原植物罪还是非法制造毒品罪',
    #     '3为毒贩贩卖毒品提供帮助构成贩卖毒品罪',
    #     '4将自己吸食的毒品原价转让给朋友吸食的行为该如何认定',
    #     '5为获报酬帮人购买毒品的行为该如何认定',
    #     '6毒贩出狱后再次够买毒品途中被抓的行为认定',
    #     '7虚夸毒品功效劝人吸食毒品的行为该如何认定',
from gensim import corpora, models, similarities

corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)],
          [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)],
          [(1, 1.0), (3, 1.0), (4, 1.0), (7, 1.0)],
          [(0, 1.0), (4, 2.0), (7, 1.0)],
          [(3, 1.0), (5, 1.0), (6, 1.0)],
          [(9, 1.0)],
          [(9, 1.0), (10, 1.0)],
          [(9, 1.0), (10, 1.0), (11, 1.0)],
          [(8, 1.0), (10, 1.0), (11, 1.0)]]

# In[67]:

tfidf = models.TfidfModel(corpus)
vec = [(0, 1), (4, 1)]
print(tfidf[vec])

index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=12)
sims = index[tfidf[vec]]
print(list(enumerate(sims)))

# ### Corpora and Vector Spaces

# In[70]:

from gensim import corpora

documents = ["Human machine interface for lab abc computer applications",
def get_work_experience_score(jobroles, user_experience): processd_corpus = [[word for word in document.lower().split()] for document in jobroles] dictionary = corpora.Dictionary(processd_corpus) feature_count = len(dictionary.token2id) bow_corpus = [dictionary.doc2bow(text) for text in processd_corpus] tfidf = models.TfidfModel(bow_corpus) index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=feature_count) master_count = 0 bachelor_count = 0 phd_count = 0 total_expr = 0 for work_expr in user_experience: if (len(work_expr) < 2): continue jobrole = work_expr[0].lower().split() job_duration = work_expr[1] query_bow = dictionary.doc2bow(jobrole) sims = index[tfidf[query_bow]] sorted_similarity = sorted(enumerate(sims), key=lambda x: x[1], reverse=True) if sorted_similarity[0][1] >= 0.70: job_duration = job_duration.strip() dates = job_duration.strip().split('-') dates = [date.strip() for date in dates] try: startdate = datetime.strptime(dates[0], '%m/%Y') except: try: startdate = datetime.strptime(dates[0], '/%Y') except: startdate = '' enddate = dates[1] if enddate != 'Present': try: enddate = datetime.strptime(enddate, '%m/%Y') except: try: enddate = datetime.strptime(enddate, '/%Y') except: enddate = '' else: enddate = datetime.now() if enddate == '' or startdate == '': total_expr = 0 else: duration = enddate - startdate total_expr = duration.days / 365 work_score = 0 if (total_expr > 0 and total_expr < 3): work_score = 1 elif total_expr >= 3 and total_expr < 5: work_score = 2 elif total_expr >= 5 and total_expr < 10: work_score = 3 elif total_expr > 10: work_score = 4 else: work_score = 0 return work_score / 4
def get_vectors_centroid(self, method='update', extra_weights=None, tfidf_weighted=True, weight_method='sqrt', tfidf_model=None, extra_epochs=10): """ Calculate centroid vectors for all documents Individual word vectors are weighted using tfidf (unless weighted=False). Args: -------- method: str Which method to use if not all words are present in trained model. 'update': word2vec model will be updated by additional training of the model. 'ignore': will ignore all 'words' not present in the pre-trained model. TODO 'substitute": will look to replace missing words with closest matches? extra_weights: list List of extra weights for add documents (and every word). Set to "False" if not used. tfidf_weighted: bool True, False weight_method: str Select method for how to weigh the extra_weights... 'sqrt' - weight word vectors by sqrt or extra_weights None tfidf_model: str Give filename if pre-defined tfidf model should be used. Otherwise set to None. extra_epochs: int Number of extra epochs to train IF method is 'update' and missing words are detected. """ # TODO maybe move the update section to the build_model function? # Check if everything is there: # 1) Check if model and bow-corpus are present if self.model_word2vec is None: print( "Word2vec model first needs to be load or made (self.build_model_word2vec)." ) if len(self.bow_corpus) == 0: print("BOW corpus has not been calculated yet (bow_corpus).") # 2) Check if all words are included in trained word2vec model dictionary = [self.dictionary[x] for x in self.dictionary] test_vocab = [] for i, word in enumerate(dictionary): if word not in self.model_word2vec.wv.vocab: test_vocab.append((i, word)) if len(test_vocab) > 0: print( "Not all 'words' of the given documents are present in the trained word2vec model!" ) print(len(test_vocab), " out of ", len(self.dictionary), " 'words' were not found in the word2vec model.") if method == 'update': print( "The word2vec model will hence be updated by additional training." ) self.model_word2vec.build_vocab(self.corpus, update=True) self.model_word2vec.train(self.corpus, total_examples=len(self.corpus), epochs=extra_epochs) self.model_word2vec.save('newmodel') elif method == 'ignore': print( "'Words'missing in the pretrained word2vec model will be ignored." ) _, missing_vocab = zip(*test_vocab) print("Removing missing 'words' from corpus...") # Update corpus and BOW-corpus self.corpus = [[ word for word in document if word not in missing_vocab ] for document in self.corpus] self.bow_corpus = [ self.dictionary.doc2bow(text) for text in self.corpus ] # TODO: add check with word intensities else: print( "Given method how do deal with missing words could not be found." ) else: print( "All 'words' of the given documents were found in the trained word2vec model." 
) if tfidf_weighted is True: if tfidf_model is not None: self.tfidf = models.TfidfModel.load(tfidf_model) print("Tfidf model found and loaded.") else: if self.tfidf is None: self.tfidf = models.TfidfModel(self.bow_corpus) print("No tfidf model found.") else: print("Using present tfidf model.") vector_size = self.model_word2vec.wv.vector_size vectors_centroid = [] for i in range(len(self.bow_corpus)): if (i + 1) % 10 == 0 or i == len( self.bow_corpus) - 1: # show progress print('\r', ' Calculated centroid vectors for ', i + 1, ' of ', len(self.bow_corpus), ' documents.', end="") document = [self.dictionary[x[0]] for x in self.bow_corpus[i]] if extra_weights is not None: document_weight = [ extra_weights[i][self.initial_documents[i].index( self.dictionary[x[0]])] for x in self.bow_corpus[i] ] document_weight = np.array(document_weight) / np.max( document_weight) # normalize if len(document_weight) == 0: print("Something might have gone wrong with: ", i) np.ones((len(document))) elif weight_method == 'sqrt': document_weight = np.sqrt( document_weight ) # idea: take sqrt to make huge intensity differences less severe elif weight_method is None: pass else: print("Unkown weight adding method.") else: document_weight = np.ones((len(document))) if len(document) > 0: term1 = self.model_word2vec.wv[document] if tfidf_weighted: term2 = np.array( list(zip(*self.tfidf[self.bow_corpus[i]]))[1]) else: term2 = np.ones((len(document))) term1 = term1 * np.tile(document_weight, (vector_size, 1)).T weighted_docvector = np.sum((term1.T * term2).T, axis=0) else: weighted_docvector = np.zeros( (self.model_word2vec.vector_size)) vectors_centroid.append(weighted_docvector) self.vectors_centroid = np.array(vectors_centroid)
def main(): """ Initiation docstring """ # Change to whatever you want to plot from subreddit = "depression" #read suicide-related keywords in csv #df = pd.read_csv(f"subreddits/{subreddit}/reddit_depression_submissions.csv", #sep=',', #encoding='latin-1') df = pd.concat(map(pd.read_csv, ['subreddits/depression/reddit_depression_submissions.csv', 'subreddits/foreveralone/reddit_foreveralone_submissions.csv', 'subreddits/offmychest/reddit_offmychest_submissions.csv', 'subreddits/singapore/reddit_singapore_submissions.csv', 'subreddits/suicidewatch/reddit_suicidewatch_submissions.csv'])) #print(df) ############################################################################## #####1. PLOTTING BAR CHART of overall sentiment analysis of submissions#### ############################################################################## fig, ax = plt.subplots(figsize=(8, 8)) counts = df.risk.value_counts(normalize=True) * 100 sns.barplot(x=counts.index, y=counts, ax=ax) ax.set_xticklabels(['Negative', 'Neutral', 'Positive']) plt.title("Sentiment Analysis on Reddit") ax.set_ylabel("Percentage") ax.set_xlabel("Sentiment Categories") #plt.show() ############################################################################## #####2. PLOTTING Negative keyword frequency#### ############################################################################## neg_lines = list(df[df.risk == -1].submission) data_text = df[['submission']] data_text['index'] = data_text.index documents = data_text tokenizer = RegexpTokenizer(r'\w+') stop_words = stopwords.words('english') customStopWords = ['iâ','one','want','anyone','today','itâ','suicidal','depressed','would','get','make','really','else','even', 'ever','know','think','day','much','going','feeling','person','died','everyone','dead','everything','feel','like', 'life','someone','always','still','way','sometimes','things','thoughts','something','every','back','years','killing','killed' 'keep'] stop_words.extend(customStopWords) neg_tokens = [] doc_clean = [] for line in neg_lines: toks = tokenizer.tokenize(line) toks = [t.lower() for t in toks if t.lower() not in stop_words] #toks = [ps.stem(t) for t in toks] neg_tokens.extend(toks) plt.style.use('ggplot') neg_freq = nltk.FreqDist(neg_tokens) neg_freq.most_common(20) #print(neg_freq.most_common(20)) y_val = [x[1] for x in neg_freq.most_common()] y_final = [] for i, k, z, t in zip(y_val[0::4], y_val[1::4], y_val[2::4], y_val[3::4]): y_final.append(math.log(i + k + z + t)) x_val = [math.log(i + 1) for i in range(len(y_final))] fig = plt.figure(figsize=(10,5)) plt.xlabel("Words (Log)") plt.ylabel("Frequency (Log)") plt.title("Negative Word Frequency Distribution on Reddit") plt.plot(x_val, y_final) #plt.show() ############################################################################## #####3. PLOTTING Negative keyword wordcloud#### ############################################################################## neg_words = ' '.join([text for text in neg_tokens]) wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(neg_words) plt.figure(figsize=(10, 7)) plt.imshow(wordcloud, interpolation="bilinear") plt.axis('off') #plt.show() ############################################################################## #####3. 
Topic Analysis#### ############################################################################## processed_docs = documents['submission'].map(preprocess) print(processed_docs[:10]) dictionary = gensim.corpora.Dictionary(processed_docs) count = 0 for k, v in dictionary.iteritems(): #print(k, v) count += 1 if count > 10: break dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000) bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs] #bow_corpus[4310] from gensim import corpora, models tfidf = models.TfidfModel(bow_corpus) corpus_tfidf = tfidf[bow_corpus] from pprint import pprint for doc in corpus_tfidf: pprint(doc) break lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2) for idx, topic in lda_model.print_topics(-1): print('Topic: {} \nWords: {}'.format(idx, topic)) topics = lda_model.show_topics(formatted=False) cols = [color for name, color in mcolors.TABLEAU_COLORS.items()] # more colors: 'mcolors.XKCD_COLORS' cloud = WordCloud(stopwords=stop_words, background_color='white', width=2500, height=1800, max_words=10, colormap='tab10', color_func=lambda *args, **kwargs: cols[i], prefer_horizontal=1.0) fig, axes = plt.subplots(2, 2, figsize=(10,10), sharex=True, sharey=True) for i, ax in enumerate(axes.flatten()): fig.add_subplot(ax) topic_words = dict(topics[i][1]) cloud.generate_from_frequencies(topic_words, max_font_size=300) plt.gca().imshow(cloud) plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16)) plt.gca().axis('off') plt.subplots_adjust(wspace=0, hspace=0) plt.axis('off') plt.margins(x=0, y=0) plt.tight_layout() plt.show()
# Bag-of-words vectors
bow = []
with open('data.txt', 'r', encoding='utf-8') as f:
    for line in f.readlines():
        lines = line.strip().split(' ')
        word_bow = dictionary.doc2bow(lines)
        bow.append(word_bow)
# for i in bow:
#     print(i)

# tf-idf vectors
# corpus = read_data(r'data.txt')
# for i in corpus:
#     print(i)
tfidf_model = models.TfidfModel(bow)
corpus_tfidf = tfidf_model[bow]
for item in corpus_tfidf:
    print(item)
# for i in tfidf_model:
#     print(i)
# for doc in corpus:
#     print(tfidf_model[doc])
# print(tfidf_model['宣亚国际 终止 收购 映客 股票 巨量 封死 跌停'])
# for i in tfidf_model:
#     print(i)
# print(type(tfidf_model))
# corpus_tfidf = [tfidf_model[doc] for doc in corpus]
# for i in corpus_tfidf:
#     print(i)
      ] for line in f]
# texts = [line.strip().split() for line in f]
print 'Finished reading the corpus, took %.3f seconds' % (time.time() - t_start)
f.close()
M = len(texts)
print 'Number of documents: %d' % M
# pprint(texts)

print 'Building the dictionary --'
dictionary = corpora.Dictionary(texts)
V = len(dictionary)

print 'Computing document vectors --'
corpus = [dictionary.doc2bow(text) for text in texts]

print 'Computing document TF-IDF --'
t_start = time.time()
corpus_tfidf = models.TfidfModel(corpus)[corpus]
print 'Document TF-IDF done, took %.3f seconds' % (time.time() - t_start)

print 'Fitting the LDA model --'
num_topics = 30
t_start = time.time()
lda = models.LdaModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary,
                      alpha=0.01, eta=0.01, minimum_probability=0.001,
                      update_every=1, chunksize=100, passes=1)
print 'LDA model done, training took\t%.3f seconds' % (time.time() - t_start)

# # Topic distribution of all documents
def gen_corpus(self, documents):
    texts = [[w for w in jieba.cut(doc) if len(w) > 1] for doc in documents]
    self.dictionary = corpora.Dictionary(texts)
    self.corpus = [self.dictionary.doc2bow(text) for text in texts]
    self.tfidf = models.TfidfModel(self.corpus)
from gensim import corpora, models, similarities

# enable logging to display what is happening
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# read dataset 20newsgroups
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
documents = dataset.data

texts = preprocess_data(documents)
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]  # bow = Bag Of Words
# pprint.pprint(bow_corpus[5])  # one example document, words mapped to ids

tfidf = models.TfidfModel(bow_corpus)  # train tf-idf model
corpus_tfidf = tfidf[bow_corpus]  # apply transformation on the whole corpus

## TODO: transform your tfidf model into a LSI Model
## using python gensim, use num_topics=200

## TODO: query! pick a random document and formulate a query based on the
## terms in the document.

## TODO: initialize a query structure for your LSI space

## TODO: perform the query on the LSI space, interpret the result and summarize your findings in the report
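## One possible sketch of the TODO steps above, left hedged on purpose: the
## query string below is purely illustrative, not part of the assignment, and
## any real query should be built from terms of a document in the corpus.

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=200)  # tf-idf -> LSI
corpus_lsi = lsi[corpus_tfidf]

query = "graphics card driver problem"                 # hypothetical query terms
query_bow = dictionary.doc2bow(query.lower().split())
query_lsi = lsi[tfidf[query_bow]]                      # fold the query into LSI space

index = similarities.MatrixSimilarity(corpus_lsi, num_features=200)  # query structure for the LSI space
sims = sorted(enumerate(index[query_lsi]), key=lambda item: -item[1])
print(sims[:10])                                       # ten most similar documents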
def __init__(self):
    self.__all_doc_list, self.__all_timestamp_list = load_1h_news()
    self.__dictionary = corpora.Dictionary(self.__all_doc_list)
    self.__corpus = [self.__dictionary.doc2bow(doc) for doc in self.__all_doc_list]
    self.__tfidf = models.TfidfModel(self.__corpus)
    print("tf-idf model has been built successfully")
def _generate_tfidf_model(self):
    print(' | Generating Tfidf model...')
    self.tfidf = models.TfidfModel(self.corpus)
    self.tfidf.save(self.tfidf_model_filename)
def op(s, q, kaishi): #这个函数做一次切分操作 nonlocal step # print('当前开始处理',step,'步奏') # 准备数据:现有8条文本数据,将8条文本数据放入到list中 documentsb = s documentsq = q shujukushuliang = len(documentsb) chaxunshuliang = len(documentsq) documentAll = documentsb + documentsq ## # 待比较的文档 # 获取停用词 stopwords = set() file = open("stopwords.txt", 'r', encoding='UTF-8') for line in file: stopwords.add(line.strip()) file.close() # 将分词、去停用词后的文本数据存储在list类型的texts中 documentsb__after_preprocess = [ ] #预处理之后的数据库记做documentsb__after_preprocess for line in documentAll: words = ' '.join(jieba.cut(line)).split(' ') # 利用jieba工具进行中文分词 text = [] # 过滤停用词,只保留不属于停用词的词语 for word in words: if word not in stopwords: text.append(word) documentsb__after_preprocess.append(text) ## # 待比较的文档也进行预处理(同上) documentsq__after_preprocess = [ ] #预处理之后的数据库记做documentsb__after_preprocess for line in documentsq: words = ' '.join(jieba.cut(line)).split(' ') # 利用jieba工具进行中文分词 text = [] # 过滤停用词,只保留不属于停用词的词语 for word in words: if word not in stopwords: text.append(word) documentsq__after_preprocess.append(text) ## # 2.计算词频 # print('2.计算词频') frequency = defaultdict(int) # 构建一个字典对象 # 遍历分词后的结果集,计算每个词出现的频率 for text in documentsb__after_preprocess: for word in text: frequency[word] += 1 # 选择频率大于1的词(根据实际需求确定) texts = [[word for word in text if frequency[word] > 1] for text in documentsb__after_preprocess] # for line in texts: # print(line) # 3.创建字典(单词与编号之间的映射) # print('3.创建字典(单词与编号之间的映射)') dictionary = corpora.Dictionary(texts) # print(dictionary) # 打印字典,key为单词,value为单词的编号 # print(dictionary.token2id) # 4.将待比较的文档转换为向量(词袋表示方法) # print('4.将待比较的文档转换为向量(词袋表示方法)') # 使用doc2bow方法对每个不同单词的词频进行了统计,并将单词转换为其编号,然后以稀疏向量的形式返回结果 new_vec = [ dictionary.doc2bow(text) for text in documentsq__after_preprocess ] #print(new_vec) #这个就是query 了!!!! ## # 5.建立语料库 # print('5.建立语料库') # 将每一篇文档转换为向量 corpus = [ dictionary.doc2bow(text) for text in documentsb__after_preprocess ] # print(corpus) # 6.初始化模型 # print('6.初始化模型') # 初始化一个tfidf模型,可以用它来转换向量(词袋整数计数),表示方法为新的表示方法(Tfidf 实数权重) tfidf = models.TfidfModel(corpus) # 将整个语料库转为tfidf表示方法 corpus_tfidf = tfidf[corpus] #这个就是比较的库 # 7.创建索引 # print('7.创建索引') # 使用上一步得到的带有tfidf值的语料库建立索引#如果库太小,下行会出现bug.这时候需要制定字典. index = similarities.MatrixSimilarity( corpus_tfidf, num_features=len( dictionary)) #这个是根据词频算内积,也就是看句子有多少个词汇一样,一样的越多,分数越高. # 8.相似度计算并返回相似度最大的文本 # print('# 8.相似度计算并返回相似度最大的文本') new_vec_tfidf = [tfidf[i] for i in new_vec] # 将待比较文档转换为tfidf表示方法 ## # 计算要比较的文档与语料库中每篇文档的相似度 sims = index[new_vec_tfidf] #删除numpy矩阵q数据里面的对角线数据,因为他们是自己跟自己比没有意义的. for i in range(chaxunshuliang): #q 里面0 对应 shujukushuliang sims[0 + i][i + shujukushuliang] = 0 ## import numpy as np tmp = np.argmax(sims, axis=1) tmp2 = np.max(sims, axis=1) end = time.time() # print("当前步奏使用时间",end-start) step += 1 tmp3 = set([kaishi + i for i in range(len(tmp2)) if tmp2[i] > yuzhi]) #tmp3是需要删除的文本 return tmp3