Example no. 1
from sklearn.decomposition import PCA
from gensim import corpora, models
import pickle
import numpy as np
tokens = pickle.load(open('tag_list.pkl', 'rb'))
dictionary = corpora.Dictionary(tokens)
texts = [dictionary.doc2bow(text) for text in tokens]
tfidf_model = models.TfidfModel(texts, normalize=False)
# dense document-term matrix; 1386 is presumably the dictionary's vocabulary size (hard-coded here)
tfidf = np.zeros([len(tokens), 1386], np.float32)
for i in range(len(tokens)):
    for term_id, weight in tfidf_model[texts[i]]:
        tfidf[i, term_id] = weight
np.save('tf_idf.npy', tfidf)
Example no. 2
# :ref:`core_concepts_corpus`.
#
# One simple example of a model is `tf-idf
# <https://en.wikipedia.org/wiki/Tf%E2%80%93idf>`_.  The tf-idf model
# transforms vectors from the bag-of-words representation to a vector space
# where the frequency counts are weighted according to the relative rarity of
# each word in the corpus.
#
# Here's a simple example. Let's initialize the tf-idf model, training it on
# our corpus and transforming the string "system minors":
#

from gensim import models

# train the model
tfidf = models.TfidfModel(bow_corpus)

# transform the "system minors" string
words = "system minors".lower().split()
print(tfidf[dictionary.doc2bow(words)])

###############################################################################
# The ``tfidf`` model again returns a list of tuples, where the first entry is
# the token ID and the second entry is the tf-idf weighting. Note that the ID
# corresponding to "system" (which occurred 4 times in the original corpus) has
# been weighted lower than the ID corresponding to "minors" (which only
# occurred twice).
#
# You can save trained models to disk and later load them back, either to
# continue training on new training documents or to transform new documents.
#
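# As a minimal sketch (the temporary file name below is only an illustrative
# choice, not part of the original example), saving and re-loading the trained
# model could look like this:
#

import os
import tempfile

model_path = os.path.join(tempfile.gettempdir(), "tfidf.model")

tfidf.save(model_path)                              # persist the trained model
loaded_tfidf = models.TfidfModel.load(model_path)   # load it back later
print(loaded_tfidf[dictionary.doc2bow(words)])      # same weights as before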
Example no. 3
    with open(e) as f:
        text = ""
        for line in f:
            text += clean(line)
        raw_corpus.append(text)

stoplist = set(stopwords.words('english')).union(set(stopwords.words('french')))
texts = [[word for word in document.split() if word not in stoplist]
    for document in raw_corpus]

# Count word frequencies
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Only keep words that appear more than once
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]

dictionary = corpora.Dictionary(processed_corpus)
dictionary.save('simul.dict')

corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('simul.mm', corpus)

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)
corpus_lsi = lsi[corpus_tfidf]
lsi.save('model.lsi')
Example no. 4
 def __init__(self, data_list):
     data_list = self._check(data_list)
     self.dictionary = corpora.Dictionary(data_list)
     corpus = [self.dictionary.doc2bow(doc) for doc in data_list]
     self.tfidf = models.TfidfModel(corpus)  # build a tf-idf model over the documents
Example no. 5
def create_tfidf_model(corpus, tfidf_file_dir='data/tfidf.tfidf_model'):
    tfidf = models.TfidfModel(corpus)
    tfidf.save(tfidf_file_dir)
    return tfidf
Example no. 6
# In[12]:

sims = index[corpus[1]]
print((list(enumerate(sims))))
print(sims.argsort())

# Thus _document 15_ is the most similar document to _document 1_. As can easily be verified, both documents refer to the same topic (the crisis in Ukraine).

# ## TF-IDF representation
# So far the BoW representation of the documents has used the _term frequency (tf)_, which measures how often a term (word) appears in a document. If document similarity is calculated on such a tf-based BoW representation, common words that appear in many documents but carry little semantic focus have a strong impact on the similarity value. In most cases this is a drawback, since similarity should be driven by terms with high semantic focus, and such semantically meaningful words usually appear in only a few documents. The _term frequency - inverse document frequency (tf-idf)_ measure therefore not only counts the frequency of a term in a document, but also gives more weight to terms that occur in only a few documents of the corpus.
#
# In _gensim_ the _tfidf_ - model of a corpus can be calculated as follows:

# In[13]:

tfidf = models.TfidfModel(corpus)

# The _tf-idf_ representations of the first 3 documents in the corpus are:

# In[14]:

for idx, d in enumerate(corpus[:3]):
    print("-------------tf-idf BoW of document %d ---------------" % idx)
    print(tfidf[d])

# In this representation the second element in the tuples is not the term frequency, but the _tfidf_ weight. Note that the default configuration of [tf-idf in gensim](http://radimrehurek.com/gensim/models/tfidfmodel.html) calculates tf-idf values such that each document vector has a norm of _1_. The tfidf-model without normalization is generated at the end of this notebook.
#
# Question: Find the maximum tf-idf value in these 3 documents. To which word does this maximum value belong? How often does this word occur in the document?
#
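# One possible way to check this programmatically (a sketch only; it assumes the
# dictionary used to build the corpus above is still available in the notebook):

# In[ ]:

for idx, d in enumerate(corpus[:3]):
    term_id, weight = max(tfidf[d], key=lambda t: t[1])
    print("document %d: max tf-idf %.4f for word '%s' (term frequency %d)"
          % (idx, weight, dictionary[term_id], dict(d)[term_id]))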
Example no. 7
    # read data from mongoDB
    # Get Ideas
    db = mongohq.Data_Utility(mongohq.fac_exp)
    ideas = db.get_data('ideas')

    #### tokenize ####
    # get stopwords
    stopWords = nlp.get_stopwords()
    # get bag of words
    data, expandedText = nlp.bag_of_words(ideas, stopWords)

    # convert tokenized documents to a corpus of vectors
    corpus = [dictionary.doc2bow(text) for text in expandedText]

    # convert raw vectors to tfidf vectors
    tfidf = models.TfidfModel(corpus)  #initialize model
    corpus_tfidf = tfidf[corpus]  #apply tfidf model to whole corpus

    # make lsa space
    if len(data) > 300:
        dim = 300  # default is 300 dimensions
    else:
        dim = len(data)  # otherwise use as many dimensions as there are documents
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary,
                          num_topics=dim)  #create the space

    # output the matrix V so we can use it to get pairwise cosines
    # https://github.com/piskvorky/gensim/wiki/Recipes-&-FAQ#q3-how-do-you-calculate-the-matrix-v-in-lsi-space
    vMatrix = matutils.corpus2dense(lsi[corpus_tfidf], len(
        lsi.projection.s)).T / lsi.projection.s
Example no. 8
    # data preprocessing
    contents = read_file()
    stopwords = load_stopwords()

    seg = seg_file(contents[0], stopwords)
    # build the dictionary
    dictionary = corpora.Dictionary(seg)
    V = len(dictionary)
    print(V)

    # build the document-term frequency matrix
    text = [dictionary.doc2bow(text, allow_update=True) for text in seg]
    #print(text[0])  # sparse vector

    # compute the tf-idf matrix
    text_tfidf = models.TfidfModel(text)[text]

    # build the LDA model and print the first ten topics
    lda = models.LdaModel(text_tfidf,
                          id2word=dictionary,
                          num_topics=200,
                          iterations=100)

    # show the topics
    for k, v in lda.print_topics(num_topics=10):
        print(k, v)
    # topics of all documents
    doc_topic = lda.get_document_topics(text_tfidf)
    print(len(doc_topic))
    for dt in doc_topic:
        print(dt)
Example no. 9
def main():
  binsize = 1 # days over which pages are grouped -- coverage of a news item within the same 1-day bin is treated as a single document
  smooth = 7 # smooth time-dependent results with a 7-day moving average, to avoid fluctuations

  # first of all transform HTML into plain text
  # to remove all the HTML tags
  # then use the clean() function of the Source class
  # to clean up rubbish (ie: navigation bars, advertisement)
  # the document is stored in data/[date]/[source tag]/front_clean.txt
  # the document's clean text is in doc_set and its date and source are kept with
  # the same index in doc_id
  dirlist = os.listdir('data/')
  dirlist.sort()

  doc_set = []
  doc_id = []
  for s in utils.sources:
    source = utils.sources[s]
    doc = ""
    rebin_count = 0
    for day in dirlist:
      output = 'data/%s/%s' % (day, s)
      os.system('lynx -dump -nolist %s/front.html > %s/front.txt' % (output, output))
      source.clean('%s/front.txt' %output, '%s/front_clean.txt' % output)
      f = open("%s/front_clean.txt" % output)
      for i in f.readlines(): doc += i
      rebin_count += 1
      if rebin_count % binsize == 0:
        doc_set.append(doc)
        doc_id.append('%s/%s' % (day, s))
        doc = ""

  # now we need to split it into words
  # and remove the word ending, so that only the stem of the word remains
  # this removes differences between masculine and feminine forms and between plural and singular
  # we also remove common articles, prepositions, etc, which occur too often and carry no
  # meaning in the "bag-of-words" approach
  tokenizer = RegexpTokenizer(r'\w+')
  
  # Create p_stemmer of class PorterStemmer
  p_stemmer = PorterStemmer()

  # stop words
  lang_stop = stopwords.words('portuguese') # get the list of Portuguese stop words
  lang_stop.extend(['08', 'achaque', 'lico', 'r', '1', 'pra', 'bbc', 'globo', 'foto', 'agencia', 'photo', '01', '00', 'folha', 'folhapress'])
  
  texts = []
  for doc in doc_set:
    raw = doc.lower() # to lower case
    tokens = tokenizer.tokenize(raw) # make word tokens and save it in a list
    stopped_tokens = [i for i in tokens if not i in lang_stop] # remove stop words
  
    # stem token
    text = [p_stemmer.stem(i) for i in stopped_tokens]
    texts.append(text) # texts keeps a list of list of words (same indexing as doc_set and doc_id)

  # now make a dictionary of words found
  # this assigns a unique integer to each word
  dictionary = corpora.Dictionary(texts)
  # we can use dictionary.token2id to get the list of word-id mapping
  # doc2bow counts how many times a word appears in the text and makes a list of counts of words
  # this is now closer to a vector interpretation of each document
  corpus = [dictionary.doc2bow(text) for text in texts]
  # we would now like to use the tf-idf transformation for each document representation
  # this gives more weight to words that appear often in a document, but normalises by the size
  # of the document to avoid a bias towards large documents
  # it also down-weights terms that appear too often across many documents
  # this avoids words such as "say", which often appear in newspapers, dominating the result
  tfidf = models.TfidfModel(corpus, normalize = True)
  corpus_tfidf = tfidf[corpus] # apply the trained transformation to the corpus

  # now make the model, which can be LSI for an SVD transformation of the 
  # term-document matrix
  # or LDA for a probabilistic model
  if useLDA:
    myModel = models.ldamodel.LdaModel(corpus_tfidf, num_topics=ntopics, id2word = dictionary, random_state=123)
  else:
    myModel = models.lsimodel.LsiModel(corpus_tfidf, num_topics=ntopics, id2word = dictionary)

  print "<!DOCTYPE html>"
  print "<html lang=\"en\"><head>"
  print """
<meta charset="utf-8">
<link rel="stylesheet" href="https://cdn.pydata.org/bokeh/release/bokeh-0.12.4.min.css" type="text/css" />
        
<script type="text/javascript" src="https://cdn.pydata.org/bokeh/release/bokeh-0.12.4.min.js"></script>
<script type="text/javascript">
    Bokeh.set_log_level("info");
</script>
        <style>
          body {
            margin: auto;
            text-align: left;
            text-weight: bold;
            font-size: 1.2em;
          }
          table, th, td {
            padding: 0.5em;
            text-align: center;
          }
          th {
            height: 2em;
            font-size: 1.4em;
          }
          th, td {
            border-bottom: 1px solid #ddd;
          }
          tr:hover {
            background-color: #f5f5f5;
          }
          table {
            padding-right: 1em;
            padding-left: 1em;
            border-collapse: collapse;
            width: 100%;
          }
        </style>
"""
  print "<title>Results of text mining Brazilian newspapers front page</title></head><body><h3>Results of text mining Brazilian newspapers front page</h3>"

  # now print the topics that appear often
  topics = myModel.show_topics(num_topics=ntopics, num_words=nwords, formatted=False)
  for i in range(0, len(topics)):
    print "<table>"
    print "<tr><th colspan=\"2\">Words within topic '%d':</th></tr>" % i
    print "<tr><th>Contribution</th><th>Word</th></tr>"
    for v in topics[i][1]:
      print "<tr><td>%6.4f</td><td>%10s</td></tr>" % (v[1], utils.showWord(v[0]))
    print "</table>"

    # make a graph showing this topic connected to its words, with the length
    # of the edge being the weight of the word in that topic
    utils.save_fulltopic_graph([ myModel.show_topics(ntopics, num_words=nwords, formatted=False)[i] ], [i], "_only_%d.html" % i)
  # same as before, but put all topics and words in the same graph
  script, div = utils.save_fulltopic_graph(myModel.show_topics(ntopics, num_words =nwords, formatted=False), range(0, len(topics)))
  print "<h4>Graph showing words in each topic</h4>"
  print script
  print div

  # Try now projecting the document in the topics set
  # this tells us how much each topic contributes in a document
  print "Topics per document:"
  topic_per_doc = {}
  for did in range(0, len(texts)):
    print "<table>"
    date = doc_id[did].split('/')[0]
    dt = datetime.datetime(int(date[0:4]), int(date[4:6]), int(date[6:8]))
    print "<tr><th colspan=\"2\">Topics within document '%s' of '%s':</th></tr>" % (utils.showWord(doc_id[did].split('/')[-1]), dt.date())
    print "<tr><th>Relevance</th><th>Topic</th></tr>"
    if useLDA:
      topics = myModel.get_document_topics(tfidf[dictionary.doc2bow(texts[did])])
    else:
      topics = myModel[tfidf[dictionary.doc2bow(texts[did])]]
    for k,v in topics:
      print "<tr><td>%6.4f</td><td>%d</td></tr>" % (v, k)
    print "</table>"
    date = doc_id[did].split('/')[0]
    d = doc_id[did].split('/')[-1]
    if not date in topic_per_doc:
      topic_per_doc[date] = {}
    topic_per_doc[date][d] = topics

  # now make a graph of it
  # connecting the documents to topics
  # this is done for each document in a specific day
  #for date in topic_per_doc:
  #  # for all documents in this date
  #  script, div = utils.save_doctopic_graph(topic_per_doc[date], "topic_per_doc_%s.html" % date)
  #  dt = datetime.datetime(int(date[0:4]), int(date[4:6]), int(date[6:8]))
  #  print "<h4>Graph showing topics in each document at %s</h4>" % dt.date()
  #  print script
  #  print div

  # now do a similarity query
  if useLDA:
    corpus_projection = myModel.get_document_topics(corpus_tfidf)
  else:
    corpus_projection = myModel[corpus_tfidf]
  index = similarities.MatrixSimilarity(corpus_projection)

  # we can now use index[input], where input = myModel[tfidf[dictionary.doc2bow(newDocument.lower().split())]]
  # this compares a new document with what is in the corpus
  # we can compare the documents in the corpus with each other
  similar = {}
  for item in sim_query:
    similar[item] = []
    if useLDA:
      result = myModel.get_document_topics(tfidf[ dictionary.doc2bow([p_stemmer.stem(i) for i in tokenizer.tokenize(item.lower()) if not i in lang_stop]) ])
    else:
      result = myModel[ tfidf[ dictionary.doc2bow([p_stemmer.stem(i) for i in tokenizer.tokenize(item.lower()) if not i in lang_stop]) ] ]
    for did2, weight in list(enumerate( index[ result ] )):
      similar[item].append((weight, doc_id[did2]))

  for item in sim_query:
    print "<table>"
    print "<tr><th colspan=\"3\">Documents matching '%s':</th></tr>" % item
    print "<tr><th>Similarity (%)</th><th>Source</th><th>Date</th></tr>"
    for k,v in sorted(similar[item], key=lambda val: -val[0]):
      date = v.split('/')[0]
      dt = datetime.datetime(int(date[0:4]), int(date[4:6]), int(date[6:8]))
      print "<tr><td>%5.2f</td><td>%20s</td><td>%20s</td></tr>" % (k*100, utils.showWord(v.split('/')[-1]), dt.date())
    print "</table>"

  print "</body></html>"
  utils.save_query_time(similar, ".html", smooth)
  utils.save_query_time_conditional(similar, ".html", smooth)
Example no. 10
def lda(export_perplexity=False):
    np.set_printoptions(linewidth=300)
    data = pd.read_csv('QQ_chat_result.csv', header=0, encoding='utf-8')
    texts = []
    for info in data['Info']:
        texts.append(info.split(' '))
    M = len(texts)
    print('Number of documents: %d' % M)
    # pprint(texts)

    print('Building dictionary --')
    dictionary = corpora.Dictionary(texts)
    V = len(dictionary)
    print('Computing document vectors --')
    corpus = [dictionary.doc2bow(text) for text in texts]
    print('Computing document TF-IDF --')
    t_start = time.time()
    corpus_tfidf = models.TfidfModel(corpus)[corpus]
    print('Document TF-IDF done, took %.3f seconds' % (time.time() - t_start))
    print('Fitting the LDA model --')
    num_topics = 20
    t_start = time.time()
    lda = models.LdaModel(corpus_tfidf,
                          num_topics=num_topics,
                          id2word=dictionary,
                          alpha=0.001,
                          eta=0.02,
                          minimum_probability=0,
                          update_every=1,
                          chunksize=1000,
                          passes=20)
    print('LDA model done, training time\t%.3f seconds' % (time.time() - t_start))
    if export_perplexity:
        export_perplexity1(corpus_tfidf, dictionary, corpus)
        # export_perplexity2(corpus_tfidf, dictionary, corpus)
    # # topics of all documents
    # doc_topic = [a for a in lda[corpus_tfidf]]
    # print 'Document-Topic:\n'
    # pprint(doc_topic)

    num_show_term = 7  # how many words to show per topic
    print('Word distribution of each topic:')
    for topic_id in range(num_topics):
        print('Topic #%d:\t' % topic_id, end=' ')
        term_distribute_all = lda.get_topic_terms(topicid=topic_id)
        term_distribute = term_distribute_all[:num_show_term]
        term_distribute = np.array(term_distribute)
        term_id = term_distribute[:, 0].astype(int)
        for t in term_id:
            print(dictionary.id2token[t], end=' ')
        print('\nProbabilities:\t', term_distribute[:, 1])

    # print the topics of 10 randomly chosen documents
    np.set_printoptions(linewidth=200, suppress=True)
    num_show_topic = 10  # how many top topics to show per document
    print('Topic distribution of 10 users:')
    doc_topics = lda.get_document_topics(corpus_tfidf)  # topic distribution of all documents
    idx = np.arange(M)
    np.random.shuffle(idx)
    idx = idx[:10]
    for i in idx:
        topic = np.array(doc_topics[i])
        topic_distribute = np.array(topic[:, 1])
        # print topic_distribute
        topic_idx = topic_distribute.argsort()[:-num_show_topic - 1:-1]
        print(('User %d, top %d topics:' % (i, num_show_topic)), topic_idx)
        print(topic_distribute[topic_idx])
    # plot the topic distribution of these 10 documents
    mpl.rcParams['font.sans-serif'] = ['SimHei']
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(8, 7), facecolor='w')
    for i, k in enumerate(idx):
        ax = plt.subplot(5, 2, i + 1)
        topic = np.array(doc_topics[k])  # use the sampled document index k, not the subplot index i
        topic_distribute = np.array(topic[:, 1])
        ax.stem(topic_distribute, linefmt='g-', markerfmt='ro')
        ax.set_xlim(-1, num_topics + 1)
        ax.set_ylim(0, 1)
        ax.set_ylabel("Probability")
        ax.set_title("User {}".format(k))
        plt.grid(b=True, axis='both', ls=':', color='#606060')
    plt.xlabel("Topic", fontsize=13)
    plt.suptitle('Topic distribution per user', fontsize=15)
    plt.tight_layout(1, rect=(0, 0, 1, 0.95))
    plt.show()

    # compute the strength of each topic
    print('\nStrength of each topic:\n')
    topic_all = np.zeros(num_topics)
    doc_topics = lda.get_document_topics(corpus_tfidf)  # topic distribution of all documents
    for i in np.arange(M):  # iterate over all documents
        topic = np.array(doc_topics[i])
        topic_distribute = np.array(topic[:, 1])
        topic_all += topic_distribute
    topic_all /= M  # average
    idx = topic_all.argsort()
    topic_sort = topic_all[idx]
    print(topic_sort)
    plt.figure(facecolor='w')
    plt.stem(topic_sort, linefmt='g-', markerfmt='ro')
    plt.xticks(np.arange(idx.size), idx)
    plt.xlabel("主题", fontsize=13)
    plt.ylabel("主题出现概率", fontsize=13)
    plt.title('主题强度', fontsize=15)
    plt.grid(b=True, axis='both', ls=':', color='#606060')
    plt.show()
Example no. 11
# We already have a document for Arthur, but let's grab the text from someone else to compare it with.

# In[42]:

p = re.compile(r'(?:GALAHAD: )(.+)')
galahad = ' '.join(re.findall(p, document))
arthur_tokens = tokens
galahad_tokens = word_tokenize(galahad)

# Now, we use gensim to create vectors from these tokenized documents:

# In[43]:

dictionary = corpora.Dictionary([arthur_tokens, galahad_tokens])
corpus = [dictionary.doc2bow(doc) for doc in [arthur_tokens, galahad_tokens]]
tfidf = models.TfidfModel(corpus, id2word=dictionary)

# Then, we create matrix models of our corpus and query

# In[44]:

query = tfidf[dictionary.doc2bow(['peasant'])]
index = similarities.MatrixSimilarity(tfidf[corpus])

# And finally, we can test our query, "peasant", on the two documents in our corpus

# In[45]:

list(enumerate(index[query]))

# So we see here that "peasant" does not match Galahad very well (a really bad match would have a negative value), and is more similar to the kind of speech that we see from King Arthur.
Example no. 12
def tfidf_train():
    dictionary = corpora.Dictionary.load('../dictionary/new_dict_filter.dict')
    for index in range(0, 1):
        corpus = corpora.MmCorpus('../corpus_mm/corpus_{}.mm'.format(index))
        tfidf_model = models.TfidfModel(corpus=corpus, dictionary=dictionary)
        corpus_tfidf = np.array([tfidf_model[doc] for doc in corpus])
Example no. 13
def get_tfidf(bow_corpus):
    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]
    return corpus_tfidf
Example no. 14
def construct_lsi_sim_graph(corpus, args):
    """
    compute lsi vector similarity between paragraphs
    :param corpus:
    :param args:
    :return:
    """
    sim_graph = []
    raw_corpus = [' '.join(para) for para in corpus]

    # create English stop words list
    stoplist = set(stopwords.words('english'))
    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    # Lowercase each document, split it by white space and filter out stopwords
    texts = [[word for word in para.lower().split() if word not in stoplist]
             for para in raw_corpus]
    # Count word frequencies
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    # stem each word
    processed_corpus = [[p_stemmer.stem(token) for token in text]
                        for text in texts]

    dictionary = corpora.Dictionary(processed_corpus)
    bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

    # train the model
    tfidf = models.TfidfModel(bow_corpus)
    # transform the "system minors" string
    corpus_tfidf = tfidf[bow_corpus]

    if args.find_opt_num:
        lsi = get_optimal_lsimodel_by_coherence_values(corpus=corpus_tfidf,
                                                       texts=processed_corpus,
                                                       dictionary=dictionary)
    else:
        lsi = models.LsiModel(
            corpus_tfidf, id2word=dictionary,
            num_topics=args.num_topics)  # initialize an LSI transformation

    corpus_lsi = lsi[
        corpus_tfidf]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi

    # for i, doc in enumerate(corpus_lsi):
    #     if len(doc) == 0:
    #         print("The lsi is empty: %s" % raw_corpus[i])

    index = similarities.MatrixSimilarity(corpus_lsi,
                                          num_features=len(dictionary))

    total = 0.
    count_large = 0.
    for i in range(len(corpus_lsi)):
        sim = index[corpus_lsi[i]]

        assert len(sim) == len(corpus_lsi), "the lsi sim is not correct!"
        sim_graph.append(sim)

        for s in sim:
            total += 1
            if s > args.sim_threshold:
                count_large += 1

    print("sim_graph[0]: %s" % str(sim_graph[0]))
    return sim_graph, count_large, total
Example no. 15
# Build the dictionary and the vector corpus. The dictionary records how many documents each word occurs in (needed later to compute idf) and assigns each word a unique id.
dictionary = corpora.Dictionary(corpora_documents)
print(dictionary.dfs)
# dictionary.save('dict.txt')  # save the generated dictionary
# dictionary=Dictionary.load('dict.txt')  # load it back

# The next line produces the sparse (bag-of-words) vector of every document in the corpus
corpus = [dictionary.doc2bow(text) for text in corpora_documents]
# each element of the vector is the number of times a word occurs in this document
print(corpus)
# corpora.MmCorpus.serialize('corpuse.mm',corpus)  # save the generated corpus
# corpus=corpora.MmCorpus('corpuse.mm')  # load it back

# corpus is an iterator over bow vectors. The code below computes the IDF value of every feature occurring in corpus
print('corpus: {}'.format(corpus))
tfidf_model = models.TfidfModel(corpus)
corpus_tfidf = tfidf_model[corpus]
# the output here contains the normalized tf-idf values
print('corpus tfidf: {}'.format(corpus_tfidf))

# inspect the contents of the model
for item in corpus_tfidf:
    print(item)
# tfidf.save("data.tfidf")
# tfidf = models.TfidfModel.load("data.tfidf")
# print(tfidf_model.dfs)

similarity = similarities.Similarity(None, corpus_tfidf, num_features=600)
test_data_1 = '北京雾霾红色预警'
test_cut_raw_1 = list(jieba.cut(test_data_1))  # ['北京', '雾', '霾', '红色', '预警']
test_corpus_1 = dictionary.doc2bow(test_cut_raw_1)
Example no. 16
    def do_gensim(self):
        logging.info("Starting GENSIM code")
        documents = []
        logging.info("INCOMMING TWEET CORPUS SIZE: " +
                     str(len(self.social_data.posts)))
        for tweet in self.social_data.posts:
            #tweet = str(curr.fetchone()[0])
            #print("doc:%s" %tweet.text)
            documents.append(' '.join(
                re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|(gt)",
                       " ", tweet.text).split()).lower())

        logging.info("CORPUS SIZE AFTER REGEX: " + str(len(documents)))

        #stoplist work
        s = ""
        for w in self.stopwords:
            s += w + " "
        stoplist = set(s.split())
        #print("stoplist %s" % str(stoplist))
        #logging.info("s:\n\t"+s)
        #logging.info("stopwords"+str([i for i in self.builder.get_object('TopicStopwords_Listbox').get(0,tk.END)]))
        #logging.info(stoplist)

        #tokenize
        texts = [[
            word for word in document.lower().split() if word not in stoplist
        ] for document in documents]
        logging.info("CORPUS SIZE AFTER STOPLIST: " + str(len(texts)))

        #singles reduction
        all_tokens = sum(texts, [])
        logging.info("beginning tokenization")
        tokens_once = set(word for word in set(all_tokens)
                          if all_tokens.count(word) == 1)
        logging.info(
            "words tokenized, starting single mentioned word reduction")
        texts = [[word for word in text if word not in tokens_once]
                 for text in texts]
        logging.info("words mentioned only once removed")
        #remove nulls
        texts = list(filter(None, texts))  # materialize so len() works on Python 3
        logging.info("CORPUS SIZE AFTER EMPTY ROWS REMOVED: " +
                     str(len(texts)))
        dictionary = corpora.Dictionary(texts)

        #create corpus, tfidf, set up model
        corpus = [dictionary.doc2bow(text) for text in texts]
        tfidf = models.TfidfModel(corpus)  #step 1. --initialize(train) a model
        corpus_tfidf = tfidf[corpus]  # Apply TFIDF transform to entire corpus
        logging.info("starting LDA model")

        #run model

        model = models.ldamodel.LdaModel(corpus_tfidf,
                                         id2word=dictionary,
                                         alpha=self.num_alpha,
                                         num_topics=self.num_topics,
                                         update_every=self.num_update,
                                         passes=self.num_passes)

        return model
Example no. 17
            for token in text:
                frequency[token] += 1

        texts = [[token for token in text if frequency[token] > 1]
                 for text in texts]

        dictionary = corpora.Dictionary(texts)
        dictionary.save('/tmp/deerwester4.dict')

        ## VECTORS OF THE SENTENCES
        corpus = [dictionary.doc2bow(text) for text in texts]
        corpora.MmCorpus.serialize('/tmp/deerwester4.mm',
                                   corpus)  # store to disk, for later use

        from gensim import corpora, models, similarities
        tfidf = models.TfidfModel(corpus)  # step 1 -- initialize a model

        corpus_tfidf = tfidf[corpus]
        lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
        corpus_lsi = lsi[
            corpus_tfidf]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi

        ## COORDINATES OF THE TEXTS
        todas = []
        for doc in corpus_lsi:  # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
            todas.append(doc)

        from gensim import corpora, models, similarities
        dictionary = corpora.Dictionary.load('/tmp/deerwester4.dict')
        corpus = corpora.MmCorpus(
            '/tmp/deerwester4.mm')
Example no. 18
         for document in documents]

frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
new_text = [[token for token in text if frequency[token] > 10]
            for text in texts]

dictionary = corpora.Dictionary(new_text)
dictionary.save("dict2.txt")

# now process the test set
testing = "novel/鬼吹灯.txt"
testContent = open(testing, 'rb').read().decode("utf-8", 'ignore')
testWords = jieba.cut(testContent)
testList = ""
for word in testWords:
    testList += word + " "
test_bow = dictionary.doc2bow(testList.split())
train_bow = [dictionary.doc2bow(text) for text in texts]

# build the tf-idf model
tfidf = models.TfidfModel(train_bow)

featureNum = len(dictionary.token2id.keys())
index = similarities.SparseMatrixSimilarity(tfidf[train_bow],
                                            num_features=featureNum)
similary = index[tfidf[test_bow]]
string_tfidf = tfidf[test_bow]
print(similary)
Example no. 19
if __name__ == '__main__':
    # build the dictionary
    rawdata = get_rawdata()
    # print(rawdata)
    dictionary = corpora.Dictionary(rawdata)  # rawdata is a list of lists: the tokenized sentences
    # for i in dictionary:
    #     print(i,'------',dictionary[i])

    # get the bag-of-words vectors
    docbow = [dictionary.doc2bow(text) for text in rawdata]
    # print(docbow)
    # for i in docbow:
    #     print(i)

    # tf-idf model
    tfidf_model = models.TfidfModel(docbow)  # the argument is the bag-of-words corpus
    tfidf = tfidf_model[docbow]
    # print(tfidf)
    for i in tfidf:
        print(i)

    #test
    # raw_documents = [
    #     '0无偿居间介绍买卖毒品的行为应如何定性',
    #     '1吸毒男动态持有大量毒品的行为该如何认定',
    #     '2如何区分是非法种植毒品原植物罪还是非法制造毒品罪',
    #     '3为毒贩贩卖毒品提供帮助构成贩卖毒品罪',
    #     '4将自己吸食的毒品原价转让给朋友吸食的行为该如何认定',
    #     '5为获报酬帮人购买毒品的行为该如何认定',
    #     '6毒贩出狱后再次够买毒品途中被抓的行为认定',
    #     '7虚夸毒品功效劝人吸食毒品的行为该如何认定',
Example no. 20
from gensim import corpora, models, similarities

corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)],
    [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)],
    [(1, 1.0), (3, 1.0), (4, 1.0), (7, 1.0)],
    [(0, 1.0), (4, 2.0), (7, 1.0)],
    [(3, 1.0), (5, 1.0), (6, 1.0)],
    [(9, 1.0)],
    [(9, 1.0), (10, 1.0)],
    [(9, 1.0), (10, 1.0), (11, 1.0)],
    [(8, 1.0), (10, 1.0), (11, 1.0)]]


# In[67]:

tfidf = models.TfidfModel(corpus)
vec = [(0, 1), (4, 1)]
print(tfidf[vec])

index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=12)
sims = index[tfidf[vec]]
print(list(enumerate(sims)))


# ### Corpora and Vector Spaces

# In[70]:

from gensim import corpora

documents = ["Human machine interface for lab abc computer applications",
Example no. 21
def get_work_experience_score(jobroles, user_experience):
    processd_corpus = [[word for word in document.lower().split()]
                       for document in jobroles]
    dictionary = corpora.Dictionary(processd_corpus)
    feature_count = len(dictionary.token2id)
    bow_corpus = [dictionary.doc2bow(text) for text in processd_corpus]
    tfidf = models.TfidfModel(bow_corpus)
    index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus],
                                                num_features=feature_count)
    master_count = 0
    bachelor_count = 0
    phd_count = 0
    total_expr = 0
    for work_expr in user_experience:
        if (len(work_expr) < 2):
            continue
        jobrole = work_expr[0].lower().split()
        job_duration = work_expr[1]
        query_bow = dictionary.doc2bow(jobrole)
        sims = index[tfidf[query_bow]]
        sorted_similarity = sorted(enumerate(sims),
                                   key=lambda x: x[1],
                                   reverse=True)
        if sorted_similarity[0][1] >= 0.70:
            job_duration = job_duration.strip()
            dates = job_duration.strip().split('-')
            dates = [date.strip() for date in dates]
            try:
                startdate = datetime.strptime(dates[0], '%m/%Y')
            except:
                try:
                    startdate = datetime.strptime(dates[0], '/%Y')
                except:
                    startdate = ''
            enddate = dates[1]
            if enddate != 'Present':
                try:
                    enddate = datetime.strptime(enddate, '%m/%Y')
                except:
                    try:
                        enddate = datetime.strptime(enddate, '/%Y')
                    except:
                        enddate = ''
            else:
                enddate = datetime.now()
            if enddate == '' or startdate == '':
                total_expr = 0
            else:
                duration = enddate - startdate
                total_expr = duration.days / 365
    work_score = 0
    if (total_expr > 0 and total_expr < 3):
        work_score = 1
    elif total_expr >= 3 and total_expr < 5:
        work_score = 2
    elif total_expr >= 5 and total_expr < 10:
        work_score = 3
    elif total_expr >= 10:
        work_score = 4
    else:
        work_score = 0
    return work_score / 4
Example no. 22
    def get_vectors_centroid(self,
                             method='update',
                             extra_weights=None,
                             tfidf_weighted=True,
                             weight_method='sqrt',
                             tfidf_model=None,
                             extra_epochs=10):
        """ Calculate centroid vectors for all documents
        
        Individual word vectors are weighted using tfidf (unless tfidf_weighted=False).
        
        Args:
        --------
        method: str
            Which method to use if not all words are present in trained model.
            'update': word2vec model will be updated by additional training of the model.
            'ignore': will ignore all 'words' not present in the pre-trained model.
            TODO 'substitute': will look to replace missing words with closest matches?
        extra_weights: list
            List of extra weights for all documents (and every word). Set to None if not used.
        tfidf_weighted: bool
            True, False
        weight_method: str
            Select method for how to weigh the extra_weights...
            'sqrt' - weight word vectors by the sqrt of the extra_weights
            None
        tfidf_model: str
            Give filename if pre-defined tfidf model should be used. Otherwise set to None.
        extra_epochs: int
            Number of extra epochs to train IF method is 'update' and missing words are detected.
        """
        # TODO  maybe move the update section to the build_model function?

        # Check if everything is there:
        # 1) Check if model and bow-corpus are present
        if self.model_word2vec is None:
            print(
                "Word2vec model first needs to be load or made (self.build_model_word2vec)."
            )
        if len(self.bow_corpus) == 0:
            print("BOW corpus has not been calculated yet (bow_corpus).")

        # 2) Check if all words are included in trained word2vec model
        dictionary = [self.dictionary[x] for x in self.dictionary]
        test_vocab = []
        for i, word in enumerate(dictionary):
            if word not in self.model_word2vec.wv.vocab:
                test_vocab.append((i, word))

        if len(test_vocab) > 0:
            print(
                "Not all 'words' of the given documents are present in the trained word2vec model!"
            )
            print(len(test_vocab), " out of ", len(self.dictionary),
                  " 'words' were not found in the word2vec model.")
            if method == 'update':
                print(
                    "The word2vec model will hence be updated by additional training."
                )
                self.model_word2vec.build_vocab(self.corpus, update=True)
                self.model_word2vec.train(self.corpus,
                                          total_examples=len(self.corpus),
                                          epochs=extra_epochs)
                self.model_word2vec.save('newmodel')

            elif method == 'ignore':
                print(
                    "'Words'missing in the pretrained word2vec model will be ignored."
                )

                _, missing_vocab = zip(*test_vocab)
                print("Removing missing 'words' from corpus...")
                # Update corpus and BOW-corpus
                self.corpus = [[
                    word for word in document if word not in missing_vocab
                ] for document in self.corpus]
                self.bow_corpus = [
                    self.dictionary.doc2bow(text) for text in self.corpus
                ]
                # TODO: add check with word intensities
            else:
                print(
                    "Given method how do deal with missing words could not be found."
                )
        else:
            print(
                "All 'words' of the given documents were found in the trained word2vec model."
            )

        if tfidf_weighted is True:
            if tfidf_model is not None:
                self.tfidf = models.TfidfModel.load(tfidf_model)
                print("Tfidf model found and loaded.")
            else:
                if self.tfidf is None:
                    self.tfidf = models.TfidfModel(self.bow_corpus)
                    print("No tfidf model found.")
                else:
                    print("Using present tfidf model.")

        vector_size = self.model_word2vec.wv.vector_size
        vectors_centroid = []

        for i in range(len(self.bow_corpus)):
            if (i + 1) % 10 == 0 or i == len(
                    self.bow_corpus) - 1:  # show progress
                print('\r',
                      ' Calculated centroid vectors for ',
                      i + 1,
                      ' of ',
                      len(self.bow_corpus),
                      ' documents.',
                      end="")

            document = [self.dictionary[x[0]] for x in self.bow_corpus[i]]
            if extra_weights is not None:
                document_weight = [
                    extra_weights[i][self.initial_documents[i].index(
                        self.dictionary[x[0]])] for x in self.bow_corpus[i]
                ]
                document_weight = np.array(document_weight) / np.max(
                    document_weight)  # normalize
                if len(document_weight) == 0:
                    print("Something might have gone wrong with: ", i)
                    document_weight = np.ones((len(document)))  # fall back to uniform weights
                elif weight_method == 'sqrt':
                    document_weight = np.sqrt(
                        document_weight
                    )  # idea: take sqrt to make huge intensity differences less severe
                elif weight_method is None:
                    pass
                else:
                    print("Unkown weight adding method.")
            else:
                document_weight = np.ones((len(document)))
            if len(document) > 0:
                term1 = self.model_word2vec.wv[document]
                if tfidf_weighted:
                    term2 = np.array(
                        list(zip(*self.tfidf[self.bow_corpus[i]]))[1])
                else:
                    term2 = np.ones((len(document)))

                term1 = term1 * np.tile(document_weight, (vector_size, 1)).T
                weighted_docvector = np.sum((term1.T * term2).T, axis=0)
            else:
                weighted_docvector = np.zeros(
                    (self.model_word2vec.vector_size))
            vectors_centroid.append(weighted_docvector)

        self.vectors_centroid = np.array(vectors_centroid)
Example no. 23
def main():
    """
    Initiation docstring
    """
	
    # Change to whatever you want to plot from
    subreddit = "depression"

	#read suicide-related keywords in csv
    #df = pd.read_csv(f"subreddits/{subreddit}/reddit_depression_submissions.csv", 
			  #sep=',',
			  #encoding='latin-1')
    df = pd.concat(map(pd.read_csv, ['subreddits/depression/reddit_depression_submissions.csv', 
         'subreddits/foreveralone/reddit_foreveralone_submissions.csv',
		 'subreddits/offmychest/reddit_offmychest_submissions.csv',
		 'subreddits/singapore/reddit_singapore_submissions.csv',
		 'subreddits/suicidewatch/reddit_suicidewatch_submissions.csv']))
    #print(df)
	
	##############################################################################
    #####1. PLOTTING BAR CHART of overall sentiment analysis of submissions####
    ##############################################################################

    fig, ax = plt.subplots(figsize=(8, 8))

    counts = df.risk.value_counts(normalize=True) * 100

    sns.barplot(x=counts.index, y=counts, ax=ax)

    ax.set_xticklabels(['Negative', 'Neutral', 'Positive'])
    plt.title("Sentiment Analysis on Reddit")
    ax.set_ylabel("Percentage")
    ax.set_xlabel("Sentiment Categories")	
    #plt.show()
	##############################################################################
    #####2. PLOTTING Negative keyword frequency####
    ##############################################################################
    neg_lines = list(df[df.risk == -1].submission)
    data_text = df[['submission']]
    data_text['index'] = data_text.index
    documents = data_text
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = stopwords.words('english')
    customStopWords = ['iâ','one','want','anyone','today','itâ','suicidal','depressed','would','get','make','really','else','even',
       'ever','know','think','day','much','going','feeling','person','died','everyone','dead','everything','feel','like',
       'life','someone','always','still','way','sometimes','things','thoughts','something','every','back','years','killing','killed',
       'keep']
    stop_words.extend(customStopWords)
    neg_tokens = []
    doc_clean = []
	
    for line in neg_lines:
        toks = tokenizer.tokenize(line)
        toks = [t.lower() for t in toks if t.lower() not in stop_words]
        #toks = [ps.stem(t) for t in toks]
        neg_tokens.extend(toks)
    
    plt.style.use('ggplot')
	
    neg_freq = nltk.FreqDist(neg_tokens)
    neg_freq.most_common(20)
    #print(neg_freq.most_common(20))
    y_val = [x[1] for x in neg_freq.most_common()]
    y_final = []
    for i, k, z, t in zip(y_val[0::4], y_val[1::4], y_val[2::4], y_val[3::4]):
        y_final.append(math.log(i + k + z + t))

    x_val = [math.log(i + 1) for i in range(len(y_final))]
    fig = plt.figure(figsize=(10,5))
    
    plt.xlabel("Words (Log)")
    plt.ylabel("Frequency (Log)")
    plt.title("Negative Word Frequency Distribution on Reddit")
    plt.plot(x_val, y_final)
    #plt.show()
	##############################################################################
    #####3. PLOTTING Negative keyword wordcloud####
    ##############################################################################
    neg_words = ' '.join([text for text in neg_tokens])
    wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(neg_words)
    plt.figure(figsize=(10, 7))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis('off')
    #plt.show()
	##############################################################################
    #####3. Topic Analysis####
    ##############################################################################
    processed_docs = documents['submission'].map(preprocess)
    print(processed_docs[:10])
    dictionary = gensim.corpora.Dictionary(processed_docs)
    count = 0
    for k, v in dictionary.iteritems():
        #print(k, v)
        count += 1
        if count > 10:
            break
    dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    #bow_corpus[4310]
    from gensim import corpora, models
    tfidf = models.TfidfModel(bow_corpus)
    corpus_tfidf = tfidf[bow_corpus]
    from pprint import pprint
    for doc in corpus_tfidf:
        pprint(doc)
        break
    lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)
    for idx, topic in lda_model.print_topics(-1):
        print('Topic: {} \nWords: {}'.format(idx, topic))
    topics = lda_model.show_topics(formatted=False)
	
    cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

    cloud = WordCloud(stopwords=stop_words,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)
	
    fig, axes = plt.subplots(2, 2, figsize=(10,10), sharex=True, sharey=True)

    for i, ax in enumerate(axes.flatten()):
        fig.add_subplot(ax)
        topic_words = dict(topics[i][1])
        cloud.generate_from_frequencies(topic_words, max_font_size=300)
        plt.gca().imshow(cloud)
        plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
        plt.gca().axis('off')


    plt.subplots_adjust(wspace=0, hspace=0)
    plt.axis('off')
    plt.margins(x=0, y=0)
    plt.tight_layout()
    plt.show()
Example no. 24
# bag-of-words vectors of the documents
    bow = []
    with open('data.txt', 'r', encoding='utf-8') as f:
        for line in f.readlines():
            lines = line.strip().split(' ')
            word_bow = dictionary.doc2bow(lines)
            bow.append(word_bow)
    # for i in bow:
    #     print(i)

    # tf-idf vectors
    # corpus = read_data(r'data.txt')
    # for i in corpus:
    #     print(i)
    tfidf_model = models.TfidfModel(bow)
    corpus_tfidf = tfidf_model[bow]
    for item in corpus_tfidf:
        print(item)
    # for i in tfidf_model:
    #     print(i)
    # for doc in corpus:
    #     print(tfidf_model[doc])

    # print(tfidf_model['宣亚国际 终止 收购 映客  股票 巨量 封死 跌停'])
    # for i in tfidf_model:
    #     print(i)
    # print(type(tfidf_model))
    # corpus_tfidf = [tfidf_model[doc] for doc in corpus]
    # for i in corpus_tfidf:
    #     print(i)
Example no. 25
    ] for line in f]
    # texts = [line.strip().split() for line in f]
    print 'Corpus data loaded, took %.3f seconds' % (time.time() - t_start)
    f.close()
    M = len(texts)
    print 'Number of documents: %d' % M
    # pprint(texts)

    print 'Building dictionary --'
    dictionary = corpora.Dictionary(texts)
    V = len(dictionary)
    print 'Computing document vectors --'
    corpus = [dictionary.doc2bow(text) for text in texts]
    print 'Computing document TF-IDF --'
    t_start = time.time()
    corpus_tfidf = models.TfidfModel(corpus)[corpus]
    print 'Document TF-IDF done, took %.3f seconds' % (time.time() - t_start)
    print 'Fitting the LDA model --'
    num_topics = 30
    t_start = time.time()
    lda = models.LdaModel(corpus_tfidf,
                          num_topics=num_topics,
                          id2word=dictionary,
                          alpha=0.01,
                          eta=0.01,
                          minimum_probability=0.001,
                          update_every=1,
                          chunksize=100,
                          passes=1)
    print 'LDA model done, training time\t%.3f seconds' % (time.time() - t_start)
    # # topics of all documents
Example no. 26
 def gen_corpus(self, documents):
     texts = [[w for w in jieba.cut(doc) if len(w) > 1]
              for doc in documents]
     self.dictionary = corpora.Dictionary(texts)
     self.corpus = [self.dictionary.doc2bow(text) for text in texts]
     self.tfidf = models.TfidfModel(self.corpus)
Example no. 27
from gensim import corpora, models, similarities

# enable logging to display what is happening
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# read dataset 20newsgroups
dataset = fetch_20newsgroups(shuffle=True,
                             random_state=1,
                             remove=('headers', 'footers', 'quotes'))
documents = dataset.data

texts = preprocess_data(documents)
dictionary = corpora.Dictionary(texts)

bow_corpus = [dictionary.doc2bow(text) for text in texts]  # bow = Bag Of Words
# pprint.pprint(bow_corpus[5]) # one example document, words mapped to ids

tfidf = models.TfidfModel(bow_corpus)  # train tf-idf model
corpus_tfidf = tfidf[bow_corpus]  # apply transformation on the whole corpus

##  TODO: transform your tfidf model into a LSI Model
##  using python gensim, use num_topics=200

## TODO: query! pick a random document and formulate a query based on the
## terms in the document.

## TODO: initialize a query structure for your LSI space

## TODO: perform the query on the LSI space, interpret the result and summarize your findings in the report
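
## A minimal sketch of one way to tackle the TODOs above (an illustration only,
## not the expected solution; the choice of query document and number of query
## terms is arbitrary):
import random

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=200)  # tf-idf -> LSI
corpus_lsi = lsi[corpus_tfidf]

doc_id = random.randrange(len(texts))            # pick a random document
query_terms = texts[doc_id][:10]                 # take some of its terms as the query
query_lsi = lsi[tfidf[dictionary.doc2bow(query_terms)]]

index = similarities.MatrixSimilarity(corpus_lsi, num_features=200)  # query structure over the LSI space
sims = sorted(enumerate(index[query_lsi]), key=lambda x: -x[1])
print(sims[:5])                                  # the five most similar documents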
Example no. 28
 def __init__(self):
     self.__all_doc_list, self.__all_timestamp_list = load_1h_news()
     self.__dictionary = corpora.Dictionary(self.__all_doc_list)
     self.__corpus = [self.__dictionary.doc2bow(doc) for doc in self.__all_doc_list]
     self.__tfidf = models.TfidfModel(self.__corpus)
     print("tf-idf model has beens build successfully")
Example no. 29
 def _generate_tfidf_model(self):
     print('   | Generating Tfidf model...')
     self.tfidf = models.TfidfModel(self.corpus)
     self.tfidf.save(self.tfidf_model_filename)
Example no. 30
    def op(s, q, kaishi):  # this function performs one split/comparison pass
        nonlocal step
        # print('now processing step', step)

        # prepare the data: put the (8) text documents into a list
        documentsb = s

        documentsq = q

        shujukushuliang = len(documentsb)
        chaxunshuliang = len(documentsq)
        documentAll = documentsb + documentsq

        ##
        # the documents to be compared

        # load the stopwords
        stopwords = set()
        file = open("stopwords.txt", 'r', encoding='UTF-8')
        for line in file:
            stopwords.add(line.strip())
        file.close()

        # store the tokenized, stopword-filtered documents in a list
        documentsb__after_preprocess = [
        ]  # the preprocessed database is stored in documentsb__after_preprocess
        for line in documentAll:
            words = ' '.join(jieba.cut(line)).split(' ')  # Chinese word segmentation with jieba
            text = []
            # filter stopwords, keeping only tokens that are not stopwords
            for word in words:
                if word not in stopwords:
                    text.append(word)
            documentsb__after_preprocess.append(text)
        ##

        # preprocess the query documents in the same way
        documentsq__after_preprocess = [
        ]  # the preprocessed queries are stored in documentsq__after_preprocess
        for line in documentsq:
            words = ' '.join(jieba.cut(line)).split(' ')  # Chinese word segmentation with jieba
            text = []
            # filter stopwords, keeping only tokens that are not stopwords
            for word in words:
                if word not in stopwords:
                    text.append(word)
            documentsq__after_preprocess.append(text)

        ##
        # 2. count word frequencies
        # print('2. counting word frequencies')
        frequency = defaultdict(int)  # build a dict object
        # iterate over the tokenized documents and count how often each word occurs
        for text in documentsb__after_preprocess:
            for word in text:
                frequency[word] += 1
        # keep only words with frequency greater than 1 (adjust to actual needs)
        texts = [[word for word in text if frequency[word] > 1]
                 for text in documentsb__after_preprocess]

        # for line in texts:
        #     print(line)

        # 3. create the dictionary (a mapping between words and ids)
        # print('3. creating the dictionary (mapping between words and ids)')
        dictionary = corpora.Dictionary(texts)
        # print(dictionary)
        # print the dictionary: keys are words, values are word ids
        # print(dictionary.token2id)

        # 4. convert the query documents to vectors (bag-of-words representation)
        # print('4. converting the query documents to vectors (bag-of-words representation)')
        # doc2bow counts the frequency of each distinct word, maps every word to its id and returns the result as a sparse vector
        new_vec = [
            dictionary.doc2bow(text) for text in documentsq__after_preprocess
        ]
        # print(new_vec)  # this is the query
        ##
        # 5. build the corpus
        # print('5. building the corpus')
        # convert every document to a vector
        corpus = [
            dictionary.doc2bow(text) for text in documentsb__after_preprocess
        ]
        # print(corpus)

        # 6. initialize the model
        # print('6. initializing the model')
        # initialize a tf-idf model; it converts vectors from the bag-of-words (integer counts) representation to the tf-idf (real-valued weights) representation
        tfidf = models.TfidfModel(corpus)
        # convert the whole corpus to the tf-idf representation
        corpus_tfidf = tfidf[corpus]  # this is the database we compare against
        # 7. build the index
        # print('7. building the index')
        # build the index from the tf-idf corpus obtained above; if the corpus is very small this can fail unless the dictionary size is passed explicitly
        index = similarities.MatrixSimilarity(
            corpus_tfidf, num_features=len(
                dictionary))  # inner products over the weighted term counts: the more words two sentences share, the higher the score

        # 8. compute the similarities and return the most similar documents
        # print('# 8. computing similarities and returning the most similar documents')
        new_vec_tfidf = [tfidf[i] for i in new_vec]  # convert the query documents to the tf-idf representation
        ##
        # compute the similarity between each query document and every document in the corpus
        sims = index[new_vec_tfidf]

        # zero out the "diagonal" entries of the similarity matrix: comparing a document with itself is meaningless

        for i in range(chaxunshuliang):  # index 0 in q corresponds to index shujukushuliang in the combined corpus
            sims[0 + i][i + shujukushuliang] = 0

        ##

        import numpy as np
        tmp = np.argmax(sims, axis=1)
        tmp2 = np.max(sims, axis=1)

        end = time.time()
        # print("当前步奏使用时间",end-start)
        step += 1

        tmp3 = set([kaishi + i for i in range(len(tmp2))
                    if tmp2[i] > yuzhi])  # tmp3 is the set of documents to remove

        return tmp3