Example #1
def gensim_similarity(data_c):
    """
    使用Gensim包计算相似度:
        词频
            COUNT
            LDA
            LSI
        Tfidf:
            TFIDF
            LDA
            LSI
    """
    # Merge the two columns to get the full bag of words
    data_c['s1'] = data_c['s1'].apply(lambda text: list(text))
    data_c['s2'] = data_c['s2'].apply(lambda text: list(text))
    data_c_all = data_c['s1'].append(data_c['s2'], ignore_index=True).to_frame(name='s')

    # Build the dictionary
    print("starting create dic....")
    dic = corpora.Dictionary(data_c['s1'].values)
    dic.add_documents(data_c['s2'].values)

    print("文档数:", dic.num_docs)
    print("starting create count bow...")
    data_c['s1'] = data_c['s1'].apply(lambda text: dic.doc2bow(text))
    data_c['s2'] = data_c['s2'].apply(lambda text: dic.doc2bow(text))
    data_c_all['s'] = data_c_all['s'].apply(lambda text: dic.doc2bow(text))

    # cps1 = [dic.doc2bow(text) for text in list(data_c['s1'].values)]
    # cps2 = [dic.doc2bow(text) for text in list(data_c['s2'].values)]

    cps1 = list(data_c['s1'])
    cps2 = list(data_c['s2'])
    cps = list(data_c_all['s'])

    # Compute the s1/s2 term-frequency (count) similarity
    print("starting count similarity....")
    sm = similarities.SparseMatrixSimilarity(corpus=cps1, num_features=10000)
    count_sm = np.diag(sm[cps2])

    # Compute the s1/s2 count-based LDA and LSI similarity
    count_lda_sm = lda_similarity(cps, cps1, cps2, dic)
    # count_lsi_sm= lsi_similarity(cps,cps1,cps2,dic)

    # Compute the s1/s2 tfidf similarity
    print("starting tfidf similarity....")
    tfidf = TfidfModel(corpus=cps, id2word=dic)
    cps1_tfidf = tfidf[cps1]
    cps2_tfidf = tfidf[cps2]
    cps_tfidf = tfidf[cps]

    # Compute the s1/s2 TFIDF similarity
    sm = similarities.SparseMatrixSimilarity(corpus=cps1_tfidf, num_features=10000)
    tfidf_sm = np.diag(sm[cps2_tfidf])

    # Compute the s1/s2 TF-IDF-based LDA and LSI similarity
    tfidf_lda_sm = lda_similarity(cps_tfidf, cps1_tfidf, cps2_tfidf, dic)
    tfidf_lsi_sm = lsi_similarity(cps_tfidf, cps1_tfidf, cps2_tfidf, dic)

    return count_sm, count_lda_sm, tfidf_sm, tfidf_lda_sm, tfidf_lsi_sm
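The lda_similarity and lsi_similarity helpers called above are not shown in this example. Below is a minimal sketch of what lda_similarity might look like; it assumes gensim's models and similarities modules and numpy are imported as in the snippet, and mirrors the diagonal-of-the-similarity-matrix pattern used in the count and TF-IDF branches.

def lda_similarity(cps, cps1, cps2, dic, num_topics=100):
    # Hypothetical helper, not part of the original example.
    # Train an LDA model on the combined corpus.
    lda = models.LdaModel(corpus=cps, id2word=dic, num_topics=num_topics)
    # Index the LDA vectors of the s1 column.
    index = similarities.MatrixSimilarity(lda[cps1], num_features=num_topics)
    # index[lda[cps2]] is an (n, n) matrix of similarities between every s2 row and every s1 row;
    # the diagonal pairs s1[i] with s2[i], as in the count and TF-IDF branches above.
    return np.diag(index[lda[cps2]])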
Example #2
    def __init__(self, doc_set, hash_length=32, words_per_token=1):
        if hash_length > self.MAX_HASH_LENGTH:
            raise Exception(
                "The specified hash length is too long. It must be 128 bits or less"
            )
        self.hash_length = hash_length
        self.documents = doc_set.documents
        docs = [title + " " + body for title, body in self.documents.items()]
        self.doc_list = [title for title, body in self.documents.items()]
        self.inverted_doc_index = {}
        for index, title in enumerate(self.doc_list):
            self.inverted_doc_index[title] = index
        texts = [[
            word for word in document.lower().split()
            if word not in stopwords.words('english')
        ] for document in docs]
        self.dictionary = corpora.Dictionary(texts)
        self.corpus = [self.dictionary.doc2bow(text) for text in texts]
        self.tfidf = models.TfidfModel(self.corpus)
        self.index = similarities.SparseMatrixSimilarity(
            self.tfidf[self.corpus], num_features=len(self.dictionary))
        self.simhash_dict = {}

        for ind, v in enumerate(self.corpus):
            self.simhash_dict[self.doc_list[ind]] = self.create_hash(
                self.tfidf[v])
Example #3
    def index_events(self, event_id_list=None):
        """
        Index the Events based on its indexes
        """

        # Event selection by Id (if provided)
        if event_id_list:
            event_corpus = [
                self.corpus_of_bows[self.dict_event_id_index[event_id]]
                for event_id in event_id_list
            ]
        else:
            event_corpus = self.corpus_of_bows

        # Applying the TFIDF Transformation (if necessary)
        if self.tfidf_model:
            transformed_corpus = self.model[self.tfidf_model[event_corpus]]
        else:
            transformed_corpus = self.model[event_corpus]

        # Create the index of the transformed_corpus to submit queries
        # We use the SparseMatrixSimilarity that uses a sparse data structure instead of a dense one
        # That's why we have to provide the num_features parameter
        self.corpus_query_index = similarities.SparseMatrixSimilarity(
            transformed_corpus, num_features=len(self.dictionary))
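index_events relies on attributes prepared elsewhere (self.dictionary, self.corpus_of_bows, self.dict_event_id_index, self.tfidf_model, self.model). A minimal constructor sketch follows, assuming tokenized event texts and an LSI model on top of an optional TF-IDF step; names and parameters here are illustrative, not the original implementation.

from gensim import corpora, models

class EventIndexer:
    def __init__(self, tokenized_events, event_ids, num_topics=200, use_tfidf=True):
        # Hypothetical setup for the attributes used by index_events above.
        self.dictionary = corpora.Dictionary(tokenized_events)
        self.corpus_of_bows = [self.dictionary.doc2bow(doc) for doc in tokenized_events]
        # Map each event id to its position in corpus_of_bows.
        self.dict_event_id_index = {eid: i for i, eid in enumerate(event_ids)}
        # Optional TF-IDF step feeding the topic model.
        self.tfidf_model = models.TfidfModel(self.corpus_of_bows) if use_tfidf else None
        training_corpus = (self.tfidf_model[self.corpus_of_bows]
                           if self.tfidf_model else self.corpus_of_bows)
        self.model = models.LsiModel(training_corpus, id2word=self.dictionary,
                                     num_topics=num_topics)
        self.corpus_query_index = None  # built later by index_events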
Example #4
 def create_index(self, docs_with_urls):
     logger.info("Creating index out of {} documents".format(
         len(docs_with_urls)))
     urls, doc_bows = zip(*self.infer_all(docs_with_urls))
     self.urls = urls
     self.index = similarities.SparseMatrixSimilarity(
         doc_bows, num_features=self.topics)
Example #5
def find_similar_research():
    research = select('url, plaintext as "value" from maincol where url != ?;',
                      [reference_person])
    research.extend(
        select('url, plaintext as "value" from maincol where url = ?;',
               [reference_person]))
    documents = [row['value'].strip() for row in research]
    stoplist = set('for a of the and to in'.split())
    texts = [[
        word for word in document.lower().split() if word not in stoplist
    ] for document in documents]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    vec = corpus[-1]  #The person being compared to

    tfidf = models.TfidfModel(corpus)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus])
    sims = index[tfidf[vec]]

    print list(enumerate(sims))
    save(['url'], [{
        "url": row[0],
        "similarity": row[1][1]
    } for row in zip([row['url'] for row in research], list(enumerate(sims)))],
         'similarity')
Example #6
def build_model():
    sentences = get_sentences()

    words_split = [sentence.split(' ') for sentence in sentences]

    # remove words that appear only once
    from collections import defaultdict

    frequency = defaultdict(int)
    for text in words_split:
        for token in text:
            frequency[token] += 1

    words_split = [[token for token in text if frequency[token] > 1]
                   for text in words_split]

    dictionary = corpora.Dictionary(words_split)
    corpus = [dictionary.doc2bow(text) for text in words_split]

    model = models.TfidfModel(corpus)
    feature_num = len(dictionary.token2id.keys())
    index = similarities.SparseMatrixSimilarity(model[corpus],
                                                num_features=feature_num)

    return dictionary, index, model
Example #7
def creat_main(str1,str2):
    # Define stop words
    stop_words=['。',',','!','?','……']
    # Tokenize with jieba
    str1_list=[]
    for line in str1:
        str1_words=' '.join(jieba.cut(line)).split(' ')
        doc_txt=[]
        for word in str1_words:
            if word not in stop_words:
                doc_txt.append(word)
        str1_list.append(doc_txt)      

    str2_words=' '.join(jieba.cut(str2)).split(' ')
    str2_list=[]
    for word in str2_words:
        if word not in stop_words:
            str2_list.append(word)

    # Process the reference documents into a dictionary (bag of words)
    dictionary=corpora.Dictionary(str1_list)
    # Each word in the dictionary is assigned a numeric id
    dictionary.keys()
    # Build the corpus with doc2bow
    corpus=[dictionary.doc2bow(word) for word in str1_list]
    # Convert the test document to a bag-of-words vector as well
    test_words_vec=dictionary.doc2bow(str2_list)
    # Build a TF-IDF model over the corpus
    tfidf=models.TfidfModel(corpus)
    # Compare the test document against every target document
    index=similarities.SparseMatrixSimilarity(tfidf[corpus],num_features=len(dictionary.keys()))
    sim=index[tfidf[test_words_vec]]
    print('Similarity: %.5f' % max(sim))
Example #8
def get_affinity_matrix(allTextList, gensimDict=False):
    print 'clustering: get_affinity_matrix'
    start = datetime.now()
    # Build the dictionary
    if gensimDict is False:
        # Remove stop words
        stoplist = set('for a of the and to in'.split())
        texts = [[
            word for word in text.lower().split() if word not in stoplist
        ] for text in allTextList]
        # Remove low-frequency words
        k = 2
        frequency = defaultdict(int)
        for text in texts:
            for token in text:
                frequency[token] += 1
        texts = [[token for token in text if frequency[token] > k]
                 for text in texts]
        dictionary = corpora.Dictionary(
            texts)  # dictionary.doc2bow() maps each tokenized sentence to a list of (token id, freq) pairs
    else:
        dictionary = gensimDict
        texts = [text.lower().split() for text in allTextList]
    corpus = [dictionary.doc2bow(text) for text in texts]

    featureNum = len(dictionary.token2id.keys())
    index = similarities.SparseMatrixSimilarity(corpus,
                                                num_features=featureNum)
    sims = [index[i] for i in corpus]
    print 'clustering: get_affinity_matrix end with ', (datetime.now() -
                                                        start).seconds, 's'
    return np.array(sims)
Example #9
def build_gensim_model(corpus):
    text_corpus = [el['text'] for el in corpus]
    # Create a set of frequent words
    stoplist = set('for a of the and to in'.split(' '))
    # Lowercase each document, split it by white space and filter out stopwords
    texts = [[word for word in document.lower().split() if word not in stoplist] for document in text_corpus]
    # Count word frequencies
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    # Only keep words that appear more than once
    processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
    #pprint.pprint(processed_corpus)

    from gensim import corpora
    dictionary = corpora.Dictionary(processed_corpus)
    bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

    from gensim import models
    # train the model
    tfidf = models.TfidfModel(bow_corpus)
    # transform the "system minors" string
    words = "dog food".lower().split()
    print(tfidf[dictionary.doc2bow(words)])
    from gensim import similarities
    index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=len(dictionary))

    query_document = 'dog food'.split()
    query_bow = dictionary.doc2bow(query_document)
    sims = index[tfidf[query_bow]]
    print(list(enumerate(sims)))

    return
Example #10
def main():
    posts = grabPosts()
    posts_combined = {
        x['post_id']: '{0} {1}'.format(x['title'], x['text'])
        for x in posts
    }

    keys = posts_combined.keys()
    documents = posts_combined.values()

    # sanitize and build our corpus
    sentences = [sanitize_sentence(sentence) for sentence in documents]
    dictionary = corpora.Dictionary(sentences)
    corpus = [dictionary.doc2bow(sentence) for sentence in sentences]

    # find out how many features we have and train the model.
    last_corpus = corpus[len(corpus) - 1]
    num_features = last_corpus[len(last_corpus) - 1][0] + 1
    tfidf = models.TfidfModel(corpus)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=num_features)

    # now that the corpus is settled, go through each post and compute similarity.
    for key, value in posts_combined.iteritems():
        vec = dictionary.doc2bow(sanitize_sentence(value))
        sims = index[tfidf[vec]]
        p = list(enumerate(sims))
        top_ten = sorted(p, key=lambda x: x[1], reverse=True)[1:11]
        dic = sorted([(keys[x[0]], x[1]) for x in top_ten],
                     key=lambda x: x[1],
                     reverse=True)
        EVALUATED_POSTS[key] = dic
Example #11
    def test_miislita_high_level(self):
        # construct corpus from file
        corpusname = datapath('miIslita.cor')
        miislita = CorpusMiislita(corpusname)

        # initialize tfidf transformation and similarity index
        tfidf = models.TfidfModel(miislita,
                                  miislita.dictionary,
                                  normalize=False)
        index = similarities.SparseMatrixSimilarity(tfidf[miislita],
                                                    num_features=len(
                                                        miislita.dictionary))

        # compare to query
        query = 'latent semantic indexing'
        vec_bow = miislita.dictionary.doc2bow(query.lower().split())
        vec_tfidf = tfidf[vec_bow]

        # perform a similarity query against the corpus
        sims_tfidf = index[vec_tfidf]

        # for the expected results see the article
        expected = [0.0, 0.2560, 0.7022, 0.1524, 0.3334]
        for i, value in enumerate(expected):
            self.assertAlmostEqual(sims_tfidf[i], value, 2)
Example #12
    def infer(self):
        courses = [
            list(set(stop_words(item).remove()))
            for item in [w.split() for w in self.Courses]
        ]
        classes = list(set(stop_words(self.File_class).remove()))

        dictionary = corpora.Dictionary(courses)
        feature_cnt = len(dictionary.token2id)
        corpus = [dictionary.doc2bow(text) for text in courses]
        tfidf = models.TfidfModel(corpus)
        kw_vector = dictionary.doc2bow(classes)
        index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                    num_features=feature_cnt)
        sim = index[tfidf[kw_vector]]

        course_rec = dict(zip(sim, self.Names))
        course_sort = sorted(course_rec.items(), reverse=True)

        lda_model = models.LdaMulticore(tfidf[corpus],
                                        num_topics=10,
                                        id2word=dictionary,
                                        passes=2,
                                        workers=2)

        for idx, topic in lda_model.print_topics(-1):
            print('Topic: {} \nWords: {}'.format(idx, topic))

        for index, score in sorted(lda_model[tfidf[kw_vector]],
                                   key=lambda tup: -1 * tup[1]):
            print("\nScore: {}\t \nTopic: {}".format(
                score, lda_model.print_topic(index, 10)))

        return course_sort
Example #13
    def get_similarity_rate(all_doc: List[str], doc_test: str) -> List[float]:
        bad_word = '[,."#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、\u3000、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·!?。。' '"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、\u3000、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·!?]+'
        doc_test_cleaned = re.sub(bad_word, "", doc_test)
        all_doc_cleaned = [re.sub(bad_word, "", doc) for doc in all_doc]

        if not doc_test_cleaned or not all_doc_cleaned:
            return [0, 0]

        all_doc_list = [[word for word in jieba.cut(doc)]
                        for doc in all_doc_cleaned]
        doc_test_list = [word for word in jieba.cut(doc_test_cleaned)]

        dictionary = corpora.Dictionary([doc_test_list])

        corpus = [dictionary.doc2bow(doc) for doc in all_doc_list]
        doc_test_vec = dictionary.doc2bow(doc_test_list)

        model = models.TfidfModel(corpus)

        index = similarities.SparseMatrixSimilarity(model[corpus],
                                                    num_features=len(
                                                        dictionary.keys()))

        sim = index[model[doc_test_vec]]

        max_index = np.argmax(sim)
        max_value = sim[max_index]

        return [max_index, max_value]
Example #14
    def judgement(self,key_text,compared_text):
        '''
        Similarity judgement function
        :return: similarity score
        '''

        texts = [compared_text, '']

        texts = [self.cut(text) for text in texts]
        # print(texts)

        dictionary = corpora.Dictionary(texts)
        feature_cnt = len(dictionary.token2id.keys())
        corpus = [dictionary.doc2bow(text) for text in texts]
        tfidf = models.TfidfModel(corpus)
        new_vec = dictionary.doc2bow(self.cut(key_text))

        # Similarity computation
        index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=feature_cnt)
        # print('\nTF-IDF sparse vectors of the corpus:')
        # for i in tfidf[corpus]:
        #     print(i)
        # print('\nTF-IDF sparse vector of the keyword:')
        # print(tfidf[new_vec])

        sim = index[tfidf[new_vec]]
        # self.log("相似度:%s" % sim[0])
        return sim[0]
Example #15
def test():
    # Build the reference corpus: 398872 samples
    sku_names_texts = get_train_datas()
    sku_names_jieba = get_text_jieba(sku_names_texts)

    # Test data: 1000 samples
    keywords_texts = get_test_datas()
    keywords_jieba = get_text_jieba(keywords_texts)

    # Build the vocabulary
    dictionary = corpora.Dictionary(sku_names_jieba)
    corpus = [dictionary.doc2bow(sku_name) for sku_name in sku_names_jieba]

    # Load the pre-trained model
    print("Model is loading...")
    tfidf = models.TfidfModel.load("models/tfidf_v2")
    print("Model loaded!")
    # Similarity index
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=len(
                                                    dictionary.keys()))

    for i, item in enumerate(keywords_jieba):
        item_vec = dictionary.doc2bow(item)
        sims = index[tfidf[item_vec]]
        idx = list(sims).index(max(list(sims)))
        print(i, "||", keywords_texts[i], "||", sku_names_texts[idx])

        with open("result/tfidf_v2_results.txt", 'a', encoding='utf8') as wf:
            wf.write(
                str(i) + "||" + keywords_texts[i] + "||" +
                sku_names_texts[idx] + "\n")
Example #16
 def sim(self,all_docs_words,test_doc_words):
     # First build the bag-of-words dictionary
     dictionary=corpora.Dictionary(all_docs_words)

     # Every word in the dictionary is assigned a numeric id
     dictionary.keys()

     # Mapping between ids and words
     dictionary.token2id

     # Build the corpus with doc2bow
     corpus = [dictionary.doc2bow(doc_words) for doc_words in all_docs_words]

     # Convert the tokenized test document into a bag-of-words vector
     doc_test_vec = dictionary.doc2bow(test_doc_words)

     # Build a TF-IDF model over the corpus
     tfidf = models.TfidfModel(corpus)

     # TF-IDF value of each word in the test document
     print(tfidf[doc_test_vec])

     # For each target document, compute its similarity to the test document
     index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary.keys()))
     sim = index[tfidf[doc_test_vec]]

     # Sort by similarity
     result=sorted(enumerate(sim), key=lambda item: -item[1])
     
     return result
     
Example #17
    def setup_models(self):

        start = time.time()
        print("Preparing corpus dictionary and vector...")

        #corpus_documents = [str(src).split() for src in self.eng_corpus]
        corpus_documents = [
            simple_preprocess(str(src)) for src in self.eng_corpus
        ]
        self.dictionary = corpora.Dictionary(corpus_documents)
        corpus_vector = [
            self.dictionary.doc2bow(tokens) for tokens in corpus_documents
        ]
        print("\tCorpus dictionary and vector completed, time cost: {}".format(
            round(time.time() - start, 2)))

        start = time.time()
        feature_cnt = len(self.dictionary.token2id)
        self.tfidf = models.TfidfModel(corpus_vector, smartirs='nnc')
        self.similarities = similarities.SparseMatrixSimilarity(
            self.tfidf[corpus_vector], num_features=feature_cnt)
        print("\tTFIDF and similarity matrix completed, time cost: {}".format(
            round(time.time() - start, 2)))

        print("\nSerializing corpus dictionary, tfidf and similarities... ")
        self.dictionary.save(str(self.serialize_dict))
        self.tfidf.save(str(self.serialize_tfidf))
        self.similarities.save(str(self.serialize_similarities))
        # corpora.MmCorpus.serialize(self.serialize_vector, self.corpus_vector)
        print("Serialization done.")
Example #18
def check(content, db):
    contentList = getContent(db)
    #print(type(contentList[0][1]))
    all_list = []
    for row in contentList:
        all_list.append(row[1])

    cut_list = []
    for doc in all_list:
        doc_list = [word for word in jieba.cut(doc)]
        cut_list.append(doc_list)

    #print(cut_list)

    cut_content = [word for word in jieba.cut(content)]

    dictionary = corpora.Dictionary(cut_list)

    dictionary.keys()
    #dictionary.token2id
    corpus = [dictionary.doc2bow(doc) for doc in cut_list]
    cut_content_vrc = dictionary.doc2bow(cut_content)

    tfidf = models.TfidfModel(corpus)
    #tfidf[cut_content_vrc]
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=len(
                                                    dictionary.keys()))
    sim = index[tfidf[cut_content_vrc]]
    #result = list(sim)
    result = list(map(float, sim))
    return result
Example #19
def vsm(data):
    documents = []
    for item in data:
        documents.append(item[0])
        documents.append(item[1])
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    tfidf_model = models.TfidfModel(corpus)
    vectors = [tfidf_model[bow] for bow in corpus]
    sim = similarities.SparseMatrixSimilarity(tfidf_model[corpus],
                                              num_features=len(
                                                  dictionary.keys()))
    mrr = 0
    hit = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    for i in range(len(data)):
        query_id = i * 2
        query_tfidf = vectors[query_id]
        sim_result = sim[query_tfidf]
        rank = 0
        for id, item in enumerate(sim_result):
            if id % 2 == 1 and item >= sim_result[query_id + 1]:
                rank += 1
        mrr += 1.0 / rank
        for k in range(len(hit)):
            if rank <= k + 1:
                hit[k] += 1
        print(
            '#%d:' % int(data[i][2]), 'rank=%d' % rank,
            'MRR=%.4f' % (mrr / (i + 1)),
            ', '.join([('Hit@%d=%.4f' % (k + 1, (h / (i + 1))))
                       for k, h in enumerate(hit)]))
Example #20
 def _freq_train(self):
     print('\t\t 1. Frequency training...', end='')
     self._freq_dict = corpora.Dictionary(self.words)
     bow_list = [self._freq_dict.doc2bow(text) for text in self.words]
     self._freq_index = similarities.SparseMatrixSimilarity(
         bow_list, num_features=len(self._freq_dict))
     print(' done.')
Example #21
 def genModel(self):
     if len(self.conds.keys()) == 0:
         return
     cnt = 0
     for key, vals in self.conds.items():
         for val in vals:
             self.conds_list.append(val)
             self.sent2cond[cnt] = key
             cnt += 1
     self.conds_list.append('不知道你说的啥')
     self.sent2cond[cnt] = '不知道你说的啥'
     choice_cut = []
     for i in self.conds_list:
         data1 = ''
         this_data = jieba.cut(i)
         for item in this_data:
             data1 += item + ' '
         choice_cut.append(data1)
     docs = choice_cut
     tall = [[w1 for w1 in doc.split()] for doc in docs]
     self.dictionary = corpora.Dictionary(tall)
     corpus = [self.dictionary.doc2bow(text) for text in tall]
     self.tfidf = models.TfidfModel(corpus)
     print(self.tfidf)
     num = len(self.dictionary.token2id.keys())
     self.index = similarities.SparseMatrixSimilarity(self.tfidf[corpus],
                                                      num_features=num)
     for key, val in self.children.items():
         val.genModel()
Example #22
def main():
    texts = preprocess_doc(DOC_PATH)  # reference document set
    new_text = preprocess_doc(NEW_DOC_PATH)  # new document

    frequency = defaultdict(int)  # count word frequencies
    for text in texts:
        for word in text:
            frequency[word] += 1

    dictionary = corpora.Dictionary(texts)  # build the dictionary

    # Bag-of-words model: one sparse vector per document; each element is the count of a word in that document.
    corpus_bow = [dictionary.doc2bow(text) for text in texts]
    new_bow = dictionary.doc2bow(new_text[0])

    # TF-IDF model: corpus_bow is an iterator of bow vectors; the model computes the IDF of every feature in the corpus.
    tfidf_model = models.TfidfModel(corpus_bow)

    # tfidf_model can turn any corpus (again an iterator of bow vectors) into an iterator of TF-IDF vectors.
    new_tfidf = tfidf_model[corpus_bow]
    new_vec_tfidf = tfidf_model[new_bow]

    featureNum = len(dictionary.token2id.keys())

    index = similarities.SparseMatrixSimilarity(new_tfidf,
                                                num_features=featureNum)
    sim = index[new_vec_tfidf]

    display(sim)

    return 0
Example #23
 def __init__(self, documents):
     dictionary = corpora.Dictionary(documents)
     corpus = [dictionary.doc2bow(doc) for doc in documents]
     tfidf_model = models.TfidfModel(corpus)
     self.vectors = [tfidf_model[bow] for bow in corpus]
     self.sim = similarities.SparseMatrixSimilarity(tfidf_model[corpus],
                                                    num_features=len(dictionary.keys()))
Example #24
 def process_lda_matrix(self):
     self.lda = models.ldamulticore.LdaMulticore.load(
         'path_pre_process/lda-model')
     index = similarities.SparseMatrixSimilarity(self.lda[self.corpus],
                                                 num_features=len(
                                                     self.data_dictionary))
     index.save('path_pre_process/lda_matrix')
Example #25
    def get_sim(self,all_anwser_list,question):
        anwser_list = []
        for doc in all_anwser_list:
            anwser1 = str(doc[3]) + '的'
            cut_list1 = [word for word in jieba.cut(anwser1)]
            anwser_list.append(cut_list1)
            anwser2 = str(doc[6]) + '的'
            cut_list2 = [word for word in jieba.cut(anwser2)]
            anwser_list.append(cut_list2)
        stopwords = self.stopwordslist()

        doc_list = []
        for sentence in anwser_list:
            l = []
            for word in sentence:
                if (word not in stopwords) and (word != '\t'):
                    l.append(word)
            doc_list.append(l)

        question_list = [word for word in jieba.cut(question)]
        doc_question_list = []
        for word in question_list:
            if (word not in stopwords) and (word != '\t'):
                doc_question_list.append(word)

        dictionary = corpora.Dictionary(doc_list)
        corpus = [dictionary.doc2bow(doc) for doc in doc_list]
        doc_question_vec = dictionary.doc2bow(doc_question_list)
        tfidf = models.TfidfModel(corpus)
        index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary.keys()))
        sim = index[tfidf[doc_question_vec]]
        return sim
Example #26
def get_similar_paper(target, number=3):
    # Use TF-IDF to recommend the user to read another paper.
    print('crawling...')
    doc_test = get_abstract(target)
    test_doc_list = []
    for i in doc_test.split(' '):
        test_doc_list.append(i)

    all_doc_list = []
    for i in all_doc:
        doc_list = []
        for j in i.split(' '):
            doc_list.append(j)
        all_doc_list.append(doc_list)

    print('analyzing...')
    mydict = corpora.Dictionary(all_doc_list)
    corpus = []
    for i in all_doc_list:
        corpus.append(mydict.doc2bow(i))
    test_vec = mydict.doc2bow(test_doc_list)
    tfidf = models.TfidfModel(corpus)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=len(
                                                    mydict.keys()))
    sim = index[tfidf[test_vec]]

    res = []
    for i in sorted(enumerate(sim), key=lambda item: item[-1], reverse=True):
        res.append(i[0])
    similar_links = []
    for i in res[0:number]:
        similar_links.append(links_dict[i])
    return similar_links
Example #27
    def tf_idf(self, doc_bow):
        """
        根据此类的文档集来检测文档相似度
        :param doc_bow:
        :return:
        """
        # initialize a model
        tfidf = models.TfidfModel(
            dictionary=self.dictionary)  # 初始化tf-idf模型, corpus 作为语料库

        # Use the tfidf model to convert the class's own corpus to the tf-idf representation
        corpus_tfidf = tfidf[self.corpus]
        for doc in corpus_tfidf:
            print(doc)

        # Use the tfidf model to convert doc_bow from (word, count) to (word, tfidf)
        print(tfidf[doc_bow])

        # Check the similarity against every document
        index = similarities.SparseMatrixSimilarity(tfidf[self.corpus],
                                                    num_features=len(
                                                        self.dictionary))
        sims = index[tfidf[doc_bow]]

        print(sims)
Example #28
 def creat_index(self):
     self.tfidf = models.TfidfModel(self.corpus)
     # self.lsi = models.LsiModel(
     #     self.tfidf[self.corpus], id2word=self.dic, num_topics=50
     # )
     self._index = similarities.SparseMatrixSimilarity(
         self.tfidf[self.corpus], num_features=200000)
Example #29
def similarity(sent, topN=10):

    corpus_lines = read_corpus(ner_result_path)
    texts = [line.split("\t")[0].split(' ') for line in corpus_lines]

    keywords = one_ner_tag(sent)

    dictionary = corpora.Dictionary(texts)
    num_features = len(dictionary.token2id)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = models.TfidfModel(corpus)

    new_vec = dictionary.doc2bow(keywords)
    # Similarity computation
    index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features)
    # index = similarities.Similarity('-Similarity-index', corpus, num_features)
    # print('\nTF-IDF sparse vectors of the corpus:')
    # for i in tfidf[corpus]:
    #     print(i)
    # print('\nTF-IDF sparse vector of the keyword:')
    # print(tfidf[new_vec])

    sims = index[tfidf[new_vec]]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])

    print("\n相似度计算")
    print('Words: {}\nText: {}\n'.format(keywords, sent))

    for k, v in sims[:topN]:
        i = int(k)
        print('Similarity: {}\nWords: {}\nText: {}'.format(
            v, corpus_lines[i].split("\t")[0].split(' '),
            corpus_lines[i].split("\t")[1]))
Example #30
def get_similarity(all_doc_list, doc_test_list):
    """
    1. 使用corpora.Dictionary(建立分词与编号的定义库)->使用dictionary.doc2bow(建立每个文档的分词-索引标号-数量 向量)
    2. 使用models.TfidfModel(通过分词-索引标号-数量 向量,建立TFidfModel,得到每个all_doc_vec的TF-IDF值)->剔除TF-IDF值较低的
    3. 通过文档的TF-IDF值建立相似度比较矩阵对象index,将测试文档的doc_test_vec代入index对象得到相似度值,并进行排序

    :param all_doc_list: 输入的文字
    :param doc_test_list: 需要测试相似度的样本文字
    :return:相关度排序
    """
    # 1.
    dictionary = corpora.Dictionary(all_doc_list)
    print(dictionary.keys())
    # token ids of the words
    print(dictionary.token2id)
    # mapping between ids and words
    all_doc_vec = get_bag_of_words(dictionary, all_doc_list)
    doc_test_vec = get_bag_of_words(dictionary, doc_test_list)
    # (id, count) bag-of-words vectors
    # 2.
    tfidf = models.TfidfModel(all_doc_vec)  # build the TF-IDF model

    doc_test_vec = eliminate_junk(tfidf, doc_test_vec)
    # 3.
    index = similarities.SparseMatrixSimilarity(tfidf[all_doc_vec],
                                                num_features=len(
                                                    dictionary.keys()))
    sim = index[tfidf[doc_test_vec]]
    print("相似度:")
    print(sim)
    similarities_sort = sorted(enumerate(sim), key=lambda item: -item[1])
    print("相似度排序:")
    print(similarities_sort)
    return similarities_sort
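get_bag_of_words and eliminate_junk are referenced above but not defined in this example. Below are hypothetical sketches consistent with the docstring (step 1 builds the bag-of-words vectors, step 2 drops low TF-IDF terms); the 0.1 threshold is an assumption, not the original value.

def get_bag_of_words(dictionary, doc_list):
    # Convert each tokenized document into a (token id, count) vector (step 1).
    return [dictionary.doc2bow(doc) for doc in doc_list]

def eliminate_junk(tfidf, bow_corpus, threshold=0.1):
    # Drop terms with a low TF-IDF weight from each document vector (step 2).
    cleaned = []
    for bow in bow_corpus:
        weights = dict(tfidf[bow])
        cleaned.append([(term_id, count) for term_id, count in bow
                        if weights.get(term_id, 0.0) > threshold])
    return cleaned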