Example #1
def getSparseMatrixSimilarity(keyword, texts):

    # 1. Tokenize each text in the corpus into a list of words
    texts = [jieba.lcut(text) for text in texts]

    # 2. Build a dictionary from the tokenized texts and get the number of dictionary features
    dictionary = Dictionary(texts)
    num_features = len(dictionary.token2id)

    # 3.1. Using the dictionary, convert the tokenized texts into sparse bag-of-words vectors, i.e. the corpus
    corpus = [dictionary.doc2bow(text) for text in texts]
    # 3.2. Likewise, use the dictionary to convert the search keyword into a sparse vector
    kw_vector = dictionary.doc2bow(jieba.lcut(keyword))

    # 4. Create a TF-IDF model, trained on the corpus
    tfidf = TfidfModel(corpus)
    # 5. Apply the trained TF-IDF model to the searched texts and the search keyword
    tf_texts = tfidf[corpus]  # the corpus doubles as the searched texts here
    tf_kw = tfidf[kw_vector]
    # 6. Compute similarities
    sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
    similarities = sparse_matrix.get_similarities(tf_kw)
    for e, s in enumerate(similarities, 1):
        print('similarity between kw and text%d: %.2f' % (e, s))

    print(sparse_matrix)
    print(similarities)
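A minimal usage sketch for the helper above, assuming jieba and gensim are installed and imported as shown; the sample texts and keyword below are invented for illustration:

import jieba
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseMatrixSimilarity

# hypothetical sample data; any list of raw strings works
texts = ['gensim builds TF-IDF models and similarity indexes',
         'jieba segments Chinese text into words',
         'this sentence talks about something else entirely']
getSparseMatrixSimilarity('TF-IDF similarity index', texts)
# prints one "similarity between kw and textN" line per text, then the index object and the raw scores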
Example #2
File: tfidf.py  Project: zzozzolev/claf
    def init_model(self):
        corpus = []
        for text in tqdm(self.texts, desc="make corpus (BoW)"):
            corpus.append(self.parse(text))

        self.model = TfidfModel(corpus)
        self.index = SparseMatrixSimilarity(self.model[corpus], num_features=len(self.vocab))
Example #3
 def load(conf: Configuration, force: Optional[bool] = False,
          persist: Optional[bool] = True) -> "TFIDFRanker":
     model_path = conf.path_models + 'vsm_tfidf/' + conf.get_desc() + '/'
     if force or (not os.path.exists(model_path)) \
             or (not os.path.isfile(model_path + 'corpus.mm')) \
             or (not os.path.isfile(model_path + 'tfidf.model')):
         utils.mk_dir_if_not_exists(model_path)
         dataset = TFIDFRanker.extractor.load_dataset(conf=conf)
         dictionary = corpora.Dictionary([Ranker.get_text(conf, data) for (index, data) in dataset.iterrows()])
         bow_corpus = [(dictionary.doc2bow(Ranker.get_text(conf, data)), data['filename'])
                       for (index, data) in dataset.iterrows()]
         bow_corpus, names = map(list, zip(*bow_corpus))
         index_mapping = TFIDFRanker.build_index_mapping(names)
         corpora.MmCorpus.serialize(model_path + 'corpus.mm', bow_corpus)
         mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
         tfidf_model = TfidfModel(mm_corpus)
         tfidf_index = SparseMatrixSimilarity(tfidf_model[mm_corpus],
                                              num_features=mm_corpus.num_terms)
         ranker = TFIDFRanker(dictionary=dictionary, bow_corpus=mm_corpus,
                              model=tfidf_model, index=tfidf_index, index_mapping=index_mapping, conf=conf)
         ranker.persist(model_path)
         logging.info('TFIDFRanker : initialized')
         logging.info('TFIDFRanker : model : {}'.format(tfidf_model))
         logging.info('TFIDFRanker : index : {}'.format(tfidf_index))
         return ranker
     else:
         dictionary = corpora.Dictionary.load(model_path + 'dict.dictionary')
         mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
         tfidf_model = TfidfModel.load(model_path + 'tfidf.model')
         tfidf_index = SparseMatrixSimilarity.load(model_path + 'tfidf.index')
         with open(model_path + 'index_mapping.pickle', mode='rb') as file:
             index_mapping = pickle.load(file)
             logging.info('TFIDFRanker : initialized')
         return TFIDFRanker(dictionary=dictionary, bow_corpus=mm_corpus,
                            model=tfidf_model, index=tfidf_index, index_mapping=index_mapping, conf=conf)
Example #4
def mergeTags():
    res = {}  # create an empty dict
    for i in range(len(displayArr)):
        texts = default_tags
        keyword = displayArr[i]
        # 1. Tokenize each text in the corpus into a list of words
        texts = [lcut(text) for text in texts]
        # 2. Build a dictionary from the tokenized texts and get the number of dictionary features
        dictionary = Dictionary(texts)
        num_features = len(dictionary.token2id)
        # 3.1. Using the dictionary, convert the tokenized texts into sparse bag-of-words vectors, i.e. the corpus
        corpus = [dictionary.doc2bow(text) for text in texts]
        # 3.2. Likewise, use the dictionary to convert the search keyword into a sparse vector
        kw_vector = dictionary.doc2bow(lcut(keyword))
        # 4. Create a TF-IDF model, trained on the corpus
        tfidf = TfidfModel(corpus)
        # 5. Apply the trained TF-IDF model to the searched texts and the search keyword
        tf_texts = tfidf[corpus]  # the corpus doubles as the searched texts here
        tf_kw = tfidf[kw_vector]
        # 6. Compute similarities
        sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
        similarities = sparse_matrix.get_similarities(tf_kw)
        for e, s in enumerate(similarities, 1):
            if s > 0.5:
                # print(keyword, 'vs', ''.join(texts[e - 1]), 'similarity:', s)
                key = ''.join(texts[e - 1]).strip()
                res[key] = s
        arrSorted = sorted(res.items(), key=lambda item: item[1], reverse=True)
        for ind, (k, v) in enumerate(arrSorted):
            if ind == 0:
                ids = textsOld[i].strip().split('.')[0]
                textsOld[i] = textsOld[i] + '----------' + k
                # textsOld[i] = ids+'.'+k
        res = {}  # reset the dict
    return textsOld
Example #5
def samilarRate(texts, keyword):
    # args: texts is the text corpus, keyword is the search term

    # 1. Tokenize each text in the corpus into a list of words
    texts = [lcut(text) for text in texts]

    # 2. Build a dictionary from the tokenized texts and get the number of dictionary features
    dictionary = Dictionary(texts)
    num_features = len(dictionary.token2id)

    # 3.1. Using the dictionary, convert the tokenized texts into sparse bag-of-words vectors, i.e. the corpus
    corpus = [dictionary.doc2bow(text) for text in texts]

    # 3.2. Likewise, use the dictionary to convert the search keyword into a sparse vector
    kw_vector = dictionary.doc2bow(lcut(keyword))

    # 4. Create a TF-IDF model, trained on the corpus
    tfidf = TfidfModel(corpus)

    # 5. Apply the trained TF-IDF model to the searched texts and the search keyword
    tf_texts = tfidf[corpus]  # the corpus doubles as the searched texts here
    tf_kw = tfidf[kw_vector]

    # 6. Compute similarities
    sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
    similarities = sparse_matrix.get_similarities(tf_kw)
    result = []
    sorft = []
    for e, s in enumerate(similarities, 1):
        result.append('similarity between kw and text%d: %.2f' % (e, s))
        sorft.append(s)
    return result, sorft
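A hedged example of calling samilarRate and reading its two return values; the texts and keyword are invented, and lcut is assumed to come from jieba as in the snippet above:

# result holds the formatted strings, sorft the raw scores, both in text order
result, sorft = samilarRate(['the cat sat on the mat',
                             'a cat sat on a mat',
                             'a completely unrelated sentence'],
                            'cat on mat')
best = max(range(len(sorft)), key=lambda i: sorft[i])
print(result[best])  # the line for the most similar text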
Example #6
 def check(news):
     """检查是否重复"""
     dictionary, corpus, num_features = Similar.dictionary()
     kw_vector = dictionary.doc2bow(lcut(news))
     tfidf = TfidfModel(corpus)
     tf_texts = tfidf[corpus]
     tf_kw = tfidf[kw_vector]
     sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
     similarities = sparse_matrix.get_similarities(tf_kw)
     for e, s in enumerate(similarities, 1):
         if 0.6 < s < 0.98:
             return
     return news
Example #7
def getMatrixSimilarity(tfidfModel, lsiModel=None) -> SparseMatrixSimilarity:
    similarityPath = os.path.join('.cache', 'sim_mat.gensim_sim')
    try:
        sim = MatrixSimilarity.load(similarityPath)
    except FileNotFoundError:
        corpus = Sparse2Corpus(tfidfModel.vectors, documents_columns=False)
        if lsiModel is None:
            lsiModel = getLsiModel(tfidfModel)
        sim = SparseMatrixSimilarity(lsiModel[corpus],
                                     num_best=21,
                                     num_features=tfidfModel.vectors.shape[0])
        sim.save(similarityPath)
    return sim
Example #8
    def fit(self):
        """
        Fit the TFIDF model

        each argument should be a list of lists, where each inner list is a list of keyphrases.

        e.g.

        submission_kps = [
            ['deep_learning', 'natural_language_processing'],
            ['neural_imaging', 'fmri', 'functional_magnetic_resonance']
        ]
        """

        self.bow_archives_by_paperid = {userid: [self.dictionary.doc2bow(doc) for doc in archive] \
            for userid, archive in self.kp_archives_by_paperid.items()}

        self.bow_archives_by_userid = {userid: [self.dictionary.doc2bow(doc) for doc in archive] \
            for userid, archive in self.kp_archives_by_userid.items()}

        flattened_archives = [
            bow for archive in self.bow_archives_by_paperid.values()
            for bow in archive
        ]

        self.index = SparseMatrixSimilarity(
            [self.tfidf[bow] for bow in flattened_archives],
            num_features=len(self.dictionary))
Example #9
 def create_index(self, docs_with_urls):
     logger.info("Creating index out of {} documents".format(
         len(docs_with_urls)))
     urls, doc_bows = zip(*self.infer_all(docs_with_urls))
     self.urls = urls
     self.index = SparseMatrixSimilarity(doc_bows,
                                         num_features=len(self.dictionary))
Example #10
def get_sim(previous_req, new_req, flag):
    # Dictionary built from cosmic and non-cosmic information
    dictionary = corpora.Dictionary.load_from_text('./data/total.dic')
    # print('new_req.shape:', new_req.shape, 'raw_info.shape:', raw_info.shape, 'previous_req.shape:', previous_req.shape)
    corpus = [dictionary.doc2bow(text) for text in previous_req[flag]]
    tfidf = models.TfidfModel(corpus)
    index = SparseMatrixSimilarity(tfidf[corpus], 4000)

    new = [dictionary.doc2bow(t) for t in new_req['joint_info']]
    sim_dict = {}
    sim = index[new]
    relation_exist = []
    for i in range(new_req.shape[0]):
        key = (new_req['batch'].iloc[i], new_req['projectNo'].iloc[i],
               new_req['requirementNO'].iloc[i])
        value = {}
        current = {
            key,
        }
        # print(key)
        for j in range(len(sim[i])):
            if sim[i][j] >= 0.8:
                inner_key = (previous_req['batch'].iloc[j],
                             previous_req['projectNo'].iloc[j],
                             previous_req['requirementNO'].iloc[j])
                if inner_key == key:
                    continue
                else:
                    value[inner_key] = sim[i][j]
                    current.add(inner_key)
        if value and (current not in relation_exist):
            relation_exist.append(current)
            sim_dict[key] = value

    return sim_dict, len(relation_exist)
Example #11
    def __tfidf_sim_match(self, query, ans_list, threshold=0.10):
        cut_query, _ = self.reader.clean_cut_trim([query])  # clean the query
        cut_ans_list, _ = self.reader.clean_cut_trim(ans_list)  # clean ans_list

        ans_bow = [self.tfidf_dict.doc2bow(line)
                   for line in cut_ans_list]  # build a bag of words from ans_list
        text_tfidf = self.tfidf_model[ans_bow]  # apply model
        sim_index = SparseMatrixSimilarity(text_tfidf, self.n_features)

        query_bow = [self.tfidf_dict.doc2bow(cut_query[0])
                     ]  # build a bag of words from the query
        query_tfidf = self.tfidf_model[query_bow]  # encode with the tfidf model
        similarities = sim_index[query_tfidf][0]  # compute similarities

        sorted_scores = sorted(similarities, reverse=True)  # sort scores in descending order
        max_pos = np.argsort(similarities)[::-1]  # indices sorted from highest to lowest score
        answers = self.__max_pos2answers(max_pos, ans_list)

        # filter the results once more with the QQ-match threshold
        sorted_scores, max_pos, answers, questions = \
            self.__filter_by_threshold(sorted_scores, max_pos, answers, [], threshold)

        if len(answers) > 0:
            return True
        else:
            return False
Example #12
async def load_index():
    if 'index' not in model:
        index_file = await tasks['index']
        model['index'] = SparseMatrixSimilarity.load(index_file)
        for shard in model['index'].shards:
            shard.dirname = os.path.dirname(index_file)
    return model['index']
Example #13
def mergeTags(textArr):
    res = []
    for i in range(len(displayArr)):
        try:
            exampleArr = textArr
            if i == 0:
                texts = textArr
            else:
                for item in res:
                    if item in exampleArr:
                        exampleArr.remove(item)
                texts = exampleArr
                res = []

            # print(exampleArr)
            # text corpus and search keyword
            # texts = ['吃鸡这里所谓的吃鸡并不是真的吃鸡,也不是谐音词刺激的意思',
            #          '而是出自策略射击游戏《绝地求生:大逃杀》里的台词',
            #          '我吃鸡翅,你吃鸡腿']
            keyword = texts[i]
            # 1. Tokenize each text in the corpus into a list of words
            texts = [lcut(text) for text in texts]
            # 2. Build a dictionary from the tokenized texts and get the number of dictionary features
            dictionary = Dictionary(texts)
            num_features = len(dictionary.token2id)
            # 3.1. Using the dictionary, convert the tokenized texts into sparse bag-of-words vectors, i.e. the corpus
            corpus = [dictionary.doc2bow(text) for text in texts]
            # 3.2. Likewise, use the dictionary to convert the search keyword into a sparse vector
            kw_vector = dictionary.doc2bow(lcut(keyword))
            # 4. Create a TF-IDF model, trained on the corpus
            tfidf = TfidfModel(corpus)
            # 5. Apply the trained TF-IDF model to the searched texts and the search keyword
            tf_texts = tfidf[corpus]  # the corpus doubles as the searched texts here
            tf_kw = tfidf[kw_vector]
            # 6. Compute similarities
            sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
            similarities = sparse_matrix.get_similarities(tf_kw)
            for e, s in enumerate(similarities, 1):
                if s > 0.5:
                    res.append(exampleArr[e - 1])
                    print(keyword + ' vs ' + exampleArr[e - 1] + ' similarity:', s)
            print('---------------------------------------------------')
        except:
            print('')
    print('Merge complete!')
Example #14
def predict_tfid(pred_data, data):
    path = input("Enter path to LDA model: ")
    tfid = gensim.models.TfidfModel.load(path + "tfid_model")
    corpus = MmCorpus(path + "tfid_corpus.mm")
    tfid_corpus = tfid[corpus]
    new_dictionary = Dictionary(data['tokens'])
    new_corpus = [new_dictionary.doc2bow(doc) for doc in data['tokens']]
    index_sparse = SparseMatrixSimilarity(tfid_corpus,
                                          num_features=corpus.num_terms)
    index_sparse.num_best = 500
    idx = (index_sparse[new_corpus])
    print("Most Similar users are as follows: ")
    print("Name\t\t\tscore ")
    m = 1
    for i in idx[0]:
        display("{}. {}     {}".format(m, data.iloc[i[0]]['handles'], i[1]))
        m += 1
    return
Example #15
 def build_model(self, listoftextlist):
     self.dictionary = Dictionary(listoftextlist)
     self.num_features = len(self.dictionary.token2id)
     self.corpus = [
         self.dictionary.doc2bow(text) for text in listoftextlist
     ]
     self.tfidf = TfidfModel(self.corpus)
     self.index = SparseMatrixSimilarity(self.tfidf[self.corpus],
                                         self.num_features)
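The snippet above only builds the index; a possible companion sketch for querying it could look like this (the query method and its tokenized_text argument are assumptions, not part of the original class):

 def query(self, tokenized_text):
     # hypothetical companion method: encode the query with the same dictionary
     # and TF-IDF model, then score it against the stored index
     bow = self.dictionary.doc2bow(tokenized_text)
     return self.index[self.tfidf[bow]]  # one cosine similarity per indexed document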
Example #16
def create_neg_from_cluster(data, cluster_id, all_clusters):
    '''
    Creates negative pairs from a cluster
    '''

    # Get the cluster
    cluster = data.loc[data["cluster_id"].values == cluster_id]
    cluster = extract_key_features(cluster)
    pairs = []
    hard_neg = len(cluster) // 2
    
    # Hard negatives are those that are from different clusters, but we get the pair with the highest similarity
    for row in range(hard_neg):
        # Keep choosing random titles until we get one that is not our own
        neg_cluster_id = cluster_id        
        while neg_cluster_id == cluster_id:
            neg_cluster_id = random.choice(all_clusters)
        
        # Extract data about this cluster
        neg_cluster = data.loc[data["cluster_id"].values == neg_cluster_id].copy()
        neg_cluster = extract_key_features(neg_cluster)
        
        # Add the current title of the cluster to the beginning of this random cluster so that
        # the first row in the similarity matrix will refer to this title
        neg_cluster = pd.concat([pd.DataFrame([cluster.iloc[row].values], columns=["id", "description", "title", "titleDesc"]),
                                 neg_cluster])
        
        # Get the similarity between the title and the random cluster
        dictionary = corpora.Dictionary(neg_cluster["titleDesc"])
        neg_cluster_dict = [dictionary.doc2bow(title) for title in neg_cluster["title"].map(lambda x: x.split(" "))]
        sim_matrix = np.array(SparseMatrixSimilarity(neg_cluster_dict, num_features=len(dictionary)))
        
        # First row is the similarity between the current title and the rest of the random cluster
        # so get the max similarity of this (+1 is because we don't include the similarity with ourself)
        max_val = sim_matrix[0][1:].argmax() + 1
        
        # Add the pair
        pair = [cluster["title"].iloc[row], neg_cluster["title"].iloc[max_val], 0]
        pairs.append(pair)
    
    for row in range(hard_neg, len(cluster)):
        # Keep choosing random titles until we get one that is not our own
        neg_cluster_id = cluster_id
        while neg_cluster_id == cluster_id:
            neg_cluster_id = random.choice(all_clusters)
        
        # Randomly get a title from the random cluster
        neg_cluster = data.loc[data["cluster_id"].values == neg_cluster_id].copy()
        neg_cluster = extract_key_features(neg_cluster)
        neg_title = neg_cluster["title"].iloc[random.choice(list(range(len(neg_cluster))))]
        
        # Add the pair
        pair = [cluster["title"].iloc[row], neg_title, 0]
        pairs.append(pair)
    
    return pd.DataFrame(pairs, columns=["title_one", "title_two", "label"])
Example #17
File: tfidf.py  Project: zzozzolev/claf
    def load(self, dir_path):
        dir_path = Path(dir_path)

        vocab_path = str(dir_path / self.VOCAB_FNAME)
        model_path = str(dir_path / self.TFIDF_FNAME)
        index_path = str(dir_path / self.INDEX_FNAME)

        self.vocab = Dictionary.load(vocab_path)
        self.model = TfidfModel.load(model_path)
        self.index = SparseMatrixSimilarity.load(index_path)
Example #18
def similar(aim):
    aim_text = aim.title + aim.abstract
    simple = [x.title + x.abstract for x in ret[0:-10]]
    text = [set(posseg.lcut(x)) for x in simple]
    text = list({y for x in text for y in x})
    dictionary = Dictionary(text)
    length = len(dictionary.token2id)
    corpus = [dictionary.doc2bow(lcut(src)) for src in simple]
    tfidf = TfidfModel(corpus)
    tf_texts = tfidf[corpus]
    sparse_matrix = SparseMatrixSimilarity(tf_texts, length)

    vector = dictionary.doc2bow(lcut(aim_text))
    tf_kw = tfidf[vector]
    similarities = sparse_matrix.get_similarities(tf_kw)

    print(aim.title)
    for e, s in enumerate(similarities, 1):
        if s > 0.1:
            print(s, ret[e - 1].title)
Example #19
def create_pos_from_cluster(data, cluster_id):
    '''
    Creates positive pairs from a cluster
    '''

    MAX_PAIRS = 16
    cluster = data.loc[data["cluster_id"].values == cluster_id]
    cluster = extract_key_features(cluster)
    max_combos = combinations(len(cluster), 2)
    
    dictionary = corpora.Dictionary(cluster["titleDesc"])
    cluster_dict = [dictionary.doc2bow(title) for title in cluster["title"].map(lambda x: x.split(" "))]
    sim_matrix = np.array(SparseMatrixSimilarity(cluster_dict, num_features=len(dictionary)))
    
    # Because the matrix is redundant (the rows and columns represent the same titles)
    # we set the bottom half of the similarities (including the diagonal) to 100
    # so that we don't have to worry about them when doing argmin()
    for row in range(sim_matrix.shape[0]):
        for column in range(sim_matrix.shape[1]):
            if (row >= column):
                sim_matrix[row][column] = 100
    
    # If the maximum amount of combinations we can make is less than our set max,
    # set the maximum to the max combos
    if max_combos < MAX_PAIRS:
        MAX_PAIRS = max_combos
    
    # Half of the pairs should be hard positives and the other half random
    hard_pos = MAX_PAIRS // 2
    random_pos = MAX_PAIRS - hard_pos
    
    pairs = []

    # Hard positives are those that are from the same cluster, but with the least similarity
    for x in range(hard_pos):
        # Keep getting the pairs with the lowest similarity score
        min_sim = np.unravel_index(sim_matrix.argmin(), sim_matrix.shape)
        pair = [cluster["title"].iloc[min_sim[0]], cluster["title"].iloc[min_sim[1]], 1]
        pairs.append(pair)
        sim_matrix[min_sim[0]][min_sim[1]] = 100
    
    # The amount of available pairs (given that some are gone from hard positive creation)
    avail_indices = np.argwhere(sim_matrix != 100)

    # Get random pairs within the same cluster
    for x in range(random_pos):
        ran_idx = random.sample(list(range(len(avail_indices))), 1)
        choice = avail_indices[ran_idx][0]
        pair = [cluster["title"].iloc[choice[0]],
                cluster["title"].iloc[choice[1]], 1]
        pairs.append(pair)
        avail_indices = np.delete(avail_indices, ran_idx, 0)
    
    return pd.DataFrame(pairs, columns=["title_one", "title_two", "label"])
Example #20
    def index(self, corpus, mode="MatrixSimilarity"):
        if mode == "MatrixSimilarity":
            self._index = MatrixSimilarity(self.corpus,
                                           num_features=self.num_features)
        elif mode == "SparseMatrixSimilarity":
            self._index = SparseMatrixSimilarity(
                self.corpus, num_features=self.num_features)
        else:
            raise TypeError(
                "mode has to be either MatrixSimilarity or SparseMatrixSimilarity"
            )

        return self._index[corpus]
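A hedged call of the index method above; searcher, query_corpus and the surrounding class (assumed to already hold self.corpus and self.num_features) are placeholders:

# scores every query document against the indexed corpus
sims = searcher.index(query_corpus, mode="SparseMatrixSimilarity")
# any other mode string raises TypeError, as coded above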
Example #21
 def to_sparse_matrix(self):
     # 1. Tokenize each text in the corpus into a list of words
     #texts = [lcut(text) for text in texts_list]
     texts = [lcut(str(text)) for text in self.texts_list]

     # 2. Build a dictionary from the tokenized texts and get the number of dictionary features
     dictionary = Dictionary(texts)
     self.dictionary = dictionary
     num_features = len(dictionary.token2id)
     # 3.1. Using the dictionary, convert the tokenized texts into sparse bag-of-words vectors, i.e. the corpus
     corpus = [dictionary.doc2bow(text) for text in texts]
     # 4. Create a TF-IDF model, trained on the corpus
     tfidf = TfidfModel(corpus)
     self.tfidf = tfidf
     tf_texts = tfidf[corpus]
     sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
     
     return sparse_matrix
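A possible follow-up query against the matrix returned by to_sparse_matrix, reusing the dictionary and tfidf model that the method stores on the instance; obj and the query string are placeholders:

sim = obj.to_sparse_matrix()                            # build the index once
kw_vector = obj.dictionary.doc2bow(lcut('some query'))  # obj.dictionary / obj.tfidf were set by to_sparse_matrix
print(sim.get_similarities(obj.tfidf[kw_vector]))       # one score per text in obj.texts_list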
Example #22
def get_sim(all_reqs, new_req):
    # Dictionary built from cosmic and non-cosmic information
    dictionary = corpora.Dictionary.load_from_text('./data/total.dic')
    corpus = [dictionary.doc2bow(text) for text in all_reqs['joint_info']]

    # build the tfidf model
    tfidf = models.TfidfModel(corpus)

    # build the sparse-matrix similarity index
    index = SparseMatrixSimilarity(tfidf[corpus], 4000)

    # nested dict storing the similarity results, in the format {new_req_id1: {sim_req_id1: sim_value, ...}, ...}
    # each primary key is a tuple of the requirement's type, batch, projectNO, requirementNO and file_trail
    sim_dict = {}

    # build a bag of words for each new requirement with the dictionary, then encode it as a TF-IDF vector with the tfidf model
    # feeding the new requirements' TF-IDF vectors into index (the SparseMatrixSimilarity model) yields the similarity matrix sim, of size [len(new_req) x len(all_req)]
    sim = index[tfidf[[dictionary.doc2bow(t) for t in new_req['joint_info']]]]
    print('similarity matrix shape:', sim.shape)
    relation_exist = []
    for i in range(sim.shape[0]):
        key = (new_req['type'].iloc[i], new_req['batch'].iloc[i],
               new_req['projectNO'].iloc[i], new_req['requirementNO'].iloc[i],
               new_req['file_trail'].iloc[i])
        # print(key)
        value = {}
        for j in range(sim.shape[1]):
            if sim[i][j] >= 0.86:
                inner_key = (all_reqs['type'].iloc[j],
                             all_reqs['batch'].iloc[j],
                             all_reqs['projectNO'].iloc[j],
                             all_reqs['requirementNO'].iloc[j],
                             all_reqs['file_trail'].iloc[j])
                if inner_key == key:
                    continue
                else:
                    ## current_relation is the set holding this relation pair
                    current_relation = {key, inner_key}
                    if current_relation not in relation_exist:
                        value[inner_key] = sim[i][j]
                        relation_exist.append(current_relation)
        if value:
            sim_dict[key] = value
    return sim_dict, len(relation_exist)
Example #23
def compute_section_text_similarity(corpus: Sequence[str]):
    """ Convert text bundle into tfidf vectors."""

    # Use generator to increase memory efficiency
    def tokenized_corpus() -> typing.Generator[list[str], None, None]:
        yield from (tokenize(doc) for doc in corpus)

    def bag_of_words_corpus(
        dct: corpora.Dictionary
    ) -> typing.Generator[list[tuple[int, int]], None, None]:
        yield from (dct.doc2bow(doc) for doc in tokenized_corpus())

    word_id_map = corpora.Dictionary(tokenized_corpus())
    tfidf = models.TfidfModel(bag_of_words_corpus(word_id_map),
                              dictionary=word_id_map)

    similarity_index = SparseMatrixSimilarity(
        tfidf[bag_of_words_corpus(word_id_map)], num_features=len(word_id_map))
    pairwise_similarity = [
        simi for idx, similarities in enumerate(similarity_index)
        for simi in similarities[idx + 1:]
    ]
    return sum(pairwise_similarity) / len(pairwise_similarity)
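A small, hypothetical call of compute_section_text_similarity; the three sections are made up, and tokenize is assumed to be the module's own tokenizer used inside the function:

sections = ['alpha beta gamma delta',
            'alpha beta gamma epsilon',
            'totally different words here']
avg_sim = compute_section_text_similarity(sections)
print('average pairwise TF-IDF similarity: %.3f' % avg_sim)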
Example #24
    dictionary = corpora.Dictionary(worked_texts)

    corpus = [dictionary.doc2bow(worked_text) for worked_text in worked_texts]

    # convert the dictionary to a bag of words corpus for reference

    lsi_model = models.LsiModel(corpus, id2word=dictionary, num_topics=12)

    query = "April is the fourth month of the year, and comes between March \
        and May. It has 30 days. April begins on the same day of week as July in \
        all years and also January in leap years."

    query = "IF YOU'RE LIKE most iPhone users, when you upgraded to the newest version of iOS, Apple automatically migrated your settings, apps, and text messages. While there are benefits to wiping your phone and starting over—c'mon, you don't really need all those apps—there's also the possibility that you might lose valuable info hidden within your text messages."

    tfidf_model = TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    from gensim.similarities import MatrixSimilarity, SparseMatrixSimilarity, Similarity

    index_sparse = SparseMatrixSimilarity(corpus, num_features=len(dictionary))

    import sklearn
    sklearn.externals.joblib.dump(tfidf_model, 'tfidf_model.pkl')
    sklearn.externals.joblib.dump(lsi_model, 'lsi_model.pkl')
    sklearn.externals.joblib.dump(texts, 'texts.pkl')

    sklearn.externals.joblib.dump(worked_texts, 'worked_texts.pkl')
    sklearn.externals.joblib.dump(titles, 'titles.pkl')
    sklearn.externals.joblib.dump(dictionary, 'dictionary.pkl')
    sklearn.externals.joblib.dump(corpus, 'corpus.pkl')
    sklearn.externals.joblib.dump(index_sparse, 'index_sparse.pkl')
Example #25
    def __init__(self):
        # load the stopword list
        with open(FilePool.stopword_txt, 'r') as f_stopword:
            doc = f_stopword.readlines()
        self.stopwords = [line.rstrip('\n') for line in doc]

        # load the answers
        if args.answer_base == 'long':
            # use long answers
            ans_json = FilePool.long_answers_json
            ans_txt = FilePool.long_answers_txt
        elif args.answer_base == 'cleaned':
            # use short (cleaned) answers
            ans_json = FilePool.cleaned_answers_json
            ans_txt = FilePool.cleaned_answers_txt
        else:
            # use small answers
            ans_json = FilePool.small_answers_json
            ans_txt = FilePool.small_answers_txt
        with open(ans_json, 'r') as f_json:
            text = json.load(f_json)
            if args.trim_stop:
                self.cut_answers = [[ele for ele in answer if ele not in self.stopwords] for answer in text]
            else:
                self.cut_answers = text
        with open(ans_txt, 'r') as f_ans_txt:
            text = f_ans_txt.readlines()
            self.uncut_answers = [line.rstrip('\n') for line in text]

        # load the QA base and the known-question base
        if args.method == Method.mix or args.method == Method.qq_match:
            with open(FilePool.qa_file, 'r') as f_qa:
                self.qa = json.load(f_qa)
            with open(FilePool.base_ques_list_file, 'r') as f_base_ques_list:
                self.base_ques_list = json.load(f_base_ques_list)

        # instantiate the bm25 model ahead of time to improve performance
        # if the questions were pre-classified, still instantiate this model as a fallback for questions with an empty category
        if (args.method == Method.bm25 or args.method == Method.bm25_syn):
            self.bm25_model_uncat = BM25(self.cut_answers)
        if args.method == Method.mix or args.method == Method.bm25_new:
            self.bm25_model_uncat = NewBM25(self.cut_answers)

        # instantiate the tfidf model ahead of time to improve performance
        if args.method == Method.mix or args.method == Method.qq_match:
            self.tfidf_dict = Dictionary(self.base_ques_list)  # fit dictionary
            n_features = len(self.tfidf_dict.token2id)
            bow = [self.tfidf_dict.doc2bow(line) for line in self.base_ques_list]  # convert corpus to BoW format
            # build the tf-idf model
            self.tfidf_model = TfidfModel(bow)  # fit model
            text_tfidf = self.tfidf_model[bow]  # apply model
            self.sim_index = SparseMatrixSimilarity(text_tfidf, n_features)
        elif args.method == Method.tfidf_sim:
            self.tfidf_dict = Dictionary(self.cut_answers)  # fit dictionary
            n_features = len(self.tfidf_dict.token2id)
            bow = [self.tfidf_dict.doc2bow(line) for line in self.cut_answers]  # convert corpus to BoW format
            # build the tf-idf model
            self.tfidf_model = TfidfModel(bow)  # fit model
            text_tfidf = self.tfidf_model[bow]  # apply model
            self.sim_index = SparseMatrixSimilarity(text_tfidf, n_features)

        # instantiate the Parser
        self.parser = StanfordDependencyParser(path_to_jar=FilePool.stanford_parser,
                                               path_to_models_jar=FilePool.stanford_chinese_model)
Example #26
def build_positive_pairs(corpus, clusters, attribute, num_pos):
    pos_pairs = []
    for current_cluster in tqdm(clusters):
        cluster_data = corpus[corpus['cluster_id'] == current_cluster]

        # build gensim dictionary, corpus and search index for selected cluster
        dct = Dictionary(cluster_data[attribute], prune_at=5000000)
        dct.filter_extremes(no_below=2, no_above=1.0, keep_n=None)
        gensim_corpus = [dct.doc2bow(text) for text in cluster_data[attribute]]
        index = SparseMatrixSimilarity(gensim_corpus,
                                       num_features=len(dct),
                                       num_best=80)

        # query up to 80 most similar offers, only offers with similarity > 0 will be returned
        query = index[gensim_corpus]

        for i, offer_sim_dup in enumerate(query):

            current_num_pos = num_pos
            current_id = cluster_data.iloc[i]['id']

            offer_sim = []

            # remove self
            for x in offer_sim_dup:
                if x[0] != i:
                    offer_sim.append(x)

            # check if any pairs > 0 similarity remain
            if len(offer_sim) == 0:
                pos_pairs.append((current_id, [[], []]))
                continue

            # adapt number of selectable pairs if too few available
            offer_len = len(offer_sim)
            if offer_len < current_num_pos:
                current_num_pos = offer_len

            if current_num_pos == 1:
                hard_pos = 1
                random_pos = 0
            elif current_num_pos % 2 == 1:
                hard_pos = int(current_num_pos / 2) + 1
                random_pos = int(current_num_pos / 2)
            else:
                hard_pos = int(current_num_pos / 2)
                random_pos = int(current_num_pos / 2)

            # get hard offers from bottom of list
            hard_offers = offer_sim[-hard_pos:]

            if random_pos == 0:
                pos_pairs.append(
                    (current_id,
                     [[cluster_data.iloc[x[0]]['id'] for x in hard_offers],
                      []]))
                continue

            # remaining offers
            rest = offer_sim[:-hard_pos]

            # randomly select from remaining
            random_select = random.sample(range(len(rest)), random_pos)
            random_offers = [rest[idx] for idx in random_select]

            hard_ids = [cluster_data.iloc[x[0]]['id'] for x in hard_offers]
            random_ids = [cluster_data.iloc[x[0]]['id'] for x in random_offers]

            pos_pairs.append((current_id, [hard_ids, random_ids]))
    return pos_pairs
Example #27
        else:
            """建立词典  获得特征数"""
            dictionary = corpora.Dictionary(diff_word_list)
            feature_cnt = len(dictionary.token2id.keys())
            """基于词典  分词列表转稀疏向量集"""
            corpus = [dictionary.doc2bow(codes) for codes in diff_word_list]
            # print("key")
            # print([x for x in word_list if x not in stopwords])
            kw_vector = dictionary.doc2bow([x for x in word_list if x not in stopwords])
            """创建tf-idf模型   传入语料库训练"""
            tfidf = TfidfModel(corpus)
            """训练好的tf-idf模型处理检索文本和搜索词"""
            tf_texts = tfidf[corpus]
            tf_kw = tfidf[kw_vector]
            """相似度计算"""
            sparse_matrix = SparseMatrixSimilarity(tf_texts, feature_cnt)
            similarities = sparse_matrix.get_similarities(tf_kw)
            # print("similarities")
            # print(similarities)
            # for e, s in enumerate(similarities, 1):
            #     print('similarity between kw and text%d: %.2f' % (e, s))
            conceptualSimilarity.append(max(similarities))

        """key word ratio"""
        keywordsInComments = [x for x in word_list if x in languageKeyWords]
        stopKeyRatio.append(len(keywordsInComments) / len(word_list))


    print(readable)
    print(max(readable), min(readable))
Example #28
    def fit(self,
            source,
            target,
            sourcetext='text',
            sourcedate='publication_date',
            targettext='text',
            targetdate='publication_date',
            keyword_source=None,
            keyword_target=None,
            keyword_source_must=False,
            keyword_target_must=False,
            condition_source=None,
            condition_target=None,
            days_before=None,
            days_after=None,
            merge_weekend=False,
            threshold=None,
            from_time=None,
            to_time=None,
            to_csv=False,
            destination='comparisons',
            to_pajek=False,
            filter_above=0.5,
            filter_below=5):
        '''
        source/target = doctype of source/target (can also be a list of multiple doctypes)
        sourcetext/targettext = field where text of target/source can be found (defaults to 'text')
        sourcedate/targetdate = field where date of source/target can be found (defaults to 'publication_date')
        keyword_source/_target = optional: specify keywords that need to be present in the textfield; list or string (lowercase)
        keyword_source/_target_must = optional: In case of a list, do all keywords need to appear in the text (logical AND) or does at least one of the words need to be in the text (logical OR). Defaults to False (logical OR)
        condition_source/target = optional: supply the field and its value as a dict as a condition for analysis, e.g. {'topic':1} (defaults to None)
        days_before = days target is before source (e.g. 2); days_after = days target is after source (e.g. 2) -> either both or none should be supplied. Additionally, merge_weekend = True will merge articles published on Saturday and Sunday.
        threshold = threshold to determine at which point similarity is sufficient; if supplied only the rows who pass it are included in the dataset
        from_time, to_time = optional: specifying a date range to filter source and target articles. Supply the date in the yyyy-MM-dd format.
        to_csv = if True save the resulting data in a csv file - otherwise a pandas dataframe is returned
        destination = optional: where should the resulting datasets be saved? (defaults to 'comparisons' folder)
        to_pajek = if True save - in addition to csv/pickle - the result (source, target and similarity score) as pajek file to be used in the Infomap method (defaults to False) - not available in combination with days_before/days_after parameters
        filter_above = Words occurring in more than this fraction of all documents will be filtered
        filter_below = Words occurring in less than this absolute number of documents will be filtered
        '''
        now = time.localtime()

        logger.info(
            "The results of the similarity analysis could be inflated when not using the recommended text processing steps (stopword removal, punctuation removal, stemming) beforehand"
        )

        #Construct source and target queries for elasticsearch
        if isinstance(source, list):  # multiple doctypes
            source_query = {
                'query': {
                    'bool': {
                        'filter': {
                            'bool': {
                                'must': [{
                                    'terms': {
                                        'doctype': source
                                    }
                                }]
                            }
                        }
                    }
                }
            }
        elif isinstance(source, str):  # one doctype
            source_query = {
                'query': {
                    'bool': {
                        'filter': {
                            'bool': {
                                'must': [{
                                    'term': {
                                        'doctype': source
                                    }
                                }]
                            }
                        }
                    }
                }
            }

        if isinstance(target, list):  # multiple doctypes
            target_query = {
                'query': {
                    'bool': {
                        'filter': {
                            'bool': {
                                'must': [{
                                    'terms': {
                                        'doctype': target
                                    }
                                }]
                            }
                        }
                    }
                }
            }
        elif isinstance(target, str):  # one doctype
            target_query = {
                'query': {
                    'bool': {
                        'filter': {
                            'bool': {
                                'must': [{
                                    'term': {
                                        'doctype': target
                                    }
                                }]
                            }
                        }
                    }
                }
            }

        #Change query if date range was specified
        source_range = {'range': {sourcedate: {}}}
        target_range = {'range': {targetdate: {}}}
        if from_time:
            source_range['range'][sourcedate].update({'gte': from_time})
            target_range['range'][targetdate].update({'gte': from_time})
        if to_time:
            source_range['range'][sourcedate].update({'lte': to_time})
            target_range['range'][targetdate].update({'lte': to_time})
        if from_time or to_time:
            source_query['query']['bool']['filter']['bool']['must'].append(
                source_range)
            target_query['query']['bool']['filter']['bool']['must'].append(
                target_range)

        #Change query if keywords were specified
        if isinstance(keyword_source, str) == True:
            source_query['query']['bool']['filter']['bool']['must'].append(
                {'term': {
                    sourcetext: keyword_source
                }})
        elif isinstance(keyword_source, list) == True:
            if keyword_source_must == True:
                for item in keyword_source:
                    source_query['query']['bool']['filter']['bool'][
                        'must'].append({'term': {
                            sourcetext: item
                        }})
            elif keyword_source_must == False:
                source_query['query']['bool']['should'] = []
                source_query['query']['bool']['minimum_should_match'] = 1
                for item in keyword_source:
                    source_query['query']['bool']['should'].append(
                        {'term': {
                            sourcetext: item
                        }})
        if isinstance(keyword_target, str) == True:
            target_query['query']['bool']['filter']['bool']['must'].append(
                {'term': {
                    targettext: keyword_target
                }})
        elif isinstance(keyword_target, list) == True:
            if keyword_target_must == True:
                for item in keyword_target:
                    target_query['query']['bool']['filter']['bool'][
                        'must'].append({'term': {
                            targettext: item
                        }})
            elif keyword_target_must == False:
                target_query['query']['bool']['should'] = []
                target_query['query']['bool']['minimum_should_match'] = 1
                for item in keyword_target:
                    target_query['query']['bool']['should'].append(
                        {'term': {
                            targettext: item
                        }})

        #Change query if condition_target or condition_source is specified
        if isinstance(condition_target, dict) == True:
            target_query['query']['bool']['filter']['bool']['must'].append(
                {'match': condition_target})
        if isinstance(condition_source, dict) == True:
            source_query['query']['bool']['filter']['bool']['must'].append(
                {'match': condition_source})

        #Retrieve source and target articles as generators
        source_query = scroll_query(source_query)
        target_query = scroll_query(target_query)

        #Make generators into lists and filter out those who do not have the specified keys (preventing KeyError)
        target_query = [
            a for a in target_query if targettext in a['_source'].keys()
            and targetdate in a['_source'].keys()
        ]
        source_query = [
            a for a in source_query if sourcetext in a['_source'].keys()
            and sourcedate in a['_source'].keys()
        ]

        #Target and source texts (split)
        target_text = []
        for doc in target_query:
            target_text.append(doc['_source'][targettext].split())
        source_text = []
        for doc in source_query:
            source_text.append(doc['_source'][sourcetext].split())

        logger.info('Preparing dictionary')
        dictionary = Dictionary(source_text + target_text)
        logger.info(
            'Removing all tokens that occur in less than {} documents or in more than {:.1f}% of all documents from dictionary'
            .format(filter_below, filter_above * 100))
        dictionary.filter_extremes(no_below=filter_below,
                                   no_above=filter_above)
        logger.info('Preparing tfidf model')
        tfidf = TfidfModel(dictionary=dictionary)

        #extract additional information from sources
        source_dates = [doc['_source'][sourcedate] for doc in source_query]
        source_ids = [doc['_id'] for doc in source_query]
        source_doctype = [doc['_source']['doctype'] for doc in source_query]
        source_dict = dict(zip(source_ids, source_dates))
        source_dict2 = dict(zip(source_ids, source_doctype))

        #extract information from targets
        target_ids = [doc['_id'] for doc in target_query]
        target_dates = [doc['_source'][targetdate] for doc in target_query]
        target_dict = dict(zip(target_ids, target_dates))
        target_doctype = [doc['_source']['doctype'] for doc in target_query]
        target_dict2 = dict(zip(target_ids, target_doctype))

        #If specified, comparisons compare docs within sliding date window
        if days_before != None or days_after != None:
            logger.info('Performing sliding window comparisons...')
            # merge queries including identifier key
            for i in source_query:
                i.update({'identifier': 'source'})
            for i in target_query:
                i.update({'identifier': 'target'})
            source_query.extend(target_query)

            # sourcedate and targetdate need to be the same key (bc everything is done for sourcedate)
            if targetdate is not sourcedate:
                logger.info(
                    'Make sure that sourcedate and targetdate are the same key.'
                )

            else:
                # convert dates into datetime objects
                for a in source_query:
                    if isinstance(a['_source'][sourcedate],
                                  datetime.date) == True:
                        pass  # is already datetime object
                    else:
                        a['_source'][sourcedate] = [
                            int(i)
                            for i in a['_source'][sourcedate][:10].split("-")
                        ]
                        a['_source'][sourcedate] = datetime.date(
                            a['_source'][sourcedate][0],
                            a['_source'][sourcedate][1],
                            a['_source'][sourcedate][2])

                # sort query by date
                source_query.sort(key=lambda item: item['_source'][sourcedate])

                # create list of all possible dates
                d1 = source_query[0]['_source'][sourcedate]
                d2 = source_query[-1]['_source'][sourcedate]
                delta = d2 - d1
                date_list = []
                for i in range(delta.days + 1):
                    date_list.append(d1 + datetime.timedelta(i))

                # create list of docs grouped by date (dates without docs are empty lists)
                grouped_query = []
                for d in date_list:
                    dt = []
                    for a in source_query:
                        if a['_source'][sourcedate] == d:
                            dt.append(a)
                    grouped_query.append(dt)
                # Optional: merges saturday and sunday into one weekend group
                # Checks whether group is Sunday, then merge together with previous (saturday) group.
                if merge_weekend == True:
                    grouped_query_new = []
                    for group in grouped_query:
                        # if group is sunday, extend previous (saturday) list, except when it is the first day in the data.
                        if group[0]['_source'][sourcedate].weekday() == 6:
                            if not grouped_query_new:
                                grouped_query_new.append(group)
                            else:
                                grouped_query_new[-1].extend(group)
                        # if empty, append empty list
                        elif not group:
                            grouped_query_new.append([])
                        # for all other weekdays, append new list
                        else:
                            grouped_query_new.append(group)
                    grouped_query = grouped_query_new

                # Sliding window starts here... How it works:
                # A sliding window cuts the documents into groups that should be compared to each other
                # based on their publication dates. A list of source documents published on the reference
                # date is created. For each of the target dates in the window, the source list is compared
                # to the targets, the information is put in a dataframe, and the dataframe is added to a
                # list. This process is repeated for each window. We end up with a list of dataframes,
                # which are eventually merged together into one dataframe.

                len_window = days_before + days_after + 1
                source_pos = days_before  # source position is equivalent to days_before (e.g. 2 days before, means 3rd day is source with the index position [2])
                n_window = 0

                for e in tqdm(self.window(grouped_query, n=len_window)):
                    n_window += 1
                    df_window = []

                    source_texts = []
                    source_ids = []
                    if not 'source' in [
                            l2['identifier'] for l2 in e[source_pos]
                    ]:
                        pass

                    else:
                        for doc in e[source_pos]:
                            try:
                                if doc['identifier'] == 'source':
                                    # create sourcetext list to compare against
                                    source_texts.append(
                                        doc['_source'][sourcetext].split())
                                    # extract additional information
                                    source_ids.append(doc['_id'])
                            except:
                                logger.error(
                                    'This does not seem to be a valid document'
                                )
                                print(doc)

                        # create index of source texts
                        query = tfidf[[
                            dictionary.doc2bow(d) for d in source_texts
                        ]]

                        # iterate through targets
                        for d in e:
                            target_texts = []
                            target_ids = []

                            for doc in d:
                                try:
                                    if doc['identifier'] == 'target':
                                        target_texts.append(
                                            doc['_source'][targettext].split())
                                        # extract additional information
                                        target_ids.append(doc['_id'])
                                except:
                                    logger.error(
                                        'This does not seem to be a valid document'
                                    )
                                    print(doc)
                            # do comparison
                            index = SparseMatrixSimilarity(
                                tfidf[[
                                    dictionary.doc2bow(d) for d in target_texts
                                ]],
                                num_features=len(dictionary))
                            sims = index[query]
                            #make dataframe
                            df_temp = pd.DataFrame(
                                sims, columns=target_ids,
                                index=source_ids).stack().reset_index()
                            df_window.append(df_temp)

                        df = pd.concat(df_window, ignore_index=True)
                        df.columns = ['source', 'target', 'similarity']
                        df['source_date'] = df['source'].map(source_dict)
                        df['target_date'] = df['target'].map(target_dict)
                        df['source_doctype'] = df['source'].map(source_dict2)
                        df['target_doctype'] = df['target'].map(target_dict2)

                        #Optional: if threshold is specified
                        if threshold:
                            df = df.loc[df['similarity'] >= threshold]

                        #Make exports folder if it does not exist yet
                        if not 'comparisons' in os.listdir('.'):
                            os.mkdir('comparisons')

                        #Optional: save as csv file
                        if to_csv == True:
                            df.to_csv(
                                os.path.join(
                                    destination,
                                    r"INCA_cosine_{source}_{target}_{now.tm_year}_{now.tm_mon}_{now.tm_mday}_{now.tm_hour}_{now.tm_min}_{now.tm_sec}_{n_window}.csv"
                                    .format(now=now,
                                            target=target,
                                            source=source,
                                            n_window=n_window)))
                            #Otherwise: save as pickle file
                        else:
                            df.to_pickle(
                                os.path.join(
                                    destination,
                                    r"INCA_cosine_{source}_{target}_{now.tm_year}_{now.tm_mon}_{now.tm_mday}_{now.tm_hour}_{now.tm_min}_{now.tm_sec}_{n_window}.pkl"
                                    .format(now=now,
                                            target=target,
                                            source=source,
                                            n_window=n_window)))

                #Optional: save as pajek file not for days_before/days_after
                if to_pajek == True:
                    logger.info(
                        "Does not save as Pajek file with days_before/days_after because of the size of the files."
                    )

        #Same procedure as above, but without specifying a time frame (thus: comparing all sources to all targets)
        else:

            #Create index out of target texts
            logger.info("Preparing the index out of target texts...")
            index = SparseMatrixSimilarity(
                tfidf[[dictionary.doc2bow(d) for d in target_text]],
                num_features=len(dictionary))

            #Retrieve source IDs and make generator to compute similarities between each source and the index
            logger.info("Preparing the query out of source texts...")
            query = tfidf[[dictionary.doc2bow(d) for d in source_text]]
            query_generator = (item for item in query)

            #Retrieve similarities
            logger.info("Starting comparisons...")

            i = 0
            s_ids = 0
            for doc in query_generator:
                i += 1  # count each round of comparisons
                # if doc is empty (which may happen due to pruning)
                # then we skip this comparison
                if len(doc) == 0:
                    s_ids += 1
                    logger.info('Skipped one empty document')
                    continue

                #sims_list = [index[doc] for doc in query_generator]
                sims = index[doc]

                #make dataframe
                #df = pd.DataFrame(sims_list, columns=target_ids, index = source_ids).stack(). reset_index()
                df = pd.DataFrame([sims]).transpose()
                logger.debug('Created dataframe of shape {}'.format(df.shape))
                logger.debug('Length of target_id list: {}'.format(
                    len(target_ids)))
                df['target'] = target_ids
                df['source'] = source_ids[s_ids]
                df.columns = ['similarity', 'target', 'source']
                df["source_date"] = df["source"].map(source_dict)
                df["target_date"] = df["target"].map(target_dict)
                df['source_doctype'] = df['source'].map(source_dict2)
                df['target_doctype'] = df['target'].map(target_dict2)
                df = df.set_index('source')

                #Optional: if threshold is specified
                if threshold:
                    df = df.loc[df['similarity'] >= threshold]

                #Make exports folder if it does not exist yet
                if not 'comparisons' in os.listdir('.'):
                    os.mkdir('comparisons')

                #Optional: save as csv file
                if to_csv == True:
                    df.to_csv(
                        os.path.join(
                            destination,
                            r"INCA_cosine_{source}_{target}_{now.tm_year}_{now.tm_mon}_{now.tm_mday}_{now.tm_hour}_{now.tm_min}_{now.tm_sec}_{i}.csv"
                            .format(now=now, target=target, source=source,
                                    i=i)))
                #Otherwise: save as pickle file
                else:
                    df.to_pickle(
                        os.path.join(
                            destination,
                            r"INCA_cosine_{source}_{target}_{now.tm_year}_{now.tm_mon}_{now.tm_mday}_{now.tm_hour}_{now.tm_min}_{now.tm_sec}_{i}.pkl"
                            .format(now=now, target=target, source=source,
                                    i=i)))

                #Optional: additionally save as pajek file
                if to_pajek == True:
                    G = nx.Graph()
                    # change int to str (necessary for pajek format)
                    df['similarity'] = df['similarity'].apply(str)
                    # change column name to 'weight' to facilitate later analysis
                    df.rename({'similarity': 'weight'}, axis=1, inplace=True)
                    # notes and weights from dataframe
                    G = nx.from_pandas_edgelist(df,
                                                source='source',
                                                target='target',
                                                edge_attr='weight')
                    # write to pajek
                    nx.write_pajek(
                        G,
                        os.path.join(
                            destination,
                            r"INCA_cosine_{source}_{target}_{now.tm_year}_{now.tm_mon}_{now.tm_mday}_{now.tm_hour}_{now.tm_min}_{now.tm_sec}_{i}.net"
                            .format(now=now, target=target, source=source,
                                    i=i)))

                s_ids += 1  # move one doc down in source_ids

                logger.info("Done with source " + str(i) + " out of " +
                            str(len(source_text)))
Example #29
                                 resultDir=gensim_build.RESULT_DIR,
                                 acceptLangs=[language])

    logging.info("loading word id mapping from %s" %
                 config.resultFile('wordids.txt'))
    id2word = dmlcorpus.DmlCorpus.loadDictionary(
        config.resultFile('wordids.txt'))
    logging.info("loaded %i word ids" % len(id2word))

    corpus = dmlcorpus.DmlCorpus.load(config.resultFile('.pkl'))
    input = MmCorpus(config.resultFile('_%s.mm' % method))
    assert len(input) == len(
        corpus
    ), "corpus size mismatch (%i vs %i): run ./gensim_genmodel.py again" % (
        len(input), len(corpus))

    # initialize structure for similarity queries
    if method == 'lsi' or method == 'rp':  # for these methods, use dense vectors
        index = MatrixSimilarity(input,
                                 numBest=MAX_SIMILAR + 1,
                                 numFeatures=input.numTerms)
    else:
        index = SparseMatrixSimilarity(input, numBest=MAX_SIMILAR + 1)

    index.normalize = False  # do not normalize query vectors during similarity queries (the index is already built normalized, so it would be a no-op)
    generateSimilar(
        corpus, index, method
    )  # for each document, print MAX_SIMILAR nearest documents to a xml file, in dml-cz specific format

    logging.info("finished running %s" % program)
Example #30
async def create_file(keyword: str,
                      threshold: float,
                      file: UploadFile = File(...)):
    contents = file.file.read()
    now = time.time()
    with open("./cache_file/" + str(now) + file.filename, "w+") as f:
        f.write(contents.decode("utf-8"))
    with open("./cache_file/" + str(now) + file.filename, "r") as f_read:
        data = f_read.readlines()
    # 1. Tokenize each text in the corpus into a list of words
    texts = [lcut(text.strip("\n")) for text in tqdm(data)]
    # 2. Build a dictionary from the tokenized texts and get the number of dictionary features
    dictionary = Dictionary(texts)
    num_features = len(dictionary.token2id)
    # 3.1. Using the dictionary, convert the tokenized texts into sparse bag-of-words vectors, i.e. the corpus
    corpus = [dictionary.doc2bow(text) for text in tqdm(texts)]
    # 3.2. Likewise, use the dictionary to convert the search keyword into a sparse vector
    kw_vector = dictionary.doc2bow(lcut(keyword))
    # 4. Create a TF-IDF model, trained on the corpus
    tfidf = TfidfModel(corpus)
    # 5. Apply the trained TF-IDF model to the searched texts and the search keyword
    tf_texts = tfidf[corpus]  # the corpus doubles as the searched texts here
    tf_kw = tfidf[kw_vector]
    # 6. Compute similarities
    sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
    similarities = sparse_matrix.get_similarities(tf_kw)
    # print(similarities)
    new_now = datetime.datetime.now()
    # simple file/directory bookkeeping
    os.makedirs("./static/" + keyword + str(new_now))
    db.insert({"name": keyword + str(new_now), "type": "dir"})
    f = open("./static/" + keyword + str(new_now) + "/result.txt", "w")
    db.insert({
        "name": keyword + str(new_now) + "/result.txt",
        "type": "file",
        "dir": keyword + str(new_now)
    })
    f1 = open("./static/" + keyword + str(new_now) + "/result_er.txt", "w")
    db.insert({
        "name": keyword + str(new_now) + "/result_er.txt",
        "type": "file",
        "dir": keyword + str(new_now)
    })
    #end
    Semantic_list = []

    for e, s in enumerate(similarities, 1):
        su = (e, s)
        Semantic_list.append(su)
        try:
            if s >= threshold:
                f.write(data[e - 1].strip("\n") + str(s) + "\n")
            else:
                f1.write(data[e - 1].strip("\n") + str(s) + "\n")
        except Exception:  # do not bind the exception to a name here, it would shadow the loop variable e
            pass
    Semantic_list.sort(key=takeSecond, reverse=True)
    rs_list = []
    for item in Semantic_list[0:101]:
        rs_dic = {"msg": data[item[0] - 1], "Similaritydegree": str(item[1])}
        rs_list.append(rs_dic)
    # Semantic_list

    os.remove("./cache_file/" + str(now) + file.filename)
    return {"semantic": rs_list}