Exemplo n.º 1
0
def to_article(rank, lda):
    """Score every article by its similarity to a ranked list of topics.

    Parameters
    ----------
    rank : list
        Topic ids sorted by relevance, best first.  Assumed to hold at most
        10 topics, matching the score vector below (an 11th would raise
        IndexError, as in the original).
    lda : str
        Model file name; its stem (name minus 4-char extension) selects the
        per-article topic dump on disk.

    Returns
    -------
    list of float
        One similarity score per article, in article order.
    """
    # Score for each rank position: 1.0, 0.9, ..., 0.1.
    value = np.arange(1, 0, -0.1)
    # "rb" is the correct mode for pickle data (required on Python 3);
    # `with` guarantees the handle is closed (the original leaked it).
    with open(r"D:\citeulike\dic\all_" + lda[:-4] + ".dump", "rb") as fh:
        tp = pickle.load(fh)
    # Precompute each topic's first rank position once, instead of calling
    # rank.index(n) inside the inner loop (was O(len(rank)) per lookup).
    # setdefault keeps the FIRST occurrence, matching list.index semantics.
    position = {}
    for j, n in enumerate(rank):
        position.setdefault(n, j)
    # For every article, accumulate (rank score * topic probability) over
    # the topics it shares with `rank`.
    rank_list = []
    for m in range(len(tp)):
        rate2 = 0
        t = 0
        nov_rate = rank_topic.seen("1", m)  # novelty weight for article m
        for n in rank:
            if n in tp[m]:
                rate2 += value[position[n]] * tp[m][n]
                t += 1
        # Divide by the number of matched topics so articles are not
        # favoured merely for touching many topics.
        if t != 0:
            sum_rate = rate2 * nov_rate / t
        else:
            sum_rate = rate2 * nov_rate
        rank_list.append(sum_rate)  # similarity values in article order
    return rank_list
Exemplo n.º 2
0
def to_article(rank, lda):
    """Rank articles by similarity to the sorted topic list `rank`.

    Parameters
    ----------
    rank : list
        Topic ids in descending relevance order; at most 10 entries are
        supported by the position-score vector (matching the original).
    lda : str
        Model file name whose stem picks the article-topic dump to load.

    Returns
    -------
    list of float
        Per-article similarity scores, indexed by article number.
    """
    value = np.arange(1, 0, -0.1)  # position scores 1.0 down to 0.1
    # Open in binary mode ("rb"), the correct mode for pickled data, and
    # use a context manager so the file is always closed (original leaked
    # the handle and used text mode "r").
    with open(r'D:\citeulike\dic\all_' + lda[:-4] + '.dump', 'rb') as dump_file:
        tp = pickle.load(dump_file)
    # Map topic id -> first position in `rank` up front; the original
    # called rank.index(n) for every matched topic of every article,
    # an avoidable O(len(rank)) scan per lookup.
    topic_pos = {}
    for pos, topic_id in enumerate(rank):
        if topic_id not in topic_pos:
            topic_pos[topic_id] = pos
    rank_list = []
    for m in range(len(tp)):
        total = 0
        matched = 0
        nov_rate = rank_topic.seen('1', m)  # novelty factor for article m
        for topic_id in rank:
            if topic_id in tp[m]:
                # position score * topic probability: rewards articles
                # strong in the highest-ranked topics
                total += value[topic_pos[topic_id]] * tp[m][topic_id]
                matched += 1
        # Normalise by matched-topic count so breadth alone does not win.
        if matched != 0:
            score = total * nov_rate / matched
        else:
            score = total * nov_rate
        rank_list.append(score)  # similarity values in article order
    return rank_list
Exemplo n.º 3
0
def get_tp(nom):
    """Expand document `nom`'s strong topics with their nearest neighbours,
    then score every article against that expanded topic set.

    Parameters
    ----------
    nom : key into the global `content` mapping (document id).

    Returns
    -------
    list of float
        One similarity score per article, in article order.
    """
    # Infer the document's topic distribution with the batch LDA model.
    text_list = stopword.get_txt(content[nom])
    doc_bow = diction.doc2bow(text_list)
    doc_lda = batch_lda[doc_bow]
    tp_batch = [list(pair) for pair in doc_lda]

    # Normalise the probabilities of the "strong" topics (> 0.1) so they
    # sum to 1; strong_count records how many there are.
    strong_count = 0
    strong_sum = 0
    for entry in tp_batch:
        if entry[1] > 0.1:
            strong_count += 1
            strong_sum += entry[1]
    for entry in tp_batch:
        if entry[1] > 0.1:
            entry[1] = entry[1] / strong_sum

    # For each strong topic, pick its words_num most similar topics from
    # the similarity matrix; fewer strong topics -> more neighbours each.
    dict_sim1 = {}
    for tp_id, rate in tp_batch:
        if rate > 0.1:
            words_num = int(round((10 - strong_count) * rate))
            row = sim[tp_id]
            # Sort (index, value) pairs so equal similarity values keep
            # their own topic ids.  The original sorted the values alone
            # and mapped each back with list.index(), which is O(n) per
            # value and collapses duplicate values onto the first index.
            ordered = sorted(enumerate(row), key=lambda iv: iv[1], reverse=True)
            # Skip entry 0: the topic's own (maximal) self-similarity.
            dict_sim2 = {idx: val for idx, val in ordered[1:words_num + 1]}
            dict_sim1[tp_id] = dict_sim2

    # Deduplicated union of all neighbour topic ids.
    tp_list2 = list({j for neighbours in dict_sim1.values() for j in neighbours})

    # Load the per-article topic dump.  "rb" is the correct mode for
    # pickle data and `with` closes the handle (original leaked it).
    with open(r'D:\citeulike\dic\all_online_lda_100.dump', 'rb') as fh:
        tp = pickle.load(fh)

    # Score every article: sum the probabilities of topics it shares with
    # the expanded set, weighted by novelty and normalised by match count.
    rank_list = []
    for m in range(len(tp)):
        total = 0
        matched = 0
        nov_rate = rank_topic.seen('1', m)  # novelty weight for article m
        for n in tp_list2:
            if n in tp[m]:
                total += tp[m][n]
                matched += 1
        # Divide by matched-topic count so articles are not rewarded
        # merely for spanning many topics.
        if matched != 0:
            score = total * nov_rate / matched
        else:
            score = total * nov_rate
        rank_list.append(score)  # similarity values in article order
    return rank_list