from textrank4zh import TextRank4Sentence


def summary_main(weibo_data):
    '''
    Automatic summary generation: main entry point.
    Input:
        weibo_data: list of weibo texts, [weibo1, weibo2, ...]
    '''
    # word_net / text_net / re_cut are project-local helpers defined elsewhere.
    word_result, word_weight = word_net(weibo_data, 5)
    text_list = text_net(word_result, word_weight, weibo_data)
    text_str = ''
    for text in text_list:
        re_t = re_cut(text)
        if not len(re_t):
            continue
        # Join the representative texts into one document, making sure each
        # piece ends with a full-width period so TextRank sees sentence breaks.
        if re_t[-1] != '。':
            text_str = text_str + re_t + '。'
        else:
            text_str = text_str + re_t
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text_str, lower=True, source='all_filters')
    result = []
    for item in tr4s.get_key_sentences(num=10):
        result.append(item.sentence)
    return result
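
# A minimal usage sketch for summary_main. The sample weibos below are
# invented, and the helpers it calls (word_net, text_net, re_cut) plus the
# textrank4zh package must be available in the surrounding project to run it.
def _demo_summary_main():
    weibo_data = [
        '今天的发布会信息量很大。',
        '对这个政策大家怎么看?欢迎讨论。',
        '现场图来了,人真的很多。',
    ]
    for sentence in summary_main(weibo_data):
        print(sentence)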
def opinion_main(weibo_data, k_cluster):
    '''
    Opinion mining: main entry point.
    Input:
        weibo_data: list of weibo texts, [weibo1, weibo2, ...]
        k_cluster: number of sub-topics
    Output:
        opinion_name: dict of sub-topic names, {topic1: name1, topic2: name2, ...}
        word_result: keyword pairs per sub-topic, {topic1: [w1, w2, ...], topic2: [w1, w2, ...], ...}
        text_list: texts per sub-topic, {topic1: [text1, text2, ...], topic2: [text1, text2, ...], ...}
    '''
    weibo_new = []
    for text in weibo_data:
        # Skip weibos that mention five or more users (ad/spam heuristic).
        if str(text).count('@') >= 5:
            continue
        value = cut_filter(text)
        # '转发微博' is the placeholder text of a bare repost; drop those too.
        if len(value) > 0 and text != '转发微博':
            weibo_new.append(value)
    word_result, word_weight = word_net(weibo_new, k_cluster)                # extract keyword pairs
    text_list, opinion_name = text_net(word_result, word_weight, weibo_new)  # extract representative texts
    return opinion_name, word_result, text_list
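
# cut_filter is a project-local helper that is not shown in this file. The
# sketch below is only a plausible stand-in inferred from how it is called
# above (takes a raw weibo string, returns cleaned text, empty when nothing
# substantive remains); the actual implementation may differ.
import re

def cut_filter_sketch(text):
    text = re.sub(r'https?://\S+', '', text)  # drop URLs
    text = re.sub(r'@\S+', '', text)          # drop @mentions and repost handles
    return text.strip()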
def opinion_main(weibo_data, k_cluster):
    '''
    Opinion mining: main entry point (variant with an input-size cap).
    Input:
        weibo_data: list of weibo texts, [weibo1, weibo2, ...]
        k_cluster: number of sub-topics
    Output:
        opinion_name: dict of sub-topic names, {topic1: name1, topic2: name2, ...}
        word_result: keyword pairs per sub-topic, {topic1: [w1, w2, ...], topic2: [w1, w2, ...], ...}
        text_list: texts per sub-topic, {topic1: [text1, text2, ...], topic2: [text1, text2, ...], ...}
        word_main: main keyword per sub-topic
    '''
    print('\t\tGetting keywords...')
    limit_num = 30000
    weibo_data = weibo_num_limit(weibo_data, limit_num)
    # NOTE: word_net occasionally returns an empty result when Cluto fails;
    # a retry loop used to wrap this call.
    word_result, word_weight, word_main = word_net(weibo_data, k_cluster)  # extract keyword pairs
    print('\t\tGetting present text...')
    # Extract representative texts; guarantees every cluster holds the same
    # number of weibos.
    text_list, opinion_name = text_net(word_result, word_weight, weibo_data)
    return opinion_name, word_result, text_list, word_main
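
# Usage sketch for this opinion_main variant. The sample weibos are invented;
# weibo_num_limit, word_net and text_net come from the surrounding project.
def _demo_opinion_main():
    weibo_data = [
        '今天的发布会信息量很大。',
        '对这个政策大家怎么看?',
        '现场图来了,人真的很多。',
    ]
    opinion_name, word_result, text_list, word_main = opinion_main(weibo_data, 2)
    for topic in opinion_name:
        print(topic, opinion_name[topic], word_result[topic])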
import csv


def main(flag, k_cluster):
    weibo = []
    weibo_dict = dict()
    # Python 2 file() replaced with open().
    with open('./test/weibo%s.csv' % flag, 'r') as f:
        for mid, text in csv.reader(f):
            # Skip weibos that mention five or more users (ad/spam heuristic).
            if str(text).count('@') >= 5:
                continue
            value = cut_filter(text)
            # '转发微博' is the placeholder text of a bare repost; drop those too.
            if len(value) > 0 and text != '转发微博':
                weibo.append(str(mid))
                weibo_dict[str(mid)] = str(text)
    test(weibo, weibo_dict, flag)                                    # generate test data
    label = choose_ad(flag)                                          # ad filtering
    ind, word = word_net(weibo, weibo_dict, label, flag, k_cluster)  # extract keyword pairs
    write(ind, word, flag)                                           # write keyword pairs
    text_net(weibo, weibo_dict, label, ind, word, flag)              # extract representative texts
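
# A minimal driver sketch for main(). The flag value is made up; it selects
# the input file './test/weibo<flag>.csv', which must exist for this to run.
def _demo_main():
    main('1', 5)  # hypothetical flag naming './test/weibo1.csv', 5 sub-topics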
def search_weibo_from_word(uidlist, keywords):
    '''
    Strategy 4: first retrieve texts with BM25, then filter them by their
    keyword overlap with the query.
    Input:
        uidlist: list of uids
        keywords: list of keywords (a hot-news headline after word segmentation)
    Output:
        result: summaries of the filtered weibo texts
    '''
    # Fetch the texts and their segmentation results for the given uids.
    text_list, word_set = get_text_word_by_id(uidlist)
    text_set, word_dict = get_text_by_BM(text_list, word_set, keywords)
    # Keep the top half of the BM25 hits, at least one.
    n = max(1, int(0.5 * len(text_set)))
    result_list = TopkHeap(n)
    # Require a text to share at least half of the query keywords.
    w_n = max(1, int(0.5 * len(keywords)))
    for i in range(len(word_dict)):
        words = word_dict[i]
        len_n = len(set(words) & set(keywords))
        if len_n >= w_n:
            result_list.Push((len_n, text_set[i]))
    result = result_list.TopK()
    text_list = []
    for i in range(len(result)):
        if result[i][1] not in text_list:
            text_list.append(result[i][1])
    if len(text_list) >= 10:
        # Enough texts: cluster them, pick representative texts, and keep one
        # summary per text, dropping any that overlaps an earlier summary too much.
        word_result, word_weight = word_net(text_list, OPINION_CLUSTER)
        text_list = text_net(word_result, word_weight, text_list)
        result = []
        for text in text_list:
            s = summary_text(text)
            max_r, n = get_s(result, s)
            if max_r >= 0.5:
                continue
            result.append(s)
    else:
        result = [summary_text(text_list)]
    return result
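
# TopkHeap is a project-local helper not shown in this file. Below is a
# minimal heapq-based sketch consistent with the Push/TopK calls above
# (keeps the k largest (score, text) pairs); the real class may differ.
import heapq

class TopkHeapSketch(object):
    def __init__(self, k):
        self.k = k
        self.data = []

    def Push(self, elem):
        if len(self.data) < self.k:
            heapq.heappush(self.data, elem)
        elif elem > self.data[0]:
            # Replace the current smallest when a larger element arrives.
            heapq.heapreplace(self.data, elem)

    def TopK(self):
        # Return the retained elements, largest first.
        return sorted(self.data, reverse=True)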
def opinion_main(weibo_data, k_cluster):
    '''
    Opinion mining: main entry point (variant taking full weibo records).
    Input:
        weibo_data: list of weibos,
            [[mid, text, uid, timestamp, uname, forwarding_count, comment_count], ...]
        k_cluster: number of sub-topics
    Output:
        opinion_name: dict of sub-topic names, {topic1: name1, topic2: name2, ...}
        word_result: keyword pairs per sub-topic, {topic1: [w1, w2, ...], topic2: [w1, w2, ...], ...}
        text_list: texts per sub-topic, {topic1: [text1, text2, ...], topic2: [text1, text2, ...], ...}
    '''
    weibo_new = []
    for mid, text, uid, timestamp, uname, forwarding_count, comment_count in weibo_data:
        # Skip weibos that mention five or more users (ad/spam heuristic).
        if str(text).count('@') >= 5:
            continue
        value = cut_filter(text)
        # '转发微博' is the placeholder text of a bare repost; drop those too.
        if len(value) > 0 and text != '转发微博':
            weibo_new.append((value, mid, uid, timestamp, uname,
                              forwarding_count, comment_count))
    word_result, word_weight = word_net(weibo_new, k_cluster)                # extract keyword pairs
    text_list, opinion_name = text_net(word_result, word_weight, weibo_new)  # extract representative texts
    return opinion_name, word_result, text_list
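
# Usage sketch for the record-based opinion_main variant. Every field below
# is invented purely to illustrate the expected
# [mid, text, uid, timestamp, uname, forwarding_count, comment_count] layout.
def _demo_opinion_main_records():
    weibo_data = [
        ['3011234567890', '今天的发布会信息量很大。', '1001', 1389702000,
         'user_a', 12, 3],
        ['3011234567891', '对这个政策大家怎么看?', '1002', 1389702300,
         'user_b', 5, 8],
    ]
    opinion_name, word_result, text_list = opinion_main(weibo_data, 2)
    print(opinion_name)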