Пример #1
0
def weibo_subob_rub_neu_classifier(items, batch=RUBBISH_BATCH_COUNT):
    """Tag every weibo item with a combined rubbish/objective/polarity label.

    Args:
        items: Collection of weibo items handed to ``rubbish_classifier``.
            The items that come back are accessed by string key
            (``'rub_label'``, and ``'content168'`` on the neutral path).
        batch: Forwarded unchanged as the ``batch`` keyword of
            ``rubbish_classifier``.

    Returns:
        A list of the processed items, in iteration order, each carrying a
        ``'subob_rub_neu_label'`` entry:

        * ``1``  -- rubbish text (``rub_label == 1``)
        * ``0``  -- objective / news text (``subob_label == 1``)
        * ``-1`` -- text with polarity (``triple_classifier`` returned non-zero)
        * otherwise the value returned by ``cut_mid_weibo`` for text that
          ``triple_classifier`` rated ``0`` (neutral); this replaces the
          former fixed neutral label ``2``.
    """
    classified = []

    for entry in rubbish_classifier(items, batch=batch):
        if entry['rub_label'] == 1:
            verdict = 1  # rubbish
        else:
            # subob_classifier returns the item that is used from here on.
            entry = subob_classifier(entry)
            if entry['subob_label'] == 1:
                verdict = 0  # objective
            elif triple_classifier(entry) == 0:
                # Neutral text is refined further instead of getting label 2.
                verdict = cut_mid_weibo(entry['content168'])
            else:
                verdict = -1  # has polarity

        entry['subob_rub_neu_label'] = verdict
        classified.append(entry)

    return classified
Пример #2
0
def classify(weibo, flag):
    """Classify weibo records and return a ``{str(mid): label}`` mapping.

    Args:
        weibo: Sequence of records; element ``[0]`` of each record is the mid
            and element ``[1]`` is the text, e.g.
            ``[[mid, text, ...], [mid, text, ...], ...]``.
        flag: Marker value passed straight through to ``start_ad``.

    Returns:
        The mapping produced by ``start_ad`` (keyed by ``str(mid)``), updated
        in place for records that ``start_ad`` labelled ``0`` and
        ``cut_weibo`` also labelled ``0``:

        * ``-1`` when ``triple_classifier`` reports a non-zero sentiment
          (text with polarity);
        * otherwise the value returned by ``cut_mid_weibo(text)``
          (presumably the neutral-class label, formerly the fixed value
          ``2`` -- confirm against ``cut_mid_weibo``).

        All other entries keep the label ``start_ad`` assigned (per the
        original notes: ``1`` rubbish, ``0`` news).

    Side effects:
        Prints the elapsed time of each of the three stages to stdout.
    """
    # NOTE: single-argument ``print(...)`` produces identical output on
    # Python 2 and also parses on Python 3, unlike the print statement.
    started = time.time()
    label_data = start_ad(weibo, flag)  # rubbish classification
    print(time.time() - started)

    # Only records the rubbish stage labelled 0 go on to the later stages.
    news_weibo = [row for row in weibo if label_data[str(row[0])] == 0]

    started = time.time()
    label = cut_weibo(news_weibo)  # rule-based classification
    print('cutting weibo by rules takes %s' % (time.time() - started))

    started = time.time()
    for idx, rule_label in enumerate(label):
        if rule_label != 0:
            continue
        # label is positionally aligned with news_weibo.
        mid = news_weibo[idx][0]
        text = news_weibo[idx][1]
        key = str(mid)
        if triple_classifier(text) == 0:  # neutral-sentiment classifier
            label_data[key] = cut_mid_weibo(text)
        else:
            label_data[key] = -1
    print('classifying weibo takes %s' % (time.time() - started))

    return label_data
Пример #3
0
def classify(weibo, flag):
    """Run the rubbish, rule-based and sentiment stages over weibo records.

    Args:
        weibo: Sequence of records indexed positionally: ``record[0]`` is the
            mid and ``record[1]`` is the text, e.g.
            ``[[mid, text, ...], [mid, text, ...], ...]``.
        flag: Marker value handed unchanged to ``start_ad``.

    Returns:
        The dict returned by ``start_ad``, keyed by ``str(mid)``, after
        in-place refinement. For a record whose ``start_ad`` label is ``0``
        and whose ``cut_weibo`` label is also ``0``:

        * the entry becomes ``-1`` if ``triple_classifier(text)`` is non-zero
          (text with polarity);
        * otherwise it becomes ``cut_mid_weibo(text)`` (presumably the
          neutral-class label that used to be the constant ``2`` -- confirm
          against ``cut_mid_weibo``).

        Every other entry is left exactly as ``start_ad`` produced it (per
        the original notes: ``1`` rubbish, ``0`` news).

    Side effects:
        Writes three timing lines to stdout, one per stage.
    """
    # NOTE: print is called with one parenthesised argument so the output is
    # unchanged on Python 2 while the module also parses on Python 3.
    stage_start = time.time()
    label_data = start_ad(weibo, flag)  # rubbish classification
    stage_end = time.time()
    print(stage_end - stage_start)

    # Keep only the records the rubbish stage labelled 0.
    news_weibo = []
    for record in weibo:
        if label_data[str(record[0])] == 0:
            news_weibo.append(record)

    stage_start = time.time()
    label = cut_weibo(news_weibo)  # rule-based classification
    stage_end = time.time()
    print('cutting weibo by rules takes %s' % (stage_end - stage_start))

    stage_start = time.time()
    for position, rule_label in enumerate(label):
        if rule_label == 0:
            # label[position] corresponds to news_weibo[position].
            record = news_weibo[position]
            mid = record[0]
            text = record[1]
            sentiment = triple_classifier(text)  # neutral-sentiment classifier
            if sentiment == 0:
                label_data[str(mid)] = cut_mid_weibo(text)
            else:
                label_data[str(mid)] = -1
    stage_end = time.time()
    print('classifying weibo takes %s' % (stage_end - stage_start))

    return label_data