Exemplo n.º 1
0
def correlation_negative():
    """
    Classify posted records by relevance and, for relevant ones, by sentiment.

    Reads the list under ``request.json['record']``; each record is a dict
    providing ``id``, ``title`` and ``content``. Every content is run through
    the relevance model (``predict.predict_corpus``). For records predicted as
    relevant (``sec == 1``) the organisation list is taken from
    ``sa.evaluate_article`` and the article-level tendency from
    ``chapter_pipeline``; all other records keep ``tendency = 0`` and an empty
    ``org_list``.

    :return: ``jsonify({'docs': [...]})`` with one dict per record holding
             ``id``, ``sec``, ``tendency`` and ``org_list``.
    """
    # get parameters
    start_time = datetime.now()
    records = request.json['record']
    logger.info('starting correlation_negative, {list_size: %d}', len(records))

    # Relevance prediction for the whole batch at once.
    words_list = pre.handle_contents([record['content'] for record in records])
    result_list = predict.predict_corpus(words_list)

    ret_list = []
    for record, result in zip(records, result_list):
        record_id = int(record['id'])
        # The title is prepended so that it takes part in the sentiment analysis.
        content = record['title'] + '。' + record['content']
        sec = int(result)
        tendency = 0
        org_list = []

        if sec == 1:
            # Sentence-level analysis: only the organisation list is kept; its
            # tendency value is superseded by the article-level model below.
            _, org_list = sa.evaluate_article(content)

            # Relevant article: article-level sentiment.
            # Established convention for the `tendency` field: -1 negative, 0 positive.
            # The model returns 1 for positive and 0 for negative, so subtract 1
            # to map the model output onto the established convention.
            chapter_words = pre.handle_contents([content])
            tendency = int(chapter_pipeline.predict(chapter_words)[0]) - 1

        ret_list.append({
            'id': record_id,
            'sec': sec,
            'tendency': tendency,
            'org_list': org_list
        })

    # Return the result. total_seconds() is used because timedelta.seconds is
    # only the seconds component and would drop whole days.
    elapsed_seconds = int((datetime.now() - start_time).total_seconds())
    logger.info('end correlation_negative: {ret_list: %d, lost_seconds: %ds}',
                len(ret_list), elapsed_seconds)
    return jsonify({'docs': ret_list})
Exemplo n.º 2
0
def _save_rows_to_excel(file_path, header, rows):
    """Write ``header`` to row 1 and each item of ``rows`` to the following rows of a new workbook saved at ``file_path``."""
    from openpyxl import Workbook

    workbook = Workbook()
    sheet = workbook.active
    for column_index, column_name in enumerate(header, start=1):
        sheet.cell(row=1, column=column_index).value = column_name
    for row_index, row_values in enumerate(rows, start=2):
        for column_index, value in enumerate(row_values, start=1):
            sheet.cell(row=row_index, column=column_index).value = value
    workbook.save(file_path)


def handle_test_excel(test_file, mid_file, all_file, tag_file, right_flag):
    """
    Evaluate the relevance model on an Excel test file and export the results.

    Every sheet of ``test_file`` is read from row 2 onwards (row 1 is treated
    as a header), taking the title from column 1 and the content from
    column 2. Each content is pre-processed with ``handle_content`` and the
    whole corpus is passed to ``predict.predict_corpus``. Summary figures are
    printed and three workbooks are written.

    :param test_file: path of the Excel workbook to evaluate.
    :param mid_file: output path for the pre-processed word strings.
    :param all_file: output path for title, content and predicted flag of every row.
    :param tag_file: output path for title and content of the rows whose
                     prediction differs from ``right_flag``.
    :param right_flag: the prediction value counted as correct for this file.
    :return: None
    """
    # Read the data from Excel.
    from openpyxl import load_workbook

    title_column = 1
    content_column = 2

    test_corpus = []
    workbook = load_workbook(test_file)
    try:
        for sheet_name in workbook.sheetnames:
            sheet = workbook[sheet_name]
            for row in range(2, sheet.max_row + 1):
                title = sheet.cell(row=row, column=title_column).value
                content = sheet.cell(row=row, column=content_column).value
                if content is None:
                    # A row without content cannot be pre-processed or
                    # predicted; skip it instead of failing on None.
                    continue

                # NOTE(review): the word string is kept as UTF-8 bytes, as
                # before; confirm whether predict_corpus still needs bytes.
                words_str = handle_content(content).encode('utf-8')
                test_corpus.append((title, content, words_str))
    finally:
        # Close the workbook even when reading or pre-processing fails.
        workbook.close()

    # Predict and print the accuracy.
    import predict
    result = predict.predict_corpus([words_str for _, _, words_str in test_corpus])
    total_count = len(result)
    right_count = sum(1 for flag in result if flag == right_flag)
    print('测试文件: ', test_file)
    print('总条数: ', total_count)
    print('正确条数; ', right_count)
    # Guard against a test file without any data rows.
    print('准确度: ', right_count / total_count if total_count else 0.0)

    # Save the pre-processed data to the intermediate file.
    _save_rows_to_excel(
        mid_file,
        ['words_str'],
        [(words_str,) for _, _, words_str in test_corpus])
    print('保存中间数据: ', mid_file)

    # Save all rows together with their predicted flag.
    _save_rows_to_excel(
        all_file,
        ['title', 'content', 'predict_flag'],
        [(title, content, flag)
         for (title, content, _), flag in zip(test_corpus, result)])
    print('保存所有数据: ', all_file)

    # Save every row whose prediction is wrong to tag_file, starting directly
    # below the header so that no mispredicted row is left out.
    _save_rows_to_excel(
        tag_file,
        ['title', 'content'],
        [(title, content)
         for (title, content, _), flag in zip(test_corpus, result)
         if flag != right_flag])
    print('保存错误数据: ', tag_file)
Exemplo n.º 3
0
            else:
                length = len(x)
            data.append([
                len(x),
                self.getcnt(x),
                self.getcnt(x) / length,
                self.getnegcnt(x),
                self.getnegcnt(x) / length
            ])
        return data

if __name__ == '__main__':
    import pre
    import predict

    # Sample record; it is not referenced by the statements below.
    record_ = [{
        'id':
        1,
        'content':
        '一个恐怖的数字[怒]据国家癌症中心发布☞全国每天约有10000人确诊为癌症,平均每分钟就有7人确诊[怒]不过放心,癌症是可以治愈的,但……需要很多'
    }]
    # predict
    # Load the test corpus, one stripped line per entry. The context manager
    # makes sure the file handle is closed once the lines have been read.
    with open("corpus/test_new.txt", "r", encoding="utf-8") as corpus_file:
        corpus = [line.strip() for line in corpus_file]
    contents = [pre.handle_content(i_content) for i_content in corpus]

    correlations = predict.predict_corpus(contents)
    print(correlations)