Пример #1
0
def async_tokenize(batch=100, empty=False):
    """Tokenize the prediction chat file in batches via the async API.

    batch: number of lines sent per API batch. Do not set it too large:
        larger batches fail more often and also run slower.
    empty: when True, empty the tokenize output file and start from the
        beginning; otherwise resume from where the last run stopped
        (breakpoint-resume mode), using the number of lines already
        written as the start offset.
    """
    if empty:
        logger.info("refresh tokenize mode!")
        logger.info("empty chat tokenize %s" % conf.file_chat_tokenize)
        file_op.empty_file(conf.file_chat_tokenize)
        start = 0
    else:
        # One output line per already-tokenized input line, so the line
        # count of the output file is the resume offset.
        chat_tokenize = file_op.read_lines(conf.file_chat_tokenize)
        start = len(chat_tokenize)
        logger.info("breakpoint resume mode, start is %s" % str(start))

    file_chat = conf.file_chat_pred
    chat = file_op.read_lines(file_chat)
    logger.info("read %s success!" % file_chat)
    chat[0] = chat[0].replace("\ufeff", '')  # strip UTF-8 BOM if present
    logger.info("tokenize from %i" % start)
    chat = chat[start:]
    # Step directly by `batch`: the previous int(len/batch)+1 loop sent
    # one extra EMPTY batch whenever len(chat) was a multiple of `batch`.
    for offset in range(0, len(chat), batch):
        chat_batch = chat[offset:offset + batch]
        logger.info("current batch start is %s" % str(offset))
        async_batch_tokenize(chat_batch)
Пример #2
0
def json_wr():
    """Convert the parsed-session text file (one repr'd dict per line)
    into a JSON file keyed by session_id.

    The bulky raw 'lines' entry is removed from each session dict before
    dumping.
    """
    import json
    import ast
    file_session_parsed = base_conf.file_session_parsed
    json_session_parsed = base_conf.json_session_parsed
    session_parsed = file_op.read_lines(file_session_parsed)
    # literal_eval parses the repr'd dicts without executing arbitrary
    # code, unlike the previous eval().
    session_parsed = [ast.literal_eval(x) for x in session_parsed]

    sp_dict = dict()
    for x in session_parsed:
        del x['lines']
        sp_dict[x['session_id']] = x

    # `with` guarantees the handle is flushed and closed even on error
    # (the old inline open() leaked the file object).
    with open(json_session_parsed, 'w', encoding='utf-8') as fp:
        json.dump(sp_dict, fp, indent=2, ensure_ascii=False)
Пример #3
0
def get_bracket_words():
    """Collect the distinct values found inside [...] in the prediction
    chat file, drop ORDERID_/USERID_ placeholders, and write the rest to
    temp.txt.
    """
    lines = file_op.read_lines(conf['chat_pred'])
    logger.info("read %s success!" % conf['chat_pred'])
    # Raw string: "\[" in a plain literal is an invalid escape sequence
    # (SyntaxWarning on modern Python). Non-greedy so nested text between
    # separate bracket pairs is not swallowed.
    bracket_pat = re.compile(r"\[(.*?)\]")
    bracket_values = []
    for line in lines:
        bracket_values.extend(bracket_pat.findall(line))
    bracket_values = list(set(bracket_values))  # de-duplicate
    values = [
        value for value in bracket_values
        if "ORDERID_" not in value and "USERID_" not in value
    ]
    file_op.write_lines("temp.txt", values)
Пример #4
0
def merge_78():
    """Merge surplus tab-separated columns.

    Any line with more than 7 tab-separated fields keeps its first six
    and joins everything after them (space-separated) into a single
    seventh text field; the full line set is then written back out.
    """
    lines = file_op.read_lines(conf['chat'])
    logger.info("read chat.txt success!")
    for idx, raw in enumerate(lines):
        cols = raw.strip("\r\n").split('\t')
        if len(cols) <= 7:
            continue  # already well-formed, leave untouched
        merged = cols[:6]
        merged.append(" ".join(cols[6:]))
        lines[idx] = '\t'.join(merged)
    file_op.write_lines(conf['chat_pred'], lines)
    logger.info("write results to  %s success!" % conf['chat_pred'])
Пример #5
0
def chat_parse():
    """Parse the chat file line by line and group lines by session.

    Consecutive lines sharing the same session id (first tab field) are
    collected into one session dict with q/a counters maintained by
    _update_nums. The result list is written (one dict per line) to
    base_conf.file_chat_parsed and also returned.
    """
    file_chat = base_conf.file_chat
    logger.info('reading chat from %s' % file_chat)
    lines = file_op.read_lines(file_chat)
    chat_parsed = []

    # Seed session info from the first line's session id so the first
    # iteration takes the "same session" branch.
    sess_info = {
        "session_id": lines[0].split('\t')[0],
        "q_nums": 0,
        "a_nums": 0,
        "lines": []
    }

    for line in lines:
        # NOTE(review): strip('\t') only trims tabs at the line ends —
        # if trailing newlines can survive read_lines, confirm this is
        # the intended cleanup.
        line = line.strip('\t').replace("\t", '|')
        try:
            cols = line.split("|")
            line_cols = {
                "id": cols[0],
                "user": cols[1],
                "waiter_send": cols[2],
                "transfer": cols[3],
                "repeat": cols[4],
                "sku": cols[5],
                # content itself may contain '|' after the replace above,
                # so re-join everything past the fixed columns
                "content": "|".join(cols[6:])
            }
            if sess_info['session_id'] == line_cols['id']:
                sess_info = _update_nums(sess_info, line_cols)
                sess_info['lines'].append(line)
            else:
                # session boundary: flush the finished session, start anew
                chat_parsed.append(sess_info)
                sess_info = {
                    "session_id": line_cols['id'],
                    "q_nums": 0,
                    "a_nums": 0,
                    "lines": [line]
                }
                sess_info = _update_nums(sess_info, line_cols)
        except Exception as e:
            logger.error('line error: %s' % line)
            logger.exception(e)
    # BUG FIX: the final session was only flushed at the NEXT session
    # boundary, so the last session in the file was always dropped.
    if sess_info['lines']:
        chat_parsed.append(sess_info)
    file_op.write_lines(base_conf.file_chat_parsed, chat_parsed)
    logger.info("chat parse result saved in %s" % base_conf.file_chat_parsed)
    return chat_parsed
Пример #6
0
def chat_session_parse():
    """Merge consecutive q/a turns inside each parsed session and tag
    their order.

    Input is the file written by chat_parse() (one repr'd session dict
    per line); per-session work is delegated to _parse_session and the
    result is written to base_conf.file_session_parsed.
    """
    import ast
    logger.info("reading chat parsed from %s" % base_conf.file_chat_parsed)
    chat_parsed = file_op.read_lines(base_conf.file_chat_parsed)
    # literal_eval parses the repr'd dicts without executing arbitrary
    # code, unlike the previous eval().
    chat_parsed = [ast.literal_eval(x) for x in chat_parsed]
    session_parsed = []
    for sess_info in chat_parsed:
        try:
            sess_parsed = _parse_session(sess_info)
            session_parsed.append(sess_parsed)
        except Exception as e:
            logger.error("sess info parse error, sess_id: %s" % sess_info['session_id'])
            logger.exception(e)
    file_session_parsed = base_conf.file_session_parsed
    logger.info("save session parse result to %s" % file_session_parsed)
    file_op.write_lines(file_session_parsed, session_parsed)
    logger.info("save success!")
Пример #7
0
def count_fail():
    """Count how many tokenize-file lines passed/failed the API call.

    Returns a dict {"fail": int, "success": int}; a line that makes
    inspect_tokenize raise is counted as a failure.
    """
    logger.info("count api invoke fail numbers.")
    file_chat_tokenize = conf.file_chat_tokenize
    chat_tokenize = file_op.read_lines(file_chat_tokenize)
    # BUG FIX: previously interpolated the whole list of lines into the
    # log message instead of the file name.
    logger.info("read %s success!" % file_chat_tokenize)
    nums = {
        "fail": 0,
        "success": 0
    }

    for line in chat_tokenize:
        try:
            if inspect_tokenize(line):
                nums["success"] += 1
            else:
                nums["fail"] += 1
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt /
            # SystemExit still propagate; an unparseable line is a fail.
            nums["fail"] += 1
    logger.info("tokenize success nums: %i, fail nums: %i" %
                (nums["success"], nums["fail"]))
    return nums
Пример #8
0
def tokenize_modify():
    """Re-tokenize the lines whose earlier API call failed and rewrite
    the tokenize file in place.

    For each failed line: drop the old (failed) tokenize column, clean
    special characters, re-call the tokenize API on the text column, and
    append the new result.
    """
    logger.info("modify tokenize results!")
    file_chat_tokenize = conf.file_chat_tokenize
    chat_tokenize = file_op.read_lines(file_chat_tokenize)
    # BUG FIX: previously interpolated the whole list of lines into the
    # log message instead of the file name.
    logger.info("read %s success!" % file_chat_tokenize)
    for i, line in enumerate(chat_tokenize):
        try:
            if not inspect_tokenize(line):
                # strip the last tab-separated field: the failed result
                line = "\t".join(line.split('\t')[:-1])
                # clean special characters before re-tokenizing
                line = line_pre(line)
                text = line.split('\t')[6]
                logger.info("current line: %i, text: %s" % (i, text))
                res = get_text_tokenize(text)
                text_tokens = res.get('tokenizedText', "fail_tokenize")
                # NOTE(review): no '\t' is re-inserted before the tokens
                # here; confirm this matches how the original writer
                # separated the tokenize column.
                line += str(text_tokens)
                chat_tokenize[i] = line
        except Exception as e:
            logger.exception(e)
            print(line)
    file_op.write_lines(file_chat_tokenize, chat_tokenize, mode='w')
    # BUG FIX: same list-vs-filename logging mistake as above.
    logger.info("write %s success!" % file_chat_tokenize)