def async_tokenize(batch=100, empty=False):
    """Do not set batch too large: the larger it is, the more often the API returns
    errors, and the whole run also gets slower.

    The default is breakpoint-resume mode, i.e. continue from where the last run
    stopped; pass empty=True to start over from the beginning.
    """
    if empty:
        logger.info("refresh tokenize mode!")
        logger.info("empty chat tokenize %s" % conf.file_chat_tokenize)
        file_op.empty_file(conf.file_chat_tokenize)
        start = 0
    else:
        chat_tokenize = file_op.read_lines(conf.file_chat_tokenize)
        start = len(chat_tokenize)
        logger.info("breakpoint resume mode, start is %s" % str(start))
    file_chat = conf.file_chat_pred
    chat = file_op.read_lines(file_chat)
    logger.info("read %s success!" % file_chat)
    chat[0] = chat[0].replace("\ufeff", '')  # drop the BOM on the first line
    logger.info("tokenize from %i" % start)
    chat = chat[start:]
    batch_nums = (len(chat) + batch - 1) // batch  # ceil division, avoids a trailing empty batch
    for i in range(batch_nums):
        chat_batch = chat[i * batch:(i + 1) * batch]
        logger.info("current batch start is %s" % str(i * batch))
        async_batch_tokenize(chat_batch)
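
# A minimal sketch of what async_batch_tokenize() presumably does (the real implementation
# lives elsewhere in this repo and may differ): call the tokenizer API for every line in
# the batch concurrently, append the result as a new tab-separated field, and append the
# batch to conf.file_chat_tokenize. Assumes get_text_tokenize() returns a dict with a
# 'tokenizedText' key (as in tokenize_modify below), the content sits in column 7, and
# file_op.write_lines accepts mode='a' for appending.
from concurrent.futures import ThreadPoolExecutor


def async_batch_tokenize(chat_batch, workers=10):
    def _tokenize_line(line):
        line = line.rstrip('\r\n')
        text = line.split('\t')[6]  # content column
        res = get_text_tokenize(text)
        tokens = res.get('tokenizedText', "fail_tokenize")
        return line + "\t" + str(tokens)

    with ThreadPoolExecutor(max_workers=workers) as pool:
        tokenized = list(pool.map(_tokenize_line, chat_batch))
    file_op.write_lines(conf.file_chat_tokenize, tokenized, mode='a')
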
def json_wr():
    """Convert the parsed-session text file into a JSON dict keyed by session_id,
    dropping the raw 'lines' field."""
    import json
    file_session_parsed = base_conf.file_session_parsed
    json_session_parsed = base_conf.json_session_parsed
    session_parsed = file_op.read_lines(file_session_parsed)
    session_parsed = [eval(x) for x in session_parsed]  # each line is the repr() of a dict
    sp_dict = dict()
    for x in session_parsed:
        del x['lines']
        sp_dict[x['session_id']] = x
    with open(json_session_parsed, 'w', encoding='utf-8') as f:
        json.dump(sp_dict, f, indent=2, ensure_ascii=False)
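
# A quick sanity check for the file written by json_wr(); peek_session_json is a
# hypothetical helper name, and it only assumes the fields produced by chat_parse()
# (session_id, q_nums, a_nums).
import json


def peek_session_json(path=None, n=3):
    """Print the ids and q/a counts of the first n sessions in the JSON file."""
    path = path or base_conf.json_session_parsed
    with open(path, encoding='utf-8') as f:
        sp_dict = json.load(f)
    for session_id in list(sp_dict)[:n]:
        info = sp_dict[session_id]
        print(session_id, info.get('q_nums'), info.get('a_nums'))
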
def get_bracket_words():
    """Collect the distinct placeholders that appear inside square brackets,
    excluding the ORDERID_/USERID_ ones, and dump them to temp.txt."""
    lines = file_op.read_lines(conf['chat_pred'])
    logger.info("read %s success!" % conf['chat_pred'])
    bracket_pat = re.compile(r"\[(.*?)\]")
    bracket_values = []
    for line in lines:
        values = bracket_pat.findall(line)
        bracket_values.extend(values)
    bracket_values = list(set(bracket_values))
    # order_ids = [value for value in bracket_values if "ORDERID_" in value]
    values = [
        value for value in bracket_values
        if "ORDERID_" not in value and "USERID_" not in value
    ]
    file_op.write_lines("temp.txt", values)
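
# Quick illustration of what bracket_pat captures before the ORDERID_/USERID_ filter is
# applied; the sample line is made up.
#
#     >>> import re
#     >>> re.compile(r"\[(.*?)\]").findall("[ORDERID_10001]\tis the [item] still in stock")
#     ['ORDERID_10001', 'item']
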
def merge_78():
    """Merge surplus columns: any line with more than 7 tab-separated fields gets
    everything from the 7th field onward joined back into a single content field."""
    lines = file_op.read_lines(conf['chat'])
    logger.info("read chat.txt success!")
    for i in range(len(lines)):
        line = lines[i]
        line_ = line.strip("\r\n").split('\t')
        if len(line_) > 7:
            line_pred = line_[:6]
            text = " ".join(line_[6:])
            line_pred.append(text)
            lines[i] = '\t'.join(line_pred)
            # print('\t'.join(line_pred))
    file_op.write_lines(conf['chat_pred'], lines)
    logger.info("write results to %s success!" % conf['chat_pred'])
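
# Hypothetical before/after for merge_78(), with made-up field values: a line with 8
# tab-separated fields gets everything from column 7 onward joined with spaces, so the
# output always has exactly 7 columns.
#
#     >>> cols = "sess_1\t0\t1\t0\t0\tsku_9\thello\tworld".split('\t')   # 8 fields
#     >>> '\t'.join(cols[:6] + [" ".join(cols[6:])])
#     'sess_1\t0\t1\t0\t0\tsku_9\thello world'
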
def chat_parse():
    """Parse the chat data line by line, grouping the lines by session and counting
    questions/answers per session."""
    file_chat = base_conf.file_chat
    logger.info('reading chat from %s' % file_chat)
    lines = file_op.read_lines(file_chat)
    chat_parsed = []
    # initialize the session info with the first line's session id
    sess_info = {
        "session_id": lines[0].split('\t')[0],
        "q_nums": 0,
        "a_nums": 0,
        "lines": []
    }
    for line in lines:
        line = line.strip('\t').replace("\t", '|')
        try:
            cols = line.split("|")
            line_cols = {
                "id": cols[0],
                "user": cols[1],
                "waiter_send": cols[2],
                "transfer": cols[3],
                "repeat": cols[4],
                "sku": cols[5],
                "content": "|".join(cols[6:])
            }
            # assert len(cols) == 7, "expected 7 fields, this line has %i" % len(cols)
            if sess_info['session_id'] == line_cols['id']:
                sess_info = _update_nums(sess_info, line_cols)
                sess_info['lines'].append(line)
            else:
                chat_parsed.append(sess_info)
                sess_info = {
                    "session_id": line_cols['id'],
                    "q_nums": 0,
                    "a_nums": 0,
                    "lines": [line]
                }
                sess_info = _update_nums(sess_info, line_cols)
        except Exception as e:
            logger.error('line error: %s' % line)
            logger.exception(e)
    chat_parsed.append(sess_info)  # don't drop the last session
    file_op.write_lines(base_conf.file_chat_parsed, chat_parsed)
    logger.info("chat parse result saved in %s" % base_conf.file_chat_parsed)
    return chat_parsed
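
# A minimal sketch of what _update_nums() presumably does (the real helper is defined
# elsewhere): bump the per-session question/answer counters based on the waiter_send flag.
# Treating '1' as a waiter reply and anything else as a customer question is an assumption,
# not something this file confirms.
def _update_nums(sess_info, line_cols):
    if line_cols['waiter_send'] == '1':   # assumed: sent by the waiter -> answer
        sess_info['a_nums'] += 1
    else:                                 # assumed: sent by the customer -> question
        sess_info['q_nums'] += 1
    return sess_info
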
def chat_session_parse():
    """Take the output of chat_parse(), merge consecutive q/a lines within each
    session, and mark their order."""
    logger.info("reading chat parsed from %s" % base_conf.file_chat_parsed)
    chat_parsed = file_op.read_lines(base_conf.file_chat_parsed)
    chat_parsed = [eval(x) for x in chat_parsed]  # each line is the repr() of a session dict
    session_parsed = []
    for sess_info in chat_parsed:
        try:
            sess_parsed = _parse_session(sess_info)
            session_parsed.append(sess_parsed)
        except Exception as e:
            logger.error("sess info parse error, sess_id: %s" % sess_info['session_id'])
            logger.exception(e)
    file_session_parsed = base_conf.file_session_parsed
    logger.info("save session parse result to %s" % file_session_parsed)
    file_op.write_lines(file_session_parsed, session_parsed)
    logger.info("save success!")
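
# A minimal sketch of what _parse_session() presumably does, based only on the docstring
# above: collapse consecutive lines from the same side into one q/a turn and record the
# turn order. The column layout (waiter_send flag at index 2, content from index 6)
# follows chat_parse(); the turn structure itself is an assumption.
def _parse_session(sess_info):
    turns = []
    for line in sess_info['lines']:
        cols = line.split('|')
        side = 'a' if cols[2] == '1' else 'q'
        content = '|'.join(cols[6:])
        if turns and turns[-1]['side'] == side:
            turns[-1]['text'] += ' ' + content    # merge consecutive same-side lines
        else:
            turns.append({"side": side, "order": len(turns), "text": content})
    return {
        "session_id": sess_info['session_id'],
        "q_nums": sess_info['q_nums'],
        "a_nums": sess_info['a_nums'],
        "lines": sess_info['lines'],              # kept so json_wr() can drop it later
        "turns": turns,
    }
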
def count_fail():
    """Count how many tokenize API calls failed."""
    logger.info("count api invoke fail numbers.")
    file_chat_tokenize = conf.file_chat_tokenize
    chat_tokenize = file_op.read_lines(file_chat_tokenize)
    logger.info("read %s success!" % file_chat_tokenize)
    nums = {
        "fail": 0,
        "success": 0
    }
    for line in chat_tokenize:
        try:
            if inspect_tokenize(line):
                nums["success"] += 1
            else:
                nums["fail"] += 1
        except Exception:
            nums["fail"] += 1
    logger.info("tokenize success nums: %i, fail nums: %i" % (nums["success"], nums["fail"]))
    return nums
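
# A minimal sketch of what inspect_tokenize() presumably checks (the real helper is
# defined elsewhere): the tokenize result is appended as the last tab-separated field,
# and a failed call is marked with the literal "fail_tokenize" (see tokenize_modify
# below), so a usable line should end with something that looks like a token list.
def inspect_tokenize(line):
    last_field = line.rstrip('\r\n').split('\t')[-1]
    return last_field != "fail_tokenize" and last_field.startswith('[')
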
def tokenize_modify():
    """Re-tokenize the lines whose previous API call failed."""
    logger.info("modify tokenize results!")
    file_chat_tokenize = conf.file_chat_tokenize
    chat_tokenize = file_op.read_lines(file_chat_tokenize)
    logger.info("read %s success!" % file_chat_tokenize)
    for i in range(len(chat_tokenize)):
        line = chat_tokenize[i]
        try:
            if not inspect_tokenize(line):
                line = "\t".join(line.split('\t')[:-1])  # drop the failed tokenize field
                # handle special characters
                line = line_pre(line)
                text = line.split('\t')[6]
                logger.info("current line: %i, text: %s" % (i, text))
                res = get_text_tokenize(text)
                text_tokens = res.get('tokenizedText', "fail_tokenize")
                line += "\t" + str(text_tokens)  # re-append the result as the last field
                chat_tokenize[i] = line
        except Exception as e:
            logger.exception(e)
            print(line)
    file_op.write_lines(file_chat_tokenize, chat_tokenize, mode='w')
    logger.info("write %s success!" % file_chat_tokenize)
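
# Hypothetical repair loop tying count_fail() and tokenize_modify() together: count the
# failures, re-run the failed lines once, then count again. retry_failed is a made-up
# name; logger and conf are assumed to be configured as in the functions above.
def retry_failed():
    before = count_fail()
    if before["fail"] > 0:
        tokenize_modify()
        after = count_fail()
        logger.info("fail nums: %i -> %i" % (before["fail"], after["fail"]))
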