import locale
import os
import re
from functools import cmp_to_key

import jieba
import jieba.analyse
import jieba.posseg
import pandas as pd
from tqdm import tqdm

# `es`, `log`, `root_path`, `sw2list`, `replace_symbol`, `cut` and `isindict`
# come from the wider project; a hedged sketch of a possible setup follows
# `message_Clustering` below.


def message_Clustering():
    # Read the root questions and flatten the (single-column) rows into one list.
    data = pd.read_csv("antbot/datasets/question_45/root_q_a").values.tolist()
    search_msg = [j for i in data for j in i]
    for root_msg in tqdm(search_msg):
        # First pass: exact-phrase search for the root question.
        body = {
            "query": {
                "bool": {
                    "filter": {
                        "match_phrase": {"q": "{}".format(root_msg)}
                    }
                }
            }
        }
        questions_one = es.search(index="bot_entity_tmp_new", body=body, size=100)
        for hit_1 in questions_one['hits']['hits']:
            res = hit_1['_source']['q']
            # Second pass: re-search the other index with each first-pass hit.
            body = {
                "query": {
                    "bool": {
                        "filter": {
                            "match_phrase": {"q": "{}".format(res)}
                        }
                    }
                }
            }
            questions_two = es.search(index="bot_entity_tmp", body=body, size=100)
            out = []
            for hit_2 in questions_two['hits']['hits']:
                res_two = hit_2['_source']
                rows = {
                    'q': str(res_two['q']).replace("\n", ""),
                    'a': str(res_two['a']).replace("\n", ""),
                    'roomId': res_two['roomId'],
                    'tenantId': res_two['tenantId'],
                    'lanlordId': res_two['lanlordId'],
                    'id': str(res_two['id']).replace("\n", ""),
                }
                out.append(rows)
            if len(out) < 1:
                continue
            # Save each cluster to a CSV named after its first question.
            df = pd.DataFrame(out)
            save_name = '{}.csv'.format(replace_symbol(out[0]['q']))
            save_dir = "/home/duyp/mayi_datasets/seed/entity"
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            save_path = os.path.join(save_dir, save_name)
            log.info("{}".format(save_path))
            df.to_csv(save_path, index=False)
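# --- Hedged setup sketch ------------------------------------------------------
# A minimal sketch of the module-level plumbing the functions in this file rely
# on. The ES host, logger name, and the symbol set stripped by `replace_symbol`
# are illustrative assumptions, not the project's actual configuration.
import logging

from elasticsearch import Elasticsearch

es = Elasticsearch(hosts=["http://localhost:9200"])  # assumed ES endpoint
log = logging.getLogger("antbot")                    # assumed logger name
root_path = os.path.dirname(os.path.abspath(__file__))
sw2list = []  # assumed stopword list; the real project loads it elsewhere


def replace_symbol(text):
    # Assumed behavior: strip whitespace and common ASCII/Chinese punctuation
    # so a question is uniform for phrase matching and safe as a filename.
    return re.sub(r"[\s!\"#$%&'()*+,\-./:;<=>?@\[\]^_`{|}~，。！？、；：“”‘’]+", "", text)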
def cutallcase(inputs):
    # TODO: split on punctuation first, then enumerate all the cases.
    # Brute-force fallback: enumerate every 2- to 5-character window starting
    # at each position of the symbol-stripped input.
    outputs = []
    inputs = replace_symbol(inputs)
    length = len(inputs)
    for i in range(length):
        if length == 2:
            outputs.append(inputs)
        elif length == 3:
            outputs.append(inputs[i:i + 2])
            outputs.append(inputs[i:i + 3])
        elif length == 4:
            for width in (2, 3, 4):
                outputs.append(inputs[i:i + width])
        elif length >= 5:  # the original `5 < length` silently skipped length == 5
            for width in (2, 3, 4, 5):
                outputs.append(inputs[i:i + width])
        else:
            continue
    return list(set(outputs))
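# Hedged usage sketch (`_demo_cutallcase` is a hypothetical demo name; assumes
# `replace_symbol` passes punctuation-free text through unchanged):
def _demo_cutallcase():
    # A 4-character input yields every 2- to 4-character window:
    print(sorted(cutallcase("押金多少")))
    # -> ['多少', '少', '押金', '押金多', '押金多少', '金多', '金多少']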
def cut_data():
    out = []
    data_name = os.path.join(root_path, 'datasets/cd_by_nosplit.txt')
    with open(data_name, 'r') as fr:
        lines = fr.readlines()
    # Tokenize every line and collect the tokens into one flat list.
    for line in tqdm(lines):
        line_cut = cut(replace_symbol(line), add_stopwords=True)
        for x in line_cut:
            out.append(x)
    log.info(" Length: {} ".format(len(out)))
    with open(os.path.join(root_path, "datasets/cd.txt"), 'w') as fw:
        fw.writelines(" ".join(out))
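# `cut` above is the project's tokenizer wrapper, defined elsewhere. Below is a
# minimal sketch of one plausible implementation; `_cut_sketch` is a
# hypothetical name, and only the `add_stopwords` flag is known from the call
# sites — the body is an assumption:
def _cut_sketch(text, add_stopwords=False):
    tokens = jieba.lcut(text)
    if add_stopwords:
        # Reading `add_stopwords=True` as "apply stopword filtering".
        tokens = [t for t in tokens if t not in sw2list]
    return tokens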
def cut_jieba(inputs):
    if isinstance(inputs, str):
        # Top-10 keywords via TF-IDF, joined as "kw1|kw2|...".
        res = jieba.analyse.extract_tags(replace_symbol(inputs), topK=10)
        keywords = '|'.join(res)
        msg_cut = jieba.posseg.lcut(replace_symbol(inputs))
        # Filter stopwords by the word itself: posseg yields (word, flag)
        # pairs, so comparing the pair against the stopword list never matched.
        _msg_cut = [i for i in msg_cut if i.word not in sw2list]
        msg_cut_tags = []
        for w in _msg_cut:
            msg_cut_tags.append("{}_{}".format(w.word, w.flag))
        # Tag price mentions such as "500元" / "500块" as nouns. "[元块]"
        # replaces the original "[元|块]", which also matched a literal "|".
        p = re.compile("[0-9]+?[元块]").findall(inputs)
        if p:
            for price in p:
                msg_cut_tags.append("{}_{}".format(price, 'n'))
        if len(msg_cut_tags) > 0:
            return "|".join(msg_cut_tags), keywords
        else:
            return inputs, keywords
    else:
        return inputs, inputs
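# Hedged usage sketch (`_demo_cut_jieba` is a hypothetical demo name; the exact
# tags and keywords depend on which jieba dictionaries are loaded):
def _demo_cut_jieba():
    tagged, keywords = cut_jieba("房租500元一个月")
    # `tagged` looks like "房租_n|500元_n|...", `keywords` like "房租|月|...".
    print(tagged)
    print(keywords)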
def posegcut(inputs):
    # Same word_flag tagging as `cut_jieba`, without the keyword extraction.
    if isinstance(inputs, str):
        msg_cut = jieba.posseg.lcut(replace_symbol(inputs))
        # Compare the word, not the (word, flag) pair (see cut_jieba).
        _msg_cut = [i for i in msg_cut if i.word not in sw2list]
        msg_cut_tags = []
        for w in _msg_cut:
            msg_cut_tags.append("{}_{}".format(w.word, w.flag))
        p = re.compile("[0-9]+?[元块]").findall(inputs)
        if p:
            for price in p:
                msg_cut_tags.append("{}_{}".format(price, 'n'))
        if len(msg_cut_tags) > 0:
            return "|".join(msg_cut_tags)
        else:
            return inputs
    else:
        return inputs
def sort_file_by_dict(data_dir, input_filename, output_filename, delete=True):
    """Sort a text file with zh_CN collation; the output is saved in the same directory as the input.

    :param data_dir: data root directory
    :param input_filename: name of the file to sort
    :param output_filename: name of the output file
    :param delete: whether to strip punctuation from each line
    """
    locale.setlocale(locale.LC_ALL, 'zh_CN.UTF-8')
    files = []
    line_number = 0
    inputs_dir = os.path.join(data_dir, input_filename)
    with open(inputs_dir, 'r') as fr:
        lines = fr.readlines()
    for line in lines:
        if delete:
            line_new = replace_symbol(line.replace("\n", '').strip())
            if len(line_new) > 1:
                files.append(line_new)
                line_number += 1
                # TODO: alternatively, flush every 500,000 lines to speed up saving.
                if line_number % 10000 == 0:
                    log.info("=============== process : {} ===============".format(line_number))
        else:
            line_new = line.replace("\n", '').strip()
            files.append(line_new)
            line_number += 1
    log.info(" Total lines : {}".format(line_number))
    # Locale-aware comparison sorts in pinyin order under zh_CN.UTF-8.
    b = sorted(files, key=cmp_to_key(locale.strcoll))
    df = pd.DataFrame(b)
    df.columns = ['message']
    output_dir = os.path.join(data_dir, output_filename)
    log.info("Save file : {}".format(output_dir))
    df.to_csv(output_dir, index=False)
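# Hedged usage sketch (`_demo_sort_file_by_dict` and the paths are
# hypothetical; requires the zh_CN.UTF-8 locale to be installed on the host):
def _demo_sort_file_by_dict():
    sort_file_by_dict(
        data_dir="/home/duyp/mayi_datasets",  # assumed data root
        input_filename="cd_by_nosplit.txt",
        output_filename="cd_sorted.csv",
        delete=True,
    )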
def read_newdata():
    path = '/home/duyp/mayi_datasets/question/question_new'
    number = 0
    for file in os.listdir(path):
        filename = os.path.join(path, file)
        data = pd.read_csv(filename, lineterminator="\n").values
        out = []
        for x in tqdm(data):
            msg = x[0]
            if isinstance(msg, str):
                msgcut = cut(replace_symbol(msg), add_stopwords=True)
                for i in msgcut:
                    out.append(i)
                number += 1
            else:
                continue
        # Write one tokenized output file per input file, under the same name.
        with open(os.path.join(root_path, "datasets/rawdata/{}".format(file)), 'w') as fw:
            fw.writelines(" ".join(out))
    log.info("{}".format(number))
def cutpro(inputs):
    if isinstance(inputs, str):
        # Full-mode cut over the lowercased, symbol-stripped input.
        _msg_cut = jieba.lcut(replace_symbol(inputs.lower()), cut_all=True, HMM=True)
        msg_cut = [i for i in _msg_cut if i not in sw2list]
        # Price mentions ("500元" / "500块") tagged as nouns; "[元块]" replaces
        # the original "[元|块]", which also matched a literal "|".
        p = re.compile("[0-9]+?[元块]").findall(inputs)
        if p:
            for price in p:
                msg_cut.append("{}_{}".format(price, 'n'))
        # Date spans such as "3月5日" (originally "[月|日]", with the same bug).
        p = re.compile("[0-9]+?[月日][0-9]+?[月日]").findall(inputs)
        if p:
            for x in p:
                msg_cut.append(x)
        # Keep the dictionary tokens if any are known, otherwise fall back to
        # brute-force substring windows.
        if isindict(msg_cut):
            return msg_cut
        else:
            return cutallcase(inputs)
    else:
        return None
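# Hedged demo wiring the pieces together (the `_demo_*` helpers above are
# illustrative names, not part of the original module; running this requires
# the project's real `isindict` helper to be available):
if __name__ == "__main__":
    print(cutpro("房租500元3月5日到期"))  # dictionary tokens, else window fallback
    print(posegcut("押金多少"))           # "word_flag|..." tagged string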