예제 #1
0
def message_Clustering():
    """Expand seed questions through two Elasticsearch lookups and save CSVs.

    Every cell of the CSV ``antbot/datasets/question_45/root_q_a`` is used as
    a phrase query on field ``q`` of index ``bot_entity_tmp_new``. The ``q``
    value of each hit is then used as a phrase query on index
    ``bot_entity_tmp``. The hits of that second query are written as one CSV
    file (columns ``q``, ``a``, ``roomId``, ``tenantId``, ``lanlordId``,
    ``id``) into ``/home/duyp/mayi_datasets/seed/entity``. The file is named
    after the cleaned ``q`` of the first second-level hit, so two first-level
    hits that resolve to the same name overwrite each other.

    :return: None
    """
    save_dir = "/home/duyp/mayi_datasets/seed/entity"

    def phrase_query(text):
        # Both lookups use the same bool/filter + match_phrase query shape;
        # only the searched text differs.
        return {
            "query": {
                "bool": {
                    "filter": {
                        "match_phrase": {
                            "q": "{}".format(text)
                        }
                    }
                }
            }
        }

    data = pd.read_csv("antbot/datasets/question_45/root_q_a").values.tolist()
    # Flatten the table: every cell is treated as a seed message.
    search_msg = [cell for row in data for cell in row]

    for root_msg in tqdm(search_msg):
        questions_one = es.search(index="bot_entity_tmp_new",
                                  body=phrase_query(root_msg),
                                  size=100)

        for hit_1 in questions_one['hits']['hits']:
            questions_two = es.search(index="bot_entity_tmp",
                                      body=phrase_query(hit_1['_source']['q']),
                                      size=100)

            out = []
            for hit_2 in questions_two['hits']['hits']:
                source = hit_2['_source']
                out.append({
                    # Newlines are stripped from the text fields before the
                    # row is written to CSV.
                    'q': str(source['q']).replace("\n", ""),
                    'a': str(source['a']).replace("\n", ""),
                    'roomId': source['roomId'],
                    'tenantId': source['tenantId'],
                    'lanlordId': source['lanlordId'],
                    'id': str(source['id']).replace("\n", "")
                })
            if not out:
                continue

            save_name = '{}.csv'.format(replace_symbol(out[0]['q']))
            # exist_ok avoids the check-then-create race of a separate
            # os.path.exists() test.
            os.makedirs(save_dir, exist_ok=True)
            save_path = os.path.join(save_dir, save_name)

            log.info("{}".format(save_path))
            pd.DataFrame(out).to_csv(save_path, index=False)
예제 #2
0
def cutallcase(inputs):
    """Return the distinct short fragments of *inputs* after symbol removal.

    The text is first passed through ``replace_symbol``. Then, for every
    start position, the slices of width 2 up to ``min(len, 5)`` are
    collected. Slices that start near the end of the text are truncated by
    normal slicing rules, so fragments shorter than two characters can
    appear in the result.

    Special cases:
      * cleaned text shorter than 2 characters -> ``[]``
      * cleaned text of exactly 2 characters   -> ``[text]``

    The previous implementation tested ``5 < length`` for the widest case,
    which left a gap: a 5-character text matched no branch and produced an
    empty list. Length 5 is now handled like every longer text.

    :param inputs: text to split.
    :return: list of unique fragments; the order is unspecified because the
        fragments are de-duplicated through a set.
    """
    # TODO: split on punctuation first, then enumerate all cases.
    inputs = replace_symbol(inputs)
    length = len(inputs)

    if length < 2:
        return []
    if length == 2:
        # Only the whole text is emitted for a two-character input.
        return [inputs]

    widest = min(length, 5)
    fragments = {
        inputs[start:start + width]
        for start in range(length)
        for width in range(2, widest + 1)
    }
    return list(fragments)
예제 #3
0
def cut_data():
    """Tokenize ``datasets/cd_by_nosplit.txt`` into ``datasets/cd.txt``.

    Each line of the input file (under ``root_path``) is cleaned with
    ``replace_symbol`` and segmented with ``cut(..., add_stopwords=True)``.
    All resulting tokens are written to the output file as a single
    space-separated string.

    :return: None
    """
    out = []
    data_name = os.path.join(root_path, 'datasets/cd_by_nosplit.txt')
    with open(data_name, 'r') as fr:
        # Read everything up front so tqdm can show a total.
        lines = fr.readlines()

    for line in tqdm(lines):
        out.extend(cut(replace_symbol(line), add_stopwords=True))

    log.info(" Length: {} ".format(len(out)))

    # A context manager closes the file even if joining or writing fails.
    # write() replaces writelines(), which iterated the joined string one
    # character at a time.
    with open(os.path.join(root_path, "datasets/cd.txt"), 'w') as fw:
        fw.write(" ".join(out))
예제 #4
0
def cut_jieba(inputs):
    """Segment *inputs* with jieba and return POS-tagged tokens and keywords.

    :param inputs: text to segment; any non-string value is passed through.
    :return: a ``(tags, keywords)`` tuple.

        * For a string, ``keywords`` is the result of
          ``jieba.analyse.extract_tags(..., topK=10)`` joined with ``|``,
          and ``tags`` is the ``word_flag`` tokens joined with ``|``. When
          no token is produced, ``tags`` is the original *inputs*.
        * For any other type, ``(inputs, inputs)``.
    """
    if not isinstance(inputs, str):
        return inputs, inputs

    # Cleaned once and reused for both jieba calls (presumably
    # replace_symbol is a pure text transformation - confirm).
    cleaned = replace_symbol(inputs)
    keywords = '|'.join(jieba.analyse.extract_tags(cleaned, topK=10))

    # NOTE(review): jieba.posseg yields pair objects; if sw2list holds plain
    # strings this membership test never filters anything - confirm whether
    # ``w.word not in sw2list`` was intended.
    msg_cut_tags = [
        "{}_{}".format(w.word, w.flag)
        for w in jieba.posseg.lcut(cleaned)
        if w not in sw2list
    ]

    # Price mentions such as "100元" / "100块" in the raw text are appended
    # as extra tokens tagged "n". Inside a character class "|" is a literal
    # character, so the former "[元|块]" also matched e.g. "100|".
    for price in re.findall(r"[0-9]+?[元块]", inputs):
        msg_cut_tags.append("{}_{}".format(price, 'n'))

    if msg_cut_tags:
        return "|".join(msg_cut_tags), keywords
    return inputs, keywords
예제 #5
0
def posegcut(inputs):
    """Segment *inputs* with jieba's POS tagger into ``word_flag`` tokens.

    :param inputs: text to segment; any non-string value is passed through.
    :return: for a string, the ``word_flag`` tokens joined with ``|`` (or
        the original *inputs* when no token is produced); for any other
        type, *inputs* unchanged.
    """
    if not isinstance(inputs, str):
        return inputs

    # NOTE(review): jieba.posseg yields pair objects; if sw2list holds plain
    # strings this membership test never filters anything - confirm whether
    # ``w.word not in sw2list`` was intended.
    msg_cut_tags = [
        "{}_{}".format(w.word, w.flag)
        for w in jieba.posseg.lcut(replace_symbol(inputs))
        if w not in sw2list
    ]

    # Price mentions such as "100元" / "100块" in the raw text are appended
    # as extra tokens tagged "n". Inside a character class "|" is a literal
    # character, so the former "[元|块]" also matched e.g. "100|".
    for price in re.findall(r"[0-9]+?[元块]", inputs):
        msg_cut_tags.append("{}_{}".format(price, 'n'))

    if msg_cut_tags:
        return "|".join(msg_cut_tags)
    return inputs
예제 #6
0
def sort_file_by_dict(data_dir, input_filename, output_filename, delete=True):
    """Sort the lines of a text file by zh_CN collation and save them as CSV.

    The output file is saved in the same directory as the input file, as a
    CSV with a single ``message`` column and no index column.

    Note: this sets the process-wide locale (``LC_ALL``) to ``zh_CN.UTF-8``
    and does not restore it afterwards.

    :param data_dir: data root directory.
    :param input_filename: name of the file to sort.
    :param output_filename: name of the output file.
    :param delete: when true, each line is passed through ``replace_symbol``
        (punctuation removal) and lines whose cleaned text is at most one
        character long are dropped; when false, lines are only stripped of
        surrounding whitespace and all of them are kept.
    :return: None
    :raises locale.Error: if the ``zh_CN.UTF-8`` locale is not available.
    """
    locale.setlocale(locale.LC_ALL, locale='zh_CN.UTF-8')
    files = []
    line_number = 0
    inputs_dir = os.path.join(data_dir, input_filename)
    with open(inputs_dir, 'r') as fr:
        for line in fr:
            line_new = line.replace("\n", '').strip()
            if delete:
                line_new = replace_symbol(line_new)
                if len(line_new) <= 1:
                    continue
            files.append(line_new)
            line_number += 1
            # Progress is only reported in delete mode, as before.
            # TODO: alternatively save every 500000 lines to speed up saving.
            if delete and line_number % 10000 == 0:
                log.info(
                    "=============== process : {} ===============".
                    format(line_number))
    log.info(" Total lines : {}".format(line_number))

    ordered = sorted(files, key=cmp_to_key(locale.strcoll))
    # Building the frame from a dict keeps the "message" column even when no
    # line survived; assigning ``df.columns`` on an empty frame raised
    # ValueError.
    df = pd.DataFrame({'message': ordered})
    output_dir = os.path.join(data_dir, output_filename)
    log.info("Save file : {}".format(output_dir))
    df.to_csv(output_dir, index=False)
예제 #7
0
def read_newdata():
    """Tokenize every CSV under the question_new directory.

    For each file in ``/home/duyp/mayi_datasets/question/question_new`` the
    first column is read. Every string value is cleaned with
    ``replace_symbol`` and segmented with ``cut(..., add_stopwords=True)``;
    non-string values are skipped. The tokens of one file are written, space
    separated, to ``datasets/rawdata/<same file name>`` under ``root_path``.
    The total number of tokens across all files is logged at the end.

    :return: None
    """
    path = '/home/duyp/mayi_datasets/question/question_new'
    number = 0

    for file in os.listdir(path):
        filename = os.path.join(path, file)
        data = pd.read_csv(filename, lineterminator="\n").values
        out = []
        for x in tqdm(data):
            msg = x[0]
            if not isinstance(msg, str):
                continue
            before = len(out)
            out.extend(cut(replace_symbol(msg), add_stopwords=True))
            # Count via the list growth so any iterable returned by cut()
            # is supported.
            number += len(out) - before

        # A context manager closes the file even if joining or writing
        # fails. write() replaces writelines(), which iterated the joined
        # string one character at a time.
        target = os.path.join(root_path, "datasets/rawdata/{}".format(file))
        with open(target, 'w') as fw:
            fw.write(" ".join(out))
    log.info("{}".format(number))
예제 #8
0
def cutpro(inputs):
    """Segment *inputs* with jieba (full mode) plus price/date patterns.

    The lower-cased, symbol-cleaned text is cut with
    ``jieba.lcut(cut_all=True, HMM=True)`` and tokens found in ``sw2list``
    are dropped. Price mentions (digits followed by 元 or 块) from the raw
    text are appended as ``<price>_n``, and date-like spans (digits + 月/日
    twice, e.g. "5月1日") are appended verbatim. If ``isindict`` accepts the
    token list it is returned; otherwise the result of
    ``cutallcase(inputs)`` is returned instead.

    :param inputs: text to segment.
    :return: list of tokens for a string input, ``None`` for any other type.
    """
    if not isinstance(inputs, str):
        return None

    tokens = jieba.lcut(replace_symbol(inputs.lower()),
                        cut_all=True,
                        HMM=True)
    msg_cut = [tok for tok in tokens if tok not in sw2list]

    # Inside a character class "|" is a literal character, so the former
    # "[元|块]" / "[月|日]" also matched inputs such as "100|" or "5|1|".
    for price in re.findall(r"[0-9]+?[元块]", inputs):
        msg_cut.append("{}_{}".format(price, 'n'))
    msg_cut.extend(re.findall(r"[0-9]+?[月日][0-9]+?[月日]", inputs))

    if isindict(msg_cut):
        return msg_cut
    # Fall back to the fragment enumeration of the raw input.
    return cutallcase(inputs)