def classify_pair_corpus(bert_model):
    # data preprocessing: read the WeBank sentence-pair corpus, build one-hot labels and a 90/10 split
    from utils.text_tools import text_preprocess, txtRead, txtWrite
    from conf.path_config import path_webank_sim
    import numpy as np
    import random

    webank_q_2_l = txtRead(path_webank_sim, encodeType='gbk')
    questions = []
    labels = []
    for ques_label in webank_q_2_l[1:]:
        q_2_l = ques_label.split(',')
        q_1 = q_2_l[0]
        q_2 = "".join(q_2_l[1:-1])
        label = q_2_l[-1]
        questions.append([text_preprocess(q_1), text_preprocess(q_2)])
        label_int = int(label)
        labels.append([0, 1] if label_int == 1 else [1, 0])

    questions = np.array(questions)
    labels = np.array(labels)
    index = list(range(len(labels)))
    random.shuffle(index)
    questions = questions[index]
    labels = labels[index]
    len_train = int(len(labels) * 0.9)

    train_x, train_y = questions[0:len_train], labels[0:len_train]
    test_x, test_y = questions[len_train:], labels[len_train:]

    input_ids, input_masks, input_type_ids = bert_model.process_pair(train_x)
    input_ids2, input_masks2, input_type_ids2 = bert_model.process_pair(test_x)

    return train_x, train_y, test_x, test_y, input_ids, input_masks, input_type_ids, input_ids2, input_masks2, input_type_ids2
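
The shuffle, 90/10 split and one-hot label layout used above can be checked in isolation. A minimal, self-contained sketch on toy data (the toy pairs and labels are illustrative only, not from the WeBank corpus):

import random
import numpy as np

# Toy sentence pairs and binary labels, mimicking the WeBank csv format.
pairs = np.array([["q1a", "q1b"], ["q2a", "q2b"], ["q3a", "q3b"], ["q4a", "q4b"]])
raw_labels = [1, 0, 1, 0]

# One-hot encode: 1 -> [0, 1] (similar), 0 -> [1, 0] (not similar).
labels = np.array([[0, 1] if l == 1 else [1, 0] for l in raw_labels])

# Shuffle both arrays with the same index permutation, then split 90/10.
index = list(range(len(labels)))
random.shuffle(index)
pairs, labels = pairs[index], labels[index]
len_train = int(len(labels) * 0.9)
train_x, train_y = pairs[:len_train], labels[:len_train]
test_x, test_y = pairs[len_train:], labels[len_train:]
print(train_x.shape, train_y.shape, test_x.shape, test_y.shape)
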
def chatbot_sentence_vec_by_bert_bertasserver():
    """bert encode is used bert as server"""
    from conf.path_config import chicken_and_gossip_path
    from bert_serving.client import BertClient
    from utils.text_tools import txtRead
    import numpy as np

    topk = 5
    matrix_ques_save_path = "doc_vecs_chicken_and_gossip"
    questions = txtRead(chicken_and_gossip_path, encodeType='utf-8')
    ques = [ques.split('\t')[0] for ques in questions][0:100]

    bc = BertClient(ip='localhost')
    doc_vecs = bc.encode(ques)
    np.savetxt(matrix_ques_save_path, doc_vecs)
    # matrix_ques = np.loadtxt(matrix_ques_save_path)

    while True:
        query = input('你问: ')
        query_vec = np.array(bc.encode([query])[0])
        # dot product normalized by the candidate norms (the query norm is constant, so the ranking is unchanged)
        score = np.sum(query_vec * doc_vecs, axis=1) / np.linalg.norm(doc_vecs, axis=1)
        topk_idx = np.argsort(score)[::-1][:topk]
        for idx in topk_idx:
            print('小姜机器人回答: %s\t%s' % (score[idx], questions[idx]))
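
For reference, the same ranking can also be computed with a fully normalized cosine similarity. A minimal, self-contained sketch on random vectors (the 768 dimension is assumed as the usual BERT hidden size, and the random vectors stand in for the encoded questions and query):

import numpy as np

rng = np.random.default_rng(0)
doc_vecs = rng.normal(size=(100, 768))   # stand-in for the encoded questions
query_vec = rng.normal(size=(768,))      # stand-in for the encoded query

# Cosine similarity: normalize by both the query norm and every candidate norm.
scores = doc_vecs @ query_vec / (np.linalg.norm(doc_vecs, axis=1) * np.linalg.norm(query_vec))
topk_idx = np.argsort(scores)[::-1][:5]
print(topk_idx, scores[topk_idx])
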
Example #3
def cut_td_idf(sources_path, target_path):
    """
    结巴切词,汉语
    :param path: 
    :return: 
    """
    print("cut_td_idf start! ")
    corpus = txtRead(sources_path)
    governments = []
    for corpus_one in corpus:
        corpus_one_clear = corpus_one.replace(' ', '').strip()
        ques_q2b = strQ2B(corpus_one_clear.strip())
        ques_q2b_syboml = get_syboml(ques_q2b)
        governments.append(ques_q2b_syboml.strip())

    government_ques = list(map(lambda x: ' '.join(jieba.lcut(x)), governments))

    topic_ques_all = []
    for topic_ques_one in government_ques:
        top_ques_aqlq = topic_ques_one.replace('   ', ' ').replace(
            '  ', ' ').strip() + '\n'
        topic_ques_all.append(top_ques_aqlq)

    txtWrite(topic_ques_all, target_path)
    print("cut_td_idf ok! " + sources_path)
def tok_td_idf(data_path):
    vec_tdidf = None
    if os.path.exists(data_path + 'td_idf_cut.csv'):
        # compute TF-IDF over the pre-segmented corpus and persist the fitted vectorizer
        datas = txtRead(data_path + 'td_idf_cut.csv')
        # the default token_pattern only keeps words of length >= 2; r"(?u)\b\w+\b" also keeps single-character
        # tokens, and ngram_range=(1, 2) adds bigram features (about 50,428 terms in total for this corpus)
        # vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=1, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=30000)
        vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=3,
                                    max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=50000)
        vec_tdidf.fit_transform(datas)
        with open(data_path + 'td_idf_cut_model.pkl', 'wb') as file_vec_tdidf:
            pickle.dump(vec_tdidf, file_vec_tdidf)

    return vec_tdidf
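
A minimal, self-contained sketch of why token_pattern=r"(?u)\b\w+\b" matters for pre-segmented Chinese text (the toy sentences are illustrative only; with the default pattern, single-character tokens would be dropped):

from sklearn.feature_extraction.text import TfidfVectorizer

# Space-joined tokens, as produced by cut_td_idf above (toy examples).
docs = ["我 想 借钱", "借款 利率 是 多少", "我 的 借款 额度"]

vec = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=1)
tfidf = vec.fit_transform(docs)
print(sorted(vec.vocabulary_))   # unigrams and bigrams, including single characters such as "我"
print(tfidf.shape)
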
Example #5
def init_tfidf_chinese_or_pinyin(sources_path):
    """
      构建td_idf
    :param path: 
    :return: 
    """
    questions = txtRead(sources_path)
    corpora_documents = []
    for item_text in questions:
        item_seg = list(jieba.cut(str(item_text).strip()))
        corpora_documents.append(item_seg)

    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    tfidf_model = models.TfidfModel(corpus)
    print("init_tfidf_chinese_or_pinyin ok! " + sources_path)
    with open(sources_path.replace(".csv", "_dictionary_model.pkl"), 'wb') as file:
        pickle.dump([dictionary, tfidf_model], file)
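
A minimal, self-contained sketch of how the pickled dictionary and TfidfModel can then weight a new segmented sentence (toy token lists; in the project the segmentation comes from jieba as above):

from gensim import corpora, models

# Toy segmented corpus (lists of tokens).
docs = [["我", "想", "借钱"], ["借款", "利率", "是", "多少"], ["我", "的", "借款", "额度"]]

dictionary = corpora.Dictionary(docs)
tfidf_model = models.TfidfModel([dictionary.doc2bow(d) for d in docs])

# Weight an unseen segmented sentence.
bow = dictionary.doc2bow(["借款", "利率"])
print(tfidf_model[bow])   # [(term_id, tfidf_weight), ...]
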
Example #6
def creat_train_data_of_cg_corpus(limit=50, x_limit=2, y_limit=2):
    x_datas = []
    y_datas = []
    max_len = 0
    sim_ali_web_gov_dli_datas = txtRead(chicken_and_gossip_path,
                                        encodeType="utf-8")
    for sim_ali_web_gov_dli_datas_one in sim_ali_web_gov_dli_datas[1:]:
        if sim_ali_web_gov_dli_datas_one:
            sim_ali_web_gov_dli_datas_one_split = sim_ali_web_gov_dli_datas_one.strip().split("\t")
            if len(sim_ali_web_gov_dli_datas_one_split) == 2:
                # if sim_ali_web_gov_dli_datas_one_split[2]=="1":
                len_x1 = len(sim_ali_web_gov_dli_datas_one_split[0])
                len_x2 = len(sim_ali_web_gov_dli_datas_one_split[1])
                # if max_len < len_x1 or max_len < len_x2:
                max_len = max(len_x1, len_x2, max_len)

                sentence_org = regular(sim_ali_web_gov_dli_datas_one_split[0],
                                       limit=limit)
                sentence_sim = regular(sim_ali_web_gov_dli_datas_one_split[1],
                                       limit=limit)
                # character-level token lists for the encoder input and decoder target
                x_datas.append(list(sentence_org))
                y_datas.append(list(sentence_sim))
                # x_datas.append([sen for sen in sentence_sim])
                # y_datas.append([sen for sen in sentence_org])

    datas = list(zip(x_datas, y_datas))
    datas = [(x, y) for x, y in datas if len(x) < limit and len(y) < limit
             and len(y) >= y_limit and len(x) >= x_limit]
    x_datas, y_datas = zip(*datas)

    print('fit word_sequence')

    ws_input = WordSequence()
    ws_input.fit(x_datas + y_datas)

    print('dump')

    with open(chatbot_data_cg_xy_anti, 'wb') as f_xy:
        pickle.dump((x_datas, y_datas), f_xy)
    with open(chatbot_data_cg_ws_anti, 'wb') as f_ws:
        pickle.dump(ws_input, f_ws)

    print('done')
    print(max_len)
Example #7
def create_matrix_org_np(sen_count, word2vec_model, qa_path,
                         matrix_ques_path_word):
    """
      创建问题句向量,设置sen_count=10000, 防止内存不够奔溃
    :param sen_count: int, write sentence_encode num per twice
    :param word2vec_model: model
    :param qa_path: str
    :param matrix_ques_path: str
    :return: 
    """
    if os.path.exists(matrix_ques_path_word):
        with open(matrix_ques_path_word, 'rb') as file_matrix_ques:
            matrix_ques = pickle.load(file_matrix_ques)
        return matrix_ques
    print('create_matrix_org_np start!')
    qa_dail = txtRead(qa_path, encodeType='utf-8')
    # questions = []
    matrix_ques = []
    count = 0
    for qa_dail_one in qa_dail:
        ques = getChinese(qa_dail_one.split('\t')[0])
        # questions.append(ques)
        word_list, flag_list = word_segment_process(ques)
        sentence_vec = encoding_question(word2vec_model, word_list, flag_list)
        matrix_ques.append(sentence_vec)
        if len(matrix_ques) % sen_count == 0 and len(matrix_ques) != 0:
            print("count: " + str(count))
            count += 1
            np.savetxt(
                projectdir + "/Data/sentence_vec_encode_word/" + str(count) +
                ".txt", matrix_ques)
            matrix_ques = []
            # break

    count += 1
    np.savetxt(
        projectdir + "/Data/sentence_vec_encode_word/" + str(count) + ".txt",
        matrix_ques)
    # matrix_ques = []
    # file_matrix_ques = open(matrix_ques_path, 'wb')
    # pickle.dump(matrix_ques, file_matrix_ques)
    print('create_matrix_org_np ok!')
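
word_segment_process and encoding_question are repo-internal helpers not shown in this snippet. A common sentence-encoding choice they could correspond to (an assumption, shown only as a self-contained sketch) is averaging the word vectors that exist in the model's vocabulary; a plain dict of vectors stands in for gensim KeyedVectors here, since both support `in` and `[]`:

import numpy as np

def mean_pool_sentence_vector(word2vec_model, word_list, dim=300):
    """Average the vectors of in-vocabulary words; dim=300 is an assumed embedding size."""
    vecs = [word2vec_model[w] for w in word_list if w in word2vec_model]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim)

toy_model = {"借款": np.ones(4), "利率": np.full(4, 2.0)}
print(mean_pool_sentence_vector(toy_model, ["借款", "利率", "额度"], dim=4))  # "额度" is out of vocabulary
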
def chatbot_sentence_vec_by_bert_own():
    """bert encode is writted by my own"""
    from FeatureProject.bert.extract_keras_bert_feature import KerasBertVector
    from conf.path_config import chicken_and_gossip_path
    from utils.text_tools import txtRead
    import numpy as np

    # read the data and a few parameters; only the first 100 standard questions are used here
    topk = 5
    matrix_ques_save_path = "doc_vecs_chicken_and_gossip"
    questions = txtRead(chicken_and_gossip_path, encodeType='utf-8')
    ques = [ques.split('\t')[0] for ques in questions][0:100]

    # encode the standard questions into BERT sentence vectors
    bert_vector = KerasBertVector()
    ques_basic_vecs = bert_vector.bert_encode(ques)

    # in production you can precompute and save these vectors, then simply load them
    np.savetxt(matrix_ques_save_path, ques_basic_vecs)
    # matrix_ques = np.loadtxt(matrix_ques_save_path)

    query_bert_vec = bert_vector.bert_encode(["小姜机器人是什么"])[0]
    query_bert_vec = np.array(query_bert_vec)
    print(query_bert_vec)
    # plain matrix dot product is fast; tools such as annoy make retrieval even faster
    qq_score = np.sum(query_bert_vec * ques_basic_vecs, axis=1) / np.linalg.norm(ques_basic_vecs, axis=1)
    topk_idx = np.argsort(qq_score)[::-1][:topk]
    for idx in topk_idx:
        print('小姜机器人回答检索: %s\t%s' % (qq_score[idx], questions[idx]))


    while True:
        print("你的问题:")
        query = input()
        query_bert_vec = bert_vector.bert_encode([query])[0]
        query_bert_vec = np.array(query_bert_vec)
        # plain matrix dot product is fast; tools such as annoy make retrieval even faster
        qq_score = np.sum(query_bert_vec * ques_basic_vecs, axis=1) / np.linalg.norm(ques_basic_vecs, axis=1)
        topk_idx = np.argsort(qq_score)[::-1][:topk]
        for idx in topk_idx:
            print('小姜机器人回答检索: %s\t%s' % (qq_score[idx], questions[idx]))
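
The comments above suggest annoy as a faster alternative to the brute-force dot product. A minimal, self-contained sketch of that idea on random vectors (the dimension, number of trees and other index parameters are assumptions, not taken from this project):

import numpy as np
from annoy import AnnoyIndex

dim, topk = 768, 5                       # 768 assumed as the usual BERT hidden size
rng = np.random.default_rng(0)
doc_vecs = rng.normal(size=(100, dim))   # stand-in for ques_basic_vecs

index = AnnoyIndex(dim, 'angular')       # angular distance corresponds to cosine similarity
for i, vec in enumerate(doc_vecs):
    index.add_item(i, vec)
index.build(10)                          # 10 trees: more trees give better recall but a larger index

query_vec = rng.normal(size=(dim,))
idx, dists = index.get_nns_by_vector(query_vec, topk, include_distances=True)
print(idx, dists)
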
Example #9
def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path):
    """
      创建问题句向量
    :param sen_count: int
    :param word2vec_model: gensim model
    :param qa_path: str
    :param matrix_ques_path:str 
    :return: None
    """
    if os.path.exists(matrix_ques_path):
        with open(matrix_ques_path, 'rb') as file_matrix_ques:
            matrix_ques = pickle.load(file_matrix_ques)
        return matrix_ques
    print('create_matrix_org_pkl start!')
    qa_dail = txtRead(qa_path, encodeType='utf-8')
    # questions = []
    matrix_ques = []
    count = 0
    for qa_dail_one in qa_dail:
        ques = getChinese(qa_dail_one.split('\t')[0])
        char_list = [ques_char for ques_char in ques]
        sentence_vec = question_encoding(word2vec_model, char_list)
        matrix_ques.append(sentence_vec)
        if len(matrix_ques) % sen_count == 0 and len(matrix_ques) != 0:
            print("count: " + str(count))
            count += 1
            np.savetxt(
                projectdir + "/Data/sentence_vec_encode_char/" + str(count) +
                ".txt", matrix_ques)
            matrix_ques = []
            # break

    count += 1
    np.savetxt(
        projectdir + "/Data/sentence_vec_encode_char/" + str(count) + ".txt",
        matrix_ques)

    print('create_matrix_org_pkl ok!')
def classify_pair_corpus_webank(bert_model, path_webank):
    # data preprocessing: read a WeBank sentence-pair file and build one-hot labels
    from utils.text_tools import text_preprocess, txtRead, txtWrite
    import numpy as np
    import random

    webank_q_2_l = txtRead(path_webank, encodeType='utf-8')
    questions = []
    labels = []
    for ques_label in webank_q_2_l[1:]:
        q_2_l = ques_label.split(',')
        q_1 = q_2_l[0]
        q_2 = "".join(q_2_l[1:-1])
        label = q_2_l[-1]
        questions.append([text_preprocess(q_1), text_preprocess(q_2)])
        label_int = int(label)
        labels.append([0, 1] if label_int == 1 else [1, 0])

    questions = np.array(questions)
    labels = np.array(labels)

    input_ids, input_masks, input_type_ids = bert_model.process_pair(questions)

    return questions, labels, input_ids, input_masks, input_type_ids
Example #11
def cut_td_idf_pinyin(sources_path, target_path):  # convert Chinese to pinyin
    """
    Convert Chinese text to pinyin.
    :param sources_path: str, input corpus file
    :param target_path: str, output file of space-joined pinyin
    :return: None
    """
    pin = xpinyin.Pinyin()
    corpus = txtRead(sources_path)
    topic_ques_all = []
    corpus_count = 0
    for corpus_one in corpus:
        corpus_count += 1
        # time1 = time.time()
        corpus_one_clear = corpus_one.replace(' ', '').strip()
        ques_q2b = strQ2B(corpus_one_clear.strip())
        ques_q2b_syboml = get_syboml(ques_q2b)
        ques_q2b_syboml_pinying = pin.get_pinyin(
            ques_q2b_syboml.replace('   ', '').replace('  ', '').strip(), ' ')
        topic_ques_all.append(ques_q2b_syboml_pinying + '\n')
        # time2 = time.time()
        # print(str(corpus_count) + 'time:' + str(time2 - time1))
    txtWrite(topic_ques_all, target_path)
    print("cut_td_idf_pinyin ok! " + sources_path)
Example #12
            # break

    count += 1
    np.savetxt(
        projectdir + "/Data/sentence_vec_encode_word/" + str(count) + ".txt",
        matrix_ques)
    # matrix_ques = []
    # file_matrix_ques = open(matrix_ques_path, 'wb')
    # pickle.dump(matrix_ques, file_matrix_ques)
    print('create_matrix_org_np ok!')
    # return matrix_ques


if __name__ == '__main__':
    # read the QA corpus
    syn_qa_dails = txtRead(chicken_and_gossip_path, encodeType='utf-8')

    # load word vectors: w2v_model_wiki_word_path is self-trained, while w2v_model_merge_short_path is a partial set available for download
    if os.path.exists(w2v_model_wiki_word_path):
        word2vec_model = load_word2vec_model(w2v_model_wiki_word_path,
                                             limit=None)
        print("load w2v_model_wiki_word_path ok!")
    else:
        word2vec_model = load_word2vec_model(w2v_model_merge_short_path,
                                             limit=None)
        print("load w2v_model_merge_short_path ok!")

    # build sentence vectors for the standard questions and cache them to matrix_ques_path
    if not os.path.exists(matrix_ques_part_path):
        create_matrix_org_np(sen_count=100000,
                             word2vec_model=word2vec_model,
Example #13
def statistics_keyword_by_label(path, rate=1):
    """
    judge is total chinese or not, 判断是不是全是数字
    Args:
        path: str, eg. "train.json"
        rate: float, eg. 0.75
    Returns:
        None
    """
    datas = txtRead(path)

    lwd = {}
    for i in tqdm(range(len(datas)), desc="jieba cut and statistics: "):
        # get the text and label from the json record, then segment the text
        d = datas[i]
        d_json = json.loads(d)
        text = d_json.get("x", {}).get("text")
        label = d_json.get("y")
        word_list = list(jieba.cut(text))
        # drop stop words, all-digit tokens and single-character tokens
        word_list = [
            wl for wl in word_list if wl not in stop_words
            and not is_total_number(wl) and len(wl) >= 2
        ]
        # word-frequency statistics within the label (Counter.update accumulates counts,
        # whereas dict.update would overwrite them)
        word_freq_dict = Counter(word_list)
        if label not in lwd:
            lwd[label] = word_freq_dict
        else:
            lwd[label].update(word_freq_dict)

    # sort by frequency and keep the top `rate` fraction for each label
    lwd_keys = list(lwd.keys())
    lwd_soft = [
        sorted(lwd[l].items(), key=lambda x: x[1], reverse=True)
        for l in lwd_keys
    ]
    lwd_soft_rate = [s[:int(len(s) * rate)] for s in lwd_soft]
    label_word_dict = {
        lwd_keys[i]: OrderedDict(lwd_soft_rate[i])
        for i in range(len(lwd_keys))
    }
    print("cut ok!")
    # collect the vocabulary that is unique to each label
    label_keys = set(list(label_word_dict.keys()))
    label_words = {}
    for key in label_keys:
        key_dict = set(list(label_word_dict[key].keys()))
        keys_other = copy.deepcopy(label_keys)
        keys_other.discard(key)
        # vocabulary of all the other labels
        kos = set()
        for ko in keys_other:
            ko_dict = set(list(label_word_dict[ko].keys()))
            kos = kos | ko_dict

        # words that only this label uses
        key_public = kos & key_dict
        key_label = key_dict - key_public

        label_word_freq = {kl: label_word_dict[key][kl] for kl in key_label}
        label_words[key] = label_word_freq

    save_json(label_words, "label_keyword_unique.json")
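
The unique-keyword step above is plain set arithmetic over per-label counters. A minimal, self-contained sketch on toy data (the labels and words are illustrative only):

from collections import Counter

label_word_dict = {
    "loan":  Counter({"借款": 5, "利率": 3, "额度": 2}),
    "repay": Counter({"还款": 4, "利率": 2, "逾期": 1}),
}

label_words = {}
for key in label_word_dict:
    others = set()
    for other in label_word_dict:
        if other != key:
            others |= set(label_word_dict[other])
    unique = set(label_word_dict[key]) - others          # words only this label uses
    label_words[key] = {w: label_word_dict[key][w] for w in unique}

print(label_words)   # "利率" is shared and dropped; 借款/额度 stay with loan, 还款/逾期 with repay
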
Example #14
    end_time3 = time.time()

    # print('end_time1: ' + str(end_time1 - start_time))
    # print('end_time2: ' + str(end_time2 - start_time))
    # print('end_time3: ' + str(end_time3 - start_time))

    return result
    # [fuzz.WRatio, fuzz.QRatio,
    #  fuzz.token_set_ratio, fuzz.token_sort_ratio,
    #  fuzz.partial_token_set_ratio, fuzz.partial_token_sort_ratio,
    #  fuzz.UWRatio, fuzz.UQRatio]


if __name__ == '__main__':
    start_time = time.time()
    qa_list = txtRead(chicken_and_gossip_path)
    questions = [qa.strip().split("\t")[0] for qa in qa_list]
    print("read questions ok!")
    sen = "你谁呀"
    # list_fuzzyfinder = fuzzyfinder(base_syn_one_split[1], qa_list)
    # list_fuzzyfinder = fuzzy_fuzzywuzzy(fuzz, base_syn_one_split[1], qa_list)
    print("你问: " + "你谁呀")
    list_fuzzyfinder = fuzzy_fuzzywuzzy_list(fuzz,
                                             sen,
                                             qa_list,
                                             questions,
                                             topn=5)
    print("小姜机器人: " + list_fuzzyfinder[0][0].split("\t")[1].strip())
    print("推荐结果: ")
    print(list_fuzzyfinder)
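
fuzzy_fuzzywuzzy_list is a repo helper not shown in this snippet. For comparison, a minimal, self-contained sketch of the same kind of top-n retrieval using fuzzywuzzy's own process.extract (the toy question list is illustrative only):

from fuzzywuzzy import fuzz, process

questions = ["你是谁呀", "你叫什么名字", "今天天气怎么样", "怎么借钱"]
query = "你谁呀"

# Top-3 fuzzy matches with their scores, ranked by weighted ratio.
matches = process.extract(query, questions, scorer=fuzz.WRatio, limit=3)
print(matches)   # [(question, score), ...] sorted by score descending
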
Example #15
    gen_all_syn = []
    for generated_hot_one in generated_hot:
        generated_hot_one_1 = [generated_hot_one]
        generated_str = generate_random_select(generated_hot_one_1,
                                               model_txt,
                                               twice=1000,
                                               len_min=5)
        if generated_str:
            gen_all_syn = gen_all_syn + generated_str
    # keep only the generated sentences that are not already in the original corpus
    gen_all_syn = list(set(gen_all_syn))
    # intersection of the generated sentences and the originals
    syn_intersection = list(set(sentence_list).intersection(set(gen_all_syn)))
    # generated sentences minus that intersection
    gen_syns = list(set(gen_all_syn).difference(set(syn_intersection)))
    return gen_syns


if __name__ == "__main__":
    # read a file, then generate augmented sentences from it
    txt_path = chicken_and_gossip_path
    sentence_list = txtRead(txt_path)
    sentence_list = sentence_list[0:100]
    enhance_texts = generate_syns_from_list(sentence_list,
                                            begin_word="tfidf",
                                            p=0.1)
    for enhance_texts_one in enhance_texts:
        try:
            print(enhance_texts_one)
        except Exception as e:
            print(str(e))