Example #1
import os
import pickle

import numpy as np

# txtRead, getChinese, word_segment_process, encoding_question and projectdir
# are helpers/constants assumed to come from the surrounding project.


def create_matrix_org_np(sen_count, word2vec_model, qa_path,
                         matrix_ques_path_word):
    """
      创建问题句向量,设置sen_count=10000, 防止内存不够奔溃
    :param sen_count: int, write sentence_encode num per twice
    :param word2vec_model: model
    :param qa_path: str
    :param matrix_ques_path: str
    :return: 
    """
    if os.path.exists(matrix_ques_path_word):
        with open(matrix_ques_path_word, 'rb') as file_matrix_ques:
            matrix_ques = pickle.load(file_matrix_ques)
        return matrix_ques
    print('create_matrix_org_np start!')
    qa_dail = txtRead(qa_path, encodeType='utf-8')
    # questions = []
    matrix_ques = []
    count = 0
    for qa_dail_one in qa_dail:
        ques = getChinese(qa_dail_one.split('\t')[0])
        # questions.append(ques)
        word_list, flag_list = word_segment_process(ques)
        sentence_vec = encoding_question(word2vec_model, word_list, flag_list)
        matrix_ques.append(sentence_vec)
        if len(matrix_ques) % sen_count == 0 and len(matrix_ques) != 0:
            print("count: " + str(count))
            count += 1
            np.savetxt(
                projectdir + "/Data/sentence_vec_encode_word/" + str(count) +
                ".txt", matrix_ques)
            matrix_ques = []
            # break

    count += 1
    np.savetxt(
        projectdir + "/Data/sentence_vec_encode_word/" + str(count) + ".txt",
        matrix_ques)
    # matrix_ques = []
    # file_matrix_ques = open(matrix_ques_path, 'wb')
    # pickle.dump(matrix_ques, file_matrix_ques)
    print('create_matrix_org_np ok!')
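
Example #1 leans on the project helpers word_segment_process and encoding_question to turn a segmented question into one vector. As a rough sketch only (assuming a gensim-style word2vec model with `word in model` / `model[word]` lookup; the project's real encoding_question may weight words by their POS flags), the sentence vector can be built by averaging the vectors of in-vocabulary words:

import numpy as np


def encoding_question_sketch(word2vec_model, word_list, flag_list=None, dim=300):
    """Hypothetical sketch: average the word2vec vectors of in-vocabulary words.

    flag_list (the POS tags) is accepted but unused here; dim=300 is an
    assumed vector size for the zero-vector fallback.
    """
    vectors = [word2vec_model[word] for word in word_list if word in word2vec_model]
    if not vectors:
        # No in-vocabulary word: fall back to a zero vector of the expected size.
        return np.zeros(dim, dtype=np.float32)
    return np.mean(vectors, axis=0)

Out-of-vocabulary words are simply skipped, so downstream cosine similarity degrades gracefully instead of failing on unseen tokens.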
Example #2
import os
import pickle

import numpy as np

# txtRead, getChinese, question_encoding and projectdir are helpers/constants
# assumed to come from the surrounding project.


def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path):
    """
      创建问题句向量
    :param sen_count: int
    :param word2vec_model: gensim model
    :param qa_path: str
    :param matrix_ques_path:str 
    :return: None
    """
    if os.path.exists(matrix_ques_path):
        with open(matrix_ques_path, 'rb') as file_matrix_ques:
            matrix_ques = pickle.load(file_matrix_ques)
        return matrix_ques
    print('create_matrix_org_np start!')
    qa_dail = txtRead(qa_path, encodeType='utf-8')
    # questions = []
    matrix_ques = []
    count = 0
    for qa_dail_one in qa_dail:
        ques = getChinese(qa_dail_one.split('\t')[0])
        char_list = [ques_char for ques_char in ques]
        sentence_vec = question_encoding(word2vec_model, char_list)
        matrix_ques.append(sentence_vec)
        if len(matrix_ques) % sen_count == 0 and len(matrix_ques) != 0:
            print("count: " + str(count))
            count += 1
            np.savetxt(
                projectdir + "/Data/sentence_vec_encode_char/" + str(count) +
                ".txt", matrix_ques)
            matrix_ques = []
            # break  # removed: breaking here would encode only the first batch

    count += 1
    np.savetxt(
        projectdir + "/Data/sentence_vec_encode_char/" + str(count) + ".txt",
        matrix_ques)

    print('create_matrix_org_np ok!')
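
Both variants flush the growing matrix to numbered text files (1.txt, 2.txt, ...) under Data/sentence_vec_encode_word/ or Data/sentence_vec_encode_char/, so the chunks have to be stitched back into one array before retrieval. A minimal loader sketch, assuming only that directory layout (load_question_matrix is a hypothetical helper, not part of the project):

import os

import numpy as np


def load_question_matrix(vec_dir):
    """Hypothetical helper: concatenate the numbered chunk files 1.txt, 2.txt, ..."""
    chunks = []
    index = 1
    while True:
        chunk_path = os.path.join(vec_dir, str(index) + ".txt")
        if not os.path.exists(chunk_path):
            break
        chunk = np.loadtxt(chunk_path)
        if chunk.size:
            chunks.append(np.atleast_2d(chunk))  # a single row comes back 1-D
        index += 1
    return np.vstack(chunks) if chunks else np.empty((0, 0))


# e.g. matrix_ques = load_question_matrix(projectdir + "/Data/sentence_vec_encode_char")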
Example #3
    # Create sentence vectors for the questions in the standard QA data and save them to matrix_ques_path
    if not os.path.exists(matrix_ques_part_path):
        create_matrix_org_np(sen_count=100000,
                             word2vec_model=word2vec_model,
                             qa_path=chicken_and_gossip_path,
                             matrix_ques_path_word=matrix_ques_part_path)

    # Load
    print("np.loadtxt(matrix_ques_part_path) start!")
    matrix_ques = np.loadtxt(matrix_ques_part_path)
    print("np.loadtxt(matrix_ques_part_path) end!")
    while True:
        print("你: ")
        ques_ask = input()
        ques_clean = getChinese(ques_ask)
        word_list, flag_list = word_segment_process(ques_clean)
        sentence_vic = encoding_question(word2vec_model, word_list, flag_list)
        top_20_qid = most_similar_sentence_vec(sentence_vic,
                                               matrix_ques,
                                               top_vec=20)
        try:
            print("小姜机器人: " +
                  syn_qa_dails[top_20_qid[0][0]].strip().split("\t")[1])
            print([(syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[0],
                    syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[1])
                   for i in range(len(top_20_qid))])
        except Exception as e:
            # some characters may fail to print
            print(str(e))
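
The retrieval loop in Example #3 hinges on most_similar_sentence_vec, which has to return (row_index, score) pairs given the query vector and the stored question matrix, since top_20_qid[i][0] is used to index back into syn_qa_dails. A hypothetical cosine-similarity version (the project's actual ranking may differ):

import numpy as np


def most_similar_sentence_vec_sketch(sentence_vec, matrix_ques, top_vec=20):
    """Hypothetical sketch: rank the rows of matrix_ques by cosine similarity.

    Returns a list of (row_index, score) pairs, best match first.
    """
    matrix = np.asarray(matrix_ques, dtype=np.float32)
    query = np.asarray(sentence_vec, dtype=np.float32)
    # Normalize rows and the query; a small epsilon avoids division by zero.
    matrix_norm = matrix / (np.linalg.norm(matrix, axis=1, keepdims=True) + 1e-9)
    query_norm = query / (np.linalg.norm(query) + 1e-9)
    scores = matrix_norm.dot(query_norm)
    top_idx = np.argsort(-scores)[:top_vec]
    return [(int(i), float(scores[i])) for i in top_idx]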
Example #4
        # matrix_ques = create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path=matrix_ques_part_path_char)
        create_matrix_org_np(sen_count=100000,
                             word2vec_model=word2vec_model,
                             qa_path=chicken_and_gossip_path,
                             matrix_ques_path=matrix_ques_part_path_char)

    # Load the standard question matrix
    print("np.loadtxt(matrix_ques_part_path) start!")
    matrix_ques = np.loadtxt(matrix_ques_part_path_char)
    print("np.loadtxt(matrix_ques_part_path) end!")
    # Initialize and preprocess the standard question matrix
    matrix_org_norm, matrix_org_index, top_vec = basic_questions_matrix_init(
        matrix_ques, top_vec=20)

    ### Test with a single example
    ques_clean = getChinese("小姜机器人是谁呀")
    char_list = [ques_char for ques_char in ques_clean]
    sentence_vec = question_encoding(word2vec_model, char_list)
    top_20_qid = calculate_text_similar(sentence_vec,
                                        matrix_org_norm,
                                        matrix_org_index,
                                        top_vec=top_vec)
    try:
        print("小姜机器人: " +
              syn_qa_dails[top_20_qid[0][0]].strip().split("\t")[1])
        print([(syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[0],
                syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[1])
               for i in range(len(top_20_qid))])
    except Exception as e:
        # some characters may fail to print
        print(str(e))
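
Example #4 splits the same retrieval idea into two calls: basic_questions_matrix_init does the one-off preprocessing and calculate_text_similar scores each incoming query. A plausible sketch of that split, assuming cosine similarity over row-normalized vectors (the function names match the example, but the bodies below are assumptions, not the project's implementation):

import numpy as np


def basic_questions_matrix_init_sketch(matrix_ques, top_vec=20):
    """Hypothetical sketch: pre-normalize the rows once so each query is one dot product."""
    matrix_org = np.asarray(matrix_ques, dtype=np.float32)
    norms = np.linalg.norm(matrix_org, axis=1, keepdims=True) + 1e-9
    matrix_org_norm = matrix_org / norms
    matrix_org_index = np.arange(matrix_org.shape[0])  # row id -> question id
    return matrix_org_norm, matrix_org_index, top_vec


def calculate_text_similar_sketch(sentence_vec, matrix_org_norm, matrix_org_index, top_vec=20):
    """Hypothetical sketch: cosine scores via a dot product on the pre-normalized matrix."""
    query = np.asarray(sentence_vec, dtype=np.float32)
    query = query / (np.linalg.norm(query) + 1e-9)
    scores = matrix_org_norm.dot(query)
    top_idx = np.argsort(-scores)[:top_vec]
    return [(int(matrix_org_index[i]), float(scores[i])) for i in top_idx]

Doing the normalization once in the init step keeps the per-query cost to a single matrix-vector product plus a top-k sort.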