Example #1
def w2v(word):
    # Special tokens such as "UNK" are looked up verbatim; every other word
    # is lower-cased first.
    if word != "UNK":
        word = word.lower()
    index = data_helpers.word2id(word)
    if index == -1:
        raise ValueError("{} doesn't exist in the vocabulary.".format(word))
    return word_vector[0][index]
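
A minimal usage sketch, not part of the original example: it assumes word_vector has already been loaded at module level (with its first element being the embedding matrix, as the indexing in w2v implies) and that data_helpers.word2id returns -1 for out-of-vocabulary words.

vec = w2v("Apple")   # lower-cased to "apple" before lookup
unk = w2v("UNK")     # the UNK token is looked up verbatim
print(len(vec))      # e.g. 300 for 300-dimensional vectors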
Example #2
import numpy as np

import data_helpers


def preprocess():
    '''
    Read the data from the text file. FLAGS and use_data are assumed to be
    defined at module level.
    :return:    sen word id: [324,1413,1,41,43,0,0,0]
                sen len: [5]
                sen max len: [8]
                sen label: [0,0,1]
                target word id: [34,154,0,0]
                target len: [2]
                target max len: [4]
                targets word id: [[34,154,0,0],
                                  [34,14,12,56],
                                  [0,0,0,0]]
                targets num: 2
                targets len: [2,4,0]
                targets max num: [3]
                targets_relation_self = [[1,0,0],
                                         [0,1,0],
                                         [0,0,0]]
                targets_relation_cross = [[0,1,0],
                                          [1,0,0],
                                          [0,0,0]]
    '''
    # Data Preparation
    # ==================================================
    # Load data
    print("Loading data...")
    train_x_str, train_target_str, train_y = data_helpers.load_data_and_labels(
        FLAGS.train_file)
    # There is no separate dev file: the dev split reuses the test file.
    dev_x_str, dev_target_str, dev_y = data_helpers.load_data_and_labels(
        FLAGS.test_file)
    test_x_str, test_target_str, test_y = data_helpers.load_data_and_labels(
        FLAGS.test_file)

    # A sentence becomes word ids such as [324,1413,1,41,43,0,0,0] with a
    # label such as [0,1]. word_id_mapping maps word -> id (e.g. apple -> 23);
    # w2v maps id -> vector (e.g. 23 -> [vector]).
    word_id_mapping, w2v = data_helpers.load_w2v(FLAGS.embedding_file_path,
                                                 300)
    max_document_length = max(
        [len(x.split(" ")) for x in (train_x_str + dev_x_str + test_x_str)])
    max_target_length = max([
        len(x.split(" "))
        for x in (train_target_str + dev_target_str + test_target_str)
    ])

    # All targets per sentence: [[[141,23,45],[23,45,1,2],[2]], ...]
    # Number of targets per sentence: [3, ...]
    train_targets_str, train_targets_num = data_helpers.load_targets(
        FLAGS.train_file)
    dev_targets_str, dev_targets_num = data_helpers.load_targets(
        FLAGS.test_file)
    test_targets_str, test_targets_num = data_helpers.load_targets(
        FLAGS.test_file)
    max_target_num = max(
        [len(x) for x in (train_targets_str + test_targets_str)])

    # sentence ---> word_id
    train_x, train_x_len = data_helpers.word2id(train_x_str, word_id_mapping,
                                                max_document_length)
    dev_x, dev_x_len = data_helpers.word2id(dev_x_str, word_id_mapping,
                                            max_document_length)
    test_x, test_x_len = data_helpers.word2id(test_x_str, word_id_mapping,
                                              max_document_length)
    # target ---> word_id
    train_target, train_target_len = data_helpers.word2id(
        train_target_str, word_id_mapping, max_target_length)
    dev_target, dev_target_len = data_helpers.word2id(dev_target_str,
                                                      word_id_mapping,
                                                      max_target_length)
    test_target, test_target_len = data_helpers.word2id(
        test_target_str, word_id_mapping, max_target_length)
    # targets ---> word_id
    train_targets, train_targets_len = data_helpers.word2id_2(
        train_targets_str, word_id_mapping, max_target_length, max_target_num)
    dev_targets, dev_targets_len = data_helpers.word2id_2(
        dev_targets_str, word_id_mapping, max_target_length, max_target_num)
    test_targets, test_targets_len = data_helpers.word2id_2(
        test_targets_str, word_id_mapping, max_target_length, max_target_num)

    # One-hot indicator of which target, among all targets in the sentence,
    # each example refers to
    train_target_whichone = data_helpers.get__whichtarget(
        train_targets_num, max_target_num)
    test_target_whichone = data_helpers.get__whichtarget(
        test_targets_num, max_target_num)
    # target position
    train_target_position = data_helpers.get_position(FLAGS.train_file,
                                                      max_document_length)
    test_target_position = data_helpers.get_position(FLAGS.test_file,
                                                     max_document_length)

    train_targets_position = data_helpers.get_position_2(
        train_target_position, train_targets_num, max_target_num)
    test_targets_position = data_helpers.get_position_2(
        test_target_position, test_targets_num, max_target_num)
    # When pre-computed BERT embeddings are used, the id inputs are replaced
    # with embedding tensors loaded from disk.
    if use_data == 'Restaurants':
        train_x = np.load(
            "data/data_res/bert_embedding/Res_Train_Embedding.npy")  # (3608, 80, 768)
        train_target = np.load(
            "data/data_res/bert_embedding/Res_Train_target_Embedding.npy")  # (3608, 23, 768)
        train_targets = np.load(
            "data/data_res/bert_embedding/Res_Train_targets_Embedding.npy")  # (3608, 13, 23, 768)
        test_x = np.load(
            "data/data_res/bert_embedding/Res_Test_Embedding.npy")  # (1120, 80, 768)
        test_target = np.load(
            "data/data_res/bert_embedding/Res_Test_target_Embedding.npy")  # (1120, 23, 768)
        test_targets = np.load(
            "data/data_res/bert_embedding/Res_Test_targets_Embedding.npy")  # (1120, 13, 23, 768)

    if use_data == 'Laptops':
        train_x = np.load(
            "data/data_lap/bert_embedding/Lap_Train_Embedding.npy")  # (3608, 80, 768)
        train_target = np.load(
            "data/data_lap/bert_embedding/Lap_Train_target_Embedding.npy")  # (3608, 23, 768)
        train_targets = np.load(
            "data/data_lap/bert_embedding/Lap_Train_targets_Embedding.npy")  # (3608, 13, 23, 768)
        test_x = np.load(
            "data/data_lap/bert_embedding/Lap_Test_Embedding.npy")  # (1120, 80, 768)
        test_target = np.load(
            "data/data_lap/bert_embedding/Lap_Test_target_Embedding.npy")  # (1120, 23, 768)
        test_targets = np.load(
            "data/data_lap/bert_embedding/Lap_Test_targets_Embedding.npy")  # (1120, 13, 23, 768)

    # Relation matrices, created from the number of targets per sentence
    train_relation_self, train_relation_cross = data_helpers.get_relation(
        train_targets_num, max_target_num, FLAGS.which_relation)
    test_relation_self, test_relation_cross = data_helpers.get_relation(
        test_targets_num, max_target_num, FLAGS.which_relation)
    Train = {
        'x': train_x,                     # int32 (3608, 79, 768)  sentence input
        'T': train_target,                # int32 (3608, 23, 768)  target input
        'Ts': train_targets,              # int32 (3608, 13, 23, 768)  all-targets input
        'x_len': train_x_len,             # int32 (3608,)  sentence lengths
        'T_len': train_target_len,        # int32 (3608,)  target lengths
        'Ts_len': train_targets_len,      # int32 (3608, 13)  per-target lengths
        'T_W': train_target_whichone,     # int32 (3608, 13)  which target among all targets
        'T_P': train_target_position,     # float32 (3608, 79)  target position
        'Ts_P': train_targets_position,   # float32 (3608, 13, 79)
        'R_Self': train_relation_self,    # int32 (3608, 13, 13)
        'R_Cross': train_relation_cross,  # int32 (3608, 13, 13)
        'y': train_y,                     # int32 (3608, 3)  labels
    }
    Test = {
        'x': test_x,
        'T': test_target,
        'Ts': test_targets,
        'x_len': test_x_len,
        'T_len': test_target_len,
        'Ts_len': test_targets_len,
        'T_W': test_target_whichone,
        'T_P': test_target_position,
        'Ts_P': test_targets_position,
        'R_Self': test_relation_self,
        'R_Cross': test_relation_cross,
        'y': test_y,
    }
    # Mini-batches can be built later with data_helpers.batch_iter, e.g.:
    # batches = data_helpers.batch_iter(
    #     list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)

    print("Vocabulary Size: {:d}".format(len(word_id_mapping)))
    print("Train/Dev/test split: {:d}/{:d}/{:d}".format(
        len(train_y), len(dev_y), len(test_y)))
    return Train, Test, w2v
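
A minimal driver sketch, not part of the original example: it assumes FLAGS, use_data, and the data files are configured as above; the printed shapes are illustrative only.

Train, Test, w2v = preprocess()
print(Train['x'].shape, Train['y'].shape)  # e.g. (3608, 80, 768) (3608, 3)
print(Test['x'].shape, Test['y'].shape)    # e.g. (1120, 80, 768) (1120, 3)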
Example #3
import os

import jieba
import tensorflow as tf

import data_helpers

# model_dir, word_index, query and candidate are assumed to be defined
# earlier in the surrounding script.
sent_pad_id = word_index["<pad>"]
sent_unk_id = word_index["<unk>"]
sent_end_id = word_index["</s>"]

jieba.load_userdict(os.path.join(model_dir, "userdict.txt"))

query_seged = jieba.cut(query, cut_all=False)
candidate_seged = jieba.cut(candidate, cut_all=False)

encoder_inputs_list = list()
encoder_inputs_actual_lengths_list = list()
decoder_outputs_list = list()
decoder_outputs_actual_lengths_list = list()

query_word_list, query_id_list, query_len = data_helpers.word2id(query, word_index)
print(list(query_word_list))
candidate_word_list, candidate_id_list, candidate_len = data_helpers.word2id(candidate, word_index)
print(list(candidate_word_list))

encoder_inputs_list.append(query_id_list)
encoder_inputs_actual_lengths_list.append(query_len)
decoder_outputs_list.append(candidate_id_list)
# The decoder output length is hard-coded to 20 instead of using candidate_len.
# decoder_outputs_actual_lengths_list.append(candidate_len)
decoder_outputs_actual_lengths_list.append(20)


with tf.Session(graph=tf.Graph()) as sess:
    # Load the saved meta graph and restore the variables.
    tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING], model_dir)
    output_node_names = "output/predict_prob"
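    # Hypothetical continuation, not in the original snippet: fetch the
    # prediction tensor by name and feed the prepared inputs. The placeholder
    # names in feed_dict are assumptions about the saved graph, not confirmed
    # by the code above.
    predict_prob = sess.graph.get_tensor_by_name(output_node_names + ":0")
    feed_dict = {
        "encoder_inputs:0": encoder_inputs_list,
        "encoder_inputs_actual_lengths:0": encoder_inputs_actual_lengths_list,
        "decoder_outputs:0": decoder_outputs_list,
        "decoder_outputs_actual_lengths:0": decoder_outputs_actual_lengths_list,
    }
    print(sess.run(predict_prob, feed_dict=feed_dict))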