def w2v(word):
    """Look up the embedding vector for a word; raise if it is out of vocabulary."""
    if word != "UNK":
        word = word.lower()
    index = data_helpers.word2id(word)
    if index == -1:
        raise ValueError("{} doesn't exist in the vocabulary.".format(word))
    else:
        return word_vector[0][index]
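# For reference, a minimal sketch of the single-word word2id lookup assumed
# above (hypothetical; the real helper lives in data_helpers and this is only
# an inferred behavior): return the embedding row index for a known word, or
# -1 for out-of-vocabulary input, which w2v() then surfaces as a ValueError.
def word2id_sketch(word, word_id_mapping):
    return word_id_mapping.get(word, -1)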
def preprocess():
    '''
    Read from the text file and build the model inputs.
    :return:
        sen word id:        [324, 1413, 1, 41, 43, 0, 0, 0]
        sen len:            [5]
        sen max len:        [8]
        sen label:          [0, 0, 1]
        target word id:     [34, 154, 0, 0]
        target len:         [2]
        target max len:     [4]
        targets word id:    [[34, 154, 0, 0], [34, 14, 12, 56], [0, 0, 0, 0]]
        targets num:        2
        targets len:        [2, 4, 0]
        targets max num:    [3]
        targets_relation_self  = [[1,0,0], [0,1,0], [0,0,0]]
        targets_relation_cross = [[0,1,0], [1,0,0], [0,0,0]]
    '''
    # Data Preparation
    # ==================================================

    # Load data
    print("Loading data...")
    train_x_str, train_target_str, train_y = data_helpers.load_data_and_labels(
        FLAGS.train_file)
    dev_x_str, dev_target_str, dev_y = data_helpers.load_data_and_labels(
        FLAGS.test_file)
    test_x_str, test_target_str, test_y = data_helpers.load_data_and_labels(
        FLAGS.test_file)

    # Word embedding ---> x [324,1413,1,41,43,0,0,0]  y [0,1]
    # word_id_mapping, e.g. apple ---> 23; w2v: 23 ---> [vector]
    word_id_mapping, w2v = data_helpers.load_w2v(FLAGS.embedding_file_path, 300)
    max_document_length = max(
        [len(x.split(" ")) for x in (train_x_str + dev_x_str + test_x_str)])
    max_target_length = max([
        len(x.split(" "))
        for x in (train_target_str + dev_target_str + test_target_str)
    ])

    # The targets ----> [[[141,23,45],[23,45,1,2],[2]], ...]
    # The number of targets ----> [3, ...]
    train_targets_str, train_targets_num = data_helpers.load_targets(
        FLAGS.train_file)
    dev_targets_str, dev_targets_num = data_helpers.load_targets(
        FLAGS.test_file)
    test_targets_str, test_targets_num = data_helpers.load_targets(
        FLAGS.test_file)
    max_target_num = max(
        [len(x) for x in (train_targets_str + test_targets_str)])

    # sentence ---> word_id
    train_x, train_x_len = data_helpers.word2id(train_x_str, word_id_mapping,
                                                max_document_length)
    dev_x, dev_x_len = data_helpers.word2id(dev_x_str, word_id_mapping,
                                            max_document_length)
    test_x, test_x_len = data_helpers.word2id(test_x_str, word_id_mapping,
                                              max_document_length)

    # target ---> word_id
    train_target, train_target_len = data_helpers.word2id(
        train_target_str, word_id_mapping, max_target_length)
    dev_target, dev_target_len = data_helpers.word2id(
        dev_target_str, word_id_mapping, max_target_length)
    test_target, test_target_len = data_helpers.word2id(
        test_target_str, word_id_mapping, max_target_length)

    # targets ---> word_id
    train_targets, train_targets_len = data_helpers.word2id_2(
        train_targets_str, word_id_mapping, max_target_length, max_target_num)
    dev_targets, dev_targets_len = data_helpers.word2id_2(
        dev_targets_str, word_id_mapping, max_target_length, max_target_num)
    test_targets, test_targets_len = data_helpers.word2id_2(
        test_targets_str, word_id_mapping, max_target_length, max_target_num)

    # Which target, among all targets in the sentence, each example refers to
    train_target_whichone = data_helpers.get__whichtarget(
        train_targets_num, max_target_num)
    test_target_whichone = data_helpers.get__whichtarget(
        test_targets_num, max_target_num)

    # Target position
    train_target_position = data_helpers.get_position(FLAGS.train_file,
                                                      max_document_length)
    test_target_position = data_helpers.get_position(FLAGS.test_file,
                                                     max_document_length)
    train_targets_position = data_helpers.get_position_2(
        train_target_position, train_targets_num, max_target_num)
    test_targets_position = data_helpers.get_position_2(
        test_target_position, test_targets_num, max_target_num)

    # Replace word-id inputs with precomputed BERT embeddings for the chosen dataset
    if use_data == 'Restaurants':
        train_x = np.load(
            "data/data_res/bert_embedding/Res_Train_Embedding.npy"
        )  # (3608, 80, 768)
        train_target = np.load(
            "data/data_res/bert_embedding/Res_Train_target_Embedding.npy"
        )  # (3608, 23, 768)
        train_targets = np.load(
            "data/data_res/bert_embedding/Res_Train_targets_Embedding.npy"
        )  # (3608, 13, 23, 768)
        test_x = np.load(
            "data/data_res/bert_embedding/Res_Test_Embedding.npy"
        )  # (1120, 80, 768)
        test_target = np.load(
            "data/data_res/bert_embedding/Res_Test_target_Embedding.npy"
        )  # (1120, 23, 768)
        test_targets = np.load(
            "data/data_res/bert_embedding/Res_Test_targets_Embedding.npy"
        )  # (1120, 13, 23, 768)
    if use_data == 'Laptops':
        train_x = np.load(
            "data/data_lap/bert_embedding/Lap_Train_Embedding.npy"
        )  # (3608, 80, 768)
        train_target = np.load(
            "data/data_lap/bert_embedding/Lap_Train_target_Embedding.npy"
        )  # (3608, 23, 768)
        train_targets = np.load(
            "data/data_lap/bert_embedding/Lap_Train_targets_Embedding.npy"
        )  # (3608, 13, 23, 768)
        test_x = np.load(
            "data/data_lap/bert_embedding/Lap_Test_Embedding.npy"
        )  # (1120, 80, 768)
        test_target = np.load(
            "data/data_lap/bert_embedding/Lap_Test_target_Embedding.npy"
        )  # (1120, 23, 768)
        test_targets = np.load(
            "data/data_lap/bert_embedding/Lap_Test_targets_Embedding.npy"
        )  # (1120, 13, 23, 768)

    # Relation matrices, created from the per-sentence target counts
    train_relation_self, train_relation_cross = data_helpers.get_relation(
        train_targets_num, max_target_num, FLAGS.which_relation)
    test_relation_self, test_relation_cross = data_helpers.get_relation(
        test_targets_num, max_target_num, FLAGS.which_relation)

    Train = {
        'x': train_x,                    # int32 (3608, 79, 768) train sentence input embedding IDs
        'T': train_target,               # int32 (3608, 23, 768) train target input embedding IDs
        'Ts': train_targets,             # int32 (3608, 13, 23, 768) train targets input embedding IDs
        'x_len': train_x_len,            # int32 (3608,) train sentence input lengths
        'T_len': train_target_len,       # int32 (3608,) train target lengths
        'Ts_len': train_targets_len,     # int32 (3608, 13) train targets lengths
        'T_W': train_target_whichone,    # int32 (3608, 13) index of this target among all targets
        'T_P': train_target_position,    # float32 (3608, 79)
        'Ts_P': train_targets_position,  # float32 (3608, 13, 79)
        'R_Self': train_relation_self,   # int32 (3608, 13, 13)
        'R_Cross': train_relation_cross, # int32 (3608, 13, 13)
        'y': train_y,                    # int32 (3608, 3)
    }
    Test = {
        'x': test_x,
        'T': test_target,
        'Ts': test_targets,
        'x_len': test_x_len,
        'T_len': test_target_len,
        'Ts_len': test_targets_len,
        'T_W': test_target_whichone,
        'T_P': test_target_position,
        'Ts_P': test_targets_position,
        'R_Self': test_relation_self,
        'R_Cross': test_relation_cross,
        'y': test_y,
    }

    # batches = data_helpers.batch_iter(
    #     list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)

    print("Vocabulary Size: {:d}".format(len(word_id_mapping)))
    print("Train/Dev/Test split: {:d}/{:d}/{:d}".format(
        len(train_y), len(dev_y), len(test_y)))
    return Train, Test, w2v
sent_pad_id = word_index["<pad>"]
sent_unk_id = word_index["<unk>"]
sent_end_id = word_index["</s>"]

# Segment query and candidate with jieba, using the model's custom user dictionary
jieba.load_userdict(os.path.join(model_dir, "userdict.txt"))
query_seged = jieba.cut(query, cut_all=False)
candidate_seged = jieba.cut(candidate, cut_all=False)

encoder_inputs_list = list()
encoder_inputs_actual_lengths_list = list()
decoder_outputs_list = list()
decoder_outputs_actual_lengths_list = list()

query_word_list, query_id_list, query_len = data_helpers.word2id(
    query, word_index)
print(list(query_word_list))
candidate_word_list, candidate_id_list, candidate_len = data_helpers.word2id(
    candidate, word_index)
print(list(candidate_word_list))

encoder_inputs_list.append(query_id_list)
encoder_inputs_actual_lengths_list.append(query_len)
decoder_outputs_list.append(candidate_id_list)
# decoder_outputs_actual_lengths_list.append(candidate_len)
decoder_outputs_actual_lengths_list.append(20)  # fixed decoder length instead of candidate_len

with tf.Session(graph=tf.Graph()) as sess:
    # Load the saved meta graph and restore variables
    tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING],
                               model_dir)
    output_node_names = "output/predict_prob"
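    # A minimal inference sketch, continuing inside the session above. The
    # output tensor name comes from output_node_names; the input placeholder
    # names ("encoder_inputs", "encoder_inputs_actual_lengths") are assumptions
    # for illustration, since the real names depend on how this SavedModel was
    # exported.
    prob_tensor = sess.graph.get_tensor_by_name(output_node_names + ":0")
    encoder_inputs_ph = sess.graph.get_tensor_by_name("encoder_inputs:0")
    encoder_lengths_ph = sess.graph.get_tensor_by_name(
        "encoder_inputs_actual_lengths:0")
    predict_prob = sess.run(
        prob_tensor,
        feed_dict={
            encoder_inputs_ph: encoder_inputs_list,
            encoder_lengths_ph: encoder_inputs_actual_lengths_list,
        })
    print(predict_prob)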