def get_train_data(data_type, w2v_model, qa_file, doc_file, to_file_path, args):
    """Build listwise ranking features for the TRAINING split and append them
    to ``to_file_path`` in SVMrank format (``label qid:<q> k:v ... # doc-<d>``).

    For each training question it emits one positive row (the first gold
    answer) and ``args.ns_amount`` negative rows, each row's feature vector
    being the output of the ``dropout_con`` layer of the checkpointed
    negative-sampling model.

    NOTE(review): this module defines ``get_train_data`` twice; this first
    definition is shadowed by the later one (which has a different
    signature). Confirm which one callers actually use.

    :param data_type: tag used to locate ``ckpt/nn_weights_<data_type>.h5``
    :param w2v_model: word2vec model consumed by ``sentence2vec``
    :param qa_file: even lines = questions, odd lines = 1-based answer doc ids
    :param doc_file: one document per non-empty line
    :param to_file_path: output file, opened in append mode
    :param args: namespace providing ns_amount, output_length, input_dim,
        output_dim, hidden_dim, learning_rate, drop_rate
    """
    logger.info("preprocessing...")
    ns_amount = args.ns_amount

    # --- Vectorize questions; collect gold answer ids from odd lines. ---
    questions = []
    answers = []
    input_length = 0
    with open(qa_file, "r") as f:
        lines = f.readlines()
    for i, line in enumerate(lines):
        line = line.strip().lower()
        if line != "" and i % 2 == 0:
            words = word_tokenize(remove_punc(line))
            input_length = max(len(words), input_length)
            questions.append(words)
        elif line != "" and i % 2 == 1:
            ans = []
            for a in line.strip().split(" "):
                if a != "":
                    # Raw data counts from 1; convert to 0-based doc ids.
                    ans.append(int(a) - 1)
            answers.append(ans)
    question_vecs = [sentence2vec(w2v_model, q_words, input_length)
                     for q_words in questions]
    print("len(question_vecs)", len(question_vecs))

    # --- Vectorize documents. ---
    docs = []
    output_length = 0
    with open(doc_file, "r") as f:
        lines = f.readlines()
    for line in lines:
        line = line.strip().lower()
        if line != "":
            words = word_tokenize(remove_punc(line))
            output_length = max(len(words), output_length)
            docs.append(words)
    # The scanned maximum is overridden by the configured length so the
    # padded doc matrices match what the checkpointed model expects.
    output_length = args.output_length
    doc_vecs = [sentence2vec(w2v_model, d_words, output_length)
                for d_words in docs]
    print("len(doc_vecs)", len(doc_vecs))
    logger.info("input_length:%d, output_length:%d" % (input_length, output_length))

    # --- Frequency of each doc across the gold answers. ---
    doc_count = {}
    for ans in answers:
        for a in ans:
            doc_count[a] = doc_count.get(a, 0) + 1
    # Normalized weight in (0, 1]. ``default=0`` guards the empty case
    # (original raised ZeroDivisionError only if doc_count was non-empty
    # with t_max == 0, which cannot happen; the loop below is a no-op when
    # doc_count is empty).
    t_max = max(doc_count.values(), default=0)
    doc_weight = {k: v / t_max for k, v in doc_count.items()}

    total = len(question_vecs)
    train_num = int(total * 0.9)
    logger.info("train_num:%d, total:%d" % (train_num, total))
    # Shuffle the question order once up front.
    qa_index = list(range(total))
    random.shuffle(qa_index)

    # Model loading is loop-invariant: hoisted out of the step loop
    # (the original reloaded identical weights on every step).
    logger.info("loading weights: ckpt/nn_weights_%s.h5" % data_type)
    model = negative_samples(input_length=input_length,
                             input_dim=args.input_dim,
                             output_length=output_length,
                             output_dim=args.output_dim,
                             hidden_dim=args.hidden_dim,
                             ns_amount=ns_amount,
                             learning_rate=args.learning_rate,
                             drop_rate=args.drop_rate)
    model.load_weights("ckpt/nn_weights_%s.h5" % data_type)
    new_dnn_model = Model(inputs=model.input,
                          outputs=model.get_layer('dropout_con').output)

    step = 0
    batch_size = 200
    # BUG FIX: original condition was ``<=``, which scheduled an empty final
    # batch (and an empty predict() call) whenever train_num % 200 == 0.
    while step * batch_size < train_num:
        q_encoder_input = []
        r_decoder_input = []
        w_decoder_input = []
        weight_data_r = []
        weight_data_w = []
        qid_list = []
        label_list = []
        aid_list = []
        logger.info("step: %d" % step)
        end = min(train_num, (step + 1) * batch_size)
        for ss in range(step * batch_size, end):
            i = qa_index[ss]
            logger.info("question: %d" % i)
            # Positive row: the first gold answer of this question.
            qid_list.append(i)
            label_list.append(1)
            q_encoder_input.append(question_vecs[i])
            pos_aid = answers[i][0]
            aid_list.append(pos_aid)
            r_decoder_input.append(doc_vecs[pos_aid])
            weight_data_r.append(doc_weight[pos_aid])
            # ns_amount unrelated answers for the positive row.
            # BUG FIX: the count was hard-coded to 10; any ns_amount != 10
            # made the reshape below fail.
            neg_ids = get_randoms(list(doc_weight.keys()), [pos_aid], ns_amount)
            w_decoder = [doc_vecs[nid] for nid in neg_ids]
            w_weight = [doc_weight[nid] for nid in neg_ids]
            # NOTE(review): reshape (not transpose) reproduces the original,
            # training-time memory layout — do not "fix" to a transpose.
            w_decoder_input.append(
                np.array(w_decoder).reshape(output_length, args.input_dim, ns_amount))
            weight_data_w.append(np.array(w_weight).reshape((1, ns_amount)))
            # Negative rows: each sampled unrelated answer becomes its own row.
            for aaid in neg_ids:
                qid_list.append(i)
                label_list.append(0)
                aid_list.append(aaid)
                q_encoder_input.append(question_vecs[i])
                r_decoder_input.append(doc_vecs[aaid])
                weight_data_r.append(doc_weight[aaid])
                # BUG FIX: the original excluded a stale loop variable here
                # (``aid`` left over from the previous sampling loop);
                # exclude the current candidate instead.
                sub_neg_ids = get_randoms(list(doc_weight.keys()), [aaid], ns_amount)
                sub_decoder = [doc_vecs[nid] for nid in sub_neg_ids]
                sub_weight = [doc_weight[nid] for nid in sub_neg_ids]
                w_decoder_input.append(
                    np.array(sub_decoder).reshape(output_length, args.input_dim, ns_amount))
                weight_data_w.append(np.array(sub_weight).reshape((1, ns_amount)))

        logger.info("predicting...")
        res = new_dnn_model.predict([q_encoder_input, r_decoder_input,
                                     w_decoder_input, weight_data_r,
                                     weight_data_w])
        # Append one SVMrank-formatted line per row.
        with open(to_file_path, "a") as f:
            for row_idx in range(len(res)):
                row = res[row_idx]
                feature_str = ''
                for j in range(len(row)):
                    feature_str = feature_str + (" %d:%.9f" % (j + 1, row[j]))
                label = label_list[row_idx]
                qid = qid_list[row_idx]
                doc_id = aid_list[row_idx]
                line = "%d qid:%d%s # doc-%d \n" % (label, qid, feature_str, doc_id)
                f.write(line)
        print("saved to:", to_file_path)
        logger.info("step:%d added" % step)
        step += 1
    logger.info("saved to: %s" % to_file_path)
def get_train_data(data_type, w2v_model, ckpt_path, qa_file, doc_file, to_file_path, args, step=0):
    """Build ranking features for the held-out questions (the last 10% of the
    first 1000 QA pairs) and append them to ``to_file_path`` in SVMrank format.

    For each held-out question, every document becomes one candidate row
    (gold answers first, labeled 1; all other docs labeled 0); the row's
    features are the ``dropout_con`` layer output of the checkpointed model.
    Prediction is chunked (1000 candidates at a time) to bound memory.

    NOTE(review): this definition shadows the earlier ``get_train_data``
    (different signature); confirm which one callers actually use.
    ``step`` is accepted for interface compatibility but is unused here.

    :param data_type: dataset tag (kept for signature compatibility/logging)
    :param w2v_model: word2vec model consumed by ``sentence2vec``
    :param ckpt_path: path to the trained model weights (.h5)
    :param qa_file: even lines = questions, odd lines = 1-based answer doc ids
    :param doc_file: one document per non-empty line
    :param to_file_path: output file; if it already exists, the run is skipped
    :param args: namespace providing input_dim, output_dim, hidden_dim,
        learning_rate, drop_rate
    """
    # Resume-friendly: never overwrite an existing feature file.
    if os.path.exists(to_file_path):
        logger.info("file exists: %s" % to_file_path)
        return
    logger.info("preprocessing...")
    ns_amount = 10  # must match the ns_amount the checkpoint was trained with

    # --- Vectorize questions; collect gold answer ids from odd lines. ---
    questions = []
    answers = []
    input_length = 0
    with open(qa_file, "r", encoding="utf-8") as f:
        lines = f.readlines()
    for i, line in enumerate(lines):
        # Only the first 1000 QA pairs (2000 lines) are used.
        if i >= 2000:
            break
        line = line.strip().lower()
        if line != "" and i % 2 == 0:
            words = word_tokenize(remove_punc(line))
            input_length = max(len(words), input_length)
            questions.append(words)
        elif line != "" and i % 2 == 1:
            # Raw data counts from 1; convert to 0-based doc ids.
            ans = [int(a) - 1 for a in line.strip().split(" ") if a != ""]
            answers.append(ans)
    question_vecs = [sentence2vec(w2v_model, q_words, input_length)
                     for q_words in questions]
    print("len(question_vecs)", len(question_vecs))

    # --- Vectorize documents. ---
    docs = []
    output_length = 0
    with open(doc_file, "r", encoding="utf-8") as f:
        lines = f.readlines()
    for line in lines:
        line = line.strip().lower()
        if line != "":
            words = word_tokenize(remove_punc(line))
            output_length = max(len(words), output_length)
            docs.append(words)
    # Fixed padded length expected by the checkpointed model.
    output_length = 1000
    doc_vecs = [sentence2vec(w2v_model, d_words, output_length)
                for d_words in docs]
    print("len(doc_vecs)", len(doc_vecs))
    logger.info("input_length:%d, output_length:%d" % (input_length, output_length))

    # --- Answer frequency for every doc (0 for never-answered docs). ---
    doc_count = {doc_idx: 0 for doc_idx in range(len(docs))}
    for ans in answers:
        for a in ans:
            if a in doc_count:
                doc_count[a] += 1
    # Normalize to [0, 1]. ``or 1`` guards ZeroDivisionError in the
    # degenerate all-zero case (no answers at all).
    t_max = max(doc_count.values(), default=0) or 1
    doc_weight = {k: v / t_max for k, v in doc_count.items()}

    logger.info("loading weights...")
    model = negative_samples(input_length=input_length,
                             input_dim=args.input_dim,
                             output_length=output_length,
                             output_dim=args.output_dim,
                             hidden_dim=args.hidden_dim,
                             ns_amount=ns_amount,
                             learning_rate=args.learning_rate,
                             drop_rate=args.drop_rate)
    model.load_weights(ckpt_path)
    new_dnn_model = Model(inputs=model.input,
                          outputs=model.get_layer('dropout_con').output)

    total = len(question_vecs)
    train_num = int(total * 0.9)
    # Shuffling deliberately disabled in the original so the evaluation
    # order is deterministic.
    qa_index = list(range(total))
    # random.shuffle(qa_index)

    chunk_size = 1000
    for ss in range(train_num, total):
        i = qa_index[ss]
        q_encoder_input = []
        r_decoder_input = []
        w_decoder_input = []
        weight_data_r = []
        weight_data_w = []
        logger.info("get all documents for question: %d" % i)
        print("get all documents for question: %d" % i)
        cur_answers = answers[i]
        # Candidate order: gold answers first, then every remaining doc.
        doc_list_ordered = [a for a in cur_answers]
        for aid in doc_weight.keys():
            if aid not in doc_list_ordered:
                doc_list_ordered.append(aid)
        label_list = []
        aid_list = []
        print("len(doc_list_ordered):", len(doc_list_ordered))
        print("len(cur_answers):", len(cur_answers))
        for aid in doc_list_ordered:
            aid_list.append(aid)
            label_list.append(1 if aid in cur_answers else 0)
            q_encoder_input.append(question_vecs[i])
            r_decoder_input.append(doc_vecs[aid])
            weight_data_r.append(doc_weight[aid])
            # ns_amount negatives sampled outside the gold answers.
            # FIX: use a distinct loop variable (the original re-used ``aid``,
            # shadowing the outer candidate loop's variable).
            neg_ids = get_randoms(list(doc_weight.keys()), cur_answers, ns_amount)
            w_decoder = [doc_vecs[nid] for nid in neg_ids]
            w_weight = [doc_weight[nid] for nid in neg_ids]
            # NOTE(review): reshape (not transpose) reproduces the original,
            # training-time memory layout — do not "fix" to a transpose.
            w_decoder_input.append(
                np.array(w_decoder).reshape(output_length, args.input_dim, ns_amount))
            weight_data_w.append(np.array(w_weight).reshape((1, ns_amount)))
        logger.info("now:%d , predicting question: %d" % (ss, i))
        print("now:%d , predicting question: %d" % (ss, i))
        end = len(q_encoder_input)
        for cur in range(0, end, chunk_size):
            print("cur:%d / %d" % (cur, end))
            a = q_encoder_input[cur:cur + chunk_size]
            b = r_decoder_input[cur:cur + chunk_size]
            c = w_decoder_input[cur:cur + chunk_size]
            d = weight_data_r[cur:cur + chunk_size]
            e = weight_data_w[cur:cur + chunk_size]
            res = new_dnn_model.predict([a, b, c, d, e])
            with open(to_file_path, "a") as f:
                for j in range(len(res)):
                    row = res[j]
                    feature_str = ''
                    for k in range(len(row)):
                        feature_str = feature_str + (" %d:%.9f" % (k + 1, row[k]))
                    # BUG FIX: the original indexed label_list/aid_list with
                    # the within-chunk offset ``j``; every chunk after the
                    # first wrote wrong labels and doc ids. Use the absolute
                    # candidate index ``cur + j``.
                    label = label_list[cur + j]
                    doc_id = aid_list[cur + j]
                    line = "%d qid:%d%s # doc-%d \n" % (label, i, feature_str, doc_id)
                    f.write(line)
            print("saved to:", to_file_path)
    logger.info("total:%d" % total)
    logger.info("saved to: %s" % to_file_path)