def _get_extended_questions():
    with open('data/extend/extra_questions.txt', 'r', encoding='utf8') as f:
        raw = f.read().strip()
    question_frames = raw.split(
        "===================================================================================================="
    )
    question_frames = [qf.strip() for qf in question_frames[:-1]]

    def process(question_frame):
        # return the original question and its permutations
        lines = question_frame.split('\n')
        lines = [l.strip() for l in lines]
        if lines[0][:2] == "No":
            return None
        # header looks like: Permutations of '<question>':
        # slice off the fixed prefix and the trailing "':" instead of using
        # str.strip(), which removes characters, not a prefix
        original = lines[0][len("Permutations of '"):-2]
        permutations = [l for l in lines[1:] if l]
        return original, permutations

    pre_process = PreProcess()
    question_dict = {}
    for qf in question_frames:
        tmp = process(qf)
        if tmp:
            o, p = tmp  # reuse the parsed frame instead of parsing it twice
            k = " ".join(pre_process.process(o, remove_stop_words=False))
            question_dict[k] = [
                " ".join(pre_process.process(i, remove_stop_words=False))
                for i in p
            ]
    return question_dict
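The parser above assumes a specific layout for extra_questions.txt. The following is a minimal, self-contained sketch of that layout and of the per-frame parsing; the sample questions and the separator length are assumptions inferred from the parsing code, not taken from the real data file.

SEP = "=" * 100  # assumed to match the '=' separator literal used above

sample = (
    "Permutations of 'how do i reset my password?':\n"
    "how can i reset my password?\n"
    "what is the way to reset my password?\n"
    + SEP + "\n"
    "No permutations were generated for this question.\n"
    + SEP + "\n"
)

frames = [f.strip() for f in sample.split(SEP)[:-1]]
for frame in frames:
    lines = [l.strip() for l in frame.split("\n")]
    if lines[0][:2] == "No":
        continue  # frames beginning with "No" carry no usable permutations
    original = lines[0][len("Permutations of '"):-2]
    permutations = [l for l in lines[1:] if l]
    print(original, permutations)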
def _convert_data(self, data_obj):
    pre_process = PreProcess()
    train_data = {}
    dev_data = {}
    idx = 0
    for d in data_obj:
        # custom pre-process: drop the leading "Answer:" label if present
        # (str.strip("Answer:") would strip characters, not the prefix)
        if d['answer'].startswith("Answer:"):
            d['answer'] = d['answer'][len("Answer:"):].strip()
        context = " ".join(pre_process.process(d['answer'], url_norm=True))
        if not context:
            continue
        original_question = " ".join(
            pre_process.process(d['question'], remove_stop_words=False))
        extended_questions = self.extend_question_dict.get(
            original_question, [])
        if extended_questions:
            # split the extended questions into train and dev sets
            train_questions, dev_questions = train_test_split(
                extended_questions, test_size=0.1, random_state=42)
            train_data[idx] = {
                'context': d['answer'],
                'c': context,
                'qs': [original_question] + train_questions
            }
            dev_data[idx] = {
                'context': d['answer'],
                'c': context,
                'qs': dev_questions
            }
        else:
            # no paraphrases available: keep the original question for training only
            train_data[idx] = {
                'context': d['answer'],
                'c': context,
                'qs': [original_question]
            }
        idx += 1
    return train_data, dev_data
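For illustration, this is the shape of the entries that _convert_data builds from one record. The record, the answer text, and the paraphrase list are made-up placeholders, and the PreProcess step is skipped (a plain lower-casing stands in for the real 'c' field), so this is only a sketch of the data structure, not of the actual preprocessing.

from sklearn.model_selection import train_test_split

answer = "Use the self-service portal to reset your password."
original_question = "how do i reset my password"
extended = [
    "how can i reset my password",
    "what is the way to reset my password",
    "password reset how",
    "i forgot my password how do i change it",
    "reset password procedure",
]

# 90/10 split of the paraphrases, mirroring test_size=0.1, random_state=42
train_qs, dev_qs = train_test_split(extended, test_size=0.1, random_state=42)
train_entry = {'context': answer, 'c': answer.lower(), 'qs': [original_question] + train_qs}
dev_entry = {'context': answer, 'c': answer.lower(), 'qs': dev_qs}
print(len(train_entry['qs']), len(dev_entry['qs']))  # 5 1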
def _convert_data(data_obj):
    pre_process = PreProcess()
    data = {}
    idx = 0
    for d in data_obj:
        # custom pre-process: drop the leading "Answer:" label if present
        if d['answer'].startswith("Answer:"):
            d['answer'] = d['answer'][len("Answer:"):].strip()
        # normalize whitespace in the raw answer
        d['answer'] = re.sub(r"\s+", " ", d['answer'])
        context = " ".join(pre_process.process(d['answer'], url_norm=True))
        question = " ".join(
            pre_process.process(d['question'], remove_stop_words=False))
        if not (d['answer'] and context and question):
            continue
        data[idx] = {
            'context': d['answer'],
            'c': context,
            'qs': [question]
        }
        idx += 1
    return data
class Inference:
    def __init__(self, model_name, dataset):
        self.model_name = TRAINED_MODELS + model_name + "/"
        self.dataset = dataset
        self.data = Dataset(self.dataset)
        self.data.tfidf_compressor.train()
        self.model = self._load_model()
        self.pre_process = PreProcess()

        idx = list(self.data.train_data.keys())
        idx.sort()
        self.train_c_word_set, self.train_c = self.data.get_all_c_word_set(
            self.data.train_data)
        self.all_train_contexts = np.array(
            [self.data.train_data[i]['context'] for i in idx])
        self.related_questions = np.array(
            [self.data.train_data[i]['qs'] for i in idx])

    def _load_model(self):
        # load the trained model from the latest checkpoint
        num_chars = self.data.get_num_chars()
        embeddings = get_trimmed_embeddings(DATA + "embedding_data.npz")
        model = NtuModel(model_name=self.model_name,
                         embeddings=embeddings,
                         num_chars=num_chars,
                         batch_size=32,
                         early_stopping=False,
                         k_neg=0)
        model.build()
        saver = tf.train.Saver()
        saver.restore(model.sess, tf.train.latest_checkpoint(self.model_name))
        return model

    def get_answer(self, question):
        question_example = self.pre_process.process(question,
                                                     remove_stop_words=False)
        q_word_set = set(question_example)
        question_example = self.data.process_sent(" ".join(question_example))

        # keep only training contexts that share at least one word with the question
        filtered_idx = [
            i for i in range(len(self.train_c_word_set))
            if q_word_set.intersection(self.train_c_word_set[i])
        ]
        if not filtered_idx:
            return "There is no answer for that.", ["None"]

        context_examples = [
            self.data.process_sent(self.data.tfidf_compressor.compress(c))
            for c in self.train_c[filtered_idx]
        ]
        scores = self.model.get_scores(question_example, context_examples)
        # indices of the 10 highest-scoring contexts
        c_max = scores.argsort()[::-1][:10]

        top_related_questions = self.related_questions[filtered_idx][c_max]
        top_original_context = self.all_train_contexts[filtered_idx][c_max]

        # process the top related questions and pick the one closest to the input
        related_question_examples = [
            self.data.process_sent(i[0]) for i in top_related_questions
        ]
        q_closest = self._arg_closest_related_questions(
            question_example, related_question_examples)
        return top_original_context[q_closest], top_related_questions[q_closest]

    def _arg_closest_related_questions(self, question, related_questions):
        all_question = [question] + related_questions
        # each example is a sequence of (char_ids, word_id) pairs;
        # unzip them into separate char-id and word-id sequences per question
        q_char_ids, q_word_ids = zip(*[zip(*x) for x in all_question])

        padded_q_word_ids, q_sequence_lengths = pad_sequences(q_word_ids,
                                                              pad_tok=0)
        padded_q_char_ids, q_word_lengths = pad_sequences(q_char_ids,
                                                          pad_tok=0,
                                                          nlevels=2)
        feed_dict = {
            self.model.q_word_ids: padded_q_word_ids,
            self.model.q_char_ids: padded_q_char_ids,
            self.model.q_sequence_lengths: q_sequence_lengths,
            self.model.q_word_lengths: q_word_lengths,
            self.model.keep_op: 1.0,
            self.model.is_training: False
        }
        question_embeddings = self.model.sess.run(self.model.q_dense,
                                                  feed_dict=feed_dict)
        q = question_embeddings[0]      # embedding of the input question (300-d)
        rq = question_embeddings[1:]    # embeddings of the candidate questions
        # squared Euclidean distance between the question and each candidate
        scores = np.sum(np.square(rq - q), axis=-1)
        q_min = scores.argsort()[0]     # index of the closest related question
        return q_min
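A hedged usage sketch for the Inference class above. The model directory name ("ntu_model"), the dataset name ("ntu"), and the example question are placeholders, not values taken from the repository; substitute whatever names exist under TRAINED_MODELS and DATA.

inference = Inference(model_name="ntu_model", dataset="ntu")
context, related = inference.get_answer("How do I apply for course exemption?")
print("Answer context:", context)
print("Related questions:", related)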
def _get_extended_questions(self):
    with open(DATA + self.dataset + "/extra_questions.txt", 'r',
              encoding='utf8') as f:
        raw = f.read().strip()
    question_frames = raw.split(
        "===================================================================================================="
    )
    question_frames = [qf.strip() for qf in question_frames[:-1]]

    def process(question_frame):
        # return the original question and its permutations
        lines = question_frame.split('\n')
        lines = [l.strip() for l in lines]
        if lines[0][:2] == "No":
            return None
        # slice off the fixed "Permutations of '" prefix and the trailing "':"
        original = lines[0][len("Permutations of '"):-2]
        permutations = [l for l in lines[1:] if l]
        return original, permutations

    pre_process = PreProcess()
    question_dict = {}
    t = Timer()
    for qf in question_frames:
        tmp = process(qf)
        if tmp:
            t.start("", verbal=False)
            o, p = tmp  # reuse the parsed frame instead of parsing it twice
            k = " ".join(pre_process.process(o, remove_stop_words=False))
            question_dict[k] = [
                " ".join(pre_process.process(i, remove_stop_words=False))
                for i in p
            ]
            # select the most diverse question subset
            self.tf_idf.train([k] + question_dict[k])
            del_num = len(question_dict[k]) // self.top_k
            if del_num == 0:
                t.remaining_time(t.stop(verbal=False), len(question_frames))
                continue
            # greedily keep a diverse subset: pick a permutation by its tf-idf
            # distance ranking, then prune the del_num nearest duplicates of it
            selected = []
            while question_dict[k]:
                indices = self.tf_idf.distance(k, question_dict[k])
                q = question_dict[k].pop(indices[0])
                selected.append(q)
                if not question_dict[k]:
                    break
                close_q = self.tf_idf.distance(
                    q, question_dict[k])[::-1][:del_num]
                question_dict[k] = [
                    question_dict[k][i] for i in range(len(question_dict[k]))
                    if i not in close_q
                ]
            question_dict[k] = selected
            t.remaining_time(t.stop(verbal=False), len(question_frames))
    return question_dict
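The repository's tf_idf helper is not shown here, so the following is a standalone sketch of the same greedy diversity-selection idea using scikit-learn as a stand-in. It assumes that "distance ranking" means picking the permutation least similar to the original question and pruning the permutations most similar to the one just picked; the function name select_diverse and the top_k default are illustrative, not part of the repository.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def select_diverse(original, permutations, top_k=5):
    """Greedily keep permutations far from the original while dropping
    near-duplicates of each selected permutation (a sketch, not the repo code)."""
    vec = TfidfVectorizer().fit([original] + permutations)
    remaining = list(permutations)
    del_num = len(remaining) // top_k
    if del_num == 0:
        return remaining
    selected = []
    while remaining:
        # least similar to the original question
        sims = cosine_similarity(vec.transform([original]), vec.transform(remaining))[0]
        q = remaining.pop(int(sims.argmin()))
        selected.append(q)
        if not remaining:
            break
        # prune the del_num permutations most similar to the one just selected
        sims = cosine_similarity(vec.transform([q]), vec.transform(remaining))[0]
        closest = set(sims.argsort()[::-1][:del_num])
        remaining = [r for i, r in enumerate(remaining) if i not in closest]
    return selected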