Example #1
    def _get_extended_questions():
        with open('data/extend/extra_questions.txt', 'r',
                  encoding='utf8') as f:
            raw = f.read().strip()

        question_frames = raw.split(
            "===================================================================================================="
        )
        question_frames = [qf.strip() for qf in question_frames[:-1]]

        def process(question_frame):
            # return original question and its permutations
            lines = question_frame.split('\n')
            lines = [l.strip() for l in lines]
            if lines[0][:2] == "No":
                return None

            # str.strip removes a character *set*, not a prefix; slice off the
            # literal "Permutations of '" prefix and the trailing "':" instead
            original = lines[0][len("Permutations of '"):-2]
            permutations = [l for l in lines[1:] if l]
            return original, permutations

        pre_process = PreProcess()

        question_dict = {}
        for qf in question_frames:
            tmp = process(qf)
            if tmp:
                o, p = tmp  # reuse the parsed result instead of re-parsing
                k = " ".join(pre_process.process(o, remove_stop_words=False))
                question_dict[k] = [
                    " ".join(pre_process.process(i, remove_stop_words=False))
                    for i in p
                ]

        return question_dict
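
For reference, the parser above expects extra_questions.txt to be a series of frames separated by a long line of '=' characters, where each frame either starts with "Permutations of '<question>':" followed by one permutation per line, or starts with "No" and is skipped. A hypothetical two-frame file (contents assumed, not taken from the source):

    Permutations of 'how do i apply for exchange programmes':
    how can i apply for exchange programmes
    what is the procedure for applying to exchange programmes
    ====================================================================================================
    No permutations generated for 'what is ntu'
    ====================================================================================================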
Example #2
    def _convert_data(self, data_obj):
        pre_process = PreProcess()

        train_data = {}
        dev_data = {}
        idx = 0
        for d in data_obj:
            # custom pre-process: str.strip("Answer:") removes any of those
            # characters from both ends, so strip the literal prefix instead
            if d['answer'].startswith("Answer:"):
                d['answer'] = d['answer'][len("Answer:"):]

            context = " ".join(pre_process.process(d['answer'], url_norm=True))
            if not context:
                continue

            original_question = " ".join(
                pre_process.process(d['question'], remove_stop_words=False))
            extended_questions = self.extend_question_dict.get(
                original_question, [])

            if extended_questions:
                # split train and dev by questions
                # (train_test_split comes from sklearn.model_selection)
                train_questions, dev_questions = train_test_split(
                    extended_questions, test_size=0.1, random_state=42)

                train_data[idx] = {
                    'context': d['answer'],
                    'c': context,
                    'qs': [original_question] + train_questions
                }
                dev_data[idx] = {
                    'context': d['answer'],
                    'c': context,
                    'qs': dev_questions
                }
            else:
                train_data[idx] = {
                    'context': d['answer'],
                    'c': context,
                    'qs': [original_question]
                }
            idx += 1
        return train_data, dev_data
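
Each entry of the returned dictionaries follows the same schema; a hypothetical train_data entry (values invented for illustration):

    train_data[0] = {
        'context': 'You can apply via the student portal ...',  # raw answer text
        'c': 'apply via student portal ...',                    # preprocessed answer
        'qs': ['how do i apply ...', '...']                     # original question plus train-split permutations
    }

dev_data only gets an entry when a question has permutations; its 'qs' holds the held-out 10%.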
Example #3
    def _convert_data(data_obj):
        pre_process = PreProcess()

        data = {}
        idx = 0
        for d in data_obj:
            # custom pre-process: strip the literal "Answer:" prefix
            # (str.strip with a string removes a character set, not a prefix)
            if d['answer'].startswith("Answer:"):
                d['answer'] = d['answer'][len("Answer:"):]
            # this pattern was garbled in extraction into a no-op (space-to-space);
            # it most likely normalized non-breaking spaces (assumption)
            d['answer'] = re.sub("\xa0", " ", d['answer'])

            context = " ".join(pre_process.process(d['answer'], url_norm=True))
            question = " ".join(
                pre_process.process(d['question'], remove_stop_words=False))
            if not (d['answer'] and context and question):
                continue
            data[idx] = {
                'context': d['answer'],
                'c': context,
                'qs': [question]
            }
            idx += 1
        return data
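
Both _convert_data variants assume data_obj is an iterable of dicts with 'question' and 'answer' keys; a minimal hypothetical input:

    data_obj = [{
        'question': 'How do I apply for exchange programmes?',
        'answer': 'Answer: You can apply via the student portal.'
    }]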
Example #4
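The class below additionally needs these imports (the project-local import paths are not shown in the source):

    import numpy as np
    import tensorflow as tf  # TF 1.x API (tf.train.Saver)

    # project-local dependencies (exact import paths are an assumption):
    # Dataset, NtuModel, PreProcess, get_trimmed_embeddings, pad_sequences,
    # and the TRAINED_MODELS / DATA path constants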
class Inference:
    def __init__(self, model_name, dataset):
        self.model_name = TRAINED_MODELS + model_name + "/"
        self.dataset = dataset

        self.data = Dataset(self.dataset)
        self.data.tfidf_compressor.train()

        self.model = self._load_model()
        self.pre_process = PreProcess()

        idx = sorted(self.data.train_data.keys())
        self.train_c_word_set, self.train_c = self.data.get_all_c_word_set(
            self.data.train_data)
        self.all_train_contexts = np.array(
            [self.data.train_data[i]['context'] for i in idx])
        self.related_questions = np.array(
            [self.data.train_data[i]['qs'] for i in idx])

    def _load_model(self):
        # load model
        num_chars = self.data.get_num_chars()

        embeddings = get_trimmed_embeddings(DATA + "embedding_data.npz")

        model = NtuModel(model_name=self.model_name,
                         embeddings=embeddings,
                         num_chars=num_chars,
                         batch_size=32,
                         early_stopping=False,
                         k_neg=0)
        model.build()
        saver = tf.train.Saver()
        saver.restore(model.sess, tf.train.latest_checkpoint(self.model_name))

        return model

    def get_answer(self, question):
        question_example = self.pre_process.process(question,
                                                    remove_stop_words=False)
        q_word_set = set(question_example)
        question_example = self.data.process_sent(" ".join(question_example))

        # keep only training contexts that share at least one word
        # with the question
        filtered_idx = []
        for i in range(len(self.train_c_word_set)):
            if q_word_set.intersection(self.train_c_word_set[i]):
                filtered_idx.append(i)

        context_examples = [
            self.data.process_sent(self.data.tfidf_compressor.compress(c))
            for c in self.train_c[filtered_idx]
        ]

        scores = self.model.get_scores(question_example, context_examples)
        c_max = scores.argsort()[::-1][:10]  # indices of the top-10 scores
        if len(c_max) == 0:
            return "There is no answer for that.", ["None"]

        top_related_questions = self.related_questions[filtered_idx][c_max]
        top_original_context = self.all_train_contexts[filtered_idx][c_max]

        # process top related questions
        related_question_examples = [
            self.data.process_sent(i[0]) for i in top_related_questions
        ]

        q_closet = self._arg_closest_related_questions(
            question_example, related_question_examples)
        return top_original_context[q_closet], top_related_questions[q_closet]

    def _arg_closest_related_questions(self, question, related_questions):
        all_question = [question] + related_questions
        # each processed question is a sequence of (char_ids, word_id) pairs;
        # unzip each question, then split the batch into char ids and word ids
        # (the original triple zip reduced to exactly this)
        q_char_ids, q_word_ids = zip(*[zip(*x) for x in all_question])

        padded_q_word_ids, q_sequence_lengths = pad_sequences(q_word_ids,
                                                              pad_tok=0)
        padded_q_char_ids, q_word_lengths = pad_sequences(q_char_ids,
                                                          pad_tok=0,
                                                          nlevels=2)

        feed_dict = {
            self.model.q_word_ids: padded_q_word_ids,
            self.model.q_char_ids: padded_q_char_ids,
            self.model.q_sequence_lengths: q_sequence_lengths,
            self.model.q_word_lengths: q_word_lengths,
            self.model.keep_op: 1.0,
            self.model.is_training: False
        }
        question_embeddings = self.model.sess.run(self.model.q_dense,
                                                  feed_dict=feed_dict)
        q = question_embeddings[0]  # embedding of the input question (dim 300)
        rq = question_embeddings[1:]
        scores = np.sum(np.square(rq - q), axis=-1)

        # the smallest squared distance marks the closest related question
        q_min = scores.argmin()
        return q_min
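
A hedged usage sketch, assuming a trained checkpoint directory exists under TRAINED_MODELS and the project-local modules above are importable; the model and dataset names are placeholders:

    inference = Inference(model_name="ntu_model", dataset="ntu")  # placeholder names
    context, related_qs = inference.get_answer("How do I apply for exchange programmes?")
    print(context)            # best-matching answer context
    print(related_qs[:3])     # some questions associated with that context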
Example #5
    def _get_extended_questions(self):
        with open(DATA + self.dataset + "/extra_questions.txt",
                  'r',
                  encoding='utf8') as f:
            raw = f.read().strip()

        question_frames = raw.split(
            "===================================================================================================="
        )
        question_frames = [qf.strip() for qf in question_frames[:-1]]

        def process(question_frame):
            # return original question and its permutations
            lines = question_frame.split('\n')
            lines = [l.strip() for l in lines]
            if lines[0][:2] == "No":
                return None

            # slice off the literal prefix and trailing "':" (str.strip with a
            # string argument removes a character set, not a prefix)
            original = lines[0][len("Permutations of '"):-2]
            permutations = [l for l in lines[1:] if l]
            return original, permutations

        pre_process = PreProcess()

        question_dict = {}
        t = Timer()
        for qf in question_frames:
            tmp = process(qf)
            if tmp:
                t.start("", verbal=False)
                o, p = tmp  # reuse the parsed result instead of re-parsing
                k = " ".join(pre_process.process(o, remove_stop_words=False))
                question_dict[k] = [
                    " ".join(pre_process.process(i, remove_stop_words=False))
                    for i in p
                ]

                # select the most diverse question set
                self.tf_idf.train([k] + question_dict[k])
                del_num = len(question_dict[k]) // self.top_k
                if del_num == 0:
                    t.remaining_time(t.stop(verbal=False),
                                     len(question_frames))
                    continue

                selected = []
                while question_dict[k]:
                    indices = self.tf_idf.distance(k, question_dict[k])
                    q = question_dict[k].pop(indices[0])
                    selected.append(q)
                    if not question_dict[k]:
                        break
                    close_q = self.tf_idf.distance(
                        q, question_dict[k])[::-1][:del_num]
                    question_dict[k] = [
                        question_dict[k][i]
                        for i in range(len(question_dict[k]))
                        if i not in close_q
                    ]
                question_dict[k] = selected
                t.remaining_time(t.stop(verbal=False), len(question_frames))

        return question_dict
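
The selection loop above is a greedy diversity filter: it repeatedly keeps the permutation farthest from the original question, then drops the del_num permutations closest to the one just kept. A self-contained sketch of the same idea using scikit-learn's TfidfVectorizer and cosine distances in place of the project's tf_idf helper (an illustration, not the author's implementation):

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_distances

    def select_diverse(original, permutations, top_k=10):
        vec = TfidfVectorizer().fit([original] + permutations)
        q = vec.transform([original])
        del_num = len(permutations) // top_k
        if del_num == 0:
            return permutations  # too few permutations to prune

        pool, selected = list(permutations), []
        while pool:
            # keep the permutation farthest from the original question
            far = cosine_distances(q, vec.transform(pool))[0].argmax()
            chosen = pool.pop(far)
            selected.append(chosen)
            if not pool:
                break
            # drop the del_num permutations closest to the kept one,
            # so near-duplicates do not crowd out diverse phrasings
            d = cosine_distances(vec.transform([chosen]), vec.transform(pool))[0]
            drop = set(d.argsort()[:del_num])
            pool = [s for i, s in enumerate(pool) if i not in drop]
        return selected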