Example #1
    def check_f1_em(self,
                    session,
                    context_path,
                    qn_path,
                    ans_path,
                    dataset,
                    num_samples=100,
                    print_to_screen=False):
        """
        Sample from the provided (train/dev) set.
        For each sample, calculate F1 and EM score.
        Return average F1 and EM score for all samples.
        Optionally pretty-print examples.

        Note: This function is not quite the same as the F1/EM numbers you get from "official_eval" mode.
        This function uses the pre-processed version of the e.g. dev set for speed,
        whereas "official_eval" mode uses the original JSON. Therefore:
          1. official_eval takes your max F1/EM score w.r.t. the three reference answers,
            whereas this function compares to just the first answer (which is what's saved in the preprocessed data)
          2. Our preprocessed version of the dev set is missing some examples
            due to tokenization issues (see squad_preprocess.py).
            "official_eval" includes all examples.

        Inputs:
          session: TensorFlow session
          qn_path, context_path, ans_path: paths to {dev/train}.{question/context/answer} data files.
          dataset: string. Either "train" or "dev". Just for logging purposes.
          num_samples: int. How many samples to use. If num_samples=0 then do whole dataset.
          print_to_screen: if True, pretty-prints each example to screen

        Returns:
          F1 and EM: Scalars. The average across the sampled examples.
        """
        logging.info(
            "Calculating F1/EM for %s examples in %s set..." %
            (str(num_samples) if num_samples != 0 else "all", dataset))

        f1_total = 0.
        em_total = 0.
        example_num = 0

        tic = time.time()

        # Note here we select discard_long=False because we want to sample from the entire dataset
        # That means we're truncating, rather than discarding, examples with too-long context or questions
        for batch in get_batch_generator(self.word2id,
                                         context_path,
                                         qn_path,
                                         ans_path,
                                         self.FLAGS.batch_size,
                                         context_len=self.FLAGS.context_len,
                                         question_len=self.FLAGS.question_len,
                                         discard_long=False):

            pred_start_pos, pred_end_pos = self.get_start_end_pos(
                session, batch, dataset)

            # Convert the start and end positions to lists of length batch_size
            pred_start_pos = pred_start_pos.tolist()  # list of length batch_size
            pred_end_pos = pred_end_pos.tolist()  # list of length batch_size

            for ex_idx, (pred_ans_start, pred_ans_end,
                         true_ans_tokens) in enumerate(
                             zip(pred_start_pos, pred_end_pos,
                                 batch.ans_tokens)):
                example_num += 1

                # Get the predicted answer
                # Important: batch.context_tokens contains the original words (no UNKs)
                # You need to use the original no-UNK version when measuring F1/EM
                pred_ans_tokens = batch.context_tokens[ex_idx][
                    pred_ans_start:pred_ans_end + 1]
                pred_answer = " ".join(pred_ans_tokens)

                # Get true answer (no UNKs)
                true_answer = " ".join(true_ans_tokens)

                # Calc F1/EM
                f1 = f1_score(pred_answer, true_answer)
                em = exact_match_score(pred_answer, true_answer)
                f1_total += f1
                em_total += em

                # Optionally pretty-print
                if print_to_screen:
                    print_example(self.word2id, batch.context_tokens[ex_idx],
                                  batch.qn_tokens[ex_idx],
                                  batch.ans_span[ex_idx,
                                                 0], batch.ans_span[ex_idx, 1],
                                  pred_ans_start, pred_ans_end, true_answer,
                                  pred_answer, f1, em)

                if num_samples != 0 and example_num >= num_samples:
                    break

            if num_samples != 0 and example_num >= num_samples:
                break

        f1_total /= example_num
        em_total /= example_num

        toc = time.time()
        logging.info(
            "Calculating F1/EM for %i examples in %s set took %.2f seconds" %
            (example_num, dataset, toc - tic))

        return f1_total, em_total
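
For reference, f1_score and exact_match_score follow the SQuAD convention of comparing normalized answer strings. A minimal sketch of what they typically compute is shown below; the actual implementations are imported from the project's evaluation module (based on the official SQuAD evaluation script), so the details there may differ slightly.

import re
import string
from collections import Counter

def normalize_answer(s):
    """Lowercase, strip punctuation and articles, and collapse whitespace."""
    s = s.lower()
    s = "".join(ch for ch in s if ch not in set(string.punctuation))
    s = re.sub(r"\b(a|an|the)\b", " ", s)
    return " ".join(s.split())

def exact_match_score(prediction, ground_truth):
    # Exact string match after normalization
    return normalize_answer(prediction) == normalize_answer(ground_truth)

def f1_score(prediction, ground_truth):
    # Token-level F1 between the normalized prediction and ground truth
    pred_tokens = normalize_answer(prediction).split()
    true_tokens = normalize_answer(ground_truth).split()
    common = Counter(pred_tokens) & Counter(true_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / float(len(pred_tokens))
    recall = num_same / float(len(true_tokens))
    return 2 * precision * recall / (precision + recall)
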
Example #2
    def check_f1_em(self,
                    session,
                    context_path,
                    qn_path,
                    ans_path,
                    dataset,
                    num_samples=100,
                    print_to_screen=False):
        """
        Sample from the provided (train/dev) set.
        For each sample, calculate F1 and EM score.
        Return average F1 and EM score for all samples.
        Optionally pretty-print examples.
        """
        logging.info(
            "Calculating F1/EM for %s examples in %s set..." %
            (str(num_samples) if num_samples != 0 else "all", dataset))

        f1_total = 0.
        em_total = 0.
        example_num = 0

        tic = time.time()
        for batch in get_batch_generator(self.word2id,
                                         context_path,
                                         qn_path,
                                         ans_path,
                                         self.FLAGS.batch_size,
                                         context_len=self.FLAGS.context_len,
                                         question_len=self.FLAGS.question_len,
                                         discard_long=False):

            pred_start_pos, pred_end_pos = self.get_start_end_pos(
                session, batch)

            # Convert the start and end positions to lists of length batch_size
            pred_start_pos = pred_start_pos.tolist()  # list of length batch_size
            pred_end_pos = pred_end_pos.tolist()  # list of length batch_size

            for ex_idx, (pred_ans_start, pred_ans_end,
                         true_ans_tokens) in enumerate(
                             zip(pred_start_pos, pred_end_pos,
                                 batch.ans_tokens)):
                example_num += 1

                # Get the predicted answer
                pred_ans_tokens = batch.context_tokens[ex_idx][
                    pred_ans_start:pred_ans_end + 1]
                pred_answer = " ".join(pred_ans_tokens)

                # Get true answer (no UNKs)
                true_answer = " ".join(true_ans_tokens)

                # Calc F1/EM
                f1 = f1_score(pred_answer, true_answer)
                em = exact_match_score(pred_answer, true_answer)
                f1_total += f1
                em_total += em

                # Optionally pretty-print
                if print_to_screen:
                    print_example(self.word2id, batch.context_tokens[ex_idx],
                                  batch.qn_tokens[ex_idx],
                                  batch.ans_span[ex_idx,
                                                 0], batch.ans_span[ex_idx, 1],
                                  pred_ans_start, pred_ans_end, true_answer,
                                  pred_answer, f1, em)

                if num_samples != 0 and example_num >= num_samples:
                    break

            if num_samples != 0 and example_num >= num_samples:
                break

        f1_total /= example_num
        em_total /= example_num

        toc = time.time()
        logging.info(
            "Calculating F1/EM for %i examples in %s set took %.2f seconds" %
            (example_num, dataset, toc - tic))

        return f1_total, em_total
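
A hypothetical call site for check_f1_em, e.g. a periodic evaluation step during training. The qa_model object and the data-file paths below are placeholders chosen for illustration, not names taken from the source; substitute whatever your project actually uses.

# Usage sketch (all names/paths here are assumptions):
dev_f1, dev_em = qa_model.check_f1_em(
    session,
    context_path="data/dev.context",
    qn_path="data/dev.question",
    ans_path="data/dev.answer",
    dataset="dev",
    num_samples=0,            # 0 means evaluate on the whole dev set
    print_to_screen=False)
logging.info("Dev F1: %.3f, Dev EM: %.3f" % (dev_f1, dev_em))
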
Example #3
    def check_f1_em(self,
                    session,
                    context_path,
                    qn_path,
                    ans_path,
                    dataset,
                    num_samples=10,
                    print_to_screen=False,
                    write_out=False,
                    file_out=None,
                    shuffle=True):
        """
        Sample from the provided (train/dev) set.
        For each sample, calculate F1 and EM score.
        Return average F1 and EM score for all samples.
        Optionally pretty-print examples.

        Inputs:
          session: TensorFlow session
          qn_path, context_path, ans_path: paths to {dev/train}.{question/context/answer} data files.
          dataset: string. Either "train" or "dev". Just for logging purposes.
          num_samples: int. How many samples to use. If num_samples=0 then do whole dataset.
          print_to_screen: if True, pretty-prints each example to screen

        Returns:
          F1 and EM: Scalars. The average across the sampled examples.
        """
        logging.info(
            "Calculating F1/EM for %s examples in %s set..." %
            (str(num_samples) if num_samples != 0 else "all", dataset))

        f1_total = 0.
        em_total = 0.
        ed_total = 0.
        rough_em_total = 0.
        example_num = 0

        tic = time.time()
        ans_list = []
        graph_route_info = []
        # Accumulate per-example scores across all batches (written out below when write_out=True)
        f1_scores, em_scores, ed_scores, gm_scores = [], [], [], []
        # Note here we select discard_long=False because we want to sample from the entire dataset
        # That means we're truncating, rather than discarding, examples with too-long context or questions
        for batch in get_batch_generator(
                self.word2id,
                self.context2id,
                self.ans2id,
                context_path,
                qn_path,
                ans_path,
                self.FLAGS.batch_size,
                self.graph_vocab_class,
                context_len=self.FLAGS.context_len,
                question_len=self.FLAGS.question_len,
                answer_len=self.FLAGS.answer_len,
                discard_long=False,
                use_raw_graph=self.FLAGS.use_raw_graph,
                shuffle=shuffle,
                show_start_tokens=self.FLAGS.show_start_tokens,
                output_goal=True):
            train_ids, pred_ids, dev_final_states, pred_logits = self.get_prob_dists(
                session, batch)
            start_ids = batch.ans_ids[:, 0].reshape(-1)
            graph_length = np.sum(batch.context_mask, axis=1)

            if self.FLAGS.pred_method != 'beam':
                pred_ids, confidence_score, ans_str = verify_route(
                    start_ids, pred_logits, batch.context_tokens, self.ans2id,
                    self.id2ans, self.FLAGS.answer_len)


            pred_ids = pred_ids.tolist()  # output of the test-time (inference) network
            for ex_idx, (pred_ans_list, true_ans_tokens) in enumerate(
                    zip(pred_ids, list(batch.ans_tokens))):
                example_num += 1
                pred_ans_tokens = []
                for id in pred_ans_list:
                    if id == PAD_ID:
                        break
                    else:
                        pred_ans_tokens.append(self.id2ans[id])
                pred_answer = " ".join(pred_ans_tokens)

                # Get true answer (no UNKs)
                true_answer = " ".join(true_ans_tokens[:])

                # Calculate metrics
                f1, em, edit_dist, goal_match = compute_all_metrics(
                    pred_ans_tokens, true_ans_tokens)
                f1_scores.append(f1)
                em_scores.append(em)
                ed_scores.append(edit_dist)
                gm_scores.append(goal_match)

                f1_total += f1

                em_total += em
                ed_total += edit_dist
                rough_em_total += goal_match
                ans_list.append(pred_answer)
                graph_route_info.append(
                    (str(int(graph_length[ex_idx])),
                     str(len(true_ans_tokens[1:-1])), str(int(em))))

                # Optionally pretty-print
                # (confidence_score below is only defined when FLAGS.pred_method != 'beam')
                if print_to_screen:
                    print_example(self.word2id, self.context2id, self.ans2id,
                                  batch.context_tokens[ex_idx],
                                  batch.qn_tokens[ex_idx], true_answer,
                                  pred_answer, f1, em, edit_dist,
                                  confidence_score[ex_idx])

                if num_samples != 0 and example_num >= num_samples:
                    break

            if num_samples != 0 and example_num >= num_samples:
                break
        f1_total /= example_num
        em_total /= example_num
        ed_total /= example_num
        rough_em_total /= example_num

        toc = time.time()
        logging.info(
            "Calculating F1/EM for %i examples in %s set took %.2f seconds" %
            (example_num, dataset, toc - tic))
        if write_out:
            logging.info("Writing the prediction to {}".format(file_out))
            with open(file_out, 'w') as f:
                for line, extra_info in zip(ans_list, graph_route_info):
                    f.write(line + " " + " ".join(extra_info) + '\n')
            print("Wrote predictions to %s" % file_out)

            em_file = "em_" + str(file_out)
            logging.info("Writing EM scores to {}".format(em_file))
            with open(em_file, 'w') as f:
                for em in em_scores:
                    f.write(str(em) + '\n')
            print("Wrote EM Scores to %s" % em_file)

            ed_file = "ed_" + str(file_out)
            logging.info("Writing ED scores to {}".format(ed_file))
            with open(ed_file, 'w') as f:
                for ed in ed_scores:
                    f.write(str(ed) + '\n')
            print("Wrote ED Scores to %s" % ed_file)

            gm_file = "gm_" + str(file_out)
            logging.info("Writing GM scores to {}".format(gm_file))
            with open(gm_file, 'w') as f:
                for gm in gm_scores:
                    f.write(str(gm) + '\n')
            print("Wrote GM Scores to %s" % gm_file)

        return f1_total, em_total, ed_total, rough_em_total
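
The loop that truncates pred_ids at the first PAD token appears in both this example and the next one; an equivalent small helper, shown as a sketch using the PAD_ID constant and id2ans mapping from the surrounding code, makes the intent explicit.

def ids_to_answer_tokens(pred_ans_ids, id2ans, pad_id):
    """Map predicted ids to answer tokens, stopping at the first PAD id."""
    tokens = []
    for token_id in pred_ans_ids:
        if token_id == pad_id:
            break
        tokens.append(id2ans[token_id])
    return tokens

# e.g. inside the batch loop:
#   pred_ans_tokens = ids_to_answer_tokens(pred_ans_list, self.id2ans, PAD_ID)
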
Example #4
    def demo(self,
             session,
             context_path,
             qn_path,
             ans_path,
             dataset,
             num_samples=10,
             print_to_screen=False,
             write_out=False,
             file_out=None,
             shuffle=True):
        """
        Sample from the provided (train/dev) set.
        For each sample, calculate F1 and EM score.
        Return average F1 and EM score for all samples.
        Optionally pretty-print examples.

        Inputs:
          session: TensorFlow session
          qn_path, context_path, ans_path: paths to {dev/train}.{question/context/answer} data files.
          dataset: string. Either "train" or "dev". Just for logging purposes.
          num_samples: int. How many samples to use. If num_samples=0 then do whole dataset.
          print_to_screen: if True, pretty-prints each example to screen

        Returns:
          F1 and EM: Scalars. The average across the sampled examples.
        """
        logging.info(
            "Calculating F1/EM for %s examples in %s set..." %
            (str(num_samples) if num_samples != 0 else "all", dataset))
        example_num = 0

        tic = time.time()
        ans_list = []
        graph_route_info = []

        for batch in get_batch_generator(
                self.word2id,
                self.context2id,
                self.ans2id,
                context_path,
                qn_path,
                ans_path,
                self.FLAGS.batch_size,
                self.graph_vocab_class,
                context_len=self.FLAGS.context_len,
                question_len=self.FLAGS.question_len,
                answer_len=self.FLAGS.answer_len,
                discard_long=False,
                use_raw_graph=self.FLAGS.use_raw_graph,
                shuffle=shuffle,
                show_start_tokens=self.FLAGS.show_start_tokens,
                output_goal=True):
            train_ids, pred_ids, dev_final_states, pred_logits = self.get_prob_dists(
                session, batch)
            start_ids = batch.ans_ids[:, 0].reshape(-1)

            if self.FLAGS.pred_method != 'beam':
                pred_ids, confidence_score, ans_str = output_route(
                    start_ids, pred_logits, batch.context_tokens, self.ans2id,
                    self.id2ans, self.FLAGS.answer_len)

            pred_ids = pred_ids.tolist()  # output of the test-time (inference) network
            dev_attention_map = create_attention_images_summary(
                dev_final_states)
            print("dev_attention_map", dev_attention_map.shape)
            # Evaluate the attention tensor (requires a default session) so it can be sliced per example below
            dev_attention_map = dev_attention_map.eval().tolist()

            # train_ids is the output of the training network, where the true token is fed as
            # the input to the next RNN step (teacher forcing); kept here for debugging.
            for ex_idx, (pred_ans_list, true_ans_tokens,
                         attention_map) in enumerate(
                             zip(pred_ids, list(batch.ans_tokens),
                                 dev_attention_map)):

                example_num += 1
                pred_ans_tokens = []
                for id in pred_ans_list:
                    if id == PAD_ID:
                        break
                    else:
                        pred_ans_tokens.append(self.id2ans[id])
                pred_answer = " ".join(pred_ans_tokens)

                # Get true answer (no UNKs)
                true_answer = " ".join(true_ans_tokens[:])
                # Calculate metrics
                f1, em, edit_dist, rough_em = compute_all_metrics(
                    pred_ans_tokens, true_ans_tokens)
                ans_list.append(pred_answer)

                if print_to_screen:
                    print_example(self.word2id, self.context2id, self.ans2id,
                                  batch.context_tokens[ex_idx],
                                  batch.qn_tokens[ex_idx], true_answer,
                                  pred_answer, f1, em, edit_dist,
                                  confidence_score[ex_idx])
                    # Draw attention map
                    draw_attention(batch, ex_idx, attention_map,
                                   pred_ans_tokens)

                if num_samples != 0 and example_num >= num_samples:
                    break

            if num_samples != 0 and example_num >= num_samples:
                break

        toc = time.time()
        logging.info(
            "Calculating F1/EM for %i examples in %s set took %.2f seconds" %
            (example_num, dataset, toc - tic))
        if write_out:
            logging.info("Writing the prediction to {}".format(file_out))
            with open(file_out, 'w') as f:
                for line, extra_info in zip(ans_list, graph_route_info):
                    f.write(line + " " + " ".join(extra_info) + '\n')
            print("Wrote predictions to %s" % file_out)

        return
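
A hypothetical way to invoke the demo mode; the qa_model object, flag values and paths are placeholders for illustration, not names taken from the source.

qa_model.demo(session,
              context_path="data/dev.context",
              qn_path="data/dev.question",
              ans_path="data/dev.answer",
              dataset="dev",
              num_samples=5,
              print_to_screen=True,   # also triggers draw_attention for each example
              write_out=False,
              shuffle=True)
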
Example #5
    def get_error_stats(self,
                        session,
                        context_path,
                        qn_path,
                        ans_path,
                        dataset,
                        num_samples=10,
                        print_to_screen=False):
        """
        Sample from the provided (train/dev) set.
        For each sample, calculate F1 and EM score.
        Return average F1 and EM score for all samples.
        Optionally pretty-print examples.

        Note: This function is not quite the same as the F1/EM numbers you get from "official_eval" mode.
        This function uses the pre-processed version of the e.g. dev set for speed,
        whereas "official_eval" mode uses the original JSON. Therefore:
          1. official_eval takes your max F1/EM score w.r.t. the three reference answers,
            whereas this function compares to just the first answer (which is what's saved in the preprocessed data)
          2. Our preprocessed version of the dev set is missing some examples
            due to tokenization issues (see squad_preprocess.py).
            "official_eval" includes all examples.

        Inputs:
          session: TensorFlow session
          qn_path, context_path, ans_path: paths to {dev/train}.{question/context/answer} data files.
          dataset: string. Either "train" or "dev". Just for logging purposes.
          num_samples: int. How many samples to use. If num_samples=0 then do whole dataset.
          print_to_screen: if True, pretty-prints each example to screen

        Returns:
          F1 and EM: Scalars. The average across the sampled examples.
        """
        logging.info(
            "Calculating Error stats for %s examples in %s set..." %
            (str(num_samples) if num_samples != 0 else "all", dataset))

        f1_total = 0.
        em_total = 0.
        example_num = 0

        tic = time.time()

        # Note here we select discard_long=False because we want to sample from the entire dataset
        # That means we're truncating, rather than discarding, examples with too-long context or questions
        first_token_qn_dict_wrong = defaultdict(float)
        first_token_qn_dict_total = defaultdict(float)
        first_token_qn_dict_f1 = defaultdict(float)

        for batch in get_batch_generator(self.word2id,
                                         context_path,
                                         qn_path,
                                         ans_path,
                                         self.FLAGS.batch_size,
                                         context_len=self.FLAGS.context_len,
                                         question_len=self.FLAGS.question_len,
                                         discard_long=False):

            pred_start_pos, pred_end_pos = self.get_start_end_pos(
                session, batch)

            # Convert the start and end positions to lists of length batch_size
            pred_start_pos = pred_start_pos.tolist()  # list of length batch_size
            pred_end_pos = pred_end_pos.tolist()  # list of length batch_size
            for ex_idx, (pred_ans_start, pred_ans_end,
                         true_ans_tokens) in enumerate(
                             zip(pred_start_pos, pred_end_pos,
                                 batch.ans_tokens)):
                example_num += 1

                # Get the predicted answer
                # Important: batch.context_tokens contains the original words (no UNKs)
                # You need to use the original no-UNK version when measuring F1/EM
                pred_ans_tokens = batch.context_tokens[ex_idx][
                    pred_ans_start:pred_ans_end + 1]
                pred_answer = " ".join(pred_ans_tokens)

                # Get true answer (no UNKs)
                true_answer = " ".join(true_ans_tokens)

                # Calc F1/EM
                f1 = f1_score(pred_answer, true_answer)
                em = exact_match_score(pred_answer, true_answer)

                first_token_qn = batch.qn_tokens[ex_idx][0]
                first_token_qn_dict_total[first_token_qn] += 1
                #print 'example_num: ', example_num
                #print 'total words seen in first_token_qn_dict: ', sum(first_token_qn_dict_total.itervalues())
                if not em:
                    #we have found an error:
                    #get first token of error question:
                    first_token_qn_dict_wrong[first_token_qn] += 1

                f1_total += f1
                first_token_qn_dict_f1[first_token_qn] += f1
                em_total += em

                # Optionally pretty-print
                if print_to_screen:
                    print_example(self.word2id, batch.context_tokens[ex_idx],
                                  batch.qn_tokens[ex_idx],
                                  batch.ans_span[ex_idx,
                                                 0], batch.ans_span[ex_idx, 1],
                                  pred_ans_start, pred_ans_end, true_answer,
                                  pred_answer, f1, em)

                if num_samples != 0 and example_num >= num_samples:
                    break

            if num_samples != 0 and example_num >= num_samples:
                break

        f1_total /= example_num
        em_total /= example_num
        print('total words:', sum(first_token_qn_dict_total.values()))

        toc = time.time()
        logging.info(
            "Calculating F1/EM for %i examples in %s set took %.2f seconds" %
            (example_num, dataset, toc - tic))

        for token, count in sorted(first_token_qn_dict_total.items(),
                                   key=lambda kv: (kv[1], kv[0])):
            # key is the first token of the question, value is how many times that token occurs
            freq = first_token_qn_dict_wrong[token] / count
            f1 = first_token_qn_dict_f1[token] / count
            print("When first token is [%s]: avg F1 %.3f; %i wrong exact matches out of %i; "
                  "share of all first tokens %.3f; fraction of this token wrong %.3f" %
                  (token, f1, int(first_token_qn_dict_wrong[token]), int(count),
                   count / sum(first_token_qn_dict_total.values()), freq))

        print('em_total:', em_total)
        print('f1_total:', f1_total)
        return f1_total, em_total
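
The per-first-token breakdown above is independent of the model itself; a small self-contained illustration of the same grouping idea, using made-up (question, exact_match) pairs purely for demonstration, looks like this.

from collections import defaultdict

# Toy data: (question, exact_match) pairs, made up for illustration only.
results = [("what is the capital of France", 1),
           ("what year did the war end", 0),
           ("who wrote the novel", 1),
           ("who founded the company", 1),
           ("when was the bridge built", 0)]

wrong = defaultdict(float)
total = defaultdict(float)
for question, em in results:
    first_token = question.split()[0]
    total[first_token] += 1
    if not em:
        wrong[first_token] += 1

for token in sorted(total, key=total.get, reverse=True):
    print("%-6s total=%i wrong=%i err_rate=%.2f" %
          (token, int(total[token]), int(wrong[token]),
           wrong[token] / total[token]))
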
Example #6
    def check_f1_em(self, session, context_path, qn_path, ans_path, dataset, num_samples=100, print_to_screen=False):
        """
        Sample from the provided (train/dev) set.
        For each sample, calculate F1 and EM score.
        Return average F1 and EM score for all samples.
        Optionally pretty-print examples.

        Note: This function is not quite the same as the F1/EM numbers you get from "official_eval" mode.
        This function uses the pre-processed version of the e.g. dev set for speed,
        whereas "official_eval" mode uses the original JSON. Therefore:
          1. official_eval takes your max F1/EM score w.r.t. the three reference answers,
            whereas this function compares to just the first answer (which is what's saved in the preprocessed data)
          2. Our preprocessed version of the dev set is missing some examples
            due to tokenization issues (see squad_preprocess.py).
            "official_eval" includes all examples.

        Inputs:
          session: TensorFlow session
          qn_path, context_path, ans_path: paths to {dev/train}.{question/context/answer} data files.
          dataset: string. Either "train" or "dev". Just for logging purposes.
          num_samples: int. How many samples to use. If num_samples=0 then do whole dataset.
          print_to_screen: if True, pretty-prints each example to screen

        Returns:
          F1 and EM: Scalars. The average across the sampled examples.
        """
        logging.info("Calculating F1/EM for %s examples in %s set..." % (str(num_samples) if num_samples != 0 else "all", dataset))

        f1_total = 0.
        em_total = 0.
        example_num = 0

        tic = time.time()

        # Note here we select discard_long=False because we want to sample from the entire dataset
        # That means we're truncating, rather than discarding, examples with too-long context or questions
        for batch in get_batch_generator(self.word2id, context_path, qn_path, ans_path, self.FLAGS.batch_size, context_len=self.FLAGS.context_len, question_len=self.FLAGS.question_len, discard_long=False):

            pred_start_pos, pred_end_pos = self.get_start_end_pos(session, batch)

            # Convert the start and end positions to lists of length batch_size
            pred_start_pos = pred_start_pos.tolist() # list of length batch_size
            pred_end_pos = pred_end_pos.tolist() # list of length batch_size

            for ex_idx, (pred_ans_start, pred_ans_end, true_ans_tokens) in enumerate(zip(pred_start_pos, pred_end_pos, batch.ans_tokens)):
                example_num += 1

                # Get the predicted answer
                # Important: batch.context_tokens contains the original words (no UNKs)
                # You need to use the original no-UNK version when measuring F1/EM
                pred_ans_tokens = batch.context_tokens[ex_idx][pred_ans_start : pred_ans_end + 1]
                pred_answer = " ".join(pred_ans_tokens)

                # Get true answer (no UNKs)
                true_answer = " ".join(true_ans_tokens)

                # Calc F1/EM
                f1 = f1_score(pred_answer, true_answer)
                em = exact_match_score(pred_answer, true_answer)
                f1_total += f1
                em_total += em

                # Optionally pretty-print
                if print_to_screen:
                    print_example(self.word2id, batch.context_tokens[ex_idx], batch.qn_tokens[ex_idx], batch.ans_span[ex_idx, 0], batch.ans_span[ex_idx, 1], pred_ans_start, pred_ans_end, true_answer, pred_answer, f1, em)

                if num_samples != 0 and example_num >= num_samples:
                    break

            if num_samples != 0 and example_num >= num_samples:
                break

        f1_total /= example_num
        em_total /= example_num

        toc = time.time()
        logging.info("Calculating F1/EM for %i examples in %s set took %.2f seconds" % (example_num, dataset, toc-tic))

        return f1_total, em_total