# Example 1 (score: 0)
 def _get_label_mapping(self, provided_split=None, provided_sentences=None):
     """Return the tag -> integer-id mapping for this task.

     The mapping is memoized on the instance and cached on disk as a
     pickle; it is only rebuilt (from the tags observed across the
     train/dev/eval splits) when neither cache is available.

     Args:
       provided_split: Optional split name whose sentences are supplied
         directly via `provided_sentences` instead of being loaded.
       provided_sentences: Sentences for `provided_split`, in the same
         (words, tags, extra) triple format as `_get_labeled_sentences`.

     Returns:
       Dict mapping each observed tag (sorted alphabetically) to a dense
       integer id starting at 0.
     """
     if self._label_mapping is not None:  # in-memory cache hit
         return self._label_mapping
     if tf.io.gfile.exists(self._label_mapping_path):  # on-disk cache hit
         self._label_mapping = utils.load_pickle(self._label_mapping_path)
         return self._label_mapping
     utils.log("Writing label mapping for task", self.name)
     tag_counts = collections.Counter()
     for split in ["train", "dev", "eval"]:
         # Skip splits that do not exist for this task.
         if not tf.io.gfile.exists(
                 os.path.join(self.config.raw_data_dir(self.name),
                              split + ".json")):
             continue
         if split == provided_split:
             split_sentences = provided_sentences
         else:
             split_sentences, _id = self._get_labeled_sentences(split)
         for _w, tags, _t in split_sentences:
             for tag in tags:
                 tag_counts[tag] += 1
     # (The original also collected a `train_tags` set here, but nothing in
     # this method ever read it — dead code, removed.)
     labels = sorted(tag_counts.keys())
     label_mapping = {label: i for i, label in enumerate(labels)}
     utils.write_pickle(label_mapping, self._label_mapping_path)
     self._label_mapping = label_mapping
     return label_mapping
# Example 2 (score: 0)
def eval_bagging_best_score(data_dir, split, task_name, selected_idx):
    """Evaluate a bagging ensemble built from a selected subset of models.

    Loads the n-best predictions and null-odds scores of the selected
    models, collects the gold answers/questions for the split, and runs the
    three voting strategies (`vote1`..`vote3`), which write their results
    under the `ccks42bagging` output directory.

    Args:
      data_dir: Root directory holding one sub-directory per model.
      split: Dataset split to evaluate (e.g. "dev" or "eval").
      task_name: Task identifier; also selects the ensemble name prefix.
      selected_idx: '-'-separated indices into the model grid, e.g. "0-3-5".
    """
    # 'ccks42ee' uses the first ensemble naming scheme; everything else the
    # second.
    if task_name == 'ccks42ee':
        model_name_part = 'electra_ensemble'
    else:
        model_name_part = 'electra_ensemble2'
    # Hyper-parameter grid: batch size x max sequence length x epochs.
    all_models = [
        "{}_{}_{}_{}".format(model_name_part, batch_size, max_seq_length,
                             epoch)
        for batch_size in [24, 32]
        for max_seq_length in [384, 480, 512]
        for epoch in [2, 3]
    ]
    models = [all_models[int(x)] for x in selected_idx.split('-')]

    # Collect each selected model's n-best predictions and null-odds scores.
    all_nbest = []
    all_odds = []
    for dire in [os.path.join(data_dir, d) for d in models]:
        results_dir = os.path.join(dire, 'models', 'electra_large', 'results',
                                   '{}_qa'.format(task_name))
        all_nbest.append(
            utils.load_pickle(
                os.path.join(results_dir,
                             '{}_{}_all_nbest.pkl'.format(task_name, split))))
        all_odds.append(
            utils.load_json(
                os.path.join(results_dir,
                             '{}_{}_null_odds.json'.format(task_name,
                                                           split))))

    # Map question ids to their gold answers and question texts.
    qid_answers = collections.OrderedDict()
    qid_questions = collections.OrderedDict()
    dataset = utils.load_json(
        os.path.join(data_dir, model_name_part, 'finetuning_data', task_name,
                     '{}.json'.format(split)))['data']
    for article in dataset:
        for paragraph in article["paragraphs"]:
            for qa in paragraph['qas']:
                _qid = qa['id']
                # Gold answers may be absent (e.g. unlabeled eval data).
                qid_answers[_qid] = qa.get('answers', '')
                qid_questions[_qid] = qa['question']

    all_nbest = filter_short_ans(all_nbest)
    output_dir = os.path.join(data_dir, 'electra_best', 'models',
                              'electra_large', 'results', 'ccks42bagging',
                              task_name)

    # Run the three voting strategies; each writes results to output_dir.
    vote1(dataset, all_nbest, all_odds, qid_answers, split, output_dir)
    vote2(dataset, all_nbest, all_odds, qid_answers, split, output_dir)
    vote3(dataset, all_nbest, all_odds, qid_answers, qid_questions, models,
          split, output_dir)
# Example 3 (score: 0)
 def _get_label_mapping(self, provided_split=None, provided_sentences=None):
     """Return (building and caching if needed) the tag -> integer-id map.

     The mapping is memoized on the instance and cached on disk as a
     pickle. When rebuilt, tags are collected from the train/dev/test
     splits; for the "ccg" task, tags absent from `train_tags` are all
     collapsed onto one shared id.

     Args:
       provided_split: Optional split name whose sentences are supplied
         directly via `provided_sentences` instead of being loaded.
       provided_sentences: Sentences for `provided_split`, as (words, tags)
         pairs matching the output of `_get_labeled_sentences`.

     Returns:
       Dict mapping each tag to an integer id.
     """
     if self._label_mapping is not None:  # in-memory cache hit
         return self._label_mapping
     if tf.io.gfile.exists(self._label_mapping_path):  # on-disk cache hit
         self._label_mapping = utils.load_pickle(self._label_mapping_path)
         return self._label_mapping
     utils.log("Writing label mapping for task", self.name)
     tag_counts = collections.Counter()
     train_tags = set()
     for split in ["train", "dev", "test"]:
         # Skip splits that do not exist for this task.
         if not tf.io.gfile.exists(
                 os.path.join(self.config.raw_data_dir(self.name),
                              split + ".txt")):
             continue
         if split == provided_split:
             split_sentences = provided_sentences
         else:
             split_sentences = self._get_labeled_sentences(split)
         for _, tags in split_sentences:
             # For span-level tasks, re-encode gold spans into per-token
             # tags using LABEL_ENCODING before counting.
             if not self._is_token_level:
                 span_labels = tagging_utils.get_span_labels(tags)
                 tags = tagging_utils.get_tags(span_labels, len(tags),
                                               LABEL_ENCODING)
             for tag in tags:
                 tag_counts[tag] += 1
                 # NOTE(review): this guards on the *provided* split, so when
                 # provided_split == "train" tags from every split end up in
                 # train_tags (and otherwise none do). `if split == "train"`
                 # may have been intended — confirm before changing.
                 if provided_split == "train":
                     train_tags.add(tag)
     if self.name == "ccg":
         # CCG has a long tail of rare supertags: any tag not seen in
         # train_tags is mapped to a single shared "infrequent" id.
         infrequent_tags = []
         for tag in tag_counts:
             if tag not in train_tags:
                 infrequent_tags.append(tag)
         label_mapping = {
             label: i
             for i, label in enumerate(
                 sorted(
                     filter(lambda t: t not in infrequent_tags,
                            tag_counts.keys())))
         }
         # All infrequent tags share the one id just past the frequent ones.
         n = len(label_mapping)
         for tag in infrequent_tags:
             label_mapping[tag] = n
     else:
         labels = sorted(tag_counts.keys())
         label_mapping = {label: i for i, label in enumerate(labels)}
     utils.write_pickle(label_mapping, self._label_mapping_path)
     self._label_mapping = label_mapping
     return label_mapping
    for max_seq_length in [384, 480, 512]:
        for epoch in [2, 3]:
            model_name = "{}_{}_{}_{}".format(model_name_part, batch_size,
                                              max_seq_length, epoch)
            full_models.append(model_name)
models = full_models

# Load each model's n-best predictions and null-odds scores, skipping any
# model whose result files are missing or unreadable (best-effort ensemble).
models_predictions = collections.OrderedDict()
for d in models:
    dire = os.path.join(data_dir, d)
    results_dir = os.path.join(dire, 'models', 'electra_large', 'results',
                               '{}_qa'.format(task_name))
    try:
        prediction = collections.OrderedDict()
        prediction['eval_all_nbest'] = filter_short_ans(
            utils.load_pickle(
                os.path.join(results_dir,
                             '{}_{}_all_nbest.pkl'.format(task_name,
                                                          split))))
        prediction['squad_null_odds'] = utils.load_json(
            os.path.join(results_dir,
                         '{}_{}_null_odds.json'.format(task_name, split)))
        models_predictions[d] = prediction
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt are
    # not swallowed; the best-effort skip-and-log behavior is kept.
    except Exception:
        utils.log(
            "Error at loading all_nbest.pkl & null_odds.json for model {}".
            format(d))
        continue

dataset = \
  utils.load_json((os.path.join(data_dir, model_name_part, 'finetuning_data', task_name, '{}.json'.format(split))))[