Example #1
    def write_predictions(self):
        """Write final predictions to the json file."""
        unique_id_to_result = {}
        for result in self._all_results:
            unique_id_to_result[result.unique_id] = result

        all_predictions = collections.OrderedDict()

        for example in self._eval_examples:
            example_id = example.qas_id
            features = self._task.featurize(example, False, for_eval=True)[0]
            result = unique_id_to_result[features[self._name + "_eid"]]
            max_index = np.argmax(result.logits)
            _max_index = max_index
            decoded_indexes = []
            for i in range(self._config.max_options_num - 1, -1, -1):
                if _max_index >= 2**i:
                    decoded_indexes.append(i)
                    _max_index -= 2**i
                elif _max_index <= 0:
                    break
            options_tags = features[self._name + "_options_tags"]
            combination_options = features[self._name + "_combination_options"]
            answer = None
            if features[self._name + "_type"] == "0":
                # Handle decoding errors here; if any occur they must be resolved (same below)
                if (len(decoded_indexes) != 1
                        or decoded_indexes[0] >= len(options_tags)):
                    utils.log(
                        "decode single answer error, set answer C for default."
                    )
                    answer = "C"
                else:
                    answer = options_tags[decoded_indexes[0]]
            elif features[self._name + "_type"] == "1":
                if (len(decoded_indexes) == 0
                        or decoded_indexes[0] >= len(options_tags)):
                    utils.log(
                        "decode combination single answer error, set answer C for default."
                    )
                    answer = "C"
                else:
                    comb_ops_pred = []
                    for ind in decoded_indexes:
                        comb_ops_pred.append(options_tags[ind])
                    answer = None
                    for op, comb_ops in combination_options.items():
                        if set(comb_ops) == set(comb_ops_pred):
                            answer = op
                            break
                    if answer is None:
                        utils.log(
                            "decode combination single answer error, set answer C for default."
                        )
                        answer = "C"
            all_predictions[example_id] = [answer] if answer else []

        utils.write_json(
            dict(all_predictions),
            self._config.qa_preds_file(self._name + "_" + self._split))
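
The decoding loop above treats the argmax of the logits as a bitmask over answer options, peeling off powers of two from the highest bit down. A minimal standalone sketch of that decoding (the function name and sample values are illustrative, not part of the original code):

def decode_option_indexes(max_index, max_options_num):
    """Decode a combination id into the option indexes it encodes, highest bit first."""
    decoded_indexes = []
    remainder = max_index
    for i in range(max_options_num - 1, -1, -1):
        if remainder >= 2 ** i:
            decoded_indexes.append(i)
            remainder -= 2 ** i
        elif remainder <= 0:
            break
    return decoded_indexes

# 5 = 2**2 + 2**0, so options 2 and 0 are selected.
assert decode_option_indexes(5, 4) == [2, 0]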
Example #2
def run_finetuning(config: configure_finetuning.FinetuningConfig):
  """Run finetuning."""
  hvd.init()

  config.model_dir = config.model_dir if hvd.rank() == 0 else \
      os.path.join(config.model_dir, str(hvd.rank()))
  config.train_batch_size = config.train_batch_size // hvd.size()

  # Setup for training
  results = []
  trial = 1
  heading_info = "model={:}, trial {:}/{:}".format(
      config.model_name, trial, config.num_trials)
  heading = lambda msg: utils.heading(msg + ": " + heading_info)
  heading("Config")
  utils.log_config(config)
  generic_model_dir = config.model_dir
  tasks = task_builder.get_tasks(config)

  # Train and evaluate num_trials models with different random seeds
  while config.num_trials < 0 or trial <= config.num_trials:
    config.model_dir = generic_model_dir + "_" + str(trial)
    if config.do_train:
      utils.rmkdir(config.model_dir)

    model_runner = ModelRunner(config, tasks, hvd)
    if config.do_train:
      heading("Start training")
      model_runner.train()
      utils.log()

    if config.do_eval:
      heading("Run dev set evaluation")
      results.append(model_runner.evaluate())
      write_results(config, results)
      if config.write_test_outputs and trial <= config.n_writes_test:
        heading("Running on the test set and writing the predictions")
        for task in tasks:
          # Currently only writing preds for GLUE and SQuAD 2.0 is supported
          if task.name in ["cola", "mrpc", "mnli", "sst", "rte", "qnli", "qqp",
                           "sts"]:
            for split in task.get_test_splits():
              model_runner.write_classification_outputs([task], trial, split)
          elif task.name == "squad":
            scorer = model_runner.evaluate_task(task, "test", False)
            scorer.write_predictions()
            preds = utils.load_json(config.qa_preds_file("squad"))
            null_odds = utils.load_json(config.qa_na_file("squad"))
            for q, _ in preds.items():
              if null_odds[q] > config.qa_na_threshold:
                preds[q] = ""
            utils.write_json(preds, config.test_predictions(
                task.name, "test", trial))
          else:
            utils.log("Skipping task", task.name,
                      "- writing predictions is not supported for this task")

    if trial != config.num_trials and (not config.keep_all_models):
      utils.rmrf(config.model_dir)
    trial += 1
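
This Horovod variant gives every non-zero rank its own model_dir subdirectory and divides the configured batch size by the worker count, so the effective global batch size matches single-worker training. A small illustration of that per-worker adjustment (the helper below is illustrative and does not require Horovod):

import os

def per_worker_config(model_dir, train_batch_size, rank, size):
    """Mirror the adjustment above: rank 0 keeps model_dir, other ranks get a subdir;
    the configured batch size is split evenly across workers."""
    worker_dir = model_dir if rank == 0 else os.path.join(model_dir, str(rank))
    return worker_dir, train_batch_size // size

for rank in range(4):
    print(per_worker_config("models/electra_base", 128, rank, 4))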
Example #3
    def _process_dir(self, dir_path, json_path, relabel):
        if osp.exists(json_path):
            print("=> {} generated before, awesome!".format(json_path))
            split = read_json(json_path)
            return split['tracklets'], split['num_tracklets'], split['num_pids'], split['num_imgs_per_tracklet']

        print("=> Automatically generating split (might take a while for the first time, have a coffe)")
        pdirs = glob.glob(osp.join(dir_path, '*')) # avoid .DS_Store
        print("Processing {} with {} person identities".format(dir_path, len(pdirs)))

        pid_container = set()
        for pdir in pdirs:
            pid = int(osp.basename(pdir))
            pid_container.add(pid)
        pid2label = {pid:label for label, pid in enumerate(pid_container)}

        tracklets = []
        num_imgs_per_tracklet = []
        for pdir in pdirs:
            pid = int(osp.basename(pdir))
            if relabel: pid = pid2label[pid]
            tdirs = glob.glob(osp.join(pdir, '*'))
            for tdir in tdirs:
                raw_img_paths = glob.glob(osp.join(tdir, '*.jpg'))
                num_imgs = len(raw_img_paths)

                if num_imgs < self.min_seq_len:
                    continue

                num_imgs_per_tracklet.append(num_imgs)
                img_paths = []
                for img_idx in range(num_imgs):
                    # some tracklets start from 0002 instead of 0001
                    img_idx_name = 'F' + str(img_idx+1).zfill(4)
                    res = glob.glob(osp.join(tdir, '*' + img_idx_name + '*.jpg'))
                    if len(res) == 0:
                        print("Warn: index name {} in {} is missing, jump to next".format(img_idx_name, tdir))
                        continue
                    img_paths.append(res[0])
                img_name = osp.basename(img_paths[0])
                camid = int(img_name[5]) - 1 # index-0
                img_paths = tuple(img_paths)
                tracklets.append((img_paths, pid, camid))

        num_pids = len(pid_container)
        num_tracklets = len(tracklets)

        print("Saving split to {}".format(json_path))
        split_dict = {
            'tracklets': tracklets,
            'num_tracklets': num_tracklets,
            'num_pids': num_pids,
            'num_imgs_per_tracklet': num_imgs_per_tracklet,
        }
        write_json(split_dict, json_path)

        return tracklets, num_tracklets, num_pids, num_imgs_per_tracklet
Example #4
 def write_predictions(self):
     """Write final predictions to the json file."""
     all_predictions = collections.OrderedDict()
     unique_id_to_text_a = {}
     for example in self._eval_examples:
         unique_id_to_text_a[example.eid] = example.text_a
     for eid, pred in zip(self._eid, self._preds):
         text_a = unique_id_to_text_a[eid]
         org_pred = pred * (self._max_value -
                            self._min_value) + self._min_value
         all_predictions[text_a] = max(1, round(org_pred))
     utils.write_json(
         dict(all_predictions),
         self._config.cl_preds_file(self._name + "_" + self._split))
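
Example #4 maps a normalized regression output back onto the original score range before rounding. A small sketch of that min-max denormalization; the range [1, 5] is an illustrative choice, not taken from the original config:

def denormalize(pred, min_value, max_value):
    """Map a prediction in [0, 1] back to the original [min_value, max_value] scale."""
    return pred * (max_value - min_value) + min_value

min_value, max_value = 1.0, 5.0  # illustrative score range
for pred in (0.0, 0.37, 1.0):
    org_pred = denormalize(pred, min_value, max_value)
    print(pred, org_pred, max(1, round(org_pred)))  # clamp to at least 1, as above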
Example #5
 def write_predictions(self):
     """Write final predictions to the json file."""
     all_predictions = collections.OrderedDict()
     unique_id_to_text_a = {}
     label_id_to_label_text = {}
     for example in self._eval_examples:
         unique_id_to_text_a[example.eid] = example.text_a
     for label_id, label_text in enumerate(self._label_list):
         label_id_to_label_text[label_id] = label_text
     for eid, pred in zip(self._eid, self._preds):
         text_a = unique_id_to_text_a[eid]
         label_text = label_id_to_label_text[int(pred)]
         all_predictions[text_a] = label_text
     utils.write_json(
         dict(all_predictions),
         self._config.cl_preds_file(self._name + "_" + self._split))
Example #6
 def _split_dataset(self):
     train_raw, train_gt = self._generate_list(self.train_split)
     val_raw, val_gt = self._generate_list(self.val_split)
     val_selected_raw = self._generate_list_selected(
         osp.join(self.val_selected_split, 'velodyne_raw'))
     val_selected_gt = self._generate_list_selected(
         osp.join(self.val_selected_split, 'groundtruth_depth'))
     test_raw = self._generate_list_selected(self.test_dir)
     splits = {
         'train_raw': train_raw,
         'train_gt': train_gt,
         'val_raw': val_raw,
         'val_gt': val_gt,
         'val_selected_raw': val_selected_raw,
         'val_selected_gt': val_selected_gt,
         'test_raw': test_raw
     }
     write_json(splits, self.splits)
Example #7
        def _serialize_balanced_dataset(self, tasks, is_training, split):
            """Write out the dataset as tfrecords."""
            labels = [
                self._tasks[i]._label_list for i in range(len(self._tasks))
            ]
            dataset_name = "_".join(sorted([task.name for task in tasks]))
            dataset_name += "_" + split
            dataset_prefix = os.path.join(self._config.preprocessed_data_dir,
                                          dataset_name)
            tfrecords_path = dataset_prefix + ".tfrecord"
            metadata_path = dataset_prefix + ".metadata"
            batch_size = (self._config.train_batch_size
                          if is_training else self._config.eval_batch_size)

            utils.log("Loading dataset", dataset_name)
            n_examples = None
            if (self._config.use_tfrecords_if_existing
                    and tf.io.gfile.exists(metadata_path)):
                n_examples = utils.load_json(metadata_path)["n_examples"]

            if n_examples is None:
                utils.log("Existing tfrecords not found so creating")
                examples = []
                for task in tasks:
                    task_examples = task.get_examples(split)
                    examples += task_examples
                if is_training:
                    random.shuffle(examples)
                utils.mkdir(tfrecords_path.rsplit("/", 1)[0])
                n_examples = self.serialize_examples(examples, is_training,
                                                     tfrecords_path,
                                                     batch_size)
                utils.write_json({"n_examples": n_examples}, metadata_path)

            input_fn = self._input_fn_builder(tfrecords_path, is_training)
            if is_training:
                steps = int(n_examples // batch_size *
                            self._config.num_train_epochs)
            else:
                steps = n_examples // batch_size

            return input_fn, steps
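
The step counts at the end of Example #7 follow directly from the dataset size: training runs n_examples // batch_size steps per epoch, scaled by num_train_epochs, while evaluation makes a single pass. A quick check with made-up numbers:

n_examples = 10000         # made-up dataset size
train_batch_size = 32
eval_batch_size = 64
num_train_epochs = 3.0

train_steps = int(n_examples // train_batch_size * num_train_epochs)
eval_steps = n_examples // eval_batch_size
print(train_steps, eval_steps)  # 936 156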
Example #8
    def write_predictions(self):
        """Write final predictions to the json file."""
        all_predictions = collections.OrderedDict()
        all_pred_results = self._get_improved_span_labels(False, True)
        assert len(self._eval_examples) == len(all_pred_results)

        for example, span_preds in zip(self._eval_examples, all_pred_results):
            words = example.words
            orig_id = example.orig_id
            word_to_char_mapping = self._word_to_char_mapping[orig_id]
            answers = collections.OrderedDict()
            for s, e, l in span_preds:
                if l not in answers.keys():
                    answers[l] = []
                answers[l].append(
                    ("".join(words[s:e + 1]), word_to_char_mapping[s],
                     word_to_char_mapping[e]))
            all_predictions[orig_id] = answers
        utils.write_json(
            dict(all_predictions),
            self._config.ner_preds_file(self._name + "_" + self._split))
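
Example #8 groups the predicted spans by label and converts word offsets back to character offsets. A toy sketch of that grouping step; the words, mapping, and span triples below are made up:

import collections

words = ["张", "三", "去", "北", "京"]
word_to_char_mapping = [0, 1, 2, 3, 4]        # toy 1:1 character offsets
span_preds = [(0, 1, "PER"), (3, 4, "LOC")]   # (start_word, end_word, label)

answers = collections.OrderedDict()
for s, e, l in span_preds:
    answers.setdefault(l, []).append(
        ("".join(words[s:e + 1]), word_to_char_mapping[s], word_to_char_mapping[e]))
print(answers)  # PER -> ('张三', 0, 1), LOC -> ('北京', 3, 4)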
Example #9
    def _prepare_split(self):
        if not osp.exists(self.split_path):
            print("Creating splits")
            mat_split_data = loadmat(self.split_mat_path)['ls_set']
            
            num_splits = mat_split_data.shape[0]
            num_total_ids = mat_split_data.shape[1]
            assert num_splits == 10
            assert num_total_ids == 300
            num_ids_each = num_total_ids // 2  # integer, used as a slice bound below

            # pids in mat_split_data are indices, so we need to transform them
            # to real pids
            person_cam1_dirs = os.listdir(self.cam_1_path)
            person_cam2_dirs = os.listdir(self.cam_2_path)

            # make sure persons in one camera view can be found in the other camera view
            assert set(person_cam1_dirs) == set(person_cam2_dirs)

            splits = []
            for i_split in range(num_splits):
                # first 50% for testing and the remaining for training, following Wang et al. ECCV'14.
                train_idxs = sorted(list(mat_split_data[i_split,num_ids_each:]))
                test_idxs = sorted(list(mat_split_data[i_split,:num_ids_each]))
                
                train_idxs = [int(i)-1 for i in train_idxs]
                test_idxs = [int(i)-1 for i in test_idxs]
                
                # transform pids to person dir names
                train_dirs = [person_cam1_dirs[i] for i in train_idxs]
                test_dirs = [person_cam1_dirs[i] for i in test_idxs]
                
                split = {'train': train_dirs, 'test': test_dirs}
                splits.append(split)

            print("Totally {} splits are created, following Wang et al. ECCV'14".format(len(splits)))
            print("Split file is saved to {}".format(self.split_path))
            write_json(splits, self.split_path)

        print("Splits created")
Example #10
def vote1(dataset, all_nbest, all_odds, qid_answers, split, output_dir):
    bagging_preds = collections.OrderedDict()
    bagging_odds = collections.OrderedDict()
    bagging_all_nbest = collections.OrderedDict()

    for qid in qid_answers:
        bagging_preds[qid] = \
          (seq([nbest[qid][0] for nbest in all_nbest]).sorted(key=lambda x: x['probability'])).list()[-1]['text']
        bagging_all_nbest[qid] = \
          [(seq([nbest[qid][0] for nbest in all_nbest]).sorted(key=lambda x: x['probability'])).list()[-1]]
        bagging_odds[qid] = np.mean([odds[qid] for odds in all_odds])

    utils.write_json(
        bagging_preds,
        os.path.join(output_dir, 'vote1',
                     'ccks42bagging_{}_preds.json'.format(split)))
    utils.write_pickle(
        bagging_all_nbest,
        os.path.join(output_dir, 'vote1',
                     'ccks42bagging_{}_all_nbest.pkl'.format(split)))
    utils.write_json(
        bagging_odds,
        os.path.join(output_dir, 'vote1',
                     'ccks42bagging_{}_null_odds.json'.format(split)))

    if split in ['train', 'dev']:
        out_eval = main2(dataset, bagging_preds, bagging_odds)
        utils.log('vote1')
        utils.log(out_eval)
    elif split == 'eval':
        for qid in bagging_preds.keys():
            if bagging_odds[qid] > -2.75:
                bagging_preds[qid] = ""
        utils.write_json(
            bagging_preds,
            os.path.join(output_dir, 'vote1',
                         'ccks42bagging_{}_1_preds.json'.format(split)))
    else:
        utils.log('{} split is not supported'.format(split))
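
vote1 leans on the pyfunctional seq API to pick, per question, the top-1 candidate with the highest probability across models and to average the null odds. An equivalent plain-Python sketch with dummy n-best data (the structure mirrors the snippet; the values are made up):

import numpy as np

all_nbest = [   # one n-best dict per model; entry 0 is that model's top candidate
    {"q1": [{"text": "Paris", "probability": 0.80}]},
    {"q1": [{"text": "paris", "probability": 0.90}]},
]
all_odds = [{"q1": -3.1}, {"q1": -2.4}]

top_candidates = [nbest["q1"][0] for nbest in all_nbest]
best = max(top_candidates, key=lambda c: c["probability"])
pred = best["text"]                                   # "paris"
null_odds = float(np.mean([odds["q1"] for odds in all_odds]))
print(pred, null_odds)                                # paris, about -2.75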
Example #11
def vote2(dataset, all_nbest, all_odds, qid_answers, split, output_dir):
    bagging_preds = collections.OrderedDict()
    bagging_odds = collections.OrderedDict()

    for qid in qid_answers:
        preds_scores = (seq(all_nbest).map(lambda x: x[qid][0]).map(
            lambda x: (x['text'], x['probability']))).dict()
        compare = collections.defaultdict(lambda: 0.)
        for pred, score in preds_scores.items():
            compare[pred] += score
        compare = seq(compare.items()).sorted(lambda x: x[1]).reverse().list()
        bagging_preds[qid] = compare[0][0]

        bagging_odds[qid] = np.mean([odds[qid] for odds in all_odds])

    utils.write_json(
        bagging_preds,
        os.path.join(output_dir, 'vote2',
                     'ccks42bagging_{}_preds.json'.format(split)))
    utils.write_json(
        bagging_odds,
        os.path.join(output_dir, 'vote2',
                     'ccks42bagging_{}_null_odds.json'.format(split)))

    if split in ['train', 'dev']:
        out_eval = main2(dataset, bagging_preds, bagging_odds)
        utils.log('vote2')
        utils.log(out_eval)
    elif split == 'eval':
        for qid in bagging_preds.keys():
            if bagging_odds[qid] > -2.75:
                bagging_preds[qid] = ""
        utils.write_json(
            bagging_preds,
            os.path.join(output_dir, 'vote2',
                         'ccks42bagging_{}_1_preds.json'.format(split)))
    else:
        utils.log('{} split is not supported'.format(split))
Example #12
def vote3(dataset, all_nbest, all_odds, qid_answers, qid_questions, models,
          split, output_dir):
    bagging_preds = collections.OrderedDict()
    bagging_odds = collections.OrderedDict()

    def post_process(question, candi, weight=1):
        question = question.lower()
        first_token = candi['text'].split()[0]
        th = 0.
        if "when" in question:
            if first_token in [
                    'before', 'after', 'about', 'around', 'from', 'during'
            ]:
                candi['probability'] += th
        elif "where" in question:
            if first_token in [
                    'in', 'at', 'on', 'behind', 'from', 'through', 'between',
                    'throughout'
            ]:
                candi['probability'] += th
        elif "whose" in question:
            if "'s" in candi['text']:
                candi['probability'] += th
        elif "which" in question:
            if first_token == "the":
                candi['probability'] += th
        candi['probability'] *= weight
        return candi

    cof = 0.2

    for qid in qid_answers:
        question = qid_questions[qid]
        post_process_candidates = (seq(zip(all_nbest, models)).map(lambda x: (
            x[0][qid], cof if 'lr_epoch_results' in x[1] else 1.)).map(
                lambda x: seq(x[0]).map(lambda y: post_process(
                    question, y, x[1])).list()).flatten()).list()
        preds_probs = collections.defaultdict(lambda: [])
        for pred in post_process_candidates:
            preds_probs[pred['text']].append(pred['probability'])
        for pred in post_process_candidates:
            preds_probs[pred['text']] = np.mean(
                preds_probs[pred['text']]).__float__()
        bagging_preds[qid] = (seq(preds_probs.items()).sorted(
            lambda x: x[1]).reverse().map(lambda x: x[0])).list()[0]
        bagging_odds[qid] = np.mean([
            odds[qid] * cof if 'lr_epoch_results' in model else odds[qid]
            for odds, model in zip(all_odds, models)
        ])

    utils.write_json(
        bagging_preds,
        os.path.join(output_dir, 'vote3',
                     'ccks42bagging_{}_preds.json'.format(split)))
    utils.write_json(
        bagging_odds,
        os.path.join(output_dir, 'vote3',
                     'ccks42bagging_{}_null_odds.json'.format(split)))

    if split in ['train', 'dev']:
        out_eval = main2(dataset, bagging_preds, bagging_odds)
        utils.log('vote3')
        utils.log(out_eval)
    elif split == 'eval':
        for qid in bagging_preds.keys():
            if bagging_odds[qid] > -2.75:
                bagging_preds[qid] = ""
        utils.write_json(
            bagging_preds,
            os.path.join(output_dir, 'vote3',
                         'ccks42bagging_{}_1_preds.json'.format(split)))
    else:
        utils.log('{} split is not supported'.format(split))
Example #13
    def write_predictions(self):
        """Write final predictions to the json file."""
        unique_id_to_result = {}
        for result in self._all_results:
            unique_id_to_result[result.unique_id] = result

        _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
            "PrelimPrediction", [
                "feature_index", "start_index", "end_index", "start_logit",
                "end_logit"
            ])

        all_predictions = collections.OrderedDict()
        all_nbest_json = collections.OrderedDict()
        scores_diff_json = collections.OrderedDict()

        for example in self._eval_examples:
            example_id = example.qas_id if "squad" in self._name else example.qid
            features = self._task.featurize(example, False, for_eval=True)

            prelim_predictions = []
            # keep track of the minimum score of null start+end of position 0
            score_null = 1000000  # large and positive
            for (feature_index, feature) in enumerate(features):
                result = unique_id_to_result[feature[self._name + "_eid"]]
                if self._config.joint_prediction:
                    start_indexes = result.start_top_index
                    end_indexes = result.end_top_index
                else:
                    start_indexes = _get_best_indexes(result.start_logits,
                                                      self._config.n_best_size)
                    end_indexes = _get_best_indexes(result.end_logits,
                                                    self._config.n_best_size)
                # if we could have irrelevant answers, get the min score of irrelevant
                if self._v2:
                    if self._config.answerable_classifier:
                        feature_null_score = result.answerable_logit
                    else:
                        feature_null_score = result.start_logits[
                            0] + result.end_logits[0]
                    if feature_null_score < score_null:
                        score_null = feature_null_score
                for i, start_index in enumerate(start_indexes):
                    for j, end_index in enumerate(
                            end_indexes[i] if self._config.joint_prediction
                            else end_indexes):
                        # We could hypothetically create invalid predictions, e.g., predict
                        # that the start of the span is in the question. We throw out all
                        # invalid predictions.
                        if start_index >= len(feature[self._name + "_tokens"]):
                            continue
                        if end_index >= len(feature[self._name + "_tokens"]):
                            continue
                        if start_index == 0:
                            continue
                        if start_index not in feature[self._name +
                                                      "_token_to_orig_map"]:
                            continue
                        if end_index not in feature[self._name +
                                                    "_token_to_orig_map"]:
                            continue
                        if not feature[self._name +
                                       "_token_is_max_context"].get(
                                           start_index, False):
                            continue
                        if end_index < start_index:
                            continue
                        length = end_index - start_index + 1
                        if length > self._config.max_answer_length:
                            continue
                        start_logit = (result.start_top_log_probs[i]
                                       if self._config.joint_prediction else
                                       result.start_logits[start_index])
                        end_logit = (result.end_top_log_probs[i, j]
                                     if self._config.joint_prediction else
                                     result.end_logits[end_index])
                        prelim_predictions.append(
                            _PrelimPrediction(feature_index=feature_index,
                                              start_index=start_index,
                                              end_index=end_index,
                                              start_logit=start_logit,
                                              end_logit=end_logit))

            if self._v2:
                if len(prelim_predictions) == 0 and self._config.debug:
                    tokid = sorted(feature[self._name +
                                           "_token_to_orig_map"].keys())[0]
                    prelim_predictions.append(
                        _PrelimPrediction(feature_index=0,
                                          start_index=tokid,
                                          end_index=tokid + 1,
                                          start_logit=1.0,
                                          end_logit=1.0))
            prelim_predictions = sorted(prelim_predictions,
                                        key=lambda x:
                                        (x.start_logit + x.end_logit),
                                        reverse=True)

            _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
                "NbestPrediction", ["text", "start_logit", "end_logit"])

            seen_predictions = {}
            nbest = []
            for pred in prelim_predictions:
                if len(nbest) >= self._config.n_best_size:
                    break
                feature = features[pred.feature_index]
                tok_tokens = feature[self._name + "_tokens"][pred.start_index:(
                    pred.end_index + 1)]
                orig_doc_start = feature[self._name + "_token_to_orig_map"][
                    pred.start_index]
                orig_doc_end = feature[self._name +
                                       "_token_to_orig_map"][pred.end_index]
                orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end +
                                                                 1)]
                tok_text = " ".join(tok_tokens)

                # De-tokenize WordPieces that have been split off.
                tok_text = tok_text.replace(" ##", "")
                tok_text = tok_text.replace("##", "")

                # Clean whitespace
                tok_text = tok_text.strip()
                tok_text = " ".join(tok_text.split())
                orig_text = " ".join(orig_tokens)

                final_text = get_final_text(self._config, tok_text, orig_text)
                if final_text in seen_predictions:
                    continue

                seen_predictions[final_text] = True

                nbest.append(
                    _NbestPrediction(text=final_text,
                                     start_logit=pred.start_logit,
                                     end_logit=pred.end_logit))

            # In very rare edge cases we could have no valid predictions. So we
            # just create a nonce prediction in this case to avoid failure.
            if not nbest:
                nbest.append(
                    _NbestPrediction(text="empty",
                                     start_logit=0.0,
                                     end_logit=0.0))

            assert len(nbest) >= 1

            total_scores = []
            best_non_null_entry = None
            for entry in nbest:
                total_scores.append(entry.start_logit + entry.end_logit)
                if not best_non_null_entry:
                    if entry.text:
                        best_non_null_entry = entry

            probs = _compute_softmax(total_scores)

            nbest_json = []
            for (i, entry) in enumerate(nbest):
                output = collections.OrderedDict()
                output["text"] = entry.text
                output["probability"] = probs[i]
                output["start_logit"] = entry.start_logit
                output["end_logit"] = entry.end_logit
                nbest_json.append(dict(output))

            assert len(nbest_json) >= 1

            if not self._v2:
                all_predictions[example_id] = nbest_json[0]["text"]
            else:
                # predict "" iff the null score - the score of best non-null > threshold
                if self._config.answerable_classifier:
                    score_diff = score_null
                else:
                    score_diff = score_null - best_non_null_entry.start_logit - (
                        best_non_null_entry.end_logit)
                scores_diff_json[example_id] = score_diff
                all_predictions[example_id] = best_non_null_entry.text

            all_nbest_json[example_id] = nbest_json

        utils.write_json(dict(all_predictions),
                         self._config.qa_preds_file(self._name))
        if self._v2:
            utils.write_json(
                {k: float(v)
                 for k, v in six.iteritems(scores_diff_json)},
                self._config.qa_na_file(self._name))
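
The helper _compute_softmax used above to turn the n-best total scores into probabilities is not shown in these examples; a standard numerically stable version would look like the sketch below (an assumption, not the original implementation):

import math

def compute_softmax(scores):
    """Numerically stable softmax over a list of raw scores."""
    if not scores:
        return []
    max_score = max(scores)
    exp_scores = [math.exp(s - max_score) for s in scores]
    total = sum(exp_scores)
    return [e / total for e in exp_scores]

print(compute_softmax([2.0, 1.0, 0.1]))  # ~[0.66, 0.24, 0.10]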
Example #14
    def _preprocess(self):
        """
        This function is a bit complex and ugly, what it does is
        1. Extract data from cuhk-03.mat and save as png images.
        2. Create 20 classic splits. (Li et al. CVPR'14)
        3. Create new split. (Zhong et al. CVPR'17)
        """
        print("Note: if root path is changed, the previously generated json files need to be re-generated (delete them first)")
        if osp.exists(self.imgs_labeled_dir) and \
           osp.exists(self.imgs_detected_dir) and \
           osp.exists(self.split_classic_det_json_path) and \
           osp.exists(self.split_classic_lab_json_path) and \
           osp.exists(self.split_new_det_json_path) and \
           osp.exists(self.split_new_lab_json_path):
            return

        mkdir_if_missing(self.imgs_detected_dir)
        mkdir_if_missing(self.imgs_labeled_dir)

        print("Extract image data from {} and save as png".format(self.raw_mat_path))
        mat = h5py.File(self.raw_mat_path, 'r')

        def _deref(ref):
            return mat[ref][:].T

        def _process_images(img_refs, campid, pid, save_dir):
            img_paths = [] # Note: some persons only have images for one view
            for imgid, img_ref in enumerate(img_refs):
                img = _deref(img_ref)
                # skip empty cell
                if img.size == 0 or img.ndim < 3: continue
                # images are saved with the following format, index-1 (ensure uniqueness)
                # campid: index of camera pair (1-5)
                # pid: index of person in 'campid'-th camera pair
                # viewid: index of view, {1, 2}
                # imgid: index of image, (1-10)
                viewid = 1 if imgid < 5 else 2
                img_name = '{:01d}_{:03d}_{:01d}_{:02d}.png'.format(campid+1, pid+1, viewid, imgid+1)
                img_path = osp.join(save_dir, img_name)
                imsave(img_path, img)
                img_paths.append(img_path)
            return img_paths

        def _extract_img(name):
            print("Processing {} images (extract and save) ...".format(name))
            meta_data = []
            imgs_dir = self.imgs_detected_dir if name == 'detected' else self.imgs_labeled_dir
            for campid, camp_ref in enumerate(mat[name][0]):
                camp = _deref(camp_ref)
                num_pids = camp.shape[0]
                for pid in range(num_pids):
                    img_paths = _process_images(camp[pid,:], campid, pid, imgs_dir)
                    assert len(img_paths) > 0, "campid{}-pid{} has no images".format(campid, pid)
                    meta_data.append((campid+1, pid+1, img_paths))
                print("done camera pair {} with {} identities".format(campid+1, num_pids))
            return meta_data

        meta_detected = _extract_img('detected')
        meta_labeled = _extract_img('labeled')

        def _extract_classic_split(meta_data, test_split):
            train, test = [], []
            num_train_pids, num_test_pids = 0, 0
            num_train_imgs, num_test_imgs = 0, 0
            for i, (campid, pid, img_paths) in enumerate(meta_data):
                
                if [campid, pid] in test_split:
                    for img_path in img_paths:
                        camid = int(osp.basename(img_path).split('_')[2])
                        test.append((img_path, num_test_pids, camid))
                    num_test_pids += 1
                    num_test_imgs += len(img_paths)
                else:
                    for img_path in img_paths:
                        camid = int(osp.basename(img_path).split('_')[2])
                        train.append((img_path, num_train_pids, camid))
                    num_train_pids += 1
                    num_train_imgs += len(img_paths)
            return train, num_train_pids, num_train_imgs, test, num_test_pids, num_test_imgs

        print("Creating classic splits (# = 20) ...")
        splits_classic_det, splits_classic_lab = [], []
        for split_ref in mat['testsets'][0]:
            test_split = _deref(split_ref).tolist()

            # create split for detected images
            train, num_train_pids, num_train_imgs, test, num_test_pids, num_test_imgs = \
                _extract_classic_split(meta_detected, test_split)
            splits_classic_det.append({
                'train': train, 'query': test, 'gallery': test,
                'num_train_pids': num_train_pids, 'num_train_imgs': num_train_imgs,
                'num_query_pids': num_test_pids, 'num_query_imgs': num_test_imgs,
                'num_gallery_pids': num_test_pids, 'num_gallery_imgs': num_test_imgs,
            })

            # create split for labeled images
            train, num_train_pids, num_train_imgs, test, num_test_pids, num_test_imgs = \
                _extract_classic_split(meta_labeled, test_split)
            splits_classic_lab.append({
                'train': train, 'query': test, 'gallery': test,
                'num_train_pids': num_train_pids, 'num_train_imgs': num_train_imgs,
                'num_query_pids': num_test_pids, 'num_query_imgs': num_test_imgs,
                'num_gallery_pids': num_test_pids, 'num_gallery_imgs': num_test_imgs,
            })
        
        write_json(splits_classic_det, self.split_classic_det_json_path)
        write_json(splits_classic_lab, self.split_classic_lab_json_path)

        def _extract_set(filelist, pids, pid2label, idxs, img_dir, relabel):
            tmp_set = []
            unique_pids = set()
            for idx in idxs:
                img_name = filelist[idx][0]
                camid = int(img_name.split('_')[2])
                pid = pids[idx]
                if relabel: pid = pid2label[pid]
                img_path = osp.join(img_dir, img_name)
                tmp_set.append((img_path, int(pid), camid))
                unique_pids.add(pid)
            return tmp_set, len(unique_pids), len(idxs)

        def _extract_new_split(split_dict, img_dir):
            train_idxs = split_dict['train_idx'].flatten() - 1 # index-0
            pids = split_dict['labels'].flatten()
            train_pids = set(pids[train_idxs])
            pid2label = {pid: label for label, pid in enumerate(train_pids)}
            query_idxs = split_dict['query_idx'].flatten() - 1
            gallery_idxs = split_dict['gallery_idx'].flatten() - 1
            filelist = split_dict['filelist'].flatten()
            train_info = _extract_set(filelist, pids, pid2label, train_idxs, img_dir, relabel=True)
            query_info = _extract_set(filelist, pids, pid2label, query_idxs, img_dir, relabel=False)
            gallery_info = _extract_set(filelist, pids, pid2label, gallery_idxs, img_dir, relabel=False)
            return train_info, query_info, gallery_info

        print("Creating new splits for detected images (767/700) ...")
        train_info, query_info, gallery_info = _extract_new_split(
            loadmat(self.split_new_det_mat_path),
            self.imgs_detected_dir,
        )
        splits = [{
            'train': train_info[0], 'query': query_info[0], 'gallery': gallery_info[0],
            'num_train_pids': train_info[1], 'num_train_imgs': train_info[2],
            'num_query_pids': query_info[1], 'num_query_imgs': query_info[2],
            'num_gallery_pids': gallery_info[1], 'num_gallery_imgs': gallery_info[2],
        }]
        write_json(splits, self.split_new_det_json_path)

        print("Creating new splits for labeled images (767/700) ...")
        train_info, query_info, gallery_info = _extract_new_split(
            loadmat(self.split_new_lab_mat_path),
            self.imgs_labeled_dir,
        )
        splits = [{
            'train': train_info[0], 'query': query_info[0], 'gallery': gallery_info[0],
            'num_train_pids': train_info[1], 'num_train_imgs': train_info[2],
            'num_query_pids': query_info[1], 'num_query_imgs': query_info[2],
            'num_gallery_pids': gallery_info[1], 'num_gallery_imgs': gallery_info[2],
        }]
        write_json(splits, self.split_new_lab_json_path)
Example #15
def run_finetuning(config: configure_finetuning.FinetuningConfig):
    """Run finetuning."""

    # Setup for training
    results = []
    trial = 1
    heading_info = "model={:}, trial {:}/{:}".format(config.model_name, trial,
                                                     config.num_trials)
    heading = lambda msg: utils.heading(msg + ": " + heading_info)
    heading("Config")
    utils.log_config(config)
    generic_model_dir = config.model_dir
    tasks = task_builder.get_tasks(config)

    # Train and evaluate num_trials models with different random seeds
    while config.num_trials < 0 or trial <= config.num_trials:
        config.model_dir = generic_model_dir + "_" + str(trial)
        if config.do_train:
            utils.rmkdir(config.model_dir)

        model_runner = ModelRunner(config, tasks)
        if config.do_train:
            heading("Start training")
            model_runner.train()
            utils.log()

        if config.do_eval:
            heading("Run dev set evaluation")
            results.append(model_runner.evaluate())
            if config.do_test:
                for task in tasks:
                    test_score = model_runner.evaluate_task_test(
                        task, results[-1][task.name]['checkpoint_path'])
                    results[-1][task.name]["test_results"] = test_score
            write_results(config, results)
            if config.write_test_outputs and trial <= config.n_writes_test:
                heading("Running on the test set and writing the predictions")
                for task in tasks:
                    # Currently only writing preds for GLUE and SQuAD 2.0 is supported
                    if task.name in [
                            "cola", "mrpc", "mnli", "sst", "rte", "qnli",
                            "qqp", "sts"
                    ]:
                        for split in task.get_test_splits():
                            model_runner.write_classification_outputs([task],
                                                                      trial,
                                                                      split)
                    elif task.name == "squad":
                        scorer = model_runner.evaluate_task(
                            task, "test", False)
                        scorer.write_predictions()
                        preds = utils.load_json(config.qa_preds_file("squad"))
                        null_odds = utils.load_json(config.qa_na_file("squad"))
                        for q, _ in preds.items():
                            if null_odds[q] > config.qa_na_threshold:
                                preds[q] = ""
                        utils.write_json(
                            preds,
                            config.test_predictions(task.name, "test", trial))
                    else:
                        utils.log(
                            "Skipping task", task.name,
                            "- writing predictions is not supported for this task"
                        )
        if config.do_predict:
            if "dev" in config.predict_split:
                results = model_runner.predict(tasks[0],
                                               config.predict_checkpoint_path,
                                               "dev")
                import pickle
                with open("predict_dev.pickle", "bw") as outfile:
                    pickle.dump(results, outfile)

            if "train" in config.predict_split:
                results = model_runner.predict(tasks[0],
                                               config.predict_checkpoint_path,
                                               "train")
                import pickle
                with open("predict_train.pickle", "bw") as outfile:
                    pickle.dump(results, outfile)

            if "test" in config.predict_split:
                results = model_runner.predict(tasks[0],
                                               config.predict_checkpoint_path,
                                               "test")
                import pickle
                with open("predict_test.pickle", "bw") as outfile:
                    pickle.dump(results, outfile)

        if trial != config.num_trials and (not config.keep_all_models):
            utils.rmrf(config.model_dir)
        trial += 1
Example #16
def run_finetuning(config: configure_finetuning.FinetuningConfig):
  """Run finetuning."""
  tf.get_variable_scope().reuse_variables()

  # Setup for training
  results = []
  trial = 1
  heading_info = "model={:}, trial {:}/{:}".format(
      config.model_name, trial, config.num_trials)
  heading = lambda msg: utils.heading(msg + ": " + heading_info)
  heading("Config")
  utils.log_config(config)
  generic_model_dir = config.model_dir
  tasks = task_builder.get_tasks(config)
  # Train and evaluate num_trials models with different random seeds
  while config.num_trials < 0 or trial <= config.num_trials:
    config.model_dir = generic_model_dir + "_" + str(trial)
    if config.do_train:
      utils.rmkdir(config.model_dir)

    model_runner = ModelRunner(config, tasks)
    if config.do_train:
      heading("Start training")
      model_runner.train()
      utils.log()

    if config.do_eval:
      heading("Run dev set evaluation")
      results.append(model_runner.evaluate())
      write_results(config, results)
      if config.write_test_outputs and trial <= config.n_writes_test:
        heading("Running on the test set and writing the predictions")
        for task in tasks:
          # Currently only writing preds for GLUE and SQuAD 2.0 is supported
          if task.name in ["cola", "mrpc", "mnli", "sst", "rte", "qnli", "qqp",
                           "sts"]:
            for split in task.get_test_splits():
              model_runner.write_classification_outputs([task], trial, split)
          elif task.name == "squad":
            scorer = model_runner.evaluate_task(task, "test", False)
            scorer.write_predictions()
            preds = utils.load_json(config.qa_preds_file("squad"))
            null_odds = utils.load_json(config.qa_na_file("squad"))
            for q, _ in preds.items():
              if null_odds[q] > config.qa_na_threshold:
                preds[q] = ""
            utils.write_json(preds, config.test_predictions(
                task.name, "test", trial))
          else:
            utils.log("Skipping task", task.name,
                      "- writing predictions is not supported for this task")

    if trial != config.num_trials and (not config.keep_all_models):
      utils.rmrf(config.model_dir)
    trial += 1

  # exporting the model
  if config.export_dir:
    # with tf.variable_scope(tf.get_variable_scope(), reuse=True):
    #   model_runner = ModelRunner(config, tasks)
    #   tf.gfile.MakeDirs(config.export_dir)
    #   checkpoint_path = os.path.join(config.init_checkpoint, "model.ckpt-6315")
    #   squad_serving_input_fn = (
    #       build_squad_serving_input_fn(config.max_seq_length))
    #   utils.log("Starting to export model.")
    #   subfolder = model_runner._estimator.export_saved_model(
    #       export_dir_base=os.path.join(config.export_dir, "saved_model"),
    #       serving_input_receiver_fn=squad_serving_input_fn)
    tf.get_variable_scope().reuse_variables()
    model_runner = ModelRunner(config, tasks)
    tf.gfile.MakeDirs(config.export_dir)
    checkpoint_path = os.path.join(config.init_checkpoint, "model.ckpt-6315")
    squad_serving_input_fn = (
        build_squad_serving_input_fn(config.max_seq_length))
    utils.log("Starting to export model.")
    subfolder = model_runner._estimator.export_saved_model(
        export_dir_base=os.path.join(config.export_dir, "saved_model"),
        serving_input_receiver_fn=squad_serving_input_fn)