def generate_vectors(json_input_filename, w2v_dim, perplexity, theta, pca_dims, dim=2):
    vectors = []
    most_dominant_labels = []
    image_ids = []
    label_map = utils.load_json(json_input_filename, w2v_dim)
    for image_id, label in label_map.items():
        label_vectors = []
        label_scores = []
        label_desc = []
        for val in label:
            label_vectors.append(val['word2vec'])
            label_scores.append(val['score'])
            label_desc.append(str(''.join(c for c in val['description'] if c in string.printable)))
        output_vec = word2vec.linear_combination_vectors(vectors=label_vectors, coefficients=label_scores)

        vectors.append(output_vec)
        most_dominant_labels.append(label_desc[0])
        image_ids.append(image_id)

    embeddings = []
    for result in bh_tsne(vectors,
                          perplexity=perplexity,
                          initial_dims=pca_dims,
                          theta=theta,
                          no_dims=dim):
        embeddings.append(result)

    embeddings = utils.scale_max_abs(embeddings)
    return embeddings, most_dominant_labels, image_ids
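
word2vec.linear_combination_vectors and utils.scale_max_abs are not shown in this example; presumably the first returns the score-weighted combination of the label vectors and the second rescales the embedding by its largest absolute value. A minimal NumPy sketch under those assumptions (the names mirror the calls above, the bodies are guesses):

import numpy as np

def linear_combination_vectors(vectors, coefficients):
    # Assumed behavior: sum of the word2vec vectors weighted by their label scores.
    vectors = np.asarray(vectors, dtype=np.float32)
    coefficients = np.asarray(coefficients, dtype=np.float32)
    return (vectors * coefficients[:, None]).sum(axis=0)

def scale_max_abs(embeddings):
    # Assumed behavior: scale all coordinates into [-1, 1] by the global max absolute value.
    embeddings = np.asarray(embeddings, dtype=np.float32)
    return embeddings / np.abs(embeddings).max()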
Example #2
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--data-dir",
                        required=True,
                        help="Location of data files (model weights, etc).")
    parser.add_argument("--model-name",
                        required=True,
                        help="The name of the model being fine-tuned.")
    parser.add_argument("--hparams",
                        default="{}",
                        help="JSON dict of model hyperparameters.")
    args = parser.parse_args()
    if args.hparams.endswith(".json"):
        hparams = utils.load_json(args.hparams)
    else:
        hparams = json.loads(args.hparams)
    tf.logging.set_verbosity(tf.logging.ERROR)
    run_finetuning(
        configure_finetuning.FinetuningConfig(args.model_name, args.data_dir,
                                              **hparams))
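
The --hparams flag accepts either a path to a JSON file or an inline JSON string. A standalone sketch of that dispatch (load_hparams is a hypothetical helper name, and json.load stands in for utils.load_json):

import json

def load_hparams(hparams_arg):
    # Treat the argument as a file path if it ends in ".json",
    # otherwise parse it as an inline JSON dictionary.
    if hparams_arg.endswith(".json"):
        with open(hparams_arg) as f:
            return json.load(f)
    return json.loads(hparams_arg)

# e.g. load_hparams('{"learning_rate": 5e-5, "num_train_epochs": 3}')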
Example #3
    def _serialize_dataset(self, tasks, is_training, split):
        """Write out the dataset as tfrecords."""
        dataset_name = "_".join(sorted([task.name for task in tasks]))
        dataset_name += "_" + split
        dataset_prefix = os.path.join(self._config.preprocessed_data_dir,
                                      dataset_name)
        tfrecords_path = dataset_prefix + ".tfrecord"
        metadata_path = dataset_prefix + ".metadata"
        batch_size = (self._config.train_batch_size
                      if is_training else self._config.eval_batch_size)

        utils.log("Loading dataset", dataset_name)
        n_examples = None
        if tf.io.gfile.exists(metadata_path):
            n_examples = utils.load_json(metadata_path)["n_examples"]

        if n_examples is None:
            utils.log("Existing tfrecords not found so creating")
            examples = []
            for task in tasks:
                print("task-----------", task)
                task_examples = task.get_examples(
                    data_dir=self._config.data_dir,
                    corpus=self._config.corpus,
                    split=split)  #data_dir,corpus,split
                examples += task_examples
            if is_training:
                random.shuffle(examples)
            utils.mkdir(tfrecords_path.rsplit("/", 1)[0])
            n_examples = self.serialize_examples(examples, is_training,
                                                 tfrecords_path, batch_size)
            utils.write_json({"n_examples": n_examples}, metadata_path)

        input_fn = self._input_fn_builder(tfrecords_path, is_training)
        if is_training:
            steps = int(n_examples // batch_size *
                        self._config.num_train_epochs)
        else:
            steps = n_examples // batch_size

        return input_fn, steps
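
The method caches the example count in a small .metadata JSON next to the tfrecords so later runs can skip re-serialization, and it derives the number of estimator steps from that count. The step arithmetic, with purely illustrative numbers:

# Illustrative values only: 100000 examples, batch size 32, 3 training epochs.
n_examples, batch_size, num_train_epochs = 100000, 32, 3.0

train_steps = int(n_examples // batch_size * num_train_epochs)  # 9375
eval_steps = n_examples // batch_size                           # 3125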
Example #4
def generate_vectors(json_input_filename,
                     w2v_dim,
                     perplexity,
                     theta,
                     pca_dims,
                     dim=2):
    vectors = []
    most_dominant_labels = []
    image_ids = []
    label_map = utils.load_json(json_input_filename, w2v_dim)
    for image_id, label in label_map.items():
        label_vectors = []
        label_scores = []
        label_desc = []
        for val in label:
            label_vectors.append(val['word2vec'])
            label_scores.append(val['score'])
            label_desc.append(
                str(''.join(c for c in val['description']
                            if c in string.printable)))
        output_vec = word2vec.linear_combination_vectors(
            vectors=label_vectors, coefficients=label_scores)

        vectors.append(output_vec)
        most_dominant_labels.append(label_desc[0])
        image_ids.append(image_id)

    embeddings = []
    for result in bh_tsne(vectors,
                          perplexity=perplexity,
                          initial_dims=pca_dims,
                          theta=theta,
                          no_dims=dim):
        embeddings.append(result)

    embeddings = utils.scale_max_abs(embeddings)
    return embeddings, most_dominant_labels, image_ids
Example #5
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--data-dir",
                        required=True,
                        help="Location of data files (model weights, etc).")
    parser.add_argument("--model-name",
                        required=True,
                        help="The name of the model being fine-tuned.")
    parser.add_argument("--hparams",
                        default="{}",
                        help="JSON dict of model hyperparameters.")
    args = parser.parse_args()
    if args.hparams.endswith(".json"):
        hparams = utils.load_json(args.hparams)
    else:
        hparams = json.loads(args.hparams)
    tf.logging.set_verbosity(tf.logging.ERROR)
    # hparams = {
    #   "do_train": "true",
    #   "do_eval": "false",
    #   "model_size": "base",
    #   "do_lower_case": "true",
    #   "vocab_size": 100000,
    #   "num_train_steps": 766000,
    #   "save_checkpoints_steps": 50000,
    #   "train_batch_size": 192,
    #   "max_seq_length": 128,
    # }
    train_or_eval(
        configure_pretraining.PretrainingConfig(
            # "danish_electra_base_uncased_100k",
            # "/bachelor_project/electra_google/base_uncased_100k_danish_data",
            # **hparams))
            args.model_name,
            args.data_dir,
            **hparams))
Example #6
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--data_dir",
        required=True,
        type=str,
        help="Location of data files (model weights, etc).",
    )
    parser.add_argument(
        "--model_name",
        required=True,
        type=str,
        help="The name of the model being fine-tuned.",
    )
    parser.add_argument("--hparams",
                        default="{}",
                        type=str,
                        help="JSON dict of model hyperparameters.")
    parser.add_argument("--use_tpu", action="store_true", help="Using tpu.")
    parser.add_argument("--mixed_precision",
                        action="store_true",
                        help="Using mixed precision.")
    args = parser.parse_args()

    if args.mixed_precision:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1"
    if args.hparams.endswith(".json"):
        hparams = utils.load_json(args.hparams)
    else:
        hparams = json.loads(args.hparams)
    tf.logging.set_verbosity(tf.logging.ERROR)
    train_or_eval(
        configure_pretraining.PretrainingConfig(args.model_name, args.data_dir,
                                                args.use_tpu,
                                                args.mixed_precision,
                                                **hparams))
Example #7
def run_finetuning(config: configure_finetuning.FinetuningConfig):
    """Run finetuning."""

    # Setup for training
    results = []
    trial = 1
    heading_info = "model={:}, trial {:}/{:}".format(config.model_name, trial,
                                                     config.num_trials)
    heading = lambda msg: utils.heading(msg + ": " + heading_info)
    heading("Config")
    utils.log_config(config)
    generic_model_dir = config.model_dir
    tasks = task_builder.get_tasks(config)

    # Train and evaluate num_trials models with different random seeds
    while config.num_trials < 0 or trial <= config.num_trials:

        print("#################################################")
        print(tasks)

        t = vars(config)
        print(t)
        print("#################################################")

        # Create Neptune Experiment
        neptune.create_experiment(name='tf-ft', params=vars(config))

        config.model_dir = generic_model_dir + "_" + str(trial) + '_' + str(
            random.randint(0, 10000))
        if config.do_train:
            utils.rmkdir(config.model_dir)

        model_runner = ModelRunner(config, tasks)
        if config.do_train:
            heading("Start training")
            model_runner.train()
            utils.log()

        if config.do_eval:
            heading("Run dev set evaluation")
            eval_result = model_runner.evaluate()
            results.append(eval_result)

            write_results(config, results)
            if config.write_test_outputs and trial <= config.n_writes_test:
                heading("Running on the test set and writing the predictions")
                for task in tasks:
                    # Currently only writing preds for GLUE and SQuAD 2.0 is supported
                    if task.name in [
                            "cola", "mrpc", "mnli", "sst", "rte", "qnli",
                            "qqp", "sts"
                    ]:
                        for split in task.get_test_splits():
                            model_runner.write_classification_outputs([task],
                                                                      trial,
                                                                      split)
                    elif task.name == "squad":
                        scorer = model_runner.evaluate_task(
                            task, "test", False)
                        scorer.write_predictions()
                        preds = utils.load_json(config.qa_preds_file("squad"))
                        null_odds = utils.load_json(config.qa_na_file("squad"))
                        for q, _ in preds.items():
                            if null_odds[q] > config.qa_na_threshold:
                                preds[q] = ""
                        utils.write_json(
                            preds,
                            config.test_predictions(task.name, "test", trial))
                    else:
                        utils.log(
                            "Skipping task", task.name,
                            "- writing predictions is not supported for this task"
                        )

        if trial != config.num_trials and (not config.keep_all_models):
            utils.rmrf(config.model_dir)
        trial += 1
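
The SQuAD 2.0 branch above blanks out any predicted answer whose no-answer ("null") odds exceed config.qa_na_threshold. A toy illustration of that post-processing step (the question ids, answers, odds, and threshold below are made up):

preds = {"q1": "in 1998", "q2": "Paris"}      # hypothetical qa_preds_file contents
null_odds = {"q1": -2.3, "q2": 1.7}           # hypothetical qa_na_file contents
qa_na_threshold = 0.0

for q in preds:
    if null_odds[q] > qa_na_threshold:
        preds[q] = ""   # mark the question as unanswerable

# preds is now {"q1": "in 1998", "q2": ""}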
Example #8
def process_single(in_file, out_file):
    # Load the detected face embeddings and run the gender classifier on each.
    result = [(face_id, predict_gender(embed), predict_gender_score(embed))
              for face_id, embed in load_json(in_file)]

    save_json(result, out_file)
Example #9
            full_models.append(model_name)
models = full_models

models_predictions = collections.OrderedDict()
for d in models:
    dire = os.path.join(data_dir, d)
    try:
        prediction = collections.OrderedDict()
        prediction['eval_all_nbest'] = filter_short_ans(
            utils.load_pickle(
                os.path.join(dire, 'models', 'electra_large', 'results',
                             '{}_qa'.format(task_name),
                             '{}_{}_all_nbest.pkl'.format(task_name, split))))
        prediction['squad_null_odds'] = utils.load_json(
            os.path.join(dire, 'models', 'electra_large', 'results',
                         '{}_qa'.format(task_name),
                         '{}_{}_null_odds.json'.format(task_name, split)))
        models_predictions[d] = prediction
    except Exception:
        utils.log(
            "Error loading all_nbest.pkl & null_odds.json for model {}".format(d))
        continue

dataset = utils.load_json(
    os.path.join(data_dir, model_name_part, 'finetuning_data', task_name,
                 '{}.json'.format(split)))['data']
qid_answers = collections.OrderedDict()
for article in dataset:
    for p in article['paragraphs']:
        for qa in p['qas']:
Example #10
def run_finetuning(config: configure_finetuning.FinetuningConfig):
    """Run finetuning."""

    # Setup for training
    results = []
    trial = 1
    heading_info = "model={:}, trial {:}/{:}".format(config.model_name, trial,
                                                     config.num_trials)
    heading = lambda msg: utils.heading(msg + ": " + heading_info)
    heading("Config")
    utils.log_config(config)
    generic_model_dir = config.model_dir
    tasks = task_builder.get_tasks(config)

    # Train and evaluate num_trials models with different random seeds
    while config.num_trials < 0 or trial <= config.num_trials:
        config.model_dir = generic_model_dir + "_" + str(trial)
        if config.do_train:
            utils.rmkdir(config.model_dir)

        model_runner = ModelRunner(config, tasks)
        if config.do_train:
            heading("Start training")
            model_runner.train()
            utils.log()

        if config.do_eval:
            heading("Run dev set evaluation")
            results.append(model_runner.evaluate())
            if config.do_test:
                for task in tasks:
                    test_score = model_runner.evaluate_task_test(
                        task, results[-1][task.name]['checkpoint_path'])
                    results[-1][task.name]["test_results"] = test_score
            write_results(config, results)
            if config.write_test_outputs and trial <= config.n_writes_test:
                heading("Running on the test set and writing the predictions")
                for task in tasks:
                    # Currently only writing preds for GLUE and SQuAD 2.0 is supported
                    if task.name in [
                            "cola", "mrpc", "mnli", "sst", "rte", "qnli",
                            "qqp", "sts"
                    ]:
                        for split in task.get_test_splits():
                            model_runner.write_classification_outputs([task],
                                                                      trial,
                                                                      split)
                    elif task.name == "squad":
                        scorer = model_runner.evaluate_task(
                            task, "test", False)
                        scorer.write_predictions()
                        preds = utils.load_json(config.qa_preds_file("squad"))
                        null_odds = utils.load_json(config.qa_na_file("squad"))
                        for q, _ in preds.items():
                            if null_odds[q] > config.qa_na_threshold:
                                preds[q] = ""
                        utils.write_json(
                            preds,
                            config.test_predictions(task.name, "test", trial))
                    else:
                        utils.log(
                            "Skipping task", task.name,
                            "- writing predictions is not supported for this task"
                        )
        if config.do_predict:
            import pickle
            if "dev" in config.predict_split:
                results = model_runner.predict(tasks[0],
                                               config.predict_checkpoint_path,
                                               "dev")
                with open("predict_dev.pickle", "wb") as outfile:
                    pickle.dump(results, outfile)

            if "train" in config.predict_split:
                results = model_runner.predict(tasks[0],
                                               config.predict_checkpoint_path,
                                               "train")
                with open("predict_train.pickle", "wb") as outfile:
                    pickle.dump(results, outfile)

            if "test" in config.predict_split:
                results = model_runner.predict(tasks[0],
                                               config.predict_checkpoint_path,
                                               "test")
                with open("predict_test.pickle", "wb") as outfile:
                    pickle.dump(results, outfile)

        if trial != config.num_trials and (not config.keep_all_models):
            utils.rmrf(config.model_dir)
        trial += 1
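
The do_predict branch dumps the raw predictions with pickle; they can be reloaded later with the standard library, e.g. for the dev split written above:

import pickle

with open("predict_dev.pickle", "rb") as infile:
    dev_predictions = pickle.load(infile)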
Example #11
def run_finetuning(config: configure_finetuning.FinetuningConfig):
  """Run finetuning."""
  tf.get_variable_scope().reuse_variables() #import pdb; pdb.set_trace()

  # Setup for training
  results = []
  trial = 1
  heading_info = "model={:}, trial {:}/{:}".format(
      config.model_name, trial, config.num_trials)
  heading = lambda msg: utils.heading(msg + ": " + heading_info)
  heading("Config")
  utils.log_config(config)
  generic_model_dir = config.model_dir
  tasks = task_builder.get_tasks(config)
  # Train and evaluate num_trials models with different random seeds
  while config.num_trials < 0 or trial <= config.num_trials:
    config.model_dir = generic_model_dir + "_" + str(trial)
    if config.do_train:
      utils.rmkdir(config.model_dir)

    model_runner = ModelRunner(config, tasks)
    if config.do_train:
      heading("Start training")
      model_runner.train()
      utils.log()

    if config.do_eval:
      heading("Run dev set evaluation")
      results.append(model_runner.evaluate())
      write_results(config, results)
      if config.write_test_outputs and trial <= config.n_writes_test:
        heading("Running on the test set and writing the predictions")
        for task in tasks:
          # Currently only writing preds for GLUE and SQuAD 2.0 is supported
          if task.name in ["cola", "mrpc", "mnli", "sst", "rte", "qnli", "qqp",
                           "sts"]:
            for split in task.get_test_splits():
              model_runner.write_classification_outputs([task], trial, split)
          elif task.name == "squad":
            scorer = model_runner.evaluate_task(task, "test", False)
            scorer.write_predictions()
            preds = utils.load_json(config.qa_preds_file("squad"))
            null_odds = utils.load_json(config.qa_na_file("squad"))
            for q, _ in preds.items():
              if null_odds[q] > config.qa_na_threshold:
                preds[q] = ""
            utils.write_json(preds, config.test_predictions(
                task.name, "test", trial))
          else:
            utils.log("Skipping task", task.name,
                      "- writing predictions is not supported for this task")

    if trial != config.num_trials and (not config.keep_all_models):
      utils.rmrf(config.model_dir)
    trial += 1

  # exporting the model
  if config.export_dir:
    # with tf.variable_scope(tf.get_variable_scope(), reuse=True):
    #   model_runner = ModelRunner(config, tasks)
    #   tf.gfile.MakeDirs(config.export_dir)
    #   checkpoint_path = os.path.join(config.init_checkpoint, "model.ckpt-6315")
    #   squad_serving_input_fn = (
    #       build_squad_serving_input_fn(config.max_seq_length))
    #   utils.log("Starting to export model.")
    #   subfolder = model_runner._estimator.export_saved_model(
    #       export_dir_base=os.path.join(config.export_dir, "saved_model"),
    #       serving_input_receiver_fn=squad_serving_input_fn)
    tf.get_variable_scope().reuse_variables()
    model_runner = ModelRunner(config, tasks)
    tf.gfile.MakeDirs(config.export_dir)
    checkpoint_path = os.path.join(config.init_checkpoint, "model.ckpt-6315")
    squad_serving_input_fn = (
        build_squad_serving_input_fn(config.max_seq_length))
    utils.log("Starting to export model.")
    subfolder = model_runner._estimator.export_saved_model(
        export_dir_base=os.path.join(config.export_dir, "saved_model"),
        serving_input_receiver_fn=squad_serving_input_fn)
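
build_squad_serving_input_fn is not shown in this example; a minimal sketch of what such a TF1 serving input function commonly looks like, assuming BERT-style feature names (the feature names and the body are assumptions, not the project's actual code):

import tensorflow.compat.v1 as tf

def build_squad_serving_input_fn(max_seq_length):
    def serving_input_fn():
        # Placeholders for a batch of SQuAD features at serving time.
        features = {
            "input_ids": tf.placeholder(tf.int32, [None, max_seq_length]),
            "input_mask": tf.placeholder(tf.int32, [None, max_seq_length]),
            "segment_ids": tf.placeholder(tf.int32, [None, max_seq_length]),
        }
        return tf.estimator.export.ServingInputReceiver(features, features)
    return serving_input_fn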
Example #12
def run_finetuning(config: configure_finetuning.FinetuningConfig):
    """Run finetuning."""

    # Setup for training
    results = []
    trial = 1
    heading_info = "model={:}, trial {:}/{:}".format(config.model_name, trial,
                                                     config.num_trials)
    heading = lambda msg: utils.heading(msg + ": " + heading_info)
    heading("Config")
    utils.log_config(config)
    generic_model_dir = config.model_dir
    tasks = task_builder.get_tasks(config)

    # Train and evaluate num_trials models with different random seeds
    while config.num_trials < 0 or trial <= config.num_trials:
        config.model_dir = generic_model_dir + "_" + str(trial)
        if config.do_train:
            utils.rmkdir(config.model_dir)

        model_runner = ModelRunner(config, tasks)
        if config.do_train:
            heading_info = "model={:}, trial {:}/{:}".format(
                config.model_name, trial, config.num_trials)
            heading("Start training")
            model_runner.train()
            utils.log()

        if config.do_eval:
            if config.write_eval_outputs and trial <= config.n_writes_test:
                heading("Running on the dev set and writing the predictions")
                for task in tasks:
                    # Currently only writing preds for GLUE and SQuAD 2.0 is supported
                    if task.name in [
                            "cola", "mrpc", "mnli", "sst", "rte", "qnli",
                            "qqp", "sts"
                    ]:
                        for split in task.get_test_splits():
                            model_runner.write_classification_outputs([task],
                                                                      trial,
                                                                      split)
                    elif task.name == "squad":
                        scorer = model_runner.evaluate_task(task, "dev", False)
                        scorer.write_predictions()
                        preds = utils.load_json(
                            config.qa_preds_file(task.name + "_dev"))
                        null_odds = utils.load_json(
                            config.qa_na_file(task.name + "_dev"))
                        for q, _ in preds.items():
                            if null_odds[q] > config.qa_na_threshold:
                                preds[q] = ""
                        utils.write_json(
                            preds,
                            config.test_predictions(task.name, "dev", trial))
                    elif task.name == "cmrc2018" or task.name == "drcd":
                        scorer = model_runner.evaluate_task(task, "dev", False)
                        scorer.write_predictions()
                        preds = utils.load_json(
                            config.qa_preds_file(task.name + "_dev"))
                        #utils.write_json(preds, config.test_predictions(task.name, "dev", trial))
                        if config.num_trials > 1:
                            utils.write_json(
                                preds,
                                config.qa_preds_file(task.name + "_dev_" +
                                                     str(trial)))
                    else:
                        utils.log(
                            "Skipping task", task.name,
                            "- writing predictions is not supported for this task"
                        )
            else:
                heading("Run dev set evaluation")
                # results.append(model_runner.evaluate(split="train"))
                results.append(model_runner.evaluate(split="dev"))
                write_results(config, results)

        if config.do_test:
            if config.write_test_outputs and trial <= config.n_writes_test:
                heading("Running on the test set and writing the predictions")
                for task in tasks:
                    # Currently only writing preds for GLUE and SQuAD 2.0 is supported
                    if task.name in [
                            "cola", "mrpc", "mnli", "sst", "rte", "qnli",
                            "qqp", "sts"
                    ]:
                        for split in task.get_test_splits():
                            model_runner.write_classification_outputs([task],
                                                                      trial,
                                                                      split)
                    elif task.name == "squad":
                        scorer = model_runner.evaluate_task(
                            task, "eval", False)
                        scorer.write_predictions()
                        preds = utils.load_json(
                            config.qa_preds_file(task.name + "_eval"))
                        null_odds = utils.load_json(
                            config.qa_na_file(task.name + "_eval"))
                        for q, _ in preds.items():
                            if null_odds[q] > config.qa_na_threshold:
                                preds[q] = ""
                        utils.write_json(
                            preds,
                            config.test_predictions(task.name, "eval", trial))
                    elif task.name == "cmrc2018" or task.name == "drcd":
                        scorer = model_runner.evaluate_task(
                            task, "eval", False)
                        scorer.write_predictions()
                        preds = utils.load_json(
                            config.qa_preds_file(task.name + "_eval"))
                        #utils.write_json(preds, config.test_predictions(task.name, "eval", trial))
                        if config.num_trials > 1:
                            utils.write_json(
                                preds,
                                config.qa_preds_file(task.name + "_eval_" +
                                                     str(trial)))
                    elif task.name in [
                            "ccks42ee", "ccks42single", "ccks42multi"
                    ]:
                        scorer = model_runner.evaluate_task(
                            task, "eval", False)
                        scorer.write_predictions()
                        preds = utils.load_json(
                            config.qa_preds_file(task.name + "_eval"))
                        null_odds = utils.load_json(
                            config.qa_na_file(task.name + "_eval"))
                        for q, _ in preds.items():
                            if null_odds[q] > config.qa_na_threshold:
                                preds[q] = ""
                        utils.write_json(
                            preds,
                            config.qa_preds_file(task.name + "_eval_" +
                                                 str(trial)))
                    elif task.name in [
                            "ccks42ec", "ner", "ccks42num", "ccks42reg"
                    ]:
                        scorer = model_runner.evaluate_task(
                            task, "eval", False)
                        scorer.write_predictions()
                    else:
                        utils.log(
                            "Skipping task", task.name,
                            "- writing predictions is not supported for this task"
                        )
            else:
                heading("Run test set evaluation")
                results.append(model_runner.evaluate(split="eval"))
                write_results(config, results)

        if trial != config.num_trials and (not config.keep_all_models):
            utils.rmrf(config.model_dir)
        trial += 1