Example #1
    def setUp(self):
        xnmt.events.clear()
        self.model_context = ModelContext()
        self.model_context.dynet_param_collection = PersistentParamCollection(
            "some_file", 1)
        self.model = DefaultTranslator(
            src_embedder=SimpleWordEmbedder(self.model_context,
                                            vocab_size=100),
            encoder=BiLSTMSeqTransducer(self.model_context),
            attender=MlpAttender(self.model_context),
            trg_embedder=SimpleWordEmbedder(self.model_context,
                                            vocab_size=100),
            decoder=MlpSoftmaxDecoder(self.model_context,
                                      vocab_size=100,
                                      bridge=CopyBridge(self.model_context,
                                                        dec_layers=1)),
        )
        self.model.initialize_training_strategy(TrainingStrategy())
        self.model.set_train(False)
        self.model.initialize_generator()

        self.training_corpus = BilingualTrainingCorpus(
            train_src="examples/data/head.ja",
            train_trg="examples/data/head.en",
            dev_src="examples/data/head.ja",
            dev_trg="examples/data/head.en")
        self.corpus_parser = BilingualCorpusParser(
            src_reader=PlainTextReader(),
            trg_reader=PlainTextReader(),
            training_corpus=self.training_corpus)
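A note on the PersistentParamCollection call in this fixture: judging from the constructor shown in Example #6, its two positional arguments are the file the model parameters are saved under and the number of recent checkpoints to keep. The annotated restatement below is an inference from these snippets, not documented API:

# Argument roles inferred from Example #6 (treat as an assumption):
params = PersistentParamCollection(
    "some_file",  # model_file: dummy path used by the test fixture
    1)            # save_num_checkpoints: keep a single recent checkpoint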
Example #2
 def setUp(self):
     xnmt.events.clear()
     self.input_reader = PlainTextReader()
     list(self.input_reader.read_sents('examples/data/head.ja'))
     self.input_reader.freeze()
     self.context = ModelContext()
     self.context.dynet_param_collection = PersistentParamCollection(
         None, 0)
Example #3
 def setUp(self):
     xnmt.events.clear()
     self.model_context = ModelContext()
     self.model_context.dynet_param_collection = PersistentParamCollection(
         "some_file", 1)
     self.training_corpus = BilingualTrainingCorpus(
         train_src="examples/data/head.ja",
         train_trg="examples/data/head.en",
         dev_src="examples/data/head.ja",
         dev_trg="examples/data/head.en")
     self.corpus_parser = BilingualCorpusParser(
         src_reader=PlainTextReader(),
         trg_reader=PlainTextReader(),
         training_corpus=self.training_corpus)
Example #4
 def test_overfitting(self):
     self.model_context = ModelContext()
     self.model_context.dynet_param_collection = PersistentParamCollection(
         "some_file", 1)
     self.model_context.default_layer_dim = 16
     train_args = {}
     training_corpus = BilingualTrainingCorpus(
         train_src="examples/data/head.ja",
         train_trg="examples/data/head.en",
         dev_src="examples/data/head.ja",
         dev_trg="examples/data/head.en")
     train_args['corpus_parser'] = BilingualCorpusParser(
         training_corpus=training_corpus,
         src_reader=PlainTextReader(),
         trg_reader=PlainTextReader())
     train_args['training_strategy'] = TrainingStrategy()
     train_args['model'] = DefaultTranslator(
         src_embedder=SimpleWordEmbedder(self.model_context,
                                         vocab_size=100),
         encoder=BiLSTMSeqTransducer(self.model_context),
         attender=MlpAttender(self.model_context),
         trg_embedder=SimpleWordEmbedder(self.model_context,
                                         vocab_size=100),
         decoder=MlpSoftmaxDecoder(self.model_context, vocab_size=100),
     )
     train_args['model_file'] = None
     train_args['save_num_checkpoints'] = 0
     train_args['trainer'] = AdamTrainer(self.model_context, alpha=0.1)
     train_args['batcher'] = SrcBatcher(batch_size=10,
                                        break_ties_randomly=False)
     training_regimen = xnmt.train.TrainingRegimen(
         yaml_context=self.model_context, **train_args)
     training_regimen.model_context = self.model_context
     for _ in range(50):
         training_regimen.one_epoch(update_weights=True)
     self.assertAlmostEqual(
         0.0,
         training_regimen.logger.epoch_loss.loss_values['loss'] /
         training_regimen.logger.epoch_words,
         places=2)
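The closing assertion checks the average per-word loss of the final epoch: the loss accumulated in epoch_loss is divided by epoch_words, and after 50 epochs on the tiny head.ja/head.en corpus this ratio is expected to be close to 0.0, i.e. the model has memorized (overfit) the data. Restated using only names that appear in the snippet:

# Average loss per word over the final epoch; expected to approach 0.0
# once the toy corpus has been memorized.
per_word_loss = (training_regimen.logger.epoch_loss.loss_values['loss']
                 / training_regimen.logger.epoch_words)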
Example #5
def main(overwrite_args=None):
  argparser = argparse.ArgumentParser()
  argparser.add_argument("--dynet-mem", type=int)
  argparser.add_argument("--dynet-seed", type=int)
  argparser.add_argument("--dynet-autobatch", type=int)
  argparser.add_argument("--dynet-devices", type=str)
  argparser.add_argument("--dynet-viz", action='store_true', help="use visualization")
  argparser.add_argument("--dynet-gpu", action='store_true', help="use GPU acceleration")
  argparser.add_argument("--dynet-gpu-ids", type=int)
  argparser.add_argument("--dynet-gpus", type=int)
  argparser.add_argument("--dynet-weight-decay", type=float)
  argparser.add_argument("--generate-doc", action='store_true', help="Do not run, output documentation instead")
  argparser.add_argument("experiments_file")
  argparser.add_argument("experiment_name", nargs='*', help="Run only the specified experiments")
  argparser.set_defaults(generate_doc=False)
  args = argparser.parse_args(overwrite_args)

  config_parser = OptionParser()

  if args.generate_doc:
    print(config_parser.generate_options_table())
    exit(0)

  if args.dynet_seed:
    random.seed(args.dynet_seed)
    np.random.seed(args.dynet_seed)

  config_experiment_names = config_parser.experiment_names_from_file(args.experiments_file)

  results = []

  # Check ahead of time that all experiments exist, to avoid bad surprises
  experiment_names = args.experiment_name or config_experiment_names

  if args.experiment_name:
    nonexistent = set(experiment_names).difference(config_experiment_names)
    if len(nonexistent) != 0:
      raise Exception("Experiments {} do not exist".format(",".join(list(nonexistent))))

  for experiment_name in experiment_names:
    exp_tasks = config_parser.parse_experiment(args.experiments_file, experiment_name)

    print("=> Running {}".format(experiment_name))
    
    exp_args = exp_tasks.get("experiment", {})
    # TODO: refactor
    if not "model_file" in exp_args: exp_args["model_file"] = "<EXP>.mod"
    if not "hyp_file" in exp_args: exp_args["hyp_file"] = "<EXP>.hyp"
    if not "out_file" in exp_args: exp_args["out_file"] = "<EXP>.out"
    if not "err_file" in exp_args: exp_args["model_file"] = "<EXP>.err"
    if not "cfg_file" in exp_args: exp_args["cfg_file"] = None
    if not "eval_only" in exp_args: exp_args["eval_only"] = False
    if not "eval_metrics" in exp_args: exp_args["eval_metrics"] = "bleu"
    if "cfg_file" in exp_args and exp_args["cfg_file"] != None:
      shutil.copyfile(args.experiments_file, exp_args["cfg_file"])

    preproc_args = exp_tasks.get("preproc", {})
    # Do preprocessing
    print("> Preprocessing")
    xnmt.xnmt_preproc.xnmt_preproc(**preproc_args)

    print("> Initializing TrainingRegimen")
    train_args = exp_tasks["train"]
    train_args.model_file = exp_args["model_file"] # TODO: can we use param sharing for this?
    model_context = ModelContext()
    model_context.dynet_param_collection = PersistentParamCollection(exp_args["model_file"], 1)
    if hasattr(train_args, "glob"):
      for k in train_args.glob:
        setattr(model_context, k, train_args.glob[k])
    train_args = YamlSerializer().initialize_if_needed(UninitializedYamlObject(train_args), model_context)
    
    xnmt_decoder = exp_tasks.get("decode", {})
    xnmt_decoder.trg_file = exp_args["hyp_file"] # TODO: can we use param sharing for this?
    xnmt_decoder.model_file = None  # The model is passed to the decoder directly
    xnmt_decoder = YamlSerializer().initialize_if_needed(UninitializedYamlObject(xnmt_decoder), model_context)

    evaluate_args = exp_tasks.get("evaluate", {})
    evaluate_args["hyp_file"] = exp_args["hyp_file"]
    evaluators = [s.lower() for s in exp_args["eval_metrics"].split(",") if s.strip() != ""]

    output = Tee(exp_args["out_file"], 3)
    err_output = Tee(exp_args["err_file"], 3, error=True)

    # Do training
    if "random_search_report" in exp_tasks:
      print("> instantiated random parameter search: %s" % exp_tasks["random_search_report"])

    print("> Training")
    training_regimen = train_args
    training_regimen.xnmt_decoder = copy.copy(xnmt_decoder)
    training_regimen.evaluate_args = copy.copy(evaluate_args)

    eval_scores = "Not evaluated"
    if not exp_args["eval_only"]:
      training_regimen.run_epochs(exp_args["run_for_epochs"])

    if not exp_args["eval_only"]:
      print('reverting learned weights to best checkpoint..')
      training_regimen.model_context.dynet_param_collection.revert_to_best_model()
    if evaluators:
      print("> Evaluating test set")
      output.indent += 2
      xnmt_decoder(model_elements=(training_regimen.corpus_parser, training_regimen.model))
      eval_scores = []
      for evaluator in evaluators:
        evaluate_args["evaluator"] = evaluator
        eval_score = xnmt.xnmt_evaluate.xnmt_evaluate(**evaluate_args)
        print(eval_score)
        eval_scores.append(eval_score)
      output.indent -= 2

    results.append((experiment_name, eval_scores))

    output.close()
    err_output.close()

  print("")
  print("{:<30}|{:<40}".format("Experiment", " Final Scores"))
  print("-" * (70 + 1))

  for line in results:
    experiment_name, eval_scores = line
    for i in range(len(eval_scores)):
      print("{:<30}| {:<40}".format((experiment_name if i==0 else ""), str(eval_scores[i])))
Example #6
    def __init__(self,
                 corpus_parser,
                 model_file,
                 model,
                 yaml_context=None,
                 glob={},
                 dev_every=0,
                 batcher=None,
                 training_strategy=None,
                 save_num_checkpoints=1,
                 pretrained_model_file="",
                 src_format="text",
                 trainer=None,
                 lr_decay=1.0,
                 lr_decay_times=3,
                 attempts_before_lr_decay=1,
                 dev_metrics="",
                 schedule_metric="loss",
                 restart_trainer=False,
                 reload_command=None):
        """
    :param corpus_parser:
    :param model_file:
    :param model:
    :param yaml_context: (TODO: remove default value)
    :param dev_every (int): dev checkpoints every n sentences (0 for only after epoch)
    :param batcher: Type of batcher. Defaults to SrcBatcher of batch size 32.
    :param training_strategy:
    :param save_num_checkpoints (int): Save recent n best checkpoints
    :param pretrained_model_file: Path of pre-trained model file
    :param src_format: Format of input data: text/contvec
    :param trainer: Trainer object, default is SGD with learning rate 0.1
    :param lr_decay (float):
    :param lr_decay_times (int):  Early stopping after decaying learning rate a certain number of times
    :param attempts_before_lr_decay (int): apply LR decay after dev scores haven't improved over this many checkpoints
    :param dev_metrics: Comma-separated list of evaluation metrics (bleu/wer/cer)
    :param schedule_metric: determine learning schedule based on this dev_metric (loss/bleu/wer/cer)
    :param restart_trainer: Restart trainer (useful for Adam) and revert weights to best dev checkpoint when applying LR decay (https://arxiv.org/pdf/1706.09733.pdf)
    :param reload_command: Command to change the input data after each epoch.
                           --epoch EPOCH_NUM will be appended to the command.
                           To just reload the data after each epoch set the command to 'true'.
    """
        dy.renew_cg()

        # TODO: don't need to keep a dedicated args object any longer
        args = dict(dev_every=dev_every,
                    batcher=batcher,
                    corpus_parser=corpus_parser,
                    training_strategy=training_strategy,
                    model_file=model_file,
                    save_num_checkpoints=save_num_checkpoints,
                    pretrained_model_file=pretrained_model_file,
                    src_format=src_format,
                    default_layer_dim=glob.get("default_layer_dim", 512),
                    trainer=trainer,
                    lr_decay=lr_decay,
                    lr_decay_times=lr_decay_times,
                    attempts_before_lr_decay=attempts_before_lr_decay,
                    dev_metrics=dev_metrics,
                    schedule_metric=schedule_metric,
                    restart_trainer=restart_trainer,
                    reload_command=reload_command,
                    dropout=glob.get("dropout", 0.0),
                    weight_noise=glob.get("weight_noise", 0.0),
                    model=model)
        self.args = args
        if yaml_context:
            self.model_context = yaml_context
        else:
            self.model_context = ModelContext()
            self.model_context.dynet_param_collection = PersistentParamCollection(
                self.args["model_file"], self.args["save_num_checkpoints"])

        if args["lr_decay"] > 1.0 or args["lr_decay"] <= 0.0:
            raise RuntimeError(
                "illegal lr_decay, must satisfy: 0.0 < lr_decay <= 1.0")
        self.num_times_lr_decayed = 0
        self.early_stopping_reached = False
        self.cur_attempt = 0

        self.evaluators = [
            s.lower() for s in self.args["dev_metrics"].split(",")
            if s.strip() != ""
        ]
        if self.args["schedule_metric"].lower() not in self.evaluators:
            self.evaluators.append(self.args["schedule_metric"].lower())
        if "loss" not in self.evaluators: self.evaluators.append("loss")

        if args["reload_command"] is not None:
            self._augmentation_handle = None
            self._augment_data_initial()

        # Initialize the serializer
        self.model_serializer = YamlSerializer()

        self.create_corpus_and_model()

        self.model.initialize_training_strategy(self.training_strategy)

        if self.args["batcher"] is None:
            self.batcher = SrcBatcher(32)
        else:
            self.batcher = self.args["batcher"]
        if args["src_format"] == "contvec":
            self.batcher.pad_token = np.zeros(self.model.src_embedder.emb_dim)
        self.pack_batches()
        self.logger = BatchLossTracker(args["dev_every"],
                                       self.total_train_sent)

        if args["trainer"] is None:
            self.trainer = xnmt.optimizer.SimpleSGDTrainer(
                self.model_context, 0.1)
        else:
            self.trainer = args["trainer"]
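Combining the signature above with the defaults visible further down in __init__, a hypothetical constructor call could look as follows (assuming this is the TrainingRegimen class instantiated in Example #4; my_corpus_parser and my_model are placeholders):

# Hypothetical call; my_corpus_parser and my_model stand in for real objects.
regimen = xnmt.train.TrainingRegimen(
    corpus_parser=my_corpus_parser,
    model_file="experiment.mod",
    model=my_model,
    batcher=None,                  # falls back to SrcBatcher(32)
    trainer=None,                  # falls back to SimpleSGDTrainer with learning rate 0.1
    lr_decay=0.5,                  # must satisfy 0.0 < lr_decay <= 1.0
    lr_decay_times=3,              # give up after the learning rate has been decayed three times
    attempts_before_lr_decay=1,
    dev_metrics="bleu",            # "loss" is always added to the evaluator list as well
    schedule_metric="loss")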