def setUp(self):
    """Create a small DefaultTranslator in generation mode and a corpus parser.

    The translator is assembled from freshly constructed components that all
    share one ModelContext; it is switched to non-training mode and its
    generator is initialized. The corpus parser reads the head.ja / head.en
    example files for both the training and the dev split.
    """
    xnmt.events.clear()

    # Shared context for every model component.
    context = ModelContext()
    self.model_context = context
    context.dynet_param_collection = PersistentParamCollection("some_file", 1)

    # Components are built one by one, in the same order the translator
    # receives them, so parameter allocation order stays unchanged.
    src_embedder = SimpleWordEmbedder(context, vocab_size=100)
    encoder = BiLSTMSeqTransducer(context)
    attender = MlpAttender(context)
    trg_embedder = SimpleWordEmbedder(context, vocab_size=100)
    bridge = CopyBridge(context, dec_layers=1)
    decoder = MlpSoftmaxDecoder(context, vocab_size=100, bridge=bridge)

    translator = DefaultTranslator(
        src_embedder=src_embedder,
        encoder=encoder,
        attender=attender,
        trg_embedder=trg_embedder,
        decoder=decoder,
    )
    self.model = translator
    translator.initialize_training_strategy(TrainingStrategy())
    translator.set_train(False)
    translator.initialize_generator()

    # The same small example files serve as both train and dev data.
    src_path = "examples/data/head.ja"
    trg_path = "examples/data/head.en"
    self.training_corpus = BilingualTrainingCorpus(
        train_src=src_path,
        train_trg=trg_path,
        dev_src=src_path,
        dev_trg=trg_path,
    )
    self.corpus_parser = BilingualCorpusParser(
        src_reader=PlainTextReader(),
        trg_reader=PlainTextReader(),
        training_corpus=self.training_corpus,
    )
def setUp(self):
    """Prepare a frozen PlainTextReader and a ModelContext for the tests.

    The reader is run over the whole head.ja example file before being
    frozen; the sentences themselves are discarded.
    """
    xnmt.events.clear()

    reader = PlainTextReader()
    self.input_reader = reader
    # Exhaust the reader over the example file; only the side effects on the
    # reader matter, the returned sentences are thrown away.
    for _ in reader.read_sents('examples/data/head.ja'):
        pass
    reader.freeze()

    context = ModelContext()
    self.context = context
    context.dynet_param_collection = PersistentParamCollection(None, 0)
def setUp(self):
    """Create the shared ModelContext and a corpus parser over the example data.

    The head.ja / head.en example files are used for both the training and
    the dev split.
    """
    xnmt.events.clear()

    context = ModelContext()
    self.model_context = context
    context.dynet_param_collection = PersistentParamCollection("some_file", 1)

    src_path = "examples/data/head.ja"
    trg_path = "examples/data/head.en"
    self.training_corpus = BilingualTrainingCorpus(
        train_src=src_path,
        train_trg=trg_path,
        dev_src=src_path,
        dev_trg=trg_path,
    )
    self.corpus_parser = BilingualCorpusParser(
        src_reader=PlainTextReader(),
        trg_reader=PlainTextReader(),
        training_corpus=self.training_corpus,
    )
def test_overfitting(self):
    """Train for 50 epochs on the example data and expect a near-zero loss.

    After the last epoch the per-word training loss (epoch loss divided by
    epoch word count) must equal 0.0 to two decimal places.
    """
    context = ModelContext()
    self.model_context = context
    context.dynet_param_collection = PersistentParamCollection("some_file", 1)
    context.default_layer_dim = 16

    src_path = "examples/data/head.ja"
    trg_path = "examples/data/head.en"
    training_corpus = BilingualTrainingCorpus(
        train_src=src_path,
        train_trg=trg_path,
        dev_src=src_path,
        dev_trg=trg_path,
    )

    # Entries are listed in the order the objects must be constructed.
    train_args = {
        'corpus_parser': BilingualCorpusParser(
            training_corpus=training_corpus,
            src_reader=PlainTextReader(),
            trg_reader=PlainTextReader(),
        ),
        'training_strategy': TrainingStrategy(),
        'model': DefaultTranslator(
            src_embedder=SimpleWordEmbedder(context, vocab_size=100),
            encoder=BiLSTMSeqTransducer(context),
            attender=MlpAttender(context),
            trg_embedder=SimpleWordEmbedder(context, vocab_size=100),
            decoder=MlpSoftmaxDecoder(context, vocab_size=100),
        ),
        'model_file': None,
        'save_num_checkpoints': 0,
        'trainer': AdamTrainer(context, alpha=0.1),
        'batcher': SrcBatcher(batch_size=10, break_ties_randomly=False),
    }

    training_regimen = xnmt.train.TrainingRegimen(
        yaml_context=context, **train_args)
    training_regimen.model_context = context

    epochs_left = 50
    while epochs_left > 0:
        training_regimen.one_epoch(update_weights=True)
        epochs_left -= 1

    logger = training_regimen.logger
    per_word_loss = logger.epoch_loss.loss_values['loss'] / logger.epoch_words
    self.assertAlmostEqual(0.0, per_word_loss, places=2)
def test_train_dev_loss_equal(self):
    """Check that train and dev losses agree when both use the same data.

    One epoch is run without weight updates on a corpus whose train and dev
    files are identical; the per-word training loss must then match the
    logged dev loss.
    """
    context = ModelContext()
    self.model_context = context
    context.dynet_param_collection = NonPersistentParamCollection()

    src_path = "examples/data/head.ja"
    trg_path = "examples/data/head.en"
    training_corpus = BilingualTrainingCorpus(
        train_src=src_path,
        train_trg=trg_path,
        dev_src=src_path,
        dev_trg=trg_path,
    )

    # Entries are listed in the order the objects must be constructed.
    train_args = {
        'corpus_parser': BilingualCorpusParser(
            training_corpus=training_corpus,
            src_reader=PlainTextReader(),
            trg_reader=PlainTextReader(),
        ),
        'loss_calculator': LossCalculator(),
        'model': DefaultTranslator(
            src_embedder=SimpleWordEmbedder(context, vocab_size=100),
            encoder=BiLSTMSeqTransducer(context),
            attender=MlpAttender(context),
            trg_embedder=SimpleWordEmbedder(context, vocab_size=100),
            decoder=MlpSoftmaxDecoder(context, vocab_size=100),
        ),
        'trainer': None,
        'batcher': SrcBatcher(batch_size=5, break_ties_randomly=False),
        'run_for_epochs': 1,
    }

    training_regimen = xnmt.training_regimen.SimpleTrainingRegimen(
        yaml_context=context, **train_args)
    training_regimen.model_context = context
    training_regimen.run_training(update_weights=False)

    logger = training_regimen.logger
    train_loss_per_word = (
        logger.epoch_loss.loss_values['loss'] / logger.epoch_words)
    self.assertAlmostEqual(train_loss_per_word, logger.dev_score.loss)
def main(overwrite_args=None):
    """Run the experiments described in an experiments file and print their scores.

    For each selected experiment this preprocesses the data, builds the
    training regimen and decoder from the parsed configuration, trains
    (unless ``eval_only`` is set), decodes, evaluates with every configured
    metric, and finally prints a summary table of all results.

    :param overwrite_args: list of command-line arguments to parse instead of
      ``sys.argv[1:]``; ``None`` means use the real command line.
    :raises SystemExit: after printing the options table when
      ``--generate-doc`` is given (and by argparse on invalid arguments).
    :raises Exception: if an experiment named on the command line does not
      exist in the experiments file.
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument("--dynet-mem", type=int)
    argparser.add_argument("--dynet-seed", type=int)
    argparser.add_argument("--dynet-autobatch", type=int)
    argparser.add_argument("--dynet-devices", type=str)
    argparser.add_argument("--dynet-viz", action='store_true', help="use visualization")
    argparser.add_argument("--dynet-gpu", action='store_true', help="use GPU acceleration")
    argparser.add_argument("--dynet-gpu-ids", type=int)
    argparser.add_argument("--dynet-gpus", type=int)
    argparser.add_argument("--dynet-weight-decay", type=float)
    argparser.add_argument("--generate-doc", action='store_true', help="Do not run, output documentation instead")
    argparser.add_argument("experiments_file")
    argparser.add_argument("experiment_name", nargs='*', help="Run only the specified experiments")
    argparser.set_defaults(generate_doc=False)
    args = argparser.parse_args(overwrite_args)

    config_parser = OptionParser()

    if args.generate_doc:
        print(config_parser.generate_options_table())
        # Equivalent to exit(0) without relying on the `site`-provided helper.
        raise SystemExit(0)

    if args.dynet_seed:
        random.seed(args.dynet_seed)
        np.random.seed(args.dynet_seed)

    config_experiment_names = config_parser.experiment_names_from_file(args.experiments_file)

    results = []

    # Check ahead of time that all experiments exist, to avoid bad surprises
    experiment_names = args.experiment_name or config_experiment_names

    if args.experiment_name:
        nonexistent = set(experiment_names).difference(config_experiment_names)
        if nonexistent:
            raise Exception("Experiments {} do not exist".format(",".join(nonexistent)))

    # Values used for any experiment setting the configuration leaves out.
    default_exp_args = (
        ("model_file", "<EXP>.mod"),
        ("hyp_file", "<EXP>.hyp"),
        ("out_file", "<EXP>.out"),
        # Previously this default was written to "model_file", which both
        # clobbered the model path and left "err_file" unset.
        ("err_file", "<EXP>.err"),
        ("cfg_file", None),
        ("eval_only", False),
        ("eval_metrics", "bleu"),
    )

    for experiment_name in experiment_names:
        exp_tasks = config_parser.parse_experiment(args.experiments_file, experiment_name)

        print("=> Running {}".format(experiment_name))

        exp_args = exp_tasks.get("experiment", {})
        # TODO: refactor
        for key, default in default_exp_args:
            if key not in exp_args:
                exp_args[key] = default
        if exp_args["cfg_file"] is not None:
            shutil.copyfile(args.experiments_file, exp_args["cfg_file"])

        preproc_args = exp_tasks.get("preproc", {})
        # Do preprocessing
        print("> Preprocessing")
        xnmt.xnmt_preproc.xnmt_preproc(**preproc_args)

        print("> Initializing TrainingRegimen")
        train_args = exp_tasks["train"]
        train_args.model_file = exp_args["model_file"]  # TODO: can we use param sharing for this?
        model_context = ModelContext()
        model_context.dynet_param_collection = PersistentParamCollection(exp_args["model_file"], 1)
        # Global settings of the training section become attributes of the context.
        if hasattr(train_args, "glob"):
            for k in train_args.glob:
                setattr(model_context, k, train_args.glob[k])
        train_args = YamlSerializer().initialize_if_needed(UninitializedYamlObject(train_args), model_context)

        xnmt_decoder = exp_tasks.get("decode", {})
        xnmt_decoder.trg_file = exp_args["hyp_file"]  # TODO: can we use param sharing for this?
        xnmt_decoder.model_file = None  # The model is passed to the decoder directly
        xnmt_decoder = YamlSerializer().initialize_if_needed(UninitializedYamlObject(xnmt_decoder), model_context)

        evaluate_args = exp_tasks.get("evaluate", {})
        evaluate_args["hyp_file"] = exp_args["hyp_file"]
        # A real list (not a lazy map object) so that the truth test below is
        # meaningful and the names can be iterated more than once.
        evaluators = [s.lower() for s in exp_args["eval_metrics"].split(",")]

        output = Tee(exp_args["out_file"], 3)
        err_output = Tee(exp_args["err_file"], 3, error=True)

        # Do training
        if "random_search_report" in exp_tasks:
            print("> instantiated random parameter search: %s" % exp_tasks["random_search_report"])

        print("> Training")
        training_regimen = train_args
        training_regimen.xnmt_decoder = copy.copy(xnmt_decoder)
        training_regimen.evaluate_args = copy.copy(evaluate_args)

        eval_scores = "Not evaluated"
        if not exp_args["eval_only"]:
            training_regimen.run_epochs(exp_args["run_for_epochs"])
            print('reverting learned weights to best checkpoint..')
            training_regimen.model_context.dynet_param_collection.revert_to_best_model()

        if evaluators:
            print("> Evaluating test set")
            output.indent += 2
            xnmt_decoder(model_elements=(training_regimen.corpus_parser, training_regimen.model))
            eval_scores = []
            for evaluator in evaluators:
                evaluate_args["evaluator"] = evaluator
                eval_score = xnmt.xnmt_evaluate.xnmt_evaluate(**evaluate_args)
                print(eval_score)
                eval_scores.append(eval_score)
            output.indent -= 2

        results.append((experiment_name, eval_scores))

        # NOTE: deliberately not wrapped in try/finally; the error tee stays
        # open if an exception propagates out of this function.
        output.close()
        err_output.close()

    print("")
    print("{:<30}|{:<40}".format("Experiment", " Final Scores"))
    print("-" * (70 + 1))

    for experiment_name, eval_scores in results:
        for i, score in enumerate(eval_scores):
            # Only the first row of each experiment carries its name.
            print("{:<30}| {:<40}".format((experiment_name if i == 0 else ""), str(score)))
def __init__(self, corpus_parser, model_file, model, yaml_context=None, glob={}, dev_every=0, batcher=None, training_strategy=None, save_num_checkpoints=1, pretrained_model_file="", src_format="text", trainer=None, lr_decay=1.0, lr_decay_times=3, attempts_before_lr_decay=1, dev_metrics="", schedule_metric="loss", restart_trainer=False, reload_command=None):
    """
    Set up everything needed for training: model context, evaluator list,
    corpus and model, batcher, packed batches, loss logger and trainer.

    All arguments are also collected into the ``self.args`` dictionary.

    :param corpus_parser: stored in ``self.args["corpus_parser"]``; not used directly in this method
      (presumably consumed by ``create_corpus_and_model`` -- confirm)
    :param model_file: stored in ``self.args``; passed to ``PersistentParamCollection`` when no ``yaml_context`` is given
    :param model: stored in ``self.args["model"]``; not used directly in this method
    :param yaml_context: (TODO: remove default value) if truthy, used as ``self.model_context``; otherwise a new
      ``ModelContext`` with a ``PersistentParamCollection`` is created
    :param glob: dictionary read for ``default_layer_dim`` (default 512), ``dropout`` (default 0.0) and
      ``weight_noise`` (default 0.0); only read here, never modified
    :param dev_every (int): dev checkpoints every n sentences (0 for only after epoch)
    :param batcher: Type of batcher. Defaults to SrcBatcher of batch size 32.
    :param training_strategy: stored in ``self.args["training_strategy"]``
    :param save_num_checkpoints (int): Save recent n best checkpoints
    :param pretrained_model_file: Path of pre-trained model file
    :param src_format: Format of input data: text/contvec
    :param trainer: Trainer object, default is SGD with learning rate 0.1
    :param lr_decay (float): must satisfy 0.0 < lr_decay <= 1.0, otherwise a RuntimeError is raised
    :param lr_decay_times (int): Early stopping after decaying learning rate a certain number of times
    :param attempts_before_lr_decay (int): apply LR decay after dev scores haven't improved over this many checkpoints
    :param dev_metrics: Comma-separated list of evaluation metrics (bleu/wer/cer)
    :param schedule_metric: determine learning schedule based on this dev_metric (loss/bleu/wer/cer)
    :param restart_trainer: Restart trainer (useful for Adam) and revert weights to best dev checkpoint when applying LR decay (https://arxiv.org/pdf/1706.09733.pdf)
    :param reload_command: Command to change the input data after each epoch.
                       --epoch EPOCH_NUM will be appended to the command.
                       To just reload the data after each epoch set the command to 'true'.
    :raises RuntimeError: if ``lr_decay`` is outside the interval (0.0, 1.0]
    """
    dy.renew_cg()

    # NOTE(review): `glob={}` is a mutable default argument; it is harmless
    # here only because this method reads it via .get() and never mutates it.

    # TODO: don't need to keep a dedicated args object any longer
    args = dict(dev_every=dev_every, batcher=batcher, corpus_parser=corpus_parser,
                training_strategy=training_strategy, model_file=model_file,
                save_num_checkpoints=save_num_checkpoints,
                pretrained_model_file=pretrained_model_file, src_format=src_format,
                default_layer_dim=glob.get("default_layer_dim", 512),
                trainer=trainer, lr_decay=lr_decay, lr_decay_times=lr_decay_times,
                attempts_before_lr_decay=attempts_before_lr_decay,
                dev_metrics=dev_metrics, schedule_metric=schedule_metric,
                restart_trainer=restart_trainer, reload_command=reload_command,
                dropout=glob.get("dropout", 0.0),
                weight_noise=glob.get("weight_noise", 0.0), model=model)
    self.args = args

    # Reuse the caller's context when one is supplied; otherwise build a
    # fresh one backed by a persistent parameter collection.
    if yaml_context:
        self.model_context = yaml_context
    else:
        self.model_context = ModelContext()
        self.model_context.dynet_param_collection = PersistentParamCollection(
            self.args["model_file"], self.args["save_num_checkpoints"])

    # Validate the decay factor before any further setup is done.
    if args["lr_decay"] > 1.0 or args["lr_decay"] <= 0.0:
        raise RuntimeError(
            "illegal lr_decay, must satisfy: 0.0 < lr_decay <= 1.0")
    self.num_times_lr_decayed = 0
    self.early_stopping_reached = False
    self.cur_attempt = 0

    # Lower-cased metric names from dev_metrics (blank entries skipped); the
    # schedule metric and "loss" are appended if not already present.
    # NOTE(review): entries are lower-cased but not stripped, so
    # "bleu, wer" yields " wer" -- confirm whether that is intended.
    self.evaluators = [
        s.lower() for s in self.args["dev_metrics"].split(",")
        if s.strip() != ""
    ]
    if self.args["schedule_metric"].lower() not in self.evaluators:
        self.evaluators.append(self.args["schedule_metric"].lower())
    if "loss" not in self.evaluators:
        self.evaluators.append("loss")

    # Data augmentation is only started when a reload command is configured.
    if args["reload_command"] is not None:
        self._augmentation_handle = None
        self._augment_data_initial()

    # Initialize the serializer
    self.model_serializer = YamlSerializer()

    self.create_corpus_and_model()

    # NOTE(review): self.model and self.training_strategy are not assigned in
    # this method; presumably create_corpus_and_model() sets them -- confirm.
    self.model.initialize_training_strategy(self.training_strategy)

    if self.args["batcher"] is None:
        self.batcher = SrcBatcher(32)
    else:
        self.batcher = self.args["batcher"]
    # For "contvec" input the pad token is a zero vector of the source
    # embedder's emb_dim.
    if args["src_format"] == "contvec":
        self.batcher.pad_token = np.zeros(self.model.src_embedder.emb_dim)
    self.pack_batches()
    # NOTE(review): self.total_train_sent is not assigned in this method;
    # presumably set by create_corpus_and_model() or pack_batches() -- confirm.
    self.logger = BatchLossTracker(args["dev_every"], self.total_train_sent)

    if args["trainer"] is None:
        self.trainer = xnmt.optimizer.SimpleSGDTrainer(
            self.model_context, 0.1)
    else:
        self.trainer = args["trainer"]