def __init__(self, pad_index: Ignore[int], teacher_config_path: str = '/DOES_NOT_EXIST'):
    teacher_config_path = relative_to_config_path(teacher_config_path)
    assert os.path.exists(teacher_config_path), "Teacher model config does not exist."
    nn.Module.__init__(self)

    # load the teacher configuration and make it the active configuration
    teacher_model_config = get_configuration().clone()
    with open(teacher_config_path) as f:
        teacher_model_config.load(json.load(f))
    push_configuration(teacher_model_config)
    update_and_ensure_model_output_path('test', None)

    # locate and load the best teacher checkpoint
    best_model_path = find_best_model()
    if best_model_path is None:
        raise ValueError('Could not find the teacher model.')
    (src_vocab, tgt_vocab), _ = get_vocabularies()
    self.teacher_model = build_model(src_vocab, tgt_vocab)
    state_dict = torch.load(best_model_path)
    self.teacher_model.load_state_dict(state_dict['model_state'])
    self.teacher_model.to(get_device())
    self.teacher_model.eval()

    self.src_pad_index = src_vocab.pad_index
    self.tgt_pad_index = tgt_vocab.pad_index

    # restore the student configuration
    pop_configuration()
    self.pad_index = pad_index
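# A frozen teacher loaded this way is typically queried for soft targets when
# training the student. Minimal knowledge-distillation sketch under that
# assumption; the helper name, temperature, and tensor shapes are illustrative
# and not part of this codebase's API.
import torch
import torch.nn.functional as F

def distillation_loss(student_logits: torch.Tensor,
                      teacher_logits: torch.Tensor,
                      temperature: float = 2.0) -> torch.Tensor:
    """Soft-target loss KL(teacher || student) on temperature-softened distributions."""
    student_log_probs = F.log_softmax(student_logits / temperature, dim=-1)
    teacher_probs = F.softmax(teacher_logits / temperature, dim=-1)
    # scale by T^2 so gradient magnitudes stay comparable to the hard-label loss
    return F.kl_div(student_log_probs, teacher_probs,
                    reduction='batchmean') * temperature ** 2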
def test_tied_src_trg_softmax(self):
    # test source embedding, target embedding, and softmax tying
    torch.manual_seed(self.seed)
    cfg = copy.deepcopy(self.cfg)
    cfg["model"]["decoder"]["type"] = "transformer"
    cfg["model"]["tied_embeddings"] = True
    cfg["model"]["tied_softmax"] = True
    cfg["model"]["decoder"]["embeddings"]["embedding_dim"] = 64
    cfg["model"]["encoder"]["embeddings"]["embedding_dim"] = 64

    src_vocab = trg_vocab = self.vocab
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    src_weight = model.src_embed.lut.weight
    trg_weight = model.trg_embed.lut.weight
    output_weight = model.decoder.output_layer.weight

    self.assertTensorEqual(src_weight, trg_weight)
    self.assertTensorEqual(src_weight, output_weight)
    self.assertEqual(src_weight.shape, trg_weight.shape)
    self.assertEqual(trg_weight.shape, output_weight.shape)

    output_weight.data.fill_(3.)
    self.assertEqual(output_weight.sum().item(), 6528)
    self.assertEqual(output_weight.sum().item(), src_weight.sum().item())
    self.assertEqual(output_weight.sum().item(), trg_weight.sum().item())
    self.assertEqual(src_weight.sum().item(), trg_weight.sum().item())
def predict(
        input: Ignore[str],
        output: Ignore[str],
        log_prefix: Ignore[str],
        model: EncoderDecoder = None,
        batch_size_limit: int = 400,
        batch_limit_by_tokens: bool = True,
):
    logger = get_logger()
    (src_vocab, _), (src_field, tgt_field) = get_vocabularies()

    dataset = Corpora([src_field])
    logger.info(f'{log_prefix}: Loading input file ...')
    with open(input) as src_stream:
        for src_sentence in src_stream:
            if src_sentence.strip():
                dataset.append([src_sentence])
    logger.info(f'{log_prefix}: Loading done.')

    if model is None:
        best_model_path = find_best_model()
        if best_model_path is None:
            raise RuntimeError('Model has not been trained yet. Train the model first.')
        model = build_model(src_field.vocabulary, tgt_field.vocabulary)
        state_dict = torch.load(best_model_path)
        model.load_state_dict(state_dict['model_state'])
        model.to(get_device())

    with open(output, 'w') as output_stream, torch.no_grad():
        for batch in dataset.iterate(get_device(),
                                     batch_size_limit,
                                     batch_limit_by_tokens,
                                     sort_by_length=False,
                                     shuffle=False):
            x_mask = batch[0] != src_vocab.pad_index
            x_mask = x_mask.unsqueeze(1)
            x_e = model.encode(batch[0], x_mask)
            y_hat, _ = beam_search(x_e, x_mask, model, get_scores=short_sent_penalty)

            sentence = src_field.to_sentence_str(batch[0][-1].tolist())
            generated = tgt_field.to_sentence_str(y_hat[-1].tolist())
            logger.info('SENTENCE:\n ---- {}'.format(sentence))
            logger.info('GENERATED:\n ---- {}'.format(generated))

            # write out the target-side hypotheses, one per input sentence
            for generated in (tgt_field.to_sentence_str(s) for s in y_hat.tolist()):
                output_stream.write(f'{generated}\n')
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the data
    train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(
        data_cfg=cfg["data"])

    # build an encoder-decoder model
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg, trainer.logger)

    log_data_info(train_data=train_data,
                  valid_data=dev_data,
                  test_data=test_data,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab,
                  logging_function=trainer.logger.info)

    trainer.logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(trainer.model_dir, trainer.best_ckpt_iteration)
    output_name = "{:08d}.hyps".format(trainer.best_ckpt_iteration)
    output_path = os.path.join(trainer.model_dir, output_name)
    test(cfg_file, ckpt=ckpt, output_path=output_path, logger=trainer.logger)
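# Minimal invocation sketch for this entry point. The YAML path below is
# illustrative; any config with the expected "data", "model" and "training"
# sections should work.
if __name__ == "__main__":
    train("configs/small.yaml")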
def main(model, test_src, test_trg, dictionary_src, dictionary_trg):

    # load model options
    config = load_json('{}.json'.format(model))

    # load source dictionary and invert
    word_dict = load_dictionary(dictionary_src)
    word_idict = invert_dictionary(word_dict)
    word_idict[0] = config['eos_symbol']
    word_idict[1] = config['unk_symbol']

    # load target dictionary and invert
    word_dict_trg = load_dictionary(dictionary_trg)
    word_idict_trg = invert_dictionary(word_dict_trg)
    word_idict_trg[0] = config['eos_symbol']
    word_idict_trg[1] = config['unk_symbol']

    # load data
    data_iter = TextIterator(test_src, test_trg, [dictionary_src], dictionary_trg,
                             n_words_source=config['n_words_src'],
                             n_words_target=config['n_words_trg'],
                             batch_size=config['valid_batch_size'],
                             maxlen=100000,
                             shuffle_each_epoch=False)

    print('Loading model')
    params = init_params(config)
    params = load_params(model + '.npz', params)
    tparams = init_theano_params(params)

    # random generator and global dropout/noise switch for this model
    trng = RandomStreams(1234)

    x, x_mask, y, y_mask, opt_ret, cost = build_model(
        tparams, trng, config, use_mask=True, use_noise=False)
    inps = [x, x_mask, y, y_mask]

    print('Building f_log_probs...', end="")
    f_log_probs = theano.function(inps, cost, profile=False)
    print('Done')

    # calculate the probabilities
    loss, perplexity = pred_probs(f_log_probs, prepare_batch, data_iter)
    mean_loss = loss.mean()

    print('Loss: %f' % mean_loss)
    print('PPX: %f' % perplexity)
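# The printed PPX is, in the usual convention, the exponentiated average
# per-token negative log-likelihood. Sketch of that relation under the
# assumption that per-sentence summed NLLs and token counts are available;
# the helper name and inputs are illustrative, not this codebase's API.
import numpy as np

def perplexity_from_losses(neg_log_probs, token_counts):
    # neg_log_probs: per-sentence summed negative log-likelihoods
    # token_counts: number of scored target tokens per sentence
    # perplexity = exp(total NLL / total tokens)
    return float(np.exp(np.sum(neg_log_probs) / np.sum(token_counts)))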
def test_tied_embeddings(self):
    torch.manual_seed(self.seed)
    cfg = copy.deepcopy(self.cfg)
    cfg["model"]["tied_embeddings"] = True
    cfg["model"]["tied_softmax"] = False

    src_vocab = trg_vocab = self.vocab
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    self.assertEqual(src_vocab.itos, trg_vocab.itos)
    self.assertEqual(model.src_embed, model.trg_embed)
    self.assertTensorEqual(model.src_embed.lut.weight, model.trg_embed.lut.weight)
    self.assertEqual(model.src_embed.lut.weight.shape,
                     model.trg_embed.lut.weight.shape)
def test_tied_softmax(self):
    torch.manual_seed(self.seed)
    cfg = copy.deepcopy(self.cfg)
    cfg["model"]["decoder"]["type"] = "transformer"
    cfg["model"]["tied_embeddings"] = False
    cfg["model"]["tied_softmax"] = True
    cfg["model"]["decoder"]["embeddings"]["embedding_dim"] = 64

    src_vocab = trg_vocab = self.vocab
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    self.assertEqual(model.trg_embed.lut.weight.shape,
                     model.decoder.output_layer.weight.shape)
    self.assertTensorEqual(model.trg_embed.lut.weight,
                           model.decoder.output_layer.weight)
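# What these tying tests verify is ordinary weight sharing: the decoder's
# output projection reuses the target embedding matrix as its parameter.
# Self-contained sketch of the mechanism; class and attribute names here are
# illustrative and not the ones used by the model under test.
import torch
import torch.nn as nn

class TinyTiedDecoderHead(nn.Module):
    """Output layer whose weight is the target embedding table (weight tying)."""

    def __init__(self, vocab_size: int, embedding_dim: int):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embedding_dim)
        self.output_layer = nn.Linear(embedding_dim, vocab_size, bias=False)
        # share the parameter tensor: updates to one are visible in the other
        self.output_layer.weight = self.embed.weight

    def forward(self, hidden: torch.Tensor) -> torch.Tensor:
        return self.output_layer(hidden)

head = TinyTiedDecoderHead(vocab_size=34, embedding_dim=64)
# both attributes point at the same underlying storage
assert head.output_layer.weight.data_ptr() == head.embed.weight.data_ptr()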
def test_transformer_layer_norm_init(self):
    torch.manual_seed(self.seed)
    cfg = copy.deepcopy(self.cfg)

    src_vocab = trg_vocab = self.vocab
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    def check_layer_norm(m: nn.Module):
        for name, child in m.named_children():
            if isinstance(child, nn.LayerNorm):
                self.assertTensorEqual(child.weight,
                                       torch.ones([self.hidden_size]))
                self.assertTensorEqual(child.bias,
                                       torch.zeros([self.hidden_size]))
            else:
                check_layer_norm(child)

    check_layer_norm(model)
def train(max_steps: int = 100,
          batch_size_limit: int = 400,
          batch_limit_by_tokens: bool = True,
          report_interval_steps: int = 10,
          validation_interval_steps: int = 100,
          lr_scheduler_at: str = 'every_step',
          n_ckpts_to_keep: int = 3,
          teacher_forcing: bool = True,
          random_seed: int = 42):
    set_random_seeds(random_seed)
    logger = get_logger()

    train_dataset = get_train_dataset()
    assert len(train_dataset.fields) >= 2, \
        "Train dataset must have at least two fields (source and target)."
    validation_dataset = get_validation_dataset()
    assert len(validation_dataset.fields) >= 2, \
        "Validation dataset must have at least two fields (source and target)."

    loss_function = get_loss_function(train_dataset.fields[1].vocabulary.pad_index)
    model = build_model(train_dataset.fields[0].vocabulary,
                        train_dataset.fields[1].vocabulary)
    model.to(get_device())
    loss_function.to(get_device())

    optimizer = build_optimizer(model.parameters())
    scheduler = build_scheduler(optimizer)
    initialize(model)

    def noop():
        return None

    def step_lr_scheduler():
        return scheduler.step()

    # decide where in the loop the LR scheduler is stepped
    run_scheduler_at_step = noop
    run_scheduler_at_validation = noop
    run_scheduler_at_epoch = noop
    if scheduler is not None:
        if lr_scheduler_at == 'every_step':
            run_scheduler_at_step = step_lr_scheduler
        elif lr_scheduler_at == 'every_validation':
            run_scheduler_at_validation = step_lr_scheduler
        elif lr_scheduler_at == 'every_epoch':
            run_scheduler_at_epoch = step_lr_scheduler

    step = 0
    epoch = 0
    kept_checkpoint_path_score_map = {}
    best_checkpoint_specs = {"score": -math.inf, "step": -1}

    @configured('model')
    def maybe_save_checkpoint(score: Ignore[float], output_path: str):
        if len(kept_checkpoint_path_score_map) < n_ckpts_to_keep or \
                any(score > s for s in kept_checkpoint_path_score_map.values()):
            if len(kept_checkpoint_path_score_map) >= n_ckpts_to_keep:
                # drop the worst of the currently kept checkpoints
                worst_checkpoint_path = sorted(
                    kept_checkpoint_path_score_map.keys(),
                    key=lambda p: kept_checkpoint_path_score_map[p],
                    reverse=False)
                worst_checkpoint_path = worst_checkpoint_path[0]
                kept_checkpoint_path_score_map.pop(worst_checkpoint_path)
                try:
                    os.unlink(worst_checkpoint_path)
                except OSError:
                    logger.warning('Could not unlink {}.'.format(worst_checkpoint_path))
            if score > best_checkpoint_specs["score"]:
                logger.info('New `best model` found with score {:.3f} at step {}.'.format(
                    score, step))
                best_checkpoint_specs["score"] = score
                best_checkpoint_specs["step"] = step
            state_dict = {
                "step": step,
                "best_checkpoint_specs": best_checkpoint_specs,
                "model_state": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
                "scheduler_state": scheduler.state_dict() if scheduler is not None else None
            }
            checkpoint_path = '{}/step_{}_score_{:.3f}.pt'.format(output_path, step, score)
            torch.save(state_dict, checkpoint_path)
            kept_checkpoint_path_score_map[checkpoint_path] = score

    model.train()
    validation_done_already = False
    while step < max_steps:
        start_time = time.time()
        total_tokens_processed = 0
        for batch in train_dataset.iterate(get_device(), batch_size_limit,
                                           batch_limit_by_tokens):
            step += 1
            if step >= max_steps:
                break

            x_mask = batch[0] != model.src_vocab.pad_index
            x_mask = x_mask.unsqueeze(1)
            y_mask = batch[1] != model.tgt_vocab.pad_index
            y_mask = y_mask.unsqueeze(1)

            x_e = model.encode(batch[0], x_mask)
            log_probs = model.decode(batch[1][:, :-1], x_e, y_mask[:, :, :-1], x_mask,
                                     teacher_forcing=teacher_forcing)

            token_count = y_mask[:, :, 1:].sum().item()
            loss = loss_function(log_probs, batch[1][:, 1:],
                                 model.get_target_embeddings()) / token_count
            loss.backward()
            optimizer.step()
            mark_optimization_step()
            optimizer.zero_grad()
            run_scheduler_at_step()
            total_tokens_processed += token_count

            if step > 0 and step % report_interval_steps == 0:
                elapsed_time = time.time() - start_time
                baseline_loss = loss_function.uniform_baseline_loss(
                    log_probs, batch[1][:, 1:])
                logger.info(
                    'Epoch_{} Step_{}: loss={:.3f}(vs {:.3f} uniform), tokens/s={:.1f}, lr={}'
                    .format(epoch, step, loss.item(), baseline_loss,
                            total_tokens_processed / elapsed_time,
                            optimizer.param_groups[0]['lr']))
                start_time = time.time()
                total_tokens_processed = 0

            if step > 0 and step % validation_interval_steps == 0:
                log_prefix = 'Epoch_{} Step_{}'.format(epoch, step)
                score = evaluate(validation_dataset, log_prefix, model, loss_function)
                maybe_save_checkpoint(score)
                model.train()
                run_scheduler_at_validation()
                start_time = time.time()
                total_tokens_processed = 0
                validation_done_already = True
            else:
                validation_done_already = False

        epoch += 1
        logger.info('Epoch {} finished.'.format(epoch))
        run_scheduler_at_epoch()

    if not validation_done_already:
        log_prefix = 'Final (epoch={} ~ step={})'.format(epoch, step)
        score = evaluate(validation_dataset, log_prefix, model, loss_function)
        maybe_save_checkpoint(score)

    logger.info('Best validation loss was {:.3f} at step {}.'.format(
        best_checkpoint_specs["score"], best_checkpoint_specs["step"]))
def evaluate(validation_dataset: Corpora,
             log_prefix: Ignore[str],
             model: EncoderDecoder = None,
             loss_function: Callable = None,
             batch_size_limit: int = 400,
             batch_limit_by_tokens: bool = True,
             teacher_forcing: bool = True,
             metrics: Tuple[Metric] = None):
    assert len(validation_dataset.fields) >= 2, \
        "Validation dataset must have at least two fields (source and target)."
    logger = get_logger()

    if loss_function is None:
        loss_function = get_loss_function(
            validation_dataset.fields[1].vocabulary.pad_index)
        loss_function.to(get_device())

    if model is None:
        best_model_path = find_best_model()
        if best_model_path is None:
            raise RuntimeError('Model has not been trained yet. Train the model first.')
        model = build_model(validation_dataset.fields[0].vocabulary,
                            validation_dataset.fields[1].vocabulary)
        state_dict = torch.load(best_model_path)
        model.load_state_dict(state_dict['model_state'])
        model.to(get_device())

    pad_index = model.tgt_vocab.pad_index
    total_item_count = 0
    total_validation_loss = 0
    model.eval()
    printed_samples = 0

    if metrics is None:
        metrics = (BleuMetric(), )
    else:
        metrics = (BleuMetric(), ) + tuple(
            m for m in metrics if not isinstance(m, BleuMetric))

    with torch.no_grad():
        start_time = time.time()
        for validation_batch in validation_dataset.iterate(
                get_device(), batch_size_limit, batch_limit_by_tokens,
                sort_by_length=False, shuffle=False):
            x_mask = validation_batch[0] != model.src_vocab.pad_index
            x_mask = x_mask.unsqueeze(1)
            y_mask = validation_batch[1] != model.tgt_vocab.pad_index
            y_mask = y_mask.unsqueeze(1)

            x_e = model.encode(validation_batch[0], x_mask)
            log_probs = model.decode(validation_batch[1][:, :-1], x_e,
                                     y_mask[:, :, :-1], x_mask,
                                     teacher_forcing=teacher_forcing)
            loss = loss_function(log_probs, validation_batch[1][:, 1:],
                                 model.get_target_embeddings())
            total_item_count += y_mask[:, :, 1:].sum().item()
            total_validation_loss += loss.item()

            y_hat, _ = beam_search(x_e, x_mask, model, get_scores=short_sent_penalty)

            if printed_samples < 4:
                sentence = validation_dataset.fields[0].to_sentence_str(
                    validation_batch[0][-1].tolist())
                reference = validation_dataset.fields[1].to_sentence_str(
                    validation_batch[1][-1].tolist())
                generated = validation_dataset.fields[1].to_sentence_str(
                    y_hat[-1].tolist())
                logger.info('SENTENCE:\n ---- {}'.format(sentence))
                logger.info('REFERENCE:\n ---- {}'.format(reference))
                logger.info('GENERATED:\n ---- {}'.format(generated))
                printed_samples += 1

            update_metric_params(y_hat, validation_batch[1], pad_index, metrics)

        elapsed_time = time.time() - start_time
        logger.info(
            f'{log_prefix}: '
            f'evaluation_loss={total_validation_loss / total_item_count:.3f}, '
            f'elapsed_time={int(elapsed_time + 0.5)}s')
        for metric_repr in (str(m) for m in metrics):
            logger.info(f'{log_prefix}: evaluation {metric_repr}')

    return metrics[0].get_score()
def build_models(self, **kwargs):
    logger.info('Building model')
    self.params, self.encoder_param_names = init_params(self.config)

    # reload parameters
    if self.config['reload'] and os.path.exists(self.model_path):
        logger.info('Reloading model parameters')
        self.params = load_params(self.model_path, self.params)

    self.tparams = init_theano_params(self.params)

    if self.shared_params is not None:
        # multi-task support
        # we replace whatever parameters we already have at this point with
        # the ones that we received as optional input
        # this needs to be done BEFORE building the model
        self.params, self.tparams = self.apply_shared_theano_params(
            self.shared_params, self.params, self.tparams)

    # random generator and global dropout/noise switch for this model
    self.trng = RandomStreams(1234)

    inps, opt_ret, cost = build_model(self.tparams, self.trng, self.config)
    # unpack the symbolic inputs; the masks are used by the alpha regularizer below
    x, x_mask, y, y_mask = inps
    cost = cost.mean()

    logger.info('Building tools')
    self.f_init, self.f_next = build_sampler(self.tparams, self.config, self.trng)

    # apply L2 regularization on weights
    if self.decay_c > 0.:
        decay_c = theano.shared(np.float32(self.decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in iteritems(self.tparams):
            weight_decay += (vv**2).sum()
        weight_decay *= self.decay_c
        cost += weight_decay

    # regularize the alpha weights (decoder type is read from the config)
    if self.alpha_c > 0. and not self.config['decoder'].endswith('simple'):
        alpha_c = theano.shared(np.float32(self.alpha_c), name='alpha_c')
        alpha_reg = alpha_c * ((
            tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
            opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
        cost += alpha_reg

    # after all regularizers - compile the computational graph for cost
    logger.info('Building f_cost...')
    f_cost = theano.function(inps, cost, profile=profile)
    logger.info('Done')

    logger.info('Computing gradient...')
    grads = tensor.grad(cost, wrt=list(itervalues(self.tparams)))
    grads = clip_grad_norm(grads, self.clip_c)
    logger.info('Done')

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    logger.info('Building optimizers...')
    self.f_grad_shared, self.f_update = eval(self.optimizer)(lr, self.tparams, grads,
                                                             inps, cost, opt_ret)
    logger.info('Done')

    # log probability function (for validation, so use model without noise!)
    logger.info('Building f_log_probs...')
    self.test_inp, _, self.test_cost = build_model(self.tparams, self.trng,
                                                   self.config,
                                                   use_mask=True, use_noise=False)
    self.f_log_probs = theano.function(self.test_inp, self.test_cost,
                                       profile=profile)
    logger.info('Done')
def translate(cfg_file, ckpt: str, output_path: str = None) -> None:
    """
    Interactive translation function.
    Loads model from checkpoint and translates either the stdin input or
    asks for input to translate interactively.
    The input has to be pre-processed according to the data that the model
    was trained on, i.e. tokenized or split into subwords.
    Translations are printed to stdout.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output file; if not given, print to stdout
    """

    def _load_line_as_data(line):
        """ Create a dataset from one line via a temporary file. """
        # write src input to temporary file
        tmp_name = "tmp"
        tmp_suffix = ".src"
        tmp_filename = tmp_name + tmp_suffix
        with open(tmp_filename, "w") as tmp_file:
            tmp_file.write("{}\n".format(line))

        test_data = MonoDataset(path=tmp_name, ext=tmp_suffix, field=src_field)

        # remove temporary file
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)

        return test_data

    def _translate_data(test_data):
        """ Translates given dataset, using parameters from outer scope. """
        # pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
            hypotheses_raw, attention_scores = validate_on_data(
                model, data=test_data, batch_size=batch_size, level=level,
                max_output_length=max_output_length, eval_metric="",
                use_cuda=use_cuda, loss_function=None, beam_size=beam_size,
                beam_alpha=beam_alpha)
        return hypotheses

    cfg = load_config(cfg_file)

    # when checkpoint is not specified, take latest from model dir
    if ckpt is None:
        model_dir = cfg["training"]["model_dir"]
        ckpt = get_latest_checkpoint(model_dir)

    batch_size = cfg["training"].get("batch_size", 1)
    use_cuda = cfg["training"].get("use_cuda", False)
    level = cfg["data"]["level"]
    max_output_length = cfg["training"].get("max_output_length", None)

    # read vocabs
    src_vocab_file = cfg["data"].get(
        "src_vocab", cfg["training"]["model_dir"] + "/src_vocab.txt")
    trg_vocab_file = cfg["data"].get(
        "trg_vocab", cfg["training"]["model_dir"] + "/trg_vocab.txt")
    src_vocab = Vocabulary(file=src_vocab_file)
    trg_vocab = Vocabulary(file=trg_vocab_file)

    data_cfg = cfg["data"]
    level = data_cfg["level"]
    lowercase = data_cfg["lowercase"]

    tok_fun = lambda s: list(s) if level == "char" else s.split()

    src_field = Field(init_token=None, eos_token=EOS_TOKEN,
                      pad_token=PAD_TOKEN, tokenize=tok_fun,
                      batch_first=True, lower=lowercase,
                      unk_token=UNK_TOKEN, include_lengths=True)
    src_field.vocab = src_vocab

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.cuda()

    # whether to use beam search for decoding, 0: greedy decoding
    if "testing" in cfg.keys():
        beam_size = cfg["testing"].get("beam_size", 0)
        beam_alpha = cfg["testing"].get("alpha", -1)
    else:
        beam_size = 0
        beam_alpha = -1

    if not sys.stdin.isatty():
        # input file given
        test_data = MonoDataset(path=sys.stdin, ext="", field=src_field)
        hypotheses = _translate_data(test_data)

        if output_path is not None:
            output_path_set = "{}".format(output_path)
            with open(output_path_set, mode="w", encoding="utf-8") as out_file:
                for hyp in hypotheses:
                    out_file.write(hyp + "\n")
            print("Translations saved to: {}".format(output_path_set))
        else:
            for hyp in hypotheses:
                print(hyp)
    else:
        # enter interactive mode
        batch_size = 1
        while True:
            try:
                src_input = input("\nPlease enter a source sentence "
                                  "(pre-processed): \n")
                if not src_input.strip():
                    break

                # every line has to be made into a dataset
                test_data = _load_line_as_data(line=src_input)

                hypotheses = _translate_data(test_data)
                print("JoeyNMT: {}".format(hypotheses[0]))

            except (KeyboardInterrupt, EOFError):
                print("\nBye.")
                break
def test(cfg_file,
         ckpt: str,
         output_path: str = None,
         save_attention: bool = False,
         logger: logging.Logger = None) -> None:
    """
    Main test function. Handles loading a model from checkpoint, generating
    translations and storing them and attention plots.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output
    :param save_attention: whether to save the computed attention weights
    :param logger: log output to this logger (creates new logger if not set)
    """
    if logger is None:
        logger = logging.getLogger(__name__)
        FORMAT = '%(asctime)-15s - %(message)s'
        logging.basicConfig(format=FORMAT)
        logger.setLevel(level=logging.DEBUG)

    cfg = load_config(cfg_file)

    if "test" not in cfg["data"].keys():
        raise ValueError("Test data must be specified in config.")

    # when checkpoint is not specified, take latest (best) from model dir
    if ckpt is None:
        model_dir = cfg["training"]["model_dir"]
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError(
                "No checkpoint found in directory {}.".format(model_dir))
        try:
            step = ckpt.split(model_dir + "/")[1].split(".ckpt")[0]
        except IndexError:
            step = "best"

    batch_size = cfg["training"]["batch_size"]
    batch_type = cfg["training"].get("batch_type", "sentence")
    use_cuda = cfg["training"].get("use_cuda", False)
    level = cfg["data"]["level"]
    eval_metric = cfg["training"]["eval_metric"]
    max_output_length = cfg["training"].get("max_output_length", None)

    # load the data
    _, dev_data, test_data, src_vocab, trg_vocab = load_data(
        data_cfg=cfg["data"])
    data_to_predict = {"dev": dev_data, "test": test_data}

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.cuda()

    # whether to use beam search for decoding, 0: greedy decoding
    if "testing" in cfg.keys():
        beam_size = cfg["testing"].get("beam_size", 0)
        beam_alpha = cfg["testing"].get("alpha", -1)
    else:
        beam_size = 0
        beam_alpha = -1

    for data_set_name, data_set in data_to_predict.items():

        # pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
            hypotheses_raw, attention_scores = validate_on_data(
                model, data=data_set, batch_size=batch_size,
                batch_type=batch_type, level=level,
                max_output_length=max_output_length, eval_metric=eval_metric,
                use_cuda=use_cuda, loss_function=None, beam_size=beam_size,
                beam_alpha=beam_alpha)
        # pylint: enable=unused-variable

        if "trg" in data_set.fields:
            decoding_description = "Greedy decoding" if beam_size == 0 else \
                "Beam search decoding with beam size = {} and alpha = {}".format(
                    beam_size, beam_alpha)
            logger.info("%4s %s: %6.2f [%s]",
                        data_set_name, eval_metric, score, decoding_description)
        else:
            logger.info("No references given for %s -> no evaluation.",
                        data_set_name)

        if save_attention:
            if attention_scores:
                attention_name = "{}.{}.att".format(data_set_name, step)
                attention_path = os.path.join(model_dir, attention_name)
                logger.info("Saving attention plots. This might take a while..")
                store_attention_plots(attentions=attention_scores,
                                      targets=hypotheses_raw,
                                      sources=[s for s in data_set.src],
                                      indices=range(len(hypotheses)),
                                      output_prefix=attention_path)
                logger.info("Attention plots saved to: %s", attention_path)
            else:
                logger.warning("Attention scores could not be saved. "
                               "Note that attention scores are not available "
                               "when using beam search. "
                               "Set beam_size to 0 for greedy decoding.")

        if output_path is not None:
            output_path_set = "{}.{}".format(output_path, data_set_name)
            with open(output_path_set, mode="w", encoding="utf-8") as out_file:
                for hyp in hypotheses:
                    out_file.write(hyp + "\n")
            logger.info("Translations saved to: %s", output_path_set)
def train(self,
          model_name=None,
          output_dir=None,
          src_train=None,
          trg_train=None,
          src_valid=None,
          trg_valid=None,
          src_dicts=None,
          trg_dicts=None,
          factors=1,
          factors_trg=1,
          n_words_src=50000,
          n_words_trg=50000,
          dim_emb=100,
          dim_per_factor=(100, ),
          dim=100,
          dim_att=200,
          encoder='gru',
          encoder_layers=1,
          decoder='gru_cond',
          optimizer='adadelta',
          learning_rate=1e-3,
          decay_c=0.,
          clip_c=1.,
          alpha_c=0.,
          dropout=False,
          dropout_src=0.,
          dropout_trg=0.,
          dropout_emb=0.,
          dropout_rec=0.,
          dropout_hid=0.,
          batch_size=80,
          valid_batch_size=80,
          k=5,
          maxlen=50,
          max_epochs=20,
          bleu_script='nmt/multi-bleu.perl',
          bleu_val_burnin=0,
          val_set_out='validation.txt',
          validation_frequency=-1,
          display_frequency=100,
          save_frequency=-1,
          sample_frequency=200,
          beam_size=12,
          track_n_models=3,
          finish_after=-1,
          unk_symbol='<UNK>',
          eos_symbol='</s>',
          patience=10,
          early_stopping='cost',
          reload=False,
          verbose=1,
          disp_alignments=True,
          mtl=False,
          mtl_ratio=(),
          mtl_configs=(),
          mtl_decoder=False,
          n_shared_layers=1,
          **kwargs):
    """
    Train an NMT system.

    :return:
    """
    # log options
    config = self.config
    logger.info(pformat(self.config))

    # model options / output paths
    model_path = os.path.join(output_dir, model_name + '.npz')
    config_path = os.path.join(output_dir, model_name + '.json')

    # create output dir if it does not exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # load dictionaries and invert them
    worddicts_src = [load_dictionary(d) for d in src_dicts]
    worddicts_trg = [load_dictionary(d) for d in trg_dicts]
    worddicts_src_r = [invert_dictionary(d) for d in worddicts_src]
    worddicts_trg_r = [invert_dictionary(d) for d in worddicts_trg]

    # reload options
    if reload:
        if os.path.exists(config_path):
            logger.info('Reloading model options: %s' % config_path)
            config = load_json(config_path)
        else:
            logger.info('Did NOT reload model options (file did not exist)')

    logger.info('Loading data')
    train = TextIterator(src_train, trg_train, src_dicts, trg_dicts,
                         batch_size=batch_size, maxlen=maxlen,
                         n_words_source=n_words_src,
                         n_words_target=n_words_trg,
                         shuffle_each_epoch=True, sort_by_length=True,
                         maxibatch_size=20, factors=factors,
                         factors_trg=factors_trg)
    valid = TextIterator(src_valid, trg_valid, src_dicts, trg_dicts,
                         batch_size=batch_size, maxlen=maxlen,
                         n_words_source=n_words_src,
                         n_words_target=n_words_trg,
                         shuffle_each_epoch=False, sort_by_length=True,
                         maxibatch_size=20, factors=factors,
                         factors_trg=factors_trg)

    logger.info('Building model')
    params, encoder_param_names = init_params(config)

    # reload parameters
    if reload and os.path.exists(model_path):
        logger.info('Reloading model parameters')
        params = load_params(model_path, params)

    tparams = init_theano_params(params)

    if self.shared_params is not None:
        # multi-task support
        # we replace whatever parameters we already have at this point with
        # the ones that we received as optional input
        # this needs to be done BEFORE building the model
        params, tparams = self.apply_shared_theano_params(
            self.shared_params, params, tparams)

    # random generator and global dropout/noise switch for this model
    trng = RandomStreams(1234)

    inps, opt_ret, cost = build_model(tparams, trng, config)
    # x, x_mask, y, y_mask = inps
    cost = cost.mean()

    logger.info('Building tools')
    f_init, f_next = build_sampler(tparams, config, trng)

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(np.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in iteritems(tparams):
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not decoder.endswith('simple'):
        alpha_c = theano.shared(np.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * ((
            tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
            opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
        cost += alpha_reg

    # after all regularizers - compile the computational graph for cost
    logger.info('Building f_cost...')
    f_cost = theano.function(inps, cost, profile=profile)
    logger.info('Done')

    logger.info('Computing gradient...')
    grads = tensor.grad(cost, wrt=list(itervalues(tparams)))
    grads = clip_grad_norm(grads, clip_c)
    logger.info('Done')

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    logger.info('Building optimizers...')
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost,
                                              opt_ret)
    logger.info('Done')

    # log probability function (for validation, so use model without noise!)
    logger.info('Building f_log_probs...')
    test_inp, _, test_cost = build_model(tparams, trng, config,
                                         use_mask=True, use_noise=False)
    f_log_probs = theano.function(test_inp, test_cost, profile=profile)
    logger.info('Done')

    # bleu validation
    bleu_validator = SimpleBleuValidator(
        tparams, config, trng, f_init, f_next, k=beam_size,
        src_dicts=worddicts_src, trg_idict=worddicts_trg_r[0],
        normalize=True, main_loop=self) if bleu_script else None

    # multi-task learning
    mtl_tasks = []
    shared_params = OrderedDict()
    for k in encoder_param_names:
        shared_params[k] = tparams[k]

    if mtl:
        logger.info('Preparing MTL tasks')
        task_config = yaml.load(open(mtl_configs[0], mode='rb'))
        if task_config['model'] == 'imaginet':
            task_config['exp_id'] = self.config['exp_id']
            mtl_tasks.append(ImaginetTrainer(task_config, shared_params))
        elif task_config['model'] == 'nmt':
            mtl_tasks.append(Trainer(task_config, shared_params))
        assert sum(mtl_ratio) == 1., 'MTL ratio must sum to 1'

    # to check how many times a task was executed
    task_stats = np.zeros(len(mtl_tasks) + 1)

    # start of optimization main loop
    logger.info('Optimization started...')
    early_stop = False
    saved_model_paths = []  # history of saved models
    best_params = None
    bad_counter = 0

    # reload history
    if reload and os.path.exists(model_path):
        self.history_errs = list(np.load(model_path)['history_errs'])
        self.history_bleu = list(np.load(model_path)['history_bleu'])
        self.update_idx = np.load(model_path)['update_idx']

    # set frequencies - if -1 is specified then freq set to #iters in epoch
    validation_frequency = len(train) // batch_size \
        if validation_frequency == -1 else validation_frequency
    save_frequency = len(train) // batch_size \
        if save_frequency == -1 else save_frequency
    sample_frequency = len(train) // batch_size \
        if sample_frequency == -1 else sample_frequency

    # save initial model so we can re-use parameters (seed)
    logger.info('Saving initial model')
    params = unzip(tparams)
    with open(model_path + '.init', mode='wb') as f:
        np.savez(f, history_errs=self.history_errs,
                 history_bleu=self.history_bleu,
                 update_idx=self.update_idx, **params)
    dump_json(config, config_path)
    logger.info('Done saving model')

    for epoch_idx in range(max_epochs):
        self.epoch_idx = epoch_idx  # self.update_idx // (len(train) // batch_size)
        n_samples = 0

        # iterate over data batches
        for x_, y_ in train:

            # multi-task learning -- we simply do other tasks until we are
            # allowed to perform the main task (this loop)
            if mtl:
                n_tasks = len(mtl_ratio)
                task = 1
                while task > 0:
                    task = np.random.choice(n_tasks, 1, replace=False,
                                            p=mtl_ratio)[0]
                    task_stats[task] += 1
                    if task > 0:
                        mtl_tasks[task - 1].train_next_batch()
                        # print('Training on task {:d}'.format(task))

            # NMT training
            n_samples += len(x_)
            self.update_idx += 1

            x, x_mask, y, y_mask = prepare_batch(x_, y_, maxlen=None)
            y = y[0]  # only use first target factor for NMT
            inputs = [x, x_mask, y, y_mask]

            if x is None:
                logger.warning('Empty mini-batch! maxlen={}'.format(maxlen))
                self.update_idx -= 1
                continue

            # get error on this batch
            update_start_time = time.time()
            ret_vals = f_grad_shared(*inputs)
            cost = ret_vals[0]

            # do the update on parameters
            f_update(learning_rate)

            update_time = time.time() - update_start_time

            # check for bad numbers
            if np.isnan(cost) or np.isinf(cost):
                logger.warning('NaN detected')
                return 1., 1., 1.

            # verbose
            if np.mod(self.update_idx, display_frequency) == 0:
                if disp_alignments:
                    # display info with max alpha value
                    logger.info(
                        'Epoch %4d Update %8d Cost %4.8f UD %0.12f Max-alpha %0.4f' %
                        (self.epoch_idx, self.update_idx, cost, update_time,
                         ret_vals[1].max()))
                else:
                    # display general info
                    logger.info(
                        'Epoch %4d Update %8d Cost %4.8f UD %0.12f' %
                        (self.epoch_idx, self.update_idx, cost, update_time))

            # generate some samples
            if np.mod(self.update_idx, sample_frequency) == 0:
                print_samples(x, y, trng, f_init, f_next, maxlen, factors,
                              worddicts_src_r, worddicts_trg_r, unk_symbol)

            # validation
            if np.mod(self.update_idx, validation_frequency) == 0:

                # intrinsic validation
                valid_errs, perplexity = pred_probs(f_log_probs, prepare_batch,
                                                    valid)
                valid_err = valid_errs.mean()

                if np.isnan(valid_err):
                    logger.warning('valid_err NaN detected')
                    early_stop = True
                    break

                # output validation info
                logger.info('Validation error: {:1.12f} PPX: {:f}'.format(
                    valid_err, perplexity))

                # BLEU validation
                if bleu_validator and self.update_idx >= bleu_val_burnin:
                    bleu_score = bleu_validator.evaluate_model()
                    logger.info('BLEU = {}'.format(bleu_score))

                # save the best 3 models according to early-stopping
                if track_n_models > 0 and len(self.history_errs) > 0:

                    if early_stopping == 'cost':
                        if valid_err <= min(self.history_errs):
                            logger.info('Saving model at epoch {} / iter {}...'.format(
                                self.epoch_idx, self.update_idx))
                            path = os.path.join(
                                output_dir, '{}.ep{}.iter{}.npz'.format(
                                    model_name, self.epoch_idx, self.update_idx))
                            with open(path, mode='wb') as f:
                                np.savez(f, history_errs=self.history_errs,
                                         history_bleu=self.history_bleu,
                                         update_idx=self.update_idx,
                                         **unzip(tparams))
                            saved_model_paths.append(path)
                            logger.info('Done saving model')

                    # Save a model only if we've exceeded the point where
                    # we start measuring BLEU scores
                    elif early_stopping == 'bleu' and self.update_idx >= bleu_val_burnin:
                        if len(self.history_bleu) > 0 and bleu_score >= max(
                                self.history_bleu):
                            bestbleuhandle = open('%s/bestBLEU' % output_dir, 'w')
                            bestbleuhandle.write("%f" % bleu_score)
                            bestbleuhandle.close()
                            logger.info('Saving model at epoch {} / iter {}...'.format(
                                self.epoch_idx, self.update_idx))
                            path = os.path.join(
                                output_dir, '{}.ep{}.iter{}.bleu{}.npz'.format(
                                    model_name, self.epoch_idx, self.update_idx,
                                    bleu_score))
                            with open(path, mode='wb') as f:
                                np.savez(f, history_errs=self.history_errs,
                                         history_bleu=self.history_bleu,
                                         update_idx=self.update_idx,
                                         **unzip(tparams))
                            saved_model_paths.append(path)
                            logger.info('Done saving model')

                    # Remove un-needed saved models if necessary
                    if len(saved_model_paths) > track_n_models:
                        path = saved_model_paths[0]
                        logger.info('Deleting old model {}'.format(path))
                        with ignored(OSError):
                            os.remove(path)
                        saved_model_paths.pop(0)

                # remember the validation result
                self.history_errs.append(valid_err)
                if early_stopping == 'bleu' and self.update_idx >= bleu_val_burnin:
                    # remember the BLEU score at this point
                    self.history_bleu.append(bleu_score)

                # reset bad counter (patience) if best validation so far
                if early_stopping == 'cost':
                    if self.update_idx == 0 or valid_err <= \
                            np.array(self.history_errs).min():
                        best_params = unzip(tparams)
                        if mtl:
                            # force the other tasks to save too
                            mtl_tasks[0].save(string=".cost{}".format(valid_err))
                        if bad_counter > 0:
                            bad_counter -= 1
                elif early_stopping == 'bleu':
                    if self.update_idx >= bleu_val_burnin:
                        if bleu_score >= max(self.history_bleu):
                            best_params = unzip(tparams)
                            if mtl:
                                # force the other tasks to save too
                                mtl_tasks[0].save(string=".bleu{}".format(bleu_score))
                            if bad_counter > 0:
                                bad_counter -= 1

                # save the best model so far (according to the chosen criterion)
                logger.info('Saving best model (according to {})'.format(
                    early_stopping))
                if best_params is not None:
                    params = best_params
                else:
                    params = unzip(tparams)
                np.savez(model_path, history_errs=self.history_errs,
                         history_bleu=self.history_bleu,
                         update_idx=self.update_idx, **params)
                logger.info('Done saving best model')

                # check for early stop
                if early_stopping == 'cost':
                    if len(self.history_errs) > patience and valid_err >= \
                            np.array(self.history_errs)[:-patience].min():
                        bad_counter += 1
                        logger.warning('Bad validation result. {}/{}'.format(
                            bad_counter, patience))
                        if bad_counter >= patience:
                            logger.info('Early stop activated.')
                            early_stop = True
                elif early_stopping == 'bleu':
                    if len(self.history_bleu) > patience and bleu_score <= \
                            np.array(self.history_bleu)[:-patience].max():
                        bad_counter += 1
                        logger.warning('Bad validation result. {}/{}'.format(
                            bad_counter, patience))
                        if bad_counter >= patience:
                            logger.info('Early stop activated.')
                            early_stop = True

            # finish after this many updates
            if self.update_idx == finish_after:
                logger.info('Finishing after {:d} iterations!'.format(
                    self.update_idx))
                early_stop = True

            if early_stop:
                logger.info('Early Stop!')
                return 0

        if mtl:
            logger.info(task_stats / task_stats.sum())

        logger.info('Seen {:d} samples'.format(n_samples))

    logger.info('Finished with main loop')
    return 0