Example #1
    def __init__(self,
                 pad_index: Ignore[int],
                 teacher_config_path: str = '/DOES_NOT_EXIST'):
        teacher_config_path = relative_to_config_path(teacher_config_path)
        assert os.path.exists(
            teacher_config_path), "Teacher model config does not exist."
        nn.Module.__init__(self)
        teacher_model_config = get_configuration().clone()
        with open(teacher_config_path) as f:
            teacher_model_config.load(json.load(f))

        push_configuration(teacher_model_config)
        update_and_ensure_model_output_path('test', None)

        best_model_path = find_best_model()
        if best_model_path is None:
            raise ValueError('Could not find the teacher model.')
        (src_vocab, tgt_vocab), _ = get_vocabularies()
        self.teacher_model = build_model(src_vocab, tgt_vocab)
        state_dict = torch.load(best_model_path)
        self.teacher_model.load_state_dict(state_dict['model_state'])
        self.teacher_model.to(get_device())
        self.teacher_model.eval()
        self.src_pad_index = src_vocab.pad_index
        self.tgt_pad_index = tgt_vocab.pad_index

        pop_configuration()

        self.pad_index = pad_index
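
Only the constructor of this teacher wrapper is shown above; the example does not show how the frozen teacher is queried. Purely as an illustrative sketch (the function name and the KL formulation are assumptions, not taken from the source), a soft-target distillation term against such a teacher could be computed like this:

# Illustrative sketch only, not part of the example above: compare student
# predictions against the frozen teacher's distribution.
import torch
import torch.nn.functional as F

def soft_target_loss(student_log_probs: torch.Tensor,
                     teacher_log_probs: torch.Tensor) -> torch.Tensor:
    # KL(teacher || student); both tensors hold log-probabilities over the
    # target vocabulary, e.g. shape (batch, length, vocab).
    return F.kl_div(student_log_probs, teacher_log_probs,
                    reduction='batchmean', log_target=True)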
Example #2
    def test_tied_src_trg_softmax(self):

        # test source embedding, target embedding, and softmax tying
        torch.manual_seed(self.seed)
        cfg = copy.deepcopy(self.cfg)

        cfg["model"]["decoder"]["type"] = "transformer"
        cfg["model"]["tied_embeddings"] = True
        cfg["model"]["tied_softmax"] = True
        cfg["model"]["decoder"]["embeddings"]["embedding_dim"] = 64
        cfg["model"]["encoder"]["embeddings"]["embedding_dim"] = 64

        src_vocab = trg_vocab = self.vocab
        model = build_model(cfg["model"],
                            src_vocab=src_vocab,
                            trg_vocab=trg_vocab)

        src_weight = model.src_embed.lut.weight
        trg_weight = model.trg_embed.lut.weight
        output_weight = model.decoder.output_layer.weight

        self.assertTensorEqual(src_weight, trg_weight)
        self.assertTensorEqual(src_weight, output_weight)
        self.assertEqual(src_weight.shape, trg_weight.shape)
        self.assertEqual(trg_weight.shape, output_weight.shape)

        output_weight.data.fill_(3.)
        # 6528 = 3.0 * 34 * 64, i.e. fill value * vocabulary size * embedding_dim
        self.assertEqual(output_weight.sum().item(), 6528)
        self.assertEqual(output_weight.sum().item(), src_weight.sum().item())
        self.assertEqual(output_weight.sum().item(), trg_weight.sum().item())
        self.assertEqual(src_weight.sum().item(), trg_weight.sum().item())
Example #3
def predict(
    input: Ignore[str],
    output: Ignore[str],
    log_prefix: Ignore[str],
    model: EncoderDecoder = None,
    batch_size_limit: int = 400,
    batch_limit_by_tokens: bool = True,
):

    logger = get_logger()

    (src_vocab, _), (src_field, tgt_field) = get_vocabularies()

    dataset = Corpora([src_field])
    logger.info(f'{log_prefix}: Loading input file ...')
    with open(input) as src_stream:
        for src_sentence in src_stream:
            if src_sentence.strip():
                dataset.append([src_sentence])
    logger.info(f'{log_prefix}: Loading done.')

    if model is None:
        best_model_path = find_best_model()
        if best_model_path is None:
            raise RuntimeError(
                'Model has not been trained yet. Train the model first.')
        model = build_model(src_field.vocabulary, tgt_field.vocabulary)
        state_dict = torch.load(best_model_path)
        model.load_state_dict(state_dict['model_state'])
        model.to(get_device())
        model.eval()

    with open(output, 'w') as output_stream, torch.no_grad():

        for batch in dataset.iterate(get_device(),
                                     batch_size_limit,
                                     batch_limit_by_tokens,
                                     sort_by_length=False,
                                     shuffle=False):
            x_mask = batch[0] != src_vocab.pad_index
            x_mask = x_mask.unsqueeze(1)

            x_e = model.encode(batch[0], x_mask)
            y_hat, _ = beam_search(x_e,
                                   x_mask,
                                   model,
                                   get_scores=short_sent_penalty)
            sentence = src_field.to_sentence_str(batch[0][-1].tolist())
            generated = tgt_field.to_sentence_str(y_hat[-1].tolist())

            logger.info('SENTENCE:\n ---- {}'.format(sentence))
            logger.info('GENERATED:\n ---- {}'.format(generated))

            for generated in (tgt_field.to_sentence_str(s)
                              for s in y_hat.tolist()):
                output_stream.write(f'{generated}\n')
Example #4
def train(cfg_file: str) -> None:
    """
    Main training function. After training, also test on test data if given.

    :param cfg_file: path to configuration yaml file
    """
    cfg = load_config(cfg_file)

    # set the random seed
    set_seed(seed=cfg["training"].get("random_seed", 42))

    # load the data
    train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(
        data_cfg=cfg["data"])

    # build an encoder-decoder model
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    # for training management, e.g. early stopping and model selection
    trainer = TrainManager(model=model, config=cfg)

    # store copy of original training config in model dir
    shutil.copy2(cfg_file, trainer.model_dir + "/config.yaml")

    # log all entries of config
    log_cfg(cfg, trainer.logger)

    log_data_info(train_data=train_data,
                  valid_data=dev_data,
                  test_data=test_data,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab,
                  logging_function=trainer.logger.info)

    trainer.logger.info(str(model))

    # store the vocabs
    src_vocab_file = "{}/src_vocab.txt".format(cfg["training"]["model_dir"])
    src_vocab.to_file(src_vocab_file)
    trg_vocab_file = "{}/trg_vocab.txt".format(cfg["training"]["model_dir"])
    trg_vocab.to_file(trg_vocab_file)

    # train the model
    trainer.train_and_validate(train_data=train_data, valid_data=dev_data)

    # predict with the best model on validation and test
    # (if test data is available)
    ckpt = "{}/{}.ckpt".format(trainer.model_dir, trainer.best_ckpt_iteration)
    output_name = "{:08d}.hyps".format(trainer.best_ckpt_iteration)
    output_path = os.path.join(trainer.model_dir, output_name)
    test(cfg_file, ckpt=ckpt, output_path=output_path, logger=trainer.logger)
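
For context, train above is the kind of function that gets wired to a thin command-line wrapper; a minimal sketch (the argument name below is an assumption, not taken from the example) might look like:

# Illustrative entry point only; the argument name is an assumption.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser("training")
    parser.add_argument("config_path", type=str,
                        help="path to the YAML training configuration")
    args = parser.parse_args()
    train(cfg_file=args.config_path)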
Example #5
def main(model, test_src, test_trg, dictionary_src, dictionary_trg):
    # load model options
    config = load_json('{}.json'.format(model))

    # load source dictionary and invert
    word_dict = load_dictionary(dictionary_src)
    word_idict = invert_dictionary(word_dict)
    word_idict[0] = config['eos_symbol']
    word_idict[1] = config['unk_symbol']

    # load target dictionary and invert
    word_dict_trg = load_dictionary(dictionary_trg)
    word_idict_trg = invert_dictionary(word_dict_trg)
    word_idict_trg[0] = config['eos_symbol']
    word_idict_trg[1] = config['unk_symbol']

    # load data
    data_iter = TextIterator(test_src, test_trg, [dictionary_src],
                             dictionary_trg,
                             n_words_source=config['n_words_src'],
                             n_words_target=config['n_words_trg'],
                             batch_size=config['valid_batch_size'],
                             maxlen=100000, shuffle_each_epoch=False)

    print('Loading model')
    params = init_params(config)
    params = load_params(model + '.npz', params)
    tparams = init_theano_params(params)

    # random generator and global dropout/noise switch for this model
    trng = RandomStreams(1234)

    x, x_mask, y, y_mask, opt_ret, cost = build_model(
        tparams, trng, config, use_mask=True, use_noise=False)
    inps = [x, x_mask, y, y_mask]

    print('Building f_log_probs...', end="")
    f_log_probs = theano.function(inps, cost, profile=False)
    print('Done')

    # calculate the probabilities
    loss, perplexity = pred_probs(f_log_probs, prepare_batch, data_iter)
    mean_loss = loss.mean()

    print('Loss: %f' % mean_loss)
    print('PPX: %f' % perplexity)
Example #6
    def test_tied_embeddings(self):

        torch.manual_seed(self.seed)
        cfg = copy.deepcopy(self.cfg)
        cfg["model"]["tied_embeddings"] = True
        cfg["model"]["tied_softmax"] = False

        src_vocab = trg_vocab = self.vocab

        model = build_model(cfg["model"],
                            src_vocab=src_vocab,
                            trg_vocab=trg_vocab)

        self.assertEqual(src_vocab.itos, trg_vocab.itos)
        self.assertEqual(model.src_embed, model.trg_embed)
        self.assertTensorEqual(model.src_embed.lut.weight,
                               model.trg_embed.lut.weight)
        self.assertEqual(model.src_embed.lut.weight.shape,
                         model.trg_embed.lut.weight.shape)
Example #7
    def test_tied_softmax(self):

        torch.manual_seed(self.seed)
        cfg = copy.deepcopy(self.cfg)
        cfg["model"]["decoder"]["type"] = "transformer"
        cfg["model"]["tied_embeddings"] = False
        cfg["model"]["tied_softmax"] = True
        cfg["model"]["decoder"]["embeddings"]["embedding_dim"] = 64

        src_vocab = trg_vocab = self.vocab

        model = build_model(cfg["model"],
                            src_vocab=src_vocab,
                            trg_vocab=trg_vocab)

        self.assertEqual(model.trg_embed.lut.weight.shape,
                         model.decoder.output_layer.weight.shape)

        self.assertTensorEqual(model.trg_embed.lut.weight,
                               model.decoder.output_layer.weight)
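
For reference, the tying that these tests check amounts to sharing a single parameter tensor between the target embedding and the output projection. A minimal standalone illustration (not JoeyNMT code; the sizes are arbitrary) is:

# Standalone illustration of weight tying, independent of the example above.
import torch.nn as nn

vocab_size, emb_dim = 34, 64
trg_embed = nn.Embedding(vocab_size, emb_dim)
output_layer = nn.Linear(emb_dim, vocab_size, bias=False)
# after this assignment both modules read and update the same tensor
output_layer.weight = trg_embed.weight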
Example #8
    def test_transformer_layer_norm_init(self):
        torch.manual_seed(self.seed)
        cfg = copy.deepcopy(self.cfg)

        src_vocab = trg_vocab = self.vocab

        model = build_model(cfg["model"],
                            src_vocab=src_vocab,
                            trg_vocab=trg_vocab)

        def check_layer_norm(m: nn.Module):
            for name, child in m.named_children():
                if isinstance(child, nn.LayerNorm):
                    self.assertTensorEqual(child.weight,
                                           torch.ones([self.hidden_size]))
                    self.assertTensorEqual(child.bias,
                                           torch.zeros([self.hidden_size]))
                else:
                    check_layer_norm(child)

        check_layer_norm(model)
Example #9
def train(max_steps: int = 100,
          batch_size_limit: int = 400,
          batch_limit_by_tokens: bool = True,
          report_interval_steps: int = 10,
          validation_interval_steps: int = 100,
          lr_scheduler_at: str = 'every_step',
          n_ckpts_to_keep: int = 3,
          teacher_forcing: bool = True,
          random_seed: int = 42):

    set_random_seeds(random_seed)
    logger = get_logger()

    train_dataset = get_train_dataset()
    assert len(
        train_dataset.fields
    ) >= 2, "Train dataset must have at least two fields (source and target)."
    validation_dataset = get_validation_dataset()
    assert len(
        validation_dataset.fields
    ) >= 2, "Validation dataset must have at least two fields (source and target)."

    loss_function = get_loss_function(
        train_dataset.fields[1].vocabulary.pad_index)

    model = build_model(train_dataset.fields[0].vocabulary,
                        train_dataset.fields[1].vocabulary)

    model.to(get_device())
    loss_function.to(get_device())

    optimizer = build_optimizer(model.parameters())
    scheduler = build_scheduler(optimizer)

    initialize(model)

    def noop():
        return None

    def step_lr_scheduler():
        return scheduler.step()

    run_scheduler_at_step = noop
    run_scheduler_at_validation = noop
    run_scheduler_at_epoch = noop

    if scheduler is not None:
        if lr_scheduler_at == 'every_step':
            run_scheduler_at_step = step_lr_scheduler
        elif lr_scheduler_at == 'every_validation':
            run_scheduler_at_validation = step_lr_scheduler
        elif lr_scheduler_at == 'every_epoch':
            run_scheduler_at_epoch = step_lr_scheduler

    step = 0
    epoch = 0

    kept_checkpoint_path_score_map = {}

    best_checkpoint_specs = {"score": -math.inf, "step": -1}

    @configured('model')
    def maybe_save_checkpoint(score: Ignore[float], output_path: str):

        if len(kept_checkpoint_path_score_map) < n_ckpts_to_keep or \
                any(score > s for s in kept_checkpoint_path_score_map.values()):
            if len(kept_checkpoint_path_score_map) >= n_ckpts_to_keep:
                worst_checkpoint_path = min(
                    kept_checkpoint_path_score_map,
                    key=kept_checkpoint_path_score_map.get)
                kept_checkpoint_path_score_map.pop(worst_checkpoint_path)
                try:
                    os.unlink(worst_checkpoint_path)
                except OSError:
                    logger.warning(
                        'Could not unlink {}.'.format(worst_checkpoint_path))

            if score > best_checkpoint_specs["score"]:
                logger.info(
                    'New `best model` found with score {:.3f} at step {}.'.
                    format(score, step))
                best_checkpoint_specs["score"] = score
                best_checkpoint_specs["step"] = step

            state_dict = {
                "step": step,
                "best_checkpoint_specs": best_checkpoint_specs,
                "model_state": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
                "scheduler_state":
                    scheduler.state_dict() if scheduler is not None else None
            }
            checkpoint_path = '{}/step_{}_score_{:.3f}.pt'.format(
                output_path, step, score)
            torch.save(state_dict, checkpoint_path)
            kept_checkpoint_path_score_map[checkpoint_path] = score

    model.train()

    validation_done_already = False
    while step < max_steps:

        start_time = time.time()
        total_tokens_processed = 0
        for batch in train_dataset.iterate(get_device(), batch_size_limit,
                                           batch_limit_by_tokens):
            step += 1
            if step >= max_steps:
                break

            x_mask = batch[0] != model.src_vocab.pad_index
            x_mask = x_mask.unsqueeze(1)

            y_mask = batch[1] != model.tgt_vocab.pad_index
            y_mask = y_mask.unsqueeze(1)

            x_e = model.encode(batch[0], x_mask)
            log_probs = model.decode(batch[1][:, :-1],
                                     x_e,
                                     y_mask[:, :, :-1],
                                     x_mask,
                                     teacher_forcing=teacher_forcing)
            token_count = y_mask[:, :, 1:].sum().item()
            loss = loss_function(log_probs, batch[1][:, 1:],
                                 model.get_target_embeddings()) / token_count
            loss.backward()

            optimizer.step()
            mark_optimization_step()
            optimizer.zero_grad()

            run_scheduler_at_step()

            total_tokens_processed += token_count

            if step > 0 and step % report_interval_steps == 0:
                elapsed_time = time.time() - start_time
                baseline_loss = loss_function.uniform_baseline_loss(
                    log_probs, batch[1][:, 1:])
                logger.info(
                    'Epoch_{} Step_{}: loss={:.3f}(vs {:.3f} uniform), tokens/s={:.1f}, lr={}'
                    .format(epoch, step, loss.item(), baseline_loss,
                            total_tokens_processed / elapsed_time,
                            optimizer.param_groups[0]['lr']))
                start_time = time.time()
                total_tokens_processed = 0

            if step > 0 and step % validation_interval_steps == 0:
                log_prefix = 'Epoch_{} Step_{}'.format(epoch, step)
                score = evaluate(validation_dataset, log_prefix, model,
                                 loss_function)
                maybe_save_checkpoint(score)
                model.train()
                run_scheduler_at_validation()
                start_time = time.time()
                total_tokens_processed = 0
                validation_done_already = True
            else:
                validation_done_already = False

        epoch += 1
        logger.info('Epoch {} finished.'.format(epoch))
        run_scheduler_at_epoch()

    if not validation_done_already:
        log_prefix = 'Final (epoch={} ~ step={})'.format(epoch, step)
        score = evaluate(validation_dataset, log_prefix, model, loss_function)
        maybe_save_checkpoint(score)
    logger.info('Best validation score was {:.3f} at step {}.'.format(
        best_checkpoint_specs["score"], best_checkpoint_specs["step"]))
Example #10
def evaluate(validation_dataset: Corpora,
             log_prefix: Ignore[str],
             model: EncoderDecoder = None,
             loss_function: Callable = None,
             batch_size_limit: int = 400,
             batch_limit_by_tokens: bool = True,
             teacher_forcing: bool = True,
             metrics: Tuple[Metric, ...] = None):
    assert len(
        validation_dataset.fields
    ) >= 2, "Validation dataset must have at least two fields (source and target)."

    logger = get_logger()

    if loss_function is None:
        loss_function = get_loss_function(
            validation_dataset.fields[1].vocabulary.pad_index)
        loss_function.to(get_device())
    if model is None:
        best_model_path = find_best_model()
        if best_model_path is None:
            raise RuntimeError(
                'Model has not been trained yet. Train the model first.')
        model = build_model(validation_dataset.fields[0].vocabulary,
                            validation_dataset.fields[1].vocabulary)
        state_dict = torch.load(best_model_path)
        model.load_state_dict(state_dict['model_state'])
        model.to(get_device())
    pad_index = model.tgt_vocab.pad_index

    total_item_count = 0
    total_validation_loss = 0
    model.eval()

    printed_samples = 0

    if metrics is None:
        metrics = (BleuMetric(), )
    else:
        metrics = (BleuMetric(), ) + tuple(
            m for m in metrics if not isinstance(m, BleuMetric))

    with torch.no_grad():

        start_time = time.time()
        for validation_batch in validation_dataset.iterate(
                get_device(),
                batch_size_limit,
                batch_limit_by_tokens,
                sort_by_length=False,
                shuffle=False):
            x_mask = validation_batch[0] != model.src_vocab.pad_index
            x_mask = x_mask.unsqueeze(1)

            y_mask = validation_batch[1] != model.tgt_vocab.pad_index
            y_mask = y_mask.unsqueeze(1)

            x_e = model.encode(validation_batch[0], x_mask)
            log_probs = model.decode(validation_batch[1][:, :-1],
                                     x_e,
                                     y_mask[:, :, :-1],
                                     x_mask,
                                     teacher_forcing=teacher_forcing)

            loss = loss_function(log_probs, validation_batch[1][:, 1:],
                                 model.get_target_embeddings())
            total_item_count += y_mask[:, :, 1:].sum().item()
            total_validation_loss += loss.item()

            y_hat, _ = beam_search(x_e,
                                   x_mask,
                                   model,
                                   get_scores=short_sent_penalty)

            if printed_samples < 4:
                sentence = validation_dataset.fields[0].to_sentence_str(
                    validation_batch[0][-1].tolist())
                reference = validation_dataset.fields[1].to_sentence_str(
                    validation_batch[1][-1].tolist())
                generated = validation_dataset.fields[1].to_sentence_str(
                    y_hat[-1].tolist())
                logger.info('SENTENCE:\n ---- {}'.format(sentence))
                logger.info('REFERENCE:\n ---- {}'.format(reference))
                logger.info('GENERATED:\n ---- {}'.format(generated))

                printed_samples += 1

            update_metric_params(y_hat, validation_batch[1], pad_index,
                                 metrics)

    elapsed_time = time.time() - start_time
    logger.info(
        f'{log_prefix}: '
        f'evaluation_loss={total_validation_loss / total_item_count:.3f}, '
        f'elapsed_time={int(elapsed_time + 0.5)}s')
    for metric_repr in (str(m) for m in metrics):
        logger.info(f'{log_prefix}: evaluation {metric_repr}')

    return metrics[0].get_score()
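
The BleuMetric used above is not shown in this example. As a rough point of reference only, corpus-level BLEU over detokenized hypothesis and reference strings can be computed with the sacrebleu package (an assumption here; the example's own metric implementation may differ):

# Rough illustration of the kind of score a BLEU metric reports;
# assumes the sacrebleu package, which the example itself does not show.
import sacrebleu

hypotheses = ["the cat sat on the mat"]
references = ["the cat sat on a mat"]
bleu = sacrebleu.corpus_bleu(hypotheses, [references])
print(bleu.score)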
Example #11
    def build_models(self, **kwargs):
        logger.info('Building model')
        self.params, self.encoder_param_names = init_params(self.config)

        # reload parameters
        if self.config['reload'] and os.path.exists(self.model_path):
            logger.info('Reloading model parameters')
            self.params = load_params(self.model_path, self.params)

        self.tparams = init_theano_params(self.params)

        if self.shared_params is not None:
            # multi-task support
            # we replace whatever parameters we already have at this point with
            # the ones that we received as optional input
            # this needs to be done BEFORE building the model
            self.params, self.tparams = self.apply_shared_theano_params(
                self.shared_params, self.params, self.tparams)

        # random generator and global dropout/noise switch for this model
        self.trng = RandomStreams(1234)

        inps, opt_ret, cost = build_model(self.tparams, self.trng, self.config)
        x, x_mask, y, y_mask = inps  # needed by the alpha regularizer below

        cost = cost.mean()

        logger.info('Building tools')
        self.f_init, self.f_next = build_sampler(self.tparams, self.config,
                                                 self.trng)

        # apply L2 regularization on weights
        if self.decay_c > 0.:
            decay_c = theano.shared(np.float32(self.decay_c), name='decay_c')
            weight_decay = 0.
            for kk, vv in iteritems(self.tparams):
                weight_decay += (vv**2).sum()
            weight_decay *= decay_c
            cost += weight_decay

        # regularize the alpha weights
        if self.alpha_c > 0. and not self.config['decoder'].endswith('simple'):
            alpha_c = theano.shared(np.float32(self.alpha_c), name='alpha_c')
            alpha_reg = alpha_c * ((
                tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None]
                - opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
            cost += alpha_reg

        # after all regularizers - compile the computational graph for cost
        logger.info('Building f_cost...')
        f_cost = theano.function(inps, cost, profile=profile)
        logger.info('Done')

        logger.info('Computing gradient...')
        grads = tensor.grad(cost, wrt=list(itervalues(self.tparams)))
        grads = clip_grad_norm(grads, self.clip_c)
        logger.info('Done')

        # compile the optimizer, the actual computational graph is compiled here
        lr = tensor.scalar(name='lr')
        logger.info('Building optimizers...')
        self.f_grad_shared, self.f_update = eval(self.optimizer)(lr,
                                                                 self.tparams,
                                                                 grads, inps,
                                                                 cost, opt_ret)
        logger.info('Done')

        # log probability function (for validation, so use model without noise!)
        logger.info('Building f_log_probs...')
        self.test_inp, _, self.test_cost = build_model(self.tparams,
                                                       self.trng,
                                                       self.config,
                                                       use_mask=True,
                                                       use_noise=False)
        self.f_log_probs = theano.function(self.test_inp,
                                           self.test_cost,
                                           profile=profile)
        logger.info('Done')
Example #12
def translate(cfg_file, ckpt: str, output_path: str = None) -> None:
    """
    Interactive translation function.
    Loads model from checkpoint and translates either the stdin input or
    asks for input to translate interactively.
    The input has to be pre-processed according to the data that the model
    was trained on, i.e. tokenized or split into subwords.
    Translations are printed to stdout.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to a file where translations are written (optional)
    """
    def _load_line_as_data(line):
        """ Create a dataset from one line via a temporary file. """
        # write src input to temporary file
        tmp_name = "tmp"
        tmp_suffix = ".src"
        tmp_filename = tmp_name + tmp_suffix
        with open(tmp_filename, "w") as tmp_file:
            tmp_file.write("{}\n".format(line))

        test_data = MonoDataset(path=tmp_name, ext=tmp_suffix, field=src_field)

        # remove temporary file
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)

        return test_data

    def _translate_data(test_data):
        """ Translates given dataset, using parameters from outer scope. """
        # pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
        hypotheses_raw, attention_scores = validate_on_data(
            model, data=test_data, batch_size=batch_size, level=level,
            max_output_length=max_output_length, eval_metric="",
            use_cuda=use_cuda, loss_function=None, beam_size=beam_size,
            beam_alpha=beam_alpha)
        return hypotheses

    cfg = load_config(cfg_file)

    # when checkpoint is not specified, take latest from model dir
    if ckpt is None:
        model_dir = cfg["training"]["model_dir"]
        ckpt = get_latest_checkpoint(model_dir)

    batch_size = cfg["training"].get("batch_size", 1)
    use_cuda = cfg["training"].get("use_cuda", False)
    level = cfg["data"]["level"]
    max_output_length = cfg["training"].get("max_output_length", None)

    # read vocabs
    src_vocab_file = cfg["data"].get(
        "src_vocab", cfg["training"]["model_dir"] + "/src_vocab.txt")
    trg_vocab_file = cfg["data"].get(
        "trg_vocab", cfg["training"]["model_dir"] + "/trg_vocab.txt")
    src_vocab = Vocabulary(file=src_vocab_file)
    trg_vocab = Vocabulary(file=trg_vocab_file)

    data_cfg = cfg["data"]
    level = data_cfg["level"]
    lowercase = data_cfg["lowercase"]

    tok_fun = lambda s: list(s) if level == "char" else s.split()

    src_field = Field(init_token=None,
                      eos_token=EOS_TOKEN,
                      pad_token=PAD_TOKEN,
                      tokenize=tok_fun,
                      batch_first=True,
                      lower=lowercase,
                      unk_token=UNK_TOKEN,
                      include_lengths=True)
    src_field.vocab = src_vocab

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.cuda()

    # whether to use beam search for decoding, 0: greedy decoding
    if "testing" in cfg.keys():
        beam_size = cfg["testing"].get("beam_size", 0)
        beam_alpha = cfg["testing"].get("alpha", -1)
    else:
        beam_size = 0
        beam_alpha = -1

    if not sys.stdin.isatty():
        # file given
        test_data = MonoDataset(path=sys.stdin, ext="", field=src_field)
        hypotheses = _translate_data(test_data)

        if output_path is not None:
            output_path_set = "{}".format(output_path)
            with open(output_path_set, mode="w", encoding="utf-8") as out_file:
                for hyp in hypotheses:
                    out_file.write(hyp + "\n")
            print("Translations saved to: {}".format(output_path_set))
        else:
            for hyp in hypotheses:
                print(hyp)

    else:
        # enter interactive mode
        batch_size = 1
        while True:
            try:
                src_input = input("\nPlease enter a source sentence "
                                  "(pre-processed): \n")
                if not src_input.strip():
                    break

                # every line has to be made into dataset
                test_data = _load_line_as_data(line=src_input)

                hypotheses = _translate_data(test_data)
                print("JoeyNMT: {}".format(hypotheses[0]))

            except (KeyboardInterrupt, EOFError):
                print("\nBye.")
                break
Example #13
def test(cfg_file,
         ckpt: str,
         output_path: str = None,
         save_attention: bool = False,
         logger: logging.Logger = None) -> None:
    """
    Main test function. Handles loading a model from checkpoint, generating
    translations and storing them and attention plots.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    :param output_path: path to output
    :param save_attention: whether to save the computed attention weights
    :param logger: log output to this logger (creates new logger if not set)
    """

    if logger is None:
        logger = logging.getLogger(__name__)
        FORMAT = '%(asctime)-15s - %(message)s'
        logging.basicConfig(format=FORMAT)
        logger.setLevel(level=logging.DEBUG)

    cfg = load_config(cfg_file)

    if "test" not in cfg["data"].keys():
        raise ValueError("Test data must be specified in config.")

    # model_dir and step are needed below for naming the attention plots
    model_dir = cfg["training"]["model_dir"]
    step = "best"

    # when checkpoint is not specified, take latest (best) from model dir
    if ckpt is None:
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError(
                "No checkpoint found in directory {}.".format(model_dir))
        try:
            step = ckpt.split(model_dir + "/")[1].split(".ckpt")[0]
        except IndexError:
            step = "best"

    batch_size = cfg["training"]["batch_size"]
    batch_type = cfg["training"].get("batch_type", "sentence")
    use_cuda = cfg["training"].get("use_cuda", False)
    level = cfg["data"]["level"]
    eval_metric = cfg["training"]["eval_metric"]
    max_output_length = cfg["training"].get("max_output_length", None)

    # load the data
    _, dev_data, test_data, src_vocab, trg_vocab = load_data(
        data_cfg=cfg["data"])

    data_to_predict = {"dev": dev_data, "test": test_data}

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.cuda()

    # whether to use beam search for decoding, 0: greedy decoding
    if "testing" in cfg.keys():
        beam_size = cfg["testing"].get("beam_size", 0)
        beam_alpha = cfg["testing"].get("alpha", -1)
    else:
        beam_size = 0
        beam_alpha = -1

    for data_set_name, data_set in data_to_predict.items():

        #pylint: disable=unused-variable
        score, loss, ppl, sources, sources_raw, references, hypotheses, \
        hypotheses_raw, attention_scores = validate_on_data(
            model, data=data_set, batch_size=batch_size,
            batch_type=batch_type, level=level,
            max_output_length=max_output_length, eval_metric=eval_metric,
            use_cuda=use_cuda, loss_function=None, beam_size=beam_size,
            beam_alpha=beam_alpha)
        #pylint: enable=unused-variable

        if "trg" in data_set.fields:
            decoding_description = "Greedy decoding" if beam_size == 0 else \
                "Beam search decoding with beam size = {} and alpha = {}".\
                    format(beam_size, beam_alpha)
            logger.info("%4s %s: %6.2f [%s]", data_set_name, eval_metric,
                        score, decoding_description)
        else:
            logger.info("No references given for %s -> no evaluation.",
                        data_set_name)

        if save_attention:
            if attention_scores:
                attention_name = "{}.{}.att".format(data_set_name, step)
                attention_path = os.path.join(model_dir, attention_name)
                logger.info(
                    "Saving attention plots. This might take a while..")
                store_attention_plots(attentions=attention_scores,
                                      targets=hypotheses_raw,
                                      sources=[s for s in data_set.src],
                                      indices=range(len(hypotheses)),
                                      output_prefix=attention_path)
                logger.info("Attention plots saved to: %s", attention_path)
            else:
                logger.warning("Attention scores could not be saved. "
                               "Note that attention scores are not available "
                               "when using beam search. "
                               "Set beam_size to 0 for greedy decoding.")

        if output_path is not None:
            output_path_set = "{}.{}".format(output_path, data_set_name)
            with open(output_path_set, mode="w", encoding="utf-8") as out_file:
                for hyp in hypotheses:
                    out_file.write(hyp + "\n")
            logger.info("Translations saved to: %s", output_path_set)
Example #14
    def train(self,
              model_name=None,
              output_dir=None,
              src_train=None,
              trg_train=None,
              src_valid=None,
              trg_valid=None,
              src_dicts=None,
              trg_dicts=None,
              factors=1,
              factors_trg=1,
              n_words_src=50000,
              n_words_trg=50000,
              dim_emb=100,
              dim_per_factor=(100, ),
              dim=100,
              dim_att=200,
              encoder='gru',
              encoder_layers=1,
              decoder='gru_cond',
              optimizer='adadelta',
              learning_rate=1e-3,
              decay_c=0.,
              clip_c=1.,
              alpha_c=0.,
              dropout=False,
              dropout_src=0.,
              dropout_trg=0.,
              dropout_emb=0.,
              dropout_rec=0.,
              dropout_hid=0.,
              batch_size=80,
              valid_batch_size=80,
              k=5,
              maxlen=50,
              max_epochs=20,
              bleu_script='nmt/multi-bleu.perl',
              bleu_val_burnin=0,
              val_set_out='validation.txt',
              validation_frequency=-1,
              display_frequency=100,
              save_frequency=-1,
              sample_frequency=200,
              beam_size=12,
              track_n_models=3,
              finish_after=-1,
              unk_symbol='<UNK>',
              eos_symbol='</s>',
              patience=10,
              early_stopping='cost',
              reload=False,
              verbose=1,
              disp_alignments=True,
              mtl=False,
              mtl_ratio=(),
              mtl_configs=(),
              mtl_decoder=False,
              n_shared_layers=1,
              **kwargs):
        """
        Train an NMT system
        :return:
        """

        # log options
        config = self.config
        logger.info(pformat(self.config))

        # Model options
        model_path = os.path.join(output_dir, model_name + '.npz')
        config_path = os.path.join(output_dir, model_name + '.json')

        # create output dir if it does not exist
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # load dictionaries and invert them
        worddicts_src = [load_dictionary(d) for d in src_dicts]
        worddicts_trg = [load_dictionary(d) for d in trg_dicts]
        worddicts_src_r = [invert_dictionary(d) for d in worddicts_src]
        worddicts_trg_r = [invert_dictionary(d) for d in worddicts_trg]

        # reload options
        if reload:
            if os.path.exists(config_path):
                logger.info('Reloading model options: %s' % config_path)
                config = load_json(config_path)
            else:
                logger.info(
                    'Did NOT reload model options (file did not exist)')

        logger.info('Loading data')
        train = TextIterator(src_train,
                             trg_train,
                             src_dicts,
                             trg_dicts,
                             batch_size=batch_size,
                             maxlen=maxlen,
                             n_words_source=n_words_src,
                             n_words_target=n_words_trg,
                             shuffle_each_epoch=True,
                             sort_by_length=True,
                             maxibatch_size=20,
                             factors=factors,
                             factors_trg=factors_trg)

        valid = TextIterator(src_valid,
                             trg_valid,
                             src_dicts,
                             trg_dicts,
                             batch_size=batch_size,
                             maxlen=maxlen,
                             n_words_source=n_words_src,
                             n_words_target=n_words_trg,
                             shuffle_each_epoch=False,
                             sort_by_length=True,
                             maxibatch_size=20,
                             factors=factors,
                             factors_trg=factors_trg)

        logger.info('Building model')
        params, encoder_param_names = init_params(config)

        # reload parameters
        if reload and os.path.exists(model_path):
            logger.info('Reloading model parameters')
            params = load_params(model_path, params)

        tparams = init_theano_params(params)

        if self.shared_params is not None:
            # multi-task support
            # we replace whatever parameters we already have at this point with
            # the ones that we received as optional input
            # this needs to be done BEFORE building the model
            params, tparams = self.apply_shared_theano_params(
                self.shared_params, params, tparams)

        # random generator and global dropout/noise switch for this model
        trng = RandomStreams(1234)

        inps, opt_ret, cost = build_model(tparams, trng, config)
        x, x_mask, y, y_mask = inps  # the alpha regularizer below needs the masks

        cost = cost.mean()

        logger.info('Building tools')
        f_init, f_next = build_sampler(tparams, config, trng)

        # apply L2 regularization on weights
        if decay_c > 0.:
            decay_c = theano.shared(np.float32(decay_c), name='decay_c')
            weight_decay = 0.
            for kk, vv in iteritems(tparams):
                weight_decay += (vv**2).sum()
            weight_decay *= decay_c
            cost += weight_decay

        # regularize the alpha weights
        if alpha_c > 0. and not decoder.endswith('simple'):
            alpha_c = theano.shared(np.float32(alpha_c), name='alpha_c')
            alpha_reg = alpha_c * ((
                tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None]
                - opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
            cost += alpha_reg

        # after all regularizers - compile the computational graph for cost
        logger.info('Building f_cost...')
        f_cost = theano.function(inps, cost, profile=profile)
        logger.info('Done')

        logger.info('Computing gradient...')
        grads = tensor.grad(cost, wrt=list(itervalues(tparams)))
        grads = clip_grad_norm(grads, clip_c)
        logger.info('Done')

        # compile the optimizer, the actual computational graph is compiled here
        lr = tensor.scalar(name='lr')
        logger.info('Building optimizers...')
        f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps,
                                                  cost, opt_ret)
        logger.info('Done')

        # log probability function (for validation, so use model without noise!)
        logger.info('Building f_log_probs...')
        test_inp, _, test_cost = build_model(tparams,
                                             trng,
                                             config,
                                             use_mask=True,
                                             use_noise=False)
        f_log_probs = theano.function(test_inp, test_cost, profile=profile)
        logger.info('Done')

        # bleu validation
        bleu_validator = SimpleBleuValidator(
            tparams,
            config,
            trng,
            f_init,
            f_next,
            k=beam_size,
            src_dicts=worddicts_src,
            trg_idict=worddicts_trg_r[0],
            normalize=True,
            main_loop=self) if bleu_script else None

        # multi-task learning
        mtl_tasks = []
        shared_params = OrderedDict()
        for k in encoder_param_names:
            shared_params[k] = tparams[k]

        if mtl:
            logger.info('Preparing MTL tasks')
            task_config = yaml.load(open(mtl_configs[0], mode='rb'))
            if task_config['model'] == 'imaginet':
                task_config['exp_id'] = self.config['exp_id']
                mtl_tasks.append(ImaginetTrainer(task_config, shared_params))
            elif task_config['model'] == 'nmt':
                mtl_tasks.append(Trainer(task_config, shared_params))
            assert sum(mtl_ratio) == 1., 'MTL ratio must sum to 1'

        # to check how many times a task was executed
        task_stats = np.zeros(len(mtl_tasks) + 1)

        # start of optimization main loop
        logger.info('Optimization started...')

        early_stop = False
        saved_model_paths = []  # history of saved models
        best_params = None
        bad_counter = 0

        # reload history
        if reload and os.path.exists(model_path):
            self.history_errs = list(np.load(model_path)['history_errs'])
            self.history_bleu = list(np.load(model_path)['history_bleu'])
            self.update_idx = np.load(model_path)['update_idx']

        # set frequencies - if -1 is specified then freq set to #iters in epoch
        validation_frequency = len(train) // batch_size \
            if validation_frequency == -1 else validation_frequency
        save_frequency = len(train) // batch_size \
            if save_frequency == -1 else save_frequency
        sample_frequency = len(train) // batch_size \
            if sample_frequency == -1 else sample_frequency

        # save initial model so we can re-use parameters (seed)
        logger.info('Saving initial model')
        params = unzip(tparams)
        with open(model_path + '.init', mode='wb') as f:
            np.savez(f,
                     history_errs=self.history_errs,
                     history_bleu=self.history_bleu,
                     update_idx=self.update_idx,
                     **params)
        dump_json(config, config_path)
        logger.info('Done saving model')

        for epoch_idx in range(max_epochs):

            self.epoch_idx = epoch_idx
            # self.update_idx // (len(train) // batch_size)

            n_samples = 0

            # iterate over data batches
            for x_, y_ in train:

                # multi-task learning -- we simply do other tasks until we are
                # allowed to perform the main task (this loop)
                if mtl:
                    n_tasks = len(mtl_ratio)
                    task = 1
                    while task > 0:
                        task = np.random.choice(n_tasks,
                                                1,
                                                replace=False,
                                                p=mtl_ratio)[0]
                        task_stats[task] += 1

                        if task > 0:
                            mtl_tasks[task - 1].train_next_batch()
                            # print('Training on task {:d}'.format(task))

                # NMT training
                n_samples += len(x_)
                self.update_idx += 1

                x, x_mask, y, y_mask = prepare_batch(x_, y_, maxlen=None)
                y = y[0]  # only use first target factor for NMT

                inputs = [x, x_mask, y, y_mask]

                if x is None:
                    logger.warning(
                        'Empty mini-batch! maxlen={}'.format(maxlen))
                    self.update_idx -= 1
                    continue

                # get error on this batch
                update_start_time = time.time()
                ret_vals = f_grad_shared(*inputs)
                cost = ret_vals[0]

                # do the update on parameters
                f_update(learning_rate)

                update_time = time.time() - update_start_time

                # check for bad numbers
                if np.isnan(cost) or np.isinf(cost):
                    logger.warning('NaN detected')
                    return 1., 1., 1.

                # verbose
                if np.mod(self.update_idx, display_frequency) == 0:
                    if disp_alignments:  # display info with max alpha value
                        logger.info(
                            'Epoch %4d Update %8d Cost %4.8f UD %0.12f Max-alpha %0.4f'
                            % (self.epoch_idx, self.update_idx, cost,
                               update_time, ret_vals[1].max()))
                    else:  # display general info
                        logger.info(
                            'Epoch %4d Update %8d Cost %4.8f UD %0.12f' %
                            (self.epoch_idx, self.update_idx, cost,
                             update_time))

                # generate some samples
                if np.mod(self.update_idx, sample_frequency) == 0:
                    print_samples(x, y, trng, f_init, f_next, maxlen, factors,
                                  worddicts_src_r, worddicts_trg_r, unk_symbol)

                # validation
                if np.mod(self.update_idx, validation_frequency) == 0:

                    # intrinsic validation
                    valid_errs, perplexity = pred_probs(
                        f_log_probs, prepare_batch, valid)
                    valid_err = valid_errs.mean()

                    if np.isnan(valid_err):
                        logger.warning('valid_err NaN detected')
                        early_stop = True
                        break

                    # output validation info
                    logger.info('Validation error: {:1.12f} PPX: {:f}'.format(
                        valid_err, perplexity))

                    # BLEU validation
                    if bleu_validator and self.update_idx >= bleu_val_burnin:
                        bleu_score = bleu_validator.evaluate_model()
                        logger.info('BLEU = {}'.format(bleu_score))

                    # save the best 3 models according to early-stopping
                    if track_n_models > 0 and len(self.history_errs) > 0:

                        if early_stopping == 'cost':
                            if valid_err <= min(self.history_errs):
                                logger.info(
                                    'Saving model at epoch {} / iter {}...'.
                                    format(self.epoch_idx, self.update_idx))
                                path = os.path.join(
                                    output_dir, '{}.ep{}.iter{}.npz'.format(
                                        model_name, self.epoch_idx,
                                        self.update_idx))
                                with open(path, mode='wb') as f:
                                    np.savez(f,
                                             history_errs=self.history_errs,
                                             history_bleu=self.history_bleu,
                                             update_idx=self.update_idx,
                                             **unzip(tparams))

                                saved_model_paths.append(path)
                                logger.info('Done saving model')

                        # Save a model only if we've exceeding the point where
                        # we start measuring BLEU scores
                        elif early_stopping == 'bleu' and self.update_idx >= bleu_val_burnin:
                            if len(self.history_bleu
                                   ) > 0 and bleu_score >= max(
                                       self.history_bleu):
                                with open('%s/bestBLEU' % output_dir,
                                          'w') as bestbleuhandle:
                                    bestbleuhandle.write("%f" % bleu_score)
                                logger.info(
                                    'Saving model at epoch {} / iter {}...'.
                                    format(self.epoch_idx, self.update_idx))
                                path = os.path.join(
                                    output_dir,
                                    '{}.ep{}.iter{}.bleu{}.npz'.format(
                                        model_name, self.epoch_idx,
                                        self.update_idx, bleu_score))
                                with open(path, mode='wb') as f:
                                    np.savez(f,
                                             history_errs=self.history_errs,
                                             history_bleu=self.history_bleu,
                                             update_idx=self.update_idx,
                                             **unzip(tparams))

                                saved_model_paths.append(path)
                                logger.info('Done saving model')

                        # Remove un-needed saved models if necessary
                        if len(saved_model_paths) > track_n_models:
                            path = saved_model_paths[0]
                            logger.info('Deleting old model {}'.format(path))
                            with ignored(OSError):
                                os.remove(path)

                            saved_model_paths.pop(0)

                    # remember the validation result
                    self.history_errs.append(valid_err)
                    if early_stopping == 'bleu' and self.update_idx >= bleu_val_burnin:
                        # Remember the BLEU score at this point
                        self.history_bleu.append(bleu_score)

                    # reset bad counter (patience) if best validation so far
                    if early_stopping == 'cost':
                        if self.update_idx == 0 or valid_err <= \
                                np.array(self.history_errs).min():
                            best_params = unzip(tparams)
                            if mtl:
                                # Force the other tasks to save too
                                mtl_tasks[0].save(
                                    string=".cost{}".format(valid_err))
                            if bad_counter > 0:
                                bad_counter -= 1
                    elif early_stopping == 'bleu':
                        if self.update_idx >= bleu_val_burnin:
                            if bleu_score >= max(self.history_bleu):
                                best_params = unzip(tparams)
                                if mtl:
                                    # Force the other tasks to save too
                                    mtl_tasks[0].save(
                                        string=".bleu{}".format(bleu_score))
                                if bad_counter > 0:
                                    bad_counter -= 1

                    # save the best model so far (according to cost)
                    logger.info('Saving best model (according to {})'.format(
                        early_stopping))
                    if best_params is not None:
                        params = best_params
                    else:
                        params = unzip(tparams)
                    np.savez(model_path,
                             history_errs=self.history_errs,
                             history_bleu=self.history_bleu,
                             update_idx=self.update_idx,
                             **params)
                    logger.info('Done saving best model')

                    # check for early stop
                    if early_stopping == 'cost':
                        if len(self.history_errs) > patience and valid_err >= \
                                np.array(self.history_errs)[:-patience].min():
                            bad_counter += 1
                            logger.warning('Bad validation result. {}/{}'.format(
                                bad_counter, patience))

                            if bad_counter >= patience:
                                logger.info('Early stop activated.')
                                early_stop = True
                    elif early_stopping == 'bleu':
                        if len(self.history_bleu) > patience and bleu_score <= \
                                np.array(self.history_bleu)[:-patience].max():
                            bad_counter += 1
                            logger.warning('Bad validation result. {}/{}'.format(
                                bad_counter, patience))

                            if bad_counter >= patience:
                                logger.info('Early stop activated.')
                                early_stop = True

            # finish after this many updates (disabled when finish_after == -1)
            if 0 < finish_after <= self.update_idx:
                logger.info('Finishing after {:d} iterations!'.format(
                    self.update_idx))
                early_stop = True

            if early_stop:
                logger.info('Early Stop!')
                return 0

            if mtl:
                logger.info(task_stats / task_stats.sum())

        logger.info('Seen {:d} samples'.format(n_samples))
        logger.info('Finished with main loop')
        return 0