Example #1
def build_gradient_clipper(config: dict) -> Optional[Callable]:
    """
    Define the function for gradient clipping as specified in configuration.
    If not specified, returns None.

    Current options:
        - "clip_grad_val": clip the gradients if they exceed this value,
            see `torch.nn.utils.clip_grad_value_`
        - "clip_grad_norm": clip the gradients if their norm exceeds this value,
            see `torch.nn.utils.clip_grad_norm_`

    :param config: dictionary with training configurations
    :return: clipping function (in-place) or None if no gradient clipping
    """
    clip_grad_fun = None
    if "clip_grad_val" in config.keys():
        clip_value = config["clip_grad_val"]
        clip_grad_fun = lambda params: \
            nn.utils.clip_grad_value_(parameters=params,
                                      clip_value=clip_value)
    elif "clip_grad_norm" in config.keys():
        max_norm = config["clip_grad_norm"]
        clip_grad_fun = lambda params: \
            nn.utils.clip_grad_norm_(parameters=params, max_norm=max_norm)

    if "clip_grad_val" in config.keys() and "clip_grad_norm" in config.keys():
        raise ConfigurationError(
            "You can only specify either clip_grad_val or clip_grad_norm.")

    return clip_grad_fun
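
A quick usage sketch (the toy model and config below are illustrative only); the returned callable is applied in place to the parameters after the backward pass:

import torch
import torch.nn as nn

model = nn.Linear(4, 2)
train_config = {"clip_grad_norm": 1.0}   # or {"clip_grad_val": 5.0}, never both
clip_fn = build_gradient_clipper(train_config)

loss = model(torch.randn(8, 4)).sum()
loss.backward()
if clip_fn is not None:                  # None means no clipping was configured
    clip_fn(model.parameters())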
Example #2
def build_optimizer(config: dict, parameters: Generator) -> Optimizer:
    """
    Create an optimizer for the given parameters as specified in config.

    Except for the weight decay and initial learning rate,
    default optimizer settings are used.

    Currently supported configuration settings for "optimizer":
        - "sgd" (default): see `torch.optim.SGD`
        - "adam": see `torch.optim.adam`
        - "adagrad": see `torch.optim.adagrad`
        - "adadelta": see `torch.optim.adadelta`
        - "rmsprop": see `torch.optim.RMSprop`

    The initial learning rate is set according to "learning_rate" in the config.
    The weight decay is set according to "weight_decay" in the config.
    If they are not specified, the initial learning rate is set to 3.0e-4, the
    weight decay to 0.

    Note that the optimizer state is saved in the checkpoint, so if you load
    a model for further training you have to use the same type of optimizer.

    :param config: configuration dictionary
    :param parameters: model parameters to be optimized
    :return: optimizer
    """
    optimizer_name = config.get("optimizer", "sgd").lower()
    learning_rate = config.get("learning_rate", 3.0e-4)
    weight_decay = config.get("weight_decay", 0)

    if optimizer_name == "adam":
        adam_betas = config.get("adam_betas", (0.9, 0.999))
        optimizer = torch.optim.Adam(parameters,
                                     weight_decay=weight_decay,
                                     lr=learning_rate,
                                     betas=adam_betas)
    elif optimizer_name == "adagrad":
        optimizer = torch.optim.Adagrad(parameters,
                                        weight_decay=weight_decay,
                                        lr=learning_rate)
    elif optimizer_name == "adadelta":
        optimizer = torch.optim.Adadelta(parameters,
                                         weight_decay=weight_decay,
                                         lr=learning_rate)
    elif optimizer_name == "rmsprop":
        optimizer = torch.optim.RMSprop(parameters,
                                        weight_decay=weight_decay,
                                        lr=learning_rate)
    elif optimizer_name == "sgd":
        # default
        optimizer = torch.optim.SGD(parameters,
                                    weight_decay=weight_decay,
                                    lr=learning_rate)
    else:
        raise ConfigurationError("Invalid optimizer. Valid options: 'adam', "
                                 "'adagrad', 'adadelta', 'rmsprop', 'sgd'.")
    return optimizer
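
An illustrative call; the config keys mirror the lookups above, and the values are placeholders:

import torch.nn as nn

model = nn.Linear(4, 2)
train_config = {
    "optimizer": "adam",
    "learning_rate": 1.0e-3,      # default would be 3.0e-4
    "weight_decay": 0.0,
    "adam_betas": (0.9, 0.98),    # only read when optimizer == "adam"
}
optimizer = build_optimizer(config=train_config, parameters=model.parameters())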
Example #3
def build_model(cfg: dict = None,
                src_vocab: Vocabulary = None,
                trg_vocab: Vocabulary = None) -> Model:
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :return: built and initialized model
    """
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                           vocab_size=len(src_vocab),
                           padding_idx=src_padding_idx)

    if cfg.get("tied_embeddings", False):
        if src_vocab.itos == trg_vocab.itos:
            # share embeddings for src and trg
            trg_embed = src_embed
        else:
            raise ConfigurationError(
                "Embedding cannot be tied since vocabularies differ.")
    else:
        trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                               vocab_size=len(trg_vocab),
                               padding_idx=trg_padding_idx)

    encoder = RecurrentEncoder(**cfg["encoder"],
                               emb_size=src_embed.embedding_dim)
    decoder = RecurrentDecoder(**cfg["decoder"],
                               encoder=encoder,
                               vocab_size=len(trg_vocab),
                               emb_size=trg_embed.embedding_dim)

    model = Model(encoder=encoder,
                  decoder=decoder,
                  src_embed=src_embed,
                  trg_embed=trg_embed,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab)

    # custom initialization of model parameters
    initialize_model(model, cfg, src_padding_idx, trg_padding_idx)

    return model
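
A hedged sketch of the config nesting this builder reads, given existing src_vocab/trg_vocab Vocabulary objects; the fields inside "encoder"/"decoder" beyond "embeddings" depend on the RecurrentEncoder/RecurrentDecoder signatures and are placeholders here:

cfg = {
    "tied_embeddings": False,
    "encoder": {
        "hidden_size": 64,
        "embeddings": {"embedding_dim": 32},
    },
    "decoder": {
        "hidden_size": 64,
        "embeddings": {"embedding_dim": 32},
    },
}
model = build_model(cfg, src_vocab=src_vocab, trg_vocab=trg_vocab)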
Example #4
def get_unroll_steps(unroll_steps_type: str, labels: torch.Tensor, epoch: int) -> int:
    """
    Get the number of unroll steps depending on unroll_steps_type.

    :param unroll_steps_type: type from yaml file
    :param labels: y values (ground truth)
    :param epoch: current epoch
    :return: number of steps to unroll the RNN
    """
    if unroll_steps_type == 'full_length':
        return labels.shape[1]
    elif unroll_steps_type == 'batch_length':
        return np.max(np.argwhere(labels.detach().numpy() == 3)[:, 1])
    elif unroll_steps_type == 'batch_number':
        return int(2 + np.ceil(epoch / 2))
    else:
        raise ConfigurationError('Unknown unroll_steps_type.')
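
A small illustration of the three modes; the hard-coded value 3 is assumed to be the EOS index in the label tensor:

import torch

# labels: (batch, time) ground-truth indices; 3 marks the assumed EOS position
labels = torch.tensor([[5, 7, 3, 0, 0],
                       [6, 3, 0, 0, 0]])
get_unroll_steps('full_length', labels, epoch=4)    # -> 5 (full label length)
get_unroll_steps('batch_length', labels, epoch=4)   # -> 2 (last column where a 3 occurs)
get_unroll_steps('batch_number', labels, epoch=4)   # -> 4 (int(2 + ceil(4 / 2)))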
Example #5
def build_model(cfg: dict = None,
                src_vocab: Vocabulary = None,
                trg_vocab: Vocabulary = None):
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :return: built and initialized model
    """
    if cfg.get("architecture", "encoder-decoder") == "encoder-decoder":
        model = build_encoder_decoder_model(cfg, src_vocab, trg_vocab)
    elif cfg.get("architecture", "encoder-decoder") == "unsupervised-nmt":
        model = build_unsupervised_nmt_model(cfg, src_vocab, trg_vocab)
    else:
        raise ConfigurationError(
            "Supported architectures: 'encoder-decoder' and 'unsupervised-nmt'"
        )

    return model
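
For illustration, the dispatch is driven by a single top-level key ("architecture", defaulting to "encoder-decoder"); everything else is consumed by the respective sub-builder:

cfg = {"architecture": "unsupervised-nmt"}   # plus the encoder/decoder sub-configs
model = build_model(cfg, src_vocab=src_vocab, trg_vocab=trg_vocab)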
Example #6
    def __init__(self,
                 model: Model,
                 config: dict,
                 batch_class: Batch = Batch) -> None:
        """
        Creates a new TrainManager for a model, specified as in configuration.

        :param model: torch module defining the model
        :param config: dictionary containing the training configurations
        :param batch_class: batch class to encapsulate the torch class
        """
        train_config = config["training"]
        self.batch_class = batch_class

        # files for logging and storing
        self.model_dir = train_config["model_dir"]
        assert os.path.exists(self.model_dir)

        self.logging_freq = train_config.get("logging_freq", 100)
        self.valid_report_file = "{}/validations.txt".format(self.model_dir)
        self.tb_writer = SummaryWriter(log_dir=self.model_dir +
                                       "/tensorboard/")

        self.save_latest_checkpoint = train_config.get("save_latest_ckpt",
                                                       True)

        # model
        self.model = model
        self._log_parameters_list()

        # objective
        self.label_smoothing = train_config.get("label_smoothing", 0.0)
        self.model.loss_function = XentLoss(pad_index=self.model.pad_index,
                                            smoothing=self.label_smoothing)
        self.normalization = train_config.get("normalization", "batch")
        if self.normalization not in ["batch", "tokens", "none"]:
            raise ConfigurationError("Invalid normalization option."
                                     "Valid options: "
                                     "'batch', 'tokens', 'none'.")

        # optimization
        self.learning_rate_min = train_config.get("learning_rate_min", 1.0e-8)

        self.clip_grad_fun = build_gradient_clipper(config=train_config)
        self.optimizer = build_optimizer(config=train_config,
                                         parameters=model.parameters())

        # validation & early stopping
        self.validation_freq = train_config.get("validation_freq", 1000)
        self.log_valid_sents = train_config.get("print_valid_sents", [0, 1, 2])
        self.ckpt_queue = collections.deque(
            maxlen=train_config.get("keep_last_ckpts", 5))
        self.eval_metric = train_config.get("eval_metric", "bleu")
        if self.eval_metric not in [
                'bleu', 'chrf', 'token_accuracy', 'sequence_accuracy'
        ]:
            raise ConfigurationError("Invalid setting for 'eval_metric', "
                                     "valid options: 'bleu', 'chrf', "
                                     "'token_accuracy', 'sequence_accuracy'.")
        self.early_stopping_metric = train_config.get("early_stopping_metric",
                                                      "eval_metric")

        # early_stopping_metric decides on how to find the early stopping point:
        # ckpts are written when there's a new high/low score for this metric.
        # If we schedule after BLEU/chrf/accuracy, we want to maximize the
        # score, else we want to minimize it.
        if self.early_stopping_metric in ["ppl", "loss"]:
            self.minimize_metric = True
        elif self.early_stopping_metric == "eval_metric":
            if self.eval_metric in [
                    "bleu", "chrf", "token_accuracy", "sequence_accuracy"
            ]:
                self.minimize_metric = False
            # eval metric that has to get minimized (not yet implemented)
            else:
                self.minimize_metric = True
        else:
            raise ConfigurationError(
                "Invalid setting for 'early_stopping_metric', "
                "valid options: 'loss', 'ppl', 'eval_metric'.")

        # eval options
        test_config = config["testing"]
        self.bpe_type = test_config.get("bpe_type", "subword-nmt")
        self.sacrebleu = {"remove_whitespace": True, "tokenize": "13a"}
        if "sacrebleu" in config["testing"].keys():
            self.sacrebleu["remove_whitespace"] = test_config["sacrebleu"] \
                .get("remove_whitespace", True)
            self.sacrebleu["tokenize"] = test_config["sacrebleu"] \
                .get("tokenize", "13a")

        # learning rate scheduling
        self.scheduler, self.scheduler_step_at = build_scheduler(
            config=train_config,
            scheduler_mode="min" if self.minimize_metric else "max",
            optimizer=self.optimizer,
            hidden_size=config["model"]["encoder"]["hidden_size"])

        # data & batch handling
        self.level = config["data"]["level"]
        if self.level not in ["word", "bpe", "char"]:
            raise ConfigurationError("Invalid segmentation level. "
                                     "Valid options: 'word', 'bpe', 'char'.")
        self.shuffle = train_config.get("shuffle", True)
        self.epochs = train_config["epochs"]
        self.batch_size = train_config["batch_size"]
        # Placeholder so that we can use the train_iter in other functions.
        self.train_iter = None
        self.train_iter_state = None
        # per-device batch_size = self.batch_size // self.n_gpu
        self.batch_type = train_config.get("batch_type", "sentence")
        self.eval_batch_size = train_config.get("eval_batch_size",
                                                self.batch_size)
        # per-device eval_batch_size = self.eval_batch_size // self.n_gpu
        self.eval_batch_type = train_config.get("eval_batch_type",
                                                self.batch_type)

        self.batch_multiplier = train_config.get("batch_multiplier", 1)

        # generation
        self.max_output_length = train_config.get("max_output_length", None)

        # CPU / GPU
        self.use_cuda = train_config["use_cuda"] and torch.cuda.is_available()
        self.n_gpu = torch.cuda.device_count() if self.use_cuda else 0
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.model.to(self.device)

        # fp16
        self.fp16 = train_config.get("fp16", False)
        if self.fp16:
            if 'apex' not in sys.modules:
                raise ImportError("Please install apex from "
                                  "https://www.github.com/nvidia/apex "
                                  "to use fp16 training.") from no_apex
            self.model, self.optimizer = amp.initialize(self.model,
                                                        self.optimizer,
                                                        opt_level='O1')
            # opt level: one of {"O0", "O1", "O2", "O3"}
            # see https://nvidia.github.io/apex/amp.html#opt-levels

        # initialize training statistics
        self.stats = self.TrainStatistics(
            steps=0,
            stop=False,
            total_tokens=0,
            best_ckpt_iter=0,
            best_ckpt_score=np.inf if self.minimize_metric else -np.inf,
            minimize_metric=self.minimize_metric)

        # model parameters
        if "load_model" in train_config.keys():
            self.init_from_checkpoint(
                train_config["load_model"],
                reset_best_ckpt=train_config.get("reset_best_ckpt", False),
                reset_scheduler=train_config.get("reset_scheduler", False),
                reset_optimizer=train_config.get("reset_optimizer", False),
                reset_iter_state=train_config.get("reset_iter_state", False))

        # multi-gpu training (should be after apex fp16 initialization)
        if self.n_gpu > 1:
            self.model = _DataParallel(self.model)
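
A hedged sketch of the config sections this constructor reads; the keys and defaults come from the lookups above, the values are placeholders:

config = {
    "training": {
        "model_dir": "models/my_run",       # must already exist (asserted above)
        "logging_freq": 100,
        "label_smoothing": 0.1,
        "normalization": "tokens",          # "batch" | "tokens" | "none"
        "eval_metric": "bleu",              # bleu | chrf | token_accuracy | sequence_accuracy
        "early_stopping_metric": "ppl",     # loss | ppl | eval_metric
        "epochs": 10,
        "batch_size": 80,
        "use_cuda": True,
        "fp16": False,
    },
    "testing": {"bpe_type": "subword-nmt"},
    "data": {"level": "bpe"},               # "word" | "bpe" | "char"
    "model": {"encoder": {"hidden_size": 512}},   # read for the scheduler
}
trainer = TrainManager(model=model, config=config)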
Example #7
    def __init__(self,
                 rnn_type: str = "gru",
                 emb_size: int = 0,
                 hidden_size: int = 0,
                 encoder: Encoder = None,
                 attention: str = "bahdanau",
                 num_layers: int = 1,
                 vocab_size: int = 0,
                 dropout: float = 0.,
                 emb_dropout: float = 0.,
                 hidden_dropout: float = 0.,
                 init_hidden: str = "bridge",
                 input_feeding: bool = True,
                 freeze: bool = False,
                 **kwargs) -> None:
        """
        Create a recurrent decoder with attention.

        :param rnn_type: rnn type, valid options: "lstm", "gru"
        :param emb_size: target embedding size
        :param hidden_size: size of the RNN
        :param encoder: encoder connected to this decoder
        :param attention: type of attention, valid options: "bahdanau", "luong"
        :param num_layers: number of recurrent layers
        :param vocab_size: target vocabulary size
        :param hidden_dropout: Is applied to the input to the attentional layer.
        :param dropout: Is applied between RNN layers.
        :param emb_dropout: Is applied to the RNN input (word embeddings).
        :param init_hidden: If "bridge" (default), the decoder hidden states are
            initialized from a projection of the last encoder state,
            if "zeros" they are initialized with zeros,
            if "last" they are identical to the last encoder state
            (only if they have the same size)
        :param input_feeding: Use Luong's input feeding.
        :param freeze: Freeze the parameters of the decoder during training.
        :param kwargs:
        """

        super().__init__()

        self.emb_dropout = torch.nn.Dropout(p=emb_dropout, inplace=False)
        self.type = rnn_type
        self.hidden_dropout = torch.nn.Dropout(p=hidden_dropout, inplace=False)
        self.hidden_size = hidden_size
        self.emb_size = emb_size

        rnn = nn.GRU if rnn_type == "gru" else nn.LSTM

        self.input_feeding = input_feeding
        if self.input_feeding:  # Luong-style
            # combine embedded prev word + attention vector before feeding to rnn
            self.rnn_input_size = emb_size + hidden_size
        else:
            # just feed prev word embedding
            self.rnn_input_size = emb_size

        # the decoder RNN
        self.rnn = rnn(self.rnn_input_size,
                       hidden_size,
                       num_layers,
                       batch_first=True,
                       dropout=dropout if num_layers > 1 else 0.)

        # combine output with context vector before output layer (Luong-style)
        self.att_vector_layer = nn.Linear(hidden_size + encoder.output_size,
                                          hidden_size,
                                          bias=True)

        self.output_layer = nn.Linear(hidden_size, vocab_size, bias=False)
        self._output_size = vocab_size

        if attention == "bahdanau":
            self.attention = BahdanauAttention(hidden_size=hidden_size,
                                               key_size=encoder.output_size,
                                               query_size=hidden_size)
        elif attention == "luong":
            self.attention = LuongAttention(hidden_size=hidden_size,
                                            key_size=encoder.output_size)
        else:
            raise ConfigurationError("Unknown attention mechanism: %s. "
                                     "Valid options: 'bahdanau', 'luong'." %
                                     attention)

        self.num_layers = num_layers
        self.hidden_size = hidden_size

        # to initialize from the final encoder state of last layer
        self.init_hidden_option = init_hidden
        if self.init_hidden_option == "bridge":
            self.bridge_layer = nn.Linear(encoder.output_size,
                                          hidden_size,
                                          bias=True)
        elif self.init_hidden_option == "last":
            if encoder.output_size != self.hidden_size:
                if encoder.output_size != 2 * self.hidden_size:  # bidirectional
                    raise ConfigurationError(
                        "For initializing the decoder state with the "
                        "last encoder state, their sizes have to match "
                        "(encoder: {} vs. decoder:  {})".format(
                            encoder.output_size, self.hidden_size))
        if freeze:
            freeze_params(self)
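
A hedged construction sketch, assuming a RecurrentEncoder whose output_size feeds the attention and bridge layers built above (sizes are illustrative):

encoder = RecurrentEncoder(rnn_type="gru", hidden_size=64, emb_size=32)
decoder = RecurrentDecoder(rnn_type="gru",
                           emb_size=32,
                           hidden_size=64,
                           encoder=encoder,
                           attention="luong",
                           num_layers=1,
                           vocab_size=1000,
                           init_hidden="bridge",
                           input_feeding=True)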
Example #8
def build_model(cfg: dict = None,
                src_vocab: Vocabulary = None,
                trg_vocab: Vocabulary = None) -> Model:
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :return: built and initialized model
    """
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    # TODO if continue-us
    src_embed = PretrainedEmbeddings(src_vocab,
                                     trg_vocab,
                                     **cfg["encoder"]["embeddings"],
                                     vocab_size=len(src_vocab),
                                     padding_idx=src_padding_idx)

    # this ties source and target embeddings
    # for softmax layer tying, see further below
    if cfg.get("tied_embeddings", False):
        if src_vocab.itos == trg_vocab.itos:
            # share embeddings for src and trg
            trg_embed = src_embed
        else:
            raise ConfigurationError(
                "Embedding cannot be tied since vocabularies differ.")
    else:
        # build separate target embeddings (otherwise trg_embed stays undefined)
        trg_embed = PretrainedEmbeddings(src_vocab,
                                         trg_vocab,
                                         **cfg["decoder"]["embeddings"],
                                         vocab_size=len(trg_vocab),
                                         padding_idx=trg_padding_idx)

    # build encoder
    enc_dropout = cfg["encoder"].get("dropout", 0.)
    enc_emb_dropout = cfg["encoder"]["embeddings"].get("dropout", enc_dropout)
    if cfg["encoder"].get("type", "recurrent") == "transformer":
        assert cfg["encoder"]["embeddings"]["embedding_dim"] == \
               cfg["encoder"]["hidden_size"], \
               "for transformer, emb_size must be hidden_size"

        encoder = TransformerEncoder(**cfg["encoder"],
                                     emb_size=src_embed.embedding_dim,
                                     emb_dropout=enc_emb_dropout)
    else:
        encoder = RecurrentEncoder(**cfg["encoder"],
                                   emb_size=src_embed.embedding_dim,
                                   emb_dropout=enc_emb_dropout)

    # build decoder
    dec_dropout = cfg["decoder"].get("dropout", 0.)
    dec_emb_dropout = cfg["decoder"]["embeddings"].get("dropout", dec_dropout)
    if cfg["decoder"].get("type", "recurrent") == "transformer":
        decoder = TransformerDecoder(**cfg["decoder"],
                                     encoder=encoder,
                                     vocab_size=len(trg_vocab),
                                     emb_size=trg_embed.embedding_dim,
                                     emb_dropout=dec_emb_dropout)
    else:
        decoder = RecurrentDecoder(**cfg["decoder"],
                                   encoder=encoder,
                                   vocab_size=len(trg_vocab),
                                   emb_size=trg_embed.embedding_dim,
                                   emb_dropout=dec_emb_dropout)

    model = Model(encoder=encoder,
                  decoder=decoder,
                  src_embed=src_embed,
                  trg_embed=trg_embed,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab)

    # tie softmax layer with trg embeddings
    """
    if cfg.get("tied_softmax", False):
        if trg_embed.lut.weight.shape == \
                model.decoder.output_layer.weight.shape:
            # (also) share trg embeddings and softmax layer:
            model.decoder.output_layer.weight = trg_embed.lut.weight
        else:
            raise ConfigurationError(
                "For tied_softmax, the decoder embedding_dim and decoder "
                "hidden_size must be the same."
                "The decoder must be a Transformer."
                f"shapes: output_layer.weight: {model.decoder.output_layer.weight.shape}; target_embed.lut.weight:{trg_embed.lut.weight.shape}")
    """
    # custom initialization of model parameters
    initialize_model(model, cfg, src_padding_idx, trg_padding_idx)

    return model
Example #9
def build_model(cfg: dict = None,
                src_vocab: Vocabulary = None,
                trg_vocab: Vocabulary = None,
                trv_vocab: Vocabulary = None,
                canonizer=None) -> Model:
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :param trv_vocab: kb true value lookup vocabulary
    :param canonizer: callable that builds the canonization function (optional)
    :return: built and initialized model
    """
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    if "embedding_files" in cfg.keys():  #init from pretrained
        assert not cfg.get(
            "tied_embeddings", False
        ), "TODO implement tied embeddings along with pretrained initialization"
        raise NotImplementedError(
            "TODO implement kbsrc embed loading for embedding files")
        weight_tensors = []
        for weight_file in cfg["embedding_files"]:
            with open(weight_file, "r") as f:
                weight = []
                for line in f.readlines():
                    line = line.split()
                    line = [float(x) for x in line]
                    weight.append(line)

            weight = FloatTensor(weight)
            weight_tensors.append(weight)
        # Set source Embeddings to Pretrained Embeddings
        src_embed = Embeddings(
            int(weight_tensors[0][0].shape[0]),
            False,  #TODO transformer: change to True
            len(weight_tensors[0]),
        )
        src_embed.lut.weight.data = weight_tensors[0]

        # Set target Embeddings to Pretrained Embeddings
        trg_embed = Embeddings(
            int(weight_tensors[1][0].shape[0]),
            False,  #TODO transformer: change to True
            len(weight_tensors[1]),
        )
        trg_embed.lut.weight.data = weight_tensors[1]
    else:
        src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                               vocab_size=len(src_vocab),
                               padding_idx=src_padding_idx)
        if cfg.get("kb_embed_separate", False):
            kbsrc_embed = Embeddings(**cfg["encoder"]["embeddings"],
                                     vocab_size=len(src_vocab),
                                     padding_idx=src_padding_idx)
        else:
            kbsrc_embed = src_embed

        # this ties source and target embeddings
        # for softmax layer tying, see further below
        if cfg.get("tied_embeddings", False):
            if src_vocab.itos == trg_vocab.itos:
                # share embeddings for src and trg
                trg_embed = src_embed
            else:
                raise ConfigurationError(
                    "Embedding cannot be tied since vocabularies differ.")
        else:
            # Latest TODO: init embeddings with vocab_size = len(trg_vocab joined with kb_vocab)
            trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                                   vocab_size=len(trg_vocab),
                                   padding_idx=trg_padding_idx)
    # build encoder
    enc_dropout = cfg["encoder"].get("dropout", 0.)
    enc_emb_dropout = cfg["encoder"]["embeddings"].get("dropout", enc_dropout)
    if cfg["encoder"].get("type", "recurrent") == "transformer":
        assert cfg["encoder"]["embeddings"]["embedding_dim"] == \
               cfg["encoder"]["hidden_size"], \
               "for transformer, emb_size must be hidden_size"

        encoder = TransformerEncoder(**cfg["encoder"],
                                     emb_size=src_embed.embedding_dim,
                                     emb_dropout=enc_emb_dropout)
    else:
        encoder = RecurrentEncoder(**cfg["encoder"],
                                   emb_size=src_embed.embedding_dim,
                                   emb_dropout=enc_emb_dropout)

    # retrieve kb task info
    kb_task = bool(cfg.get("kb", False))
    k_hops = int(
        cfg.get("k_hops", 1)
    )  # k number of kvr attention layers in decoder (eric et al/default: 1)
    same_module_for_all_hops = bool(cfg.get("same_module_for_all_hops", False))
    do_postproc = bool(cfg.get("do_postproc", True))
    copy_from_source = bool(cfg.get("copy_from_source", True))
    canonization_func = None if canonizer is None else canonizer(
        copy_from_source=copy_from_source)
    kb_input_feeding = bool(cfg.get("kb_input_feeding", True))
    kb_feed_rnn = bool(cfg.get("kb_feed_rnn", True))
    kb_multihead_feed = bool(cfg.get("kb_multihead_feed", False))
    posEncKBkeys = cfg.get("posEncdKBkeys", False)
    tfstyletf = cfg.get("tfstyletf", True)
    infeedkb = bool(cfg.get("infeedkb", False))
    outfeedkb = bool(cfg.get("outfeedkb", False))
    add_kb_biases_to_output = bool(cfg.get("add_kb_biases_to_output", True))
    kb_max_dims = cfg.get("kb_max_dims", (16, 32))  # should be tuple
    double_decoder = cfg.get("double_decoder", False)
    tied_side_softmax = cfg.get(
        "tied_side_softmax",
        False)  # actually use separate linear layers, tying only the main one
    do_pad_kb_keys = cfg.get(
        "pad_kb_keys", True
    )  # doesn't need to be True for 1 hop (=> big performance save), needs to be True for >= 2 hops

    if hasattr(kb_max_dims, "__iter__"):
        kb_max_dims = tuple(kb_max_dims)
    else:
        assert type(kb_max_dims) == int, kb_max_dims
        kb_max_dims = (kb_max_dims, )

    assert cfg["decoder"]["hidden_size"]
    dec_dropout = cfg["decoder"].get("dropout", 0.)
    dec_emb_dropout = cfg["decoder"]["embeddings"].get("dropout", dec_dropout)

    if cfg["decoder"].get("type", "recurrent") == "transformer":
        if tfstyletf:
            decoder = TransformerDecoder(
                **cfg["decoder"],
                encoder=encoder,
                vocab_size=len(trg_vocab),
                emb_size=trg_embed.embedding_dim,
                emb_dropout=dec_emb_dropout,
                kb_task=kb_task,
                kb_key_emb_size=kbsrc_embed.embedding_dim,
                feed_kb_hidden=kb_input_feeding,
                infeedkb=infeedkb,
                outfeedkb=outfeedkb,
                double_decoder=double_decoder)
        else:
            decoder = TransformerKBrnnDecoder(
                **cfg["decoder"],
                encoder=encoder,
                vocab_size=len(trg_vocab),
                emb_size=trg_embed.embedding_dim,
                emb_dropout=dec_emb_dropout,
                kb_task=kb_task,
                k_hops=k_hops,
                kb_max=kb_max_dims,
                same_module_for_all_hops=same_module_for_all_hops,
                kb_key_emb_size=kbsrc_embed.embedding_dim,
                kb_input_feeding=kb_input_feeding,
                kb_feed_rnn=kb_feed_rnn,
                kb_multihead_feed=kb_multihead_feed)
    else:
        if not kb_task:
            decoder = RecurrentDecoder(**cfg["decoder"],
                                       encoder=encoder,
                                       vocab_size=len(trg_vocab),
                                       emb_size=trg_embed.embedding_dim,
                                       emb_dropout=dec_emb_dropout)
        else:
            decoder = KeyValRetRNNDecoder(
                **cfg["decoder"],
                encoder=encoder,
                vocab_size=len(trg_vocab),
                emb_size=trg_embed.embedding_dim,
                emb_dropout=dec_emb_dropout,
                k_hops=k_hops,
                kb_max=kb_max_dims,
                same_module_for_all_hops=same_module_for_all_hops,
                kb_key_emb_size=kbsrc_embed.embedding_dim,
                kb_input_feeding=kb_input_feeding,
                kb_feed_rnn=kb_feed_rnn,
                kb_multihead_feed=kb_multihead_feed,
                do_pad_kb_keys=do_pad_kb_keys)

    # specify generator which is mostly just the output layer
    generator = Generator(dec_hidden_size=cfg["decoder"]["hidden_size"],
                          vocab_size=len(trg_vocab),
                          add_kb_biases_to_output=add_kb_biases_to_output,
                          double_decoder=double_decoder)

    model = Model(encoder=encoder,
                  decoder=decoder,
                  generator=generator,
                  src_embed=src_embed,
                  trg_embed=trg_embed,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab,
                  kb_key_embed=kbsrc_embed,
                  trv_vocab=trv_vocab,
                  k_hops=k_hops,
                  do_postproc=do_postproc,
                  canonize=canonization_func,
                  kb_att_dims=len(kb_max_dims),
                  posEncKBkeys=posEncKBkeys)

    # tie softmax layer with trg embeddings
    if cfg.get("tied_softmax", False):
        if trg_embed.lut.weight.shape == \
                model.generator.output_layer.weight.shape:
            # (also) share trg embeddings and softmax layer:
            model.generator.output_layer.weight = trg_embed.lut.weight
            if model.generator.double_decoder:
                # (also also) share trg embeddings and side softmax layer
                assert hasattr(model.generator, "side_output_layer")
                if tied_side_softmax:
                    # because of distributivity this becomes O (x_1+x_2) instead of O_1 x_1 + O_2 x_2
                    model.generator.side_output_layer.weight = trg_embed.lut.weight
        else:
            raise ConfigurationError(
                "For tied_softmax, the decoder embedding_dim and decoder "
                "hidden_size must be the same."
                "The decoder must be a Transformer.")

    # custom initialization of model parameters
    initialize_model(model, cfg, src_padding_idx, trg_padding_idx)

    return model
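
For reference, a condensed view of the kb-related switches read above, shown with the defaults that the cfg.get calls fall back to:

kb_cfg_defaults = {
    "kb": False,                        # enable the KB task
    "k_hops": 1,                        # number of kvr attention hops in the decoder
    "same_module_for_all_hops": False,
    "do_postproc": True,
    "copy_from_source": True,
    "kb_input_feeding": True,
    "kb_feed_rnn": True,
    "kb_multihead_feed": False,
    "posEncdKBkeys": False,
    "tfstyletf": True,                  # TransformerDecoder vs. TransformerKBrnnDecoder
    "infeedkb": False,
    "outfeedkb": False,
    "add_kb_biases_to_output": True,
    "kb_max_dims": (16, 32),            # tuple or single int
    "double_decoder": False,
    "tied_side_softmax": False,
    "pad_kb_keys": True,                # only required for >= 2 hops
}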
Example #10
def translate(cfg_file, ckpt: str, output_path: str = None) -> None:
    """
    Interactive translation function.
    Loads model from checkpoint and translates either the stdin input or
    asks for input to translate interactively.
    The input has to be pre-processed according to the data that the model
    was trained on, i.e. tokenized or split into subwords.
    Translations are printed to stdout.

    :param cfg_file: path to configuration file
    :param ckpt: path to checkpoint to load
    """
    def _load_line_as_data(line):
        """ Create a dataset from one line via a temporary file. """
        # write src input to temporary file
        tmp_name = "tmp"
        tmp_suffix = ".src"
        tmp_filename = tmp_name + tmp_suffix
        with open(tmp_filename, "w") as tmp_file:
            tmp_file.write("{}\n".format(line))

        test_data = MonoDataset(path=tmp_name, ext=tmp_suffix, field=src_field)

        # remove temporary file
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)

        return test_data

    def _translate_data(test_data):
        """ Translates given dataset, using parameters from outer scope. """
        hypotheses = validate_on_data(model,
                                      data=test_data,
                                      batch_size=batch_size,
                                      trg_level=level,
                                      max_output_length=max_output_length,
                                      eval_metrics=[],
                                      use_cuda=use_cuda,
                                      loss_function=None,
                                      beam_size=beam_size,
                                      beam_alpha=beam_alpha)[2]
        return hypotheses

    cfg = load_config(cfg_file)

    # when checkpoint is not specified, take latest from model dir
    if ckpt is None:
        model_dir = cfg["training"]["model_dir"]
        ckpt = get_latest_checkpoint(model_dir)

    batch_size = cfg["training"].get("batch_size", 1)
    use_cuda = cfg["training"].get("use_cuda", False)
    level = cfg["data"]["level"]
    max_output_length = cfg["training"].get("max_output_length", None)

    # read vocabs
    src_vocab_file = cfg["data"].get(
        "src_vocab", cfg["training"]["model_dir"] + "/src_vocab.txt")
    trg_vocab_file = cfg["data"].get(
        "trg_vocab", cfg["training"]["model_dir"] + "/trg_vocab.txt")
    src_vocab = Vocabulary.from_file(src_vocab_file)
    trg_vocab = Vocabulary.from_file(trg_vocab_file)

    data_cfg = cfg["data"]
    level = data_cfg["level"]
    lowercase = data_cfg["lowercase"]

    tok_fun = list if level == "char" else str.split

    src_field = Field(init_token=None,
                      eos_token=EOS_TOKEN,
                      pad_token=PAD_TOKEN,
                      tokenize=tok_fun,
                      batch_first=True,
                      lower=lowercase,
                      unk_token=UNK_TOKEN,
                      include_lengths=True)
    src_field.vocab = src_vocab

    # load model state from disk
    model_checkpoint = load_checkpoint(ckpt, use_cuda=use_cuda)

    # build model and load parameters into it
    model = build_model(cfg["model"],
                        vocabs={
                            "src": src_vocab,
                            "trg": trg_vocab
                        })
    model.load_state_dict(model_checkpoint["model_state"])

    if use_cuda:
        model.cuda()

    # whether to use beam search for decoding, 0: greedy decoding
    if "testing" in cfg.keys():
        beam_size = cfg["testing"].get("beam_size", 0)
        beam_alpha = cfg["testing"].get("alpha", 0)
    else:
        beam_size = 0
        beam_alpha = 0
    if beam_alpha < 0:
        raise ConfigurationError("alpha for length penalty should be >= 0")

    if not sys.stdin.isatty():
        # file given
        test_data = MonoDataset(path=sys.stdin, ext="", field=src_field)
        hypotheses = _translate_data(test_data)

        if output_path is not None:
            output_path_set = "{}".format(output_path)
            with open(output_path_set, mode="w", encoding="utf-8") as out_file:
                for hyp in hypotheses:
                    out_file.write(hyp + "\n")
            print("Translations saved to: {}".format(output_path_set))
        else:
            for hyp in hypotheses:
                print(hyp)

    else:
        # enter interactive mode
        batch_size = 1
        while True:
            try:
                src_input = input("\nPlease enter a source sentence "
                                  "(pre-processed): \n")
                if not src_input.strip():
                    break

                # every line has to be made into dataset
                test_data = _load_line_as_data(line=src_input)

                hypotheses = _translate_data(test_data)
                print("JoeyNMT: {}".format(hypotheses[0]))

            except (KeyboardInterrupt, EOFError):
                print("\nBye.")
                break
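
A hedged invocation sketch (the config path is a placeholder); with stdin redirected from a file the hypotheses go to output_path, otherwise the function drops into the interactive prompt:

translate(cfg_file="configs/my_model.yaml",   # placeholder path
          ckpt=None,                          # None -> latest checkpoint in model_dir
          output_path="hypotheses.txt")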
Example #11
def build_model(cfg: dict = None, vocabs: dict = None) -> Model:
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param vocabs: dictionary of vocabularies, keyed by encoder name plus "trg"
    :return: built and initialized model
    """
    src_vocab = vocabs["src"]
    trg_vocab = vocabs["trg"]
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    if "encoder" in cfg:
        enc_configs = {"src": cfg["encoder"]}
    else:
        assert "encoders" in cfg
        enc_configs = cfg["encoders"]

    enc_embeds = dict()
    encoders = dict()
    for name, enc_config in enc_configs.items():
        vocab = vocabs[name]
        if "multispace_embeddings" in enc_config:
            # multispace embeddings (meaning with a language feature)
            src_embed = build_multispace_embeddings(
                enc_config["multispace_embeddings"], vocabs, name)
        else:
            src_embed = build_embeddings(enc_config["embeddings"], vocab)

        encoder = build_encoder(enc_config, src_embed.embedding_dim)
        enc_embeds[name] = src_embed
        encoders[name] = encoder

    multi_encoder = len(encoders) > 1

    dec_config = cfg["decoder"]

    # this ties source and target embeddings
    if cfg.get("tied_embeddings", False):
        assert vocabs["src"].itos == trg_vocab.itos, \
            "Embedding cannot be tied because vocabularies differ."
        trg_embed = enc_embeds["src"]
    else:
        if "multispace_embeddings" in dec_config:
            trg_embed = build_multispace_embeddings(
                dec_config["multispace_embeddings"], vocabs, "trg")
        else:
            trg_embed = build_embeddings(dec_config["embeddings"], trg_vocab)

    # build decoder
    dec_dropout = dec_config.get("dropout", 0.)
    if "embeddings" not in dec_config:
        dec_emb_dropout = dec_dropout
    else:
        dec_emb_dropout = dec_config["embeddings"].get("dropout", dec_dropout)
    dec_type = dec_config.get("type", "recurrent")

    if not multi_encoder:
        dec_class = TransformerDecoder if dec_type == "transformer" \
            else RecurrentDecoder
    else:
        enc_out_sizes = {n: enc.output_size for n, enc in encoders.items()}
        if dec_type == "transformer":
            dec_class = MultiSourceTransformerDecoder
        else:
            dec_class = partial(MultiHeadRecurrentDecoder,
                                encoder_output_sizes=enc_out_sizes)

    decoder = dec_class(**dec_config,
                        encoder_output_size=encoders["src"].output_size,
                        vocab_size=len(trg_vocab),
                        emb_size=trg_embed.embedding_dim,
                        emb_dropout=dec_emb_dropout)

    if not multi_encoder:
        model = Model(encoder=encoders["src"],
                      decoder=decoder,
                      src_embed=enc_embeds["src"],
                      trg_embed=trg_embed,
                      src_vocab=vocabs["src"],
                      trg_vocab=trg_vocab)
    else:
        model = MultiEncoderModel(encoders=encoders,
                                  decoder=decoder,
                                  enc_embeds=enc_embeds,
                                  trg_embed=trg_embed,
                                  vocabs=vocabs)

    # tie softmax layer with trg embeddings
    if cfg.get("tied_softmax", False):
        assert isinstance(model.decoder.output_layers["vocab"], nn.Linear)
        if trg_embed.lut.weight.shape == \
                model.decoder.output_layers["vocab"].weight.shape:
            # (also) share trg embeddings and softmax layer:
            model.decoder.output_layers["vocab"].weight = trg_embed.lut.weight
        else:
            raise ConfigurationError(
                "For tied_softmax, the decoder embedding_dim and decoder "
                "hidden_size must be the same."
                "The decoder must be a Transformer.")

    # custom initialization of model parameters
    initialize_model(model, cfg, src_padding_idx, trg_padding_idx)

    return model
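
A hedged sketch of the two accepted layouts: a single "encoder" entry, or an "encoders" mapping with one sub-config per source name ("lemma" below is a hypothetical extra source; vocabs must contain a matching entry for every encoder name plus "trg"):

# single-encoder form (wrapped into {"src": ...} above)
cfg_single = {
    "encoder": {"hidden_size": 64, "embeddings": {"embedding_dim": 32}},
    "decoder": {"hidden_size": 64, "embeddings": {"embedding_dim": 32}},
}

# multi-encoder form: one encoder per named source
cfg_multi = {
    "encoders": {
        "src":   {"hidden_size": 64, "embeddings": {"embedding_dim": 32}},
        "lemma": {"hidden_size": 64, "embeddings": {"embedding_dim": 32}},
    },
    "decoder": {"type": "transformer", "hidden_size": 64,
                "embeddings": {"embedding_dim": 64}},
}
model = build_model(cfg_multi,
                    vocabs={"src": src_vocab, "lemma": lemma_vocab, "trg": trg_vocab})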
Example #12
    def __init__(self, model: Model, config: dict) -> None:
        """
        Creates a new TrainManager for a model, specified as in configuration.

        :param model: torch module defining the model
        :param config: dictionary containing the training configurations
        """
        train_config = config["training"]

        # files for logging and storing
        self.model_dir = make_model_dir(train_config["model_dir"],
                                        overwrite=train_config.get(
                                            "overwrite", False))
        self.logger = make_logger(model_dir=self.model_dir)
        self.logging_freq = train_config.get("logging_freq", 100)
        self.valid_report_file = "{}/validations.txt".format(self.model_dir)
        self.tb_writer = SummaryWriter(log_dir=self.model_dir +
                                       "/tensorboard/")

        # model
        self.model = model
        self.pad_index = self.model.pad_index
        self.bos_index = self.model.bos_index
        self._log_parameters_list()

        # objective
        self.loss = WeightedCrossEntropy(ignore_index=self.pad_index)
        #nn.NLLLoss(ignore_index=self.pad_index, reduction='sum')
        self.normalization = train_config.get("normalization", "batch")
        if self.normalization not in ["batch", "tokens"]:
            raise ConfigurationError("Invalid normalization. "
                                     "Valid options: 'batch', 'tokens'.")

        # optimization
        self.learning_rate_min = train_config.get("learning_rate_min", 1.0e-8)
        self.clip_grad_fun = build_gradient_clipper(config=train_config)

        # re-order the model parameters by name before initialisation of optimizer
        # Reference: https://github.com/pytorch/pytorch/issues/1489
        all_params = list(model.named_parameters())
        sorted_params = sorted(all_params)
        sorted_params = OrderedDict(sorted_params)
        self.optimizer = build_optimizer(config=train_config,
                                         parameters=sorted_params.values())

        # save checkpoint by epoch
        self.save_freq = train_config.get("save_freq", -1)

        # validation & early stopping
        self.validation_freq = train_config.get("validation_freq", 1000)
        self.log_valid_sents = train_config.get("print_valid_sents", [0, 1, 2])
        self.ckpt_queue = queue.Queue(
            maxsize=train_config.get("keep_last_ckpts", 5))
        self.eval_metric = train_config.get("eval_metric", "bleu")
        if self.eval_metric not in ['bleu', 'chrf']:
            raise ConfigurationError("Invalid setting for 'eval_metric', "
                                     "valid options: 'bleu', 'chrf'.")
        self.early_stopping_metric = train_config.get("early_stopping_metric",
                                                      "eval_metric")
        # if we schedule after BLEU/chrf, we want to maximize it, else minimize
        # early_stopping_metric decides on how to find the early stopping point:
        # ckpts are written when there's a new high/low score for this metric
        if self.early_stopping_metric in ["ppl", "loss"]:
            self.minimize_metric = True
        elif self.early_stopping_metric == "eval_metric":
            if self.eval_metric in ["bleu", "chrf"]:
                self.minimize_metric = False
            else:  # eval metric that has to get minimized (not yet implemented)
                self.minimize_metric = True
        else:
            raise ConfigurationError(
                "Invalid setting for 'early_stopping_metric', "
                "valid options: 'loss', 'ppl', 'eval_metric'.")
        self.post_process = config["data"].get("post_process", True)

        # learning rate scheduling
        self.scheduler, self.scheduler_step_at = build_scheduler(
            config=train_config,
            scheduler_mode="min" if self.minimize_metric else "max",
            optimizer=self.optimizer)

        # data & batch handling
        self.level = config["data"]["level"]
        if self.level not in ["word", "bpe", "char"]:
            raise ConfigurationError("Invalid segmentation level. "
                                     "Valid options: 'word', 'bpe', 'char'.")
        self.shuffle = train_config.get("shuffle", True)
        self.epochs = train_config["epochs"]
        self.batch_size = train_config["batch_size"]
        self.batch_multiplier = train_config.get("batch_multiplier", 1)

        # generation
        self.max_output_length = train_config.get("max_output_length", None)

        # CPU / GPU
        self.use_cuda = train_config["use_cuda"]
        if self.use_cuda:
            self.model.cuda()

        # model parameters
        if "load_model" in train_config.keys():
            model_load_path = train_config["load_model"]
            self.logger.info("Loading model from %s", model_load_path)
            self.init_from_checkpoint(model_load_path)

        # initialize training statistics
        self.steps = 0
        # stop training if this flag is True by reaching learning rate minimum
        self.stop = False
        self.total_tokens = 0
        self.best_ckpt_iteration = 0
        # initial values for best scores
        self.best_ckpt_score = np.inf if self.minimize_metric else -np.inf
        # comparison function for scores
        self.is_best = lambda score: score < self.best_ckpt_score \
            if self.minimize_metric else score > self.best_ckpt_score

        # for learning with logged feedback
        if config["data"].get("feedback", None) is not None:
            self.logger.info("Learning with token-level feedback.")
        self.return_logp = config["testing"].get("return_logp", False)
Example #13
File: model.py  Project: may-/joeynmt
def build_model(cfg: dict = None,
                src_vocab: Vocabulary = None,
                trg_vocab: Vocabulary = None) -> Model:
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :return: built and initialized model
    """
    logger.info("Building an encoder-decoder model...")
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                           vocab_size=len(src_vocab),
                           padding_idx=src_padding_idx)

    # this ties source and target embeddings
    # for softmax layer tying, see further below
    if cfg.get("tied_embeddings", False):
        if src_vocab.itos == trg_vocab.itos:
            # share embeddings for src and trg
            trg_embed = src_embed
        else:
            raise ConfigurationError(
                "Embedding cannot be tied since vocabularies differ.")
    else:
        trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                               vocab_size=len(trg_vocab),
                               padding_idx=trg_padding_idx)

    # build encoder
    enc_dropout = cfg["encoder"].get("dropout", 0.)
    enc_emb_dropout = cfg["encoder"]["embeddings"].get("dropout", enc_dropout)
    if cfg["encoder"].get("type", "recurrent") == "transformer":
        assert cfg["encoder"]["embeddings"]["embedding_dim"] == \
               cfg["encoder"]["hidden_size"], \
               "for transformer, emb_size must be hidden_size"

        encoder = TransformerEncoder(**cfg["encoder"],
                                     emb_size=src_embed.embedding_dim,
                                     emb_dropout=enc_emb_dropout)
    else:
        encoder = RecurrentEncoder(**cfg["encoder"],
                                   emb_size=src_embed.embedding_dim,
                                   emb_dropout=enc_emb_dropout)

    # build decoder
    dec_dropout = cfg["decoder"].get("dropout", 0.)
    dec_emb_dropout = cfg["decoder"]["embeddings"].get("dropout", dec_dropout)
    if cfg["decoder"].get("type", "recurrent") == "transformer":
        decoder = TransformerDecoder(**cfg["decoder"],
                                     encoder=encoder,
                                     vocab_size=len(trg_vocab),
                                     emb_size=trg_embed.embedding_dim,
                                     emb_dropout=dec_emb_dropout)
    else:
        decoder = RecurrentDecoder(**cfg["decoder"],
                                   encoder=encoder,
                                   vocab_size=len(trg_vocab),
                                   emb_size=trg_embed.embedding_dim,
                                   emb_dropout=dec_emb_dropout)

    model = Model(encoder=encoder,
                  decoder=decoder,
                  src_embed=src_embed,
                  trg_embed=trg_embed,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab)

    # tie softmax layer with trg embeddings
    if cfg.get("tied_softmax", False):
        if trg_embed.lut.weight.shape == \
                model.decoder.output_layer.weight.shape:
            # (also) share trg embeddings and softmax layer:
            model.decoder.output_layer.weight = trg_embed.lut.weight
        else:
            raise ConfigurationError(
                "For tied_softmax, the decoder embedding_dim and decoder "
                "hidden_size must be the same."
                "The decoder must be a Transformer.")

    # custom initialization of model parameters
    initialize_model(model, cfg, src_padding_idx, trg_padding_idx)

    # initialize embeddings from file
    pretrained_enc_embed_path = cfg["encoder"]["embeddings"].get(
        "load_pretrained", None)
    pretrained_dec_embed_path = cfg["decoder"]["embeddings"].get(
        "load_pretrained", None)
    if pretrained_enc_embed_path:
        logger.info("Loading pretraind src embeddings...")
        model.src_embed.load_from_file(pretrained_enc_embed_path, src_vocab)
    if pretrained_dec_embed_path and not cfg.get("tied_embeddings", False):
        logger.info("Loading pretraind trg embeddings...")
        model.trg_embed.load_from_file(pretrained_dec_embed_path, trg_vocab)

    logger.info("Enc-dec model built.")
    return model
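
A hedged sketch of the embedding-loading keys this variant reads; "load_pretrained" points at an embedding text file (paths are placeholders):

cfg = {
    "encoder": {"embeddings": {"embedding_dim": 300,
                               "load_pretrained": "embeds/src.vec"}},
    "decoder": {"embeddings": {"embedding_dim": 300}},   # no file -> regular initialization
    # remaining encoder/decoder fields as in the other build_model examples
}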
Example #14
File: model.py  Project: kdrivas/joeynmt
def build_pretrained_model(cfg: dict = None,
                           pretrained_model: Model = None,
                           pretrained_src_vocab: Vocabulary = None,
                           src_vocab: Vocabulary = None,
                           trg_vocab: Vocabulary = None) -> Model:
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param pretrained_model: pretrained model supplying the encoder and source embeddings
    :param pretrained_src_vocab: source vocabulary of the pretrained model
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :return: built and initialized model
    """
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                           vocab_size=len(src_vocab),
                           padding_idx=src_padding_idx)

    embedding_matrix = np.zeros((len(src_vocab), src_embed.embedding_dim))
    unknown_words = []
    for w in pretrained_src_vocab.itos:
        try:
            pre_ix = pretrained_src_vocab.stoi[w]
            ix = src_vocab.stoi[w]
            embedding_matrix[ix] = pretrained_model.src_embed.lut.weight[
                pre_ix].cpu().detach().numpy()
        except KeyError:
            unknown_words.append(w)

    src_embed.lut.weight = torch.nn.Parameter(
        torch.tensor(embedding_matrix, dtype=torch.float32))

    trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                           vocab_size=len(trg_vocab),
                           padding_idx=trg_padding_idx)

    # build decoder
    dec_dropout = cfg["decoder"].get("dropout", 0.)
    dec_emb_dropout = cfg["decoder"]["embeddings"].get("dropout", dec_dropout)

    encoder = pretrained_model.encoder
    encoder.train()
    set_requires_grad(encoder, True)

    # build encoder
    #enc_dropout = cfg["encoder"].get("dropout", 0.)
    #enc_emb_dropout = cfg["encoder"]["embeddings"].get("dropout", enc_dropout)
    #if cfg["encoder"].get("type", "recurrent") == "transformer":
    #    assert cfg["encoder"]["embeddings"]["embedding_dim"] == \
    #           cfg["encoder"]["hidden_size"], \
    #           "for transformer, emb_size must be hidden_size"

    #    encoder = TransformerEncoder(**cfg["encoder"],
    #                                 emb_size=src_embed.embedding_dim,
    #                                 emb_dropout=enc_emb_dropout)
    #else:
    #    encoder = RecurrentEncoder(**cfg["encoder"],
    #                               emb_size=src_embed.embedding_dim,
    #                               emb_dropout=enc_emb_dropout)

    if cfg["decoder"].get("type", "recurrent") == "transformer":
        decoder = TransformerDecoder(**cfg["decoder"],
                                     encoder=encoder,
                                     vocab_size=len(trg_vocab),
                                     emb_size=trg_embed.embedding_dim,
                                     emb_dropout=dec_emb_dropout)
    else:
        decoder = RecurrentDecoder(**cfg["decoder"],
                                   encoder=encoder,
                                   vocab_size=len(trg_vocab),
                                   emb_size=trg_embed.embedding_dim,
                                   emb_dropout=dec_emb_dropout)

    model = Model(encoder=encoder,
                  decoder=decoder,
                  src_embed=src_embed,
                  trg_embed=trg_embed,
                  src_vocab=pretrained_model.src_vocab,
                  trg_vocab=trg_vocab)

    # tie softmax layer with trg embeddings
    if cfg.get("tied_softmax", False):
        if trg_embed.lut.weight.shape == \
                model.decoder.output_layer.weight.shape:
            # (also) share trg embeddings and softmax layer:
            model.decoder.output_layer.weight = trg_embed.lut.weight
        else:
            raise ConfigurationError(
                "For tied_softmax, the decoder embedding_dim and decoder "
                "hidden_size must be the same. "
                "The decoder must be a Transformer.")

    # custom initialization of model parameters
    #initialize_model(model, cfg, src_padding_idx, trg_padding_idx)

    return model
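
For orientation, here is a minimal sketch of the configuration shape this function reads, inferred from the cfg[...] accesses above. The sizes, dropout values and decoder type are illustrative only and not taken from any particular project config.

cfg = {
    "encoder": {
        # only the "embeddings" sub-config is read here, since the encoder
        # itself is taken from pretrained_model
        "embeddings": {"embedding_dim": 256},
    },
    "decoder": {
        "type": "recurrent",          # or "transformer"
        "hidden_size": 256,
        "dropout": 0.2,
        "embeddings": {"embedding_dim": 256, "dropout": 0.2},
    },
    "tied_softmax": False,
}
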
Example #15
def build_model(cfg: dict = None, vocabs: dict = None) -> Model:
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :return: built and initialized model
    """
    if "encoders" in cfg:
        # two cases: are columns provided? If so, make an identical encoder
        # for each of them.
        # If instead keys are given...
        if "columns" in cfg["encoders"]:
            enc_columns = cfg["encoders"]["columns"]
            assert all(column in vocabs for column in enc_columns)
            shared_cfg = cfg["encoders"]["encoder"]
            enc_configs = {column: shared_cfg for column in enc_columns}
            share_embs = cfg["encoders"].get("share_embeddings", False)
            share_encoders = cfg["encoders"].get("share_encoders", False)
            if share_embs:
                any_v = next(v for k, v in vocabs.items() if k != "trg")
                assert all(v == any_v for k, v in vocabs.items() if k != "trg")
        else:
            enc_columns = list(cfg["encoders"].keys())
            enc_configs = cfg["encoders"]
            share_embs = False
            share_encoders = False
    else:
        enc_columns = ["src"]
        enc_configs = {"src": cfg["encoder"]}
        share_embs = False
        share_encoders = False

    dec_config = cfg["decoder"]

    emb_configs = {
        name: enc_config["embeddings"]
        for name, enc_config in enc_configs.items()
    }

    emb_configs["trg"] = dec_config["embeddings"]

    embeds = dict()
    encoders = dict()
    for enc_column, enc_cfg in enc_configs.items():
        # make each encoder

        if "feature_embeddings" in enc_cfg:
            # feature embeddings: features come from label fields of a TSV file
            embed = build_feature_embeddings(enc_cfg["feature_embeddings"],
                                             vocabs, enc_column)
        else:
            if share_embs and embeds:
                # get something that's already in the dict
                embed = next(iter(embeds.values()))
            else:
                # make a new embedding matrix
                vocab = vocabs[enc_column]
                emb_cfg = enc_cfg["embeddings"]
                embed = Embeddings(**emb_cfg,
                                   vocab_size=len(vocab),
                                   padding_idx=vocab.stoi[PAD_TOKEN])
        embeds[enc_column] = embed

        if share_encoders and encoders:
            encoder = next(iter(encoders.values()))
        else:
            enc_dropout = enc_cfg.get("dropout", 0.)
            enc_emb_dropout = enc_cfg["embeddings"].get("dropout", enc_dropout)
            enc_type = enc_cfg.get("type", "recurrent")
            '''
            if enc_type == "transformer":
                enc_emb_size = emb_cfg["embedding_dim"]
                enc_hidden_size = enc_cfg["hidden_size"]
                assert enc_emb_size == enc_hidden_size, \
                    "for transformer, emb_size must be hidden_size"
            '''
            enc_class = TransformerEncoder if enc_type == "transformer" \
                else RecurrentEncoder
            encoder = enc_class(**enc_cfg,
                                emb_size=embed.embedding_dim,
                                emb_dropout=enc_emb_dropout)
        encoders[enc_column] = encoder

    trg_vocab = vocabs["trg"]

    # this ties source and target embeddings
    # for softmax layer tying, see further below
    if cfg.get("tied_embeddings", False):
        assert vocabs["src"].itos == vocabs["trg"].itos, \
            "Embedding cannot be tied because vocabularies differ."
        embeds["trg"] = embeds["src"]
    else:
        # build the target embeddings
        if "feature_embeddings" in dec_config:
            # feature embeddings: features come from label fields of a TSV file
            embed = build_feature_embeddings(dec_config["feature_embeddings"],
                                             vocabs, "trg")
        else:
            trg_vocab = vocabs["trg"]
            dec_emb_cfg = dec_config["embeddings"]
            embed = Embeddings(**dec_emb_cfg,
                               vocab_size=len(trg_vocab),
                               padding_idx=trg_vocab.stoi[PAD_TOKEN])
        embeds["trg"] = embed

    # build decoder
    dec_dropout = dec_config.get("dropout", 0.)
    dec_type = dec_config.get("type", "recurrent")
    dec_class = TransformerDecoder if dec_type == "transformer" \
        else RecurrentDecoder
    decoder = dec_class(**dec_config,
                        encoder_output_size=encoder.output_size,
                        vocab_size=len(vocabs["trg"]),
                        emb_size=embeds["trg"].embedding_dim,
                        emb_dropout=emb_configs["trg"].get(
                            "dropout", dec_dropout),
                        multi_source=len(encoders) > 1,
                        head_names=list(encoders.keys()))

    if len(encoders) == 1:
        model = Model(encoder=encoders["src"],
                      decoder=decoder,
                      src_embed=embeds["src"],
                      trg_embed=embeds["trg"],
                      src_vocab=vocabs["src"],
                      trg_vocab=vocabs["trg"])
    else:
        model = MultiSourceModel(encoders=encoders,
                                 decoder=decoder,
                                 embeds=embeds,
                                 vocabs=vocabs)

    # tie softmax layer with trg embeddings
    if cfg.get("tied_softmax", False):
        if embeds["trg"].lut.weight.shape == \
                model.decoder.output_layer.weight.shape:
            # (also) share trg embeddings and softmax layer:
            model.decoder.output_layer.weight = embeds["trg"].lut.weight
        else:
            raise ConfigurationError(
                "For tied_softmax, the decoder embedding_dim and decoder "
                "hidden_size must be the same. "
                "The decoder must be a Transformer.")

    # custom initialization of model parameters
    initialize_model(model, cfg, vocabs["trg"].stoi[PAD_TOKEN])

    return model
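
As referenced in the comment at the top of this function, the "encoders" section can take two shapes. Here is a sketch of both; the column name "lemma" and all sizes are purely illustrative (only "src" and "trg" are assumed elsewhere in the code).

# Shape 1: a "columns" list plus a single shared encoder config; embeddings
# and/or encoders can optionally be shared across columns
cfg_shared = {
    "encoders": {
        "columns": ["src", "lemma"],   # each column needs an entry in vocabs
        "encoder": {"hidden_size": 256,
                    "embeddings": {"embedding_dim": 256}},
        "share_embeddings": False,
        "share_encoders": False,
    },
}

# Shape 2: one encoder config per column, keyed by column name
cfg_per_column = {
    "encoders": {
        "src":   {"hidden_size": 256, "embeddings": {"embedding_dim": 256}},
        "lemma": {"hidden_size": 128, "embeddings": {"embedding_dim": 128}},
    },
}

# With no "encoders" section at all, the single cfg["encoder"] entry is used
# for the "src" column.
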
Example #16
    def __init__(self, model: Model, config: dict) -> None:
        """
        Creates a new TrainManager for a model, as specified in the configuration.

        :param model: torch module defining the model
        :param config: dictionary containing the training configurations
        """
        train_config = config["training"]

        # files for logging and storing
        self.model_dir = make_model_dir(train_config["model_dir"],
                                        overwrite=train_config.get(
                                            "overwrite", False))
        self.logger = make_logger(model_dir=self.model_dir)
        self.logging_freq = train_config.get("logging_freq", 100)
        self.valid_report_file = join(self.model_dir, "validations.txt")
        self.tb_writer = SummaryWriter(
            log_dir=join(self.model_dir, "tensorboard/")
        )
        self.log_sparsity = train_config.get("log_sparsity", False)

        self.apply_mask = train_config.get("apply_mask", False)
        self.valid_apply_mask = train_config.get("valid_apply_mask", True)

        # model
        self.model = model
        self.pad_index = self.model.pad_index
        self.bos_index = self.model.bos_index
        self._log_parameters_list()

        # objective
        objective = train_config.get("loss", "cross_entropy")
        loss_alpha = train_config.get("loss_alpha", 1.5)
        self.label_smoothing = train_config.get("label_smoothing", 0.0)
        if self.label_smoothing > 0 and objective == "cross_entropy":
            xent_loss = partial(
                LabelSmoothingLoss, smoothing=self.label_smoothing)
        else:
            xent_loss = nn.CrossEntropyLoss

        assert loss_alpha >= 1
        entmax_loss = partial(
            EntmaxBisectLoss, alpha=loss_alpha, n_iter=30
        )

        loss_funcs = {"cross_entropy": xent_loss,
                      "entmax15": partial(Entmax15Loss, k=512),
                      "sparsemax": partial(SparsemaxLoss, k=512),
                      "entmax": entmax_loss}
        if objective not in loss_funcs:
            raise ConfigurationError("Unknown loss function")
        loss_func = loss_funcs[objective]
        self.loss = loss_func(ignore_index=self.pad_index, reduction='sum')

        if "language_loss" in train_config:
            assert "language_weight" in train_config
            self.language_loss = loss_func(
                ignore_index=self.pad_index, reduction='sum'
            )
            self.language_weight = train_config["language_weight"]
        else:
            self.language_loss = None
            self.language_weight = 0.0

        self.norm_type = train_config.get("normalization", "batch")
        if self.norm_type not in ["batch", "tokens"]:
            raise ConfigurationError("Invalid normalization. "
                                     "Valid options: 'batch', 'tokens'.")

        # optimization
        self.learning_rate_min = train_config.get("learning_rate_min", 1.0e-8)

        self.clip_grad_fun = build_gradient_clipper(config=train_config)
        self.optimizer = build_optimizer(
            config=train_config, parameters=model.parameters())

        # validation & early stopping
        self.validation_freq = train_config.get("validation_freq", 1000)
        self.log_valid_sents = train_config.get("print_valid_sents", [0, 1, 2])
        self.plot_attention = train_config.get("plot_attention", False)
        self.ckpt_queue = queue.Queue(
            maxsize=train_config.get("keep_last_ckpts", 5))

        allowed = {'bleu', 'chrf', 'token_accuracy',
                   'sequence_accuracy', 'cer', 'wer'}
        eval_metrics = train_config.get("eval_metric", "bleu")
        if isinstance(eval_metrics, str):
            eval_metrics = [eval_metrics]
        if any(metric not in allowed for metric in eval_metrics):
            ok_metrics = " ".join(allowed)
            raise ConfigurationError("Invalid setting for 'eval_metric', "
                                     "valid options: {}".format(ok_metrics))
        self.eval_metrics = eval_metrics

        early_stop_metric = train_config.get("early_stopping_metric", "loss")
        allowed_early_stop = {"ppl", "loss"} | set(self.eval_metrics)
        if early_stop_metric not in allowed_early_stop:
            raise ConfigurationError(
                "Invalid setting for 'early_stopping_metric', "
                "valid options: 'loss', 'ppl', and eval_metrics.")
        self.early_stopping_metric = early_stop_metric
        self.minimize_metric = early_stop_metric in {"ppl", "loss",
                                                     "cer", "wer"}

        attn_metrics = train_config.get("attn_metric", [])
        if isinstance(attn_metrics, str):
            attn_metrics = [attn_metrics]
        ok_attn_metrics = {"support"}
        assert all(met in ok_attn_metrics for met in attn_metrics)
        self.attn_metrics = attn_metrics

        # learning rate scheduling
        if "encoder" in config["model"]:
            hidden_size = config["model"]["encoder"]["hidden_size"]
        else:
            hidden_size = config["model"]["encoders"]["src"]["hidden_size"]
        self.scheduler, self.scheduler_step_at = build_scheduler(
            config=train_config,
            scheduler_mode="min" if self.minimize_metric else "max",
            optimizer=self.optimizer,
            hidden_size=hidden_size)

        # data & batch handling
        data_cfg = config["data"]
        self.src_level = data_cfg.get(
            "src_level", data_cfg.get("level", "word")
        )
        self.trg_level = data_cfg.get(
            "trg_level", data_cfg.get("level", "word")
        )
        levels = ["word", "bpe", "char"]
        if self.src_level not in levels or self.trg_level not in levels:
            raise ConfigurationError("Invalid segmentation level. "
                                     "Valid options: 'word', 'bpe', 'char'.")

        self.shuffle = train_config.get("shuffle", True)
        self.epochs = train_config["epochs"]
        self.batch_size = train_config["batch_size"]
        self.batch_type = train_config.get("batch_type", "sentence")
        self.eval_batch_size = train_config.get("eval_batch_size",
                                                self.batch_size)
        self.eval_batch_type = train_config.get("eval_batch_type",
                                                self.batch_type)

        self.batch_multiplier = train_config.get("batch_multiplier", 1)

        # generation
        self.max_output_length = train_config.get("max_output_length", None)

        # CPU / GPU
        self.use_cuda = train_config["use_cuda"]
        if self.use_cuda:
            self.model.cuda()
            self.loss.cuda()

        # initialize training statistics
        self.steps = 0
        # flag becomes True when the learning rate reaches its minimum,
        # which stops training
        self.stop = False
        self.total_tokens = 0
        self.best_ckpt_iteration = 0
        # initial values for best scores
        self.best_ckpt_score = np.inf if self.minimize_metric else -np.inf

        # model parameters
        if "load_model" in train_config.keys():
            model_load_path = train_config["load_model"]
            self.logger.info("Loading model from %s", model_load_path)
            restart_training = train_config.get("restart_training", False)
            self.init_from_checkpoint(model_load_path, restart_training)
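
A minimal sketch of the "training" section this constructor reads, inferred from the train_config accesses above. All values are illustrative, the model_dir path is hypothetical, and keys backed by a .get(...) default can be omitted.

config = {
    "training": {
        "model_dir": "models/example_run",   # required; hypothetical path
        "loss": "entmax15",          # cross_entropy | entmax15 | sparsemax | entmax
        "loss_alpha": 1.5,
        "label_smoothing": 0.0,
        "normalization": "tokens",   # batch | tokens
        "eval_metric": ["bleu", "chrf"],      # a string or a list of metrics
        "early_stopping_metric": "loss",      # loss | ppl | an eval metric
        "epochs": 10,                # required
        "batch_size": 64,            # required
        "use_cuda": False,           # required
    },
    "model": {"encoder": {"hidden_size": 256}},   # read for scheduler setup
    "data": {"level": "bpe"},
}
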
Example #17
    def __init__(self, model: Model, config: dict) -> None:
        """
        Creates a new TrainManager for a model, as specified in the configuration.

        :param model: torch module defining the model
        :param config: dictionary containing the training configurations
        """
        train_config = config["training"]

        # files for logging and storing
        self.model_dir = make_model_dir(train_config["model_dir"],
                                        overwrite=train_config.get(
                                            "overwrite", False))
        self.logger = make_logger("{}/train.log".format(self.model_dir))
        self.logging_freq = train_config.get("logging_freq", 100)
        self.valid_report_file = "{}/validations.txt".format(self.model_dir)
        self.tb_writer = SummaryWriter(log_dir=self.model_dir +
                                       "/tensorboard/")

        # model
        self.model = model
        self.pad_index = self.model.pad_index
        self.bos_index = self.model.bos_index
        self._log_parameters_list()

        # objective
        self.label_smoothing = train_config.get("label_smoothing", 0.0)
        self.loss = XentLoss(pad_index=self.pad_index,
                             smoothing=self.label_smoothing)
        self.normalization = train_config.get("normalization", "batch")
        if self.normalization not in ["batch", "tokens", "none"]:
            raise ConfigurationError("Invalid normalization option."
                                     "Valid options: "
                                     "'batch', 'tokens', 'none'.")

        # optimization
        self.learning_rate_min = train_config.get("learning_rate_min", 1.0e-8)

        self.clip_grad_fun = build_gradient_clipper(config=train_config)
        self.optimizer = build_optimizer(config=train_config,
                                         parameters=model.parameters())

        # validation & early stopping
        self.validation_freq = train_config.get("validation_freq", 1000)
        self.log_valid_sents = train_config.get("print_valid_sents", [0, 1, 2])
        self.ckpt_queue = queue.Queue(
            maxsize=train_config.get("keep_last_ckpts", 5))
        self.eval_metric = train_config.get("eval_metric", "bleu")
        if self.eval_metric not in [
                'bleu', 'chrf', 'token_accuracy', 'sequence_accuracy'
        ]:
            raise ConfigurationError("Invalid setting for 'eval_metric', "
                                     "valid options: 'bleu', 'chrf', "
                                     "'token_accuracy', 'sequence_accuracy'.")
        self.early_stopping_metric = train_config.get("early_stopping_metric",
                                                      "eval_metric")

        # early_stopping_metric decides on how to find the early stopping point:
        # ckpts are written when there's a new high/low score for this metric.
        # If we schedule after BLEU/chrf/accuracy, we want to maximize the
        # score, else we want to minimize it.
        if self.early_stopping_metric in ["ppl", "loss"]:
            self.minimize_metric = True
        elif self.early_stopping_metric == "eval_metric":
            if self.eval_metric in [
                    "bleu", "chrf", "token_accuracy", "sequence_accuracy"
            ]:
                self.minimize_metric = False
            # eval metric that has to get minimized (not yet implemented)
            else:
                self.minimize_metric = True
        else:
            raise ConfigurationError(
                "Invalid setting for 'early_stopping_metric', "
                "valid options: 'loss', 'ppl', 'eval_metric'.")

        # learning rate scheduling
        self.scheduler, self.scheduler_step_at = build_scheduler(
            config=train_config,
            scheduler_mode="min" if self.minimize_metric else "max",
            optimizer=self.optimizer,
            hidden_size=config["model"]["encoder"]["hidden_size"])

        # data & batch handling
        self.level = config["data"]["level"]
        if self.level not in ["word", "bpe", "char"]:
            raise ConfigurationError("Invalid segmentation level. "
                                     "Valid options: 'word', 'bpe', 'char'.")
        self.shuffle = train_config.get("shuffle", True)
        self.epochs = train_config["epochs"]
        self.batch_size = train_config["batch_size"]
        self.batch_type = train_config.get("batch_type", "sentence")
        self.eval_batch_size = train_config.get("eval_batch_size",
                                                self.batch_size)
        self.eval_batch_type = train_config.get("eval_batch_type",
                                                self.batch_type)

        self.batch_multiplier = train_config.get("batch_multiplier", 1)
        self.current_batch_multiplier = self.batch_multiplier

        # generation
        self.max_output_length = train_config.get("max_output_length", None)

        # CPU / GPU
        self.use_cuda = train_config["use_cuda"]
        if self.use_cuda:
            self.model.cuda()
            self.loss.cuda()

        # initialize accumulated batch loss (needed for batch_multiplier)
        self.norm_batch_loss_accumulated = 0
        # initialize training statistics
        self.steps = 0
        # flag becomes True when the learning rate reaches its minimum,
        # which stops training
        self.stop = False
        self.total_tokens = 0
        self.best_ckpt_iteration = 0
        # initial values for best scores
        self.best_ckpt_score = np.inf if self.minimize_metric else -np.inf
        # comparison function for scores
        self.is_best = lambda score: score < self.best_ckpt_score \
            if self.minimize_metric else score > self.best_ckpt_score

        # model parameters
        if "load_model" in train_config.keys():
            model_load_path = train_config["load_model"]
            self.logger.info("Loading model from %s", model_load_path)
            reset_best_ckpt = train_config.get("reset_best_ckpt", False)
            reset_scheduler = train_config.get("reset_scheduler", False)
            reset_optimizer = train_config.get("reset_optimizer", False)
            self.init_from_checkpoint(model_load_path,
                                      reset_best_ckpt=reset_best_ckpt,
                                      reset_scheduler=reset_scheduler,
                                      reset_optimizer=reset_optimizer)
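
The best-checkpoint bookkeeping above reduces to comparing each new score against a running best that starts at plus or minus infinity. A small standalone sketch of that logic follows; the helper name is illustrative and not part of the class.

import numpy as np

def make_best_tracker(minimize_metric: bool):
    """Mirror the best_ckpt_score / is_best logic of the TrainManager."""
    best = np.inf if minimize_metric else -np.inf
    def is_best(score, best_so_far):
        # lower is better for loss/ppl, higher is better for BLEU-like metrics
        return score < best_so_far if minimize_metric else score > best_so_far
    return best, is_best

best, is_best = make_best_tracker(minimize_metric=False)   # e.g. tracking BLEU
assert is_best(20.3, best)   # any finite score improves on -inf
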
Example #18
    def __init__(self, model: Model, config: dict) -> None:
        """
        Creates a new TrainManager for a model, as specified in the configuration.

        :param model: torch module defining the model
        :param config: dictionary containing the training configurations
        """
        train_config = config["training"]

        # files for logging and storing
        self.model_dir = train_config["model_dir"]
        make_model_dir(
            self.model_dir, overwrite=train_config.get("overwrite", False)
        )
        self.logger = make_logger(model_dir=self.model_dir)
        self.logging_freq = train_config.get("logging_freq", 100)
        self.valid_report_file = join(self.model_dir, "validations.txt")
        self.tb_writer = SummaryWriter(
            log_dir=join(self.model_dir, "tensorboard/")
        )

        # model
        self.model = model
        self.pad_index = self.model.pad_index
        self._log_parameters_list()

        # objective
        objective = train_config.get("loss", "cross_entropy")
        loss_alpha = train_config.get("loss_alpha", 1.5)

        assert loss_alpha >= 1
        # maybe don't do the label smoothing here; instead have
        # nn.CrossEntropyLoss, then look up the loss func and either use it
        # directly or wrap it in FYLabelSmoothingLoss (a minimal sketch of
        # this lookup-then-wrap pattern follows this example)
        if objective == "softmax":
            objective = "cross_entropy"
        loss_funcs = {
            "cross_entropy": nn.CrossEntropyLoss,
            "entmax15": partial(Entmax15Loss, k=512),
            "sparsemax": partial(SparsemaxLoss, k=512),
            "entmax": partial(EntmaxBisectLoss, alpha=loss_alpha, n_iter=30)
        }
        if objective not in loss_funcs:
            raise ConfigurationError("Unknown loss function")

        loss_module = loss_funcs[objective]
        loss_func = loss_module(ignore_index=self.pad_index, reduction='sum')

        label_smoothing = train_config.get("label_smoothing", 0.0)
        label_smoothing_type = train_config.get("label_smoothing_type", "fy")
        assert label_smoothing_type in ["fy", "szegedy"]
        smooth_dist = train_config.get("smoothing_distribution", "uniform")
        assert smooth_dist in ["uniform", "unigram"]
        if label_smoothing > 0:
            if label_smoothing_type == "fy":
                # label smoothing entmax loss
                if smooth_dist == "unigram":
                    # smooth towards the target-side unigram distribution
                    smooth_p = torch.FloatTensor(model.trg_vocab.frequencies)
                    smooth_p /= smooth_p.sum()
                else:
                    # uniform smoothing
                    smooth_p = None
                loss_func = FYLabelSmoothingLoss(
                    loss_func, smoothing=label_smoothing, smooth_p=smooth_p
                )
            else:
                assert objective == "cross_entropy"
                loss_func = LabelSmoothingLoss(
                    ignore_index=self.pad_index,
                    reduction="sum",
                    smoothing=label_smoothing
                )
        self.loss = loss_func

        self.norm_type = train_config.get("normalization", "batch")
        if self.norm_type not in ["batch", "tokens"]:
            raise ConfigurationError("Invalid normalization. "
                                     "Valid options: 'batch', 'tokens'.")

        # optimization
        self.learning_rate_min = train_config.get("learning_rate_min", 1.0e-8)

        self.clip_grad_fun = build_gradient_clipper(config=train_config)
        self.optimizer = build_optimizer(
            config=train_config, parameters=model.parameters())

        # validation & early stopping
        self.validate_by_label = train_config.get("validate_by_label", False)
        self.validation_freq = train_config.get("validation_freq", 1000)
        self.log_valid_sents = train_config.get("print_valid_sents", [0, 1, 2])
        self.plot_attention = train_config.get("plot_attention", False)
        self.ckpt_queue = queue.Queue(
            maxsize=train_config.get("keep_last_ckpts", 5))

        allowed = {'bleu', 'chrf', 'token_accuracy',
                   'sequence_accuracy', 'cer', "wer", "levenshtein_distance"}
        eval_metrics = train_config.get("eval_metric", "bleu")
        if isinstance(eval_metrics, str):
            eval_metrics = [eval_metrics]
        if any(metric not in allowed for metric in eval_metrics):
            ok_metrics = " ".join(allowed)
            raise ConfigurationError("Invalid setting for 'eval_metric', "
                                     "valid options: {}".format(ok_metrics))
        self.eval_metrics = eval_metrics
        self.forced_sparsity = train_config.get("forced_sparsity", False)

        early_stop_metric = train_config.get("early_stopping_metric", "loss")
        allowed_early_stop = {"ppl", "loss"} | set(self.eval_metrics)
        if early_stop_metric not in allowed_early_stop:
            raise ConfigurationError(
                "Invalid setting for 'early_stopping_metric', "
                "valid options: 'loss', 'ppl', and eval_metrics.")
        self.early_stopping_metric = early_stop_metric
        min_metrics = {"ppl", "loss", "cer", "wer", "levenshtein_distance"}
        self.minimize_metric = early_stop_metric in min_metrics

        # learning rate scheduling
        hidden_size = _parse_hidden_size(config["model"])
        self.scheduler, self.sched_incr = build_scheduler(
            config=train_config,
            scheduler_mode="min" if self.minimize_metric else "max",
            optimizer=self.optimizer,
            hidden_size=hidden_size)

        # data & batch handling
        # src/trg segmentation levels: either a shared "level" entry or
        # separate "src_level" / "trg_level" entries
        if "level" in config["data"]:
            self.src_level = self.trg_level = config["data"]["level"]
        else:
            assert "src_level" in config["data"]
            assert "trg_level" in config["data"]
            self.src_level = config["data"]["src_level"]
            self.trg_level = config["data"]["trg_level"]

        self.shuffle = train_config.get("shuffle", True)
        self.epochs = train_config["epochs"]
        self.batch_size = train_config["batch_size"]
        self.batch_type = train_config.get("batch_type", "sentence")
        self.eval_batch_size = train_config.get("eval_batch_size",
                                                self.batch_size)
        self.eval_batch_type = train_config.get("eval_batch_type",
                                                self.batch_type)

        self.batch_multiplier = train_config.get("batch_multiplier", 1)

        # generation
        self.max_output_length = train_config.get("max_output_length", None)

        # CPU / GPU
        self.use_cuda = train_config["use_cuda"]
        if self.use_cuda:
            self.model.cuda()
            self.loss.cuda()

        # initialize training statistics
        self.steps = 0
        # flag becomes True when the learning rate reaches its minimum,
        # which stops training
        self.stop = False
        self.total_tokens = 0
        self.best_ckpt_iteration = 0
        # initial values for best scores
        self.best_ckpt_score = np.inf if self.minimize_metric else -np.inf

        mrt_schedule = train_config.get("mrt_schedule", None)
        assert mrt_schedule is None or mrt_schedule in ["warmup", "mix", "mtl"]
        self.mrt_schedule = mrt_schedule
        self.mrt_p = train_config.get("mrt_p", 0.0)
        self.mrt_lambda = train_config.get("mrt_lambda", 1.0)
        assert 0 <= self.mrt_p <= 1
        assert 0 <= self.mrt_lambda <= 1
        self.mrt_start_steps = train_config.get("mrt_start_steps", 0)
        self.mrt_samples = train_config.get("mrt_samples", 1)
        self.mrt_alpha = train_config.get("mrt_alpha", 1.0)
        self.mrt_strategy = train_config.get("mrt_strategy", "sample")
        self.mrt_cost = train_config.get("mrt_cost", "levenshtein")
        self.mrt_max_len = train_config.get("mrt_max_len", 31)  # hmm
        self.step_counter = count()

        assert self.mrt_alpha > 0
        assert self.mrt_strategy in ["sample", "topk"]
        assert self.mrt_cost in ["levenshtein", "bleu"]

        # model parameters
        if "load_model" in train_config.keys():
            model_load_path = train_config["load_model"]
            reset_training = train_config.get("reset_training", False)
            self.logger.info("Loading model from %s", model_load_path)
            self.init_from_checkpoint(model_load_path, reset=reset_training)
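
The comment in the objective block above describes a "look up the loss, then optionally wrap it" design. Below is a minimal self-contained illustration of that pattern using only torch built-ins; UniformSmoothingWrapper is a hypothetical stand-in for FYLabelSmoothingLoss and its mixing formula is illustrative, not the Fenchel-Young smoothing used by the entmax losses.

import torch
import torch.nn as nn

class UniformSmoothingWrapper(nn.Module):
    """Hypothetical wrapper: mixes the base loss with a cross-entropy term
    against the uniform distribution (illustrative only)."""
    def __init__(self, base_loss: nn.Module, smoothing: float):
        super().__init__()
        self.base_loss = base_loss
        self.smoothing = smoothing

    def forward(self, logits, target):
        nll = self.base_loss(logits, target)
        # per-example cross-entropy against the uniform distribution, summed
        # over the batch so the reduction matches the base loss ("sum")
        uniform_ce = (-torch.log_softmax(logits, dim=-1)).mean(dim=-1).sum()
        return (1.0 - self.smoothing) * nll + self.smoothing * uniform_ce

# 1) look up the base loss by name
loss_funcs = {"cross_entropy": nn.CrossEntropyLoss}
loss_func = loss_funcs["cross_entropy"](ignore_index=0, reduction="sum")

# 2) use it directly, or wrap it when label smoothing is requested
label_smoothing = 0.1
if label_smoothing > 0:
    loss_func = UniformSmoothingWrapper(loss_func, smoothing=label_smoothing)

logits = torch.randn(8, 100)              # (batch, vocab)
target = torch.randint(1, 100, (8,))      # avoid index 0, the ignore_index
print(loss_func(logits, target).item())
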