def test_freeze(self):
    encoder = Embeddings(embedding_dim=self.emb_size,
                         vocab_size=self.vocab_size,
                         padding_idx=self.pad_idx,
                         freeze=True)
    for n, p in encoder.named_parameters():
        self.assertFalse(p.requires_grad)

def test_forward(self):
    # fix the embedding weights
    weights = self._get_random_embedding_weights()
    emb = Embeddings(embedding_dim=self.emb_size,
                     vocab_size=self.vocab_size,
                     padding_idx=self.pad_idx)
    self._fill_embeddings(emb, weights)
    indices = torch.Tensor([0, 1, self.pad_idx, 9]).long()
    embedded = emb.forward(x=indices)
    # embedding operation is just slicing from weights matrix
    self.assertTensorEqual(
        embedded,
        torch.index_select(input=weights, index=indices, dim=0))
    # after embedding, representations for PAD should still be zero
    self.assertTensorEqual(embedded[2], torch.zeros([self.emb_size]))
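A minimal standalone sketch of what the tests above check, using plain torch.nn.Embedding instead of the Embeddings wrapper (sizes here are illustrative, not the test fixture's): the lookup is just row selection from the weight matrix, and the padding row is zeroed at initialization.

import torch

vocab_size, emb_size, pad_idx = 10, 4, 1
lut = torch.nn.Embedding(vocab_size, emb_size, padding_idx=pad_idx)
indices = torch.tensor([0, 2, pad_idx, 9], dtype=torch.long)
looked_up = lut(indices)
# the lookup equals slicing rows out of the weight matrix
sliced = torch.index_select(lut.weight, dim=0, index=indices)
assert torch.equal(looked_up, sliced)
# nn.Embedding zeroes the padding row at initialization
assert torch.equal(lut.weight[pad_idx], torch.zeros(emb_size))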
Example #3
def build_model(cfg: dict = None,
                src_vocab: Vocabulary = None,
                trg_vocab: Vocabulary = None) -> Model:
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :return: built and initialized model
    """
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                           vocab_size=len(src_vocab),
                           padding_idx=src_padding_idx)

    if cfg.get("tied_embeddings", False):
        if src_vocab.itos == trg_vocab.itos:
            # share embeddings for src and trg
            trg_embed = src_embed
        else:
            raise ConfigurationError(
                "Embedding cannot be tied since vocabularies differ.")
    else:
        trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                               vocab_size=len(trg_vocab),
                               padding_idx=trg_padding_idx)

    encoder = RecurrentEncoder(**cfg["encoder"],
                               emb_size=src_embed.embedding_dim)
    decoder = RecurrentDecoder(**cfg["decoder"],
                               encoder=encoder,
                               vocab_size=len(trg_vocab),
                               emb_size=trg_embed.embedding_dim)

    model = Model(encoder=encoder,
                  decoder=decoder,
                  src_embed=src_embed,
                  trg_embed=trg_embed,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab)

    # custom initialization of model parameters
    initialize_model(model, cfg, src_padding_idx, trg_padding_idx)

    return model
def test_scale(self):
    # fix the embedding weights
    weights = self._get_random_embedding_weights()
    emb = Embeddings(embedding_dim=self.emb_size,
                     vocab_size=self.vocab_size,
                     padding_idx=self.pad_idx,
                     scale=True)
    emb.lut.weight.data = weights
    indices = torch.Tensor([0, 1, self.pad_idx, 9]).long()
    embedded = emb.forward(x=indices)
    # now scaled
    self.assertTensorNotEqual(
        torch.index_select(input=weights, index=indices, dim=0), embedded)
    self.assertTensorEqual(
        torch.index_select(input=weights, index=indices, dim=0) *
        (self.emb_size ** 0.5), embedded)

def test_pad_zeros(self):
    emb = Embeddings(embedding_dim=self.emb_size,
                     vocab_size=self.vocab_size,
                     padding_idx=self.pad_idx)
    # pad embedding should be zeros
    self.assertTensorEqual(emb.lut.weight[self.pad_idx],
                           torch.zeros([self.emb_size]))
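A standalone sketch of the scaling that test_scale above verifies, in plain PyTorch with illustrative sizes: with scale=True the lookup is multiplied by sqrt(embedding_dim), Transformer-style.

import math
import torch

emb_size = 4
lut = torch.nn.Embedding(10, emb_size)
x = torch.tensor([0, 3, 7], dtype=torch.long)
scaled = lut(x) * math.sqrt(emb_size)
# equivalent to slicing the weight matrix and scaling by emb_size ** 0.5
manual = torch.index_select(lut.weight, dim=0, index=x) * emb_size ** 0.5
assert torch.allclose(scaled, manual)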
Example #6
def build_embeddings(emb_config: dict, vocab: Vocabulary):
    padding_idx = vocab.stoi[PAD_TOKEN]

    embed = Embeddings(**emb_config,
                       vocab_size=len(vocab),
                       padding_idx=padding_idx)
    return embed
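A hedged usage sketch for build_embeddings above; `vocab` is assumed to be a Vocabulary with stoi and PAD_TOKEN as in the other examples on this page, and the emb_config keys mirror the Embeddings keyword arguments seen here (values are illustrative).

emb_config = {"embedding_dim": 64, "scale": False, "freeze": False}
# embed = build_embeddings(emb_config, vocab)
# embed.lut.weight.shape == (len(vocab), 64)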
Example #7
    def _build(self, batch_size):
        src_time_dim = 4
        vocab_size = 7

        emb = Embeddings(embedding_dim=self.emb_size,
                         vocab_size=vocab_size,
                         padding_idx=self.pad_index)

        decoder = TransformerDecoder(num_layers=self.num_layers,
                                     num_heads=self.num_heads,
                                     hidden_size=self.hidden_size,
                                     ff_size=self.ff_size,
                                     dropout=self.dropout,
                                     emb_dropout=self.dropout,
                                     vocab_size=vocab_size)

        encoder_output = torch.rand(size=(batch_size, src_time_dim,
                                          self.hidden_size))

        for p in decoder.parameters():
            torch.nn.init.uniform_(p, -0.5, 0.5)

        src_mask = torch.ones(size=(batch_size, 1, src_time_dim)) == 1

        encoder_hidden = None  # unused
        return src_mask, emb, decoder, encoder_output, encoder_hidden
Example #8
    def _build(self, batch_size):
        src_time_dim = 4
        vocab_size = 7

        emb = Embeddings(embedding_dim=self.emb_size,
                         vocab_size=vocab_size,
                         padding_idx=self.pad_index)

        encoder = RecurrentEncoder(emb_size=self.emb_size,
                                   num_layers=self.num_layers,
                                   hidden_size=self.encoder_hidden_size,
                                   bidirectional=True)

        decoder = RecurrentDecoder(hidden_size=self.hidden_size,
                                   encoder=encoder,
                                   attention="bahdanau",
                                   emb_size=self.emb_size,
                                   vocab_size=self.vocab_size,
                                   num_layers=self.num_layers,
                                   init_hidden="bridge",
                                   input_feeding=True)

        encoder_output = torch.rand(size=(batch_size, src_time_dim,
                                          encoder.output_size))

        for p in decoder.parameters():
            torch.nn.init.uniform_(p, -0.5, 0.5)

        src_mask = torch.ones(size=(batch_size, 1, src_time_dim)) == 1

        encoder_hidden = torch.rand(size=(batch_size, encoder.output_size))

        return src_mask, emb, decoder, encoder_output, encoder_hidden
Example #9
def build_model(cfg: dict = None,
                src_vocab: Vocabulary = None,
                trg_vocab: Vocabulary = None):
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                           vocab_size=len(src_vocab),
                           padding_idx=src_padding_idx)

    if cfg.get("tied_embeddings", False) \
        and src_vocab.itos == trg_vocab.itos:
        # share embeddings for src and trg
        trg_embed = src_embed
    else:
        trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                               vocab_size=len(trg_vocab),
                               padding_idx=trg_padding_idx)

    encoder = RecurrentEncoder(**cfg["encoder"],
                               emb_size=src_embed.embedding_dim)
    decoder = RecurrentDecoder(**cfg["decoder"],
                               encoder=encoder,
                               vocab_size=len(trg_vocab),
                               emb_size=trg_embed.embedding_dim)

    model = Model(encoder=encoder,
                  decoder=decoder,
                  src_embed=src_embed,
                  trg_embed=trg_embed,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab)

    # custom initialization of model parameters
    initialize_model(model, cfg, src_padding_idx, trg_padding_idx)

    return model
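A hypothetical configuration sketch for the recurrent build_model variants above. The keys are inferred from how cfg is read (encoder/decoder blocks with an "embeddings" sub-dict plus the optional "tied_embeddings" flag); the values are illustrative, and the encoder/decoder constructors are assumed to tolerate the extra "embeddings" key via **kwargs.

example_cfg = {
    "tied_embeddings": False,  # if True, src and trg vocabularies must match
    "encoder": {
        "embeddings": {"embedding_dim": 64, "scale": False},
        "hidden_size": 128,
        "num_layers": 1,
        "bidirectional": True,
    },
    "decoder": {
        "embeddings": {"embedding_dim": 64, "scale": False},
        "hidden_size": 128,
        "num_layers": 1,
        "attention": "bahdanau",
    },
}
# model = build_model(cfg=example_cfg, src_vocab=src_vocab, trg_vocab=trg_vocab)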
Example #10
def setup_model(params: dict, data: Flickr8k) -> Tuple[Embeddings, Image2Caption]:
    """
    setup embeddings and seq2seq model

    :param params: params from the yaml file
    :param data: Flickr Dataset class
    """

    def get_base_arch(encoder_name: str) -> Callable:
        """
        wrapper for model, as EfficientNet does not support __name__

        :param encoder_name: name of the encoder to load
        :return: base_arch
        """
        if 'efficientnet' in encoder_name:
            base_arch = EfficientNet.from_pretrained(encoder_name).to(device)
            base_arch.__name__ = encoder_name
            return base_arch
        else:
            return getattr(models, encoder_name)

    encoder = Encoder(get_base_arch(params.get('encoder')), device, pretrained=True)
    vocab_size = len(data.corpus.vocab.itos)

    if params.get('decoder_type', 'RecurrentDecoder') == 'RecurrentDecoder':
        decoder_type = CustomRecurrentDecoder
    else:
        decoder_type = TransformerDecoder

    decoder = decoder_type(
        rnn_type=params.get('rnn_type'),
        emb_size=params['embed_size'],
        hidden_size=params['hidden_size'],
        encoder=encoder,
        vocab_size=vocab_size,
        init_hidden='bridge',
        attention=params['attention'],
        hidden_dropout=params['hidden_dropout'],
        emb_dropout=params['emb_dropout'],
        num_layers=params.get('decoder-num_layers', 1)
    )

    embeddings = Embeddings(embedding_dim=params['embed_size'], vocab_size=vocab_size)

    return embeddings, Image2Caption(
        encoder, decoder, embeddings, device,
        params['freeze_encoder'],
        params.get('fine_tuning', None),
        params.get('dropout_after_encoder', 0),
        params['hidden_size']).to(device)
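A hypothetical params sketch for setup_model above; the keys mirror the params[...] and params.get(...) accesses in the function, and the values are purely illustrative.

example_params = {
    "encoder": "resnet50",               # or e.g. "efficientnet-b0"
    "decoder_type": "RecurrentDecoder",  # any other value selects TransformerDecoder
    "rnn_type": "lstm",
    "embed_size": 256,
    "hidden_size": 512,
    "attention": "bahdanau",
    "hidden_dropout": 0.2,
    "emb_dropout": 0.2,
    "decoder-num_layers": 1,
    "freeze_encoder": True,
    "fine_tuning": None,
    "dropout_after_encoder": 0.0,
}
# embeddings, model = setup_model(example_params, data)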
Example #11
def test_size(self):
    emb = Embeddings(embedding_dim=self.emb_size,
                     vocab_size=self.vocab_size,
                     padding_idx=self.pad_idx)
    self.assertEqual(emb.lut.weight.shape,
                     torch.Size([self.vocab_size, self.emb_size]))
Example #12
def build_model(cfg: dict = None,
                src_vocab: Vocabulary = None,
                trg_vocab: Vocabulary = None,
                trv_vocab: Vocabulary = None,
                canonizer=None) -> Model:
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :param trv_vocab: kb true value lookup vocabulary
    :param canonizer: optional factory for a canonization function
        (called with copy_from_source)
    :return: built and initialized model
    """
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    if "embedding_files" in cfg.keys():  #init from pretrained
        assert not cfg.get(
            "tied_embeddings", False
        ), "TODO implement tied embeddings along with pretrained initialization"
        raise NotImplementedError(
            "TODO implement kbsrc embed loading for embedding files")
        weight_tensors = []
        for weight_file in cfg["embedding_files"]:
            with open(weight_file, "r") as f:
                weight = []
                for line in f.readlines():
                    line = line.split()
                    line = [float(x) for x in line]
                    weight.append(line)

            weight = FloatTensor(weight)
            weight_tensors.append(weight)
        # Set source Embeddings to Pretrained Embeddings
        src_embed = Embeddings(
            int(weight_tensors[0][0].shape[0]),
            False,  #TODO transformer: change to True
            len(weight_tensors[0]),
        )
        src_embed.lut.weight.data = weight_tensors[0]

        # Set target Embeddings to Pretrained Embeddings
        trg_embed = Embeddings(
            int(weight_tensors[1][0].shape[0]),
            False,  #TODO transformer: change to True
            len(weight_tensors[1]),
        )
        trg_embed.lut.weight.data = weight_tensors[1]
    else:
        src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                               vocab_size=len(src_vocab),
                               padding_idx=src_padding_idx)
        if cfg.get("kb_embed_separate", False):
            kbsrc_embed = Embeddings(**cfg["encoder"]["embeddings"],
                                     vocab_size=len(src_vocab),
                                     padding_idx=src_padding_idx)
        else:
            kbsrc_embed = src_embed

        # this ties source and target embeddings
        # for softmax layer tying, see further below
        if cfg.get("tied_embeddings", False):
            if src_vocab.itos == trg_vocab.itos:
                # share embeddings for src and trg
                trg_embed = src_embed
            else:
                raise ConfigurationError(
                    "Embedding cannot be tied since vocabularies differ.")
        else:
            # Latest TODO: init embeddings with vocab_size = len(trg_vocab joined with kb_vocab)
            trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                                   vocab_size=len(trg_vocab),
                                   padding_idx=trg_padding_idx)
    # build encoder
    enc_dropout = cfg["encoder"].get("dropout", 0.)
    enc_emb_dropout = cfg["encoder"]["embeddings"].get("dropout", enc_dropout)
    if cfg["encoder"].get("type", "recurrent") == "transformer":
        assert cfg["encoder"]["embeddings"]["embedding_dim"] == \
               cfg["encoder"]["hidden_size"], \
               "for transformer, emb_size must be hidden_size"

        encoder = TransformerEncoder(**cfg["encoder"],
                                     emb_size=src_embed.embedding_dim,
                                     emb_dropout=enc_emb_dropout)
    else:
        encoder = RecurrentEncoder(**cfg["encoder"],
                                   emb_size=src_embed.embedding_dim,
                                   emb_dropout=enc_emb_dropout)

    # retrieve kb task info
    kb_task = bool(cfg.get("kb", False))
    # k = number of KVR attention hops in the decoder (Eric et al.; default: 1)
    k_hops = int(cfg.get("k_hops", 1))
    same_module_for_all_hops = bool(cfg.get("same_module_for_all_hops", False))
    do_postproc = bool(cfg.get("do_postproc", True))
    copy_from_source = bool(cfg.get("copy_from_source", True))
    canonization_func = None if canonizer is None else canonizer(
        copy_from_source=copy_from_source)
    kb_input_feeding = bool(cfg.get("kb_input_feeding", True))
    kb_feed_rnn = bool(cfg.get("kb_feed_rnn", True))
    kb_multihead_feed = bool(cfg.get("kb_multihead_feed", False))
    posEncKBkeys = cfg.get("posEncdKBkeys", False)
    tfstyletf = cfg.get("tfstyletf", True)
    infeedkb = bool(cfg.get("infeedkb", False))
    outfeedkb = bool(cfg.get("outfeedkb", False))
    add_kb_biases_to_output = bool(cfg.get("add_kb_biases_to_output", True))
    kb_max_dims = cfg.get("kb_max_dims", (16, 32))  # should be tuple
    double_decoder = cfg.get("double_decoder", False)
    tied_side_softmax = cfg.get(
        "tied_side_softmax",
        False)  # actually use separate linear layers, tying only the main one
    # padding the KB keys is not needed for 1 hop (a big performance save),
    # but is required for >= 2 hops
    do_pad_kb_keys = cfg.get("pad_kb_keys", True)

    if hasattr(kb_max_dims, "__iter__"):
        kb_max_dims = tuple(kb_max_dims)
    else:
        assert type(kb_max_dims) == int, kb_max_dims
        kb_max_dims = (kb_max_dims, )

    assert cfg["decoder"]["hidden_size"]
    dec_dropout = cfg["decoder"].get("dropout", 0.)
    dec_emb_dropout = cfg["decoder"]["embeddings"].get("dropout", dec_dropout)

    if cfg["decoder"].get("type", "recurrent") == "transformer":
        if tfstyletf:
            decoder = TransformerDecoder(
                **cfg["decoder"],
                encoder=encoder,
                vocab_size=len(trg_vocab),
                emb_size=trg_embed.embedding_dim,
                emb_dropout=dec_emb_dropout,
                kb_task=kb_task,
                kb_key_emb_size=kbsrc_embed.embedding_dim,
                feed_kb_hidden=kb_input_feeding,
                infeedkb=infeedkb,
                outfeedkb=outfeedkb,
                double_decoder=double_decoder)
        else:
            decoder = TransformerKBrnnDecoder(
                **cfg["decoder"],
                encoder=encoder,
                vocab_size=len(trg_vocab),
                emb_size=trg_embed.embedding_dim,
                emb_dropout=dec_emb_dropout,
                kb_task=kb_task,
                k_hops=k_hops,
                kb_max=kb_max_dims,
                same_module_for_all_hops=same_module_for_all_hops,
                kb_key_emb_size=kbsrc_embed.embedding_dim,
                kb_input_feeding=kb_input_feeding,
                kb_feed_rnn=kb_feed_rnn,
                kb_multihead_feed=kb_multihead_feed)
    else:
        if not kb_task:
            decoder = RecurrentDecoder(**cfg["decoder"],
                                       encoder=encoder,
                                       vocab_size=len(trg_vocab),
                                       emb_size=trg_embed.embedding_dim,
                                       emb_dropout=dec_emb_dropout)
        else:
            decoder = KeyValRetRNNDecoder(
                **cfg["decoder"],
                encoder=encoder,
                vocab_size=len(trg_vocab),
                emb_size=trg_embed.embedding_dim,
                emb_dropout=dec_emb_dropout,
                k_hops=k_hops,
                kb_max=kb_max_dims,
                same_module_for_all_hops=same_module_for_all_hops,
                kb_key_emb_size=kbsrc_embed.embedding_dim,
                kb_input_feeding=kb_input_feeding,
                kb_feed_rnn=kb_feed_rnn,
                kb_multihead_feed=kb_multihead_feed,
                do_pad_kb_keys=do_pad_kb_keys)

    # specify generator which is mostly just the output layer
    generator = Generator(dec_hidden_size=cfg["decoder"]["hidden_size"],
                          vocab_size=len(trg_vocab),
                          add_kb_biases_to_output=add_kb_biases_to_output,
                          double_decoder=double_decoder)

    model = Model(encoder=encoder, decoder=decoder, generator=generator,
                  src_embed=src_embed, trg_embed=trg_embed,
                  src_vocab=src_vocab, trg_vocab=trg_vocab,
                  kb_key_embed=kbsrc_embed,
                  trv_vocab=trv_vocab,
                  k_hops=k_hops,
                  do_postproc=do_postproc,
                  canonize=canonization_func,
                  kb_att_dims=len(kb_max_dims),
                  posEncKBkeys=posEncKBkeys)

    # tie softmax layer with trg embeddings
    if cfg.get("tied_softmax", False):
        if trg_embed.lut.weight.shape == \
                model.generator.output_layer.weight.shape:
            # (also) share trg embeddings and softmax layer:
            model.generator.output_layer.weight = trg_embed.lut.weight
            if model.generator.double_decoder:
                # (also also) share trg embeddings and side softmax layer
                assert hasattr(model.generator, "side_output_layer")
                if tied_side_softmax:
                    # because of distributivity this becomes O(x_1 + x_2) instead of O_1 x_1 + O_2 x_2
                    model.generator.side_output_layer.weight = trg_embed.lut.weight
        else:
            raise ConfigurationError(
                "For tied_softmax, the decoder embedding_dim and decoder "
                "hidden_size must be the same."
                "The decoder must be a Transformer.")

    # custom initialization of model parameters
    initialize_model(model, cfg, src_padding_idx, trg_padding_idx)

    return model
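A hypothetical sketch of the KB-related configuration flags this variant reads; the keys mirror the cfg.get(...) calls above and, except for "kb" (set to True here to enable the KB task), the values shown are the fallbacks the code uses.

kb_cfg_extras = {
    "kb": True,                   # default in the code above is False
    "k_hops": 1,
    "same_module_for_all_hops": False,
    "do_postproc": True,
    "copy_from_source": True,
    "kb_input_feeding": True,
    "kb_feed_rnn": True,
    "kb_multihead_feed": False,
    "posEncdKBkeys": False,
    "tfstyletf": True,
    "infeedkb": False,
    "outfeedkb": False,
    "add_kb_biases_to_output": True,
    "kb_max_dims": [16, 32],      # converted to a tuple; a single int is also accepted
    "double_decoder": False,
    "tied_side_softmax": False,
    "pad_kb_keys": True,
}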
Example #13
def build_model(cfg: dict = None,
                src_vocab: Vocabulary = None,
                trg_vocab: Vocabulary = None) -> Model:
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :return: built and initialized model
    """
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                           vocab_size=len(src_vocab),
                           padding_idx=src_padding_idx)

    # this ties source and target embeddings
    # for softmax layer tying, see further below
    if cfg.get("tied_embeddings", False):
        if src_vocab.itos == trg_vocab.itos:
            # share embeddings for src and trg
            trg_embed = src_embed
        else:
            raise ConfigurationError(
                "Embedding cannot be tied since vocabularies differ.")
    else:
        trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                               vocab_size=len(trg_vocab),
                               padding_idx=trg_padding_idx)

    # build encoder
    enc_dropout = cfg["encoder"].get("dropout", 0.)
    enc_emb_dropout = cfg["encoder"]["embeddings"].get("dropout", enc_dropout)
    if cfg["encoder"].get("type", "recurrent") == "transformer":
        assert cfg["encoder"]["embeddings"]["embedding_dim"] == \
               cfg["encoder"]["hidden_size"], \
               "for transformer, emb_size must be hidden_size"

        encoder = TransformerEncoder(**cfg["encoder"],
                                     emb_size=src_embed.embedding_dim,
                                     emb_dropout=enc_emb_dropout)
    else:
        encoder = RecurrentEncoder(**cfg["encoder"],
                                   emb_size=src_embed.embedding_dim,
                                   emb_dropout=enc_emb_dropout)

    # build decoder
    dec_dropout = cfg["decoder"].get("dropout", 0.)
    dec_emb_dropout = cfg["decoder"]["embeddings"].get("dropout", dec_dropout)
    if cfg["decoder"].get("type", "recurrent") == "transformer":
        decoder = TransformerDecoder(**cfg["decoder"],
                                     encoder=encoder,
                                     vocab_size=len(trg_vocab),
                                     emb_size=trg_embed.embedding_dim,
                                     emb_dropout=dec_emb_dropout)
    else:
        decoder = RecurrentDecoder(**cfg["decoder"],
                                   encoder=encoder,
                                   vocab_size=len(trg_vocab),
                                   emb_size=trg_embed.embedding_dim,
                                   emb_dropout=dec_emb_dropout)

    model = Model(encoder=encoder,
                  decoder=decoder,
                  src_embed=src_embed,
                  trg_embed=trg_embed,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab)

    # tie softmax layer with trg embeddings
    if cfg.get("tied_softmax", False):
        if trg_embed.lut.weight.shape == \
                model.decoder.output_layer.weight.shape:
            # (also) share trg embeddings and softmax layer:
            model.decoder.output_layer.weight = trg_embed.lut.weight
        else:
            raise ConfigurationError(
                "For tied_softmax, the decoder embedding_dim and decoder "
                "hidden_size must be the same."
                "The decoder must be a Transformer.")

    # custom initialization of model parameters
    initialize_model(model, cfg, src_padding_idx, trg_padding_idx)

    return model
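A hypothetical transformer configuration sketch for the build_model above. The assert requires embedding_dim to equal hidden_size on the encoder side, and tied_softmax requires the target embedding matrix to match the decoder output layer's shape; keys follow the constructor arguments seen elsewhere on this page, values are illustrative.

transformer_cfg = {
    "tied_embeddings": True,   # requires identical src/trg vocabularies
    "tied_softmax": True,      # trg embedding shape must match the output layer
    "encoder": {
        "type": "transformer",
        "hidden_size": 256,
        "ff_size": 1024,
        "num_layers": 4,
        "num_heads": 8,
        "dropout": 0.1,
        "embeddings": {"embedding_dim": 256, "scale": True},  # must equal hidden_size
    },
    "decoder": {
        "type": "transformer",
        "hidden_size": 256,
        "ff_size": 1024,
        "num_layers": 4,
        "num_heads": 8,
        "dropout": 0.1,
        "embeddings": {"embedding_dim": 256, "scale": True},
    },
}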
Example #14
def build_pretrained_model(cfg: dict = None,
                           pretrained_model: Model = None,
                           pretrained_src_vocab: Vocabulary = None,
                           src_vocab: Vocabulary = None,
                           trg_vocab: Vocabulary = None) -> Model:
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :return: built and initialized model
    """
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                           vocab_size=len(src_vocab),
                           padding_idx=src_padding_idx)

    embedding_matrix = np.zeros((len(src_vocab), src_embed.embedding_dim))
    unknown_words = []
    for w in pretrained_src_vocab.itos:
        try:
            pre_ix = pretrained_src_vocab.stoi[w]
            ix = src_vocab.stoi[w]
            embedding_matrix[ix] = pretrained_model.src_embed.lut.weight[
                pre_ix].cpu().detach().numpy()
        except KeyError:
            unknown_words.append(w)

    src_embed.lut.weight = torch.nn.Parameter(
        torch.tensor(embedding_matrix, dtype=torch.float32))

    trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                           vocab_size=len(trg_vocab),
                           padding_idx=trg_padding_idx)

    # build decoder
    dec_dropout = cfg["decoder"].get("dropout", 0.)
    dec_emb_dropout = cfg["decoder"]["embeddings"].get("dropout", dec_dropout)

    encoder = pretrained_model.encoder
    encoder.train()
    set_requires_grad(encoder, True)

    # build encoder
    #enc_dropout = cfg["encoder"].get("dropout", 0.)
    #enc_emb_dropout = cfg["encoder"]["embeddings"].get("dropout", enc_dropout)
    #if cfg["encoder"].get("type", "recurrent") == "transformer":
    #    assert cfg["encoder"]["embeddings"]["embedding_dim"] == \
    #           cfg["encoder"]["hidden_size"], \
    #           "for transformer, emb_size must be hidden_size"

    #    encoder = TransformerEncoder(**cfg["encoder"],
    #                                 emb_size=src_embed.embedding_dim,
    #                                 emb_dropout=enc_emb_dropout)
    #else:
    #    encoder = RecurrentEncoder(**cfg["encoder"],
    #                               emb_size=src_embed.embedding_dim,
    #                               emb_dropout=enc_emb_dropout)

    if cfg["decoder"].get("type", "recurrent") == "transformer":
        decoder = TransformerDecoder(**cfg["decoder"],
                                     encoder=encoder,
                                     vocab_size=len(trg_vocab),
                                     emb_size=trg_embed.embedding_dim,
                                     emb_dropout=dec_emb_dropout)
    else:
        decoder = RecurrentDecoder(**cfg["decoder"],
                                   encoder=encoder,
                                   vocab_size=len(trg_vocab),
                                   emb_size=trg_embed.embedding_dim,
                                   emb_dropout=dec_emb_dropout)

    model = Model(encoder=encoder,
                  decoder=decoder,
                  src_embed=src_embed,
                  trg_embed=trg_embed,
                  src_vocab=pretrained_model.src_vocab,
                  trg_vocab=trg_vocab)

    # tie softmax layer with trg embeddings
    if cfg.get("tied_softmax", False):
        if trg_embed.lut.weight.shape == \
                model.decoder.output_layer.weight.shape:
            # (also) share trg embeddings and softmax layer:
            model.decoder.output_layer.weight = trg_embed.lut.weight
        else:
            raise ConfigurationError(
                "For tied_softmax, the decoder embedding_dim and decoder "
                "hidden_size must be the same."
                "The decoder must be a Transformer.")

    # custom initialization of model parameters
    #initialize_model(model, cfg, src_padding_idx, trg_padding_idx)

    return model
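A standalone sketch of the row-copy idea used above, in plain PyTorch with hypothetical toy vocabularies: rows of a pretrained embedding matrix are copied over for words that also occur in the new vocabulary, while unseen words keep their fresh initialization (zeros here for simplicity).

import torch

old_itos = ["<pad>", "<unk>", "hello", "world"]
new_itos = ["<pad>", "<unk>", "world", "brand-new"]
old_weights = torch.randn(len(old_itos), 8)

old_stoi = {w: i for i, w in enumerate(old_itos)}
new_weights = torch.zeros(len(new_itos), 8)
unknown_words = []
for ix, w in enumerate(new_itos):
    if w in old_stoi:
        new_weights[ix] = old_weights[old_stoi[w]]
    else:
        unknown_words.append(w)  # "brand-new" keeps its zero row here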
Example #15
def build_model(cfg: dict = None, vocabs: dict = None) -> Model:
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param vocabs: dictionary mapping column names (including "trg") to vocabularies
    :return: built and initialized model
    """
    if "encoders" in cfg:
        # two cases: if "columns" are provided, build an identical encoder for
        # each listed column; otherwise, each key under "encoders" supplies its
        # own encoder configuration
        if "columns" in cfg["encoders"]:
            enc_columns = cfg["encoders"]["columns"]
            assert all(column in vocabs for column in enc_columns)
            shared_cfg = cfg["encoders"]["encoder"]
            enc_configs = {column: shared_cfg for column in enc_columns}
            share_embs = cfg["encoders"].get("share_embeddings", False)
            share_encoders = cfg["encoders"].get("share_encoders", False)
            if share_embs:
                any_v = next(v for k, v in vocabs.items() if k != "trg")
                assert all(v == any_v for k, v in vocabs.items() if k != "trg")
        else:
            enc_columns = list(cfg["encoders"].keys())
            enc_configs = cfg["encoders"]
            share_embs = False
            share_encoders = False
    else:
        enc_columns = ["src"]
        enc_configs = {"src": cfg["encoder"]}
        share_embs = False
        share_encoders = False

    dec_config = cfg["decoder"]

    emb_configs = {
        name: enc_config["embeddings"]
        for name, enc_config in enc_configs.items()
    }

    emb_configs["trg"] = dec_config["embeddings"]

    embeds = dict()
    encoders = dict()
    for enc_column, enc_cfg in enc_configs.items():
        # make each encoder

        if "feature_embeddings" in enc_cfg:
            # feature embeddings: features come from label fields of a TSV
            embed = build_feature_embeddings(enc_cfg["feature_embeddings"],
                                             vocabs, enc_column)
        else:
            if share_embs and embeds:
                # get something that's already in the dict
                embed = next(iter(embeds.values()))
            else:
                # make a new embedding matrix
                vocab = vocabs[enc_column]
                emb_cfg = enc_cfg["embeddings"]
                embed = Embeddings(**emb_cfg,
                                   vocab_size=len(vocab),
                                   padding_idx=vocab.stoi[PAD_TOKEN])
        embeds[enc_column] = embed

        if share_encoders and encoders:
            encoder = next(iter(encoders.values()))
        else:
            enc_dropout = enc_cfg.get("dropout", 0.)
            enc_emb_dropout = enc_cfg["embeddings"].get("dropout", enc_dropout)
            enc_type = enc_cfg.get("type", "recurrent")
            '''
            if enc_type == "transformer":
                enc_emb_size = emb_cfg["embedding_dim"]
                enc_hidden_size = enc_cfg["hidden_size"]
                assert enc_emb_size == enc_hidden_size, \
                    "for transformer, emb_size must be hidden_size"
            '''
            enc_class = TransformerEncoder if enc_type == "transformer" \
                else RecurrentEncoder
            encoder = enc_class(**enc_cfg,
                                emb_size=embed.embedding_dim,
                                emb_dropout=enc_emb_dropout)
        encoders[enc_column] = encoder

    trg_vocab = vocabs["trg"]

    # this ties source and target embeddings
    # for softmax layer tying, see further below
    if cfg.get("tied_embeddings", False):
        assert vocabs["src"].itos == vocabs["trg"].itos, \
            "Embedding cannot be tied because vocabularies differ."
        embeds["trg"] = embeds["src"]
    else:
        # build the target embeddings
        if "feature_embeddings" in dec_config:
            # feature embeddings: features come from label fields of a TSV
            embed = build_feature_embeddings(dec_config["feature_embeddings"],
                                             vocabs, "trg")
        else:
            trg_vocab = vocabs["trg"]
            dec_emb_cfg = dec_config["embeddings"]
            embed = Embeddings(**dec_emb_cfg,
                               vocab_size=len(trg_vocab),
                               padding_idx=trg_vocab.stoi[PAD_TOKEN])
        embeds["trg"] = embed

    # build decoder
    dec_dropout = dec_config.get("dropout", 0.)
    dec_type = dec_config.get("type", "recurrent")
    dec_class = TransformerDecoder if dec_type == "transformer" \
        else RecurrentDecoder
    decoder = dec_class(**dec_config,
                        encoder_output_size=encoder.output_size,
                        vocab_size=len(vocabs["trg"]),
                        emb_size=embeds["trg"].embedding_dim,
                        emb_dropout=emb_configs["trg"].get(
                            "dropout", dec_dropout),
                        multi_source=len(encoders) > 1,
                        head_names=list(encoders.keys()))

    if len(encoders) == 1:
        model = Model(encoder=encoders["src"],
                      decoder=decoder,
                      src_embed=embeds["src"],
                      trg_embed=embeds["trg"],
                      src_vocab=vocabs["src"],
                      trg_vocab=vocabs["trg"])
    else:
        model = MultiSourceModel(encoders=encoders,
                                 decoder=decoder,
                                 embeds=embeds,
                                 vocabs=vocabs)

    # tie softmax layer with trg embeddings
    if cfg.get("tied_softmax", False):
        if embeds["trg"].lut.weight.shape == \
                model.decoder.output_layer.weight.shape:
            # (also) share trg embeddings and softmax layer:
            model.decoder.output_layer.weight = embeds["trg"].lut.weight
        else:
            raise ConfigurationError(
                "For tied_softmax, the decoder embedding_dim and decoder "
                "hidden_size must be the same."
                "The decoder must be a Transformer.")

    # custom initialization of model parameters
    initialize_model(model, cfg, vocabs["trg"].stoi[PAD_TOKEN])

    return model
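A hypothetical sketch of the "encoders" block this multi-source variant reads: when "columns" is given, an identical encoder is built for each listed column (each column needs an entry in vocabs), and embeddings or encoders may optionally be shared. Keys are inferred from the code above, values are illustrative.

multi_source_cfg = {
    "encoders": {
        "columns": ["src", "lemma"],   # each needs a vocabulary in `vocabs`
        "share_embeddings": True,      # requires identical vocabularies per column
        "share_encoders": False,
        "encoder": {                   # shared per-column encoder config
            "type": "recurrent",
            "hidden_size": 256,
            "embeddings": {"embedding_dim": 128},
        },
    },
    "decoder": {
        "type": "recurrent",
        "hidden_size": 256,
        "embeddings": {"embedding_dim": 128},
    },
}
# model = build_model(cfg=multi_source_cfg, vocabs=vocabs)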
Example #16
def build_unsupervised_nmt_model(
        cfg: dict = None,
        src_vocab: Vocabulary = None,
        trg_vocab: Vocabulary = None) -> UnsupervisedNMTModel:
    """
    Build an UnsupervisedNMTModel.

    :param cfg: model configuration
    :param src_vocab: Vocabulary for the src language
    :param trg_vocab: Vocabulary for the trg language
    :return: Unsupervised NMT model as specified in cfg
    """
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    # build source and target embedding layers
    # embeddings in the encoder are pretrained and stay fixed
    loaded_src_embed = PretrainedEmbeddings(**cfg["encoder"]["embeddings"],
                                            vocab_size=len(src_vocab),
                                            padding_idx=src_padding_idx,
                                            vocab=src_vocab,
                                            freeze=True)

    loaded_trg_embed = PretrainedEmbeddings(**cfg["decoder"]["embeddings"],
                                            vocab_size=len(trg_vocab),
                                            padding_idx=trg_padding_idx,
                                            vocab=trg_vocab,
                                            freeze=True)

    # embeddings in the decoder are randomly initialised and will be learned
    src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                           vocab_size=len(src_vocab),
                           padding_idx=src_padding_idx,
                           freeze=False)

    trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                           vocab_size=len(trg_vocab),
                           padding_idx=trg_padding_idx,
                           freeze=False)

    # build shared encoder
    enc_dropout = cfg["encoder"].get("dropout", 0.)
    enc_emb_dropout = cfg["encoder"]["embeddings"].get("dropout", enc_dropout)
    if cfg["encoder"].get("type", "recurrent") == "transformer":
        assert cfg["encoder"]["embeddings"]["embedding_dim"] == \
               cfg["encoder"]["hidden_size"], \
               "for transformer, emb_size must be hidden_size"

        shared_encoder = TransformerEncoder(**cfg["encoder"],
                                            emb_size=src_embed.embedding_dim,
                                            emb_dropout=enc_emb_dropout)
    else:
        shared_encoder = RecurrentEncoder(**cfg["encoder"],
                                          emb_size=src_embed.embedding_dim,
                                          emb_dropout=enc_emb_dropout)

    # build src and trg language decoder
    dec_dropout = cfg["decoder"].get("dropout", 0.)
    dec_emb_dropout = cfg["decoder"]["embeddings"].get("dropout", dec_dropout)
    if cfg["decoder"].get("type", "recurrent") == "transformer":
        src_decoder = TransformerDecoder(**cfg["decoder"],
                                         encoder=shared_encoder,
                                         vocab_size=len(src_vocab),
                                         emb_size=src_embed.embedding_dim,
                                         emb_dropout=dec_emb_dropout)
        trg_decoder = TransformerDecoder(**cfg["decoder"],
                                         encoder=shared_encoder,
                                         vocab_size=len(trg_vocab),
                                         emb_size=trg_embed.embedding_dim,
                                         emb_dropout=dec_emb_dropout)
    else:
        src_decoder = RecurrentDecoder(**cfg["decoder"],
                                       encoder=shared_encoder,
                                       vocab_size=len(src_vocab),
                                       emb_size=src_embed.embedding_dim,
                                       emb_dropout=dec_emb_dropout)
        trg_decoder = RecurrentDecoder(**cfg["decoder"],
                                       encoder=shared_encoder,
                                       vocab_size=len(trg_vocab),
                                       emb_size=trg_embed.embedding_dim,
                                       emb_dropout=dec_emb_dropout)

    # build unsupervised NMT model
    model = UnsupervisedNMTModel(loaded_src_embed, loaded_trg_embed, src_embed,
                                 trg_embed, shared_encoder, src_decoder,
                                 trg_decoder, src_vocab, trg_vocab)

    # initialise model
    # embed_initializer should be none so loaded encoder embeddings won't be overwritten
    initialize_model(model.src2src_translator, cfg, src_padding_idx,
                     src_padding_idx)
    initialize_model(model.src2trg_translator, cfg, src_padding_idx,
                     trg_padding_idx)
    initialize_model(model.trg2src_translator, cfg, trg_padding_idx,
                     src_padding_idx)
    initialize_model(model.trg2trg_translator, cfg, trg_padding_idx,
                     trg_padding_idx)

    return model