def test_freeze(self):
    """An Embeddings layer built with freeze=True exposes no trainable parameters."""
    frozen = Embeddings(embedding_dim=self.emb_size,
                        vocab_size=self.vocab_size,
                        padding_idx=self.pad_idx,
                        freeze=True)
    for _name, param in frozen.named_parameters():
        self.assertFalse(param.requires_grad)
def test_forward(self):
    """Looking up indices is equivalent to slicing rows from the weight matrix."""
    ref_weights = self._get_random_embedding_weights()
    emb = Embeddings(embedding_dim=self.emb_size,
                     vocab_size=self.vocab_size,
                     padding_idx=self.pad_idx)
    self._fill_embeddings(emb, ref_weights)
    token_ids = torch.Tensor([0, 1, self.pad_idx, 9]).long()
    output = emb.forward(x=token_ids)
    # the embedding operation is just row selection from the weight matrix
    expected = torch.index_select(input=ref_weights, index=token_ids, dim=0)
    self.assertTensorEqual(output, expected)
    # the PAD position must still map to an all-zero vector
    self.assertTensorEqual(output[2], torch.zeros([self.emb_size]))
def build_model(cfg: dict = None,
                src_vocab: Vocabulary = None,
                trg_vocab: Vocabulary = None) -> Model:
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :return: built and initialized model
    """
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                           vocab_size=len(src_vocab),
                           padding_idx=src_padding_idx)

    if not cfg.get("tied_embeddings", False):
        trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                               vocab_size=len(trg_vocab),
                               padding_idx=trg_padding_idx)
    elif src_vocab.itos == trg_vocab.itos:
        # share one embedding matrix between source and target side
        trg_embed = src_embed
    else:
        raise ConfigurationError(
            "Embedding cannot be tied since vocabularies differ.")

    encoder = RecurrentEncoder(**cfg["encoder"],
                               emb_size=src_embed.embedding_dim)
    decoder = RecurrentDecoder(**cfg["decoder"],
                               encoder=encoder,
                               vocab_size=len(trg_vocab),
                               emb_size=trg_embed.embedding_dim)
    model = Model(encoder=encoder,
                  decoder=decoder,
                  src_embed=src_embed,
                  trg_embed=trg_embed,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab)

    # custom initialization of model parameters
    initialize_model(model, cfg, src_padding_idx, trg_padding_idx)
    return model
def test_scale(self):
    """With scale=True the lookup result is multiplied by sqrt(embedding_dim)."""
    ref_weights = self._get_random_embedding_weights()
    emb = Embeddings(embedding_dim=self.emb_size,
                     vocab_size=self.vocab_size,
                     padding_idx=self.pad_idx,
                     scale=True)
    emb.lut.weight.data = ref_weights
    token_ids = torch.Tensor([0, 1, self.pad_idx, 9]).long()
    output = emb.forward(x=token_ids)
    sliced = torch.index_select(input=ref_weights, index=token_ids, dim=0)
    # the raw rows are no longer returned unchanged ...
    self.assertTensorNotEqual(sliced, output)
    # ... they come back scaled by sqrt(embedding_dim)
    self.assertTensorEqual(sliced * (self.emb_size**0.5), output)
def test_pad_zeros(self):
    """The embedding row at padding_idx is initialized to all zeros."""
    emb = Embeddings(embedding_dim=self.emb_size,
                     vocab_size=self.vocab_size,
                     padding_idx=self.pad_idx)
    expected = torch.zeros([self.emb_size])
    self.assertTensorEqual(emb.lut.weight[self.pad_idx], expected)
def build_embeddings(emb_config: dict, vocab: Vocabulary):
    """
    Create an Embeddings layer sized and padded for the given vocabulary.

    :param emb_config: embedding configuration, forwarded as keyword args
    :param vocab: vocabulary providing the size and the padding index
    :return: Embeddings module
    """
    return Embeddings(**emb_config,
                      vocab_size=len(vocab),
                      padding_idx=vocab.stoi[PAD_TOKEN])
def _build(self, batch_size):
    """Create embeddings, a TransformerDecoder and random encoder state for tests."""
    src_time_dim = 4
    vocab_size = 7
    emb = Embeddings(embedding_dim=self.emb_size,
                     vocab_size=vocab_size,
                     padding_idx=self.pad_index)
    decoder = TransformerDecoder(num_layers=self.num_layers,
                                 num_heads=self.num_heads,
                                 hidden_size=self.hidden_size,
                                 ff_size=self.ff_size,
                                 dropout=self.dropout,
                                 emb_dropout=self.dropout,
                                 vocab_size=vocab_size)
    encoder_output = torch.rand(
        size=(batch_size, src_time_dim, self.hidden_size))
    # re-initialize all decoder weights into a fixed range
    for param in decoder.parameters():
        torch.nn.init.uniform_(param, -0.5, 0.5)
    src_mask = torch.ones(size=(batch_size, 1, src_time_dim)) == 1
    encoder_hidden = None  # unused
    return src_mask, emb, decoder, encoder_output, encoder_hidden
def _build(self, batch_size):
    """Create embeddings, recurrent encoder/decoder and random encoder state."""
    src_time_dim = 4
    vocab_size = 7
    emb = Embeddings(embedding_dim=self.emb_size,
                     vocab_size=vocab_size,
                     padding_idx=self.pad_index)
    encoder = RecurrentEncoder(emb_size=self.emb_size,
                               num_layers=self.num_layers,
                               hidden_size=self.encoder_hidden_size,
                               bidirectional=True)
    # NOTE(review): the decoder is sized with self.vocab_size while the
    # embeddings use the local vocab_size (7) — looks intentional for these
    # tests, but worth confirming.
    decoder = RecurrentDecoder(hidden_size=self.hidden_size,
                               encoder=encoder,
                               attention="bahdanau",
                               emb_size=self.emb_size,
                               vocab_size=self.vocab_size,
                               num_layers=self.num_layers,
                               init_hidden="bridge",
                               input_feeding=True)
    encoder_output = torch.rand(
        size=(batch_size, src_time_dim, encoder.output_size))
    # re-initialize all decoder weights into a fixed range
    for param in decoder.parameters():
        torch.nn.init.uniform_(param, -0.5, 0.5)
    src_mask = torch.ones(size=(batch_size, 1, src_time_dim)) == 1
    encoder_hidden = torch.rand(size=(batch_size, encoder.output_size))
    return src_mask, emb, decoder, encoder_output, encoder_hidden
def build_model(cfg: dict = None,
                src_vocab: Vocabulary = None,
                trg_vocab: Vocabulary = None):
    """
    Build and initialize a recurrent encoder-decoder model.

    :param cfg: dictionary configuration containing model specifications
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :return: built and initialized model
    """
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                           vocab_size=len(src_vocab),
                           padding_idx=src_padding_idx)

    # NOTE(review): when "tied_embeddings" is requested but the vocabularies
    # differ, this silently falls back to separate target embeddings instead
    # of raising an error — confirm this is intended.
    tie = (cfg.get("tied_embeddings", False)
           and src_vocab.itos == trg_vocab.itos)
    if tie:
        # share one embedding matrix between source and target side
        trg_embed = src_embed
    else:
        trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                               vocab_size=len(trg_vocab),
                               padding_idx=trg_padding_idx)

    encoder = RecurrentEncoder(**cfg["encoder"],
                               emb_size=src_embed.embedding_dim)
    decoder = RecurrentDecoder(**cfg["decoder"],
                               encoder=encoder,
                               vocab_size=len(trg_vocab),
                               emb_size=trg_embed.embedding_dim)
    model = Model(encoder=encoder,
                  decoder=decoder,
                  src_embed=src_embed,
                  trg_embed=trg_embed,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab)

    # custom initialization of model parameters
    initialize_model(model, cfg, src_padding_idx, trg_padding_idx)
    return model
def setup_model(params: dict,
                data: Flickr8k) -> Tuple[Embeddings, Image2Caption]:
    """
    setup embeddings and seq2seq model
    :param params: params from the yaml file
    :param data: Flickr Dataset class
    """

    def get_base_arch(encoder_name: str) -> Callable:
        """
        wrapper for model, as EfficientNet does not support __name__
        :param encoder_name: name of the encoder to load
        :return: base_arch
        """
        if 'efficientnet' not in encoder_name:
            return getattr(models, encoder_name)
        pretrained = EfficientNet.from_pretrained(encoder_name).to(device)
        pretrained.__name__ = encoder_name
        return pretrained

    encoder = Encoder(get_base_arch(params.get('encoder')), device,
                      pretrained=True)
    vocab_size = len(data.corpus.vocab.itos)

    # any decoder_type other than 'RecurrentDecoder' selects the transformer
    recurrent = params.get('decoder_type',
                           'RecurrentDecoder') == 'RecurrentDecoder'
    decoder_type = CustomRecurrentDecoder if recurrent else TransformerDecoder
    # NOTE(review): the rnn_type/attention/init_hidden kwargs below are also
    # passed when decoder_type is TransformerDecoder — confirm that class
    # accepts them.
    decoder = decoder_type(
        rnn_type=params.get('rnn_type'),
        emb_size=params['embed_size'],
        hidden_size=params['hidden_size'],
        encoder=encoder,
        vocab_size=vocab_size,
        init_hidden='bridge',
        attention=params['attention'],
        hidden_dropout=params['hidden_dropout'],
        emb_dropout=params['emb_dropout'],
        num_layers=params.get('decoder-num_layers', 1)
    )
    embeddings = Embeddings(embedding_dim=params['embed_size'],
                            vocab_size=vocab_size)
    model = Image2Caption(encoder, decoder, embeddings, device,
                          params['freeze_encoder'],
                          params.get('fine_tuning', None),
                          params.get('dropout_after_encoder', 0),
                          params['hidden_size']).to(device)
    return embeddings, model
def test_size(self):
    """The weight matrix has shape (vocab_size, embedding_dim)."""
    emb = Embeddings(embedding_dim=self.emb_size,
                     vocab_size=self.vocab_size,
                     padding_idx=self.pad_idx)
    expected_shape = torch.Size([self.vocab_size, self.emb_size])
    self.assertEqual(emb.lut.weight.shape, expected_shape)
def build_model(cfg: dict = None,
                src_vocab: Vocabulary = None,
                trg_vocab: Vocabulary = None,
                trv_vocab: Vocabulary = None,
                canonizer=None) -> Model:
    """
    Build and initialize the model according to the configuration.

    Supports knowledge-base (KB) task variants: a separate KB-key embedding,
    KB-aware decoders, and a Generator output layer with optional KB biases.

    :param cfg: dictionary configuration containing model specifications
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :param trv_vocab: kb true value lookup vocabulary
    :param canonizer: optional factory; called with copy_from_source to build
        the canonization function (None disables canonization)
    :return: built and initialized model
    """
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    if "embedding_files" in cfg.keys():  # init from pretrained
        assert not cfg.get(
            "tied_embeddings", False
        ), "TODO implement tied embeddings along with pretrained initialization"
        raise NotImplementedError(
            "TODO implement kbsrc embed loading for embedding files")
        # NOTE: everything below in this branch is unreachable because of the
        # raise above; kept as-is pending the TODO.
        weight_tensors = []
        for weight_file in cfg["embedding_files"]:
            with open(weight_file, "r") as f:
                weight = []
                for line in f.readlines():
                    line = line.split()
                    line = [float(x) for x in line]
                    weight.append(line)
            weight = FloatTensor(weight)
            weight_tensors.append(weight)
        # Set source Embeddings to Pretrained Embeddings
        src_embed = Embeddings(
            int(weight_tensors[0][0].shape[0]),
            False,  # TODO transformer: change to True
            len(weight_tensors[0]),
        )
        src_embed.lut.weight.data = weight_tensors[0]
        # Set target Embeddings to Pretrained Embeddings
        trg_embed = Embeddings(
            int(weight_tensors[1][0].shape[0]),
            False,  # TODO transformer: change to True
            len(weight_tensors[1]),
        )
        trg_embed.lut.weight.data = weight_tensors[1]
    else:
        src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                               vocab_size=len(src_vocab),
                               padding_idx=src_padding_idx)
        if cfg.get("kb_embed_separate", False):
            # separate embedding matrix for knowledge-base source keys
            kbsrc_embed = Embeddings(**cfg["encoder"]["embeddings"],
                                     vocab_size=len(src_vocab),
                                     padding_idx=src_padding_idx)
        else:
            # KB keys share the source embedding matrix
            kbsrc_embed = src_embed

        # this ties source and target embeddings
        # for softmax layer tying, see further below
        if cfg.get("tied_embeddings", False):
            if src_vocab.itos == trg_vocab.itos:
                # share embeddings for src and trg
                trg_embed = src_embed
            else:
                raise ConfigurationError(
                    "Embedding cannot be tied since vocabularies differ.")
        else:
            # Latest TODO: init embeddings with
            # vocab_size = len(trg_vocab joined with kb_vocab)
            trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                                   vocab_size=len(trg_vocab),
                                   padding_idx=trg_padding_idx)

    # build encoder
    enc_dropout = cfg["encoder"].get("dropout", 0.)
    enc_emb_dropout = cfg["encoder"]["embeddings"].get("dropout", enc_dropout)
    if cfg["encoder"].get("type", "recurrent") == "transformer":
        assert cfg["encoder"]["embeddings"]["embedding_dim"] == \
            cfg["encoder"]["hidden_size"], \
            "for transformer, emb_size must be hidden_size"
        encoder = TransformerEncoder(**cfg["encoder"],
                                     emb_size=src_embed.embedding_dim,
                                     emb_dropout=enc_emb_dropout)
    else:
        encoder = RecurrentEncoder(**cfg["encoder"],
                                   emb_size=src_embed.embedding_dim,
                                   emb_dropout=enc_emb_dropout)

    # retrieve kb task info
    kb_task = bool(cfg.get("kb", False))
    k_hops = int(
        cfg.get("k_hops", 1)
    )  # k number of kvr attention layers in decoder (eric et al/default: 1)
    same_module_for_all_hops = bool(cfg.get("same_module_for_all_hops",
                                            False))
    do_postproc = bool(cfg.get("do_postproc", True))
    copy_from_source = bool(cfg.get("copy_from_source", True))
    canonization_func = None if canonizer is None else canonizer(
        copy_from_source=copy_from_source)
    kb_input_feeding = bool(cfg.get("kb_input_feeding", True))
    kb_feed_rnn = bool(cfg.get("kb_feed_rnn", True))
    kb_multihead_feed = bool(cfg.get("kb_multihead_feed", False))
    posEncKBkeys = cfg.get("posEncdKBkeys", False)
    tfstyletf = cfg.get("tfstyletf", True)
    infeedkb = bool(cfg.get("infeedkb", False))
    outfeedkb = bool(cfg.get("outfeedkb", False))
    add_kb_biases_to_output = bool(cfg.get("add_kb_biases_to_output", True))
    kb_max_dims = cfg.get("kb_max_dims", (16, 32))  # should be tuple
    double_decoder = cfg.get("double_decoder", False)
    tied_side_softmax = cfg.get(
        "tied_side_softmax",
        False)  # actually use separate linear layers, tying only the main one
    do_pad_kb_keys = cfg.get(
        "pad_kb_keys", True
    )  # doesnt need to be true for 1 hop (=>BIG PERFORMANCE SAVE), needs to be true for >= 2 hops

    # normalize kb_max_dims to a tuple
    if hasattr(kb_max_dims, "__iter__"):
        kb_max_dims = tuple(kb_max_dims)
    else:
        assert type(kb_max_dims) == int, kb_max_dims
        kb_max_dims = (kb_max_dims, )

    assert cfg["decoder"]["hidden_size"]
    dec_dropout = cfg["decoder"].get("dropout", 0.)
    dec_emb_dropout = cfg["decoder"]["embeddings"].get("dropout", dec_dropout)

    # build decoder: transformer vs recurrent, each with a KB-aware variant
    if cfg["decoder"].get("type", "recurrent") == "transformer":
        if tfstyletf:
            decoder = TransformerDecoder(
                **cfg["decoder"],
                encoder=encoder,
                vocab_size=len(trg_vocab),
                emb_size=trg_embed.embedding_dim,
                emb_dropout=dec_emb_dropout,
                kb_task=kb_task,
                kb_key_emb_size=kbsrc_embed.embedding_dim,
                feed_kb_hidden=kb_input_feeding,
                infeedkb=infeedkb,
                outfeedkb=outfeedkb,
                double_decoder=double_decoder)
        else:
            decoder = TransformerKBrnnDecoder(
                **cfg["decoder"],
                encoder=encoder,
                vocab_size=len(trg_vocab),
                emb_size=trg_embed.embedding_dim,
                emb_dropout=dec_emb_dropout,
                kb_task=kb_task,
                k_hops=k_hops,
                kb_max=kb_max_dims,
                same_module_for_all_hops=same_module_for_all_hops,
                kb_key_emb_size=kbsrc_embed.embedding_dim,
                kb_input_feeding=kb_input_feeding,
                kb_feed_rnn=kb_feed_rnn,
                kb_multihead_feed=kb_multihead_feed)
    else:
        if not kb_task:
            decoder = RecurrentDecoder(**cfg["decoder"],
                                       encoder=encoder,
                                       vocab_size=len(trg_vocab),
                                       emb_size=trg_embed.embedding_dim,
                                       emb_dropout=dec_emb_dropout)
        else:
            decoder = KeyValRetRNNDecoder(
                **cfg["decoder"],
                encoder=encoder,
                vocab_size=len(trg_vocab),
                emb_size=trg_embed.embedding_dim,
                emb_dropout=dec_emb_dropout,
                k_hops=k_hops,
                kb_max=kb_max_dims,
                same_module_for_all_hops=same_module_for_all_hops,
                kb_key_emb_size=kbsrc_embed.embedding_dim,
                kb_input_feeding=kb_input_feeding,
                kb_feed_rnn=kb_feed_rnn,
                kb_multihead_feed=kb_multihead_feed,
                do_pad_kb_keys=do_pad_kb_keys)

    # specify generator which is mostly just the output layer
    generator = Generator(dec_hidden_size=cfg["decoder"]["hidden_size"],
                          vocab_size=len(trg_vocab),
                          add_kb_biases_to_output=add_kb_biases_to_output,
                          double_decoder=double_decoder)

    model = Model(
        encoder=encoder,
        decoder=decoder,
        generator=generator,
        src_embed=src_embed,
        trg_embed=trg_embed,
        src_vocab=src_vocab,
        trg_vocab=trg_vocab,
        kb_key_embed=kbsrc_embed,
        trv_vocab=trv_vocab,
        k_hops=k_hops,
        do_postproc=do_postproc,
        canonize=canonization_func,
        kb_att_dims=len(kb_max_dims),
        posEncKBkeys=posEncKBkeys
    )

    # tie softmax layer with trg embeddings
    if cfg.get("tied_softmax", False):
        if trg_embed.lut.weight.shape == \
                model.generator.output_layer.weight.shape:
            # (also) share trg embeddings and softmax layer:
            model.generator.output_layer.weight = trg_embed.lut.weight
            if model.generator.double_decoder:
                # (also also) share trg embeddings and side softmax layer
                assert hasattr(model.generator, "side_output_layer")
                if tied_side_softmax:
                    # because of distributivity this becomes O (x_1+x_2)
                    # instead of O_1 x_1 + O_2 x_2
                    model.generator.side_output_layer.weight = \
                        trg_embed.lut.weight
        else:
            raise ConfigurationError(
                "For tied_softmax, the decoder embedding_dim and decoder "
                "hidden_size must be the same."
                "The decoder must be a Transformer.")

    # custom initialization of model parameters
    initialize_model(model, cfg, src_padding_idx, trg_padding_idx)
    return model
def build_model(cfg: dict = None,
                src_vocab: Vocabulary = None,
                trg_vocab: Vocabulary = None) -> Model:
    """
    Build and initialize the model according to the configuration.

    :param cfg: dictionary configuration containing model specifications
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :return: built and initialized model
    """
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                           vocab_size=len(src_vocab),
                           padding_idx=src_padding_idx)

    # source/target embedding tying (softmax tying is handled further below)
    if not cfg.get("tied_embeddings", False):
        trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                               vocab_size=len(trg_vocab),
                               padding_idx=trg_padding_idx)
    elif src_vocab.itos == trg_vocab.itos:
        # share embeddings for src and trg
        trg_embed = src_embed
    else:
        raise ConfigurationError(
            "Embedding cannot be tied since vocabularies differ.")

    # build encoder
    enc_cfg = cfg["encoder"]
    enc_dropout = enc_cfg.get("dropout", 0.)
    enc_emb_dropout = enc_cfg["embeddings"].get("dropout", enc_dropout)
    if enc_cfg.get("type", "recurrent") == "transformer":
        assert enc_cfg["embeddings"]["embedding_dim"] == \
            enc_cfg["hidden_size"], \
            "for transformer, emb_size must be hidden_size"
        encoder = TransformerEncoder(**enc_cfg,
                                     emb_size=src_embed.embedding_dim,
                                     emb_dropout=enc_emb_dropout)
    else:
        encoder = RecurrentEncoder(**enc_cfg,
                                   emb_size=src_embed.embedding_dim,
                                   emb_dropout=enc_emb_dropout)

    # build decoder
    dec_cfg = cfg["decoder"]
    dec_dropout = dec_cfg.get("dropout", 0.)
    dec_emb_dropout = dec_cfg["embeddings"].get("dropout", dec_dropout)
    dec_class = (TransformerDecoder
                 if dec_cfg.get("type", "recurrent") == "transformer"
                 else RecurrentDecoder)
    decoder = dec_class(**dec_cfg,
                        encoder=encoder,
                        vocab_size=len(trg_vocab),
                        emb_size=trg_embed.embedding_dim,
                        emb_dropout=dec_emb_dropout)

    model = Model(encoder=encoder,
                  decoder=decoder,
                  src_embed=src_embed,
                  trg_embed=trg_embed,
                  src_vocab=src_vocab,
                  trg_vocab=trg_vocab)

    # tie softmax layer with trg embeddings
    if cfg.get("tied_softmax", False):
        if trg_embed.lut.weight.shape == \
                model.decoder.output_layer.weight.shape:
            # (also) share trg embeddings and softmax layer:
            model.decoder.output_layer.weight = trg_embed.lut.weight
        else:
            raise ConfigurationError(
                "For tied_softmax, the decoder embedding_dim and decoder "
                "hidden_size must be the same."
                "The decoder must be a Transformer.")

    # custom initialization of model parameters
    initialize_model(model, cfg, src_padding_idx, trg_padding_idx)
    return model
def build_pretrained_model(cfg: dict = None,
                           pretrained_model: Model = None,
                           pretrained_src_vocab: Vocabulary = None,
                           src_vocab: Vocabulary = None,
                           trg_vocab: Vocabulary = None) -> Model:
    """
    Build a model that reuses the encoder and (remapped) source embeddings of
    an already-trained model; the target side is built fresh from the config.

    :param cfg: dictionary configuration containing model specifications
    :param pretrained_model: trained model supplying the encoder and the
        source embedding weights
    :param pretrained_src_vocab: source vocabulary of the pretrained model
    :param src_vocab: source vocabulary
    :param trg_vocab: target vocabulary
    :return: built model (custom initialization is deliberately disabled below)
    """
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                           vocab_size=len(src_vocab),
                           padding_idx=src_padding_idx)

    # copy pretrained rows for every word shared by both source vocabularies;
    # words absent from src_vocab are collected, and rows never assigned stay
    # all-zero
    embedding_matrix = np.zeros((len(src_vocab), src_embed.embedding_dim))
    unknown_words = []
    for w in pretrained_src_vocab.itos:
        try:
            pre_ix = pretrained_src_vocab.stoi[w]
            ix = src_vocab.stoi[w]
            embedding_matrix[ix] = pretrained_model.src_embed.lut.weight[
                pre_ix].cpu().detach().numpy()
        except KeyError:
            # NOTE(review): assumes stoi raises KeyError for OOV words — a
            # defaultdict-style vocab would instead map silently; verify.
            unknown_words.append(w)
    src_embed.lut.weight = torch.nn.Parameter(
        torch.tensor(embedding_matrix, dtype=torch.float32))

    trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                           vocab_size=len(trg_vocab),
                           padding_idx=trg_padding_idx)

    # build decoder
    dec_dropout = cfg["decoder"].get("dropout", 0.)
    dec_emb_dropout = cfg["decoder"]["embeddings"].get("dropout", dec_dropout)

    # reuse the pretrained encoder and keep fine-tuning it
    encoder = pretrained_model.encoder
    encoder.train()
    set_requires_grad(encoder, True)

    # build encoder
    #enc_dropout = cfg["encoder"].get("dropout", 0.)
    #enc_emb_dropout = cfg["encoder"]["embeddings"].get("dropout", enc_dropout)
    #if cfg["encoder"].get("type", "recurrent") == "transformer":
    #    assert cfg["encoder"]["embeddings"]["embedding_dim"] == \
    #           cfg["encoder"]["hidden_size"], \
    #           "for transformer, emb_size must be hidden_size"
    #    encoder = TransformerEncoder(**cfg["encoder"],
    #                                 emb_size=src_embed.embedding_dim,
    #                                 emb_dropout=enc_emb_dropout)
    #else:
    #    encoder = RecurrentEncoder(**cfg["encoder"],
    #                               emb_size=src_embed.embedding_dim,
    #                               emb_dropout=enc_emb_dropout)

    if cfg["decoder"].get("type", "recurrent") == "transformer":
        decoder = TransformerDecoder(**cfg["decoder"],
                                     encoder=encoder,
                                     vocab_size=len(trg_vocab),
                                     emb_size=trg_embed.embedding_dim,
                                     emb_dropout=dec_emb_dropout)
    else:
        decoder = RecurrentDecoder(**cfg["decoder"],
                                   encoder=encoder,
                                   vocab_size=len(trg_vocab),
                                   emb_size=trg_embed.embedding_dim,
                                   emb_dropout=dec_emb_dropout)

    # NOTE(review): the model is registered with the *pretrained* source
    # vocabulary, not the src_vocab used to size src_embed — confirm intended.
    model = Model(encoder=encoder,
                  decoder=decoder,
                  src_embed=src_embed,
                  trg_embed=trg_embed,
                  src_vocab=pretrained_model.src_vocab,
                  trg_vocab=trg_vocab)

    # tie softmax layer with trg embeddings
    if cfg.get("tied_softmax", False):
        if trg_embed.lut.weight.shape == \
                model.decoder.output_layer.weight.shape:
            # (also) share trg embeddings and softmax layer:
            model.decoder.output_layer.weight = trg_embed.lut.weight
        else:
            raise ConfigurationError(
                "For tied_softmax, the decoder embedding_dim and decoder "
                "hidden_size must be the same."
                "The decoder must be a Transformer.")

    # custom initialization of model parameters
    #initialize_model(model, cfg, src_padding_idx, trg_padding_idx)
    return model
def build_model(cfg: dict = None, vocabs: dict = None) -> Model:
    """
    Build and initialize the model according to the configuration.

    Supports one encoder per input column (multi-source) as well as the
    classic single "src" encoder, with optional sharing of embeddings and
    encoders across columns.

    :param cfg: dictionary configuration containing model specifications
    :param vocabs: mapping from column name (including "trg") to Vocabulary
    :return: built and initialized model
    """
    if "encoders" in cfg:
        # two cases: are columns provided? If so, make an identical encoder
        # for each of them.
        # If instead keys are given...
        if "columns" in cfg["encoders"]:
            enc_columns = cfg["encoders"]["columns"]
            assert all(column in vocabs for column in enc_columns)
            shared_cfg = cfg["encoders"]["encoder"]
            enc_configs = {column: shared_cfg for column in enc_columns}
            share_embs = cfg["encoders"].get("share_embeddings", False)
            share_encoders = cfg["encoders"].get("share_encoders", False)
            if share_embs:
                # sharing embeddings requires identical source vocabularies
                any_v = next(v for k, v in vocabs.items() if k != "trg")
                assert all(v == any_v for k, v in vocabs.items()
                           if k != "trg")
        else:
            enc_columns = list(cfg["encoders"].keys())
            enc_configs = cfg["encoders"]
            share_embs = False
            share_encoders = False
    else:
        # single-source fallback: one encoder for the "src" column
        enc_columns = ["src"]
        enc_configs = {"src": cfg["encoder"]}
        share_embs = False
        share_encoders = False

    dec_config = cfg["decoder"]
    # NOTE(review): this indexes every encoder config with ["embeddings"];
    # a config that only defines "feature_embeddings" would raise KeyError
    # here — verify against the supported config shapes.
    emb_configs = {
        name: enc_config["embeddings"]
        for name, enc_config in enc_configs.items()
    }
    emb_configs["trg"] = dec_config["embeddings"]

    embeds = dict()
    encoders = dict()
    for enc_column, enc_cfg in enc_configs.items():
        # make each encoder
        if "feature_embeddings" in enc_cfg:
            # feature embeddings features come from label fields of a tsv
            embed = build_feature_embeddings(enc_cfg["feature_embeddings"],
                                             vocabs, enc_column)
        else:
            if share_embs and embeds:
                # get something that's already in the dict
                embed = next(iter(embeds.values()))
            else:
                # make a new embedding matrix
                vocab = vocabs[enc_column]
                emb_cfg = enc_cfg["embeddings"]
                embed = Embeddings(**emb_cfg,
                                   vocab_size=len(vocab),
                                   padding_idx=vocab.stoi[PAD_TOKEN])
        embeds[enc_column] = embed

        if share_encoders and encoders:
            # reuse the first encoder that was built
            encoder = next(iter(encoders.values()))
        else:
            enc_dropout = enc_cfg.get("dropout", 0.)
            enc_emb_dropout = enc_cfg["embeddings"].get("dropout",
                                                        enc_dropout)
            enc_type = enc_cfg.get("type", "recurrent")
            '''
            if enc_type == "transformer":
                enc_emb_size = emb_cfg["embedding_dim"]
                enc_hidden_size = enc_cfg["hidden_size"]
                assert enc_emb_size == enc_hidden_size, \
                    "for transformer, emb_size must be hidden_size"
            '''
            enc_class = TransformerEncoder if enc_type == "transformer" \
                else RecurrentEncoder
            encoder = enc_class(**enc_cfg,
                                emb_size=embed.embedding_dim,
                                emb_dropout=enc_emb_dropout)
        encoders[enc_column] = encoder

    trg_vocab = vocabs["trg"]

    # this ties source and target embeddings
    # for softmax layer tying, see further below
    if cfg.get("tied_embeddings", False):
        assert vocabs["src"].itos == vocabs["trg"].itos, \
            "Embedding cannot be tied because vocabularies differ."
        embeds["trg"] = embeds["src"]
    else:
        # build the target embeddings
        if "feature_embeddings" in dec_config:
            # feature embeddings features come from label fields of a tsv
            embed = build_feature_embeddings(dec_config["feature_embeddings"],
                                             vocabs, "trg")
        else:
            trg_vocab = vocabs["trg"]
            dec_emb_cfg = dec_config["embeddings"]
            embed = Embeddings(**dec_emb_cfg,
                               vocab_size=len(trg_vocab),
                               padding_idx=trg_vocab.stoi[PAD_TOKEN])
        embeds["trg"] = embed

    # build decoder
    dec_dropout = dec_config.get("dropout", 0.)
    dec_type = dec_config.get("type", "recurrent")
    dec_class = TransformerDecoder if dec_type == "transformer" \
        else RecurrentDecoder
    # NOTE(review): `encoder` is whichever encoder the loop built last — with
    # multiple heterogeneous encoders, confirm output_size is shared by all.
    decoder = dec_class(**dec_config,
                        encoder_output_size=encoder.output_size,
                        vocab_size=len(vocabs["trg"]),
                        emb_size=embeds["trg"].embedding_dim,
                        emb_dropout=emb_configs["trg"].get(
                            "dropout", dec_dropout),
                        multi_source=len(encoders) > 1,
                        head_names=list(encoders.keys()))

    if len(encoders) == 1:
        model = Model(encoder=encoders["src"],
                      decoder=decoder,
                      src_embed=embeds["src"],
                      trg_embed=embeds["trg"],
                      src_vocab=vocabs["src"],
                      trg_vocab=vocabs["trg"])
    else:
        model = MultiSourceModel(encoders=encoders,
                                 decoder=decoder,
                                 embeds=embeds,
                                 vocabs=vocabs)

    # tie softmax layer with trg embeddings
    if cfg.get("tied_softmax", False):
        if embeds["trg"].lut.weight.shape == \
                model.decoder.output_layer.weight.shape:
            # (also) share trg embeddings and softmax layer:
            model.decoder.output_layer.weight = embeds["trg"].lut.weight
        else:
            raise ConfigurationError(
                "For tied_softmax, the decoder embedding_dim and decoder "
                "hidden_size must be the same."
                "The decoder must be a Transformer.")

    # custom initialization of model parameters
    initialize_model(model, cfg, vocabs["trg"].stoi[PAD_TOKEN])
    return model
def build_unsupervised_nmt_model(
        cfg: dict = None,
        src_vocab: Vocabulary = None,
        trg_vocab: Vocabulary = None) -> UnsupervisedNMTModel:
    """
    Build an UnsupervisedNMTModel.

    :param cfg: model configuration
    :param src_vocab: Vocabulary for the src language
    :param trg_vocab: Vocabulary for the trg language
    :return: Unsupervised NMT model as specified in cfg
    """
    src_padding_idx = src_vocab.stoi[PAD_TOKEN]
    trg_padding_idx = trg_vocab.stoi[PAD_TOKEN]

    # build source and target embedding layers
    # embeddings in the encoder are pretrained and stay fixed
    loaded_src_embed = PretrainedEmbeddings(**cfg["encoder"]["embeddings"],
                                            vocab_size=len(src_vocab),
                                            padding_idx=src_padding_idx,
                                            vocab=src_vocab,
                                            freeze=True)
    loaded_trg_embed = PretrainedEmbeddings(**cfg["decoder"]["embeddings"],
                                            vocab_size=len(trg_vocab),
                                            padding_idx=trg_padding_idx,
                                            vocab=trg_vocab,
                                            freeze=True)

    # embeddings in the decoder are randomly initialised and will be learned
    src_embed = Embeddings(**cfg["encoder"]["embeddings"],
                           vocab_size=len(src_vocab),
                           padding_idx=src_padding_idx,
                           freeze=False)
    trg_embed = Embeddings(**cfg["decoder"]["embeddings"],
                           vocab_size=len(trg_vocab),
                           padding_idx=trg_padding_idx,
                           freeze=False)

    # build shared encoder
    enc_dropout = cfg["encoder"].get("dropout", 0.)
    enc_emb_dropout = cfg["encoder"]["embeddings"].get("dropout", enc_dropout)
    if cfg["encoder"].get("type", "recurrent") == "transformer":
        assert cfg["encoder"]["embeddings"]["embedding_dim"] == \
            cfg["encoder"]["hidden_size"], \
            "for transformer, emb_size must be hidden_size"
        shared_encoder = TransformerEncoder(**cfg["encoder"],
                                            emb_size=src_embed.embedding_dim,
                                            emb_dropout=enc_emb_dropout)
    else:
        shared_encoder = RecurrentEncoder(**cfg["encoder"],
                                          emb_size=src_embed.embedding_dim,
                                          emb_dropout=enc_emb_dropout)

    # build src and trg language decoder
    dec_dropout = cfg["decoder"].get("dropout", 0.)
    dec_emb_dropout = cfg["decoder"]["embeddings"].get("dropout", dec_dropout)
    if cfg["decoder"].get("type", "recurrent") == "transformer":
        src_decoder = TransformerDecoder(**cfg["decoder"],
                                         encoder=shared_encoder,
                                         vocab_size=len(src_vocab),
                                         emb_size=src_embed.embedding_dim,
                                         emb_dropout=dec_emb_dropout)
        trg_decoder = TransformerDecoder(**cfg["decoder"],
                                         encoder=shared_encoder,
                                         vocab_size=len(trg_vocab),
                                         emb_size=trg_embed.embedding_dim,
                                         emb_dropout=dec_emb_dropout)
    else:
        src_decoder = RecurrentDecoder(**cfg["decoder"],
                                       encoder=shared_encoder,
                                       vocab_size=len(src_vocab),
                                       emb_size=src_embed.embedding_dim,
                                       emb_dropout=dec_emb_dropout)
        trg_decoder = RecurrentDecoder(**cfg["decoder"],
                                       encoder=shared_encoder,
                                       vocab_size=len(trg_vocab),
                                       emb_size=trg_embed.embedding_dim,
                                       emb_dropout=dec_emb_dropout)

    # build unsupervised NMT model
    model = UnsupervisedNMTModel(loaded_src_embed, loaded_trg_embed,
                                 src_embed, trg_embed, shared_encoder,
                                 src_decoder, trg_decoder, src_vocab,
                                 trg_vocab)

    # initialise model
    # embed_initializer should be none so loaded encoder embeddings won't be
    # overwritten
    initialize_model(model.src2src_translator, cfg, src_padding_idx,
                     src_padding_idx)
    initialize_model(model.src2trg_translator, cfg, src_padding_idx,
                     trg_padding_idx)
    initialize_model(model.trg2src_translator, cfg, trg_padding_idx,
                     src_padding_idx)
    # BUG FIX: this call previously re-initialized trg2src_translator a second
    # time and left trg2trg_translator untouched; the (trg, trg) padding
    # indices show trg2trg was the intended target.
    initialize_model(model.trg2trg_translator, cfg, trg_padding_idx,
                     trg_padding_idx)
    return model