def build_decoder_and_generator(model_opt, fields):
    # Build decoder.
    tgt_field = fields["tgt"]
    tgt_emb = build_embeddings(model_opt, tgt_field, for_encoder=False)
    decoder = build_decoder(model_opt, tgt_emb)

    # Build generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size,
                      len(fields["tgt"].base_field.vocab)),
            Cast(torch.float32),
            gen_func)
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        tgt_base_field = fields["tgt"].base_field
        vocab_size = len(tgt_base_field.vocab)
        pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
        generator = CopyGenerator(model_opt.dec_rnn_size, vocab_size, pad_idx)

    return decoder, generator, tgt_emb
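
# `Cast` above comes from OpenNMT-py (onmt.modules.util_class). A minimal
# sketch of what such a module plausibly does, assuming it simply casts its
# input to a fixed dtype so the generator can run in fp32 under fp16 training:
import torch
import torch.nn as nn

class Cast(nn.Module):
    def __init__(self, dtype):
        super().__init__()
        self._dtype = dtype

    def forward(self, x):
        # Returns the same tensor if it already has the target dtype.
        return x.to(self._dtype)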
def make_generator(model_opt, decoder, fields, des='tgt'):
    # Make Generator.
    if not model_opt.copy_attn:
        generator = nn.Sequential(
            nn.Linear(model_opt.rnn_size, len(fields[des].vocab)),
            nn.LogSoftmax(dim=-1))
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        generator = CopyGenerator(model_opt.rnn_size, fields[des].vocab)
    return generator
def build_model(opt, dicts, nSets):
    opt = update_opt(opt)
    encoder = onmt.Models.Encoder(opt, dicts['src'])
    decoder = onmt.Models.Decoder(opt, dicts['tgt'], nSets)
    if opt.copy_pointer:
        generator = CopyGenerator(opt, dicts['tgt'])
    else:
        generator = onmt.Models.Generator(opt, dicts['tgt'])
    print(generator)
    model = onmt.Models.NMTModel(encoder, decoder)
    if opt.share_embedding:
        model.shareEmbedding(dicts)
    if opt.share_projection:
        model.shareProjection(generator)
    return model, generator
def build_base_model(model_opt, fields, gpu, checkpoint=None):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        fields: `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by the training phase, or a resumed
            snapshot model from interrupted training.
    Returns:
        the NMTModel.
    """
    assert model_opt.model_type in ["text", "img", "audio"], \
        "Unsupported model type %s" % model_opt.model_type

    # For backward compatibility.
    if model_opt.rnn_size != -1:
        model_opt.enc_rnn_size = model_opt.rnn_size
        model_opt.dec_rnn_size = model_opt.rnn_size

    # Build embeddings.
    if model_opt.model_type == "text":
        src_fields = [f for n, f in fields['src']]
        assert len(src_fields) == 1
        src_field = src_fields[0]
        src_emb = build_embeddings(model_opt, src_field)
    else:
        src_emb = None

    # Build encoder.
    encoder = build_encoder(model_opt, src_emb)

    # Build decoder.
    tgt_fields = [f for n, f in fields['tgt']]
    assert len(tgt_fields) == 1
    tgt_field = tgt_fields[0]
    tgt_emb = build_embeddings(model_opt, tgt_field, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
            "preprocess with -share_vocab if you use share_embeddings"
        tgt_emb.word_lut.weight = src_emb.word_lut.weight

    decoder = build_decoder(model_opt, tgt_emb)

    # Build NMTModel (= encoder + decoder).
    device = torch.device("cuda" if gpu else "cpu")
    model = onmt.models.NMTModel(encoder, decoder)

    # Build generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size,
                      len(fields["tgt"][0][1].base_field.vocab)),
            gen_func)
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        assert len(fields["tgt"]) == 1
        tgt_base_field = fields["tgt"][0][1].base_field
        vocab_size = len(tgt_base_field.vocab)
        pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
        generator = CopyGenerator(model_opt.dec_rnn_size, vocab_size, pad_idx)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward compatibility for models using a custom
        # layer norm implementation.
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {fix_key(k): v
                               for k, v in checkpoint['model'].items()}
        # End of patch for backward compatibility.
        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)
    else:
        if model_opt.param_init != 0.0:
            if model_opt.bert:
                for p in model.decoder.parameters():
                    p.data.uniform_(-model_opt.param_init,
                                    model_opt.param_init)
            else:
                for p in model.parameters():
                    p.data.uniform_(-model_opt.param_init,
                                    model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            if model_opt.bert:
                for p in model.decoder.parameters():
                    if p.dim() > 1:
                        xavier_uniform_(p)
            else:
                for p in model.parameters():
                    if p.dim() > 1:
                        xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if model_opt.pre_word_vecs_enc is not None:
            if hasattr(model.encoder, 'embeddings'):
                model.encoder.embeddings.load_pretrained_vectors(
                    model_opt.pre_word_vecs_enc)
        if model_opt.pre_word_vecs_dec is not None:
            if hasattr(model.decoder, 'embeddings'):
                model.decoder.embeddings.load_pretrained_vectors(
                    model_opt.pre_word_vecs_dec)

    model.generator = generator
    model.to(device)

    return model
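
# A minimal, self-contained demo of what `fix_key` above does: old
# checkpoints saved custom layer-norm parameters under `a_2`/`b_2`, and the
# regexes rename them to the `weight`/`bias` keys the current modules expect.
import re

def fix_key(s):
    s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2', r'\1.layer_norm\2.bias', s)
    s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2', r'\1.layer_norm\2.weight', s)
    return s

assert fix_key('decoder.transformer_layers.0.layer_norm_1.a_2') == \
    'decoder.transformer_layers.0.layer_norm_1.weight'
assert fix_key('encoder.layer_norm.b_2') == 'encoder.layer_norm.bias'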
def make_base_model(model_opt, fields, gpu, checkpoint=None):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        fields: `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by the training phase, or a resumed
            snapshot model from interrupted training.
    Returns:
        the NMTModel.
    """
    assert model_opt.model_type in ["text", "img", "audio"], \
        "Unsupported model type %s" % model_opt.model_type

    # Make encoder.
    if model_opt.model_type == "text":
        src_dict = fields["src"].vocab
        feature_dicts = onmt.io.collect_feature_vocabs(fields, 'src')
        src_embeddings = make_embeddings(model_opt, src_dict, feature_dicts)
        encoder = make_encoder(model_opt, src_embeddings)
    elif model_opt.model_type == "img":
        encoder = ImageEncoder(model_opt.enc_layers, model_opt.brnn,
                               model_opt.rnn_size, model_opt.dropout)
    elif model_opt.model_type == "audio":
        encoder = AudioEncoder(model_opt.enc_layers, model_opt.brnn,
                               model_opt.rnn_size, model_opt.dropout,
                               model_opt.sample_rate, model_opt.window_size)

    # Make decoder.
    tgt_dict = fields["tgt"].vocab
    feature_dicts = onmt.io.collect_feature_vocabs(fields, 'tgt')
    tgt_embeddings = make_embeddings(model_opt, tgt_dict, feature_dicts,
                                     for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        if src_dict != tgt_dict:
            raise AssertionError('`-share_vocab` should be set during '
                                 'preprocess if you use share_embeddings!')
        tgt_embeddings.word_lut.weight = src_embeddings.word_lut.weight

    # Make NMTModel (= encoder + decoder).
    if model_opt.encoder_type == "trigramrnn" and \
            model_opt.decoder_type == "rnn":
        decoder = make_decoder(model_opt, tgt_embeddings)
        model = NMTSourceTrigramModel(encoder, decoder)
    elif model_opt.encoder_type == "brnn" and \
            model_opt.decoder_type == "charrnn":
        decoder1, decoder2 = make_decoder(model_opt, tgt_embeddings)
        model = NMTTargetCharModel(encoder, decoder1, decoder2)
    elif model_opt.encoder_type == "trigramrnn" and \
            model_opt.decoder_type == "charrnn":
        decoder1, decoder2 = make_decoder(model_opt, tgt_embeddings)
        model = CharNMTModel(encoder, decoder1, decoder2)
    else:
        decoder = make_decoder(model_opt, tgt_embeddings)
        model = NMTModel(encoder, decoder)
    model.model_type = model_opt.model_type

    # Make Generator.
    if not model_opt.copy_attn:
        generator = nn.Sequential(
            nn.Linear(model_opt.rnn_size, len(fields["tgt"].vocab)),
            nn.LogSoftmax(dim=-1))
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        generator = CopyGenerator(model_opt.rnn_size, fields["tgt"].vocab)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        print('Loading model parameters.')
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
    else:
        if model_opt.param_init != 0.0:
            print('Initializing model parameters.')
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.pre_encoder:
            # Initialize the encoder from a pretrained model, then freeze it.
            pretrained = torch.load(model_opt.pre_encoder)
            encoder_dict = {}
            for key in pretrained['model']:
                if key.startswith('encoder'):
                    encoder_dict[key] = pretrained['model'][key]
            model_dict = model.state_dict()
            model_dict.update(encoder_dict)
            model.load_state_dict(model_dict)
            for p in model.encoder.parameters():
                p.requires_grad = False
        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc, model_opt.fix_word_vecs_enc)
        if model_opt.decoder_type == "charrnn":
            if hasattr(model.decoder1, 'embeddings'):
                model.decoder1.embeddings.load_pretrained_vectors(
                    model_opt.pre_word_vecs_dec, model_opt.fix_word_vecs_dec)
        else:
            if hasattr(model.decoder, 'embeddings'):
                model.decoder.embeddings.load_pretrained_vectors(
                    model_opt.pre_word_vecs_dec, model_opt.fix_word_vecs_dec)

    # Add generator to model (this registers it as a parameter of model).
    model.generator = generator

    # Move the whole model to GPU if requested.
    if gpu:
        model.cuda()
    else:
        model.cpu()

    return model
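
# A minimal, self-contained sketch (hypothetical sizes) of the weight tying
# done by `share_decoder_embeddings` above: assigning one Parameter to both
# modules makes them share storage, so gradients from either path update the
# same tensor. nn.Linear's weight is [out, in] = [vocab, hidden], which
# matches nn.Embedding's [vocab, hidden].
import torch.nn as nn

vocab_size, hidden = 100, 16          # illustrative sizes
embedding = nn.Embedding(vocab_size, hidden)
projection = nn.Linear(hidden, vocab_size, bias=False)
projection.weight = embedding.weight  # tie: both point at the same Parameter
assert projection.weight.data_ptr() == embedding.weight.data_ptr()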
def build_base_model(model_opt, fields, gpu, checkpoint=None, gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by the training phase, or a resumed
            snapshot model from interrupted training.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """
    # Build embeddings.
    if model_opt.model_type == "text":
        src_field = fields["src"]
        src_emb = build_embeddings(model_opt, src_field)
    else:
        src_emb = None

    # Build encoder.
    encoder = build_encoder(model_opt, src_emb)

    # Build decoder.
    if not model_opt.domain_cls_enc:
        tgt_field = fields["tgt"]
        tgt_emb = build_embeddings(model_opt, tgt_field, for_encoder=False)

        # Share the embedding matrix - preprocess with share_vocab required.
        if model_opt.share_embeddings and model_opt.encoder_type != 'bert':
            # src/tgt vocab should be the same if `-share_vocab` is specified.
            assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
                "preprocess with -share_vocab if you use share_embeddings"
            tgt_emb.word_lut.weight = src_emb.word_lut.weight

        decoder = build_decoder(model_opt, tgt_emb)

    # Build NMTModel (= encoder + decoder).
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    if model_opt.domain_cls_enc:
        model = onmt.models.Domain_CLS_ENC(encoder, model_opt)
    else:
        model = onmt.models.NMTModel(encoder, decoder, tgt_field, model_opt)

    # Build generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        if model_opt.user_bias != "none":
            # Multi-task case: user_bias + domain_cls.
            generator = nn.Sequential(
                nn.Linear(model_opt.dec_rnn_size,
                          len(fields["tgt"].base_field.vocab)),
                Cast(torch.float32))
            if model_opt.domain_classify:
                dom_classifier = nn.Sequential(
                    nn.Linear(model_opt.dec_rnn_size, model_opt.domain_len),
                    Cast(torch.float32),
                    gen_func)
                model.dom_classifier = dom_classifier
        else:
            # Single-task case: user_bias or user_cls or domain_cls.
            generator = nn.Sequential(
                nn.Linear(model_opt.dec_rnn_size,
                          len(fields["tgt"].base_field.vocab)),
                Cast(torch.float32),
                gen_func)
            if model_opt.user_classify:
                classifier = nn.Sequential(
                    nn.Linear(model_opt.dec_rnn_size, model_opt.user_len),
                    Cast(torch.float32),
                    gen_func)
                model.classifier = classifier
            if model_opt.domain_classify or model_opt.domain_cls_enc:
                dom_classifier = nn.Sequential(
                    nn.Linear(model_opt.dec_rnn_size, model_opt.domain_len),
                    Cast(torch.float32),
                    gen_func)
                model.dom_classifier = dom_classifier
        if model_opt.share_decoder_embeddings and \
                not model_opt.domain_cls_enc:
            if not model_opt.copy_attn:
                generator[0].weight = decoder.embeddings.word_lut.weight
            else:
                generator.linear.weight = decoder.embeddings.word_lut.weight
    else:
        tgt_base_field = fields["tgt"].base_field
        vocab_size = len(tgt_base_field.vocab)
        pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
        generator = CopyGenerator(model_opt.dec_rnn_size, vocab_size, pad_idx)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward compatibility for models using a custom
        # layer norm implementation.
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {fix_key(k): v
                               for k, v in checkpoint['model'].items()}
        # End of patch for backward compatibility.
        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)
    elif model_opt.encoder_type != 'bert' or model_opt.decoder_type != 'bert':
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
        if (hasattr(model.encoder, 'embeddings')
                and model_opt.encoder_type != 'bert'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc)
        if not model_opt.domain_cls_enc and (
                hasattr(model.decoder, 'embeddings')
                and model_opt.decoder_type != 'bert'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec)

    if model_opt.encoder_type == 'bert' or model_opt.decoder_type == 'bert':
        if model_opt.bert_type != 'none':
            model_opt.enc_bert_type = model_opt.bert_type
            model_opt.dec_bert_type = model_opt.bert_type
        if model_opt.enc_bert_type != 'none' and checkpoint is None:
            model.encoder.initialize_bert(model_opt.enc_bert_type)
        if model_opt.dec_bert_type != 'none' and checkpoint is None:
            model.decoder.initialize_bert(model_opt.dec_bert_type)

        # Tie the word embedding layers of encoder BERT and decoder.
        if model_opt.encoder_type == 'bert' and model_opt.share_embeddings:
            decoder.embeddings.word_lut.weight = \
                encoder.embeddings.word_lut.weight

        # Tie the decoder word embedding layer with the generator weights.
        if model_opt.share_decoder_embeddings:
            if not model_opt.copy_attn:
                generator[0].weight = decoder.embeddings.word_lut.weight
            else:
                generator.linear.weight = decoder.embeddings.word_lut.weight

    if model_opt.encoder_type == 'bert' and model_opt.decoder_type == 'bert':
        # Tie the word, position and token_type embedding layers of the
        # encoder and decoder BERT.
        if model_opt.share_embeddings:
            decoder.embeddings.position_embeddings.weight = \
                encoder.embeddings.position_embeddings.weight
            decoder.embeddings.token_type_embeddings.weight = \
                encoder.embeddings.token_type_embeddings.weight

        # Tie self-attention between encoder and decoder.
        if model_opt.share_self_attn:
            for encoder_layer, decoder_layer in zip(
                    encoder.encoder.layer, decoder.transformer_layers):
                # Query.
                clone_or_share_layer(decoder_layer.self_attn.linear_query,
                                     encoder_layer.attention.self.query,
                                     share=True)
                # Key.
                clone_or_share_layer(decoder_layer.self_attn.linear_keys,
                                     encoder_layer.attention.self.key,
                                     share=True)
                # Value.
                clone_or_share_layer(decoder_layer.self_attn.linear_values,
                                     encoder_layer.attention.self.value,
                                     share=True)
                # Multi-head attention final linear layer.
                clone_or_share_layer(decoder_layer.self_attn.final_linear,
                                     encoder_layer.attention.output.dense,
                                     share=True)

        # Tie context-attention with self-attention.
        if model_opt.tie_context_attn:
            for decoder_layer in decoder.transformer_layers:
                # Query.
                clone_or_share_layer(decoder_layer.context_attn.linear_query,
                                     decoder_layer.self_attn.linear_query,
                                     share=True)
                # Key.
                clone_or_share_layer(decoder_layer.context_attn.linear_keys,
                                     decoder_layer.self_attn.linear_keys,
                                     share=True)
                # Value.
                clone_or_share_layer(decoder_layer.context_attn.linear_values,
                                     decoder_layer.self_attn.linear_values,
                                     share=True)
                # Multi-head attention final linear layer.
                clone_or_share_layer(decoder_layer.context_attn.final_linear,
                                     decoder_layer.self_attn.final_linear,
                                     share=True)

        # Tie position-wise feed-forward between encoder and decoder.
        if model_opt.share_feed_forward:
            for encoder_layer, decoder_layer in zip(
                    encoder.encoder.layer, decoder.transformer_layers):
                # Transformer feed-forward.
                clone_or_share_layer(decoder_layer.intermediate.dense,
                                     encoder_layer.intermediate.dense,
                                     share=True)
                clone_or_share_layer(decoder_layer.output.dense,
                                     encoder_layer.output.dense,
                                     share=True)

    model.generator = generator
    model.to(device)
    if model_opt.model_dtype == 'fp16':
        model.half()

    return model
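
# `clone_or_share_layer` is project-specific and not shown in this file. A
# plausible minimal version (a sketch, not the actual code), assuming it
# either copies or ties the weight and bias of two nn.Linear layers:
import torch.nn as nn

def clone_or_share_layer(layer1, layer2, share=False):
    if share:
        # Tie: layer1 reuses layer2's Parameters directly.
        layer1.weight, layer1.bias = layer2.weight, layer2.bias
    else:
        # Clone: copy the current values but keep separate Parameters.
        layer1.weight.data.copy_(layer2.weight.data)
        layer1.bias.data.copy_(layer2.bias.data)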
def make_base_model(model_opt, fields, gpu, checkpoint=None,
                    train_part="all"):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        fields: `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by the training phase, or a resumed
            snapshot model from interrupted training.
    Returns:
        the NMTModel.
    """
    assert model_opt.model_type in ["text", "img", "audio"], \
        "Unsupported model type %s" % model_opt.model_type

    # Make encoder.
    if model_opt.model_type == "text":
        src_dict = fields["src"].vocab
        feature_dicts = onmt.io.collect_feature_vocabs(fields, 'src')
        src_embeddings = make_embeddings(model_opt, src_dict, feature_dicts)
        encoder = make_encoder(model_opt, src_embeddings)
    elif model_opt.model_type == "img":
        encoder = ImageEncoder(model_opt.enc_layers, model_opt.brnn,
                               model_opt.rnn_size, model_opt.dropout)
    elif model_opt.model_type == "audio":
        encoder = AudioEncoder(model_opt.enc_layers, model_opt.brnn,
                               model_opt.rnn_size, model_opt.dropout,
                               model_opt.sample_rate, model_opt.window_size)

    # Make decoder.
    tgt_dict = fields["tgt"].vocab
    feature_dicts = onmt.io.collect_feature_vocabs(fields, 'tgt')
    tgt_embeddings = make_embeddings(model_opt, tgt_dict, feature_dicts,
                                     for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        if src_dict != tgt_dict:
            raise AssertionError('`-share_vocab` should be set during '
                                 'preprocess if you use share_embeddings!')
        tgt_embeddings.word_lut.weight = src_embeddings.word_lut.weight

    decoder = make_decoder(model_opt, tgt_embeddings)
    context = make_context(model_opt, tgt_dict)

    # Make NMTModel (= encoder + decoder).
    if model_opt.RISK_ratio > 0.0:
        scorer = onmt.translate.GNMTGlobalScorer(
            model_opt.alpha, model_opt.beta,
            model_opt.coverage_penalty, model_opt.length_penalty)
        model = NMTModel(
            encoder, decoder, context,
            context_type=model_opt.context_type,
            tgt_vocab=fields['tgt'].vocab,
            beam_size=model_opt.beam_size,
            n_best=model_opt.n_best,
            gpu=gpu,
            scorer=scorer,
            min_length=model_opt.min_length,
            max_length=model_opt.max_length,
            stepwise_penalty=model_opt.stepwise_penalty,
            block_ngram_repeat=model_opt.block_ngram_repeat,
            ignore_when_blocking=model_opt.ignore_when_blocking,
            copy_attn=model_opt.copy_attn,
            context_size=model_opt.context_size)
    else:
        model = NMTModel(encoder, decoder, context,
                         context_type=model_opt.context_type)
    model.model_type = model_opt.model_type

    # Make Generator.
    if not model_opt.copy_attn:
        generator = nn.Sequential(
            nn.Linear(model_opt.rnn_size, len(fields["tgt"].vocab)),
            nn.LogSoftmax(dim=-1))
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        generator = CopyGenerator(model_opt.rnn_size, fields["tgt"].vocab)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        print('Loading model parameters.')
        model_dict = checkpoint['model']
        if train_part == "context":
            model_dict = model.state_dict()
            if 'join' in model_opt.context_type:
                pretrained_dict = {}
                for k, v in checkpoint['model'].items():
                    if k in model_dict:
                        if 'doc_context' in k:
                            k = k.replace('doc_context', 'doc_context.0')
                        pretrained_dict[k] = v
            else:
                pretrained_dict = {
                    k: v for k, v in checkpoint['model'].items()
                    if k in model_dict and 'doc_context' not in k
                }
            model_dict.update(pretrained_dict)
        model.load_state_dict(model_dict, strict=False)
        generator.load_state_dict(checkpoint['generator'])
        if train_part == "context":
            print("Freezing parameters of main model")
            for param in model.parameters():
                param.requires_grad = False
            for param in generator.parameters():
                param.requires_grad = False
            print("Unfreezing parameters of context")
            for param in model.doc_context.parameters():
                param.requires_grad = True
                if model_opt.param_init != 0.0:
                    param.data.uniform_(-model_opt.param_init,
                                        model_opt.param_init)
                if model_opt.param_init_glorot:
                    if param.dim() > 1:
                        xavier_uniform(param)
    else:
        if model_opt.param_init != 0.0:
            print('Initializing model parameters.')
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform(p)
        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc, model_opt.fix_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec, model_opt.fix_word_vecs_dec)

    # Add generator to model (this registers it as a parameter of model).
    model.generator = generator

    # Move the whole model to GPU if requested.
    if gpu:
        model.cuda()
    else:
        model.cpu()

    return model
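
# A minimal sketch (illustrative module names) of the freeze/unfreeze pattern
# above: disable gradients for the whole model, then re-enable them for the
# sub-module being trained. Note the attribute is `requires_grad`; assigning
# to a misspelled `require_grad` would silently create a new attribute and
# freeze nothing, which is why the typo above was worth fixing.
import torch.nn as nn

model = nn.ModuleDict({'encoder': nn.Linear(8, 8),
                       'doc_context': nn.Linear(8, 8)})
for p in model.parameters():
    p.requires_grad = False
for p in model['doc_context'].parameters():
    p.requires_grad = True

assert not model['encoder'].weight.requires_grad
assert model['doc_context'].weight.requires_grad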
def build_end2end_model(model_opt, fields, gpu, checkpoint=None,
                        sel_checkpoint=None, s2s_gen_checkpoint=None):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        fields: `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by the training phase, or a resumed
            snapshot model from interrupted training.
        sel_checkpoint: the model generated by the selector pre-training
            phase.
    Returns:
        the E2EModel.
    """
    assert model_opt.model_type in ["text"], \
        "Unsupported model type %s" % model_opt.model_type

    # Build selector.
    src_dict = fields["src"].vocab
    feature_dicts = inputters.collect_feature_vocabs(fields, 'src')
    sel_src_embeddings = build_embeddings(model_opt, src_dict, feature_dicts)
    selector = build_selector(model_opt, sel_src_embeddings)

    # Build encoder.
    if model_opt.e2e_type == "separate_enc_sel":
        if model_opt.selector_share_embeddings:
            # The shared embeddings live in encoder.embeddings.
            # TODO: change the state name to load the embeddings in the
            # pretrained selector embeddings.
            assert model_opt.load_pretrained_selector_from == ''
            src_embeddings = build_embeddings(model_opt, src_dict,
                                              feature_dicts)
            src_embeddings.word_lut.weight = \
                sel_src_embeddings.word_lut.weight
        else:
            src_embeddings = build_embeddings(model_opt, src_dict,
                                              feature_dicts)
        encoder = build_encoder(model_opt, src_embeddings)
    else:  # model_opt.e2e_type == "share_enc_sel"
        src_embeddings = sel_src_embeddings
        encoder = None

    # Build decoder.
    tgt_dict = fields["tgt"].vocab
    feature_dicts = inputters.collect_feature_vocabs(fields, 'tgt')
    tgt_embeddings = build_embeddings(model_opt, tgt_dict, feature_dicts,
                                      for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        if src_dict != tgt_dict:
            raise AssertionError('`-share_vocab` should be set during '
                                 'preprocess if you use share_embeddings!')
        tgt_embeddings.word_lut.weight = src_embeddings.word_lut.weight

    decoder = build_decoder(model_opt, tgt_embeddings)

    # Build E2EModel (= encoder + selector + decoder).
    device = torch.device("cuda" if gpu else "cpu")
    model = onmt.models.E2EModel(
        encoder, selector, decoder,
        e2e_type=model_opt.e2e_type,
        use_gt_sel_probs=model_opt.use_gt_sel_probs)
    model.model_type = model_opt.model_type

    # Build generator.
    if not model_opt.copy_attn:
        generator = nn.Sequential(
            nn.Linear(model_opt.rnn_size, len(fields["tgt"].vocab)),
            nn.LogSoftmax(dim=-1))
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        generator = CopyGenerator(model_opt.rnn_size, fields["tgt"].vocab)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        model.load_state_dict(checkpoint['end2end_model'])
        generator.load_state_dict(checkpoint['generator'])
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
        if sel_checkpoint is not None:
            model.load_state_dict(sel_checkpoint['selector'], strict=False)
        if s2s_gen_checkpoint is not None:
            model.load_state_dict(s2s_gen_checkpoint['model'], strict=False)
            generator.load_state_dict(s2s_gen_checkpoint['generator'])
        # if hasattr(model.encoder, 'embeddings'):
        #     model.encoder.embeddings.load_pretrained_vectors(
        #         model_opt.pre_word_vecs_enc, model_opt.fix_word_vecs_enc)
        # if hasattr(model.decoder, 'embeddings'):
        #     model.decoder.embeddings.load_pretrained_vectors(
        #         model_opt.pre_word_vecs_dec, model_opt.fix_word_vecs_dec)

    # Add generator to model (this registers it as a parameter of model).
    model.generator = generator
    model.to(device)

    return model
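
# A minimal sketch of the `strict=False` partial loading used above: only
# keys present in both state dicts are restored, while missing and unexpected
# keys are reported instead of raising. Names below are illustrative.
import torch.nn as nn

full = nn.ModuleDict({'selector': nn.Linear(4, 4),
                      'decoder': nn.Linear(4, 4)})
partial = {'selector.weight': full['selector'].weight.data.clone() * 0}
result = full.load_state_dict(partial, strict=False)
print(result.missing_keys)  # e.g. ['selector.bias', 'decoder.weight', ...]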
def make_base_model(model_opt, fields, gpu, checkpoint=None):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        fields: `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by the training phase, or a resumed
            snapshot model from interrupted training.
    Returns:
        the NMTModel.
    """
    assert model_opt.model_type in ["text", "img", "audio"], \
        "Unsupported model type %s" % model_opt.model_type

    # Make encoder.
    if model_opt.model_type == "text":
        src_dict = fields["src"].vocab
        feature_dicts = onmt.io.collect_feature_vocabs(fields, 'src')
        src_embeddings = make_embeddings(model_opt, src_dict, feature_dicts)
        encoder = make_encoder(model_opt, src_embeddings)
    elif model_opt.model_type == "img":
        encoder = ImageEncoder(model_opt.enc_layers, model_opt.brnn,
                               model_opt.rnn_size, model_opt.dropout)
    elif model_opt.model_type == "audio":
        encoder = AudioEncoder(model_opt.enc_layers, model_opt.brnn,
                               model_opt.rnn_size, model_opt.dropout,
                               model_opt.sample_rate, model_opt.window_size)

    # Make decoder.
    tgt_dict = fields["tgt"].vocab
    feature_dicts = onmt.io.collect_feature_vocabs(fields, 'tgt')
    tgt_embeddings = make_embeddings(model_opt, tgt_dict, feature_dicts,
                                     for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        if src_dict != tgt_dict:
            raise AssertionError('`-share_vocab` should be set during '
                                 'preprocess if you use share_embeddings!')
        tgt_embeddings.word_lut.weight = src_embeddings.word_lut.weight

    decoder = make_decoder(model_opt, tgt_embeddings)

    # Make NMTModel (= encoder + decoder).
    model = NMTModel(encoder, decoder)
    model.model_type = model_opt.model_type

    # Make Generator.
    if not model_opt.copy_attn:
        generator = nn.Sequential(
            nn.Linear(model_opt.rnn_size, len(fields["tgt"].vocab)),
            nn.LogSoftmax(dim=-1))
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        generator = CopyGenerator(model_opt.rnn_size, fields["tgt"].vocab)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        print('Loading model parameters.')
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
    else:
        if model_opt.param_init != 0.0:
            print('Initializing model parameters.')
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform(p)
        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc, model_opt.fix_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec, model_opt.fix_word_vecs_dec)

    # Add generator to model (this registers it as a parameter of model).
    model.generator = generator

    # Move the whole model to GPU if requested.
    if gpu:
        model.cuda()
    else:
        model.cpu()

    return model
def build_base_model(model_opt, fields, gpu, checkpoint=None, gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by the training phase, or a resumed
            snapshot model from interrupted training.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """
    # For backward compatibility when attention_dropout was not defined.
    try:
        model_opt.attention_dropout
    except AttributeError:
        model_opt.attention_dropout = model_opt.dropout

    # Build embeddings.
    src_field = fields["src"]
    src_emb = build_embeddings(model_opt, src_field)
    tgt_field = fields["tgt"]
    tgt_emb = build_embeddings(model_opt, tgt_field)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
            "preprocess with -share_vocab if you use share_embeddings"
        tgt_emb.word_lut.weight = src_emb.word_lut.weight

    # Build encoders.
    encoder_x2y = build_encoder(model_opt, src_emb)
    encoder_y2x = build_encoder(model_opt, tgt_emb)

    # Build decoders.
    decoder_x2y = build_decoder(model_opt, tgt_emb)
    decoder_y2x = build_decoder(model_opt, src_emb)

    def share_attn_weight_and_bias(attn1, attn2,
                                   share_relative_pos_embeddings=False):
        attn2.linear_keys = attn1.linear_keys
        attn2.linear_values = attn1.linear_values
        attn2.linear_query = attn1.linear_query
        attn2.final_linear = attn1.final_linear
        if share_relative_pos_embeddings:
            assert model_opt.max_relative_positions > 0
            attn2.relative_positions_embeddings = \
                attn1.relative_positions_embeddings

    # logger.info('share encoder')
    encoder_y2x = encoder_x2y
    # logger.info('share cross_attns btw fwd & bwd decoders')
    for dec1, dec2 in zip(decoder_x2y.transformer_layers,
                          decoder_y2x.transformer_layers):
        share_attn_weight_and_bias(dec1.context_attn, dec2.context_attn)
    # logger.info('share self_attns btw fwd & bwd decoders')
    for dec1, dec2 in zip(decoder_x2y.transformer_layers,
                          decoder_y2x.transformer_layers):
        share_attn_weight_and_bias(dec1.self_attn, dec2.self_attn,
                                   model_opt.share_relative_pos_embeddings)
    # logger.info('share feed_forwards btw fwd & bwd decoders')
    for dec1, dec2 in zip(decoder_x2y.transformer_layers,
                          decoder_y2x.transformer_layers):
        dec2.feed_forward.w_1 = dec1.feed_forward.w_1
        dec2.feed_forward.w_2 = dec1.feed_forward.w_2

    # Build NMTModel (= encoder + decoder).
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    model = onmt.models.NMTModel(encoder_x2y, encoder_y2x,
                                 decoder_x2y, decoder_y2x)

    # Build prior model.
    prior = None
    if model_opt.learned_prior:
        assert model_opt.num_experts > 1
        prior = onmt.models.Classifier(
            model_opt.enc_rnn_size,
            model_opt.num_experts,
            dropout=(model_opt.dropout[0]
                     if type(model_opt.dropout) is list
                     else model_opt.dropout))

    # Build generators.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator_x2y = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size,
                      len(fields["tgt"].base_field.vocab)),
            Cast(torch.float32),
            gen_func)
        generator_y2x = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size,
                      len(fields["src"].base_field.vocab)),
            Cast(torch.float32),
            gen_func)
        if model_opt.share_decoder_embeddings:
            generator_x2y[0].weight = decoder_x2y.embeddings.word_lut.weight
            generator_y2x[0].weight = decoder_y2x.embeddings.word_lut.weight
    else:
        # Build one copy generator per direction.
        tgt_base_field = fields["tgt"].base_field
        src_base_field = fields["src"].base_field
        tgt_pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
        src_pad_idx = src_base_field.vocab.stoi[src_base_field.pad_token]
        generator_x2y = CopyGenerator(model_opt.dec_rnn_size,
                                      len(tgt_base_field.vocab), tgt_pad_idx)
        generator_y2x = CopyGenerator(model_opt.dec_rnn_size,
                                      len(src_base_field.vocab), src_pad_idx)
        if model_opt.share_decoder_embeddings:
            generator_x2y.linear.weight = \
                decoder_x2y.embeddings.word_lut.weight
            generator_y2x.linear.weight = \
                decoder_y2x.embeddings.word_lut.weight

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward compatibility for models using a custom
        # layer norm implementation.
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {fix_key(k): v
                               for k, v in checkpoint['model'].items()}
        # End of patch for backward compatibility.
        model.load_state_dict(checkpoint['model'], strict=False)
        generator_x2y.load_state_dict(checkpoint['generator_x2y'],
                                      strict=False)
        generator_y2x.load_state_dict(checkpoint['generator_y2x'],
                                      strict=False)
        if model_opt.learned_prior:
            prior.load_state_dict(checkpoint['prior'], strict=False)
    else:
        if model_opt.param_init != 0.0:
            def init_param(target_model):
                for p in target_model.parameters():
                    p.data.uniform_(-model_opt.param_init,
                                    model_opt.param_init)

            init_param(model)
            init_param(generator_x2y)
            init_param(generator_y2x)
            if model_opt.learned_prior:
                init_param(prior)
        if model_opt.param_init_glorot:
            def init_glorot(target_model):
                for p in target_model.parameters():
                    if p.dim() > 1:
                        xavier_uniform_(p)

            init_glorot(model)
            init_glorot(generator_x2y)
            init_glorot(generator_y2x)
            if model_opt.learned_prior:
                init_glorot(prior)

    model.generator_x2y = generator_x2y
    model.generator_y2x = generator_y2x
    model.prior = prior
    model.to(device)
    if model_opt.model_dtype == 'fp16' and model_opt.optim == 'fusedadam':
        model.half()

    return model
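
# A minimal sketch (illustrative sizes) of the module-level sharing done by
# `share_attn_weight_and_bias` above: assigning a whole sub-module makes both
# parents hold the *same* nn.Linear, so weight and bias are shared in one
# step, unlike tying individual Parameters.
import torch.nn as nn

class Attn(nn.Module):
    def __init__(self, d):
        super().__init__()
        self.linear_query = nn.Linear(d, d)

a1, a2 = Attn(8), Attn(8)
a2.linear_query = a1.linear_query  # share the entire layer
assert a2.linear_query is a1.linear_query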
def make_latent_variable_LSTM(model_opt, fields, gpu, checkpoint=None):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        fields: `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by the training phase, or a resumed
            snapshot model from interrupted training.
    Returns:
        the NMTModel.
    """
    # Make encoder.
    src_dict = fields["src"].vocab
    feature_dicts = onmt.io.collect_feature_vocabs(fields, 'src')

    # Seq2seq encoder.
    src_embeddings = make_embeddings(model_opt, src_dict, feature_dicts)
    encoder = make_encoder(model_opt, src_embeddings)

    # Latent variable: approximate distribution mu / logvar.
    src_embeddings_approx = make_embeddings(model_opt, src_dict,
                                            feature_dicts, for_vae=True)
    enc_approx = make_encoder(model_opt, src_embeddings_approx, for_vae=True)
    approx_mu = nn.Linear(model_opt.rnn_size_vae, model_opt.size_vae)
    approx_logvar = nn.Linear(model_opt.rnn_size_vae, model_opt.size_vae)

    # Latent variable: true posterior mu / logvar.
    src_embeddings_true = make_embeddings(model_opt, src_dict,
                                          feature_dicts, for_vae=True)
    enc_true = make_encoder(model_opt, src_embeddings_true, for_vae=True)
    true_mu = nn.Linear(model_opt.rnn_size_vae, model_opt.size_vae)
    true_logvar = nn.Linear(model_opt.rnn_size_vae, model_opt.size_vae)

    # For AVE-GlobalMemory.
    glb = nn.Linear(model_opt.rnn_size,
                    model_opt.size_vae + model_opt.rnn_size +
                    model_opt.size_c)

    # Make decoder.
    tgt_dict = fields["tgt"].vocab
    feature_dicts = onmt.io.collect_feature_vocabs(fields, 'tgt')
    tgt_embeddings = make_embeddings(model_opt, tgt_dict, feature_dicts,
                                     for_encoder=False)

    # Control variable.
    glv = GloVe_Discriminator(gpu)
    glv_model = glv.load_model(model_opt.glove_dir)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        if src_dict != tgt_dict:
            raise AssertionError('`-share_vocab` should be set during '
                                 'preprocess if you use share_embeddings!')
        tgt_embeddings.word_lut.weight = src_embeddings.word_lut.weight

    decoder = make_decoder(model_opt, tgt_embeddings)

    # Make NMTModel (= encoder + decoder).
    model = LatentVaraibleModel(encoder, decoder, tgt_dict, enc_approx,
                                approx_mu, approx_logvar, enc_true, true_mu,
                                true_logvar, glb, glv, model_opt.max_gen_len,
                                gpu)
    model.model_type = model_opt.model_type

    # Make Generator.
    if not model_opt.copy_attn:
        generator = nn.Sequential(
            nn.Linear(
                model_opt.rnn_size + model_opt.size_vae + model_opt.size_c,
                len(fields["tgt"].vocab)),
            nn.LogSoftmax(dim=-1))
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        generator = CopyGenerator(
            model_opt.rnn_size + model_opt.size_vae + model_opt.size_c,
            fields["tgt"].vocab)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        print('Loading model parameters.')
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
    else:
        if model_opt.param_init != 0.0:
            print('Initializing model parameters.')
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc, model_opt.fix_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec, model_opt.fix_word_vecs_dec)

    # Add generator to model (this registers it as a parameter of model).
    model.generator = generator

    # Move the whole model to GPU if requested.
    if gpu:
        model.cuda()
    else:
        model.cpu()

    return model
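
# The mu/logvar heads above are the standard VAE parameterization. A minimal
# sketch of the reparameterization trick they feed into (this repo's actual
# sampling code lives inside the model class and is not shown here):
import torch

def sample_latent(mu, logvar):
    std = torch.exp(0.5 * logvar)  # logvar -> standard deviation
    eps = torch.randn_like(std)    # noise from N(0, I)
    return mu + eps * std          # differentiable sample z ~ N(mu, std^2)

z = sample_latent(torch.zeros(4, 8), torch.zeros(4, 8))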
def build_base_model(model_opt, fields, gpu, checkpoint=None):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        fields: `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by the training phase, or a resumed
            snapshot model from interrupted training.
    Returns:
        the NMTModel.
    """
    assert model_opt.model_type == "text", \
        "Unsupported model type %s" % model_opt.model_type

    # Build encoder.
    logger.info("build_base_model")
    if model_opt.model_type == "text":
        # torchtext.vocab.Vocab object:
        # dict_keys(['vectors', 'stoi', 'freqs', 'itos'])
        src_dict = fields["src"].vocab
        feature_dicts = inputters.collect_feature_vocabs(fields, 'src')
        src_embeddings = build_embeddings(model_opt, src_dict, feature_dicts)
        # Embeddings(
        #   (make_embedding): Sequential(
        #     (emb_luts): Elementwise(
        #       (0): Embedding(24997, 500, padding_idx=1)
        #     )
        #   )
        # )
        # logger.info("src embeddings")
        # logger.info(src_embeddings)
        logger.info("building question encoder")
        encoder = build_encoder(model_opt, src_embeddings)
        logger.info(encoder)

        # Modified: additional answer encoder.
        ans_dict = fields["ans"].vocab
        ans_embeddings = build_embeddings(model_opt, ans_dict, feature_dicts)
        logger.info("building answer encoder")
        encoder_ans = build_encoder(model_opt, ans_embeddings)
        logger.info(encoder_ans)

    # Build decoder.
    tgt_dict = fields["tgt"].vocab
    feature_dicts = inputters.collect_feature_vocabs(fields, 'tgt')
    tgt_embeddings = build_embeddings(model_opt, tgt_dict, feature_dicts,
                                      for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        if src_dict != tgt_dict:
            raise AssertionError('`-share_vocab` should be set during '
                                 'preprocess if you use share_embeddings!')
        tgt_embeddings.word_lut.weight = src_embeddings.word_lut.weight

    decoder = build_decoder(model_opt, tgt_embeddings)

    # Build NMTModel (= encoders + decoder).
    device = torch.device("cuda" if gpu else "cpu")
    model = NMTModel(encoder, encoder_ans, decoder)
    model.model_type = model_opt.model_type

    # Build generator.
    if not model_opt.copy_attn:
        generator = nn.Sequential(
            nn.Linear(model_opt.rnn_size * 2, len(fields["tgt"].vocab)),
            nn.LogSoftmax(dim=-1))
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        generator = CopyGenerator(model_opt.rnn_size, fields["tgt"].vocab)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc, model_opt.fix_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec, model_opt.fix_word_vecs_dec)

    # Add generator to model (this registers it as a parameter of model).
    model.generator = generator
    model.to(device)

    return model
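
# A minimal sketch (synthetic sizes) of why the generator above takes
# rnn_size * 2 inputs: presumably this fork concatenates two hidden states
# (question and answer paths) before projecting to the vocabulary. The
# variable names are illustrative, not from the repo.
import torch
import torch.nn as nn

rnn_size, vocab = 6, 20
gen = nn.Sequential(nn.Linear(rnn_size * 2, vocab), nn.LogSoftmax(dim=-1))
h_question = torch.randn(3, rnn_size)
h_answer = torch.randn(3, rnn_size)
log_probs = gen(torch.cat([h_question, h_answer], dim=-1))  # [3, vocab]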
def make_base_model(model_opt, fields, gpu, checkpoint=None, stage1=True):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        fields: `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by the training phase, or a resumed
            snapshot model from interrupted training.
    Returns:
        the NMTModel.
    """
    if stage1:
        src = "src1"
        tgt = "tgt1"
    else:
        src = "src2"
        tgt = "tgt2"

    assert model_opt.model_type in ["text", "img", "audio"], \
        "Unsupported model type %s" % model_opt.model_type

    # Make encoder.
    if model_opt.model_type == "text":
        src_dict = fields[src].vocab
        feature_dicts = onmt.io.collect_feature_vocabs(fields, src)
        src_embeddings = make_embeddings(model_opt, src_dict, feature_dicts)
        table_embeddings = make_embeddings(model_opt, src_dict,
                                           feature_dicts, discard_word=True)
        # Reuse the same embedding weights.
        print(table_embeddings.make_embedding[0])
        table_embeddings.word_lut.weight = src_embeddings.word_lut.weight
        table_embeddings.field_lut.weight = src_embeddings.field_lut.weight
        table_embeddings.type_lut.weight = src_embeddings.type_lut.weight
        table_embeddings.ha_lut.weight = src_embeddings.ha_lut.weight
        encoder = make_encoder(model_opt,
                               (src_embeddings, table_embeddings), stage1)
    elif model_opt.model_type == "img":
        encoder = ImageEncoder(model_opt.enc_layers, model_opt.brnn,
                               model_opt.rnn_size, model_opt.dropout)
    elif model_opt.model_type == "audio":
        encoder = AudioEncoder(model_opt.enc_layers, model_opt.brnn,
                               model_opt.rnn_size, model_opt.dropout,
                               model_opt.sample_rate, model_opt.window_size)

    # Make decoder.
    tgt_dict = fields[tgt].vocab
    feature_dicts = onmt.io.collect_feature_vocabs(fields, tgt)
    tgt_embeddings = make_embeddings(model_opt, tgt_dict, feature_dicts,
                                     for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        if src_dict != tgt_dict:
            raise AssertionError('`-share_vocab` should be set during '
                                 'preprocess if you use share_embeddings!')
        tgt_embeddings.word_lut.weight = src_embeddings.word_lut.weight

    # NOTE: make decoder.
    decoder = make_decoder(model_opt, tgt_embeddings, stage1)

    # Make NMTModel (= encoder + decoder).
    model = NMTModel(encoder, decoder)
    model.model_type = model_opt.model_type

    # Make Generator.
    if stage1:
        generator = nn.Sequential(
            nn.Linear(model_opt.rnn_size, len(fields["tgt1"].vocab)),
            nn.LogSoftmax(dim=-1))
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        # NOTE: CopyGenerator.
        generator = CopyGenerator(model_opt.rnn_size, fields["tgt2"].vocab)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        print('Loading model parameters.')
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
    else:
        if model_opt.param_init != 0.0:
            print('Initializing model parameters.')
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc, model_opt.fix_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec, model_opt.fix_word_vecs_dec)

    # Add generator to model (this registers it as a parameter of model).
    model.generator = generator

    # Move the whole model to GPU if requested.
    if gpu:
        model.cuda()
    else:
        model.cpu()

    return model
def build_base_model(model_opt, fields, gpu, FeatureValues, FeatureTensors,
                     FeatureTypes, FeaturesList, FeatureNames, FTInfos,
                     FeatureTypesNames, SimulationLanguages, checkpoint=None):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        fields: `Field` objects for the model.
        gpu (bool): whether to use gpu.
        FeatureValues, FeatureTensors, FeatureTypes, FeaturesList,
        FeatureNames, FTInfos, FeatureTypesNames, SimulationLanguages:
            WALS info.
        checkpoint: the model generated by the training phase, or a resumed
            snapshot model from interrupted training.
    Returns:
        the NMTModel.
    """
    assert model_opt.model_type in ["text", "img", "audio"], \
        "Unsupported model type %s" % model_opt.model_type

    # Build encoder.
    if model_opt.model_type == "text":
        src_dict = fields["src"].vocab
        feature_dicts = inputters.collect_feature_vocabs(fields, 'src')
        src_embeddings = build_embeddings(model_opt, src_dict, feature_dicts)
        encoder = build_encoder(model_opt, src_embeddings)
    elif model_opt.model_type == "img":
        if "image_channel_size" not in model_opt.__dict__:
            image_channel_size = 3
        else:
            image_channel_size = model_opt.image_channel_size
        encoder = ImageEncoder(model_opt.enc_layers, model_opt.brnn,
                               model_opt.rnn_size, model_opt.dropout,
                               image_channel_size)
    elif model_opt.model_type == "audio":
        encoder = AudioEncoder(model_opt.enc_layers, model_opt.brnn,
                               model_opt.rnn_size, model_opt.dropout,
                               model_opt.sample_rate, model_opt.window_size)

    # Build decoder.
    tgt_dict = fields["tgt"].vocab
    feature_dicts = inputters.collect_feature_vocabs(fields, 'tgt')
    tgt_embeddings = build_embeddings(model_opt, tgt_dict, feature_dicts,
                                      for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        if src_dict != tgt_dict:
            raise AssertionError('`-share_vocab` should be set during '
                                 'preprocess if you use share_embeddings!')
        tgt_embeddings.word_lut.weight = src_embeddings.word_lut.weight

    decoder = build_decoder(model_opt, tgt_embeddings)

    # WALS: one embedding structure per feature (192 in total) and one MLP
    # per feature type (11 in total).
    print('Building embeddings for each WALS feature and MLP models for '
          'each feature type...')
    embeddings_list, embeddings_keys, mlp_list, mlp_keys = [], [], [], []
    for FeatureType in FeatureTypes:
        list_features = FeatureType[1]
        for Feature in list_features:
            embeddings_keys.append(Feature)
            embeddings_list.append(build_feature_embeddings(
                gpu, FeatureTensors, FeaturesList, FeatureNames, Feature))
        mlp_keys.append(FeatureType[0])
        mlp_list.append(build_mlp_feature_type(
            model_opt, FTInfos, FeatureTypesNames, FeatureType[0]))

    embeddings_dic_keys = dict(zip(embeddings_keys, embeddings_list))
    EmbeddingFeatures = nn.ModuleDict(embeddings_dic_keys)
    mlp_dic_keys = dict(zip(mlp_keys, mlp_list))

    # Build NMTModel (= encoder + decoder).
    device = torch.device("cuda" if gpu else "cpu")

    if model_opt.wals_model == 'EncInitHidden_Target':
        MLP2RNNHiddenSize_Target = build_mlp2rnnhiddensize_target(
            model_opt, FTInfos)
        print('Embeddings for WALS features and MLP models are built!')
        model = EncoderInitialization(model_opt.wals_model, encoder, decoder,
                                      MLP2RNNHiddenSize_Target,
                                      EmbeddingFeatures, FeatureValues,
                                      FeatureTypes, SimulationLanguages,
                                      model_opt)
        print("Model created: uses WALS features from the target language "
              "to initialize the encoder's hidden state.")
    elif model_opt.wals_model == 'EncInitHidden_Both':
        MLP2RNNHiddenSize_Both = build_mlp2rnnhiddensize_both(
            model_opt, FTInfos)
        print('Embeddings for WALS features and MLP models are built!')
        model = EncoderInitialization(model_opt.wals_model, encoder, decoder,
                                      MLP2RNNHiddenSize_Both,
                                      EmbeddingFeatures, FeatureValues,
                                      FeatureTypes, SimulationLanguages,
                                      model_opt)
        print("Model created: uses WALS features from the source and target "
              "languages to initialize the encoder's hidden state.")
    elif model_opt.wals_model == 'DecInitHidden_Target':
        MLP2RNNHiddenSize_Target = build_mlp2rnnhiddensize_target(
            model_opt, FTInfos)
        print('Embeddings for WALS features and MLP models are built!')
        model = DecoderInitialization(model_opt.wals_model, encoder, decoder,
                                      MLP2RNNHiddenSize_Target,
                                      EmbeddingFeatures, FeatureValues,
                                      FeatureTypes, SimulationLanguages,
                                      model_opt)
        print("Model created: adds WALS features from the target language "
              "to the encoder's output to initialize the decoder's hidden "
              "state.")
    elif model_opt.wals_model == 'DecInitHidden_Both':
        MLP2RNNHiddenSize_Both = build_mlp2rnnhiddensize_both(
            model_opt, FTInfos)
        print('Embeddings for WALS features and MLP models are built!')
        model = DecoderInitialization(model_opt.wals_model, encoder, decoder,
                                      MLP2RNNHiddenSize_Both,
                                      EmbeddingFeatures, FeatureValues,
                                      FeatureTypes, SimulationLanguages,
                                      model_opt)
        print("Model created: adds WALS features from the source and target "
              "languages to the encoder's output to initialize the decoder's "
              "hidden state.")
    elif model_opt.wals_model == 'WalstoSource_Target':
        MLP2WALSHiddenSize_Target = build_mlp2walshiddensize_target(
            model_opt, FTInfos)
        print('Embeddings for WALS features and MLP models are built!')
        model = CombineWalsSourceWords(model_opt.wals_model, encoder, decoder,
                                       MLP2WALSHiddenSize_Target,
                                       EmbeddingFeatures, FeatureValues,
                                       FeatureTypes, SimulationLanguages,
                                       model_opt)
        print("Model created: concatenates WALS features from the target "
              "language to source word embeddings.")
    elif model_opt.wals_model == 'WalstoSource_Both':
        MLP2WALSHiddenSize_Both = build_mlp2walshiddensize_both(
            model_opt, FTInfos)
        print('Embeddings for WALS features and MLP models are built!')
        model = CombineWalsSourceWords(model_opt.wals_model, encoder, decoder,
                                       MLP2WALSHiddenSize_Both,
                                       EmbeddingFeatures, FeatureValues,
                                       FeatureTypes, SimulationLanguages,
                                       model_opt)
        print("Model created: concatenates WALS features from the source and "
              "target languages to source word embeddings.")
    elif model_opt.wals_model == 'WalstoTarget_Target':
        MLP2WALSHiddenSize_Target = build_mlp2walshiddensize_target(
            model_opt, FTInfos)
        print('Embeddings for WALS features and MLP models are built!')
        model = CombineWalsTargetWords(model_opt.wals_model, encoder, decoder,
                                       MLP2WALSHiddenSize_Target,
                                       EmbeddingFeatures, FeatureValues,
                                       FeatureTypes, SimulationLanguages,
                                       model_opt)
        print("Model created: concatenates WALS features from the target "
              "language to target word embeddings.")
    elif model_opt.wals_model == 'WalstoTarget_Both':
        MLP2WALSHiddenSize_Both = build_mlp2walshiddensize_both(
            model_opt, FTInfos)
        print('Embeddings for WALS features and MLP models are built!')
        model = CombineWalsTargetWords(model_opt.wals_model, encoder, decoder,
                                       MLP2WALSHiddenSize_Both,
                                       EmbeddingFeatures, FeatureValues,
                                       FeatureTypes, SimulationLanguages,
                                       model_opt)
        print("Model created: concatenates WALS features from the source and "
              "target languages to target word embeddings.")
    elif model_opt.wals_model == 'WalsDoublyAttentive_Target':
        MLPFeatureTypes = nn.ModuleDict(mlp_dic_keys)
        MLP_AttentionTarget = build_doublyattentive_target(model_opt)
        print('Embeddings for WALS features and MLP models are built!')
        model = WalsDoublyAttention(model_opt.wals_model, encoder, decoder,
                                    MLP_AttentionTarget, MLPFeatureTypes,
                                    EmbeddingFeatures, FeatureValues,
                                    FeatureTypes, SimulationLanguages,
                                    model_opt)
        print("Model created: the WALS features from the target language are "
              "incorporated as an additional attention mechanism.")
    elif model_opt.wals_model == 'WalsDoublyAttentive_Both':
        MLPFeatureTypes = nn.ModuleDict(mlp_dic_keys)
        MLP_AttentionBoth = build_doublyattentive_both(model_opt)
        print('Embeddings for WALS features and MLP models are built!')
        model = WalsDoublyAttention(model_opt.wals_model, encoder, decoder,
                                    MLP_AttentionBoth, MLPFeatureTypes,
                                    EmbeddingFeatures, FeatureValues,
                                    FeatureTypes, SimulationLanguages,
                                    model_opt)
        print("Model created: the WALS features from the source and target "
              "languages are incorporated as an additional attention "
              "mechanism.")
    elif model_opt.wals_model == 'WalstoDecHidden_Target':
        MLP2WALSHiddenSize_Target = build_mlp2walshiddensize_target(
            model_opt, FTInfos)
        print('Embeddings for WALS features and MLP models are built!')
        model = WalstoDecHidden(model_opt.wals_model, encoder, decoder,
                                MLP2WALSHiddenSize_Target, EmbeddingFeatures,
                                FeatureValues, FeatureTypes,
                                SimulationLanguages, model_opt)
        print("Model created: concatenates WALS features from the target "
              "language to the decoder hidden state.")
    elif model_opt.wals_model == 'WalstoDecHidden_Both':
        MLP2WALSHiddenSize_Both = build_mlp2walshiddensize_both(
            model_opt, FTInfos)
        print('Embeddings for WALS features and MLP models are built!')
        model = WalstoDecHidden(model_opt.wals_model, encoder, decoder,
                                MLP2WALSHiddenSize_Both, EmbeddingFeatures,
                                FeatureValues, FeatureTypes,
                                SimulationLanguages, model_opt)
        print("Model created: concatenates WALS features from the source and "
              "target languages to the decoder hidden state.")
    else:
        raise Exception("WALS model type not yet implemented: %s"
                        % model_opt.wals_model)

    model.model_type = model_opt.model_type

    # Build generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator = nn.Sequential(
            nn.Linear(model_opt.rnn_size, len(fields["tgt"].vocab)),
            gen_func)
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        generator = CopyGenerator(model_opt.rnn_size, fields["tgt"].vocab)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc, model_opt.fix_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec, model_opt.fix_word_vecs_dec)

    # Add generator to model (this registers it as a parameter of model).
    model.generator = generator
    model.to(device)

    return model
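
# A minimal sketch (illustrative feature IDs and sizes) of why the
# per-feature embeddings above are wrapped in nn.ModuleDict: unlike a plain
# dict, a ModuleDict registers its values as sub-modules, so .parameters()
# and .to(device) see them.
import torch.nn as nn

features = {'81A': nn.Embedding(7, 16), '82A': nn.Embedding(6, 16)}
embedding_features = nn.ModuleDict(features)
assert sum(p.numel() for p in embedding_features.parameters()) > 0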
def build_base_model(model_opt, fields, gpu, checkpoint=None, gpu_id=None): """Build a model from opts. Args: model_opt: the option loaded from checkpoint. It's important that the opts have been updated and validated. See :class:`onmt.utils.parse.ArgumentParser`. fields (dict[str, torchtext.data.Field]): `Field` objects for the model. gpu (bool): whether to use gpu. checkpoint: the model gnerated by train phase, or a resumed snapshot model from a stopped training. gpu_id (int or NoneType): Which GPU to use. Returns: the NMTModel. """ # Build embeddings. if model_opt.model_type == "text": src_field = fields["src"] src_emb = build_embeddings(model_opt, src_field) else: src_emb = None # Build encoder. encoder = build_encoder(model_opt, src_emb) # Build decoder. tgt_field = fields["tgt"] tgt_emb = build_embeddings(model_opt, tgt_field, for_encoder=False) # Share the embedding matrix - preprocess with share_vocab required. if model_opt.share_embeddings: # src/tgt vocab should be the same if `-share_vocab` is specified. assert src_field.base_field.vocab == tgt_field.base_field.vocab, \ "preprocess with -share_vocab if you use share_embeddings" tgt_emb.word_lut.weight = src_emb.word_lut.weight if model_opt.share_position_embeddings: tgt_emb.make_embedding.pe.pe.weight = src_emb.make_embedding.pe.pe.weight decoder = build_decoder(model_opt, tgt_emb) # Build NMTModel(= encoder + decoder). if gpu and gpu_id is not None: device = torch.device("cuda", gpu_id) elif gpu and not gpu_id: device = torch.device("cuda") elif not gpu: device = torch.device("cpu") # Build separate LM if doing simple fusion if model_opt.simple_fusion: layers = 12 size = 768 heads = 12 lm_decoder_opt = copy.deepcopy(model_opt) lm_decoder_opt.dec_layers = layers lm_decoder_opt.use_GPT_version_ctxattn = False lm_decoder_opt.use_GPT_version_psa = False lm_decoder_opt.use_GPT_version_unconditional = True lm_decoder_opt.tgt_word_vec_size = size lm_decoder_opt.rnn_size = size lm_decoder_opt.dec_rnn_size = size lm_decoder_opt.transformer_ff = size * 4 lm_decoder_opt.dec_heads = heads lm_decoder_opt.position_encoding_learned_dec = True lm_decoder_opt.share_decoder_embeddings = True lm_decoder_opt.dropout = 0 lm_decoder_emb = build_embeddings(lm_decoder_opt, tgt_field, for_encoder=False) logger.info(lm_decoder_emb) lm_decoder = build_decoder(lm_decoder_opt, lm_decoder_emb) load_decoder = lm_decoder model = onmt.models.SimpleFusionModel(encoder, decoder, lm_decoder) generator = SimpleFusionGenerator(model_opt.dec_rnn_size, lm_decoder_opt.dec_rnn_size, len(fields["tgt"].base_field.vocab)) generator.lm_linear.weight = lm_decoder.embeddings.word_lut.weight if model_opt.share_decoder_embeddings: generator.decoder_linear.weight = decoder.embeddings.word_lut.weight gen_linear = generator.lm_linear else: load_decoder = decoder if model_opt.unconditional: model = onmt.models.UncondModel(decoder) else: model = onmt.models.NMTModel(encoder, decoder) # Build Generator. 
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        if model_opt.padded_vocab_fix_me_later:
            gen_func = nn.Sequential(PadGen(), gen_func)
        generator = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size,
                      len(fields["tgt"].base_field.vocab)),
            Cast(torch.float32),
            gen_func)
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
        gen_linear = generator[0]
    else:
        tgt_base_field = fields["tgt"].base_field
        vocab_size = len(tgt_base_field.vocab)
        pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
        generator = CopyGenerator(model_opt.dec_rnn_size, vocab_size, pad_idx)
        if model_opt.share_decoder_embeddings:
            generator.linear.weight = decoder.embeddings.word_lut.weight
        gen_linear = generator.linear

    if model_opt.encdec_share_params:
        for name, p in decoder.named_parameters():
            if 'ctx' in name or 'context' in name:
                continue
            pointer = encoder
            attrs = name.split('.')
            for attr_name in attrs[:-1]:
                pointer = getattr(pointer, attr_name)
            # pointer now has the encoder version of the parameter parent
            setattr(pointer, attrs[-1], p)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # Normally, just load the model parameters from checkpoint
        if 'gpt2_params' not in checkpoint and 'enc_model' not in checkpoint:
            # This preserves backward-compat for models using custom layernorm
            def fix_key(s):
                s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                           r'\1.layer_norm\2.bias', s)
                s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                           r'\1.layer_norm\2.weight', s)
                return s

            checkpoint['model'] = {
                fix_key(k): v for k, v in checkpoint['model'].items()
            }
            # end of patch for backward compatibility

            # Initialize rest of parameters normally
            if hasattr(model_opt, 'load_uncond_from') and \
                    model_opt.load_uncond_from:
                for p in decoder.parameters():
                    if p.dim() > 1:
                        xavier_uniform_(p)
                # Always initialize encoder parameters normally
                for p in encoder.parameters():
                    if p.dim() > 1:
                        xavier_uniform_(p)
                if model_opt.ctx_weight_param:
                    for name, p in decoder.named_parameters():
                        if 'ctx_weight' in name:
                            p.data.zero_()
                        if 'ctx_bias' in name:
                            p.data.fill_(-10)

            model.load_state_dict(checkpoint['model'], strict=False)
            generator.load_state_dict(checkpoint['generator'], strict=False)
        else:
            # load the gpt parameters
            if 'gpt2_params' in checkpoint:
                init_something = model_opt.gpt2_init_embanddec \
                    or model_opt.simple_fusion \
                    or model_opt.gpt2_init_embandenc \
                    or model_opt.GPT_representation_mode != 'none'

                if init_something:
                    # Initialize all the weights first
                    if model_opt.gpt2_init_zero:
                        for p in decoder.parameters():
                            p.data.zero_()
                        if model_opt.simple_fusion:
                            generator.decoder_linear.weight.data.zero_()
                            generator.decoder_linear.bias.data.zero_()
                    else:
                        for p in decoder.parameters():
                            if p.dim() > 1:
                                xavier_uniform_(p)

                    # Always initialize encoder parameters normally
                    if encoder is not None:
                        for p in encoder.parameters():
                            if p.dim() > 1:
                                xavier_uniform_(p)
                    for p in generator.parameters():
                        if p.dim() > 1:
                            xavier_uniform_(p)
                    if model_opt.zero_bias_init:
                        gen_linear.bias.data.zero_()

                    if model_opt.ctx_weight_param:
                        for name, p in decoder.named_parameters():
                            if 'ctx_weight' in name:
                                p.data.zero_()
                            if 'ctx_bias' in name:
                                p.data.fill_(-10)
                        gen_linear.bias.data.zero_()

                load_models = []
                if model_opt.GPT_representation_mode != 'none':
                    load_embs = []
                    if model_opt.GPT_representation_loc in ['both', 'src']:
                        load_models.append(src_emb.gpt_model)
                        load_embs.append(src_emb)
                    if model_opt.GPT_representation_loc in ['both', 'tgt']:
                        load_models.append(tgt_emb.gpt_model)
                        load_embs.append(tgt_emb)
                else:
                    if model_opt.gpt2_init_embanddec or model_opt.simple_fusion:
                        load_models = [load_decoder]
                    elif model_opt.gpt2_init_embandenc:
                        load_models = [encoder]

                it_list = list(checkpoint['gpt2_params'])
                for lm_idx, load_model in enumerate(load_models):
                    for name, array in it_list:
                        name = name[12:]  # skip "transformer."
                        name = name.split('.')
                        assigned = False
                        if name[0] == 'wpe':
                            if model_opt.GPT_representation_mode != 'none':
                                pointer = load_embs[lm_idx].make_embedding.pe.pe.weight
                            else:
                                pointer = load_model.embeddings.make_embedding.pe.pe.weight
                        elif name[0] == 'wte':
                            if model_opt.GPT_representation_mode != 'none':
                                pointer = [
                                    load_embs[lm_idx].make_embedding.emb_luts[0].weight,
                                    gen_linear.weight
                                ]
                            else:
                                pointer = [
                                    load_model.embeddings.make_embedding.emb_luts[0].weight
                                ]
                                if not model_opt.nopretrain_decemb:
                                    pointer.append(gen_linear.weight)
                                if model_opt.simple_fusion and model_opt.sf_pretrain_dec_emb:
                                    pointer.append(
                                        decoder.embeddings.make_embedding.emb_luts[0].weight)
                        elif name[0] == 'ln_f':
                            if name[1] == 'weight':
                                pointer = load_model.layer_norm.weight
                            elif name[1] == 'bias':
                                pointer = load_model.layer_norm.bias
                            else:
                                raise ValueError('I am missing something here!')
                        elif name[0] == 'h':
                            layer_num = name[1]
                            pointer = getattr(load_model.transformer_layers,
                                              layer_num)
                            if name[2] == 'attn':
                                assigned = True
                                pointer = pointer.self_attn
                                full_data = torch.from_numpy(array)
                                if name[3] == 'c_attn':
                                    end_size = full_data.shape[-1] // 3
                                    assert full_data.shape[-1] % 3 == 0
                                    if name[4] == 'bias':
                                        if init_something:
                                            pointer.linear_query.bias.data = \
                                                full_data[:end_size]
                                            pointer.linear_keys.bias.data = \
                                                full_data[end_size:end_size * 2]
                                            pointer.linear_values.bias.data = \
                                                full_data[end_size * 2:]
                                        if model_opt.gpt2_params_std > 0:
                                            pointer.linear_query.bias.orig = \
                                                full_data[:end_size].clone()
                                            pointer.linear_keys.bias.orig = \
                                                full_data[end_size:end_size * 2].clone()
                                            pointer.linear_values.bias.orig = \
                                                full_data[end_size * 2:].clone()
                                    elif name[4] == 'weight':
                                        if init_something:
                                            pointer.linear_query.weight.data = \
                                                full_data[:, :end_size].t().contiguous()
                                            pointer.linear_keys.weight.data = \
                                                full_data[:, end_size:end_size * 2].t().contiguous()
                                            pointer.linear_values.weight.data = \
                                                full_data[:, end_size * 2:].t().contiguous()
                                        if model_opt.gpt2_params_std > 0:
                                            pointer.linear_query.weight.orig = \
                                                full_data[:, :end_size].t().contiguous().clone()
                                            pointer.linear_keys.weight.orig = \
                                                full_data[:, end_size:end_size * 2].t().contiguous().clone()
                                            pointer.linear_values.weight.orig = \
                                                full_data[:, end_size * 2:].t().contiguous().clone()
                                    else:
                                        raise ValueError('I am missing something here!')
                                elif name[3] == 'c_proj':
                                    if name[4] == 'bias':
                                        if init_something:
                                            pointer.final_linear.bias.data = full_data
                                        if model_opt.gpt2_params_std > 0:
                                            pointer.final_linear.bias.orig = \
                                                full_data.clone()
                                    elif name[4] == 'weight':
                                        if init_something:
                                            pointer.final_linear.weight.data = \
                                                full_data.t().contiguous()
                                        if model_opt.gpt2_params_std > 0:
                                            pointer.final_linear.weight.orig = \
                                                full_data.t().contiguous().clone()
                                    else:
                                        raise ValueError('I am missing something here!')
                            elif name[2] == 'ln_1' or name[2] == 'ln_2':
                                num = name[2][3]
                                pointer = getattr(pointer, 'layer_norm_' + num)
                                # note: the parameter name sits at name[3];
                                # name[2] is 'ln_1'/'ln_2' itself
                                if name[3] == 'bias':
                                    pointer = pointer.bias
                                elif name[3] == 'weight':
                                    pointer = pointer.weight
                                else:
                                    raise ValueError('I am missing something here!')
                            elif name[2] == 'mlp':
                                pointer = pointer.feed_forward
                                # note: the sub-module ('c_fc'/'c_proj') sits
                                # at name[3] and the parameter name at name[4]
                                pointer = getattr(pointer, name[3])
                                if name[4] == 'bias':
                                    pointer = pointer.bias
                                elif name[4] == 'weight':
                                    pointer = pointer.weight
                                else:
                                    raise ValueError('I am missing something here!')
                            else:
                                raise ValueError('I am missing something here!')
                        else:
                            raise ValueError('I am missing something here!')

                        if not assigned:
                            if name[-1] == 'weight':
                                array = array.T
                            if not isinstance(pointer, list):
                                pointer = [pointer]
                            for pointer_i in pointer:
                                target_size = int(math.ceil(array.shape[0] / 8)) * 8
                                padded_vocab = name[0] == 'wte' and \
                                    pointer_i.shape[0] == target_size
                                padded_vocab = padded_vocab and \
                                    pointer_i.shape[1:] == array.shape[1:]
                                try:
                                    assert pointer_i.shape == array.shape \
                                        or padded_vocab
                                except AssertionError as e:
                                    e.args += (pointer_i.shape, array.shape)
                                    raise
                                if init_something:
                                    print("Initialize PyTorch weight {}".format(name))
                                    if padded_vocab:
                                        pointer_i.data[:array.shape[0]] = \
                                            torch.from_numpy(array)
                                    else:
                                        pointer_i.data = torch.from_numpy(array)
                                if model_opt.gpt2_params_std > 0:
                                    if padded_vocab:
                                        raise NotImplementedError
                                    else:
                                        pointer_i.orig = \
                                            torch.from_numpy(array).clone()

            if 'enc_model' in checkpoint:
                # checkpoint['enc_model'] is iterated as (key, value) pairs,
                # like checkpoint['gpt2_params'] above
                load_dict = {
                    k[8:]: v for k, v in checkpoint['enc_model']
                    if 'encoder' in k
                }
                encoder.load_state_dict(load_dict, strict=True)
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if not model_opt.unconditional and hasattr(model.encoder, 'embeddings') \
                and model.encoder.embeddings is not None:
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec)

    # remove requires_grad from params that are not trained:
    if model_opt.notrain_emb or model_opt.notrain_embanddec:
        if model_opt.position_encoding_learned_enc and \
                model_opt.share_position_embeddings:
            model.encoder.embeddings.make_embedding.pe.pe.weight.requires_grad = False
        if model_opt.share_embeddings:
            model.encoder.embeddings.make_embedding.emb_luts[0].weight.requires_grad = False
        model.decoder.embeddings.make_embedding.pe.pe.weight.requires_grad = False
        model.decoder.embeddings.make_embedding.emb_luts[0].weight.requires_grad = False
        generator[0].weight.requires_grad = False

    if model_opt.notrain_genbias:
        generator[0].bias.requires_grad = False

    if model_opt.notrain_embanddec:
        for name, p in load_decoder.layer_norm.named_parameters():
            p.requires_grad = False
        for name, p in load_decoder.transformer_layers.named_parameters():
            if 'context' not in name and 'ctx' not in name:
                # Takes care of normal and psa versions
                p.requires_grad = False

    if model_opt.onlytrainln:
        for name, p in model.decoder.named_parameters():
            if 'layer_norm' not in name:
                p.requires_grad = False
        for p in generator.parameters():
            p.requires_grad = False

    if model_opt.onlytrainoutp:
        if model_opt.share_decoder_embeddings:
            raise ValueError
        for p in model.decoder.parameters():
            p.requires_grad = False

    if model_opt.simple_fusion:
        for p in lm_decoder.parameters():
            p.requires_grad = False
        for p in generator.lm_linear.parameters():
            p.requires_grad = False

    model.generator = generator
    model.to(device)
    if model_opt.model_dtype == 'fp16':
        model.half()

    for p in model.parameters():
        if hasattr(p, 'orig'):
            p.orig = p.orig.to(device)
            if model_opt.model_dtype == 'fp16':
                p.orig = p.orig.half()

    return model
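# --- Added illustration (not part of the original sources) ----------------
# What the fix_key() backward-compat patch above actually renames: old
# checkpoints stored LayerNorm parameters as a_2/b_2, newer code expects
# weight/bias. The regexes are copied verbatim from the builders.
import re

def fix_key(s):
    s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2', r'\1.layer_norm\2.bias', s)
    s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2', r'\1.layer_norm\2.weight', s)
    return s

assert fix_key('decoder.transformer_layers.0.layer_norm_1.a_2') == \
    'decoder.transformer_layers.0.layer_norm_1.weight'
assert fix_key('encoder.layer_norm.b_2') == 'encoder.layer_norm.bias'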
def build_base_model(model_opt, fields, gpu, length_model, length_penalty_a,
                     length_penalty_b, checkpoint=None):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        fields: `Field` objects for the model.
        gpu(bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
                    model from a stopped training.
    Returns:
        the NMTModel.
    """
    assert model_opt.model_type in ["text", "img", "audio"], \
        ("Unsupported model type %s" % (model_opt.model_type))

    # for backward compatibility
    if model_opt.rnn_size != -1:
        model_opt.enc_rnn_size = model_opt.rnn_size
        model_opt.dec_rnn_size = model_opt.rnn_size
        if model_opt.model_type == 'text' and \
                model_opt.enc_rnn_size != model_opt.dec_rnn_size:
            raise AssertionError("We do not support different encoder and "
                                 "decoder rnn sizes for translation now.")

    # Build encoder.
    if model_opt.model_type == "text":
        src_dict = fields["src"].vocab
        feature_dicts = inputters.collect_feature_vocabs(fields, 'src')
        src_embeddings = build_embeddings(model_opt, src_dict, feature_dicts)
        encoder = build_encoder(model_opt, src_embeddings)
    elif model_opt.model_type == "img":
        if "image_channel_size" not in model_opt.__dict__:
            image_channel_size = 3
        else:
            image_channel_size = model_opt.image_channel_size
        encoder = ImageEncoder(model_opt.enc_layers, model_opt.brnn,
                               model_opt.enc_rnn_size, model_opt.dropout,
                               image_channel_size)
    elif model_opt.model_type == "audio":
        encoder = AudioEncoder(model_opt.rnn_type, model_opt.enc_layers,
                               model_opt.dec_layers, model_opt.brnn,
                               model_opt.enc_rnn_size, model_opt.dec_rnn_size,
                               model_opt.audio_enc_pooling, model_opt.dropout,
                               model_opt.sample_rate, model_opt.window_size)

    # Build decoder.
    tgt_dict = fields["tgt"].vocab
    feature_dicts = inputters.collect_feature_vocabs(fields, 'tgt')
    tgt_embeddings = build_embeddings(model_opt, tgt_dict,
                                      feature_dicts, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        if src_dict != tgt_dict:
            raise AssertionError('The `-share_vocab` should be set during '
                                 'preprocess if you use share_embeddings!')
        tgt_embeddings.word_lut.weight = src_embeddings.word_lut.weight

    decoder = build_decoder(model_opt, tgt_embeddings)

    # Build NMTModel(= encoder + decoder).
    device = torch.device("cuda" if gpu else "cpu")
    model = onmt.models.NMTModel(encoder, decoder)

    # Build Generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)

        # MMM
        class tune_out_prob(nn.Module):
            """Adds a length-model bonus to the EOS log-probability."""

            def __init__(self):
                super(tune_out_prob, self).__init__()
                self.t_lens = None
                self.eos_ind = None
                self.batch_max_len = None
                self.word_index = None
                self.tgt_vocab_size = None
                self.validation = False

            def length_model_loss(self, scale, value, a, b):
                # Gaussian-style penalty centred on the predicted length.
                return -a * (value / scale) ** 2 + b

            def forward(self, x):
                y = x.clone()
                if self.training or self.validation:  # training phase
                    y = y.view(self.batch_max_len, -1, self.tgt_vocab_size)
                    for wi in range(self.batch_max_len):
                        delta_p = (self.t_lens - wi - 1).float()
                        delta_p[delta_p < 0] = 0.05 * delta_p[delta_p < 0]
                        scale = (self.t_lens.float()).sqrt() / 2.0
                        penalties = self.length_model_loss(
                            scale, delta_p, length_penalty_a, length_penalty_b)
                        y[wi, :, self.eos_ind] += penalties
                    y = y.view(-1, self.tgt_vocab_size)
                else:  # translation phase
                    if len(x.size()) == 3:
                        # x of shape [tgt_len, batch_size, vocab] is a full
                        # sentence; nothing to adjust here.
                        pass
                    else:
                        # x of shape [(batch_size x beam_size), vocab] is
                        # only for one step
                        beam_size = x.size(0) // self.t_lens.numel()
                        wi = self.word_index
                        delta_p = (self.t_lens - wi - 2).float()
                        delta_p[delta_p < 0] = 0.005 * delta_p[delta_p < 0]
                        delta_p = delta_p.unsqueeze(1).expand(
                            self.t_lens.numel(), beam_size).flatten()
                        scale = (self.t_lens.float()).sqrt() / 2.0
                        scale = scale.unsqueeze(1).expand(
                            self.t_lens.numel(), beam_size).flatten()
                        penalties = self.length_model_loss(
                            scale, delta_p, length_penalty_a, length_penalty_b)
                        y[:, self.eos_ind] += penalties
                return y
        # MMM

        if length_model in ('oracle', 'fixed_ratio', 'lstm'):
            generator = nn.Sequential(
                nn.Linear(model_opt.dec_rnn_size, len(fields["tgt"].vocab)),
                gen_func,
                tune_out_prob())
        else:
            generator = nn.Sequential(
                nn.Linear(model_opt.dec_rnn_size, len(fields["tgt"].vocab)),
                gen_func)
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        generator = CopyGenerator(model_opt.dec_rnn_size, fields["tgt"].vocab)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using custom layernorm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = \
            {fix_key(k): v for (k, v) in checkpoint['model'].items()}
        # end of patch for backward compatibility
        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc, model_opt.fix_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec, model_opt.fix_word_vecs_dec)

    # Add generator to model (this registers it as parameter of model).
    model.generator = generator
    model.to(device)

    return model
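# --- Added illustration (not part of the original sources) ----------------
# A toy run of the Gaussian-style EOS bonus computed by tune_out_prob above,
# with assumed coefficients a=1.0, b=0.0. At step wi, EOS log-probs receive
# -a * (delta / scale)^2 + b, where delta is the (softened) distance to the
# predicted target length and scale = sqrt(t_lens) / 2.
import torch

def length_model_loss(scale, value, a=1.0, b=0.0):
    return -a * (value / scale) ** 2 + b

t_lens = torch.tensor([10.0])                   # predicted target length
scale = t_lens.sqrt() / 2.0
for wi in range(12):
    delta = t_lens - wi - 1
    delta[delta < 0] = 0.05 * delta[delta < 0]  # gentler slope past t_lens
    print(wi, round(length_model_loss(scale, delta).item(), 3))
# The bonus peaks at 0.0 exactly when wi == t_lens - 1 and decays
# quadratically on either side.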
def make_base_model(model_opt, mappings, gpu, checkpoint=None):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        mappings: dict of vocab mappings for the model.
        gpu(bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
                    model from a stopped training.
    Returns:
        the NMTModel.
    """
    # Make encoder.
    src_dict = mappings['src_vocab']
    src_embeddings = make_embeddings(model_opt, src_dict)
    encoder = make_encoder(model_opt, src_embeddings)

    # Make context embedder.
    if model_opt.num_context > 0:
        context_dict = mappings['utterance_vocab']
        context_embeddings = make_embeddings(model_opt, context_dict)
        context_embedder = make_context_embedder(model_opt,
                                                 context_embeddings)

    # Make kb embedder.
    if "multibank" in model_opt.global_attention:
        if model_opt.model == 'lf2lf':
            kb_embedder = None
        else:
            kb_dict = mappings['kb_vocab']
            kb_embeddings = make_embeddings(model_opt, kb_dict)
            kb_embedder = make_context_embedder(model_opt, kb_embeddings,
                                                'kb')

    # Make decoder.
    tgt_dict = mappings['tgt_vocab']
    tgt_embeddings = make_embeddings(model_opt, tgt_dict)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        if src_dict != tgt_dict:
            raise AssertionError('The `-share_vocab` should be set during '
                                 'preprocess if you use share_embeddings!')
        tgt_embeddings.word_lut.weight = src_embeddings.word_lut.weight

    decoder = make_decoder(model_opt, tgt_embeddings, tgt_dict)

    if "multibank" in model_opt.global_attention:
        model = NegotiationModel(encoder, decoder, context_embedder,
                                 kb_embedder, stateful=model_opt.stateful)
    else:
        model = NMTModel(encoder, decoder, stateful=model_opt.stateful)
    model.model_type = 'text'

    # Make Generator.
    if not model_opt.copy_attn:
        generator = nn.Sequential(
            nn.Linear(model_opt.rnn_size, len(tgt_dict)),
            nn.LogSoftmax())
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        # note: this builder receives `mappings`, not `fields`, so the
        # target vocab comes from mappings here
        generator = CopyGenerator(model_opt.rnn_size, tgt_dict)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        print('Loading model parameters.')
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
    else:
        if model_opt.param_init != 0.0:
            print('Initializing model parameters.')
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)

        wordvec = {'utterance': model_opt.pretrained_wordvec[0]}
        if len(model_opt.pretrained_wordvec) > 1:
            wordvec['kb'] = model_opt.pretrained_wordvec[1]

        def load_wordvec(embeddings, name):
            embeddings.load_pretrained_vectors(
                wordvec[name], model_opt.fix_pretrained_wordvec)

        # Don't need pretrained word vec for LFs
        if model_opt.model not in ('lf2lf',):
            load_wordvec(model.encoder.embeddings, 'utterance')
            if hasattr(model, 'context_embedder'):
                load_wordvec(model.context_embedder.embeddings, 'utterance')
            if hasattr(model, 'kb_embedder') and model.kb_embedder is not None:
                load_wordvec(model.kb_embedder.embeddings, 'kb')
        if model_opt.model == 'seq2seq':
            load_wordvec(model.decoder.embeddings, 'utterance')

    # Add generator to model (this registers it as parameter of model).
    model.generator = generator

    # Make the whole model leverage GPU if indicated to do so.
    if gpu:
        model.cuda()
    else:
        model.cpu()

    return model
def build_base_model(model_opt, fields, gpu, checkpoint=None):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        fields: `Field` objects for the model.
        gpu(bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
                    model from a stopped training.
    Returns:
        the NMTModel.
    """
    assert model_opt.model_type in ["text", "img", "audio"], \
        ("Unsupported model type %s" % (model_opt.model_type))

    # Build encoder.
    if model_opt.model_type == "text":
        src_dict = fields["src"].vocab
        feature_dicts = inputters.collect_feature_vocabs(fields, 'src')
        src_embeddings = build_embeddings(model_opt, src_dict, feature_dicts)
        # we added an additional encoder: TransformerEncoderLM
        encoder = build_encoder(model_opt, src_embeddings)
    elif model_opt.model_type == "img":
        if "image_channel_size" not in model_opt.__dict__:
            image_channel_size = 3
        else:
            image_channel_size = model_opt.image_channel_size
        encoder = ImageEncoder(model_opt.enc_layers, model_opt.brnn,
                               model_opt.enc_rnn_size, model_opt.dropout,
                               image_channel_size)
    elif model_opt.model_type == "audio":
        encoder = AudioEncoder(model_opt.rnn_type, model_opt.enc_layers,
                               model_opt.dec_layers, model_opt.brnn,
                               model_opt.enc_rnn_size, model_opt.dec_rnn_size,
                               model_opt.audio_enc_pooling, model_opt.dropout,
                               model_opt.sample_rate, model_opt.window_size)

    # Build decoder.
    tgt_dict = fields["tgt"].vocab
    feature_dicts = inputters.collect_feature_vocabs(fields, 'tgt')
    tgt_embeddings = build_embeddings(model_opt, tgt_dict,
                                      feature_dicts, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        if src_dict != tgt_dict:
            raise AssertionError('The `-share_vocab` should be set during '
                                 'preprocess if you use share_embeddings!')
        tgt_embeddings.word_lut.weight = src_embeddings.word_lut.weight

    decoder = build_decoder(model_opt, tgt_embeddings)

    lm_aux = model_opt.encoder_type == "transformerAuxLTR"

    # Build NMTModel(= encoder + decoder).
    device = torch.device("cuda" if gpu else "cpu")
    # the model will return more stuff
    model = onmt.models.NMTModel(encoder, decoder, lm_aux=lm_aux)

    # Build Generator: it maps decoder hidden states to words in the vocab.
    # Note that we use shared embeddings between encoder and decoder, plus
    # shared embeddings between decoder src and tgt.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size, len(fields["tgt"].vocab)),
            gen_func)
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        generator = CopyGenerator(model_opt.dec_rnn_size, fields["tgt"].vocab)

    # Build source generator (copy attention is not considered here).
    if lm_aux:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        # source vocab does not have <s> </s>, but share_vocab might be
        # different...
        src_generator = nn.Sequential(
            nn.Linear(model_opt.enc_rnn_size, len(fields["src"].vocab)),
            gen_func)
        # this makes sure that encoder and decoder share the same generator
        # weights
        if model_opt.share_decoder_embeddings:
            src_generator[0].weight = src_embeddings.word_lut.weight

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)
        # if lm_aux:
        #     src_generator.load_state_dict(checkpoint['src_generator'],
        #                                   strict=False)
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            if lm_aux:
                for p in src_generator.parameters():
                    p.data.uniform_(-model_opt.param_init,
                                    model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            if lm_aux:
                for p in src_generator.parameters():
                    if p.dim() > 1:
                        xavier_uniform_(p)

        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc, model_opt.fix_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec, model_opt.fix_word_vecs_dec)

    # Add generator to model (this registers it as parameter of model).
    model.generator = generator
    if lm_aux:
        model.src_generator = src_generator
    model.to(device)

    return model
def make_base_model(model_opt, fields, gpu, checkpoint=None):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        fields: `Field` objects for the model.
        gpu(bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
                    model from a stopped training.
    Returns:
        the NMTModel.
    """
    assert model_opt.model_type in ["text", "img"], \
        ("Unsupported model type %s" % (model_opt.model_type))

    # Make encoder.
    if model_opt.model_type == "text":
        src_dict = fields["src"].vocab
        feature_dicts = ONMTDataset.collect_feature_dicts(fields)
        src_embeddings = make_embeddings(model_opt, src_dict, feature_dicts)
        encoder = make_encoder(model_opt, src_embeddings)
    else:
        encoder = ImageEncoder(model_opt.layers, model_opt.brnn,
                               model_opt.rnn_size, model_opt.dropout)

    # Make decoder.
    tgt_dict = fields["tgt"].vocab
    # TODO: prepare for a future where tgt features are possible.
    feature_dicts = []
    tgt_embeddings = make_embeddings(model_opt, tgt_dict,
                                     feature_dicts, for_encoder=False)
    decoder = make_decoder(model_opt, tgt_embeddings)

    # Make NMTModel(= encoder + decoder).
    model = NMTModel(encoder, decoder)

    # Make Generator. Note that it emits raw scores; log-softmax / softmax
    # are attached to the model separately below.
    if not model_opt.copy_attn:
        generator = nn.Sequential(
            nn.Linear(model_opt.rnn_size, len(fields["tgt"].vocab)))
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        generator = CopyGenerator(model_opt, fields["src"].vocab,
                                  fields["tgt"].vocab)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        print('Loading model parameters.')
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
    else:
        if model_opt.param_init != 0.0:
            print('Initializing parameters.')
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        model.encoder.embeddings.load_pretrained_vectors(
            model_opt.pre_word_vecs_enc, model_opt.fix_word_vecs_enc)
        model.decoder.embeddings.load_pretrained_vectors(
            model_opt.pre_word_vecs_dec, model_opt.fix_word_vecs_dec)

    logsoftmax = nn.LogSoftmax()
    softmax = nn.Softmax()

    # add the generator to the module (does this register the parameter?)
    model.generator = generator
    model.logsoftmax = logsoftmax
    model.softmax = softmax

    # Make the whole model leverage GPU if indicated to do so.
    if gpu:
        model.cuda()
    else:
        model.cpu()

    return model
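# --- Added illustration (not part of the original sources) ----------------
# This older variant's generator emits raw logits and leaves normalization
# to model.logsoftmax / model.softmax. A toy check of that split (note the
# explicit dim, which the deprecated no-arg LogSoftmax() left implicit):
import torch
import torch.nn as nn

gen = nn.Sequential(nn.Linear(16, 50))   # raw scores only
logsoftmax = nn.LogSoftmax(dim=-1)
hidden = torch.randn(4, 16)
log_probs = logsoftmax(gen(hidden))
print(log_probs.exp().sum(-1))           # each row sums to ~1.0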
def build_base_model(model_opt, fields, gpu, checkpoint=None):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        fields: `Field` objects for the model.
        gpu(bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
                    model from a stopped training.
    Returns:
        the NMTModel.
    """
    assert model_opt.model_type in ["text", "img", "audio"], \
        ("Unsupported model type %s" % (model_opt.model_type))

    # Build encoder.
    src_dict = fields["src"].vocab
    feature_dicts = inputters.collect_feature_vocabs(fields, 'src')
    src_embeddings = build_embeddings(model_opt, src_dict, feature_dicts)
    encoder = build_encoder(model_opt, src_embeddings)

    # Build decoder.
    tgt_dict = fields["tgt"].vocab
    feature_dicts = inputters.collect_feature_vocabs(fields, 'tgt')
    tgt_embeddings = build_embeddings(model_opt, tgt_dict,
                                      feature_dicts, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        if src_dict != tgt_dict:
            raise AssertionError('The `-share_vocab` should be set during '
                                 'preprocess if you use share_embeddings!')
        tgt_embeddings.word_lut.weight = src_embeddings.word_lut.weight

    decoder = build_decoder(model_opt, tgt_embeddings)

    # Build NMTModel(= [session encoder] + encoder + decoder).
    device = torch.device("cuda" if gpu else "cpu")
    if model_opt.experiment == 'session':
        # Build Session Encoder.
        item_embeddings = build_embeddings(
            model_opt, fields["src_item_sku"].vocab, [], for_user=True)
        user_log_embeddings = build_embeddings(
            model_opt, fields["src_user_log"].vocab, [], for_user=True)
        user_op_embeddings = build_embeddings(
            model_opt, fields["src_operator"].vocab, [], for_user=True)
        user_site_cy_embeddings = build_embeddings(
            model_opt, fields["src_site_cy"].vocab, [], for_user=True)
        user_site_pro_embeddings = build_embeddings(
            model_opt, fields["src_site_pro"].vocab, [], for_user=True)
        user_site_ct_embeddings = build_embeddings(
            model_opt, fields["src_site_ct"].vocab, [], for_user=True)
        session_encoder = SessionEncoder(
            item_embeddings, user_log_embeddings, user_op_embeddings,
            user_site_cy_embeddings, user_site_pro_embeddings,
            user_site_ct_embeddings)
    else:
        session_encoder = None
    model = onmt.models.NMTModel(session_encoder, encoder, decoder)
    model.model_type = model_opt.model_type

    # Build Generator (always a copy generator in this variant).
    generator = CopyGenerator(model_opt.rnn_size, fields["tgt"].vocab,
                              model_opt.session_weight,
                              model_opt.explanation_weight)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc, model_opt.fix_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec, model_opt.fix_word_vecs_dec)

    # Add generator to model (this registers it as parameter of model).
    model.generator = generator
    model.to(device)

    return model
def make_base_model(model_opt, fields, gpu, checkpoint=None):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        fields: `Field` objects for the model.
        gpu(bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
                    model from a stopped training.
    Returns:
        the NMTModel.
    """
    assert model_opt.model_type in ["text", "img", "audio"], \
        ("Unsupported model type %s" % (model_opt.model_type))

    # Make encoder.
    if model_opt.model_type == "text":
        src_dict = fields["src"].vocab
        src_feature_dicts = onmt.io.collect_feature_vocabs(fields, 'src')
        src_embeddings = make_embeddings(model_opt, src_dict,
                                         src_feature_dicts)
        encoder = make_encoder(model_opt, src_embeddings)
    elif model_opt.model_type == "img":
        encoder = ImageEncoder(model_opt.enc_layers, model_opt.brnn,
                               model_opt.rnn_size, model_opt.dropout)
    elif model_opt.model_type == "audio":
        encoder = AudioEncoder(model_opt.enc_layers, model_opt.brnn,
                               model_opt.rnn_size, model_opt.dropout,
                               model_opt.sample_rate, model_opt.window_size)

    # Make decoder.
    tgt_dict = fields["tgt"].vocab
    tgt_feature_dicts = onmt.io.collect_feature_vocabs(fields, 'tgt')
    tgt_embeddings = make_embeddings(model_opt, tgt_dict,
                                     tgt_feature_dicts, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        if src_dict != tgt_dict:
            raise AssertionError('The `-share_vocab` should be set during '
                                 'preprocess if you use share_embeddings!')
        tgt_embeddings.word_lut.weight = src_embeddings.word_lut.weight

    decoder = make_decoder(model_opt, tgt_embeddings)

    # Make inference network.
    inference_network = make_inference_network(
        model_opt, src_embeddings, tgt_embeddings,
        src_dict, src_feature_dicts, tgt_dict, tgt_feature_dicts
    ) if model_opt.inference_network_type != "none" else None

    if model_opt.prior_normalization == "bnshare":
        decoder.attn.bn_mu = inference_network.bn_mu
        decoder.attn.bn_std = inference_network.bn_std

    # Make NMTModel(= encoder + decoder + inference network).
    model = (NMTModel(encoder, decoder, None, "none")
             if inference_network is None
             else ViNMTModel(encoder, decoder, inference_network,
                             dist_type=model_opt.dist_type,
                             use_prior=model_opt.use_prior > 0))
    model.model_type = model_opt.model_type

    # Make Generator.
    if not model_opt.copy_attn:
        generator = nn.Sequential(
            nn.Linear(model_opt.rnn_size, len(fields["tgt"].vocab)),
            nn.LogSoftmax(dim=1))
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        generator = CopyGenerator(model_opt.rnn_size, fields["tgt"].vocab)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        print('Loading model parameters.')
        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'])
    else:
        if model_opt.param_init != 0.0:
            print('Initializing model parameters.')
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform(p)

        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc, model_opt.fix_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec, model_opt.fix_word_vecs_dec)

    # Add generator to model (this registers it as parameter of model).
    model.generator = generator

    # Make the whole model leverage GPU if indicated to do so.
    if gpu:
        model.cuda()
    else:
        model.cpu()

    return model
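# --- Added illustration (not part of the original sources) ----------------
# Why the load above passes strict=False: keys present in the checkpoint but
# absent from the model (e.g. the inference network when it is disabled) are
# reported instead of raising. A minimal demonstration:
import torch
import torch.nn as nn

m = nn.Linear(4, 4)
state = m.state_dict()
state['inference_network.weight'] = torch.zeros(1)  # hypothetical extra key
result = m.load_state_dict(state, strict=False)
print(result.unexpected_keys)  # ['inference_network.weight']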
def make_base_model(model_opt, fields, gpu, checkpoint=None, stage1=True,
                    basic_enc_dec=False):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        fields: `Field` objects for the model.
        gpu(bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
                    model from a stopped training.
    Returns:
        the NMTModel.
    """
    if stage1 and not basic_enc_dec:
        assert False
        src = "src1"
        tgt = "tgt1"
    else:
        src = "src2"
        tgt = "tgt2"
    src_hist = "src1_hist" if (basic_enc_dec or stage1) else None

    assert model_opt.model_type in ["text", "img", "audio"], \
        ("Unsupported model type %s" % (model_opt.model_type))

    # Make encoder.
    if model_opt.model_type == "text":
        src_dict = fields[src].vocab
        feature_dicts = onmt.io.collect_feature_vocabs(fields, src)
        src_embeddings = make_embeddings(model_opt, src_dict, feature_dicts,
                                         hist_dict=fields[src_hist].vocab,
                                         use_hier_hist=True)
        encoder = make_encoder(model_opt, src_embeddings, stage1,
                               basic_enc_dec)
    elif model_opt.model_type == "img":
        encoder = ImageEncoder(model_opt.enc_layers, model_opt.brnn,
                               model_opt.rnn_size, model_opt.dropout)
    elif model_opt.model_type == "audio":
        encoder = AudioEncoder(model_opt.enc_layers, model_opt.brnn,
                               model_opt.rnn_size, model_opt.dropout,
                               model_opt.sample_rate, model_opt.window_size)

    # Make decoder.
    tgt_dict = fields[tgt].vocab
    feature_dicts = onmt.io.collect_feature_vocabs(fields, tgt)
    tgt_embeddings = make_embeddings(model_opt, tgt_dict,
                                     feature_dicts, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        if src_dict != tgt_dict:
            raise AssertionError('The `-share_vocab` should be set during '
                                 'preprocess if you use share_embeddings!')
        tgt_embeddings.word_lut.weight = src_embeddings.word_lut.weight

    decoder = make_decoder(model_opt, tgt_embeddings,
                           stage1 and not basic_enc_dec, basic_enc_dec)

    # Make NMTModel(= encoder + decoder).
    model = NMTModel(encoder, decoder)
    model.model_type = model_opt.model_type

    # Make Generator.
    generator = CopyGenerator(model_opt.rnn_size, fields["tgt2"].vocab)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        print('Loading model parameters.')
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
    else:
        if model_opt.param_init != 0.0:
            print('Initializing model parameters.')
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc, model_opt.fix_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec, model_opt.fix_word_vecs_dec)

    # Add generator to model (this registers it as parameter of model).
    model.generator = generator

    # Make the whole model leverage GPU if indicated to do so.
    if gpu:
        model.cuda()
    else:
        model.cpu()

    return model
def build_base_model(model_opt, fields, gpu, checkpoint=None):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        fields: `Field` objects for the model.
        gpu(bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
                    model from a stopped training.
    Returns:
        the NMTModel.
    """
    assert model_opt.model_type in ["text", "img", "audio"], \
        ("Unsupported model type %s" % (model_opt.model_type))

    # Build encoder.
    if model_opt.model_type == "text":
        src_dict = fields["src"].vocab
        feature_dicts = inputters.collect_feature_vocabs(fields, 'src')
        src_embeddings = build_embeddings(model_opt, src_dict, feature_dicts)
        encoder = build_encoder(model_opt, src_embeddings)
    elif model_opt.model_type == "img":
        encoder = ImageEncoder(model_opt.enc_layers, model_opt.brnn,
                               model_opt.rnn_size, model_opt.dropout)
    elif model_opt.model_type == "audio":
        encoder = AudioEncoder(model_opt.enc_layers, model_opt.brnn,
                               model_opt.rnn_size, model_opt.dropout,
                               model_opt.sample_rate, model_opt.window_size)

    # Build decoder.
    tgt_dict = fields["tgt"].vocab
    feature_dicts = inputters.collect_feature_vocabs(fields, 'tgt')
    tgt_embeddings = build_embeddings(model_opt, tgt_dict,
                                      feature_dicts, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        if src_dict != tgt_dict:
            raise AssertionError('The `-share_vocab` should be set during '
                                 'preprocess if you use share_embeddings!')
        tgt_embeddings.word_lut.weight = src_embeddings.word_lut.weight

    decoder = build_decoder(model_opt, tgt_embeddings)

    # Build NMTModel(= encoder + decoder).
    device = torch.device("cuda" if gpu else "cpu")
    model = onmt.models.NMTModel(encoder, decoder)
    model.model_type = model_opt.model_type

    # Build Generator.
    if not model_opt.copy_attn:
        generator = nn.Sequential(
            nn.Linear(model_opt.rnn_size, len(fields["tgt"].vocab)),
            nn.LogSoftmax(dim=-1))
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        generator = CopyGenerator(model_opt.rnn_size, fields["tgt"].vocab)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc, model_opt.fix_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec, model_opt.fix_word_vecs_dec)

    if model_opt.share_embeddings:
        assert model.encoder.embeddings.word_lut.weight \
            is model.decoder.embeddings.word_lut.weight

    # Add generator to model (this registers it as parameter of model).
    model.generator = generator
    model.to(device)

    return model
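# --- Added illustration (not part of the original sources) ----------------
# The share_decoder_embeddings / share_embeddings tying used throughout:
# assigning one Parameter to another slot makes both modules share a single
# tensor, which is what the `is` assert above verifies. Toy shapes:
import torch.nn as nn

vocab, dim = 100, 32
emb = nn.Embedding(vocab, dim)       # weight shape: [vocab, dim]
proj = nn.Linear(dim, vocab)         # weight shape: [vocab, dim] as well
proj.weight = emb.weight             # tie: one Parameter, two modules
assert proj.weight is emb.weight     # updating one updates the other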
def make_base_model(model_opt, fields, gpu, checkpoint=None):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        fields: `Field` objects for the model.
        gpu: Boolean: whether to use gpu.
        checkpoint: the snapshot model.
    Returns:
        the NMTModel.
    """
    assert model_opt.model_type in ["text", "img"], \
        ("Unsupported model type %s" % (model_opt.model_type))

    # Make encoder.
    if model_opt.model_type == "text":
        src_dict = fields["src"].vocab
        feature_dicts = ONMTDataset.collect_feature_dicts(fields)
        src_embeddings = make_embeddings(model_opt, src_dict, feature_dicts)
        if model_opt.encoder_type == "double_encoder":
            inter_dict = fields['inter'].vocab
            inter_feature_dicts = ONMTDataset.collect_feature_dicts(fields)
            inter_embeddings = make_embeddings(model_opt, inter_dict,
                                               inter_feature_dicts,
                                               for_encoder=False,
                                               for_encoder_int=True)
            encoder = make_encoder(model_opt, src_embeddings,
                                   inter_embeddings)
        else:
            encoder = make_encoder(model_opt, src_embeddings)
    else:
        encoder = ImageEncoder(model_opt.layers, model_opt.brnn,
                               model_opt.rnn_size, model_opt.dropout)

    # Make decoder.
    tgt_dict = fields["tgt"].vocab
    # TODO: prepare for a future where tgt features are possible.
    feature_dicts = []
    tgt_embeddings = make_embeddings(model_opt, tgt_dict,
                                     feature_dicts, for_encoder=False)
    decoder = make_decoder(model_opt, tgt_embeddings)

    # Make NMTModel(= encoder + decoder).
    if model_opt.encoder_type == "double_encoder":
        model = DoubleEncNMTModel(encoder, decoder)
    else:
        model = NMTModel(encoder, decoder)

    # Make Generator.
    if not model_opt.copy_attn:
        generator = nn.Sequential(
            nn.Linear(model_opt.rnn_size, len(fields["tgt"].vocab)),
            nn.LogSoftmax())
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        generator = CopyGenerator(model_opt, fields["src"].vocab,
                                  fields["tgt"].vocab)

    # Load the model states from checkpoint.
    if checkpoint is not None:
        print('Loading model')
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])

    # add the generator to the module (does this register the parameter?)
    model.generator = generator

    # Make the whole model leverage GPU if indicated to do so.
    if gpu:
        model.cuda()
    else:
        model.cpu()

    return model
def build_base_model(model_opt, fields, gpu, checkpoint=None):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        fields: `Field` objects for the model.
        gpu(bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
                    model from a stopped training.
    Returns:
        the MemModel.
    """
    # Build encoder.
    src_dict = fields["src"].vocab
    feature_dicts = inputters.collect_feature_vocabs(fields, 'src')
    src_embeddings = build_embeddings(model_opt, src_dict, feature_dicts)
    encoder = build_encoder(model_opt, src_embeddings)

    # Build decoder.
    tgt_dict = fields["tgt"].vocab
    feature_dicts = inputters.collect_feature_vocabs(fields, 'tgt')
    tgt_embeddings = build_embeddings(model_opt, tgt_dict,
                                      feature_dicts, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        if src_dict != tgt_dict:
            raise AssertionError('The `-share_vocab` should be set during '
                                 'preprocess if you use share_embeddings!')
        tgt_embeddings.word_lut.weight = src_embeddings.word_lut.weight

    decoder = build_decoder(model_opt, tgt_embeddings)

    # Build NMTModel(= encoder + decoder).
    device = torch.device("cuda" if gpu else "cpu")
    model = onmt.models.MemModel(encoder, decoder)

    # Build Generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size, len(fields["tgt"].vocab)),
            gen_func)
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    elif model_opt.coref_vocab or model_opt.coref_attn:
        generator = CorefGenerator(model_opt.dec_rnn_size,
                                   fields["tgt"].vocab,
                                   fields["coref_tgt"].vocab)
    else:
        generator = CopyGenerator(model_opt.dec_rnn_size,
                                  fields["tgt"].vocab)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc, model_opt.fix_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec, model_opt.fix_word_vecs_dec)

    # Add generator to model (this registers it as parameter of model).
    model.generator = generator
    model.to(device)

    return model
def make_base_model_mmt(model_opt, fields, gpu, checkpoint=None):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        fields: `Field` objects for the model.
        gpu(bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
                    model from a stopped training.
    Returns:
        the Multimodal NMT model.
    """
    assert model_opt.model_type in ["text", "img", "audio"], \
        ("Unsupported model type %s" % (model_opt.model_type))

    # Make encoder.
    if model_opt.model_type == "text":
        src_dict = fields["src"].vocab
        feature_dicts = onmt.io.collect_feature_vocabs(fields, 'src')
        src_embeddings = make_embeddings(model_opt, src_dict, feature_dicts)
        if model_opt.multimodal_model_type in ['imgd', 'imge', 'src+img']:
            encoder = make_encoder(model_opt, src_embeddings)
        elif model_opt.multimodal_model_type == 'imgw':
            # model ImgW uses a specific source-language encoder
            encoder = RNNEncoderImageAsWord(model_opt.rnn_type,
                                            model_opt.brnn,
                                            model_opt.enc_layers,
                                            model_opt.rnn_size,
                                            model_opt.dropout,
                                            src_embeddings)
        else:
            raise Exception("Multi-modal model type not implemented: %s"
                            % model_opt.multimodal_model_type)
    elif model_opt.model_type == "img":
        encoder = ImageEncoder(model_opt.enc_layers, model_opt.brnn,
                               model_opt.rnn_size, model_opt.dropout)
    elif model_opt.model_type == "audio":
        encoder = AudioEncoder(model_opt.enc_layers, model_opt.brnn,
                               model_opt.rnn_size, model_opt.dropout,
                               model_opt.sample_rate, model_opt.window_size)

    # Make decoder.
    tgt_dict = fields["tgt"].vocab
    feature_dicts = onmt.io.collect_feature_vocabs(fields, 'tgt')
    tgt_embeddings = make_embeddings(model_opt, tgt_dict,
                                     feature_dicts, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        if src_dict != tgt_dict:
            raise AssertionError('The `-share_vocab` should be set during '
                                 'preprocess if you use share_embeddings!')
        tgt_embeddings.word_lut.weight = src_embeddings.word_lut.weight

    decoder = make_decoder(model_opt, tgt_embeddings)

    if model_opt.multimodal_model_type == 'src+img':
        # use the local image features "as is": encoder only reshapes them
        encoder_image = make_encoder_image_local_features(model_opt)
    else:
        # transform global image features before using them
        encoder_image = make_encoder_image_global_features(model_opt)

    # Make the multimodal NMT model (= encoder + decoder + image encoder).
    if model_opt.multimodal_model_type == 'imgd':
        model = NMTImgDModel(encoder, decoder, encoder_image)
    elif model_opt.multimodal_model_type == 'imge':
        model = NMTImgEModel(encoder, decoder, encoder_image)
    elif model_opt.multimodal_model_type == 'imgw':
        model = NMTImgWModel(encoder, decoder, encoder_image)
    elif model_opt.multimodal_model_type == 'src+img':
        # using image encoder only to reshape local features
        model = NMTSrcImgModel(encoder, decoder, encoder_image)
    else:
        raise Exception("Multi-modal model type not yet implemented: %s"
                        % (model_opt.multimodal_model_type))
    model.model_type = model_opt.model_type

    # Make Generator.
    if not model_opt.copy_attn:
        generator = nn.Sequential(
            nn.Linear(model_opt.rnn_size, len(fields["tgt"].vocab)),
            nn.LogSoftmax())
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        generator = CopyGenerator(model_opt.rnn_size, fields["tgt"].vocab)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        print('Loading model parameters.')
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
    else:
        if model_opt.param_init != 0.0:
            print('Initializing model parameters.')
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc, model_opt.fix_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec, model_opt.fix_word_vecs_dec)

    # Add generator to model (this registers it as parameter of model).
    model.generator = generator

    # Make the whole model leverage GPU if indicated to do so.
    if gpu:
        model.cuda()
    else:
        model.cpu()

    return model
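# --- Hedged sketch (not part of the original module): the effect of a
# nonzero `-param_init` above is a plain uniform fill over every parameter
# tensor, equivalent to the loop below on a toy module. The value 0.1 is an
# assumed option value used only for illustration.
import torch.nn as nn

param_init = 0.1                              # assumed option value
toy = nn.Linear(8, 8)
for p in toy.parameters():
    p.data.uniform_(-param_init, param_init)  # in-place U(-0.1, 0.1)
assert float(toy.weight.abs().max()) <= param_init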
def make_base_model(model_opt, fields, gpu, checkpoint=None):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        fields: `Field` objects for the model.
        gpu(bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
                    model from a stopped training.
    Returns:
        the NMTModel.
    """
    assert model_opt.model_type in ["text", "img", "audio"], \
        "Unsupported model type %s" % model_opt.model_type

    # Make encoder.
    if model_opt.model_type == "text":
        src_dict = fields["src"].vocab
        feature_dicts = onmt.io.collect_feature_vocabs(fields, 'src')
        src_embeddings = make_embeddings(model_opt, src_dict, feature_dicts)
        encoder = make_encoder(model_opt, src_embeddings)
    elif model_opt.model_type == "img":
        encoder = ImageEncoder(model_opt.enc_layers,
                               model_opt.brnn,
                               model_opt.rnn_size,
                               model_opt.dropout)
    elif model_opt.model_type == "audio":
        encoder = AudioEncoder(model_opt.enc_layers,
                               model_opt.brnn,
                               model_opt.rnn_size,
                               model_opt.dropout,
                               model_opt.sample_rate,
                               model_opt.window_size)

    # Make decoder.
    tgt_dict = fields["tgt"].vocab
    # TODO: prepare for a future where tgt features are possible.
    feature_dicts = onmt.io.collect_feature_vocabs(fields, 'tgt')
    tgt_embeddings = make_embeddings(model_opt, tgt_dict,
                                     feature_dicts, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        tgt_embeddings.word_lut.weight = src_embeddings.word_lut.weight

    decoder = make_decoder(model_opt, tgt_embeddings)

    # Make NMTModel(= encoder + decoder).
    model = NMTModel(encoder, decoder)
    model.model_type = model_opt.model_type

    # Make Generator.
    if not model_opt.copy_attn:
        generator = nn.Sequential(
            nn.Linear(model_opt.rnn_size, len(fields["tgt"].vocab)),
            nn.LogSoftmax(dim=-1))  # explicit dim avoids the deprecated implicit default
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        generator = CopyGenerator(model_opt, fields["src"].vocab,
                                  fields["tgt"].vocab)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        print('Loading model parameters.')
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
    else:
        if model_opt.param_init != 0.0:
            print('Initializing model parameters.')
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc, model_opt.fix_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec, model_opt.fix_word_vecs_dec)

    # Add generator to model (this registers it as parameter of model).
    model.generator = generator

    # Make the whole model leverage GPU if indicated to do so.
    if gpu:
        model.cuda()
    else:
        model.cpu()

    return model
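# --- Hedged sketch (not part of the original module): why the generator's
# LogSoftmax should name its axis explicitly. The generator must normalize
# over the vocabulary dimension, hence dim=-1; exp(log-probs) then sums to
# one per decoder state. Toy shapes below are assumptions.
import torch
import torch.nn as nn

scores = torch.randn(4, 100)                 # (decoder states, vocab)
log_probs = nn.LogSoftmax(dim=-1)(scores)
assert torch.allclose(log_probs.exp().sum(-1), torch.ones(4))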
def make_base_model(model_opt, src_dict, tgt_dict, gpu, checkpoint=None):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        src_dict: source vocabulary for the model.
        tgt_dict: target vocabulary for the model.
        gpu(bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
                    model from a stopped training.
    Returns:
        the NMTModel.
    """
    assert model_opt.model_type in ["text"], \
        "Unsupported model type %s" % model_opt.model_type

    # Make encoder.
    src_embeddings = make_embeddings(model_opt, src_dict, for_encoder=True)
    encoder = make_encoder(model_opt, src_embeddings)

    # Make decoder.
    tgt_embeddings = make_embeddings(model_opt, tgt_dict, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        if src_dict != tgt_dict:
            raise AssertionError('The `-share_vocab` should be set during '
                                 'preprocess if you use share_embeddings!')
        tgt_embeddings.word_lut.weight = src_embeddings.word_lut.weight

    # if model_opt.pre_word_vecs_enc is not None:
    #     print("Loading word vectors for encoder")
    #     pretrained = torch.load(opt.pre_word_vecs_enc)
    #     src_embeddings.word_lut.weight.data.copy_(pretrained)
    # if model_opt.pre_word_vecs_dec is not None:
    #     print("Loading word vectors for decoder")
    #     pretrained = torch.load(opt.pre_word_vecs_dec)
    #     tgt_embeddings.word_lut.weight.data.copy_(pretrained)

    decoder = make_decoder(model_opt, tgt_embeddings)

    # Make NMTModel(= encoder + decoder).
    model = NMTModel(encoder, decoder)
    model.model_type = model_opt.model_type

    # Make Generator.
    if not model_opt.copy_attn:
        generator = nn.Sequential(
            nn.Linear(model_opt.rnn_size, len(tgt_dict)),
            nn.LogSoftmax(dim=-1))  # explicit dim avoids the deprecated implicit default
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        generator = CopyGenerator(model_opt.rnn_size, tgt_dict)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        print('Loading model parameters.')
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
    else:
        if model_opt.param_init != 0.0:
            print('Initializing model parameters.')
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc, model_opt.fix_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec, model_opt.fix_word_vecs_dec)

    # Add generator to model (this registers it as parameter of model).
    model.generator = generator

    # Make the whole model leverage GPU if indicated to do so.
    if gpu:
        model.cuda()
    else:
        model.cpu()

    return model
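# --- Hedged sketch (not part of the original module): the commented-out
# pre_word_vecs block above amounts to copying a pre-trained tensor into
# the embedding table, roughly as below. The random tensor stands in for a
# `torch.load(path)` result; shapes are assumed toy values.
import torch
import torch.nn as nn

emb = nn.Embedding(10, 4)
pretrained = torch.randn(10, 4)              # stand-in for torch.load(path)
emb.weight.data.copy_(pretrained)            # in-place overwrite of the lut
assert torch.equal(emb.weight.data, pretrained)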
def build_base_model(model_opt, fields, gpu, checkpoint=None):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        fields: `Field` objects for the model.
        gpu(bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
                    model from a stopped training.
    Returns:
        the NMTModel.
    """
    assert model_opt.model_type in ["text", "img", "audio"], \
        "Unsupported model type %s" % model_opt.model_type

    # for backward compatibility
    if model_opt.rnn_size != -1:
        model_opt.enc_rnn_size = model_opt.rnn_size
        model_opt.dec_rnn_size = model_opt.rnn_size
        if model_opt.model_type == 'text' and \
                model_opt.enc_rnn_size != model_opt.dec_rnn_size:
            raise AssertionError("We do not support different encoder and "
                                 "decoder rnn sizes for translation now.")

    # Build encoder.
    if model_opt.model_type == "text":
        src_dict = fields["src"].vocab
        feature_dicts = inputters.collect_feature_vocabs(fields, 'src')
        src_embeddings = build_embeddings(model_opt, src_dict, feature_dicts)
        encoder = build_encoder(model_opt, src_embeddings)

    if model_opt.refer:
        ref_dict = fields["ref"].vocab
        feature_dicts = inputters.collect_feature_vocabs(fields, 'ref')
        ref_embeddings = build_embeddings(model_opt, ref_dict, feature_dicts)
        refer = build_encoder(model_opt, ref_embeddings)
    else:
        refer = None

    # Build decoder.
    tgt_dict = fields["tgt"].vocab
    feature_dicts = inputters.collect_feature_vocabs(fields, 'tgt')
    tgt_embeddings = build_embeddings(model_opt, tgt_dict,
                                      feature_dicts, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        if src_dict != tgt_dict:
            raise AssertionError('The `-share_vocab` should be set during '
                                 'preprocess if you use share_embeddings!')
        tgt_embeddings.word_lut.weight = src_embeddings.word_lut.weight

    decoder = build_decoder(model_opt, tgt_embeddings)

    # Build NMTModel(= encoder + decoder).
    device = torch.device("cuda" if gpu else "cpu")
    model = onmt.models.NMTModel(encoder, decoder, refer)

    # Build Generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size, len(fields["tgt"].vocab)),
            gen_func)
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        generator = CopyGenerator(model_opt.dec_rnn_size,
                                  fields["tgt"].vocab)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using custom layernorm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = \
            {fix_key(k): v for (k, v) in checkpoint['model'].items()}
        # end of patch for backward compatibility
        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc, model_opt.fix_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec, model_opt.fix_word_vecs_dec)

    # Add generator to model (this registers it as parameter of model).
    model.generator = generator
    model.to(device)

    return model
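# --- Hedged sketch (not part of the original module): what `fix_key` above
# does to old-style checkpoint keys. Older Transformer layernorms stored
# gain/bias as `a_2`/`b_2`; the regexes rename them to the stock PyTorch
# `weight`/`bias` names so load_state_dict can match. Sample keys below are
# illustrative assumptions.
import re

def fix_key(s):
    s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2', r'\1.layer_norm\2.bias', s)
    s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2', r'\1.layer_norm\2.weight', s)
    return s

assert fix_key('encoder.layer_norm.a_2') == 'encoder.layer_norm.weight'
assert fix_key('decoder.layer_norm_2.b_2') == 'decoder.layer_norm_2.bias'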
def build_base_model(model_opt, fields, gpu, checkpoint=None):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        fields: `Field` objects for the model.
        gpu(bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
                    model from a stopped training.
    Returns:
        the NMTModel.
    """
    assert model_opt.model_type in ["text", "img", "audio"], \
        "Unsupported model type %s" % model_opt.model_type

    # for backward compatibility
    if model_opt.rnn_size != -1:
        model_opt.enc_rnn_size = model_opt.rnn_size
        model_opt.dec_rnn_size = model_opt.rnn_size

    # Build encoder.
    if model_opt.model_type == "text":
        feat_fields = [fields[k]
                       for k in inputters.collect_features(fields, 'src')]
        src_emb = build_embeddings(model_opt, fields["src"], feat_fields)
        encoder = build_encoder(model_opt, src_emb)
    elif model_opt.model_type == "img":
        # why is build_encoder not used here?
        # why is the model_opt.__dict__ check necessary?
        if "image_channel_size" not in model_opt.__dict__:
            image_channel_size = 3
        else:
            image_channel_size = model_opt.image_channel_size
        encoder = ImageEncoder(
            model_opt.enc_layers,
            model_opt.brnn,
            model_opt.enc_rnn_size,
            model_opt.dropout,
            image_channel_size
        )
    elif model_opt.model_type == "audio":
        encoder = AudioEncoder(
            model_opt.rnn_type,
            model_opt.enc_layers,
            model_opt.dec_layers,
            model_opt.brnn,
            model_opt.enc_rnn_size,
            model_opt.dec_rnn_size,
            model_opt.audio_enc_pooling,
            model_opt.dropout,
            model_opt.sample_rate,
            model_opt.window_size
        )

    # Build decoder.
    feat_fields = [fields[k]
                   for k in inputters.collect_features(fields, 'tgt')]
    tgt_emb = build_embeddings(
        model_opt, fields["tgt"], feat_fields, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        assert fields['src'].vocab == fields['tgt'].vocab, \
            "preprocess with -share_vocab if you use share_embeddings"
        tgt_emb.word_lut.weight = src_emb.word_lut.weight

    decoder = build_decoder(model_opt, tgt_emb)
    decoder2 = build_decoder(model_opt, tgt_emb)

    # Build NMTModel(= encoder + decoder).
    device = torch.device("cuda" if gpu else "cpu")
    # model = onmt.models.NMTModel(encoder, decoder)
    model = onmt.models.KTransformerModel(encoder, decoder, decoder2)

    # Build Generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size, len(fields["tgt"].vocab)),
            gen_func
        )
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        vocab_size = len(fields["tgt"].vocab)
        pad_idx = fields["tgt"].vocab.stoi[fields["tgt"].pad_token]
        generator = CopyGenerator(model_opt.dec_rnn_size, vocab_size, pad_idx)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using custom layernorm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {fix_key(k): v
                               for k, v in checkpoint['model'].items()}
        # end of patch for backward compatibility
        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc, model_opt.fix_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec, model_opt.fix_word_vecs_dec)

    model.generator = generator
    model.to(device)

    return model
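# --- Hedged sketch (not part of the original module): the `p.dim() > 1`
# guard above applies Glorot/Xavier init to weight matrices only, leaving
# 1-D tensors (biases, layernorm gains) at their defaults; xavier_uniform_
# needs a fan-in/fan-out and raises on tensors with fewer than 2 dims.
# The toy module below is an assumption for illustration.
import torch.nn as nn
from torch.nn.init import xavier_uniform_

toy = nn.Linear(16, 16)
for p in toy.parameters():
    if p.dim() > 1:
        xavier_uniform_(p)  # 2-D weight: re-initialized
    # the 1-D bias is skipped: xavier_uniform_ would raise ValueError on it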
def build_base_model(model_opt, fields, gpu, checkpoint=None, gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
                    model from a stopped training.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """

    # for back compat when attention_dropout was not defined
    try:
        model_opt.attention_dropout
    except AttributeError:
        model_opt.attention_dropout = model_opt.dropout

    # Build embeddings.
    if model_opt.model_type == "text" or model_opt.model_type == "vec":
        src_field = fields["src"]
        src_emb = build_embeddings(model_opt, src_field)
    else:
        src_emb = None

    # Build encoder.
    encoder = build_encoder(model_opt, src_emb)

    # Build decoder.
    tgt_field = fields["tgt"]
    tgt_emb = build_embeddings(model_opt, tgt_field, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
            "preprocess with -share_vocab if you use share_embeddings"
        tgt_emb.word_lut.weight = src_emb.word_lut.weight

    decoder = build_decoder(model_opt, tgt_emb)

    # Build NMTModel(= encoder + decoder).
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")
    model = onmt.models.NMTModel(encoder, decoder)

    # Build Generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size,
                      len(fields["tgt"].base_field.vocab)),
            Cast(torch.float32),
            gen_func)
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        tgt_base_field = fields["tgt"].base_field
        vocab_size = len(tgt_base_field.vocab)
        pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
        generator = CopyGenerator(model_opt.dec_rnn_size, vocab_size, pad_idx)
        if model_opt.share_decoder_embeddings:
            generator.linear.weight = decoder.embeddings.word_lut.weight

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using custom layernorm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {
            fix_key(k): v for k, v in checkpoint['model'].items()
        }
        # end of patch for backward compatibility
        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec)

    model.generator = generator

    # if model_opt.teacher_forcing != "teacher":
    decoder.set_vocab_size(len(fields["tgt"].base_field.vocab))
    decoder.set_generator(model.generator)

    model.to(device)
    if model_opt.model_dtype == 'fp16' and model_opt.optim == 'fusedadam':
        model.half()

    return model
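# --- Hedged sketch (not part of the original module): the gpu/gpu_id device
# selection in the function above, pulled out as a standalone helper.
# `torch.device("cuda", gpu_id)` pins a specific GPU, while bare "cuda"
# defers to the current device; this helper is behaviorally equivalent to
# the original branching for all reachable inputs.
import torch

def pick_device(gpu, gpu_id=None):
    if gpu and gpu_id is not None:
        return torch.device("cuda", gpu_id)  # pin the requested GPU
    if gpu:
        return torch.device("cuda")          # current default GPU
    return torch.device("cpu")

assert pick_device(False) == torch.device("cpu")
assert str(pick_device(True, 1)) == "cuda:1"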
def build_base_model(model_opt, fields, gpu, checkpoint=None, gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
                    model from a stopped training.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """

    # Build embeddings.
    if model_opt.model_type == "text":
        src_field = fields["src"]
        src_emb = build_embeddings(model_opt, src_field)
    else:
        src_emb = None

    # Build encoder.
    encoder = build_encoder(model_opt, src_emb)

    # Build decoder.
    tgt_field = fields["tgt"]
    tgt_emb = build_embeddings(model_opt, tgt_field, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
            "preprocess with -share_vocab if you use share_embeddings"
        tgt_emb.word_lut.weight = src_emb.word_lut.weight

    decoder = build_decoder(model_opt, tgt_emb)

    # Build NMTModel(= encoder + decoder).
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")
    model = onmt.models.NMTModel(encoder, decoder)

    # Build Generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size,
                      len(fields["tgt"].base_field.vocab)),
            Cast(torch.float32),
            gen_func
        )
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        tgt_base_field = fields["tgt"].base_field
        vocab_size = len(tgt_base_field.vocab)
        pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
        generator = CopyGenerator(model_opt.dec_rnn_size, vocab_size, pad_idx)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using custom layernorm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {fix_key(k): v
                               for k, v in checkpoint['model'].items()}
        # end of patch for backward compatibility
        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec)

    model.generator = generator
    model.to(device)
    if model_opt.model_dtype == 'fp16':
        model.half()

    return model
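# --- Hedged sketch (not part of the original module): what `model.half()`
# above does. Casting flips every floating-point parameter to fp16; the toy
# module below is purely an illustrative assumption.
import torch
import torch.nn as nn

toy = nn.Linear(4, 4)
assert toy.weight.dtype == torch.float32     # default parameter dtype
toy.half()                                   # in-place cast of parameters
assert toy.weight.dtype == torch.float16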