def build_base_model(model_opt, fields, gpu, checkpoint=None, gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
            model from a stopped training.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """

    # for back compat when attention_dropout was not defined
    try:
        model_opt.attention_dropout
    except AttributeError:
        model_opt.attention_dropout = model_opt.dropout

    # Build embeddings.
    src_field = fields["src"]
    src_emb = build_embeddings(model_opt, src_field)
    tgt_field = fields["tgt"]
    tgt_emb = build_embeddings(model_opt, tgt_field)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
            "preprocess with -share_vocab if you use share_embeddings"
        tgt_emb.word_lut.weight = src_emb.word_lut.weight

    # Build encoders.
    encoder_x2y = build_encoder(model_opt, src_emb)
    encoder_y2x = build_encoder(model_opt, tgt_emb)

    # Build decoders.
    decoder_x2y = build_decoder(model_opt, tgt_emb)
    decoder_y2x = build_decoder(model_opt, src_emb)

    def share_attn_weight_and_bias(attn1, attn2,
                                   share_relative_pos_embeddings=False):
        attn2.linear_keys = attn1.linear_keys
        attn2.linear_values = attn1.linear_values
        attn2.linear_query = attn1.linear_query
        attn2.final_linear = attn1.final_linear
        if share_relative_pos_embeddings:
            assert model_opt.max_relative_positions > 0
            attn2.relative_positions_embeddings = \
                attn1.relative_positions_embeddings

    # logger.info('share encoder')
    encoder_y2x = encoder_x2y

    # logger.info('share cross_attns btw fwd & bwd decoders')
    for dec1, dec2 in zip(decoder_x2y.transformer_layers,
                          decoder_y2x.transformer_layers):
        share_attn_weight_and_bias(dec1.context_attn, dec2.context_attn)

    # logger.info('share self_attns btw fwd & bwd decoders')
    for dec1, dec2 in zip(decoder_x2y.transformer_layers,
                          decoder_y2x.transformer_layers):
        share_attn_weight_and_bias(dec1.self_attn, dec2.self_attn,
                                   model_opt.share_relative_pos_embeddings)

    # logger.info('share feed_forwards btw fwd & bwd decoders')
    for dec1, dec2 in zip(decoder_x2y.transformer_layers,
                          decoder_y2x.transformer_layers):
        dec2.feed_forward.w_1 = dec1.feed_forward.w_1
        dec2.feed_forward.w_2 = dec1.feed_forward.w_2

    # Build NMTModel(= encoder + decoder).
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")
    model = onmt.models.NMTModel(encoder_x2y, encoder_y2x,
                                 decoder_x2y, decoder_y2x)

    # Build the learned prior.
    prior = None
    if model_opt.learned_prior:
        assert model_opt.num_experts > 1
        prior = onmt.models.Classifier(
            model_opt.enc_rnn_size, model_opt.num_experts,
            dropout=(model_opt.dropout[0] if type(model_opt.dropout) is list
                     else model_opt.dropout))

    # Build Generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator_x2y = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size,
                      len(fields["tgt"].base_field.vocab)),
            Cast(torch.float32),
            gen_func
        )
        generator_y2x = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size,
                      len(fields["src"].base_field.vocab)),
            Cast(torch.float32),
            gen_func
        )
        if model_opt.share_decoder_embeddings:
            generator_x2y[0].weight = decoder_x2y.embeddings.word_lut.weight
            generator_y2x[0].weight = decoder_y2x.embeddings.word_lut.weight
    else:
        # Each direction needs its own CopyGenerator over its own vocab.
        tgt_base_field = fields["tgt"].base_field
        tgt_vocab_size = len(tgt_base_field.vocab)
        tgt_pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
        generator_x2y = CopyGenerator(model_opt.dec_rnn_size,
                                      tgt_vocab_size, tgt_pad_idx)
        src_base_field = fields["src"].base_field
        src_vocab_size = len(src_base_field.vocab)
        src_pad_idx = src_base_field.vocab.stoi[src_base_field.pad_token]
        generator_y2x = CopyGenerator(model_opt.dec_rnn_size,
                                      src_vocab_size, src_pad_idx)
        if model_opt.share_decoder_embeddings:
            generator_x2y.linear.weight = \
                decoder_x2y.embeddings.word_lut.weight
            generator_y2x.linear.weight = \
                decoder_y2x.embeddings.word_lut.weight

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using customed layernorm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {fix_key(k): v
                               for k, v in checkpoint['model'].items()}
        # end of patch for backward compatibility

        model.load_state_dict(checkpoint['model'], strict=False)
        generator_x2y.load_state_dict(checkpoint['generator_x2y'],
                                      strict=False)
        generator_y2x.load_state_dict(checkpoint['generator_y2x'],
                                      strict=False)
        if model_opt.learned_prior:
            prior.load_state_dict(checkpoint['prior'], strict=False)
    else:
        if model_opt.param_init != 0.0:
            def init_param(target_model):
                for p in target_model.parameters():
                    p.data.uniform_(-model_opt.param_init,
                                    model_opt.param_init)

            init_param(model)
            init_param(generator_x2y)
            init_param(generator_y2x)
            if model_opt.learned_prior:
                init_param(prior)
        if model_opt.param_init_glorot:
            def init_glorot(target_model):
                for p in target_model.parameters():
                    if p.dim() > 1:
                        xavier_uniform_(p)

            init_glorot(model)
            init_glorot(generator_x2y)
            init_glorot(generator_y2x)
            if model_opt.learned_prior:
                init_glorot(prior)

    model.generator_x2y = generator_x2y
    model.generator_y2x = generator_y2x
    model.prior = prior
    model.to(device)
    if model_opt.model_dtype == 'fp16' and model_opt.optim == 'fusedadam':
        model.half()

    return model
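
# A minimal standalone sketch (not part of the model above; module names are
# illustrative) of why the `a.weight = b.weight` assignments used for
# share_decoder_embeddings and share_attn_weight_and_bias actually tie
# parameters: both modules end up holding the same Parameter object, so one
# gradient update moves both.
import torch
import torch.nn as nn

fwd = nn.Linear(4, 4)
bwd = nn.Linear(4, 4)
bwd.weight = fwd.weight  # share storage, as in share_attn_weight_and_bias

assert bwd.weight.data_ptr() == fwd.weight.data_ptr()
fwd.weight.data.fill_(1.0)
assert torch.equal(bwd.weight, fwd.weight)  # the tied copy follows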
def build_base_model(model_opt, gpu, tokenizer, checkpoint=None, gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        gpu (bool): whether to use gpu.
        tokenizer: tokenizer used to build the embedding layer. If
            model_opt.share_tokenizer is true, this is an EasyTokenizer
            instance; otherwise it is a dict with 'src' and 'tgt' entries.
        checkpoint: the model generated by train phase, or a resumed snapshot
            model from a stopped training.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """

    # Build source embeddings.
    if model_opt.share_tokenizer:
        src_emb = build_embeddings(model_opt, tokenizer)
    else:
        src_emb = build_embeddings(model_opt, tokenizer['src'])

    # Build encoder.
    encoder = TransformerEncoder.from_opt(model_opt, src_emb)

    # Build target embeddings.
    if model_opt.share_tokenizer:
        tgt_emb = build_embeddings(model_opt, tokenizer, for_encoder=False)
    else:
        tgt_emb = build_embeddings(model_opt, tokenizer['tgt'],
                                   for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        if not model_opt.share_tokenizer:
            # src/tgt vocab should be the same if `-share_vocab` is specified.
            assert tokenizer['src'].vocab == tokenizer['tgt'].vocab, \
                "preprocess with -share_vocab if you use share_embeddings"
        tgt_emb.word_lut.weight = src_emb.word_lut.weight

    # Build decoder (from the target embeddings).
    decoder = TransformerDecoder.from_opt(model_opt, tgt_emb)

    # Build TransformerModel(= encoder + decoder).
    model = TransformerModel(encoder, decoder)

    # Build Generator.
    # Copy attention is a technique proposed in a separate paper.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator = nn.Sequential(
            nn.Linear(model_opt.dec_dim_size,
                      len(tokenizer.vocab) if model_opt.share_tokenizer
                      else len(tokenizer['tgt'].vocab)),
            Cast(torch.float32),
            gen_func
        )
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        # Assumes the tokenizer vocab exposes stoi and pad_token, like a
        # torchtext field.
        tgt_vocab = (tokenizer.vocab if model_opt.share_tokenizer
                     else tokenizer['tgt'].vocab)
        vocab_size = len(tgt_vocab)
        pad_idx = tgt_vocab.stoi[tgt_vocab.pad_token]
        generator = CopyGenerator(model_opt.dec_dim_size, vocab_size, pad_idx)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using customed layernorm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {fix_key(k): v
                               for k, v in checkpoint['model'].items()}
        # end of patch for backward compatibility

        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)
    else:
        # Decide how to initialize the parameters.
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        # Initialize with Xavier.
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
        # Load pretrained word-embedding parameters.
        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec)

    # Attach the generator.
    model.generator = generator

    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")
    model.to(device)
    if model_opt.model_dtype == 'fp16':
        model.half()

    return model
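
# A small sketch of what the fix_key backward-compat patch above does to old
# checkpoint keys: the legacy layer-norm parameter names a_2/b_2 are rewritten
# to weight/bias. The keys below are made up for illustration.
import re

def fix_key_demo(s):
    s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2', r'\1.layer_norm\2.bias', s)
    s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2', r'\1.layer_norm\2.weight', s)
    return s

print(fix_key_demo('decoder.transformer_layers.0.layer_norm_1.a_2'))
# -> decoder.transformer_layers.0.layer_norm_1.weight
print(fix_key_demo('encoder.layer_norm.b_2'))
# -> encoder.layer_norm.bias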
def build_base_model(model_opt, fields, gpu, checkpoint=None, gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
            model from a stopped training.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """

    # Build embeddings.
    if model_opt.model_type == "text":
        src_field = fields["src"]
        src_emb = build_embeddings(model_opt, src_field)
    else:
        src_emb = None

    # Build encoder.
    encoder = build_encoder(model_opt, src_emb)

    # Build decoder.
    tgt_field = fields["tgt"]
    tgt_emb = build_embeddings(model_opt, tgt_field, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
            "preprocess with -share_vocab if you use share_embeddings"
        tgt_emb.word_lut.weight = src_emb.word_lut.weight
        if model_opt.share_position_embeddings:
            tgt_emb.make_embedding.pe.pe.weight = \
                src_emb.make_embedding.pe.pe.weight

    decoder = build_decoder(model_opt, tgt_emb)

    # Build NMTModel(= encoder + decoder).
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")

    # Build separate LM if doing simple fusion
    if model_opt.simple_fusion:
        layers = 12
        size = 768
        heads = 12

        lm_decoder_opt = copy.deepcopy(model_opt)
        lm_decoder_opt.dec_layers = layers
        lm_decoder_opt.use_GPT_version_ctxattn = False
        lm_decoder_opt.use_GPT_version_psa = False
        lm_decoder_opt.use_GPT_version_unconditional = True
        lm_decoder_opt.tgt_word_vec_size = size
        lm_decoder_opt.rnn_size = size
        lm_decoder_opt.dec_rnn_size = size
        lm_decoder_opt.transformer_ff = size * 4
        lm_decoder_opt.dec_heads = heads
        lm_decoder_opt.position_encoding_learned_dec = True
        lm_decoder_opt.share_decoder_embeddings = True
        lm_decoder_opt.dropout = 0

        lm_decoder_emb = build_embeddings(lm_decoder_opt, tgt_field,
                                          for_encoder=False)
        logger.info(lm_decoder_emb)
        lm_decoder = build_decoder(lm_decoder_opt, lm_decoder_emb)
        load_decoder = lm_decoder

        model = onmt.models.SimpleFusionModel(encoder, decoder, lm_decoder)

        generator = SimpleFusionGenerator(model_opt.dec_rnn_size,
                                          lm_decoder_opt.dec_rnn_size,
                                          len(fields["tgt"].base_field.vocab))
        generator.lm_linear.weight = lm_decoder.embeddings.word_lut.weight

        if model_opt.share_decoder_embeddings:
            generator.decoder_linear.weight = \
                decoder.embeddings.word_lut.weight
        gen_linear = generator.lm_linear
    else:
        load_decoder = decoder
        if model_opt.unconditional:
            model = onmt.models.UncondModel(decoder)
        else:
            model = onmt.models.NMTModel(encoder, decoder)

        # Build Generator.
        if not model_opt.copy_attn:
            if model_opt.generator_function == "sparsemax":
                gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
            else:
                gen_func = nn.LogSoftmax(dim=-1)
            if model_opt.padded_vocab_fix_me_later:
                gen_func = nn.Sequential(PadGen(), gen_func)
            generator = nn.Sequential(
                nn.Linear(model_opt.dec_rnn_size,
                          len(fields["tgt"].base_field.vocab)),
                Cast(torch.float32),
                gen_func)
            if model_opt.share_decoder_embeddings:
                generator[0].weight = decoder.embeddings.word_lut.weight
            gen_linear = generator[0]
        else:
            tgt_base_field = fields["tgt"].base_field
            vocab_size = len(tgt_base_field.vocab)
            pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
            generator = CopyGenerator(model_opt.dec_rnn_size, vocab_size,
                                      pad_idx)
            if model_opt.share_decoder_embeddings:
                generator.linear.weight = decoder.embeddings.word_lut.weight
            gen_linear = generator.linear

    if model_opt.encdec_share_params:
        for name, p in decoder.named_parameters():
            if 'ctx' in name or 'context' in name:
                continue
            pointer = encoder
            attrs = name.split('.')
            for attr_name in attrs[:-1]:
                pointer = getattr(pointer, attr_name)
            # pointer now has the encoder version of the parameter parent
            setattr(pointer, attrs[-1], p)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # Normally, just load the model parameters from checkpoint
        if 'gpt2_params' not in checkpoint and 'enc_model' not in checkpoint:
            # This preserves backward-compat for models using customed
            # layernorm
            def fix_key(s):
                s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                           r'\1.layer_norm\2.bias', s)
                s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                           r'\1.layer_norm\2.weight', s)
                return s

            checkpoint['model'] = {fix_key(k): v
                                   for k, v in checkpoint['model'].items()}
            # end of patch for backward compatibility

            # Initialize rest of parameters normally
            if hasattr(model_opt, 'load_uncond_from') and \
                    model_opt.load_uncond_from:
                for p in decoder.parameters():
                    if p.dim() > 1:
                        xavier_uniform_(p)
                # Always initialize encoder parameters normally
                for p in encoder.parameters():
                    if p.dim() > 1:
                        xavier_uniform_(p)
                if model_opt.ctx_weight_param:
                    for name, p in decoder.named_parameters():
                        if 'ctx_weight' in name:
                            p.data.zero_()
                        if 'ctx_bias' in name:
                            p.data.fill_(-10)

            model.load_state_dict(checkpoint['model'], strict=False)
            generator.load_state_dict(checkpoint['generator'], strict=False)
        else:
            # load the gpt parameters
            if 'gpt2_params' in checkpoint:
                init_something = (model_opt.gpt2_init_embanddec
                                  or model_opt.simple_fusion
                                  or model_opt.gpt2_init_embandenc
                                  or model_opt.GPT_representation_mode != 'none')
                if init_something:
                    # Initialize all the weights first
                    if model_opt.gpt2_init_zero:
                        for p in decoder.parameters():
                            p.data.zero_()
                        if model_opt.simple_fusion:
                            generator.decoder_linear.weight.data.zero_()
                            generator.decoder_linear.bias.data.zero_()
                    else:
                        for p in decoder.parameters():
                            if p.dim() > 1:
                                xavier_uniform_(p)

                    # Always initialize encoder parameters normally
                    if encoder is not None:
                        for p in encoder.parameters():
                            if p.dim() > 1:
                                xavier_uniform_(p)
                    for p in generator.parameters():
                        if p.dim() > 1:
                            xavier_uniform_(p)
                    if model_opt.zero_bias_init:
                        gen_linear.bias.data.zero_()

                    if model_opt.ctx_weight_param:
                        for name, p in decoder.named_parameters():
                            if 'ctx_weight' in name:
                                p.data.zero_()
                            if 'ctx_bias' in name:
                                p.data.fill_(-10)
                        gen_linear.bias.data.zero_()

                load_models = []
                if model_opt.GPT_representation_mode != 'none':
                    load_embs = []
                    if model_opt.GPT_representation_loc in ['both', 'src']:
                        load_models.append(src_emb.gpt_model)
                        load_embs.append(src_emb)
                    if model_opt.GPT_representation_loc in ['both', 'tgt']:
                        load_models.append(tgt_emb.gpt_model)
                        load_embs.append(tgt_emb)
                else:
                    if model_opt.gpt2_init_embanddec or model_opt.simple_fusion:
                        load_models = [load_decoder]
                    elif model_opt.gpt2_init_embandenc:
                        load_models = [encoder]

                it_list = list(checkpoint['gpt2_params'])
                for lm_idx, load_model in enumerate(load_models):
                    # print(lm_idx, load_model)
                    for name, array in it_list:
                        name = name[12:]  # skip "transformer."
                        name = name.split('.')
                        assigned = False
                        if name[0] == 'wpe':
                            if model_opt.GPT_representation_mode != 'none':
                                pointer = \
                                    load_embs[lm_idx].make_embedding.pe.pe.weight
                            else:
                                pointer = \
                                    load_model.embeddings.make_embedding.pe.pe.weight
                        elif name[0] == 'wte':
                            if model_opt.GPT_representation_mode != 'none':
                                pointer = [
                                    load_embs[lm_idx].make_embedding
                                    .emb_luts[0].weight,
                                    gen_linear.weight
                                ]
                            else:
                                pointer = [
                                    load_model.embeddings.make_embedding
                                    .emb_luts[0].weight
                                ]
                                if not model_opt.nopretrain_decemb:
                                    pointer.append(gen_linear.weight)
                                if model_opt.simple_fusion and \
                                        model_opt.sf_pretrain_dec_emb:
                                    pointer.append(
                                        decoder.embeddings.make_embedding
                                        .emb_luts[0].weight)
                        elif name[0] == 'ln_f':
                            if name[1] == 'weight':
                                pointer = load_model.layer_norm.weight
                            elif name[1] == 'bias':
                                pointer = load_model.layer_norm.bias
                            else:
                                raise ValueError('I am missing something here!')
                        elif name[0] == 'h':
                            layer_num = name[1]
                            pointer = getattr(load_model.transformer_layers,
                                              layer_num)
                            if name[2] == 'attn':
                                assigned = True
                                pointer = pointer.self_attn
                                full_data = torch.from_numpy(array)
                                if name[3] == 'c_attn':
                                    end_size = full_data.shape[-1] // 3
                                    assert full_data.shape[-1] % 3 == 0
                                    if name[4] == 'bias':
                                        if init_something:
                                            pointer.linear_query.bias.data = \
                                                full_data[:end_size]
                                            pointer.linear_keys.bias.data = \
                                                full_data[end_size:end_size * 2]
                                            pointer.linear_values.bias.data = \
                                                full_data[end_size * 2:]
                                        if model_opt.gpt2_params_std > 0:
                                            pointer.linear_query.bias.orig = \
                                                full_data[:end_size].clone()
                                            pointer.linear_keys.bias.orig = \
                                                full_data[end_size:end_size * 2].clone()
                                            pointer.linear_values.bias.orig = \
                                                full_data[end_size * 2:].clone()
                                    elif name[4] == 'weight':
                                        if init_something:
                                            pointer.linear_query.weight.data = \
                                                full_data[:, :end_size].t().contiguous()
                                            pointer.linear_keys.weight.data = \
                                                full_data[:, end_size:end_size * 2].t().contiguous()
                                            pointer.linear_values.weight.data = \
                                                full_data[:, end_size * 2:].t().contiguous()
                                        if model_opt.gpt2_params_std > 0:
                                            pointer.linear_query.weight.orig = \
                                                full_data[:, :end_size].t().contiguous().clone()
                                            pointer.linear_keys.weight.orig = \
                                                full_data[:, end_size:end_size * 2].t().contiguous().clone()
                                            pointer.linear_values.weight.orig = \
                                                full_data[:, end_size * 2:].t().contiguous().clone()
                                    else:
                                        raise ValueError(
                                            'I am missing something here!')
                                elif name[3] == 'c_proj':
                                    if name[4] == 'bias':
                                        if init_something:
                                            pointer.final_linear.bias.data = \
                                                full_data
                                        if model_opt.gpt2_params_std > 0:
                                            pointer.final_linear.bias.orig = \
                                                full_data.clone()
                                    elif name[4] == 'weight':
                                        if init_something:
                                            pointer.final_linear.weight.data = \
                                                full_data.t().contiguous()
                                        if model_opt.gpt2_params_std > 0:
                                            pointer.final_linear.weight.orig = \
                                                full_data.t().contiguous().clone()
                                    else:
                                        raise ValueError(
                                            'I am missing something here!')
                            elif name[2] == 'ln_1' or name[2] == 'ln_2':
                                num = name[2][3]
                                pointer = getattr(pointer, 'layer_norm_' + num)
                                # the parameter name ('weight'/'bias') is at
                                # name[3]; comparing name[2] here could never
                                # match and always raised
                                if name[3] == 'bias':
                                    pointer = pointer.bias
                                elif name[3] == 'weight':
                                    pointer = pointer.weight
                                else:
                                    raise ValueError(
                                        'I am missing something here!')
                            elif name[2] == 'mlp':
                                pointer = pointer.feed_forward
                                # as with ln_1/ln_2, the sub-module name is at
                                # name[3] and the parameter name at name[4]
                                pointer = getattr(pointer, name[3])
                                if name[4] == 'bias':
                                    pointer = pointer.bias
                                elif name[4] == 'weight':
                                    pointer = pointer.weight
                                else:
                                    raise ValueError(
                                        'I am missing something here!')
                            else:
                                raise ValueError('I am missing something here!')
                        else:
                            raise ValueError('I am missing something here!')

                        if not assigned:
                            # if name[0] == 'wte':
                            #     print(array.shape)
                            #     continue
                            if name[-1] == 'weight':
                                array = array.T
                            if not isinstance(pointer, list):
                                pointer = [pointer]
                            for pointer_i in pointer:
                                target_size = int(math.ceil(
                                    array.shape[0] / 8)) * 8
                                padded_vocab = (name[0] == 'wte' and
                                                pointer_i.shape[0] == target_size)
                                padded_vocab = (padded_vocab and
                                                pointer_i.shape[1:] == array.shape[1:])
                                try:
                                    assert pointer_i.shape == array.shape \
                                        or padded_vocab
                                except AssertionError as e:
                                    e.args += (pointer_i.shape, array.shape)
                                    raise
                                if init_something:
                                    print("Initialize PyTorch weight {}".format(
                                        name))
                                    if padded_vocab:
                                        pointer_i.data[:array.shape[0]] = \
                                            torch.from_numpy(array)
                                    else:
                                        pointer_i.data = torch.from_numpy(array)
                                if model_opt.gpt2_params_std > 0:
                                    if padded_vocab:
                                        raise NotImplementedError
                                    else:
                                        pointer_i.orig = \
                                            torch.from_numpy(array).clone()
            if 'enc_model' in checkpoint:
                load_dict = {k[8:]: v for k, v in checkpoint['enc_model']
                             if 'encoder' in k}
                encoder.load_state_dict(load_dict, strict=True)
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if not model_opt.unconditional and hasattr(model.encoder, 'embeddings') \
                and model.encoder.embeddings is not None:
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec)

    # remove requires_grad from params that are not trained:
    if model_opt.notrain_emb or model_opt.notrain_embanddec:
        if model_opt.position_encoding_learned_enc and \
                model_opt.share_position_embeddings:
            model.encoder.embeddings.make_embedding.pe.pe.weight.requires_grad = False
        if model_opt.share_embeddings:
            model.encoder.embeddings.make_embedding.emb_luts[0].weight.requires_grad = False
        model.decoder.embeddings.make_embedding.pe.pe.weight.requires_grad = False
        model.decoder.embeddings.make_embedding.emb_luts[0].weight.requires_grad = False
        generator[0].weight.requires_grad = False
        if model_opt.notrain_genbias:
            generator[0].bias.requires_grad = False

    if model_opt.notrain_embanddec:
        for name, p in load_decoder.layer_norm.named_parameters():
            p.requires_grad = False
        for name, p in load_decoder.transformer_layers.named_parameters():
            if 'context' not in name and 'ctx' not in name:
                # Takes care of normal and psa versions
                p.requires_grad = False

    if model_opt.onlytrainln:
        for name, p in model.decoder.named_parameters():
            if 'layer_norm' not in name:
                p.requires_grad = False
        for p in generator.parameters():
            p.requires_grad = False

    if model_opt.onlytrainoutp:
        if model_opt.share_decoder_embeddings:
            raise ValueError
        for p in model.decoder.parameters():
            p.requires_grad = False

    if model_opt.simple_fusion:
        for p in lm_decoder.parameters():
            p.requires_grad = False
        for p in generator.lm_linear.parameters():
            p.requires_grad = False

    model.generator = generator
    model.to(device)
    if model_opt.model_dtype == 'fp16':
        model.half()

    for p in model.parameters():
        if hasattr(p, 'orig'):
            p.orig = p.orig.to(device)
            if model_opt.model_dtype == 'fp16':
                p.orig = p.orig.half()

    return model
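
# A standalone sketch (illustrative sizes, not the actual GPT-2 dimensions) of
# the fused-projection split performed by the c_attn branch above: GPT-2
# stores query/key/value as one matrix of width 3*d, and its Conv1D-style
# (in, out) weights must be transposed to fit nn.Linear's (out, in) layout.
import torch

d = 4
c_attn_weight = torch.randn(d, 3 * d)  # GPT-2 layout: (in, 3 * out)
end_size = c_attn_weight.shape[-1] // 3
assert c_attn_weight.shape[-1] % 3 == 0

q_w = c_attn_weight[:, :end_size].t().contiguous()              # (out, in)
k_w = c_attn_weight[:, end_size:end_size * 2].t().contiguous()
v_w = c_attn_weight[:, end_size * 2:].t().contiguous()
assert q_w.shape == k_w.shape == v_w.shape == (d, d)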
def build_base_model(cls, src_types: List[str], model_opt, fields, gpu,
                     checkpoint=None, gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
            model from a stopped training.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """

    # for back compat when attention_dropout was not defined
    try:
        model_opt.attention_dropout
    except AttributeError:
        model_opt.attention_dropout = model_opt.dropout

    # Build embeddings.
    src_embs: Dict[str, Optional[nn.Module]] = dict()
    # PN: we always have text srcs for now
    for src_type in src_types:
        src_field = fields[f"src.{src_type}"]
        src_embs[src_type] = cls.build_embeddings(model_opt, src_field)
    # end for

    # Build encoders.
    encoders: List[EncoderBase] = list()
    for src_type in src_types:
        encoders.append(cls.build_encoder(model_opt, src_embs[src_type]))
    # end for

    # Build decoder.
    tgt_field = fields["tgt"]
    tgt_emb = cls.build_embeddings(model_opt, tgt_field, for_encoder=False)

    # No shared embeddings in this model.
    assert not model_opt.share_embeddings, "share embeddings not supported"
    # # Share the embedding matrix - preprocess with share_vocab required.
    # if model_opt.share_embeddings:
    #     # src/tgt vocab should be the same if `-share_vocab` is specified.
    #     assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
    #         "preprocess with -share_vocab if you use share_embeddings"
    #     tgt_emb.word_lut.weight = src_emb.word_lut.weight

    decoder = cls.build_decoder(model_opt, tgt_emb)

    # Build MultiSourceNMTModel(= encoders + decoder).
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")
    # end if
    model = MultiSourceNMTModel(encoders, decoder)

    # Build Generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size,
                      len(fields["tgt"].base_field.vocab)),
            Cast(torch.float32),
            gen_func
        )
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        tgt_base_field = fields["tgt"].base_field
        vocab_size = len(tgt_base_field.vocab)
        pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
        generator = MultiSourceCopyGenerator(model_opt.dec_rnn_size,
                                             vocab_size, pad_idx)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using customed layernorm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {fix_key(k): v
                               for k, v in checkpoint['model'].items()}
        # end of patch for backward compatibility

        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        for encoder in model.encoders:
            if hasattr(encoder, 'embeddings'):
                encoder.embeddings.load_pretrained_vectors(
                    model_opt.pre_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec)

    model.generator = generator
    model.to(device)

    return model
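
# A small sketch of the two initialization paths used above (and in the other
# variants in this file): -param_init draws every parameter from a uniform
# range, while -param_init_glorot applies Xavier initialization to matrices
# only (p.dim() > 1), leaving biases untouched. The module is illustrative.
import torch.nn as nn
from torch.nn.init import xavier_uniform_

tiny = nn.Linear(8, 8)
param_init = 0.1

for p in tiny.parameters():
    p.data.uniform_(-param_init, param_init)  # uniform path

for p in tiny.parameters():
    if p.dim() > 1:                           # glorot path: weights only
        xavier_uniform_(p)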
def build_base_model(model_opt, fields, gpu, args, checkpoint=None,
                     gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
            model from a stopped training.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """

    # for back compat when attention_dropout was not defined
    try:
        model_opt.attention_dropout
    except AttributeError:
        model_opt.attention_dropout = model_opt.dropout

    # Build embeddings.
    if model_opt.model_type == "text" or model_opt.model_type == "vec":
        src_field = fields["src"]
        src_emb = build_embeddings(model_opt, src_field)
    else:
        src_emb = None

    # Build encoder.
    encoder = TransformerEncoder.from_opt(model_opt, src_emb)

    # Build decoder.
    tgt_field = fields["tgt"]
    tgt_emb = build_embeddings(model_opt, tgt_field, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
            "preprocess with -share_vocab if you use share_embeddings"
        tgt_emb.word_lut.weight = src_emb.word_lut.weight

    decoder = TransformerDecoder.from_opt(model_opt, tgt_emb, args)

    # Build NMTModel(= encoder + decoder).
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")
    model = onmt.models.NMTModel(encoder, decoder)

    # Build Generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size,
                      len(fields["tgt"].base_field.vocab)),
            Cast(torch.float32),
            gen_func)
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        tgt_base_field = fields["tgt"].base_field
        vocab_size = len(tgt_base_field.vocab)
        pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
        generator = CopyGenerator(model_opt.dec_rnn_size, vocab_size, pad_idx)
        if model_opt.share_decoder_embeddings:
            generator.linear.weight = decoder.embeddings.word_lut.weight

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using customed layernorm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {fix_key(k): v
                               for k, v in checkpoint['model'].items()}
        # end of patch for backward compatibility

        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)

        if args.model_type == 'decoder_ext':
            w = []
            for i in range(model_opt.dec_layers):
                layer = decoder.transformer_layers[i]
                w.append([
                    layer.layer_norm_1.weight.data,
                    layer.layer_norm_1.bias.data,
                    layer.self_attn.linear_query.weight.data.transpose(-1, -2).contiguous(),
                    layer.self_attn.linear_keys.weight.data.transpose(-1, -2).contiguous(),
                    layer.self_attn.linear_values.weight.data.transpose(-1, -2).contiguous(),
                    layer.self_attn.linear_query.bias.data,
                    layer.self_attn.linear_keys.bias.data,
                    layer.self_attn.linear_values.bias.data,
                    layer.self_attn.final_linear.weight.data.transpose(-1, -2).contiguous(),
                    layer.self_attn.final_linear.bias.data,
                    layer.layer_norm_2.weight.data,
                    layer.layer_norm_2.bias.data,
                    layer.context_attn.linear_query.weight.data.transpose(-1, -2).contiguous(),
                    layer.context_attn.linear_keys.weight.data.transpose(-1, -2).contiguous(),
                    layer.context_attn.linear_values.weight.data.transpose(-1, -2).contiguous(),
                    layer.context_attn.linear_query.bias.data,
                    layer.context_attn.linear_keys.bias.data,
                    layer.context_attn.linear_values.bias.data,
                    layer.context_attn.final_linear.weight.data.transpose(-1, -2).contiguous(),
                    layer.context_attn.final_linear.bias.data,
                    layer.feed_forward.layer_norm.weight.data,
                    layer.feed_forward.layer_norm.bias.data,
                    layer.feed_forward.w_1.weight.data.transpose(-1, -2).contiguous(),
                    layer.feed_forward.w_1.bias.data,
                    layer.feed_forward.w_2.weight.data.transpose(-1, -2).contiguous(),
                    layer.feed_forward.w_2.bias.data
                ])
                for j in range(len(w[-1])):
                    w[-1][j] = w[-1][j].cuda()
                if args.data_type == 'fp16':
                    for j in range(len(w[-1])):
                        w[-1][j] = w[-1][j].half()

            decoder_layers = nn.ModuleList([
                FTDecoderLayer(model_opt.heads,
                               model_opt.dec_rnn_size // model_opt.heads,
                               w[i], args)
                for i in range(model_opt.dec_layers)
            ])
            model.decoder.transformer_layers = decoder_layers
        elif args.model_type == 'decoding_ext':
            vocab_size = len(fields["tgt"].base_field.vocab)
            bos_idx = fields["tgt"].base_field.vocab.stoi[
                fields["tgt"].base_field.init_token]
            eos_idx = fields["tgt"].base_field.vocab.stoi[
                fields["tgt"].base_field.eos_token]
            decoding_weights = DecodingWeights(model_opt.dec_layers,
                                               model_opt.dec_rnn_size,
                                               vocab_size, checkpoint)
            decoding_weights.to_cuda()
            if args.data_type == 'fp16':
                decoding_weights.to_half()
            model.decoder = CustomDecoding(
                model_opt.dec_layers, model_opt.heads,
                model_opt.dec_rnn_size // model_opt.heads,
                vocab_size, bos_idx, eos_idx, decoding_weights, args=args)
        elif args.model_type in ('torch_decoding',
                                 'torch_decoding_with_decoder_ext'):
            vocab_size = len(fields["tgt"].base_field.vocab)
            bos_idx = fields["tgt"].base_field.vocab.stoi[
                fields["tgt"].base_field.init_token]
            eos_idx = fields["tgt"].base_field.vocab.stoi[
                fields["tgt"].base_field.eos_token]
            decoding_weights = DecodingWeights(model_opt.dec_layers,
                                               model_opt.dec_rnn_size,
                                               vocab_size, checkpoint)
            decoding_weights.to_cuda()
            if args.data_type == 'fp16':
                decoding_weights.to_half()
            model.decoder = TorchDecoding(
                model_opt.dec_layers, model_opt.heads,
                model_opt.dec_rnn_size // model_opt.heads,
                vocab_size, bos_idx, eos_idx, decoding_weights, args=args)
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec)

    model.generator = generator
    model.to(device)
    if model_opt.model_dtype == 'fp16' and model_opt.optim == 'fusedadam':
        model.half()

    return model
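
# A quick sketch of why the weight exports above call
# .transpose(-1, -2).contiguous(): nn.Linear stores weights as
# (out_features, in_features), the external decoder layers appear to consume
# the transposed layout, and transpose() alone only changes strides, not the
# underlying memory, so .contiguous() is needed to materialize a real buffer.
import torch

w = torch.randn(6, 3)          # nn.Linear layout: (out, in)
wt = w.transpose(-1, -2)       # shape (3, 6), but same underlying storage
assert not wt.is_contiguous()  # a strided view, not a new buffer
wt = wt.contiguous()           # row-major copy the kernel can consume
assert wt.is_contiguous()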
def build_base_model(model_opt, fields, gpu, checkpoint=None, gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
            model from a stopped training.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """

    # for back compat when attention_dropout was not defined
    try:
        model_opt.attention_dropout
    except AttributeError:
        model_opt.attention_dropout = model_opt.dropout

    # Build Model
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")
    model = build_task_specific_model(model_opt, fields)

    # Build Generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size,
                      len(fields["tgt"].base_field.vocab)),
            Cast(torch.float32),
            gen_func
        )
        if model_opt.share_decoder_embeddings:
            generator[0].weight = model.decoder.embeddings.word_lut.weight
    else:
        tgt_base_field = fields["tgt"].base_field
        vocab_size = len(tgt_base_field.vocab)
        pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
        generator = CopyGenerator(model_opt.dec_rnn_size, vocab_size, pad_idx)
        if model_opt.share_decoder_embeddings:
            generator.linear.weight = model.decoder.embeddings.word_lut.weight

    # Load the model states from checkpoint or initialize them.
    if checkpoint is None or model_opt.update_vocab:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if hasattr(model, "encoder") and hasattr(model.encoder, "embeddings"):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec)

    if checkpoint is not None:
        # This preserves backward-compat for models using customed layernorm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {fix_key(k): v
                               for k, v in checkpoint['model'].items()}
        # end of patch for backward compatibility

        if model_opt.update_vocab:
            # Update model embeddings with those from the checkpoint
            # after initialization
            use_embeddings_from_checkpoint(fields, model, generator,
                                           checkpoint)
            # Remove old vocabulary associated embeddings
            enc_emb_name = "encoder.embeddings.make_embedding.emb_luts.0.weight"
            dec_emb_name = "decoder.embeddings.make_embedding.emb_luts.0.weight"
            del checkpoint["model"][enc_emb_name]
            del checkpoint["model"][dec_emb_name]
            del checkpoint["generator"]["0.weight"]
            del checkpoint["generator"]["0.bias"]

        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)

    model.generator = generator
    model.to(device)
    if model_opt.model_dtype == 'fp16' and model_opt.optim == 'fusedadam':
        model.half()

    return model
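
# An illustrative sketch of the -update_vocab idea handled by
# use_embeddings_from_checkpoint above. The helper below is hypothetical, not
# the repo's implementation: rows of the old embedding matrix are copied over
# for tokens that survive into the new vocabulary, while brand-new tokens keep
# their freshly initialized rows.
import torch

def copy_overlapping_rows(new_emb, old_emb, new_stoi, old_stoi):
    # new_emb/old_emb: (vocab, dim) tensors; *_stoi: token -> index dicts
    for tok, new_idx in new_stoi.items():
        if tok in old_stoi:
            new_emb[new_idx] = old_emb[old_stoi[tok]]

old = torch.randn(3, 4)
new = torch.zeros(4, 4)
copy_overlapping_rows(new, old, {'a': 0, 'b': 1, 'z': 3}, {'a': 0, 'b': 2})
assert torch.equal(new[1], old[2])  # the 'b' row carried over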
def __init__(self, generator_function, dec_rnn_size, base_field):
    super(BertGenerator, self).__init__()
    self.generator = nn.Sequential(
        nn.Linear(dec_rnn_size, len(base_field.vocab)),
        Cast(torch.float32),
        generator_function)
def __init__(self, d_in, d_out, gen_func):
    super().__init__()
    self.proj = nn.Linear(d_in, d_out)
    self.cast = Cast(torch.float32)
    self.gen_func = gen_func
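
# A minimal sketch (assuming a forward of proj -> cast -> gen_func, mirroring
# the nn.Sequential generators elsewhere in this file) of how such a head maps
# decoder states to normalized log-probabilities over the vocabulary. Sizes
# are illustrative.
import torch
import torch.nn as nn

dec_out = torch.randn(5, 512)      # 5 decoder states, hidden size 512
head = nn.Sequential(
    nn.Linear(512, 10000),         # project to vocab logits
    nn.LogSoftmax(dim=-1))         # normalize to log-probs
log_probs = head(dec_out.float())
assert log_probs.shape == (5, 10000)
assert torch.allclose(log_probs.exp().sum(-1), torch.ones(5))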
def __init__(self, rnn_type, bidirectional_encoder, num_layers,
             hidden_size, attn_type="general", attn_func="softmax",
             coverage_attn=False, context_gate=None, copy_attn=False,
             dropout=0.0, embeddings=None, text_field=None,
             reuse_copy_attn=False, copy_attn_type="general"):
    super(RNNDecoderBase, self).__init__(
        attentional=attn_type != "none" and attn_type is not None)

    self.bidirectional_encoder = bidirectional_encoder
    self.num_layers = num_layers
    self.hidden_size = hidden_size
    self.embeddings = embeddings
    self.text_field = text_field
    self.dropout = nn.Dropout(dropout)

    # Decoder state
    self.state = {}

    # Build the RNN.
    self.rnn = self._build_rnn(rnn_type,
                               input_size=self._input_size,
                               hidden_size=hidden_size,
                               num_layers=num_layers,
                               dropout=dropout)

    vocab_size = len(self.text_field.base_field.vocab)
    self.generator = nn.Sequential(
        nn.Linear(self.hidden_size, vocab_size),
        Cast(torch.float32),
        nn.Softmax(dim=-1))

    # Set up the context gate.
    self.context_gate = None
    if context_gate is not None:
        self.context_gate = context_gate_factory(
            context_gate, self._input_size,
            hidden_size, hidden_size, hidden_size)

    # Set up the standard attention.
    self._coverage = coverage_attn
    if not self.attentional:
        if self._coverage:
            raise ValueError("Cannot use coverage term with no attention.")
        self.attn = None
    else:
        self.attn = TopicAttention(
            hidden_size, coverage=coverage_attn,
            attn_type=attn_type, attn_func=attn_func)

    if copy_attn and not reuse_copy_attn:
        if copy_attn_type == "none" or copy_attn_type is None:
            raise ValueError("Cannot use copy_attn with copy_attn_type none")
        self.copy_attn = GlobalAttention(
            hidden_size, attn_type=copy_attn_type, attn_func=attn_func)
    else:
        self.copy_attn = None

    self._reuse_copy_attn = reuse_copy_attn and copy_attn
    if self._reuse_copy_attn and not self.attentional:
        raise ValueError("Cannot reuse copy attention with no attention.")
def build_base_model(model_opt, fields, gpu, checkpoint=None, gpu_id=None):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        fields: `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
                    model from a stopped training.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """
    assert model_opt.model_type in ["text", "img", "audio"], \
        "Unsupported model type %s" % model_opt.model_type

    # for backward compatibility
    if model_opt.rnn_size != -1:
        model_opt.enc_rnn_size = model_opt.rnn_size
        model_opt.dec_rnn_size = model_opt.rnn_size

    # Build embeddings.
    if model_opt.model_type == "text":
        src_fields = [f for n, f in fields['src']]
        assert len(src_fields) == 1
        src_field = src_fields[0]
        src_emb = build_embeddings(model_opt, src_field)
    else:
        src_emb = None

    # Build encoder.
    encoder = build_encoder(model_opt, src_emb)

    # Build decoder.
    tgt_fields = [f for n, f in fields['tgt']]
    assert len(tgt_fields) == 1
    tgt_field = tgt_fields[0]
    tgt_emb = build_embeddings(model_opt, tgt_field, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
            "preprocess with -share_vocab if you use share_embeddings"
        tgt_emb.word_lut.weight = src_emb.word_lut.weight

    decoder = build_decoder(model_opt, tgt_emb)

    # Build NMTModel(= encoder + decoder).
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")
    model = onmt.models.NMTModel(encoder, decoder)

    # Build Generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size,
                      len(fields["tgt"][0][1].base_field.vocab)),
            Cast(torch.float32),
            gen_func)
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        assert len(fields["tgt"]) == 1
        tgt_base_field = fields["tgt"][0][1].base_field
        vocab_size = len(tgt_base_field.vocab)
        pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
        generator = CopyGenerator(model_opt.dec_rnn_size, vocab_size, pad_idx)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using customed layernorm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {fix_key(k): v
                               for k, v in checkpoint['model'].items()}
        # end of patch for backward compatibility

        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec)

    model.generator = generator
    model.to(device)
    if model_opt.model_dtype == 'fp16':
        logger.warning('FP16 is experimental, the generated checkpoints may '
                       'be incompatible with a future version')
        model.half()

    return model
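
# A compact sketch of the device-selection branches shared by every variant in
# this file. The explicit `gpu_id is not None` test must come before any
# truthiness check, because `not gpu_id` is also True for gpu_id == 0.
import torch

def select_device(gpu, gpu_id=None):
    if gpu and gpu_id is not None:
        return torch.device("cuda", gpu_id)
    if gpu:
        return torch.device("cuda")
    return torch.device("cpu")

assert select_device(False) == torch.device("cpu")
assert str(select_device(True, 0)) == "cuda:0"  # 0 is a valid GPU id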
def build_base_model(model_opt, fields, gpu, checkpoint=None, gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
            model from a stopped training.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """

    # Build embeddings.
    if model_opt.model_type == "text":
        src_field = fields["src"]
        src_emb = build_embeddings(model_opt, src_field)
    else:
        src_emb = None

    # Build encoder.
    encoder = build_encoder(model_opt, src_emb)

    # Build decoder.
    tgt_field = fields["tgt"]
    tgt_emb = build_embeddings(model_opt, tgt_field, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings and model_opt.encoder_type != 'bert':
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
            "preprocess with -share_vocab if you use share_embeddings"
        tgt_emb.word_lut.weight = src_emb.word_lut.weight

    decoder = build_decoder(model_opt, tgt_emb)

    # Build NMTModel(= encoder + decoder).
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")
    model = onmt.models.NMTModel(encoder, decoder)

    # Build Generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size,
                      len(fields["tgt"].base_field.vocab)),
            Cast(torch.float32),
            gen_func)
        if model_opt.share_decoder_embeddings:
            if not model_opt.copy_attn:
                generator[0].weight = decoder.embeddings.word_lut.weight
            else:
                generator.linear.weight = decoder.embeddings.word_lut.weight
    else:
        tgt_base_field = fields["tgt"].base_field
        vocab_size = len(tgt_base_field.vocab)
        pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
        generator = CopyGenerator(model_opt.dec_rnn_size, vocab_size, pad_idx)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using customed layernorm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {fix_key(k): v
                               for k, v in checkpoint['model'].items()}
        # end of patch for backward compatibility

        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)
    elif model_opt.encoder_type != 'bert' or model_opt.decoder_type != 'bert':
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if (hasattr(model.encoder, 'embeddings')
                and not model_opt.encoder_type == 'bert'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc)
        if (hasattr(model.decoder, 'embeddings')
                and not model_opt.decoder_type == 'bert'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec)

    if model_opt.encoder_type == 'bert' or model_opt.decoder_type == 'bert':
        if model_opt.bert_type != 'none':
            model_opt.enc_bert_type = model_opt.bert_type
            model_opt.dec_bert_type = model_opt.bert_type
        if model_opt.enc_bert_type != 'none' and checkpoint is None:
            model.encoder.initialize_bert(model_opt.enc_bert_type)
        if model_opt.dec_bert_type != 'none' and checkpoint is None:
            model.decoder.initialize_bert(model_opt.dec_bert_type)

        # Tie word embedding layer of encoder BERT and decoder
        if model_opt.encoder_type == 'bert' and model_opt.share_embeddings:
            decoder.embeddings.word_lut.weight = \
                encoder.embeddings.word_lut.weight
            # Tie decoder word embedding layer with generator weights
            if model_opt.share_decoder_embeddings:
                if not model_opt.copy_attn:
                    generator[0].weight = decoder.embeddings.word_lut.weight
                else:
                    generator.linear.weight = \
                        decoder.embeddings.word_lut.weight

        if model_opt.encoder_type == 'bert' and \
                model_opt.decoder_type == 'bert':
            # Tie word, position and token_type embedding
            # layers of encoder and decoder BERT
            if model_opt.share_embeddings:
                decoder.embeddings.position_embeddings.weight = \
                    encoder.embeddings.position_embeddings.weight
                decoder.embeddings.token_type_embeddings.weight = \
                    encoder.embeddings.token_type_embeddings.weight

            # Tie self-attention between encoder and decoder
            if model_opt.share_self_attn:
                for encoder_layer, decoder_layer in zip(
                        encoder.encoder.layer, decoder.transformer_layers):
                    # QUERY
                    clone_or_share_layer(
                        decoder_layer.self_attn.linear_query,
                        encoder_layer.attention.self.query,
                        share=True)
                    # KEY
                    clone_or_share_layer(
                        decoder_layer.self_attn.linear_keys,
                        encoder_layer.attention.self.key,
                        share=True)
                    # VALUE
                    clone_or_share_layer(
                        decoder_layer.self_attn.linear_values,
                        encoder_layer.attention.self.value,
                        share=True)
                    # MULTIHEAD ATTN FINAL LINEAR LAYER
                    clone_or_share_layer(
                        decoder_layer.self_attn.final_linear,
                        encoder_layer.attention.output.dense,
                        share=True)

        # Tie context-attention with self-attention
        if model_opt.tie_context_attn:
            for decoder_layer in decoder.transformer_layers:
                # QUERY
                clone_or_share_layer(
                    decoder_layer.context_attn.linear_query,
                    decoder_layer.self_attn.linear_query,
                    share=True)
                # KEY
                clone_or_share_layer(
                    decoder_layer.context_attn.linear_keys,
                    decoder_layer.self_attn.linear_keys,
                    share=True)
                # VALUE
                clone_or_share_layer(
                    decoder_layer.context_attn.linear_values,
                    decoder_layer.self_attn.linear_values,
                    share=True)
                # MULTIHEAD ATTN FINAL LINEAR LAYER
                clone_or_share_layer(
                    decoder_layer.context_attn.final_linear,
                    decoder_layer.self_attn.final_linear,
                    share=True)

        # Tie positionwise feedforward between encoder and decoder
        if model_opt.share_feed_forward:
            for encoder_layer, decoder_layer in zip(
                    encoder.encoder.layer, decoder.transformer_layers):
                # TRANSFORMER FF
                clone_or_share_layer(
                    decoder_layer.intermediate.dense,
                    encoder_layer.intermediate.dense,
                    share=True)
                clone_or_share_layer(
                    decoder_layer.output.dense,
                    encoder_layer.output.dense,
                    share=True)

    model.generator = generator
    model.to(device)

    if model_opt.model_dtype == 'fp16':
        model.half()

    return model
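
# clone_or_share_layer is used above but defined elsewhere in the repo. A
# plausible minimal version (an assumption, not the repo's code) either ties
# the two linear layers' Parameter objects or copies values across.
import torch.nn as nn

def clone_or_share_layer_sketch(layer1, layer2, share=False):
    if share:
        # tie: both modules reference the same Parameter objects
        layer1.weight, layer1.bias = layer2.weight, layer2.bias
    else:
        # clone: copy values, keep independent parameters
        layer1.weight.data.copy_(layer2.weight.data)
        layer1.bias.data.copy_(layer2.bias.data)

a, b = nn.Linear(4, 4), nn.Linear(4, 4)
clone_or_share_layer_sketch(a, b, share=True)
assert a.weight is b.weight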
def build_base_model(model_opt, fields, gpu, length_model, length_penalty_a,
                     length_penalty_b, checkpoint=None, gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
            model from a stopped training.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """

    # for back compat when attention_dropout was not defined
    try:
        model_opt.attention_dropout
    except AttributeError:
        model_opt.attention_dropout = model_opt.dropout

    # Build embeddings.
    if model_opt.model_type == "text" or model_opt.model_type == "vec":
        src_field = fields["src"]
        src_emb = build_embeddings(model_opt, src_field)
    else:
        src_emb = None

    # Build encoder.
    encoder = build_encoder(model_opt, src_emb)

    # Build decoder.
    tgt_field = fields["tgt"]
    tgt_emb = build_embeddings(model_opt, tgt_field, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
            "preprocess with -share_vocab if you use share_embeddings"
        tgt_emb.word_lut.weight = src_emb.word_lut.weight

    decoder = build_decoder(model_opt, tgt_emb)

    # Build NMTModel(= encoder + decoder).
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")
    model = onmt.models.NMTModel(encoder, decoder)

    # Build Generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)

        # MMM: commented the lines below
        # generator = nn.Sequential(
        #     nn.Linear(model_opt.dec_rnn_size,
        #               len(fields["tgt"].base_field.vocab)),
        #     Cast(torch.float32),
        #     gen_func
        # )

        # MMM
        class tune_out_prob(nn.Module):
            def __init__(self):
                super(tune_out_prob, self).__init__()
                self.t_lens = None
                self.eos_ind = None
                self.batch_max_len = None
                self.word_index = None
                self.tgt_vocab_size = None
                self.validation = False

            def length_model_loss(self, scale, value, a, b):
                # return -(value / scale) ** 2 - scale.log()
                # return -((value / scale) ** 2) / 2 - (2.5066 * scale).log()
                return -a * (value / scale) ** 2 + b  # * abs(scale)
                # return -((value / scale) ** 2) * scale + scale
                # return -(value / scale) * 4 + scale

            def forward(self, x):
                y = x.clone()
                # mask = np.ones(x.size())
                # for i in range(self.t_lens.size(-1)):
                #     y[i * self.batch_size + self.t_lens[i], self.eos_ind] = \
                #         y[i * self.batch_size + self.t_lens[i],
                #           self.eos_ind].clone() + math.log(0.9)
                if self.training or self.validation:
                    # training phase
                    y = y.view(self.batch_max_len, -1, self.tgt_vocab_size)
                    # eos_list = [(i * self.batch_max_len +
                    #              self.t_lens.data.cpu().numpy()[i])
                    #             for i in range(self.t_lens.size(-1))]
                    # other_list = list(set(list(range(x.size(0)))) -
                    #                   set(eos_list))
                    # y[other_list, self.eos_ind] = -100
                    # y[eos_list, self.eos_ind] = 0
                    for wi in range(self.batch_max_len):
                        delta_p = (self.t_lens - wi - 1).float()
                        delta_p[delta_p < 0] = 0.05 * delta_p[delta_p < 0]
                        scale = (self.t_lens.float()).sqrt() / 2.0
                        penalties = self.length_model_loss(
                            scale, delta_p, length_penalty_a, length_penalty_b)
                        # penalties[penalties > 0] = 0
+= penalties y = y.view(-1, self.tgt_vocab_size) # mask[eos_list, self.eos_ind] = +2 # mask[other_list, self.eos_ind] = -2 else: # translation phase if len(x.size()) == 3: # x of shape [ tgt_len, batch_size, vocab ] is a full sentence # for i in range(len(self.t_lens)): # other_list = list(set(list(range(x.size(0)))) - set(list([self.t_lens.data.cpu().numpy()[i]]))) # #mask[other_list, i, self.eos_ind] = -2 # y[other_list, i, self.eos_ind] = -100 # if self.t_lens[i] < x.size(0): # #mask[self.t_lens[i], i, self.eos_ind] = +2 # y[self.t_lens[i], i, self.eos_ind] = 0 pass else: # x of shape [(batch_size x beam_size) , vocab ] is only for one step beam_size = x.size(0) // self.t_lens.numel() wi = self.word_index delta_p = (self.t_lens - wi - 2).float() delta_p[delta_p < 0] = 0.005 * delta_p[delta_p < 0] delta_p = delta_p.unsqueeze(1).expand(self.t_lens.numel(), beam_size).flatten() scale = (self.t_lens.float()).sqrt() / 2.0 scale = scale.unsqueeze(1).expand(self.t_lens.numel(), beam_size).flatten() penalties = self.length_model_loss(scale, delta_p, length_penalty_a, length_penalty_b) # penalties[penalties > 0] = 0 y[:, self.eos_ind] += penalties # y[eos_list ^ 1, self.eos_ind] = -100 return y # mask = torch.tensor(mask, dtype=x.dtype).to(device) # x= x+mask # return x # y = x.clone() # # 1. since y is the output of log_softmax, apply exponential # # to convert it to probabilistic form # y = torch.exp(y) # # 2. tune probabilities # eos_list = [(i * self.batch_max_len + self.t_lens.data.cpu().numpy()[i]) for i in # range(self.t_lens.size(-1))] # other_list = list(set(list(range(y.size(0)))) - set(eos_list)) # # z = y.clone() # # 2.1. tune probabilities for eos positions # z[eos_list, self.eos_ind] = 1 # z[eos_list, 0:self.eos_ind] = 0 # z[eos_list, self.eos_ind+1:-1] = 0 # # # 2.2. tune probabilities for non-eos positions # p_val = z[other_list, self.eos_ind] / (self.tgt_vocab_size - 1) # z[other_list, self.eos_ind] = 0 # non_eos_inds = list(set(list(range(self.tgt_vocab_size))) - set([self.eos_ind])) # for i in range(len(other_list)): # z[other_list[i], non_eos_inds] = y[other_list[i], non_eos_inds] + p_val[i] # # # 3. convert y back to log-probability form # z = torch.log(z) # return z # MMM if length_model == 'oracle' or length_model == 'fixed_ratio' or length_model == 'lstm': generator = nn.Sequential( nn.Linear(model_opt.dec_rnn_size, len(fields["tgt"].base_field.vocab)), Cast(torch.float32), gen_func, tune_out_prob() ) else: generator = nn.Sequential( nn.Linear(model_opt.dec_rnn_size, len(fields["tgt"].base_field.vocab)), Cast(torch.float32), gen_func ) # /MMM if model_opt.share_decoder_embeddings: generator[0].weight = decoder.embeddings.word_lut.weight else: tgt_base_field = fields["tgt"].base_field vocab_size = len(tgt_base_field.vocab) pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token] generator = CopyGenerator(model_opt.dec_rnn_size, vocab_size, pad_idx) # Load the model states from checkpoint or initialize them. 
if checkpoint is not None: # This preserves backward-compat for models using customed layernorm def fix_key(s): s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2', r'\1.layer_norm\2.bias', s) s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2', r'\1.layer_norm\2.weight', s) return s checkpoint['model'] = {fix_key(k): v for k, v in checkpoint['model'].items()} # end of patch for backward compatibility model.load_state_dict(checkpoint['model'], strict=False) generator.load_state_dict(checkpoint['generator'], strict=False) else: if model_opt.param_init != 0.0: for p in model.parameters(): p.data.uniform_(-model_opt.param_init, model_opt.param_init) for p in generator.parameters(): p.data.uniform_(-model_opt.param_init, model_opt.param_init) if model_opt.param_init_glorot: for p in model.parameters(): if p.dim() > 1: xavier_uniform_(p) for p in generator.parameters(): if p.dim() > 1: xavier_uniform_(p) if hasattr(model.encoder, 'embeddings'): model.encoder.embeddings.load_pretrained_vectors( model_opt.pre_word_vecs_enc) if hasattr(model.decoder, 'embeddings'): model.decoder.embeddings.load_pretrained_vectors( model_opt.pre_word_vecs_dec) model.generator = generator model.to(device) if model_opt.model_dtype == 'fp16' and model_opt.optim == 'fusedadam': model.half() return model
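# --- Hedged usage sketch (not part of the original code) --------------------
# The tune_out_prob layer above adds a Gaussian-shaped bonus to the EOS
# log-probability that peaks when the decoding step reaches the predicted
# target length. The standalone demo below re-implements that term to show
# its shape; the coefficients a/b stand in for the length_penalty_a and
# length_penalty_b options, and the example lengths are made up.
def _demo_length_penalty():
    import torch
    a, b = 1.0, 0.0                   # assumed coefficients
    t_lens = torch.tensor([10, 20])   # hypothetical target lengths
    for wi in range(0, 25, 5):
        delta_p = (t_lens - wi - 1).float()
        delta_p[delta_p < 0] = 0.05 * delta_p[delta_p < 0]
        scale = t_lens.float().sqrt() / 2.0
        penalty = -a * (delta_p / scale) ** 2 + b
        # penalty is largest (== b) near wi == t_lens - 1
        print(wi, penalty.tolist())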
class SelfRefMaskGenerator(nn.Module):
    # Only this constructor survives in the source; the enclosing class line
    # is reconstructed from the super() call below.
    def __init__(self, self_ref_mask_dict, proj, gen_func):
        super(SelfRefMaskGenerator, self).__init__()
        self.sr_dict = self_ref_mask_dict
        self.proj = proj
        self.cast = Cast(torch.float32)
        self.gen_func = gen_func
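# --- Hedged sketch (assumption) ----------------------------------------------
# SelfRefMaskGenerator's forward pass is not included in this file. A pass
# consistent with the members set above might look like the helper below;
# the mask-lookup key and the masked_fill strategy are guesses (they assume
# sr_dict maps keys to boolean vocab masks), not the original implementation.
def _sketch_self_ref_forward(gen, hidden, mask_key=None):
    # gen: a SelfRefMaskGenerator instance; hidden: decoder hidden states
    logits = gen.cast(gen.proj(hidden))
    if mask_key is not None and mask_key in gen.sr_dict:
        # suppress self-referencing tokens recorded in sr_dict
        logits = logits.masked_fill(gen.sr_dict[mask_key], float('-inf'))
    return gen.gen_func(logits)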
def build_base_model(model_opt, opt, fields, gpu, checkpoint=None,
                     gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        opt: the full runtime options (unused in this builder).
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
                    model from a stopped training.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """

    # Build embeddings.
    if model_opt.model_type == "text":
        src_field = fields["src"]
        src_emb = build_embeddings(model_opt, src_field)
    else:
        src_emb = None

    # Build encoder.
    redr_encoder = build_encoder(model_opt, src_emb)

    # Build decoder.
    tgt_field = fields["tgt"]
    tgt_emb = build_embeddings(model_opt, tgt_field, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
            "preprocess with -share_vocab if you use share_embeddings"
        tgt_emb.word_lut.weight = src_emb.word_lut.weight

    decoder = build_decoder(model_opt, tgt_emb)

    # Build NMTModel(= encoder + decoder).
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")
    model = onmt.models.NMTModel(redr_encoder, decoder)

    # Build Generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == "sparsemax":
            gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
        else:
            gen_func = nn.LogSoftmax(dim=-1)
        generator = nn.Sequential(
            nn.Linear(model_opt.dec_rnn_size,
                      len(fields["tgt"].base_field.vocab)),
            Cast(torch.float32),
            gen_func)
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        tgt_base_field = fields["tgt"].base_field
        vocab_size = len(tgt_base_field.vocab)
        pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
        generator = CopyGenerator(model_opt.dec_rnn_size, vocab_size, pad_idx)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using custom layernorm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {fix_key(k): v
                               for k, v in checkpoint['model'].items()}
        # end of patch for backward compatibility

        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if hasattr(model.encoder.reference_encoder, 'embeddings'):
            model.encoder.reference_encoder.embeddings \
                .load_pretrained_vectors(model_opt.pre_word_vecs_enc)
        if hasattr(model.encoder.history_encoder, 'embeddings'):
            model.encoder.history_encoder.embeddings \
                .load_pretrained_vectors(model_opt.pre_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec)

    model.generator = generator
    model.to(device)
    if model_opt.model_dtype == 'fp16':
        model.half()

    return model
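# --- Hedged sketch (assumption) ----------------------------------------------
# The pretrained-vector loading above implies that redr_encoder exposes
# `reference_encoder` and `history_encoder` sub-modules; the actual ReDR
# encoder class is not shown in this file. A minimal shape-compatible wrapper
# (hypothetical name) would register both so those attribute paths resolve:
import torch.nn as nn

class _CompositeEncoderSketch(nn.Module):
    def __init__(self, reference_encoder, history_encoder):
        super().__init__()
        # assigning nn.Module attributes registers them as sub-modules,
        # which is what model.encoder.reference_encoder / .history_encoder
        # in the builder above relies on
        self.reference_encoder = reference_encoder
        self.history_encoder = history_encoder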
def build_base_model(model_opt, fields, gpu, checkpoint=None, gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
                    model from a stopped training.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the CLSModel.
    """

    # Build embeddings.
    if model_opt.model_type == "text" or model_opt.model_type == "vec":
        src_field = fields["sent1"]
        src_emb = build_embeddings(model_opt, src_field)
    else:
        src_emb = None

    # Build encoder.
    encoder = build_encoder(model_opt, src_emb)

    # Build decoder.
    # tgt_field = fields["tgt"]
    # tgt_emb = build_embeddings(model_opt, tgt_field, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    # if model_opt.share_embeddings:
    #     # src/tgt vocab should be the same if `-share_vocab` is specified.
    #     assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
    #         "preprocess with -share_vocab if you use share_embeddings"
    #     tgt_emb.word_lut.weight = src_emb.word_lut.weight

    # decoder = build_decoder(model_opt, tgt_emb)

    # Build CLSModel(= encoder + classifier head).
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")

    # model = onmt.models.CLSModel(encoder)

    # Build classifier head.
    gen_func = nn.LogSoftmax(dim=-1)
    classifier = nn.Sequential(
        nn.Linear(model_opt.enc_rnn_size * 4, model_opt.enc_rnn_size),
        nn.ReLU(),
        nn.Linear(model_opt.enc_rnn_size, model_opt.n_label),
        Cast(torch.float32),
        # gen_func
    )
    model = onmt.models.CLSModel(encoder, classifier)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using custom layernorm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {fix_key(k): v
                               for k, v in checkpoint['model'].items()}
        # end of patch for backward compatibility

        model.load_state_dict(checkpoint['model'], strict=False)
        classifier.load_state_dict(checkpoint['classifier'], strict=False)
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in classifier.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            # was `generator.parameters()`, which is undefined in this variant
            for p in classifier.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc)
        # if hasattr(model.decoder, 'embeddings'):
        #     model.decoder.embeddings.load_pretrained_vectors(
        #         model_opt.pre_word_vecs_dec)

    model.classifier = classifier
    model.to(device)

    return model
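# --- Hedged sketch (assumption) ----------------------------------------------
# The classifier head above expects vectors of width enc_rnn_size * 4. That
# width is consistent with the common sentence-pair feature [u; v; |u - v|;
# u * v] used in InferSent-style classifiers, though CLSModel's actual pooling
# is not shown in this file; the helper below is only an illustration of that
# assumed feature construction.
import torch

def _pair_features(u, v):
    # u, v: (batch, enc_rnn_size) pooled encodings of sent1/sent2 (assumed)
    return torch.cat([u, v, (u - v).abs(), u * v], dim=-1)  # (batch, 4 * enc_rnn_size)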
def build_base_model(model_opt, fields, gpu, checkpoint=None, gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
                    model from a stopped training.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the TMEPModel.
    """

    # for back compat when attention_dropout was not defined
    try:
        model_opt.attention_dropout
    except AttributeError:
        model_opt.attention_dropout = model_opt.dropout

    # Pick the device.
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")

    # model = onmt.models.NMTModel(encoder, decoder,
    #                              model_opt.pos_enc, model_opt.pos_dec)

    gen_func = nn.LogSoftmax(dim=-1)
    # model = nn.Sequential(
    #     nn.Linear(model_opt.enc_rnn_size, model_opt.enc_rnn_size),
    #     # nn.BatchNorm1d(model_opt.enc_rnn_size),
    #     nn.ReLU(),
    #     nn.Dropout()
    # )

    input_size = (model_opt.dec_rnn_size if model_opt.rl_step
                  else model_opt.enc_rnn_size)
    output_size = 54 if model_opt.sample_method == "topk" else 20
    generators = {}
    for kv in model_opt.generators.split(","):
        k, _ = kv.split(":")
        # output_size = 54 if k == "0" else output_size
        generators[k] = nn.Sequential(
            nn.Linear(input_size, input_size),
            Cast(torch.float32),
            # nn.BatchNorm1d(model_opt.enc_rnn_size),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(input_size, output_size),
            Cast(torch.float32),
            gen_func)

    class TMEPModel(nn.Module):
        def __init__(self, gens):
            super(TMEPModel, self).__init__()
            self.generators = gens

        def forward(self, inputs, fix_k=None):
            outputs = {}
            for name, gen in self.generators.items():
                if name == fix_k:
                    # the generator named `fix_k` runs without gradients
                    with torch.no_grad():
                        outputs[name] = gen(inputs)
                else:
                    outputs[name] = gen(inputs)
            return outputs

    model = TMEPModel(generators)
    # Register each generator as a direct attribute so its parameters are
    # tracked: self.generators is a plain dict, which nn.Module does not
    # register as sub-modules on its own.
    for k, v in model.generators.items():
        setattr(model, k, v)
        # v.to(device)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using custom layernorm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {fix_key(k): v
                               for k, v in checkpoint['model'].items()}
        # end of patch for backward compatibility

        model.load_state_dict(checkpoint['model'], strict=False)
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

    model.to(device)
    if model_opt.model_dtype == 'fp16' and model_opt.optim == 'fusedadam':
        model.half()

    return model
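# --- Hedged usage sketch (assumption) ----------------------------------------
# Assuming model_opt.generators is a comma-separated "name:spec" string such
# as "0:a,1:b", the TMEPModel above maps one shared input batch to one output
# per generator; passing fix_k freezes that generator's gradients for the
# step. The shapes below are illustrative only:
#
#     feats = torch.randn(8, input_size)
#     outs = model(feats)             # {"0": log-probs, "1": log-probs}
#     outs = model(feats, fix_k="0")  # generator "0" runs under no_grad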
def build_base_model(model_opt, fields, gpu, checkpoint=None, gpu_id=None):
    """Build a model from opts.

    Args:
        model_opt: the option loaded from checkpoint. It's important that
            the opts have been updated and validated. See
            :class:`onmt.utils.parse.ArgumentParser`.
        fields (dict[str, torchtext.data.Field]):
            `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed snapshot
                    model from a stopped training.
        gpu_id (int or NoneType): Which GPU to use.

    Returns:
        the NMTModel.
    """

    # for back compat when attention_dropout was not defined
    try:
        model_opt.attention_dropout
    except AttributeError:
        model_opt.attention_dropout = model_opt.dropout

    # Build embeddings.
    if model_opt.model_type == "text" or model_opt.model_type == "vec":
        src_field = fields["src"]
        src_emb = build_embeddings(model_opt, src_field)
    else:
        src_emb = None

    # Build encoder.
    encoder = build_encoder(model_opt, src_emb)

    # Build decoder.
    tgt_field = fields["tgt"]
    tgt_emb = build_embeddings(model_opt, tgt_field, for_encoder=False)

    # Share the embedding matrix - preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        assert src_field.base_field.vocab == tgt_field.base_field.vocab, \
            "preprocess with -share_vocab if you use share_embeddings"
        tgt_emb.word_lut.weight = src_emb.word_lut.weight

    decoder = build_decoder(model_opt, tgt_emb)

    if "continuous" in model_opt.generator_function:
        # Make target output embeddings from the pretrained vocab vectors.
        tgt_out_vectors = tgt_field.base_field.vocab.vectors
        if model_opt.center:
            # subtract the centroid of the embedding table
            center_emb = tgt_out_vectors.sum(
                dim=0, keepdim=True) / tgt_out_vectors.size(0)
            tgt_out_vectors = tgt_out_vectors - center_emb
        tgt_out_vectors_unitnorm = nn.functional.normalize(
            tgt_out_vectors, p=2, dim=1)
        tgt_out_emb = nn.Embedding(tgt_out_vectors.size(0),
                                   tgt_out_vectors.size(1))
        tgt_out_emb.weight.data.copy_(tgt_out_vectors_unitnorm)
        tgt_out_emb.weight.requires_grad = False  # do not train the embeddings

    # Build NMTModel(= encoder + decoder).
    if gpu and gpu_id is not None:
        device = torch.device("cuda", gpu_id)
    elif gpu and not gpu_id:
        device = torch.device("cuda")
    elif not gpu:
        device = torch.device("cpu")
    model = onmt.models.NMTModel(encoder, decoder)

    # Build Generator.
    if not model_opt.copy_attn:
        if model_opt.generator_function == 'continuous-linear':
            generator_modules = [
                nn.Linear(model_opt.dec_rnn_size, tgt_out_vectors.size(1))
            ]
            if model_opt.generator_layer_norm:
                generator_modules.append(
                    nn.LayerNorm(tgt_out_vectors.size(1), eps=1e-6))
            generator = nn.Sequential(*generator_modules)
        elif model_opt.generator_function == 'continuous-nonlinear':
            # add a non-linear layer before generating the continuous vector
            generator_modules = [
                nn.Linear(model_opt.dec_rnn_size, tgt_out_vectors.size(1)),
                nn.ReLU(),
                nn.Linear(tgt_out_vectors.size(1), tgt_out_vectors.size(1))
            ]
            if model_opt.generator_layer_norm:
                generator_modules.append(
                    nn.LayerNorm(tgt_out_vectors.size(1), eps=1e-6))
            generator = nn.Sequential(*generator_modules)
        else:
            if model_opt.generator_function == "sparsemax":
                gen_func = onmt.modules.sparse_activations.LogSparsemax(dim=-1)
            else:
                gen_func = nn.LogSoftmax(dim=-1)
            generator = nn.Sequential(
                nn.Linear(model_opt.dec_rnn_size,
                          len(fields["tgt"].base_field.vocab)),
                Cast(torch.float32),
                gen_func)
            if model_opt.share_decoder_embeddings:
                generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        tgt_base_field = fields["tgt"].base_field
        vocab_size = len(tgt_base_field.vocab)
        pad_idx = tgt_base_field.vocab.stoi[tgt_base_field.pad_token]
        generator = CopyGenerator(model_opt.dec_rnn_size, vocab_size, pad_idx)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        # This preserves backward-compat for models using custom layernorm
        def fix_key(s):
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.b_2',
                       r'\1.layer_norm\2.bias', s)
            s = re.sub(r'(.*)\.layer_norm((_\d+)?)\.a_2',
                       r'\1.layer_norm\2.weight', s)
            return s

        checkpoint['model'] = {fix_key(k): v
                               for k, v in checkpoint['model'].items()}
        # end of patch for backward compatibility

        model.load_state_dict(checkpoint['model'], strict=False)
        generator.load_state_dict(checkpoint['generator'], strict=False)
    else:
        if model_opt.param_init != 0.0:
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if model_opt.param_init_glorot:
            for p in model.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)
            for p in generator.parameters():
                if p.dim() > 1:
                    xavier_uniform_(p)

        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec)

    model.generator = generator
    if "continuous" in model_opt.generator_function:
        model.decoder.tgt_out_emb = tgt_out_emb
        if model_opt.share_decoder_embeddings:
            model.decoder.embeddings.tie_embeddings(tgt_out_emb.weight)

    model.to(device)
    if model_opt.model_dtype == 'fp16' and model_opt.optim == 'fusedadam':
        model.half()

    return model
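# --- Hedged sketch (assumption) ----------------------------------------------
# With a "continuous" generator the model emits an embedding-sized vector
# rather than vocabulary logits. This file does not include the decoding step;
# one standard choice, and the natural match for the unit-normalized
# tgt_out_emb built above, is nearest-neighbour search under cosine
# similarity. The helper below is an illustration, not the original decoder.
import torch
import torch.nn.functional as F

def _nearest_token(pred, tgt_out_emb):
    # pred: (batch, emb_dim) generator outputs; tgt_out_emb: nn.Embedding
    pred = F.normalize(pred, p=2, dim=-1)   # rows of tgt_out_emb are unit-norm
    sims = pred @ tgt_out_emb.weight.t()    # cosine similarity to each token
    return sims.argmax(dim=-1)              # (batch,) predicted token ids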