def get_position_emb(num, dim):
    from fairseq.modules.sinusoidal_positional_embedding import (
        SinusoidalPositionalEmbedding,
    )

    return (
        SinusoidalPositionalEmbedding.get_embedding(num + 2, dim, 1)
        .detach()
        .numpy()[2:]
    )
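# Illustrative usage of get_position_emb (a sketch; assumes fairseq and numpy are
# available). Rows 0-1 of fairseq's sinusoidal table are the reserved/padding
# positions (padding_idx=1), so the helper requests ``num + 2`` rows and drops the
# first two, leaving a (num, dim) numpy array.
_example_pos = get_position_emb(128, 1024)
assert _example_pos.shape == (128, 1024)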
def convert_model_weight(self, opt: Opt) -> Dict[str, Any]:
    """
    Convert state_dict between fairseq and ParlAI.

    :param opt:
        ParlAI opt

    :return state_dict:
        a state dict to load into the ParlAI model.
    """
    # deal with embeddings
    state = self.state
    agent = self.agent
    state_dict = state['model']
    return_dict = OrderedDict()
    for each_key in state_dict.keys():
        mapped_key = each_key
        if mapped_key == 'encoder.version' or mapped_key == 'decoder.version':
            continue

        # 1. replace if embedding
        for emb in EMBEDDING_DICT_MAPPING:
            mapped_key = mapped_key.replace(emb, EMBEDDING_DICT_MAPPING[emb])

        # 2. Replace attention
        if 'encoder' in each_key and 'self_attn' in each_key:
            mapped_key = mapped_key.replace('self_attn', 'attention')
        elif 'decoder' in each_key and 'self_attn' in each_key:
            mapped_key = mapped_key.replace('self_attn', 'self_attention')
        elif 'decoder' in each_key and 'encoder_attn' in each_key:
            mapped_key = mapped_key.replace('encoder_attn', 'encoder_attention')

        # 3. Replace multihead linear layers
        # fairseq sometimes chunks all three layers into one model weight;
        # see the illustrative split sketch after this method.
        if 'in_proj_weight' in mapped_key or 'in_proj_bias' in mapped_key:
            for weightorbias in {'weight', 'bias'}:
                attention_project_name = 'in_proj_{}'.format(weightorbias)
                if attention_project_name in mapped_key:
                    weight = state_dict[each_key]
                    size = int(weight.size(0) / 3)
                    weights = weight.split(size, 0)
                    # For Q, K, V in order
                    return_dict[
                        mapped_key.replace(
                            attention_project_name,
                            'q_lin.{}'.format(weightorbias),
                        )
                    ] = weights[0]
                    return_dict[
                        mapped_key.replace(
                            attention_project_name,
                            'k_lin.{}'.format(weightorbias),
                        )
                    ] = weights[1]
                    return_dict[
                        mapped_key.replace(
                            attention_project_name,
                            'v_lin.{}'.format(weightorbias),
                        )
                    ] = weights[2]
            continue
        elif (
            'v_proj' in mapped_key
            or 'k_proj' in mapped_key
            or 'q_proj' in mapped_key
        ):
            mapped_key = mapped_key.replace('v_proj', 'v_lin')
            mapped_key = mapped_key.replace('q_proj', 'q_lin')
            mapped_key = mapped_key.replace('k_proj', 'k_lin')

        # 4. Replace FFN layers
        for old, new in FFN_MAPPING.items():
            mapped_key = mapped_key.replace(old, new)

        # 5. Fix layer norms
        if 'encoder.' in mapped_key:
            mapped_key = mapped_key.replace('attention_layer_norm', 'norm1')
            mapped_key = mapped_key.replace('final_layer_norm', 'norm2')
        else:
            mapped_key = mapped_key.replace('self_attention_layer_norm', 'norm1')
            mapped_key = mapped_key.replace('encoder_attention_layer_norm', 'norm2')
            mapped_key = mapped_key.replace('final_layer_norm', 'norm3')
        for _key in ['encoder', 'decoder']:
            mapped_key = mapped_key.replace(
                f'{_key}.layer_norm', f'{_key}.norm_embeddings'
            )
            mapped_key = mapped_key.replace(
                f'{_key}.layernorm_embedding', f'{_key}.norm_embeddings'
            )

        weight = state_dict[each_key]
        return_dict[mapped_key] = weight

    # 6. Shuffle embedding matrix given dictionary.
    enc_emb_key = 'encoder.embeddings.weight'
    bart_dict = os.path.join(opt['datapath'], 'models/bart/bart.large/dict.txt')
    with PathManager.open(bart_dict) as f:
        offset_dict = {i: l.split()[0] for i, l in enumerate(f.readlines())}
    new_embs = return_dict[enc_emb_key].clone()
    for idx, new_idx in offset_dict.items():
        try:
            new_embs[int(new_idx) + 4] = return_dict[enc_emb_key][idx + 4]
        except ValueError:
            # new_idx is not an int (e.g. a 'madeupword' filler token)
            if 'madeupword' in new_idx:
                pad_idx = int(new_idx.split('madeupword')[1])
                new_embs[-(4 - pad_idx)] = return_dict[
                    'encoder.embeddings.weight'
                ][idx + 4]
    return_dict['encoder.embeddings.weight'] = new_embs

    # 7. Swap special tokens
    #    Fairseq swaps the bos and eos token order for seq2seq models.
    #
    #    ParlAI s2s models expect:
    #        Encoder: TOKENS </s>
    #        Decoder: <s> TOKENS <s>
    #
    #    Fairseq models get:
    #        Encoder: TOKENS </s>
    #        Decoder: </s> TOKENS <s>
    #
    #    So we reorder the special-token embeddings into ParlAI's dictionary order
    #    (and, unless retain_bos_emb is set, reuse the </s> embedding for <s>),
    #    so that the converted model effectively still gets:
    #        Encoder: TOKENS </s>
    #        Decoder: </s> TOKENS <s>
    #
    size_dict = return_dict[enc_emb_key].size(0)
    if size_dict == len(agent.dict) + 1 and '<mask>' not in agent.dict:
        return_dict[enc_emb_key] = return_dict[enc_emb_key][: size_dict - 1, :]
        size_dict -= 1
    specials, words = return_dict[enc_emb_key].split([4, size_dict - 4], 0)
    bos, pad, eos, unk = specials
    if not self.opt['retain_bos_emb']:
        bos = eos
    specials = torch.stack([pad, bos, eos, unk])
    fp16_pad = (8 - (len(specials) + len(words)) % 8) % 8
    fp16_pad_ez = torch.zeros(fp16_pad, specials.size(1)).type_as(specials)
    return_dict[enc_emb_key] = torch.cat(
        [
            specials,  # special tokens
            words,  # word embeddings
            fp16_pad_ez,  # fp16 requires embeddings size to be a multiple of 8
        ],
        0,
    )
    return_dict['decoder.embeddings.weight'] = return_dict[enc_emb_key]
    return_dict['embeddings.weight'] = return_dict[enc_emb_key]

    # 8. Positional Embeddings
    if 'encoder.position_embeddings.weight' in return_dict:
        return_dict['encoder.position_embeddings.weight'] = return_dict[
            'encoder.position_embeddings.weight'
        ][2:, :]
        return_dict['decoder.position_embeddings.weight'] = return_dict[
            'decoder.position_embeddings.weight'
        ][2:, :]
    else:
        # sinusoidal embeddings
        from fairseq.modules.sinusoidal_positional_embedding import (
            SinusoidalPositionalEmbedding,
        )

        emb = SinusoidalPositionalEmbedding.get_embedding(
            128 + 2, opt['embedding_size'], 1
        )
        del return_dict['encoder.position_embeddings._float_tensor']
        del return_dict['decoder.position_embeddings._float_tensor']
        return_dict['encoder.position_embeddings.weight'] = emb[2:]
        return_dict['decoder.position_embeddings.weight'] = emb[2:]

    return_dict['START'] = torch.LongTensor([1])  # type: ignore
    return return_dict
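# Illustrative sketch of step 3 above (not part of the conversion script): some fairseq
# checkpoints store the attention input projection as one fused tensor of shape
# (3 * embed_dim, embed_dim), which the converter splits into equal thirds in Q, K, V
# order and maps to ParlAI's q_lin / k_lin / v_lin. The tensor below is random dummy data.
import torch

_embed_dim = 8
_fused_in_proj_weight = torch.randn(3 * _embed_dim, _embed_dim)
_q, _k, _v = _fused_in_proj_weight.split(_fused_in_proj_weight.size(0) // 3, 0)
assert _q.shape == _k.shape == _v.shape == (_embed_dim, _embed_dim)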
def __init__(
    self,
    embed_dim,
    num_heads,
    kdim=None,
    vdim=None,
    dropout=0.0,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    self_attention=False,
    encoder_decoder_attention=False,
    q_noise=0.0,
    qn_block_size=8,
    normalized_attention=False,
    normalized_attention_logsoftmax=False,
    normalized_attention_by_entropy=False,
    positional_embeddings_in_attention=False,
    symmetric_kv_context_params=False,
    symmetric_kv_positional_params=False,
    # normalized_attention_by_positional_score=False,
):
    super().__init__()
    self.embed_dim = embed_dim
    self.kdim = kdim if kdim is not None else embed_dim
    self.vdim = vdim if vdim is not None else embed_dim
    self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

    self.num_heads = num_heads
    self.dropout_module = FairseqDropout(
        dropout, module_name=self.__class__.__name__
    )
    self.positional_embeddings_in_attention = positional_embeddings_in_attention
    self.symmetric_kv_context_params = symmetric_kv_context_params
    self.symmetric_kv_positional_params = symmetric_kv_positional_params
    # self.normalized_attention_by_positional_score = normalized_attention_by_positional_score

    self.head_dim = embed_dim // num_heads
    assert (
        self.head_dim * num_heads == self.embed_dim
    ), "embed_dim must be divisible by num_heads"
    self.scaling = self.head_dim ** -0.5

    self.self_attention = self_attention
    self.encoder_decoder_attention = encoder_decoder_attention

    assert not self.self_attention or self.qkv_same_dim, (
        "Self-attention requires query, key and value to be of the same size"
    )

    # Input projections; quant_noise leaves the layer unchanged when q_noise == 0.
    self.k_proj = quant_noise(
        nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.v_proj = quant_noise(
        nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.q_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    if self.symmetric_kv_context_params:
        assert self.kdim == embed_dim, (
            "Symmetric context attention requires kdim == embed_dim"
        )
        # Tie the query and key projection weights.
        self.q_proj.weight = self.k_proj.weight

    self.out_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )

    if add_bias_kv:
        self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
        self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
    else:
        self.bias_k = self.bias_v = None

    self.add_zero_attn = add_zero_attn
    self.normalized_attention = normalized_attention
    self.normalized_attention_logsoftmax = normalized_attention_logsoftmax
    self.normalized_attention_by_entropy = normalized_attention_by_entropy
    if self.normalized_attention:
        # Per-head gain used by the normalized attention variant.
        self.attention_gain = quant_noise(
            nn.Linear(embed_dim, num_heads, bias=True), q_noise, qn_block_size
        )

    if self.positional_embeddings_in_attention:
        # Separate key/query projections for the positional term of the attention score.
        self.pos_k_proj = quant_noise(
            nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size
        )
        self.pos_q_proj = quant_noise(
            nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
        )
        if self.symmetric_kv_positional_params:
            assert self.kdim == embed_dim, (
                "Symmetric positional attention requires kdim == embed_dim"
            )
            self.pos_q_proj.weight = self.pos_k_proj.weight
        self.pos_embeddings = SinusoidalPositionalEmbedding(embed_dim, None)

    self.reset_parameters()

    self.onnx_trace = False
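# Illustrative construction (a sketch; assumes this __init__ belongs to a
# MultiheadAttention class as in fairseq, and that reset_parameters initializes
# weights in place as stock fairseq does). With symmetric_kv_context_params=True the
# query and key projections share a single Parameter, so any update to k_proj is
# seen by q_proj as well.
_attn = MultiheadAttention(
    embed_dim=16,
    num_heads=4,
    self_attention=True,
    symmetric_kv_context_params=True,
)
assert _attn.q_proj.weight is _attn.k_proj.weight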