def __init__(
    self,
    n_tokens,
    n_output_tokens,
    d_model=512,
    d_out_projection=512,
    n_hidden_output=1,
    d_rep=128,
    n_head=8,
    n_encoder_layers=6,
    d_ff=2048,
    dropout=0.1,
    activation="relu",
    norm=True,
    pad_id=None,
    encoder_type="transformer",
):
    super(TypeTransformer, self).__init__()
    assert norm
    assert pad_id is not None
    self.config = {k: v for k, v in locals().items() if k != "self"}

    # Encoder and output for type prediction
    assert encoder_type in ["transformer", "lstm"]
    if encoder_type == "transformer":
        self.encoder = CodeEncoder(
            n_tokens, d_model, d_rep, n_head, n_encoder_layers, d_ff, dropout, activation, norm, pad_id, project=False
        )
        # TODO: Try LeakyReLU
        self.output = nn.Sequential(nn.Linear(d_model, d_model), nn.ReLU(), nn.Linear(d_model, n_output_tokens))
    elif encoder_type == "lstm":
        self.encoder = CodeEncoderLSTM(
            n_tokens=n_tokens,
            d_model=d_model,
            d_rep=d_rep,
            n_encoder_layers=n_encoder_layers,
            dropout=dropout,
            pad_id=pad_id,
            project=False,
        )
        layers = []
        layers.append(nn.Linear(d_model * 2, d_out_projection))
        if n_hidden_output > 1:
            layers.append(nn.Dropout(dropout))
            layers.append(nn.ReLU())
        for hidden_idx in range(n_hidden_output - 1):
            layers.append(nn.Linear(d_out_projection, d_out_projection))
            layers.append(nn.Dropout(dropout))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(d_out_projection, n_output_tokens))
        self.output = nn.Sequential(*layers)

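# Illustrative sketch, not part of the model above: rebuilds the LSTM output head in isolation
# so the layer wiring is concrete. The sizes below are hypothetical; d_model * 2 presumably
# reflects the concatenated forward/backward states of the bidirectional LSTM encoder.
import torch.nn as nn

d_model, d_out_projection, n_hidden_output, n_output_tokens, dropout = 512, 512, 2, 1000, 0.1
layers = [nn.Linear(d_model * 2, d_out_projection)]
if n_hidden_output > 1:
    layers += [nn.Dropout(dropout), nn.ReLU()]
for _ in range(n_hidden_output - 1):
    layers += [nn.Linear(d_out_projection, d_out_projection), nn.Dropout(dropout), nn.ReLU()]
layers.append(nn.Linear(d_out_projection, n_output_tokens))
print(nn.Sequential(*layers))  # Linear(1024->512), Dropout, ReLU, Linear(512->512), Dropout, ReLU, Linear(512->1000)
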
def make_encoder(
    self, n_tokens, d_model, d_rep, pad_id=None, encoder_type="transformer", lstm_project_mode="hidden", n_encoder_layers=6, dropout=0.1, **kwargs
):
    if encoder_type == "transformer":
        return CodeEncoder(n_tokens, project=True, pad_id=pad_id, d_model=d_model, d_rep=d_rep, n_encoder_layers=n_encoder_layers, **kwargs)
    elif encoder_type == "lstm":
        return CodeEncoderLSTM(
            n_tokens=n_tokens,
            d_model=d_model,
            d_rep=d_rep,
            n_encoder_layers=n_encoder_layers,
            dropout=dropout,
            pad_id=pad_id,
            project=lstm_project_mode,
        )
    else:
        raise ValueError

def __init__(self, n_tokens, d_model=512, pad_id=None, encoder_type="transformer", **encoder_args):
    super().__init__()
    self.n_tokens = n_tokens
    self.d_model = d_model
    if encoder_type == "transformer":
        self.encoder = CodeEncoder(n_tokens, project=False, pad_id=pad_id, d_model=d_model, **encoder_args)
        self.head_in = d_model
    elif encoder_type == "lstm":
        self.encoder = CodeEncoderLSTM(n_tokens=n_tokens, d_model=d_model, pad_id=pad_id, project=False, **encoder_args)
        self.head_in = 2 * d_model
    else:
        raise ValueError
    self.head = nn.Sequential(nn.Linear(self.head_in, d_model), nn.ReLU(), nn.LayerNorm(d_model))

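# Quick shape check with made-up sizes: the projection head maps the encoder summary to
# d_model. head_in is 2 * d_model for the bidirectional LSTM encoder and d_model for the
# transformer encoder, as set above.
import torch
import torch.nn as nn

d_model, head_in, batch = 512, 2 * 512, 4  # LSTM case
head = nn.Sequential(nn.Linear(head_in, d_model), nn.ReLU(), nn.LayerNorm(d_model))
pooled = torch.randn(batch, head_in)  # stand-in for a pooled encoder representation
print(head(pooled).shape)  # torch.Size([4, 512])
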
class TransformerModel(nn.Module):
    def __init__(
        self,
        n_tokens,
        d_model=512,
        d_rep=128,
        n_head=8,
        n_encoder_layers=6,
        d_ff=2048,
        dropout=0.1,
        activation="relu",
        norm=True,
        pad_id=None,
        n_decoder_layers=6,
    ):
        super(TransformerModel, self).__init__()
        assert norm
        assert pad_id is not None
        self.config = {k: v for k, v in locals().items() if k != "self"}

        # Encoder
        self.encoder = CodeEncoder(n_tokens, d_model, d_rep, n_head, n_encoder_layers, d_ff, dropout, activation, norm, pad_id, project=False)

        # Decoder
        decoder_layer = nn.TransformerDecoderLayer(d_model, n_head, d_ff, dropout, activation)
        decoder_norm = nn.LayerNorm(d_model) if norm else None
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=n_decoder_layers, norm=decoder_norm)

    def forward(self, src_tok_ids, tgt_tok_ids, src_lengths=None, tgt_lengths=None):
        r"""
        Arguments:
            src_tok_ids: [B, L] long tensor
            tgt_tok_ids: [B, T] long tensor
        """
        if src_tok_ids.size(0) != tgt_tok_ids.size(0):
            raise RuntimeError("the batch size of src_tok_ids and tgt_tok_ids must be equal")

        # Encode
        memory = self.encoder(src_tok_ids)  # [L, B, d_model]

        # Decode, using the same embedding and positional encoding as the encoder
        tgt_emb = self.encoder.embedding(tgt_tok_ids).transpose(0, 1) * math.sqrt(self.config["d_model"])
        tgt_emb = self.encoder.pos_encoder(tgt_emb)
        tgt_mask = self.generate_square_subsequent_mask(tgt_tok_ids.size(1)).to(tgt_tok_ids.device)
        if self.config["pad_id"] is None:
            assert False
            tgt_key_padding_mask = None
        else:
            tgt_key_padding_mask = tgt_tok_ids == self.config["pad_id"]
        output = self.decoder(tgt_emb, memory, tgt_mask=tgt_mask, memory_mask=None, tgt_key_padding_mask=tgt_key_padding_mask)

        # Project decoder outputs onto the (tied) input embedding to get vocabulary logits
        logits = torch.matmul(output, self.encoder.embedding.weight.transpose(0, 1))  # [T, B, ntok]
        return torch.transpose(logits, 0, 1)  # [B, T, ntok]

    def generate_square_subsequent_mask(self, sz):
        r"""Generate a square mask for the sequence. The masked positions are filled with float('-inf').
        Unmasked positions are filled with float(0.0).
        """
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float("-inf")).masked_fill(mask == 1, float(0.0))
        return mask

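# Standalone demo of the causal mask used by the decoder above: positions a query may not
# attend to are -inf, allowed positions are 0. Runnable as-is with only torch; sz=4 is arbitrary.
import torch

sz = 4
mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
mask = mask.float().masked_fill(mask == 0, float("-inf")).masked_fill(mask == 1, float(0.0))
print(mask)
# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., -inf],
#         [0., 0., 0., 0.]])
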
def __init__(
    self,
    n_tokens,
    d_model=512,
    n_head=8,
    n_encoder_layers=6,
    d_ff=2048,
    dropout=0.1,
    activation="relu",
    norm=True,
    pad_id=None,
    encoder_type="transformer",
    critic_type="bilinear_identity",
    bilinear_rank=None,
):
    super().__init__()
    assert norm
    assert pad_id is not None
    self.config = {k: v for k, v in locals().items() if k != "self"}

    # Encoder and bilinear critic for scoring representations
    assert encoder_type in ["transformer", "lstm"]
    if encoder_type == "transformer":
        d_critic_rep = d_model  # Per-token dimension, then take mean
        self.encoder = CodeEncoder(
            n_tokens=n_tokens,
            d_model=d_model,
            n_head=n_head,
            n_encoder_layers=n_encoder_layers,
            d_ff=d_ff,
            dropout=dropout,
            activation=activation,
            norm=norm,
            pad_id=pad_id,
            project=False,
        )
    elif encoder_type == "lstm":
        d_critic_rep = 4 * d_model  # 4 * d_model for a 2-layer bidirectional LSTM
        self.encoder = CodeEncoderLSTM(
            n_tokens=n_tokens,
            d_model=d_model,
            n_encoder_layers=n_encoder_layers,
            dropout=dropout,
            pad_id=pad_id,
            project=False,
        )

    # Critic weight, parameterized according to critic_type
    if critic_type == "bilinear_diagonal":
        self.output_weight = nn.Parameter(torch.randn(d_critic_rep), requires_grad=True)
    elif critic_type == "bilinear_symmetric":
        self.output_weight = nn.Parameter(torch.randn(d_critic_rep, d_critic_rep), requires_grad=True)
    elif critic_type == "bilinear_symmetric_plus_identity":
        W = torch.randn(d_critic_rep, d_critic_rep) + torch.eye(d_critic_rep)
        self.output_weight = nn.Parameter(W, requires_grad=True)
    elif critic_type == "bilinear_identity":
        self.output_weight = None
    elif critic_type == "bilinear_lowrank":
        assert bilinear_rank
        W = torch.randn(bilinear_rank, d_critic_rep)
        self.output_weight = nn.Parameter(W, requires_grad=True)
    else:
        raise ValueError

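# Hedged sketch of how a critic score could be computed from two pooled representations
# q and k (both [B, d_critic_rep]), one case per critic_type above. The model's forward pass
# is not shown in this excerpt, so the exact reduction used here is an assumption.
import torch


def bilinear_score(q, k, critic_type, output_weight=None):
    if critic_type == "bilinear_identity":
        return (q * k).sum(dim=-1)                      # plain dot product
    if critic_type == "bilinear_diagonal":
        return (q * output_weight * k).sum(dim=-1)      # q^T diag(w) k
    if critic_type in ("bilinear_symmetric", "bilinear_symmetric_plus_identity"):
        W = 0.5 * (output_weight + output_weight.t())   # symmetrize (one plausible reading of the naming)
        return (q @ W * k).sum(dim=-1)                  # q^T W k
    if critic_type == "bilinear_lowrank":
        return (q @ output_weight.t() * (k @ output_weight.t())).sum(dim=-1)  # (Uq)^T (Uk)
    raise ValueError(critic_type)
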
def embed_augmented(
    # Data
    data_filepath: str,
    output_dir: str,
    spm_filepath: str,
    num_workers=1,
    max_seq_len=-1,
    min_alternatives=2,
    # Model
    encoder_type: str = "lstm",
    pretrain_resume_path: str = "",
    pretrain_resume_encoder_name: str = "encoder_q",  # encoder_q, encoder_k, encoder
    pretrain_resume_project: bool = False,
    # no_output_attention: bool = False,
    n_encoder_layers: int = 2,
    d_model: int = 512,
    # Loss
    subword_regularization_alpha: float = 0,
    # Computational
    use_cuda: bool = True,
    seed: int = 0,
):
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    config = locals()
    logger.info(f"Config: {config}")

    if use_cuda:
        assert torch.cuda.is_available(), "CUDA not available. Check env configuration, or pass --use_cuda False"

    sp = spm.SentencePieceProcessor()
    sp.Load(spm_filepath)
    pad_id = sp.PieceToId("[PAD]")
    mask_id = sp.PieceToId("[MASK]")

    # Create model
    if encoder_type == "lstm":
        encoder = CodeEncoderLSTM(
            n_tokens=sp.GetPieceSize(),
            d_model=d_model,
            d_rep=256,
            n_encoder_layers=n_encoder_layers,
            dropout=0.1,
            pad_id=pad_id,
            project=False,
        )
        encoder.config["project"] = "hidden"
        logger.info(f"Created CodeEncoderLSTM with {count_parameters(encoder)} params")
    elif encoder_type == "transformer":
        encoder = CodeEncoder(sp.GetPieceSize(), d_model, 256, 8, n_encoder_layers, 2048, 0.1, "relu", True, pad_id, project=False)
        logger.info(f"Created CodeEncoder with {count_parameters(encoder)} params")

    # Load pretrained checkpoint
    if pretrain_resume_path:
        logger.info(
            f"Resuming training from pretraining checkpoint {pretrain_resume_path}, pretrain_resume_encoder_name={pretrain_resume_encoder_name}"
        )
        checkpoint = torch.load(pretrain_resume_path)
        pretrained_state_dict = checkpoint["model_state_dict"]
        for key in pretrained_state_dict.keys():
            print("Pretrained state dict:", key)
        for key in encoder.state_dict().keys():
            print("Encoder state dict:", key)

        # Keep only weights of the selected encoder, dropping its name prefix and any projection layer
        encoder_state_dict = {}
        assert pretrain_resume_encoder_name in ["encoder_k", "encoder_q", "encoder"]
        for key, value in pretrained_state_dict.items():
            if key.startswith(pretrain_resume_encoder_name + ".") and "project_layer" not in key:
                remapped_key = key[len(pretrain_resume_encoder_name + ".") :]
                logger.debug(f"Remapping checkpoint key {key} to {remapped_key}. Value mean: {value.mean().item()}")
                encoder_state_dict[remapped_key] = value
        encoder.load_state_dict(encoder_state_dict)
        logger.info(f"Loaded state dict from {pretrain_resume_path}")

    # Parallelize across GPUs
    encoder = nn.DataParallel(encoder)
    encoder = encoder.cuda() if use_cuda else encoder

    # Load batches consisting of augmented variants of the same program
    sp = spm.SentencePieceProcessor()
    sp.Load(config["spm_filepath"])
    pad_id = sp.PieceToId("[PAD]")

    def pad_collate(batch):
        assert len(batch) == 1
        X = batch[0]
        B = len(X)

        # Create tensor of sequence lengths, [B] or [2B]
        lengths = torch.tensor([len(x) for x in X], dtype=torch.long)

        # Create padded tensor for batch, [B, T]
        X = pad_sequence(X, batch_first=True, padding_value=pad_id)

        return X, lengths

    dataset = PrecomputedDataset(
        data_filepath,
        min_alternatives=min_alternatives,
        program_mode="all_alternatives",
        limit_size=-1,
        sp=sp,
        subword_regularization_alpha=subword_regularization_alpha,
        max_length=max_seq_len,
    )
    loader = torch.utils.data.DataLoader(
        dataset, batch_size=1, shuffle=True, collate_fn=pad_collate, num_workers=num_workers, drop_last=False, pin_memory=False,
    )

    representations = []
    encoder.eval()
    os.makedirs(output_dir, exist_ok=True)
    with torch.no_grad():
        # Evaluate metrics
        logger.info("Evaluating encoder...")
        pbar = tqdm.tqdm(loader, desc="evaluate")
        for X, lengths in pbar:
            rep = encoder(X.cuda(), lengths.cuda(), None)  # [B, n_layers*n_directions*d_model]
            if encoder_type == "transformer":
                assert len(rep.shape) == 3
                rep = rep.mean(dim=0)  # rep is [T, B, dimension], so take the mean across the sequence
            rep = rep.cpu().numpy()
            X = X.cpu().numpy()
            print("rep", type(rep), "X", type(X))
            print("rep", rep.shape, "X", X.shape)
            representations.append((X, rep))

            if len(representations) and len(representations) % 100 == 0:
                path = os.path.join(output_dir, f"tokens_and_embeddings_{len(representations):06d}.pth")
                logger.info(f"Saving representations to {path}")
                # with open(path, "wb") as f:
                #     pickle.dump(representations, f)
                # torch.save(path, representations)
                torch.save(representations, path)

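# Hedged sketch of consuming the files written above: each checkpoint holds the list of
# (token_id_array, embedding_array) numpy pairs accumulated so far, saved with torch.save.
# The path below is hypothetical; on PyTorch >= 2.6 pass weights_only=False to torch.load.
import torch

representations = torch.load("out/tokens_and_embeddings_000100.pth")
tokens, embeddings = representations[0]
print(type(tokens), tokens.shape)          # padded token ids, [B, T]
print(type(embeddings), embeddings.shape)  # encoder representations, e.g. [B, n_layers*n_directions*d_model]
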