def _build_model(self) -> Estimator:
    """Initialize the estimator architecture on top of the base model.

    Sets up the (optional) scalar mix over encoder layers and the final
    feed-forward regressor, whose input width depends on the pooling scheme.
    """
    super()._build_model()
    if self.hparams.encoder_model != "LASER":
        # "mix" is kept as a sentinel string; anything else is a layer index.
        if self.hparams.layer == "mix":
            self.layer = self.hparams.layer
        else:
            self.layer = int(self.hparams.layer)
        if self.layer == "mix" and self.hparams.pool != "default":
            self.scalar_mix = ScalarMixWithDropout(
                mixture_size=self.encoder.num_layers,
                dropout=self.hparams.scalar_mix_dropout,
                do_layer_norm=True,
            )
        else:
            self.scalar_mix = None
    # Six feature vectors are concatenated downstream; "cls+avg" pooling
    # doubles each sentence embedding, hence the extra factor of 2.
    if self.hparams.pool == "cls+avg":
        in_features = self.encoder.output_units * 2 * 6
    else:
        in_features = self.encoder.output_units * 6
    self.ff = FeedForward(
        in_dim=in_features,
        hidden_sizes=self.hparams.hidden_sizes,
        activations=self.hparams.activations,
        dropout=self.hparams.dropout,
        # Older checkpoints predate `final_activation`; fall back to "Sigmoid".
        final_activation=getattr(self.hparams, "final_activation", "Sigmoid"),
    )
def _build_model(self) -> Estimator:
    """Initialize the estimator architecture on top of the base model.

    Builds the (optional) scalar mix over encoder layers and a feed-forward
    head sized for four concatenated feature vectors.
    """
    super()._build_model()
    if self.hparams.encoder_model != "LASER":
        layer = self.hparams.layer
        # "mix" is a sentinel string; any other value is a numeric layer index.
        self.layer = layer if layer == "mix" else int(layer)
        use_mix = self.layer == "mix" and self.hparams.pool != "default"
        self.scalar_mix = (
            ScalarMixWithDropout(
                mixture_size=self.encoder.num_layers,
                dropout=self.hparams.scalar_mix_dropout,
                do_layer_norm=True,
            )
            if use_mix
            else None
        )
    self.ff = FeedForward(
        in_dim=self.encoder.output_units * 4,
        hidden_sizes=self.hparams.hidden_sizes,
        activations=self.hparams.activations,
        dropout=self.hparams.dropout,
    )
def test_MNIST(self):
    """End-to-end sanity check: train FeedForward on the sklearn digits set
    (8x8 images, 10 classes) and assert it reaches >= 95% test accuracy and
    a reproducible final loss under a fixed seed."""
    seed_everything(3)

    # STEP 1: load dataset.
    images, labels = load_digits(return_X_y=True)
    images = [torch.Tensor(images[i, :]) for i in range(images.shape[0])]
    labels = torch.tensor(labels, dtype=torch.long)
    train_images, test_images, train_labels, test_labels = train_test_split(
        images, labels, test_size=0.2, random_state=42
    )
    train_dataset = list(zip(train_images, train_labels))
    test_dataset = list(zip(test_images, test_labels))

    # STEP 2: make dataset iterable.
    batch_size = 256
    n_iters = 80
    num_epochs = int(n_iters / (len(train_dataset) / batch_size))
    train_loader = torch.utils.data.DataLoader(
        dataset=train_dataset, batch_size=batch_size, shuffle=True
    )
    test_loader = torch.utils.data.DataLoader(
        dataset=test_dataset, batch_size=batch_size, shuffle=False
    )

    # STEP 3: instantiate model.
    model = FeedForward(
        in_dim=8 * 8,
        out_dim=10,
        hidden_sizes=100,
        activations="Tanh",
        final_activation=False,
    )

    # STEP 4: loss (softmax + cross entropy).
    criterion = nn.CrossEntropyLoss()

    # STEP 5: optimizer.
    learning_rate = 0.1
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    # STEP 7: train, evaluating every 10 steps.
    step = 0  # renamed from `iter`, which shadowed the builtin
    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(train_loader):
            # Load images with gradient accumulation capabilities.
            images = images.view(-1, 8 * 8).requires_grad_()
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            step += 1
            if step % 10 == 0:
                correct = 0
                total = 0
                # Evaluation must not track gradients: no requires_grad_ on
                # inputs, and the whole loop runs under torch.no_grad().
                with torch.no_grad():
                    for images, labels in test_loader:
                        images = images.view(-1, 8 * 8)
                        outputs = model(images)
                        # Prediction = argmax over class logits.
                        _, predicted = torch.max(outputs.data, 1)
                        total += labels.size(0)
                        correct += (predicted == labels).sum().item()
                accuracy = 100 * correct // total
                self.assertGreaterEqual(accuracy, 95)
    # Final loss is reproducible because of the fixed seed above.
    self.assertEqual(round(0.1257449835538864, 2), round(loss.item(), 2))
class CometEstimator(Estimator):
    """
    Estimator class that uses a pretrained encoder to extract features
    from the sequences and then passes those features to a feed forward
    estimator.

    :param hparams: Namespace containing the hyperparameters.
    """

    class ModelConfig(Estimator.ModelConfig):
        # Probability of swapping the SRC/REF feature order during training;
        # 0.0 disables the switch behavior entirely (see `forward`).
        switch_prob: float = 0.0

    def __init__(
        self,
        hparams: Namespace,
    ) -> None:
        super().__init__(hparams)

    def _build_model(self) -> Estimator:
        """
        Initializes the estimator architecture.

        Builds the (optional) scalar mix over encoder layers and the final
        feed-forward regressor whose input width depends on the pooling
        scheme (six feature vectors are concatenated in `forward`).
        """
        super()._build_model()
        if self.hparams.encoder_model != "LASER":
            # "mix" is kept as a sentinel string; anything else is parsed
            # as a numeric layer index.
            self.layer = (
                int(self.hparams.layer)
                if self.hparams.layer != "mix"
                else self.hparams.layer
            )
            self.scalar_mix = (
                ScalarMixWithDropout(
                    mixture_size=self.encoder.num_layers,
                    dropout=self.hparams.scalar_mix_dropout,
                    do_layer_norm=True,
                )
                if self.layer == "mix" and self.hparams.pool != "default"
                else None
            )
        # "cls+avg" pooling produces double-width sentence embeddings,
        # hence the extra factor of 2.
        input_emb_sz = (
            self.encoder.output_units * 6
            if self.hparams.pool != "cls+avg"
            else self.encoder.output_units * 2 * 6
        )
        self.ff = FeedForward(
            in_dim=input_emb_sz,
            hidden_sizes=self.hparams.hidden_sizes,
            activations=self.hparams.activations,
            dropout=self.hparams.dropout,
            final_activation=(
                self.hparams.final_activation
                if hasattr(
                    self.hparams, "final_activation"
                )  # compatability with older checkpoints!
                else "Sigmoid"
            ),
        )

    def configure_optimizers(
        self,
    ) -> Tuple[List[torch.optim.Optimizer], List[torch.optim.lr_scheduler.LambdaLR]]:
        """
        Sets different Learning rates for different parameter groups.

        Encoder layers get a layerwise-decayed learning rate; the
        feed-forward head (and scalar mix, when present) use the flat
        `learning_rate` hyperparameter.
        """
        layer_parameters = self.encoder.layerwise_lr(
            self.hparams.encoder_learning_rate, self.hparams.layerwise_decay
        )
        ff_parameters = [
            {"params": self.ff.parameters(), "lr": self.hparams.learning_rate}
        ]
        # `self.scalar_mix` is only created for non-LASER encoders, so the
        # LASER check must come first to avoid an AttributeError.
        if self.hparams.encoder_model != "LASER" and self.scalar_mix:
            scalar_mix_parameters = [
                {
                    "params": self.scalar_mix.parameters(),
                    "lr": self.hparams.learning_rate,
                }
            ]
            optimizer = self._build_optimizer(
                layer_parameters + ff_parameters + scalar_mix_parameters
            )
        else:
            optimizer = self._build_optimizer(layer_parameters + ff_parameters)
        scheduler = self._build_scheduler(optimizer)
        return [optimizer], [scheduler]

    def prepare_sample(
        self, sample: List[Dict[str, Union[str, float]]], inference: bool = False
    ) -> Union[
        Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]],
        Dict[str, torch.Tensor],
    ]:
        """
        Function that prepares a sample to input the model.

        :param sample: list of dictionaries.
        :param inference: If set to true prepares only the model inputs.

        :returns: Tuple with 2 dictionaries (model inputs and targets).
            If `inference=True` returns only the model inputs.
        """
        sample = collate_tensors(sample)
        src_inputs = self.encoder.prepare_sample(sample["src"])
        mt_inputs = self.encoder.prepare_sample(sample["mt"])
        ref_inputs = self.encoder.prepare_sample(sample["ref"])
        # Prefix every encoder-input key so the three (or four) segments can
        # coexist in a single flat kwargs dict consumed by `forward`.
        src_inputs = {"src_" + k: v for k, v in src_inputs.items()}
        mt_inputs = {"mt_" + k: v for k, v in mt_inputs.items()}
        ref_inputs = {"ref_" + k: v for k, v in ref_inputs.items()}
        if "alt" in sample:
            # Optional alternative reference used by switcheroo inference.
            alt_inputs = self.encoder.prepare_sample(sample["alt"])
            alt_inputs = {"alt_" + k: v for k, v in alt_inputs.items()}
            inputs = {**src_inputs, **mt_inputs, **ref_inputs, **alt_inputs}
        else:
            inputs = {**src_inputs, **mt_inputs, **ref_inputs}
        if inference:
            return inputs
        targets = {"score": torch.tensor(sample["score"], dtype=torch.float)}
        return inputs, targets

    def forward(
        self,
        src_tokens: torch.tensor,
        mt_tokens: torch.tensor,
        ref_tokens: torch.tensor,
        src_lengths: torch.tensor,
        mt_lengths: torch.tensor,
        ref_lengths: torch.tensor,
        alt_tokens: torch.tensor = None,
        alt_lengths: torch.tensor = None,
        **kwargs
    ) -> Dict[str, torch.Tensor]:
        """
        Function that encodes both Source, MT and Reference and returns a
        quality score.

        :param src_tokens: SRC sequences [batch_size x src_seq_len]
        :param mt_tokens: MT sequences [batch_size x mt_seq_len]
        :param ref_tokens: REF sequences [batch_size x ref_seq_len]
        :param src_lengths: SRC lengths [batch_size]
        :param mt_lengths: MT lengths [batch_size]
        :param ref_lengths: REF lengths [batch_size]
        :param alt_tokens: Alternative REF sequences [batch_size x alt_seq_len]
        :param alt_lengths: Alternative REF lengths [batch_size]

        :return: Dictionary with model outputs to be passed to the loss
            function.
        """
        src_sentemb = self.get_sentence_embedding(src_tokens, src_lengths)
        mt_sentemb = self.get_sentence_embedding(mt_tokens, mt_lengths)
        ref_sentemb = self.get_sentence_embedding(ref_tokens, ref_lengths)

        # Feature vectors: element-wise |difference| and product between the
        # MT embedding and each of the reference/source embeddings.
        diff_ref = torch.abs(mt_sentemb - ref_sentemb)
        diff_src = torch.abs(mt_sentemb - src_sentemb)

        prod_ref = mt_sentemb * ref_sentemb
        prod_src = mt_sentemb * src_sentemb

        if (
            not hasattr(
                self.hparams, "switch_prob"
            )  # compatability with older checkpoints!
            or self.hparams.switch_prob <= 0.0
        ):
            # Plain scoring path: one fixed feature ordering
            # (MT, REF, ref-features, src-features).
            embedded_sequences = torch.cat(
                (mt_sentemb, ref_sentemb, prod_ref, diff_ref, prod_src, diff_src),
                dim=1,
            )
            score = self.ff(embedded_sequences)
            if (alt_tokens is not None) and (alt_lengths is not None):
                # With an alternative reference, average the two scores.
                alt_sentemb = self.get_sentence_embedding(alt_tokens, alt_lengths)
                diff_alt = torch.abs(mt_sentemb - alt_sentemb)
                prod_alt = mt_sentemb * alt_sentemb
                embedded_sequences = torch.cat(
                    (mt_sentemb, alt_sentemb, prod_alt, diff_alt, prod_src, diff_src),
                    dim=1,
                )
                score = (score + self.ff(embedded_sequences)) / 2
            return {"score": score}

        if self.training:
            # With probability `switch_prob`, swap the REF- and SRC-feature
            # positions so the head becomes robust to the ordering.
            switch = random.random() < self.hparams.switch_prob
            if switch:
                embedded_sequences = torch.cat(
                    (mt_sentemb, ref_sentemb, prod_src, diff_src, prod_ref, diff_ref),
                    dim=1,
                )
            else:
                embedded_sequences = torch.cat(
                    (mt_sentemb, ref_sentemb, prod_ref, diff_ref, prod_src, diff_src),
                    dim=1,
                )
            return {"score": self.ff(embedded_sequences)}

        elif (alt_tokens is not None) and (alt_lengths is not None):
            # Switcheroo Inference!
            # Ensemble over six feature orderings built from the reference,
            # the alternative reference and the source; the final score is
            # their mean down-weighted by a dispersion-based confidence.
            alt_sentemb = self.get_sentence_embedding(alt_tokens, alt_lengths)
            diff_alt = torch.abs(mt_sentemb - alt_sentemb)
            prod_alt = mt_sentemb * alt_sentemb

            # Source + MT + Reference
            embedded_sequences = torch.cat(
                (mt_sentemb, ref_sentemb, prod_ref, diff_ref, prod_src, diff_src),
                dim=1,
            )
            src_mt_ref = self.ff(embedded_sequences)

            # Reference + MT + Source
            embedded_sequences = torch.cat(
                (mt_sentemb, ref_sentemb, prod_src, diff_src, prod_ref, diff_ref),
                dim=1,
            )
            ref_mt_src = self.ff(embedded_sequences)

            # Source + MT + Alternative Reference
            embedded_sequences = torch.cat(
                (mt_sentemb, alt_sentemb, prod_alt, diff_alt, prod_src, diff_src),
                dim=1,
            )
            src_mt_alt = self.ff(embedded_sequences)

            # Alternative Reference + MT + Source
            embedded_sequences = torch.cat(
                (mt_sentemb, alt_sentemb, prod_src, diff_src, prod_alt, diff_alt),
                dim=1,
            )
            alt_mt_src = self.ff(embedded_sequences)

            # Alternative Reference + MT + Reference
            embedded_sequences = torch.cat(
                (mt_sentemb, alt_sentemb, prod_alt, diff_alt, prod_ref, diff_ref),
                dim=1,
            )
            alt_mt_ref = self.ff(embedded_sequences)

            # Reference + MT + Alternative Reference
            embedded_sequences = torch.cat(
                (mt_sentemb, ref_sentemb, prod_ref, diff_ref, prod_alt, diff_alt),
                dim=1,
            )
            ref_mt_alt = self.ff(embedded_sequences)

            score = torch.stack(
                [src_mt_ref, ref_mt_src, src_mt_alt, alt_mt_src, alt_mt_ref, ref_mt_alt]
            )
            # Low std across orderings -> high confidence in the mean score.
            confidence = 1 - score.std(dim=0)
            return {"score": score.mean(dim=0) * confidence, "confidence": confidence}

        else:
            # Usual scoring: expected value over the two orderings weighted
            # by the training-time switch probability.
            embedded_sequences = torch.cat(
                (mt_sentemb, ref_sentemb, prod_ref, diff_ref, prod_src, diff_src),
                dim=1,
            )
            score = self.ff(embedded_sequences) * (1 - self.hparams.switch_prob)

            # Switch src and reference embeddings
            embedded_sequences = torch.cat(
                (mt_sentemb, ref_sentemb, prod_src, diff_src, prod_ref, diff_ref),
                dim=1,
            )
            return {
                "score": score + self.ff(embedded_sequences) * self.hparams.switch_prob
            }