def validate(model: GTransformer, iterator,
             text_encoder: WhitespaceEncoder) -> None:
    """
    Function that computes the loss over the validation set.

    :param model: Sequence-to-sequence transformer model.
    :param iterator: Iterator object over the test Dataset.
    :param text_encoder: Torch NLP text encoder for tokenization and vectorization. 
    """
    total_loss, steps = 0, 0
    # Testing
    with torch.no_grad():
        model.train(False)
        for sample in iterator:
            # 1) Prepare Sample
            src, src_lengths, trg, shifted_trg, trg_lengths = prepare_sample(
                sample, text_encoder)
            # 2) Run model
            lprobs = model(
                src=src.cuda(),
                trg=shifted_trg.cuda(),
                src_mask=lengths_to_mask(src_lengths).unsqueeze(1).cuda(),
                trg_mask=lengths_to_mask(trg_lengths).unsqueeze(1).cuda())
            # 3) Compute loss
            loss = F.nll_loss(lprobs.transpose(2, 1),
                              trg.cuda(),
                              reduction='mean')
            # 4) Update training metrics
            total_loss += float(loss.item())
            steps += int(trg.ne(PAD_IDX).sum())
    print(f'-- total test loss {total_loss:.4}')
    print(f'-- test steps {steps}')
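The transpose in the loss call above implies that the model returns log-probabilities shaped [batch_size x trg_seq_len x vocab_size], while F.nll_loss expects the class dimension second. A minimal standalone illustration of that shape convention, using toy tensors rather than real model outputs:

import torch
import torch.nn.functional as F

# Toy log-probabilities: batch of 2, target length 5, vocabulary of 10.
lprobs = torch.log_softmax(torch.randn(2, 5, 10), dim=-1)   # [batch x trg_len x vocab]
trg = torch.randint(0, 10, (2, 5))                          # [batch x trg_len]

# nll_loss wants the class (vocab) dimension second, hence the transpose(2, 1).
loss = F.nll_loss(lprobs.transpose(2, 1), trg, reduction='mean')
print(loss.item())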
def train_loop(configs: dict, model: GTransformer, opt: torch.optim.Adam,
               train: Dataset, test: Dataset,
               text_encoder: WhitespaceEncoder) -> GTransformer:
    """
    Main training loop.

    :param configs: Configs defined on the default.yaml file.
    :param model: Sequence-to-sequence transformer.
    :param opt: Adam optimizer.
    :param train: The dataset used for training.
    :param test: The dataset used for validation.
    :param text_encoder: Torch NLP text encoder for tokenization and vectorization.
    """
    for e in range(configs.get('num_epochs', 8)):
        print(f'\n Epoch {e}')
        model.train()

        nr_batches = math.ceil(len(train) / configs.get('batch_size', 8))
        train_iter, test_iter = get_iterators(configs, train, test)
        total_loss, steps = 0, 0

        for sample in tqdm.tqdm(train_iter, total=nr_batches):
            # 0) Zero out previous grads
            opt.zero_grad()

            # 1) Prepare Sample
            src, src_lengths, trg, shifted_trg, trg_lengths = prepare_sample(
                sample, text_encoder)

            # 2) Run model
            lprobs = model(
                src=src.cuda(),
                trg=shifted_trg.cuda(),
                src_mask=lengths_to_mask(src_lengths).unsqueeze(1).cuda(),
                trg_mask=lengths_to_mask(trg_lengths).unsqueeze(1).cuda())

            # 3) Compute loss
            loss = F.nll_loss(lprobs.transpose(2, 1),
                              trg.cuda(),
                              reduction='mean')
            loss.backward()

            # 4) Update training metrics
            total_loss += float(loss.item())
            steps += int(trg.ne(PAD_IDX).sum())  # count non-padding target tokens

            # 5) Clip gradients
            # - If the norm of the full gradient vector exceeds the configured
            #   threshold, rescale it down to that threshold.
            if configs.get('gradient_clipping', -1) > 0.0:
                nn.utils.clip_grad_norm_(model.parameters(),
                                         configs.get('gradient_clipping'))

            # 6) Optim step
            opt.step()

        print(f'-- total train loss {total_loss:.4}')
        # Rough cumulative token count, assuming each epoch sees a similar number of target tokens.
        total_steps = steps * (e + 1)
        print(f'-- train steps {total_steps}')
        validate(model, test_iter, text_encoder)
    return model
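As a side note on step 5 above: clip_grad_norm_ rescales all gradients in place so that their combined L2 norm does not exceed the given threshold, and returns the norm measured before clipping. A tiny self-contained sketch with a throwaway linear layer:

import torch
import torch.nn as nn

layer = nn.Linear(4, 2)
layer(torch.randn(8, 4)).sum().backward()

# Gradients are rescaled in place so their total L2 norm is at most max_norm;
# the returned value is the norm before clipping.
total_norm = nn.utils.clip_grad_norm_(layer.parameters(), max_norm=1.0)
print(float(total_norm))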
Example No. 3
    def forward(self, tokens, lengths):
        """ Usual pytorch forward function.
        :param tokens: text sequences [batch_size x src_seq_len]
        :param lengths: source lengths [batch_size]

        Returns:
            Dictionary with model outputs (e.g: logits)
        """
        # Trim unnecessary padding. On a single GPU this is a no-op, but when a batch
        # is split across GPUs each shard still carries padding sized for the original
        # full batch, so we cut the tokens down to the longest sequence in this shard.
        tokens = tokens[:, :lengths.max()]
        mask = lengths_to_mask(lengths, device=tokens.device)

        # Run BERT model.
        word_embeddings = self.transformer(tokens, mask)[0]

        # Average Pooling
        word_embeddings = mask_fill(0.0, tokens, word_embeddings,
                                    self.tokenizer.padding_index)
        sentemb = torch.sum(word_embeddings, 1)
        sum_mask = mask.unsqueeze(-1).expand(
            word_embeddings.size()).float().sum(1)
        sentemb = sentemb / sum_mask

        return {"logits": self.classification_head(sentemb)}
Example No. 4
    def forward(self, tokens: torch.Tensor, lengths: torch.Tensor, **kwargs) -> dict:
        """
        Encodes a batch of sequences.
        :param tokens: Torch tensor with the input sequences [batch_size x seq_len].
        :param lengths: Torch tensor with the length of each sequence [batch_size].

        Returns: 
            - 'sentemb': tensor [batch_size x 1024] with the sentence encoding.
            - 'wordemb': tensor [batch_size x seq_len x 1024] with the word level embeddings.
            - 'all_layers': List with the word_embeddings returned by each layer.
            - 'mask': torch.Tensor [batch_size x seq_len]
            - 'extra': tuple with the last_hidden_state [batch_size x seq_len x hidden_size],
                the pooler_output representing the entire sentence and the word embeddings for 
                all XLM-R layers (list of tensors [batch_size x seq_len x hidden_size]) 
        """
        mask = lengths_to_mask(lengths, device=tokens.device)
        # Run the RoBERTa model.
        last_hidden_states, pooler_output, all_layers = self.model(tokens, mask)
        return {
            "sentemb": pooler_output,
            "wordemb": last_hidden_states,
            "all_layers": all_layers,
            "mask": mask,
            "extra": (last_hidden_states, pooler_output, all_layers),
        }
Example No. 5
    def forward(self, tokens: torch.Tensor, lengths: torch.Tensor,
                **kwargs) -> dict:
        """
        Encodes a batch of sequences.
        :param tokens: Torch tensor with the input sequences [batch_size x seq_len].
        :param lengths: Torch tensor with the length of each sequence [batch_size].

        Returns: 
            - 'sentemb': tensor [batch_size x 1024] with the sentence encoding.
            - 'wordemb': tensor [batch_size x seq_len x 1024] with the word level embeddings.
            - 'all_layers': List with the word_embeddings returned by each layer.
            - 'mask': torch.Tensor [batch_size x seq_len]
            - 'extra': all XLM-R layers (list of tensors [batch_size x seq_len x hidden_size])
        """
        mask = lengths_to_mask(lengths, device=tokens.device)
        # Run RoBERTa model.
        all_layers = self.model.extract_features(tokens,
                                                 return_all_hiddens=True)
        return {
            "sentemb": all_layers[-1][:, 0, :],
            "wordemb": all_layers[-1],
            "all_layers": all_layers,
            "mask": mask,
            "extra": (all_layers),
        }
Example No. 6
def test_lengths_to_mask():
    assert lengths_to_mask([3]).sum() == 3
    assert lengths_to_mask(torch.tensor(3)).sum() == 3
    assert lengths_to_mask([1, 2, 3]).sum() == 6
    assert lengths_to_mask([1, 2, 3])[0].sum() == 1
    assert lengths_to_mask([1, 2, 3])[0][0].item() == 1
    assert lengths_to_mask(torch.tensor([1, 2, 3]))[0][0].item() == 1
    assert lengths_to_mask(torch.tensor([1.0, 2.0, 3.0]))[0][0].item() == 1
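The assertions above pin down the expected behaviour of lengths_to_mask. A minimal re-implementation that is consistent with them (an illustrative sketch, not necessarily the library's actual implementation):

import torch

def lengths_to_mask_sketch(lengths, device=None):
    # Accept Python lists, scalar tensors and float tensors, as the tests above do.
    lengths = torch.as_tensor(lengths, device=device)
    if lengths.dim() == 0:
        lengths = lengths.unsqueeze(0)
    lengths = lengths.long()
    # Positions [0, length) are True (valid); everything beyond is False (padding).
    positions = torch.arange(int(lengths.max()), device=device)
    return positions.unsqueeze(0) < lengths.unsqueeze(1)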
    def forward(
        self,
        input_ids: torch.Tensor,
        input_lengths: torch.Tensor,
        **kwargs,
    ) -> torch.Tensor:
        # Reduce unnecessary padding.
        input_ids = input_ids[:, : input_lengths.max()]

        mask = lengths_to_mask(input_lengths, device=input_ids.device)

        # Run model.
        word_embeddings = self.transformer(input_ids, mask)[0]

        # Pooling Layer
        sentemb = self.apply_pooling(input_ids, word_embeddings, mask)

        # Classify
        return self.classification_head(sentemb)
Example No. 8
    def _build_seq_eos_mask(self,
                            tokens: torch.Tensor,
                            eos_id=3,
                            curr_pos_in_seq=0):
        """
        маскирует токены, которые идут после eos токена
        """

        current_max_seq_len = tokens.size(1)
        lengths = []
        for seq in tokens:
            eos_indexes = torch.nonzero(seq == eos_id)
            if eos_indexes.size(0) == 0:
                lengths.append(current_max_seq_len)
            else:
                current_len = eos_indexes[0, 0]
                lengths.append(current_len)
        assert len(lengths) == tokens.size(0)

        mask: torch.Tensor = lengths_to_mask(lengths, device=tokens.device)
        return mask
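With toy data, the length extraction above behaves as follows (note that with this indexing the EOS token itself is also masked out):

import torch

# eos_id = 3: the first row has an EOS at position 2, the second row has no EOS at all.
tokens = torch.tensor([[5, 7, 3, 0, 0],
                       [4, 6, 8, 9, 2]])
eos_id = 3

lengths = []
for seq in tokens:
    eos_indexes = torch.nonzero(seq == eos_id)
    if eos_indexes.size(0) == 0:
        lengths.append(seq.size(0))             # no EOS: keep the full sequence
    else:
        lengths.append(int(eos_indexes[0, 0]))  # everything from the EOS onwards is masked
print(lengths)  # [2, 5]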
Example No. 9
    def forward(self, tokens: torch.Tensor,
                lengths: torch.Tensor) -> Dict[str, torch.Tensor]:
        """
        Encodes a batch of sequences.
        
        :param tokens: Torch tensor with the input sequences [batch_size x seq_len].
        :param lengths: Torch tensor with the length of each sequence [batch_size].

        :return: Dictionary with `sentemb` (tensor with dims [batch_size x output_units]), `wordemb` 
            (tensor with dims [batch_size x seq_len x output_units]), `mask` (input mask), 
            `all_layers` (List with word_embeddings from all layers), `extra` (all XLM-R layers as a list of tensors).
        """
        mask = lengths_to_mask(lengths, device=tokens.device)
        all_layers = self.model.extract_features(tokens,
                                                 return_all_hiddens=True)
        return {
            "sentemb": all_layers[-1][:, 0, :],
            "wordemb": all_layers[-1],
            "all_layers": all_layers,
            "mask": mask,
            "extra": (all_layers),
        }
def prepare_sample(
    sample: list, text_encoder: WhitespaceEncoder, label_encoder: LabelEncoder,
    max_length: int
) -> (torch.Tensor, torch.Tensor, torch.Tensor):
    """
    Function that receives a sample from the Dataset iterator and prepares the
    input to feed the transformer model.

    :param sample: list of dictionaries containing the inputs to build the batch
        (e.g: [{'source': 'This flight was amazing!', 'target': 'pos'},
               {'source': 'I hate Iberia', 'target': 'neg'}])
    :param text_encoder: Torch NLP text encoder for tokenization and vectorization.
    :param label_encoder: Torch NLP label encoder for vectorization of labels.
    :param max_length: Max length of the input sequences.
        If a sequence exceeds that value it is truncated.

    :return: input sequences, input mask and encoded target labels.
    """
    sample = collate_tensors(sample)
    input_seqs, input_lengths = text_encoder.batch_encode(sample['source'])
    target_seqs = label_encoder.batch_encode(sample['target'])
    # Truncate inputs and keep the lengths consistent so the mask matches the truncated width.
    if input_seqs.size(1) > max_length:
        input_seqs = input_seqs[:, :max_length]
        input_lengths = input_lengths.clamp(max=max_length)
    input_mask = lengths_to_mask(input_lengths).unsqueeze(1)
    return input_seqs, input_mask, target_seqs
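A toy illustration of why the lengths are clamped along with the truncation above: lengths_to_mask sizes the mask from max(lengths), so an unclamped length would yield a mask wider than the truncated inputs (the mask is rebuilt here with plain torch.arange instead of the library call):

import torch

input_lengths = torch.tensor([12, 7])
max_length = 10

# Clamp lengths to the truncation limit so the mask width matches the input width.
clamped = input_lengths.clamp(max=max_length)
mask = torch.arange(int(clamped.max())).unsqueeze(0) < clamped.unsqueeze(1)
print(mask.shape)  # torch.Size([2, 10]) -- matches inputs truncated to max_length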
def train_manager(configs: dict) -> None:
    """
    Model training function.
    :param configs: Dictionary with the configs defined in default.yaml
    """
    with open('.preprocess.pkl', 'rb') as preprocess_file:
        text_encoder, train, test = pickle.load(preprocess_file)

    set_seed(configs.get('seed', 3))
    print(f'- nr. of training examples {len(train)}')
    print(f'- nr. of test examples {len(test)}')
    print(f'- vocab size: {text_encoder.vocab_size}')

    # Build Transformer model
    model = GTransformer(emb_size=configs.get('embedding_size', 128),
                         heads=configs.get('num_heads', 8),
                         depth=configs.get('depth', 6),
                         seq_length=configs.get('max_length', 1000),
                         vocab_size=text_encoder.vocab_size)
    model.cuda()

    # Build Optimizer
    opt = torch.optim.Adam(lr=configs.get('lr', 0.0001),
                           params=model.parameters())

    # Training Loop
    model = train_loop(configs, model, opt, train, test, text_encoder)

    # Now that the model is trained, let's look at what it outputs for a few samples!
    sample = collate_tensors(SAMPLES)
    src_seqs, src_lengths = text_encoder.batch_encode(sample['source'])
    src_mask = lengths_to_mask(src_lengths).unsqueeze(1)
    ys, lengths = greedy_decode(model, src_seqs, src_mask)
    ys = text_encoder.batch_decode(ys, lengths)
    for i in range(len(SAMPLES)):
        print('\nTarget: {}\nModel:  {}'.format(SAMPLES[i]['target'], ys[i]))
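greedy_decode itself is not shown in this snippet. A hypothetical sketch of what a greedy decoder for the model interface used above might look like, assuming the model returns log-probabilities shaped [batch_size x trg_len x vocab_size] and that bos_idx/eos_idx are the text encoder's special-token indices:

import torch

def greedy_decode_sketch(model, src, src_mask, bos_idx, eos_idx, max_len=50):
    """Feed the model its own previous predictions and append the highest-scoring token."""
    device = src.device
    ys = torch.full((src.size(0), 1), bos_idx, dtype=torch.long, device=device)
    for _ in range(max_len - 1):
        # Padding-style target mask, matching the [batch x 1 x trg_len] shape used in training.
        trg_mask = torch.ones(ys.size(0), 1, ys.size(1), dtype=torch.bool, device=device)
        lprobs = model(src=src, trg=ys, src_mask=src_mask, trg_mask=trg_mask)
        next_token = lprobs[:, -1, :].argmax(dim=-1, keepdim=True)  # [batch x 1]
        ys = torch.cat([ys, next_token], dim=1)
        if (next_token == eos_idx).all():
            break
    # Length of each hypothesis up to (and including) the first EOS, if one was produced.
    lengths = torch.full((ys.size(0),), ys.size(1), dtype=torch.long, device=device)
    for i, row in enumerate(ys):
        eos_positions = (row == eos_idx).nonzero()
        if eos_positions.numel() > 0:
            lengths[i] = int(eos_positions[0, 0]) + 1
    return ys, lengths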
Example No. 12
    def forward(self, tokens: torch.Tensor,
                lengths: torch.Tensor) -> Dict[str, torch.Tensor]:
        """
        Encodes a batch of sequences.

        :param tokens: Torch tensor with the input sequences [batch_size x seq_len].
        :param lengths: Torch tensor with the length of each sequence [batch_size].

        :return: Dictionary with `sentemb` (tensor with dims [batch_size x output_units]), `wordemb` 
            (tensor with dims [batch_size x seq_len x output_units]), `mask` (input mask), 
            `all_layers` (List with word_embeddings from all layers), `extra` (tuple with the 
            last_hidden_state, the pooler_output representing the entire sentence and the word
            embeddings for all BERT layers).
        """
        mask = lengths_to_mask(lengths, device=tokens.device)
        last_hidden_states, pooler_output, all_layers = self.model(
            tokens, mask)
        return {
            "sentemb": pooler_output,
            "wordemb": last_hidden_states,
            "all_layers": all_layers,
            "mask": mask,
            "extra": (last_hidden_states, pooler_output, all_layers),
        }
Example No. 13
    def forward(self, tokens: torch.Tensor, lengths: torch.Tensor,
                **kwargs) -> Dict[str, torch.Tensor]:
        """
        Encodes a batch of sequences.

        :param tokens: Torch tensor with the input sequences [batch_size x seq_len].
        :param lengths: Torch tensor with the length of each sequence [batch_size].

        :return: Dictionary with `sentemb` (tensor with dims [batch_size x output_units]), `wordemb` 
            (tensor with dims [batch_size x seq_len x output_units]), `mask` (input mask), 
            `all_layers` (List with word_embeddings from all layers), `extra` (tuple with the LSTM outputs,
            hidden states and cell states).
        """
        # flatten_parameters() makes the LSTM weights contiguous in memory (useful after
        # e.g. DataParallel replication); it could arguably be called once in __init__ instead.
        self.lstm.flatten_parameters()
        tokens, lengths, unsorted_idx = sort_sequences(tokens, lengths)

        if self.left_pad:
            # convert left-padding to right-padding
            tokens = convert_padding_direction(
                tokens,
                self.padding_idx,
                left_to_right=True,
            )

        bsz, seqlen = tokens.size()

        # embed tokens
        x = self.embed_tokens(tokens)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # pack embedded source tokens into a PackedSequence
        packed_x = nn.utils.rnn.pack_padded_sequence(x, lengths.data.tolist())

        # apply LSTM
        if self.bidirectional:
            state_size = 2 * self._n_layers, bsz, self.hidden_size
        else:
            state_size = self._n_layers, bsz, self.hidden_size

        h0 = x.data.new(*state_size).zero_()
        c0 = x.data.new(*state_size).zero_()
        packed_outs, (final_hiddens,
                      final_cells) = self.lstm(packed_x, (h0, c0))

        # unpack outputs and apply dropout
        x, _ = nn.utils.rnn.pad_packed_sequence(
            packed_outs, padding_value=self.padding_value)
        assert list(x.size()) == [seqlen, bsz, self.output_units]
        word_embeddings = x

        if self.bidirectional:

            def combine_bidir(outs):
                return torch.cat(
                    [
                        torch.cat([outs[2 * i], outs[2 * i + 1]], dim=0).view(
                            1, bsz, self.output_units)
                        for i in range(self._n_layers)
                    ],
                    dim=0,
                )

            final_hiddens = combine_bidir(final_hiddens)
            final_cells = combine_bidir(final_cells)

        encoder_padding_mask = tokens.eq(self.padding_idx).t()

        # Set padded outputs to -inf so they are not selected by max-pooling
        padding_mask = encoder_padding_mask.unsqueeze(-1)
        if padding_mask.any():
            x = x.float().masked_fill_(padding_mask, float("-inf")).type_as(x)

        # Build the sentence embedding by max-pooling over the encoder outputs
        sentemb = x.max(dim=0)[0]

        model_out = self.reorder_output(
            encoder_out={
                "sentemb": sentemb,
                "extra": (word_embeddings, final_hiddens, final_cells),
            },
            new_order=unsorted_idx,
        )
        model_out["mask"] = lengths_to_mask(lengths, device=tokens.device)
        model_out["wordemb"] = model_out["extra"][0].transpose(0, 1)
        model_out["all_layers"] = [model_out["wordemb"]]
        return model_out
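The masked max-pooling at the end can be shown in isolation with toy tensors: padded time steps are pushed to -inf so the per-dimension maximum can only come from real tokens.

import torch

# Toy encoder outputs: T=4 time steps, batch of 2, hidden size 3 (time-major, as above).
x = torch.randn(4, 2, 3)
padding_mask = torch.tensor([[False, False],
                             [False, False],
                             [False, True],
                             [False, True]]).unsqueeze(-1)  # [T x B x 1], True marks padding

# Padded positions become -inf, so max-pooling over time ignores them.
x = x.masked_fill(padding_mask, float("-inf"))
sentemb = x.max(dim=0)[0]  # [B x hidden]
print(sentemb.shape)       # torch.Size([2, 3])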