示例#1
0
    def encode(self, data: 'np.ndarray', *args, **kwargs) -> 'np.ndarray':
        """
        Encode a batch of strings into one pooled embedding per string.

        The texts are tokenized and padded to ``self.max_length``, passed
        through ``self.model`` with ``output_hidden_states=True``, and the last
        entry of the returned hidden states is reduced over the sequence axis
        according to ``self.pooling_strategy``.

        :param data: a 1d array of string type in size `B`
        :return: an ndarray in size `B x D`
        :raises NotImplementedError: if ``self.pooling_strategy`` is not one of
            ``'auto'``, ``'mean'``, ``'max'`` or ``'min'``
        """
        try:
            if self.tokenizer.pad_token is None:
                # Padding needs a pad token; reuse the eos token when available.
                self.tokenizer.pad_token = self.tokenizer.eos_token
                if self.tokenizer.pad_token is None:
                    self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
                    # A newly added token enlarges the vocabulary, so the
                    # embedding matrix must grow with it; otherwise the new
                    # pad id can fall outside the embedding table.
                    self.model.resize_token_embeddings(len(self.tokenizer))
            ids_info = self.tokenizer.batch_encode_plus(
                data,
                max_length=self.max_length,
                truncation=self.truncation_strategy,
                pad_to_max_length=True)
        except ValueError:
            # Fallback kept from the original: resize the embeddings and retry.
            # NOTE(review): the retry does not pass ``truncation`` -- confirm
            # whether that omission is intentional.
            self.model.resize_token_embeddings(len(self.tokenizer))
            ids_info = self.tokenizer.batch_encode_plus(
                data, max_length=self.max_length, pad_to_max_length=True)
        token_ids_batch = self.array2tensor(ids_info['input_ids'])
        mask_ids_batch = self.array2tensor(ids_info['attention_mask'])
        with self.no_gradients():
            outputs = self.model(token_ids_batch,
                                 attention_mask=mask_ids_batch,
                                 output_hidden_states=True)

            # With hidden states requested, they are taken from the last
            # element of the model output; the final entry is pooled.
            hidden_states = outputs[-1]
            output_embeddings = hidden_states[-1]
            _mask_ids_batch = self.tensor2array(mask_ids_batch)
            _seq_output = self.tensor2array(output_embeddings)
            if self.pooling_strategy == 'auto':
                output = auto_reduce(_seq_output, _mask_ids_batch,
                                     self.model.base_model_prefix)
            elif self.pooling_strategy == 'mean':
                output = reduce_mean(_seq_output, _mask_ids_batch)
            elif self.pooling_strategy == 'max':
                output = reduce_max(_seq_output, _mask_ids_batch)
            elif self.pooling_strategy == 'min':
                output = reduce_min(_seq_output, _mask_ids_batch)
            else:
                self.logger.error(
                    f'pooling strategy not found: {self.pooling_strategy}')
                raise NotImplementedError
        return output
示例#2
0
    def encode(self, data: 'np.ndarray', *args, **kwargs) -> 'np.ndarray':
        """
        Turn a batch of strings into one pooled embedding per string.

        Each text is tokenized and padded to ``self.max_length``; an attention
        mask is derived by marking every non-pad token with 1. The model output
        is then pooled according to ``self.pooling_strategy``.

        :param data: a 1d array of string type in size `B`
        :return: an ndarray in size `B x D`
        :raises NotImplementedError: if ``self.pooling_strategy`` is not one of
            ``'cls'``, ``'mean'``, ``'max'`` or ``'min'``
        """
        # Collect (token ids, attention mask) pairs, one per input row.
        encoded_rows = []
        for row in range(data.shape[0]):
            ids = self.tokenizer.encode(data[row],
                                        pad_to_max_length=True,
                                        max_length=self.max_length)
            attention = [
                int(tok != self.tokenizer.pad_token_id) for tok in ids
            ]
            encoded_rows.append((ids, attention))

        token_ids_batch = torch.tensor([ids for ids, _ in encoded_rows])
        mask_ids_batch = torch.tensor([mask for _, mask in encoded_rows])

        with torch.no_grad():
            model_out = self.model(token_ids_batch,
                                   attention_mask=mask_ids_batch)
            seq_output, pooler_output, *_ = model_out

            strategy = self.pooling_strategy
            if strategy == 'cls':
                return pooler_output.numpy()

            if strategy not in ('mean', 'max', 'min'):
                self.logger.error("pooling strategy not found: {}".format(
                    strategy))
                raise NotImplementedError

            # The remaining strategies all reduce the sequence output under
            # the attention mask.
            seq_array = seq_output.numpy()
            mask_array = mask_ids_batch.numpy()
            if strategy == 'mean':
                return reduce_mean(seq_array, mask_array)
            if strategy == 'max':
                return reduce_max(seq_array, mask_array)
            return reduce_min(seq_array, mask_array)
示例#3
0
 def test_reduce_mean_with_correct_input(self):
     """Check that ``reduce_mean`` returns a NumPy array for well-formed input."""
     # Random data with three axes and a random mask with two axes.
     correct_data = np.random.rand(10, 10, 3)
     correct_mask = np.random.rand(10, 10)
     correct_mean = reduce_mean(correct_data, correct_mask)
     # Check the type itself rather than comparing the class name as a string.
     assert isinstance(correct_mean, np.ndarray)
示例#4
0
def test_reduce_mean():
    """Compare ``reduce_mean`` against a per-sample mean over the valid tokens.

    For every sample, the number of valid tokens is the sum of its mask; the
    expected value is the mean of that many leading rows of the sample.
    """
    pooled = reduce_mean(test_data, test_mask)
    for sample, sample_mask, pooled_row in zip(test_data, test_mask, pooled):
        valid_count = int(sum(sample_mask))
        expected_row = sample[:valid_count, :].mean(axis=0)
        np.testing.assert_array_equal(expected_row, pooled_row)
示例#5
0
 def test_reduce_mean_with_wrong_input(self):
     """Check that ``reduce_mean`` fails on data that has only two axes."""
     wrong_data = np.random.rand(10, 10)
     correct_mask = np.random.rand(10, 10)
     with self.assertRaises(Exception) as context:
         reduce_mean(wrong_data, correct_mask)
     # assertIn shows both strings when the check fails, unlike assertTrue.
     self.assertIn('tuple index out of range', str(context.exception))