def encode(self, data: 'np.ndarray', *args, **kwargs) -> 'np.ndarray':
    """Encode a batch of strings into fixed-size embeddings.

    :param data: a 1d array of string type in size `B`
    :return: an ndarray in size `B x D`
    """
    try:
        # Some tokenizers (e.g. GPT-2 family) ship without a pad token:
        # fall back to EOS, and add a dedicated [PAD] token as a last resort.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        ids_info = self.tokenizer.batch_encode_plus(
            data,
            max_length=self.max_length,
            truncation=self.truncation_strategy,
            pad_to_max_length=True)
    except ValueError:
        # Adding [PAD] grew the vocabulary; resize the model's embedding
        # matrix to match and retry the tokenization.
        self.model.resize_token_embeddings(len(self.tokenizer))
        # FIX: the retry previously dropped `truncation=...`, so the
        # fallback path tokenized with different truncation behavior than
        # the primary path; keep the two calls consistent.
        ids_info = self.tokenizer.batch_encode_plus(
            data,
            max_length=self.max_length,
            truncation=self.truncation_strategy,
            pad_to_max_length=True)
    token_ids_batch = self.array2tensor(ids_info['input_ids'])
    mask_ids_batch = self.array2tensor(ids_info['attention_mask'])
    with self.no_gradients():
        outputs = self.model(token_ids_batch,
                             attention_mask=mask_ids_batch,
                             output_hidden_states=True)
        # outputs[-1] is the tuple of all hidden states; take the last layer.
        hidden_states = outputs[-1]
        output_embeddings = hidden_states[-1]
        _mask_ids_batch = self.tensor2array(mask_ids_batch)
        _seq_output = self.tensor2array(output_embeddings)
        if self.pooling_strategy == 'auto':
            output = auto_reduce(_seq_output, _mask_ids_batch,
                                 self.model.base_model_prefix)
        elif self.pooling_strategy == 'mean':
            output = reduce_mean(_seq_output, _mask_ids_batch)
        elif self.pooling_strategy == 'max':
            output = reduce_max(_seq_output, _mask_ids_batch)
        elif self.pooling_strategy == 'min':
            output = reduce_min(_seq_output, _mask_ids_batch)
        else:
            self.logger.error(
                f'pooling strategy not found: {self.pooling_strategy}')
            raise NotImplementedError
    return output
def encode(self, data: 'np.ndarray', *args, **kwargs) -> 'np.ndarray':
    """Embed a batch of strings with the underlying transformer model.

    :param data: a 1d array of string type in size `B`
    :return: an ndarray in size `B x D`
    """
    # Tokenize every string to a fixed-length id sequence, then derive the
    # attention mask (1 = real token, 0 = padding).
    all_token_ids = [
        self.tokenizer.encode(text,
                              pad_to_max_length=True,
                              max_length=self.max_length)
        for text in data
    ]
    all_mask_ids = [
        [0 if tok == self.tokenizer.pad_token_id else 1 for tok in ids]
        for ids in all_token_ids
    ]
    token_ids_batch = torch.tensor(all_token_ids)
    mask_ids_batch = torch.tensor(all_mask_ids)

    # Forward pass without gradient tracking; the model returns the
    # per-token sequence output followed by the pooled output.
    with torch.no_grad():
        seq_output, pooler_output, *_ = self.model(
            token_ids_batch, attention_mask=mask_ids_batch)
        if self.pooling_strategy == 'cls':
            output = pooler_output.numpy()
        elif self.pooling_strategy == 'mean':
            output = reduce_mean(seq_output.numpy(), mask_ids_batch.numpy())
        elif self.pooling_strategy == 'max':
            output = reduce_max(seq_output.numpy(), mask_ids_batch.numpy())
        elif self.pooling_strategy == 'min':
            output = reduce_min(seq_output.numpy(), mask_ids_batch.numpy())
        else:
            self.logger.error("pooling strategy not found: {}".format(
                self.pooling_strategy))
            raise NotImplementedError
    return output
def test_reduce_mean_with_correct_input(self):
    """reduce_mean on well-shaped 3d data + 2d mask yields a numpy ndarray."""
    data = np.random.rand(10, 10, 3)
    mask = np.random.rand(10, 10)
    pooled = reduce_mean(data, mask)
    assert type(pooled).__name__ == 'ndarray'
def test_reduce_mean():
    """Mean pooling must average only the masked-in (valid) tokens."""
    pooled = reduce_mean(test_data, test_mask)
    for sample, sample_mask, got in zip(test_data, test_mask, pooled):
        valid = int(sum(sample_mask))
        expected = sample[:valid, :].mean(axis=0)
        np.testing.assert_array_equal(expected, got)
def test_reduce_mean_with_wrong_input(self):
    """reduce_mean on 2d data (missing the feature axis) must raise."""
    bad_data = np.random.rand(10, 10)
    mask = np.random.rand(10, 10)
    with self.assertRaises(Exception) as ctx:
        reduce_mean(bad_data, mask)
    self.assertTrue('tuple index out of range' in str(ctx.exception))