def test_xlm_token_tensorizer(self):
    vocab = self._mock_vocab()
    xlm = ScriptXLMTensorizer(
        tokenizer=ScriptDoNothingTokenizer(),
        token_vocab=vocab,
        language_vocab=ScriptVocabulary(["ar", "cn", "en"]),
        max_seq_len=256,
        default_language="en",
    )
    rand_tokens = [
        [str(random.randint(100, 200)) for i in range(20)],
        [str(random.randint(100, 200)) for i in range(10)],
    ]

    tokens, pad_masks, languages, positions = xlm.tensorize(
        tokens=squeeze_2d(rand_tokens)
    )
    tokens = tokens.tolist()
    # eos token
    self.assertEqual(tokens[0][0], 202)
    self.assertEqual(tokens[0][-1], 202)
    # pad token
    self.assertEqual(tokens[1][12:], [200] * 10)

    languages = languages.tolist()
    self.assertEqual(languages[0], [2] * len(tokens[0]))
    self.assertEqual(languages[1][12:], [0] * 10)

    tokens, pad_masks, languages, positions = xlm.tensorize(
        tokens=squeeze_2d(rand_tokens), languages=squeeze_1d(["cn", "en"])
    )
    languages = languages.tolist()
    self.assertEqual(languages[0][:], [1] * len(tokens[0]))
    self.assertEqual(languages[1][:12], [2] * 12)
def forward(
    self,
    right_texts: Optional[List[str]] = None,
    left_texts: Optional[List[str]] = None,
    right_tokens: Optional[List[List[str]]] = None,
    left_tokens: Optional[List[List[str]]] = None,
    languages: Optional[List[str]] = None,
    right_dense_feat: Optional[List[List[float]]] = None,
    left_dense_feat: Optional[List[List[float]]] = None,
) -> torch.Tensor:
    if right_dense_feat is None or left_dense_feat is None:
        raise RuntimeError("Expect dense feature.")

    right_inputs: ScriptBatchInput = ScriptBatchInput(
        texts=resolve_texts(right_texts),
        tokens=squeeze_2d(right_tokens),
        languages=squeeze_1d(languages),
    )
    left_inputs: ScriptBatchInput = ScriptBatchInput(
        texts=resolve_texts(left_texts),
        tokens=squeeze_2d(left_tokens),
        languages=squeeze_1d(languages),
    )

    right_dense_feat = self.right_normalizer.normalize(right_dense_feat)
    left_dense_feat = self.left_normalizer.normalize(left_dense_feat)
    right_dense_tensor = torch.tensor(right_dense_feat, dtype=torch.float)
    left_dense_tensor = torch.tensor(left_dense_feat, dtype=torch.float)

    sentence_embedding = self._forward(
        right_inputs, left_inputs, right_dense_tensor, left_dense_tensor
    )
    return sentence_embedding
def test_xlm_tensorizer_input_sequence_exceeds_max_seq_len(self):
    xlm = self._mock_xlm_tensorizer(max_seq_len=20)
    rand_tokens = self.get_rand_tokens([30, 10])

    tokens, pad_masks, languages, positions = xlm.tensorize(
        tokens=squeeze_2d(rand_tokens),
    )
    sig_idxs = [len(t) + 2 for t in rand_tokens]
    expected_token_size = min(max(sig_idxs), xlm.max_seq_len)
    expected_token_padding = [
        max(0, expected_token_size - cnt) for cnt in sig_idxs
    ]
    sig_idxs = [expected_token_size - cnt for cnt in expected_token_padding]

    padding_key = {
        tokens: 200,
        pad_masks: 0,
        languages: 0,
        positions: 0,
    }
    # verify padding
    for output_tensor, pad_val in padding_key.items():
        self.validate_padding(
            output_tensor,
            pad_val,
            significant_idxs=sig_idxs,
            expected_batch_size=len(rand_tokens),
            expected_token_padding=expected_token_padding,
        )
def forward(self, tokens: List[List[str]], languages: Optional[List[str]] = None):
    input_tensors = self.tensorizer(
        pre_tokenized=squeeze_2d(tokens), languages=squeeze_1d(languages)
    )
    logits = self.model(input_tensors)
    return self.output_layer(logits)
def forward(
    self,
    right_dense_feat: List[List[float]],
    left_dense_feat: List[List[float]],
    texts: Optional[List[str]] = None,
    # multi_texts is of shape [batch_size, num_columns]
    multi_texts: Optional[List[List[str]]] = None,
    tokens: Optional[List[List[str]]] = None,
    languages: Optional[List[str]] = None,
):
    inputs: ScriptBatchInput = ScriptBatchInput(
        texts=resolve_texts(texts, multi_texts),
        tokens=squeeze_2d(tokens),
        languages=squeeze_1d(languages),
    )
    input_tensors = self.tensorizer(inputs)

    right_dense_feat = self.right_normalizer.normalize(right_dense_feat)
    left_dense_feat = self.left_normalizer.normalize(left_dense_feat)
    right_dense_tensor = torch.tensor(right_dense_feat, dtype=torch.float)
    left_dense_tensor = torch.tensor(left_dense_feat, dtype=torch.float)
    if self.tensorizer.device != "":
        right_dense_tensor = right_dense_tensor.to(self.tensorizer.device)
        left_dense_tensor = left_dense_tensor.to(self.tensorizer.device)

    logits = self.model(input_tensors, right_dense_tensor, left_dense_tensor)
    return self.output_layer(logits)
def test_roberta_tensorizer_input_exceeds_max_seq_len(self):
    roberta = self._mock_roberta_tensorizer(max_seq_len=28)
    rand_tokens = self.get_rand_tokens([25, 15, 5, 30])
    expected_batch_size = 4
    expected_token_size = 28

    tokens, pad_mask, start_indices, end_indices, positions = roberta.tensorize(
        tokens=squeeze_2d(rand_tokens)
    )
    sig_idxs = [1 + len(t) + 1 for t in rand_tokens]
    expected_token_padding = [expected_token_size - num for num in sig_idxs]

    padding_key = {
        tokens: 200,
        pad_mask: 0,
        start_indices: 0,
        end_indices: 0,
        positions: 0,
    }
    # verify padding
    for output_tensor, pad_val in padding_key.items():
        self.validate_padding(
            output_tensor,
            pad_val,
            significant_idxs=sig_idxs,
            expected_batch_size=expected_batch_size,
            expected_token_padding=expected_token_padding,
        )
def forward(
    self,
    texts: Optional[List[str]] = None,
    # multi_texts is of shape [batch_size, num_columns]
    multi_texts: Optional[List[List[str]]] = None,
    tokens: Optional[List[List[str]]] = None,
    languages: Optional[List[str]] = None,
    dense_feat: Optional[List[List[float]]] = None,
) -> torch.Tensor:
    if dense_feat is None:
        raise RuntimeError("Expect dense feature.")

    inputs: ScriptBatchInput = ScriptBatchInput(
        texts=resolve_texts(texts, multi_texts),
        tokens=squeeze_2d(tokens),
        languages=squeeze_1d(languages),
    )
    # call model
    dense_feat = self.normalizer.normalize(dense_feat)
    dense_tensor = torch.tensor(dense_feat, dtype=torch.float)
    sentence_embedding = self._forward(inputs, dense_tensor)
    if self.concat_dense:
        return torch.cat([sentence_embedding, dense_tensor], 1)
    else:
        return sentence_embedding
def test_roberta_tensorizer_default_padding(self):
    roberta = self._mock_roberta_tensorizer()
    rand_tokens = self.get_rand_tokens([20, 5, 15])
    start_placeholder = 1
    end_placeholder = 1

    # number of indices that hold significant values for each element of
    # rand_tokens, i.e. [22, 7, 17]
    sig_idxs = [
        start_placeholder + len(t) + end_placeholder for t in rand_tokens
    ]
    # pad every sequence up to the longest one, i.e. padding of [0, 15, 5]
    expected_token_padding = [max(sig_idxs) - num for num in sig_idxs]

    tokens, pad_mask, start_indices, end_indices, positions = roberta.tensorize(
        tokens=squeeze_2d(rand_tokens)
    )
    padding_key = {
        tokens: 200,
        pad_mask: 0,
        start_indices: 0,
        end_indices: 0,
        positions: 0,
    }
    # verify padding
    for output_tensor, pad_val in padding_key.items():
        self.validate_padding(
            output_tensor,
            pad_val,
            significant_idxs=sig_idxs,
            expected_batch_size=len(rand_tokens),
            expected_token_padding=expected_token_padding,
        )
def test_xlm_tensorizer_batch_padding(self):
    xlm = self._mock_xlm_tensorizer()
    batch_padding_control = [0, 3, 6]
    xlm.set_padding_control("batch_length", batch_padding_control)
    rand_tokens = self.get_rand_tokens([20, 10])

    tokens, pad_masks, languages, positions = xlm.tensorize(
        tokens=squeeze_2d(rand_tokens),
    )
    sig_idxs = [len(t) + 2 for t in rand_tokens] + [0]
    expected_token_size = max(sig_idxs)
    expected_batch_size = min(
        max(len(rand_tokens), batch_padding_control[1]), xlm.max_seq_len
    )
    expected_token_padding = [expected_token_size - cnt for cnt in sig_idxs]

    padding_key = {
        tokens: 200,
        pad_masks: 0,
        languages: 0,
        positions: 0,
    }
    # verify padding
    for output_tensor, pad_val in padding_key.items():
        self.validate_padding(
            output_tensor,
            pad_val,
            significant_idxs=sig_idxs,
            expected_batch_size=expected_batch_size,
            expected_token_padding=expected_token_padding,
        )
def forward(
    self,
    tokens: List[List[str]],
    dense_feat: List[List[float]],
    languages: Optional[List[str]] = None,
):
    input_tensors = self.tensorizer(
        pre_tokenized=squeeze_2d(tokens), languages=squeeze_1d(languages)
    )
    logits = self.model(input_tensors, torch.tensor(dense_feat).float())
    return self.output_layer(logits)
def forward(
    self,
    right_texts: Optional[List[str]] = None,
    left_texts: Optional[List[str]] = None,
    right_tokens: Optional[List[List[str]]] = None,
    left_tokens: Optional[List[List[str]]] = None,
    languages: Optional[List[str]] = None,
) -> torch.Tensor:
    right_inputs: ScriptBatchInput = ScriptBatchInput(
        texts=resolve_texts(right_texts),
        tokens=squeeze_2d(right_tokens),
        languages=squeeze_1d(languages),
    )
    left_inputs: ScriptBatchInput = ScriptBatchInput(
        texts=resolve_texts(left_texts),
        tokens=squeeze_2d(left_tokens),
        languages=squeeze_1d(languages),
    )
    return self._forward(right_inputs, left_inputs)
def forward(
    self,
    right_dense_feat: List[List[float]],
    left_dense_feat: List[List[float]],
    right_texts: Optional[List[str]] = None,
    left_texts: Optional[List[str]] = None,
    right_tokens: Optional[List[List[str]]] = None,
    left_tokens: Optional[List[List[str]]] = None,
    languages: Optional[List[str]] = None,
):
    right_inputs: ScriptBatchInput = ScriptBatchInput(
        texts=resolve_texts(right_texts),
        tokens=squeeze_2d(right_tokens),
        languages=squeeze_1d(languages),
    )
    right_input_tensors = self.right_tensorizer(right_inputs)
    left_inputs: ScriptBatchInput = ScriptBatchInput(
        texts=resolve_texts(left_texts),
        tokens=squeeze_2d(left_tokens),
        languages=squeeze_1d(languages),
    )
    left_input_tensors = self.left_tensorizer(left_inputs)

    right_dense_feat = self.right_normalizer.normalize(right_dense_feat)
    left_dense_feat = self.left_normalizer.normalize(left_dense_feat)
    right_dense_tensor = torch.tensor(right_dense_feat, dtype=torch.float)
    left_dense_tensor = torch.tensor(left_dense_feat, dtype=torch.float)
    if self.right_tensorizer.device != "":
        right_dense_tensor = right_dense_tensor.to(self.right_tensorizer.device)
    if self.left_tensorizer.device != "":
        left_dense_tensor = left_dense_tensor.to(self.left_tensorizer.device)

    logits = self.model(
        right_input_tensors,
        left_input_tensors,
        right_dense_tensor,
        left_dense_tensor,
    )
    return self.output_layer(logits)
def forward(
    self,
    # first input
    texts1: Optional[List[str]] = None,
    tokens1: Optional[List[List[str]]] = None,
    # second input
    texts2: Optional[List[str]] = None,
    tokens2: Optional[List[List[str]]] = None,
):
    inputs1: ScriptBatchInput = ScriptBatchInput(
        texts=squeeze_1d(texts1),
        tokens=squeeze_2d(tokens1),
        languages=None,
    )
    inputs2: ScriptBatchInput = ScriptBatchInput(
        texts=squeeze_1d(texts2),
        tokens=squeeze_2d(tokens2),
        languages=None,
    )
    input_tensors1 = self.tensorizer1(inputs1)
    input_tensors2 = self.tensorizer2(inputs2)
    return self.model(input_tensors1, input_tensors2)
def test_xlm_token_tensorizer(self):
    xlm = self._mock_xlm_tensorizer()
    rand_tokens = self.get_rand_tokens([20, 10])

    tokens, pad_masks, languages, positions = xlm.tensorize(
        tokens=squeeze_2d(rand_tokens)
    )
    tokens = tokens.tolist()
    # eos token
    self.assertEqual(tokens[0][0], 202)
    self.assertEqual(tokens[0][-1], 202)
    # pad token
    self.assertEqual(tokens[1][12:], [200] * 10)

    languages = languages.tolist()
    self.assertEqual(languages[0], [2] * len(tokens[0]))
    self.assertEqual(languages[1][12:], [0] * 10)

    tokens, pad_masks, languages, positions = xlm.tensorize(
        tokens=squeeze_2d(rand_tokens), languages=squeeze_1d(["cn", "en"])
    )
    languages = languages.tolist()
    self.assertEqual(languages[0][:], [1] * len(tokens[0]))
    self.assertEqual(languages[1][:12], [2] * 12)
def forward(
    self,
    right_texts: Optional[List[str]] = None,
    left_texts: Optional[List[str]] = None,
    right_tokens: Optional[List[List[str]]] = None,
    left_tokens: Optional[List[List[str]]] = None,
    languages: Optional[List[str]] = None,
):
    right_inputs: ScriptBatchInput = ScriptBatchInput(
        texts=resolve_texts(right_texts),
        tokens=squeeze_2d(right_tokens),
        languages=squeeze_1d(languages),
    )
    right_input_tensors = self.right_tensorizer(right_inputs)
    left_inputs: ScriptBatchInput = ScriptBatchInput(
        texts=resolve_texts(left_texts),
        tokens=squeeze_2d(left_tokens),
        languages=squeeze_1d(languages),
    )
    left_input_tensors = self.left_tensorizer(left_inputs)
    logits = self.model(right_input_tensors, left_input_tensors)
    return self.output_layer(logits)
def forward(
    self,
    texts: Optional[List[str]] = None,
    tokens: Optional[List[List[str]]] = None,
    languages: Optional[List[str]] = None,
):
    inputs: ScriptBatchInput = ScriptBatchInput(
        texts=squeeze_1d(texts),
        tokens=squeeze_2d(tokens),
        languages=squeeze_1d(languages),
    )
    input_tensors = self.tensorizer(inputs)
    logits = self.model(input_tensors)
    return self.output_layer(logits)
def tensorize_1d(
    self,
    texts: Optional[List[str]] = None,
    tokens: Optional[List[List[str]]] = None,
):
    """
    Process raw single-sentence inputs into model input tensors. It supports
    two input formats:
        1) multiple rows of raw single sentences
        2) multiple rows of pre-tokenized single sentences
    This function should handle the logic of calling numberize() and also
    padding the numberized result.
    """
    return self.tensorize(squeeze_1d(texts), squeeze_2d(tokens))
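A minimal usage sketch for the two input formats accepted by tensorize_1d, assuming a scripted tensorizer instance named `tensorizer` that exposes the method above; the sentences and token lists are hypothetical and purely illustrative.

# Hypothetical usage sketch (assumes `tensorizer` implements tensorize_1d as above).
# Format 1: multiple rows of raw single sentences; tokenization happens inside.
outputs_from_texts = tensorizer.tensorize_1d(
    texts=["hello world", "good morning"]
)
# Format 2: multiple rows of pre-tokenized single sentences; tokenization is skipped.
outputs_from_tokens = tensorizer.tensorize_1d(
    tokens=[["hello", "world"], ["good", "morning"]]
)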
def forward(
    self,
    texts: Optional[List[str]] = None,
    tokens: Optional[List[List[str]]] = None,
    languages: Optional[List[str]] = None,
    dense_feat: Optional[List[List[float]]] = None,
) -> torch.Tensor:
    inputs: ScriptBatchInput = ScriptBatchInput(
        texts=squeeze_1d(texts),
        tokens=squeeze_2d(tokens),
        languages=squeeze_1d(languages),
    )
    input_tensors = self.tensorizer(inputs)
    # call model
    return self.model(input_tensors)[self.index]
def forward(
    self,
    texts: Optional[List[str]] = None,
    # multi_texts is of shape [batch_size, num_columns]
    multi_texts: Optional[List[List[str]]] = None,
    tokens: Optional[List[List[str]]] = None,
    languages: Optional[List[str]] = None,
    dense_feat: Optional[List[List[float]]] = None,
) -> List[torch.Tensor]:
    inputs: ScriptBatchInput = ScriptBatchInput(
        texts=resolve_texts(texts, multi_texts),
        tokens=squeeze_2d(tokens),
        languages=squeeze_1d(languages),
    )
    return self._forward(inputs)
def test_xlm_tensorizer_default_padding(self):
    xlm = self._mock_xlm_tensorizer()
    rand_tokens = self.get_rand_tokens([20, 10])

    tokens, pad_masks, languages, positions = xlm.tensorize(
        tokens=squeeze_2d(rand_tokens)
    )
    sig_idxs = [len(t) + 2 for t in rand_tokens]
    expected_token_size = max(sig_idxs)
    expected_token_padding = [expected_token_size - cnt for cnt in sig_idxs]
    expected_batch_size = len(rand_tokens)

    # verify tensorized tokens padding
    tokens = tokens.tolist()
    self.assertEqual(len(tokens), expected_batch_size)
    self.assertEqual(max(len(t) for t in tokens), expected_token_size)
    self.assertEqual(min(len(t) for t in tokens), expected_token_size)
    for i in range(expected_batch_size):
        self.assertEqual(
            tokens[i][sig_idxs[i]:], [200] * expected_token_padding[i]
        )

    # verify tensorized languages
    languages = languages.tolist()
    self.assertEqual(len(languages), expected_batch_size)
    for i in range(expected_batch_size):
        self.assertEqual(languages[i][:sig_idxs[i]], [2] * sig_idxs[i])
        self.assertEqual(
            languages[i][sig_idxs[i]:], [0] * expected_token_padding[i]
        )

    # verify tensorized positions
    positions = positions.tolist()
    self.assertEqual(len(positions), expected_batch_size)
    for i in range(expected_batch_size):
        self.assertEqual(
            positions[i][sig_idxs[i]:], [0] * expected_token_padding[i]
        )

    # verify pad_masks
    pad_masks = pad_masks.tolist()
    self.assertEqual(len(pad_masks), expected_batch_size)
    for i in range(expected_batch_size):
        self.assertEqual(pad_masks[i][:sig_idxs[i]], [1] * sig_idxs[i])
        self.assertEqual(
            pad_masks[i][sig_idxs[i]:], [0] * expected_token_padding[i]
        )
def forward(
    self,
    texts: Optional[List[str]] = None,
    # multi_texts is of shape [batch_size, num_columns]
    multi_texts: Optional[List[List[str]]] = None,
    tokens: Optional[List[List[str]]] = None,
    languages: Optional[List[str]] = None,
):
    inputs: ScriptBatchInput = ScriptBatchInput(
        texts=resolve_texts(texts, multi_texts),
        tokens=squeeze_2d(tokens),
        languages=squeeze_1d(languages),
    )
    input_tensors = self.tensorizer(inputs)
    logits = self.model(input_tensors)
    return self.output_layer(logits)
def forward(
    self,
    dense_feat: List[List[float]],
    texts: Optional[List[str]] = None,
    tokens: Optional[List[List[str]]] = None,
    languages: Optional[List[str]] = None,
):
    inputs: ScriptBatchInput = ScriptBatchInput(
        texts=squeeze_1d(texts),
        tokens=squeeze_2d(tokens),
        languages=squeeze_1d(languages),
    )
    input_tensors = self.tensorizer(inputs)
    dense_feat = self.normalizer.normalize(dense_feat)
    logits = self.model(
        input_tensors, torch.tensor(dense_feat, dtype=torch.float)
    )
    return self.output_layer(logits)
def test_roberta_tensorizer_sequence_batch_padding(self):
    roberta = self._mock_roberta_tensorizer()
    seq_padding_control = [0, 48, 256]
    batch_padding_control = [0, 3, 6]
    roberta.set_padding_control("batch_length", batch_padding_control)
    roberta.set_padding_control("sequence_length", seq_padding_control)
    rand_tokens = self.get_rand_tokens([25, 15, 5, 30])
    expected_batch_size = 6
    expected_token_size = 48

    tokens, pad_mask, start_indices, end_indices, positions = roberta.tensorize(
        tokens=squeeze_2d(rand_tokens)
    )
    # two all-padding rows are appended to reach the batch padding size of 6
    sig_idxs = [1 + len(t) + 1 for t in rand_tokens] + [0, 0]
    expected_token_padding = [expected_token_size - num for num in sig_idxs]

    padding_key = {
        tokens: 200,
        pad_mask: 0,
        start_indices: 0,
        end_indices: 0,
        positions: 0,
    }
    # verify padding
    for output_tensor, pad_val in padding_key.items():
        self.validate_padding(
            output_tensor,
            pad_val,
            significant_idxs=sig_idxs,
            expected_batch_size=expected_batch_size,
            expected_token_padding=expected_token_padding,
        )
def forward(
    self,
    texts: Optional[List[str]] = None,
    tokens: Optional[List[List[str]]] = None,
    languages: Optional[List[str]] = None,
    dense_feat: Optional[List[List[float]]] = None,
) -> torch.Tensor:
    if dense_feat is None:
        raise RuntimeError("Expect dense feature.")

    inputs: ScriptBatchInput = ScriptBatchInput(
        texts=squeeze_1d(texts),
        tokens=squeeze_2d(tokens),
        languages=squeeze_1d(languages),
    )
    input_tensors = self.tensorizer(inputs)
    # call model
    dense_feat = self.normalizer.normalize(dense_feat)
    dense_tensor = torch.tensor(dense_feat, dtype=torch.float)
    encoder_embedding = self.model(input_tensors, dense_tensor)[self.index]
    return torch.cat([encoder_embedding, dense_tensor], 1)
def test_roberta_tensorizer_seq_padding_size_exceeds_max_seq_len(self):
    roberta = self._mock_roberta_tensorizer(max_seq_len=20)
    seq_padding_control = [0, 32, 256]
    roberta.set_padding_control("sequence_length", seq_padding_control)
    rand_tokens = self.get_rand_tokens([30, 20, 10])

    tokens, pad_mask, start_indices, end_indices, positions = roberta.tensorize(
        tokens=squeeze_2d(rand_tokens)
    )
    sig_idxs = [len(t) + 2 for t in rand_tokens]
    expected_batch_size = 3
    expected_token_size = min(
        max(max(sig_idxs), seq_padding_control[1]), roberta.max_seq_len
    )
    expected_token_padding = [
        max(0, expected_token_size - cnt) for cnt in sig_idxs
    ]
    sig_idxs = [expected_token_size - cnt for cnt in expected_token_padding]

    padding_key = {
        tokens: 200,
        pad_mask: 0,
        start_indices: 0,
        end_indices: 0,
        positions: 0,
    }
    # verify padding
    for output_tensor, pad_val in padding_key.items():
        self.validate_padding(
            output_tensor,
            pad_val,
            significant_idxs=sig_idxs,
            expected_batch_size=expected_batch_size,
            expected_token_padding=expected_token_padding,
        )
def test_xlm_tensorizer_seq_padding_size_exceeds_max_seq_len(self):
    vocab = self._mock_vocab()
    xlm = ScriptXLMTensorizer(
        tokenizer=ScriptDoNothingTokenizer(),
        token_vocab=vocab,
        language_vocab=ScriptVocabulary(["ar", "cn", "en"]),
        max_seq_len=20,
        default_language="en",
    )
    seq_padding_control = [0, 32, 256]
    xlm.set_padding_control("sequence_length", seq_padding_control)
    rand_tokens = [
        [str(random.randint(100, 200)) for i in range(30)],
        [str(random.randint(100, 200)) for i in range(20)],
        [str(random.randint(100, 200)) for i in range(10)],
    ]

    tokens, pad_masks, languages, positions = xlm.tensorize(
        tokens=squeeze_2d(rand_tokens),
    )
    token_count = [len(t) + 2 for t in rand_tokens]
    expected_batch_size = len(rand_tokens)
    expected_token_size = min(
        max(max(token_count), seq_padding_control[1]), xlm.max_seq_len
    )
    expected_padding_count = [
        max(0, expected_token_size - cnt) for cnt in token_count
    ]
    token_count = [expected_token_size - cnt for cnt in expected_padding_count]

    # verify tensorized tokens padding
    tokens = tokens.tolist()
    self.assertEqual(len(tokens), expected_batch_size)
    self.assertEqual(max(len(t) for t in tokens), expected_token_size)
    self.assertEqual(min(len(t) for t in tokens), expected_token_size)
    for i in range(expected_batch_size):
        self.assertEqual(
            tokens[i][token_count[i]:], [200] * expected_padding_count[i]
        )

    # verify tensorized languages
    languages = languages.tolist()
    self.assertEqual(len(languages), expected_batch_size)
    for i in range(expected_batch_size):
        self.assertEqual(languages[i][:token_count[i]], [2] * token_count[i])
        self.assertEqual(
            languages[i][token_count[i]:], [0] * expected_padding_count[i]
        )

    # verify tensorized positions
    positions = positions.tolist()
    self.assertEqual(len(positions), expected_batch_size)
    for i in range(expected_batch_size):
        self.assertEqual(
            positions[i][token_count[i]:], [0] * expected_padding_count[i]
        )

    # verify pad_masks
    pad_masks = pad_masks.tolist()
    self.assertEqual(len(pad_masks), expected_batch_size)
    for i in range(expected_batch_size):
        self.assertEqual(pad_masks[i][:token_count[i]], [1] * token_count[i])
        self.assertEqual(
            pad_masks[i][token_count[i]:], [0] * expected_padding_count[i]
        )
def forward(self, tokens: List[List[str]]):
    input_tensors = self.tensorizer.tensorize(tokens=squeeze_2d(tokens))
    logits = self.model(input_tensors)
    return self.output_layer(logits)