def untokenize_agnostic(self, token_list):
  """Reassembles whole tokens from a list of CuBERT subtokens.

  Args:
    token_list: Subtoken strings. Must be terminated by the quoted EOS
      special token.

  Returns:
    The list of whole (unsanitized) tokens reconstituted from `token_list`.

  Raises:
    ValueError: If `token_list` is empty or its last element is not EOS.
  """
  # Compute the quoted EOS marker once; the original evaluated it twice.
  eos = unified_tokenizer.quote_special(
      unified_tokenizer.TokenKind.EOS.name)
  terminated = bool(token_list) and token_list[-1] == eos
  if not terminated:
    raise ValueError('Token list %r should end with the EOS token %r.' %
                     (token_list, eos))
  return unified_tokenizer.reconstitute_full_unsanitary_tokens(
      token_list,
      sanitization_mapping=self.mappings,
      sentinel=unified_tokenizer.SENTINEL)
def untokenize(self, token_list):
  """Untokenizes via `untokenize_abstract`.

  Validates that `token_list` is EOS-terminated, reconstitutes the whole
  (unsanitized) tokens, and delegates the final string assembly to
  `untokenize_abstract`.

  Args:
    token_list: Subtoken strings. Must be terminated by the quoted EOS
      special token.

  Returns:
    Whatever `untokenize_abstract` produces from the reconstituted
    whole tokens.

  Raises:
    ValueError: If `token_list` is empty or its last element is not EOS.
  """
  # Use the qualified unified_tokenizer.quote_special, consistent with
  # untokenize_agnostic (the bare `quote_special` name was inconsistent
  # with the sibling method and may not resolve in this module).
  eos = unified_tokenizer.quote_special(
      unified_tokenizer.TokenKind.EOS.name)
  if not token_list or token_list[-1] != eos:
    raise ValueError(
        'Token list %r should end with the EOS token %r.' %
        (token_list, eos))
  whole_tokens = unified_tokenizer.reconstitute_full_unsanitary_tokens(
      token_list,
      sanitization_mapping=self.mappings,
      sentinel=unified_tokenizer.SENTINEL)
  return self.untokenize_abstract(whole_tokens)
def test_reconstitute_raises_when_expected(self, subtokens, mappings):
  """Checks that reconstitution rejects malformed subtokens/mappings."""
  with self.assertRaises(ValueError):
    unified_tokenizer.reconstitute_full_unsanitary_tokens(
        subtokens, sanitization_mapping=mappings, sentinel='^')
def test_reconstitute_returns_expected(self, subtokens, mappings,
                                       expected_tokens):
  """Checks reconstituted whole tokens against the expected sequence."""
  actual_tokens = unified_tokenizer.reconstitute_full_unsanitary_tokens(
      subtokens, sanitization_mapping=mappings, sentinel='^')
  self.assertSequenceEqual(expected_tokens, actual_tokens)