def __init__(self,
             sample,
             append_eos=False,
             target_vocab_size=None,
             min_occurrences=1,
             max_occurrences=1e3,
             reserved_tokens=RESERVED_ITOS):
    self.append_eos = append_eos

    if target_vocab_size is None:
        self.tokenizer = SubwordTextTokenizer()
        self.tokenizer.build_from_corpus(sample, min_count=min_occurrences)
    else:
        target_vocab_size -= len(reserved_tokens)
        self.tokenizer = SubwordTextTokenizer.build_to_target_size_from_corpus(
            sample,
            target_size=target_vocab_size,
            min_val=min_occurrences,
            max_val=max_occurrences)

    self.itos = reserved_tokens.copy()
    self.stoi = {token: index for index, token in enumerate(reserved_tokens)}
    for token in self.tokenizer.vocab:
        self.itos.append(token)
        self.stoi[token] = len(self.itos) - 1
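# Illustrative sketch (not from the original source) of how the constructor above lays out the
# vocabulary: reserved tokens occupy the lowest indices and subword tokens are appended after
# them. `reserved` and `subwords` below are made-up stand-ins for `reserved_tokens` and
# `self.tokenizer.vocab`.
reserved = ['<pad>', '<unk>', '</s>']
subwords = ['the_', 'quick_', 'er']

itos = reserved.copy()
stoi = {token: index for index, token in enumerate(reserved)}
for token in subwords:
    itos.append(token)
    stoi[token] = len(itos) - 1

assert itos[:len(reserved)] == reserved  # reserved tokens keep the lowest indices
assert stoi['the_'] == len(reserved)     # first subword lands right after the reserved block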
class SubwordEncoder(TextEncoder):
    """ Invertibly encodes text using a limited vocabulary.

    Applies Google's Tensor2Tensor `SubwordTextTokenizer`, which invertibly encodes a native
    string as a sequence of subtokens from a limited vocabulary. To build the vocabulary, it uses
    recursive binary search to find a minimum token count `x`
    (s.t. `min_occurrences` <= `x` <= `max_occurrences`) that most closely matches the
    `target_size`.

    Tokenization Algorithm Reference:
    https://github.com/tensorflow/tensor2tensor/blob/8bdecbe434d93cb1e79c0489df20fee2d5a37dc2/tensor2tensor/data_generators/text_encoder.py#L389

    Args:
        sample (list of strings): Sample of data used to build the dictionary.
        append_eos (bool, optional): If `True`, append the EOS token onto the end of the encoded
            vector.
        target_vocab_size (int, optional): Desired size of the vocabulary.
        min_occurrences (int, optional): Lower bound for the minimum token count.
        max_occurrences (int, optional): Upper bound for the minimum token count.
    """

    def __init__(self,
                 sample,
                 append_eos=False,
                 target_vocab_size=None,
                 min_occurrences=1,
                 max_occurrences=1e3):
        self.append_eos = append_eos

        if target_vocab_size is None:
            self.tokenizer = SubwordTextTokenizer()
            self.tokenizer.build_from_corpus(sample, min_count=min_occurrences)
        else:
            target_vocab_size -= len(RESERVED_ITOS)
            self.tokenizer = SubwordTextTokenizer.build_to_target_size_from_corpus(
                sample,
                target_size=target_vocab_size,
                min_val=min_occurrences,
                max_val=max_occurrences)

        self.stoi = RESERVED_STOI.copy()
        self.itos = RESERVED_ITOS[:]
        for token in self.tokenizer.vocab:
            self.itos.append(token)
            self.stoi[token] = len(self.itos) - 1

    @property
    def vocab(self):
        return self.itos

    def encode(self, text):
        text = self.tokenizer.encode(text)
        vector = [self.stoi.get(token, UNKNOWN_INDEX) for token in text]
        if self.append_eos:
            vector.append(EOS_INDEX)
        return torch.LongTensor(vector)

    def decode(self, tensor):
        tokens = [self.itos[index] for index in tensor]
        return self.tokenizer.decode(tokens)
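# Usage sketch (illustrative, not taken from the original source) for `SubwordEncoder` above.
# The sample corpus, the `target_vocab_size` of 100, and the encoded sentence are assumptions
# chosen only to demonstrate the round trip; the exact vocabulary depends on the sample.
sample = [
    'The quick brown fox jumps over the lazy dog.',
    'The five boxing wizards jump quickly.',
]
encoder = SubwordEncoder(sample, target_vocab_size=100)
encoded = encoder.encode('The quick brown fox')  # torch.LongTensor of subtoken indices
decoded = encoder.decode(encoded)                # back to a native string
assert decoded == 'The quick brown fox'          # encoding is invertible
assert len(encoder.vocab) >= len(RESERVED_ITOS)  # reserved tokens occupy the first indices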
def test_encode_decode(self):
    corpus = (
        'This is a corpus of text that provides a bunch of tokens from which '
        'to build a vocabulary. It will be used when strings are encoded '
        'with a SubwordTextTokenizer subclass. The encoder was coded by a coder.')
    alphabet = set(corpus) ^ {' '}

    original = 'This is a coded sentence encoded by the SubwordTextTokenizer.'
    encoder = SubwordTextTokenizer.build_to_target_size_from_corpus(
        [corpus, original], target_size=100, min_val=2, max_val=10)

    # Encoding should be reversible.
    encoded = encoder.encode(original)
    decoded = encoder.decode(encoded)
    self.assertEqual(original, decoded)

    # The substrings coded and coder are frequent enough in the corpus that
    # they should appear in the vocabulary even though they are substrings
    # of other included strings.
    subtoken_strings = encoded
    self.assertIn('encoded_', subtoken_strings)
    self.assertIn('coded_', subtoken_strings)
    self.assertIn('SubwordTextTokenizer_', encoder._all_subtoken_strings)
    self.assertIn('coder_', encoder._all_subtoken_strings)

    # Every character in the corpus should be in the encoder's alphabet and
    # its subtoken vocabulary.
    self.assertTrue(alphabet.issubset(encoder._alphabet))
    for a in alphabet:
        self.assertIn(a, encoder._all_subtoken_strings)
def test_unicode(self):
    corpus = 'Cat emoticons. \U0001F638 \U0001F639 \U0001F63A \U0001F63B'
    token_counts = collections.Counter(corpus.split(' '))

    encoder = SubwordTextTokenizer.build_to_target_size_from_token_counts(
        100, token_counts, 2, 10)

    self.assertIn('\U0001F638', encoder._alphabet)
    self.assertIn('\U0001F63B', encoder._all_subtoken_strings)
def test_raises_exception_when_not_encodable(self):
    corpus = 'the quick brown fox jumps over the lazy dog'
    token_counts = collections.Counter(corpus.split(' '))

    # Deliberately exclude some required encoding chars from the alphabet
    # and token list, making some strings unencodable.
    encoder = SubwordTextTokenizer.build_to_target_size_from_token_counts(
        100, token_counts, 2, 10)
    original = 'This has UPPER CASE letters that are out of alphabet'

    # Previously there was a bug which produced an infinite loop in this case.
    with self.assertRaises(AssertionError):
        encoder.encode(original)
def test_small_vocab(self):
    corpus = 'The quick brown fox jumps over the lazy dog'
    token_counts = collections.Counter(corpus.split(' '))
    alphabet = set(corpus) ^ {' '}

    encoder = SubwordTextTokenizer.build_to_target_size_from_token_counts(
        10, token_counts, 2, 10)

    # All vocabulary elements are in the alphabet and subtoken strings even
    # if we requested a smaller vocabulary to assure all expected strings
    # are encodable.
    self.assertTrue(alphabet.issubset(encoder._alphabet))
    for a in alphabet:
        self.assertIn(a, encoder._all_subtoken_strings)
def test_encodable_when_not_in_alphabet(self):
    corpus = 'the quick brown fox jumps over the lazy dog'
    token_counts = collections.Counter(corpus.split(' '))

    encoder = SubwordTextTokenizer.build_to_target_size_from_token_counts(
        100, token_counts, 2, 10)
    original = 'This has UPPER CASE letters that are out of alphabet'

    # Early versions could have an infinite loop when breaking into subtokens
    # if there were any out-of-alphabet characters in the encoded string.
    encoded = encoder.encode(original)
    decoded = encoder.decode(encoded)
    self.assertEqual(original, decoded)

    encoded_str = ''.join(encoded)
    self.assertIn('\\84;', encoded_str)
def test_token_counts(self):
    token_counts = SubwordTextTokenizer._count_tokens(self.corpus)
    expected = {
        u"'": 2,
        u".": 2,
        u". ": 1,
        u"... ": 1,
        u"Groucho": 1,
        u"Marx": 1,
        u"Mitch": 1,
        u"Hedberg": 1,
        u"I": 3,
        u"in": 2,
        u"my": 2,
        u"pajamas": 2,
    }
    self.assertDictContainsSubset(expected, token_counts)
def test_is_pickleable():
    tokenizer = SubwordTextTokenizer()
    pickle.dumps(tokenizer)