def test_split_with_regex(self):
    tokenizer = Tokenizer(split_regex=r"[\s,;!.?\"\(\)\-]+")
    sentence = """
        Your bones don't break, mine do. That's clear. Your cells react to
        bacteria and viruses differently than mine. You don't get sick,
        I do. That's also clear. But for some reason, you and I react the
        exact same way to water. We swallow it too fast, we choke. We get
        some in our lungs, we drown. However unreal it may seem, we are
        connected, you and I. We're on the same curve, just on opposite ends.
    """
    expected = """
        your bones don't break mine do that's clear your cells react to
        bacteria and viruses differently than mine you don't get sick
        i do that's also clear but for some reason you and i react the
        exact same way to water we swallow it too fast we choke we get
        some in our lungs we drown however unreal it may seem we are
        connected you and i we're on the same curve just on opposite ends
    """.split()
    tokens = tokenizer.tokenize(sentence)
    self.assertListEqual(expected, [t.value for t in tokens])

    sentence = '"Please, buy me a coffee?" He implored-in vain.'
    expected = "please buy me a coffee he implored in vain".split()
    tokens = tokenizer.tokenize(sentence)
    self.assertListEqual(expected, [t.value for t in tokens])
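# A purely illustrative look at what the split_regex above does: any run of
# whitespace or the listed punctuation acts as a single delimiter, and the
# Tokenizer is assumed to drop empty pieces and lowercase the rest. This helper
# is a sketch, not part of the test suite.
def _example_split_regex():
    import re
    pieces = re.split(r"[\s,;!.?\"\(\)\-]+", '"Please, buy me a coffee?" He implored-in vain.')
    assert [p.lower() for p in pieces if p] == "please buy me a coffee he implored in vain".split()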
def tokenize(
    text: str = None,
    pre_tokenized: List[Token] = None,
    tokenizer: Tokenizer = None,
    bos_token: Optional[str] = None,
    eos_token: Optional[str] = None,
    pad_token: str = PAD,
    use_eos_token_for_bos: bool = False,
    max_seq_len: int = 2**30,
):
    # Use the caller-supplied tokens if given; otherwise tokenize the text and
    # truncate so the optional BOS/EOS markers still fit within max_seq_len.
    tokenized = pre_tokenized or tokenizer.tokenize(text)[
        : max_seq_len - (bos_token is not None) - (eos_token is not None)
    ]
    if bos_token:
        if use_eos_token_for_bos:
            bos_token = eos_token
        tokenized = [Token(bos_token, -1, -1)] + tokenized
    if eos_token:
        tokenized.append(Token(eos_token, -1, -1))
    if not tokenized:
        # Never return an empty sequence; fall back to a single pad token.
        tokenized = [Token(pad_token, -1, -1)]
    tokenized_texts, start_idx, end_idx = zip(
        *((t.value, t.start, t.end) for t in tokenized)
    )
    return tokenized_texts, start_idx, end_idx
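# A minimal usage sketch for the variant above, assuming the whitespace-splitting,
# lowercasing Tokenizer and the Token(value, start, end) type used in the tests;
# the "<bos>"/"<eos>" marker strings are illustrative, not library constants.
def _example_tokenize_with_explicit_specials():
    texts, starts, ends = tokenize(
        text="Order me a coffee",
        tokenizer=Tokenizer(),
        bos_token="<bos>",
        eos_token="<eos>",
    )
    # Special tokens get (-1, -1) offsets because they do not map back to the text.
    assert texts == ("<bos>", "order", "me", "a", "coffee", "<eos>")
    assert starts[0] == ends[0] == -1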
def test_tokenize_no_byte_offsets(self):
    tokenizer = Tokenizer()
    sentence = "Ordér mê å ćoƒfee"
    expected = [
        Token("ordér", 0, 5),
        Token("mê", 6, 8),
        Token("å", 9, 10),
        Token("ćoƒfee", 11, 17),
    ]
    tokens = tokenizer.tokenize(sentence)
    self.assertListEqual(expected, tokens)
def test_tokenize_use_byte_offsets(self):
    tokenizer = Tokenizer(use_byte_offsets=True)
    sentence = "Ordér mê å ćoƒfee"
    expected = [
        Token("ordér", 0, 6),
        Token("mê", 7, 10),
        Token("å", 11, 13),
        Token("ćoƒfee", 14, 22),
    ]
    tokens = tokenizer.tokenize(sentence)
    self.assertListEqual(expected, tokens)
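# The two tests above differ only in how offsets are counted: the default counts
# Unicode characters, while use_byte_offsets=True counts UTF-8 bytes, so each
# accented character widens the span by one. A small illustrative check, not
# part of the test suite:
def _example_char_vs_byte_offsets():
    word = "Ordér"
    assert len(word) == 5                  # five characters -> char span (0, 5)
    assert len(word.encode("utf-8")) == 6  # six UTF-8 bytes -> byte span (0, 6)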
def tokenize(
    text: str = None,
    pre_tokenized: List[Token] = None,
    tokenizer: Tokenizer = None,
    add_bos_token: bool = False,
    add_eos_token: bool = False,
    use_eos_token_for_bos: bool = False,
    max_seq_len: int = 2**30,
):
    # Use the caller-supplied tokens if given; otherwise tokenize the text and
    # truncate so the optional BOS/EOS markers still fit within max_seq_len.
    tokenized = pre_tokenized or tokenizer.tokenize(text)[
        : max_seq_len - add_bos_token - add_eos_token
    ]
    if add_bos_token:
        bos = EOS if use_eos_token_for_bos else BOS
        tokenized = [Token(bos, -1, -1)] + tokenized
    if add_eos_token:
        tokenized.append(Token(EOS, -1, -1))
    if not tokenized:
        # Never return an empty sequence; fall back to a single pad token.
        tokenized = [Token(PAD, -1, -1)]
    tokenized_texts, start_idx, end_idx = zip(
        *((t.value, t.start, t.end) for t in tokenized)
    )
    return tokenized_texts, start_idx, end_idx
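# A minimal sketch of the boolean-flag variant above, assuming BOS, EOS, and PAD
# are the module-level special-token strings it references and that Tokenizer()
# lowercases and splits on whitespace, as the tests suggest.
def _example_tokenize_with_flags():
    texts, _, _ = tokenize(
        text="Order me a coffee",
        tokenizer=Tokenizer(),
        add_bos_token=True,
        add_eos_token=True,
    )
    assert texts == (BOS, "order", "me", "a", "coffee", EOS)
    # Assuming the empty string yields no tokens, the fallback produces a single PAD.
    pad_texts, _, _ = tokenize(text="", tokenizer=Tokenizer())
    assert pad_texts == (PAD,)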
def test_tokenize_dont_lowercase(self):
    tokenizer = Tokenizer(lowercase=False)
    sentence = "Order me a coffee"
    expected = ["Order", "me", "a", "coffee"]
    tokens = tokenizer.tokenize(sentence)
    self.assertListEqual(expected, [t.value for t in tokens])