def numberize(self, row): """Tokenize, look up in vocabulary.""" seq = [] if self.add_bol_token: bol = EOL if self.use_eol_token_for_bol else BOL tokens, _, _ = self._lookup_tokens( pre_tokenized=[Token(bol, -1, -1)]) seq.append(tokens) for raw_text in row[self.column]: tokens, _, _ = self._lookup_tokens(raw_text) seq.append(tokens) if self.add_eol_token: tokens, _, _ = self._lookup_tokens( pre_tokenized=[Token(EOL, -1, -1)]) seq.append(tokens) max_len = max(len(sentence) for sentence in seq) for sentence in seq: pad_len = max_len - len(sentence) if pad_len: sentence += [self.vocab.get_pad_index()] * pad_len return seq, len(seq)
def _process(self, row, raw_token_output):
    # Shared helper: return raw token strings or vocabulary indices for every
    # sentence in the row, padding with the matching pad value.
    sentence_process_fn = (
        self._tokenize if raw_token_output else self._lookup_tokens
    )
    pad_token = (
        self.vocab.pad_token if raw_token_output else self.vocab.get_pad_index()
    )
    seq = []
    if self.add_bol_token:
        bol = EOL if self.use_eol_token_for_bol else BOL
        tokens, _, _ = sentence_process_fn(pre_tokenized=[Token(bol, -1, -1)])
        seq.append(list(tokens))
    for raw_text in row[self.column]:
        tokens, _, _ = sentence_process_fn(raw_text)
        seq.append(list(tokens))
    if self.add_eol_token:
        tokens, _, _ = sentence_process_fn(pre_tokenized=[Token(EOL, -1, -1)])
        seq.append(list(tokens))
    # Right-pad every sentence to the length of the longest one in the row.
    max_len = max(len(sentence) for sentence in seq)
    for sentence in seq:
        pad_len = max_len - len(sentence)
        if pad_len:
            sentence += [pad_token] * pad_len
    return seq, len(seq)

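# Illustrative standalone sketch (not part of the tensorizer API above): the
# padding loop in numberize/_process right-pads every sentence in a row to the
# longest sentence, with either the pad index or the pad token string. The
# helper name pad_to_longest and the sample values are hypothetical.
def pad_to_longest(sentences, pad_value):
    max_len = max(len(sentence) for sentence in sentences)
    return [
        sentence + [pad_value] * (max_len - len(sentence)) for sentence in sentences
    ]

# pad_to_longest([[4, 9, 2], [7]], pad_value=0) -> [[4, 9, 2], [7, 0, 0]]
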
def tokenize(
    text: str = None,
    pre_tokenized: List[Token] = None,
    tokenizer: Tokenizer = None,
    bos_token: Optional[str] = None,
    eos_token: Optional[str] = None,
    pad_token: str = PAD,
    use_eos_token_for_bos: bool = False,
    max_seq_len: int = 2**30,
):
    # Truncate so that the optional BOS/EOS tokens still fit within max_seq_len.
    tokenized = pre_tokenized or tokenizer.tokenize(text)[
        : max_seq_len - (bos_token is not None) - (eos_token is not None)
    ]
    if bos_token:
        if use_eos_token_for_bos:
            bos_token = eos_token
        tokenized = [Token(bos_token, -1, -1)] + tokenized
    if eos_token:
        tokenized.append(Token(eos_token, -1, -1))
    if not tokenized:
        tokenized = [Token(pad_token, -1, -1)]
    tokenized_texts, start_idx, end_idx = zip(
        *((t.value, t.start, t.end) for t in tokenized)
    )
    return tokenized_texts, start_idx, end_idx

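# Minimal usage sketch for tokenize() above, assuming Token behaves like a
# (value, start, end) namedtuple and that a Tokenizer exposes
# tokenize(text) -> List[Token], as the calls above imply. The whitespace
# tokenizer below and the example strings are hypothetical stand-ins.
class _WhitespaceTokenizer:
    def tokenize(self, text):
        tokens, pos = [], 0
        for piece in text.split():
            start = text.index(piece, pos)
            tokens.append(Token(piece, start, start + len(piece)))
            pos = start + len(piece)
        return tokens

# texts, starts, ends = tokenize(
#     text="hello world",
#     tokenizer=_WhitespaceTokenizer(),
#     bos_token="<bos>",
#     eos_token="<eos>",
# )
# texts == ("<bos>", "hello", "world", "<eos>"), starts == (-1, 0, 6, -1)
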
def _lookup_tokens(self, text):
    tokenized = self.tokenizer.tokenize(text)[: self.max_seq_len]
    if self.add_bos_token:
        bos = EOS if self.use_eos_token_for_bos else BOS
        tokenized = [Token(bos, -1, -1)] + tokenized
    if self.add_eos_token:
        tokenized.append(Token(EOS, -1, -1))
    tokenized_texts, start_idx, end_idx = zip(
        *((t.value, t.start, t.end) for t in tokenized)
    )
    tokens = self.vocab.lookup_all(tokenized_texts)
    return tokens, start_idx, end_idx

def _tokenize(self, text=None, pre_tokenized=None):
    tokenized = pre_tokenized or self.tokenizer.tokenize(text)[: self.max_seq_len]
    if self.add_bos_token:
        bos = EOS if self.use_eos_token_for_bos else BOS
        tokenized = [Token(bos, -1, -1)] + tokenized
    if self.add_eos_token:
        tokenized.append(Token(EOS, -1, -1))
    if not tokenized:
        # Guarantee at least one token so the zip(*...) unpacking below cannot fail.
        tokenized = [Token(PAD, -1, -1)]
    tokenized_texts, start_idx, end_idx = zip(
        *((t.value, t.start, t.end) for t in tokenized)
    )
    return tokenized_texts, start_idx, end_idx

def numberize(self, row): """Convert text to bytes, pad batch.""" tokens = self.tokenizer.tokenize(row[self.text_column])[: self.max_seq_len] if self.add_bos_token: bos = EOS if self.use_eos_token_for_bos else BOS tokens = [Token(bos, -1, -1)] + tokens if self.add_eos_token: tokens.append(Token(EOS, -1, -1)) if not tokens: tokens = [Token(PAD, -1, -1)] bytes = [self._numberize_token(token)[: self.max_byte_len] for token in tokens] token_lengths = len(tokens) byte_lengths = [len(token_bytes) for token_bytes in bytes] return bytes, token_lengths, byte_lengths
def numberize(self, row): """Convert text to bytes, pad batch.""" tokens = self.tokenizer.tokenize(row[self.text_column])[: self.max_seq_len] if not tokens: tokens = [Token(PAD, -1, -1)] bytes = [self._numberize_token(token)[: self.max_byte_len] for token in tokens] token_lengths = len(tokens) byte_lengths = [len(token_bytes) for token_bytes in bytes] return bytes, token_lengths, byte_lengths
def tokenize(
    text: str = None,
    pre_tokenized: List[Token] = None,
    tokenizer: Tokenizer = None,
    add_bos_token: bool = False,
    add_eos_token: bool = False,
    use_eos_token_for_bos: bool = False,
    max_seq_len: int = 2**30,
):
    # The boolean flags count as 0/1 here, reserving room for BOS/EOS in the
    # truncated sequence.
    tokenized = pre_tokenized or tokenizer.tokenize(text)[
        : max_seq_len - add_bos_token - add_eos_token
    ]
    if add_bos_token:
        bos = EOS if use_eos_token_for_bos else BOS
        tokenized = [Token(bos, -1, -1)] + tokenized
    if add_eos_token:
        tokenized.append(Token(EOS, -1, -1))
    if not tokenized:
        tokenized = [Token(PAD, -1, -1)]
    tokenized_texts, start_idx, end_idx = zip(
        *((t.value, t.start, t.end) for t in tokenized)
    )
    return tokenized_texts, start_idx, end_idx

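# Usage sketch for the boolean-flag variant above, reusing the hypothetical
# _WhitespaceTokenizer from the earlier sketch; BOS, EOS, and PAD are assumed
# to be the module's special-token constants. Empty input falls back to a
# single PAD token, so the zip(*...) unpacking cannot fail.
empty_texts, empty_starts, empty_ends = tokenize(
    text="", tokenizer=_WhitespaceTokenizer(), add_bos_token=False, add_eos_token=False
)
# empty_texts == (PAD,), empty_starts == (-1,), empty_ends == (-1,)
# With add_bos_token=True and add_eos_token=True the result would be (BOS, EOS).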