from typing import Dict, List, Optional, Tuple, Union

from allennlp.common.util import END_SYMBOL
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import TokenIndexer
from allennlp.data.tokenizers import Token, Tokenizer


def string_to_fields(
    string: str,
    tokenizer: Tokenizer,
    token_indexers: Dict[str, TokenIndexer],
    golden_tokenizer: Tokenizer,
    golden_token_indexers: Dict[str, TokenIndexer],
) -> Tuple[TextField, TextField]:
    tokenized_string = tokenizer.tokenize(string)
    # END_SYMBOL is prepended to the decoder-side input sequence.
    tokenized_string.insert(0, Token(END_SYMBOL))
    field = TextField(tokenized_string, token_indexers)
    # TODO: always use a single-id token indexer and the default/BPE tokenizer for
    # the golden side, because BERT/ELMo will be passed for the main string.
    tokenized_golden_string = golden_tokenizer.tokenize(string)
    # EOS at the end for loss computation.
    tokenized_golden_string.append(Token(END_SYMBOL))
    field_golden = TextField(tokenized_golden_string, golden_token_indexers)
    return field, field_golden
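# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example, assuming allennlp's WhitespaceTokenizer and
# SingleIdTokenIndexer; passing the same tokenizer/indexers for the "golden"
# side is an assumption about how callers wire this up.
def _example_string_to_fields():
    from allennlp.data.token_indexers import SingleIdTokenIndexer
    from allennlp.data.tokenizers import WhitespaceTokenizer

    tokenizer = WhitespaceTokenizer()
    indexers = {"tokens": SingleIdTokenIndexer()}
    field, field_golden = string_to_fields(
        "the quick brown fox",
        tokenizer=tokenizer,
        token_indexers=indexers,
        golden_tokenizer=tokenizer,
        golden_token_indexers=indexers,
    )
    # `field` starts with END_SYMBOL (decoder input); `field_golden` ends with it (targets).
    return field, field_golden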
def _check_start_end_tokens(
    self, start_symbol: str, end_symbol: str, tokenizer: Tokenizer
) -> None:
    """Check that `tokenizer` correctly appends `start_symbol` and `end_symbol`
    to the sequence without splitting them. Raises a `ValueError` if this is
    not the case.
    """
    tokens = tokenizer.tokenize(start_symbol + " " + end_symbol)
    err_msg = (
        f"Bad start or end symbol ('{start_symbol}', '{end_symbol}') "
        f"for tokenizer {self._source_tokenizer}"
    )
    try:
        start_token, end_token = tokens[0], tokens[-1]
    except IndexError:
        raise ValueError(err_msg)
    if start_token.text != start_symbol or end_token.text != end_symbol:
        raise ValueError(err_msg)
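# --- Illustrative usage sketch (not part of the original module) ---
# A minimal check, assuming allennlp's WhitespaceTokenizer and the
# START_SYMBOL / END_SYMBOL constants from allennlp.common.util. The
# SimpleNamespace stands in for the dataset reader that normally owns this
# method; it only needs a `_source_tokenizer` attribute for the error message.
def _example_check_start_end_tokens():
    from types import SimpleNamespace

    from allennlp.common.util import START_SYMBOL, END_SYMBOL
    from allennlp.data.tokenizers import WhitespaceTokenizer

    tokenizer = WhitespaceTokenizer()
    fake_reader = SimpleNamespace(_source_tokenizer=tokenizer)
    # Passes silently: whitespace tokenization keeps "@start@" and "@end@" whole.
    _check_start_end_tokens(fake_reader, START_SYMBOL, END_SYMBOL, tokenizer)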
def token_alignment(
    data_tokens: Union[List[str], List[Token]],
    model_tokenizer: Tokenizer,
    start_tokens: Optional[List[str]] = None,
    end_tokens: Optional[List[str]] = None,
) -> Tuple[List[Token], List[Tuple[int, int]]]:
    """Aligns word tokens (`data_tokens`) with sub-word tokens.

    The tokens in `data_tokens` may or may not be split into sub-words by
    `model_tokenizer`, e.g. if it is a tokenizer for a model like BERT.

    This method returns:
    (a) the tokens produced by `model_tokenizer`, with optional start tokens
        (e.g. [CLS]) and end tokens (e.g. [SEP]);
    (b) a list of spans: (start, inclusive-end) pairs giving, for each word,
        the span of sub-word tokens that corresponds to it.
    """
    model_tokens = []
    data_to_model_map = []
    if start_tokens:
        model_tokens.extend([Token(t) for t in start_tokens])
    for token in data_tokens:
        # Where in the model tokens the new data token starts.
        data_to_model_map.append(len(model_tokens))
        # If data_tokens is a list of Tokens, get the text via the .text attribute.
        if hasattr(token, "text"):
            token = token.text
        model_tokens.extend(model_tokenizer.tokenize(token))
    data_to_model_map.append(len(model_tokens))
    data_to_model_map = [
        (data_to_model_map[i], data_to_model_map[i + 1] - 1)
        for i in range(len(data_to_model_map) - 1)
    ]
    if end_tokens:
        model_tokens.extend([Token(t) for t in end_tokens])
    return model_tokens, data_to_model_map
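# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example, assuming allennlp's PretrainedTransformerTokenizer built
# with add_special_tokens=False so that [CLS] / [SEP] are supplied via
# start_tokens / end_tokens instead; the exact sub-word splits depend on the
# model's vocabulary.
def _example_token_alignment():
    from allennlp.data.tokenizers import PretrainedTransformerTokenizer

    subword_tokenizer = PretrainedTransformerTokenizer(
        "bert-base-uncased", add_special_tokens=False
    )
    words = ["a", "playground"]
    model_tokens, spans = token_alignment(
        words,
        subword_tokenizer,
        start_tokens=["[CLS]"],
        end_tokens=["[SEP]"],
    )
    # If "playground" were split into "play" + "##ground", this would give
    # model_tokens = [CLS], a, play, ##ground, [SEP] and spans = [(1, 1), (2, 3)]:
    # word i covers model_tokens[spans[i][0] : spans[i][1] + 1].
    return model_tokens, spans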
def tokenize_to_string(text: str, tokenizer: Tokenizer) -> List[str]:
    """Tokenize `text` and return the raw token strings."""
    return [token.text for token in tokenizer.tokenize(text)]