import warnings
from typing import Optional, Union

from transformers import AutoTokenizer, BertTokenizer, PretrainedConfig, PreTrainedTokenizer

# Note: `config_is` and `DictInterface` used below come from HanLP-internal modules not shown in this excerpt.


def on_config_ready(self, **kwargs):
    super().on_config_ready(**kwargs)
    if 'albert_chinese' in self.config.transformer:
        # Chinese ALBERT checkpoints ship a BERT-style wordpiece vocab, so BertTokenizer is required here
        # instead of the sentencepiece-based tokenizer that AutoTokenizer would otherwise select.
        self.transformer_tokenizer = BertTokenizer.from_pretrained(self.config.transformer, use_fast=True)
    else:
        self.transformer_tokenizer = AutoTokenizer.from_pretrained(self.config.transformer, use_fast=True)
def build_transformer_tokenizer(self):
    transformer = self.config.transformer
    if transformer:
        transformer_tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(transformer, use_fast=True)
    else:
        transformer_tokenizer = None
    self.transformer_tokenizer = transformer_tokenizer
    return transformer_tokenizer
def build_transformer_tokenizer(config_or_str, use_fast=True, do_basic_tokenize=True) -> PreTrainedTokenizer:
    if isinstance(config_or_str, str):
        transformer = config_or_str
    else:
        transformer = config_or_str.transformer
    if use_fast and not do_basic_tokenize:
        warnings.warn('`do_basic_tokenize=False` might not work when `use_fast=True`')
    return AutoTokenizer.from_pretrained(transformer, use_fast=use_fast, do_basic_tokenize=do_basic_tokenize)
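

# A minimal usage sketch of the standalone builder above (assuming the 'bert-base-uncased' checkpoint is
# available locally or via the Hugging Face hub). The helper name `_demo_build_transformer_tokenizer` is
# hypothetical and only used for illustration.
def _demo_build_transformer_tokenizer():
    tokenizer = build_transformer_tokenizer('bert-base-uncased', use_fast=True)
    # Fast tokenizers can return character offsets for each subtoken, which token-level transforms rely on.
    encoding = tokenizer('HanLP is a multilingual NLP library', return_offsets_mapping=True)
    print(encoding['input_ids'])
    print(encoding['offset_mapping'])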
def __init__(self, tokenizer: Union[PreTrainedTokenizer, str], text_a_key: str, text_b_key: str = None,
             output_key=None, max_seq_length=512, truncate_long_sequences=True) -> None:
    super().__init__(max_seq_length, truncate_long_sequences)
    self.text_b = text_b_key
    self.text_a = text_a_key
    if output_key is None:
        output_key = self.text_a
        if text_b_key:
            output_key += '_' + text_b_key
    if output_key == '':
        output_key = self._KEY
    else:
        output_key = [f'{output_key}_{key}' for key in self._KEY]
    self.output_key = output_key
    if isinstance(tokenizer, str):
        tokenizer = AutoTokenizer.from_pretrained(tokenizer)
    self.tokenizer = tokenizer
def __init__(self,
             tokenizer: Union[PreTrainedTokenizer, str],
             input_key,
             output_key=None,
             max_seq_length=512,
             truncate_long_sequences=False,
             config: PretrainedConfig = None,
             cls_token_at_end=False,
             cls_token_segment_id=0,
             pad_token_segment_id=0,
             pad_on_left=False,
             do_padding=False,
             sep_token_extra=False,
             ret_mask_and_type=False,
             ret_prefix_mask=False,
             ret_token_span=True,
             ret_subtokens=False,
             ret_subtokens_group=False,
             cls_is_bos=False,
             sep_is_eos=False,
             do_basic_tokenize=True,
             use_fast=True,
             dict_force=None,
             strip_cls_sep=True,
             check_space_before=None,
             ) -> None:
    """A transformer tokenizer for token-level tasks. It honors the boundary of tokens, tokenizes each token
    into several subtokens, then merges them. The information about which token each subtoken belongs to is
    kept and returned as a new field in the sample. It also provides an out-of-the-box sliding window trick
    for long sequences.

    Args:
        tokenizer: The identifier of a pre-trained tokenizer or a ``PreTrainedTokenizer``.
        input_key: The token key in samples.
        output_key: The output keys to store results.
        max_seq_length: Sentences longer than ``max_seq_len`` will be split into shorter ones if possible.
        truncate_long_sequences: ``True`` to truncate exceeded parts of long sequences. ``False`` to enable
            sliding window.
        config: The ``PretrainedConfig`` to determine the model structure of the transformer, so that special
            tokenization can be applied.
        cls_token_at_end: ``True`` to put ``[CLS]`` at the end of input tokens.
        cls_token_segment_id: The segment id of ``[CLS]``.
        pad_token_segment_id: The segment id of ``[PAD]``.
        pad_on_left: ``True`` to put ``[PAD]`` at the left side of input tokens.
        do_padding: ``True`` to pad the sequence to ``max_seq_length``.
        sep_token_extra: ``True`` to have two ``[SEP]``.
        ret_mask_and_type: ``True`` to return masks and type ids.
        ret_prefix_mask: ``True`` to generate a mask where each non-zero element corresponds to a prefix of a
            token.
        ret_token_span: ``True`` to return the span of each token measured by subtoken offsets.
        ret_subtokens: ``True`` to return the list of subtokens belonging to each token, for tokenization
            purposes. When enabled, the prefix mask of each subtoken is set to True, as each subtoken is a
            token unit in the tokenization task. Similarly, the token span of each token will be a continuous
            integer sequence.
        ret_subtokens_group: ``True`` to return the list of offsets of subtokens belonging to each token.
        cls_is_bos: ``True`` means the first token of the input is treated as [CLS] no matter what its surface
            form is. ``False`` (default) means the first token is not [CLS]; it will have its own embedding
            other than the embedding of [CLS].
        sep_is_eos: ``True`` means the last token of the input is [SEP]. ``False`` means it's not, but [SEP]
            will be appended. ``None`` means it depends on ``input[-1] == [EOS]``.
        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
        use_fast: Whether or not to try to load the fast version of the tokenizer.
        dict_force: A dictionary doing longest-prefix-match on input text so that the head and tail of each
            keyword won't be concatenated to other tokens by transformer tokenizers.
        strip_cls_sep: ``True`` to strip [CLS] and [SEP] off the input tokens.
        check_space_before: ``True`` to detect the space before each token to handle underline in sentence
            piece tokenization.

    Examples:

    .. highlight:: python
    .. code-block:: python

        transform = TransformerSequenceTokenizer('bert-base-uncased', 'token')
        sample = {'token': 'HanLP good'.split()}
        print(transform(sample))

    """
    super().__init__(max_seq_length, truncate_long_sequences)
    tokenizer_name = tokenizer if isinstance(tokenizer, str) else tokenizer.name_or_path
    if check_space_before is None:
        # These tokenizers are BPE-based; they prepend a space to each token, tokenizing "loving" into
        # ['▁lo', 'ving'] and 商品 into ['▁', '商品']. In the latter case the prefix '▁' has to be removed,
        # as there is no space between tokens in some languages like Chinese.
        check_space_before = tokenizer_name in ('xlm-roberta-base', 'xlm-roberta-large', 'google/mt5-small',
                                                'google/mt5-base')
    self.check_space_before = check_space_before
    self.ret_subtokens_group = ret_subtokens_group
    self.ret_subtokens = ret_subtokens
    self.sep_is_eos = sep_is_eos
    self.ret_prefix_mask = ret_prefix_mask
    self.ret_mask_and_type = ret_mask_and_type
    self.cls_is_bos = cls_is_bos
    self.ret_token_span = ret_token_span
    if not output_key or isinstance(output_key, str):
        suffixes = ['input_ids']
        if ret_mask_and_type:
            suffixes += 'attention_mask', 'token_type_ids'
        if ret_prefix_mask:
            suffixes += ['prefix_mask']
        if ret_token_span:
            suffixes.append('token_span')
        if output_key is None:
            output_key = [f'{input_key}_{key}' for key in suffixes]
        elif output_key == '':
            output_key = suffixes
        else:
            output_key = [f'{output_key}_{key}' for key in suffixes]
    self.input_key = input_key
    self.output_key = output_key
    if config:
        xlnet = config_is(config, 'xlnet')
        pad_token_segment_id = 4 if xlnet else 0
        cls_token_segment_id = 2 if xlnet else 0
        cls_token_at_end = xlnet
        pad_on_left = xlnet
    if isinstance(tokenizer, str):
        tokenizer = AutoTokenizer.from_pretrained(tokenizer, use_fast=use_fast,
                                                  do_basic_tokenize=do_basic_tokenize)
    if use_fast:
        # Dirty fix upstream bug: https://github.com/hankcs/HanLP/issues/1602
        if hasattr(tokenizer, '_tokenizer') and hasattr(tokenizer._tokenizer, 'no_truncation'):
            _t = tokenizer._tokenizer
            _t.no_truncation()
            _t.no_padding()
            _t.no_truncation = _t.no_padding = lambda: None
    pad_token = tokenizer.pad_token
    self.pad_token_id = tokenizer.convert_tokens_to_ids([pad_token])[0]
    self.pad_token_segment_id = pad_token_segment_id
    if tokenizer_name in ('google/mt5-small', 'google/mt5-base'):
        # mt5 doesn't have cls or sep, but we can use something similar
        self.has_cls = False
        self.cls_token = '▁'
        self.cls_token_id = tokenizer.convert_tokens_to_ids(self.cls_token)
        self.sep_token = tokenizer.eos_token
        self.sep_token_id = tokenizer.eos_token_id
    else:
        self.has_cls = True
        self.cls_token = tokenizer.cls_token
        self.sep_token = tokenizer.sep_token
        self.cls_token_segment_id = cls_token_segment_id
        self.cls_token_id = tokenizer.cls_token_id
        self.sep_token_id = tokenizer.sep_token_id
    self.sep_token_extra = sep_token_extra
    self.cls_token_at_end = cls_token_at_end
    self.tokenizer = tokenizer
    self.pad_on_left = pad_on_left
    self.do_padding = do_padding
    if self.ret_token_span or not self.truncate_long_sequences:
        assert not self.cls_token_at_end
        assert not self.pad_on_left
    if self.ret_subtokens:
        if not use_fast:
            raise NotImplementedError(
                'ret_subtokens is not available when using Python tokenizers. '
                'To use this feature, set use_fast = True.')
    self.dict: Optional[DictInterface] = dict_force  # For tokenization of raw text
    self.strip_cls_sep = strip_cls_sep
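

# An optional sketch expanding on the docstring example above (assuming TransformerSequenceTokenizer is the
# class this __init__ belongs to and that 'bert-base-uncased' can be downloaded). With the default arguments,
# output_key is derived from input_key, so the transform is expected to add `<input_key>_input_ids` and
# `<input_key>_token_span` to each sample. The helper name `_demo_sequence_tokenizer` is hypothetical.
def _demo_sequence_tokenizer():
    transform = TransformerSequenceTokenizer('bert-base-uncased', 'token')
    sample = {'token': 'HanLP good'.split()}
    sample = transform(sample)
    print(sample['token_input_ids'])   # subtoken ids produced by the wordpiece tokenizer
    print(sample['token_token_span'])  # which subtoken offsets each original token spans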
def main():
    transformer = 'bert-base-uncased'
    tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(transformer)
    # _test_text_transform(tokenizer)
    _test_sequence_transform(tokenizer)
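

# The helper referenced in main() is not defined in this excerpt. The sketch below is an illustrative stand-in
# (not the original implementation) so that main() runs as presented; it assumes the TransformerSequenceTokenizer
# class whose constructor appears above.
def _test_sequence_transform(tokenizer: PreTrainedTokenizer):
    transform = TransformerSequenceTokenizer(tokenizer, 'token')
    sample = {'token': 'HanLP good'.split()}
    print(transform(sample))


if __name__ == '__main__':
    main()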