def build_tokenizer(self, tokenizer: TransformerSequenceTokenizer):
    """Build a transformer tokenizer for this task.

    Args:
        tokenizer: A tokenizer which is shared but can be adjusted to provide per-task settings.

    Returns:
        A TransformerSequenceTokenizer.
    """
    if tokenizer.cls_is_bos != self.cls_is_bos or tokenizer.sep_is_eos != self.sep_is_eos:
        tokenizer = copy(tokenizer)
        tokenizer.cls_is_bos = self.cls_is_bos
        tokenizer.sep_is_eos = self.sep_is_eos
    return tokenizer

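# Illustration (not from the source; the model name below is a placeholder): when the
# task's flags differ, the method above shallow-copies the shared transform and flips
# its boundary flags, leaving the shared instance untouched. A minimal sketch:
from copy import copy

from hanlp.transform.transformer_tokenizer import TransformerSequenceTokenizer
from transformers import AutoTokenizer

shared = TransformerSequenceTokenizer(AutoTokenizer.from_pretrained('bert-base-uncased'), 'token',
                                      cls_is_bos=False, sep_is_eos=False)
per_task = copy(shared)
per_task.cls_is_bos = per_task.sep_is_eos = True
assert shared.cls_is_bos is False  # the shared transform keeps its own settings
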
def build_dataloader(self, data, transform: TransformList = None, training=False, device=None,
                     logger: logging.Logger = None, tokenizer: PreTrainedTokenizer = None,
                     **kwargs) -> DataLoader:
    assert tokenizer, 'A pre-trained tokenizer is required to build the dataloader.'
    dataset = TextTokenizingDataset(data, cache=isinstance(data, str), delimiter=self.config.sent_delimiter,
                                    generate_idx=isinstance(data, list),
                                    max_seq_len=self.config.max_seq_len,
                                    sent_delimiter=self.config.sent_delimiter,
                                    transform=[
                                        TransformerSequenceTokenizer(tokenizer, 'text',
                                                                     ret_prefix_mask=True,
                                                                     ret_subtokens=True),
                                        # Record sequence lengths excluding the two special tokens.
                                        FieldLength('text_input_ids', 'text_input_ids_length', delta=-2),
                                        generate_token_span_tuple])
    return PadSequenceDataLoader(
        batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset, 'text_input_ids', 'text'),
                                                 shuffle=training),
        device=device,
        dataset=dataset)

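# Usage sketch (hypothetical; `component` stands for whichever task owns build_dataloader
# above and 'corpus.txt' is a placeholder path):
#
#   loader = component.build_dataloader('corpus.txt', training=False, device='cpu',
#                                       tokenizer=component.transformer_tokenizer)
#   batch = next(iter(loader))
#   batch['text_input_ids']         # padded subtoken ids produced by the tokenizer transform
#   batch['text_input_ids_length']  # lengths with delta=-2 applied, presumably excluding [CLS]/[SEP]
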
def make_batch_for_bart(augmented_concept, ret, tokenizer, device, training=True):
    token_field = 'concept'
    tokenizer = TransformerSequenceTokenizer(tokenizer.tokenizer, token_field, cls_is_bos=True, sep_is_eos=None)
    # The last concept is excluded during training.
    encodings = [tokenizer({token_field: x[:-1] if training else x}) for x in augmented_concept]
    ret.update(merge_list_of_dict(encodings))
    decoder_mask = []
    max_seq_len = len(max(ret['concept_input_ids'], key=len))
    last_concept_offset = []
    for spans, concepts in zip(ret['concept_token_span'], augmented_concept):
        # Invert the future mask so that True marks positions each step may attend to.
        mask = ~SelfAttentionMask.get_mask(max_seq_len, device, ret_parameter=False)
        for group in spans:
            for i in range(len(group)):
                for j in range(i + 1, len(group)):
                    # Allow subtokens of the same concept to attend to later subtokens in their group.
                    mask[group[i], group[j]] = True
        decoder_mask.append(mask)
        # Index of the last concept, used only at prediction time.
        last_concept_offset.append(len(concepts) - 1)
    ret['decoder_mask'] = torch.stack(decoder_mask)
    if not training:
        ret['last_concept_offset'] = torch.tensor(last_concept_offset, device=device, dtype=torch.long)
    subtoken_to_tensor(token_field, ret)

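# Standalone sketch of the masking idea above, with plain torch standing in for
# SelfAttentionMask (sizes and spans are made up). Assuming get_mask returns the usual
# strictly-upper-triangular future mask, its negation is the lower-triangular causal
# mask built here; within-concept positions are then unmasked on top of it.
import torch

max_seq_len = 6
causal = torch.tril(torch.ones(max_seq_len, max_seq_len, dtype=torch.bool))  # ~future mask
spans = [[2, 3], [4, 5]]  # two concepts, each split into two subtokens
mask = causal.clone()
for group in spans:
    for i in range(len(group)):
        for j in range(i + 1, len(group)):
            mask[group[i], group[j]] = True  # position i may also attend to the later subtoken j
print(mask.int())
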
def tokenizer_transform(self) -> TransformerSequenceTokenizer:
    if not self._tokenizer_transform:
        self._tokenizer_transform = TransformerSequenceTokenizer(self.transformer_tokenizer,
                                                                 self.config.token_key,
                                                                 ret_token_span=True)
    return self._tokenizer_transform

def tokenizer_transform(self):
    if not self._tokenizer_transform:
        self._tokenizer_transform = TransformerSequenceTokenizer(self.transformer_tokenizer,
                                                                 self.config.token_key,
                                                                 ret_subtokens=True,
                                                                 ret_subtokens_group=True,
                                                                 ret_token_span=False)
    return self._tokenizer_transform

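# Quick comparison of the two lazy builders above (the model name and sample are
# placeholders): the first keeps token-level spans over subtokens, the second keeps the
# subtokens and their grouping instead. Printing the keys shows which extra fields each
# configuration attaches to a sample, without assuming their exact names.
from hanlp.transform.transformer_tokenizer import TransformerSequenceTokenizer
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('bert-base-uncased')
span_transform = TransformerSequenceTokenizer(tok, 'token', ret_token_span=True)
group_transform = TransformerSequenceTokenizer(tok, 'token', ret_subtokens=True,
                                               ret_subtokens_group=True, ret_token_span=False)
sample = {'token': ['HanLP', 'tokenizes', 'subwords']}
print(sorted(span_transform(dict(sample)).keys()))
print(sorted(group_transform(dict(sample)).keys()))
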
def build_tokenizer_transform(self):
    return TransformerSequenceTokenizer(self.transformer_tokenizer, 'token', '',
                                        ret_token_span=True, cls_is_bos=True,
                                        max_seq_length=self.config.get('max_sequence_length', 512),
                                        truncate_long_sequences=False)

def build_tokenizer(self, tokenizer: TransformerSequenceTokenizer):
    # The transform for the tokenization task needs very special settings, so make sure
    # every one of them is set explicitly.
    return TransformerSequenceTokenizer(
        tokenizer.tokenizer,
        tokenizer.input_key,
        tokenizer.output_key,
        tokenizer.max_seq_length,
        tokenizer.truncate_long_sequences,
        ret_subtokens=True,
        ret_subtokens_group=True,
        ret_token_span=True,
        cls_is_bos=True,
        sep_is_eos=True,
        use_fast=tokenizer.tokenizer.is_fast,
        dict_force=self.dict_force,
        strip_cls_sep=False,
    )

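# Note: unlike the first build_tokenizer above, this variant always constructs a fresh
# TransformerSequenceTokenizer rather than copying the shared one, presumably because
# nearly every setting (subtoken grouping, dict_force, strip_cls_sep=False) differs from
# the shared defaults.
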
def __init__(self, field: str, transformer: str, average_subwords=False,
             scalar_mix: Union[ScalarMixWithDropoutBuilder, int] = None,
             word_dropout: Optional[Union[float, Tuple[float, str]]] = None,
             max_sequence_length=None,
             truncate_long_sequences=False,
             cls_is_bos=False,
             sep_is_eos=False,
             ret_token_span=True,
             ret_subtokens=False,
             ret_subtokens_group=False,
             ret_prefix_mask=False,
             ret_raw_hidden_states=False,
             transformer_args: Dict[str, Any] = None,
             use_fast=True,
             do_basic_tokenize=True,
             trainable=True) -> None:
    """A contextual word embedding builder which builds a
    :class:`~hanlp.layers.embeddings.contextual_word_embedding.ContextualWordEmbeddingModule` and a
    :class:`~hanlp.transform.transformer_tokenizer.TransformerSequenceTokenizer`.

    Args:
        field: The field to work on. Usually some token fields.
        transformer: An identifier of a ``PreTrainedModel``.
        average_subwords: ``True`` to average subword representations.
        scalar_mix: Layer attention.
        word_dropout: Dropout rate of randomly replacing a subword with [MASK].
        max_sequence_length: The maximum sequence length. Sequences longer than this will be handled by a
            sliding window.
        truncate_long_sequences: ``True`` to truncate sequences longer than ``max_sequence_length`` instead of
            handling them with a sliding window.
        cls_is_bos: ``True`` means the first token of the input is treated as [CLS] no matter what its surface
            form is. ``False`` (default) means the first token is not [CLS]; it will have its own embedding
            other than the embedding of [CLS].
        sep_is_eos: ``True`` means the last token of the input is [SEP]. ``False`` means it's not, but [SEP]
            will be appended. ``None`` means it depends on ``input[-1] == [EOS]``.
        ret_token_span: ``True`` to return the span of each token measured by subtoken offsets.
        ret_subtokens: ``True`` to return the list of subtokens belonging to each token.
        ret_subtokens_group: ``True`` to return the list of offsets of subtokens belonging to each token.
        ret_prefix_mask: ``True`` to generate a mask where each non-zero element corresponds to a prefix of a token.
        ret_raw_hidden_states: ``True`` to return the hidden states of each layer.
        transformer_args: Extra arguments passed to the transformer.
        use_fast: Whether or not to try to load the fast version of the tokenizer.
        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
        trainable: ``False`` to use static embeddings.
    """
    super().__init__()
    self.truncate_long_sequences = truncate_long_sequences
    self.transformer_args = transformer_args
    self.trainable = trainable
    self.ret_subtokens_group = ret_subtokens_group
    self.ret_subtokens = ret_subtokens
    self.ret_raw_hidden_states = ret_raw_hidden_states
    self.sep_is_eos = sep_is_eos
    self.cls_is_bos = cls_is_bos
    self.max_sequence_length = max_sequence_length
    self.word_dropout = word_dropout
    self.scalar_mix = scalar_mix
    self.average_subwords = average_subwords
    self.transformer = transformer
    self.field = field
    self._transformer_tokenizer = AutoTokenizer_.from_pretrained(self.transformer,
                                                                 use_fast=use_fast,
                                                                 do_basic_tokenize=do_basic_tokenize)
    self._tokenizer_transform = TransformerSequenceTokenizer(self._transformer_tokenizer,
                                                             field,
                                                             truncate_long_sequences=truncate_long_sequences,
                                                             ret_prefix_mask=ret_prefix_mask,
                                                             ret_token_span=ret_token_span,
                                                             cls_is_bos=cls_is_bos,
                                                             sep_is_eos=sep_is_eos,
                                                             ret_subtokens=ret_subtokens,
                                                             ret_subtokens_group=ret_subtokens_group,
                                                             max_seq_length=self.max_sequence_length)

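# Construction sketch: the model name is a placeholder, and ContextualWordEmbedding is
# assumed to be the builder class owning the __init__ above, living alongside
# ContextualWordEmbeddingModule in hanlp.layers.embeddings.contextual_word_embedding.
from hanlp.layers.embeddings.contextual_word_embedding import ContextualWordEmbedding

embed = ContextualWordEmbedding(
    field='token',                    # embed the 'token' field of each sample
    transformer='bert-base-uncased',  # any transformers model identifier
    average_subwords=True,            # pool subword vectors into one vector per token
    word_dropout=0.1,                 # randomly replace 10% of subwords with [MASK]
    max_sequence_length=512,          # longer inputs are handled by a sliding window
    truncate_long_sequences=False,
)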