Example #1
    def build_tokenizer(self, tokenizer: TransformerSequenceTokenizer):
        """Build a transformer tokenizer for this task.

        Args:
            tokenizer: A tokenizer which is shared but can be adjusted to provide per-task settings.

        Returns:
            A TransformerSequenceTokenizer.

        """
        if tokenizer.cls_is_bos != self.cls_is_bos or tokenizer.sep_is_eos != self.sep_is_eos:
            tokenizer = copy(tokenizer)
            tokenizer.cls_is_bos = self.cls_is_bos
            tokenizer.sep_is_eos = self.sep_is_eos
        return tokenizer
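The method above only clones the shared tokenizer when this task needs different BOS/EOS flags; otherwise the shared instance is reused untouched. A minimal, self-contained sketch of that copy-on-write pattern, using a hypothetical Flags stand-in instead of a real TransformerSequenceTokenizer:

from copy import copy  # build_tokenizer above relies on copy() as well
from dataclasses import dataclass

@dataclass
class Flags:  # hypothetical stand-in carrying only the two flags that matter here
    cls_is_bos: bool = True
    sep_is_eos: bool = True

shared = Flags()
needed_cls, needed_sep = False, True          # what this task supposedly needs
tok = shared
if tok.cls_is_bos != needed_cls or tok.sep_is_eos != needed_sep:
    tok = copy(tok)                            # clone only when the settings differ
    tok.cls_is_bos, tok.sep_is_eos = needed_cls, needed_sep
assert shared.cls_is_bos and tok is not shared  # the shared instance stays untouched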
Example #2
 def build_dataloader(self,
                      data,
                      transform: TransformList = None,
                      training=False,
                      device=None,
                      logger: logging.Logger = None,
                      tokenizer: PreTrainedTokenizer = None,
                      **kwargs) -> DataLoader:
     assert tokenizer
     dataset = TextTokenizingDataset(data, cache=isinstance(data, str), delimiter=self.config.sent_delimiter,
                                     generate_idx=isinstance(data, list),
                                     max_seq_len=self.config.max_seq_len,
                                     sent_delimiter=self.config.sent_delimiter,
                                     transform=[
                                         TransformerSequenceTokenizer(tokenizer,
                                                                      'text',
                                                                      ret_prefix_mask=True,
                                                                      ret_subtokens=True,
                                                                      ),
                                         FieldLength('text_input_ids', 'text_input_ids_length', delta=-2),
                                         generate_token_span_tuple])
     return PadSequenceDataLoader(
         batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset, 'text_input_ids', 'text'),
                                                  shuffle=training),
         device=device,
         dataset=dataset)
Example #3
File: amr.py Project: lei1993/HanLP
def make_batch_for_bart(augmented_concept,
                        ret,
                        tokenizer,
                        device,
                        training=True):
    token_field = 'concept'
    tokenizer = TransformerSequenceTokenizer(tokenizer.tokenizer,
                                             token_field,
                                             cls_is_bos=True,
                                             sep_is_eos=None)
    encodings = [
        tokenizer({token_field: x[:-1] if training else x})
        for x in augmented_concept
    ]
    ret.update(merge_list_of_dict(encodings))
    decoder_mask = []
    max_seq_len = len(max(ret['concept_input_ids'], key=len))
    last_concept_offset = []
    for spans, concepts in zip(ret['concept_token_span'], augmented_concept):
        mask = ~SelfAttentionMask.get_mask(
            max_seq_len, device, ret_parameter=False)
        for group in spans:
            for i in range(len(group)):
                for j in range(i + 1, len(group)):
                    mask[group[i], group[j]] = True
        decoder_mask.append(mask)
        last_concept_offset.append(len(concepts) - 1)
    ret['decoder_mask'] = torch.stack(decoder_mask)
    if not training:
        ret['last_concept_offset'] = torch.tensor(last_concept_offset,
                                                  device=device,
                                                  dtype=torch.long)
    subtoken_to_tensor(token_field, ret)
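The decoder_mask loop above combines a causal mask with the per-concept subtoken spans: positions inside the same span are allowed to attend to each other even when one lies ahead of the other. A minimal sketch of that pattern in plain torch, with hypothetical sizes and span offsets; it assumes ``~SelfAttentionMask.get_mask(...)`` yields a causal "may attend" mask (True = allowed), which is why the original code flips it with ``~``:

import torch

max_seq_len = 6
causal = torch.tril(torch.ones(max_seq_len, max_seq_len, dtype=torch.bool))  # causal part
spans = [[1, 2], [3, 4, 5]]                 # subtoken groups of two concepts (hypothetical offsets)
mask = causal.clone()
for group in spans:
    for i in range(len(group)):
        for j in range(i + 1, len(group)):
            mask[group[i], group[j]] = True  # re-open attention inside the same group
assert mask[1, 2] and not mask[1, 3]         # look-ahead is allowed within a group only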
Example #4
 def tokenizer_transform(self) -> TransformerSequenceTokenizer:
     if not self._tokenizer_transform:
         self._tokenizer_transform = TransformerSequenceTokenizer(
             self.transformer_tokenizer,
             self.config.token_key,
             ret_token_span=True)
     return self._tokenizer_transform
Example #5
 def tokenizer_transform(self):
     if not self._tokenizer_transform:
         self._tokenizer_transform = TransformerSequenceTokenizer(self.transformer_tokenizer,
                                                                  self.config.token_key,
                                                                  ret_subtokens=True,
                                                                  ret_subtokens_group=True,
                                                                  ret_token_span=False)
     return self._tokenizer_transform
Example #6
 def build_tokenizer_transform(self):
     return TransformerSequenceTokenizer(self.transformer_tokenizer,
                                         'token',
                                         '',
                                         ret_token_span=True,
                                         cls_is_bos=True,
                                         max_seq_length=self.config.get(
                                             'max_sequence_length', 512),
                                         truncate_long_sequences=False)
Example #7
 def build_tokenizer(self, tokenizer: TransformerSequenceTokenizer):
     # The tokenizer transform needs very specific settings; make sure they are set properly.
     return TransformerSequenceTokenizer(
         tokenizer.tokenizer,
         tokenizer.input_key,
         tokenizer.output_key,
         tokenizer.max_seq_length,
         tokenizer.truncate_long_sequences,
         ret_subtokens=True,
         ret_subtokens_group=True,
         ret_token_span=True,
         cls_is_bos=True,
         sep_is_eos=True,
         use_fast=tokenizer.tokenizer.is_fast,
         dict_force=self.dict_force,
         strip_cls_sep=False,
     )
Example #8
    def __init__(self,
                 field: str,
                 transformer: str,
                 average_subwords=False,
                 scalar_mix: Union[ScalarMixWithDropoutBuilder, int] = None,
                 word_dropout: Optional[Union[float, Tuple[float,
                                                           str]]] = None,
                 max_sequence_length=None,
                 truncate_long_sequences=False,
                 cls_is_bos=False,
                 sep_is_eos=False,
                 ret_token_span=True,
                 ret_subtokens=False,
                 ret_subtokens_group=False,
                 ret_prefix_mask=False,
                 ret_raw_hidden_states=False,
                 transformer_args: Dict[str, Any] = None,
                 use_fast=True,
                 do_basic_tokenize=True,
                 trainable=True) -> None:
        """A contextual word embedding builder which builds a
        :class:`~hanlp.layers.embeddings.contextual_word_embedding.ContextualWordEmbeddingModule` and a
        :class:`~hanlp.transform.transformer_tokenizer.TransformerSequenceTokenizer`.

        Args:
            field: The field to work on. Usually some token fields.
            transformer:  An identifier of a ``PreTrainedModel``.
            average_subwords: ``True`` to average subword representations.
            scalar_mix: Layer attention.
            word_dropout: The probability of randomly replacing a subword with ``[MASK]``.
            max_sequence_length: The maximum sequence length. Sequences longer than this will be handled by a
                sliding window.
            truncate_long_sequences: ``True`` to truncate sequences longer than ``max_sequence_length``;
                ``False`` to handle them with a sliding window.
            cls_is_bos: ``True`` means the first token of the input is treated as [CLS] no matter what its surface
                        form is. ``False`` (default) means the first token is not [CLS]; it gets its own embedding
                        rather than the embedding of [CLS].
            sep_is_eos: ``True`` means the last token of the input is [SEP].
                        ``False`` means it is not, but [SEP] will be appended.
                        ``None`` means it depends on whether ``input[-1] == [EOS]``.
            ret_token_span: ``True`` to return span of each token measured by subtoken offsets.
            ret_subtokens: ``True`` to return list of subtokens belonging to each token.
            ret_subtokens_group: ``True`` to return list of offsets of subtokens belonging to each token.
            ret_prefix_mask: ``True`` to generate a mask where each non-zero element corresponds to a prefix of a token.
            ret_raw_hidden_states: ``True`` to return hidden states of each layer.
            transformer_args: Extra arguments passed to the transformer.
            use_fast: Whether or not to try to load the fast version of the tokenizer.
            do_basic_tokenize: Whether to do basic tokenization before wordpiece.
            trainable: ``False`` to use static embeddings.
        """
        super().__init__()
        self.truncate_long_sequences = truncate_long_sequences
        self.transformer_args = transformer_args
        self.trainable = trainable
        self.ret_subtokens_group = ret_subtokens_group
        self.ret_subtokens = ret_subtokens
        self.ret_raw_hidden_states = ret_raw_hidden_states
        self.sep_is_eos = sep_is_eos
        self.cls_is_bos = cls_is_bos
        self.max_sequence_length = max_sequence_length
        self.word_dropout = word_dropout
        self.scalar_mix = scalar_mix
        self.average_subwords = average_subwords
        self.transformer = transformer
        self.field = field
        self._transformer_tokenizer = AutoTokenizer_.from_pretrained(
            self.transformer,
            use_fast=use_fast,
            do_basic_tokenize=do_basic_tokenize)
        self._tokenizer_transform = TransformerSequenceTokenizer(
            self._transformer_tokenizer,
            field,
            truncate_long_sequences=truncate_long_sequences,
            ret_prefix_mask=ret_prefix_mask,
            ret_token_span=ret_token_span,
            cls_is_bos=cls_is_bos,
            sep_is_eos=sep_is_eos,
            ret_subtokens=ret_subtokens,
            ret_subtokens_group=ret_subtokens_group,
            max_seq_length=self.max_sequence_length)
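Putting Example #8 together, a brief usage sketch. It assumes the surrounding class is HanLP's ContextualWordEmbedding (importable from the module named in its docstring) and that the transformer identifier below is available from the Hugging Face hub; the configuration values are illustrative only:

from hanlp.layers.embeddings.contextual_word_embedding import ContextualWordEmbedding

embed = ContextualWordEmbedding(
    field='token',                    # read tokens from the 'token' field of each sample
    transformer='bert-base-chinese',  # any PreTrainedModel identifier; downloads the tokenizer on first use
    average_subwords=True,            # pool subword vectors into one vector per token
    max_sequence_length=512,
    word_dropout=0.1,
)
# In HanLP's Embedding API, embed.transform() is expected to return the
# TransformerSequenceTokenizer built in __init__ above, while embed.module()
# builds the ContextualWordEmbeddingModule that encodes batches.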