Example No. 1
File: amr.py Project: lei1993/HanLP
def subtoken_to_tensor(token_field, ret):
    token_input_ids = PadSequenceDataLoader.pad_data(
        ret[f'{token_field}_input_ids'], 0, torch.long)
    token_token_span = PadSequenceDataLoader.pad_data(
        ret[f'{token_field}_token_span'], 0, torch.long)
    ret.update({
        f'{token_field}_token_span': token_token_span,
        f'{token_field}_input_ids': token_input_ids
    })
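
For reference, PadSequenceDataLoader.pad_data is used above as a static helper that pads a batch of variable-length lists to a common length and returns a tensor, which the function then writes back into ret in place. Below is a minimal sketch of that call, assuming HanLP 2.1's hanlp.common.dataset module; the ids and the pad value 0 are purely illustrative:

# Illustrative sketch, not taken from the project sources.
import torch
from hanlp.common.dataset import PadSequenceDataLoader

batch_ids = [[101, 2023, 102], [101, 2023, 2003, 1037, 102]]
padded = PadSequenceDataLoader.pad_data(batch_ids, 0, torch.long)
# Shorter rows are padded with 0 up to the longest row in the batch,
# so padded is expected to have shape [2, 5].
print(padded.shape)
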
Example No. 2
 def build_dataloader(self, data, batch_size, shuffle, device, logger: logging.Logger = None,
                      sampler_builder: SamplerBuilder = None, gradient_accumulation=1, **kwargs) -> DataLoader:
     if isinstance(data, TransformableDataset):
         dataset = data
     else:
         args = dict((k, self.config.get(k, None)) for k in
                     ['delimiter', 'max_seq_len', 'sent_delimiter', 'char_level', 'hard_constraint'])
         dataset = self.build_dataset(data, **args)
     if self.config.token_key is None:
         self.config.token_key = next(iter(dataset[0]))
         logger.info(
             f'Guess [bold][blue]token_key={self.config.token_key}[/blue][/bold] according to the '
             f'training dataset: [blue]{dataset}[/blue]')
     dataset.append_transform(self.tokenizer_transform)
     dataset.append_transform(self.last_transform())
     if not isinstance(data, list):
         dataset.purge_cache()
     if self.vocabs.mutable:
         self.build_vocabs(dataset, logger)
     if sampler_builder is not None:
         sampler = sampler_builder.build([len(x[f'{self.config.token_key}_input_ids']) for x in dataset], shuffle,
                                         gradient_accumulation=gradient_accumulation if shuffle else 1)
     else:
         sampler = None
     return PadSequenceDataLoader(dataset, batch_size, shuffle, device=device, batch_sampler=sampler)
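
The pattern in this example recurs throughout the listing: build a dataset of dict samples, derive per-sample lengths, optionally build a batch sampler from them, and hand everything to PadSequenceDataLoader. A stripped-down sketch of that last step, assuming HanLP 2.1 and using SortingSampler as a concrete sampler; a plain list of dict samples stands in for the TransformableDataset used above:

# Illustrative sketch, not taken from the project sources.
from hanlp.common.dataset import PadSequenceDataLoader, SortingSampler

samples = [{'token_input_ids': [101, 7592, 102]},
           {'token_input_ids': [101, 7592, 2088, 999, 102]}]
lens = [len(x['token_input_ids']) for x in samples]
sampler = SortingSampler(lens, batch_size=2, shuffle=False)
loader = PadSequenceDataLoader(samples, batch_sampler=sampler)
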
Example No. 3
 def build_dataloader(self,
                      data,
                      transform: TransformList = None,
                      training=False,
                      device=None,
                      logger: logging.Logger = None,
                      tokenizer: PreTrainedTokenizer = None,
                      **kwargs) -> DataLoader:
     assert tokenizer
     dataset = TextTokenizingDataset(data, cache=isinstance(data, str), delimiter=self.config.sent_delimiter,
                                     generate_idx=isinstance(data, list),
                                     max_seq_len=self.config.max_seq_len,
                                     sent_delimiter=self.config.sent_delimiter,
                                     transform=[
                                         TransformerSequenceTokenizer(tokenizer,
                                                                      'text',
                                                                      ret_prefix_mask=True,
                                                                      ret_subtokens=True,
                                                                      ),
                                         FieldLength('text_input_ids', 'text_input_ids_length', delta=-2),
                                         generate_token_span_tuple])
     return PadSequenceDataLoader(
         batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset, 'text_input_ids', 'text'),
                                                  shuffle=training),
         device=device,
         dataset=dataset)
Example No. 4
 def build_dataloader(self,
                      data,
                      transform: Callable = None,
                      training=False,
                      device=None,
                      logger: logging.Logger = None,
                      cache=False,
                      gradient_accumulation=1,
                      **kwargs) -> DataLoader:
     dataset = CRFConstituencyParsing.build_dataset(self, data, transform)
     if isinstance(data, str):
         dataset.purge_cache()
     if self.vocabs.mutable:
         CRFConstituencyParsing.build_vocabs(self, dataset, logger)
     if dataset.cache:
         timer = CountdownTimer(len(dataset))
         # noinspection PyCallByClass
         BiaffineDependencyParser.cache_dataset(self, dataset, timer,
                                                training, logger)
     return PadSequenceDataLoader(batch_sampler=self.sampler_builder.build(
         self.compute_lens(data, dataset),
         shuffle=training,
         gradient_accumulation=gradient_accumulation),
                                  device=device,
                                  dataset=dataset)
Example No. 5
 def build_dataloader(self,
                      data,
                      batch_size,
                      shuffle,
                      device,
                      logger=None,
                      **kwargs) -> DataLoader:
     vocabs = self.vocabs
     token_embed = self._convert_embed()
     dataset = data if isinstance(
         data, TransformableDataset) else self.build_dataset(
             data, transform=[vocabs])
     if vocabs.mutable:
          # Before building vocabs, let each embedding submit its vocab-related transform;
          # some embeddings may opt out because their transforms are not relevant to vocabs.
         if isinstance(token_embed, Embedding):
             transform = token_embed.transform(vocabs=vocabs)
             if transform:
                 dataset.transform.insert(-1, transform)
         self.build_vocabs(dataset, logger)
     if isinstance(token_embed, Embedding):
         # Vocabs built, now add all transforms to the pipeline. Be careful about redundant ones.
         transform = token_embed.transform(vocabs=vocabs)
         if transform and transform not in dataset.transform:
             dataset.transform.insert(-1, transform)
     sampler = SortingSampler(
         [len(sample[self.config.token_key]) for sample in dataset],
         batch_size,
         shuffle=shuffle)
     return PadSequenceDataLoader(dataset,
                                  device=device,
                                  batch_sampler=sampler,
                                  vocabs=vocabs)
Example No. 6
 def build_dataloader(self, data, batch_size, shuffle, device, logger: logging.Logger, sampler_builder,
                      gradient_accumulation,
                      **kwargs) -> DataLoader:
     # shuffle = False  # We need to find the smallest grad_acc
     dataset = HeadDrivenPhraseStructureDataset(data, transform=[append_bos_eos])
     if self.config.get('transform', None):
         dataset.append_transform(self.config.transform)
     dataset.append_transform(self.vocabs)
     if isinstance(self.config.embed, Embedding):
         transform = self.config.embed.transform(vocabs=self.vocabs)
         if transform:
             dataset.append_transform(transform)
     dataset.append_transform(self.vocabs)
     field_length = FieldLength('token')
     dataset.append_transform(field_length)
     if isinstance(data, str):
         dataset.purge_cache()  # Enable cache
     if self.vocabs.mutable:
         self.build_vocabs(dataset, logger)
     if 'token' in self.vocabs:
         lens = [x[field_length.dst] for x in dataset]
     else:
         lens = [len(x['token_input_ids']) for x in dataset]
     if sampler_builder:
         sampler = sampler_builder.build(lens, shuffle, gradient_accumulation)
     else:
         sampler = None
     return PadSequenceDataLoader(batch_sampler=sampler,
                                  batch_size=batch_size,
                                  device=device,
                                  dataset=dataset)
Example No. 7
 def build_dataloader(self,
                      data,
                      batch_size,
                      shuffle,
                      device,
                      logger: logging.Logger = None,
                      vocabs=None,
                      sampler_builder=None,
                      gradient_accumulation=1,
                      **kwargs) -> DataLoader:
     if vocabs is None:
         vocabs = self.vocabs
     transform = TransformList(unpack_ner, FieldLength('token'))
     if isinstance(self.config.embed, Embedding):
         transform.append(self.config.embed.transform(vocabs=vocabs))
     transform.append(self.vocabs)
     dataset = self.build_dataset(data, vocabs, transform)
     if vocabs.mutable:
         self.build_vocabs(dataset, logger, vocabs)
     if 'token' in vocabs:
         lens = [x['token'] for x in dataset]
     else:
         lens = [len(x['token_input_ids']) for x in dataset]
     if sampler_builder:
         sampler = sampler_builder.build(lens, shuffle,
                                         gradient_accumulation)
     else:
         sampler = None
     return PadSequenceDataLoader(batch_sampler=sampler,
                                  device=device,
                                  dataset=dataset)
Example No. 8
File: mlm.py Project: lei1993/HanLP
 def build_dataloader(self,
                      data,
                      batch_size,
                      shuffle=False,
                      device=None,
                      logger: logging.Logger = None,
                      verbose=False,
                      **kwargs) -> DataLoader:
     dataset = MaskedLanguageModelDataset(
         [{
             'token': x
         } for x in data],
         generate_idx=True,
         transform=TransformerTextTokenizer(self.tokenizer,
                                            text_a_key='token'))
     if verbose:
         verbose = CountdownTimer(len(dataset))
     lens = []
     for each in dataset:
         lens.append(len(each['token_input_ids']))
         if verbose:
             verbose.log(
                 'Preprocessing and caching samples [blink][yellow]...[/yellow][/blink]'
             )
     dataloader = PadSequenceDataLoader(dataset,
                                        batch_sampler=SortingSampler(
                                            lens, batch_size=batch_size),
                                        device=device)
     return dataloader
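
Whatever builds the loader, iterating it yields dict batches in which the list-valued fields have been padded into tensors (and moved to the requested device), keyed by the same names as in the samples, e.g. 'token_input_ids' above. A hedged sketch of consuming a loader like the one returned by this example:

# Illustrative sketch; the exact keys depend on the transforms applied.
for batch in dataloader:
    input_ids = batch['token_input_ids']  # padded tensor, roughly [batch, max_len]
    print(input_ids.shape)
    break
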
Example No. 9
 def build_dataloader(self,
                      data,
                      transform: TransformList = None,
                      training=False,
                      device=None,
                      logger: logging.Logger = None,
                      gradient_accumulation=1,
                      **kwargs) -> DataLoader:
     transform.insert(0, append_bos)
     dataset = BiaffineDependencyParser.build_dataset(self, data, transform)
     if isinstance(data, str):
         dataset.purge_cache()
     if self.vocabs.mutable:
         BiaffineDependencyParser.build_vocabs(self,
                                               dataset,
                                               logger,
                                               transformer=True)
     if dataset.cache:
         timer = CountdownTimer(len(dataset))
         BiaffineDependencyParser.cache_dataset(self, dataset, timer,
                                                training, logger)
     max_seq_len = self.config.get('max_seq_len', None)
     if max_seq_len and isinstance(data, str):
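          # Note: 510 presumably corresponds to a 512-subword transformer limit minus [CLS] and [SEP].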
         dataset.prune(lambda x: len(x['token_input_ids']) > 510, logger)
     return PadSequenceDataLoader(batch_sampler=self.sampler_builder.build(
         self.compute_lens(data, dataset, length_field='FORM'),
         shuffle=training,
         gradient_accumulation=gradient_accumulation),
                                  device=device,
                                  dataset=dataset,
                                  pad=self.get_pad_dict())
Example No. 10
 def build_dataloader(self,
                      data,
                      transform: TransformList = None,
                      training=False,
                      device=None,
                      logger: logging.Logger = None,
                      cache=False,
                      gradient_accumulation=1,
                      **kwargs) -> DataLoader:
     args = dict((k, self.config[k]) for k in [
         'delimiter', 'max_seq_len', 'sent_delimiter', 'char_level',
         'hard_constraint'
     ] if k in self.config)
     # We only need those transforms before TransformerTokenizer
     transformer_index = transform.index_by_type(
         TransformerSequenceTokenizer)
     assert transformer_index is not None
     transform = transform[:transformer_index + 1]
     if self.transform:
         transform.insert(0, self.transform)
     transform.append(self.last_transform())
     dataset = self.build_dataset(data,
                                  cache=cache,
                                  transform=transform,
                                  **args)
     if self.vocabs.mutable:
         self.build_vocabs(dataset, logger)
     return PadSequenceDataLoader(batch_sampler=self.sampler_builder.build(
         self.compute_lens(data, dataset, 'token_input_ids'),
         shuffle=training,
         gradient_accumulation=gradient_accumulation),
                                  device=device,
                                  dataset=dataset)
Example No. 11
 def build_dataloader(self,
                      data,
                      transform: Callable = None,
                      training=False,
                      device=None,
                      logger: logging.Logger = None,
                      cache=False,
                      gradient_accumulation=1,
                      **kwargs) -> DataLoader:
     _transform = [generate_lemma_rule, append_bos, self.vocabs, transform]
     if isinstance(data, str) and not self.config.punct:
         _transform.append(PunctuationMask('token', 'punct_mask'))
     dataset = UniversalDependenciesParser.build_dataset(
         self, data, _transform)
     if self.vocabs.mutable:
         UniversalDependenciesParser.build_vocabs(self,
                                                  dataset,
                                                  logger,
                                                  transformer=True)
     max_seq_len = self.config.get('max_seq_len', None)
     if max_seq_len and isinstance(data, str):
         dataset.prune(lambda x: len(x['token_input_ids']) > max_seq_len,
                       logger)
     return PadSequenceDataLoader(batch_sampler=self.sampler_builder.build(
         self.compute_lens(data, dataset, length_field='token'),
         shuffle=training,
         gradient_accumulation=gradient_accumulation),
                                  device=device,
                                  dataset=dataset,
                                  pad={'arc': 0})
Example No. 12
 def build_dataloader(self,
                      data,
                      transform: Callable = None,
                      training=False,
                      device=None,
                      logger: logging.Logger = None,
                      cache=False,
                      gradient_accumulation=1,
                      **kwargs) -> DataLoader:
     args = dict((k, self.config[k]) for k in [
         'delimiter', 'max_seq_len', 'sent_delimiter', 'char_level',
         'hard_constraint'
     ] if k in self.config)
     dataset = self.build_dataset(data,
                                  cache=cache,
                                  transform=transform,
                                  **args)
     dataset.append_transform(self.vocabs)
     dataset.purge_cache()
     if self.vocabs.mutable:
         self.build_vocabs(dataset, logger)
     return PadSequenceDataLoader(batch_sampler=self.sampler_builder.build(
         self.compute_lens(data, dataset),
         shuffle=training,
         gradient_accumulation=gradient_accumulation),
                                  device=device,
                                  dataset=dataset)
Example No. 13
 def build_dataloader(self,
                      data,
                      transform: TransformList = None,
                      training=False,
                      device=None,
                      logger: logging.Logger = None,
                      gradient_accumulation=1,
                      **kwargs) -> DataLoader:
     if isinstance(data, list):
         data = BiaffineSemanticDependencyParser.build_samples(
             self, data, self.config.use_pos)
     dataset = BiaffineSemanticDependencyParser.build_dataset(
         self, data, transform)
     if isinstance(data, str):
         dataset.purge_cache()
     if self.vocabs.mutable:
         BiaffineSemanticDependencyParser.build_vocabs(self,
                                                       dataset,
                                                       logger,
                                                       transformer=True)
     if dataset.cache:
         timer = CountdownTimer(len(dataset))
         BiaffineSemanticDependencyParser.cache_dataset(
             self, dataset, timer, training, logger)
     return PadSequenceDataLoader(batch_sampler=self.sampler_builder.build(
         self.compute_lens(data, dataset),
         shuffle=training,
         gradient_accumulation=gradient_accumulation),
                                  device=device,
                                  dataset=dataset,
                                  pad=self.get_pad_dict())
Example No. 14
 def build_dataloader(self,
                      data,
                      batch_size,
                      sampler_builder: SamplerBuilder = None,
                      gradient_accumulation=1,
                      shuffle=False,
                      device=None,
                      logger: logging.Logger = None,
                      **kwargs) -> DataLoader:
     if isinstance(data, TransformableDataset):
         dataset = data
     else:
         dataset = self.build_dataset(data, [
             self.config.embed.transform(vocabs=self.vocabs), self.vocabs,
             FieldLength('token')
         ])
     if self.vocabs.mutable:
         # noinspection PyTypeChecker
         self.build_vocabs(dataset, logger)
     lens = [len(x['token_input_ids']) for x in dataset]
     if sampler_builder:
         sampler = sampler_builder.build(lens, shuffle,
                                         gradient_accumulation)
     else:
         sampler = None
     return PadSequenceDataLoader(dataset,
                                  batch_size,
                                  shuffle,
                                  device=device,
                                  batch_sampler=sampler)
Example No. 15
 def build_dataloader(self,
                      data,
                      transform: Callable = None,
                      training=False,
                      device=None,
                      logger: logging.Logger = None,
                      gradient_accumulation=1,
                      **kwargs) -> DataLoader:
     dataset = BiaffineSecondaryParser.build_dataset(self, data, transform)
     dataset.purge_cache()
     if self.vocabs.mutable:
         BiaffineSecondaryParser.build_vocabs(self,
                                              dataset,
                                              logger,
                                              transformer=True)
     return PadSequenceDataLoader(batch_sampler=self.sampler_builder.build(
         self.compute_lens(data, dataset),
         shuffle=training,
         gradient_accumulation=gradient_accumulation),
                                  device=device,
                                  dataset=dataset,
                                  pad={
                                      'arc': 0,
                                      'arc_2nd': False
                                  })
Example No. 16
 def build_dataloader(self, data, transform: Callable = None, training=False, device=None,
                      logger: logging.Logger = None, gradient_accumulation=1, **kwargs) -> DataLoader:
     dataset = self.build_dataset(data, isinstance(data, list), logger, transform)
     return PadSequenceDataLoader(
         batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset), shuffle=training,
                                                  gradient_accumulation=gradient_accumulation),
         device=device,
         dataset=dataset)
Example No. 17
 def build_dataloader(self, data, transform: Callable = None, training=False, device=None,
                      logger: logging.Logger = None, cache=False, gradient_accumulation=1, **kwargs) -> DataLoader:
     dataset = self.build_dataset(data, transform=[transform, self.vocabs])
     if self.vocabs.mutable:
         SpanBIOSemanticRoleLabeler.build_vocabs(self, dataset, logger)
     return PadSequenceDataLoader(
         batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset), shuffle=training,
                                                  gradient_accumulation=gradient_accumulation),
         device=device,
         dataset=dataset)
Example No. 18
 def build_dataloader(self,
                      data,
                      shuffle=False,
                      device=None,
                      logger: logging.Logger = None,
                      **kwargs) -> DataLoader:
     embed: FastTextEmbedding = self.config.embed
     dataset = FastTextDataset([{
         'token': data
     }],
                               transform=embed.transform())
     return PadSequenceDataLoader(dataset, device=device)
Example No. 19
    def build_dataloader(self,
                         data,
                         shuffle,
                         device,
                         embed: Embedding,
                         training=False,
                         logger=None,
                         gradient_accumulation=1,
                         sampler_builder=None,
                         batch_size=None,
                         bos='\0',
                         **kwargs) -> DataLoader:
        first_transform = TransformList(functools.partial(append_bos, bos=bos))
        embed_transform = embed.transform(vocabs=self.vocabs)
        transformer_transform = self._get_transformer_transform_from_transforms(
            embed_transform)
        if embed_transform:
            if transformer_transform and isinstance(embed_transform,
                                                    TransformList):
                embed_transform.remove(transformer_transform)

            first_transform.append(embed_transform)
        dataset = self.build_dataset(data, first_transform=first_transform)
        if self.config.get('transform', None):
            dataset.append_transform(self.config.transform)

        if self.vocabs.mutable:
            self.build_vocabs(dataset, logger, self._transformer_trainable())
        if transformer_transform and isinstance(embed_transform,
                                                TransformList):
            embed_transform.append(transformer_transform)

        dataset.append_transform(FieldLength('token', 'sent_length'))
        if isinstance(data, str):
            dataset.purge_cache()
        if len(dataset) > 1000 and isinstance(data, str):
            timer = CountdownTimer(len(dataset))
            self.cache_dataset(dataset, timer, training, logger)
        if sampler_builder:
            lens = [sample['sent_length'] for sample in dataset]
            sampler = sampler_builder.build(lens, shuffle,
                                            gradient_accumulation)
        else:
            sampler = None
        loader = PadSequenceDataLoader(dataset=dataset,
                                       batch_sampler=sampler,
                                       batch_size=batch_size,
                                       pad=self.get_pad_dict(),
                                       device=device,
                                       vocabs=self.vocabs)
        return loader
Example No. 20
 def build_dataloader(self, data, batch_size, shuffle, device, text_a_key, text_b_key,
                      label_key,
                      logger: logging.Logger = None,
                      sorting=True,
                      **kwargs) -> DataLoader:
     if not batch_size:
         batch_size = self.config.batch_size
     dataset = self.build_dataset(data)
     dataset.append_transform(self.vocabs)
     if self.vocabs.mutable:
         if not any([text_a_key, text_b_key]):
             if len(dataset.headers) == 2:
                 self.config.text_a_key = dataset.headers[0]
                 self.config.label_key = dataset.headers[1]
             elif len(dataset.headers) >= 3:
                 self.config.text_a_key, self.config.text_b_key, self.config.label_key = dataset.headers[0], \
                                                                                         dataset.headers[1], \
                                                                                         dataset.headers[-1]
             else:
                 raise ValueError('Wrong dataset format')
             report = {'text_a_key', 'text_b_key', 'label_key'}
             report = dict((k, self.config[k]) for k in report)
             report = [f'{k}={v}' for k, v in report.items() if v]
             report = ', '.join(report)
             logger.info(f'Guess [bold][blue]{report}[/blue][/bold] according to the headers of training dataset: '
                         f'[blue]{dataset}[/blue]')
         self.build_vocabs(dataset, logger)
         dataset.purge_cache()
     # if self.config.transform:
     #     dataset.append_transform(self.config.transform)
     dataset.append_transform(TransformerTextTokenizer(tokenizer=self.transformer_tokenizer,
                                                       text_a_key=self.config.text_a_key,
                                                       text_b_key=self.config.text_b_key,
                                                       max_seq_length=self.config.max_seq_length,
                                                       truncate_long_sequences=self.config.truncate_long_sequences,
                                                       output_key=''))
     batch_sampler = None
     if sorting and not isdebugging():
         if dataset.cache and len(dataset) > 1000:
             timer = CountdownTimer(len(dataset))
             lens = []
             for idx, sample in enumerate(dataset):
                 lens.append(len(sample['input_ids']))
                 timer.log('Pre-processing and caching dataset [blink][yellow]...[/yellow][/blink]',
                           ratio_percentage=None)
         else:
             lens = [len(sample['input_ids']) for sample in dataset]
         batch_sampler = SortingSampler(lens, batch_size=batch_size, shuffle=shuffle,
                                        batch_max_tokens=self.config.batch_max_tokens)
     return PadSequenceDataLoader(dataset, batch_size, shuffle, batch_sampler=batch_sampler, device=device)
Example No. 21
 def build_dataloader(self, data, batch_size, shuffle, device,
                      logger: logging.Logger, **kwargs) -> DataLoader:
     dataset = SentenceBoundaryDetectionDataset(data,
                                                **self.config,
                                                transform=[self.vocabs])
     if isinstance(data, str):
         dataset.purge_cache()
     if not self.vocabs:
         self.build_vocabs(dataset, logger)
     return PadSequenceDataLoader(dataset,
                                  batch_size=batch_size,
                                  shuffle=shuffle,
                                  device=device,
                                   pad={'label_id': 0.0})
Example No. 22
 def build_dataloader(self,
                      data: List[str],
                      shuffle=False,
                      device=None,
                      logger: logging.Logger = None,
                      doc2vec=False,
                      batch_size=32,
                      **kwargs) -> DataLoader:
     dataset = Word2VecDataset(
         [{
             'token': x
         } for x in data],
         transform=self._tokenize if doc2vec else self.vocabs)
     return PadSequenceDataLoader(dataset,
                                  device=device,
                                  batch_size=batch_size)
Example No. 23
    def build_dataloader(self, data, batch_size, shuffle, device, logger: logging.Logger,
                         generate_idx=False, **kwargs) -> DataLoader:
        batch_max_tokens = self.config.batch_max_tokens
        gradient_accumulation = self.config.get('gradient_accumulation', 1)
        if batch_size:
            batch_size //= gradient_accumulation
        if batch_max_tokens:
            batch_max_tokens //= gradient_accumulation
        dataset = self.build_dataset(data, generate_idx, logger)

        sampler = SortingSampler([x['token_length'] for x in dataset],
                                 batch_size=batch_size,
                                 batch_max_tokens=batch_max_tokens,
                                 shuffle=shuffle)
        return PadSequenceDataLoader(batch_sampler=sampler,
                                     device=device,
                                     dataset=dataset)
Example No. 24
 def build_dataloader(self, data, batch_size, shuffle, device,
                      logger: logging.Logger, **kwargs) -> DataLoader:
     dataset = CONLL12CorefDataset(data, [FieldLength('text')])
     if isinstance(self.config.embed, Embedding):
         transform = self.config.embed.transform(vocabs=self.vocabs)
         if transform:
             dataset.append_transform(transform)
     dataset.append_transform(self.vocabs)
     if isinstance(data, str):
         dataset.purge_cache()  # Enable cache
     if self.vocabs.mutable:
         self.build_vocabs(dataset)
     return PadSequenceDataLoader(batch_size=batch_size,
                                  shuffle=shuffle,
                                  device=device,
                                  dataset=dataset,
                                  pad={
                                      'spans': 0,
                                      'span_labels': -1
                                  })
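
Several examples pass a pad dict so that particular fields are padded with a value other than the default, e.g. -1 for 'span_labels' above or 0 for 'arc' elsewhere. A small sketch of that keyword, assuming HanLP 2.1; the samples are made up and a plain list of dicts stands in for the dataset:

# Illustrative sketch, not taken from the project sources.
from hanlp.common.dataset import PadSequenceDataLoader

samples = [{'token_input_ids': [101, 102], 'span_labels': [3]},
           {'token_input_ids': [101, 2023, 102], 'span_labels': [1, 2]}]
loader = PadSequenceDataLoader(samples, pad={'span_labels': -1})
batch = next(iter(loader))
# 'span_labels' is padded with -1; other fields fall back to the loader's default padding.
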
Example No. 25
 def build_dataloader(self,
                      data,
                      transform: TransformList = None,
                      training=False,
                      device=None,
                      logger: logging.Logger = None,
                      gradient_accumulation=1,
                      **kwargs) -> DataLoader:
     transform = copy(transform)
     transform.append(unpack_ner)
     dataset = BiaffineNamedEntityRecognizer.build_dataset(
         self, data, self.vocabs, transform)
     if self.vocabs.mutable:
         BiaffineNamedEntityRecognizer.build_vocabs(self, dataset, logger,
                                                    self.vocabs)
     return PadSequenceDataLoader(batch_sampler=self.sampler_builder.build(
         self.compute_lens(data, dataset),
         shuffle=training,
         gradient_accumulation=gradient_accumulation),
                                  device=device,
                                  dataset=dataset)
Example No. 26
 def build_dataloader(self,
                      data,
                      shuffle,
                      device,
                      training=False,
                      logger=None,
                      gradient_accumulation=1,
                      sampler_builder=None,
                      batch_size=None,
                      **kwargs) -> DataLoader:
     dataset = self.build_dataset(data)
     if self.vocabs.mutable:
         self.build_vocabs(dataset, logger, self.config.transformer)
     transformer_tokenizer = self.transformer_tokenizer
     if transformer_tokenizer:
         dataset.transform.append(self.build_tokenizer_transform())
     dataset.append_transform(FieldLength('token', 'sent_length'))
     if isinstance(data, str):
         dataset.purge_cache()
     if len(dataset) > 1000 and isinstance(data, str):
         timer = CountdownTimer(len(dataset))
         self.cache_dataset(dataset, timer, training, logger)
     if self.config.transformer:
         lens = [len(sample['input_ids']) for sample in dataset]
     else:
         lens = [sample['sent_length'] for sample in dataset]
     if sampler_builder:
         sampler = sampler_builder.build(lens, shuffle,
                                         gradient_accumulation)
     else:
         sampler = None
     loader = PadSequenceDataLoader(dataset=dataset,
                                    batch_sampler=sampler,
                                    batch_size=batch_size,
                                    num_workers=0 if isdebugging() else 2,
                                    pad=self.get_pad_dict(),
                                    device=device,
                                    vocabs=self.vocabs)
     return loader
Example No. 27
 def build_dataloader(self,
                      data,
                      batch_size,
                      sent_a_col=None,
                      sent_b_col=None,
                      similarity_col=None,
                      delimiter='auto',
                      gradient_accumulation=1,
                      sampler_builder=None,
                      shuffle=False,
                      device=None,
                      logger: logging.Logger = None,
                      split=None,
                      **kwargs) -> DataLoader:
     dataset = SemanticTextualSimilarityDataset(data,
                                                sent_a_col,
                                                sent_b_col,
                                                similarity_col,
                                                delimiter=delimiter,
                                                transform=self._tokenizer,
                                                cache=isinstance(data, str))
     if split == 'trn':
         scores = [x['similarity'] for x in dataset]
         self.config.max_score = max(scores)
         self.config.min_score = min(scores)
     if not sampler_builder:
         sampler_builder = SortingSamplerBuilder(batch_size=batch_size)
     lens = [len(x['input_ids']) for x in dataset]
     return PadSequenceDataLoader(dataset,
                                  batch_sampler=sampler_builder.build(
                                      lens, shuffle, gradient_accumulation),
                                  device=device,
                                  pad={
                                      'similarity':
                                      0.0,
                                      'input_ids':
                                      self._tokenizer.tokenizer.pad_token_id
                                  })
Example No. 28
 def build_dataloader(self,
                      data,
                      batch_size,
                      shuffle=False,
                      device=None,
                      logger: logging.Logger = None,
                      sampler_builder=None,
                      gradient_accumulation=1,
                      transformer: ContextualWordEmbedding = None,
                      **kwargs) -> DataLoader:
     transform = [
         generate_lemma_rule, append_bos, self.vocabs,
         transformer.transform(),
         FieldLength('token')
     ]
     if not self.config.punct:
         transform.append(PunctuationMask('token', 'punct_mask'))
     dataset = self.build_dataset(data, transform)
     if self.vocabs.mutable:
         # noinspection PyTypeChecker
         self.build_vocabs(dataset, logger)
     lens = [len(x['token_input_ids']) for x in dataset]
     if sampler_builder:
         sampler = sampler_builder.build(lens, shuffle,
                                         gradient_accumulation)
     else:
         sampler = SortingSamplerBuilder(batch_size).build(
             lens, shuffle, gradient_accumulation)
     return PadSequenceDataLoader(
         dataset,
         batch_size,
         shuffle,
         device=device,
         batch_sampler=sampler,
         pad={'arc': 0},
     )
Example No. 29
File: amr.py Project: lei1993/HanLP
def make_batch_for_squeeze(data, augmented_concept, tokenizer, device, ret):
    token_field = 'token_and_concept'
    attention_mask = []
    token_and_concept = [
        t + [tokenizer.sep_token] + c
        for t, c in zip(data['token'], augmented_concept)
    ]
    encodings = [tokenizer({token_field: x}) for x in token_and_concept]
    ret.update(merge_list_of_dict(encodings))
    max_input_len = len(max(ret[f'{token_field}_input_ids'], key=len))
    concept_mask = []
    token_mask = []
    token_type_ids = []
    snt_len = []
    last_concept_offset = []
    for tokens, concepts, input_ids, spans in zip(
            data['token'], augmented_concept,
            ret['token_and_concept_input_ids'],
            ret['token_and_concept_token_span']):
        raw_sent_len = len(tokens) + 1  # for [SEP]
        raw_concept_len = len(concepts)
        if concepts[-1] == END:
            concept_mask.append([False] * raw_sent_len + [True] *
                                (raw_concept_len - 1) +
                                [False])  # skip END concept
        else:
            concept_mask.append([False] * raw_sent_len +
                                [True] * raw_concept_len)
        token_mask.append([False] + [True] * (raw_sent_len - 2) + [False] *
                          (raw_concept_len + 1))
        assert len(concept_mask) == len(token_mask)
        snt_len.append(raw_sent_len - 2)  # skip [CLS] and [SEP]
        sent_len = input_ids.index(tokenizer.tokenizer.sep_token_id) + 1
        concept_len = len(input_ids) - sent_len
        mask = torch.zeros((max_input_len, max_input_len), dtype=torch.bool)
        mask[:sent_len + concept_len, :sent_len] = True
        bottom_right = ~SelfAttentionMask.get_mask(
            concept_len, device, ret_parameter=False)
        mask[sent_len:sent_len + concept_len,
             sent_len:sent_len + concept_len] = bottom_right
        for group in spans:
            if group[0] >= sent_len:
                for i in range(len(group)):
                    for j in range(i + 1, len(group)):
                        mask[group[i], group[j]] = True
        attention_mask.append(mask)
        _token_type_ids = [0] * sent_len + [1] * concept_len
        token_type_ids.append(_token_type_ids)
        assert len(input_ids) == len(_token_type_ids)
        last_concept_offset.append(raw_concept_len - 1)
    ret['attention_mask'] = torch.stack(attention_mask)
    ret['concept_mask'] = PadSequenceDataLoader.pad_data(
        concept_mask, 0, torch.bool)
    ret['token_mask'] = PadSequenceDataLoader.pad_data(token_mask, 0,
                                                       torch.bool)
    ret['token_type_ids'] = PadSequenceDataLoader.pad_data(
        token_type_ids, 0, torch.long)
    ret['snt_len'] = PadSequenceDataLoader.pad_data(snt_len, 0, torch.long)
    ret['last_concept_offset'] = PadSequenceDataLoader.pad_data(
        last_concept_offset, 0, torch.long)
    return token_field
Example No. 30
    def predict(self,
                data: Union[str, List[str]],
                batch_size: int = None,
                tasks: Optional[Union[str, List[str]]] = None,
                skip_tasks: Optional[Union[str, List[str]]] = None,
                resolved_tasks=None,
                **kwargs) -> Document:
        """Predict on data.

        Args:
            data: A sentence or a list of sentences.
            batch_size: Decoding batch size.
            tasks: The tasks to predict.
            skip_tasks: The tasks to skip.
            resolved_tasks: The resolved tasks to override ``tasks`` and ``skip_tasks``.
            **kwargs: Not used.

        Returns:
            A :class:`~hanlp_common.document.Document`.
        """
        doc = Document()
        if not data:
            return doc

        target_tasks = resolved_tasks or self.resolve_tasks(tasks, skip_tasks)
        flatten_target_tasks = [
            self.tasks[t] for group in target_tasks for t in group
        ]
        cls_is_bos = any([x.cls_is_bos for x in flatten_target_tasks])
        sep_is_eos = any([x.sep_is_eos for x in flatten_target_tasks])
        # Now build the dataloaders and execute tasks
        first_task_name: str = list(target_tasks[0])[0]
        first_task: Task = self.tasks[first_task_name]
        encoder_transform, transform = self.build_transform(first_task)
        # Override the tokenizer config of the 1st task
        encoder_transform.sep_is_eos = sep_is_eos
        encoder_transform.cls_is_bos = cls_is_bos
        average_subwords = self.model.encoder.average_subwords
        flat = first_task.input_is_flat(data)
        if flat:
            data = [data]
        device = self.device
        samples = first_task.build_samples(data,
                                           cls_is_bos=cls_is_bos,
                                           sep_is_eos=sep_is_eos)
        dataloader = first_task.build_dataloader(samples,
                                                 transform=transform,
                                                 device=device)
        results = defaultdict(list)
        order = []
        for batch in dataloader:
            order.extend(batch[IDX])
            # Run the first task, let it make the initial batch for the successors
            output_dict = self.predict_task(first_task,
                                            first_task_name,
                                            batch,
                                            results,
                                            run_transform=True,
                                            cls_is_bos=cls_is_bos,
                                            sep_is_eos=sep_is_eos)
            # Run each task group in order
            for group_id, group in enumerate(target_tasks):
                # We could parallelize this in the future
                for task_name in group:
                    if task_name == first_task_name:
                        continue
                    output_dict = self.predict_task(self.tasks[task_name],
                                                    task_name,
                                                    batch,
                                                    results,
                                                    output_dict,
                                                    run_transform=True,
                                                    cls_is_bos=cls_is_bos,
                                                    sep_is_eos=sep_is_eos)
                if group_id == 0:
                    # We are kind of hard coding here. If the first task is a tokenizer,
                    # we need to convert the hidden and mask to token level
                    if first_task_name.startswith('tok'):
                        spans = []
                        tokens = []
                        for span_per_sent, token_per_sent in zip(
                                output_dict[first_task_name]['prediction'],
                                results[first_task_name][-len(batch[IDX]):]):
                            if cls_is_bos:
                                span_per_sent = [(-1, 0)] + span_per_sent
                                token_per_sent = [BOS] + token_per_sent
                            if sep_is_eos:
                                span_per_sent = span_per_sent + [
                                    (span_per_sent[-1][0] + 1,
                                     span_per_sent[-1][1] + 1)
                                ]
                                token_per_sent = token_per_sent + [EOS]
                            # Offsets are 0-based, but position 0 of the subword sequence is [CLS], so shift by 1
                            if average_subwords:
                                span_per_sent = [
                                    list(range(x[0] + 1, x[1] + 1))
                                    for x in span_per_sent
                                ]
                            else:
                                span_per_sent = [
                                    x[0] + 1 for x in span_per_sent
                                ]
                            spans.append(span_per_sent)
                            tokens.append(token_per_sent)
                        spans = PadSequenceDataLoader.pad_data(spans,
                                                               0,
                                                               torch.long,
                                                               device=device)
                        output_dict['hidden'] = pick_tensor_for_each_token(
                            output_dict['hidden'], spans, average_subwords)
                        batch['token_token_span'] = spans
                        batch['token'] = tokens
                        # noinspection PyTypeChecker
                        batch['token_length'] = torch.tensor(
                            [len(x) for x in tokens],
                            dtype=torch.long,
                            device=device)
                        batch.pop('mask', None)
        # Put results into doc in the order of tasks
        for k in self.config.task_names:
            v = results.get(k, None)
            if v is None:
                continue
            doc[k] = reorder(v, order)
        # Allow task to perform finalization on document
        for group in target_tasks:
            for task_name in group:
                task = self.tasks[task_name]
                task.finalize_document(doc, task_name)
        # If no tok in doc, use raw input as tok
        if not any(k.startswith('tok') for k in doc):
            doc['tok'] = data
        if flat:
            for k, v in list(doc.items()):
                doc[k] = v[0]
        # If there is only one field, don't bother to wrap it
        # if len(doc) == 1:
        #     return list(doc.values())[0]
        return doc