def subtoken_to_tensor(token_field, ret):
    token_input_ids = PadSequenceDataLoader.pad_data(ret[f'{token_field}_input_ids'], 0, torch.long)
    token_token_span = PadSequenceDataLoader.pad_data(ret[f'{token_field}_token_span'], 0, torch.long)
    ret.update({
        f'{token_field}_token_span': token_token_span,
        f'{token_field}_input_ids': token_input_ids
    })

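# Usage sketch (not from the original source): PadSequenceDataLoader.pad_data pads a ragged
# list of lists into a rectangular tensor, which is what the helper above relies on. The
# import path below is an assumption based on HanLP's module layout.
import torch
from hanlp.common.dataset import PadSequenceDataLoader

subword_ids = [[101, 2769, 102], [101, 2769, 812, 1962, 102]]
padded = PadSequenceDataLoader.pad_data(subword_ids, 0, torch.long)
# padded is a 2 x 5 LongTensor; the shorter row is right-padded with 0.
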
def build_dataloader(self, data, transform: Callable = None, training=False, device=None,
                     logger: logging.Logger = None, gradient_accumulation=1, **kwargs) -> DataLoader:
    dataset = BiaffineSecondaryParser.build_dataset(self, data, transform)
    if isinstance(data, str):
        dataset.purge_cache()
    if self.vocabs.mutable:
        BiaffineSecondaryParser.build_vocabs(self, dataset, logger, transformer=True)
    max_seq_len = self.config.get('max_seq_len', None)
    if max_seq_len and isinstance(data, str):
        dataset.prune(lambda x: len(x['token_input_ids']) > 510, logger)
    if dataset.cache:
        timer = CountdownTimer(len(dataset))
        BiaffineSecondaryDependencyParsing.cache_dataset(self, dataset, timer, training, logger)
    return PadSequenceDataLoader(
        batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset),
                                                 shuffle=training,
                                                 gradient_accumulation=gradient_accumulation),
        device=device,
        dataset=dataset,
        pad={'arc': 0, 'arc_2nd': False})

def build_dataloader(self, data, transform: TransformList = None, training=False, device=None,
                     logger: logging.Logger = None, gradient_accumulation=1, **kwargs) -> DataLoader:
    transform.insert(0, append_bos)
    dataset = BiaffineDependencyParser.build_dataset(self, data, transform)
    if isinstance(data, str):
        dataset.purge_cache()
    if self.vocabs.mutable:
        BiaffineDependencyParser.build_vocabs(self, dataset, logger, transformer=True)
    if dataset.cache:
        timer = CountdownTimer(len(dataset))
        BiaffineDependencyParser.cache_dataset(self, dataset, timer, training, logger)
    max_seq_len = self.config.get('max_seq_len', None)
    if max_seq_len and isinstance(data, str):
        dataset.prune(lambda x: len(x['token_input_ids']) > 510, logger)
    return PadSequenceDataLoader(
        batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset, length_field='FORM'),
                                                 shuffle=training,
                                                 gradient_accumulation=gradient_accumulation),
        device=device,
        dataset=dataset,
        pad=self.get_pad_dict())

def build_dataloader(self, data: List[List[str]], transform: Callable = None, training=False,
                     device=None, logger: logging.Logger = None, cache=False,
                     gradient_accumulation=1, **kwargs) -> DataLoader:
    args = dict((k, self.config[k]) for k in
                ['delimiter', 'max_seq_len', 'sent_delimiter', 'char_level', 'hard_constraint'])
    dataset = self.build_dataset(data, cache=cache, transform=transform, **args)
    dataset.append_transform(self.vocabs)
    if self.vocabs.mutable:
        self.build_vocabs(dataset, logger)
    return PadSequenceDataLoader(
        batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset, 'token_input_ids', 'token'),
                                                 shuffle=training,
                                                 gradient_accumulation=gradient_accumulation),
        device=device,
        dataset=dataset)

def build_dataloader(self, data, batch_size, sampler_builder: SamplerBuilder = None,
                     gradient_accumulation=1, shuffle=False, device=None,
                     logger: logging.Logger = None, **kwargs) -> DataLoader:
    if isinstance(data, TransformDataset):
        dataset = data
    else:
        transform = self.config.encoder.transform()
        if self.config.get('transform', None):
            transform = TransformList(self.config.transform, transform)
        dataset = self.build_dataset(data, transform, logger)
    if self.vocabs.mutable:
        # noinspection PyTypeChecker
        self.build_vocabs(dataset, logger)
    lens = [len(x['token_input_ids']) for x in dataset]
    if sampler_builder:
        sampler = sampler_builder.build(lens, shuffle, gradient_accumulation)
    else:
        sampler = None
    return PadSequenceDataLoader(dataset, batch_size, shuffle, device=device, batch_sampler=sampler)

def build_dataloader(self, data, transform: Callable = None, training=False, device=None,
                     logger: logging.Logger = None, cache=False, gradient_accumulation=1,
                     **kwargs) -> DataLoader:
    dataset = CRFConstituencyParsing.build_dataset(self, data, transform)
    if isinstance(data, str):
        dataset.purge_cache()
    if self.vocabs.mutable:
        CRFConstituencyParsing.build_vocabs(self, dataset, logger)
    if dataset.cache:
        timer = CountdownTimer(len(dataset))
        # noinspection PyCallByClass
        BiaffineDependencyParser.cache_dataset(self, dataset, timer, training, logger)
    return PadSequenceDataLoader(
        batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset),
                                                 shuffle=training,
                                                 gradient_accumulation=gradient_accumulation),
        device=device,
        dataset=dataset)

def _create_dataloader(self, dataset, batch_size, device, sampler, shuffle):
    return PadSequenceDataLoader(dataset, batch_size, shuffle, device=device, batch_sampler=sampler,
                                 pad=self._get_pad_dict())

def build_dataloader(self, data, transform: Callable = None, training=False, device=None,
                     logger: logging.Logger = None, gradient_accumulation=1, **kwargs) -> DataLoader:
    dataset = BiaffineSecondaryParser.build_dataset(self, data, transform)
    if isinstance(data, str):
        dataset.purge_cache()
    if self.vocabs.mutable:
        BiaffineSecondaryParser.build_vocabs(self, dataset, logger, transformer=True)
    return PadSequenceDataLoader(
        batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset),
                                                 shuffle=training,
                                                 gradient_accumulation=gradient_accumulation),
        device=device,
        dataset=dataset,
        pad={'arc': 0, 'arc_2nd': False})

def build_dataloader(self, data, batch_size, shuffle, device, logger: logging.Logger,
                     **kwargs) -> DataLoader:
    dataset = SentenceBoundaryDetectionDataset(data, **self.config, transform=[self.vocabs])
    if isinstance(data, str):
        dataset.purge_cache()
    if not self.vocabs:
        self.build_vocabs(dataset, logger)
    return PadSequenceDataLoader(dataset, batch_size=batch_size, shuffle=shuffle, device=device,
                                 pad={'label_id': .0})

def build_dataloader(self, data, transform: Callable = None, training=False, device=None,
                     logger: logging.Logger = None, gradient_accumulation=1, **kwargs) -> DataLoader:
    dataset = self.build_dataset(data, isinstance(data, list), logger, transform)
    return PadSequenceDataLoader(
        batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset),
                                                 shuffle=training,
                                                 gradient_accumulation=gradient_accumulation),
        device=device,
        dataset=dataset)

def build_dataloader(self, data, shuffle, device, embed: Embedding, training=False, logger=None,
                     gradient_accumulation=1, sampler_builder=None, batch_size=None, bos='\0',
                     **kwargs) -> DataLoader:
    first_transform = TransformList(functools.partial(append_bos, bos=bos))
    embed_transform = embed.transform(vocabs=self.vocabs)
    transformer_transform = self._get_transformer_transform_from_transforms(embed_transform)
    if embed_transform:
        if transformer_transform and isinstance(embed_transform, TransformList):
            embed_transform.remove(transformer_transform)
        first_transform.append(embed_transform)
    dataset = self.build_dataset(data, first_transform=first_transform)
    if self.config.get('transform', None):
        dataset.append_transform(self.config.transform)
    if self.vocabs.mutable:
        self.build_vocabs(dataset, logger, self._transformer_trainable())
    if transformer_transform and isinstance(embed_transform, TransformList):
        embed_transform.append(transformer_transform)
    dataset.append_transform(FieldLength('token', 'sent_length'))
    if isinstance(data, str):
        dataset.purge_cache()
    if len(dataset) > 1000 and isinstance(data, str):
        timer = CountdownTimer(len(dataset))
        self.cache_dataset(dataset, timer, training, logger)
    if sampler_builder:
        lens = [sample['sent_length'] for sample in dataset]
        sampler = sampler_builder.build(lens, shuffle, gradient_accumulation)
    else:
        sampler = None
    loader = PadSequenceDataLoader(dataset=dataset,
                                   batch_sampler=sampler,
                                   batch_size=batch_size,
                                   pad=self.get_pad_dict(),
                                   device=device,
                                   vocabs=self.vocabs)
    return loader

def build_dataloader(self, data, transform: TransformList = None, training=False, device=None,
                     logger: logging.Logger = None, gradient_accumulation=1, **kwargs) -> DataLoader:
    if isinstance(data, list):
        data = BiaffineSemanticDependencyParser.build_samples(self, data, self.config.use_pos)
    dataset = BiaffineSemanticDependencyParser.build_dataset(self, data, transform)
    if isinstance(data, str):
        dataset.purge_cache()
    if self.vocabs.mutable:
        BiaffineSemanticDependencyParser.build_vocabs(self, dataset, logger, transformer=True)
    if dataset.cache:
        timer = CountdownTimer(len(dataset))
        BiaffineSemanticDependencyParser.cache_dataset(self, dataset, timer, training, logger)
    return PadSequenceDataLoader(
        batch_sampler=self.sampler_builder.build(self.compute_lens(data, dataset),
                                                 shuffle=training,
                                                 gradient_accumulation=gradient_accumulation),
        device=device,
        dataset=dataset,
        pad=self.get_pad_dict())

def build_dataloader(self, data, batch_size, shuffle, device, logger: logging.Logger = None,
                     sampler_builder: SamplerBuilder = None, gradient_accumulation=1,
                     **kwargs) -> DataLoader:
    if isinstance(data, TransformDataset):
        dataset = data
    else:
        args = dict((k, self.config[k]) for k in
                    ['delimiter', 'max_seq_len', 'sent_delimiter', 'char_level', 'hard_constraint'])
        dataset = self.build_dataset(data, **args)
    if self.config.token_key is None:
        self.config.token_key = next(iter(dataset[0]))
        logger.info(f'Guess [bold][blue]token_key={self.config.token_key}[/blue][/bold] according to the '
                    f'training dataset: [blue]{dataset}[/blue]')
    dataset.append_transform(self.tokenizer_transform)
    dataset.append_transform(self.last_transform())
    if not isinstance(data, list):
        dataset.purge_cache()
    if self.vocabs.mutable:
        self.build_vocabs(dataset, logger)
    if sampler_builder is not None:
        sampler = sampler_builder.build([len(x[f'{self.config.token_key}_input_ids']) for x in dataset],
                                        shuffle,
                                        gradient_accumulation=gradient_accumulation if shuffle else 1)
    else:
        sampler = None
    return PadSequenceDataLoader(dataset, batch_size, shuffle, device=device, batch_sampler=sampler)

def build_dataloader(self, data, shuffle, device, training=False, logger=None,
                     gradient_accumulation=1, sampler_builder=None, batch_size=None,
                     **kwargs) -> DataLoader:
    dataset = self.build_dataset(data)
    if self.vocabs.mutable:
        self.build_vocabs(dataset, logger, self.config.transformer)
    transformer_tokenizer = self.transformer_tokenizer
    if transformer_tokenizer:
        dataset.transform.append(self.build_tokenizer_transform())
    dataset.append_transform(FieldLength('token', 'sent_length'))
    if isinstance(data, str):
        dataset.purge_cache()
    if len(dataset) > 1000 and isinstance(data, str):
        timer = CountdownTimer(len(dataset))
        self.cache_dataset(dataset, timer, training, logger)
    if self.config.transformer:
        lens = [len(sample['input_ids']) for sample in dataset]
    else:
        lens = [sample['sent_length'] for sample in dataset]
    if sampler_builder:
        sampler = sampler_builder.build(lens, shuffle, gradient_accumulation)
    else:
        sampler = None
    loader = PadSequenceDataLoader(dataset=dataset,
                                   batch_sampler=sampler,
                                   batch_size=batch_size,
                                   num_workers=0 if isdebugging() else 2,
                                   pad=self.get_pad_dict(),
                                   device=device,
                                   vocabs=self.vocabs)
    return loader

def build_dataloader(self, data, batch_size, shuffle, device, logger: logging.Logger,
                     generate_idx=False, **kwargs) -> DataLoader:
    batch_max_tokens = self.config.batch_max_tokens
    gradient_accumulation = self.config.get('gradient_accumulation', 1)
    if batch_size:
        batch_size //= gradient_accumulation
    if batch_max_tokens:
        batch_max_tokens //= gradient_accumulation
    dataset = self.build_dataset(data, generate_idx, logger)
    sampler = SortingSampler([x['token_length'] for x in dataset],
                             batch_size=batch_size,
                             batch_max_tokens=batch_max_tokens,
                             shuffle=shuffle)
    return PadSequenceDataLoader(batch_sampler=sampler, device=device, dataset=dataset)

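# Illustrative arithmetic (not in the original source): dividing batch_size and batch_max_tokens
# by gradient_accumulation above keeps the effective batch per optimizer step unchanged, because
# gradients from several smaller forward passes are accumulated before each update. The example
# numbers below are assumptions for illustration only.
batch_size, batch_max_tokens, gradient_accumulation = 32, 5000, 4
per_pass_batch = batch_size // gradient_accumulation          # 8 samples per forward/backward pass
per_pass_tokens = batch_max_tokens // gradient_accumulation   # 1250 tokens per forward/backward pass
effective_batch = per_pass_batch * gradient_accumulation      # still 32 samples per optimizer update
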
def build_dataloader(self, data, batch_size, shuffle, device, logger=None, **kwargs) -> DataLoader:
    vocabs = self.vocabs
    token_embed = self._convert_embed()
    dataset = data if isinstance(data, TransformDataset) else self.build_dataset(data, transform=[vocabs])
    if vocabs.mutable:
        # Before building vocabs, let embeddings submit their vocabs; some embeddings may opt out
        # because their transforms are not relevant to vocabs.
        if isinstance(token_embed, Embedding):
            transform = token_embed.transform(vocabs=vocabs)
            if transform:
                dataset.transform.insert(-1, transform)
        self.build_vocabs(dataset, logger)
    if isinstance(token_embed, Embedding):
        # Vocabs built, now add all transforms to the pipeline. Be careful about redundant ones.
        transform = token_embed.transform(vocabs=vocabs)
        if transform and transform not in dataset.transform:
            dataset.transform.insert(-1, transform)
    sampler = SortingSampler([len(sample[self.config.token_key]) for sample in dataset],
                             batch_size, shuffle=shuffle)
    return PadSequenceDataLoader(dataset, device=device, batch_sampler=sampler, vocabs=vocabs)

def build_dataloader(self, data, batch_size, shuffle, device, logger: logging.Logger = None,
                     vocabs=None, sampler_builder=None, gradient_accumulation=1,
                     **kwargs) -> DataLoader:
    if vocabs is None:
        vocabs = self.vocabs
    transform = TransformList(unpack_ner, FieldLength('token'))
    if isinstance(self.config.embed, Embedding):
        transform.append(self.config.embed.transform(vocabs=vocabs))
    transform.append(self.vocabs)
    dataset = self.build_dataset(data, vocabs, transform)
    if vocabs.mutable:
        self.build_vocabs(dataset, logger, vocabs)
    # Sequence lengths for the sampler: token count when a token vocab exists, otherwise subword count.
    if 'token' in vocabs:
        lens = [len(x['token']) for x in dataset]
    else:
        lens = [len(x['token_input_ids']) for x in dataset]
    if sampler_builder:
        sampler = sampler_builder.build(lens, shuffle, gradient_accumulation)
    else:
        sampler = None
    return PadSequenceDataLoader(batch_sampler=sampler, device=device, dataset=dataset)

def make_batch_for_squeeze(data, augmented_concept, tokenizer, device, ret):
    token_field = 'token_and_concept'
    attention_mask = []
    token_and_concept = [t + [tokenizer.sep_token] + c for t, c in zip(data['token'], augmented_concept)]
    encodings = [tokenizer({token_field: x}) for x in token_and_concept]
    ret.update(merge_list_of_dict(encodings))
    max_input_len = len(max(ret[f'{token_field}_input_ids'], key=len))
    concept_mask = []
    token_mask = []
    token_type_ids = []
    snt_len = []
    last_concept_offset = []
    for tokens, concepts, input_ids, spans in zip(data['token'], augmented_concept,
                                                  ret['token_and_concept_input_ids'],
                                                  ret['token_and_concept_token_span']):
        raw_sent_len = len(tokens) + 1  # for [SEP]
        raw_concept_len = len(concepts)
        if concepts[-1] == END:
            concept_mask.append([False] * raw_sent_len + [True] * (raw_concept_len - 1) + [False])  # skip END concept
        else:
            concept_mask.append([False] * raw_sent_len + [True] * raw_concept_len)
        token_mask.append([False] + [True] * (raw_sent_len - 2) + [False] * (raw_concept_len + 1))
        assert len(concept_mask) == len(token_mask)
        snt_len.append(raw_sent_len - 2)  # skip [CLS] and [SEP]
        sent_len = input_ids.index(tokenizer.tokenizer.sep_token_id) + 1
        concept_len = len(input_ids) - sent_len
        mask = torch.zeros((max_input_len, max_input_len), dtype=torch.bool)
        mask[:sent_len + concept_len, :sent_len] = True
        bottom_right = ~SelfAttentionMask.get_mask(concept_len, device, ret_parameter=False)
        mask[sent_len:sent_len + concept_len, sent_len:sent_len + concept_len] = bottom_right
        for group in spans:
            if group[0] >= sent_len:
                for i in range(len(group)):
                    for j in range(i + 1, len(group)):
                        mask[group[i], group[j]] = True
        attention_mask.append(mask)
        _token_type_ids = [0] * sent_len + [1] * concept_len
        token_type_ids.append(_token_type_ids)
        assert len(input_ids) == len(_token_type_ids)
        last_concept_offset.append(raw_concept_len - 1)
    ret['attention_mask'] = torch.stack(attention_mask)
    ret['concept_mask'] = PadSequenceDataLoader.pad_data(concept_mask, 0, torch.bool)
    ret['token_mask'] = PadSequenceDataLoader.pad_data(token_mask, 0, torch.bool)
    ret['token_type_ids'] = PadSequenceDataLoader.pad_data(token_type_ids, 0, torch.long)
    ret['snt_len'] = PadSequenceDataLoader.pad_data(snt_len, 0, torch.long)
    ret['last_concept_offset'] = PadSequenceDataLoader.pad_data(last_concept_offset, 0, torch.long)
    return token_field

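# Minimal sketch (not from the original source) of the block attention mask built above:
# every position may attend to the source-token block, while concept positions additionally
# get a causal (lower-triangular) view of earlier concepts. This reproduces the shape of the
# construction with plain torch, assuming SelfAttentionMask.get_mask marks future positions;
# the per-subword-group loop that opens forward attention inside one concept is omitted.
import torch

def squeeze_style_mask(sent_len: int, concept_len: int) -> torch.Tensor:
    size = sent_len + concept_len
    mask = torch.zeros((size, size), dtype=torch.bool)
    # All rows (tokens and concepts) can attend to the token columns.
    mask[:, :sent_len] = True
    # Concept rows attend to themselves and to previous concepts only.
    mask[sent_len:, sent_len:] = torch.tril(torch.ones((concept_len, concept_len), dtype=torch.bool))
    return mask

# e.g. squeeze_style_mask(3, 2)[3] lets the first concept see the 3 tokens and itself,
# but not the second concept.
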
def build_dataloader(self, data, batch_size, shuffle, device, text_a_key, text_b_key, label_key,
                     logger: logging.Logger = None, sorting=True, **kwargs) -> DataLoader:
    if not batch_size:
        batch_size = self.config.batch_size
    dataset = self.build_dataset(data)
    dataset.append_transform(self.vocabs)
    if self.vocabs.mutable:
        if not any([text_a_key, text_b_key]):
            if len(dataset.headers) == 2:
                self.config.text_a_key = dataset.headers[0]
                self.config.label_key = dataset.headers[1]
            elif len(dataset.headers) >= 3:
                self.config.text_a_key, self.config.text_b_key, self.config.label_key = \
                    dataset.headers[0], dataset.headers[1], dataset.headers[-1]
            else:
                raise ValueError('Wrong dataset format')
            report = {'text_a_key', 'text_b_key', 'label_key'}
            report = dict((k, self.config[k]) for k in report)
            report = [f'{k}={v}' for k, v in report.items() if v]
            report = ', '.join(report)
            logger.info(f'Guess [bold][blue]{report}[/blue][/bold] according to the headers of training dataset: '
                        f'[blue]{dataset}[/blue]')
        self.build_vocabs(dataset, logger)
        dataset.purge_cache()
    # if self.config.transform:
    #     dataset.append_transform(self.config.transform)
    dataset.append_transform(TransformerTextTokenizer(tokenizer=self.transformer_tokenizer,
                                                      text_a_key=self.config.text_a_key,
                                                      text_b_key=self.config.text_b_key,
                                                      max_seq_length=self.config.max_seq_length,
                                                      truncate_long_sequences=self.config.truncate_long_sequences,
                                                      output_key=''))
    batch_sampler = None
    if sorting and not isdebugging():
        if dataset.cache and len(dataset) > 1000:
            timer = CountdownTimer(len(dataset))
            lens = []
            for idx, sample in enumerate(dataset):
                lens.append(len(sample['input_ids']))
                timer.log('Pre-processing and caching dataset [blink][yellow]...[/yellow][/blink]',
                          ratio_percentage=None)
        else:
            lens = [len(sample['input_ids']) for sample in dataset]
        batch_sampler = SortingSampler(lens, batch_size=batch_size, shuffle=shuffle,
                                       batch_max_tokens=self.config.batch_max_tokens)
    return PadSequenceDataLoader(dataset, batch_size, shuffle, batch_sampler=batch_sampler,
                                 device=device, collate_fn=self.collate_fn)

def predict(self, data: Union[str, List[str]], batch_size: int = None,
            tasks: Optional[Union[str, List[str]]] = None, resolve_dependencies=True,
            **kwargs) -> Document:
    doc = Document()
    if not data:
        return doc
    if resolve_dependencies:
        # Now we decide which tasks to perform and their orders
        tasks_in_topological_order = self._tasks_in_topological_order
        task_topological_order = self._task_topological_order
        computation_graph = self._computation_graph
        target_tasks = self._resolve_task_name(tasks)
        if not target_tasks:
            target_tasks = tasks_in_topological_order
        else:
            target_topological_order = defaultdict(set)
            for task_name in target_tasks:
                if task_name not in computation_graph:
                    continue
                for dependency in topological_sort(computation_graph, task_name):
                    target_topological_order[task_topological_order[dependency]].add(dependency)
            target_tasks = [item[1] for item in sorted(target_topological_order.items())]
    else:
        target_tasks = [set(tasks)] if isinstance(tasks, list) else [{tasks}]
    if not target_tasks:
        return Document()
    # Sort target tasks within the same group in a defined order
    target_tasks = [sorted(x, key=lambda _x: self.config.task_names.index(_x)) for x in target_tasks]
    flatten_target_tasks = [self.tasks[t] for group in target_tasks for t in group]
    cls_is_bos = any([x.cls_is_bos for x in flatten_target_tasks])
    sep_is_eos = any([x.sep_is_eos for x in flatten_target_tasks])
    # Now build the dataloaders and execute tasks
    first_task_name: str = list(target_tasks[0])[0]
    first_task: Task = self.tasks[first_task_name]
    encoder_transform, transform = self.build_transform(first_task)
    # Override the tokenizer config of the 1st task
    encoder_transform.sep_is_eos = sep_is_eos
    encoder_transform.cls_is_bos = cls_is_bos
    average_subwords = self.model.encoder.average_subwords
    flat = first_task.input_is_flat(data)
    if flat:
        data = [data]
    device = self.device
    samples = first_task.build_samples(data, cls_is_bos=cls_is_bos, sep_is_eos=sep_is_eos)
    dataloader = first_task.build_dataloader(samples, transform=transform, device=device)
    results = defaultdict(list)
    order = []
    for batch in dataloader:
        order.extend(batch[IDX])
        # Run the first task and let it make the initial batch for the successors
        output_dict = self.predict_task(first_task, first_task_name, batch, results, run_transform=True,
                                        cls_is_bos=cls_is_bos, sep_is_eos=sep_is_eos)
        # Run each task group in order
        for group_id, group in enumerate(target_tasks):
            # We could parallelize this in the future
            for task_name in group:
                if task_name == first_task_name:
                    continue
                output_dict = self.predict_task(self.tasks[task_name], task_name, batch, results, output_dict,
                                                run_transform=True, cls_is_bos=cls_is_bos, sep_is_eos=sep_is_eos)
            if group_id == 0:
                # We are kind of hard coding here. If the first task is a tokenizer,
                # we need to convert the hidden and mask to token level.
                if 'token_token_span' not in batch:
                    spans = []
                    tokens = []
                    for span_per_sent, token_per_sent in zip(output_dict[first_task_name]['prediction'],
                                                             results[first_task_name]):
                        if cls_is_bos:
                            span_per_sent = [(-1, 0)] + span_per_sent
                            token_per_sent = [BOS] + token_per_sent
                        if sep_is_eos:
                            span_per_sent = span_per_sent + [(span_per_sent[-1][0] + 1, span_per_sent[-1][1] + 1)]
                            token_per_sent = token_per_sent + [EOS]
                        # Subword offsets start at 0, but position 0 is taken by [CLS], so shift every offset by 1
                        if average_subwords:
                            span_per_sent = [list(range(x[0] + 1, x[1] + 1)) for x in span_per_sent]
                        else:
                            span_per_sent = [x[0] + 1 for x in span_per_sent]
                        spans.append(span_per_sent)
                        tokens.append(token_per_sent)
                    spans = PadSequenceDataLoader.pad_data(spans, 0, torch.long, device=device)
                    output_dict['hidden'] = pick_tensor_for_each_token(output_dict['hidden'], spans,
                                                                       average_subwords)
                    batch['token_token_span'] = spans
                    batch['token'] = tokens
                    # noinspection PyTypeChecker
                    batch['token_length'] = torch.tensor([len(x) for x in tokens], dtype=torch.long, device=device)
                    batch.pop('mask', None)
    # Put results into doc in the order of tasks
    for k in self.config.task_names:
        v = results.get(k, None)
        if v is None:
            continue
        doc[k] = reorder(v, order)
    # Allow each task to perform finalization on the document
    for group in target_tasks:
        for task_name in group:
            task = self.tasks[task_name]
            task.finalize_document(doc, task_name)
    # If there is no tok in doc, use the raw input as tok
    if not any(k.startswith('tok') for k in doc):
        doc['tok'] = data
    if flat:
        for k, v in list(doc.items()):
            doc[k] = v[0]
    # If there is only one field, don't bother to wrap it
    # if len(doc) == 1:
    #     return list(doc.values())[0]
    return doc

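# Illustrative sketch (not from the original source) of the dependency resolution idea in
# predict(): requested tasks are expanded with their dependencies and grouped by topological
# order, so asking for e.g. 'ner' also schedules 'tok' in an earlier group. The toy graph and
# helper below are assumptions for illustration and do not use HanLP's own topological_sort.
from collections import defaultdict

def resolve(graph: dict, targets: list) -> list:
    # graph maps a task to the tasks it depends on; depth = length of the longest dependency chain
    def depth(task):
        deps = graph.get(task, [])
        return 0 if not deps else 1 + max(depth(d) for d in deps)

    grouped = defaultdict(set)

    def visit(task):
        grouped[depth(task)].add(task)
        for dep in graph.get(task, []):
            visit(dep)

    for t in targets:
        visit(t)
    # Earlier groups must finish before later ones, mirroring the grouped execution above.
    return [sorted(grouped[k]) for k in sorted(grouped)]

# resolve({'ner': ['tok'], 'dep': ['tok']}, ['ner']) -> [['tok'], ['ner']]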