def compute_lens(self, data: Union[List[Dict[str, Any]], str], dataset: TransformableDataset,
                 input_ids='token_input_ids'):
    """Measure the length of every sample in ``dataset``.

    Args:
        data: Samples to be measured, or a path to the dataset during training time.
        dataset: During training time, this dataset is used to measure the length of
            each sample inside it.
        input_ids: Field name corresponding to the input ids.

    Returns:
        A list with the length of each sample.
    """
    if not dataset.cache:
        warnings.warn(f'Caching for the dataset is not enabled, '
                      f'try `dataset.purge_cache()` if possible. The dataset is {dataset}.')
    if isinstance(data, str):
        # Training time: walk the dataset once up front so every sample gets
        # transformed and cached before lengths are collected below.
        progress = CountdownTimer(len(dataset))
        for _sample in dataset:
            progress.log('Preprocessing and caching samples [blink][yellow]...[/yellow][/blink]')
        progress.erase()
    return [len(sample[input_ids]) for sample in dataset]
def build_vocabs(self, dataset, logger, **kwargs):
    """Build the SRL label vocabulary from ``dataset``, then lock and summarize it.

    Args:
        dataset: Transformed samples; iterated once to populate the vocab and to
            measure the maximum sequence length for progress reporting.
        logger: Logger that the vocab summary is written to.
        **kwargs: Unused; accepted for interface compatibility.
    """
    self.vocabs.srl_label = Vocab(pad_token=None, unk_token=None)
    # Use null to indicate no relationship
    self.vocabs.srl_label.add('<null>')
    timer = CountdownTimer(len(dataset))
    max_seq_len = 0
    # NOTE(review): nothing in this loop adds labels explicitly; presumably
    # iterating applies the dataset's transforms, which populate the vocab as a
    # side effect — confirm against the dataset implementation.
    for each in dataset:
        max_seq_len = max(max_seq_len, len(each['token_input_ids']))
        timer.log(f'Building vocabs (max sequence length {max_seq_len}) [blink][yellow]...[/yellow][/blink]')
    timer.stop()
    timer.erase()
    # '<null>' doubles as the safe unknown label at prediction time.
    self.vocabs['srl_label'].set_unk_as_safe_unk()
    self.vocabs.lock()
    self.vocabs.summary(logger)