def load_data(
    self,
    data: Any,
    columns: List[str] = ["input_ids", "attention_mask", "labels"],
) -> 'datasets.Dataset':
    file, input, target = data
    data_files = {}
    stage = self._running_stage.value
    data_files[stage] = str(file)

    # FLASH_TESTING is set in the CI to run faster.
    if flash._IS_TESTING:
        try:
            # Load only the first 20 rows to keep CI fast.
            dataset_dict = DatasetDict({
                stage: load_dataset(self.filetype, data_files=data_files, split=[f'{stage}[:20]'])[0]
            })
        except Exception:
            dataset_dict = load_dataset(self.filetype, data_files=data_files)
    else:
        dataset_dict = load_dataset(self.filetype, data_files=data_files)

    dataset_dict = dataset_dict.map(partial(self._tokenize_fn, input=input, target=target), batched=True)
    dataset_dict.set_format(columns=columns)
    return dataset_dict[stage]
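# For illustration only: a minimal sketch of the ``_tokenize_fn`` hook that the
# ``map`` call above expects. The real hook lives on the preprocess class; the
# ``self.tokenizer`` attribute, padding, and truncation settings here are
# assumptions, not the library's actual implementation.
def _tokenize_fn(self, batch: Mapping[str, Any], input: str = None, target: str = None) -> Mapping[str, Any]:
    # With ``batched=True``, ``batch[input]`` is a list of strings.
    encodings = self.tokenizer(batch[input], padding="max_length", truncation=True)
    if target is not None and target in batch:
        # Tokenize the target column as seq2seq labels (assumed behavior).
        encodings["labels"] = self.tokenizer(batch[target], padding="max_length", truncation=True)["input_ids"]
    return encodings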
def load_data(
    self,
    file: str,
    use_full: bool = True,
    columns: List[str] = ["input_ids", "attention_mask", "labels"],
) -> 'datasets.Dataset':
    data_files = {}
    stage = self._running_stage.value
    data_files[stage] = str(file)

    # FLASH_TESTING is set in the CI to run faster.
    if use_full and os.getenv("FLASH_TESTING", "0") == "0":
        dataset_dict = load_dataset(self.filetype, data_files=data_files)
    else:
        # used for debugging. Avoid processing the entire dataset  # noqa E265
        try:
            dataset_dict = DatasetDict({
                stage: load_dataset(self.filetype, data_files=data_files, split=[f'{stage}[:20]'])[0]
            })
        except AssertionError:
            dataset_dict = load_dataset(self.filetype, data_files=data_files)

    dataset_dict = dataset_dict.map(self._tokenize_fn_wrapped, batched=True)
    dataset_dict.set_format(columns=columns)
    return dataset_dict[stage]
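# Assumed wiring, for reference: ``_tokenize_fn_wrapped`` is presumably the
# tokenize hook with its column arguments pre-bound at construction time,
# along the lines of
#
#     self._tokenize_fn_wrapped = partial(self._tokenize_fn, input=self.input, target=self.target)
#
# so that ``map`` can call it with the batch alone.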
def load_data(
    self,
    data: Tuple[str, Union[str, List[str]], Union[str, List[str]]],
    dataset: Optional[Any] = None,
    columns: Union[List[str], Tuple[str]] = ("input_ids", "attention_mask", "labels"),
) -> Sequence[Mapping[str, Any]]:
    csv_file, input, target = data
    data_files = {}
    stage = self.running_stage.value
    data_files[stage] = str(csv_file)

    # FLASH_TESTING is set in the CI to run faster.
    if flash._IS_TESTING and not torch.cuda.is_available():
        try:
            # Load only the first 20 rows to keep CI fast.
            dataset_dict = DatasetDict({
                stage: load_dataset(self.filetype, data_files=data_files, split=[f'{stage}[:20]'])[0]
            })
        except Exception:
            dataset_dict = load_dataset(self.filetype, data_files=data_files)
    else:
        dataset_dict = load_dataset(self.filetype, data_files=data_files)

    if self.training:
        labels = sorted(set(dataset_dict[stage][target]))
        dataset.num_classes = len(labels)
        self.set_state(LabelsState(labels))

    labels = self.get_state(LabelsState)

    # Convert labels to ids.
    if labels is not None:
        labels = labels.labels
        label_to_class_mapping = {v: k for k, v in enumerate(labels)}
        dataset_dict = dataset_dict.map(partial(self._transform_label, label_to_class_mapping, target))

    dataset_dict = dataset_dict.map(partial(self._tokenize_fn, input=input), batched=True)

    # Hugging Face models expect the target column to be named ``labels``.
    if not self.predicting and target != "labels":
        dataset_dict.rename_column_(target, "labels")

    dataset_dict.set_format("torch", columns=columns)
    return dataset_dict[stage]
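# For illustration only: a ``_transform_label`` compatible with the
# ``partial(self._transform_label, label_to_class_mapping, target)`` call
# above. The positional layout is implied by that call; the body is an
# assumption.
def _transform_label(self, label_to_class_mapping: Dict[str, int], target: str, example: Dict[str, Any]) -> Dict[str, Any]:
    # ``map`` without ``batched=True`` passes one example dict at a time.
    example[target] = label_to_class_mapping[example[target]]
    return example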
def load_data(self, data: Any, columns: List[str] = None) -> "datasets.Dataset":
    if columns is None:
        columns = ["input_ids", "attention_mask", "labels"]
    if self.filetype == "json":
        file, input, target, field = data
    else:
        file, input, target = data
        field = None
    data_files = {}
    stage = self._running_stage.value
    data_files[stage] = str(file)

    # Forward the ``field`` kwarg only for json files with a nested payload.
    load_kwargs = {"data_files": data_files}
    if self.filetype == "json" and field is not None:
        load_kwargs["field"] = field

    # FLASH_TESTING is set in the CI to run faster.
    if flash._IS_TESTING:
        try:
            # Load only the first 20 rows to keep CI fast.
            dataset_dict = DatasetDict({
                stage: load_dataset(self.filetype, split=[f"{stage}[:20]"], **load_kwargs)[0]
            })
        except Exception:
            dataset_dict = load_dataset(self.filetype, **load_kwargs)
    else:
        dataset_dict = load_dataset(self.filetype, **load_kwargs)

    dataset_dict = dataset_dict.map(partial(self._tokenize_fn, input=input, target=target), batched=True)
    dataset_dict.set_format(columns=columns)
    return dataset_dict[stage]
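# Note on the ``field`` kwarg forwarded above: for a json payload such as
# {"version": ..., "data": [{"text": ...}, ...]}, calling
# ``load_dataset("json", data_files=data_files, field="data")`` reads only the
# records nested under "data".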
def load_data(
    self,
    filepath: str,
    dataset: AutoDataset,
    columns: Union[List[str], Tuple[str]] = ("input_ids", "attention_mask", "labels"),
    use_full: bool = True,
):
    data_files = {}
    stage = dataset.running_stage.value
    data_files[stage] = str(filepath)

    # FLASH_TESTING is set in the CI to run faster.
    if use_full and os.getenv("FLASH_TESTING", "0") == "0":
        dataset_dict = load_dataset(self.filetype, data_files=data_files)
    else:
        # used for debugging. Avoid processing the entire dataset  # noqa E265
        dataset_dict = DatasetDict({
            stage: load_dataset(self.filetype, data_files=data_files, split=[f'{stage}[:20]'])[0]
        })

    # Convert labels to ids before tokenizing.
    if not self.predicting:
        dataset_dict = dataset_dict.map(self._transform_label)

    dataset_dict = dataset_dict.map(self._tokenize_fn, batched=True)

    # Hugging Face models expect the target column to be named ``labels``.
    if not self.predicting and self.target != "labels":
        dataset_dict.rename_column_(self.target, "labels")

    dataset_dict.set_format("torch", columns=columns)

    if not self.predicting:
        dataset.num_classes = len(self.label_to_class_mapping)

    return dataset_dict[stage]
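# Compatibility note: ``rename_column_`` is the in-place variant from older
# ``datasets`` releases and has since been removed in favor of the returning
# ``rename_column``; on current versions the call above would need to become
# ``dataset_dict = dataset_dict.rename_column(self.target, "labels")``.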
def build_datasets(
    data_args: DataTrainingArguments,
    tokenizer: PreTrainedTokenizer,
    cache_dir=None,
    skip_train=False,
    skip_eval=False,
) -> Tuple[Dataset, Dataset]:
    if skip_eval and skip_train:
        logger.warning("Both `skip_train` and `skip_eval` are set to True")

    json_path = data_args.data_json
    data_dir = data_args.load_data_from
    add_line_breaks = data_args.add_line_breaks
    break_token = data_args.line_break_token
    train_data, eval_data = None, None
    dataset = DatasetDict()

    if add_line_breaks:
        tokenizer.add_special_tokens(dict(additional_special_tokens=[break_token]))

    if json_path is not None:
        logger.info("Preprocessing new dataset from {}".format(json_path))
        eval_split = data_args.eval_split
        save_dir = data_args.save_data_to
        dataset = load_dataset('json', data_files=[json_path], cache_dir=cache_dir)
        if eval_split < 1:
            # Hold out the tail of the train split for evaluation.
            dataset = dataset["train"].train_test_split(test_size=eval_split, shuffle=False)

        if save_dir is None:
            # Spend less time on preprocessing: drop splits we will not use.
            if skip_train:
                del dataset["train"]
            if skip_eval and "test" in dataset:
                del dataset["test"]

        if not data_args.skip_text_clean:
            normalize = partial(normalize_text, add_line_breaks=add_line_breaks, brk=break_token)
            dataset = dataset.map(normalize, input_columns='text')

        proc_kwargs = dict(
            batched=True,
            batch_size=data_args.tokenizer_batch_size,
            remove_columns=["text", "title"],
        )
        if "train" in dataset:
            proc_train = create_preprocess_fn(
                tokenizer, data_args.max_source_length, data_args.max_target_length)
            dataset["train"] = dataset["train"].map(proc_train, **proc_kwargs)
        if "test" in dataset:
            proc_eval = create_preprocess_fn(
                tokenizer, data_args.max_source_length, data_args.val_max_target_length)
            dataset["test"] = dataset["test"].map(proc_eval, **proc_kwargs)

        dataset.set_format(
            type="torch",
            columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
        )

        if save_dir is not None:
            if not os.path.exists(save_dir):
                os.mkdir(save_dir)
            logger.info("Saving preprocessed dataset to {}".format(save_dir))
            dataset.save_to_disk(save_dir)
    elif data_dir is not None:
        logger.info("Loading preprocessed dataset from {}".format(data_dir))
        if skip_train:
            eval_data = load_from_disk(os.path.join(data_dir, "test"))
        elif skip_eval:
            train_data = load_from_disk(os.path.join(data_dir, "train"))
        else:
            dataset = load_from_disk(data_dir)
    else:
        raise AttributeError("You must provide either `--data_json` or `--load_data_from` argument.")

    if "train" in dataset:
        train_data = dataset["train"]
    if "test" in dataset:
        eval_data = dataset["test"]
    return train_data, eval_data
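# Usage sketch (assumed context): preprocessing a raw json dump end to end.
# The checkpoint name and file paths are placeholders, and the keyword
# construction assumes ``DataTrainingArguments`` is a dataclass; only fields
# actually read by ``build_datasets`` are shown.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
data_args = DataTrainingArguments(
    data_json="data/corpus.json",       # raw input: triggers the preprocessing branch
    load_data_from=None,
    save_data_to="data/preprocessed",   # persist the tokenized DatasetDict
    eval_split=0.1,                     # hold out 10% as the "test" split
    add_line_breaks=False,
    line_break_token="<brk>",
    skip_text_clean=False,
    tokenizer_batch_size=1000,
    max_source_length=1024,
    max_target_length=128,
    val_max_target_length=128,
)
train_data, eval_data = build_datasets(data_args, tokenizer)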