Example #1
    def load_data(
        self,
        data: Any,
        columns: List[str] = ["input_ids", "attention_mask", "labels"]
    ) -> 'datasets.Dataset':
        file, input, target = data
        data_files = {}
        stage = self._running_stage.value
        data_files[stage] = str(file)

        # FLASH_TESTING is set in the CI to run faster.
        if flash._IS_TESTING:
            try:
                dataset_dict = DatasetDict({
                    stage:
                    load_dataset(self.filetype,
                                 data_files=data_files,
                                 split=[f'{stage}[:20]'])[0]
                })
            except Exception:
                dataset_dict = load_dataset(self.filetype,
                                            data_files=data_files)
        else:
            dataset_dict = load_dataset(self.filetype, data_files=data_files)

        dataset_dict = dataset_dict.map(partial(self._tokenize_fn,
                                                input=input,
                                                target=target),
                                        batched=True)
        dataset_dict.set_format(columns=columns)
        return dataset_dict[stage]
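
Example #1 boils down to a common Hugging Face `datasets` pattern: load the file into a `DatasetDict`, optionally restrict it to a tiny slice (`split=f'{stage}[:20]'`) so CI runs fast, tokenize with a batched `map`, and expose only the tensor columns. A minimal standalone sketch of that pattern; the CSV path, checkpoint name, and the `text` column are placeholder assumptions, not values taken from the example:

    from datasets import DatasetDict, load_dataset
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")  # placeholder checkpoint

    # Load only the first 20 rows of the CSV for a quick smoke test.
    dataset_dict = DatasetDict({
        "train": load_dataset("csv", data_files={"train": "data.csv"}, split="train[:20]")
    })

    def tokenize_fn(batch):
        # A batched map receives lists of values per column.
        return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

    dataset_dict = dataset_dict.map(tokenize_fn, batched=True)
    dataset_dict.set_format("torch", columns=["input_ids", "attention_mask"])
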
Example #2
    def load_data(
        self,
        file: str,
        use_full: bool = True,
        columns: List[str] = ["input_ids", "attention_mask", "labels"]
    ) -> 'datasets.Dataset':
        data_files = {}
        stage = self._running_stage.value
        data_files[stage] = str(file)

        # FLASH_TESTING is set in the CI to run faster.
        if use_full and os.getenv("FLASH_TESTING", "0") == "0":
            dataset_dict = load_dataset(self.filetype, data_files=data_files)
        else:
            # used for debugging. Avoid processing the entire dataset   # noqa E265
            try:
                dataset_dict = DatasetDict({
                    stage:
                    load_dataset(self.filetype,
                                 data_files=data_files,
                                 split=[f'{stage}[:20]'])[0]
                })
            except AssertionError:
                dataset_dict = load_dataset(self.filetype,
                                            data_files=data_files)

        dataset_dict = dataset_dict.map(self._tokenize_fn_wrapped,
                                        batched=True)
        dataset_dict.set_format(columns=columns)
        return dataset_dict[stage]
Example #3
    def load_data(
        self,
        data: Tuple[str, Union[str, List[str]], Union[str, List[str]]],
        dataset: Optional[Any] = None,
        columns: Union[List[str],
                       Tuple[str, ...]] = ("input_ids", "attention_mask", "labels"),
    ) -> Sequence[Mapping[str, Any]]:
        csv_file, input, target = data

        data_files = {}

        stage = self.running_stage.value
        data_files[stage] = str(csv_file)

        # FLASH_TESTING is set in the CI to run faster.
        if flash._IS_TESTING and not torch.cuda.is_available():
            try:
                dataset_dict = DatasetDict({
                    stage:
                    load_dataset(self.filetype,
                                 data_files=data_files,
                                 split=[f'{stage}[:20]'])[0]
                })
            except Exception:
                dataset_dict = load_dataset(self.filetype,
                                            data_files=data_files)
        else:
            dataset_dict = load_dataset(self.filetype, data_files=data_files)

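        # During training, derive the sorted label set from the target column,
        # expose num_classes on the dataset, and share the labels via LabelsState.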
        if self.training:
            labels = sorted(set(dataset_dict[stage][target]))
            dataset.num_classes = len(labels)
            self.set_state(LabelsState(labels))

        labels = self.get_state(LabelsState)

        # convert labels to ids
        if labels is not None:
            labels = labels.labels
            label_to_class_mapping = {v: k for k, v in enumerate(labels)}
            dataset_dict = dataset_dict.map(
                partial(self._transform_label, label_to_class_mapping, target))

        dataset_dict = dataset_dict.map(partial(self._tokenize_fn,
                                                input=input),
                                        batched=True)

        # Hugging Face models expect target to be named ``labels``.
        if not self.predicting and target != "labels":
            dataset_dict.rename_column_(target, "labels")

        dataset_dict.set_format("torch", columns=columns)

        return dataset_dict[stage]
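
The label handling in Example #3 reduces to building a string-to-index mapping, applying it with `map`, and renaming the target column to `labels`, which Hugging Face models expect. A small sketch of just that step in isolation; the column name and values are illustrative, not from the example:

    from datasets import Dataset

    ds = Dataset.from_dict({"text": ["good", "bad", "fine"], "sentiment": ["pos", "neg", "pos"]})

    # Build a stable string -> id mapping from the observed labels.
    labels = sorted(set(ds["sentiment"]))
    label_to_class = {label: idx for idx, label in enumerate(labels)}

    # Convert the string labels to ids, then rename the column to ``labels``.
    ds = ds.map(lambda row: {"sentiment": label_to_class[row["sentiment"]]})
    ds = ds.rename_column("sentiment", "labels")
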
Example #4
    def load_data(self,
                  data: Any,
                  columns: List[str] = None) -> "datasets.Dataset":
        if columns is None:
            columns = ["input_ids", "attention_mask", "labels"]
        if self.filetype == "json":
            file, input, target, field = data
        else:
            file, input, target = data
        data_files = {}
        stage = self._running_stage.value
        data_files[stage] = str(file)

        # FLASH_TESTING is set in the CI to run faster.
        if flash._IS_TESTING:
            try:
                if self.filetype == "json" and field is not None:
                    dataset_dict = DatasetDict({
                        stage:
                        load_dataset(self.filetype,
                                     data_files=data_files,
                                     split=[f"{stage}[:20]"],
                                     field=field)[0]
                    })
                else:
                    dataset_dict = DatasetDict({
                        stage:
                        load_dataset(self.filetype,
                                     data_files=data_files,
                                     split=[f"{stage}[:20]"])[0]
                    })
            except Exception:
                if self.filetype == "json" and field is not None:
                    dataset_dict = load_dataset(self.filetype,
                                                data_files=data_files,
                                                field=field)
                else:
                    dataset_dict = load_dataset(self.filetype,
                                                data_files=data_files)
        else:
            if self.filetype == "json" and field is not None:
                dataset_dict = load_dataset(self.filetype,
                                            data_files=data_files,
                                            field=field)
            else:
                dataset_dict = load_dataset(self.filetype,
                                            data_files=data_files)

        dataset_dict = dataset_dict.map(partial(self._tokenize_fn,
                                                input=input,
                                                target=target),
                                        batched=True)
        dataset_dict.set_format(columns=columns)
        return dataset_dict[stage]
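
Example #4 only adds the case where a JSON file nests its records under a top-level key, which `load_dataset` reaches through the `field` argument. A brief sketch of that call, assuming a placeholder file shaped like {"data": [...]}:

    from datasets import load_dataset

    # "data" is the top-level key that holds the list of records in train.json.
    dataset_dict = load_dataset("json", data_files={"train": "train.json"}, field="data")
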
Example #5
    def load_data(self,
                  filepath: str,
                  dataset: AutoDataset,
                  columns: Union[List[str],
                                 Tuple[str, ...]] = ("input_ids", "attention_mask",
                                                     "labels"),
                  use_full: bool = True):
        data_files = {}

        stage = dataset.running_stage.value
        data_files[stage] = str(filepath)

        # FLASH_TESTING is set in the CI to run faster.
        if use_full and os.getenv("FLASH_TESTING", "0") == "0":
            dataset_dict = load_dataset(self.filetype, data_files=data_files)
        else:
            # used for debugging. Avoid processing the entire dataset   # noqa E265
            dataset_dict = DatasetDict({
                stage:
                load_dataset(self.filetype,
                             data_files=data_files,
                             split=[f'{stage}[:20]'])[0]
            })

        dataset_dict = dataset_dict.map(self._tokenize_fn, batched=True)

        # convert labels to ids
        if not self.predicting:
            dataset_dict = dataset_dict.map(self._transform_label)

        # Hugging Face models expect target to be named ``labels``.
        if not self.predicting and self.target != "labels":
            dataset_dict.rename_column_(self.target, "labels")

        dataset_dict.set_format("torch", columns=columns)

        if not self.predicting:
            dataset.num_classes = len(self.label_to_class_mapping)

        return dataset_dict[stage]
def build_datasets(
        data_args: DataTrainingArguments,
        tokenizer: PreTrainedTokenizer,
        cache_dir=None,
        skip_train=False,
        skip_eval=False) -> Tuple[Dataset, Dataset]:
    if skip_eval and skip_train:
        logger.warning("Both `skip_train` and `skip_eval` are set to True")

    json_path = data_args.data_json
    data_dir = data_args.load_data_from
    add_line_breaks = data_args.add_line_breaks
    break_token = data_args.line_break_token
    train_data, eval_data = None, None
    dataset = DatasetDict()

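    # Register the line-break marker as an additional special token so the tokenizer keeps it intact.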
    if add_line_breaks:
        tokenizer.add_special_tokens(dict(additional_special_tokens=[break_token]))

    if json_path is not None:
        logger.info("Preprocessing new dataset from {}".format(json_path))
        eval_split = data_args.eval_split
        save_dir = data_args.save_data_to

        dataset = load_dataset('json', data_files=[json_path], cache_dir=cache_dir)
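        # Hold out the trailing fraction as a test split; shuffle=False preserves document order.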
        if eval_split < 1:
            dataset = dataset["train"].train_test_split(test_size=eval_split, shuffle=False)

        if save_dir is None:
            # Spend less time on preprocessing
            if skip_train:
                del dataset["train"]
            if skip_eval and "test" in dataset:
                del dataset["test"]

        if not data_args.skip_text_clean:
            normalize = partial(normalize_text, add_line_breaks=add_line_breaks, brk=break_token)
            dataset = dataset.map(normalize, input_columns='text')

        proc_kwargs = dict(
            batched=True,
            batch_size=data_args.tokenizer_batch_size,
            remove_columns=["text", "title"])

        if "train" in dataset:
            proc_train = create_preprocess_fn(
                tokenizer, data_args.max_source_length, data_args.max_target_length)
            dataset["train"] = dataset["train"].map(proc_train, **proc_kwargs)

        if "test" in dataset:
            proc_eval = create_preprocess_fn(
                tokenizer, data_args.max_source_length, data_args.val_max_target_length)
            dataset["test"] = dataset["test"].map(proc_eval, **proc_kwargs)

        dataset.set_format(type="torch",
                           columns=["input_ids", "attention_mask", "decoder_input_ids",
                                    "decoder_attention_mask", "labels"])

        if save_dir is not None:
            if not os.path.exists(save_dir):
                os.mkdir(save_dir)
            logger.info("Saving preprocessed dataset to {}".format(save_dir))
            dataset.save_to_disk(save_dir)

    elif data_dir is not None:
        logger.info("Loading preprocessed dataset from {}".format(data_dir))
        if skip_train:
            eval_data = load_from_disk(os.path.join(data_dir, "test"))
        elif skip_eval:
            train_data = load_from_disk(os.path.join(data_dir, "train"))
        else:
            dataset = load_from_disk(data_dir)
    else:
        raise AttributeError("You must provide either `--data_json` or `--load_data_from` argument.")

    if "train" in dataset:
        train_data = dataset["train"]
    if "test" in dataset:
        eval_data = dataset["test"]
    return train_data, eval_data
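
`build_datasets` either preprocesses a raw JSON file and saves it, or reloads a previously processed dataset, so it leans on the `datasets` save/load round trip. A short sketch of that round trip with placeholder paths and data:

    from datasets import Dataset, DatasetDict, load_from_disk

    dataset = DatasetDict({"train": Dataset.from_dict({"text": ["hello", "world"]})})

    # Persist the processed splits, then reload them later without re-running preprocessing.
    dataset.save_to_disk("processed")
    reloaded = load_from_disk("processed")            # the whole DatasetDict
    train_only = load_from_disk("processed/train")    # a single split, as in the skip_train/skip_eval paths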