Example No. 1
from flash.data.auto_dataset import AutoDataset
from flash.data.data_source import DataSource
from pytorch_lightning.trainer.states import RunningStage


def test_autodataset_smoke():
    num_samples = 20
    dt = range(num_samples)
    ds = DataSource()

    dset = AutoDataset(data=dt,
                       data_source=ds,
                       running_stage=RunningStage.TRAINING)
    assert dset is not None
    assert dset.running_stage == RunningStage.TRAINING

    # check the members
    assert dset.data == dt
    assert dset.data_source == ds

    # test setting the running stage
    dset.running_stage = RunningStage.PREDICTING
    assert dset.running_stage == RunningStage.PREDICTING

    # check the methods
    assert dset.load_sample is not None
    assert dset.load_sample == ds.load_sample

    # check getters
    assert len(dset) == num_samples
    assert dset[0] == 0
    assert dset[9] == 9
    assert dset[11] == 11
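The base ``DataSource`` leaves ``load_sample`` as an identity mapping, which is why the indexing assertions hold. A minimal sketch (an assumption for illustration, not flash's actual implementation) of how ``AutoDataset.__getitem__`` could delegate to the data source:

class SketchAutoDataset:
    def __init__(self, data, data_source, running_stage):
        self.data = data
        self.data_source = data_source
        self.running_stage = running_stage
        # the per-sample hook is resolved from the data source
        self.load_sample = data_source.load_sample

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # with an identity load_sample, dset[9] simply returns data[9] == 9
        return self.load_sample(self.data[index])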
Example No. 2
    def generate_dataset(
        self,
        data: Optional[DATA_TYPE],
        running_stage: RunningStage,
    ) -> Optional[Union[AutoDataset, IterableAutoDataset]]:
        is_none = data is None

        if isinstance(data, Sequence):
            is_none = data[0] is None

        if not is_none:
            from flash.data.data_pipeline import DataPipeline

            mock_dataset = typing.cast(AutoDataset, MockDataset())
            with CurrentRunningStageFuncContext(running_stage, "load_data", self):
                load_data: Callable[[DATA_TYPE, Optional[Any]], Any] = getattr(
                    self, DataPipeline._resolve_function_hierarchy(
                        "load_data",
                        self,
                        running_stage,
                        DataSource,
                    )
                )
                parameters = signature(load_data).parameters
                if len(parameters) > 1 and "dataset" in parameters:  # TODO: This was DATASET_KEY before
                    data = load_data(data, mock_dataset)
                else:
                    data = load_data(data)

            if has_len(data):
                dataset = AutoDataset(data, self, running_stage)
            else:
                dataset = IterableAutoDataset(data, self, running_stage)
            dataset.__dict__.update(mock_dataset.metadata)
            return dataset
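Here ``mock_dataset`` stands in for the real dataset while ``load_data`` runs, so attributes set inside the hook (e.g. ``dataset.num_classes``) are captured and later copied over via ``dataset.__dict__.update(mock_dataset.metadata)``. A minimal sketch of such a recorder, assuming the real ``MockDataset`` behaves along these lines:

class SketchMockDataset:
    """Records every attribute set on it into ``metadata``."""

    def __init__(self):
        self.metadata = {}

    def __setattr__(self, key, value):
        # remember everything except the metadata store itself
        if key != "metadata":
            self.metadata[key] = value
        object.__setattr__(self, key, value)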
Example No. 3
    @classmethod
    def autogenerate_dataset(
        cls,
        data: Any,
        running_stage: RunningStage,
        whole_data_load_fn: Optional[Callable] = None,
        per_sample_load_fn: Optional[Callable] = None,
        data_pipeline: Optional[DataPipeline] = None,
    ) -> AutoDataset:
        """
        This function is used to generate an ``AutoDataset`` from a ``DataPipeline`` if provided
        or from the provided ``whole_data_load_fn``, ``per_sample_load_fn`` functions directly
        """

        if whole_data_load_fn is None:
            whole_data_load_fn = getattr(
                cls.preprocess_cls,
                DataPipeline._resolve_function_hierarchy(
                    'load_data', cls.preprocess_cls, running_stage, Preprocess
                ),
            )

        if per_sample_load_fn is None:
            per_sample_load_fn = getattr(
                cls.preprocess_cls,
                DataPipeline._resolve_function_hierarchy(
                    'load_sample', cls.preprocess_cls, running_stage, Preprocess
                ),
            )
        return AutoDataset(
            data,
            whole_data_load_fn,
            per_sample_load_fn,
            data_pipeline,
            running_stage=running_stage,
        )
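Both lookups go through ``DataPipeline._resolve_function_hierarchy``, which picks a stage-specific override when one exists. A rough sketch of that idea, assuming it prefers prefixed hooks such as ``train_load_data`` over the generic ``load_data``:

from pytorch_lightning.trainer.states import RunningStage

_STAGE_PREFIXES = {
    RunningStage.TRAINING: "train",
    RunningStage.VALIDATING: "val",
    RunningStage.TESTING: "test",
    RunningStage.PREDICTING: "predict",
}

def sketch_resolve_function_hierarchy(function_name, obj, running_stage):
    # prefer ``train_load_data`` over ``load_data`` during TRAINING, etc.
    prefixed = f"{_STAGE_PREFIXES[running_stage]}_{function_name}"
    return prefixed if hasattr(obj, prefixed) else function_name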
Example No. 4
@pytest.mark.parametrize("with_dataset", [True, False])
@pytest.mark.parametrize("with_running_stage", [True, False])
def test_autodataset_with_functions(
    with_dataset: bool,
    with_running_stage: bool,
):

    functions = _AutoDatasetTestPreprocess(with_dataset)

    load_sample_func = functions.load_sample
    load_data_func = functions.load_data

    if with_running_stage:
        running_stage = RunningStage.TRAINING
    else:
        running_stage = None
    dset = AutoDataset(
        range(10),
        load_data=load_data_func,
        load_sample=load_sample_func,
        running_stage=running_stage,
    )

    assert len(dset) == 10

    # index every sample once so the per-sample hook runs for each element
    for idx in range(len(dset)):
        dset[idx]

    if with_dataset:
        assert dset.load_sample_was_called
        assert dset.load_data_was_called
        assert functions.load_sample_with_dataset_count == len(dset)
        assert functions.load_data_with_dataset_count == 1
    else:
        assert functions.load_data_count == 1
        assert functions.load_sample_count == len(dset)
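``_AutoDatasetTestPreprocess`` is the test helper providing the hooks. A minimal sketch of what it must offer for the assertions above to hold (the real helper in the flash test suite may differ):

class SketchAutoDatasetTestPreprocess:
    def __init__(self, with_dataset):
        self.load_data_count = 0
        self.load_sample_count = 0
        self.load_data_with_dataset_count = 0
        self.load_sample_with_dataset_count = 0
        if with_dataset:
            # shadow the plain hooks with dataset-aware variants
            self.load_data = self._load_data_with_dataset
            self.load_sample = self._load_sample_with_dataset

    def load_data(self, data):
        self.load_data_count += 1
        return data

    def load_sample(self, sample):
        self.load_sample_count += 1
        return sample

    def _load_data_with_dataset(self, data, dataset):
        self.load_data_with_dataset_count += 1
        dataset.load_data_was_called = True
        return data

    def _load_sample_with_dataset(self, sample, dataset):
        self.load_sample_with_dataset_count += 1
        dataset.load_sample_was_called = True
        return sample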
Example No. 5
    def _generate_auto_dataset(
        self,
        data: Union[Iterable, Any],
        running_stage: Optional[RunningStage] = None,
    ) -> AutoDataset:
        return AutoDataset(data=data, data_pipeline=self, running_stage=running_stage)
Example No. 6
    def load_data(self, metadata: Any, dataset: AutoDataset) -> CustomCOCODataset:
        # extract the folder, COCO annotation file and the transform to apply to the images
        folder, ann_file, transform = metadata
        ds = CustomCOCODataset(folder, ann_file, transform)
        if self.training:
            dataset.num_classes = ds.num_classes
            ds = _coco_remove_images_without_annotations(ds)
        return ds
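Since ``dataset.num_classes`` is set inside ``load_data``, it is captured as metadata (see Example No. 2) and ends up on the final dataset. A hypothetical call, with made-up paths:

ds = preprocess.load_data(
    ("data/coco128/images", "data/coco128/annotations.json", None),  # hypothetical paths
    dataset,
)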
Example No. 7
    def _generate_auto_dataset(
        self,
        data: Union[Iterable, Any],
        running_stage: Optional[RunningStage] = None,
        use_iterable_auto_dataset: bool = False,
    ) -> Union[AutoDataset, IterableAutoDataset]:
        if use_iterable_auto_dataset:
            return IterableAutoDataset(data, data_pipeline=self, running_stage=running_stage)
        return AutoDataset(data=data, data_pipeline=self, running_stage=running_stage)
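Hypothetical usage (the ``pipeline`` instance is assumed): a list keeps the map-style ``AutoDataset``, while a generator, which has no ``__len__``, calls for the iterable variant:

pipeline = DataPipeline()
map_style = pipeline._generate_auto_dataset(list(range(100)), RunningStage.TRAINING)
stream = (x for x in range(100))
iterable = pipeline._generate_auto_dataset(
    stream, RunningStage.PREDICTING, use_iterable_auto_dataset=True
)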
Example No. 8
def test_autodataset_warning():
    with pytest.warns(
        UserWarning,
        match="``datapipeline`` is specified but load_sample and/or load_data are also specified",
    ):
        AutoDataset(
            range(10),
            load_data=lambda x: x,
            load_sample=lambda x: x,
            data_pipeline=DataPipeline(),
        )
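A sketch of the guard that could raise this warning, assuming ``AutoDataset.__init__`` checks the argument combination:

import warnings

def _warn_on_conflicting_args(load_data, load_sample, data_pipeline):
    # explicit hooks win over the pipeline, so flag the ambiguity
    if data_pipeline is not None and (load_data is not None or load_sample is not None):
        warnings.warn(
            "``datapipeline`` is specified but load_sample and/or load_data are also specified",
            UserWarning,
        )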
Example No. 9
    def common_load_data(self, df: DataFrame, dataset: AutoDataset):
        # impute missing data and compute the train dataset statistics
        dfs = _pre_transform([df], self.num_cols, self.cat_cols, self.codes, self.mean, self.std, self.target_col,
                             self.target_codes)

        df = dfs[0]

        dataset.num_samples = len(df)
        cat_vars = _to_cat_vars_numpy(df, self.cat_cols)
        num_vars = _to_num_vars_numpy(df, self.num_cols)

        # ``self`` has no length here; fall back to an empty (num_samples, 0) array
        cat_vars = np.stack(cat_vars, 1) if len(cat_vars) else np.zeros((len(df), 0))
        num_vars = np.stack(num_vars, 1) if len(num_vars) else np.zeros((len(df), 0))
        return df, cat_vars, num_vars
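A tiny illustration of the stacking above: per-column arrays become a ``(num_samples, num_features)`` matrix, and the fallback keeps a well-shaped empty array when a column group is absent:

import numpy as np

cols = [np.array([1, 2, 3]), np.array([4, 5, 6])]
assert np.stack(cols, 1).shape == (3, 2)  # 3 samples, 2 columns
assert np.zeros((3, 0)).shape == (3, 0)   # no columns: empty but well-shaped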
Example No. 10
    def generate_dataset(
        self,
        data: Optional[DATA_TYPE],
        running_stage: RunningStage,
    ) -> Optional[Union[AutoDataset, IterableAutoDataset]]:
        """Generate a single dataset with the given input to :meth:`~flash.data.data_source.DataSource.load_data` for
        the given ``running_stage``.

        Args:
            data: The input to :meth:`~flash.data.data_source.DataSource.load_data` to use to create the dataset.
            running_stage: The running_stage for this dataset.

        Returns:
            The constructed :class:`~flash.data.auto_dataset.BaseAutoDataset`.
        """
        is_none = data is None

        if isinstance(data, Sequence):
            is_none = data[0] is None

        if not is_none:
            from flash.data.data_pipeline import DataPipeline

            mock_dataset = typing.cast(AutoDataset, MockDataset())
            with CurrentRunningStageFuncContext(running_stage, "load_data",
                                                self):
                load_data: Callable[[DATA_TYPE, Optional[Any]], Any] = getattr(
                    self,
                    DataPipeline._resolve_function_hierarchy(
                        "load_data",
                        self,
                        running_stage,
                        DataSource,
                    ))
                parameters = signature(load_data).parameters
                if len(
                        parameters
                ) > 1 and "dataset" in parameters:  # TODO: This was DATASET_KEY before
                    data = load_data(data, mock_dataset)
                else:
                    data = load_data(data)

            if has_len(data):
                dataset = AutoDataset(data, self, running_stage)
            else:
                dataset = IterableAutoDataset(data, self, running_stage)
            dataset.__dict__.update(mock_dataset.metadata)
            return dataset
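The ``has_len`` check decides between the map-style and iterable datasets. A plausible sketch of such a helper (the real one may differ):

def sketch_has_len(data):
    # map-style data exposes __len__; raw iterables and generators do not
    try:
        len(data)
        return True
    except TypeError:
        return False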
Example No. 11
    def load_data(
        self,
        filepath: str,
        dataset: AutoDataset,
        columns: Union[List[str], Tuple[str]] = ("input_ids", "attention_mask", "labels"),
        use_full: bool = True,
    ):
        data_files = {}

        stage = dataset.running_stage.value
        data_files[stage] = str(filepath)

        # FLASH_TESTING is set in the CI to run faster.
        if use_full and os.getenv("FLASH_TESTING", "0") == "0":
            dataset_dict = load_dataset(self.filetype, data_files=data_files)
        else:
            # used for debugging; avoids processing the entire dataset
            dataset_dict = DatasetDict({
                stage: load_dataset(self.filetype, data_files=data_files, split=[f'{stage}[:20]'])[0]
            })

        dataset_dict = dataset_dict.map(self._tokenize_fn, batched=True)

        # convert labels to ids
        if not self.predicting:
            dataset_dict = dataset_dict.map(self._transform_label)

        # Hugging Face models expect target to be named ``labels``.
        if not self.predicting and self.target != "labels":
            dataset_dict.rename_column_(self.target, "labels")

        dataset_dict.set_format("torch", columns=columns)

        if not self.predicting:
            dataset.num_classes = len(self.label_to_class_mapping)

        return dataset_dict[stage]
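The ``split=[f'{stage}[:20]']`` argument uses Hugging Face datasets' slicing syntax to load only the first 20 rows. A standalone illustration with a hypothetical CSV file:

from datasets import load_dataset

# "train[:20]" asks for the first 20 rows of the train split; passing a list
# of split expressions makes load_dataset return a list of datasets.
subset = load_dataset("csv", data_files={"train": "train.csv"}, split=["train[:20]"])[0]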
Example No. 12
    def load_data(self, data: Tuple[ND, ND], dataset: AutoDataset) -> List[Tuple[ND, float]]:
        if self.training:
            dataset.num_inputs = data[0].shape[1]
        return [(x, y) for x, y in zip(*data)]
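A tiny illustration of the final comprehension, pairing each input row with its target (data made up for illustration):

import numpy as np

x = np.arange(6).reshape(3, 2)  # 3 samples, 2 input features
y = np.array([0.1, 0.2, 0.3])   # 3 regression targets
samples = [(xi, yi) for xi, yi in zip(x, y)]
assert len(samples) == 3
assert samples[0][1] == 0.1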