Example #1
    def transform(self, dataset: Dataset) -> Dataset:
        """Transforms the dataset by applying the graph of operators to it. Requires the ``fit``
        method to have already been called, or calculated statistics to be loaded from disk

        This method returns a Dataset object, with the transformations lazily loaded. None
        of the actual computation will happen until the produced Dataset is consumed, or
        written out to disk.

        Parameters
        -----------
        dataset: Dataset

        Returns
        -------
        Dataset
        """
        self._clear_worker_cache()

        if not self.output_schema:
            self.fit_schema(dataset.schema)

        ddf = dataset.to_ddf(columns=self._input_columns())
        return Dataset(
            _transform_ddf(ddf, self.output_node, self.output_dtypes),
            client=self.client,
            cpu=dataset.cpu,
            base_dataset=dataset.base_dataset,
            schema=self.output_schema,
        )
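
The transform above is lazy: nothing is computed until the returned Dataset is consumed or written out. A minimal usage sketch, assuming the usual NVTabular entry points (nvt.Workflow, nvt.Dataset, Dataset.to_parquet); the column names and paths are placeholders:

import nvtabular as nvt
from nvtabular import ops

# hypothetical operator graph; the column names are placeholders
cat_features = ["user_id", "item_id"] >> ops.Categorify()
cont_features = ["price"] >> ops.Normalize()
workflow = nvt.Workflow(cat_features + cont_features)

train_dataset = nvt.Dataset("train_data/", engine="parquet")

workflow.fit(train_dataset)                      # computes statistics eagerly
transformed = workflow.transform(train_dataset)  # lazy: only builds the dask graph

# the actual computation happens here, when the Dataset is written to disk
transformed.to_parquet("train_transformed/")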
Example #2
def test_dataloader_epochs(datasets, engine, batch_size, epochs, on_ddf):
    dataset = Dataset(str(datasets["parquet"]), engine=engine)

    if on_ddf:
        dataset = dataset.to_ddf()

    cont_names = ["x", "y", "id"]
    cat_names = ["name-string", "name-cat"]
    label_name = ["label"]

    data_loader = DataLoader(
        dataset,
        cat_names=cat_names,
        cont_names=cont_names,
        batch_size=batch_size,
        label_names=label_name,
        shuffle=False,
    )

    # Convert to iterators and then to DataFrames
    df1 = _concat(list(data_loader._buff.itr))
    df2 = _concat(list(data_loader.epochs(epochs)._buff.itr))

    # Check that the DataFrame sizes and rows make sense
    assert len(df2) == epochs * len(df1)
    assert_eq(
        _concat([df1 for i in range(epochs)]).reset_index(drop=True),
        df2.reset_index(drop=True),
    )
Example #3
    def transform(self, dataset: Dataset) -> Dataset:
        """Transforms the dataset by applying the graph of operators to it. Requires the 'fit'
        method to have already been called, or calculated statistics to be loaded from disk

        This method returns a Dataset object, with the transformations lazily loaded. None
        of the actual computation will happen until the produced Dataset is consumed, or
        written out to disk.

        Parameters
        ----------
        dataset: Dataset

        Returns
        -------
        Dataset
        """
        self._clear_worker_cache()
        ddf = dataset.to_ddf(columns=self._input_columns())
        return Dataset(_transform_ddf(ddf, self.column_group), client=self.client)
Example #4
def test_dataloader_empty_error(datasets, engine, batch_size):
    dataset = Dataset(str(datasets["parquet"]), engine=engine)

    with pytest.raises(ValueError) as exc_info:
        DataLoader(
            dataset,
            batch_size=batch_size,
            label_names=["label"],
            shuffle=False,
        )
    assert "Neither Categorical or Continuous columns were found by the dataloader. " in str(
        exc_info.value)
Example #5
    def fit(self, dataset: Dataset):
        """Calculates statistics for this workflow on the input dataset

        Parameters
        ----------
        dataset: Dataset
            The input dataset to calculate statistics for. If there is a train/test split, this
            data should be the training dataset only.
        """
        self._clear_worker_cache()
        ddf = dataset.to_ddf(columns=self._input_columns())

        # Get a dictionary mapping all StatOperators we need to fit to a set of any dependent
        # StatOperators (having StatOperators that depend on the output of other StatOperators
        # means there will be multiple phases in the fit cycle here)
        stat_ops = {op: _get_stat_ops(op.parents) for op in _get_stat_ops([self.column_group])}

        while stat_ops:
            # get all the StatOperators that we can currently call fit on (no outstanding
            # dependencies)
            current_phase = [op for op, dependencies in stat_ops.items() if not dependencies]
            if not current_phase:
                # this shouldn't happen, but let's not infinite-loop just in case
                raise RuntimeError("failed to find dependency-free StatOperator to fit")

            stats, ops = [], []
            for column_group in current_phase:
                # apply transforms necessary for the inputs to the current column group, ignoring
                # the transforms from the statop itself
                transformed_ddf = _transform_ddf(ddf, column_group.parents)

                op = column_group.op
                try:
                    stats.append(op.fit(column_group.input_column_names, transformed_ddf))
                    ops.append(op)
                except Exception:
                    LOG.exception("Failed to fit operator %s", column_group.op)
                    raise

            if self.client:
                results = [r.result() for r in self.client.compute(stats)]
            else:
                results = dask.compute(stats, scheduler="synchronous")[0]

            for computed_stats, op in zip(results, ops):
                op.fit_finalize(computed_stats)

            # Remove all the operators we processed in this phase, and remove
            # from the dependencies of other ops too
            for stat_op in current_phase:
                stat_ops.pop(stat_op)
            for dependencies in stat_ops.values():
                dependencies.difference_update(current_phase)
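
The loop above is a phased topological schedule: every StatOperator whose dependencies are already satisfied is fit in the current phase, the finished phase is removed from the remaining dependency sets, and the process repeats. A stripped-down sketch of the same pattern with plain dictionaries (the function and task names here are hypothetical, not part of the library):

def run_in_phases(dependencies):
    # `dependencies` maps each task to the set of tasks it depends on,
    # mirroring the `stat_ops` dict built in `fit` above
    remaining = {task: set(deps) for task, deps in dependencies.items()}
    while remaining:
        # everything with no outstanding dependencies can run in this phase
        phase = [task for task, deps in remaining.items() if not deps]
        if not phase:
            raise RuntimeError("dependency cycle: no runnable task found")

        for task in phase:
            print("fitting", task)  # stand-in for op.fit / op.fit_finalize

        # drop the finished phase and clear it from the other tasks' dependencies
        for task in phase:
            remaining.pop(task)
        for deps in remaining.values():
            deps.difference_update(phase)


# "target_encode" can only be fit after "bucketize" has been fit
run_in_phases({"categorify": set(), "bucketize": set(), "target_encode": {"bucketize"}})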
Example #6
def test_shuffling():
    num_rows = 10000
    batch_size = 10000

    df = pd.DataFrame({"a": np.asarray(range(num_rows)), "b": np.asarray([0] * num_rows)})

    train_dataset = tf_dataloader.KerasSequenceLoader(
        Dataset(df), cont_names=["a"], label_names=["b"], batch_size=batch_size, shuffle=True
    )

    batch = next(iter(train_dataset))

    first_batch = tf.reshape(tf.cast(batch[0]["a"].cpu(), tf.int32), (batch_size,))
    in_order = tf.range(0, batch_size, dtype=tf.int32)

    assert (first_batch != in_order).numpy().any()
    assert (tf.sort(first_batch) == in_order).numpy().all()
Example #7
def test_nested_list():
    num_rows = 100
    batch_size = 12

    df = pd.DataFrame(
        {
            "data": [
                np.random.rand(np.random.randint(10) + 1, 3).tolist() for i in range(num_rows)
            ],
            "data2": [np.random.rand(np.random.randint(10) + 1).tolist() for i in range(num_rows)],
            "label": [np.random.rand() for i in range(num_rows)],
        }
    )

    train_dataset = tf_dataloader.KerasSequenceLoader(
        Dataset(df),
        cont_names=["data", "data2"],
        label_names=["label"],
        batch_size=batch_size,
        shuffle=False,
    )

    batch = next(iter(train_dataset))
    # "data" is a nested list column (each row holds a variable number of 3-element vectors);
    # the dataloader returns it as a (values, row_lengths) pair, so rebuild a dense tensor
    nested_data_col = tf.RaggedTensor.from_row_lengths(
        batch[0]["data"][0][:, 0], tf.cast(batch[0]["data"][1][:, 0], tf.int32)
    ).to_tensor()
    true_data_col = tf.reshape(
        tf.ragged.constant(df.iloc[:batch_size, 0].tolist()).to_tensor(), [batch_size, -1]
    )
    # "data2" is a flat variable-length (multi-hot) list column, same (values, row_lengths) layout
    multihot_data2_col = tf.RaggedTensor.from_row_lengths(
        batch[0]["data2"][0][:, 0], tf.cast(batch[0]["data2"][1][:, 0], tf.int32)
    ).to_tensor()
    true_data2_col = tf.reshape(
        tf.ragged.constant(df.iloc[:batch_size, 1].tolist()).to_tensor(), [batch_size, -1]
    )
    assert nested_data_col.shape == true_data_col.shape
    assert np.allclose(nested_data_col.numpy(), true_data_col.numpy())
    assert multihot_data2_col.shape == true_data2_col.shape
    assert np.allclose(multihot_data2_col.numpy(), true_data2_col.numpy())
Example #8
def test_shuffling():
    num_rows = 10000
    batch_size = 10000

    df = pd.DataFrame({"a": np.asarray(range(num_rows)), "b": np.asarray([0] * num_rows)})

    train_dataset = torch_dataloader.TorchAsyncItr(
        Dataset(df), conts=["a"], labels=["b"], batch_size=batch_size, shuffle=True
    )

    batch = next(iter(train_dataset))

    first_batch = batch[0]["a"].cpu()
    in_order = torch.arange(0, batch_size)

    assert (first_batch != in_order).any()
    assert (torch.sort(first_batch).values == in_order).all()
Example #9
def _validate_dataset(paths_or_dataset, batch_size, buffer_size, engine, reader_kwargs):
    # TODO: put this in parent class and allow
    # torch dataset to leverage as well?

    # if a dataset was passed, just return it
    if isinstance(paths_or_dataset, Dataset):
        return paths_or_dataset

    # otherwise initialize a dataset
    # from paths or glob pattern
    if isinstance(paths_or_dataset, str):
        files = tf.io.gfile.glob(paths_or_dataset)
        _is_empty_msg = "Couldn't find file pattern {} in directory {}".format(
            *os.path.split(paths_or_dataset)
        )
    else:
        # TODO: some checking around attribute
        # error here?
        files = list(paths_or_dataset)
        _is_empty_msg = "paths_or_dataset list must contain at least one filename"

    assert isinstance(files, list)
    if len(files) == 0:
        raise ValueError(_is_empty_msg)

    # implement buffer size logic
    # TODO: IMPORTANT
    # should we divide everything by 3 to account
    # for extra copies lying around due to asynchronicity?
    reader_kwargs = reader_kwargs or {}
    if buffer_size >= 1:
        if buffer_size < batch_size:
            reader_kwargs["batch_size"] = int(batch_size * buffer_size)
        else:
            reader_kwargs["batch_size"] = buffer_size
    else:
        reader_kwargs["part_mem_fraction"] = buffer_size
    return Dataset(files, engine=engine, **reader_kwargs)
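
The buffer-size branch above folds three conventions into a single argument: fractional values become a partition-memory fraction, values in [1, batch_size) act as a multiplier on batch_size, and anything larger is taken as an absolute row count. A standalone restatement of that mapping (a hypothetical helper, shown only for illustration):

def _interpret_buffer_size(buffer_size, batch_size):
    # mirrors the reader_kwargs logic in _validate_dataset above
    reader_kwargs = {}
    if buffer_size >= 1:
        if buffer_size < batch_size:
            # values in [1, batch_size) act as a multiplier on batch_size
            reader_kwargs["batch_size"] = int(batch_size * buffer_size)
        else:
            # values >= batch_size are an absolute row count
            reader_kwargs["batch_size"] = buffer_size
    else:
        # fractional values are a fraction of the partition memory budget
        reader_kwargs["part_mem_fraction"] = buffer_size
    return reader_kwargs


assert _interpret_buffer_size(0.25, 65536) == {"part_mem_fraction": 0.25}
assert _interpret_buffer_size(4, 65536) == {"batch_size": 262144}
assert _interpret_buffer_size(100000, 65536) == {"batch_size": 100000}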
Example #10
    def fit(self, dataset: Dataset) -> "Workflow":
        """Calculates statistics for this workflow on the input dataset

        Parameters
        ----------
        dataset: Dataset
            The input dataset to calculate statistics for. If there is a train/test split, this
            data should be the training dataset only.

        Returns
        -------
        Workflow
            This Workflow instance, fitted on the input dataset
        """
        self._clear_worker_cache()

        if not self.output_schema:
            self.fit_schema(dataset.schema)

        ddf = dataset.to_ddf(columns=self._input_columns())

        # Get a dictionary mapping all StatOperators we need to fit to a set of any dependent
        # StatOperators (having StatOperators that depend on the output of other StatOperators
        # means there will be multiple phases in the fit cycle here)
        stat_ops = {
            op: _get_stat_ops(op.parents_with_dependencies)
            for op in _get_stat_ops([self.output_node])
        }

        while stat_ops:
            # get all the StatOperators that we can currently call fit on (no outstanding
            # dependencies)
            current_phase = [op for op, dependencies in stat_ops.items() if not dependencies]
            if not current_phase:
                # this shouldn't happen, but let's not infinite-loop just in case
                raise RuntimeError("failed to find dependency-free StatOperator to fit")

            stats, ops = [], []
            for workflow_node in current_phase:
                # Check for additional input columns that aren't generated by parents
                addl_input_cols = set()
                if workflow_node.parents:
                    upstream_output_cols = sum(
                        [
                            upstream.output_columns
                            for upstream in workflow_node.parents_with_dependencies
                        ],
                        nvtabular.ColumnSelector(),
                    )
                    addl_input_cols = set(workflow_node.input_columns.names) - set(
                        upstream_output_cols.names
                    )

                # apply transforms necessary for the inputs to the current column group, ignoring
                # the transforms from the statop itself
                transformed_ddf = _ensure_optimize_dataframe_graph(
                    ddf=_transform_ddf(
                        ddf,
                        workflow_node.parents_with_dependencies,
                        additional_columns=addl_input_cols,
                    )
                )

                op = workflow_node.op
                try:
                    stats.append(op.fit(workflow_node.input_columns, transformed_ddf))
                    ops.append(op)
                except Exception:
                    LOG.exception("Failed to fit operator %s", workflow_node.op)
                    raise

            if self.client:
                results = [r.result() for r in self.client.compute(stats)]
            else:
                results = dask.compute(stats, scheduler="synchronous")[0]

            for computed_stats, op in zip(results, ops):
                op.fit_finalize(computed_stats)

            # Remove all the operators we processed in this phase, and remove
            # from the dependencies of other ops too
            for stat_op in current_phase:
                stat_ops.pop(stat_op)
            for dependencies in stat_ops.values():
                dependencies.difference_update(current_phase)

        # hack: store input/output dtypes here. We should have complete dtype
        # information for each operator (like we do for column names), but as
        # an interim solution this gets us what we need.
        input_dtypes = dataset.to_ddf()[self._input_columns()].dtypes
        self.input_dtypes = dict(zip(input_dtypes.index, input_dtypes))
        output_dtypes = self.transform(dataset).sample_dtypes()
        self.output_dtypes = dict(zip(output_dtypes.index, output_dtypes))

        self._zero_output_schemas()
        self.fit_schema(dataset.schema)
        return self
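
As the docstring notes, fit should only ever see the training split; the fitted workflow is then reused to transform every split and can be persisted with its computed statistics. A minimal sketch, assuming standard NVTabular usage (including Workflow.save for persisting a fitted workflow); paths and column names are placeholders:

import nvtabular as nvt

features = ["user_id", "item_id"] >> nvt.ops.Categorify()
workflow = nvt.Workflow(features)

train_dataset = nvt.Dataset("train_data/", engine="parquet")
valid_dataset = nvt.Dataset("valid_data/", engine="parquet")

# fit statistics on the training split only
workflow.fit(train_dataset)

# apply the same fitted statistics to both splits
workflow.transform(train_dataset).to_parquet("train_out/")
workflow.transform(valid_dataset).to_parquet("valid_out/")

# persist the fitted workflow (statistics included) for later reuse
workflow.save("workflow_dir/")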
Example #11
def test_dataloader_seeding(datasets, engine, batch_size):
    cont_names = ["x", "y", "id"]
    cat_names = ["name-string", "name-cat"]
    label_name = ["label"]

    dataset = Dataset(str(datasets["parquet"]), engine=engine)

    # Define a seed function that returns the same seed on all workers
    seed_fragments = []

    def seed_fn():
        # Capturing the next random number generated allows us to check
        # that different workers have different random states when this
        # function is called
        next_rand = _generate_local_seed(0, 1)
        seed_fragments.append(next_rand)

        # But since we don't actually want to run two data loaders in
        # parallel in this test, we'll cheat and return a static seed
        # instead of combining the fragments into a new seed
        return 5678

    # Set up two dataloaders with different global ranks
    data_loader_0 = DataLoader(
        dataset,
        cat_names=cat_names,
        cont_names=cont_names,
        batch_size=batch_size,
        label_names=label_name,
        shuffle=False,
        global_size=2,
        global_rank=0,
        seed_fn=seed_fn,
    )

    data_loader_1 = DataLoader(
        dataset,
        cat_names=cat_names,
        cont_names=cont_names,
        batch_size=batch_size,
        label_names=label_name,
        shuffle=False,
        global_size=2,
        global_rank=1,
        seed_fn=seed_fn,
    )

    # Starting from the same random state, run a shuffle on each worker
    # and capture the results
    np.random.seed(1234)

    data_loader_0._shuffle_indices()

    dl0_rng_state = _get_random_state()
    dl0_next_rand = dl0_rng_state.tomaxint(size=1)
    dl0_indices = data_loader_0.indices

    np.random.seed(1234)

    data_loader_1._shuffle_indices()

    dl1_next_rand = _generate_local_seed(0, 1)
    dl1_indices = data_loader_1.indices

    # Test that the seed function actually gets called in each data loader
    assert len(seed_fragments) == 2

    # Test that each data loader had different random state
    # when seed_fn was called
    assert seed_fragments[0] != seed_fragments[1]

    # Test that the shuffle has the same result on both workers
    # (i.e. the random seeds are the same when the shuffle happens)
    for idx, element in enumerate(dl0_indices):
        assert element == dl1_indices[idx]

    # Test that after the shuffle each worker generates different random numbers
    # (i.e. the random seeds are different on each worker after the shuffle)
    assert dl0_next_rand != dl1_next_rand
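
The seed_fn in this test deliberately returns a constant rather than combining the captured fragments. In a real multi-worker run the fragments would be combined so that every worker derives the same shuffle seed; a sketch of one way to do that, assuming Horovod coordinates the workers (the allreduce call and dtypes here are an assumption, not something this test uses):

import numpy as np
import tensorflow as tf
import horovod.tensorflow as hvd  # assumption: hvd.init() has already been called


def seed_fn():
    # each worker draws its own random fragment from its local RNG state...
    max_rand = np.iinfo(np.int32).max // hvd.size()
    seed_fragment = np.random.randint(0, max_rand)

    # ...and the fragments are summed across workers, so every worker ends up
    # with the same combined value and shuffles its data identically
    reduced_seed = hvd.allreduce(tf.constant(seed_fragment, dtype=tf.int64), op=hvd.Sum)
    return int(reduced_seed) % max_rand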