def transform(self, dataset: Dataset) -> Dataset:
    """Transforms the dataset by applying the graph of operators to it. Requires the ``fit``
    method to have already been called, or calculated statistics to be loaded from disk.

    This method returns a Dataset object, with the transformations lazily loaded. None
    of the actual computation will happen until the produced Dataset is consumed, or
    written out to disk.

    Parameters
    ----------
    dataset: Dataset

    Returns
    -------
    Dataset
    """
    self._clear_worker_cache()

    if not self.output_schema:
        self.fit_schema(dataset.schema)

    ddf = dataset.to_ddf(columns=self._input_columns())
    return Dataset(
        _transform_ddf(ddf, self.output_node, self.output_dtypes),
        client=self.client,
        cpu=dataset.cpu,
        base_dataset=dataset.base_dataset,
        schema=self.output_schema,
    )
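# Hedged usage sketch (not part of the library source): how the lazy ``transform`` above is
# typically paired with ``fit``. The operator choice, column names, and file paths are
# assumptions for illustration only; the key point is that nothing is computed until the
# transformed Dataset is written out or otherwise consumed.
def example_fit_then_transform():
    import nvtabular as nvt

    # assumed columns/operator, purely illustrative
    features = ["name-cat"] >> nvt.ops.Categorify()
    workflow = nvt.Workflow(features + ["label"])

    train = nvt.Dataset("train.parquet")      # hypothetical path
    workflow.fit(train)                       # statistics are computed eagerly here
    transformed = workflow.transform(train)   # lazy: only builds the dask graph
    transformed.to_parquet("train_out/")      # computation happens at write time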
def test_dataloader_epochs(datasets, engine, batch_size, epochs, on_ddf):
    dataset = Dataset(str(datasets["parquet"]), engine=engine)

    if on_ddf:
        dataset = dataset.to_ddf()

    cont_names = ["x", "y", "id"]
    cat_names = ["name-string", "name-cat"]
    label_name = ["label"]

    data_loader = DataLoader(
        dataset,
        cat_names=cat_names,
        cont_names=cont_names,
        batch_size=batch_size,
        label_names=label_name,
        shuffle=False,
    )

    # Convert to iterators and then to DataFrames
    df1 = _concat(list(data_loader._buff.itr))
    df2 = _concat(list(data_loader.epochs(epochs)._buff.itr))

    # Check that the DataFrame sizes and rows make sense
    assert len(df2) == epochs * len(df1)
    assert_eq(
        _concat([df1 for i in range(epochs)]).reset_index(drop=True),
        df2.reset_index(drop=True),
    )
def transform(self, dataset: Dataset) -> Dataset:
    """Transforms the dataset by applying the graph of operators to it. Requires the ``fit``
    method to have already been called, or calculated statistics to be loaded from disk.

    This method returns a Dataset object, with the transformations lazily loaded. None
    of the actual computation will happen until the produced Dataset is consumed, or
    written out to disk.

    Parameters
    ----------
    dataset: Dataset

    Returns
    -------
    Dataset
    """
    self._clear_worker_cache()
    ddf = dataset.to_ddf(columns=self._input_columns())
    return Dataset(_transform_ddf(ddf, self.column_group), client=self.client)
def test_dataloader_empty_error(datasets, engine, batch_size):
    dataset = Dataset(str(datasets["parquet"]), engine=engine)

    with pytest.raises(ValueError) as exc_info:
        DataLoader(
            dataset,
            batch_size=batch_size,
            label_names=["label"],
            shuffle=False,
        )
    assert "Neither Categorical or Continuous columns were found by the dataloader. " in str(
        exc_info.value
    )
def fit(self, dataset: Dataset):
    """Calculates statistics for this workflow on the input dataset

    Parameters
    ----------
    dataset: Dataset
        The input dataset to calculate statistics for. If there is a train/test split this
        data should be the training dataset only.
    """
    self._clear_worker_cache()
    ddf = dataset.to_ddf(columns=self._input_columns())

    # Get a dictionary mapping all StatOperators we need to fit to a set of any dependent
    # StatOperators (having StatOperators that depend on the output of other StatOperators
    # means that we will have multiple phases in the fit cycle here)
    stat_ops = {op: _get_stat_ops(op.parents) for op in _get_stat_ops([self.column_group])}

    while stat_ops:
        # get all the StatOperators that we can currently call fit on (no outstanding
        # dependencies)
        current_phase = [op for op, dependencies in stat_ops.items() if not dependencies]
        if not current_phase:
            # this shouldn't happen, but let's not infinite loop just in case
            raise RuntimeError("failed to find dependency-free StatOperator to fit")

        stats, ops = [], []
        for column_group in current_phase:
            # apply transforms necessary for the inputs to the current column group, ignoring
            # the transforms from the statop itself
            transformed_ddf = _transform_ddf(ddf, column_group.parents)

            op = column_group.op
            try:
                stats.append(op.fit(column_group.input_column_names, transformed_ddf))
                ops.append(op)
            except Exception:
                LOG.exception("Failed to fit operator %s", column_group.op)
                raise

        if self.client:
            results = [r.result() for r in self.client.compute(stats)]
        else:
            results = dask.compute(stats, scheduler="synchronous")[0]

        for computed_stats, op in zip(results, ops):
            op.fit_finalize(computed_stats)

        # Remove all the operators we processed in this phase, and remove
        # them from the dependencies of other ops too
        for stat_op in current_phase:
            stat_ops.pop(stat_op)
        for dependencies in stat_ops.values():
            dependencies.difference_update(current_phase)
def test_shuffling():
    num_rows = 10000
    batch_size = 10000

    df = pd.DataFrame({"a": np.asarray(range(num_rows)), "b": np.asarray([0] * num_rows)})

    train_dataset = tf_dataloader.KerasSequenceLoader(
        Dataset(df), cont_names=["a"], label_names=["b"], batch_size=batch_size, shuffle=True
    )

    batch = next(iter(train_dataset))

    first_batch = tf.reshape(tf.cast(batch[0]["a"].cpu(), tf.int32), (batch_size,))
    in_order = tf.range(0, batch_size, dtype=tf.int32)

    assert (first_batch != in_order).numpy().any()
    assert (tf.sort(first_batch) == in_order).numpy().all()
def test_nested_list():
    num_rows = 100
    batch_size = 12

    df = pd.DataFrame(
        {
            "data": [
                np.random.rand(np.random.randint(10) + 1, 3).tolist() for i in range(num_rows)
            ],
            "data2": [np.random.rand(np.random.randint(10) + 1).tolist() for i in range(num_rows)],
            "label": [np.random.rand() for i in range(num_rows)],
        }
    )

    train_dataset = tf_dataloader.KerasSequenceLoader(
        Dataset(df),
        cont_names=["data", "data2"],
        label_names=["label"],
        batch_size=batch_size,
        shuffle=False,
    )

    batch = next(iter(train_dataset))

    # nested list column, e.g. [[1,2,3],[3,1],...]
    nested_data_col = tf.RaggedTensor.from_row_lengths(
        batch[0]["data"][0][:, 0], tf.cast(batch[0]["data"][1][:, 0], tf.int32)
    ).to_tensor()
    true_data_col = tf.reshape(
        tf.ragged.constant(df.iloc[:batch_size, 0].tolist()).to_tensor(), [batch_size, -1]
    )

    # flat multi-hot column, e.g. [1,2,3]
    multihot_data2_col = tf.RaggedTensor.from_row_lengths(
        batch[0]["data2"][0][:, 0], tf.cast(batch[0]["data2"][1][:, 0], tf.int32)
    ).to_tensor()
    true_data2_col = tf.reshape(
        tf.ragged.constant(df.iloc[:batch_size, 1].tolist()).to_tensor(), [batch_size, -1]
    )

    assert nested_data_col.shape == true_data_col.shape
    assert np.allclose(nested_data_col.numpy(), true_data_col.numpy())
    assert multihot_data2_col.shape == true_data2_col.shape
    assert np.allclose(multihot_data2_col.numpy(), true_data2_col.numpy())
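# Standalone sketch (synthetic tensors, not the dataloader output) of the
# (values, row_lengths) representation the assertions above rely on: list columns come
# back as a flat values tensor plus a per-row length tensor, and
# tf.RaggedTensor.from_row_lengths stitches them back into rows.
def example_ragged_reconstruction():
    import tensorflow as tf

    values = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0])
    row_lengths = tf.constant([2, 0, 3], dtype=tf.int32)

    ragged = tf.RaggedTensor.from_row_lengths(values, row_lengths)
    # ragged -> [[1.0, 2.0], [], [3.0, 4.0, 5.0]]
    dense = ragged.to_tensor()  # zero-padded dense tensor of shape (3, 3)
    return dense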
def test_shuffling():
    num_rows = 10000
    batch_size = 10000

    df = pd.DataFrame({"a": np.asarray(range(num_rows)), "b": np.asarray([0] * num_rows)})

    train_dataset = torch_dataloader.TorchAsyncItr(
        Dataset(df), conts=["a"], labels=["b"], batch_size=batch_size, shuffle=True
    )

    batch = next(iter(train_dataset))

    first_batch = batch[0]["a"].cpu()
    in_order = torch.arange(0, batch_size)

    assert (first_batch != in_order).any()
    assert (torch.sort(first_batch).values == in_order).all()
def _validate_dataset(paths_or_dataset, batch_size, buffer_size, engine, reader_kwargs):
    # TODO: put this in parent class and allow
    # torch dataset to leverage as well?

    # if a dataset was passed, just return it
    if isinstance(paths_or_dataset, Dataset):
        return paths_or_dataset

    # otherwise initialize a dataset
    # from paths or glob pattern
    if isinstance(paths_or_dataset, str):
        files = tf.io.gfile.glob(paths_or_dataset)
        directory, pattern = os.path.split(paths_or_dataset)
        _is_empty_msg = "Couldn't find file pattern {} in directory {}".format(pattern, directory)
    else:
        # TODO: some checking around attribute
        # error here?
        files = list(paths_or_dataset)
        _is_empty_msg = "paths_or_dataset list must contain at least one filename"

    assert isinstance(files, list)
    if len(files) == 0:
        raise ValueError(_is_empty_msg)

    # implement buffer size logic
    # TODO: IMPORTANT
    # should we divide everything by 3 to account
    # for extra copies lying around due to asynchronicity?
    reader_kwargs = reader_kwargs or {}
    if buffer_size >= 1:
        if buffer_size < batch_size:
            reader_kwargs["batch_size"] = int(batch_size * buffer_size)
        else:
            reader_kwargs["batch_size"] = buffer_size
    else:
        reader_kwargs["part_mem_fraction"] = buffer_size
    return Dataset(files, engine=engine, **reader_kwargs)
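# Standalone sketch (no NVTabular imports, hypothetical helper name) that isolates the three
# buffer_size regimes handled above, so the branching is easy to read on its own. The
# interpretation of each regime is taken directly from the branches in _validate_dataset.
def resolve_buffer_size(batch_size, buffer_size):
    if buffer_size >= 1:
        if buffer_size < batch_size:
            # buffer_size acts as a multiplier of batch_size
            return {"batch_size": int(batch_size * buffer_size)}
        # buffer_size acts as an absolute reader batch size
        return {"batch_size": buffer_size}
    # fractional buffer_size maps to a fraction of partition memory
    return {"part_mem_fraction": buffer_size}


# resolve_buffer_size(1024, 0.1)  -> {"part_mem_fraction": 0.1}
# resolve_buffer_size(1024, 4)    -> {"batch_size": 4096}
# resolve_buffer_size(1024, 8192) -> {"batch_size": 8192}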
def fit(self, dataset: Dataset) -> "Workflow":
    """Calculates statistics for this workflow on the input dataset

    Parameters
    ----------
    dataset: Dataset
        The input dataset to calculate statistics for. If there is a train/test split this
        data should be the training dataset only.

    Returns
    -------
    Workflow
        This Workflow with statistics calculated on it
    """
    self._clear_worker_cache()

    if not self.output_schema:
        self.fit_schema(dataset.schema)

    ddf = dataset.to_ddf(columns=self._input_columns())

    # Get a dictionary mapping all StatOperators we need to fit to a set of any dependent
    # StatOperators (having StatOperators that depend on the output of other StatOperators
    # means that we will have multiple phases in the fit cycle here)
    stat_ops = {
        op: _get_stat_ops(op.parents_with_dependencies)
        for op in _get_stat_ops([self.output_node])
    }

    while stat_ops:
        # get all the StatOperators that we can currently call fit on (no outstanding
        # dependencies)
        current_phase = [op for op, dependencies in stat_ops.items() if not dependencies]
        if not current_phase:
            # this shouldn't happen, but let's not infinite loop just in case
            raise RuntimeError("failed to find dependency-free StatOperator to fit")

        stats, ops = [], []
        for workflow_node in current_phase:
            # Check for additional input columns that aren't generated by parents
            addl_input_cols = set()
            if workflow_node.parents:
                upstream_output_cols = sum(
                    [
                        upstream.output_columns
                        for upstream in workflow_node.parents_with_dependencies
                    ],
                    nvtabular.ColumnSelector(),
                )
                addl_input_cols = set(workflow_node.input_columns.names) - set(
                    upstream_output_cols.names
                )

            # apply transforms necessary for the inputs to the current column group, ignoring
            # the transforms from the statop itself
            transformed_ddf = _ensure_optimize_dataframe_graph(
                ddf=_transform_ddf(
                    ddf,
                    workflow_node.parents_with_dependencies,
                    additional_columns=addl_input_cols,
                )
            )

            op = workflow_node.op
            try:
                stats.append(op.fit(workflow_node.input_columns, transformed_ddf))
                ops.append(op)
            except Exception:
                LOG.exception("Failed to fit operator %s", workflow_node.op)
                raise

        if self.client:
            results = [r.result() for r in self.client.compute(stats)]
        else:
            results = dask.compute(stats, scheduler="synchronous")[0]

        for computed_stats, op in zip(results, ops):
            op.fit_finalize(computed_stats)

        # Remove all the operators we processed in this phase, and remove
        # them from the dependencies of other ops too
        for stat_op in current_phase:
            stat_ops.pop(stat_op)
        for dependencies in stat_ops.values():
            dependencies.difference_update(current_phase)

    # hack: store input/output dtypes here. We should have complete dtype
    # information for each operator (like we do for column names), but as
    # an interim solution this gets us what we need.
    input_dtypes = dataset.to_ddf()[self._input_columns()].dtypes
    self.input_dtypes = dict(zip(input_dtypes.index, input_dtypes))
    output_dtypes = self.transform(dataset).sample_dtypes()
    self.output_dtypes = dict(zip(output_dtypes.index, output_dtypes))

    self._zero_output_schemas()
    self.fit_schema(dataset.schema)
    return self
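# Minimal standalone sketch (generic names, not the NVTabular API) of the phase-based
# dependency resolution used by ``fit`` above: operators whose dependencies are all
# satisfied are fitted together in one phase, then removed from the remaining operators'
# dependency sets until nothing is left.
def fit_in_phases(dependencies):
    """dependencies: dict mapping each op name to the set of op names it depends on."""
    remaining = {op: set(deps) for op, deps in dependencies.items()}
    phases = []
    while remaining:
        ready = [op for op, deps in remaining.items() if not deps]
        if not ready:
            raise RuntimeError("circular dependency between operators")
        phases.append(ready)  # these would be fitted concurrently in one phase
        for op in ready:
            remaining.pop(op)
        for deps in remaining.values():
            deps.difference_update(ready)
    return phases


# e.g. fit_in_phases({"Categorify": set(), "TargetEncoding": {"Categorify"}})
# returns [["Categorify"], ["TargetEncoding"]]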
def test_dataloader_seeding(datasets, engine, batch_size):
    cont_names = ["x", "y", "id"]
    cat_names = ["name-string", "name-cat"]
    label_name = ["label"]

    dataset = Dataset(str(datasets["parquet"]), engine=engine)

    # Define a seed function that returns the same seed on all workers
    seed_fragments = []

    def seed_fn():
        # Capturing the next random number generated allows us to check
        # that different workers have different random states when this
        # function is called
        next_rand = _generate_local_seed(0, 1)
        seed_fragments.append(next_rand)

        # But since we don't actually want to run two data loaders in
        # parallel in this test, we'll cheat and return a static seed
        # instead of combining the fragments into a new seed
        return 5678

    # Set up two dataloaders with different global ranks
    data_loader_0 = DataLoader(
        dataset,
        cat_names=cat_names,
        cont_names=cont_names,
        batch_size=batch_size,
        label_names=label_name,
        shuffle=False,
        global_size=2,
        global_rank=0,
        seed_fn=seed_fn,
    )
    data_loader_1 = DataLoader(
        dataset,
        cat_names=cat_names,
        cont_names=cont_names,
        batch_size=batch_size,
        label_names=label_name,
        shuffle=False,
        global_size=2,
        global_rank=1,
        seed_fn=seed_fn,
    )

    # Starting from the same random state, run a shuffle on each worker
    # and capture the results
    np.random.seed(1234)
    data_loader_0._shuffle_indices()
    dl0_rng_state = _get_random_state()
    dl0_next_rand = dl0_rng_state.tomaxint(size=1)
    dl0_indices = data_loader_0.indices

    np.random.seed(1234)
    data_loader_1._shuffle_indices()
    dl1_next_rand = _generate_local_seed(0, 1)
    dl1_indices = data_loader_1.indices

    # Test that the seed function actually gets called in each data loader
    assert len(seed_fragments) == 2

    # Test that each data loader had different random state
    # when seed_fn was called
    assert seed_fragments[0] != seed_fragments[1]

    # Test that the shuffle has the same result on both workers
    # (i.e. the random seeds are the same when the shuffle happens)
    for idx, element in enumerate(dl0_indices):
        assert dl0_indices[idx] == dl1_indices[idx]

    # Test that after the shuffle each worker generates different random numbers
    # (i.e. the random seeds are different on each worker after the shuffle)
    assert dl0_next_rand != dl1_next_rand