def join_fit_transform(self, step: Pipeline, data_container: DataContainer, context: ExecutionContext) -> \
        Tuple['Any', DataContainer]:
    """
    Concatenate the pipeline fit transform output of each batch of self.batch_size together.

    :param step: pipeline to fit transform on
    :type step: Pipeline
    :param data_container: data container to fit transform on
    :type data_container: DataContainer
    :param context: execution context
    :return: fitted step, transformed data container
    :rtype: Tuple[Any, DataContainer]
    """
    context = context.push(step)

    data_container_batches = data_container.minibatches(
        batch_size=self.batch_size,
        include_incomplete_batch=self.include_incomplete_batch,
        default_value_data_inputs=self.default_value_data_inputs,
        default_value_expected_outputs=self.default_value_expected_outputs
    )

    output_data_container = ListDataContainer.empty()
    for data_container_batch in data_container_batches:
        step, data_container_batch = step._fit_transform_data_container(
            data_container_batch, context)
        output_data_container.concat(data_container_batch)

    return step, output_data_container
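For reference, here is a minimal, self-contained sketch of the batch-then-concatenate pattern implemented above, with plain lists standing in for DataContainer and ListDataContainer. Every name below is a hypothetical stand-in, not the library's API:

from typing import Any, List, Tuple


class DoublingStep:
    """Hypothetical stand-in for a pipeline step: fit is a no-op, transform doubles."""

    def fit_transform(self, batch: List[int]) -> Tuple['DoublingStep', List[int]]:
        return self, [2 * x for x in batch]


def minibatches_sketch(data: List[Any], batch_size: int) -> List[List[Any]]:
    # Slice the data into consecutive batches; the last one may be incomplete.
    return [data[i:i + batch_size] for i in range(0, len(data), batch_size)]


def join_fit_transform_sketch(step: DoublingStep, data: List[int], batch_size: int) -> Tuple[DoublingStep, List[int]]:
    output: List[int] = []
    for batch in minibatches_sketch(data, batch_size):
        step, out = step.fit_transform(batch)  # refit on each batch, keep the latest step
        output.extend(out)                     # concatenate, like ListDataContainer.concat
    return step, output


step, result = join_fit_transform_sketch(DoublingStep(), list(range(10)), batch_size=4)
assert result == [2 * x for x in range(10)]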
def test_data_container_batching(batch_size, include_incomplete_pass, default_value, expected_data_containers):
    data_container = DataContainer(
        current_ids=[str(i) for i in range(10)],
        data_inputs=np.array(list(range(10))),
        expected_outputs=np.array(list(range(10, 20)))
    )

    # When
    data_containers = []
    for dc in data_container.minibatches(
            batch_size=batch_size,
            include_incomplete_batch=include_incomplete_pass,
            default_value_data_inputs=default_value
    ):
        data_containers.append(dc)

    # Then
    assert len(expected_data_containers) == len(data_containers)
    for expected_data_container, actual_data_container in zip(expected_data_containers, data_containers):
        # Without the leading assert, np.array_equal's result would be silently
        # discarded and the test would pass vacuously.
        assert np.array_equal(expected_data_container.current_ids, actual_data_container.current_ids)
        assert np.array_equal(expected_data_container.data_inputs, actual_data_container.data_inputs)
        assert np.array_equal(expected_data_container.expected_outputs, actual_data_container.expected_outputs)
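The test above takes its arguments from a parametrize decorator that is not shown in this section. A hedged sketch of what one such case could look like; the actual cases in the original suite may differ:

import numpy as np
import pytest

BATCHING_CASES = [
    # Hypothetical case: 10 items, batch_size=3, incomplete last batch dropped,
    # so only the 3 full batches (items 0..8) are expected.
    (3, False, None, [
        DataContainer(
            current_ids=[str(i) for i in range(b * 3, b * 3 + 3)],
            data_inputs=np.array(list(range(b * 3, b * 3 + 3))),
            expected_outputs=np.array(list(range(b * 3 + 10, b * 3 + 13))),
        )
        for b in range(3)
    ]),
]


@pytest.mark.parametrize(
    'batch_size, include_incomplete_pass, default_value, expected_data_containers',
    BATCHING_CASES)
def test_data_container_batching(batch_size, include_incomplete_pass, default_value, expected_data_containers):
    ...  # the test body shown above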
def join_transform(self, step: Pipeline, data_container: DataContainer, context: ExecutionContext) -> DataContainer:
    """
    Concatenate the pipeline transform output of each batch of self.batch_size together.

    :param step: pipeline to transform on
    :type step: Pipeline
    :param data_container: data container to transform
    :type data_container: DataContainer
    :param context: execution context
    :return: transformed data container
    :rtype: DataContainer
    """
    context = context.push(step)

    data_container_batches = data_container.minibatches(
        batch_size=self.batch_size,
        include_incomplete_batch=self.include_incomplete_batch,
        default_value_data_inputs=self.default_value_data_inputs,
        default_value_expected_outputs=self.default_value_expected_outputs
    )

    output_data_container = ListDataContainer.empty()
    for data_container_batch in data_container_batches:
        output_data_container.concat(
            step._transform_data_container(data_container_batch, context))

    return output_data_container
def transform_data_container(self, data_container: DataContainer, context: ExecutionContext) -> DataContainer:
    """
    Transform the data container by sending its minibatches to the queued pipeline
    workers, then joining the batch results back in order.

    :param data_container: data container to transform.
    :type data_container: DataContainer
    :param context: execution context
    :type context: ExecutionContext
    :return: transformed data container
    """
    data_container_batches = data_container.minibatches(
        batch_size=self.batch_size,
        include_incomplete_batch=self.include_incomplete_batch,
        default_value_data_inputs=self.default_value_data_inputs,
        default_value_expected_outputs=self.default_value_expected_outputs
    )

    # Tell the joiner (the last step) how many batch results to expect.
    n_batches = self.get_n_batches(data_container)
    self[-1].set_n_batches(n_batches)

    # Start every worker step; the joiner at the end is not a worker.
    for name, step in self[:-1]:
        step.start(context)

    # Send each batch with its index so results can be reordered on join.
    batch_index = 0
    for data_container_batch in data_container_batches:
        self.send_batch_to_queued_pipeline(batch_index=batch_index, data_container=data_container_batch)
        batch_index += 1

    data_container = self[-1].join(original_data_container=data_container)
    return data_container
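The queued pipeline above defers the actual work: batches are sent with an index so that the final joiner step can reassemble them in their original order once n_batches results have arrived. Below is a self-contained sketch of that producer/worker/join flow using only the standard library; all names here are hypothetical stand-ins, not the library's queued-pipeline API:

import queue
import threading
from typing import Any, Callable, List


def stream_transform_sketch(transform: Callable[[List[Any]], List[Any]],
                            batches: List[List[Any]], n_workers: int = 2) -> List[Any]:
    todo: queue.Queue = queue.Queue()
    done: queue.Queue = queue.Queue()

    def worker():
        while True:
            item = todo.get()
            if item is None:  # poison pill: stop this worker
                break
            batch_index, batch = item
            done.put((batch_index, transform(batch)))

    threads = [threading.Thread(target=worker) for _ in range(n_workers)]
    for t in threads:
        t.start()  # like step.start(context)

    # Like send_batch_to_queued_pipeline: each batch carries its index.
    for batch_index, batch in enumerate(batches):
        todo.put((batch_index, batch))
    for _ in threads:
        todo.put(None)
    for t in threads:
        t.join()

    # Join: collect exactly n_batches results and restore the original order.
    results = [done.get() for _ in batches]
    results.sort(key=lambda pair: pair[0])
    output: List[Any] = []
    for _, transformed in results:
        output.extend(transformed)
    return output


assert stream_transform_sketch(lambda b: [x * 2 for x in b], [[1, 2], [3, 4], [5]]) == [2, 4, 6, 8, 10]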
def join_transform(self, step: TruncableSteps, data_container: DataContainer, context: ExecutionContext) -> ZipDataContainer:
    """
    Transform each batch of self.batch_size and zip the batch outputs together.

    :param step: steps to transform on
    :param data_container: data container to transform
    :param context: execution context
    :return: zipped data container of all batch outputs
    """
    context = context.push(step)

    data_container_batches = data_container.minibatches(
        batch_size=self.batch_size,
        keep_incomplete_batch=self.keep_incomplete_batch,
        default_value_data_inputs=self.default_value_data_inputs,
        default_value_expected_outputs=self.default_value_expected_outputs
    )

    output_data_container = []
    for data_container_batch in data_container_batches:
        output_data_container.append(
            step._transform_data_container(data_container_batch, context))

    return ZipDataContainer.create_from(*output_data_container)
def test_data_container_should_iterate_through_batches_using_convolved():
    data_container = DataContainer(
        current_ids=[str(i) for i in range(100)],
        data_inputs=np.array(list(range(100))),
        expected_outputs=np.array(list(range(100, 200)))
    )

    batches = []
    for b in data_container.minibatches(batch_size=10):
        batches.append(b)

    for i, batch in enumerate(batches):
        assert np.array_equal(
            np.array(batch.data_inputs),
            np.array(list(range(i * 10, (i * 10) + 10))))
        assert np.array_equal(
            np.array(batch.expected_outputs),
            np.array(list(range((i * 10) + 100, (i * 10) + 100 + 10))))
def join_fit_transform(self, step: Pipeline, data_container: DataContainer, context: ExecutionContext) -> \
        Tuple['Any', DataContainer]:
    """
    Fit transform each batch of self.batch_size and zip the batch outputs together.

    :param step: pipeline to fit transform on
    :param data_container: data container to fit transform on
    :param context: execution context
    :return: fitted step, zipped data container of all batch outputs
    """
    context = context.push(step)

    data_container_batches = data_container.minibatches(
        batch_size=self.batch_size,
        keep_incomplete_batch=self.keep_incomplete_batch,
        default_value_data_inputs=self.default_value_data_inputs,
        default_value_expected_outputs=self.default_value_expected_outputs
    )

    output_data_container = []
    for data_container_batch in data_container_batches:
        step, data_container_batch = step._fit_transform_data_container(data_container_batch, context)
        output_data_container.append(data_container_batch)

    return step, ZipDataContainer.create_from(*output_data_container)
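The two join_* variants above differ from the earlier concat-based ones in how batch outputs are recombined: ListDataContainer.concat flattens the batches back into one sequence, while the ZipDataContainer path groups them. A small sketch of that contrast with plain lists, assuming create_from groups the given containers element-wise in the spirit of Python's built-in zip:

from typing import List, Tuple

batch_outputs: List[List[int]] = [[0, 1], [2, 3], [4, 5]]

# Concat-style join (like ListDataContainer.concat): one flat sequence.
concat_join: List[int] = [x for batch in batch_outputs for x in batch]
assert concat_join == [0, 1, 2, 3, 4, 5]

# Zip-style join (assumed behavior of create_from(*batches)): element-wise
# grouping across batch outputs, so item i of each batch ends up together.
zip_join: List[Tuple[int, ...]] = list(zip(*batch_outputs))
assert zip_join == [(0, 2, 4), (1, 3, 5)]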
def test_data_container_minibatch_should_be_lazy_and_use_getitem_when_data_is_lazy_loadable():
    items = [LoadableItem() for _ in range(10)]
    data_inputs = SomeLazyLoadableCollection(items)
    expected_outputs = SomeLazyLoadableCollection([LoadableItem() for _ in range(10)])
    data_container = DataContainer(data_inputs=data_inputs, expected_outputs=expected_outputs)

    i = 0
    batch_size = 2
    for batch in data_container.minibatches(batch_size=batch_size):
        assert len(batch) == batch_size
        # Items from already-consumed batches have been loaded...
        assert all(item.is_loaded() for item in data_inputs.inner_list[:(i * batch_size)])
        # ...while items beyond the current batch must remain unloaded.
        for y in range((i + 1) * batch_size, len(data_inputs)):
            assert not items[y].is_loaded()
        i += 1
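LoadableItem and SomeLazyLoadableCollection are test fixtures defined outside this section. A plausible reconstruction, hypothetical and given only to make the test above readable, could look like this:

class LoadableItem:
    """Hypothetical fixture: an item that records whether it was ever loaded."""

    def __init__(self):
        self.loaded = False

    def load(self) -> 'LoadableItem':
        self.loaded = True
        return self

    def is_loaded(self) -> bool:
        return self.loaded


class SomeLazyLoadableCollection:
    """Hypothetical fixture: loads items only when they are actually indexed or
    iterated, so a lazy minibatch iterator must not touch later items."""

    def __init__(self, inner_list):
        self.inner_list = inner_list

    def __getitem__(self, key):
        if isinstance(key, slice):
            return SomeLazyLoadableCollection([item.load() for item in self.inner_list[key]])
        return self.inner_list[key].load()

    def __iter__(self):
        for item in self.inner_list:
            yield item.load()

    def __len__(self):
        return len(self.inner_list)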