def test_load_full_dump_from_path(tmpdir):
    # Given
    tape_fit_callback_function = TapeCallbackFunction()
    tape_transform_callback_function = TapeCallbackFunction()
    pipeline = Pipeline([
        ('step_a', Identity()),
        ('step_b', OutputTransformerWrapper(FitTransformCallbackStep(
            tape_fit_callback_function, tape_transform_callback_function)))
    ], cache_folder=tmpdir).set_name(PIPELINE_NAME)

    # When
    pipeline, outputs = pipeline.fit_transform(DATA_INPUTS, EXPECTED_OUTPUTS)
    pipeline.save(ExecutionContext(tmpdir), full_dump=True)

    # Then
    loaded_pipeline = ExecutionContext(tmpdir).load(os.path.join(PIPELINE_NAME, 'step_b'))

    assert isinstance(loaded_pipeline, OutputTransformerWrapper)
    loaded_step_b_wrapped_step = loaded_pipeline.wrapped
    assert np.array_equal(
        loaded_step_b_wrapped_step.transform_callback_function.data[0],
        EXPECTED_OUTPUTS)
    assert np.array_equal(
        loaded_step_b_wrapped_step.fit_callback_function.data[0][0],
        EXPECTED_OUTPUTS)
    assert np.array_equal(
        loaded_step_b_wrapped_step.fit_callback_function.data[0][1],
        [None] * len(EXPECTED_OUTPUTS))
def _fit_transform_data_container(self, data_container: DataContainer, context: ExecutionContext) -> ('BaseStep', DataContainer):
    """
    Fit transform the wrapped step: split the data container into train and
    validation splits, fit transform on the train split, score both splits,
    then predict on the full data container with metrics disabled.

    :param data_container: data container to fit transform
    :type data_container: DataContainer
    :param context: execution context
    :type context: ExecutionContext
    :return: (self, data container)
    """
    train_data_container, validation_data_container = self.split_data_container(data_container)

    # Add a sub data container for the validation metrics calculated in MetricsWrapper.
    train_data_container.add_sub_data_container(
        name=VALIDATION_SUB_DATA_CONTAINER_NAME,
        data_container=validation_data_container
    )

    self.wrapped, results_data_container = self.wrapped.handle_fit_transform(
        train_data_container, context.push(self.wrapped))
    self._update_scores_train(results_data_container.data_inputs,
                              results_data_container.expected_outputs)

    results_data_container = self.wrapped.handle_predict(
        validation_data_container, context.push(self.wrapped))
    self._update_scores_validation(results_data_container.data_inputs,
                                   results_data_container.expected_outputs)

    self.wrapped.apply('disable_metrics')
    data_container = self.wrapped.handle_predict(data_container, context.push(self.wrapped))
    self.wrapped.apply('enable_metrics')

    return self, data_container
def _fit_transform_data_container(self, data_container: DataContainer, context: ExecutionContext) -> ('BaseStep', DataContainer):
    """
    Fit transform the wrapped step: split the data container into train and
    validation splits, fit transform on the train split, score both splits,
    then predict on the full data container with metrics disabled.

    :param data_container: data container to fit transform
    :type data_container: DataContainer
    :param context: execution context
    :type context: ExecutionContext
    :return: (self, data container)
    """
    train_data_container, validation_data_container = self.split_data_container(data_container)

    self.wrapped, results_data_container = self.wrapped.handle_fit_transform(
        train_data_container, context.push(self.wrapped))
    self._update_scores_train(results_data_container.data_inputs,
                              results_data_container.expected_outputs)

    results_data_container = self.wrapped.handle_predict(
        validation_data_container, context.push(self.wrapped))
    self._update_scores_validation(results_data_container.data_inputs,
                                   results_data_container.expected_outputs)

    self.wrapped.apply('disable_metrics')
    data_container = self.wrapped.handle_predict(data_container, context.push(self.wrapped))
    self.wrapped.apply('enable_metrics')

    return self, data_container
def _fit_transform_data_container(self, data_container: DataContainer, context: ExecutionContext) -> ('BaseStep', DataContainer):
    """
    According to the idiom of `(1, 2, reversed(1))`, we do this, in order:

        - `1`. Fit Transform preprocessing step
        - `2`. Fit Transform postprocessing step
        - `reversed(1)`. Inverse transform preprocessing step

    :param data_container: data container to transform
    :param context: execution context
    :return: (self, data_container)
    """
    self["preprocessing_step"], data_container = self["preprocessing_step"].handle_fit_transform(
        data_container, context.push(self["preprocessing_step"]))
    self["postprocessing_step"], data_container = self["postprocessing_step"].handle_fit_transform(
        data_container, context.push(self["postprocessing_step"]))

    data_container = self["preprocessing_step"].handle_inverse_transform(
        data_container, context.push(self["preprocessing_step"]))

    current_ids = self.hash(data_container)
    data_container.set_current_ids(current_ids)

    return self, data_container
def handle_fit(self, data_container: DataContainer, context: ExecutionContext):
    """
    Fit the parallel steps on the data. It will make use of some parallel processing.

    :param data_container: the input data to fit onto
    :param context: execution context
    :return: (self, data_container)
    """
    # Actually fit:
    if self.n_jobs != 1:
        fitted_steps_data_containers = Parallel(backend=self.backend, n_jobs=self.n_jobs)(
            delayed(step.handle_fit)(data_container.copy(), context.push(step))
            for _, step in self.steps_as_tuple
        )
    else:
        fitted_steps_data_containers = [
            step.handle_fit(data_container.copy(), context.push(step))
            for _, step in self.steps_as_tuple
        ]

    # Save fitted steps
    for i, (fitted_step, _) in enumerate(fitted_steps_data_containers):
        self.steps_as_tuple[i] = (self.steps_as_tuple[i][0], fitted_step)
    self._refresh_steps()

    return self, data_container
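# A minimal, self-contained sketch of the fan-out pattern used by handle_fit
# above, assuming only joblib. The step functions `double` and `triple` are
# hypothetical stand-ins for real steps; each one receives its own copy of
# the data, mirroring data_container.copy() above so steps cannot mutate
# each other's input.
from joblib import Parallel, delayed


def double(data):
    return [value * 2 for value in data]


def triple(data):
    return [value * 3 for value in data]


data = list(range(5))
results = Parallel(backend='threading', n_jobs=2)(
    delayed(step)(list(data)) for step in (double, triple)
)
print(results)  # [[0, 2, 4, 6, 8], [0, 3, 6, 9, 12]]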
def save_checkpoint(self, data_container: DataContainer, context: ExecutionContext) -> DataContainer:
    # Only save when this checkpoint applies to the current execution mode.
    if self.is_for_execution_mode(context.get_execution_mode()):
        # TODO: save the context by execution mode AND data container ids / summary
        context.copy().save()

    return data_container
def handle_transform(self, data_container: DataContainer, context: ExecutionContext) -> DataContainer:
    """
    According to the idiom of `(1, 2, reversed(1))`, we do this, in order:

        - `1`. Transform preprocessing step
        - `2`. Transform postprocessing step
        - `reversed(1)`. Inverse transform preprocessing step

    :param data_container: data container to transform
    :type data_container: DataContainer
    :param context: execution context
    :type context: ExecutionContext
    :return: data_container
    :rtype: DataContainer
    """
    data_container = self["preprocessing_step"].handle_transform(
        data_container, context.push(self["preprocessing_step"]))
    data_container = self["postprocessing_step"].handle_transform(
        data_container, context.push(self["postprocessing_step"]))
    data_container = self["preprocessing_step"].handle_inverse_transform(
        data_container, context.push(self["preprocessing_step"]))

    current_ids = self.hash(data_container)
    data_container.set_current_ids(current_ids)

    return data_container
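# A plain-Python sketch of the `(1, 2, reversed(1))` idiom documented above,
# with hypothetical preprocess/postprocess functions: the data is mapped into
# a working space, the inner step runs there, and the preprocessing is then
# inverted so the output lands back in the original space.
import numpy as np


def preprocess(x, lo=0.0, hi=10.0):
    # `1`: scale into [0, 1]
    return (x - lo) / (hi - lo)


def inverse_preprocess(x, lo=0.0, hi=10.0):
    # `reversed(1)`: undo the scaling
    return x * (hi - lo) + lo


def postprocess(x):
    # `2`: the inner step, applied in the scaled space
    return x ** 2


data = np.array([0.0, 5.0, 10.0])
out = inverse_preprocess(postprocess(preprocess(data)))
print(out)  # [ 0.   2.5 10. ]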
def save_checkpoint(self, data_container: DataContainer, context: ExecutionContext) -> DataContainer:
    """
    Save data container data inputs with :py:attr:`~data_input_checkpointer`.
    Save data container expected outputs with :py:attr:`~expected_output_checkpointer`.

    :param data_container: data container to checkpoint
    :type data_container: neuraxle.data_container.DataContainer
    :param context: execution context to checkpoint from
    :type context: ExecutionContext
    :return: the unchanged data container
    :rtype: neuraxle.data_container.DataContainer
    """
    if not self.is_for_execution_mode(context.get_execution_mode()):
        return data_container

    context.mkdir()

    self.summary_checkpointer.save_summary(
        checkpoint_path=context.get_path(), data_container=data_container)

    for current_id, data_input, expected_output in data_container:
        self.data_input_checkpointer.save_checkpoint(
            checkpoint_path=self._get_data_input_checkpoint_path(context),
            current_id=current_id,
            data=data_input)
        self.expected_output_checkpointer.save_checkpoint(
            checkpoint_path=self._get_expected_output_checkpoint_path(context),
            current_id=current_id,
            data=expected_output)

    return data_container
def should_resume(self, data_container: DataContainer, context: ExecutionContext) -> bool:
    """
    Return whether the whole data container has been checkpointed.

    :param data_container: data container to read checkpoint for
    :type data_container: neuraxle.data_container.DataContainer
    :param context: execution context to read checkpoint from
    :type context: ExecutionContext
    :return: whether the summary and every single checkpoint exist
    :rtype: bool
    """
    if not self.summary_checkpointer.checkpoint_exists(context.get_path(), data_container):
        return False

    current_ids = self.summary_checkpointer.read_summary(
        checkpoint_path=context.get_path(), data_container=data_container)

    for current_id in current_ids:
        if not self.data_input_checkpointer.checkpoint_exists(
                checkpoint_path=self._get_data_input_checkpoint_path(context),
                current_id=current_id):
            return False
        if not self.expected_output_checkpointer.checkpoint_exists(
                checkpoint_path=self._get_expected_output_checkpoint_path(context),
                current_id=current_id):
            return False

    return True
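# An illustrative sketch of the resume check above, assuming a hypothetical
# on-disk layout (not Neuraxle's actual savers): a summary file lists one
# current_id per line, and each id has a data-input file under di/ and an
# expected-output file under eo/. Resuming is only safe when the summary and
# every per-id file exist.
import os


def checkpoint_complete(checkpoint_dir: str) -> bool:
    summary_path = os.path.join(checkpoint_dir, 'summary.txt')
    if not os.path.exists(summary_path):
        return False
    with open(summary_path) as f:
        current_ids = [line.strip() for line in f if line.strip()]
    for current_id in current_ids:
        if not os.path.exists(os.path.join(checkpoint_dir, 'di', current_id + '.pickle')):
            return False
        if not os.path.exists(os.path.join(checkpoint_dir, 'eo', current_id + '.pickle')):
            return False
    return True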
def handle_transform(self, data_container: DataContainer, context: ExecutionContext):
    """
    Transform the data with the unions. It will make use of some parallel processing.

    :param data_container: data container
    :param context: execution context
    :return: the joined data container
    """
    if self.n_jobs != 1:
        data_containers = Parallel(backend=self.backend, n_jobs=self.n_jobs)(
            delayed(step.handle_transform)(data_container.copy(), context.push(step))
            for _, step in self.steps_as_tuple
        )
    else:
        data_containers = [
            step.handle_transform(data_container.copy(), context.push(step))
            for _, step in self.steps_as_tuple
        ]

    new_current_ids = self.hash(data_container)
    data_container = self.joiner.handle_transform(data_containers, new_current_ids)

    return data_container
def fit_data_container(self, data_container):
    """
    Hash the data container, build a FIT execution context, and fit on it.

    :param data_container: data container to fit on
    :return: fitted self
    """
    data_container = self.hash_data_container(data_container)
    context = ExecutionContext(self.cache_folder, ExecutionMode.FIT)
    context = context.push(self)
    new_self = self._fit_data_container(data_container, context)

    return new_self
def test_queued_pipeline_saving(tmpdir):
    # Given
    p = ParallelQueuedFeatureUnion([
        ('1', FitTransformCallbackStep()),
        ('2', FitTransformCallbackStep()),
        ('3', FitTransformCallbackStep()),
        ('4', FitTransformCallbackStep()),
    ], n_workers_per_step=1, max_queue_size=10, batch_size=10)

    # When
    p, outputs = p.fit_transform(list(range(100)), list(range(100)))
    p.save(ExecutionContext(tmpdir))
    p.apply('clear_callbacks')

    # Then
    for i in range(4):
        assert len(p[i].wrapped.transform_callback_function.data) == 0
        assert len(p[i].wrapped.fit_callback_function.data) == 0

    p = p.load(ExecutionContext(tmpdir))

    for i in range(4):
        assert len(p[i].wrapped.transform_callback_function.data) == 10
        assert len(p[i].wrapped.fit_callback_function.data) == 10
def fit_transform_data_container(self, data_container):
    """
    Hash the data container, build a FIT_TRANSFORM execution context,
    and fit transform on it.

    :param data_container: data container to fit transform on
    :return: (fitted self, transformed data inputs)
    """
    data_container = self.hash_data_container(data_container)
    context = ExecutionContext(root=self.cache_folder, execution_mode=ExecutionMode.FIT_TRANSFORM)
    context = context.push(self)
    new_self, data_container = self._fit_transform_data_container(data_container, context)

    return new_self, data_container.data_inputs
def transform_data_container(self, data_container: DataContainer):
    """
    Hash the data container, build a TRANSFORM execution context, and transform it.

    :param data_container: data container to transform
    :return: transformed data inputs
    """
    data_container = self.hash_data_container(data_container)
    context = ExecutionContext(root=self.cache_folder, execution_mode=ExecutionMode.TRANSFORM)
    context = context.push(self)
    data_container = self._transform_data_container(data_container, context)

    return data_container.data_inputs
def test_localassert_should_assert_dependencies_properly_at_exec(tmpdir):
    data_inputs = np.array([0, 1, 2, 3])
    context = ExecutionContext(root=tmpdir)
    p = Pipeline([
        RegisterServiceDynamically(),
        SomeStep().assert_has_services_at_execution(SomeBaseService)
    ]).with_context(context=context)

    p.transform(data_inputs=data_inputs)

    service = context.get_service(SomeBaseService)
    assert np.array_equal(service.data, data_inputs)
def test_with_context_should_inject_dependencies_properly(tmpdir):
    data_inputs = np.array([0, 1, 2, 3])
    context = ExecutionContext(root=tmpdir)
    service = SomeService()
    context.set_service_locator({BaseService: service})
    p = Pipeline([
        SomeStep().assert_has_services(BaseService)
    ]).with_context(context=context)

    p.transform(data_inputs=data_inputs)

    assert np.array_equal(service.data, data_inputs)
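# A minimal sketch of the service-locator pattern the two tests above rely on
# (illustrative, not Neuraxle's API): steps ask for an abstract service type
# at execution time and receive whatever concrete instance was registered.
class BaseServiceSketch:
    def store(self, data):
        raise NotImplementedError


class SomeServiceSketch(BaseServiceSketch):
    def __init__(self):
        self.data = None

    def store(self, data):
        self.data = data


def run_step(data, service_locator):
    # Raises KeyError when the dependency was never registered, which is the
    # moment assert_has_services_at_execution would fail above.
    service = service_locator[BaseServiceSketch]
    service.store(data)
    return data


services = {BaseServiceSketch: SomeServiceSketch()}
run_step([0, 1, 2, 3], services)
assert services[BaseServiceSketch].data == [0, 1, 2, 3]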
def test_tensorflowv2_saver(tmpdir):
    dataset = toy_dataset()
    model = Pipeline([create_model_step(tmpdir)])
    loss_first_fit = evaluate_model_on_dataset(model, dataset)

    model.save(ExecutionContext(root=tmpdir))

    loaded = Pipeline([create_model_step(tmpdir)]).load(ExecutionContext(root=tmpdir))
    loss_second_fit = evaluate_model_on_dataset(loaded, dataset)

    assert loss_second_fit < (loss_first_fit / 2)
def test_step_with_context_should_only_save_wrapped_step(tmpdir):
    context = ExecutionContext(root=tmpdir)
    service = SomeService()
    context.set_service_locator({BaseService: service})
    p = Pipeline([
        SomeStep().assert_has_services(BaseService)
    ]).with_context(context=context)

    p.save(context, full_dump=True)

    p: Pipeline = ExecutionContext(root=tmpdir).load(os.path.join('StepWithContext', 'Pipeline'))
    assert isinstance(p, Pipeline)
def transform(self, data_inputs: Any):
    """
    After loading the last checkpoint, transform each pipeline step.

    :param data_inputs: the data inputs to transform
    :return: transformed data inputs
    """
    data_container = DataContainer(current_ids=None, data_inputs=data_inputs)
    data_container = self.hash_data_container(data_container)

    context = ExecutionContext(root=self.cache_folder, execution_mode=ExecutionMode.TRANSFORM)
    context = context.push(self)
    data_container = self._transform_data_container(data_container, context)

    return data_container.data_inputs
def test_auto_ml_should_assert_dependencies_properly_at_exec(tmpdir):
    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 2
    p = Pipeline([
        RegisterServiceDynamically(),
        SomeStep().assert_has_services_at_execution(SomeBaseService),
    ])
    context = ExecutionContext(root=tmpdir)
    auto_ml: AutoML = _make_autoML_loop(tmpdir, p)
    auto_ml: StepWithContext = auto_ml.with_context(context=context)
    assert isinstance(auto_ml, StepWithContext)

    auto_ml.fit(data_inputs, expected_outputs)

    service = context.get_service(SomeBaseService)
    assert np.array_equal(service.data, data_inputs)
def join_fit_transform(self, step: Pipeline, data_container: DataContainer,
                       context: ExecutionContext) -> Tuple['Any', DataContainer]:
    """
    Concatenate the pipeline fit transform output of each batch of self.batch_size together.

    :param step: pipeline to fit transform on
    :type step: Pipeline
    :param data_container: data container to fit transform on
    :type data_container: DataContainer
    :param context: execution context
    :return: (fitted step, transformed data container)
    :rtype: Tuple[Any, DataContainer]
    """
    context = context.push(step)

    data_container_batches = data_container.minibatches(
        batch_size=self.batch_size,
        include_incomplete_batch=self.include_incomplete_batch,
        default_value_data_inputs=self.default_value_data_inputs,
        default_value_expected_outputs=self.default_value_expected_outputs)

    output_data_container = ListDataContainer.empty()
    for data_container_batch in data_container_batches:
        step, data_container_batch = step._fit_transform_data_container(
            data_container_batch, context)
        output_data_container.concat(data_container_batch)

    return step, output_data_container
def join_transform(self, step: Pipeline, data_container: DataContainer,
                   context: ExecutionContext) -> DataContainer:
    """
    Concatenate the pipeline transform output of each batch of self.batch_size together.

    :param step: pipeline to transform on
    :type step: Pipeline
    :param data_container: data container to transform
    :type data_container: DataContainer
    :param context: execution context
    :return: transformed data container
    :rtype: DataContainer
    """
    context = context.push(step)

    data_container_batches = data_container.minibatches(
        batch_size=self.batch_size,
        include_incomplete_batch=self.include_incomplete_batch,
        default_value_data_inputs=self.default_value_data_inputs,
        default_value_expected_outputs=self.default_value_expected_outputs)

    output_data_container = ListDataContainer.empty()
    for data_container_batch in data_container_batches:
        output_data_container.concat(
            step._transform_data_container(data_container_batch, context))

    return output_data_container
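# A self-contained sketch of the join logic in the two joiners above: split
# the data into consecutive batches, run each batch through the step, and
# concatenate the outputs in order. `minibatches` here is a hypothetical
# stand-in for DataContainer.minibatches.
def minibatches(sequence, batch_size):
    # Yield consecutive slices of size batch_size; the last one may be shorter.
    for i in range(0, len(sequence), batch_size):
        yield sequence[i:i + batch_size]


def join_transform_sketch(transform, data, batch_size):
    output = []
    for batch in minibatches(data, batch_size):
        output.extend(transform(batch))
    return output


assert join_transform_sketch(
    lambda batch: [value * 2 for value in batch],
    list(range(7)), batch_size=3) == [0, 2, 4, 6, 8, 10, 12]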
def _load_checkpoint(self, data_container: DataContainer,
                     context: ExecutionContext) -> Tuple[NamedTupleList, DataContainer]:
    """
    Try loading a pipeline cache with the passed data container.
    If pipeline cache loading succeeds, find the steps left to do,
    and load the latest data container.

    :param data_container: the data container to resume
    :param context: the execution context to resume
    :return: tuple(steps left to do, last checkpoint data container)
    """
    new_starting_step_index, starting_step_data_container = \
        self._get_starting_step_info(data_container, context)

    loading_context = context.copy()
    loading_context.pop()
    loaded_pipeline = self.load(loading_context)

    if not self.are_steps_before_index_the_same(loaded_pipeline, new_starting_step_index):
        return self.steps_as_tuple, data_container

    self._assign_loaded_pipeline_into_self(loaded_pipeline)

    step = self[new_starting_step_index]
    if isinstance(step, Checkpoint) or (
            isinstance(step, MetaStep) and isinstance(step.wrapped, Checkpoint)):
        starting_step_data_container = step.resume(starting_step_data_container, context)

    return self[new_starting_step_index:], starting_step_data_container
def read_checkpoint(self, data_container: DataContainer, context: ExecutionContext) -> DataContainer:
    """
    Read data container data inputs checkpoint with :py:attr:`~data_input_checkpointer`.
    Read data container expected outputs checkpoint with :py:attr:`~expected_output_checkpointer`.

    :param data_container: data container to read checkpoint for
    :type data_container: neuraxle.data_container.DataContainer
    :param context: execution context to read checkpoint from
    :type context: ExecutionContext
    :return: data container checkpoint
    :rtype: neuraxle.data_container.DataContainer
    """
    data_container_checkpoint = ListDataContainer.empty(original_data_container=data_container)

    current_ids = self.summary_checkpointer.read_summary(
        checkpoint_path=context.get_path(), data_container=data_container)

    for current_id in current_ids:
        data_input = self.data_input_checkpointer.read_checkpoint(
            checkpoint_path=self._get_data_input_checkpoint_path(context),
            current_id=current_id)
        expected_output = self.expected_output_checkpointer.read_checkpoint(
            checkpoint_path=self._get_expected_output_checkpoint_path(context),
            current_id=current_id)
        data_container_checkpoint.append(current_id, data_input, expected_output)

    return data_container_checkpoint
def test_logger():
    file_path = "test.log"
    if os.path.exists(file_path):
        os.remove(file_path)

    # Given
    logger = logging.getLogger('test')
    file_handler = logging.FileHandler(file_path)
    file_handler.setLevel('DEBUG')
    logger.addHandler(file_handler)
    logger.setLevel('DEBUG')
    context = ExecutionContext(logger=logger)
    pipeline = Pipeline([
        MultiplyByN(2).set_hyperparams_space(
            HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
        NumpyReshape(new_shape=(-1, 1)),
        LoggingStep()
    ])

    # When
    data_container = DataContainer(data_inputs=np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
    pipeline.handle_fit(data_container, context)

    # Then
    assert os.path.exists(file_path)
    with open(file_path) as f:
        log_contents = f.read()
    assert len(log_contents) > 0

    # Teardown
    file_handler.close()
    os.remove(file_path)
def start(self, context: ExecutionContext):
    """
    Start multiple processes or threads with the worker function as a target.

    :param context: execution context
    :type context: ExecutionContext
    :return:
    """
    thread_safe_context = context
    thread_safe_self = self
    parallel_call = Thread

    if self.use_processes:
        # A new process requires trimming the references to other processes
        # when we create many processes: https://stackoverflow.com/a/65749012
        thread_safe_context = context.thread_safe()
        parallel_call = Process

    if self.use_savers:
        _ = thread_safe_self.save(thread_safe_context, full_dump=True)
        # Cannot delete queue worker self.
        del thread_safe_self.wrapped
        # del thread_safe_self.queue

    self.workers = []
    for _, worker_arguments in zip(range(self.n_workers), self.additional_worker_arguments):
        p = parallel_call(
            target=worker_function,
            args=(thread_safe_self, thread_safe_context, self.use_savers, worker_arguments)
        )
        p.daemon = True
        p.start()
        self.workers.append(p)
def refit(self, p: BaseStep, data_container: DataContainer, context: ExecutionContext) -> BaseStep:
    """
    Refit the pipeline on the whole dataset (without any validation technique).

    :param p: pipeline to refit
    :param data_container: data container
    :param context: execution context
    :return: fitted pipeline
    """
    context.set_execution_phase(ExecutionPhase.TRAIN)
    for i in range(self.epochs):
        p = p.handle_fit(data_container, context)

    return p
def handle_inverse_transform(self, data_container: DataContainer, context: ExecutionContext) -> DataContainer:
    """
    Handle inverse transform by passing the expected outputs to the wrapped step's
    inverse transform method. Update the expected outputs with the outputs.

    :param data_container: data container to inverse transform
    :param context: execution context
    :return: data container
    :rtype: DataContainer
    """
    new_expected_outputs_data_container = self.wrapped.handle_inverse_transform(
        DataContainer(
            current_ids=data_container.current_ids,
            data_inputs=data_container.expected_outputs,
            expected_outputs=None
        ),
        context.push(self.wrapped)
    )
    data_container.set_expected_outputs(new_expected_outputs_data_container.data_inputs)

    current_ids = self.hash(data_container)
    data_container.set_current_ids(current_ids)

    return data_container
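# A hedged sketch of the expected-outputs swap performed above, with a
# hypothetical MiniDataContainer instead of neuraxle.data_container: the
# expected outputs are routed through the wrapped step as if they were data
# inputs, and the result is written back as the new expected outputs.
from dataclasses import dataclass
from typing import Any, List, Optional


@dataclass
class MiniDataContainer:
    data_inputs: Optional[List[Any]]
    expected_outputs: Optional[List[Any]]


def inverse_transform_expected_outputs(container, wrapped_inverse_transform):
    inner = MiniDataContainer(
        data_inputs=container.expected_outputs, expected_outputs=None)
    container.expected_outputs = wrapped_inverse_transform(inner.data_inputs)
    return container


container = MiniDataContainer(data_inputs=[1, 2], expected_outputs=[10, 20])
container = inverse_transform_expected_outputs(
    container, lambda outputs: [value / 10 for value in outputs])
assert container.expected_outputs == [1.0, 2.0]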
def worker_function(queue_worker: QueueWorker, context: ExecutionContext,
                    use_savers: bool, additional_worker_arguments):
    """
    Worker function that transforms the items inside the queue of items to process.

    :param queue_worker: step to transform
    :param context: execution context
    :param use_savers: use savers
    :param additional_worker_arguments: any additional arguments that need to be passed to the workers
    :return:
    """
    step = queue_worker.get_step()
    if use_savers:
        saved_queue_worker: QueueWorker = context.load(queue_worker.get_name())
        step = saved_queue_worker.get_step()

    # Group the flat argument list into (name, value) pairs.
    additional_worker_arguments = tuple(
        additional_worker_arguments[i:i + 2]
        for i in range(0, len(additional_worker_arguments), 2)
    )

    for argument_name, argument_value in additional_worker_arguments:
        step.__dict__.update({argument_name: argument_value})

    while True:
        task: QueuedPipelineTask = queue_worker.get()
        summary_id = task.data_container.summary_id
        data_container = step.handle_transform(task.data_container, context)
        data_container = data_container.set_summary_id(summary_id)
        queue_worker.notify(QueuedPipelineTask(
            step_name=queue_worker.name, data_container=data_container))
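# A minimal consumer-loop sketch of the worker pattern above, using only the
# standard library. A None sentinel replaces the infinite `while True` loop so
# the example terminates; queue_worker.get() / queue_worker.notify() map onto
# in_queue.get() / out_queue.put() here.
import queue
import threading


def worker_loop(in_queue: queue.Queue, out_queue: queue.Queue, transform):
    while True:
        task = in_queue.get()
        if task is None:  # shutdown sentinel
            break
        out_queue.put(transform(task))


in_q, out_q = queue.Queue(), queue.Queue()
worker = threading.Thread(
    target=worker_loop, args=(in_q, out_q, lambda x: x * 2), daemon=True)
worker.start()
for item in range(3):
    in_q.put(item)
in_q.put(None)
worker.join()
print([out_q.get() for _ in range(3)])  # [0, 2, 4]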
def join_fit_transform(self, step: Pipeline, data_container: DataContainer,
                       context: ExecutionContext) -> Tuple['Any', DataContainer]:
    """
    Concatenate the pipeline fit transform output of each batch of self.batch_size together.

    :param step: pipeline to fit transform on
    :type step: Pipeline
    :param data_container: data container to fit transform on
    :type data_container: DataContainer
    :param context: execution context
    :return: (fitted step, transformed data container)
    :rtype: Tuple[Any, DataContainer]
    """
    context = context.push(step)

    data_container_batches = data_container.convolved_1d(
        stride=self.batch_size, kernel_size=self.batch_size)

    output_data_container = ListDataContainer.empty()
    for data_container_batch in data_container_batches:
        step, data_container_batch = step._fit_transform_data_container(
            data_container_batch, context)
        output_data_container.concat(data_container_batch)

    return step, output_data_container