def create_pipeline(tmpdir, pickle_checkpoint_step, tape, hyperparameters=None, different=False, save_pipeline=True):
    """
    Build the four-step resumable pipeline ('a', 'pickle_checkpoint', 'c', 'd').

    :param tmpdir: folder given to the pipeline as its ``cache_folder``
    :param pickle_checkpoint_step: step registered under the name 'pickle_checkpoint'
    :param tape: object whose ``callback`` attribute is passed to every callback step
    :param hyperparameters: hyperparams passed to the first step ('a') only
    :param different: when truthy, step 'a' is a DifferentCallbackStep instead of a TransformCallbackStep
    :param save_pipeline: accepted for call compatibility; not read by this function
    :return: the newly built ResumablePipeline
    """
    # Only the class of the first step depends on ``different``; everything else is shared.
    first_step_class = DifferentCallbackStep if different else TransformCallbackStep

    named_steps = [
        ('a', first_step_class(tape.callback, ["1"], hyperparams=hyperparameters)),
        ('pickle_checkpoint', pickle_checkpoint_step),
        ('c', TransformCallbackStep(tape.callback, ["2"])),
        ('d', TransformCallbackStep(tape.callback, ["3"])),
    ]
    return ResumablePipeline(steps=named_steps, cache_folder=tmpdir)
def test_resumable_pipeline_fit_transform_should_save_all_fitted_pipeline_steps(tmpdir: LocalPath):
    """
    Fit-transform a nested resumable pipeline and check which step paths exist afterwards.

    The root, the nested pipeline, the first two steps and the checkpoint are expected
    to have a path on disk; the step placed after the checkpoint is expected not to.
    """
    first_step = MultiplyByN(multiply_by=2)
    nested_pipeline = ResumablePipeline([
        (SOME_STEP_2, MultiplyByN(multiply_by=4)),
        (CHECKPOINT, DefaultCheckpoint()),
        (SOME_STEP_3, MultiplyByN(multiply_by=6)),
    ])
    p = ResumablePipeline(
        [(SOME_STEP_1, first_step), (PIPELINE_2, nested_pipeline)],
        cache_folder=tmpdir)
    p.name = ROOT

    p, outputs = p.fit_transform(np.array(range(10)), np.array(range(10)))

    not_saved_paths = [create_some_step3_path(tmpdir)]
    saved_paths = [
        create_root_path(tmpdir),
        create_pipeline2_path(tmpdir),
        create_some_step1_path(tmpdir),
        create_some_step2_path(tmpdir),
        create_some_checkpoint_path(tmpdir),
    ]

    assert np.array_equal(outputs, EXPECTED_OUTPUTS)
    # A dedicated loop name keeps ``p`` bound to the fitted pipeline.
    for path in saved_paths:
        assert os.path.exists(path)
    for path in not_saved_paths:
        assert not os.path.exists(path)
def test_should_transform_each_steps(test_case: ResumablePipelineTestCase, tmpdir):
    """
    Transform the test case's data through a resumable pipeline built from its steps.

    Checks that the names recorded on the tape equal the expected tape, and that the
    transformed data equals the original data inputs.
    """
    transformed = ResumablePipeline(
        steps=test_case.steps,
        cache_folder=tmpdir
    ).transform(test_case.data_inputs)

    recorded_names = test_case.tape.get_name_tape()

    assert recorded_names == test_case.expected_tape
    assert np.array_equal(transformed, test_case.data_inputs)
def test_should_fit_each_steps(test_case: ResumablePipelineTestCase, tmpdir):
    """
    Fit a resumable pipeline built from the test case's steps.

    Checks that ``fit`` returns a Pipeline and that the recorded tape equals the
    expected tape without its last entry.
    """
    fitted = ResumablePipeline(
        steps=test_case.steps,
        cache_folder=tmpdir
    ).fit(test_case.data_inputs, test_case.expected_outputs)

    recorded_names = test_case.tape.get_name_tape()

    assert isinstance(fitted, Pipeline)
    # NOTE(review): the last expected entry is dropped here, presumably because fitting
    # does not call the final step's transform callback -- confirm against the pipeline.
    assert recorded_names == test_case.expected_tape[:-1]
def test_pickle_checkpoint_step_should_load_data_container(tmpdir: LocalPath):
    """
    Fit-transform one pipeline, then run ``handle_transform`` on a second, freshly built one.

    Both pipelines share the same cache folder. The data container returned by the
    second pipeline is expected to hold [4, 8] as data inputs and [8, 12] as expected outputs.
    """
    initial_data_inputs = [1, 2]
    initial_expected_outputs = [2, 3]

    def build_pipeline():
        # A fresh pipeline on every call, always pointing at the same cache folder.
        return ResumablePipeline([
            ('output_transformer_1', MultiplyBy2OutputTransformer()),
            ('pickle_checkpoint', DefaultCheckpoint()),
            ('output_transformer_2', MultiplyBy2OutputTransformer()),
        ], cache_folder=tmpdir)

    build_pipeline().fit_transform(
        data_inputs=initial_data_inputs,
        expected_outputs=initial_expected_outputs
    )

    transformer = build_pipeline()
    data_container = DataContainer(
        current_ids=[0, 1],
        data_inputs=initial_data_inputs,
        expected_outputs=initial_expected_outputs
    )
    context = ExecutionContext.create_from_root(transformer, ExecutionMode.TRANSFORM, tmpdir)
    actual_data_container = transformer.handle_transform(data_container, context)

    assert np.array_equal(actual_data_container.data_inputs, [4, 8])
    assert np.array_equal(actual_data_container.expected_outputs, [8, 12])
def create_pipeline_output_transformer(tmpdir):
    """
    Build a resumable pipeline with a checkpoint between two MultiplyBy2OutputTransformer steps.

    :param tmpdir: folder given to the pipeline as its ``cache_folder``
    :return: the newly built ResumablePipeline
    """
    named_steps = [
        ('output_transformer_1', MultiplyBy2OutputTransformer()),
        ('joblib_checkpoint', DefaultCheckpoint()),
        ('output_transformer_2', MultiplyBy2OutputTransformer()),
    ]
    return ResumablePipeline(named_steps, cache_folder=tmpdir)
def create_checkpoint_test_case(tmpdir):
    """
    Build a CheckpointTest: two callback steps separated by a checkpoint, each with its own tapes.

    :param tmpdir: folder given to the pipeline as its ``cache_folder``
    :return: CheckpointTest holding the four tapes followed by the pipeline
    """
    # Tapes are created in this order: transform 1, fit 1, transform 2, fit 2.
    tape_transform_1, tape_fit_1 = TapeCallbackFunction(), TapeCallbackFunction()
    tape_transform_2, tape_fit_2 = TapeCallbackFunction(), TapeCallbackFunction()

    named_steps = [
        ('step1', FitTransformCallbackStep(tape_transform_1, tape_fit_1)),
        ('checkpoint', DefaultCheckpoint()),
        ('step2', FitTransformCallbackStep(tape_transform_2, tape_fit_2)),
    ]
    pipeline = ResumablePipeline(named_steps, cache_folder=tmpdir)

    return CheckpointTest(tape_transform_1, tape_fit_1, tape_transform_2, tape_fit_2, pipeline)
def main(tmpdir, sleep_time: float = 0.001, n_iter: int = 10):
    """
    Run the same AutoML search on a classic Pipeline and on a ResumablePipeline with checkpoints.

    For each pipeline, the elapsed time, the predictions, the mean squared error and the
    pipeline's hyperparams are printed, and the score is asserted to be a float.

    :param tmpdir: base folder; 'classic' and 'resumable' sub-folders are used as cache folders
    :param sleep_time: value passed to every Sleep step
    :param n_iter: number of trials given to AutoML
    """
    inputs = np.array(range(100))
    targets = np.array(range(100, 200))

    space = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 2),
        'multiplication_2__multiply_by': RandInt(1, 2),
        'multiplication_3__multiply_by': RandInt(1, 2),
    })

    def search_and_predict(candidate, cache_folder):
        # Time covers AutoML construction, fitting and the best model's prediction.
        started_at = time.time()
        search = AutoML(
            candidate,
            refit_trial=True,
            n_trials=n_iter,
            cache_folder_when_no_handle=cache_folder,
            validation_splitter=ValidationSplitter(0.2),
            hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
            scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
            callbacks=[
                MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False)
            ]
        )
        search = search.fit(inputs, targets)
        predictions = search.get_best_model().predict(inputs)
        finished_at = time.time()
        return predictions, finished_at - started_at

    def report(candidate, predictions, elapsed):
        score = mean_squared_error(targets, predictions)
        print('{0} seconds'.format(elapsed))
        print('output: {0}'.format(predictions))
        print('smallest mse: {0}'.format(score))
        print('best hyperparams: {0}'.format(candidate.get_hyperparams()))
        assert isinstance(score, float)

    print('Classic Pipeline:')
    classic_pipeline_folder = os.path.join(str(tmpdir), 'classic')
    classic_pipeline = Pipeline([
        ('multiplication_1', MultiplyByN()),
        ('sleep_1', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_2', MultiplyByN()),
        ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_3', MultiplyByN()),
    ], cache_folder=classic_pipeline_folder).set_hyperparams_space(space)

    classic_outputs, classic_elapsed = search_and_predict(classic_pipeline, classic_pipeline_folder)
    report(classic_pipeline, classic_outputs, classic_elapsed)

    print('Resumable Pipeline:')
    resumable_pipeline_folder = os.path.join(str(tmpdir), 'resumable')
    resumable_pipeline = ResumablePipeline([
        ('multiplication_1', MultiplyByN()),
        ('ForEach(sleep_1)', ForEachDataInput(Sleep(sleep_time))),
        ('checkpoint1', ExpandDim(DefaultCheckpoint())),
        ('multiplication_2', MultiplyByN()),
        ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
        ('checkpoint2', ExpandDim(DefaultCheckpoint())),
        ('multiplication_3', MultiplyByN()),
    ], cache_folder=resumable_pipeline_folder).set_hyperparams_space(space)

    resumable_outputs, resumable_elapsed = search_and_predict(resumable_pipeline, resumable_pipeline_folder)
    # The cache is flushed once timing is done, before the score is computed and reported.
    resumable_pipeline.flush_all_cache()
    report(resumable_pipeline, resumable_outputs, resumable_elapsed)
def main(tmpdir, sleep_time: float = 0, n_iter: int = 10):
    """
    Run the same RandomSearch on a classic Pipeline and on a ResumablePipeline with checkpoints.

    For each pipeline, the elapsed time, the transformed outputs, the mean squared error and
    the pipeline's hyperparams are printed, and the score is asserted to be a float.

    :param tmpdir: cache folder of the resumable pipeline
    :param sleep_time: value passed to every Sleep step
    :param n_iter: number of iterations given to RandomSearch
    """
    inputs = np.array(range(100))
    targets = np.array(range(100, 200))

    space = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 2),
        'multiplication_2__multiply_by': RandInt(1, 2),
        'multiplication_3__multiply_by': RandInt(1, 2),
    })

    def search_then_transform(candidate):
        # Time covers the random search fit and the transform of the returned model.
        started_at = time.time()
        best_model = RandomSearch(candidate, n_iter=n_iter, higher_score_is_better=True).fit(inputs, targets)
        predictions = best_model.transform(inputs)
        finished_at = time.time()
        return predictions, finished_at - started_at

    def report(candidate, predictions, elapsed):
        score = mean_squared_error(targets, predictions)
        print('{0} seconds'.format(elapsed))
        print('output: {0}'.format(predictions))
        print('smallest mse: {0}'.format(score))
        print('best hyperparams: {0}'.format(candidate.get_hyperparams()))
        assert isinstance(score, float)

    print('Classic Pipeline:')
    classic_pipeline = Pipeline([
        ('multiplication_1', MultiplyByN()),
        ('sleep_1', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_2', MultiplyByN()),
        ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_3', MultiplyByN()),
    ]).set_hyperparams_space(space)

    classic_outputs, classic_elapsed = search_then_transform(classic_pipeline)
    report(classic_pipeline, classic_outputs, classic_elapsed)

    print('Resumable Pipeline:')
    resumable_pipeline = ResumablePipeline([
        ('multiplication_1', MultiplyByN()),
        ('ForEach(sleep_1)', ForEachDataInput(Sleep(sleep_time))),
        ('checkpoint1', ExpandDim(DefaultCheckpoint())),
        ('multiplication_2', MultiplyByN()),
        ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
        ('checkpoint2', ExpandDim(DefaultCheckpoint())),
        ('multiplication_3', MultiplyByN()),
    ], cache_folder=tmpdir).set_hyperparams_space(space)

    resumable_outputs, resumable_elapsed = search_then_transform(resumable_pipeline)
    # The cache is flushed once timing is done, before the score is computed and reported.
    resumable_pipeline.flush_all_cache()
    report(resumable_pipeline, resumable_outputs, resumable_elapsed)
def create_test_cases():
    """
    Build the list of ResumablePipelineTestCase scenarios.

    Each scenario gets its own pair of tapes, a list of steps with a checkpoint step at a
    given position (or none), and the tape expected to be recorded. Checkpoint steps are
    SomeCheckpointStep instances, built either with a data container or with ``None``.

    :return: list of ResumablePipelineTestCase
    """
    data_inputs = np.ones((1, 1))
    expected_outputs = np.ones((1, 1))
    dc = DataContainer(
        data_inputs=data_inputs,
        current_ids=range(len(data_inputs)),
        expected_outputs=expected_outputs
    )

    def new_tapes():
        # Transform tape first, fit tape second.
        return TapeCallbackFunction(), TapeCallbackFunction()

    def callback_step(transform_tape, fit_tape, label):
        return FitTransformCallbackStep(transform_tape.callback, fit_tape.callback, [label])

    def checkpoint_step(container):
        return SomeCheckpointStep(data_container=container)

    # No checkpoint at all.
    tape, tape_fit = new_tapes()
    without_checkpoint = ResumablePipelineTestCase(
        tape, data_inputs, expected_outputs,
        [
            ("a", callback_step(tape, tape_fit, "1")),
            ("b", callback_step(tape, tape_fit, "2")),
            ("c", callback_step(tape, tape_fit, "3")),
        ],
        ["1", "2", "3"])

    # Checkpoint step built without a data container.
    tape2, tape2_fit = new_tapes()
    checkpoint_not_saved = ResumablePipelineTestCase(
        tape2, data_inputs, expected_outputs,
        [
            ("a", callback_step(tape2, tape2_fit, "1")),
            ("b", checkpoint_step(None)),
            ("c", callback_step(tape2, tape2_fit, "2")),
            ("d", callback_step(tape2, tape2_fit, "3")),
        ],
        ["1", "2", "3"])

    # Checkpoint with a data container right after the first step.
    tape3, tape3_fit = new_tapes()
    saved_after_first_step = ResumablePipelineTestCase(
        tape3, data_inputs, expected_outputs,
        [
            ("a", callback_step(tape3, tape3_fit, "1")),
            ("b", checkpoint_step(dc)),
            ("c", callback_step(tape3, tape3_fit, "2")),
            ("d", callback_step(tape3, tape3_fit, "3")),
        ],
        ["2", "3"])

    # Checkpoint with a data container after the second step.
    tape4, tape4_fit = new_tapes()
    saved_after_second_step = ResumablePipelineTestCase(
        tape4, data_inputs, expected_outputs,
        [
            ("a", callback_step(tape4, tape4_fit, "1")),
            ("b", callback_step(tape4, tape4_fit, "2")),
            ("c", checkpoint_step(dc)),
            ("d", callback_step(tape4, tape4_fit, "3")),
        ],
        ["3"])

    # Checkpoint with a data container as the very last step.
    tape5, tape5_fit = new_tapes()
    saved_after_last_step = ResumablePipelineTestCase(
        tape5, data_inputs, expected_outputs,
        [
            ("a", callback_step(tape5, tape5_fit, "1")),
            ("b", callback_step(tape5, tape5_fit, "2")),
            ("c", callback_step(tape5, tape5_fit, "3")),
            ("d", checkpoint_step(dc)),
        ],
        [])

    # Checkpoint as the last step of a nested pipeline.
    tape6, tape6_fit = new_tapes()
    saved_inside_subpipeline_last_step = ResumablePipelineTestCase(
        tape6, data_inputs, expected_outputs,
        [
            ("a", callback_step(tape6, tape6_fit, "1")),
            ResumablePipeline([
                ("b", callback_step(tape6, tape6_fit, "2")),
                ("d", checkpoint_step(dc)),
            ]),
            ("e", callback_step(tape6, tape6_fit, "3")),
            ("f", callback_step(tape6, tape6_fit, "4")),
        ],
        ["3", "4"])

    # Checkpoint as the first step of a nested pipeline.
    tape7, tape7_fit = new_tapes()
    saved_inside_subpipeline_first_step = ResumablePipelineTestCase(
        tape7, data_inputs, expected_outputs,
        [
            ("a", callback_step(tape7, tape7_fit, "1")),
            ResumablePipeline([
                ("d", checkpoint_step(dc)),
                ("b", callback_step(tape7, tape7_fit, "2")),
            ]),
            ("e", callback_step(tape7, tape7_fit, "3")),
            ("f", callback_step(tape7, tape7_fit, "4")),
        ],
        ["2", "3", "4"])

    # Checkpoint in the middle of a nested pipeline.
    tape8, tape8_fit = new_tapes()
    saved_inside_subpipeline_middle_step = ResumablePipelineTestCase(
        tape8, data_inputs, expected_outputs,
        [
            ("a", callback_step(tape8, tape8_fit, "1")),
            ResumablePipeline([
                ("b", callback_step(tape8, tape8_fit, "2")),
                ("d", checkpoint_step(dc)),
                ("e", callback_step(tape8, tape8_fit, "3")),
            ]),
            ("f", callback_step(tape8, tape8_fit, "4")),
        ],
        ["3", "4"])

    # Checkpoint inside a pipeline nested two levels deep.
    tape9, tape9_fit = new_tapes()
    saved_inside_subpipeline_of_subpipeline = ResumablePipelineTestCase(
        tape9, data_inputs, expected_outputs,
        [
            ("a", callback_step(tape9, tape9_fit, "1")),
            ResumablePipeline([
                ("b", callback_step(tape9, tape9_fit, "2")),
                ResumablePipeline([
                    ("e", callback_step(tape9, tape9_fit, "3")),
                    ("d", checkpoint_step(dc)),
                    ("f", callback_step(tape9, tape9_fit, "4")),
                ]),
                ("g", callback_step(tape9, tape9_fit, "5")),
            ]),
            ("h", callback_step(tape9, tape9_fit, "6")),
        ],
        ["4", "5", "6"])

    # Two checkpoints separated by a step; both checkpoint steps carry the name "b".
    tape10, tape10_fit = new_tapes()
    saved_after_another_saved_checkpoint = ResumablePipelineTestCase(
        tape10, data_inputs, expected_outputs,
        [
            ("a", callback_step(tape10, tape10_fit, "1")),
            ("b", checkpoint_step(dc)),
            ("c", callback_step(tape10, tape10_fit, "2")),
            ("b", checkpoint_step(dc)),
            ("d", callback_step(tape10, tape10_fit, "3")),
        ],
        ["3"])

    # Two checkpoints directly one after the other.
    tape11, tape11_fit = new_tapes()
    multiple_checkpoint_in_a_row = ResumablePipelineTestCase(
        tape11, data_inputs, expected_outputs,
        [
            ("a", callback_step(tape11, tape11_fit, "1")),
            ("joblib_1", checkpoint_step(dc)),
            ("joblib_2", checkpoint_step(dc)),
            ("c", callback_step(tape11, tape11_fit, "2")),
            ("d", callback_step(tape11, tape11_fit, "3")),
        ],
        ["2", "3"])

    # In the returned list, the "first step" sub-pipeline case comes before the "last step" one.
    return [
        without_checkpoint,
        checkpoint_not_saved,
        saved_after_first_step,
        saved_after_second_step,
        saved_after_last_step,
        saved_inside_subpipeline_first_step,
        saved_inside_subpipeline_last_step,
        saved_inside_subpipeline_middle_step,
        saved_inside_subpipeline_of_subpipeline,
        saved_after_another_saved_checkpoint,
        multiple_checkpoint_in_a_row,
    ]
def given_saved_pipeline(tmpdir: LocalPath):
    """
    Dump a root pipeline, a nested pipeline, three steps and a checkpoint under ``tmpdir``,
    then return a fresh, identically named pipeline whose steps all use ``multiply_by=1``.

    The dumped steps use multiply_by 2, 4 and 6, so they differ from the returned pipeline's steps.

    :param tmpdir: cache folder of every pipeline built here, and base of the dump paths
    :return: the fresh ResumablePipeline, named ROOT
    """
    root_savers = [(SOME_STEP_1, []), (PIPELINE_2, [TruncableJoblibStepSaver()])]
    root_path = create_root_path(tmpdir, True)

    saved_root = ResumablePipeline([], cache_folder=tmpdir)
    saved_root.sub_steps_savers = root_savers
    saved_root.name = ROOT
    dump(saved_root, root_path)

    saved_pipeline_2 = ResumablePipeline([], cache_folder=tmpdir)
    # NOTE(review): literal name rather than the PIPELINE_2 constant -- confirm they match.
    saved_pipeline_2.name = 'pipeline2'
    saved_pipeline_2.sub_steps_savers = [
        (SOME_STEP_2, []),
        (CHECKPOINT, []),
        (SOME_STEP_3, []),
    ]
    dump(saved_pipeline_2, create_pipeline2_path(tmpdir, True))

    # Each path is created right before its step is saved, in step order.
    steps_to_save = (
        (2, SOME_STEP_1, create_some_step1_path),
        (4, SOME_STEP_2, create_some_step2_path),
        (6, SOME_STEP_3, create_some_step3_path),
    )
    for factor, step_name, make_path in steps_to_save:
        given_saved_some_step(multiply_by=factor, name=step_name, path=make_path(tmpdir, True))

    saved_checkpoint = DefaultCheckpoint()
    saved_checkpoint.name = CHECKPOINT
    dump(saved_checkpoint, create_some_checkpoint_path(tmpdir, True))

    first_step = MultiplyByN(multiply_by=1)
    nested_pipeline = ResumablePipeline([
        (SOME_STEP_2, MultiplyByN(multiply_by=1)),
        (CHECKPOINT, DefaultCheckpoint()),
        (SOME_STEP_3, MultiplyByN(multiply_by=1)),
    ])
    fresh_pipeline = ResumablePipeline(
        [(SOME_STEP_1, first_step), (PIPELINE_2, nested_pipeline)],
        cache_folder=tmpdir)
    fresh_pipeline.name = ROOT

    return fresh_pipeline