Example #1
def main():
    """
    Process the data in batches of 10, using 8 queued workers per step with a max queue size of 10.
    Each task does the following: for each data input, sleep 0.02 seconds, then multiply by 2.
    """
    sleep_time = 0.02
    p = SequentialQueuedPipeline([
        Pipeline([ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]),
    ], n_workers_per_step=8, max_queue_size=10, batch_size=10)

    a = time.time()
    outputs_streaming = p.transform(list(range(100)))
    b = time.time()
    time_queued_pipeline = b - a
    print('SequentialQueuedPipeline')
    print('execution time: {} seconds'.format(time_queued_pipeline))

    """
    Process data inputs sequentially. 
    For each data input, sleep 0.02 seconds, and then multiply by 2.
    """
    p = Pipeline([
        Pipeline([ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]),
    ])

    a = time.time()
    outputs_vanilla = p.transform(list(range(100)))
    b = time.time()
    time_vanilla_pipeline = b - a

    print('VanillaPipeline')
    print('execution time: {} seconds'.format(time_vanilla_pipeline))

    assert time_queued_pipeline < time_vanilla_pipeline
    assert np.array_equal(outputs_streaming, outputs_vanilla)
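These snippets omit their import headers. A plausible set of imports for the parallelism examples is sketched below; the Neuraxle module paths are assumptions based on common layouts of the library and may differ between releases, so check your installed version.

import time

import numpy as np

# Assumed module paths; verify against your Neuraxle version.
from neuraxle.pipeline import Pipeline, MiniBatchSequentialPipeline
from neuraxle.distributed.streaming import SequentialQueuedPipeline
from neuraxle.steps.loop import ForEach
from neuraxle.steps.misc import Sleep
from neuraxle.steps.numpy import MultiplyByN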
Example #2
def main():
    """
    The task is to sleep 0.02 seconds for each data input and then multiply by 2.
    """
    sleep_time = 0.02
    preprocessing_and_model_steps = [ForEach(Sleep(sleep_time=sleep_time)), MultiplyByN(2)]

    # Classical pipeline - all at once with one big batch:
    p = Pipeline(preprocessing_and_model_steps)
    time_vanilla_pipeline, output_classical = eval_run_time(p)
    print(f"Classical 'Pipeline' execution time: {time_vanilla_pipeline} seconds.")

    # Classical minibatch pipeline - minibatch size 10:
    p = MiniBatchSequentialPipeline(preprocessing_and_model_steps,
                                    batch_size=10)
    time_minibatch_pipeline, output_minibatch = eval_run_time(p)
    print(f"Minibatched 'MiniBatchSequentialPipeline' execution time: {time_minibatch_pipeline} seconds.")

    # Parallel pipeline - minibatch size 10 with 16 parallel workers per step that
    # have a max queue size of 10 batches between preprocessing and the model:
    p = SequentialQueuedPipeline(preprocessing_and_model_steps,
                                 n_workers_per_step=16, max_queue_size=10, batch_size=10)
    time_parallel_pipeline, output_parallel = eval_run_time(p)
    print(f"Parallel 'SequentialQueuedPipeline' execution time: {time_parallel_pipeline} seconds.")

    assert time_parallel_pipeline < time_minibatch_pipeline, str((time_parallel_pipeline, time_minibatch_pipeline))
    assert np.array_equal(output_classical, output_minibatch)
    assert np.array_equal(output_classical, output_parallel)
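The helper `eval_run_time` is defined elsewhere in the example's source file and is not shown in this listing. A minimal sketch of what it presumably does, timing a single transform over the same 100 inputs used in Example #1, is given below; the function body and default data are assumptions, not the original helper.

def eval_run_time(pipeline, data_inputs=None):
    # Hypothetical reimplementation: time one transform() call and
    # return (elapsed seconds, outputs). The real helper may differ.
    import time
    if data_inputs is None:
        data_inputs = list(range(100))
    start = time.time()
    outputs = pipeline.transform(data_inputs)
    return time.time() - start, outputs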
Example #3
    def __init__(self, columns_selection, n_dimension=3):
        assert n_dimension >= 2
        col_selector: ColumnSelector2D = ColumnSelector2D(columns_selection=columns_selection)
        # Wrap the 2D selector with ForEach once per dimension beyond the second.
        for _ in range(max(0, n_dimension - 2)):
            col_selector = ForEach(col_selector)

        MetaStep.__init__(self, col_selector)
        self.n_dimension = n_dimension
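The constructor wraps the 2D column selector with `ForEach` once per dimension beyond the second, so the same selection is applied across every slice of the extra axes. For `n_dimension=4`, the wrapped step is equivalent to the hand-built nesting below; this is a conceptual sketch and the column indices are placeholders.

# n_dimension=4: two extra dimensions beyond 2D, so two ForEach wrappers.
col_selector = ColumnSelector2D(columns_selection=[0, 1])
col_selector = ForEach(col_selector)   # iterate over the 3rd dimension
col_selector = ForEach(col_selector)   # iterate over the 4th dimension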
Example #4
def main():
    value_caching_folder = 'value_caching'
    if not os.path.exists(value_caching_folder):
        os.makedirs(value_caching_folder)

    data_inputs = list(range(100))

    sleep_time = 0.001
    a = time.time()
    for i in range(5):
        p = Pipeline([
            PickleValueCachingWrapper(
                ForEach(Pipeline([Sleep(sleep_time=sleep_time), MultiplyByN(2)])),
                cache_folder=value_caching_folder
            )
        ])
        outputs_value_caching = p.transform(data_inputs)
    b = time.time()
    time_value_caching_pipeline = b - a
    print('Pipeline with ValueCachingWrapper')
    print('execution time: {} seconds'.format(time_value_caching_pipeline))

    a = time.time()
    for i in range(5):
        p = Pipeline([
            ForEach(Pipeline([Sleep(sleep_time=sleep_time),
                              MultiplyByN(2)])),
        ])

        outputs_vanilla = p.transform(data_inputs)
    b = time.time()
    time_vanilla_pipeline = b - a
    print('Pipeline without value caching')
    print('execution time: {} seconds'.format(time_vanilla_pipeline))

    shutil.rmtree(value_caching_folder)

    assert np.array_equal(outputs_value_caching, outputs_vanilla)
    assert time_value_caching_pipeline < time_vanilla_pipeline
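The speedup comes from memoization: the wrapper stores each computed output keyed by its input value (pickled under `value_caching_folder`), so when the same values come back in later iterations of the loop, the sleep-and-multiply work is skipped. Stripped of the pipeline machinery, the idea is roughly the in-memory sketch below, which is not the Neuraxle implementation.

import time

sleep_time = 0.001
cache = {}

def cached_transform(x):
    # Pay the sleep cost only the first time a given value is seen.
    if x not in cache:
        time.sleep(sleep_time)
        cache[x] = x * 2
    return cache[x]

# Five passes over the same 100 inputs: only the first pass actually sleeps.
for _ in range(5):
    outputs = [cached_transform(x) for x in range(100)]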
Example #5
def test_transform_should_transform_all_steps_for_each_data_inputs_expected_outputs(
):
    tape = TapeCallbackFunction()
    p = Pipeline([
        ForEach(
            Pipeline([
                TransformCallbackStep(tape.callback, ["1"]),
                TransformCallbackStep(tape.callback, ["2"]),
            ]))
    ])
    data_inputs = [[0, 1], [1, 2]]

    outputs = p.transform(data_inputs)

    assert tape.get_name_tape() == ["1", "2", "1", "2"]
Example #6
def test_parallel_queued_parallelize_correctly(tmpdir, use_processes,
                                               use_savers):
    sleep_time = 0.01
    p = SequentialQueuedPipeline(
        [('1', 2, 10,
          Pipeline([ForEach(Sleep(sleep_time=sleep_time)),
                    MultiplyByN(2)])),
         ('2', 2, 10,
          Pipeline([ForEach(Sleep(sleep_time=sleep_time)),
                    MultiplyByN(2)])),
         ('3', 2, 10,
          Pipeline([ForEach(Sleep(sleep_time=sleep_time)),
                    MultiplyByN(2)])),
         ('4', 2, 10,
          Pipeline([ForEach(Sleep(sleep_time=sleep_time)),
                    MultiplyByN(2)]))],
        batch_size=10,
        use_processes=use_processes,
        use_savers=use_savers).with_context(ExecutionContext(tmpdir))

    a = time.time()
    outputs_streaming = p.transform(list(range(100)))
    b = time.time()
    time_queued_pipeline = b - a

    p = Pipeline([
        Pipeline([ForEach(Sleep(sleep_time=sleep_time)),
                  MultiplyByN(2)]),
        Pipeline([ForEach(Sleep(sleep_time=sleep_time)),
                  MultiplyByN(2)]),
        Pipeline([ForEach(Sleep(sleep_time=sleep_time)),
                  MultiplyByN(2)]),
        Pipeline([ForEach(Sleep(sleep_time=sleep_time)),
                  MultiplyByN(2)])
    ])

    a = time.time()
    outputs_vanilla = p.transform(list(range(100)))
    b = time.time()
    time_vanilla_pipeline = b - a

    assert time_queued_pipeline < time_vanilla_pipeline
    assert np.array_equal(outputs_streaming, outputs_vanilla)
Example #7
def test_fit_for_each_should_fit_all_steps_for_each_data_inputs_expected_outputs(
):
    tape = TapeCallbackFunction()
    p = Pipeline([
        ForEach(
            Pipeline([
                FitCallbackStep(tape.callback, ["1"]),
                FitCallbackStep(tape.callback, ["2"]),
            ]))
    ])
    data_inputs = [[0, 1], [1, 2]]
    expected_outputs = [[2, 3], [4, 5]]

    p = p.fit(data_inputs, expected_outputs)

    assert isinstance(p, Pipeline)
    assert tape.get_name_tape() == ["1", "2", "1", "2"]
    assert tape.data == [([0, 1], [2, 3]), ([0, 1], [2, 3]), ([1, 2], [4, 5]),
                         ([1, 2], [4, 5])]
Example #8
def test_parallel_queued_parallelize_correctly():
    sleep_time = 0.001
    p = SequentialQueuedPipeline(
        [('1', 4, 10,
          Pipeline([ForEach(Sleep(sleep_time=sleep_time)),
                    MultiplyByN(2)])),
         ('2', 4, 10,
          Pipeline([ForEach(Sleep(sleep_time=sleep_time)),
                    MultiplyByN(2)])),
         ('3', 4, 10,
          Pipeline([ForEach(Sleep(sleep_time=sleep_time)),
                    MultiplyByN(2)])),
         ('4', 4, 10,
          Pipeline([ForEach(Sleep(sleep_time=sleep_time)),
                    MultiplyByN(2)]))],
        batch_size=10)

    a = time.time()
    outputs_streaming = p.transform(list(range(100)))
    b = time.time()
    time_queued_pipeline = b - a

    p = Pipeline([
        Pipeline([ForEach(Sleep(sleep_time=sleep_time)),
                  MultiplyByN(2)]),
        Pipeline([ForEach(Sleep(sleep_time=sleep_time)),
                  MultiplyByN(2)]),
        Pipeline([ForEach(Sleep(sleep_time=sleep_time)),
                  MultiplyByN(2)]),
        Pipeline([ForEach(Sleep(sleep_time=sleep_time)),
                  MultiplyByN(2)])
    ])

    a = time.time()
    outputs_vanilla = p.transform(list(range(100)))
    b = time.time()
    time_vanilla_pipeline = b - a

    assert time_queued_pipeline < time_vanilla_pipeline
    assert np.array_equal(outputs_streaming, outputs_vanilla)
Example #9
def test_fit_transform_should_fit_transform_all_steps_for_each_data_inputs_expected_outputs(
):
    tape = TapeCallbackFunction()
    tape_fit = TapeCallbackFunction()
    p = Pipeline([
        ForEach(
            Pipeline([
                FitTransformCallbackStep(tape.callback, tape_fit, ["1"]),
                FitTransformCallbackStep(tape.callback, tape_fit, ["2"]),
            ]))
    ])
    data_inputs = [[0, 1], [1, 2]]
    expected_outputs = [[2, 3], [4, 5]]

    p, outputs = p.fit_transform(data_inputs, expected_outputs)

    assert tape.get_name_tape() == ["1", "2", "1", "2"]
    assert tape_fit.get_name_tape() == ["1", "2", "1", "2"]
    assert tape_fit.data == [([0, 1], [2, 3]), ([0, 1], [2, 3]),
                             ([1, 2], [4, 5]), ([1, 2], [4, 5])]
Example #10
def main(tmpdir, sleep_time: float = 0.001, n_iter: int = 10):
    DATA_INPUTS = np.array(range(100))
    EXPECTED_OUTPUTS = np.array(range(100, 200))

    HYPERPARAMETER_SPACE = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 2),
        'multiplication_2__multiply_by': RandInt(1, 2),
    })

    print('Classic Pipeline:')
    classic_pipeline_folder = os.path.join(str(tmpdir), 'classic')

    pipeline = Pipeline([
        ('multiplication_1', MultiplyByN()),
        ('sleep_1', ForEach(Sleep(sleep_time))),
        ('multiplication_2', MultiplyByN()),
    ], cache_folder=classic_pipeline_folder).set_hyperparams_space(HYPERPARAMETER_SPACE)

    time_a = time.time()
    auto_ml = AutoML(
        pipeline,
        refit_trial=True,
        n_trials=n_iter,
        cache_folder_when_no_handle=classic_pipeline_folder,
        validation_splitter=ValidationSplitter(0.2),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False)
        ],
    )
    auto_ml = auto_ml.fit(DATA_INPUTS, EXPECTED_OUTPUTS)
    outputs = auto_ml.get_best_model().predict(DATA_INPUTS)
    time_b = time.time()

    actual_score = mean_squared_error(EXPECTED_OUTPUTS, outputs)
    print('{0} seconds'.format(time_b - time_a))
    print('smallest mse: {0}'.format(actual_score))
    print('best hyperparams: {0}'.format(auto_ml.get_best_model().get_hyperparams()))

    assert isinstance(actual_score, float)

    print('Resumable Pipeline:')
    resumable_pipeline_folder = os.path.join(str(tmpdir), 'resumable')

    pipeline = ResumablePipeline([
        ('multiplication_1', MultiplyByN()),
        ('sleep_1', ForEach(Sleep(sleep_time))),
        ('checkpoint1', ExpandDim(DefaultCheckpoint())),
        ('multiplication_2', MultiplyByN()),
    ], cache_folder=resumable_pipeline_folder).set_hyperparams_space(HYPERPARAMETER_SPACE)

    time_a = time.time()
    auto_ml = AutoML(
        pipeline,
        refit_trial=True,
        n_trials=n_iter,
        cache_folder_when_no_handle=resumable_pipeline_folder,
        validation_splitter=ValidationSplitter(0.2),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False)
        ]
    )
    auto_ml = auto_ml.fit(DATA_INPUTS, EXPECTED_OUTPUTS)
    outputs2 = auto_ml.get_best_model().predict(DATA_INPUTS)
    time_b = time.time()
    pipeline.flush_all_cache()

    actual_score = mean_squared_error(EXPECTED_OUTPUTS, outputs2)
    print('{0} seconds'.format(time_b - time_a))
    print('smallest mse: {0}'.format(actual_score))
    print('best hyperparams: {0}'.format(auto_ml.get_best_model().get_hyperparams()))

    assert isinstance(actual_score, float)
    assert (outputs == outputs2).all()
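The keys in `HYPERPARAMETER_SPACE` follow Neuraxle's double-underscore convention: `<step name>__<hyperparameter>`, where the step names come from the `('multiplication_1', MultiplyByN())` tuples. Fixing the hyperparameters by hand instead of searching could look roughly like the sketch below; the exact `set_hyperparams` call accepted by your Neuraxle version may differ.

# Hypothetical manual configuration using the same step-name prefixes:
pipeline = pipeline.set_hyperparams({
    'multiplication_1__multiply_by': 2,
    'multiplication_2__multiply_by': 2,
})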