예제 #1
0
def test_sklearn_incremental_predictor_partial_fit_calls(
        batch_size, num_epochs):
    """Verify the sample and call counts seen by the wrapped model.

    A counting stand-in model is fitted through ``IncrementalPredictor``
    with shuffling disabled. After fitting, the number of samples it was
    handed must equal the training-set length times ``num_epochs``, and the
    number of ``partial_fit`` calls must equal the number of batches times
    ``num_epochs``.
    """
    class _CountingModel:
        """Stand-in model that only tallies what ``partial_fit`` receives."""

        def __init__(self):
            self.n_partial_fit_calls_ = 0
            self.n_samples_ = 0

        def partial_fit(self, X, y):
            # One more call, and however many rows this batch carried.
            self.n_partial_fit_calls_ += 1
            self.n_samples_ += X.shape[0]

    train_part, _unused_test_part = vaex.example().ml.train_test_split(
        test_size=0.1, verbose=False)

    # Expected totals: ceil-divide the row count by the batch size so that a
    # trailing partial batch is counted as its own partial_fit call.
    n_rows = len(train_part)
    batches_per_epoch = (n_rows + batch_size - 1) // batch_size

    settings = dict(
        model=_CountingModel(),
        features=train_part.column_names[:6],
        batch_size=batch_size,
        num_epochs=num_epochs,
        shuffle=False,
        prediction_name='pred',
    )
    incremental = IncrementalPredictor(**settings)
    incremental.fit(df=train_part, target='FeH')

    fitted = incremental.model
    assert fitted.n_samples_ == n_rows * num_epochs
    assert fitted.n_partial_fit_calls_ == batches_per_epoch * num_epochs
예제 #2
0
def test_sklearn_incremental_predictor_classification():
    """Fit an SGD classifier incrementally and transfer its state.

    The predictor is trained on the training split using the first four
    columns as features and ``class_`` as target. The training frame's
    state is then applied to the test split, which must expose a ``pred``
    column identical to the predictions computed in memory.
    """
    source = vaex.ml.datasets.load_iris_1e5()
    train_part, test_part = source.ml.train_test_split(
        test_size=0.1, verbose=False)

    settings = {
        'model': SGDClassifier(learning_rate='constant', eta0=0.01),
        'features': train_part.column_names[:4],
        'batch_size': 10000,
        'num_epochs': 3,
        'shuffle': False,
        'prediction_name': 'pred',
        # Forwarded to the model's partial_fit calls.
        'partial_fit_kwargs': {'classes': [0, 1, 2]},
    }
    incremental = IncrementalPredictor(**settings)
    incremental.fit(df=train_part, target='class_')
    train_part = incremental.transform(train_part)

    # State transfer: replay the transformed training frame's state on the
    # held-out frame so it gains the prediction column.
    test_part.state_set(train_part.state_get())

    assert test_part.column_count() == 6
    assert test_part.pred.values.shape == (10050, )

    np.testing.assert_array_equal(incremental.predict(test_part),
                                  test_part.pred.values)
예제 #3
0
def test_sklearn_incremental_predictor_serialize(tmpdir):
    """Round-trip a fitted incremental predictor through a state file.

    An ``SGDRegressor`` is fitted incrementally on the training split, the
    transformed frame's state is written to ``test.json`` under ``tmpdir``
    and loaded into the test split. The loaded ``pred`` column must agree
    with in-memory predictions to one decimal place.
    """
    train_part, test_part = vaex.example().ml.train_test_split(
        test_size=0.1, verbose=False)

    regressor = SGDRegressor()
    incremental = IncrementalPredictor(
        model=regressor,
        features=train_part.column_names[:6],
        batch_size=10 * 1000,
        num_epochs=5,
        shuffle=True,
        prediction_name='pred',
    )
    incremental.fit(df=train_part, target='FeH')
    train_part = incremental.transform(train_part)

    # State transfer - serialization: write once, read back from the same path.
    state_path = str(tmpdir.join('test.json'))
    train_part.state_write(state_path)
    test_part.state_load(state_path)

    assert train_part.column_count() == test_part.column_count()
    assert test_part.pred.values.shape == (33000, )

    in_memory = incremental.predict(test_part)
    np.testing.assert_array_almost_equal(
        in_memory, test_part.pred.values, decimal=1)
예제 #4
0
def test_sklearn_incremental_predictor_regression(df_example):
    """Fit an SGD regressor incrementally with the target set on the predictor.

    Here the target column is given to the ``IncrementalPredictor``
    constructor rather than to ``fit``. After transferring the transformed
    training frame's state onto the test split, the ``pred`` column must
    agree with in-memory predictions to one decimal place.
    """
    train_part, test_part = df_example.ml.train_test_split(
        test_size=0.1, verbose=False)

    feature_names = train_part.column_names[:6]
    regressor = SGDRegressor()

    incremental = IncrementalPredictor(
        model=regressor,
        features=feature_names,
        target='FeH',
        batch_size=10 * 1000,
        num_epochs=5,
        shuffle=True,
        prediction_name='pred',
    )
    incremental.fit(df=train_part)
    train_part = incremental.transform(train_part)

    # State transfer: apply the training frame's state to the test frame.
    test_part.state_set(train_part.state_get())

    assert train_part.column_count() == test_part.column_count()
    assert test_part.pred.values.shape == (33000, )

    in_memory = incremental.predict(test_part)
    np.testing.assert_array_almost_equal(
        in_memory, test_part.pred.values, decimal=1)