def test_sklearn_incremental_predictor_classification():
    """Check IncrementalPredictor wrapping an SGDClassifier on the iris 1e5 data.

    The predictor is fitted on the training split, the training DataFrame's
    state is copied onto the test split, and the resulting ``pred`` column is
    compared against predictions obtained directly from ``predict``.
    """
    iris = vaex.ml.datasets.load_iris_1e5()
    train, test = iris.ml.train_test_split(test_size=0.1, verbose=False)

    classifier = SGDClassifier(learning_rate='constant', eta0=0.01)
    predictor = IncrementalPredictor(
        model=classifier,
        features=train.column_names[:4],
        batch_size=10_000,
        num_epochs=3,
        shuffle=False,
        prediction_name='pred',
        # The class labels are forwarded to the model's partial_fit calls.
        partial_fit_kwargs={'classes': [0, 1, 2]},
    )
    predictor.fit(df=train, target='class_')
    train = predictor.transform(train)

    # State transfer: apply the state of the transformed training frame
    # to the test frame.
    test.state_set(train.state_get())
    assert test.column_count() == 6
    assert test.pred.values.shape == (10050,)

    # Predictions computed directly must match the transferred column exactly.
    expected = predictor.predict(test)
    np.testing.assert_array_equal(expected, test.pred.values)
def test_sklearn_incremental_predictor_serialize(tmpdir):
    """Check that an IncrementalPredictor survives a state write/load round trip.

    An SGDRegressor-based predictor is fitted on the training split of the
    vaex example dataset; the training DataFrame's state is written to a JSON
    file under ``tmpdir`` and loaded into the test split. The loaded ``pred``
    column is then compared (to one decimal) with direct ``predict`` output.
    """
    example = vaex.example()
    train, test = example.ml.train_test_split(test_size=0.1, verbose=False)

    predictor = IncrementalPredictor(
        model=SGDRegressor(),
        features=train.column_names[:6],
        batch_size=10_000,
        num_epochs=5,
        shuffle=True,
        prediction_name='pred',
    )
    predictor.fit(df=train, target='FeH')
    train = predictor.transform(train)

    # State transfer - serialization: go through a file on disk instead of
    # handing the state over in memory.
    state_path = str(tmpdir.join('test.json'))
    train.state_write(state_path)
    test.state_load(state_path)

    assert train.column_count() == test.column_count()
    assert test.pred.values.shape == (33000,)

    expected = predictor.predict(test)
    np.testing.assert_array_almost_equal(expected, test.pred.values, decimal=1)
def test_sklearn_incremental_predictor_regression(df_example):
    """Check IncrementalPredictor wrapping an SGDRegressor on ``df_example``.

    Here the target is passed to the predictor's constructor rather than to
    ``fit``. After fitting on the training split, the training DataFrame's
    state is copied onto the test split and the ``pred`` column is compared
    (to one decimal) with predictions obtained directly from ``predict``.
    """
    train, test = df_example.ml.train_test_split(test_size=0.1, verbose=False)

    predictor = IncrementalPredictor(
        model=SGDRegressor(),
        features=train.column_names[:6],
        target='FeH',
        batch_size=10_000,
        num_epochs=5,
        shuffle=True,
        prediction_name='pred',
    )
    # No target argument here: it was supplied at construction time.
    predictor.fit(df=train)
    train = predictor.transform(train)

    # State transfer: apply the state of the transformed training frame
    # to the test frame.
    test.state_set(train.state_get())
    assert train.column_count() == test.column_count()
    assert test.pred.values.shape == (33000,)

    expected = predictor.predict(test)
    np.testing.assert_array_almost_equal(expected, test.pred.values, decimal=1)