def test_sklearn_estimator_pipeline():
    """End-to-end check: virtual columns + PCA + fitted Predictor, replayed
    through a state-transfer pipeline, must reproduce the same predictions."""
    df = vaex.ml.datasets.load_iris()
    train, test = df.ml.train_test_split(verbose=False)

    # Derive virtual columns on the training split
    train['sepal_virtual'] = np.sqrt(train.sepal_length**2 + train.sepal_width**2)
    train['petal_scaled'] = train.petal_length * 0.2

    # Run a PCA over the derived features
    features = ['sepal_virtual', 'petal_scaled']
    pca = train.ml.pca(n_components=2, features=features)
    train = pca.transform(train)

    # Capture the training-side transformations as transferable state
    st = train.ml.state_transfer()

    # Fit the wrapped scikit-learn model on the transformed frame
    features = ['sepal_virtual', 'petal_scaled']
    model = Predictor(model=LinearRegression(), features=features, prediction_name='pred')
    model.fit(train, train.petal_width)

    # Combine state transfer and model into a single pipeline
    pipeline = vaex.ml.Pipeline([st, model])

    # Exercise both prediction entry points on raw test data
    pred = pipeline.predict(test)
    df_trans = pipeline.transform(test)

    # WARNING: on windows/appveyor this gives slightly different results
    # do we fully understand why? I also have the same results on my osx laptop
    # sklearn 0.21.1 (scikit-learn-0.21.2 is installed on windows) so it might be a
    # version related thing
    np.testing.assert_array_almost_equal(pred, df_trans.pred.values)
def test_sklearn_estimator_regression_validation():
    """Each regression model must predict (almost) identically whether driven
    through the vaex Predictor wrapper or through scikit-learn directly."""
    df = vaex.ml.datasets.load_iris()
    train, test = df.ml.train_test_split(verbose=False)
    features = ['sepal_length', 'sepal_width', 'petal_length']

    # Materialize dense arrays for the plain scikit-learn path
    Xtrain = train[features].values
    Xtest = test[features].values
    ytrain = train.petal_width.values

    for model in models_regression:
        # vaex-wrapped fit/transform
        vaex_model = Predictor(model=model, features=features, prediction_name='pred')
        vaex_model.fit(train, train.petal_width)
        test = vaex_model.transform(test)

        # plain scikit-learn fit/predict on the same data
        model.fit(Xtrain, ytrain)
        skl_pred = model.predict(Xtest)

        np.testing.assert_array_almost_equal(test.pred.values, skl_pred, decimal=5)
def test_sklearn_estimator_classification_validation_fit_args():
    """Validate vaex Predictor classification against plain scikit-learn.

    This variant passes the target as a positional `fit` argument (the other
    variant configures `target=` on the Predictor itself).

    NOTE: renamed — it previously shared the exact name
    `test_sklearn_estimator_classification_validation` with the parameterized
    test defined later in this module, so this definition was shadowed at
    import time and pytest never collected or ran it.
    """
    ds = vaex.ml.datasets.load_titanic()
    train, test = ds.ml.train_test_split(verbose=False)
    features = ['pclass', 'parch', 'sibsp']

    # Dense features for the plain scikit-learn path
    Xtrain = train[features].values
    Xtest = test[features].values
    ytrain = train.survived.values

    for model in models_classification:
        # vaex-wrapped fit/transform
        vaex_model = Predictor(model=model, features=features, prediction_name='pred')
        vaex_model.fit(train, train.survived)
        test = vaex_model.transform(test)

        # scikit-learn fit/predict on the same data
        model.fit(Xtrain, ytrain)
        skl_pred = model.predict(Xtest)

        # class labels should match exactly
        assert np.all(skl_pred == test.pred.values)
def test_sklearn_estimator_classification_validation(prediction_type, df_titanic):
    """Compare vaex Predictor output (predict / predict_proba, selected via the
    `prediction_type` fixture/parameter) against the underlying sklearn model."""
    df = df_titanic
    df['survived'] = df.survived.astype('int32')
    train, test = df.ml.train_test_split(verbose=False)
    features = ['pclass', 'parch', 'sibsp']

    # Dense arrays for the direct scikit-learn path
    Xtrain = train[features].values
    Xtest = test[features].values
    ytrain = train.survived.values

    for model in models_classification:
        # vaex-wrapped: target configured on the Predictor, fit takes only the frame
        vaex_model = Predictor(model=model,
                               features=features,
                               target='survived',
                               prediction_name='pred',
                               prediction_type=prediction_type)
        vaex_model.fit(train)
        test = vaex_model.transform(test)

        # direct scikit-learn path
        model.fit(Xtrain, ytrain)
        if prediction_type == 'predict':
            skl_pred = model.predict(Xtest)
        else:
            skl_pred = model.predict_proba(Xtest)

        assert np.all(skl_pred == test.pred.values)
def test_sklearn_estimator_virtual_columns():
    """A Predictor fitted and applied on virtual columns must yield one
    prediction per row of the full dataset."""
    df = vaex.ml.datasets.load_iris()

    # Expose the measurements as virtual columns
    df['x'] = df.sepal_length * 1
    df['y'] = df.sepal_width * 1
    df['w'] = df.petal_length * 1
    df['z'] = df.petal_width * 1

    # Split kept for parity with the other tests (model is fit on the full frame)
    train, test = df.ml.train_test_split(test_size=0.2, verbose=False)

    features = ['x', 'y', 'z']
    model = Predictor(model=LinearRegression(), features=features, prediction_name='pred')
    model.fit(df, df.w)
    df = model.transform(df)

    # iris has 150 rows, so the prediction column must too
    assert df.pred.values.shape == (150,)
def train_gtfsr(df):
    """Train the GTFS-R arrival-delay model(s) on *df* and export the state.

    Collects the engineered feature columns by regex, validates their dtypes,
    fits each configured model, applies it to the frame, and writes the
    resulting vaex state to ``gtfsr_model_out_path``.

    Parameters
    ----------
    df : vaex DataFrame with the engineered feature columns already present.

    Returns
    -------
    None — the trained state is persisted via ``df.state_write``.
    """
    print("*** gtfsr model training ***")

    # Raw strings for the regexes: "pca[\d]" in a non-raw string is an invalid
    # escape sequence (DeprecationWarning, and a SyntaxWarning on newer Pythons).
    feats = (df.get_column_names(regex=r"pca[\d]")
             + df.get_column_names(regex=r".*_[xy]")
             + df.get_column_names(regex=r"standard_scaled_*")
             + df.get_column_names(regex=r"label_encode_*")
             + df.get_column_names(regex=r"minmax_scaled_*")
             + ["stop_sequence", "is_delayed", "direction"])
    # NOTE(review): the trailing "_*" patterns match zero-or-more underscores,
    # not an arbitrary suffix — "_.*" may have been intended; confirm upstream.

    # All features must be numeric before handing them to the regressor.
    non_numeric = [df[feat].dtype for feat in feats
                   if not df[feat].dtype in ["float64", "int64"]]
    assert non_numeric == [], \
        f"All training feature must be number types, not {df[feats].dtypes}"

    target = "arrival"
    prediction_name = "p_arrival"

    lgbm_params = {
        "boosting_type": "gbdt",
        "learning_rate": 0.3,
        "n_estimators": 300,
        "max_depth": 50,
        "num_leaves": 50,
        "num_iterations": 300,
    }

    models = [
        # lightGBM Regressor
        Predictor(
            features=feats,
            target=target,
            prediction_name=prediction_name + "_lgbm",
            model=lightgbm.LGBMRegressor(**lgbm_params, n_jobs=-1),
        ),
        # # XGBoost Regressor
        # Predictor(
        #     features=feats,
        #     target=target,
        #     prediction_name=prediction_name + "_xgb",
        #     model=xgboost.XGBRegressor(max_depth=50, min_child_weight=1, n_estimators=200, n_jobs=-1, learning_rate=0.3),
        # ),
    ]

    # Fit each model and append its prediction column to the frame.
    for i, model in enumerate(models):
        model.fit(df)
        print(f"\n\nmodel {i} trained, time taken: {duration()}s")
        df = model.transform(df)

    # df[prediction_name + "_final"] = (
    #     df["p_arrival_lgbm"].astype("float64") * 0.5 + df["p_arrival_xgb"].astype("float64") * 0.5
    # )

    # Persist the full transformation + model state for later replay.
    df.state_write(gtfsr_model_out_path)
    print("exported model")
    return
def create_scats_ml_model():
    """Build and persist the SCATS traffic-volume model.

    Converts the source CSV to HDF5 if needed, engineers PCA and cyclic
    time features, fits a RandomForest-backed vaex Predictor on average
    volume, and dumps the fitted model to ``model_out`` via joblib.

    Returns
    -------
    None — the fitted model is written to disk.
    """
    print("starting scats ml modeling")

    # Convert the CSV to HDF5 once; afterwards always open the HDF5 copy.
    if not os.path.exists(finalScatsPath + ".hdf5"):
        vaex.from_csv(finalScatsPath, convert=True, copy_index=False, chunk_size=1_000_000)
    df = vaex.open(finalScatsPath + ".hdf5", shuffle=True)

    # Feature engineering: PCA on coordinates, cyclic encodings for time.
    pca_coord = vaex.ml.PCA(features=["lat", "lon"], n_components=2, prefix="pca")
    df = pca_coord.fit_transform(df)
    cycl_transform_hour = vaex.ml.CycleTransformer(features=["hour"], n=24)
    df = cycl_transform_hour.fit_transform(df)
    cycl_transform_dow = vaex.ml.CycleTransformer(features=["dow"], n=7)
    df = cycl_transform_dow.fit_transform(df)
    print("dataWrangling done, ready to create model, time: {}s".format(duration()))

    # Raw string for the regex: "pca[\d]" in a non-raw string is an invalid
    # escape sequence (DeprecationWarning, SyntaxWarning on newer Pythons).
    vaex_model = Predictor(
        features=df.get_column_names(regex=r"pca[\d]") + df.get_column_names(regex=r".*_[xy]"),
        target="avg_vol",
        model=RandomForestRegressor(random_state=42, n_estimators=7 * 24),
        prediction_name="p_avg_vol",
    )

    # Fit with a threaded joblib backend to parallelize the forest build.
    with parallel_backend("threading", n_jobs=8):
        vaex_model.fit(df)
    print("\n\nmodel created, time: {}s".format(duration()))

    # Persist the fitted wrapper (compressed) for later loading.
    dump(value=vaex_model, filename=model_out, compress=3)
    print("model written to output, time: {}s".format(duration()))
    return
def test_sklearn_estimator():
    """Predictor.predict and Predictor.transform must agree, and the trained
    state must transfer to the full dataset via state_get/state_set."""
    df = vaex.ml.datasets.load_iris()
    features = ['sepal_length', 'sepal_width', 'petal_length']
    train, test = df.ml.train_test_split(verbose=False)

    model = Predictor(model=LinearRegression(), features=features, prediction_name='pred')
    model.fit(train, train.petal_width)

    # Both prediction entry points should produce (almost) the same values
    prediction = model.predict(test)
    test = model.transform(test)
    np.testing.assert_array_almost_equal(test.pred.values, prediction, decimal=5)

    # Transfer the fitted state from train to the full dataset
    train = model.transform(train)
    state = train.state_get()
    df.state_set(state)
    assert df.pred.values.shape == (150,)
def test_sklearn_estimator_serialize(tmpdir):
    """A pipeline holding a fitted Predictor must survive a save/load round
    trip, both directly and after a state_get/state_set round trip."""
    df = vaex.ml.datasets.load_iris()
    features = ['sepal_length', 'sepal_width', 'petal_length']
    path = str(tmpdir.join('test.json'))

    # Round trip 1: fit, save, load
    model = Predictor(model=LinearRegression(), features=features, prediction_name='pred')
    model.fit(df, df.petal_width)
    pipeline = vaex.ml.Pipeline([model])
    pipeline.save(path)
    pipeline.load(path)

    # Round trip 2: same, but cycle the model state through state_get/state_set first
    model = Predictor(model=LinearRegression(), features=features, prediction_name='pred')
    model.fit(df, df.petal_width)
    model.state_set(model.state_get())
    pipeline = vaex.ml.Pipeline([model])
    pipeline.save(path)
    pipeline.load(path)