def test_regressor_quantile(output, client, listen_port, alpha):
    X, y, w, dX, dy, dw = _create_data(
        objective='regression',
        output=output
    )

    params = {
        "objective": "quantile",
        "alpha": alpha,
        "random_state": 42,
        "n_estimators": 10,
        "num_leaves": 10
    }

    dask_regressor = lgb.DaskLGBMRegressor(
        local_listen_port=listen_port,
        tree_learner_type='data_parallel',
        **params
    )
    dask_regressor = dask_regressor.fit(dX, dy, client=client, sample_weight=dw)
    p1 = dask_regressor.predict(dX).compute()
    q1 = np.count_nonzero(y < p1) / y.shape[0]

    local_regressor = lgb.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    p2 = local_regressor.predict(X)
    q2 = np.count_nonzero(y < p2) / y.shape[0]

    # Quantiles should be right
    np.testing.assert_allclose(q1, alpha, atol=0.2)
    np.testing.assert_allclose(q2, alpha, atol=0.2)

    client.close()
def test_regressor_pred_contrib(output, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(
        objective='regression',
        output=output
    )

    params = {
        "n_estimators": 10,
        "num_leaves": 10
    }

    dask_regressor = lgb.DaskLGBMRegressor(
        time_out=5,
        local_listen_port=listen_port,
        tree_learner='data',
        **params
    )
    dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw, client=client)
    preds_with_contrib = dask_regressor.predict(dX, pred_contrib=True).compute()

    local_regressor = lgb.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    local_preds_with_contrib = local_regressor.predict(X, pred_contrib=True)

    if output == "scipy_csr_matrix":
        preds_with_contrib = np.array(preds_with_contrib.todense())

    # contrib outputs for distributed training are different from those of local
    # training, so we can only test that the output has the right shape and that
    # the base values are in the right position
    num_features = dX.shape[1]
    assert preds_with_contrib.shape[1] == num_features + 1
    assert preds_with_contrib.shape == local_preds_with_contrib.shape
def test_regressor(output, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(objective='regression', output=output)

    params = {"random_state": 42, "num_leaves": 10}

    dask_regressor = lgb.DaskLGBMRegressor(client=client, time_out=5,
                                           local_listen_port=listen_port,
                                           tree='data', **params)
    dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
    p1 = dask_regressor.predict(dX)
    if output != 'dataframe':
        s1 = _r2_score(dy, p1)
    p1 = p1.compute()
    p1_local = dask_regressor.to_local().predict(X)
    s1_local = dask_regressor.to_local().score(X, y)

    local_regressor = lgb.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    s2 = local_regressor.score(X, y)
    p2 = local_regressor.predict(X)

    # Scores should be the same
    if output != 'dataframe':
        assert_eq(s1, s2, atol=.01)
        assert_eq(s1, s1_local, atol=.003)

    # Predictions should be roughly the same
    assert_eq(y, p1, rtol=1., atol=100.)
    assert_eq(y, p2, rtol=1., atol=50.)
    assert_eq(p1, p1_local)

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
def test_regressor(output, client):
    X, y, w, _, dX, dy, dw, _ = _create_data(objective='regression', output=output)

    params = {
        "random_state": 42,
        "num_leaves": 31,
        "n_estimators": 20,
    }

    dask_regressor = lgb.DaskLGBMRegressor(client=client, time_out=5, tree='data', **params)
    dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
    p1 = dask_regressor.predict(dX)
    p1_pred_leaf = dask_regressor.predict(dX, pred_leaf=True)
    s1 = _r2_score(dy, p1)
    p1 = p1.compute()
    p1_local = dask_regressor.to_local().predict(X)
    s1_local = dask_regressor.to_local().score(X, y)

    local_regressor = lgb.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    s2 = local_regressor.score(X, y)
    p2 = local_regressor.predict(X)

    # Scores should be the same
    assert_eq(s1, s2, atol=0.01)
    assert_eq(s1, s1_local)

    # Predictions should be roughly the same.
    assert_eq(p1, p1_local)

    # pred_leaf values should have the right shape
    # and values that look like valid tree nodes
    pred_leaf_vals = p1_pred_leaf.compute()
    assert pred_leaf_vals.shape == (X.shape[0], dask_regressor.booster_.num_trees())
    assert np.max(pred_leaf_vals) <= params['num_leaves']
    assert np.min(pred_leaf_vals) >= 0
    assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']

    assert_eq(p1, y, rtol=0.5, atol=50.)
    assert_eq(p2, y, rtol=0.5, atol=50.)

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        cat_cols = [
            col for col in dX.columns
            if dX.dtypes[col].name == 'category'
        ]
        tree_df = dask_regressor.booster_.trees_to_dataframe()
        node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
        assert node_uses_cat_col.sum() > 0
        assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
def test_warns_and_continues_on_unrecognized_tree_learner(client):
    # shape dimensions must be integers; a float like 1e3 raises TypeError in NumPy
    X = da.random.random((1_000, 10))
    y = da.random.random((1_000, 1))
    dask_regressor = lgb.DaskLGBMRegressor(
        time_out=5,
        local_listen_port=1234,
        tree_learner='some-nonsense-value',
        n_estimators=1,
        num_leaves=2
    )
    with pytest.warns(UserWarning, match='Parameter tree_learner set to some-nonsense-value'):
        dask_regressor = dask_regressor.fit(X, y, client=client)

    assert dask_regressor.fitted_
def test_warns_and_continues_on_unrecognized_tree_learner(client):
    # shape dimensions must be integers; a float like 1e3 raises TypeError in NumPy
    X = da.random.random((1_000, 10))
    y = da.random.random((1_000, 1))
    dask_regressor = lgb.DaskLGBMRegressor(client=client, time_out=5,
                                           tree_learner='some-nonsense-value',
                                           n_estimators=1, num_leaves=2)
    with pytest.warns(
            UserWarning,
            match='Parameter tree_learner set to some-nonsense-value'):
        dask_regressor = dask_regressor.fit(X, y)
    assert dask_regressor.fitted_

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
def test_warns_but_makes_no_changes_for_feature_or_voting_tree_learner(client):
    # shape dimensions must be integers; a float like 1e3 raises TypeError in NumPy
    X = da.random.random((1_000, 10))
    y = da.random.random((1_000, 1))
    for tree_learner in ['feature_parallel', 'voting']:
        dask_regressor = lgb.DaskLGBMRegressor(
            time_out=5,
            local_listen_port=1234,
            tree_learner=tree_learner,
            n_estimators=1,
            num_leaves=2
        )
        with pytest.warns(UserWarning, match='Support for tree_learner %s in lightgbm' % tree_learner):
            dask_regressor = dask_regressor.fit(X, y, client=client)

        assert dask_regressor.fitted_
        assert dask_regressor.get_params()['tree_learner'] == tree_learner
def test_regressor_pred_contrib(output, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(
        objective='regression',
        output=output
    )

    params = {
        "n_estimators": 10,
        "num_leaves": 10
    }

    dask_regressor = lgb.DaskLGBMRegressor(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        tree_learner='data',
        **params
    )
    dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
    preds_with_contrib = dask_regressor.predict(dX, pred_contrib=True).compute()

    local_regressor = lgb.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    local_preds_with_contrib = local_regressor.predict(X, pred_contrib=True)

    if output == "scipy_csr_matrix":
        preds_with_contrib = np.array(preds_with_contrib.todense())

    # contrib outputs for distributed training are different from those of local
    # training, so we can only test that the output has the right shape and that
    # the base values are in the right position
    num_features = dX.shape[1]
    assert preds_with_contrib.shape[1] == num_features + 1
    assert preds_with_contrib.shape == local_preds_with_contrib.shape

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        cat_cols = [
            col for col in dX.columns
            if dX.dtypes[col].name == 'category'
        ]
        tree_df = dask_regressor.booster_.trees_to_dataframe()
        node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
        assert node_uses_cat_col.sum() > 0
        assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
def refit_and_save(self, model_path):
    """
    https://stackoverflow.com/questions/55208734/save-lgbmregressor-model-from-python-lightgbm-package-to-disc/55209076
    """
    try:
        self.best_model = lgb.DaskLGBMRegressor(
            client=self.client,
            random_state=42,
            silent=False,
            tree_learner="data",
            force_row_wise=True,
            **self.best_params_,
        )
        self.best_model.fit(
            self.full_dataset[[
                col for col in self.full_dataset
                if col.startswith(("pc", "cat"))
            ]].to_dask_array(lengths=True),
            self.full_dataset["sid_shop_item_qty_sold_day"].to_dask_array(
                lengths=True,
            ),
            sample_weight=self.get_sample_weights(self.full_dataset),
            feature_name=[
                col for col in self.full_dataset
                if col.startswith(("pc", "cat"))
            ],
            categorical_feature=[
                col for col in self.full_dataset
                if col.startswith("cat")
            ],
        )
        output_txt = str(model_path).split("/")[-1]
        booster = self.best_model.booster_.save_model(output_txt)
        # output_txt = str(model_path).split('/')[-1]
        # global s3_client
        s3_client = boto3.client("s3")
        response = s3_client.upload_file(output_txt, "sales-demand-data", output_txt)
        logging.info(f"Name of saved model uploaded to S3 is: {output_txt}")
    except (Exception, ClientError):
        logging.exception(
            "Exception occurred while fitting model on the full dataset "
            "and saving the booster to file on S3."
        )
        # kill all active work, delete all data on the network,
        # and restart the worker processes
        self.client.restart()
        sys.exit(1)
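A minimal sketch of loading the booster file written by save_model() above back into a local model for prediction; the file name and feature matrix below are hypothetical, and the imports are assumed.

import lightgbm as lgb
import numpy as np

# load the booster from the text file produced by refit_and_save();
# "model.txt" is a hypothetical file name
booster = lgb.Booster(model_file="model.txt")

# score a hypothetical feature matrix with the same number of columns
# the model was trained on
X_new = np.random.random((5, booster.num_feature()))
preds = booster.predict(X_new)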
def test_regressor_quantile(output, client, listen_port, alpha):
    X, y, w, dX, dy, dw = _create_data(
        objective='regression',
        output=output
    )

    params = {
        "objective": "quantile",
        "alpha": alpha,
        "random_state": 42,
        "n_estimators": 10,
        "num_leaves": 10
    }

    dask_regressor = lgb.DaskLGBMRegressor(
        client=client,
        local_listen_port=listen_port,
        tree_learner_type='data_parallel',
        **params
    )
    dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
    p1 = dask_regressor.predict(dX).compute()
    q1 = np.count_nonzero(y < p1) / y.shape[0]

    local_regressor = lgb.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    p2 = local_regressor.predict(X)
    q2 = np.count_nonzero(y < p2) / y.shape[0]

    # Quantiles should be right
    np.testing.assert_allclose(q1, alpha, atol=0.2)
    np.testing.assert_allclose(q2, alpha, atol=0.2)

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        cat_cols = [
            col for col in dX.columns
            if dX.dtypes[col].name == 'category'
        ]
        tree_df = dask_regressor.booster_.trees_to_dataframe()
        node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
        assert node_uses_cat_col.sum() > 0
        assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
print("initializing a Dask cluster") cluster = LocalCluster(n_workers=2) client = Client(cluster) print("created a Dask LocalCluster") print("distributing training data on the Dask cluster") dX = da.from_array(X, chunks=(100, 50)) dy = da.from_array(y, chunks=(100, )) print("beginning training") dask_model = lgb.DaskLGBMRegressor(n_estimators=10) dask_model.fit(dX, dy) assert dask_model.fitted_ print("done training") print("predicting on the training data") preds = dask_model.predict(dX) # the code below uses sklearn.metrics, but this requires pulling all of the # predictions and target values back from workers to the client # # for larger datasets, consider the metrics from dask-ml instead # https://ml.dask.org/modules/api.html#dask-ml-metrics-metrics print("computing MSE")
def test_regressor(output, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(
        objective='regression',
        output=output
    )

    params = {
        "random_state": 42,
        "num_leaves": 10
    }

    dask_regressor = lgb.DaskLGBMRegressor(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        tree='data',
        **params
    )
    dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
    p1 = dask_regressor.predict(dX)
    p1_pred_leaf = dask_regressor.predict(dX, pred_leaf=True)
    if not output.startswith('dataframe'):
        s1 = _r2_score(dy, p1)
    p1 = p1.compute()
    p1_local = dask_regressor.to_local().predict(X)
    s1_local = dask_regressor.to_local().score(X, y)

    local_regressor = lgb.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    s2 = local_regressor.score(X, y)
    p2 = local_regressor.predict(X)

    # Scores should be the same
    if not output.startswith('dataframe'):
        assert_eq(s1, s2, atol=.01)
        assert_eq(s1, s1_local, atol=.003)

    # Predictions should be roughly the same.
    assert_eq(p1, p1_local)

    # pred_leaf values should have the right shape
    # and values that look like valid tree nodes
    pred_leaf_vals = p1_pred_leaf.compute()
    assert pred_leaf_vals.shape == (
        X.shape[0],
        dask_regressor.booster_.num_trees()
    )
    assert np.max(pred_leaf_vals) <= params['num_leaves']
    assert np.min(pred_leaf_vals) >= 0
    assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']

    # The checks below are skipped for the categorical data case because
    # it's difficult to get a good fit from just categoricals for a
    # regression problem with small data
    if output != 'dataframe-with-categorical':
        assert_eq(y, p1, rtol=1., atol=100.)
        assert_eq(y, p2, rtol=1., atol=50.)

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        cat_cols = [
            col for col in dX.columns
            if dX.dtypes[col].name == 'category'
        ]
        tree_df = dask_regressor.booster_.trees_to_dataframe()
        node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
        assert node_uses_cat_col.sum() > 0
        assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
def gridsearch_wfv(self, params):
    # self.hyperparameters = hyperparameters
    # self.rmse_results = defaultdict(list)
    # replace this variable by creating a key-value in the self.hyper_dict
    # dictionary with value containing list of RMSE values
    self.all_params_combs = list()
    # determine if there is more than one combination of hyperparameters:
    # if only one combination, set the get_stats_ flag to True
    self.get_stats_ = (
        len(params[max(params, key=lambda x: len(params[x]))]) == 1
    )
    for params_comb_dict in (
            dict(zip(params.keys(), v))
            for v in list(product(*list(params.values())))):
        # for self.hyper_dict in hyperparameters:
        # self.params_combs_list.append(params_comb_dict)
        self.params_comb_dict = params_comb_dict.copy()
        self.params_comb_dict["rmse_list_"] = list()
        self.params_comb_dict["monthly_rmse_list_"] = list()
        self.params_comb_dict["fit_times_list_"] = list()
        try:
            self.model = lgb.DaskLGBMRegressor(
                client=self.client,
                random_state=42,
                silent=False,
                tree_learner="data",
                force_row_wise=True,
                **params_comb_dict,
            )
        except Exception:
            logging.exception("Exception occurred while initializing Dask model.")
            # kill all active work, delete all data on the network,
            # and restart the worker processes
            self.client.restart()
            sys.exit(1)

        # call method that loops over train-validation sets
        with performance_report(filename=f"dask_report_{self.curr_dt_time}.html"):
            for train, test, get_stats in self.train_test_time_split():
                self.fit(train).predict(test).rmse_all_folds(test, get_stats)

        self.params_comb_dict["avg_rmse_"] = mean(
            self.params_comb_dict["rmse_list_"])
        self.params_comb_dict["monthly_avg_rmse_"] = mean(
            self.params_comb_dict["monthly_rmse_list_"])
        self.all_params_combs.append(self.params_comb_dict)

    best_params = min(self.all_params_combs,
                      key=lambda x: x["monthly_avg_rmse_"])
    self.best_score_ = best_params["monthly_avg_rmse_"]
    # remove non-parameter key-values from self.best_params
    # (i.e., rmse_list_, avg_rmse_, etc.)
    self.best_params_ = {k: v for k, v in best_params.items() if k in params}

    # save list of parameter-result dictionaries to dataframe and then to CSV
    if self.all_params_combs:
        all_params_combs_df = pd.DataFrame(self.all_params_combs)
        output_csv = "all_params_combs.csv"
        all_params_combs_df.to_csv(output_csv, index=False)

        try:
            key = f"lightgbm_all_params_combs_{self.curr_dt_time}.csv"
            # global s3_client
            s3_client = boto3.client("s3")
            response = s3_client.upload_file(output_csv, "sales-demand-data", key)
            logging.info(
                "Name of CSV uploaded to S3 and containing all parameter combinations "
                f"and results is: {key}")
        except ClientError:
            logging.exception(
                "CSV file with LightGBM parameter combinations and results "
                "was not copied to S3."
            )
    else:
        logging.debug(
            "List of parameter-result dictionaries is empty and was not converted to CSV!"
        )
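A sketch of the parameter grid gridsearch_wfv() expects: a dict mapping DaskLGBMRegressor keyword arguments to lists of candidate values, which product() expands into every combination; the specific names and values below are illustrative assumptions.

from itertools import product

# hypothetical grid: 2 * 2 * 1 = 4 combinations will be evaluated
params = {
    "n_estimators": [500, 1000],
    "learning_rate": [0.01, 0.05],
    "num_leaves": [31],
}

# gridsearch_wfv() enumerates combinations exactly like this
for comb in (dict(zip(params.keys(), v)) for v in product(*params.values())):
    print(comb)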