def test_ranker(output, client, listen_port, group):
    """Check that ``DaskLGBMRanker`` ranks well and agrees with ``LGBMRanker``.

    A distributed ranker and a local ranker are fit with identical
    parameters on the data returned by ``_create_ranking_data``. The test
    then asserts that:

    * the distributed scores have a Spearman correlation above 0.6 with
      the labels,
    * the distributed scores have a Spearman correlation above 0.8 with
      the local ranker's scores, and
    * ``dask_ranker.to_local().predict(X)`` equals the distributed
      predictions.

    Parameters
    ----------
    output
        Forwarded to ``_create_ranking_data``; the value ``'array'``
        additionally triggers persisting and rebalancing of the Dask
        collections.
    client
        Dask client handed to ``DaskLGBMRanker``. It is closed when the
        test finishes, whether it passed or failed.
    listen_port
        Passed to ``DaskLGBMRanker`` as ``local_listen_port``.
    group
        Forwarded to ``_create_ranking_data``.

    NOTE(review): the arguments are presumably supplied by pytest
    fixtures / parametrization defined outside this block - confirm.
    """
    try:
        X, y, w, g, dX, dy, dw, dg = _create_ranking_data(
            output=output,
            group=group
        )

        # rebalance small dask.array dataset for better performance.
        if output == 'array':
            dX = dX.persist()
            dy = dy.persist()
            dw = dw.persist()
            dg = dg.persist()
            wait([dX, dy, dw, dg])
            client.rebalance()

        # use many trees + leaves to overfit, help ensure that dask data-parallel strategy matches that of
        # serial learner. See https://github.com/microsoft/LightGBM/issues/3292#issuecomment-671288210.
        params = {
            "random_state": 42,
            "n_estimators": 50,
            "num_leaves": 20,
            "min_child_samples": 1
        }

        dask_ranker = lgb.DaskLGBMRanker(
            client=client,
            time_out=5,
            local_listen_port=listen_port,
            tree_learner_type='data_parallel',
            **params
        )
        dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg)
        rnkvec_dask = dask_ranker.predict(dX)
        rnkvec_dask = rnkvec_dask.compute()
        rnkvec_dask_local = dask_ranker.to_local().predict(X)

        local_ranker = lgb.LGBMRanker(**params)
        local_ranker.fit(X, y, sample_weight=w, group=g)
        rnkvec_local = local_ranker.predict(X)

        # distributed ranker should be able to rank decently well and should
        # have high rank correlation with scores from serial ranker.
        dcor = spearmanr(rnkvec_dask, y).correlation
        assert dcor > 0.6
        assert spearmanr(rnkvec_dask, rnkvec_local).correlation > 0.8
        assert_eq(rnkvec_dask, rnkvec_dask_local)
    finally:
        # close the client even when fitting or an assertion above raises,
        # so the bounded-timeout shutdown is never skipped on failure.
        client.close(timeout=CLIENT_CLOSE_TIMEOUT)
client = Client(cluster)
print("created a Dask LocalCluster")

print("distributing training data on the Dask cluster")

# Two partitions: the rows covered by the first 100 entries of `group`
# go into the first one, every remaining row goes into the second.
rows_in_part1 = int(np.sum(group[:100]))
rows_in_part2 = X.shape[0] - rows_in_part1
num_features = X.shape[1]
row_chunks = (rows_in_part1, rows_in_part2)

# The partition boundary cuts through the sparse data, so convert
# X to a dense representation before chunking it.
X = X.todense()

dX = da.from_array(
    x=X,
    chunks=[row_chunks, (num_features, )]
)
dy = da.from_array(
    x=y,
    chunks=[row_chunks]
)
dg = da.from_array(
    x=group,
    chunks=[(100, group.size - 100)]
)

print("beginning training")

dask_model = lgb.DaskLGBMRanker(n_estimators=10)
dask_model.fit(dX, dy, group=dg)
assert dask_model.fitted_

print("done training")
def test_ranker(output, client, listen_port, group):
    """Check ``DaskLGBMRanker`` scores, leaf predictions and categorical use.

    A distributed ranker and a local ``LGBMRanker`` are fit with identical
    parameters on the data returned by ``_create_ranking_data``. The test
    then asserts that:

    * the distributed scores have a Spearman correlation above 0.6 with
      the labels and above 0.8 with the local ranker's scores,
    * ``dask_ranker.to_local().predict(X)`` equals the distributed
      predictions,
    * ``predict(..., pred_leaf=True)`` yields one column per tree with
      values bounded by ``num_leaves``, and
    * for ``output == 'dataframe-with-categorical'``, at least one split
      uses a categorical column and is recorded with decision type ``==``.

    Parameters
    ----------
    output
        Forwarded to ``_create_ranking_data``. ``'array'`` additionally
        triggers persisting and rebalancing of the Dask collections;
        ``'dataframe-with-categorical'`` shrinks the generated data to a
        single informative feature and enables the categorical checks.
    client
        Dask client handed to ``DaskLGBMRanker``. It is closed when the
        test finishes, whether it passed or failed.
    listen_port
        Passed to ``DaskLGBMRanker`` as ``local_listen_port``.
    group
        Forwarded to ``_create_ranking_data``.

    NOTE(review): the arguments are presumably supplied by pytest
    fixtures / parametrization defined outside this block - confirm.
    """
    try:
        data_kwargs = {"output": output, "group": group}
        if output == 'dataframe-with-categorical':
            # NOTE(review): presumably restricted to one informative feature
            # so that the categorical columns actually get split on - confirm
            # against _create_ranking_data.
            data_kwargs.update(n_features=1, n_informative=1)
        X, y, w, g, dX, dy, dw, dg = _create_ranking_data(**data_kwargs)

        # rebalance small dask.Array dataset for better performance.
        if output == 'array':
            dX = dX.persist()
            dy = dy.persist()
            dw = dw.persist()
            dg = dg.persist()
            wait([dX, dy, dw, dg])
            client.rebalance()

        # use many trees + leaves to overfit, help ensure that Dask data-parallel strategy matches that of
        # serial learner. See https://github.com/microsoft/LightGBM/issues/3292#issuecomment-671288210.
        params = {
            "random_state": 42,
            "n_estimators": 50,
            "num_leaves": 20,
            "min_child_samples": 1
        }

        dask_ranker = lgb.DaskLGBMRanker(
            client=client,
            time_out=5,
            local_listen_port=listen_port,
            tree_learner_type='data_parallel',
            **params
        )
        dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg)
        rnkvec_dask = dask_ranker.predict(dX)
        rnkvec_dask = rnkvec_dask.compute()
        p1_pred_leaf = dask_ranker.predict(dX, pred_leaf=True)
        rnkvec_dask_local = dask_ranker.to_local().predict(X)

        local_ranker = lgb.LGBMRanker(**params)
        local_ranker.fit(X, y, sample_weight=w, group=g)
        rnkvec_local = local_ranker.predict(X)

        # distributed ranker should be able to rank decently well and should
        # have high rank correlation with scores from serial ranker.
        dcor = spearmanr(rnkvec_dask, y).correlation
        assert dcor > 0.6
        assert spearmanr(rnkvec_dask, rnkvec_local).correlation > 0.8
        assert_eq(rnkvec_dask, rnkvec_dask_local)

        # pred_leaf values should have the right shape
        # and values that look like valid tree nodes
        pred_leaf_vals = p1_pred_leaf.compute()
        assert pred_leaf_vals.shape == (
            X.shape[0],
            dask_ranker.booster_.num_trees()
        )
        assert np.max(pred_leaf_vals) <= params['num_leaves']
        assert np.min(pred_leaf_vals) >= 0
        assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']

        # be sure LightGBM actually used at least one categorical column,
        # and that it was correctly treated as a categorical feature
        if output == 'dataframe-with-categorical':
            cat_cols = [
                col for col in dX.columns
                if dX.dtypes[col].name == 'category'
            ]
            tree_df = dask_ranker.booster_.trees_to_dataframe()
            node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
            assert node_uses_cat_col.sum() > 0
            assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='
    finally:
        # close the client even when fitting or an assertion above raises,
        # so the bounded-timeout shutdown is never skipped on failure.
        client.close(timeout=CLIENT_CLOSE_TIMEOUT)