def test_network_params_not_required_but_respected_if_given(client, task, listen_port):
    """Fit Dask estimators with and without explicit network parameters.

    Three models are trained on the same data:

    1. no network parameters: fitting works, and neither ``local_listen_port``
       nor ``machines`` shows up in ``get_params()``
    2. ``machines`` given: fitting works and ``machines`` is kept in the params
    3. ``local_listen_port`` given: fitting is expected to raise
       ``LightGBMError``
    """
    client.wait_for_workers(2)

    _, _, _, _, dX, dy, _, dg = _create_data(
        objective=task,
        output='array',
        chunk_size=10,
        group=None
    )
    dask_model_factory = task_to_dask_factory[task]

    # rebalance data to be sure that each worker has a piece of the data
    client.rebalance()

    # estimator settings shared by all three models
    common_params = {'n_estimators': 5, 'num_leaves': 5}

    # model 1 - no network parameters given
    model_without_network_params = dask_model_factory(**common_params)
    model_without_network_params.fit(dX, dy, group=dg)
    assert model_without_network_params.fitted_
    params_without_network = model_without_network_params.get_params()
    assert 'local_listen_port' not in params_without_network
    assert 'machines' not in params_without_network

    # model 2 - machines given
    n_workers = len(client.scheduler_info()['workers'])
    open_ports = [lgb.dask._find_random_open_port() for _ in range(n_workers)]
    machines = ",".join("127.0.0.1:" + str(port) for port in open_ports)
    model_with_machines = dask_model_factory(**common_params, machines=machines)
    model_with_machines.fit(dX, dy, group=dg)
    assert model_with_machines.fitted_
    params_with_machines = model_with_machines.get_params()
    assert 'local_listen_port' not in params_with_machines
    assert 'machines' in params_with_machines

    # model 3 - local_listen_port given
    # training should fail because LightGBM will try to use the same
    # port for multiple worker processes on the same machine
    model_with_port = dask_model_factory(
        **common_params,
        local_listen_port=listen_port
    )
    expected_error = "has multiple Dask worker processes running on it"
    with pytest.raises(lgb.basic.LightGBMError, match=expected_error):
        model_with_port.fit(dX, dy, group=dg)

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
def test_machines_should_be_used_if_provided(task, output):
    """Check that a user-supplied ``machines`` value is used during training.

    The test runs on its own two-worker ``LocalCluster``. It first occupies one
    of the ports listed in ``machines`` and expects fitting to fail with a
    port-binding error, then sets ``machines`` to repeated copies of a single
    address and expects a ``ValueError`` about duplicates.
    """
    if task == 'ranking' and output == 'scipy_csr_matrix':
        pytest.skip('LGBMRanker is not currently tested on sparse matrices')

    with LocalCluster(n_workers=2) as cluster:
        with Client(cluster) as client:
            if task == 'ranking':
                _, _, _, _, dX, dy, _, dg = _create_ranking_data(
                    output=output,
                    group=None,
                    chunk_size=10,
                )
            else:
                _, _, _, dX, dy, _ = _create_data(
                    objective=task,
                    output=output,
                    chunk_size=10,
                )
                dg = None

            dask_model_factory = task_to_dask_factory[task]

            # rebalance data to be sure that each worker has a piece of the data
            if output == 'array':
                client.rebalance()

            n_workers = len(client.scheduler_info()['workers'])
            assert n_workers > 1

            # one candidate port per worker, all on the loopback interface
            open_ports = [
                lgb.dask._find_random_open_port() for _ in range(n_workers)
            ]
            addresses = ["127.0.0.1:" + str(port) for port in open_ports]
            dask_model = dask_model_factory(
                n_estimators=5,
                num_leaves=5,
                machines=",".join(addresses),
            )

            # test that "machines" is actually respected by creating a socket that uses
            # one of the ports mentioned in "machines"
            occupied_port = open_ports[0]
            expected_error = "Binding port %s failed" % occupied_port
            with pytest.raises(lgb.basic.LightGBMError, match=expected_error):
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as blocker:
                    blocker.bind(('127.0.0.1', occupied_port))
                    dask_model.fit(dX, dy, group=dg)

            # an informative error should be raised if "machines" has duplicates
            one_open_port = lgb.dask._find_random_open_port()
            repeated_addresses = ["127.0.0.1:" + str(one_open_port)] * n_workers
            dask_model.set_params(machines=",".join(repeated_addresses))
            with pytest.raises(ValueError, match="Found duplicates in 'machines'"):
                dask_model.fit(dX, dy, group=dg)
def test_ranker(output, client, listen_port, group):
    """Compare a distributed ``DaskLGBMRanker`` against a local ``LGBMRanker``.

    Both rankers are trained with the same parameters on the same data. The
    Dask scores must correlate with the labels and with the local ranker's
    scores, and must match the scores of the Dask model converted with
    ``to_local()``.
    """
    X, y, w, g, dX, dy, dw, dg = _create_ranking_data(
        output=output,
        group=group
    )

    # rebalance small dask.array dataset for better performance.
    if output == 'array':
        dX, dy, dw, dg = (
            collection.persist() for collection in (dX, dy, dw, dg)
        )
        wait([dX, dy, dw, dg])
        client.rebalance()

    # use many trees + leaves to overfit, help ensure that dask data-parallel strategy matches that of
    # serial learner. See https://github.com/microsoft/LightGBM/issues/3292#issuecomment-671288210.
    params = {
        "random_state": 42,
        "n_estimators": 50,
        "num_leaves": 20,
        "min_child_samples": 1
    }

    # distributed training and prediction
    dask_ranker = lgb.DaskLGBMRanker(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        tree_learner_type='data_parallel',
        **params
    )
    dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg)
    rnkvec_dask = dask_ranker.predict(dX).compute()
    rnkvec_dask_local = dask_ranker.to_local().predict(X)

    # single-process reference model
    local_ranker = lgb.LGBMRanker(**params)
    local_ranker.fit(X, y, sample_weight=w, group=g)
    rnkvec_local = local_ranker.predict(X)

    # distributed ranker should be able to rank decently well and should
    # have high rank correlation with scores from serial ranker.
    corr_with_labels = spearmanr(rnkvec_dask, y).correlation
    assert corr_with_labels > 0.6
    corr_with_local = spearmanr(rnkvec_dask, rnkvec_local).correlation
    assert corr_with_local > 0.8
    assert_eq(rnkvec_dask, rnkvec_dask_local)

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
def test_ranker(output, client, listen_port, group):
    """Compare a distributed ``DaskLGBMRanker`` against a local ``LGBMRanker``.

    Both rankers are trained with the same parameters on the same data. The
    test checks that:

    * Dask scores correlate with the labels and with the local ranker's scores
    * Dask scores match those of the Dask model converted with ``to_local()``
    * ``pred_leaf=True`` predictions have one column per tree and contain
      valid zero-based leaf indices
    * for ``'dataframe-with-categorical'`` output, at least one split uses a
      categorical column and every such split is a categorical (``==``) split
    """
    data_kwargs = {'output': output, 'group': group}
    if output == 'dataframe-with-categorical':
        data_kwargs.update(n_features=1, n_informative=1)
    X, y, w, g, dX, dy, dw, dg = _create_ranking_data(**data_kwargs)

    # rebalance small dask.Array dataset for better performance.
    if output == 'array':
        dX = dX.persist()
        dy = dy.persist()
        dw = dw.persist()
        dg = dg.persist()
        _ = wait([dX, dy, dw, dg])
        client.rebalance()

    # use many trees + leaves to overfit, help ensure that Dask data-parallel strategy matches that of
    # serial learner. See https://github.com/microsoft/LightGBM/issues/3292#issuecomment-671288210.
    params = {
        "random_state": 42,
        "n_estimators": 50,
        "num_leaves": 20,
        "min_child_samples": 1
    }
    dask_ranker = lgb.DaskLGBMRanker(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        tree_learner_type='data_parallel',
        **params
    )
    dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg)
    rnkvec_dask = dask_ranker.predict(dX)
    rnkvec_dask = rnkvec_dask.compute()
    p1_pred_leaf = dask_ranker.predict(dX, pred_leaf=True)
    rnkvec_dask_local = dask_ranker.to_local().predict(X)

    local_ranker = lgb.LGBMRanker(**params)
    local_ranker.fit(X, y, sample_weight=w, group=g)
    rnkvec_local = local_ranker.predict(X)

    # distributed ranker should be able to rank decently well and should
    # have high rank correlation with scores from serial ranker.
    dcor = spearmanr(rnkvec_dask, y).correlation
    assert dcor > 0.6
    assert spearmanr(rnkvec_dask, rnkvec_local).correlation > 0.8
    assert_eq(rnkvec_dask, rnkvec_dask_local)

    # pred_leaf values should have the right shape
    # and values that look like valid tree nodes
    pred_leaf_vals = p1_pred_leaf.compute()
    assert pred_leaf_vals.shape == (
        X.shape[0],
        dask_ranker.booster_.num_trees()
    )
    # leaf indices are zero-based, so the largest valid index is num_leaves - 1
    assert np.max(pred_leaf_vals) < params['num_leaves']
    assert np.min(pred_leaf_vals) >= 0
    assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        cat_cols = [
            col for col in dX.columns
            if dX.dtypes[col].name == 'category'
        ]
        tree_df = dask_ranker.booster_.trees_to_dataframe()
        node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
        assert node_uses_cat_col.sum() > 0
        # check every split on a categorical column, not only the first
        # distinct decision type, so a mix of '==' and '<=' cannot pass
        cat_decision_types = tree_df.loc[node_uses_cat_col, "decision_type"]
        assert (cat_decision_types == '==').all()

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
def test_network_params_not_required_but_respected_if_given(
        client, task, output, listen_port):
    """Fit Dask estimators with and without explicit network parameters.

    Three models are trained on the same data:

    1. no network parameters: fitting works, and neither ``local_listen_port``
       nor ``machines`` shows up in ``get_params()``
    2. ``machines`` given: fitting works and ``machines`` is kept in the params
    3. ``local_listen_port`` given: fitting is expected to raise
       ``LightGBMError``

    The ranking task on ``'scipy_csr_matrix'`` output is skipped.
    """
    is_ranking = task == 'ranking'
    if is_ranking and output == 'scipy_csr_matrix':
        pytest.skip('LGBMRanker is not currently tested on sparse matrices')

    if is_ranking:
        _, _, _, _, dX, dy, _, dg = _create_ranking_data(
            output=output,
            group=None,
            chunk_size=10,
        )
        dask_model_factory = lgb.DaskLGBMRanker
    else:
        _, _, _, dX, dy, _ = _create_data(
            objective=task,
            output=output,
            chunk_size=10,
        )
        dg = None
        if task == 'classification':
            dask_model_factory = lgb.DaskLGBMClassifier
        elif task == 'regression':
            dask_model_factory = lgb.DaskLGBMRegressor

    # rebalance data to be sure that each worker has a piece of the data
    if output == 'array':
        client.rebalance()

    # only the ranker is given group information when fitting
    fit_kwargs = {'group': dg} if is_ranking else {}

    # model 1 - no network parameters given
    model_without_network_params = dask_model_factory(
        n_estimators=5,
        num_leaves=5,
    )
    model_without_network_params.fit(dX, dy, **fit_kwargs)
    assert model_without_network_params.fitted_
    params_without_network = model_without_network_params.get_params()
    assert 'local_listen_port' not in params_without_network
    assert 'machines' not in params_without_network

    # model 2 - machines given
    n_workers = len(client.scheduler_info()['workers'])
    open_ports = [lgb.dask._find_random_open_port() for _ in range(n_workers)]
    machines = ",".join("127.0.0.1:" + str(port) for port in open_ports)
    model_with_machines = dask_model_factory(
        n_estimators=5,
        num_leaves=5,
        machines=machines,
    )
    model_with_machines.fit(dX, dy, **fit_kwargs)
    assert model_with_machines.fitted_
    params_with_machines = model_with_machines.get_params()
    assert 'local_listen_port' not in params_with_machines
    assert 'machines' in params_with_machines

    # model 3 - local_listen_port given
    # training should fail because LightGBM will try to use the same
    # port for multiple worker processes on the same machine
    model_with_port = dask_model_factory(
        n_estimators=5,
        num_leaves=5,
        local_listen_port=listen_port
    )
    expected_error = "has multiple Dask worker processes running on it"
    with pytest.raises(lgb.basic.LightGBMError, match=expected_error):
        model_with_port.fit(dX, dy, **fit_kwargs)

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)