def test_regressor(output, client):
    X, y, w, _, dX, dy, dw, _ = _create_data(
        objective='regression',
        output=output
    )

    params = {
        "random_state": 42,
        "num_leaves": 31,
        "n_estimators": 20,
    }

    dask_regressor = lgb.DaskLGBMRegressor(
        client=client,
        time_out=5,
        tree='data',
        **params
    )
    dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
    p1 = dask_regressor.predict(dX)
    p1_pred_leaf = dask_regressor.predict(dX, pred_leaf=True)

    s1 = _r2_score(dy, p1)
    p1 = p1.compute()
    p1_local = dask_regressor.to_local().predict(X)
    s1_local = dask_regressor.to_local().score(X, y)

    local_regressor = lgb.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    s2 = local_regressor.score(X, y)
    p2 = local_regressor.predict(X)

    # Scores should be the same
    assert_eq(s1, s2, atol=0.01)
    assert_eq(s1, s1_local)

    # Predictions should be roughly the same.
    assert_eq(p1, p1_local)

    # pred_leaf values should have the right shape
    # and values that look like valid tree nodes
    pred_leaf_vals = p1_pred_leaf.compute()
    assert pred_leaf_vals.shape == (
        X.shape[0],
        dask_regressor.booster_.num_trees()
    )
    assert np.max(pred_leaf_vals) <= params['num_leaves']
    assert np.min(pred_leaf_vals) >= 0
    assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']

    assert_eq(p1, y, rtol=0.5, atol=50.)
    assert_eq(p2, y, rtol=0.5, atol=50.)

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        cat_cols = [
            col for col in dX.columns
            if dX.dtypes[col].name == 'category'
        ]
        tree_df = dask_regressor.booster_.trees_to_dataframe()
        node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
        assert node_uses_cat_col.sum() > 0
        assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)

def test_regressor_quantile(output, client, listen_port, alpha):
    X, y, w, dX, dy, dw = _create_data(
        objective='regression',
        output=output
    )

    params = {
        "objective": "quantile",
        "alpha": alpha,
        "random_state": 42,
        "n_estimators": 10,
        "num_leaves": 10
    }

    dask_regressor = dlgbm.DaskLGBMRegressor(
        local_listen_port=listen_port,
        tree_learner_type='data_parallel',
        **params
    )
    dask_regressor = dask_regressor.fit(dX, dy, client=client, sample_weight=dw)
    p1 = dask_regressor.predict(dX).compute()
    q1 = np.count_nonzero(y < p1) / y.shape[0]

    local_regressor = lightgbm.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    p2 = local_regressor.predict(X)
    q2 = np.count_nonzero(y < p2) / y.shape[0]

    # Quantiles should be right
    np.testing.assert_allclose(q1, alpha, atol=0.2)
    np.testing.assert_allclose(q2, alpha, atol=0.2)

    client.close()

def test_regressor(output, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(
        objective='regression',
        output=output
    )

    dask_regressor = dlgbm.DaskLGBMRegressor(
        time_out=5,
        local_listen_port=listen_port,
        seed=42,
        num_leaves=10,
        tree='data'
    )
    dask_regressor = dask_regressor.fit(dX, dy, client=client, sample_weight=dw)
    p1 = dask_regressor.predict(dX)
    if output != 'dataframe':
        s1 = r2_score(dy, p1)
    p1 = p1.compute()

    local_regressor = lightgbm.LGBMRegressor(seed=42, num_leaves=10)
    local_regressor.fit(X, y, sample_weight=w)
    s2 = local_regressor.score(X, y)
    p2 = local_regressor.predict(X)

    # Scores should be the same
    if output != 'dataframe':
        assert_eq(s1, s2, atol=.01)

    # Predictions should be roughly the same
    assert_eq(y, p1, rtol=1., atol=100.)
    assert_eq(y, p2, rtol=1., atol=50.)

    client.close()

def test_regressor_quantile(output, client, listen_port, alpha):
    X, y, w, dX, dy, dw = _create_data(
        objective='regression',
        output=output
    )

    dask_regressor = dlgbm.DaskLGBMRegressor(
        local_listen_port=listen_port,
        seed=42,
        objective='quantile',
        alpha=alpha,
        n_estimators=10,
        num_leaves=10,
        tree_learner_type='data_parallel'
    )
    dask_regressor = dask_regressor.fit(dX, dy, client=client, sample_weight=dw)
    p1 = dask_regressor.predict(dX).compute()
    q1 = np.count_nonzero(y < p1) / y.shape[0]

    local_regressor = lightgbm.LGBMRegressor(
        seed=42,
        objective='quantile',
        alpha=alpha,
        n_estimators=10,
        num_leaves=10
    )
    local_regressor.fit(X, y, sample_weight=w)
    p2 = local_regressor.predict(X)
    q2 = np.count_nonzero(y < p2) / y.shape[0]

    # Quantiles should be right
    np.testing.assert_allclose(q1, alpha, atol=0.2)
    np.testing.assert_allclose(q2, alpha, atol=0.2)

    client.close()

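# Several tests below look up estimators through ``task_to_dask_factory`` /
# ``task_to_local_factory`` without those mappings being shown in this file.
# A minimal sketch of what they presumably look like, assuming the task names
# the tests pass in ('regression', 'binary-classification',
# 'multiclass-classification', 'ranking'); the exact definitions may differ:
task_to_dask_factory = {
    'regression': lgb.DaskLGBMRegressor,
    'binary-classification': lgb.DaskLGBMClassifier,
    'multiclass-classification': lgb.DaskLGBMClassifier,
    'ranking': lgb.DaskLGBMRanker
}
task_to_local_factory = {
    'regression': lgb.LGBMRegressor,
    'binary-classification': lgb.LGBMClassifier,
    'multiclass-classification': lgb.LGBMClassifier,
    'ranking': lgb.LGBMRanker
}
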
def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(
        task,
        client,
):
    _, _, _, _, dX, dy, dw, dg = _create_data(
        objective=task,
        output='dataframe',
        group=None
    )

    model_factory = task_to_dask_factory[task]

    dy = dy.to_dask_array(lengths=True)
    dy_col_array = dy.reshape(-1, 1)
    assert len(dy_col_array.shape) == 2 and dy_col_array.shape[1] == 1

    params = {
        'n_estimators': 1,
        'num_leaves': 3,
        'random_state': 0,
        'time_out': 5
    }
    model = model_factory(**params)
    model.fit(dX, dy_col_array, sample_weight=dw, group=dg)
    assert model.fitted_

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)

def test_predict_with_raw_score(task, output, client):
    if task == 'ranking' and output == 'scipy_csr_matrix':
        pytest.skip('LGBMRanker is not currently tested on sparse matrices')

    _, _, _, _, dX, dy, _, dg = _create_data(
        objective=task,
        output=output,
        group=None
    )

    model_factory = task_to_dask_factory[task]
    params = {
        'client': client,
        'n_estimators': 1,
        'num_leaves': 2,
        'time_out': 5,
        'min_sum_hessian': 0
    }
    model = model_factory(**params)
    model.fit(dX, dy, group=dg)
    raw_predictions = model.predict(dX, raw_score=True).compute()

    trees_df = model.booster_.trees_to_dataframe()
    # with num_leaves=2 each tree is a single split, so every
    # leaf sits at node_depth == 2
    leaves_df = trees_df[trees_df.node_depth == 2]
    if task == 'multiclass-classification':
        for i in range(model.n_classes_):
            class_df = leaves_df[leaves_df.tree_index == i]
            assert set(raw_predictions[:, i]) == set(class_df['value'])
    else:
        assert set(raw_predictions) == set(leaves_df['value'])

    if task.endswith('classification'):
        pred_proba_raw = model.predict_proba(dX, raw_score=True).compute()
        assert_eq(raw_predictions, pred_proba_raw)

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)

def test_classifier(output, centers, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(
        objective='classification',
        output=output,
        centers=centers
    )

    dask_classifier = dlgbm.DaskLGBMClassifier(
        time_out=5,
        local_listen_port=listen_port,
        n_estimators=10,
        num_leaves=10
    )
    dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw, client=client)
    p1 = dask_classifier.predict(dX)
    p1_proba = dask_classifier.predict_proba(dX).compute()
    s1 = accuracy_score(dy, p1)
    p1 = p1.compute()

    local_classifier = lightgbm.LGBMClassifier(n_estimators=10, num_leaves=10)
    local_classifier.fit(X, y, sample_weight=w)
    p2 = local_classifier.predict(X)
    p2_proba = local_classifier.predict_proba(X)
    s2 = local_classifier.score(X, y)

    assert_eq(s1, s2)
    assert_eq(p1, p2)
    assert_eq(y, p1)
    assert_eq(y, p2)
    assert_eq(p1_proba, p2_proba, atol=0.3)

    client.close()

def test_init_score(task, output, client):
    if task == 'ranking' and output == 'scipy_csr_matrix':
        pytest.skip('LGBMRanker is not currently tested on sparse matrices')

    _, _, _, _, dX, dy, dw, dg = _create_data(
        objective=task,
        output=output,
        group=None
    )

    model_factory = task_to_dask_factory[task]

    params = {
        'n_estimators': 1,
        'num_leaves': 2,
        'time_out': 5
    }
    init_score = random.random()
    # init_scores must be a 1D array, even for multiclass classification
    # where you need to provide 1 score per class for each row in X
    # https://github.com/microsoft/LightGBM/issues/4046
    size_factor = 1
    if task == 'multiclass-classification':
        size_factor = 3  # number of classes

    if output.startswith('dataframe'):
        init_scores = dy.map_partitions(lambda x: pd.Series([init_score] * x.size * size_factor))
    else:
        init_scores = dy.map_blocks(lambda x: np.repeat(init_score, x.size * size_factor))

    model = model_factory(client=client, **params)
    model.fit(dX, dy, sample_weight=dw, init_score=init_scores, group=dg)
    # value of the root node is 0 when init_score is set
    assert model.booster_.trees_to_dataframe()['value'][0] == 0

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)

def test_training_works_if_client_not_provided_or_set_after_construction(
        task,
        client
):
    _, _, _, _, dX, dy, _, dg = _create_data(
        objective=task,
        output='array',
        group=None
    )
    model_factory = task_to_dask_factory[task]
    params = {
        "time_out": 5,
        "n_estimators": 1,
        "num_leaves": 2
    }

    # should be able to use the class without specifying a client
    dask_model = model_factory(**params)
    assert dask_model.client is None
    with pytest.raises(
            lgb.compat.LGBMNotFittedError,
            match='Cannot access property client_ before calling fit'):
        dask_model.client_

    dask_model.fit(dX, dy, group=dg)
    assert dask_model.fitted_
    assert dask_model.client is None
    assert dask_model.client_ == client

    preds = dask_model.predict(dX)
    assert isinstance(preds, da.Array)
    assert dask_model.fitted_
    assert dask_model.client is None
    assert dask_model.client_ == client

    local_model = dask_model.to_local()
    with pytest.raises(AttributeError):
        local_model.client
        local_model.client_

    # should be able to set client after construction
    dask_model = model_factory(**params)
    dask_model.set_params(client=client)
    assert dask_model.client == client
    with pytest.raises(
            lgb.compat.LGBMNotFittedError,
            match='Cannot access property client_ before calling fit'):
        dask_model.client_

    dask_model.fit(dX, dy, group=dg)
    assert dask_model.fitted_
    assert dask_model.client == client
    assert dask_model.client_ == client

    preds = dask_model.predict(dX)
    assert isinstance(preds, da.Array)
    assert dask_model.fitted_
    assert dask_model.client == client
    assert dask_model.client_ == client

    local_model = dask_model.to_local()
    with pytest.raises(AttributeError):
        local_model.client
        local_model.client_

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)

def test_classifier(output, centers, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(
        objective='classification',
        output=output,
        centers=centers
    )

    params = {
        "n_estimators": 10,
        "num_leaves": 10
    }
    if output == 'dataframe-with-categorical':
        params["categorical_feature"] = [
            i for i, col in enumerate(dX.columns)
            if col.startswith('cat_')
        ]
    dask_classifier = lgb.DaskLGBMClassifier(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        **params
    )
    dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
    p1 = dask_classifier.predict(dX)
    p1_proba = dask_classifier.predict_proba(dX).compute()
    p1_pred_leaf = dask_classifier.predict(dX, pred_leaf=True)
    p1_local = dask_classifier.to_local().predict(X)
    s1 = _accuracy_score(dy, p1)
    p1 = p1.compute()

    local_classifier = lgb.LGBMClassifier(**params)
    local_classifier.fit(X, y, sample_weight=w)
    p2 = local_classifier.predict(X)
    p2_proba = local_classifier.predict_proba(X)
    s2 = local_classifier.score(X, y)

    assert_eq(s1, s2)
    assert_eq(p1, p2)
    assert_eq(y, p1)
    assert_eq(y, p2)
    assert_eq(p1_proba, p2_proba, atol=0.3)
    assert_eq(p1_local, p2)
    assert_eq(y, p1_local)

    # pred_leaf values should have the right shape
    # and values that look like valid tree nodes
    pred_leaf_vals = p1_pred_leaf.compute()
    assert pred_leaf_vals.shape == (
        X.shape[0],
        dask_classifier.booster_.num_trees()
    )
    assert np.max(pred_leaf_vals) <= params['num_leaves']
    assert np.min(pred_leaf_vals) >= 0
    assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        cat_cols = [
            col for col in dX.columns
            if dX.dtypes[col].name == 'category'
        ]
        tree_df = dask_classifier.booster_.trees_to_dataframe()
        node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
        assert node_uses_cat_col.sum() > 0
        assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)

def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(
        task,
        client,
):
    if task == 'ranking':
        _, _, _, _, dX, dy, dw, dg = _create_ranking_data(
            output='dataframe',
            group=None
        )
        model_factory = lgb.DaskLGBMRanker
    else:
        _, _, _, dX, dy, dw = _create_data(
            objective=task,
            output='dataframe',
        )
        dg = None
        if task == 'classification':
            model_factory = lgb.DaskLGBMClassifier
        elif task == 'regression':
            model_factory = lgb.DaskLGBMRegressor

    dy = dy.to_dask_array(lengths=True)
    dy_col_array = dy.reshape(-1, 1)
    assert len(dy_col_array.shape) == 2 and dy_col_array.shape[1] == 1

    params = {
        'n_estimators': 1,
        'num_leaves': 3,
        'random_state': 0,
        'time_out': 5
    }
    model = model_factory(**params)
    model.fit(dX, dy_col_array, sample_weight=dw, group=dg)
    assert model.fitted_

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)

def test_regressor_pred_contrib(output, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(
        objective='regression',
        output=output
    )

    params = {
        "n_estimators": 10,
        "num_leaves": 10
    }
    dask_regressor = lgb.DaskLGBMRegressor(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        tree_learner='data',
        **params
    )
    dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
    preds_with_contrib = dask_regressor.predict(dX, pred_contrib=True).compute()

    local_regressor = lgb.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    local_preds_with_contrib = local_regressor.predict(X, pred_contrib=True)

    if output == "scipy_csr_matrix":
        preds_with_contrib = np.array(preds_with_contrib.todense())

    # contrib outputs for distributed training are different than from local training,
    # so we can just test that the output has the right shape and base values
    # are in the right position
    num_features = dX.shape[1]
    assert preds_with_contrib.shape[1] == num_features + 1
    assert preds_with_contrib.shape == local_preds_with_contrib.shape

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)

def test_regressor(output, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(
        objective='regression',
        output=output
    )

    params = {
        "random_state": 42,
        "num_leaves": 10
    }
    dask_regressor = lgb.DaskLGBMRegressor(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        tree='data',
        **params
    )
    dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
    p1 = dask_regressor.predict(dX)
    if output != 'dataframe':
        s1 = _r2_score(dy, p1)
    p1 = p1.compute()
    p1_local = dask_regressor.to_local().predict(X)
    s1_local = dask_regressor.to_local().score(X, y)

    local_regressor = lgb.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    s2 = local_regressor.score(X, y)
    p2 = local_regressor.predict(X)

    # Scores should be the same
    if output != 'dataframe':
        assert_eq(s1, s2, atol=.01)
        assert_eq(s1, s1_local, atol=.003)

    # Predictions should be roughly the same
    assert_eq(y, p1, rtol=1., atol=100.)
    assert_eq(y, p2, rtol=1., atol=50.)
    assert_eq(p1, p1_local)

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)

def test_classifier(output, centers, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(
        objective='classification',
        output=output,
        centers=centers
    )

    params = {
        "n_estimators": 10,
        "num_leaves": 10
    }
    dask_classifier = lgb.DaskLGBMClassifier(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        **params
    )
    dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
    p1 = dask_classifier.predict(dX)
    p1_proba = dask_classifier.predict_proba(dX).compute()
    p1_local = dask_classifier.to_local().predict(X)
    s1 = _accuracy_score(dy, p1)
    p1 = p1.compute()

    local_classifier = lgb.LGBMClassifier(**params)
    local_classifier.fit(X, y, sample_weight=w)
    p2 = local_classifier.predict(X)
    p2_proba = local_classifier.predict_proba(X)
    s2 = local_classifier.score(X, y)

    assert_eq(s1, s2)
    assert_eq(p1, p2)
    assert_eq(y, p1)
    assert_eq(y, p2)
    assert_eq(p1_proba, p2_proba, atol=0.3)
    assert_eq(p1_local, p2)
    assert_eq(y, p1_local)

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)

def test_classifier_pred_contrib(output, centers, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(
        objective='classification',
        output=output,
        centers=centers
    )

    params = {
        "n_estimators": 10,
        "num_leaves": 10
    }
    dask_classifier = lgb.DaskLGBMClassifier(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        tree_learner='data',
        **params
    )
    dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
    preds_with_contrib = dask_classifier.predict(dX, pred_contrib=True).compute()

    local_classifier = lgb.LGBMClassifier(**params)
    local_classifier.fit(X, y, sample_weight=w)
    local_preds_with_contrib = local_classifier.predict(X, pred_contrib=True)

    if output == 'scipy_csr_matrix':
        preds_with_contrib = np.array(preds_with_contrib.todense())

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        cat_cols = [
            col for col in dX.columns
            if dX.dtypes[col].name == 'category'
        ]
        tree_df = dask_classifier.booster_.trees_to_dataframe()
        node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
        assert node_uses_cat_col.sum() > 0
        assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='

    # shape depends on whether it is binary or multiclass classification
    num_features = dask_classifier.n_features_
    num_classes = dask_classifier.n_classes_
    if num_classes == 2:
        expected_num_cols = num_features + 1
    else:
        expected_num_cols = (num_features + 1) * num_classes

    # * shape depends on whether it is binary or multiclass classification
    # * matrix for binary classification is of the form [feature_contrib, base_value],
    #   for multi-class it's [feat_contrib_class1, base_value_class1, feat_contrib_class2, base_value_class2, etc.]
    # * contrib outputs for distributed training are different than from local training, so we can just test
    #   that the output has the right shape and base values are in the right position
    assert preds_with_contrib.shape[1] == expected_num_cols
    assert preds_with_contrib.shape == local_preds_with_contrib.shape

    # each base value column should be constant across rows
    if num_classes == 2:
        assert len(np.unique(preds_with_contrib[:, num_features])) == 1
    else:
        for i in range(num_classes):
            base_value_col = num_features * (i + 1) + i
            assert len(np.unique(preds_with_contrib[:, base_value_col])) == 1

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)

def test_network_params_not_required_but_respected_if_given(client, task, listen_port):
    client.wait_for_workers(2)

    _, _, _, _, dX, dy, _, dg = _create_data(
        objective=task,
        output='array',
        chunk_size=10,
        group=None
    )

    dask_model_factory = task_to_dask_factory[task]

    # rebalance data to be sure that each worker has a piece of the data
    client.rebalance()

    # model 1 - no network parameters given
    dask_model1 = dask_model_factory(
        n_estimators=5,
        num_leaves=5,
    )
    dask_model1.fit(dX, dy, group=dg)
    assert dask_model1.fitted_
    params = dask_model1.get_params()
    assert 'local_listen_port' not in params
    assert 'machines' not in params

    # model 2 - machines given
    n_workers = len(client.scheduler_info()['workers'])
    open_ports = [lgb.dask._find_random_open_port() for _ in range(n_workers)]
    dask_model2 = dask_model_factory(
        n_estimators=5,
        num_leaves=5,
        machines=",".join([
            "127.0.0.1:" + str(port)
            for port in open_ports
        ]),
    )
    dask_model2.fit(dX, dy, group=dg)
    assert dask_model2.fitted_
    params = dask_model2.get_params()
    assert 'local_listen_port' not in params
    assert 'machines' in params

    # model 3 - local_listen_port given
    # training should fail because LightGBM will try to use the same
    # port for multiple worker processes on the same machine
    dask_model3 = dask_model_factory(
        n_estimators=5,
        num_leaves=5,
        local_listen_port=listen_port
    )
    error_msg = "has multiple Dask worker processes running on it"
    with pytest.raises(lgb.basic.LightGBMError, match=error_msg):
        dask_model3.fit(dX, dy, group=dg)

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)

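# The ``machines`` parameter used above is a comma-delimited list of ``ip:port``
# pairs, one per worker process. The test hard-codes 127.0.0.1 because it runs on
# a LocalCluster. A hedged sketch of how such a string could be built against an
# arbitrary Dask cluster (``_build_machines`` is a hypothetical helper, not part
# of lightgbm.dask):
from urllib.parse import urlparse


def _build_machines(client, ports):
    """Pair each Dask worker's host with a pre-chosen open port."""
    # scheduler_info()['workers'] is keyed by worker address, e.g. 'tcp://10.0.0.1:34567'
    worker_hosts = [
        urlparse(address).hostname
        for address in client.scheduler_info()['workers']
    ]
    return ",".join(
        "%s:%d" % (host, port)
        for host, port in zip(worker_hosts, ports)
    )
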
def test_training_succeeds_even_if_some_workers_do_not_have_any_data(client, task, output):
    if task == 'ranking' and output == 'scipy_csr_matrix':
        pytest.skip('LGBMRanker is not currently tested on sparse matrices')

    def collection_to_single_partition(collection):
        """Merge the parts of a Dask collection into a single partition."""
        if collection is None:
            return
        if isinstance(collection, da.Array):
            return collection.rechunk(*collection.shape)
        return collection.repartition(npartitions=1)

    if task == 'ranking':
        X, y, w, g, dX, dy, dw, dg = _create_ranking_data(
            output=output,
            group=None
        )
    else:
        X, y, w, dX, dy, dw = _create_data(
            objective=task,
            output=output
        )
        g = None
        dg = None

    dask_model_factory = task_to_dask_factory[task]
    local_model_factory = task_to_local_factory[task]

    dX = collection_to_single_partition(dX)
    dy = collection_to_single_partition(dy)
    dw = collection_to_single_partition(dw)
    dg = collection_to_single_partition(dg)

    n_workers = len(client.scheduler_info()['workers'])
    assert n_workers > 1
    assert dX.npartitions == 1

    params = {
        'time_out': 5,
        'random_state': 42,
        'num_leaves': 10
    }

    dask_model = dask_model_factory(tree='data', client=client, **params)
    dask_model.fit(dX, dy, group=dg, sample_weight=dw)
    dask_preds = dask_model.predict(dX).compute()

    local_model = local_model_factory(**params)
    if task == 'ranking':
        local_model.fit(X, y, group=g, sample_weight=w)
    else:
        local_model.fit(X, y, sample_weight=w)
    local_preds = local_model.predict(X)

    assert_eq(dask_preds, local_preds)

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)

def test_find_random_open_port(client):
    for _ in range(5):
        worker_address_to_port = client.run(lgb.dask._find_random_open_port)
        found_ports = worker_address_to_port.values()
        # check that found ports are different for same address (LocalCluster)
        assert len(set(found_ports)) == len(found_ports)
        # check that the ports are indeed open
        for port in found_ports:
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.bind(('', port))

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)

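# ``lgb.dask._find_random_open_port`` is private, so its implementation is not
# shown in this file. A minimal sketch of the general technique it presumably
# relies on (bind to port 0 and let the OS pick a free ephemeral port); not
# necessarily lightgbm.dask's exact implementation:
def find_random_open_port_sketch() -> int:
    """Ask the OS for a currently-unused TCP port."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(('', 0))  # port 0 => OS assigns a free ephemeral port
        port = s.getsockname()[1]
    # note: the port could be taken by another process between now and
    # when LightGBM binds it, which is why the tests retry training
    return port
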
def test_classifier_pred_contrib(output, centers, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(
        objective='classification',
        output=output,
        centers=centers
    )

    params = {
        "n_estimators": 10,
        "num_leaves": 10
    }
    dask_classifier = lgb.DaskLGBMClassifier(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        tree_learner='data',
        **params
    )
    dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
    preds_with_contrib = dask_classifier.predict(dX, pred_contrib=True).compute()

    local_classifier = lgb.LGBMClassifier(**params)
    local_classifier.fit(X, y, sample_weight=w)
    local_preds_with_contrib = local_classifier.predict(X, pred_contrib=True)

    if output == 'scipy_csr_matrix':
        preds_with_contrib = np.array(preds_with_contrib.todense())

    # shape depends on whether it is binary or multiclass classification
    num_features = dask_classifier.n_features_
    num_classes = dask_classifier.n_classes_
    if num_classes == 2:
        expected_num_cols = num_features + 1
    else:
        expected_num_cols = (num_features + 1) * num_classes

    # * shape depends on whether it is binary or multiclass classification
    # * matrix for binary classification is of the form [feature_contrib, base_value],
    #   for multi-class it's [feat_contrib_class1, base_value_class1, feat_contrib_class2, base_value_class2, etc.]
    # * contrib outputs for distributed training are different than from local training, so we can just test
    #   that the output has the right shape and base values are in the right position
    assert preds_with_contrib.shape[1] == expected_num_cols
    assert preds_with_contrib.shape == local_preds_with_contrib.shape

    # each base value column should be constant across rows
    if num_classes == 2:
        assert len(np.unique(preds_with_contrib[:, num_features])) == 1
    else:
        for i in range(num_classes):
            base_value_col = num_features * (i + 1) + i
            assert len(np.unique(preds_with_contrib[:, base_value_col])) == 1

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)

def test_training_does_not_fail_on_port_conflicts(client):
    _, _, _, dX, dy, dw = _create_data('classification', output='array')

    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(('127.0.0.1', 12400))

        dask_classifier = dlgbm.DaskLGBMClassifier(
            time_out=5,
            local_listen_port=12400,
            n_estimators=5,
            num_leaves=5
        )
        for _ in range(5):
            dask_classifier.fit(
                X=dX,
                y=dy,
                sample_weight=dw,
                client=client
            )
            assert dask_classifier.booster_

    client.close()

def test_ranker(output, client, listen_port, group):
    X, y, w, g, dX, dy, dw, dg = _create_ranking_data(
        output=output,
        group=group
    )

    # rebalance small dask.array dataset for better performance.
    if output == 'array':
        dX = dX.persist()
        dy = dy.persist()
        dw = dw.persist()
        dg = dg.persist()
        _ = wait([dX, dy, dw, dg])
        client.rebalance()

    # use many trees + leaves to overfit, help ensure that dask data-parallel strategy matches that of
    # serial learner. See https://github.com/microsoft/LightGBM/issues/3292#issuecomment-671288210.
    params = {
        "random_state": 42,
        "n_estimators": 50,
        "num_leaves": 20,
        "min_child_samples": 1
    }
    dask_ranker = lgb.DaskLGBMRanker(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        tree_learner_type='data_parallel',
        **params
    )
    dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg)
    rnkvec_dask = dask_ranker.predict(dX)
    rnkvec_dask = rnkvec_dask.compute()
    rnkvec_dask_local = dask_ranker.to_local().predict(X)

    local_ranker = lgb.LGBMRanker(**params)
    local_ranker.fit(X, y, sample_weight=w, group=g)
    rnkvec_local = local_ranker.predict(X)

    # distributed ranker should be able to rank decently well and should
    # have high rank correlation with scores from serial ranker.
    dcor = spearmanr(rnkvec_dask, y).correlation
    assert dcor > 0.6
    assert spearmanr(rnkvec_dask, rnkvec_local).correlation > 0.8
    assert_eq(rnkvec_dask, rnkvec_dask_local)

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)

def test_warns_and_continues_on_unrecognized_tree_learner(client):
    X = da.random.random((1e3, 10))
    y = da.random.random((1e3, 1))
    dask_regressor = lgb.DaskLGBMRegressor(
        client=client,
        time_out=5,
        tree_learner='some-nonsense-value',
        n_estimators=1,
        num_leaves=2
    )
    with pytest.warns(UserWarning, match='Parameter tree_learner set to some-nonsense-value'):
        dask_regressor = dask_regressor.fit(X, y)

    assert dask_regressor.fitted_

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)

def test_regressor_pred_contrib(output, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(
        objective='regression',
        output=output
    )

    params = {
        "n_estimators": 10,
        "num_leaves": 10
    }
    dask_regressor = lgb.DaskLGBMRegressor(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        tree_learner='data',
        **params
    )
    dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
    preds_with_contrib = dask_regressor.predict(dX, pred_contrib=True).compute()

    local_regressor = lgb.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    local_preds_with_contrib = local_regressor.predict(X, pred_contrib=True)

    if output == "scipy_csr_matrix":
        preds_with_contrib = np.array(preds_with_contrib.todense())

    # contrib outputs for distributed training are different than from local training,
    # so we can just test that the output has the right shape and base values
    # are in the right position
    num_features = dX.shape[1]
    assert preds_with_contrib.shape[1] == num_features + 1
    assert preds_with_contrib.shape == local_preds_with_contrib.shape

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        cat_cols = [
            col for col in dX.columns
            if dX.dtypes[col].name == 'category'
        ]
        tree_df = dask_regressor.booster_.trees_to_dataframe()
        node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
        assert node_uses_cat_col.sum() > 0
        assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)

def test_regressor_quantile(output, client, listen_port, alpha):
    X, y, w, dX, dy, dw = _create_data(
        objective='regression',
        output=output
    )

    params = {
        "objective": "quantile",
        "alpha": alpha,
        "random_state": 42,
        "n_estimators": 10,
        "num_leaves": 10
    }
    dask_regressor = lgb.DaskLGBMRegressor(
        client=client,
        local_listen_port=listen_port,
        tree_learner_type='data_parallel',
        **params
    )
    dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
    p1 = dask_regressor.predict(dX).compute()
    q1 = np.count_nonzero(y < p1) / y.shape[0]

    local_regressor = lgb.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    p2 = local_regressor.predict(X)
    q2 = np.count_nonzero(y < p2) / y.shape[0]

    # Quantiles should be right
    np.testing.assert_allclose(q1, alpha, atol=0.2)
    np.testing.assert_allclose(q2, alpha, atol=0.2)

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        cat_cols = [
            col for col in dX.columns
            if dX.dtypes[col].name == 'category'
        ]
        tree_df = dask_regressor.booster_.trees_to_dataframe()
        node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
        assert node_uses_cat_col.sum() > 0
        assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)

def test_warns_but_makes_no_changes_for_feature_or_voting_tree_learner(client):
    X = da.random.random((1e3, 10))
    y = da.random.random((1e3, 1))
    for tree_learner in ['feature_parallel', 'voting']:
        dask_regressor = lgb.DaskLGBMRegressor(
            client=client,
            time_out=5,
            tree_learner=tree_learner,
            n_estimators=1,
            num_leaves=2
        )
        with pytest.warns(UserWarning, match='Support for tree_learner %s in lightgbm' % tree_learner):
            dask_regressor = dask_regressor.fit(X, y)

        assert dask_regressor.fitted_
        assert dask_regressor.get_params()['tree_learner'] == tree_learner

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)

def test_training_does_not_fail_on_port_conflicts(client):
    _, _, _, dX, dy, dw = _create_data('classification', output='array')

    lightgbm_default_port = 12400
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(('127.0.0.1', lightgbm_default_port))

        dask_classifier = lgb.DaskLGBMClassifier(
            client=client,
            time_out=5,
            n_estimators=5,
            num_leaves=5
        )
        for _ in range(5):
            dask_classifier.fit(
                X=dX,
                y=dy,
                sample_weight=dw,
            )
            assert dask_classifier.booster_

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)

def test_ranker_local_predict(output, client, listen_port, group):
    X, y, w, g, dX, dy, dw, dg = _create_ranking_data(
        output=output,
        group=group
    )

    dask_ranker = dlgbm.DaskLGBMRanker(
        time_out=5,
        local_listen_port=listen_port,
        tree_learner='data',
        n_estimators=10,
        num_leaves=10,
        seed=42,
        min_child_samples=1
    )
    dask_ranker = dask_ranker.fit(dX, dy, group=dg, client=client)
    rnkvec_dask = dask_ranker.predict(dX)
    rnkvec_dask = rnkvec_dask.compute()

    rnkvec_local = dask_ranker.to_local().predict(X)

    # distributed and to-local scores should be the same.
    assert_eq(rnkvec_dask, rnkvec_local)

    client.close()

def test_init_score(task, output, client):
    if task == 'ranking' and output == 'scipy_csr_matrix':
        pytest.skip('LGBMRanker is not currently tested on sparse matrices')

    if task == 'ranking':
        _, _, _, _, dX, dy, dw, dg = _create_ranking_data(
            output=output,
            group=None
        )
        model_factory = lgb.DaskLGBMRanker
    else:
        _, _, _, dX, dy, dw = _create_data(
            objective=task,
            output=output,
        )
        dg = None
        if task == 'classification':
            model_factory = lgb.DaskLGBMClassifier
        elif task == 'regression':
            model_factory = lgb.DaskLGBMRegressor

    params = {
        'n_estimators': 1,
        'num_leaves': 2,
        'time_out': 5
    }

    init_score = random.random()
    if output.startswith('dataframe'):
        init_scores = dy.map_partitions(lambda x: pd.Series([init_score] * x.size))
    else:
        init_scores = da.full_like(dy, fill_value=init_score, dtype=np.float64)

    model = model_factory(client=client, **params)
    model.fit(dX, dy, sample_weight=dw, init_score=init_scores, group=dg)
    # value of the root node is 0 when init_score is set
    assert model.booster_.trees_to_dataframe()['value'][0] == 0

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)

def test_classifier_local_predict(client, listen_port):
    X, y, w, dX, dy, dw = _create_data(
        objective='classification',
        output='array'
    )

    dask_classifier = dlgbm.DaskLGBMClassifier(
        time_out=5,
        local_listen_port=listen_port,
        n_estimators=10,
        num_leaves=10
    )
    dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw, client=client)
    p1 = dask_classifier.to_local().predict(dX)

    local_classifier = lightgbm.LGBMClassifier(n_estimators=10, num_leaves=10)
    local_classifier.fit(X, y, sample_weight=w)
    p2 = local_classifier.predict(X)

    assert_eq(p1, p2)
    assert_eq(y, p1)
    assert_eq(y, p2)

    client.close()

def test_regressor_local_predict(client, listen_port):
    X, y, _, dX, dy, dw = _create_data('regression', output='array')

    dask_regressor = dlgbm.DaskLGBMRegressor(
        local_listen_port=listen_port,
        seed=42,
        n_estimators=10,
        num_leaves=10,
        tree_type='data'
    )
    dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw, client=client)
    p1 = dask_regressor.predict(dX)
    p2 = dask_regressor.to_local().predict(X)
    s1 = r2_score(dy, p1)
    p1 = p1.compute()
    s2 = dask_regressor.to_local().score(X, y)

    # Predictions and scores should be the same
    assert_eq(p1, p2)
    assert_eq(s1, s2)

    client.close()