Example #1
def test_regressor(output, client):
    X, y, w, _, dX, dy, dw, _ = _create_data(objective='regression',
                                             output=output)

    params = {
        "random_state": 42,
        "num_leaves": 31,
        "n_estimators": 20,
    }

    dask_regressor = lgb.DaskLGBMRegressor(client=client,
                                           time_out=5,
                                           tree='data',
                                           **params)
    dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
    p1 = dask_regressor.predict(dX)
    p1_pred_leaf = dask_regressor.predict(dX, pred_leaf=True)

    s1 = _r2_score(dy, p1)
    p1 = p1.compute()
    p1_local = dask_regressor.to_local().predict(X)
    s1_local = dask_regressor.to_local().score(X, y)

    local_regressor = lgb.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    s2 = local_regressor.score(X, y)
    p2 = local_regressor.predict(X)

    # Scores should be the same
    assert_eq(s1, s2, atol=0.01)
    assert_eq(s1, s1_local)

    # Predictions should be roughly the same.
    assert_eq(p1, p1_local)

    # pred_leaf values should have the right shape
    # and values that look like valid tree nodes
    pred_leaf_vals = p1_pred_leaf.compute()
    assert pred_leaf_vals.shape == (X.shape[0],
                                    dask_regressor.booster_.num_trees())
    assert np.max(pred_leaf_vals) <= params['num_leaves']
    assert np.min(pred_leaf_vals) >= 0
    assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']

    assert_eq(p1, y, rtol=0.5, atol=50.)
    assert_eq(p2, y, rtol=0.5, atol=50.)

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        cat_cols = [
            col for col in dX.columns if dX.dtypes[col].name == 'category'
        ]
        tree_df = dask_regressor.booster_.trees_to_dataframe()
        node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
        assert node_uses_cat_col.sum() > 0
        assert tree_df.loc[node_uses_cat_col,
                           "decision_type"].unique()[0] == '=='

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
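For reference, the pred_leaf contract exercised above also holds for a purely local model. A minimal sketch, assuming scikit-learn's make_regression for synthetic data (none of the Dask machinery is needed):

import lightgbm as lgb
import numpy as np
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=100, n_features=4, random_state=42)
reg = lgb.LGBMRegressor(n_estimators=20, num_leaves=31, random_state=42).fit(X, y)

# one leaf index per (row, tree); indices are node ids in [0, num_leaves)
leaf_idx = reg.predict(X, pred_leaf=True)
assert leaf_idx.shape == (X.shape[0], reg.booster_.num_trees())
assert leaf_idx.min() >= 0
assert leaf_idx.max() < 31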
Example #2
def test_regressor_quantile(output, client, listen_port, alpha):
    X, y, w, dX, dy, dw = _create_data(objective='regression', output=output)

    params = {
        "objective": "quantile",
        "alpha": alpha,
        "random_state": 42,
        "n_estimators": 10,
        "num_leaves": 10
    }
    dask_regressor = dlgbm.DaskLGBMRegressor(local_listen_port=listen_port,
                                             tree_learner_type='data_parallel',
                                             **params)
    dask_regressor = dask_regressor.fit(dX,
                                        dy,
                                        client=client,
                                        sample_weight=dw)
    p1 = dask_regressor.predict(dX).compute()
    q1 = np.count_nonzero(y < p1) / y.shape[0]

    local_regressor = lightgbm.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    p2 = local_regressor.predict(X)
    q2 = np.count_nonzero(y < p2) / y.shape[0]

    # Quantiles should be right
    np.testing.assert_allclose(q1, alpha, atol=0.2)
    np.testing.assert_allclose(q2, alpha, atol=0.2)

    client.close()
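The closing assertions are an empirical coverage check: for quantile level alpha, roughly an alpha fraction of the true targets should fall below the predicted quantile. A standalone sketch on synthetic data, with np.quantile standing in for the fitted model:

import numpy as np

rng = np.random.default_rng(42)
y = rng.normal(size=1_000)
alpha = 0.9
pred = np.full_like(y, np.quantile(y, alpha))  # ideal constant alpha-quantile

coverage = np.count_nonzero(y < pred) / y.shape[0]
np.testing.assert_allclose(coverage, alpha, atol=0.2)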
Example #3
def test_regressor(output, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(objective='regression', output=output)

    dask_regressor = dlgbm.DaskLGBMRegressor(time_out=5,
                                             local_listen_port=listen_port,
                                             seed=42,
                                             num_leaves=10,
                                             tree='data')
    dask_regressor = dask_regressor.fit(dX,
                                        dy,
                                        client=client,
                                        sample_weight=dw)
    p1 = dask_regressor.predict(dX)
    if output != 'dataframe':
        s1 = r2_score(dy, p1)
    p1 = p1.compute()

    local_regressor = lightgbm.LGBMRegressor(seed=42, num_leaves=10)
    local_regressor.fit(X, y, sample_weight=w)
    s2 = local_regressor.score(X, y)
    p2 = local_regressor.predict(X)

    # Scores should be the same
    if output != 'dataframe':
        assert_eq(s1, s2, atol=.01)

    # Predictions should be roughly the same
    assert_eq(y, p1, rtol=1., atol=100.)
    assert_eq(y, p2, rtol=1., atol=50.)

    client.close()
Example #4
def test_regressor_quantile(output, client, listen_port, alpha):
    X, y, w, dX, dy, dw = _create_data(objective='regression', output=output)

    dask_regressor = dlgbm.DaskLGBMRegressor(local_listen_port=listen_port,
                                             seed=42,
                                             objective='quantile',
                                             alpha=alpha,
                                             n_estimators=10,
                                             num_leaves=10,
                                             tree_learner_type='data_parallel')
    dask_regressor = dask_regressor.fit(dX,
                                        dy,
                                        client=client,
                                        sample_weight=dw)
    p1 = dask_regressor.predict(dX).compute()
    q1 = np.count_nonzero(y < p1) / y.shape[0]

    local_regressor = lightgbm.LGBMRegressor(seed=42,
                                             objective='quantile',
                                             alpha=alpha,
                                             n_estimators=10,
                                             num_leaves=10)
    local_regressor.fit(X, y, sample_weight=w)
    p2 = local_regressor.predict(X)
    q2 = np.count_nonzero(y < p2) / y.shape[0]

    # Quantiles should be right
    np.testing.assert_allclose(q1, alpha, atol=0.2)
    np.testing.assert_allclose(q2, alpha, atol=0.2)

    client.close()
Example #5
def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(
    task,
    client,
):
    _, _, _, _, dX, dy, dw, dg = _create_data(
        objective=task,
        output='dataframe',
        group=None
    )

    model_factory = task_to_dask_factory[task]

    dy = dy.to_dask_array(lengths=True)
    dy_col_array = dy.reshape(-1, 1)
    assert len(dy_col_array.shape) == 2 and dy_col_array.shape[1] == 1

    params = {
        'n_estimators': 1,
        'num_leaves': 3,
        'random_state': 0,
        'time_out': 5
    }
    model = model_factory(**params)
    model.fit(dX, dy_col_array, sample_weight=dw, group=dg)
    assert model.fitted_

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
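The label reshape being tested is a generic Dask operation; a tiny self-contained sketch of turning a 1D label array into the column-array form passed to fit above:

import dask.array as da

dy = da.arange(10, chunks=5)
dy_col = dy.reshape(-1, 1)  # single-column 2D array
assert len(dy_col.shape) == 2 and dy_col.shape[1] == 1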
Example #6
def test_predict_with_raw_score(task, output, client):
    if task == 'ranking' and output == 'scipy_csr_matrix':
        pytest.skip('LGBMRanker is not currently tested on sparse matrices')

    _, _, _, _, dX, dy, _, dg = _create_data(objective=task,
                                             output=output,
                                             group=None)

    model_factory = task_to_dask_factory[task]
    params = {
        'client': client,
        'n_estimators': 1,
        'num_leaves': 2,
        'time_out': 5,
        'min_sum_hessian': 0
    }
    model = model_factory(**params)
    model.fit(dX, dy, group=dg)
    raw_predictions = model.predict(dX, raw_score=True).compute()

    trees_df = model.booster_.trees_to_dataframe()
    leaves_df = trees_df[trees_df.node_depth == 2]
    if task == 'multiclass-classification':
        for i in range(model.n_classes_):
            class_df = leaves_df[leaves_df.tree_index == i]
            assert set(raw_predictions[:, i]) == set(class_df['value'])
    else:
        assert set(raw_predictions) == set(leaves_df['value'])

    if task.endswith('classification'):
        pred_proba_raw = model.predict_proba(dX, raw_score=True).compute()
        assert_eq(raw_predictions, pred_proba_raw)

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
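What raw_score=True returns can be illustrated with a local binary classifier: the raw score is the untransformed model output, so applying the sigmoid should recover predict_proba. A minimal sketch on synthetic data:

import lightgbm as lgb
import numpy as np
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=100, centers=2, random_state=42)
clf = lgb.LGBMClassifier(n_estimators=5, num_leaves=3).fit(X, y)

raw = clf.predict_proba(X, raw_score=True)   # untransformed scores, shape (n,)
proba = clf.predict_proba(X)[:, 1]           # P(class 1)
np.testing.assert_allclose(proba, 1.0 / (1.0 + np.exp(-raw)), rtol=1e-6)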
Example #7
def test_classifier(output, centers, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(objective='classification',
                                       output=output,
                                       centers=centers)

    dask_classifier = dlgbm.DaskLGBMClassifier(time_out=5,
                                               local_listen_port=listen_port,
                                               n_estimators=10,
                                               num_leaves=10)
    dask_classifier = dask_classifier.fit(dX,
                                          dy,
                                          sample_weight=dw,
                                          client=client)
    p1 = dask_classifier.predict(dX)
    p1_proba = dask_classifier.predict_proba(dX).compute()
    s1 = accuracy_score(dy, p1)
    p1 = p1.compute()

    local_classifier = lightgbm.LGBMClassifier(n_estimators=10, num_leaves=10)
    local_classifier.fit(X, y, sample_weight=w)
    p2 = local_classifier.predict(X)
    p2_proba = local_classifier.predict_proba(X)
    s2 = local_classifier.score(X, y)

    assert_eq(s1, s2)
    assert_eq(p1, p2)
    assert_eq(y, p1)
    assert_eq(y, p2)
    assert_eq(p1_proba, p2_proba, atol=0.3)

    client.close()
Example #8
def test_init_score(task, output, client):
    if task == 'ranking' and output == 'scipy_csr_matrix':
        pytest.skip('LGBMRanker is not currently tested on sparse matrices')

    _, _, _, _, dX, dy, dw, dg = _create_data(
        objective=task,
        output=output,
        group=None
    )

    model_factory = task_to_dask_factory[task]

    params = {
        'n_estimators': 1,
        'num_leaves': 2,
        'time_out': 5
    }
    init_score = random.random()
    # init_scores must be a 1D array, even for multiclass classification
    # where you need to provide 1 score per class for each row in X
    # https://github.com/microsoft/LightGBM/issues/4046
    size_factor = 1
    if task == 'multiclass-classification':
        size_factor = 3  # number of classes

    if output.startswith('dataframe'):
        init_scores = dy.map_partitions(lambda x: pd.Series([init_score] * x.size * size_factor))
    else:
        init_scores = dy.map_blocks(lambda x: np.repeat(init_score, x.size * size_factor))
    model = model_factory(client=client, **params)
    model.fit(dX, dy, sample_weight=dw, init_score=init_scores, group=dg)
    # value of the root node is 0 when init_score is set
    assert model.booster_.trees_to_dataframe()['value'][0] == 0

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
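The flattened init_score layout described in the comment above can be shown with plain NumPy. A sketch with hypothetical sizes (4 rows, 3 classes): a single 1D array of n_rows * n_classes scores rather than a 2D (n_rows, n_classes) matrix.

import numpy as np

n_rows, n_classes = 4, 3
init_score = 0.5
flat_init_scores = np.repeat(init_score, n_rows * n_classes)
assert flat_init_scores.shape == (n_rows * n_classes,)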
Example #9
def test_training_works_if_client_not_provided_or_set_after_construction(
        task, client):
    _, _, _, _, dX, dy, _, dg = _create_data(objective=task,
                                             output='array',
                                             group=None)
    model_factory = task_to_dask_factory[task]

    params = {"time_out": 5, "n_estimators": 1, "num_leaves": 2}

    # should be able to use the class without specifying a client
    dask_model = model_factory(**params)
    assert dask_model.client is None
    with pytest.raises(
            lgb.compat.LGBMNotFittedError,
            match='Cannot access property client_ before calling fit'):
        dask_model.client_

    dask_model.fit(dX, dy, group=dg)
    assert dask_model.fitted_
    assert dask_model.client is None
    assert dask_model.client_ == client

    preds = dask_model.predict(dX)
    assert isinstance(preds, da.Array)
    assert dask_model.fitted_
    assert dask_model.client is None
    assert dask_model.client_ == client

    local_model = dask_model.to_local()
    with pytest.raises(AttributeError):
        local_model.client
    with pytest.raises(AttributeError):
        local_model.client_

    # should be able to set client after construction
    dask_model = model_factory(**params)
    dask_model.set_params(client=client)
    assert dask_model.client == client

    with pytest.raises(
            lgb.compat.LGBMNotFittedError,
            match='Cannot access property client_ before calling fit'):
        dask_model.client_

    dask_model.fit(dX, dy, group=dg)
    assert dask_model.fitted_
    assert dask_model.client == client
    assert dask_model.client_ == client

    preds = dask_model.predict(dX)
    assert isinstance(preds, da.Array)
    assert dask_model.fitted_
    assert dask_model.client == client
    assert dask_model.client_ == client

    local_model = dask_model.to_local()
    with pytest.raises(AttributeError):
        local_model.client
    with pytest.raises(AttributeError):
        local_model.client_

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
Example #10
def test_classifier(output, centers, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(objective='classification',
                                       output=output,
                                       centers=centers)

    params = {"n_estimators": 10, "num_leaves": 10}

    if output == 'dataframe-with-categorical':
        params["categorical_feature"] = [
            i for i, col in enumerate(dX.columns) if col.startswith('cat_')
        ]

    dask_classifier = lgb.DaskLGBMClassifier(client=client,
                                             time_out=5,
                                             local_listen_port=listen_port,
                                             **params)
    dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
    p1 = dask_classifier.predict(dX)
    p1_proba = dask_classifier.predict_proba(dX).compute()
    p1_pred_leaf = dask_classifier.predict(dX, pred_leaf=True)
    p1_local = dask_classifier.to_local().predict(X)
    s1 = _accuracy_score(dy, p1)
    p1 = p1.compute()

    local_classifier = lgb.LGBMClassifier(**params)
    local_classifier.fit(X, y, sample_weight=w)
    p2 = local_classifier.predict(X)
    p2_proba = local_classifier.predict_proba(X)
    s2 = local_classifier.score(X, y)

    assert_eq(s1, s2)
    assert_eq(p1, p2)
    assert_eq(y, p1)
    assert_eq(y, p2)
    assert_eq(p1_proba, p2_proba, atol=0.3)
    assert_eq(p1_local, p2)
    assert_eq(y, p1_local)

    # pred_leaf values should have the right shape
    # and values that look like valid tree nodes
    pred_leaf_vals = p1_pred_leaf.compute()
    assert pred_leaf_vals.shape == (X.shape[0],
                                    dask_classifier.booster_.num_trees())
    assert np.max(pred_leaf_vals) <= params['num_leaves']
    assert np.min(pred_leaf_vals) >= 0
    assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        cat_cols = [
            col for col in dX.columns if dX.dtypes[col].name == 'category'
        ]
        tree_df = dask_classifier.booster_.trees_to_dataframe()
        node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
        assert node_uses_cat_col.sum() > 0
        assert tree_df.loc[node_uses_cat_col,
                           "decision_type"].unique()[0] == '=='

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
Example #11
def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(
    task,
    client,
):
    if task == 'ranking':
        _, _, _, _, dX, dy, dw, dg = _create_ranking_data(output='dataframe',
                                                          group=None)
        model_factory = lgb.DaskLGBMRanker
    else:
        _, _, _, dX, dy, dw = _create_data(
            objective=task,
            output='dataframe',
        )
        dg = None
        if task == 'classification':
            model_factory = lgb.DaskLGBMClassifier
        elif task == 'regression':
            model_factory = lgb.DaskLGBMRegressor

    dy = dy.to_dask_array(lengths=True)
    dy_col_array = dy.reshape(-1, 1)
    assert len(dy_col_array.shape) == 2 and dy_col_array.shape[1] == 1

    params = {
        'n_estimators': 1,
        'num_leaves': 3,
        'random_state': 0,
        'time_out': 5
    }
    model = model_factory(**params)
    model.fit(dX, dy_col_array, sample_weight=dw, group=dg)
    assert model.fitted_

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
Example #12
def test_regressor_pred_contrib(output, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(objective='regression', output=output)

    params = {"n_estimators": 10, "num_leaves": 10}
    dask_regressor = lgb.DaskLGBMRegressor(client=client,
                                           time_out=5,
                                           local_listen_port=listen_port,
                                           tree_learner='data',
                                           **params)
    dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
    preds_with_contrib = dask_regressor.predict(dX,
                                                pred_contrib=True).compute()

    local_regressor = lgb.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    local_preds_with_contrib = local_regressor.predict(X, pred_contrib=True)

    if output == "scipy_csr_matrix":
        preds_with_contrib = np.array(preds_with_contrib.todense())

    # contrib outputs for distributed training differ from those of local training, so we just test
    # that the output has the right shape and that base values are in the right position
    num_features = dX.shape[1]
    assert preds_with_contrib.shape[1] == num_features + 1
    assert preds_with_contrib.shape == local_preds_with_contrib.shape

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
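The shape assertion has a useful local counterpart: pred_contrib returns one column per feature plus a trailing base-value column, and each row sums to the model's raw prediction (the usual SHAP decomposition). A minimal non-Dask sketch, assuming scikit-learn's make_regression:

import lightgbm as lgb
import numpy as np
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=100, n_features=5, random_state=42)
reg = lgb.LGBMRegressor(n_estimators=10, num_leaves=10).fit(X, y)

contrib = reg.predict(X, pred_contrib=True)
assert contrib.shape == (X.shape[0], X.shape[1] + 1)  # features + base value
np.testing.assert_allclose(contrib.sum(axis=1), reg.predict(X),
                           rtol=1e-6, atol=1e-8)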
Example #13
def test_regressor(output, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(objective='regression', output=output)

    params = {"random_state": 42, "num_leaves": 10}
    dask_regressor = lgb.DaskLGBMRegressor(client=client,
                                           time_out=5,
                                           local_listen_port=listen_port,
                                           tree='data',
                                           **params)
    dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
    p1 = dask_regressor.predict(dX)
    if output != 'dataframe':
        s1 = _r2_score(dy, p1)
    p1 = p1.compute()
    p1_local = dask_regressor.to_local().predict(X)
    s1_local = dask_regressor.to_local().score(X, y)

    local_regressor = lgb.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    s2 = local_regressor.score(X, y)
    p2 = local_regressor.predict(X)

    # Scores should be the same
    if output != 'dataframe':
        assert_eq(s1, s2, atol=.01)
        assert_eq(s1, s1_local, atol=.003)

    # Predictions should be roughly the same
    assert_eq(y, p1, rtol=1., atol=100.)
    assert_eq(y, p2, rtol=1., atol=50.)
    assert_eq(p1, p1_local)

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
Example #14
def test_classifier(output, centers, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(objective='classification',
                                       output=output,
                                       centers=centers)

    params = {"n_estimators": 10, "num_leaves": 10}
    dask_classifier = lgb.DaskLGBMClassifier(client=client,
                                             time_out=5,
                                             local_listen_port=listen_port,
                                             **params)
    dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
    p1 = dask_classifier.predict(dX)
    p1_proba = dask_classifier.predict_proba(dX).compute()
    p1_local = dask_classifier.to_local().predict(X)
    s1 = _accuracy_score(dy, p1)
    p1 = p1.compute()

    local_classifier = lgb.LGBMClassifier(**params)
    local_classifier.fit(X, y, sample_weight=w)
    p2 = local_classifier.predict(X)
    p2_proba = local_classifier.predict_proba(X)
    s2 = local_classifier.score(X, y)

    assert_eq(s1, s2)
    assert_eq(p1, p2)
    assert_eq(y, p1)
    assert_eq(y, p2)
    assert_eq(p1_proba, p2_proba, atol=0.3)
    assert_eq(p1_local, p2)
    assert_eq(y, p1_local)

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
Example #15
def test_classifier_pred_contrib(output, centers, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(objective='classification',
                                       output=output,
                                       centers=centers)

    params = {"n_estimators": 10, "num_leaves": 10}

    dask_classifier = lgb.DaskLGBMClassifier(client=client,
                                             time_out=5,
                                             local_listen_port=listen_port,
                                             tree_learner='data',
                                             **params)
    dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
    preds_with_contrib = dask_classifier.predict(dX,
                                                 pred_contrib=True).compute()

    local_classifier = lgb.LGBMClassifier(**params)
    local_classifier.fit(X, y, sample_weight=w)
    local_preds_with_contrib = local_classifier.predict(X, pred_contrib=True)

    if output == 'scipy_csr_matrix':
        preds_with_contrib = np.array(preds_with_contrib.todense())

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        cat_cols = [
            col for col in dX.columns if dX.dtypes[col].name == 'category'
        ]
        tree_df = dask_classifier.booster_.trees_to_dataframe()
        node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
        assert node_uses_cat_col.sum() > 0
        assert tree_df.loc[node_uses_cat_col,
                           "decision_type"].unique()[0] == '=='

    # shape depends on whether it is binary or multiclass classification
    num_features = dask_classifier.n_features_
    num_classes = dask_classifier.n_classes_
    if num_classes == 2:
        expected_num_cols = num_features + 1
    else:
        expected_num_cols = (num_features + 1) * num_classes

    # * shape depends on whether it is binary or multiclass classification
    # * matrix for binary classification is of the form [feature_contrib, base_value],
    #   for multi-class it's [feat_contrib_class1, base_value_class1, feat_contrib_class2, base_value_class2, etc.]
    # * contrib outputs for distributed training differ from those of local training, so we just
    #   test that the output has the right shape and that base values are in the right position
    assert preds_with_contrib.shape[1] == expected_num_cols
    assert preds_with_contrib.shape == local_preds_with_contrib.shape

    if num_classes == 2:
        assert len(np.unique(preds_with_contrib[:, num_features])) == 1
    else:
        for i in range(num_classes):
            base_value_col = num_features * (i + 1) + i
            assert len(np.unique(preds_with_contrib[:, base_value_col])) == 1

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
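The base_value_col arithmetic deserves a quick worked example. With the per-class block layout [contrib_1, ..., contrib_F, base_value], class i's block starts at (F + 1) * i, so its base value lands at column (F + 1) * i + F, which is exactly what num_features * (i + 1) + i computes:

# hypothetical sizes: F = 4 features, 3 classes
num_features, num_classes = 4, 3
for i in range(num_classes):
    assert num_features * (i + 1) + i == (num_features + 1) * i + num_features
# one base-value column at the end of each class block: columns 4, 9, 14
assert [num_features * (i + 1) + i for i in range(num_classes)] == [4, 9, 14]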
Example #16
def test_network_params_not_required_but_respected_if_given(client, task, listen_port):
    client.wait_for_workers(2)

    _, _, _, _, dX, dy, _, dg = _create_data(
        objective=task,
        output='array',
        chunk_size=10,
        group=None
    )

    dask_model_factory = task_to_dask_factory[task]

    # rebalance data to be sure that each worker has a piece of the data
    client.rebalance()

    # model 1 - no network parameters given
    dask_model1 = dask_model_factory(
        n_estimators=5,
        num_leaves=5,
    )
    dask_model1.fit(dX, dy, group=dg)
    assert dask_model1.fitted_
    params = dask_model1.get_params()
    assert 'local_listen_port' not in params
    assert 'machines' not in params

    # model 2 - machines given
    n_workers = len(client.scheduler_info()['workers'])
    open_ports = [lgb.dask._find_random_open_port() for _ in range(n_workers)]
    dask_model2 = dask_model_factory(
        n_estimators=5,
        num_leaves=5,
        machines=",".join([
            "127.0.0.1:" + str(port)
            for port in open_ports
        ]),
    )

    dask_model2.fit(dX, dy, group=dg)
    assert dask_model2.fitted_
    params = dask_model2.get_params()
    assert 'local_listen_port' not in params
    assert 'machines' in params

    # model 3 - local_listen_port given
    # training should fail because LightGBM will try to use the same
    # port for multiple worker processes on the same machine
    dask_model3 = dask_model_factory(
        n_estimators=5,
        num_leaves=5,
        local_listen_port=listen_port
    )
    error_msg = "has multiple Dask worker processes running on it"
    with pytest.raises(lgb.basic.LightGBMError, match=error_msg):
        dask_model3.fit(dX, dy, group=dg)

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
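_find_random_open_port is internal to lightgbm.dask, but the standard trick such a helper can rely on is easy to sketch: bind a socket to port 0 and let the OS report back a free port. find_open_port below is an illustrative stand-in, not part of the library:

import socket

def find_open_port() -> int:
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(('', 0))  # port 0 asks the OS for any free port
        return s.getsockname()[1]

port = find_open_port()
assert 0 < port < 65536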
Example #17
def test_training_succeeds_even_if_some_workers_do_not_have_any_data(client, task, output):
    if task == 'ranking' and output == 'scipy_csr_matrix':
        pytest.skip('LGBMRanker is not currently tested on sparse matrices')

    def collection_to_single_partition(collection):
        """Merge the parts of a Dask collection into a single partition."""
        if collection is None:
            return
        if isinstance(collection, da.Array):
            return collection.rechunk(*collection.shape)
        return collection.repartition(npartitions=1)

    if task == 'ranking':
        X, y, w, g, dX, dy, dw, dg = _create_ranking_data(
            output=output,
            group=None
        )
    else:
        X, y, w, dX, dy, dw = _create_data(
            objective=task,
            output=output
        )
        g = None
        dg = None

    dask_model_factory = task_to_dask_factory[task]
    local_model_factory = task_to_local_factory[task]

    dX = collection_to_single_partition(dX)
    dy = collection_to_single_partition(dy)
    dw = collection_to_single_partition(dw)
    dg = collection_to_single_partition(dg)

    n_workers = len(client.scheduler_info()['workers'])
    assert n_workers > 1
    assert dX.npartitions == 1

    params = {
        'time_out': 5,
        'random_state': 42,
        'num_leaves': 10
    }

    dask_model = dask_model_factory(tree='data', client=client, **params)
    dask_model.fit(dX, dy, group=dg, sample_weight=dw)
    dask_preds = dask_model.predict(dX).compute()

    local_model = local_model_factory(**params)
    if task == 'ranking':
        local_model.fit(X, y, group=g, sample_weight=w)
    else:
        local_model.fit(X, y, sample_weight=w)
    local_preds = local_model.predict(X)

    assert_eq(dask_preds, local_preds)

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
Example #18
def test_find_random_open_port(client):
    for _ in range(5):
        worker_address_to_port = client.run(lgb.dask._find_random_open_port)
        found_ports = worker_address_to_port.values()
        # check that found ports are different for same address (LocalCluster)
        assert len(set(found_ports)) == len(found_ports)
        # check that the ports are indeed open
        for port in found_ports:
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.bind(('', port))
    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
Example #19
def test_classifier_pred_contrib(output, centers, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(
        objective='classification',
        output=output,
        centers=centers
    )

    params = {
        "n_estimators": 10,
        "num_leaves": 10
    }
    dask_classifier = lgb.DaskLGBMClassifier(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        tree_learner='data',
        **params
    )
    dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw)
    preds_with_contrib = dask_classifier.predict(dX, pred_contrib=True).compute()

    local_classifier = lgb.LGBMClassifier(**params)
    local_classifier.fit(X, y, sample_weight=w)
    local_preds_with_contrib = local_classifier.predict(X, pred_contrib=True)

    if output == 'scipy_csr_matrix':
        preds_with_contrib = np.array(preds_with_contrib.todense())

    # shape depends on whether it is binary or multiclass classification
    num_features = dask_classifier.n_features_
    num_classes = dask_classifier.n_classes_
    if num_classes == 2:
        expected_num_cols = num_features + 1
    else:
        expected_num_cols = (num_features + 1) * num_classes

    # * shape depends on whether it is binary or multiclass classification
    # * matrix for binary classification is of the form [feature_contrib, base_value],
    #   for multi-class it's [feat_contrib_class1, base_value_class1, feat_contrib_class2, base_value_class2, etc.]
    # * contrib outputs for distributed training differ from those of local training, so we just
    #   test that the output has the right shape and that base values are in the right position
    assert preds_with_contrib.shape[1] == expected_num_cols
    assert preds_with_contrib.shape == local_preds_with_contrib.shape

    if num_classes == 2:
        assert len(np.unique(preds_with_contrib[:, num_features])) == 1
    else:
        for i in range(num_classes):
            base_value_col = num_features * (i + 1) + i
            assert len(np.unique(preds_with_contrib[:, base_value_col])) == 1

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
Example #20
def test_training_does_not_fail_on_port_conflicts(client):
    _, _, _, dX, dy, dw = _create_data('classification', output='array')

    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(('127.0.0.1', 12400))

        dask_classifier = dlgbm.DaskLGBMClassifier(time_out=5,
                                                   local_listen_port=12400,
                                                   n_estimators=5,
                                                   num_leaves=5)
        for _ in range(5):
            dask_classifier.fit(X=dX, y=dy, sample_weight=dw, client=client)
            assert dask_classifier.booster_

    client.close()
Example #21
def test_ranker(output, client, listen_port, group):

    X, y, w, g, dX, dy, dw, dg = _create_ranking_data(
        output=output,
        group=group
    )

    # rebalance small dask.array dataset for better performance.
    if output == 'array':
        dX = dX.persist()
        dy = dy.persist()
        dw = dw.persist()
        dg = dg.persist()
        _ = wait([dX, dy, dw, dg])
        client.rebalance()

    # use many trees + leaves to overfit, help ensure that dask data-parallel strategy matches that of
    # serial learner. See https://github.com/microsoft/LightGBM/issues/3292#issuecomment-671288210.
    params = {
        "random_state": 42,
        "n_estimators": 50,
        "num_leaves": 20,
        "min_child_samples": 1
    }
    dask_ranker = lgb.DaskLGBMRanker(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        tree_learner_type='data_parallel',
        **params
    )
    dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg)
    rnkvec_dask = dask_ranker.predict(dX)
    rnkvec_dask = rnkvec_dask.compute()
    rnkvec_dask_local = dask_ranker.to_local().predict(X)

    local_ranker = lgb.LGBMRanker(**params)
    local_ranker.fit(X, y, sample_weight=w, group=g)
    rnkvec_local = local_ranker.predict(X)

    # distributed ranker should be able to rank decently well and should
    # have high rank correlation with scores from serial ranker.
    dcor = spearmanr(rnkvec_dask, y).correlation
    assert dcor > 0.6
    assert spearmanr(rnkvec_dask, rnkvec_local).correlation > 0.8
    assert_eq(rnkvec_dask, rnkvec_dask_local)

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
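The Spearman checks compare orderings rather than raw values. A standalone sketch showing that a noisy but order-preserving score vector keeps a high rank correlation:

import numpy as np
from scipy.stats import spearmanr

rng = np.random.default_rng(42)
y = rng.normal(size=100)
scores = y + rng.normal(scale=0.1, size=100)  # noisy, but mostly order-preserving
assert spearmanr(scores, y).correlation > 0.9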
Example #22
def test_warns_and_continues_on_unrecognized_tree_learner(client):
    X = da.random.random((1_000, 10))
    y = da.random.random((1_000, 1))
    dask_regressor = lgb.DaskLGBMRegressor(client=client,
                                           time_out=5,
                                           tree_learner='some-nonsense-value',
                                           n_estimators=1,
                                           num_leaves=2)
    with pytest.warns(
            UserWarning,
            match='Parameter tree_learner set to some-nonsense-value'):
        dask_regressor = dask_regressor.fit(X, y)

    assert dask_regressor.fitted_

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
Example #23
def test_regressor_pred_contrib(output, client, listen_port):
    X, y, w, dX, dy, dw = _create_data(
        objective='regression',
        output=output
    )

    params = {
        "n_estimators": 10,
        "num_leaves": 10
    }

    dask_regressor = lgb.DaskLGBMRegressor(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        tree_learner='data',
        **params
    )
    dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
    preds_with_contrib = dask_regressor.predict(dX, pred_contrib=True).compute()

    local_regressor = lgb.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    local_preds_with_contrib = local_regressor.predict(X, pred_contrib=True)

    if output == "scipy_csr_matrix":
        preds_with_contrib = np.array(preds_with_contrib.todense())

    # contrib outputs for distributed training differ from those of local training, so we just test
    # that the output has the right shape and that base values are in the right position
    num_features = dX.shape[1]
    assert preds_with_contrib.shape[1] == num_features + 1
    assert preds_with_contrib.shape == local_preds_with_contrib.shape

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        cat_cols = [
            col for col in dX.columns
            if dX.dtypes[col].name == 'category'
        ]
        tree_df = dask_regressor.booster_.trees_to_dataframe()
        node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
        assert node_uses_cat_col.sum() > 0
        assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
Example #24
def test_regressor_quantile(output, client, listen_port, alpha):
    X, y, w, dX, dy, dw = _create_data(
        objective='regression',
        output=output
    )

    params = {
        "objective": "quantile",
        "alpha": alpha,
        "random_state": 42,
        "n_estimators": 10,
        "num_leaves": 10
    }

    dask_regressor = lgb.DaskLGBMRegressor(
        client=client,
        local_listen_port=listen_port,
        tree_learner_type='data_parallel',
        **params
    )
    dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
    p1 = dask_regressor.predict(dX).compute()
    q1 = np.count_nonzero(y < p1) / y.shape[0]

    local_regressor = lgb.LGBMRegressor(**params)
    local_regressor.fit(X, y, sample_weight=w)
    p2 = local_regressor.predict(X)
    q2 = np.count_nonzero(y < p2) / y.shape[0]

    # Quantiles should be right
    np.testing.assert_allclose(q1, alpha, atol=0.2)
    np.testing.assert_allclose(q2, alpha, atol=0.2)

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        cat_cols = [
            col for col in dX.columns
            if dX.dtypes[col].name == 'category'
        ]
        tree_df = dask_regressor.booster_.trees_to_dataframe()
        node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
        assert node_uses_cat_col.sum() > 0
        assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
Example #25
def test_warns_but_makes_no_changes_for_feature_or_voting_tree_learner(client):
    X = da.random.random((1_000, 10))
    y = da.random.random((1_000, 1))
    for tree_learner in ['feature_parallel', 'voting']:
        dask_regressor = lgb.DaskLGBMRegressor(client=client,
                                               time_out=5,
                                               tree_learner=tree_learner,
                                               n_estimators=1,
                                               num_leaves=2)
        with pytest.warns(UserWarning,
                          match='Support for tree_learner %s in lightgbm' %
                          tree_learner):
            dask_regressor = dask_regressor.fit(X, y)

        assert dask_regressor.fitted_
        assert dask_regressor.get_params()['tree_learner'] == tree_learner

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
Example #26
def test_training_does_not_fail_on_port_conflicts(client):
    _, _, _, dX, dy, dw = _create_data('classification', output='array')

    lightgbm_default_port = 12400
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(('127.0.0.1', lightgbm_default_port))
        dask_classifier = lgb.DaskLGBMClassifier(client=client,
                                                 time_out=5,
                                                 n_estimators=5,
                                                 num_leaves=5)
        for _ in range(5):
            dask_classifier.fit(
                X=dX,
                y=dy,
                sample_weight=dw,
            )
            assert dask_classifier.booster_

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
Example #27
def test_ranker_local_predict(output, client, listen_port, group):

    X, y, w, g, dX, dy, dw, dg = _create_ranking_data(output=output,
                                                      group=group)

    dask_ranker = dlgbm.DaskLGBMRanker(time_out=5,
                                       local_listen_port=listen_port,
                                       tree_learner='data',
                                       n_estimators=10,
                                       num_leaves=10,
                                       seed=42,
                                       min_child_samples=1)
    dask_ranker = dask_ranker.fit(dX, dy, group=dg, client=client)
    rnkvec_dask = dask_ranker.predict(dX)
    rnkvec_dask = rnkvec_dask.compute()
    rnkvec_local = dask_ranker.to_local().predict(X)

    # distributed and to-local scores should be the same.
    assert_eq(rnkvec_dask, rnkvec_local)

    client.close()
Example #28
def test_init_score(
        task,
        output,
        client):
    if task == 'ranking' and output == 'scipy_csr_matrix':
        pytest.skip('LGBMRanker is not currently tested on sparse matrices')

    if task == 'ranking':
        _, _, _, _, dX, dy, dw, dg = _create_ranking_data(
            output=output,
            group=None
        )
        model_factory = lgb.DaskLGBMRanker
    else:
        _, _, _, dX, dy, dw = _create_data(
            objective=task,
            output=output,
        )
        dg = None
        if task == 'classification':
            model_factory = lgb.DaskLGBMClassifier
        elif task == 'regression':
            model_factory = lgb.DaskLGBMRegressor

    params = {
        'n_estimators': 1,
        'num_leaves': 2,
        'time_out': 5
    }
    init_score = random.random()
    if output.startswith('dataframe'):
        init_scores = dy.map_partitions(lambda x: pd.Series([init_score] * x.size))
    else:
        init_scores = da.full_like(dy, fill_value=init_score, dtype=np.float64)
    model = model_factory(client=client, **params)
    model.fit(dX, dy, sample_weight=dw, init_score=init_scores, group=dg)
    # value of the root node is 0 when init_score is set
    assert model.booster_.trees_to_dataframe()['value'][0] == 0

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
Example #29
def test_classifier_local_predict(client, listen_port):
    X, y, w, dX, dy, dw = _create_data(objective='classification',
                                       output='array')

    dask_classifier = dlgbm.DaskLGBMClassifier(time_out=5,
                                               local_port=listen_port,
                                               n_estimators=10,
                                               num_leaves=10)
    dask_classifier = dask_classifier.fit(dX,
                                          dy,
                                          sample_weight=dw,
                                          client=client)
    p1 = dask_classifier.to_local().predict(dX)

    local_classifier = lightgbm.LGBMClassifier(n_estimators=10, num_leaves=10)
    local_classifier.fit(X, y, sample_weight=w)
    p2 = local_classifier.predict(X)

    assert_eq(p1, p2)
    assert_eq(y, p1)
    assert_eq(y, p2)

    client.close()
Example #30
def test_regressor_local_predict(client, listen_port):
    X, y, _, dX, dy, dw = _create_data('regression', output='array')

    dask_regressor = dlgbm.DaskLGBMRegressor(local_listen_port=listen_port,
                                             seed=42,
                                             n_estimators=10,
                                             num_leaves=10,
                                             tree_type='data')
    dask_regressor = dask_regressor.fit(dX,
                                        dy,
                                        sample_weight=dw,
                                        client=client)
    p1 = dask_regressor.predict(dX)
    p2 = dask_regressor.to_local().predict(X)
    s1 = r2_score(dy, p1)
    p1 = p1.compute()
    s2 = dask_regressor.to_local().score(X, y)

    # Predictions and scores should be the same
    assert_eq(p1, p2)
    assert_eq(s1, s2)

    client.close()