Exemplo n.º 1
0
def test_network_params_not_required_but_respected_if_given(client, task, listen_port):
    """Check that network parameters are optional but honoured when supplied.

    Three estimators are built from the factory registered for ``task``:

    1. no network parameters at all,
    2. an explicit ``machines`` string,
    3. only ``local_listen_port``.

    The first two must train; the third is expected to raise a
    ``LightGBMError``.
    """
    client.wait_for_workers(2)

    _, _, _, _, dX, dy, _, dg = _create_data(
        objective=task,
        output='array',
        chunk_size=10,
        group=None
    )

    make_model = task_to_dask_factory[task]

    # spread the data out so that every worker holds a piece of it
    client.rebalance()

    # estimator settings shared by all three models
    common_kwargs = {'n_estimators': 5, 'num_leaves': 5}

    # --- model 1: no network parameters given ---
    model_without_network_params = make_model(**common_kwargs)
    model_without_network_params.fit(dX, dy, group=dg)
    assert model_without_network_params.fitted_
    params_after_fit = model_without_network_params.get_params()
    assert 'local_listen_port' not in params_after_fit
    assert 'machines' not in params_after_fit

    # --- model 2: machines given ---
    # one randomly chosen open port per worker currently known to the scheduler
    worker_count = len(client.scheduler_info()['workers'])
    free_ports = [lgb.dask._find_random_open_port() for _ in range(worker_count)]
    machines_str = ",".join("127.0.0.1:" + str(port) for port in free_ports)
    model_with_machines = make_model(machines=machines_str, **common_kwargs)

    model_with_machines.fit(dX, dy, group=dg)
    assert model_with_machines.fitted_
    params_after_fit = model_with_machines.get_params()
    assert 'local_listen_port' not in params_after_fit
    assert 'machines' in params_after_fit

    # --- model 3: local_listen_port given ---
    # training should fail because LightGBM will try to use the same
    # port for multiple worker processes on the same machine
    model_with_listen_port = make_model(
        local_listen_port=listen_port,
        **common_kwargs
    )
    expected_error = "has multiple Dask worker processes running on it"
    with pytest.raises(lgb.basic.LightGBMError, match=expected_error):
        model_with_listen_port.fit(dX, dy, group=dg)

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
Exemplo n.º 2
0
def test_machines_should_be_used_if_provided(task, output):
    """Check that a user-supplied ``machines`` string is actually used.

    Runs against its own two-worker ``LocalCluster``. Two failure modes are
    exercised: training while one of the ports listed in ``machines`` is
    already bound, and training with a ``machines`` string that repeats the
    same address for every worker.
    """
    if task == 'ranking' and output == 'scipy_csr_matrix':
        pytest.skip('LGBMRanker is not currently tested on sparse matrices')

    with LocalCluster(n_workers=2) as cluster:
        with Client(cluster) as client:
            if task == 'ranking':
                _, _, _, _, dX, dy, _, dg = _create_ranking_data(
                    output=output,
                    group=None,
                    chunk_size=10,
                )
            else:
                _, _, _, dX, dy, _ = _create_data(
                    objective=task,
                    output=output,
                    chunk_size=10,
                )
                dg = None

            make_model = task_to_dask_factory[task]

            # spread the data out so that every worker holds a piece of it
            if output == 'array':
                client.rebalance()

            worker_count = len(client.scheduler_info()['workers'])
            assert worker_count > 1

            free_ports = []
            for _ in range(worker_count):
                free_ports.append(lgb.dask._find_random_open_port())

            addresses = ["127.0.0.1:" + str(port) for port in free_ports]
            dask_model = make_model(
                n_estimators=5,
                num_leaves=5,
                machines=",".join(addresses),
            )

            # prove that "machines" is respected: occupy one of the listed
            # ports with a socket of our own, then expect training to fail
            # with an error naming that port
            busy_port = free_ports[0]
            expected_error = "Binding port %s failed" % busy_port
            with pytest.raises(lgb.basic.LightGBMError, match=expected_error), \
                    socket.socket(socket.AF_INET, socket.SOCK_STREAM) as blocker:
                blocker.bind(('127.0.0.1', busy_port))
                dask_model.fit(dX, dy, group=dg)

            # an informative error should be raised if "machines" has duplicates
            repeated_port = lgb.dask._find_random_open_port()
            repeated_address = "127.0.0.1:" + str(repeated_port)
            dask_model.set_params(
                machines=",".join([repeated_address] * worker_count)
            )
            with pytest.raises(ValueError, match="Found duplicates in 'machines'"):
                dask_model.fit(dX, dy, group=dg)
Exemplo n.º 3
0
def test_ranker(output, client, listen_port, group):
    """Compare a ``DaskLGBMRanker`` against a locally trained ``LGBMRanker``.

    Both rankers are trained with the same hyperparameters on the same data.
    The distributed scores must correlate with the labels and with the local
    scores, and must match the scores of the model returned by ``to_local()``.
    """
    X, y, w, g, dX, dy, dw, dg = _create_ranking_data(
        output=output,
        group=group
    )

    # rebalance small dask.array dataset for better performance.
    if output == 'array':
        dX, dy, dw, dg = (
            collection.persist() for collection in (dX, dy, dw, dg)
        )
        wait([dX, dy, dw, dg])
        client.rebalance()

    # use many trees + leaves to overfit, help ensure that dask data-parallel strategy matches that of
    # serial learner. See https://github.com/microsoft/LightGBM/issues/3292#issuecomment-671288210.
    shared_params = {
        "random_state": 42,
        "n_estimators": 50,
        "num_leaves": 20,
        "min_child_samples": 1
    }

    # distributed training and prediction
    distributed_ranker = lgb.DaskLGBMRanker(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        tree_learner_type='data_parallel',
        **shared_params
    )
    distributed_ranker = distributed_ranker.fit(dX, dy, sample_weight=dw, group=dg)
    scores_dask = distributed_ranker.predict(dX).compute()
    scores_dask_as_local = distributed_ranker.to_local().predict(X)

    # single-process reference model
    reference_ranker = lgb.LGBMRanker(**shared_params)
    reference_ranker.fit(X, y, sample_weight=w, group=g)
    scores_local = reference_ranker.predict(X)

    # distributed ranker should be able to rank decently well and should
    # have high rank correlation with scores from serial ranker.
    label_correlation = spearmanr(scores_dask, y).correlation
    assert label_correlation > 0.6
    local_correlation = spearmanr(scores_dask, scores_local).correlation
    assert local_correlation > 0.8
    assert_eq(scores_dask, scores_dask_as_local)

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
Exemplo n.º 4
0
def test_ranker(output, client, listen_port, group):
    """Compare a ``DaskLGBMRanker`` against a locally trained ``LGBMRanker``.

    Besides the score-correlation checks, this verifies the shape and value
    range of ``pred_leaf`` predictions and, for the
    ``'dataframe-with-categorical'`` output, that at least one split was made
    on a categorical column using the ``'=='`` decision type.
    """
    data_kwargs = {'output': output, 'group': group}
    if output == 'dataframe-with-categorical':
        data_kwargs.update(n_features=1, n_informative=1)
    X, y, w, g, dX, dy, dw, dg = _create_ranking_data(**data_kwargs)

    # rebalance small dask.Array dataset for better performance.
    if output == 'array':
        dX, dy, dw, dg = (
            collection.persist() for collection in (dX, dy, dw, dg)
        )
        wait([dX, dy, dw, dg])
        client.rebalance()

    # use many trees + leaves to overfit, help ensure that Dask data-parallel strategy matches that of
    # serial learner. See https://github.com/microsoft/LightGBM/issues/3292#issuecomment-671288210.
    shared_params = {
        "random_state": 42,
        "n_estimators": 50,
        "num_leaves": 20,
        "min_child_samples": 1
    }
    leaf_limit = shared_params['num_leaves']

    # distributed training and prediction
    distributed_ranker = lgb.DaskLGBMRanker(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        tree_learner_type='data_parallel',
        **shared_params
    )
    distributed_ranker = distributed_ranker.fit(dX, dy, sample_weight=dw, group=dg)
    scores_dask = distributed_ranker.predict(dX).compute()
    lazy_leaf_preds = distributed_ranker.predict(dX, pred_leaf=True)
    scores_dask_as_local = distributed_ranker.to_local().predict(X)

    # single-process reference model
    reference_ranker = lgb.LGBMRanker(**shared_params)
    reference_ranker.fit(X, y, sample_weight=w, group=g)
    scores_local = reference_ranker.predict(X)

    # distributed ranker should be able to rank decently well and should
    # have high rank correlation with scores from serial ranker.
    label_correlation = spearmanr(scores_dask, y).correlation
    assert label_correlation > 0.6
    local_correlation = spearmanr(scores_dask, scores_local).correlation
    assert local_correlation > 0.8
    assert_eq(scores_dask, scores_dask_as_local)

    # pred_leaf values should have the right shape
    # and values that look like valid tree nodes
    leaf_indices = lazy_leaf_preds.compute()
    expected_shape = (X.shape[0], distributed_ranker.booster_.num_trees())
    assert leaf_indices.shape == expected_shape
    assert np.max(leaf_indices) <= leaf_limit
    assert np.min(leaf_indices) >= 0
    assert len(np.unique(leaf_indices)) <= leaf_limit

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        categorical_columns = []
        for column in dX.columns:
            if dX.dtypes[column].name == 'category':
                categorical_columns.append(column)
        trees = distributed_ranker.booster_.trees_to_dataframe()
        splits_on_categorical = trees['split_feature'].isin(categorical_columns)
        assert splits_on_categorical.sum() > 0
        decision_types = trees.loc[splits_on_categorical, "decision_type"].unique()
        assert decision_types[0] == '=='

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
Exemplo n.º 5
0
def test_network_params_not_required_but_respected_if_given(
        client, task, output, listen_port):
    """Check that network parameters are optional but honoured when supplied.

    Three estimators are trained for the given ``task``:

    1. no network parameters at all,
    2. an explicit ``machines`` string,
    3. only ``local_listen_port``.

    The first two must train; the third is expected to raise a
    ``LightGBMError``. ``group`` is passed to ``fit()`` only for the
    ranking task.
    """
    is_ranking = task == 'ranking'
    if is_ranking and output == 'scipy_csr_matrix':
        pytest.skip('LGBMRanker is not currently tested on sparse matrices')

    if is_ranking:
        _, _, _, _, dX, dy, _, dg = _create_ranking_data(
            output=output,
            group=None,
            chunk_size=10,
        )
        dask_model_factory = lgb.DaskLGBMRanker
    else:
        _, _, _, dX, dy, _ = _create_data(
            objective=task,
            output=output,
            chunk_size=10,
        )
        dg = None
        if task == 'classification':
            dask_model_factory = lgb.DaskLGBMClassifier
        elif task == 'regression':
            dask_model_factory = lgb.DaskLGBMRegressor

    # extra keyword arguments for fit(): only the ranker receives "group"
    fit_kwargs = {'group': dg} if is_ranking else {}

    # spread the data out so that every worker holds a piece of it
    if output == 'array':
        client.rebalance()

    # estimator settings shared by all three models
    common_kwargs = {'n_estimators': 5, 'num_leaves': 5}

    # --- model 1: no network parameters given ---
    model_without_network_params = dask_model_factory(**common_kwargs)
    model_without_network_params.fit(dX, dy, **fit_kwargs)
    assert model_without_network_params.fitted_
    params_after_fit = model_without_network_params.get_params()
    assert 'local_listen_port' not in params_after_fit
    assert 'machines' not in params_after_fit

    # --- model 2: machines given ---
    # one randomly chosen open port per worker currently known to the scheduler
    worker_count = len(client.scheduler_info()['workers'])
    free_ports = [lgb.dask._find_random_open_port() for _ in range(worker_count)]
    machines_str = ",".join("127.0.0.1:" + str(port) for port in free_ports)
    model_with_machines = dask_model_factory(
        machines=machines_str,
        **common_kwargs
    )

    model_with_machines.fit(dX, dy, **fit_kwargs)
    assert model_with_machines.fitted_
    params_after_fit = model_with_machines.get_params()
    assert 'local_listen_port' not in params_after_fit
    assert 'machines' in params_after_fit

    # --- model 3: local_listen_port given ---
    # training should fail because LightGBM will try to use the same
    # port for multiple worker processes on the same machine
    model_with_listen_port = dask_model_factory(
        local_listen_port=listen_port,
        **common_kwargs
    )
    expected_error = "has multiple Dask worker processes running on it"
    with pytest.raises(lgb.basic.LightGBMError, match=expected_error):
        model_with_listen_port.fit(dX, dy, **fit_kwargs)

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)