示例#1
0
def test_predict_and_score(dataset, datatype, n_neighbors, n_parts, batch_size,
                           client):
    """Compare the distributed KNN regressor against the local one.

    A local ``lKNNReg`` and a distributed ``dKNNReg`` are fitted on the same
    split. Their predictions, indices and distances are handed to
    ``exact_match``, and the R2 scores must agree within ``1e-2``. For
    ``'dask_array'`` inputs the distributed score comes from
    ``dKNNReg.score``; otherwise it is recomputed with ``r2_score`` from the
    distributed predictions.
    """
    X_train, X_test, y_train, y_test = dataset
    # ``y_test`` is rebound to a Dask collection below; keep the original
    # labels around for the hand-computed score.
    host_y_test = y_test

    is_cudf = datatype == 'dask_cudf'
    is_array = datatype == 'dask_array'

    # Local reference model and its score.
    local_model = lKNNReg(n_neighbors=n_neighbors)
    local_model.fit(X_train, y_train)
    local_distances, local_indices = local_model.kneighbors(X_test)
    local_predictions = local_model.predict(X_test)
    local_out = (local_predictions, local_indices, local_distances)
    expected_score = round(float(r2_score(y_test, local_predictions)), 3)

    # Partition every piece of the split, in the same order as it was given.
    X_train, X_test, y_train, y_test = [
        generate_dask_array(piece, n_parts)
        for piece in (X_train, X_test, y_train, y_test)
    ]
    if is_cudf:
        X_train, X_test, y_train, y_test = [
            to_dask_cudf(piece, client)
            for piece in (X_train, X_test, y_train, y_test)
        ]

    dist_model = dKNNReg(client=client,
                         n_neighbors=n_neighbors,
                         batch_size=batch_size)
    dist_model.fit(X_train, y_train)
    d_outputs, d_indices, d_distances = dist_model.predict(
        X_test, convert_dtype=True)
    distributed_out = da.compute(d_outputs, d_indices, d_distances)

    if is_array:
        distributed_score = round(
            float(dist_model.score(X_test, y_test)), 3)

    if is_cudf:
        def _as_host_matrix(result):
            # DataFrames become matrices; anything else is converted with
            # ``to_array`` and given a trailing axis.
            if isinstance(result, DataFrame):
                return result.as_matrix()
            return result.to_array()[..., np.newaxis]

        distributed_out = [_as_host_matrix(item) for item in distributed_out]

    exact_match(local_out, distributed_out)

    if is_array:
        assert distributed_score == pytest.approx(expected_score, abs=1e-2)
    else:
        y_pred = distributed_out[0]
        recomputed_score = round(float(r2_score(host_y_test, y_pred)), 3)
        assert recomputed_score == pytest.approx(expected_score, abs=1e-2)
def test_predict_1D_labels(input_type, client):
    """Smoke test: fit and predict with 1D labels must not crash.

    Parameters
    ----------
    input_type : str
        ``'array'`` wraps the generated data with ``da.from_array``;
        ``'dataframe'`` converts it to a cuDF DataFrame/Series and wraps it
        with ``dd.from_pandas`` using a single partition.
    client
        Requested by the test but not referenced directly in the body.

    Raises
    ------
    ValueError
        If ``input_type`` is not one of the supported values. Without this
        check ``dX``/``dy`` would be left unbound and the test would fail
        later with an unrelated-looking error.
    """
    if input_type not in ('array', 'dataframe'):
        raise ValueError(
            "Unsupported input_type: {!r}".format(input_type))

    X, y = make_regression(n_samples=10000)
    if input_type == 'array':
        dX = da.from_array(X)
        dy = da.from_array(y)
    else:
        X = cudf.DataFrame(X)
        y = cudf.Series(y)
        dX = dd.from_pandas(X, npartitions=1)
        dy = dd.from_pandas(y, npartitions=1)

    # No assertions: the test passes as long as nothing raises.
    clf = dKNNReg()
    clf.fit(dX, dy)
    clf.predict(dX)
示例#3
0
def test_score(dataset, datatype, n_neighbors, n_parts, client):
    """Check that ``dKNNReg.score`` equals a manually computed score.

    The manual score is ``accuracy_score`` of the computed test labels
    against the first element of the distributed predict output. When
    ``n_parts`` is falsy, the number of entries reported by
    ``client.has_what()`` is used instead.
    """
    X_train, X_test, y_train, y_test = dataset

    n_parts = n_parts or len(client.has_what().keys())
    is_cudf = datatype == 'dask_cudf'

    # Partition the whole split, then optionally convert it to dask_cudf.
    X_train, X_test, y_train, y_test = [
        generate_dask_array(piece, n_parts)
        for piece in (X_train, X_test, y_train, y_test)
    ]
    if is_cudf:
        X_train, X_test, y_train, y_test = [
            to_dask_cudf(piece, client)
            for piece in (X_train, X_test, y_train, y_test)
        ]

    model = dKNNReg(client=client, n_neighbors=n_neighbors)
    model.fit(X_train, y_train)
    d_outputs, d_indices, d_distances = model.predict(
        X_test, convert_dtype=True)
    distributed_out = da.compute(d_outputs, d_indices, d_distances)

    if is_cudf:
        def _as_host_matrix(result):
            # DataFrames become matrices; anything else is converted with
            # ``to_array`` and given a trailing axis.
            if isinstance(result, DataFrame):
                return result.as_matrix()
            return result.to_array()[..., np.newaxis]

        distributed_out = [_as_host_matrix(item) for item in distributed_out]

    cuml_score = model.score(X_test, y_test)

    # Materialise the labels for the manual computation.
    computed_labels = y_test.compute()
    y_test = computed_labels.as_matrix() if is_cudf else computed_labels
    manual_score = accuracy_score(y_test, distributed_out[0])

    assert cuml_score == manual_score
示例#4
0
def test_predict(dataset, datatype, n_neighbors, n_parts, batch_size, client):
    """Check distributed KNN predictions against the local model.

    A local ``lKNNReg`` and a distributed ``dKNNReg`` are fitted on the same
    training data. Their predictions, indices and distances are passed to
    ``match_test``, and the ``accuracy_score`` between the two sets of
    predictions must exceed ``0.12``. When ``n_parts`` is falsy, the number
    of entries reported by ``client.has_what()`` is used instead.
    """
    X_train, X_test, y_train, _ = dataset

    # Local reference model.
    l_model = lKNNReg(n_neighbors=n_neighbors)
    l_model.fit(X_train, y_train)
    l_distances, l_indices = l_model.kneighbors(X_test)
    l_outputs = l_model.predict(X_test)
    local_out = (l_outputs, l_indices, l_distances)

    if not n_parts:
        n_parts = len(client.has_what().keys())

    X_train = generate_dask_array(X_train, n_parts)
    X_test = generate_dask_array(X_test, n_parts)
    y_train = generate_dask_array(y_train, n_parts)

    if datatype == 'dask_cudf':
        X_train = to_dask_cudf(X_train, client)
        X_test = to_dask_cudf(X_test, client)
        y_train = to_dask_cudf(y_train, client)

    d_model = dKNNReg(client=client,
                      n_neighbors=n_neighbors,
                      batch_size=batch_size)
    d_model.fit(X_train, y_train)
    d_outputs, d_indices, d_distances = \
        d_model.predict(X_test, convert_dtype=True)
    distributed_out = da.compute(d_outputs, d_indices, d_distances)

    if datatype == 'dask_cudf':
        # DataFrames become matrices; anything else is converted with
        # ``to_array`` and given a trailing axis.
        distributed_out = [
            o.as_matrix()
            if isinstance(o, DataFrame) else o.to_array()[..., np.newaxis]
            for o in distributed_out
        ]

    match_test(local_out, distributed_out)
    # This must be asserted: a bare comparison is evaluated and discarded,
    # so a low agreement would never fail the test.
    assert accuracy_score(local_out[0], distributed_out[0]) > 0.12
def test_predict_and_score(dataset, datatype, parameters, client):
    """Compare distributed KNN regression output and score with the local one.

    ``parameters`` unpacks to ``(n_neighbors, n_parts, batch_size)``. The
    distributed predictions are passed to ``exact_match`` together with the
    local predictions, and ``dKNNReg.score`` must agree with the local
    ``r2_score`` within ``1e-2``.
    """
    n_neighbors, n_parts, batch_size = parameters
    X_train, X_test, y_train, y_test = dataset

    # Local reference predictions and score.
    reference_model = lKNNReg(n_neighbors=n_neighbors)
    reference_model.fit(X_train, y_train)
    l_outputs = reference_model.predict(X_test)
    expected_score = round(float(r2_score(y_test, l_outputs)), 3)

    # Partition the whole split, then optionally convert it to dask_cudf.
    X_train, X_test, y_train, y_test = [
        generate_dask_array(piece, n_parts)
        for piece in (X_train, X_test, y_train, y_test)
    ]
    if datatype == 'dask_cudf':
        X_train, X_test, y_train, y_test = [
            to_dask_cudf(piece, client)
            for piece in (X_train, X_test, y_train, y_test)
        ]

    dist_model = dKNNReg(client=client,
                         n_neighbors=n_neighbors,
                         batch_size=batch_size)
    dist_model.fit(X_train, y_train)
    predictions = dist_model.predict(X_test, convert_dtype=True).compute()

    # DataFrame results are converted to NumPy before the comparison.
    if isinstance(predictions, DataFrame):
        predictions = predictions.to_numpy()

    exact_match(l_outputs, predictions)

    actual_score = round(float(dist_model.score(X_test, y_test)), 3)
    assert actual_score == pytest.approx(expected_score, abs=1e-2)