def test_predict_proba(dataset, datatype, n_neighbors, n_parts, batch_size,
                       client):
    """Compare distributed class probabilities with the local model's.

    A local classifier (``lKNNClf``) and a distributed one (``dKNNClf``) are
    fitted on the same training split, and their ``predict_proba`` outputs
    for the test split are handed to ``check_probabilities``.

    Parameters
    ----------
    dataset
        ``(X_train, X_test, y_train, y_test)``; ``y_test`` is unused here.
    datatype
        ``'dask_cudf'`` converts the dask arrays with ``to_dask_cudf``;
        any other value keeps them as dask arrays.
    n_neighbors
        Forwarded to both classifiers.
    n_parts
        Partition count given to ``generate_dask_array``. A falsy value
        falls back to the number of entries in ``client.has_what()``.
    batch_size
        Forwarded to the distributed classifier.
    client
        Passed to ``dKNNClf`` and ``to_dask_cudf``.
    """
    X_train, X_test, y_train, y_test = dataset

    # Reference result from the local model.
    l_model = lKNNClf(n_neighbors=n_neighbors)
    l_model.fit(X_train, y_train)
    l_probas = l_model.predict_proba(X_test)

    # Same fallback the sibling tests (test_predict / test_score) apply when
    # no explicit partition count is supplied.
    if not n_parts:
        n_parts = len(client.has_what().keys())

    X_train = generate_dask_array(X_train, n_parts)
    X_test = generate_dask_array(X_test, n_parts)
    y_train = generate_dask_array(y_train, n_parts)

    if datatype == 'dask_cudf':
        X_train = to_dask_cudf(X_train, client)
        X_test = to_dask_cudf(X_test, client)
        y_train = to_dask_cudf(y_train, client)

    # batch_size used to be accepted but silently dropped, so varying it
    # never changed what was tested; forward it as test_predict does.
    d_model = dKNNClf(client=client, n_neighbors=n_neighbors,
                      batch_size=batch_size)
    d_model.fit(X_train, y_train)
    d_probas = d_model.predict_proba(X_test, convert_dtype=True)
    # da.compute returns a tuple with one entry per argument.
    d_probas = da.compute(d_probas)[0]

    if datatype == 'dask_cudf':
        # DataFrames become matrices; anything else goes through to_array()
        # and gains a trailing axis.
        d_probas = [
            o.as_matrix() if isinstance(o, DataFrame)
            else o.to_array()[..., np.newaxis]
            for o in d_probas
        ]

    check_probabilities(l_probas, d_probas)
def test_predict_and_score(dataset, datatype, parameters, client):
    """Check distributed labels/indices/distances and the resulting score.

    ``parameters`` unpacks to ``(n_neighbors, n_parts, batch_size)`` and
    ``dataset`` to ``(X_train, X_test, y_train, y_test)``. The local model's
    labels, indices and distances must pass ``exact_match`` against the
    distributed ones. For ``'dask_array'`` the model's own ``score`` is
    compared with a hand-computed local accuracy; otherwise a hand-computed
    distributed accuracy is used instead.

    NOTE(review): another ``test_predict_and_score`` appears later in this
    source; if both live in one module the later definition shadows this
    one -- confirm.
    """
    n_neighbors, n_parts, batch_size = parameters
    X_train, X_test, y_train, y_test = dataset
    # Keep the untouched labels for the manual accuracy computed at the end.
    host_y_test = y_test

    # --- local reference ---------------------------------------------------
    local_clf = lKNNClf(n_neighbors=n_neighbors)
    local_clf.fit(X_train, y_train)
    l_distances, l_indices = local_clf.kneighbors(X_test)
    l_labels = local_clf.predict(X_test)
    expected = (l_labels, l_indices, l_distances)
    local_score = round(np.mean(y_test == l_labels), 3)

    # --- distributed inputs ------------------------------------------------
    X_train, X_test, y_train, y_test = (
        generate_dask_array(part, n_parts)
        for part in (X_train, X_test, y_train, y_test)
    )

    use_cudf = datatype == 'dask_cudf'
    use_array = datatype == 'dask_array'

    if use_cudf:
        X_train, X_test, y_train, y_test = (
            to_dask_cudf(part, client)
            for part in (X_train, X_test, y_train, y_test)
        )

    # --- distributed model -------------------------------------------------
    dist_clf = dKNNClf(client=client, n_neighbors=n_neighbors,
                       batch_size=batch_size)
    dist_clf.fit(X_train, y_train)
    d_labels, d_indices, d_distances = dist_clf.predict(
        X_test, convert_dtype=True)
    actual = da.compute(d_labels, d_indices, d_distances)

    if use_array:
        dist_score = round(dist_clf.score(X_test, y_test), 3)

    if use_cudf:
        # DataFrames become matrices; anything else goes through to_array()
        # and gains a trailing axis.
        actual = [
            o.as_matrix() if isinstance(o, DataFrame)
            else o.to_array()[..., np.newaxis]
            for o in actual
        ]

    exact_match(expected, actual)

    if use_array:
        assert dist_score == pytest.approx(local_score, abs=1e-2)
    else:
        manual_dist_score = round(np.mean(host_y_test == actual[0]), 3)
        assert manual_dist_score == pytest.approx(local_score, abs=1e-2)
def test_predict_1D_labels(input_type, client):
    """Smoke test: fit and predict with 1D labels must not crash.

    ``input_type`` chooses between dask arrays (``'array'``) and a
    single-partition dask collection built from cudf objects
    (``'dataframe'``). Nothing is asserted; the test passes if no call
    raises.
    """
    features, labels = make_classification(n_samples=10000)

    if input_type == 'array':
        dX, dy = da.from_array(features), da.from_array(labels)
    elif input_type == 'dataframe':
        gdf = cudf.DataFrame(features)
        gseries = cudf.Series(labels)
        dX = dd.from_pandas(gdf, npartitions=1)
        dy = dd.from_pandas(gseries, npartitions=1)

    model = dKNNClf()
    model.fit(dX, dy)
    model.predict(dX)
def test_score(dataset, datatype, n_neighbors, n_parts, client):
    """Check that ``dKNNClf.score`` agrees with a manually computed accuracy.

    The distributed model is fitted on the training split, then its
    predicted labels for the test split are compared element-wise with
    ``y_test``. The mean of that comparison must match ``score``.

    Parameters
    ----------
    dataset
        ``(X_train, X_test, y_train, y_test)``.
    datatype
        ``'dask_cudf'`` converts the dask arrays with ``to_dask_cudf``;
        any other value keeps them as dask arrays.
    n_neighbors
        Forwarded to the distributed classifier.
    n_parts
        Partition count given to ``generate_dask_array``. A falsy value
        falls back to the number of entries in ``client.has_what()``.
    client
        Passed to ``dKNNClf`` and ``to_dask_cudf``.
    """
    X_train, X_test, y_train, y_test = dataset

    if not n_parts:
        n_parts = len(client.has_what().keys())

    X_train = generate_dask_array(X_train, n_parts)
    X_test = generate_dask_array(X_test, n_parts)
    y_train = generate_dask_array(y_train, n_parts)
    y_test = generate_dask_array(y_test, n_parts)

    if datatype == 'dask_cudf':
        X_train = to_dask_cudf(X_train, client)
        X_test = to_dask_cudf(X_test, client)
        y_train = to_dask_cudf(y_train, client)
        y_test = to_dask_cudf(y_test, client)

    d_model = dKNNClf(client=client, n_neighbors=n_neighbors)
    d_model.fit(X_train, y_train)
    d_labels, d_indices, d_distances = d_model.predict(
        X_test, convert_dtype=True)
    distributed_out = da.compute(d_labels, d_indices, d_distances)

    if datatype == 'dask_cudf':
        # DataFrames become matrices; anything else goes through to_array()
        # and gains a trailing axis.
        distributed_out = [
            o.as_matrix() if isinstance(o, DataFrame)
            else o.to_array()[..., np.newaxis]
            for o in distributed_out
        ]

    cuml_score = d_model.score(X_test, y_test)

    if datatype == 'dask_cudf':
        y_test = y_test.compute().as_matrix()
    else:
        y_test = y_test.compute()
    manual_score = np.mean(y_test == distributed_out[0])

    # Both values are floating-point means that may be computed along
    # different code paths, so compare with a tolerance rather than exact
    # equality (pytest.approx defaults to a relative tolerance of 1e-6).
    assert cuml_score == pytest.approx(manual_score)
def test_predict(dataset, datatype, n_neighbors, n_parts, batch_size, client):
    """Compare distributed predictions with the local model's.

    The local classifier's labels, indices and distances are handed to
    ``match_test`` together with the distributed ones. The distributed
    labels must also reach an ``accuracy_score`` above 0.12 against
    ``y_test``. A falsy ``n_parts`` falls back to the number of entries in
    ``client.has_what()``.
    """
    X_train, X_test, y_train, y_test = dataset

    # --- local reference ---------------------------------------------------
    local_clf = lKNNClf(n_neighbors=n_neighbors)
    local_clf.fit(X_train, y_train)
    l_distances, l_indices = local_clf.kneighbors(X_test)
    l_labels = local_clf.predict(X_test)
    expected = (l_labels, l_indices, l_distances)

    # --- distributed inputs ------------------------------------------------
    if not n_parts:
        n_parts = len(client.has_what().keys())

    X_train, X_test, y_train = (
        generate_dask_array(part, n_parts)
        for part in (X_train, X_test, y_train)
    )

    use_cudf = datatype == 'dask_cudf'
    if use_cudf:
        X_train, X_test, y_train = (
            to_dask_cudf(part, client)
            for part in (X_train, X_test, y_train)
        )

    # --- distributed model -------------------------------------------------
    dist_clf = dKNNClf(client=client, n_neighbors=n_neighbors,
                       batch_size=batch_size)
    dist_clf.fit(X_train, y_train)
    d_labels, d_indices, d_distances = dist_clf.predict(
        X_test, convert_dtype=True)
    actual = da.compute(d_labels, d_indices, d_distances)

    if use_cudf:
        # DataFrames become matrices; anything else goes through to_array()
        # and gains a trailing axis.
        actual = [
            o.as_matrix() if isinstance(o, DataFrame)
            else o.to_array()[..., np.newaxis]
            for o in actual
        ]

    match_test(expected, actual)
    assert accuracy_score(y_test, actual[0]) > 0.12
def test_predict_and_score(dataset, datatype, parameters, client):
    """Check distributed predictions and score against the local model.

    ``parameters`` unpacks to ``(n_neighbors, n_parts, batch_size)`` and
    ``dataset`` to ``(X_train, X_test, y_train, y_test)``. The distributed
    predictions must pass ``exact_match`` against the local ones, and the
    distributed ``score`` (rounded to 3 places) must be within 1e-2 of the
    hand-computed local accuracy.

    NOTE(review): an earlier ``test_predict_and_score`` appears in this
    source; if both live in one module this definition shadows it --
    confirm.
    """
    n_neighbors, n_parts, batch_size = parameters
    X_train, X_test, y_train, y_test = dataset

    # --- local reference ---------------------------------------------------
    local_clf = lKNNClf(n_neighbors=n_neighbors)
    local_clf.fit(X_train, y_train)
    expected = local_clf.predict(X_test)
    local_score = round(np.mean(y_test == expected), 3)

    # --- distributed inputs ------------------------------------------------
    X_train, X_test, y_train, y_test = (
        generate_dask_array(part, n_parts)
        for part in (X_train, X_test, y_train, y_test)
    )

    if datatype == 'dask_cudf':
        X_train, X_test, y_train, y_test = (
            to_dask_cudf(part, client)
            for part in (X_train, X_test, y_train, y_test)
        )

    # --- distributed model -------------------------------------------------
    dist_clf = dKNNClf(client=client, n_neighbors=n_neighbors,
                       batch_size=batch_size)
    dist_clf.fit(X_train, y_train)
    actual = dist_clf.predict(X_test, convert_dtype=True).compute()
    # Only DataFrame results need converting before the comparison.
    if isinstance(actual, DataFrame):
        actual = actual.to_numpy()

    exact_match(expected, actual)

    dist_score = round(dist_clf.score(X_test, y_test), 3)
    assert dist_score == pytest.approx(local_score, abs=1e-2)