Пример #1
0
def test_basic_fit_predict(client):
    """Fit MultinomialNB on the text corpus and check training accuracy.

    The model is fit on the ``(X, y)`` pair returned by
    ``load_text_corpus(client)`` and then asked to predict the same ``X``;
    the resulting accuracy must be strictly greater than 0.97.
    """
    X, y = load_text_corpus(client)

    classifier = MultinomialNB()
    classifier.fit(X, y)

    # Materialize the lazy predictions and labels before scoring.
    predicted = classifier.predict(X).compute()
    expected = y.compute()

    # presumably ``.get()`` copies the computed predictions to host memory
    # so they can be compared against ``expected`` -- TODO confirm
    accuracy = accuracy_score(predicted.get(), expected)
    assert accuracy > .97
Пример #2
0
def post_etl_processing(client, train_data, test_data):
    """Train a naive Bayes model, score it, and attach its predictions.

    Features and labels are derived from ``train_data`` and ``test_data``
    with ``build_features`` / ``build_labels``. A distributed
    ``MultinomialNB`` (``alpha=0.001``) is fit on the training split and used
    to predict the test split. Accuracy, macro-averaged precision and the
    confusion matrix are computed and printed.

    Note that ``test_data`` receives a new ``"prediction"`` column via item
    assignment before the result frame is selected from it.

    Parameters
    ----------
    client
        Client handed to the model, to the metric functions and used to
        submit the per-partition ``cudf.Series`` conversions.
    train_data
        Training split; input to ``build_features`` and ``build_labels``.
    test_data
        Test split; must expose the ``"pr_review_sk"`` and
        ``"pr_review_rating"`` columns selected for the result.

    Returns
    -------
    tuple
        ``(final_data, acc, prec, cmat)`` where ``final_data`` holds the
        ``pr_review_sk``, ``pr_review_rating`` and ``prediction`` columns
        sorted by ``pr_review_sk`` with a fresh index.
    """
    import cudf
    from cuml.dask.naive_bayes import MultinomialNB as DistMNB
    from cuml.dask.common import to_dask_cudf
    from cuml.dask.common.input_utils import DistributedDataHandler

    # Feature engineering
    X_train = build_features(train_data)
    X_test = build_features(test_data)

    y_train = build_labels(train_data)
    y_test = build_labels(test_data)

    # Perform ML
    model = DistMNB(client=client, alpha=0.001)
    model.fit(X_train, y_train)

    # NOTE(review): an earlier note flagged this prediction step as the
    # suspected source of a regression -- confirm before changing it.
    y_hat = model.predict(X_test).persist()

    # Compute distributed performance metrics
    acc = accuracy_score(client, y_test, y_hat)
    print("Accuracy: " + str(acc))

    prec = precision_score(client, y_test, y_hat, average="macro")
    print("Precision: " + str(prec))

    cmat = confusion_matrix(client, y_test, y_hat)
    print("Confusion Matrix: " + str(cmat))

    # Place results back in original Dataframe.
    # Each entry of ``gpu_futures`` is unpacked as a pair; only the second
    # element (the partition) is needed to build the Series.
    ddh = DistributedDataHandler.create(y_hat)
    test_preds = to_dask_cudf(
        [client.submit(cudf.Series, part) for _, part in ddh.gpu_futures])

    test_preds = test_preds.map_partitions(categoricalize)

    test_data["prediction"] = test_preds

    final_data = test_data[["pr_review_sk", "pr_review_rating",
                            "prediction"]].persist()

    final_data = final_data.sort_values("pr_review_sk").reset_index(drop=True)
    # Block until the persisted result has finished computing.
    wait(final_data)
    return final_data, acc, prec, cmat
Пример #3
0
def test_model_multiple_chunks(client, dtype):
    """Exercise naive Bayes with more than one chunk of input.

    Related to https://github.com/rapidsai/cuml/issues/3150.

    This is a code-coverage test: the data set is too small for a meaningful
    numeric check, so ``score`` is only called to run the whole model and its
    result is checked to lie in ``[0, 1]``.
    """
    samples = cp.array([[0, 0, 0, 1], [1, 0, 0, 1], [1, 0, 0, 0]])

    # Split the three rows into three single-row chunks; keep columns whole.
    row_chunks = (1, 1, 1)
    X = dask.array.from_array(samples, chunks=(row_chunks, -1)).astype(dtype)

    # One label per chunk as well.
    y = dask.array.from_array(
        [1, 0, 0], asarray=False, fancy=False, chunks=1
    ).astype(cp.int32)

    classifier = MultinomialNB()
    classifier.fit(X, y)

    accuracy = classifier.score(X, y)
    assert 0 <= accuracy <= 1
Пример #4
0
def test_single_distributed_exact_results(client):
    """Check that MultinomialNB and SGNB agree on every prediction.

    Both models are fit on the same corpus: ``MultinomialNB`` on the
    collections returned by ``load_text_corpus`` and ``SGNB`` on their
    ``.compute()``-d counterparts. The two sets of predictions must match
    exactly (accuracy of one against the other equal to 1.0).
    """
    X, y = load_text_corpus(client)

    # Materialized copies of the inputs for the SGNB model.
    local_X = X.compute()
    local_y = y.compute()

    dist_model = MultinomialNB()
    dist_model.fit(X, y)

    single_model = SGNB()
    single_model.fit(local_X, local_y)

    dist_pred = dist_model.predict(X)
    single_pred = single_model.predict(local_X).get()

    dist_pred = dist_pred.compute().get()

    agreement = accuracy_score(dist_pred, single_pred)
    assert agreement == 1.0
Пример #5
0
def test_score(client):
    """Check that ``model.score`` equals the accuracy of its predictions.

    The model is fit on the corpus from ``load_text_corpus``; the value
    returned by ``score(X, y)`` must equal ``accuracy_score`` evaluated on
    the model's own predictions for ``X`` against ``y``.
    """
    X, y = load_text_corpus(client)

    classifier = MultinomialNB()
    classifier.fit(X, y)

    predictions = classifier.predict(X)
    reported_score = classifier.score(X, y)

    # Materialize predictions and labels for the reference computation.
    computed_predictions = predictions.compute()
    computed_labels = y.compute()

    reference_score = accuracy_score(computed_predictions.get(),
                                     computed_labels)

    # NOTE(review): this is an exact equality check on the two scores;
    # consider a tolerance if it ever turns out to be flaky.
    assert reference_score == reported_score