Exemplo n.º 1
0
def test_spark_ndcg(spark_data, target_metrics):
    """Check nDCG@k as computed by ``SparkRankingEvaluation``.

    Three cases are covered:

    * the ground truth scored against itself (with ``"rating"`` as the
      prediction column) must yield exactly ``1.0``;
    * the predictions scored with ``k=10`` must equal
      ``target_metrics["ndcg"]``;
    * the predictions scored with ``relevancy_method="by_threshold"`` and
      ``threshold=3.5`` must equal ``target_metrics["ndcg"]`` as well.

    Args:
        spark_data: Fixture unpacking to a ``(true, predicted)`` pair of
            data frames.
        target_metrics: Fixture mapping metric names to expected values.
    """
    truth, predicted = spark_data

    # Ranking the ground truth by its own ratings is expected to score 1.0.
    self_ndcg = SparkRankingEvaluation(
        truth, truth, k=10, col_prediction="rating"
    ).ndcg_at_k()
    assert self_ndcg == 1.0

    top_k_ndcg = SparkRankingEvaluation(truth, predicted, k=10).ndcg_at_k()
    assert top_k_ndcg == target_metrics["ndcg"]

    threshold_ndcg = SparkRankingEvaluation(
        truth,
        predicted,
        relevancy_method="by_threshold",
        threshold=3.5,
    ).ndcg_at_k()
    assert threshold_ndcg == target_metrics["ndcg"]
Exemplo n.º 2
0
def ranking_metrics_pyspark(test, predictions, k=DEFAULT_K):
    """Compute the ranking metrics for ``predictions`` against ``test``.

    A ``SparkRankingEvaluation`` is built with ``relevancy_method="top_k"``
    and the column names from ``COL_DICT``.

    Args:
        test: Ground-truth data passed to ``SparkRankingEvaluation``.
        predictions: Predicted data passed to ``SparkRankingEvaluation``.
        k: Cut-off forwarded to the evaluator; defaults to ``DEFAULT_K``.

    Returns:
        dict: The scores keyed by ``"MAP"``, ``"nDCG@k"``, ``"Precision@k"``
        and ``"Recall@k"``.
    """
    evaluator = SparkRankingEvaluation(
        test,
        predictions,
        k=k,
        relevancy_method="top_k",
        **COL_DICT,
    )

    # Compute the scores one by one, in the order they are reported.
    map_score = evaluator.map_at_k()
    ndcg_score = evaluator.ndcg_at_k()
    precision_score = evaluator.precision_at_k()
    recall_score = evaluator.recall_at_k()

    return {
        "MAP": map_score,
        "nDCG@k": ndcg_score,
        "Precision@k": precision_score,
        "Recall@k": recall_score,
    }
Exemplo n.º 3
0
def ranking_metrics_pyspark(test, predictions, k=DEFAULT_K):
    """Evaluate ``predictions`` against ``test`` with Spark ranking metrics.

    The evaluator is created with ``relevancy_method="top_k"`` and the
    keyword arguments held in ``COL_DICT``.

    Args:
        test: Ground-truth data handed to ``SparkRankingEvaluation``.
        predictions: Predicted data handed to ``SparkRankingEvaluation``.
        k: Cut-off forwarded to the evaluator; defaults to ``DEFAULT_K``.

    Returns:
        dict: Metric values under the keys ``"MAP"``, ``"nDCG@k"``,
        ``"Precision@k"`` and ``"Recall@k"``.
    """
    evaluator_kwargs = dict(k=k, relevancy_method="top_k")
    scorer = SparkRankingEvaluation(
        test, predictions, **evaluator_kwargs, **COL_DICT
    )

    # Fill the result in reporting order: MAP, nDCG, precision, recall.
    scores = {}
    scores["MAP"] = scorer.map_at_k()
    scores["nDCG@k"] = scorer.ndcg_at_k()
    scores["Precision@k"] = scorer.precision_at_k()
    scores["Recall@k"] = scorer.recall_at_k()
    return scores
Exemplo n.º 4
0
def test_spark_python_match(python_data, spark):
    """Check that the Python and Spark ranking metrics agree on the same data.

    For each scenario, recall, precision, nDCG and MAP are computed twice:
    with the Python metric functions on the local frames, and with a
    ``SparkRankingEvaluation`` built from Spark copies of those frames. Each
    pair must be equal within ``TOL`` (passed to ``pytest.approx`` as the
    relative tolerance).

    Scenarios, in order:

    1. the original data with ``k=10``;
    2. the original data with ``k=3``;
    3. the predictions with their first and last rows dropped, ``k=10``;
    4. the trimmed predictions and the ground truth restricted to the user
       whose ``userID`` is 3, ``k=10``.

    Args:
        python_data: Fixture unpacking to a ``(df_true, df_pred)`` pair of
            local data frames (presumably pandas -- they are sliced,
            filtered on ``"userID"`` and given to ``createDataFrame``).
        spark: Fixture exposing ``createDataFrame`` (presumably a
            ``SparkSession``).
    """

    def assert_metrics_match(true_df, pred_df, k):
        """Assert each Python metric equals its Spark counterpart at ``k``."""
        evaluator = SparkRankingEvaluation(
            spark.createDataFrame(true_df),
            spark.createDataFrame(pred_df),
            k=k,
        )
        # One assert per metric (instead of a single ``all([...])``) so a
        # failure report names the metric that diverged and shows both values.
        assert recall_at_k(true_df, pred_df, k=k) == pytest.approx(
            evaluator.recall_at_k(), TOL
        )
        assert precision_at_k(true_df, pred_df, k=k) == pytest.approx(
            evaluator.precision_at_k(), TOL
        )
        assert ndcg_at_k(true_df, pred_df, k=k) == pytest.approx(
            evaluator.ndcg_at_k(), TOL
        )
        assert map_at_k(true_df, pred_df, k=k) == pytest.approx(
            evaluator.map_at_k(), TOL
        )

    df_true, df_pred = python_data

    # Test on the original data with k = 10.
    assert_metrics_match(df_true, df_pred, k=10)

    # Test on the original data with k = 3.
    assert_metrics_match(df_true, df_pred, k=3)

    # Drop the first and the last row of the predictions ([1:-1] trims both
    # ends, not just the first row).
    df_pred = df_pred[1:-1]
    assert_metrics_match(df_true, df_pred, k=10)

    # Test with one user (the filter is applied to the trimmed predictions).
    df_pred = df_pred[df_pred["userID"] == 3]
    df_true = df_true[df_true["userID"] == 3]
    assert_metrics_match(df_true, df_pred, k=10)
def test_spark_python_match(python_data, spark):
    """Verify that Spark ranking metrics match the Python implementations.

    Every scenario converts the local frames to Spark data frames, builds a
    ``SparkRankingEvaluation`` from them and compares recall, precision,
    nDCG and MAP with the values returned by the Python metric functions.
    Values must agree within ``TOL`` (given to ``pytest.approx`` as the
    relative tolerance).

    Scenarios, in order:

    1. the original data with ``k=10``;
    2. the original data with ``k=3``;
    3. the predictions without their first and last rows, ``k=10``;
    4. the trimmed predictions and the ground truth limited to the rows whose
       ``userID`` equals 3, ``k=10``.

    Args:
        python_data: Fixture unpacking to a ``(df_true, df_pred)`` pair of
            local data frames (presumably pandas -- ``.loc`` and slicing are
            used on them).
        spark: Fixture exposing ``createDataFrame`` (presumably a
            ``SparkSession``).
    """

    def check_scenario(true_df, pred_df, k):
        """Compare the four Python metrics with the Spark evaluator at ``k``."""
        evaluator = SparkRankingEvaluation(
            spark.createDataFrame(true_df),
            spark.createDataFrame(pred_df),
            k=k,
        )
        metric_pairs = (
            ("recall", recall_at_k, evaluator.recall_at_k),
            ("precision", precision_at_k, evaluator.precision_at_k),
            ("ndcg", ndcg_at_k, evaluator.ndcg_at_k),
            ("map", map_at_k, evaluator.map_at_k),
        )
        # Assert metric by metric (rather than ``assert all([...])``) so a
        # failure identifies which metric and which k diverged.
        for metric_name, python_metric, spark_metric in metric_pairs:
            python_value = python_metric(true_df, pred_df, k=k)
            assert python_value == pytest.approx(
                spark_metric(), TOL
            ), f"{metric_name}@{k}: Python and Spark values differ"

    df_true, df_pred = python_data

    # Test on the original data with k = 10.
    check_scenario(df_true, df_pred, k=10)

    # Test on the original data with k = 3.
    check_scenario(df_true, df_pred, k=3)

    # Drop the first and the last row of the predictions ([1:-1] removes
    # both ends, not only the first row).
    df_pred = df_pred[1:-1]
    check_scenario(df_true, df_pred, k=10)

    # Test with one user (the trimmed predictions are the ones filtered).
    df_pred = df_pred.loc[df_pred["userID"] == 3]
    df_true = df_true.loc[df_true["userID"] == 3]
    check_scenario(df_true, df_pred, k=10)