Пример #1
0
def test_recommend_k_items(threshold, similarity_type, file, header,
                           sar_settings, demo_usage_data):
    """Top-k recommendations for the test user must match the reference file."""
    # Anchor time decay at the newest timestamp present in the data.
    latest_ts = demo_usage_data[header["col_timestamp"]].max()
    model = SARSingleNode(
        similarity_type=similarity_type,
        timedecay_formula=True,
        time_decay_coefficient=30,
        time_now=latest_ts,
        threshold=threshold,
        **header,
    )
    model.fit(demo_usage_data)

    reference_path = (sar_settings["FILE_DIR"] + "userpred_" + file +
                      str(threshold) + "_userid_only.csv")
    true_items, true_scores = load_userpred(reference_path)

    user_mask = (demo_usage_data[header["col_user"]] ==
                 sar_settings["TEST_USER_ID"])
    test_results = model.recommend_k_items(
        demo_usage_data[user_mask],
        top_k=10,
        sort_top_k=True,
        remove_seen=True,
    )

    test_items = list(test_results[header["col_item"]])
    test_scores = np.array(test_results["prediction"])
    assert true_items == test_items
    assert np.allclose(true_scores, test_scores, atol=sar_settings["ATOL"])
Пример #2
0
def test_fit(similarity_type, timedecay_formula, train_test_dummy_timestamp,
             header):
    """Smoke test: fitting succeeds for the given similarity/decay settings."""
    trainset, _ = train_test_dummy_timestamp
    model = SARSingleNode(
        similarity_type=similarity_type,
        timedecay_formula=timedecay_formula,
        **header,
    )
    model.fit(trainset)
Пример #3
0
def test_get_popularity_based_topk(header):
    """Popularity-based top-k returns the most frequently rated items."""
    users = [1, 1, 1, 2, 2, 2, 3, 3, 3]
    items = [1, 2, 3, 1, 3, 4, 5, 6, 1]
    ratings = [1, 2, 3, 1, 2, 3, 1, 2, 3]
    train_df = pd.DataFrame({
        header["col_user"]: users,
        header["col_item"]: items,
        header["col_rating"]: ratings,
    })

    sar = SARSingleNode(**header)
    sar.fit(train_df)

    # Expected prediction equals occurrence count: item 1 -> 3, item 3 -> 2,
    # item 4 -> 1.
    expected = pd.DataFrame(dict(MovieId=[1, 3, 4], prediction=[3, 2, 1]))
    actual = sar.get_popularity_based_topk(top_k=3, sort_top_k=True)
    assert_frame_equal(expected, actual)
Пример #4
0
def test_predict(similarity_type, timedecay_formula,
                 train_test_dummy_timestamp, header):
    """predict() preserves id dtypes and uses the training rating dtype."""
    trainset, testset = train_test_dummy_timestamp
    model = SARSingleNode(
        similarity_type=similarity_type,
        timedecay_formula=timedecay_formula,
        **header,
    )
    model.fit(trainset)
    preds = model.predict(testset)

    assert isinstance(preds, pd.DataFrame)
    assert len(preds) == 2
    # User/item id columns keep the dtypes of the training data.
    for col in (header["col_user"], header["col_item"]):
        assert preds[col].dtype == trainset[col].dtype
    # Prediction column inherits the rating dtype.
    assert preds[DEFAULT_PREDICTION_COL].dtype == trainset[
        header["col_rating"]].dtype
Пример #5
0
def test_predict_all_items(train_test_dummy_timestamp, header):
    """Score the full (user x item) cross product and check sizes and dtypes."""
    trainset, _ = train_test_dummy_timestamp
    model = SARSingleNode(**header)
    model.fit(trainset)

    # Cartesian product of every known user with every known item.
    all_pairs = itertools.product(trainset[header["col_user"]].unique(),
                                  trainset[header["col_item"]].unique())
    testset = pd.DataFrame(all_pairs,
                           columns=[header["col_user"], header["col_item"]])
    preds = model.predict(testset)

    assert isinstance(preds, pd.DataFrame)
    assert len(preds) == len(testset)
    # Id columns keep the training dtypes; predictions use the rating dtype.
    for col in (header["col_user"], header["col_item"]):
        assert preds[col].dtype == trainset[col].dtype
    assert preds[DEFAULT_PREDICTION_COL].dtype == trainset[
        header["col_rating"]].dtype
Пример #6
0
def test_match_similarity_type_from_json_file(header):
    """fit() must still work when similarity_type is loaded via JSON."""
    # Round-trip the parameter through a JSON string, as if read from a file.
    params = json.loads(json.dumps({'similarity_type': 'lift'}))
    params.update(header)

    model = SARSingleNode(**params)

    train = pd.DataFrame({
        header["col_user"]: [1, 1, 1, 1, 2, 2, 2, 2],
        header["col_item"]: [1, 2, 3, 4, 1, 5, 6, 7],
        header["col_rating"]: [3.0, 4.0, 5.0, 4.0, 3.0, 2.0, 1.0, 5.0],
        header["col_timestamp"]: [1, 20, 30, 400, 50, 60, 70, 800],
    })

    model.fit(train)
Пример #7
0
def test_sar_item_similarity(threshold, similarity_type, file, demo_usage_data,
                             sar_settings, header):
    """Fitted item-similarity matrix must match the stored reference."""
    model = SARSingleNode(similarity_type=similarity_type,
                          timedecay_formula=False,
                          time_decay_coefficient=30,
                          threshold=threshold,
                          **header)

    # Keep only the most recent interaction for each (user, item) pair.
    deduped = (demo_usage_data
               .sort_values(header["col_timestamp"], ascending=False)
               .drop_duplicates([header["col_user"], header["col_item"]],
                                keep="first"))

    model.fit(deduped)

    true_item_similarity, row_ids, col_ids = read_matrix(
        sar_settings["FILE_DIR"] + "sim_" + file + str(threshold) + ".csv")

    if similarity_type == "cooccurrence":
        # Co-occurrence output is compared exactly (sparse -> dense first).
        test_item_similarity = _rearrange_to_test(
            model.item_similarity.todense(),
            row_ids, col_ids,
            model.item2index, model.item2index,
        )
        assert np.array_equal(
            true_item_similarity.astype(test_item_similarity.dtype),
            test_item_similarity,
        )
    else:
        # Other similarity types are compared within tolerance.
        test_item_similarity = _rearrange_to_test(
            model.item_similarity,
            row_ids, col_ids,
            model.item2index, model.item2index,
        )
        assert np.allclose(
            true_item_similarity.astype(test_item_similarity.dtype),
            test_item_similarity,
            atol=sar_settings["ATOL"],
        )
Пример #8
0
def test_init(header):
    """Constructor wires up column names and default algorithm settings."""
    model = SARSingleNode(similarity_type="jaccard", **header)

    # Column names come from the header fixture.
    assert model.col_user == "UserId"
    assert model.col_item == "MovieId"
    assert model.col_rating == "Rating"
    assert model.col_timestamp == "Timestamp"
    assert model.col_prediction == "prediction"

    # Algorithm defaults.
    assert model.similarity_type == "jaccard"
    assert model.time_decay_half_life == 2592000  # 30 days in seconds
    assert not model.time_decay_flag
    assert model.time_now is None
    assert model.threshold == 1
Пример #9
0
def test_user_affinity(demo_usage_data, sar_settings, header):
    """Compare the test user's affinity row against reference values on disk."""
    # Anchor time decay at the newest timestamp in the data.
    time_now = demo_usage_data[header["col_timestamp"]].max()
    model = SARSingleNode(similarity_type="cooccurrence",
                          timedecay_formula=True,
                          time_decay_coefficient=30,
                          time_now=time_now,
                          **header)
    model.fit(demo_usage_data)

    true_user_affinity, items = load_affinity(sar_settings["FILE_DIR"] +
                                              "user_aff.csv")
    user_index = model.user2index[sar_settings["TEST_USER_ID"]]
    # Reorder affinity columns to the reference item order, take the test
    # user's row, densify it, and flatten to 1-D for comparison.
    sar_user_affinity = np.reshape(
        np.array(
            _rearrange_to_test(model.user_affinity, None, items, None,
                               model.item2index)[user_index, ].todense()),
        -1,
    )
    assert np.allclose(
        true_user_affinity.astype(sar_user_affinity.dtype),
        sar_user_affinity,
        atol=sar_settings["ATOL"],
    )
Пример #10
0
def test_sar_item_similarity(threshold, similarity_type, file, demo_usage_data,
                             sar_settings, header):
    """Fitted item-similarity must match the stored reference matrix."""
    model = SARSingleNode(similarity_type=similarity_type,
                          timedecay_formula=False,
                          time_decay_coefficient=30,
                          threshold=threshold,
                          **header)
    model.fit(demo_usage_data)

    ref_path = (sar_settings["FILE_DIR"] + "sim_" + file + str(threshold) +
                ".csv")
    true_item_similarity, row_ids, col_ids = read_matrix(ref_path)

    is_cooccurrence = similarity_type == "cooccurrence"
    # Co-occurrence similarity is sparse and must be densified before use.
    matrix = (model.item_similarity.todense()
              if is_cooccurrence else model.item_similarity)
    test_item_similarity = _rearrange_to_test(matrix, row_ids, col_ids,
                                              model.item2index,
                                              model.item2index)
    reference = true_item_similarity.astype(test_item_similarity.dtype)

    if is_cooccurrence:
        # Raw co-occurrence is compared exactly.
        assert np.array_equal(reference, test_item_similarity)
    else:
        # Derived similarities are compared within tolerance.
        assert np.allclose(reference, test_item_similarity,
                           atol=sar_settings["ATOL"])
Пример #11
0
def test_get_normalized_scores(header):
    """Normalized scores, with and without previously seen items removed."""
    train = pd.DataFrame({
        header["col_user"]: [1, 1, 1, 1, 2, 2, 2, 2],
        header["col_item"]: [1, 2, 3, 4, 1, 5, 6, 7],
        header["col_rating"]: [3.0, 4.0, 5.0, 4.0, 3.0, 2.0, 1.0, 5.0],
        header["col_timestamp"]: [1, 20, 30, 400, 50, 60, 70, 800],
    })
    test = pd.DataFrame({
        header["col_user"]: [1, 1, 1, 2, 2, 2],
        header["col_item"]: [5, 6, 7, 2, 3, 4],
        header["col_rating"]: [2.0, 1.0, 5.0, 3.0, 4.0, 5.0],
    })

    model = SARSingleNode(**header, timedecay_formula=True, normalize=True)
    model.fit(train)

    def check(actual, expected):
        # Shared shape/type/value checks for both score variants.
        assert actual.shape == (2, 7)
        assert isinstance(actual, np.ndarray)
        assert np.isclose(expected, np.asarray(actual)).all()

    # With remove_seen=True, items each user already interacted with
    # score -inf.
    expected_seen_removed = np.array([
        [-np.inf] * 4 + [1.23512374] * 3,
        [-np.inf] + [1.23512374] * 3 + [-np.inf] * 3,
    ])
    check(model.score(test, remove_seen=True), expected_seen_removed)

    # Without removal, seen items get real normalized scores.
    expected_all = np.array([
        [3.11754872] + [4.29408577] * 3 + [1.23512374] * 3,
        [2.5293308] + [1.23511758] * 3 + [3.11767458] * 3,
    ])
    check(model.score(test), expected_all)
Пример #12
0
def test_get_item_based_topk(header, pandas_dummy):
    """Item-based top-k: items only, items+users, and items+users+ratings."""
    sar = SARSingleNode(**header)
    sar.fit(pandas_dummy)

    # Case 1: items only — the expected frame shows a single user id 0.
    items = pd.DataFrame({header["col_item"]: [1, 5, 10]})
    expected = pd.DataFrame(
        dict(UserId=[0, 0, 0], MovieId=[8, 7, 6], prediction=[2.0, 2.0, 2.0]))
    actual = sar.get_item_based_topk(items, top_k=3)
    assert_frame_equal(expected, actual, check_dtype=False)

    # Case 2: items with explicit users, sorted output.
    items = pd.DataFrame({
        header["col_user"]: [100, 100, 1, 100, 1, 1],
        header["col_item"]: [1, 5, 1, 10, 2, 6],
    })
    expected = pd.DataFrame(
        dict(
            UserId=[100, 100, 100, 1, 1, 1],
            MovieId=[8, 7, 6, 4, 3, 10],
            prediction=[2.0, 2.0, 2.0, 2.0, 2.0, 1.0],
        ))
    actual = sar.get_item_based_topk(items, top_k=3, sort_top_k=True)
    assert_frame_equal(expected, actual, check_dtype=False)

    # Case 3: items, users, and ratings; index on (UserId, MovieId) so the
    # comparison is order-insensitive (check_like=True).
    items = pd.DataFrame({
        header["col_user"]: [100, 100, 1, 100, 1, 1],
        header["col_item"]: [1, 5, 1, 10, 2, 6],
        header["col_rating"]: [5, 1, 3, 1, 5, 4],
    })
    expected = pd.DataFrame(
        dict(
            UserId=[100, 100, 100, 1, 1, 1],
            MovieId=[2, 4, 3, 4, 3, 10],
            prediction=[5.0, 5.0, 5.0, 8.0, 8.0, 4.0],
        )).set_index(["UserId", "MovieId"])
    actual = sar.get_item_based_topk(items, top_k=3).set_index(
        ["UserId", "MovieId"])
    assert_frame_equal(expected, actual, check_like=True)
Пример #13
0
def train_sar(params, data):
    """Construct and fit a SAR model on *data*.

    Returns a (model, timer) pair, where the timer covers only fit().
    """
    sar_model = SARSingleNode(**params)
    sar_model.set_index(data)
    with Timer() as fit_timer:
        sar_model.fit(data)
    return sar_model, fit_timer