import numpy as np
import pandas as pd
import pytest

from pysarplus import SARPlus


def test_user_affinity(spark, demo_usage_data, sar_settings, header):
    time_now = demo_usage_data[header["col_timestamp"]].max()
    model = SARPlus(
        spark,
        **header,
        timedecay_formula=True,
        time_decay_coefficient=30,
        time_now=time_now,
        similarity_type="cooccurrence",
    )
    df = spark.createDataFrame(demo_usage_data)
    model.fit(df)

    # reference affinities: melt the wide CSV into (ItemId, Rating) pairs
    user_affinity_ref = pd.read_csv(sar_settings["FILE_DIR"] + "user_aff.csv")
    user_affinity_ref = pd.melt(
        user_affinity_ref,
        user_affinity_ref.columns[0],
        user_affinity_ref.columns[1:],
        "ItemId",
        "Rating",
    )
    user_affinity_ref = user_affinity_ref[user_affinity_ref.Rating > 0].reset_index(
        drop=True
    )

    # construct dataframe with the test user id we'd like to get the affinity for
    df_test = spark.createDataFrame(
        pd.DataFrame({header["col_user"]: [sar_settings["TEST_USER_ID"]]})
    )
    user_affinity = model.get_user_affinity(df_test).toPandas().reset_index(drop=True)

    # verify that the item ids are the same
    assert (user_affinity[header["col_item"]] == user_affinity_ref.ItemId).all()

    assert np.allclose(
        user_affinity_ref[header["col_rating"]].values,
        user_affinity["Rating"].values,
        atol=sar_settings["ATOL"],
    )
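# A minimal sketch of the fixtures these tests rely on; the real values
# presumably live in conftest.py. col_item="MovieId" and col_rating="Rating"
# are implied by the assertions above, but the other names and all values
# in sar_settings below are illustrative assumptions, not the project's
# definitive configuration.
@pytest.fixture(scope="module")
def header():
    # maps SARPlus's generic column roles onto the dataset's schema
    return {
        "col_user": "UserId",  # assumed
        "col_item": "MovieId",
        "col_rating": "Rating",
        "col_timestamp": "Timestamp",  # assumed
    }


@pytest.fixture(scope="module")
def sar_settings():
    return {
        "FILE_DIR": "tests/data/",  # location of the reference CSVs (assumed)
        "TEST_USER_ID": 54,  # user probed by the affinity test (assumed)
        "ATOL": 1e-8,  # absolute tolerance for float comparisons (assumed)
    }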
def test_e2e(spark, pandas_dummy_dataset, header):
    sar = SARPlus(spark, **header)

    df = spark.createDataFrame(pandas_dummy_dataset)
    sar.fit(df)

    test_df = spark.createDataFrame(
        pd.DataFrame({header["col_user"]: [3], header["col_item"]: [2]})
    )

    r1 = (
        sar.recommend_k_items_slow(test_df, top_k=3, remove_seen=False)
        .toPandas()
        .sort_values([header["col_user"], header["col_item"]])
        .reset_index(drop=True)
    )

    r2 = (
        sar.recommend_k_items(
            test_df,
            "tests/test_e2e_cache",
            top_k=3,
            n_user_prediction_partitions=2,
            remove_seen=False,
        )
        .toPandas()
        .sort_values([header["col_user"], header["col_item"]])
        .reset_index(drop=True)
    )

    assert (r1.iloc[:, :2] == r2.iloc[:, :2]).all().all()
    assert np.allclose(r1.score.values, r2.score.values, rtol=1e-3)
def test_userpred(
    spark, threshold, similarity_type, file, header, sar_settings, demo_usage_data
):
    time_now = demo_usage_data[header["col_timestamp"]].max()

    test_id = "{0}_{1}_{2}".format(threshold, similarity_type, file)

    model = SARPlus(
        spark,
        **header,
        table_prefix=test_id,
        timedecay_formula=True,
        time_decay_coefficient=30,
        time_now=time_now,
        threshold=threshold,
        similarity_type=similarity_type,
    )

    df = spark.createDataFrame(demo_usage_data)
    model.fit(df)

    url = (
        sar_settings["FILE_DIR"]
        + "userpred_"
        + file
        + str(threshold)
        + "_userid_only.csv"
    )

    pred_ref = pd.read_csv(url)
    pred_ref = (
        pd.wide_to_long(pred_ref, ["rec", "score"], "user", "idx")
        .sort_values("score", ascending=False)
        .reset_index(drop=True)
    )

    # Note: it's important to use a separate cache_path for each run,
    # as otherwise the runs interfere with each other
    pred = model.recommend_k_items(
        spark.createDataFrame(
            demo_usage_data[
                demo_usage_data[header["col_user"]] == sar_settings["TEST_USER_ID"]
            ]
        ),
        cache_path="test_userpred-" + test_id,
        top_k=10,
        n_user_prediction_partitions=1,
    )

    pred = pred.toPandas().sort_values("score", ascending=False).reset_index(drop=True)

    assert (pred.MovieId.values == pred_ref.rec.values).all()
    assert np.allclose(
        pred.score.values, pred_ref.score.values, atol=sar_settings["ATOL"]
    )
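# test_userpred above and test_sar_item_similarity below receive threshold,
# similarity_type and file via pytest parametrization. A plausible sketch of
# the value matrix follows; the concrete tuples are an assumption, not the
# actual decorators from the test module.
@pytest.mark.parametrize(
    "threshold,similarity_type,file",
    [
        (1, "cooccurrence", "count"),
        (1, "jaccard", "jac"),
        (1, "lift", "lift"),
        (3, "cooccurrence", "count"),
        (3, "jaccard", "jac"),
        (3, "lift", "lift"),
    ],
)
def test_parametrize_sketch(threshold, similarity_type, file):
    # each tuple yields one test case; "file" and "threshold" select the
    # reference CSV, e.g. "sim_count1.csv" for (1, "cooccurrence", "count")
    assert threshold in (1, 3)
    assert similarity_type in ("cooccurrence", "jaccard", "lift")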
def test_sar_item_similarity(
    spark, threshold, similarity_type, file, demo_usage_data, sar_settings, header
):
    model = SARPlus(
        spark,
        **header,
        timedecay_formula=False,
        time_decay_coefficient=30,
        time_now=None,
        threshold=threshold,
        similarity_type=similarity_type,
    )

    df = spark.createDataFrame(demo_usage_data)
    model.fit(df)

    # reference: melt the wide similarity matrix into (i1, i2, value) triples
    item_similarity_ref = pd.read_csv(
        sar_settings["FILE_DIR"] + "sim_" + file + str(threshold) + ".csv"
    )
    item_similarity_ref = pd.melt(
        item_similarity_ref,
        item_similarity_ref.columns[0],
        item_similarity_ref.columns[1:],
        "i2",
        "value",
    )
    item_similarity_ref.columns = ["i1", "i2", "value"]
    item_similarity_ref = (
        item_similarity_ref[item_similarity_ref.value > 0]
        .sort_values(["i1", "i2"])
        .reset_index(drop=True)
    )

    # actual
    item_similarity = (
        model.item_similarity.toPandas()
        .sort_values(["i1", "i2"])
        .reset_index(drop=True)
    )

    if similarity_type == "cooccurrence":
        # co-occurrence similarities are integer counts, so compare exactly
        assert (item_similarity_ref == item_similarity).all().all()
    else:
        assert (
            (item_similarity.iloc[:, :1] == item_similarity_ref.iloc[:, :1]).all().all()
        )

        assert np.allclose(
            item_similarity.value.values,
            item_similarity_ref.value.values,
            atol=sar_settings["ATOL"],
        )
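# How the melt step above works: pd.melt turns the wide reference matrix
# (one row per i1, one column per i2) into (i1, i2, value) triples. A toy
# illustration with made-up item ids and values:
def _melt_illustration():
    wide = pd.DataFrame({"i1": [10, 11], "201": [0.5, 0.0], "202": [0.2, 0.7]})
    long = pd.melt(wide, wide.columns[0], wide.columns[1:], "i2", "value")
    # long now has columns ["i1", "i2", "value"], one row per (i1, i2) pair
    return long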
def test_fit(
    spark, similarity_type, timedecay_formula, train_test_dummy_timestamp, header
):
    model = SARPlus(
        spark,
        **header,
        timedecay_formula=timedecay_formula,
        similarity_type=similarity_type,
    )

    trainset, testset = train_test_dummy_timestamp

    # round-trip the training data through a Hive table to exercise that path
    df = spark.createDataFrame(trainset)
    df.write.mode("overwrite").saveAsTable("trainset")
    df = spark.table("trainset")

    model.fit(df)
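# A minimal sketch of what the train_test_dummy_timestamp fixture plausibly
# provides: a small pandas frame in the header schema, split on timestamp
# into train and test halves. The values are illustrative assumptions; the
# real fixture presumably lives in conftest.py.
@pytest.fixture(scope="module")
def train_test_dummy_timestamp(header):
    df = pd.DataFrame(
        {
            header["col_user"]: [1, 1, 2, 2, 3, 3],
            header["col_item"]: [1, 2, 1, 3, 2, 3],
            header["col_rating"]: [1, 1, 1, 1, 1, 1],
            header["col_timestamp"]: [10, 20, 30, 40, 50, 60],
        }
    )
    is_train = df[header["col_timestamp"]] <= 40
    return df[is_train], df[~is_train]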
from pysarplus import SARPlus, SARModel

# spark dataframe with user/item/rating (and optional timestamp) tuples
train_df = spark.createDataFrame(
    [(1, 1, 1), (1, 2, 1), (2, 1, 1), (3, 1, 1), (3, 3, 1)],
    ["user_id", "item_id", "rating"],
)

# spark dataframe with user/item/rating tuples for the users to score
test_df = spark.createDataFrame(
    [(1, 1, 1), (3, 3, 1)],
    ["user_id", "item_id", "rating"],
)

model = SARPlus(
    spark,
    col_user="user_id",
    col_item="item_id",
    col_rating="rating",
    similarity_type="jaccard",
)
model.fit(train_df)

model.recommend_k_items(test_df, "sarplus_cache", top_k=3).show()
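# A pure-Spark scoring path (recommend_k_items_slow) also exists and is used
# in the tests above as a cross-check against the cache-backed fast path; it
# needs no cache directory. The parameters shown mirror those exercised in
# test_e2e.
model.recommend_k_items_slow(test_df, top_k=3, remove_seen=False).show()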