def test_fit(similarity_type, timedecay_formula, train_test_dummy_timestamp, header):
    """Smoke test: fitting SAR on dummy timestamped data completes without error."""
    sar = SARSingleNode(
        similarity_type=similarity_type,
        timedecay_formula=timedecay_formula,
        **header
    )
    train, _ = train_test_dummy_timestamp
    sar.fit(train)
def test_user_affinity(demo_usage_data, sar_settings, header):
    """Fitted user-affinity row for the test user matches the reference CSV."""
    latest_ts = demo_usage_data[header["col_timestamp"]].max()
    sar = SARSingleNode(
        similarity_type="cooccurrence",
        timedecay_formula=True,
        time_decay_coefficient=30,
        time_now=latest_ts,
        **header
    )
    sar.fit(demo_usage_data)

    true_user_affinity, items = load_affinity(sar_settings["FILE_DIR"] + "user_aff.csv")
    row = sar.user2index[sar_settings["TEST_USER_ID"]]
    # Re-order the model's affinity columns to match the reference item order,
    # then densify the test user's row and flatten it to a 1-D vector.
    rearranged = _rearrange_to_test(
        sar.user_affinity, None, items, None, sar.item2index
    )
    sar_user_affinity = np.asarray(rearranged[row].todense()).reshape(-1)

    assert np.allclose(
        true_user_affinity.astype(sar_user_affinity.dtype),
        sar_user_affinity,
        atol=sar_settings["ATOL"],
    )
def test_recommend_k_items(
    threshold, similarity_type, file, header, sar_settings, demo_usage_data
):
    """Top-10 recommendations for the test user match the reference predictions."""
    latest_ts = demo_usage_data[header["col_timestamp"]].max()
    sar = SARSingleNode(
        similarity_type=similarity_type,
        timedecay_formula=True,
        time_decay_coefficient=30,
        time_now=latest_ts,
        threshold=threshold,
        **header
    )
    sar.fit(demo_usage_data)

    reference_path = (
        sar_settings["FILE_DIR"]
        + "userpred_"
        + file
        + str(threshold)
        + "_userid_only.csv"
    )
    true_items, true_scores = load_userpred(reference_path)

    # Score only the rows belonging to the designated test user.
    user_rows = demo_usage_data[
        demo_usage_data[header["col_user"]] == sar_settings["TEST_USER_ID"]
    ]
    recs = sar.recommend_k_items(user_rows, top_k=10, sort_top_k=True, remove_seen=True)

    assert true_items == list(recs[header["col_item"]])
    assert np.allclose(
        true_scores, np.array(recs["prediction"]), atol=sar_settings["ATOL"]
    )
def test_get_normalized_scores(header):
    """Normalized scoring: seen items are masked to -inf under remove_seen,
    otherwise scores land back on the original rating scale."""
    train = pd.DataFrame(
        {
            header["col_user"]: [1, 1, 1, 1, 2, 2, 2, 2],
            header["col_item"]: [1, 2, 3, 4, 1, 5, 6, 7],
            header["col_rating"]: [3.0, 4.0, 5.0, 4.0, 3.0, 2.0, 1.0, 5.0],
            header["col_timestamp"]: [1, 20, 30, 400, 50, 60, 70, 800],
        }
    )
    test = pd.DataFrame(
        {
            header["col_user"]: [1, 1, 1, 2, 2, 2],
            header["col_item"]: [5, 6, 7, 2, 3, 4],
            header["col_rating"]: [2.0, 1.0, 5.0, 3.0, 4.0, 5.0],
        }
    )
    sar = SARSingleNode(**header, timedecay_formula=True, normalize=True)
    sar.fit(train)

    neg = -np.inf
    # With remove_seen the training items of each user are masked out.
    actual = sar.score(test, remove_seen=True, normalize=True)
    expected = np.array(
        [
            [neg, neg, neg, neg, 3.0, 3.0, 3.0],
            [neg, 3.0, 3.0, 3.0, neg, neg, neg],
        ]
    )
    assert actual.shape == (2, 7)
    assert isinstance(actual, np.ndarray)
    assert np.isclose(expected, actual).all()

    # Without remove_seen, all items receive a normalized score.
    actual = sar.score(test, normalize=True)
    expected = np.array(
        [
            [3.80000633, 4.14285448, 4.14285448, 4.14285448, 3.0, 3.0, 3.0],
            [2.8000859, 3.0, 3.0, 3.0, 2.71441353, 2.71441353, 2.71441353],
        ]
    )
    assert actual.shape == (2, 7)
    assert isinstance(actual, np.ndarray)
    assert np.isclose(expected, actual).all()
def test_get_normalized_scores(header):
    """Scoring with a normalize-fitted model: raw scores with and without
    the seen-item mask match the precomputed reference values."""
    train = pd.DataFrame(
        {
            header["col_user"]: [1, 1, 1, 1, 2, 2, 2, 2],
            header["col_item"]: [1, 2, 3, 4, 1, 5, 6, 7],
            header["col_rating"]: [3.0, 4.0, 5.0, 4.0, 3.0, 2.0, 1.0, 5.0],
            header["col_timestamp"]: [1, 20, 30, 400, 50, 60, 70, 800],
        }
    )
    test = pd.DataFrame(
        {
            header["col_user"]: [1, 1, 1, 2, 2, 2],
            header["col_item"]: [5, 6, 7, 2, 3, 4],
            header["col_rating"]: [2.0, 1.0, 5.0, 3.0, 4.0, 5.0],
        }
    )
    sar = SARSingleNode(**header, timedecay_formula=True, normalize=True)
    sar.fit(train)

    neg = -np.inf
    # Seen training items are masked to -inf when remove_seen is requested.
    actual = sar.score(test, remove_seen=True)
    expected = np.array(
        [
            [neg, neg, neg, neg, 1.23512374, 1.23512374, 1.23512374],
            [neg, 1.23512374, 1.23512374, 1.23512374, neg, neg, neg],
        ]
    )
    assert actual.shape == (2, 7)
    assert isinstance(actual, np.ndarray)
    assert np.isclose(expected, np.asarray(actual)).all()

    # Without the mask every item is scored.
    actual = sar.score(test)
    expected = np.array(
        [
            [
                3.11754872,
                4.29408577,
                4.29408577,
                4.29408577,
                1.23512374,
                1.23512374,
                1.23512374,
            ],
            [
                2.5293308,
                1.23511758,
                1.23511758,
                1.23511758,
                3.11767458,
                3.11767458,
                3.11767458,
            ],
        ]
    )
    assert actual.shape == (2, 7)
    assert isinstance(actual, np.ndarray)
    assert np.isclose(expected, np.asarray(actual)).all()
def test_fit(similarity_type, timedecay_formula, train_test_dummy_timestamp, header):
    """Smoke test: SAR with remove_seen fits after hash-index mapping is applied."""
    sar = SARSingleNode(
        remove_seen=True,
        similarity_type=similarity_type,
        timedecay_formula=timedecay_formula,
        **header
    )
    train, test = train_test_dummy_timestamp
    _apply_sar_hash_index(sar, train, test, header)
    sar.fit(train)
def test_get_popularity_based_topk(header):
    """Top-3 popular items are those with the most interactions, in sorted order."""
    interactions = {
        header["col_user"]: [1, 1, 1, 2, 2, 2, 3, 3, 3],
        header["col_item"]: [1, 2, 3, 1, 3, 4, 5, 6, 1],
        header["col_rating"]: [1, 2, 3, 1, 2, 3, 1, 2, 3],
    }
    sar = SARSingleNode(**header)
    sar.fit(pd.DataFrame(interactions))

    # Item 1 appears 3 times, item 3 twice, item 4 once (ties broken by id).
    expected = pd.DataFrame({"MovieId": [1, 3, 4], "prediction": [3, 2, 1]})
    actual = sar.get_popularity_based_topk(top_k=3, sort_top_k=True)
    assert_frame_equal(expected, actual)
def test_predict(
    similarity_type, timedecay_formula, train_test_dummy_timestamp, header
):
    """predict() returns one row per test pair and preserves input column dtypes."""
    sar = SARSingleNode(
        similarity_type=similarity_type,
        timedecay_formula=timedecay_formula,
        **header
    )
    train, test = train_test_dummy_timestamp
    sar.fit(train)
    preds = sar.predict(test)

    assert len(preds) == 2
    assert isinstance(preds, pd.DataFrame)
    for col in (header["col_user"], header["col_item"]):
        assert preds[col].dtype == train[col].dtype
    assert preds[DEFAULT_PREDICTION_COL].dtype == train[header["col_rating"]].dtype
def test_predict_all_items(train_test_dummy_timestamp, header):
    """Scoring the full user x item cross product yields one prediction per pair
    with dtypes matching the training data."""
    sar = SARSingleNode(**header)
    train, _ = train_test_dummy_timestamp
    sar.fit(train)

    users = train[header["col_user"]].unique()
    items = train[header["col_item"]].unique()
    all_pairs = pd.DataFrame(
        itertools.product(users, items),
        columns=[header["col_user"], header["col_item"]],
    )
    preds = sar.predict(all_pairs)

    assert len(preds) == len(all_pairs)
    assert isinstance(preds, pd.DataFrame)
    for col in (header["col_user"], header["col_item"]):
        assert preds[col].dtype == train[col].dtype
    assert preds[DEFAULT_PREDICTION_COL].dtype == train[header["col_rating"]].dtype
def test_get_item_based_topk(header, pandas_dummy):
    """Item-based top-k with (1) items only, (2) items + users, and
    (3) items + users + ratings."""
    sar = SARSingleNode(**header)
    sar.fit(pandas_dummy)

    # Case 1: only items provided; recommendations come back for user 0.
    seed = pd.DataFrame({header["col_item"]: [1, 5, 10]})
    expected = pd.DataFrame(
        {"UserId": [0, 0, 0], "MovieId": [8, 7, 6], "prediction": [2.0, 2.0, 2.0]}
    )
    actual = sar.get_item_based_topk(seed, top_k=3)
    assert_frame_equal(expected, actual, check_dtype=False)

    # Case 2: items together with user ids.
    seed = pd.DataFrame(
        {
            header["col_user"]: [100, 100, 1, 100, 1, 1],
            header["col_item"]: [1, 5, 1, 10, 2, 6],
        }
    )
    expected = pd.DataFrame(
        {
            "UserId": [100, 100, 100, 1, 1, 1],
            "MovieId": [8, 7, 6, 4, 3, 10],
            "prediction": [2.0, 2.0, 2.0, 2.0, 2.0, 1.0],
        }
    )
    actual = sar.get_item_based_topk(seed, top_k=3, sort_top_k=True)
    assert_frame_equal(expected, actual, check_dtype=False)

    # Case 3: items, users, and ratings. Compare on a (user, item) index so
    # row order among tied scores does not affect the comparison.
    seed = pd.DataFrame(
        {
            header["col_user"]: [100, 100, 1, 100, 1, 1],
            header["col_item"]: [1, 5, 1, 10, 2, 6],
            header["col_rating"]: [5, 1, 3, 1, 5, 4],
        }
    )
    expected = pd.DataFrame(
        {
            "UserId": [100, 100, 100, 1, 1, 1],
            "MovieId": [2, 4, 3, 4, 3, 10],
            "prediction": [5.0, 5.0, 5.0, 8.0, 8.0, 4.0],
        }
    ).set_index(["UserId", "MovieId"])
    actual = sar.get_item_based_topk(seed, top_k=3).set_index(["UserId", "MovieId"])
    assert_frame_equal(expected, actual, check_like=True)
def test_predict(similarity_type, timedecay_formula, train_test_dummy_timestamp, header):
    """With hashed string indices, predicted ids come back as object dtype and
    the prediction column as float."""
    sar = SARSingleNode(
        remove_seen=True,
        similarity_type=similarity_type,
        timedecay_formula=timedecay_formula,
        **header
    )
    train, test = train_test_dummy_timestamp
    _apply_sar_hash_index(sar, train, test, header)
    sar.fit(train)
    preds = sar.predict(test)

    assert len(preds) == 2
    assert isinstance(preds, pd.DataFrame)
    assert preds[header["col_user"]].dtype == object
    assert preds[header["col_item"]].dtype == object
    assert preds[PREDICTION_COL].dtype == float
def test_sar_item_similarity(threshold, similarity_type, file, demo_usage_data, sar_settings, header):
    """Compare the fitted item-similarity matrix against a reference CSV.

    Cooccurrence counts are integers and must match exactly; other similarity
    types (jaccard, lift, ...) are compared within an absolute tolerance.
    """
    model = SARSingleNode(
        remove_seen=True,
        similarity_type=similarity_type,
        timedecay_formula=False,
        time_decay_coefficient=30,
        time_now=TIME_NOW,
        threshold=threshold,
        **header
    )
    _apply_sar_hash_index(model, demo_usage_data, None, header)
    model.fit(demo_usage_data)
    true_item_similarity, row_ids, col_ids = read_matrix(
        sar_settings["FILE_DIR"] + "sim_" + file + str(threshold) + ".csv"
    )
    # BUG FIX: the original used `similarity_type is "cooccurrence"`, an
    # identity comparison with a string literal that only works when CPython
    # happens to intern both strings (and raises SyntaxWarning on 3.8+).
    # Equality is the correct, portable comparison.
    if similarity_type == "cooccurrence":
        test_item_similarity = _rearrange_to_test(
            model.item_similarity.todense(),
            row_ids,
            col_ids,
            model.item_map_dict,
            model.item_map_dict,
        )
        # Exact equality: cooccurrence entries are integer counts.
        assert np.array_equal(
            true_item_similarity.astype(test_item_similarity.dtype),
            test_item_similarity,
        )
    else:
        test_item_similarity = _rearrange_to_test(
            np.array(model.item_similarity),
            row_ids,
            col_ids,
            model.item_map_dict,
            model.item_map_dict,
        )
        # Float similarities: allow a small absolute tolerance.
        assert np.allclose(
            true_item_similarity.astype(test_item_similarity.dtype),
            test_item_similarity,
            atol=sar_settings["ATOL"],
        )
def SARtrain():
    """Train a jaccard SAR model on the snacks dataset, persisting both the
    held-out test split ('testdata') and the fitted model ('SARDump')."""
    data = pd.read_csv("SnacksData100.csv")
    # SAR expects float ratings.
    data.loc[:, 'Ratings'] = data['Ratings'].astype(np.float32)

    header = {
        "col_user": "******",
        "col_item": "Product_Id",
        "col_rating": "Ratings",
        "col_timestamp": "timestamp",
    }

    # Stratified 75/25 split per user, seeded for reproducibility.
    train, test = python_stratified_split(
        data,
        ratio=0.75,
        col_user=header["col_user"],
        col_item=header["col_item"],
        seed=42,
    )
    joblib.dump(test, 'testdata')

    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(levelname)-8s %(message)s',
    )

    sar = SARSingleNode(
        similarity_type="jaccard",
        time_decay_coefficient=30,
        time_now=None,
        timedecay_formula=True,
        **header
    )
    sar.fit(train)
    joblib.dump(sar, 'SARDump')
def train_sar(params, data):
    """Build a SAR model from *params*, index *data*, and time the fit.

    Returns the fitted model together with the Timer that measured fit().
    """
    sar = SARSingleNode(**params)
    sar.set_index(data)
    with Timer() as elapsed:
        sar.fit(data)
    return sar, elapsed
# Column-name mapping for the ratings dataframe; unpacked into SAR via **header.
header = {
    "col_user": "******",
    "col_item": "MovieId",
    "col_rating": "Rating",
    "col_timestamp": "Timestamp",
}

# remove_seen excludes items a user already interacted with from recommendations;
# jaccard similarity with a 30-period time decay.
model = SARSingleNode(
    remove_seen=True,
    similarity_type="jaccard",
    time_decay_coefficient=30,
    time_now=None,
    timedecay_formula=True,
    **header
)

# Time the training phase.
start_time = time.time()
model.fit(train)
train_time = time.time() - start_time

# Time the recommendation (scoring) phase.
start_time = time.time()
topk = model.recommend_k_items(test)
test_time = time.time() - start_time

# TODO: remove this call when the model returns same type as input
topk['UserId'] = pd.to_numeric(topk['UserId'])
topk['MovieId'] = pd.to_numeric(topk['MovieId'])

mlask(begin="\n", end="\n")
mlcat(
    "Fit the SAR Model",
    """\
}

# Verbose logging so SAR's internal progress shows in the run output.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)-8s %(message)s')

# remove_seen excludes already-seen items; jaccard similarity with time decay.
model = SARSingleNode(
    remove_seen=True,
    similarity_type="jaccard",
    time_decay_coefficient=30,
    time_now=None,
    timedecay_formula=True,
    **header
)

# train the SAR model
start_time = time.time()
model.fit(train)
train_time = time.time() - start_time
run.log(name="Training time", value=train_time)

# Score the test set and log the prediction latency as a run metric.
start_time = time.time()
top_k = model.recommend_k_items(test)
test_time = time.time() - start_time
run.log(name="Prediction time", value=test_time)

# TODO: remove this call when the model returns same type as input
top_k['UserId'] = pd.to_numeric(top_k['UserId'])
top_k['MovieId'] = pd.to_numeric(top_k['MovieId'])