def test_compute_ranking_predictions(rating_true):
    n_users = len(rating_true["userID"].unique())
    n_items = len(rating_true["itemID"].unique())
    svd = surprise.SVD()
    train_set = surprise.Dataset.load_from_df(
        rating_true, reader=surprise.Reader()).build_full_trainset()
    svd.fit(train_set)

    preds = compute_ranking_predictions(svd, rating_true, remove_seen=True)
    assert set(preds.columns) == {"userID", "itemID", "prediction"}
    assert preds["userID"].dtypes == rating_true["userID"].dtypes
    assert preds["itemID"].dtypes == rating_true["itemID"].dtypes
    user = preds.iloc[0]["userID"]
    item = preds.iloc[0]["itemID"]
    assert preds[(preds["userID"] == user) & (
        preds["itemID"] == item)]["prediction"].values == pytest.approx(
            svd.predict(user, item).est, rel=TOL)

    # Test default remove_seen=True
    assert pd.merge(rating_true, preds, on=["userID", "itemID"]).shape[0] == 0
    assert preds.shape[0] == (n_users * n_items - rating_true.shape[0])

    preds = compute_ranking_predictions(
        svd,
        rating_true.rename(columns={
            "userID": "uid",
            "itemID": "iid",
            "rating": "r"
        }),
        usercol="uid",
        itemcol="iid",
        predcol="pred",
        remove_seen=False,
    )
    assert set(preds.columns) == {"uid", "iid", "pred"}
    assert preds["uid"].dtypes == rating_true["userID"].dtypes
    assert preds["iid"].dtypes == rating_true["itemID"].dtypes
    user = preds.iloc[1]["uid"]
    item = preds.iloc[1]["iid"]
    assert preds[(preds["uid"] == user) &
                 (preds["iid"] == item)]["pred"].values == pytest.approx(
                     svd.predict(user, item).est, rel=TOL)

    # Test remove_seen=False
    assert (pd.merge(rating_true, preds,
                     left_on=["userID", "itemID"],
                     right_on=["uid", "iid"]).shape[0] == rating_true.shape[0])
    assert preds.shape[0] == n_users * n_items
def svd_training(params):
    """
    Train Surprise SVD using the given hyper-parameters
    """
    logger.debug("Start training...")
    train_data = pd.read_pickle(path=os.path.join(params['datastore'], params['train_datapath']))
    validation_data = pd.read_pickle(path=os.path.join(params['datastore'], params['validation_datapath']))

    svd_params = {p: params[p] for p in ['random_state', 'n_epochs', 'verbose', 'biased', 'n_factors',
                                         'init_mean', 'init_std_dev', 'lr_all', 'reg_all', 'lr_bu', 'lr_bi',
                                         'lr_pu', 'lr_qi', 'reg_bu', 'reg_bi', 'reg_pu', 'reg_qi']}
    svd = surprise.SVD(**svd_params)

    train_set = surprise.Dataset.load_from_df(train_data, reader=surprise.Reader(params['surprise_reader'])) \
        .build_full_trainset()
    svd.fit(train_set)

    logger.debug("Evaluating...")

    metrics_dict = {}
    rating_metrics = params['rating_metrics']
    if len(rating_metrics) > 0:
        predictions = compute_rating_predictions(svd, validation_data,
                                                 usercol=params['usercol'],
                                                 itemcol=params['itemcol'])
        for metric in rating_metrics:
            result = getattr(evaluation, metric)(validation_data, predictions)
            logger.debug("%s = %g", metric, result)
            if metric == params['primary_metric']:
                metrics_dict['default'] = result
            else:
                metrics_dict[metric] = result

    ranking_metrics = params['ranking_metrics']
    if len(ranking_metrics) > 0:
        all_predictions = compute_ranking_predictions(svd, train_data,
                                                      usercol=params['usercol'],
                                                      itemcol=params['itemcol'],
                                                      recommend_seen=params['recommend_seen'])
        k = params['k']
        for metric in ranking_metrics:
            result = getattr(evaluation, metric)(validation_data, all_predictions,
                                                 col_prediction='prediction', k=k)
            logger.debug("%s@%d = %g", metric, k, result)
            if metric == params['primary_metric']:
                metrics_dict['default'] = result
            else:
                metrics_dict[metric] = result

    if len(ranking_metrics) == 0 and len(rating_metrics) == 0:
        raise ValueError("No metrics were specified.")

    # Report the metrics
    nni.report_final_result(metrics_dict)

    # Save the metrics in a JSON file
    output_dir = os.environ.get('NNI_OUTPUT_DIR')
    with open(os.path.join(output_dir, 'metrics.json'), 'w') as fp:
        temp_dict = metrics_dict.copy()
        temp_dict[params['primary_metric']] = temp_dict.pop('default')
        json.dump(temp_dict, fp)

    return svd
def test_compute_ranking_predictions(rating_true):
    n_users = len(rating_true["userID"].unique())
    n_items = len(rating_true["itemID"].unique())
    svd = surprise.SVD()
    train_set = surprise.Dataset.load_from_df(
        rating_true, reader=surprise.Reader()
    ).build_full_trainset()
    svd.fit(train_set)

    preds = compute_ranking_predictions(svd, rating_true)
    assert set(preds.columns) == {"userID", "itemID", "prediction"}
    assert preds["userID"].dtypes == rating_true["userID"].dtypes
    assert preds["itemID"].dtypes == rating_true["itemID"].dtypes
    user = preds.iloc[0]["userID"]
    item = preds.iloc[0]["itemID"]
    assert preds[(preds["userID"] == user) & (preds["itemID"] == item)][
        "prediction"
    ].values == pytest.approx(svd.predict(user, item).est, rel=TOL)

    # Test default recommend_seen=False
    assert pd.merge(rating_true, preds, on=["userID", "itemID"]).shape[0] == 0
    assert preds.shape[0] == (n_users * n_items - rating_true.shape[0])

    preds = compute_ranking_predictions(
        svd,
        rating_true.rename(columns={"userID": "uid", "itemID": "iid", "rating": "r"}),
        usercol="uid",
        itemcol="iid",
        predcol="pred",
        recommend_seen=True,
    )
    assert set(preds.columns) == {"uid", "iid", "pred"}
    assert preds["uid"].dtypes == rating_true["userID"].dtypes
    assert preds["iid"].dtypes == rating_true["itemID"].dtypes
    user = preds.iloc[1]["uid"]
    item = preds.iloc[1]["iid"]
    assert preds[(preds["uid"] == user) & (preds["iid"] == item)][
        "pred"
    ].values == pytest.approx(svd.predict(user, item).est, rel=TOL)

    # Test recommend_seen=True
    assert (
        pd.merge(
            rating_true, preds, left_on=["userID", "itemID"], right_on=["uid", "iid"]
        ).shape[0]
        == rating_true.shape[0]
    )
    assert preds.shape[0] == n_users * n_items
def recommend_k_svd(model, test, train):
    with Timer() as t:
        topk_scores = compute_ranking_predictions(model, train,
                                                  usercol=DEFAULT_USER_COL,
                                                  itemcol=DEFAULT_ITEM_COL,
                                                  predcol=DEFAULT_PREDICTION_COL,
                                                  recommend_seen=False)
    return topk_scores, t
def test_compute_ranking_predictions(python_data):
    rating_true, _, _ = python_data(binary_rating=False)
    n_users = len(rating_true['userID'].unique())
    n_items = len(rating_true['itemID'].unique())
    svd = surprise.SVD()
    train_set = surprise.Dataset.load_from_df(
        rating_true, reader=surprise.Reader()).build_full_trainset()
    svd.fit(train_set)

    preds = compute_ranking_predictions(svd, rating_true)
    assert set(preds.columns) == {'userID', 'itemID', 'prediction'}
    assert preds['userID'].dtypes == rating_true['userID'].dtypes
    assert preds['itemID'].dtypes == rating_true['itemID'].dtypes
    user = preds.iloc[0]['userID']
    item = preds.iloc[0]['itemID']
    assert preds[(preds['userID'] == user) & (preds['itemID'] == item)]['prediction'].values == \
        pytest.approx(svd.predict(user, item).est, rel=TOL)

    # Test default recommend_seen=False
    assert pd.merge(rating_true, preds, on=['userID', 'itemID']).shape[0] == 0
    assert preds.shape[0] == (n_users * n_items - rating_true.shape[0])

    preds = compute_ranking_predictions(
        svd,
        rating_true.rename(columns={
            'userID': 'uid',
            'itemID': 'iid',
            'rating': 'r'
        }),
        usercol='uid',
        itemcol='iid',
        predcol='pred',
        recommend_seen=True)
    assert set(preds.columns) == {'uid', 'iid', 'pred'}
    assert preds['uid'].dtypes == rating_true['userID'].dtypes
    assert preds['iid'].dtypes == rating_true['itemID'].dtypes
    user = preds.iloc[1]['uid']
    item = preds.iloc[1]['iid']
    assert preds[(preds['uid'] == user) & (preds['iid'] == item)]['pred'].values == \
        pytest.approx(svd.predict(user, item).est, rel=TOL)

    # Test recommend_seen=True
    assert pd.merge(rating_true, preds,
                    left_on=['userID', 'itemID'],
                    right_on=['uid', 'iid']).shape[0] == rating_true.shape[0]
    assert preds.shape[0] == n_users * n_items
def recommend_k_svd(model, test, train):
    with Timer() as t:
        topk_scores = compute_ranking_predictions(
            model,
            train,
            usercol=DEFAULT_USER_COL,
            itemcol=DEFAULT_ITEM_COL,
            predcol=DEFAULT_PREDICTION_COL,
            recommend_seen=False)
    return topk_scores, t
def compute_test_results(svd):
    test_results = {}

    predictions = predict(svd, test, usercol="userID", itemcol="itemID")
    for metric in RATING_METRICS:
        test_results[metric] = eval(metric)(test, predictions)

    all_predictions = compute_ranking_predictions(svd, train,
                                                  usercol="userID",
                                                  itemcol="itemID",
                                                  remove_seen=REMOVE_SEEN)
    for metric in RANKING_METRICS:
        test_results[metric] = eval(metric)(test, all_predictions, col_prediction='prediction', k=K)

    return test_results
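# Hedged sketch, not from the source: compute_test_results above relies on several
# module-level names -- the train/test DataFrames from an earlier split (with
# userID/itemID/rating columns) plus the metric configuration below. The concrete
# values here are illustrative assumptions only; each metric name must resolve via
# eval() to a callable already in scope, such as the evaluation helpers shipped with
# the recommenders package (import path assumed; older releases exposed them under
# reco_utils.evaluation.python_evaluation).
from recommenders.evaluation.python_evaluation import rmse, mae, map_at_k, ndcg_at_k

RATING_METRICS = ["rmse", "mae"]              # assumed names of rating-metric callables in scope
RANKING_METRICS = ["map_at_k", "ndcg_at_k"]   # assumed names of ranking-metric callables in scope
REMOVE_SEEN = True                            # drop (user, item) pairs already seen in training
K = 10                                        # cutoff used by the ranking metrics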
def svd_training(args):
    """
    Train Surprise SVD using the given hyper-parameters
    """
    print("Start training...")
    train_data = pd.read_pickle(path=os.path.join(args.datastore, args.train_datapath))
    validation_data = pd.read_pickle(path=os.path.join(args.datastore, args.validation_datapath))

    svd = surprise.SVD(random_state=args.random_state, n_epochs=args.epochs, verbose=args.verbose,
                       biased=args.biased, n_factors=args.n_factors, init_mean=args.init_mean,
                       init_std_dev=args.init_std_dev, lr_all=args.lr_all, reg_all=args.reg_all,
                       lr_bu=args.lr_bu, lr_bi=args.lr_bi, lr_pu=args.lr_pu, lr_qi=args.lr_qi,
                       reg_bu=args.reg_bu, reg_bi=args.reg_bi, reg_pu=args.reg_pu, reg_qi=args.reg_qi)

    train_set = surprise.Dataset.load_from_df(train_data, reader=surprise.Reader(args.surprise_reader)) \
        .build_full_trainset()
    svd.fit(train_set)

    print("Evaluating...")

    rating_metrics = args.rating_metrics
    if len(rating_metrics) > 0:
        predictions = compute_rating_predictions(svd, validation_data,
                                                 usercol=args.usercol, itemcol=args.itemcol)
        for metric in rating_metrics:
            result = eval(metric)(validation_data, predictions)
            print(metric, result)
            if HAS_AML:
                run.log(metric, result)

    ranking_metrics = args.ranking_metrics
    if len(ranking_metrics) > 0:
        all_predictions = compute_ranking_predictions(svd, train_data,
                                                      usercol=args.usercol, itemcol=args.itemcol,
                                                      remove_seen=args.remove_seen)
        k = args.k
        for metric in ranking_metrics:
            result = eval(metric)(validation_data, all_predictions, col_prediction='prediction', k=k)
            print("{}@{}".format(metric, k), result)
            if HAS_AML:
                run.log(metric, result)

    if len(ranking_metrics) == 0 and len(rating_metrics) == 0:
        raise ValueError("No metrics were specified.")

    return svd
def svd_training(args):
    """
    Train Surprise SVD using the given hyper-parameters
    """
    print("Start training...")
    train_data = pd.read_pickle(path=os.path.join(args.datastore, args.train_datapath))
    validation_data = pd.read_pickle(path=os.path.join(args.datastore, args.validation_datapath))

    svd = surprise.SVD(random_state=args.random_state, n_epochs=args.epochs, verbose=args.verbose,
                       biased=args.biased, n_factors=args.n_factors, init_mean=args.init_mean,
                       init_std_dev=args.init_std_dev, lr_all=args.lr_all, reg_all=args.reg_all,
                       lr_bu=args.lr_bu, lr_bi=args.lr_bi, lr_pu=args.lr_pu, lr_qi=args.lr_qi,
                       reg_bu=args.reg_bu, reg_bi=args.reg_bi, reg_pu=args.reg_pu, reg_qi=args.reg_qi)

    train_set = surprise.Dataset.load_from_df(train_data, reader=surprise.Reader(args.surprise_reader)) \
        .build_full_trainset()
    svd.fit(train_set)

    print("Evaluating...")

    rating_metrics = args.rating_metrics
    if len(rating_metrics) > 0:
        predictions = compute_rating_predictions(svd, validation_data,
                                                 usercol=args.usercol, itemcol=args.itemcol)
        for metric in rating_metrics:
            result = eval(metric)(validation_data, predictions)
            print(metric, result)
            if HAS_AML:
                run.log(metric, result)

    ranking_metrics = args.ranking_metrics
    if len(ranking_metrics) > 0:
        all_predictions = compute_ranking_predictions(svd, train_data,
                                                      usercol=args.usercol, itemcol=args.itemcol,
                                                      recommend_seen=args.recommend_seen)
        k = args.k
        for metric in ranking_metrics:
            result = eval(metric)(validation_data, all_predictions, col_prediction='prediction', k=k)
            print("{}@{}".format(metric, k), result)
            if HAS_AML:
                run.log(metric, result)

    if len(ranking_metrics) == 0 and len(rating_metrics) == 0:
        raise ValueError("No metrics were specified.")

    return svd
def svd_training(params):
    """
    Train Surprise SVD using the given hyper-parameters
    """
    logger.debug("Start training...")
    train_data = pd.read_pickle(
        path=os.path.join(params["datastore"], params["train_datapath"]))
    validation_data = pd.read_pickle(
        path=os.path.join(params["datastore"], params["validation_datapath"]))

    svd_params = {
        p: params[p]
        for p in [
            "random_state",
            "n_epochs",
            "verbose",
            "biased",
            "n_factors",
            "init_mean",
            "init_std_dev",
            "lr_all",
            "reg_all",
            "lr_bu",
            "lr_bi",
            "lr_pu",
            "lr_qi",
            "reg_bu",
            "reg_bi",
            "reg_pu",
            "reg_qi",
        ]
    }
    svd = surprise.SVD(**svd_params)

    train_set = surprise.Dataset.load_from_df(
        train_data, reader=surprise.Reader(
            params["surprise_reader"])).build_full_trainset()
    svd.fit(train_set)

    logger.debug("Evaluating...")

    metrics_dict = {}
    rating_metrics = params["rating_metrics"]
    if len(rating_metrics) > 0:
        predictions = predict(svd, validation_data,
                              usercol=params["usercol"],
                              itemcol=params["itemcol"])
        for metric in rating_metrics:
            result = getattr(evaluation, metric)(validation_data, predictions)
            logger.debug("%s = %g", metric, result)
            if metric == params["primary_metric"]:
                metrics_dict["default"] = result
            else:
                metrics_dict[metric] = result

    ranking_metrics = params["ranking_metrics"]
    if len(ranking_metrics) > 0:
        all_predictions = compute_ranking_predictions(
            svd,
            train_data,
            usercol=params["usercol"],
            itemcol=params["itemcol"],
            remove_seen=params["remove_seen"],
        )
        k = params["k"]
        for metric in ranking_metrics:
            result = getattr(evaluation, metric)(validation_data,
                                                 all_predictions,
                                                 col_prediction="prediction",
                                                 k=k)
            logger.debug("%s@%d = %g", metric, k, result)
            if metric == params["primary_metric"]:
                metrics_dict["default"] = result
            else:
                metrics_dict[metric] = result

    if len(ranking_metrics) == 0 and len(rating_metrics) == 0:
        raise ValueError("No metrics were specified.")

    # Report the metrics
    nni.report_final_result(metrics_dict)

    # Save the metrics in a JSON file
    output_dir = os.environ.get("NNI_OUTPUT_DIR")
    with open(os.path.join(output_dir, "metrics.json"), "w") as fp:
        temp_dict = metrics_dict.copy()
        temp_dict[params["primary_metric"]] = temp_dict.pop("default")
        json.dump(temp_dict, fp)

    return svd
def svd_training(params):
    """
    Train Surprise SVD using the given hyper-parameters
    """
    logger.debug("Start training...")
    train_data = pd.read_pickle(
        path=os.path.join(params['datastore'], params['train_datapath']))
    validation_data = pd.read_pickle(
        path=os.path.join(params['datastore'], params['validation_datapath']))

    svd_params = {
        p: params[p]
        for p in [
            'random_state', 'n_epochs', 'verbose', 'biased', 'n_factors',
            'init_mean', 'init_std_dev', 'lr_all', 'reg_all', 'lr_bu',
            'lr_bi', 'lr_pu', 'lr_qi', 'reg_bu', 'reg_bi', 'reg_pu', 'reg_qi'
        ]
    }
    svd = surprise.SVD(**svd_params)

    train_set = surprise.Dataset.load_from_df(train_data, reader=surprise.Reader(params['surprise_reader'])) \
        .build_full_trainset()
    svd.fit(train_set)

    logger.debug("Evaluating...")

    metrics_dict = {}
    rating_metrics = params['rating_metrics']
    if len(rating_metrics) > 0:
        predictions = compute_rating_predictions(svd, validation_data,
                                                 usercol=params['usercol'],
                                                 itemcol=params['itemcol'])
        for metric in rating_metrics:
            result = getattr(evaluation, metric)(validation_data, predictions)
            logger.debug("%s = %g", metric, result)
            if metric == params['primary_metric']:
                metrics_dict['default'] = result
            else:
                metrics_dict[metric] = result

    ranking_metrics = params['ranking_metrics']
    if len(ranking_metrics) > 0:
        all_predictions = compute_ranking_predictions(
            svd,
            train_data,
            usercol=params['usercol'],
            itemcol=params['itemcol'],
            recommend_seen=params['recommend_seen'])
        k = params['k']
        for metric in ranking_metrics:
            result = getattr(evaluation, metric)(validation_data,
                                                 all_predictions,
                                                 col_prediction='prediction',
                                                 k=k)
            logger.debug("%s@%d = %g", metric, k, result)
            if metric == params['primary_metric']:
                metrics_dict['default'] = result
            else:
                metrics_dict[metric] = result

    if len(ranking_metrics) == 0 and len(rating_metrics) == 0:
        raise ValueError("No metrics were specified.")

    # Report the metrics
    nni.report_final_result(metrics_dict)

    # Save the metrics in a JSON file
    output_dir = os.environ.get('NNI_OUTPUT_DIR')
    with open(os.path.join(output_dir, 'metrics.json'), 'w') as fp:
        temp_dict = metrics_dict.copy()
        temp_dict[params['primary_metric']] = temp_dict.pop('default')
        json.dump(temp_dict, fp)

    return svd
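# Hedged sketch, not from the source: one plausible way to drive the NNI variants of
# svd_training above inside a tuning trial. Fixed script settings are merged with the
# hyper-parameters proposed by the tuner via nni.get_next_parameter(); all literal
# values and file names below are illustrative assumptions. The NNI search space is
# assumed to supply the remaining SVD hyper-parameters that the svd_params dict
# comprehension expects (n_factors, n_epochs, lr_all, reg_all, lr_bu, lr_bi, lr_pu,
# lr_qi, reg_bu, reg_bi, reg_pu, reg_qi), and the script is assumed to run as an NNI
# trial so that nni.report_final_result and NNI_OUTPUT_DIR are available.
import nni

if __name__ == "__main__":
    fixed_params = {
        "datastore": ".",                         # assumed working directory
        "train_datapath": "train.pkl",            # assumed pickled training DataFrame
        "validation_datapath": "validation.pkl",  # assumed pickled validation DataFrame
        "surprise_reader": "ml-100k",
        "usercol": "userID",
        "itemcol": "itemID",
        "rating_metrics": ["rmse"],
        "ranking_metrics": [],
        "primary_metric": "rmse",
        "k": 10,
        "remove_seen": True,   # the older variants above read 'recommend_seen' instead
        "random_state": 42,
        "verbose": False,
        "biased": True,
        "init_mean": 0.0,
        "init_std_dev": 0.1,
    }
    tuned_params = nni.get_next_parameter()  # hyper-parameters proposed by the tuner for this trial
    svd_training({**fixed_params, **tuned_params})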
"uid": 'User-Id', "iid": 'Snack Id', "est": 'Review' }) predictions = predictions.drop(["details", "r_ui"], axis="columns") # In[100]: predictions # In[101]: with Timer() as test_time: all_predictions = compute_ranking_predictions(svd, train, usercol='User-Id', itemcol='Snack Id', remove_seen=True) print("Took {} seconds for prediction.".format(test_time.interval)) # In[102]: suffixes = ["_true", "_pred"] rating_true_pred = pd.merge(test, predictions, on=["User-Id", 'Snack Id'], suffixes=suffixes) # In[103]: