# CML on CiteULike: restore a saved model ("cml-citeulike") and set up evaluation.
test_dataset = ImplicitDataset(raw_data['test_data'], raw_data['max_user'],
                               raw_data['max_item'], name='Test')

cml_model = CML(batch_size=batch_size, max_user=train_dataset.max_user(),
                max_item=train_dataset.max_item(), dim_embed=50,
                opt='Adam', sess_config=None, l2_reg=0.1)
sampler = PairwiseSampler(batch_size=batch_size, dataset=train_dataset,
                          num_process=5)
model_trainer = ImplicitModelTrainer(batch_size=batch_size,
                                     test_batch_size=test_batch_size,
                                     train_dataset=train_dataset,
                                     model=cml_model, sampler=sampler,
                                     eval_save_prefix="cml-citeulike")

auc_evaluator = AUC()
recall_evaluator = Recall(recall_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
precision_evaluator = Precision(precision_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
ndcg_evaluator = NDCG(ndcg_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])

# Load the trained parameters and configure the trainer's evaluation state directly.
cml_model.load("cml-citeulike")
model_trainer._eval_manager = ImplicitEvalManager(evaluators=[
    auc_evaluator, recall_evaluator, ndcg_evaluator, precision_evaluator])
model_trainer._num_negatives = 200
model_trainer._exclude_positives([test_dataset])
# ConcatVisualBPR trained with item visual features (item_f_source), validated by AUC.
test_dataset = ImplicitDataset(raw_data['test_data'], raw_data['max_user'],
                               raw_data['max_item'], name='Test')

model = ConcatVisualBPR(batch_size=batch_size, max_user=raw_data['max_user'],
                        max_item=raw_data['max_item'],
                        item_serving_size=item_serving_size,
                        dim_embed=20, dim_ve=10,
                        item_f_source=raw_data['item_features'],
                        l2_reg=None, sess_config=sess_config)
sampler = PairwiseSampler(batch_size=batch_size, dataset=train_dataset,
                          num_process=5)
model_trainer = ImplicitModelTrainer(batch_size=batch_size,
                                     test_batch_size=test_batch_size,
                                     item_serving_size=item_serving_size,
                                     train_dataset=train_dataset,
                                     model=model, sampler=sampler)

auc_evaluator = AUC()
model_trainer.train(num_itr=int(1e5), display_itr=display_itr,
                    eval_datasets=[val_dataset, test_dataset],
                    evaluators=[auc_evaluator], num_negatives=1000)
os.system("wget https://s3.amazonaws.com/cornell-tech-sdl-rec-bias/dataset/citeulike/rsrf_user_data_test.npy") raw_data = dict() raw_data['train_data'] = np.load("rsrf_user_data_train.npy") raw_data['val_data'] = np.load("rsrf_user_data_val.npy") raw_data['test_data'] = np.load("rsrf_user_data_test.npy") raw_data['max_user'] = 5551 raw_data['max_item'] = 16980 batch_size = 8000 test_batch_size = 1000 display_itr = 5000 train_dataset = ImplicitDataset(raw_data['train_data'], raw_data['max_user'], raw_data['max_item'], name='Train') val_dataset = ImplicitDataset(raw_data['val_data'], raw_data['max_user'], raw_data['max_item'], name='Val') test_dataset = ImplicitDataset(raw_data['test_data'], raw_data['max_user'], raw_data['max_item'], name='Test') bpr_model = BPR(batch_size=batch_size, max_user=train_dataset.max_user(), max_item=train_dataset.max_item(), dim_embed=50, opt='Adam', sess_config=None, l2_reg=0.01) sampler = PairwiseSampler(batch_size=batch_size, dataset=train_dataset, num_process=1) model_trainer = ImplicitModelTrainer(batch_size=batch_size, test_batch_size=test_batch_size, train_dataset=train_dataset, model=bpr_model, sampler=sampler, eval_save_prefix="bpr-citeulike") auc_evaluator = AUC() recall_evaluator = Recall(recall_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100]) precision_evaluator = Precision(precision_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100]) ndcg_evaluator = NDCG(ndcg_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100]) model_trainer.train(num_itr=100001, display_itr=display_itr, eval_datasets=[val_dataset], evaluators=[auc_evaluator, recall_evaluator, precision_evaluator, ndcg_evaluator], num_negatives=200)
# WCML on Tradesy: N-pairwise sampling with 5 negatives per positive.
wcml_model = WCML(batch_size=batch_size, max_user=train_dataset.max_user(),
                  max_item=train_dataset.max_item(), dim_embed=50, neg_num=5,
                  l2_reg=1e-3, opt='Adam', sess_config=None)
sampler = NPairwiseSampler(batch_size=batch_size, dataset=train_dataset,
                           negativenum=5, num_process=5)
model_trainer = ImplicitModelTrainer(batch_size=batch_size,
                                     test_batch_size=test_batch_size,
                                     train_dataset=train_dataset,
                                     model=wcml_model, sampler=sampler,
                                     eval_save_prefix="wcml-tradesy",
                                     item_serving_size=666)

auc_evaluator = AUC()
recall_evaluator = Recall(recall_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
precision_evaluator = Precision(precision_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
ndcg_evaluator = NDCG(ndcg_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])

model_trainer.train(num_itr=200001, display_itr=display_itr,
                    eval_datasets=[val_dataset],
                    evaluators=[auc_evaluator, recall_evaluator,
                                precision_evaluator, ndcg_evaluator])
train_dataset = ImplicitDataset(raw_data['train_data'], raw_data['max_user'],
                                raw_data['max_item'], name='Train')
val_dataset = ImplicitDataset(raw_data['val_data'], raw_data['max_user'],
                              raw_data['max_item'], name='Val')
test_dataset = ImplicitDataset(raw_data['test_data'], raw_data['max_user'],
                               raw_data['max_item'], name='Test')

bpr_model = BPR(batch_size=batch_size, max_user=train_dataset.max_user(),
                max_item=train_dataset.max_item(), dim_embed=20,
                opt='Adam', sess_config=sess_config)
sampler = PairwiseSampler(batch_size=batch_size, dataset=train_dataset,
                          num_process=5)
model_trainer = ImplicitModelTrainer(batch_size=batch_size,
                                     test_batch_size=test_batch_size,
                                     train_dataset=train_dataset,
                                     model=bpr_model, sampler=sampler)

auc_evaluator = AUC()
model_trainer.train(num_itr=int(1e5), display_itr=display_itr,
                    eval_datasets=[val_dataset, test_dataset],
                    evaluators=[auc_evaluator])
wcml_model = WCML(batch_size=batch_size, max_user=train_dataset.max_user(),
                  max_item=train_dataset.max_item(), dim_embed=50, neg_num=5,
                  l2_reg=0.001, opt='Adam', sess_config=None)
sampler = NPairwiseSampler(batch_size=batch_size, dataset=train_dataset,
                           negativenum=5, num_process=4)
model_trainer = ImplicitModelTrainer(batch_size=batch_size,
                                     test_batch_size=test_batch_size,
                                     train_dataset=train_dataset,
                                     model=wcml_model, sampler=sampler,
                                     eval_save_prefix="wcml-yahoo",
                                     item_serving_size=500)

auc_evaluator = AUC()
wcml_model.load("wcml-yahoo")
model_trainer._eval_manager = ImplicitEvalManager(evaluators=[auc_evaluator])
model_trainer._num_negatives = 300
model_trainer._exclude_positives([train_dataset, val_dataset])
model_trainer._sample_negatives(seed=10)
model_trainer._eval_save_prefix = "wcml-yahoo-val"
model_trainer._evaluate_partial(val_dataset)
train_dataset = ImplicitDataset(raw_data['train_data'], raw_data['max_user'],
                                raw_data['max_item'], name='Train')
val_dataset = ImplicitDataset(raw_data['val_data'], raw_data['max_user'],
                              raw_data['max_item'], name='Val')

bpr_model = BPR(batch_size=batch_size, max_user=train_dataset.max_user(),
                max_item=train_dataset.max_item(), dim_embed=50, l2_reg=0.001,
                opt='Adam', sess_config=None)
sampler = PairwiseSampler(batch_size=batch_size, dataset=train_dataset,
                          num_process=4)
model_trainer = ImplicitModelTrainer(batch_size=batch_size,
                                     test_batch_size=test_batch_size,
                                     train_dataset=train_dataset,
                                     model=bpr_model, sampler=sampler,
                                     eval_save_prefix="bpr-yahoo",
                                     item_serving_size=666)

auc_evaluator = AUC()
model_trainer.train(num_itr=10001, display_itr=display_itr,
                    eval_datasets=[val_dataset],
                    evaluators=[auc_evaluator], num_negatives=200)
val_dataset = ImplicitDataset(raw_data['val_data'], raw_data['max_user'],
                              raw_data['max_item'], name='Val')

# PMF on Yahoo: restore a saved model ("pmf-yahoo") and evaluate AUC on the positive test split.
model = PMF(batch_size=batch_size, max_user=train_dataset.max_user(),
            max_item=train_dataset.max_item(), dim_embed=50, l2_reg=0.001,
            opt='Adam', sess_config=None)
sampler = PointwiseSampler(batch_size=batch_size, dataset=train_dataset,
                           num_process=4)
model_trainer = ImplicitModelTrainer(batch_size=batch_size,
                                     test_batch_size=test_batch_size,
                                     train_dataset=train_dataset,
                                     model=model, sampler=sampler,
                                     eval_save_prefix="pmf-yahoo",
                                     item_serving_size=666)

auc_evaluator = AUC()
model.load("pmf-yahoo")
model_trainer._eval_manager = ImplicitEvalManager(evaluators=[auc_evaluator])
model_trainer._num_negatives = 200
model_trainer._exclude_positives([train_dataset, test_dataset_pos, test_dataset_neg])
model_trainer._sample_negatives(seed=10)
model_trainer._eval_save_prefix = "pmf-yahoo-test-pos"
model_trainer._evaluate_partial(test_dataset_pos)
def run_test_exp(model_name=None, evaluator=None, raw_data=None, user_per=1.0,
                 keep_days=1, l2_reg=0.001, test_date=None, outdir=None,
                 num_itr=1e4 + 1):
    # Parse the dataset into incremental training and testing sets.
    data = raw_data
    max_user = len(np.unique(data["user_id"]))
    max_item = len(np.unique(data["item_id"]))
    print("max_user:{}, max_item:{}".format(max_user, max_item))

    test_date = datetime.datetime.strptime(test_date, "%Y-%m-%d").date()
    print("test date:%s" % test_date)
    train_data = data[data["timestamp"] < test_date]
    test_data = data[(data["timestamp"] >= test_date) &
                     (data["timestamp"] < (test_date + datetime.timedelta(days=7)))]

    # Keep one randomly chosen test interaction per user.
    np.random.seed(10)
    test_data = np.asarray([
        np.random.choice(test_data[test_data["user_id"] == uid], 1)[0]
        for uid in np.unique(test_data["user_id"])
    ])

    # Filter training data: for the selected users, keep only the latest n days of data.
    print("filter user percentage:%f" % user_per)
    print("ratings before filter:%d" % len(train_data))
    user_list = np.unique(train_data["user_id"])
    filter_user = np.random.choice(user_list, int(len(user_list) * user_per),
                                   replace=False)
    filter_mask = (np.isin(train_data["user_id"], filter_user)) & (
        train_data["timestamp"] < (test_date - datetime.timedelta(days=keep_days)))

    # Output filtered data and test data.
    if LOGGING:
        np.save(outdir + "filtered_data.npy", train_data[filter_mask])
        np.save(outdir + "train_data.npy", train_data[~filter_mask])
        np.save(outdir + "test_data.npy", test_data)

    train_data = train_data[~filter_mask]
    print("ratings after filter:%d" % len(train_data))

    train_dataset = ImplicitDataset(train_data, max_user, max_item, name='Train')
    test_dataset = ImplicitDataset(test_data, max_user, max_item, name='Test')

    num_process = 8
    dim_embed = 50
    if model_name == 'PMF':
        model = PMF(batch_size=batch_size, max_user=train_dataset.max_user(),
                    max_item=train_dataset.max_item(), dim_embed=dim_embed,
                    opt='Adam', l2_reg=l2_reg)
        sampler = PointwiseSampler(batch_size=batch_size, dataset=train_dataset,
                                   pos_ratio=0.5, num_process=num_process)
    elif model_name == 'CML':
        model = CML(batch_size=batch_size, max_user=train_dataset.max_user(),
                    max_item=train_dataset.max_item(), dim_embed=dim_embed,
                    opt='Adam', l2_reg=l2_reg)
        sampler = PairwiseSampler(batch_size=batch_size, dataset=train_dataset,
                                  num_process=num_process)
    elif model_name == 'BPR':
        model = BPR(batch_size=batch_size, max_user=train_dataset.max_user(),
                    max_item=train_dataset.max_item(), dim_embed=dim_embed,
                    opt='Adam', l2_reg=l2_reg)
        sampler = PairwiseSampler(batch_size=batch_size, dataset=train_dataset,
                                  num_process=num_process)
    else:
        print("Wrong model assigned")
        return

    if evaluator == 'Recall':
        test_evaluator = Recall(recall_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
    elif evaluator == 'NDCG':
        test_evaluator = NDCG(ndcg_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
    else:
        print("Wrong evaluator assigned")
        return

    model_trainer = ImplicitModelTrainer(batch_size=batch_size,
                                         test_batch_size=test_batch_size,
                                         train_dataset=train_dataset,
                                         model=model, sampler=sampler,
                                         item_serving_size=1,
                                         eval_save_prefix=outdir)
    model_trainer.train(num_itr=num_itr + 1, display_itr=num_itr,
                        eval_datasets=[test_dataset],
                        evaluators=[test_evaluator], num_negatives=200)
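# Hypothetical usage sketch (not from the original scripts): how run_test_exp might be
# invoked. `ratings` stands in for a NumPy structured array with the "user_id",
# "item_id", and "timestamp" fields the function reads; the file path, date, and
# hyperparameter values below are illustrative assumptions, and the module-level
# batch_size / test_batch_size / LOGGING globals the function relies on are set here.
batch_size = 8000
test_batch_size = 1000
LOGGING = False

ratings = np.load("ratings_with_timestamps.npy")  # hypothetical input file
run_test_exp(model_name='BPR', evaluator='Recall', raw_data=ratings,
             user_per=0.5, keep_days=7, l2_reg=0.001,
             test_date="2014-06-01", outdir="./bpr_keep7/")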
def run_exp(model_name=None, raw_data=None, user_per=1.0, keep_days=1,
            l2_reg=0.001, test_date=None, outdir=None):
    # Parse the dataset into an incremental training set (everything before test_date).
    data = raw_data
    max_user = len(np.unique(data["user_id"]))
    max_item = len(np.unique(data["item_id"]))
    print("max_user:{}, max_item:{}".format(max_user, max_item))

    test_date = datetime.datetime.strptime(test_date, "%Y-%m-%d").date()
    print("test date:%s" % test_date)
    train_data = data[data["timestamp"] < test_date]
    np.random.seed(10)

    # Filter training data: for the selected users, keep only the most recent n days of data.
    print("filter user percentage:%f" % user_per)
    print("ratings before filter:%d" % len(train_data))
    user_list = np.unique(train_data["user_id"])
    filter_user = np.random.choice(user_list, int(len(user_list) * user_per),
                                   replace=False)
    mask = (np.isin(train_data["user_id"], filter_user)) & (
        train_data["timestamp"] < (test_date - datetime.timedelta(days=keep_days)))
    train_data = train_data[~mask]
    print("ratings after filter:%d" % len(train_data))

    # Hold out one rating per user for validation.
    user_list = np.unique(train_data["user_id"])
    val_index = [np.where(train_data["user_id"] == uid)[0][0] for uid in user_list]
    val_data = train_data[val_index]
    train_data = np.delete(train_data, val_index)
    print("train data: %d, validation data %d" % (len(train_data), len(val_data)))

    train_dataset = ImplicitDataset(train_data, max_user, max_item, name='Train')
    val_dataset = ImplicitDataset(val_data, max_user, max_item, name='Val')

    num_process = 8
    dim_embed = 50
    if model_name == 'PMF':
        model = PMF(batch_size=batch_size, max_user=train_dataset.max_user(),
                    max_item=train_dataset.max_item(), dim_embed=dim_embed,
                    opt='Adam', l2_reg=l2_reg)
        sampler = PointwiseSampler(batch_size=batch_size, dataset=train_dataset,
                                   pos_ratio=0.5, num_process=num_process)
    elif model_name == 'CML':
        model = CML(batch_size=batch_size, max_user=train_dataset.max_user(),
                    max_item=train_dataset.max_item(), dim_embed=dim_embed,
                    opt='Adam', l2_reg=l2_reg)
        sampler = PairwiseSampler(batch_size=batch_size, dataset=train_dataset,
                                  num_process=num_process)
    elif model_name == 'BPR':
        model = BPR(batch_size=batch_size, max_user=train_dataset.max_user(),
                    max_item=train_dataset.max_item(), dim_embed=dim_embed,
                    opt='Adam', l2_reg=l2_reg)
        sampler = PairwiseSampler(batch_size=batch_size, dataset=train_dataset,
                                  num_process=num_process)
    else:
        print("Wrong model assigned")
        return

    recall_evaluator = Recall(recall_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
    ndcg_evaluator = NDCG(ndcg_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
    model_trainer = ImplicitModelTrainer(batch_size=batch_size,
                                         test_batch_size=test_batch_size,
                                         train_dataset=train_dataset,
                                         model=model, sampler=sampler,
                                         item_serving_size=1)
    model_trainer.train(num_itr=num_itr, display_itr=display_itr,
                        eval_datasets=[val_dataset],
                        evaluators=[recall_evaluator, ndcg_evaluator],
                        num_negatives=200)
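# Hypothetical usage sketch (not from the original scripts): run_exp appears to be the
# validation-time counterpart of run_test_exp, training on interactions before test_date
# and reporting Recall/NDCG on one held-out rating per user. It reads the module-level
# num_itr and display_itr globals, so they are set here; all values are illustrative.
num_itr = 10001
display_itr = 5000

run_exp(model_name='BPR', raw_data=ratings, user_per=0.5, keep_days=7,
        l2_reg=0.001, test_date="2014-06-01", outdir="./bpr_keep7_val/")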