def run_test_exp(model_name=None, evaluator=None, raw_data=None, user_per=1.0,
                 keep_days=1, l2_reg=0.001, test_date=None, outdir=None,
                 num_itr=1e4 + 1):
    """Train one model on pre-``test_date`` data and evaluate on the next week.

    Steps, in order:

    1. Split ``raw_data`` into ratings before ``test_date`` (training) and
       ratings in the 7 days starting at ``test_date`` (testing).
    2. Keep a single randomly drawn test rating per test user (seed 10).
    3. For a random ``user_per`` fraction of training users, drop every
       rating older than ``keep_days`` days before ``test_date``.
    4. Build the requested model/sampler/evaluator and run the trainer.

    Args:
        model_name: ``'PMF'``, ``'CML'`` or ``'BPR'``; anything else prints a
            message and returns without training.
        evaluator: ``'Recall'`` or ``'NDCG'``; anything else prints a message
            and returns without training.
        raw_data: Record array indexable by ``"user_id"``, ``"item_id"`` and
            ``"timestamp"``. Timestamps are compared against
            ``datetime.date`` objects.
            NOTE(review): presumably a NumPy structured array - confirm.
        user_per: Fraction of training users whose history is truncated.
        keep_days: Number of days before ``test_date`` that truncated users
            keep.
        l2_reg: L2 regularisation weight forwarded to the model.
        test_date: First test day as a ``"%Y-%m-%d"`` string.
        outdir: String prefix for saved files. It is concatenated, not
            path-joined, so it should end with a path separator. Also passed
            to the trainer as ``eval_save_prefix``.
        num_itr: Iteration count; coerced to ``int`` before training.

    Returns:
        None.
    """
    # parse dataset into incremental training and testing set
    data = raw_data
    max_user = len(np.unique(data["user_id"]))
    max_item = len(np.unique(data["item_id"]))
    print("max_user:{}, max_item:{}".format(max_user, max_item))

    test_date = datetime.datetime.strptime(test_date, "%Y-%m-%d").date()
    print("test date:%s" % test_date)
    test_end = test_date + datetime.timedelta(days=7)
    timestamps = data["timestamp"]
    train_data = data[timestamps < test_date]
    test_data = data[(timestamps >= test_date) & (timestamps < test_end)]

    # Seed once here: both the per-user test draw and the user filter below
    # consume this random stream, so their order must not change.
    np.random.seed(10)
    test_data = np.asarray([
        np.random.choice(test_data[test_data["user_id"] == uid], 1)[0]
        for uid in np.unique(test_data["user_id"])
    ])

    # filter training data, for selected users keep only the latest n days
    print("filter user percentage:%f" % user_per)
    print("ratings before filter:%d" % len(train_data))
    user_list = np.unique(train_data["user_id"])
    filter_user = np.random.choice(
        user_list, int(len(user_list) * user_per), replace=False)
    keep_from = test_date - datetime.timedelta(days=keep_days)
    filter_mask = np.isin(train_data["user_id"], filter_user) & (
        train_data["timestamp"] < keep_from)

    # output filtered data and test data
    if LOGGING:
        np.save(outdir + "filtered_data.npy", train_data[filter_mask])
        np.save(outdir + "train_data.npy", train_data[~filter_mask])
        np.save(outdir + "test_data.npy", test_data)
    train_data = train_data[~filter_mask]
    print("ratings after filter:%d" % len(train_data))

    train_dataset = ImplicitDataset(train_data, max_user, max_item,
                                    name='Train')
    test_dataset = ImplicitDataset(test_data, max_user, max_item, name='Test')

    num_process = 8
    dim_embed = 50

    # Resolve the model name to classes first (nothing is constructed yet).
    if model_name == 'PMF':
        model_cls, sampler_cls = PMF, PointwiseSampler
        sampler_extra = {'pos_ratio': 0.5}
    elif model_name == 'CML':
        model_cls, sampler_cls = CML, PairwiseSampler
        sampler_extra = {}
    elif model_name == 'BPR':
        model_cls, sampler_cls = BPR, PairwiseSampler
        sampler_extra = {}
    else:
        print("Wrong model assigned")
        return

    # Validate the evaluator before building the model and the multi-process
    # sampler, so a bad evaluator name does not construct them for nothing.
    cutoffs = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    if evaluator == 'Recall':
        test_evaluator = Recall(recall_at=cutoffs)
    elif evaluator == 'NDCG':
        test_evaluator = NDCG(ndcg_at=cutoffs)
    else:
        print("Wrong evaluator assisgned")
        return

    model = model_cls(batch_size=batch_size,
                      max_user=train_dataset.max_user(),
                      max_item=train_dataset.max_item(),
                      dim_embed=dim_embed, opt='Adam', l2_reg=l2_reg)
    sampler = sampler_cls(batch_size=batch_size, dataset=train_dataset,
                          num_process=num_process, **sampler_extra)

    model_trainer = ImplicitModelTrainer(batch_size=batch_size,
                                         test_batch_size=test_batch_size,
                                         train_dataset=train_dataset,
                                         model=model, sampler=sampler,
                                         item_serving_size=1,
                                         eval_save_prefix=outdir)

    # The default (1e4 + 1) is a float. Hand the trainer integer counts so
    # integer-only constructs such as range() accept them.
    # NOTE(review): assumes the trainer needs ints - confirm against
    # ImplicitModelTrainer.train.
    num_itr = int(num_itr)
    # One iteration more than display_itr; presumably so the single report
    # lands on the final iteration - confirm against the trainer.
    model_trainer.train(num_itr=num_itr + 1, display_itr=num_itr,
                        eval_datasets=[test_dataset],
                        evaluators=[test_evaluator], num_negatives=200)
max_item=train_dataset.max_item(), dim_embed=50, opt='Adam', sess_config=None, l2_reg=0.1) sampler = PairwiseSampler(batch_size=batch_size, dataset=train_dataset, num_process=5) model_trainer = ImplicitModelTrainer(batch_size=batch_size, test_batch_size=test_batch_size, train_dataset=train_dataset, model=cml_model, sampler=sampler, eval_save_prefix="cml-citeulike") auc_evaluator = AUC() recall_evaluator = Recall(recall_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100]) precision_evaluator = Precision( precision_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100]) ndcg_evaluator = NDCG(ndcg_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100]) cml_model.load("cml-citeulike") model_trainer._eval_manager = ImplicitEvalManager(evaluators=[ auc_evaluator, recall_evaluator, ndcg_evaluator, precision_evaluator ]) model_trainer._num_negatives = 200 model_trainer._exclude_positives([test_dataset]) model_trainer._sample_negatives(seed=10) model_trainer._eval_save_prefix = "cml-citeulike-test" model_trainer._evaluate_partial(test_dataset)
def run_exp(model_name=None, raw_data=None, user_per=1.0, keep_days=1,
            l2_reg=0.001, test_date=None, outdir=None):
    """Train one model on pre-``test_date`` data with a per-user validation set.

    Ratings before ``test_date`` form the training pool. For a random
    ``user_per`` fraction of users (seed 10), ratings older than
    ``keep_days`` days before ``test_date`` are dropped. One rating per
    remaining user is then moved into a validation set, and the model is
    trained with Recall and NDCG evaluated on that set.

    Args:
        model_name: ``'PMF'``, ``'CML'`` or ``'BPR'``; anything else prints a
            message and returns without training.
        raw_data: Record array indexable by ``"user_id"``, ``"item_id"`` and
            ``"timestamp"``. Timestamps are compared against
            ``datetime.date`` objects.
            NOTE(review): presumably a NumPy structured array - confirm.
        user_per: Fraction of training users whose history is truncated.
        keep_days: Number of days before ``test_date`` that truncated users
            keep.
        l2_reg: L2 regularisation weight forwarded to the model.
        test_date: Split day as a ``"%Y-%m-%d"`` string.
        outdir: Not referenced by this function.

    Returns:
        None.

    Note:
        ``batch_size``, ``test_batch_size``, ``num_itr`` and ``display_itr``
        are read as module-level names, not parameters.
    """
    # parse dataset into incremental training and testing set
    data = raw_data
    max_user = len(np.unique(data["user_id"]))
    max_item = len(np.unique(data["item_id"]))
    print("max_user:{}, max_item:{}".format(max_user, max_item))

    test_date = datetime.datetime.strptime(test_date, "%Y-%m-%d").date()
    print("test date:%s" % test_date)
    train_data = data[data["timestamp"] < test_date]

    np.random.seed(10)

    # filter training data, for selected users keep only the most recent
    # n days of data
    print("filter user percentage:%f" % user_per)
    print("ratings before filter:%d" % len(train_data))
    user_list = np.unique(train_data["user_id"])
    filter_user = np.random.choice(
        user_list, int(len(user_list) * user_per), replace=False)
    keep_from = test_date - datetime.timedelta(days=keep_days)
    mask = np.isin(train_data["user_id"], filter_user) & (
        train_data["timestamp"] < keep_from)
    train_data = train_data[~mask]
    print("ratings after filter:%d" % len(train_data))

    # Hold out each user's FIRST row in the filtered array for validation.
    # return_index yields first-occurrence positions in a single pass, in
    # place of one full scan of the array per user.
    # NOTE(review): this is the most recent rating only if rows are ordered
    # newest-first, and it is not a random pick - confirm the data ordering.
    _, val_index = np.unique(train_data["user_id"], return_index=True)
    val_data = train_data[val_index]
    train_data = np.delete(train_data, val_index)
    print("trian data: %d, validation data %d" %
          (len(train_data), len(val_data)))

    train_dataset = ImplicitDataset(train_data, max_user, max_item,
                                    name='Train')
    val_dataset = ImplicitDataset(val_data, max_user, max_item, name='Val')

    num_process = 8
    dim_embed = 50

    # Resolve the model name to classes; construction happens once below.
    if model_name == 'PMF':
        model_cls, sampler_cls = PMF, PointwiseSampler
        sampler_extra = {'pos_ratio': 0.5}
    elif model_name == 'CML':
        model_cls, sampler_cls = CML, PairwiseSampler
        sampler_extra = {}
    elif model_name == 'BPR':
        model_cls, sampler_cls = BPR, PairwiseSampler
        sampler_extra = {}
    else:
        print("Wrong model assigned")
        return

    model = model_cls(batch_size=batch_size,
                      max_user=train_dataset.max_user(),
                      max_item=train_dataset.max_item(),
                      dim_embed=dim_embed, opt='Adam', l2_reg=l2_reg)
    sampler = sampler_cls(batch_size=batch_size, dataset=train_dataset,
                          num_process=num_process, **sampler_extra)

    recall_evaluator = Recall(
        recall_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
    ndcg_evaluator = NDCG(ndcg_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])

    model_trainer = ImplicitModelTrainer(batch_size=batch_size,
                                         test_batch_size=test_batch_size,
                                         train_dataset=train_dataset,
                                         model=model, sampler=sampler,
                                         item_serving_size=1)
    model_trainer.train(num_itr=num_itr, display_itr=display_itr,
                        eval_datasets=[val_dataset],
                        evaluators=[recall_evaluator, ndcg_evaluator],
                        num_negatives=200)