# Merge train and test to recover the global user/item cardinalities
# (assumes user/item ids are 0..n-1 contiguous — TODO confirm upstream encoding).
df = pd.concat([train_set, test_set], ignore_index=True)
user_num = df['user'].nunique()
item_num = df['item'].nunique()

# get ground truth
test_ur = get_ur(test_set)
total_train_ur = get_ur(train_set)

# initial candidate item pool
item_pool = set(range(item_num))
candidates_num = args.cand_num

print('='*50, '\n')

# retrain model by the whole train set
# start negative sampling
train_sampled = negative_sampling(user_num, item_num, train_set, args.num_ng, sample_method=args.sample_method)

# format training data
train_dataset = PointMFData(train_sampled)
train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=4)

# build recommender model (FM variant trained on the full training split)
model = PointFMV2(user_num, item_num, args.factors, args.lamda, args.epochs, args.lr, args.gpu, args.loss_type)
model.fit(train_loader)

print('Start Calculating Metrics......')

# build candidates set: for each test user, pad the ground-truth items with
# negatives up to candidates_num (no padding needed when the user already
# has >= candidates_num ground-truth items)
test_ucands = defaultdict(list)
for k, v in test_ur.items():
    sample_num = candidates_num - len(v) if len(v) < candidates_num else 0
    # NOTE(review): loop body appears truncated at this chunk boundary —
    # presumably it continues with negative sampling per user, mirroring the
    # val_ucands construction inside opt_func; verify against the full file.
def opt_func(params, mi=args.sc_met, topk=args.topk):
    """Hyper-parameter search objective for PointNeuMF.

    For each CV fold: negative-sample the fold's training split, train a
    PointNeuMF model with the candidate hyper-parameters, build a per-user
    candidate set (ground truth padded with unseen negatives), generate a
    top-k recommendation list, and compute ranking metrics. Metrics are
    averaged over folds, appended to the module-level result file ``f``,
    and the selected metric is returned negated so a minimiser maximises it.

    Parameters
    ----------
    params : dict
        Sampled hyper-parameters with keys 'num_ng', 'factor_num',
        'num_layers', 'dropout', 'lr', 'batch_size', 'lamda'.
    mi : str
        Key into the module-level ``metric_idx`` mapping selecting which
        averaged metric becomes the optimisation score.
    topk : int
        Length of the recommendation list.

    Returns
    -------
    float
        Negative of the averaged validation score (lower is better).

    Notes
    -----
    Relies on module-level state: ``fn``, ``train_set_list``,
    ``val_set_list``, ``user_num``, ``item_num``, ``item_pool``,
    ``candidates_num``, ``args``, ``metric_idx`` and the open file ``f``.
    """
    num_ng, factor_num, num_layers = int(params['num_ng']), int(params['factor_num']), int(params['num_layers'])
    dropout, lr, batch_size, lamda = params['dropout'], params['lr'], params['batch_size'], params['lamda']
    print(f'Parameter Settings: num_ng:{num_ng},factors:{factor_num},layers:{num_layers},dropout:{dropout},lr:{lr},batch_size:{batch_size},lamda:{lamda}')

    # store metrics result for final validation set
    fnl_metric = []
    for fold in range(fn):
        print(f'Start Validation [{fold + 1}]......')
        train = train_set_list[fold]
        validation = val_set_list[fold]

        # get ground truth
        train_ur = get_ur(train)
        val_ur = get_ur(validation)

        # start negative sampling
        train_sampled = negative_sampling(user_num, item_num, train, num_ng)

        # format training data
        train_dataset = PointMFData(train_sampled)
        train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)

        # whether load pre-train model
        model_name = args.model_name
        assert model_name in ['MLP', 'GMF', 'NeuMF-end', 'NeuMF-pre']
        GMF_model_path = f'./tmp/{args.dataset}/CL/GMF.pt'
        MLP_model_path = f'./tmp/{args.dataset}/CL/MLP.pt'
        NeuMF_model_path = f'./tmp/{args.dataset}/CL/NeuMF.pt'

        if model_name == 'NeuMF-pre':
            assert os.path.exists(GMF_model_path), 'lack of GMF model'
            assert os.path.exists(MLP_model_path), 'lack of MLP model'
            GMF_model = torch.load(GMF_model_path)
            MLP_model = torch.load(MLP_model_path)
        else:
            GMF_model = None
            MLP_model = None

        # build recommender model
        model = PointNeuMF(user_num, item_num, factor_num, num_layers,
                           dropout, lr, args.epochs, lamda, args.model_name,
                           GMF_model, MLP_model, args.gpu, args.loss_type)
        model.fit(train_loader)

        # build candidates set: ground truth plus sampled negatives drawn
        # from items the user never interacted with
        val_ucands = defaultdict(list)
        for k, v in val_ur.items():
            sample_num = candidates_num - len(v) if len(v) < candidates_num else 0
            sub_item_pool = item_pool - v - train_ur[k]  # remove GT & interacted
            sample_num = min(len(sub_item_pool), sample_num)
            # BUG FIX: random.sample() on a set was deprecated in Python 3.9
            # and raises TypeError on 3.11+; materialise to a list first.
            samples = random.sample(list(sub_item_pool), sample_num)
            val_ucands[k] = list(v | set(samples))

        # get predict result
        print('')
        print('Generate recommend list...')
        print('')
        preds = {}
        for u in tqdm(val_ucands.keys()):
            # build a validation MF dataset for certain user u
            tmp = pd.DataFrame({
                'user': [u for _ in val_ucands[u]],
                'item': val_ucands[u],
                'rating': [0. for _ in val_ucands[u]],  # fake label, make nonsense
            })
            tmp_dataset = PointMFData(tmp)
            tmp_loader = data.DataLoader(tmp_dataset, batch_size=candidates_num, shuffle=False, num_workers=0)

            # get top-N list with torch method
            for user_u, item_i, _ in tmp_loader:
                if torch.cuda.is_available():
                    user_u = user_u.cuda()
                    item_i = item_i.cuda()
                else:
                    user_u = user_u.cpu()
                    item_i = item_i.cpu()

                prediction = model.predict(user_u, item_i)
                _, indices = torch.topk(prediction, topk)
                # BUG FIX: when predictions live on GPU, `indices` is a CUDA
                # tensor while the candidate tensor is on CPU; torch.take with
                # mixed devices raises a RuntimeError, so move indices first.
                top_n = torch.take(torch.tensor(val_ucands[u]), indices.cpu()).numpy()
                preds[u] = top_n

        # convert rank list to binary-interaction (1 = hit in ground truth)
        for u in preds.keys():
            preds[u] = [1 if i in val_ur[u] else 0 for i in preds[u]]

        # calculate metrics for validation set
        pre_k = np.mean([precision_at_k(r, topk) for r in preds.values()])
        rec_k = recall_at_k(preds, val_ur, topk)
        hr_k = hr_at_k(preds, val_ur)
        map_k = map_at_k(preds.values())
        mrr_k = mrr_at_k(preds, topk)
        ndcg_k = np.mean([ndcg_at_k(r, topk) for r in preds.values()])

        tmp_metric = np.array([pre_k, rec_k, hr_k, map_k, mrr_k, ndcg_k])
        fnl_metric.append(tmp_metric)

    # get final validation metrics result by average operation
    fnl_metric = np.array(fnl_metric).mean(axis=0)
    print('='*20, 'Metrics for All Validation', '='*20)
    print(f'Precision@{topk}: {fnl_metric[0]:.4f}')
    print(f'Recall@{topk}: {fnl_metric[1]:.4f}')
    print(f'HR@{topk}: {fnl_metric[2]:.4f}')
    print(f'MAP@{topk}: {fnl_metric[3]:.4f}')
    print(f'MRR@{topk}: {fnl_metric[4]:.4f}')
    print(f'NDCG@{topk}: {fnl_metric[5]:.4f}')

    score = fnl_metric[metric_idx[mi]]

    # record all tuning result and settings
    fnl_metric = [f'{mt:.4f}' for mt in fnl_metric]
    line = ','.join(fnl_metric) + f',{num_ng},{factor_num},{num_layers},{dropout},{lr},{batch_size},{lamda}' + '\n'
    f.write(line)
    f.flush()

    return -score