from openrec import ModelTrainer
from openrec.recommenders import BPR
from openrec.utils import Dataset
from openrec.utils.evaluators import AUC
from openrec.utils.samplers import RandomPairwiseSampler
from openrec.utils.samplers import EvaluationSampler
import dataloader

raw_data = dataloader.load_citeulike()

dim_embed = 100
total_iter = 10000
batch_size = 1000
eval_iter = 10000
save_iter = eval_iter

train_dataset = Dataset(raw_data['train_data'], raw_data['total_users'],
                        raw_data['total_items'], name='Train')
val_dataset = Dataset(raw_data['val_data'], raw_data['total_users'],
                      raw_data['total_items'], name='Val', num_negatives=500)
test_dataset = Dataset(raw_data['test_data'], raw_data['total_users'],
                       raw_data['total_items'], name='Test', num_negatives=500)

train_sampler = RandomPairwiseSampler(batch_size=batch_size,
                                      dataset=train_dataset, num_process=5)
val_sampler = EvaluationSampler(batch_size=batch_size, dataset=val_dataset)
test_sampler = EvaluationSampler(batch_size=batch_size, dataset=test_dataset)

bpr_model = BPR(batch_size=batch_size,
                total_users=train_dataset.total_users(),
                total_items=train_dataset.total_items(),
                l2_reg=0.01,
                dim_user_embed=dim_embed,
                dim_item_embed=dim_embed,
                save_model_dir='bpr_recommender/',
                train=True, serve=True)

model_trainer = ModelTrainer(model=bpr_model)
auc_evaluator = AUC()

model_trainer.train(total_iter=total_iter, eval_iter=eval_iter,
                    save_iter=save_iter, train_sampler=train_sampler,
                    eval_samplers=[val_sampler], evaluators=[auc_evaluator])
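# The script above relies on a local `dataloader` module. A minimal sketch of
# what `load_citeulike()` plausibly returns, inferred from the Dataset calls
# above and the structured-array dtype used in the holdout code below; the
# toy data and counts here are illustrative assumptions, not the real loader.
import numpy as np

def load_citeulike():
    # Each split is a structured array of (user_id, item_id) pairs, which is
    # the raw_data format openrec.utils.Dataset consumes.
    dtype = [('user_id', np.int32), ('item_id', np.int32)]
    train = np.array([(0, 0), (0, 1), (1, 2)], dtype=dtype)  # toy interactions
    val = np.array([(0, 2)], dtype=dtype)
    test = np.array([(1, 0)], dtype=dtype)
    return {'train_data': train, 'val_data': val, 'test_data': test,
            'total_users': 2, 'total_items': 3}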
def sample_data_and_train(self):
    self.logger.warning(
        'sample_data_and_train called, pid = %d. '
        'Please kill the process on unsuccessful training.', os.getpid())
    self.logger.info('-------- sample_data_and_train starts --------')

    # First pass over the dataset: count users and total interactions.
    total_users = 0
    interactions_count = 0
    with open(os.path.dirname(os.path.abspath(__file__)) +
              self.path_to_dataset, 'r') as fin:
        for line in fin:
            interactions_count += int(line.split()[0])
            total_users += 1

    self.logger.info('############ collecting data.. ############')
    # Randomly hold out one item per user for validation and testing,
    # respectively.
    val_structured_arr = np.zeros(total_users,
                                  dtype=[('user_id', np.int32),
                                         ('item_id', np.int32)])
    test_structured_arr = np.zeros(total_users,
                                   dtype=[('user_id', np.int32),
                                          ('item_id', np.int32)])
    train_structured_arr = np.zeros(interactions_count - total_users * 2,
                                    dtype=[('user_id', np.int32),
                                           ('item_id', np.int32)])

    interaction_ind = 0
    next_user_id = 0
    next_item_id = 0
    map_to_item_id = dict()  # Map item ids to the range [0, len(items) - 1].

    # Second pass: shuffle each user's items, then split them.
    with open(os.path.dirname(os.path.abspath(__file__)) +
              self.path_to_dataset, 'r') as fin:
        for line in fin:
            item_list = line.split()[1:]
            random.shuffle(item_list)
            for ind, item in enumerate(item_list):
                if item not in map_to_item_id:
                    map_to_item_id[item] = next_item_id
                    next_item_id += 1
                if ind == 0:
                    val_structured_arr[next_user_id] = (
                        next_user_id, map_to_item_id[item])
                elif ind == 1:
                    test_structured_arr[next_user_id] = (
                        next_user_id, map_to_item_id[item])
                else:
                    train_structured_arr[interaction_ind] = (
                        next_user_id, map_to_item_id[item])
                    interaction_ind += 1
            next_user_id += 1

    self.logger.info('############ instantiating dataset.. ############')
    from openrec.utils import Dataset
    train_dataset = Dataset(raw_data=train_structured_arr,
                            total_users=total_users,
                            total_items=len(map_to_item_id),
                            name='Train')
    val_dataset = Dataset(raw_data=val_structured_arr,
                          total_users=total_users,
                          total_items=len(map_to_item_id),
                          num_negatives=500,
                          name='Val')
    test_dataset = Dataset(raw_data=test_structured_arr,
                           total_users=total_users,
                           total_items=len(map_to_item_id),
                           num_negatives=500,
                           name='Test')

    self.logger.info('############ instantiating Samplers.. ############')
    from openrec.utils.samplers import RandomPairwiseSampler
    from openrec.utils.samplers import EvaluationSampler
    train_sampler = RandomPairwiseSampler(batch_size=1000,
                                          dataset=train_dataset,
                                          num_process=5)
    val_sampler = EvaluationSampler(batch_size=1000, dataset=val_dataset)
    test_sampler = EvaluationSampler(batch_size=1000, dataset=test_dataset)

    self.logger.info('############ instantiating Recommender.. ############')
    from openrec.recommenders import BPR
    bpr_model = BPR(batch_size=1000,
                    total_users=train_dataset.total_users(),
                    total_items=train_dataset.total_items(),
                    dim_user_embed=50,
                    dim_item_embed=50,
                    save_model_dir='bpr_recommender/',
                    train=True, serve=True)

    self.logger.info('############ instantiating Evaluator.. ############')
    from openrec.utils.evaluators import AUC
    auc_evaluator = AUC()

    self.logger.info('############ instantiating Model trainer.. ############')
    from openrec import ModelTrainer
    model_trainer = ModelTrainer(model=bpr_model)

    print("############ starting training.. ############")
    model_trainer.train(
        total_iter=10000,  # Total number of training iterations.
        eval_iter=1000,    # Evaluate the model every "eval_iter" iterations.
        save_iter=10000,   # Save the model every "save_iter" iterations.
        train_sampler=train_sampler,
        eval_samplers=[val_sampler, test_sampler],
        evaluators=[auc_evaluator])
    self.logger.info('-------- sample_data_and_train ends --------')
import pickle

import numpy as np

from openrec.utils.evaluators import AUC, Recall, Precision, NDCG

infilename = "./others-gmf-citeulike-test_evaluate_partial.pickle"
trainset_path = "/Users/xuan/Documents/Specialization Project/openrec/dataset/citeulike/user_data_train.npy"

trainset = np.load(trainset_path)
trainset = trainset['user_id']

# Count how many training interactions each user has.
frequency = dict()
for i in trainset:
    if i in frequency:
        frequency[i] += 1
    else:
        frequency[i] = 1

# auc_evaluator = AUC()
recall_evaluator = Recall(recall_at=[10])
precision_evaluator = Precision(precision_at=[10])
ndcg_evaluator = NDCG(ndcg_at=[10])

with open(infilename, 'rb') as f:
    p = pickle.load(f)

# The pickle stores, per user, scores for num_negatives sampled negatives
# followed by scores for that user's positive items.
score_per_user = dict()
count_per_user = dict()
for user in p['users']:
    neg_scores = p['results'][user][:p['num_negatives']]
    for i in range(len(p['user_items'][user][p['num_negatives']:])):
        pos_score = p['results'][user][p['num_negatives'] + i]
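# The snippet above is cut off inside the scoring loop. A hedged sketch of
# the per-user AUC it appears to be building (the score_per_user /
# count_per_user dicts suggest averaging one-positive-vs-negatives AUCs over
# each user's positives); `per_user_auc` is a hypothetical helper, not the
# original code.
import numpy as np

def per_user_auc(pos_score, neg_scores):
    # AUC for a single positive against sampled negatives: the fraction of
    # negatives ranked strictly below the positive, with ties counted as 0.5.
    neg_scores = np.asarray(neg_scores)
    return ((neg_scores < pos_score).sum() +
            0.5 * (neg_scores == pos_score).sum()) / len(neg_scores)

# Example: one positive scored 0.9 against three sampled negatives.
auc = per_user_auc(0.9, [0.1, 0.5, 0.95])  # -> 2/3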
import os
import sys

from openrec import ModelTrainer
from openrec.recommenders import PMF
from openrec.utils import Dataset
from openrec.utils.evaluators import AUC
from openrec.utils.samplers import EvaluationSampler, StratifiedPointwiseSampler

# NegativePointwiseSampler, loadSpotify, loadByteDance, and the
# hyperparameters (batch_size, dim_user_embed, dim_item_embed, total_iter,
# eval_iter, save_iter) are defined elsewhere in the original project.

def exp(dataset, l2_reg, pos_ratio, neg_ratio, eval_explicit, save_log,
        eval_rank):
    if neg_ratio is not None:
        if pos_ratio + neg_ratio > 1.0 or pos_ratio + neg_ratio <= 0.0:
            print("Invalid sampling ratios...")
            return

    if dataset == 'spotify':
        data = loadSpotify()
    elif dataset == 'bytedance':
        data = loadByteDance()
    else:
        print("Unsupported dataset...")
        return

    # Save logging and model.
    log_dir = "validation_logs/{}_{}_{}_{}_{}_{}/".format(
        dataset, l2_reg, pos_ratio, neg_ratio, eval_explicit, eval_rank)
    os.makedirs(log_dir, exist_ok=True)
    if save_log:
        log = open(log_dir + "validation.log", "w")
        sys.stdout = log

    # Prepare train, val, and test sets and samplers.
    train_dataset = Dataset(data['train'], data['total_users'],
                            data['total_items'], name='Train')
    if neg_ratio is None:
        train_sampler = StratifiedPointwiseSampler(batch_size=batch_size,
                                                   dataset=train_dataset,
                                                   pos_ratio=pos_ratio,
                                                   num_process=5)
    else:
        train_sampler = NegativePointwiseSampler(batch_size=batch_size,
                                                 dataset=train_dataset,
                                                 pos_ratio=pos_ratio,
                                                 neg_ratio=neg_ratio,
                                                 num_process=5)
        if neg_ratio > 0.0:
            print("Re-weighting implicit negative feedback")
        else:
            print("Corrected negative feedback labels but not re-weighting")

    # Number of negative samples for evaluation (None = use explicit labels).
    eval_num_neg = None if eval_explicit else 500
    if eval_rank:
        # Show evaluation metrics for click-complete and click-skip items
        # separately.
        pos_dataset = Dataset(data['pos_test'], data['total_users'],
                              data['total_items'],
                              implicit_negative=not eval_explicit,
                              name='Pos_Test', num_negatives=eval_num_neg)
        neg_dataset = Dataset(data['neg_test'], data['total_users'],
                              data['total_items'],
                              implicit_negative=not eval_explicit,
                              name='Neg_Test', num_negatives=eval_num_neg)
        pos_sampler = EvaluationSampler(batch_size=batch_size,
                                        dataset=pos_dataset)
        neg_sampler = EvaluationSampler(batch_size=batch_size,
                                        dataset=neg_dataset)
        eval_samplers = [pos_sampler, neg_sampler]
    else:
        val_dataset = Dataset(data['val'], data['total_users'],
                              data['total_items'],
                              implicit_negative=not eval_explicit,
                              name='Val', num_negatives=eval_num_neg)
        test_dataset = Dataset(data['test'], data['total_users'],
                               data['total_items'],
                               implicit_negative=not eval_explicit,
                               name='Test', num_negatives=eval_num_neg)
        val_sampler = EvaluationSampler(batch_size=batch_size,
                                        dataset=val_dataset)
        test_sampler = EvaluationSampler(batch_size=batch_size,
                                         dataset=test_dataset)
        eval_samplers = [val_sampler, test_sampler]

    # Set evaluators.
    auc_evaluator = AUC()
    evaluators = [auc_evaluator]

    # Set model parameters.
    model = PMF(l2_reg=l2_reg,
                batch_size=batch_size,
                total_users=train_dataset.total_users(),
                total_items=train_dataset.total_items(),
                dim_user_embed=dim_user_embed,
                dim_item_embed=dim_item_embed,
                save_model_dir=log_dir,
                train=True, serve=True)

    # Set model trainer.
    model_trainer = ModelTrainer(model=model)
    model_trainer.train(total_iter=total_iter, eval_iter=eval_iter,
                        save_iter=save_iter, train_sampler=train_sampler,
                        eval_samplers=eval_samplers, evaluators=evaluators)
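# A hypothetical driver for exp(), sketching how a validation grid over the
# regularization strength and sampling ratio might be launched; the value
# ranges and flag settings below are illustrative assumptions only.
if __name__ == '__main__':
    for l2_reg in [0.001, 0.01, 0.1]:
        for pos_ratio in [0.2, 0.5]:
            exp(dataset='spotify', l2_reg=l2_reg, pos_ratio=pos_ratio,
                neg_ratio=None,        # None -> StratifiedPointwiseSampler
                eval_explicit=False,   # evaluate against sampled negatives
                save_log=True, eval_rank=False)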
# names=True, encoding='utf8')  <- tail of a truncated np.genfromtxt call
# csv = np.genfromtxt('movies_medium.csv', delimiter=",",
#                     dtype='int,int,float,bool,float,float',
#                     names=True, encoding='ansi')
# csv = np.genfromtxt('Movies_ratings_small_merged_reduced.csv', delimiter=",",
#                     dtype='int,int,float,float,int,int,str,str,float,int,int,str,bool',
#                     names=True, encoding='ansi')

# Permute all the data, then subsection it off: build temporary lists first,
# then convert to numpy.
test1Temp = []

model_trainer = None

# Add evaluators.
recall_evaluator = Recall(recall_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
precision_evaluator = Precision(
    precision_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
ndcg_evaluator = NDCG(ndcg_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
evaluators = [AUC(), recall_evaluator, precision_evaluator, ndcg_evaluator]

# tc is a training-config object defined elsewhere in the original script.
combined_recommender = CombinedRecommender(batch_size=tc.batch_size,
                                           max_user=tc.max_user,
                                           max_item=tc.max_item)

# ============================ GENETIC ALGORITHM ============================ #
# Evaluation function: minimized by the GA.
global_min = [1, 1, 1]

def evalOneMin(individual):
    # Calling _evaluate_full manually without a sampler is single-process,
    # so it is safe to set the ensemble weights and then evaluate.
    combined_recommender.set_ensemble(individual)
    eval_metrics = model_trainer._evaluate_full(test_dataset)
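# The naming above (individual, evalOneMin) follows DEAP's conventions, so
# the surrounding file likely drives this with a DEAP toolbox. A hedged
# sketch of that wiring, assuming evalOneMin is completed to return a
# one-element fitness tuple such as (1.0 - auc,), which DEAP requires; the
# operator choices and n=3 weights-per-individual are assumptions.
import random

from deap import base, creator, tools

creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", list, fitness=creator.FitnessMin)

toolbox = base.Toolbox()
toolbox.register("attr_weight", random.random)
toolbox.register("individual", tools.initRepeat, creator.Individual,
                 toolbox.attr_weight, n=3)  # one weight per ensemble member
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", evalOneMin)
toolbox.register("mate", tools.cxBlend, alpha=0.5)
toolbox.register("mutate", tools.mutGaussian, mu=0.0, sigma=0.1, indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)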