Example #1
0
from openrec.utils.evaluators import AUC
from openrec.utils.samplers import RandomPairwiseSampler
from openrec.utils.samplers import EvaluationSampler
import dataloader

# Imports the original script used but never brought into scope:
# Dataset, BPR and ModelTrainer raised NameError at runtime.
from openrec import ModelTrainer
from openrec.recommenders import BPR
from openrec.utils import Dataset

raw_data = dataloader.load_citeulike()

# Experiment hyperparameters.
dim_embed = 100
total_iter = 10000
batch_size = 1000
eval_iter = 10000   # evaluate once, at the end of training
save_iter = eval_iter

# Val/test draw 500 sampled negatives per user for evaluation.
train_dataset = Dataset(raw_data['train_data'], raw_data['total_users'], raw_data['total_items'], name='Train')
val_dataset = Dataset(raw_data['val_data'], raw_data['total_users'], raw_data['total_items'], name='Val', num_negatives=500)
test_dataset = Dataset(raw_data['test_data'], raw_data['total_users'], raw_data['total_items'], name='Test', num_negatives=500)

train_sampler = RandomPairwiseSampler(batch_size=batch_size, dataset=train_dataset, num_process=5)
val_sampler = EvaluationSampler(batch_size=batch_size, dataset=val_dataset)
# NOTE(review): test_sampler is built but not passed to train() below --
# confirm whether the test split was meant to be evaluated.
test_sampler = EvaluationSampler(batch_size=batch_size, dataset=test_dataset)

bpr_model = BPR(batch_size=batch_size, total_users=train_dataset.total_users(), total_items=train_dataset.total_items(),
                l2_reg=0.01,
                dim_user_embed=dim_embed, dim_item_embed=dim_embed, save_model_dir='bpr_recommender/', train=True, serve=True)

model_trainer = ModelTrainer(model=bpr_model)

auc_evaluator = AUC()
model_trainer.train(total_iter=total_iter, eval_iter=eval_iter, save_iter=save_iter, train_sampler=train_sampler,
                    eval_samplers=[val_sampler], evaluators=[auc_evaluator])

Example #2
0
    def sample_data_and_train(self):
        """Parse the raw dataset file, split it per user, and train a BPR recommender.

        The dataset file has one line per user: the first token is that
        user's interaction count, the remaining tokens are item ids.  For
        each user two randomly chosen items are held out (one for
        validation, one for test) and the rest go to training.

        NOTE(review): the split assumes every user has at least 3 items;
        a user with fewer would leave zeroed placeholder rows in the
        structured arrays -- confirm against the dataset format.
        """
        self.logger.warning(
            'sample_data_and_train called, pid = %d Please kill process on unsuccessful training',
            os.getpid())
        self.logger.info('-------- sample_data_and_train starts --------')

        # Hoisted: the same path expression was previously built twice.
        dataset_path = os.path.dirname(
            os.path.abspath(__file__)) + self.path_to_dataset

        # First pass: count users (one line each) and total interactions
        # (first token on each line).
        total_users = 0
        interactions_count = 0
        with open(dataset_path, 'r') as fin:
            for line in fin:
                interactions_count += int(line.split()[0])
                total_users += 1
        self.logger.info('############ collecting data.. ############')

        # Randomly hold out an item per user for validation and testing respectively.
        val_structured_arr = np.zeros(total_users,
                                      dtype=[('user_id', np.int32),
                                             ('item_id', np.int32)])
        test_structured_arr = np.zeros(total_users,
                                       dtype=[('user_id', np.int32),
                                              ('item_id', np.int32)])
        # Two interactions per user are held out, hence the size adjustment.
        train_structured_arr = np.zeros(interactions_count - total_users * 2,
                                        dtype=[('user_id', np.int32),
                                               ('item_id', np.int32)])

        interaction_ind = 0
        next_user_id = 0
        next_item_id = 0
        map_to_item_id = dict()  # Map item id from 0 to len(items)-1

        # Second pass: shuffle each user's items, route the first to
        # validation, the second to test, the remainder to training.
        with open(dataset_path, 'r') as fin:
            for line in fin:
                item_list = line.split()[1:]
                random.shuffle(item_list)
                for ind, item in enumerate(item_list):
                    if item not in map_to_item_id:
                        map_to_item_id[item] = next_item_id
                        next_item_id += 1
                    if ind == 0:
                        val_structured_arr[next_user_id] = (
                            next_user_id, map_to_item_id[item])
                    elif ind == 1:
                        test_structured_arr[next_user_id] = (
                            next_user_id, map_to_item_id[item])
                    else:
                        train_structured_arr[interaction_ind] = (
                            next_user_id, map_to_item_id[item])
                        interaction_ind += 1
                next_user_id += 1

        self.logger.info('############ instantiating dataset.. ############')

        from openrec.utils import Dataset

        train_dataset = Dataset(raw_data=train_structured_arr,
                                total_users=total_users,
                                total_items=len(map_to_item_id),
                                name='Train')
        val_dataset = Dataset(raw_data=val_structured_arr,
                              total_users=total_users,
                              total_items=len(map_to_item_id),
                              num_negatives=500,
                              name='Val')
        test_dataset = Dataset(raw_data=test_structured_arr,
                               total_users=total_users,
                               total_items=len(map_to_item_id),
                               num_negatives=500,
                               name='Test')

        self.logger.info("############ instantiating Samplers.. ############")

        from openrec.utils.samplers import RandomPairwiseSampler
        from openrec.utils.samplers import EvaluationSampler

        train_sampler = RandomPairwiseSampler(batch_size=1000,
                                              dataset=train_dataset,
                                              num_process=5)
        val_sampler = EvaluationSampler(batch_size=1000, dataset=val_dataset)
        test_sampler = EvaluationSampler(batch_size=1000, dataset=test_dataset)

        self.logger.info(
            "############ instantiating Recommender.. ############")

        from openrec.recommenders import BPR

        bpr_model = BPR(batch_size=1000,
                        total_users=train_dataset.total_users(),
                        total_items=train_dataset.total_items(),
                        dim_user_embed=50,
                        dim_item_embed=50,
                        save_model_dir='bpr_recommender/',
                        train=True,
                        serve=True)

        self.logger.info("############ instantiating Evaluator.. ############")

        from openrec.utils.evaluators import AUC

        auc_evaluator = AUC()

        self.logger.info(
            "############ instantiating Model trainer.. ############")

        from openrec import ModelTrainer

        model_trainer = ModelTrainer(model=bpr_model)

        # Fixed: was a bare print(); route through the logger like every
        # other progress message in this method.
        self.logger.info("############ starting training.. ############")

        model_trainer.train(
            total_iter=10000,  # Total number of training iterations
            eval_iter=1000,  # Evaluate the model every "eval_iter" iterations
            save_iter=10000,  # Save the model every "save_iter" iterations
            train_sampler=train_sampler,
            eval_samplers=[val_sampler, test_sampler],
            evaluators=[auc_evaluator])
        self.logger.info("-------- sample_data_and_train ends --------")
Example #3
0
infilename = "./others-gmf-citeulike-test_evaluate_partial.pickle"
trainset_path = "/Users/xuan/Documents/Specialization Project/openrec/dataset/citeulike/user_data_train.npy"
###

#
from collections import Counter

# Per-user interaction counts from the training split.
trainset = np.load(trainset_path)
trainset = trainset['user_id']
# Counter replaces the hand-rolled if/else counting loop; it is a dict
# subclass, so downstream dict-style access keeps working unchanged.
frequency = Counter(trainset)
#

# Evaluators: AUC plus ranking metrics at cutoff 10.
auc_evaluator = AUC()
recall_evaluator = Recall(recall_at=[10])
precision_evaluator = Precision(precision_at=[10])
ndcg_evaluator = NDCG(ndcg_at=[10])

# Fixed: context manager guarantees the file handle is closed even if
# pickle.load raises (previously open/load/close with no try/finally).
# SECURITY NOTE: pickle.load can execute arbitrary code -- only load
# pickle files produced by this project.
with open(infilename, 'rb') as f:
    p = pickle.load(f)

# Accumulators keyed by user id, filled by the loop that follows.
score_per_user = dict()
count_per_user = dict()

for user in p['users']:
    neg_scores = p['results'][user][:p['num_negatives']]
    for i in range(len(p['user_items'][user][p['num_negatives'] : ])):
        pos_score = p['results'][user][p['num_negatives'] + i]
Example #4
0
def exp(dataset, l2_reg, pos_ratio, neg_ratio, eval_explicit, save_log, eval_rank):
    """Run one PMF training/evaluation experiment.

    Args:
        dataset: 'spotify' or 'bytedance'; anything else aborts with a message.
        l2_reg: L2 regularization weight passed to PMF.
        pos_ratio: positive-sample ratio for the pointwise training sampler.
        neg_ratio: if not None, use NegativePointwiseSampler with this
            explicit-negative ratio; pos_ratio + neg_ratio must lie in (0, 1].
        eval_explicit: if True, evaluate on explicit negatives (no sampling);
            otherwise 500 negatives are sampled per evaluation user.
        save_log: if True, redirect stdout into a log file under the run dir.
        eval_rank: if True, evaluate click-complete and click-skip test sets
            separately instead of the val/test splits.

    Relies on module-level config (batch_size, dim_user_embed, dim_item_embed,
    total_iter, eval_iter, save_iter) and loaders (loadSpotify, loadByteDance).
    """
    if neg_ratio is not None:
        # Reject sampling ratios that cannot form a valid batch mixture.
        if pos_ratio + neg_ratio > 1.0 or pos_ratio + neg_ratio <= 0.0:
            print("Invalid sampling ratios...")
            return

    if dataset == 'spotify':
        data = loadSpotify()
    elif dataset == 'bytedance':
        data = loadByteDance()
    else:
        print("Unsupported dataset...")
        return

    # Save logging and model.
    log_dir = "validation_logs/{}_{}_{}_{}_{}_{}/".format(dataset, l2_reg, pos_ratio, neg_ratio, eval_explicit, eval_rank)
    # Fixed: replaced os.popen("mkdir -p %s" % log_dir).read() -- no shell
    # round-trip and no quoting/injection hazard; exist_ok keeps the same
    # "create if missing" semantics.
    os.makedirs(log_dir, exist_ok=True)
    if save_log:
        log = open(log_dir + "validation.log", "w")
        # NOTE(review): stdout is never restored and `log` is never closed;
        # tolerable for a one-shot experiment script, but worth confirming.
        sys.stdout = log

    # Prepare train, val, test sets and samplers.
    train_dataset = Dataset(data['train'], data['total_users'], data['total_items'], name='Train')
    if neg_ratio is None:
        train_sampler = StratifiedPointwiseSampler(batch_size=batch_size,
                                                   dataset=train_dataset,
                                                   pos_ratio=pos_ratio,
                                                   num_process=5)
    else:
        train_sampler = NegativePointwiseSampler(batch_size=batch_size,
                                                 dataset=train_dataset,
                                                 pos_ratio=pos_ratio,
                                                 neg_ratio=neg_ratio,
                                                 num_process=5)
        if neg_ratio > 0.0:
            print("Re-weighting implicit negative feedback")
        else:
            print("Corrected negative feedback labels but not re-weighting")

    eval_num_neg = None if eval_explicit else 500  # num of negative samples for evaluation
    if eval_rank:
        # Show evaluation metrics for click-complete and click-skip items separately.
        pos_dataset = Dataset(data['pos_test'], data['total_users'], data['total_items'],
                              implicit_negative=not eval_explicit, name='Pos_Test', num_negatives=eval_num_neg)
        neg_dataset = Dataset(data['neg_test'], data['total_users'], data['total_items'],
                              implicit_negative=not eval_explicit, name='Neg_Test', num_negatives=eval_num_neg)
        pos_sampler = EvaluationSampler(batch_size=batch_size, dataset=pos_dataset)
        neg_sampler = EvaluationSampler(batch_size=batch_size, dataset=neg_dataset)
        eval_samplers = [pos_sampler, neg_sampler]
    else:
        val_dataset = Dataset(data['val'], data['total_users'], data['total_items'],
                              implicit_negative=not eval_explicit, name='Val', num_negatives=eval_num_neg)
        test_dataset = Dataset(data['test'], data['total_users'], data['total_items'],
                               implicit_negative=not eval_explicit, name='Test', num_negatives=eval_num_neg)
        val_sampler = EvaluationSampler(batch_size=batch_size, dataset=val_dataset)
        test_sampler = EvaluationSampler(batch_size=batch_size, dataset=test_dataset)
        eval_samplers = [val_sampler, test_sampler]

    # Set evaluators (unused intermediate variable removed).
    evaluators = [AUC()]

    # Set model parameters.
    model = PMF(l2_reg=l2_reg,
                batch_size=batch_size,
                total_users=train_dataset.total_users(),
                total_items=train_dataset.total_items(),
                dim_user_embed=dim_user_embed,
                dim_item_embed=dim_item_embed,
                save_model_dir=log_dir,
                train=True,
                serve=True)

    # Set model trainer and run.
    model_trainer = ModelTrainer(model=model)
    model_trainer.train(total_iter=total_iter,
                        eval_iter=eval_iter,
                        save_iter=save_iter,
                        train_sampler=train_sampler,
                        eval_samplers=eval_samplers,
                        evaluators=evaluators)
Example #5
0
    names=True,
    encoding='utf8')
# csv = np.genfromtxt('movies_medium.csv', delimiter=",", dtype='int,int,float,bool,float,float', names=True, encoding='ansi')
# csv = np.genfromtxt('Movies_ratings_small_merged_reduced.csv', delimiter=",", dtype='int,int,float,float,int,int,str,str,float,int,int,str,bool', names=True, encoding='ansi')

# Permute all the data, then subsection it off - using temp AND THEN numpy

# Scratch slots filled later once the data has been partitioned.
test1Temp = []
model_trainer = None

# Ranking metrics evaluated at every cutoff that is a multiple of 10 up to 100.
_cutoffs = list(range(10, 101, 10))
recall_evaluator = Recall(recall_at=_cutoffs)
precision_evaluator = Precision(precision_at=_cutoffs)
ndcg_evaluator = NDCG(ndcg_at=_cutoffs)
evaluators = [AUC(), recall_evaluator, precision_evaluator, ndcg_evaluator]

combined_recommender = CombinedRecommender(batch_size=tc.batch_size,
                                           max_user=tc.max_user,
                                           max_item=tc.max_item)

#================================GENETIC ALGORITHM======================================#

# EVALUATION FUNCTION - change to min
global_min = [1, 1, 1]


def evalOneMin(individual):
    # calling _evaluate_full manually w/o sampler should be single process so ok to set then evaluate
    combined_recommender.set_ensemble(individual)
    eval_metrics = model_trainer._evaluate_full(test_dataset)