Example 1
# Example: train and evaluate a BPR recommender on the CiteULike dataset.
# Fix: the original snippet used Dataset, BPR and ModelTrainer without
# importing them; the three imports below bring them into scope.
from openrec import ModelTrainer
from openrec.recommenders import BPR
from openrec.utils import Dataset
from openrec.utils.evaluators import AUC
from openrec.utils.samplers import RandomPairwiseSampler
from openrec.utils.samplers import EvaluationSampler
import dataloader

raw_data = dataloader.load_citeulike()
dim_embed = 100        # dimensionality of user/item embeddings
total_iter = 10000     # total number of training iterations
batch_size = 1000
eval_iter = 10000      # evaluate every eval_iter iterations
save_iter = eval_iter  # checkpoint whenever we evaluate

# Validation/test use 500 sampled negatives per user for ranking evaluation.
train_dataset = Dataset(raw_data['train_data'], raw_data['total_users'], raw_data['total_items'], name='Train')
val_dataset = Dataset(raw_data['val_data'], raw_data['total_users'], raw_data['total_items'], name='Val', num_negatives=500)
test_dataset = Dataset(raw_data['test_data'], raw_data['total_users'], raw_data['total_items'], name='Test', num_negatives=500)

# Pairwise (positive, negative) sampler for BPR; 5 worker processes feed batches.
train_sampler = RandomPairwiseSampler(batch_size=batch_size, dataset=train_dataset, num_process=5)
val_sampler = EvaluationSampler(batch_size=batch_size, dataset=val_dataset)
test_sampler = EvaluationSampler(batch_size=batch_size, dataset=test_dataset)

bpr_model = BPR(batch_size=batch_size,
                total_users=train_dataset.total_users(),
                total_items=train_dataset.total_items(),
                l2_reg=0.01,
                dim_user_embed=dim_embed,
                dim_item_embed=dim_embed,
                save_model_dir='bpr_recommender/',
                train=True,
                serve=True)

model_trainer = ModelTrainer(model=bpr_model)

auc_evaluator = AUC()
# NOTE(review): test_sampler is built but never passed to eval_samplers=;
# add it there if test-set AUC should be reported as well.
model_trainer.train(total_iter=total_iter, eval_iter=eval_iter, save_iter=save_iter,
                    train_sampler=train_sampler,
                    eval_samplers=[val_sampler], evaluators=[auc_evaluator])

Example 2
                      num_negatives=500)
# NOTE(review): this snippet is truncated by extraction — the val_dataset
# assignment above it is cut off mid-call, and the model_trainer.train(...)
# call at the bottom is missing its final arguments and closing parenthesis.
# Code kept byte-identical; only comments added.
test_dataset = Dataset(raw_data['test_data'],
                       raw_data['total_users'],
                       raw_data['total_items'],
                       name='Test',
                       num_negatives=500)  # 500 sampled negatives for ranking eval

# Pairwise (positive, negative) sampler for BPR; 5 worker processes feed batches.
train_sampler = RandomPairwiseSampler(batch_size=batch_size,
                                      dataset=train_dataset,
                                      num_process=5)
val_sampler = EvaluationSampler(batch_size=batch_size, dataset=val_dataset)
test_sampler = EvaluationSampler(batch_size=batch_size, dataset=test_dataset)

# CHANGE_L2_REG_HERE is a template placeholder meant to be substituted with a
# concrete L2 weight before this script can run.
bpr_model = BPR(batch_size=batch_size,
                total_users=train_dataset.total_users(),
                total_items=train_dataset.total_items(),
                l2_reg=CHANGE_L2_REG_HERE,
                dim_user_embed=dim_embed,
                dim_item_embed=dim_embed,
                save_model_dir='bpr_recommender/',
                train=True,
                serve=True)

model_trainer = ModelTrainer(model=bpr_model)

auc_evaluator = AUC()
# Truncated call: the remaining keyword arguments and ')' were lost in extraction.
model_trainer.train(total_iter=total_iter,
                    eval_iter=eval_iter,
                    save_iter=save_iter,
                    train_sampler=train_sampler,
                    eval_samplers=[val_sampler],
Example 3
    def sample_data_and_train(self):
        """Parse the interaction file, hold out per-user items, and train a BPR model.

        Reads the file at ``self.path_to_dataset`` (each line:
        ``<count> <item> <item> ...``), holds out one randomly chosen item per
        user for validation and one for testing, trains an OpenRec BPR
        recommender on the remainder, and evaluates with AUC.
        """
        self.logger.warning(
            'sample_data_and_train called, pid = %d Please kill process on unsuccessful training',
            os.getpid())
        self.logger.info('-------- sample_data_and_train starts --------')

        # Resolve the dataset path once instead of rebuilding it for each pass.
        # NOTE(review): this concatenation assumes self.path_to_dataset begins
        # with a path separator — confirm against how it is configured.
        dataset_path = os.path.dirname(
            os.path.abspath(__file__)) + self.path_to_dataset

        # First pass: count users and interactions so the structured arrays
        # can be pre-sized.  train_count accumulates max(0, n - 2) per user;
        # the original sized the train array as interactions - 2 * users,
        # which under-sizes it (IndexError on fill) whenever any user has
        # fewer than two interactions.
        total_users = 0
        interactions_count = 0
        train_count = 0
        with open(dataset_path, 'r') as fin:
            for line in fin:
                num_items = int(line.split()[0])
                interactions_count += num_items
                train_count += max(0, num_items - 2)
                total_users += 1
        self.logger.info('############ collecting data.. ############')

        # Randomly hold out an item per user for validation and testing respectively.
        interaction_dtype = [('user_id', np.int32), ('item_id', np.int32)]
        val_structured_arr = np.zeros(total_users, dtype=interaction_dtype)
        test_structured_arr = np.zeros(total_users, dtype=interaction_dtype)
        train_structured_arr = np.zeros(train_count, dtype=interaction_dtype)

        interaction_ind = 0
        next_user_id = 0
        next_item_id = 0
        map_to_item_id = dict()  # Map raw item tokens to dense ids 0..len(items)-1

        # Second pass: shuffle each user's items; the first goes to
        # validation, the second to test, and the rest to train.
        with open(dataset_path, 'r') as fin:
            for line in fin:
                item_list = line.split()[1:]
                random.shuffle(item_list)
                for ind, item in enumerate(item_list):
                    if item not in map_to_item_id:
                        map_to_item_id[item] = next_item_id
                        next_item_id += 1
                    if ind == 0:
                        val_structured_arr[next_user_id] = (
                            next_user_id, map_to_item_id[item])
                    elif ind == 1:
                        test_structured_arr[next_user_id] = (
                            next_user_id, map_to_item_id[item])
                    else:
                        train_structured_arr[interaction_ind] = (
                            next_user_id, map_to_item_id[item])
                        interaction_ind += 1
                next_user_id += 1

        self.logger.info('############ instantiating dataset.. ############')

        from openrec.utils import Dataset

        train_dataset = Dataset(raw_data=train_structured_arr,
                                total_users=total_users,
                                total_items=len(map_to_item_id),
                                name='Train')
        val_dataset = Dataset(raw_data=val_structured_arr,
                              total_users=total_users,
                              total_items=len(map_to_item_id),
                              num_negatives=500,
                              name='Val')
        test_dataset = Dataset(raw_data=test_structured_arr,
                               total_users=total_users,
                               total_items=len(map_to_item_id),
                               num_negatives=500,
                               name='Test')

        self.logger.info("############ instantiating Samplers.. ############")

        from openrec.utils.samplers import RandomPairwiseSampler
        from openrec.utils.samplers import EvaluationSampler

        train_sampler = RandomPairwiseSampler(batch_size=1000,
                                              dataset=train_dataset,
                                              num_process=5)
        val_sampler = EvaluationSampler(batch_size=1000, dataset=val_dataset)
        test_sampler = EvaluationSampler(batch_size=1000, dataset=test_dataset)

        self.logger.info(
            "############ instantiating Recommender.. ############")

        from openrec.recommenders import BPR

        bpr_model = BPR(batch_size=1000,
                        total_users=train_dataset.total_users(),
                        total_items=train_dataset.total_items(),
                        dim_user_embed=50,
                        dim_item_embed=50,
                        save_model_dir='bpr_recommender/',
                        train=True,
                        serve=True)

        self.logger.info("############ instantiating Evaluator.. ############")

        from openrec.utils.evaluators import AUC

        auc_evaluator = AUC()

        self.logger.info(
            "############ instantiating Model trainer.. ############")

        from openrec import ModelTrainer

        model_trainer = ModelTrainer(model=bpr_model)

        print("############ starting training.. ############")

        model_trainer.train(
            total_iter=10000,  # Total number of training iterations
            eval_iter=1000,  # Evaluate the model every "eval_iter" iterations
            save_iter=10000,  # Save the model every "save_iter" iterations
            train_sampler=train_sampler,
            eval_samplers=[val_sampler, test_sampler],
            evaluators=[auc_evaluator])
        self.logger.info("-------- sample_data_and_train ends --------")
Example 4
# Template example: train a pointwise MLP recommender (PMLP).  The
# CHANGE_*_HERE tokens are placeholders meant to be substituted before running.
# Fix: the original snippet used Dataset, StratifiedPointwiseSampler, PMLP,
# AUC, Recall and ModelTrainer without importing them.
from openrec import ModelTrainer
from openrec.recommenders import PMLP
from openrec.utils import Dataset
from openrec.utils.evaluators import AUC, Recall
from openrec.utils.samplers import StratifiedPointwiseSampler
from openrec.utils.samplers import EvaluationSampler
import dataloader

#raw_data = dataloader.load_citeulike()
raw_data = dataloader.load_dataset()
dim_embed = CHANGE_DIM_HERE        # placeholder: embedding dimensionality
total_iter = raw_data["max_iteration"]
batch_size = 1000
eval_iter = total_iter             # evaluate once, at the end of training
save_iter = eval_iter              # checkpoint whenever we evaluate

# Validation/test use 500 sampled negatives per user for ranking evaluation.
train_dataset = Dataset(raw_data['train_data'], raw_data['total_users'], raw_data['total_items'], name='Train')
val_dataset = Dataset(raw_data['val_data'], raw_data['total_users'], raw_data['total_items'], name='Val', num_negatives=500)
test_dataset = Dataset(raw_data['test_data'], raw_data['total_users'], raw_data['total_items'], name='Test', num_negatives=500)

# 20% positives per batch; the rest of each batch is sampled as negatives.
train_sampler = StratifiedPointwiseSampler(pos_ratio=0.2, batch_size=batch_size, dataset=train_dataset, num_process=5)
val_sampler = EvaluationSampler(batch_size=batch_size, dataset=val_dataset)
test_sampler = EvaluationSampler(batch_size=batch_size, dataset=test_dataset)

model = PMLP(batch_size=batch_size,
             total_users=train_dataset.total_users(),
             total_items=train_dataset.total_items(),
             l2_reg=CHANGE_L2_REG_HERE,      # placeholder: L2 regularization weight
             mlp_dims=CHANGE_MLP_DIMS_HERE,  # placeholder: MLP hidden-layer sizes
             dim_user_embed=dim_embed,
             dim_item_embed=dim_embed,
             save_model_dir='pmlp_recommender/',
             train=True,
             serve=True)

auc_evaluator = AUC()
# NOTE(review): recall_evaluator and test_sampler are built but never passed
# to train(); add them to evaluators=/eval_samplers= to report those metrics.
recall_evaluator = Recall(recall_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
model_trainer = ModelTrainer(model=model)

model_trainer.train(total_iter=total_iter, eval_iter=eval_iter, save_iter=save_iter,
                    train_sampler=train_sampler,
                    eval_samplers=[val_sampler], evaluators=[auc_evaluator])

Example 5
def exp(dataset, l2_reg, pos_ratio, neg_ratio, eval_explicit, save_log, eval_rank):
    """Run one PMF training/evaluation experiment.

    Args:
        dataset: 'spotify' or 'bytedance' — selects the dataset loader.
        l2_reg: L2 regularization weight for the PMF model.
        pos_ratio: fraction of positive examples per training batch.
        neg_ratio: fraction of explicit negatives per batch, or None to use
            the stratified pointwise sampler instead.
        eval_explicit: if True, evaluate against explicit negatives
            (no negative sampling).
        save_log: if True, redirect stdout to a log file in the run directory.
        eval_rank: if True, evaluate click-complete and click-skip test sets
            separately instead of the val/test split.

    NOTE(review): relies on names defined elsewhere in this file (batch_size,
    total_iter, eval_iter, save_iter, dim_user_embed, dim_item_embed, Dataset,
    PMF, the samplers, AUC, ModelTrainer, loadSpotify/loadByteDance) — confirm
    they are in scope at call time.
    """
    # Reject impossible sampling-ratio combinations before doing any work.
    if neg_ratio is not None:
        if pos_ratio + neg_ratio > 1.0 or pos_ratio + neg_ratio <= 0.0:
            print("Invalid sampling ratios...")
            return

    if dataset == 'spotify':
        data = loadSpotify()
    elif dataset == 'bytedance':
        data = loadByteDance()
    else:
        print("Unsupported dataset...")
        return

    # Create the logging/model directory.  os.makedirs replaces the original
    # os.popen("mkdir -p ...") shell round-trip: portable, no subshell, and
    # not vulnerable to shell metacharacters in the formatted parameters.
    log_dir = "validation_logs/{}_{}_{}_{}_{}_{}/".format(
        dataset, l2_reg, pos_ratio, neg_ratio, eval_explicit, eval_rank)
    os.makedirs(log_dir, exist_ok=True)
    if save_log:
        # NOTE(review): stdout stays redirected for the rest of the process
        # and the handle is never closed/restored — presumably intentional
        # for batch experiment runs; confirm.
        log = open(log_dir + "validation.log", "w")
        sys.stdout = log

    # Prepare train set and sampler.
    train_dataset = Dataset(data['train'], data['total_users'], data['total_items'], name='Train')
    if neg_ratio is None:
        train_sampler = StratifiedPointwiseSampler(batch_size=batch_size,
                                                   dataset=train_dataset,
                                                   pos_ratio=pos_ratio,
                                                   num_process=5)
    else:
        train_sampler = NegativePointwiseSampler(batch_size=batch_size,
                                                 dataset=train_dataset,
                                                 pos_ratio=pos_ratio,
                                                 neg_ratio=neg_ratio,
                                                 num_process=5)
        if neg_ratio > 0.0:
            print("Re-weighting implicit negative feedback")
        else:
            print("Corrected negative feedback labels but not re-weighting")

    eval_num_neg = None if eval_explicit else 500  # num of negative samples for evaluation
    if eval_rank:
        # Show evaluation metrics for click-complete and click-skip items separately.
        pos_dataset = Dataset(data['pos_test'], data['total_users'], data['total_items'],
                              implicit_negative=not eval_explicit, name='Pos_Test', num_negatives=eval_num_neg)
        neg_dataset = Dataset(data['neg_test'], data['total_users'], data['total_items'],
                              implicit_negative=not eval_explicit, name='Neg_Test', num_negatives=eval_num_neg)
        pos_sampler = EvaluationSampler(batch_size=batch_size, dataset=pos_dataset)
        neg_sampler = EvaluationSampler(batch_size=batch_size, dataset=neg_dataset)
        eval_samplers = [pos_sampler, neg_sampler]
    else:
        val_dataset = Dataset(data['val'], data['total_users'], data['total_items'],
                              implicit_negative=not eval_explicit, name='Val', num_negatives=eval_num_neg)
        test_dataset = Dataset(data['test'], data['total_users'], data['total_items'],
                               implicit_negative=not eval_explicit, name='Test', num_negatives=eval_num_neg)
        val_sampler = EvaluationSampler(batch_size=batch_size, dataset=val_dataset)
        test_sampler = EvaluationSampler(batch_size=batch_size, dataset=test_dataset)
        eval_samplers = [val_sampler, test_sampler]

    # Set evaluators.
    auc_evaluator = AUC()
    evaluators = [auc_evaluator]

    # Set model parameters.
    model = PMF(l2_reg=l2_reg,
                batch_size=batch_size,
                total_users=train_dataset.total_users(),
                total_items=train_dataset.total_items(),
                dim_user_embed=dim_user_embed,
                dim_item_embed=dim_item_embed,
                save_model_dir=log_dir,
                train=True,
                serve=True)

    # Set model trainer and run.
    model_trainer = ModelTrainer(model=model)
    model_trainer.train(total_iter=total_iter,
                        eval_iter=eval_iter,
                        save_iter=save_iter,
                        train_sampler=train_sampler,
                        eval_samplers=eval_samplers,
                        evaluators=evaluators)