Exemplo n.º 1
0
from openrec.utils import Dataset  # BUG FIX: Dataset was used below but never imported
from BPR import BPR
from openrec.utils.evaluators import AUC
from openrec.utils.samplers import RandomPairwiseSampler
from openrec.utils.samplers import EvaluationSampler
import dataloader

#raw_data = dataloader.load_citeulike()
raw_data = dataloader.load_dataset()
# BUG FIX: "CHANGE_DIM_HERE" was a template placeholder that raises
# NameError at runtime; use a concrete embedding dimension (tune as needed).
dim_embed = 50
total_iter = raw_data["max_iteration"]
batch_size = 1000
eval_iter = total_iter
save_iter = eval_iter

# Train/validation/test splits; the eval splits rank each held-out item
# against 500 sampled negatives.
train_dataset = Dataset(raw_data['train_data'],
                        raw_data['total_users'],
                        raw_data['total_items'],
                        name='Train')
val_dataset = Dataset(raw_data['val_data'],
                      raw_data['total_users'],
                      raw_data['total_items'],
                      name='Val',
                      num_negatives=500)
test_dataset = Dataset(raw_data['test_data'],
                       raw_data['total_users'],
                       raw_data['total_items'],
                       name='Test',
                       num_negatives=500)

# Pairwise (positive, negative) sampling for BPR training.
train_sampler = RandomPairwiseSampler(batch_size=batch_size,
                                      dataset=train_dataset,
                                      num_process=5)
Exemplo n.º 2
0
from openrec import ModelTrainer
from openrec.utils import Dataset
from BPR import BPR
from openrec.utils.evaluators import AUC
from openrec.utils.samplers import RandomPairwiseSampler
from openrec.utils.samplers import EvaluationSampler
import dataloader

# BPR training on the citeulike dataset.
raw_data = dataloader.load_citeulike()
dim_embed = 100
total_iter = 10000
batch_size = 1000
eval_iter = 10000
save_iter = eval_iter

# Train/validation/test splits; eval splits rank against 500 sampled negatives.
train_dataset = Dataset(raw_data['train_data'], raw_data['total_users'], raw_data['total_items'], name='Train')
val_dataset = Dataset(raw_data['val_data'], raw_data['total_users'], raw_data['total_items'], name='Val', num_negatives=500)
test_dataset = Dataset(raw_data['test_data'], raw_data['total_users'], raw_data['total_items'], name='Test', num_negatives=500)

train_sampler = RandomPairwiseSampler(batch_size=batch_size, dataset=train_dataset, num_process=5)
val_sampler = EvaluationSampler(batch_size=batch_size, dataset=val_dataset)
test_sampler = EvaluationSampler(batch_size=batch_size, dataset=test_dataset)

bpr_model = BPR(batch_size=batch_size, total_users=train_dataset.total_users(), total_items=train_dataset.total_items(), 
                l2_reg=0.01,
                dim_user_embed=dim_embed, dim_item_embed=dim_embed, save_model_dir='bpr_recommender/', train=True, serve=True)

model_trainer = ModelTrainer(model=bpr_model)

auc_evaluator = AUC()
# BUG FIX: the train() call was truncated mid-argument-list (SyntaxError);
# complete it with the evaluation samplers and evaluators so the script runs
# end to end (same call shape as the other BPR examples in this file).
model_trainer.train(total_iter=total_iter, eval_iter=eval_iter, save_iter=save_iter, train_sampler=train_sampler, 
                    eval_samplers=[val_sampler, test_sampler], evaluators=[auc_evaluator])
Exemplo n.º 3
0
import numpy as np  # BUG FIX: np.load is used below but numpy was never imported here

from openrec.utils.evaluators import MSE
from openrec.utils.samplers import ExplicitSampler

# NOTE(review): Dataset, ItrMLP and ItrMLPModelTrainer are referenced below
# but not imported in this snippet — presumably imported in the surrounding
# file; verify before running standalone.

batch_size = 32
test_batch_size = 32
display_itr = 4096
update_itr = 4096

# Netflix Prize dimensions.
max_user = 480189
max_item = 17770

pretrained_user_embeddings = np.load('dataset/netflix/pretrained_user_embeddings.npy')
pretrained_item_embeddings = np.load('dataset/netflix/pretrained_item_embeddings.npy')
netflix_ratings = np.load('dataset/netflix/netflix_ratings_formatted.npy')

# Chronological split: last 10M ratings are held out (5M val, 5M test).
train_dataset = Dataset(netflix_ratings[:-int(1e7)], max_user=max_user, max_item=max_item, name='Train')
val_dataset = Dataset(netflix_ratings[-int(1e7):-int(5e6)], max_user=max_user, max_item=max_item, name='Val')
test_dataset = Dataset(netflix_ratings[-int(5e6):], max_user=max_user, max_item=max_item, name='Test')

model = ItrMLP(batch_size=batch_size, max_user=max_user, max_item=max_item, dim_embed=20, opt='SGD',
              pretrained_user_embeddings=pretrained_user_embeddings, pretrained_item_embeddings=pretrained_item_embeddings,
              user_dims=[30, 30, 20], item_dims=[30, 30, 20], test_batch_size=test_batch_size)

sampler = ExplicitSampler(batch_size=batch_size, dataset=train_dataset, chronological=True)
model_trainer = ItrMLPModelTrainer(batch_size=batch_size, test_batch_size=test_batch_size, 
    train_dataset=train_dataset, model=model, sampler=sampler)

mse_evaluator = MSE()

# BUG FIX: the train() call was truncated mid-argument-list (SyntaxError);
# complete it with the MSE evaluator constructed above.
model_trainer.train(num_itr=int(1e5), display_itr=display_itr, update_itr=update_itr,
                    eval_datasets=[val_dataset, test_dataset],
                    evaluators=[mse_evaluator])
Exemplo n.º 4
0
    def sample_data_and_train(self):
        """Parse the interaction file, build train/val/test splits, and train
        a BPR recommender with OpenRec.

        The dataset file (located at ``self.path_to_dataset``, relative to this
        module's directory) is read twice: a first pass sizes the output arrays,
        a second pass fills them.  For each user, two randomly chosen items are
        held out — one for validation, one for testing — and the remainder go
        to training.  Blocks until training completes.

        NOTE(review): assumes each line is "<count> <item> <item> ..." for one
        user, and that every user has at least 2 items — confirm with the data
        producer.  Heavy openrec imports are done lazily inside the method.
        """
        self.logger.warning(
            'sample_data_and_train called, pid = %d Please kill process on unsuccessful training',
            os.getpid())
        self.logger.info('-------- sample_data_and_train starts --------')

        # First pass: count users (one per line) and total interactions so the
        # structured arrays below can be pre-allocated exactly.
        total_users = 0
        interactions_count = 0
        with open(
                os.path.dirname(os.path.abspath(__file__)) +
                self.path_to_dataset, 'r') as fin:
            for line in fin:
                # First token of each line is that user's interaction count.
                interactions_count += int(line.split()[0])
                total_users += 1
        self.logger.info('############ collecting data.. ############')

        # Randomly hold out an item per user for validation and testing respectively.
        val_structured_arr = np.zeros(total_users,
                                      dtype=[('user_id', np.int32),
                                             ('item_id', np.int32)])
        test_structured_arr = np.zeros(total_users,
                                       dtype=[('user_id', np.int32),
                                              ('item_id', np.int32)])
        # Two items per user are held out, hence "- total_users * 2".
        train_structured_arr = np.zeros(interactions_count - total_users * 2,
                                        dtype=[('user_id', np.int32),
                                               ('item_id', np.int32)])

        interaction_ind = 0  # next free slot in train_structured_arr
        next_user_id = 0  # dense user ids assigned in file order
        next_item_id = 0  # next dense item id to hand out
        map_to_item_id = dict()  # Map item id from 0 to len(items)-1

        # Second pass: shuffle each user's items, send the first to the val
        # split, the second to the test split, and the rest to training, while
        # remapping raw item tokens to dense ids.
        with open(
                os.path.dirname(os.path.abspath(__file__)) +
                self.path_to_dataset, 'r') as fin:
            for line in fin:
                item_list = line.split()[1:]
                random.shuffle(item_list)
                for ind, item in enumerate(item_list):
                    if item not in map_to_item_id:
                        map_to_item_id[item] = next_item_id
                        next_item_id += 1
                    if ind == 0:
                        val_structured_arr[next_user_id] = (
                            next_user_id, map_to_item_id[item])
                    elif ind == 1:
                        test_structured_arr[next_user_id] = (
                            next_user_id, map_to_item_id[item])
                    else:
                        train_structured_arr[interaction_ind] = (
                            next_user_id, map_to_item_id[item])
                        interaction_ind += 1
                next_user_id += 1

        self.logger.info('############ instantiating dataset.. ############')

        from openrec.utils import Dataset

        # Eval splits rank the held-out item against 500 sampled negatives.
        train_dataset = Dataset(raw_data=train_structured_arr,
                                total_users=total_users,
                                total_items=len(map_to_item_id),
                                name='Train')
        val_dataset = Dataset(raw_data=val_structured_arr,
                              total_users=total_users,
                              total_items=len(map_to_item_id),
                              num_negatives=500,
                              name='Val')
        test_dataset = Dataset(raw_data=test_structured_arr,
                               total_users=total_users,
                               total_items=len(map_to_item_id),
                               num_negatives=500,
                               name='Test')

        self.logger.info("############ instantiating Samplers.. ############")

        from openrec.utils.samplers import RandomPairwiseSampler
        from openrec.utils.samplers import EvaluationSampler

        # Pairwise (positive, negative) sampling for BPR training.
        train_sampler = RandomPairwiseSampler(batch_size=1000,
                                              dataset=train_dataset,
                                              num_process=5)
        val_sampler = EvaluationSampler(batch_size=1000, dataset=val_dataset)
        test_sampler = EvaluationSampler(batch_size=1000, dataset=test_dataset)

        self.logger.info(
            "############ instantiating Recommender.. ############")

        from openrec.recommenders import BPR

        bpr_model = BPR(batch_size=1000,
                        total_users=train_dataset.total_users(),
                        total_items=train_dataset.total_items(),
                        dim_user_embed=50,
                        dim_item_embed=50,
                        save_model_dir='bpr_recommender/',
                        train=True,
                        serve=True)

        self.logger.info("############ instantiating Evaluator.. ############")

        from openrec.utils.evaluators import AUC

        auc_evaluator = AUC()

        self.logger.info(
            "############ instantiating Model trainer.. ############")

        from openrec import ModelTrainer

        model_trainer = ModelTrainer(model=bpr_model)

        print("############ starting training.. ############")

        model_trainer.train(
            total_iter=10000,  # Total number of training iterations
            eval_iter=1000,  # Evaluate the model every "eval_iter" iterations
            save_iter=10000,  # Save the model every "save_iter" iterations
            train_sampler=train_sampler,
            eval_samplers=[val_sampler, test_sampler],
            evaluators=[auc_evaluator])
        # self.logger.info("THIS IS WHEN MODEL WILL START TRAINING... returning")
        self.logger.info("-------- sample_data_and_train ends --------")
Exemplo n.º 5
0
def exp(dataset, l2_reg, pos_ratio, neg_ratio, eval_explicit, save_log, eval_rank):
    """Run one PMF training/evaluation experiment.

    Args:
        dataset: 'spotify' or 'bytedance'; anything else aborts with a message.
        l2_reg: L2 regularization weight for PMF.
        pos_ratio: fraction of positive samples per training batch.
        neg_ratio: fraction of re-weighted negative samples; None selects the
            stratified pointwise sampler instead of the negative sampler.
        eval_explicit: if True, evaluate against explicit feedback (no sampled
            negatives); otherwise 500 negatives are sampled per evaluation.
        save_log: if True, redirect stdout to a log file under the run dir.
        eval_rank: if True, evaluate click-complete vs click-skip test splits
            separately; otherwise use the standard val/test splits.

    NOTE(review): relies on module-level globals (batch_size, dim_user_embed,
    dim_item_embed, total_iter, eval_iter, save_iter, loaders, samplers, PMF,
    ModelTrainer) defined elsewhere in the file.
    """
    # Validate the sampling-ratio combination before doing any work.
    if neg_ratio is not None:
        if pos_ratio + neg_ratio > 1.0 or pos_ratio + neg_ratio <= 0.0:
            print ("Invalid sampling ratios...")
            return
    
    if dataset == 'spotify':
        data = loadSpotify()
        
    elif dataset == 'bytedance':
        data = loadByteDance()
        
    else:
        print ("Unsupported dataset...")
        return 
    
    # save logging and model
    log_dir = "validation_logs/{}_{}_{}_{}_{}_{}/".format(dataset, l2_reg, pos_ratio, neg_ratio, eval_explicit, eval_rank)
    # FIX: create the directory with os.makedirs instead of shelling out via
    # os.popen("mkdir -p ..."), which was non-portable and unsafe if the path
    # ever contains shell metacharacters.
    os.makedirs(log_dir, exist_ok=True)
    if save_log:
        log = open(log_dir + "validation.log", "w")
        sys.stdout = log
    
    
    # prepare train, val, test sets and samplers
    train_dataset = Dataset(data['train'], data['total_users'], data['total_items'], name='Train')    
    if neg_ratio is None:
        train_sampler = StratifiedPointwiseSampler(batch_size=batch_size, 
                                                   dataset=train_dataset, 
                                                   pos_ratio=pos_ratio, 
                                                   num_process=5)
    else:
        train_sampler = NegativePointwiseSampler(batch_size=batch_size, 
                                                 dataset=train_dataset, 
                                                 pos_ratio=pos_ratio, 
                                                 neg_ratio=neg_ratio, 
                                                 num_process=5)
        if neg_ratio > 0.0:
            print ("Re-weighting implicit negative feedback")
        else:
            print ("Corrected negative feedback labels but not re-weighting")
    
    
    eval_num_neg = None if eval_explicit else 500 # num of negative samples for evaluation
    if eval_rank:
        # show evaluation metrics for click-complete and click-skip items separately
        pos_dataset = Dataset(data['pos_test'],  data['total_users'], data['total_items'], 
                              implicit_negative=not eval_explicit, name='Pos_Test', num_negatives=eval_num_neg)
        neg_dataset = Dataset(data['neg_test'],  data['total_users'], data['total_items'], 
                              implicit_negative=not eval_explicit, name='Neg_Test', num_negatives=eval_num_neg)
        pos_sampler = EvaluationSampler(batch_size=batch_size, dataset=pos_dataset)
        neg_sampler = EvaluationSampler(batch_size=batch_size, dataset=neg_dataset)
        eval_samplers = [pos_sampler, neg_sampler]
    else:
        val_dataset = Dataset(data['val'],  data['total_users'], data['total_items'], 
                              implicit_negative=not eval_explicit, name='Val', num_negatives=eval_num_neg)
        test_dataset = Dataset(data['test'],  data['total_users'], data['total_items'], 
                               implicit_negative=not eval_explicit, name='Test', num_negatives=eval_num_neg)
        val_sampler = EvaluationSampler(batch_size=batch_size, dataset=val_dataset)
        test_sampler = EvaluationSampler(batch_size=batch_size, dataset=test_dataset)
        eval_samplers = [val_sampler, test_sampler]
    
    # set evaluators
    auc_evaluator = AUC()
    evaluators = [auc_evaluator]
  
    
    # set model parameters
    model = PMF(l2_reg=l2_reg, 
                batch_size=batch_size, 
                total_users=train_dataset.total_users(), 
                total_items=train_dataset.total_items(), 
                dim_user_embed=dim_user_embed, 
                dim_item_embed=dim_item_embed, 
                save_model_dir=log_dir, 
                train=True, 
                serve=True)
    
    
    # set model trainer
    model_trainer = ModelTrainer(model=model)  
    model_trainer.train(total_iter=total_iter, 
                        eval_iter=eval_iter, 
                        save_iter=save_iter, 
                        train_sampler=train_sampler, 
                        eval_samplers=eval_samplers, 
                        evaluators=evaluators)
Exemplo n.º 6
0
# Sequential (temporal) recommendation setup on the Last.fm dataset.
total_users = 992
total_items = 14598
train_data = np.load('dataset/lastfm/lastfm_train.npy')
test_data = np.load('dataset/lastfm/lastfm_test.npy')

# Model / training hyperparameters.
dim_item_embed = 50
max_seq_len = 20
total_iter = int(1e5)
batch_size = 100
eval_iter = 100
save_iter = eval_iter

# sortby='ts' orders each user's interactions by the 'ts' field so the
# temporal samplers can slice fixed-length sequences.
train_dataset = Dataset(train_data, total_users, total_items,
                        sortby='ts', name='Train')
test_dataset = Dataset(test_data, total_users, total_items,
                       sortby='ts', name='Test')

train_sampler = TemporalSampler(batch_size=batch_size, max_seq_len=max_seq_len,
                                dataset=train_dataset, num_process=1)
test_sampler = TemporalEvaluationSampler(dataset=test_dataset,
                                         max_seq_len=max_seq_len)
Exemplo n.º 7
0
# Sequential recommendation setup on the Last.fm dataset (RNN variant).
lastfm_train = np.load('dataset/lastfm/lastfm_train.npy')
lastfm_test = np.load('dataset/lastfm/lastfm_test.npy')
total_users = 992
total_items = 14598

# Model / training hyperparameters.
dim_item_embed = 50
max_seq_len = 100
num_units = 32
batch_size = 256
total_iter = int(1e5)
eval_iter = 100
save_iter = eval_iter

# sortby='ts' orders each user's interactions by the 'ts' field so the
# temporal samplers can slice fixed-length sequences.
train_dataset = Dataset(raw_data=lastfm_train, total_users=total_users,
                        total_items=total_items, sortby='ts', name='Train')
test_dataset = Dataset(raw_data=lastfm_test, total_users=total_users,
                       total_items=total_items, sortby='ts', name='Test')

train_sampler = TemporalSampler(batch_size=batch_size, max_seq_len=max_seq_len,
                                dataset=train_dataset, num_process=1)
test_sampler = TemporalEvaluationSampler(dataset=test_dataset,
                                         max_seq_len=max_seq_len)