def readFile(self, filename):
        """
        Read an aggregated log file and collect the feedback of ALL users.

        The returned FeedbackData is built from
        1. duid mapping (from duid to an integer, the row number in the
           sparse matrix);
        2. pid mapping  (from pid to an integer, the column number in the
           sparse matrix);
        3. the (row, col, data) triples of the core sparse matrix, where
           data holds the integer watch time of each record.

        @param filename: path of the aggregated log file.
        @return: FeedbackData(row, col, data, number of rows, number of
                 columns, duid mapping, pid mapping, []).
        """
        mapping_duid = {}  # duid -> row number
        mapping_pid = {}   # pid  -> column number

        row = []
        col = []
        data = []

        lineNum = 0
        with open(filename, 'rb') as csvfile:
            # NOTE(review): delimiter/quotechar follow the other reader in
            # this project; confirm self.fieldDelimiter exists on this class.
            logreader = csv.reader(csvfile, delimiter=self.fieldDelimiter,
                                   quotechar='|')
            for logrow in logreader:
                log_duid = logrow[self.fieldMapping['duid']]
                log_pid = logrow[self.fieldMapping['pid']]
                log_watchtime = int(logrow[self.fieldMapping['watchtime']])

                # ids are numbered in order of first appearance.
                if log_duid not in mapping_duid:
                    mapping_duid[log_duid] = len(mapping_duid)
                if log_pid not in mapping_pid:
                    mapping_pid[log_pid] = len(mapping_pid)

                # NOTE(review): the triples were never stored although they
                # are returned and counted below; one entry per record is
                # assumed here -- confirm against the intended behaviour.
                row.append(mapping_duid[log_duid])
                col.append(mapping_pid[log_pid])
                data.append(log_watchtime)

                lineNum += 1
                if self.verbose and (lineNum % self.display == 0):
                    Logger.Log(str(lineNum) + ' lines read.')

        if self.verbose:
            Logger.Log('Done reading agg log file. '+str(len(data)) + ' elements read'+ \
                ' ( '+str(len(mapping_duid))+' row/user, '+str(len(mapping_pid))+' col/program).')

        return FeedbackData(row, col, data, len(mapping_duid), len(mapping_pid),\
                    mapping_duid, mapping_pid, [])
    def read_file_info(self, filename):
        """
        Read aggregated file(s) and summarise the occurrences of every
        program and device. This information can be used to filter out
        programs/devices later.

        @param filename: a single file path or a list of file paths.
        @return: [occur_duid, occur_pid, cnt_duid, cnt_pid] where
                 occur_duid[duid] / occur_pid[pid] is the number of records
                 of that device / program, and cnt_duid[k] / cnt_pid[k] is
                 the number of devices / programs occurring exactly k times.
        """
        # turn a single file into a file list.
        if isinstance(filename, list):
            filename_arr = filename
        else:
            filename_arr = [filename]

        occur_duid = {}
        occur_pid = {}

        lineNum = 0

        for path in filename_arr:
            with open(path, 'rb') as csvfile:
                logreader = csv.reader(csvfile, delimiter=self.fieldDelimiter,
                                       quotechar='|')
                for logrow in logreader:
                    log_duid = logrow[self.fieldMapping['duid']]
                    log_pid = logrow[self.fieldMapping['pid']]

                    # first sighting counts as 1, every later one adds 1.
                    occur_duid[log_duid] = occur_duid.get(log_duid, 0) + 1
                    occur_pid[log_pid] = occur_pid.get(log_pid, 0) + 1

                    lineNum += 1
                    if self.verbose and (lineNum % self.display == 0):
                        Logger.Log(str(lineNum) + ' lines read.')

        # count occurrence into bins.
        # cnt_duid[number of occurrence] = number of duid with specific occurrence.
        cnt_duid = {}
        for val in occur_duid.values():
            cnt_duid[val] = cnt_duid.get(val, 0) + 1

        # cnt_pid[number of occurrence] = number of pid with specific occurrence.
        cnt_pid = {}
        for val in occur_pid.values():
            cnt_pid[val] = cnt_pid.get(val, 0) + 1

        return [occur_duid, occur_pid, cnt_duid, cnt_pid]
    def read_file_with_id_list(self, filename, duidlist, pidlist):
        """
        Read aggregated file(s), keeping only the records whose duid is in
        duidlist AND whose pid is in pidlist.

        @param filename: a single file path or a list of file paths.
        @param duidlist: container of accepted device ids (only membership
                         tests are performed on it).
        @param pidlist:  container of accepted program ids.
        @return: [mapping_duid, mapping_pid, row, col, data]; the mappings
                 number the ids in order of first appearance.
        """
        mapping_duid = {}  # duid -> row number
        mapping_pid = {}   # pid  -> column number

        row = []
        col = []
        data = []

        lineNum = 0

        # turn a single file into a file list.
        if isinstance(filename, list):
            filename_arr = filename
        else:
            filename_arr = [filename]

        for path in filename_arr:
            with open(path, 'rb') as csvfile:
                logreader = csv.reader(csvfile, delimiter=self.fieldDelimiter,
                                       quotechar='|')
                for logrow in logreader:
                    log_duid = logrow[self.fieldMapping['duid']]
                    log_pid = logrow[self.fieldMapping['pid']]

                    # we need both duid and pid to be in the lists.
                    if (log_duid in duidlist) and (log_pid in pidlist):

                        log_watchtime = int(logrow[self.fieldMapping['watchtime']])

                        if log_duid not in mapping_duid:
                            mapping_duid[log_duid] = len(mapping_duid)

                        if log_pid not in mapping_pid:
                            mapping_pid[log_pid] = len(mapping_pid)

                        # NOTE(review): the triples were never stored although
                        # they are returned and counted below; one entry per
                        # accepted record is assumed -- confirm.
                        row.append(mapping_duid[log_duid])
                        col.append(mapping_pid[log_pid])
                        data.append(log_watchtime)

                    lineNum += 1

                    if self.verbose and (lineNum % self.display == 0):
                        # single-argument form prints the same text under
                        # Python 2 and also parses under Python 3.
                        print(' '.join([str(lineNum), ' lines read.']))

        if self.verbose:
            Logger.Log('Done reading agg log file. '+str(len(data)) + ' elements read'+ \
                ' ( '+str(len(mapping_duid))+' row/user, '+str(len(mapping_pid))+' col/program).')

        return [mapping_duid, mapping_pid, row, col, data]
def experiment_future_program(exp_name, previous_data_files, future_data_file, \
                              min_occ_user, min_occ_prog, method_list, top_k):
    """
    Experiment entrance for future programs (top-k precision).

    @param exp_name:            a human-readable experiment name.
    @param previous_data_files: data file(s) read as the training data.
    @param future_data_file:    data file(s) read as the test data.
    @param min_occ_user:        occurrence threshold passed to the reader for devices.
    @param min_occ_prog:        occurrence threshold passed to the reader for programs.
    @param method_list:         a list of recommendation models.
    @param top_k:               cut-off handed to experiment_unit_future_program.
    @return: dict mapping method.unique_str() to the result of
             experiment_unit_future_program for that method.
    """
    # define mcpl_log style.
    mcpl_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    exp_id = exp_name + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog)

    mcpl_log('Experimental ID: ' + exp_id)

    reader = DailyWatchTimeReader()
    tr_data = reader.read_file_with_minval(previous_data_files, min_occ_user,
                                           min_occ_prog)
    te_data = reader.read_file_with_minval(future_data_file, min_occ_user,
                                           min_occ_prog)

    mcpl_log('Normalization data ...')
    # there is no need to normalize train data because we evaluate the hits.
    # NOTE(review): no normalization call is visible after this log message;
    # confirm whether one is expected here.

    result = {}

    for method in method_list:
        # do for each method
        method_key = method.unique_str()

        mcpl_log('Method: ' + method_key)
        result[method_key] = experiment_unit_future_program(
            exp_id, method, tr_data, te_data, top_k)

    mcpl_log('Experiment Done [' + exp_id + ']')

    return result
    def __init__(self, cache_folder=None, use_cache=None):
        Create a cache manager with the specified cache location. 

        # if cache is system is used.
        # if turned off then save/load/check resource will not do anything.
        config_use_cache = ConfigManager.GetBoolean(CFG_SEC_UTILS,
        self.use_cache = config_use_cache if use_cache is None else use_cache
        if not self.use_cache:
            Logger.Log("URM is turned off. All cached resources are not available.", \

        # set up the directory for cache.
        config_cache_folder = ConfigManager.Get(CFG_SEC_UTILS,
        self.cache_folder = config_cache_folder if cache_folder is None else cache_folder

        # create directory if it does not exist
        self.cacheLocation = self.cache_folder
        if not os.path.exists(self.cacheLocation):
def experiment_unit_rand_split(exp_id, method, tr_data, te_data, iteration):
    """
    One iteration of training and testing.

    The experiment id, the method's unique string and the iteration number
    name the cached model resource, so a model stored by an earlier run is
    reused.

    @param exp_id:    experiment identifier used in resource names.
    @param method:    the model; must provide unique_str() and predict().
    @param tr_data:   training data of this iteration.
    @param te_data:   test data; data_val, data_row and data_col are read.
    @param iteration: iteration number (part of the resource name).
    @return: rmse between te_data.data_val and the model's predictions.
    """
    # define mcpl_log style.
    mcpl_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    result_resource_str = 'exp'      + exp_id + \
                          '_method'  + method.unique_str() + \
                          '_iter'    + str(iteration)
    # use a sub folder to store the experiment resource.
    sub_folder = exp_id + '/models/' + method.unique_str()

    # check resource for existing model.
    trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str,
                                     sub_folder)
    if not trained_model:

        # train model using the training data.
        # NOTE: this is the most time-consuming part.
        mcpl_log('training models...')
        # NOTE(review): no training call on tr_data is visible here, so the
        # model is cached as received -- confirm the missing training step.

        # save resource
        trained_model = [method]
        URM.SaveResource(URM.RTYPE_RESULT, result_resource_str, trained_model,
                         sub_folder)

    # compute performance on test data using the model.
    [method] = trained_model
    mcpl_log('computing evaluation metrics on the test data...')
    eval_result = rmse(te_data.data_val,
                       method.predict(te_data.data_row, te_data.data_col))

    return eval_result
def experiment_rand_split(exp_name, daily_data_file, min_occ_user, min_occ_prog, \
                          method_list,  training_prec, total_iteration):
    """
    Random-split experiment: for every method and iteration the rows are
    split into a training and a test part and one unit experiment is run.

    @param exp_name:        a human-readable experiment name.
    @param daily_data_file: data file (or list of data files) to read.
    @param min_occ_user:    occurrence threshold passed to the reader for devices.
    @param min_occ_prog:    occurrence threshold passed to the reader for programs.
    @param method_list:     a list of matrix completion models.
    @param training_prec:   value handed to ds.split together with the row count.
    @param total_iteration: number of iterations per method.
    @return: dict mapping method.unique_str() to the list of per-iteration
             results of experiment_unit_rand_split.
    """
    # define mcpl_log style.
    mcpl_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    # here we use a regular hash; a list of files is unhashable, so its
    # tuple is hashed instead.
    # NOTE(review): hash() of a str is only stable across processes on
    # interpreters without hash randomisation; resource names depend on it.
    if isinstance(daily_data_file, list):
        hash_file_str = str(hash(tuple(daily_data_file)))
    else:
        hash_file_str = str(hash(daily_data_file))

    exp_id = exp_name + '_data' + hash_file_str + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
                      + '_trprec' + str(training_prec) + '_toiter' + str(total_iteration)

    mcpl_log('Experiment ID: ' + exp_id)

    # save experiment splitting as resources.
    reader = UtilityDataReader()
    data = reader.read_file_with_minval(daily_data_file, min_occ_user,
                                        min_occ_prog)

    # we normalize here before splitting.
    mcpl_log('Normalizing data...')
    # NOTE(review): no normalization call is visible after this log message;
    # confirm whether one is expected here.

    result = {}

    for method in method_list:
        # do for each method

        perf_vect = []
        for iteration in range(total_iteration):
            # do for each iteration for each method;

            mcpl_log('Method: ' + method.unique_str() + ' Iteration: ' +
                     str(iteration))

            # data split of the current iteration (cached as a resource so
            # every method sees the same split).
            split_resource_str = 'exp' + exp_id + '_split_iter' + str(
                iteration)
            split_dir = exp_id + '/split'
            split = URM.LoadResource(URM.RTYPE_RESULT, split_resource_str,
                                     split_dir)
            if not split:
                split = ds.split(data.num_row, training_prec)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str, split,
                                 split_dir)

            [split_tr, split_te] = split
            data_tr = data.subdata_row(split_tr)
            data_te = data.subdata_row(split_te)

            iter_result = experiment_unit_rand_split(exp_id, method, data_tr,
                                                     data_te, iteration)

            # keep the result; otherwise perf_vect stays empty.
            perf_vect.append(iter_result)

        result[method.unique_str()] = perf_vect

    mcpl_log('Experiment Done [' + exp_id + ']')

    return result

import numpy as np
import scipy.sparse
import scipy.linalg
#import timeit;

from rs.utils.log import Logger
from rs.algorithms.recommendation.generic_recalg import CFAlg
from rs.algorithms.optimization.prox import projfun_probability_simplex,\
    proximal, proj_nonneg
from rs.algorithms.optimization.sparsa import Opt_SpaRSA

# an encapsulated logger.
def log(message):
    """Log an algorithm-category message prefixed with the HierLat name."""
    return Logger.Log(HierLat.ALG_NAME + ':' + message,
                      Logger.MSG_CATEGORY_ALGO)

class HierLat(CFAlg):
    A random guess recommender (demo).
    ALG_NAME = 'HierLat'

    CS_EQUAL_PROB = 'cs_equal_prob'
    # equal probability
    CS_ALL_AVG = 'cs_all_avg'
    # all average and then normalize.
    CS_OBS_AVG = 'cs_obs_avg'
    # average at observe and then normalize.
Created on Jan 31, 2014

@author: Shiyu C. ([email protected])


import numpy as np;
from rs.algorithms.recommendation.generic_recalg import CFAlg;
from rs.utils.log import Logger; 
import scipy.sparse;
import scipy.linalg

# an encapsulated logger.  
def log(message):
    """Log an algorithm-category message prefixed with the CF_ONMTF name."""
    return Logger.Log(CF_ONMTF.ALG_NAME + ':'+message, Logger.MSG_CATEGORY_ALGO)

class CF_ONMTF(CFAlg):
    A random guess recommender (demo).

    def __init__(self, latent_factor = 20, lamb = 1e-3, stop_delta = 1e-4, maxiter = 1e3, verbose = False):
@author: Shiyu C. ([email protected])

Modified on Feb 5, 2014
by Jiayu Zhou, added Rec_LMaFit.  

import numpy as np;
from rs.algorithms.recommendation.generic_recalg import CFAlg;
from rs.algorithms.recommendation.recommender_wrapper import Recommender;
from rs.utils.log import Logger; 
import scipy.sparse
import scipy.linalg

# encapsulated loggers.  
def mcpl_log(message):
    """Log an algorithm-category message prefixed with the LMaFit name."""
    return Logger.Log(LMaFit.ALG_NAME + ':'+message, Logger.MSG_CATEGORY_ALGO)


def rec_log(message):
    """Log an algorithm-category message prefixed with the Rec_LMaFit name."""
    return Logger.Log(Rec_LMaFit.ALG_NAME + ':'+message, Logger.MSG_CATEGORY_ALGO)

class Rec_LMaFit(Recommender):
    An implementation recommender wrapper using the LMaFit algorithm (with the capability of ensemble). 
    ALG_NAME = 'Rec_LMaFit';

    def __init__(self, latent_factors = [1, 2], lamb = [1e-3, 1e-1], stop_delta = 1e-4, maxiter = 1e3, verbose = False):
        self.models = [];
        for model_idx in range(len(latent_factors)):
Created on Jan 27, 2014

@author: jiayu.zhou
from rs.utils.log import Logger

if __name__ == '__main__':
    # smoke test: write three TEST messages through a Logger on ./logs.
    logger = Logger('./logs')
    for text in ('Hello world', 'The second line', 'Something else.'):
        logger._log(text, 'TEST', 0)
Created on Jan 27, 2014

@author: jiayu.zhou
from rs.utils.log import Logger

if __name__ == '__main__':
    # smoke test: write three TEST messages through a Logger on ./logs.
    logger = Logger('./logs')
    for text in ('Hello world', 'The second line', 'Something else.'):
        logger._log(text, 'TEST', 0)
def experiment_leave_k_out(exp_name, daily_data_file, min_occ_user, min_occ_prog, \
                           method_list, leave_k_out, total_iteration, top_n, binary = False):
    """
    Leave-k-out experiment: for every method and iteration, k items of each
    row/user are held out and one unit experiment is run.

    @param exp_name: the experiment name (prefix)
    @param daily_data_file: data file (or list of data files) to read.
    @param min_occ_user: occurrence threshold passed to the reader for devices.
    @param min_occ_prog: occurrence threshold passed to the reader for programs.
    @param method_list: list of models to evaluate.
    @param leave_k_out: leave k out for each user. The k must be strictly
                        less than min_occ_user.
    @param total_iteration: number of iterations per method.
    @param top_n: value handed to experiment_unit_leave_k_out.
    @param binary: if this is set to true then the binary data is used (non-zero set to 1).
    @return: dict mapping method.unique_str() to the list of per-iteration
             results of experiment_unit_leave_k_out.
    @raise ValueError: if leave_k_out >= min_occ_user.
    """

    if leave_k_out >= min_occ_user:
        raise ValueError(
            'The k in the leave k out should be strictly less than min_occ_user.'
        )

    # define lko_log style.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    # a list of files is unhashable, so its tuple is hashed instead.
    if isinstance(daily_data_file, list):
        hash_file_str = str(hash(tuple(daily_data_file)))
    else:
        hash_file_str = str(hash(daily_data_file))

    # construct exp_id; binary experiments get their own prefix so their
    # cached resources never mix with the non-binary ones.
    if binary:
        exp_prefix = 'lko_bi_'
    else:
        exp_prefix = 'lko_'
    exp_id = exp_prefix + exp_name + '_data' + hash_file_str + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
                      + '_k' + str(leave_k_out) + '_toiter' + str(total_iteration)
    lko_log('Experiment ID: ' + exp_id)

    # load data.
    lko_log('Read data...')
    reader = DailyWatchTimeReader()
    data = reader.read_file_with_minval(daily_data_file, min_occ_user,
                                        min_occ_prog)
    lko_log('Data loaded: ' + str(data))

    # NOTE(review): only the log messages are visible here; the actual
    # binarize / normalize calls on `data` need to be confirmed.
    if binary:
        lko_log('Binarizing data...')
    else:
        # normalize
        lko_log('Normalizing data...')

    result = {}

    for method in method_list:
        # do for each method

        perf_vect = []
        for iteration in range(total_iteration):
            # do for each iteration for each method.

            lko_log('Method: ' + method.unique_str() + ' Iteration: ' +
                    str(iteration))

            # data split of the current iteration.
            split_resource_str = 'exp' + exp_id + '_lvidx_iter' + str(
                iteration)
            split_dir = exp_id + '/lv_idx'
            leave_k_out_idx = URM.LoadResource(URM.RTYPE_RESULT,
                                               split_resource_str, split_dir)
            if not leave_k_out_idx:
                # randomly generate k items from each row/user.
                leave_k_out_idx = ds.leave_k_out(data, leave_k_out)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str,
                                 leave_k_out_idx, split_dir)

            # split the k items as a separate.
            [data_left, data_tr] = data.leave_k_out(leave_k_out_idx)

            iter_result = experiment_unit_leave_k_out(exp_id, method, \
                                    data_tr, data_left, iteration, top_n)

            # keep the result; otherwise perf_vect stays empty.
            perf_vect.append(iter_result)

        result[method.unique_str()] = perf_vect

    return result
 def read_file_with_id_list(self, filename, duidlist, pidlist, ignore_prog_without_genre = True):
     # NOTE(review): the two lines below read like a docstring whose quotes
     # are missing; as written they are not valid statements.
     This file reads an aggregated file. 
     The file only include the specified duid and pid.  
     # Returns [mapping_duid, mapping_pid, row, col, data, pggr_pg, pggr_gr].
     mapping_duid = {}; # store duid->row# mapping 
     mapping_pid  = {}; # store pid->col# mapping
     row  = [];
     col  = [];
     data = [];
     # program/genre pairs; presumably parallel lists -- TODO confirm.
     pggr_pg = [];
     pggr_gr = [];
     # column indices of programs whose genres were already examined.
     visited_program_list = set([]);
     lineNum = 0;
     # turn a single file into a file list. 
     # NOTE(review): without an `else:` the second assignment overwrites the
     # first, and a list argument leaves filename_arr undefined.
     if not isinstance(filename, list):
         filename_arr = [filename];
         filename_arr = filename;  
     for filename in filename_arr:
         with open(filename, 'rb') as csvfile:
             logreader = csv.reader(csvfile, delimiter = self.fieldDelimiter, quotechar = '|');
             for logrow in logreader:
                 log_duid      = logrow[self.fieldMapping['duid']];
                 log_pid       = logrow[self.fieldMapping['pid']];
                 # genre field with surrounding whitespace removed; it is
                 # split on ',' further down.
                 log_pg_gr     = logrow[self.fieldMapping['genre']].strip();
                 if not log_pg_gr:
                     Logger.Log('Empty genre information for program '+log_pid, Logger.MSG_CATEGORY_DATA);
                     if ignore_prog_without_genre:
                         # ignore records whose program has no genre information. 
                         # NOTE(review): no statement follows this comment; a
                         # `continue` appears to be missing.
                 ## we need both duid and pid are in the list. 
                 if (log_duid in duidlist) and (log_pid in pidlist):
                     log_watchtime = int(logrow[self.fieldMapping['watchtime']]);
                     # ids are numbered in order of first appearance.
                     if not (log_duid in mapping_duid):
                         mapping_duid[log_duid] = len(mapping_duid);
                     if not (log_pid in mapping_pid):
                         mapping_pid[log_pid]   = len(mapping_pid);
                     # NOTE(review): nothing visible stores log_watchtime or
                     # the indices into row/col/data.
                     # store program - genre mappings, for programs that were not visited before.
                     if not mapping_pid[log_pid] in visited_program_list: 
                         for pg_gr in log_pg_gr.split(','):
                             if not pg_gr:
                             # NOTE(review): the body of this test and the
                             # code filling pggr_pg / pggr_gr /
                             # visited_program_list are not visible.
                 # NOTE(review): lineNum is never incremented in the visible
                 # code, so this condition sees 0 on every record.
                 if self.verbose and (lineNum%self.display == 0):
                     print str(lineNum), ' lines read.';
     if (self.verbose):
         Logger.Log('Done reading agg log file. '+str(len(data)) + ' elements read'+ \
             ' ( '+str(len(mapping_duid))+' row/user, '+str(len(mapping_pid))+' col/program).');
     return [mapping_duid, mapping_pid, row, col, data, pggr_pg, pggr_gr];
def experiment_unit_future_program(exp_id, method, tr_data, te_data, top_k):
    """
    Evaluate one method on the future-program test data.

    A cached model is loaded through URM (or the given method is cached),
    then every device in te_data.row_mapping is scored against all programs
    in te_data.col_mapping and the top_k highest-scoring programs are
    compared with the test matrix.

    NOTE(review): several statements in this function are cut off (see the
    notes below); in the visible code nothing is appended to the returned
    list.
    """

    # define mcpl_log style.
    mcpl_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    result_resource_str = 'model_exp'      + exp_id + \
                          '_method'  + method.unique_str()
    sub_folder = exp_id + '/models/' + method.unique_str()
    # use a sub folder to store the experiment resource.

    # check resource for existing model.
    # NOTE(review): this call is cut off; sub_folder is presumably its last
    # argument -- confirm.
    trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str,
    if not trained_model:

        # train model using the training data.
        # NOTE: this is the most time-consuming part.
        mcpl_log('training models...')
        # NOTE(review): no training call on tr_data is visible here.

        # save resource
        trained_model = [method]
        # NOTE(review): this call is cut off as well.
        URM.SaveResource(URM.RTYPE_RESULT, result_resource_str, trained_model,

    # compute performance on test data using the model.
    [method] = trained_model
    mcpl_log('computing evaluation metrics on the test data...')

    # compute the score of the programs in the prediction.
    prog_list = te_data.col_mapping.keys()
    # program list
    # (indexed by position below, which requires keys() to return a list)

    te_datamat = te_data.get_sparse_matrix().tolil()

    eval_result = []

    # TODO: on a subset of DUID?
    for duid in te_data.row_mapping.keys():  # iterate every element.
        prog_score = method.get_score(duid, prog_list, te_data.meta)
        # get scores of the programs in the list.

        # sort the score (first dimension is the index and the second is the actual prediction value).
        #    NOTE: the first dimension is the order with respect to prog_list
        srt_list = [(k[0], k[1]) for k in sorted(
            enumerate(prog_score), key=lambda x: x[1], reverse=True)]

        srt_list = srt_list[:top_k]
        # truncate to top k.

        # separate the positions from the scores; this unpacking fails when
        # srt_list is empty.
        [srt_idx, _] = zip(*srt_list)

        # map from prog_list to actual index.
        # NOTE(review): the closing bracket of this list is missing.
        mapped_srt_idx = [
            te_data.col_mapping[prog_list[idx]] for idx in srt_idx

        #print te_datamat[te_data.row_mapping[duid], mapped_srt_idx].todense();

        # get the ground truth hit.
        # NOTE(review): this expression is cut off.
        prog_hit = (te_datamat[te_data.row_mapping[duid],

        # compute hit precision (now we consider only binary hit).
        # NOTE(review): the precision computation and the update of
        # eval_result are not visible.

    return eval_result
 def log(msg):
     """Logging style: send msg to the Logger under the cache category."""
     Logger.Log(msg, Logger.MSG_CATEGORY_CACHE)
Created on Feb 17, 2014

@author: Shiyu C. ([email protected])


import numpy as np
from rs.algorithms.recommendation.generic_recalg import CFAlg
from rs.utils.log import Logger
import scipy.sparse
#import scipy.linalg

# an encapsulated logger.
def log(message):
    """Log an algorithm-category message prefixed with the item_item_sim name."""
    return Logger.Log(item_item_sim.ALG_NAME + ':'+message,
                      Logger.MSG_CATEGORY_ALGO)

class item_item_sim(CFAlg):

    ALG_NAME = 'item_item_sim'

    def __init__(self, N=3):
        # initialize parameters.
        self.N = N
        log('Item-based similarity algorithm created: Neighborhood size' +
Created on Feb 12, 2014

@author: jiayu.zhou

from rs.algorithms.recommendation.generic_recalg import CFAlg
from rs.utils.log import Logger;
import numpy as np;

def mcpl_log(message):
    """Log an algorithm-category message prefixed with the SVDPlusPlus name."""
    return Logger.Log(SVDPlusPlus.ALG_NAME + ':'+message, Logger.MSG_CATEGORY_ALGO)

class SVDPlusPlus(CFAlg):
    SVD Plus Plus
    ALG_NAME = 'SVD++';

    def __init__(self, params):
        self.regularization  = 0.015;
        self.learn_rate      = 0.001;
        self.bais_learn_rate = 0.7;
        self.bais_reg        = 0.33;
def experiment_coldstart_map(exp_name,     daily_data_file,\
                    min_occ_user, min_occ_prog, num_user, num_prog,\
                    method_list, blind_k_out, total_iteration, max_rank, binary = False):
    """
    Cold-start experiment: for every method and iteration, blind_k_out
    columns are blinded out and one unit experiment is run on the split.

    @param exp_name:       the experiment name (prefix)
    @param daily_data_file: a file or a list of files.
    @param min_occ_user:   cold start user criteria
    @param min_occ_prog:   cold start program criteria
    @param num_user:       the number of users selected in the experiment.
    @param num_prog:       the number of programs selected in the experiment.
    @param method_list:    list of models to evaluate.
    @param blind_k_out:    number of columns blinded out. The k must be
                           strictly less than min_occ_user.
    @param total_iteration: number of iterations per method.
    @param max_rank:       value handed to experiment_unit_leave_k_out_map.
    @param binary: if this is set to true then the binary data is used (non-zero set to 1).
    @return: dict mapping method.unique_str() to the list of per-iteration
             results of experiment_unit_leave_k_out_map.
    @raise ValueError: if blind_k_out >= min_occ_user.
    """
    # single-argument print keeps the Python 2 output and also parses under
    # Python 3.
    print(' '.join(['Blind k out: k = ', str(blind_k_out)]))
    # NOTE(review): the original line was garbled; printing both thresholds
    # is assumed.
    print(' '.join(['Min_occ_user: ', str(min_occ_user)]))
    print(' '.join(['Min_occ_prog: ', str(min_occ_prog)]))
    if blind_k_out >= min_occ_user:
        raise ValueError('The k in the leave k out [' + str(blind_k_out)
                         +'] should be strictly less than min_occ_user [' + str(min_occ_user) +'].')
    # define lko_log style.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)
    # a list of files is unhashable, so its tuple is hashed instead.
    if isinstance(daily_data_file, list):
        hash_file_str = str(hash(tuple(daily_data_file)))
    else:
        hash_file_str = str(hash(daily_data_file))
    # construct exp_id; binary experiments get their own prefix so their
    # cached resources never mix with the non-binary ones.
    if binary:
        exp_prefix = 'cst_bi_'
    else:
        exp_prefix = 'cst_'
    exp_id = exp_prefix + exp_name + '_data' + hash_file_str\
                  + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
                  + '_nu' + str(num_user) + '_np' + str(num_prog) \
                  + '_k' + str(blind_k_out) + '_toiter' + str(total_iteration)
    lko_log('Experiment ID: ' + exp_id)
    # load data.
    lko_log('Read data...')
    reader = DailyWatchTimeReader()
    data = reader.read_file_with_minval(daily_data_file, min_occ_user, min_occ_prog, num_user, num_prog)
    lko_log('Data loaded: ' + str(data))
    # NOTE(review): only the log messages are visible here; the actual
    # binarize / normalize calls on `data` need to be confirmed.
    if binary:
        lko_log('Binarizing data...')
    else:
        # normalize
        lko_log('Normalizing data...')
    result = {}
    for method in method_list:
        # do for each method
        perf_vect = []
        for iteration in range(total_iteration):
            # do for each iteration for each method.
            lko_log('Method: '+ method.unique_str() + ' Iteration: '+ str(iteration))
            # data split of the current iteration.
            split_resource_str = 'exp' + exp_id + '_blind_idx_iter' + str(iteration)
            split_dir = exp_id + '/blind_idx'
            blind_out_idx = URM.LoadResource(URM.RTYPE_RESULT, split_resource_str, split_dir)
            if not blind_out_idx:
                # randomly generate k items to blind out.
                blind_out_idx   = ds.sample_num(data.num_col, blind_k_out)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str, blind_out_idx, split_dir)
            lko_log('Blind index done.')
            # split the k items as a separate.
            [data_tr, data_left] = data.blind_k_out(blind_out_idx)
            lko_log('Start index')
            iter_result = experiment_unit_leave_k_out_map(exp_id, method, \
                                    data_tr, data_left, iteration, max_rank)
            # keep the result; otherwise perf_vect stays empty.
            perf_vect.append(iter_result)
        result[method.unique_str()] = perf_vect
    return result
def experiment_unit_leave_k_out_map(exp_id, method, data_tr, data_left, iteration, max_rank):
    """
    One leave-k-out iteration evaluated with ranking metrics.

    This method works on the column/row index of the data_tr and data_left,
    and the data_tr and data_left must be completely aligned in both
    row-wise and column-wise.

    @param exp_id:    experiment identifier used in resource names.
    @param method:    the model; must provide unique_str(), predict_row()
                      and predict().
    @param data_tr:   training data (its non-zero columns are excluded from
                      each user's ranking).
    @param data_left: held-out data used as ground truth.
    @param iteration: iteration number (part of the resource name).
    @param max_rank:  metrics are reported for ranks 1..max_rank.
    @return: dict with 'hit_rate', 'precision', 'recall' (arrays of length
             max_rank, averaged over data_left.num_row) and 'RMSE'.
    """
    # define lko_log style.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)
    result_resource_str = 'exp'      + exp_id + \
                          '_method'  + method.unique_str() + \
                          '_iter'    + str(iteration)
    # use a sub folder to store the experiment resource.
    sub_folder = exp_id + '/models/' + method.unique_str()
    # check resource for existing model.
    trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str, sub_folder)
    if not trained_model:
        # train model using the training data.
        # NOTE: this is the most time-consuming part.
        lko_log('training models...')
        # NOTE(review): no training call on data_tr is visible here, so the
        # model is cached as received -- confirm the missing training step.
        # save resource
        trained_model = [method]
        URM.SaveResource(URM.RTYPE_RESULT, result_resource_str, trained_model, sub_folder)
    # compute performance on test data using the model.
    [method] = trained_model
    lko_log('computing evaluation metrics on the test data...')
    eval_result = {}
    # ranked list.
    col_num  = data_left.num_col
    pred_col = range(col_num)
    tr_data_csr = data_tr.get_sparse_matrix().tocsr()
    lo_data_csr = data_left.get_sparse_matrix().tocsr()
    perf_vect_prec = np.zeros(max_rank)  # precision
    perf_vect_rec  = np.zeros(max_rank)  # recall
    perf_vect_hr   = np.zeros(max_rank)  # hit rate (Modification of Xia Ning's Paper)
    for user_idx in range(data_left.num_row):
        # test column index;
        lo_col = set(np.nonzero(lo_data_csr[user_idx, :])[1].tolist())
        # there is no testing on this user; skipping also avoids dividing
        # by len(lo_col) == 0 below.
        if not lo_col:
            continue
        # predict the entire row.
        row_pred = method.predict_row(user_idx, pred_col)
        # rank the column (the result is a list of indices).
        srt_col = [k[0] for k in sorted(enumerate(row_pred), key=lambda x: x[1], reverse=True)]
        # trained columns.
        tr_col = set(np.nonzero(tr_data_csr[user_idx, :])[1].tolist())
        # remove the trained column from prediction.
        # this contains a set of indices that predicted (excluding training items).
        te_srt_col = [col_pos for col_pos in srt_col if col_pos not in tr_col]
        num_left = len(lo_col)
        # the hit variable keeps track of the number of hits till the current rank.
        hit = 0
        for rk in range(max_rank):
            # ranks beyond the candidate list add no hit but still
            # contribute the running totals.
            if (rk < len(te_srt_col)) and (te_srt_col[rk] in lo_col):
                hit += 1
            # hit rate and recall share the same formula here.
            perf_vect_hr[rk]   += float(hit)/num_left    # hit rate
            perf_vect_prec[rk] += float(hit)/(rk+1)      # precision
            perf_vect_rec[rk]  += float(hit)/num_left    # recall

    # normalization over users (skipped users are part of the denominator).
    perf_vect_hr   = perf_vect_hr/data_left.num_row
    perf_vect_prec = perf_vect_prec/data_left.num_row
    perf_vect_rec  = perf_vect_rec/data_left.num_row
    eval_result['hit_rate']  = perf_vect_hr
    eval_result['precision'] = perf_vect_prec
    eval_result['recall']    = perf_vect_rec
    eval_result['RMSE']      = rmse(data_left.data_val, method.predict(data_left.data_row, data_left.data_col))
    return eval_result
 def read_file_with_minval(self, filename, min_duid, min_pid, num_duid = None, num_pid = None, rand_seed = 1): 
     """
     This method first goes through the data once, and filters out the
     devices and programs whose occurrences do not exceed the specified
     values. The result is cached as a URM data resource and loaded from
     the cache on later calls with the same arguments.

     @param filename: file name (or list of file names) of the data to be read.
     @param min_duid: a device is kept when it occurs strictly more than min_duid times.
     @param min_pid:  a program is kept when it occurs strictly more than min_pid times.
     @param num_duid: optional maximum number of devices to keep.
     @param num_pid:  optional maximum number of programs to keep.
     @param rand_seed: NOTE(review): currently unused -- the subsample keeps
              the leading entries of the filtered lists, no shuffling is
              visible; confirm the intended random undersampling.
     @return: a FeedbackData data structure constructed from the data file. In the result there is also
              a genre-program mapping data (result.meta['pggr_pg'][i], result.meta['pggr_gr'][i]) indicates that
              the program at result.meta['pggr_pg'][i] is marked by genre at result.meta['pggr_gr'][i]. The genre
              mapping is in R:/Data/Rovi/genre.csv, and a vintage copy is also kept in datasample/Rovi folder.
     @raise ValueError: when only one of num_duid / num_pid is given.
     """
     if num_duid is None and num_pid is None:
         subsample = False
         res_str  = 'DWT_RFWMV[' + str(filename) + '][MIN DUID' + str(min_duid) + '][MIN PID' + str(min_pid) +']'
     elif num_duid is not None and num_pid is not None:
         subsample = True
         res_str  = 'DWT_RFWMV[' + str(filename) + '][MIN DUID' + str(min_duid) + '][MIN PID' + str(min_pid) +']'\
                         + '[NUM DUID' + str(num_duid) + ']' + '[NUM PID' + str(num_pid) + ']'
     else:
         raise ValueError('num_duid and num_pid should be both set or both use default')
     # We check if the current resource is available; a cached result is returned as is.
     if URM.CheckResource(URM.RTYPE_DATA, res_str):
         return URM.LoadResource(URM.RTYPE_DATA, res_str)
     # not cached: compute from the data file and save the resource.
     Logger.Log('Computing data information...')
     [occur_duid, occur_pid, _, _] = self.read_file_info(filename)
     # single-argument print keeps the Python 2 output and also parses under Python 3.
     print(' '.join([str(len(occur_duid)), 'devices', str(len(occur_pid)), 'programs']))
     Logger.Log('Generating filtering indices...')
     duidlist = [sel_duid for sel_duid, sel_duidcnt in occur_duid.iteritems() if sel_duidcnt > min_duid]
     pidlist  = [sel_pid  for sel_pid,  sel_pidcnt  in occur_pid.iteritems()  if sel_pidcnt  > min_pid]
     # report the sizes AFTER filtering (not the unfiltered totals).
     print(' '.join(['After filtering [MIN_DUID', str(min_duid), ' MIN_PID:', str(min_pid), ']:',
                     str(len(duidlist)), 'devices', str(len(pidlist)), 'programs']))
     # perform sampling.
     if subsample:
         if len(duidlist) > num_duid:
             # subsample DUID;
             duidlist = duidlist[:num_duid]
         if len(pidlist)  > num_pid:
             # subsample PID;
             pidlist  = pidlist[:num_pid]
     # sets make the per-record membership tests of the reader cheap.
     duidlist = set(duidlist)
     pidlist  = set(pidlist)
     # read the raw data file with the list.
     [mapping_duid, mapping_pid, row, col, data, pggr_pg, pggr_gr] \
         = self.read_file_with_id_list(filename, duidlist, pidlist)
     Logger.Log('read_file_with_minval process completed.')
     result = FeedbackData(row, col, data, len(mapping_duid), len(mapping_pid),\
             mapping_duid, mapping_pid, {'pggr_pg': pggr_pg, 'pggr_gr': pggr_gr})
     # save computed results to resource cache.
     URM.SaveResource(URM.RTYPE_DATA, res_str, result)
     return result
# Runs one leave-k-out evaluation unit: obtains a model for `method` (taken
# from the URM result cache when one is stored, otherwise saved after the
# training step), ranks the columns for every row of data_left, and returns a
# dict with the keys 'prec', 'recall' and 'rmse'.
#
# NOTE(review): this definition looks truncated in this view -- the parameter
# list is never closed (top_n, used below, is not bound anywhere visible), the
# description text is not wrapped in docstring quotes, and several calls are
# missing their trailing arguments. Confirm against the complete file.
def experiment_unit_leave_k_out(exp_id, method, data_tr, data_left, iteration,
    This method works on the column/row index of the data_tr and data_left, and 
    the data_tr and data_left must be completely aligned in both row-wise and column-wise. 

    # define lko_log style.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    # cache key: experiment id + the method's unique string + iteration number.
    result_resource_str = 'exp'      + exp_id + \
                          '_method'  + method.unique_str() + \
                          '_iter'    + str(iteration)
    sub_folder = exp_id + '/models/' + method.unique_str()
    # use a sub folder to store the experiment resource.

    # check resource for existing model.
    # a falsy LoadResource result is treated as "no cached model" below.
    # NOTE(review): the remaining arguments of this call are not visible here.
    trained_model = URM.LoadResource(URM.RTYPE_RESULT, result_resource_str,
    if not trained_model:

        # train model using the training data.
        # NOTE: this is the most time-consuming part.
        lko_log('training models...')
        # NOTE(review): no training call is visible between the log message
        # above and the save below -- presumably a line is missing; verify.

        # save resource
        # the method object itself is what gets cached, wrapped in a list.
        trained_model = [method]
        # NOTE(review): the remaining arguments of this call are not visible here.
        URM.SaveResource(URM.RTYPE_RESULT, result_resource_str, trained_model,

    # compute performance on test data using the model.
    # unpack the one-element list; `method` now refers to the cached/saved model.
    [method] = trained_model
    lko_log('computing evaluation metrics on the test data...')

    eval_result = {}
    # ranked list.

    # candidate columns for prediction: every column index of data_left.
    col_num = data_left.num_col
    pred_col = range(col_num)

    # CSR form of both matrices, used for the per-row slicing in the loop.
    tr_data_csr = data_tr.get_sparse_matrix().tocsr()
    lo_data_csr = data_left.get_sparse_matrix().tocsr()

    for user_idx in range(data_left.num_row):
        # predict the entire row.

        #pred_row = [user_idx] * col_num;
        #row_pred = method.predict(pred_row, pred_col);
        row_pred = method.predict_row(user_idx, pred_col)

        # rank the column (the result is a list of indices).
        # positions of row_pred ordered by descending predicted value.
        # NOTE(review): the closing bracket of this list is not visible here.
        srt_col = [
            k[0] for k in sorted(
                enumerate(row_pred), key=lambda x: x[1], reverse=True)
        # trained columns.
        tr_col = set(np.nonzero(tr_data_csr[user_idx, :])[1].tolist())
        # remove the trained column.
        te_srt_col = [col_pos for col_pos in srt_col if col_pos not in tr_col]
        # top - k (safeguard)
        # NOTE(review): min(top_n, len - 1) also drops the last candidate
        # whenever len(te_srt_col) <= top_n; a plain [:top_n] slice would
        # already be safe. Looks like an off-by-one -- confirm intent.
        te_topk_col = te_srt_col[:min(top_n,
                                      len(te_srt_col) - 1)]
        # test column index;
        lo_col = set(np.nonzero(lo_data_csr[user_idx, :])[1].tolist())

        # NOTE(review): prec / rec are rebound on every iteration, so as
        # written only the last row's values reach eval_result (and both are
        # unbound when data_left.num_row is 0). Any accumulation or averaging
        # step is not visible here.
        prec = precision_itemlist(te_topk_col, lo_col)
        rec = recall_itemlist(te_topk_col, lo_col)
    eval_result['prec'] = prec
    eval_result['recall'] = rec
    # NOTE(review): only one argument of rmse(...) is visible; the reference
    # values it is compared against are presumably on a missing line.
    eval_result['rmse'] = rmse(
        method.predict(data_left.data_row, data_left.data_col))
    return eval_result
def experiment_tr_te_map(exp_name, train_data_file, test_data_file,
                         train_item_feature_file, test_item_feature_file,
                         max_rank, binary=False):
    """
    Prepare a train/test experiment: build the experiment ID (used as a cache
    string) from the input file names, then load the training and testing
    utility data.

    @param exp_name:                experiment name, embedded in the experiment ID.
    @param train_data_file:         training data file name, or a list of file names.
    @param test_data_file:          testing data file name, or a list of file names.
    @param train_item_feature_file: training item feature file name or list of
                                    names; a falsy value means "not provided".
    @param test_item_feature_file:  testing item feature file name or list of
                                    names; a falsy value means "not provided".
    @param max_rank:                the maximal N in the computation.
    @param binary:                  when true, the experiment ID gets the
                                    'trte_bi_' prefix instead of 'trte_'.
    @return out

    NOTE(review): the visible part of this function stops after creating an
    empty `result` dict; item-feature loading, the binarization itself, any
    use of max_rank and the final return are not visible here.
    """

    def _hash_str(file_or_files):
        # A list is unhashable, so a list of file names is hashed as a tuple;
        # a single file name is hashed directly.
        # NOTE(review): on Python 3 str hashes are salted per process unless
        # PYTHONHASHSEED is fixed, which would make these cache strings
        # differ between runs.
        if isinstance(file_or_files, list):
            return str(hash(tuple(file_or_files)))
        return str(hash(file_or_files))

    def _show(*parts):
        # Writes the parts separated by single spaces, like the Python 2
        # print statement does for these arguments, while remaining valid
        # syntax on Python 3.
        print(' '.join(str(part) for part in parts))

    # initialize utilities
    trte_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    # processing file name hashing (used for cache string).
    #   create hash for single file or a list of files.
    hash_file_tr_data_str = _hash_str(train_data_file)
    hash_file_te_data_str = _hash_str(test_data_file)

    # item feature files are optional; an absent file contributes ''.
    if train_item_feature_file:
        hash_file_tr_item_feature_str = _hash_str(train_item_feature_file)
    else:
        hash_file_tr_item_feature_str = ''

    if test_item_feature_file:
        hash_file_te_item_feature_str = _hash_str(test_item_feature_file)
    else:
        hash_file_te_item_feature_str = ''

    # display information
    _show('Training data file', train_data_file,
          ' [hash:', hash_file_tr_data_str, ']')
    if train_item_feature_file:
        _show('Training content feature provided: ', train_item_feature_file,
              ' [hash:', hash_file_tr_item_feature_str, ']')
    else:
        _show('Training content feature not provided.')

    _show('Testing data file ', test_data_file,
          ' [hash:', hash_file_te_data_str, ']')
    if test_item_feature_file:
        _show('Testing content feature provided: ', test_item_feature_file,
              ' [hash:', hash_file_te_item_feature_str, ']')
    else:
        _show('Testing content feature not provided.')

    if binary:
        exp_id_prefix = 'trte_bi_'
    else:
        exp_id_prefix = 'trte_'

    # NOTE(review): the last tag repeats '_tritemf_' although it labels the
    # *testing* item feature hash. It is left unchanged because the ID is a
    # cache key and renaming the tag would orphan existing cached resources.
    exp_id = exp_id_prefix + exp_name + '_trdata_' + hash_file_tr_data_str \
                                      + '_tedata_' + hash_file_te_data_str \
                                      + '_tritemf_' + hash_file_tr_item_feature_str \
                                      + '_tritemf_' + hash_file_te_item_feature_str
    trte_log('Experiment ID: ' + exp_id)

    # load utility data and feature data.
    trte_log('Read training data...')
    reader = UtilityDataReader(fieldDelimiter='\t')

    # both thresholds are 0, i.e. the lowest filtering setting.
    tr_data = reader.read_file_with_minval(train_data_file, 0, 0)
    trte_log('Training data loaded: ' + str(tr_data))

    te_data = reader.read_file_with_minval(test_data_file, 0, 0)
    trte_log('Testing data loaded: ' + str(te_data))

    # load item feature data

    if binary:
        trte_log('Binarizing data...')

    result = {}
Created on Jan 29, 2014

@author: jiayu.zhou
import numpy as np
from rs.algorithms.recommendation.generic_recalg import CFAlg
from rs.utils.log import Logger

# an encapsulated logger.
# Prefixes every message with RandUV.ALG_NAME and ':' before passing it on to
# Logger.Log.
# NOTE(review): the call is cut off after `Logger.` in this view, so the
# second argument handed to Logger.Log is not visible here -- confirm against
# the complete file.
log = lambda message: Logger.Log(RandUV.ALG_NAME + ':' + message, Logger.

# Demo recommender derived from CFAlg. The part visible here only stores the
# constructor arguments on the instance and logs that an instance was created.
# NOTE(review): the class looks truncated in this view -- the description line
# is not wrapped in docstring quotes, and ALG_NAME (read by the module-level
# `log` helper) is not defined in the visible part. Confirm against the
# complete file.
class RandUV(CFAlg):
    A random guess recommender (demo).

    # latent_factor (default 5) and verbose (default False) are both stored
    # unchanged as instance attributes.
    def __init__(self, latent_factor=5, verbose=False):
        # initialize parameters.
        self.latent_factor = latent_factor
        # NOTE(review): the right-hand operand of '+' and the closing
        # parenthesis of this call are not visible here.
        log('dummy algorithm instance created: latent factor ' +

        self.verbose = verbose

import numpy as np
import csv
import cPickle as pickle
import os
import sys

from rs.utils.log import Logger
from scipy.sparse import coo_matrix
from import DailyWatchTimeReader
from rs.utils.sparse_matrix import normalize_row
from scipy.spatial.distance import cosine

# Module logger: prefixes every message with 'PROG SIMILARITY: ' before
# passing it on to Logger.Log.
# NOTE(review): the call is cut off after `Logger.` in this view, so the
# second argument handed to Logger.Log is not visible here -- confirm against
# the complete file.
mcpl_log = lambda message: Logger.Log('PROG SIMILARITY: ' + message, Logger.

if __name__ == '__main__':

    if len(sys.argv) >= 3:
        filename = sys.argv[1]
        rovi_daily_file = sys.argv[2]
        # INPUT: the ROVI daily mapping.
        #  Hadoop location: /apps/vddil/rovi_daily
        rovi_daily_file = "/Users/jiayu.zhou/Data/rovi_daily/20131209.tsv"

        # INPUT: the aggregated data file.
        #  Hadoop location: /apps/vddil/duid-program-watchTime-genre
        #filename = "/Users/jiayu.zhou/Data/duid-program-watchTime-genre/20131209/part-r-00000";
        filename = "../../datasample/agg_duid_pid_watchtime_genre/20131209_100000"