def experiment_leave_k_out(exp_name, daily_data_file, min_occ_user, min_occ_prog, \
                           method_list, leave_k_out, total_iteration, top_n, binary = False):
    '''
    
    Parameters
    ----------
    @param exp_name: the experiment name (prefix of the experiment ID).
    @param daily_data_file: the daily aggregated data file to load.
    @param min_occ_user: the minimum number of occurrences for a user to be kept.
    @param min_occ_prog: the minimum number of occurrences for a program to be kept.
    @param method_list: a list of matrix completion methods to evaluate.
    @param leave_k_out: leave k out for each user. The k must be strictly less
         than min_occ_user.
    @param total_iteration: the number of random leave-k-out splits to run.
    @param top_n: the N used in the top-N evaluation.
    @param binary: if this is set to True then the binary data is used (non-zero set to 1).
    
    Returns
    ----------
    @return a dictionary mapping each method's unique_str() to its list of
        per-iteration results.
    '''
    
    if leave_k_out >= min_occ_user:
        raise ValueError('The k in the leave k out should be strictly less than min_occ_user.')
    
    # define the lko_log logging shorthand.
    lko_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)
    
    # construct exp_id.
    exp_id_prefix = 'lko_bi_' if binary else 'lko_'
    exp_id = exp_id_prefix + exp_name + '_data' + str(hash(daily_data_file)) + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
                  + '_k' + str(leave_k_out) + '_toiter' + str(total_iteration)
    lko_log('Experiment ID: ' + exp_id)
    
    # load data.
    lko_log('Read data...')
    reader = UtilityDataReader()
    data = reader.read_file_with_minval(daily_data_file, min_occ_user, min_occ_prog)
    lko_log('Data loaded: ' + str(data))
    
    if binary:
        lko_log('Binarizing data...')
        data.binarize()
    else:
        lko_log('Normalizing data...')
        data.normalize_row()
    
    result = {}
    
    for method in method_list:
        # evaluate each method in turn.
    
        perf_vect = []
        for iteration in range(total_iteration):
            # run each iteration for the current method.
    
            lko_log('Method: ' + method.unique_str() + ' Iteration: ' + str(iteration))
    
            # data split of the current iteration.
            split_resource_str = 'exp' + exp_id + '_lvidx_iter' + str(iteration)
            split_dir = exp_id + '/lv_idx'
            leave_k_out_idx = URM.LoadResource(URM.RTYPE_RESULT, split_resource_str, split_dir)
            if not leave_k_out_idx:
                # randomly generate k held-out items for each row/user.
                leave_k_out_idx = ds.leave_k_out(data, leave_k_out)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str, leave_k_out_idx, split_dir)
            
            # split the k held-out items into a separate data set.
            [data_left, data_tr] = data.leave_k_out(leave_k_out_idx)
            
            iter_result = experiment_unit_leave_k_out(exp_id, method, \
                                    data_tr, data_left, iteration, top_n)
            
            perf_vect.append(iter_result)
    
        result[method.unique_str()] = perf_vect
    
    return result
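
# A minimal usage sketch for experiment_leave_k_out. The method object below is
# hypothetical (any model exposing unique_str(), plus whatever
# experiment_unit_leave_k_out expects, will do); the data path is the toy
# sample used elsewhere in this module. Note leave_k_out = 5 < min_occ_user = 10,
# as the function requires.
#
#   methods = [SomeMatrixCompletionMethod()]
#   result = experiment_leave_k_out('demo', \
#               '../../datasample/agg_duid_pid_watchtime_genre/toy_small_day1', \
#               10, 10, methods, leave_k_out = 5, total_iteration = 3, \
#               top_n = 10, binary = True)
#   # result maps each method's unique_str() to its per-iteration results.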
def experiment_rand_split(exp_name, daily_data_file, min_occ_user, min_occ_prog, \
                          method_list, training_prec, total_iteration):
    '''
    
    Parameters
    ----------
    exp_name:        a human-readable experiment name (prefix of the experiment ID).
    daily_data_file: the daily aggregated data file to load.
    min_occ_user:    the minimum number of occurrences for a user to be kept.
    min_occ_prog:    the minimum number of occurrences for a program to be kept.
    method_list:     a list of matrix completion models.
    training_prec:   the proportion of rows used for training.
    total_iteration: the number of random splits to run.
    
    Returns
    ----------
    a dictionary mapping each method's unique_str() to its list of
    per-iteration results.
    '''
    # define the mcpl_log logging shorthand.
    mcpl_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    # construct exp_id (a regular hash of the data file name is used).
    exp_id = exp_name + '_data' + str(hash(daily_data_file)) + '_mu' + str(min_occ_user) + '_mp' + str(min_occ_prog) \
                      + '_trprec' + str(training_prec) + '_toiter' + str(total_iteration)

    mcpl_log('Experiment ID: ' + exp_id)

    # load data (the per-iteration splits are saved as resources below).
    reader = UtilityDataReader()
    data = reader.read_file_with_minval(daily_data_file, min_occ_user,
                                        min_occ_prog)

    # we normalize here before splitting.
    mcpl_log('Normalizing data...')
    data.normalize_row()

    result = {}

    for method in method_list:
        # evaluate each method in turn.

        perf_vect = []
        for iteration in range(total_iteration):
            # run each iteration for the current method.

            mcpl_log('Method: ' + method.unique_str() + ' Iteration: ' +
                     str(iteration))

            # data split of the current iteration.
            split_resource_str = 'exp' + exp_id + '_split_iter' + str(
                iteration)
            split_dir = exp_id + '/split'
            split = URM.LoadResource(URM.RTYPE_RESULT, split_resource_str,
                                     split_dir)
            if not split:
                split = ds.split(data.num_row, training_prec)
                URM.SaveResource(URM.RTYPE_RESULT, split_resource_str, split,
                                 split_dir)

            [split_tr, split_te] = split
            data_tr = data.subdata_row(split_tr)
            data_te = data.subdata_row(split_te)

            iter_result = experiment_unit_rand_split(exp_id, method, data_tr,
                                                     data_te, iteration)

            perf_vect.append(iter_result)

        result[method.unique_str()] = perf_vect

    mcpl_log('Experiment Done [' + exp_id + ']')

    return result
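
# Both experiment drivers above use the same load-or-compute caching idiom:
# try URM.LoadResource first, and only generate (and URM.SaveResource) the
# split on a cache miss. Below is a self-contained sketch of that idiom with a
# plain pickle file (a hypothetical helper, not URM's actual storage backend):
import os
import cPickle as pickle

def load_or_compute(cache_path, compute_fn):
    # return the cached object if it exists; otherwise compute and persist it.
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    obj = compute_fn()
    with open(cache_path, 'wb') as f:
        pickle.dump(obj, f)
    return obj
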
def experiment_tr_te_map(exp_name, train_data_file, test_data_file, \
                        train_item_feature_file, test_item_feature_file, \
                        max_rank, binary = False):
    '''
    Parameters 
    ----------
    @param exp_name: the experiment name (prefix of the experiment ID).
    @param train_data_file: the training data file (a single file or a list of files).
    @param test_data_file: the testing data file (a single file or a list of files).
    @param train_item_feature_file: the training item (content) feature file, if any.
    @param test_item_feature_file: the testing item (content) feature file, if any.
    @param max_rank: the maximal N in the top-N computation.
    @param binary: if this is set to True then the binary data is used (non-zero set to 1).
    
    Returns
    ----------
    @return the experiment result.
    '''

    # initialize utilities
    trte_log = lambda msg: Logger.Log(msg, Logger.MSG_CATEGORY_EXP)

    # processing file name hashing (used for cache string).
    #   create hash for single file or a list of files.
    if isinstance(train_data_file, list):
        hash_file_tr_data_str = str(hash(tuple(train_data_file)))
    else:
        hash_file_tr_data_str = str(hash(train_data_file))

    if isinstance(test_data_file, list):
        hash_file_te_data_str = str(hash(tuple(test_data_file)))
    else:
        hash_file_te_data_str = str(hash(test_data_file))

    if train_item_feature_file:
        if isinstance(train_item_feature_file, list):
            hash_file_tr_item_feature_str = str(
                hash(tuple(train_item_feature_file)))
        else:
            hash_file_tr_item_feature_str = str(hash(train_item_feature_file))
    else:
        hash_file_tr_item_feature_str = ''

    if test_item_feature_file:
        if isinstance(test_item_feature_file, list):
            hash_file_te_item_feature_str = str(
                hash(tuple(test_item_feature_file)))
        else:
            hash_file_te_item_feature_str = str(hash(test_item_feature_file))
    else:
        hash_file_te_item_feature_str = ''

    # display information
    print 'Training data file', train_data_file, ' [hash:', hash_file_tr_data_str, ']'
    if train_item_feature_file:
        print 'Training content feature provided: ', train_item_feature_file, \
                 ' [hash:', hash_file_tr_item_feature_str, ']'
    else:
        print 'Training content feature not provided.'

    print 'Testing data file ', test_data_file, ' [hash:', hash_file_te_data_str, ']'
    if test_item_feature_file:
        print 'Testing content feature provided: ', test_item_feature_file, \
                 ' [hash:', hash_file_te_item_feature_str, ']'
    else:
        print 'Testing content feature not provided.'

    if binary:
        exp_id_prefix = 'trte_bi_'
    else:
        exp_id_prefix = 'trte_'

    exp_id = exp_id_prefix + exp_name + '_trdata_' + hash_file_tr_data_str \
                                      + '_tedata_' + hash_file_te_data_str \
                                      + '_tritemf_' + hash_file_tr_item_feature_str \
                                      + '_teitemf_' + hash_file_te_item_feature_str
    trte_log('Experiment ID: ' + exp_id)

    # load utility data and feature data.
    trte_log('Read training data...')
    reader = UtilityDataReader(fieldDelimiter='\t')

    tr_data = reader.read_file_with_minval(train_data_file, 0, 0)
    trte_log('Training data loaded: ' + str(tr_data))

    te_data = reader.read_file_with_minval(test_data_file, 0, 0)
    trte_log('Testing data loaded: ' + str(te_data))

    # load item feature data

    if binary:
        trte_log('Binarizing data...')
        tr_data.binarize()
        te_data.binarize()

    result = {}
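
# Note that the cache keys above are derived from Python's built-in hash(),
# which is only stable within a single interpreter build. Below is a sketch of
# a stable alternative using hashlib (shown as an option, not what the code
# above does):
import hashlib

def stable_name_hash(files):
    # accept a single path or a list of paths, mirroring the branches above.
    if isinstance(files, list):
        key = '|'.join(files)
    else:
        key = files
    return hashlib.md5(key).hexdigest()[:12]
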
from rs.data.utility_data import UtilityDataReader
from rs.data.recdata import FeedbackData
from copy import deepcopy

if __name__ == '__main__':
    
    #txt = "U1\tD1\t44\n"+"U2\tD2\t10\n"+"U2\tD1\t20\n"    
    #for line in StringIO(txt):
    #    print line
    
    
    
    data_file = '../../datasample/agg_duid_pid_watchtime_genre/toy_small_day1'
    
    # read the toy utility data sample.
    reader = UtilityDataReader()
    feedback = reader.readFile(data_file)
    print feedback.col_mapping
    
    # map each program column to a content feature.
    col_feature_map = {}
    col_feature_map['P0001'] = 'feature1'
    col_feature_map['P0002'] = 'feature2'
    col_feature_map['P0003'] = 'feature3'
    col_feature_map['P0004'] = 'feature4'
    
    # keep an untouched copy of the data before attaching features.
    feedback2 = deepcopy(feedback)
    
    feedback.attach_col_feature(col_feature_map)
    
    print feedback.meta[FeedbackData.METAKEY_COL_FEATURE]
    