def estimate_users_per_task(mb_per_task, input_format, trainfile, modelfile):
    """Estimate how many users each parallel task can process in a memory budget.

    Parameters
    ----------
    mb_per_task : available memory per task, in MB.
    input_format, trainfile : format specifier and path for the training data.
    modelfile : path to a saved recommender (loaded here just to measure size).

    Returns
    -------
    (users_per_task, num_users) as ints.

    Raises
    ------
    RuntimeError if mb_per_task cannot even cover the fixed per-task loading
    cost (training + test data plus model-specific state).
    """
    num_users, num_items, nnz = get_dataset_size(input_format, trainfile)
    logging.info('loading model to get size...')
    model = load_recommender(modelfile)
    # we load the training and test data on every task
    # - let's guess that worst case the test data will be the same size
    # NOTE(review): 16 bytes per stored value is assumed throughout this
    # function — presumably value + index overhead; confirm against the
    # actual sparse storage format.
    required_mb_per_task = 2 * (nnz * 16) / ONE_MB
    if isinstance(model, MatrixFactorizationRecommender):
        # we have to load the factors on every task
        required_mb_per_task += ((model.U.size + model.V.size) * 16) / ONE_MB
        if mb_per_task > required_mb_per_task:
            # remaining mem usage is dominated by computed scores:
            # each user needs one score per item
            users_per_task = ((mb_per_task - required_mb_per_task) * ONE_MB) / (num_items * 16)
    elif isinstance(model, ItemSimilarityRecommender):
        # we have to load the similarity matrix on every task
        required_mb_per_task += (model.similarity_matrix.nnz * 16) / ONE_MB
        if mb_per_task > required_mb_per_task:
            # estimate additional usage from avg items per user and sims per item
            items_per_user = nnz / num_users
            sims_per_item = model.similarity_matrix.nnz / num_items
            users_per_task = ((mb_per_task - required_mb_per_task) * ONE_MB) / (items_per_user * sims_per_item * 16)
    else:
        # assume nothing else to load
        users_per_task = num_users
    # checked last so the RuntimeError reports the final (model-inclusive)
    # requirement; in the budget-exceeded case users_per_task was never set
    # in the isinstance branches, so this raise must stay before the return
    if mb_per_task <= required_mb_per_task:
        raise RuntimeError(
            'requires at least {0}MB per task, increase --mb_per_task if you can'.format(required_mb_per_task))
    return int(users_per_task), int(num_users)
def run(task):
    """Generate recommendations for one batch of users and evaluate them.

    task is a 12-tuple:
    (modelfile, input_format, trainfile, test_input_format, testfile,
     feature_format, featurefile, outdir, start, end, evaluator, generate)

    Writes 1-indexed (user, item, score) rows to recs.<start>-<end>.tsv and
    touches a <start>-<end>.SUCCESS marker file, then returns the metrics
    from evaluator.process() against the test data.
    """
    # import modules required by engine
    import os
    import subprocess
    import numpy as np
    from scipy.sparse import coo_matrix
    from mrec import load_sparse_matrix, load_recommender
    from mrec.evaluation import Evaluator

    modelfile, input_format, trainfile, test_input_format, testfile, feature_format, featurefile, outdir, start, end, evaluator, generate = task

    # initialise the model
    model = load_recommender(modelfile)

    outfile = os.path.join(outdir, 'recs.{0}-{1}.tsv'.format(start, end))
    if generate:
        # generate recommendations for our batch of users
        dataset = load_sparse_matrix(input_format, trainfile)
        if featurefile is not None:
            # currently runs much faster if features are loaded as a dense matrix
            item_features = load_sparse_matrix(feature_format, featurefile).toarray()
            # strip features for any trailing items that don't appear in training set
            num_items = dataset.shape[1]
            item_features = item_features[:num_items, :]
            recs = model.range_recommend_items(dataset, start, end, max_items=20, return_scores=True, item_features=item_features)
        else:
            recs = model.range_recommend_items(dataset, start, end, max_items=20, return_scores=True)
        # fix: with-block guarantees the output file is flushed and closed even
        # if a write raises (original left the handle open on failure); opening
        # after recommendation also avoids leaving an empty file behind when
        # range_recommend_items itself fails
        with open(outfile, 'w') as out:
            for u, items in zip(xrange(start, end), recs):
                for i, w in items:
                    print >> out, '{0}\t{1}\t{2}'.format(u + 1, i + 1, w)  # write as 1-indexed
        # record success
        cmd = ['touch', os.path.join(outdir, '{0}-{1}.SUCCESS'.format(start, end))]
        subprocess.check_call(cmd)
    # load the test data
    testdata = load_sparse_matrix(test_input_format, testfile).tocsr()
    # return evaluation metrics
    return evaluator.process(testdata, outfile, start, end)
def run(self, view, model, input_format, trainfile, feature_format, featurefile, num_engines, workdir, overwrite, modelfile):
    """Train partial models in parallel on IPython engines, then concatenate
    their factors into a single recommender saved to modelfile.

    view : IPython parallel view used to map tasks to engines.
    overwrite : when False, tasks whose output already exists are skipped.
    On total success the partial outputs in workdir are deleted; otherwise an
    error is logged and the command can simply be rerun to retry the remainder.
    """
    logging.info('creating models directory {0}...'.format(workdir))
    subprocess.check_call(['mkdir', '-p', workdir])
    done = []
    if not overwrite:
        logging.info('checking for existing output models...')
        done.extend(self.find_done(workdir))
        if done:
            logging.info('found {0} output files'.format(len(done)))
    logging.info('creating tasks...')
    tasks = self.create_tasks(model, input_format, trainfile, feature_format, featurefile, workdir, num_engines, done)
    if tasks:
        logging.info('running in parallel across ipython engines...')
        async_job = view.map_async(process, tasks, retries=2)
        # wait for tasks to complete
        results = async_job.get()
        logging.info('checking output files...')
        # success is judged by output files on disk, not by the returned
        # results object
        done = self.find_done(workdir)
        remaining = len(tasks) - len(done)
    else:
        # nothing left to run: everything was already done
        remaining = 0
    if remaining == 0:
        logging.info('SUCCESS: all tasks completed')
        logging.info('concatenating {0} models...'.format(len(done)))
        # NOTE(review): assumes partial model 0 exists and that factors from
        # each partial model stack side by side (hstack on axis 1) — confirm
        # the per-task training really produces disjoint factor columns
        for ix in sorted(done):
            partial_model = load_recommender(
                self.get_modelfile(ix, workdir))
            if ix == 0:
                model = partial_model
            else:
                # concatenate factors
                model.d += partial_model.d
                model.U = np.hstack((model.U, partial_model.U))
                model.V = np.hstack((model.V, partial_model.V))
                if hasattr(model, 'W'):
                    model.W = np.hstack((model.W, partial_model.W))
        save_recommender(model, modelfile)
        logging.info('removing partial output files...')
        rmtree(workdir)
        logging.info('done')
    else:
        logging.error(
            'FAILED: {0}/{1} tasks did not complete successfully'.format(
                remaining, len(tasks)))
        logging.error(
            'try rerunning the command to retry the remaining tasks')
def run(self,view,model,input_format,trainfile,num_engines,workdir,overwrite,modelfile):
    """Train partial models in parallel on IPython engines, then average their
    factors into a single recommender saved to modelfile.

    view : IPython parallel view used to map tasks to engines.
    overwrite : when False, tasks whose output already exists are skipped.
    On total success the partial outputs in workdir are deleted; otherwise an
    error is logged and the command can simply be rerun to retry the remainder.
    """
    logging.info('creating models directory {0}...'.format(workdir))
    subprocess.check_call(['mkdir','-p',workdir])
    done = []
    if not overwrite:
        logging.info('checking for existing output models...')
        done.extend(self.find_done(workdir))
        if done:
            logging.info('found {0} output files'.format(len(done)))
    logging.info('creating tasks...')
    tasks = self.create_tasks(model,input_format,trainfile,workdir,num_engines,done)
    if tasks:
        logging.info('running in parallel across ipython engines...')
        async_job = view.map_async(process,tasks,retries=2)
        # wait for tasks to complete
        results = async_job.get()
        logging.info('checking output files...')
        # success is judged by output files on disk, not by the returned
        # results object
        done = self.find_done(workdir)
        remaining = len(tasks) - len(done)
    else:
        # nothing left to run: everything was already done
        remaining = 0
    if remaining == 0:
        logging.info('SUCCESS: all tasks completed')
        logging.info('averaging {0} models...'.format(len(done)))
        # NOTE(review): the running average below is only correct if done
        # contains the contiguous indices 0..len(done)-1 — confirm find_done
        # guarantees that
        for ix in sorted(done):
            # average two models at a time to limit memory usage:
            # after step ix, model.U/V hold the mean of partials 0..ix
            partial_model = load_recommender(self.get_modelfile(ix,workdir))
            if ix == 0:
                model = partial_model
            else:
                model.U = (ix*model.U + partial_model.U)/float(ix+1)
                model.V = (ix*model.V + partial_model.V)/float(ix+1)
        save_recommender(model,modelfile)
        logging.info('removing partial output files...')
        rmtree(workdir)
        logging.info('done')
    else:
        logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining,len(tasks)))
        logging.error('try rerunning the command to retry the remaining tasks')
def run(task):
    """Generate recommendations for one batch of users and evaluate them.

    task is a 12-tuple:
    (modelfile, input_format, trainfile, test_input_format, testfile,
     feature_format, featurefile, outdir, start, end, evaluator, generate)

    Writes 1-indexed (user, item, score) rows to recs.<start>-<end>.tsv and
    touches a <start>-<end>.SUCCESS marker file, then returns the metrics
    from evaluator.process() against the test data.
    """
    # import modules required by engine
    import os
    import subprocess
    import numpy as np
    from scipy.sparse import coo_matrix
    from mrec import load_sparse_matrix, load_recommender
    from mrec.evaluation import Evaluator

    modelfile,input_format,trainfile,test_input_format,testfile,feature_format,featurefile,outdir,start,end,evaluator,generate = task

    # initialise the model
    model = load_recommender(modelfile)

    outfile = os.path.join(outdir,'recs.{0}-{1}.tsv'.format(start,end))
    if generate:
        # generate recommendations for our batch of users
        dataset = load_sparse_matrix(input_format,trainfile)
        if featurefile is not None:
            # currently runs much faster if features are loaded as a dense matrix
            item_features = load_sparse_matrix(feature_format,featurefile).toarray()
            # strip features for any trailing items that don't appear in training set
            num_items = dataset.shape[1]
            item_features = item_features[:num_items,:]
            recs = model.range_recommend_items(dataset,start,end,max_items=20,return_scores=True,item_features=item_features)
        else:
            recs = model.range_recommend_items(dataset,start,end,max_items=20,return_scores=True)
        # fix: with-block guarantees the output file is flushed and closed even
        # if a write raises (original left the handle open on failure); opening
        # after recommendation also avoids leaving an empty file behind when
        # range_recommend_items itself fails
        with open(outfile,'w') as out:
            for u,items in zip(xrange(start,end),recs):
                for i,w in items:
                    print >>out,'{0}\t{1}\t{2}'.format(u+1,i+1,w)  # write as 1-indexed
        # record success
        cmd = ['touch',os.path.join(outdir,'{0}-{1}.SUCCESS'.format(start,end))]
        subprocess.check_call(cmd)
    # load the test data
    testdata = load_sparse_matrix(test_input_format,testfile).tocsr()
    # return evaluation metrics
    return evaluator.process(testdata,outfile,start,end)
def __init__(self,d=10,num_iters=4,reg=0.02): #TODO: clean this up filepath = PARS['data_dir']+"/reduced_row_col_num_cutoff_1.5.csv" file_name='/Users/davej/data/AVSC/reduced.csv' self.model_file = make_mrec_outfile(filepath,d=d,num_iters=num_iters,reg=reg) self.dictfile_user=file_name.replace('.csv','_dict_user.csv') self.dictfile_item=file_name.replace('.csv','_dict_item.csv') print "loading model in : %s" % self.model_file self.model=load_recommender(self.model_file) print "loading dictionaries" self.dict_user=dict(list(csv.reader(open(self.dictfile_user,'rU')))) self.dict_item=dict(list(csv.reader(open(self.dictfile_item,'rU')))) self.nbad=0 #kmeans stuff self.k_default=75 self.alpha=10.0 self.mu=0.30
def read_mrec(mrec_file='reduced.v1_numbers_mrec_d5_iter9_reg0.0150.npz'): file_name=mrec_file data_file_name=file_name.split('_mrec_')[0]+'.csv' model=mrec.load_recommender(file_name) U=model.U V=model.V model_matrix=np.dot(U,V.transpose()) shape=model_matrix.shape shape=(U.shape[0],V.shape[0]) data_matrix=np.ndarray(shape,dtype=int) line_num=0 for line in open(data_file_name,'r'): line_num+=1 if line_num % 1000000 ==0 : print line_num dat=line.strip().split(',') row=int(dat[0])-1 col=int(dat[1])-1 val=int(float(dat[2])) data_matrix[row,col]=val return (data_matrix,U,V)
def read_mrec(mrec_file='reduced.v1_numbers_mrec_d5_iter9_reg0.0150.npz'): file_name = mrec_file data_file_name = file_name.split('_mrec_')[0] + '.csv' model = mrec.load_recommender(file_name) U = model.U V = model.V model_matrix = np.dot(U, V.transpose()) shape = model_matrix.shape shape = (U.shape[0], V.shape[0]) data_matrix = np.ndarray(shape, dtype=int) line_num = 0 for line in open(data_file_name, 'r'): line_num += 1 if line_num % 1000000 == 0: print line_num dat = line.strip().split(',') row = int(dat[0]) - 1 col = int(dat[1]) - 1 val = int(float(dat[2])) data_matrix[row, col] = val return (data_matrix, U, V)
def __init__(self, d=10, num_iters=4, reg=0.02): #TODO: clean this up filepath = PARS['data_dir'] + "/reduced_row_col_num_cutoff_1.5.csv" file_name = '/Users/davej/data/AVSC/reduced.csv' self.model_file = make_mrec_outfile(filepath, d=d, num_iters=num_iters, reg=reg) self.dictfile_user = file_name.replace('.csv', '_dict_user.csv') self.dictfile_item = file_name.replace('.csv', '_dict_item.csv') print "loading model in : %s" % self.model_file self.model = load_recommender(self.model_file) print "loading dictionaries" self.dict_user = dict(list(csv.reader(open(self.dictfile_user, 'rU')))) self.dict_item = dict(list(csv.reader(open(self.dictfile_item, 'rU')))) self.nbad = 0 #kmeans stuff self.k_default = 75 self.alpha = 5.0 self.mu = 0.31
def run(task):
    """Generate recommendations for one batch of users and evaluate them.

    task is a 10-tuple:
    (modelfile, input_format, trainfile, test_input_format, testfile,
     outdir, start, end, evaluator, generate)

    Writes 1-indexed (user, item, score) rows to recs.<start>-<end>.tsv and
    touches a <start>-<end>.SUCCESS marker file, then returns the metrics
    from evaluator.process() against the test data.
    """
    # import modules required by engine
    import os
    import subprocess
    import numpy as np
    from scipy.sparse import coo_matrix
    from mrec import load_sparse_matrix, load_recommender
    from mrec.evaluation import Evaluator

    modelfile,input_format,trainfile,test_input_format,testfile,outdir,start,end,evaluator,generate = task

    # initialise the model
    model = load_recommender(modelfile)
    outfile = os.path.join(outdir,'recs.{0}-{1}.tsv'.format(start,end))
    if generate:
        # generate recommendations for our batch of users
        # fix: load the training data only when it is needed — the original
        # loaded it unconditionally even though it is used nowhere outside
        # this branch (matches the sibling run() implementations)
        dataset = load_sparse_matrix(input_format,trainfile)
        recs = model.range_recommend_items(dataset,start,end,max_items=20,return_scores=True)
        # fix: with-block guarantees the output file is flushed and closed
        # even if a write raises (original left the handle open on failure)
        with open(outfile,'w') as out:
            for u,items in zip(xrange(start,end),recs):
                for i,w in items:
                    print >>out,'{0}\t{1}\t{2}'.format(u+1,i+1,w)  # write as 1-indexed
        # record success
        cmd = ['touch',os.path.join(outdir,'{0}-{1}.SUCCESS'.format(start,end))]
        subprocess.check_call(cmd)
    # load the test data
    testdata = load_sparse_matrix(test_input_format,testfile).tocsr()
    # return evaluation metrics
    return evaluator.process(testdata,outfile,start,end)
import pandas as pd import numpy as np import random from mrec import load_sparse_matrix, load_recommender from in_store_dict import stores train = load_sparse_matrix('tsv','../data/PATH_TO_DATA_USED_TO_TRAIN_FINAL_MODEL') model = load_recommender('../../../mrec/PATH_TO_FINAL_MODEL') next_usr_num = 382,716 # -> load in users to predict and make into mrec format: # item id == label encoded, # user id == new numbers starting at next_usr_num (add new user code to label encoded dict), # call this table to_predict cold_starters = ['BIG BASS WHEEL', 'SUPER SHOT', 'WIZARD OF OZ 6 PLAYER PUSHER'] counts = to_predict.groupby('user').count().sort('item') def predict_one_user(user, store): if counts.ix[user] < 3: i = 0 game = random.choice(cold_starters) while game not in stores[game] and i < 1000: game = random.choice(cold_starters) i += 1 if store in stores[game]: return game else: