def process(task):
    """ Training task to run on an ipython engine.

    task is a tuple of (model, input_format, trainfile, feature_format,
    featurefile, outfile, offset, step); step is carried for bookkeeping
    but not used here.  Fits the model, saves it to outfile and touches
    an outfile.SUCCESS marker.

    Returns
    =======
    offset : int
        The offset for the samples that we've learned from.
    """
    # import modules required by engine
    # (removed unused `import os`)
    import subprocess
    from mrec import load_sparse_matrix, save_recommender

    model,input_format,trainfile,feature_format,featurefile,outfile,offset,step = task

    dataset = load_sparse_matrix(input_format,trainfile)
    if featurefile is not None:
        # currently runs much faster if features are loaded as a dense matrix
        item_features = load_sparse_matrix(feature_format,featurefile).toarray()
        # strip features for any trailing items that don't appear in training set
        num_items = dataset.shape[1]
        item_features = item_features[:num_items,:]
        model.fit(dataset,item_features=item_features)
    else:
        model.fit(dataset)
    save_recommender(model,outfile)
    # record success with a marker file next to the saved model
    cmd = ['touch','{0}.SUCCESS'.format(outfile)]
    subprocess.check_call(cmd)
    # return the offset for the samples that we've learned from
    return offset
def process(task):
    """ Training task to run on an ipython engine.

    Unpacks the task tuple, trains the supplied model (optionally with
    dense item features), saves it and records success via a .SUCCESS
    marker file.  Returns the sample offset handled by this engine.
    """
    # import modules required by engine
    # (removed unused `import os`)
    import subprocess
    from mrec import load_sparse_matrix, save_recommender

    model, input_format, trainfile, feature_format, featurefile, outfile, offset, step = task

    dataset = load_sparse_matrix(input_format, trainfile)
    if featurefile is not None:
        # currently runs much faster if features are loaded as a dense matrix
        item_features = load_sparse_matrix(feature_format, featurefile).toarray()
        # strip features for any trailing items that don't appear in training set
        num_items = dataset.shape[1]
        item_features = item_features[:num_items, :]
        model.fit(dataset, item_features=item_features)
    else:
        model.fit(dataset)
    save_recommender(model, outfile)
    # record success
    cmd = ['touch', '{0}.SUCCESS'.format(outfile)]
    subprocess.check_call(cmd)
    # return the offset for the samples that we've learned from
    return offset
def run(task):
    """ Generate (optionally) and evaluate recommendations for one user batch.

    Writes 1-indexed tab-separated recommendations for users [start, end)
    to outdir, touches a SUCCESS marker, then scores them against the
    test data with the supplied evaluator.
    """
    # import modules required by engine
    # (removed unused numpy / coo_matrix / Evaluator imports)
    import os
    import subprocess
    from mrec import load_sparse_matrix, load_recommender

    modelfile, input_format, trainfile, test_input_format, testfile, feature_format, featurefile, outdir, start, end, evaluator, generate = task

    # initialise the model
    model = load_recommender(modelfile)

    outfile = os.path.join(outdir, 'recs.{0}-{1}.tsv'.format(start, end))
    if generate:
        # generate recommendations for our batch of users
        dataset = load_sparse_matrix(input_format, trainfile)
        # `with` ensures the recs file is closed even if recommendation fails
        with open(outfile, 'w') as out:
            if featurefile is not None:
                # currently runs much faster if features are loaded as a dense matrix
                item_features = load_sparse_matrix(feature_format, featurefile).toarray()
                # strip features for any trailing items that don't appear in training set
                num_items = dataset.shape[1]
                item_features = item_features[:num_items, :]
                recs = model.range_recommend_items(dataset, start, end, max_items=20, return_scores=True, item_features=item_features)
            else:
                recs = model.range_recommend_items(dataset, start, end, max_items=20, return_scores=True)
            for u, items in zip(xrange(start, end), recs):
                for i, w in items:
                    print >> out, '{0}\t{1}\t{2}'.format(u + 1, i + 1, w)  # write as 1-indexed
        # record success
        cmd = ['touch', os.path.join(outdir, '{0}-{1}.SUCCESS'.format(start, end))]
        subprocess.check_call(cmd)

    # load the test data
    testdata = load_sparse_matrix(test_input_format, testfile).tocsr()
    # return evaluation metrics
    return evaluator.process(testdata, outfile, start, end)
def main(file_format, filepath, feature_format, feature_file, outfile):
    """Train a WARP2MF model with side item features and save it.

    feature_file is expected to be tsv rows of item_id,feature_id,val.
    """
    from mrec import load_sparse_matrix, save_recommender

    # training interactions as a sparse matrix
    train = load_sparse_matrix(file_format, filepath)

    # densify item features: training runs much faster on a dense array,
    # and trailing items absent from the training set are dropped
    X = load_sparse_matrix(feature_format, feature_file).toarray()
    num_items = train.shape[1]
    X = X[:num_items, :]

    model = WARP2MFRecommender(d=100, gamma=0.01, C=100.0, batch_size=10)
    model.fit(train, X)
    save_recommender(model, outfile)
def main():
    """CLI entry point: convert a sparse matrix file between formats."""
    from optparse import OptionParser
    from mrec import load_sparse_matrix, save_sparse_matrix

    parser = OptionParser()
    parser.add_option('--input_format', dest='input_format',
                      help='format of input dataset tsv | csv | mm (matrixmarket) | csr (scipy.sparse.csr_matrix) | fsm (mrec.sparse.fast_sparse_matrix)')
    parser.add_option('--input', dest='input', help='filepath to input')
    parser.add_option('--output_format', dest='output_format',
                      help='format of output dataset(s) tsv | csv | mm (matrixmarket) | csr (scipy.sparse.csr_matrix) | fsm (mrec.sparse.fast_sparse_matrix)')
    parser.add_option('--output', dest='output', help='filepath for output')
    opts, args = parser.parse_args()

    # all four options are mandatory
    if not all((opts.input, opts.output, opts.input_format, opts.output_format)):
        parser.print_help()
        raise SystemExit
    if opts.input_format == opts.output_format:
        raise SystemExit('input and output format are the same, not doing anything')

    if opts.input_format == 'tsv' and opts.output_format == 'mm':
        # special case: stream the conversion without loading the data
        tsv2mtx(opts.input, opts.output)
    else:
        data = load_sparse_matrix(opts.input_format, opts.input)
        save_sparse_matrix(data, opts.output_format, opts.output)
def run(self, view, model, input_format, trainfile, num_engines, workdir, modelfile):
    """Run parallel alternating factorization and save the fitted model.

    Each iteration updates the user factors U and then the item factors V
    across the ipython engines; afterwards the per-engine partial factor
    files are stacked back into full matrices and workdir is removed.
    """
    logging.info('creating factors directory {0}'.format(workdir))
    subprocess.check_call(['mkdir', '-p', workdir])

    logging.info('getting data size')
    interactions = load_sparse_matrix(input_format, trainfile)
    num_users, num_items = interactions.shape
    # only the dimensions are needed here, release the data right away
    del interactions

    for iteration in xrange(model.num_iters):
        logging.info('iteration {0}'.format(iteration))
        user_tasks = self.create_tasks(num_users, num_engines, model,
                                       input_format, trainfile, workdir, 'U',
                                       get_user_indices, get_item_factor_files,
                                       init_item_factors)
        self.run_tasks(view, user_tasks)
        item_tasks = self.create_tasks(num_items, num_engines, model,
                                       input_format, trainfile, workdir, 'V',
                                       get_item_indices, get_user_factor_files,
                                       None)  # won't need to initialize user factors
        self.run_tasks(view, item_tasks)

    # reassemble full factor matrices from the partial per-engine files
    model.U = np.vstack([np.load(f) for f in get_user_factor_files(workdir)])
    model.V = np.vstack([np.load(f) for f in get_item_factor_files(workdir)])
    save_recommender(model, modelfile)

    logging.info('removing partial output files')
    rmtree(workdir)
    logging.info('done')
def process(task):
    """ Training task to run on an ipython engine.

    Trains a WARP-style model with a ShuffleSampler over the training
    data, saves the model and touches an outfile.SUCCESS marker.

    Returns
    =======
    offset : int
        The offset for the samples that we've learned from.
    """
    # import modules required by engine
    # (removed unused `import os`)
    import subprocess
    from mrec import load_sparse_matrix, save_recommender
    from mrec.mf.warp import ShuffleSampler

    model,input_format,trainfile,outfile,offset,step = task

    # TODO: configure this!!!
    positive_thresh = 1

    dataset = load_sparse_matrix(input_format,trainfile)
    # TODO: models don't seem to converge, investigate....
    #sampler = ShuffleSampler(dataset,positive_thresh,42,offset,step)
    sampler = ShuffleSampler(dataset,positive_thresh,42)
    model.fit(dataset,sampler)
    save_recommender(model,outfile)
    # record success
    cmd = ['touch','{0}.SUCCESS'.format(outfile)]
    subprocess.check_call(cmd)
    # return the offset for the samples that we've learned from
    return offset
def main():
    """Train a reranking recommender (kNN candidates reranked by WARP MF).

    Usage: <file_format> <filepath> <outfile>
    """
    import sys
    # (removed unused `from mrec.sparse import fast_sparse_matrix`)
    from mrec import load_sparse_matrix, save_recommender
    from mrec.item_similarity.knn import CosineKNNRecommender
    from mrec.mf.warp import WARPMFRecommender
    from mrec.reranking_recommender import RerankingRecommender

    file_format = sys.argv[1]
    filepath = sys.argv[2]
    outfile = sys.argv[3]

    # load training set as scipy sparse matrix
    train = load_sparse_matrix(file_format, filepath)

    item_sim_model = CosineKNNRecommender(k=100)
    mf_model = WARPMFRecommender(d=80, gamma=0.01, C=100.0, max_iters=25000,
                                 validation_iters=1000, batch_size=10)
    recommender = RerankingRecommender(item_sim_model, mf_model,
                                       num_candidates=100)

    recommender.fit(train)

    save_recommender(recommender, outfile)
def main(file_format,filepath,feature_format,feature_file,outfile):
    """Train a WARP2MF model with item features and save it.

    feature_file is assumed to be tsv rows of item_id,feature_id,val.
    """
    # (removed unused `from mrec.sparse import fast_sparse_matrix`)
    from mrec import load_sparse_matrix, save_recommender

    # load training set
    train = load_sparse_matrix(file_format,filepath)

    # load item features, assume they are tsv: item_id,feature_id,val
    X = load_sparse_matrix(feature_format,feature_file).toarray()
    # strip features for any trailing items that don't appear in training set
    num_items = train.shape[1]
    X = X[:num_items,:]

    model = WARP2MFRecommender(d=100,gamma=0.01,C=100.0,batch_size=10)
    model.fit(train,X)

    save_recommender(model,outfile)
def run(task):
    """ Generate (optionally) and evaluate recommendations for one user batch.

    Writes 1-indexed tab-separated recs for users [start, end) and scores
    them against the held-out test data with the supplied evaluator.
    """
    # import modules required by engine
    # (removed unused numpy / coo_matrix / Evaluator imports)
    import os
    import subprocess
    from mrec import load_sparse_matrix, load_recommender

    modelfile,input_format,trainfile,test_input_format,testfile,feature_format,featurefile,outdir,start,end,evaluator,generate = task

    # initialise the model
    model = load_recommender(modelfile)

    outfile = os.path.join(outdir,'recs.{0}-{1}.tsv'.format(start,end))
    if generate:
        # generate recommendations for our batch of users
        dataset = load_sparse_matrix(input_format,trainfile)
        # `with` guarantees the recs file is closed even on error
        with open(outfile,'w') as out:
            if featurefile is not None:
                # currently runs much faster if features are loaded as a dense matrix
                item_features = load_sparse_matrix(feature_format,featurefile).toarray()
                # strip features for any trailing items that don't appear in training set
                num_items = dataset.shape[1]
                item_features = item_features[:num_items,:]
                recs = model.range_recommend_items(dataset,start,end,max_items=20,return_scores=True,item_features=item_features)
            else:
                recs = model.range_recommend_items(dataset,start,end,max_items=20,return_scores=True)
            for u,items in zip(xrange(start,end),recs):
                for i,w in items:
                    print >>out,'{0}\t{1}\t{2}'.format(u+1,i+1,w)  # write as 1-indexed
        # record success
        cmd = ['touch',os.path.join(outdir,'{0}-{1}.SUCCESS'.format(start,end))]
        subprocess.check_call(cmd)

    # load the test data
    testdata = load_sparse_matrix(test_input_format,testfile).tocsr()
    # return evaluation metrics
    return evaluator.process(testdata,outfile,start,end)
def __call__(self):
    """Load the current train split and build a WARP validation set.

    Returns
    =======
    (train, users, validation) : the training matrix, the user ids in the
        validation set, and the validation set itself.
    """
    # (removed unused `import random`)
    from mrec import load_sparse_matrix
    # name-mangled private helper on the enclosing class
    current_split_path = self.__generate_current_split_path()
    train = load_sparse_matrix('csv', current_split_path)
    import warp
    max_iters,validation_iters,validation = warp.WARPMFRecommender.create_validation_set(train)
    users = validation.keys()
    return train, users, validation
def run(task):
    """Score precomputed recommendations for one batch of users."""
    # import modules required by engine
    from mrec import load_sparse_matrix

    fmt, test_path, recs_path, first, last, evaluator = task

    # held-out interactions to score the recommendations against
    testdata = load_sparse_matrix(fmt, test_path)

    return evaluator.process(testdata, recs_path, first, last)
def __call__(self):
    """Load the current split and derive a WARP validation set from it.

    Returns the training matrix, the validation user ids and the
    validation set.
    """
    # (removed unused `import random`)
    from mrec import load_sparse_matrix
    current_split_path = self.__generate_current_split_path()
    train = load_sparse_matrix('csv', current_split_path)
    import warp
    max_iters, validation_iters, validation = warp.WARPMFRecommender.create_validation_set(
        train)
    users = validation.keys()
    return train, users, validation
def run(self,view,model,input_format,trainfile,num_engines,simsdir,overwrite,max_sims,simsfile,modelfile):
    """Compute item similarities in parallel and assemble the final model.

    Fans similarity tasks out across ipython engines (or runs them
    sequentially when num_engines == 0), then, if every task produced its
    partial output, concatenates the partials into simsfile, loads them
    into `model` and saves the recommender to modelfile.
    """
    logging.info('finding number of items...')
    # only the matrix dimensions are needed; drop the data immediately
    dataset = load_sparse_matrix(input_format,trainfile)
    num_users,num_items = dataset.shape
    del dataset
    logging.info('%d users and %d items', num_users, num_items)
    logging.info('creating sims directory {0}...'.format(simsdir))
    subprocess.check_call(['mkdir','-p',simsdir])
    done = []
    if not overwrite:
        # resume support: skip ranges that already have output files
        logging.info('checking for existing output sims...')
        done.extend(self.find_done(simsdir))
        if done:
            logging.info('found {0} output files'.format(len(done)))
    logging.info('creating tasks...')
    tasks = self.create_tasks(model,input_format,trainfile,simsdir,num_items,num_engines,max_sims,done)
    if num_engines > 0:
        logging.info('running %d tasks in parallel across ipython'
                     ' engines...', len(tasks))
        async_job = view.map_async(process,tasks,retries=2)
        # wait for tasks to complete
        results = async_job.get()
    else:
        # Sequential run to make it easier for debugging
        logging.info('training similarity model sequentially')
        results = [process(task) for task in tasks]
    logging.info('checking output files...')
    # success is judged by the marker/output files, not by `results`
    done = self.find_done(simsdir)
    remaining = len(tasks) - len(done)
    if remaining == 0:
        logging.info('SUCCESS: all tasks completed')
        logging.info('concatenating {0} partial output files...'.format(len(done)))
        paths = [os.path.join(simsdir,'sims.{0}-{1}.tsv'.format(start,end)) for start,end in done]
        cmd = ['cat']+paths
        subprocess.check_call(cmd,stdout=open(simsfile,'w'))
        logging.info('removing partial output files...')
        rmtree(simsdir)
        logging.info('loading %d items in %s model from %s', num_items, type(model).__name__, simsfile)
        model.load_similarity_matrix(simsfile,num_items)
        save_recommender(model,modelfile)
        logging.info('done')
    else:
        # leave the partial outputs in place so a rerun can pick up from here
        logging.error('FAILED: {0}/{1} tasks did not complete successfully'.format(remaining,len(tasks)))
        logging.error('try rerunning the command to retry the remaining tasks')
def main():
    """CLI: evaluate precomputed recommendations against test splits.

    For each training file matched by the --train glob, loads the paired
    test file and recs file (via filename_conventions helpers), computes
    the chosen metric set per split, and prints an aggregate report.
    """
    import os
    import logging
    import glob
    from optparse import OptionParser
    from collections import defaultdict
    from mrec import load_sparse_matrix
    from mrec.evaluation.metrics import compute_main_metrics, compute_hit_rate
    from mrec.evaluation import Evaluator
    from mrec.evaluation.metrics import print_report
    from filename_conventions import get_testfile, get_recsfile

    logging.basicConfig(level=logging.INFO,format='[%(asctime)s] %(levelname)s: %(message)s')

    parser = OptionParser()
    parser.add_option('--input_format',dest='input_format',help='format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)')
    parser.add_option('--test_input_format',dest='test_input_format',default='npz',help='format of test dataset(s) tsv | csv | mm (matrixmarket) | npz (numpy binary) (default: %default)')
    parser.add_option('--train',dest='train',help='glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard')
    parser.add_option('--recsdir',dest='recsdir',help='directory containing tsv files of precomputed recommendations')
    parser.add_option('--metrics',dest='metrics',default='main',help='which set of metrics to compute, main|hitrate (default: %default)')
    parser.add_option('--description',dest='description',help='description of model which generated the recommendations')

    # maps the --metrics option value to the metric-computing function
    metrics_funcs = {'main':compute_main_metrics,
                     'hitrate':compute_hit_rate}

    (opts,args) = parser.parse_args()
    if not opts.input_format or not opts.train or not opts.recsdir \
            or opts.metrics not in metrics_funcs:
        parser.print_help()
        raise SystemExit

    opts.train = os.path.abspath(os.path.expanduser(opts.train))
    opts.recsdir = os.path.abspath(os.path.expanduser(opts.recsdir))

    evaluator = Evaluator(metrics_funcs[opts.metrics],max_items=20)

    trainfiles = glob.glob(opts.train)

    # per-metric list of scores, one entry per train/test split
    all_metrics = defaultdict(list)
    for trainfile in trainfiles:
        logging.info('processing {0}...'.format(trainfile))
        testfile = get_testfile(trainfile)
        recsfile = get_recsfile(trainfile,opts.recsdir)
        testdata = load_sparse_matrix(opts.test_input_format,testfile).tocsr()
        cum_metrics,count = evaluator.process(testdata,recsfile,0,testdata.shape[0])
        if cum_metrics is not None:
            for m in cum_metrics:
                # average the cumulative metric over the evaluated users
                all_metrics[m].append(float(cum_metrics[m])/count)

    print_report([opts.description],[all_metrics])
def test_save_load_sparse_matrix():
    """Round-trip a random COO matrix through every supported file format."""
    X = get_random_coo_matrix()
    for fmt in ["tsv", "csv", "npz", "mm", "fsm"]:
        # choose the filename suffix each format's writer expects
        if fmt == "mm":
            suffix = ".mtx"
        elif fmt == "npz" or fmt == "fsm":
            suffix = ".npz"
        else:
            suffix = ""
        f, path = tempfile.mkstemp(suffix=suffix)
        # BUG FIX: mkstemp returns an open OS-level fd; close it so we
        # don't leak one descriptor per format
        os.close(f)
        save_sparse_matrix(X, fmt, path)
        Y = load_sparse_matrix(fmt, path)
        assert_sparse_matrix_equal(X, Y)
        os.remove(path)
def main():
    """Train a WARP MF model with validation-based stopping and save it.

    Usage: <file_format> <filepath> <outfile>
    """
    import sys
    # (removed unused `from mrec.sparse import fast_sparse_matrix`)
    from mrec import load_sparse_matrix, save_recommender

    file_format = sys.argv[1]
    filepath = sys.argv[2]
    outfile = sys.argv[3]

    # load training set as scipy sparse matrix
    train = load_sparse_matrix(file_format,filepath)

    model = WARPMFRecommender(d=100, gamma=0.01, C=100.0, batch_size=10,
                              max_iters=7001, validation_iters=1000,
                              sample_item_rate=0.1)
    model.fit(train)
    # BUG FIX: outfile was read from argv and save_recommender imported,
    # but the trained model was never written out
    save_recommender(model, outfile)
def test_save_load_sparse_matrix():
    """Save then reload a random COO matrix in every supported format."""
    X = get_random_coo_matrix()
    for fmt in ['tsv','csv','npz','mm','fsm']:
        # suffix must match what the format's writer expects
        if fmt == 'mm':
            suffix = '.mtx'
        elif fmt == 'npz' or fmt == 'fsm':
            suffix = '.npz'
        else:
            suffix = ''
        f,path = tempfile.mkstemp(suffix=suffix)
        # BUG FIX: close the OS-level fd returned by mkstemp to avoid
        # leaking a descriptor on every iteration
        os.close(f)
        save_sparse_matrix(X,fmt,path)
        Y = load_sparse_matrix(fmt,path)
        assert_sparse_matrix_equal(X,Y)
        os.remove(path)
def run(task):
    """ Generate (optionally) and evaluate recommendations for one user batch.

    No item features in this variant.  Writes 1-indexed tsv recs and
    scores them with the supplied evaluator.
    """
    # import modules required by engine
    # (removed unused numpy / coo_matrix / Evaluator imports)
    import os
    import subprocess
    from mrec import load_sparse_matrix, load_recommender

    modelfile,input_format,trainfile,test_input_format,testfile,outdir,start,end,evaluator,generate = task

    # initialise the model
    model = load_recommender(modelfile)
    # NOTE(review): the training data is loaded even when generate is
    # False, although it is only used for generation — confirm intent
    dataset = load_sparse_matrix(input_format,trainfile)

    outfile = os.path.join(outdir,'recs.{0}-{1}.tsv'.format(start,end))
    if generate:
        # generate recommendations for our batch of users
        # `with` guarantees the recs file is closed even on error
        with open(outfile,'w') as out:
            recs = model.range_recommend_items(dataset,start,end,max_items=20,return_scores=True)
            for u,items in zip(xrange(start,end),recs):
                for i,w in items:
                    print >>out,'{0}\t{1}\t{2}'.format(u+1,i+1,w)  # write as 1-indexed
        # record success
        cmd = ['touch',os.path.join(outdir,'{0}-{1}.SUCCESS'.format(start,end))]
        subprocess.check_call(cmd)

    # load the test data
    testdata = load_sparse_matrix(test_input_format,testfile).tocsr()
    # return evaluation metrics
    return evaluator.process(testdata,outfile,start,end)
def run(task):
    """ Evaluation task to run on an ipython engine.

    Loads the test data and scores the precomputed recommendations for
    users in [start, end) with the supplied evaluator.
    """
    # import modules required by engine
    # (removed unused numpy / coo_matrix / defaultdict imports)
    from mrec import load_sparse_matrix

    input_format,testfile,recsfile,start,end,evaluator = task

    # load the test data
    testdata = load_sparse_matrix(input_format,testfile)

    return evaluator.process(testdata,recsfile,start,end)
def run(task):
    """ Evaluation task to run on an ipython engine.

    Scores precomputed recommendations for users [start, end) against
    the test data.
    """
    # import modules required by engine
    # (removed unused numpy / coo_matrix / defaultdict imports)
    from mrec import load_sparse_matrix

    input_format, testfile, recsfile, start, end, evaluator = task

    # load the test data
    testdata = load_sparse_matrix(input_format, testfile)

    return evaluator.process(testdata, recsfile, start, end)
def main():
    """Train a plain WARP MF recommender from argv and save it.

    Usage: <file_format> <filepath> <outfile>
    """
    import sys
    from mrec import load_sparse_matrix, save_recommender

    file_format = sys.argv[1]
    filepath = sys.argv[2]
    outfile = sys.argv[3]

    # training interactions as a scipy sparse matrix
    train = load_sparse_matrix(file_format, filepath)

    recommender = WARPMFRecommender(d=100, gamma=0.01, C=100.0, batch_size=10)
    recommender.fit(train)

    save_recommender(recommender, outfile)
def main():
    """Train a WARP MF recommender and save it.

    Usage: <file_format> <filepath> <outfile>
    """
    import sys
    # (removed unused `from mrec.sparse import fast_sparse_matrix`)
    from mrec import load_sparse_matrix, save_recommender

    file_format = sys.argv[1]
    filepath = sys.argv[2]
    outfile = sys.argv[3]

    # load training set as scipy sparse matrix
    train = load_sparse_matrix(file_format,filepath)

    model = WARPMFRecommender(d=100,gamma=0.01,C=100.0,batch_size=10)
    model.fit(train)

    save_recommender(model,outfile)
def main():
    """Train a CLiMF recommender from argv and save it.

    Usage: <file_format> <filepath> <outfile>
    """
    import sys
    from mrec import load_sparse_matrix, save_recommender
    from mrec.mf.climf import CLiMFRecommender

    file_format = sys.argv[1]
    filepath = sys.argv[2]
    outfile = sys.argv[3]

    # training interactions as a scipy sparse matrix
    train = load_sparse_matrix(file_format, filepath)

    recommender = CLiMFRecommender(d=5)
    recommender.fit(train)

    save_recommender(recommender, outfile)
def main():
    """Fit a low-rank CLiMF model on the given dataset and persist it."""
    import sys
    from mrec import load_sparse_matrix, save_recommender
    from mrec.mf.climf import CLiMFRecommender

    # positional args: format, data path, model output path
    file_format = sys.argv[1]
    filepath = sys.argv[2]
    outfile = sys.argv[3]

    train = load_sparse_matrix(file_format,filepath)

    climf = CLiMFRecommender(d=5)
    climf.fit(train)
    save_recommender(climf,outfile)
def run_mrec(d=10, num_iters=4, reg=0.02): #d is dimension of subspace, i.e. groups import sys from mrec import load_sparse_matrix, save_recommender from mrec.sparse import fast_sparse_matrix from mrec.mf.wrmf import WRMFRecommender alpha = 1.0 start = time.time() file_format = "csv" #file shoule be csv, with: row,col,data #data may just be ones filepath = PARS['data_dir'] + "/reduced_row_col_num_cutoff_1.5.csv" #filepath = PARS['data_dir']+"test_10_mill.csv" outfile = make_mrec_outfile(filepath, d, num_iters, reg) print outfile print 'reading file: %s' % filepath # load training set as scipy sparse matrix print "loading file" train = load_sparse_matrix(file_format, filepath) print "loaded file" print(time.time() - start), "seconds" print "size:", train.shape print "creating recommender" model = WRMFRecommender(d=d, num_iters=num_iters, alpha=alpha, lbda=reg) print "training on data" print time.time() - start model.fit(train) print "done training" print time.time() - start print "saving model" save_recommender(model, outfile) print "wrote model to: %s" % outfile print time.time() - start return print "validating" data, U, V = read_mrec(mrec_file=outfile) plot_file = outfile.replace('.npz', '.png') multi_thresh(data, model, thresh_list=None, plot_file=plot_file) run_time = (time.time() - start) / 60.0 print "runtime: %0.3f minutes" % run_time print 'done'
def run_mrec(d=10,num_iters=4,reg=0.02): #d is dimension of subspace, i.e. groups import sys from mrec import load_sparse_matrix, save_recommender from mrec.sparse import fast_sparse_matrix from mrec.mf.wrmf import WRMFRecommender alpha=1.0 start=time.time() file_format = "csv" #file shoule be csv, with: row,col,data #data may just be ones filepath = PARS['data_dir']+"/reduced_row_col_num_cutoff_1.5.csv" #filepath = PARS['data_dir']+"test_10_mill.csv" outfile = make_mrec_outfile(filepath,d,num_iters,reg) print outfile print 'reading file: %s'%filepath # load training set as scipy sparse matrix print "loading file" train = load_sparse_matrix(file_format,filepath) print "loaded file" print (time.time()-start),"seconds" print "size:",train.shape print "creating recommender" model = WRMFRecommender(d=d,num_iters=num_iters,alpha=alpha,lbda=reg) print "training on data" print time.time()-start model.fit(train) print "done training" print time.time()-start print "saving model" save_recommender(model,outfile) print "wrote model to: %s"%outfile print time.time()-start return print "validating" data,U,V=read_mrec(mrec_file=outfile) plot_file=outfile.replace('.npz','.png') multi_thresh(data,model,thresh_list=None,plot_file=plot_file) run_time=(time.time()-start)/60.0 print "runtime: %0.3f minutes"%run_time print 'done'
def main():
    """Train a WARP MF model with validation-based stopping and save it.

    Usage: <file_format> <filepath> <outfile>
    """
    import sys
    # (removed unused `from mrec.sparse import fast_sparse_matrix`)
    from mrec import load_sparse_matrix, save_recommender

    file_format = sys.argv[1]
    filepath = sys.argv[2]
    outfile = sys.argv[3]

    # load training set as scipy sparse matrix
    train = load_sparse_matrix(file_format, filepath)

    model = WARPMFRecommender(d=100, gamma=0.01, C=100.0, batch_size=10,
                              max_iters=7001, validation_iters=1000,
                              sample_item_rate=0.1)
    model.fit(train)
    # BUG FIX: the trained model was never saved although outfile was
    # taken from argv and save_recommender was imported
    save_recommender(model, outfile)
def main():
    """Train a reranking recommender: WARP MF reranks kNN candidates.

    Usage: <file_format> <filepath> <outfile>
    """
    import sys
    # (removed unused `from mrec.sparse import fast_sparse_matrix`)
    from mrec import load_sparse_matrix, save_recommender
    from mrec.item_similarity.knn import CosineKNNRecommender
    from mrec.mf.warp import WARPMFRecommender
    from mrec.reranking_recommender import RerankingRecommender

    file_format = sys.argv[1]
    filepath = sys.argv[2]
    outfile = sys.argv[3]

    # load training set as scipy sparse matrix
    train = load_sparse_matrix(file_format,filepath)

    item_sim_model = CosineKNNRecommender(k=100)
    mf_model = WARPMFRecommender(d=80,gamma=0.01,C=100.0,max_iters=25000,validation_iters=1000,batch_size=10)
    recommender = RerankingRecommender(item_sim_model,mf_model,num_candidates=100)

    recommender.fit(train)

    save_recommender(recommender,outfile)
def run(self,view,model,input_format,trainfile,num_engines,workdir,modelfile):
    """Alternating parallel factorization: U then V each iteration.

    Partial factor files written by the engines are stacked into
    model.U / model.V, the model is saved, and workdir is cleaned up.
    """
    logging.info('creating factors directory {0}'.format(workdir))
    subprocess.check_call(['mkdir','-p',workdir])

    logging.info('getting data size')
    matrix = load_sparse_matrix(input_format,trainfile)
    num_users,num_items = matrix.shape
    del matrix  # dimensions are all we need from the data here

    for step in xrange(model.num_iters):
        logging.info('iteration {0}'.format(step))
        # user-factor pass
        u_tasks = self.create_tasks(num_users,num_engines,model,input_format,
                                    trainfile,workdir,'U',get_user_indices,
                                    get_item_factor_files,init_item_factors)
        self.run_tasks(view,u_tasks)
        # item-factor pass; won't need to initialize user factors
        v_tasks = self.create_tasks(num_items,num_engines,model,input_format,
                                    trainfile,workdir,'V',get_item_indices,
                                    get_user_factor_files,None)
        self.run_tasks(view,v_tasks)

    # gather per-engine partial factors into full matrices
    user_parts = [np.load(f) for f in get_user_factor_files(workdir)]
    item_parts = [np.load(f) for f in get_item_factor_files(workdir)]
    model.U = np.vstack(user_parts)
    model.V = np.vstack(item_parts)
    save_recommender(model,modelfile)

    logging.info('removing partial output files')
    rmtree(workdir)
    logging.info('done')
def main(): import sys from mrec import load_sparse_matrix from mrec.sparse import fast_sparse_matrix # load training set as scipy sparse matrix file_format = sys.argv[1] filepath = sys.argv[2] train = load_sparse_matrix(file_format,filepath) model = WARPMFRecommender(d=100,gamma=0.01,C=100,max_iter=100000,validation_iters=500) # these values work for ml-100k sampler = ShuffleSampler(train,1) model.fit(train,sampler) def output(i,j,val): # convert back to 1-indexed print '{0}\t{1}\t{2:.3f}'.format(i+1,j+1,val) print 'making some recommendations...' for u in xrange(20): recs = model.recommend_items(train,u) for i,score in recs: output(u,i,score) print 'making batch recommendations...' recs = model.batch_recommend_items(train) for u in xrange(20): for i,score in recs[u]: output(u,i,score) print 'making range recommendations...' for start,end in [(0,2),(2,3)]: recs = model.range_recommend_items(train,start,end) for u in xrange(start,end): for i,score in recs[u-start]: output(u,i,score)
def main():
    """Convert a sparse matrix file from one on-disk format to another."""
    from optparse import OptionParser
    from mrec import load_sparse_matrix, save_sparse_matrix

    parser = OptionParser()
    parser.add_option('--input_format', dest='input_format',
                      help='format of input dataset tsv | csv | mm (matrixmarket) | csr (scipy.sparse.csr_matrix) | fsm (mrec.sparse.fast_sparse_matrix)')
    parser.add_option('--input', dest='input', help='filepath to input')
    parser.add_option('--output_format', dest='output_format',
                      help='format of output dataset(s) tsv | csv | mm (matrixmarket) | csr (scipy.sparse.csr_matrix) | fsm (mrec.sparse.fast_sparse_matrix)')
    parser.add_option('--output', dest='output', help='filepath for output')
    opts, args = parser.parse_args()

    # every option is required
    missing = (not opts.input or not opts.output
               or not opts.input_format or not opts.output_format)
    if missing:
        parser.print_help()
        raise SystemExit
    if opts.output_format == opts.input_format:
        raise SystemExit(
            'input and output format are the same, not doing anything')

    if opts.input_format == 'tsv' and opts.output_format == 'mm':
        # fast path: we can do this without loading the data
        tsv2mtx(opts.input, opts.output)
    else:
        matrix = load_sparse_matrix(opts.input_format, opts.input)
        save_sparse_matrix(matrix, opts.output_format, opts.output)
import pandas as pd import numpy as np import random from mrec import load_sparse_matrix, load_recommender from in_store_dict import stores train = load_sparse_matrix('tsv','../data/PATH_TO_DATA_USED_TO_TRAIN_FINAL_MODEL') model = load_recommender('../../../mrec/PATH_TO_FINAL_MODEL') next_usr_num = 382,716 # -> load in users to predict and make into mrec format: # item id == label encoded, # user id == new numbers starting at next_usr_num (add new user code to label encoded dict), # call this table to_predict cold_starters = ['BIG BASS WHEEL', 'SUPER SHOT', 'WIZARD OF OZ 6 PLAYER PUSHER'] counts = to_predict.groupby('user').count().sort('item') def predict_one_user(user, store): if counts.ix[user] < 3: i = 0 game = random.choice(cold_starters) while game not in stores[game] and i < 1000: game = random.choice(cold_starters) i += 1 if store in stores[game]: return game else:
def run(self, view, model, input_format, trainfile, num_engines, simsdir,
        overwrite, max_sims, simsfile, modelfile):
    """Compute item similarities in parallel and assemble the final model.

    Distributes similarity tasks over ipython engines (sequentially when
    num_engines == 0); on full success, concatenates the partial output
    files into simsfile, loads it into `model` and saves the recommender.
    """
    logging.info('finding number of items...')
    # only the matrix dimensions are needed; release the data right away
    dataset = load_sparse_matrix(input_format, trainfile)
    num_users, num_items = dataset.shape
    del dataset
    logging.info('%d users and %d items', num_users, num_items)
    logging.info('creating sims directory {0}...'.format(simsdir))
    subprocess.check_call(['mkdir', '-p', simsdir])
    done = []
    if not overwrite:
        # resume support: skip ranges whose output files already exist
        logging.info('checking for existing output sims...')
        done.extend(self.find_done(simsdir))
        if done:
            logging.info('found {0} output files'.format(len(done)))
    logging.info('creating tasks...')
    tasks = self.create_tasks(model, input_format, trainfile, simsdir,
                              num_items, num_engines, max_sims, done)
    if num_engines > 0:
        logging.info(
            'running %d tasks in parallel across ipython'
            ' engines...', len(tasks))
        async_job = view.map_async(process, tasks, retries=2)
        # wait for tasks to complete
        results = async_job.get()
    else:
        # Sequential run to make it easier for debugging
        logging.info('training similarity model sequentially')
        results = [process(task) for task in tasks]
    logging.info('checking output files...')
    # completion is judged by the files on disk, not by `results`
    done = self.find_done(simsdir)
    remaining = len(tasks) - len(done)
    if remaining == 0:
        logging.info('SUCCESS: all tasks completed')
        logging.info('concatenating {0} partial output files...'.format(
            len(done)))
        paths = [
            os.path.join(simsdir, 'sims.{0}-{1}.tsv'.format(start, end))
            for start, end in done
        ]
        cmd = ['cat'] + paths
        subprocess.check_call(cmd, stdout=open(simsfile, 'w'))
        logging.info('removing partial output files...')
        rmtree(simsdir)
        logging.info('loading %d items in %s model from %s', num_items,
                     type(model).__name__, simsfile)
        model.load_similarity_matrix(simsfile, num_items)
        save_recommender(model, modelfile)
        logging.info('done')
    else:
        # leave partial outputs in place so a rerun can resume
        logging.error(
            'FAILED: {0}/{1} tasks did not complete successfully'.format(
                remaining, len(tasks)))
        logging.error(
            'try rerunning the command to retry the remaining tasks')
def main():
    """CLI: evaluate precomputed recommendations against test splits.

    For each training file matched by the --train glob, finds the paired
    test and recs files via filename_conventions, computes the selected
    metric set and prints an aggregate report.
    """
    import os
    import logging
    import glob
    from optparse import OptionParser
    from collections import defaultdict
    from mrec import load_sparse_matrix
    from mrec.evaluation.metrics import compute_main_metrics, compute_hit_rate
    from mrec.evaluation import Evaluator
    from mrec.evaluation.metrics import print_report
    from filename_conventions import get_testfile, get_recsfile

    logging.basicConfig(level=logging.INFO,
                        format='[%(asctime)s] %(levelname)s: %(message)s')

    parser = OptionParser()
    parser.add_option(
        '--input_format',
        dest='input_format',
        help=
        'format of training dataset(s) tsv | csv | mm (matrixmarket) | fsm (fast_sparse_matrix)'
    )
    parser.add_option(
        '--test_input_format',
        dest='test_input_format',
        default='npz',
        help=
        'format of test dataset(s) tsv | csv | mm (matrixmarket) | npz (numpy binary) (default: %default)'
    )
    parser.add_option(
        '--train',
        dest='train',
        help=
        'glob specifying path(s) to training dataset(s) IMPORTANT: must be in quotes if it includes the * wildcard'
    )
    parser.add_option(
        '--recsdir',
        dest='recsdir',
        help='directory containing tsv files of precomputed recommendations')
    parser.add_option(
        '--metrics',
        dest='metrics',
        default='main',
        help='which set of metrics to compute, main|hitrate (default: %default)'
    )
    parser.add_option(
        '--description',
        dest='description',
        help='description of model which generated the recommendations')

    # maps the --metrics option value to the metric-computing function
    metrics_funcs = {'main': compute_main_metrics, 'hitrate': compute_hit_rate}

    (opts, args) = parser.parse_args()
    if not opts.input_format or not opts.train or not opts.recsdir \
            or opts.metrics not in metrics_funcs:
        parser.print_help()
        raise SystemExit

    opts.train = os.path.abspath(os.path.expanduser(opts.train))
    opts.recsdir = os.path.abspath(os.path.expanduser(opts.recsdir))

    evaluator = Evaluator(metrics_funcs[opts.metrics], max_items=20)

    trainfiles = glob.glob(opts.train)

    # per-metric list of scores, one entry per train/test split
    all_metrics = defaultdict(list)
    for trainfile in trainfiles:
        logging.info('processing {0}...'.format(trainfile))
        testfile = get_testfile(trainfile)
        recsfile = get_recsfile(trainfile, opts.recsdir)
        testdata = load_sparse_matrix(opts.test_input_format,
                                      testfile).tocsr()
        cum_metrics, count = evaluator.process(testdata, recsfile, 0,
                                               testdata.shape[0])
        if cum_metrics is not None:
            for m in cum_metrics:
                # average the cumulative metric over the evaluated users
                all_metrics[m].append(float(cum_metrics[m]) / count)

    print_report([opts.description], [all_metrics])
def get_dataset_size(input_format, datafile):
    """Return (num_rows, num_cols, nnz) for the dataset at datafile."""
    logging.info('loading dataset to get size...')
    matrix = load_sparse_matrix(input_format, datafile)
    rows, cols = matrix.shape
    return rows, cols, matrix.nnz