def loadDataSet(conf, file, bTest=False, binarized=False, threshold=3.0):
    # expects `split` (re.split) and LineConfig to be imported at module level
    trainingData = []
    testData = []
    ratingConfig = LineConfig(conf['ratings.setup'])
    if not bTest:
        print('loading training data...')
    else:
        print('loading test data...')
    with open(file) as f:
        ratings = f.readlines()
    # ignore the headline
    if ratingConfig.contains('-header'):
        ratings = ratings[1:]
    # order of the columns
    order = ratingConfig['-columns'].strip().split()
    delim = ' |,|\t'
    if ratingConfig.contains('-delim'):
        delim = ratingConfig['-delim']
    for lineNo, line in enumerate(ratings):
        items = split(delim, line.strip())
        if not bTest and len(order) < 2:
            print('The rating file is not in a correct format. Error: Line num %d' % lineNo)
            exit(-1)
        try:
            userId = items[int(order[0])]
            itemId = items[int(order[1])]
            if len(order) < 3:
                rating = 1  # default value
            else:
                rating = items[int(order[2])]
            if binarized and len(order) >= 3:
                if float(items[int(order[2])]) < threshold:
                    continue
                rating = 1
        except ValueError:
            print('Error! Have you added the option -header to the rating.setup?')
            exit(-1)
        if not bTest:
            trainingData.append([userId, itemId, float(rating)])
        else:
            # keep only the binarized-positive interactions in the test set,
            # appending each record exactly once
            if binarized and rating != 1:
                continue
            testData.append([userId, itemId, float(rating)])
    if not bTest:
        return trainingData
    return testData
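# Illustrative sketch (not part of the library): how the '-columns' order and the
# default ' |,|\t' delimiter pattern above are applied to one rating line. The sample
# line and column order are hypothetical.
from re import split

sample_line = '1029,50,4.5'
order = ['0', '1', '2']          # user column, item column, rating column
items = split(' |,|\t', sample_line.strip())
userId, itemId = items[int(order[0])], items[int(order[1])]
rating = float(items[int(order[2])]) if len(order) >= 3 else 1.0
print(userId, itemId, rating)    # -> 1029 50 4.5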
def execute(self, config, max_sample=1000):
    # import the algorithm module
    importStr = 'from algorithm.ranking.' + config['recommender'] + ' import ' + config['recommender']
    exec(importStr)
    algo_evaluation = LineConfig(config['evaluation.setup'])
    if algo_evaluation.contains('-ul') and eval(algo_evaluation['-ul']) > 0:
        training_data = 'self.training_user_item'
        social_info = 'relation=self.relation'
    else:
        training_data = 'self.training_account_item'
        social_info = ''
    if config['recommender'].startswith('ABPR'):
        recommender = config['recommender'] + '(config, {}, self.test_user_item, {}, C={}, N={})'. \
            format(training_data, social_info, self.C, self.N)
    else:
        recommender = config['recommender'] + '(config, {}, self.test_user_item, {})'. \
            format(training_data, social_info)
    algorithm = eval(recommender)
    algorithm.accountDAO = self.accountDAO
    algorithm.evaluation_conf = algo_evaluation
    algorithm.get_test_map(K=self.K, L=self.L)
    algorithm.get_test_sample_data(max_sample=max_sample)
    algorithm.execute()
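# Minimal sketch of how the constructor call string above is assembled before eval();
# the recommender name and the C/N values are hypothetical.
recommender_name = 'ABPR_MF'
training_data = 'self.training_user_item'
social_info = 'relation=self.relation'
call = recommender_name + '(config, {}, self.test_user_item, {}, C={}, N={})'.format(
    training_data, social_info, 3, 0)
print(call)
# ABPR_MF(config, self.training_user_item, self.test_user_item, relation=self.relation, C=3, N=0)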
def loadRelationship(conf, filePath):
    socialConfig = LineConfig(conf['social.setup'])
    relation = []
    print('loading social data...')
    with open(filePath) as f:
        relations = f.readlines()
    # ignore the headline
    if socialConfig.contains('-header'):
        relations = relations[1:]
    # order of the columns
    order = socialConfig['-columns'].strip().split()
    if len(order) < 2:
        print('The social file is not in a correct format.')
    for lineNo, line in enumerate(relations):
        items = split(' |,|\t', line.strip())
        if len(order) < 2:
            print('The social file is not in a correct format. Error: Line num %d' % lineNo)
            exit(-1)
        userId1 = items[int(order[0])]
        userId2 = items[int(order[1])]
        if len(order) < 3:
            weight = 1
        else:
            weight = float(items[int(order[2])])
        relation.append([userId1, userId2, weight])
    return relation
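# Minimal sketch (assumption: loadRelationship returns [trustor, trustee, weight]
# triples as built above). Grouping them into an adjacency dict is a common next step;
# the user ids below are hypothetical.
from collections import defaultdict

relation = [['u1', 'u2', 1], ['u1', 'u3', 0.5], ['u2', 'u3', 1]]
followees = defaultdict(dict)
for u, v, w in relation:
    followees[u][v] = w
print(dict(followees))   # {'u1': {'u2': 1, 'u3': 0.5}, 'u2': {'u3': 1}}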
def __init__(self, config_dict, config_account, account_DAO, C=3, K=1, L=-1, N=0):
    self.trainingData = []  # training data
    self.testData = []  # test data
    self.relation = []
    self.measure = []
    self.config_dict = config_dict
    self.C = C
    self.K = K
    self.L = L
    self.N = N
    self.accountDAO = account_DAO
    if config_account.contains('evaluation.setup'):
        all_evaluation = LineConfig(config_account['evaluation.setup'])
        if all_evaluation.contains('--account'):
            self.training_user_item = account_DAO.training_user_item
            self.training_account_item = account_DAO.training_account_item
            self.relation = account_DAO.relation
            self.test_user_item = account_DAO.test_user_item
    else:
        raise Exception('Evaluation is not well configured!')
    print('preprocessing...')
def __init__(self,config): self.trainingData = [] # training data self.testData = [] # testData self.measure = [] self.config =config setup = LineConfig(config['record.setup']) columns = {} labels = setup['-columns'].split(',') delim = '' if setup.contains('-delim'): delim=setup['-delim'] for col in labels: label = col.split(':') columns[label[0]] = int(label[1]) if self.config.contains('evaluation.setup'): self.evaluation = LineConfig(config['evaluation.setup']) binarized = False bottom = 0 if self.evaluation.contains('-b'): binarized = True bottom = float(self.evaluation['-b']) if self.evaluation.contains('-testSet'): #specify testSet self.trainingData = FileIO.loadDataSet(config['record'],columns=columns,binarized=binarized,threshold=bottom,delim=delim) self.testData = FileIO.loadDataSet(self.evaluation['-testSet'],binarized=binarized,columns=columns,threshold=bottom,delim=delim) elif self.evaluation.contains('-ap'): #auto partition self.trainingData = FileIO.loadDataSet(config['record'],columns=columns,binarized=binarized,threshold=bottom,delim=delim) self.trainingData,self.testData = DataSplit.\ dataSplit(self.trainingData,test_ratio=float(self.evaluation['-ap'])) elif self.evaluation.contains('-byTime'): self.trainingData = FileIO.loadDataSet(config['record'], columns=columns, binarized=binarized,threshold=bottom, delim=delim) self.testData = [] elif self.evaluation.contains('-cv'): #cross validation self.trainingData = FileIO.loadDataSet(config['record'],columns=columns,binarized=binarized,threshold=bottom,delim=delim) #self.trainingData,self.testData = DataSplit.crossValidation(self.trainingData,int(self.evaluation['-cv'])) else: print 'Evaluation is not well configured!' exit(-1) # if config.contains('social'): # self.socialConfig = LineConfig(self.config['social.setup']) # self.relation = FileIO.loadRelationship(config,self.config['social']) print 'preprocessing...'
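# Sketch of the '-columns' parsing used above, assuming a record.setup value such as
# '-columns user:0,track:1,time:2' (the concrete labels and indices are hypothetical).
spec = 'user:0,track:1,time:2'
columns = {}
for col in spec.split(','):
    label, index = col.split(':')
    columns[label] = int(index)
print(columns)   # {'user': 0, 'track': 1, 'time': 2}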
def loadDataSet(conf, file, bTest=False): trainingData = defaultdict(dict) testData = defaultdict(dict) ratingConfig = LineConfig(conf['ratings.setup']) if not bTest: print('loading training data...') else: print('loading test data...') with open(file) as f: ratings = f.readlines() # ignore the headline if ratingConfig.contains('-header'): ratings = ratings[1:] # order of the columns order = ratingConfig['-columns'].strip().split() for lineNo, line in enumerate(ratings): items = split(' |,|\t', line.strip()) if not bTest and len(order) < 3: print( 'The rating file is not in a correct format. Error: Line num %d' % lineNo) exit(-1) try: userId = items[int(order[0])] itemId = items[int(order[1])] if bTest and len(order) < 3: rating = 1 #default value else: rating = items[int(order[2])] except ValueError: print( 'Error! Have you added the option -header to the rating.setup?' ) exit(-1) if not bTest: trainingData[userId][itemId] = float(rating) else: testData[userId][itemId] = float(rating) if not bTest: return trainingData else: return testData
class Recommender(object): def __init__(self, conf, trainingSet, testSet, fold='[1]'): self.config = conf self.data = None self.isSaveModel = False self.ranking = None self.isLoadModel = False self.output = None self.isOutput = True self.data = RatingDAO(self.config, trainingSet, testSet) self.foldInfo = fold self.evalSettings = LineConfig(self.config['evaluation.setup']) self.measure = [] self.record = [] if self.evalSettings.contains('-cold'): #evaluation on cold-start users threshold = int(self.evalSettings['-cold']) removedUser = {} for user in self.data.testSet_u: if self.data.trainSet_u.has_key(user) and len( self.data.trainSet_u[user]) > threshold: removedUser[user] = 1 for user in removedUser: del self.data.testSet_u[user] testData = [] for item in self.data.testData: if not removedUser.has_key(item[0]): testData.append(item) self.data.testData = testData self.num_users, self.num_items, self.train_size = self.data.trainingSize( ) def readConfiguration(self): self.algorName = self.config['recommender'] self.output = LineConfig(self.config['output.setup']) self.isOutput = self.output.isMainOn() self.ranking = LineConfig(self.config['item.ranking']) def printAlgorConfig(self): "show algorithm's configuration" print('Algorithm:', self.config['recommender']) print('Ratings dataset:', abspath(self.config['ratings'])) if LineConfig(self.config['evaluation.setup']).contains('-testSet'): print( 'Test set:', abspath( LineConfig(self.config['evaluation.setup']).getOption( '-testSet'))) #print 'Count of the users in training set: ',len() print( 'Training set size: (user count: %d, item count %d, record count: %d)' % (self.data.trainingSize())) print( 'Test set size: (user count: %d, item count %d, record count: %d)' % (self.data.testSize())) print('=' * 80) def initModel(self): pass def buildModel(self): 'build the model (for model-based algorithms )' pass def buildModel_tf(self): 'training model on tensorflow' pass def saveModel(self): pass def loadModel(self): pass def predict(self, u, i): pass def predictForRanking(self, u): pass def checkRatingBoundary(self, prediction): if prediction > self.data.rScale[-1]: return self.data.rScale[-1] elif prediction < self.data.rScale[0]: return self.data.rScale[0] else: return round(prediction, 3) def evalRatings(self): res = [] #used to contain the text of the result res.append('userId itemId original prediction\n') #predict for ind, entry in enumerate(self.data.testData): user, item, rating = entry #predict prediction = self.predict(user, item) #denormalize #prediction = denormalize(prediction,self.data.rScale[-1],self.data.rScale[0]) ##################################### pred = self.checkRatingBoundary(prediction) # add prediction in order to measure self.data.testData[ind].append(pred) res.append(user + ' ' + item + ' ' + str(rating) + ' ' + str(pred) + '\n') currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time())) #output prediction result if self.isOutput: outDir = self.output['-dir'] fileName = self.config[ 'recommender'] + '@' + currentTime + '-rating-predictions' + self.foldInfo + '.txt' FileIO.writeFile(outDir, fileName, res) print('The result has been output to ', abspath(outDir), '.') #output evaluation result outDir = self.output['-dir'] fileName = self.config[ 'recommender'] + '@' + currentTime + '-measure' + self.foldInfo + '.txt' self.measure = Measure.ratingMeasure(self.data.testData) FileIO.writeFile(outDir, fileName, self.measure) print('The result of %s %s:\n%s' % (self.algorName, self.foldInfo, ''.join(self.measure))) def 
evalRanking(self):
        res = []  # used to contain the text of the result
        if self.ranking.contains('-topN'):
            top = self.ranking['-topN'].split(',')
            top = [int(num) for num in top]
            N = int(top[-1])
            if N > 100 or N < 0:
                print('N can not be larger than 100! It has been reassigned with 10')
                N = 10
            if N > len(self.data.item):
                N = len(self.data.item)
        else:
            print('No correct evaluation metric is specified!')
            exit(-1)
        res.append('userId: recommendations in (itemId, ranking score) pairs, * means the item matches.\n')
        # predict
        recList = {}
        userN = {}
        userCount = len(self.data.testSet_u)
        # rawRes = {}
        for i, user in enumerate(self.data.testSet_u):
            itemSet = {}
            line = user + ':'
            predictedItems = self.predictForRanking(user)
            # predictedItems = denormalize(predictedItems, self.data.rScale[-1], self.data.rScale[0])
            for id, rating in enumerate(predictedItems):
                # add prediction in order to measure
                itemSet[self.data.id2item[id]] = rating
            ratedList, ratingList = self.data.userRated(user)
            for item in ratedList:
                del itemSet[item]
            Nrecommendations = []
            for item in itemSet:
                if len(Nrecommendations) < N:
                    Nrecommendations.append((item, itemSet[item]))
                else:
                    break
            Nrecommendations.sort(key=lambda d: d[1], reverse=True)
            recommendations = [item[1] for item in Nrecommendations]
            resNames = [item[0] for item in Nrecommendations]
            # find the N biggest scores
            for item in itemSet:
                ind = N
                l = 0
                r = N - 1
                if recommendations[r] < itemSet[item]:
                    while r >= l:
                        mid = (r - l) // 2 + l  # integer division (was float division, which breaks indexing)
                        if recommendations[mid] >= itemSet[item]:
                            l = mid + 1
                        elif recommendations[mid] < itemSet[item]:
                            r = mid - 1
                        if r < l:
                            ind = r
                            break
                # move the items backwards
                if ind < N - 2:
                    recommendations[ind + 2:] = recommendations[ind + 1:-1]
                    resNames[ind + 2:] = resNames[ind + 1:-1]
                if ind < N - 1:
                    recommendations[ind + 1] = itemSet[item]
                    resNames[ind + 1] = item
            recList[user] = list(zip(resNames, recommendations))  # materialize so it can be printed and measured
            if i % 100 == 0:
                print(self.algorName, self.foldInfo, 'progress:' + str(i) + '/' + str(userCount))
            for item in recList[user]:
                line += ' (' + item[0] + ',' + str(item[1]) + ')'
                if item[0] in self.data.testSet_u[user]:
                    line += '*'
            line += '\n'
            res.append(line)
        currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time()))
        # output prediction result
        if self.isOutput:
            outDir = self.output['-dir']
            fileName = self.config['recommender'] + '@' + currentTime + '-top-' + str(N) + 'items' + self.foldInfo + '.txt'
            FileIO.writeFile(outDir, fileName, res)
            print('The result has been output to ', abspath(outDir), '.')
        # output evaluation result
        outDir = self.output['-dir']
        fileName = self.config['recommender'] + '@' + currentTime + '-measure' + self.foldInfo + '.txt'
        self.measure = Measure.rankingMeasure(self.data.testSet_u, recList, top)
        FileIO.writeFile(outDir, fileName, self.measure)
        print('The result of %s %s:\n%s' % (self.algorName, self.foldInfo, ''.join(self.measure)))

    def execute(self):
        self.readConfiguration()
        if self.foldInfo == '[1]':
            self.printAlgorConfig()
        # load model from disk or build model
        if self.isLoadModel:
            print('Loading model %s...' % (self.foldInfo))
            self.loadModel()
        else:
            print('Initializing model %s...' % (self.foldInfo))
            self.initModel()
            print('Building Model %s...' % (self.foldInfo))
            try:
                import tensorflow
                if self.evalSettings.contains('-tf'):
                    self.buildModel_tf()
                else:
                    self.buildModel()
            except ImportError:
                self.buildModel()
        # predict the ratings or item ranking
        print('Predicting %s...' % (self.foldInfo))
        if self.ranking.isMainOn():
            self.evalRanking()
        else:
            self.evalRatings()
        # save model
        if self.isSaveModel:
            print('Saving model %s...' % (self.foldInfo))
            self.saveModel()
        # with open(self.foldInfo+'measure.txt','w') as f:
        #     f.writelines(self.record)
        return self.measure
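# Standalone sketch: the sorted-insertion loop in evalRanking above keeps the N
# highest-scored items; heapq.nlargest is a simpler equivalent for that selection step
# (named here as an alternative, not the library's implementation). Scores are made up.
import heapq

itemSet = {'i1': 0.9, 'i2': 0.4, 'i3': 0.7, 'i4': 0.95, 'i5': 0.1}
N = 3
topN = heapq.nlargest(N, itemSet.items(), key=lambda kv: kv[1])
print(topN)   # [('i4', 0.95), ('i1', 0.9), ('i3', 0.7)]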
class RecQ(object): def __init__(self, config): self.trainingData = [] # training data self.testData = [] # testData self.relation = [] self.measure = [] self.config = config self.ratingConfig = LineConfig(config['ratings.setup']) if self.config.contains('evaluation.setup'): self.evaluation = LineConfig(config['evaluation.setup']) binarized = False bottom = 0 if self.evaluation.contains('-b'): binarized = True bottom = float(self.evaluation['-b']) if self.evaluation.contains('-testSet'): #specify testSet self.trainingData = FileIO.loadDataSet(config, config['ratings'], binarized=binarized, threshold=bottom) self.testData = FileIO.loadDataSet(config, self.evaluation['-testSet'], bTest=True, binarized=binarized, threshold=bottom) elif self.evaluation.contains('-ap'): #auto partition self.trainingData = FileIO.loadDataSet(config, config['ratings'], binarized=binarized, threshold=bottom) self.trainingData,self.testData = DataSplit.\ dataSplit(self.trainingData,test_ratio=float(self.evaluation['-ap']),binarized=binarized) elif self.evaluation.contains('-cv'): #cross validation self.trainingData = FileIO.loadDataSet(config, config['ratings'], binarized=binarized, threshold=bottom) #self.trainingData,self.testData = DataSplit.crossValidation(self.trainingData,int(self.evaluation['-cv'])) else: print('Evaluation is not well configured!') exit(-1) if config.contains('social'): self.socialConfig = LineConfig(self.config['social.setup']) self.relation = FileIO.loadRelationship(config, self.config['social']) print('preprocessing...') def execute(self): #import the algorithm module try: importStr = 'from algorithm.rating.' + self.config[ 'recommender'] + ' import ' + self.config['recommender'] exec(importStr) except ImportError: importStr = 'from algorithm.ranking.' 
+ self.config['recommender'] + ' import ' + self.config['recommender']
            exec(importStr)
        if self.evaluation.contains('-cv'):
            k = int(self.evaluation['-cv'])
            if k <= 1 or k > 10:
                k = 3
            mkl.set_num_threads(max(1, mkl.get_max_threads() // k))  # thread count must be an integer
            # create the manager
            manager = Manager()
            m = manager.dict()
            i = 1
            tasks = []
            binarized = False
            if self.evaluation.contains('-b'):
                binarized = True
            for train, test in DataSplit.crossValidation(self.trainingData, k, binarized=binarized):
                fold = '[' + str(i) + ']'
                if self.config.contains('social'):
                    recommender = self.config['recommender'] + "(self.config,train,test,self.relation,fold)"
                else:
                    recommender = self.config['recommender'] + "(self.config,train,test,fold)"
                # create the process
                p = Process(target=run, args=(m, eval(recommender), i))
                tasks.append(p)
                i += 1
            # start the processes
            for p in tasks:
                p.start()
                if not self.evaluation.contains('-p'):
                    p.join()
            # wait until all processes are completed
            if self.evaluation.contains('-p'):
                for p in tasks:
                    p.join()
            # compute the mean error of k-fold cross validation
            self.measure = [dict(m)[i] for i in range(1, k + 1)]
            res = []
            for i in range(len(self.measure[0])):
                if self.measure[0][i][:3] == 'Top':
                    res.append(self.measure[0][i])
                    continue
                measure = self.measure[0][i].split(':')[0]
                total = 0
                for j in range(k):
                    total += float(self.measure[j][i].split(':')[1])
                res.append(measure + ':' + str(total / k) + '\n')
            # output result
            currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time()))
            outDir = LineConfig(self.config['output.setup'])['-dir']
            fileName = self.config['recommender'] + '@' + currentTime + '-' + str(k) + '-fold-cv' + '.txt'
            FileIO.writeFile(outDir, fileName, res)
            print('The result of %d-fold cross validation:\n%s' % (k, ''.join(res)))
        else:
            if self.config.contains('social'):
                recommender = self.config['recommender'] + '(self.config,self.trainingData,self.testData,self.relation)'
            else:
                recommender = self.config['recommender'] + '(self.config,self.trainingData,self.testData)'
            eval(recommender).execute()
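# Standalone sketch of the k-fold averaging step above: each fold reports lines like
# 'MAE:0.72\n', and the mean per metric is taken across folds. The values are made up.
fold_measures = [['MAE:0.72\n', 'RMSE:0.95\n'],
                 ['MAE:0.70\n', 'RMSE:0.93\n'],
                 ['MAE:0.74\n', 'RMSE:0.97\n']]
k = len(fold_measures)
res = []
for i in range(len(fold_measures[0])):
    name = fold_measures[0][i].split(':')[0]
    total = sum(float(fold[i].split(':')[1]) for fold in fold_measures)
    res.append(name + ':' + str(total / k) + '\n')
print(''.join(res))   # MAE:0.72 / RMSE:0.95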
class Recommender(object): def __init__(self, conf, trainingSet, testSet, fold='[1]'): self.currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time())) self.config = conf self.data = None self.isSaveModel = False self.ranking = None self.isLoadModel = False self.output = None self.isOutput = True self.data = RatingDAO(self.config, trainingSet, testSet) self.foldInfo = fold self.evalSettings = LineConfig(self.config['evaluation.setup']) self.measure = [] self.record = [] if self.evalSettings.contains('-cold'): # evaluation on cold-start users threshold = int(self.evalSettings['-cold']) removedUser = {} for user in self.data.testSet_u: if user in self.data.trainSet_u and len( self.data.trainSet_u[user]) > threshold: removedUser[user] = 1 for user in removedUser: del self.data.testSet_u[user] testData = [] for item in self.data.testData: if item[0] not in removedUser: testData.append(item) self.data.testData = testData self.num_users, self.num_items, self.train_size = self.data.trainingSize( ) def get_test_sample_data(self, max_sample=1000): testSample = {} keys = list(self.data.testSet_u.keys()) if len(self.data.testSet_u) <= max_sample: testSample = self.data.testSet_u else: while True: if len(testSample) == max_sample: break index = np.random.choice(len(self.data.testSet_u)) user = keys[index] testSample[user] = self.data.testSet_u[user] self.testSample = testSample def get_test_map(self, K=1, L=-1): self.K = K self.L = L if not hasattr(self, 'accountDAO') or self.accountDAO is None: self.map_from_true_to_identify = { i: i for i in list(self.data.testSet_u.keys()) } elif self.evaluation_conf.contains('-ul') and eval( self.evaluation_conf['-ul']) > 0: self.map_from_true_to_identify = self.get_map_from_true_to_identify( k=K, index=L) else: self.map_from_true_to_identify = self.accountDAO.map_from_user_to_account def readConfiguration(self): self.algorName = self.config['recommender'] self.output = LineConfig(self.config['output.setup']) self.isOutput = self.output.isMainOn() self.ranking = LineConfig(self.config['item.ranking']) def printAlgorConfig(self): "show algorithm's configuration" print('Algorithm:', self.config['recommender']) print('Ratings dataset:', abspath(self.config['ratings'])) if LineConfig(self.config['evaluation.setup']).contains('-testSet'): print( 'Test set:', abspath( LineConfig(self.config['evaluation.setup']).getOption( '-testSet'))) # print 'Count of the users in training set: ',len() print( 'Training set size: (user count: %d, item count %d, record count: %d)' % (self.data.trainingSize())) print( 'Test set size: (user count: %d, item count %d, record count: %d)' % (self.data.testSize())) print('=' * 80) def initModel(self): pass def buildModel(self): 'build the model (for model-based algorithms )' pass def buildModel_tf(self): 'training model on tensorflow' pass def saveModel(self): pass def loadModel(self): pass def predict(self, u, i): pass def predictForRanking(self, u): pass def checkRatingBoundary(self, prediction): if prediction > self.data.rScale[-1]: return self.data.rScale[-1] elif prediction < self.data.rScale[0]: return self.data.rScale[0] else: return round(prediction, 3) def evalRatings(self): res = [] # used to contain the text of the result res.append('userId itemId original prediction\n') # predict for ind, entry in enumerate(self.data.testData): user, item, rating = entry # predict prediction = self.predict(user, item) # denormalize # prediction = denormalize(prediction,self.data.rScale[-1],self.data.rScale[0]) 
##################################### pred = self.checkRatingBoundary(prediction) # add prediction in order to measure self.data.testData[ind].append(pred) res.append(user + ' ' + item + ' ' + str(rating) + ' ' + str(pred) + '\n') currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time())) # output prediction result if self.isOutput: outDir = self.output['-dir'] fileName = self.config[ 'recommender'] + '@' + currentTime + '-rating-predictions' + self.foldInfo + '.txt' FileIO.writeFile(outDir, fileName, res) print('The result has been output to ', abspath(outDir), '.') # output evaluation result outDir = self.output['-dir'] fileName = self.config[ 'recommender'] + '@' + currentTime + '-measure' + self.foldInfo + '.txt' self.measure = Measure.ratingMeasure(self.data.testData) FileIO.writeFile(outDir, fileName, self.measure) print('The result of %s %s:\n%s' % (self.algorName, self.foldInfo, ''.join(self.measure))) def get_map_from_true_to_identify(self, k=1, index=-1): map_from_true_to_identify = {} table = self.accountDAO.test_table[self.accountDAO.test_table.k == k] reserve_list = [ ind for ind, users in enumerate(table['identify_user']) if len(users) ] table = table.iloc[reserve_list].copy() table['identify_user_index'] = [ i_list[index] if len(i_list) and len(i_list) >= index + 1 else None for i_list in table['identify_user'] ] # table['identify_user_index'].astype(int) # table.groupby identify_list = table.groupby( 'truth_user')['identify_user_index'].aggregate(list) for truth, idens in identify_list.items(): i_users, counts = np.unique(np.array(idens)[np.array(idens) > 0], return_counts=True) if len(i_users) == 0: continue map_from_true_to_identify[truth] = i_users[np.argmax(counts)] # identification_result = dict(zip(table['truth_user'].to_list(), table['identify_user'].to_list())) # for key, value in identification_result.items(): # if len(value) and len(value) >= index + 1: # try: # map_from_true_to_identify[key] = value[index] # except: # print(key, value) # map_from_true_to_identify[key] = value[index] return map_from_true_to_identify def get_recommendation(self, data_user, N): user, identified_user, testSample_user = data_user itemSet = {} line = str(user) + ':' predictedItems = self.predictForRanking(identified_user) for id, rating in enumerate(predictedItems): itemSet[self.data.id2item[id]] = rating # if not hasattr(self, 'accountDAO') or self.accountDAO is None: # ratedList, ratingList = self.data.userRated(user) # else: # ratedList = list(self.accountDAO.ground_visit[user].keys()) # for item in ratedList: # del itemSet[item] Nrecommendations = [] for item in itemSet: if len(Nrecommendations) < N: Nrecommendations.append((item, itemSet[item])) else: break # Nrecommendations = list(itemSet.items())[:N] Nrecommendations.sort(key=lambda d: d[1], reverse=True) recommendations = [item[1] for item in Nrecommendations] resNames = [item[0] for item in Nrecommendations] # find the N biggest scores for item in itemSet: ind = N l = 0 r = N - 1 if recommendations[r] < itemSet[item]: while r >= l: mid = (r - l) // 2 + l if recommendations[mid] >= itemSet[item]: l = mid + 1 elif recommendations[mid] < itemSet[item]: r = mid - 1 if r < l: ind = r break # move the items backwards if ind < N - 2: recommendations[ind + 2:] = recommendations[ind + 1:-1] resNames[ind + 2:] = resNames[ind + 1:-1] if ind < N - 1: recommendations[ind + 1] = itemSet[item] resNames[ind + 1] = item # recList[user] = list(zip(resNames, recommendations)) # recList[user] = list(itemSet_sorted.items())[:N] recList_user = 
list(zip(resNames, recommendations)) for item in recList_user: line += ' (' + str(item[0]) + ',' + str(item[1]) + ')' if item[0] in testSample_user: line += '*' line += '\n' return user, line, recList_user def evalRanking(self, write_to_file=True, use_now_time=False): res = [] # used to contain the text of the result if self.ranking.contains('-topN'): top = self.ranking['-topN'].split(',') top = [int(num) for num in top] N = max(top) if N > 100 or N < 0: print( 'N can not be larger than 100! It has been reassigned with 10' ) N = 10 if N > len(self.data.item): N = len(self.data.item) else: print('No correct evaluation metric is specified!') exit(-1) res.append( 'userId: recommendations in (itemId, ranking score) pairs, * means the item matches.\n' ) # predict recList = {} userN = {} testSample = self.testSample # # multiprocessing way # pool = Pool(12) # dataset = [] # for user, testSample_u in testSample.items(): # identified_user = self.map_from_true_to_identify.get(user, -1) # if identified_user == -1: # continue # dataset.append([user, identified_user, testSample_u]) # # result_generator = pool.imap_unordered(partial(self.get_recommendation, N=N), dataset) # for result in tqdm(result_generator, total=len(dataset), desc='Measuring [{}]'): # user, line, recList_user = result # recList[user] = recList_user # res.append(line) # pool.close() # pool.join() testSample_copy = testSample.copy() for i, user in tqdm(enumerate(testSample), total=len(testSample), desc='Measuring [{}]'.format(self.algorName)): identified_user = self.map_from_true_to_identify.get(user, -1) if identified_user == -1: del testSample_copy[user] continue user, line, recList_user = self.get_recommendation( (user, identified_user, testSample[user]), N) recList[user] = recList_user res.append(line) self.measure = Measure.rankingMeasure(testSample_copy, recList, top) try: self.measure.append("C:{}\n".format(self.C)) except: pass try: self.measure.append("L:{}\n".format(self.L)) except: pass try: self.measure.append("K:{}\n".format(self.K)) except: pass try: self.measure.append("N:{}\n".format(self.N)) except: pass if use_now_time: currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time())) else: currentTime = self.currentTime if write_to_file: # output prediction result if False and self.isOutput: fileName = '' outDir = self.output['-dir'] fileName = self.config[ 'recommender'] + '@' + currentTime + '-top-' + str( N) + 'items' + self.foldInfo + '.txt' FileIO.writeFile(outDir, fileName, res) # output evaluation result outDir = self.output['-dir'] try: fileName = self.config[ 'recommender'] + '@' + currentTime + '-measure' + self.foldInfo + '_C{}'.format( self.C) + '.txt' except: fileName = self.config[ 'recommender'] + '@' + currentTime + '-measure' + self.foldInfo + '.txt' FileIO.writeFile(outDir, fileName, self.measure) # FileIO.writeFile(outDir, fileName, "C:{}".format(self.C)) print('The result has been output to ', abspath(outDir), '.') print('The result of %s %s:\n%s' % (self.algorName, self.foldInfo, ''.join(self.measure))) def execute(self): self.readConfiguration() if self.foldInfo == '[1]': self.printAlgorConfig() # load model from disk or build model if self.isLoadModel: print('Loading model %s...' % (self.foldInfo)) self.loadModel() else: print('Initializing model %s...' % (self.foldInfo)) self.initModel() print('Building Model %s...' 
% (self.foldInfo))
            try:
                import tensorflow
                if self.evalSettings.contains('-tf'):
                    self.buildModel_tf()
                else:
                    self.buildModel()
            except ImportError:
                self.buildModel()
        # predict the ratings or item ranking
        print('Predicting %s...' % (self.foldInfo))
        if self.ranking.isMainOn():
            self.evalRanking()
        else:
            self.evalRatings()
        # save model
        if self.isSaveModel:
            print('Saving model %s...' % (self.foldInfo))
            self.saveModel()
        # with open(self.foldInfo+'measure.txt','w') as f:
        #     f.writelines(self.record)
        return self.measure
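# Standalone sketch of get_test_sample_data above: test users are drawn at random
# until max_sample distinct users are collected (duplicates simply overwrite).
# The users and items are hypothetical.
import numpy as np

testSet_u = {'u%d' % i: {'t1': 1} for i in range(50)}
max_sample = 10
keys = list(testSet_u.keys())
testSample = {}
while len(testSample) < max_sample:
    user = keys[np.random.choice(len(testSet_u))]
    testSample[user] = testSet_u[user]
print(len(testSample))   # 10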
class RatingDAO(object): 'data access control' def __init__(self, config): self.config = config self.ratingConfig = LineConfig(config['ratings.setup']) self.evaluation = LineConfig(config['evaluation.setup']) self.user = {} #used to store the order of users self.item = {} #used to store the order of items self.userMeans = {} #used to store the mean values of users's ratings self.itemMeans = {} #used to store the mean values of items's ratings self.triple = [] #training data self.globalMean = 0 self.timestamp = {} self.ratingMatrix = None self.trainingMatrix = None self.validationMatrix = None self.testSet_u = None # used to store the test set by hierarchy user:[item,rating] self.testSet_i = None # used to store the test set by hierarchy item:[user,rating] self.rScale = [-9999999, 999999] if self.evaluation.contains('-testSet'): #specify testSet self.trainingMatrix = self.__loadRatings(config['ratings']) self.testSet_u, self.testSet_i = self.__loadRatings( self.evaluation['-testSet'], True) else: #cross validation and leave-one-out self.ratingMatrix = self.__loadRatings(config['ratings']) self.__computeItemMean() self.__computeUserMean() self.__globalAverage() def __loadRatings(self, file, bTest=False): if not bTest: print 'load training data...' else: print 'load test data...' with open(file) as f: ratings = f.readlines() #ignore the headline if self.ratingConfig.contains('-header'): ratings = ratings[1:] #order of the columns order = self.ratingConfig['-columns'].strip().split() #split data #userList= [] u_i_r = {} i_u_r = {} triple = [] #find the maximum rating and minimum value for lineNo, line in enumerate(ratings): items = split(' |,|\t', line.strip()) if len(order) < 3: print 'The rating file is not in a correct format. Error: Line num %d' % lineNo exit(-1) userId = items[int(order[0])] itemId = items[int(order[1])] rating = items[int(order[2])] if float(rating) > self.rScale[0]: self.rScale[0] = float(rating) if float(rating) < self.rScale[1]: self.rScale[1] = float(rating) for lineNo, line in enumerate(ratings): items = split(' |,|\t', line.strip()) if len(order) < 3: print 'The rating file is not in a correct format. Error: Line num %d' % lineNo exit(-1) userId = items[int(order[0])] itemId = items[int(order[1])] rating = items[int(order[2])] #makes the rating within the range [0, 1]. 
normRating = normalize(float(rating), self.rScale[0], self.rScale[1]) #order the user if not self.user.has_key(userId): self.user[userId] = len(self.user) #order the item if not self.item.has_key(itemId): self.item[itemId] = len(self.item) if not u_i_r.has_key(userId): u_i_r[userId] = [] #userList.append(userId) u_i_r[userId].append([itemId, float(rating)]) if not i_u_r.has_key(itemId): i_u_r[itemId] = [] i_u_r[itemId].append([userId, float(rating)]) if not bTest: self.triple.append([userId, itemId, normRating]) triple.append( [self.user[userId], self.item[itemId], normRating]) if not bTest: #contruct the sparse matrix # data=[] # indices=[] # indptr=[] # offset = 0 # for uid in userList: # uRating = [r[1] for r in u_i_r[uid]] # uColunms = [self.item[r[0]] for r in u_i_r[uid]] # data += uRating # indices += uColunms # indptr .append(offset) # offset += len(uRating) # indptr.append(offset) # return sparseMatrix.SparseMatrix(data, indices, indptr) return new_sparseMatrix.SparseMatrix(triple) else: # return testSet return u_i_r, i_u_r def __globalAverage(self): total = sum(self.userMeans.values()) if total == 0: self.globalMean = 0 else: self.globalMean = total / len(self.userMeans) def __computeUserMean(self): for u in self.user: n = self.row(u) > 0 mean = 0 if not self.containsUser( u): # no data about current user in training set pass else: sum = float(self.row(u)[0].sum()) try: mean = sum / n[0].sum() except ZeroDivisionError: mean = 0 self.userMeans[u] = mean def __computeItemMean(self): for c in self.item: n = self.col(c) > 0 mean = 0 if not self.containsItem( c): # no data about current user in training set pass else: sum = float(self.col(c)[0].sum()) try: mean = sum / n[0].sum() except ZeroDivisionError: mean = 0 self.itemMeans[c] = mean def getUserId(self, u): if self.user.has_key(u): return self.user[u] else: return -1 def getItemId(self, i): if self.item.has_key(i): return self.item[i] else: return -1 def trainingSize(self): return self.trainingMatrix.size def testSize(self): return (len(self.testSet_u), len(self.testSet_i)) def contains(self, u, i): 'whether user u rated item i' return self.trainingMatrix.contains(self.getUserId(u), self.getItemId(i)) def containsUser(self, u): 'whether user is in training set' return self.trainingMatrix.matrix_User.has_key(self.getUserId(u)) def containsItem(self, i): 'whether item is in training set' return self.trainingMatrix.matrix_Item.has_key(self.getItemId(i)) def userRated(self, u): if self.trainingMatrix.matrix_User.has_key(self.getUserId(u)): userIndex = self.trainingMatrix.matrix_User[self.user[u]].keys() rating = self.trainingMatrix.matrix_User[self.user[u]].values() return (userIndex, rating) return ([], []) def itemRated(self, i): if self.trainingMatrix.matrix_Item.has_key(self.getItemId(i)): itemIndex = self.trainingMatrix.matrix_Item[self.item[i]].keys() rating = self.trainingMatrix.matrix_Item[self.item[i]].values() return (itemIndex, rating) return ([], []) def row(self, u): return self.trainingMatrix.row(self.getUserId(u)) def col(self, c): return self.trainingMatrix.col(self.getItemId(c)) def rating(self, u, c): return self.trainingMatrix.elem(self.getUserId(u), self.getItemId(c)) def ratingScale(self): return (self.rScale[0], self.rScale[1]) def elemCount(self): return self.trainingMatrix.elemCount()
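# The normalize()/denormalize() helpers used above are not shown in this section; a
# standard min-max mapping consistent with the "within the range [0, 1]" comment and
# the (rating, max, min) argument order would look like this sketch.
def normalize(rating, max_r, min_r):
    # maps min_r -> 0.0 and max_r -> 1.0
    return (rating - min_r) / (max_r - min_r) if max_r != min_r else 0.0

def denormalize(norm, max_r, min_r):
    return norm * (max_r - min_r) + min_r

print(normalize(4.0, 5.0, 1.0))      # 0.75
print(denormalize(0.75, 5.0, 1.0))   # 4.0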
class ratingDAO(object): 'data access control' def __init__(self, config): self.config = config self.ratingConfig = LineConfig(config['ratings.setup']) self.evaluation = LineConfig(config['evaluation']) self.user = {} self.item = {} self.timestamp = {} self.ratingMatrix = None self.trainingMatrix = None self.validationMatrix = None self.testSet_u = None # used to store the test set by hierarchy user:[item,rating] self.testSet_i = None # used to store the test set by hierarchy item:[user,rating] self.rScale = [-9999999, 999999] if self.evaluation.contains('-testSet'): #specify testSet self.trainingMatrix = self.loadRatings(config['ratings']) self.testSet_u, self.testSet_i = self.loadRatings( self.evaluation['-testSet'], True) else: #cross validation and leave-one-out self.ratingMatrix = self.loadRatings(config['ratings']) def loadRatings(self, file, bTest=False): with open(file) as f: ratings = f.readlines() #ignore the headline if self.ratingConfig.contains('-header'): ratings = ratings[1:] #set delimiter delimiter = ' ' if self.ratingConfig.contains('-d'): delimiter = self.ratingConfig['-d'] #order of the columns order = self.ratingConfig['-columns'].strip().split() #split data userList = [] u_i_r = {} i_u_r = {} triple = [] for line in ratings: items = line.strip().split(delimiter) userId = items[int(order[0])] itemId = items[int(order[1])] rating = items[int(order[2])] if float(rating) > self.rScale[0]: self.rScale[0] = float(rating) if float(rating) < self.rScale[1]: self.rScale[1] = float(rating) #order the user if not self.user.has_key(userId): self.user[userId] = len(self.user) #order the item if not self.item.has_key(itemId): self.item[itemId] = len(self.item) if not u_i_r.has_key(userId): u_i_r[userId] = [] userList.append(userId) u_i_r[userId].append([itemId, float(rating)]) if not i_u_r.has_key(itemId): i_u_r[itemId] = [] i_u_r[itemId].append([userId, float(rating)]) triple.append( [self.user[userId], self.item[itemId], float(rating)]) if not bTest: #contruct the sparse matrix # data=[] # indices=[] # indptr=[] # offset = 0 # for uid in userList: # uRating = [r[1] for r in u_i_r[uid]] # uColunms = [self.item[r[0]] for r in u_i_r[uid]] # data += uRating # indices += uColunms # indptr .append(offset) # offset += len(uRating) # indptr.append(offset) # return sparseMatrix.SparseMatrix(data, indices, indptr) return new_sparseMatrix.SparseMatrix( triple, (len(self.user), len(self.item))) else: # return testSet return u_i_r, i_u_r def row(self, u): return self.trainingMatrix.row(self.user[u]) def col(self, c): return self.trainingMatrix.col(self.item[c]) def rating(self, u, c): return self.trainingMatrix.elem(self.user[u], self.item[c]) def ratingScale(self): return (self.rScale[0], self.rScale[1])
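# The new_sparseMatrix.SparseMatrix class used above is project-specific; scipy's
# coo_matrix illustrates the same (user index, item index, rating) triple-to-matrix
# construction. The triples below are hypothetical.
from scipy.sparse import coo_matrix

triple = [[0, 0, 4.0], [0, 2, 3.5], [1, 1, 5.0]]
rows, cols, vals = zip(*triple)
m = coo_matrix((vals, (rows, cols)), shape=(2, 3))
print(m.toarray())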
class Record(object): 'data access control' def __init__(self, config, trainingSet, testSet): self.config = config self.recordConfig = LineConfig(config['record.setup']) self.evalConfig = LineConfig(config['evaluation.setup']) self.name2id = defaultdict(dict) self.id2name = defaultdict(dict) self.listened = {} self.listened['artist'] = defaultdict(dict) self.listened['track'] = defaultdict(dict) self.listened['album'] = defaultdict(dict) self.artist2Album = defaultdict( dict) #key:artist id, value:{album id1:1, album id2:1 ...} self.album2Track = defaultdict(dict) # self.artist2Track = defaultdict(dict) # self.userRecord = defaultdict( list) #user data in training set. form: {user:[record1,record2]} self.testSet = defaultdict( dict ) #user data in test set. form: {user:{recommenedObject1:1,recommendedObject:1}} self.recordCount = 0 self.columns = {} labels = self.recordConfig['-columns'].split(',') for col in labels: label = col.split(':') self.columns[label[0]] = int(label[1]) if self.evalConfig.contains('-byTime'): trainingSet, testSet = self.splitDataByTime(trainingSet) self.preprocess(trainingSet, testSet) def splitDataByTime(self, dataset): trainingSet = [] testSet = [] ratio = float(self.evalConfig['-byTime']) records = defaultdict(list) for event in dataset: records[event['user']].append(event) for user in records: orderedList = sorted(records[user], key=lambda d: d['time']) training = orderedList[0:int(len(orderedList) * (1 - ratio))] test = orderedList[int(len(orderedList) * (1 - ratio)):] trainingSet += training testSet += test return trainingSet, testSet def preprocess(self, trainingSet, testSet): for entry in trainingSet: self.recordCount += 1 for key in entry: if key != 'time': if not self.name2id[key].has_key(entry[key]): self.name2id[key][entry[key]] = len(self.name2id[key]) self.id2name[key][len(self.id2name[key])] = entry[key] if key == 'user': self.userRecord[entry['user']].append(entry) if entry.has_key('artist'): if not self.listened['artist'][ entry['artist']].has_key(entry[key]): self.listened['artist'][entry['artist']][ entry[key]] = 0 else: self.listened['artist'][entry['artist']][ entry[key]] += 1 if entry.has_key('album'): if not self.listened['album'][entry['album']].has_key( entry[key]): self.listened['album'][entry['album']][ entry[key]] = 0 else: self.listened['album'][entry['album']][ entry[key]] += 1 if entry.has_key('track'): if not self.listened['track'][entry['track']].has_key( entry[key]): self.listened['track'][entry['track']][ entry[key]] = 0 else: self.listened['track'][entry['track']][ entry[key]] += 1 if key == 'artist' and entry.has_key('album'): self.artist2Album[entry[key]][entry['album']] = 1 if key == 'album' and entry.has_key('track'): self.album2Track[entry[key]][entry['track']] = 1 if key == 'artist' and entry.has_key('track'): self.artist2Track[entry[key]][entry['track']] = 1 recType = self.evalConfig['-target'] for entry in testSet: for key in entry: if key != 'time': if not self.name2id[key].has_key(entry[key]): self.name2id[key][entry[key]] = len(self.name2id[key]) self.id2name[key][len(self.id2name[key])] = entry[key] if key == 'user': if entry.has_key(recType): self.testSet[entry['user']][entry[recType]] = 1 def printTrainingSize(self): if self.name2id.has_key('user'): print 'user count:', len(self.name2id['user']) if self.name2id.has_key('artist'): print 'artist count:', len(self.name2id['artist']) if self.name2id.has_key('album'): print 'album count:', len(self.name2id['album']) if self.name2id.has_key('track'): print 'track count:', 
len(self.name2id['track']) print 'Training set size:', self.recordCount def getId(self, obj, t): if self.name2id[t].has_key(obj): return self.name2id[t][obj] else: print 'No ' + t + ' ' + obj + ' exists!' exit(-1) def getSize(self, t): return len(self.name2id[t])
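# Standalone sketch of the per-user time split in splitDataByTime above: each user's
# events are ordered by time and the last `ratio` fraction goes to the test set.
# The events are hypothetical.
from collections import defaultdict

dataset = [{'user': 'u1', 'track': 't1', 'time': 3},
           {'user': 'u1', 'track': 't2', 'time': 1},
           {'user': 'u1', 'track': 't3', 'time': 2}]
ratio = 0.3
records = defaultdict(list)
for event in dataset:
    records[event['user']].append(event)
trainingSet, testSet = [], []
for user in records:
    ordered = sorted(records[user], key=lambda d: d['time'])
    cut = int(len(ordered) * (1 - ratio))
    trainingSet += ordered[:cut]
    testSet += ordered[cut:]
print(len(trainingSet), len(testSet))   # 2 1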
class Recommender(object): def __init__(self, conf, trainingSet=None, testSet=None, fold='[1]'): self.config = conf self.dao = None self.isSaveModel = False self.ranking = None self.isLoadModel = False self.output = None self.isOutput = True self.dao = RatingDAO(self.config, trainingSet, testSet) self.foldInfo = fold self.measure = [] def readConfiguration(self): self.algorName = self.config['recommender'] self.output = LineConfig(self.config['output.setup']) self.isOutput = self.output.isMainOn() self.ranking = LineConfig(self.config['item.ranking']) def printAlgorConfig(self): "show algorithm's configuration" print 'Algorithm:', self.config['recommender'] print 'Ratings dataset:', abspath(self.config['ratings']) if LineConfig(self.config['evaluation.setup']).contains('-testSet'): print 'Test set:', abspath( LineConfig( self.config['evaluation.setup']).getOption('-testSet')) #print 'Count of the users in training set: ',len() print 'Training set size: (user count: %d, item count %d, record count: %d)' % ( self.dao.trainingSize()) print 'Test set size: (user count: %d, item count %d, record count: %d)' % ( self.dao.testSize()) print '=' * 80 def initModel(self): pass def buildModel(self): 'build the model (for model-based algorithms )' pass def saveModel(self): pass def loadModel(self): pass def predict(self, u, i): pass def predictForRanking(self, u): pass def checkRatingBoundary(self, prediction): if prediction > self.dao.rScale[-1]: return self.dao.rScale[-1] elif prediction < self.dao.rScale[0]: return self.dao.rScale[0] else: return round(prediction, 3) def evalRatings(self): res = [] #used to contain the text of the result res.append('userId itemId original prediction\n') #predict for ind, entry in enumerate(self.dao.testData): user, item, rating = entry #predict prediction = self.predict(user, item) #denormalize prediction = denormalize(prediction, self.dao.rScale[-1], self.dao.rScale[0]) ##################################### pred = self.checkRatingBoundary(prediction) # add prediction in order to measure self.dao.testData[ind].append(pred) res.append(user + ' ' + item + ' ' + str(rating) + ' ' + str(pred) + '\n') currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time())) #output prediction result if self.isOutput: outDir = self.output['-dir'] fileName = self.config[ 'recommender'] + '@' + currentTime + '-rating-predictions' + self.foldInfo + '.txt' FileIO.writeFile(outDir, fileName, res) print 'The Result has been output to ', abspath(outDir), '.' #output evaluation result outDir = self.output['-dir'] fileName = self.config[ 'recommender'] + '@' + currentTime + '-measure' + self.foldInfo + '.txt' self.measure = Measure.ratingMeasure(self.dao.testData) FileIO.writeFile(outDir, fileName, self.measure) def evalRanking(self): res = [] # used to contain the text of the result N = 0 threshold = 0 bThres = False bTopN = False if self.ranking.contains('-topN'): bTopN = True N = int(self.ranking['-topN']) if N > 100 or N < 0: print 'N can not be larger than 100! It has been reassigned with 100' N = 100 elif self.ranking.contains('-threshold'): threshold = float(self.ranking['-threshold']) bThres = True else: print 'No correct evaluation metric is specified!' 
exit(-1) res.append( 'userId: recommendations in (itemId, ranking score) pairs, * means the item matches.\n' ) # predict recList = {} userN = {} userCount = len(self.dao.testSet_u) for i, user in enumerate(self.dao.testSet_u): itemSet = {} line = user + ':' for item in self.dao.item: # predict prediction = self.predict(user, item) # denormalize prediction = denormalize(prediction, self.dao.rScale[-1], self.dao.rScale[0]) #prediction = self.checkRatingBoundary(prediction) #pred = self.checkRatingBoundary(prediction) ##################################### # add prediction in order to measure if bThres: if prediction > threshold: itemSet[item] = prediction else: itemSet[item] = prediction ratedList, ratingList = self.dao.userRated(user) for item in ratedList: del itemSet[self.dao.id2item[item]] itemSet = sorted(itemSet.iteritems(), key=lambda d: d[1], reverse=True) if self.ranking.contains('-topN'): recList[user] = itemSet[0:N] elif self.ranking.contains('-threshold'): recList[user] = itemSet[:] userN[user] = len(itemSet) if i % 100 == 0: print self.algorName, self.foldInfo, 'progress:' + str( i) + '/' + str(userCount) for item in recList[user]: line += ' (' + item[0] + ',' + str(item[1]) + ')' if self.dao.testSet_u[user].has_key(item[0]): line += '*' line += '\n' res.append(line) currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time())) # output prediction result if self.isOutput: fileName = '' outDir = self.output['-dir'] if self.ranking.contains('-topN'): fileName = self.config[ 'recommender'] + '@' + currentTime + '-top-' + str( N) + 'items' + self.foldInfo + '.txt' elif self.ranking.contains('-threshold'): fileName = self.config[ 'recommender'] + '@' + currentTime + '-threshold-' + str( threshold) + self.foldInfo + '.txt' FileIO.writeFile(outDir, fileName, res) print 'The Result has been output to ', abspath(outDir), '.' #output evaluation result outDir = self.output['-dir'] fileName = self.config[ 'recommender'] + '@' + currentTime + '-measure' + self.foldInfo + '.txt' if self.ranking.contains('-topN'): self.measure = Measure.rankingMeasure(self.dao.testSet_u, recList, N) elif self.ranking.contains('-threshold'): origin = self.dao.testSet_u.copy() for user in origin: temp = {} for item in origin[user]: if origin[user][item] >= threshold: temp[item] = threshold origin[user] = temp self.measure = Measure.rankingMeasure_threshold( origin, recList, userN) FileIO.writeFile(outDir, fileName, self.measure) def execute(self): self.readConfiguration() if self.foldInfo == '[1]': self.printAlgorConfig() #load model from disk or build model if self.isLoadModel: print 'Loading model %s...' % (self.foldInfo) self.loadModel() else: print 'Initializing model %s...' % (self.foldInfo) self.initModel() print 'Building Model %s...' % (self.foldInfo) self.buildModel() #preict the ratings or item ranking print 'Predicting %s...' % (self.foldInfo) if self.ranking.isMainOn(): self.evalRanking() else: self.evalRatings() #save model if self.isSaveModel: print 'Saving model %s...' 
% (self.foldInfo) self.saveModel() return self.measure def performance(self): #res = [] # used to contain the text of the result #res.append('userId itemId original prediction\n') # predict res = [] for ind, entry in enumerate(self.dao.testData): user, item, rating = entry # predict prediction = self.predict(user, item) # denormalize prediction = denormalize(prediction, self.dao.rScale[-1], self.dao.rScale[0]) ##################################### pred = self.checkRatingBoundary(prediction) # add prediction in order to measure res.append([user, item, rating, pred]) #res.append(user + ' ' + item + ' ' + str(rating) + ' ' + str(pred) + '\n') #currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time())) # output prediction result # if self.isOutput: # outDir = self.output['-dir'] # fileName = self.config['recommender'] + '@' + currentTime + '-rating-predictions' + self.foldInfo + '.txt' # FileIO.writeFile(outDir, fileName, res) # print 'The Result has been output to ', abspath(outDir), '.' # output evaluation result # outDir = self.output['-dir'] # fileName = self.config['recommender'] + '@' + currentTime + '-measure' + self.foldInfo + '.txt' self.measure = Measure.ratingMeasure(res) return self.measure
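# Standalone sketch of the rating clamping done by checkRatingBoundary above; the
# example scale endpoints are hypothetical.
def clamp_rating(prediction, min_r, max_r):
    if prediction > max_r:
        return max_r
    if prediction < min_r:
        return min_r
    return round(prediction, 3)

print(clamp_rating(5.7, 1.0, 5.0))      # 5.0
print(clamp_rating(3.14159, 1.0, 5.0))  # 3.142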
class Recommender(object): def __init__(self, conf, trainingSet=None, testSet=None, fold='[1]'): self.config = conf self.isSaveModel = False self.isLoadModel = False self.isOutput = True self.data = Record(self.config, trainingSet, testSet) self.foldInfo = fold self.evalConfig = LineConfig(self.config['evaluation.setup']) if self.evalConfig.contains('-target'): self.recType = self.evalConfig['-target'] else: self.recType = 'track' if LineConfig(self.config['evaluation.setup']).contains('-cold'): #evaluation on cold-start users threshold = int( LineConfig(self.config['evaluation.setup'])['-cold']) removedUser = {} for user in self.data.testSet: if self.data.userRecord.has_key(user) and len( self.data.userRecord[user]) > threshold: removedUser[user] = 1 for user in removedUser: del self.data.testSet[user] def readConfiguration(self): self.algorName = self.config['recommender'] self.output = LineConfig(self.config['output.setup']) self.isOutput = self.output.isMainOn() self.ranking = LineConfig(self.config['item.ranking']) def printAlgorConfig(self): "show algorithm's configuration" print 'Algorithm:', self.config['recommender'] print 'Training set:', abspath(self.config['record']) if LineConfig(self.config['evaluation.setup']).contains('-testSet'): print 'Test set:', abspath( LineConfig( self.config['evaluation.setup']).getOption('-testSet')) #print 'Count of the users in training set: ',len() self.data.printTrainingSize() print '=' * 80 def initModel(self): pass def buildModel(self): 'build the model (for model-based algorithms )' pass def saveModel(self): pass def loadModel(self): pass def predict(self, user): return [] def evalRanking(self): res = [] # used to contain the text of the result N = 0 threshold = 0 N = int(self.ranking['-topN']) if N > 100 or N < 0: print 'N can not be larger than 100! It has been reassigned with 10' N = 10 res.append( 'userId: recommendations in (itemId, ranking score) pairs, * means the item matches.\n' ) # predict recList = {} userCount = len(self.data.testSet) rawRes = {} for i, user in enumerate(self.data.testSet): itemSet = {} line = user + ':' predictedItems = self.predict(user) recList[user] = predictedItems if i % 100 == 0: print self.algorName, self.foldInfo, 'progress:' + str( i) + '/' + str(userCount) for item in recList[user]: if self.data.testSet[user].has_key(item[0]): line += '*' line += item + ',' line += '\n' res.append(line) currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time())) # output prediction result if self.isOutput: fileName = '' outDir = self.output['-dir'] if self.ranking.contains('-topN'): fileName = self.config[ 'recommender'] + '@' + currentTime + '-top-' + str( N) + 'items' + self.foldInfo + '.txt' elif self.ranking.contains('-threshold'): fileName = self.config[ 'recommender'] + '@' + currentTime + '-threshold-' + str( threshold) + self.foldInfo + '.txt' FileIO.writeFile(outDir, fileName, res) print 'The result has been output to ', abspath(outDir), '.' 
# output evaluation result outDir = self.output['-dir'] fileName = self.config[ 'recommender'] + '@' + currentTime + '-measure' + self.foldInfo + '.txt' if self.ranking.contains('-topN'): self.measure = Measure.rankingMeasure(self.data.testSet, recList, rawRes, N) FileIO.writeFile(outDir, fileName, self.measure) print 'The result of %s %s:\n%s' % (self.algorName, self.foldInfo, ''.join(self.measure)) def execute(self): self.readConfiguration() if self.foldInfo == '[1]': self.printAlgorConfig() #load model from disk or build model if self.isLoadModel: print 'Loading model %s...' % (self.foldInfo) self.loadModel() else: print 'Initializing model %s...' % (self.foldInfo) self.initModel() print 'Building Model %s...' % (self.foldInfo) self.buildModel() #preict the ratings or item ranking print 'Predicting %s...' % (self.foldInfo) self.evalRanking() #save model if self.isSaveModel: print 'Saving model %s...' % (self.foldInfo) self.saveModel() return self.measure
class Recommender(object): def __init__(self, conf, trainingSet=None, testSet=None, fold='[1]'): self.config = conf self.isSaveModel = False self.isLoadModel = False self.isOutput = True self.data = Record(self.config, trainingSet, testSet) self.foldInfo = fold self.evalConfig = LineConfig(self.config['evaluation.setup']) if self.evalConfig.contains('-target'): self.recType = self.evalConfig['-target'] else: self.recType = 'track' if LineConfig(self.config['evaluation.setup']).contains('-cold'): #evaluation on cold-start users threshold = int( LineConfig(self.config['evaluation.setup'])['-cold']) removedUser = [] removedTrack = defaultdict(list) #for user in self.data.testSet: # if user in self.data.userRecord and len(self.data.userRecord[user])>threshold: # removedUser.append(user) for user in self.data.testSet: if user in self.data.userRecord: for item in self.data.testSet[user]: if len(self.data.trackRecord[item]) > threshold: removedTrack[user].append(item) for user in removedTrack: for item in removedTrack[user]: del self.data.testSet[user][item] if len(self.data.testSet[user]) == 0: del self.data.testSet[user] #for user in removedUser: # del self.data.testSet[user] if LineConfig(self.config['evaluation.setup']).contains('-sample'): userList = list(self.data.testSet.keys()) removedUser = userList[:int(len(userList) * 0.9)] for user in removedUser: del self.data.testSet[user] def readConfiguration(self): self.algorName = self.config['recommender'] self.output = LineConfig(self.config['output.setup']) self.isOutput = self.output.isMainOn() self.ranking = LineConfig(self.config['item.ranking']) def printAlgorConfig(self): "show algorithm's configuration" print('Algorithm:', self.config['recommender']) print('Training set:', abspath(self.config['record'])) if LineConfig(self.config['evaluation.setup']).contains('-testSet'): print( 'Test set:', abspath( LineConfig(self.config['evaluation.setup']).getOption( '-testSet'))) #print 'Count of the users in training set: ',len() self.data.printTrainingSize() print('=' * 80) def initModel(self): pass def buildModel(self): 'build the model (for model-based algorithms )' pass def saveModel(self): pass def loadModel(self): pass def predict(self, user): return [] def evalRanking(self): res = [] # used to contain the text of the result N = 0 threshold = 0 top = self.ranking['-topN'].split(',') top = [int(num) for num in top] N = int(top[-1]) if N > 100 or N < 0: print( 'N can not be larger than 100! 
It has been reassigned with 10') N = 10 res.append( 'userId: recommendations in (itemId, ranking score) pairs, * means the item matches, $ means the unpop item\n' ) # predict recList = {} userCount = len(self.data.testSet) for i, user in enumerate(self.data.testSet): num_pop = 0 line = user + ':' if user in self.data.userRecord: predictedItems = self.predict(user) else: predictedItems = ['0'] * N predicted = {} for k, item in enumerate(predictedItems): predicted[item] = k for item in self.data.userRecord[user]: if item[self.recType] in predicted: del predicted[item[self.recType]] predicted = sorted(predicted.items(), key=lambda d: d[1]) predictedItems = [item[0] for item in predicted] recList[user] = predictedItems[:N] #print('user', user, 'the recList:', type(self.data.testSet[user])) if i % 100 == 0: print(self.algorName, self.foldInfo, 'progress:' + str(i) + '/' + str(userCount)) for item in recList[user]: if item in self.data.testSet[user]: line += '*' if item in self.data.PopTrack: num_pop += 1 line += '$' line += item + ',' line += '\n' res.append(line) currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time())) # output prediction result if self.isOutput: fileName = '' outDir = self.output['-dir'] if self.ranking.contains('-topN'): fileName = self.config['recommender'] + '@' + currentTime + '-top-' + self.ranking['-topN']\ + 'items' + self.foldInfo + '.txt' FileIO.writeFile(outDir, fileName, res) print('The result has been output to ', abspath(outDir), '.') # output evaluation result outDir = self.output['-dir'] fileName = self.config[ 'recommender'] + '@' + currentTime + '-measure' + self.foldInfo + '.txt' self.measure = Measure.rankingMeasure(self.data.testSet, recList, top, self.data.getSize(self.recType)) FileIO.writeFile(outDir, fileName, self.measure) print('The result of %s %s:\n%s' % (self.algorName, self.foldInfo, ''.join(self.measure))) def execute(self): self.readConfiguration() if self.foldInfo == '[1]': self.printAlgorConfig() #load model from disk or build model if self.isLoadModel: print('Loading model %s...' % (self.foldInfo)) self.loadModel() else: print('Initializing model %s...' % (self.foldInfo)) self.initModel() print('Building Model %s...' % (self.foldInfo)) self.buildModel() #preict the ratings or item ranking print('Predicting %s...' % (self.foldInfo)) self.evalRanking() #save model if self.isSaveModel: print('Saving model %s...' % (self.foldInfo)) self.saveModel() return self.measure
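# Standalone sketch of the '-sample' handling above: 90% of the test users are dropped,
# keeping a 10% sample for faster evaluation. The user ids are hypothetical.
testSet = {'u%d' % i: {'t1': 1} for i in range(10)}
userList = list(testSet.keys())
for user in userList[:int(len(userList) * 0.9)]:
    del testSet[user]
print(len(testSet))   # 1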
class RecQ(object): def __init__(self,config): self.trainingData = [] # training data self.testData = [] # testData self.measure = [] self.config =config self.ratingConfig = LineConfig(config['ratings.setup']) if self.config.contains('evaluation.setup'): self.evaluation = LineConfig(config['evaluation.setup']) if self.evaluation.contains('-testSet'): #specify testSet self.__loadDataSet(config['ratings']) self.__loadDataSet(self.evaluation['-testSet'],bTest=True) elif self.evaluation.contains('-ap'): #auto partition self.__loadDataSet(config['ratings']) self.trainingData,self.testData = DataSplit.\ dataSplit(self.trainingData,test_ratio=float(self.evaluation['-ap'])) elif self.evaluation.contains('-cv'): #cross validation self.__loadDataSet(config['ratings']) #self.trainingData,self.testData = DataSplit.crossValidation(self.trainingData,int(self.evaluation['-cv'])) else: print 'Evaluation is not well configured!' exit(-1) def __loadDataSet(self, file, bTest=False): if not bTest: print 'loading training data...' else: print 'loading test data...' with open(file) as f: ratings = f.readlines() # ignore the headline if self.ratingConfig.contains('-header'): ratings = ratings[1:] # order of the columns order = self.ratingConfig['-columns'].strip().split() for lineNo, line in enumerate(ratings): items = split(' |,|\t', line.strip()) if len(order) < 3: print 'The rating file is not in a correct format. Error: Line num %d' % lineNo exit(-1) try: userId = items[int(order[0])] itemId = items[int(order[1])] rating = items[int(order[2])] except ValueError: print 'Error! Have you added the option -header to the rating.setup?' exit(-1) if not bTest: self.trainingData.append([userId, itemId, float(rating)]) else: self.testData.append([userId, itemId, float(rating)]) def execute(self): exec ('from algorithm.rating.' + self.config['recommender'] + ' import ' + self.config['recommender']) if self.evaluation.contains('-cv'): i = 1 for train,test in DataSplit.crossValidation(self.trainingData,int(self.evaluation['-cv'])): fold = '['+str(i)+']' recommender = self.config['recommender']+ "(self.config,train,test,fold)" measure = eval(recommender).execute() self.measure.append(measure) i+=1 res = [] for i in range(len(self.measure[0])): measure = self.measure[0][i].split(':')[0] total = 0 for j in range(len(self.measure)): total += float(self.measure[j][i].split(':')[1]) res.append(measure+':'+str(total/len(self.measure))+'\n') outDir = LineConfig(self.config['output.setup'])['-dir'] fileName = self.config['recommender'] +'@'+str(int(self.evaluation['-cv']))+'-fold-cv' + '.txt' FileIO.writeFile(outDir,fileName,res) else: recommender = self.config['recommender']+'(self.config,self.trainingData,self.testData)' eval(recommender).execute()
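execute() relies on DataSplit.crossValidation, which is not shown in this file. A minimal stand-in with the same generator-style interface, under the assumption that it simply shuffles the [userId, itemId, rating] triples and yields k (train, test) pairs, one per fold:

import random

def crossValidation(data, k, seed=123):
    # stand-in for DataSplit.crossValidation (not shown here): shuffle the
    # rating triples once, cut them into k folds, and yield (train, test)
    # pairs with each fold used as the test set exactly once
    data = list(data)
    random.Random(seed).shuffle(data)
    folds = [data[i::k] for i in range(k)]
    for i in range(k):
        test = folds[i]
        train = [record for j, fold in enumerate(folds) if j != i for record in fold]
        yield train, test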
class SDLib(object): def __init__(self, config): self.trainingData = [] # training data self.testData = [] # testData self.relation = [] self.measure = [] self.config = config self.ratingConfig = LineConfig(config['ratings.setup']) self.labels = FileIO.loadLabels(config['label']) if self.config.contains('evaluation.setup'): self.evaluation = LineConfig(config['evaluation.setup']) if self.evaluation.contains('-testSet'): #specify testSet self.trainingData = FileIO.loadDataSet(config, config['ratings']) self.testData = FileIO.loadDataSet(config, self.evaluation['-testSet'], bTest=True) elif self.evaluation.contains('-ap'): #auto partition self.trainingData = FileIO.loadDataSet(config, config['ratings']) self.trainingData,self.testData = DataSplit.\ dataSplit(self.trainingData,test_ratio=float(self.evaluation['-ap'])) elif self.evaluation.contains('-cv'): #cross validation self.trainingData = FileIO.loadDataSet(config, config['ratings']) #self.trainingData,self.testData = DataSplit.crossValidation(self.trainingData,int(self.evaluation['-cv'])) else: print('Evaluation is not well configured!') exit(-1) if config.contains('social'): self.socialConfig = LineConfig(self.config['social.setup']) self.relation = FileIO.loadRelationship(config, self.config['social']) print('preprocessing...') def execute(self): #import the algorithm module importStr = 'from method.' + self.config[ 'methodName'] + ' import ' + self.config['methodName'] exec(importStr) if self.evaluation.contains('-cv'): k = int(self.evaluation['-cv']) if k <= 1 or k > 10: k = 3 #create the manager used to communication in multiprocess manager = Manager() m = manager.dict() i = 1 tasks = [] for train, test in DataSplit.crossValidation(self.trainingData, k): fold = '[' + str(i) + ']' if self.config.contains('social'): method = self.config[ 'methodName'] + "(self.config,train,test,self.labels,self.relation,fold)" else: method = self.config[ 'methodName'] + "(self.config,train,test,self.labels,fold)" #create the process p = Process(target=run, args=(m, eval(method), i)) tasks.append(p) i += 1 #start the processes for p in tasks: p.start() #wait until all processes are completed for p in tasks: p.join() #compute the mean error of k-fold cross validation self.measure = [dict(m)[i] for i in range(1, k + 1)] res = [] pattern = re.compile('(\d+\.\d+)') countPattern = re.compile('\d+\\n') labelPattern = re.compile('\s\d{1}[^\.|\n|\d]') labels = re.findall(labelPattern, self.measure[0]) values = np.array([0] * 9, dtype=float) count = np.array([0, 0, 0], dtype=int) for report in self.measure: values += np.array(re.findall(pattern, report), dtype=float) count += np.array(re.findall(countPattern, report), dtype=int) values /= k values = np.around(values, decimals=4) res.append(' precision recall f1-score support\n\n') res.append(' ' + labels[0] + ' ' + ' '.join(np.array(values[0:3], dtype=str).tolist()) + ' ' + str(count[0]) + '\n') res.append(' ' + labels[1] + ' ' + ' '.join(np.array(values[3:6], dtype=str).tolist()) + ' ' + str(count[1]) + '\n\n') res.append(' avg/total ' + ' '.join(np.array(values[6:9], dtype=str).tolist()) + ' ' + str(count[2]) + '\n') print('Total:') print(''.join(res)) # for line in lines[1:]: # # measure = self.measure[0][i].split(':')[0] # total = 0 # for j in range(k): # total += float(self.measure[j][i].split(':')[1]) # res.append(measure+':'+str(total/k)+'\n') #output result currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time())) outDir = LineConfig(self.config['output.setup'])['-dir'] fileName = self.config[ 
'methodName'] + '@' + currentTime + '-' + str( k) + '-fold-cv' + '.txt' FileIO.writeFile(outDir, fileName, res) print('The results have been output to ' + abspath(LineConfig(self.config['output.setup'])['-dir']) + '\n') else: if self.config.contains('social'): method = self.config[ 'methodName'] + '(self.config,self.trainingData,self.testData,self.labels,self.relation)' else: method = self.config[ 'methodName'] + '(self.config,self.trainingData,self.testData,self.labels)' result = eval(method).execute() return result
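The per-fold measures collected above are averaged by regular expression; the code assumes each fold's report looks like scikit-learn's two-class classification_report text. A stand-alone illustration of what the float pattern pulls out of such reports (every number below is invented for illustration only):

import re
import numpy as np

reports = [
    ' precision recall f1-score support\n\n'
    ' 0 0.91 0.88 0.89 1200\n'
    ' 1 0.45 0.52 0.48 300\n\n'
    ' avg/total 0.82 0.81 0.81 1500\n',
    ' precision recall f1-score support\n\n'
    ' 0 0.93 0.90 0.91 1180\n'
    ' 1 0.47 0.55 0.51 320\n\n'
    ' avg/total 0.84 0.83 0.83 1500\n',
]

values = np.zeros(9)
for report in reports:
    # nine floats per fold: precision/recall/f1 for each class plus the average row
    values += np.array(re.findall(r'(\d+\.\d+)', report), dtype=float)
print(np.around(values / len(reports), decimals=4))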
class Record(object): 'data access control' def __init__(self,config,trainingSet,testSet): self.config = config self.recordConfig = LineConfig(config['record.setup']) self.evalConfig = LineConfig(config['evaluation.setup']) self.name2id = defaultdict(dict) self.id2name = defaultdict(dict) self.listened = {} self.listened['artist']=defaultdict(dict) self.listened['track']=defaultdict(dict) self.listened['album']=defaultdict(dict) self.artist2Album = defaultdict(dict) #key:artist id, value:{album id1:1, album id2:1 ...} self.album2Track = defaultdict(dict) # self.artist2Track = defaultdict(dict) # self.Track2artist = defaultdict(dict) # self.Track2album = defaultdict(dict) # self.userRecord = defaultdict(list) #user data in training set. form: {user:[record1,record2]} self.trackRecord = defaultdict(list) # track data in training set. form: {track:[record1, record2]} self.testSet = defaultdict(dict) #user data in test set. form: {user:{recommenedObject1:1,recommendedObject:1}} self.recordCount = 0 self.columns = {} self.globalMean = 0 self.userMeans = {} #used to store the mean values of users's listen tims self.trackListen = {} self.trainingData = trainingSet self.computeUserMean() self.globalAverage() self.PopTrack = {} labels = self.recordConfig['-columns'].split(',') for col in labels: label = col.split(':') self.columns[label[0]] = int(label[1]) if self.evalConfig.contains('-byTime'): trainingSet,testSet = self.splitDataByTime(trainingSet) self.preprocess(trainingSet,testSet) self.computePop(trainingSet) def globalAverage(self): total = sum(self.userMeans.values()) if total==0: self.globalMean = 0 else: self.globalMean = total/len(self.userMeans) def computeUserMean(self): for user in self.userRecord: for item in self.userRecord[user]: userSum += self.listened['track'][item].values() self.userMeans[user] = userSum/float(len(self.userRecord[user])) ''' def splitDataByTime(self,dataset): trainingSet = [] testSet = [] listened = {} ratio = float(self.evalConfig['-byTime']) records = defaultdict(list) for event in dataset: records[event['user']].append(event) if event['user'] not in listened: listened[event['user']] = 1 else: listened[event['user']] += 1 orderlist = sorted(listened.items(), key=lambda item:item[1], reverse=True) dellist = orderlist[:int(len(orderlist)*ratio)] for i in range(len(dellist)): if dellist[i][0] in records: del records[dellist[i][0]] #print('The amount of data after deletion:', len(records)) for user in records: orderedList = sorted(records[user],key=lambda d:d['time']) training = orderedList[0:int(len(orderedList)*(1-ratio))] test = orderedList[int(len(orderedList)*(1-ratio)):] trainingSet += training testSet += test #print ('the type1 :', type(trainingSet), type(testSet)) #file_train = 'trainset.txt' #file_test = 'testset.txt' #trainf = open(file_train, 'wb') #testf = open(file_test, 'wb') #pickle.dump(trainingSet, trainf, 2) #pickle.dump(testSet, testf, 2) #trainf.close() #testf.close() return trainingSet,testSet ''' def splitDataByTime(self,dataset): trainingSet = [] testSet = [] ratio = float(self.evalConfig['-byTime']) records = defaultdict(list) for event in dataset: records[event['user']].append(event) for user in records: orderedList = sorted(records[user],key=lambda d:d['time']) training = orderedList[0:int(len(orderedList)*(1-ratio))] test = orderedList[int(len(orderedList)*(1-ratio)):] trainingSet += training testSet += test return trainingSet,testSet def computePop(self, dataset): print('computePop...') for event in dataset: total = 0 for value in 
self.listened['track'][event['track']].values(): total += value if value > 0: self.PopTrack[event['track']] = total print('computePop is finished...') print('PopTrack', len(self.PopTrack)) def preprocess(self,trainingSet,testSet): for entry in trainingSet: self.recordCount+=1 for key in entry: if key!='time': if entry[key] not in self.name2id[key]: self.name2id[key][entry[key]] = len(self.name2id[key]) self.id2name[key][len(self.id2name[key])] = entry[key] if key=='user': self.userRecord[entry['user']].append(entry) if 'artist' in entry: if entry[key] not in self.listened['artist'][entry['artist']]: self.listened['artist'][entry['artist']][entry[key]] = 1 else: self.listened['artist'][entry['artist']][entry[key]] += 1 if 'album' in entry: if entry[key] not in self.listened['album'][entry['album']]: self.listened['album'][entry['album']][entry[key]] = 1 else: self.listened['album'][entry['album']][entry[key]] += 1 if 'track' in entry: if entry[key] not in self.listened['track'][entry['track']]: self.listened['track'][entry['track']][entry[key]] = 1 else: self.listened['track'][entry['track']][entry[key]] += 1 if key == 'artist' and 'album' in entry: self.artist2Album[entry[key]][entry['album']] = 1 if key == 'album' and 'track' in entry: self.album2Track[entry[key]] = self.name2id['track'][entry['track']] self.Track2album[entry['track']] = self.name2id[key][entry[key]] if key == 'artist' and 'track' in entry: self.artist2Track[entry[key]] = self.name2id['track'][entry['track']] self.Track2artist[entry['track']] = self.name2id[key][entry[key]] if key == 'track': self.trackRecord[entry['track']].append(entry) recType = self.evalConfig['-target'] for entry in testSet: for key in entry: if key != 'time': if entry[key] not in self.name2id[key]: self.name2id[key][entry[key]] = len(self.name2id[key]) self.id2name[key][len(self.id2name[key])] = entry[key] if key=='user': if recType in entry and entry[recType] not in self.testSet[entry['user']]: self.testSet[entry['user']][entry[recType]]=1 else: self.testSet[entry['user']][entry[recType]]+=1 #remove items appearing in the training set from the test set for item in self.listened[recType]: for user in self.listened[recType][item]: try: del self.testSet[user][item] except KeyError: pass if user in self.testSet and len(self.testSet[user])==0: del self.testSet[user] def printTrainingSize(self): if 'user' in self.name2id: print ('user count:',len(self.name2id['user'])) if 'artist' in self.name2id: print ('artist count:',len(self.name2id['artist'])) if 'album' in self.name2id: print ('album count:',len(self.name2id['album'])) if 'track' in self.name2id: print ('track count:', len(self.name2id['track'])) print ('Training set size:',self.recordCount) def getId(self,obj,t): if obj in self.name2id[t]: return self.name2id[t][obj] else: print ('No '+t+' '+obj+' exists!') exit(-1) def getSize(self,t): return len(self.name2id[t]) def contains(self, obj, t): 'whether the recType t is in trainging set' if obj in self.name2id[t]: return True else: return False
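splitDataByTime holds out the most recent share of every user's events for testing. The same idea as a stand-alone helper, assuming events are dicts with at least 'user' and 'time' keys as in the records handled above; the name split_by_time is only for illustration.

from collections import defaultdict

def split_by_time(events, ratio):
    # keep the earliest (1 - ratio) share of every user's events for training
    # and hold out the most recent share for testing
    per_user = defaultdict(list)
    for event in events:
        per_user[event['user']].append(event)
    training, test = [], []
    for user, records in per_user.items():
        records = sorted(records, key=lambda d: d['time'])
        cut = int(len(records) * (1 - ratio))
        training += records[:cut]
        test += records[cut:]
    return training, test

# e.g. split_by_time(trainingSet, ratio=0.2) keeps roughly the earliest 80%
# of each user's events for training and the latest 20% for testing.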
class SocialDAO(object):
    def __init__(self, conf):
        self.config = conf
        self.socialConfig = LineConfig(self.config['social.setup'])
        self.user = {}  # used to store the order of users
        self.triple = []
        self.followees = {}
        self.followers = {}
        self.trustMatrix = self.loadRelationship(self.config['social'])

    def loadRelationship(self, filePath):
        print('loading social data...')
        triple = []
        with open(filePath) as f:
            relations = f.readlines()
        # ignore the headline
        if self.socialConfig.contains('-header'):
            relations = relations[1:]
        # order of the columns
        order = self.socialConfig['-columns'].strip().split()
        if len(order) <= 2:
            print('The social file is not in a correct format.')
        for lineNo, line in enumerate(relations):
            items = split(' |,|\t', line.strip())
            if len(order) < 2:
                print('The social file is not in a correct format. Error: Line num %d' % lineNo)
                exit(-1)
            userId1 = items[int(order[0])]
            userId2 = items[int(order[1])]
            if len(order) < 3:
                weight = 1
            else:
                weight = float(items[int(order[2])])
            # add relations to dict
            if userId1 not in self.followees:
                self.followees[userId1] = {}
            self.followees[userId1][userId2] = weight
            if userId2 not in self.followers:
                self.followers[userId2] = {}
            self.followers[userId2][userId1] = weight
            # order the user
            if userId1 not in self.user:
                self.user[userId1] = len(self.user)
            if userId2 not in self.user:
                self.user[userId2] = len(self.user)
            self.triple.append([userId1, userId2, weight])
            triple.append([self.user[userId1], self.user[userId2], weight])
        return new_sparseMatrix.SparseMatrix(triple)

    def row(self, u):
        # return user u's followees
        return self.trustMatrix.row(self.user[u])

    def col(self, u):
        # return user u's followers
        return self.trustMatrix.col(self.user[u])

    def weight(self, u1, u2):
        if u1 in self.followees and u2 in self.followees[u1]:
            return self.followees[u1][u2]
        return 0

    def trustSize(self):
        return self.trustMatrix.size

    def getFollowers(self, u):
        if u in self.followers:
            return self.followers[u]
        return {}

    def getFollowees(self, u):
        if u in self.followees:
            return self.followees[u]
        return {}

    def hasFollowee(self, u1, u2):
        return u1 in self.followees and u2 in self.followees[u1]

    def hasFollower(self, u1, u2):
        return u1 in self.followers and u2 in self.followers[u1]
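A brief usage sketch, assuming conf['social'] points at a "follower followee [weight]" file and that LineConfig and new_sparseMatrix are importable as above; the user ids are made up.

# sao = SocialDAO(conf)
# sao.getFollowees('u1')       # -> {'u2': 1.0, ...}, empty dict if u1 follows nobody
# sao.hasFollower('u2', 'u1')  # -> True when a "u1 u2" row appears in the social file
# sao.row('u1')                # row of the trust matrix, indexed by SocialDAO's user order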
class SDLib(object): def __init__(self,config): self.trainingData = [] # training data self.testData = [] # testData self.relation = [] self.measure = [] self.config =config self.ratingConfig = LineConfig(config['ratings.setup']) self.labels = FileIO.loadLabels(config['label']) if self.config.contains('evaluation.setup'): self.evaluation = LineConfig(config['evaluation.setup']) if self.evaluation.contains('-testSet'): #specify testSet self.trainingData = FileIO.loadDataSet(config, config['ratings']) self.testData = FileIO.loadDataSet(config, self.evaluation['-testSet'], bTest=True) elif self.evaluation.contains('-ap'): #auto partition self.trainingData = FileIO.loadDataSet(config,config['ratings']) self.trainingData,self.testData = DataSplit.\ dataSplit(self.trainingData,test_ratio=float(self.evaluation['-ap'])) elif self.evaluation.contains('-cv'): #cross validation self.trainingData = FileIO.loadDataSet(config, config['ratings']) #self.trainingData,self.testData = DataSplit.crossValidation(self.trainingData,int(self.evaluation['-cv'])) else: print 'Evaluation is not well configured!' exit(-1) if config.contains('social'): self.socialConfig = LineConfig(self.config['social.setup']) self.relation = FileIO.loadRelationship(config,self.config['social']) print 'preprocessing...' def execute(self): #import the algorithm module importStr = 'from method.' + self.config['methodName'] + ' import ' + self.config['methodName'] exec (importStr) if self.evaluation.contains('-cv'): k = int(self.evaluation['-cv']) if k <= 1 or k > 10: k = 3 #create the manager used to communication in multiprocess manager = Manager() m = manager.dict() i = 1 tasks = [] for train,test in DataSplit.crossValidation(self.trainingData,k): fold = '['+str(i)+']' if self.config.contains('social'): method = self.config['methodName'] + "(self.config,train,test,self.labels,self.relation,fold)" else: method = self.config['methodName'] + "(self.config,train,test,self.labels,fold)" #create the process p = Process(target=run,args=(m,eval(method),i)) tasks.append(p) i+=1 #start the processes for p in tasks: p.start() #wait until all processes are completed for p in tasks: p.join() #compute the mean error of k-fold cross validation self.measure = [dict(m)[i] for i in range(1,k+1)] res = [] pattern = re.compile('(\d+\.\d+)') countPattern = re.compile('\d+\\n') labelPattern = re.compile('\s\d{1}[^\.|\n|\d]') labels = re.findall(labelPattern, self.measure[0]) values = np.array([0]*9,dtype=float) count = np.array([0,0,0],dtype=int) for report in self.measure: values += np.array(re.findall(pattern,report),dtype=float) count+=np.array(re.findall(countPattern,report),dtype=int) values/=k values=np.around(values,decimals=4) res.append(' precision recall f1-score support\n\n') res.append(' '+labels[0]+' '+' '.join(np.array(values[0:3],dtype=str).tolist())+' '+str(count[0])+'\n') res.append(' '+labels[1]+' '+' '.join(np.array(values[3:6],dtype=str).tolist())+' '+str(count[1])+'\n\n') res.append(' avg/total ' + ' '.join(np.array(values[6:9], dtype=str).tolist()) + ' ' + str(count[2]) + '\n') print 'Total:' print ''.join(res) # for line in lines[1:]: # # measure = self.measure[0][i].split(':')[0] # total = 0 # for j in range(k): # total += float(self.measure[j][i].split(':')[1]) # res.append(measure+':'+str(total/k)+'\n') #output result currentTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time())) outDir = LineConfig(self.config['output.setup'])['-dir'] fileName = self.config['methodName'] +'@'+currentTime+'-'+str(k)+'-fold-cv' + '.txt' 
FileIO.writeFile(outDir,fileName,res) print 'The results have been output to '+abspath(LineConfig(self.config['output.setup'])['-dir'])+'\n' else: if self.config.contains('social'): method = self.config['methodName'] + '(self.config,self.trainingData,self.testData,self.labels,self.relation)' else: method = self.config['methodName'] + '(self.config,self.trainingData,self.testData,self.labels)' eval(method).execute()
class RecQ(object): def __init__(self, config): self.trainingData = [] # training data self.testData = [] # testData self.relation = [] self.measure = [] self.config = config self.ratingConfig = LineConfig(config['ratings.setup']) if self.config.contains('evaluation.setup'): self.evaluation = LineConfig(config['evaluation.setup']) if self.evaluation.contains('-testSet'): #specify testSet self.trainingData = FileIO.loadDataSet(config, config['ratings']) self.testData = FileIO.loadDataSet(config, self.evaluation['-testSet'], bTest=True) elif self.evaluation.contains('-ap'): #auto partition self.trainingData = FileIO.loadDataSet(config, config['ratings']) self.trainingData,self.testData = DataSplit.\ dataSplit(self.trainingData,test_ratio=float(self.evaluation['-ap'])) elif self.evaluation.contains('-cv'): #cross validation self.trainingData = FileIO.loadDataSet(config, config['ratings']) #self.trainingData,self.testData = DataSplit.crossValidation(self.trainingData,int(self.evaluation['-cv'])) else: print 'Evaluation is not well configured!' exit(-1) if config.contains('social'): self.socialConfig = LineConfig(self.config['social.setup']) self.relation = FileIO.loadRelationship(config, self.config['social']) print 'preprocessing...' def execute(self): #import the algorithm module importStr = 'from algorithm.rating.' + self.config[ 'recommender'] + ' import ' + self.config['recommender'] exec(importStr) if self.evaluation.contains('-cv'): k = int(self.evaluation['-cv']) if k <= 1 or k > 10: k = 3 #create the manager used to communication in multiprocess manager = Manager() m = manager.dict() i = 1 tasks = [] for train, test in DataSplit.crossValidation(self.trainingData, k): fold = '[' + str(i) + ']' if self.config.contains('social'): recommender = self.config[ 'recommender'] + "(self.config,train,test,self.relation,fold)" else: recommender = self.config[ 'recommender'] + "(self.config,train,test,fold)" #create the process p = Process(target=run, args=(m, eval(recommender), i)) tasks.append(p) i += 1 #start the processes for p in tasks: p.start() #wait until all processes are completed for p in tasks: p.join() #compute the mean error of k-fold cross validation self.measure = [dict(m)[i] for i in range(1, k + 1)] res = [] for i in range(len(self.measure[0])): measure = self.measure[0][i].split(':')[0] total = 0 for j in range(k): total += float(self.measure[j][i].split(':')[1]) res.append(measure + ':' + str(total / k) + '\n') #output result outDir = LineConfig(self.config['output.setup'])['-dir'] fileName = self.config['recommender'] + '@' + str( k) + '-fold-cv' + '.txt' FileIO.writeFile(outDir, fileName, res) else: if self.config.contains('social'): recommender = self.config[ 'recommender'] + '(self.config,self.trainingData,self.testData,self.relation)' else: recommender = self.config[ 'recommender'] + '(self.config,self.trainingData,self.testData)' eval(recommender).execute()
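Both cross-validation drivers above hand each fold to a Process whose target is a module-level run function, which is not defined in the code shown here. A minimal stand-in consistent with how the shared dict m is read back after join():

def run(measure, algor, order):
    # execute one fold's algorithm and store its result under the fold index,
    # so the parent process can collect dict(m)[1..k] once all folds finish
    measure[order] = algor.execute()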