def fit(self, trainSamples, trainTargets):
    """Learn user/item latent factors by non-negative matrix factorization.

    Builds the in-memory data model from the training pairs, factorizes the
    user-item matrix V ~= pu . qi^T with self.factors components, and stores
    the user factors in self.pu and the (transposed) item factors in self.qi.
    """
    self.dataModel = MemeryDataModel(trainSamples, trainTargets)
    # User-item interaction matrix from the data model.
    V = self.dataModel.getData()
    model = ProjectedGradientNMF(n_components=self.factors, max_iter=1000,
                                 nls_max_iter=1000)
    # Fit exactly once. The original called model.fit(V) a second time just
    # to read components_, which re-ran the whole factorization and -- with
    # random initialization -- produced item factors inconsistent with pu.
    # fit_transform already leaves components_ populated on the model.
    self.pu = model.fit_transform(V)
    self.qi = model.components_.transpose()
def fit(self, trainSamples, trainTargets):
    """Precompute the symmetric user-user similarity matrix.

    Fills self.simiMatrix[u][v] with the similarity of the item sets of
    users u and v; the diagonal is left at zero.
    """
    self.dataModel = MemeryDataModel(trainSamples, trainTargets)
    n_users = self.dataModel.getUsersNum()
    self.simiMatrix = np.zeros((n_users, n_users))
    # Compute the upper triangle only and mirror it into the lower half.
    for u in range(n_users):
        items_u = self.dataModel.getItemIDsFromUid(u)
        for v in range(u + 1, n_users):
            sim = self.similarity.compute(
                items_u, self.dataModel.getItemIDsFromUid(v))
            self.simiMatrix[u][v] = sim
            self.simiMatrix[v][u] = sim
def gen_items_popular(self, trainSamples, trainTargets, hasTimes=False):
    """Count per-item interactions and store the counts in self.popItems.

    One count is added per non-zero (user, item) cell of the interaction
    matrix. The hasTimes flag is accepted for interface compatibility but
    not used here.
    """
    self.dataModel = MemeryDataModel(trainSamples, trainTargets)
    itempopular = np.zeros(self.dataModel.getItemsNum())
    # Column indices of the non-zero entries, one per interaction.
    # (The original also computed the row indices and fetched the matrix
    # twice; the rows were never used.)
    iids = self.dataModel.getData().nonzero()[1]
    for iid in iids:
        itempopular[iid] += 1
    self.popItems = itempopular
# NMF recommender built on scikit-learn's ProjectedGradientNMF.
# NOTE(review): this block is CORRUPTED -- literal '******' sequences have
# replaced stretches of the original code (it looks like an automated
# secret-scrubbing pass ate everything between two quote characters), so it
# does not parse as written. Recover the original from version control
# before editing. The intact pieces show the intended structure:
#   __init__  - stores n (list length) and factors (latent dimension).
#   predict   - dot product of pu[uid] and qi[iid] per (user, item) sample;
#               NOTE(review): it looks up the *item* with getUidByUser --
#               presumably getIidByItem was intended; verify on recovery.
#   fit       - builds MemeryDataModel and factorizes (body truncated here).
#   recommend - scores every item via predict_single, returns top-n items.
#   score     - per-user F1 via Eval().evalAll (body truncated here).
class NMF(BaseEstimator): def __init__(self, n=5, factors=50): print 'nmf begin' self.n = n self.factors = factors def predict(self, testSamples): recommend_lists = [] for user_item in testSamples: uid = self.dataModel.getUidByUser(user_item[0]) iid = self.dataModel.getUidByUser(user_item[1]) recommend_lists.append(np.dot(self.pu[uid], self.qi[iid])) return recommend_lists def fit(self, trainSamples, trainTargets): self.dataModel = MemeryDataModel(trainSamples, trainTargets) #print 'train user:'******'not in test' return [] else: predict_scores = [] for i in range(self.dataModel.getItemsNum()): predict_scores.append(self.predict_single(uid, i)) topN = np.argsort(np.array(predict_scores))[-1:-self.n - 1:-1] return [self.dataModel.getItemByIid(i) for i in topN] def score(self, testSamples, trueLabels): print 'NMF scoring ...' trueList = [] recommendList = [] user_unique = list(set(np.array(testSamples)[:, 0])) #print 'test user:'******'NMF result:' + '(' + str(self.get_params()) + ')' + str( (result)['F1']) return (result)['F1']
def fit(self, trainSamples, trainTargets):
    """Precompute the symmetric item-item similarity matrix.

    simiMatrix[i][j] holds the similarity between the user sets of items
    i and j; the diagonal stays zero.
    """
    self.dataModel = MemeryDataModel(trainSamples, trainTargets)
    n_items = self.dataModel.getItemsNum()
    self.simiMatrix = np.zeros((n_items, n_items))
    # Upper triangle only; mirror each value into the lower triangle.
    for a in range(n_items):
        users_a = self.dataModel.getUserIDsFromIid(a)
        for b in range(a + 1, n_items):
            sim = self.similarity.compute(
                users_a, self.dataModel.getUserIDsFromIid(b))
            self.simiMatrix[a][b] = sim
            self.simiMatrix[b][a] = sim
class TopN(BaseEstimator): def __init__(self, n=5): print 'topN begin' self.n = n def gen_items_popular(self, trainSamples, trainTargets, hasTimes=False): self.dataModel = MemeryDataModel(trainSamples, trainTargets) itempopular = np.zeros(self.dataModel.getItemsNum()) uids = self.dataModel.getData().nonzero()[0] iids = self.dataModel.getData().nonzero()[1] for i in range(len(iids)): iid = iids[i] itempopular[iid] += 1 self.popItems = itempopular def predict(self, testSamples): recommend_lists = [] for user_item in testSamples: if self.dataModel.getIidByItem(user_item[1]) in self.topN[:self.n]: recommend_lists.append(1) else: recommend_lists.append(0) return recommend_lists def fit(self, trainSamples, trainTargets): #print trainSamples, trainTargets #print len(trainSamples), len(trainTargets) self.gen_items_popular(trainSamples, trainTargets) self.topN = np.argsort(np.array(self.popItems))[-1::-1] return self def recommend(self, uid): return [self.dataModel.getItemByIid(i) for i in self.topN[:self.n]] def score(self, testSamples, trueLabels): #print testSamples #print len(testSamples) trueList = [] recommendList = [] user_unique = list(set(np.array(testSamples)[:, 0])) for u in user_unique: uTrueIndex = np.argwhere(np.array(testSamples)[:, 0] == u)[:, 0] #true = [self.dataModel.getIidByItem(i) for i in list(np.array(testSamples)[uTrueIndex][:,1])] true = list(np.array(testSamples)[uTrueIndex][:, 1]) trueList.append(true) pre = [self.dataModel.getItemByIid(i) for i in self.topN[:self.n]] recommendList.append(pre) e = Eval() result = e.evalAll(recommendList, trueList) print 'TopN result:' + '(' + str(self.get_params()) + ')' + str( (result)['F1']) return (result)['F1']
class TopN(BaseEstimator): def __init__(self, n=5): print 'topN begin' self.n = n def gen_items_popular(self, trainSamples, trainTargets, hasTimes=False): self.dataModel = MemeryDataModel(trainSamples, trainTargets) itempopular = np.zeros(self.dataModel.getItemsNum()) uids = self.dataModel.getData().nonzero()[0] iids = self.dataModel.getData().nonzero()[1] for i in range(len(iids)): iid = iids[i] itempopular[iid] += 1 self.popItems = itempopular def predict(self, testSamples): recommend_lists = [] for user_item in testSamples: if self.dataModel.getIidByItem(user_item[1]) in self.topN[:self.n]: recommend_lists.append(1) else: recommend_lists.append(0) return recommend_lists def fit(self, trainSamples, trainTargets): #print trainSamples, trainTargets #print len(trainSamples), len(trainTargets) self.gen_items_popular(trainSamples, trainTargets) self.topN = np.argsort(np.array(self.popItems))[-1::-1] return self def recommend(self, uid): return [self.dataModel.getItemByIid(i) for i in self.topN[:self.n]] def score(self, testSamples, trueLabels): #print testSamples #print len(testSamples) trueList = [] recommendList= [] user_unique = list(set(np.array(testSamples)[:,0])) for u in user_unique: uTrueIndex = np.argwhere(np.array(testSamples)[:,0] == u)[:,0] #true = [self.dataModel.getIidByItem(i) for i in list(np.array(testSamples)[uTrueIndex][:,1])] true = list(np.array(testSamples)[uTrueIndex][:,1]) trueList.append(true) pre = [self.dataModel.getItemByIid(i) for i in self.topN[:self.n]] recommendList.append(pre) e = Eval() result = e.evalAll(recommendList, trueList) print 'TopN result:'+'('+str(self.get_params())+')'+str((result)['F1']) return (result)['F1']
# Duplicate of the NMF recommender class appearing earlier in this file.
# NOTE(review): this block is CORRUPTED the same way -- literal '******'
# sequences have replaced code spans (apparently a secret-scrubbing pass),
# so it does not parse as written; recover it from version control before
# editing. Visible structure: __init__ (n, factors), predict (dot product
# of pu/qi rows -- note it resolves the item via getUidByUser, presumably a
# getIidByItem slip; verify), fit (truncated), recommend (top-n by
# predict_single score), score (per-user F1 via Eval, truncated).
class NMF(BaseEstimator): def __init__(self, n=5, factors=50): print 'nmf begin' self.n = n self.factors = factors def predict(self, testSamples): recommend_lists = [] for user_item in testSamples: uid = self.dataModel.getUidByUser(user_item[0]) iid = self.dataModel.getUidByUser(user_item[1]) recommend_lists.append(np.dot(self.pu[uid], self.qi[iid])) return recommend_lists def fit(self, trainSamples, trainTargets): self.dataModel = MemeryDataModel(trainSamples, trainTargets) #print 'train user:'******'not in test' return [] else: predict_scores = [] for i in range(self.dataModel.getItemsNum()): predict_scores.append(self.predict_single(uid, i)) topN = np.argsort(np.array(predict_scores))[-1:-self.n-1:-1] return [self.dataModel.getItemByIid(i) for i in topN] def score(self, testSamples, trueLabels): print 'NMF scoring ...' trueList = [] recommendList= [] user_unique = list(set(np.array(testSamples)[:,0])) #print 'test user:'******'NMF result:'+'('+str(self.get_params())+')' + str((result)['F1']) return (result)['F1']
# VSRank-style fit (rating variant). Pipeline, in order:
#   1. T[uid]: for each user, the set of ordered item pairs "i1 i2" meaning
#      the user rated i1 strictly higher than i2 (ties skipped).
#   2. idf: a tf-idf-like inverse weight per ordered pair, mixing pair
#      frequency across users with log10/log2 smoothing.
#   3. W[uid]: per-user pair weights, tf = log2(1 + |rating diff|) times idf.
#   4. simiMatrix: symmetric user-user similarity via self.cos over W.
# NOTE(review): kept byte-identical -- the shared `sum`/`alpha` arithmetic
# and the T -> idf -> W -> simiMatrix ordering are load-bearing; `sum`
# shadows the builtin, and log10/log2/self.cos are defined elsewhere in the
# file. The intermediate print of per-user pair counts looks like leftover
# debugging output.
def fit(self, trainSamples, trainTargets): self.dataModel = MemeryDataModel(trainSamples, trainTargets, isRating=True) usersNum = self.dataModel.getUsersNum() itemsNum = self.dataModel.getItemsNum() self.T = [{} for i in range(usersNum)] for uid in range(usersNum): purchased_items = self.dataModel.getItemIDsFromUid(uid) for i in range(len(purchased_items)): for j in range(i+1, len(purchased_items)): rating_i = self.dataModel.getRating(uid, purchased_items[i]) rating_j = self.dataModel.getRating(uid, purchased_items[j]) if rating_i > rating_j: key = str(purchased_items[i]) + " " + str(purchased_items[j]) elif rating_i < rating_j: key = str(purchased_items[j]) + " " + str(purchased_items[i]) else: continue self.T[uid][key] = 1 for uid in range(usersNum): print self.dataModel.getUserByUid(uid), len(self.T[uid]) idf = {} pair_sum = [[0]*itemsNum for i in range(itemsNum)] for uid in range(usersNum): for t, times in self.T[uid].iteritems(): i1, i2 = t.split(" ") pair_sum[int(i1)][int(i2)] += 1 for i1 in range(itemsNum): for i2 in range(itemsNum): if pair_sum[i1][i2] != 0: key = str(i1) + ' ' + str(i2) sum = pair_sum[i1][i2] + pair_sum[i2][i1] alpha = log10(1+9.0*sum/usersNum) idf[key] = alpha*log2(sum*1.0/pair_sum[i1][i2])+(1-alpha) W = [{} for i in range(usersNum)] for uid in range(usersNum): for t, times in self.T[uid].iteritems(): i1, i2 = t.split(" ") diff = self.dataModel.getRating(uid, int(i1))-self.dataModel.getRating(uid, int(i2)) tf = log2(1+abs(diff)) W[uid][t] = tf * idf[t] self.simiMatrix = np.zeros((usersNum, usersNum)) for i in range(usersNum): for j in range(i+1, usersNum): s = self.cos(W[i], W[j]) self.simiMatrix[i][j] = self.simiMatrix[j][i] = s
def fit(self, trainSamples, trainTargets):
    """Train biased matrix factorization (LFM) by SGD over the ratings.

    Learns global mean mu, user/item biases bu/bi and factor matrices
    pu/qi, visiting the training rows in a fresh random order each epoch
    and decaying the learning rate by 0.93 per epoch.
    """
    self.dataModel = MemeryDataModel(trainSamples, trainTargets)
    self.mu = np.array(trainTargets).mean()            # global rating mean
    self.bu = np.zeros(self.dataModel.getUsersNum())   # user biases
    self.bi = np.zeros(self.dataModel.getItemsNum())   # item biases
    temp = math.sqrt(self.factors)
    # BUG FIX: these were plain Python lists of lists; the SGD step
    # `self.qi[iid] += learningrate * <ndarray>` then list-*extends* the
    # row instead of adding elementwise, corrupting the factor vectors.
    # ndarrays make `+=` the intended in-place vector update.
    self.qi = np.array([[(0.1 * random.random() / temp)
                         for j in range(self.factors)]
                        for i in range(self.dataModel.getItemsNum())])
    self.pu = np.array([[(0.1 * random.random() / temp)
                         for j in range(self.factors)]
                        for i in range(self.dataModel.getUsersNum())])
    lineData = self.dataModel.getLineData()
    lengthOfTrain = len(lineData)
    for step in range(self.iter):
        rmse_sum = 0.0
        # Random visiting order for this epoch ('hash' shadowed a builtin).
        order = np.random.permutation(lengthOfTrain)
        for j in range(lengthOfTrain):
            row = lineData[order[j]]
            uid = self.dataModel.getUidByUser(row[0])
            iid = self.dataModel.getIidByItem(row[1])
            rating = row[2]
            eui = rating - self.predict_single(uid, iid)
            rmse_sum += eui ** 2
            self.bu[uid] += self.learningrate * (
                eui - self.userregular * self.bu[uid])
            self.bi[iid] += self.learningrate * (
                eui - self.itemregular * self.bi[iid])
            # Snapshot qi[iid] before updating it so the pu step uses the
            # pre-update value (the original's `temp = self.qi[iid]` only
            # aliased the row).
            old_qi = self.qi[iid].copy()
            self.qi[iid] += self.learningrate * (
                eui * self.pu[uid] - self.itemregular * self.qi[iid])
            self.pu[uid] += self.learningrate * (
                eui * old_qi - self.userregular * self.pu[uid])
        # Per-epoch learning-rate decay.
        self.learningrate = self.learningrate * 0.93
def fit(self, trainSamples, trainTargets):
    """Train the BPR model with SGD over sampled (u, i, j) triples.

    Initializes item biases to zero and user/item factors to small random
    values, then runs up to self.iter epochs of pairwise updates, stopping
    early when the monitored ranking loss stagnates or increases.
    """
    self.dataModel = MemeryDataModel(trainSamples, trainTargets)
    temp = math.sqrt(self.factors)
    self.item_bias = np.zeros(self.dataModel.getItemsNum())
    self.user_factors = np.array([[(0.1 * random.random() / temp)
                                   for j in range(self.factors)]
                                  for i in range(self.dataModel.getUsersNum())])
    self.item_factors = np.array([[(0.1 * random.random() / temp)
                                   for j in range(self.factors)]
                                  for i in range(self.dataModel.getItemsNum())])
    # Fixed sample of triples used only to monitor the loss between epochs.
    num_loss_samples = int(100 * self.dataModel.getUsersNum() ** 0.5)
    loss_sampler = UniformUserUniformItem(True)
    self.loss_samples = [t for t in loss_sampler.generate_samples(
        self.dataModel, num_loss_samples)]
    old_loss = self.loss()
    update_sampler = UniformPairWithoutReplacement(True)
    for it in xrange(self.iter):
        for u, i, j in update_sampler.generate_samples(self.dataModel):
            self.update_factors(u, i, j)
        # self.loss() is a full pass over loss_samples; the original
        # recomputed it up to three times per epoch -- evaluate it once.
        cur_loss = self.loss()
        if abs(cur_loss - old_loss) < 0.01 or cur_loss - old_loss > 0:
            # Converged (change below tolerance) or loss got worse: stop.
            break
        old_loss = cur_loss
        # Anneal the step size each epoch.
        self.learning_rate *= 0.9
def fit(self, trainSamples, trainTargets):
    """Compute pairwise item-item similarities into self.simiMatrix."""
    self.dataModel = MemeryDataModel(trainSamples, trainTargets)
    count = self.dataModel.getItemsNum()
    self.simiMatrix = np.zeros((count, count))
    for first in range(count):
        users_first = self.dataModel.getUserIDsFromIid(first)
        for second in range(first + 1, count):
            # The matrix is symmetric; write both halves at once.
            value = self.similarity.compute(
                users_first, self.dataModel.getUserIDsFromIid(second))
            self.simiMatrix[first][second] = value
            self.simiMatrix[second][first] = value
# VSRank-style fit (hasTimes variant). Same pipeline as the rating variant
# elsewhere in this file -- T (per-user ordered preference pairs) -> idf
# (pair inverse frequency with log10/log2 smoothing) -> W (per-user tf*idf
# weights) -> simiMatrix (user-user similarity via self.cos) -- with two
# differences: the data model is built with hasTimes=True, and tf is given
# the sign of the rating difference (tf = -tf when diff < 0).
# NOTE(review): kept byte-identical; the commented-out negative-sampling
# loop and the "if diff != 1" debug check look like leftovers. `sum`
# shadows the builtin; all_item_set is currently only used by the
# commented-out sampling code.
def fit(self, trainSamples, trainTargets): self.dataModel = MemeryDataModel(trainSamples, trainTargets, hasTimes=True) usersNum = self.dataModel.getUsersNum() itemsNum = self.dataModel.getItemsNum() all_item_set = set(range(itemsNum)) self.T = [{} for i in range(usersNum)] for uid in range(usersNum): purchased_items = self.dataModel.getItemIDsFromUid(uid) for i in range(len(purchased_items)): for j in range(i+1, len(purchased_items)): rating_i = self.dataModel.getRating(uid, purchased_items[i]) rating_j = self.dataModel.getRating(uid, purchased_items[j]) if rating_i > rating_j: key = str(purchased_items[i]) + " " + str(purchased_items[j]) elif rating_i < rating_j: key = str(purchased_items[j]) + " " + str(purchased_items[i]) else: continue self.T[uid][key] = 1 # for i in purchased_items: # purchased_items = self.dataModel.getItemIDsFromUid(uid) # unpurchased_items = random.sample(all_item_set.difference(purchased_items), self.sample_rate) # for j in unpurchased_items: # key = str(i) + " " + str(j) # self.T[uid][key] = 1 idf = {} pair_sum = [[0]*itemsNum for i in range(itemsNum)] for uid in range(usersNum): for t, times in self.T[uid].iteritems(): i1, i2 = t.split(" ") pair_sum[int(i1)][int(i2)] += 1 for i1 in range(itemsNum): for i2 in range(itemsNum): if pair_sum[i1][i2] != 0: key = str(i1) + ' ' + str(i2) sum = pair_sum[i1][i2] + pair_sum[i2][i1] alpha = log10(1+9.0*sum/usersNum) idf[key] = alpha*log2(sum*1.0/pair_sum[i1][i2])+(1-alpha) W = [{} for i in range(usersNum)] for uid in range(usersNum): for t, times in self.T[uid].iteritems(): i1, i2 = t.split(" ") diff = self.dataModel.getRating(uid, int(i1))-self.dataModel.getRating(uid, int(i2)) # if diff != 1: # print 'error!' tf = log2(1+abs(diff)) if diff < 0: tf = -tf W[uid][t] = tf * idf[t] self.simiMatrix = np.zeros((usersNum, usersNum)) for i in range(usersNum): for j in range(i+1, usersNum): s = self.cos(W[i], W[j]) self.simiMatrix[i][j] = self.simiMatrix[j][i] = s
def fit(self, trainSamples, trainTargets):
    """Fit BPR factors by stochastic gradient descent on pairwise samples.

    Sets up item_bias / user_factors / item_factors, draws a fixed triple
    sample for loss monitoring, then iterates SGD epochs with early
    stopping when the loss plateaus or rises.
    """
    self.dataModel = MemeryDataModel(trainSamples, trainTargets)
    temp = math.sqrt(self.factors)
    self.item_bias = np.zeros(self.dataModel.getItemsNum())
    self.user_factors = np.array([[(0.1 * random.random() / temp)
                                   for j in range(self.factors)]
                                  for i in range(self.dataModel.getUsersNum())])
    self.item_factors = np.array([[(0.1 * random.random() / temp)
                                   for j in range(self.factors)]
                                  for i in range(self.dataModel.getItemsNum())])
    # Triples on which the loss is monitored; fixed for the whole run.
    num_loss_samples = int(100 * self.dataModel.getUsersNum() ** 0.5)
    loss_sampler = UniformUserUniformItem(True)
    self.loss_samples = [t for t in loss_sampler.generate_samples(
        self.dataModel, num_loss_samples)]
    old_loss = self.loss()
    update_sampler = UniformPairWithoutReplacement(True)
    for it in xrange(self.iter):
        for u, i, j in update_sampler.generate_samples(self.dataModel):
            self.update_factors(u, i, j)
        # Hoisted: the original evaluated self.loss() (a full pass over
        # loss_samples) up to three times per epoch.
        cur_loss = self.loss()
        if abs(cur_loss - old_loss) < 0.01 or cur_loss - old_loss > 0:
            # Loss change within tolerance, or loss increased: stop early.
            break
        old_loss = cur_loss
        # Decay the learning rate between epochs.
        self.learning_rate *= 0.9
def fit(self, trainSamples, trainTargets):
    """SGD training for the biased latent-factor model (LFM).

    Learns mu (global mean), bu/bi (user/item biases) and pu/qi (factor
    matrices); each epoch shuffles the training rows and decays the
    learning rate by 0.93.
    """
    self.dataModel = MemeryDataModel(trainSamples, trainTargets)
    self.mu = np.array(trainTargets).mean()
    self.bu = np.zeros(self.dataModel.getUsersNum())
    self.bi = np.zeros(self.dataModel.getItemsNum())
    temp = math.sqrt(self.factors)
    # BUG FIX: pu/qi were lists of lists, and `list += ndarray` *extends*
    # the row instead of performing the elementwise SGD update; ndarrays
    # restore the intended vector arithmetic.
    self.qi = np.array([[(0.1 * random.random() / temp)
                         for j in range(self.factors)]
                        for i in range(self.dataModel.getItemsNum())])
    self.pu = np.array([[(0.1 * random.random() / temp)
                         for j in range(self.factors)]
                        for i in range(self.dataModel.getUsersNum())])
    lineData = self.dataModel.getLineData()
    lengthOfTrain = len(lineData)
    for step in range(self.iter):
        rmse_sum = 0.0
        # Fresh random visiting order each epoch (renamed from 'hash',
        # which shadowed the builtin).
        order = np.random.permutation(lengthOfTrain)
        for j in range(lengthOfTrain):
            row = lineData[order[j]]
            uid = self.dataModel.getUidByUser(row[0])
            iid = self.dataModel.getIidByItem(row[1])
            rating = row[2]
            eui = rating - self.predict_single(uid, iid)
            rmse_sum += eui ** 2
            self.bu[uid] += self.learningrate * (
                eui - self.userregular * self.bu[uid])
            self.bi[iid] += self.learningrate * (
                eui - self.itemregular * self.bi[iid])
            # Copy before updating so the pu step sees the pre-update qi
            # (plain assignment only aliased the row).
            old_qi = self.qi[iid].copy()
            self.qi[iid] += self.learningrate * (
                eui * self.pu[uid] - self.itemregular * self.qi[iid])
            self.pu[uid] += self.learningrate * (
                eui * old_qi - self.userregular * self.pu[uid])
        self.learningrate = self.learningrate * 0.93
# BPR (Bayesian Personalized Ranking) matrix-factorization recommender.
# Methods: __init__ (hyperparameters), predict (per-sample recommend list),
# update_factors (one SGD step for a (u, i, j) triple: sigmoid-weighted
# gradient on item biases and user/item factors with per-term L2
# regularization), loss (sampled ranking loss + 0.5 * L2 complexity), fit
# (init factors, monitor loss on a fixed triple sample, early-stop when the
# loss stagnates or rises), predict_single, recommend (top-n by score),
# score (per-user F1 via Eval).
# NOTE(review): this block was collapsed onto three physical lines by a
# formatting accident -- the expression `complexity += self.user_regularization * `
# at the end of the first line continues on the second, and fit's loop body
# continues onto the third -- so it is kept byte-identical rather than
# restyled; re-join the lines from version control before refactoring.
class BPR(BaseEstimator): def __init__(self, n=5, factors=50, learning_rate=0.001, bias_regularization=0.001, user_regularization=0.001, positive_item_regularization=0.001, negative_item_regularization=0.001,iter = 50): """initialise BPR matrix factorization model D: number of factors """ print 'bpr begin' self.n = n self.factors = factors self.learning_rate = learning_rate self.bias_regularization = bias_regularization self.user_regularization = user_regularization self.positive_item_regularization = positive_item_regularization self.negative_item_regularization = negative_item_regularization self.iter = iter def predict(self, testSamples): recList = [] for user_item in testSamples: uid = self.dataModel.getUidByUser(user_item[0]) recList.append(self.recommend(uid)) return recList def update_factors(self, u, i, j, update_u=True, update_i=True): """apply SGD update""" update_j = True x = self.item_bias[i] - self.item_bias[j] \
 + np.dot(self.user_factors[u],self.item_factors[i]-self.item_factors[j]) z = 1.0/(1.0+exp(x)) # update bias terms if update_i: d = z - self.bias_regularization * self.item_bias[i] self.item_bias[i] += self.learning_rate * d if update_j: d = -z - self.bias_regularization * self.item_bias[j] self.item_bias[j] += self.learning_rate * d if update_u: d = (self.item_factors[i]-self.item_factors[j])*z - self.user_regularization*self.user_factors[u] self.user_factors[u,:] += self.learning_rate*d if update_i: d = self.user_factors[u]*z - self.positive_item_regularization*self.item_factors[i] self.item_factors[i,:] += self.learning_rate*d if update_j: d = -self.user_factors[u]*z - self.negative_item_regularization*self.item_factors[j] self.item_factors[j] += self.learning_rate*d def loss(self): ranking_loss = 0 for u,i,j in self.loss_samples: x = self.predict_single(u,i) - self.predict_single(u,j) ranking_loss += math.log(1.0+exp(-x)) complexity = 0 for u,i,j in self.loss_samples: complexity += self.user_regularization * 
np.dot(self.user_factors[u],self.user_factors[u]) complexity += self.positive_item_regularization * np.dot(self.item_factors[i],self.item_factors[i]) complexity += self.negative_item_regularization * np.dot(self.item_factors[j],self.item_factors[j]) complexity += self.bias_regularization * self.item_bias[i]**2 complexity += self.bias_regularization * self.item_bias[j]**2 return ranking_loss + 0.5*complexity def fit(self, trainSamples, trainTargets): self.dataModel = MemeryDataModel(trainSamples, trainTargets) temp = math.sqrt(self.factors) self.item_bias = np.zeros(self.dataModel.getItemsNum()) self.user_factors = np.array([[(0.1 * random.random() / temp) for j in range(self.factors)] for i in range(self.dataModel.getUsersNum())]) self.item_factors = np.array([[(0.1 * random.random() / temp) for j in range(self.factors)] for i in range(self.dataModel.getItemsNum())]) ''' user_file = 'pu' item_file = 'qi' self.user_factors = np.array(pd.read_csv(user_file).values)[:, 1:] self.item_factors = np.array(pd.read_csv(item_file).values)[:, 1:] ''' num_loss_samples = int(100*self.dataModel.getUsersNum()**0.5) #print 'sampling {0} <user,item i,item j> triples...'.format(num_loss_samples) loss_sampler = UniformUserUniformItem(True) self.loss_samples = [t for t in loss_sampler.generate_samples(self.dataModel, num_loss_samples)] old_loss = self.loss() update_sampler = UniformPairWithoutReplacement(True) #print 'initial loss = {0}'.format(self.loss()) for it in xrange(self.iter): #print 'starting iteration {0}'.format(it) for u, i, j in update_sampler.generate_samples(self.dataModel): self.update_factors(u, i, j) if abs(self.loss() - old_loss) < 0.01 or self.loss() - old_loss > 0: #print 'iteration {0}: loss = {1}'.format(it, self.loss()) #print 'converge!!' 
break else: old_loss = self.loss() self.learning_rate *= 0.9 #print 'iteration {0}: loss = {1}'.format(it, self.loss()) def predict_single(self,uid,iid): return self.item_bias[iid] + np.dot(self.user_factors[uid],self.item_factors[iid]) def recommend(self, u): uid = self.dataModel.getUidByUser(u) if uid == -1: print 'not in test' return [] else: predict_scores = [] for i in range(self.dataModel.getItemsNum()): s = self.predict_single(uid, i) predict_scores.append(s) topN = np.argsort(np.array(predict_scores))[-1:-self.n - 1:-1] return [self.dataModel.getItemByIid(i) for i in topN] def score(self, testSamples, trueLabels): print 'BPR scoring ...' trueList = [] recommendList= [] user_unique = list(set(np.array(testSamples)[:,0])) for u in user_unique: uTrueIndex = np.argwhere(np.array(testSamples)[:,0] == u)[:,0] #true = [self.dataModel.getIidByItem(i) for i in list(np.array(testSamples)[uTrueIndex][:,1])] true = list(np.array(testSamples)[uTrueIndex][:,1]) trueList.append(true) pre = self.recommend(u) recommendList.append(pre) e = Eval() result = e.evalAll(recommendList, trueList) print 'BPR result:'+ '('+str(self.get_params())+')'+str((result)['F1']) return (result)['F1']
# LFM: biased latent-factor model (mu + bu + bi + qi.pu) trained by SGD,
# with ratings clipped to [1, 5] in predict_single and top-n recommendation
# by exhaustive scoring. score reports per-user F1 via Eval.
# NOTE(review): in fit, self.qi/self.pu are Python lists of lists while the
# SGD step does `self.qi[iid] += self.learningrate * (np.dot(...) - ...)`;
# `list += ndarray` *extends* the row rather than adding elementwise --
# verify the intended semantics (an ndarray initialization) before relying
# on this implementation. Also `hash` shadows the builtin, and `temp =
# self.qi[iid]` aliases the row rather than copying it.
# NOTE(review): the class was collapsed onto two physical lines mid-
# statement (`uid` / `= self.dataModel...` in recommend), so the text is
# kept byte-identical; re-join from version control before refactoring.
class LFM(BaseEstimator): def __init__(self, n=5, factors=25, learningrate=0.05, userregular=0.0001, itemregular=0.0001, iter=10): print 'lfm begin' self.factors = factors self.n = n self.learningrate = learningrate self.userregular = userregular self.itemregular = itemregular self.iter = iter def predict(self, testSamples): recList = [] for user_item in testSamples: uid = self.dataModel.getUidByUser(user_item[0]) recList.append(self.recommend(uid)) return recList def fit(self, trainSamples, trainTargets): self.dataModel = MemeryDataModel(trainSamples, trainTargets) self.mu = np.array(trainTargets).mean() self.bu = np.zeros(self.dataModel.getUsersNum()) self.bi = np.zeros(self.dataModel.getItemsNum()) temp = math.sqrt(self.factors) self.qi = [[(0.1 * random.random() / temp) for j in range(self.factors)] for i in range(self.dataModel.getItemsNum())] self.pu = [[(0.1 * random.random() / temp) for j in range(self.factors)] for i in range(self.dataModel.getUsersNum())] lineData = self.dataModel.getLineData() lengthOfTrain = len(lineData) for step in range(self.iter): rmse_sum = 0.0 hash = np.random.permutation(lengthOfTrain) for j in range(lengthOfTrain): n = hash[j] row = lineData[n] uid = self.dataModel.getUidByUser(row[0]) iid = self.dataModel.getIidByItem(row[1]) rating = row[2] #rating = 1 eui = rating - self.predict_single(uid, iid) rmse_sum += eui**2 self.bu[uid] += self.learningrate * ( eui - self.userregular * self.bu[uid]) self.bi[iid] += self.learningrate * ( eui - self.itemregular * self.bi[iid]) temp = self.qi[iid] self.qi[iid] += self.learningrate * ( np.dot(eui, self.pu[uid]) - np.dot(self.itemregular, self.qi[iid])) self.pu[uid] += self.learningrate * ( np.dot(eui, temp) - np.dot(self.userregular, self.pu[uid])) self.learningrate = self.learningrate * 0.93 def predict_single(self, uid, iid): ans = self.mu + self.bi[iid] + self.bu[uid] + np.dot( self.qi[iid], self.pu[uid]) if ans > 5: return 5 elif ans < 1: return 1 return ans def recommend(self, u): uid 
= self.dataModel.getUidByUser(u) if uid == -1: print 'not in test' return [] else: predict_scores = [] for i in range(self.dataModel.getItemsNum()): predict_scores.append(self.predict_single(uid, i)) topN = np.argsort(np.array(predict_scores))[-1:-self.n - 1:-1] return [self.dataModel.getItemByIid(i) for i in topN] def score(self, testSamples, trueLabels): print 'LFM scoring ...' trueList = [] recommendList = [] user_unique = list(set(np.array(testSamples)[:, 0])) for u in user_unique: uTrueIndex = np.argwhere(np.array(testSamples)[:, 0] == u)[:, 0] #true = [self.dataModel.getIidByItem(i) for i in list(np.array(testSamples)[uTrueIndex][:,1])] true = list(np.array(testSamples)[uTrueIndex][:, 1]) trueList.append(true) pre = self.recommend(u) recommendList.append(pre) e = Eval() result = e.evalAll(recommendList, trueList) print 'LFM result:' + '(' + str(self.get_params()) + ')' + str( (result)['F1']) return (result)['F1']
# VSRank: pairwise preference-vector ranking recommender.
# fit builds, per user, T[uid] = {"i j": 1} for item pairs rated i > j,
# then a tf-idf-like weight vector W[uid] over those pairs, and a user-user
# similarity matrix via the signed cosine self.cos (which treats the
# reversed pair key as a negated component). recommend_pairwise greedily
# ranks items by aggregated neighborhood preference; recommend_listwise
# uses a transitive-closure (max-min, Floyd-Warshall-like) vote over pair
# counts; recommend_pointwise scores by neighbor ratings. score sorts each
# user's held-out items by true rating and reports F1 via Eval.
# NOTE(review): kept byte-identical -- the pairwise bookkeeping, `cmp`-based
# sorts (Python 2 only), and the `sum` builtin shadowing are all easy to
# break in a restyle. In recommend_pairwise, `N = itemsNum` is unused and
# `pi[t] = None` relies on np.argmax never revisiting removed entries --
# verify before reuse. The class was also collapsed across four physical
# lines mid-statement (e.g. `... - ` / `self.preference(userID, i, t)`);
# re-join from version control before refactoring.
class VSRank(BaseEstimator): def __init__(self, neighbornum=5, n=5): print 'vsrank begin' self.neighbornum = neighbornum self.similarity = Similarity('COSINE') self.n = n def predict(self,testSamples): recList = [] for user_item in testSamples: uid = self.dataModel.getUidByUser(user_item[0]) recList.append(self.recommend(uid)) return recList def fit(self, trainSamples, trainTargets): self.dataModel = MemeryDataModel(trainSamples, trainTargets, isRating=True) usersNum = self.dataModel.getUsersNum() itemsNum = self.dataModel.getItemsNum() self.T = [{} for i in range(usersNum)] for uid in range(usersNum): purchased_items = self.dataModel.getItemIDsFromUid(uid) for i in range(len(purchased_items)): for j in range(i+1, len(purchased_items)): rating_i = self.dataModel.getRating(uid, purchased_items[i]) rating_j = self.dataModel.getRating(uid, purchased_items[j]) if rating_i > rating_j: key = str(purchased_items[i]) + " " + str(purchased_items[j]) elif rating_i < rating_j: key = str(purchased_items[j]) + " " + str(purchased_items[i]) else: continue self.T[uid][key] = 1 for uid in range(usersNum): print self.dataModel.getUserByUid(uid), len(self.T[uid]) idf = {} pair_sum = [[0]*itemsNum for i in range(itemsNum)] for uid in range(usersNum): for t, times in self.T[uid].iteritems(): i1, i2 = t.split(" ") pair_sum[int(i1)][int(i2)] += 1 for i1 in range(itemsNum): for i2 in range(itemsNum): if pair_sum[i1][i2] != 0: key = str(i1) + ' ' + str(i2) sum = pair_sum[i1][i2] + pair_sum[i2][i1] alpha = log10(1+9.0*sum/usersNum) idf[key] = alpha*log2(sum*1.0/pair_sum[i1][i2])+(1-alpha) W = [{} for i in range(usersNum)] for uid in range(usersNum): for t, times in self.T[uid].iteritems(): i1, i2 = t.split(" ") diff = self.dataModel.getRating(uid, int(i1))-self.dataModel.getRating(uid, int(i2)) tf = log2(1+abs(diff)) W[uid][t] = tf * idf[t] self.simiMatrix = np.zeros((usersNum, usersNum)) for i in range(usersNum): for j in range(i+1, usersNum): s = self.cos(W[i], W[j]) 
self.simiMatrix[i][j] = self.simiMatrix[j][i] = s def cos(self, dict1, dict2): product = 0.0 m1 = 0.0 m2 = 0.0 for k, v in dict1.iteritems(): m1 += v*v i1, i2 = k.split(' ') k_ = i2 + ' ' + i1 if dict2.has_key(k): product += v * dict2[k] elif dict2.has_key(k_): product -= v * dict2[k_] for k, v in dict2.iteritems(): m2 += v*v if product == 0: return 0 else: return product/sqrt(m1)/sqrt(m2) def tau(self, dict1, dict2, u1, u2): pass def neighborhood(self, userID): neighbors = np.argsort(np.array(self.simiMatrix[userID]))[-1:-self.neighbornum-1:-1] return neighbors def predict_single(self, userID, itemID): rating = 0.0 for uid in self.neighborhood(userID): if itemID in self.dataModel.getItemIDsFromUid(uid): rating += self.simiMatrix[userID][uid] * self.dataModel.getRating(uid, itemID) return rating def recommend(self, u): userID = self.dataModel.getUidByUser(u) if userID == -1: print 'not in test' return [] else: # return self.recommend_listwise(userID) return self.recommend_pairwise(userID) def recommend_pointwise(self, userID): #interactedItems = self.dataModel.getItemIDsFromUid(userID) ratings = dict() for uid in self.neighborhood(userID): for iid in self.dataModel.getItemIDsFromUid(uid): #if iid in interactedItems: #continue r = ratings.get(iid, 0) ratings[iid] = r + self.simiMatrix[userID][uid] * self.dataModel.getRating(uid, iid) r = [x for (x, y) in sorted(ratings.items(), lambda a, b: cmp(a[1], b[1]), reverse=True)[:self.n]] return [self.dataModel.getItemByIid(i) for i in r] def recommend_pairwise(self, userID): itemsNum = self.dataModel.getItemsNum() N = itemsNum recNum = self.n pi = [0]*itemsNum rank = [] for i in range(itemsNum): sum1 = 0 sum2 = 0 for j in range(itemsNum): if j != i: p = self.preference(userID, i, j) sum1 += p sum2 -= p pi[i] = sum1 - sum2 I = set(i for i in range(itemsNum)) while recNum > 0: # while len(I) > 0: recNum -= 1 t = np.argmax(pi) rank.append(t) I.remove(t) pi[t] = None for i in I: pi[i] += self.preference(userID, t, i) - 
self.preference(userID, i, t) # r = [x for (x, y) in sorted(zip(range(itemsNum), rank), lambda a, b: cmp(a[1], b[1]))[:self.n]] return [self.dataModel.getItemByIid(i) for i in rank] def preference(self, uid, i1, i2): nerghborhood = [] keystr = str(i1) + ' ' + str(i2) keystr_ = str(i2) + ' ' + str(i1) for i in range(self.dataModel.getUsersNum()): if self.T[i].has_key(keystr) or self.T[i].has_key(keystr_): nerghborhood.append(i) distance = [0]*len(nerghborhood) for i in range(len(nerghborhood)): distance[i] = self.simiMatrix[uid][nerghborhood[i]] nerghborhood = [x for (x, y) in sorted(zip(nerghborhood, distance), lambda a, b: cmp(a[1], b[1]), reverse=True)[:self.neighbornum]] preference = 0.0 sum = 0.0 for i in nerghborhood: rating1 = self.dataModel.getRating(i, i1) rating2 = self.dataModel.getRating(i, i2) sum += self.simiMatrix[uid][i] if rating1 > rating2: preference += self.simiMatrix[uid][i] elif rating1 < rating2: preference -= self.simiMatrix[uid][i] if sum == 0: return 0 else: return preference/sum def recommend_listwise(self, userID): itemsNum = self.dataModel.getItemsNum() M = [[0]*itemsNum for i in range(itemsNum)] for uid in self.neighborhood(userID): for t, times in self.T[uid].iteritems(): i1, i2 = t.split(" ") M[int(i1)][int(i2)] += 1 for m in xrange(itemsNum): for n in xrange(itemsNum): for k in xrange(itemsNum): M[n][k] = max(M[n][k], min(M[n][m], M[m][k])) rank = [0]*itemsNum for m in range(itemsNum): for n in range(itemsNum): if n != m and M[m][n] > M[n][m]: rank[m] += 1 r = [x for (x, y) in sorted(zip(range(itemsNum), rank), lambda a, b: cmp(a[1], b[1]))[:self.n]] return [self.dataModel.getItemByIid(i) for i in r] def score(self, testSamples, trueLabels): print 'vsrank scoring ...' 
#print len(testSamples) trueList = [] recommendList= [] user_unique = list(set(np.array(testSamples)[:,0])) for u in user_unique: uTrueIndex = np.argwhere(np.array(testSamples)[:,0] == u)[:,0] #true = [self.dataModel.getIidByItem(i) for i in list(np.array(testSamples)[uTrueIndex][:,1])] uTrueItem = list(np.array(testSamples)[uTrueIndex][:,1]) uTrueRating = list(np.array(trueLabels)[uTrueIndex]) true = [x for (x, y) in sorted(zip(uTrueItem, uTrueRating), lambda a, b: cmp(a[1], b[1]), reverse=True)[:self.n]] trueList.append(true) pre = self.recommend(u) recommendList.append(pre) e = Eval() result = e.evalAll(recommendList, trueList) print 'vsrank result:'+'('+str(self.get_params())+')'+str(result) return (result)['F1']
class LFM(BaseEstimator): def __init__(self, n=5, factors=25, learningrate=0.05, userregular=0.0001, itemregular=0.0001, iter = 10): print 'lfm begin' self.factors = factors self.n = n self.learningrate = learningrate self.userregular = userregular self.itemregular = itemregular self.iter = iter def predict(self, testSamples): recList = [] for user_item in testSamples: uid = self.dataModel.getUidByUser(user_item[0]) recList.append(self.recommend(uid)) return recList def fit(self, trainSamples, trainTargets): self.dataModel = MemeryDataModel(trainSamples, trainTargets) self.mu = np.array(trainTargets).mean() self.bu = np.zeros(self.dataModel.getUsersNum()) self.bi = np.zeros(self.dataModel.getItemsNum()) temp = math.sqrt(self.factors) self.qi = [[(0.1 * random.random() / temp) for j in range(self.factors)] for i in range(self.dataModel.getItemsNum())] self.pu = [[(0.1 * random.random() / temp) for j in range(self.factors)] for i in range(self.dataModel.getUsersNum())] lineData = self.dataModel.getLineData() lengthOfTrain = len(lineData) for step in range(self.iter): rmse_sum = 0.0 hash = np.random.permutation(lengthOfTrain) for j in range(lengthOfTrain): n = hash[j] row = lineData[n] uid = self.dataModel.getUidByUser(row[0]) iid = self.dataModel.getIidByItem(row[1]) rating = row[2] #rating = 1 eui = rating - self.predict_single(uid, iid) rmse_sum += eui**2 self.bu[uid] += self.learningrate*(eui-self.userregular*self.bu[uid]) self.bi[iid] += self.learningrate*(eui-self.itemregular*self.bi[iid]) temp = self.qi[iid] self.qi[iid] += self.learningrate*(np.dot(eui, self.pu[uid]) - np.dot(self.itemregular, self.qi[iid])) self.pu[uid] += self.learningrate*(np.dot(eui, temp) - np.dot(self.userregular, self.pu[uid])) self.learningrate = self.learningrate * 0.93 def predict_single(self, uid, iid): ans = self.mu + self.bi[iid] + self.bu[uid] + np.dot(self.qi[iid], self.pu[uid]) if ans > 5: return 5 elif ans < 1: return 1 return ans def recommend(self, u): uid = 
self.dataModel.getUidByUser(u) if uid == -1: print 'not in test' return [] else: predict_scores = [] for i in range(self.dataModel.getItemsNum()): predict_scores.append(self.predict_single(uid, i)) topN = np.argsort(np.array(predict_scores))[-1:-self.n-1:-1] return [self.dataModel.getItemByIid(i) for i in topN] def score(self, testSamples, trueLabels): print 'LFM scoring ...' trueList = [] recommendList= [] user_unique = list(set(np.array(testSamples)[:,0])) for u in user_unique: uTrueIndex = np.argwhere(np.array(testSamples)[:,0] == u)[:,0] #true = [self.dataModel.getIidByItem(i) for i in list(np.array(testSamples)[uTrueIndex][:,1])] true = list(np.array(testSamples)[uTrueIndex][:,1]) trueList.append(true) pre = self.recommend(u) recommendList.append(pre) e = Eval() result = e.evalAll(recommendList, trueList) print 'LFM result:'+ '('+str(self.get_params())+')'+str((result)['F1']) return (result)['F1']
class BPR(BaseEstimator):
    def __init__(self, n=5, factors=50, learning_rate=0.001, bias_regularization=0.001, user_regularization=0.001, positive_item_regularization=0.001, negative_item_regularization=0.001, iter=50):
        """Bayesian Personalized Ranking matrix factorization.

        n: size of the top-N recommendation list; factors: latent
        dimensionality; the *_regularization weights are per-term L2
        penalties; iter: maximum number of SGD epochs.
        """
        print 'bpr begin'
        self.n = n
        self.factors = factors
        self.learning_rate = learning_rate
        self.bias_regularization = bias_regularization
        self.user_regularization = user_regularization
        self.positive_item_regularization = positive_item_regularization
        self.negative_item_regularization = negative_item_regularization
        self.iter = iter

    def predict(self, testSamples):
        # Return a top-N list for the user of each test pair.
        recList = []
        for user_item in testSamples:
            uid = self.dataModel.getUidByUser(user_item[0])
            # NOTE(review): recommend() calls getUidByUser() again on its
            # argument, so the raw user id is mapped twice here — verify
            # whether this should pass user_item[0] instead.
            recList.append(self.recommend(uid))
        return recList

    def update_factors(self, u, i, j, update_u=True, update_i=True):
        """Apply one SGD update for the triple (user u, pos item i, neg item j)."""
        update_j = True  # the negative item is always updated
        # x is the score margin between the positive and negative item.
        x = self.item_bias[i] - self.item_bias[j] \
            + np.dot(self.user_factors[u],self.item_factors[i]-self.item_factors[j])
        z = 1.0 / (1.0 + exp(x))  # sigmoid(-x): gradient weight of this triple
        # update bias terms
        if update_i:
            d = z - self.bias_regularization * self.item_bias[i]
            self.item_bias[i] += self.learning_rate * d
        if update_j:
            d = -z - self.bias_regularization * self.item_bias[j]
            self.item_bias[j] += self.learning_rate * d
        # update the user factors, then both item factor rows
        if update_u:
            d = (self.item_factors[i] - self.item_factors[j]) * z - self.user_regularization * self.user_factors[u]
            self.user_factors[u, :] += self.learning_rate * d
        if update_i:
            d = self.user_factors[u] * z - self.positive_item_regularization * self.item_factors[i]
            self.item_factors[i, :] += self.learning_rate * d
        if update_j:
            d = -self.user_factors[u] * z - self.negative_item_regularization * self.item_factors[j]
            self.item_factors[j] += self.learning_rate * d

    def loss(self):
        """Regularized BPR loss over the fixed monitoring sample of triples."""
        ranking_loss = 0
        for u, i, j in self.loss_samples:
            x = self.predict_single(u, i) - self.predict_single(u, j)
            ranking_loss += math.log(1.0 + exp(-x))  # softplus of the negated margin
        # L2 complexity of every parameter touched by the sampled triples
        complexity = 0
        for u, i, j in self.loss_samples:
            complexity += self.user_regularization * np.dot(self.user_factors[u], self.user_factors[u])
            complexity += self.positive_item_regularization * np.dot(self.item_factors[i], self.item_factors[i])
            complexity += self.negative_item_regularization * np.dot(self.item_factors[j], self.item_factors[j])
            complexity += self.bias_regularization * self.item_bias[i]**2
            complexity += self.bias_regularization * self.item_bias[j]**2
        return ranking_loss + 0.5 * complexity

    def fit(self, trainSamples, trainTargets):
        """Train biases and factor matrices by sampled SGD until the loss stalls."""
        self.dataModel = MemeryDataModel(trainSamples, trainTargets)
        temp = math.sqrt(self.factors)
        self.item_bias = np.zeros(self.dataModel.getItemsNum())
        # small random initialization, scaled down by sqrt(factors)
        self.user_factors = np.array([[(0.1 * random.random() / temp) for j in range(self.factors)] for i in range(self.dataModel.getUsersNum())])
        self.item_factors = np.array([[(0.1 * random.random() / temp) for j in range(self.factors)] for i in range(self.dataModel.getItemsNum())])
        '''
        user_file = 'pu'
        item_file = 'qi'
        self.user_factors = np.array(pd.read_csv(user_file).values)[:, 1:]
        self.item_factors = np.array(pd.read_csv(item_file).values)[:, 1:]
        '''
        # fixed sample of triples used only to monitor the loss between epochs
        num_loss_samples = int(100 * self.dataModel.getUsersNum()**0.5)
        #print 'sampling {0} <user,item i,item j> triples...'.format(num_loss_samples)
        loss_sampler = UniformUserUniformItem(True)
        self.loss_samples = [t for t in loss_sampler.generate_samples(self.dataModel, num_loss_samples)]
        old_loss = self.loss()
        update_sampler = UniformPairWithoutReplacement(True)
        #print 'initial loss = {0}'.format(self.loss())
        for it in xrange(self.iter):
            #print 'starting iteration {0}'.format(it)
            for u, i, j in update_sampler.generate_samples(self.dataModel):
                self.update_factors(u, i, j)
            # stop when the loss has stalled (< 0.01 change) or increased;
            # NOTE(review): self.loss() is recomputed up to three times per
            # epoch here — it could be evaluated once into a local.
            if abs(self.loss() - old_loss) < 0.01 or self.loss() - old_loss > 0:
                #print 'iteration {0}: loss = {1}'.format(it, self.loss())
                #print 'converge!!'
                break
            else:
                old_loss = self.loss()
                self.learning_rate *= 0.9  # anneal the step size each epoch
            #print 'iteration {0}: loss = {1}'.format(it, self.loss())

    def predict_single(self, uid, iid):
        # Score of item iid for user uid: item bias plus factor dot product.
        return self.item_bias[iid] + np.dot(self.user_factors[uid], self.item_factors[iid])

    def recommend(self, u):
        """Top-n item ids for raw user id u; [] if the user is unknown."""
        uid = self.dataModel.getUidByUser(u)
        if uid == -1:
            print 'not in test'
            return []
        else:
            predict_scores = []
            for i in range(self.dataModel.getItemsNum()):
                s = self.predict_single(uid, i)
                predict_scores.append(s)
            # indices of the n largest scores, best first
            topN = np.argsort(np.array(predict_scores))[-1:-self.n - 1:-1]
            return [self.dataModel.getItemByIid(i) for i in topN]

    def score(self, testSamples, trueLabels):
        """Evaluate top-N recommendation quality and return the F1 score."""
        print 'BPR scoring ...'
        trueList = []
        recommendList = []
        user_unique = list(set(np.array(testSamples)[:, 0]))
        for u in user_unique:
            # all held-out items of user u form the ground-truth list
            uTrueIndex = np.argwhere(np.array(testSamples)[:, 0] == u)[:, 0]
            #true = [self.dataModel.getIidByItem(i) for i in list(np.array(testSamples)[uTrueIndex][:,1])]
            true = list(np.array(testSamples)[uTrueIndex][:, 1])
            trueList.append(true)
            pre = self.recommend(u)
            recommendList.append(pre)
        e = Eval()
        result = e.evalAll(recommendList, trueList)
        print 'BPR result:' + '(' + str(self.get_params()) + ')' + str((result)['F1'])
        return (result)['F1']
class ItemCF(BaseEstimator): def __init__(self, neighbornum=5, n=5): self.neighbornum = neighbornum self.similarity = Similarity('COSINE') self.n = n def predict(self,testSamples): recList = [] for user_item in testSamples: uid = self.dataModel.getUidByUser(user_item[0]) recList.append(self.recommend(uid)) return recList def fit(self, trainSamples, trainTargets): self.dataModel = MemeryDataModel(trainSamples, trainTargets) itemsNum = self.dataModel.getItemsNum() self.simiMatrix = np.zeros((itemsNum, itemsNum)) for i in range(itemsNum): for j in range(i+1, itemsNum): s = self.similarity.compute(self.dataModel.getUserIDsFromIid(i), self.dataModel.getUserIDsFromIid(j)) self.simiMatrix[i][j] = self.simiMatrix[j][i] = s def neighborhood(self, itemID): neighbors = np.argsort(np.array(self.simiMatrix[itemID]))[-1:-self.neighbornum-1:-1] return neighbors def predict_single(self, userID, itemID): rating = 0.0 for iid in self.neighborhood(itemID): if userID in self.dataModel.getUserIDsFromIid(iid): rating += self.simiMatrix[itemID][iid] * self.dataModel.getRating(userID, iid) return rating def recommend(self, u): userID = self.dataModel.getUidByUser(u) if userID == -1: print 'not in test' return [] else: #interactedItems = self.dataModel.getItemIDsFromUid(userID) ratings = dict() for iid in self.dataModel.getItemIDsFromUid(userID): for niid in self.neighborhood(iid): #if iid in interactedItems: #continue r = ratings.get(iid, 0) ratings[iid] = r + self.simiMatrix[iid][niid] * self.dataModel.getRating(userID, niid) r = [x for (x, y) in sorted(ratings.items(), lambda a, b: cmp(a[1], b[1]), reverse=True)[:self.n]] return [self.dataModel.getItemByIid(i) for i in r] def score(self, testSamples, trueLabels): print 'Item_CF scoring ...' 
trueList = [] recommendList= [] user_unique = list(set(np.array(testSamples)[:,0])) for u in user_unique: uTrueIndex = np.argwhere(np.array(testSamples)[:,0] == u)[:,0] #true = [self.dataModel.getIidByItem(i) for i in list(np.array(testSamples)[uTrueIndex][:,1])] true = list(np.array(testSamples)[uTrueIndex][:,1]) trueList.append(true) pre = self.recommend(u) recommendList.append(pre) e = Eval() result = e.evalAll(trueList, recommendList) print 'ItemCF result:'+'('+str(self.get_params())+')'+str((result)['F1']) return (result)['F1']
class VSRankPlus(BaseEstimator):
    def __init__(self, neighbornum=5, n=5):
        """User-based ranking recommender built on TF-IDF weighted preference pairs.

        neighbornum: number of similar users/voters consulted;
        n: size of the top-N recommendation list.
        """
        print 'vsrank begin'
        self.neighbornum = neighbornum
        self.similarity = Similarity('COSINE')
        self.n = n
        self.sample_rate = 5  # only used by the commented-out negative sampling below

    def predict(self,testSamples):
        # Return a top-N list for the user of each test pair.
        recList = []
        for user_item in testSamples:
            uid = self.dataModel.getUidByUser(user_item[0])
            # NOTE(review): recommend() calls getUidByUser() again on its
            # argument, so the raw user id is mapped twice here — verify
            # whether this should pass user_item[0] instead.
            recList.append(self.recommend(uid))
        return recList

    def fit(self, trainSamples, trainTargets):
        """Extract per-user preference pairs, TF-IDF weight them, and build a
        user-user similarity matrix from the weighted pair vectors."""
        self.dataModel = MemeryDataModel(trainSamples, trainTargets, hasTimes=True)
        usersNum = self.dataModel.getUsersNum()
        itemsNum = self.dataModel.getItemsNum()
        all_item_set = set(range(itemsNum))
        # T[uid] maps "i1 i2" -> 1 for every pair where the user rated i1 above i2.
        self.T = [{} for i in range(usersNum)]
        for uid in range(usersNum):
            purchased_items = self.dataModel.getItemIDsFromUid(uid)
            for i in range(len(purchased_items)):
                for j in range(i+1, len(purchased_items)):
                    rating_i = self.dataModel.getRating(uid, purchased_items[i])
                    rating_j = self.dataModel.getRating(uid, purchased_items[j])
                    if rating_i > rating_j:
                        key = str(purchased_items[i]) + " " + str(purchased_items[j])
                    elif rating_i < rating_j:
                        key = str(purchased_items[j]) + " " + str(purchased_items[i])
                    else:
                        continue  # equal ratings carry no preference
                    self.T[uid][key] = 1
        # for i in purchased_items:
        #     purchased_items = self.dataModel.getItemIDsFromUid(uid)
        #     unpurchased_items = random.sample(all_item_set.difference(purchased_items), self.sample_rate)
        #     for j in unpurchased_items:
        #         key = str(i) + " " + str(j)
        #         self.T[uid][key] = 1
        # idf["i1 i2"]: how informative the directed pair is across all users.
        idf = {}
        pair_sum = [[0]*itemsNum for i in range(itemsNum)]
        for uid in range(usersNum):
            for t, times in self.T[uid].iteritems():
                i1, i2 = t.split(" ")
                pair_sum[int(i1)][int(i2)] += 1
        for i1 in range(itemsNum):
            for i2 in range(itemsNum):
                if pair_sum[i1][i2] != 0:
                    key = str(i1) + ' ' + str(i2)
                    sum = pair_sum[i1][i2] + pair_sum[i2][i1]  # total votes either direction
                    # alpha blends the direction-entropy term with 1 based on
                    # how many users voted on this pair at all
                    alpha = log10(1+9.0*sum/usersNum)
                    idf[key] = alpha*log2(sum*1.0/pair_sum[i1][i2])+(1-alpha)
        # W[uid]: TF-IDF weighted preference vector (tf from the rating gap).
        W = [{} for i in range(usersNum)]
        for uid in range(usersNum):
            for t, times in self.T[uid].iteritems():
                i1, i2 = t.split(" ")
                diff = self.dataModel.getRating(uid, int(i1))-self.dataModel.getRating(uid, int(i2))
                # if diff != 1:
                #     print 'error!'
                tf = log2(1+abs(diff))
                if diff < 0:
                    tf = -tf  # sign preserves the preference direction
                W[uid][t] = tf * idf[t]
        # symmetric user-user cosine similarity over the weighted pair vectors
        self.simiMatrix = np.zeros((usersNum, usersNum))
        for i in range(usersNum):
            for j in range(i+1, usersNum):
                s = self.cos(W[i], W[j])
                self.simiMatrix[i][j] = self.simiMatrix[j][i] = s

    def cos(self, dict1, dict2):
        """Cosine similarity of two sparse vectors stored as dicts (0 if disjoint)."""
        product = 0.0
        m1 = 0.0
        m2 = 0.0
        for k, v in dict1.iteritems():
            m1 += v**2
            if dict2.has_key(k):
                product += v * dict2[k]
        for k, v in dict2.iteritems():
            m2 += v**2
        if product == 0:
            return 0  # also guards the division when either norm is 0
        else:
            return product/sqrt(m1)/sqrt(m2)

    def tau(self, dict1, dict2, u1, u2):
        # placeholder for a Kendall-tau style similarity; not implemented
        pass

    def neighborhood(self, userID):
        """Internal ids of the neighbornum most similar users, best first."""
        neighbors = np.argsort(np.array(self.simiMatrix[userID]))[-1:-self.neighbornum-1:-1]
        return neighbors

    def predict_single(self, userID, itemID):
        """Score itemID for userID: similarity-weighted ratings of neighbours who have it."""
        rating = 0.0
        for uid in self.neighborhood(userID):
            if itemID in self.dataModel.getItemIDsFromUid(uid):
                rating += self.simiMatrix[userID][uid] * self.dataModel.getRating(uid, itemID)
        return rating

    def recommend(self, u):
        """Top-n item ids for raw user id u via pairwise greedy ranking."""
        userID = self.dataModel.getUidByUser(u)
        if userID == -1:
            print 'not in test'
            return []
        else:
            # return self.recommend_listwise(userID)
            return self.recommend_pairwise(userID)

    def recommend_pointwise(self, userID):
        """Alternative scorer: aggregate neighbours' ratings per item (unused by recommend)."""
        #interactedItems = self.dataModel.getItemIDsFromUid(userID)
        ratings = dict()
        for uid in self.neighborhood(userID):
            for iid in self.dataModel.getItemIDsFromUid(uid):
                #if iid in interactedItems:
                    #continue
                r = ratings.get(iid, 0)
                ratings[iid] = r + self.simiMatrix[userID][uid] * self.dataModel.getRating(uid, iid)
        r = [x for (x, y) in sorted(ratings.items(), lambda a, b: cmp(a[1], b[1]), reverse=True)[:self.n]]
        return [self.dataModel.getItemByIid(i) for i in r]

    def recommend_pairwise(self, userID):
        """Greedy ranking: repeatedly pick the item with the largest net
        pairwise preference, then discount the remaining candidates."""
        itemsNum = self.dataModel.getItemsNum()
        N = itemsNum
        recNum = self.n
        # pi[i]: net preference of item i against all others (sum1 - sum2 = 2*sum of p)
        pi = [0]*itemsNum
        rank = []
        for i in range(itemsNum):
            sum1 = 0
            sum2 = 0
            for j in range(itemsNum):
                if j != i:
                    p = self.preference(userID, i, j)
                    sum1 += p
                    sum2 -= p
            pi[i] = sum1 - sum2
        I = set(i for i in range(itemsNum))
        while recNum > 0:
        # while len(I) > 0:
            recNum -= 1
            t = np.argmax(pi)
            rank.append(t)
            I.remove(t)
            # NOTE(review): pi[t] = None relies on Python 2 ordering where
            # None compares smaller than any number inside np.argmax over an
            # object array — confirm; a -inf sentinel would be safer.
            pi[t] = None
            for i in I:
                # remove the chosen item's contribution from the remaining scores
                pi[i] += self.preference(userID, t, i) - self.preference(userID, i, t)
        # r = [x for (x, y) in sorted(zip(range(itemsNum), rank), lambda a, b: cmp(a[1], b[1]))[:self.n]]
        return [self.dataModel.getItemByIid(i) for i in rank]

    def preference(self, uid, i1, i2):
        """Similarity-weighted vote in [-1, 1]: do uid's nearest voters on the
        pair (i1, i2) prefer i1 (positive) or i2 (negative)?"""
        nerghborhood = []
        keystr = str(i1) + ' ' + str(i2)
        keystr_ = str(i2) + ' ' + str(i1)
        # voters: users who expressed a preference on this pair either way
        for i in range(self.dataModel.getUsersNum()):
            if self.T[i].has_key(keystr) or self.T[i].has_key(keystr_):
                nerghborhood.append(i)
        distance = [0]*len(nerghborhood)
        for i in range(len(nerghborhood)):
            distance[i] = self.simiMatrix[uid][nerghborhood[i]]
        # keep only the neighbornum voters most similar to uid
        nerghborhood = [x for (x, y) in sorted(zip(nerghborhood, distance), lambda a, b: cmp(a[1], b[1]), reverse=True)[:self.neighbornum]]
        preference = 0.0
        sum = 0.0
        for i in nerghborhood:
            rating1 = self.dataModel.getRating(i, i1)
            rating2 = self.dataModel.getRating(i, i2)
            sum += self.simiMatrix[uid][i]
            if rating1 > rating2:
                preference += self.simiMatrix[uid][i]
            elif rating1 < rating2:
                preference -= self.simiMatrix[uid][i]
        if sum == 0:
            return 0  # no informative voters
        else:
            return preference/sum

    def recommend_listwise(self, userID):
        """Alternative ranker via transitive-closure vote counting (unused by recommend)."""
        itemsNum = self.dataModel.getItemsNum()
        # M[i1][i2]: neighbours voting i1 over i2
        M = [[0]*itemsNum for i in range(itemsNum)]
        for uid in self.neighborhood(userID):
            for t, times in self.T[uid].iteritems():
                i1, i2 = t.split(" ")
                M[int(i1)][int(i2)] += 1
        # Floyd-Warshall-style max-min closure of the vote matrix
        for m in xrange(itemsNum):
            for n in xrange(itemsNum):
                for k in xrange(itemsNum):
                    M[n][k] = max(M[n][k], min(M[n][m], M[m][k]))
        # rank[m]: number of items m beats after closure
        rank = [0]*itemsNum
        for m in range(itemsNum):
            for n in range(itemsNum):
                if n != m and M[m][n] > M[n][m]:
                    rank[m] += 1
        # NOTE(review): this sort is ascending, so it returns the items with
        # the *fewest* wins — confirm whether reverse=True was intended.
        r = [x for (x, y) in sorted(zip(range(itemsNum), rank), lambda a, b: cmp(a[1], b[1]))[:self.n]]
        return [self.dataModel.getItemByIid(i) for i in r]

    def score(self, testSamples, trueLabels):
        """Evaluate top-N recommendation quality and return the F1 score."""
        print 'vsrank scoring ...'
        #print len(testSamples)
        trueList = []
        recommendList= []
        user_unique = list(set(np.array(testSamples)[:,0]))
        for u in user_unique:
            uTrueIndex = np.argwhere(np.array(testSamples)[:,0] == u)[:,0]
            #true = [self.dataModel.getIidByItem(i) for i in list(np.array(testSamples)[uTrueIndex][:,1])]
            true = list(np.array(testSamples)[uTrueIndex][:,1])
            trueList.append(true)
            pre = self.recommend(u)
            recommendList.append(pre)
        e = Eval()
        result = e.evalAll(recommendList, trueList)
        print 'vsrank result:'+'('+str(self.get_params())+')'+str((result)['F1'])
        return (result)['F1']
class ItemCF(BaseEstimator): def __init__(self, neighbornum=5, n=5): self.neighbornum = neighbornum self.similarity = Similarity('COSINE') self.n = n def predict(self, testSamples): recList = [] for user_item in testSamples: uid = self.dataModel.getUidByUser(user_item[0]) recList.append(self.recommend(uid)) return recList def fit(self, trainSamples, trainTargets): self.dataModel = MemeryDataModel(trainSamples, trainTargets) itemsNum = self.dataModel.getItemsNum() self.simiMatrix = np.zeros((itemsNum, itemsNum)) for i in range(itemsNum): for j in range(i + 1, itemsNum): s = self.similarity.compute( self.dataModel.getUserIDsFromIid(i), self.dataModel.getUserIDsFromIid(j)) self.simiMatrix[i][j] = self.simiMatrix[j][i] = s def neighborhood(self, itemID): neighbors = np.argsort(np.array( self.simiMatrix[itemID]))[-1:-self.neighbornum - 1:-1] return neighbors def predict_single(self, userID, itemID): rating = 0.0 for iid in self.neighborhood(itemID): if userID in self.dataModel.getUserIDsFromIid(iid): rating += self.simiMatrix[itemID][ iid] * self.dataModel.getRating(userID, iid) return rating def recommend(self, u): userID = self.dataModel.getUidByUser(u) if userID == -1: print 'not in test' return [] else: #interactedItems = self.dataModel.getItemIDsFromUid(userID) ratings = dict() for iid in self.dataModel.getItemIDsFromUid(userID): for niid in self.neighborhood(iid): #if iid in interactedItems: #continue r = ratings.get(iid, 0) ratings[iid] = r + self.simiMatrix[iid][ niid] * self.dataModel.getRating(userID, niid) r = [ x for (x, y) in sorted(ratings.items(), lambda a, b: cmp(a[1], b[1]), reverse=True)[:self.n] ] return [self.dataModel.getItemByIid(i) for i in r] def score(self, testSamples, trueLabels): print 'Item_CF scoring ...' 
trueList = [] recommendList = [] user_unique = list(set(np.array(testSamples)[:, 0])) for u in user_unique: uTrueIndex = np.argwhere(np.array(testSamples)[:, 0] == u)[:, 0] #true = [self.dataModel.getIidByItem(i) for i in list(np.array(testSamples)[uTrueIndex][:,1])] true = list(np.array(testSamples)[uTrueIndex][:, 1]) trueList.append(true) pre = self.recommend(u) recommendList.append(pre) e = Eval() result = e.evalAll(trueList, recommendList) print 'ItemCF result:' + '(' + str(self.get_params()) + ')' + str( (result)['F1']) return (result)['F1']
def fit(self, trainSamples, trainTargets):
    """Extract per-user preference pairs, TF-IDF weight them, and build a
    user-user similarity matrix from the weighted pair vectors.

    Sets self.dataModel, self.T (per-user preference-pair dicts) and
    self.simiMatrix (symmetric user-user cosine similarity).
    NOTE(review): duplicates VSRankPlus.fit defined earlier in this file.
    """
    self.dataModel = MemeryDataModel(trainSamples, trainTargets, hasTimes=True)
    usersNum = self.dataModel.getUsersNum()
    itemsNum = self.dataModel.getItemsNum()
    all_item_set = set(range(itemsNum))
    # T[uid] maps "i1 i2" -> 1 for every pair where the user rated i1 above i2.
    self.T = [{} for i in range(usersNum)]
    for uid in range(usersNum):
        purchased_items = self.dataModel.getItemIDsFromUid(uid)
        for i in range(len(purchased_items)):
            for j in range(i + 1, len(purchased_items)):
                rating_i = self.dataModel.getRating(uid, purchased_items[i])
                rating_j = self.dataModel.getRating(uid, purchased_items[j])
                if rating_i > rating_j:
                    key = str(purchased_items[i]) + " " + str(purchased_items[j])
                elif rating_i < rating_j:
                    key = str(purchased_items[j]) + " " + str(purchased_items[i])
                else:
                    continue  # equal ratings carry no preference
                self.T[uid][key] = 1
    # for i in purchased_items:
    #     purchased_items = self.dataModel.getItemIDsFromUid(uid)
    #     unpurchased_items = random.sample(all_item_set.difference(purchased_items), self.sample_rate)
    #     for j in unpurchased_items:
    #         key = str(i) + " " + str(j)
    #         self.T[uid][key] = 1
    # idf["i1 i2"]: how informative the directed pair is across all users.
    idf = {}
    pair_sum = [[0] * itemsNum for i in range(itemsNum)]
    for uid in range(usersNum):
        for t, times in self.T[uid].iteritems():
            i1, i2 = t.split(" ")
            pair_sum[int(i1)][int(i2)] += 1
    for i1 in range(itemsNum):
        for i2 in range(itemsNum):
            if pair_sum[i1][i2] != 0:
                key = str(i1) + ' ' + str(i2)
                sum = pair_sum[i1][i2] + pair_sum[i2][i1]  # total votes either direction
                # alpha blends the direction-entropy term with 1 based on
                # how many users voted on this pair at all
                alpha = log10(1 + 9.0 * sum / usersNum)
                idf[key] = alpha * log2(sum * 1.0 / pair_sum[i1][i2]) + (1 - alpha)
    # W[uid]: TF-IDF weighted preference vector (tf from the rating gap).
    W = [{} for i in range(usersNum)]
    for uid in range(usersNum):
        for t, times in self.T[uid].iteritems():
            i1, i2 = t.split(" ")
            diff = self.dataModel.getRating(uid, int(i1)) - self.dataModel.getRating(uid, int(i2))
            # if diff != 1:
            #     print 'error!'
            tf = log2(1 + abs(diff))
            if diff < 0:
                tf = -tf  # sign preserves the preference direction
            W[uid][t] = tf * idf[t]
    # symmetric user-user cosine similarity over the weighted pair vectors
    self.simiMatrix = np.zeros((usersNum, usersNum))
    for i in range(usersNum):
        for j in range(i + 1, usersNum):
            s = self.cos(W[i], W[j])
            self.simiMatrix[i][j] = self.simiMatrix[j][i] = s