def gen_items_popular(self, trainSamples, trainTargets, hasTimes=False):
    """Count the nonzero training entries recorded for every item.

    Rebuilds ``self.dataModel`` from the given training data and stores the
    resulting per-item counts in ``self.popItems`` (a float array with one
    slot per item index).

    Args:
        trainSamples: Training samples, forwarded to ``MemeryDataModel``.
        trainTargets: Training targets, forwarded to ``MemeryDataModel``.
        hasTimes: Accepted for interface compatibility; not used here.
    """
    self.dataModel = MemeryDataModel(trainSamples, trainTargets)
    item_counts = np.zeros(self.dataModel.getItemsNum())
    # nonzero() yields one index array per axis; the second one is used as
    # the item index of each nonzero entry.
    item_indices = self.dataModel.getData().nonzero()[1]
    # np.add.at accumulates correctly when the same index occurs repeatedly
    # (a plain fancy-indexed "+=" would count each item at most once).
    np.add.at(item_counts, item_indices, 1)
    self.popItems = item_counts
def fit(self, trainSamples, trainTargets):
    """Build the symmetric user-to-user similarity matrix.

    Rebuilds ``self.dataModel`` and fills ``self.simiMatrix`` (shape
    ``usersNum x usersNum``) with ``self.similarity.compute`` applied to the
    item-ID collections of every pair of distinct users. The diagonal is
    left at zero.

    Args:
        trainSamples: Training samples, forwarded to ``MemeryDataModel``.
        trainTargets: Training targets, forwarded to ``MemeryDataModel``.
    """
    self.dataModel = MemeryDataModel(trainSamples, trainTargets)
    usersNum = self.dataModel.getUsersNum()
    # Look up each user's items once instead of once per user pair.
    items_by_user = [
        self.dataModel.getItemIDsFromUid(uid) for uid in range(usersNum)
    ]
    self.simiMatrix = np.zeros((usersNum, usersNum))
    for i in range(usersNum):
        for j in range(i + 1, usersNum):
            score = self.similarity.compute(items_by_user[i], items_by_user[j])
            # The matrix is symmetric, so mirror each computed score.
            self.simiMatrix[i, j] = self.simiMatrix[j, i] = score
def fit(self, trainSamples, trainTargets):
    """Factorise the training matrix with ``ProjectedGradientNMF``.

    Rebuilds ``self.dataModel`` and stores the two factor matrices:
    ``self.pu`` is the transformed data returned by ``fit_transform`` and
    ``self.qi`` is the transpose of the fitted model's ``components_``.

    Args:
        trainSamples: Training samples, forwarded to ``MemeryDataModel``.
        trainTargets: Training targets, forwarded to ``MemeryDataModel``.
    """
    self.dataModel = MemeryDataModel(trainSamples, trainTargets)
    V = self.dataModel.getData()
    model = ProjectedGradientNMF(
        n_components=self.factors, max_iter=1000, nls_max_iter=1000)
    # fit_transform already fits the model, so read components_ from that
    # same fit. Fitting a second time doubles the training cost and, with a
    # non-deterministic initialisation, could yield factors that do not
    # belong to the same solution as self.pu.
    self.pu = model.fit_transform(V)
    self.qi = model.components_.transpose()
def fit(self, trainSamples, trainTargets):
    """Build the symmetric item-to-item similarity matrix.

    Rebuilds ``self.dataModel`` and fills ``self.simiMatrix`` (shape
    ``itemsNum x itemsNum``) with ``self.similarity.compute`` applied to the
    user-ID collections of every pair of distinct items. The diagonal is
    left at zero.

    Args:
        trainSamples: Training samples, forwarded to ``MemeryDataModel``.
        trainTargets: Training targets, forwarded to ``MemeryDataModel``.
    """
    self.dataModel = MemeryDataModel(trainSamples, trainTargets)
    itemsNum = self.dataModel.getItemsNum()
    # Look up each item's users once instead of once per item pair.
    users_by_item = [
        self.dataModel.getUserIDsFromIid(iid) for iid in range(itemsNum)
    ]
    self.simiMatrix = np.zeros((itemsNum, itemsNum))
    for i in range(itemsNum):
        for j in range(i + 1, itemsNum):
            score = self.similarity.compute(users_by_item[i], users_by_item[j])
            # The matrix is symmetric, so mirror each computed score.
            self.simiMatrix[i, j] = self.simiMatrix[j, i] = score
def fit(self, trainSamples, trainTargets):
    """Build a user similarity matrix from weighted pairwise preferences.

    Steps:
      1. For every user, record each ordered item pair ``"hi lo"`` where the
         user rated ``hi`` strictly above ``lo`` (stored in ``self.T``).
      2. Count, over all users, how often each ordered pair occurs and derive
         a per-pair weight ``idf`` from that count and the count of the
         reversed pair.
      3. Weight each user's pairs by ``log2(1 + |rating difference|) * idf``.
      4. Fill ``self.simiMatrix`` with ``self.cos`` of the weighted pair
         dictionaries of every two distinct users.

    Args:
        trainSamples: Training samples, forwarded to ``MemeryDataModel``.
        trainTargets: Training targets, forwarded to ``MemeryDataModel``.
    """
    self.dataModel = MemeryDataModel(trainSamples, trainTargets, isRating=True)
    usersNum = self.dataModel.getUsersNum()
    itemsNum = self.dataModel.getItemsNum()

    # Step 1: per-user strict preferences, keyed "preferred other".
    self.T = [{} for _ in range(usersNum)]
    for uid in range(usersNum):
        purchased_items = self.dataModel.getItemIDsFromUid(uid)
        # Fetch each rating once rather than once per pair it takes part in.
        rated = [(item, self.dataModel.getRating(uid, item))
                 for item in purchased_items]
        for pos, (item_a, rating_a) in enumerate(rated):
            for item_b, rating_b in rated[pos + 1:]:
                if rating_a > rating_b:
                    key = str(item_a) + " " + str(item_b)
                elif rating_a < rating_b:
                    key = str(item_b) + " " + str(item_a)
                else:
                    # Equal ratings express no preference.
                    continue
                self.T[uid][key] = 1

    # Report how many preference pairs each user contributed. Written so the
    # output is the same under a print statement or a print function.
    for uid in range(usersNum):
        print("%s %s" % (self.dataModel.getUserByUid(uid), len(self.T[uid])))

    # Step 2: pair_sum[a][b] = number of users preferring a over b.
    pair_sum = [[0] * itemsNum for _ in range(itemsNum)]
    for uid in range(usersNum):
        for pair_key in self.T[uid]:
            i1, i2 = pair_key.split(" ")
            pair_sum[int(i1)][int(i2)] += 1

    idf = {}
    for i1 in range(itemsNum):
        for i2 in range(itemsNum):
            forward = pair_sum[i1][i2]
            if forward != 0:
                # Users expressing either direction of this preference.
                total = forward + pair_sum[i2][i1]
                alpha = log10(1 + 9.0 * total / usersNum)
                idf[str(i1) + ' ' + str(i2)] = (
                    alpha * log2(total * 1.0 / forward) + (1 - alpha))

    # Step 3: per-user weighted preference vectors.
    W = [{} for _ in range(usersNum)]
    for uid in range(usersNum):
        for pair_key in self.T[uid]:
            i1, i2 = pair_key.split(" ")
            diff = (self.dataModel.getRating(uid, int(i1))
                    - self.dataModel.getRating(uid, int(i2)))
            tf = log2(1 + abs(diff))
            W[uid][pair_key] = tf * idf[pair_key]

    # Step 4: symmetric similarity matrix; the diagonal stays zero.
    self.simiMatrix = np.zeros((usersNum, usersNum))
    for i in range(usersNum):
        for j in range(i + 1, usersNum):
            score = self.cos(W[i], W[j])
            self.simiMatrix[i, j] = self.simiMatrix[j, i] = score
def fit(self, trainSamples, trainTargets):
    """Train user/item factors from sampled (user, item i, item j) triples.

    Rebuilds ``self.dataModel``, randomly initialises ``self.user_factors``
    and ``self.item_factors`` (and zeroes ``self.item_bias``), draws a fixed
    set of triples into ``self.loss_samples`` for monitoring, then runs up to
    ``self.iter`` passes of ``self.update_factors`` over freshly sampled
    triples. Training stops early when the loss changes by less than 0.01 or
    increases. ``self.learning_rate`` is multiplied by 0.9 after every pass
    that does not stop training.

    NOTE(review): the learning-rate decay mutates ``self.learning_rate``
    itself, so a second ``fit`` call starts from the decayed value -- confirm
    this is intended.

    Args:
        trainSamples: Training samples, forwarded to ``MemeryDataModel``.
        trainTargets: Training targets, forwarded to ``MemeryDataModel``.
    """
    self.dataModel = MemeryDataModel(trainSamples, trainTargets)
    users_num = self.dataModel.getUsersNum()
    items_num = self.dataModel.getItemsNum()

    # Small random initial factors, scaled down by sqrt(number of factors).
    scale = math.sqrt(self.factors)
    self.item_bias = np.zeros(items_num)
    self.user_factors = np.array([
        [0.1 * random.random() / scale for _ in range(self.factors)]
        for _ in range(users_num)
    ])
    self.item_factors = np.array([
        [0.1 * random.random() / scale for _ in range(self.factors)]
        for _ in range(items_num)
    ])

    # Fixed sample of triples on which the loss is monitored.
    num_loss_samples = int(100 * users_num ** 0.5)
    loss_sampler = UniformUserUniformItem(True)
    self.loss_samples = list(
        loss_sampler.generate_samples(self.dataModel, num_loss_samples))
    old_loss = self.loss()

    update_sampler = UniformPairWithoutReplacement(True)
    for _ in range(self.iter):
        for u, i, j in update_sampler.generate_samples(self.dataModel):
            self.update_factors(u, i, j)
        # Evaluate the loss once per pass and reuse the value in every check.
        new_loss = self.loss()
        if abs(new_loss - old_loss) < 0.01 or new_loss - old_loss > 0:
            break
        old_loss = new_loss
        self.learning_rate *= 0.9
def fit(self, trainSamples, trainTargets):
    """Train biases and latent factors with per-sample updates.

    Rebuilds ``self.dataModel``, sets ``self.mu`` to the mean of
    ``trainTargets``, zeroes the user/item biases (``self.bu``, ``self.bi``)
    and randomly initialises the factor rows ``self.pu`` / ``self.qi``. It
    then runs ``self.iter`` passes over the training rows in a fresh random
    order, nudging biases and factors by the prediction error of
    ``self.predict_single``. ``self.learningrate`` is multiplied by 0.93
    after every pass.

    ``self.pu`` and ``self.qi`` are lists of rows; a row starts out as a
    plain list and becomes a NumPy array once it has been updated.

    NOTE(review): the decay mutates ``self.learningrate`` itself, so a second
    ``fit`` call starts from the decayed value -- confirm this is intended.

    Args:
        trainSamples: Training samples, forwarded to ``MemeryDataModel``.
        trainTargets: Training targets; also used to compute ``self.mu``.
    """
    self.dataModel = MemeryDataModel(trainSamples, trainTargets)
    self.mu = np.array(trainTargets).mean()
    self.bu = np.zeros(self.dataModel.getUsersNum())
    self.bi = np.zeros(self.dataModel.getItemsNum())

    # Small random initial factors, scaled down by sqrt(number of factors).
    scale = math.sqrt(self.factors)
    self.qi = [[(0.1 * random.random() / scale) for _ in range(self.factors)]
               for _ in range(self.dataModel.getItemsNum())]
    self.pu = [[(0.1 * random.random() / scale) for _ in range(self.factors)]
               for _ in range(self.dataModel.getUsersNum())]

    lineData = self.dataModel.getLineData()
    lengthOfTrain = len(lineData)
    for _ in range(self.iter):
        rate = self.learningrate
        # Visit the training rows in a fresh random order on every pass.
        for n in np.random.permutation(lengthOfTrain):
            row = lineData[n]
            uid = self.dataModel.getUidByUser(row[0])
            iid = self.dataModel.getIidByItem(row[1])
            rating = row[2]
            eui = rating - self.predict_single(uid, iid)

            self.bu[uid] += rate * (eui - self.userregular * self.bu[uid])
            self.bi[iid] += rate * (eui - self.itemregular * self.bi[iid])

            # Take real copies of both rows before touching either one. A
            # bare reference would alias the row once it is an ndarray, so
            # the user update would see the already-updated item factors.
            old_qi = np.array(self.qi[iid])
            old_pu = np.array(self.pu[uid])
            self.qi[iid] = old_qi + rate * (
                eui * old_pu - self.itemregular * old_qi)
            self.pu[uid] = old_pu + rate * (
                eui * old_qi - self.userregular * old_pu)
        self.learningrate *= 0.93
def fit(self, trainSamples, trainTargets):
    """Build a user similarity matrix from weighted pairwise preferences.

    The data model is created with ``hasTimes=True``. Steps:
      1. For every user, record each ordered item pair ``"hi lo"`` where
         ``getRating`` for ``hi`` is strictly above that for ``lo`` (stored
         in ``self.T``).
      2. Count, over all users, how often each ordered pair occurs and derive
         a per-pair weight ``idf`` from that count and the count of the
         reversed pair.
      3. Weight each user's pairs by ``log2(1 + |difference|) * idf``, negated
         when the difference is negative.
      4. Fill ``self.simiMatrix`` with ``self.cos`` of the weighted pair
         dictionaries of every two distinct users.

    Args:
        trainSamples: Training samples, forwarded to ``MemeryDataModel``.
        trainTargets: Training targets, forwarded to ``MemeryDataModel``.
    """
    self.dataModel = MemeryDataModel(trainSamples, trainTargets, hasTimes=True)
    usersNum = self.dataModel.getUsersNum()
    itemsNum = self.dataModel.getItemsNum()

    # Step 1: per-user strict preferences, keyed "preferred other".
    self.T = [{} for _ in range(usersNum)]
    for uid in range(usersNum):
        purchased_items = self.dataModel.getItemIDsFromUid(uid)
        # Fetch each rating once rather than once per pair it takes part in.
        rated = [(item, self.dataModel.getRating(uid, item))
                 for item in purchased_items]
        for pos, (item_a, rating_a) in enumerate(rated):
            for item_b, rating_b in rated[pos + 1:]:
                if rating_a > rating_b:
                    key = str(item_a) + " " + str(item_b)
                elif rating_a < rating_b:
                    key = str(item_b) + " " + str(item_a)
                else:
                    # Equal values express no preference.
                    continue
                self.T[uid][key] = 1

    # Step 2: pair_sum[a][b] = number of users preferring a over b.
    pair_sum = [[0] * itemsNum for _ in range(itemsNum)]
    for uid in range(usersNum):
        for pair_key in self.T[uid]:
            i1, i2 = pair_key.split(" ")
            pair_sum[int(i1)][int(i2)] += 1

    idf = {}
    for i1 in range(itemsNum):
        for i2 in range(itemsNum):
            forward = pair_sum[i1][i2]
            if forward != 0:
                # Users expressing either direction of this preference.
                total = forward + pair_sum[i2][i1]
                alpha = log10(1 + 9.0 * total / usersNum)
                idf[str(i1) + ' ' + str(i2)] = (
                    alpha * log2(total * 1.0 / forward) + (1 - alpha))

    # Step 3: per-user weighted preference vectors.
    W = [{} for _ in range(usersNum)]
    for uid in range(usersNum):
        for pair_key in self.T[uid]:
            i1, i2 = pair_key.split(" ")
            diff = (self.dataModel.getRating(uid, int(i1))
                    - self.dataModel.getRating(uid, int(i2)))
            tf = log2(1 + abs(diff))
            # Keys in self.T put the higher-rated item first, so diff is
            # expected to be positive; the sign guard is kept defensively.
            if diff < 0:
                tf = -tf
            W[uid][pair_key] = tf * idf[pair_key]

    # Step 4: symmetric similarity matrix; the diagonal stays zero.
    self.simiMatrix = np.zeros((usersNum, usersNum))
    for i in range(usersNum):
        for j in range(i + 1, usersNum):
            score = self.cos(W[i], W[j])
            self.simiMatrix[i, j] = self.simiMatrix[j, i] = score