def active_learning(simClass, start, end, simItem_k, topUser_k, itemsFrame, ratingsFrame, users_rating):
    """Run one active-learning round and return the MF evaluation result.

    Ratings whose item id lies in [start, end] are split off as the test
    set; for each such "new" item the simItem_k most similar items are
    computed, users who rated those similar items are scored, and any
    existing ratings by the topUser_k best-scored users on the new item are
    moved from the test set back into the training set ("actively queried")
    before matrix factorization is evaluated.

    Assumptions (not fully verifiable from this block — confirm at callers):
        simClass: exposes generate_topk_item_similarity(item_range, k, asins)
        start, end: item-id interval of the new/test items
        itemsFrame: DataFrame with an "asin" column
        ratingsFrame: DataFrame with "userID" and "itemID" columns; field 1
            of its records is presumably the item id — verify
        users_rating: passed through to Score(); schema unknown here
    """
    #### Split training set and test set ####
    ratings_train = ratingsFrame.to_records(index=False).tolist()
    ratings_test = []
    for ratingNum in range(ratingsFrame.shape[0]):
        # Assumes rows are sorted by item id so the test interval is one
        # contiguous run — TODO confirm against the data loader.
        if (ratings_train[ratingNum][1] == start):
            test_start = ratingNum
            while (ratingNum in range(len(ratings_train))
                   and ratings_train[ratingNum][1] <= end):
                ratings_test.append(ratings_train[ratingNum])
                ratingNum += 1
            test_end = ratingNum
            break
    # Drop the test run from the training list.
    ratings_train = ratings_train[0:test_start] + ratings_train[test_end:]
    print("pre-split DONE")

    #### find k similar items for each new item ####
    print("test interval: [" + str(start) + ":" + str(end) + "]")
    sims = simClass.generate_topk_item_similarity(
        range(start, end), simItem_k,
        itemsFrame.loc[:, "asin"].tolist())
    print("Sims calculation DONE")

    #### Calculate Propability for each new item ####
    ratings_set = ratingsFrame.loc[:, ["userID", "itemID"]].to_records(
        index=False).tolist()
    ratings_origin = ratingsFrame.to_records(index=False).tolist()
    item_groups = ratingsFrame.groupby("itemID")
    for item in sims:
        # first, find the set of users who have rated items within sim set.
        users_rated_sims = []
        for item_sim in sims[item]:
            users_rated_sims += item_groups.get_group(
                item_sim)["userID"].tolist()
        users_rated_sims = list(set(users_rated_sims))
        # Score each candidate user and take the top-k as "oracle" users.
        user_score = Score(users_rated_sims, users_rating, sims[item])
        user_score = sorted(user_score.items(), key=lambda d: d[1],
                            reverse=True)
        for top in range(topUser_k):
            t = (user_score[top][0], item)
            if t in ratings_set:
                # The queried user actually rated this new item: reveal the
                # rating by moving it from the test set to the training set.
                index = ratings_set.index(t)
                ratings_test.remove(ratings_origin[index])
                ratings_train.append(ratings_origin[index])
    print("active learning ALL DONE")

    ##### Caculate RMSE for each iteration2 #####
    mf = MatrixFactorization()
    return mf.matrix_factorization(ratings_train, ratings_test)
def run_matrix_factorization():
    """
    Executes the Matrix Factorization model on the ratings dataset.

    Reads the tab-separated ratings file, sizes the model from the data,
    splits into train/validation, and trains with a learning-rate scheduler.
    """
    ratings = pd.read_csv('../data/ratings2.csv', sep='\t')
    n_users = len(ratings.buyer_id.unique())
    n_items = ratings.product_id.max() + 1

    train_set, val_set = train_test_split(ratings)

    mf_model = MatrixFactorization(train_set, n_users, n_items,
                                   num_latent_factors=20)
    mf_model.train(max_iter=20, learning_rate=0.01, regularize=0.5,
                   val=val_set, lr_scheduler=True)
def test_matrix_factorization():
    """Smoke-test mf.run on a tiny fixed rating matrix."""
    print('mf test.............')
    since = time.time()

    # number of latent factors
    K = 2
    # m x n rating matrix (0 = unrated)
    R = np.array([[5, 3, 0, 1],
                  [4, 0, 0, 1],
                  [1, 1, 0, 5],
                  [1, 0, 0, 4],
                  [0, 1, 5, 4]])
    m, n = R.shape

    # random initial factors: user matrix P (m x K), item matrix Q (n x K)
    P = np.random.rand(m, K)
    Q = np.random.rand(n, K)

    P_new, Q_new = mf.run(R, P, Q, K,
                          steps=5000, alpha=0.0002, beta=0.2,
                          threshold=0.001)
    evaluate(P, Q, R, P_new, Q_new)
    # Deliberate failure so pytest shows the captured output.
    assert False
def test_movielens(load_dataset): print('mf test.............') # Load dataset df_rating = load_dataset # print(df_rating.head()) m = max(np.unique(df_rating.userId)) n = max(np.unique(df_rating.movieId)) # m = len(np.unique(df_rating.userId)) # n = len(np.unique(df_rating.movieId)) print('m x n: {} x {}'.format(m, n)) R = np.zeros([m, n]) print(type(df_rating.userId[0])) print(type(df_rating.movieId[0])) print(R[df_rating.userId[0]][df_rating.movieId[0]]) print(np.unique(df_rating.rating)) print(sum(df_rating.rating.isnull())) for i, s in df_rating.iterrows(): if s.rating == 0: continue u_id = int(s.userId) m_id = int(s.movieId) R[u_id - 1][m_id - 1] = s.rating print(R) # 潜在因子数 K = 2 # m x n のレーティング行列 m, n = df_rating[['userId', 'movieId']].shape print(m, n) R = np.array([[5, 3, 0, 1], [4, 0, 0, 1], [1, 1, 0, 5], [1, 0, 0, 4], [0, 1, 5, 4]]) m, n = R.shape # m x K のユーザ行列P P = np.random.rand(m, K) # n x K のアイテム行列Q Q = np.random.rand(n, K) P_new, Q_new = mf.run(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.2, threshold=0.001) evaluate(P, Q, R, P_new, Q_new) assert False
def generate_prediction_model(lr_bound, tree, rI, sMatrix, plambda_candidates, validation_set):
    '''Train an MF model per tree level and select the best lambda by RMSE.

    Args:
        lr_bound: dict mapping level -> list of [left, right] index bounds
            into `tree`, one pair per node on that level (bounds are
            indices, not ids).
        tree: flat list of user ids ordered so each node's users are
            contiguous.
        rI: item id collection (unused here; kept for interface
            compatibility).
        sMatrix: item x user sparse rating matrix.
        plambda_candidates: dict mapping level -> list of candidate
            regularization parameters to validate.
        validation_set: sparse matrix of held-out ratings.

    Returns:
        prediction_model: dict mapping level -> {'upro': user profile,
        'ipro': item profile, 'plambda': best lambda}, chosen by minimum
        validation RMSE.
    '''
    MF = MatrixFactorization()
    prediction_model = {}
    # find() yields (item_indices, user_indices) of the non-zero entries.
    val_item_list = find(validation_set)[0]
    val_user_list = find(validation_set)[1]
    user_node_ind = np.zeros(
        sMatrix.shape[1])  #### notice that index is not id
    for level in lr_bound:
        prediction_model.setdefault(level, {})
        train_lst = []
        for userid, pseudo_user_bound in enumerate(lr_bound[level]):
            if pseudo_user_bound[0] > pseudo_user_bound[1]:
                continue  # empty node
            pseudo_user_lst = tree[pseudo_user_bound[0]:(pseudo_user_bound[1]
                                                         + 1)]
            pseudo_user_for_item = calculate_avg_rating_for_pesudo_user(
                pseudo_user_lst, sMatrix)
            # One pseudo-user per node: its "ratings" are the node averages.
            train_lst += [(userid, int(itemid),
                           float(pseudo_user_for_item[itemid]))
                          for itemid in range(pseudo_user_for_item.shape[0])
                          if pseudo_user_for_item[itemid]]
            #### find node index for each validation user ####
            user_node_ind[pseudo_user_lst] = userid
        #### Train MF and Do validation ####
        # BUG FIX: original compared `min_RMSE is -1` — an identity test on
        # an int literal that only "works" via CPython small-int caching.
        # Use a None sentinel with `is None` instead.
        min_RMSE = None
        min_user_profile, min_item_profile, min_lambda = None, None, None
        for plambda in plambda_candidates[level]:
            MF.change_parameter(plambda)
            user_profile, item_profile = MF.matrix_factorization(train_lst)
            RMSE = pred_RMSE_for_validate_user(user_node_ind, user_profile,
                                               item_profile, val_user_list,
                                               val_item_list, sMatrix)
            if min_RMSE is None or RMSE < min_RMSE:
                min_RMSE = RMSE
                min_user_profile, min_item_profile, min_lambda = \
                    user_profile, item_profile, plambda
        prediction_model[level]['upro'], prediction_model[level]['ipro'], \
            prediction_model[level]['plambda'] = \
            min_user_profile, min_item_profile, min_lambda
    MF.end()  #### close MF spark session
    return prediction_model
def main():
    """Command-line test driver: train MF on MovieLens and report how many
    rounded test predictions match the true ratings."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", default="ml-100k/")
    parser.add_argument("--base", default="u1.base")
    parser.add_argument("--test", default="u1.test")
    parser.add_argument("--a", default=0.01, type=float)
    parser.add_argument("--b", default=0.2, type=float)
    parser.add_argument("--K", default=50, type=int)
    parser.add_argument("--tol", default=3.e-2, type=float)
    args = parser.parse_args()

    # Load the rating matrices.
    R, R_test, user_size, item_size = get_rating(data_dir=args.data_dir,
                                                 base=args.base,
                                                 test=args.test,
                                                 debug=True)

    # Fit the model.
    mf = MatrixFactorization(R=R, R_test=R_test,
                             user_size=user_size, item_size=item_size,
                             K=args.K, a=args.a, b=args.b)
    print("training...")
    mf.train(tol=args.tol, debug=True)

    # Count exact matches between rounded predictions and true ratings;
    # user/item ids in R_test are 1-based.
    print("The number of test data is {}.".format(R_test.shape[0]))
    icor = sum(
        1 for row in R_test
        if np.round(mf.rating(iu=int(row[0]) - 1, ii=int(row[1]) - 1)) == row[2])
    print("The number of correct predictions is {}.".format(icor))
def __init__(self, data_path=None):
    """Build the context lookup from the item-factor matrix Q.

    Keys are stringified row indices; values are the factor vectors.
    """
    mf = MatrixFactorization(data_path)  # TODO: Correct?
    self._context_data = {}
    for row_idx in range(len(mf.Q)):
        self._context_data[str(row_idx)] = mf.Q[row_idx]
#!/usr/bin/env python3
from matrix_factorization import MatrixFactorization

# Dataset locations for the MovieLens "latest-small" release.
DATA = {
    "DATA_DIR": "./ml-latest-small",
    "RATING_FILE": "ratings.csv",
    "MOVIE_FILE": "movies.csv",
    "TAG_FILE": "tags.csv",
}

if __name__ == "__main__":
    # Train the collaborative filter, then predict user 1's rating for
    # item 110.
    recommender = MatrixFactorization(DATA)
    recommender.train()
    recommender.predict_single(1, 110)
# load MovieLens data
num_user, num_item, ratings = load_ml_1m()
np.random.shuffle(ratings)

# set feature numbers
num_feature = 10

# set max_iterations
max_iter = 20

# split data to training & testing (90/10)
train_pct = 0.9
train_size = int(train_pct * len(ratings))
train = ratings[:train_size]
validation = ratings[train_size:]

# models
rec = MatrixFactorization(num_user, num_item, num_feature,
                          train, validation, max_rating=5, min_rating=1)

# fitting
rec.estimate(max_iter)

# results: RMSE on train and held-out validation ratings (column 2).
train_preds = rec.predict(train)
train_rmse = RMSE(train_preds, np.float16(train[:, 2]))
validation_preds = rec.predict(validation)
validation_rmse = RMSE(validation_preds, np.float16(validation[:, 2]))

# BUG FIX: this line was a Python 2 print statement, a syntax error under
# Python 3 (which the rest of the file targets). Message text unchanged.
print("train RMSE: %.6f, validation RMSE: %.6f " % (train_rmse, validation_rmse))
# Prepare the data: shuffle indices, then take 80% training / 20% testing.
data = proprocess()
num_data = data.shape[0]
num_training = math.ceil(num_data * 0.8)
shuffled_idx = np.arange(num_data)
np.random.shuffle(shuffled_idx)
training = data[shuffled_idx[:num_training], :]
testing = data[shuffled_idx[num_training:], :]

# Run the deep model NUM_TEST times and keep the lowest test score.
best_score = math.inf
for _ in range(NUM_TEST):
    dmf = DeepMatrixFactorization(NUM_USER, NUM_MOVIE, EMBEDDING_DIM,
                                  HIDDEN_DIM)
    dmf.learn(training[:, 0], training[:, 1], training[:, 2], 2000, 500)
    best_score = min(best_score,
                     dmf.test(testing[:, 0], testing[:, 1], testing[:, 2]))
print("Best score for deep matrix factorization is %f" % best_score)

# Same protocol for the plain matrix-factorization baseline.
best_score = math.inf
for _ in range(NUM_TEST):
    mf = MatrixFactorization(NUM_USER, NUM_MOVIE, EMBEDDING_DIM)
    mf.learn(training[:, 0], training[:, 1], training[:, 2], 2000, 500)
    best_score = min(best_score,
                     mf.test(testing[:, 0], testing[:, 1], testing[:, 2]))
print("Best score for matrix factorization is %f" % best_score)
def __init__(self, sMatrix, depth_threshold=6, plambda=7, MSP_item=200):
    '''Build the per-user rating maps and split statistics from sMatrix.

    Args:
        sMatrix: item x user (I*U) sparse rating matrix.
        depth_threshold: depth at which tree construction terminates.
        plambda: regularization parameter for the user bias.
        MSP_item: number of most-popular items considered per split.

    Attributes initialized here:
        rI: list of item indices that have at least one rating.
        rU: dict userid -> {itemid: rating}.
        tree: flat list of all user ids (reordered in place during splits).
        lr_bound: dict level -> list of [left, right] user-index bounds
            (bounds are indices into `tree`, not ids).
        split_item: per-level list of chosen split items (filled later).
        biasU: dict userid -> regularized user bias.
        sum_cur_t / sum_2_cur_t / sum_cntt: per-item sum of (rating - bias),
            sum of its square, and rating count for the current node.
        user_profile / item_profile: per-level MF profiles (filled later).
        global_mean: mean of all ratings.
        node_num / cur_node: total vs. visited node count, for progress
            reporting only (full ternary tree: sum of 3**i).
    '''
    self.depth_threshold = depth_threshold
    self.plambda = plambda
    self.cur_depth = 0
    self.MSP_item = MSP_item
    self.real_item_num = sMatrix.shape[0]
    # find() yields (item_indices, user_indices) of the non-zero entries.
    x = find(sMatrix)
    itemset = x[0]
    userset = x[1]
    self.rU = {}
    self.sum_cur_t = {}
    self.sum_2_cur_t = {}
    self.global_mean = 0  # global average of ratings
    #### Calculate rate of progress ####
    self.node_num = 0
    self.cur_node = 0
    for i in range(self.depth_threshold):
        self.node_num += 3**i
    #### Generate rI, rU ####
    self.rI = list(set(sMatrix.nonzero()[0]))
    for itemid, userid in zip(itemset, userset):
        self.rU.setdefault(userid, {})[itemid] = sMatrix[itemid, userid]
        self.global_mean += sMatrix[itemid, userid]
    self.global_mean /= len(itemset)
    self.item_size = len(self.rI)
    self.user_size = len(self.rU)
    #### Initiate Tree, lr_bound ####
    self.tree = list(self.rU.keys())
    self.split_item = []
    self.lr_bound = {'0': [[0, len(self.tree) - 1]]}
    #### Generate bias, sum_cur_t, sum_2_cur_t ####
    self.biasU = {}
    # Dense per-item accumulators (indexed by item id).
    self.sum_cur_t = np.zeros(self.real_item_num)
    self.sum_2_cur_t = np.zeros(self.real_item_num)
    self.sum_cntt = np.zeros(self.real_item_num)
    for userid in self.rU:
        # Regularized user bias, shrunk toward the global mean by plambda.
        self.biasU[userid] = (sum(list(self.rU[userid].values())) +
                              self.plambda * self.global_mean) / (
                                  self.plambda + len(self.rU[userid]))
        user_all_rating_id = np.array(list(self.rU[userid].keys()))
        user_all_rating = np.array(list(self.rU[userid].values()))
        # Vectorized scatter-add of the user's bias-corrected ratings.
        self.sum_cur_t[
            user_all_rating_id[:]] += user_all_rating[:] - self.biasU[
                userid]
        self.sum_2_cur_t[user_all_rating_id[:]] += (user_all_rating[:] -
                                                    self.biasU[userid])**2
        self.sum_cntt[user_all_rating_id[:]] += 1
    #### Prediction Model ####
    self.user_profile = {}
    self.item_profile = {}
    self.MF = MatrixFactorization()
    print("Initiation DONE!")
class DecisionTreeModel:
    """Ternary decision tree for cold-start interview, backed by MF.

    Users are recursively partitioned by their answer to one split item per
    node (LIKE >= 4 / DISLIKE <= 3 / UNKNOWN = unrated). Each node is then
    treated as a pseudo-user whose ratings are the node averages, and a
    matrix-factorization model is trained per tree level for prediction.
    """

    def __init__(self, sMatrix, depth_threshold=6, plambda=7, MSP_item=200):
        '''Build the per-user rating maps and split statistics from sMatrix.

        Args:
            sMatrix: item x user (I*U) sparse rating matrix.
            depth_threshold: depth at which tree construction terminates.
            plambda: regularization parameter for the user bias.
            MSP_item: number of most-popular items considered per split.

        Attributes initialized here:
            rI: list of item indices that have at least one rating.
            rU: dict userid -> {itemid: rating}.
            tree: flat list of all user ids (reordered in place by splits).
            lr_bound: dict level -> list of [left, right] user-index bounds
                (bounds are indices into `tree`, not ids).
            split_item: per-level list of chosen split items (filled later).
            biasU: dict userid -> regularized user bias.
            sum_cur_t / sum_2_cur_t / sum_cntt: per-item sum of
                (rating - bias), sum of its square, and rating count for the
                current node.
            user_profile / item_profile: per-level MF profiles (filled by
                generate_prediction_model).
            global_mean: mean of all ratings.
            node_num / cur_node: total vs. visited node count, for progress
                reporting only (full ternary tree: sum of 3**i).
        '''
        self.depth_threshold = depth_threshold
        self.plambda = plambda
        self.cur_depth = 0
        self.MSP_item = MSP_item
        self.real_item_num = sMatrix.shape[0]
        # find() yields (item_indices, user_indices) of non-zero entries.
        x = find(sMatrix)
        itemset = x[0]
        userset = x[1]
        self.rU = {}
        self.sum_cur_t = {}
        self.sum_2_cur_t = {}
        self.global_mean = 0  # global average of ratings
        #### Calculate rate of progress ####
        self.node_num = 0
        self.cur_node = 0
        for i in range(self.depth_threshold):
            self.node_num += 3**i
        #### Generate rI, rU ####
        self.rI = list(set(sMatrix.nonzero()[0]))
        for itemid, userid in zip(itemset, userset):
            self.rU.setdefault(userid, {})[itemid] = sMatrix[itemid, userid]
            self.global_mean += sMatrix[itemid, userid]
        self.global_mean /= len(itemset)
        self.item_size = len(self.rI)
        self.user_size = len(self.rU)
        #### Initiate Tree, lr_bound ####
        self.tree = list(self.rU.keys())
        self.split_item = []
        self.lr_bound = {'0': [[0, len(self.tree) - 1]]}
        #### Generate bias, sum_cur_t, sum_2_cur_t ####
        self.biasU = {}
        # Dense per-item accumulators (indexed by item id).
        self.sum_cur_t = np.zeros(self.real_item_num)
        self.sum_2_cur_t = np.zeros(self.real_item_num)
        self.sum_cntt = np.zeros(self.real_item_num)
        for userid in self.rU:
            # Regularized user bias, shrunk toward the global mean.
            self.biasU[userid] = (sum(list(self.rU[userid].values())) +
                                  self.plambda * self.global_mean) / (
                                      self.plambda + len(self.rU[userid]))
            user_all_rating_id = np.array(list(self.rU[userid].keys()))
            user_all_rating = np.array(list(self.rU[userid].values()))
            # Vectorized scatter-add of bias-corrected ratings.
            self.sum_cur_t[
                user_all_rating_id[:]] += user_all_rating[:] - self.biasU[
                    userid]
            self.sum_2_cur_t[user_all_rating_id[:]] += (
                user_all_rating[:] - self.biasU[userid])**2
            self.sum_cntt[user_all_rating_id[:]] += 1
        #### Prediction Model ####
        self.user_profile = {}
        self.item_profile = {}
        self.MF = MatrixFactorization()
        print("Initiation DONE!")

    def calculate_error(self, sumt, sumt_2, cntt):
        '''Calculate error for one item-split in one node.

        sumt / sumt_2 / cntt are (I, 3) arrays holding, per item and per
        child (LIKE, DISLIKE, UNKNOWN), the sum of bias-corrected ratings,
        the sum of their squares, and the count. The per-cell squared error
        sum(x^2) - (sum(x))^2 / n is summed over everything; 1e-9 guards
        against division by zero for empty cells.
        '''
        Error_i = np.sum(sumt_2 - (sumt**2) / (cntt + 1e-9))
        return Error_i

    def generate_decision_tree(self, lr_bound_for_node, chosen_id):
        '''Recursively split the node covering tree[left:right+1].

        lr_bound_for_node: [leftind, rightind] user-index bounds (into
            self.tree) of the node being split.
        chosen_id: split items already used on the path from the root
            (passed by copy to each child so siblings don't interfere).

        Side effects: reorders self.tree in place, appends to
        self.split_item and self.lr_bound, and consumes/overwrites
        self.sum_cur_t / sum_2_cur_t / sum_cntt for each recursive call.
        '''
        #### Terminate ####
        self.cur_depth += 1
        if self.cur_depth > self.depth_threshold or len(
                chosen_id) == self.item_size:
            return
        #### Choose Most Popular Items of This Node ####
        num_rec = np.zeros(self.real_item_num)
        for userid in self.tree[lr_bound_for_node[0]:(lr_bound_for_node[1] +
                                                      1)]:
            user_all_rating_id = np.array(list(self.rU[userid].keys()))
            num_rec[user_all_rating_id[:]] += 1
        # NOTE(review): np.argsort is ascending, so this selects the LEAST
        # rated items; "most popular" would need np.argsort(-num_rec) or a
        # [::-1] reversal — confirm intended behavior.
        sub_item_id = np.argsort(num_rec)[:self.MSP_item]
        #### Find optimum item to split ####
        # NOTE(review): min_sumtL..min_sumtU_2 are leftovers of an older
        # dict-based implementation and are never used below.
        min_sumtL, min_sumtD, min_sumtL_2, min_sumtD_2, min_sumtU, min_sumtU_2, Error = {}, {}, {}, {}, {}, {}, {}
        min_Error = "None"  # string sentinel; replaced by a float below
        for itemid in sub_item_id:
            if itemid in chosen_id:
                continue
            ''' user_rating_item_in_nodet: [ [uid01, rating01], [uid02, rating02], ... ]
            lazily yields every user in node t who rated item `itemid` '''
            user_rating_item_in_nodet = ([
                userid, self.rU[userid][itemid]
            ] for userid in self.tree[lr_bound_for_node[0]:(
                lr_bound_for_node[1] + 1)] if itemid in self.rU[userid])
            # Columns: 0 = LIKE child, 1 = DISLIKE child, 2 = UNKNOWN child.
            sumt = np.zeros((self.real_item_num, 3))
            sumt_2 = np.zeros((self.real_item_num, 3))
            cntt = np.zeros((self.real_item_num, 3))
            for user in user_rating_item_in_nodet:
                ''' user_all_rating: array of this user's item ids / ratings '''
                user_all_rating_id = np.array(list(self.rU[user[0]].keys()))
                user_all_rating = np.array(list(self.rU[user[0]].values()))
                #### calculate sumtL for node LIKE ####
                if user[1] >= 4:
                    sumt[user_all_rating_id[:],
                         0] += user_all_rating[:] - self.biasU[user[0]]
                    sumt_2[user_all_rating_id[:],
                           0] += (user_all_rating[:] -
                                  self.biasU[user[0]])**2
                    cntt[user_all_rating_id[:], 0] += 1
                #### calculate sumtD for node DISLIKE ####
                elif user[1] <= 3:
                    sumt[user_all_rating_id[:],
                         1] += user_all_rating[:] - self.biasU[user[0]]
                    sumt_2[user_all_rating_id[:],
                           1] += (user_all_rating[:] -
                                  self.biasU[user[0]])**2
                    cntt[user_all_rating_id[:], 1] += 1
            #### calculate sumtU for node UNKNOWN ####
            # UNKNOWN = current-node totals minus the LIKE/DISLIKE parts.
            sumt[:, 2] = self.sum_cur_t[:] - sumt[:, 0] - sumt[:, 1]
            sumt_2[:, 2] = self.sum_2_cur_t[:] - sumt_2[:, 0] - sumt_2[:, 1]
            cntt[:, 2] = self.sum_cntt[:] - cntt[:, 0] - cntt[:, 1]
            Error[itemid] = self.calculate_error(sumt, sumt_2, cntt)
            # Keep the statistics of the best split seen so far.
            if min_Error == "None" or Error[itemid] < min_Error:
                min_sumt = sumt
                min_sumt_2 = sumt_2
                min_cntt = cntt
                min_Error = Error[itemid]
        #### Find optimum split-item ####
        optimum_itemid = min(Error, key=Error.get)
        if len(self.split_item) == self.cur_depth - 1:
            self.split_item.append([optimum_itemid])
        else:
            self.split_item[self.cur_depth - 1].append(optimum_itemid)
        chosen_id.append(optimum_itemid)
        #### sort tree ####
        # Reserve three new child slots for this node at the next level.
        self.lr_bound.setdefault(str(self.cur_depth), []).append([])  # for LIKE
        self.lr_bound[str(self.cur_depth)].append([])  # for DISLIKE
        self.lr_bound[str(self.cur_depth)].append([])  # for UNKNOWN
        listU, listL, listD = [], [], []
        for userid in self.tree[lr_bound_for_node[0]:(lr_bound_for_node[1] +
                                                      1)]:
            if optimum_itemid not in self.rU[userid]:
                listU.append(userid)
            elif self.rU[userid][optimum_itemid] >= 4:
                listL.append(userid)
            elif self.rU[userid][optimum_itemid] <= 3:
                listD.append(userid)
        # Reorder this node's users as LIKE | DISLIKE | UNKNOWN so each
        # child is a contiguous slice.
        self.tree[lr_bound_for_node[0]:(lr_bound_for_node[1] +
                                        1)] = listL + listD + listU
        self.lr_bound[str(self.cur_depth)][-3] = [
            lr_bound_for_node[0], lr_bound_for_node[0] + len(listL) - 1
        ]  # for LIKE
        self.lr_bound[str(self.cur_depth)][-2] = [
            lr_bound_for_node[0] + len(listL),
            lr_bound_for_node[0] + len(listL) + len(listD) - 1
        ]  # for DISLIKE
        self.lr_bound[str(self.cur_depth)][-1] = [
            lr_bound_for_node[0] + len(listL) + len(listD),
            lr_bound_for_node[0] + len(listL) + len(listD) + len(listU) - 1
        ]  # for UNKNOWN
        #### Generate Subtree of Node LIKE ####
        self.sum_cur_t = min_sumt[:, 0]
        self.sum_2_cur_t = min_sumt_2[:, 0]
        self.sum_cntt = min_cntt[:, 0]
        self.generate_decision_tree(self.lr_bound[str(self.cur_depth)][-3],
                                    chosen_id[:])
        self.cur_depth -= 1
        #### Generate Subtree of Node DISLIKE ####
        self.sum_cur_t = min_sumt[:, 1]
        self.sum_2_cur_t = min_sumt_2[:, 1]
        self.sum_cntt = min_cntt[:, 1]
        self.generate_decision_tree(self.lr_bound[str(self.cur_depth)][-2],
                                    chosen_id[:])
        self.cur_depth -= 1
        #### Generate Subtree of Node UNKNOWN ####
        self.sum_cur_t = min_sumt[:, 2]
        self.sum_2_cur_t = min_sumt_2[:, 2]
        self.sum_cntt = min_cntt[:, 2]
        self.generate_decision_tree(self.lr_bound[str(self.cur_depth)][-1],
                                    chosen_id[:])
        self.cur_depth -= 1
        #### Show Rating Progress ####
        for i in range(self.cur_depth - 1):
            print("┃", end="")
        print("┏", end="")
        self.cur_node += 1
        print("Current depth: " + str(self.cur_depth) + "    %.2f%%" %
              (100 * self.cur_node / self.node_num))

    def calculate_avg_rating_for_pesudo_user(self, pseudo_user_lst):
        '''Average the ratings of the given users per item.

        Returns ret_dict: dict {itemid: mean rating over the users in
        pseudo_user_lst who rated it}; items nobody rated are omitted.
        '''
        cal_dict = {key: {'rating': 0, 'cnt': 0} for key in self.rI}
        ret_dict = {}
        for userid in pseudo_user_lst:
            for itemid, rating in self.rU[userid].items():
                cal_dict[itemid]['rating'] += rating
                cal_dict[itemid]['cnt'] += 1
        for itemid in cal_dict:
            if cal_dict[itemid]['cnt'] == 0:
                continue
            ret_dict[
                itemid] = cal_dict[itemid]['rating'] / cal_dict[itemid]['cnt']
        return ret_dict

    def generate_prediction_model(self):
        '''Train one MF model per tree level on the pseudo-users.

        For every level in self.lr_bound, each non-empty node becomes a
        pseudo-user whose ratings are the node's per-item averages; the MF
        profiles for that level are stored in self.user_profile and
        self.item_profile.
        '''
        for level in self.lr_bound:
            self.user_profile.setdefault(level)
            train_lst = []
            for pseudo_user_bound, userid in zip(
                    self.lr_bound[level], range(len(self.lr_bound[level]))):
                if pseudo_user_bound[0] > pseudo_user_bound[1]:
                    continue  # empty node
                pseudo_user_lst = self.tree[pseudo_user_bound[0]:(
                    pseudo_user_bound[1] + 1)]
                pseudo_user_for_item = self.calculate_avg_rating_for_pesudo_user(
                    pseudo_user_lst)
                train_lst += [(userid, int(key), float(value))
                              for key, value in pseudo_user_for_item.items()]
            self.user_profile[level], self.item_profile[
                level] = self.MF.matrix_factorization(train_lst)

    def build_model(self):
        '''Construct the tree, then train the per-level prediction models.'''
        self.generate_decision_tree(self.lr_bound['0'][0], [])
        self.generate_prediction_model()

    def predict(self, new_user_ratings, pred_index):
        '''Predict all item ratings for a new user at a given tree node.

        new_user_ratings: list of [itemid, rating] pairs given so far; its
            LENGTH selects the tree level whose profiles are used.
        pred_index: node index of the new user within that level.

        Returns pred_rating: dict {itemid: predicted rating} via the dot
        product of the node's user profile with each item profile.
        '''
        #### Find user profile for new user ####
        new_user_profile = np.array(self.user_profile[str(
            len(new_user_ratings))][pred_index])  # shape: (k,)
        new_item_profile = np.array(
            list(self.item_profile[str(
                len(new_user_ratings))].values()))  # shape: (I, k)
        #### Calculate predict rating ####
        pred_rating = {itemid: np.dot(new_item_profile[i], new_user_profile) \
            for itemid, i in zip(self.item_profile[str(len(new_user_ratings))],
                                 range(new_item_profile.shape[0]))}
        return pred_rating