# 示例#1 (Example #1)
# 0
def active_learning(simClass, start, end, simItem_k, topUser_k, itemsFrame,
                    ratingsFrame, users_rating):
    """Move the most informative test ratings back into training, then factorize.

    For each new item with id in [start, end): find its simItem_k most similar
    items via simClass, score the users who rated those neighbours, and move
    the ratings of the topUser_k best-scored users for that item from the test
    split into the training split.  Finally runs matrix factorization on the
    resulting split and returns its result.

    NOTE(review): assumes ratingsFrame rows are ordered by itemID so the test
    interval is one contiguous run, and that at least one rating with
    itemID == start exists — otherwise ``test_start`` is never bound and the
    slice below raises.  Also assumes every item yields >= topUser_k scored
    users.  TODO confirm with callers.
    """
    #### Split training set and test set ####
    ratings_train = ratingsFrame.to_records(index=False).tolist()
    ratings_test = []
    for ratingNum in range(ratingsFrame.shape[0]):
        if (ratings_train[ratingNum][1] == start):
            test_start = ratingNum
            # collect the contiguous run of ratings whose itemID <= end
            while (ratingNum in range(len(ratings_train))
                   and ratings_train[ratingNum][1] <= end):
                ratings_test.append(ratings_train[ratingNum])
                ratingNum += 1
            test_end = ratingNum
            break
    # everything outside [test_start, test_end) stays in the training set
    ratings_train = ratings_train[0:test_start] + ratings_train[test_end:]
    print("pre-split DONE")

    #### find k similar items for each new item ####
    print("test interval: [" + str(start) + ":" + str(end) + "]")
    sims = simClass.generate_topk_item_similarity(
        range(start, end), simItem_k, itemsFrame.loc[:, "asin"].tolist())
    print("Sims calculation DONE")

    #### Calculate probability for each new item ####
    # ratings_set and ratings_origin share the same row order, so an index
    # found in ratings_set addresses the same full record in ratings_origin
    ratings_set = ratingsFrame.loc[:, ["userID", "itemID"]].to_records(
        index=False).tolist()
    ratings_origin = ratingsFrame.to_records(index=False).tolist()
    item_groups = ratingsFrame.groupby("itemID")

    for item in sims:
        # first, find the set of users who have rated items within sim set.
        users_rated_sims = []
        for item_sim in sims[item]:
            users_rated_sims += item_groups.get_group(
                item_sim)["userID"].tolist()
        users_rated_sims = list(set(users_rated_sims))

        # score the candidate users and rank them best-first
        user_score = Score(users_rated_sims, users_rating, sims[item])
        user_score = sorted(user_score.items(),
                            key=lambda d: d[1],
                            reverse=True)

        # move each top user's rating of this item (when such a rating
        # exists) from the test split back into the training split
        for top in range(topUser_k):
            t = (user_score[top][0], item)
            if t in ratings_set:
                index = ratings_set.index(t)
                ratings_test.remove(ratings_origin[index])
                ratings_train.append(ratings_origin[index])
    print("active learning ALL DONE")

    ##### Calculate RMSE for each iteration2 #####
    mf = MatrixFactorization()
    return mf.matrix_factorization(ratings_train, ratings_test)
def run_matrix_factorization():
    """Train a Matrix Factorization model on the ratings2 dataset."""
    df = pd.read_csv('../data/ratings2.csv', sep='\t')
    # users are identified by buyer_id; item count is taken from the id space
    n_users = len(df.buyer_id.unique())
    n_items = df.product_id.max() + 1
    train_set, val_set = train_test_split(df)
    mf_model = MatrixFactorization(train_set,
                                   n_users,
                                   n_items,
                                   num_latent_factors=20)
    mf_model.train(max_iter=20,
                   learning_rate=0.01,
                   regularize=0.5,
                   val=val_set,
                   lr_scheduler=True)
# 示例#3 (Example #3)
# 0
def test_matrix_factorization():
    """Smoke-test mf.run on a small hand-written rating matrix.

    Ends with ``assert False`` so pytest always shows the printed output.
    """
    print('mf test.............')
    since = time.time()  # start timestamp (unused, kept for parity)

    # number of latent factors
    num_factors = 2
    # 5 x 4 rating matrix; zeros mark unobserved entries
    ratings = np.array([
        [5, 3, 0, 1],
        [4, 0, 0, 1],
        [1, 1, 0, 5],
        [1, 0, 0, 4],
        [0, 1, 5, 4],
    ])
    n_users, n_items = ratings.shape
    # user matrix P: n_users x K
    user_factors = np.random.rand(n_users, num_factors)
    # item matrix Q: n_items x K
    item_factors = np.random.rand(n_items, num_factors)

    new_user_factors, new_item_factors = mf.run(ratings,
                                                user_factors,
                                                item_factors,
                                                num_factors,
                                                steps=5000,
                                                alpha=0.0002,
                                                beta=0.2,
                                                threshold=0.001)
    evaluate(user_factors, item_factors, ratings, new_user_factors,
             new_item_factors)
    assert False
# 示例#4 (Example #4)
# 0
def test_movielens(load_dataset):
    """Load the MovieLens fixture, then smoke-test mf.run on a small matrix.

    NOTE(review): the dense rating matrix R assembled from df_rating below is
    discarded — it is overwritten by a hard-coded 5x4 matrix before mf.run is
    called, so the MovieLens data only exercises the loading/printing path.
    Ends with ``assert False`` so pytest always shows the printed output.
    """
    print('mf test.............')

    # Load dataset
    df_rating = load_dataset
    # print(df_rating.head())

    # ids are 1-based, so the max id gives the matrix dimensions
    m = max(np.unique(df_rating.userId))
    n = max(np.unique(df_rating.movieId))
    # m = len(np.unique(df_rating.userId))
    # n = len(np.unique(df_rating.movieId))

    print('m x n: {} x {}'.format(m, n))

    # dense user x movie rating matrix
    R = np.zeros([m, n])

    print(type(df_rating.userId[0]))
    print(type(df_rating.movieId[0]))
    print(R[df_rating.userId[0]][df_rating.movieId[0]])
    print(np.unique(df_rating.rating))
    print(sum(df_rating.rating.isnull()))

    # fill R; -1 converts the 1-based ids to 0-based indices
    for i, s in df_rating.iterrows():
        if s.rating == 0: continue

        u_id = int(s.userId)
        m_id = int(s.movieId)

        R[u_id - 1][m_id - 1] = s.rating

    print(R)

    # number of latent factors
    K = 2
    # NOTE(review): from here on R is replaced by a hard-coded toy matrix —
    # presumably leftover debugging; confirm intent before relying on this test
    m, n = df_rating[['userId', 'movieId']].shape
    print(m, n)
    R = np.array([[5, 3, 0, 1], [4, 0, 0, 1], [1, 1, 0, 5], [1, 0, 0, 4],
                  [0, 1, 5, 4]])
    m, n = R.shape
    # m x K user matrix P
    P = np.random.rand(m, K)
    # n x K item matrix Q
    Q = np.random.rand(n, K)

    P_new, Q_new = mf.run(R,
                          P,
                          Q,
                          K,
                          steps=5000,
                          alpha=0.0002,
                          beta=0.2,
                          threshold=0.001)
    evaluate(P, Q, R, P_new, Q_new)

    assert False
# 示例#5 (Example #5)
# 0
def generate_prediction_model(lr_bound, tree, rI, sMatrix, plambda_candidates,
                              validation_set):
    '''Train one MF model per tree level and keep the best lambda per level.

        lr_bound: dict {
                level 0: [[left_bound, right_bound]], users' bound for one level, each ele in dictionary represents one node
                level 1: [[left_bound, right_bound], [left_bound, right_bound], [left_bound, right_bound]], 3
                level 2: ..., 9
            } (bound means index)
        plambda_candidates: {
            level 0: [clambda1, clambda2, clambda3, ...]
            level 1: [clambda1, clambda2, clambda3, ...]
            level 2: [clambda1, clambda2, clambda3, ...]
        }
        Returns prediction_model: dict {
                level: { 'plambda': best lambda, 'upro': user_profile, 'ipro': item_profile }
            }

    BUG FIX(review): the lambda selection previously used ``min_RMSE is -1``,
    an identity comparison against an int literal — implementation-dependent
    and a SyntaxWarning on CPython >= 3.8.  A ``None`` sentinel is used now.
    '''
    MF = MatrixFactorization()
    prediction_model = {}
    # row/column coordinates of the non-zero validation ratings
    val_item_list = find(validation_set)[0]
    val_user_list = find(validation_set)[1]
    user_node_ind = np.zeros(
        sMatrix.shape[1])  #### notice that index is not id

    for level in lr_bound:
        prediction_model.setdefault(level, {})
        train_lst = []
        # userid here is the pseudo-user (node) index at this level
        for userid, pseudo_user_bound in enumerate(lr_bound[level]):
            if pseudo_user_bound[0] > pseudo_user_bound[1]:
                continue  # empty node
            pseudo_user_lst = tree[pseudo_user_bound[0]:(pseudo_user_bound[1] +
                                                         1)]
            pseudo_user_for_item = calculate_avg_rating_for_pesudo_user(
                pseudo_user_lst, sMatrix)
            # one (pseudo_user, item, avg_rating) triple per rated item
            train_lst += [(userid, int(itemid),
                           float(pseudo_user_for_item[itemid]))
                          for itemid in range(pseudo_user_for_item.shape[0])
                          if pseudo_user_for_item[itemid]]
            #### find node index for each validation user ####
            user_node_ind[pseudo_user_lst] = userid

        #### Train MF and do validation ####
        min_RMSE = None  # None sentinel instead of the fragile `is -1` check
        min_user_profile = min_item_profile = min_lambda = None
        for plambda in plambda_candidates[level]:
            MF.change_parameter(plambda)
            user_profile, item_profile = MF.matrix_factorization(train_lst)
            RMSE = pred_RMSE_for_validate_user(user_node_ind, user_profile,
                                               item_profile, val_user_list,
                                               val_item_list, sMatrix)
            if min_RMSE is None or RMSE < min_RMSE:
                min_RMSE = RMSE
                min_user_profile, min_item_profile, min_lambda = \
                    user_profile, item_profile, plambda
        prediction_model[level]['upro'] = min_user_profile
        prediction_model[level]['ipro'] = min_item_profile
        prediction_model[level]['plambda'] = min_lambda
    MF.end()  #### close MF spark session
    return prediction_model
# 示例#6 (Example #6)
# 0
def main():
    """
    main function for test
    """
    # Command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", default="ml-100k/")
    parser.add_argument("--base", default="u1.base")
    parser.add_argument("--test", default="u1.test")
    parser.add_argument("--a", default=0.01, type=float)
    parser.add_argument("--b", default=0.2, type=float)
    parser.add_argument("--K", default=50, type=int)
    parser.add_argument("--tol", default=3.e-2, type=float)
    args = parser.parse_args()

    # Rating matrices for training and testing
    R, R_test, user_size, item_size = get_rating(data_dir=args.data_dir,
                                                 base=args.base,
                                                 test=args.test,
                                                 debug=True)

    # Train the model
    model = MatrixFactorization(R=R,
                                R_test=R_test,
                                user_size=user_size,
                                item_size=item_size,
                                K=args.K,
                                a=args.a,
                                b=args.b)
    print("training...")
    model.train(tol=args.tol, debug=True)

    print("The number of test data is {}.".format(R_test.shape[0]))

    # Count test entries whose rounded prediction matches the true rating;
    # ids in R_test are 1-based, hence the -1.
    icor = sum(
        1 for r in R_test
        if np.round(model.rating(iu=int(r[0]) - 1, ii=int(r[1]) - 1)) == r[2])

    print("The number of correct predictions is {}.".format(icor))
# 示例#7 (Example #7)
# 0
    def __init__(self, data_path=None):
        """Build the context-data lookup from a trained MF model's item factors."""
        factorizer = MatrixFactorization(data_path)

        # TODO: Correct?  Keys are the string form of each row's position in Q.
        self._context_data = {
            str(idx): row for idx, row in enumerate(factorizer.Q)
        }
#!/usr/bin/env python3
from matrix_factorization import MatrixFactorization

DATA = {
    "DATA_DIR": "./ml-latest-small",
    "RATING_FILE": "ratings.csv",
    "MOVIE_FILE": "movies.csv",
    "TAG_FILE": "tags.csv"
}

if __name__ == "__main__":
    mf_cf = MatrixFactorization(DATA)
    mf_cf.train()
    mf_cf.predict_single(1, 110)

    # load MovieLens data
    num_user, num_item, ratings = load_ml_1m()
    np.random.shuffle(ratings)

    # number of latent features
    num_feature = 10

    # maximum training iterations
    max_iter = 20

    # split data to training & testing (90/10)
    train_pct = 0.9
    train_size = int(train_pct * len(ratings))
    train = ratings[:train_size]
    validation = ratings[train_size:]

    # models
    rec = MatrixFactorization(num_user, num_item, num_feature, train,
                              validation, max_rating=5, min_rating=1)

    # fitting
    rec.estimate(max_iter)

    # results
    train_preds = rec.predict(train)
    train_rmse = RMSE(train_preds, np.float16(train[:, 2]))
    validation_preds = rec.predict(validation)
    validation_rmse = RMSE(validation_preds, np.float16(validation[:, 2]))

    # BUG FIX: this was a Python 2 `print` statement, a SyntaxError under
    # the file's python3 shebang; output is byte-identical.
    print("train RMSE: %.6f, validation RMSE: %.6f " %
          (train_rmse, validation_rmse))
# 示例#10 (Example #10)
# 0
# 80/20 random split of the preprocessed data
data = proprocess()
num_data = data.shape[0]
num_training = math.ceil(num_data * 0.8)
idx = np.arange(num_data)
np.random.shuffle(idx)
training = data[idx[:num_training], :]
testing = data[idx[num_training:], :]

# Deep matrix factorization: keep the best test score over NUM_TEST runs
best_score = math.inf
for _ in range(NUM_TEST):
    dmf = DeepMatrixFactorization(NUM_USER, NUM_MOVIE, EMBEDDING_DIM,
                                  HIDDEN_DIM)
    dmf.learn(training[:, 0], training[:, 1], training[:, 2], 2000, 500)
    score = dmf.test(testing[:, 0], testing[:, 1], testing[:, 2])
    best_score = min(best_score, score)

print("Best score for deep matrix factorization is %f" % best_score)

# Plain matrix factorization: same protocol, same data split
best_score = math.inf
for _ in range(NUM_TEST):
    mf = MatrixFactorization(NUM_USER, NUM_MOVIE, EMBEDDING_DIM)
    mf.learn(training[:, 0], training[:, 1], training[:, 2], 2000, 500)
    score = mf.test(testing[:, 0], testing[:, 1], testing[:, 2])
    best_score = min(best_score, score)
print("Best score for matrix factorization is %f" % best_score)
    def __init__(self, sMatrix, depth_threshold=6, plambda=7, MSP_item=200):
        '''Build the decision-tree bootstrapping state from a sparse rating matrix.

            sMatrix: I*U sparse matrix (items x users)
            depth_threshold: terminate depth
            plambda: regularization parameter
            MSP_item: number of most-popular items considered per split

            Attributes built here:
            self.rI: list of all item indices that have at least one rating
                     (NOTE(review): earlier docs described this as a dict of
                     [userid, rating] lists; the code keeps only the ids)
            self.rU: dict {
                        userid1: { itemid11: rating11, itemid12: rating12, ... } rating of user 1
                        userid2: { itemid21: rating21, itemid22: rating22, ... } rating of user 2
                        ...
                     }
            self.lr_bound: dict {
                        level '0': [[left_bound, right_bound]]  one [l, r] index
                        pair into self.tree per node of that level (3 nodes at
                        level 1, 9 at level 2, ...); bound means index
                     }
            self.tree: flat list of all userids; each node owns a contiguous slice
            self.split_item: per-level list of the items chosen for splitting
            self.sum_cur_t / self.sum_2_cur_t: per-item sums of bias-adjusted
                        ratings and their squares for the current node (numpy arrays)
            self.sum_cntt: per-item rating counts for the current node
            self.biasU: per-user rating bias, lambda-smoothed toward the global mean
            self.user_profile / self.item_profile: per-level latent profiles,
                        filled in later by the prediction-model step
        '''
        self.depth_threshold = depth_threshold
        self.plambda = plambda
        self.cur_depth = 0
        self.MSP_item = MSP_item
        self.real_item_num = sMatrix.shape[0]
        # find() yields the (item, user) coordinates of all stored ratings
        x = find(sMatrix)
        itemset = x[0]
        userset = x[1]
        self.rU = {}
        # placeholder dicts; both are replaced by numpy arrays further down
        self.sum_cur_t = {}
        self.sum_2_cur_t = {}
        self.global_mean = 0  # global average of ratings

        #### Calculate rate of progress ####
        # ternary tree: 3**i nodes at depth i
        self.node_num = 0
        self.cur_node = 0
        for i in range(self.depth_threshold):
            self.node_num += 3**i

        #### Generate rI, rU ####
        self.rI = list(set(sMatrix.nonzero()[0]))
        for itemid, userid in zip(itemset, userset):
            self.rU.setdefault(userid, {})[itemid] = sMatrix[itemid, userid]
            self.global_mean += sMatrix[itemid, userid]
        self.global_mean /= len(itemset)
        self.item_size = len(self.rI)
        self.user_size = len(self.rU)

        #### Initiate Tree, lr_bound ####
        self.tree = list(self.rU.keys())
        self.split_item = []
        self.lr_bound = {'0': [[0, len(self.tree) - 1]]}

        #### Generate bias, sum_cur_t, sum_2_cur_t ####
        self.biasU = {}
        self.sum_cur_t = np.zeros(self.real_item_num)
        self.sum_2_cur_t = np.zeros(self.real_item_num)
        self.sum_cntt = np.zeros(self.real_item_num)
        for userid in self.rU:
            # lambda-smoothed user bias: shrinks toward the global mean for
            # users with few ratings
            self.biasU[userid] = (sum(list(self.rU[userid].values())) +
                                  self.plambda * self.global_mean) / (
                                      self.plambda + len(self.rU[userid]))
            user_all_rating_id = np.array(list(self.rU[userid].keys()))
            user_all_rating = np.array(list(self.rU[userid].values()))
            # vectorised accumulation of bias-adjusted sums, squared sums and
            # counts over this user's rated items (fancy indexing)
            self.sum_cur_t[
                user_all_rating_id[:]] += user_all_rating[:] - self.biasU[
                    userid]
            self.sum_2_cur_t[user_all_rating_id[:]] += (user_all_rating[:] -
                                                        self.biasU[userid])**2
            self.sum_cntt[user_all_rating_id[:]] += 1

        #### Prediction Model ####
        self.user_profile = {}
        self.item_profile = {}
        self.MF = MatrixFactorization()

        print("Initiation DONE!")
class DecisionTreeModel:
    def __init__(self, sMatrix, depth_threshold=6, plambda=7, MSP_item=200):
        '''Build the decision-tree bootstrapping state from a sparse rating matrix.

            sMatrix: I*U sparse matrix (items x users)
            depth_threshold: terminate depth
            plambda: regularization parameter
            MSP_item: number of most-popular items considered per split

            Attributes built here:
            self.rI: list of all item indices that have at least one rating
                     (NOTE(review): earlier docs described this as a dict of
                     [userid, rating] lists; the code keeps only the ids)
            self.rU: dict {
                        userid1: { itemid11: rating11, itemid12: rating12, ... } rating of user 1
                        userid2: { itemid21: rating21, itemid22: rating22, ... } rating of user 2
                        ...
                     }
            self.lr_bound: dict {
                        level '0': [[left_bound, right_bound]]  one [l, r] index
                        pair into self.tree per node of that level (3 nodes at
                        level 1, 9 at level 2, ...); bound means index
                     }
            self.tree: flat list of all userids; each node owns a contiguous slice
            self.split_item: per-level list of the items chosen for splitting
            self.sum_cur_t / self.sum_2_cur_t: per-item sums of bias-adjusted
                        ratings and their squares for the current node (numpy arrays)
            self.sum_cntt: per-item rating counts for the current node
            self.biasU: per-user rating bias, lambda-smoothed toward the global mean
            self.user_profile / self.item_profile: per-level latent profiles,
                        filled in later by the prediction-model step
        '''
        self.depth_threshold = depth_threshold
        self.plambda = plambda
        self.cur_depth = 0
        self.MSP_item = MSP_item
        self.real_item_num = sMatrix.shape[0]
        # find() yields the (item, user) coordinates of all stored ratings
        x = find(sMatrix)
        itemset = x[0]
        userset = x[1]
        self.rU = {}
        # placeholder dicts; both are replaced by numpy arrays further down
        self.sum_cur_t = {}
        self.sum_2_cur_t = {}
        self.global_mean = 0  # global average of ratings

        #### Calculate rate of progress ####
        # ternary tree: 3**i nodes at depth i
        self.node_num = 0
        self.cur_node = 0
        for i in range(self.depth_threshold):
            self.node_num += 3**i

        #### Generate rI, rU ####
        self.rI = list(set(sMatrix.nonzero()[0]))
        for itemid, userid in zip(itemset, userset):
            self.rU.setdefault(userid, {})[itemid] = sMatrix[itemid, userid]
            self.global_mean += sMatrix[itemid, userid]
        self.global_mean /= len(itemset)
        self.item_size = len(self.rI)
        self.user_size = len(self.rU)

        #### Initiate Tree, lr_bound ####
        self.tree = list(self.rU.keys())
        self.split_item = []
        self.lr_bound = {'0': [[0, len(self.tree) - 1]]}

        #### Generate bias, sum_cur_t, sum_2_cur_t ####
        self.biasU = {}
        self.sum_cur_t = np.zeros(self.real_item_num)
        self.sum_2_cur_t = np.zeros(self.real_item_num)
        self.sum_cntt = np.zeros(self.real_item_num)
        for userid in self.rU:
            # lambda-smoothed user bias: shrinks toward the global mean for
            # users with few ratings
            self.biasU[userid] = (sum(list(self.rU[userid].values())) +
                                  self.plambda * self.global_mean) / (
                                      self.plambda + len(self.rU[userid]))
            user_all_rating_id = np.array(list(self.rU[userid].keys()))
            user_all_rating = np.array(list(self.rU[userid].values()))
            # vectorised accumulation of bias-adjusted sums, squared sums and
            # counts over this user's rated items (fancy indexing)
            self.sum_cur_t[
                user_all_rating_id[:]] += user_all_rating[:] - self.biasU[
                    userid]
            self.sum_2_cur_t[user_all_rating_id[:]] += (user_all_rating[:] -
                                                        self.biasU[userid])**2
            self.sum_cntt[user_all_rating_id[:]] += 1

        #### Prediction Model ####
        self.user_profile = {}
        self.item_profile = {}
        self.MF = MatrixFactorization()

        print("Initiation DONE!")

    def calculate_error(self, sumt, sumt_2, cntt):
        ''' Calculate error for one item-split in one node '''
        Error_i = np.sum(sumt_2 - (sumt**2) / (cntt + 1e-9))
        #         for itemid in sumtL:
        #             Error_i += sumtL_2[itemid] - (sumtL[itemid]['rating']**2)/(sumtL[itemid]['cnt']+1e-9) \
        #                         + sumtD_2[itemid] - (sumtD[itemid]['rating']**2)/(sumtD[itemid]['cnt']+1e-9) \
        #                             + sumtU_2[itemid] - (sumtU[itemid]['rating']**2)/(sumtU[itemid]['cnt']+1e-9)
        return Error_i

    def generate_decision_tree(self, lr_bound_for_node, chosen_id):
        '''
            sumtL: dict {
                itemid1: {'rating': sum of ratings for item 1, 'cnt': sum of users rated item 1}
                itemid2: {'rating': sum of ratings for item 2, 'cnt': sum of users rated item 2}
                ...
            }
            sumtL_2: dict {
                itemid1: sum of square ratings for item 1
                itemid2: sum of square ratings for item 2
                ...
            }
            lr_bound_for_node: list [leftind, rightind] for one node
        '''

        #### Terminate ####
        self.cur_depth += 1
        if self.cur_depth > self.depth_threshold or len(
                chosen_id) == self.item_size:
            return

        #### Choose Most Popular Items of This Node ####
        num_rec = np.zeros(self.real_item_num)
        for userid in self.tree[lr_bound_for_node[0]:(lr_bound_for_node[1] +
                                                      1)]:
            user_all_rating_id = np.array(list(self.rU[userid].keys()))
            num_rec[user_all_rating_id[:]] += 1
        sub_item_id = np.argsort(num_rec)[:self.MSP_item]

        #### Find optimum item to split ####
        min_sumtL, min_sumtD, min_sumtL_2, min_sumtD_2, min_sumtU, min_sumtU_2, Error = {}, {}, {}, {}, {}, {}, {}
        min_Error = "None"
        for itemid in sub_item_id:
            if itemid in chosen_id:
                continue
            ''' 
                user_rating_item_in_nodet: [ [uid01, rating01], [uid02, rating02], ... ] 
                to find all users in node t who rates item i
            '''
            user_rating_item_in_nodet = ([
                userid, self.rU[userid][itemid]
            ] for userid in self.tree[lr_bound_for_node[0]:(
                lr_bound_for_node[1] + 1)] if itemid in self.rU[userid])
            sumt = np.zeros((self.real_item_num, 3))
            sumt_2 = np.zeros((self.real_item_num, 3))
            cntt = np.zeros((self.real_item_num, 3))
            for user in user_rating_item_in_nodet:
                ''' user_all_rating: array [ [itemid11, rating11], [itemid12, rating12], ... ] '''
                user_all_rating_id = np.array(list(self.rU[user[0]].keys()))
                user_all_rating = np.array(list(self.rU[user[0]].values()))
                #### calculate sumtL for node LIKE ####
                if user[1] >= 4:
                    sumt[user_all_rating_id[:],
                         0] += user_all_rating[:] - self.biasU[user[0]]
                    sumt_2[user_all_rating_id[:],
                           0] += (user_all_rating[:] - self.biasU[user[0]])**2
                    cntt[user_all_rating_id[:], 0] += 1
                #### calculate sumtD for node DISLIKE ####
                elif user[1] <= 3:
                    sumt[user_all_rating_id[:],
                         1] += user_all_rating[:] - self.biasU[user[0]]
                    sumt_2[user_all_rating_id[:],
                           1] += (user_all_rating[:] - self.biasU[user[0]])**2
                    cntt[user_all_rating_id[:], 1] += 1
            #### calculate sumtU for node UNKNOWN ####
            sumt[:, 2] = self.sum_cur_t[:] - sumt[:, 0] - sumt[:, 1]
            sumt_2[:, 2] = self.sum_2_cur_t[:] - sumt_2[:, 0] - sumt_2[:, 1]
            cntt[:, 2] = self.sum_cntt[:] - cntt[:, 0] - cntt[:, 1]
            Error[itemid] = self.calculate_error(sumt, sumt_2, cntt)

            #             sumtL, sumtD, sumtL_2, sumtD_2, sumtU, sumtU_2 = {}, {}, {}, {}, {}, {}
            #             sumtL = {k:{'rating': 0, 'cnt': 0} for k in self.rI.keys()}
            #             sumtL_2 = sumtL_2.fromkeys(self.rI.keys(), 0)
            #             sumtD = {k:{'rating': 0, 'cnt': 0} for k in self.rI.keys()}
            #             sumtD_2 = sumtD_2.fromkeys(self.rI.keys(), 0)
            #             for user in user_rating_item_in_nodet:
            #                 ''' user_all_rating: [ [itemid11, rating11], [itemid12, rating12], ... ] '''
            #                 user_all_rating = self.rU[user[0]]
            #                 #### calculate sumtL for node LIKE ####
            #                 if user[1] >= 4:
            #                     for uritem, rating in user_all_rating.items():
            #                         sumtL[uritem]['rating'] += rating
            #                         sumtL_2[uritem] += (rating-self.biasU[user[0]])**2
            #                         sumtL[uritem]['rating'] -= self.biasU[user[0]]
            #                         sumtL[uritem]['cnt'] += 1
            #                 #### calculate sumtD for node DISLIKE ####
            #                 elif user[1] <= 3:
            #                     for uritem, rating in user_all_rating.items():
            #                         sumtD[uritem]['rating'] += rating
            #                         sumtD_2[uritem] += (rating-self.biasU[user[0]])**2
            #                         sumtD[uritem]['rating'] -= self.biasU[user[0]]
            #                         sumtD[uritem]['cnt'] += 1
            #             #### calculate sumtU for node UNKNOWN ####
            #             for iid in self.rI:
            #                 sumtU[iid] = {}
            #                 sumtU[iid]['rating'] = self.sum_cur_t[iid]['rating'] - sumtL[iid]['rating'] - sumtD[iid]['rating']
            #                 sumtU[iid]['cnt'] = self.sum_cur_t[iid]['cnt'] - sumtL[iid]['cnt'] - sumtD[iid]['cnt']
            #                 sumtU_2[iid] = self.sum_2_cur_t[iid] - sumtL_2[iid] - sumtD_2[iid]
            #             #### calculate error by (eL + eD + eU) ####
            #             Error[itemid] = self.calculate_error(sumtL, sumtL_2, sumtD, sumtD_2, sumtU, sumtU_2)
            if min_Error == "None" or Error[itemid] < min_Error:
                min_sumt = sumt
                min_sumt_2 = sumt_2
                min_cntt = cntt
                min_Error = Error[itemid]
        #### Find optimum split-item ####
        optimum_itemid = min(Error, key=Error.get)
        if len(self.split_item) == self.cur_depth - 1:
            self.split_item.append([optimum_itemid])
        else:
            self.split_item[self.cur_depth - 1].append(optimum_itemid)
        # self.split_item.setdefault(str(self.cur_depth-1), []).append(optimum_itemid)
        chosen_id.append(optimum_itemid)

        #### sort tree ####
        self.lr_bound.setdefault(str(self.cur_depth),
                                 []).append([])  # for LIKE
        self.lr_bound[str(self.cur_depth)].append([])  # for DISLIKE
        self.lr_bound[str(self.cur_depth)].append([])  # for UNKNOWN
        listU, listL, listD = [], [], []
        for userid in self.tree[lr_bound_for_node[0]:(lr_bound_for_node[1] +
                                                      1)]:
            if optimum_itemid not in self.rU[userid]:
                listU.append(userid)
            elif self.rU[userid][optimum_itemid] >= 4:
                listL.append(userid)
            elif self.rU[userid][optimum_itemid] <= 3:
                listD.append(userid)
        self.tree[lr_bound_for_node[0]:(lr_bound_for_node[1] +
                                        1)] = listL + listD + listU
        self.lr_bound[str(self.cur_depth)][-3] = [
            lr_bound_for_node[0], lr_bound_for_node[0] + len(listL) - 1
        ]  # for LIKE
        self.lr_bound[str(self.cur_depth)][-2] = [
            lr_bound_for_node[0] + len(listL),
            lr_bound_for_node[0] + len(listL) + len(listD) - 1
        ]  # for DISLIKE
        self.lr_bound[str(self.cur_depth)][-1] = [
            lr_bound_for_node[0] + len(listL) + len(listD),
            lr_bound_for_node[0] + len(listL) + len(listD) + len(listU) - 1
        ]  # for UNKNOWN

        #### Generate Subtree of Node LIKE ####
        self.sum_cur_t = min_sumt[:, 0]
        self.sum_2_cur_t = min_sumt_2[:, 0]
        self.sum_cntt = min_cntt[:, 0]
        self.generate_decision_tree(self.lr_bound[str(self.cur_depth)][-3],
                                    chosen_id[:])
        self.cur_depth -= 1

        #### Generate Subtree of Node DISLIKE ####
        self.sum_cur_t = min_sumt[:, 1]
        self.sum_2_cur_t = min_sumt_2[:, 1]
        self.sum_cntt = min_cntt[:, 1]
        self.generate_decision_tree(self.lr_bound[str(self.cur_depth)][-2],
                                    chosen_id[:])
        self.cur_depth -= 1

        #### Generate Subtree of Node UNKNOWN ####
        self.sum_cur_t = min_sumt[:, 2]
        self.sum_2_cur_t = min_sumt_2[:, 2]
        self.sum_cntt = min_cntt[:, 2]
        self.generate_decision_tree(self.lr_bound[str(self.cur_depth)][-1],
                                    chosen_id[:])
        self.cur_depth -= 1

        #### Show Rating Progress ####
        for i in range(self.cur_depth - 1):
            print("┃", end="")
        print("┏", end="")
        self.cur_node += 1
        print("Current depth: " + str(self.cur_depth) + "        %.2f%%" %
              (100 * self.cur_node / self.node_num))

    def calculate_avg_rating_for_pesudo_user(self, pseudo_user_lst):
        """Pool a group of users into one "pseudo user" and average per item.

        Args:
            pseudo_user_lst: iterable of user ids whose individual ratings
                (from ``self.rU``) are pooled together.

        Returns:
            dict mapping itemid -> mean rating given by the pooled users;
            items rated by none of them are omitted.
        """
        # Accumulators keyed by every known item id.
        totals = {itemid: 0 for itemid in self.rI}
        counts = {itemid: 0 for itemid in self.rI}
        for uid in pseudo_user_lst:
            for itemid, rating in self.rU[uid].items():
                totals[itemid] += rating
                counts[itemid] += 1
        # Keep only items with at least one rating to avoid division by zero.
        return {
            itemid: totals[itemid] / counts[itemid]
            for itemid in totals
            if counts[itemid]
        }

    def generate_prediction_model(self):
        """Fit one matrix-factorization model per tree level.

        ``self.lr_bound`` maps each level to a list of [left, right] user
        index bounds, one per node.  Each non-empty node becomes a pseudo
        user whose per-item average ratings form training triples
        ``(pseudo_user_id, itemid, rating)``.  The factorization result is
        stored in ``self.user_profile[level]`` / ``self.item_profile[level]``.
        """
        for level, bounds in self.lr_bound.items():
            self.user_profile.setdefault(level)
            training_triples = []
            for pseudo_uid, (lo, hi) in enumerate(bounds):
                # Empty node: no users were routed into this branch.
                if lo > hi:
                    continue
                members = self.tree[lo:hi + 1]
                avg_ratings = self.calculate_avg_rating_for_pesudo_user(members)
                training_triples.extend(
                    (pseudo_uid, int(itemid), float(rating))
                    for itemid, rating in avg_ratings.items())
            (self.user_profile[level],
             self.item_profile[level]) = self.MF.matrix_factorization(
                 training_triples)

    def build_model(self):
        """Train the full model: grow the decision tree starting from the
        root node's user index range, then fit the per-level prediction
        profiles via matrix factorization."""
        root_bound = self.lr_bound['0'][0]
        self.generate_decision_tree(root_bound, [])
        self.generate_prediction_model()

    def predict(self, new_user_ratings, pred_index):
        """Predict the new user's rating for every item.

        Args:
            new_user_ratings: list of [itemid, rating] pairs the new user
                has answered so far; its length selects the tree level
                whose profiles are used.
            pred_index: index of the pseudo user (tree node) the new user
                was routed to at that level.

        Returns:
            dict mapping itemid -> predicted rating (dot product of the
            node's user profile with each item's latent profile).
        """
        level = str(len(new_user_ratings))
        item_profiles = self.item_profile[level]

        # Latent factors of the matched pseudo user, shape (k,).
        user_vec = np.array(self.user_profile[level][pred_index])
        # All item factors stacked into shape (I, k); row order follows the
        # item_profile dict's insertion order, matching the key iteration.
        item_mat = np.array(list(item_profiles.values()))

        predictions = {}
        for row, itemid in enumerate(item_profiles):
            predictions[itemid] = np.dot(item_mat[row], user_vec)
        return predictions