# Command-line arguments (argv[1]/argv[2] are consumed earlier in this file:
# finput_iu_rating_matrix_train / finput_iu_rating_matrix_test).
finput_title_sim_matrix = sys.argv[3]
finput_description_sim_matrix = sys.argv[4]
finput_user_cluster_set = sys.argv[5]
finput_train_item_id = sys.argv[6]
finput_test_item_id = sys.argv[7]
finput_nonlinreg = sys.argv[8]
finput_init_tp = float(sys.argv[9])   # initial title parameter for the nonlinear fit
finput_init_dp = float(sys.argv[10])  # initial description parameter for the nonlinear fit
foutput_iuclst_rating_matrix_train = sys.argv[11]
foutput_iuclst_rating_matrix_test = sys.argv[12]
foutput_item_sim_matrix = sys.argv[13]

# load data
iu_rating_matrix_train = scipy.sparse.load_npz(finput_iu_rating_matrix_train)
iu_rating_matrix_test = scipy.sparse.load_npz(finput_iu_rating_matrix_test)
title_sim_matrix = rw.readffile(finput_title_sim_matrix)
description_sim_matrix = rw.readffile(finput_description_sim_matrix)
user_cluster_set = rw.readffile(finput_user_cluster_set)
train_item_id = rw.readffile(finput_train_item_id)
test_item_id = rw.readffile(finput_test_item_id)

# run matlab script and get parameters for title and description
print("call matlab script....")
cur_path = os.getcwd()
# FIX: raw string for the Windows path — "\G", "\D", "\S" are invalid escape
# sequences (deprecated since Python 3.6, a future SyntaxError). The raw
# string produces the exact same characters as before.
# NOTE(review): hard-coded machine-specific path — consider making it an argument.
os.chdir(r"D:\GitCode\Dissertation\Step1-Preprocessing")
eng = matlab.engine.start_matlab()
# my_fitnlm returns (theta1, theta2, RMSE) for the blended similarity model
x = eng.my_fitnlm(finput_nonlinreg, finput_init_tp, finput_init_dp, nargout=3)
theta1, theta2, RMSE = x[0], x[1], x[2]
eng.quit()
# blended item similarity: weighted sum of title and description similarity
sim_matrix = theta1*title_sim_matrix + theta2*description_sim_matrix
os.chdir(cur_path)
''' if (__name__ == '__main__'): finput_iu_sparse_matrix_train = sys.argv[1] finput_iu_sparse_matrix_test = sys.argv[2] finput_iuclst_rating_matrix_train = sys.argv[3] finput_iuclst_rating_matrix_test = sys.argv[4] finput_user_cluster_set = sys.argv[5] finput_desired_depth = int(sys.argv[6]) # read into data for tree construction iu_sparse_matrix_train = scipy.sparse.load_npz( finput_iu_sparse_matrix_train) iu_sparse_matrix_test = scipy.sparse.load_npz(finput_iu_sparse_matrix_test) iuclst_rating_matrix_train = rw.readffile( finput_iuclst_rating_matrix_train) iuclst_rating_matrix_test = rw.readffile(finput_iuclst_rating_matrix_test) user_cluster_set = rw.readffile(finput_user_cluster_set) # build tree dt_model = DecisionTree(iu_sparse_matrix_train, iu_sparse_matrix_test, iuclst_rating_matrix_train, iuclst_rating_matrix_test, user_cluster_set, finput_desired_depth) dt_model.buildTreeModel() print("\ntree construction finished") # build prediction model dt_model.buildPredModel() print("prediction model finished") # predict # dt_model.predict()
import scipy.sparse
from MatrixFactorization import MatrixFactorization

if __name__ == '__main__':
    finput_dataset = sys.argv[1]
    finput_K = int(sys.argv[2])

    # Dataset-relative file locations.
    data_dir = "../../Data/" + finput_dataset
    iu_matrix_train_path = data_dir + "/iu_sparse_matrix_train.npz"
    iu_matrix_test_path = data_dir + "/iu_sparse_matrix_test.npz"
    train_item_id_path = data_dir + "/train_item_id"
    test_item_id_path = data_dir + "/test_item_id"
    item_sim_matrix_path = data_dir + "/item_sim_matrix"

    # pass
    # Transpose item-user matrices into user-item orientation.
    ui_matrix_train = scipy.sparse.load_npz(iu_matrix_train_path).T
    ui_matrix_test = scipy.sparse.load_npz(iu_matrix_test_path).T
    # Pad the train matrix with zero columns for the (not yet rated) test items.
    ui_matrix = scipy.sparse.csr_matrix(
        np.hstack((ui_matrix_train.toarray(), np.zeros(ui_matrix_test.shape))))
    train_item_id = rw.readffile(train_item_id_path)
    test_item_id = rw.readffile(test_item_id_path)
    item_sim_matrix = rw.readffile(item_sim_matrix_path)

    # Computing Score for user (Score = [user number, new item number]):
    # similarity-weighted rating average, normalised by the summed similarity
    # of the items each user actually rated.
    sim_block = item_sim_matrix.loc[train_item_id, test_item_id]
    Score = (ui_matrix_train * sim_block) / ((ui_matrix_train != 0) * sim_block)

    # Active Learning
    train_item_num = len(train_item_id)
    ui_matrix = ui_matrix.tolil()
    ui_matrix_test = ui_matrix_test.tolil()
    for i in range(len(test_item_id)):
        # Users ranked by descending score for test item i.
        ind = np.argsort(-Score[:, i])
        if finput_K < ind.shape[0]:
            # NOTE(review): this slice keeps K+1 users, not K — looks like an
            # off-by-one; confirm against the (unseen) else branch before changing.
            topK = ind[:(finput_K+1)]
foutput_train_item_id = "Data/train_item_id" foutput_test_item_id = "Data/test_item_id" ''' if (__name__ == '__main__'): #### data path finput = sys.argv[1] finput_item = sys.argv[2] foutput1 = sys.argv[3] foutput2 = sys.argv[4] foutput_uid = sys.argv[5] foutput_train_item_id = sys.argv[6] foutput_test_item_id = sys.argv[7] # read into item id whose title and description is not null dict_item_id = rw.readffile(finput_item) # read into review file and select item df = rdf.getDF(finput) df = df.loc[df['asin'].isin(dict_item_id)] # split item into train and test itemid = list(df['asin'].unique()) train_item_id = random.sample(itemid, int(0.75 * len(itemid))) test_item_id = [ele for ele in itemid if ele not in train_item_id] print("train: %d/%d, test: %d/%d" % (len(train_item_id), len(itemid), len(test_item_id), len(itemid))) df_train = df.loc[df['asin'].isin(train_item_id)] df_test = df.loc[df['asin'].isin(test_item_id)] # set user set as those who rate at least one item in the training set
from scipy.sparse import csr_matrix, find
import read_write as rw
'''
finput_uid = "Data/uid"
finput_rating_matrix_train = "Data/iu_sparse_matrix_train.npz"
foutput_user_similarity = "Data/user_similarity_matrix"
'''
if __name__ == '__main__':
    #### data path
    finput_uid = sys.argv[1]
    finput_rating_matrix_train = sys.argv[2]
    foutput_user_similarity = sys.argv[3]

    # read into user id information and train rating matrix
    uid = rw.readffile(finput_uid)
    rating_matrix_train = scipy.sparse.load_npz(finput_rating_matrix_train).toarray()

    # generate user similarity: subtract each user's mean over their observed
    # ratings, then zero out the unobserved entries again.
    # NOTE(review): a user column with no ratings divides by zero here — TODO confirm
    # upstream guarantees every user rated at least one training item.
    observed = rating_matrix_train != 0
    user_mean = np.sum(rating_matrix_train, axis=0) / np.sum(observed, axis=0)
    rating_matrix_train = (rating_matrix_train - user_mean) * observed
    rating_matrix_train_2 = rating_matrix_train**2
    # user_similarity_matrix = np.dot(rating_matrix_train.T, rating_matrix_train) / (np.dot(rating_matrix_train_2.T, rating_matrix_train_2)**0.5 + 1e-9)
    row_num, col_num = rating_matrix_train.shape
    user_similarity_matrix = np.zeros((col_num, col_num))
    # numerator of the cosine-style similarity, computed in one matrix product
    nominatorM = np.dot(rating_matrix_train.T, rating_matrix_train)
    print("nominator done!")
    cnt = 0
foutput_title_similarity = "Data/title_similarity_matrix" foutput_description_similarity = "Data/description_similarity_matrix" ''' if (__name__ == '__main__'): #### data path finput_topic_num = int(sys.argv[1]) finput_title = sys.argv[2] finput_description = sys.argv[3] finput_train_item_id = sys.argv[4] finput_test_item_id = sys.argv[5] foutput_title_similarity = sys.argv[6] foutput_description_similarity = sys.argv[7] #### read into item title and description information (dict: {id : content}) dict_title = rw.readffile(finput_title) dict_description = rw.readffile(finput_description) train_item_id = rw.readffile(finput_train_item_id) test_item_id = rw.readffile(finput_test_item_id) #### preprocess before LDA dict_title_preprocessed = lda.texts_preprocess(dict_title) dict_description_preprocessed = lda.texts_preprocess(dict_description) list_title_preprocessed = list(dict_title_preprocessed.values()) list_description_preprocessed = list( dict_description_preprocessed.values()) print("text preprocessed done!") #### generate item title and description similarity for selected items item_tt_id_lst = list(train_item_id.keys()) + list(test_item_id.keys()) item_total_id_lst = list(dict_title.keys())
finput_rating_matrix_test = "Data/iu_sparse_matrix_test.npz" foutput = "Data/nonlinreg.mat" ''' if (__name__ == '__main__'): #### data path finput_title = sys.argv[1] finput_description = sys.argv[2] finput_train_id = sys.argv[3] finput_test_id = sys.argv[4] finput_rating_matrix_train = sys.argv[5] finput_rating_matrix_test = sys.argv[6] foutput = sys.argv[7] # read into similarity file and train/test item id matrix_title = rw.readffile(finput_title) matrix_description = rw.readffile(finput_description) train_id = rw.readffile(finput_train_id) test_id = rw.readffile(finput_test_id) # combine these items and select corresponding matrix item_id = list(train_id.keys()) + list(test_id.keys()) matrix_title = matrix_title.loc[item_id, item_id] matrix_description = matrix_description.loc[item_id, item_id] # read into train/test rating sparse matrix and combine them up rating_matrix_train = scipy.sparse.load_npz(finput_rating_matrix_train) rating_matrix_test = scipy.sparse.load_npz(finput_rating_matrix_test) rating_matrix = scipy.sparse.csr_matrix( np.vstack( (rating_matrix_train.toarray(), rating_matrix_test.toarray())))
import read_write as rw
import k_medoids as km
'''
finput_user_similarity = "Data/user_similarity_matrix"
finput_cluster_number = 200
foutput_user_cluster_set = "Data/user_cluster_set"
'''
if __name__ == '__main__':
    # data path
    finput_user_similarity = sys.argv[1]
    finput_cluster_number = int(sys.argv[2])
    foutput_user_cluster_set = sys.argv[3]

    # read into user similarity matrix
    user_similarity_matrix = rw.readffile(finput_user_similarity)

    # k-medoids clustering of users by similarity
    user_cluster_set = km.k_medoids(user_similarity_matrix.values,
                                    K=finput_cluster_number,
                                    max_iterations=20)
    print("\ndone!")
    rw.write2file(user_cluster_set, foutput_user_cluster_set)
    print("file saved done!")

    # report cluster sizes, largest first
    print("top 20% of user cluster:")
    # FIX(idiom): replace manual append loop + in-place sort with a single
    # sorted() over a generator — same descending size list as before.
    length = sorted((len(lst) for lst in user_cluster_set), reverse=True)