description_sim_matrix = rw.readffile(finput_description_sim_matrix)
	user_cluster_set = rw.readffile(finput_user_cluster_set)
	train_item_id = rw.readffile(finput_train_item_id)
	test_item_id = rw.readffile(finput_test_item_id)

	# Fit the title/description weights by calling the MATLAB function
	# my_fitnlm, then combine both similarity matrices with those weights.
	print("call matlab script....")
	cur_path = os.getcwd()
	# Raw string: "\G", "\D" and "\S" are not valid escape sequences, so the
	# non-raw literal only worked by accident and triggers a warning on
	# current Python versions. The resulting path value is unchanged.
	os.chdir(r"D:\GitCode\Dissertation\Step1-Preprocessing")
	try:
		eng = matlab.engine.start_matlab()
		try:
			x = eng.my_fitnlm(finput_nonlinreg, finput_init_tp, finput_init_dp, nargout=3)
			theta1, theta2, RMSE = x[0], x[1], x[2]
		finally:
			# Shut the engine down even when the fit raises.
			eng.quit()
	finally:
		# Restore the working directory on every path so that the relative
		# output paths used below resolve as they did before the call.
		os.chdir(cur_path)
	# Weighted sum of the title and description similarity matrices.
	sim_matrix = theta1*title_sim_matrix + theta2*description_sim_matrix
	rw.write2file(sim_matrix, foutput_item_sim_matrix)
	print("theta1 = ", theta1)
	print("theta2 = ", theta2)
	print("RMSE = ", RMSE)
	print("matlab finished")

	# Slice the combined similarity matrix down to the training items and to
	# the test items; rows and columns are selected by the same id list.
	# resort_id = list(train_item_id.keys()) + list(test_item_id.keys())
	train_item_keys = list(train_item_id.keys())
	sim_matrix_train = sim_matrix.loc[train_item_keys, train_item_keys].values
	test_item_keys = list(test_item_id.keys())
	sim_matrix_test = sim_matrix.loc[test_item_keys, test_item_keys].values

	# Zero-initialised rating matrices: one row per item, one column per
	# user cluster.
	cluster_count = len(user_cluster_set)
	iuclst_rating_matrix_train = np.zeros((len(train_item_id), cluster_count))
	iuclst_rating_matrix_test = np.zeros((len(test_item_id), cluster_count))
	# Row positions 0..n-1 of the item-user rating matrices.
	item_in_node_train = list(range(iu_rating_matrix_train.shape[0]))
	item_in_node_test = list(range(iu_rating_matrix_test.shape[0]))
예제 #2
0
                        user_id_dict[row['reviewerID']]] = int(row['overall'])
        cnt += 1
    # Convert the dense training matrix to CSR, report the share of non-zero
    # cells, and store the sparse matrix on disk.
    iu_sparse_matrix_train = scipy.sparse.csr_matrix(iu_matrix_train)
    train_filled = len(find(iu_sparse_matrix_train)[0])
    train_shape = iu_sparse_matrix_train.shape
    train_cells = train_shape[0] * train_shape[1]
    print("density of iu train matrix is: %.4f%%" %
          (100 * train_filled / train_cells))
    scipy.sparse.save_npz(foutput1, iu_sparse_matrix_train)
    # test
    # Fill the dense item-user test matrix from the review rows, keeping only
    # reviewers that have an entry in user_id_dict.
    iu_matrix_test = np.zeros((test_row, col), dtype=np.int8)
    n_test_reviews = df_test.shape[0]
    cnt = 0
    for _, review in df_test.iterrows():
        print("iu test matrix: %d / %d" % (cnt, n_test_reviews), end="\r")
        reviewer = review['reviewerID']
        if reviewer in user_id_dict:
            rating = int(review['overall'])
            iu_matrix_test[test_item_id_dict[review['asin']],
                           user_id_dict[reviewer]] = rating
        cnt += 1

    # Convert to CSR, report the share of non-zero cells, and save.
    iu_sparse_matrix_test = scipy.sparse.csr_matrix(iu_matrix_test)
    test_filled = len(find(iu_sparse_matrix_test)[0])
    test_shape = iu_sparse_matrix_test.shape
    test_cells = test_shape[0] * test_shape[1]
    print("density of iu test matrix is: %.4f%%" %
          (100 * test_filled / test_cells))
    scipy.sparse.save_npz(foutput2, iu_sparse_matrix_test)
    print("iu matrix generated done!")

    # Persist the user-id, train-item-id and test-item-id dictionaries, in
    # that order, each to its own output path.
    id_outputs = (
        (user_id_dict, foutput_uid),
        (train_item_id_dict, foutput_train_item_id),
        (test_item_id_dict, foutput_test_item_id),
    )
    for id_mapping, target_path in id_outputs:
        rw.write2file(id_mapping, target_path)
    print("write done!")
예제 #3
0
import read2df as rdf
import read_write as rw
'''
Input: input path ("../Dataset/All_Beauty/meta_All_Beauty.json.gz")
	   output path ("Data/title" && "Data/description")
output: files
'''
def extract_title_description(df):
    """Build asin -> title and asin -> description maps from complete rows.

    Rows whose 'title' or 'description' is missing are dropped first.

    Args:
        df: DataFrame with 'asin', 'title' and 'description' columns.

    Returns:
        A tuple (dict_title, dict_description, subdf), where subdf is the
        filtered frame both dictionaries were built from.
    """
    # isna() is the documented missing-value test; it flags NaN (and None).
    incomplete = df['title'].isna() | df['description'].isna()
    subdf = df[~incomplete]
    # Pair the columns directly instead of doing several .loc lookups per
    # row: far cheaper, and it does not rely on the index being unique.
    dict_title = dict(zip(subdf['asin'], subdf['title']))
    dict_description = dict(zip(subdf['asin'], subdf['description']))
    return dict_title, dict_description, subdf


if (__name__ == '__main__'):
    #### data path
    finput = sys.argv[1]
    foutput_title = sys.argv[2]
    foutput_description = sys.argv[3]

    #### read data into dataframe
    df = rdf.getDF(finput)

    #### delete rows where title or description is nan
    dict_title, dict_description, subdf = extract_title_description(df)

    #### write generated dictionary into files
    rw.write2file(dict_title, foutput_title)
    rw.write2file(dict_description, foutput_description)
    print("Write Done!")
    # kept rows / total rows
    print("Info: %d/%d" % (subdf.shape[0], df.shape[0]))
예제 #4
0
    # Mean-centre every column over its non-zero entries only; cells that
    # were zero stay zero because the result is multiplied by the mask.
    # NOTE(review): a column without any non-zero entry divides by zero here
    # and ends up NaN - presumably every column has a rating; confirm.
    rating_matrix_train = (
        rating_matrix_train - np.sum(rating_matrix_train, axis=0) /
        np.sum(rating_matrix_train != 0, axis=0)) * (rating_matrix_train != 0)
    rating_matrix_train_2 = rating_matrix_train**2
    # user_similarity_matrix = np.dot(rating_matrix_train.T, rating_matrix_train) / (np.dot(rating_matrix_train_2.T, rating_matrix_train_2)**0.5 + 1e-9)
    row_num = rating_matrix_train.shape[0]
    col_num = rating_matrix_train.shape[1]
    user_similarity_matrix = np.zeros((col_num, col_num))
    # Numerator for all column pairs at once: dot products of centred columns.
    nominatorM = np.dot(rating_matrix_train.T, rating_matrix_train)
    print("nominator done!")
    # Loop-invariant mask of non-zero (centred) cells; built once here rather
    # than being recomputed from the full matrix on every iteration.
    nonzero_mask = rating_matrix_train != 0
    cnt = 0
    for i in range(col_num):
        cnt += 1
        print("progress: %d / %d" % (cnt, col_num), end="\r")
        # flag[r, j] is True where both column i and column j are non-zero
        # in row r, so each norm below only covers rows shared by the pair.
        flag = nonzero_mask[:, i].reshape(row_num, 1) * nonzero_mask
        # 1e-9 keeps the denominator away from zero for pairs with no overlap.
        user_similarity_matrix[i] = nominatorM[i] / (
            (np.dot(rating_matrix_train_2[:, i].T, flag)**0.5) *
            (np.sum(rating_matrix_train_2 * flag, axis=0)**0.5) + 1e-9)
    # or it will be 0 for some users
    # np.fill_diagonal(user_similarity_matrix, 1)
    print("\ndone!")

    # Label both axes of the similarity array with the keys of uid, drop the
    # raw array, and write the labelled frame to the output path.
    # rw.write2file(user_similarity_matrix, "Data/test")
    uid_labels = list(uid.keys())
    df_user_similarity_matrix = pd.DataFrame(
        user_similarity_matrix, index=uid_labels, columns=uid_labels)
    del user_similarity_matrix
    rw.write2file(df_user_similarity_matrix, foutput_user_similarity)
    print("file saved done!")
예제 #5
0
                               num_topics=finput_topic_num)
    # Run lda.LDA over the preprocessed descriptions; its result is used as
    # the description similarity below.
    description_similarity = lda.LDA(
        texts=list_description_preprocessed,
        index_lst=index_lst,
        num_topics=finput_topic_num,
    )
    print("lda similarity calculated done!")

    #### generate train/test item similarity matrix
    # Both frames use item_tt_id_lst as row and column labels.
    title_values = np.array(title_similarity)
    df_title_similarity_matrix = pd.DataFrame(
        title_values, index=item_tt_id_lst, columns=item_tt_id_lst)
    description_values = np.array(description_similarity)
    df_description_similarity_matrix = pd.DataFrame(
        description_values, index=item_tt_id_lst, columns=item_tt_id_lst)
    # train_item_id = rw.readffile(finput_train_item_id)
    # test_item_id = rw.readffile(finput_test_item_id)
    # #### title/train
    # df_title_similarity_matrix_train = df_title_similarity_matrix.loc[list(train_item_id.keys()), list(train_item_id.keys())]
    # #### title/test
    # df_title_similarity_matrix_test = df_title_similarity_matrix.loc[list(test_item_id.keys()), list(test_item_id.keys())]
    # #### description/train
    # df_description_similarity_matrix_train = df_description_similarity_matrix.loc[list(train_item_id.keys()), list(train_item_id.keys())]
    # #### description/test
    # df_description_similarity_matrix_test = df_description_similarity_matrix.loc[list(test_item_id.keys()), list(test_item_id.keys())]
    print("similarity matrix generated done!")

    #### write data into files
    # Title frame first, then description frame, each to its own path.
    similarity_outputs = (
        (df_title_similarity_matrix, foutput_title_similarity),
        (df_description_similarity_matrix, foutput_description_similarity),
    )
    for similarity_frame, target_path in similarity_outputs:
        rw.write2file(similarity_frame, target_path)
    print("file saved done!")
예제 #6
0
import k_medoids as km
'''
finput_user_similarity = "Data/user_similarity_matrix"
finput_cluster_number = 200
foutput_user_cluster_set = "Data/user_cluster_set"
'''

def top_cluster_sizes(clusters, fraction=0.2):
    """Return the sizes of the largest clusters, biggest first.

    Args:
        clusters: iterable of sized collections, one per cluster.
        fraction: share of the clusters to report; the resulting count is
            truncated with int().

    Returns:
        The int(number_of_clusters * fraction) largest sizes, descending.
    """
    sizes = sorted((len(members) for members in clusters), reverse=True)
    return sizes[:int(len(sizes) * fraction)]


if (__name__ == '__main__'):
    # data path
    finput_user_similarity = sys.argv[1]
    finput_cluster_number = int(sys.argv[2])
    foutput_user_cluster_set = sys.argv[3]

    # read into user similarity matrix
    user_similarity_matrix = rw.readffile(finput_user_similarity)

    # k-medoids on the raw similarity values
    user_cluster_set = km.k_medoids(user_similarity_matrix.values,
                                    K=finput_cluster_number,
                                    max_iterations=20)
    print("\ndone!")

    rw.write2file(user_cluster_set, foutput_user_cluster_set)
    print("file saved done!")

    # Report the sizes of the largest fifth of the clusters.
    print("top 20% of user cluster:")
    print(top_cluster_sizes(user_cluster_set))