finput_title_sim_matrix = sys.argv[3]
	# NOTE(review): this fragment starts mid-script; sys.argv[1]/[2] and
	# finput_iu_rating_matrix_train / _test are assigned out of view.
	finput_description_sim_matrix = sys.argv[4]
	finput_user_cluster_set = sys.argv[5]
	finput_train_item_id = sys.argv[6]
	finput_test_item_id = sys.argv[7]
	finput_nonlinreg = sys.argv[8]
	# Initial parameter guesses for the nonlinear regression (title / description).
	finput_init_tp = float(sys.argv[9])
	finput_init_dp = float(sys.argv[10])
	foutput_iuclst_rating_matrix_train = sys.argv[11]
	foutput_iuclst_rating_matrix_test = sys.argv[12]
	foutput_item_sim_matrix = sys.argv[13]

	# load data
	iu_rating_matrix_train = scipy.sparse.load_npz(finput_iu_rating_matrix_train)
	iu_rating_matrix_test = scipy.sparse.load_npz(finput_iu_rating_matrix_test)
	title_sim_matrix = rw.readffile(finput_title_sim_matrix)
	description_sim_matrix = rw.readffile(finput_description_sim_matrix)
	user_cluster_set = rw.readffile(finput_user_cluster_set)
	train_item_id = rw.readffile(finput_train_item_id)
	test_item_id = rw.readffile(finput_test_item_id)

	# run matlab script and get parameters for title and description
	print("call matlab script....")
	cur_path = os.getcwd()
	# NOTE(review): hard-coded Windows path with unescaped backslashes; works only
	# because \G, \D, \S are not recognized escapes — should be a raw string r"...".
	os.chdir("D:\GitCode\Dissertation\Step1-Preprocessing")
	eng = matlab.engine.start_matlab()
	# my_fitnlm fits the nonlinear model and returns (theta1, theta2, RMSE).
	x = eng.my_fitnlm(finput_nonlinreg, finput_init_tp, finput_init_dp, nargout=3)
	theta1, theta2, RMSE = x[0], x[1], x[2]
	eng.quit()
	# Blend the two text-based similarities with the fitted weights.
	sim_matrix = theta1*title_sim_matrix + theta2*description_sim_matrix
	os.chdir(cur_path)
示例#2
0
'''

if __name__ == '__main__':

    # Command-line arguments: train/test item-user rating matrices (.npz),
    # item-by-user-cluster rating matrices, the user cluster set, and the
    # desired depth of the decision tree.
    (arg_train_matrix, arg_test_matrix, arg_clst_train,
     arg_clst_test, arg_cluster_set) = sys.argv[1:6]
    arg_depth = int(sys.argv[6])

    # Load everything needed for tree construction.
    iu_sparse_matrix_train = scipy.sparse.load_npz(arg_train_matrix)
    iu_sparse_matrix_test = scipy.sparse.load_npz(arg_test_matrix)
    iuclst_rating_matrix_train = rw.readffile(arg_clst_train)
    iuclst_rating_matrix_test = rw.readffile(arg_clst_test)
    user_cluster_set = rw.readffile(arg_cluster_set)

    # Build the decision tree, then fit its per-node prediction models.
    dt_model = DecisionTree(iu_sparse_matrix_train, iu_sparse_matrix_test,
                            iuclst_rating_matrix_train,
                            iuclst_rating_matrix_test, user_cluster_set,
                            arg_depth)
    dt_model.buildTreeModel()
    print("\ntree construction finished")
    dt_model.buildPredModel()
    print("prediction model finished")
    # predict
    # dt_model.predict()
import scipy.sparse
from MatrixFactorization import MatrixFactorization

if (__name__ == '__main__'):
    # Dataset name selects the data directory; K is the number of users to query.
    finput_dataset = sys.argv[1]
    finput_K = (int)(sys.argv[2])
    iu_matrix_train_path = "../../Data/" + finput_dataset + "/iu_sparse_matrix_train.npz"
    iu_matrix_test_path = "../../Data/" + finput_dataset + "/iu_sparse_matrix_test.npz"
    train_item_id_path = "../../Data/" + finput_dataset + "/train_item_id"
    test_item_id_path = "../../Data/" + finput_dataset + "/test_item_id"
    item_sim_matrix_path = "../../Data/" + finput_dataset + "/item_sim_matrix"     # pass

    # Transpose item-user matrices to user-item orientation.
    ui_matrix_train = scipy.sparse.load_npz(iu_matrix_train_path).T
    ui_matrix_test = scipy.sparse.load_npz(iu_matrix_test_path).T
    # Full user-item matrix: observed train ratings padded with zero columns
    # for the (as yet unrated) test items.
    ui_matrix = scipy.sparse.csr_matrix(np.hstack((ui_matrix_train.toarray(), np.zeros(ui_matrix_test.shape))))
    train_item_id = rw.readffile(train_item_id_path)
    test_item_id = rw.readffile(test_item_id_path)
    item_sim_matrix = rw.readffile(item_sim_matrix_path)

    # Computing Score for user (Score = [user number, new item number])
    # Similarity-weighted average of each user's train ratings; denominator
    # counts only rated entries. NOTE(review): entries where a user has no
    # overlapping ratings divide by zero — presumably yields NaN/inf; confirm
    # downstream handling.
    Score = (ui_matrix_train * item_sim_matrix.loc[train_item_id, test_item_id]) / \
            ((ui_matrix_train != 0) * item_sim_matrix.loc[train_item_id, test_item_id])

    # Active Learning
    train_item_num = len(train_item_id)
    # LIL format allows cheap per-entry assignment in the loop below.
    ui_matrix = ui_matrix.tolil()
    ui_matrix_test = ui_matrix_test.tolil()
    for i in range(len(test_item_id)):
        # Users ranked by descending score for test item i.
        ind = np.argsort(-Score[:, i])
        if finput_K < ind.shape[0]:
            # NOTE(review): slice takes finput_K + 1 users despite the name
            # "topK" — off-by-one or intentional? Fragment is truncated here;
            # verify against the full script.
            topK = ind[:(finput_K+1)]
示例#4
0
	foutput_train_item_id = "Data/train_item_id"
	foutput_test_item_id = "Data/test_item_id"
'''
if (__name__ == '__main__'):

    #### data path
    # argv: raw review file, valid-item-id file, two generic outputs,
    # user-id output, and train/test item-id outputs.
    finput = sys.argv[1]
    finput_item = sys.argv[2]
    foutput1 = sys.argv[3]
    foutput2 = sys.argv[4]
    foutput_uid = sys.argv[5]
    foutput_train_item_id = sys.argv[6]
    foutput_test_item_id = sys.argv[7]

    # read into item id whose title and description is not null
    dict_item_id = rw.readffile(finput_item)

    # read into review file and keep only reviews of those valid items
    df = rdf.getDF(finput)
    df = df.loc[df['asin'].isin(dict_item_id)]

    # split items 75% train / 25% test at random (not seeded — results vary per run)
    itemid = list(df['asin'].unique())
    train_item_id = random.sample(itemid, int(0.75 * len(itemid)))
    # NOTE(review): membership test against a list is O(len(train_item_id))
    # per element; a set would be faster for large catalogs.
    test_item_id = [ele for ele in itemid if ele not in train_item_id]
    print("train: %d/%d, test: %d/%d" %
          (len(train_item_id), len(itemid), len(test_item_id), len(itemid)))

    # Partition the review rows by item split.
    df_train = df.loc[df['asin'].isin(train_item_id)]
    df_test = df.loc[df['asin'].isin(test_item_id)]
    # set user set as those who rate at least one item in the training set
示例#5
0
from scipy.sparse import csr_matrix, find
import read_write as rw
'''
finput_uid = "Data/uid"
finput_rating_matrix_train = "Data/iu_sparse_matrix_train.npz"
foutput_user_similarity = "Data/user_similarity_matrix"
'''

if (__name__ == '__main__'):
    #### data path
    finput_uid = sys.argv[1]
    finput_rating_matrix_train = sys.argv[2]
    foutput_user_similarity = sys.argv[3]

    # read into user id information and train rating matrix (densified)
    uid = rw.readffile(finput_uid)
    rating_matrix_train = scipy.sparse.load_npz(
        finput_rating_matrix_train).toarray()

    # generate user similarity:
    # mean-center each user's ratings (columns are users, axis=0 sums over
    # items) using only non-zero entries, then zero out unrated cells.
    # NOTE(review): a user with no ratings makes np.sum(... != 0, axis=0) zero,
    # so this divides by zero — presumably such users don't occur; verify.
    rating_matrix_train = (
        rating_matrix_train - np.sum(rating_matrix_train, axis=0) /
        np.sum(rating_matrix_train != 0, axis=0)) * (rating_matrix_train != 0)
    rating_matrix_train_2 = rating_matrix_train**2
    # user_similarity_matrix = np.dot(rating_matrix_train.T, rating_matrix_train) / (np.dot(rating_matrix_train_2.T, rating_matrix_train_2)**0.5 + 1e-9)
    row_num = rating_matrix_train.shape[0]
    col_num = rating_matrix_train.shape[1]
    # Pairwise user-user similarity (cosine-style numerator built here; the
    # normalization loop follows but is cut off in this fragment).
    user_similarity_matrix = np.zeros((col_num, col_num))
    nominatorM = np.dot(rating_matrix_train.T, rating_matrix_train)
    print("nominator done!")
    cnt = 0
示例#6
0
foutput_title_similarity = "Data/title_similarity_matrix"
foutput_description_similarity = "Data/description_similarity_matrix"
'''

if (__name__ == '__main__'):
    #### data path
    # Number of LDA topics plus paths for titles, descriptions, the item-id
    # splits, and the two similarity-matrix outputs.
    finput_topic_num = int(sys.argv[1])
    finput_title = sys.argv[2]
    finput_description = sys.argv[3]
    finput_train_item_id = sys.argv[4]
    finput_test_item_id = sys.argv[5]
    foutput_title_similarity = sys.argv[6]
    foutput_description_similarity = sys.argv[7]

    #### read into item title and description information (dict: {id : content})
    dict_title = rw.readffile(finput_title)
    dict_description = rw.readffile(finput_description)
    train_item_id = rw.readffile(finput_train_item_id)
    test_item_id = rw.readffile(finput_test_item_id)

    #### preprocess before LDA (tokenization/cleaning done by the lda helper module)
    dict_title_preprocessed = lda.texts_preprocess(dict_title)
    dict_description_preprocessed = lda.texts_preprocess(dict_description)
    list_title_preprocessed = list(dict_title_preprocessed.values())
    list_description_preprocessed = list(
        dict_description_preprocessed.values())
    print("text preprocessed done!")

    #### generate item title and description similarity for selected items
    # Selected (train + test) ids vs. the full corpus id list; fragment is
    # truncated after this point.
    item_tt_id_lst = list(train_item_id.keys()) + list(test_item_id.keys())
    item_total_id_lst = list(dict_title.keys())
finput_rating_matrix_test = "Data/iu_sparse_matrix_test.npz"
foutput = "Data/nonlinreg.mat"
'''
if (__name__ == '__main__'):

    #### data path
    # Title/description similarity matrices, the item-id splits, the train/test
    # rating matrices, and the output (a .mat file for the MATLAB regression).
    finput_title = sys.argv[1]
    finput_description = sys.argv[2]
    finput_train_id = sys.argv[3]
    finput_test_id = sys.argv[4]
    finput_rating_matrix_train = sys.argv[5]
    finput_rating_matrix_test = sys.argv[6]
    foutput = sys.argv[7]

    # read into similarity file and train/test item id
    matrix_title = rw.readffile(finput_title)
    matrix_description = rw.readffile(finput_description)
    train_id = rw.readffile(finput_train_id)
    test_id = rw.readffile(finput_test_id)

    # combine these items and select the corresponding sub-matrices
    # (train ids first, then test ids, so row order matches the rating matrix below)
    item_id = list(train_id.keys()) + list(test_id.keys())
    matrix_title = matrix_title.loc[item_id, item_id]
    matrix_description = matrix_description.loc[item_id, item_id]

    # read into train/test rating sparse matrix and stack them vertically
    # (items are rows; densify, stack, then re-sparsify)
    rating_matrix_train = scipy.sparse.load_npz(finput_rating_matrix_train)
    rating_matrix_test = scipy.sparse.load_npz(finput_rating_matrix_test)
    rating_matrix = scipy.sparse.csr_matrix(
        np.vstack(
            (rating_matrix_train.toarray(), rating_matrix_test.toarray())))
示例#8
0
import read_write as rw
import k_medoids as km
'''
finput_user_similarity = "Data/user_similarity_matrix"
finput_cluster_number = 200
foutput_user_cluster_set = "Data/user_cluster_set"
'''

if (__name__ == '__main__'):
    # data path
    finput_user_similarity = sys.argv[1]
    finput_cluster_number = int(sys.argv[2])
    foutput_user_cluster_set = sys.argv[3]

    # read into user similarity matrix (pandas DataFrame; .values used below)
    user_similarity_matrix = rw.readffile(finput_user_similarity)

    # k-medoids clustering of users on the similarity matrix
    user_cluster_set = km.k_medoids(user_similarity_matrix.values,
                                    K=finput_cluster_number,
                                    max_iterations=20)
    print("\ndone!")

    # persist the clusters
    rw.write2file(user_cluster_set, foutput_user_cluster_set)
    print("file saved done!")

    # report cluster-size distribution (sorted descending; the code that
    # prints the top 20% is cut off after this fragment)
    print("top 20% of user cluster:")
    length = []
    for lst in user_cluster_set:
        length.append(len(lst))
    length.sort(reverse=True)