def main(args):
    """Load a raw ratings CSV, build CSR matrices, split by time, and save NPZ files."""
    progress = WorkSplitter()

    # Read the raw ratings and map the original user/item ids to dense
    # 0-based integer ids via factorize.
    raw = pd.read_csv(args.path + args.name,
                      names=['user', 'item', 'rating', 'timestamp'])
    raw['userID'] = pd.factorize(raw.user)[0]
    raw['itemID'] = pd.factorize(raw.item)[0]

    progress.section("Load Raw Data")
    rating_matrix = getSparseMatrix(raw, row_name='userID',
                                    col_name='itemID', value_name='rating')
    timestamp_matrix = getSparseMatrix(raw, row_name='userID',
                                       col_name='itemID', value_name='timestamp')

    progress.section("Split CSR Matrices")
    rtrain, rvalid, rtest, nonzero_index, rtime = time_ordered_split(
        rating_matrix=rating_matrix,
        timestamp_matrix=timestamp_matrix,
        ratio=args.ratio,
        implicit=args.implicit)

    progress.section("Save NPZ")
    for tag, matrix in (("Rtrain", rtrain), ("Rvalid", rvalid),
                        ("Rtest", rtest), ("Rtime", rtime)):
        save_numpy(matrix, args.path, tag)
    save_array(nonzero_index, args.path, "Index")
def main(args):
    """Load the Yelp data set, split it chronologically, and save NPZ files.

    Fix: removed a leftover debugger breakpoint (`import ipdb;
    ipdb.set_trace()`) that stopped every run right after the split step.
    """
    progress = WorkSplitter()

    progress.section("Parameter Setting")
    print("Data Path: {}".format(args.data_dir))
    print("Implicit User Feedback: {}".format(args.implicit))

    progress.section("Load Raw Data")
    # sampling=True with top_user_num/top_item_num keeps only the most
    # active users/items (densifies the matrix).
    rating_matrix, timestamp_matrix = get_yelp_df(
        args.data_dir + args.data_name, sampling=True,
        top_user_num=args.top_user_num, top_item_num=args.top_item_num)

    progress.section("Split CSR Matrices")
    rtrain, rvalid, rtest, nonzero_index, rtime = time_ordered_split(
        rating_matrix=rating_matrix,
        timestamp_matrix=timestamp_matrix,
        ratio=args.ratio,
        implicit=args.implicit)

    progress.section("Save NPZ")
    save_numpy(rtrain, args.data_dir, "Rtrain")
    save_numpy(rvalid, args.data_dir, "Rvalid")
    save_numpy(rtest, args.data_dir, "Rtest")
    save_numpy(rtime, args.data_dir, "Rtime")
    save_array(nonzero_index, args.data_dir, "Index")
def main(args):
    """Build rating/timestamp matrices from a pandas-readable file, split, save."""
    progress = WorkSplitter()

    progress.section("Load Raw Data")
    # Both matrices share every argument except value_name.
    shared = dict(row_name='userId', col_name='itemId',
                  path=args.path, name=args.name, shape=args.shape)
    rating_matrix = load_pandas(value_name=None, **shared)
    timestamp_matrix = load_pandas(value_name='Timestamp', **shared)

    progress.section("Split CSR Matrices")
    # sampling=True / percentage=0.2 — presumably subsamples 20% of the
    # data before splitting; confirm against time_ordered_split.
    rtrain, rvalid, rtest, nonzero_index, rtime = time_ordered_split(
        rating_matrix=rating_matrix,
        timestamp_matrix=timestamp_matrix,
        ratio=args.ratio,
        implicit=args.implicit,
        sampling=True,
        percentage=0.2)

    progress.section("Save NPZ")
    for tag, matrix in (("Rtrain", rtrain), ("Rvalid", rvalid),
                        ("Rtest", rtest), ("Rtime", rtime)):
        save_numpy(matrix, args.path, tag)
    save_array(nonzero_index, args.path, "Index")
def main(args):
    """Prepare the Yahoo R3 data set: binarize, split the random set, save, report.

    Fix: the original binarized ratings with chained indexing
    (`df['rating'].loc[mask] = v`), which assigns into a possibly temporary
    copy and is not guaranteed to update the frame (pandas
    SettingWithCopyWarning). Replaced with single `df.loc[mask, 'rating']`
    assignments. The four duplicated statistics prints are factored into a
    helper; output text is unchanged.
    """
    progress = WorkSplitter()

    progress.section("Yahoo R3: Load Raw Data")
    # "user" file: feedback from normal interaction (biased exposure);
    # "random" file: feedback on uniformly sampled items (unbiased).
    user_df = pd.read_csv(args.path + args.dataset + args.user,
                          sep=args.sep, header=None, names=args.names)
    random_df = pd.read_csv(args.path + args.dataset + args.random,
                            sep=args.sep, header=None, names=args.names)

    if args.implicit:
        """
        If only implicit (clicks, views, binary) feedback, convert to
        implicit feedback
        """
        # Below-threshold ratings become -1 (negative feedback), the rest +1.
        # Order matters: the -1 values written first never satisfy the
        # >= threshold mask afterwards.
        user_df.loc[user_df['rating'] < args.threshold, 'rating'] = -1
        user_df.loc[user_df['rating'] >= args.threshold, 'rating'] = 1
        random_df.loc[random_df['rating'] < args.threshold, 'rating'] = -1
        random_df.loc[random_df['rating'] >= args.threshold, 'rating'] = 1

    progress.section("Yahoo R3: Randomly Split Random Set")
    # Matrix shape comes from the user split's max ids (0-based, hence +1).
    m, n = max(user_df['uid']) + 1, max(user_df['iid']) + 1
    unif_train, validation, test = seed_randomly_split(df=random_df,
                                                       ratio=args.ratio,
                                                       split_seed=args.seed,
                                                       shape=(m, n))

    progress.section("Yahoo R3: Save NPZ")
    save_dir = args.path + args.dataset
    train = sparse.csr_matrix(
        (user_df['rating'], (user_df['uid'], user_df['iid'])),
        shape=(m, n), dtype='float32')
    save_numpy(train, save_dir, "S_c")
    save_numpy(unif_train, save_dir, "S_t")
    save_numpy(validation, save_dir, "S_va")
    save_numpy(test, save_dir, "S_te")

    progress.section("Yahoo R3: Statistics of Data Sets")

    def _print_stats(tag, matrix):
        # Share of stored entries equal to +1 (positives) vs everything else.
        num = matrix.count_nonzero()
        pos = np.sum(matrix == 1) / num
        print('* %s #num: %6d, pos: %.6f, neg: %.6f' % (tag, num, pos, 1 - pos))

    _print_stats('S_c', train)
    _print_stats('S_t', unif_train)
    _print_stats('S_va', validation)
    _print_stats('S_te', test)
def main(args):
    """Split users randomly into train/test, optionally carve validation/active
    sets by time, and save NPZ files.

    Fix: `ractive` was only assigned inside the `if args.validation:` branch,
    but the original saved it unconditionally, so every run with
    `args.validation` false crashed with NameError. The Ractive save is now
    guarded; everything else is unchanged.
    """
    progress = WorkSplitter()

    progress.section("Parameter Setting")
    print("Data Path: {}".format(args.path))
    print("Validation: {}".format(args.validation))
    print("Implicit: {}".format(args.implicit))

    progress.section("Load Raw Data")
    rating_matrix = load_pandas(path=args.path, name=args.name,
                                shape=args.shape)
    timestamp_matrix = load_pandas(path=args.path, value_name='timestamp',
                                   name=args.name, shape=args.shape)

    progress.section("Split CSR Matrices")
    # First split the user population into train/valid/test.
    rtrain, rvalid, rtest, _, _, rtime = split_user_randomly(
        rating_matrix=rating_matrix,
        timestamp_matrix=timestamp_matrix,
        ratio=args.split_user_ratio,
        implicit=args.implicit)

    ractive = None
    if args.validation:
        # Carve a time-ordered validation set out of the training users,
        # and an "active" (history) slice out of the test users.
        rtrain, rvalid, _, _, _ = time_ordered_split(
            rating_matrix=rtrain,
            timestamp_matrix=rtime,
            ratio=args.split_train_valid_ratio,
            implicit=False,
            remove_empty=False)
        ractive, rtest, _, _, _ = time_ordered_split(
            rating_matrix=rtest,
            timestamp_matrix=rtime,
            ratio=args.split_active_test_ratio,
            implicit=False,
            remove_empty=False)

    progress.section("Save NPZ")
    save_numpy(rtrain, args.path, "Rtrain")
    save_numpy(rvalid, args.path, "Rvalid")
    if ractive is not None:
        # Ractive exists only when the validation split was performed.
        save_numpy(ractive, args.path, "Ractive")
    save_numpy(rtest, args.path, "Rtest")
    save_numpy(rtime, args.path, "Rtime")
def main(args):
    """Split the tab-separated Yahoo track ratings with a seeded RNG and save."""
    progress = WorkSplitter()

    progress.section("Load Raw Data")
    rating_matrix = load_pandas_without_names(
        path=args.path, name=args.name, row_name='userId', sep='\t',
        col_name='trackId', value_name='rating', shape=args.shape,
        names=['userId', 'trackId', 'rating'])

    progress.section("Split CSR Matrices")
    # threshold=80 — presumably the positive-rating cutoff; percentage=0.2
    # with sampling=True presumably keeps a 20% subsample. Confirm against
    # split_seed_randomly.
    splits = split_seed_randomly(rating_matrix=rating_matrix,
                                 ratio=args.ratio,
                                 threshold=80,
                                 implicit=args.implicit,
                                 sampling=True,
                                 percentage=0.2)
    rtrain, rvalid, rtest, nonzero_index = splits
    print("Done splitting Yahoo dataset")

    progress.section("Save NPZ")
    for tag, matrix in zip(("Rtrain", "Rvalid", "Rtest"),
                           (rtrain, rvalid, rtest)):
        save_numpy(matrix, args.path, tag)
    save_array(nonzero_index, args.path, "Index")
    print("Done saving data for yahoo after splitting")
def main(args):
    """Time-ordered split of the Netflix data set, saved as NPZ files."""
    progress = WorkSplitter()

    progress.section("Load Raw Data")
    # NOTE(review): input is read from args.folder while outputs go to
    # args.path — looks like separate in/out directories, but worth
    # confirming the two arguments are both set by callers.
    rating_matrix, timestamp_matrix = load_netflix(path=args.folder,
                                                   shape=args.shape)

    progress.section("Split CSR Matrices")
    rtrain, rvalid, rtest, nonzero_index, rtime = time_ordered_split(
        rating_matrix=rating_matrix,
        timestamp_matrix=timestamp_matrix,
        ratio=args.ratio,
        implicit=args.implicit)

    progress.section("Save NPZ")
    outputs = {"Rtrain": rtrain, "Rvalid": rvalid,
               "Rtest": rtest, "Rtime": rtime}
    for tag, matrix in outputs.items():
        save_numpy(matrix, args.path, tag)
    save_array(nonzero_index, args.path, "Index")
def main(args):
    """Randomly split the Yahoo rating matrix with a fixed seed and save NPZ."""
    progress = WorkSplitter()

    progress.section("Load Raw Data")
    # Yahoo-specific loader; no timestamp matrix is needed for a seeded
    # random split.
    rating_matrix = load_yahoo(path=args.path, name=args.name,
                               shape=args.shape)

    progress.section("Split CSR Matrices")
    split_result = split_seed_randomly(rating_matrix=rating_matrix,
                                       ratio=args.ratio,
                                       implicit=args.implicit)
    rtrain, rvalid, rtest, nonzero_index = split_result
    print("Done splitting Yahoo dataset")

    progress.section("Save NPZ")
    for tag, matrix in zip(("Rtrain", "Rvalid", "Rtest"),
                           (rtrain, rvalid, rtest)):
        save_numpy(matrix, args.path, tag)
    save_array(nonzero_index, args.path, "Index")
    print("Done saving data for yahoo after splitting")
def main(args): progress = WorkSplitter() progress.section("Parameter Setting") print("Data Path: {}".format(args.data_dir)) reviewJsonToronto = args.data_dir + args.data_name progress.section("Load data") df = get_yelp_df(path='', filename=reviewJsonToronto, sampling=True) print('Data loaded sucessfully') progress.section("Matrix Generation") rating_matrix, timestamp_matrix, I_C_matrix, IC_dictionary = get_rating_timestamp_matrix( df) # get ratingWuserAvg_matrix rating_array = rating_matrix.toarray() user_average_array = rating_array.sum(axis=1) / np.count_nonzero( rating_array, axis=1) init_UI = np.zeros(rating_array.shape) init_UI[rating_array.nonzero()] = 1 #Creating rating with user average array array for i in range(user_average_array.shape[0]): init_UI[i] = init_UI[i] * (user_average_array[i] - 0.001) user_average_array = init_UI ratingWuserAvg_array = rating_array - user_average_array ratingWuserAvg_matrix = sparse.csr_matrix(ratingWuserAvg_array) progress.section("Split for training") rtrain_implicit, rvalid_implicit, rtest_implicit, rtrain_userAvg_implicit, rvalid_userAvg_implicit, \ rtest_userAvg_implicit, nonzero_index, rtime, item_idx_matrix_train_implicit,item_idx_matrix_valid_implicit, item_idx_matrix_test_implicit \ = time_ordered_splitModified(rating_matrix=rating_matrix, ratingWuserAvg_matrix=ratingWuserAvg_matrix, timestamp_matrix=timestamp_matrix, ratio=[0.5,0.2,0.3], implicit=True, remove_empty=False, threshold=3,sampling=False, sampling_ratio=0.1, trainSampling=0.95) rtrain, rvalid, rtest, rtrain_userAvg, rvalid_userAvg, rtest_userAvg, nonzero_index, rtime, \ item_idx_matrix_train,item_idx_matrix_valid, item_idx_matrix_test = time_ordered_splitModified(rating_matrix=rating_matrix, ratingWuserAvg_matrix=ratingWuserAvg_matrix, timestamp_matrix=timestamp_matrix, ratio=[0.5,0.2,0.3], implicit=False, remove_empty=False, threshold=3, sampling=False, sampling_ratio=0.1, trainSampling=0.95) rtrain = rtrain + rvalid + rtest rtrain_implicit = 
rtrain_implicit + rvalid_implicit + rtest_implicit progress.section("Get UC Matrix") #Get UC matrices U_C_matrix_explicit, U_C_matrix_implicit = get_UC_Matrix( I_C_matrix, rtrain_implicit) progress.section("Get IK Similarity") IK_MATRIX = ikGeneration(df) IK_similarity = train(IK_MATRIX) ''' progress.section("Get IC Similarity") IC_similarity = train(I_C_matrix) ''' progress.section("Get IP, IS, ID Dictionary") #intersection = get_intersection() intersection_yonge_and_finch, intersection_bloor_and_bathurst, intersection_spadina_and_dundas,\ intersection_queen_and_spadina, intersection_bloor_and_yonge, intersection_dundas_and_yonge = get_intersection() IP_df, IP_dictionary = get_IP_matrix_dictionary(df, IK_similarity) IS_dictionary = get_IS_dictionary(df) #ID_dictionary = get_ID_dictionary(df,list(set(df['business_num_id'])),intersection) ID_dictionary_yonge_and_finch = get_ID_dictionary( df, list(set(df['business_num_id'])), intersection_yonge_and_finch) ID_dictionary_bloor_and_bathurst = get_ID_dictionary( df, list(set(df['business_num_id'])), intersection_bloor_and_bathurst) ID_dictionary_spadina_and_dundas = get_ID_dictionary( df, list(set(df['business_num_id'])), intersection_spadina_and_dundas) ID_dictionary_queen_and_spadina = get_ID_dictionary( df, list(set(df['business_num_id'])), intersection_queen_and_spadina) ID_dictionary_bloor_and_yonge = get_ID_dictionary( df, list(set(df['business_num_id'])), intersection_bloor_and_yonge) ID_dictionary_dundas_and_yonge = get_ID_dictionary( df, list(set(df['business_num_id'])), intersection_dundas_and_yonge) progress.section("user item predict") user_item_prediction_score = predict(rtrain, 110, IK_similarity, item_similarity_en=True) UI_Prediction_Matrix = prediction(user_item_prediction_score, rtrain) progress.section("Save datafiles csv") save_dataframe_csv(df, args.data_dir, "Dataframe") progress.section("Save datafiles JSON") saveDictToJson(IC_dictionary, args.data_dir, 'icDictionary', trainOrTest='train') 
saveDictToJson(IP_dictionary, args.data_dir, 'ipDictionary', trainOrTest='train') saveDictToJson(IS_dictionary, args.data_dir, 'isDictionary', trainOrTest='train') #saveDictToJson(ID_dictionary, args.data_dir, 'idDictionary', trainOrTest='train') saveDictToJson(ID_dictionary_yonge_and_finch, args.data_dir, 'idDictionary_yongefinch', trainOrTest='train') saveDictToJson(ID_dictionary_bloor_and_bathurst, args.data_dir, 'idDictionary_bloorbathurst', trainOrTest='train') saveDictToJson(ID_dictionary_spadina_and_dundas, args.data_dir, 'idDictionary_spadinadundas', trainOrTest='train') saveDictToJson(ID_dictionary_queen_and_spadina, args.data_dir, 'idDictionary_queenspadina', trainOrTest='train') saveDictToJson(ID_dictionary_bloor_and_yonge, args.data_dir, 'idDictionary_blooryonge', trainOrTest='train') saveDictToJson(ID_dictionary_dundas_and_yonge, args.data_dir, 'idDictionary_dundasyonge', trainOrTest='train') progress.section("Save datafiles Numpy") save_numpy_csr(rtrain, args.data_dir, "rtrain") save_numpy_csr(I_C_matrix, args.data_dir, "icmatrix") #save_numpy(user_item_prediction_score, args.data_dir, "predictionScore") save_numpy(IK_similarity, args.data_dir, "IKbased_II_similarity") #Tina requested for this name save_numpy(UI_Prediction_Matrix, args.data_dir, "UI_prediction_matrix") '''