def test_knn_condensed(self):
    """Check that condensing the training split leaves strictly fewer rows than train_df holds."""
    # Load abalone and split a 350-row random sample so the run stays short.
    abalone = Data('abalone', pd.read_csv(r'data/abalone.data', header=None), 8)
    abalone.split_data(data_frame=abalone.df.sample(n=350))  # sets test and train data

    reduced = KNN(5, abalone).condense_data(abalone.train_df)
    rows_after = reduced.shape[0]
    # Read only after condensing, exactly as before, in case condense_data touches train_df.
    rows_before = abalone.train_df.shape[0]

    self.assertGreater(rows_before, rows_after)
def test_edit_vs_condese(self):
    """Print the edited and condensed training-set sizes and which reduction is smaller.

    Nothing is asserted; the method only reports. Each reduction works on its
    own freshly loaded data set and its own independent 350-row random sample.
    """
    csv_path = r'data/abalone.data'

    # Edited nearest neighbour on the first sample.
    edit_source = Data('abalone', pd.read_csv(csv_path, header=None), 8)
    edit_source.split_data(data_frame=edit_source.df.sample(n=350))
    edited = KNN(5, edit_source).edit_data(
        edit_source.train_df, 5, edit_source.test_df, edit_source.label_col)

    # Condensed nearest neighbour on a second, separately drawn sample.
    condense_source = Data('abalone', pd.read_csv(csv_path, header=None), 8)
    condense_source.split_data(data_frame=condense_source.df.sample(n=350))
    condensed_rows = KNN(5, condense_source).condense_data(
        condense_source.train_df).shape[0]

    print("----------")
    print(edited.shape[0])
    print(condensed_rows)
    # Ties go to "edited": condensed wins only when it is strictly smaller.
    print("Run condensed" if condensed_rows < edited.shape[0] else "Run edited")
def RBFREG_exp(data_config, data):
    """Train four RBF regressors (4, 6, 8 and 12 clusters) and report each one's loss.

    Args:
        data_config: One of 'condensed', 'edited', 'k-means' or 'medoids'.
            'condensed' and 'edited' first replace ``data.train_df`` with the
            output of ``KNN.condense_data`` / ``KNN.edit_data``; 'k-means'
            builds ``RBFRegK`` models, every other config builds ``RBFReg``.
        data: Project data object. Its ``df`` columns are cast to float,
            ``split_data`` is called on it, and ``train_df`` may be replaced.

    Raises:
        ValueError: If ``data_config`` is not one of the four supported names.
    """
    # Fail fast. An unknown config used to fall through every branch and only
    # crash later on an unbound `rbf`, after `data` had already been modified.
    supported = ('condensed', 'edited', 'k-means', 'medoids')
    if data_config not in supported:
        raise ValueError(
            "unknown data_config {!r}; expected one of {}".format(data_config, supported))

    df = data.df  # get the dataframe from data
    print("Checking DF set")
    print(df[df.columns[-1]])

    # Double check the data is numerical. Columns are assigned back into `df`
    # one by one (no rebinding), the same way the cast was always done here.
    for col in df.columns:
        df[col] = df[col].astype(float)

    # split into test/train
    data.split_data(data_frame=df)

    model_cls = RBFReg
    if data_config == 'condensed':
        # Run RBF on condensed data set
        data.train_df = KNN(5, data).condense_data(data.train_df)
        print(
            "\n---------------- Running Condensed Nearest Neighbor RBF -----------------"
        )
        print('Size of data: ', data.train_df.shape)
    elif data_config == 'edited':
        # Run RBF on edited dataset
        # NOTE(review): edit_data is handed data.test_df; if it serves as the
        # validation set, test rows influence training -- confirm intended.
        data.train_df = KNN(5, data).edit_data(
            data.train_df, 5, data.test_df, data.label_col)
        print(
            "\n---------------- Running Edited Nearest Neighbor RBF -----------------\n"
        )
        print('Size of data: ', data.train_df.shape)
    elif data_config == 'k-means':
        # Run RBF on K-means
        print("\n---------------- Running K-Means RBF -----------------\n")
        model_cls = RBFRegK
    else:
        # 'medoids': this function builds plain RBFReg models for it.
        print("\n---------------- Running Medoids RBF -----------------\n")

    models = [model_cls(clusters=k, maxruns=1000) for k in (4, 6, 8, 12)]

    # Labels are taken from the last column, and only now, so the training
    # labels line up with the (possibly condensed/edited) training rows.
    train_labels = data.train_df[data.train_df.columns[-1]]
    expc_list = data.test_df[data.test_df.columns[-1]].values.tolist()

    lf = LF()
    for run, model in enumerate(models, start=1):
        model.trainReg(data.train_df, train_labels, data)
        # Predict with the model that was just trained and score *its* output.
        # Runs 2-4 used to predict with the first model and re-score run 1.
        predicts = model.predictReg(data.test_df, data)
        print("predicts RBF {}".format(run))
        print(predicts)
        print("expected")
        print(expc_list)
        lf.mean_squared_error(predicts, expc_list)
        lf.zero_one_loss(predicts, expc_list)
def RBFREG_vid(data_config, data):
    """Train one 8-cluster RBF regressor on a 100-row sample and plot the result.

    Args:
        data_config: One of 'condensed', 'edited', 'k-means' or 'medoids'.
            'condensed' and 'edited' first replace ``data.train_df`` with the
            output of ``KNN.condense_data`` / ``KNN.edit_data``; 'k-means'
            builds an ``RBFRegK`` model, every other config builds ``RBFReg``.
        data: Project data object. ``data.df`` is sampled, ``split_data`` is
            called on the sample, and ``train_df`` may be replaced.

    Raises:
        ValueError: If ``data_config`` is not one of the four supported names.

    Side effects:
        Prints predictions and expected values, saves the current figure under
        ``data.name + '_' + data_config`` and then clears the figure.
    """
    # Fail fast. An unknown config used to fall through every branch and only
    # crash later on an unbound `rbf`, after `data` had already been modified.
    supported = ('condensed', 'edited', 'k-means', 'medoids')
    if data_config not in supported:
        raise ValueError(
            "unknown data_config {!r}; expected one of {}".format(data_config, supported))

    df = data.df.sample(100)  # take a small subsection of the dataframe
    data_name = data.name
    print("\nChecking DF set")
    print(df[df.columns[-1]])

    # double check data is numerical
    for col in df.columns:
        df[col] = df[col].astype(float)

    # split into test/train
    data.split_data(data_frame=df)

    if data_config == 'condensed':
        # Run RBF on condensed data set
        data.train_df = KNN(5, data).condense_data(data.train_df)
        print(
            "\n---------------- Running Condensed Nearest Neighbor RBF Data: "
            + data_name + "-----------------")
        print('Size of data: ', data.train_df.shape)
        rbf = RBFReg(clusters=8, maxruns=600)
    elif data_config == 'edited':
        # Run RBF on edited dataset
        # NOTE(review): edit_data is handed data.test_df; if it serves as the
        # validation set, test rows influence training -- confirm intended.
        data.train_df = KNN(5, data).edit_data(
            data.train_df, 5, data.test_df, data.label_col)
        print("\n---------------- Running Edited Nearest Neighbor RBF Data: "
              + data_name + "-----------------")
        print('Size of data: ', data.train_df.shape)
        rbf = RBFReg(clusters=8, maxruns=600)
    elif data_config == 'k-means':
        # Run RBF on K-means
        print("\n---------------- Running K-Means RBF Data: " + data_name
              + "-----------------")
        rbf = RBFRegK(clusters=8, maxruns=600)
    else:
        # 'medoids': this function builds a plain RBFReg model for it.
        print("\n---------------- Running Mediods RBF Data: " + data_name
              + "-----------------")
        rbf = RBFReg(clusters=8, maxruns=600)

    # Take the labels only after train_df has reached its final form. They
    # used to be captured before condensing/editing, so trainReg received
    # labels for rows that were no longer in the training frame.
    train_labels = data.train_df[data.train_df.columns[-1]]
    test_labels = data.test_df[data.test_df.columns[-1]]

    # will have high error due to small dataset, but just a test to show how this works
    rbf.trainReg(data.train_df, train_labels, data)
    print('Calculate predictions for the RBF')
    predicts = rbf.predictReg(data.test_df, data)
    expc_list = test_labels.values.tolist()
    print("predicts RBF")
    print(predicts)
    print("expected")
    print(expc_list)

    lf = LF()
    mse = lf.mean_squared_error(predicts, expc_list)
    zeroone = lf.zero_one_loss(predicts, expc_list)

    plt.plot(predicts, label=data_name + ' ' + data_config + ' prediction')
    plt.plot(expc_list, label=data_name + ' ' + data_config + ' expected')
    # The two loss values are plotted as-is so that they show up in the legend.
    plt.plot(mse, label='MSE: ' + str(mse))
    plt.plot(zeroone, label='0-1 Loss: ' + str(zeroone))
    plt.legend()
    plt.title('Data: ' + data_name)
    plt.ylabel('Expected value/ Predicted Value')
    plt.xlabel('# Predictions')
    # Code for saving a plot to image sourced from:
    # https://pythonspot.com/matplotlib-save-figure-to-image-file/
    plt.savefig(data_name + '_' + data_config)
    plt.clf()