def test_data_generator(self):
    FOV_Path = r'C:\Users\sunzh\CS636\Summer project\BPN\utils\test\test_data\a.tif'
    LABEL_Path = r'C:\Users\sunzh\CS636\Summer project\BPN\utils\test\test_data\a.tif'
    IMAGE_Path = r'C:\Users\sunzh\CS636\Summer project\BPN\utils\test\test_data\a.tif'
    data_creator = Data(FOV_Path, LABEL_Path, IMAGE_Path, C, False)
    Path = r'C:\Users\sunzh\CS636\Summer project\BPN\utils\test\test_data'
    data_creator.create_data(Path)
    first_data_generator = itertools.cycle(
        D.data_generator(Path + r'\FOV', Path + r'\LABEL', Path + r'\IMAGE', 0, 0))
    for z in range(D.start_z, D.end_z):
        for x in range(D.start_xy, D.end_xy):
            for y in range(D.start_xy, D.end_xy):
                Fov, Label, Image, i, j, k = next(first_data_generator)
                self.assertEqual(x, i)
                self.assertEqual(y, j)
                self.assertEqual(z, k)
                self.assertEqual(
                    np.sum(Fov - tiff.imread(
                        r'C:\Users\sunzh\CS636\Summer project\BPN\utils\test\test_data\FOV\FOV_%d%d%d_%d_%d.tif'
                        % (x, y, z, 0, 0))), 0)
                self.assertEqual(
                    np.sum(Label - tiff.imread(
                        r'C:\Users\sunzh\CS636\Summer project\BPN\utils\test\test_data\LABEL\LABEL_%d%d%d.tif'
                        % (x, y, z))), 0)
                self.assertEqual(
                    np.sum(Image - tiff.imread(
                        r'C:\Users\sunzh\CS636\Summer project\BPN\utils\test\test_data\IMAGE\IMAGE_%d%d%d.tif'
                        % (x, y, z))), 0)
def TestClassifier():
    data = Data("cs_170_small80.txt")
    start = time.time()
    d = data.selectFeature(data.data, [0, 5, 3, 7])
    validator = LeaveOneOutValidator(d, KnearestNeighbor)
    print(validator.validate())
    print("time: ", time.time() - start)
def TestFeatureSelection():
    data = Data("cs_170_small80.txt")
    dat = data.preprocess()
    print(dat.shape)
    print("dat type: ", type(dat))
    col_nums = dat.shape[1]
    feature_indices = []
    for d in range(col_nums):
        feature_indices.append(d)
    print(data.selectFeature(dat, feature_indices))
def main():
    path = 'DM_Experiment4/iris.arff'
    choice = input('Use KMeans Enter 1;Use DBSCAN Enter 2:')
    # load data
    data_obj = Data(path=path)
    data_obj.load_data()
    algorithm_router(choice, data_obj)
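# `algorithm_router` is used above but not defined in this snippet. A minimal
# sketch of what it might look like, assuming hypothetical KMeans and DBSCAN
# wrapper classes that operate on the loaded Data instance; the names and the
# run() interface are illustrative, not the project's actual API.
def algorithm_router(choice, data_obj):
    if int(choice) == 1:
        KMeans(data_obj).run()   # hypothetical KMeans wrapper
    elif int(choice) == 2:
        DBSCAN(data_obj).run()   # hypothetical DBSCAN wrapper
    else:
        raise ValueError('choice must be 1 (KMeans) or 2 (DBSCAN)')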
def TestDistance():
    data = Data("testData.txt")
    validator = LeaveOneOutValidator(data.data, KnearestNeighbor)
    d = data.preprocess()
    test, train = validator.leaveOneOut(d, 0)
    knn = KnearestNeighbor()
    print("test: ", test)
    print("train: \n", train)
    print("distance: ", knn.distance(train, test))
    print("\n\n")
def TestBackwardsSearch():
    data = Data("cs_170_small80.txt")
    start = time.time()
    validator = LeaveOneOutValidator(data.data, KnearestNeighbor)
    backEl = BackwardsElimination(data, validator)
    backEl.search()
    print("time: ", time.time() - start)
def TestForwardSearch():
    data = Data("cs_170_small80.txt")
    start = time.time()
    validator = LeaveOneOutValidator(data.data, KnearestNeighbor)
    fwdSlct = ForwardSelection(data, validator)
    fwdSlct.search()
    print("time: ", time.time() - start)
def test_createdata_SAME(self):
    D = Data(imagepath, labelpath, promappath, C)
    Path = r'C:\Users\sunzh\CS636\Summer project\BPN\data'
    D.create_data(Path)
    r_xy = int((D.size[0] - 1) / 2)
    r_z = max(int((D.size[2] - 1) / 2), 1)
    for x in range(r_xy, r_xy + D.cropped_size[0]):
        for y in range(r_xy, r_xy + D.cropped_size[1]):
            for z in range(r_z, r_z + D.cropped_size[2]):
                self.assertTrue(
                    os.path.isfile(Path + r'\FOV\FOV_%d%d%d_0_0.tif' % (x, y, z)))
                self.assertTrue(
                    os.path.isfile(Path + r'\LABEL\LABEL_%d%d%d.tif' % (x, y, z)))
    print('2nd test case finished')
def run_model():
    d = Data(LANG, DEVorTEST, GLOVE_FILE, ELMO_FILE, MODEL, DEP_ADJACENCY_GCN,
             POSITION_EMBED)
    # loads train, dev, and test if available, and also word2vec and ELMo where relevant
    d.load_data(DATAPATH)
    model = Tagger(d, d.max_length, d.input_dim, d.n_poses, d.n_classes,
                   initial_weight)
    tagger = getattr(model, MODEL)()  # choose the specified tagging model
    T = Train_Test(POS, MODEL, tagger, d)
    if DEVorTEST == "CROSS_VAL":
        T.cross_validation(EPOCHS, BATCH_SIZE, DATAPATH)
    else:
        T.train(EPOCHS, BATCH_SIZE)
        T.test(DATAPATH)  # DATAPATH is passed here to be used for evaluation
def run_model():
    # args: lang, train, dev, test, word2vec_dir, elmo_dir, model_name
    d = Data(LANG_TR, LANG_DEV, LANG_TS, DEVorTEST, WV_DIR, ELMO_PATH, MODEL,
             DEP_ADJACENCY_GCN, DEP_INFO, POS)
    # loads train, dev (if available), test (if available), and also word2vec and ELMo
    d.load_data(DATAPATH)
    # args: max_length, n_poses, n_classes, initial_weight=''
    model = Tagger(d, initial_weight)
    tagger = getattr(model, MODEL)()  # choose the specified tagging model
    print(tagger)
    T = Train_Test(POS, W2V, MODEL, tagger, d, DEVorTEST)
    if DEVorTEST == "CROSS_VAL":
        T.cross_validation(EPOCHS, BATCH_SIZE, DATAPATH)
    else:
        T.train(EPOCHS, BATCH_SIZE)
        T.test(DATAPATH)  # DATAPATH is passed so this function can return the evaluation
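# run_model() assumes a set of module-level settings defined elsewhere in the
# project. A purely hypothetical example configuration: the names match the
# snippet above, but every value here is illustrative, not the project's own.
LANG_TR, LANG_DEV, LANG_TS = 'en.train', 'en.dev', 'en.test'
DEVorTEST = 'TEST'                # or 'CROSS_VAL'
WV_DIR = 'embeddings/word2vec/'   # hypothetical path
ELMO_PATH = 'embeddings/elmo/'    # hypothetical path
MODEL = 'bilstm'                  # must name a method defined on Tagger
DEP_ADJACENCY_GCN = False
DEP_INFO = False
POS = True
W2V = True
DATAPATH = 'data/'                # hypothetical path
EPOCHS = 10
BATCH_SIZE = 32
initial_weight = ''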
def main():
    # set paths
    iris_path = 'DataAnalysisProjectDesign/Experiment2/iris_train.arff'
    adult_path = 'DataAnalysisProjectDesign/Experiment2/adult_train.arff'
    # get choices
    data_choice = input('Enter 1 for iris; Enter 2 for adult:')
    dt_num = int(input('Enter your expected tree number:'))
    path = select_dataset(data_choice, iris_path, adult_path)
    # create data instance
    data_obj = Data(path)
    data_obj.load_data()
    data_obj.fill_missing_data()
    # create random forest
    rf = RandomForest(data=data_obj, dt_num=dt_num)
    rf.bagging()
    rf.train_rf()
    correct_rate, conf_mat = rf.test_rf()
    return dt_num, correct_rate, conf_mat
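# `select_dataset` is referenced above but not shown. A minimal sketch, assuming
# it simply maps the user's choice to one of the two prepared paths; the body is
# an assumption, not the project's actual implementation.
def select_dataset(data_choice, iris_path, adult_path):
    return iris_path if int(data_choice) == 1 else adult_path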
def test_init(self):
    D = Data(imagepath, labelpath, promappath, C)
    self.assertEqual(D.padding, C.boundary_padding)
    self.assertEqual(D.size, C.field_of_view_scales)
    self.assertEqual(D.sample, C.sample_fov)
    self.assertEqual(D.stride_hw, C.stride_hw)
    self.assertEqual(D.stride_depth, C.stride_depth)
    self.assertEqual(D.cropped_size, C.cropped_size)
    self.assertAlmostEqual(
        int(np.sum(D.image - D.featurewise_std_normalization(
            D.featurewise_center(
                Image[0:C.cropped_size[0], 0:C.cropped_size[1], 0:C.cropped_size[2]])))), 0)
    self.assertEqual(
        np.sum(D.label - D.label_making(
            Label[0:C.cropped_size[0], 0:C.cropped_size[1], 0:C.cropped_size[2]])), 0)
    self.assertEqual(
        np.sum(D.promap - D.LLR(
            Promap[0:C.cropped_size[0], 0:C.cropped_size[1], 0:C.cropped_size[2]])), 0)
    print('1st test case finished')
def train_rf(self):
    for data_df in self.bagging_data:
        params_dict = self.get_train_params(data_df)
        data = Data(path=None,
                    dataset=None,
                    df=data_df,
                    fea_column=params_dict.get('fea_column'),
                    nom_columns=params_dict.get('nom_columns'),
                    num_columns=params_dict.get('num_columns'),
                    class_column=params_dict.get('class_column'),
                    class_data=params_dict.get('class_data'))
        tree = self.train_dt(train_data=data,
                             split_path=bytes(' ', encoding='utf-8'),
                             val=None,
                             num_flag=-1)
        self.dt_list.append(tree)
        self.root_list.append(tree)
def TestMenu():
    print("Welcome to Robert's Nearest Neighbor Feature Search Algorithm")
    filename = input("Enter filename to data: ")
    data = Data(filename)
    validator = LeaveOneOutValidator(data.data, KnearestNeighbor)
    print("Choose the Algorithm you'd like to run: (e.g. 1)")
    print("1. Forward Selection")
    print("2. Backward Elimination")
    algorithm_choice = int(input())
    if algorithm_choice == 1:
        algorithm = ForwardSelection(data, validator)
    elif algorithm_choice == 2:
        algorithm = BackwardsElimination(data, validator)
    print("This dataset has", data.data.shape[1], "features and",
          data.data.shape[0], "instances")
    algorithm.search()
def test_data_exchange_3d(self):
    C1 = Config()
    C1.field_of_view_scales = [3, 3, 3]
    C1.cropped_size = [5, 5, 3]
    D1 = data_generator.Data_Generator(C1)
    FOV_Path = r'C:\Users\sunzh\CS636\Summer project\BPN\utils\test\test_data\b.tif'
    LABEL_Path = r'C:\Users\sunzh\CS636\Summer project\BPN\utils\test\test_data\b.tif'
    IMAGE_Path = r'C:\Users\sunzh\CS636\Summer project\BPN\utils\test\test_data\b.tif'
    data_creator = Data(FOV_Path, LABEL_Path, IMAGE_Path, C1, False)
    Path = r'C:\Users\sunzh\CS636\Summer project\BPN\utils\test\test_data'
    data_creator.create_data(Path)
    print(data_creator.promap)
    D1.data_exchange(Path + r'\FOV', 1, 0)
    first_data_generator = itertools.cycle(
        D1.data_generator(Path + r'\FOV', Path + r'\LABEL', Path + r'\IMAGE', 1, 0))
    # Ch1: expected FOV sums for the first 25 positions
    expected_ch1 = [
        8, 24, 36, 48, 40,
        72, 7 * 18, 8 * 18, 9 * 18, 120,
        11 * 12, 12 * 18, 13 * 18, 14 * 18, 15 * 12,
        16 * 12, 17 * 18, 18 * 18, 19 * 18, 20 * 12,
        21 * 8, 22 * 12, 23 * 12, 24 * 12, 25 * 8,
    ]
    Fov, Label, Image, i, j, k = next(first_data_generator)
    print(Fov)
    self.assertEqual(np.sum(Fov), expected_ch1[0])
    for expected in expected_ch1[1:]:
        Fov, Label, Image, i, j, k = next(first_data_generator)
        self.assertEqual(np.sum(Fov), expected)
    #################### Ch2
    expected_ch2 = [
        1 * 12, 2 * 18, 3 * 18, 4 * 18, 5 * 12,
        6 * 18, 7 * 27, 8 * 27, 9 * 27, 10 * 6 * 3,
        11 * 6 * 3,
    ]
    for expected in expected_ch2:
        Fov, Label, Image, i, j, k = next(first_data_generator)
        self.assertEqual(np.sum(Fov), expected)
def main():
    # set paths
    iris_path = [
        'DataAnalysisProjectDesign/Experiment1/iris_train.arff',
        'DataAnalysisProjectDesign/Experiment1/iris_test.arff'
    ]
    adult_path = [
        'DataAnalysisProjectDesign/Experiment1/adult_train.arff',
        'DataAnalysisProjectDesign/Experiment1/adult_test.arff'
    ]
    # get choices
    data_choice = input('Enter 1 for iris DT; Enter 2 for adult DT:')
    tree_choice = input('Enter 1 for ID3; Enter 2 for CART:')
    path = select_dataset(data_choice, iris_path, adult_path)
    # create train data instance
    train_data_obj = Data(path[0])
    train_data_obj.load_data()
    train_data_obj.fill_missing_data()
    # create test data instance
    test_data_obj = Data(path[1])
    test_data_obj.clear_memory()
    test_data_obj.load_data()
    test_data_obj.fill_missing_data()
    tree = dt_router(train_data_obj, test_data_obj, tree_choice)
    tree.test()
    conf_mat, judge = tree.get_conf_mat()
    return tree, conf_mat, judge
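# `dt_router` is referenced above but not shown. A minimal sketch, assuming
# hypothetical ID3Tree and CARTTree classes built from the train and test Data
# instances; the class names are illustrative, not the project's actual ones.
def dt_router(train_data_obj, test_data_obj, tree_choice):
    if int(tree_choice) == 1:
        return ID3Tree(train_data_obj, test_data_obj)
    return CARTTree(train_data_obj, test_data_obj)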
def test_data_exchange(self):
    FOV_Path = r'C:\Users\sunzh\CS636\Summer project\BPN\utils\test\test_data\a.tif'
    LABEL_Path = r'C:\Users\sunzh\CS636\Summer project\BPN\utils\test\test_data\a.tif'
    IMAGE_Path = r'C:\Users\sunzh\CS636\Summer project\BPN\utils\test\test_data\a.tif'
    data_creator = Data(FOV_Path, LABEL_Path, IMAGE_Path, C, False)
    Path = r'C:\Users\sunzh\CS636\Summer project\BPN\utils\test\test_data'
    data_creator.create_data(Path)
    D.data_exchange(Path + r'\FOV', 1, 0)
    first_data_generator = itertools.cycle(
        D.data_generator(Path + r'\FOV', Path + r'\LABEL', Path + r'\IMAGE', 1, 0))
    # expected FOV sums, in generator order
    expected_sums = [
        4, 12, 18, 24, 20,
        36, 63, 72, 81, 60,
        66, 108, 117, 126, 90,
        96, 153, 162, 171, 120,
        84, 22 * 6, 23 * 6, 24 * 6, 100,
    ]
    for expected in expected_sums:
        Fov, Label, Image, i, j, k = next(first_data_generator)
        self.assertEqual(np.sum(Fov), expected)
t = str(t, encoding='utf-8')
t = datetime.datetime.strptime(t, '%Y-%m-%d %H:%M:%S')
for im in range(6):
    tm = t + datetime.timedelta(minutes=20)
    line = '{},{},"[{},{})",{:.2f}\n'.format(
        it[0], it[1], str(t), str(tm), preds[it][i, im])
    f.write(line)
    t = tm


intervals = [24]

if __name__ == '__main__':
    test_intervals = [15, 20, 24]
    data_dir = '../../data/dataSets'
    data_set = 'test'  # sval
    data = Data(data_dir)
    if data_set != 'test':
        if data_set == 'sval':
            batch_size = 2 * 7
        elif data_set == 'val':
            batch_size = 60 * 7
    else:
        batch_size = 3920
    all_preds = []
    all_losses = []
    for interval in intervals:
        batch_link_ftr, batch_link_id, batch_link_g, \
            batch_route_ftr, batch_route_id, batch_route_g, \
            batch_wea_ftr, batch_time_ftr, \
            batch_times, batch_route_tgt, batch_weights = get_inputs(
def build_tree(self, train_data, split_path, val, num_flag):
    df = train_data.df  # get dataframe
    # create a node
    node = CARTNode(data=train_data,
                    split_fea=None,
                    val=val,
                    num_flag=num_flag,
                    split_path=split_path,
                    belong_fea=None,
                    leaf_flag=0,
                    purity_flag=0)
    uni_class_data = node.data.class_data.unique()
    # if only one class is left, the node is pure
    if uni_class_data.shape[0] == 1:
        node.leaf_flag = 1    # mark as a leaf node
        node.purity_flag = 1  # mark as a pure node
        node.belong_fea = uni_class_data[0]
        return node
    if len(node.data.fea_column) == 0:
        # all features are used up: decide the class by majority vote
        node.leaf_flag = 1
        node.purity_flag = 0
        mode = node.data.class_data.mode().get(0)
        node.belong_fea = mode
        return node
    selected_fea, flag, divide_point, best_nom_fea_val = node.fea_selection()
    node.split_fea = selected_fea
    if flag == 0:
        # numeric attribute: split on the divide point
        split_df = [
            df[df[selected_fea] <= divide_point],
            df[df[selected_fea] > divide_point]
        ]
        # CART is a binary tree
        for i in range(2):
            if i == 0:  # left child
                if split_df[i].empty:
                    mode = node.data.class_data.mode().get(0)
                    data_obj = Data(path=None,
                                    dataset=None,
                                    df=df,
                                    fea_column=node.data.fea_column,
                                    nom_columns=node.data.nom_columns,
                                    num_columns=node.data.num_columns,
                                    class_column=node.data.class_column,
                                    class_data=node.data.class_data)
                    node.left_child = CARTNode(
                        data=data_obj,
                        split_fea=None,
                        val=None,
                        num_flag=-1,
                        split_path=split_path
                        + bytes(selected_fea, encoding='utf-8')
                        + bytes(' not exist', encoding='utf-8'),
                        belong_fea=mode,
                        leaf_flag=1,
                        purity_flag=0)
                else:  # not empty
                    data_obj = Data(path=None,
                                    dataset=None,
                                    df=split_df[i],
                                    fea_column=node.data.fea_column,
                                    nom_columns=node.data.nom_columns,
                                    num_columns=node.data.num_columns,
                                    class_column=node.data.class_column,
                                    class_data=split_df[i][node.data.class_column])
                    node.left_child = self.build_tree(
                        train_data=data_obj,
                        split_path=split_path
                        + bytes(selected_fea, encoding='utf-8')
                        + bytes('<=', encoding='utf-8')
                        + bytes(str(divide_point), encoding='utf-8')
                        + bytes(' ', encoding='utf-8'),
                        val=bytes('<=', encoding='utf-8')
                        + bytes(str(divide_point), encoding='utf-8'),
                        num_flag=0)
            else:  # right child
                if split_df[i].empty:
                    mode = node.data.class_data.mode().get(0)
                    data_obj = Data(path=None,
                                    dataset=None,
                                    df=df,
                                    fea_column=node.data.fea_column,
                                    nom_columns=node.data.nom_columns,
                                    num_columns=node.data.num_columns,
                                    class_column=node.data.class_column,
                                    class_data=node.data.class_data)
                    node.right_child = CARTNode(
                        data=data_obj,
                        split_fea=None,
                        val=None,
                        num_flag=-1,
                        split_path=split_path
                        + bytes(selected_fea, encoding='utf-8')
                        + bytes(' not exist', encoding='utf-8'),
                        belong_fea=mode,
                        leaf_flag=1,
                        purity_flag=0)
                else:
                    data_obj = Data(path=None,
                                    dataset=None,
                                    df=split_df[i],
                                    fea_column=node.data.fea_column,
                                    nom_columns=node.data.nom_columns,
                                    num_columns=node.data.num_columns,
                                    class_column=node.data.class_column,
                                    class_data=split_df[i][node.data.class_column])
                    node.right_child = self.build_tree(
                        train_data=data_obj,
                        split_path=split_path
                        + bytes(selected_fea, encoding='utf-8')
                        + bytes('>', encoding='utf-8')
                        + bytes(str(divide_point), encoding='utf-8')
                        + bytes(' ', encoding='utf-8'),
                        val=bytes('>', encoding='utf-8')
                        + bytes(str(divide_point), encoding='utf-8'),
                        num_flag=1)
    else:
        # nominal attribute: split on the best nominal value
        split_df = [
            df[df[selected_fea] == best_nom_fea_val],
            df[df[selected_fea] != best_nom_fea_val]
        ]
        for i in range(2):
            if i == 0:  # left child
                if split_df[i].empty:
                    mode = node.data.class_data.mode().get(0)
                    data_obj = Data(path=None,
                                    dataset=None,
                                    df=df,
                                    fea_column=node.data.fea_column,
                                    nom_columns=node.data.nom_columns,
                                    num_columns=node.data.num_columns,
                                    class_column=node.data.class_column,
                                    class_data=node.data.class_data)
                    node.left_child = CARTNode(
                        data=data_obj,
                        split_fea=None,
                        val=None,
                        num_flag=-1,
                        split_path=split_path
                        + bytes(selected_fea, encoding='utf-8')
                        + bytes(' not exist', encoding='utf-8'),
                        belong_fea=mode,
                        leaf_flag=1,
                        purity_flag=0)
                else:
                    data_obj = Data(path=None,
                                    dataset=None,
                                    df=split_df[i],
                                    fea_column=node.data.fea_column,
                                    nom_columns=node.data.nom_columns,
                                    num_columns=node.data.num_columns,
                                    class_column=node.data.class_column,
                                    class_data=split_df[i][node.data.class_column])
                    node.left_child = self.build_tree(
                        train_data=data_obj,
                        split_path=split_path
                        + bytes(selected_fea, encoding='utf-8')
                        + bytes(' not ', encoding='utf-8')
                        + best_nom_fea_val
                        + bytes(' ', encoding='utf-8'),
                        val=bytes(' not ', encoding='utf-8') + best_nom_fea_val,
                        num_flag=-1)
            else:  # right child
                if split_df[i].empty:
                    mode = node.data.class_data.mode().get(0)
                    data_obj = Data(path=None,
                                    dataset=None,
                                    df=df,
                                    fea_column=node.data.fea_column,
                                    nom_columns=node.data.nom_columns,
                                    num_columns=node.data.num_columns,
                                    class_column=node.data.class_column,
                                    class_data=node.data.class_data)
                    node.right_child = CARTNode(
                        data=data_obj,
                        split_fea=None,
                        val=None,
                        num_flag=-1,
                        split_path=split_path + bytes(' not exist', encoding='utf-8'),
                        belong_fea=mode,
                        leaf_flag=1,
                        purity_flag=0)
                else:
                    data_obj = Data(path=None,
                                    dataset=None,
                                    df=split_df[i],
                                    fea_column=node.data.fea_column,
                                    nom_columns=node.data.nom_columns,
                                    num_columns=node.data.num_columns,
                                    class_column=node.data.class_column,
                                    class_data=split_df[i][node.data.class_column])
                    node.right_child = self.build_tree(
                        train_data=data_obj,
                        split_path=split_path
                        + bytes(selected_fea, encoding='utf-8')
                        + bytes('\'s ', encoding='utf-8')
                        + best_nom_fea_val
                        + bytes(' ', encoding='utf-8'),
                        val=best_nom_fea_val,
                        num_flag=-1)
    return node
def TestValidatorFlow():
    data = Data("cs_170_small80.txt")
    validator = LeaveOneOutValidator(data.data, KnearestNeighbor)
    percentage_correct = validator.validate()
import numpy as np
import pandas as pd

from model import BILSTM
from preprocessing import Data

# take one 0.5-second window of the signal data as an example
# (rows 1800:3600, i.e. time 0.5-1 s; each 1800 time steps is 0.5 second)
# signal should have shape (1, 1800, 8)
signal = np.array([pd.read_csv("1_raw_data_13-12_22.03.16.txt",
                               delimiter='\t').to_numpy()[1800:3600, 1:-1]])
print(signal.shape)

# prepare data
data = Data(signal)

# initialize model
model = BILSTM()

# load model
model.load_model()

# return the prediction; the prediction for time 0.5-1 second is 1 (which is indeed correct)
print(model.predict(data.X, data.zc_feature, data.ssc_feature, data.feature_1d))
class PreprocessingTests(unittest.TestCase):
    def setUp(self):
        self.data = Data()

    def test_load(self):
        """Test existence, type, and length of loaded data"""
        self.data.load()
        self.assertIsNotNone(self.data._dataset, "loaded no data")
        self.assertEqual(type(("foo", "bar")), type(self.data._dataset),
                         "loaded no tuple")
        self.assertEqual(2, len(self.data._dataset),
                         "loaded tuple has false length")

    def test_preprocess(self):
        """Test one-hot-encoding and type conversions of preprocessed data"""
        self.data.load()
        self.data.preprocess()
        # one-hot-encodings
        np.testing.assert_array_equal([0., 1.], np.unique(self.data._dataset[0][0]),
                                      "false one-hot-encoding of train_data")
        np.testing.assert_array_equal([0., 1.], np.unique(self.data._dataset[1][0]),
                                      "false one-hot-encoding of test_data")
        self.assertEqual("float64", self.data._dataset[0][0].dtype,
                         "wrong type of train_data values")
        self.assertEqual("float64", self.data._dataset[1][0].dtype,
                         "wrong type of test_data values")
        # label vectorization
        self.assertEqual(np.ndarray, type(self.data._dataset[0][1]),
                         "wrong type of train_labels set")
        self.assertEqual(np.ndarray, type(self.data._dataset[1][1]),
                         "wrong type of test_labels set")
        self.assertEqual("float32", self.data._dataset[0][1].dtype,
                         "wrong type of train_labels values")
        self.assertEqual("float32", self.data._dataset[1][1].dtype,
                         "wrong type of test_labels values")

    def test_split_data(self):
        """Test correct train-dev-test split"""
        self.data.load()
        self.data.preprocess()
        self.data.split_data()
        # correct number of tuples
        self.assertEqual(3, len(self.data._dataset), "wrong number of splits")
        self.assertEqual(2, len(self.data._dataset[0]), "wrong number of train splits")
        self.assertEqual(2, len(self.data._dataset[1]), "wrong number of dev splits")
        self.assertEqual(2, len(self.data._dataset[2]), "wrong number of test splits")
        # existence
        self.assertIsNotNone(self.data._dataset[0][0], "train_data is None")
        self.assertIsNotNone(self.data._dataset[0][1], "train_labels is None")
        self.assertIsNotNone(self.data._dataset[1][0], "dev_data is None")
        self.assertIsNotNone(self.data._dataset[1][1], "dev_labels is None")
        self.assertIsNotNone(self.data._dataset[2][0], "test_data is None")
        self.assertIsNotNone(self.data._dataset[2][1], "test_labels is None")

    def test_train_dev_test(self):
        """Test type and shape of train, dev & test sets"""
        (train_data, train_labels), (dev_data, dev_labels), \
            (test_data, test_labels) = self.data.train_dev_test
        # type
        self.assertEqual(np.ndarray, type(train_data), "wrong type of train_data")
        self.assertEqual(np.ndarray, type(train_labels), "wrong type of train_labels")
        self.assertEqual(np.ndarray, type(dev_data), "wrong type of dev_data")
        self.assertEqual(np.ndarray, type(dev_labels), "wrong type of dev_labels")
        self.assertEqual(np.ndarray, type(test_data), "wrong type of test_data")
        self.assertEqual(np.ndarray, type(test_labels), "wrong type of test_labels")
        # shape
        self.assertEqual((15000, 10000), train_data.shape, "train_data has wrong shape")
        self.assertEqual((15000,), train_labels.shape, "train_labels have wrong shape")
        self.assertEqual((10000, 10000), dev_data.shape, "dev_data has wrong shape")
        self.assertEqual((10000,), dev_labels.shape, "dev_labels have wrong shape")
        self.assertEqual((25000, 10000), test_data.shape, "test_data has wrong shape")
        self.assertEqual((25000,), test_labels.shape, "test_labels have wrong shape")
def main(_):
    global img_row, img_col
    max_steps = 15
    # init tensorboard
    logdir_train = 'C:\\Users\\Yuval\\Documents\\tensorboard\\1' + '\\train'
    logdir_test = 'C:\\Users\\Yuval\\Documents\\tensorboard\\1' + '\\test'
    train_writer = tf.summary.FileWriter(logdir=logdir_train)
    test_writer = tf.summary.FileWriter(logdir=logdir_test)
    # TODO: add the Embedding Visualizer, a cool 3D visualization of tensorboard data

    # Import data
    mydata = Data()

    # Create the model
    x = tf.placeholder(tf.float32, [None, img_row * img_col], name='x')

    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, 2], name='labels')

    # Build the graph for the deep net
    y_conv, keep_prob = deepnn(x)

    # merge all summaries to be passed to the FileWriter
    merged_summary = tf.summary.merge_all()

    # cost function to minimize
    with tf.name_scope('cross_entropy'):
        cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))

    # use AdamOptimizer instead of the plain Gradient Descent algorithm
    with tf.name_scope('train'):
        train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

    # measure prediction accuracy by the frequency of correct classifications
    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # take time for performance analysis
    start_time = time.time()

    with tf.Session() as sess:
        # tensorboard: add graph
        train_writer.add_graph(sess.graph)
        test_writer.add_graph(sess.graph)
        # initialize CNN weights
        sess.run(tf.global_variables_initializer())
        # batch stochastic gradient descent
        for i in range(max_steps):
            batch_x, batch_y, dropout = mydata.get_batch(i, train=True)
            # run optimizer to calculate gradients
            summary, _ = sess.run([merged_summary, train_step],
                                  feed_dict={x: batch_x,
                                             y_: batch_y,
                                             keep_prob: dropout})
            train_writer.add_summary(summary, i)
            train_accuracy = accuracy.eval(feed_dict={x: batch_x,
                                                      y_: batch_y,
                                                      keep_prob: dropout})
            tf.summary.scalar('accuracy', train_accuracy)
            if i % 5 == 0:
                batch_x, batch_y, dropout = mydata.get_batch(i, train=False)
                summary, test_accuracy = sess.run([merged_summary, accuracy],
                                                  feed_dict={x: batch_x,
                                                             y_: batch_y,
                                                             keep_prob: dropout})
                test_writer.add_summary(summary, i)
                print('step %d, test accuracy %g' % (i, test_accuracy))
                print("--- %s seconds ---" % (time.time() - start_time))
        print('i=', i)
        print('test accuracy %g' % accuracy.eval(feed_dict={x: mydata.test_imgs,
                                                            y_: mydata.test_labels,
                                                            keep_prob: 1.0}))
def train_dt(self, train_data, split_path, val, num_flag):
    df = train_data.df  # get dataframe
    # create an RFNode
    node = RFNode(data=train_data,
                  split_fea=None,
                  val=val,
                  num_flag=num_flag,
                  split_path=split_path,
                  belong_fea=None,
                  leaf_flag=0,
                  purity_flag=0)
    uni_class_data = node.data.class_data.unique()
    # if only one class is left, the node is pure
    if uni_class_data.shape[0] == 1:
        node.leaf_flag = 1    # mark as a leaf node
        node.purity_flag = 1  # mark as a pure node
        node.belong_fea = uni_class_data[0]
        return node
    if len(node.data.fea_column) == 0:
        # all features are used up: decide the class by majority vote
        node.leaf_flag = 1
        node.purity_flag = 0
        mode = node.data.class_data.mode().get(0)
        node.belong_fea = mode
        return node
    selected_fea, flag, divide_point = node.fea_selection()
    node.split_fea = selected_fea
    if flag == 0:
        # a numeric attribute is the split attribute:
        # split the data on the divide point
        split_df = [
            df[df[selected_fea] <= divide_point],
            df[df[selected_fea] > divide_point]
        ]
        tmp_count = 0
        for data in split_df:
            # if one of the splits is empty, majority-vote over train_data
            # and make the child a leaf node
            if data.empty:
                mode = node.data.class_data.mode().get(0)
                # create a new Data instance
                data_obj = Data(path=None,
                                dataset=None,
                                df=df,
                                fea_column=node.data.fea_column,
                                nom_columns=node.data.nom_columns,
                                num_columns=node.data.num_columns,
                                class_column=node.data.class_column,
                                class_data=node.data.class_data)
                # append as a child of this node
                node.children.append(
                    RFNode(data=data_obj,
                           split_fea=None,
                           val=None,
                           num_flag=-1,
                           split_path=split_path
                           + bytes(selected_fea, encoding='utf-8')
                           + bytes(' not exist', encoding='utf-8'),
                           belong_fea=mode,
                           leaf_flag=1,
                           purity_flag=0))
                tmp_count += 1
            else:
                # for numeric attributes: tmp_count == 0 is the '<=' branch,
                # tmp_count == 1 is the '>' branch
                if tmp_count == 0:
                    data_obj = Data(path=None,
                                    dataset=None,
                                    df=data,
                                    fea_column=node.data.fea_column,
                                    nom_columns=node.data.nom_columns,
                                    num_columns=node.data.num_columns,
                                    class_column=node.data.class_column,
                                    class_data=data[node.data.class_column])
                    node.children.append(
                        self.train_dt(
                            train_data=data_obj,
                            split_path=split_path
                            + bytes(selected_fea, encoding='utf-8')
                            + bytes('<=', encoding='utf-8')
                            + bytes(str(divide_point), encoding='utf-8')
                            + bytes(' ', encoding='utf-8'),
                            val=bytes('<=', encoding='utf-8')
                            + bytes(str(divide_point), encoding='utf-8'),
                            num_flag=0))
                    tmp_count += 1
                else:
                    data_obj = Data(path=None,
                                    dataset=None,
                                    df=data,
                                    fea_column=node.data.fea_column,
                                    nom_columns=node.data.nom_columns,
                                    num_columns=node.data.num_columns,
                                    class_column=node.data.class_column,
                                    class_data=data[node.data.class_column])
                    node.children.append(
                        self.train_dt(
                            train_data=data_obj,
                            split_path=split_path
                            + bytes(selected_fea, encoding='utf-8')
                            + bytes('>', encoding='utf-8')
                            + bytes(str(divide_point), encoding='utf-8')
                            + bytes(' ', encoding='utf-8'),
                            val=bytes('>', encoding='utf-8')
                            + bytes(str(divide_point), encoding='utf-8'),
                            num_flag=1))
    else:
        # flag == 1: nominal data
        # get all unique values of the split attribute column
        selected_fea_value = df[selected_fea].unique()
        for val in selected_fea_value:
            split_df = df[df[selected_fea] == val]
            if split_df.empty:
                mode = node.data.class_data.mode().get(0)
                data_obj = Data(path=None,
                                dataset=None,
                                df=df,
                                fea_column=node.data.fea_column,
                                nom_columns=node.data.nom_columns,
                                num_columns=node.data.num_columns,
                                class_column=node.data.class_column,
                                class_data=node.data.class_data)
                node.children.append(
                    RFNode(data=data_obj,
                           split_fea=None,
                           val=None,
                           num_flag=-1,
                           split_path=split_path
                           + bytes(selected_fea, encoding='utf-8')
                           + bytes(' not exist', encoding='utf-8'),
                           belong_fea=mode,
                           leaf_flag=1,
                           purity_flag=0))
            else:
                data_obj = Data(path=None,
                                dataset=None,
                                df=split_df,
                                fea_column=node.data.fea_column,
                                nom_columns=node.data.nom_columns,
                                num_columns=node.data.num_columns,
                                class_column=node.data.class_column,
                                class_data=split_df[node.data.class_column])
                node.children.append(
                    self.train_dt(train_data=data_obj,
                                  split_path=split_path
                                  + bytes(selected_fea, encoding='utf-8')
                                  + bytes(' ', encoding='utf-8'),
                                  val=val,
                                  num_flag=-1))
    return node