def read_data_sets(dir, fake_data=False):
    class DataSets(object):
        pass
    data_sets = DataSets()

    if fake_data:
        data_sets.train = DataSet([], [], fake_data=True)
        data_sets.validation = DataSet([], [], fake_data=True)
        data_sets.test = DataSet([], [], fake_data=True)
        return data_sets

    TRAIN_IMAGES = "ipcai_revision_colon_mean_scattering_train_all_spectrocam.txt"
    TEST_IMAGES = "ipcai_revision_colon_mean_scattering_test_all_spectrocam.txt"

    # load the simulated spectra and add noise at the given SNR
    df_train = pd.read_csv(os.path.join(dir, TRAIN_IMAGES), header=[0, 1])
    df_test = pd.read_csv(os.path.join(dir, TEST_IMAGES), header=[0, 1])

    train_images, train_labels = preprocess(df_train, snr=10.0)
    test_images, test_labels = preprocess(df_test, snr=10.0)
    train_labels = train_labels.values
    test_labels = test_labels.values

    # split off the first samples as a (minimal) validation set
    VALIDATION_SIZE = 1
    validation_images = train_images[:VALIDATION_SIZE]
    validation_labels = train_labels[:VALIDATION_SIZE]
    train_images = train_images[VALIDATION_SIZE:]
    train_labels = train_labels[VALIDATION_SIZE:]

    data_sets.train = DataSet(train_images, train_labels)
    data_sets.validation = DataSet(validation_images, validation_labels)
    data_sets.test = DataSet(test_images, test_labels)

    return data_sets
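# Usage sketch (an addition, with a placeholder path): the attribute
# names below assume DataSet stores its constructor arguments as
# .images and .labels, which is not shown in this excerpt.
if __name__ == '__main__':
    data_sets = read_data_sets("/path/to/simulation/results")
    print("train:", len(data_sets.train.images),
          "validation:", len(data_sets.validation.images),
          "test:", len(data_sets.test.images))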
def run(self):
    # get data
    df_train = pd.read_csv(self.input()[0].path, header=[0, 1])
    df_test = pd.read_csv(self.input()[1].path, header=[0, 1])
    # for this plot we write a custom evaluation loop, as it is built a
    # little differently from the others:
    # create a new dataframe which will hold all the generated errors
    df = pd.DataFrame()
    nr_training_samples = np.arange(10, 15010, 50).astype(int)
    # not very pythonic, but explicit
    for n in nr_training_samples:
        X_test, y_test = preprocess(df_test, snr=w_standard)
        # only take n samples for training
        X_train, y_train = preprocess(df_train, nr_samples=n,
                                      snr=w_standard)
        regressor = rf
        regressor.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)
        # save the absolute errors (in %) to a dataframe
        errors = np.abs(y_pred - y_test)
        errors = errors.reshape(len(errors), 1)
        current_df = DataFrame(errors * 100, columns=["Errors"])
        current_df["Method"] = "Proposed"
        current_df["Number Samples"] = n / 10 ** 3.
        df = pd.concat([df, current_df], ignore_index=True)
        logging.info(
            "Finished training regressor with {0} samples".format(n))
    df = df.groupby("Number Samples").describe()
    # get the error description in the rows:
    df = df.unstack(-1)
    # get rid of the multiindex by dropping the "Errors" level
    df.columns = df.columns.droplevel(0)

    plt.figure()
    # plot the median error against the number of training samples
    plt.plot(df.index, df["50%"], color="green")
    # tidy up the plot
    plt.xlabel("number of training samples / 1000")
    plt.ylabel("absolute error [%]")
    plt.ylim((0, 20))
    plt.xlim((0, 15))
    plt.grid()
    # finally save the figure
    plt.savefig(self.output().path, format="pdf", dpi=500,
                bbox_inches='tight')
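# Toy sketch (an addition): on recent pandas, groupby().describe()
# already places the statistics ("mean", "50%", ...) in the columns, so
# only the "Errors" level needs dropping; the unstack(-1) above stems
# from an older pandas where describe() put the statistics in the rows.
import pandas as pd

toy = pd.DataFrame({"Errors": [1., 2., 3., 4.],
                    "Number Samples": [0.01, 0.01, 0.06, 0.06]})
stats = toy.groupby("Number Samples").describe()
stats.columns = stats.columns.droplevel(0)
print(stats["50%"])  # median error per training-set size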
def create_dataset(path_to_simulation_results):
    df = pd.read_csv(path_to_simulation_results, header=[0, 1])
    X, y = preprocess(df, snr=10.0)
    y = y.values
    return X, y
def create_lmdb(path_to_simulation_results, lmdb_name):
    df = pd.read_csv(path_to_simulation_results, header=[0, 1])
    X, y = preprocess(df, snr=10.)
    y = y.values * 1000

    # We need to prepare the database for the size. We'll set it 10 times
    # greater than what we theoretically need. There is little drawback to
    # setting this too big. If you still run into problems after raising
    # this, you might want to try saving fewer entries in a single
    # transaction.
    map_size = X.nbytes * 10

    env = lmdb.open(lmdb_name, map_size=map_size)

    with env.begin(write=True) as txn:
        # txn is a Transaction object
        for i in range(X.shape[0]):
            datum = caffe.proto.caffe_pb2.Datum()
            datum.channels = X.shape[1]
            datum.height = 1
            datum.width = 1
            datum.data = X[i].tobytes()  # or .tostring() if numpy < 1.9
            datum.label = int(y[i])
            str_id = '{:08}'.format(i)
            # the encode is only essential in Python 3
            txn.put(str_id.encode('ascii'), datum.SerializeToString())
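# Read-back sketch (an addition): iterate the LMDB written above and
# decode each Datum. The dtype passed to np.frombuffer must match the
# dtype of X at write time (float64 is assumed here; adjust if
# preprocess returns float32).
import lmdb
import numpy as np
import caffe

def read_lmdb(lmdb_name):
    env = lmdb.open(lmdb_name, readonly=True)
    with env.begin() as txn:
        for key, value in txn.cursor():
            datum = caffe.proto.caffe_pb2.Datum()
            datum.ParseFromString(value)
            # raw bytes back to a flat feature vector of length `channels`
            x = np.frombuffer(datum.data, dtype=np.float64)
            yield key.decode('ascii'), x, datum.label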
def run(self):
    # get data
    df_source = pd.read_csv(self.input()[0].path, header=[0, 1])
    df_target = pd.read_csv(self.input()[1].path, header=[0, 1])
    # first extract X_source and X_target, preprocessed at the standard
    # noise level
    X_source, y_source = preprocess(df_source, w_percent=w_standard)
    X_target, y_target = preprocess(df_target, w_percent=w_standard)
    # train a classifier to determine the probability that a sample
    # belongs to the target domain and derive importance weights from it
    weights = estimate_weights_random_forests(X_source, X_target, X_source)
    # add the weights to the dataframe
    df_source["weights"] = weights
    # finally save the dataframe with the added weights
    df_source.to_csv(self.output().path, index=False)
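# estimate_weights_random_forests is not defined in this excerpt. A
# common way to compute such covariate-shift weights (a sketch under
# that assumption, not necessarily the original implementation) is to
# train a classifier to separate source from target samples and use the
# probability ratio p(target | x) / p(source | x) as the weight:
import numpy as np
from sklearn.ensemble import RandomForestClassifier

def estimate_weights_sketch(X_source, X_target, X_eval):
    # label source samples 0 and target samples 1
    X = np.concatenate((X_source, X_target))
    y = np.concatenate((np.zeros(len(X_source)), np.ones(len(X_target))))
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X, y)
    proba = clf.predict_proba(X_eval)
    # density-ratio estimate; the epsilon guards against division by zero
    return proba[:, 1] / (proba[:, 0] + 1e-12)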
def create_hdf5(path_to_simulation_results, hdf5_name):
    df = pd.read_csv(path_to_simulation_results, header=[0, 1])
    X, y = preprocess(df, snr=10.)
    y = y.values

    with h5py.File(hdf5_name, 'w') as H:
        # note the dataset names 'data' and 'label'; consumers have to
        # refer to the datasets by these names
        H.create_dataset('data', data=X)
        H.create_dataset('label', data=y)
    with open(hdf5_name + '_list.txt', 'w') as L:
        # list all h5 files you are going to use
        L.write(hdf5_name)
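# Read-back sketch (an addition): load the arrays written by
# create_hdf5 above; the dataset names 'data' and 'label' must match.
import h5py

def read_hdf5(hdf5_name):
    with h5py.File(hdf5_name, 'r') as H:
        X = H['data'][:]   # read the full feature matrix into memory
        y = H['label'][:]
    return X, y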
def read_data_set(dataframe_filename, fake_data=False):
    if fake_data:
        data_set = DataSet([], [], fake_data=True)
        return data_set

    # note: `dir` is expected to be a module-level data directory here;
    # the Python builtin of the same name would not work with os.path.join
    df_data_set = pd.read_csv(os.path.join(dir, dataframe_filename),
                              header=[0, 1])
    data_set_images, data_set_labels = preprocess(df_data_set, snr=10.)
    data_set_labels = data_set_labels.values
    data_set = DataSet(data_set_images, data_set_labels)
    return data_set