def main():
    # Prepare dataset (imgs, labels)
    preprocess = Preprocessor('../dataset_test', batch_size=2, labels=['raccoon', 'test'])
    num_classes = len(preprocess.labels)
    imgs, labels = next(preprocess())
    print(imgs.shape)
    print(labels[0].shape)
    print(labels[1].shape)
    print(labels[2].shape)

    # Training
    yolov3 = YoloV3(input_shape=(416, 416, 3), num_classes=num_classes, training=True)
    outputs = yolov3(imgs)

    loss_func1 = YoloLoss(anchors_wh_mask[0], num_classes)
    loss_func2 = YoloLoss(anchors_wh_mask[1], num_classes)
    loss_func3 = YoloLoss(anchors_wh_mask[2], num_classes)

    loss1, loss_breakdown1 = loss_func1(labels[0], outputs[0])
    loss2, loss_breakdown2 = loss_func2(labels[1], outputs[1])
    loss3, loss_breakdown3 = loss_func3(labels[2], outputs[2])

    print(loss1)
    print(loss2)
    print(loss3)
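The snippet above assumes a module-level constant `anchors_wh_mask` holding the YOLOv3 anchor boxes grouped by output scale. As a rough sketch only (the values and the grouping order are assumptions, not taken from this repo), it could be built from the nine canonical YOLOv3 anchors normalized by the 416-pixel input size:

import numpy as np

# Hypothetical definition of anchors_wh_mask: the nine canonical YOLOv3
# anchor (width, height) pairs, normalized by the 416x416 input and grouped
# three per detection scale. The grouping order must match the order of the
# model's outputs, which is an assumption here.
anchors_wh = np.array([
    (10, 13), (16, 30), (33, 23),       # anchors for the finest grid
    (30, 61), (62, 45), (59, 119),      # anchors for the middle grid
    (116, 90), (156, 198), (373, 326),  # anchors for the coarsest grid
], dtype=np.float32) / 416.0

anchors_wh_mask = anchors_wh.reshape(3, 3, 2)  # indexed as anchors_wh_mask[0..2]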
def main():
    data_dir = './dataset_test'
    labels = ['raccoon']
    num_classes = len(labels)
    ckpt_dir = './checkpoints'
    lr_rate = 0.0001

    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir)

    preprocessor = Preprocessor(data_dir=data_dir,
                                output_shape=(416, 416),
                                labels=labels,
                                batch_size=BATCH_SIZE)
    model = YoloV3(input_shape=(416, 416, 3), num_classes=num_classes, training=True)
    loss_objects = [
        YoloLoss(valid_anchors_wh, num_classes)
        for valid_anchors_wh in anchors_wh_mask
    ]
    optimizer = tf.keras.optimizers.Adam(lr=lr_rate)

    lowest_loss = 10000000
    for epoch in range(EPOCH):
        print('{} epoch start! : {}'.format(
            epoch, datetime.datetime.now().strftime("%Y.%m.%d %H:%M:%S")))
        epoch_loss = train_one_epoch(model, loss_objects, preprocessor(), optimizer)

        if lowest_loss > epoch_loss:
            lowest_loss = epoch_loss
            save_path = ckpt_dir + '/cp-{:04d}-{:.4f}.ckpt'.format(epoch, lowest_loss)
            model.save_weights(save_path)
            print('Save CKPT _ [loss : {:.4f}, save_path : {}]\n'.format(
                lowest_loss, save_path))
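The loop above delegates to `train_one_epoch`, which is not shown in the snippet. A minimal sketch of what such a function could look like, assuming the dataset iterator yields `(imgs, labels)` batches, that the model returns one output per scale, and that each `YoloLoss` returns `(loss, loss_breakdown)` as in the earlier snippet; the repo's actual implementation may differ.

import tensorflow as tf

def train_one_epoch(model, loss_objects, dataset, optimizer):
    """Hypothetical single-epoch training loop; the signatures of model,
    loss_objects and the dataset iterator are assumptions."""
    epoch_loss = 0.0
    num_batches = 0
    for imgs, labels in dataset:
        with tf.GradientTape() as tape:
            outputs = model(imgs, training=True)
            # Sum the per-scale losses; each YoloLoss is assumed to return
            # (total_loss, breakdown) as in the earlier snippet.
            batch_loss = tf.add_n([
                loss_fn(label, output)[0]
                for loss_fn, label, output in zip(loss_objects, labels, outputs)
            ])
        grads = tape.gradient(batch_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        epoch_loss += float(batch_loss)
        num_batches += 1
    return epoch_loss / max(num_batches, 1)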
    L8_barcode_files = glob.glob(joinp(c.barcode_inpath, '*[8].txt'))
    datafiles = glob.glob(joinp(c.data_inpath, testfile))
    db.add_barcodes_datafiles(L8_barcode_files, datafiles, datafile_type='raw_mixed')
else:
    # os.chdir(c.barcode_inpath)
    L6_barcode_files = glob.glob(joinp(c.barcode_inpath, '*[6].txt'))
    L8_barcode_files = glob.glob(joinp(c.barcode_inpath, '*[8].txt'))
    L6_datafiles = glob.glob(joinp(c.data_inpath, 'lane6*bgzf'))
    L8_datafiles = glob.glob(joinp(c.data_inpath, 'lane8*bgzf'))

    # Associate subsets of the data files list to their respective barcode files.
    db.add_barcodes_datafiles(L6_barcode_files, L6_datafiles, datafile_type='raw_mixed')
    db.add_barcodes_datafiles(L8_barcode_files, L8_datafiles, datafile_type='raw_mixed')

# Define Preprocessing Class and set inputs
Preprocess = Preprocessor(c)

if testing:
    Preprocess.set_input_files(data_inpath=c.data_inpath, file_pattern=testfile)
else:
    Preprocess.set_input_files(data_inpath=c.data_inpath, file_pattern='lane*bgzf')

Preprocess.db = db  # Pass database reference to Preprocessor object

#===============================================================================
# Setup Filtering Parameters
#===============================================================================
p = {'filtering': {'propN': 0.1,
                   'phred': 25,
                   'cutsite_edit_dist': 2,
import pandas as pd
import numpy as np
import sys
from sklearn.externals import joblib
from utils.preprocess import feature_expand_path
from utils.preprocess import Preprocessor

if __name__ == "__main__":
    preprocessor = Preprocessor()

    # Expand features
    if len(sys.argv) == 3:
        feature_expand_path(sys.argv[1], sys.argv[2])
    else:
        print('Please specify the path of the file to predict and the path of the feature expansion file')
        sys.exit(1)  # the paths are required below

    test = pd.read_csv(sys.argv[2])
    test_for_pre = test[test['pay_price'] > 0]
    classify_test = test_for_pre.iloc[:, 1:]
    classify_test = preprocessor.time_spliter(classify_test)

    model = joblib.load('model_save/xgb_clf.model')
    model_reg = joblib.load('model_save/xgb_reg.model')

    # Prepare the test set
    classify_test = test_for_pre.iloc[:, 1:]
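The snippet stops just after re-selecting the test features, but the two loaded models suggest a two-stage prediction (classify first, then regress). A purely speculative continuation, assuming `classify_test` already holds the prepared feature matrix and that the regressor predicts a payment amount; the column names and post-processing are illustrative only.

    # --- Hypothetical continuation (not part of the original snippet) ---
    # Stage 1: classify which rows are expected to pay at all.
    clf_pred = model.predict(classify_test)

    # Stage 2: predict the amount with the regressor; rows the classifier
    # rejects default to zero.
    reg_pred = model_reg.predict(classify_test)
    final_pred = np.where(clf_pred == 1, reg_pred, 0.0)

    # Attach predictions back to the filtered test frame for inspection.
    test_for_pre = test_for_pre.assign(prediction=final_pred)
    print(test_for_pre[['prediction']].head())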
class Workflow(object):
    ''' Container for all preprocessing, filtering, and exploratory analysis
    with a particular dataset '''

    def __init__(self):
        self.c = ConfigClass()

    def create_new(self, name=None, db_name=None, testing=False):
        ''' Setup directory structure and initialise config file and database
        for the given dataset name. '''

        if (name is None) or (type(name) is not str):
            raise Exception('Must specify a valid name for the dataset.')

        if db_name is None:
            db_name = name + '.db'

        # Setup Configuration
        prefix = get_data_prefix()

        # Default path locations
        self.c.testing = testing
        self.c.root_name = name
        self.c.db_name = db_name

        if testing:
            self.c.data_inpath = joinp(prefix, name, 'testset')
        else:
            self.c.data_inpath = joinp(prefix, name, 'raw-data')
        self.c.barcode_inpath = joinp(prefix, name, 'barcodes')
        self.c.filtered_outpath = joinp(prefix, name, 'processed-data')
        self.c.tag_processed_outpath = joinp(prefix, name, 'processed-data')
        self.c.tag_splitby_sample_outpath = joinp(prefix, name, 'processed-data', 'per-sample')
        self.c.tag_splitby_subgroup_outpath = joinp(prefix, name, 'processed-data', 'per-subgroup')
        self.c.clusters_outpath = joinp(prefix, name, 'clusters')
        self.c.db_path = joinp(prefix, name)
        self.c.cdhit_path = os.path.expanduser("~/bin/cd-hit-v4.6.1")

        # Create directories if they don't exist
        for attr in dir(self.c):
            #numwritten = SeqIO.write(RecCycler.recgen , output_filehdl , 'fastq')
            #print '{0} records written'.format(numwritten)
            #total_numwritten += numwritten
            if 'path' in attr:
                path = getattr(self.c, attr)
                if not os.path.exists(path):
                    os.makedirs(path)

        # Var to choose between different output locations after splitting data
        self.c.current_tag_split_outpath = None

        # Set interim file suffixes
        self.c.filtered_files_postfix = '-pass'
        self.c.tag_processed_files_postfix = '-clean'

        # MIDtags
        self.c.cutsite = 'TGCAGG'
        self.c.max_edit_dist = 2

        # FILTERING
        # Whether to log reads that fail the filtering
        self.c.log_fails = False

        # Create new Database
        self.db = Popgen_db(joinp(self.c.db_path, db_name), recbyname=True, new=True)

        # Save config
        f = open(joinp(self.c.db_path, '.' + name + '-config.pkl'), 'w')
        pkl.dump(self.c, f)

    def load(self, name=None, db_name=None, recbyname=True):
        ''' Load a pre-existing directory structure, config file and database
        with the given dataset name. '''

        if (name is None) or (type(name) is not str):
            raise Exception('Must specify a valid name for the dataset.')

        if db_name is None:
            db_name = name + '.db'

        # Load config
        prefix = get_data_prefix()
        path2config = joinp(prefix, name, '.' + name + '-config.pkl')
        path2db = joinp(prefix, name, db_name)

        self.db = Popgen_db(path2db, recbyname=recbyname)
        self.c = pkl.load(open(path2config))

        # Setup Configuration with new prefix
        prefix = get_data_prefix()

        if self.c.testing:
            self.c.data_inpath = joinp(prefix, name, 'testset')
        else:
            self.c.data_inpath = joinp(prefix, name, 'raw-data')
        self.c.barcode_inpath = joinp(prefix, name, 'barcodes')
        self.c.filtered_outpath = joinp(prefix, name, 'processed-data')
        self.c.tag_processed_outpath = joinp(prefix, name, 'processed-data')
        self.c.tag_splitby_sample_outpath = joinp(prefix, name, 'processed-data', 'per-sample')
        self.c.tag_splitby_subgroup_outpath = joinp(prefix, name, 'processed-data', 'per-subgroup')
        self.c.clusters_outpath = joinp(prefix, name, 'clusters')
        self.c.db_path = joinp(prefix, name)

    def add_datafiles(self, data_files=None, barcode_files=None):
        ''' Add datafiles and barcodes in pairs to the database.

        Each pair defines the samples present in the datafiles listed.
        If 'files' or 'barcodes' is a str, it is interpreted as a glob to the
        data_path / barcode_path respectively. '''

        if type(data_files) is str:
            data_files = glob.glob(joinp(self.c.data_inpath, data_files))
        if type(barcode_files) is str:
            barcode_files = glob.glob(joinp(self.c.barcode_inpath, barcode_files))

        # Input samples-datafiles info in Database
        self.db.add_barcodes_datafiles(barcode_files, data_files, datafile_type='raw_mixed')

    def setup_preprocessing(self, infiles_pattern, params=None, param_id=None):
        ''' Setup the preprocessing function for the workflow '''

        # Get params if id given
        if params is None and param_id is None:
            raise Exception("No parameters and no parameter id to lookup.")

        if param_id:
            params = self.db.get_binary('params', 'filtering_parameterID', param_id,
                                        table='filtering_parameters')
            assert params, "No data returned from database for param_id: %s" % param_id
            self.c.filterparam_id = param_id
        else:
            # Insert parameters dictionary into filter_parameters table
            self.c.filterparam_id = self.db.insert_binary(params, col='params',
                                                          table='filtering_parameters')

        # Define Preprocessing Class and set inputs
        self.Preprocessor = Preprocessor(self.c)
        self.Preprocessor.db = self.db  # Pass database reference to Preprocessor
        self.Preprocessor.set_input_files(data_files=infiles_pattern,
                                          data_inpath=self.c.data_inpath)

        self.Preprocessor.filter_functions = [
            self.Preprocessor.make_propN_filter(params['filtering']['propN']),
            self.Preprocessor.make_phred_filter(params['filtering']['phred']),
            self.Preprocessor.make_cutsite_filter(max_edit_dist=params['filtering']['cutsite_edit_dist']),
            self.Preprocessor.make_overhang_filter('TCGAGG', 'GG',
                                                   params['filtering']['overhang_edit_dist'])
        ]

        # Save addition to config file
        path = joinp(self.c.db_path, '.' + self.c.root_name + '-config.pkl')
        if os.path.exists(path):
            os.remove(path)
        pkl.dump(self.c, open(path, 'w'))

    def run_preprocessing(self):
        ''' Call the Preprocessing functions for the workflow '''

        params = self.db.get_binary('params', 'filtering_parameterID',
                                    self.c.filterparam_id, table='filtering_parameters')

        # Process and Correct MID tag
        self.Preprocessor.filter_reads_pipeline()
        self.Preprocessor.process_MIDtag(max_edit_dist=params['cleaning']['max_edit_dist'])

        # Remove filtered intermediate files
        self.Preprocessor.cleanup_files('filtered')

    def setup_clustering(self, mode, infiles_pattern, infiles_path=None,
                         default_params=None, subgroups=None):
        ''' Setup files for the Clustering function of the workflow.

        Does the necessary splitting and trimming of files if specified in mode. '''

        # Input Checks
        if not hasattr(self, 'c'):
            raise Exception('Must first load a config file')
        if not hasattr(self, 'Preprocessor'):
            self.Preprocessor = Preprocessor(self.c)
            self.Preprocessor.db = self.db

        if infiles_path is None:
            infiles_path = self.c.tag_processed_outpath

        # Set files to process for clustering
        self.Preprocessor.set_input_files(data_files=infiles_pattern, data_inpath=infiles_path)

        if mode == 'split_by_tags':
            (outfiles, outpath) = self.Preprocessor.split_by_tags()
            self.c.current_tag_split_outpath = outpath

            # Create index for files clustered
            # makeSQLindex(outfiles, outpath)

            files2cluster, path = self.Preprocessor.trim_reads(
                mode='separate', outpath=self.c.tag_splitby_sample_outpath, n=1)

        elif mode == 'split_by_subgroups':
            if subgroups is None:
                raise Exception("No subgroups specified")

            (outfiles, outpath) = self.Preprocessor.split_by_subgroups(subgroups)
            self.c.current_tag_split_outpath = outpath

            # Create index for files clustered
            # makeSQLindex(outfiles, outpath)

            files2cluster, path = self.Preprocessor.trim_reads(
                mode='separate', outpath=self.c.tag_splitby_subgroup_outpath, n=1)

        elif mode == 'no_split_grouped':
            # Create index for files clustered
            # makeSQLindex(filepattern=infiles_pattern, data_inpath=self.c.tag_processed_outpath)

            files2cluster, path = self.Preprocessor.trim_reads(mode='grouped', n=1)
            self.c.current_tag_split_outpath = path

        elif mode == 'no_split_separate':
            # Create index for files clustered
            # makeSQLindex(filepattern=infiles_pattern, data_inpath=self.c.tag_processed_outpath)

            files2cluster, path = self.Preprocessor.trim_reads(mode='separate', n=1)
            self.c.current_tag_split_outpath = path

        else:
            raise Exception('No valid mode specified.')

        # Setup Clusterer
        self.Clusterer = ClusterClass(infiles=files2cluster, inpath=path, config=self.c,
                                      db=self.db, defaults=default_params)

    def run_clustering(self, run_parameters, **kwargs):
        ''' Run CDHIT using the specified parameters. Passes on any kwargs. '''

        outputs_list = self.Clusterer.run_batch_cdhit_clustering(run_parameters, **kwargs)
        outnames_list, out_path, counters_list = zip(*outputs_list)

        return outnames_list, out_path, counters_list

    def add_experiment_name(self, name, description):
        ''' Add Experimental details and config object in database. '''

        # These should be unique for each experiment, else results table is overwritten
        self.c.experiment_name = name
        self.c.experiment_description = description
        self.c.exp_id = self.db.add_experiment(config=self.c, exp_type='clustering')

    def cleanup_files(self, file_type):
        ''' Remove all intermediate files specified. '''
        self.Preprocessor.cleanup_files(file_type)
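For orientation, a hedged example of how the class above might be driven end to end, using only the methods defined in the snippet; the parameter values, glob patterns, and CD-HIT settings are placeholders, not the project's real configuration.

# Illustrative driver for the Workflow class above; all values are placeholders.
params = {
    'filtering': {'propN': 0.1,
                  'phred': 25,
                  'cutsite_edit_dist': 2,
                  'overhang_edit_dist': 0},
    'cleaning': {'max_edit_dist': 2},
}

wf = Workflow()
wf.create_new(name='my_dataset', testing=True)
wf.add_datafiles(data_files='lane*bgzf', barcode_files='*.txt')

wf.setup_preprocessing('lane*bgzf', params=params)
wf.run_preprocessing()

wf.setup_clustering(mode='no_split_grouped', infiles_pattern='*-clean*')
wf.add_experiment_name('test-run', 'First clustering pass')
names, out_path, counters = wf.run_clustering({'c_thresh': 0.95})  # placeholder CD-HIT parameters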
("test", testPaths, config.TEST_HDF5)] for (dType, paths, outputPath) in datasets: print("[INFO] building {}...".format(outputPath)) dim_0 = config.data_shape[0] dim_1 = config.data_shape[1] dim_2 = config.data_shape[2] dim_3 = config.data_shape[3] writer = HDF5DatasetWriter( (len(paths), dim_0, dim_1, dim_2, dim_3), outputPath) #Number of cells=1782 in 3 directions (x,y,z) widgets = [ "Building Dataset: ", progressbar.Percentage(), " ", progressbar.Bar(), " ", progressbar.ETA() ] pbar = progressbar.ProgressBar(maxval=len(paths), widgets=widgets).start() for (i, path) in enumerate(paths): force, disp = Preprocessor.array_reshape(path, config.data_shape, channel_firtst=False) writer.add([force], [disp]) pbar.update(i) pbar.finish() writer.close()
    progressbar.Percentage(), " ",
    progressbar.Bar(), " ", progressbar.ETA()
]
pbar = progressbar.ProgressBar(maxval=len(paths), widgets=widgets).start()

for i in range(0, len(paths), step_size):
    forces_time_steps = np.zeros((step_size, data_shape[0], data_shape[1],
                                  data_shape[2], data_shape[3]))
    disps_time_steps = np.zeros((step_size, data_shape[0], data_shape[1],
                                 data_shape[2], data_shape[3]))
    meshfiles = paths[i:i + step_size]

    for (k, meshfile) in enumerate(meshfiles):
        force, disp, force_mean, disp_mean, force_std, disp_std = Preprocessor.array_reshape(
            meshfile, data_shape, channel_first)
        forces_time_steps[k, :, :, :, :] = force
        disps_time_steps[k, :, :, :, :] = disp

        F_mean_x.append(force_mean[0])
        F_mean_y.append(force_mean[1])
        F_mean_z.append(force_mean[2])
        F_std_x.append(force_std[0])
        F_std_y.append(force_std[1])
        F_std_z.append(force_std[2])

        disp_mean_x.append(disp_mean[0])
        disp_mean_y.append(disp_mean[1])
        disp_mean_z.append(disp_mean[2])
        disp_std_x.append(disp_std[0])
        disp_std_y.append(disp_std[1])