Example #1
def main():
    # Prepare dataset(imgs, labels)
    preprocess = Preprocessor('../dataset_test', batch_size=2, labels=['raccoon', 'test'])
    num_classes = len(preprocess.labels)

    imgs, labels = next(preprocess())

    print(imgs.shape)
    print(labels[0].shape)
    print(labels[1].shape)
    print(labels[2].shape)

    # Training
    yolov3 = YoloV3(input_shape=(416, 416, 3), num_classes=num_classes, training=True)
    outputs = yolov3(imgs)

    loss_func1 = YoloLoss(anchors_wh_mask[0], num_classes)
    loss_func2 = YoloLoss(anchors_wh_mask[1], num_classes)
    loss_func3 = YoloLoss(anchors_wh_mask[2], num_classes)

    loss1, loss_breakdown1 = loss_func1(labels[0], outputs[0])
    loss2, loss_breakdown2 = loss_func2(labels[1], outputs[1])
    loss3, loss_breakdown3 = loss_func3(labels[2], outputs[2])

    print(loss1)
    print(loss2)
    print(loss3)
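The three per-scale losses above would normally be summed into one scalar and used for a gradient step. Below is a minimal sketch of that step, assuming TensorFlow 2 eager execution; the Adam optimizer and its learning rate are illustrative and not taken from the example.

import tensorflow as tf

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)     # illustrative choice

with tf.GradientTape() as tape:
    outputs = yolov3(imgs)                                   # recompute the forward pass under the tape
    scale_losses = [loss_func1(labels[0], outputs[0])[0],    # keep only the scalar loss,
                    loss_func2(labels[1], outputs[1])[0],    # drop the breakdown
                    loss_func3(labels[2], outputs[2])[0]]
    total_loss = tf.add_n(scale_losses)

grads = tape.gradient(total_loss, yolov3.trainable_variables)
optimizer.apply_gradients(zip(grads, yolov3.trainable_variables))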
Example #2
 def setup_preprocessing(self, infiles_pattern, params=None, param_id=None):
     ''' Setup the preprocessing function for the workflow '''
     
     # Get params if id given 
     if params is None and param_id is None:
         raise Exception("No parameters and no parameter id to lookup.")
     if param_id:
         params = self.db.get_binary('params', 'filtering_parameterID', param_id, table='filtering_parameters')
         assert params, "No data returned from database for param_id: %s" % param_id
         self.c.filterparam_id = param_id
     else:
         # Insert parameters dictionary into filter_parameters table
         self.c.filterparam_id = self.db.insert_binary(params, col='params', table='filtering_parameters')
         
     # Define Preprocessing Class and set inputs
     self.Preprocessor = Preprocessor(self.c)
     self.Preprocessor.db = self.db # Pass database reference to Preprocessor 
     
     self.Preprocessor.set_input_files(data_files=infiles_pattern, data_inpath=self.c.data_inpath)
      
     self.Preprocessor.filter_functions = [
             self.Preprocessor.make_propN_filter(params['filtering']['propN']),
             self.Preprocessor.make_phred_filter(params['filtering']['phred']),
             self.Preprocessor.make_cutsite_filter(max_edit_dist=params['filtering']['cutsite_edit_dist']),
             self.Preprocessor.make_overhang_filter('TCGAGG', 'GG', params['filtering']['overhang_edit_dist'])
             ]
     
     # Save addition to config file
     path = joinp(self.c.db_path, '.' + self.c.root_name + '-config.pkl')
     if os.path.exists(path):
         os.remove(path)
     pkl.dump(self.c, open(path, 'w'))
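For reference, `params` is expected to carry a 'filtering' sub-dictionary with the four keys the method indexes. A hedged usage sketch follows, with illustrative values (Example #5 shows the project's own settings) and `workflow` standing for an already-loaded Workflow instance:

p = {'filtering': {'propN': 0.1,
                   'phred': 25,
                   'cutsite_edit_dist': 2,
                   'overhang_edit_dist': 1}}   # overhang value is a placeholder

workflow.setup_preprocessing('lane*bgzf', params=p)   # file pattern as used elsewhere in these examples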
Example #3
    def setup_clustering(self, mode, infiles_pattern, infiles_path=None, default_params=None, subgroups=None):
        ''' Setup files for the Clustering function of the workflow. 
        
        Does the necessary splitting and trimming of files if specified in mode.
        '''
        
        # Input Checks
        if not hasattr(self, 'c'):
            raise Exception('Must first load a config file')
        if not hasattr(self, 'Preprocessor'):
            self.Preprocessor = Preprocessor(self.c)
            self.Preprocessor.db = self.db

        if infiles_path is None:
            infiles_path = self.c.tag_processed_outpath

        # Set files to process for clustering         
        self.Preprocessor.set_input_files(data_files=infiles_pattern, data_inpath=infiles_path)
        
        if mode == 'split_by_tags':
            (outfiles, outpath) = self.Preprocessor.split_by_tags()
            self.c.current_tag_split_outpath = outpath
            # Create index for files clustered
#             makeSQLindex(outfiles, outpath)
            
            files2cluster, path = self.Preprocessor.trim_reads(mode='separate', 
                                outpath=self.c.tag_splitby_sample_outpath, n=1)
        elif mode == 'split_by_subgroups':
            if subgroups is None:
                raise Exception("No subgroups specified")
            (outfiles, outpath) = self.Preprocessor.split_by_subgroups(subgroups)
            self.c.current_tag_split_outpath = outpath
            # Create index for files clustered
#             makeSQLindex(outfiles, outpath)
            
            files2cluster, path = self.Preprocessor.trim_reads(mode='separate',
                             outpath=self.c.tag_splitby_subgroup_outpath,  n=1)
        elif mode == 'no_split_grouped':
            # Create index for files clustered
#             makeSQLindex(filepattern=infiles_pattern, data_inpath=self.c.tag_processed_outpath)
            files2cluster, path = self.Preprocessor.trim_reads(mode='grouped', n=1)
            self.c.current_tag_split_outpath = path 
        elif mode == 'no_split_separate':
            # Create index for files clustered
#             makeSQLindex(filepattern=infiles_pattern, data_inpath=self.c.tag_processed_outpath)
            files2cluster, path = self.Preprocessor.trim_reads(mode='separate', n=1)
            self.c.current_tag_split_outpath = path 
        else:
            raise Exception('No valid mode specified.')

        # Setup Clusterer
        self.Clusterer = ClusterClass(infiles=files2cluster, inpath=path, 
                                      config=self.c, db=self.db, defaults=default_params)
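A hedged sketch of driving this from a loaded workflow; the mode, the file pattern (the '-clean' suffix is taken from the project's tag_processed_files_postfix), and the empty run-parameters dict are placeholders, since the real CD-HIT parameter structure is not shown here:

workflow.setup_clustering(mode='no_split_separate',    # one of the four modes handled above
                          infiles_pattern='*-clean*')
names, outpath, counters = workflow.run_clustering(run_parameters={})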
Example #4
def main():
    data_dir = './dataset_test'
    labels = ['raccoon']
    num_classes = len(labels)

    ckpt_dir = './checkpoints'

    lr_rate = 0.0001

    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir)

    preprocessor = Preprocessor(data_dir=data_dir,
                                output_shape=(416, 416),
                                labels=labels,
                                batch_size=BATCH_SIZE)

    model = YoloV3(input_shape=(416, 416, 3),
                   num_classes=num_classes,
                   training=True)
    loss_objects = [
        YoloLoss(valid_anchors_wh, num_classes)
        for valid_anchors_wh in anchors_wh_mask
    ]

    # 'lr' is a deprecated alias in recent TF/Keras; use 'learning_rate'
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_rate)

    lowest_loss = 10000000
    for epoch in range(EPOCH):
        print('{} epoch start! : {}'.format(
            epoch,
            datetime.datetime.now().strftime("%Y.%m.%d %H:%M:%S")))

        epoch_loss = train_one_epoch(model, loss_objects, preprocessor(),
                                     optimizer)

        if lowest_loss > epoch_loss:
            lowest_loss = epoch_loss

            save_path = ckpt_dir + '/cp-{:04d}-{:.4f}.ckpt'.format(
                epoch, lowest_loss)
            model.save_weights(save_path)
            print('Save CKPT _ [loss : {:.4f}, save_path : {}]\n'.format(
                lowest_loss, save_path))
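To resume from the best checkpoint later, the weights can be restored into a freshly built model. A minimal sketch, assuming the standard TensorFlow checkpoint bookkeeping that save_weights maintains in ckpt_dir:

import tensorflow as tf

restored = YoloV3(input_shape=(416, 416, 3),
                  num_classes=num_classes,
                  training=True)
latest = tf.train.latest_checkpoint(ckpt_dir)   # most recent 'cp-{epoch}-{loss}.ckpt' written above
restored.load_weights(latest)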
Example #5
if testing:
    L8_barcode_files = glob.glob(joinp(c.barcode_inpath, '*[8].txt'))
    datafiles = glob.glob(joinp(c.data_inpath, testfile))
    db.add_barcodes_datafiles(L8_barcode_files, datafiles, datafile_type='raw_mixed')
else:
#     os.chdir(c.barcode_inpath)
    L6_barcode_files = glob.glob(joinp(c.barcode_inpath, '*[6].txt')) 
    L8_barcode_files = glob.glob(joinp(c.barcode_inpath, '*[8].txt')) 
    L6_datafiles = glob.glob(joinp(c.data_inpath, 'lane6*bgzf'))
    L8_datafiles = glob.glob(joinp(c.data_inpath, 'lane8*bgzf'))

    # Associate subsets of the data files list to their respective barcode files.   
    db.add_barcodes_datafiles(L6_barcode_files, L6_datafiles, datafile_type='raw_mixed')
    db.add_barcodes_datafiles(L8_barcode_files, L8_datafiles, datafile_type='raw_mixed')

# Define Preprocessing Class and set inputs
Preprocess = Preprocessor(c)

if testing:
    Preprocess.set_input_files(data_inpath=c.data_inpath, file_pattern=testfile)
else:
    Preprocess.set_input_files(data_inpath=c.data_inpath, file_pattern='lane*bgzf')
 
Preprocess.db = db # Pass database reference to Preprocessor Object


#===============================================================================
# Setup Filtering Parameters
#===============================================================================
p = {'filtering' : {'propN': 0.1,
                    'phred': 25,
                    'cutsite_edit_dist' : 2,
Example #6
import pandas as pd
import numpy as np
import sys
import joblib  # sklearn.externals.joblib was deprecated and later removed from scikit-learn

from utils.preprocess import feature_expand_path
from utils.preprocess import Preprocessor

if __name__ == "__main__":

    preprocessor = Preprocessor()

    # Expand features
    if len(sys.argv) == 3:
        feature_expand_path(sys.argv[1], sys.argv[2])
    else:
        print('Please pass the path of the file to predict and the path for the feature-expanded output file')
        sys.exit(1)

    test = pd.read_csv(sys.argv[2])

    test_for_pre = test[test['pay_price'] > 0]

    classify_test = test_for_pre.iloc[:, 1:]
    classify_test = preprocessor.time_spliter(classify_test)

    model = joblib.load('model_save/xgb_clf.model')

    model_reg = joblib.load('model_save/xgb_reg.model')

    # Prepare the test set
    classify_test = test_for_pre.iloc[:, 1:]
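The snippet is cut off here. A hedged guess at the step that presumably follows is to run the two loaded XGBoost models over the prepared features; the variable names below are hypothetical, and column alignment with the training data is assumed:

will_pay = model.predict(classify_test)        # classifier: whether the user will pay
pay_amount = model_reg.predict(classify_test)  # regressor: predicted amount
print(will_pay[:5], pay_amount[:5])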
Example #7
class Workflow(object):
    ''' Container for all preprocessing, filtering, and exploratory analysis of
    a particular dataset.
    '''
    
    def __init__(self):
        self.c = ConfigClass()
    
    def create_new(self, name=None, db_name=None, testing=False):
        ''' Setup directory structure and initialise config file and database 
        for the given dataset name.'''
        
        if (name is None) or (type(name) is not str):
            raise Exception('Must specify a valid name for the dataset.')
        if db_name is None:
            db_name = name + '.db'
        
        # Setup Configuration
        prefix =  get_data_prefix()
        
        # Default path locations
        self.c.testing = testing
        self.c.root_name = name
        self.c.db_name = db_name
        if testing:
            self.c.data_inpath =  joinp(prefix,name, 'testset')
        else:
            self.c.data_inpath =  joinp(prefix, name, 'raw-data') 
        self.c.barcode_inpath = joinp(prefix, name , 'barcodes')
        self.c.filtered_outpath = joinp(prefix, name , 'processed-data')
        self.c.tag_processed_outpath = joinp(prefix, name, 'processed-data')
        self.c.tag_splitby_sample_outpath = joinp(prefix, name, 'processed-data', 'per-sample')
        self.c.tag_splitby_subgroup_outpath = joinp(prefix, name, 'processed-data', 'per-subgroup')
        self.c.clusters_outpath = joinp(prefix, name, 'clusters')
        self.c.db_path = joinp(prefix,  name)
        self.c.cdhit_path = os.path.expanduser("~/bin/cd-hit-v4.6.1")


        # Create directories if they don't exist
        for attr in dir(self.c):
            if 'path' in attr:
                path = getattr(self.c, attr)
                if not os.path.exists(path):
                    os.makedirs(path)

        # Var to choose between different output locations after splitting data 
        self.c.current_tag_split_outpath = None
        
        # Set interim file suffixes
        self.c.filtered_files_postfix = '-pass'
        self.c.tag_processed_files_postfix = '-clean'

        # MIDtags
        self.c.cutsite = 'TGCAGG'
        self.c.max_edit_dist = 2
        
        # FILTERING
        # Whether to log reads that fail the filtering         
        self.c.log_fails = False
        
        # Create new Database 
        self.db = Popgen_db(joinp(self.c.db_path, db_name), recbyname=True, new=True)
        
        # Save config
        f = open(joinp(self.c.db_path, '.' + name + '-config.pkl'), 'w')
        pkl.dump(self.c, f)
        
    def load(self, name=None, db_name=None, recbyname=True):
        ''' Load a pre-existing directory structure, config file and database 
        with the given dataset name.'''
        
        if (name is None) or (type(name) is not str):
            raise Exception('Must specify a valid name for the dataset.')
        if db_name is None:
            db_name = name + '.db'
        
        # Load config
        prefix = get_data_prefix()
        path2config = joinp(prefix, name, '.' + name + '-config.pkl')
        path2db = joinp(prefix, name, db_name)
        
        self.db = Popgen_db(path2db, recbyname=recbyname)
        self.c = pkl.load(open(path2config))
        
        # Setup Configuration with new prefix 
        prefix =  get_data_prefix()
        
        if self.c.testing:
            self.c.data_inpath =  joinp(prefix,name, 'testset')
        else:
            self.c.data_inpath =  joinp(prefix, name, 'raw-data') 
        self.c.barcode_inpath = joinp(prefix, name , 'barcodes')
        self.c.filtered_outpath = joinp(prefix, name , 'processed-data')
        self.c.tag_processed_outpath = joinp(prefix, name, 'processed-data')
        self.c.tag_splitby_sample_outpath = joinp(prefix, name, 'processed-data', 'per-sample')
        self.c.tag_splitby_subgroup_outpath = joinp(prefix, name, 'processed-data', 'per-subgroup')
        self.c.clusters_outpath = joinp(prefix, name, 'clusters')
        self.c.db_path = joinp(prefix,  name)
        
        
    def add_datafiles(self, data_files=None, barcode_files=None):
        ''' Add datafiles and barcodes in pairs to the database.
         
        Each pair defines the samples present in the datafiles listed. 
        
        If 'data_files' or 'barcode_files' is a str, it is interpreted as a glob
        relative to data_inpath / barcode_inpath respectively.
        '''
        
        if type(data_files) is str:
            data_files = glob.glob(joinp(self.c.data_inpath, data_files))
        if type(barcode_files) is str:
            barcode_files = glob.glob(joinp(self.c.barcode_inpath, barcode_files))
            
        # Input samples-datafiles info in Database 
        self.db.add_barcodes_datafiles(barcode_files, data_files, datafile_type='raw_mixed') 
                
    def setup_preprocessing(self, infiles_pattern, params=None, param_id=None):
        ''' Setup the preprocessing function for the workflow '''
        
        # Get params if id given 
        if params is None and param_id is None:
            raise Exception("No parameters and no parameter id to lookup.")
        if param_id:
            params = self.db.get_binary('params', 'filtering_parameterID', param_id, table='filtering_parameters')
            assert params, "No data returned from database for param_id: %s" % param_id
            self.c.filterparam_id = param_id
        else:
            # Insert parameters dictionary into filter_parameters table
            self.c.filterparam_id = self.db.insert_binary(params, col='params', table='filtering_parameters')
            
        # Define Preprocessing Class and set inputs
        self.Preprocessor = Preprocessor(self.c)
        self.Preprocessor.db = self.db # Pass database reference to Preprocessor 
        
        self.Preprocessor.set_input_files(data_files=infiles_pattern, data_inpath=self.c.data_inpath)
         
        self.Preprocessor.filter_functions = [
                self.Preprocessor.make_propN_filter(params['filtering']['propN']),
                self.Preprocessor.make_phred_filter(params['filtering']['phred']),
                self.Preprocessor.make_cutsite_filter(max_edit_dist=params['filtering']['cutsite_edit_dist']),
                self.Preprocessor.make_overhang_filter('TCGAGG', 'GG', params['filtering']['overhang_edit_dist'])
                ]
        
        # Save addition to config file
        path = joinp(self.c.db_path, '.' + self.c.root_name + '-config.pkl')
        if os.path.exists(path):
            os.remove(path)
        pkl.dump(self.c, open(path, 'w'))
            
    def run_preprocessing(self):
        ''' Call the Preprocessing functions for the workflow '''

        params = self.db.get_binary('params', 'filtering_parameterID', self.c.filterparam_id, table='filtering_parameters')
            
        # Process and Correct MID tag 
        self.Preprocessor.filter_reads_pipeline()
        self.Preprocessor.process_MIDtag(max_edit_dist = params['cleaning']['max_edit_dist'])

        # Remove filtered intermediate files 
        self.Preprocessor.cleanup_files('filtered') 

    def setup_clustering(self, mode, infiles_pattern, infiles_path=None, default_params=None, subgroups=None):
        ''' Setup files for the Clustering function of the workflow. 
        
        Does the necessary splitting and trimming of files if specified in mode.
        '''
        
        # Input Checks
        if not hasattr(self, 'c'):
            raise Exception('Must first load a config file')
        if not hasattr(self, 'Preprocessor'):
            self.Preprocessor = Preprocessor(self.c)
            self.Preprocessor.db = self.db

        if infiles_path is None:
            infiles_path = self.c.tag_processed_outpath

        # Set files to process for clustering         
        self.Preprocessor.set_input_files(data_files=infiles_pattern, data_inpath=infiles_path)
        
        if mode == 'split_by_tags':
            (outfiles, outpath) = self.Preprocessor.split_by_tags()
            self.c.current_tag_split_outpath = outpath
            # Create index for files clustered
#             makeSQLindex(outfiles, outpath)
            
            files2cluster, path = self.Preprocessor.trim_reads(mode='separate', 
                                outpath=self.c.tag_splitby_sample_outpath, n=1)
        elif mode == 'split_by_subgroups':
            if subgroups is None:
                raise Exception("No subgroups specified")
            (outfiles, outpath) = self.Preprocessor.split_by_subgroups(subgroups)
            self.c.current_tag_split_outpath = outpath
            # Create index for files clustered
#             makeSQLindex(outfiles, outpath)
            
            files2cluster, path = self.Preprocessor.trim_reads(mode='separate',
                             outpath=self.c.tag_splitby_subgroup_outpath,  n=1)
        elif mode == 'no_split_grouped':
            # Create index for files clustered
#             makeSQLindex(filepattern=infiles_pattern, data_inpath=self.c.tag_processed_outpath)
            files2cluster, path = self.Preprocessor.trim_reads(mode='grouped', n=1)
            self.c.current_tag_split_outpath = path 
        elif mode == 'no_split_separate':
            # Create index for files clustered
#             makeSQLindex(filepattern=infiles_pattern, data_inpath=self.c.tag_processed_outpath)
            files2cluster, path = self.Preprocessor.trim_reads(mode='separate', n=1)
            self.c.current_tag_split_outpath = path 
        else:
            raise Exception('No valid mode specified.')

        # Setup Clusterer
        self.Clusterer = ClusterClass(infiles=files2cluster, inpath=path, 
                                      config=self.c, db=self.db, defaults=default_params)

    def run_clustering(self, run_parameters, **kwargs):
        ''' Run CD-HIT using the specified parameters; passes on any kwargs. '''
        
        outputs_list = self.Clusterer.run_batch_cdhit_clustering(run_parameters, **kwargs)
        outnames_list, out_path, counters_list = zip(*outputs_list)
        
        return outnames_list, out_path, counters_list
        
    def add_experiment_name(self, name, description):
        ''' Add experiment details and the config object to the database. '''

        # These should be unique for each experiment, else results table is overwritten
        self.c.experiment_name = name
        self.c.experiment_description = description
        self.c.exp_id = self.db.add_experiment(config=self.c, exp_type='clustering')
        
    def cleanup_files(self, file_type):
        ''' Remove all intermediate files specified '''
        self.Preprocessor.cleanup_files(file_type)
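Taken together, a typical run of this class might look like the sketch below; the dataset name, file patterns, and `p` (a filtering dict shaped as in Example #5) are placeholders rather than values from the source:

wf = Workflow()
wf.create_new(name='my_dataset', testing=True)     # or wf.load('my_dataset') for an existing run
wf.add_datafiles(data_files='lane*bgzf', barcode_files='*.txt')
wf.setup_preprocessing('lane*bgzf', params=p)
wf.run_preprocessing()
# Clustering then proceeds via setup_clustering / run_clustering, as sketched after Example #3.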
Example #8
            ("test", testPaths, config.TEST_HDF5)]

for (dType, paths, outputPath) in datasets:

    print("[INFO] building {}...".format(outputPath))
    dim_0 = config.data_shape[0]
    dim_1 = config.data_shape[1]
    dim_2 = config.data_shape[2]
    dim_3 = config.data_shape[3]
    writer = HDF5DatasetWriter(
        (len(paths), dim_0, dim_1, dim_2, dim_3),
        outputPath)  #Number of cells=1782 in 3 directions (x,y,z)

    widgets = [
        "Building Dataset: ",
        progressbar.Percentage(), " ",
        progressbar.Bar(), " ",
        progressbar.ETA()
    ]
    pbar = progressbar.ProgressBar(maxval=len(paths), widgets=widgets).start()

    for (i, path) in enumerate(paths):

        force, disp = Preprocessor.array_reshape(path,
                                                 config.data_shape,
                                                 channel_firtst=False)
        writer.add([force], [disp])
        pbar.update(i)

    pbar.finish()
    writer.close()
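A quick sanity check on the file the writer just produced is to open it and list its datasets; a minimal sketch assuming only that h5py is installed (the dataset names depend on HDF5DatasetWriter and are not assumed here):

import h5py

with h5py.File(config.TEST_HDF5, 'r') as f:
    for name, dset in f.items():
        print(name, dset)   # h5py's repr shows shape and dtype for each dataset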
Example #9
        progressbar.Percentage(), " ",
        progressbar.Bar(), " ",
        progressbar.ETA()
    ]
    pbar = progressbar.ProgressBar(maxval=len(paths), widgets=widgets).start()

    for i in range(0, len(paths), step_size):

        forces_time_steps = np.zeros((step_size, data_shape[0], data_shape[1],
                                      data_shape[2], data_shape[3]))
        disps_time_steps = np.zeros((step_size, data_shape[0], data_shape[1],
                                     data_shape[2], data_shape[3]))
        meshfiles = paths[i:i + step_size]

        for (k, meshfile) in enumerate(meshfiles):
            force, disp, force_mean, disp_mean, force_std, disp_std = Preprocessor.array_reshape(
                meshfile, data_shape, channel_first)
            forces_time_steps[k, :, :, :, :] = force
            disps_time_steps[k, :, :, :, :] = disp

            F_mean_x.append(force_mean[0])
            F_mean_y.append(force_mean[1])
            F_mean_z.append(force_mean[2])
            F_std_x.append(force_std[0])
            F_std_y.append(force_std[1])
            F_std_z.append(force_std[2])

            disp_mean_x.append(disp_mean[0])
            disp_mean_y.append(disp_mean[1])
            disp_mean_z.append(disp_mean[2])
            disp_std_x.append(disp_std[0])
            disp_std_y.append(disp_std[1])