Example #1
import sys

from CMS_Deep_Learning.storage.archiving import DataProcedure


def main(archive_dir, hashcode):
    print(archive_dir, hashcode)
    sys.stdout.flush()
    # Look up the DataProcedure by its hashcode and make sure its data is archived.
    dp = DataProcedure.find(archive_dir=archive_dir,
                            hashcode=hashcode,
                            verbose=1)
    if not dp.is_archived():
        dp.get_data(archive=True)
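A minimal sketch of how this entry point might be invoked from the command line; the argument handling below is an assumption and not part of the original example.

if __name__ == "__main__":
    # Hypothetical driver: the archive directory and the DataProcedure hashcode
    # are expected as positional command-line arguments.
    main(sys.argv[1], sys.argv[2])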
Example #2
def f(hashes, archive_dir, verbose=0, i=0):
    from CMS_Deep_Learning.storage.archiving import DataProcedure
    if verbose >= 1: print("Batch process %r started." % i)
    print("HASHES:", hashes)
    for h in hashes:
        u = DataProcedure.find(archive_dir=archive_dir, hashcode=h)
        u.get_data(archive=True, verbose=verbose)
        if verbose >= 1: print("From process %r." % i)
def procsFrom_label_dir_pairs(start,
                              samples_per_label,
                              stride,
                              archive_dir,
                              label_dir_pairs,
                              object_profiles,
                              observ_types,
                              single_list=False,
                              sort_columns=None,
                              sort_ascending=True,
                              data_keys=["X", 'Y'],
                              verbose=1):
    '''Gets a list of DataProcedures that use preprocessFromPandas_label_dir_pairs to read from the unjoined pandas files
        #Arguments
            start -- Where to start reading in the filesystem (if we treat it as one long list for each directory)
            samples_per_label -- How many samples to read from the filesystem per event type
            stride -- How many samples per label to grab in each DataProcedure. This should be big enough to avoid
                    excessive reads, but small enough that stride*labels total samples can fit reasonably
                    in memory.
            archive_dir -- the archive directory to store the preprocessed data.
            label_dir_pairs -- A list of tuples like (label_name, pandas_data_directory) telling us what to call the data
                                and where to find it.
            object_profiles -- A list of ObjectProfiles, used to determine what preprocessing steps need to be taken
            observ_types -- A list of the observable quantities in our pandas tables, e.g. ['E/c', 'Px', ...]
            verbose -- Whether or not to print progress information
    '''
    procs = []
    end = start + samples_per_label
    if (verbose >= 1):
        print("Generating DataProcedure in range(%r,%r):" % (start, end))
    for proc_start in range(start, end, stride):
        proc_num = min(stride, end - proc_start)
        dp = DataProcedure(archive_dir,
                           True,
                           preprocessFromPandas_label_dir_pairs, [
                               label_dir_pairs, proc_start, proc_num,
                               object_profiles, observ_types
                           ], {
                               'single_list': single_list,
                               'sort_columns': sort_columns,
                               'sort_ascending': sort_ascending,
                               'verbose': verbose
                           },
                           data_keys=data_keys)
        procs.append(dp)
        #print(proc_start, samples_per_label, stride)
        if (verbose >= 1):
            num_labels = len(label_dir_pairs)
            print("   From %r labels in range(%r,%r) for %rx%r = %r Samples" %
                  (num_labels, proc_start, proc_start + proc_num, num_labels,
                   proc_num, num_labels * proc_num))
    #print([p.hash() for p in procs])
    return procs
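A usage sketch with made-up paths and labels; object_profiles is assumed to be a list of ObjectProfile instances built elsewhere, as described in the docstring above.

# Hypothetical usage: read 50,000 samples per label in chunks of 5,000 samples
# per DataProcedure. Paths, labels, and observ_types are illustrative only.
observ_types = ['E/c', 'Px', 'Py', 'Pz']
label_dir_pairs = [("qcd", "/data/pandas/qcd/"),
                   ("ttbar", "/data/pandas/ttbar/")]
procs = procsFrom_label_dir_pairs(0, 50000, 5000, "/data/archive/",
                                  label_dir_pairs, object_profiles,
                                  observ_types)
for dp in procs:
    if not dp.is_archived():
        dp.get_data(archive=True)  # preprocess and cache this chunk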
Example #4
import os
import socket


# Note: DataProcedure, KerasTrial, and the custom Lorentz/Slice layers are
# assumed to be imported elsewhere in this module.
def batchExecuteAndTestTrials(tups, time_str="24:00:00",
                              repo="/scratch/snx3000/dweiteka/CMS_Deep_Learning/",
                              trial_out_dir='/scratch/snx3000/dweiteka/trial_out/',
                              use_mpi=False, verbose=1):
    '''Takes in a list of tuples 'tups' of the form (trial (a KerasTrial), test (a DataProcedure),
        num_test (an Integer), deps (a list)), and executes/tests each trial, either in order or as
        separate sbatch jobs when running on the CSCS daint system.
    '''
    isdaint = "daint" in socket.gethostname()
    scripts_dir = repo + "scripts/" 
    for trial, test, num_test, deps in tups:
        archive_dir = trial.archive_dir
        hashcode = trial.hash()

        test_hashcode = None
        if test is not None:
            test.write()
            test_hashcode = test.hash()
        if(isdaint):
            if(not os.path.exists(trial_out_dir)):
                os.makedirs(trial_out_dir)
            dep_clause = "" if len(deps)==0 else "--dependency=afterok:" + ":".join(deps)
            ofile = trial_out_dir + hashcode[:5] + ".%j"
            sbatch = 'sbatch -C gpu -t %s -o %s -e %s %s ' % (time_str,ofile,ofile,dep_clause)
            sbatch += '%srunTrial.sh %s %s %s %s %s %s\n' % (scripts_dir,repo,archive_dir,hashcode, test_hashcode, num_test, use_mpi)
            if(verbose >=1): print(sbatch)
            out = os.popen(sbatch).read()
            if(verbose >=1): print("THIS IS THE OUTPUT:",out)
        else:
            if use_mpi:
                # Use the MPI-aware trial class only when MPI execution is requested.
                from CMS_Deep_Learning.storage.MPIArchiving import MPI_KerasTrial
                trial = MPI_KerasTrial.find(archive_dir, hashcode)
            else:
                trial = KerasTrial.find(archive_dir, hashcode)
            if(verbose >=1): print("EXECUTE %r" % trial.hash())
            trial.execute()#custom_objects={"Lorentz":Lorentz,"Slice": Slice})

            if test_hashcode is not None:
                if verbose >= 1: print("TEST %r" % trial.hash())
                test = DataProcedure.find(archive_dir, test_hashcode)
                trial.test(test_proc=test,
                           test_samples=num_test,
                           custom_objects={"Lorentz": Lorentz, "Slice": Slice})
def getGensDefaultFormat(archive_dir,
                         splits,
                         length,
                         object_profiles,
                         label_dir_pairs,
                         observ_types,
                         single_list=False,
                         sort_columns=None,
                         sort_ascending=True,
                         batch_size=100,
                         megabytes=500,
                         verbose=1):
    '''Creates a set of DataProcedures that return generators and their corresponding lengths. Each generator consists of a list of DataProcedures that preprocess data
        from a set of label_dir_pairs in a given range. The size of the archived file for each DP is capped by 'megabytes' so that no single one grows too large. Each generator
        reads a number of samples per label set by 'splits' and 'length', and feeds data in batches of 'batch_size' into training.
        #Arguments:
            archive_dir -- The archive directory that the DataProcedures of each generator will archive their information in.
            splits -- a list of either integers or floats between 0 and 1 (or both). Each entry in 'splits' designates a generator. If an Integer is given then a generator
                      is built with the number of samples per label designated by that integer (static). If a float is given then the number of samples per label is computed as a 
                      fraction of the argument 'length' minus the sum of the integer entries (ratio). Float (ratio) entries in splits must add up to 1.0.
            length -- The total number of samples per label to be split among the float (ratio) values of 'splits' plus the Integer (static) values. In other words, the total number
                        of samples per label to be used by all of the generators built by this function. Ignored if all splits are Integers (static). (A worked example follows this function.)
            object_profiles -- A list of ObjectProfiles (see CMS_Deep_Learning.utils.preprocessing.ObjectProfile). Order matters: these determine how the final inputs will be
                            preprocessed and how they are ordered among themselves.
            label_dir_pairs -- A list of tuples where the first entry is a label and the second is the name of a directory containing pandas files (either msg or h5 format) corresponding 
                            to that label.
            observ_types -- A list of the types of observables to be used in the final preprocessed files.
            batch_size -- How many samples to feed into training at a time. 
            megabytes -- Determines how large (in MB) a DataProcedure archive should be. A smaller number means less data in memory at a time as each generator is used, but shorter, more frequent
                        disk reads.
            verbose -- Determines whether or not information is printed out as the generators are formed and as they are used. (TODO: this might need some work; the specifics
                        of how verbosity is passed along to the DPs and their dependent functions might not be implemented correctly at the moment, leading to printouts even if verbose=0.)
        #Returns (all_dps, all_datasets)
            all_dps -- A list of DataProcedures, this can be passed to CMS_Deep_Learning.utils.batch.batchAssertArchived to make sure that all the DPs are archived before proceeding to training
            all_datasets -- A list like [(generator1, num_samples1), (generator2, num_samples2), ... , max_q_size], where max_q_size designates how large the Keras generator queue should be so that
                            each generator starts reading the next DP in the archive as it starts outputting data from the previous one.
    '''
    assert isinstance(object_profiles, list)
    assert isinstance(label_dir_pairs, list)
    assert isinstance(observ_types, list)
    stride = strideFromTargetSize(object_profiles,
                                  label_dir_pairs,
                                  observ_types,
                                  megabytes=megabytes)
    SNs = start_num_fromSplits(splits, length)
    all_dps = []
    all_datasets = []
    for s in SNs:
        dps = procsFrom_label_dir_pairs(s[0],
                                        s[1],
                                        stride,
                                        archive_dir,
                                        label_dir_pairs,
                                        object_profiles,
                                        observ_types,
                                        single_list=single_list,
                                        sort_columns=sort_columns,
                                        sort_ascending=sort_ascending,
                                        verbose=verbose)
        gen_DP = DataProcedure(archive_dir,
                               False,
                               genFromDPs,
                               dps,
                               batch_size,
                               threading=False,
                               verbose=verbose)
        num_samples = len(label_dir_pairs) * s[1]
        all_datasets += [(gen_DP, num_samples)]
        all_dps += dps
    #Calculate a good max_q_size and add it to the all_datasets list
    all_datasets += [max(np.ceil(stride / float(batch_size)), 1)]
    return (all_dps, all_datasets)
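A worked example of the 'splits' arithmetic and a usage sketch with illustrative values: splits=[5000, 0.75, 0.25] with length=25000 gives one static generator of 5,000 samples per label, while the remaining 20,000 samples per label are divided 15,000 / 5,000 between the two ratio entries. object_profiles, label_dir_pairs, and observ_types are assumed to be defined as in the earlier examples.

# Hypothetical usage: one static split of 5,000 samples per label plus a
# 75%/25% ratio split of the remaining 20,000 samples per label.
all_dps, all_datasets = getGensDefaultFormat("/data/archive/",
                                             [5000, 0.75, 0.25],
                                             25000,
                                             object_profiles,
                                             label_dir_pairs,
                                             observ_types,
                                             batch_size=100,
                                             megabytes=500)
# all_datasets holds one (generator, num_samples) pair per split -- where
# num_samples is samples-per-label times the number of labels -- followed by
# the suggested Keras max_q_size.
(static_gen, n_static), (train_gen, n_train), (val_gen, n_val), max_q_size = all_datasets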