Example #1
def invokeGen(infile):
    # dispatch on the input file extension
    if infile[-6:] == '.djcdc':
        # DataCollection descriptor: let the collection build the generator
        dc = DataCollection(infile)
        td = dc.dataclass()
        tdclass = dc.dataclass
        dc.setBatchSize(1)
        gen = dc.invokeGenerator()
    elif infile[-6:] == '.djctd':
        # already-converted TrainData file: buffer it directly
        td = TrainData_NanoML()
        tdclass = TrainData_NanoML
        td.readFromFile(infile)
        gen = TrainDataGenerator()
        gen.setBatchSize(1)
        gen.setBuffer(td)
    elif infile[-5:] == '.root':
        # raw root input: convert to .djctd first, then read the converted file back
        print('reading from root file')
        td = TrainData_NanoML()
        tdclass = TrainData_NanoML
        td.readFromSourceFile(infile, {}, True)
        td.writeToFile(infile + '.djctd')
        td.readFromFile(infile + '.djctd')
        gen = TrainDataGenerator()
        gen.setBatchSize(1)
        gen.setBuffer(td)
    else:
        raise ValueError('invokeGen: unknown input file format: ' + infile)

    gen.setSkipTooLargeBatches(False)
    nevents = gen.getNBatches()
    gen.cast_to = tdclass
    return gen.feedTrainData, nevents, td
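A minimal usage sketch for the helper above, assuming feedTrainData is a generator method that yields one batch per step (the input file name is a placeholder):

feed, nevents, td = invokeGen('events.djctd')  # placeholder input file
batches = feed()  # assumption: feedTrainData yields one single-event batch per step
for _ in range(nevents):
    batch = next(batches)
    # inspect or plot the single-event batch here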
Example #2
    def __init__(
            self,
            samplefile,
            function_to_apply=None,  # expected signature: function(counter, [model_input], [predict_output], [truth])
            after_n_batches=50,
            batchsize=10,
            on_epoch_end=False,
            use_event=0,
            decay_function=None,
            offset=0):
        super(PredictCallback, self).__init__()
        self.samplefile = samplefile
        self.function_to_apply = function_to_apply
        self.counter = 0
        self.call_counter = offset
        self.decay_function = decay_function

        self.after_n_batches = after_n_batches
        self.run_on_epoch_end = on_epoch_end

        if self.run_on_epoch_end and self.after_n_batches >= 0:
            print(
                'PredictCallback: can only be used on epoch end OR after n batches, falling back to epoch end'
            )
            self.after_n_batches = 0

        td = TrainData()
        td.readFromFile(samplefile)
        if use_event >= 0:
            td.skim(use_event)

        self.batchsize = 1  # note: the generator below is configured with the batchsize argument; this attribute stays fixed at 1
        self.td = td
        self.gen = TrainDataGenerator()
        self.gen.setBatchSize(batchsize)
        self.gen.setSkipTooLargeBatches(False)
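A minimal instantiation sketch under the constructor signature above; the sample file and the plotting function are placeholders, and hooking the callback into model.fit assumes the standard Keras callback mechanism:

def make_event_plots(counter, model_input, predict_output, truth):
    # placeholder: inspect or plot one predicted event
    pass

pred_cb = PredictCallback(
    samplefile='validation_sample.djctd',  # placeholder path
    function_to_apply=make_event_plots,
    after_n_batches=200,
    use_event=0)

# model.fit(..., callbacks=[pred_cb])  # standard Keras callback list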
Example #3
from DeepJetCore.DataCollection import DataCollection
from DeepJetCore.dataPipeline import TrainDataGenerator

infile=args.infile
nbatch=int(args.nelementsperfile)
randomise = args.randomise

dc = DataCollection(infile)
dc2 = DataCollection(infile)
samples = dc.samples

datadir = dc.dataDir
if len(datadir) < 1:
    datadir = '.'
insamples = [datadir + '/' + s for s in samples]

gen = TrainDataGenerator()
gen.setBatchSize(nbatch)
gen.setSkipTooLargeBatches(False)
gen.setFileList(insamples)

if randomise:
    gen.shuffleFileList()

nbatches = gen.getNBatches()

newsamples=[]
for i in range(nbatches):
    newname = samples[0][:-6] + "_n_" + str(i) + ".djctd"
    newsamples.append(newname)
    ntd = gen.getBatch()
    print(newname)
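The loop above only fetches and names each batch; the write-out step is not part of the snippet. A hedged completion, assuming getBatch() returns a TrainData object (writeToFile is used on TrainData in Example #1):

for i in range(nbatches):
    newname = samples[0][:-6] + "_n_" + str(i) + ".djctd"
    ntd = gen.getBatch()
    ntd.writeToFile(datadir + '/' + newname)  # assumption: getBatch() yields a writable TrainData
    newsamples.append(newname)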
Example #4
    def invokeGenerator(self):
        generator = TrainDataGenerator()
        generator.setBatchSize(self.__batchsize)
        generator.setSquaredElementsLimit(self.batch_uses_sum_of_squares)
        generator.setFileList([self.dataDir + "/" + s for s in self.samples])
        return generator
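A brief usage sketch, assuming this method belongs to DataCollection as used in Example #1 (the .djcdc file name is a placeholder):

dc = DataCollection('dataCollection.djcdc')  # placeholder descriptor file
dc.setBatchSize(100)
gen = dc.invokeGenerator()
gen.setSkipTooLargeBatches(False)
print(gen.getNBatches(), 'batches available')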
Example #5
    def predict(self, model=None, model_path=None, output_to_file=True):
        if model_path is None:
            model_path = self.model_path

        if model is None:
            if not os.path.exists(model_path):
                raise FileNotFoundError('Model file not found')

        assert model_path is not None or model is not None

        outputs = []
        if output_to_file:
            os.system('mkdir -p ' + self.predict_dir)

        if model is None:
            model = load_model(model_path)

        all_data = []
        for inputfile in self.input_data_files:

            use_inputdir = self.inputdir
            if inputfile[0] == "/":
                use_inputdir = ""
            outfilename = "pred_" + os.path.basename(inputfile)

            print('predicting ', use_inputdir + '/' + inputfile)

            td = self.dc.dataclass()

            # also allows for inheriting classes now, like with tracks or special PU
            if not isinstance(td, TrainData_NanoML) and type(td) is not TrainData_TrackML:
                raise RuntimeError(
                    "TODO: make sure this works for other traindata formats")

            if inputfile[-5:] == 'djctd':
                if self.unbuffered:
                    td.readFromFile(use_inputdir + "/" + inputfile)
                else:
                    td.readFromFileBuffered(use_inputdir + "/" + inputfile)
            else:
                print('converting ' + inputfile)
                td.readFromSourceFile(use_inputdir + "/" + inputfile,
                                      self.dc.weighterobjects,
                                      istraining=False)

            gen = TrainDataGenerator()
            # the batch size must be one otherwise we need to play tricks with the row splits later on
            gen.setBatchSize(1)
            gen.setSquaredElementsLimit(False)
            gen.setSkipTooLargeBatches(False)
            gen.setBuffer(td)

            num_steps = gen.getNBatches()
            generator = gen.feedNumpyData()

            dumping_data = []

            thistime = time.time()
            for _ in range(num_steps):
                data_in = next(generator)
                predictions_dict = model(data_in[0])
                for k in predictions_dict.keys():
                    predictions_dict[k] = predictions_dict[k].numpy()
                features_dict = td.createFeatureDict(data_in[0])
                truth_dict = td.createTruthDict(data_in[0])

                dumping_data.append(
                    [features_dict, truth_dict, predictions_dict])

            totaltime = time.time() - thistime
            print('took approx', totaltime / num_steps,
                  's per endcap (also includes dict building)')

            td.clear()
            gen.clear()
            outfilename = os.path.splitext(outfilename)[0] + '.bin.gz'
            if output_to_file:
                td.writeOutPredictionDict(dumping_data,
                                          self.predict_dir + "/" + outfilename)
            outputs.append(outfilename)
            if not output_to_file:
                all_data.append(dumping_data)

        if output_to_file:
            with open(self.predict_dir + "/outfiles.txt", "w") as f:
                for name in outputs:
                    f.write(name + '\n')

        if not output_to_file:
            return all_data
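A short sketch of picking up the prediction output written above. Only the outfiles.txt layout (one file name per line) comes from the code; how each .bin.gz file is read back depends on the TrainData implementation and is left as a placeholder:

import os

predict_dir = 'predictions'  # placeholder, corresponds to self.predict_dir above
with open(os.path.join(predict_dir, 'outfiles.txt')) as f:
    prediction_files = [line.strip() for line in f if line.strip()]

for fname in prediction_files:
    # reading the .bin.gz content back depends on the TrainData implementation
    print('prediction file:', os.path.join(predict_dir, fname))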
Example #6
    def __init__(self,
                 samplefile,
                 accumulate_after_batches=5,
                 plot_after_batches=50,
                 batchsize=10,
                 beta_threshold=0.6,
                 distance_threshold=0.6,
                 iou_threshold=0.1,
                 n_windows_for_plots=5,
                 n_windows_for_scalar_metrics=5000000,
                 outputdir=None,
                 publish = None,
                 n_ccoords=None,
                 n_average_over_samples=5,
                 ):
        """

        :param samplefile: the file to pick validation data from
        :param accumulate_after_batches: run performance metrics after n batches (a good value is 5)
        :param plot_after_batches: update and upload plots after n batches
        :param batchsize: batch size
        :param beta_threshold: beta threshold for running prediction on obc
        :param distance_threshold: distance threshold for running prediction on obc
        :param iou_threshold: IoU threshold used for matching, both for OBC and for ticl
        :param n_windows_for_plots: how many windows to average over for the running performance plots
        :param n_windows_for_scalar_metrics: the maximum number of windows for which to store data for scalar performance metrics as a function of iteration
        :param outputdir: the output directory where to store results
        :param publish: where to publish, could be ssh'able path
        :param n_ccoords: n coords for plots
        :param n_average_over_samples: average scalar metrics over samples
        """
        super(plotRunningPerformanceMetrics, self).__init__()
        self.samplefile = samplefile
        self.counter = 0
        self.call_counter = 0
        self.decay_function = None
        self.outputdir = outputdir
        self.n_ccoords = n_ccoords
        self.publish = publish

        self.accumulate_after_batches = accumulate_after_batches
        self.plot_after_batches = plot_after_batches
        self.run_on_epoch_end = False

        if self.run_on_epoch_end and self.accumulate_after_batches >= 0:
            print('PredictCallback: can only be used on epoch end OR after n batches, falling back to epoch end')
            self.accumulate_after_batches = 0

        td = TrainData()
        td.readFromFile(samplefile)
        # td_selected = td.split(self.n_events)  # check if this works in ragged out of the box
        # if use_event >= 0:
        #     if use_event < td.nElements():
        #         td.skim(use_event)
        #     else:
        #         td.skim(use_event % td.nElements())
        self.batchsize = batchsize
        self.td = td
        self.gen = TrainDataGenerator()
        self.gen.setBatchSize(self.batchsize)
        self.gen.setSkipTooLargeBatches(False)
        self.gen.setBuffer(td)

        self.n_batches = self.gen.getNBatches()


        with tf.device('/CPU:0'):
            self.ragged_constructor = RaggedConstructTensor()
        self.window_id = 0
        self.window_analysis_dicts = []
        self.n_windows_for_plots = n_windows_for_plots
        self.n_windows_for_scalar_metrics = n_windows_for_scalar_metrics
        self.beta_threshold = beta_threshold
        self.distance_threshold = distance_threshold
        self.iou_threshold = iou_threshold

        self.scalar_metrics = dict()
        self.scalar_metrics['efficiency'] = []
        self.scalar_metrics['efficiency_ticl'] = []
        self.scalar_metrics['fake_rate'] = []
        self.scalar_metrics['fake_rate_ticl'] = []
        self.scalar_metrics['var_response'] = []
        self.scalar_metrics['var_response_ticl'] = []
        self.scalar_metrics['iteration'] = []

        self.n_average_over_samples = n_average_over_samples

        self.plot_process = None
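A minimal instantiation sketch under the constructor signature above; the sample file and output directory are placeholders, and attaching the callback to training assumes the standard Keras callback mechanism:

perf_cb = plotRunningPerformanceMetrics(
    samplefile='validation_sample.djctd',  # placeholder path
    accumulate_after_batches=5,
    plot_after_batches=50,
    batchsize=10,
    outputdir='running_metrics',           # placeholder output directory
    n_ccoords=2)

# model.fit(..., callbacks=[perf_cb])  # standard Keras callback list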