Example #1
def invokeGen(infile):
    # dispatch on the input file extension
    if infile[-6:] == '.djcdc':
        # DataCollection descriptor: let the collection build the generator
        dc = DataCollection(infile)
        td = dc.dataclass()
        tdclass = dc.dataclass
        dc.setBatchSize(1)
        gen = dc.invokeGenerator()
    elif infile[-6:] == '.djctd':
        # already-converted TrainData file: buffer it directly
        td = TrainData_NanoML()
        tdclass = TrainData_NanoML
        td.readFromFile(infile)
        gen = TrainDataGenerator()
        gen.setBatchSize(1)
        gen.setBuffer(td)
    elif infile[-5:] == '.root':
        # raw root input: convert to .djctd first, then read the converted file back
        print('reading from root file')
        td = TrainData_NanoML()
        tdclass = TrainData_NanoML
        td.readFromSourceFile(infile, {}, True)
        td.writeToFile(infile + '.djctd')
        td.readFromFile(infile + '.djctd')
        gen = TrainDataGenerator()
        gen.setBatchSize(1)
        gen.setBuffer(td)
    else:
        raise ValueError('invokeGen: unknown input file format: ' + infile)

    gen.setSkipTooLargeBatches(False)
    nevents = gen.getNBatches()
    gen.cast_to = tdclass
    return gen.feedTrainData, nevents, td
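A minimal usage sketch for the helper above, assuming feedTrainData is a generator method that yields one batch per step (the input file name is a placeholder):

feed, nevents, td = invokeGen('events.djctd')  # placeholder input file
batches = feed()  # assumption: feedTrainData yields one single-event batch per step
for _ in range(nevents):
    batch = next(batches)
    # inspect or plot the single-event batch here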
Example #2
    def __init__(
            self,
            samplefile,
            function_to_apply=None,  # expected signature: function(counter, [model_input], [predict_output], [truth])
            after_n_batches=50,
            batchsize=10,
            on_epoch_end=False,
            use_event=0,
            decay_function=None,
            offset=0):
        super(PredictCallback, self).__init__()
        self.samplefile = samplefile
        self.function_to_apply = function_to_apply
        self.counter = 0
        self.call_counter = offset
        self.decay_function = decay_function

        self.after_n_batches = after_n_batches
        self.run_on_epoch_end = on_epoch_end

        if self.run_on_epoch_end and self.after_n_batches >= 0:
            print(
                'PredictCallback: can only be used on epoch end OR after n batches, falling back to epoch end'
            )
            self.after_n_batches = 0

        td = TrainData()
        td.readFromFile(samplefile)
        if use_event >= 0:
            td.skim(use_event)

        self.batchsize = 1  # note: the generator below is configured with the batchsize argument; this attribute stays fixed at 1
        self.td = td
        self.gen = TrainDataGenerator()
        self.gen.setBatchSize(batchsize)
        self.gen.setSkipTooLargeBatches(False)
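A minimal instantiation sketch under the constructor signature above; the sample file and the plotting function are placeholders, and hooking the callback into model.fit assumes the standard Keras callback mechanism:

def make_event_plots(counter, model_input, predict_output, truth):
    # placeholder: inspect or plot one predicted event
    pass

pred_cb = PredictCallback(
    samplefile='validation_sample.djctd',  # placeholder path
    function_to_apply=make_event_plots,
    after_n_batches=200,
    use_event=0)

# model.fit(..., callbacks=[pred_cb])  # standard Keras callback list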
Example #3
from DeepJetCore.DataCollection import DataCollection
from DeepJetCore.dataPipeline import TrainDataGenerator

infile=args.infile
nbatch=int(args.nelementsperfile)
randomise = args.randomise

dc = DataCollection(infile)
dc2 = DataCollection(infile)
samples = dc.samples

datadir = dc.dataDir
if len(datadir) < 1:
    datadir = '.'
insamples = [datadir + '/' + s for s in samples]

gen = TrainDataGenerator()
gen.setBatchSize(nbatch)
gen.setSkipTooLargeBatches(False)
gen.setFileList(insamples)

if randomise:
    gen.shuffleFileList()

nbatches = gen.getNBatches()

newsamples=[]
for i in range(nbatches):
    newname = samples[0][:-6] + "_n_" + str(i) + ".djctd"
    newsamples.append(newname)
    ntd = gen.getBatch()
    print(newname)
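The loop above only fetches and names each batch; the write-out step is not part of the snippet. A hedged completion, assuming getBatch() returns a TrainData object (writeToFile is used on TrainData in Example #1):

for i in range(nbatches):
    newname = samples[0][:-6] + "_n_" + str(i) + ".djctd"
    ntd = gen.getBatch()
    ntd.writeToFile(datadir + '/' + newname)  # assumption: getBatch() yields a writable TrainData
    newsamples.append(newname)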
Example #4
    def invokeGenerator(self):
        generator = TrainDataGenerator()
        generator.setBatchSize(self.__batchsize)
        generator.setSquaredElementsLimit(self.batch_uses_sum_of_squares)
        generator.setFileList([self.dataDir + "/" + s for s in self.samples])
        return generator
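A brief usage sketch, assuming this method belongs to DataCollection as used in Example #1 (the .djcdc file name is a placeholder):

dc = DataCollection('dataCollection.djcdc')  # placeholder descriptor file
dc.setBatchSize(100)
gen = dc.invokeGenerator()
gen.setSkipTooLargeBatches(False)
print(gen.getNBatches(), 'batches available')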
Example #5
    def predict(self, model=None, model_path=None, output_to_file=True):
        if model_path is None:
            model_path = self.model_path

        if model is None:
            if not os.path.exists(model_path):
                raise FileNotFoundError('Model file not found')

        assert model_path is not None or model is not None

        outputs = []
        if output_to_file:
            os.system('mkdir -p ' + self.predict_dir)

        if model is None:
            model = load_model(model_path)

        all_data = []
        for inputfile in self.input_data_files:

            use_inputdir = self.inputdir
            if inputfile[0] == "/":
                use_inputdir = ""
            outfilename = "pred_" + os.path.basename(inputfile)

            print('predicting ', use_inputdir + '/' + inputfile)

            td = self.dc.dataclass()

            # also allows for inheriting classes now, like with tracks or special PU
            if not isinstance(td, TrainData_NanoML) and type(td) is not TrainData_TrackML:
                raise RuntimeError(
                    "TODO: make sure this works for other traindata formats")

            if inputfile[-5:] == 'djctd':
                if self.unbuffered:
                    td.readFromFile(use_inputdir + "/" + inputfile)
                else:
                    td.readFromFileBuffered(use_inputdir + "/" + inputfile)
            else:
                print('converting ' + inputfile)
                td.readFromSourceFile(use_inputdir + "/" + inputfile,
                                      self.dc.weighterobjects,
                                      istraining=False)

            gen = TrainDataGenerator()
            # the batch size must be one otherwise we need to play tricks with the row splits later on
            gen.setBatchSize(1)
            gen.setSquaredElementsLimit(False)
            gen.setSkipTooLargeBatches(False)
            gen.setBuffer(td)

            num_steps = gen.getNBatches()
            generator = gen.feedNumpyData()

            dumping_data = []

            thistime = time.time()
            for _ in range(num_steps):
                data_in = next(generator)
                predictions_dict = model(data_in[0])
                for k in predictions_dict.keys():
                    predictions_dict[k] = predictions_dict[k].numpy()
                features_dict = td.createFeatureDict(data_in[0])
                truth_dict = td.createTruthDict(data_in[0])

                dumping_data.append(
                    [features_dict, truth_dict, predictions_dict])

            totaltime = time.time() - thistime
            print('took approx', totaltime / num_steps,
                  's per endcap (also includes dict building)')

            td.clear()
            gen.clear()
            outfilename = os.path.splitext(outfilename)[0] + '.bin.gz'
            if output_to_file:
                td.writeOutPredictionDict(dumping_data,
                                          self.predict_dir + "/" + outfilename)
            outputs.append(outfilename)
            if not output_to_file:
                all_data.append(dumping_data)

        if output_to_file:
            with open(self.predict_dir + "/outfiles.txt", "w") as f:
                for name in outputs:
                    f.write(name + '\n')

        if not output_to_file:
            return all_data
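A short sketch of picking up the prediction output written above. Only the outfiles.txt layout (one file name per line) comes from the code; how each .bin.gz file is read back depends on the TrainData implementation and is left as a placeholder:

import os

predict_dir = 'predictions'  # placeholder, corresponds to self.predict_dir above
with open(os.path.join(predict_dir, 'outfiles.txt')) as f:
    prediction_files = [line.strip() for line in f if line.strip()]

for fname in prediction_files:
    # reading the .bin.gz content back depends on the TrainData implementation
    print('prediction file:', os.path.join(predict_dir, fname))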
Example #6
    def __init__(self,
                 samplefile,
                 accumulate_after_batches=5,
                 plot_after_batches=50,
                 batchsize=10,
                 beta_threshold=0.6,
                 distance_threshold=0.6,
                 iou_threshold=0.1,
                 n_windows_for_plots=5,
                 n_windows_for_scalar_metrics=5000000,
                 outputdir=None,
                 publish = None,
                 n_ccoords=None,
                 n_average_over_samples=5,
                 ):
        """

        :param samplefile: the file to pick validation data from
        :param accumulate_after_batches: run performance metrics after n batches (a good value is 5)
        :param plot_after_batches: update and upload plots after n batches
        :param batchsize: batch size
        :param beta_threshold: beta threshold for running prediction on obc
        :param distance_threshold: distance threshold for running prediction on obc
        :param iou_threshold: IoU threshold used for matching, both for OBC and for ticl
        :param n_windows_for_plots: how many windows to average over for the running performance plots
        :param n_windows_for_scalar_metrics: the maximum number of windows for which to store data for scalar performance metrics as a function of iteration
        :param outputdir: the output directory where to store results
        :param publish: where to publish, could be ssh'able path
        :param n_ccoords: n coords for plots
        :param n_average_over_samples: average scalar metrics over samples
        """
        super(plotRunningPerformanceMetrics, self).__init__()
        self.samplefile = samplefile
        self.counter = 0
        self.call_counter = 0
        self.decay_function = None
        self.outputdir = outputdir
        self.n_ccoords = n_ccoords
        self.publish = publish

        self.accumulate_after_batches = accumulate_after_batches
        self.plot_after_batches = plot_after_batches
        self.run_on_epoch_end = False

        if self.run_on_epoch_end and self.accumulate_after_batches >= 0:
            print('PredictCallback: can only be used on epoch end OR after n batches, falling back to epoch end')
            self.accumulate_after_batches = 0

        td = TrainData()
        td.readFromFile(samplefile)
        # td_selected = td.split(self.n_events)  # check if this works in ragged out of the box
        # if use_event >= 0:
        #     if use_event < td.nElements():
        #         td.skim(use_event)
        #     else:
        #         td.skim(use_event % td.nElements())
        self.batchsize = batchsize
        self.td = td
        self.gen = TrainDataGenerator()
        self.gen.setBatchSize(self.batchsize)
        self.gen.setSkipTooLargeBatches(False)
        self.gen.setBuffer(td)

        self.n_batches = self.gen.getNBatches()


        with tf.device('/CPU:0'):
            self.ragged_constructor = RaggedConstructTensor()
        self.window_id = 0
        self.window_analysis_dicts = []
        self.n_windows_for_plots = n_windows_for_plots
        self.n_windows_for_scalar_metrics = n_windows_for_scalar_metrics
        self.beta_threshold = beta_threshold
        self.distance_threshold = distance_threshold
        self.iou_threshold = iou_threshold

        self.scalar_metrics = dict()
        self.scalar_metrics['efficiency'] = []
        self.scalar_metrics['efficiency_ticl'] = []
        self.scalar_metrics['fake_rate'] = []
        self.scalar_metrics['fake_rate_ticl'] = []
        self.scalar_metrics['var_response'] = []
        self.scalar_metrics['var_response_ticl'] = []
        self.scalar_metrics['iteration'] = []

        self.n_average_over_samples = n_average_over_samples

        self.plot_process = None
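A minimal instantiation sketch under the constructor signature above; the sample file and output directory are placeholders, and attaching the callback to training assumes the standard Keras callback mechanism:

perf_cb = plotRunningPerformanceMetrics(
    samplefile='validation_sample.djctd',  # placeholder path
    accumulate_after_batches=5,
    plot_after_batches=50,
    batchsize=10,
    outputdir='running_metrics',           # placeholder output directory
    n_ccoords=2)

# model.fit(..., callbacks=[perf_cb])  # standard Keras callback list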