Example No. 1
def test(self):

    passed = True

    dc = DataCollection()
    dc.dataclass = TrainData_test
    dc.sourceList = [f for f in self.files.filenames]
    dc.createDataFromRoot(TrainData_test, outputDir=self.dcoutdir.path)

    gen = dc.invokeGenerator()
    gen.setBatchSize(self.n_per_batch)

    for epoch in range(10):
        gen.prepareNextEpoch()
        print('epoch', epoch, 'batches', gen.getNBatches())
        for b in range(gen.getNBatches()):
            d, _ = next(gen.feedNumpyData())
            data, rs = d[0], d[1]
            rs = np.array(rs[:, 0], dtype='int')
            # truncate to the valid part; the last element of the padded
            # row-split array gives its valid length
            rs = rs[:rs[-1]]
            # print(data)
            # print(rs[-1])
            if not raggedtester.checkData(data, rs):
                print('epoch', epoch, 'batch', b, 'broken')
                passed = False
                break
            # the last row split is the total number of elements in this batch
            if rs[-1] > self.n_per_batch:
                print('maximum batch size exceeded for batch', b, 'epoch', epoch)

        print('shuffling')
        gen.shuffleFilelist()

    return passed
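
A minimal standalone sketch of the same generator loop, assuming a DeepJetCore-style data collection file already exists on disk; the import path and the file name "mycollection.djcdc" are assumptions, while every generator call is taken from the snippet above.

# Standalone sketch of the ragged-generator loop (paths and import are assumptions).
import numpy as np
from DeepJetCore.DataCollection import DataCollection

dc = DataCollection("mycollection.djcdc")  # placeholder path to an existing collection
gen = dc.invokeGenerator()
gen.setBatchSize(100)

gen.prepareNextEpoch()
for b in range(gen.getNBatches()):
    d, _ = next(gen.feedNumpyData())
    data, rs = d[0], d[1]
    rs = np.array(rs[:, 0], dtype='int')  # padded row splits, valid length in last entry
    rs = rs[:rs[-1]]
    print('batch', b, 'elements in batch', rs[-1])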
Example No. 2
    print('available classes:')
    for key, val in class_options.items():
        print(key)
    raise Exception('wrong class selection')

if not ".dc" in infile:
    raise Exception('wrong input file '+infile)
    
dir = os.path.dirname(infile)

dcold = DCOld()
dcold.readRawFromFile(infile)


dcnew = DataCollection()
dcnew.dataclass = traind()
dcnew.samples = [s[:-4] + 'djctd' for s in dcold.samples]
print(dcnew.samples)
dcnew.sourceList = dcold.originRoots
# leave traindata undefined, there is no way to convert it
dcnew.__nsamples = 0  # determine again later, also as a cross-check

outfile = infile[:-2] + 'djcdc'
print("infile:", infile, "outfile:", outfile)

def worker(i):

    td = TDOld()
    tdnew = TrainData()
    print("converting",dcold.samples[i])
    
Example No. 3
from argparse import ArgumentParser

parser = ArgumentParser('Dataset validation hplots script')
parser.add_argument('-d', help="Data collection file")
parser.add_argument('-p',
                    help="PDF file path (will be ignored in validate mode)")
parser.add_argument('-n',
                    help="Number of events to produce dataset stats pdf on",
                    default="50")
parser.add_argument('--validate', dest='validate', action='store_true')
parser.set_defaults(validate=False)

args = parser.parse_args()
dc = DataCollection(args.d)
td = dc.dataclass()  # this is actually saved
# JK: this combination enforces one event per batch, so the extra row-split loop is not needed
batchsize = 1
dc.setBatchSize(batchsize)
print("Invoking generator")
gen = dc.invokeGenerator()
gen.setSkipTooLargeBatches(False)

# gen.setBuffer(td)
print("n batches")
n_batches = gen.getNBatches()
print(n_batches)
print("probably ready")
#gpus = tf.config.list_physical_devices('GPU')
gpus = 0
if gpus:
Example No. 4
    dc = DataCollection(args.trainingDataCollection)

outputs = []
os.system('mkdir -p ' + args.outputDir)


for inputfile in inputdatafiles:
    
    print('predicting', inputdir + "/" + inputfile)
    
    use_inputdir = inputdir
    if inputfile[0] == "/":
        use_inputdir=""
    outfilename = "pred_"+os.path.basename( inputfile )
    
    td = dc.dataclass()

    if inputfile[-5:] == 'djctd':
        if args.unbuffered:
            td.readFromFile(use_inputdir+"/"+inputfile)
        else:
            td.readFromFileBuffered(use_inputdir+"/"+inputfile)
    else:
        print('converting '+inputfile)
        td.readFromSourceFile(use_inputdir+"/"+inputfile, dc.weighterobjects, istraining=False)
    

    gen = TrainDataGenerator()
    if batchsize < 1:
        batchsize = dc.getBatchSize()
    print('batch size', batchsize)
Example No. 5
class HGCalPredictor():
    def __init__(self,
                 input_source_files_list,
                 training_data_collection,
                 predict_dir,
                 unbuffered=False,
                 model_path=None,
                 max_files=4,
                 inputdir=None):
        self.input_data_files = []
        self.inputdir = None
        self.predict_dir = predict_dir
        self.unbuffered = unbuffered
        self.max_files = max_files
        print("Using HGCal predictor class")

        ## prepare input lists for different file formats

        if input_source_files_list[-6:] == ".djcdc":
            print('reading from data collection', input_source_files_list)
            predsamples = DataCollection(input_source_files_list)
            self.inputdir = predsamples.dataDir
            for s in predsamples.samples:
                self.input_data_files.append(s)

        elif input_source_files_list[-6:] == ".djctd":
            self.inputdir = os.path.abspath(
                os.path.dirname(input_source_files_list))
            infile = os.path.basename(input_source_files_list)
            self.input_data_files.append(infile)
        else:
            print('reading from text file', input_source_files_list)
            self.inputdir = os.path.abspath(
                os.path.dirname(input_source_files_list))
            with open(input_source_files_list, "r") as f:
                for s in f:
                    self.input_data_files.append(
                        s.replace('\n', '').replace(" ", ""))

        self.dc = None
        if (input_source_files_list[-6:] == ".djcdc"
                and training_data_collection[-6:] != ".djcdc"):
            self.dc = DataCollection(input_source_files_list)
        else:
            self.dc = DataCollection(training_data_collection)

        if inputdir is not None:
            self.inputdir = inputdir

        self.model_path = model_path
        if max_files > 0:
            self.input_data_files = self.input_data_files[:max_files]

    def predict(self, model=None, model_path=None, output_to_file=True):
        if model_path is None:
            model_path = self.model_path

        # need either an already loaded model or a path to load one from
        assert model_path is not None or model is not None

        if model is None:
            if not os.path.exists(model_path):
                raise FileNotFoundError('Model file not found: ' + str(model_path))

        outputs = []
        if output_to_file:
            os.system('mkdir -p ' + self.predict_dir)

        if model is None:
            model = load_model(model_path)

        all_data = []
        for inputfile in self.input_data_files:

            use_inputdir = self.inputdir
            if inputfile[0] == "/":
                use_inputdir = ""
            outfilename = "pred_" + os.path.basename(inputfile)

            print('predicting ', use_inputdir + '/' + inputfile)

            td = self.dc.dataclass()

            # also allows for inheriting classes now, like with tracks or special PU
            if (not isinstance(td, TrainData_NanoML)
                    and type(td) is not TrainData_TrackML):
                raise RuntimeError(
                    "TODO: make sure this works for other traindata formats")

            if inputfile[-5:] == 'djctd':
                if self.unbuffered:
                    td.readFromFile(use_inputdir + "/" + inputfile)
                else:
                    td.readFromFileBuffered(use_inputdir + "/" + inputfile)
            else:
                print('converting ' + inputfile)
                td.readFromSourceFile(use_inputdir + "/" + inputfile,
                                      self.dc.weighterobjects,
                                      istraining=False)

            gen = TrainDataGenerator()
            # the batch size must be one, otherwise we would need to play tricks with the row splits later on
            gen.setBatchSize(1)
            gen.setSquaredElementsLimit(False)
            gen.setSkipTooLargeBatches(False)
            gen.setBuffer(td)

            num_steps = gen.getNBatches()
            generator = gen.feedNumpyData()

            dumping_data = []

            thistime = time.time()
            for _ in range(num_steps):
                data_in = next(generator)
                predictions_dict = model(data_in[0])
                for k in predictions_dict.keys():
                    predictions_dict[k] = predictions_dict[k].numpy()
                features_dict = td.createFeatureDict(data_in[0])
                truth_dict = td.createTruthDict(data_in[0])

                dumping_data.append(
                    [features_dict, truth_dict, predictions_dict])

            totaltime = time.time() - thistime
            print('took approx', totaltime / num_steps,
                  's per endcap (also includes dict building)')

            td.clear()
            gen.clear()
            outfilename = os.path.splitext(outfilename)[0] + '.bin.gz'
            if output_to_file:
                td.writeOutPredictionDict(dumping_data,
                                          self.predict_dir + "/" + outfilename)
            outputs.append(outfilename)
            if not output_to_file:
                all_data.append(dumping_data)

        if output_to_file:
            with open(self.predict_dir + "/outfiles.txt", "w") as f:
                for l in outputs:
                    f.write(l + '\n')

        if not output_to_file:
            return all_data
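
A short usage sketch for the HGCalPredictor class above; the file paths below are placeholders and not values taken from the source.

# Hypothetical driver for HGCalPredictor; all paths are placeholders.
predictor = HGCalPredictor(
    input_source_files_list="pred_files.txt",   # text file listing input samples
    training_data_collection="training.djcdc",  # training data collection
    predict_dir="predictions",
    model_path="model.h5",
    max_files=2)

# writes pred_*.bin.gz files plus outfiles.txt into predict_dir
predictor.predict(output_to_file=True)

# alternatively, keep the per-event dictionaries in memory
all_data = predictor.predict(output_to_file=False)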