Example #1
class training_base(object):
    def __init__(self,
                 splittrainandtest=0.85,
                 useweights=False,
                 testrun=False,
                 testrun_fraction=0.1,
                 resumeSilently=False,
                 renewtokens=True,
                 collection_class=DataCollection,
                 parser=None,
                 recreate_silently=False):

        import sys
        scriptname = sys.argv[0]

        if parser is None: parser = ArgumentParser('Run the training')
        parser.add_argument('inputDataCollection')
        parser.add_argument('outputDir')
        parser.add_argument(
            '--modelMethod',
            help=
            'Method to be used to instantiate model in derived training class',
            metavar='OPT',
            default=None)
        parser.add_argument("--gpu",
                            help="select specific GPU",
                            metavar="OPT",
                            default="")
        parser.add_argument("--gpufraction",
                            help="select memory fraction for GPU",
                            type=float,
                            metavar="OPT",
                            default=-1)
        parser.add_argument("--submitbatch",
                            help="submits the job to condor",
                            default=False,
                            action="store_true")
        parser.add_argument(
            "--walltime",
            help=
            "sets the wall time for the batch job, format: 1d5h or 2d or 3h etc",
            default='1d')
        parser.add_argument("--isbatchrun",
                            help="is batch run",
                            default=False,
                            action="store_true")
        parser.add_argument("--valdata",
                            help="set validation dataset (optional)",
                            default="")
        parser.add_argument(
            "--takeweights",
            help=
            "Applies weights from the model given as relative or absolute path. Matches by names and skips layers that don't match.",
            default="")

        args = parser.parse_args()
        self.args = args
        import sys
        self.argstring = sys.argv
        #sanity check
        if args.isbatchrun:
            args.submitbatch = False
            resumeSilently = True

        if args.submitbatch:
            print(
                'submitting batch job. Model will be compiled for testing before submission (GPU settings being ignored)'
            )

        import matplotlib
        #if no X11 use below
        matplotlib.use('Agg')
        DJCSetGPUs(args.gpu)

        if args.gpufraction > 0 and args.gpufraction < 1:
            import sys
            import tensorflow as tf
            gpu_options = tf.GPUOptions(
                per_process_gpu_memory_fraction=args.gpufraction)
            sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
            import keras
            from keras import backend as K
            K.set_session(sess)
            print('using gpu memory fraction: ' + str(args.gpufraction))

        import keras

        self.ngpus = 1
        self.dist_strat_scope = None
        if len(args.gpu):
            self.ngpus = len(args.gpu.split(','))
            print('running on ' + str(self.ngpus) + ' gpus')
            if self.ngpus > 1:
                import tensorflow as tf
                self.dist_strat_scope = tf.distribute.MirroredStrategy()

        self.keras_inputs = []
        self.keras_inputsshapes = []
        self.keras_model = None
        self.keras_model_method = args.modelMethod
        self.keras_weight_model_path = args.takeweights
        self.train_data = None
        self.val_data = None
        self.startlearningrate = None
        self.optimizer = None
        self.trainedepoches = 0
        self.compiled = False
        self.checkpointcounter = 0
        self.renewtokens = renewtokens
        if args.isbatchrun:
            self.renewtokens = False
        self.callbacks = None
        self.custom_optimizer = False
        self.copied_script = ""
        self.submitbatch = args.submitbatch

        self.GAN_mode = False

        self.inputData = os.path.abspath(args.inputDataCollection) \
             if ',' not in args.inputDataCollection else \
              [os.path.abspath(i) for i in args.inputDataCollection.split(',')]
        self.outputDir = args.outputDir
        # create output dir

        isNewTraining = True
        if os.path.isdir(self.outputDir):
            if not (resumeSilently or recreate_silently):
                var = input(
                    'output dir exists. To recover a training, please type "yes"\n'
                )
                if var != 'yes':
                    raise Exception('output directory must not exist yet')
            isNewTraining = False
            if recreate_silently:
                isNewTraining = True
        else:
            os.mkdir(self.outputDir)
        self.outputDir = os.path.abspath(self.outputDir)
        self.outputDir += '/'

        if recreate_silently:
            os.system('rm -rf ' + self.outputDir + '*')

        #copy configuration to output dir
        if not args.isbatchrun:
            try:
                shutil.copyfile(scriptname,
                                self.outputDir + os.path.basename(scriptname))
            except shutil.SameFileError:
                pass
            except BaseException as e:
                raise e

            self.copied_script = self.outputDir + os.path.basename(scriptname)
        else:
            self.copied_script = scriptname

        self.train_data = collection_class()
        self.train_data.readFromFile(self.inputData)
        self.train_data.useweights = useweights

        if len(args.valdata):
            print('using validation data from ', args.valdata)
            self.val_data = DataCollection(args.valdata)

        else:
            if testrun:
                if len(self.train_data) > 1:
                    self.train_data.split(testrun_fraction)

                self.train_data.dataclass_instance = None  #can't be pickled
                self.val_data = copy.deepcopy(self.train_data)

            else:
                self.val_data = self.train_data.split(splittrainandtest)

        shapes = self.train_data.getKerasFeatureShapes()
        inputdtypes = self.train_data.getKerasFeatureDTypes()
        inputnames = self.train_data.getKerasFeatureArrayNames()
        for i in range(len(inputnames)):
            if inputnames[i] == "" or inputnames[i] == "_rowsplits":
                inputnames[i] = "input_" + str(i) + inputnames[i]

        print("shapes", shapes)
        print("inputdtypes", inputdtypes)
        print("inputnames", inputnames)

        self.keras_inputs = []
        self.keras_inputsshapes = []
        counter = 0
        for s, dt, n in zip(shapes, inputdtypes, inputnames):
            self.keras_inputs.append(
                keras.layers.Input(shape=s, dtype=dt, name=n))
            self.keras_inputsshapes.append(s)

        if not isNewTraining:
            kfile = self.outputDir + '/KERAS_check_model_last.h5' \
                if os.path.isfile(self.outputDir + '/KERAS_check_model_last.h5') else \
                self.outputDir + '/KERAS_model.h5'
            if os.path.isfile(kfile):
                print(kfile)

                if self.dist_strat_scope is not None:
                    with self.dist_strat_scope.scope():
                        self.loadModel(kfile)
                else:
                    self.loadModel(kfile)
                self.trainedepoches = 0
                if os.path.isfile(self.outputDir + 'losses.log'):
                    for line in open(self.outputDir + 'losses.log'):
                        valloss = line.split(' ')[1][:-1]
                        if not valloss == "None":
                            self.trainedepoches += 1
                else:
                    print(
                        'incomplete epochs, starting from the beginning but with pretrained model'
                    )
            else:
                print(
                    'no model found in existing output dir, starting training from scratch'
                )

    def __del__(self):
        if hasattr(self, 'train_data'):
            del self.train_data
            del self.val_data

    def modelSet(self):
        return self.keras_model is not None and not len(self.keras_weight_model_path)

    def setDJCKerasModel(self, model, *args, **kwargs):
        if len(self.keras_inputs) < 1:
            raise Exception('setup data first')
        self.keras_model = model(*args, **kwargs)
        if hasattr(self.keras_model, "_is_djc_keras_model"):
            self.keras_model.setInputShape(self.keras_inputs)
            self.keras_model.build(None)
        if not self.keras_model:
            raise Exception('Setting DJCKerasModel not successful')

    def setModel(self, model, **modelargs):
        if len(self.keras_inputs) < 1:
            raise Exception('setup data first')
        if self.dist_strat_scope is not None:
            with self.dist_strat_scope.scope():
                self.keras_model = model(self.keras_inputs, **modelargs)
        else:
            self.keras_model = model(self.keras_inputs, **modelargs)
        if hasattr(self.keras_model, "_is_djc_keras_model"):  #compatibility
            self.keras_model.setInputShape(self.keras_inputs)
            self.keras_model.build(None)

        if len(self.keras_weight_model_path):
            from DeepJetCore.modeltools import apply_weights_where_possible, load_model
            self.keras_model = apply_weights_where_possible(
                self.keras_model, load_model(self.keras_weight_model_path))
        #try:
        #    self.keras_model=model(self.keras_inputs,**modelargs)
        #except BaseException as e:
        #    print('problem in setting model. Reminder: since DJC 2.0, NClassificationTargets and RegressionTargets must not be specified anymore')
        #    raise e
        if not self.keras_model:
            raise Exception('Setting model not successful')

    def saveCheckPoint(self, addstring=''):

        self.checkpointcounter = self.checkpointcounter + 1
        self.saveModel("KERAS_model_checkpoint_" +
                       str(self.checkpointcounter) + "_" + addstring + ".h5")

    def loadModel(self, filename):
        from keras.models import load_model
        self.keras_model = load_model(filename,
                                      custom_objects=custom_objects_list)
        self.optimizer = self.keras_model.optimizer
        self.compiled = True
        if self.ngpus > 1:
            self.compiled = False

    def setCustomOptimizer(self, optimizer):
        self.optimizer = optimizer
        self.custom_optimizer = True

    def compileModel(self,
                     learningrate,
                     clipnorm=None,
                     discriminator_loss=['binary_crossentropy'],
                     print_models=False,
                     metrics=None,
                     **compileargs):
        if not self.keras_model and not self.GAN_mode:
            raise Exception('set model first')

        if self.ngpus > 1 and not self.submitbatch:
            print('Model being compiled for ' + str(self.ngpus) + ' gpus')

        self.startlearningrate = learningrate

        if not self.custom_optimizer:
            from keras.optimizers import Adam
            if clipnorm:
                self.optimizer = Adam(lr=self.startlearningrate,
                                      clipnorm=clipnorm)
            else:
                self.optimizer = Adam(lr=self.startlearningrate)

        if self.dist_strat_scope is not None:
            with self.dist_strat_scope.scope():
                self.keras_model.compile(optimizer=self.optimizer,
                                         metrics=metrics,
                                         **compileargs)
        else:
            self.keras_model.compile(optimizer=self.optimizer,
                                     metrics=metrics,
                                     **compileargs)
        if print_models:
            print(self.keras_model.summary())
        self.compiled = True

    def compileModelWithCustomOptimizer(self, customOptimizer, **compileargs):
        raise Exception(
            'DEPRECATED: please use setCustomOptimizer before calling compileModel'
        )

    def saveModel(self, outfile):
        if not self.GAN_mode:
            self.keras_model.save(self.outputDir + outfile)
        else:
            self.gan.save(self.outputDir + 'GAN_' + outfile)
            self.generator.save(self.outputDir + 'GEN_' + outfile)
            self.discriminator.save(self.outputDir + 'DIS_' + outfile)

        #import h5py
        #f = h5py.File(self.outputDir+outfile, 'r+')
        #del f['optimizer_weights']
        #f.close()

    def _initTraining(self, nepochs, batchsize, use_sum_of_squares=False):

        if self.submitbatch:
            from DeepJetCore.training.batchTools import submit_batch
            submit_batch(self, self.args.walltime)
            exit()  #don't delete this!

        self.train_data.setBatchSize(batchsize)
        self.val_data.setBatchSize(batchsize)
        self.train_data.batch_uses_sum_of_squares = use_sum_of_squares
        self.val_data.batch_uses_sum_of_squares = use_sum_of_squares

        self.train_data.writeToFile(self.outputDir + 'trainsamples.djcdc')
        self.val_data.writeToFile(self.outputDir + 'valsamples.djcdc')

        #make sure tokens don't expire
        from .tokenTools import checkTokens, renew_token_process
        from _thread import start_new_thread

        if self.renewtokens:
            print('starting afs backgrounder')
            checkTokens()
            start_new_thread(renew_token_process, ())

        self.train_data.setBatchSize(batchsize)
        self.val_data.setBatchSize(batchsize)

    def trainModel(
            self,
            nepochs,
            batchsize,
            run_eagerly=False,
            batchsize_use_sum_of_squares=False,
            extend_truth_list_by=0,  #extend the truth list with dummies. Useful when adding more prediction outputs than truth inputs
            stop_patience=-1,
            lr_factor=0.5,
            lr_patience=-1,
            lr_epsilon=0.003,
            lr_cooldown=6,
            lr_minimum=0.000001,
            checkperiod=10,
            backup_after_batches=-1,
            additional_plots=None,
            additional_callbacks=None,
            load_in_mem=False,
            max_files=-1,
            plot_batch_loss=False,
            **trainargs):

        self.keras_model.run_eagerly = run_eagerly
        # write only after the output classes have been added
        self._initTraining(nepochs, batchsize, batchsize_use_sum_of_squares)

        self.keras_model.save(self.outputDir + 'KERAS_untrained_model.h5')
        print('setting up callbacks')
        from .DeepJet_callbacks import DeepJet_callbacks
        minTokenLifetime = 5
        if not self.renewtokens:
            minTokenLifetime = -1

        self.callbacks = DeepJet_callbacks(
            self.keras_model,
            stop_patience=stop_patience,
            lr_factor=lr_factor,
            lr_patience=lr_patience,
            lr_epsilon=lr_epsilon,
            lr_cooldown=lr_cooldown,
            lr_minimum=lr_minimum,
            outputDir=self.outputDir,
            checkperiod=checkperiod,
            backup_after_batches=backup_after_batches,
            checkperiodoffset=self.trainedepoches,
            additional_plots=additional_plots,
            batch_loss=plot_batch_loss,
            minTokenLifetime=minTokenLifetime)

        if additional_callbacks is not None:
            if not isinstance(additional_callbacks, list):
                additional_callbacks = [additional_callbacks]
            self.callbacks.callbacks.extend(additional_callbacks)

        print('starting training')
        if load_in_mem:
            print('make features')
            X_train = self.train_data.getAllFeatures(nfiles=max_files)
            X_test = self.val_data.getAllFeatures(nfiles=max_files)
            print('make truth')
            Y_train = self.train_data.getAllLabels(nfiles=max_files)
            Y_test = self.val_data.getAllLabels(nfiles=max_files)
            self.keras_model.fit(X_train,
                                 Y_train,
                                 batch_size=batchsize,
                                 epochs=nepochs,
                                 callbacks=self.callbacks.callbacks,
                                 validation_data=(X_test, Y_test),
                                 max_queue_size=1,
                                 use_multiprocessing=False,
                                 workers=0,
                                 **trainargs)
        else:

            #prepare generator

            print("setting up generator... can take a while")
            traingen = self.train_data.invokeGenerator()
            valgen = self.val_data.invokeGenerator()
            #this is fixed
            traingen.extend_truth_list_by = extend_truth_list_by
            valgen.extend_truth_list_by = extend_truth_list_by

            while (self.trainedepoches < nepochs):

                #this can change from epoch to epoch
                #calculate steps for this epoch
                #feed info below
                traingen.prepareNextEpoch()
                valgen.prepareNextEpoch()
                nbatches_train = traingen.getNBatches()  #might have changed due to shuffling
                nbatches_val = valgen.getNBatches()

                print('>>>> epoch', self.trainedepoches, "/", nepochs)
                print('training batches: ', nbatches_train)
                print('validation batches: ', nbatches_val)

                self.keras_model.fit(traingen.feedNumpyData(),
                                     steps_per_epoch=nbatches_train,
                                     epochs=self.trainedepoches + 1,
                                     initial_epoch=self.trainedepoches,
                                     callbacks=self.callbacks.callbacks,
                                     validation_data=valgen.feedNumpyData(),
                                     validation_steps=nbatches_val,
                                     max_queue_size=1,
                                     use_multiprocessing=False,
                                     workers=0,
                                     **trainargs)
                self.trainedepoches += 1
                traingen.shuffleFilelist()
                #

            self.saveModel("KERAS_model.h5")

        return self.keras_model, self.callbacks.history

    def change_learning_rate(self, new_lr):
        import keras.backend as K
        if self.GAN_mode:
            K.set_value(self.discriminator.optimizer.lr, new_lr)
            K.set_value(self.gan.optimizer.lr, new_lr)
        else:
            K.set_value(self.keras_model.optimizer.lr, new_lr)
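A minimal sketch of how a derived training script typically drives this class, assuming it is importable from DeepJetCore.training.training_base; the model-builder my_model, the loss choice, and the hyperparameters below are illustrative placeholders, not part of DeepJetCore itself.

# sketch of a training script built on the training_base class above;
# my_model and all hyperparameters are placeholders
from DeepJetCore.training.training_base import training_base
from keras.layers import Dense
from keras.models import Model

def my_model(inputs):
    # toy model: a single dense head attached to the first Keras input only
    x = Dense(64, activation='relu')(inputs[0])
    out = Dense(1, activation='sigmoid')(x)
    return Model(inputs=inputs, outputs=out)

train = training_base(testrun=False, renewtokens=False)

if not train.modelSet():  # skip when resuming from an existing checkpoint
    train.setModel(my_model)
    train.compileModel(learningrate=1e-3, loss='binary_crossentropy')

model, history = train.trainModel(nepochs=10,
                                  batchsize=512,
                                  checkperiod=2,
                                  stop_patience=5)

Because the constructor parses the command line itself, such a script is invoked as python train_example.py <inputDataCollection> <outputDir>.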
Example #2
import sys

from DeepJetCore.DataCollection import DataCollection

djdc_path = sys.argv[1]

train_data = DataCollection(djdc_path)

# splits off 10% of the training dataset for validation. Can be used in the same way as train_data
val_data = train_data.split(0.9)

# Set the batch size.
# If the data is ragged in dimension 1 (see convert options),
# then this is the maximum number of elements per batch, which could be distributed differently
# to individual examples. E.g., if the first example has 50 elements, the second 48, and the third 30,
# and the batch size is set to 100, it would return the first two examples (in total 99 elements) in
# the first batch etc. This is helpful to avoid out-of-memory errors during training

train_data.setBatchSize(100)

print("batch size: 100")
# prepare the generator

train_data.invokeGenerator()

# loop over epochs here ...

train_data.generator.shuffleFilelist()
train_data.generator.prepareNextEpoch()

# this number can differ from epoch to epoch for ragged data!
nbatches = train_data.generator.getNBatches()

print("nbatches: {}".format(nbatches))
Example #3
minbatch = int(args.min)
maxbatch = int(args.max)
n_plots = int(args.n)
infile = args.inputFile
batchsize = int(args.b)

from DeepJetCore.DataCollection import DataCollection
from index_dicts import create_truth_dict, create_feature_dict
from ragged_plotting_tools import make_original_truth_shower_plot, createRandomizedColors
import matplotlib
import matplotlib.pyplot as plt
import random

dc = DataCollection(infile)
dc.setBatchSize(batchsize)
gen = dc.invokeGenerator()
nbatches = gen.getNBatches()

if maxbatch >= nbatches:
    raise ValueError("maxbatch >= nbatches in sample")
if minbatch >= maxbatch:
    raise ValueError("minbatch >= maxbatch")

events = random.sample(range(minbatch, maxbatch), n_plots)
lastev = -1
n_plots_done = 0
print('scanning...')
for i in range(nbatches):
    f, t = next(gen.feedNumpyData())
    rs = f[1]
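    # --- the listing is truncated here; what follows is a hedged continuation sketch
    # using only names already defined above. The per-event call to
    # make_original_truth_shower_plot is left as a placeholder comment because its
    # signature is not shown in this excerpt. ---
    if i not in events:
        continue  # only the randomly selected batch indices get plotted
    fig = plt.figure()
    # ... build the event display here, e.g. with make_original_truth_shower_plot(...)
    fig.savefig("event_" + str(i) + ".png")
    plt.close(fig)
    n_plots_done += 1
    if n_plots_done >= n_plots:
        break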
Example #4
    def train(self):

        placeholder_input, placeholder_output = self.model.get_placeholders()
        graph_output = self.model.get_compute_graphs()
        graph_loss = self.model.get_losses()
        graph_optimiser = self.model.get_optimizer()
        graph_summary = self.model.get_summary()

        if self.from_scratch:
            self.clean_summary_dir()

        init = tf.global_variables_initializer()
        with tf.Session() as sess:
            sess.run(init)
            if self.use_tf_records:
                coord = tf.train.Coordinator()
                threads = tf.train.start_queue_runners(coord=coord)
                record_batch_input, record_batch_target = self.get_record_placeholders(
                )
            else:
                input_data = self.config['train_data_path']
                train_data = DataCollection()
                train_data.readFromFile(input_data)

                val_data = train_data.split(0.1)
                train_data = train_data.split(0.9)
                train_data.setBatchSize(self.batch_size)
                val_data.setBatchSize(self.batch_size)
                val_data_generator = val_data.generator()
                train_data_generator = train_data.generator()

            summary_writer = tf.summary.FileWriter(self.summary_path,
                                                   sess.graph)

            if not self.from_scratch:
                self.saver_all.restore(sess, self.model_path)
                print("\n\nINFO: Loading model\n\n")
                with open(self.model_path + '.txt', 'r') as f:
                    iteration_number = int(f.read())
            else:
                iteration_number = 0

            print("Starting iterations")
            while iteration_number < self.train_for_iterations:
                if self.use_tf_records:
                    input, output = sess.run(
                        [record_batch_input, record_batch_target])
                    input = [
                        np.fromstring(''.join(i)).reshape(
                            13, 13, int(self.config['num_layers']),
                            int(self.config['num_channels'])) for i in input
                    ]
                    output = [
                        np.fromstring(''.join(i)).reshape(
                            13, 13, int(self.config['num_layers']))
                        for i in output
                    ]
                else:
                    input, output, _ = next(train_data_generator)
                    input = np.squeeze(input, axis=0)
                    output = np.squeeze(output, axis=0)

                _, eval_loss, _, eval_summary = sess.run(
                    [graph_output, graph_loss, graph_optimiser, graph_summary],
                    feed_dict={
                        placeholder_input: input,
                        placeholder_output: output
                    })
                print("Iteration %4d: loss %0.5f" %
                      (iteration_number, eval_loss))
                iteration_number += 1
                summary_writer.add_summary(eval_summary, iteration_number)
                if iteration_number % self.save_after_iterations == 0:
                    print("\n\nINFO: Saving model\n\n")
                    self.saver_all.save(sess, self.model_path)
                    with open(self.model_path + '.txt', 'w') as f:
                        f.write(str(iteration_number))
            if self.use_tf_records:
                # Stop the threads
                coord.request_stop()

                # Wait for threads to stop
                coord.join(threads)