Example #1
    def __init__(self, *args, **kwargs):
        super(KerasTests, self).__init__(*args, **kwargs)
        warnings.simplefilter('module')
        hvd.init()

        self.config = tf.ConfigProto()
        self.config.gpu_options.allow_growth = True
        self.config.gpu_options.visible_device_list = str(hvd.local_rank())
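A note on the two rank helpers used throughout these examples: hvd.local_rank() indexes processes within a single machine, while hvd.rank() is the global index across all machines, which is why GPU pinning uses the local one. A minimal sketch (assuming, say, 2 machines with 2 processes each):

import horovod.keras as hvd

hvd.init()
# With 2 machines x 2 processes: rank() runs 0..3 globally, local_rank()
# runs 0..1 on each machine, and size() is 4.
print('global rank %d of %d, local rank %d'
      % (hvd.rank(), hvd.size(), hvd.local_rank()))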
Example #2
    def initialize(self):
        # init_op = tf.initialize_all_variables()
        # init_op = tf.global_variables_initializer()
        # sess = tf.Session()
        # sess.run(init_op)

        # Check if GPUs are available
        # if tf.test.is_gpu_available():  # commented out since this test will cause a new session be created
        # allow growth
        # config = tf.compat.v1.ConfigProto()
        # config.gpu_options.per_process_gpu_memory_fraction = 1
        # config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
        # # config.log_device_placement = True  # to log device placement (on which device the operation ran)
        # sess = tf.compat.v1.Session(config=config)
        # tf.compat.v1.keras.backend.set_session(sess)  # set this TensorFlow session as the default session for Keras

        # Create logger
        self.logger = logging.getLogger('DeepGalaxyTrain')
        self.logger.setLevel(self.log_level)
        self.logger.addHandler(logging.FileHandler('train_log.txt'))
        if self.distributed_training is True:
            try:
                import horovod.tensorflow.keras as hvd
                # initialize horovod
                hvd.init()
                self.callbacks.append(
                    hvd.callbacks.BroadcastGlobalVariablesCallback(0))
                self.callbacks.append(hvd.callbacks.MetricAverageCallback())
                # self.callbacks = [hvd.BroadcastGlobalVariablesHook(0)]
                if hvd.rank() == 0:
                    self.logger.info('Parallel training enabled.')
                    self.logger.info(
                        'batch_size = %d, global_batch_size = %d, num_workers = %d\n'
                        % (self.batch_size, self.batch_size * hvd.size(),
                           hvd.size()))

                # Map an MPI process to a GPU (Important!)
                print('hvd_rank = %d, hvd_local_rank = %d' %
                      (hvd.rank(), hvd.local_rank()))
                self.logger.info('hvd_rank = %d, hvd_local_rank = %d' %
                                 (hvd.rank(), hvd.local_rank()))

                # Bind a CUDA device to one MPI process (has no effect if GPUs are not used)
                os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())

                # # Horovod: pin GPU to be used to process local rank (one GPU per process)
                gpus = tf.config.experimental.list_physical_devices('GPU')
                for gpu in gpus:
                    tf.config.experimental.set_memory_growth(gpu, True)
                # if gpus:
                # tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
            except ImportError as identifier:
                print(
                    'Error importing horovod. Disabling distributed training.')
                self.distributed_training = False
        else:
            self.logger.info('Parallel training disabled.')
            self.logger.info('Batch_size = %d' % (self.batch_size))
Example #3
    def init_horovod(self):
        # Horovod: initialize Horovod.
        hvd.init()

        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        K.set_session(tf.Session(config=config))
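The ConfigProto pattern above (and in Example #1) is TensorFlow 1.x style. A sketch of the equivalent GPU pinning for TensorFlow 2.x, matching the variant left commented out in Example #2 (assumes horovod.tensorflow.keras is available):

import tensorflow as tf
import horovod.tensorflow.keras as hvd

hvd.init()
# Pin each process to one GPU and enable memory growth (TF 2.x style).
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')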
Example #4
def train_evaluate():

    # Generate training and validation data generators 
    def get_image_list(data_dir):
        dataset = []
        for folder in os.listdir(data_dir):
            for image in os.listdir(os.path.join(data_dir, folder)):
                dataset.append((os.path.join(data_dir, folder, image), folder))
        return dataset

    training_data = ImageSequence(get_image_list(os.path.join(FLAGS.data_dir, 'train')), FLAGS.batch_size, True)
    validation_data = ImageSequence(get_image_list(os.path.join(FLAGS.data_dir, 'test')), FLAGS.batch_size, False)

    # Horovod: Initialize Horovod
    hvd.init()

    # Horovod: Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    tf.keras.backend.set_session(tf.Session(config=config))

    # Create a model
    model = network_model(FLAGS.hidden_units)
    loss = 'categorical_crossentropy'

    # Horovod: Adjust learning rate based on number of GPUs
    optimizer = Adadelta(lr=1.0 * hvd.size())
    # Horovod: add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(optimizer)

    metrics = ['acc']
    model.compile(optimizer, loss, metrics)
  
    # Set up callbacks
    callbacks = [
        # Broadcast initial variable states from rank 0 to all other processes
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    ]
    
    # Horovod: save logs only on worker 0
    if hvd.rank() == 0:
        callbacks.append(tf.keras.callbacks.TensorBoard(log_dir=FLAGS.log_dir))

    # Start training
    model.fit_generator(generator = training_data,
                        validation_data = validation_data,
                        epochs = FLAGS.epochs,
                        use_multiprocessing = True,
                        workers = 4,
                        callbacks = callbacks,
                        verbose = 1)

    # Save the model
    model.save(FLAGS.save_model_path)
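Note that model.save() above runs on every worker; later examples (e.g. Example #20) guard the save with a rank check so only worker 0 writes the file. A minimal variant of that final step under the same assumption:

# Horovod: save the model only on worker 0 to avoid concurrent writes.
if hvd.rank() == 0:
    model.save(FLAGS.save_model_path)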
Example #5
def _main():
    hvd.init()
    better_exceptions.MAX_LENGTH = 128
    _MODELS_DIR.mkdir(parents=True, exist_ok=True)
    logger = tk.log.get()
    logger.addHandler(tk.log.stream_handler())
    if hvd.rank() == 0:
        logger.addHandler(
            tk.log.file_handler(_MODELS_DIR / 'train.log', append=True))
    with tk.dl.session(
            gpu_options={'visible_device_list': str(hvd.local_rank())}):
        _run()
Example #6
def perform_setup(options):

    import numpy as np
    import sys
    import keras
    import keras.backend as K
    import tensorflow as tf

    sys.setrecursionlimit(5000)

    if options.with_hvd:
        import horovod.keras as hvd
        hvd.init()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth=True
        if options.gpu > 1:
            devlist = '0'
            for i in range(1,options.gpu):
                devlist += ','+str(i)
            config.gpu_options.visible_device_list = devlist
        else:
            config.gpu_options.visible_device_list = str(hvd.local_rank())
        K.set_session(tf.Session(config=config))
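    # NOTE: the unconditional set_session below replaces the Horovod session
    # configured above and drops its visible_device_list pinning; compare
    # Example #19, which sets the session only inside the with_hvd branch.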
    config = tf.ConfigProto()
    config.gpu_options.allow_growth=True
    config.gpu_options.per_process_gpu_memory_fraction=0.25
    K.set_session(tf.Session(config=config))


    global _globalnpfile
    global _globalexpectedpixel
    global IMG_DTYPE
    global SEG_DTYPE
    global FLOAT_DTYPE
    global _nx
    global _ny


    # raw dicom data is usually short int (2bytes) datatype
    # labels are usually uchar (1byte)
    IMG_DTYPE = np.int16
    SEG_DTYPE = np.uint8
    FLOAT_DTYPE = np.float32

    # _globalnpfile = options.dbfile.replace('.csv','%d.npy' % options.trainingresample )
    # _globalexpectedpixel=512
    _nx = options.trainingresample
    _ny = options.trainingresample

    return IMG_DTYPE, SEG_DTYPE, _nx, _ny
Example #7
def main(args):
    # =========== changed here ============= #
    import horovod.keras as hvd
    hvd.init()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))
    
    logging.info("getting data")
    
    train_dataset = train_input_fn()
    eval_dataset = eval_input_fn()
    validation_dataset = validation_input_fn()
    
    
    # =============== changed here ======================= #
    logging.info("configuring model")
    model = keras_model_fn(args.learning_rate, args.weight_decay, args.optimizer, args.momentum, hvd)
    callbacks = []

    # =============== changed here ======================= #
    # callbacks.append(ModelCheckpoint(args.model_dir + '/checkpoint-{epoch}.h5'))
    callbacks.append(ModelCheckpoint(args.model_output_dir + '/checkpoint-{epoch}.h5'))
    
    # =============== changed here ======================= #
    callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
    callbacks.append(hvd.callbacks.MetricAverageCallback())
    callbacks.append(hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1))
    
    # =============== changed here ======================= #
    if hvd.rank() == 0:
        callbacks.append(ModelCheckpoint(args.model_output_dir + '/checkpoint-{epoch}.h5'))
        callbacks.append(TensorBoard(log_dir=args.model_dir,update_freq='epoch'))

    logging.info("Starting training")
    model.fit(x=train_dataset[0], y=train_dataset[1],
              steps_per_epoch=(num_examples_per_epoch('train') // args.batch_size),
              epochs=args.epochs, validation_data=validation_dataset,
              validation_steps=(num_examples_per_epoch('validation') // args.batch_size), callbacks=callbacks)

    score = model.evaluate(eval_dataset[0], eval_dataset[1], steps=num_examples_per_epoch('eval') // args.batch_size,
                           verbose=0)

    logging.info('Test loss:{}'.format(score[0]))
    logging.info('Test accuracy:{}'.format(score[1]))

    # =============== changed here ======================= #
    # return save_model(model, args.model_dir)
    return save_model(model, args.model_output_dir)
Example #8
    def test_load_model_custom_objects(self):
        hvd.init()

        class TestOptimizer(keras.optimizers.RMSprop):
            def __init__(self, **kwargs):
                super(TestOptimizer, self).__init__(**kwargs)

        with self.test_session() as sess:
            K.set_session(sess)

            opt = TestOptimizer(lr=0.0001)
            opt = hvd.DistributedOptimizer(opt)

            model = keras.models.Sequential()
            model.add(keras.layers.Dense(2, input_shape=(3, )))
            model.add(keras.layers.RepeatVector(3))
            model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
            model.compile(loss=keras.losses.MSE,
                          optimizer=opt,
                          metrics=[keras.metrics.categorical_accuracy],
                          sample_weight_mode='temporal')

            x = np.random.random((1, 3))
            y = np.random.random((1, 3, 3))
            model.train_on_batch(x, y)

            _, fname = tempfile.mkstemp('.h5')
            model.save(fname)

            custom_objects = {
                'TestOptimizer':
                lambda **kwargs: hvd.DistributedOptimizer(
                    TestOptimizer(**kwargs))
            }
            new_model = hvd.load_model(fname, custom_objects=custom_objects)
            new_opt = new_model.optimizer
            os.remove(fname)

            self.assertEqual(type(new_opt).__module__, 'horovod.keras.impl')
            self.assertEqual(type(new_opt).__name__, 'TestOptimizer')
            self.assertEqual(K.get_value(opt.lr), K.get_value(new_opt.lr))
            self.assertEqual(len(opt.get_weights()),
                             len(new_opt.get_weights()))
            for weights, new_weights in zip(opt.get_weights(),
                                            new_opt.get_weights()):
                self.assertListEqual(weights.tolist(), new_weights.tolist())
Example #9
    def test_sparse_as_dense(self):
        hvd.init()

        with self.test_session() as sess:
            K.set_session(sess)

            opt = keras.optimizers.RMSprop(lr=0.0001)
            opt = hvd.DistributedOptimizer(opt, sparse_as_dense=True)

            model = keras.models.Sequential()
            model.add(keras.layers.Embedding(1000, 64, input_length=10))
            model.compile(loss=keras.losses.MSE, optimizer=opt)

            x = np.random.randint(1000, size=(32, 10))
            y = np.random.random((32, 10, 64))
            # No assertions, we just need to verify that it doesn't hang
            model.train_on_batch(x, y)
Example #10
def init_keras(hvd=None):
    """
    Set config for Horovod. Config params copied from official example:
    https://github.com/uber/horovod/blob/master/examples/keras_mnist_advanced.py#L15

    :param hvd: instance of horovod.keras
    """

    init_cuda_env()
    config = tf.ConfigProto()

    if hvd:
        hvd.init()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())

    set_session(tf.Session(config=config))
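A hypothetical call site for init_keras, following its docstring (the horovod.keras module itself is passed in for distributed runs):

import horovod.keras as hvd

init_keras(hvd=hvd)  # distributed: pins this process to GPU hvd.local_rank()
# init_keras()       # single process: plain session with the default config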
Example #11
    def test_load_model_custom_objects(self):
        hvd.init()

        class TestOptimizer(keras.optimizers.RMSprop):
            def __init__(self, **kwargs):
                super(TestOptimizer, self).__init__(**kwargs)

        with self.test_session() as sess:
            K.set_session(sess)

            opt = TestOptimizer(lr=0.0001)
            opt = hvd.DistributedOptimizer(opt)

            model = keras.models.Sequential()
            model.add(keras.layers.Dense(2, input_shape=(3,)))
            model.add(keras.layers.RepeatVector(3))
            model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
            model.compile(loss=keras.losses.MSE,
                          optimizer=opt,
                          metrics=[keras.metrics.categorical_accuracy],
                          sample_weight_mode='temporal')

            x = np.random.random((1, 3))
            y = np.random.random((1, 3, 3))
            model.train_on_batch(x, y)

            _, fname = tempfile.mkstemp('.h5')
            model.save(fname)

            custom_objects = {
                'TestOptimizer': lambda **kwargs: hvd.DistributedOptimizer(
                    TestOptimizer(**kwargs))
            }
            new_model = hvd.load_model(fname, custom_objects=custom_objects)
            new_opt = new_model.optimizer
            os.remove(fname)

            self.assertEqual(type(new_opt).__module__, 'horovod.keras')
            self.assertEqual(type(new_opt).__name__, 'TestOptimizer')
            self.assertEqual(K.get_value(opt.lr), K.get_value(new_opt.lr))
            self.assertEqual(len(opt.get_weights()), len(new_opt.get_weights()))
            for weights, new_weights in zip(opt.get_weights(),
                                            new_opt.get_weights()):
                self.assertListEqual(weights.tolist(), new_weights.tolist())
Example #12
    def test_horovod(self):
        import horovod.keras as hvd
        self.assertEqual(hvd.init(), 1)
        self.assertEqual(hvd.rank(), 0)
        print('\nNOTE: remember to also test horovod with a real script, for example')
        print('https://github.com/CSCfi/machine-learning-scripts/blob/master/examples/keras-dvc-cnn-simple-hvd.py')
Example #13
def main(use_horovod: bool, gpus: int, checkpoint: int,
         config_path: str) -> None:
    config = process_config(config_path, use_horovod, gpus, checkpoint)

    # create tensorflow session and set as keras backed
    tf_config = tf.ConfigProto()

    if config.trainer.use_horovod:
        import horovod.keras as hvd

        hvd.init()
        tf_config.gpu_options.allow_growth = True
        tf_config.gpu_options.visible_device_list = str(hvd.local_rank())

    is_master = not config.trainer.use_horovod
    if not is_master:
        import horovod.keras as hvd

        is_master = hvd.rank() == 0

    if is_master and not os.path.exists(config.exp.source_dir):
        # copy source files
        shutil.copytree(
            os.path.abspath(os.path.curdir),
            config.exp.source_dir,
            ignore=lambda src, names:
            {"datasets", "__pycache__", ".git", "experiments", "venv"})

    tf_sess = tf.Session(config=tf_config)
    K.set_session(tf_sess)
    data_loader = get_data_loader(config=config)

    model, trainer = build_model_and_trainer(config, data_loader)

    print(f"Start Training Experiment {config.exp.name}")
    try:
        trainer.train()
    except Exception as e:
        send_noti_to_telegram(
            f"an exception raised on training {config.exp.name}")
        raise e
Example #14
def setup_tf_config(config: DotMap):

    tf_config = tf.ConfigProto()

    if config.trainer.use_horovod:
        import horovod.keras as hvd

        hvd.init()
        tf_config.gpu_options.allow_growth = True
        tf_config.gpu_options.visible_device_list = str(hvd.local_rank())

    is_master = not config.trainer.use_horovod
    if not is_master:
        import horovod.keras as hvd

        is_master = hvd.rank() == 0

    tf_sess = tf.Session(config=tf_config)
    K.set_session(tf_sess)

    return is_master
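A sketch of how setup_tf_config might be used (config is the DotMap from the signature; config.trainer.use_horovod selects the branch):

is_master = setup_tf_config(config)
if is_master:
    # Mirror the hvd.rank() == 0 guards elsewhere: only rank 0 (or a
    # single-process run) should write checkpoints and logs.
    print('master process: safe to write checkpoints/logs')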
Example #15
    def train_model(self, epochs, batch_size, training_directory, test_directory, trained_model_filename, metrics_filename, binary=False, weights=None):
        # The metrics will be saved as a numpy array:
        # First row: validation (test) accuracy
        # Second row: validation (test) loss
        if binary:
            class_mode = 'binary'
        else:
            class_mode = 'categorical'

        hvd.init()
        callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
        self.load_model()
        training_datagen = ImageDataGenerator(rescale=1./255., horizontal_flip=True)
        training_generator = training_datagen.flow_from_directory(
            directory = training_directory,
            target_size = (299, 299),
            batch_size = batch_size,
            class_mode = class_mode)
        test_datagen = ImageDataGenerator(rescale=1./255., horizontal_flip=False)
        validation_test_generator = test_datagen.flow_from_directory(
            directory = test_directory,
            target_size = (299, 299),
            batch_size = 1,
            class_mode = class_mode)

        metrics = np.zeros((2, epochs))

        for epoch in range(epochs):
            self.model.fit_generator(generator=training_generator, callbacks=callbacks, steps_per_epoch=training_generator.n//batch_size, class_weight=weights)
            result = self.model.evaluate_generator(validation_test_generator)
            metrics[0, epoch] = result[1]
            metrics[1, epoch] = result[0]
            print('Validation Accuracy: ' + str(metrics[0, epoch]))
            print('Validation Loss: ' + str(metrics[1, epoch]))

        self.model.save(trained_model_filename)
        np.save(metrics_filename, metrics)
        self.model_filename = trained_model_filename
Example #16
def init(global_batch_size, max_gpu_batch_size, gpus=runai.utils.gpus.count()):
    if gpus < 1:
        raise ValueError('GPU count (%d) must be at least 1' % gpus)

    module = sys.modules[__name__]

    setattr(module, 'global_batch_size', global_batch_size)
    setattr(module, 'gpus', gpus)
    setattr(module, 'master', True)

    # TODO(levosos): support uneven dividing
    steps = max(1, global_batch_size //
                (max_gpu_batch_size * gpus))  # must be at least 1
    batch_size = global_batch_size // (steps * gpus)

    setattr(module, 'steps', steps)
    setattr(module, 'batch_size', batch_size)

    runai.utils.log.info(
        'Spreading global batch size %d across %d GPU(s) each with %d step(s) of batch size %d',
        global_batch_size, gpus, steps, batch_size)

    if gpus > 1:
        runai.utils.log.debug('Initializing Horovod')
        import horovod.keras as hvd
        hvd.init()
        setattr(module, 'master', hvd.local_rank() == 0)
        setattr(module, 'hvd',
                hvd)  # expose hvd on the module so Horovod is easily accessible elsewhere

        runai.utils.log.debug('Attaching Keras session to GPU #%d',
                              hvd.local_rank())
        import tensorflow
        config = tensorflow.ConfigProto()
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        import keras.backend
        keras.backend.set_session(
            tensorflow.Session(config=config)
        )  # TODO(levosos): support cases where configuration will be set afterwards
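A worked instance of the step/batch arithmetic above (values are illustrative, not from the source):

# global_batch_size=256, max_gpu_batch_size=32, gpus=4:
#   steps      = max(1, 256 // (32 * 4)) = 2
#   batch_size = 256 // (2 * 4)          = 32
# Each of the 4 GPUs thus accumulates 2 steps of batch 32 per global batch.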
Example #17
    def create_inception_model(self, number_categories, dense_layer_sizes, dropout_fraction, unfrozen_layers, focal_loss=False):
        hvd.init()
        config = tf.compat.v1.ConfigProto()
        config.gpu_options.visible_device_list = str(hvd.local_rank())
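        # NOTE: unlike the other examples, this ConfigProto is never passed to
        # a session (no tf.compat.v1.Session / set_session call), so the GPU
        # pinning above has no effect as written.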
        opt = hvd.DistributedOptimizer(tf.keras.optimizers.Adam(learning_rate=0.001*hvd.size()))
        model = InceptionV3(include_top=False, pooling='avg')
        output = model.outputs[0]

        for layer_size in dense_layer_sizes:
            dense = Dense(layer_size, activation='relu')(output)
            dropout = Dropout(dropout_fraction)(dense)
            output = BatchNormalization()(dropout)
        
        if number_categories == 1:
            output = Dense(1, activation='sigmoid')(output)
        else:
            output = Dense(number_categories, activation='softmax')(output)
        model = Model(inputs=model.inputs, outputs=output)

        for index in range(len(model.layers) - unfrozen_layers):
            model.layers[index].trainable = False

        if number_categories == 1:
            the_metrics = [metrics.binary_accuracy]
            if focal_loss:
                loss = customlosses.focal_binary_crossentropy
            else:
                loss = 'binary_crossentropy'
        else:
            the_metrics = [metrics.categorical_accuracy]
            if focal_loss:
                loss = customlosses.focal_categorical_crossentropy
            else:
                loss = 'categorical_crossentropy'

        model.compile(optimizer=opt, loss=loss, metrics=the_metrics)
        model.save(self.model_filename)
        self.model = model
Example #18
def init(global_batch_size, max_gpu_batch_size, gpus=None):
    # first of all calculate the number of GA steps and the batch size
    runai.elastic._init(global_batch_size, max_gpu_batch_size, gpus)

    # now use Horovod if needed
    if runai.elastic.gpus > 1:
        runai.utils.log.debug('Initializing Horovod')
        import horovod.keras as hvd
        hvd.init()

        setattr(runai.elastic, 'master', hvd.local_rank() == 0)
        setattr(runai.elastic, 'hvd',
                hvd)  # expose hvd on the module so Horovod is easily accessible elsewhere

        runai.utils.log.debug('Attaching Keras session to GPU #%d',
                              hvd.local_rank())
        import tensorflow
        config = tensorflow.ConfigProto()
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        import keras.backend
        keras.backend.set_session(
            tensorflow.Session(config=config)
        )  # TODO(levosos): support cases where configuration will be set afterwards
Example #19
def perform_setup(options):

    sys.setrecursionlimit(5000)

    if options.with_hvd:
        import horovod.keras as hvd
        hvd.init()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        if options.gpu > 1:
            devlist = '0'
            for i in range(1, options.gpu):
                devlist += ',' + str(i)
            config.gpu_options.visible_device_list = devlist
        else:
            config.gpu_options.visible_device_list = str(hvd.local_rank())
        K.set_session(tf.Session(config=config))

    global _globalnpfile
    global _globalexpectedpixel
    global IMG_DTYPE
    global SEG_DTYPE
    global _nx
    global _ny

    # raw dicom data is usually short int (2bytes) datatype
    # labels are usually uchar (1byte)
    IMG_DTYPE = np.int16
    SEG_DTYPE = np.uint8

    _globalnpfile = options.dbfile.replace('.csv',
                                           '%d.npy' % options.trainingresample)
    _globalexpectedpixel = 512
    _nx = options.trainingresample
    _ny = options.trainingresample

    return IMG_DTYPE, SEG_DTYPE, _globalnpfile, _globalexpectedpixel, _nx, _ny
Example #20
def main(args):
    if 'sourcedir.tar.gz' in args.tensorboard_dir:
        tensorboard_dir = re.sub('source/sourcedir.tar.gz', 'model',
                                 args.tensorboard_dir)
    else:
        tensorboard_dir = args.tensorboard_dir
    logging.info("Writing TensorBoard logs to {}".format(tensorboard_dir))

    if os.path.isdir(args.checkpoint_path):
        logging.info("Checkpointing directory {} exists".format(
            args.checkpoint_path))
    else:
        logging.info("Creating Checkpointing directory {}".format(
            args.checkpoint_path))
        os.mkdir(args.checkpoint_path)

    # Default to no MPI; hvd stays None so the later keras_model_fn(..., hvd)
    # call is safe even when the flag is present but disabled.
    mpi = False
    hvd = None
    if args.fw_params.get('sagemaker_mpi_enabled'):
        import horovod.keras as hvd
        mpi = True
        # Horovod: initialize Horovod.
        hvd.init()

        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        K.set_session(tf.Session(config=config))
    logging.info("Running with MPI={}".format(mpi))

    logging.info("getting data")
    train_dataset = train_input_fn()
    eval_dataset = eval_input_fn()
    validation_dataset = validation_input_fn()

    logging.info("configuring model")

    # Load model
    if not os.listdir(args.checkpoint_path):
        model = keras_model_fn(args.learning_rate, args.weight_decay,
                               args.optimizer, args.momentum, mpi, hvd)
        epoch_number = 0
    else:
        model, epoch_number = load_checkpoint_model(args.checkpoint_path)

    logging.info("Checkpointing to: {}".format(args.checkpoint_path))

    callbacks = []
    if mpi:
        callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
        callbacks.append(hvd.callbacks.MetricAverageCallback())
        callbacks.append(
            hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5,
                                                     verbose=1))
        callbacks.append(
            keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1))
        if hvd.rank() == 0:
            callbacks.append(
                ModelCheckpoint(args.checkpoint_path +
                                '/checkpoint-{epoch}.h5'))
            callbacks.append(
                TensorBoard(log_dir=tensorboard_dir, update_freq='epoch'))
    else:
        callbacks.append(
            keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1))
        callbacks.append(
            ModelCheckpoint(args.checkpoint_path + '/checkpoint-{epoch}.h5'))
        callbacks.append(
            TensorBoard(log_dir=tensorboard_dir, update_freq='epoch'))

    logging.info("Starting training")
    size = 1
    if mpi:
        size = hvd.size()

    model.fit(
        x=train_dataset[0],
        y=train_dataset[1],
        steps_per_epoch=(num_examples_per_epoch('train') // args.batch_size) //
        size,
        epochs=args.epochs,
        initial_epoch=epoch_number,
        validation_data=validation_dataset,
        validation_steps=(num_examples_per_epoch('validation') //
                          args.batch_size) // size,
        callbacks=callbacks)

    score = model.evaluate(eval_dataset[0],
                           eval_dataset[1],
                           steps=num_examples_per_epoch('eval') //
                           args.batch_size,
                           verbose=0)

    logging.info('Test loss:{}'.format(score[0]))
    logging.info('Test accuracy:{}'.format(score[1]))

    # Horovod: Save model only on worker 0 (i.e. master)
    if mpi:
        if hvd.rank() == 0:
            save_model(model, args.model_output_dir)
    else:
        save_model(model, args.model_output_dir)
Example #21
from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
import math
import tensorflow as tf
import horovod.keras as hvd

# Horovod: initialize Horovod.
hvd.init()

# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
K.set_session(tf.Session(config=config))

batch_size = 128
num_classes = 10

# Horovod: adjust number of epochs based on number of GPUs.
epochs = int(math.ceil(12.0 / hvd.size()))

# Input image dimensions
img_rows, img_cols = 28, 28

# The data, shuffled and split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()
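For the epoch scaling above, the total work stays roughly the 12 single-GPU epochs; for example:

# hvd.size() == 4  ->  epochs = int(math.ceil(12.0 / 4)) = 3
# hvd.size() == 8  ->  epochs = int(math.ceil(12.0 / 8)) = 2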
Example #22
def train():

    print('-' * 30)
    print('Loading train data...')
    print('-' * 30)

    x_train = np.load(os.path.join(paths['main_path'], 'data', 'x_train.npy'))
    y_train = np.load(os.path.join(paths['main_path'], 'data', 'y_train.npy'))
    x_val = np.load(os.path.join(paths['main_path'], 'data', 'x_val.npy'))
    y_val = np.load(os.path.join(paths['main_path'], 'data', 'y_val.npy'))

    indexes = np.arange(x_train.shape[0])
    np.random.shuffle(indexes)

    x_train = x_train[indexes]
    y_train = y_train[indexes]

    indexes = np.arange(x_val.shape[0])
    np.random.shuffle(indexes)

    x_val = x_val[indexes]
    y_val = y_val[indexes]

    if image_params['samplewise_intensity_normalization'] == True:
        x_train = samplewise_intensity_normalization(x_train)
        x_val = samplewise_intensity_normalization(x_val)

    if image_params['featurewise_normalization'] == True:
        x_train = featurewise_normalization(x_train)
        x_val = featurewise_normalization(x_val)

    x_train = x_train[..., np.newaxis]
    x_val = x_val[..., np.newaxis]

    if not os.path.isdir('training'):
        os.mkdir('training')

    print('-' * 30)
    print('Creating and compiling model...')
    print('-' * 30)

    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    # Horovod: adjust number of epochs based on number of GPUs.
    epochs = int(math.ceil(12.0 / hvd.size()))

    # Horovod: adjust learning rate based on number of GPUs.
    opt = Adadelta(1.0 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    data_generator_train = data_generator(x_train, y_train,
                                          augmentation_params,
                                          training_params['batch_size'])

    print('-' * 30)
    print('Fitting model...')
    print('-' * 30)
    model = classifier()
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(
            ModelCheckpoint(filepath=(os.path.join(paths['main_path'],
                                                   'training',
                                                   'classifier.hdf5')),
                            monitor='val_loss',
                            save_best_only=True))

    model.fit_generator(generator=data_generator_train,
                        steps_per_epoch=x_train.shape[0] //
                        training_params['batch_size'],
                        epochs=epochs,
                        verbose=1,
                        callbacks=callbacks,
                        validation_data=(x_val, y_val))
Example #23
def main():

    train_label = FLAGS.train_label
    validation_label = FLAGS.validation_label
    labels, training_files, validation_files = load_train_valid_labels(
        train_label, validation_label)

    hvd.init()

    np.random.seed(hvd.rank())

    # Horovod: print logs on the first worker.
    verbose = 2 if hvd.rank() == 0 else 0

    print("Running with the following config:")
    for item in FLAGS.__dict__.items():
        print('%s = %s' % (item[0], str(item[1])))

    base_model = DenseNet121(include_top=False,
                             weights='imagenet',
                             input_shape=(FLAGS.image_size, FLAGS.image_size,
                                          3))
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    predictions = Dense(14, activation='sigmoid', bias_initializer='ones')(x)
    model = Model(inputs=base_model.input, outputs=predictions)

    if FLAGS.opt == 'adam':
        opt = optimizers.Adam(lr=FLAGS.lr)
    elif FLAGS.opt == 'sgd':
        opt = optimizers.SGD(lr=FLAGS.lr,
                             momentum=FLAGS.momentum,
                             nesterov=FLAGS.nesterov)
    elif FLAGS.opt == 'rmsprop':
        opt = optimizers.RMSprop(lr=FLAGS.lr)
    elif FLAGS.opt == 'adagrad':
        opt = optimizers.Adagrad(lr=FLAGS.lr)
    elif FLAGS.opt == 'adadelta':
        opt = optimizers.Adadelta(lr=FLAGS.lr)
    elif FLAGS.opt == 'adamax':
        opt = optimizers.Adamax(lr=FLAGS.lr)
    elif FLAGS.opt == 'nadam':
        opt = optimizers.Nadam(lr=FLAGS.lr)
    else:
        print("No optimizer selected. Using Adam.")
        opt = optimizers.Adam(lr=FLAGS.lr)

    hvd_opt = hvd.DistributedOptimizer(opt)

    model.compile(loss='binary_crossentropy',
                  optimizer=hvd_opt,
                  metrics=['accuracy'])
    # Path to weights file
    weights_file = FLAGS.model_dir + '/lr_{:.3f}_bz_{:d}'.format(
        FLAGS.lr,
        FLAGS.batch_size) + '_loss_{val_loss:.3f}_epoch_{epoch:02d}.h5'

    # Callbacks
    steps_per_epoch = 77871 // FLAGS.batch_size
    val_steps = 8653 // FLAGS.batch_size
    lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.1,
                                   epsilon=0.01,
                                   cooldown=0,
                                   patience=1,
                                   min_lr=1e-15,
                                   verbose=2)
    auc = AucRoc(val_generator(val_steps, validation_files, labels), val_steps)
    model_checkpoint = ModelCheckpoint(weights_file,
                                       monitor="val_loss",
                                       save_best_only=True,
                                       save_weights_only=True,
                                       verbose=2)

    callbacks = [
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        hvd.callbacks.MetricAverageCallback(),
        keras.callbacks.TensorBoard(log_dir='./logs',
                                    histogram_freq=0,
                                    batch_size=64), lr_reducer
    ]

    if hvd.rank() == 0:
        #callbacks.append(auc)
        callbacks.append(model_checkpoint)

    start_time = time.time()
    # specify training params and start training
    model.fit_generator(train_generator(steps_per_epoch // hvd.size(),
                                        training_files, labels),
                        steps_per_epoch=steps_per_epoch // hvd.size(),
                        epochs=FLAGS.epochs,
                        validation_data=val_generator(
                            3 * val_steps // hvd.size(), validation_files,
                            labels),
                        validation_steps=3 * val_steps // hvd.size(),
                        callbacks=callbacks,
                        verbose=verbose)
    end_time = time.time()
    print("start time: {} , end time: {} , elapsed time: {}".format(
        start_time, end_time, end_time - start_time))
Example #24
    def initialize(self):
        tf.keras.backend.set_image_data_format('channels_last')

        if self.distributed_training is True:
            try:
                import horovod.tensorflow.keras as hvd
                # initialize horovod
                hvd.init()
                if hvd.rank() == 0:
                    # Create logger
                    self.logger = logging.getLogger('DeepGalaxyTrain')
                    self.logger.setLevel(self.log_level)
                    self.logger.addHandler(
                        logging.FileHandler('train_log.txt'))
                    self.logger.info('Parallel training enabled.')
                    self.logger.info(
                        'batch_size = %d, global_batch_size = %d, num_workers = %d\n'
                        % (self.batch_size, self.batch_size * hvd.size(),
                           hvd.size()))

                # Map an MPI process to a GPU (Important!)
                print('hvd_rank = %d, hvd_local_rank = %d' %
                      (hvd.rank(), hvd.local_rank()))
                if hvd.rank() == 0:
                    self.logger.info('hvd_rank = %d, hvd_local_rank = %d' %
                                     (hvd.rank(), hvd.local_rank()))

                # Add callbacks
                self.callbacks.append(
                    hvd.callbacks.BroadcastGlobalVariablesCallback(0))
                self.callbacks.append(hvd.callbacks.MetricAverageCallback())
                self.callbacks.append(DataReshuffleCallback(self))

                # Configure GPUs (if any)
                gpus = tf.config.experimental.list_physical_devices('GPU')

                if hvd.local_rank() < len(gpus):
                    gpu = gpus[hvd.local_rank()]

                    tf.config.experimental.set_memory_growth(
                        gpu, self._gpu_memory_allow_growth)
                    tf.config.experimental.set_visible_devices(gpu, 'GPU')
                if self._gpu_memory_fraction is not None:
                    config = tf.compat.v1.ConfigProto()
                    config.gpu_options.per_process_gpu_memory_fraction = self._gpu_memory_fraction
                    session = tf.compat.v1.InteractiveSession(config=config)

            except ImportError as identifier:
                print(
                    'Error importing horovod. Disabling distributed training.')
                self.distributed_training = False
                self.logger = logging.getLogger('DeepGalaxyTrain')
                self.logger.setLevel(self.log_level)
                self.logger.addHandler(logging.FileHandler('train_log.txt'))
                self.logger.info('Parallel training disabled.')
                self.logger.info('Batch_size = %d' % (self.batch_size))
        else:
            # Create logger
            self.logger = logging.getLogger('DeepGalaxyTrain')
            self.logger.setLevel(self.log_level)
            self.logger.addHandler(logging.FileHandler('train_log.txt'))
            self.logger.info('Parallel training disabled.')
            self.logger.info('Batch_size = %d' % (self.batch_size))
Example #25
def main(data_dir, model_dir, batch_size, epochs, learning_rate, data_augmentation, verbose):
    num_classes = 10
    model_name = 'keras_cifar10_trained_model.h5'

    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    # The data, split between train and test sets:
    (x_train, y_train), (x_test, y_test) = load_cifar10_data(data_dir)

    print('x_train shape:', x_train.shape)
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # Convert class vectors to binary class matrices.
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    input_shape = x_train.shape[1:]
    model = create_cnn_model(num_classes, input_shape, learning_rate)

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    ]
    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(keras.callbacks.ModelCheckpoint(data_dir + '/logs/checkpoint-{epoch}.h5'))
        callbacks.append(keras.callbacks.TensorBoard(data_dir + '/logs'))

    if not data_augmentation:
        print('Not using data augmentation.')
        model.fit(x_train, y_train, # TODO: add fit generator so you don't need all the data
                batch_size=batch_size,
                epochs=epochs,
                validation_data=(x_test, y_test),
                shuffle=True,
                callbacks=callbacks)
    else:
        print('Using real-time data augmentation.')
        # This will do preprocessing and realtime data augmentation:
        datagen = ImageDataGenerator(
            featurewise_center=False,  # set input mean to 0 over the dataset
            samplewise_center=False,  # set each sample mean to 0
            featurewise_std_normalization=False,  # divide inputs by std of the dataset
            samplewise_std_normalization=False,  # divide each input by its std
            zca_whitening=False,  # apply ZCA whitening
            rotation_range=0,  # randomly rotate images in the range (degrees, 0 to 180)
            width_shift_range=0.1,  # randomly shift images horizontally (fraction of total width)
            height_shift_range=0.1,  # randomly shift images vertically (fraction of total height)
            horizontal_flip=True,  # randomly flip images
            vertical_flip=False)  # randomly flip images

        # Compute quantities required for feature-wise normalization
        # (std, mean, and principal components if ZCA whitening is applied).
        datagen.fit(x_train)

        # Fit the model on the batches generated by datagen.flow().
        model.fit_generator(datagen.flow(x_train, y_train,
                                        batch_size=batch_size),
                            epochs=epochs,
                            validation_data=(x_test, y_test),
                            #workers=4,
                            callbacks = callbacks)

    # Save model and weights
    if hvd.rank() == 0:
        if not os.path.isdir(model_dir):
            os.makedirs(model_dir)
        model_path = model_dir + '/' + model_name

        print('Saving trained model at %s ' % model_path)
        model.save(model_path)

        # Score trained model.
        scores = model.evaluate(x_test, y_test, verbose=verbose)

        if verbose:
            print('Test loss:', scores[0])
            print('Test accuracy:', scores[1])
Example #26
def init_workers(distributed=False):
    rank, n_ranks = 0, 1
    if distributed:
        hvd.init()
        rank, n_ranks = hvd.rank(), hvd.size()
    return rank, n_ranks
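A minimal usage sketch for init_workers (hvd here is horovod.keras, as in the surrounding examples):

rank, n_ranks = init_workers(distributed=True)
if rank == 0:
    print('initialized %d ranks' % n_ranks)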
Example #27
    def test_load_model_broadcast(self):
        hvd.init()

        def create_model():
            opt = keras.optimizers.SGD(lr=0.01 * hvd.size(), momentum=0.9)
            opt = hvd.DistributedOptimizer(opt)

            model = keras.models.Sequential()
            model.add(keras.layers.Dense(2, input_shape=(3,)))
            model.add(keras.layers.RepeatVector(3))
            model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
            model.compile(loss=keras.losses.MSE,
                          optimizer=opt,
                          metrics=[keras.metrics.categorical_accuracy],
                          sample_weight_mode='temporal')

            return model

        with self.test_session() as sess:
            K.set_session(sess)

            model = create_model()

            x = np.random.random((1, 3))
            y = np.random.random((1, 3, 3))
            model.train_on_batch(x, y)

            if hvd.rank() == 0:
                _, fname = tempfile.mkstemp('.h5')
                model.save(fname)

        K.clear_session()
        with self.test_session() as sess:
            K.set_session(sess)

            if hvd.rank() == 0:
                model = hvd.load_model(fname)
                os.remove(fname)
            else:
                model = create_model()

            def generator():
                while 1:
                    yield (x, y)

            if hvd.rank() == 0:
                self.assertEqual(len(model.optimizer.weights), 5)
            else:
                self.assertEqual(len(model.optimizer.weights), 0)

            # No assertions, we just need to verify that it doesn't hang
            callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
            model.fit_generator(generator(),
                                steps_per_epoch=10,
                                callbacks=callbacks,
                                epochs=0,
                                verbose=0,
                                workers=4,
                                initial_epoch=1)

            self.assertEqual(len(model.optimizer.weights), 5)
Example #28
def train_hvd(modelCode,model, trainMap,val_df,mode,tf,learning_rate,min_max_scaler,isBinary,old_weights,startSet,startEpoch):
  tensor_board = TensorBoard(log_dir=tfb_log_dir, histogram_freq=1, write_graph=True, write_images=True)
  #isBinary = True
  
  if isBinary:
    classType = "BINARY"
    targetColName = "LABEL1"
  else:
    classType = "MULTI"
    targetColName = "LABEL2"
    
  if mode=="HRV":
    # Horovod: initialize Horovod.
    hvd.init()
    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

  #   with tf.Graph().as_default():
  #     config = tf.ConfigProto(allow_soft_placement=True)
  #     config.gpu_options.visible_device_list = '0'

    K.set_session(tf.Session(config=config))

    # Horovod: adjust learning rate based on number of GPUs.
    # (named v_optimizer so the compile() calls further down work in HRV mode too)
    v_optimizer = keras.optimizers.Adadelta(learning_rate * hvd.size())
    # Horovod: Wrap optimizer with Horovod DistributedOptimizer.
    v_optimizer = hvd.DistributedOptimizer(v_optimizer)
  
    # Horovod: Broadcast initial variable states from rank 0
    # to all other processes. This is necessary to ensure 
    # consistent initialization of all workers when training is
    # started with random weights or restored from a checkpoint.
    tensor_board = TensorBoard(log_dir=tfb_log_dir, histogram_freq=1, write_graph=True, write_images=True)
    callbacks = [tensor_board, hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
  
  
    #modelNameList = ["buildBinaryClassModel","buildMultipleClassModel","buildMultiAttentionModel"]
    #modelCodeMap = {"LSTM":"buildBinaryClassModel", "BDLSTM":"buildMultipleClassModel","BDLSTM_ATTEN":"buildMultiAttentionModel"}
  else:
    v_optimizer = keras.optimizers.Adam(lr=learning_rate)
    #v_optimizer = keras.optimizers.RMSprop(lr=learning_rate)
    #v_optimizer = keras.optimizers.SGD(lr=learning_rate, clipvalue=1)
    #v_optimizer =  keras.optimizers.SGD(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
    callbacks = [tensor_board]
  
  print("Start Train Model ",mode)
  
  cLoop = 1
  #trainedSets = {}
  test_df={}
  resultMetric = {}
  score = 0
  rolling_win_size = 60
  isNotSatisfacEval = True
  nEpochs = 10
  lossOptimal = False
  history =""
  score = 0
  cvscores = []
  curVLoss = 0.001
  result = []
  maxBatch = 10
  
  val_seq_array, val_label_array = gen_data_test_val(targetColName, val_df,sequence_length, sequence_cols)
  #val_seq_array, val_label_array, nb_features, nb_out = gen_data_train_val(targetColName, val_df,sequence_length, sequence_cols)
  
  ## Multiple Classifications
  #val_label_array = to_categorical(val_label_array, num_classes=3, dtype='int32')
    
  for nEpoch in range(nEpochs):
    countTrainSet = 1
    trainDataSetKeys = trainMap.keys()
    
    #Hyperparameters
    v_batch_size = 200
    v_validation_split = 0.05
    v_verbose = 2
    
    #verbose: Integer. 0, 1, or 2. Verbosity mode. 0 = silent, 1 = progress bar, 2 = one line per epoch
    
    v_LSTMUnitLayer1 = 150
    v_LSTMUnitLayer2 = 60
    v_LSTMUnitLayer3 = 30
    
    v_Dropout = 0.2
    v_maxEpoch = 1
    scores_test = []
    
    for trainKey in trainDataSetKeys:
      if (trainKey>=startSet and nEpoch>=startEpoch):
        if isNotSatisfacEval is True:
          print("Starting Loop (cLoop) : ",str(cLoop))
          print("Train model using dataset {",str(trainKey),"}")
          isTrainSet = True
          train_df_new = getDataFromCSV(sqlContext, dbFSDir,trainMap[trainKey], selectedCols,isTrainSet,isBinary)
          
          ##Correct Sample Labels
          train_df_new = genSampleLabel(train_df_new)
          ##train_df = train_df.append(train_df)
          train_df_new = train_df_new.sort_values(['CODE','YEAR','EVENT_ID','CYCLE'])
          train_df_new = add_features(train_df_new, rolling_win_size , sensor_cols)
          train_df_new = train_df_new.drop(columns=columns_to_drop)
          #train_df_new,min_max_scaler = normalizeMaxMinTrain(train_df_new,min_max_scaler)
          train_df_new = train_df_new.sort_values(['EVENT_ID','CYCLE'])
          train_df_new = train_df_new.drop_duplicates(['EVENT_ID','CYCLE'], keep='last')
          
          printDFPortion(train_df_new, val_df, targetColName)
          seq_array, label_array, nb_features, nb_out = gen_data_train_val(targetColName, train_df_new,sequence_length, sequence_cols)
#           print("Finish Gen Train Data Sequence")
#           print("Finish Gen Validate Data Sequence")
          
          # Horovod: save checkpoints only on worker 0 to prevent other
          # workers from overwriting and corrupting them.
          ###checkpoint_dir = dataLake
          
          if mode=="HRV":
            if hvd.rank() == 0:
              callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint_dir+ str(cLoop)+'checkpoint-hvd.hdf5', save_weights_only=True))
          
          original_label_array = label_array
          
          #Multiple Classification
          #label_array = to_categorical(label_array, num_classes=3, dtype='int32')
#          val_label_array = to_categorical(val_label_array, num_classes=3, dtype='int32')
          nb_classes=label_array.shape[1]
          vb_classes=val_label_array.shape[1]          
#           print("label_array : nb_classes: ",nb_classes)
#           print("val_label_array : vb_classes: ",vb_classes)

          if len(old_weights)==0 and classType=="MULTI":
            model.add(Bidirectional(LSTM(units=v_LSTMUnitLayer1, return_sequences=True),input_shape=(sequence_length, nb_features),merge_mode='concat'))
#             print("Created Bidirectional 1")
            model.add(Dropout(v_Dropout))
            model.add(Bidirectional(LSTM(units=v_LSTMUnitLayer2,return_sequences=True)))
#             print("Created Bidirectional 2")
            model.add(Dropout(v_Dropout))
            model.add(Bidirectional(LSTM(units=v_LSTMUnitLayer3,return_sequences=False)))
#             print("Created Bidirectional 3")
            model.add(Dropout(v_Dropout))
            model.add(Dense(units=nb_classes,activation='softmax'))
          elif len(old_weights)==0 and classType=="BINARY":
            model.add(Bidirectional(LSTM(units=v_LSTMUnitLayer1, return_sequences=True),input_shape=(sequence_length, nb_features),merge_mode='concat'))
            model.add(Dropout(v_Dropout))
#             print("Created Bidirectional 1")
            model.add(Bidirectional(LSTM(units=v_LSTMUnitLayer2,return_sequences=False)))
            model.add(Dropout(v_Dropout))
#             print("Created Bidirectional 2")
            model.add(Dense(units=nb_out, activation='sigmoid'))
            print("nb_out:",nb_out)
          else:
            print("Model Already Constructed.")
          try:
            
            if old_weights!="":
              model.set_weights(old_weights)
              print("Reset weights successfully.")
              
          except:
            print("Failed reset weights.")
            pass
          
#           try:
#             model = multi_gpu_model(model,gpus=2)
#             print("Training using multiple GPUs..")
#           except:
#             print("Training using single GPU or CPU..")
#             pass
          
          if nb_classes>2:
            model.compile(loss='categorical_crossentropy', optimizer=v_optimizer, metrics=['accuracy'])
            print("set loss: categorical_crossentropy ")
          else:
            model.compile(loss='binary_crossentropy', optimizer=v_optimizer, metrics=['accuracy'])
            print("set loss: binary_crossentropy ")
            
          print(model.summary())
          
          processCode = str(cLoop)+"_R_"+str(trainKey)
          
          if mode=="HRV":
            if hvd.rank() == 0:
                callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint_dir + '/'+processCode+'_checkpoint-{epoch}.h5'))
          
          ### Utilizing Horovod
          history = model.fit(seq_array, label_array,
                              batch_size=v_batch_size,
                              epochs=v_maxEpoch,
                              verbose=2,
                              # validation_data=(val_seq_array, val_label_array),
                              validation_split=v_validation_split,
                              callbacks=callbacks)
          
              
          try:
            old_weights = model.get_weights()
            # evaluate the model
          except:
            print("Error get_weights !")
          
          # list all data in history
          print(history.history.keys())
          
            
          #val_seq_array, val_label_array = gen_data_test_val(targetColName, val_df,sequence_length, sequence_cols)
          #val_label_array = to_categorical(val_label_array, num_classes=3, dtype='int32')
          # cm,precision_test,recall_test,f1_test, y_true_label, y_predicted = evaluationMetrics(val_seq_array,val_label_array,isBinary,model)
          
          #printProb(model,val_seq_array , val_label_array)
          
          try:
            
            #cm,precision_test,recall_test,f1_test, y_true_label, y_predicted, y_pred_prop, y_pred_prob_thrldeshod = evaluationMetrics(val_seq_array,val_label_array,isBinary,model)
            cm,precision_test,recall_test,f1_test, y_true_label, y_pred_class, y_pred_prop, y_pred_prob_threshold = evaluationMetrics(val_seq_array,val_label_array,isBinary,model)
            
          except:
            precision_test = 0
            recall_test = 0
            f1_test=0
            print("Error in evaluation performance [evaluationMetrics]!")
            #return model
            pass
          
          if len(old_weights)==0:
              print("Error Empty Weights!!")
          else:
              print("Has weights!!")
          
          if mode!="HRV":
            try:
              currentModelPath = processCode + "_"+model_path
              print("Trying to save model : "+currentModelPath)
              model.save(currentModelPath)

              try:
                fromPath = "file:/databricks/driver/"+currentModelPath
                print("Copying file [",fromPath,"] to Data Lake....")
                copyData(fromPath, dataLake+"/model",False)
                
              except:
                print("Error while trying to transfer file "+"file:/databricks/driver/"+currentModelPath," to ",dataLake+"/model")
              print("Model Saved >> ",currentModelPath)
            
            except:
              print("Error Saving Model",currentModelPath)
              pass
          
          try:
            lossOptimal, score, result, curVLoss = isOptimal(history,countTrainSet,score,curVLoss,nEpoch)
            #resultMetric[cLoop] = [cLoop, processCode] + result
            resultMetric[cLoop] = [cLoop, processCode] + result + [precision_test,recall_test,f1_test]
            print(resultMetric)
            saveFileToDataLake(resultMetric)
          except:
            print("Erro write metric file.")
            pass
            
          if lossOptimal is False:
            countTrainSet=countTrainSet+1
          else:
            break
          cLoop = cLoop+1
        else:
          print("Skip DataSet:",trainKey)
      else:
        print("Train and evaluation is satisfactory!")
        break
  return model


def main(args):
    # Initialize Horovod.
    hvd.init()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))
    
    fold = args.data_path.split("fold_")[1]
    if hvd.rank()==0:
        print("================================")
        if args.use_lovasz:
            print("Fine tuning with Lovasz loss")
        print("Fold {}".format(fold))
        
    # Find the best saved model
    best_model_file = 'weights/{}/fold_{}_{epoch}_best.h5'.format(args.model, fold, epoch='{epoch}')
    resume_from_epoch = 0
    for try_epoch in range(args.epochs, 0, -1):
        if os.path.exists(best_model_file.format(epoch=try_epoch)):
            resume_from_epoch = try_epoch
            break
    if hvd.rank()==0:
        print("Last model saved: {}".format(best_model_file.format(epoch=resume_from_epoch)))
    resume_from_epoch = hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch')
    # Verbose mode on the first worker only
    if hvd.rank()==0:
        verbose = 1
    else:
        verbose = 0
   
    # Create the dataset
    
    dataset = TGSDataset(data_path=args.data_path, batch_size=args.batch_size)
    input_shape = (args.target_size, args.target_size)
    mask_shape = (101, 101)
    train_data_generator = dataset.get_train_data_generator(input_size=input_shape, mask_size=mask_shape, seed=np.random.rand())
    val_data_generator = dataset.get_val_data_generator(input_size=input_shape, mask_size=mask_shape, seed=np.random.rand())
    train_step_size = dataset.train_step_size // hvd.size()
    val_step_size = dataset.val_step_size // hvd.size()
    # Create the model
    model = make_model(args.model, (args.target_size, args.target_size, 3), 2)

    # Load weights from the last checkpoint, if any
    if resume_from_epoch > 0:
        model.load_weights(best_model_file.format(epoch=resume_from_epoch))
        
    size = hvd.size()
    opt = hvd.DistributedOptimizer(SGD(lr=args.learning_rate * size, momentum=0.9, nesterov=True))
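    # Horovod: the base learning rate is scaled by the worker count; e.g. with
    # 4 processes and --learning_rate 0.01 the effective rate is 0.04, which
    # the LearningRateWarmupCallback below ramps up to over the warmup epochs.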

    # Loss: Lovasz loss when fine-tuning, otherwise binary cross-entropy
    loss = losses.c_lovasz_loss if args.use_lovasz else losses.c_binary_crossentropy
    
    model.compile(loss=loss,
                  optimizer=opt,
                  metrics=[metrics.c_binary_accuracy, metrics.c_iou])

    # Checkpoint the best model (by val_loss) to an .h5 file
    best_model = ModelCheckpointMGPU(model, filepath=best_model_file, monitor='val_loss',
                                     verbose=1,
                                     mode='min',
                                     period=1,
                                     save_best_only=True,
                                     save_weights_only=True)
    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),

        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard, or other metrics-based callbacks.
        hvd.callbacks.MetricAverageCallback(),

        # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
        # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
        # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=args.warmup_epochs, verbose=True)
    ]

    # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(keras.callbacks.TensorBoard(args.log_dir))
        callbacks.append(best_model)
    
    # Fit the model
    history = model.fit_generator(train_data_generator,
                        steps_per_epoch=train_step_size,
                        callbacks=callbacks,
                        epochs=args.epochs,
                        verbose=verbose,
                        workers=4,
                        initial_epoch=resume_from_epoch,
                        validation_data=val_data_generator,
                        validation_steps=val_step_size)
  

    score = hvd.allreduce(model.evaluate_generator(val_data_generator, val_step_size, workers=4))
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
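
A script like this is usually launched with one MPI process per GPU. A minimal
launch sketch (the script name, model name, and data paths are hypothetical;
the flags are assumed to mirror the args attributes used above):

# 4 workers on one machine, training fold 0
horovodrun -np 4 python train.py \
    --data_path data/fold_0 \
    --model resnet34 \
    --batch_size 16 \
    --epochs 100 \
    --learning_rate 0.001 \
    --warmup_epochs 5 \
    --log_dir logs/fold_0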
Example #30
    def test_load_model_broadcast(self):
        hvd.init()

        def create_model():
            opt = keras.optimizers.SGD(lr=0.01 * hvd.size(), momentum=0.9)
            opt = hvd.DistributedOptimizer(opt)

            model = keras.models.Sequential()
            model.add(keras.layers.Dense(2, input_shape=(3, )))
            model.add(keras.layers.RepeatVector(3))
            model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
            model.compile(loss=keras.losses.MSE,
                          optimizer=opt,
                          metrics=[keras.metrics.categorical_accuracy],
                          sample_weight_mode='temporal')

            return model

        with self.test_session() as sess:
            K.set_session(sess)

            model = create_model()

            x = np.random.random((1, 3))
            y = np.random.random((1, 3, 3))
            model.train_on_batch(x, y)

            if hvd.rank() == 0:
                _, fname = tempfile.mkstemp('.h5')
                model.save(fname)

        K.clear_session()
        with self.test_session() as sess:
            K.set_session(sess)

            if hvd.rank() == 0:
                model = hvd.load_model(fname)
                os.remove(fname)
            else:
                model = create_model()

            def generator():
                while 1:
                    yield (x, y)

            if hvd.rank() == 0:
                self.assertEqual(len(model.optimizer.weights), 5)
            else:
                self.assertEqual(len(model.optimizer.weights), 0)

            # No assertions, we just need to verify that it doesn't hang
            callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
            model.fit_generator(generator(),
                                steps_per_epoch=10,
                                callbacks=callbacks,
                                epochs=0,
                                verbose=0,
                                workers=4,
                                initial_epoch=1)

            self.assertEqual(len(model.optimizer.weights), 5)
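
(Note: hvd.load_model restores the saved model with its optimizer re-wrapped in
hvd.DistributedOptimizer, so rank 0 has the restored optimizer slot weights
immediately after loading, while the other ranks compile a fresh model and pick
up the broadcast state during the first fit_generator call.)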
Example #31
def main():
    """Main function"""

    # Initialize horovod
    hvd.init()

    # Parse the command line
    args = parse_args()

    # Setup logging
    log_format = '%(asctime)s %(levelname)s %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_format)
    logging.info('Initializing')
    if args.show_config:
        logging.info('Command line config: %s' % args)

    logging.info('MPI rank %i, local rank %i, host %s' %
                 (hvd.rank(), hvd.local_rank(), socket.gethostname()))

    # Load configuration file
    with open(args.config) as f:
        config = yaml.safe_load(f)
    logging.info('Configuration: %s' % config)

    # Load the data files
    train_data, valid_data, test_data = load_dataset(**config['data_config'])
    train_input, train_labels, train_weights = train_data
    valid_input, valid_labels, valid_weights = valid_data
    test_input, test_labels, test_weights = test_data
    logging.info('train shape: %s Mean label %s' %
                 (train_input.shape, train_labels.mean()))
    logging.info('valid shape: %s Mean label %s' %
                 (valid_input.shape, valid_labels.mean()))
    logging.info('test shape:  %s Mean label %s' %
                 (test_input.shape, test_labels.mean()))

    # Configure the session (e.g. thread settings)
    keras.backend.set_session(configure_session(**config['session_config']))

    # Scale the learning rate
    model_config = config['model_config']
    if model_config.pop('scale_learning_rate'):
        model_config['learning_rate'] = model_config['learning_rate'] * hvd.size()
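    # e.g. with learning_rate=0.001 and 8 workers the scaled rate is 0.008,
    # keeping the per-sample update magnitude comparable to a single worker.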

    # Build the model
    logging.info(config)
    model = build_model(train_input.shape[1:],
                        use_horovod=True,
                        **model_config)
    if hvd.rank() == 0:
        model.summary()

    # Training hooks
    callbacks = []

    # Horovod model synchronization during initialization
    callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))

    # Model checkpointing
    if hvd.rank() == 0:
        checkpoint_file = os.path.expandvars(config['checkpoint_file'])
        os.makedirs(os.path.dirname(checkpoint_file), exist_ok=True)
        callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint_file))

    # Batch size
    training_config = config['training_config']
    bsize = training_config['batch_size']
    per_node = training_config.pop('batch_size_per_node')
    training_config['batch_size'] = bsize if per_node else (bsize // hvd.size())
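    # e.g. a global batch_size of 512 across 4 workers gives 128 per worker
    # when batch_size_per_node is false; when true, every worker uses 512.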

    # Run the training
    logging.info('Final training config: %s' % training_config)
    history = model.fit(x=train_input,
                        y=train_labels,
                        validation_data=(valid_input, valid_labels),
                        callbacks=callbacks,
                        verbose=2,
                        **training_config)

    # Evaluate on the test set
    test_loss, test_acc = model.evaluate(test_input, test_labels, verbose=2)
    logging.info('Test loss:     %g' % test_loss)
    logging.info('Test accuracy: %g' % test_acc)

    # Drop to IPython interactive shell
    if args.interactive:
        logging.info('Starting IPython interactive session')
        import IPython
        IPython.embed()

    logging.info('All done!')
Example #32
import datetime
import os
from argparser import args
import numpy as np

import tensorflow as tf

if args.keras_api:
    import keras as K
else:
    from tensorflow import keras as K

CHANNELS_LAST = True

import horovod.keras as hvd  # assumed import: hvd is used below but never imported in this excerpt

hvd.init()

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"  # Get rid of the AVX, SSE warnings
os.environ["OMP_NUM_THREADS"] = str(args.intraop_threads)
os.environ["KMP_BLOCKTIME"] = str(args.blocktime)
os.environ["KMP_AFFINITY"] = "granularity=thread,compact,1,0"

if hvd.rank() == 0:  # Only print on worker 0
    print_summary = args.print_model
    verbose = 1
    # os.system("lscpu")
    #os.system("uname -a")
    print("TensorFlow version: {}".format(tf.__version__))
    print("Intel MKL-DNN is enabled = {}".format(
        tf.pywrap_tensorflow.IsMklEnabled()))
    print("Keras API version: {}".format(K.__version__))


def main(args):
    mpi = False
    if 'sourcedir.tar.gz' in args.tensorboard_dir:
        tensorboard_dir = re.sub('source/sourcedir.tar.gz', 'output',
                                 args.tensorboard_dir)
    else:
        tensorboard_dir = args.tensorboard_dir
    logging.info("Writing TensorBoard logs to {}".format(tensorboard_dir))
    if 'sagemaker_mpi_enabled' in args.fw_params:
        if args.fw_params['sagemaker_mpi_enabled']:
            import horovod.keras as hvd
            mpi = True
            # Horovod: initialize Horovod.
            hvd.init()

            # Horovod: pin GPU to be used to process local rank (one GPU per process)
            gpus = tf.config.experimental.list_physical_devices('GPU')
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            if gpus:
                tf.config.experimental.set_visible_devices(
                    gpus[hvd.local_rank()], 'GPU')

    else:
        hvd = None

    train_dataset = train_input_fn(hvd, mpi)

    eval_dataset = eval_input_fn()
    validation_dataset = validation_input_fn(hvd, mpi)

    model = keras_model_fn(args.learning_rate, args.weight_decay,
                           args.optimizer, args.momentum, mpi, hvd)

    callbacks = []
    if mpi:
        callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
        callbacks.append(hvd.callbacks.MetricAverageCallback())
        callbacks.append(
            hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5,
                                                     verbose=1))
        callbacks.append(
            keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1))
        if hvd.rank() == 0:
            callbacks.append(
                ModelCheckpoint(args.output_dir + '/checkpoint-{epoch}.ckpt',
                                save_weights_only=True,
                                verbose=2))
            callbacks.append(
                TensorBoard(log_dir=tensorboard_dir, update_freq='epoch'))
    else:
        callbacks.append(
            keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1))
        callbacks.append(
            ModelCheckpoint(args.output_dir + '/checkpoint-{epoch}.ckpt',
                            save_weights_only=True,
                            verbose=2))
        callbacks.append(
            TensorBoard(log_dir=tensorboard_dir, update_freq='epoch'))
    logging.info("Starting training")
    size = 1
    if mpi:
        size = hvd.size()

    # The fit call is identical on every rank; validation metrics are averaged
    # across workers by MetricAverageCallback when MPI training is enabled.
    model.fit(
        train_dataset,
        steps_per_epoch=(
            (num_examples_per_epoch('train') // args.batch_size) // size),
        epochs=args.epochs,
        validation_data=validation_dataset,
        validation_steps=(
            (num_examples_per_epoch('validation') // args.batch_size) // size),
        callbacks=callbacks,
        verbose=2)

    if not mpi or hvd.rank() == 0:
        score = model.evaluate(eval_dataset,
                               steps=num_examples_per_epoch('eval') //
                               args.batch_size,
                               verbose=2)

        logging.info('Test loss:{}'.format(score[0]))
        logging.info('Test accuracy:{}'.format(score[1]))

    # Horovod: Save model only on worker 0 (i.e. master)
    if mpi:
        if hvd.rank() == 0:
            model.save(args.model_output_dir)
    else:
        model.save(args.model_output_dir)