Example #1
def setup_callbacks(params, callbacks, encoder, decoder, prop_pred):
    import horovod.keras as hvd

    # model checkpointing
    if params.checkpoint_period and hvd.rank() == 0:
        model_checkpoint_callback = model_checkpoint(
            encoder,
            decoder,
            prop_pred,
            params.checkpoint_path,
            nepochs=params.checkpoint_period,
            overwrite=params.overwrite_checkpoint)
        callbacks.append(model_checkpoint_callback)

    # LR scheduler
    if params.lr_schedule_patience:
        lr_callback = ReduceLROnPlateau(monitor=params.lr_schedule_prop,
                                        factor=0.5,
                                        patience=params.lr_schedule_patience,
                                        min_lr=params.lr_schedule_min *
                                        hvd.size(),
                                        cooldown=params.lr_schedule_cooldown,
                                        verbose=(hvd.rank() == 0))
        callbacks.append(lr_callback)

    if hvd.rank() == 0:
        callbacks.append(print_loss())
        if params.enable_tensorboard:
            callbacks.append(TensorBoard(params.checkpoint_path))
Example #2
    def test_load_model_broadcast(self):
        def create_model():
            opt = keras.optimizers.SGD(lr=0.01 * hvd.size(), momentum=0.9)
            opt = hvd.DistributedOptimizer(opt)

            model = keras.models.Sequential()
            model.add(keras.layers.Dense(2, input_shape=(3, )))
            model.add(keras.layers.RepeatVector(3))
            model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
            model.compile(loss=keras.losses.MSE,
                          optimizer=opt,
                          metrics=[keras.metrics.categorical_accuracy],
                          sample_weight_mode='temporal')

            return model

        with self.test_session(config=self.config) as sess:
            K.set_session(sess)

            model = create_model()

            x = np.random.random((1, 3))
            y = np.random.random((1, 3, 3))
            model.train_on_batch(x, y)

            if hvd.rank() == 0:
                _, fname = tempfile.mkstemp('.h5')
                model.save(fname)

        K.clear_session()
        with self.test_session(config=self.config) as sess:
            K.set_session(sess)

            if hvd.rank() == 0:
                model = hvd.load_model(fname)
                os.remove(fname)
            else:
                model = create_model()

            def generator():
                while 1:
                    yield (x, y)

            if hvd.rank() == 0:
                self.assertEqual(len(model.optimizer.weights), 5)
            else:
                self.assertEqual(len(model.optimizer.weights), 0)

            # No assertions, we just need to verify that it doesn't hang
            callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
            model.fit_generator(generator(),
                                steps_per_epoch=10,
                                callbacks=callbacks,
                                epochs=0,
                                verbose=0,
                                workers=4,
                                initial_epoch=1)

            self.assertEqual(len(model.optimizer.weights), 5)
Example #3
    def test_elastic_state(self):
        with self.test_session(config=self.config) as sess:
            K.set_session(sess)

            v = 1.0 if hvd.rank() == 0 else 2.0
            model1 = keras.models.Sequential([
                keras.layers.Dense(2, activation='softmax')
            ])
            model1.build((2, 2))
            model1.set_weights(
                [np.array([[v,  v], [v, v]], dtype=np.float32),
                 np.array([v, v], dtype=np.float32)])

            model2 = keras.models.Sequential([
                keras.layers.Dense(2, activation='softmax')
            ])
            model2.build((2, 2))
            model2.set_weights(
                [np.array([[1.0,  2.0], [3.0, 4.0]], dtype=np.float32),
                 np.array([0.0, 0.0], dtype=np.float32)])

            optimizer = keras.optimizers.Adam(0.001 * hvd.size())

            state = hvd.elastic.KerasState(model1, optimizer, batch=20 + hvd.rank(), epoch=10 + hvd.rank())
            state.sync()

            model1_weights = model1.get_weights()
            model2_weights = model2.get_weights()

            # After sync, all values should match the root rank
            for w in state.model.get_weights():
                self.assertAllClose(w, np.ones_like(w))
            assert state.batch == 20
            assert state.epoch == 10

            # Partially modify then restore
            model1.set_weights(model2_weights)
            state.batch = 21
            state.epoch = 11

            state.restore()

            for w1, w2 in zip(model1.get_weights(), model1_weights):
                self.assertAllClose(w1, w2)
            assert state.batch == 20
            assert state.epoch == 10

            # Partially modify then commit
            model1.set_weights(model2_weights)
            state.batch = 21
            state.epoch = 11

            state.commit()
            state.restore()

            for w1, w2 in zip(model1.get_weights(), model2_weights):
                self.assertAllClose(w1, w2)
            assert state.batch == 21
            assert state.epoch == 11
Example #4
    def initialize(self):
        # init_op = tf.initialize_all_variables()
        # init_op = tf.global_variables_initializer()
        # sess = tf.Session()
        # sess.run(init_op)

        # Check if GPUs are available
        # if tf.test.is_gpu_available():  # commented out since this test will cause a new session to be created
        # allow growth
        # config = tf.compat.v1.ConfigProto()
        # config.gpu_options.per_process_gpu_memory_fraction = 1
        # config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
        # # config.log_device_placement = True  # to log device placement (on which device the operation ran)
        # sess = tf.compat.v1.Session(config=config)
        # tf.compat.v1.keras.backend.set_session(sess)  # set this TensorFlow session as the default session for Keras

        # Create logger
        self.logger = logging.getLogger('DeepGalaxyTrain')
        self.logger.setLevel(self.log_level)
        self.logger.addHandler(logging.FileHandler('train_log.txt'))
        if self.distributed_training is True:
            try:
                import horovod.tensorflow.keras as hvd
                # initialize horovod
                hvd.init()
                self.callbacks.append(
                    hvd.callbacks.BroadcastGlobalVariablesCallback(0))
                self.callbacks.append(hvd.callbacks.MetricAverageCallback())
                # self.callbacks = [hvd.BroadcastGlobalVariablesHook(0)]
                if hvd.rank() == 0:
                    self.logger.info('Parallel training enabled.')
                    self.logger.info(
                        'batch_size = %d, global_batch_size = %d, num_workers = %d\n'
                        % (self.batch_size, self.batch_size * hvd.size(),
                           hvd.size()))

                # Map an MPI process to a GPU (Important!)
                print('hvd_rank = %d, hvd_local_rank = %d' %
                      (hvd.rank(), hvd.local_rank()))
                self.logger.info('hvd_rank = %d, hvd_local_rank = %d' %
                                 (hvd.rank(), hvd.local_rank()))

                # Bind a CUDA device to one MPI process (has no effect if GPUs are not used)
                os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())

                # # Horovod: pin GPU to be used to process local rank (one GPU per process)
                gpus = tf.config.experimental.list_physical_devices('GPU')
                for gpu in gpus:
                    tf.config.experimental.set_memory_growth(gpu, True)
                # if gpus:
                # tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
            except ImportError as identifier:
                print(
                    'Error importing horovod. Disabling distributed training.')
                self.distributed_training = False
        else:
            self.logger.info('Parallel training disabled.')
            self.logger.info('Batch_size = %d' % (self.batch_size))
Example #5
def create_callbacks(model, training_model, prediction_model, validation_generator, args, verbose):
    # Create Horovod callbacks
    callbacks = [
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        hvd.callbacks.MetricAverageCallback(),
        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=verbose)
    ]

    if hvd.rank() == 0 and args.output_path: # only one worker saves the checkpoint file
        # Create a snapshot for the Epoch
        callbacks.append(
            keras.callbacks.ModelCheckpoint(
                os.path.join(
                    args.output_path,
                    'model.h5'
                )
            )
        )

    tensorboard_callback = None

    if (args.tensorboard_dir) and (hvd.rank() == 0):
        tensorboard_callback = keras.callbacks.TensorBoard(
            log_dir                = args.tensorboard_dir,
            histogram_freq         = 0,
            batch_size             = args.batch_size,
            write_graph            = True,
            write_grads            = False,
            write_images           = False,
            embeddings_freq        = 0,
            embeddings_layer_names = None,
            embeddings_metadata    = None
        )
        callbacks.append(tensorboard_callback)

    # if args.evaluation and validation_generator:
    #     if args.dataset_type == 'coco':
    #         from ..callbacks.coco import CocoEval

    #         # use prediction model for evaluation
    #         evaluation = CocoEval(validation_generator, tensorboard=tensorboard_callback)
    #     else:
    #         evaluation = Evaluate(validation_generator, tensorboard=tensorboard_callback)
    #     evaluation = RedirectModel(evaluation, prediction_model)
    #     callbacks.append(evaluation)

    callbacks.append(keras.callbacks.ReduceLROnPlateau(
        monitor  = 'loss',
        factor   = 0.1,
        patience = 2,
        verbose  = 1,
        mode     = 'auto',
        epsilon  = 0.0001,
        cooldown = 0,
        min_lr   = 0
    ))

    return callbacks
Example #6
def getModel(net_settings, num_classes=1):
    '''
    Should be modified to take the model type as input and return the desired model.
    '''
    if net_settings['model_type'] == 'resnet':
        base_model = resnet50.ResNet50(include_top=True, weights='imagenet')
        finetuning = Dense(1, activation='sigmoid',
                           name='predictions')(base_model.layers[-2].output)
        model = Model(inputs=base_model.input, outputs=finetuning)

        ## Adjust learning rate based on number of GPUs
        hv_lr = net_settings['lr'] * hvd.size()
        opt = optimizers.SGD(lr=hv_lr, momentum=0.9, decay=1e-6, nesterov=True)
        ## Adding Horovod DistributedOptimizer
        opt = hvd.DistributedOptimizer(opt)

        model.compile(loss=net_settings['loss'],
                      optimizer=opt,
                      metrics=['accuracy'])
        callbacks = [
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        ]
        if hvd.rank() == 0:
            callbacks.append(
                keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))
        return model
    elif net_settings['model_type'] == 'resnet101':
        model = resnet101_model(224, 224, 3, 1)
        ## Adjust learning rate based on number of GPUs
        hv_lr = net_settings['lr'] * hvd.size()
        opt = optimizers.SGD(lr=hv_lr, momentum=0.9, decay=1e-6, nesterov=True)
        ## Adding Horovod DistributedOptimizer
        opt = hvd.DistributedOptimizer(opt)

        model.compile(loss=net_settings['loss'],
                      optimizer=opt,
                      metrics=['accuracy'])
        callbacks = [
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        ]
        if hvd.rank() == 0:
            callbacks.append(
                keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))
        return model
    else:
        print('[models] Ugggh. Not ready for this yet.')
        exit(0)
        return None
Example #7
    def generate_train_patch_using_sharing(self, batch_size):
        comm = MPI.COMM_WORLD
        if hvd.rank() == 0:
            batch_patch_info = self.cnes_gen.choose_patches_for_iteration(
                batch_size * hvd.size())
            # batch_img, batch_gt = cnes_gen.generate_train_patch_fast(BATCH_SIZE, batch_patch_info)
            transfers = self.get_batch_sharing_solution(batch_patch_info)

            for k in range(1, hvd.size()):
                comm.send(batch_patch_info, dest=k, tag=1001)
                comm.send(transfers, dest=k, tag=1002)
        else:
            batch_patch_info = comm.recv(source=0, tag=1001)
            transfers = comm.recv(source=0, tag=1002)

#            batch_patch_info = np.zeros((6, batch_size * hvd.size()), np.int32)
#            transfers = np.zeros((hvd.size(), hvd.size()), np.int32)

#        batch_patch_info = hvd.broadcast(batch_patch_info, root_rank=0, name="BATCH_PATCH_INFO")
#        transfers = hvd.broadcast(transfers, root_rank=0, name="TRANSFERS")

#        batch_patch_info = comm.bcast(batch_patch_info, root=0)
#        transfers = comm.bcast(transfers, root=0)

        return self.get_batch_using_sharing(batch_size, batch_patch_info,
                                            transfers)
Example #8
    def load_data(self, data_fn, test_size=0.3, random=True):
        if not self.distributed_training:
            self.logger.info(
                'Loading the full dataset since distributed training is disabled ...'
            )
            # X, Y = self.data_io.load_all(data_fn, dset_name_pattern=dset_name_pattern, camera_pos=camera_pos)
            X, Y = self.data_io.load_all(data_fn)
        else:
            self.logger.info(
                'Loading part of the dataset since distributed training is enabled ...'
            )
            X, Y = self.data_io.load_partial(data_fn, hvd.size(), hvd.rank())
        self.logger.debug('Shape of X: %s' % str(X.shape))
        self.logger.debug('Shape of Y: %s' % str(Y.shape))

        # update the input_shape setting according to the loaded data
        self.input_shape = X.shape[1:]

        if test_size > 0:
            x_train, x_test, y_train, y_test = train_test_split(
                X, Y, test_size=test_size, random_state=42)
            self.x_train = x_train
            self.x_test = x_test
            self.y_train = y_train
            self.y_test = y_test
        else:
            self.x_train = X
            self.y_train = Y
        self.num_classes = np.unique(Y).shape[0]
        if test_size > 0:
            print("shapes:", self.x_train.shape, self.x_test.shape,
                  self.y_train.shape, self.y_test.shape)
        else:
            print("shapes:", self.x_train.shape, self.y_train.shape)
        self.logger.debug('Number of classes: %d' % self.num_classes)
def _get_rank():
    if _DISTRIBUTED:
        try:
            return hvd.rank()
        except:
            return 0
    else:
        return 0
def _is_master(is_distributed=_DISTRIBUTED):
    if is_distributed:
        if hvd.rank() == 0:
            return True
        else:
            return False
    else:
        return True
def _get_model_dir(is_distributed=_DISTRIBUTED):
    if is_distributed:
        # Horovod: save checkpoints only on worker 0 to prevent other workers from
        # corrupting them.
        return (os.getenv("AZ_BATCHAI_OUTPUT_MODEL")
                if hvd.rank() == 0 else os.getenv("AZ_BATCHAI_JOB_TEMP_DIR"))
    else:
        return os.getenv("AZ_BATCHAI_OUTPUT_MODEL")
Example #12
def readData(file):
  if not useHorovod or hvd.rank() == 0:
     logger.info("reading in training %s", file)
  loaded_data = zarr.open(file, mode='r')
  x_train = loaded_data['train']
  y_train = loaded_data['test']

  return x_train,y_train
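readData() returns the full zarr arrays on every worker. A hedged sketch of slicing them into per-rank shards, mirroring the ceil-based partitioning used in the HDF5 Sequence example later in this list; the file path, equal array lengths, and the already-initialized hvd module are assumptions:

import numpy as np

x_train, y_train = readData("training_data.zarr")  # hypothetical path
shard_size = int(np.ceil(len(x_train) / hvd.size()))
start = hvd.rank() * shard_size
end = min(start + shard_size, len(x_train))
x_shard, y_shard = x_train[start:end], y_train[start:end]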
Example #13
 def test_horovod(self):
     import horovod.keras as hvd
     self.assertEqual(hvd.init(), 1)
     self.assertEqual(hvd.rank(), 0)
     print(
         '\nNOTE: remember to also test horovod with a real script, for example'
     )
     print(
         'https://github.com/CSCfi/machine-learning-scripts/blob/master/examples/keras-dvc-cnn-simple-hvd.py'
     )
Example #14
def train_evaluate():

    # Generate training and validation data generators 
    def get_image_list(data_dir):
       dataset = []
       for folder in os.listdir(data_dir):
          for image in os.listdir(os.path.join(data_dir, folder)):
             dataset.append((os.path.join(data_dir, folder, image), folder)) 
       return dataset      

    training_data = ImageSequence(get_image_list(os.path.join(FLAGS.data_dir, 'train')), FLAGS.batch_size, True)
    validation_data = ImageSequence(get_image_list(os.path.join(FLAGS.data_dir, 'test')), FLAGS.batch_size, False)

    # Horovod: Initialize Horovod
    hvd.init()

    # Horovod: Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    tf.keras.backend.set_session(tf.Session(config=config))

    # Create a model
    model = network_model(FLAGS.hidden_units)
    loss = 'categorical_crossentropy'

    # Horovod: Adjust learning rate based on number of GPUs
    optimizer = Adadelta(lr=1.0 * hvd.size())
    # Horovod: add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(optimizer)

    metrics = ['acc']
    model.compile(optimizer, loss, metrics)
  
    # Set up callbacks
    callbacks = [
        # Broadcast initial variable states from rank 0 to all other processes
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    ]
    
    # Horovod: save logs only on worker 0
    if hvd.rank() == 0:
        callbacks.append(tf.keras.callbacks.TensorBoard(log_dir=FLAGS.log_dir))

    # Start training
    model.fit_generator(generator = training_data,
                        validation_data = validation_data,
                        epochs = FLAGS.epochs,
                        use_multiprocessing = True,
                        workers = 4,
                        callbacks = callbacks,
                        verbose = 1)

    # Save the model
    model.save(FLAGS.save_model_path)
Example #15
def _main():
    hvd.init()
    better_exceptions.MAX_LENGTH = 128
    _MODELS_DIR.mkdir(parents=True, exist_ok=True)
    logger = tk.log.get()
    logger.addHandler(tk.log.stream_handler())
    if hvd.rank() == 0:
        logger.addHandler(
            tk.log.file_handler(_MODELS_DIR / 'train.log', append=True))
    with tk.dl.session(
            gpu_options={'visible_device_list': str(hvd.local_rank())}):
        _run()
Example #16
 def __init__(self, filename, batch_size):
     self.f_array = h5py.File(filename, "r")
     x = self.f_array["images"]
     y = self.f_array["masks"]
     self.batch_size = batch_size
     node_array_size = int(np.ceil(len(x) / hvd.size()))
     self.init_array = hvd.rank() * node_array_size
     self.end_array = self.init_array + node_array_size
     self.x = x
     self.y = y
     print("calculating size")
     print("size", len(self))
Example #17
def setup_hvd_callbacks(params, callbacks, encoder, decoder, prop_pred):
    import horovod.keras as hvd

    # Horovod: broadcast initial variable states
    callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))

    # Horovod: average metrics among workers at the end of every epoch.
    callbacks.append(hvd.callbacks.MetricAverageCallback())

    # Horovod: gradually warm the learning rate up to its hvd.size()-scaled value over the first epochs
    callbacks.append(
        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5,
                                                 verbose=(hvd.rank() == 0)))
Example #18
def main(args):
    # =========== modification =========== #
    import horovod.keras as hvd
    hvd.init()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))
    
    logging.info("getting data")
    
    train_dataset = train_input_fn()
    eval_dataset = eval_input_fn()
    validation_dataset = validation_input_fn()
    
    
    # =============== modification =============== #
    logging.info("configuring model")
    model = keras_model_fn(args.learning_rate, args.weight_decay, args.optimizer, args.momentum, hvd)
    callbacks = []

    # =============== modification =============== #
    # callbacks.append(ModelCheckpoint(args.model_dir + '/checkpoint-{epoch}.h5'))
    # Checkpointing is added only under the hvd.rank() == 0 guard below, so that a
    # single worker writes the checkpoint files.
    
    # =============== modification =============== #
    callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
    callbacks.append(hvd.callbacks.MetricAverageCallback())
    callbacks.append(hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1))
    
    # =============== modification =============== #
    if hvd.rank() == 0:
        callbacks.append(ModelCheckpoint(args.model_output_dir + '/checkpoint-{epoch}.h5'))
        callbacks.append(TensorBoard(log_dir=args.model_dir,update_freq='epoch'))

    logging.info("Starting training")
    model.fit(x=train_dataset[0], y=train_dataset[1],
              steps_per_epoch=(num_examples_per_epoch('train') // args.batch_size),
              epochs=args.epochs, validation_data=validation_dataset,
              validation_steps=(num_examples_per_epoch('validation') // args.batch_size), callbacks=callbacks)

    score = model.evaluate(eval_dataset[0], eval_dataset[1], steps=num_examples_per_epoch('eval') // args.batch_size,
                           verbose=0)

    logging.info('Test loss:{}'.format(score[0]))
    logging.info('Test accuracy:{}'.format(score[1]))

    # =============== modification =============== #
    # return save_model(model, args.model_dir)
    return save_model(model, args.model_output_dir)
Example #19
    def print_global_running_stats(self):
        stats = self.cnes_gen.get_running_stats()

        CLASS_ID_SET = self.cnes_gen.get_class_ids()

        print("stats at rank {} : {}".format(hvd.rank(), stats))

        stats_mat = np.zeros((len(CLASS_ID_SET) + 1, 2), np.float32)
        stats_mat[0, 1] = stats[0]
        idx = 1
        for cid in CLASS_ID_SET:
            stats_mat[idx, 0] = cid
            if cid in stats:
                stats_mat[idx, 1] = stats[cid]
            idx += 1

        print("Gathering stats from all MPI instances, rank {}".format(
            hvd.rank()))
        all_stats = hvd.allgather(stats_mat)  # comm.gather(stats, root=0)
        total_px = 0

        if hvd.rank() == 0:
            #            print("Epoch {} class freqs:".format(self.epoch))
            class_stats = {class_id: 0 for class_id in CLASS_ID_SET}
            for class_id in CLASS_ID_SET:
                # print("Data for class {}: {}".format(class_id, all_stats[all_stats[:,0] == class_id, :]))
                px_class = np.sum(all_stats[all_stats[:, 0] == class_id, 1])
                class_stats[class_id] += px_class
                total_px += px_class

            non_annot_px = np.sum(all_stats[all_stats[:, 0] == 0, 1])
            total_px += non_annot_px
            print("Non annotated pixels : {}".format(non_annot_px))
            for class_id in class_stats:
                print("Class {} count = {}, freq {:.5f}%".format(
                    class_id, class_stats[class_id],
                    class_stats[class_id] / total_px * 100))
Example #20
 def save_model(self):
     if self.distributed_training is True:
         if hvd.rank() == 0:
             if self.use_noise is True:
                 self.model.save('model_hvd_bw_%d_B0_with_noise_n_p_%d.h5' %
                                 (self.input_shape[0], hvd.size()))
             else:
                 self.model.save('model_hvd_bw_%d_B0_no_noise_%d_nodes.h5' %
                                 (self.input_shape[0], hvd.size()))
     else:
         if self.use_noise is True:
             self.model.save('model_bw_%d_B0_with_noise.h5' %
                             (self.input_shape[0]))
         else:
             self.model.save('model_bw_%d_B0_no_noise.h5' %
                             (self.input_shape[0]))
Example #21
def setup_generators(params):

    # train/valid splits
    train_split = '{}/index/{}/{}'.format(params.tag, 'LogD', SplitTypes.train)
    valid_split = '{}/index/{}/{}'.format(params.tag, 'LogD', SplitTypes.valid)

    # outputs
    if params.do_prop_pred:
        normalize_y = True
        regression_prediction_columns = params.prop_add
        output_datasets = ['{}/data/values/{}'.format(params.tag, x) \
                               for x in regression_prediction_columns]
        if hvd.rank() == 0:
            for ix, val in enumerate(output_datasets):
                print('regression output:', val)
    else:
        normalize_y = []
        regression_prediction_columns = []
        output_datasets = []

    # inputs
    input_datasets = ['{}/data/one_hot/{}'.format(params.tag, x) \
                                              for x in ['smiles']]

    # setup generators
    train_gen = DatasetGeneratorFast(
        h5store=params.hdf5_file_path,
        vae_params={'hidden_dim': params.hidden_dim},
        batch_size=params.batch_size,
        xlabel=input_datasets,
        ylabel=output_datasets,
        normalize_X=False,
        normalize_y=normalize_y,
        splitlabel=train_split)

    valid_gen = DatasetGeneratorFast(
        h5store=params.hdf5_file_path,
        vae_params={'hidden_dim': params.hidden_dim},
        batch_size=params.batch_size,
        xlabel=input_datasets,
        ylabel=output_datasets,
        normalize_X=False,
        normalize_y=normalize_y,
        splitlabel=valid_split)
    return train_gen, valid_gen
Example #22
 def save_model(self):
     if self.distributed_training is True:
         if hvd.rank() == 0:
             if self.noise_stddev > 0:
                 self.model.save('model_%d_%s_noise_np_%d.h5' %
                                 (self.input_shape[0], self.base_model_name,
                                  hvd.size()))
             else:
                 self.model.save('model_%d_%s_np_%d.h5' %
                                 (self.input_shape[0], self.base_model_name,
                                  hvd.size()))
     else:
         if self.noise_stddev > 0:
             self.model.save('model_%d_%s_noise.h5' %
                             (self.input_shape[0], self.base_model_name))
         else:
             self.model.save('model_%d_%s.h5' %
                             (self.input_shape[0], self.base_model_name))
def data_generator(file_path, batch_size, seq_len=512, predict=False):
    # Trick the code into thinking we're only running 1 process for prediction when running `Metrics`.
    if predict:
        size = 1
    else:
        size = hvd.size()
    total_batch_size = batch_size * size
    print(total_batch_size)
    rank = hvd.rank()
    print(rank)
    range_start = batch_size * rank
    range_end = range_start + batch_size
    print(range_start, range_end)
    while True:
        with xopen(file_path, "rt") as f:
            _, label_dim = json.loads(f.readline())
            text = []
            labels = []
            for line in f:
                if len(text) == total_batch_size:
                    text = text[range_start:range_end]
                    labels = labels[range_start:range_end]
                    print(text[0])
                    # Fun fact: the 2 inputs must be in a list, *not* a tuple. Why.
                    yield ([np.asarray(text), np.zeros_like(text)], np.asarray(labels))
                    text = []
                    labels = []
                line = json.loads(line)
                # First sublist is token ids.
                text.append(np.asarray(line[0])[0:seq_len])

                # Second sublist is positive label indices.
                label_line = np.zeros(label_dim, dtype='b')
                label_line[line[1]] = 1
                labels.append(label_line)
            # Yield what is left as the last batch when file has been read to its end.
            # Split the remaining examples, duplicating with `ceil()` if they don't split evenly.
            leftover_batch_start = ceil(len(text) / size) * rank
            leftover_batch_end = leftover_batch_start + ceil(len(text) / size)
            text = text[leftover_batch_start:leftover_batch_end]
            labels = labels[leftover_batch_start:leftover_batch_end]
            yield ([np.asarray(text), np.zeros_like(text)], np.asarray(labels))
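A hedged usage sketch for the generator above. model, train_path, n_examples, batch_size, and epochs are placeholders not defined in the snippet, and hvd is assumed to be the already-initialized horovod.keras module used above; the steps count simply divides the dataset by the global batch, i.e. batch_size * hvd.size():

import math

steps_per_epoch = math.ceil(n_examples / (batch_size * hvd.size()))
model.fit_generator(
    data_generator(train_path, batch_size),
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    callbacks=[hvd.callbacks.BroadcastGlobalVariablesCallback(0)],
    verbose=1 if hvd.rank() == 0 else 0)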
Example #24
def main(use_horovod: bool, gpus: int, checkpoint: int,
         config_path: str) -> None:
    config = process_config(config_path, use_horovod, gpus, checkpoint)

    # create tensorflow session and set as keras backed
    tf_config = tf.ConfigProto()

    if config.trainer.use_horovod:
        import horovod.keras as hvd

        hvd.init()
        tf_config.gpu_options.allow_growth = True
        tf_config.gpu_options.visible_device_list = str(hvd.local_rank())

    is_master = not config.trainer.use_horovod
    if not is_master:
        import horovod.keras as hvd

        is_master = hvd.rank() == 0

    if is_master and not os.path.exists(config.exp.source_dir):
        # copy source files
        shutil.copytree(
            os.path.abspath(os.path.curdir),
            config.exp.source_dir,
            ignore=lambda src, names:
            {"datasets", "__pycache__", ".git", "experiments", "venv"})

    tf_sess = tf.Session(config=tf_config)
    K.set_session(tf_sess)
    data_loader = get_data_loader(config=config)

    model, trainer = build_model_and_trainer(config, data_loader)

    print(f"Start Training Experiment {config.exp.name}")
    try:
        trainer.train()
    except Exception as e:
        send_noti_to_telegram(
            f"an exception raised on training {config.exp.name}")
        raise e
Example #25
def setup_tf_config(config: DotMap):

    tf_config = tf.ConfigProto()

    if config.trainer.use_horovod:
        import horovod.keras as hvd

        hvd.init()
        tf_config.gpu_options.allow_growth = True
        tf_config.gpu_options.visible_device_list = str(hvd.local_rank())

    is_master = not config.trainer.use_horovod
    if not is_master:
        import horovod.keras as hvd

        is_master = hvd.rank() == 0

    tf_sess = tf.Session(config=tf_config)
    K.set_session(tf_sess)

    return is_master
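A hedged usage sketch for setup_tf_config. The DotMap layout is inferred from the attribute accesses above, and the rank-0 checkpoint guard mirrors the other examples in this list:

from dotmap import DotMap
import keras

config = DotMap({"trainer": {"use_horovod": True}})
is_master = setup_tf_config(config)

callbacks = []
if is_master:
    # Only rank 0 writes checkpoints, as in the other Horovod examples.
    callbacks.append(keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))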
Example #26
def batch_generator(full_sequences, fragment_length, batch_size,
                    fragment_stride, nb_output_bins, randomize_batch_order,
                    _rnd):
    indices = list(
        fragment_indices(full_sequences, fragment_length, batch_size,
                         fragment_stride, nb_output_bins))
    global g_multi_gpu
    if g_multi_gpu:
        import horovod.keras as hvd
        gpu_count = hvd.size()
        current_gpu = hvd.rank()
    else:
        gpu_count = 1
        current_gpu = 0

    if randomize_batch_order:
        _rnd.shuffle(indices)

    batches_parted = [batch for batch in partition_all(batch_size, indices)]
    start_index = len(batches_parted) // gpu_count * current_gpu
    batches_gpu = batches_parted[start_index:]

    batches = cycle(batches_gpu)
    for batch in batches:
        if len(batch) < batch_size:
            continue
        x = np.array([
            one_hot(full_sequences[e[0]][e[1]:e[1] + fragment_length])
            for e in batch
        ], dtype='uint8')
        y = np.array([
            one_hot(full_sequences[e[0]][e[1] + 1:e[1] + fragment_length + 1])
            for e in batch
        ], dtype='uint8')
        yield x, y
Example #27
def GetCallbacks(logfileoutputdir, stage):
    logdir = logfileoutputdir + "/" + stage
    filename = logfileoutputdir + "/" + stage + "/modelunet.h5"
    logname = logfileoutputdir + "/" + stage + "/log.csv"
    if options.with_hvd:
        callbacks = [
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),
            hvd.callbacks.MetricAverageCallback(),
            hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5,
                                                     verbose=1),
            keras.callbacks.TerminateOnNaN()
        ]
        if hvd.rank() == 0:
            callbacks += [
                keras.callbacks.ModelCheckpoint(filepath=filename,
                                                verbose=1,
                                                save_best_only=True),
                keras.callbacks.CSVLogger(logname),
                keras.callbacks.TensorBoard(log_dir=logdir,
                                            histogram_freq=0,
                                            write_graph=True,
                                            write_images=False)
            ]
    else:
        callbacks = [
            keras.callbacks.TerminateOnNaN(),
            keras.callbacks.CSVLogger(logname),
            keras.callbacks.ModelCheckpoint(filepath=filename,
                                            verbose=1,
                                            save_best_only=True),
            keras.callbacks.TensorBoard(log_dir=logdir,
                                        histogram_freq=0,
                                        write_graph=True,
                                        write_images=False)
        ]
    return callbacks, filename
    def init_callbacks(self) -> None:
        if self.config.trainer.use_lr_decay:
            # linear decay from the half of max_epochs
            def lr_scheduler(lr, epoch, max_epochs):
                return min(lr, 2 * lr * (1 - epoch / max_epochs))

            self.model_callbacks["combined"].append(
                LearningRateScheduler(schedule=lambda epoch: lr_scheduler(self.config.model.generator.lr, epoch,
                                                                          self.config.trainer.num_epochs)))
            for model_name in ['d_x', 'd_y']:
                self.model_callbacks[model_name].append(
                    LearningRateScheduler(schedule=lambda epoch: lr_scheduler(self.config.model.discriminator.lr, epoch,
                                                                              self.config.trainer.num_epochs)))
        # if horovod used, only worker 0 saves checkpoints
        is_master = True
        is_local_master = True
        if self.config.trainer.use_horovod:
            import horovod.keras as hvd

            is_master = hvd.rank() == 0
            is_local_master = hvd.local_rank() == 0

        # horovod callbacks
        if self.config.trainer.use_horovod:
            import horovod.keras as hvd

            self.model_callbacks["combined"].append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
            self.model_callbacks["combined"].append(hvd.callbacks.MetricAverageCallback())
            self.model_callbacks["combined"].append(
                hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1))

        if is_local_master:
            # model saver
            self.model_callbacks["serial_combined"].append(
                ModelCheckpointWithKeepFreq(
                    filepath=os.path.join(self.config.exp.checkpoints_dir, "{epoch:04d}-combined.hdf5"),
                    keep_checkpoint_freq=self.config.trainer.keep_checkpoint_freq,
                    save_checkpoint_freq=self.config.trainer.save_checkpoint_freq,
                    save_best_only=False,
                    save_weights_only=True,
                    verbose=1))

            # save optimizer weights
            for model_name in ['combined', 'd_x', 'd_y']:
                self.model_callbacks[model_name].append(OptimizerSaver(self.config, model_name))
        if is_master:
            # save individual models
            for model_name in ['g_xy', 'g_yx', 'd_x', 'd_y']:
                self.model_callbacks[model_name].append(
                    ModelSaver(
                        checkpoint_dir=self.config.exp.checkpoints_dir,
                        keep_checkpoint_freq=self.config.trainer.keep_checkpoint_freq,
                        model_name=model_name,
                        num_epochs=self.config.trainer.num_epochs,
                        verbose=1))

            # send notification to telegram channel on train start and end
            self.model_callbacks["combined"].append(TrainProgressAlertCallback(experiment_name=self.config.exp.name,
                                                                               total_epochs=self.config.trainer.num_epochs))

            # tensorboard callback
            self.model_callbacks["combined"].append(
                ScalarCollageTensorBoard(log_dir=self.config.exp.tensorboard_dir,
                                         batch_size=self.config.trainer.batch_size,
                                         write_images=True))

        # initialize callbacks by setting model and params
        epochs = self.config.trainer.num_epochs
        steps_per_epoch = self.data_loader.get_train_data_size() // self.config.trainer.batch_size
        for model_name in self.model_callbacks:
            model = getattr(self, model_name)

            callbacks = self.model_callbacks[model_name]
            for callback in callbacks:
                callback.set_model(model)
                callback.set_params({
                    "batch_size": self.config.trainer.batch_size,
                    "epochs": epochs,
                    "steps": steps_per_epoch,
                    "samples": self.data_loader.get_train_data_size(),
                    "verbose": True,
                    "do_validation": False,
                    "model_name": model_name,
                })
Example #29
if args.keras_api:
    import keras as K
else:
    from tensorflow import keras as K

CHANNELS_LAST = True

hvd.init()

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"  # Get rid of the AVX, SSE warnings
os.environ["OMP_NUM_THREADS"] = str(args.intraop_threads)
os.environ["KMP_BLOCKTIME"] = str(args.blocktime)
os.environ["KMP_AFFINITY"] = "granularity=thread,compact,1,0"

if (hvd.rank() == 0):  # Only print on worker 0
    print_summary = args.print_model
    verbose = 1
    # os.system("lscpu")
    #os.system("uname -a")
    print("TensorFlow version: {}".format(tf.__version__))
    print("Intel MKL-DNN is enabled = {}".format(
        tf.pywrap_tensorflow.IsMklEnabled()))
    print("Keras API version: {}".format(K.__version__))

else:  # Don't print on workers > 0
    print_summary = 0
    verbose = 0
    # Horovod needs to have every worker do the same amount of work.
    # Otherwise it will complain at the end of the epoch when
    # worker 0 takes more time than the others to do validation,
Example #30
# If set > 0, will resume training from a given checkpoint.
resume_from_epoch = 0
for try_epoch in range(args.epochs, 0, -1):
    if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
        resume_from_epoch = try_epoch
        break

# Horovod: broadcast resume_from_epoch from rank 0 (which will have
# checkpoints) to other ranks.
resume_from_epoch = hvd.broadcast(resume_from_epoch,
                                  0,
                                  name='resume_from_epoch')

# Horovod: print logs on the first worker.
verbose = 1 if hvd.rank() == 0 else 0

# Input image dimensions
img_rows, img_cols = 28, 28
num_classes = 10

# Load Fashion MNIST data.
(x_train, y_train), (x_test, y_test) = load_data(args.dataset_path)

if K.image_data_format() == 'channels_first':
    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
    input_shape = (1, img_rows, img_cols)
else:
    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
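A hedged sketch of how the broadcast resume_from_epoch is typically consumed, following the upstream Horovod Keras examples; build_model() and args.batch_size are placeholders not defined above:

if resume_from_epoch > 0 and hvd.rank() == 0:
    # Restore on rank 0 only; BroadcastGlobalVariablesCallback(0) then pushes the
    # restored weights to the other ranks at the start of training.
    model = hvd.load_model(args.checkpoint_format.format(epoch=resume_from_epoch))
else:
    model = build_model()  # hypothetical helper standing in for the model definition

model.fit(x_train, y_train,
          batch_size=args.batch_size,
          callbacks=[hvd.callbacks.BroadcastGlobalVariablesCallback(0)],
          epochs=args.epochs,
          verbose=verbose,
          initial_epoch=resume_from_epoch,
          validation_data=(x_test, y_test))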
Example #31
# Horovod: adjust learning rate based on number of GPUs.
opt = keras.optimizers.Adadelta(1.0 * hvd.size())

# Horovod: add Horovod Distributed Optimizer.
opt = hvd.DistributedOptimizer(opt)

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=opt,
              metrics=['accuracy'])

callbacks = [
    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    hvd.callbacks.BroadcastGlobalVariablesCallback(0),
]

# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
if hvd.rank() == 0:
    callbacks.append(keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

model.fit(x_train, y_train,
          batch_size=batch_size,
          callbacks=callbacks,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test))
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
Example #32
    def test_load_model_broadcast(self):
        hvd.init()

        def create_model():
            opt = keras.optimizers.SGD(lr=0.01 * hvd.size(), momentum=0.9)
            opt = hvd.DistributedOptimizer(opt)

            model = keras.models.Sequential()
            model.add(keras.layers.Dense(2, input_shape=(3,)))
            model.add(keras.layers.RepeatVector(3))
            model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
            model.compile(loss=keras.losses.MSE,
                          optimizer=opt,
                          metrics=[keras.metrics.categorical_accuracy],
                          sample_weight_mode='temporal')

            return model

        with self.test_session() as sess:
            K.set_session(sess)

            model = create_model()

            x = np.random.random((1, 3))
            y = np.random.random((1, 3, 3))
            model.train_on_batch(x, y)

            if hvd.rank() == 0:
                _, fname = tempfile.mkstemp('.h5')
                model.save(fname)

        K.clear_session()
        with self.test_session() as sess:
            K.set_session(sess)

            if hvd.rank() == 0:
                model = hvd.load_model(fname)
                os.remove(fname)
            else:
                model = create_model()

            def generator():
                while 1:
                    yield (x, y)

            if hvd.rank() == 0:
                self.assertEqual(len(model.optimizer.weights), 5)
            else:
                self.assertEqual(len(model.optimizer.weights), 0)

            # No assertions, we just need to verify that it doesn't hang
            callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
            model.fit_generator(generator(),
                                steps_per_epoch=10,
                                callbacks=callbacks,
                                epochs=0,
                                verbose=0,
                                workers=4,
                                initial_epoch=1)

            self.assertEqual(len(model.optimizer.weights), 5)
checkpoint_format = './checkpoint-{epoch}.h5'
log_dir = './logs'

# If set > 0, will resume training from a given checkpoint.
resume_from_epoch = 0
for try_epoch in range(epochs, 0, -1):
    if os.path.exists(checkpoint_format.format(epoch=try_epoch)):
        resume_from_epoch = try_epoch
        break

# Horovod: broadcast resume_from_epoch from rank 0 (which will have
# checkpoints) to other ranks.
resume_from_epoch = hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch')

# Horovod: print logs on the first worker.
verbose = 1 if hvd.rank() == 0 else 0

# Training data iterator.
train_gen = image.ImageDataGenerator(
    width_shift_range=0.33, height_shift_range=0.33, zoom_range=0.5, horizontal_flip=True,
    preprocessing_function=keras.applications.resnet50.preprocess_input)
train_iter = train_gen.flow_from_directory(train_dir, batch_size=batch_size,
                                           target_size=(224, 224))

# Validation data iterator.
test_gen = image.ImageDataGenerator(
    zoom_range=(0.875, 0.875), preprocessing_function=keras.applications.resnet50.preprocess_input)
test_iter = test_gen.flow_from_directory(test_dir, batch_size=batch_size,
                                         target_size=(224, 224))

# Set up standard ResNet-50 model.