def test_load_model_broadcast(self):
    """Save a model on rank 0, reload it there, rebuild it elsewhere, and
    verify that BroadcastGlobalVariablesCallback synchronizes the ranks
    without hanging.
    """
    def create_model():
        # NOTE(review): lr is scaled by hvd.size(), the usual Horovod
        # convention of scaling the learning rate with worker count.
        opt = keras.optimizers.SGD(lr=0.01 * hvd.size(), momentum=0.9)
        opt = hvd.DistributedOptimizer(opt)

        model = keras.models.Sequential()
        model.add(keras.layers.Dense(2, input_shape=(3, )))
        model.add(keras.layers.RepeatVector(3))
        model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
        model.compile(loss=keras.losses.MSE,
                      optimizer=opt,
                      metrics=[keras.metrics.categorical_accuracy],
                      sample_weight_mode='temporal')
        return model

    with temppath() as fname:
        with self.session(config=self.config) as sess:
            K.set_session(sess)

            model = create_model()

            x = np.random.random((1, 3))
            y = np.random.random((1, 3, 3))
            model.train_on_batch(x, y)

            # Only rank 0 writes the checkpoint; the other ranks will
            # rebuild the model from scratch and get weights via broadcast.
            if hvd.rank() == 0:
                model.save(fname)

        K.clear_session()
        with self.session(config=self.config) as sess:
            K.set_session(sess)

            weight = np.random.random((1, 3))

            if hvd.rank() == 0:
                model = hvd.load_model(fname)
            else:
                model = create_model()

            def generator():
                while 1:
                    yield (x, y, weight)

            # Rank 0 restored optimizer state (5 slot variables); the other
            # ranks start with an optimizer that has no weights yet.
            if hvd.rank() == 0:
                self.assertEqual(len(model.optimizer.weights), 5)
            else:
                self.assertEqual(len(model.optimizer.weights), 0)

            # No assertions, we just need to verify that it doesn't hang
            callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
            model.fit_generator(generator(),
                                steps_per_epoch=1,
                                callbacks=callbacks,
                                epochs=1,
                                verbose=0,
                                workers=4,
                                initial_epoch=0)

            # After the broadcast every rank's optimizer has the full state.
            self.assertEqual(len(model.optimizer.weights), 5)
def test_load_model_custom_optimizers(self):
    """hvd.load_model should rebuild a custom optimizer subclass when the
    class is supplied through ``custom_optimizers``.
    """
    class TestOptimizer(keras.optimizers.RMSprop):
        def __init__(self, **kwargs):
            super(TestOptimizer, self).__init__(**kwargs)

    # Use self.session instead of the deprecated self.test_session, for
    # consistency with the other tests in this file.
    with self.session(config=self.config) as sess:
        K.set_session(sess)

        opt = TestOptimizer(lr=0.0001)
        opt = hvd.DistributedOptimizer(opt)

        model = keras.models.Sequential()
        model.add(keras.layers.Dense(2, input_shape=(3, )))
        model.add(keras.layers.RepeatVector(3))
        model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
        model.compile(loss=keras.losses.MSE,
                      optimizer=opt,
                      metrics=[keras.metrics.categorical_accuracy],
                      sample_weight_mode='temporal')

        x = np.random.random((1, 3))
        y = np.random.random((1, 3, 3))
        model.train_on_batch(x, y)

        with temppath() as fname:
            model.save(fname)

            custom_optimizers = [TestOptimizer]
            new_model = hvd.load_model(fname, custom_optimizers=custom_optimizers)
            new_opt = new_model.optimizer

            # The reloaded optimizer is re-wrapped by Horovod, so its class
            # lives in horovod._keras but keeps the original class name.
            self.assertEqual(type(new_opt).__module__, 'horovod._keras')
            self.assertEqual(type(new_opt).__name__, 'TestOptimizer')
            self._check_optimizer_weights(opt, new_opt)
def test_load_model(self):
    """hvd.load_model should restore a saved model, re-wrap its optimizer
    as a DistributedOptimizer, and preserve hyperparameters and slot
    weights.
    """
    # Use self.session instead of the deprecated self.test_session, for
    # consistency with the other tests in this file.
    with self.session(config=self.config) as sess:
        K.set_session(sess)

        opt = keras.optimizers.RMSprop(lr=0.0001)
        opt = hvd.DistributedOptimizer(opt)

        model = keras.models.Sequential()
        model.add(keras.layers.Dense(2, input_shape=(3, )))
        model.add(keras.layers.RepeatVector(3))
        model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
        model.compile(loss=keras.losses.MSE,
                      optimizer=opt,
                      metrics=[keras.metrics.categorical_accuracy],
                      sample_weight_mode='temporal')

        x = np.random.random((1, 3))
        y = np.random.random((1, 3, 3))
        model.train_on_batch(x, y)

        with temppath() as fname:
            model.save(fname)

            new_model = hvd.load_model(fname)
            new_opt = new_model.optimizer

            # The reloaded optimizer is re-wrapped by Horovod, so its class
            # lives in horovod._keras but keeps the original class name.
            self.assertEqual(type(new_opt).__module__, 'horovod._keras')
            self.assertEqual(type(new_opt).__name__, 'RMSprop')
            self.assertEqual(K.get_value(opt.lr), K.get_value(new_opt.lr))
            self._check_optimizer_weights(opt, new_opt)
def inference(model_file: Path, dataset_dir: Path, args: dict, smlb_in: RuntimeIn, smlb_out: RuntimeOut) -> None:
    """
    Perform inference using a U-Net style model

    :param model_file: model weights file to load
    :param dataset_dir: path to find files for inference
    :param args: dictionary of user/environment arguments
    :param smlb_in: RuntimeIn instance for logging
    :param smlb_out: RuntimeOut instance for logging
    :raises FileNotFoundError: if the model file is missing or no input HDF
        files are found under ``dataset_dir``
    """
    console = smlb_out.log.console
    device = smlb_out.log.device

    crop_size = args['crop_size']

    output_dir = Path(smlb_in.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    console.message('Loading model {}'.format(model_file))
    # Validate inputs with real exceptions rather than `assert`, which is
    # silently stripped when Python runs with -O.
    if not Path(model_file).exists():
        raise FileNotFoundError("Model file does not exist!")
    model = hvd.load_model(str(model_file))

    console.message('Getting file paths')
    file_paths = list(Path(dataset_dir).glob('**/S3A*.hdf'))
    if not file_paths:
        raise FileNotFoundError("Could not find any HDF files!")

    console.message('Preparing data loader')
    # Create data loader in single image mode. This turns off shuffling and
    # only yields batches of images for a single image at a time so they can be
    # reconstructed.
    data_loader = SLSTRDataLoader(file_paths, single_image=True, crop_size=crop_size)
    dataset = data_loader.to_dataset()

    console.begin('Inference Loop')
    for patches, file_name in dataset:
        file_name = Path(file_name.numpy().decode('utf-8'))
        device.message(f"Processing file {file_name}")
        console.message(f"Processing file {file_name}")

        # convert patches to a batch of patches
        n, ny, nx, _ = patches.shape
        patches = tf.reshape(patches, (n * nx * ny, PATCH_SIZE, PATCH_SIZE, N_CHANNELS))

        # perform inference on patches
        mask_patches = model.predict_on_batch(patches)

        # crop edge artifacts
        mask_patches = tf.image.crop_to_bounding_box(mask_patches,
                                                     crop_size // 2,
                                                     crop_size // 2,
                                                     PATCH_SIZE - crop_size,
                                                     PATCH_SIZE - crop_size)

        # reconstruct patches back to full size image
        mask_patches = tf.reshape(mask_patches,
                                  (n, ny, nx, PATCH_SIZE - crop_size, PATCH_SIZE - crop_size, 1))
        mask = reconstruct_from_patches(mask_patches, nx, ny, patch_size=PATCH_SIZE - crop_size)

        mask_name = (output_dir / file_name.name).with_suffix('.h5')
        with h5py.File(mask_name, 'w') as handle:
            handle.create_dataset('mask', data=mask)
    console.ended('Inference Loop')
def reload_last_checkpoint(checkpoint_format, n_epochs, distributed):
    """Find and load the last checkpoint matching the provided pattern.

    :param checkpoint_format: path template containing an ``{epoch}`` field
    :param n_epochs: highest epoch number to probe for
    :param distributed: if True, load via hvd.load_model so the optimizer is
        restored as a DistributedOptimizer
    :return: tuple of (initial_epoch, model) for resuming training
    :raises FileNotFoundError: if no checkpoint file matches the pattern
    """
    # Count down from n_epochs to 0 to find the last epoch.
    # Note that keras names checkpoint files with epoch number starting from 1.
    # So the matched number corresponds to the new initial epoch.
    for epoch in range(n_epochs, 0, -1):
        checkpoint = checkpoint_format.format(epoch=epoch)
        if os.path.exists(checkpoint):
            logging.info('Found last checkpoint at %s', checkpoint)
            # Use special reload to prepare the DistributedOptimizer
            if distributed:
                model = hvd.load_model(checkpoint)
            else:
                model = tf.keras.models.load_model(checkpoint)
            return epoch, model
    # FileNotFoundError is more precise than a bare Exception and is still
    # caught by existing callers handling Exception.
    raise FileNotFoundError(
        'Unable to find a checkpoint file at %s' % checkpoint_format)
def main(events, make_model_fn, div, dataset, default_verbosity, data_dir, checkpoint_dir, log_to, log_info):
    """Distributed training driver: sets up MPI/Horovod, loads a dataset,
    then runs a sequence of training "events", each of which may reload the
    model from a checkpoint, train, validate, and re-checkpoint.

    :param events: iterable of event objects with .nepochs, .nworkers,
        .batch, .reload and .checkpoint attributes
    :param make_model_fn: factory called with input_shape/output_shape
    :param div: divisor applied to steps-per-epoch (for quick runs)
    :param dataset: dataset name ('emnist' or 'tiny-imagenet')
    :param default_verbosity: keras fit/evaluate verbosity on rank 0
    :param data_dir: root directory for dataset files
    :param checkpoint_dir: directory for 'checkpoint.h5'
    :param log_to: wrh log path template with %(rank)s-style fields
    :param log_info: optional saved wrh state template to resume from
    """
    world = MPI.COMM_WORLD
    rank = world.Get_rank()
    size = world.Get_size()

    wrh.open(str(log_to) % {
        'rank': rank,
        'size': size,
        'rank+1': rank + 1,
    }, 'a')

    if log_info is not None:
        # Resume a previously-saved wrh logging context.
        wrh.load(log_info % {
            'rank': rank,
            'rank+1': rank + 1,
            'size': size,
        })
    else:
        # Rank 0 creates the logging hierarchy and sends a snapshot of it to
        # every worker so all ranks share the same context.
        if rank == 0:
            wrh.push('master')
            for i in range(1, size):
                wrh.push('worker')
                info = wrh.save()
                world.send(info, dest=i, tag=i)
                wrh.pop('worker')
            wrh.push('worker')
        else:
            info = world.recv(source=0, tag=rank)
            wrh.load(info)

    wrh.push('triple-r.py')
    wrh.log('rank', '%d', rank)
    wrh.log('size', '%d', size)
    wrh.log('model', '%s', make_model_fn)
    wrh.log('dataset', '%s', dataset)
    wrh.log('events', '%s', events)
    wrh.log('div', '%d', div)
    wrh.log('data_dir', '%s', data_dir)
    wrh.log('checkpoint_dir', '%s', checkpoint_dir)

    wrh.push('initialize horovod')
    hvd.init(world)
    wrh.pop('initialize horovod')

    wrh.log('hvd.mpi_threads_supported', '%r', hvd.mpi_threads_supported())
    assert hvd.mpi_threads_supported()
    wrh.log('_executing_eagerly', '%r', _executing_eagerly())
    #print(f'{hvd.

    is_emnist = dataset in ('emnist',)
    is_tiny_imagenet = dataset in ('tiny-imagenet',)

    wrh.push('loading dataset')
    train_ds = None
    valid_ds = None
    if is_emnist:
        datasets, info = tfds.load(
            dataset,
            split=None,
            with_info=True,
            as_supervised=True,
            data_dir=str(data_dir),
            download=True,
        )
        wrh.log('datasets', '%r', datasets)
        wrh.log('info', '%r', info)
        input_shape = info.features['image'].shape
        output_shape = info.features['label'].num_classes
        train_ds = datasets['train']
        valid_ds = datasets['valid']
        # Convert uint8 images to float32 in [0, 1].
        train_ds = train_ds.map(lambda img, label: (tf.image.convert_image_dtype(img, dtype=tf.float32), label))
        valid_ds = valid_ds.map(lambda img, label: (tf.image.convert_image_dtype(img, dtype=tf.float32), label))
        num_train = info.splits['train'].num_examples
        num_valid = info.splits['validation'].num_examples
    elif is_tiny_imagenet:
        # Training data iterator.
        # NOTE(review): shapes/counts are hard-coded for tiny-imagenet-200
        # upscaled to 224x224 — confirm against the on-disk dataset.
        input_shape = (224, 224, 3)
        output_shape = 200
        num_train = 100000
        num_valid = 10000
        train_dir = data_dir / "tiny-imagenet-200/train"
        valid_dir = data_dir / "tiny-imagenet-200/val"

        # Leftover debugging helpers for the tf.data pipeline below.
        def drop_first_dimension(tensor: 'tf.Tensor') -> 'tf.Tensor':
            #print(f'{tensor = }')
            #shape = tensor.get_shape()
            #print(f'{shape = }')
            #tensor.set_shape(shape[1:])
            return tensor

        def add_first_dimension(tensor):
            shape = tensor.get_shape()
            tensor.set_shape([1, *shape])
            return tensor

        def debug(s: str, tensor: 'tf.Tensor') -> 'tf.Tensor':
            print(f'{s}: {tensor = } (type = {type(tensor)})')
            return tensor

        train_gen = tf.keras.preprocessing.image.ImageDataGenerator(
            width_shift_range=0.33,
            height_shift_range=0.33,
            zoom_range=0.5,
            horizontal_flip=True,
            preprocessing_function=tf.keras.applications.resnet50.preprocess_input)
        # flow_from_directory yields batches of 1; unbatch() turns the
        # stream back into individual examples.
        train_ds = tf.data.Dataset.from_generator(
            lambda: train_gen.flow_from_directory(train_dir, batch_size=1, target_size=input_shape[:-1]),
            output_signature=(tf.TensorSpec(shape=[1, *input_shape], dtype=tf.float32),
                              tf.TensorSpec(shape=(1, output_shape,), dtype=tf.int32)),
        ) \
            .unbatch()
        #.map(lambda x, y: (debug('before x', x), debug('before y', y))) \
        #.map(lambda x, y: (debug('after x', x), debug('after y', y))) \
        #.map(lambda x, y: (debug('unbatch x', x), debug('unbatch y', y))) \
        #.map(lambda x, y: (x, tf.expand_dims(y, axis=0))) \

        # Validation data iterator.
        valid_gen = tf.keras.preprocessing.image.ImageDataGenerator(
            zoom_range=(0.875, 0.875),
            preprocessing_function=tf.keras.applications.resnet50.preprocess_input)
        valid_ds = tf.data.Dataset.from_generator(
            lambda: valid_gen.flow_from_directory(valid_dir, batch_size=1, target_size=input_shape[:-1]),
            output_signature=(tf.TensorSpec(shape=[1, *input_shape], dtype=tf.float32),
                              tf.TensorSpec(shape=(1, output_shape,), dtype=tf.int32)),
        ) \
            .unbatch()
    wrh.pop('loading dataset')

    wrh.push('creating model')
    wrh.log('input_shape', '%r', input_shape)
    wrh.log('output_shape', '%r', output_shape)
    model = make_model_fn(
        input_shape=input_shape,
        output_shape=output_shape,
    )
    wrh.pop('creating model')

    callbacks = [
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        hvd.callbacks.MetricAverageCallback(),
        PreciseEarlyStopping(nepochs=-1, nbatches=-1),
    ]
    if rank == 0:
        pass
        # callbacks.append(tf.keras.callbacks.ModelCheckpoint(checkpoint_dir / 'checkpoint.h5', save_weights_only=False))

    # Write an initial checkpoint so the first event with reload=True has
    # something to load.
    if rank == 0:
        wrh.push('checkpoint')
        weights = checkpoint_dir / 'checkpoint.h5'
        model.save(weights)
        wrh.pop('checkpoint')

    #events.insert(0, Event(nepochs=0, nworkers=size, batch=32, reload=False))

    initial_epoch = 0
    for event in events:
        wrh.push('event')
        wrh.log('event', '%r', event)

        # A fresh optimizer is built and the model recompiled per event.
        opt = tf.keras.optimizers.Adam(0.001)
        print(f'{rank=} {opt.__class__ = }, {opt.__class__.__base__ = }')
        opt = hvd.DistributedOptimizer(
            opt,
            backward_passes_per_step=1,
            average_aggregated_gradients=True,
        )
        print(f'{rank=} {opt.__class__ = }, {opt.__class__.__base__ = }')
        # NOTE(review): rank == -1 never holds, so this no-op-optimizer
        # branch is effectively disabled debug code.
        if rank == -1:
            opt = create_no_op_optimizer(opt)
        print(f'{rank=} {opt.__class__ = }, {opt.__class__.__base__ = }')

        # old_allreduce = opt._allreduce
        # def _allreduce(grads):
        #     print(f'{rank=} {grads = }')
        #     return old_allreduce(grads)
        # opt._allreduce = _allreduce

        model.compile(
            optimizer=opt,
            metrics=['accuracy'],
            loss=tf.losses.CategoricalCrossentropy(from_logits=True),
            experimental_run_tf_function=False,
        )

        if event.reload:
            wrh.push('reload')
            print(f'Reloading weights')
            #weights = tf.train.latest_checkpoint(checkpoint_dir)
            weights = checkpoint_dir / 'checkpoint.h5'
            # NOTE(review): weights is a Path here, never None — this error
            # branch is dead code left over from the latest_checkpoint call
            # commented out above.
            if weights is None:
                print(f'Error! Could not load weights!')
                print(f'{checkpoint_dir = }')
                for path in checkpoint_dir.iterdir():
                    print(f'    {path = }')
                raise ValueError('Could not load weights')
            wrh.log('weights', '%r', weights)
            model = hvd.load_model(weights)
            wrh.pop('reload')

        wrh.push('train')
        model.fit(
            train_ds.repeat().batch(event.batch),
            steps_per_epoch=num_train // event.batch // event.nworkers // div,
            callbacks=callbacks,
            epochs=initial_epoch + event.nepochs,
            initial_epoch=initial_epoch,
            verbose=default_verbosity if hvd.rank() == 0 else 0,
        )
        wrh.pop('train')

        wrh.push('valid')
        stats = model.evaluate(
            valid_ds.repeat().batch(event.batch),
            steps=num_valid // event.batch // event.nworkers // div,
            callbacks=callbacks,
            verbose=default_verbosity if hvd.rank() == 0 else 0,
        )
        if rank == 0:
            print(f'stats = {" ".join(f"{name}={value}" for name, value in zip(model.metrics_names, stats))}')
        for name, value in zip(model.metrics_names, stats):
            wrh.log(name, '%r', value)
        wrh.pop('valid')

        if event.checkpoint and rank == 0:
            wrh.push('checkpoint')
            weights = checkpoint_dir / 'checkpoint.h5'
            model.save(weights)
            wrh.pop('checkpoint')
        # Keep ranks in lockstep before starting the next event.
        world.Barrier()

        initial_epoch += event.nepochs
        wrh.pop('event')

    wrh.pop('triple-r.py')
    if rank == 0:
        wrh.pop('worker')
        wrh.pop('master')