Example #1
    def test_load_model_broadcast(self):
        def create_model():
            opt = keras.optimizers.SGD(lr=0.01 * hvd.size(), momentum=0.9)
            opt = hvd.DistributedOptimizer(opt)

            model = keras.models.Sequential()
            model.add(keras.layers.Dense(2, input_shape=(3, )))
            model.add(keras.layers.RepeatVector(3))
            model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
            model.compile(loss=keras.losses.MSE,
                          optimizer=opt,
                          metrics=[keras.metrics.categorical_accuracy],
                          sample_weight_mode='temporal')

            return model

        with temppath() as fname:
            with self.session(config=self.config) as sess:
                K.set_session(sess)

                model = create_model()

                x = np.random.random((1, 3))
                y = np.random.random((1, 3, 3))
                model.train_on_batch(x, y)

                if hvd.rank() == 0:
                    model.save(fname)

            K.clear_session()
            with self.session(config=self.config) as sess:
                K.set_session(sess)

                weight = np.random.random((1, 3))

                if hvd.rank() == 0:
                    model = hvd.load_model(fname)
                else:
                    model = create_model()

                def generator():
                    while 1:
                        yield (x, y, weight)

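                # Only rank 0 loaded the checkpoint, so only its optimizer has
                # slot variables restored; the other ranks built a fresh model
                # and rely on the broadcast callback below to pick up rank 0's state.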
                if hvd.rank() == 0:
                    self.assertEqual(len(model.optimizer.weights), 5)
                else:
                    self.assertEqual(len(model.optimizer.weights), 0)

                # No assertions, we just need to verify that it doesn't hang
                callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
                model.fit_generator(generator(),
                                    steps_per_epoch=1,
                                    callbacks=callbacks,
                                    epochs=1,
                                    verbose=0,
                                    workers=4,
                                    initial_epoch=0)

                self.assertEqual(len(model.optimizer.weights), 5)
Example #2
    def test_load_model_custom_optimizers(self):
        class TestOptimizer(keras.optimizers.RMSprop):
            def __init__(self, **kwargs):
                super(TestOptimizer, self).__init__(**kwargs)

        with self.test_session(config=self.config) as sess:
            K.set_session(sess)

            opt = TestOptimizer(lr=0.0001)
            opt = hvd.DistributedOptimizer(opt)

            model = keras.models.Sequential()
            model.add(keras.layers.Dense(2, input_shape=(3, )))
            model.add(keras.layers.RepeatVector(3))
            model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
            model.compile(loss=keras.losses.MSE,
                          optimizer=opt,
                          metrics=[keras.metrics.categorical_accuracy],
                          sample_weight_mode='temporal')

            x = np.random.random((1, 3))
            y = np.random.random((1, 3, 3))
            model.train_on_batch(x, y)

            with temppath() as fname:
                model.save(fname)

                custom_optimizers = [TestOptimizer]
                new_model = hvd.load_model(fname,
                                           custom_optimizers=custom_optimizers)
                new_opt = new_model.optimizer

            self.assertEqual(type(new_opt).__module__, 'horovod._keras')
            self.assertEqual(type(new_opt).__name__, 'TestOptimizer')
            self._check_optimizer_weights(opt, new_opt)
Example #3
    def test_load_model(self):
        with self.test_session(config=self.config) as sess:
            K.set_session(sess)

            opt = keras.optimizers.RMSprop(lr=0.0001)
            opt = hvd.DistributedOptimizer(opt)

            model = keras.models.Sequential()
            model.add(keras.layers.Dense(2, input_shape=(3, )))
            model.add(keras.layers.RepeatVector(3))
            model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
            model.compile(loss=keras.losses.MSE,
                          optimizer=opt,
                          metrics=[keras.metrics.categorical_accuracy],
                          sample_weight_mode='temporal')

            x = np.random.random((1, 3))
            y = np.random.random((1, 3, 3))
            model.train_on_batch(x, y)

            with temppath() as fname:
                model.save(fname)

                new_model = hvd.load_model(fname)
                new_opt = new_model.optimizer

            self.assertEqual(type(new_opt).__module__, 'horovod._keras')
            self.assertEqual(type(new_opt).__name__, 'RMSprop')
            self.assertEqual(K.get_value(opt.lr), K.get_value(new_opt.lr))
            self._check_optimizer_weights(opt, new_opt)
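
Outside the test harness the same call is all that is needed to resume training from a checkpoint: hvd.load_model restores the saved optimizer state and rewraps the optimizer in hvd.DistributedOptimizer. A minimal sketch, assuming an already-initialized Horovod job and a model.h5 file saved from a compiled Keras model (the file name is illustrative):

import keras
import horovod.keras as hvd   # assumed import; the examples above may also use horovod.tensorflow.keras

hvd.init()

# Restores architecture, weights and optimizer state, and rewraps the optimizer
# in hvd.DistributedOptimizer so every rank can keep training from the same point.
model = hvd.load_model('model.h5')

# Custom optimizer subclasses are passed via custom_optimizers (see Example #2).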
Example #4
def inference(model_file: Path, dataset_dir: Path, args: dict, smlb_in: RuntimeIn, smlb_out: RuntimeOut) -> None:
    """
    Perform inference using a U-Net style model

    :param model_file: model weights file to load
    :param dataset_dir: path to find files for inference
    :param args: dictionary of user/environment arguments
    :param smlb_in: RuntimeIn instance for logging
    :param smlb_out: RuntimeOut instance for logging
    """
    console = smlb_out.log.console
    device = smlb_out.log.device

    crop_size = args['crop_size']

    output_dir = Path(smlb_in.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    console.message('Loading model {}'.format(model_file))
    assert Path(model_file).exists(), "Model file does not exist!"
    model = hvd.load_model(str(model_file))

    console.message('Getting file paths')
    file_paths = list(Path(dataset_dir).glob('**/S3A*.hdf'))
    assert len(file_paths) > 0, "Could not find any HDF files!"

    console.message('Preparing data loader')
    # Create data loader in single image mode. This turns off shuffling and
    # only yields batches of images for a single image at a time so they can be
    # reconstructed.
    data_loader = SLSTRDataLoader(file_paths, single_image=True, crop_size=crop_size)
    dataset = data_loader.to_dataset()

    console.begin('Inference Loop')
    for patches, file_name in dataset:
        file_name = Path(file_name.numpy().decode('utf-8'))
        device.message(f"Processing file {file_name}")
        console.message(f"Processing file {file_name}")

        # convert patches to a batch of patches
        n, ny, nx, _ = patches.shape
        patches = tf.reshape(patches, (n * nx * ny, PATCH_SIZE, PATCH_SIZE, N_CHANNELS))

        # perform inference on patches
        mask_patches = model.predict_on_batch(patches)

        # crop edge artifacts
        mask_patches = tf.image.crop_to_bounding_box(mask_patches, crop_size // 2, crop_size // 2, PATCH_SIZE - crop_size, PATCH_SIZE - crop_size)

        # reconstruct patches back to full size image
        mask_patches = tf.reshape(mask_patches, (n, ny, nx, PATCH_SIZE - crop_size, PATCH_SIZE - crop_size, 1))
        mask = reconstruct_from_patches(mask_patches, nx, ny, patch_size=PATCH_SIZE - crop_size)
        mask_name = (output_dir / file_name.name).with_suffix('.h5')

        with h5py.File(mask_name, 'w') as handle:
            handle.create_dataset('mask', data=mask)

    console.ended('Inference Loop')
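
Each pass through the loop writes one full-size mask to an HDF5 file containing a single 'mask' dataset. A short sketch of reading a result back (the file name below is hypothetical):

import h5py

# Open one of the masks written by the loop above and load it into memory.
with h5py.File('output/S3A_scene.h5', 'r') as handle:   # hypothetical output file
    mask = handle['mask'][:]
print(mask.shape)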
Example #5
def reload_last_checkpoint(checkpoint_format, n_epochs, distributed):
    """Finds and loads the last checkpoint matching the provided pattern"""
    # Count down from n_epochs to 0 to find the last epoch.
    # Note that keras names checkpoint files with epoch number starting from 1.
    # So the matched number corresponds to the new initial epoch.
    for epoch in range(n_epochs, 0, -1):
        checkpoint = checkpoint_format.format(epoch=epoch)
        if os.path.exists(checkpoint):
            logging.info('Found last checkpoint at %s', checkpoint)
            # Use special reload to prepare the DistributedOptimizer
            if distributed:
                model = hvd.load_model(checkpoint)
            else:
                model = tf.keras.models.load_model(checkpoint)
            return epoch, model
    raise Exception('Unable to find a checkpoint file at %s' %
                    checkpoint_format)
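
A usage sketch for the helper above, assuming checkpoints were written with a str.format-style {epoch} pattern (the directory layout and epoch count are illustrative):

import os
import horovod.tensorflow.keras as hvd   # assumed import; any hvd module providing load_model works

# Template keyed by epoch number, matching how the checkpoints were named.
checkpoint_format = os.path.join('checkpoints', 'model-{epoch:03d}.h5')

# Scans epochs 32..1 and loads the newest existing file; under Horovod the
# distributed flag routes the load through hvd.load_model.
initial_epoch, model = reload_last_checkpoint(checkpoint_format,
                                              n_epochs=32,
                                              distributed=hvd.size() > 1)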
Example #6
def main(events, make_model_fn, div, dataset, default_verbosity, data_dir, checkpoint_dir, log_to, log_info):
    world = MPI.COMM_WORLD
    rank = world.Get_rank()
    size = world.Get_size()

    wrh.open(str(log_to) % {
        'rank': rank,
        'size': size,
        'rank+1': rank + 1,
    }, 'a')

    if log_info is not None:
        wrh.load(log_info % {
            'rank': rank,
            'rank+1': rank + 1,
            'size': size,
        })
    else:
        if rank == 0:
            wrh.push('master')
            for i in range(1, size):
                wrh.push('worker')
                info = wrh.save()
                world.send(info, dest=i, tag=i)
                wrh.pop('worker')
            wrh.push('worker')
        else:
            info = world.recv(source=0, tag=rank)
            wrh.load(info)

    wrh.push('triple-r.py')
    wrh.log('rank', '%d', rank)
    wrh.log('size', '%d', size)
    wrh.log('model', '%s', make_model_fn)
    wrh.log('dataset', '%s', dataset)
    wrh.log('events', '%s', events)
    wrh.log('div', '%d', div)
    wrh.log('data_dir', '%s', data_dir)
    wrh.log('checkpoint_dir', '%s', checkpoint_dir)

    wrh.push('initialize horovod')
    hvd.init(world)
    wrh.pop('initialize horovod')

    wrh.log('hvd.mpi_threads_supported', '%r', hvd.mpi_threads_supported())
    assert hvd.mpi_threads_supported()

    wrh.log('_executing_eagerly', '%r', _executing_eagerly())


    is_emnist = dataset in ('emnist',)
    is_tiny_imagenet = dataset in ('tiny-imagenet',)

    wrh.push('loading dataset')
    train_ds = None
    valid_ds = None
    if is_emnist:
        datasets, info = tfds.load(
            dataset,
            split=None,
            with_info=True,
            as_supervised=True,
            data_dir=str(data_dir),
            download=True,
        )
        wrh.log('datasets', '%r', datasets)
        wrh.log('info', '%r', info)
        input_shape = info.features['image'].shape
        output_shape = info.features['label'].num_classes
        train_ds = datasets['train']
        valid_ds = datasets['valid']

        train_ds = train_ds.map(lambda img, label: (tf.image.convert_image_dtype(img, dtype=tf.float32), label))
        valid_ds = valid_ds.map(lambda img, label: (tf.image.convert_image_dtype(img, dtype=tf.float32), label))

        num_train = info.splits['train'].num_examples
        num_valid = info.splits['validation'].num_examples
    elif is_tiny_imagenet:
        # Training data iterator.
        input_shape = (224, 224, 3)
        output_shape = 200

        num_train = 100000
        num_valid = 10000

        train_dir = data_dir / "tiny-imagenet-200/train"
        valid_dir = data_dir / "tiny-imagenet-200/val"

        def drop_first_dimension(tensor: 'tf.Tensor') -> 'tf.Tensor':
            #print(f'{tensor = }')
            #shape = tensor.get_shape()
            #print(f'{shape = }')
            #tensor.set_shape(shape[1:])
            return tensor

        def add_first_dimension(tensor):
            shape = tensor.get_shape()
            tensor.set_shape([1, *shape])
            return tensor

        def debug(s: str, tensor: 'tf.Tensor') -> 'tf.Tensor':
            print(f'{s}: {tensor = } (type = {type(tensor)})')
            return tensor

        train_gen = tf.keras.preprocessing.image.ImageDataGenerator(
            width_shift_range=0.33, height_shift_range=0.33, zoom_range=0.5, horizontal_flip=True,
            preprocessing_function=tf.keras.applications.resnet50.preprocess_input)

        train_ds = tf.data.Dataset.from_generator(
            lambda: train_gen.flow_from_directory(train_dir,
                                                  batch_size=1,
                                                  target_size=input_shape[:-1]),
            output_signature=(tf.TensorSpec(shape=[1, *input_shape], dtype=tf.float32),
                              tf.TensorSpec(shape=(1, output_shape,), dtype=tf.int32)),
        ) \
            .unbatch()
            #.map(lambda x, y: (debug('before x', x), debug('before y', y))) \
            #.map(lambda x, y: (debug('after x', x), debug('after y', y))) \
            #.map(lambda x, y: (debug('unbatch x', x), debug('unbatch y', y))) \
            #.map(lambda x, y: (x, tf.expand_dims(y, axis=0))) \

        # Validation data iterator.
        valid_gen = tf.keras.preprocessing.image.ImageDataGenerator(
            zoom_range=(0.875, 0.875), preprocessing_function=tf.keras.applications.resnet50.preprocess_input)
        valid_ds = tf.data.Dataset.from_generator(
            lambda: valid_gen.flow_from_directory(valid_dir,
                                                  batch_size=1,
                                                  target_size=input_shape[:-1]),
            output_signature=(tf.TensorSpec(shape=[1, *input_shape], dtype=tf.float32),
                              tf.TensorSpec(shape=(1, output_shape,), dtype=tf.int32)),
        ) \
            .unbatch()
        
    wrh.pop('loading dataset')

    wrh.push('creating model')
    wrh.log('input_shape', '%r', input_shape)
    wrh.log('output_shape', '%r', output_shape)
    model = make_model_fn(
        input_shape=input_shape,
        output_shape=output_shape,
    )
    wrh.pop('creating model')

    callbacks = [
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        hvd.callbacks.MetricAverageCallback(),
        PreciseEarlyStopping(nepochs=-1, nbatches=-1),
    ]

    if rank == 0:
        pass # callbacks.append(tf.keras.callbacks.ModelCheckpoint(checkpoint_dir / 'checkpoint.h5', save_weights_only=False))

    if rank == 0:
        wrh.push('checkpoint')
        weights = checkpoint_dir / 'checkpoint.h5'
        model.save(weights)
        wrh.pop('checkpoint')

    #events.insert(0, Event(nepochs=0, nworkers=size, batch=32, reload=False))

    initial_epoch = 0
    for event in events:
        wrh.push('event')
        wrh.log('event', '%r', event)

        opt = tf.keras.optimizers.Adam(0.001)
        print(f'{rank=} {opt.__class__ = }, {opt.__class__.__base__ = }')

        opt = hvd.DistributedOptimizer(
            opt,
            backward_passes_per_step=1,
            average_aggregated_gradients=True,
        )
        print(f'{rank=} {opt.__class__ = }, {opt.__class__.__base__ = }')

        if rank == -1:
            opt = create_no_op_optimizer(opt)
            print(f'{rank=} {opt.__class__ = }, {opt.__class__.__base__ = }')

        # old_allreduce = opt._allreduce
        # def _allreduce(grads):
        #     print(f'{rank=} {grads = }')
        #     return old_allreduce(grads)
        # opt._allreduce = _allreduce

        model.compile(
            optimizer=opt,
            metrics=['accuracy'],
            loss=tf.losses.CategoricalCrossentropy(from_logits=True),
            experimental_run_tf_function=False,
        )

        if event.reload:
            wrh.push('reload')
            print('Reloading weights')
            #weights = tf.train.latest_checkpoint(checkpoint_dir)
            weights = checkpoint_dir / 'checkpoint.h5'
            if not weights.exists():
                print('Error! Could not load weights!')
                print(f'{checkpoint_dir = }')
                for path in checkpoint_dir.iterdir():
                    print(f'  {path = }')
                raise ValueError('Could not load weights')
            wrh.log('weights', '%r', weights)
            model = hvd.load_model(weights)
            wrh.pop('reload')

        wrh.push('train')
        model.fit(
            train_ds.repeat().batch(event.batch),
            steps_per_epoch=num_train // event.batch // event.nworkers // div,
            callbacks=callbacks,
            epochs=initial_epoch + event.nepochs,
            initial_epoch=initial_epoch,
            verbose=default_verbosity if hvd.rank() == 0 else 0,
        )
        wrh.pop('train')

        wrh.push('valid')
        stats = model.evaluate(
            valid_ds.repeat().batch(event.batch),
            steps=num_valid // event.batch // event.nworkers // div,
            callbacks=callbacks,
            verbose=default_verbosity if hvd.rank() == 0 else 0,
        )
        if rank == 0:
            print(f'stats = {" ".join(f"{name}={value}" for name, value in zip(model.metrics_names, stats))}')
        for name, value in zip(model.metrics_names, stats):
            wrh.log(name, '%r', value)
        wrh.pop('valid')

        if event.checkpoint and rank == 0:
            wrh.push('checkpoint')
            weights = checkpoint_dir / 'checkpoint.h5'
            model.save(weights)
            wrh.pop('checkpoint')

        world.Barrier()

        initial_epoch += event.nepochs

        wrh.pop('event')

    wrh.pop('triple-r.py')

    if rank == 0:
        wrh.pop('worker')
        wrh.pop('master')
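
The Event objects driving the loop are not defined in this excerpt; judging from the attributes the script reads (nepochs, nworkers, batch, reload, checkpoint), a minimal stand-in could look like the sketch below (the dataclass itself is an assumption, only the field names come from the code above):

from dataclasses import dataclass

@dataclass
class Event:
    nepochs: int              # epochs to train during this phase
    nworkers: int             # worker count used to scale steps per epoch
    batch: int                # per-worker batch size
    reload: bool = False      # reload checkpoint.h5 via hvd.load_model before training
    checkpoint: bool = False  # have rank 0 save checkpoint.h5 after the phase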