def __init__(self, name, device_dense, device_sparse, compression,
             sparse_as_dense, config, aggregation_frequency,
             grad_updated_sizes_dict, profile_frequency, profile_filename,
             average_aggregated_gradients):
    """Initialize a distributed optimizer wrapper around a Keras optimizer.

    Args:
        name: Optional display name; defaults to "Distributed<Base>" where
            <Base> is the wrapped optimizer's class name.
        device_dense: Device to use for dense-tensor allreduce.
        device_sparse: Device to use for sparse-tensor allreduce.
        compression: Compression algorithm applied to allreduced tensors.
        sparse_as_dense: Whether to convert sparse gradients to dense before
            allreduce.
        config: Keyword arguments forwarded to the wrapped optimizer's
            ``__init__`` via ``super()``.
        aggregation_frequency: Number of steps to locally accumulate
            gradients between allreduces (graph mode only).
        grad_updated_sizes_dict: Passed through to
            LocalGradientAggregationHelper — presumably per-gradient sizes;
            TODO confirm semantics against that helper's definition.
        profile_frequency: Passed to TFProfileHelper — profiling cadence.
        profile_filename: Passed to TFProfileHelper — profiling output file.
        average_aggregated_gradients: Whether locally aggregated gradients
            are averaged (vs. summed) before being applied.

    NOTE(review): this block was reconstructed from whitespace-mangled
    source; the exact nesting (what lives inside the
    ``if not self._executing_eagerly`` branch) should be confirmed
    against the original file.
    """
    if name is None:
        # Derive a default name from the wrapped optimizer class,
        # e.g. "DistributedAdam".
        name = "Distributed%s" % self.__class__.__base__.__name__
    self._name = name
    self._device_dense = device_dense
    self._device_sparse = device_sparse
    self._compression = compression
    self._sparse_as_dense = sparse_as_dense
    # Flag consulted elsewhere to tell whether allreduce has happened yet.
    self._aggregated_gradients = False
    # We save the result of this because `get_gradients` and
    # `apply_gradients` do not execute eagerly.
    self._executing_eagerly = hvd._executing_eagerly()
    if not self._executing_eagerly:
        # Graph mode only: accumulate gradients locally and allreduce
        # every `aggregation_frequency` steps.
        self._agg_helper = LocalGradientAggregationHelper(
            aggregation_frequency,
            _make_allreduce_grads_fn(device_dense, device_sparse, compression),
            sparse_as_dense,
            grad_updated_sizes_dict,
            average_aggregated_gradients
        )
    self._profile_helper = TFProfileHelper(profile_frequency, profile_filename)
    # NOTE: `super(self.__class__, ...)` is deliberate here — this class is
    # created dynamically with the user's optimizer as its base.
    super(self.__class__, self).__init__(**config)
def __init__(self, **kwargs):
    """Initialize a distributed Keras optimizer with optional local gradient aggregation.

    NOTE(review): every name used below other than ``kwargs`` (``name``,
    ``device_dense``, ``device_sparse``, ``compression``,
    ``sparse_as_dense``, ``op``, ``gradient_predivide_factor``,
    ``backward_passes_per_step``, ``average_aggregated_gradients``) is a
    free variable — this ``__init__`` is evidently defined inside a
    factory function that closes over them; verify against the enclosing
    scope.

    Args:
        **kwargs: Forwarded unchanged to the wrapped optimizer's
            ``__init__`` via ``super()``.
    """
    # Default the display name from the wrapped optimizer class,
    # e.g. "DistributedAdam".
    self._name = name or "Distributed%s" % self.__class__.__base__.__name__
    # Flag consulted elsewhere to tell whether allreduce has happened yet.
    self._aggregated_gradients = False
    # Build the function that performs the cross-worker allreduce of grads.
    self._allreduce_grads = hvd._make_allreduce_grads_fn(
        self._name, device_dense, device_sparse, compression,
        sparse_as_dense, op, gradient_predivide_factor)
    self._agg_helper = None
    if backward_passes_per_step > 1:
        # Accumulate gradients locally for several backward passes before
        # each allreduce; eager and graph modes need different helpers.
        if hvd._executing_eagerly():
            self._agg_helper = LocalGradientAggregationHelperEager(
                backward_passes_per_step=backward_passes_per_step,
                allreduce_func=self._allreduce_grads,
                sparse_as_dense=sparse_as_dense,
                average_aggregated_gradients=average_aggregated_gradients,
            )
        else:
            self._agg_helper = LocalGradientAggregationHelper(
                backward_passes_per_step=backward_passes_per_step,
                allreduce_func=self._allreduce_grads,
                sparse_as_dense=sparse_as_dense,
                average_aggregated_gradients=average_aggregated_gradients,
                rank=rank(),
                optimizer_type=LocalGradientAggregationHelper._OPTIMIZER_TYPE_KERAS,
            )
    # NOTE: `super(self.__class__, ...)` is deliberate here — this class is
    # created dynamically with the user's optimizer as its base.
    super(self.__class__, self).__init__(**kwargs)
def on_batch_end(self, batch, logs=None):
    """Broadcast initial model/optimizer state from the root rank, exactly once.

    Runs after the first batch so variables exist; subsequent calls are
    no-ops thanks to the ``broadcast_done`` latch.
    """
    if self.broadcast_done:
        return
    with tf.device(self.device):
        is_eager_model = (hvd._executing_eagerly()
                          and hasattr(self.model, 'variables'))
        if is_eager_model:
            # TF 2.x / eager: broadcast model and optimizer variables directly.
            for group in (self.model.variables,
                          self.model.optimizer.variables()):
                hvd.broadcast_variables(group, root_rank=self.root_rank)
        else:
            # TF 1.x graph mode: run the global broadcast op in the session.
            broadcast_op = hvd.broadcast_global_variables(self.root_rank)
            self.backend.get_session().run(broadcast_op)
    self.broadcast_done = True
def test_broadcast_object_fn(self):
    """Verify hvd.broadcast_object_fn replicates a Python object from rank 0."""
    if LooseVersion(tf.__version__) < LooseVersion('1.15.0'):
        self.skipTest(
            "Broadcasting object requires TensorFlow 1.15 or above")
    if hvd._executing_eagerly() or _IS_TF2:
        # This code path is only exercised for TF 1.x in graph mode.
        return
    hvd.init()
    with tf.device("/cpu:0"):
        reference = {'hello': 123, 0: [1, 2]}
        # Only the root rank starts with the payload; others start empty.
        payload = reference if hvd.rank() == 0 else {}
        broadcast = hvd.broadcast_object_fn(root_rank=0)
        payload = broadcast(payload)
        self.assertDictEqual(payload, reference)
def _average_metrics_in_place(self, logs):
    """Allreduce-average every metric in ``logs`` across workers, writing the
    averaged values back into ``logs`` so later callbacks observe them.
    """
    logs = logs or {}
    averaged = {}
    # Iterate in sorted metric order so every worker issues its allreduce
    # calls in the same sequence.
    for metric, value in sorted(logs.items()):
        if hvd._executing_eagerly():
            averaged[metric] = hvd.allreduce(
                tf.constant(value, name=metric)).numpy()
            continue
        # Graph mode: build a feed variable + allreduce op once per metric,
        # then reuse them by assigning the fresh value on later calls.
        if metric in self.variables:
            self.backend.set_value(self.variables[metric], value)
        else:
            self.variables[metric], self.allreduce_ops[metric] = \
                self._make_variable(metric, value)
        averaged[metric] = self.backend.get_session().run(
            self.allreduce_ops[metric])
    # Overwrite the original entries with the reduced values.
    for metric, value in averaged.items():
        logs[metric] = value
def _eval(backend, op_or_result):
    """Return the concrete value of ``op_or_result``.

    In eager mode the argument already is the result; in graph mode it is a
    TF op/tensor that must be evaluated in the backend session.
    """
    eager = hvd._executing_eagerly()
    return op_or_result if eager else backend.get_session().run(op_or_result)
def test_elastic_state(self):
    """Exercise hvd.elastic.TensorFlowState: sync(), restore(), and commit().

    Each rank starts with different variable values and batch/epoch
    counters; after sync() everything must match the root rank (rank 0).
    restore() must roll back uncommitted changes; commit() must make
    changes survive a subsequent restore().
    """
    if LooseVersion(tf.__version__) < LooseVersion('1.15.0'):
        self.skipTest(
            "Broadcasting object requires TensorFlow 1.15 or above")
    if not hvd._executing_eagerly() and _IS_TF2:
        # Only support TF 2.0 in eager mode
        return
    hvd.init()
    with tf.device("/cpu:0"):
        # Rank 0 holds all-ones weights; other ranks hold all-twos, so a
        # successful sync is observable.
        v = 1.0 if hvd.rank() == 0 else 2.0
        weights1 = [np.array([[v, v], [v, v]]), np.array([v, v])]
        vars1 = [tf.Variable(arr) for arr in weights1]
        # A distinct second set of weights used to "dirty" the state.
        weights2 = [
            np.array([[1.0, 2.0], [3.0, 4.0]]),
            np.array([0.0, 0.0])
        ]
        if not hvd._executing_eagerly():
            # Graph mode needs explicit variable initialization.
            init = tf.global_variables_initializer()
            self.evaluate(init)
        # Batch/epoch also differ per rank; sync() should take rank 0's.
        state = hvd.elastic.TensorFlowState(vars1,
                                            batch=20 + hvd.rank(),
                                            epoch=10 + hvd.rank())
        state.sync()
        # Expected post-sync weights: rank 0's all-ones values.
        weights1 = [np.ones_like(w) for w in weights1]
        # After sync, all values should match the root rank
        for w in self.evaluate(vars1):
            self.assertAllClose(w, np.ones_like(w))
        assert state.batch == 20
        assert state.epoch == 10
        # Partially modify then restore
        self.assign(vars1, weights2)
        state.batch = 21
        state.epoch = 11
        state.restore()
        # restore() without commit() must revert to the synced values.
        for w1, w2 in zip(self.evaluate(vars1), weights1):
            self.assertAllClose(w1, w2)
        assert state.batch == 20
        assert state.epoch == 10
        # Partially modify then commit
        self.assign(vars1, weights2)
        state.batch = 21
        state.epoch = 11
        state.commit()
        state.restore()
        # After commit(), restore() must keep the committed values.
        for w1, w2 in zip(self.evaluate(vars1), weights2):
            self.assertAllClose(w1, w2)
        assert state.batch == 21
        assert state.epoch == 11
def main(events, make_model_fn, div, dataset, default_verbosity, data_dir,
         checkpoint_dir, log_to, log_info):
    """Driver for an elastic distributed-training experiment ("triple-r").

    Loads a dataset (emnist via tfds, or tiny-imagenet from disk), builds a
    model, then replays a list of `events`, each of which (re)compiles a
    Horovod DistributedOptimizer, optionally reloads a checkpoint, trains
    for `event.nepochs` epochs, evaluates, and optionally checkpoints.
    Structured logging goes through the `wrh` helper as push/pop frames.

    NOTE(review): this function was reconstructed from whitespace-mangled
    source; statement nesting marked below as ambiguous should be verified
    against the original file.

    Args:
        events: Iterable of event objects with fields nepochs, nworkers,
            batch, reload, checkpoint (inferred from attribute access —
            confirm against the Event definition).
        make_model_fn: Callable building a model from input/output shapes.
        div: Divisor applied to steps-per-epoch/eval steps (scales runs down).
        dataset: Dataset name; 'emnist' or 'tiny-imagenet'.
        default_verbosity: Keras verbosity used on rank 0 (other ranks: 0).
        data_dir: pathlib.Path to dataset storage.
        checkpoint_dir: pathlib.Path where 'checkpoint.h5' is written/read.
        log_to: %-format template for the per-rank wrh log file path.
        log_info: Optional %-format template of saved wrh state to resume.
    """
    world = MPI.COMM_WORLD
    rank = world.Get_rank()
    size = world.Get_size()
    # Open this rank's log file in append mode; the template may reference
    # rank, size, and rank+1.
    wrh.open(str(log_to) % {
        'rank': rank,
        'size': size,
        'rank+1': rank + 1,
    }, 'a')
    if log_info is not None:
        # Resume previously saved wrh frame state.
        wrh.load(log_info % {
            'rank': rank,
            'rank+1': rank + 1,
            'size': size,
        })
    else:
        if rank == 0:
            # Rank 0 creates a 'master' frame, then synthesizes a 'worker'
            # frame for each peer and ships the serialized state to it.
            wrh.push('master')
            for i in range(1, size):
                wrh.push('worker')
                info = wrh.save()
                world.send(info, dest=i, tag=i)
                wrh.pop('worker')
            # Rank 0's own 'worker' frame (popped at the end of main).
            wrh.push('worker')
        else:
            info = world.recv(source=0, tag=rank)
            wrh.load(info)
    wrh.push('triple-r.py')
    wrh.log('rank', '%d', rank)
    wrh.log('size', '%d', size)
    wrh.log('model', '%s', make_model_fn)
    wrh.log('dataset', '%s', dataset)
    wrh.log('events', '%s', events)
    wrh.log('div', '%d', div)
    wrh.log('data_dir', '%s', data_dir)
    wrh.log('checkpoint_dir', '%s', checkpoint_dir)

    wrh.push('initialize horovod')
    hvd.init(world)
    wrh.pop('initialize horovod')
    wrh.log('hvd.mpi_threads_supported', '%r', hvd.mpi_threads_supported())
    assert hvd.mpi_threads_supported()
    wrh.log('_executing_eagerly', '%r', _executing_eagerly())
    #print(f'{hvd.

    is_emnist = dataset in ('emnist',)
    is_tiny_imagenet = dataset in ('tiny-imagenet',)

    wrh.push('loading dataset')
    train_ds = None
    valid_ds = None
    if is_emnist:
        # EMNIST comes from tensorflow_datasets with images + labels.
        datasets, info = tfds.load(
            dataset,
            split=None,
            with_info=True,
            as_supervised=True,
            data_dir=str(data_dir),
            download=True,
        )
        wrh.log('datasets', '%r', datasets)
        wrh.log('info', '%r', info)
        input_shape = info.features['image'].shape
        output_shape = info.features['label'].num_classes
        train_ds = datasets['train']
        valid_ds = datasets['valid']
        # Convert uint8 images to float32 in [0, 1].
        train_ds = train_ds.map(lambda img, label: (tf.image.convert_image_dtype(img, dtype=tf.float32), label))
        valid_ds = valid_ds.map(lambda img, label: (tf.image.convert_image_dtype(img, dtype=tf.float32), label))
        num_train = info.splits['train'].num_examples
        num_valid = info.splits['validation'].num_examples
    elif is_tiny_imagenet:
        # Training data iterator.
        input_shape = (224, 224, 3)
        output_shape = 200
        num_train = 100000
        num_valid = 10000
        train_dir = data_dir / "tiny-imagenet-200/train"
        valid_dir = data_dir / "tiny-imagenet-200/val"

        # Debug helpers left over from shape troubleshooting; only `debug`
        # has live code, and none appear to be referenced below.
        def drop_first_dimension(tensor: 'tf.Tensor') -> 'tf.Tensor':
            #print(f'{tensor = }')
            #shape = tensor.get_shape()
            #print(f'{shape = }')
            #tensor.set_shape(shape[1:])
            return tensor

        def add_first_dimension(tensor):
            shape = tensor.get_shape()
            tensor.set_shape([1, *shape])
            return tensor

        def debug(s: str, tensor: 'tf.Tensor') -> 'tf.Tensor':
            print(f'{s}: {tensor = } (type = {type(tensor)})')
            return tensor

        # Keras generator with augmentation for training images.
        train_gen = tf.keras.preprocessing.image.ImageDataGenerator(
            width_shift_range=0.33,
            height_shift_range=0.33,
            zoom_range=0.5,
            horizontal_flip=True,
            preprocessing_function=tf.keras.applications.resnet50.preprocess_input)

        # Wrap the generator in a tf.data pipeline; batch_size=1 plus
        # .unbatch() yields one example at a time so the caller controls
        # batching.
        train_ds = tf.data.Dataset.from_generator(
            lambda: train_gen.flow_from_directory(train_dir, batch_size=1, target_size=input_shape[:-1]),
            output_signature=(tf.TensorSpec(shape=[1, *input_shape], dtype=tf.float32),
                              tf.TensorSpec(shape=(1, output_shape,), dtype=tf.int32)),
        ) \
            .unbatch()
        #.map(lambda x, y: (debug('before x', x), debug('before y', y))) \
        #.map(lambda x, y: (debug('after x', x), debug('after y', y))) \
        #.map(lambda x, y: (debug('unbatch x', x), debug('unbatch y', y))) \
        #.map(lambda x, y: (x, tf.expand_dims(y, axis=0))) \

        # Validation data iterator.
        valid_gen = tf.keras.preprocessing.image.ImageDataGenerator(
            zoom_range=(0.875, 0.875),
            preprocessing_function=tf.keras.applications.resnet50.preprocess_input)
        valid_ds = tf.data.Dataset.from_generator(
            lambda: valid_gen.flow_from_directory(valid_dir, batch_size=1, target_size=input_shape[:-1]),
            output_signature=(tf.TensorSpec(shape=[1, *input_shape], dtype=tf.float32),
                              tf.TensorSpec(shape=(1, output_shape,), dtype=tf.int32)),
        ) \
            .unbatch()
    wrh.pop('loading dataset')

    wrh.push('creating model')
    wrh.log('input_shape', '%r', input_shape)
    wrh.log('output_shape', '%r', output_shape)
    model = make_model_fn(
        input_shape=input_shape,
        output_shape=output_shape,
    )
    wrh.pop('creating model')

    callbacks = [
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        hvd.callbacks.MetricAverageCallback(),
        PreciseEarlyStopping(nepochs=-1, nbatches=-1),
    ]
    if rank == 0:
        pass
        # callbacks.append(tf.keras.callbacks.ModelCheckpoint(checkpoint_dir / 'checkpoint.h5', save_weights_only=False))

    # Save an initial checkpoint so the first reload event has a file.
    if rank == 0:
        wrh.push('checkpoint')
        weights = checkpoint_dir / 'checkpoint.h5'
        model.save(weights)
        wrh.pop('checkpoint')

    #events.insert(0, Event(nepochs=0, nworkers=size, batch=32, reload=False))

    initial_epoch = 0
    for event in events:
        wrh.push('event')
        wrh.log('event', '%r', event)
        # A fresh optimizer is built per event so backward_passes_per_step
        # and Horovod wrapping start clean each time.
        opt = tf.keras.optimizers.Adam(0.001)
        print(f'{rank=} {opt.__class__ = }, {opt.__class__.__base__ = }')
        opt = hvd.DistributedOptimizer(
            opt,
            backward_passes_per_step=1,
            average_aggregated_gradients=True,
        )
        print(f'{rank=} {opt.__class__ = }, {opt.__class__.__base__ = }')
        if rank == -1:
            # Dead branch (rank is never -1); kept as a debugging toggle.
            opt = create_no_op_optimizer(opt)
        print(f'{rank=} {opt.__class__ = }, {opt.__class__.__base__ = }')
        # old_allreduce = opt._allreduce
        # def _allreduce(grads):
        #     print(f'{rank=} {grads = }')
        #     return old_allreduce(grads)
        # opt._allreduce = _allreduce
        model.compile(
            optimizer=opt,
            metrics=['accuracy'],
            loss=tf.losses.CategoricalCrossentropy(from_logits=True),
            experimental_run_tf_function=False,
        )
        if event.reload:
            wrh.push('reload')
            print(f'Reloading weights')
            #weights = tf.train.latest_checkpoint(checkpoint_dir)
            weights = checkpoint_dir / 'checkpoint.h5'
            # NOTE(review): `weights` is a Path here and can never be None;
            # this guard only made sense with the commented-out
            # latest_checkpoint() call above.
            if weights is None:
                print(f'Error! Could not load weights!')
                print(f'{checkpoint_dir = }')
                for path in checkpoint_dir.iterdir():
                    print(f' {path = }')
                raise ValueError('Could not load weights')
            wrh.log('weights', '%r', weights)
            # hvd.load_model restores the model with a wrapped optimizer.
            model = hvd.load_model(weights)
            wrh.pop('reload')
        wrh.push('train')
        model.fit(
            train_ds.repeat().batch(event.batch),
            # Per-worker steps: total examples split by batch, worker count,
            # and the global down-scaling divisor.
            steps_per_epoch=num_train // event.batch // event.nworkers // div,
            callbacks=callbacks,
            epochs=initial_epoch + event.nepochs,
            initial_epoch=initial_epoch,
            verbose=default_verbosity if hvd.rank() == 0 else 0,
        )
        wrh.pop('train')
        wrh.push('valid')
        stats = model.evaluate(
            valid_ds.repeat().batch(event.batch),
            steps=num_valid // event.batch // event.nworkers // div,
            callbacks=callbacks,
            verbose=default_verbosity if hvd.rank() == 0 else 0,
        )
        if rank == 0:
            print(f'stats = {" ".join(f"{name}={value}" for name, value in zip(model.metrics_names, stats))}')
        # NOTE(review): reconstructed as running on every rank (wrh logging
        # is unconditional elsewhere in this function) — confirm nesting.
        for name, value in zip(model.metrics_names, stats):
            wrh.log(name, '%r', value)
        wrh.pop('valid')
        if event.checkpoint and rank == 0:
            wrh.push('checkpoint')
            weights = checkpoint_dir / 'checkpoint.h5'
            model.save(weights)
            wrh.pop('checkpoint')
        # Keep ranks in lock-step between events.
        world.Barrier()
        initial_epoch += event.nepochs
        wrh.pop('event')
    wrh.pop('triple-r.py')
    if rank == 0:
        # Close the frames opened (or loaded) at startup on the root rank.
        wrh.pop('worker')
        wrh.pop('master')