Example #1
    def _step(self, samples, labels, first_batch):
        self._lr_scheduler()

        with tf.GradientTape() as tape:
            probs = self._model(samples, training=True)
            loss = self._loss_fn(labels, probs)
            if self._amp:
                loss = self._embedding_optimizer.get_scaled_loss(loss)

        embedding_vars, dense_vars = sok.split_embedding_variable_from_others(self._model.trainable_variables)
        embedding_grads, dense_grads = tape.gradient(loss, [embedding_vars, dense_vars])
        if self._amp:
            embedding_grads = self._embedding_optimizer.get_unscaled_gradients(embedding_grads)
            dense_grads = self._embedding_optimizer.get_unscaled_gradients(dense_grads)

        # embedding_grads = [scale_grad(g, hvd.size()) for g in embedding_grads]

        with sok.OptimizerScope(embedding_vars):
            self._embedding_optimizer.apply_gradients(zip(embedding_grads, embedding_vars),
                                                      experimental_aggregate_gradients=False)

        # with tf.control_dependencies(embedding_grads):
        dense_grads = [hvd.allreduce(grad, op=hvd.Average, compression=hvd.compression.NoneCompressor) for grad in dense_grads]
        self._dense_optimizer.apply_gradients(zip(dense_grads, dense_vars),
                                              experimental_aggregate_gradients=False)

        if first_batch:
            hvd.broadcast_variables(dense_vars, root_rank=0)
            hvd.broadcast_variables(self._dense_optimizer.variables(), root_rank=0)

        return loss
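
The commented-out `scale_grad` call above refers to a helper defined elsewhere in the original script. A minimal sketch of what such a helper might look like (hypothetical; it divides a gradient by the worker count while keeping sparse `tf.IndexedSlices` gradients from embedding lookups sparse):

import tensorflow as tf

def scale_grad(grad, scale):
    # Hypothetical sketch: scale a gradient by 1/scale (e.g. scale = hvd.size()),
    # preserving IndexedSlices gradients instead of densifying them.
    if isinstance(grad, tf.IndexedSlices):
        return tf.IndexedSlices(grad.values / scale, grad.indices, grad.dense_shape)
    return grad / scale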
Example #2
    def train_step(features, labels, warmup_batch=False):
        with tf.GradientTape() as tape:
            output_map = model(features)
            crossentropy_loss, dice_loss = partial_losses(output_map, labels)
            added_losses = tf.add(crossentropy_loss,
                                  dice_loss,
                                  name="total_loss_ref")
            loss = added_losses + params.weight_decay * tf.add_n([
                tf.nn.l2_loss(v) for v in model.trainable_variables
                if 'batch_normalization' not in v.name
            ])

            if params.use_amp:
                loss = optimizer.get_scaled_loss(loss)
        tape = hvd.DistributedGradientTape(tape)
        gradients = tape.gradient(loss, model.trainable_variables)
        if params.use_amp:
            gradients = optimizer.get_unscaled_gradients(gradients)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        # Note: broadcast should be done after the first gradient step to ensure optimizer
        # initialization.
        if warmup_batch:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables(), root_rank=0)

        ce_loss(crossentropy_loss)
        f1_loss(dice_loss)
        return loss
Example #3
    def train_step(images, masks, first_batch=False):
        with tf.GradientTape() as tape:
            predicted = model(images)
            predicted = predicted[:, clip_offset:-clip_offset, clip_offset:-clip_offset]
            masks = masks[:, clip_offset:-clip_offset, clip_offset:-clip_offset]
            loss = bce(masks, predicted)
            train_loss_metric.update_state(loss)

        tape = hvd.DistributedGradientTape(tape)
        gradients = tape.gradient(
            loss, model.trainable_variables)

        optimizer.apply_gradients(
            zip(gradients, model.trainable_variables))

        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        #
        # Note: broadcast should be done after the first gradient step to ensure optimizer
        # initialization.
        if first_batch:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables(), root_rank=0)

        return predicted, masks
Example #4
    def train_step(self, data):
        """Perform a single training step."""
        x, beta = data
        start = time.time()
        with tf.GradientTape() as tape:
            states, accept_prob, sumlogdet = self((x, beta), training=True)
            loss = self.calc_losses(states, accept_prob)

            if self.aux_weight > 0:
                z = tf.random.normal(x.shape, dtype=x.dtype)
                states_, accept_prob_, _ = self((z, beta), training=True)
                loss_ = self.calc_losses(states_, accept_prob_)
                loss += loss_

        if NUM_RANKS > 1:
            tape = hvd.DistributedGradientTape(tape)

        grads = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))

        metrics = AttrDict({
            'dt': time.time() - start,
            'loss': loss,
            'accept_prob': accept_prob,
            'eps': self.eps,
            'beta': states.init.beta,
            'sumlogdet': sumlogdet.out,
        })

        if self.optimizer.iterations == 0 and NUM_RANKS > 1:
            hvd.broadcast_variables(self.variables, root_rank=0)
            hvd.broadcast_variables(self.optimizer.variables(), root_rank=0)

        return states.out.x, metrics
Example #5
    def train_step(inputs_tr, targets_tr, first_batch):
        print("Tracing update_step")
        print("inputs nodes", inputs_tr.nodes.shape)
        print("inputs edges", inputs_tr.edges.shape)
        print("input n_node", inputs_tr.n_node.shape)
        print(inputs_tr.nodes)
        with tf.GradientTape() as tape:
            outputs_tr = model(inputs_tr,
                               num_processing_steps_tr,
                               is_training=True)
            loss_ops_tr = loss_fcn(targets_tr, outputs_tr)
            loss_op_tr = tf.math.reduce_sum(loss_ops_tr) / tf.constant(
                num_processing_steps_tr, dtype=tf.float32)

        # Horovod: add Horovod Distributed GradientTape.
        if args.distributed:
            tape = hvd.DistributedGradientTape(tape)

        gradients = tape.gradient(loss_op_tr, model.trainable_variables)
        optimizer.apply(gradients, model.trainable_variables)

        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        #
        # Note: broadcast should be done after the first gradient step to ensure optimizer
        # initialization.
        if args.distributed and first_batch:
            hvd.broadcast_variables(model.trainable_variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables, root_rank=0)

        return loss_op_tr
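
Note that this example calls `optimizer.apply(gradients, ...)` and broadcasts `optimizer.variables` without parentheses, which matches the Sonnet optimizer API rather than `tf.keras`. A minimal sketch of the assumed setup (the optimizer choice is an assumption, not taken from the example):

import sonnet as snt

# Sonnet 2 optimizers expose apply(updates, parameters) and a `variables`
# property, unlike Keras optimizers (apply_gradients, variables()).
optimizer = snt.optimizers.Adam(learning_rate=1e-3)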
Example #6
  def train_step(inputs, first_batch):
    images, labels = inputs

    with tf.GradientTape() as tape:
      predictions = model(images, training=True)
      loss = loss_func(labels, predictions)
      loss += tf.reduce_sum(model.losses)
      loss_copy = loss
      # Scale the losses
      if precision == 'fp16':
        loss = loss * tf.cast(loss_scale, loss.dtype)

    tape = hvd.DistributedGradientTape(tape)

    old_grads = tape.gradient(loss, model.trainable_variables)

    # Unscale the grads
    if precision == 'fp16':
      loss_scale_reciprocal = 1. / loss_scale
      grads = [g * tf.cast(loss_scale_reciprocal, g.dtype)
               if g is not None else None for g in old_grads]
    else:
      grads = old_grads

    opt.apply_gradients(zip(grads, model.trainable_variables))

    train_top1.update_state(labels, predictions)
    train_top5.update_state(labels, predictions)

    if hvd.size() > 1 and first_batch:
      hvd.broadcast_variables(model.variables, root_rank=0)
      hvd.broadcast_variables(opt.variables(), root_rank=0)

    return loss_copy
Example #7
def training_step(images, labels, first_batch):
    with tf.GradientTape() as tape:
        probs = cifar10_model(images, training=True)
        loss_value = loss(labels, probs)

    # Horovod: add Horovod Distributed GradientTape.
    try:
        tape = hvd.DistributedGradientTape(tape)
    except Exception:
        print("no horovod")
    grads = tape.gradient(loss_value, cifar10_model.trainable_variables)
    opt.apply_gradients(zip(grads, cifar10_model.trainable_variables))

    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    #
    # Note: broadcast should be done after the first gradient step to ensure optimizer
    # initialization.
    if first_batch:
        try:
            hvd.broadcast_variables(cifar10_model.variables, root_rank=0)
            hvd.broadcast_variables(opt.variables(), root_rank=0)
        except Exception:
            print("no horovod")
    return loss_value
Example #8
def train_step(model, opt, loss_func, images, labels, first_batch, batch_size, mixup_alpha=0.0, fp32=False):
    images, labels = mixup(batch_size, mixup_alpha, images, labels)
    with tf.GradientTape() as tape:
        logits = model(images, training=True)
        loss_value = loss_func(labels, tf.cast(logits, tf.float32))
        loss_value += tf.add_n(model.losses)
        if not fp32:
            scaled_loss_value = opt.get_scaled_loss(loss_value)

    tape = hvd.DistributedGradientTape(tape, compression=hvd.Compression.fp16)
    if not fp32:
        grads = tape.gradient(scaled_loss_value, model.trainable_variables)
        grads = opt.get_unscaled_gradients(grads)
    else:
        grads = tape.gradient(loss_value, model.trainable_variables)
    opt.apply_gradients(zip(grads, model.trainable_variables))
    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)
    
    probs = layers.Activation('softmax', dtype='float32')(logits)
    top_1_pred = tf.squeeze(tf.math.top_k(probs, k=1)[1])
    sparse_labels = tf.cast(tf.math.argmax(labels, axis=1), tf.int32)
    top_1_accuracy = tf.math.reduce_sum(tf.cast(tf.equal(top_1_pred, sparse_labels), tf.int32))
    return loss_value, top_1_accuracy
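
The `mixup` helper above comes from the surrounding training script and is not shown here. A compatible sketch (hypothetical; it assumes `mixup_alpha > 0` and one-hot `labels`, and draws the Beta(alpha, alpha) weight from two Gamma samples) could look like:

import tensorflow as tf

def mixup(batch_size, alpha, images, labels):
    # Hypothetical sketch: Beta(alpha, alpha) sampled as x / (x + y) with x, y ~ Gamma(alpha, 1).
    x = tf.random.gamma([batch_size, 1], alpha)
    y = tf.random.gamma([batch_size, 1], alpha)
    mix = x / (x + y)
    mix = tf.maximum(mix, 1.0 - mix)  # keep the original sample dominant
    img_mix = tf.cast(tf.reshape(mix, [batch_size, 1, 1, 1]), images.dtype)
    lab_mix = tf.cast(mix, labels.dtype)
    # Blend each sample with the batch in reverse order.
    images = images * img_mix + images[::-1] * (1.0 - img_mix)
    labels = labels * lab_mix + labels[::-1] * (1.0 - lab_mix)
    return images, labels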
Example #9
 def train(self, dataset, total_batches=-1):
     """ Update the model in 1 epoch """
     train_step = self.train_step
     if self.hparams.enable_tf_function:
         logging.info(
             "please be patient, enable tf.function, it takes time ...")
         train_step = tf.function(train_step,
                                  input_signature=self.sample_signature)
     for batch, samples in enumerate(dataset.take(total_batches)):
         # train 1 step
         samples = self.model.prepare_samples(samples)
         loss, metrics = train_step(samples)
         # Horovod: broadcast initial variable states from rank 0 to all other processes.
         # This is necessary to ensure consistent initialization of all workers when
         # training is started with random weights or restored from a checkpoint.
         #
         # Note: broadcast should be done after the first gradient step to ensure optimizer
         # initialization.
         if batch == 0:
             hvd.broadcast_variables(self.model.trainable_variables,
                                     root_rank=0)
             hvd.broadcast_variables(self.optimizer.variables(),
                                     root_rank=0)
         if batch % self.hparams.log_interval == 0 and hvd.rank() == 0:
             logging.info(self.metric_checker(loss, metrics))
             self.model.reset_metrics()
Example #10
    def _train_step(inputs, labels, first_batch):
        with tf.GradientTape() as tape:
            logit, all_vectors = model(inputs, training=True)
            replica_loss = _replica_loss(labels, logit)
            if args.mixed_precision:
                _loss = emb_opt.get_scaled_loss(replica_loss)
            else:
                _loss = replica_loss

        emb_var, other_var = sok.split_embedding_variable_from_others(
            model.trainable_variables)
        emb_grads, grads = tape.gradient(_loss, [emb_var, other_var])
        if args.mixed_precision:
            emb_grads = emb_opt.get_unscaled_gradients(emb_grads)
            grads = emb_opt.get_unscaled_gradients(grads)

        if "plugin" not in args.optimizer:
            with sok.OptimizerScope(emb_var):
                emb_opt.apply_gradients(zip(emb_grads, emb_var),
                                        experimental_aggregate_gradients=False)
        else:
            emb_opt.apply_gradients(zip(emb_grads, emb_var),
                                    experimental_aggregate_gradients=False)

        with tf.control_dependencies(emb_grads):

            grads = [hvd.allreduce(grad) for grad in grads]
            dense_opt.apply_gradients(zip(grads, other_var))

            if first_batch:
                hvd.broadcast_variables(other_var, root_rank=0)
                hvd.broadcast_variables(dense_opt.variables(), root_rank=0)

            total_loss = hvd.allreduce(replica_loss)
        return total_loss, all_vectors
Example #11
def benchmark_step(first_batch):
    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Horovod: use DistributedGradientTape
    with tf.GradientTape() as tape:
        probs = model(data, training=True)
        loss = tf.losses.sparse_categorical_crossentropy(target, probs)

        if args.use_amp:
            loss = opt.get_scaled_loss(loss)

    # Horovod: add Horovod Distributed GradientTape.
    tape = hvd.DistributedGradientTape(tape, compression=compression)

    gradients = tape.gradient(loss, model.trainable_variables)

    if args.use_amp:
        gradients = opt.get_unscaled_gradients(gradients)

    opt.apply_gradients(zip(gradients, model.trainable_variables))

    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    #
    # Note: broadcast should be done after the first gradient step to ensure optimizer
    # initialization.
    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)
Example #12
def train_step(model, inputs, loss, amp, opt, init):
    with tf.GradientTape() as tape:
        [input_ids, input_mask, segment_ids, label_ids] = inputs
        # print(input_ids, input_ids.shape)
        outputs = model(
            input_ids,
            # input_ids=input_ids,
            attention_mask=input_mask,
            token_type_ids=segment_ids,
            position_ids=None,
            head_mask=None,
            inputs_embeds=None,
            training=True,
        )

        loss_value = loss(y_true=label_ids, y_pred=outputs[0])
        unscaled_loss = tf.stop_gradient(loss_value)
        if amp:
            loss_value = opt.get_scaled_loss(loss_value)
    tape = hvd.DistributedGradientTape(tape)
    gradients = tape.gradient(loss_value, model.trainable_variables)
    if amp:
        gradients = opt.get_unscaled_gradients(gradients)
    opt.apply_gradients(zip(gradients,
                            model.trainable_variables))  # , clip_norm=1.0)

    if init:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)

    return unscaled_loss, outputs  # , tape.gradient(loss_value, model.trainable_variables)
Example #13
def train_step(model, inputs, loss, amp, opt, init, v2=False, loss_class=None, fp16=False, clip_norm=1.0):
    with tf.GradientTape() as tape:
        [input_ids, input_mask, segment_ids, start_positions, end_positions, cls_index, p_mask, is_impossible] = inputs

        if not v2:
            is_impossible = None

        start_logits, end_logits, cls_logits = model(input_ids,
                                                     attention_mask=input_mask,
                                                     token_type_ids=segment_ids,
                                                     start_positions=start_positions,
                                                     end_positions=end_positions,
                                                     cls_index=cls_index,
                                                     p_mask=p_mask,
                                                     is_impossible=is_impossible,
                                                     position_ids=None,
                                                     head_mask=None,
                                                     inputs_embeds=None,
                                                     training=True,
                                                     )[0:3]

        # If we are on multi-GPU, squeeze away the extra dimension
        if len(start_positions.shape) > 1:
            start_positions = tf.squeeze(start_positions, axis=-1, name="squeeze_start_positions")
        if len(end_positions.shape) > 1:
            end_positions = tf.squeeze(end_positions, axis=-1, name="squeeze_end_positions")
        if is_impossible is not None and len(is_impossible.shape) > 1 and v2 and cls_logits is not None:
            is_impossible = tf.squeeze(is_impossible, axis=-1, name="squeeze_is_impossible")

        # sometimes the start/end positions are outside our model inputs, we ignore these terms
        ignored_index = start_logits.shape[1]
        start_positions = tf.clip_by_value(start_positions, 0, ignored_index, name="clip_start_positions")
        end_positions = tf.clip_by_value(end_positions, 0, ignored_index, name="clip_end_positions")

        start_loss = loss(y_true=start_positions, y_pred=tf.cast(start_logits, tf.float32))
        end_loss = loss(y_true=end_positions, y_pred=tf.cast(end_logits, tf.float32))
        loss_value = (start_loss + end_loss) / 2

        if v2:
            cls_loss_value = loss_class(y_true=is_impossible, y_pred=tf.cast(cls_logits, tf.float32))
            loss_value += cls_loss_value * 0.5

        unscaled_loss = tf.stop_gradient(loss_value)
        if amp:
            loss_value = opt.get_scaled_loss(loss_value)

    tape = hvd.DistributedGradientTape(tape, sparse_as_dense=True,
                                       compression=Compression.fp16 if fp16 else Compression.none)
    gradients = tape.gradient(loss_value, model.trainable_variables)
    if amp:
        gradients = opt.get_unscaled_gradients(gradients)
    (gradients, _) = tf.clip_by_global_norm(gradients, clip_norm=clip_norm)
    opt.apply_gradients(zip(gradients, model.trainable_variables))  # , clip_norm=1.0)

    if init:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)

    return unscaled_loss  # , outputs#, tape.gradient(loss_value, model.trainable_variables)
Example #14
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    tf.enable_eager_execution(config=config)

    mnist_model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(16, [3, 3], activation='relu'),
        tf.keras.layers.Conv2D(16, [3, 3], activation='relu'),
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(10)
    ])

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    (mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data()

    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(mnist_images[..., tf.newaxis] / 255,
                 tf.float32), tf.cast(mnist_labels, tf.int64)))
    dataset = dataset.shuffle(1000).batch(32)

    # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
    checkpoint_dir = './checkpoints'
    step_counter = tf.train.get_or_create_global_step()
    checkpoint = tf.train.Checkpoint(model=mnist_model,
                                     optimizer=opt,
                                     step_counter=step_counter)

    # Horovod: adjust number of steps based on number of GPUs.
    for (batch, (images,
                 labels)) in enumerate(dataset.take(20000 // hvd.size())):
        with tf.GradientTape() as tape:
            logits = mnist_model(images, training=True)
            loss_value = tf.losses.sparse_softmax_cross_entropy(labels, logits)

        # Horovod: broadcast initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        if batch == 0:
            hvd.broadcast_variables(mnist_model.variables, root_rank=0)

        # Horovod: add Horovod Distributed GradientTape.
        tape = hvd.DistributedGradientTape(tape)

        grads = tape.gradient(loss_value, mnist_model.variables)
        opt.apply_gradients(zip(grads, mnist_model.variables),
                            global_step=tf.train.get_or_create_global_step())
        if batch % 10 == 0 and hvd.local_rank() == 0:
            print('Step #%d\tLoss: %.6f' % (batch, loss_value))

    if hvd.rank() == 0:
        checkpoint.save(checkpoint_dir)
Example #15
 def train_first_step(inputs):
     images, labels = inputs
     with tf.GradientTape() as tape:
         probs = model(images, training=True)
         loss_value = loss(labels, probs)
     tape = hvd_tf.DistributedGradientTape(tape, compression=compression)
     grads = tape.gradient(loss_value, model.trainable_variables)
     opt.apply_gradients(zip(grads, model.trainable_variables))
     hvd_tf.broadcast_variables(model.variables, root_rank=0)
     hvd_tf.broadcast_variables(opt.variables(), root_rank=0)
Example #16
def train_step(model, opt, loss_func, images, labels, first_batch):
    with tf.GradientTape() as tape:
        probs = model(images, training=True)
        loss_value = loss_func(labels, probs)
    tape = hvd.DistributedGradientTape(tape, compression=hvd.Compression.fp16)
    grads = tape.gradient(loss_value, model.trainable_variables)
    opt.apply_gradients(zip(grads, model.trainable_variables))
    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)
    return loss_value
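
This is the minimal Horovod TF2 pattern; a driver along the following lines is what typically calls such a train_step (a sketch: `model`, `opt`, `loss_func`, and `dataset` are assumed to be defined elsewhere, as in the Horovod examples):

import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# Pin each process to a single GPU (one GPU per Horovod rank).
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

@tf.function
def wrapped_step(images, labels, first_batch):
    return train_step(model, opt, loss_func, images, labels, first_batch)

# Scale the number of steps down by the number of workers; the first batch
# triggers the broadcast inside train_step.
for batch, (images, labels) in enumerate(dataset.take(10000 // hvd.size())):
    loss_value = wrapped_step(images, labels, batch == 0)
    if batch % 100 == 0 and hvd.rank() == 0:
        print('Step #%d\tLoss: %.6f' % (batch, loss_value))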
Example #17
    def initialize(self, io_only=False):

        tf_trainer.initialize(self, io_only)

        # Here, we broadcast parameters from rank 0.

        # If the model was restored, this is correct.  If not,
        # this syncs everything up.
        # print(bcast)

        hvd.broadcast_variables(self._net.variables, root_rank=0)
        hvd.broadcast_variables(self._opt.variables(), root_rank=0)
Example #18
def train_one_step(config,
                   model,
                   optimizer,
                   features,
                   accumulator,
                   first_step,
                   take_step,
                   clip_norm=1.0):

    #Forward and Backward pass
    with tf.GradientTape() as tape:
        total_loss, eval_fn_inputs = model(features, is_training=True)
        unscaled_loss = tf.stop_gradient(total_loss)
        if config.amp:
            total_loss = optimizer.get_scaled_loss(total_loss)

    #Backpropagate gradients
    #tape = hvd.DistributedGradientTape(
    #    tape, sparse_as_dense=True,
    #    compression=Compression.fp16 if config.amp and config.fp16_compression else Compression.none)
    gradients = tape.gradient(total_loss, model.trainable_variables)

    #Get unscaled gradients if AMP
    if config.amp:
        gradients = optimizer.get_unscaled_gradients(gradients)

    #Accumulate gradients
    accumulator(gradients)
    #Need to call apply_gradients on the very first step irrespective of gradient accumulation
    #This is required for the optimizer to build its state
    if first_step or take_step:
        #All reduce and Clip the accumulated gradients
        allreduced_accumulated_gradients = [
            None if g is None else hvd.allreduce(
                g / tf.cast(config.gradient_accumulation_steps, g.dtype),
                compression=Compression.fp16 if config.amp
                and config.fp16_compression else Compression.none)
            for g in accumulator.gradients
        ]
        (clipped_accumulated_gradients,
         _) = tf.clip_by_global_norm(allreduced_accumulated_gradients,
                                     clip_norm=clip_norm)
        #Weight update
        optimizer.apply_gradients(
            zip(clipped_accumulated_gradients, model.trainable_variables))
        accumulator.reset()

    #broadcast model weights after the first train step
    if first_step:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(optimizer.variables(), root_rank=0)

    return unscaled_loss, eval_fn_inputs
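
The `accumulator` object above is created in the surrounding script; it is assumed to be callable with a list of gradients, to expose the running sums as `accumulator.gradients`, and to support `reset()`. A simplified, hypothetical sketch of such a gradient accumulator:

import tensorflow as tf

class GradientAccumulator:
    """Hypothetical sketch: sum per-variable gradients across micro-batches."""

    def __init__(self):
        self._gradients = []

    @property
    def gradients(self):
        return [g.value() if g is not None else None for g in self._gradients]

    def __call__(self, gradients):
        if not self._gradients:
            # Lazily create one accumulator variable per gradient.
            self._gradients = [
                tf.Variable(tf.zeros_like(g), trainable=False) if g is not None else None
                for g in gradients
            ]
        for acc, grad in zip(self._gradients, gradients):
            if acc is not None and grad is not None:
                acc.assign_add(grad)

    def reset(self):
        for acc in self._gradients:
            if acc is not None:
                acc.assign(tf.zeros_like(acc))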
Example #19
def train_step(images, labels, first_batch):
    gradients, predictions, loss = get_grads(images, labels, first_batch)
    gradients = [hvd.allreduce(g.reduce_mean()) for g in gradients]
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(optimizer.variables(), root_rank=0)

    train_loss(loss.reduce_mean())
    train_accuracy(labels, predictions.merge())
    return loss.reduce_mean()
Example #20
    def train_step(self, first_epoch):
        epoch_global_norm = tf.TensorArray(
            tf.float32,
            size=self.params['dataloader']["number_of_elements"],
            dynamic_size=False,
            clear_after_read=False,
        )
        epoch_loss_avg = tf.TensorArray(
            tf.float32,
            size=self.params['dataloader']["number_of_elements"],
            dynamic_size=False,
            clear_after_read=False,
        )

        for element in self.train_dataset.enumerate():
            index = tf.dtypes.cast(element[0], tf.int32)
            set = element[1]
            shape = [
                self.params['dataloader']['batch_size'], self.pixel_num,
                self.params['dataloader']['tomographic_bin_number']
            ]
            kappa_data = tf.boolean_mask(tf.transpose(set[0], perm=[0, 2, 1]),
                                         self.bool_mask,
                                         axis=1)
            kappa_data = tf.ensure_shape(kappa_data, shape)
            labels = set[1]
            # Add noise
            noise = tf.ensure_shape(self._make_noise(), shape)
            kappa_data = tf.math.add(kappa_data, noise)

            # Optimize the model
            with tf.GradientTape() as tape:
                loss_object = tf.keras.losses.MeanAbsoluteError()
                y_ = self.model(kappa_data, training=True)
                loss_value = loss_object(y_true=labels, y_pred=y_)
            if self.params['training']['distributed']:
                tape = hvd.DistributedGradientTape(tape)
            grads = tape.gradient(loss_value, self.model.trainable_variables)
            self.optimizer.apply_gradients(
                zip(grads, self.model.trainable_variables))

            if self.params['training']['distributed'] and index == 0 and first_epoch:
                hvd.broadcast_variables(self.model.variables, root_rank=0)
                hvd.broadcast_variables(self.optimizer.variables(),
                                        root_rank=0)

            epoch_loss_avg = epoch_loss_avg.write(index, loss_value)
            epoch_global_norm = epoch_global_norm.write(
                index, tf.linalg.global_norm(grads))

        return epoch_loss_avg.stack(), epoch_global_norm.stack()
Example #21
 def join_and_broadcast(self):
     hvd.join()
     if not self.args.benchmark:
         hvd.broadcast_variables(self.model.linear_model.variables, root_rank=0)
         hvd.broadcast_variables(self.model.dnn_model.variables, root_rank=0)
         hvd.broadcast_variables(self.wide_optimizer.variables(), root_rank=0)
         hvd.broadcast_variables(self.deep_optimizer.variables(), root_rank=0)
Example #22
def train_GAN(_batch_size, _training_epochs, global_size):

    tf.keras.mixed_precision.set_global_policy("mixed_float16")

    generator = Generator()

    random_input = numpy.random.uniform(-1, 1, [1, 100]).astype(numpy.float16)
    generated_image = generator(random_input)

    discriminator = Discriminator()
    classification = discriminator(generated_image)

    models = {"generator": generator, "discriminator": discriminator}

    opts = {
        "generator": tf.keras.optimizers.Adam(0.001),
        "discriminator": tf.keras.optimizers.RMSprop(0.0001)
    }

    if global_size != 1:
        hvd.broadcast_variables(generator.variables, root_rank=0)
        hvd.broadcast_variables(discriminator.variables, root_rank=0)
        hvd.broadcast_variables(opts['generator'].variables(), root_rank=0)
        hvd.broadcast_variables(opts['discriminator'].variables(), root_rank=0)

    train_loop(_batch_size, _training_epochs, models, opts, global_size)
Example #23
    def train_step(x, y, first_batch):
        with tf.GradientTape(persistent=True) as tape:
            y_pred = model(x, training=True)
            loss = compiled_loss(y, y_pred)
            linear_loss = wide_optimizer.get_scaled_loss(loss) if args.amp else loss
            deep_loss = deep_optimizer.get_scaled_loss(loss) if args.amp else loss

        if not args.cpu:
            tape = hvd.DistributedGradientTape(tape)

        for metric in metrics:
            metric.update_state(y, y_pred)

        linear_vars = model.linear_model.trainable_variables
        dnn_vars = model.dnn_model.trainable_variables
        linear_grads = tape.gradient(linear_loss, linear_vars)
        dnn_grads = tape.gradient(deep_loss, dnn_vars)
        if args.amp:
            linear_grads = wide_optimizer.get_unscaled_gradients(linear_grads)
            dnn_grads = deep_optimizer.get_unscaled_gradients(dnn_grads)

        wide_optimizer.apply_gradients(zip(linear_grads, linear_vars))
        deep_optimizer.apply_gradients(zip(dnn_grads, dnn_vars))
        if first_batch and not args.cpu:
            hvd.broadcast_variables(model.linear_model.variables, root_rank=0)
            hvd.broadcast_variables(model.dnn_model.variables, root_rank=0)
            hvd.broadcast_variables(wide_optimizer.variables(), root_rank=0)
            hvd.broadcast_variables(deep_optimizer.variables(), root_rank=0)
        return loss
Example #24
    def __call__(self, x, y):
        with tf.GradientTape(persistent=True) as tape:
            y_pred = self.model(x, training=True)
            loss = self.compiled_loss(y, y_pred)
            linear_loss = (
                self.wide_optimizer.get_scaled_loss(loss) if self.args.amp else loss
            )
            deep_loss = (
                self.deep_optimizer.get_scaled_loss(loss) if self.args.amp else loss
            )

        if not self.args.cpu:
            tape = hvd.DistributedGradientTape(tape, sparse_as_dense=True)

        linear_vars = self.model.linear_model.trainable_variables
        dnn_vars = self.model.dnn_model.trainable_variables
        linear_grads = tape.gradient(linear_loss, linear_vars)
        dnn_grads = tape.gradient(deep_loss, dnn_vars)
        if self.args.amp:
            linear_grads = self.wide_optimizer.get_unscaled_gradients(linear_grads)
            dnn_grads = self.deep_optimizer.get_unscaled_gradients(dnn_grads)

        self.wide_optimizer.apply_gradients(zip(linear_grads, linear_vars))
        self.deep_optimizer.apply_gradients(zip(dnn_grads, dnn_vars))

        if self.current_step_var == 0:
            hvd.broadcast_variables(self.model.linear_model.variables, root_rank=0)
            hvd.broadcast_variables(self.model.dnn_model.variables, root_rank=0)
            hvd.broadcast_variables(self.wide_optimizer.variables(), root_rank=0)
            hvd.broadcast_variables(self.deep_optimizer.variables(), root_rank=0)

        return loss
Example #25
def train_one_step(model, opt, x, y, step, loss_func, compression, opts):

    preprocess = PreProcess(opts)

    with tf.GradientTape(persistent=True) as tape:
        logits = model(x, training=True)
        loss = loss_func(y, logits)
        # scaled_loss = opt.get_scaled_loss(loss)

    # Horovod: add Horovod Distributed GradientTape.
    tape = hvd.DistributedGradientTape(tape, compression=compression,
                                       op=hvd.Average)  # ,device_sparse='/gpu:2', device_dense='/gpu:2')
    # scaled_gradients = tape.gradient(scaled_loss, model.trainable_variables)
    # grads = opt.get_unscaled_gradients(scaled_gradients)

    if opts.lr_scheduler == 'constant':
        lr = opts.base_lr
    elif opts.lr_scheduler == 'cosine':
        lr = cosine_decay_with_warmup(global_step=step,
                                      learning_rate_base=opts.base_lr,
                                      total_steps=opts.steps_per_epoch // 2,
                                      warmup_learning_rate=opts.warmup_learning_rate,
                                      warmup_steps=2 * hvd.size())
    elif opts.lr_scheduler == 'cyclic':
        lr = cyclic_learning_rate(global_step=step,
                                  base_lr=opts.min_lr,
                                  max_lr=opts.max_lr,
                                  step_size=opts.step_size,
                                  gamma=opts.gamma)
    else:
        raise NotImplementedError('Unsupported learning rate scheduling type')

    tf.keras.backend.set_value(opt.lr, lr)

    grads = tape.gradient(loss, model.trainable_variables)

    opt.apply_gradients(zip(grads, model.trainable_variables))

    if step == 0:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)

    if not opts.evaluate:
        lr = cosine_decay_with_warmup(global_step=step,
                                      learning_rate_base=0.001,
                                      warmup_learning_rate=0.00001,
                                      total_steps=opts.steps_per_epoch // 1,
                                      warmup_steps=2 * hvd.size())
        opt = tf.keras.optimizers.SGD(learning_rate=lr*hvd.size(), momentum=0.9, nesterov=True)
        grads = tape.gradient(loss, model.trainable_variables)

        opt.apply_gradients(zip(grads, model.trainable_variables))
        if step == 0:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(opt.variables(), root_rank=0)

    pred = tf.argmax(logits, axis=-1)
    
    del tape
    return loss, pred, opt
Example #26
    def train_step(self, y_sketch_gt, y_sketch_teacher, x_image, first_step):
        with tf.GradientTape() as tape:
            params = self.forward(y_sketch_teacher, x_image, training=True)[:-1]
            total_loss, pen_loss, offset_loss, pixel_loss, kl_loss = self.compute_loss(params, y_sketch_gt, x_image)

        if self._distributed:
            tape = hvd.DistributedGradientTape(tape)

        grads = tape.gradient(total_loss, self._encoder.trainable_variables + self._decoder.trainable_variables)
        self._optimizer.apply_gradients(zip(grads, self._encoder.trainable_variables + self._decoder.trainable_variables))

        if self._distributed and first_step:
            hvd.broadcast_variables(self._encoder.trainable_variables + self._decoder.trainable_variables, root_rank=0)
            hvd.broadcast_variables(self._optimizer.variables(), root_rank=0)

        return total_loss, pen_loss, offset_loss, pixel_loss, kl_loss
Example #27
    def on_batch_end(self, batch, logs=None):
        if self.broadcast_done:
            return

        with tf.device(self.device):
            if hvd._executing_eagerly() and hasattr(self.model, 'variables'):
                # TensorFlow 2.0 or TensorFlow eager
                hvd.broadcast_variables(self.model.variables,
                                        root_rank=self.root_rank)
                hvd.broadcast_variables(self.model.optimizer.variables(),
                                        root_rank=self.root_rank)
            else:
                bcast_op = hvd.broadcast_global_variables(self.root_rank)
                self.backend.get_session().run(bcast_op)

        self.broadcast_done = True
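
This callback reproduces the behaviour of Horovod's built-in broadcast callback for Keras. In a typical `model.fit` workflow it is not written by hand; the stock callback is registered instead (a sketch: `model` and `dataset` are assumed to be defined and compiled elsewhere):

import tensorflow as tf
import horovod.tensorflow.keras as hvd

hvd.init()

callbacks = [
    # Broadcast initial variable states from rank 0 to all other processes.
    hvd.callbacks.BroadcastGlobalVariablesCallback(root_rank=0),
]

# Save checkpoints only on rank 0 to prevent other workers from corrupting them.
if hvd.rank() == 0:
    callbacks.append(tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

model.fit(dataset, epochs=3, callbacks=callbacks,
          verbose=1 if hvd.rank() == 0 else 0)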
Example #28
def benchmark_step(first_batch):
    with tf.GradientTape() as tape:
        probs = model(data, training=True)
        loss = tf.losses.categorical_crossentropy(target, probs)

    gradients = tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(gradients, model.trainable_variables))

    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    #
    # Note: broadcast should be done after the first gradient step to ensure optimizer
    # initialization.
    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)
Example #29
def benchmark_step(dataset_inputs, first_batch=False):
    x_input, y_label = dataset_inputs
    y_label = tf.reshape(y_label, (-1, 1))
    with tf.GradientTape() as tape:
        prediction = model(x_input, training=True)
        loss = tf.losses.sparse_categorical_crossentropy(y_label, prediction)
    # Horovod: add Horovod Distributed GradientTape for reduction:=============#
    tape = hvd.DistributedGradientTape(tape)
    gradients = tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(gradients, model.trainable_variables))
    # Horovod: broadcast initial variable states from rank 0 to all other
    # processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    # Note: broadcast should be done after the first gradient step to ensure
    # optimizer initialization.
    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)
Example #30
def train_step(images, labels, first_batch):
    gradients, loss, predictions = get_grads(images, labels)

    # Rubik: Accumulate the gradients across microbatches
    # Horovod: Allreduce the accumulated gradients
    gradients = [hvd.allreduce(g.accumulate()) for g in gradients]
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    # Horovod: Broadcast the variables after first batch 
    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(optimizer.variables(), root_rank=0)

    # Rubik: Average the loss across microbatches
    train_loss(loss.reduce_mean())

    # Rubik: Merge predictions across microbatches
    train_accuracy(labels, predictions.merge())
    return loss.reduce_mean()