Example #1
    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        summed_grads_and_vars = []
        for (grad, var) in grads_and_vars:
            if grad is None:
                summed_grads_and_vars.append((grad, var))
            else:
                with ops.colocate_with(grad):
                    # gradient accumulation
                    if self._gradients_to_accumulate > 1 and not self._pipelining:
                        grad = gen_poputil_ops.ipu_stateful_gradient_accumulate(
                            grad / self._gradients_to_accumulate,
                            num_mini_batches=self._gradients_to_accumulate)

                    # replication
                    if self._replicas > 1:
                        grad = gen_poputil_ops.ipu_replication_normalise(
                            cross_replica_ops.cross_replica_sum(grad))

                    grad = math_ops.cast(grad, var.dtype)
                    summed_grads_and_vars.append((grad, var))

        if self._pipelining:
            # weight decay can be applied here because apply_gradients is only called on the last accumulation step
            summed_grads_and_vars = self.add_WD(summed_grads_and_vars)

        ret = self._optimizer.apply_gradients(summed_grads_and_vars,
                                              global_step, name)
        if self._sharded:
            sharding.propagate_sharding(ops.get_default_graph())
        return ret
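
The division by _gradients_to_accumulate before the call to ipu_stateful_gradient_accumulate in Example #1 is what makes the accumulated gradient a mean over the accumulation steps rather than a sum. A minimal NumPy sketch of that arithmetic (illustrative only, no IPU ops; the values are made up):

import numpy as np

num_mini_batches = 4                      # stands in for self._gradients_to_accumulate
mini_batch_grads = [np.array([1.0, 2.0]),
                    np.array([3.0, 4.0]),
                    np.array([5.0, 6.0]),
                    np.array([7.0, 8.0])]

# Sum the pre-scaled gradients, mirroring
# ipu_stateful_gradient_accumulate(grad / num_mini_batches, ...).
accumulated = np.zeros(2)
for g in mini_batch_grads:
    accumulated += g / num_mini_batches

# The accumulated value equals the mean gradient over the mini-batches.
assert np.allclose(accumulated, np.mean(mini_batch_grads, axis=0))
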
Example #2
 def compute_gradients(self, loss, var_list=None, **kwargs):
     kwargs['colocate_gradients_with_ops'] = True
     grads_and_vars = self._optimizer.compute_gradients(loss,
                                                        var_list=var_list,
                                                        **kwargs)
     if not self._pipelining:
         grads_and_vars = self.add_WD(grads_and_vars)
     if self._gradients_to_accumulate > 1:
         grads_and_vars = [(grad / self._gradients_to_accumulate, var)
                           for grad, var in grads_and_vars]
     if self._sharded:
         sharding.propagate_sharding(ops.get_default_graph())
     return grads_and_vars
Example #3
 def compute_gradients(self, loss, var_list=None, **kwargs):
     if not var_list:
         var_list = self._var_list
     grads_and_vars = self._optimizer.compute_gradients(loss,
                                                        var_list=var_list,
                                                        **kwargs)
     if not self._pipelining:
         grads_and_vars = self.add_WD(grads_and_vars)
     if self._gradient_accumulation_count > 1:
         grads_and_vars = [(grad / self._gradient_accumulation_count, var)
                           for grad, var in grads_and_vars]
     if self._sharded:
         sharding.propagate_sharding(ops.get_default_graph())
     return grads_and_vars
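
Examples #2 and #3 show the same compute_gradients pattern under two attribute names (_gradients_to_accumulate vs. _gradient_accumulation_count): force gradient/op colocation, optionally add weight decay, pre-scale the gradients for later accumulation, and propagate sharding. The sketch below is a hypothetical, stripped-down version of that pattern built only on the standard TF1 optimizer API; the class name and arguments are illustrative, and the weight-decay (add_WD) and IPU sharding steps are left out.

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

class AccumulationScalingWrapper(tf.train.Optimizer):
    """Hypothetical wrapper that pre-scales gradients for later accumulation."""

    def __init__(self, optimizer, gradient_accumulation_count=1,
                 name="AccumulationScalingWrapper"):
        super(AccumulationScalingWrapper, self).__init__(False, name)
        self._optimizer = optimizer
        self._gradient_accumulation_count = gradient_accumulation_count

    def compute_gradients(self, loss, var_list=None, **kwargs):
        kwargs['colocate_gradients_with_ops'] = True
        grads_and_vars = self._optimizer.compute_gradients(
            loss, var_list=var_list, **kwargs)
        if self._gradient_accumulation_count > 1:
            # Pre-scale so that summing over the accumulation steps yields a mean.
            grads_and_vars = [(grad / self._gradient_accumulation_count, var)
                              for grad, var in grads_and_vars
                              if grad is not None]
        return grads_and_vars

    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        return self._optimizer.apply_gradients(grads_and_vars, global_step, name)

# Usage sketch: minimize() ties compute_gradients and apply_gradients together.
w = tf.get_variable("w", initializer=2.0)
loss = tf.square(w)
opt = AccumulationScalingWrapper(tf.train.GradientDescentOptimizer(0.1),
                                 gradient_accumulation_count=4)
train_op = opt.minimize(loss)
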
Example #4
    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        summed_grads_and_vars = []
        for (grad, var) in grads_and_vars:
            if grad is None:
                summed_grads_and_vars.append((grad, var))
            else:
                with ops.colocate_with(grad):
                    # gradient accumulation
                    if self._gradient_accumulation_count > 1 and not self._pipelining:
                        grad = gen_poputil_ops.ipu_stateful_gradient_accumulate(
                            grad,
                            num_mini_batches=self._gradient_accumulation_count)

                    # replication
                    if self._replicas > 1:
                        grad = gen_poputil_ops.ipu_replication_normalise(
                            cross_replica_ops.cross_replica_sum(grad))

                    # distribution with IPUMultiWorkerStrategy needs additional normalisation by the number of workers
                    if isinstance(
                            distribute.get_strategy(),
                            ipu_multi_worker_strategy.IPUMultiWorkerStrategy):
                        grad /= distribute.get_strategy().num_replicas_in_sync

                    grad = math_ops.cast(grad, var.dtype)
                    summed_grads_and_vars.append((grad, var))

        if self._pipelining:
            # weight decay can be applied here because apply_gradients is only called on the last accumulation step
            summed_grads_and_vars = self.add_WD(summed_grads_and_vars)

        if self._grad_scale != 1.0:
            # don't rescale batch norm moving average statistics as they are not affected by loss scaling
            summed_grads_and_vars = [
                (grad, var) if 'batch_norm/moving_' in var.name else
                (grad / self._grad_scale, var)
                for grad, var in summed_grads_and_vars
            ]
        ret = self._optimizer.apply_gradients(summed_grads_and_vars,
                                              global_step, name)
        if self._sharded:
            sharding.propagate_sharding(ops.get_default_graph())
        return ret
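
The _grad_scale division in Example #4 undoes loss scaling: when the loss is multiplied by a constant before backpropagation, every gradient comes out multiplied by the same constant, so dividing by that constant in apply_gradients recovers the original gradient (batch-norm moving-average statistics are skipped because they are not affected by loss scaling). The same idea shows up in the IPUMultiWorkerStrategy branch, which adds a further division by num_replicas_in_sync. A small NumPy illustration of the arithmetic, with made-up values:

import numpy as np

grad_scale = 128.0                        # stands in for self._grad_scale
true_grad = np.array([0.02, -0.5])        # gradient of the unscaled loss
scaled_grad = true_grad * grad_scale      # what backprop through the scaled loss yields

# Dividing by grad_scale in apply_gradients restores the original gradient.
assert np.allclose(scaled_grad / grad_scale, true_grad)

# Assuming the per-worker gradients are summed across workers, dividing by
# the worker count recovers the per-worker mean.
num_workers = 2
summed = true_grad * num_workers          # stands in for the cross-worker sum
assert np.allclose(summed / num_workers, true_grad)
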
Example #5
 def apply_gradients(self, grads_and_vars, global_step=None, name=None):
   ret = self._optimizer.apply_gradients(grads_and_vars, global_step, name)
   sharding.propagate_sharding(ops.get_default_graph())
   return ret
Example #6
 def compute_gradients(self, loss, var_list=None, **kwargs):
   kwargs['colocate_gradients_with_ops'] = True
   ret = self._optimizer.compute_gradients(loss, var_list=var_list, **kwargs)
   sharding.propagate_sharding(ops.get_default_graph())
   return ret
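
Examples #5 and #6 are thin delegating wrappers: Example #6 forces colocate_gradients_with_ops, Example #5 simply forwards apply_gradients, and both propagate IPU sharding afterwards. Below is a self-contained illustration of the plain two-step TF1 flow they wrap, using only standard TensorFlow 1.x APIs; the sharding.propagate_sharding call is omitted because it comes from the IPU-specific TensorFlow extensions.

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

x = tf.get_variable("x", initializer=3.0)
loss = tf.square(x)

opt = tf.train.GradientDescentOptimizer(learning_rate=0.1)

# Step 1: compute the gradients (what Example #6 forwards, with colocation forced on).
grads_and_vars = opt.compute_gradients(loss, colocate_gradients_with_ops=True)

# Step 2: apply them (what Example #5 forwards).
train_op = opt.apply_gradients(grads_and_vars,
                               global_step=tf.train.get_or_create_global_step())

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_op)                    # one gradient-descent step on x
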