def apply_gradients(self, grads_and_vars, global_step=None, name=None):
  summed_grads_and_vars = []
  for (grad, var) in grads_and_vars:
    if grad is None:
      summed_grads_and_vars.append((grad, var))
    else:
      with ops.colocate_with(grad):
        # gradient accumulation
        if self._gradients_to_accumulate > 1 and not self._pipelining:
          grad = gen_poputil_ops.ipu_stateful_gradient_accumulate(
              grad / self._gradients_to_accumulate,
              num_mini_batches=self._gradients_to_accumulate)
        # replication
        if self._replicas > 1:
          grad = gen_poputil_ops.ipu_replication_normalise(
              cross_replica_ops.cross_replica_sum(grad))
        grad = math_ops.cast(grad, var.dtype)
        summed_grads_and_vars.append((grad, var))
  if self._pipelining:
    # can do weight decay here as apply_gradients is only called on last accumulation step
    summed_grads_and_vars = self.add_WD(summed_grads_and_vars)
  ret = self._optimizer.apply_gradients(summed_grads_and_vars, global_step, name)
  if self._sharded:
    sharding.propagate_sharding(ops.get_default_graph())
  return ret
def compute_gradients(self, loss, var_list=None, **kwargs):
  kwargs['colocate_gradients_with_ops'] = True
  grads_and_vars = self._optimizer.compute_gradients(loss, var_list=var_list, **kwargs)
  if not self._pipelining:
    grads_and_vars = self.add_WD(grads_and_vars)
  if self._gradients_to_accumulate > 1:
    grads_and_vars = [(grad / self._gradients_to_accumulate, var)
                      for grad, var in grads_and_vars]
  if self._sharded:
    sharding.propagate_sharding(ops.get_default_graph())
  return grads_and_vars
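The 1/N pre-scaling used in the two methods above is easiest to see with plain numbers: dividing every micro-batch gradient by the accumulation count before the stateful accumulator sums them yields the mean of the micro-batch gradients, i.e. the gradient a single large batch would have produced. A minimal, self-contained check (plain Python, not from the source; the toy values are illustrative only):

# Sketch: pre-scaling by 1/N turns a running sum of micro-batch
# gradients into their mean.
micro_batch_grads = [0.4, 0.8, 0.2, 0.6]   # toy per-micro-batch gradients
n = len(micro_batch_grads)                 # gradients_to_accumulate

accumulated = sum(g / n for g in micro_batch_grads)   # what the accumulator produces
mean = sum(micro_batch_grads) / n                     # gradient of one large batch

assert abs(accumulated - mean) < 1e-12
print(accumulated, mean)   # 0.5 0.5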
def compute_gradients(self, loss, var_list=None, **kwargs):
  if not var_list:
    var_list = self._var_list
  grads_and_vars = self._optimizer.compute_gradients(loss, var_list=var_list, **kwargs)
  if not self._pipelining:
    grads_and_vars = self.add_WD(grads_and_vars)
  if self._gradient_accumulation_count > 1:
    grads_and_vars = [(grad / self._gradient_accumulation_count, var)
                      for grad, var in grads_and_vars]
  if self._sharded:
    sharding.propagate_sharding(ops.get_default_graph())
  return grads_and_vars
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
  summed_grads_and_vars = []
  for (grad, var) in grads_and_vars:
    if grad is None:
      summed_grads_and_vars.append((grad, var))
    else:
      with ops.colocate_with(grad):
        # gradient accumulation
        if self._gradient_accumulation_count > 1 and not self._pipelining:
          grad = gen_poputil_ops.ipu_stateful_gradient_accumulate(
              grad,
              num_mini_batches=self._gradient_accumulation_count)
        # replication
        if self._replicas > 1:
          grad = gen_poputil_ops.ipu_replication_normalise(
              cross_replica_ops.cross_replica_sum(grad))
        # distribution with IPUMultiWorkerStrategy needs additional
        # normalisation by the number of workers
        if isinstance(distribute.get_strategy(),
                      ipu_multi_worker_strategy.IPUMultiWorkerStrategy):
          grad /= distribute.get_strategy().num_replicas_in_sync
        grad = math_ops.cast(grad, var.dtype)
        summed_grads_and_vars.append((grad, var))
  if self._pipelining:
    # can do weight decay here as apply_gradients is only called on last accumulation step
    summed_grads_and_vars = self.add_WD(summed_grads_and_vars)
  if self._grad_scale != 1.0:
    # don't rescale batch norm moving average statistics as they are not affected by loss scaling
    summed_grads_and_vars = [
        (grad, var) if 'batch_norm/moving_' in var.name else
        (grad / self._grad_scale, var)
        for grad, var in summed_grads_and_vars
    ]
  ret = self._optimizer.apply_gradients(summed_grads_and_vars, global_step, name)
  if self._sharded:
    sharding.propagate_sharding(ops.get_default_graph())
  return ret
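A sketch of how this wrapper might be driven inside a TF1-style training step. It assumes the methods above belong to a class here called IPUOptimizer whose constructor sets the attributes they reference (self._replicas, self._gradient_accumulation_count, self._pipelining, self._grad_scale, self._sharded); the class name and constructor signature are assumptions for illustration, not taken from the source.

import tensorflow.compat.v1 as tf

def build_train_op(loss, learning_rate):
  base = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)
  # Hypothetical constructor: the argument names mirror the attributes used in
  # the methods above and may differ from the real class.
  opt = IPUOptimizer(base,
                     sharded=False,
                     replicas=2,
                     gradient_accumulation_count=4,
                     pipelining=False,
                     grad_scale=1.0)
  # compute_gradients pre-scales by 1/gradient_accumulation_count;
  # apply_gradients accumulates, cross-replica sums, then applies the update.
  grads_and_vars = opt.compute_gradients(loss)
  return opt.apply_gradients(
      grads_and_vars, global_step=tf.train.get_or_create_global_step())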
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
  ret = self._optimizer.apply_gradients(grads_and_vars, global_step, name)
  sharding.propagate_sharding(ops.get_default_graph())
  return ret
def compute_gradients(self, loss, var_list=None, **kwargs):
  kwargs['colocate_gradients_with_ops'] = True
  ret = self._optimizer.compute_gradients(loss, var_list=var_list, **kwargs)
  sharding.propagate_sharding(ops.get_default_graph())
  return ret