def __call__(self, y_true, y_pred, sample_weight=None):
    """Computes the loss for a batch and applies weighting and reduction.

    Args:
      y_true: Ground truth values.
      y_pred: The predicted values.
      sample_weight: Optional `Tensor` whose rank is either 0, or the same
        rank as `y_true`, or is broadcastable to `y_true`. It acts as a
        coefficient for the loss: a scalar scales the whole loss, a tensor of
        size `[batch_size]` rescales the total loss of each sample, and a
        tensor matching the shape of `y_pred` scales each measurable element
        of `y_pred` individually.

    Returns:
      Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the
      same shape as `y_true`; otherwise, it is scalar.

    Raises:
      ValueError: If the shape of `sample_weight` is invalid.
    """
    # A wrapped lambda reports '<lambda>' as its name; the angle brackets are
    # not accepted in a scope name, so substitute a plain identifier.
    if self.name == '<lambda>':
        scope_name = 'lambda'
    else:
        scope_name = self.name

    graph_ctx = tf_utils.graph_context_for_symbolic_tensors(
        y_true, y_pred, sample_weight)
    with K.name_scope(scope_name or self.__class__.__name__), graph_ctx:
        unweighted = self.call(y_true, y_pred)
        reduction = self._get_reduction()
        return losses_utils.compute_weighted_loss(
            unweighted, sample_weight, reduction=reduction)
def compute_weighted_loss(losses,
                          sample_weight=None,
                          reduction=ReductionV2.SUM_OVER_BATCH_SIZE,
                          name=None):
    """Multiplies `losses` by `sample_weight` and reduces the result.

    Args:
      losses: `Tensor` of shape `[batch_size, d1, ... dN]`.
      sample_weight: Optional `Tensor` whose rank is either 0, or the same
        rank as `losses`, or be broadcastable to `losses`. Defaults to 1.0
        when omitted.
      reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to
        loss. Default value is `SUM_OVER_BATCH_SIZE`.
      name: Optional name for the op.

    Raises:
      ValueError: If the shape of `sample_weight` is not compatible with
        `losses`.

    Returns:
      Weighted loss `Tensor` of the same type as `losses`. If `reduction` is
      `NONE`, this has the same shape as `losses`; otherwise, it is scalar.
    """
    ReductionV2.validate(reduction)

    # A direct caller (e.g. canned estimators) passing 'AUTO' gets
    # 'SUM_OVER_BATCH_SIZE'.
    if reduction == ReductionV2.AUTO:
        reduction = ReductionV2.SUM_OVER_BATCH_SIZE
    if sample_weight is None:
        sample_weight = 1.0

    with K.name_scope(name or 'weighted_loss'):
        # Remember the reduction on the graph so that loss normalization can
        # consult it when distributing to multiple replicas (estimator + v1
        # optimizer flow only).
        graph = ops.get_default_graph()
        graph._last_loss_reduction = reduction  # pylint: disable=protected-access

        # Align the dimensions of `sample_weight` with `losses` if possible.
        losses, _, sample_weight = squeeze_or_expand_dimensions(
            losses, None, sample_weight)

        losses = ops.convert_to_tensor(losses)
        result_dtype = losses.dtype
        # The arithmetic below is carried out in float32.
        losses = math_ops.cast(losses, dtypes.float32)
        sample_weight = math_ops.cast(sample_weight, dtypes.float32)

        try:
            # Broadcast weights if possible.
            sample_weight = weights_broadcast_ops.broadcast_weights(
                sample_weight, losses)
        except ValueError:
            # Broadcasting failed: average away the trailing loss axes so the
            # losses have the same number of dimensions as the weights.
            loss_rank = K.ndim(losses)
            weight_rank = K.ndim(sample_weight)
            trailing_axes = list(range(weight_rank, loss_rank))
            losses = K.mean(losses, axis=trailing_axes)

        sample_weight.shape.assert_is_compatible_with(losses.shape)
        weighted = math_ops.multiply(losses, sample_weight)

        # Reduce the individual weighted losses, then convert the result back
        # to the dtype the caller supplied.
        reduced = reduce_weighted_loss(weighted, reduction)
        return math_ops.cast(reduced, result_dtype)
def _distributed_apply(self, distribution, grads_and_vars, name):
    """`apply_gradients` using a `DistributionStrategy`.

    Sum-reduces the gradients through `distribution`, applies each reduced
    gradient to its variable, and then increments `self._iterations`.

    Args:
      distribution: Strategy object whose `extended` API performs the
        reduction and the per-variable updates.
      grads_and_vars: (gradient, variable) pairs. It is iterated twice, so it
        must not be a one-shot iterator.
      name: Optional scope name; falls back to `self._name`.

    Returns:
      In graph mode, or when any update op is symbolic, the op of the
      iteration increment (gated on all update ops); otherwise the result of
      `self._iterations.assign_add(1)`.
    """
    summed_grads = distribution.extended.batch_reduce_to(
        ds_reduce_util.ReduceOp.SUM, grads_and_vars)
    variables = [v for _, v in grads_and_vars]
    grads_and_vars = zip(summed_grads, variables)

    def apply_grad_to_update_var(var, grad):
        """Apply gradient to variable."""
        if isinstance(var, ops.Tensor):
            raise NotImplementedError("Trying to update a Tensor ", var)

        has_constraint = var.constraint is not None if isinstance(
            grad, ops.IndexedSlices) else None
        if has_constraint is not None:
            # Sparse gradient path.
            if has_constraint:
                raise RuntimeError(
                    "Cannot use a constraint function on a sparse variable.")
            return self._resource_apply_sparse_duplicate_indices(
                grad.values, var, grad.indices)

        dense_update = self._resource_apply_dense(grad, var)
        if var.constraint is None:
            return dense_update
        # Re-project the variable through its constraint once the dense update
        # has run.
        with ops.control_dependencies([dense_update]):
            return var.assign(var.constraint(var))

    update_ops = []
    with backend.name_scope(name or self._name):
        for grad, var in grads_and_vars:
            if ops.executing_eagerly_outside_functions():
                suffix = ""
            else:
                suffix = "_" + var.op.name
            with backend.name_scope("update" + suffix):
                per_var_ops = distribution.extended.update(
                    var, apply_grad_to_update_var, args=(grad,), group=False)
                update_ops.extend(per_var_ops)

        any_symbolic = any(
            isinstance(candidate, ops.Operation) or
            tf_utils.is_symbolic_tensor(candidate)
            for candidate in update_ops)
        if any_symbolic or not context.executing_eagerly():
            # In graph mode, or when any update op is symbolic, the step
            # update has to be carried out under a graph context (eager
            # updates execute immediately).
            update_graph = ops._get_graph_from_inputs(update_ops)  # pylint: disable=protected-access
            with update_graph.as_default():
                with ops.control_dependencies(update_ops):
                    return self._iterations.assign_add(1).op

        return self._iterations.assign_add(1)
def __init__(self, optimizer, iterations=None):  # pylint: disable=super-init-not-called
    """Wraps `optimizer` and sets up the iteration counter.

    Args:
      optimizer: The optimizer instance being wrapped; it is tracked under the
        name 'optimizer'.
      iterations: Optional existing counter. When omitted, a fresh int64
        variable initialised to 0 is created under a name scope named after
        this class.
    """
    self.optimizer = optimizer
    self._track_checkpointable(optimizer, name='optimizer')

    if iterations is not None:
        self.iterations = iterations
    else:
        with K.name_scope(self.__class__.__name__):
            self.iterations = K.variable(0, dtype='int64', name='iterations')

    # Whichever counter is in use is tracked as 'global_step'.
    self._track_checkpointable(self.iterations, name='global_step')
def __init__(self, lr=0.01, momentum=0., decay=0., nesterov=False, **kwargs):
    """Creates the SGD hyperparameter variables.

    Args:
      lr: Initial value of the `lr` variable.
      momentum: Initial value of the `momentum` variable.
      decay: Initial value of the `decay` variable; also kept unchanged as
        `initial_decay`.
      nesterov: Stored as-is on the instance.
      **kwargs: Forwarded to the parent constructor.
    """
    super(SGD, self).__init__(**kwargs)

    with K.name_scope(self.__class__.__name__):
        self.iterations = K.variable(0, dtype='int64', name='iterations')
        # Each hyperparameter becomes a backend variable named after its
        # attribute; creation order is lr, momentum, decay.
        for attr, value in (('lr', lr), ('momentum', momentum),
                            ('decay', decay)):
            setattr(self, attr, K.variable(value, name=attr))

    self.initial_decay = decay
    self.nesterov = nesterov
def __init__(self, lr=0.01, epsilon=None, decay=0., **kwargs):
    """Creates the Adagrad hyperparameter variables.

    Args:
      lr: Initial value of the `lr` variable.
      epsilon: Stored as-is; `K.epsilon()` is used when `None`.
      decay: Initial value of the `decay` variable; also kept unchanged as
        `initial_decay`.
      **kwargs: Forwarded to the parent constructor.
    """
    super(Adagrad, self).__init__(**kwargs)

    with K.name_scope(self.__class__.__name__):
        # Creation order is lr, decay, then the iteration counter.
        for attr, value in (('lr', lr), ('decay', decay)):
            setattr(self, attr, K.variable(value, name=attr))
        self.iterations = K.variable(0, dtype='int64', name='iterations')

    self.epsilon = K.epsilon() if epsilon is None else epsilon
    self.initial_decay = decay
def _separable_conv_block(ip,
                          filters,
                          kernel_size=(3, 3),
                          strides=(1, 1),
                          block_id=None):
    """Stacks two [relu -> separable conv -> batchnorm] stages on `ip`.

    Arguments:
      ip: Input tensor
      filters: Number of output filters per layer
      kernel_size: Kernel size of separable convolutions
      strides: Strides of the first separable convolution (the second one
        uses the layer default)
      block_id: String block_id, interpolated into scope and layer names

    Returns:
      A Keras tensor
    """
    if K.image_data_format() == 'channels_first':
        channel_dim = 1
    else:
        channel_dim = -1

    # Settings shared by both convolutions / both normalizations.
    conv_kwargs = dict(
        padding='same', use_bias=False, kernel_initializer='he_normal')
    bn_kwargs = dict(axis=channel_dim, momentum=0.9997, epsilon=1e-3)

    with K.name_scope('separable_conv_block_%s' % block_id):
        # Stage 1: only this convolution receives `strides`.
        tensor = Activation('relu')(ip)
        conv_1 = SeparableConv2D(
            filters,
            kernel_size,
            strides=strides,
            name='separable_conv_1_%s' % block_id,
            **conv_kwargs)
        tensor = conv_1(tensor)
        bn_1 = BatchNormalization(
            name='separable_conv_1_bn_%s' % block_id, **bn_kwargs)
        tensor = bn_1(tensor)

        # Stage 2.
        tensor = Activation('relu')(tensor)
        conv_2 = SeparableConv2D(
            filters,
            kernel_size,
            name='separable_conv_2_%s' % block_id,
            **conv_kwargs)
        tensor = conv_2(tensor)
        bn_2 = BatchNormalization(
            name='separable_conv_2_bn_%s' % block_id, **bn_kwargs)
        tensor = bn_2(tensor)
    return tensor
def __init__(self,
             lr=0.002,
             beta_1=0.9,
             beta_2=0.999,
             epsilon=None,
             decay=0.,
             **kwargs):
    """Creates the Adamax hyperparameter variables.

    Args:
      lr: Initial value of the `lr` variable.
      beta_1: Initial value of the `beta_1` variable.
      beta_2: Initial value of the `beta_2` variable.
      epsilon: Stored as-is; `K.epsilon()` is used when `None`.
      decay: Initial value of the `decay` variable; also kept unchanged as
        `initial_decay`.
      **kwargs: Forwarded to the parent constructor.
    """
    super(Adamax, self).__init__(**kwargs)

    hyperparameters = (
        ('lr', lr),
        ('beta_1', beta_1),
        ('beta_2', beta_2),
        ('decay', decay),
    )
    with K.name_scope(self.__class__.__name__):
        self.iterations = K.variable(0, dtype='int64', name='iterations')
        # Each hyperparameter becomes a backend variable named after its
        # attribute, created in the order listed above.
        for attr, value in hyperparameters:
            setattr(self, attr, K.variable(value, name=attr))

    self.epsilon = K.epsilon() if epsilon is None else epsilon
    self.initial_decay = decay
def __init__(self,
             lr=0.002,
             beta_1=0.9,
             beta_2=0.999,
             epsilon=None,
             schedule_decay=0.004,
             **kwargs):
    """Creates the Nadam hyperparameter variables.

    Args:
      lr: Initial value of the `lr` variable.
      beta_1: Initial value of the `beta_1` variable.
      beta_2: Initial value of the `beta_2` variable.
      epsilon: Stored as-is; `K.epsilon()` is used when `None`.
      schedule_decay: Stored as-is on the instance (not a backend variable).
      **kwargs: Forwarded to the parent constructor.
    """
    super(Nadam, self).__init__(**kwargs)

    with K.name_scope(self.__class__.__name__):
        self.iterations = K.variable(0, dtype='int64', name='iterations')
        self.m_schedule = K.variable(1., name='m_schedule')
        # Remaining hyperparameters become backend variables named after
        # their attribute, in this order.
        for attr, value in (('lr', lr), ('beta_1', beta_1),
                            ('beta_2', beta_2)):
            setattr(self, attr, K.variable(value, name=attr))

    self.epsilon = K.epsilon() if epsilon is None else epsilon
    self.schedule_decay = schedule_decay
def _eager_metrics_fn(model, outputs, targets):
    """Calculates the metrics for each output of the given model.

    Arguments:
        model: The model on which metrics are being calculated. Its
            `outputs`, `nested_metrics`, `loss_functions`, `output_names` and
            `metrics_names` attributes are read.
        outputs: The outputs of the given model. A non-list value is treated
            as a single-output list.
        targets: The targets of the given model. A non-list value is treated
            as a single-target list.

    Returns:
        A flat list holding the mean of every metric result, ordered by model
        output and then by the metrics configured for that output. Metric
        names are not returned; any name not yet present is appended to
        `model.metrics_names` as a side effect.
    """
    if not isinstance(outputs, list):
        outputs = [outputs]
    if not isinstance(targets, list):
        targets = [targets]

    metric_results = []
    for i, model_output in enumerate(model.outputs):
        for nested_output_metric in model.nested_metrics[i]:
            metric_name, metric_fn = _get_metrics_info(
                nested_output_metric, backend.int_shape(model_output),
                model.loss_functions[i])

            # Multi-output models prefix the metric with the output's name.
            if len(model.output_names) > 1:
                metric_name = model.output_names[i] + '_' + metric_name
            if metric_name not in model.metrics_names:
                model.metrics_names.append(metric_name)

            with backend.name_scope(metric_name):
                metric_result = metric_fn(targets[i], outputs[i])
                metric_results.append(backend.mean(metric_result))

    return metric_results
def _eager_metrics_fn(model, outputs, targets):
    """Calculates the metrics for each output of the given model.

    Arguments:
        model: The model on which metrics are being calculated. Its
            `outputs`, `nested_metrics`, `loss_functions`, `output_names` and
            `metrics_names` attributes are read.
        outputs: The outputs of the given model. A non-list value is treated
            as a single-output list.
        targets: The targets of the given model. A non-list value is treated
            as a single-target list.

    Returns:
        A flat list holding the mean of every metric result, ordered by model
        output and then by the metrics configured for that output. Metric
        names are not returned; any name not yet present is appended to
        `model.metrics_names` as a side effect.
    """
    if not isinstance(outputs, list):
        outputs = [outputs]
    if not isinstance(targets, list):
        targets = [targets]

    metric_results = []
    for i, model_output in enumerate(model.outputs):
        for nested_output_metric in model.nested_metrics[i]:
            metric_name, metric_fn = _get_metrics_info(
                nested_output_metric, backend.int_shape(model_output),
                model.loss_functions[i])

            # Multi-output models prefix the metric with the output's name.
            if len(model.output_names) > 1:
                metric_name = model.output_names[i] + '_' + metric_name
            if metric_name not in model.metrics_names:
                model.metrics_names.append(metric_name)

            with backend.name_scope(metric_name):
                metric_result = metric_fn(targets[i], outputs[i])
                metric_results.append(backend.mean(metric_result))

    return metric_results
def __init__(self,
             lr=0.001,
             beta_1=0.9,
             beta_2=0.999,
             epsilon=None,
             decay=0.,
             amsgrad=False,
             multipliers=None,
             debug_verbose=False,
             **kwargs):
    """Creates the AdvancedAdam hyperparameter variables and settings.

    Args:
      lr: Initial value of the `lr` variable.
      beta_1: Initial value of the `beta_1` variable.
      beta_2: Initial value of the `beta_2` variable.
      epsilon: Stored as-is; `K.epsilon()` is used when `None`.
      decay: Initial value of the `decay` variable; also kept unchanged as
        `initial_decay`.
      amsgrad: Stored as-is on the instance.
      multipliers: Stored as-is on the instance.
      debug_verbose: Stored as-is on the instance.
      **kwargs: Forwarded to the parent constructor.
    """
    super(AdvancedAdam, self).__init__(**kwargs)

    hyperparameters = (
        ('lr', lr),
        ('beta_1', beta_1),
        ('beta_2', beta_2),
        ('decay', decay),
    )
    with K.name_scope(self.__class__.__name__):
        self.iterations = K.variable(0, dtype='int64', name='iterations')
        # Each hyperparameter becomes a backend variable named after its
        # attribute, created in the order listed above.
        for attr, value in hyperparameters:
            setattr(self, attr, K.variable(value, name=attr))

    self.epsilon = K.epsilon() if epsilon is None else epsilon
    self.initial_decay = decay
    self.amsgrad = amsgrad
    self.multipliers = multipliers
    self.debug_verbose = debug_verbose
def get_gradients(self, loss, params):
    """Computes the gradients of `loss` for every entry of `params`.

    Arguments:
      loss: Loss tensor.
      params: List of variables (nested structures are flattened).

    Returns:
      List of gradient tensors, clipped by `self.clipnorm` and/or
      `self.clipvalue` when the instance has those attributes.

    Raises:
      ValueError: In case any gradient cannot be computed (e.g. if gradient
        function not implemented).
    """
    params = nest.flatten(params)
    gradient_scope = backend.name_scope(self._name + "/gradients")
    with backend.get_graph().as_default(), gradient_scope:
        grads = gradients.gradients(loss, params)

        for grad, param in zip(grads, params):
            if grad is not None:
                continue
            raise ValueError(
                "Variable {} has `None` for gradient. "
                "Please make sure that all of your ops have a "
                "gradient defined (i.e. are differentiable). "
                "Common ops without gradient: "
                "K.argmax, K.round, K.eval.".format(param))

        # Norm clipping, when configured, runs before value clipping.
        if hasattr(self, "clipnorm"):
            grads = [clip_ops.clip_by_norm(g, self.clipnorm) for g in grads]
        if hasattr(self, "clipvalue"):
            grads = [
                clip_ops.clip_by_value(g, -self.clipvalue, self.clipvalue)
                for g in grads
            ]
    return grads
def __call__(self, ip):
    """Applies two [relu -> separable conv 1D -> batchnorm] stages to `ip`.

    When `self.strides == 2` the input of the first convolution is
    zero-padded by `self.kernel_size` and that convolution uses 'valid'
    padding; otherwise it uses 'same' padding. The second convolution always
    uses 'same' padding and the layer's default strides.

    Args:
      ip: Input tensor.

    Returns:
      The output tensor of the second batch normalization.
    """
    block_id = self.block_id
    # Settings shared by both convolutions / both normalizations.
    conv_kwargs = dict(use_bias=False, kernel_initializer='he_normal')
    bn_kwargs = dict(momentum=0.9997, epsilon=1e-3)

    with backend.name_scope('separable_conv_block_%s' % block_id):
        tensor = layers.Activation('relu')(ip)

        conv_pad = 'same'
        if self.strides == 2:
            pad_layer = layers.ZeroPadding1D(
                padding=self.kernel_size,
                name='separable_conv_1_pad_%s' % block_id)
            tensor = pad_layer(tensor)
            conv_pad = 'valid'

        conv_1 = layers.SeparableConv1D(
            self.filters,
            self.kernel_size,
            strides=self.strides,
            name='separable_conv_1_%s' % block_id,
            padding=conv_pad,
            **conv_kwargs)
        tensor = conv_1(tensor)
        bn_1 = layers.BatchNormalization(
            name='separable_conv_1_bn_%s' % block_id, **bn_kwargs)
        tensor = bn_1(tensor)

        tensor = layers.Activation('relu')(tensor)
        conv_2 = layers.SeparableConv1D(
            self.filters,
            self.kernel_size,
            name='separable_conv_2_%s' % block_id,
            padding='same',
            **conv_kwargs)
        tensor = conv_2(tensor)
        bn_2 = layers.BatchNormalization(
            name='separable_conv_2_bn_%s' % block_id, **bn_kwargs)
        tensor = bn_2(tensor)
    return tensor
def inference(self, inputs, *args, **kwargs):
    """Runs `self._inference` on `inputs` with layer-call bookkeeping.

    NumPy arrays and Python scalars in `inputs` are converted to tensors,
    `mask` and `training` keyword arguments are resolved and injected when
    the caller did not pass them, the layer is built if needed, and
    `self._inference` is invoked either symbolically (when every input is a
    symbolic tensor) or eagerly.

    Arguments:
      inputs: Input tensor(s), possibly nested.
      *args: Extra positional arguments forwarded to `self._inference`.
      **kwargs: Extra keyword arguments forwarded to `self._inference`.
        `training` and `mask` may be added or removed here.

    Returns:
      The outputs produced by `self._inference` (or by `self._symbolic_call`
      for dynamic layers in the symbolic path).

    Raises:
      ValueError: If a `RaggedTensor` is passed to a layer whose
        `_supports_ragged_inputs` is `False`, or if the symbolic call returns
        `None`.
      TypeError: If a non-dynamic layer raises
        `OperatorNotAllowedInGraphError` during the symbolic call.
    """
    call_context = base_layer_utils.call_context()
    input_list = nest.flatten(inputs)

    # We will attempt to build a TF graph if & only if all inputs are symbolic.
    # This is always the case in graph mode. It can also be the case in eager
    # mode when all inputs can be traced back to `keras.Input()` (when building
    # models using the functional API).
    build_graph = tf_utils.are_all_symbolic_tensors(input_list)

    # Accept NumPy and scalar inputs by converting to Tensors.
    if any(isinstance(x, (np.ndarray, float, int)) for x in input_list):
        def _convert_non_tensor(x):
            # Don't call `ops.convert_to_tensor` on all `inputs` because
            # `SparseTensors` can't be converted to `Tensor`.
            if isinstance(x, (np.ndarray, float, int)):
                return ops.convert_to_tensor(x)
            return x
        inputs = nest.map_structure(_convert_non_tensor, inputs)
        input_list = nest.flatten(inputs)

    # Handle `mask` propagation from previous layer to current layer. Masks can
    # be propagated explicitly via the `mask` argument, or implicitly via
    # setting the `_keras_mask` attribute on the inputs to a Layer. Masks passed
    # explicitly take priority.
    mask_arg_passed_by_framework = False
    input_masks = self._collect_input_masks(inputs, args, kwargs)
    if (self._expects_mask_arg and input_masks is not None and
            not self._call_arg_was_passed('mask', args, kwargs)):
        mask_arg_passed_by_framework = True
        kwargs['mask'] = input_masks

    # If `training` argument was not explicitly passed, propagate `training`
    # value from this layer's calling layer.
    training_arg_passed_by_framework = False
    # Priority 1: `training` was explicitly passed.
    if self._call_arg_was_passed('training', args, kwargs):
        training_value = self._get_call_arg_value('training', args, kwargs)
        # The value is still recorded in the call context below, but it is not
        # forwarded to a call function that does not accept it.
        if not self._expects_training_arg:
            kwargs.pop('training')
    else:
        training_value = None
        # Priority 2: `training` was passed to a parent layer.
        if call_context.training is not None:
            training_value = call_context.training
        # Priority 3a: `learning_phase()` has been set.
        elif backend.global_learning_phase_is_set():
            training_value = backend.learning_phase()
        # Priority 3b: Pass the `learning_phase()` if in the Keras FuncGraph.
        elif build_graph:
            with backend.get_graph().as_default():
                if base_layer_utils.is_in_keras_graph():
                    training_value = backend.learning_phase()

        if self._expects_training_arg and training_value is not None:
            # Force the training_value to be bool type which matches to the contract
            # for layer/model call args.
            if tensor_util.is_tensor(training_value):
                training_value = math_ops.cast(training_value, dtypes.bool)
            else:
                training_value = bool(training_value)
            kwargs['training'] = training_value
            training_arg_passed_by_framework = True

    # Only create Keras history if at least one tensor originates from a
    # `keras.Input`. Otherwise this Layer may be being used outside the Keras
    # framework.
    if build_graph and base_layer_utils.needs_keras_history(inputs):
        base_layer_utils.create_keras_history(inputs)

    # Clear eager losses on top level model call.
    # We are clearing the losses only on the top level model call and not on
    # every layer/model call because layer/model may be reused.
    if (base_layer_utils.is_in_eager_or_tf_function() and
            not call_context.in_call):
        self._clear_losses()

    with call_context.enter(self, inputs, build_graph, training_value):
        # Check input assumptions set after layer building, e.g. input shape.
        if build_graph:
            # Symbolic execution on symbolic tensors. We will attempt to build
            # the corresponding TF subgraph inside `backend.get_graph()`
            # TODO(reedwm): We should assert input compatibility after the inputs
            # are casted, not before.
            input_spec.assert_input_compatibility(self.input_spec, inputs,
                                                  self.name)
            if (any(isinstance(x, ragged_tensor.RaggedTensor) for x in input_list)
                    and self._supports_ragged_inputs is False):  # pylint: disable=g-bool-id-comparison
                raise ValueError('Layer %s does not support RaggedTensors as input. '
                                 'Inputs received: %s. You can try converting your '
                                 'input to an uniform tensor.' % (self.name, inputs))

            graph = backend.get_graph()
            with graph.as_default(), backend.name_scope(self._name_scope()):
                # Build layer if applicable (if the `build` method has been
                # overridden).
                self._maybe_build(inputs)
                cast_inputs = self._maybe_cast_inputs(inputs)

                # Wrapping `call` function in autograph to allow for dynamic control
                # flow and control dependencies in call. We are limiting this to
                # subclassed layers as autograph is strictly needed only for
                # subclassed layers and models.
                # tf_convert will respect the value of autograph setting in the
                # enclosing tf.function, if any.
                if (base_layer_utils.is_subclassed(self) and
                        not base_layer_utils.from_saved_model(self)):
                    call_fn = autograph.tf_convert(
                        self._inference, ag_ctx.control_status_ctx())
                else:
                    call_fn = self._inference

                if not self.dynamic:
                    try:
                        with base_layer_utils.autocast_context_manager(
                                self._compute_dtype):
                            # Add auto_control_deps in V2 when they are not already added by
                            # a `tf.function`.
                            if (ops.executing_eagerly_outside_functions() and
                                    not base_layer_utils.is_in_eager_or_tf_function()):
                                with auto_control_deps.AutomaticControlDependencies() as acd:
                                    outputs = call_fn(cast_inputs, *args, **kwargs)
                                    # Wrap Tensors in `outputs` in `tf.identity` to avoid
                                    # circular dependencies.
                                    outputs = base_layer_utils.mark_as_return(outputs, acd)
                            else:
                                outputs = call_fn(cast_inputs, *args, **kwargs)

                    except errors.OperatorNotAllowedInGraphError as e:
                        raise TypeError('You are attempting to use Python control '
                                        'flow in a layer that was not declared to be '
                                        'dynamic. Pass `dynamic=True` to the class '
                                        'constructor.\nEncountered error:\n"""\n' +
                                        str(e) + '\n"""')
                else:
                    # We will use static shape inference to return symbolic tensors
                    # matching the specifications of the layer outputs.
                    # Since `self.dynamic` is True, we will never attempt to
                    # run the underlying TF graph (which is disconnected).
                    # TODO(fchollet): consider py_func as an alternative, which
                    # would enable us to run the underlying graph if needed.
                    outputs = self._symbolic_call(inputs)

                if outputs is None:
                    raise ValueError('A layer\'s `call` method should return a '
                                     'Tensor or a list of Tensors, not None '
                                     '(layer: ' + self.name + ').')
                if base_layer_utils.have_all_keras_metadata(inputs):
                    # Remove the arguments injected above before `kwargs` is
                    # handed to the connectivity metadata call.
                    if training_arg_passed_by_framework:
                        kwargs.pop('training')
                    if mask_arg_passed_by_framework:
                        kwargs.pop('mask')
                    inputs, outputs = self._set_connectivity_metadata_(
                        inputs, outputs, args, kwargs)
                self._handle_activity_regularization(inputs, outputs)
                self._set_mask_metadata(inputs, outputs, input_masks)
                if hasattr(self, '_set_inputs') and not self.inputs:
                    # Subclassed network: explicitly set metadata normally set by
                    # a call to self._set_inputs().
                    # TODO(b/120997007): This should be done in Eager as well, but
                    # causes garbage collection issues because of the placeholders
                    # created on the default Keras graph.
                    self._set_inputs(inputs, outputs)
        else:
            # Eager execution on data tensors.
            with backend.name_scope(self._name_scope()):
                self._maybe_build(inputs)
                cast_inputs = self._maybe_cast_inputs(inputs)
                with base_layer_utils.autocast_context_manager(
                        self._compute_dtype):
                    outputs = self._inference(cast_inputs, *args, **kwargs)
                self._handle_activity_regularization(inputs, outputs)
                self._set_mask_metadata(inputs, outputs, input_masks)

    return outputs
def _num_elements(losses):
    """Returns the element count of `losses`, cast to the dtype of `losses`."""
    with K.name_scope('num_elements') as scope:
        # The size op is named after the enclosing scope.
        element_count = array_ops.size(losses, name=scope)
        return math_ops.cast(element_count, dtype=losses.dtype)
def _model_loss(model, inputs, targets, sample_weights=None, training=False):
    """Calculates the loss for a given model.

    Arguments:
        model: The model on which metrics are being calculated.
        inputs: List of input arrays. A single-element list is unwrapped
            before being passed to `model.call`.
        targets: List of target arrays (a non-list value is wrapped).
        sample_weights: Optional list of sample weight arrays.
        training: Whether the model should be run in inference or training
            mode. Only forwarded when `model._expects_training_arg` is true.

    Returns:
        A tuple `(outs, total_loss, loss_metrics)`: the model outputs as a
        list, the total loss, and the per-output mean losses (populated only
        when the model has more than one output). The total loss includes
        regularization losses and applies masking and sample weighting to
        the loss value.
    """
    total_loss = 0

    model_inputs = inputs[0] if len(inputs) == 1 else inputs
    if model._expects_training_arg:
        outs = model.call(model_inputs, training=training)
    else:
        outs = model.call(model_inputs)

    if not isinstance(outs, list):
        outs = [outs]
    if not isinstance(targets, list):
        targets = [targets]

    loss_metrics = []
    with backend.name_scope('loss'):
        for i, loss_fn in enumerate(model.loss_functions):
            weights = sample_weights[i] if sample_weights else None

            # TODO(fchollet): support masking; in practice `_keras_mask` is never
            # set in this context currently.
            mask = outs[i]._keras_mask

            weighted_masked_fn = training_utils.weighted_masked_objective(
                loss_fn)
            with backend.name_scope(model.output_names[i] + '_loss'):
                output_loss = weighted_masked_fn(
                    targets[i], outs[i], weights, mask=mask)

            # If the number of outputs is 1 then we don't append the loss metric
            # associated with each model output. When there are multiple outputs
            # associated with a model, each output's loss is calculated and
            # returned as part of the loss_metrics.
            if len(model.outputs) > 1:
                loss_metrics.append(backend.mean(output_loss))

            # `total_loss` starts at 0, so it can always be accumulated into.
            total_loss += model.loss_weights_list[i] * output_loss

        total_loss = backend.mean(total_loss)

        # Add regularization losses
        custom_losses = []
        for layer in model.layers:
            if layer.losses:
                custom_losses += layer.losses

        if custom_losses:
            total_loss += sum(custom_losses)

    return outs, total_loss, loss_metrics
def _model_loss(model,
                inputs,
                targets,
                output_loss_metrics=None,
                sample_weights=None,
                training=False):
    """Calculates the loss for a given model.

    Arguments:
        model: The model on which metrics are being calculated.
        inputs: Either a dictionary of inputs to the model or a list of input
            arrays.
        targets: List of target arrays.
        output_loss_metrics: List of metrics that are used to aggregated
            output loss values.
        sample_weights: Optional list of sample weight arrays.
        training: Whether the model should be run in inference or training
            mode.

    Returns:
        A tuple `(outs, total_loss, output_losses, aggregated_output_losses,
        masks)`: the flattened model outputs, the total loss, the stateless
        per-output losses, the stateful (metric-aggregated) per-output
        losses, and the mask of each output. The two per-output lists are
        only populated for models with more than one output. The total loss
        includes regularization losses and applies masking and sample
        weighting to the loss value.
    """
    # Used to keep track of the total loss value (stateless).
    # eg., total_loss = loss_weight_1 * output_1_loss_fn(...) +
    #                   loss_weight_2 * output_2_loss_fn(...) +
    #                   layer losses.
    total_loss = 0
    kwargs = {}
    if model._expects_training_arg:
        kwargs['training'] = training
    if len(inputs) == 1 and not isinstance(inputs, dict):
        inputs = inputs[0]

    # Allow mixed `NumPy` and `EagerTensor` input here.
    if any(
            isinstance(input_t, (np.ndarray, float, int))
            for input_t in nest.flatten(inputs)):
        inputs = nest.map_structure(ops.convert_to_tensor, inputs)

    outs = model(inputs, **kwargs)

    outs = nest.flatten(outs)
    # `None` by default for `EagerTensors`.
    masks = [t._keras_mask for t in outs]
    targets = nest.flatten(targets)

    # Used to keep track of individual output losses (stateless).
    output_losses = []
    # Used to keep track of individual output losses (stateful).
    aggregated_output_losses = []

    with backend.name_scope('loss'):
        for i, loss_fn in enumerate(model.loss_functions):
            weights = sample_weights[i] if sample_weights else None
            mask = masks[i]
            with backend.name_scope(model.output_names[i] + '_loss'):
                if mask is not None:
                    mask = math_ops.cast(mask, outs[i].dtype)
                    # Update weights with mask.
                    if weights is None:
                        weights = mask
                    else:
                        # Update dimensions of weights to match with mask if
                        # possible.
                        mask, _, weights = (
                            losses_utils.squeeze_or_expand_dimensions(
                                mask, None, weights))
                        weights *= mask

                # Reset reduction on the loss so that we can get the per sample
                # loss value. We use this to get both the stateless and stateful
                # loss values without having to compute the underlying loss
                # function twice.
                weighted_losses = None
                if hasattr(loss_fn, 'reduction'):
                    current_loss_reduction = loss_fn.reduction
                    loss_fn.reduction = losses_utils.ReductionV2.NONE
                    try:
                        weighted_losses = loss_fn(
                            targets[i], outs[i], sample_weight=weights)
                    finally:
                        # Restore the user's reduction even when the loss call
                        # raises, so the loss object is never left in `NONE`.
                        loss_fn.reduction = current_loss_reduction

                    # Compute the stateless loss value.
                    output_loss = losses_utils.reduce_weighted_loss(
                        weighted_losses)
                else:
                    # Compute the stateless loss value for a custom loss class.
                    # Here we assume that the class takes care of loss reduction
                    # because if this class returns a vector value we cannot
                    # differentiate between use case where a custom optimizer
                    # expects a vector loss value vs unreduced per-sample loss
                    # value.
                    output_loss = loss_fn(
                        targets[i], outs[i], sample_weight=weights)

            # If the number of outputs is 1 then we don't append the loss metric
            # associated with each model output. When there are multiple outputs
            # associated with a model, each output's loss is calculated and
            # returned as part of the loss_metrics.
            if len(model.outputs) > 1:
                output_losses.append(backend.mean(output_loss))

                if output_loss_metrics is not None:
                    # Compute the stateful loss value.
                    if weighted_losses is not None:
                        aggregated_output_loss = output_loss_metrics[i](
                            weighted_losses)
                    else:
                        # Custom loss class.
                        aggregated_output_loss = (
                            training_utils.call_metric_function(
                                output_loss_metrics[i],
                                targets[i],
                                outs[i],
                                weights=weights))
                    # Keep track of the stateful output loss result.
                    aggregated_output_losses.append(aggregated_output_loss)

            # `total_loss` starts at 0, so it can always be accumulated into.
            total_loss += model.loss_weights_list[i] * output_loss

        total_loss = backend.mean(total_loss)

        # Add regularization losses
        custom_losses = model.losses
        if custom_losses:
            total_loss += losses_utils.scale_loss_for_distribution(
                math_ops.add_n(custom_losses))

    return outs, total_loss, output_losses, aggregated_output_losses, masks
def _adjust_block(p, ip, filters, block_id=None):
    """Reshapes the `previous path` tensor so it is compatible with `ip`.

    Used in situations where the output number of filters needs to be changed.

    Arguments:
      p: Input tensor which needs to be modified (may be `None`, in which
        case `ip` is returned)
      ip: Input tensor whose shape needs to be matched
      filters: Number of output filters to be matched
      block_id: String block_id

    Returns:
      Adjusted Keras tensor
    """
    channels_first = K.image_data_format() == 'channels_first'
    if channels_first:
        channel_dim, img_dim = 1, 2
    else:
        channel_dim, img_dim = -1, -2

    ip_shape = K.int_shape(ip)
    p_shape = K.int_shape(p) if p is not None else None

    bn_kwargs = dict(axis=channel_dim, momentum=0.9997, epsilon=1e-3)

    with K.name_scope('adjust_block'):
        if p is None:
            return ip

        if p_shape[img_dim] != ip_shape[img_dim]:
            # Spatial size differs: downsample along two paths and
            # concatenate them on the channel axis.
            with K.name_scope('adjust_reduction_block_%s' % block_id):
                p = Activation('relu', name='adjust_relu_1_%s' % block_id)(p)

                pool_1 = AveragePooling2D(
                    (1, 1),
                    strides=(2, 2),
                    padding='valid',
                    name='adjust_avg_pool_1_%s' % block_id)
                branch_1 = pool_1(p)
                conv_1 = Conv2D(
                    filters // 2, (1, 1),
                    padding='same',
                    use_bias=False,
                    name='adjust_conv_1_%s' % block_id,
                    kernel_initializer='he_normal')
                branch_1 = conv_1(branch_1)

                # Second path: pad at the end and crop at the start of each
                # spatial axis before pooling.
                branch_2 = ZeroPadding2D(padding=((0, 1), (0, 1)))(p)
                branch_2 = Cropping2D(cropping=((1, 0), (1, 0)))(branch_2)
                pool_2 = AveragePooling2D(
                    (1, 1),
                    strides=(2, 2),
                    padding='valid',
                    name='adjust_avg_pool_2_%s' % block_id)
                branch_2 = pool_2(branch_2)
                conv_2 = Conv2D(
                    filters // 2, (1, 1),
                    padding='same',
                    use_bias=False,
                    name='adjust_conv_2_%s' % block_id,
                    kernel_initializer='he_normal')
                branch_2 = conv_2(branch_2)

                p = concatenate([branch_1, branch_2], axis=channel_dim)
                p = BatchNormalization(
                    name='adjust_bn_%s' % block_id, **bn_kwargs)(p)

        elif p_shape[channel_dim] != filters:
            # Only the channel count differs: project with a 1x1 convolution.
            with K.name_scope('adjust_projection_block_%s' % block_id):
                p = Activation('relu')(p)
                projection = Conv2D(
                    filters, (1, 1),
                    strides=(1, 1),
                    padding='same',
                    name='adjust_conv_projection_%s' % block_id,
                    use_bias=False,
                    kernel_initializer='he_normal')
                p = projection(p)
                p = BatchNormalization(
                    name='adjust_bn_%s' % block_id, **bn_kwargs)(p)
    return p
def _model_loss(model, inputs, targets, output_loss_metrics=None, sample_weights=None, training=False):
    """Calculates the loss for a given model.

    Arguments:
        model: The model on which metrics are being calculated.
        inputs: Either a dictionary of inputs to the model or a list of input
            arrays.
        targets: List of target arrays.
        output_loss_metrics: List of metrics that are used to aggregated
            output loss values. Indexed per output when the model has more
            than one output.
        sample_weights: Optional list of sample weight arrays.
        training: Whether the model should be run in inference or training
            mode.

    Returns:
        A tuple `(outs, total_loss, output_losses, masks)`: the flattened
        model outputs, the total loss, the metric-aggregated per-output
        losses (populated only for models with more than one output), and
        the mask of each output. The total loss includes regularization
        losses and applies masking and sample weighting to the loss value.
    """
    # TODO(psv): Dedup code here with graph mode prepare_total_loss() fn.

    # Used to keep track of the total loss value (stateless).
    # eg., total_loss = loss_weight_1 * output_1_loss_fn(...) +
    #                   loss_weight_2 * output_2_loss_fn(...) +
    #                   layer losses.
    total_loss = 0
    kwargs = {}
    if model._expects_training_arg:
        kwargs['training'] = training
    if len(inputs) == 1 and not isinstance(inputs, dict):
        inputs = inputs[0]

    # Allow mixed `NumPy` and `EagerTensor` input here.
    if any(
            isinstance(input_t, (np.ndarray, float, int))
            for input_t in nest.flatten(inputs)):
        inputs = nest.map_structure(ops.convert_to_tensor, inputs)

    outs = model(inputs, **kwargs)
    outs = nest.flatten(outs)

    # Outputs without a `_keras_mask` attribute are treated as unmasked.
    masks = [getattr(t, '_keras_mask', None) for t in outs]
    targets = nest.flatten(targets)

    # Used to keep track of individual output losses.
    output_losses = []

    with backend.name_scope('loss'):
        # NOTE(review): `i` below indexes the filtered `loss_fns` list but is
        # also used to index `masks`, `outs`, `targets`, `sample_weights`,
        # `model.output_names` and `model._loss_weights_list`. If an entry of
        # `model.loss_functions` can be `None`, confirm these sequences are
        # filtered the same way by the caller.
        loss_fns = [
            loss_fn for loss_fn in model.loss_functions if loss_fn is not None
        ]
        for i, loss_fn in enumerate(loss_fns):
            weights = sample_weights[i] if sample_weights else None
            mask = masks[i]
            with backend.name_scope(model.output_names[i] + '_loss'):
                if mask is not None:
                    mask = math_ops.cast(mask, outs[i].dtype)
                    # Update weights with mask.
                    if weights is None:
                        weights = mask
                    else:
                        # Update dimensions of weights to match with mask if possible.
                        mask, _, weights = (
                            losses_utils.squeeze_or_expand_dimensions(mask, None, weights))
                        weights *= mask

                weighted_losses = None
                if hasattr(loss_fn, 'reduction'):
                    # Call the unreduced `call` directly and weight the result
                    # with `NONE` reduction; the loss's own reduction is
                    # applied afterwards.
                    per_sample_losses = loss_fn.call(targets[i], outs[i])
                    weighted_losses = losses_utils.compute_weighted_loss(
                        per_sample_losses,
                        sample_weight=weights,
                        reduction=losses_utils.ReductionV2.NONE)
                    loss_reduction = loss_fn.reduction

                    # `AUTO` loss reduction defaults to `SUM_OVER_BATCH_SIZE` for all
                    # compile use cases.
                    if loss_reduction == losses_utils.ReductionV2.AUTO:
                        loss_reduction = losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE

                    # Compute the stateless loss value.
                    output_loss = losses_utils.reduce_weighted_loss(
                        weighted_losses, reduction=loss_reduction)
                    if loss_reduction == losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE:
                        output_loss = losses_utils.scale_loss_for_distribution(output_loss)
                else:
                    # Compute the stateless loss value for a custom loss class.
                    # Here we assume that the class takes care of loss reduction
                    # because if this class returns a vector value we cannot
                    # differentiate between use case where a custom optimizer
                    # expects a vector loss value vs unreduced per-sample loss value.
                    output_loss = loss_fn(targets[i], outs[i], sample_weight=weights)
                    # For custom losses we assume reduction was mean.
                    output_loss = losses_utils.scale_loss_for_distribution(output_loss)

            # If the number of outputs is 1 then we don't append the loss metric
            # associated with each model output. When there are multiple outputs
            # associated with a model, each output's loss is calculated and returned
            # as part of the loss_metrics.
            if len(model.outputs) > 1:
                # Keep track of the stateful output loss result.
                # NOTE(review): `output_loss_metrics` defaults to `None` and is
                # subscripted here without a check — confirm callers always
                # supply it for multi-output models.
                output_losses.append(output_loss_metrics[i](output_loss))

            total_loss += model._loss_weights_list[i] * output_loss

        # Add regularization losses
        custom_losses = model.losses
        if custom_losses:
            total_loss += losses_utils.scale_loss_for_distribution(
                math_ops.add_n(custom_losses))

    return outs, total_loss, output_losses, masks
def _num_elements(losses):
  """Returns the element count of `losses`, cast to the dtype of `losses`."""
  with K.name_scope('num_elements') as scope:
    # The size op is named after the enclosing scope.
    element_count = array_ops.size(losses, name=scope)
    return math_ops.cast(element_count, dtype=losses.dtype)
def _calculate_mean_and_var(self, x, axes, keep_dims):
  """Computes the mean and biased variance of `x` over `axes`.

  Inside a replica context the per-replica sums are combined with
  `all_reduce`, so the statistics cover all replicas; otherwise the moments
  are computed locally.

  Args:
    x: Input tensor. float16 inputs are processed in float32 and the results
      are cast back to float16.
    axes: List of integer axes to reduce over. In the replica path `axes[0]`
      is treated as the batch axis whose size is summed across replicas.
    keep_dims: If falsy, the reduced axes are squeezed out of the results.

  Returns:
    A `(mean, variance)` tuple.
  """
  with K.name_scope('moments'):
    # The dynamic range of fp16 is too limited to support the collection of
    # sufficient statistics. As a workaround we simply perform the operations
    # on 32-bit floats before converting the mean and variance back to fp16.
    y = math_ops.cast(x, dtypes.float32) if x.dtype == dtypes.float16 else x
    replica_ctx = ds.get_replica_context()
    if replica_ctx:
      # Per-replica partial sums.
      local_sum = math_ops.reduce_sum(y, axis=axes, keepdims=True)
      local_squared_sum = math_ops.reduce_sum(
          math_ops.square(y), axis=axes, keepdims=True)
      y_shape = array_ops.shape_v2(y)
      batch_size = math_ops.cast(y_shape[axes[0]], dtypes.float32)
      # TODO(b/163099951): batch the all-reduces once we sort out the ordering
      # issue for NCCL. We don't have a mechanism to launch NCCL in the same
      # order in each replica nowadays, so we limit NCCL to batch all-reduces.
      y_sum = replica_ctx.all_reduce(reduce_util.ReduceOp.SUM, local_sum)
      y_squared_sum = replica_ctx.all_reduce(
          reduce_util.ReduceOp.SUM, local_squared_sum)
      global_batch_size = replica_ctx.all_reduce(
          reduce_util.ReduceOp.SUM, batch_size)

      # Number of elements averaged per batch entry. The sizes must be looked
      # up through `axes`: indexing the shape with the loop counter is only
      # right when the reduction axes are exactly [0, 1, ..., k].
      axes_vals = [y_shape[axes[i]] for i in range(1, len(axes))]
      multiplier = math_ops.cast(
          math_ops.reduce_prod(axes_vals), dtypes.float32)
      multiplier = multiplier * global_batch_size

      mean = y_sum / multiplier
      y_squared_mean = y_squared_sum / multiplier
      # var = E(x^2) - E(x)^2
      variance = y_squared_mean - math_ops.square(mean)
    else:
      # Compute true mean while keeping the dims for proper broadcasting.
      mean = math_ops.reduce_mean(y, axes, keepdims=True, name='mean')
      # Sample variance, not unbiased variance.
      variance = math_ops.reduce_mean(
          math_ops.squared_difference(y, mean),
          axes,
          keepdims=True,
          name='variance')
    if not keep_dims:
      mean = array_ops.squeeze(mean, axes)
      variance = array_ops.squeeze(variance, axes)
    if x.dtype == dtypes.float16:
      return (math_ops.cast(mean, dtypes.float16),
              math_ops.cast(variance, dtypes.float16))
    else:
      return (mean, variance)
def _assign_subdiv_new_value(self, variable, value, subdivisions, count):
  """Assigns `value` to `variable` only when `(count + 1) % subdivisions == 0`.

  For any other `count` the variable is re-assigned its own current value.
  Returns the assign op, named after the enclosing name scope.
  """
  with K.name_scope('AssignNewValue') as scope, ops.colocate_with(variable):
    is_update_step = (count + 1) % subdivisions == 0
    selected = array_ops.where(is_update_step, value, variable)
    return state_ops.assign(variable, selected, name=scope)
def _assign_latent_avg(self, variable, value):
  """Assigns `value` to `variable`, colocated with it, under `latent_avg`."""
  with K.name_scope('latent_avg') as scope, ops.colocate_with(variable):
    assign_op = state_ops.assign(variable, value, name=scope)
    return assign_op
def _model_loss(model, inputs, targets, sample_weights=None, training=False):
  """Calculates the loss for a given model.

  Arguments:
      model: The model on which metrics are being calculated.
      inputs: Either a dictionary of inputs to the model or a list of input
        arrays. A single-element non-dict input is unwrapped before the call.
      targets: List of target arrays.
      sample_weights: Optional list of sample weight arrays.
      training: Whether the model should be run in inference or training mode.
        Only forwarded when `model._expects_training_arg` is truthy.

  Returns:
      A tuple `(outs, total_loss, loss_metrics, masks)`: the model outputs,
      the total loss (mean of the loss-weighted per-output losses plus the
      layers' regularization losses), the per-output mean losses (populated
      only when the model has more than one output) and the mask of each
      output. Masking and sample weighting are applied to each output loss.
  """
  kwargs = {}
  if model._expects_training_arg:
    kwargs['training'] = training
  if len(inputs) == 1 and not isinstance(inputs, dict):
    inputs = inputs[0]

  if model._compute_output_and_mask_jointly:
    outs, masks = model._call_and_compute_mask(inputs, **kwargs)
    masks = generic_utils.to_list(masks)
  else:
    outs = model.call(inputs, **kwargs)
    masks = None

  outs = generic_utils.to_list(outs)
  if masks is None:
    masks = [None for _ in outs]
  targets = generic_utils.to_list(targets)

  # The accumulator starts at 0, so no "first iteration" special case is
  # needed when summing the per-output losses.
  total_loss = 0
  loss_metrics = []
  with backend.name_scope('loss'):
    for i, loss_fn in enumerate(model.loss_functions):
      weights = sample_weights[i] if sample_weights else None
      mask = masks[i]

      weighted_masked_fn = training_utils.weighted_masked_objective(loss_fn)
      with backend.name_scope(model.output_names[i] + '_loss'):
        output_loss = weighted_masked_fn(
            targets[i], outs[i], weights, mask=mask)
      # If the number of outputs is 1 then we don't append the loss metric
      # associated with each model output. When there are multiple outputs
      # associated with a model, each output's loss is calculated and returned
      # as part of the loss_metrics.
      if len(model.outputs) > 1:
        loss_metrics.append(backend.mean(output_loss))

      total_loss += model.loss_weights_list[i] * output_loss

    total_loss = backend.mean(total_loss)
    # Add regularization losses
    custom_losses = []
    for layer in model.layers:
      if layer.losses:
        custom_losses += layer.losses

    if custom_losses:
      total_loss += sum(custom_losses)

  return outs, total_loss, loss_metrics, masks
def apply_gradients(self,
                    grads_and_vars,
                    name=None,
                    experimental_aggregate_gradients=True):
  """Apply gradients to variables.

  This is the second part of `minimize()`. It returns an `Operation` that
  applies gradients.

  By default the gradients of all replicas are summed when a
  `tf.distribute.Strategy` is in use. Pass
  `experimental_aggregate_gradients=False` to aggregate them yourself.

  Example:

  ```python
  grads = tape.gradient(loss, vars)
  grads = tf.distribute.get_replica_context().all_reduce('sum', grads)
  # Processing aggregated gradients.
  optimizer.apply_gradients(zip(grads, vars),
      experimental_aggregate_gradients=False)

  ```

  Args:
    grads_and_vars: List of (gradient, variable) pairs.
    name: Optional name for the returned operation. Default to the name
      passed to the `Optimizer` constructor.
    experimental_aggregate_gradients: Whether to sum gradients from different
      replicas in the presence of `tf.distribute.Strategy`. If False, it is
      the user's responsibility to aggregate the gradients. Default to True.

  Returns:
    An `Operation` that applies the specified gradients. The `iterations`
    will be automatically increased by 1.

  Raises:
    TypeError: If `grads_and_vars` is malformed.
    ValueError: If none of the variables have gradients.
    RuntimeError: If called in a cross-replica context.
    NotImplementedError: If `experimental_aggregate_gradients` is False while
      the strategy's `extended` is a `ParameterServerStrategyExtended`.
  """
  grads_and_vars = _filter_grads(grads_and_vars)
  var_list = [v for (_, v) in grads_and_vars]

  with backend.name_scope(self._name):
    # Create iteration if necessary.
    with ops.init_scope():
      _ = self.iterations
      self._create_hypers()
      self._create_slots(var_list)

    if not grads_and_vars:
      # Distribution strategy does not support reducing an empty list of
      # gradients
      return control_flow_ops.no_op()

    if distribute_ctx.in_cross_replica_context():
      raise RuntimeError(
          "`apply_gradients() cannot be called in cross-replica context. "
          "Use `tf.distribute.Strategy.run` to enter replica "
          "context.")

    strategy = distribute_ctx.get_strategy()
    if not experimental_aggregate_gradients and strategy:
      # Manual aggregation is rejected for parameter-server style strategies.
      if isinstance(
          strategy.extended,
          parameter_server_strategy.ParameterServerStrategyExtended):
        raise NotImplementedError(
            "`experimental_aggregate_gradients=False is not supported for "
            "ParameterServerStrategy and CentralStorageStrategy")

    apply_state = self._prepare(var_list)
    if experimental_aggregate_gradients:
      aggregated = self._aggregate_gradients(grads_and_vars)
      var_list = [v for _, v in grads_and_vars]
      grads_and_vars = list(zip(aggregated, var_list))

    apply_fn = functools.partial(
        self._distributed_apply, apply_state=apply_state)
    replica_context = distribute_ctx.get_replica_context()
    return replica_context.merge_call(
        apply_fn, args=(grads_and_vars,), kwargs={"name": name})
def _reduction_a_cell(ip, p, filters, block_id=None):
  """Adds a Reduction cell for NASNet-A (Fig. 4 in the paper).

  Arguments:
      ip: Input tensor `x`
      p: Input tensor `p`; it is first passed through `_adjust_block` together
        with `ip`.
      filters: Number of output filters
      block_id: String block_id, interpolated into every layer name.

  Returns:
      A tuple `(x, ip)`: `x` is the concatenation of the block outputs
      `x2`, `x3`, `x4` and `x5` along the channel axis, and `ip` is the
      unmodified input tensor.
  """
  # Channel axis depends on the backend image data format.
  channel_dim = 1 if K.image_data_format() == 'channels_first' else -1

  with K.name_scope('reduction_A_block_%s' % block_id):
    p = _adjust_block(p, ip, filters, block_id)

    # ReLU -> 1x1 conv -> batch norm applied to the current input.
    h = Activation('relu')(ip)
    h = Conv2D(
        filters, (1, 1),
        strides=(1, 1),
        padding='same',
        name='reduction_conv_1_%s' % block_id,
        use_bias=False,
        kernel_initializer='he_normal')(
            h)
    h = BatchNormalization(
        axis=channel_dim,
        momentum=0.9997,
        epsilon=1e-3,
        name='reduction_bn_1_%s' % block_id)(
            h)

    with K.name_scope('block_1'):
      x1_1 = _separable_conv_block(
          h,
          filters, (5, 5),
          strides=(2, 2),
          block_id='reduction_left1_%s' % block_id)
      x1_2 = _separable_conv_block(
          p,
          filters, (7, 7),
          strides=(2, 2),
          block_id='reduction_1_%s' % block_id)
      # x1 only feeds blocks 4 and 5; it is not part of the final concat.
      x1 = add([x1_1, x1_2], name='reduction_add_1_%s' % block_id)

    with K.name_scope('block_2'):
      x2_1 = MaxPooling2D(
          (3, 3),
          strides=(2, 2),
          padding='same',
          name='reduction_left2_%s' % block_id)(
              h)
      x2_2 = _separable_conv_block(
          p,
          filters, (7, 7),
          strides=(2, 2),
          block_id='reduction_right2_%s' % block_id)
      x2 = add([x2_1, x2_2], name='reduction_add_2_%s' % block_id)

    with K.name_scope('block_3'):
      x3_1 = AveragePooling2D(
          (3, 3),
          strides=(2, 2),
          padding='same',
          name='reduction_left3_%s' % block_id)(
              h)
      x3_2 = _separable_conv_block(
          p,
          filters, (5, 5),
          strides=(2, 2),
          block_id='reduction_right3_%s' % block_id)
      x3 = add([x3_1, x3_2], name='reduction_add3_%s' % block_id)

    with K.name_scope('block_4'):
      # Stride-1 pooling of x1, then summed with x2. This `add` carries no
      # explicit name.
      x4 = AveragePooling2D(
          (3, 3),
          strides=(1, 1),
          padding='same',
          name='reduction_left4_%s' % block_id)(
              x1)
      x4 = add([x2, x4])

    with K.name_scope('block_5'):
      # NOTE(review): 'reduction_left4_%s' is also the name of the pooling
      # layer in block_4 -- presumably intentional for naming compatibility;
      # confirm before changing.
      x5_1 = _separable_conv_block(
          x1, filters, (3, 3), block_id='reduction_left4_%s' % block_id)
      x5_2 = MaxPooling2D(
          (3, 3),
          strides=(2, 2),
          padding='same',
          name='reduction_right5_%s' % block_id)(
              h)
      x5 = add([x5_1, x5_2], name='reduction_add4_%s' % block_id)

    x = concatenate(
        [x2, x3, x4, x5],
        axis=channel_dim,
        name='reduction_concat_%s' % block_id)
    return x, ip
def remove_squeezable_dimensions(labels,
                                 predictions,
                                 expected_rank_diff=0,
                                 name=None):
  """Squeeze last dim if ranks differ from expected by exactly 1.

  In the common case where we expect shapes to match, `expected_rank_diff`
  defaults to 0, and we squeeze the last dimension of the larger rank if they
  differ by 1.

  But, for example, if `labels` contains class IDs and `predictions` contains 1
  probability per class, we expect `predictions` to have 1 more dimension than
  `labels`, so `expected_rank_diff` would be 1. In this case, we'd squeeze
  `labels` if `rank(predictions) - rank(labels) == 0`, and
  `predictions` if `rank(predictions) - rank(labels) == 2`.

  This will use static shape if available. Otherwise, it will add graph
  operations, which could result in a performance hit.

  A tensor whose static rank is 0 has no last dimension to squeeze and is
  always returned unchanged.

  Args:
    labels: Label values, a `Tensor` whose dimensions match `predictions`.
    predictions: Predicted values, a `Tensor` of arbitrary dimensions.
    expected_rank_diff: Expected result of `rank(predictions) - rank(labels)`.
    name: Name of the op.

  Returns:
    Tuple of `labels` and `predictions`, possibly with last dim squeezed.
  """
  with backend.name_scope(name or 'remove_squeezable_dimensions'):
    if not isinstance(predictions, ragged_tensor.RaggedTensor):
      predictions = ops.convert_to_tensor_v2_with_dispatch(predictions)
    if not isinstance(labels, ragged_tensor.RaggedTensor):
      labels = ops.convert_to_tensor_v2_with_dispatch(labels)
    predictions_shape = predictions.shape
    predictions_rank = predictions_shape.ndims
    labels_shape = labels.shape
    labels_rank = labels_shape.ndims
    if (labels_rank is not None) and (predictions_rank is not None):
      # Use static rank. The `rank > 0` guards keep `dims[-1]` from raising
      # IndexError on scalars (e.g. scalar labels and predictions with
      # `expected_rank_diff=1`).
      rank_diff = predictions_rank - labels_rank
      if (rank_diff == expected_rank_diff + 1 and predictions_rank > 0 and
          predictions_shape.dims[-1].is_compatible_with(1)):
        predictions = array_ops.squeeze(predictions, [-1])
      elif (rank_diff == expected_rank_diff - 1 and labels_rank > 0 and
            labels_shape.dims[-1].is_compatible_with(1)):
        labels = array_ops.squeeze(labels, [-1])
      return labels, predictions

    # Use dynamic rank.
    rank_diff = array_ops.rank(predictions) - array_ops.rank(labels)
    if (predictions_rank is None) or (
        predictions_rank > 0 and
        predictions_shape.dims[-1].is_compatible_with(1)):
      predictions = control_flow_ops.cond(
          math_ops.equal(expected_rank_diff + 1, rank_diff),
          lambda: array_ops.squeeze(predictions, [-1]),
          lambda: predictions)
    if (labels_rank is None) or (
        labels_rank > 0 and labels_shape.dims[-1].is_compatible_with(1)):
      labels = control_flow_ops.cond(
          math_ops.equal(expected_rank_diff - 1, rank_diff),
          lambda: array_ops.squeeze(labels, [-1]),
          lambda: labels)
    return labels, predictions
def _model_loss(model,
                inputs,
                targets,
                output_loss_metrics=None,
                sample_weights=None,
                training=False):
  """Calculates the loss for a given model.

  Arguments:
      model: The model on which metrics are being calculated.
      inputs: Either a dictionary of inputs to the model or a list of input
        arrays. A single-element non-dict input is unwrapped before the call.
      targets: List of target arrays.
      output_loss_metrics: List of metrics that are used to aggregated output
        loss values.
      sample_weights: Optional list of sample weight arrays.
      training: Whether the model should be run in inference or training mode.
        Only forwarded when `model._expects_training_arg` is truthy.

  Returns:
      A tuple `(outs, total_loss, loss_metrics, aggregated_loss_metrics,
      masks)`: the model outputs, the total loss (mean of the loss-weighted
      per-output losses plus `model.losses`), the per-output mean losses and
      the results of `output_loss_metrics` (both populated only when the model
      has more than one output), and the mask of each output. Masking and
      sample weighting are applied to each output loss.
  """
  kwargs = {}
  if model._expects_training_arg:
    kwargs['training'] = training
  if len(inputs) == 1 and not isinstance(inputs, dict):
    inputs = inputs[0]

  if model._compute_output_and_mask_jointly:
    outs, masks = model._call_and_compute_mask(inputs, **kwargs)
    masks = nest.flatten(masks)
  else:
    outs = model.call(inputs, **kwargs)
    masks = None

  outs = nest.flatten(outs)
  if masks is None:
    masks = [None for _ in outs]
  targets = nest.flatten(targets)

  # The accumulator starts at 0, so no "first iteration" special case is
  # needed when summing the per-output losses.
  total_loss = 0
  loss_metrics = []
  aggregated_loss_metrics = []
  with backend.name_scope('loss'):
    for i, loss_fn in enumerate(model.loss_functions):
      weights = sample_weights[i] if sample_weights else None
      mask = masks[i]
      with backend.name_scope(model.output_names[i] + '_loss'):
        if isinstance(loss_fn, losses_module.Loss):
          if mask is not None:
            mask = math_ops.cast(mask, outs[i].dtype)
            # Update weights with mask.
            if weights is None:
              weights = mask
            else:
              # Update dimensions of weights to match with mask if possible.
              mask, _, weights = squeeze_or_expand_dimensions(
                  mask, None, weights)
              # Out-of-place multiply: never write through to the
              # caller-owned `sample_weights` entry.
              weights = weights * mask
          output_loss = loss_fn(targets[i], outs[i], sample_weight=weights)
        else:
          weighted_masked_fn = training_utils.weighted_masked_objective(loss_fn)
          output_loss = weighted_masked_fn(
              targets[i], outs[i], weights, mask=mask)

      # If the number of outputs is 1 then we don't append the loss metric
      # associated with each model output. When there are multiple outputs
      # associated with a model, each output's loss is calculated and returned
      # as part of the loss_metrics.
      if len(model.outputs) > 1:
        loss_metrics.append(backend.mean(output_loss))
        if output_loss_metrics is not None:
          # Keep track of the stateful loss result.
          aggregated_loss_metrics.append(
              training_utils.call_metric_function(
                  output_loss_metrics[i],
                  targets[i],
                  outs[i],
                  weights=weights,
                  mask=mask))

      total_loss += model.loss_weights_list[i] * output_loss

    total_loss = backend.mean(total_loss)
    # Add regularization losses
    custom_losses = model.losses
    if custom_losses:
      total_loss += math_ops.add_n(custom_losses)
    model._clear_losses()

  return outs, total_loss, loss_metrics, aggregated_loss_metrics, masks
def _eager_loss_fn(outputs, targets, loss_fn, output_name):
  """Evaluates `loss_fn(targets, outputs)` under the `<output_name>_loss` scope."""
  scope_name = output_name + '_loss'
  with backend.name_scope(scope_name):
    # Note the argument order: targets first, then model outputs.
    return loss_fn(targets, outputs)
def _model_loss(model, inputs, targets, sample_weights=None, training=False):
  """Calculates the loss for a given model.

  Arguments:
      model: The model on which metrics are being calculated.
      inputs: List of input arrays. A single-element list is unwrapped before
        the model is called.
      targets: List of target arrays (a non-list value is wrapped in a list).
      sample_weights: Optional list of sample weight arrays.
      training: Whether the model should be run in inference or training mode.
        Only forwarded when `model._expects_training_arg` is truthy.

  Returns:
      A tuple `(outs, total_loss, loss_metrics)`: the model outputs, the total
      loss (mean of the loss-weighted per-output losses plus the layers'
      regularization losses) and the per-output mean losses (populated only
      when the model has more than one output). Masking and sample weighting
      are applied to each output loss.
  """
  model_inputs = inputs[0] if len(inputs) == 1 else inputs
  if model._expects_training_arg:
    outs = model.call(model_inputs, training=training)
  else:
    outs = model.call(model_inputs)
  if not isinstance(outs, list):
    outs = [outs]

  if not isinstance(targets, list):
    targets = [targets]

  # The accumulator starts at 0, so no "first iteration" special case is
  # needed when summing the per-output losses.
  total_loss = 0
  loss_metrics = []
  with backend.name_scope('loss'):
    for i, loss_fn in enumerate(model.loss_functions):
      weights = sample_weights[i] if sample_weights else None

      # TODO(fchollet): support masking; in practice `_keras_mask` is never
      # set in this context currently. An output without the attribute is
      # treated as unmasked instead of raising AttributeError.
      mask = getattr(outs[i], '_keras_mask', None)

      weighted_masked_fn = training_utils.weighted_masked_objective(loss_fn)
      with backend.name_scope(model.output_names[i] + '_loss'):
        output_loss = weighted_masked_fn(
            targets[i], outs[i], weights, mask=mask)
      # If the number of outputs is 1 then we don't append the loss metric
      # associated with each model output. When there are multiple outputs
      # associated with a model, each output's loss is calculated and returned
      # as part of the loss_metrics.
      if len(model.outputs) > 1:
        loss_metrics.append(backend.mean(output_loss))

      total_loss += model.loss_weights_list[i] * output_loss

    total_loss = backend.mean(total_loss)
    # Add regularization losses
    custom_losses = []
    for layer in model.layers:
      if layer.losses:
        custom_losses += layer.losses

    if custom_losses:
      total_loss += sum(custom_losses)

  return outs, total_loss, loss_metrics
def _assign_new_value(self, variable, value):
  """Assigns `value` to `variable`, colocated with it, under `AssignNewValue`."""
  with K.name_scope('AssignNewValue') as scope, ops.colocate_with(variable):
    assign_op = state_ops.assign(variable, value, name=scope)
    return assign_op
def _distributed_apply(distribution, grads_and_vars, name, apply_state):
  """`apply_gradients` using a `DistributionStrategy`.

  Sum-reduces the gradients with `distribution.extended.batch_reduce_to`,
  applies each reduced gradient to its variable via
  `distribution.extended.update`, and finally increments `self._iterations`.

  NOTE(review): `self` is not a parameter of this function; presumably it is
  bound from an enclosing scope -- confirm against the surrounding file.

  Args:
    distribution: Object whose `extended` attribute provides
      `batch_reduce_to`, `colocate_vars_with` and `update`.
    grads_and_vars: Iterable of (gradient, variable) pairs. It is iterated
      more than once before being rebound, so it must be re-iterable.
    name: Name scope for the update ops; falls back to `self._name`.
    apply_state: Value forwarded as the `apply_state` keyword to the
      dense/sparse apply methods when they declare that argument.

  Returns:
    In graph mode, or when any update op is symbolic, the `.op` of the
    iteration increment (created under control dependencies on all update
    ops); otherwise the result of `self._iterations.assign_add(1)`.
  """
  reduced_grads = distribution.extended.batch_reduce_to(
      ds_reduce_util.ReduceOp.SUM, grads_and_vars)
  var_list = [v for _, v in grads_and_vars]
  # One-shot iterator: consumed exactly once by the update loop below.
  grads_and_vars = zip(reduced_grads, var_list)

  def apply_grad_to_update_var(var, grad):
    """Apply gradient to variable."""
    if isinstance(var, ops.Tensor):
      raise NotImplementedError("Trying to update a Tensor ", var)

    apply_kwargs = {}
    if not isinstance(var, de.TrainableWrapper):
      # Any variable that is not a `de.TrainableWrapper`.
      if isinstance(grad, ops.IndexedSlices):
        if var.constraint is not None:
          raise RuntimeError(
              "Cannot use a constraint function on a sparse variable.")
        if "apply_state" in self._sparse_apply_args:
          apply_kwargs["apply_state"] = apply_state
        return self._resource_apply_sparse_duplicate_indices(
            grad.values, var, grad.indices, **apply_kwargs)

      if "apply_state" in self._dense_apply_args:
        apply_kwargs["apply_state"] = apply_state
      update_op = self._resource_apply_dense(grad, var, **apply_kwargs)
      if var.constraint is not None:
        # Re-project the variable through its constraint after the update.
        with ops.control_dependencies([update_op]):
          return var.assign(var.constraint(var))
      else:
        return update_op
    else:
      # `de.TrainableWrapper` path: read the variable and its slots before
      # applying, then run their `update_op()`s after the apply op.
      with ops.colocate_with(None, ignore_existing=True):
        _slots = [self.get_slot(var, _s) for _s in self.get_slot_names()]
        # Add the optimizer slots to restricting list.
        if var.params.restrict_policy is not None:
          var.params.restrict_policy._track_optimizer_slots(_slots)

        with ops.control_dependencies([grad]):
          _before = [var.read_value()] + [_s.read_value() for _s in _slots]
        if isinstance(grad, ops.IndexedSlices):
          if var.constraint is not None:
            raise RuntimeError(
                "Cannot use a constraint function on a sparse variable.")
          if "apply_state" in self._sparse_apply_args:
            apply_kwargs["apply_state"] = apply_state
          with ops.control_dependencies(_before):
            _apply_op = self._resource_apply_sparse_duplicate_indices(
                grad.values, var, grad.indices, **apply_kwargs)
          with ops.control_dependencies([_apply_op]):
            _after = control_flow_ops.group([var.update_op()] +
                                            [_s.update_op() for _s in _slots])
            return _after

        if "apply_state" in self._dense_apply_args:
          apply_kwargs["apply_state"] = apply_state
        with ops.control_dependencies(_before):
          update_op = self._resource_apply_dense(grad, var, **apply_kwargs)
        if var.constraint is not None:
          # NOTE(review): unlike the branch below, this path does not group
          # the `update_op()` calls of the variable and slots -- confirm that
          # this is intended.
          with ops.control_dependencies([update_op]):
            return var.assign(var.constraint(var))
        else:
          with ops.control_dependencies([update_op]):
            _after = control_flow_ops.group([var.update_op()] +
                                            [_s.update_op() for _s in _slots])
            return _after

  update_ops = []
  with backend.name_scope(name or self._name):
    for grad, var in grads_and_vars:
      scope_name = ("update" if ops.executing_eagerly_outside_functions() else
                    "update_" + var.op.name)
      # Colocate the update with variables to avoid unnecessary communication
      # delays. See b/136304694.
      with backend.name_scope(
          scope_name), distribution.extended.colocate_vars_with(var):
        update_ops.extend(
            distribution.extended.update(var,
                                         apply_grad_to_update_var,
                                         args=(grad,),
                                         group=False))

    any_symbolic = any(
        isinstance(i, ops.Operation) or tf_utils.is_symbolic_tensor(i)
        for i in update_ops)
    if not context.executing_eagerly() or any_symbolic:
      # If the current context is graph mode or any of the update ops are
      # symbolic then the step update should be carried out under a graph
      # context. (eager updates execute immediately)
      with ops._get_graph_from_inputs(update_ops).as_default():  # pylint: disable=protected-access
        with ops.control_dependencies(update_ops):
          return self._iterations.assign_add(1).op

    return self._iterations.assign_add(1)
def multi_gpu_model(model, gpus, cpu_merge=True, cpu_relocation=False):
  """Replicates a model on different GPUs.

  Specifically, this function implements single-machine
  multi-GPU data parallelism. It works in the following way:

  - Divide the model's input(s) into multiple sub-batches.
  - Apply a model copy on each sub-batch. Every model copy
      is executed on a dedicated GPU.
  - Concatenate the results (on CPU) into one big batch.

  E.g. if your `batch_size` is 64 and you use `gpus=2`,
  then we will divide the input into 2 sub-batches of 32 samples,
  process each sub-batch on one GPU, then return the full
  batch of 64 processed samples.

  This induces quasi-linear speedup on up to 8 GPUs.

  This function is only available with the TensorFlow backend
  for the time being.

  Args:
      model: A Keras model instance. To avoid OOM errors,
          this model could have been built on CPU, for instance
          (see usage example below).
      gpus: Integer >= 2, number of GPUs on which to create model replicas,
          or a list/tuple of at least two integer GPU ids.
      cpu_merge: A boolean value to identify whether to force
          merging model weights under the scope of the CPU or not.
      cpu_relocation: A boolean value to identify whether to
          create the model's weights under the scope of the CPU.
          If the model is not defined under any preceding device
          scope, you can still rescue it by activating this option.

  Returns:
      A Keras `Model` instance which can be used just like the initial
      `model` argument, but which distributes its workload on multiple GPUs.

  Example 1: Training models with weights merge on CPU

  ```python
      import tensorflow as tf
      from keras.applications import Xception
      from keras.utils import multi_gpu_model
      import numpy as np

      num_samples = 1000
      height = 224
      width = 224
      num_classes = 1000

      # Instantiate the base model (or "template" model).
      # We recommend doing this with under a CPU device scope,
      # so that the model's weights are hosted on CPU memory.
      # Otherwise they may end up hosted on a GPU, which would
      # complicate weight sharing.
      with tf.device('/cpu:0'):
          model = Xception(weights=None,
                           input_shape=(height, width, 3),
                           classes=num_classes)

      # Replicates the model on 8 GPUs.
      # This assumes that your machine has 8 available GPUs.
      parallel_model = multi_gpu_model(model, gpus=8)
      parallel_model.compile(loss='categorical_crossentropy',
                             optimizer='rmsprop')

      # Generate dummy data.
      x = np.random.random((num_samples, height, width, 3))
      y = np.random.random((num_samples, num_classes))

      # This `fit` call will be distributed on 8 GPUs.
      # Since the batch size is 256, each GPU will process 32 samples.
      parallel_model.fit(x, y, epochs=20, batch_size=256)

      # Save model via the template model (which shares the same weights):
      model.save('my_model.h5')
  ```

  Example 2: Training models with weights merge on CPU using cpu_relocation

  ```python
       ..
       # Not needed to change the device scope for model definition:
       model = Xception(weights=None, ..)

       try:
           model = multi_gpu_model(model, gpus=2, cpu_relocation=True)
           print("Training using multiple GPUs..")
       except ValueError:
           print("Training using single GPU or CPU..")

       model.compile(..)
       ..
  ```

  Example 3: Training models with weights merge on GPU (recommended for NV-link)

  ```python
       ..
       # Not needed to change the device scope for model definition:
       model = Xception(weights=None, ..)

       try:
           model = multi_gpu_model(model, gpus=2, cpu_merge=False)
           print("Training using multiple GPUs..")
       except ValueError:
           print("Training using single GPU or CPU..")

       model.compile(..)
       ..
  ```

  Raises:
    ValueError: if `gpus` designates fewer than two GPUs, or if the `gpus`
      argument does not match available devices.
  """
  if isinstance(gpus, (list, tuple)):
    if len(gpus) <= 1:
      # `gpus` may itself be a tuple, so wrap it in a 1-tuple: a bare
      # `% gpus` would unpack it as the format arguments.
      raise ValueError('For multi-gpu usage to be effective, '
                       'call `multi_gpu_model` with `len(gpus) >= 2`. '
                       'Received: `gpus=%s`' % (gpus,))
    num_gpus = len(gpus)
    target_gpu_ids = gpus
  else:
    if gpus <= 1:
      raise ValueError('For multi-gpu usage to be effective, '
                       'call `multi_gpu_model` with `gpus >= 2`. '
                       'Received: `gpus=%s`' % (gpus,))
    num_gpus = gpus
    target_gpu_ids = range(num_gpus)

  target_devices = ['/cpu:0'] + ['/gpu:%d' % i for i in target_gpu_ids]
  available_devices = _get_available_devices()
  available_devices = [
      _normalize_device_name(name) for name in available_devices
  ]
  for device in target_devices:
    if device not in available_devices:
      raise ValueError('To call `multi_gpu_model` with `gpus=%s`, '
                       'we expect the following devices to be available: %s. '
                       'However this machine only has: %s. '
                       'Try reducing `gpus`.' % (gpus, target_devices,
                                                 available_devices))

  def get_slice(data, i, parts):
    """Slice an array into `parts` slices and return slice `i`.

    Args:
      data: array to slice.
      i: index of slice to return.
      parts: number of slices to make.

    Returns:
      Slice `i` of `data`.
    """
    shape = array_ops.shape(data)
    batch_size = shape[:1]
    input_shape = shape[1:]
    step = batch_size // parts
    if i == parts - 1:
      # The last slice also takes the remainder of the batch.
      size = batch_size - step * i
    else:
      size = step
    size = array_ops.concat([size, input_shape], axis=0)
    stride = array_ops.concat([step, input_shape * 0], axis=0)
    start = stride * i
    return array_ops.slice(data, start, size)

  # Relocate the model definition under CPU device scope if needed
  if cpu_relocation:
    from tensorflow.python.keras.models import clone_model  # pylint: disable=g-import-not-at-top
    with ops.device('/cpu:0'):
      model = clone_model(model)

  all_outputs = [[] for _ in range(len(model.outputs))]

  # Place a copy of the model on each GPU,
  # each getting a slice of the inputs.
  for i, gpu_id in enumerate(target_gpu_ids):
    with ops.device('/gpu:%d' % gpu_id):
      with backend.name_scope('replica_%d' % gpu_id):
        inputs = []
        # Retrieve a slice of the input.
        for x in model.inputs:
          input_shape = tuple(x.shape.as_list())[1:]
          slice_i = Lambda(
              get_slice,
              output_shape=input_shape,
              arguments={
                  'i': i,
                  'parts': num_gpus
              })(
                  x)
          inputs.append(slice_i)

        # Apply model on slice
        # (creating a model replica on the target device).
        outputs = model(inputs)
        if not isinstance(outputs, list):
          outputs = [outputs]

        # Save the outputs for merging back together later.
        for o, output in enumerate(outputs):
          all_outputs[o].append(output)

  # Deduplicate output names to handle Siamese networks.
  occurrences = {}
  for n in model.output_names:
    if n not in occurrences:
      occurrences[n] = 1
    else:
      occurrences[n] += 1
  conflict_counter = {n: 0 for n, count in occurrences.items() if count > 1}
  output_names = []
  for n in model.output_names:
    if n in conflict_counter:
      conflict_counter[n] += 1
      n += '_%d' % conflict_counter[n]
    output_names.append(n)

  # Merge outputs under expected scope.
  with ops.device('/cpu:0' if cpu_merge else '/gpu:%d' % target_gpu_ids[0]):
    merged = []
    for name, outputs in zip(output_names, all_outputs):
      merged.append(concatenate(outputs, axis=0, name=name))
    return Model(model.inputs, merged)
def _model_loss(model, inputs, targets, sample_weights=None, training=False):
  """Calculates the loss for a given model.

  Arguments:
      model: The model on which metrics are being calculated.
      inputs: Either a dictionary of inputs to the model or a list of input
        arrays. A single-element non-dict input is unwrapped before the call.
      targets: List of target arrays.
      sample_weights: Optional list of sample weight arrays.
      training: Whether the model should be run in inference or training mode.
        Only forwarded when `model._expects_training_arg` is truthy.

  Returns:
      A tuple `(outs, total_loss, loss_metrics, masks)`: the model outputs,
      the total loss (mean of the loss-weighted per-output losses plus the
      layers' regularization losses), the per-output mean losses (populated
      only when the model has more than one output) and the mask of each
      output. Masking and sample weighting are applied to each output loss.
  """
  kwargs = {}
  if model._expects_training_arg:
    kwargs['training'] = training
  if len(inputs) == 1 and not isinstance(inputs, dict):
    inputs = inputs[0]

  if model._compute_output_and_mask_jointly:
    outs, masks = model._call_and_compute_mask(inputs, **kwargs)
    masks = generic_utils.to_list(masks)
  else:
    outs = model.call(inputs, **kwargs)
    masks = None

  outs = generic_utils.to_list(outs)
  if masks is None:
    masks = [None for _ in outs]
  targets = generic_utils.to_list(targets)

  # Starting the accumulator at 0 makes a separate "first loss" branch
  # unnecessary: every per-output loss is simply added to it.
  total_loss = 0
  loss_metrics = []
  with backend.name_scope('loss'):
    for i, loss_fn in enumerate(model.loss_functions):
      weights = sample_weights[i] if sample_weights else None
      mask = masks[i]

      weighted_masked_fn = training_utils.weighted_masked_objective(loss_fn)
      with backend.name_scope(model.output_names[i] + '_loss'):
        output_loss = weighted_masked_fn(
            targets[i], outs[i], weights, mask=mask)
      # If the number of outputs is 1 then we don't append the loss metric
      # associated with each model output. When there are multiple outputs
      # associated with a model, each output's loss is calculated and returned
      # as part of the loss_metrics.
      if len(model.outputs) > 1:
        loss_metrics.append(backend.mean(output_loss))

      total_loss += model.loss_weights_list[i] * output_loss

    total_loss = backend.mean(total_loss)
    # Add regularization losses
    custom_losses = []
    for layer in model.layers:
      if layer.losses:
        custom_losses += layer.losses

    if custom_losses:
      total_loss += sum(custom_losses)

  return outs, total_loss, loss_metrics, masks
def _eager_loss_fn(outputs, targets, loss_fn, output_name):
  """Calls `loss_fn` on `(targets, outputs)` inside a per-output name scope.

  The scope is named `output_name + '_loss'`; the value returned by `loss_fn`
  is passed through unchanged.
  """
  with backend.name_scope(output_name + '_loss'):
    result = loss_fn(targets, outputs)
    return result
def _model_loss(model,
                inputs,
                targets,
                output_loss_metrics=None,
                sample_weights=None,
                training=False):
  """Calculates the loss for a given model.

  Arguments:
      model: The model on which metrics are being calculated.
      inputs: Either a dictionary of inputs to the model or a list of input
        arrays.
      targets: List of target arrays.
      output_loss_metrics: List of metrics that are used to aggregated output
        loss values.
      sample_weights: Optional list of sample weight arrays.
      training: Whether the model should be run in inference or training mode.

  Returns:
      A tuple `(outs, total_loss, loss_metrics, aggregated_loss_metrics,
      masks)`: the list of model outputs, the total loss (weighted per-output
      losses plus `model.losses`), the per-output mean losses, the results of
      the `output_loss_metrics` calls (both lists are only populated when the
      model has more than one output) and the list of output masks.
  """
  kwargs = {}
  if model._expects_training_arg:
    kwargs['training'] = training
  if len(inputs) == 1 and not isinstance(inputs, dict):
    inputs = inputs[0]

  if model._compute_output_and_mask_jointly:
    outs, masks = model._call_and_compute_mask(inputs, **kwargs)
    masks = generic_utils.to_list(masks)
  else:
    outs = model.call(inputs, **kwargs)
    masks = None

  outs = generic_utils.to_list(outs)
  if masks is None:
    masks = [None for _ in outs]
  targets = generic_utils.to_list(targets)

  # The accumulator starts at 0 and is never None, so every per-output loss is
  # simply added to it.
  total_loss = 0
  loss_metrics = []
  aggregated_loss_metrics = []
  with backend.name_scope('loss'):
    for i, loss_fn in enumerate(model.loss_functions):
      weights = sample_weights[i] if sample_weights else None
      mask = masks[i]
      with backend.name_scope(model.output_names[i] + '_loss'):
        if isinstance(loss_fn, losses_module.Loss):
          # `Loss` instances take the mask folded into the sample weights.
          if mask is not None:
            mask = math_ops.cast(mask, outs[i].dtype)
            # Update weights with mask.
            if weights is None:
              weights = mask
            else:
              # Update dimensions of weights to match with mask if possible.
              mask, _, weights = squeeze_or_expand_dimensions(
                  mask, None, weights)
              weights *= mask
          output_loss = loss_fn(targets[i], outs[i], sample_weight=weights)
        else:
          weighted_masked_fn = training_utils.weighted_masked_objective(
              loss_fn)
          output_loss = weighted_masked_fn(
              targets[i], outs[i], weights, mask=mask)

      # If the number of outputs is 1 then we don't append the loss metric
      # associated with each model output. When there are multiple outputs
      # associated with a model, each output's loss is calculated and returned
      # as part of the loss_metrics.
      if len(model.outputs) > 1:
        loss_metrics.append(backend.mean(output_loss))
        if output_loss_metrics is not None:
          # Keep track of the stateful loss result.
          aggregated_loss_metrics.append(
              training_utils.call_metric_function(
                  output_loss_metrics[i],
                  targets[i],
                  outs[i],
                  weights=weights,
                  mask=mask))

      total_loss += model.loss_weights_list[i] * output_loss

    total_loss = backend.mean(total_loss)

    # Add regularization losses
    custom_losses = model.losses
    if custom_losses:
      total_loss += math_ops.add_n(custom_losses)
    model._clear_losses()

  return outs, total_loss, loss_metrics, aggregated_loss_metrics, masks
def _adjust_block(p, ip, filters, block_id=None):
  """Adjusts the input `previous path` to match the shape of the `input`.

  Used in situations where the output number of filters needs to be changed.

  Arguments:
      p: Input tensor which needs to be modified
      ip: Input tensor whose shape needs to be matched
      filters: Number of output filters to be matched
      block_id: String block_id

  Returns:
      `ip` when `p` is None; otherwise `p`, passed through a reduction block
      when its spatial size differs from `ip`, through a projection block when
      only its channel count differs from `filters`, or unchanged.
  """
  channels_first = backend.image_data_format() == 'channels_first'
  channel_dim, img_dim = (1, 2) if channels_first else (-1, -2)

  ip_shape = backend.int_shape(ip)
  p_shape = None if p is None else backend.int_shape(p)

  with backend.name_scope('adjust_block'):
    if p is None:
      return ip

    if p_shape[img_dim] != ip_shape[img_dim]:
      suffix = '_%s' % block_id

      def _pool_then_project(tensor, index):
        """Strided 1x1 average pooling followed by a 1x1 convolution."""
        pooled = layers.AveragePooling2D(
            (1, 1),
            strides=(2, 2),
            padding='valid',
            name='adjust_avg_pool_%d' % index + suffix)(tensor)
        return layers.Conv2D(
            filters // 2, (1, 1),
            padding='same',
            use_bias=False,
            name='adjust_conv_%d' % index + suffix,
            kernel_initializer='he_normal')(pooled)

      with backend.name_scope('adjust_reduction_block' + suffix):
        activated = layers.Activation(
            'relu', name='adjust_relu_1' + suffix)(p)
        first_path = _pool_then_project(activated, 1)

        # Second path sees the activation shifted by one pixel (pad, then
        # crop) before pooling.
        shifted = layers.ZeroPadding2D(padding=((0, 1), (0, 1)))(activated)
        shifted = layers.Cropping2D(cropping=((1, 0), (1, 0)))(shifted)
        second_path = _pool_then_project(shifted, 2)

        merged = layers.concatenate([first_path, second_path],
                                    axis=channel_dim)
        batch_norm = layers.BatchNormalization(
            axis=channel_dim,
            momentum=0.9997,
            epsilon=1e-3,
            name='adjust_bn' + suffix)
        return batch_norm(merged)

    if p_shape[channel_dim] != filters:
      suffix = '_%s' % block_id
      with backend.name_scope('adjust_projection_block' + suffix):
        projected = layers.Activation('relu')(p)
        projected = layers.Conv2D(
            filters, (1, 1),
            strides=(1, 1),
            padding='same',
            name='adjust_conv_projection' + suffix,
            use_bias=False,
            kernel_initializer='he_normal')(projected)
        batch_norm = layers.BatchNormalization(
            axis=channel_dim,
            momentum=0.9997,
            epsilon=1e-3,
            name='adjust_bn' + suffix)
        return batch_norm(projected)

    return p
def _assign_singular_vector(self, variable, value):
  """Assigns `value` to `variable`, colocated with the variable.

  The assign op is created under the 'AssignSingularVector' name scope and the
  scope string is passed as the op name.

  Args:
    variable: Variable to assign to; also used for colocation.
    value: Value to assign.

  Returns:
    The result of `state_ops.assign`.
  """
  with K.name_scope('AssignSingularVector') as op_scope, \
      ops.colocate_with(variable):
    return state_ops.assign(variable, value, name=op_scope)
def _reduction_a_cell(ip, p, filters, block_id=None):
  """Adds a Reduction cell for NASNet-A (Fig. 4 in the paper).

  Arguments:
      ip: Input tensor `x`
      p: Input tensor `p`
      filters: Number of output filters
      block_id: String block_id

  Returns:
      A tuple `(x, ip)`: the concatenated cell output and the unmodified
      input tensor `ip`.
  """
  channels_first = backend.image_data_format() == 'channels_first'
  channel_dim = 1 if channels_first else -1
  suffix = '_%s' % block_id
  # Settings shared by every stride-2 pooling layer in the cell.
  strided_pool = dict(strides=(2, 2), padding='valid')

  with backend.name_scope('reduction_A_block' + suffix):
    p = _adjust_block(p, ip, filters, block_id)

    # Stem: relu -> 1x1 conv -> batch norm, plus a zero-padded copy that
    # feeds the strided pooling branches.
    stem = layers.Activation('relu')(ip)
    stem = layers.Conv2D(
        filters, (1, 1),
        strides=(1, 1),
        padding='same',
        name='reduction_conv_1' + suffix,
        use_bias=False,
        kernel_initializer='he_normal')(stem)
    stem = layers.BatchNormalization(
        axis=channel_dim,
        momentum=0.9997,
        epsilon=1e-3,
        name='reduction_bn_1' + suffix)(stem)
    padded = layers.ZeroPadding2D(
        padding=imagenet_utils.correct_pad(stem, 3),
        name='reduction_pad_1' + suffix)(stem)

    with backend.name_scope('block_1'):
      left = _separable_conv_block(
          stem, filters, (5, 5),
          strides=(2, 2),
          block_id='reduction_left1' + suffix)
      right = _separable_conv_block(
          p, filters, (7, 7),
          strides=(2, 2),
          block_id='reduction_right1' + suffix)
      x1 = layers.add([left, right], name='reduction_add_1' + suffix)

    with backend.name_scope('block_2'):
      left = layers.MaxPooling2D(
          (3, 3), name='reduction_left2' + suffix, **strided_pool)(padded)
      right = _separable_conv_block(
          p, filters, (7, 7),
          strides=(2, 2),
          block_id='reduction_right2' + suffix)
      x2 = layers.add([left, right], name='reduction_add_2' + suffix)

    with backend.name_scope('block_3'):
      left = layers.AveragePooling2D(
          (3, 3), name='reduction_left3' + suffix, **strided_pool)(padded)
      right = _separable_conv_block(
          p, filters, (5, 5),
          strides=(2, 2),
          block_id='reduction_right3' + suffix)
      x3 = layers.add([left, right], name='reduction_add3' + suffix)

    with backend.name_scope('block_4'):
      pooled_x1 = layers.AveragePooling2D(
          (3, 3),
          strides=(1, 1),
          padding='same',
          name='reduction_left4' + suffix)(x1)
      x4 = layers.add([x2, pooled_x1])

    with backend.name_scope('block_5'):
      left = _separable_conv_block(
          x1, filters, (3, 3), block_id='reduction_left4' + suffix)
      right = layers.MaxPooling2D(
          (3, 3), name='reduction_right5' + suffix, **strided_pool)(padded)
      x5 = layers.add([left, right], name='reduction_add4' + suffix)

    cell_output = layers.concatenate(
        [x2, x3, x4, x5],
        axis=channel_dim,
        name='reduction_concat' + suffix)
    return cell_output, ip
def _subdiv_calculate_mean_and_var(self, x, axes, keep_dims):
  """Computes moments of `x` plus the partial sums they were built from.

  When a replica context is available, the per-replica sums, squared sums and
  batch sizes are summed across replicas with `all_reduce` before the mean and
  variance are derived. Otherwise the statistics are computed from the local
  tensor only.

  Args:
    x: Input tensor. float16 inputs are cast to float32 for the computation
      and the four statistics are cast back to float16 on return.
    axes: Axes to reduce over.
    keep_dims: If false, the reduced axes are squeezed out of the returned
      statistics.

  Returns:
    A tuple `(mean, net_sum, variance, squared_mean, input_batch_size)`, where
    `input_batch_size` is cast to int32.
  """
  with K.name_scope('moments'):
    # The dynamic range of fp16 is too limited to support the collection of
    # sufficient statistics. As a workaround we simply perform the operations
    # on 32-bit floats before converting the mean and variance back to fp16
    y = math_ops.cast(
        x, dtypes.float32) if x.dtype == dtypes.float16 else x
    replica_ctx = ds.get_replica_context()

    if replica_ctx:
      # Sums computed on this replica only.
      local_sum = math_ops.reduce_sum(y, axis=axes, keepdims=True)
      local_squared_sum = math_ops.reduce_sum(math_ops.square(y),
                                              axis=axes,
                                              keepdims=True)
      batch_size = math_ops.cast(
          array_ops.shape_v2(y)[0], dtypes.float32)
      # TODO(b/163099951): batch the all-reduces once we sort out the ordering
      # issue for NCCL. We don't have a mechanism to launch NCCL in the same
      # order in each replica nowadays, so we limit NCCL to batch all-reduces.

      # Sum of the inputs across all replicas.
      y_sum = replica_ctx.all_reduce(reduce_util.ReduceOp.SUM, local_sum)
      # Sum of the squared inputs across all replicas.
      y_squared_sum = replica_ctx.all_reduce(
          reduce_util.ReduceOp.SUM, local_squared_sum)
      # Total batch size across all replicas.
      input_batch_size = replica_ctx.all_reduce(
          reduce_util.ReduceOp.SUM, batch_size)

      # Number of elements averaged per sample (local).
      # NOTE(review): this indexes the shape with range(1, len(axes)) rather
      # than with the entries of `axes`, so it presumably assumes
      # axes == [0, 1, ..., k] -- confirm against callers.
      axes_vals = [(array_ops.shape_v2(y))[i] for i in range(1, len(axes))]
      multiplier_ = math_ops.cast(math_ops.reduce_prod(axes_vals),
                                  dtypes.float32)
      multiplier = multiplier_ * input_batch_size

      # Convert the global sums into mean and variance (locally).
      mean = y_sum / multiplier
      y_squared_mean = y_squared_sum / multiplier
      # var = E(x^2) - E(x)^2
      variance = y_squared_mean - math_ops.square(mean)
      # Sums normalized by the per-sample element count only (not by the
      # batch size).
      net_sum = y_sum / multiplier_
      squared_mean = y_squared_sum / multiplier_
    else:
      # No replica context: build the statistics from local sums.
      net_sum = math_ops.reduce_sum(y, axis=axes, keepdims=True)
      squared_mean = math_ops.reduce_sum(math_ops.square(y),
                                         axis=axes,
                                         keepdims=True)

      if self._support_zero_size_input():
        # Keras assumes that batch dimension is the first dimension for Batch
        # Normalization.
        input_batch_size = array_ops.shape(y)[0]
      else:
        input_batch_size = None

      # Number of elements averaged per sample (local); see the NOTE(review)
      # on the same expression in the replica branch.
      axes_vals = [(array_ops.shape_v2(y))[i] for i in range(1, len(axes))]
      multiplier = math_ops.cast(math_ops.reduce_prod(axes_vals),
                                 dtypes.float32)

      squared_mean = squared_mean / multiplier
      net_sum = net_sum / multiplier

      if input_batch_size is None:
        mean, variance = nn.moments(y, axes, keep_dims=True)
        # 0 stands in for the batch size that was not collected.
        input_batch_size = 0
      else:
        batches_ = math_ops.cast(input_batch_size, self._param_dtype)
        # Compute true mean while keeping the dims for proper broadcasting.
        mean = net_sum / batches_
        variance = squared_mean / batches_ - math_ops.square(mean)

    input_batch_size = math_ops.cast(input_batch_size, dtypes.int32)
    if not keep_dims:
      mean = array_ops.squeeze(mean, axes)
      net_sum = array_ops.squeeze(net_sum, axes)
      variance = array_ops.squeeze(variance, axes)
      squared_mean = array_ops.squeeze(squared_mean, axes)
    if x.dtype == dtypes.float16:
      return (math_ops.cast(mean, dtypes.float16),
              math_ops.cast(net_sum, dtypes.float16),
              math_ops.cast(variance, dtypes.float16),
              math_ops.cast(squared_mean, dtypes.float16),
              input_batch_size)
    else:
      return (mean, net_sum, variance, squared_mean, input_batch_size)
def __init__(self, lr=0.2, kd=0.1, **kwargs):
  """Creates the `iterations`, `lr` and `kd` backend variables.

  Args:
    lr: Initial value of the `lr` variable.
    kd: Initial value of the `kd` variable.
    **kwargs: Forwarded unchanged to the parent constructor.
  """
  super(PD, self).__init__(**kwargs)
  scope_name = self.__class__.__name__
  with K.name_scope(scope_name):
    self.iterations = K.variable(0, dtype='int64', name='iterations')
    # Each hyperparameter becomes a variable named after its attribute.
    for attr_name, initial_value in (('lr', lr), ('kd', kd)):
      setattr(self, attr_name, K.variable(initial_value, name=attr_name))
def __init__(self, optimizer):  # pylint: disable=super-init-not-called
  """Wraps `optimizer` and creates the `iterations` counter variable.

  Args:
    optimizer: Object stored on `self.optimizer` and registered through
      `_track_checkpointable` under the name 'optimizer'.
  """
  self.optimizer = optimizer
  self._track_checkpointable(optimizer, name='optimizer')
  scope_name = self.__class__.__name__
  with K.name_scope(scope_name):
    step_counter = K.variable(0, dtype='int64', name='iterations')
    self.iterations = step_counter
def __init__(self, name, **kwargs):
  """Create a new Optimizer.

  This must be called by the constructors of subclasses.
  Note that Optimizer instances should not bind to a single graph,
  and so shouldn't keep Tensors as member variables. Generally
  you should be able to use the _set_hyper()/state.get_hyper()
  facility instead.

  This class is stateful and thread-compatible.

  Args:
    name: A non-empty string.  The name to use for accumulators created
      for the optimizer.
    **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
      `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
      gradients by value, `decay` is included for backward compatibility to
      allow time inverse decay of learning rate. `lr` is included for backward
      compatibility, recommended to use `learning_rate` instead. A value of
      `None` skips the non-negativity check for that argument.

  Raises:
    TypeError: If an unexpected keyword argument is passed.
    ValueError: If name is malformed, or if any keyword argument is negative.
    RuntimeError: If _create_slots has been overridden instead of
        _create_vars.
  """
  allowed_kwargs = {"clipnorm", "clipvalue", "lr", "decay"}
  for k, value in kwargs.items():
    if k not in allowed_kwargs:
      raise TypeError("Unexpected keyword argument "
                      "passed to optimizer: " + str(k))
    # Checks that all keyword arguments are non-negative. `None` cannot be
    # ordered against a number on Python 3, so it is skipped instead of
    # crashing the comparison.
    if value is not None and value < 0:
      raise ValueError("Expected {} >= 0, received: {}".format(k, value))

  self._use_locking = True
  self._init_set_name(name)
  # in graph mode, name_scope performs uniquification, so keep scope_context.
  with backend.name_scope(self._name) as name_scope:
    self._scope_ctx = name_scope
  self._hyper = {}
  # dict: {variable name : {slot name : variable}}
  self._slots = {}
  self._slot_names = []
  self._weights = []
  self._iterations = None

  # For implementing Trackable. Stores information about how to restore
  # slot variables which have not yet been created
  # (trackable._CheckpointPosition objects).
  #  {slot_name :
  #      {_var_key(variable_to_train): [checkpoint_position, ... ], ... },
  #   ... }
  self._deferred_slot_restorations = {}

  decay = kwargs.pop("decay", 0.0)
  if decay < 0.:
    raise ValueError("decay cannot be less than 0: {}".format(decay))
  self._initial_decay = decay
  if "clipnorm" in kwargs:
    self.clipnorm = kwargs.pop("clipnorm")
  if "clipvalue" in kwargs:
    self.clipvalue = kwargs.pop("clipvalue")

  self._hypers_created = False
def _model_loss(model,
                inputs,
                targets,
                output_loss_metrics=None,
                sample_weights=None,
                training=False):
  """Calculates the loss for a given model.

  Args:
      model: The model on which metrics are being calculated.
      inputs: Either a dictionary of inputs to the model or a list of input
        arrays.
      targets: List of target arrays.
      output_loss_metrics: List of metrics that are used to aggregate output
        loss values. Indexed per output whenever the model has more than one
        output.
      sample_weights: Optional list of sample weight arrays.
      training: Whether the model should be run in inference or training mode.

  Returns:
      A tuple `(outs, total_loss, output_losses, masks)`: the flattened model
      outputs, the total loss (weighted per-output losses plus regularization
      losses, with sample weighting and masking applied), the results of the
      per-output loss metrics (only populated when the model has more than one
      output) and the list of output masks.

  Raises:
      ValueError: If the model has neither a non-None loss function nor any
        entry in `model.losses`.
  """
  # TODO(psv): Dedup code here with graph mode prepare_total_loss() fn.

  # Used to keep track of the total loss value (stateless).
  # eg., total_loss = loss_weight_1 * output_1_loss_fn(...) +
  #                   loss_weight_2 * output_2_loss_fn(...) +
  #                   layer losses.
  total_loss = 0
  kwargs = {}
  if model._expects_training_arg:
    kwargs['training'] = training
  if len(inputs) == 1 and not isinstance(inputs, dict):
    inputs = inputs[0]

  # Allow mixed `NumPy` and `EagerTensor` input here.
  if any(
      isinstance(input_t, (np.ndarray, float, int))
      for input_t in nest.flatten(inputs)):
    inputs = nest.map_structure(ops.convert_to_tensor_v2_with_dispatch, inputs)

  outs = model(inputs, **kwargs)
  outs = nest.flatten(outs)

  if targets:
    targets = training_utils_v1.cast_if_floating_dtype_and_mismatch(
        targets, outs)
  # TODO(sallymatson/psv): check if we should do same mismatch fix for weights
  if sample_weights:
    sample_weights = [
        training_utils_v1.cast_if_floating_dtype(
            ops.convert_to_tensor_v2_with_dispatch(val))
        if val is not None else None for val in sample_weights
    ]

  # Outputs that carry no `_keras_mask` attribute get a `None` mask.
  masks = [getattr(t, '_keras_mask', None) for t in outs]
  targets = nest.flatten(targets)

  # Used to keep track of individual output losses.
  output_losses = []

  with backend.name_scope('loss'):
    loss_fns = [
        loss_fn for loss_fn in model.loss_functions if loss_fn is not None
    ]
    custom_losses = model.losses  # Regularization losses

    if not loss_fns and not custom_losses:
      if training:
        raise ValueError('The model cannot be trained '
                         'because it has no loss to optimize.')
      else:
        raise ValueError('The model cannot be evaluated '
                         'because it has no loss to compute.')

    # NOTE(review): `i` indexes the filtered `loss_fns` list but is also used
    # to index `outs`, `targets`, `masks`, `sample_weights` and
    # `model.output_names`; presumably `None` loss functions never precede
    # real ones -- confirm.
    for i, loss_fn in enumerate(loss_fns):
      weights = sample_weights[i] if sample_weights else None
      mask = masks[i]
      with backend.name_scope(model.output_names[i] + '_loss'):
        if mask is not None:
          mask = math_ops.cast(mask, outs[i].dtype)
          # Update weights with mask.
          if weights is None:
            weights = mask
          else:
            # Update dimensions of weights to match with mask if possible.
            weights = math_ops.cast(weights, outs[i].dtype)
            mask, _, weights = (
                losses_utils.squeeze_or_expand_dimensions(
                    mask, sample_weight=weights))
            weights *= mask

        if hasattr(loss_fn, 'reduction'):
          # Compute the unreduced losses first, then apply the loss object's
          # own reduction explicitly.
          per_sample_losses = loss_fn.call(targets[i], outs[i])
          weighted_losses = losses_utils.compute_weighted_loss(
              per_sample_losses,
              sample_weight=weights,
              reduction=losses_utils.ReductionV2.NONE)
          loss_reduction = loss_fn.reduction

          # `AUTO` loss reduction defaults to `SUM_OVER_BATCH_SIZE` for all
          # compile use cases.
          if loss_reduction == losses_utils.ReductionV2.AUTO:
            loss_reduction = losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE

          # Compute the stateless loss value.
          output_loss = losses_utils.reduce_weighted_loss(
              weighted_losses, reduction=loss_reduction)
        else:
          # Compute the stateless loss value for a custom loss class.
          # Here we assume that the class takes care of loss reduction
          # because if this class returns a vector value we cannot
          # differentiate between use case where a custom optimizer
          # expects a vector loss value vs unreduced per-sample loss value.
          output_loss = loss_fn(targets[i], outs[i], sample_weight=weights)
          loss_reduction = losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE

      # If the number of outputs is 1 then we don't append the loss metric
      # associated with each model output. When there are multiple outputs
      # associated with a model, each output's loss is calculated and returned
      # as part of the loss_metrics.
      if len(model.outputs) > 1:
        # Keep track of the stateful output loss result.
        output_losses.append(output_loss_metrics[i](output_loss))

      # Scale output loss for distribution. For custom losses we assume
      # reduction was mean.
      if loss_reduction == losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE:
        output_loss = losses_utils.scale_loss_for_distribution(output_loss)
      total_loss += model._loss_weights_list[i] * output_loss

    # Add regularization losses
    if custom_losses:
      total_loss += losses_utils.scale_loss_for_distribution(
          math_ops.add_n(custom_losses))
  return outs, total_loss, output_losses, masks
def _model_loss(model,
                inputs,
                targets,
                output_loss_metrics=None,
                sample_weights=None,
                training=False):
  """Calculates the loss for a given model.

  Arguments:
      model: The model on which metrics are being calculated.
      inputs: Either a dictionary of inputs to the model or a list of input
        arrays.
      targets: List of target arrays.
      output_loss_metrics: List of metrics that are used to aggregated output
        loss values.
      sample_weights: Optional list of sample weight arrays.
      training: Whether the model should be run in inference or training mode.

  Returns:
      A tuple `(outs, total_loss, output_losses, aggregated_output_losses,
      masks)`: the flattened model outputs, the total loss (weighted
      per-output losses plus regularization losses), the stateless per-output
      mean losses, the stateful per-output metric results (both lists are only
      populated when the model has more than one output) and the list of
      output masks.
  """
  kwargs = {}
  if model._expects_training_arg:
    kwargs['training'] = training
  if len(inputs) == 1 and not isinstance(inputs, dict):
    inputs = inputs[0]

  # Allow mixed `NumPy` and `EagerTensor` input here.
  if any(
      isinstance(input_t, (np.ndarray, float, int))
      for input_t in nest.flatten(inputs)):
    inputs = nest.map_structure(ops.convert_to_tensor, inputs)

  outs = model(inputs, **kwargs)
  outs = nest.flatten(outs)

  # Outputs that carry no `_keras_mask` attribute are treated as unmasked
  # instead of raising AttributeError.
  masks = [getattr(t, '_keras_mask', None) for t in outs]
  targets = nest.flatten(targets)

  # Used to keep track of the total loss value (stateless).
  # eg., total_loss = loss_weight_1 * output_1_loss_fn(...) +
  #                   loss_weight_2 * output_2_loss_fn(...) +
  #                   layer losses.
  # It starts at 0 and is never None, so each output loss is simply added.
  total_loss = 0
  # Used to keep track of individual output losses (stateless).
  output_losses = []
  # Used to keep track of individual output losses (stateful).
  aggregated_output_losses = []

  with backend.name_scope('loss'):
    for i, loss_fn in enumerate(model.loss_functions):
      weights = sample_weights[i] if sample_weights else None
      mask = masks[i]
      with backend.name_scope(model.output_names[i] + '_loss'):
        if mask is not None:
          mask = math_ops.cast(mask, outs[i].dtype)
          # Update weights with mask.
          if weights is None:
            weights = mask
          else:
            # Update dimensions of weights to match with mask if possible.
            mask, _, weights = (
                losses_utils.squeeze_or_expand_dimensions(
                    mask, None, weights))
            weights *= mask

        # Reset reduction on the loss so that we can get the per sample loss
        # value. We use this to get both the stateless and stateful loss
        # values without having to compute the underlying loss function
        # twice.
        weighted_losses = None
        if hasattr(loss_fn, 'reduction'):
          current_loss_reduction = loss_fn.reduction
          loss_fn.reduction = losses_utils.ReductionV2.NONE
          try:
            weighted_losses = loss_fn(
                targets[i], outs[i], sample_weight=weights)
          finally:
            # Restore the caller's reduction even if the loss call raises, so
            # the loss object is never left stuck on `NONE`.
            loss_fn.reduction = current_loss_reduction

          # Compute the stateless loss value.
          output_loss = losses_utils.reduce_weighted_loss(weighted_losses)
        else:
          # Compute the stateless loss value for a custom loss class.
          # Here we assume that the class takes care of loss reduction
          # because if this class returns a vector value we cannot
          # differentiate between use case where a custom optimizer
          # expects a vector loss value vs unreduced per-sample loss value.
          output_loss = loss_fn(targets[i], outs[i], sample_weight=weights)

      # If the number of outputs is 1 then we don't append the loss metric
      # associated with each model output. When there are multiple outputs
      # associated with a model, each output's loss is calculated and returned
      # as part of the loss_metrics.
      if len(model.outputs) > 1:
        output_losses.append(backend.mean(output_loss))

        if output_loss_metrics is not None:
          # Compute the stateful loss value.
          if weighted_losses is not None:
            aggregated_output_loss = output_loss_metrics[i](weighted_losses)
          else:
            # Custom loss class.
            aggregated_output_loss = training_utils.call_metric_function(
                output_loss_metrics[i], targets[i], outs[i], weights=weights)
          # Keep track of the stateful output loss result.
          aggregated_output_losses.append(aggregated_output_loss)

      total_loss += model.loss_weights_list[i] * output_loss

    total_loss = backend.mean(total_loss)

    # Add regularization losses
    custom_losses = model.losses
    if custom_losses:
      total_loss += losses_utils.scale_loss_for_distribution(
          math_ops.add_n(custom_losses))
    model._clear_losses()

  return outs, total_loss, output_losses, aggregated_output_losses, masks
def _normal_a_cell(ip, p, filters, block_id=None):
  """Adds a Normal cell for NASNet-A (Fig. 4 in the paper).

  Arguments:
      ip: Input tensor `x`
      p: Input tensor `p`
      filters: Number of output filters
      block_id: String block_id

  Returns:
      A tuple `(x, ip)`: the concatenated cell output and the unmodified
      input tensor `ip`.
  """
  channels_first = backend.image_data_format() == 'channels_first'
  channel_dim = 1 if channels_first else -1
  suffix = '_%s' % block_id
  # Settings shared by every 3x3 average pooling layer in the cell.
  same_pool = dict(strides=(1, 1), padding='same')

  with backend.name_scope('normal_A_block' + suffix):
    p = _adjust_block(p, ip, filters, block_id)

    # Stem: relu -> 1x1 conv -> batch norm.
    stem = layers.Activation('relu')(ip)
    stem = layers.Conv2D(
        filters, (1, 1),
        strides=(1, 1),
        padding='same',
        name='normal_conv_1' + suffix,
        use_bias=False,
        kernel_initializer='he_normal')(stem)
    stem = layers.BatchNormalization(
        axis=channel_dim,
        momentum=0.9997,
        epsilon=1e-3,
        name='normal_bn_1' + suffix)(stem)

    with backend.name_scope('block_1'):
      left = _separable_conv_block(
          stem, filters,
          kernel_size=(5, 5),
          block_id='normal_left1' + suffix)
      right = _separable_conv_block(
          p, filters, block_id='normal_right1' + suffix)
      x1 = layers.add([left, right], name='normal_add_1' + suffix)

    with backend.name_scope('block_2'):
      left = _separable_conv_block(
          p, filters, (5, 5), block_id='normal_left2' + suffix)
      right = _separable_conv_block(
          p, filters, (3, 3), block_id='normal_right2' + suffix)
      x2 = layers.add([left, right], name='normal_add_2' + suffix)

    with backend.name_scope('block_3'):
      pooled = layers.AveragePooling2D(
          (3, 3), name='normal_left3' + suffix, **same_pool)(stem)
      x3 = layers.add([pooled, p], name='normal_add_3' + suffix)

    with backend.name_scope('block_4'):
      left = layers.AveragePooling2D(
          (3, 3), name='normal_left4' + suffix, **same_pool)(p)
      right = layers.AveragePooling2D(
          (3, 3), name='normal_right4' + suffix, **same_pool)(p)
      x4 = layers.add([left, right], name='normal_add_4' + suffix)

    with backend.name_scope('block_5'):
      conv = _separable_conv_block(
          stem, filters, block_id='normal_left5' + suffix)
      x5 = layers.add([conv, stem], name='normal_add_5' + suffix)

    cell_output = layers.concatenate(
        [p, x1, x2, x3, x4, x5],
        axis=channel_dim,
        name='normal_concat' + suffix)
  return cell_output, ip
def build(self, input_shape):
  """Builds the forward and then the backward layer, each in its own scope.

  Args:
    input_shape: Shape passed unchanged to both wrapped layers' `build`.
  """
  for wrapped_layer in (self.forward_layer, self.backward_layer):
    # Each wrapped layer is built under a name scope carrying its own name.
    with K.name_scope(wrapped_layer.name):
      wrapped_layer.build(input_shape)
  self.built = True
def build(self, input_shape):
  """Builds both wrapped layers and marks this layer as built.

  Args:
    input_shape: Shape handed to `forward_layer.build` and
      `backward_layer.build`.
  """
  forward, backward = self.forward_layer, self.backward_layer
  with K.name_scope(forward.name):
    forward.build(input_shape)
  with K.name_scope(backward.name):
    backward.build(input_shape)
  self.built = True
def add_weight(self,
               name,
               shape,
               dtype=None,
               initializer=None,
               regularizer=None,
               trainable=None,
               constraint=None,
               use_resource=None,
               synchronization=vs.VariableSynchronization.AUTO,
               aggregation=vs.VariableAggregation.NONE,
               partitioner=None,
               **kwargs):
  """Adds a new variable to the layer, or gets an existing one; returns it.

  Arguments:
    name: variable name.
    shape: variable shape.
    dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
    initializer: initializer instance (callable). Defaults to the initializer
      of the enclosing variable scope.
    regularizer: regularizer instance (callable).
    trainable: whether the variable should be part of the layer's
      "trainable_variables" (e.g. variables, biases)
      or "non_trainable_variables" (e.g. BatchNorm mean, stddev).
      Note, if the current variable scope is marked as non-trainable
      then this parameter is ignored and any added variables are also
      marked as non-trainable. `trainable` defaults to `True` unless
      `synchronization` is set to `ON_READ`.
    constraint: constraint instance (callable).
    use_resource: Whether to use `ResourceVariable`.
    synchronization: Indicates when a distributed variable will be
      aggregated. Accepted values are constants defined in the class
      `tf.VariableSynchronization`. By default the synchronization is set to
      `AUTO` and the current `DistributionStrategy` chooses
      when to synchronize. If `synchronization` is set to `ON_READ`,
      `trainable` must not be set to `True`.
    aggregation: Indicates how a distributed variable will be aggregated.
      Accepted values are constants defined in the class
      `tf.VariableAggregation`.
    partitioner: (optional) partitioner instance (callable).  If
      provided, when the requested variable is created it will be split
      into multiple partitions according to `partitioner`.  In this case,
      an instance of `PartitionedVariable` is returned.  Available
      partitioners include `tf.compat.v1.fixed_size_partitioner` and
      `tf.compat.v1.variable_axis_size_partitioner`.  For more details, see
      the documentation of `tf.compat.v1.get_variable` and the  "Variable
      Partitioners and Sharding" section of the API guide.
    **kwargs: Additional keyword arguments. Only `experimental_autocast` is
      accepted.

  Returns:
    The created variable.  Usually either a `Variable` or `ResourceVariable`
    instance.  If `partitioner` is not `None`, a `PartitionedVariable`
    instance is returned.

  Raises:
    TypeError: If a keyword argument other than `experimental_autocast` is
      passed.
    RuntimeError: If called with partitioned variable regularization and
      eager execution is enabled.
    ValueError: When trainable has been set to True with synchronization
      set as `ON_READ`.
  """
  for kwarg in kwargs:
    if kwarg != 'experimental_autocast':
      raise TypeError('Unknown keyword argument:', kwarg)
  if self._keras_style:
    # NOTE(review): this path passes the default AUTO / NONE constants rather
    # than the caller's `synchronization` / `aggregation` values -- confirm
    # this is intentional.
    return super(Layer, self).add_weight(
        name=name,
        shape=shape,
        dtype=dtype,
        initializer=initializer,
        regularizer=regularizer,
        trainable=trainable and self.trainable,
        constraint=constraint,
        use_resource=use_resource,
        synchronization=vs.VariableSynchronization.AUTO,
        aggregation=vs.VariableAggregation.NONE,
        partitioner=partitioner,
        **kwargs)

  if synchronization == vs.VariableSynchronization.ON_READ:
    if trainable:
      raise ValueError(
          'Synchronization value can be set to '
          'VariableSynchronization.ON_READ only for non-trainable variables. '
          'You have specified trainable=True and '
          'synchronization=VariableSynchronization.ON_READ.')
    else:
      # Set trainable to be false when variable is to be synced on read.
      trainable = False
  elif trainable is None:
    trainable = True

  def _should_add_regularizer(variable, existing_variable_set):
    # A split variable qualifies only if none of its parts is already in the
    # set; any other variable qualifies if it is not in the set itself.
    if base_layer_utils.is_split_variable(variable):
      for var in variable:
        if var in existing_variable_set:
          return False
      return True
    else:
      return variable not in existing_variable_set

  # Snapshot the global variables that exist before this call so that a
  # variable which already existed can be told apart from a new one below.
  init_graph = None
  if not context.executing_eagerly():
    default_graph = ops.get_default_graph()
    if default_graph.building_function:
      with ops.init_scope():
        # Retrieve the variables from the graph into which variables
        # will be lifted; if initialization ops will be lifted into
        # the eager context, then there is nothing to retrieve, since variable
        # collections are not supported when eager execution is enabled.
        if not context.executing_eagerly():
          init_graph = ops.get_default_graph()
          existing_variables = set(tf_variables.global_variables())
    else:
      # Initialization ops will not be lifted out of the default graph.
      init_graph = default_graph
      existing_variables = set(tf_variables.global_variables())

  if dtype is None:
    dtype = self.dtype or dtypes.float32

  self._set_scope(None)
  reuse = self.built or self._reuse
  # Remember how many trainable weights existed so that any appended by this
  # call can be identified afterwards.
  prev_len_trainable = len(self._trainable_weights)
  with vs.variable_scope(
      self._scope, reuse=reuse, auxiliary_name_scope=False) as scope:
    self._current_scope = scope
    with backend.name_scope(self._name_scope()):
      use_resource = (use_resource or
                      self._use_resource_variables or
                      scope.use_resource)
      if initializer is None:
        initializer = scope.initializer
      variable = super(Layer, self).add_weight(
          name,
          shape,
          dtype=dtypes.as_dtype(dtype),
          initializer=initializer,
          trainable=trainable and self.trainable,
          constraint=constraint,
          partitioner=partitioner,
          use_resource=use_resource,
          synchronization=synchronization,
          aggregation=aggregation,
          getter=vs.get_variable,
          **kwargs)

      if regularizer:
        # `existing_variables` is only read when the first condition is
        # false, because of short-circuit evaluation.
        if (ops.executing_eagerly_outside_functions()
            or _should_add_regularizer(variable, existing_variables)):
          self._handle_weight_regularization(name, variable, regularizer)

      if init_graph is not None:
        # Handle edge case where a custom getter has overridden `trainable`.
        # There is one known occurrence of this, in unit test
        # testBasicRNNCellNotTrainable in
        # contrib.rnn.python.kernel_tests.core_rnn_cell_test
        with init_graph.as_default():
          trainable_variables = tf_variables.trainable_variables()
        if (trainable and self.trainable and
            variable not in trainable_variables):
          # A custom getter / variable scope overrode the trainable flag.
          # Move the weights added by this call to the non-trainable list.
          extra_trainable_vars = self._trainable_weights[prev_len_trainable:]
          self._trainable_weights = self._trainable_weights[
              :prev_len_trainable]
          self._non_trainable_weights += extra_trainable_vars
  return variable