  def testExportSharded(self):
    with self.cached_session():
      empty_key = -2
      default_val = -1
      num_shards = 2
      keys = constant_op.constant([10, 11, 12], dtypes.int64)
      values = constant_op.constant([2, 3, 4], dtypes.int64)
      table = ShardedMutableDenseHashTable(
          dtypes.int64,
          dtypes.int64,
          default_val,
          empty_key,
          num_shards=num_shards)
      self.assertAllEqual(0, table.size().eval())

      table.insert(keys, values).run()
      self.assertAllEqual(3, table.size().eval())

      keys_list, values_list = table.export_sharded()
      self.assertAllEqual(num_shards, len(keys_list))
      self.assertAllEqual(num_shards, len(values_list))

      # Exported keys include empty key buckets set to the empty_key
      self.assertAllEqual(set([-2, 10, 12]),
                          set(keys_list[0].eval().flatten()))
      self.assertAllEqual(set([-2, 11]), set(keys_list[1].eval().flatten()))

      # Exported values include empty value buckets set to 0
      self.assertAllEqual(set([0, 2, 4]),
                          set(values_list[0].eval().flatten()))
      self.assertAllEqual(set([0, 3]), set(values_list[1].eval().flatten()))

class SdcaModel(object):
  """Stochastic dual coordinate ascent solver for linear models.

  Loss functions supported:

   * Binary logistic loss
   * Squared loss
   * Hinge loss
   * Smooth hinge loss
   * Poisson log loss

  This class defines an optimizer API to train a linear model.

  ### Usage

  ```python
  # Create a solver with the desired parameters.
  lr = tf.contrib.linear_optimizer.SdcaModel(examples, variables, options)
  min_op = lr.minimize()
  opt_op = lr.update_weights(min_op)

  predictions = lr.predictions(examples)
  # Primal loss + L1 loss + L2 loss.
  regularized_loss = lr.regularized_loss(examples)
  # Primal loss only
  unregularized_loss = lr.unregularized_loss(examples)

  examples: {
    sparse_features: list of SparseFeatureColumn.
    dense_features: list of dense tensors of type float32.
    example_labels: a tensor of type float32 and shape [Num examples]
    example_weights: a tensor of type float32 and shape [Num examples]
    example_ids: a tensor of type string and shape [Num examples]
  }
  variables: {
    sparse_features_weights: list of tensors of shape [vocab size]
    dense_features_weights: list of tensors of shape [dense_feature_dimension]
  }
  options: {
    symmetric_l1_regularization: 0.0
    symmetric_l2_regularization: 1.0
    loss_type: "logistic_loss"
    num_loss_partitions: 1 (Optional, with default value of 1. Number of
    partitions of the global loss function, 1 means single machine solver,
    and >1 when we have more than one optimizer working concurrently.)
    num_table_shards: 1 (Optional, with default value of 1. Number of shards
    of the internal state table, typically set to match the number of
    parameter servers for large data sets.)
  }
  ```

  In the training program you simply run the Op returned by minimize().

  ```python
  # Execute opt_op and train for num_steps.
  for _ in range(num_steps):
    opt_op.run()

  # You can also check for convergence by calling
  lr.approximate_duality_gap()
  ```
  """

  @deprecation.deprecated(
      None, 'This class is deprecated. To UPDATE or USE linear optimizers, '
      'please check its latest version in core: '
      'tensorflow_estimator/python/estimator/canned/linear_optimizer/.')
  def __init__(self, examples, variables, options):
    """Create a new sdca optimizer."""

    if not examples or not variables or not options:
      raise ValueError('examples, variables and options must all be '
                       'specified.')

    supported_losses = ('logistic_loss', 'squared_loss', 'hinge_loss',
                        'smooth_hinge_loss', 'poisson_loss')
    if options['loss_type'] not in supported_losses:
      raise ValueError('Unsupported loss_type: ', options['loss_type'])

    self._assertSpecified([
        'example_labels', 'example_weights', 'example_ids', 'sparse_features',
        'dense_features'
    ], examples)
    self._assertList(['sparse_features', 'dense_features'], examples)

    self._assertSpecified(
        ['sparse_features_weights', 'dense_features_weights'], variables)
    self._assertList(['sparse_features_weights', 'dense_features_weights'],
                     variables)

    self._assertSpecified([
        'loss_type', 'symmetric_l2_regularization',
        'symmetric_l1_regularization'
    ], options)

    for name in ['symmetric_l1_regularization', 'symmetric_l2_regularization']:
      value = options[name]
      if value < 0.0:
        raise ValueError('%s should be non-negative. Found (%f)' %
                         (name, value))

    self._examples = examples
    self._variables = variables
    self._options = options
    self._create_slots()
    self._hashtable = ShardedMutableDenseHashTable(
        key_dtype=dtypes.int64,
        value_dtype=dtypes.float32,
        num_shards=self._num_table_shards(),
        default_value=[0.0, 0.0, 0.0, 0.0],
        # SdcaFprint never returns 0 or 1 for the low64 bits, so this is a
        # safe empty_key (that will never collide with actual payloads).
        empty_key=[0, 0],
        deleted_key=[1, 1])

    summary.scalar('approximate_duality_gap', self.approximate_duality_gap())
    summary.scalar('examples_seen', self._hashtable.size())
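
  # The hash table above stores one float32[4] state vector per hashed
  # example id. Indices [1], [2] and [3] are read back in
  # approximate_duality_gap() as the per-example primal loss, dual loss and
  # example weight; index [0] is, by our reading (an assumption, not stated
  # in this file), the dual variable maintained by the solver.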

  def _symmetric_l1_regularization(self):
    return self._options['symmetric_l1_regularization']

  def _symmetric_l2_regularization(self):
    # Algorithmic requirement (for now) is to have minimal l2 of 1.0.
    return max(self._options['symmetric_l2_regularization'], 1.0)

  def _num_loss_partitions(self):
    # Number of partitions of the global objective.
    # TODO(andreasst): set num_loss_partitions automatically based on the
    # number of workers.
    return self._options.get('num_loss_partitions', 1)

  def _adaptive(self):
    # Perform adaptive sampling.
    return self._options.get('adaptive', True)

  def _num_table_shards(self):
    # Number of hash table shards.
    # Return 1 if not specified or if the value is None.
    # TODO(andreasst): set num_table_shards automatically based on the
    # number of parameter servers.
    num_shards = self._options.get('num_table_shards')
    return 1 if num_shards is None else num_shards

  # TODO(sibyl-Aix6ihai): Use optimizer interface to make use of slot creation
  # logic.
  def _create_slots(self):
    """Make unshrinked internal variables (slots)."""
    # Unshrinked variables have the updates before applying L1 regularization.
    # Each unshrinked slot variable is either a `Variable` or list of
    # `Variable`, depending on the value of its corresponding primary variable.
    # We avoid using `PartitionedVariable` for the unshrinked slots since we do
    # not need any of the extra information.
    self._slots = collections.defaultdict(list)
    for name in ['sparse_features_weights', 'dense_features_weights']:
      for var in self._variables[name]:
        # Our primary variable may be either a PartitionedVariable, or a list
        # of Variables (each representing a partition).
        if (isinstance(var, var_ops.PartitionedVariable) or
            isinstance(var, list)):
          var_list = []
          # pylint: disable=protected-access
          for v in var:
            with ops.colocate_with(v):
              # TODO(andreasst): remove SDCAOptimizer suffix once bug 30843109
              # is fixed.
              slot_var = var_ops.VariableV1(
                  initial_value=array_ops.zeros_like(v.initialized_value(),
                                                     dtypes.float32),
                  name=v.op.name + '_unshrinked/SDCAOptimizer')
              var_list.append(slot_var)
          self._slots['unshrinked_' + name].append(var_list)
          # pylint: enable=protected-access
        else:
          with ops.device(var.device):
            # TODO(andreasst): remove SDCAOptimizer suffix once bug 30843109
            # is fixed.
            self._slots['unshrinked_' + name].append(
                var_ops.VariableV1(
                    array_ops.zeros_like(var.initialized_value(),
                                         dtypes.float32),
                    name=var.op.name + '_unshrinked/SDCAOptimizer'))

  def _assertSpecified(self, items, check_in):
    for x in items:
      if check_in[x] is None:
        # Report the missing key by name; check_in[x] is None at this point,
        # so concatenating it into the message would raise a TypeError.
        raise ValueError(x + ' must be specified.')

  def _assertList(self, items, check_in):
    for x in items:
      if not isinstance(check_in[x], list):
        raise ValueError(x + ' must be a list.')

  def _var_to_list(self, var):
    """Wraps var in a list if it is not a list or PartitionedVariable."""
    if not isinstance(var, (list, var_ops.PartitionedVariable)):
      var = [var]
    return var

  def _l1_loss(self):
    """Computes the (un-normalized) l1 loss of the model."""
    with name_scope('sdca/l1_loss'):
      sums = []
      for name in ['sparse_features_weights', 'dense_features_weights']:
        for var in self._variables[name]:
          for v in self._var_to_list(var):
            weights = internal_convert_to_tensor(v)
            with ops.device(weights.device):
              sums.append(
                  math_ops.reduce_sum(
                      math_ops.abs(math_ops.cast(weights, dtypes.float64))))
      # SDCA L1 regularization cost is: l1 * sum(|weights|)
      return self._options['symmetric_l1_regularization'] * math_ops.add_n(
          sums)

  def _l2_loss(self, l2):
    """Computes the (un-normalized) l2 loss of the model."""
    with name_scope('sdca/l2_loss'):
      sums = []
      for name in ['sparse_features_weights', 'dense_features_weights']:
        for var in self._variables[name]:
          for v in self._var_to_list(var):
            weights = internal_convert_to_tensor(v)
            with ops.device(weights.device):
              sums.append(
                  math_ops.reduce_sum(
                      math_ops.square(
                          math_ops.cast(weights, dtypes.float64))))
      # SDCA L2 regularization cost is: l2 * sum(weights^2) / 2
      return l2 * math_ops.add_n(sums) / 2.0
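
  # Illustrative check of the two costs above (made-up numbers): for a single
  # weight vector [0.5, -1.0] with l1 = 0.1 and l2 = 2.0,
  #   _l1_loss() evaluates to 0.1 * (0.5 + 1.0)         = 0.15
  #   _l2_loss(2.0) evaluates to 2.0 * (0.25 + 1.0) / 2 = 1.25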

  def _convert_n_to_tensor(self, input_list, as_ref=False):
    """Converts input list to a set of tensors."""
    # input_list can be a list of Variables (that are implicitly partitioned),
    # in which case the underlying logic in internal_convert_to_tensor will
    # not concatenate the partitions together. This method takes care of the
    # concatenating (we only allow partitioning on the first axis).
    output_list = []
    for x in input_list:
      tensor_to_convert = x
      if isinstance(x, (list, var_ops.PartitionedVariable)):
        # We only allow for partitioning on the first axis.
        tensor_to_convert = array_ops.concat(x, axis=0)
      output_list.append(
          internal_convert_to_tensor(tensor_to_convert, as_ref=as_ref))
    return output_list

  def _get_first_dimension_size_statically(self, w, num_partitions):
    """Computes the static size of the first dimension for a sharded variable."""
    dim_0_size = w[0].get_shape()[0]
    for p in range(1, num_partitions):
      dim_0_size += w[p].get_shape()[0]
    return dim_0_size

  def _linear_predictions(self, examples):
    """Returns predictions of the form w*x."""
    with name_scope('sdca/prediction'):
      sparse_variables = self._convert_n_to_tensor(
          self._variables['sparse_features_weights'])
      result_sparse = 0.0
      for sfc, sv in zip(examples['sparse_features'], sparse_variables):
        # TODO(sibyl-Aix6ihai): following does not take care of missing
        # features.
        result_sparse += math_ops.segment_sum(
            math_ops.multiply(
                array_ops.gather(sv, sfc.feature_indices),
                sfc.feature_values), sfc.example_indices)
      dense_features = self._convert_n_to_tensor(examples['dense_features'])
      dense_variables = self._convert_n_to_tensor(
          self._variables['dense_features_weights'])

      result_dense = 0.0
      for i in range(len(dense_variables)):
        result_dense += math_ops.matmul(
            dense_features[i], array_ops.expand_dims(dense_variables[i], -1))

      # Reshaping to allow shape inference at graph construction time.
      return array_ops.reshape(result_dense, [-1]) + result_sparse

  def predictions(self, examples):
    """Add operations to compute predictions by the model.

    If logistic_loss is being used, predicted probabilities are returned.
    If poisson_loss is being used, predictions are exponentiated.
    Otherwise, (raw) linear predictions (w*x) are returned.

    Args:
      examples: Examples to compute predictions on.

    Returns:
      An Operation that computes the predictions for examples.

    Raises:
      ValueError: if examples are not well defined.
    """
    self._assertSpecified(
        ['example_weights', 'sparse_features', 'dense_features'], examples)
    self._assertList(['sparse_features', 'dense_features'], examples)

    result = self._linear_predictions(examples)
    if self._options['loss_type'] == 'logistic_loss':
      # Convert logits to probability for logistic loss predictions.
      with name_scope('sdca/logistic_prediction'):
        result = math_ops.sigmoid(result)
    elif self._options['loss_type'] == 'poisson_loss':
      # Exponentiate the prediction for poisson loss predictions.
      with name_scope('sdca/poisson_prediction'):
        result = math_ops.exp(result)
    return result

  def _get_partitioned_update_ops(self, v_num, num_partitions_by_var,
                                  p_assignments_by_var, gather_ids_by_var,
                                  weights, full_update, p_assignments,
                                  num_partitions):
    """Gets updates for partitioned variables."""
    num_partitions = num_partitions_by_var[v_num]
    p_assignments = p_assignments_by_var[v_num]
    gather_ids = gather_ids_by_var[v_num]
    updates = data_flow_ops.dynamic_partition(full_update, p_assignments,
                                              num_partitions)
    update_ops = []
    for p in range(num_partitions):
      with ops.colocate_with(weights[p]):
        result = state_ops.scatter_add(weights[p], gather_ids[p], updates[p])
        update_ops.append(result)
    return update_ops
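
  # Illustrative dynamic_partition example for the method above (made-up
  # values): with full_update = [10., 20., 30.], p_assignments = [0, 1, 0]
  # and num_partitions = 2, `updates` becomes [[10., 30.], [20.]], and each
  # slice is scatter-added into its own partition of the sharded variable.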

  def minimize(self, global_step=None, name=None):
    """Add operations to train a linear model by minimizing the loss function.

    Args:
      global_step: Optional `Variable` to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation.

    Returns:
      An Operation that updates the variables passed in the constructor.
    """
    # Technically, the op depends on a lot more than the variables,
    # but we'll keep the list short.
    with name_scope(name, 'sdca/minimize'):
      sparse_example_indices = []
      sparse_feature_indices = []
      sparse_features_values = []
      for sf in self._examples['sparse_features']:
        sparse_example_indices.append(sf.example_indices)
        sparse_feature_indices.append(sf.feature_indices)
        # If feature values are missing, sdca assumes a value of 1.0f.
        if sf.feature_values is not None:
          sparse_features_values.append(sf.feature_values)

      # pylint: disable=protected-access
      example_ids_hashed = gen_sdca_ops.sdca_fprint(
          internal_convert_to_tensor(self._examples['example_ids']))
      # pylint: enable=protected-access
      example_state_data = self._hashtable.lookup(example_ids_hashed)
      # Solver returns example_state_update, new delta sparse_feature_weights
      # and delta dense_feature_weights.

      sparse_weights = []
      sparse_indices = []
      # If we have partitioned variables, keep a few dictionaries of Tensors
      # around that we need for the assign_add after the op call to
      # gen_sdca_ops.sdca_optimizer(). These are keyed because we may have a
      # mix of partitioned and un-partitioned variables.
      num_partitions_by_var = {}
      p_assignments_by_var = {}
      gather_ids_by_var = {}
      for v_num, (w, i) in enumerate(
          zip(self._slots['unshrinked_sparse_features_weights'],
              sparse_feature_indices)):
        # Append the sparse_indices (in full-variable space).
        sparse_idx = math_ops.cast(
            array_ops.unique(math_ops.cast(i, dtypes.int32))[0],
            dtypes.int64)
        sparse_indices.append(sparse_idx)
        if isinstance(w, (list, var_ops.PartitionedVariable)):
          num_partitions = len(w)
          flat_ids = array_ops.reshape(sparse_idx, [-1])
          # We use div partitioning, which is easiest to support downstream.
          # Compute num_total_ids as the sum of dim-0 of w, then assign
          # to partitions based on a constant number of ids per partition.
          # Optimize if we already know the full shape statically.
          dim_0_size = self._get_first_dimension_size_statically(
              w, num_partitions)

          if tensor_shape.dimension_value(dim_0_size):
            num_total_ids = constant_op.constant(
                tensor_shape.dimension_value(dim_0_size), flat_ids.dtype)
          else:
            dim_0_sizes = []
            for p in range(num_partitions):
              if tensor_shape.dimension_value(w[p].shape[0]) is not None:
                dim_0_sizes.append(tensor_shape.dimension_value(w[p].shape[0]))
              else:
                with ops.colocate_with(w[p]):
                  dim_0_sizes.append(array_ops.shape(w[p])[0])
            num_total_ids = math_ops.reduce_sum(
                math_ops.cast(array_ops.stack(dim_0_sizes), flat_ids.dtype))
          ids_per_partition = num_total_ids // num_partitions
          extras = num_total_ids % num_partitions

          p_assignments = math_ops.maximum(
              flat_ids // (ids_per_partition + 1),
              (flat_ids - extras) // ids_per_partition)

          # Emulate a conditional using a boolean indicator tensor.
          new_ids = array_ops.where(p_assignments < extras,
                                    flat_ids % (ids_per_partition + 1),
                                    (flat_ids - extras) % ids_per_partition)

          # Cast partition assignments to int32 for use in dynamic_partition.
          # There really should not be more than 2^32 partitions.
          p_assignments = math_ops.cast(p_assignments, dtypes.int32)
          # Partition list of ids based on assignments into num_partitions
          # separate lists.
          gather_ids = data_flow_ops.dynamic_partition(new_ids, p_assignments,
                                                       num_partitions)
          # Add these into the dictionaries for use in the later update.
          num_partitions_by_var[v_num] = num_partitions
          p_assignments_by_var[v_num] = p_assignments
          gather_ids_by_var[v_num] = gather_ids

          # Gather the weights from each partition.
          partition_gathered_weights = []
          for p in range(num_partitions):
            with ops.colocate_with(w[p]):
              partition_gathered_weights.append(
                  array_ops.gather(w[p], gather_ids[p]))
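
          # Worked example of the div-partitioning arithmetic above (made-up
          # sizes): with num_total_ids = 10 and num_partitions = 3,
          # ids_per_partition = 3 and extras = 1, so partition 0 owns 4 ids
          # and partitions 1-2 own 3 each. For flat_id = 5:
          #   p_assignments = max(5 // 4, (5 - 1) // 3) = 1
          #   new_id        = (5 - 1) % 3 = 1
          # i.e. global id 5 maps to offset 1 within partition 1.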
          # Stitch the weights back together in the same order they were
          # before we dynamic_partitioned them.
          condition_indices = data_flow_ops.dynamic_partition(
              math_ops.range(array_ops.shape(new_ids)[0]), p_assignments,
              num_partitions)
          batch_gathered_weights = data_flow_ops.dynamic_stitch(
              condition_indices, partition_gathered_weights)
        else:
          w_as_tensor = internal_convert_to_tensor(w)
          with ops.device(w_as_tensor.device):
            batch_gathered_weights = array_ops.gather(w_as_tensor, sparse_idx)
        sparse_weights.append(batch_gathered_weights)

      # pylint: disable=protected-access
      if compat.forward_compatible(year=2018, month=10, day=30):
        esu, sfw, dfw = gen_sdca_ops.sdca_optimizer_v2(
            sparse_example_indices,
            sparse_feature_indices,
            sparse_features_values,
            self._convert_n_to_tensor(self._examples['dense_features']),
            internal_convert_to_tensor(self._examples['example_weights']),
            internal_convert_to_tensor(self._examples['example_labels']),
            sparse_indices,
            sparse_weights,
            self._convert_n_to_tensor(
                self._slots['unshrinked_dense_features_weights']),
            example_state_data,
            loss_type=self._options['loss_type'],
            l1=self._options['symmetric_l1_regularization'],
            l2=self._symmetric_l2_regularization(),
            num_loss_partitions=self._num_loss_partitions(),
            num_inner_iterations=1,
            adaptive=self._adaptive())
      else:
        esu, sfw, dfw = gen_sdca_ops.sdca_optimizer(
            sparse_example_indices,
            sparse_feature_indices,
            sparse_features_values,
            self._convert_n_to_tensor(self._examples['dense_features']),
            internal_convert_to_tensor(self._examples['example_weights']),
            internal_convert_to_tensor(self._examples['example_labels']),
            sparse_indices,
            sparse_weights,
            self._convert_n_to_tensor(
                self._slots['unshrinked_dense_features_weights']),
            example_state_data,
            loss_type=self._options['loss_type'],
            l1=self._options['symmetric_l1_regularization'],
            l2=self._symmetric_l2_regularization(),
            num_loss_partitions=self._num_loss_partitions(),
            num_inner_iterations=1,
            # The older op spells this attribute 'adaptative'; the spelling
            # is kept as-is for compatibility with existing graphs.
            adaptative=self._adaptive())
      # pylint: enable=protected-access

      with ops.control_dependencies([esu]):
        update_ops = [self._hashtable.insert(example_ids_hashed, esu)]
        # Update the weights before the proximal step.
        for v_num, (w, i, u) in enumerate(
            zip(self._slots['unshrinked_sparse_features_weights'],
                sparse_indices, sfw)):
          if isinstance(w, (var_ops.PartitionedVariable, list)):
            update_ops += self._get_partitioned_update_ops(
                v_num, num_partitions_by_var, p_assignments_by_var,
                gather_ids_by_var, w, u, p_assignments, num_partitions)
          else:
            update_ops.append(state_ops.scatter_add(w, i, u))
        for w, u in zip(self._slots['unshrinked_dense_features_weights'],
                        dfw):
          if isinstance(w, (var_ops.PartitionedVariable, list)):
            split_updates = array_ops.split(
                u, num_or_size_splits=[v.shape.as_list()[0] for v in w])
            for v, split_update in zip(w, split_updates):
              update_ops.append(state_ops.assign_add(v, split_update))
          else:
            update_ops.append(state_ops.assign_add(w, u))
        if not global_step:
          return control_flow_ops.group(*update_ops)
        with ops.control_dependencies(update_ops):
          return state_ops.assign_add(global_step, 1, name=name).op
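
  # A minimal driving loop for the two ops above (a sketch, not part of the
  # API; `examples`, `variables` and `options` are dicts built as in the
  # class docstring, and the session/initializer calls assume TF 1.x graph
  # mode):
  #
  #   lr = SdcaModel(examples, variables, options)
  #   train_op = lr.minimize()
  #   update_op = lr.update_weights(train_op)
  #   with tf.Session() as sess:
  #     sess.run(tf.global_variables_initializer())
  #     for _ in range(num_steps):
  #       sess.run(update_op)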

  def update_weights(self, train_op):
    """Updates the model weights.

    This function must be called on at least one worker after `minimize`.
    In distributed training this call can be omitted on non-chief workers to
    speed up training.

    Args:
      train_op: The operation returned by the `minimize` call.

    Returns:
      An Operation that updates the model weights.
    """
    with ops.control_dependencies([train_op]):
      update_ops = []
      # Copy over unshrinked weights to user provided variables.
      for name in ['sparse_features_weights', 'dense_features_weights']:
        for var, slot_var in zip(self._variables[name],
                                 self._slots['unshrinked_' + name]):
          for v, sv in zip(self._var_to_list(var),
                           self._var_to_list(slot_var)):
            update_ops.append(v.assign(sv))

      # Apply proximal step.
      with ops.control_dependencies(update_ops):
        update_ops = []
        for name in ['sparse_features_weights', 'dense_features_weights']:
          for var in self._variables[name]:
            for v in self._var_to_list(var):
              with ops.device(v.device):
                # pylint: disable=protected-access
                update_ops.append(
                    gen_sdca_ops.sdca_shrink_l1(
                        self._convert_n_to_tensor([v], as_ref=True),
                        l1=self._symmetric_l1_regularization(),
                        l2=self._symmetric_l2_regularization()))
        return control_flow_ops.group(*update_ops)

  def approximate_duality_gap(self):
    """Add operations to compute the approximate duality gap.

    Returns:
      An Operation that computes the approximate duality gap over all
      examples.
    """
    with name_scope('sdca/approximate_duality_gap'):
      _, values_list = self._hashtable.export_sharded()
      shard_sums = []
      for values in values_list:
        with ops.device(values.device):
          # For large tables to_double() below allocates a large temporary
          # tensor that is freed once the sum operation completes. To reduce
          # peak memory usage in cases where we have multiple large tables on
          # a single device, we serialize these operations.
          # Note that we need double precision to get accurate results.
          with ops.control_dependencies(shard_sums):
            shard_sums.append(
                math_ops.reduce_sum(math_ops.to_double(values), 0))
      summed_values = math_ops.add_n(shard_sums)

      primal_loss = summed_values[1]
      dual_loss = summed_values[2]
      example_weights = summed_values[3]
      # Note: we return NaN if there are no weights or all weights are 0,
      # e.g. if no examples have been processed.
      return (primal_loss + dual_loss + self._l1_loss() +
              (2.0 * self._l2_loss(self._symmetric_l2_regularization()))
             ) / example_weights
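
  # Convergence can be monitored with the gap above, e.g. (a sketch; the
  # tolerance 1e-2 is an illustrative choice, not a recommendation from this
  # module):
  #
  #   gap = lr.approximate_duality_gap()
  #   while sess.run(gap) > 1e-2:
  #     sess.run(update_op)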

  def unregularized_loss(self, examples):
    """Add operations to compute the loss (without the regularization loss).

    Args:
      examples: Examples to compute unregularized loss on.

    Returns:
      An Operation that computes mean (unregularized) loss for given set of
      examples.

    Raises:
      ValueError: if examples are not well defined.
    """
    self._assertSpecified([
        'example_labels', 'example_weights', 'sparse_features',
        'dense_features'
    ], examples)
    self._assertList(['sparse_features', 'dense_features'], examples)
    with name_scope('sdca/unregularized_loss'):
      predictions = math_ops.cast(
          self._linear_predictions(examples), dtypes.float64)
      labels = math_ops.cast(
          internal_convert_to_tensor(examples['example_labels']),
          dtypes.float64)
      weights = math_ops.cast(
          internal_convert_to_tensor(examples['example_weights']),
          dtypes.float64)

      if self._options['loss_type'] == 'logistic_loss':
        return math_ops.reduce_sum(
            math_ops.multiply(
                sigmoid_cross_entropy_with_logits(
                    labels=labels, logits=predictions),
                weights)) / math_ops.reduce_sum(weights)

      if self._options['loss_type'] == 'poisson_loss':
        return math_ops.reduce_sum(
            math_ops.multiply(
                log_poisson_loss(targets=labels, log_input=predictions),
                weights)) / math_ops.reduce_sum(weights)

      if self._options['loss_type'] in ['hinge_loss', 'smooth_hinge_loss']:
        # hinge_loss = max{0, 1 - y_i w*x} where y_i \in {-1, 1}. So, we need
        # to first convert 0/1 labels into -1/1 labels.
        all_ones = array_ops.ones_like(predictions)
        adjusted_labels = math_ops.subtract(2 * labels, all_ones)
        # Tensor that contains (unweighted) error (hinge loss) per example.
        error = nn_ops.relu(
            math_ops.subtract(all_ones,
                              math_ops.multiply(adjusted_labels,
                                                predictions)))
        weighted_error = math_ops.multiply(error, weights)
        return math_ops.reduce_sum(weighted_error) / math_ops.reduce_sum(
            weights)

      # squared loss
      err = math_ops.subtract(labels, predictions)

      weighted_squared_err = math_ops.multiply(math_ops.square(err), weights)
      # SDCA squared loss function is sum(err^2) / (2*sum(weights))
      return (math_ops.reduce_sum(weighted_squared_err) /
              (2.0 * math_ops.reduce_sum(weights)))

  def regularized_loss(self, examples):
    """Add operations to compute the loss with regularization loss included.

    Args:
      examples: Examples to compute loss on.

    Returns:
      An Operation that computes mean (regularized) loss for given set of
      examples.

    Raises:
      ValueError: if examples are not well defined.
    """
    self._assertSpecified([
        'example_labels', 'example_weights', 'sparse_features',
        'dense_features'
    ], examples)
    self._assertList(['sparse_features', 'dense_features'], examples)
    with name_scope('sdca/regularized_loss'):
      weights = internal_convert_to_tensor(examples['example_weights'])
      return ((
          self._l1_loss() +
          # Note that here we are using the raw regularization
          # (as specified by the user) and *not*
          # self._symmetric_l2_regularization().
          self._l2_loss(self._options['symmetric_l2_regularization'])) /
              math_ops.reduce_sum(math_ops.cast(weights, dtypes.float64)) +
              self.unregularized_loss(examples))
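
# Worked example of the hinge-loss label handling in unregularized_loss
# (made-up values): a 0/1 label y = 0 is mapped to 2*0 - 1 = -1, and with a
# linear prediction w*x = 0.3 the per-example hinge loss is
#   max(0, 1 - (-1) * 0.3) = 1.3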
class SdcaModel(object): """Stochastic dual coordinate ascent solver for linear models. This class currently only supports a single machine (multi-threaded) implementation. We expect the weights and duals to fit in a single machine. Loss functions supported: * Binary logistic loss * Squared loss * Hinge loss * Smooth hinge loss This class defines an optimizer API to train a linear model. ### Usage ```python # Create a solver with the desired parameters. lr = tf.contrib.linear_optimizer.SdcaModel(examples, variables, options) min_op = lr.minimize() opt_op = lr.update_weights(min_op) predictions = lr.predictions(examples) # Primal loss + L1 loss + L2 loss. regularized_loss = lr.regularized_loss(examples) # Primal loss only unregularized_loss = lr.unregularized_loss(examples) examples: { sparse_features: list of SparseFeatureColumn. dense_features: list of dense tensors of type float32. example_labels: a tensor of type float32 and shape [Num examples] example_weights: a tensor of type float32 and shape [Num examples] example_ids: a tensor of type string and shape [Num examples] } variables: { sparse_features_weights: list of tensors of shape [vocab size] dense_features_weights: list of tensors of shape [dense_feature_dimension] } options: { symmetric_l1_regularization: 0.0 symmetric_l2_regularization: 1.0 loss_type: "logistic_loss" num_loss_partitions: 1 (Optional, with default value of 1. Number of partitions of the global loss function, 1 means single machine solver, and >1 when we have more than one optimizer working concurrently.) num_table_shards: 1 (Optional, with default value of 1. Number of shards of the internal state table, typically set to match the number of parameter servers for large data sets. } ``` In the training program you will just have to run the returned Op from minimize(). ```python # Execute opt_op and train for num_steps. for _ in range(num_steps): opt_op.run() # You can also check for convergence by calling lr.approximate_duality_gap() ``` """ def __init__(self, examples, variables, options): """Create a new sdca optimizer.""" if not examples or not variables or not options: raise ValueError( 'examples, variables and options must all be specified.') supported_losses = ('logistic_loss', 'squared_loss', 'hinge_loss', 'smooth_hinge_loss') if options['loss_type'] not in supported_losses: raise ValueError('Unsupported loss_type: ', options['loss_type']) self._assertSpecified([ 'example_labels', 'example_weights', 'example_ids', 'sparse_features', 'dense_features' ], examples) self._assertList(['sparse_features', 'dense_features'], examples) self._assertSpecified( ['sparse_features_weights', 'dense_features_weights'], variables) self._assertList(['sparse_features_weights', 'dense_features_weights'], variables) self._assertSpecified([ 'loss_type', 'symmetric_l2_regularization', 'symmetric_l1_regularization' ], options) for name in [ 'symmetric_l1_regularization', 'symmetric_l2_regularization' ]: value = options[name] if value < 0.0: raise ValueError('%s should be non-negative. Found (%f)' % (name, value)) self._examples = examples self._variables = variables self._options = options self._create_slots() self._hashtable = ShardedMutableDenseHashTable( key_dtype=dtypes.int64, value_dtype=dtypes.float32, num_shards=self._num_table_shards(), default_value=[0.0, 0.0, 0.0, 0.0], # SdcaFprint never returns 0 or 1 for the low64 bits, so this a safe # empty_key (that will never collide with actual payloads). 
empty_key=[0, 0]) summary.scalar('approximate_duality_gap', self.approximate_duality_gap()) summary.scalar('examples_seen', self._hashtable.size()) def _symmetric_l1_regularization(self): return self._options['symmetric_l1_regularization'] def _symmetric_l2_regularization(self): # Algorithmic requirement (for now) is to have minimal l2 of 1.0. return max(self._options['symmetric_l2_regularization'], 1.0) def _num_loss_partitions(self): # Number of partitions of the global objective. # TODO (andreasst): set num_loss_partitions automatically based on the number id:749 gh:750 # of workers return self._options.get('num_loss_partitions', 1) def _num_table_shards(self): # Number of hash table shards. # Return 1 if not specified or if the value is 'None' # TODO (andreasst): set num_table_shards automatically based on the number id:775 gh:776 # of parameter servers num_shards = self._options.get('num_table_shards') return 1 if num_shards is None else num_shards # TODO (sibyl-Aix6ihai): Use optimizer interface to make use of slot creation logic. id:1308 gh:1309 def _create_slots(self): # Make internal variables which have the updates before applying L1 # regularization. self._slots = collections.defaultdict(list) for name in ['sparse_features_weights', 'dense_features_weights']: for var in self._variables[name]: with ops.device(var.device): # TODO (andreasst): remove SDCAOptimizer suffix once bug 30843109 is id:1142 gh:1143 # fixed self._slots['unshrinked_' + name].append( var_ops.Variable( array_ops.zeros_like(var.initialized_value(), dtypes.float32), name=var.op.name + '_unshrinked/SDCAOptimizer')) def _assertSpecified(self, items, check_in): for x in items: if check_in[x] is None: raise ValueError(check_in[x] + ' must be specified.') def _assertList(self, items, check_in): for x in items: if not isinstance(check_in[x], list): raise ValueError(x + ' must be a list.') def _l1_loss(self): """Computes the (un-normalized) l1 loss of the model.""" with name_scope('sdca/l1_loss'): sums = [] for name in ['sparse_features_weights', 'dense_features_weights']: for weights in self._convert_n_to_tensor( self._variables[name]): with ops.device(weights.device): sums.append( math_ops.reduce_sum( math_ops.abs( math_ops.cast(weights, dtypes.float64)))) sum = math_ops.add_n(sums) # SDCA L1 regularization cost is: l1 * sum(|weights|) return self._options['symmetric_l1_regularization'] * sum def _l2_loss(self, l2): """Computes the (un-normalized) l2 loss of the model.""" with name_scope('sdca/l2_loss'): sums = [] for name in ['sparse_features_weights', 'dense_features_weights']: for weights in self._convert_n_to_tensor( self._variables[name]): with ops.device(weights.device): sums.append( math_ops.reduce_sum( math_ops.square( math_ops.cast(weights, dtypes.float64)))) sum = math_ops.add_n(sums) # SDCA L2 regularization cost is: l2 * sum(weights^2) / 2 return l2 * sum / 2.0 def _convert_n_to_tensor(self, input_list, as_ref=False): """Converts input list to a set of tensors.""" return [ internal_convert_to_tensor(x, as_ref=as_ref) for x in input_list ] def _linear_predictions(self, examples): """Returns predictions of the form w*x.""" with name_scope('sdca/prediction'): sparse_variables = self._convert_n_to_tensor( self._variables['sparse_features_weights']) result_sparse = 0.0 for sfc, sv in zip(examples['sparse_features'], sparse_variables): # TODO (sibyl-Aix6ihai): following does not take care of missing features. 
id:810 gh:811 result_sparse += math_ops.segment_sum( math_ops.multiply( array_ops.gather(sv, sfc.feature_indices), sfc.feature_values), sfc.example_indices) dense_features = self._convert_n_to_tensor( examples['dense_features']) dense_variables = self._convert_n_to_tensor( self._variables['dense_features_weights']) result_dense = 0.0 for i in range(len(dense_variables)): result_dense += math_ops.matmul( dense_features[i], array_ops.expand_dims(dense_variables[i], -1)) # Reshaping to allow shape inference at graph construction time. return array_ops.reshape(result_dense, [-1]) + result_sparse def predictions(self, examples): """Add operations to compute predictions by the model. If logistic_loss is being used, predicted probabilities are returned. Otherwise, (raw) linear predictions (w*x) are returned. Args: examples: Examples to compute predictions on. Returns: An Operation that computes the predictions for examples. Raises: ValueError: if examples are not well defined. """ self._assertSpecified( ['example_weights', 'sparse_features', 'dense_features'], examples) self._assertList(['sparse_features', 'dense_features'], examples) result = self._linear_predictions(examples) if self._options['loss_type'] == 'logistic_loss': # Convert logits to probability for logistic loss predictions. with name_scope('sdca/logistic_prediction'): result = math_ops.sigmoid(result) return result def minimize(self, global_step=None, name=None): """Add operations to train a linear model by minimizing the loss function. Args: global_step: Optional `Variable` to increment by one after the variables have been updated. name: Optional name for the returned operation. Returns: An Operation that updates the variables passed in the constructor. """ # Technically, the op depends on a lot more than the variables, # but we'll keep the list short. with name_scope(name, 'sdca/minimize'): sparse_example_indices = [] sparse_feature_indices = [] sparse_features_values = [] for sf in self._examples['sparse_features']: sparse_example_indices.append(sf.example_indices) sparse_feature_indices.append(sf.feature_indices) # If feature values are missing, sdca assumes a value of 1.0f. if sf.feature_values is not None: sparse_features_values.append(sf.feature_values) # pylint: disable=protected-access example_ids_hashed = gen_sdca_ops.sdca_fprint( internal_convert_to_tensor(self._examples['example_ids'])) # pylint: enable=protected-access example_state_data = self._hashtable.lookup(example_ids_hashed) # Solver returns example_state_update, new delta sparse_feature_weights # and delta dense_feature_weights. weights_tensor = self._convert_n_to_tensor( self._slots['unshrinked_sparse_features_weights']) sparse_weights = [] sparse_indices = [] for w, i in zip(weights_tensor, sparse_feature_indices): # Find the feature ids to lookup in the variables. 
with ops.device(w.device): sparse_indices.append( math_ops.cast( array_ops.unique(math_ops.cast(i, dtypes.int32))[0], dtypes.int64)) sparse_weights.append( array_ops.gather(w, sparse_indices[-1])) # pylint: disable=protected-access esu, sfw, dfw = gen_sdca_ops.sdca_optimizer( sparse_example_indices, sparse_feature_indices, sparse_features_values, self._convert_n_to_tensor(self._examples['dense_features']), internal_convert_to_tensor(self._examples['example_weights']), internal_convert_to_tensor(self._examples['example_labels']), sparse_indices, sparse_weights, self._convert_n_to_tensor( self._slots['unshrinked_dense_features_weights']), example_state_data, loss_type=self._options['loss_type'], l1=self._options['symmetric_l1_regularization'], l2=self._symmetric_l2_regularization(), num_loss_partitions=self._num_loss_partitions(), num_inner_iterations=1) # pylint: enable=protected-access with ops.control_dependencies([esu]): update_ops = [self._hashtable.insert(example_ids_hashed, esu)] # Update the weights before the proximal step. for w, i, u in zip( self._slots['unshrinked_sparse_features_weights'], sparse_indices, sfw): update_ops.append(state_ops.scatter_add(w, i, u)) for w, u in zip( self._slots['unshrinked_dense_features_weights'], dfw): update_ops.append(w.assign_add(u)) if not global_step: return control_flow_ops.group(*update_ops) with ops.control_dependencies(update_ops): return state_ops.assign_add(global_step, 1, name=name).op def update_weights(self, train_op): """Updates the model weights. This function must be called on at least one worker after `minimize`. In distributed training this call can be omitted on non-chief workers to speed up training. Args: train_op: The operation returned by the `minimize` call. Returns: An Operation that updates the model weights. """ with ops.control_dependencies([train_op]): update_ops = [] # Copy over unshrinked weights to user provided variables. for name in ['sparse_features_weights', 'dense_features_weights']: for var, slot_var in zip(self._variables[name], self._slots['unshrinked_' + name]): update_ops.append(var.assign(slot_var)) # Apply proximal step. with ops.control_dependencies(update_ops): update_ops = [] for name in ['sparse_features_weights', 'dense_features_weights']: for var in self._variables[name]: with ops.device(var.device): # pylint: disable=protected-access update_ops.append( gen_sdca_ops.sdca_shrink_l1( self._convert_n_to_tensor([var], as_ref=True), l1=self._symmetric_l1_regularization(), l2=self._symmetric_l2_regularization())) return control_flow_ops.group(*update_ops) def approximate_duality_gap(self): """Add operations to compute the approximate duality gap. Returns: An Operation that computes the approximate duality gap over all examples. """ with name_scope('sdca/approximate_duality_gap'): _, values_list = self._hashtable.export_sharded() shard_sums = [] for values in values_list: with ops.device(values.device): # For large tables to_double() below allocates a large temporary # tensor that is freed once the sum operation completes. To reduce # peak memory usage in cases where we have multiple large tables on a # single device, we serialize these operations. # Note that we need double precision to get accurate results. 
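          # Each hash table entry holds four floats of per-example state;
          # after summing across examples, indices 1, 2 and 3 are read below
          # as the accumulated primal loss, dual loss and example weight.
          # Index 0 carries other per-example solver state not needed here.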
          with ops.control_dependencies(shard_sums):
            shard_sums.append(
                math_ops.reduce_sum(math_ops.to_double(values), 0))
      summed_values = math_ops.add_n(shard_sums)

      primal_loss = summed_values[1]
      dual_loss = summed_values[2]
      example_weights = summed_values[3]
      # Note: we return NaN if there are no weights or all weights are 0,
      # e.g. if no examples have been processed.
      return (primal_loss + dual_loss + self._l1_loss() +
              (2.0 * self._l2_loss(self._symmetric_l2_regularization()))
             ) / example_weights

  def unregularized_loss(self, examples):
    """Add operations to compute the loss (without the regularization loss).

    Args:
      examples: Examples to compute unregularized loss on.

    Returns:
      An Operation that computes the mean (unregularized) loss for the given
      set of examples.

    Raises:
      ValueError: if examples are not well defined.
    """
    self._assertSpecified([
        'example_labels', 'example_weights', 'sparse_features',
        'dense_features'
    ], examples)
    self._assertList(['sparse_features', 'dense_features'], examples)
    with name_scope('sdca/unregularized_loss'):
      predictions = math_ops.cast(
          self._linear_predictions(examples), dtypes.float64)
      labels = math_ops.cast(
          internal_convert_to_tensor(examples['example_labels']),
          dtypes.float64)
      weights = math_ops.cast(
          internal_convert_to_tensor(examples['example_weights']),
          dtypes.float64)

      if self._options['loss_type'] == 'logistic_loss':
        return math_ops.reduce_sum(math_ops.multiply(
            sigmoid_cross_entropy_with_logits(labels=labels,
                                              logits=predictions),
            weights)) / math_ops.reduce_sum(weights)

      if self._options['loss_type'] == 'poisson_loss':
        return math_ops.reduce_sum(math_ops.multiply(
            log_poisson_loss(targets=labels, log_input=predictions),
            weights)) / math_ops.reduce_sum(weights)

      if self._options['loss_type'] in ['hinge_loss', 'smooth_hinge_loss']:
        # hinge_loss = max{0, 1 - y_i w*x} where y_i \in {-1, 1}. So, we need
        # to first convert 0/1 labels into -1/1 labels.
        all_ones = array_ops.ones_like(predictions)
        adjusted_labels = math_ops.subtract(2 * labels, all_ones)
        # Tensor that contains the (unweighted) error (hinge loss) per
        # example.
        error = nn_ops.relu(
            math_ops.subtract(all_ones,
                              math_ops.multiply(adjusted_labels,
                                                predictions)))
        weighted_error = math_ops.multiply(error, weights)
        return math_ops.reduce_sum(weighted_error) / math_ops.reduce_sum(
            weights)

      # squared loss
      err = math_ops.subtract(labels, predictions)

      weighted_squared_err = math_ops.multiply(math_ops.square(err), weights)
      # SDCA squared loss function is sum(err^2) / (2*sum(weights))
      return (math_ops.reduce_sum(weighted_squared_err) /
              (2.0 * math_ops.reduce_sum(weights)))

  def regularized_loss(self, examples):
    """Add operations to compute the loss with regularization loss included.

    Args:
      examples: Examples to compute loss on.

    Returns:
      An Operation that computes the mean (regularized) loss for the given
      set of examples.

    Raises:
      ValueError: if examples are not well defined.
    """
    self._assertSpecified([
        'example_labels', 'example_weights', 'sparse_features',
        'dense_features'
    ], examples)
    self._assertList(['sparse_features', 'dense_features'], examples)
    with name_scope('sdca/regularized_loss'):
      weights = internal_convert_to_tensor(examples['example_weights'])
      return ((
          self._l1_loss() +
          # Note that here we are using the raw regularization
          # (as specified by the user) and *not*
          # self._symmetric_l2_regularization().
          self._l2_loss(self._options['symmetric_l2_regularization'])) /
              math_ops.reduce_sum(math_ops.cast(weights, dtypes.float64)) +
              self.unregularized_loss(examples))
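# Illustrative sketch only (not part of the library): one way to drive
# SdcaModel until the approximate duality gap is small. `examples`,
# `variables` and `options` are assumed to be dicts built as described in
# the class docstring, with `tf` an assumed `import tensorflow as tf`
# running in graph mode. The gap is NaN until at least one example has been
# processed, so a step is always taken before it is checked.
#
#   model = SdcaModel(examples, variables, options)
#   train_op = model.minimize()
#   update_op = model.update_weights(train_op)
#   gap = model.approximate_duality_gap()
#   with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     for _ in range(1000):
#       sess.run(train_op)
#       if sess.run(gap) < 1e-2:
#         break
#     sess.run(update_op)  # copy out weights and apply the L1 proximal step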