def call(self, inputs):
    """Compute the layer output for `inputs`.

    Multiplies `inputs` by `self.kernel` along the last input axis, then
    optionally adds `self.bias` and applies `self.activation`.

    Args:
        inputs: Input tensor. Its static shape is read through
            `get_shape().as_list()`.

    Returns:
        The projected tensor, after the optional bias and activation.
    """
    input_dims = inputs.get_shape().as_list()
    result_dims = input_dims[:-1] + [self.units]

    if len(result_dims) <= 2:
        projected = standard_ops.matmul(inputs, self.kernel)
    else:
        # More than two axes: contract the last input axis against the first
        # kernel axis so the leading axes are carried through unchanged.
        last_axis = len(input_dims) - 1
        projected = standard_ops.tensordot(
            inputs, self.kernel, [[last_axis], [0]])
        # Re-attach the static shape: the input's leading dims plus `units`.
        projected.set_shape(result_dims)

    if self.use_bias:
        projected = nn.bias_add(projected, self.bias)

    if self.activation is None:
        return projected
    return self.activation(projected)  # pylint: disable=not-callable
def call(self, inputs):
    """Compute the layer output for `inputs`.

    The inputs are first converted to a tensor of dtype `self.dtype`, then
    multiplied by `self.kernel` along their last axis. `self.bias` is added
    when `self.use_bias` is set, and `self.activation` is applied when it is
    not None.

    Args:
        inputs: Value convertible to a tensor.

    Returns:
        The projected tensor, after the optional bias and activation.
    """
    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
    input_dims = inputs.get_shape().as_list()
    num_axes = len(input_dims)

    if num_axes <= 2:
        projected = gen_math_ops.mat_mul(inputs, self.kernel)
    else:
        # More than two axes: contract the last input axis against the first
        # kernel axis so the leading axes are carried through unchanged.
        projected = standard_ops.tensordot(
            inputs, self.kernel, [[num_axes - 1], [0]])
        # The static shape is only re-attached when not executing eagerly.
        if not context.executing_eagerly():
            projected.set_shape(input_dims[:-1] + [self.units])

    if self.use_bias:
        projected = nn.bias_add(projected, self.bias)

    if self.activation is None:
        return projected
    return self.activation(projected)  # pylint: disable=not-callable
def call(self, inputs):
    """Compute the layer output for `inputs`.

    Inputs of rank greater than 2 are multiplied by `self.kernel` with
    `tensordot` over their last axis; everything else goes through a plain
    matrix multiplication. `self.bias` is added when `self.use_bias` is set,
    and `self.activation` is applied when it is not None.

    Args:
        inputs: Value convertible to a tensor.

    Returns:
        The projected tensor, after the optional bias and activation.
    """
    inputs = ops.convert_to_tensor(inputs)
    rank = common_shapes.rank(inputs)

    # `rank` is expected to be None when the static rank is unknown. Ordering
    # None against an int raises TypeError on Python 3, so such inputs are
    # routed explicitly to the matmul path.
    if rank is not None and rank > 2:
        # Broadcasting is required for the inputs.
        outputs = standard_ops.tensordot(
            inputs, self.kernel, [[rank - 1], [0]])
        # Re-attach the static shape (leading input dims plus `units`); this
        # is only done when not executing eagerly.
        if not context.executing_eagerly():
            shape = inputs.shape.as_list()
            outputs.set_shape(shape[:-1] + [self.units])
        # NOTE(review): unlike the matmul path below, no cast to `self.dtype`
        # happens here -- confirm this asymmetry is intended.
    else:
        # Cast the inputs to `self.dtype` (the variable dtype) unless the
        # mixed precision policy casts variables, in which case the variable
        # is expected to be cast to the inputs' dtype instead.
        if not self._mixed_precision_policy.should_cast_variables:
            inputs = math_ops.cast(inputs, self.dtype)
        outputs = gen_math_ops.mat_mul(inputs, self.kernel)

    if self.use_bias:
        outputs = nn.bias_add(outputs, self.bias)
    if self.activation is not None:
        return self.activation(outputs)  # pylint: disable=not-callable
    return outputs
def _minimize_constrained(self,
                          minimization_problem,
                          global_step=None,
                          var_list=None,
                          gate_gradients=train_optimizer.Optimizer.GATE_OP,
                          aggregation_method=None,
                          colocate_gradients_with_ops=False,
                          name=None,
                          grad_loss=None):
    """Builds the train op for the constrained minimization problem.

    Model parameters are updated with `self.optimizer`. The internal state
    (the constraint/objective weight matrix, the analogue of Lagrange
    multipliers) is updated with `self.constraint_optimizer` when one is set,
    and with `self.optimizer` otherwise. Whether the matrix updates are
    additive or multiplicative depends on the derived class.

    Args:
        minimization_problem: ConstrainedMinimizationProblem, the problem to
            optimize.
        global_step: as in `tf.train.Optimizer`'s `minimize` method.
        var_list: as in `tf.train.Optimizer`'s `minimize` method.
        gate_gradients: as in `tf.train.Optimizer`'s `minimize` method.
        aggregation_method: as in `tf.train.Optimizer`'s `minimize` method.
        colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize`
            method.
        name: as in `tf.train.Optimizer`'s `minimize` method.
        grad_loss: as in `tf.train.Optimizer`'s `minimize` method.

    Returns:
        `Operation`, the train_op.

    Raises:
        ValueError: If the minimization_problem tensors have different dtypes.
    """
    problem = minimization_problem
    objective = problem.objective
    constraints = problem.constraints
    proxy_constraints = problem.proxy_constraints
    if proxy_constraints is None:
        proxy_constraints = constraints

    # The objective, the constraints and the proxy constraints must all share
    # one base dtype.
    for tensor in (constraints, proxy_constraints):
        if objective.dtype.base_dtype != tensor.dtype.base_dtype:
            raise ValueError("objective, constraints and proxy_constraints "
                             "must have the same dtype")

    # Flatten both constraint tensors to 1d.
    num_constraints = problem.num_constraints
    flat_shape = (num_constraints,)
    constraints = standard_ops.reshape(constraints, shape=flat_shape)
    proxy_constraints = standard_ops.reshape(
        proxy_constraints, shape=flat_shape)

    def _initial_value():
        return self._initial_state(num_constraints)

    # The initial value is passed as a callable so that, if this method is
    # invoked inside a tf.control_dependencies() block, those dependencies
    # are not applied to the initializer.
    state = standard_ops.Variable(
        _initial_value,
        trainable=False,
        name="swap_regret_optimizer_state")

    leading_zero = standard_ops.zeros((1,), dtype=constraints.dtype)
    zero_and_constraints = standard_ops.concat(
        (leading_zero, constraints), axis=0)
    objective_entry = standard_ops.expand_dims(objective, 0)
    objective_and_proxies = standard_ops.concat(
        (objective_entry, proxy_constraints), axis=0)

    distribution = self._distribution(state)
    loss = standard_ops.tensordot(
        standard_ops.cast(distribution, objective_and_proxies.dtype),
        objective_and_proxies, 1)

    # Outer product of [0, constraints] (as a column) with the distribution
    # (as a row).
    column = standard_ops.expand_dims(
        standard_ops.cast(zero_and_constraints, distribution.dtype), 1)
    row = standard_ops.expand_dims(distribution, 0)
    matrix_gradient = standard_ops.matmul(column, row)

    # Both configurations start from the same model gradients and the same
    # (gradient, variable) pair for the internal state.
    gradient_options = dict(
        var_list=var_list,
        gate_gradients=gate_gradients,
        aggregation_method=aggregation_method,
        colocate_gradients_with_ops=colocate_gradients_with_ops,
        grad_loss=grad_loss)
    grads_and_vars = self.optimizer.compute_gradients(
        loss, **gradient_options)
    state_grad_and_var = self._constraint_grad_and_var(state, matrix_gradient)

    if self.constraint_optimizer is None:
        # No separate constraint optimizer: `self.optimizer` updates the
        # model parameters and the internal state in a single step.
        grads_and_vars.append(state_grad_and_var)
        update_ops = [
            self.optimizer.apply_gradients(grads_and_vars, name="update")
        ]
    else:
        # Separate constraint optimizer: `self.optimizer` updates the model
        # parameters and `self.constraint_optimizer` the internal state.
        # Both updates wait for every gradient to be computed.
        matrix_grads_and_vars = [state_grad_and_var]
        gradients = [
            grad for grad, _ in grads_and_vars + matrix_grads_and_vars
            if grad is not None
        ]
        with ops.control_dependencies(gradients):
            model_update = self.optimizer.apply_gradients(
                grads_and_vars, name="update")
            state_update = self.constraint_optimizer.apply_gradients(
                matrix_grads_and_vars, name="optimizer_state_update")
        update_ops = [model_update, state_update]

    with ops.control_dependencies(update_ops):
        if global_step is None:
            # Without a global step, projecting the state is all that is left.
            return self._projection_op(state, name=name)

        # With a global step, it is incremented in addition to projecting.
        projection_op = self._projection_op(state, name="project")
        with ops.colocate_with(global_step):
            global_step_op = state_ops.assign_add(
                global_step, 1, name="global_step_increment")
        return control_flow_ops.group(
            projection_op, global_step_op, name=name)
def _matmul(self, inputs, kernel):
    """Multiply `inputs` by `kernel`, contracting the last axis of `inputs`.

    Args:
        inputs: Tensor whose static rank is read from `inputs.shape.ndims`.
        kernel: Tensor multiplied against `inputs`.

    Returns:
        `matmul(inputs, kernel)` when the static rank of `inputs` is at most
        2 or unknown; otherwise the `tensordot` of the last axis of `inputs`
        with the first axis of `kernel`.
    """
    ndims = inputs.shape.ndims
    # `ndims` is expected to be None when the static rank is unknown.
    # Ordering None against an int raises TypeError on Python 3, so an
    # unknown rank is routed explicitly to the plain matmul.
    if ndims is None or ndims <= 2:
        return standard_ops.matmul(inputs, kernel)
    # To handle broadcasting over leading axes, we must use `tensordot`.
    return standard_ops.tensordot(inputs, kernel, axes=[[-1], [0]])
def _minimize_constrained(self,
                          minimization_problem,
                          global_step=None,
                          var_list=None,
                          gate_gradients=train_optimizer.Optimizer.GATE_OP,
                          aggregation_method=None,
                          colocate_gradients_with_ops=False,
                          name=None,
                          grad_loss=None):
    """Builds the train op for the constrained minimization problem.

    Model parameters are updated with `self.optimizer`. The internal state
    (the constraint/objective weight matrix, the analogue of Lagrange
    multipliers) is updated with `self.constraint_optimizer` when one is set,
    and with `self.optimizer` otherwise. Whether the matrix updates are
    additive or multiplicative depends on the derived class.

    Args:
        minimization_problem: ConstrainedMinimizationProblem, the problem to
            optimize.
        global_step: as in `tf.train.Optimizer`'s `minimize` method.
        var_list: as in `tf.train.Optimizer`'s `minimize` method.
        gate_gradients: as in `tf.train.Optimizer`'s `minimize` method.
        aggregation_method: as in `tf.train.Optimizer`'s `minimize` method.
        colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize`
            method.
        name: as in `tf.train.Optimizer`'s `minimize` method.
        grad_loss: as in `tf.train.Optimizer`'s `minimize` method.

    Returns:
        `Operation`, the train_op.

    Raises:
        ValueError: If the minimization_problem tensors have different dtypes.
    """
    problem = minimization_problem
    objective = problem.objective
    constraints = problem.constraints
    proxy_constraints = problem.proxy_constraints
    if proxy_constraints is None:
        proxy_constraints = constraints

    # The objective, the constraints and the proxy constraints must all share
    # one base dtype.
    for tensor in (constraints, proxy_constraints):
        if objective.dtype.base_dtype != tensor.dtype.base_dtype:
            raise ValueError("objective, constraints and proxy_constraints "
                             "must have the same dtype")

    # Flatten both constraint tensors to 1d.
    num_constraints = problem.num_constraints
    flat_shape = (num_constraints,)
    constraints = standard_ops.reshape(constraints, shape=flat_shape)
    proxy_constraints = standard_ops.reshape(
        proxy_constraints, shape=flat_shape)

    def _initial_value():
        return self._initial_state(num_constraints)

    # The initial value is passed as a callable so that, if this method is
    # invoked inside a tf.control_dependencies() block, those dependencies
    # are not applied to the initializer.
    state = standard_ops.Variable(
        _initial_value,
        trainable=False,
        name="swap_regret_optimizer_state")

    leading_zero = standard_ops.zeros((1,), dtype=constraints.dtype)
    zero_and_constraints = standard_ops.concat(
        (leading_zero, constraints), axis=0)
    objective_entry = standard_ops.expand_dims(objective, 0)
    objective_and_proxies = standard_ops.concat(
        (objective_entry, proxy_constraints), axis=0)

    distribution = self._distribution(state)
    loss = standard_ops.tensordot(
        standard_ops.cast(distribution, objective_and_proxies.dtype),
        objective_and_proxies, 1)

    # Outer product of [0, constraints] (as a column) with the distribution
    # (as a row).
    column = standard_ops.expand_dims(
        standard_ops.cast(zero_and_constraints, distribution.dtype), 1)
    row = standard_ops.expand_dims(distribution, 0)
    matrix_gradient = standard_ops.matmul(column, row)

    # Both configurations start from the same model gradients and the same
    # (gradient, variable) pair for the internal state.
    gradient_options = dict(
        var_list=var_list,
        gate_gradients=gate_gradients,
        aggregation_method=aggregation_method,
        colocate_gradients_with_ops=colocate_gradients_with_ops,
        grad_loss=grad_loss)
    grads_and_vars = self.optimizer.compute_gradients(
        loss, **gradient_options)
    state_grad_and_var = self._constraint_grad_and_var(state, matrix_gradient)

    if self.constraint_optimizer is None:
        # No separate constraint optimizer: `self.optimizer` updates the
        # model parameters and the internal state in a single step.
        grads_and_vars.append(state_grad_and_var)
        update_ops = [
            self.optimizer.apply_gradients(grads_and_vars, name="update")
        ]
    else:
        # Separate constraint optimizer: `self.optimizer` updates the model
        # parameters and `self.constraint_optimizer` the internal state.
        # Both updates wait for every gradient to be computed.
        matrix_grads_and_vars = [state_grad_and_var]
        gradients = [
            grad for grad, _ in grads_and_vars + matrix_grads_and_vars
            if grad is not None
        ]
        with ops.control_dependencies(gradients):
            model_update = self.optimizer.apply_gradients(
                grads_and_vars, name="update")
            state_update = self.constraint_optimizer.apply_gradients(
                matrix_grads_and_vars, name="optimizer_state_update")
        update_ops = [model_update, state_update]

    with ops.control_dependencies(update_ops):
        if global_step is None:
            # Without a global step, projecting the state is all that is left.
            return self._projection_op(state, name=name)

        # With a global step, it is incremented in addition to projecting.
        projection_op = self._projection_op(state, name="project")
        with ops.colocate_with(global_step):
            global_step_op = state_ops.assign_add(
                global_step, 1, name="global_step_increment")
        return control_flow_ops.group(
            projection_op, global_step_op, name=name)
def minimize_constrained(self,
                         minimization_problem,
                         global_step=None,
                         var_list=None,
                         gate_gradients=train_optimizer.Optimizer.GATE_OP,
                         aggregation_method=None,
                         colocate_gradients_with_ops=False,
                         name=None,
                         grad_loss=None):
    """Builds the `Op` that minimizes the constrained problem.

    Model parameters are updated with `self.optimizer`. The internal state
    behind the Lagrange multipliers is updated with
    `self.constraint_optimizer` when one is set, and with `self.optimizer`
    otherwise.

    Args:
        minimization_problem: ConstrainedMinimizationProblem, the problem to
            optimize.
        global_step: as in `tf.train.Optimizer`'s `minimize` method.
        var_list: as in `tf.train.Optimizer`'s `minimize` method.
        gate_gradients: as in `tf.train.Optimizer`'s `minimize` method.
        aggregation_method: as in `tf.train.Optimizer`'s `minimize` method.
        colocate_gradients_with_ops: as in `tf.train.Optimizer`'s `minimize`
            method.
        name: as in `tf.train.Optimizer`'s `minimize` method.
        grad_loss: as in `tf.train.Optimizer`'s `minimize` method.

    Returns:
        TensorFlow Op.
    """
    problem = minimization_problem
    objective = problem.objective
    constraints = problem.constraints
    proxy_constraints = problem.proxy_constraints
    if proxy_constraints is None:
        proxy_constraints = constraints

    # Flatten both constraint tensors to 1d.
    num_constraints = problem.num_constraints
    flat_shape = (num_constraints,)
    constraints = standard_ops.reshape(constraints, shape=flat_shape)
    proxy_constraints = standard_ops.reshape(
        proxy_constraints, shape=flat_shape)

    def _initial_value():
        return self._initial_state(num_constraints)

    # The initial value is passed as a callable so that, if this method is
    # invoked inside a tf.control_dependencies() block, those dependencies
    # are not applied to the initializer.
    state = standard_ops.Variable(
        _initial_value,
        trainable=False,
        name="external_regret_optimizer_state")

    multipliers = self._lagrange_multipliers(state)
    # Loss: objective plus the multiplier-weighted proxy constraints.
    weighted_proxies = standard_ops.tensordot(
        multipliers, proxy_constraints, 1)
    loss = objective + weighted_proxies
    # The (flattened) constraints themselves serve as the gradient handed to
    # `_constraint_grad_and_var` for the state.
    multipliers_gradient = constraints

    # Both configurations start from the same model gradients and the same
    # (gradient, variable) pair for the internal state.
    gradient_options = dict(
        var_list=var_list,
        gate_gradients=gate_gradients,
        aggregation_method=aggregation_method,
        colocate_gradients_with_ops=colocate_gradients_with_ops,
        grad_loss=grad_loss)
    grads_and_vars = self.optimizer.compute_gradients(
        loss, **gradient_options)
    state_grad_and_var = self._constraint_grad_and_var(
        state, multipliers_gradient)

    if self.constraint_optimizer is None:
        # No separate constraint optimizer: `self.optimizer` updates the
        # model parameters and the internal state in a single step.
        grads_and_vars.append(state_grad_and_var)
        update_ops = [
            self.optimizer.apply_gradients(grads_and_vars, name="update")
        ]
    else:
        # Separate constraint optimizer: `self.optimizer` updates the model
        # parameters and `self.constraint_optimizer` the internal state.
        # Both updates wait for every gradient to be computed.
        multiplier_grads_and_vars = [state_grad_and_var]
        gradients = [
            grad for grad, _ in grads_and_vars + multiplier_grads_and_vars
            if grad is not None
        ]
        with ops.control_dependencies(gradients):
            model_update = self.optimizer.apply_gradients(
                grads_and_vars, name="update")
            state_update = self.constraint_optimizer.apply_gradients(
                multiplier_grads_and_vars, name="optimizer_state_update")
        update_ops = [model_update, state_update]

    with ops.control_dependencies(update_ops):
        if global_step is None:
            # Without a global step, projecting the state is all that is left.
            return self._projection_op(state, name=name)

        # With a global step, it is incremented in addition to projecting.
        projection_op = self._projection_op(state, name="project")
        with ops.colocate_with(global_step):
            global_step_op = state_ops.assign_add(
                global_step, 1, name="global_step_increment")
        return control_flow_ops.group(
            projection_op, global_step_op, name=name)
def dense(inputs, kernel, bias=None, activation=None, dtype=None):
    """Densely connected NN layer op.

    Args:
        inputs: `tf.Tensor` or `tf.SparseTensor`. Inputs to operation.
        kernel: `tf.Variable`. Matrix kernel.
        bias: (Optional) `tf.Variable`. Bias to add to outputs.
        activation: (Optional) 1-argument callable. Activation function to
            apply to outputs.
        dtype: (Optional) `tf.DType`. Dtype to cast `inputs` to.

    Returns:
        `tf.Tensor`. Output of dense connection.
    """
    if dtype and inputs.dtype.base_dtype != dtype.base_dtype:
        inputs = math_ops.cast(inputs, dtype=dtype)

    rank = inputs.shape.rank
    needs_broadcast = rank is not None and rank != 2

    if needs_broadcast:
        # Broadcast kernel to inputs: contract the last input axis against
        # the first kernel axis.
        outputs = standard_ops.tensordot(inputs, kernel, [[rank - 1], [0]])
        # Re-attach the static shape (leading input dims plus the kernel's
        # last dim); this is only done when not executing eagerly.
        if not context.executing_eagerly():
            input_dims = inputs.shape.as_list()
            outputs.set_shape(input_dims[:-1] + [kernel.shape[-1]])
    elif isinstance(inputs, sparse_tensor.SparseTensor):
        # embedding_lookup_sparse serves as a more efficient matmul for large
        # sparse inputs. It yields a sparse gradient, whereas
        # sparse_ops.sparse_tensor_dense_matmul yields dense gradients. This
        # can lead to significant speedups, see b/171762937.
        #
        # Empty rows are filled first, as the op assumes at least one id per
        # row.
        inputs, _ = sparse_ops.sparse_fill_empty_rows(inputs, 0)
        # To express the matrix multiply as an embedding lookup, the input
        # matrix is split into an ids tensor and a weights tensor. The ids
        # are the column indices of the input entries; the weights are the
        # input matrix itself. The column arrangement of ids and weights is
        # summed over and does not matter. See the documentation of
        # sparse_ops.sparse_tensor_dense_matmul for a more detailed
        # explanation of the inputs to both ops.
        column_ids = sparse_tensor.SparseTensor(
            indices=inputs.indices,
            values=inputs.indices[:, 1],
            dense_shape=inputs.dense_shape)
        entry_weights = inputs
        outputs = embedding_ops.embedding_lookup_sparse_v2(
            kernel, column_ids, entry_weights, combiner="sum")
    else:
        outputs = gen_math_ops.MatMul(a=inputs, b=kernel)

    if bias is not None:
        outputs = nn_ops.bias_add(outputs, bias)

    if activation is None:
        return outputs
    return activation(outputs)
def call(self, inputs):
    """Compute the layer output, optionally through quantization emulation.

    When the environment variable `ENABLE_QUANTOP_DENSE` parses to 1, both
    the inputs and `self.kernel` are passed through
    `quantemu_ops.quantize_emu` (configured by the `QUANTEMU_*` environment
    variables) before being multiplied. Otherwise they are multiplied as-is.
    In both cases `self.bias` is added when `self.use_bias` is set, and
    `self.activation` is applied when it is not None.

    Args:
        inputs: Value convertible to a tensor of dtype `self.dtype`.

    Returns:
        The projected tensor, after the optional bias and activation.
    """
    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)

    if int(os.getenv('ENABLE_QUANTOP_DENSE', 0)) == 1:
        # Settings shared by the input and kernel quantizers.
        data_type = int(os.getenv('QUANTEMU_DENSE_DATA_TYPE', 0))
        exponent_bits = int(os.getenv('QUANTEMU_EXPBITS', 5))
        matmul_inputs = quantemu_ops.quantize_emu(
            inputs,
            data_format='unknown',
            allocate_copy=int(os.getenv('QUANTEMU_ALLOCATE_COPY_INPUTS', 0)),
            data_type=data_type,
            precision=int(os.getenv('QUANTEMU_PRECISION_DENSE_INPUTS', 23)),
            exponent_bits=exponent_bits,
            round_mode=int(os.getenv('QUANTEMU_RMODE_INPUTS', 0)))
        matmul_kernel = quantemu_ops.quantize_emu(
            self.kernel,
            data_format='unknown',
            allocate_copy=int(os.getenv('QUANTEMU_ALLOCATE_COPY_FILTERS', 0)),
            data_type=data_type,
            precision=int(os.getenv('QUANTEMU_PRECISION_DENSE_FILTERS', 23)),
            exponent_bits=exponent_bits,
            round_mode=int(os.getenv('QUANTEMU_RMODE_FILTERS', 0)))
    else:
        # No quantization.
        matmul_inputs, matmul_kernel = inputs, self.kernel

    # Rank and static shape are always taken from the un-quantized inputs.
    rank = common_shapes.rank(inputs)
    # `rank` is expected to be None when the static rank is unknown. Ordering
    # None against an int raises TypeError on Python 3, so such inputs are
    # routed explicitly to the matmul path.
    if rank is not None and rank > 2:
        # Broadcasting is required for the inputs.
        outputs = standard_ops.tensordot(
            matmul_inputs, matmul_kernel, [[rank - 1], [0]])
        # Re-attach the static shape (leading input dims plus `units`); this
        # is only done when not executing eagerly.
        if not context.executing_eagerly():
            shape = inputs.get_shape().as_list()
            outputs.set_shape(shape[:-1] + [self.units])
    else:
        outputs = gen_math_ops.mat_mul(matmul_inputs, matmul_kernel)

    if self.use_bias:
        outputs = nn.bias_add(outputs, self.bias)
    if self.activation is not None:
        return self.activation(outputs)  # pylint: disable=not-callable
    return outputs
def _broadcasted_tensordot(_inputs, _kernel):
    """Contract axis `rank - 1` of `_inputs` with axis 0 of `_kernel`.

    `rank` is not a parameter: it is looked up from the surrounding scope
    when this function runs. Presumably it is the rank of `_inputs`, which
    would make the contracted axis its last one -- verify against the caller.
    """
    contraction_axes = [[rank - 1], [0]]
    return standard_ops.tensordot(_inputs, _kernel, contraction_axes)
def call(self, inputs):
    """Compute the layer output, optionally keeping only masked entries.

    The inputs are multiplied by `self.kernel` and passed through
    `self.activation` when it is not None. No bias is added in this method.
    The result is stored on `self.full_outputs`. When `self.leaky_inputs` is
    truthy, the transposed result is flattened, the entries whose
    corresponding flattened `self.mask_array` value is greater than 0 are
    selected, and they are returned as a column (axis of size 1 appended).
    Shapes are printed along the way when `self.verbose > 0`.

    Args:
        inputs: Value convertible to a tensor.

    Returns:
        The full activated output when `self.leaky_inputs` is falsy,
        otherwise the masked entries with an extra axis at position 1.
    """
    inputs = ops.convert_to_tensor(inputs)
    rank = common_shapes.rank(inputs)

    # `rank` is expected to be None when the static rank is unknown. Ordering
    # None against an int raises TypeError on Python 3, so such inputs are
    # routed explicitly to the matmul path.
    if rank is not None and rank > 2:
        # Broadcasting is required for the inputs.
        outputs = standard_ops.tensordot(
            inputs, self.kernel, [[rank - 1], [0]])
        # Re-attach the static shape (leading input dims plus `units`); this
        # is only done when not executing eagerly.
        if not context.executing_eagerly():
            shape = inputs.get_shape().as_list()
            outputs.set_shape(shape[:-1] + [self.units])
    else:
        outputs = gen_math_ops.mat_mul(inputs, self.kernel)

    if self.activation is not None:
        outputs = self.activation(outputs)  # pylint: disable=not-callable

    verbose = self.verbose > 0
    if verbose:
        print(outputs.get_shape(), 'outputs before masking')

    # Keep the unmasked result available regardless of the masking below.
    self.full_outputs = outputs

    if not self.leaky_inputs:
        if verbose:
            print(outputs.get_shape(), 'shape of output without masking')
        return outputs

    if verbose:
        print('performing mask op')
    # Flatten the transposed outputs and the mask to 1-D.
    # NOTE(review): the mask is flattened without a transpose; this presumes
    # `self.mask_array` is already laid out to match -- confirm.
    outputs_1d = tf.reshape(tf.transpose(outputs), [-1], name='outputs_1d_')
    if verbose:
        print('mask_array', self.mask_array.get_shape())
    mask_array_1d = tf.reshape(self.mask_array, [-1], name='mask_ph_1d_')
    if verbose:
        print('mask_array_1d', mask_array_1d.get_shape())

    # Keep only the entries whose mask value is strictly positive.
    mask = tf.math.greater(
        mask_array_1d, tf.constant(0.0), name='masking_op_')
    outputs = tf.expand_dims(outputs_1d[mask], axis=1)
    if verbose:
        print(outputs.get_shape(), 'shape of output after masking')
    return outputs