# Assumed context (illustrative, not verified against the original repo):
# TF 1.x as `tf`, a `vision_layers` module providing the image-to-feature and
# feature-to-pose networks, and TRAIN = tf.estimator.ModeKeys.TRAIN.


def single_batch_a_func(
    self, features, scope, mode, context_fn, reuse, config, params):
  """Single step action predictor when there is a single batch dim."""
  del config
  with tf.variable_scope(scope, reuse=reuse, use_resource=True):
    with tf.variable_scope('state_features', reuse=reuse, use_resource=True):
      feature_points, end_points = vision_layers.BuildImagesToFeaturesModel(
          features.image,
          is_training=(mode == TRAIN),
          normalizer_fn=tf.contrib.layers.layer_norm)
    if context_fn:
      feature_points = context_fn(feature_points)
    if params and params.get('is_inner_loop', False):
      # In the inner (adaptation) loop the ground-truth gripper pose is not
      # consumed: either predict it from visual features or feed zeros.
      if self._predict_con_gripper_pose:
        gripper_pose = self._predict_gripper_pose(feature_points)
      else:
        gripper_pose = tf.zeros_like(features.gripper_pose)
    else:
      gripper_pose = features.gripper_pose
    action, _ = vision_layers.BuildImageFeaturesToPoseModel(
        feature_points, aux_input=gripper_pose, num_outputs=self._action_size)
    # Un-normalize the predicted action back to the environment scale.
    action = self._output_mean + self._output_stddev * action
  return {
      'action': action,
      'image': features.image,
      'feature_points': feature_points,
      'softmax': end_points['softmax'],
  }
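
# --- Illustrative sketch (not part of the library) --------------------------
# The `action = self._output_mean + self._output_stddev * action` line above
# un-whitens the network output: the pose model is trained against normalized
# actions, and per-dimension statistics map predictions back to the
# environment's action scale. A minimal numpy sketch of the same convention
# (all names below are hypothetical):
import numpy as np


def fit_action_normalizer(train_actions):
  """Per-dimension mean/stddev from a [num_samples, action_size] array."""
  return train_actions.mean(axis=0), train_actions.std(axis=0)


def denormalize_action(normalized_action, mean, stddev):
  """Maps a whitened network output back to the environment action scale."""
  return mean + stddev * normalized_action


_actions = np.random.uniform(-0.1, 0.1, size=(1000, 7))
_mean, _stddev = fit_action_normalizer(_actions)
assert denormalize_action(np.zeros(7), _mean, _stddev).shape == (7,)
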
def embed_condition_images(condition_image,
                           scope,
                           reuse=tf.AUTO_REUSE,
                           fc_layers=None):
  """Independently embed a (meta)-batch of images.

  Args:
    condition_image: A rank 4 tensor of images: [N, H, W, C].
    scope: Name of the tf variable_scope.
    reuse: The variable_scope reuse setting.
    fc_layers: An optional tuple of ints describing the number of units in
      each fully-connected hidden layer.

  Returns:
    A rank 2 tensor of embeddings: [N, embedding size].

  Raises:
    ValueError: if `condition_image` has incorrect rank.
  """
  if len(condition_image.shape) != 4:
    raise ValueError(
        'Image has unexpected shape {}.'.format(condition_image.shape))
  with tf.variable_scope(scope, reuse=reuse, use_resource=True):
    image_embedding, _ = vision_layers.BuildImagesToFeaturesModel(
        condition_image)
    if fc_layers is not None:
      image_embedding = layers.stack(
          image_embedding,
          layers.fully_connected,
          fc_layers[:-1],
          activation_fn=tf.nn.relu,
          normalizer_fn=layers.layer_norm)
      image_embedding = layers.fully_connected(
          image_embedding, fc_layers[-1], activation_fn=None)
  return image_embedding
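
# --- Illustrative sketch (not part of the library) --------------------------
# `layers.stack(x, layers.fully_connected, fc_layers[:-1], ...)` above applies
# one fully-connected layer per entry of `fc_layers[:-1]`; a final linear
# layer then produces the embedding. A rough equivalent with core TF-1.x ops
# (layer normalization omitted for brevity; names are hypothetical):
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()


def mlp_embedding(x, fc_layers):
  """ReLU hidden layers for fc_layers[:-1], then a linear output layer."""
  for i, units in enumerate(fc_layers[:-1]):
    x = tf.layers.dense(x, units, activation=tf.nn.relu, name='hidden_%d' % i)
  return tf.layers.dense(x, fc_layers[-1], activation=None, name='embedding')


_x = tf.placeholder(tf.float32, [None, 64])
_embedding = mlp_embedding(_x, fc_layers=(100, 100, 32))  # -> [None, 32]
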
def a_func(self,
           features,
           scope,
           mode,
           config=None,
           params=None,
           reuse=tf.AUTO_REUSE,
           context_fn=None):
  """A (state) regression function.

  This function can return a stochastic or a deterministic tensor.

  Args:
    features: This is the first item returned from the input_fn and parsed by
      tensorspec_utils.validate_and_pack. A spec_structure which fulfills the
      requirements of self.get_feature_specification.
    scope: String specifying the variable scope.
    mode: (ModeKeys) Specifies if this is training, evaluation, or prediction.
    config: Optional configuration object. Will receive what is passed to
      Estimator in the config parameter, or the default config. Allows
      updating things in your model_fn based on configuration such as
      num_ps_replicas or model_dir.
    params: An optional dict of hyperparameters that will be passed into
      input_fn and model_fn. Keys are names of parameters, values are basic
      Python types. There are reserved keys for TPUEstimator, including
      'batch_size'.
    reuse: Whether or not to reuse variables under variable scope 'scope'.
    context_fn: Optional Python function that takes in features and returns
      new features of the same shape. For merging information like in RL^2.

  Returns:
    outputs: A {key: Tensor} mapping. The key 'inference_output' is required.
  """
  del config
  is_training = mode == TRAIN
  image = tf.image.convert_image_dtype(features.state, tf.float32)
  with tf.variable_scope(scope, reuse=reuse, use_resource=True):
    with tf.variable_scope('state_features', reuse=reuse, use_resource=True):
      feature_points, end_points = vision_layers.BuildImagesToFeaturesModel(
          image, is_training=is_training, normalizer_fn=layers.layer_norm)
    del end_points
    if context_fn:
      feature_points = context_fn(feature_points)
    estimated_pose, _ = vision_layers.BuildImageFeaturesToPoseModel(
        feature_points, num_outputs=self._action_size)
  return {
      'inference_output': estimated_pose,
      'state_features': feature_points
  }
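
# --- Illustrative sketch (not part of the library) --------------------------
# `tf.image.convert_image_dtype(features.state, tf.float32)` above rescales
# integer images into [0, 1] (float inputs pass through unchanged), so the
# vision tower always sees a consistent input range. A minimal check of that
# behavior:
import numpy as np
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

_uint8_image = tf.constant(np.array([[[0, 128, 255]]], dtype=np.uint8))
_float_image = tf.image.convert_image_dtype(_uint8_image, tf.float32)
with tf.Session() as sess:
  print(sess.run(_float_image))  # ~[[[0.0, 0.502, 1.0]]], i.e. value / 255.
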
def embed_condition_images(condition_image,
                           scope,
                           reuse=tf.AUTO_REUSE,
                           fc_layers=None,
                           use_spatial_softmax=True):
  """Independently embed a (meta)-batch of images.

  Args:
    condition_image: A rank 4 tensor of images: [N, H, W, C].
    scope: Name of the tf variable_scope.
    reuse: The variable_scope reuse setting.
    fc_layers: An optional tuple of ints describing the number of units in
      each fully-connected hidden layer (or 1x1 conv layer when the spatial
      softmax is excluded).
    use_spatial_softmax: Whether to use a spatial softmax or not.

  Returns:
    A rank 2 tensor of embeddings: [N, embedding size] if use_spatial_softmax
    is True. Otherwise, a rank 4 tensor of visual features:
    [N, H, W, embedding size].

  Raises:
    ValueError: if `condition_image` has incorrect rank.
  """
  if len(condition_image.shape) != 4:
    raise ValueError(
        'Image has unexpected shape {}.'.format(condition_image.shape))
  with tf.variable_scope(scope, reuse=reuse, use_resource=True):
    image_embedding, _ = vision_layers.BuildImagesToFeaturesModel(
        condition_image, use_spatial_softmax=use_spatial_softmax)
    if fc_layers is not None:
      if len(image_embedding.shape) == 2:
        # Rank 2 (pooled) features: embed with fully-connected layers.
        image_embedding = layers.stack(
            image_embedding,
            layers.fully_connected,
            fc_layers[:-1],
            activation_fn=tf.nn.relu,
            normalizer_fn=layers.layer_norm)
        image_embedding = layers.fully_connected(
            image_embedding, fc_layers[-1], activation_fn=None)
      else:
        # Rank 4 feature maps: embed each spatial location with 1x1 convs.
        image_embedding = layers.stack(
            image_embedding,
            layers.conv2d,
            fc_layers[:-1],
            kernel_size=[1, 1],
            activation_fn=tf.nn.relu,
            normalizer_fn=layers.layer_norm)
        # kernel_size is a required conv2d argument; [1, 1] matches the
        # per-location embedding done by the stacked layers above.
        image_embedding = layers.conv2d(
            image_embedding, fc_layers[-1], kernel_size=[1, 1],
            activation_fn=None)
  return image_embedding
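
# --- Illustrative sketch (not part of the library) --------------------------
# With `use_spatial_softmax=True`, the embedding above reduces each feature
# map to a 2-D "feature point": a softmax over the H*W locations followed by
# the softmax-weighted mean pixel coordinate per channel. A standalone
# version of that reduction (an assumption about the library's internals;
# NHWC input with statically known H, W, C):
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()


def spatial_softmax(feature_maps):
  """[N, H, W, C] feature maps -> [N, 2*C] expected (x, y) coordinates."""
  _, h, w, c = feature_maps.get_shape().as_list()
  # Softmax over the H*W spatial locations, independently per channel.
  flat = tf.reshape(tf.transpose(feature_maps, [0, 3, 1, 2]), [-1, h * w])
  attention = tf.nn.softmax(flat)                       # [N*C, H*W]
  # Pixel coordinate grids, normalized to [-1, 1].
  ys, xs = tf.meshgrid(tf.linspace(-1., 1., h),
                       tf.linspace(-1., 1., w), indexing='ij')
  expected_x = tf.reduce_sum(attention * tf.reshape(xs, [h * w]), axis=1)
  expected_y = tf.reduce_sum(attention * tf.reshape(ys, [h * w]), axis=1)
  points = tf.stack([expected_x, expected_y], axis=1)   # [N*C, 2]
  return tf.reshape(points, [-1, 2 * c])                # [N, 2*C]


_maps = tf.placeholder(tf.float32, [None, 32, 32, 16])
_points = spatial_softmax(_maps)  # -> [None, 32]
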
def _single_batch_a_func(self,
                         features,
                         scope,
                         mode,
                         context_fn=None,
                         reuse=tf.AUTO_REUSE):
  """A state -> action regression function that expects a single batch dim."""
  gripper_pose = features.gripper_pose if self._use_gripper_input else None
  with tf.variable_scope(scope, reuse=reuse, use_resource=True):
    with tf.variable_scope('state_features', reuse=reuse, use_resource=True):
      feature_points, end_points = vision_layers.BuildImagesToFeaturesModel(
          features.image,
          is_training=(mode == TRAIN),
          normalizer_fn=tf.contrib.layers.layer_norm)
    if context_fn:
      feature_points = context_fn(feature_points)
    if gripper_pose is not None:
      fc_input = tf.concat([feature_points, gripper_pose], -1)
    else:
      # Guard: gripper_pose is None when self._use_gripper_input is False
      # and cannot be concatenated.
      fc_input = feature_points
    outputs = {}
    if self._num_mixture_components > 1:
      # Mixture-density head: predict the parameters of a Gaussian mixture
      # over actions instead of a single point estimate.
      dist_params = mdn.predict_mdn_params(
          fc_input,
          self._num_mixture_components,
          self._action_size,
          condition_sigmas=self._condition_mixture_stddev)
      gm = mdn.get_mixture_distribution(
          dist_params, self._num_mixture_components, self._action_size,
          self._output_mean if self._normalize_outputs else None)
      if self._output_mixture_sample:
        # Output a mixture sample as action.
        action = gm.sample()
      else:
        action = mdn.gaussian_mixture_approximate_mode(gm)
      outputs['dist_params'] = dist_params
    else:
      action, _ = vision_layers.BuildImageFeaturesToPoseModel(
          fc_input, num_outputs=self._action_size)
      # Un-normalize the regression output back to the environment scale.
      action = self._output_mean + self._output_stddev * action
    outputs.update({
        'action': action,
        'image': features.image,
        'feature_points': feature_points,
        'softmax': end_points['softmax']
    })
  return outputs
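
# --- Illustrative sketch (not part of the library) --------------------------
# The mixture-density branch above predicts the parameters of a Gaussian
# mixture over actions and either samples from it or takes an approximate
# mode. A minimal version of that distribution, assuming the
# tensorflow_probability dependency (the `mdn` module's internals may
# differ):
import tensorflow.compat.v1 as tf
import tensorflow_probability as tfp

tf.disable_v2_behavior()


def make_gaussian_mixture(logits, means, stddevs):
  """logits: [N, K]; means/stddevs: [N, K, action_size]."""
  return tfp.distributions.MixtureSameFamily(
      mixture_distribution=tfp.distributions.Categorical(logits=logits),
      components_distribution=tfp.distributions.MultivariateNormalDiag(
          loc=means, scale_diag=stddevs))


_logits = tf.zeros([8, 3])            # Three equally weighted components.
_means = tf.random.normal([8, 3, 7])  # 7-D action space.
_stddevs = tf.ones([8, 3, 7])
_gm = make_gaussian_mixture(_logits, _means, _stddevs)
_sampled_action = _gm.sample()        # Stochastic action: [8, 7].
# One cheap mode approximation: the mean of the most probable component.
_best = tf.one_hot(tf.argmax(_logits, axis=-1), 3)               # [8, 3]
_approx_mode = tf.reduce_sum(_means * _best[..., None], axis=1)  # [8, 7]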