Example #1
def _transform_into_images(config, data, img_type="all"):
    """ reshapes data (shape: (batch_size, feature_length)) into the required image shape with an
    additional batch_dimension, e.g. (1,120,160,7) """
    data_shape = get_correct_image_shape(config, get_type=img_type)
    data = data[:, :-6]  # drop the 6 trailing non-visual elements (velocity and position)
    data = tf.reshape(data, [-1, *data_shape])
    return data
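A minimal usage sketch (not part of the original): it assumes a configuration for which get_correct_image_shape returns (120, 160, 7), the shape named in the docstring, and stubs out that project helper:

import numpy as np
import tensorflow as tf

def get_correct_image_shape(config, get_type="all"):
    # stub standing in for the project helper used by these examples
    return (120, 160, 7)

batch = tf.constant(np.zeros((4, 120 * 160 * 7 + 6), dtype=np.float32))
images = _transform_into_images(config=None, data=batch)
print(images.get_shape())  # (4, 120, 160, 7)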
Example #2
def _transform_edge_into_images(config, data, img_type="all", output_cnn_2_filter_maps=False):
    """ reshapes data (shape: (batch_size, feature_length)) into the required image shape with an
    additional batch_dimension, e.g. (1,120,160,7) """
    data_shape = get_correct_image_shape(config, get_type=img_type)
    if output_cnn_2_filter_maps:
        data_shape = (120, 160, 2)  # hard-coded: two CNN filter maps at full image resolution
    data = tf.reshape(data, [-1, *data_shape])
    return data
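Example #2 performs the same reshape but without slicing off the non-visual elements; when output_cnn_2_filter_maps is set, the target shape is hard-coded to two filter maps. A sketch of both branches, reusing the stub above (the feature length must match the target shape exactly):

edge_imgs = _transform_edge_into_images(None, tf.zeros((2, 120 * 160 * 7)))  # -> (2, 120, 160, 7)
cnn2_maps = _transform_edge_into_images(None, tf.zeros((2, 120 * 160 * 2)),
                                        output_cnn_2_filter_maps=True)      # -> (2, 120, 160, 2)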
Example #3
    def _build(self, inputs, verbose=VERBOSITY):

        if EncodeProcessDecode_v7_edge_segmentation_no_edges_dropout.convnet_tanh:
            activation = tf.nn.tanh
        else:
            activation = tf.nn.relu

        img_shape = get_correct_image_shape(
            config=None,
            get_type="seg",
            depth_data_provided=
            EncodeProcessDecode_v7_edge_segmentation_no_edges_dropout.
            depth_data_provided)
        img_data = tf.reshape(
            inputs, [-1, *img_shape])  # -1 means "all", i.e. batch dimension
        print(img_data.get_shape())
        ''' 60, 80 '''
        outputs = snt.Conv2D(output_channels=32,
                             kernel_shape=3,
                             stride=2,
                             padding="SAME")(img_data)
        outputs = activation(outputs)
        if EncodeProcessDecode_v7_edge_segmentation_no_edges_dropout.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)
        print(outputs.get_shape())
        ''' 30, 40 '''
        outputs = snt.Conv2D(output_channels=32,
                             kernel_shape=3,
                             stride=2,
                             padding="SAME")(outputs)
        outputs = activation(outputs)
        if EncodeProcessDecode_v7_edge_segmentation_no_edges_dropout.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)
        print(outputs.get_shape())
        ''' 15, 20 '''
        outputs = snt.Conv2D(output_channels=16,
                             kernel_shape=3,
                             stride=2,
                             padding="SAME")(outputs)
        outputs = activation(outputs)
        if EncodeProcessDecode_v7_edge_segmentation_no_edges_dropout.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)
        print(outputs.get_shape())
        ''' 8, 10 '''
        outputs = snt.Conv2D(output_channels=5,
                             kernel_shape=3,
                             stride=2,
                             padding="SAME")(outputs)
        outputs = activation(outputs)
        if EncodeProcessDecode_v7_edge_segmentation_no_edges_dropout.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)
        print(outputs.get_shape())

        outputs = tf.layers.flatten(outputs)  # 8,10,5 flattened

        return outputs
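Each SAME-padded, stride-2 convolution maps a spatial size s to ceil(s / 2). Assuming the segmentation image enters at 120x160 (consistent with the final "8,10,5 flattened" comment), the inline size annotations follow directly; a quick check of that arithmetic:

import math

def same_out(size, stride=2):
    # spatial output size of a SAME-padded strided convolution
    return math.ceil(size / stride)

h, w = 120, 160
for _ in range(4):
    h, w = same_out(h), same_out(w)
    print(h, w)  # 60 80 -> 30 40 -> 15 20 -> 8 10; flatten: 8 * 10 * 5 = 400 features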
Example #4
    def _build(self, inputs, name, verbose=VERBOSITY, keep_dropout_prop=0.9):

        if EncodeProcessDecode_v5_no_skip_batch_norm.convnet_tanh:
            activation = tf.nn.tanh
        else:
            activation = tf.nn.relu
        """ velocity (x,y,z) and position (x,y,z) """
        n_non_visual_elements = 6

        filter_sizes = [
            EncodeProcessDecode_v5_no_skip_batch_norm.n_conv_filters,
            EncodeProcessDecode_v5_no_skip_batch_norm.n_conv_filters * 2
        ]
        """ shape: (batch_size, features), get everything except velocity and position """
        img_data = inputs[:, :-n_non_visual_elements]
        img_shape = get_correct_image_shape(
            config=None,
            get_type="all",
            depth_data_provided=EncodeProcessDecode_v5_no_skip_batch_norm.
            depth_data_provided)
        img_data = tf.reshape(
            img_data, [-1, *img_shape])  # -1 means "all", i.e. batch dimension
        ''' Layer1 encoder output shape (?, 120, 160, 64) '''
        outputs1 = tf.layers.conv2d(
            img_data,
            filters=64,
            kernel_size=3,
            strides=1,
            padding='same',
            activation=activation,
            use_bias=False,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=1e-05))

        if EncodeProcessDecode_v5_no_skip_batch_norm.conv_layer_instance_norm:
            outputs1 = snt.BatchNorm()(outputs1, is_training=self._is_training)

        l1_shape = outputs1.get_shape()
        ''' Layer2 encoder output shape (?, 120, 160, 64) '''
        outputs = tf.layers.conv2d(
            outputs1,
            filters=64,
            kernel_size=3,
            strides=1,
            padding='same',
            activation=activation,
            use_bias=False,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=1e-05))

        if EncodeProcessDecode_v5_no_skip_batch_norm.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)

        l2_shape = outputs.get_shape()
        ''' Layer3 encoder output shape (?, 60, 80, 64) '''
        if EncodeProcessDecode_v5_no_skip_batch_norm.convnet_pooling:
            outputs = tf.layers.max_pooling2d(outputs, 2, 2)
        l3_shape = outputs.get_shape()

        if self._is_training:
            outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        else:
            outputs = tf.nn.dropout(outputs, keep_prob=1.0)
        ''' Layer4 encoder output shape (?, 60, 80, filter_sizes[0]) '''
        outputs = tf.layers.conv2d(
            outputs,
            filters=filter_sizes[0],
            kernel_size=3,
            strides=1,
            padding='same',
            activation=activation,
            use_bias=False,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=1e-05))

        if EncodeProcessDecode_v5_no_skip_batch_norm.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)

        l4_shape = outputs.get_shape()
        ''' Layer5 encoder output shape (?, 60, 80, filter_sizes[0]) '''
        outputs = tf.layers.conv2d(
            outputs,
            filters=filter_sizes[0],
            kernel_size=3,
            strides=1,
            padding='same',
            activation=activation,
            use_bias=False,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=1e-05))

        # --------------- SKIP CONNECTION --------------- #
        outputs2 = outputs

        if EncodeProcessDecode_v5_no_skip_batch_norm.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)

        l5_shape = outputs.get_shape()
        ''' Layer6 encoder output shape (?, 30, 40, filter_sizes[0]) '''
        if EncodeProcessDecode_v5_no_skip_batch_norm.convnet_pooling:
            outputs = tf.layers.max_pooling2d(outputs, 2, 2)
        l6_shape = outputs.get_shape()

        if self._is_training:
            outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        else:
            outputs = tf.nn.dropout(outputs, keep_prob=1.0)
        ''' Layer7 encoder output shape (?, 30, 40, filter_sizes[0]) '''
        outputs = tf.layers.conv2d(
            outputs,
            filters=filter_sizes[0],
            kernel_size=3,
            strides=1,
            padding='same',
            activation=activation,
            use_bias=False,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=1e-05))

        if EncodeProcessDecode_v5_no_skip_batch_norm.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)

        l7_shape = outputs.get_shape()
        ''' Layer8 encoder output shape (?, 30, 40, filter_sizes[0]) '''
        outputs = tf.layers.conv2d(
            outputs,
            filters=filter_sizes[0],
            kernel_size=3,
            strides=1,
            padding='same',
            activation=activation,
            use_bias=False,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=1e-05))

        if EncodeProcessDecode_v5_no_skip_batch_norm.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)

        l8_shape = outputs.get_shape()
        ''' Layer9 encoder output shape (?, 15, 20, filter_sizes[0]) '''
        if EncodeProcessDecode_v5_no_skip_batch_norm.convnet_pooling:
            outputs = tf.layers.max_pooling2d(outputs, 2, 2)
        l9_shape = outputs.get_shape()

        if self._is_training:
            outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        else:
            outputs = tf.nn.dropout(outputs, keep_prob=1.0)
        ''' Layer10 encoder output shape (?, 15, 20, filter_sizes[1]) '''
        outputs = tf.layers.conv2d(
            outputs,
            filters=filter_sizes[1],
            kernel_size=3,
            strides=1,
            padding='same',
            activation=activation,
            use_bias=False,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=1e-05))

        if EncodeProcessDecode_v5_no_skip_batch_norm.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)

        l10_shape = outputs.get_shape()
        ''' Layer11 encoder output shape (?, 15, 20, filter_sizes[1]) '''
        outputs = tf.layers.conv2d(
            outputs,
            filters=filter_sizes[1],
            kernel_size=3,
            strides=1,
            padding='same',
            activation=activation,
            use_bias=False,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=1e-05))
        # --------------- SKIP CONNECTION --------------- #
        outputs3 = outputs

        if EncodeProcessDecode_v5_no_skip_batch_norm.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)

        l11_shape = outputs.get_shape()
        ''' Layer12 encoder output shape (?, 7, 10, filter_sizes[1]) '''
        if EncodeProcessDecode_v5_no_skip_batch_norm.convnet_pooling:
            outputs = tf.layers.max_pooling2d(outputs, 2, 2)
        l12_shape = outputs.get_shape()
        ''' Layer13 encoder output shape (?, 4, 5, filter_sizes[1]) '''
        outputs = tf.layers.conv2d(
            outputs,
            filters=filter_sizes[1],
            kernel_size=3,
            strides=2,
            padding='same',
            activation=activation,
            use_bias=False,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=1e-05))

        if EncodeProcessDecode_v5_no_skip_batch_norm.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)

        l13_shape = outputs.get_shape()
        ''' Layer14 encoder output shape (?, 2, 3, filter_sizes[1]) '''
        outputs = tf.layers.conv2d(
            outputs,
            filters=filter_sizes[1],
            kernel_size=3,
            strides=2,
            padding='same',
            activation=activation,
            use_bias=False,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=1e-05))

        if EncodeProcessDecode_v5_no_skip_batch_norm.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)

        l14_shape = outputs.get_shape()
        ''' Layer15 encoder output shape (?, 1, 1, filter_sizes[1]) '''
        if EncodeProcessDecode_v5_no_skip_batch_norm.convnet_pooling:
            outputs = tf.layers.max_pooling2d(outputs, 2, 2)
        l15_shape = outputs.get_shape()

        if verbose:
            print("Layer1 encoder output shape", l1_shape)
            print("Layer2 encoder output shape", l2_shape)
            print("Layer3 encoder output shape", l3_shape)
            print("Layer4 encoder output shape", l4_shape)
            print("Layer5 encoder output shape", l5_shape)
            print("Layer6 encoder output shape", l6_shape)
            print("Layer7 encoder output shape", l7_shape)
            print("Layer8 encoder output shape", l8_shape)
            print("Layer9 encoder output shape", l9_shape)
            print("Layer10 encoder output shape", l10_shape)
            print("Layer11 encoder output shape", l11_shape)
            print("Layer12 encoder output shape", l12_shape)
            print("Layer13 encoder output shape", l13_shape)
            print("Layer14 encoder output shape", l14_shape)
            print("Layer15 encoder output shape", l15_shape)

        # shape (?, 7, 10, filter_sizes[1]) -> (?, n_neurons_nodes_total_dim - n_neurons_nodes_non_visual)
        visual_latent_output = tf.layers.flatten(outputs)
        #visual_latent_output = tf.layers.dense(inputs=visual_latent_output, units=EncodeProcessDecode_v4_172_improve_shapes_exp1.n_neurons_nodes_total_dim - EncodeProcessDecode_v4_172_improve_shapes_exp1.n_neurons_nodes_non_visual)

        # --------------- SKIP CONNECTION --------------- #
        self.skip1 = outputs1
        self.skip2 = outputs2
        self.skip3 = outputs3

        return visual_latent_output
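The paired train/eval dropout branches recur throughout this encoder; since tf.nn.dropout with keep_prob=1.0 is an identity, they collapse into a single hypothetical helper (a sketch, not part of the original code):

import tensorflow as tf

def dropout_if_training(x, is_training, keep_prob):
    # equivalent to the if/else blocks above: keep_prob=1.0 makes dropout a no-op
    return tf.nn.dropout(x, keep_prob=keep_prob if is_training else 1.0)

# usage, e.g.: outputs = dropout_if_training(outputs, self._is_training, keep_dropout_prop)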
Example #5
    def _build(self, inputs, name, is_training=True, verbose=False):

        if EncodeProcessDecode_v1.convnet_tanh:
            activation = tf.nn.tanh
        else:
            activation = tf.nn.relu

        if "global" in name:
            n_non_visual_elements = 5
        else:
            n_non_visual_elements = 6


        filter_sizes = [EncodeProcessDecode_v1.n_conv_filters, EncodeProcessDecode_v1.n_conv_filters * 2]

        img_data = inputs[:, :-n_non_visual_elements]  # shape: (batch_size, features)
        img_shape = get_correct_image_shape(config=None, get_type="all", depth_data_provided=EncodeProcessDecode_v1.depth_data_provided)
        img_data = tf.reshape(img_data, [-1, *img_shape])  # -1 means "all", i.e. batch dimension

        ''' layer 1'''
        outputs = tf.layers.conv2d(img_data, filters=filter_sizes[0], kernel_size=3, strides=1, padding='same', activation=activation)  # activation is applied once, via the activation argument
        outputs = tf.layers.batch_normalization(outputs, training=is_training)
        l1_shape = outputs.get_shape()

        ''' layer 2'''
        outputs = tf.layers.conv2d(outputs, filters=filter_sizes[0], kernel_size=3, strides=1, padding='same', activation=activation)
        outputs = tf.layers.batch_normalization(outputs, training=is_training)
        l2_shape = outputs.get_shape()

        ''' layer 3'''
        if EncodeProcessDecode_v1.convnet_pooling:
            outputs = tf.layers.max_pooling2d(outputs, 2, 2)
        l3_shape = outputs.get_shape()

        ''' layer 4'''
        outputs = tf.layers.conv2d(outputs, filters=filter_sizes[0], kernel_size=3, strides=1, padding='same', activation=activation)
        outputs = tf.layers.batch_normalization(outputs, training=is_training)
        l4_shape = outputs.get_shape()

        ''' layer 5'''
        if EncodeProcessDecode_v1.convnet_pooling:
            outputs = tf.layers.max_pooling2d(outputs, 2, 2)
        l5_shape = outputs.get_shape()


        ''' layer 6'''
        outputs = tf.layers.conv2d(outputs, filters=filter_sizes[1], kernel_size=3, strides=1, padding='same', activation=activation)
        outputs = tf.layers.batch_normalization(outputs, training=is_training)
        l6_shape = outputs.get_shape()

        ''' layer 7'''
        if EncodeProcessDecode_v1.convnet_pooling:
            outputs = tf.layers.max_pooling2d(outputs, 2, 2)
        l7_shape = outputs.get_shape()

        ''' layer 8'''
        outputs = tf.layers.conv2d(outputs, filters=filter_sizes[1], kernel_size=3, strides=1, padding='same', activation=activation)
        l8_shape = outputs.get_shape()

        ''' layer 9'''
        outputs = tf.layers.conv2d(outputs, filters=filter_sizes[1], kernel_size=3, strides=1, padding='same', activation=activation)
        outputs = tf.layers.batch_normalization(outputs, training=is_training)
        l9_shape = outputs.get_shape()

        ''' layer 10'''
        if EncodeProcessDecode_v1.convnet_pooling:
            outputs = tf.layers.max_pooling2d(outputs, 2, 2)
        l10_shape = outputs.get_shape()

        outputs = tf.layers.batch_normalization(outputs, training=is_training)

        if verbose:
            print("Layer0 encoder output shape", l1_shape)
            print("Layer1 encoder output shape", l2_shape)
            print("Layer2 encoder output shape", l3_shape)
            print("Layer3 encoder output shape", l4_shape)
            print("Layer4 encoder output shape", l5_shape)
            print("Layer5 encoder output shape", l6_shape)
            print("Layer6 encoder output shape", l7_shape)
            print("Layer7 encoder output shape", l8_shape)
            print("Layer8 encoder output shape", l9_shape)
            print("Layer9 encoder output shape", l10_shape)

        ' shape (?, 7, 10, 32) -> (?, dimensions_latent_repr-n_neurons_mlp_nonvisual) '
        visual_latent_output = tf.layers.flatten(outputs)

        ''' layer 11'''
        visual_latent_output = tf.layers.dense(inputs=visual_latent_output, units=EncodeProcessDecode_v1.dimensions_latent_repr - EncodeProcessDecode_v1.n_neurons_mlp_nonvisual)
        return visual_latent_output
Example #6
    def _build(self, inputs, is_training=True, verbose=False):
        filter_sizes = [EncodeProcessDecode_v1.n_conv_filters, EncodeProcessDecode_v1.n_conv_filters * 2]

        if EncodeProcessDecode_v1.convnet_tanh:
            activation = tf.nn.tanh
        else:
            activation = tf.nn.relu

        img_shape = get_correct_image_shape(config=None, get_type='all', depth_data_provided=EncodeProcessDecode_v1.depth_data_provided)

        """ get image data, get everything >except< last n elements which are non-visual """
        image_data = inputs[:, :-EncodeProcessDecode_v1.n_neurons_mlp_nonvisual]

        visual_latent_space_dim = EncodeProcessDecode_v1.dimensions_latent_repr - EncodeProcessDecode_v1.n_neurons_mlp_nonvisual

        """ in order to apply 1x1 2D convolutions, transform shape (batch_size, features) -> shape (batch_size, 1, 1, features)"""
        image_data = tf.expand_dims(image_data, axis=1)
        image_data = tf.expand_dims(image_data, axis=1)  # yields shape (?,1,1,latent_dim)

        #assert is_square(visual_latent_space_dim), "dimension of visual latent space vector (dimensions of latent representation: ({}) - " \
        #                                           "dimensions of non visual latent representation({})) must be square".format(
        #    EncodeProcessDecode.dimensions_latent_repr, EncodeProcessDecode.n_neurons_mlp_nonvisual)

        #image_data = tf.reshape(image_data, (-1, int(math.sqrt(visual_latent_space_dim)), int(math.sqrt(visual_latent_space_dim)), 1))
        #image_data = tf.reshape(image_data, (-1, 7, 10, 5))
        image_data = tf.reshape(image_data, (-1, 7, 10, 15))

        ''' layer 1 (7,10,15) -> (7,10,filter_sizes[1])'''
        outputs = tf.layers.conv2d_transpose(image_data, filters=filter_sizes[1], kernel_size=3, strides=1, padding='same')
        outputs = tf.layers.batch_normalization(outputs, training=is_training)
        outputs = activation(outputs)
        l1_shape = outputs.get_shape()

        ''' layer 2 (7,10,filter_sizes[1]) -> (15,20,filter_sizes[1]) '''
        outputs = tf.layers.conv2d_transpose(outputs, filters=filter_sizes[1], kernel_size=(3, 2), strides=2, padding='valid')
        outputs = tf.layers.batch_normalization(outputs, training=is_training)
        outputs = activation(outputs)
        l2_shape = outputs.get_shape()

        ''' layer 3 (15,20,filter_sizes[1]) -> (15,20,filter_sizes[1]) '''
        outputs = tf.layers.conv2d_transpose(outputs, filters=filter_sizes[1], kernel_size=3, strides=1, padding='same')
        outputs = tf.layers.batch_normalization(outputs, training=is_training)
        outputs = activation(outputs)
        l3_shape = outputs.get_shape()

        ''' layer 4 (15,20,filter_sizes[1]) -> (15,20,filter_sizes[1]) '''
        outputs = tf.layers.conv2d_transpose(outputs, filters=filter_sizes[1], kernel_size=3, strides=1, padding='same')
        outputs = tf.layers.batch_normalization(outputs, training=is_training)
        outputs = activation(outputs)
        l4_shape = outputs.get_shape()

        ''' layer 5 (15,20,filter_sizes[1]) -> (30,40,filter_sizes[1]) '''
        outputs = tf.layers.conv2d_transpose(outputs, filters=filter_sizes[1], kernel_size=3, strides=2, padding='same')
        outputs = tf.layers.batch_normalization(outputs, training=is_training)
        outputs = activation(outputs)
        l5_shape = outputs.get_shape()


        ''' layer 6 (30,40,filter_sizes[1]) -> (30,40,filter_sizes[0]) '''
        outputs = tf.layers.conv2d_transpose(outputs, filters=filter_sizes[0], kernel_size=3, strides=1, padding='same')
        outputs = tf.layers.batch_normalization(outputs, training=is_training)
        outputs = activation(outputs)
        l6_shape = outputs.get_shape()


        ''' layer 7 (30,40,filter_sizes[0]) -> (60,80,filter_sizes[0]) '''
        outputs = tf.layers.conv2d_transpose(outputs, filters=filter_sizes[0], kernel_size=3, strides=2, padding='same')
        outputs = tf.layers.batch_normalization(outputs, training=is_training)
        outputs = activation(outputs)
        l7_shape = outputs.get_shape()

        ''' layer 8 (60,80,filter_sizes[0]) -> (60,80,filter_sizes[0]) '''
        outputs = tf.layers.conv2d_transpose(outputs, filters=filter_sizes[0], kernel_size=3, strides=1, padding='same')
        outputs = tf.layers.batch_normalization(outputs, training=is_training)
        outputs = activation(outputs)
        l8_shape = outputs.get_shape()

        ''' layer 9 (60,80,filter_sizes[0]) -> (120,160,filter_sizes[0]) '''
        outputs = tf.layers.conv2d_transpose(outputs, filters=filter_sizes[0], kernel_size=3, strides=2, padding='same')
        outputs = tf.layers.batch_normalization(outputs, training=is_training)
        outputs = activation(outputs)
        l9_shape = outputs.get_shape()

        ''' layer 10 (120,160,filter_sizes[0]) -> (120,160,filter_sizes[0]) '''
        outputs = tf.layers.conv2d_transpose(outputs, filters=filter_sizes[0], kernel_size=3, strides=1, padding='same')
        outputs = tf.layers.batch_normalization(outputs, training=is_training)
        outputs = activation(outputs)
        l10_shape = outputs.get_shape()

        ''' layer 11 (120,160,filter_sizes[0]) -> (120,160, 3, 4, or 7 channels) '''
        outputs = tf.layers.conv2d_transpose(outputs, filters=img_shape[2], kernel_size=1, strides=1, padding='same')
        outputs = tf.layers.batch_normalization(outputs, training=is_training)
        outputs = activation(outputs)
        l11_shape = outputs.get_shape()

        visual_latent_output = tf.layers.flatten(outputs)

        if verbose:
            print("Image data shape", image_data.get_shape())
            print("Layer1 decoder output shape", l1_shape)
            print("Layer2 decoder output shape", l2_shape)
            print("Layer3 decoder output shape", l3_shape)
            print("Layer4 decoder output shape", l4_shape)
            print("Layer5 decoder output shape", l5_shape)
            print("Layer6 decoder output shape", l6_shape)
            print("Layer7 decoder output shape", l7_shape)
            print("Layer8 decoder output shape", l8_shape)
            print("Layer9 decoder output shape", l9_shape)
            print("Layer10 decoder output shape", l10_shape)
            print("Layer11 decoder output shape", l11_shape)
            print("decoder shape before adding non-visual data", visual_latent_output.get_shape())

        # outputs = tf.nn.dropout(outputs, keep_prob=tf.constant(1.0)) # todo: deal with train/test time

        return visual_latent_output
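The upsampling path relies on two conv2d_transpose size rules: with SAME padding, out = in * stride; with VALID padding, out = (in - 1) * stride + kernel. The asymmetric (3, 2) VALID kernel in layer 2 is exactly what turns 7x10 into 15x20; a quick check:

def deconv_same(size, stride):
    # output size of a SAME-padded conv2d_transpose
    return size * stride

def deconv_valid(size, kernel, stride):
    # output size of a VALID-padded conv2d_transpose
    return (size - 1) * stride + kernel

print(deconv_valid(7, 3, 2), deconv_valid(10, 2, 2))  # 15 20
print(deconv_same(15, 2), deconv_same(20, 2))         # 30 40, and so on up to 120x160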
Example #7
def graph_to_input_and_targets_single_experiment(config,
                                                 graph,
                                                 features,
                                                 initial_pos_vel_known,
                                                 return_only_unpadded=False):
    """Returns 2 graphs with input and target feature vectors for training.

    Args:
      graph: An `nx.DiGraph` instance.

    Returns:
      The input `nx.DiGraph` instance.
      The target `nx.DiGraph` instance.

    Raises:
      ValueError: unknown node type
    """
    gripper_as_global = config.gripper_as_global
    data_offset_manipulable_objects = config.data_offset_manipulable_objects
    experiment_length = features['experiment_length']
    experiment_id = features['experiment_id']
    """ handles the testing cycles when a different number of rollouts shall be predicted than seen in training """
    if config.n_rollouts is not experiment_length:
        experiment_length = config.n_rollouts

    target_graphs = [graph.copy() for _ in range(experiment_length)]

    def create_node_feature(attr, features, step, config):
        if attr['type_name'] == 'container':
            """ container only has object segmentations """
            # pad up to fixed size since sonnet can only handle fixed-sized features
            res = attr['features']
            if config.use_object_seg_data_only_for_init:
                feature = features['object_segments'][1].flatten()
            else:
                feature = features['object_segments'][step][1].flatten()
            res[:feature.shape[0]] = feature
            return res

        elif attr['type_name'] == 'gripper':
            """ gripper only has obj segs and gripper pos """
            if config.use_object_seg_data_only_for_init:
                obj_seg = features['object_segments'][0].flatten()
            else:
                obj_seg = features['object_segments'][step][0].flatten()
            pos = features['gripperpos'][step].flatten().astype(np.float32)
            vel = features['grippervel'][step].flatten().astype(np.float32)
            return np.concatenate((obj_seg, vel, pos))

        elif "manipulable" in attr['type_name']:
            """ we assume shape (image features, vel(3dim), pos(3dim)) """
            obj_id = int(attr['type_name'].split("_")[2])
            obj_id_segs = obj_id + data_offset_manipulable_objects

            # obj_seg will have data as following: (rgb, seg, optionally: depth)
            if config.use_object_seg_data_only_for_init:
                """ in this case, the nodes will have static visual information over time """
                obj_seg = features['object_segments'][obj_id].flatten()
            else:
                """ in this case, the nodes will have dynamic visual information over time """
                obj_seg = features['object_segments'][step][
                    obj_id_segs].astype(np.float32)
                """ nodes have full access to scene observation (i.e. rgb and depth) """
                if config.nodes_get_full_rgb_depth:
                    rgb = features["img"][step].astype(np.float32)
                    depth = features["depth"][step].astype(np.float32)
                    obj_seg[:, :, :3] = rgb
                    obj_seg[:, :, -3:] = depth

                obj_seg = obj_seg.flatten()
            pos = features['objpos'][step][obj_id].flatten().astype(np.float32)

            # normalize velocity
            # """ (normalized) velocity is computed here since rolled indexing in
            # tfrecords seems not straightforward """
            # if step == 0:
            #    diff = np.zeros(shape=3, dtype=np.float32)
            # else:
            #    diff = features['objpos'][step-1][obj_id] - features['objpos'][step][obj_id]
            #    if config.normalize_data:
            #        vel = normalize_list([diff])[0]
            #vel = (diff * 240.0).flatten().astype(np.float32)
            vel = features['objvel'][step][obj_id].flatten().astype(np.float32)
            if config.remove_pos_vel:
                pos = np.zeros(shape=np.shape(pos), dtype=np.float32)
                vel = np.zeros(shape=np.shape(vel), dtype=np.float32)
            return np.concatenate((obj_seg, vel, pos))

    def create_edge_feature_distance(receiver, sender, target_graph_i):
        node_feature_rcv = target_graph_i.nodes(data=True)[receiver]
        node_feature_snd = target_graph_i.nodes(data=True)[sender]
        """ the position is always the last three elements of the flattened feature vector """
        pos1 = node_feature_rcv['features'][-3:]
        pos2 = node_feature_snd['features'][-3:]
        return (pos1 - pos2).astype(np.float32)

    def create_edge_feature(sender,
                            target_graph,
                            target_graph_previous,
                            seg_as_edges,
                            img_shape=None):
        if not seg_as_edges:
            node_feature_snd_prev = target_graph_previous.nodes(
                data=True)[sender]
            node_feature_snd = target_graph.nodes(data=True)[sender]
            """ the position is always the last three elements of the flattened feature vector """
            pos_prev = node_feature_snd_prev["features"][-3:]
            vel_pos = node_feature_snd['features'][-6:]
            vel_pos = np.insert(vel_pos, 3, pos_prev)
            """ will yield (vel_t, pos_{t-1}, pos_t)"""
            return vel_pos.astype(np.float32)
        else:
            node_feature = target_graph.nodes(
                data=True)[sender]['features'][:-6]
            node_feature = np.reshape(node_feature, img_shape)
            return node_feature[:, :, 3].flatten()

    input_control_graphs = []

    for step in range(experiment_length):

        for node_index, node_feature in graph.nodes(data=True):
            node_feature = create_node_feature(node_feature, features, step,
                                               config)
            target_graphs[step].add_node(node_index, features=node_feature)
        """ if gripper_as_global = True, graphs will have one node less
         add globals (image, segmentation, depth, gravity, time_step) """
        if gripper_as_global:
            if config.global_output_size == 5:
                global_features = np.concatenate(
                    (np.atleast_1d(step), np.atleast_1d(constants.g),
                     features['gripperpos'][step].flatten())).astype(
                         np.float32)
            elif config.global_output_size == 9:
                padding_flag = 1 if step >= features[
                    "unpadded_experiment_length"] else 0
                global_features = np.concatenate((
                    np.atleast_1d(padding_flag),
                    np.atleast_1d(step),
                    np.atleast_1d(constants.g),
                    features['gripperpos'][step].flatten(),
                    features['grippervel'][step].flatten(),
                )).astype(np.float32)
            else:
                global_features = np.concatenate(
                    (features['img'][step].flatten(),
                     features['seg'][step].flatten(),
                     features['depth'][step].flatten(), np.atleast_1d(step),
                     np.atleast_1d(constants.g),
                     features['gripperpos'][step].flatten())).astype(
                         np.float32)

            target_graphs[step].graph["features"] = global_features
            """ assign gripperpos to input control graphs """
            input_control_graph = graph.copy()
            for i in range(input_control_graph.number_of_nodes()):
                input_control_graph.nodes(data=True)[i]["features"] = None
            for receiver, sender, edge_feature in input_control_graph.edges(
                    data=True):
                input_control_graph[sender][receiver][0]['features'] = None

            input_control_graph.graph["features"] = global_features

            assert target_graphs[step].graph["features"].shape[
                0] == config.global_output_size
            assert input_control_graph.graph["features"].shape[
                0] == config.global_output_size
            input_control_graphs.append(input_control_graph)

        else:
            if config.global_output_size == 2:
                target_graphs[step].graph["features"] = np.concatenate(
                    (np.atleast_1d(step),
                     np.atleast_1d(constants.g))).astype(np.float32)

            #assert target_graphs[step].graph["features"].shape[0]-3 == config.global_output_size
            input_control_graphs = None
    """ compute distances between every manipulable object (and gripper if not gripper_as_global) """
    for step in range(experiment_length):
        for sender, receiver, edge_feature in target_graphs[step].edges(
                data="features"):
            if step == 0:
                target_graphs_previous = target_graphs[step]
            else:
                target_graphs_previous = target_graphs[step - 1]
            edge_feature = create_edge_feature(
                sender=sender,
                target_graph=target_graphs[step],
                target_graph_previous=target_graphs_previous,
                seg_as_edges=config.edges_carry_segmentation_data,
                img_shape=get_correct_image_shape(config, get_type='all'))
            if config.remove_edges:
                edge_feature = np.zeros(shape=np.shape(edge_feature),
                                        dtype=np.float32)

            target_graphs[step].add_edge(sender,
                                         receiver,
                                         key=0,
                                         features=edge_feature)

    input_graphs = []
    for i in range(experiment_length - 1):
        inp = target_graphs[i].copy()
        """ gripperpos and grippervel always reflect the current step. However, we are interested in predicting
        the effects of a new/next control command --> shift by one """
        inp.graph["features"] = input_control_graphs[i + 1].graph["features"]
        input_graphs.append(inp)

    target_graphs = target_graphs[1:]  # first state is used for init

    # todo: following code assumes all nodes are of type 'manipulable'
    """ set velocity and position info to zero """
    if not initial_pos_vel_known:
        """ for all nodes """
        for graph in input_graphs:
            for idx, node_feature in graph.nodes(data=True):
                feat = node_feature['features']
                feat[-6:] = 0
                graph.add_node(idx, features=feat)
            """ for all edges """
            for receiver, sender, edge_feature in graph.edges(data=True):
                feat = edge_feature['features']
                feat[:] = 0
                graph.add_edge(sender, receiver, features=feat)

    if return_only_unpadded:
        input_graphs = [
            graph for graph in input_graphs if graph.graph['features'][0] == 0
        ]
        target_graphs = [
            graph for graph in target_graphs if graph.graph['features'][0] == 0
        ]
    """ check if the gripper pos+vel in the input graph are values from the next time step """
    assert (input_graphs[0].graph['features'] ==
            target_graphs[0].graph['features']).all()

    return input_graphs, target_graphs, experiment_id
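create_edge_feature builds a 9-float edge vector laid out as (vel_t, pos_{t-1}, pos_t) by splicing the previous-step position into the sender's (vel, pos) tail. A small numpy sketch of that splice (values are placeholders):

import numpy as np

vel_pos = np.arange(6, dtype=np.float32)        # sender's (vel_t(3), pos_t(3))
pos_prev = np.full(3, -1.0, dtype=np.float32)   # sender's pos_{t-1}
edge = np.insert(vel_pos, 3, pos_prev)          # -> (vel_t, pos_{t-1}, pos_t)
print(edge)  # [ 0.  1.  2. -1. -1. -1.  3.  4.  5.]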
Example #8
    def _build(self, inputs, name, verbose=VERBOSITY, keep_dropout_prop=0.7):

        if EncodeProcessDecode_v4_1082_latent_dim_only_seg_skip_connection_one_step.convnet_tanh:
            activation = tf.nn.tanh
        else:
            activation = tf.nn.relu

        n_non_visual_elements = 6 # velocity (x,y,z) and position (x,y,z)

        filter_sizes = [EncodeProcessDecode_v4_1082_latent_dim_only_seg_skip_connection_one_step.n_conv_filters, EncodeProcessDecode_v4_1082_latent_dim_only_seg_skip_connection_one_step.n_conv_filters * 2]

        img_data = inputs[:, :-n_non_visual_elements]  # shape: (batch_size, features)
        img_shape = get_correct_image_shape(config=None, get_type="all", depth_data_provided=EncodeProcessDecode_v4_1082_latent_dim_only_seg_skip_connection_one_step.depth_data_provided)
        img_data = tf.reshape(img_data, [-1, *img_shape])  # -1 means "all", i.e. batch dimension

        ''' layer 1'''
        outputs1 = tf.layers.conv2d(img_data, filters=filter_sizes[0], kernel_size=3, strides=1, padding='same', activation=activation)
        outputs = outputs1  # activation already applied via the conv2d activation argument
        outputs = tf.contrib.layers.layer_norm(outputs)
        l1_shape = outputs.get_shape()

        ''' layer 2'''
        outputs = tf.layers.conv2d(outputs, filters=filter_sizes[0], kernel_size=3, strides=1, padding='same', activation=activation)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l2_shape = outputs.get_shape()

        ''' layer 3'''
        if EncodeProcessDecode_v4_1082_latent_dim_only_seg_skip_connection_one_step.convnet_pooling:
            outputs = tf.layers.max_pooling2d(outputs, 2, 2)
        l3_shape = outputs.get_shape()

        if self.is_training:
            outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        else:
            outputs = tf.nn.dropout(outputs, keep_prob=1.0)

        ''' layer 4'''
        outputs = tf.layers.conv2d(outputs, filters=filter_sizes[0], kernel_size=3, strides=1, padding='same', activation=activation)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l4_shape = outputs.get_shape()

        ''' layer 5'''
        outputs = tf.layers.conv2d(outputs, filters=filter_sizes[0], kernel_size=3, strides=1, padding='same', activation=activation)

        # --------------- SKIP CONNECTION --------------- #
        outputs2 = outputs

        outputs = tf.contrib.layers.layer_norm(outputs)
        l5_shape = outputs.get_shape()

        ''' layer 6'''
        if EncodeProcessDecode_v4_1082_latent_dim_only_seg_skip_connection_one_step.convnet_pooling:
            outputs = tf.layers.max_pooling2d(outputs, 2, 2)
        l6_shape = outputs.get_shape()

        if self.is_training:
            outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        else:
            outputs = tf.nn.dropout(outputs, keep_prob=1.0)

        ''' layer 7'''
        outputs = tf.layers.conv2d(outputs, filters=filter_sizes[1], kernel_size=3, strides=1, padding='same', activation=activation)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l7_shape = outputs.get_shape()

        ''' layer 8'''
        outputs = tf.layers.conv2d(outputs, filters=filter_sizes[1], kernel_size=3, strides=1, padding='same', activation=activation)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l8_shape = outputs.get_shape()

        ''' layer 9'''
        if EncodeProcessDecode_v4_1082_latent_dim_only_seg_skip_connection_one_step.convnet_pooling:
            outputs = tf.layers.max_pooling2d(outputs, 2, 2)
        l9_shape = outputs.get_shape()

        if self.is_training:
            outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        else:
            outputs = tf.nn.dropout(outputs, keep_prob=1.0)

        ''' layer 10'''
        outputs = tf.layers.conv2d(outputs, filters=filter_sizes[1], kernel_size=3, strides=1, padding='same', activation=activation)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l10_shape = outputs.get_shape()

        ''' layer 11'''
        outputs = tf.layers.conv2d(outputs, filters=filter_sizes[1], kernel_size=3, strides=1, padding='same', activation=activation)
        # --------------- SKIP CONNECTION --------------- #
        outputs3 = outputs
        outputs = tf.contrib.layers.layer_norm(outputs)
        l11_shape = outputs.get_shape()

        ''' layer 12'''
        if EncodeProcessDecode_v4_1082_latent_dim_only_seg_skip_connection_one_step.convnet_pooling:
            outputs = tf.layers.max_pooling2d(outputs, 2, 2)
        l12_shape = outputs.get_shape()

        if self.is_training:
            outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        else:
            outputs = tf.nn.dropout(outputs, keep_prob=1.0)

        if verbose:
            print("Layer1 encoder output shape", l1_shape)
            print("Layer2 encoder output shape", l2_shape)
            print("Layer3 encoder output shape", l3_shape)
            print("Layer4 encoder output shape", l4_shape)
            print("Layer5 encoder output shape", l5_shape)
            print("Layer6 encoder output shape", l6_shape)
            print("Layer7 encoder output shape", l7_shape)
            print("Layer8 encoder output shape", l8_shape)
            print("Layer9 encoder output shape", l9_shape)
            print("Layer10 encoder output shape", l10_shape)
            print("Layer11 encoder output shape", l11_shape)
            print("Layer12 encoder output shape", l12_shape)

        ' shape (?, 7, 10, 32) -> (?, n_neurons_nodes_total_dim-n_neurons_nodes_non_visual) '
        visual_latent_output = tf.layers.flatten(outputs)

        ''' layer 13 '''
        visual_latent_output = tf.layers.dense(inputs=visual_latent_output, units=EncodeProcessDecode_v4_1082_latent_dim_only_seg_skip_connection_one_step.n_neurons_nodes_total_dim - EncodeProcessDecode_v4_1082_latent_dim_only_seg_skip_connection_one_step.n_neurons_nodes_non_visual)

        # --------------- SKIP CONNECTION --------------- #
        self.skip1 = outputs1
        self.skip2 = outputs2
        self.skip3 = outputs3

        return visual_latent_output
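The final dense layer reserves part of each node's latent vector for the non-visual state, so the visual width is simply the total width minus the non-visual neurons. A sketch of that bookkeeping with placeholder values (the real constants live on the class; its name suggests a 1082-dim visual latent):

n_neurons_nodes_total_dim = 1088      # hypothetical total node latent width
n_neurons_nodes_non_visual = 6        # velocity (x,y,z) + position (x,y,z)
visual_units = n_neurons_nodes_total_dim - n_neurons_nodes_non_visual  # 1082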
Example #9
    def _build(self, inputs, name, verbose=VERBOSITY, keep_dropout_prop=0.7):
        filter_sizes = [EncodeProcessDecode_v4_1082_latent_dim_only_seg_skip_connection_one_step.n_conv_filters, EncodeProcessDecode_v4_1082_latent_dim_only_seg_skip_connection_one_step.n_conv_filters * 2]

        if EncodeProcessDecode_v4_1082_latent_dim_only_seg_skip_connection_one_step.convnet_tanh:
            activation = tf.nn.tanh
        else:
            activation = tf.nn.relu

        img_shape = get_correct_image_shape(config=None, get_type='all', depth_data_provided=EncodeProcessDecode_v4_1082_latent_dim_only_seg_skip_connection_one_step.depth_data_provided)

        """ get image data, get everything >except< last n elements which are non-visual (position and velocity) """
        image_data = inputs[:, :-EncodeProcessDecode_v4_1082_latent_dim_only_seg_skip_connection_one_step.n_neurons_nodes_non_visual]

        #visual_latent_space_dim = EncodeProcessDecode_v3.n_neurons_nodes_total_dim - EncodeProcessDecode_v3.n_neurons_nodes_total_dim

        """ in order to apply 1x1 2D convolutions, transform shape (batch_size, features) -> shape (batch_size, 1, 1, features)"""
        image_data = tf.expand_dims(image_data, axis=1)
        image_data = tf.expand_dims(image_data, axis=1)  # yields shape (?,1,1,latent_dim)
        image_data = tf.reshape(image_data, (-1, 7, 10, 15))

        ''' layer 1 (7,10,15) -> (7,10,filter_sizes[1])'''
        outputs = tf.layers.conv2d_transpose(image_data, filters=filter_sizes[1], kernel_size=3, strides=1, padding='same')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l1_shape = outputs.get_shape()

        if self.is_training:
            outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        else:
            outputs = tf.nn.dropout(outputs, keep_prob=1.0)

        ''' layer 2 (7,10,filter_sizes[1]) -> (15,20,filter_sizes[1]) '''
        outputs = tf.layers.conv2d_transpose(outputs, filters=filter_sizes[1], kernel_size=(3, 2), strides=2, padding='valid')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l2_shape = outputs.get_shape()

        outputsl2 = outputs

        ''' layer 2_2 (15,20,filter_sizes[1]) -> (15,20,filter_sizes[1]) '''
        # --------------- SKIP CONNECTION --------------- #
        #outputs = tf.concat([outputs, self.skip3], axis=3)
        #outputs = outputs + self.skip3
        #after_skip3 = outputs.get_shape()

        # --------------- SKIP CONNECTION --------------- #
        outputs = tf.layers.conv2d(self.skip3, filters=filter_sizes[1], kernel_size=3, strides=1, padding='same')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l1_2_shape = outputs.get_shape()

        outputs = outputsl2 + outputs
        after_skip3 = outputs.get_shape()

        if self.is_training:
            outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        else:
            outputs = tf.nn.dropout(outputs, keep_prob=1.0)

        ''' layer 3 (15,20,filter_sizes[1]) -> (15,20,filter_sizes[1]) '''
        outputs = tf.layers.conv2d_transpose(outputs, filters=filter_sizes[1], kernel_size=3, strides=1, padding='same')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l3_shape = outputs.get_shape()

        ''' layer 4 (15,20,filter_sizes[1]) -> (15,20,filter_sizes[1]) '''
        outputs = tf.layers.conv2d(outputs, filters=filter_sizes[1], kernel_size=3, strides=1, padding='same')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l4_shape = outputs.get_shape()

        if self.is_training:
            outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        else:
            outputs = tf.nn.dropout(outputs, keep_prob=1.0)

        ''' layer 5 (15,20,filter_sizes[1]) -> (15,20,filter_sizes[1]) '''
        outputs = tf.layers.conv2d_transpose(outputs, filters=filter_sizes[1], kernel_size=3, strides=1, padding='same')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l5_shape = outputs.get_shape()

        if self.is_training:
            outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        else:
            outputs = tf.nn.dropout(outputs, keep_prob=1.0)

        ''' layer 7 (15,20,filter_sizes[1]) -> (30,40,filter_sizes[1]) '''
        outputs = tf.layers.conv2d_transpose(outputs, filters=filter_sizes[1], kernel_size=3, strides=2, padding='same')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l7_shape = outputs.get_shape()

        ''' layer 8 (30,40,filter_sizes[1]) -> (30,40,filter_sizes[0]) '''
        outputs = tf.layers.conv2d(outputs, filters=filter_sizes[0], kernel_size=3, strides=1, padding='same')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l8_shape = outputs.get_shape()

        if self.is_training:
            outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        else:
            outputs = tf.nn.dropout(outputs, keep_prob=1.0)

        ''' layer 9 (30,40,filter_sizes[0]) -> (30,40,filter_sizes[0]) '''
        outputs = tf.layers.conv2d_transpose(outputs, filters=filter_sizes[0], kernel_size=3, strides=1, padding='same')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l9_shape = outputs.get_shape()

        if self.is_training:
            outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        else:
            outputs = tf.nn.dropout(outputs, keep_prob=1.0)

        ''' layer 11 (30,40,filter_sizes[0]) -> (60,80,filter_sizes[0]) '''
        outputs = tf.layers.conv2d_transpose(outputs, filters=filter_sizes[0], kernel_size=3, strides=2, padding='same')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l11_shape = outputs.get_shape()

        ''' layer 12 (60,80,filter_sizes[0]) -> (60,80,filter_sizes[0]) '''
        outputs = tf.layers.conv2d(outputs, filters=filter_sizes[0], kernel_size=3, strides=1, padding='same')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l12_shape = outputs.get_shape()

        if self.is_training:
            outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        else:
            outputs = tf.nn.dropout(outputs, keep_prob=1.0)

        ''' layer 13 (60,80,filter_sizes[0])  -> (60,80,filter_sizes[0]) '''
        outputs = tf.layers.conv2d_transpose(outputs, filters=filter_sizes[0], kernel_size=3, strides=1, padding='same')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l13_shape = outputs.get_shape()

        outputsl13 = outputs

        # --------------- SKIP CONNECTION --------------- #
        #outputs = tf.concat([outputs, self.skip2], axis=3)
        #outputs = outputs + self.skip2
        #after_skip2 = outputs.get_shape()

        ''' layer 14 (60,80,filter_sizes[0]) -> (60,80,filter_sizes[0]) '''
        outputs = tf.layers.conv2d(outputs, filters=filter_sizes[0], kernel_size=3, strides=1, padding='same')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l14_shape = outputs.get_shape()

        # --------------- SKIP CONNECTION --------------- #
        outputs = outputsl13 + outputs
        after_skip2 = outputs.get_shape()

        if self.is_training:
            outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        else:
            outputs = tf.nn.dropout(outputs, keep_prob=1.0)

        ''' layer 15 (60,80,filter_sizes[0]) -> (120,160,filter_sizes[0]) '''
        outputs = tf.layers.conv2d_transpose(outputs, filters=filter_sizes[0], kernel_size=3, strides=2, padding='same')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l15_shape = outputs.get_shape()

        if self.is_training:
            outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        else:
            outputs = tf.nn.dropout(outputs, keep_prob=1.0)

        # --------------- SKIP CONNECTION --------------- #
        #outputs = outputs + self.skip1
        #outputs = tf.concat([outputs, self.skip1], axis=3)
        #after_skip1 = outputs.get_shape()

        ''' layer 18 (120,160,filter_sizes[0]) -> (120,160,filter_sizes[0]) '''
        outputs = tf.layers.conv2d(outputs, filters=filter_sizes[0], kernel_size=1, strides=1, padding='same')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l18_shape = outputs.get_shape()

        ''' layer 17 (120,160,filter_sizes[0]) -> (120,160,filter_sizes[0]) '''
        outputs = tf.layers.conv2d_transpose(outputs, filters=filter_sizes[0], kernel_size=3, strides=1, padding='same')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l17_shape = outputs.get_shape()

        outputs = tf.layers.conv2d(outputs, filters=1, kernel_size=1, strides=1, padding='same', activation=None)  # activation None for logits
        l19_shape = outputs.get_shape()

        visual_latent_output = tf.layers.flatten(outputs)

        if verbose:
            print("Image data shape", image_data.get_shape())
            print("Layer1 decoder output shape", l1_shape)
            print("Layer1_2 decoder output shape", l1_2_shape)
            print("Layer2 decoder output shape", l2_shape)
            print("Layer3 decoder output shape", l3_shape)
            print("Layer4 decoder output shape", l4_shape)
            print("Layer5 decoder output shape", l5_shape)
            print("Layer6 decoder output shape", l7_shape)
            print("Layer7 decoder output shape", l8_shape)
            print("Layer8 decoder output shape", l9_shape)
            print("Layer9 decoder output shape", l11_shape)
            print("Layer10 decoder output shape", l12_shape)
            print("Layer11 decoder output shape", l13_shape)
            print("Layer12 decoder output shape", l14_shape)
            print("Layer13 decoder output shape", l15_shape)
            print("Layer14 decoder output shape", l17_shape)
            print("Layer15 decoder output shape", l18_shape)
            print("Layer16 decoder output shape", l19_shape)
            print("decoder shape before adding non-visual data", visual_latent_output.get_shape())
            print("shape before skip3 {}".format(l1_shape))
            print("shape after skip3 {}".format(after_skip3))
            print("shape before skip2 {}".format(l11_shape))
            print("shape after skip2 {}".format(after_skip2))
            print("shape before skip1 {}".format(l17_shape))
            #print("shape after skip1 {}".format(after_skip1))

        return visual_latent_output
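The decoder above can only reshape its input to (-1, 7, 10, 15) if the visual part of the latent vector carries exactly 7 * 10 * 15 = 1050 features; a quick numpy sanity check of that assumption:

import numpy as np

visual = np.zeros((4, 7 * 10 * 15), dtype=np.float32)  # 1050 visual features per node
grid = visual.reshape(-1, 7, 10, 15)
assert grid.shape == (4, 7, 10, 15)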
Example #10
    def _build(self, inputs, verbose=VERBOSITY, keep_dropout_prop=0.9):

        if EncodeProcessDecode_v6_no_core.convnet_tanh:
            activation = tf.nn.tanh
        else:
            activation = tf.nn.relu

        """ velocity (x,y,z) and position (x,y,z) """
        n_globals = 9
        n_non_visual_elements = 6

        filter_sizes = [EncodeProcessDecode_v6_no_core.n_conv_filters,
                        EncodeProcessDecode_v6_no_core.n_conv_filters * 2]

        """ shape: (batch_size, features), get everything except velocity and position """
        img_data = inputs[:, :-(n_non_visual_elements + n_globals)]
        img_shape = get_correct_image_shape(config=None, get_type="all",
                                            depth_data_provided=EncodeProcessDecode_v6_no_core.depth_data_provided)
        img_data = tf.reshape(img_data, [-1, *img_shape])  # -1 means "all", i.e. batch dimension

        ''' Layer1 encoder output shape (?, 120, 160, 128) '''
        outputs1 = snt.Conv2D(output_channels=128, kernel_shape=3, stride=1, padding="SAME")(img_data)
        outputs1 = activation(outputs1)
        #outputs1 = tf.layers.conv2d(img_data, filters=64, kernel_size=3, strides=1, padding='same', activation=activation, use_bias=False,
        #                            kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=1e-05))

        if EncodeProcessDecode_v6_no_core.conv_layer_instance_norm:
            outputs1 = snt.BatchNorm()(outputs1, is_training=self._is_training)
            #outputs1 = tf.contrib.layers.instance_norm(outputs1)

        l1_shape = outputs1.get_shape()

        ''' Layer2 encoder output shape (?, 120, 160, 128) '''
        outputs = snt.Conv2D(output_channels=128, kernel_shape=3, stride=1, padding="SAME")(outputs1)
        outputs = activation(outputs)
        #outputs = tf.layers.conv2d(outputs1, filters=64, kernel_size=3, strides=1, padding='same', activation=activation, use_bias=False,
        #                           kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=1e-05))

        #if EncodeProcessDecode_v5_no_skip_no_core_no_training_flags_new.conv_layer_instance_norm:
        #    outputs = tf.contrib.layers.instance_norm(outputs)

        l2_shape = outputs.get_shape()

        ''' Layer3 encoder output shape (?, 60, 80, filter_sizes[0]) '''
        if EncodeProcessDecode_v6_no_core.convnet_pooling:
            outputs = tf.layers.average_pooling2d(outputs, 2, 2)
        l3_shape = outputs.get_shape()

        #if is_training:
        #    outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        #else:
        #    outputs = tf.nn.dropout(outputs, keep_prob=1.0)

        ''' Layer4 encoder output shape (?, 60, 80, filter_sizes[0]) '''
        outputs = snt.Conv2D(output_channels=filter_sizes[0], kernel_shape=3, stride=1, padding="SAME")(outputs)
        outputs = activation(outputs)
        #outputs = tf.layers.conv2d(outputs, filters=filter_sizes[0], kernel_size=3, strides=1, padding='same', activation=activation,
        #                           use_bias=False, kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=1e-05))

        if EncodeProcessDecode_v6_no_core.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)
            #outputs = tf.contrib.layers.instance_norm(outputs)

        l4_shape = outputs.get_shape()

        ''' Layer5 encoder output shape (?, 60, 80, filter_sizes[0]) '''
        outputs = snt.Conv2D(output_channels=filter_sizes[0], kernel_shape=3, stride=1, padding="SAME")(outputs)
        outputs = activation(outputs)
        #outputs = tf.layers.conv2d(outputs, filters=filter_sizes[0], kernel_size=3, strides=1, padding='same', activation=activation,
        #                           use_bias=False, kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=1e-05))

        # --------------- SKIP CONNECTION --------------- #
        outputs2 = outputs

        if EncodeProcessDecode_v6_no_core.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)
            #outputs = tf.contrib.layers.instance_norm(outputs)

        l5_shape = outputs.get_shape()

        ''' Layer6 encoder output shape (?, 30, 40, filter_sizes[0]) '''
        if EncodeProcessDecode_v6_no_core.convnet_pooling:
            outputs = tf.layers.average_pooling2d(outputs, 2, 2)
        l6_shape = outputs.get_shape()

        #if is_training:
        #    outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        #else:
        #    outputs = tf.nn.dropout(outputs, keep_prob=1.0)

        ''' Layer7 encoder output shape (?, 30, 40, filter_sizes[0]) '''
        outputs = snt.Conv2D(output_channels=filter_sizes[0], kernel_shape=3, stride=1, padding="SAME")(outputs)
        outputs = activation(outputs)
        #outputs = tf.layers.conv2d(outputs, filters=filter_sizes[0], kernel_size=3, strides=1, padding='same', activation=activation,
        #                           use_bias=False, kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=1e-05))

        if EncodeProcessDecode_v6_no_core.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)
            #outputs = tf.contrib.layers.instance_norm(outputs)

        l7_shape = outputs.get_shape()

        ''' Layer8 encoder output shape (?, 30, 40, filter_sizes[0]) '''
        outputs = snt.Conv2D(output_channels=filter_sizes[0], kernel_shape=3, stride=1, padding="SAME")(outputs)
        outputs = activation(outputs)
        #outputs = tf.layers.conv2d(outputs, filters=filter_sizes[0], kernel_size=3, strides=1, padding='same', activation=activation,
        #                           use_bias=False, kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=1e-05))

        if EncodeProcessDecode_v6_no_core.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)
            #outputs = tf.contrib.layers.instance_norm(outputs)

        l8_shape = outputs.get_shape()

        ''' Layer9 encoder output shape (?, 15, 20, filter_sizes[0]) '''
        if EncodeProcessDecode_v6_no_core.convnet_pooling:
            outputs = tf.layers.average_pooling2d(outputs, 2, 2)
        l9_shape = outputs.get_shape()

        #if is_training:
        #    outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        #else:
        #    outputs = tf.nn.dropout(outputs, keep_prob=1.0)

        ''' Layer10 encoder output shape (?, 15, 20, filter_sizes[1]) '''
        outputs = snt.Conv2D(output_channels=filter_sizes[1], kernel_shape=3, stride=1, padding="SAME")(outputs)
        outputs = activation(outputs)
        #outputs = tf.layers.conv2d(outputs, filters=filter_sizes[1], kernel_size=3, strides=1, padding='same', activation=activation,
        #                           use_bias=False, kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=1e-05))

        if EncodeProcessDecode_v6_no_core.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)
            #outputs = tf.contrib.layers.instance_norm(outputs)

        l10_shape = outputs.get_shape()

        ''' Layer11 encoder output shape (?, 15, 20, filter_sizes[1]) '''
        outputs = snt.Conv2D(output_channels=filter_sizes[1], kernel_shape=3, stride=1, padding="SAME")(outputs)
        outputs = activation(outputs)
        #outputs = tf.layers.conv2d(outputs, filters=filter_sizes[1], kernel_size=3, strides=1, padding='same', activation=activation,
        #                           use_bias=False, kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=1e-05))
        # --------------- SKIP CONNECTION --------------- #
        outputs3 = outputs

        if EncodeProcessDecode_v6_no_core.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)
            #outputs = tf.contrib.layers.instance_norm(outputs)

        l11_shape = outputs.get_shape()

        ''' Layer12 encoder output shape (?, 7, 10, filter_sizes[1]) '''
        if EncodeProcessDecode_v6_no_core.convnet_pooling:
            outputs = tf.layers.average_pooling2d(outputs, 2, 2)
        l12_shape = outputs.get_shape()

        ''' Layer13 encoder output shape (?, 4, 5, filter_sizes[1]) '''
        outputs = snt.Conv2D(output_channels=filter_sizes[1], kernel_shape=3, stride=2, padding="SAME")(outputs)
        outputs = activation(outputs)
        #outputs = tf.layers.conv2d(outputs, filters=filter_sizes[1], kernel_size=3, strides=2, padding='same', activation=activation,
        #                           use_bias=False, kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=1e-05))

        if EncodeProcessDecode_v6_no_core.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)
            #outputs = tf.contrib.layers.instance_norm(outputs)

        l13_shape = outputs.get_shape()

        ''' Layer14 encoder output shape (?, 2, 3, filter_sizes[1]) '''
        outputs = snt.Conv2D(output_channels=filter_sizes[1], kernel_shape=3, stride=2, padding="SAME")(outputs)
        outputs = activation(outputs)
        #outputs = tf.layers.conv2d(outputs, filters=filter_sizes[1], kernel_size=3, strides=2, padding='same', activation=activation,
        #                           use_bias=False, kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=1e-05))

        if EncodeProcessDecode_v6_no_core.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)
            #outputs = tf.contrib.layers.instance_norm(outputs)

        l14_shape = outputs.get_shape()

        ''' Layer15 encoder output shape (?, 1, 1, filter_sizes[1]) '''
        if EncodeProcessDecode_v6_no_core.convnet_pooling:
            outputs = tf.layers.average_pooling2d(outputs, 2, 2)
        l15_shape = outputs.get_shape()

        if verbose:
            print("Layer1 encoder output shape", l1_shape)
            print("Layer2 encoder output shape", l2_shape)
            print("Layer3 encoder output shape", l3_shape)
            print("Layer4 encoder output shape", l4_shape)
            print("Layer5 encoder output shape", l5_shape)
            print("Layer6 encoder output shape", l6_shape)
            print("Layer7 encoder output shape", l7_shape)
            print("Layer8 encoder output shape", l8_shape)
            print("Layer9 encoder output shape", l9_shape)
            print("Layer10 encoder output shape", l10_shape)
            print("Layer11 encoder output shape", l11_shape)
            print("Layer12 encoder output shape", l12_shape)
            print("Layer13 encoder output shape", l13_shape)
            print("Layer14 encoder output shape", l14_shape)
            print("Layer15 encoder output shape", l15_shape)

        # flatten: (?, 1, 1, filter_sizes[1]) -> (?, filter_sizes[1])
        visual_latent_output = tf.layers.flatten(outputs)
        # visual_latent_output = tf.layers.dense(inputs=visual_latent_output, units=EncodeProcessDecode_v4_172_improve_shapes_exp1.n_neurons_nodes_total_dim - EncodeProcessDecode_v4_172_improve_shapes_exp1.n_neurons_nodes_non_visual)

        # --------------- SKIP CONNECTION --------------- #
        self.skip1 = outputs1
        self.skip2 = outputs2
        self.skip3 = outputs3


        gripper_input = inputs[:, -n_globals:]  # last n_globals (9) features: global state, including gripper position and velocity

        n_neurons = EncodeProcessDecode_v6_no_core.n_neurons_nodes_non_visual
        n_layers = EncodeProcessDecode_v6_no_core.n_neurons_nodes_non_visual
        output_size = EncodeProcessDecode_v6_no_core.n_neurons_nodes_non_visual
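        # note: n_neurons, n_layers and output_size all reuse n_neurons_nodes_non_visual,
        # so the MLP depth equals its width; this may be unintentional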
        net = snt.nets.MLP([n_neurons] * n_layers, activate_final=False)
        """ map velocity and position into a latent space, concatenate with visual latent space vector """
        gripper_latent_output = snt.Sequential([net, snt.LayerNorm(), snt.Linear(output_size)])(gripper_input)

        outputs = tf.concat([visual_latent_output, gripper_latent_output], axis=1)
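        # resulting width (assuming pooling is enabled): filter_sizes[1] from the
        # flattened 1x1 visual map + n_neurons_nodes_non_visual from the gripper MLP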

        if verbose:
            print("final encoder output shape", outputs.get_shape())

        return outputs
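The shape comments in the encoder above claim that a 120x160 input collapses to a 1x1 map by Layer15. A minimal, framework-free sketch of that arithmetic (assuming pooling is enabled, "SAME" padding on the stride-2 convolutions, and the default non-padded 2x2/stride-2 pooling, as in the code) reproduces the schedule; the stride-1 convolutions in between leave the spatial size unchanged:

import math

def same_conv(h, w, stride):
    # "SAME"-padded convolution: output = ceil(input / stride)
    return math.ceil(h / stride), math.ceil(w / stride)

def pool2x2(h, w):
    # 2x2 pooling with stride 2 and "valid" padding: output = floor(input / 2)
    return h // 2, w // 2

h, w = 120, 160
h, w = pool2x2(h, w)       # Layer3  -> (60, 80)
h, w = pool2x2(h, w)       # Layer6  -> (30, 40)
h, w = pool2x2(h, w)       # Layer9  -> (15, 20)
h, w = pool2x2(h, w)       # Layer12 -> (7, 10)
h, w = same_conv(h, w, 2)  # Layer13 -> (4, 5)
h, w = same_conv(h, w, 2)  # Layer14 -> (2, 3)
h, w = pool2x2(h, w)       # Layer15 -> (1, 1)
print(h, w)                # 1 1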
    def _build(self, inputs, name, verbose=VERBOSITY, keep_dropout_prop=0.7):
        filter_sizes = [
            EncodeProcessDecode_v3_1114_latent_dim.n_conv_filters,
            EncodeProcessDecode_v3_1114_latent_dim.n_conv_filters * 2
        ]

        if EncodeProcessDecode_v3_1114_latent_dim.convnet_tanh:
            activation = tf.nn.tanh
        else:
            activation = tf.nn.relu

        img_shape = get_correct_image_shape(
            config=None,
            get_type='all',
            depth_data_provided=EncodeProcessDecode_v3_1114_latent_dim.
            depth_data_provided)
        """ get image data, get everything >except< last n elements which are non-visual (position and velocity) """
        image_data = inputs[:, :-EncodeProcessDecode_v3_1114_latent_dim.
                            n_neurons_nodes_non_visual]

        #visual_latent_space_dim = EncodeProcessDecode_v3.n_neurons_nodes_total_dim - EncodeProcessDecode_v3.n_neurons_nodes_total_dim
        """ in order to apply 1x1 2D convolutions, transform shape (batch_size, features) -> shape (batch_size, 1, 1, features)"""
        image_data = tf.expand_dims(image_data, axis=1)
        image_data = tf.expand_dims(image_data,
                                    axis=1)  # yields shape (?,1,1,latent_dim)
        image_data = tf.reshape(image_data, (-1, 7, 10, 15))
        ''' layer 1 (7,10,15) -> (7,10,filter_sizes[1]) '''
        outputs = tf.layers.conv2d_transpose(image_data,
                                             filters=filter_sizes[1],
                                             kernel_size=3,
                                             strides=1,
                                             padding='same')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l1_shape = outputs.get_shape()
        ''' layer 2 (7,10,filter_sizes[1]) -> (15,20,filter_sizes[1]) '''
        outputs = tf.layers.conv2d_transpose(outputs,
                                             filters=filter_sizes[1],
                                             kernel_size=(3, 2),
                                             strides=2,
                                             padding='valid')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l2_shape = outputs.get_shape()

        if self.is_training:
            outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        else:
            outputs = tf.nn.dropout(outputs, keep_prob=1.0)
        ''' layer 3 (15,20,filter_sizes[1]) -> (15,20,filter_sizes[1]) '''
        outputs = tf.layers.conv2d_transpose(outputs,
                                             filters=filter_sizes[1],
                                             kernel_size=3,
                                             strides=1,
                                             padding='same')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l3_shape = outputs.get_shape()
        ''' layer 4 (15,20,filter_sizes[1]) -> (15,20,filter_sizes[1]) '''
        outputs = tf.layers.conv2d_transpose(outputs,
                                             filters=filter_sizes[1],
                                             kernel_size=3,
                                             strides=1,
                                             padding='same')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l4_shape = outputs.get_shape()

        if self.is_training:
            outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        else:
            outputs = tf.nn.dropout(outputs, keep_prob=1.0)
        ''' layer 5 (15,20,filter_sizes[1]) -> (30,40,filter_sizes[1]) '''
        outputs = tf.layers.conv2d_transpose(outputs,
                                             filters=filter_sizes[1],
                                             kernel_size=3,
                                             strides=2,
                                             padding='same')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l5_shape = outputs.get_shape()
        ''' layer 6 (30,40,filter_sizes[1]) -> (30,40,filter_sizes[0]) '''
        outputs = tf.layers.conv2d_transpose(outputs,
                                             filters=filter_sizes[0],
                                             kernel_size=3,
                                             strides=1,
                                             padding='same')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l6_shape = outputs.get_shape()

        if self.is_training:
            outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        else:
            outputs = tf.nn.dropout(outputs, keep_prob=1.0)
        ''' layer 7 (30,40,filter_sizes[0]) -> (60,80,filter_sizes[0]) '''
        outputs = tf.layers.conv2d_transpose(outputs,
                                             filters=filter_sizes[0],
                                             kernel_size=3,
                                             strides=2,
                                             padding='same')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l7_shape = outputs.get_shape()
        ''' layer 8 (60,80,filter_sizes[0]) -> (60,80,filter_sizes[0]) '''
        outputs = tf.layers.conv2d_transpose(outputs,
                                             filters=filter_sizes[0],
                                             kernel_size=3,
                                             strides=1,
                                             padding='same')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l8_shape = outputs.get_shape()

        if self.is_training:
            outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        else:
            outputs = tf.nn.dropout(outputs, keep_prob=1.0)
        ''' layer 9 (60,80,filter_sizes[0]) -> (120,160,filter_sizes[0]) '''
        outputs = tf.layers.conv2d_transpose(outputs,
                                             filters=filter_sizes[0],
                                             kernel_size=3,
                                             strides=2,
                                             padding='same')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l9_shape = outputs.get_shape()
        ''' layer 10 (120,160,filter_sizes[0]) -> (120,160,filter_sizes[0]) '''
        outputs = tf.layers.conv2d_transpose(outputs,
                                             filters=filter_sizes[0],
                                             kernel_size=3,
                                             strides=1,
                                             padding='same')
        outputs = activation(outputs)
        outputs = tf.contrib.layers.layer_norm(outputs)
        l10_shape = outputs.get_shape()

        if self.is_training:
            outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        else:
            outputs = tf.nn.dropout(outputs, keep_prob=1.0)
        ''' layer 11 (120,160,filter_sizes[0]) -> (120,160,img_shape[2]), i.e. 3, 4 or 7 output channels '''
        outputs = tf.layers.conv2d_transpose(outputs,
                                             filters=img_shape[2],
                                             kernel_size=1,
                                             strides=1,
                                             padding='same')
        outputs = activation(outputs)
        l11_shape = outputs.get_shape()

        visual_latent_output = tf.layers.flatten(outputs)

        if verbose:
            print("Image data shape", image_data.get_shape())
            print("Layer1 decoder output shape", l1_shape)
            print("Layer2 decoder output shape", l2_shape)
            print("Layer3 decoder output shape", l3_shape)
            print("Layer4 decoder output shape", l4_shape)
            print("Layer5 decoder output shape", l5_shape)
            print("Layer6 decoder output shape", l6_shape)
            print("Layer7 decoder output shape", l7_shape)
            print("Layer8 decoder output shape", l8_shape)
            print("Layer9 decoder output shape", l9_shape)
            print("Layer10 decoder output shape", l10_shape)
            print("Layer11 decoder output shape", l11_shape)
            print("decoder shape before adding non-visual data",
                  visual_latent_output.get_shape())

        return visual_latent_output
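All but one of the transposed convolutions in the decoder above use "same" padding, where the output size is simply input * stride. The exception is layer 2, which needs a (3, 2) kernel with stride 2 and "valid" padding to reach (15, 20) exactly from (7, 10), since "valid" transposed convolutions produce out = (in - 1) * stride + kernel. A short sketch of both formulas, assuming the standard TensorFlow size rules:

def deconv_valid(size, kernel, stride):
    # tf.layers.conv2d_transpose output size with "valid" padding
    return (size - 1) * stride + kernel

def deconv_same(size, stride):
    # tf.layers.conv2d_transpose output size with "same" padding
    return size * stride

print(deconv_valid(7, 3, 2), deconv_valid(10, 2, 2))  # layer 2: 15 20
print(deconv_same(15, 2), deconv_same(20, 2))         # layer 5: 30 40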
Example #12
    def _build(self, inputs, verbose=VERBOSITY, keep_dropout_prop=0.9):

        if EncodeProcessDecode_v8_edge_segmentation.convnet_tanh:
            activation = tf.nn.tanh
        else:
            activation = tf.nn.relu
        """ velocity (x,y,z) and position (x,y,z) """

        n_non_visual_elements = 6

        filter_sizes = [
            EncodeProcessDecode_v8_edge_segmentation.n_conv_filters,
            EncodeProcessDecode_v8_edge_segmentation.n_conv_filters * 2
        ]
        """ shape: (batch_size, features), get everything except velocity and position """

        img_data = inputs[:, :-n_non_visual_elements]

        img_shape = get_correct_image_shape(
            config=None,
            get_type="all",
            depth_data_provided=EncodeProcessDecode_v8_edge_segmentation.
            depth_data_provided)
        img_data = tf.reshape(
            img_data, [-1, *img_shape])  # -1 lets reshape infer the batch dimension
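        # note: unlike v6_no_core, this variant strips only the 6 non-visual node
        # features from the input; there is no separate global-feature block here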
        ''' Layer1 encoder output shape (?, 120, 160, 128) '''
        outputs1 = snt.Conv2D(output_channels=128,
                              kernel_shape=3,
                              stride=1,
                              padding="SAME")(img_data)
        outputs1 = activation(outputs1)

        if EncodeProcessDecode_v8_edge_segmentation.conv_layer_instance_norm:
            outputs1 = snt.BatchNorm()(outputs1, is_training=self._is_training)

        l1_shape = outputs1.get_shape()
        ''' Layer2 encoder output shape (?, 120, 160, 128) '''
        outputs = snt.Conv2D(output_channels=128,
                             kernel_shape=3,
                             stride=1,
                             padding="SAME")(outputs1)
        outputs = activation(outputs)

        if EncodeProcessDecode_v8_edge_segmentation.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)

        l2_shape = outputs.get_shape()
        ''' Layer3 encoder output shape (?, 60, 80, filter_sizes[0]) '''
        if EncodeProcessDecode_v8_edge_segmentation.convnet_pooling:
            outputs = tf.layers.max_pooling2d(outputs, 2, 2)
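            # pooling choice: this variant uses max pooling where v6_no_core used average pooling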
        l3_shape = outputs.get_shape()

        #if is_training:
        #    outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        #else:
        #    outputs = tf.nn.dropout(outputs, keep_prob=1.0)
        ''' Layer4 encoder output shape (?, 60, 80, filter_sizes[0]) '''
        outputs = snt.Conv2D(output_channels=filter_sizes[0],
                             kernel_shape=3,
                             stride=1,
                             padding="SAME")(outputs)
        outputs = activation(outputs)

        if EncodeProcessDecode_v8_edge_segmentation.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)

        l4_shape = outputs.get_shape()
        ''' Layer5 encoder output shape (?, 60, 80, filter_sizes[0]) '''
        outputs = snt.Conv2D(output_channels=filter_sizes[0],
                             kernel_shape=3,
                             stride=1,
                             padding="SAME")(outputs)
        outputs = activation(outputs)

        # --------------- SKIP CONNECTION --------------- #
        outputs2 = outputs

        if EncodeProcessDecode_v8_edge_segmentation.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)

        l5_shape = outputs.get_shape()
        ''' Layer6 encoder output shape (?, 30, 40, filter_sizes[0]) '''
        if EncodeProcessDecode_v8_edge_segmentation.convnet_pooling:
            outputs = tf.layers.max_pooling2d(outputs, 2, 2)
        l6_shape = outputs.get_shape()

        #if is_training:
        #    outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        #else:
        #    outputs = tf.nn.dropout(outputs, keep_prob=1.0)
        ''' Layer7 encoder output shape (?, 30, 40, filter_sizes[0]) '''
        outputs = snt.Conv2D(output_channels=filter_sizes[0],
                             kernel_shape=3,
                             stride=1,
                             padding="SAME")(outputs)
        outputs = activation(outputs)

        if EncodeProcessDecode_v8_edge_segmentation.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)

        l7_shape = outputs.get_shape()
        ''' Layer8 encoder output shape (?, 30, 40, filter_sizes[0]) '''
        outputs = snt.Conv2D(output_channels=filter_sizes[0],
                             kernel_shape=3,
                             stride=1,
                             padding="SAME")(outputs)
        outputs = activation(outputs)

        if EncodeProcessDecode_v8_edge_segmentation.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)

        l8_shape = outputs.get_shape()
        ''' Layer9 encoder output shape (?, 15, 20, filter_sizes[0]) '''
        if EncodeProcessDecode_v8_edge_segmentation.convnet_pooling:
            outputs = tf.layers.max_pooling2d(outputs, 2, 2)
        l9_shape = outputs.get_shape()

        #if is_training:
        #    outputs = tf.nn.dropout(outputs, keep_prob=keep_dropout_prop)
        #else:
        #    outputs = tf.nn.dropout(outputs, keep_prob=1.0)
        ''' Layer10 encoder output shape (?, 15, 20, filter_sizes[1]) '''
        outputs = snt.Conv2D(output_channels=filter_sizes[1],
                             kernel_shape=3,
                             stride=1,
                             padding="SAME")(outputs)
        outputs = activation(outputs)

        if EncodeProcessDecode_v8_edge_segmentation.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)

        l10_shape = outputs.get_shape()
        ''' Layer11 encoder output shape (?, 15, 20, filter_sizes[1]) '''
        outputs = snt.Conv2D(output_channels=filter_sizes[1],
                             kernel_shape=3,
                             stride=1,
                             padding="SAME")(outputs)
        outputs = activation(outputs)

        # --------------- SKIP CONNECTION --------------- #
        outputs3 = outputs

        if EncodeProcessDecode_v8_edge_segmentation.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)

        l11_shape = outputs.get_shape()
        ''' Layer12 encoder output shape (?, 7, 10, filter_sizes[1]) '''
        if EncodeProcessDecode_v8_edge_segmentation.convnet_pooling:
            outputs = tf.layers.max_pooling2d(outputs, 2, 2)
        l12_shape = outputs.get_shape()
        ''' Layer13 encoder output shape (?, 4, 5, filter_sizes[1]) '''
        outputs = snt.Conv2D(output_channels=filter_sizes[1],
                             kernel_shape=3,
                             stride=2,
                             padding="SAME")(outputs)
        outputs = activation(outputs)

        if EncodeProcessDecode_v8_edge_segmentation.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)

        l13_shape = outputs.get_shape()
        ''' Layer14 encoder output shape (?, 2, 3, filter_sizes[1]) '''
        outputs = snt.Conv2D(output_channels=filter_sizes[1],
                             kernel_shape=3,
                             stride=2,
                             padding="SAME")(outputs)
        outputs = activation(outputs)

        if EncodeProcessDecode_v8_edge_segmentation.conv_layer_instance_norm:
            outputs = snt.BatchNorm()(outputs, is_training=self._is_training)

        l14_shape = outputs.get_shape()
        ''' Layer15 encoder output shape (?, 1, 1, filter_sizes[1]) '''
        if EncodeProcessDecode_v8_edge_segmentation.convnet_pooling:
            outputs = tf.layers.max_pooling2d(outputs, 2, 2)
        l15_shape = outputs.get_shape()

        if verbose:
            print("Layer1 encoder output shape", l1_shape)
            print("Layer2 encoder output shape", l2_shape)
            print("Layer3 encoder output shape", l3_shape)
            print("Layer4 encoder output shape", l4_shape)
            print("Layer5 encoder output shape", l5_shape)
            print("Layer6 encoder output shape", l6_shape)
            print("Layer7 encoder output shape", l7_shape)
            print("Layer8 encoder output shape", l8_shape)
            print("Layer9 encoder output shape", l9_shape)
            print("Layer10 encoder output shape", l10_shape)
            print("Layer11 encoder output shape", l11_shape)
            print("Layer12 encoder output shape", l12_shape)
            print("Layer13 encoder output shape", l13_shape)
            print("Layer14 encoder output shape", l14_shape)
            print("Layer15 encoder output shape", l15_shape)

        # flatten: (?, 1, 1, filter_sizes[1]) -> (?, filter_sizes[1])
        visual_latent_output = tf.layers.flatten(outputs)

        # --------------- SKIP CONNECTION --------------- #
        self.skip1 = outputs1
        self.skip2 = outputs2
        self.skip3 = outputs3

        if verbose:
            print("final encoder output shape", outputs.get_shape())

        return visual_latent_output
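This encoder stores skip1 (120x160), skip2 (60x80) and skip3 (15x20) on self but never reads them back; a matching decoder is expected to consume them. The fragment below is a hypothetical, U-Net-style fusion sketch; fuse_with_skip and the concat-based wiring are illustrative assumptions, not this codebase's actual decoder:

import tensorflow as tf
import sonnet as snt

def fuse_with_skip(outputs, skip, n_channels, activation=tf.nn.relu):
    # hypothetical helper: concatenate a decoder feature map with an encoder
    # skip tensor along the channel axis, then mix with a 3x3 convolution
    outputs = tf.concat([outputs, skip], axis=-1)
    outputs = snt.Conv2D(output_channels=n_channels, kernel_shape=3,
                         stride=1, padding="SAME")(outputs)
    return activation(outputs)

# e.g. once the decoder has upsampled back to (?, 15, 20, C):
#   outputs = fuse_with_skip(outputs, encoder.skip3, filter_sizes[1])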