Example #1
def normalize(inp, activation, reuse, scope):
    if FLAGS.norm == 'batch_norm':
        return tf_layers.batch_norm(inp, activation_fn=activation, reuse=reuse, scope=scope)
    elif FLAGS.norm == 'layer_norm':
        return tf_layers.layer_norm(inp, activation_fn=activation, reuse=reuse, scope=scope)
    elif FLAGS.norm == 'None':
        if activation is not None:
            return activation(inp)
        else:
            return inp
    else:
        raise ValueError('Invalid normalization method: %s' % FLAGS.norm)
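
A minimal usage sketch for normalize (assumptions: TF 1.x, tf_layers bound to tf.contrib.layers, and a FLAGS.norm flag registered via tf.app.flags as in the surrounding module):

import tensorflow as tf
from tensorflow.contrib import layers as tf_layers

# Hypothetical flag registration; the original module defines FLAGS elsewhere.
tf.app.flags.DEFINE_string('norm', 'layer_norm', 'batch_norm | layer_norm | None')
FLAGS = tf.app.flags.FLAGS

inp = tf.placeholder(tf.float32, [None, 8, 8, 32])
# Selects the normalizer according to FLAGS.norm and applies the activation.
out = normalize(inp, activation=tf.nn.relu, reuse=False, scope='norm1')
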
def encoder_model(frames, sequence_length, initializer, keep_prob_dropout=0.9, scope='encoder', fc_conv_layer=False):
  """
  Args:
    frames: 5D array of batch with videos - shape(batch_size, num_frames, frame_width, frame_higth, num_channels)
    sequence_length: number of frames that shall be encoded
    scope: tensorflow variable scope name
    initializer: specifies the initialization type (default: contrib.slim.layers uses Xavier init with uniform data)
    fc_conv_layer: adds an fc layer at the end of the encoder
  Returns:
    hidden4: hidden state of highest ConvLSTM layer
    fc_conv_layer: indicated whether a Fully Convolutional (8x8x16 -> 1x1x1024) shall be added
  """

  lstm_state1, lstm_state2, lstm_state3, lstm_state4, lstm_state5, lstm_state6 = None, None, None, None, None, None

  for i in range(sequence_length):

    frame = frames[:,i,:,:,:]

    reuse = (i > 0)

    with tf.variable_scope(scope, reuse=reuse):
      #LAYER 1: conv1
      conv1 = slim.layers.conv2d(frame, 32, [5, 5], stride=2, scope='conv1', normalizer_fn=tf_layers.layer_norm, weights_initializer=initializer,
          normalizer_params={'scope': 'layer_norm1'})
      conv1 = tf.nn.dropout(conv1, keep_prob_dropout)

      #LAYER 2: convLSTM1
      hidden1, lstm_state1 = basic_conv_lstm_cell(conv1, lstm_state1, 32, initializer, filter_size=5, scope='convlstm1')
      hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')
      hidden1 = tf.nn.dropout(hidden1, keep_prob_dropout)

      #LAYER 3: conv2
      conv2 = slim.layers.conv2d(hidden1, hidden1.get_shape()[3], [5, 5], stride=2, scope='conv2', normalizer_fn=tf_layers.layer_norm, weights_initializer=initializer,
                                  normalizer_params={'scope': 'layer_norm3'})
      conv2 = tf.nn.dropout(conv2, keep_prob_dropout)

      #LAYER 4: convLSTM2
      hidden2, lstm_state2 = basic_conv_lstm_cell(conv2, lstm_state2, 32, initializer, filter_size=5, scope='convlstm2')
      hidden2 = tf_layers.layer_norm(hidden2, scope='layer_norm4')
      hidden2 = tf.nn.dropout(hidden2, keep_prob_dropout)

      #LAYER 5: conv3
      conv3 = slim.layers.conv2d(hidden2, hidden2.get_shape()[3], [5, 5], stride=2, scope='conv3', normalizer_fn=tf_layers.layer_norm, weights_initializer=initializer,
                                  normalizer_params={'scope': 'layer_norm5'})
      conv3 = tf.nn.dropout(conv3, keep_prob_dropout)

      #LAYER 6: convLSTM3
      hidden3, lstm_state3 = basic_conv_lstm_cell(conv3, lstm_state3, 32, initializer, filter_size=3, scope='convlstm3')
      hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm6')
      hidden3 = tf.nn.dropout(hidden3, keep_prob_dropout)

      #LAYER 7: conv4
      conv4 = slim.layers.conv2d(hidden3, hidden3.get_shape()[3], [3, 3], stride=2, scope='conv4', normalizer_fn=tf_layers.layer_norm, weights_initializer=initializer,
                                 normalizer_params={'scope': 'layer_norm7'})
      conv4 = tf.nn.dropout(conv4, keep_prob_dropout)

      #LAYER 8: convLSTM4 (8x8 feature map size)
      hidden4, lstm_state4 = basic_conv_lstm_cell(conv4, lstm_state4, 64, initializer, filter_size=3, scope='convlstm4')
      hidden4 = tf_layers.layer_norm(hidden4, scope='layer_norm8')
      hidden4 = tf.nn.dropout(hidden4, keep_prob_dropout)

      #LAYER 9: conv5
      conv5 = slim.layers.conv2d(hidden4, hidden4.get_shape()[3], [3, 3], stride=2, scope='conv5', normalizer_fn=tf_layers.layer_norm, weights_initializer=initializer, 
                                 normalizer_params={'scope': 'layer_norm9'})
      conv5 = tf.nn.dropout(conv5, keep_prob_dropout)

      # LAYER 10: convLSTM5 (4x4 feature map size)
      hidden5, lstm_state5 = basic_conv_lstm_cell(conv5, lstm_state5, 64, initializer, filter_size=3, scope='convlstm5')
      hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm10')
      hidden5 = tf.nn.dropout(hidden5, keep_prob_dropout)

      # LAYER 11: Fully Convolutional Layer (4x4x128 --> 1x1xFC_LAYER_SIZE)
      # necessary for dimension compatibility with conv lstm cell
      fc_conv = slim.layers.conv2d(hidden5, FC_LAYER_SIZE, [4,4], stride=1, scope='fc_conv', padding='VALID', weights_initializer=initializer)
      fc_conv = tf.nn.dropout(fc_conv, keep_prob_dropout)

      # LAYER 12: Fully Convolutional LSTM (1x1x256 -> 1x1x128)
      hidden6, lstm_state6 = basic_conv_lstm_cell(fc_conv, lstm_state6, FC_LSTM_LAYER_SIZE, initializer, filter_size=1, scope='convlstm6')
      # no dropout since it's the last encoder layer --> hidden repr should be steady

      # mu and sigma for sampling latent variable
      sigma = slim.layers.fully_connected(inputs=lstm_state6, num_outputs=VAE_REPR_SIZE, activation_fn=tf.nn.softplus)
      mu = slim.layers.fully_connected(inputs=lstm_state6, num_outputs=VAE_REPR_SIZE, activation_fn=None)

      # reparameterization trick: allows backprop to flow through the deterministic nodes sigma and mu
      z = mu + sigma * tf.random_normal(tf.shape(mu), mean=0., stddev=1.)

  return z, mu, sigma
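
A sketch of how this encoder might be wired up (hypothetical shapes; assumes basic_conv_lstm_cell, FC_LAYER_SIZE, FC_LSTM_LAYER_SIZE and VAE_REPR_SIZE are defined in the same module, and a 128x128 input so the [4,4] VALID fc_conv sees a 4x4 map):

import tensorflow as tf

# batch of 8 videos, 10 frames each, 128x128 RGB (hypothetical shapes)
frames = tf.placeholder(tf.float32, [8, 10, 128, 128, 3])
initializer = tf.contrib.layers.xavier_initializer()

z, mu, sigma = encoder_model(frames, sequence_length=10,
                             initializer=initializer, keep_prob_dropout=0.9)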
Example #3
    def _norm(self, inp, scope=None):
        reuse = tf.get_variable_scope().reuse
        with vs.variable_scope(scope or "norm") as scope:
            normalized = layer_norm(inp, reuse=reuse, scope=scope)
            return normalized
Example #4
def inference(images, scope='RNN'):
    # train network
    with tf.name_scope(scope, values=[images]):
        images = image_slicer(images)
        reuse = None
            
        #======================#                                       
        #Scale 1: coarse level #
        #======================#  
        
        #======Output size: 96x128x64
        #Layer 1:
        conv1=cnv.conv(images,'conv1',[3, 3, 3, 64],stride=[1,2,2, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu1= tf.nn.leaky_relu(conv1,alpha=0.1)             
        relu1 = tf_layers.layer_norm(relu1, scope='layer_norm1',reuse=reuse)
        
        #Layer 2:
        conv2=cnv.conv(relu1,'conv2',[5, 5, 64, 64],stride=[1,1,1, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu2=tf.nn.leaky_relu(conv2,alpha=0.1)            
        relu2 = tf_layers.layer_norm(relu2, scope='layer_norm2',reuse=reuse)
                        
        #======Output size: 48x64x128                             
        #Layer 3
        conv3=cnv.conv(relu2,'conv3',[3, 3, 64, 128],stride=[1,2,2, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu3=tf.nn.leaky_relu(conv3,alpha=0.1)             
        relu3 = tf_layers.layer_norm(relu3, scope='layer_norm3',reuse=reuse) 
 
        conv4=cnv.conv(relu3,'conv4',[5, 5, 128, 128],stride=[1,1,1, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu4=tf.nn.leaky_relu(conv4,alpha=0.1)            
        relu4 = tf_layers.layer_norm(relu4, scope='layer_norm4',reuse=reuse) 
           
        #======Output size: 24x32x256     
        #Layer 5
        conv5=cnv.conv(relu4,'conv5',[3, 3, 128, 256],stride=[1,2,2, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu5=tf.nn.leaky_relu(conv5,alpha=0.1)             
        relu5 = tf_layers.layer_norm(relu5, scope='layer_norm5',reuse=reuse)
            
        #Layer 6
        conv6=cnv.conv(relu5,'conv6',[5, 5, 256, 256],stride=[1,1,1, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu6=tf.nn.leaky_relu(conv6,alpha=0.1)            
        relu6 = tf_layers.layer_norm(relu6, scope='layer_norm6',reuse=reuse)
            
    
        #======Output size: 12x16x512   
        #Layer 7
        conv7=cnv.conv(relu6,'conv7',[3, 3, 256, 512],stride=[1,2,2, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu7=tf.nn.leaky_relu(conv7,alpha=0.1)             
        relu7 = tf_layers.layer_norm(relu7, scope='layer_norm7',reuse=reuse)
            
        #Layer 8
        conv8=cnv.conv(relu7,'conv8',[5, 5, 512, 512],stride=[1,1,1, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu8=tf.nn.leaky_relu(conv8,alpha=0.1)            
        relu8 = tf_layers.layer_norm(relu8, scope='layer_norm8',reuse=reuse)
        
        #======Output size: 6x8x512 
        #Layer 9
        conv9=cnv.conv(relu8,'conv9',[3, 3, 512, 512],stride=[1,2,2, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu9=tf.nn.leaky_relu(conv9,alpha=0.1)             
        relu9 = tf_layers.layer_norm(relu9, scope='layer_norm9',reuse=reuse)
                
        #upsampling
        #======Output size: 12x16x512 
        #Layer 10 
        conv10=dcnv.deconv(relu9,[BATCH_SIZE,int(IMAGE_SIZE_H/16),int(IMAGE_SIZE_W/16),512],'d_conv10',[4, 4, 512, 512],stride=[1, 2, 2, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu10=tf.nn.leaky_relu(conv10,alpha=0.1)     
        relu10 = tf_layers.layer_norm(relu10, scope='layer_norm10',reuse=reuse)        
        #Layer 11       
        conv11=cnv.conv(relu10+relu8,'conv11',[5, 5, 512, 512],stride=[1,1,1, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu11=tf.nn.leaky_relu(conv11,alpha=0.1)             
        relu11 = tf_layers.layer_norm(relu11, scope='layer_norm11',reuse=reuse)        
        
        #======Output size: 24x32x256
        #Layer 12
        conv12=dcnv.deconv(relu11,[BATCH_SIZE,int(IMAGE_SIZE_H/8),int(IMAGE_SIZE_W/8),256],'d_conv12',[4, 4, 256, 512],stride=[1, 2, 2, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu12=tf.nn.leaky_relu(conv12,alpha=0.1)     
        relu12 = tf_layers.layer_norm(relu12, scope='layer_norm12',reuse=reuse)   
        #Layer 13 
        conv13=cnv.conv(relu12+relu6,'conv13',[5, 5, 256, 256],stride=[1,1,1, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu13=tf.nn.leaky_relu(conv13,alpha=0.1)           
        relu13 = tf_layers.layer_norm(relu13, scope='layer_norm13',reuse=reuse)            
                    
        #======Output size: 48x64x256
        #Layer 14
        conv14=dcnv.deconv(relu13,[BATCH_SIZE,int(IMAGE_SIZE_H/4),int(IMAGE_SIZE_W/4),128],'d_conv14',[4, 4, 128, 256],stride=[1, 2, 2, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu14=tf.nn.leaky_relu(conv14,alpha=0.1)      
        relu14 = tf_layers.layer_norm(relu14, scope='layer_norm14',reuse=reuse)   
            
        #Layer 15        
        conv15=cnv.conv(relu14+relu4,'conv15',[3, 3, 128, 128],stride=[1,1,1, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu15=tf.nn.leaky_relu(conv15,alpha=0.1)             
        relu15 = tf_layers.layer_norm(relu15, scope='layer_norm15',reuse=reuse)              
         
        #===================== output depth scale 1: 48x64x1 /4
        out_scale1=cnv.conv(relu15,'out_scale1',[3, 3, 128, 4],stride=[1,1,1, 1],padding='SAME',wd=0,FLOAT16=FLOAT16,reuse=reuse)    
            
         
        #======================#                                       
        #Scale 2: middle level #
        #======================#      
            
        #======Output size: 96x128x64
        #Layer 17:
        conv17=cnv.conv(images,'conv17',[3, 3, 3, 64],stride=[1,2,2, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu17=tf.nn.leaky_relu(conv17,alpha=0.1)             
        relu17 = tf_layers.layer_norm(relu17, scope='layer_norm17',reuse=reuse)            
        #Layer 18:   
        conv18=cnv.conv(relu17+relu1,'conv18',[5, 5, 64, 64],stride=[1,1,1, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu18=tf.nn.leaky_relu(conv18,alpha=0.1)           
        relu18 = tf_layers.layer_norm(relu18, scope='layer_norm18',reuse=reuse)                
    
            
        #Layer 19:48x64x128
        conv19=cnv.conv(relu18,'conv19',[3, 3, 64, 128],stride=[1,2,2, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu19=tf.nn.leaky_relu(conv19,alpha=0.1)            
        relu19 = tf_layers.layer_norm(relu19, scope='layer_norm19',reuse=reuse)   
            
                    
        #Layer 20:   
        conv20=cnv.conv(relu19+relu3,'conv20',[5, 5, 128, 128],stride=[1,1,1, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu20=tf.nn.leaky_relu(conv20,alpha=0.1)            
        relu20 = tf_layers.layer_norm(relu20, scope='layer_norm20',reuse=reuse)   

        #concatenate featuremap from coarse level
        concat1= tf.concat([relu20,relu15,out_scale1], 3, name='concat1')
        
        #Layer 21:        
        conv21=cnv.conv(concat1,'conv21',[9, 9, 260, 256],stride=[1,2,2, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu21=tf.nn.leaky_relu(conv21,alpha=0.1)            
        relu21 = tf_layers.layer_norm(relu21, scope='layer_norm21',reuse=reuse)   
            
        #upsampling
        #Layer 22
        conv22=dcnv.deconv(relu21,[BATCH_SIZE,int(IMAGE_SIZE_H/4),int(IMAGE_SIZE_W/4),128],'d_conv22',[4, 4, 128, 256],stride=[1, 2, 2, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu22=tf.nn.leaky_relu(conv22,alpha=0.1)      
        relu22 = tf_layers.layer_norm(relu22, scope='layer_norm22',reuse=reuse)                

        #Layer 23:
        conv23=cnv.conv(relu22+relu20,'conv23',[5, 5, 128, 128],stride=[1,1,1, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu23=tf.nn.leaky_relu(conv23,alpha=0.1)             
        relu23 = tf_layers.layer_norm(relu23, scope='layer_norm23',reuse=reuse)   

        #Layer 24:   
        conv24=dcnv.deconv(relu23,[BATCH_SIZE,int(IMAGE_SIZE_H/2),int(IMAGE_SIZE_W/2),64],'d_conv24',[4, 4, 64, 128],stride=[1, 2, 2, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu24=tf.nn.leaky_relu(conv24,alpha=0.1)     
        relu24 = tf_layers.layer_norm(relu24, scope='layer_norm24',reuse=reuse)             
            
        conv25=cnv.conv(relu24,'conv25',[5, 5, 64, 64],stride=[1,1,1, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu25=tf.nn.leaky_relu(conv25,alpha=0.1)             
        relu25 = tf_layers.layer_norm(relu25, scope='layer_norm25',reuse=reuse) 
        
        #===================== output depth scale 2: 96x128x1 /4
        out_scale2=cnv.conv(relu25,'out_scale2',[3, 3, 64, 4],stride=[1,1,1, 1],padding='SAME',wd=0,FLOAT16=FLOAT16,reuse=reuse)        
        
        
        #======================#                                       
        #Scale 3: fine level   #
        #======================#         
        #======Output size:  192x256x32
        #Layer 27:
        conv27=cnv.conv(images,'conv27',[3, 3, 3, 32],stride=[1,1,1, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu27=tf.nn.leaky_relu(conv27,alpha=0.1)    
        relu27 = tf_layers.layer_norm(relu27, scope='layer_norm27',reuse=reuse)    
            
        #Layer 28:     
        conv28=cnv.conv(relu27,'conv28',[3, 3, 32, 64],stride=[1,2,2, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu28=tf.nn.leaky_relu(conv28,alpha=0.1)            
        relu28 = tf_layers.layer_norm(relu28, scope='layer_norm28',reuse=reuse)            
        
        #Layer 29:
        conv29=cnv.conv(relu28+relu17,'conv29',[5, 5, 64, 64],stride=[1,1,1, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu29=tf.nn.leaky_relu(conv29,alpha=0.1)             
        relu29 = tf_layers.layer_norm(relu29, scope='layer_norm29',reuse=reuse)           
 
        
        #concatenate feature map from middle level
        concat2= tf.concat([relu29,relu25,out_scale2], 3, name='concat2')    
        
        #Layer 30:      
        conv30=cnv.conv(concat2,'conv30',[5, 5, 132, 128],stride=[1,2,2, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu30=tf.nn.leaky_relu(conv30,alpha=0.1)            
        relu30 = tf_layers.layer_norm(relu30, scope='layer_norm30',reuse=reuse) 
            
        #Layer 31:   
        conv31=dcnv.deconv(relu30,[BATCH_SIZE,int(IMAGE_SIZE_H/2),int(IMAGE_SIZE_W/2),64],'d_conv31',[4, 4, 64, 128],stride=[1, 2, 2, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu31=tf.nn.leaky_relu(conv31,alpha=0.1)   
        relu31 = tf_layers.layer_norm(relu31, scope='layer_norm31',reuse=reuse)              
        
        
        #Layer 32: 
        conv32=cnv.conv(relu31+relu29,'conv32',[5, 5, 64, 64],stride=[1,1,1, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu32=tf.nn.leaky_relu(conv32,alpha=0.1)             
        relu32 = tf_layers.layer_norm(relu32, scope='layer_norm32',reuse=reuse)            
 
            
        #Layer 33:   
        conv33=dcnv.deconv(relu32,[BATCH_SIZE,int(IMAGE_SIZE_H),int(IMAGE_SIZE_W),32],'d_conv33',[4, 4, 32, 64],stride=[1, 2, 2, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu33=tf.nn.leaky_relu(conv33,alpha=0.1)    
        relu33 = tf_layers.layer_norm(relu33, scope='layer_norm33',reuse=reuse)    
            
        #Layer 34:      
        conv34=cnv.conv(relu33+relu27,'conv34',[3, 3, 32, 32],stride=[1,1,1, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu34=tf.nn.leaky_relu(conv34,alpha=0.1)             
        relu34 = tf_layers.layer_norm(relu34, scope='layer_norm34',reuse=reuse)         
        #Layer 35:
        conv35=cnv.conv(relu34,'conv35',[3, 3, 32, 32],stride=[1,1,1, 1],padding='SAME',wd=WEIGHT_DECAY,FLOAT16=FLOAT16,reuse=reuse)
        relu35=tf.nn.leaky_relu(conv35,alpha=0.1)           
        relu35 = tf_layers.layer_norm(relu35, scope='layer_norm35',reuse=reuse)           
        #Inference layer: final depth output
        depth=cnv.conv(relu35,'depth',[3, 3, 32, 1],wd=0,FLOAT16=FLOAT16,reuse=reuse)
      
        #split
        scale1_depth=out_scale1[:,:,:,0]
        scale1_depth=tf.expand_dims(scale1_depth,3)    
        
        norm_x1=out_scale1[:,:,:,1]
        norm_y1=out_scale1[:,:,:,2]
        norm_z1=out_scale1[:,:,:,3]
        norm_x1=tf.expand_dims(norm_x1,3)
        norm_y1=tf.expand_dims(norm_y1,3)            
        norm_z1=tf.expand_dims(norm_z1,3)          
        scale1_normal=tf.concat([norm_x1,norm_y1,norm_z1], 3)
        
        tf.summary.image('depth_scale1:', scale1_depth)   
        tf.summary.image('normal_scale1:',scale1_normal)
        
        
        
        scale2_depth=out_scale2[:,:,:,0]
        scale2_depth=tf.expand_dims(scale2_depth,3)    
        
        norm_x2=out_scale2[:,:,:,1]
        norm_y2=out_scale2[:,:,:,2]
        norm_z2=out_scale2[:,:,:,3]
        norm_x2=tf.expand_dims(norm_x2,3)
        norm_y2=tf.expand_dims(norm_y2,3)            
        norm_z2=tf.expand_dims(norm_z2,3)         
        scale2_normal=tf.concat([norm_x2,norm_y2,norm_z2], 3)
        tf.summary.image('depth_scale2:', scale2_depth)   
        tf.summary.image('normal_scale2:',scale2_normal)   
        
        
        tf.summary.image('depth_scale3:', depth)   
        
        return scale1_depth, scale2_depth, depth, scale1_normal, scale2_normal
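
The per-channel slicing above (slice, expand_dims, concat) can be written more compactly with tf.split; a sketch of the equivalent operation for out_scale1:

# Channel 0 is the depth map, channels 1-3 are the surface normal.
scale1_depth, scale1_normal = tf.split(out_scale1, [1, 3], axis=3)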
Example #5
    def build(self):
        with slim.arg_scope([
                slim.layers.conv2d, slim.layers.fully_connected,
                tf_layers.layer_norm
        ]):

            layer1 = tf_layers.layer_norm(self.vgg_layer(self.images),
                                          scope='conv1_norm')
            layer2 = tf_layers.layer_norm(slim.layers.conv2d(layer1,
                                                             32, [3, 3],
                                                             stride=2,
                                                             scope='conv2'),
                                          scope='conv2_norm')

            layer3 = tf_layers.layer_norm(slim.layers.conv2d(layer2,
                                                             32, [3, 3],
                                                             stride=2,
                                                             scope='conv3'),
                                          scope='conv3_norm')

            batch_size, num_rows, num_cols, num_fp = layer3.get_shape()
            # print 'shape', layer3.get_shape
            num_rows, num_cols, num_fp = [
                int(x) for x in [num_rows, num_cols, num_fp]
            ]

            x_map = np.empty([num_rows, num_cols], np.float32)
            y_map = np.empty([num_rows, num_cols], np.float32)

            for i in range(num_rows):
                for j in range(num_cols):
                    x_map[i, j] = (i - num_rows / 2.0) / num_rows
                    y_map[i, j] = (j - num_cols / 2.0) / num_cols

            x_map = tf.convert_to_tensor(x_map)
            y_map = tf.convert_to_tensor(y_map)

            x_map = tf.reshape(x_map, [num_rows * num_cols])
            y_map = tf.reshape(y_map, [num_rows * num_cols])

            features = tf.reshape(tf.transpose(layer3, [0, 3, 1, 2]),
                                  [-1, num_rows * num_cols])
            softmax = tf.nn.softmax(features)
            # print 'softmax', softmax

            fp_x = tf.reduce_sum(tf.multiply(x_map, softmax), [1],
                                 keep_dims=True)
            fp_y = tf.reduce_sum(tf.multiply(y_map, softmax), [1],
                                 keep_dims=True)
            self.fp_y = fp_y
            self.fp_x = fp_x
            # print 'fp_x', fp_x
            # print 'fp_y', fp_y
            fp_flat = tf.reshape(tf.concat([fp_x, fp_y], 1), [-1, num_fp * 2])
            # print 'fp_flat', fp_flat
            # print 'configs', self.robot_configs

            self.predicted_eeps = slim.layers.fully_connected(
                fp_flat, 3, scope='predicted_eeps',
                activation_fn=None)  # dim of eeps: 3

            conv_out = tf.concat(
                [
                    fp_flat,
                    self.robot_configs,  # dim of angles: 7, dim of eeps: 3
                    self.predicted_eeps
                ],
                1)

            fc_layer1 = slim.layers.fully_connected(conv_out, 100, scope='fc1')

            self.predicted_actions = slim.layers.fully_connected(
                fc_layer1, 7, scope='predicted_actions',
                activation_fn=None)  # dim of velocities: 7
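
The middle of build implements a spatial soft-argmax: a softmax over each channel's spatial locations followed by the expected (x, y) coordinate, yielding one feature point per channel. A self-contained sketch of the same idea (TF 1.x; shapes hypothetical):

import numpy as np
import tensorflow as tf

def spatial_soft_argmax(feat):
    # feat: (batch, rows, cols, channels)
    h, w, c = [int(d) for d in feat.get_shape()[1:]]
    # normalized pixel coordinates, matching the x/y maps built in build()
    x_map = np.tile(((np.arange(h) - h / 2.0) / h)[:, None], (1, w)).astype(np.float32)
    y_map = np.tile(((np.arange(w) - w / 2.0) / w)[None, :], (h, 1)).astype(np.float32)
    flat = tf.reshape(tf.transpose(feat, [0, 3, 1, 2]), [-1, h * w])
    soft = tf.nn.softmax(flat)                          # one distribution per channel
    fp_x = tf.reduce_sum(soft * x_map.reshape(-1), 1)   # expected row coordinate
    fp_y = tf.reduce_sum(soft * y_map.reshape(-1), 1)   # expected col coordinate
    return tf.reshape(tf.stack([fp_x, fp_y], 1), [-1, c * 2])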
Example #6
    def _norm(self, inp, scope=None):
        reuse = tf.get_variable_scope().reuse
        normalized = layer_norm(inp, reuse=reuse, scope=scope)
        return normalized
def encoder_model(frames, sequence_length, initializer, scope='encoder', fc_conv_layer=False):
  """
  Args:
    frames: 5D array of batch with videos - shape(batch_size, num_frames, frame_width, frame_higth, num_channels)
    sequence_length: number of frames that shall be encoded
    scope: tensorflow variable scope name
    initializer: specifies the initialization type (default: contrib.slim.layers uses Xavier init with uniform data)
    fc_conv_layer: adds an fc layer at the end of the encoder
  Returns:
    hidden4: hidden state of highest ConvLSTM layer
    fc_conv_layer: indicated whether a Fully Convolutional (8x8x16 -> 1x1x1024) shall be added
  """

  lstm_state1, lstm_state2, lstm_state3, lstm_state4, lstm_state5, lstm_state6 = None, None, None, None, None, None

  for i in range(sequence_length):

    frame = frames[:,i,:,:,:]

    reuse = (i > 0)

    with tf.variable_scope(scope, reuse=reuse):
      #LAYER 1: conv1
      conv1 = slim.layers.conv2d(frame, 16, [5, 5], stride=2, scope='conv1', normalizer_fn=tf_layers.layer_norm, weights_initializer=initializer,
          normalizer_params={'scope': 'layer_norm1'})

      #LAYER 2: convLSTM1
      hidden1, lstm_state1 = basic_conv_lstm_cell(conv1, lstm_state1, 16, initializer, filter_size=5, scope='convlstm1')
      hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')

      #LAYER 3: conv2
      conv2 = slim.layers.conv2d(hidden1, hidden1.get_shape()[3], [5, 5], stride=2, scope='conv2', normalizer_fn=tf_layers.layer_norm, weights_initializer=initializer,
                                  normalizer_params={'scope': 'layer_norm3'})

      #LAYER 4: convLSTM2
      hidden2, lstm_state2 = basic_conv_lstm_cell(conv2, lstm_state2, 16, initializer, filter_size=5, scope='convlstm2')
      hidden2 = tf_layers.layer_norm(hidden2, scope='layer_norm4')

      #LAYER 5: conv3
      conv3 = slim.layers.conv2d(hidden2, hidden2.get_shape()[3], [5, 5], stride=2, scope='conv3', normalizer_fn=tf_layers.layer_norm, weights_initializer=initializer,
                                  normalizer_params={'scope': 'layer_norm5'})

      #LAYER 6: convLSTM3
      hidden3, lstm_state3 = basic_conv_lstm_cell(conv3, lstm_state3, 16, initializer, filter_size=3, scope='convlstm3')
      hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm6')


      #LAYER 7: conv4
      conv4 = slim.layers.conv2d(hidden3, hidden3.get_shape()[3], [3, 3], stride=2, scope='conv4', normalizer_fn=tf_layers.layer_norm, weights_initializer=initializer,
                                 normalizer_params={'scope': 'layer_norm7'})

      #LAYER 8: convLSTM4 (8x8 feature map size)
      hidden4, lstm_state4 = basic_conv_lstm_cell(conv4, lstm_state4, 32, initializer, filter_size=3, scope='convlstm4')
      hidden4 = tf_layers.layer_norm(hidden4, scope='layer_norm8')

      #LAYER 9: conv5
      conv5 = slim.layers.conv2d(hidden4, hidden4.get_shape()[3], [3, 3], stride=2, scope='conv5', normalizer_fn=tf_layers.layer_norm, weights_initializer=initializer, 
                                 normalizer_params={'scope': 'layer_norm9'})

      # LAYER 10: convLSTM5 (4x4 feature map size)
      hidden5, lstm_state5 = basic_conv_lstm_cell(conv5, lstm_state5, 32, initializer, filter_size=3, scope='convlstm5')
      hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm10')

      #LAYER 11: Fully Convolutional Layer (4x4x32 --> 1x1xFC_LAYER_SIZE)
      fc_conv = slim.layers.conv2d(hidden5, FC_LAYER_SIZE, [4,4], stride=1, scope='fc_conv', padding='VALID', weights_initializer=initializer)

      #LAYER 12: Fully Convolutional LSTM (1x1x256 -> 1x1x128)
      hidden6, lstm_state6 = basic_conv_lstm_cell(fc_conv, lstm_state6, FC_LSTM_LAYER_SIZE, initializer, filter_size=1, scope='convlstm6')

      hidden_repr = hidden6

  return hidden_repr
Example #8
def construct_model(images,
                    actions=None,
                    states=None,
                    iter_num=-1.0,
                    k=-1,
                    num_masks=10,
                    context_frames=2,
                    pix_distributions=None,
                    conf=None):

    if 'dna_size' in conf:
        DNA_KERN_SIZE = conf['dna_size']
    else:
        DNA_KERN_SIZE = 5
    print('constructing sawyer network')
    batch_size, img_height, img_width, color_channels = images[0].get_shape()[0:4]
    lstm_func = basic_conv_lstm_cell

    # Generated robot states and images.
    gen_states, gen_images, gen_masks = [], [], []
    if states is not None:
        current_state = states[0]
    else:
        current_state = None

    if actions is None:
        actions = [None for _ in images]

    gen_pix_distrib = []
    summaries = []

    if k == -1:
        feedself = True
    else:
        # Scheduled sampling:
        # Calculate number of ground-truth frames to pass in.
        num_ground_truth = tf.to_int32(
            tf.round(
                tf.to_float(batch_size) * (k / (k + tf.exp(iter_num / k)))))
        feedself = False

    # LSTM state sizes and states.

    if 'lstm_size' in conf:
        lstm_size = conf['lstm_size']
    else:
        lstm_size = np.int32(np.array([16, 16, 32, 32, 64, 32, 16]))

    lstm_state1, lstm_state2, lstm_state3, lstm_state4 = None, None, None, None
    lstm_state5, lstm_state6, lstm_state7 = None, None, None

    t = -1
    for image, action in zip(images[:-1], actions[:-1]):
        t += 1
        # Reuse variables after the first timestep.
        reuse = bool(gen_images)

        done_warm_start = len(gen_images) > context_frames - 1
        with slim.arg_scope([
                lstm_func, slim.layers.conv2d, slim.layers.fully_connected,
                tf_layers.layer_norm, slim.layers.conv2d_transpose
        ],
                            reuse=reuse):

            if feedself and done_warm_start:
                # Feed in generated image.
                prev_image = gen_images[-1]  # 64x64x6
                if pix_distributions != None:
                    prev_pix_distrib = gen_pix_distrib[-1]
            elif done_warm_start:
                # Scheduled sampling
                prev_image = scheduled_sample(image, gen_images[-1],
                                              batch_size, num_ground_truth)
            else:
                # Always feed in ground_truth
                prev_image = image
                if pix_distributions != None:
                    prev_pix_distrib = pix_distributions[t]
                    prev_pix_distrib = tf.expand_dims(prev_pix_distrib, -1)

            if 'transform_from_firstimage' in conf:
                assert conf['model'] == 'STP'
                if t > 1:
                    prev_image = images[1]
                    print('using image 1')

            # Predicted state is always fed back in
            if 'ignore_state_action' not in conf:
                state_action = tf.concat([action, current_state], axis=1)

            enc0 = slim.layers.conv2d(  #32x32x32
                prev_image,
                32, [5, 5],
                stride=2,
                scope='scale1_conv1',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm1'})

            hidden1, lstm_state1 = lstm_func(  # 32x32x16
                enc0, lstm_state1, lstm_size[0], scope='state1')
            hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')

            enc1 = slim.layers.conv2d(  # 16x16x16
                hidden1,
                hidden1.get_shape()[3], [3, 3],
                stride=2,
                scope='conv2')

            hidden3, lstm_state3 = lstm_func(  #16x16x32
                enc1, lstm_state3, lstm_size[2], scope='state3')
            hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4')

            enc2 = slim.layers.conv2d(  # 8x8x32
                hidden3,
                hidden3.get_shape()[3], [3, 3],
                stride=2,
                scope='conv3')

            if 'ignore_state_action' not in conf:
                # Pass in state and action.
                if 'ignore_state' in conf:
                    lowdim = action
                    print('ignoring state')
                else:
                    lowdim = state_action

                smear = tf.reshape(
                    lowdim,
                    [int(batch_size), 1, 1,
                     int(lowdim.get_shape()[1])])
                smear = tf.tile(
                    smear,
                    [1,
                     int(enc2.get_shape()[1]),
                     int(enc2.get_shape()[2]), 1])

                enc2 = tf.concat([enc2, smear], axis=3)
            else:
                print('ignoring states and actions')

            enc3 = slim.layers.conv2d(  #8x8x32
                enc2,
                hidden3.get_shape()[3], [1, 1],
                stride=1,
                scope='conv4')

            hidden5, lstm_state5 = lstm_func(  #8x8x64
                enc3, lstm_state5, lstm_size[4], scope='state5')
            hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm6')
            enc4 = slim.layers.conv2d_transpose(  #16x16x64
                hidden5,
                hidden5.get_shape()[3],
                3,
                stride=2,
                scope='convt1')

            hidden6, lstm_state6 = lstm_func(  #16x16x32
                enc4, lstm_state6, lstm_size[5], scope='state6')
            hidden6 = tf_layers.layer_norm(hidden6, scope='layer_norm7')

            if 'noskip' not in conf:
                # Skip connection.
                hidden6 = tf.concat([hidden6, enc1], axis=3)  # both 16x16

            enc5 = slim.layers.conv2d_transpose(  #32x32x32
                hidden6,
                hidden6.get_shape()[3],
                3,
                stride=2,
                scope='convt2')
            hidden7, lstm_state7 = lstm_func(  # 32x32x16
                enc5, lstm_state7, lstm_size[6], scope='state7')
            hidden7 = tf_layers.layer_norm(hidden7, scope='layer_norm8')

            if 'noskip' not in conf:
                # Skip connection.
                hidden7 = tf.concat([hidden7, enc0], axis=3)  # both 32x32

            enc6 = slim.layers.conv2d_transpose(  # 64x64x16
                hidden7,
                hidden7.get_shape()[3],
                3,
                stride=2,
                scope='convt3',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm9'})

            if 'single_view' not in conf:
                prev_image_cam1 = tf.slice(prev_image, [0, 0, 0, 0],
                                           [-1, -1, -1, 3])
                prev_image_cam2 = tf.slice(prev_image, [0, 0, 0, 3],
                                           [-1, -1, -1, 3])

            if conf['model'] == 'DNA':
                # Using largest hidden state for predicting untied conv kernels.
                trafo_input_cam1 = slim.layers.conv2d_transpose(
                    enc6, DNA_KERN_SIZE**2, 1, stride=1, scope='convt4_cam1')
                trafo_input_cam2 = slim.layers.conv2d_transpose(
                    enc6, DNA_KERN_SIZE**2, 1, stride=1, scope='convt4_cam2')

                if 'single_view' not in conf:
                    transformed_cam1 = [
                        dna_transformation(prev_image_cam1, trafo_input_cam1,
                                           conf['dna_size'])
                    ]
                    transformed_cam2 = [
                        dna_transformation(prev_image_cam2, trafo_input_cam2,
                                           conf['dna_size'])
                    ]
                else:
                    transformed_cam2 = [
                        dna_transformation(prev_image, trafo_input_cam2,
                                           conf['dna_size'])
                    ]

            if conf['model'] == 'STP':
                stp_input0 = tf.reshape(hidden5, [int(batch_size), -1])
                stp_input1_cam1 = slim.layers.fully_connected(
                    stp_input0, 100 * conf['numcam'], scope='fc_stp_cam1')

                stp_input1_cam2 = slim.layers.fully_connected(
                    stp_input0, 100 * conf['numcam'], scope='fc_stp_cam2')

                # disabling capability to generate pixels
                reuse_stp = None
                if reuse:
                    reuse_stp = reuse
                if 'single_view' not in conf:
                    transformed_cam1 = stp_transformation(prev_image_cam1,
                                                          stp_input1_cam1,
                                                          num_masks,
                                                          reuse_stp,
                                                          suffix='cam1')
                transformed_cam2 = stp_transformation(prev_image_cam2,
                                                      stp_input1_cam2,
                                                      num_masks,
                                                      reuse_stp,
                                                      suffix='cam2')
                # transformed += stp_transformation(prev_image, stp_input1, num_masks)

                if pix_distributions != None:
                    transf_distrib = stp_transformation(prev_pix_distrib,
                                                        stp_input1,
                                                        num_masks,
                                                        reuse=True)

            masks_cam1 = slim.layers.conv2d_transpose(enc6, (num_masks + 1),
                                                      1,
                                                      stride=1,
                                                      scope='convt7_cam1')

            masks_cam2 = slim.layers.conv2d_transpose(enc6, (num_masks + 1),
                                                      1,
                                                      stride=1,
                                                      scope='convt7_cam2')

            if 'single_view' not in conf:
                output_cam1, mask_list_cam1 = fuse_trafos(
                    conf, masks_cam1, prev_image_cam1, transformed_cam1)
                output_cam2, mask_list_cam2 = fuse_trafos(
                    conf, masks_cam2, prev_image_cam2, transformed_cam2)
                output = tf.concat([output_cam1, output_cam2], axis=3)
            else:
                output, mask_list_cam2 = fuse_trafos(conf, masks_cam2,
                                                     prev_image,
                                                     transformed_cam2)

            gen_images.append(output)
            gen_masks.append(mask_list_cam2)

            if conf['model'] == 'DNA' and pix_distributions != None:
                transf_distrib = [
                    dna_transformation(prev_pix_distrib, enc7, DNA_KERN_SIZE)
                ]

            if pix_distributions != None:
                pix_distrib_output = mask_list[0] * prev_pix_distrib
                mult_list = []
                for i in range(num_masks):
                    mult_list.append(transf_distrib[i] * mask_list[i + 1])
                    pix_distrib_output += mult_list[i]

                gen_pix_distrib.append(pix_distrib_output)

            if current_state != None:
                current_state = slim.layers.fully_connected(
                    state_action,
                    int(current_state.get_shape()[1]),
                    scope='state_pred',
                    activation_fn=None)
            gen_states.append(current_state)

    if pix_distributions != None:
        return gen_images, gen_states, gen_masks, gen_pix_distrib
    else:
        return gen_images, gen_states, gen_masks, None
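
construct_model schedules ground truth vs. generated frames with an inverse-sigmoid schedule in k (see num_ground_truth above): early in training almost every frame is ground truth, later almost every frame is fed back from the model. A small numpy illustration of the schedule (k and batch_size values are made up):

import numpy as np

def n_ground_truth(iter_num, k, batch_size):
    # k / (k + exp(iter_num / k)) decays from ~1 toward 0 as iter_num grows
    return int(round(batch_size * (k / (k + np.exp(iter_num / k)))))

for it in (0, 2000, 4000, 8000, 16000):
    print(it, n_ground_truth(it, k=900.0, batch_size=32))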
Example #9
    def build_network_core(self, action, current_state, input_image):
        # bind the cell both locally (for the arg_scope below) and on self
        # (the layer calls below go through self.lstm_func)
        self.lstm_func = lstm_func = basic_conv_lstm_cell

        with slim.arg_scope([
                lstm_func, slim.layers.conv2d, slim.layers.fully_connected,
                tf_layers.layer_norm, slim.layers.conv2d_transpose
        ],
                            reuse=self.reuse):

            enc0 = slim.layers.conv2d(  # 32x32x32
                input_image,
                32, [5, 5],
                stride=2,
                scope='scale1_conv1',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm1'})
            hidden1, self.lstm_state1 = self.lstm_func(  # 32x32x16
                enc0,
                self.lstm_state1,
                self.lstm_size[0],
                scope='state1')
            hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')

            enc1 = slim.layers.conv2d(  # 16x16x16
                hidden1,
                hidden1.get_shape()[3], [3, 3],
                stride=2,
                scope='conv2')
            hidden3, self.lstm_state3 = self.lstm_func(  # 16x16x32
                enc1,
                self.lstm_state3,
                self.lstm_size[1],
                scope='state3')
            hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4')

            enc2 = slim.layers.conv2d(  # 8x8x32
                hidden3,
                hidden3.get_shape()[3], [3, 3],
                stride=2,
                scope='conv3')

            if 'ignore_state_action' not in self.conf:
                # Pass in state and action.
                state_action = tf.concat(axis=1,
                                         values=[action, current_state])

                smear = tf.reshape(state_action, [
                    int(self.batch_size), 1, 1,
                    int(state_action.get_shape()[1])
                ])
                smear = tf.tile(
                    smear,
                    [1,
                     int(enc2.get_shape()[1]),
                     int(enc2.get_shape()[2]), 1])

                enc2 = tf.concat(axis=3, values=[enc2, smear])
            else:
                print('ignoring states and actions')
            enc3 = slim.layers.conv2d(  # 8x8x32
                enc2,
                hidden3.get_shape()[3], [1, 1],
                stride=1,
                scope='conv4')
            hidden5, self.lstm_state5 = self.lstm_func(  # 8x8x64
                enc3,
                self.lstm_state5,
                self.lstm_size[2],
                scope='state5')
            hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm6')

            enc4 = slim.layers.conv2d_transpose(  # 16x16x64
                hidden5,
                hidden5.get_shape()[3],
                3,
                stride=2,
                scope='convt1')
            hidden6, self.lstm_state6 = self.lstm_func(  # 16x16x32
                enc4,
                self.lstm_state6,
                self.lstm_size[3],
                scope='state6')
            hidden6 = tf_layers.layer_norm(hidden6, scope='layer_norm7')
            if 'noskip' not in self.conf:
                # Skip connection.
                hidden6 = tf.concat(axis=3, values=[hidden6,
                                                    enc1])  # both 16x16

            enc5 = slim.layers.conv2d_transpose(  # 32x32x32
                hidden6,
                hidden6.get_shape()[3],
                3,
                stride=2,
                scope='convt2')
            hidden7, self.lstm_state7 = self.lstm_func(  # 32x32x16
                enc5,
                self.lstm_state7,
                self.lstm_size[4],
                scope='state7')
            hidden7 = tf_layers.layer_norm(hidden7, scope='layer_norm8')
            if 'noskip' not in self.conf:
                # Skip connection.
                hidden7 = tf.concat(axis=3, values=[hidden7,
                                                    enc0])  # both 32x32

            enc6 = slim.layers.conv2d_transpose(  # 64x64x16
                hidden7,
                hidden7.get_shape()[3],
                3,
                stride=2,
                scope='convt3',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm9'})

            if current_state != None:
                current_state = slim.layers.fully_connected(
                    state_action,
                    int(current_state.get_shape()[1]),
                    scope='state_pred',
                    activation_fn=None)
            self.gen_states.append(current_state)

            self.apply_trafo_predict(enc6, hidden5)

            return current_state
Example #10
    def build(self):

        if 'kern_size' in list(self.conf.keys()):
            KERN_SIZE = self.conf['kern_size']
        else:
            KERN_SIZE = 5

        batch_size, img_height, img_width, color_channels = self.images[0].get_shape()[0:4]
        lstm_func = basic_conv_lstm_cell

        if self.states is not None:
            current_state = self.states[0]
        else:
            current_state = None

        if self.actions is None:
            self.actions = [None for _ in self.images]

        if self.k == -1:
            feedself = True
        else:
            # Scheduled sampling:
            # Calculate number of ground-truth frames to pass in.
            num_ground_truth = tf.to_int32(
                tf.round(
                    tf.to_float(batch_size) *
                    (self.k / (self.k + tf.exp(self.iter_num / self.k)))))
            feedself = False

        # LSTM state sizes and states.

        if 'lstm_size' in self.conf:
            lstm_size = self.conf['lstm_size']
            print('using lstm size', lstm_size)
        else:
            lstm_size = np.int32(np.array([16, 32, 64, 32, 16]))

        lstm_state1, lstm_state2, lstm_state3, lstm_state4 = None, None, None, None
        lstm_state5, lstm_state6, lstm_state7 = None, None, None

        t = -1
        for image, action in zip(self.images[:-1], self.actions[:-1]):
            t += 1
            print(t)
            # Reuse variables after the first timestep.
            reuse = bool(self.gen_images)

            done_warm_start = len(self.gen_images) > self.context_frames - 1
            with slim.arg_scope([
                    lstm_func, slim.layers.conv2d, slim.layers.fully_connected,
                    tf_layers.layer_norm, slim.layers.conv2d_transpose
            ],
                                reuse=reuse):

                if feedself and done_warm_start:
                    # Feed in generated image.
                    prev_image = self.gen_images[-1]  # 64x64x6
                    if self.pix_distributions1 != None:
                        prev_pix_distrib1 = self.gen_distrib1[-1]
                        if 'ndesig' in self.conf:
                            prev_pix_distrib2 = self.gen_distrib2[-1]
                elif done_warm_start:
                    # Scheduled sampling
                    prev_image = scheduled_sample(image, self.gen_images[-1],
                                                  batch_size, num_ground_truth)
                else:
                    # Always feed in ground_truth
                    prev_image = image
                    if self.pix_distributions1 != None:
                        prev_pix_distrib1 = self.pix_distributions1[t]
                        if 'ndesig' in self.conf:
                            prev_pix_distrib2 = self.pix_distributions2[t]
                        if len(prev_pix_distrib1.get_shape()) == 3:
                            prev_pix_distrib1 = tf.expand_dims(
                                prev_pix_distrib1, -1)
                            if 'ndesig' in self.conf:
                                prev_pix_distrib2 = tf.expand_dims(
                                    prev_pix_distrib2, -1)

                if 'refeed_firstimage' in self.conf:
                    assert self.conf['model'] == 'STP'
                    if t > 1:
                        input_image = self.images[1]
                        print('refeed with image 1')
                    else:
                        input_image = prev_image
                else:
                    input_image = prev_image

                # Predicted state is always fed back in
                if 'ignore_state_action' not in self.conf:
                    state_action = tf.concat(axis=1,
                                             values=[action, current_state])

                enc0 = slim.layers.conv2d(  #32x32x32
                    input_image,
                    32, [5, 5],
                    stride=2,
                    scope='scale1_conv1',
                    normalizer_fn=tf_layers.layer_norm,
                    normalizer_params={'scope': 'layer_norm1'})

                hidden1, lstm_state1 = lstm_func(  # 32x32x16
                    enc0,
                    lstm_state1,
                    lstm_size[0],
                    scope='state1')
                hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')

                enc1 = slim.layers.conv2d(  # 16x16x16
                    hidden1,
                    hidden1.get_shape()[3], [3, 3],
                    stride=2,
                    scope='conv2')

                hidden3, lstm_state3 = lstm_func(  #16x16x32
                    enc1,
                    lstm_state3,
                    lstm_size[1],
                    scope='state3')
                hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4')

                enc2 = slim.layers.conv2d(  # 8x8x32
                    hidden3,
                    hidden3.get_shape()[3], [3, 3],
                    stride=2,
                    scope='conv3')

                if 'ignore_state_action' not in self.conf:
                    # Pass in state and action.
                    if 'ignore_state' in self.conf:
                        lowdim = action
                        print('ignoring state')
                    else:
                        lowdim = state_action

                    smear = tf.reshape(
                        lowdim,
                        [int(batch_size), 1, 1,
                         int(lowdim.get_shape()[1])])
                    smear = tf.tile(smear, [
                        1,
                        int(enc2.get_shape()[1]),
                        int(enc2.get_shape()[2]), 1
                    ])

                    enc2 = tf.concat(axis=3, values=[enc2, smear])
                else:
                    print('ignoring states and actions')

                enc3 = slim.layers.conv2d(  #8x8x32
                    enc2,
                    hidden3.get_shape()[3], [1, 1],
                    stride=1,
                    scope='conv4')

                hidden5, lstm_state5 = lstm_func(  #8x8x64
                    enc3, lstm_state5, lstm_size[2], scope='state5')
                hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm6')
                enc4 = slim.layers.conv2d_transpose(  #16x16x64
                    hidden5,
                    hidden5.get_shape()[3],
                    3,
                    stride=2,
                    scope='convt1')

                hidden6, lstm_state6 = lstm_func(  #16x16x32
                    enc4,
                    lstm_state6,
                    lstm_size[3],
                    scope='state6')
                hidden6 = tf_layers.layer_norm(hidden6, scope='layer_norm7')

                if 'noskip' not in self.conf:
                    # Skip connection.
                    hidden6 = tf.concat(axis=3, values=[hidden6,
                                                        enc1])  # both 16x16

                enc5 = slim.layers.conv2d_transpose(  #32x32x32
                    hidden6,
                    hidden6.get_shape()[3],
                    3,
                    stride=2,
                    scope='convt2')
                hidden7, lstm_state7 = lstm_func(  # 32x32x16
                    enc5,
                    lstm_state7,
                    lstm_size[4],
                    scope='state7')
                hidden7 = tf_layers.layer_norm(hidden7, scope='layer_norm8')

                if 'noskip' not in self.conf:
                    # Skip connection.
                    hidden7 = tf.concat(axis=3, values=[hidden7,
                                                        enc0])  # both 32x32

                enc6 = slim.layers.conv2d_transpose(  # 64x64x16
                    hidden7,
                    hidden7.get_shape()[3],
                    3,
                    stride=2,
                    scope='convt3',
                    normalizer_fn=tf_layers.layer_norm,
                    normalizer_params={'scope': 'layer_norm9'})

                if 'transform_from_firstimage' in self.conf:
                    prev_image = self.images[1]
                    if self.pix_distributions1 != None:
                        prev_pix_distrib1 = self.pix_distributions1[1]
                        prev_pix_distrib1 = tf.expand_dims(
                            prev_pix_distrib1, -1)
                    print('transform from image 1')

                if self.conf['model'] == 'DNA':
                    # Using largest hidden state for predicting untied conv kernels.

                    if 'separable_filters' in self.conf:
                        num_filters = KERN_SIZE * 2
                    else:
                        num_filters = KERN_SIZE**2

                    trafo_input = slim.layers.conv2d_transpose(
                        enc6, num_filters, 1, stride=1, scope='convt4_cam2')

                    transformed_l, _ = self.dna_transformation(
                        self.conf, prev_image, trafo_input)
                    if self.pix_distributions1 != None:
                        transf_distrib_ndesig1, _ = self.dna_transformation(
                            self.conf, prev_pix_distrib1, trafo_input)
                        if 'ndesig' in self.conf:
                            transf_distrib_ndesig2, _ = self.dna_transformation(
                                self.conf, prev_pix_distrib2, trafo_input)
                    extra_masks = 1

                if self.conf['model'] == 'CDNA':
                    if 'gen_pix' in self.conf:
                        enc7 = slim.layers.conv2d_transpose(enc6,
                                                            color_channels,
                                                            1,
                                                            stride=1,
                                                            scope='convt4')
                        transformed_l = [tf.nn.sigmoid(enc7)]
                        extra_masks = 2
                    else:
                        transformed_l = []
                        extra_masks = 1

                    if 'mov_bckgd' in self.conf:
                        extra_masks = self.num_masks

                    cdna_input = tf.reshape(hidden5, [int(batch_size), -1])
                    new_transformed, cdna_kerns = self.cdna_transformation(
                        self.conf, prev_image, cdna_input, reuse_sc=reuse)
                    transformed_l += new_transformed
                    self.moved_images.append(transformed_l)

                    ## move the background if chosen:
                    if 'mov_bckgd' in self.conf:
                        cdna_input = tf.reshape(hidden5,
                                                [int(self.batch_size), -1])
                        bckgd_transformed, _ = self.cdna_transformation(
                            self.conf,
                            self.images[0],
                            cdna_input,
                            reuse_sc=reuse,
                            scope='bckgd_trafo')
                        self.moved_bckgd.append(bckgd_transformed)

                    if self.pix_distributions1 != None:
                        transf_distrib_ndesig1, _ = self.cdna_transformation(
                            self.conf,
                            prev_pix_distrib1,
                            cdna_input,
                            reuse_sc=True)
                        self.moved_pix_distrib1.append(transf_distrib_ndesig1)

                        if 'mov_bckgd' in self.conf:
                            bcgkd_distrib = tf.reshape(
                                self.pix_distributions1[0],
                                (self.batch_size, 64, 64, 1))
                            transf_distrib_bckgd, _ = self.cdna_transformation(
                                self.conf,
                                bcgkd_distrib,
                                cdna_input,
                                reuse_sc=True,
                                scope='bckgd_trafo')
                        if 'ndesig' in self.conf:
                            transf_distrib_ndesig2, _ = self.cdna_transformation(
                                self.conf,
                                prev_pix_distrib2,
                                cdna_input,
                                reuse_sc=True)

                            self.moved_pix_distrib2.append(
                                transf_distrib_ndesig2)

                if '1stimg_bckgd' in self.conf:
                    background = self.images[0]
                    print('using background from first image..')
                else:
                    background = prev_image

                if 'mov_bckgd' in self.conf:
                    output, mask_list, moved_parts = self.fuse_trafos_movbckgd(
                        enc6,
                        bckgd_transformed,
                        transformed_l,
                        scope='convt7_cam2',
                        extra_masks=extra_masks,
                        reuse=reuse)
                    self.movd_parts_list.append(moved_parts)
                else:
                    output, mask_list = self.fuse_trafos(
                        enc6,
                        background,
                        transformed_l,
                        scope='convt7_cam2',
                        extra_masks=extra_masks)
                self.gen_images.append(output)
                self.gen_masks.append(mask_list)

                if self.pix_distributions1 != None:
                    if 'mov_bckgd' in self.conf:
                        pix_distrib_output = self.fuse_pix_movebckgd(
                            mask_list, transf_distrib_ndesig1,
                            transf_distrib_bckgd)
                    else:
                        pix_distrib_output = self.fuse_pix_distrib(
                            extra_masks, mask_list, self.pix_distributions1,
                            prev_pix_distrib1, transf_distrib_ndesig1)

                    self.gen_distrib1.append(pix_distrib_output)

                    if 'ndesig' in self.conf:
                        pix_distrib_output = self.fuse_pix_distrib(
                            extra_masks, mask_list, self.pix_distributions2,
                            prev_pix_distrib2, transf_distrib_ndesig2)

                        self.gen_distrib2.append(pix_distrib_output)

                if 'visual_flowvec' in self.conf:
                    motion_vecs = self.compute_motion_vector(cdna_kerns)
                    output = tf.zeros([self.conf['batch_size'], 64, 64, 2])
                    for vec, mask in zip(motion_vecs, mask_list[1:]):
                        vec = tf.reshape(vec,
                                         [self.conf['batch_size'], 1, 1, 2])
                        vec = tf.tile(vec, [1, 64, 64, 1])
                        output += vec * mask

                    self.flow_vectors.append(output)

                if current_state != None:
                    current_state = slim.layers.fully_connected(
                        state_action,
                        int(current_state.get_shape()[1]),
                        scope='state_pred',
                        activation_fn=None)

                self.gen_states.append(current_state)

    def build(self):

        if 'kern_size' in list(self.conf.keys()):
            KERN_SIZE = self.conf['kern_size']
        else:
            KERN_SIZE = 5

        batch_size, img_height, img_width, color_channels = self.images[0].get_shape()[0:4]
        lstm_func = basic_conv_lstm_cell


        if self.states != None:
            current_state = self.states[0]
        else:
            current_state = None

        if self.actions == None:
            self.actions = [None for _ in self.images]

        if self.k == -1:
            feedself = True
        else:
            # Scheduled sampling:
            # Calculate number of ground-truth frames to pass in.
            num_ground_truth = tf.to_int32(
                tf.round(tf.to_float(batch_size) * (self.k / (self.k + tf.exp(self.iter_num / self.k)))))
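            # Inverse-sigmoid decay: at iter_num = 0 the fraction
            # k / (k + exp(iter_num / k)) is close to 1, so almost the whole
            # batch receives ground-truth frames; it decays towards 0 as
            # training progresses, feeding back more of the model's own predictions.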
            feedself = False

        # LSTM state sizes and states.

        if 'lstm_size' in self.conf:
            lstm_size = self.conf['lstm_size']
            print('using lstm size', lstm_size)
        else:
            lstm_size = np.int32(np.array([16, 32, 64, 32, 16]))


        lstm_state1, lstm_state2, lstm_state3, lstm_state4 = None, None, None, None
        lstm_state5, lstm_state6, lstm_state7 = None, None, None

        self.T = len(self.images)

        for t, (image, action) in enumerate(zip(self.images[:-1], self.actions[:-1])):
            print(t)
            # Reuse variables after the first timestep.
            reuse = bool(self.gen_images)

            done_warm_start = len(self.gen_images) > self.ncontext - 1
            with slim.arg_scope(
                    [lstm_func, slim.layers.conv2d, slim.layers.fully_connected,
                     tf_layers.layer_norm, slim.layers.conv2d_transpose],
                    reuse=reuse):

                if feedself and done_warm_start:
                    # Feed in generated image.
                    prev_image = self.gen_images[-1]             # 64x64x6
                    if self.pix_distributions1 != None:
                        prev_pix_distrib1 = self.gen_distrib1[-1]
                        if 'ndesig' in self.conf:
                            prev_pix_distrib2 = self.gen_distrib2[-1]
                elif done_warm_start:
                    # Scheduled sampling
                    prev_image = scheduled_sample(image, self.gen_images[-1], batch_size,
                                                  num_ground_truth)
                else:
                    # Always feed in ground_truth
                    prev_image = image
                    if self.pix_distributions1 != None:
                        prev_pix_distrib1 = self.pix_distributions1[t]
                        if 'ndesig' in self.conf:
                            prev_pix_distrib2 = self.pix_distributions2[t]
                        if len(prev_pix_distrib1.get_shape()) == 3:
                            prev_pix_distrib1 = tf.expand_dims(prev_pix_distrib1, -1)
                            if 'ndesig' in self.conf:
                                prev_pix_distrib2 = tf.expand_dims(prev_pix_distrib2, -1)

                if 'refeed_firstimage' in self.conf:
                    assert self.conf['model']=='STP'
                    if t > 1:
                        input_image = self.images[1]
                        print('refeed with image 1')
                    else:
                        input_image = prev_image
                else:
                    input_image = prev_image

                # Predicted state is always fed back in
                if not 'ignore_state_action' in self.conf:
                    state_action = tf.concat(axis=1, values=[action, current_state])

                enc0 = slim.layers.conv2d(    #32x32x32
                    input_image,
                    32, [5, 5],
                    stride=2,
                    scope='scale1_conv1',
                    normalizer_fn=tf_layers.layer_norm,
                    normalizer_params={'scope': 'layer_norm1'})

                hidden1, lstm_state1 = lstm_func(       # 32x32x16
                    enc0, lstm_state1, lstm_size[0], scope='state1')
                hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')

                enc1 = slim.layers.conv2d(     # 16x16x16
                    hidden1, hidden1.get_shape()[3], [3, 3], stride=2, scope='conv2')

                hidden3, lstm_state3 = lstm_func(   #16x16x32
                    enc1, lstm_state3, lstm_size[1], scope='state3')
                hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4')

                enc2 = slim.layers.conv2d(  # 8x8x32
                    hidden3, hidden3.get_shape()[3], [3, 3], stride=2, scope='conv3')

                if not 'ignore_state_action' in self.conf:
                    # Pass in state and action.
                    if 'ignore_state' in self.conf:
                        lowdim = action
                        print('ignoring state')
                    else:
                        lowdim = state_action

                    smear = tf.reshape(
                        lowdim,
                        [int(batch_size), 1, 1, int(lowdim.get_shape()[1])])
                    smear = tf.tile(
                        smear, [1, int(enc2.get_shape()[1]), int(enc2.get_shape()[2]), 1])
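                    # The low-dimensional state/action vector is broadcast over
                    # the 8x8 feature grid above so it can be concatenated
                    # channel-wise with the conv features below.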

                    enc2 = tf.concat(axis=3, values=[enc2, smear])
                else:
                    print('ignoring states and actions')

                enc3 = slim.layers.conv2d(   #8x8x32
                    enc2, hidden3.get_shape()[3], [1, 1], stride=1, scope='conv4')

                hidden5, lstm_state5 = lstm_func(  #8x8x64
                    enc3, lstm_state5, lstm_size[2], scope='state5')
                hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm6')
                enc4 = slim.layers.conv2d_transpose(  #16x16x64
                    hidden5, hidden5.get_shape()[3], 3, stride=2, scope='convt1')

                hidden6, lstm_state6 = lstm_func(  #16x16x32
                    enc4, lstm_state6, lstm_size[3], scope='state6')
                hidden6 = tf_layers.layer_norm(hidden6, scope='layer_norm7')

                if 'noskip' not in self.conf:
                    # Skip connection.
                    hidden6 = tf.concat(axis=3, values=[hidden6, enc1])  # both 16x16

                enc5 = slim.layers.conv2d_transpose(  #32x32x32
                    hidden6, hidden6.get_shape()[3], 3, stride=2, scope='convt2')
                hidden7, lstm_state7 = lstm_func( # 32x32x16
                    enc5, lstm_state7, lstm_size[4], scope='state7')
                hidden7 = tf_layers.layer_norm(hidden7, scope='layer_norm8')

                if not 'noskip' in self.conf:
                    # Skip connection.
                    hidden7 = tf.concat(axis=3, values=[hidden7, enc0])  # both 32x32

                enc6 = slim.layers.conv2d_transpose(   # 64x64x16
                    hidden7,
                    hidden7.get_shape()[3], 3, stride=2, scope='convt3',
                    normalizer_fn=tf_layers.layer_norm,
                    normalizer_params={'scope': 'layer_norm9'})


                im_history = self.assemble_history(t)

                if self.conf['model'] == 'DNA':
                    # Using largest hidden state for predicting untied conv kernels.
                    trafo_input = slim.layers.conv2d_transpose(
                        enc6, KERN_SIZE ** 2, 1, stride=1, scope='convt4_cam2')
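                    # trafo_input has KERN_SIZE ** 2 channels: one untied
                    # (per-pixel) convolution kernel for every output pixel.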

                    transformed_l = [self.dna_transformation(prev_image, trafo_input, KERN_SIZE)]
                    if self.pix_distributions1 != None:
                        transf_distrib_ndesig1 = [self.dna_transformation(prev_pix_distrib1, trafo_input, KERN_SIZE)]
                        if 'ndesig' in self.conf:
                            transf_distrib_ndesig2 = [
                                self.dna_transformation(prev_pix_distrib2, trafo_input, KERN_SIZE)]

                    total_masks = 1

                if self.conf['model'] == 'CDNA':
                    total_masks = (self.T-1)*self.num_masks
                    cdna_input = tf.reshape(hidden5, [int(batch_size), -1])

                    transformed_l = []
                    for i, h_image in enumerate(im_history):
                        transformed, _ = self.cdna_transformation(h_image,
                                                                cdna_input,
                                                                reuse_sc=reuse,
                                                                scope='cdna_from{}'.format(i))
                        transformed_l += transformed

                    if self.pix_distributions1 != None:
                        transf_distrib_ndesig1, _ = self.cdna_transformation(prev_pix_distrib1,
                                                                       cdna_input,
                                                                         reuse_sc=True)
                        self.moved_pix_distrib1.append(transf_distrib_ndesig1)

                output, mask_list = self.fuse_trafos(enc6,
                                                     transformed_l,
                                                     scope='convt7_cam2',
                                                     total_masks=total_masks)
                self.moved_images.append(transformed_l)
                self.gen_images.append(output)
                self.gen_masks.append(mask_list)

                if self.pix_distributions1!=None:
                    pix_distrib_output = self.fuse_pix_distrib(total_masks,
                                                                mask_list,
                                                                self.pix_distributions1,
                                                                prev_pix_distrib1,
                                                                transf_distrib_ndesig1)

                    self.gen_distrib1.append(pix_distrib_output)


                if current_state != None:
                    current_state = slim.layers.fully_connected(
                        state_action,
                        int(current_state.get_shape()[1]),
                        scope='state_pred',
                        activation_fn=None)

                self.gen_states.append(current_state)
Example #12
0
    def _norm(self, inp, scope=None):
        reuse = tf.get_variable_scope().reuse
        normalized = layer_norm(inp, reuse=reuse, scope=scope)
        return normalized
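
A minimal usage sketch for _norm (illustrative, not part of the original snippet: it assumes layer_norm is tf.contrib.layers.layer_norm and wraps the method in a hypothetical class):

import tensorflow as tf
from tensorflow.contrib.layers import layer_norm

class Net(object):
    def _norm(self, inp, scope=None):
        reuse = tf.get_variable_scope().reuse
        return layer_norm(inp, reuse=reuse, scope=scope)

x = tf.placeholder(tf.float32, [8, 16, 16, 32])
with tf.variable_scope('net'):
    y = Net()._norm(x, scope='ln1')  # normalized, same shape as x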
Example #13
0
def forward(images, index, dna, cdna, num_masks=10):
    stime = time.time()
    batch_size, img_height, img_width = images[0].get_shape()[0:3]
    lstm_func = basic_conv_lstm_cell

    # Generated robot states and images.
    gen_images = []
    lstm_size = np.int32(np.array([32, 32, 64, 64, 128, 64, 32]))
    lstm_state1, lstm_state2, lstm_state3, lstm_state4 = None, None, None, None
    lstm_state5, lstm_state6, lstm_state7 = None, None, None

    for i in range(len(images)):
        # Reuse variables after the first timestep.
        reuse = (i > 0)
        with slim.arg_scope([
                lstm_func, slim.layers.conv2d, slim.layers.fully_connected,
                tf_layers.layer_norm, slim.layers.conv2d_transpose
        ],
                            reuse=reuse):
            if reuse and i > index:
                prev_image = tf.reshape(gen_images[-1],
                                        [batch_size, img_height, img_width, 1])
            else:
                prev_image = tf.reshape(images[i],
                                        [batch_size, img_height, img_width, 1])

            enc0 = slim.layers.conv2d(
                prev_image,
                32,
                5,
                stride=1,
                scope='scale1_conv1',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm1'})

            hidden1, lstm_state1 = lstm_func(enc0,
                                             lstm_state1,
                                             lstm_size[0],
                                             scope='state1')
            hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')
            hidden2, lstm_state2 = lstm_func(hidden1,
                                             lstm_state2,
                                             lstm_size[1],
                                             scope='state2')
            hidden2 = tf_layers.layer_norm(hidden2, scope='layer_norm3')
            enc1 = slim.layers.conv2d(hidden2,
                                      hidden2.get_shape()[3], [3, 3],
                                      stride=1,
                                      scope='conv2')

            hidden3, lstm_state3 = lstm_func(enc1,
                                             lstm_state3,
                                             lstm_size[2],
                                             scope='state3')
            hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4')
            hidden4, lstm_state4 = lstm_func(hidden3,
                                             lstm_state4,
                                             lstm_size[3],
                                             scope='state4')
            hidden4 = tf_layers.layer_norm(hidden4, scope='layer_norm5')
            output = slim.layers.conv2d(hidden4,
                                        hidden4.get_shape()[3], [3, 3],
                                        stride=1,
                                        scope='conv3')

            if (i > index - 1):
                gen_images.append(output[:, :, :, 0:1])
        # print(gen_images.name)
    print(time.time() - stime)
    return gen_images
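
A hedged usage sketch for forward (the batch size, number of frames, and split index below are illustrative assumptions):

# 10 grayscale 64x64 frames with batch size 4; frames after `index`
# are predicted from the model's own previous outputs.
images = [tf.placeholder(tf.float32, [4, 64, 64]) for _ in range(10)]
gen_images = forward(images, index=5, dna=True, cdna=False)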
Example #14
0
# Note: this example begins mid-function; the signature below is an assumption
# (it matches the conv_block helper of the MAML reference implementation).
def conv_block(inp, cweight, bweight, reuse, scope, activation=tf.nn.relu, max_pool_pad='VALID'):
    stride, no_stride = [1, 2, 2, 1], [1, 1, 1, 1]  # assumed strides
    if FLAGS.max_pool:
        conv_output = tf.nn.conv2d(inp, cweight, no_stride, 'SAME') + bweight
    else:
        conv_output = tf.nn.conv2d(inp, cweight, stride, 'SAME') + bweight
    normed = normalize(conv_output, activation, reuse, scope)
    if FLAGS.max_pool:
        normed = tf.nn.max_pool(normed, stride, stride, max_pool_pad)
    return normed

def normalize(inp, activation, reuse, scope):
    if FLAGS.norm == 'batch_norm':
        # Batch normalization
        return tf_layers.batch_norm(inp, activation_fn=activation, reuse=reuse, scope=scope)
    elif FLAGS.norm == 'layer_norm':
        # Layer normalization
        return tf_layers.layer_norm(inp, activation_fn=activation, reuse=reuse, scope=scope)
    elif FLAGS.norm == 'None':
        return activation(inp) if activation is not None else inp

## Loss functions
def mse(pred, label):
    # Mean squared error
    pred = tf.reshape(pred, [-1])
    label = tf.reshape(label, [-1])
    return tf.reduce_mean(tf.square(pred - label))

def xent(pred, label):
    # Cross entropy
    # Note - with tf version <=0.12, this loss has incorrect 2nd derivatives
    return tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=label) / FLAGS.update_batch_size
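
A small sketch of how these loss helpers are applied (the placeholder shapes are assumptions; FLAGS.update_batch_size must be defined by the surrounding training script):

logits = tf.placeholder(tf.float32, [4, 5])   # model predictions
onehot = tf.placeholder(tf.float32, [4, 5])   # one-hot class labels
target = tf.placeholder(tf.float32, [4, 3])   # regression targets

loss_cls = xent(logits, onehot)        # per-example cross entropy
loss_reg = mse(logits[:, :3], target)  # scalar mean squared error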
Example #15
0
    def build(self):

        if 'kern_size' in self.conf.keys():
            KERN_SIZE = self.conf['kern_size']
        else:
            KERN_SIZE = 5

        batch_size, img_height, img_width, color_channels = self.images[0].get_shape()[0:4]
        lstm_func = basic_conv_lstm_cell


        if self.states != None:
            current_state = self.states[0]
        else:
            current_state = None

        if self.actions == None:
            self.actions = [None for _ in self.images]

        if self.k == -1:
            feedself = True
        else:
            # Scheduled sampling:
            # Calculate number of ground-truth frames to pass in.
            num_ground_truth = tf.to_int32(
                tf.round(tf.to_float(batch_size) * (self.k / (self.k + tf.exp(self.iter_num / self.k)))))
            feedself = False

        # LSTM state sizes and states.

        if 'lstm_size' in self.conf:
            lstm_size = self.conf['lstm_size']
            print('using lstm size', lstm_size)
        else:
            ngf = self.conf['ngf']
            lstm_size = np.int32(np.array([ngf, ngf * 2, ngf * 4, ngf * 2, ngf]))


        lstm_state1, lstm_state2, lstm_state3, lstm_state4 = None, None, None, None
        lstm_state5, lstm_state6, lstm_state7 = None, None, None

        for t, action in enumerate(self.actions):
            print(t)
            # Reuse variables after the first timestep.
            reuse = bool(self.gen_images)

            done_warm_start = len(self.gen_images) > self.context_frames - 1
            with slim.arg_scope(
                    [lstm_func, slim.layers.conv2d, slim.layers.fully_connected,
                     tf_layers.layer_norm, slim.layers.conv2d_transpose],
                    reuse=reuse):

                if feedself and done_warm_start:
                    # Feed in generated image.
                    prev_image = self.gen_images[-1]             # 64x64x6
                    if self.pix_distributions1 != None:
                        prev_pix_distrib1 = self.gen_distrib1[-1]
                        if 'ndesig' in self.conf:
                            prev_pix_distrib2 = self.gen_distrib2[-1]
                elif done_warm_start:
                    # Scheduled sampling
                    prev_image = scheduled_sample(self.images[t], self.gen_images[-1], batch_size,
                                                  num_ground_truth)
                else:
                    # Always feed in ground_truth
                    prev_image = self.images[t]
                    if self.pix_distributions1 != None:
                        prev_pix_distrib1 = self.pix_distributions1[t]
                        if 'ndesig' in self.conf:
                            prev_pix_distrib2 = self.pix_distributions2[t]
                        if len(prev_pix_distrib1.get_shape()) == 3:
                            prev_pix_distrib1 = tf.expand_dims(prev_pix_distrib1, -1)
                            if 'ndesig' in self.conf:
                                prev_pix_distrib2 = tf.expand_dims(prev_pix_distrib2, -1)

                if 'refeed_firstimage' in self.conf:
                    assert self.conf['model']=='STP'
                    if t > 1:
                        input_image = self.images[1]
                        print('refeed with image 1')
                    else:
                        input_image = prev_image
                else:
                    input_image = prev_image

                # Predicted state is always fed back in
                if not 'ignore_state_action' in self.conf:
                    state_action = tf.concat(axis=1, values=[action, current_state])

                enc0 = slim.layers.conv2d(    #32x32x32
                    input_image,
                    32, [5, 5],
                    stride=2,
                    scope='scale1_conv1',
                    normalizer_fn=tf_layers.layer_norm,
                    normalizer_params={'scope': 'layer_norm1'})

                hidden1, lstm_state1 = lstm_func(       # 32x32x16
                    enc0, lstm_state1, lstm_size[0], scope='state1')
                hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')

                enc1 = slim.layers.conv2d(     # 16x16x16
                    hidden1, hidden1.get_shape()[3], [3, 3], stride=2, scope='conv2')

                hidden3, lstm_state3 = lstm_func(   #16x16x32
                    enc1, lstm_state3, lstm_size[1], scope='state3')
                hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4')

                enc2 = slim.layers.conv2d(  # 8x8x32
                    hidden3, hidden3.get_shape()[3], [3, 3], stride=2, scope='conv3')

                if not 'ignore_state_action' in self.conf:
                    # Pass in state and action.
                    if 'ignore_state' in self.conf:
                        lowdim = action
                        print('ignoring state')
                    else:
                        lowdim = state_action

                    smear = tf.reshape(
                        lowdim,
                        [int(batch_size), 1, 1, int(lowdim.get_shape()[1])])
                    smear = tf.tile(
                        smear, [1, int(enc2.get_shape()[1]), int(enc2.get_shape()[2]), 1])

                    enc2 = tf.concat(axis=3, values=[enc2, smear])
                else:
                    print('ignoring states and actions')

                enc3 = slim.layers.conv2d(   #8x8x32
                    enc2, hidden3.get_shape()[3], [1, 1], stride=1, scope='conv4')

                hidden5, lstm_state5 = lstm_func(  #8x8x64
                    enc3, lstm_state5, lstm_size[2], scope='state5')
                hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm6')
                enc4 = slim.layers.conv2d_transpose(  #16x16x64
                    hidden5, hidden5.get_shape()[3], 3, stride=2, scope='convt1')

                hidden6, lstm_state6 = lstm_func(  #16x16x32
                    enc4, lstm_state6, lstm_size[3], scope='state6')
                hidden6 = tf_layers.layer_norm(hidden6, scope='layer_norm7')

                if 'noskip' not in self.conf:
                    # Skip connection.
                    hidden6 = tf.concat(axis=3, values=[hidden6, enc1])  # both 16x16

                enc5 = slim.layers.conv2d_transpose(  #32x32x32
                    hidden6, hidden6.get_shape()[3], 3, stride=2, scope='convt2')
                hidden7, lstm_state7 = lstm_func( # 32x32x16
                    enc5, lstm_state7, lstm_size[4], scope='state7')
                hidden7 = tf_layers.layer_norm(hidden7, scope='layer_norm8')

                if not 'noskip' in self.conf:
                    # Skip connection.
                    hidden7 = tf.concat(axis=3, values=[hidden7, enc0])  # both 32x32

                enc6 = slim.layers.conv2d_transpose(   # 64x64x16
                    hidden7,
                    hidden7.get_shape()[3], 3, stride=2, scope='convt3',
                    normalizer_fn=tf_layers.layer_norm,
                    normalizer_params={'scope': 'layer_norm9'})

                if 'transform_from_firstimage' in self.conf:
                    prev_image = self.images[1]
                    if self.pix_distributions1 != None:
                        prev_pix_distrib1 = self.pix_distributions1[1]
                        prev_pix_distrib1 = tf.expand_dims(prev_pix_distrib1, -1)
                    print('transform from image 1')

                if self.conf['model'] == 'DNA':
                    # Using largest hidden state for predicting untied conv kernels.
                    trafo_input = slim.layers.conv2d_transpose(
                        enc6, KERN_SIZE ** 2, 1, stride=1, scope='convt4_cam2')

                    transformed_l = [self.dna_transformation(prev_image, trafo_input, KERN_SIZE)]
                    if self.pix_distributions1 != None:
                        transf_distrib_ndesig1 = [self.dna_transformation(prev_pix_distrib1, trafo_input, KERN_SIZE)]
                        if 'ndesig' in self.conf:
                            transf_distrib_ndesig2 = [
                                self.dna_transformation(prev_pix_distrib2, trafo_input, KERN_SIZE)]


                    extra_masks = 1  # note: extra_masks = 2 is needed when running singleview_shifted

                if self.conf['model'] == 'CDNA':
                    if 'gen_pix' in self.conf:
                        # Using largest hidden state for predicting a new image layer.
                        enc7 = slim.layers.conv2d_transpose(
                            enc6, color_channels, 1, stride=1, scope='convt4', activation_fn=None)
                        # This allows the network to also generate one image from scratch,
                        # which is useful when regions of the image become unoccluded.
                        transformed_l = [tf.nn.sigmoid(enc7)]
                        extra_masks = 2
                    else:
                        transformed_l = []
                        extra_masks = 1

                    cdna_input = tf.reshape(hidden5, [int(batch_size), -1])
                    new_transformed, _ = self.cdna_transformation(prev_image,
                                                            cdna_input,
                                                            reuse_sc=reuse)
                    transformed_l += new_transformed
                    self.moved_images.append(transformed_l)

                    if self.pix_distributions1 != None:
                        transf_distrib_ndesig1, _ = self.cdna_transformation(prev_pix_distrib1,
                                                                       cdna_input,
                                                                         reuse_sc=True)
                        self.moved_pix_distrib1.append(transf_distrib_ndesig1)
                        if 'ndesig' in self.conf:
                            transf_distrib_ndesig2, _ = self.cdna_transformation(
                                                                               prev_pix_distrib2,
                                                                               cdna_input,
                                                                               reuse_sc=True)

                            self.moved_pix_distrib2.append(transf_distrib_ndesig2)

                if self.conf['model'] == 'STP':
                    enc7 = slim.layers.conv2d_transpose(enc6, color_channels, 1, stride=1, scope='convt5', activation_fn= None)
                    # This allows the network to also generate one image from scratch,
                    # which is useful when regions of the image become unoccluded.
                    if 'gen_pix' in self.conf:
                        transformed_l = [tf.nn.sigmoid(enc7)]
                        extra_masks = 2
                    else:
                        transformed_l = []
                        extra_masks = 1

                    enc_stp = tf.reshape(hidden5, [int(batch_size), -1])
                    stp_input = slim.layers.fully_connected(
                        enc_stp, 200, scope='fc_stp_cam2')

                    # forward the variable-reuse flag to the STP transformation
                    reuse_stp = None
                    if reuse:
                        reuse_stp = reuse

                    transformed, trafo = self.stp_transformation(prev_image, stp_input, self.num_masks, reuse_stp, suffix='cam2')
                    transformed_l += transformed

                    self.trafos.append(trafo)
                    self.moved_images.append(transformed_l)

                    if self.pix_distributions1 != None:
                        transf_distrib_ndesig1, _ = self.stp_transformation(prev_pix_distrib1, stp_input, suffix='cam2', reuse=True)
                        self.moved_pix_distrib1.append(transf_distrib_ndesig1)

                if '1stimg_bckgd' in self.conf:
                    background = self.images[0]
                    print('using background from first image..')
                else:
                    background = prev_image
                output, mask_list = self.fuse_trafos(enc6, background,
                                                     transformed_l,
                                                     scope='convt7_cam2',
                                                     extra_masks=extra_masks)
                self.gen_images.append(output)
                self.gen_masks.append(mask_list)

                if self.pix_distributions1!=None:
                    pix_distrib_output = self.fuse_pix_distrib(extra_masks,
                                                                mask_list,
                                                                self.pix_distributions1,
                                                                prev_pix_distrib1,
                                                                transf_distrib_ndesig1)

                    self.gen_distrib1.append(pix_distrib_output)
                    if 'ndesig' in self.conf:
                        pix_distrib_output = self.fuse_pix_distrib(extra_masks,
                                                                    mask_list,
                                                                    self.pix_distributions2,
                                                                    prev_pix_distrib2,
                                                                    transf_distrib_ndesig2)

                        self.gen_distrib2.append(pix_distrib_output)

                if int(current_state.get_shape()[1]) == 0:
                    current_state = tf.zeros_like(state_action)
                else:
                    current_state = slim.layers.fully_connected(
                        state_action,
                        int(current_state.get_shape()[1]),
                        scope='state_pred',
                        activation_fn=None)

                self.gen_states.append(current_state)
Example #16
0
def construct_model(images,
                    actions=None,
                    states=None,
                    iter_num=-1.0,
                    k=-1,
                    use_state=True,
                    num_masks=10,
                    stp=False,
                    cdna=True,
                    dna=False,
                    context_frames=2):
  """Build convolutional lstm video predictor using STP, CDNA, or DNA.

  Args:
    images: tensor of ground truth image sequences
    actions: tensor of action sequences
    states: tensor of ground truth state sequences
    iter_num: tensor of the current training iteration (for sched. sampling)
    k: constant used for scheduled sampling. -1 to feed in own prediction.
    use_state: True to include state and action in prediction
    num_masks: the number of different pixel motion predictions (and
               the number of masks for each of those predictions)
    stp: True to use Spatial Transformer Predictor (STP)
    cdna: True to use Convolutional Dynamic Neural Advection (CDNA)
    dna: True to use Dynamic Neural Advection (DNA)
    context_frames: number of ground truth frames to pass in before
                    feeding in own predictions
  Returns:
    gen_images: predicted future image frames
    gen_states: predicted future states

  Raises:
    ValueError: if more than one network option specified or more than 1 mask
    specified for DNA model.
  """
  if stp + cdna + dna != 1:
    raise ValueError('More than one, or no network option specified.')
  batch_size, img_height, img_width, color_channels = images[0].get_shape()[0:4]
  lstm_func = basic_conv_lstm_cell

  # Generated robot states and images.
  gen_states, gen_images = [], []
  current_state = states[0]

  if k == -1:
    feedself = True
  else:
    # Scheduled sampling:
    # Calculate number of ground-truth frames to pass in.
    num_ground_truth = tf.to_int32(
        tf.round(tf.to_float(batch_size) * (k / (k + tf.exp(iter_num / k)))))
    feedself = False

  # LSTM state sizes and states.
  lstm_size = np.int32(np.array([32, 32, 64, 64, 128, 64, 32]))
  lstm_state1, lstm_state2, lstm_state3, lstm_state4 = None, None, None, None
  lstm_state5, lstm_state6, lstm_state7 = None, None, None

  for image, action in zip(images[:-1], actions[:-1]):
    # Reuse variables after the first timestep.
    reuse = bool(gen_images)

    done_warm_start = len(gen_images) > context_frames - 1
    with slim.arg_scope(
        [lstm_func, slim.layers.conv2d, slim.layers.fully_connected,
         tf_layers.layer_norm, slim.layers.conv2d_transpose],
        reuse=reuse):

      if feedself and done_warm_start:
        # Feed in generated image.
        prev_image = gen_images[-1]
      elif done_warm_start:
        # Scheduled sampling
        prev_image = scheduled_sample(image, gen_images[-1], batch_size,
                                      num_ground_truth)
      else:
        # Always feed in ground_truth
        prev_image = image

      # Predicted state is always fed back in
      state_action = tf.concat(axis=1, values=[action, current_state])

      enc0 = slim.layers.conv2d(
          prev_image,
          32, [5, 5],
          stride=2,
          scope='scale1_conv1',
          normalizer_fn=tf_layers.layer_norm,
          normalizer_params={'scope': 'layer_norm1'})

      hidden1, lstm_state1 = lstm_func(
          enc0, lstm_state1, lstm_size[0], scope='state1')
      hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')
      hidden2, lstm_state2 = lstm_func(
          hidden1, lstm_state2, lstm_size[1], scope='state2')
      hidden2 = tf_layers.layer_norm(hidden2, scope='layer_norm3')
      enc1 = slim.layers.conv2d(
          hidden2, hidden2.get_shape()[3], [3, 3], stride=2, scope='conv2')

      hidden3, lstm_state3 = lstm_func(
          enc1, lstm_state3, lstm_size[2], scope='state3')
      hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4')
      hidden4, lstm_state4 = lstm_func(
          hidden3, lstm_state4, lstm_size[3], scope='state4')
      hidden4 = tf_layers.layer_norm(hidden4, scope='layer_norm5')
      enc2 = slim.layers.conv2d(
          hidden4, hidden4.get_shape()[3], [3, 3], stride=2, scope='conv3')

      # Pass in state and action.
      smear = tf.reshape(
          state_action,
          [int(batch_size), 1, 1, int(state_action.get_shape()[1])])
      smear = tf.tile(
          smear, [1, int(enc2.get_shape()[1]), int(enc2.get_shape()[2]), 1])
      if use_state:
        enc2 = tf.concat(axis=3, values=[enc2, smear])
      enc3 = slim.layers.conv2d(
          enc2, hidden4.get_shape()[3], [1, 1], stride=1, scope='conv4')

      hidden5, lstm_state5 = lstm_func(
          enc3, lstm_state5, lstm_size[4], scope='state5')  # last 8x8
      hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm6')
      enc4 = slim.layers.conv2d_transpose(
          hidden5, hidden5.get_shape()[3], 3, stride=2, scope='convt1')

      hidden6, lstm_state6 = lstm_func(
          enc4, lstm_state6, lstm_size[5], scope='state6')  # 16x16
      hidden6 = tf_layers.layer_norm(hidden6, scope='layer_norm7')
      # Skip connection.
      hidden6 = tf.concat(axis=3, values=[hidden6, enc1])  # both 16x16

      enc5 = slim.layers.conv2d_transpose(
          hidden6, hidden6.get_shape()[3], 3, stride=2, scope='convt2')
      hidden7, lstm_state7 = lstm_func(
          enc5, lstm_state7, lstm_size[6], scope='state7')  # 32x32
      hidden7 = tf_layers.layer_norm(hidden7, scope='layer_norm8')

      # Skip connection.
      hidden7 = tf.concat(axis=3, values=[hidden7, enc0])  # both 32x32

      enc6 = slim.layers.conv2d_transpose(
          hidden7,
          hidden7.get_shape()[3], 3, stride=2, scope='convt3',
          normalizer_fn=tf_layers.layer_norm,
          normalizer_params={'scope': 'layer_norm9'})

      if dna:
        # Using largest hidden state for predicting untied conv kernels.
        enc7 = slim.layers.conv2d_transpose(
            enc6, DNA_KERN_SIZE**2, 1, stride=1, scope='convt4')
      else:
        # Using largest hidden state for predicting a new image layer.
        enc7 = slim.layers.conv2d_transpose(
            enc6, color_channels, 1, stride=1, scope='convt4')
        # This allows the network to also generate one image from scratch,
        # which is useful when regions of the image become unoccluded.
        transformed = [tf.nn.sigmoid(enc7)]

      if stp:
        stp_input0 = tf.reshape(hidden5, [int(batch_size), -1])
        stp_input1 = slim.layers.fully_connected(
            stp_input0, 100, scope='fc_stp')
        transformed += stp_transformation(prev_image, stp_input1, num_masks)
      elif cdna:
        cdna_input = tf.reshape(hidden5, [int(batch_size), -1])
        transformed += cdna_transformation(prev_image, cdna_input, num_masks,
                                           int(color_channels))
      elif dna:
        # Only one mask is supported (more should be unnecessary).
        if num_masks != 1:
          raise ValueError('Only one mask is supported for DNA model.')
        transformed = [dna_transformation(prev_image, enc7)]

      masks = slim.layers.conv2d_transpose(
          enc6, num_masks + 1, 1, stride=1, scope='convt7')
      masks = tf.reshape(
          tf.nn.softmax(tf.reshape(masks, [-1, num_masks + 1])),
          [int(batch_size), int(img_height), int(img_width), num_masks + 1])
      mask_list = tf.split(axis=3, num_or_size_splits=num_masks + 1, value=masks)
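      # The num_masks + 1 masks form a per-pixel softmax, so the output below
      # is a convex combination of the previous image (mask_list[0]) and the
      # transformed candidate images.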
      output = mask_list[0] * prev_image
      for layer, mask in zip(transformed, mask_list[1:]):
        output += layer * mask
      gen_images.append(output)

      current_state = slim.layers.fully_connected(
          state_action,
          int(current_state.get_shape()[1]),
          scope='state_pred',
          activation_fn=None)
      gen_states.append(current_state)

  return gen_images, gen_states
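
A hedged sketch of wiring up construct_model (batch size, sequence length, and action/state dimensions are illustrative assumptions; the original training script is not shown here):

T, batch_size = 10, 8
images = [tf.placeholder(tf.float32, [batch_size, 64, 64, 3]) for _ in range(T)]
actions = [tf.placeholder(tf.float32, [batch_size, 4]) for _ in range(T)]
states = [tf.placeholder(tf.float32, [batch_size, 3]) for _ in range(T)]
iter_num = tf.placeholder(tf.float32, [])  # drives scheduled sampling

gen_images, gen_states = construct_model(
    images, actions, states,
    iter_num=iter_num, k=900, cdna=True, context_frames=2)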
Example #17
0
def construct_model(images,
                    actions=None,
                    states=None,
                    iter_num=-1.0,
                    k=-1,
                    use_state=True,
                    context_frames=2,
                    conf=None):
    """Build convolutional lstm video predictor using STP, CDNA, or DNA.

    Args:
      images: tensor of ground truth image sequences
      actions: tensor of action sequences
      states: tensor of ground truth state sequences
      iter_num: tensor of the current training iteration (for sched. sampling)
      k: constant used for scheduled sampling. -1 to feed in own prediction.
      use_state: True to include state and action in prediction
      context_frames: number of ground truth frames to pass in before
                      feeding in own predictions
      conf: dictionary of model configuration options
    Returns:
      gen_images: predicted future image frames
      gen_states: predicted future states

    Raises:
      ValueError: if more than one network option specified or more than 1 mask
      specified for DNA model.
    """

    if 'dna_size' in conf.keys():
        DNA_KERN_SIZE = conf['dna_size']
    else:
        DNA_KERN_SIZE = 5

    print('constructing network with hidden state...')

    batch_size, img_height, img_width, color_channels = images[0].get_shape(
    )[0:4]
    batch_size = int(batch_size)
    lstm_func = basic_conv_lstm_cell

    # Generated robot states and images.
    gen_states, gen_images, gen_masks, inf_low_state_list, pred_low_state_list = [], [], [], [], []
    current_state = states[0]
    gen_pix_distrib = []

    summaries = []

    if k == -1:
        feedself = True
    else:
        # Scheduled sampling:
        # Calculate number of ground-truth frames to pass in.
        num_ground_truth = tf.to_int32(
            tf.round(
                tf.to_float(batch_size) * (k / (k + tf.exp(iter_num / k)))))
        feedself = False

    # LSTM state sizes and states.
    lstm_size = np.int32(np.array([16, 32, 64, 100, 10]))
    lstm_state1, lstm_state2, lstm_state3 = None, None, None

    for t, image, action in zip(range(len(images)), images[:-1], actions[:-1]):

        # Reuse variables after the first timestep.
        reuse = bool(gen_images)

        done_warm_start = len(gen_images) > context_frames - 1
        with slim.arg_scope([
                lstm_func, slim.layers.conv2d, slim.layers.fully_connected,
                tf_layers.layer_norm, slim.layers.conv2d_transpose
        ],
                            reuse=reuse):

            if feedself and done_warm_start:
                # Feed in generated image.
                prev_image = gen_images[-1]
            elif done_warm_start:
                # Scheduled sampling
                prev_image = scheduled_sample(image, gen_images[-1],
                                              batch_size, num_ground_truth)
            else:
                # Always feed in ground_truth
                prev_image = image

            if (not 'prop_latent' in conf) or t < 2:  # encode!
                print('encode {}'.format(t))

                # Predicted state is always fed back in
                state_action = tf.concat(axis=1, values=[action, current_state])  # 6x

                enc0 = slim.layers.conv2d(  #32x32x32
                    prev_image,
                    32,
                    kernel_size=[5, 5],
                    stride=2,
                    scope='scale1_conv1',
                    normalizer_fn=tf_layers.layer_norm,
                    normalizer_params={'scope': 'layer_norm1'})

                hidden1, lstm_state1 = lstm_func(  #32x32x16
                    enc0,
                    lstm_state1,
                    lstm_size[0],
                    scope='state1')
                hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')

                enc1 = slim.layers.conv2d(  #16x16x16
                    hidden1,
                    hidden1.get_shape()[3], [3, 3],
                    stride=2,
                    scope='conv2')

                hidden2, lstm_state2 = lstm_func(  #16x16x32
                    enc1,
                    lstm_state2,
                    lstm_size[1],
                    scope='state3')
                hidden2 = tf_layers.layer_norm(hidden2, scope='layer_norm3')

                enc2 = slim.layers.conv2d(  #8x8x32
                    hidden2,
                    hidden2.get_shape()[3], [3, 3],
                    stride=2,
                    scope='conv3')

                # Pass in state and action.
                smear = tf.reshape(
                    state_action,
                    [batch_size, 1, 1,
                     int(state_action.get_shape()[1])])
                smear = tf.tile(  #8x8x6
                    smear,
                    [1,
                     int(enc2.get_shape()[1]),
                     int(enc2.get_shape()[2]), 1])
                if use_state:
                    enc2 = tf.concat(axis=3, values=[enc2, smear])
                enc3 = slim.layers.conv2d(  #8x8x32
                    enc2,
                    hidden2.get_shape()[3], [1, 1],
                    stride=1,
                    scope='conv4')

                hidden3, lstm_state3 = lstm_func(  #8x8x64
                    enc3, lstm_state3, lstm_size[2],
                    scope='state5')  # last 8x8
                hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4')

                enc3 = slim.layers.conv2d(  # 8x8x32
                    hidden3, 32, [1, 1], stride=1, scope='conv5')

                if 'num_lt_featuremaps' in conf:
                    enc4_num_ft_mps = conf['num_lt_featuremaps']
                else:
                    enc4_num_ft_mps = 8

                enc4 = slim.layers.conv2d(  # 8x8x enc4_num_ft_mps
                    enc3,
                    enc4_num_ft_mps, [3, 3],
                    stride=1,
                    scope='conv6')

                if '4x4lowdim' in conf:
                    enc5 = slim.layers.conv2d(  # 8x8x1
                        enc4, 1, [3, 3], stride=1, scope='conv7')

                    low_dim_state = slim.layers.conv2d(  # 4x4x1
                        enc5, 1, [3, 3], stride=2, scope='conv8')
                else:
                    if 'num_lt_featuremaps' in conf:
                        num_lt_feature = conf['num_lt_featuremaps']
                    else:
                        num_lt_feature = 1
                    print('number of latent feature maps:', num_lt_feature)

                    low_dim_state = slim.layers.conv2d(  # 8x8xnum_lt_feature
                        enc4,
                        num_lt_feature, [3, 3],
                        stride=1,
                        scope='conv7')

                inf_low_state_list.append(low_dim_state)
                pred_low_state_list.append(
                    project_fwd_lowdim(conf, low_dim_state))

                ## start decoding from here:
                print('decode with inferred lt-state at t{}'.format(t))

            else:  #when propagating latent t = 2,3,...
                assert '4x4lowdim' not in conf
                print('decode with predicted lt-state at t{}'.format(t))

                pred_low_state_list.append(
                    project_fwd_lowdim(conf, pred_low_state_list[-1]))
                low_dim_state = pred_low_state_list[-1]

            if '4x4lowdim' in conf:
                dec4 = slim.layers.conv2d_transpose(  # 8x8x1
                    low_dim_state,
                    1, [3, 3],
                    stride=2,
                    scope='convt0')
            else:
                dec4 = low_dim_state

            dec5 = slim.layers.conv2d_transpose(  #  8x8x16
                dec4,
                16,
                3,
                stride=1,
                scope='convt1',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm5'})

            dec6 = slim.layers.conv2d_transpose(  # 16x16x16
                dec5,
                16,
                3,
                stride=2,
                scope='convt2',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm6'})

            if 'skip' in conf:
                dec6 = tf.concat(axis=3, values=[dec6, enc1])  # both 16x16x16 + 16x16x16

            dec7 = slim.layers.conv2d_transpose(  # 16x16x32
                dec6,
                32,
                3,
                stride=1,
                scope='convt3',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm7'})

            dec8 = slim.layers.conv2d_transpose(  #32x32x32
                dec7,
                32,
                3,
                stride=2,
                scope='convt4',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm8'})

            if 'skip' in conf:
                dec8 = tf.concat(axis=3, values=[dec8, enc0])  # both 32x32x32 + 32x32x32

            dec9 = slim.layers.conv2d_transpose(  #64x64x16
                dec8,
                16,
                3,
                stride=2,
                scope='convt5',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm9'})

            # Using largest hidden state for predicting untied conv kernels.
            dec10 = slim.layers.conv2d_transpose(dec9,
                                                 DNA_KERN_SIZE**2,
                                                 1,
                                                 stride=1,
                                                 scope='convt6')

            if conf['model'] == 'STP':
                num_masks = conf['num_masks']
                stp_input = tf.reshape(dec10, [int(batch_size), -1])
                transformed = stp_transformation(prev_image, stp_input,
                                                 num_masks)

            elif conf['model'] == 'DNA':
                transformed = [
                    dna_transformation(prev_image, dec10, DNA_KERN_SIZE)
                ]

            if 'use_masks' in conf:
                masks = slim.layers.conv2d_transpose(dec10,
                                                     num_masks + 1,
                                                     1,
                                                     stride=1,
                                                     scope='convt7')
                masks = tf.reshape(
                    tf.nn.softmax(tf.reshape(masks, [-1, num_masks + 1])), [
                        int(batch_size),
                        int(img_height),
                        int(img_width), num_masks + 1
                    ])
                mask_list = tf.split(axis=3, num_or_size_splits=num_masks + 1, value=masks)
                output = mask_list[0] * prev_image
                for layer, mask in zip(transformed, mask_list[1:]):
                    output += layer * mask
            else:
                mask_list = None
                [output] = transformed

            gen_images.append(output)
            gen_masks.append(mask_list)

            current_state = decode_low_dim_obs(conf, low_dim_state)
            gen_states.append(current_state)

    return gen_images, gen_states, gen_masks, inf_low_state_list, pred_low_state_list
Example #18
0
def construct_model(images,
                          actions=None,
                          states=None,
                          iter_num=-1.0,
                          k=-1,
                          use_state=True,
                          num_masks=10,
                          stp=False,
                          cdna=True,
                          dna=False,
                          context_frames=2):
  """Build convolutional lstm video predictor using STP, CDNA, or DNA.

  Args:
     images: tensor of ground truth image sequences
     actions: tensor of action sequences
     states: tensor of ground truth state sequences
     iter_num: tensor of the current training iteration (for sched. sampling)
     k: constant used for scheduled sampling. -1 to feed in own prediction.
     use_state: True to include state and action in prediction
     num_masks: the number of different pixel motion predictions (and
                    the number of masks for each of those predictions)
     stp: True to use Spatial Transformer Predictor (STP)
     cdna: True to use Convolutional Dynamic Neural Advection (CDNA)
     dna: True to use Dynamic Neural Advection (DNA)
     context_frames: number of ground truth frames to pass in before
                          feeding in own predictions
  Returns:
     gen_images: predicted future image frames
     gen_states: predicted future states

  Raises:
     ValueError: if more than one network option specified or more than 1 mask
     specified for DNA model.
  """
  if stp + cdna + dna != 1:
     raise ValueError('More than one, or no network option specified.')
  batch_size, img_height, img_width, color_channels = images[0].get_shape()[0:4]    # images(10, 32, 64, 64, 3) axis changed
  lstm_func = basic_conv_lstm_cell

  # Generated robot states and images.
  gen_states, gen_images = [], []
  current_state = states[0]

  if k == -1:
     feedself = True
  else:
     # Scheduled sampling:
     # Calculate number of ground-truth frames to pass in.
     num_ground_truth = tf.to_int32(
          tf.round(tf.to_float(batch_size) * (k / (k + tf.exp(iter_num / k)))))
     feedself = False

  # LSTM state sizes and states.
  lstm_size = np.int32(np.array([32, 32, 64, 64, 128, 64, 32]))
  lstm_state1, lstm_state2, lstm_state3, lstm_state4 = None, None, None, None # out of the loop!!!
  lstm_state5, lstm_state6, lstm_state7 = None, None, None

  for image, action in zip(images[:-1], actions[:-1]): # images[0,1,2,...,8] , no last images[9]  32, 64, 64, 3
     # Reuse variables after the first timestep.
     reuse = bool(gen_images)

     done_warm_start = len(gen_images) > context_frames - 1
     with slim.arg_scope(
          [lstm_func, slim.layers.conv2d, slim.layers.fully_connected,
            tf_layers.layer_norm, slim.layers.conv2d_transpose],
          reuse=reuse):

        if feedself and done_warm_start:
          # Feed in generated image.
          prev_image = gen_images[-1]
        elif done_warm_start:
          # Scheduled sampling
          prev_image = scheduled_sample(image, gen_images[-1], batch_size,
                                                  num_ground_truth)
        else:
          # Always feed in ground_truth
          prev_image = image

        # Predicted state is always fed back in
        state_action = tf.concat(axis=1, values=[action, current_state])

        enc0 = slim.layers.conv2d(
             prev_image,
             32, [5, 5],
             stride=2,
             scope='scale1_conv1',
             normalizer_fn=tf_layers.layer_norm,
             normalizer_params={'scope': 'layer_norm1'})

        hidden1, lstm_state1 = lstm_func(enc0, lstm_state1, lstm_size[0], scope='state1') # 32
        hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')

        hidden2, lstm_state2 = lstm_func(hidden1, lstm_state2, lstm_size[1], scope='state2')# 32
        hidden2 = tf_layers.layer_norm(hidden2, scope='layer_norm3')

        enc1 = slim.layers.conv2d(hidden2, hidden2.get_shape()[3], [3, 3], stride=2, scope='conv2') # input, num_output, kernels

        hidden3, lstm_state3 = lstm_func(enc1, lstm_state3, lstm_size[2], scope='state3')# 64
        hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4')

        hidden4, lstm_state4 = lstm_func(hidden3, lstm_state4, lstm_size[3], scope='state4')# 64
        hidden4 = tf_layers.layer_norm(hidden4, scope='layer_norm5')

        enc2 = slim.layers.conv2d(hidden4, hidden4.get_shape()[3], [3, 3], stride=2, scope='conv3')

        # Pass in state and action.
        smear = tf.reshape(
             state_action,
             [int(batch_size), 1, 1, int(state_action.get_shape()[1])])
        smear = tf.tile(
             smear, [1, int(enc2.get_shape()[1]), int(enc2.get_shape()[2]), 1])
        if use_state:
          enc2 = tf.concat(axis=3, values=[enc2, smear])
        enc3 = slim.layers.conv2d(enc2, hidden4.get_shape()[3], [1, 1], stride=1, scope='conv4')

        hidden5, lstm_state5 = lstm_func(enc3, lstm_state5, lstm_size[4], scope='state5')  # last 8x8  128
        hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm6')

        enc4 = slim.layers.conv2d_transpose(hidden5, hidden5.get_shape()[3], 3, stride=2, scope='convt1')

        hidden6, lstm_state6 = lstm_func(enc4, lstm_state6, lstm_size[5], scope='state6')  # 16x16 64
        hidden6 = tf_layers.layer_norm(hidden6, scope='layer_norm7')
        # Skip connection.
        hidden6 = tf.concat(axis=3, values=[hidden6, enc1])  # both 16x16

        enc5 = slim.layers.conv2d_transpose(hidden6, hidden6.get_shape()[3], 3, stride=2, scope='convt2')

        hidden7, lstm_state7 = lstm_func(enc5, lstm_state7, lstm_size[6], scope='state7')  # 32x32 32
        hidden7 = tf_layers.layer_norm(hidden7, scope='layer_norm8')

        # Skip connection.
        hidden7 = tf.concat(axis=3, values=[hidden7, enc0])  # both 32x32

        enc6 = slim.layers.conv2d_transpose(
             hidden7,
             hidden7.get_shape()[3], 3, stride=2, scope='convt3',
             normalizer_fn=tf_layers.layer_norm,
             normalizer_params={'scope': 'layer_norm9'})

        if dna:
          # Using largest hidden state for predicting untied conv kernels.
          enc7 = slim.layers.conv2d_transpose(
                enc6, DNA_KERN_SIZE**2, 1, stride=1, scope='convt4')
        else:
          # Using largest hidden state for predicting a new image layer.
          enc7 = slim.layers.conv2d_transpose(
                enc6, color_channels, 1, stride=1, scope='convt4')
          # This allows the network to also generate one image from scratch,
          # which is useful when regions of the image become unoccluded.
          transformed = [tf.nn.sigmoid(enc7)]

        if stp:
          stp_input0 = tf.reshape(hidden5, [int(batch_size), -1])
          stp_input1 = slim.layers.fully_connected(
                stp_input0, 100, scope='fc_stp')
          transformed += stp_transformation(prev_image, stp_input1, num_masks)
        elif cdna:
          cdna_input = tf.reshape(hidden5, [int(batch_size), -1])
          transformed += cdna_transformation(prev_image, cdna_input, num_masks,
                                                         int(color_channels))
        elif dna:
          # Only one mask is supported (more should be unnecessary).
          if num_masks != 1:
             raise ValueError('Only one mask is supported for DNA model.')
          transformed = [dna_transformation(prev_image, enc7)]

        masks = slim.layers.conv2d_transpose(
             enc6, num_masks + 1, 1, stride=1, scope='convt7')
        masks = tf.reshape(
             tf.nn.softmax(tf.reshape(masks, [-1, num_masks + 1])),
             [int(batch_size), int(img_height), int(img_width), num_masks + 1])
        mask_list = tf.split(axis=3, num_or_size_splits=num_masks + 1, value=masks)
        output = mask_list[0] * prev_image
        for layer, mask in zip(transformed, mask_list[1:]):
          output += layer * mask
        gen_images.append(output)

        current_state = slim.layers.fully_connected(
             state_action,
             int(current_state.get_shape()[1]),
             scope='state_pred',
             activation_fn=None)
        gen_states.append(current_state)

  return gen_images, gen_states
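
# scheduled_sample is called above but not defined in this snippet. A minimal
# sketch consistent with how it is called here (a hedged reconstruction, not
# necessarily the original helper; assumes the same tensorflow import as the
# surrounding code): shuffle the batch indices, take the first num_ground_truth
# examples from the ground-truth batch and the rest from the generated batch,
# then stitch them back together.
def scheduled_sample(ground_truth_x, generated_x, batch_size, num_ground_truth):
    idx = tf.random_shuffle(tf.range(int(batch_size)))
    ground_truth_idx = tf.gather(idx, tf.range(num_ground_truth))
    generated_idx = tf.gather(idx, tf.range(num_ground_truth, int(batch_size)))
    ground_truth_examps = tf.gather(ground_truth_x, ground_truth_idx)
    generated_examps = tf.gather(generated_x, generated_idx)
    return tf.dynamic_stitch([ground_truth_idx, generated_idx],
                             [ground_truth_examps, generated_examps])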
Example #19
0
def construct_model(images,
                    actions=None,
                    states=None,
                    iter_num=-1.0,
                    k=-1,
                    use_state=True,
                    num_masks=10,
                    stp=False,
                    cdna=True,
                    dna=False,
                    context_frames=2,
                    pix_distributions=None,
                    conf=None):
    """Build convolutional lstm video predictor using STP, CDNA, or DNA.

    Args:
      images: tensor of ground truth image sequences
      actions: tensor of action sequences
      states: tensor of ground truth state sequences
      iter_num: tensor of the current training iteration (for sched. sampling)
      k: constant used for scheduled sampling. -1 to feed in own prediction.
      use_state: True to include state and action in prediction
      num_masks: the number of different pixel motion predictions (and
                 the number of masks for each of those predictions)
      stp: True to use Spatial Transformer Predictor (STP)
      cdna: True to use Convolutional Dynamic Neural Advection (CDNA)
      dna: True to use Dynamic Neural Advection (DNA)
      context_frames: number of ground truth frames to pass in before
                      feeding in own predictions
      pix_distributions: the initial one-hot distribution for designated pixels
    Returns:
      gen_images: predicted future image frames
      gen_states: predicted future states

    Raises:
      ValueError: if more than one network option specified or more than 1 mask
      specified for DNA model.
    """

    if 'dna_size' in conf:
        DNA_KERN_SIZE = conf['dna_size']
    else:
        DNA_KERN_SIZE = 5

    print('constructing network with fewer layers...')

    if stp + cdna + dna != 1:
        raise ValueError('More than one, or no network option specified.')
    batch_size, img_height, img_width, color_channels = images[0].get_shape(
    )[0:4]
    batch_size = int(batch_size)
    lstm_func = basic_conv_lstm_cell

    # Generated robot states and images.
    gen_states, gen_images, gen_masks, inf_low_state, pred_low_state = [], [], [], [], []
    current_state = states[0]
    gen_pix_distrib = []

    summaries = []

    if k == -1:
        feedself = True
    else:
        # Scheduled sampling:
        # Calculate number of ground-truth frames to pass in.
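        # Inverse-sigmoid decay: the ground-truth fraction k / (k + exp(iter_num / k))
        # starts near 1 (k / (k + 1) at iter_num = 0), falls to 1/2 at
        # iter_num = k * ln(k), and then decays toward 0, gradually forcing the
        # model to condition on its own predictions.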
        num_ground_truth = tf.to_int32(
            tf.round(
                tf.to_float(batch_size) * (k / (k + tf.exp(iter_num / k)))))
        feedself = False

    # LSTM state sizes and states.
    lstm_size = np.int32(np.array([16, 32, 64, 100, 10]))
    lstm_state1, lstm_state2, lstm_state3 = None, None, None

    single_lstm1 = BasicLSTMCell(lstm_size[3], state_is_tuple=True)
    single_lstm2 = BasicLSTMCell(lstm_size[4], state_is_tuple=True)
    low_dim_lstm = MultiRNNCell([single_lstm1, single_lstm2],
                                state_is_tuple=True)

    low_dim_lstm_state = low_dim_lstm.zero_state(batch_size, tf.float32)

    dim_low_state = int(lstm_size[-1])

    for t, (image, action) in enumerate(zip(images[:-1], actions[:-1])):
        print('building timestep', t)
        # Reuse variables after the first timestep.
        reuse = bool(gen_images)

        done_warm_start = len(gen_images) > context_frames - 1
        with slim.arg_scope([
                lstm_func, slim.layers.conv2d, slim.layers.fully_connected,
                tf_layers.layer_norm, slim.layers.conv2d_transpose
        ],
                            reuse=reuse):

            if feedself and done_warm_start:
                # Feed in generated image.
                prev_image = gen_images[-1]
                if pix_distributions is not None:
                    prev_pix_distrib = gen_pix_distrib[-1]
            elif done_warm_start:
                # Scheduled sampling
                prev_image = scheduled_sample(image, gen_images[-1],
                                              batch_size, num_ground_truth)
            else:
                # Always feed in ground_truth
                prev_image = image
                if pix_distributions is not None:
                    prev_pix_distrib = pix_distributions[t]
                    prev_pix_distrib = tf.expand_dims(prev_pix_distrib, -1)

            # Predicted state is always fed back in
            state_action = tf.concat(axis=1, values=[action, current_state])  # 6x

            enc0 = slim.layers.conv2d(  #32x32x32
                prev_image,
                32,
                kernel_size=[5, 5],
                stride=2,
                scope='scale1_conv1',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm1'})

            hidden1, lstm_state1 = lstm_func(  #32x32
                enc0, lstm_state1, lstm_size[0], scope='state1')
            hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')

            enc1 = slim.layers.conv2d(  #16x16
                hidden1,
                hidden1.get_shape()[3], [3, 3],
                stride=2,
                scope='conv2')

            hidden2, lstm_state2 = lstm_func(  #16x16x32
                enc1, lstm_state2, lstm_size[1], scope='state3')
            hidden2 = tf_layers.layer_norm(hidden2, scope='layer_norm4')

            enc2 = slim.layers.conv2d(  #8x8x32
                hidden2,
                hidden2.get_shape()[3], [3, 3],
                stride=2,
                scope='conv3')

            # Pass in state and action.
            smear = tf.reshape(
                state_action,
                [batch_size, 1, 1,
                 int(state_action.get_shape()[1])])
            smear = tf.tile(  #8x8x6
                smear,
                [1, int(enc2.get_shape()[1]),
                 int(enc2.get_shape()[2]), 1])
            if use_state:
                enc2 = tf.concat(axis=3, values=[enc2, smear])
            enc3 = slim.layers.conv2d(  #8x8x32
                enc2,
                hidden2.get_shape()[3], [1, 1],
                stride=1,
                scope='conv4')

            hidden3, lstm_state3 = lstm_func(  #8x8x64
                enc3, lstm_state3, lstm_size[2], scope='state5')  # last 8x8
            hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm6')

            enc3 = slim.layers.conv2d(  # 8x8x32
                hidden3, 16, [1, 1], stride=1, scope='conv5')

            enc3_flat = tf.reshape(enc3, [batch_size, -1])

            if 'use_low_dim_lstm' in conf:
                with tf.variable_scope('low_dim_lstm', reuse=reuse):
                    hidden4, low_dim_lstm_state = low_dim_lstm(
                        enc3_flat, low_dim_lstm_state)
                low_dim_state = hidden4
            else:
                enc_fully1 = slim.layers.fully_connected(enc3_flat,
                                                         400,
                                                         scope='enc_fully1')

                enc_fully2 = slim.layers.fully_connected(enc_fully1,
                                                         100,
                                                         scope='enc_fully2')

                low_dim_state = enc_fully2

            # inferred low dimensional state:
            inf_low_state.append(low_dim_state)

            pred_low_state.append(project_fwd_lowdim(low_dim_state))

            smear = tf.reshape(low_dim_state,
                               [batch_size, 1, 1, dim_low_state])
            smear = tf.tile(  # 8x8xdim_hidden_state
                smear,
                [1, int(enc2.get_shape()[1]),
                 int(enc2.get_shape()[2]), 1])

            enc4 = slim.layers.conv2d_transpose(  #16x16x32
                smear,
                hidden3.get_shape()[3],
                3,
                stride=2,
                scope='convt1')

            enc5 = slim.layers.conv2d_transpose(  #32x32x32
                enc4,
                enc0.get_shape()[3],
                3,
                stride=2,
                scope='convt2')

            enc6 = slim.layers.conv2d_transpose(  #64x64x16
                enc5,
                16,
                3,
                stride=2,
                scope='convt3',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm9'})

            # Using largest hidden state for predicting untied conv kernels.
            enc7 = slim.layers.conv2d_transpose(enc6,
                                                DNA_KERN_SIZE**2,
                                                1,
                                                stride=1,
                                                scope='convt4')

            # Only one mask is supported (more should be unnecessary).
            if num_masks != 1:
                raise ValueError('Only one mask is supported for DNA model.')
            transformed = [dna_transformation(prev_image, enc7, DNA_KERN_SIZE)]

            if 'use_masks' in conf:
                masks = slim.layers.conv2d_transpose(enc6,
                                                     num_masks + 1,
                                                     1,
                                                     stride=1,
                                                     scope='convt7')
                masks = tf.reshape(
                    tf.nn.softmax(tf.reshape(masks, [-1, num_masks + 1])), [
                        int(batch_size),
                        int(img_height),
                        int(img_width), num_masks + 1
                    ])
                mask_list = tf.split(axis=3, num_or_size_splits=num_masks + 1, value=masks)
                output = mask_list[0] * prev_image
                for layer, mask in zip(transformed, mask_list[1:]):
                    output += layer * mask
            else:
                mask_list = None
                output = transformed[0]  # single DNA-transformed frame

            gen_images.append(output)
            gen_masks.append(mask_list)

            if dna and pix_distributions is not None:
                transf_distrib = [
                    dna_transformation(prev_pix_distrib, enc7, DNA_KERN_SIZE)
                ]

            if pix_distributions is not None:
                pix_distrib_output = mask_list[0] * prev_pix_distrib
                mult_list = []
                for i in range(num_masks):
                    mult_list.append(transf_distrib[i] * mask_list[i + 1])
                    pix_distrib_output += mult_list[i]

                gen_pix_distrib.append(pix_distrib_output)

            # pred_low_state_stopped = tf.stop_gradient(pred_low_state)

            state_enc1 = slim.layers.fully_connected(
                # pred_low_state[-1],
                low_dim_state,
                100,
                scope='state_enc1')

            state_enc2 = slim.layers.fully_connected(
                state_enc1,
                # int(current_state.get_shape()[1]),
                4,
                scope='state_enc2',
                activation_fn=None)
            current_state = tf.squeeze(state_enc2)
            gen_states.append(current_state)

    if pix_distributions is not None:
        return gen_images, gen_states, gen_masks, gen_pix_distrib, inf_low_state, pred_low_state
    else:
        return gen_images, gen_states, gen_masks, None, inf_low_state, pred_low_state
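
# dna_transformation is used above but defined elsewhere. A hedged sketch of
# the DNA operation matching the call signature here (prev_image, a per-pixel
# kernel tensor enc7, and the kernel size); assumes the surrounding tensorflow
# import. Each output pixel is a convex combination of its kern_size**2 input
# neighbors, with weights predicted per pixel; RELU_SHIFT is assumed to be a
# small stabilizing constant.
RELU_SHIFT = 1e-12

def dna_transformation(prev_image, dna_input, kern_size=5):
    pad = kern_size // 2
    prev_image_pad = tf.pad(prev_image, [[0, 0], [pad, pad], [pad, pad], [0, 0]])
    image_height = int(prev_image.get_shape()[1])
    image_width = int(prev_image.get_shape()[2])

    # Stack all kern_size**2 shifted copies of the image along a new axis.
    inputs = []
    for xkern in range(kern_size):
        for ykern in range(kern_size):
            inputs.append(
                tf.expand_dims(
                    tf.slice(prev_image_pad, [0, xkern, ykern, 0],
                             [-1, image_height, image_width, -1]), 3))
    inputs = tf.concat(axis=3, values=inputs)

    # Normalize the per-pixel kernels so their weights sum to one.
    kernel = tf.nn.relu(dna_input - RELU_SHIFT) + RELU_SHIFT
    kernel = tf.expand_dims(
        kernel / tf.reduce_sum(kernel, [3], keep_dims=True), 4)
    return tf.reduce_sum(kernel * inputs, [3], keep_dims=False)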
def decoder_model(hidden_repr, sequence_length, initializer, num_channels=3, scope='decoder', fc_conv_layer=False):
  """
  Args:
    hidden_repr: Tensor of latent space representation
    sequence_length: number of frames that shall be decoded from the hidden_repr
    num_channels: number of channels for generated frames
    initializer: specifies the initialization type (default: contrib.slim.layers uses Xavier init with uniform data)
    fc_conv_layer: adds an fc layer at the end of the encoder
  Returns:
    frame_gen: array of generated frames (Tensors)
    fc_conv_layer: indicates whether hidden_repr is 1x1xdepth tensor a and fully concolutional layer shall be added
  """
  frame_gen = []

  lstm_state1, lstm_state2, lstm_state3, lstm_state4, lstm_state5, lstm_state0 = None, None, None, None, None, None
  assert (not fc_conv_layer) or (hidden_repr.get_shape()[1] == hidden_repr.get_shape()[2] == 1)

  for i in range(sequence_length):
    reuse = (i > 0) #reuse variables (recurrence) after first time step

    with tf.variable_scope(scope, reuse=reuse):

      #Fully Convolutional Layer (1x1xFC_LAYER_SIZE -> 4x4x32)
      hidden0, lstm_state0 = basic_conv_lstm_cell(hidden_repr, lstm_state0, FC_LAYER_SIZE, initializer, filter_size=1,
                                                  scope='convlstm0')


      fc_conv = slim.layers.conv2d_transpose(hidden0, 32, [4, 4], stride=1, scope='fc_conv', padding='VALID', weights_initializer=initializer)


      #LAYER 1: convLSTM1
      hidden1, lstm_state1 = basic_conv_lstm_cell(fc_conv, lstm_state1, 32, initializer, filter_size=3, scope='convlstm1')
      hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm1')

      #LAYER 2: upconv1 (8x8 -> 16x16)
      upconv1 = slim.layers.conv2d_transpose(hidden1, hidden1.get_shape()[3], 3, stride=2, scope='upconv1', weights_initializer=initializer,
                                             normalizer_fn=tf_layers.layer_norm,
                                             normalizer_params={'scope': 'layer_norm2'})

      #LAYER 3: convLSTM2
      hidden2, lstm_state2 = basic_conv_lstm_cell(upconv1, lstm_state2, 32, initializer, filter_size=3, scope='convlstm2')
      hidden2 = tf_layers.layer_norm(hidden2, scope='layer_norm3')

      #LAYER 4: upconv2 (16x16 -> 32x32)
      upconv2 = slim.layers.conv2d_transpose(hidden2, hidden2.get_shape()[3], 3, stride=2, scope='upconv2', weights_initializer=initializer,
                                             normalizer_fn=tf_layers.layer_norm,
                                             normalizer_params={'scope': 'layer_norm4'})

      #LAYER 5: convLSTM3
      hidden3, lstm_state3 = basic_conv_lstm_cell(upconv2, lstm_state3, 16, initializer, filter_size=3, scope='convlstm3')
      hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm5')

      # LAYER 6: upconv3 (32x32 -> 64x64)
      upconv3 = slim.layers.conv2d_transpose(hidden3, hidden3.get_shape()[3], 5, stride=2, scope='upconv3', weights_initializer=initializer,
                                             normalizer_fn=tf_layers.layer_norm,
                                             normalizer_params={'scope': 'layer_norm6'})

      #LAYER 7: convLSTM4
      hidden4, lstm_state4 = basic_conv_lstm_cell(upconv3, lstm_state4, 16, initializer, filter_size=5, scope='convlstm4')
      hidden4 = tf_layers.layer_norm(hidden4, scope='layer_norm7')

      #Layer 8: upconv4 (64x64 -> 128x128)
      upconv4 = slim.layers.conv2d_transpose(hidden4, 16, 5, stride=2, scope='upconv4', normalizer_fn=tf_layers.layer_norm, weights_initializer=initializer,
                                             normalizer_params={'scope': 'layer_norm8'})

      #LAYER 9: convLSTM5
      hidden5, lstm_state5 = basic_conv_lstm_cell(upconv4, lstm_state5, 16, initializer, filter_size=5, scope='convlstm5')
      hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm9')

      upconv5 = slim.layers.conv2d_transpose(hidden5, num_channels, 5, stride=2, scope='upconv5', weights_initializer=initializer)

      frame_gen.append(upconv5)

  assert len(frame_gen)==sequence_length
  return frame_gen
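
# A hypothetical usage sketch for decoder_model (batch size, latent depth, and
# the initializer choice are assumptions; FC_LAYER_SIZE and the tf/slim imports
# come from the surrounding code): decode 10 RGB frames from a 1x1x1024 latent
# produced by an encoder with a fully convolutional head.
hidden_repr = tf.placeholder(tf.float32, [8, 1, 1, 1024])
frames = decoder_model(hidden_repr, sequence_length=10,
                       initializer=tf.contrib.layers.xavier_initializer(),
                       num_channels=3, fc_conv_layer=True)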
Example #21
0
def construct_model(images,
                    actions=None,
                    states=None,
                    iter_num=-1.0,
                    k=-1,
                    use_state=True,
                    num_masks=10,
                    stp=False,
                    cdna=True,
                    dna=False,
                    context_frames=2,
                    conf=None):

    if 'dna_size' in conf:
        DNA_KERN_SIZE = conf['dna_size']
    else:
        DNA_KERN_SIZE = 5

    print('constructing network with fewer layers...')

    if stp + cdna + dna != 1:
        raise ValueError('More than one, or no network option specified.')
    batch_size, img_height, img_width, color_channels = images[0].get_shape(
    )[0:4]
    lstm_func = basic_conv_lstm_cell

    # Generated robot states and images.
    gen_states, gen_images, gen_masks, gen_poses = [], [], [], []

    summaries = []

    if k == -1:
        feedself = True
    else:
        # Scheduled sampling:
        # Calculate number of ground-truth frames to pass in.
        num_ground_truth = tf.to_int32(
            tf.round(
                tf.to_float(batch_size) * (k / (k + tf.exp(iter_num / k)))))
        feedself = False

    # LSTM state sizes and states.

    if 'lstm_size' in conf:
        lstm_size = conf['lstm_size']
    else:
        lstm_size = np.int32(np.array([16, 16, 32, 32, 64, 32, 16]))

    lstm_state1, lstm_state2, lstm_state3, lstm_state4 = None, None, None, None
    lstm_state5, lstm_state6, lstm_state7 = None, None, None

    for t, (image, action, state) in enumerate(
            zip(images[:-1], actions[:-1], states[:-1])):
        # Reuse variables after the first timestep.
        reuse = bool(gen_images)

        done_warm_start = len(gen_images) > context_frames - 1
        with slim.arg_scope([
                lstm_func, slim.layers.conv2d, slim.layers.fully_connected,
                tf_layers.layer_norm, slim.layers.conv2d_transpose
        ],
                            reuse=reuse):

            if feedself and done_warm_start:
                # Feed in generated image.
                prev_image = gen_images[-1]
                prev_state = gen_states[-1]
            elif done_warm_start:
                # Scheduled sampling
                prev_image = scheduled_sample(image, gen_images[-1],
                                              batch_size, num_ground_truth)
                prev_image = tf.reshape(prev_image,
                                        [conf['batch_size'], 64, 64, 3])
                prev_state = scheduled_sample(state, gen_states[-1],
                                              batch_size, num_ground_truth)
                prev_state = tf.reshape(prev_state, [conf['batch_size'], 4])
            else:
                # Always feed in ground_truth
                prev_image = image
                prev_state = state

            if 'transform_from_firstimage' in conf:
                assert stp
                if t > 1:
                    prev_image = images[1]
                    print('using image 1')

            enc0 = slim.layers.conv2d(  #32x32x32
                prev_image,
                32, [5, 5],
                stride=2,
                scope='scale1_conv1',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm1'})

            hidden1, lstm_state1 = lstm_func(  # 32x32x16
                enc0, lstm_state1, lstm_size[0], scope='state1')
            hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')
            enc1 = slim.layers.conv2d(  # 16x16x16
                hidden1,
                hidden1.get_shape()[3], [3, 3],
                stride=2,
                scope='conv2')

            hidden3, lstm_state3 = lstm_func(  #16x16x32
                enc1, lstm_state3, lstm_size[2], scope='state3')
            hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4')
            enc2 = slim.layers.conv2d(  #8x8x32
                hidden3,
                hidden3.get_shape()[3], [3, 3],
                stride=2,
                scope='conv3')

            # Pass in state and action.
            # Predicted state is always fed back in
            state_action = tf.concat(axis=1, values=[action, prev_state])
            smear = tf.reshape(
                state_action,
                [int(batch_size), 1, 1,
                 int(state_action.get_shape()[1])])
            smear = tf.tile(
                smear,
                [1, int(enc2.get_shape()[1]),
                 int(enc2.get_shape()[2]), 1])
            if use_state:
                enc2 = tf.concat(axis=3, values=[enc2, smear])
            enc3 = slim.layers.conv2d(  #8x8x32
                enc2,
                hidden3.get_shape()[3], [1, 1],
                stride=1,
                scope='conv4')

            hidden5, lstm_state5 = lstm_func(  #8x8x64
                enc3, lstm_state5, lstm_size[4], scope='state5')
            hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm6')
            enc4 = slim.layers.conv2d_transpose(  #16x16x64
                hidden5,
                hidden5.get_shape()[3],
                3,
                stride=2,
                scope='convt1')

            hidden6, lstm_state6 = lstm_func(  #16x16x32
                enc4, lstm_state6, lstm_size[5], scope='state6')
            hidden6 = tf_layers.layer_norm(hidden6, scope='layer_norm7')

            if 'noskip' not in conf:
                # Skip connection.
                hidden6 = tf.concat(axis=3, values=[hidden6, enc1])  # both 16x16

            enc5 = slim.layers.conv2d_transpose(  #32x32x32
                hidden6,
                hidden6.get_shape()[3],
                3,
                stride=2,
                scope='convt2')
            hidden7, lstm_state7 = lstm_func(  # 32x32x16
                enc5, lstm_state7, lstm_size[6], scope='state7')
            hidden7 = tf_layers.layer_norm(hidden7, scope='layer_norm8')

            if 'noskip' not in conf:
                # Skip connection.
                hidden7 = tf.concat(axis=3, values=[hidden7, enc0])  # both 32x32

            enc6 = slim.layers.conv2d_transpose(  # 64x64x16
                hidden7,
                hidden7.get_shape()[3],
                3,
                stride=2,
                scope='convt3',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm9'})

            if dna:
                # Using largest hidden state for predicting untied conv kernels.
                enc7 = slim.layers.conv2d_transpose(enc6,
                                                    DNA_KERN_SIZE**2,
                                                    1,
                                                    stride=1,
                                                    scope='convt4')
            else:
                # Using largest hidden state for predicting a new image layer.
                enc7 = slim.layers.conv2d_transpose(enc6,
                                                    color_channels,
                                                    1,
                                                    stride=1,
                                                    scope='convt4')
                # This allows the network to also generate one image from scratch,
                # which is useful when regions of the image become unoccluded.
                transformed = [tf.nn.sigmoid(enc7)]

            if stp:
                stp_input0 = tf.reshape(hidden5, [int(batch_size), -1])
                stp_input1 = slim.layers.fully_connected(stp_input0,
                                                         100,
                                                         scope='fc_stp')

                # disabling the capability to generate pixels
                reuse_stp = None
                if reuse:
                    reuse_stp = reuse
                transformed = stp_transformation(prev_image, stp_input1,
                                                 num_masks, reuse_stp)
                # transformed += stp_transformation(prev_image, stp_input1, num_masks)

            elif dna:
                # Only one mask is supported (more should be unnecessary).
                if num_masks != 1:
                    raise ValueError(
                        'Only one mask is supported for DNA model.')
                transformed = [
                    dna_transformation(prev_image, enc7, DNA_KERN_SIZE)
                ]

            masks = slim.layers.conv2d_transpose(enc6,
                                                 num_masks + 1,
                                                 1,
                                                 stride=1,
                                                 scope='convt7')
            masks = tf.reshape(
                tf.nn.softmax(tf.reshape(masks, [-1, num_masks + 1])), [
                    int(batch_size),
                    int(img_height),
                    int(img_width), num_masks + 1
                ])
            mask_list = tf.split(axis=3, num_or_size_splits=num_masks + 1, value=masks)
            output = mask_list[0] * prev_image
            for layer, mask in zip(transformed, mask_list[1:]):
                output += layer * mask
            gen_images.append(output)
            gen_masks.append(mask_list)

            next_state, next_pose = predict_next_low_dim(
                conf, hidden7, enc0, state_action)
            gen_states.append(next_state)
            gen_poses.append(next_pose)

    return gen_images, gen_states, gen_poses
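
# The compositing used above is a per-pixel softmax-weighted sum over image
# layers. A small NumPy sketch of the same arithmetic (shapes reduced; purely
# illustrative): mask 0 keeps the previous image, masks 1..N blend in the
# transformed predictions.
import numpy as np

def composite(prev_image, transformed, mask_logits):
    # mask_logits: (H, W, num_masks + 1); softmax over the last axis.
    e = np.exp(mask_logits - mask_logits.max(axis=-1, keepdims=True))
    masks = e / e.sum(axis=-1, keepdims=True)
    output = masks[..., 0:1] * prev_image
    for i, layer in enumerate(transformed):
        output += masks[..., i + 1:i + 2] * layer
    return output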
Example #22
0
    def encoder_decoder_fn(self, action, batch_size, input_image, lstm_func,
                           lstm_size, lstm_states, state_action):
        """
        :return:
            enc6: the representation use to construct the masks
            hidden5: the representation use to construct the CDNA kernels
            lstm_states: hidden lstm states
        """
        lstm_state1, lstm_state2, lstm_state3, lstm_state4, lstm_state5, lstm_state6, lstm_state7 = lstm_states
        enc0 = slim.layers.conv2d(  # 32x32x32
            input_image,
            32, [5, 5],
            stride=2,
            scope='scale1_conv1',
            normalizer_fn=tf_layers.layer_norm,
            normalizer_params={'scope': 'layer_norm1'})
        hidden1, lstm_state1 = lstm_func(  # 32x32x16
            enc0, lstm_state1, lstm_size[0], scope='state1')
        hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')
        enc1 = slim.layers.conv2d(  # 16x16x16
            hidden1,
            hidden1.get_shape()[3], [3, 3],
            stride=2,
            scope='conv2')
        hidden3, lstm_state3 = lstm_func(  # 16x16x32
            enc1, lstm_state3, lstm_size[1], scope='state3')
        hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4')
        enc2 = slim.layers.conv2d(  # 8x8x32
            hidden3,
            hidden3.get_shape()[3], [3, 3],
            stride=2,
            scope='conv3')
        if 'ignore_state_action' not in self.conf:
            # Pass in state and action.
            if 'ignore_state' in self.conf:
                lowdim = action
                print('ignoring state')
            else:
                lowdim = state_action

            smear = tf.reshape(
                lowdim, [int(batch_size), 1, 1,
                         int(lowdim.get_shape()[1])])
            smear = tf.tile(
                smear,
                [1, int(enc2.get_shape()[1]),
                 int(enc2.get_shape()[2]), 1])
            enc2 = tf.concat(axis=3, values=[enc2, smear])
        else:
            print('ignoring states and actions')
        enc3 = slim.layers.conv2d(  # 8x8x32
            enc2,
            hidden3.get_shape()[3], [1, 1],
            stride=1,
            scope='conv4')
        hidden5, lstm_state5 = lstm_func(  # 8x8x64
            enc3, lstm_state5, lstm_size[2], scope='state5')
        hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm6')
        enc4 = slim.layers.conv2d_transpose(  # 16x16x64
            hidden5,
            hidden5.get_shape()[3],
            3,
            stride=2,
            scope='convt1')
        hidden6, lstm_state6 = lstm_func(  # 16x16x32
            enc4, lstm_state6, lstm_size[3], scope='state6')
        hidden6 = tf_layers.layer_norm(hidden6, scope='layer_norm7')
        if 'noskip' not in self.conf:
            # Skip connection.
            hidden6 = tf.concat(axis=3, values=[hidden6, enc1])  # both 16x16
        enc5 = slim.layers.conv2d_transpose(  # 32x32x32
            hidden6,
            hidden6.get_shape()[3],
            3,
            stride=2,
            scope='convt2')
        hidden7, lstm_state7 = lstm_func(  # 32x32x16
            enc5, lstm_state7, lstm_size[4], scope='state7')
        hidden7 = tf_layers.layer_norm(hidden7, scope='layer_norm8')
        if 'noskip' not in self.conf:
            # Skip connection.
            hidden7 = tf.concat(axis=3, values=[hidden7, enc0])  # both 32x32
        enc6 = slim.layers.conv2d_transpose(  # 64x64x16
            hidden7,
            hidden7.get_shape()[3],
            3,
            stride=2,
            scope='convt3',
            normalizer_fn=tf_layers.layer_norm,
            normalizer_params={'scope': 'layer_norm9'})
        lstm_states = lstm_state1, lstm_state2, lstm_state3, lstm_state4, lstm_state5, lstm_state6, lstm_state7
        return enc6, hidden5, lstm_states
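
# Hypothetical call sketch for encoder_decoder_fn (model stands in for an
# instance of the class defining it; the LSTM sizes are assumptions): all seven
# ConvLSTM states start as None and are threaded through timesteps via the
# returned tuple. Only lstm_size[0..4] are consumed above, so five sizes
# suffice.
lstm_states = (None,) * 7
enc6, hidden5, lstm_states = model.encoder_decoder_fn(
    action, batch_size, input_image, basic_conv_lstm_cell,
    [16, 32, 32, 64, 16], lstm_states, state_action)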
Example #23
0
def forward(images, index, dna, cdna, num_masks=10, reuse=None):
    stime = time.time()
    batch_size, img_height, img_width = images[0].get_shape()[0:3]
    lstm_func = basic_conv_lstm_cell
    # Generated robot states and images.
    gen_images = []
    lstm_size = np.int32(np.array([32, 32, 64, 64, 128, 64, 32]))
    lstm_state1, lstm_state2, lstm_state3, lstm_state4 = None, None, None, None
    lstm_state5, lstm_state6, lstm_state7 = None, None, None

    for i in range(len(images)):
        # Reuse variables after the first timestep.
        if i > 0:
            reuse = True
        with slim.arg_scope(
                [lstm_func, slim.layers.conv2d, slim.layers.fully_connected,
                 tf_layers.layer_norm, slim.layers.conv2d_transpose],
                reuse=reuse):
            if i > index:
                prev_image = tf.reshape(gen_images[-1], [batch_size, img_height, img_width, 1])
            else:
                prev_image = tf.reshape(images[i], [batch_size, img_height, img_width, 1])

            enc0 = slim.layers.conv2d(
                prev_image,
                32, 5,
                stride=2,
                scope='scale1_conv1',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm1'})

            hidden1, lstm_state1 = lstm_func(
                enc0, lstm_state1, lstm_size[0], scope='state1')
            hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')
            hidden2, lstm_state2 = lstm_func(
                hidden1, lstm_state2, lstm_size[1], scope='state2')
            hidden2 = tf_layers.layer_norm(hidden2, scope='layer_norm3')
            enc1 = slim.layers.conv2d(
                hidden2, hidden2.get_shape()[3], [3, 3], stride=2, scope='conv2')

            hidden3, lstm_state3 = lstm_func(
                enc1, lstm_state3, lstm_size[2], scope='state3')
            hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4')
            hidden4, lstm_state4 = lstm_func(
                hidden3, lstm_state4, lstm_size[3], scope='state4')
            hidden4 = tf_layers.layer_norm(hidden4, scope='layer_norm5')
            enc2 = slim.layers.conv2d(
                hidden4, hidden4.get_shape()[3], [3, 3], stride=2, scope='conv3')
            enc3 = slim.layers.conv2d(
                enc2, hidden4.get_shape()[3], [1, 1], stride=1, scope='conv4')
            hidden5, lstm_state5 = lstm_func(
                enc3, lstm_state5, lstm_size[4], scope='state5')  # last 8x8
            hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm6')
            enc4 = slim.layers.conv2d_transpose(
                hidden5, hidden5.get_shape()[3], 3, stride=2, scope='convt1')
            hidden6, lstm_state6 = lstm_func(
                enc4, lstm_state6, lstm_size[5], scope='state6')  # 16x16
            hidden6 = tf_layers.layer_norm(hidden6, scope='layer_norm7')
            # Skip connection.
            hidden6 = tf.concat(axis=3, values=[hidden6, enc1])  # both 16x16

            enc5 = slim.layers.conv2d_transpose(
                hidden6, hidden6.get_shape()[3], 3, stride=2, scope='convt2')
            hidden7, lstm_state7 = lstm_func(
                enc5, lstm_state7, lstm_size[6], scope='state7')  # 32x32
            hidden7 = tf_layers.layer_norm(hidden7, scope='layer_norm8')

            # Skip connection.
            hidden7 = tf.concat(axis=3, values=[hidden7, enc0])  # both 32x32

            enc6 = slim.layers.conv2d_transpose(
                hidden7,
                hidden7.get_shape()[3], 3, stride=2, scope='convt3',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm9'})
            if dna:
                # Using largest hidden state for predicting untied conv kernels.
                enc7 = slim.layers.conv2d_transpose(
                    enc6, DNA_KERN_SIZE ** 2, 1, stride=1, scope='convt4')
            else:
                # Using largest hidden state for predicting a new image layer.
                enc7 = slim.layers.conv2d_transpose(
                    enc6, 1, 1, stride=1, scope='convt4')
                # This allows the network to also generate one image from scratch,
                # which is useful when regions of the image become unoccluded.
                transformed = [tf.nn.sigmoid(enc7)]

            if cdna:
                cdna_input = tf.reshape(hidden5, [int(batch_size), -1])
                transformed += cdna_transformation(prev_image, cdna_input, num_masks,
                                                   1)
            elif dna:
                # Only one mask is supported (more should be unnecessary).
                if num_masks != 1:
                    raise ValueError('Only one mask is supported for DNA model.')
                transformed = [dna_transformation(prev_image, enc7)]
            masks = slim.layers.conv2d_transpose(
                enc6, num_masks + 1, 1, stride=1, scope='convt7')
            masks = tf.reshape(
                tf.nn.softmax(tf.reshape(masks, [-1, num_masks + 1])),
                [int(batch_size), int(img_height), int(img_width), num_masks + 1])
            mask_list = tf.split(axis=3, num_or_size_splits=num_masks + 1, value=masks)
            output = mask_list[0] * prev_image
            for layer, mask in zip(transformed, mask_list[1:]):
                output += layer * mask
            if i >= index:
                gen_images.append(output)
    print(time.time() - stime)
    return gen_images
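
# cdna_transformation is called above but defined elsewhere. A hedged sketch of
# the CDNA operation for the single-channel call in forward() (RELU_SHIFT and
# DNA_KERN_SIZE are assumed module-level constants; a multi-channel variant
# needs care with depthwise channel ordering): predict num_masks normalized
# kernels from the flattened hidden state and apply each to prev_image as a
# depthwise convolution.
def cdna_transformation(prev_image, cdna_input, num_masks, color_channels):
    batch_size = int(cdna_input.get_shape()[0])

    # Predict kernel parameters with a linear layer, then keep them positive
    # and normalize each kernel to sum to one.
    cdna_kerns = slim.layers.fully_connected(
        cdna_input, DNA_KERN_SIZE * DNA_KERN_SIZE * num_masks,
        scope='cdna_params', activation_fn=None)
    cdna_kerns = tf.reshape(
        cdna_kerns, [batch_size, DNA_KERN_SIZE, DNA_KERN_SIZE, 1, num_masks])
    cdna_kerns = tf.nn.relu(cdna_kerns - RELU_SHIFT) + RELU_SHIFT
    cdna_kerns /= tf.reduce_sum(cdna_kerns, [1, 2, 3], keep_dims=True)

    # Apply the kernels to each batch element separately.
    cdna_kerns = tf.tile(cdna_kerns, [1, 1, 1, color_channels, 1])
    cdna_kerns = tf.split(axis=0, num_or_size_splits=batch_size, value=cdna_kerns)
    prev_images = tf.split(axis=0, num_or_size_splits=batch_size, value=prev_image)

    transformed = []
    for kernel, preimg in zip(cdna_kerns, prev_images):
        kernel = tf.squeeze(kernel)
        if len(kernel.get_shape()) == 3:
            kernel = tf.expand_dims(kernel, -2)
        transformed.append(
            tf.nn.depthwise_conv2d(preimg, kernel, [1, 1, 1, 1], 'SAME'))
    transformed = tf.concat(axis=0, values=transformed)
    return tf.split(axis=3, num_or_size_splits=num_masks, value=transformed)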
def construct_model(images,
                    actions=None,
                    states=None,
                    iter_num=-1.0,
                    k=-1,
                    use_state=True,
                    num_masks=10,
                    stp=False,
                    cdna=True,
                    dna=False,
                    context_frames=2,
                    pix_distributions=None,
                    conf=None):
    """Build convolutional lstm video predictor using STP, CDNA, or DNA.

    Args:
      images: tensor of ground truth image sequences
      actions: tensor of action sequences
      states: tensor of ground truth state sequences
      iter_num: tensor of the current training iteration (for sched. sampling)
      k: constant used for scheduled sampling. -1 to feed in own prediction.
      use_state: True to include state and action in prediction
      num_masks: the number of different pixel motion predictions (and
                 the number of masks for each of those predictions)
      stp: True to use Spatial Transformer Predictor (STP)
      cdna: True to use Convolutional Dynamic Neural Advection (CDNA)
      dna: True to use Dynamic Neural Advection (DNA)
      context_frames: number of ground truth frames to pass in before
                      feeding in own predictions
      pix_distributions: the initial one-hot distribution for designated pixels
    Returns:
      gen_images: predicted future image frames
      gen_states: predicted future states

    Raises:
      ValueError: if more than one network option specified or more than 1 mask
      specified for DNA model.
    """

    if 'dna_size' in conf:
        DNA_KERN_SIZE = conf['dna_size']
    else:
        DNA_KERN_SIZE = 5

    print('constructing network with fewer layers...')

    if stp + cdna + dna != 1:
        raise ValueError('More than one, or no network option specified.')
    batch_size, img_height, img_width, color_channels = images[0].get_shape(
    )[0:4]
    lstm_func = basic_conv_lstm_cell

    # Generated robot states and images.
    gen_states, gen_images, gen_masks = [], [], []
    current_state = states[0]
    gen_pix_distrib = []

    summaries = []

    if k == -1:
        feedself = True
    else:
        # Scheduled sampling:
        # Calculate number of ground-truth frames to pass in.
        num_ground_truth = tf.to_int32(
            tf.round(
                tf.to_float(batch_size) * (k / (k + tf.exp(iter_num / k)))))
        feedself = False

    # LSTM state sizes and states.

    if 'lstm_size' in conf:
        lstm_size = conf['lstm_size']
    else:
        lstm_size = np.int32(np.array([16, 16, 32, 32, 64, 32, 16]))

    lstm_state1, lstm_state2, lstm_state3, lstm_state4 = None, None, None, None
    lstm_state5, lstm_state6, lstm_state7 = None, None, None

    for t, (image, action) in enumerate(zip(images[:-1], actions[:-1])):
        # Reuse variables after the first timestep.
        reuse = bool(gen_images)

        done_warm_start = len(gen_images) > context_frames - 1
        with slim.arg_scope([
                lstm_func, slim.layers.conv2d, slim.layers.fully_connected,
                tf_layers.layer_norm, slim.layers.conv2d_transpose
        ],
                            reuse=reuse):

            if feedself and done_warm_start:
                # Feed in generated image.
                prev_image = gen_images[-1]
                if pix_distributions is not None:
                    prev_pix_distrib = gen_pix_distrib[-1]
            elif done_warm_start:
                # Scheduled sampling
                prev_image = scheduled_sample(image, gen_images[-1],
                                              batch_size, num_ground_truth)
            else:
                # Always feed in ground_truth
                prev_image = image
                if pix_distributions is not None:
                    prev_pix_distrib = pix_distributions[t]
                    prev_pix_distrib = tf.expand_dims(prev_pix_distrib, -1)

            if 'transform_from_firstimage' in conf:
                assert stp
                if t > 1:
                    prev_image = images[1]
                    print('using image 1')

            # Predicted state is always fed back in
            state_action = tf.concat(axis=1, values=[action, current_state])

            enc0 = slim.layers.conv2d(  #32x32x32
                prev_image,
                32, [5, 5],
                stride=2,
                scope='scale1_conv1',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm1'})

            hidden1, lstm_state1 = lstm_func(  # 32x32x16
                enc0, lstm_state1, lstm_size[0], scope='state1')
            hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')
            # hidden2, lstm_state2 = lstm_func(
            #     hidden1, lstm_state2, lstm_size[1], scope='state2')
            # hidden2 = tf_layers.layer_norm(hidden2, scope='layer_norm3')
            enc1 = slim.layers.conv2d(  # 16x16x16
                hidden1,
                hidden1.get_shape()[3], [3, 3],
                stride=2,
                scope='conv2')

            hidden3, lstm_state3 = lstm_func(  #16x16x32
                enc1, lstm_state3, lstm_size[2], scope='state3')
            hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4')
            # hidden4, lstm_state4 = lstm_func(
            #     hidden3, lstm_state4, lstm_size[3], scope='state4')
            # hidden4 = tf_layers.layer_norm(hidden4, scope='layer_norm5')
            enc2 = slim.layers.conv2d(  #8x8x32
                hidden3,
                hidden3.get_shape()[3], [3, 3],
                stride=2,
                scope='conv3')

            # Pass in state and action.
            smear = tf.reshape(
                state_action,
                [int(batch_size), 1, 1,
                 int(state_action.get_shape()[1])])
            smear = tf.tile(
                smear,
                [1, int(enc2.get_shape()[1]),
                 int(enc2.get_shape()[2]), 1])
            if use_state:
                enc2 = tf.concat(axis=3, values=[enc2, smear])
            enc3 = slim.layers.conv2d(  #8x8x32
                enc2,
                hidden3.get_shape()[3], [1, 1],
                stride=1,
                scope='conv4')

            hidden5, lstm_state5 = lstm_func(  #8x8x64
                enc3, lstm_state5, lstm_size[4], scope='state5')
            hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm6')
            enc4 = slim.layers.conv2d_transpose(  #16x16x64
                hidden5,
                hidden5.get_shape()[3],
                3,
                stride=2,
                scope='convt1')

            hidden6, lstm_state6 = lstm_func(  #16x16x32
                enc4, lstm_state6, lstm_size[5], scope='state6')
            hidden6 = tf_layers.layer_norm(hidden6, scope='layer_norm7')

            if 'noskip' not in conf:
                # Skip connection.
                hidden6 = tf.concat(axis=3, values=[hidden6, enc1])  # both 16x16

            enc5 = slim.layers.conv2d_transpose(  #32x32x32
                hidden6,
                hidden6.get_shape()[3],
                3,
                stride=2,
                scope='convt2')
            hidden7, lstm_state7 = lstm_func(  # 32x32x16
                enc5, lstm_state7, lstm_size[6], scope='state7')
            hidden7 = tf_layers.layer_norm(hidden7, scope='layer_norm8')

            if 'noskip' not in conf:
                # Skip connection.
                hidden7 = tf.concat(axis=3, values=[hidden7, enc0])  # both 32x32

            enc6 = slim.layers.conv2d_transpose(  # 64x64x16
                hidden7,
                hidden7.get_shape()[3],
                3,
                stride=2,
                scope='convt3',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm9'})

            if dna:
                # Using largest hidden state for predicting untied conv kernels.
                enc7 = slim.layers.conv2d_transpose(enc6,
                                                    DNA_KERN_SIZE**2,
                                                    1,
                                                    stride=1,
                                                    scope='convt4')
            else:
                # Using largest hidden state for predicting a new image layer.
                enc7 = slim.layers.conv2d_transpose(enc6,
                                                    color_channels,
                                                    1,
                                                    stride=1,
                                                    scope='convt4')
                # This allows the network to also generate one image from scratch,
                # which is useful when regions of the image become unoccluded.
                transformed = [tf.nn.sigmoid(enc7)]

            if stp:
                stp_input0 = tf.reshape(hidden5, [int(batch_size), -1])
                stp_input1 = slim.layers.fully_connected(stp_input0,
                                                         100,
                                                         scope='fc_stp')

                # disabling the capability to generate pixels
                reuse_stp = None
                if reuse:
                    reuse_stp = reuse
                transformed = stp_transformation(prev_image, stp_input1,
                                                 num_masks, reuse_stp)
                # transformed += stp_transformation(prev_image, stp_input1, num_masks)

                if pix_distributions is not None:
                    transf_distrib = stp_transformation(prev_pix_distrib,
                                                        stp_input1,
                                                        num_masks,
                                                        reuse=True)

            elif cdna:
                cdna_input = tf.reshape(hidden5, [int(batch_size), -1])

                new_transformed, new_cdna_filter = cdna_transformation(
                    prev_image,
                    cdna_input,
                    num_masks,
                    int(color_channels),
                    reuse_sc=reuse)
                transformed += new_transformed

                summaries += make_cdna_kerns_summary(new_cdna_filter, t,
                                                     'image')

                if pix_distributions is not None:
                    transf_distrib, new_cdna_distrib_filter = cdna_transformation(
                        prev_pix_distrib,
                        cdna_input,
                        num_masks,
                        1,
                        reuse_sc=True)
                    summaries += make_cdna_kerns_summary(
                        new_cdna_distrib_filter, t, 'distrib')

            elif dna:
                # Only one mask is supported (more should be unnecessary).
                if num_masks != 1:
                    raise ValueError(
                        'Only one mask is supported for DNA model.')
                transformed = [
                    dna_transformation(prev_image, enc7, DNA_KERN_SIZE)
                ]

            masks = slim.layers.conv2d_transpose(enc6,
                                                 num_masks + 1,
                                                 1,
                                                 stride=1,
                                                 scope='convt7')
            masks = tf.reshape(
                tf.nn.softmax(tf.reshape(masks, [-1, num_masks + 1])), [
                    int(batch_size),
                    int(img_height),
                    int(img_width), num_masks + 1
                ])
            mask_list = tf.split(axis=3, num_or_size_splits=num_masks + 1, value=masks)
            output = mask_list[0] * prev_image
            for layer, mask in zip(transformed, mask_list[1:]):
                output += layer * mask
            gen_images.append(output)
            gen_masks.append(mask_list)

            if dna and pix_distributions is not None:
                transf_distrib = [
                    dna_transformation(prev_pix_distrib, enc7, DNA_KERN_SIZE)
                ]

            if pix_distributions is not None:
                pix_distrib_output = mask_list[0] * prev_pix_distrib
                mult_list = []
                for i in range(num_masks):
                    mult_list.append(transf_distrib[i] * mask_list[i + 1])
                    pix_distrib_output += mult_list[i]

                gen_pix_distrib.append(pix_distrib_output)

            current_state = slim.layers.fully_connected(
                state_action,
                int(current_state.get_shape()[1]),
                scope='state_pred',
                activation_fn=None)
            gen_states.append(current_state)

    if pix_distributions is not None:
        return gen_images, gen_states, gen_masks, gen_pix_distrib
    else:
        return gen_images, gen_states, gen_masks, None
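
# A hypothetical sketch of preparing pix_distributions for the model above
# (shapes and the pixel location are assumptions): a one-hot map marking one
# designated pixel per sequence. The model advects it with the same kernels and
# masks as the image, so gen_pix_distrib tracks where that pixel's probability
# mass moves over time.
import numpy as np
pix = np.zeros((10, 32, 64, 64), np.float32)  # (time, batch, height, width)
pix[:, :, 20, 30] = 1.0                       # designated pixel at row 20, col 30
pix_distributions = tf.convert_to_tensor(pix)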
Example #25
0
def construct_model(images,
                    actions=None,
                    states=None,
                    iter_num=-1.0,
                    k=-1,
                    use_state=True,
                    num_masks=10,
                    stp=False,
                    cdna=True,
                    dna=False,
                    context_frames=2):
  """Build convolutional lstm video predictor using STP, CDNA, or DNA.
    使用STP,CDNA或DNA构建卷积lstm视频预测器。
  Args:
    images: tensor of ground truth image sequences
    真实图像序列张量
    actions: tensor of action sequences
    动作序列张量
    states: tensor of ground truth state sequences
    真实状态序列张量
    iter_num: tensor of the current training iteration (for sched. sampling)
    当前训练迭代的张量(用于计划采样)
    k: constant used for scheduled sampling. -1 to feed in own prediction.
    用于计划采样的常数。  -1输入自己的预测。
    use_state: True to include state and action in prediction
    确实将状态和动作包括在预测中
    num_masks: the number of different pixel motion predictions (and
               the number of masks for each of those predictions)
    不同像素运动预测的数量(以及每个预测的掩模数量)
    stp: True to use Spatial Transformer Predictor (STP)
    使用STP
    cdna: True to use Convoluational Dynamic Neural Advection (CDNA)
    使用CDNA
    dna: True to use Dynamic Neural Advection (DNA)
    使用DNA
    context_frames: number of ground truth frames to pass in before
                    feeding in own predictions
    传入真实图像的帧数,在输入自己预测之前
  Returns:
    gen_images: predicted future image frames
    预测未来图像帧
    gen_states: predicted future states
    预测未来状态
  Raises:
    ValueError: if more than one network option specified or more than 1 mask
    specified for DNA model.
    如果为DNA模型指定了多个网络选项或指定了多个掩码。 参数错误
  """
  if stp + cdna + dna != 1:
    raise ValueError('More than one, or no network option specified.')
  batch_size, img_height, img_width, color_channels = images[0].get_shape()[0:4]
  lstm_func = basic_conv_lstm_cell

  # Generated robot states and images.
  gen_states, gen_images = [], []
  current_state = states[0]

  if k == -1:
    feedself = True
  else:
    # Scheduled sampling:
    # Calculate number of ground-truth frames to pass in.
    num_ground_truth = tf.to_int32(
        tf.round(tf.to_float(batch_size) * (k / (k + tf.exp(iter_num / k)))))
    feedself = False

  # LSTM state sizes and states.
  lstm_size = np.int32(np.array([32, 32, 64, 64, 128, 64, 32]))
  lstm_state1, lstm_state2, lstm_state3, lstm_state4 = None, None, None, None
  lstm_state5, lstm_state6, lstm_state7 = None, None, None

  for image, action in zip(images[:-1], actions[:-1]):
    # Reuse variables after the first timestep.
    reuse = bool(gen_images)

    done_warm_start = len(gen_images) > context_frames - 1
    with slim.arg_scope(
        [lstm_func, slim.layers.conv2d, slim.layers.fully_connected,
         tf_layers.layer_norm, slim.layers.conv2d_transpose],
        reuse=reuse):

      if feedself and done_warm_start:
        # Feed in generated image.
        prev_image = gen_images[-1]
      elif done_warm_start:
        # Scheduled sampling
        prev_image = scheduled_sample(image, gen_images[-1], batch_size,
                                      num_ground_truth)
      else:
        # Always feed in ground_truth
        prev_image = image

      # Predicted state is always fed back in
      state_action = tf.concat(axis=1, values=[action, current_state])

      enc0 = slim.layers.conv2d(
          prev_image,
          32, [5, 5],
          stride=2,
          scope='scale1_conv1',
          normalizer_fn=tf_layers.layer_norm,
          normalizer_params={'scope': 'layer_norm1'})

      hidden1, lstm_state1 = lstm_func(
          enc0, lstm_state1, lstm_size[0], scope='state1')
      hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')
      hidden2, lstm_state2 = lstm_func(
          hidden1, lstm_state2, lstm_size[1], scope='state2')
      hidden2 = tf_layers.layer_norm(hidden2, scope='layer_norm3')
      enc1 = slim.layers.conv2d(
          hidden2, hidden2.get_shape()[3], [3, 3], stride=2, scope='conv2')

      hidden3, lstm_state3 = lstm_func(
          enc1, lstm_state3, lstm_size[2], scope='state3')
      hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4')
      hidden4, lstm_state4 = lstm_func(
          hidden3, lstm_state4, lstm_size[3], scope='state4')
      hidden4 = tf_layers.layer_norm(hidden4, scope='layer_norm5')
      enc2 = slim.layers.conv2d(
          hidden4, hidden4.get_shape()[3], [3, 3], stride=2, scope='conv3')

      # Pass in state and action.
      smear = tf.reshape(
          state_action,
          [int(batch_size), 1, 1, int(state_action.get_shape()[1])])
      smear = tf.tile(
          smear, [1, int(enc2.get_shape()[1]), int(enc2.get_shape()[2]), 1])
      if use_state:
        enc2 = tf.concat(axis=3, values=[enc2, smear])
      enc3 = slim.layers.conv2d(
          enc2, hidden4.get_shape()[3], [1, 1], stride=1, scope='conv4')

      hidden5, lstm_state5 = lstm_func(
          enc3, lstm_state5, lstm_size[4], scope='state5')  # last 8x8
      hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm6')
      enc4 = slim.layers.conv2d_transpose(
          hidden5, hidden5.get_shape()[3], 3, stride=2, scope='convt1')

      hidden6, lstm_state6 = lstm_func(
          enc4, lstm_state6, lstm_size[5], scope='state6')  # 16x16
      hidden6 = tf_layers.layer_norm(hidden6, scope='layer_norm7')
      # Skip connection.
      hidden6 = tf.concat(axis=3, values=[hidden6, enc1])  # both 16x16

      enc5 = slim.layers.conv2d_transpose(
          hidden6, hidden6.get_shape()[3], 3, stride=2, scope='convt2')
      hidden7, lstm_state7 = lstm_func(
          enc5, lstm_state7, lstm_size[6], scope='state7')  # 32x32
      hidden7 = tf_layers.layer_norm(hidden7, scope='layer_norm8')

      # Skip connection.
      hidden7 = tf.concat(axis=3, values=[hidden7, enc0])  # both 32x32

      enc6 = slim.layers.conv2d_transpose(
          hidden7,
          hidden7.get_shape()[3], 3, stride=2, scope='convt3',
          normalizer_fn=tf_layers.layer_norm,
          normalizer_params={'scope': 'layer_norm9'})

      if dna:
        # Using largest hidden state for predicting untied conv kernels.
        enc7 = slim.layers.conv2d_transpose(
            enc6, DNA_KERN_SIZE**2, 1, stride=1, scope='convt4')
      else:
        # Using largest hidden state for predicting a new image layer.

        enc7 = slim.layers.conv2d_transpose(
            enc6, color_channels, 1, stride=1, scope='convt4')
        # This allows the network to also generate one image from scratch,
        # which is useful when regions of the image become unoccluded.
        transformed = [tf.nn.sigmoid(enc7)]

      if stp:
        stp_input0 = tf.reshape(hidden5, [int(batch_size), -1])
        stp_input1 = slim.layers.fully_connected(
            stp_input0, 100, scope='fc_stp')
        transformed += stp_transformation(prev_image, stp_input1, num_masks)
      elif cdna:
        cdna_input = tf.reshape(hidden5, [int(batch_size), -1])
        transformed += cdna_transformation(prev_image, cdna_input, num_masks,
                                           int(color_channels))
      elif dna:
        # Only one mask is supported (more should be unnecessary).
        if num_masks != 1:
          raise ValueError('Only one mask is supported for DNA model.')
        transformed = [dna_transformation(prev_image, enc7)]

      masks = slim.layers.conv2d_transpose(
          enc6, num_masks + 1, 1, stride=1, scope='convt7')
      masks = tf.reshape(
          tf.nn.softmax(tf.reshape(masks, [-1, num_masks + 1])),
          [int(batch_size), int(img_height), int(img_width), num_masks + 1])
      mask_list = tf.split(axis=3, num_or_size_splits=num_masks + 1, value=masks)
      output = mask_list[0] * prev_image
      for layer, mask in zip(transformed, mask_list[1:]):
        output += layer * mask
      gen_images.append(output)

      current_state = slim.layers.fully_connected(
          state_action,
          int(current_state.get_shape()[1]),
          scope='state_pred',
          activation_fn=None)
      gen_states.append(current_state)

  return gen_images, gen_states
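
# A hypothetical end-to-end sketch for the predictor above (shapes, k, and the
# optimizer are assumptions): unstack a video batch into per-frame tensors,
# build the model, and regress the generated frames onto the true future
# frames. gen_images[i] predicts images[i + 1], so the loss skips the warm-up
# context frames.
images = tf.unstack(tf.placeholder(tf.float32, [10, 32, 64, 64, 3]), axis=0)
actions = tf.unstack(tf.placeholder(tf.float32, [10, 32, 4]), axis=0)
states = tf.unstack(tf.placeholder(tf.float32, [10, 32, 4]), axis=0)
iter_num = tf.placeholder(tf.float32, [])

gen_images, gen_states = construct_model(
    images, actions, states, iter_num=iter_num, k=900.0, context_frames=2,
    cdna=True)

loss = tf.add_n([tf.reduce_mean(tf.square(x - gx))
                 for x, gx in zip(images[2:], gen_images[1:])])
train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)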