Example #1
  def __init__(self, n_components, n_frames_output, n_channels, image_size,
               image_latent_size, hidden_size, ngf, output_size, independent_components):
    super(Encoder, self).__init__()

    n_layers = int(np.log2(image_size)) - 1
    self.image_encoder = ImageEncoder(n_channels, image_latent_size, ngf, n_layers)
    # Encoder
    self.encode_rnn = nn.LSTM(image_latent_size + hidden_size, hidden_size,
                              num_layers=1, batch_first=True)
    # if independent_components:
    #   predict_input_size = hidden_size
    # else:
    #   predict_input_size = hidden_size * 2
    # self.predict_rnn = nn.LSTM(predict_input_size, hidden_size, num_layers=1, batch_first=True)

    # Beta
    self.fc_layer = nn.Linear(hidden_size, output_size)
    self.bnorm = nn.BatchNorm1d(output_size, affine=False)

    # Initial pose
    # self.initial_pose_rnn = nn.LSTM(hidden_size, hidden_size, num_layers=1, batch_first=True)
    # self.initial_pose_mu = nn.Linear(hidden_size, output_size)
    # self.initial_pose_sigma = nn.Linear(hidden_size, output_size)

    self.n_components = n_components
    self.n_frames_output = n_frames_output
    self.image_latent_size = image_latent_size
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.independent_components = independent_components
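
The n_layers formula above sizes the image encoder from the input resolution. A quick check of what it yields, assuming (as is common in DCGAN-style encoders, though ImageEncoder itself is not shown here) that each layer halves the spatial resolution:

import numpy as np

# For power-of-two inputs, log2(image_size) - 1 stride-2 layers reduce the
# spatial size to 2x2 (the halving-per-layer is an assumption about
# ImageEncoder, which is not part of this excerpt).
for image_size in (32, 64, 128):
    n_layers = int(np.log2(image_size)) - 1
    print(image_size, n_layers, image_size // 2 ** n_layers)
# -> 32 4 2 / 64 5 2 / 128 6 2
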
Example #2
  def __init__(self, n_frames_input, n_frames_output, n_channels, image_size,
               feat_latent_size, time_enc_size, t_enc_rnn_hidden_size, trans_rnn_hidden_size, manifold_size, ngf):

    super(ManifoldEncoder, self).__init__()
    n_layers = int(np.log2(image_size)) - 1

    self.image_encoding_flag = False  # Whether we work with videos.
    self.full_time_enc_flag = False  # Whether we use all hidden vectors for the time encoding.

    if self.image_encoding_flag:
      # Option 1: DDPAE image encoder.
      self.image_encoder = ImageEncoder(n_channels, feat_latent_size, ngf, n_layers)
      # Option 2: a pretrained ResNet-18 as the feature extractor
      # pretrained_resnet = models.resnet18(pretrained=True)
      # self.image_encoder = nn.Sequential(*list(pretrained_resnet.children())[:-1])
      # Option 3: Good encoder for MNIST
      # self.image_encoder = ImageEncoder([1, image_size, image_size], feat_latent_size)
      # Option 4: Pointnet++
      # Option 5: otherwise, no feature extraction (toy cases)

    # Time encoding
    self.time_enc_rnn = nn.GRU(feat_latent_size, t_enc_rnn_hidden_size,
                               num_layers=1, batch_first=True, bidirectional=True)
    if self.full_time_enc_flag:
      self.time_enc_fc = nn.Linear(t_enc_rnn_hidden_size * 2 * n_frames_input, time_enc_size)
    else:
      self.time_enc_fc = nn.Linear(t_enc_rnn_hidden_size * 2, time_enc_size)

    # Transition rnn
    self.trans_rnn = nn.LSTMCell(feat_latent_size + time_enc_size + manifold_size,
                                 trans_rnn_hidden_size)  # TODO: switch to nn.LSTM as in DDPAE (why a cell?)
    self.y_mu = nn.Linear(trans_rnn_hidden_size, manifold_size)
    self.y_sigma = nn.Linear(trans_rnn_hidden_size, manifold_size)

    # Initial conditions
    # Option 1: RNN + fc
    # self.y0_rnn = nn.LSTM(feat_latent_size + time_enc_size, trans_rnn_hidden_size,
    #                                 num_layers=1, batch_first=True)
    # self.y0_mu = nn.Linear(trans_rnn_hidden_size, manifold_size)
    # self.y0_sigma = nn.Linear(trans_rnn_hidden_size, manifold_size)
    # Option 2: fc
    self.y0_mu = nn.Linear(feat_latent_size + time_enc_size, manifold_size)
    self.y0_sigma = nn.Linear(feat_latent_size + time_enc_size, manifold_size)

    # Prior encoder (backwards prediction). Note: ignored in this version.
    # self.n_prior = 0
    # self.n_window = 9
    # self.prior_rnn = nn.LSTMCell(manifold_size, trans_rnn_hidden_size)
    # self.prior_fc = nn.Linear(trans_rnn_hidden_size, manifold_size)

    self.input_size = image_size
    self.n_frames_input = n_frames_input
    self.n_frames_output = n_frames_output
    self.feat_latent_size = feat_latent_size
    self.time_enc_size = time_enc_size
    self.t_enc_rnn_hidden_size = t_enc_rnn_hidden_size
    self.trans_rnn_hidden_size = trans_rnn_hidden_size
    self.manifold_size = manifold_size
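
The two branches for time_enc_fc above correspond to flattening all GRU outputs versus using only the final hidden state. A self-contained shape check, with illustrative sizes that are not taken from the source:

import torch
import torch.nn as nn

B, T, F, H = 4, 10, 16, 32                  # batch, frames, feat size, hidden size
rnn = nn.GRU(F, H, num_layers=1, batch_first=True, bidirectional=True)
out, h_n = rnn(torch.randn(B, T, F))        # out: (B, T, 2*H), h_n: (2, B, H)
full = out.reshape(B, -1)                   # (B, T * 2 * H) -> full_time_enc_flag=True
last = torch.cat([h_n[0], h_n[1]], dim=1)   # (B, 2 * H)     -> full_time_enc_flag=False
print(full.shape, last.shape)               # torch.Size([4, 640]) torch.Size([4, 64])
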
Example #3
    def setup_networks(self):
        '''
        Networks for DDPAE.
        '''
        self.nets = {}
        # These will be registered in model() and guide() with pyro.module().
        self.model_modules = {}
        self.guide_modules = {}

        # Backbone, Pose RNN
        pose_model = PoseRNN(self.n_components, self.n_frames_output,
                             self.n_channels, self.image_size,
                             self.image_latent_size, self.hidden_size,
                             self.ngf, self.pose_latent_size,
                             self.independent_components)
        self.pose_model = nn.DataParallel(pose_model.cuda())

        self.nets['pose_model'] = self.pose_model
        self.guide_modules['pose_model'] = self.pose_model

        # Content LSTM
        content_lstm = SequenceEncoder(self.content_latent_size,
                                       self.hidden_size,
                                       self.content_latent_size * 2)
        # Note: content_latent_size = 128 (input of SequenceEncoder);
        # the output is 2 * input, presumably mu and sigma of the content latent.
        self.content_lstm = nn.DataParallel(content_lstm.cuda())
        self.nets['content_lstm'] = self.content_lstm
        self.model_modules['content_lstm'] = self.content_lstm

        # Image encoder and decoder
        n_layers = int(np.log2(self.object_size)) - 1
        object_encoder = ImageEncoder(self.n_channels,
                                      self.content_latent_size, self.ngf,
                                      n_layers)
        object_decoder = ImageDecoder(self.content_latent_size,
                                      self.n_channels, self.ngf, n_layers,
                                      'sigmoid')
        self.object_encoder = nn.DataParallel(object_encoder.cuda())
        self.object_decoder = nn.DataParallel(object_decoder.cuda())
        self.nets.update({
            'object_encoder': self.object_encoder,
            'object_decoder': self.object_decoder
        })
        self.model_modules['decoder'] = self.object_decoder
        self.guide_modules['encoder'] = self.object_encoder
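
The comment near the top of this example notes that these dicts are registered in model() and guide() with pyro.module(). A minimal sketch of that registration step; the actual model()/guide() bodies are not part of this excerpt:

import pyro

def register(modules):
    # pyro.module adds each nn.Module's parameters to Pyro's param store,
    # so the SVI optimizer can update them.
    for name, net in modules.items():
        pyro.module(name, net)

# e.g. register(self.model_modules) at the top of model(),
#      register(self.guide_modules) at the top of guide()
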
Example #4
    def __init__(self, opt):
        super().__init__()
        self.opt = opt
        nf = opt.ngf

        self.sw, self.sh = self.compute_latent_vector_size(opt)

        if opt.use_vae:
            # In the VAE case, we sample from a random z vector
            self.fc = nn.Linear(opt.z_dim, 16 * nf * self.sw * self.sh)
        elif opt.use_encoder:
            # In the encoder case, we encode the input image
            if self.opt.Image_encoder_mode == 'norm':
                self.fc = ImageEncoder(opt, self.sw, self.sh)
            elif self.opt.Image_encoder_mode == 'instance':
                self.fc = ImageEncoder2(opt, self.sw, self.sh)
            elif self.opt.Image_encoder_mode == 'partialconv':
                self.fc = ImageEncoder3(opt, self.sw, self.sh)
        else:
            # Otherwise, we make the network deterministic by starting with
            # a downsampled segmentation map instead of random z
            # (the opt.no_orientation branches are identical, so no check
            # is needed here).
            # self.fc = nn.Conv2d(self.opt.semantic_nc, 16 * nf, 3, padding=1)  # for mask input
            self.fc = nn.Conv2d(3, 16 * nf, 3, padding=1)  # for image input

        self.head_0 = SPADEResnetBlock(16 * nf, 16 * nf, opt)

        self.G_middle_0 = SPADEResnetBlock(16 * nf, 16 * nf, opt)
        self.G_middle_1 = SPADEResnetBlock(16 * nf, 16 * nf, opt)

        self.up_0 = SPADEResnetBlock(16 * nf, 8 * nf, opt)
        self.up_1 = SPADEResnetBlock(8 * nf, 4 * nf, opt)
        self.up_2 = SPADEResnetBlock(4 * nf, 2 * nf, opt)
        self.up_3 = SPADEResnetBlock(2 * nf, 1 * nf, opt)

        final_nc = nf

        if opt.num_upsampling_layers == 'most':
            self.up_4 = SPADEResnetBlock(1 * nf, nf // 2, opt)
            final_nc = nf // 2

        self.conv_img = nn.Conv2d(final_nc, 3, 3, padding=1)

        self.up = nn.Upsample(scale_factor=2)

        if not self.opt.noise_background:
            self.backgroud_enc = BackgroundEncode(opt)
        else:
            self.backgroud_enc = BackgroundEncode2(opt)
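
In the VAE branch, fc projects z to a flat vector that is reshaped into the 16 * nf-channel feature map consumed by head_0. A minimal shape sketch; the sizes below are illustrative assumptions, and the real sw/sh come from compute_latent_vector_size, which is not shown:

import torch
import torch.nn as nn

B, z_dim, nf, sw, sh = 2, 256, 64, 8, 4        # illustrative values only
fc = nn.Linear(z_dim, 16 * nf * sw * sh)
z = torch.randn(B, z_dim)
x = fc(z).view(B, 16 * nf, sh, sw)             # starting feature map for head_0
print(x.shape)                                 # torch.Size([2, 1024, 4, 8])
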
Example #5
    def __init__(self, n_components, n_frames_input, n_frames_output,
                 n_channels, image_size, image_latent_size, hidden_size, ngf,
                 output_size, independent_components):
        super(PoseRNN, self).__init__()

        n_layers = int(np.log2(image_size)) - 1
        self.image_encoder = ImageEncoder(n_channels, image_latent_size, ngf,
                                          n_layers)
        # Encoder
        # TODO: they run a full nn.LSTM for a single step at a time, not an nn.LSTMCell
        self.encode_rnn = nn.LSTM(image_latent_size + hidden_size,
                                  hidden_size,
                                  num_layers=1,
                                  batch_first=True)
        if independent_components:
            predict_input_size = hidden_size
        else:
            predict_input_size = hidden_size * 2
        self.predict_rnn = nn.LSTM(predict_input_size,
                                   hidden_size,
                                   num_layers=1,
                                   batch_first=True)

        # Beta
        self.beta_mu_layer = nn.Linear(hidden_size, output_size)
        self.beta_sigma_layer = nn.Linear(hidden_size, output_size)

        # Initial pose
        self.initial_pose_rnn = nn.LSTM(hidden_size,
                                        hidden_size,
                                        num_layers=1,
                                        batch_first=True)
        self.initial_pose_mu = nn.Linear(hidden_size, output_size)
        self.initial_pose_sigma = nn.Linear(hidden_size, output_size)

        self.labels_rnn = nn.LSTM(
            hidden_size, hidden_size, num_layers=1,
            batch_first=True)  # TODO: consider bidirectional=True
        self.labels_mu_layer = nn.Linear(hidden_size, 1)
        self.labels_sigma_layer = nn.Linear(hidden_size, 1)
        # self.labels_mu_layer_2 = nn.Linear(hidden_size//2, 1)
        # self.labels_sigma_layer_2 = nn.Linear(hidden_size//2, 1)

        self.n_components = n_components
        self.n_frames_output = n_frames_output
        self.n_frames_input = n_frames_input
        self.image_latent_size = image_latent_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.independent_components = independent_components
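
The paired mu/sigma heads above suggest the usual reparameterization trick. A plausible sampling step for the beta heads; the actual forward pass is not part of this excerpt, and reading the sigma head's output as a log-variance is an assumption:

import torch

def sample_beta(self, hidden):
    # hidden: (batch, hidden_size), e.g. the output of predict_rnn
    mu = self.beta_mu_layer(hidden)
    logvar = self.beta_sigma_layer(hidden)     # treated as log-variance (assumption)
    return mu + torch.exp(0.5 * logvar) * torch.randn_like(mu)
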
Example #6
    def __init__(self, n_components, n_frames_output, n_channels, image_size,
                 image_latent_size, hidden_size, ngf, output_size,
                 independent_components):
        super(PoseRNN, self).__init__()

        n_layers = int(np.log2(image_size)) - 1
        self.image_encoder = ImageEncoder(n_channels, image_latent_size, ngf,
                                          n_layers)
        # Encoder
        self.encode_rnn = nn.LSTM(image_latent_size + hidden_size,
                                  hidden_size,
                                  num_layers=1,
                                  batch_first=True)
        if independent_components:
            predict_input_size = hidden_size
        else:
            predict_input_size = hidden_size * 2
        self.predict_rnn = nn.LSTM(predict_input_size,
                                   hidden_size,
                                   num_layers=1,
                                   batch_first=True)

        # Beta
        self.beta_mu_layer = nn.Linear(hidden_size, output_size)
        self.beta_sigma_layer = nn.Linear(hidden_size, output_size)

        # Initial pose
        self.initial_pose_rnn = nn.LSTM(hidden_size,
                                        hidden_size,
                                        num_layers=1,
                                        batch_first=True)
        self.initial_pose_mu = nn.Linear(hidden_size, output_size)
        self.initial_pose_sigma = nn.Linear(hidden_size, output_size)

        self.n_components = n_components
        self.n_frames_output = n_frames_output
        self.image_latent_size = image_latent_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.independent_components = independent_components
Example #7
    def __init__(self, n_frames_input, n_frames_output, n_channels, image_size,
                 feat_latent_size, time_enc_size, t_enc_rnn_hidden_size,
                 trans_rnn_hidden_size, manifold_size, ngf):

        super(Encoder, self).__init__()
        n_layers = int(np.log2(image_size)) - 1

        self.image_encoding_flag = False  # Whether we work with videos.
        self.full_time_enc_flag = False  # Whether we use all hidden vectors for the time encoding.

        if self.image_encoding_flag:
            # Option 1: DDPAE image encoder.
            self.image_encoder = ImageEncoder(n_channels, feat_latent_size,
                                              ngf, n_layers)
            # Option 2: a pretrained ResNet-18 as the feature extractor
            # pretrained_resnet = models.resnet18(pretrained=True)
            # self.image_encoder = nn.Sequential(*list(pretrained_resnet.children())[:-1])
            # Option 3: Good encoder for MNIST
            # self.image_encoder = ImageEncoder([1, image_size, image_size], feat_latent_size)
            # Option 4: Pointnet++
        else:
            # Option 5: Toy feature extraction
            self.feat_mu = nn.Linear(feat_latent_size, feat_latent_size)
            self.feat_sigma = nn.Linear(feat_latent_size, feat_latent_size)

        # Time encoding
        self.time_enc_rnn = nn.GRU(feat_latent_size,
                                   t_enc_rnn_hidden_size,
                                   num_layers=1,
                                   batch_first=True,
                                   bidirectional=True)
        if self.full_time_enc_flag:
            self.time_enc_mu = nn.Linear(
                t_enc_rnn_hidden_size * 2 * n_frames_input, time_enc_size)
            self.time_enc_sigma = nn.Linear(
                t_enc_rnn_hidden_size * 2 * n_frames_input, time_enc_size)
        else:
            self.time_enc_mu = nn.Linear(t_enc_rnn_hidden_size * 2,
                                         time_enc_size)
            self.time_enc_sigma = nn.Linear(t_enc_rnn_hidden_size * 2,
                                            time_enc_size)

        # Initial conditions
        # Option 1: RNN + fc
        # self.y0_rnn = nn.LSTM(feat_latent_size + time_enc_size, trans_rnn_hidden_size,
        #                                 num_layers=1, batch_first=True)
        # self.y0_mu = nn.Linear(trans_rnn_hidden_size, manifold_size)
        # self.y0_sigma = nn.Linear(trans_rnn_hidden_size, manifold_size)
        # Option 2: fc
        self.y0_mu = nn.Linear(feat_latent_size + time_enc_size, manifold_size)
        self.y0_sigma = nn.Linear(feat_latent_size + time_enc_size,
                                  manifold_size)

        self.input_size = image_size
        self.n_frames_input = n_frames_input
        self.n_frames_output = n_frames_output
        self.feat_latent_size = feat_latent_size
        self.time_enc_size = time_enc_size
        self.t_enc_rnn_hidden_size = t_enc_rnn_hidden_size
        self.trans_rnn_hidden_size = trans_rnn_hidden_size
        self.manifold_size = manifold_size
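
Consistent with the in-features of y0_mu/y0_sigma above, the initial-condition heads expect the per-frame features concatenated with the time encoding. A plausible forward step; the actual forward method is not part of this excerpt:

import torch

def initial_condition(self, feat, time_enc):
    # feat: (B, feat_latent_size), time_enc: (B, time_enc_size)
    h = torch.cat([feat, time_enc], dim=1)     # (B, feat_latent_size + time_enc_size)
    return self.y0_mu(h), self.y0_sigma(h)
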