def __init__(self, n_components, n_frames_output, n_channels, image_size,
             image_latent_size, hidden_size, ngf, output_size,
             independent_components):
    super(Encoder, self).__init__()
    n_layers = int(np.log2(image_size)) - 1
    self.image_encoder = ImageEncoder(n_channels, image_latent_size, ngf, n_layers)

    # Encoder
    self.encode_rnn = nn.LSTM(image_latent_size + hidden_size, hidden_size,
                              num_layers=1, batch_first=True)
    # if independent_components:
    #     predict_input_size = hidden_size
    # else:
    #     predict_input_size = hidden_size * 2
    # self.predict_rnn = nn.LSTM(predict_input_size, hidden_size,
    #                            num_layers=1, batch_first=True)

    # Beta
    self.fc_layer = nn.Linear(hidden_size, output_size)
    self.bnorm = nn.BatchNorm1d(output_size, affine=False)

    # Initial pose
    # self.initial_pose_rnn = nn.LSTM(hidden_size, hidden_size,
    #                                 num_layers=1, batch_first=True)
    # self.initial_pose_mu = nn.Linear(hidden_size, output_size)
    # self.initial_pose_sigma = nn.Linear(hidden_size, output_size)

    self.n_components = n_components
    self.n_frames_output = n_frames_output
    self.image_latent_size = image_latent_size
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.independent_components = independent_components
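# Hedged shape sketch (not from the source): encode_rnn above takes the image
# latent concatenated with a hidden-size vector, presumably one frame at a
# time inside forward(). All sizes below are illustrative.
import torch
import torch.nn as nn

image_latent_size, hidden_size, batch = 32, 64, 4
encode_rnn = nn.LSTM(image_latent_size + hidden_size, hidden_size,
                     num_layers=1, batch_first=True)
latent = torch.randn(batch, 1, image_latent_size)           # one encoded frame
prev = torch.zeros(batch, 1, hidden_size)                   # assumed: previous hidden output
out, (h, c) = encode_rnn(torch.cat([latent, prev], dim=2))  # single-step call
assert out.shape == (batch, 1, hidden_size)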
def __init__(self, n_frames_input, n_frames_output, n_channels, image_size,
             feat_latent_size, time_enc_size, t_enc_rnn_hidden_size,
             trans_rnn_hidden_size, manifold_size, ngf):
    super(ManifoldEncoder, self).__init__()
    n_layers = int(np.log2(image_size)) - 1
    self.image_encoding_flag = False  # Whether we work with videos.
    self.full_time_enc_flag = False  # Whether we use all hidden vectors for the time encoding.

    if self.image_encoding_flag:
        # Option 1: DDPAE image encoder.
        self.image_encoder = ImageEncoder(n_channels, feat_latent_size, ngf, n_layers)
        # Option 2: ResNet-18 as feature extractor.
        # pretrained_resnet = models.resnet18(pretrained=True)
        # self.image_encoder = nn.Sequential(*list(pretrained_resnet.children())[:-1])
        # Option 3: Encoder suited to MNIST.
        # self.image_encoder = ImageEncoder([1, image_size, image_size], feat_latent_size)
        # Option 4: PointNet++.
    # Option 5: otherwise, no feature extraction for toy cases.

    # Time encoding
    self.time_enc_rnn = nn.GRU(feat_latent_size, t_enc_rnn_hidden_size,
                               num_layers=1, batch_first=True, bidirectional=True)
    if self.full_time_enc_flag:
        self.time_enc_fc = nn.Linear(t_enc_rnn_hidden_size * 2 * n_frames_input,
                                     time_enc_size)
    else:
        self.time_enc_fc = nn.Linear(t_enc_rnn_hidden_size * 2, time_enc_size)

    # Transition RNN
    # TODO: change to nn.LSTM, similar to DDPAE (why an LSTMCell?).
    self.trans_rnn = nn.LSTMCell(feat_latent_size + time_enc_size + manifold_size,
                                 trans_rnn_hidden_size)
    self.y_mu = nn.Linear(trans_rnn_hidden_size, manifold_size)
    self.y_sigma = nn.Linear(trans_rnn_hidden_size, manifold_size)

    # Initial conditions
    # Option 1: RNN + fc
    # self.y0_rnn = nn.LSTM(feat_latent_size + time_enc_size, trans_rnn_hidden_size,
    #                       num_layers=1, batch_first=True)
    # self.y0_mu = nn.Linear(trans_rnn_hidden_size, manifold_size)
    # self.y0_sigma = nn.Linear(trans_rnn_hidden_size, manifold_size)
    # Option 2: fc
    self.y0_mu = nn.Linear(feat_latent_size + time_enc_size, manifold_size)
    self.y0_sigma = nn.Linear(feat_latent_size + time_enc_size, manifold_size)

    # Prior encoder (backwards prediction). Note: ignored in this version.
    # self.n_prior = 0
    # self.n_window = 9
    # self.prior_rnn = nn.LSTMCell(manifold_size, trans_rnn_hidden_size)
    # self.prior_fc = nn.Linear(trans_rnn_hidden_size, manifold_size)

    self.input_size = image_size
    self.n_frames_input = n_frames_input
    self.n_frames_output = n_frames_output
    self.feat_latent_size = feat_latent_size
    self.time_enc_size = time_enc_size
    self.t_enc_rnn_hidden_size = t_enc_rnn_hidden_size
    self.trans_rnn_hidden_size = trans_rnn_hidden_size
    self.manifold_size = manifold_size
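# Hedged sketch of the two time-encoding variants above (illustrative sizes;
# the exact pooling done in the source's forward() is an assumption):
import torch
import torch.nn as nn

batch, n_frames_input, feat, hid = 4, 10, 16, 32
rnn = nn.GRU(feat, hid, num_layers=1, batch_first=True, bidirectional=True)
out, _ = rnn(torch.randn(batch, n_frames_input, feat))
# full_time_enc_flag=True: flatten all bidirectional hidden states,
# matching in_features = t_enc_rnn_hidden_size * 2 * n_frames_input.
flat = out.reshape(batch, n_frames_input * 2 * hid)
# full_time_enc_flag=False: keep only the last time step,
# matching in_features = t_enc_rnn_hidden_size * 2.
last = out[:, -1]
assert flat.shape == (batch, n_frames_input * 2 * hid)
assert last.shape == (batch, 2 * hid)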
def setup_networks(self):
    '''
    Networks for DDPAE.
    '''
    self.nets = {}
    # These will be registered in model() and guide() with pyro.module().
    self.model_modules = {}
    self.guide_modules = {}

    # Backbone, Pose RNN
    pose_model = PoseRNN(self.n_components, self.n_frames_output, self.n_channels,
                         self.image_size, self.image_latent_size, self.hidden_size,
                         self.ngf, self.pose_latent_size, self.independent_components)
    self.pose_model = nn.DataParallel(pose_model.cuda())
    self.nets['pose_model'] = self.pose_model
    self.guide_modules['pose_model'] = self.pose_model

    # Content LSTM
    # Note: content_latent_size (input of SequenceEncoder) = 128; the output
    # is 2 * input, presumably the mu and sigma of the content latent.
    content_lstm = SequenceEncoder(self.content_latent_size, self.hidden_size,
                                   self.content_latent_size * 2)
    self.content_lstm = nn.DataParallel(content_lstm.cuda())
    self.nets['content_lstm'] = self.content_lstm
    self.model_modules['content_lstm'] = self.content_lstm

    # Image encoder and decoder
    n_layers = int(np.log2(self.object_size)) - 1
    object_encoder = ImageEncoder(self.n_channels, self.content_latent_size,
                                  self.ngf, n_layers)
    object_decoder = ImageDecoder(self.content_latent_size, self.n_channels,
                                  self.ngf, n_layers, 'sigmoid')
    self.object_encoder = nn.DataParallel(object_encoder.cuda())
    self.object_decoder = nn.DataParallel(object_decoder.cuda())
    self.nets.update({
        'object_encoder': self.object_encoder,
        'object_decoder': self.object_decoder
    })
    self.model_modules['decoder'] = self.object_decoder
    self.guide_modules['encoder'] = self.object_encoder
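# Hedged sketch (assumption drawn from the comment above, not copied from the
# source): model_modules/guide_modules are typically consumed inside the Pyro
# model/guide so the modules' parameters are trained by the SVI objective.
import pyro

def model(self):
    for name, net in self.model_modules.items():
        pyro.module(name, net)  # registers the nn.Module's parameters with Pyro
    # ... generative model continues here ...

def guide(self):
    for name, net in self.guide_modules.items():
        pyro.module(name, net)
    # ... variational guide continues here ...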
def __init__(self, opt):
    super().__init__()
    self.opt = opt
    nf = opt.ngf

    self.sw, self.sh = self.compute_latent_vector_size(opt)

    if opt.use_vae:
        # In the VAE case, we sample from a random z vector.
        self.fc = nn.Linear(opt.z_dim, 16 * nf * self.sw * self.sh)
    elif opt.use_encoder:
        # In the encoder case, we encode the image.
        if self.opt.Image_encoder_mode == 'norm':
            self.fc = ImageEncoder(opt, self.sw, self.sh)
        elif self.opt.Image_encoder_mode == 'instance':
            self.fc = ImageEncoder2(opt, self.sw, self.sh)
        elif self.opt.Image_encoder_mode == 'partialconv':
            self.fc = ImageEncoder3(opt, self.sw, self.sh)
    else:
        # Otherwise, make the network deterministic by starting from a
        # downsampled segmentation map instead of a random z.
        if not opt.no_orientation:
            # self.fc = nn.Conv2d(self.opt.semantic_nc, 16 * nf, 3, padding=1)  # for mask input
            self.fc = nn.Conv2d(3, 16 * nf, 3, padding=1)  # for image input
        else:
            # self.fc = nn.Conv2d(self.opt.semantic_nc, 16 * nf, 3, padding=1)  # for mask input
            self.fc = nn.Conv2d(3, 16 * nf, 3, padding=1)  # for image input

    self.head_0 = SPADEResnetBlock(16 * nf, 16 * nf, opt)

    self.G_middle_0 = SPADEResnetBlock(16 * nf, 16 * nf, opt)
    self.G_middle_1 = SPADEResnetBlock(16 * nf, 16 * nf, opt)

    self.up_0 = SPADEResnetBlock(16 * nf, 8 * nf, opt)
    self.up_1 = SPADEResnetBlock(8 * nf, 4 * nf, opt)
    self.up_2 = SPADEResnetBlock(4 * nf, 2 * nf, opt)
    self.up_3 = SPADEResnetBlock(2 * nf, 1 * nf, opt)

    final_nc = nf
    if opt.num_upsampling_layers == 'most':
        self.up_4 = SPADEResnetBlock(1 * nf, nf // 2, opt)
        final_nc = nf // 2

    self.conv_img = nn.Conv2d(final_nc, 3, 3, padding=1)
    self.up = nn.Upsample(scale_factor=2)

    if not self.opt.noise_background:
        self.backgroud_enc = BackgroundEncode(opt)
    else:
        self.backgroud_enc = BackgroundEncode2(opt)
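# Hedged sketch of how the VAE branch's fc typically seeds the SPADE
# upsampling stack: project z, reshape to a (16*nf, sh, sw) feature map, then
# upsample between blocks. Sizes are illustrative, not from the source.
import torch
import torch.nn as nn

batch, z_dim, nf, sw, sh = 2, 256, 64, 8, 4
fc = nn.Linear(z_dim, 16 * nf * sw * sh)
z = torch.randn(batch, z_dim)                 # random latent
x = fc(z).view(-1, 16 * nf, sh, sw)           # seed feature map for head_0
up = nn.Upsample(scale_factor=2)              # applied between up_* blocks
assert up(x).shape == (batch, 16 * nf, 2 * sh, 2 * sw)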
def __init__(self, n_components, n_frames_input, n_frames_output, n_channels,
             image_size, image_latent_size, hidden_size, ngf, output_size,
             independent_components):
    super(PoseRNN, self).__init__()
    n_layers = int(np.log2(image_size)) - 1
    self.image_encoder = ImageEncoder(n_channels, image_latent_size, ngf, n_layers)

    # Encoder
    # TODO: they use a full nn.LSTM for a single step at each time, not an LSTMCell.
    self.encode_rnn = nn.LSTM(image_latent_size + hidden_size, hidden_size,
                              num_layers=1, batch_first=True)
    if independent_components:
        predict_input_size = hidden_size
    else:
        predict_input_size = hidden_size * 2
    self.predict_rnn = nn.LSTM(predict_input_size, hidden_size,
                               num_layers=1, batch_first=True)

    # Beta
    self.beta_mu_layer = nn.Linear(hidden_size, output_size)
    self.beta_sigma_layer = nn.Linear(hidden_size, output_size)

    # Initial pose
    self.initial_pose_rnn = nn.LSTM(hidden_size, hidden_size,
                                    num_layers=1, batch_first=True)
    self.initial_pose_mu = nn.Linear(hidden_size, output_size)
    self.initial_pose_sigma = nn.Linear(hidden_size, output_size)

    # Labels
    self.labels_rnn = nn.LSTM(hidden_size, hidden_size, num_layers=1,
                              batch_first=True)  # TODO: bidirectional=True
    self.labels_mu_layer = nn.Linear(hidden_size, 1)
    self.labels_sigma_layer = nn.Linear(hidden_size, 1)
    # self.labels_mu_layer_2 = nn.Linear(hidden_size // 2, 1)
    # self.labels_sigma_layer_2 = nn.Linear(hidden_size // 2, 1)

    self.n_components = n_components
    self.n_frames_output = n_frames_output
    self.n_frames_input = n_frames_input
    self.image_latent_size = image_latent_size
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.independent_components = independent_components
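# Illustration of the TODO above: a 1-layer nn.LSTM applied to a length-1
# sequence computes the same update as an nn.LSTMCell with shared weights,
# so the two are interchangeable per step (sizes are illustrative):
import torch
import torch.nn as nn

inp, hid, batch = 16, 32, 4
lstm = nn.LSTM(inp, hid, num_layers=1, batch_first=True)
cell = nn.LSTMCell(inp, hid)
cell.weight_ih.data.copy_(lstm.weight_ih_l0.data)  # share all weights
cell.weight_hh.data.copy_(lstm.weight_hh_l0.data)
cell.bias_ih.data.copy_(lstm.bias_ih_l0.data)
cell.bias_hh.data.copy_(lstm.bias_hh_l0.data)

x = torch.randn(batch, inp)
out, (h, c) = lstm(x.unsqueeze(1))  # seq_len == 1, zero initial state
h2, c2 = cell(x)                    # same update from zero initial state
assert torch.allclose(h.squeeze(0), h2, atol=1e-6)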
def __init__(self, n_components, n_frames_output, n_channels, image_size,
             image_latent_size, hidden_size, ngf, output_size,
             independent_components):
    super(PoseRNN, self).__init__()
    n_layers = int(np.log2(image_size)) - 1
    self.image_encoder = ImageEncoder(n_channels, image_latent_size, ngf, n_layers)

    # Encoder
    self.encode_rnn = nn.LSTM(image_latent_size + hidden_size, hidden_size,
                              num_layers=1, batch_first=True)
    if independent_components:
        predict_input_size = hidden_size
    else:
        predict_input_size = hidden_size * 2
    self.predict_rnn = nn.LSTM(predict_input_size, hidden_size,
                               num_layers=1, batch_first=True)

    # Beta
    self.beta_mu_layer = nn.Linear(hidden_size, output_size)
    self.beta_sigma_layer = nn.Linear(hidden_size, output_size)

    # Initial pose
    self.initial_pose_rnn = nn.LSTM(hidden_size, hidden_size,
                                    num_layers=1, batch_first=True)
    self.initial_pose_mu = nn.Linear(hidden_size, output_size)
    self.initial_pose_sigma = nn.Linear(hidden_size, output_size)

    self.n_components = n_components
    self.n_frames_output = n_frames_output
    self.image_latent_size = image_latent_size
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.independent_components = independent_components
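# Hedged sketch (assumption, not from the source): mu/sigma head pairs like
# beta_mu_layer/beta_sigma_layer usually feed a reparameterized Gaussian
# sample; treating the sigma head as a log-variance is itself an assumption.
import torch
import torch.nn as nn

hidden_size, output_size, batch = 32, 6, 4
beta_mu_layer = nn.Linear(hidden_size, output_size)
beta_sigma_layer = nn.Linear(hidden_size, output_size)
h = torch.randn(batch, hidden_size)            # e.g. a predict_rnn output
mu = beta_mu_layer(h)
logvar = beta_sigma_layer(h)                   # assumed log-variance
beta = mu + torch.exp(0.5 * logvar) * torch.randn_like(mu)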
def __init__(self, n_frames_input, n_frames_output, n_channels, image_size,
             feat_latent_size, time_enc_size, t_enc_rnn_hidden_size,
             trans_rnn_hidden_size, manifold_size, ngf):
    super(Encoder, self).__init__()
    n_layers = int(np.log2(image_size)) - 1
    self.image_encoding_flag = False  # Whether we work with videos.
    self.full_time_enc_flag = False  # Whether we use all hidden vectors for the time encoding.

    if self.image_encoding_flag:
        # Option 1: DDPAE image encoder.
        self.image_encoder = ImageEncoder(n_channels, feat_latent_size, ngf, n_layers)
        # Option 2: ResNet-18 as feature extractor.
        # pretrained_resnet = models.resnet18(pretrained=True)
        # self.image_encoder = nn.Sequential(*list(pretrained_resnet.children())[:-1])
        # Option 3: Encoder suited to MNIST.
        # self.image_encoder = ImageEncoder([1, image_size, image_size], feat_latent_size)
        # Option 4: PointNet++.
    else:
        # Option 5: Toy feature extraction.
        self.feat_mu = nn.Linear(feat_latent_size, feat_latent_size)
        self.feat_sigma = nn.Linear(feat_latent_size, feat_latent_size)

    # Time encoding
    self.time_enc_rnn = nn.GRU(feat_latent_size, t_enc_rnn_hidden_size,
                               num_layers=1, batch_first=True, bidirectional=True)
    if self.full_time_enc_flag:
        self.time_enc_mu = nn.Linear(t_enc_rnn_hidden_size * 2 * n_frames_input,
                                     time_enc_size)
        self.time_enc_sigma = nn.Linear(t_enc_rnn_hidden_size * 2 * n_frames_input,
                                        time_enc_size)
    else:
        self.time_enc_mu = nn.Linear(t_enc_rnn_hidden_size * 2, time_enc_size)
        self.time_enc_sigma = nn.Linear(t_enc_rnn_hidden_size * 2, time_enc_size)

    # Initial conditions
    # Option 1: RNN + fc
    # self.y0_rnn = nn.LSTM(feat_latent_size + time_enc_size, trans_rnn_hidden_size,
    #                       num_layers=1, batch_first=True)
    # self.y0_mu = nn.Linear(trans_rnn_hidden_size, manifold_size)
    # self.y0_sigma = nn.Linear(trans_rnn_hidden_size, manifold_size)
    # Option 2: fc
    self.y0_mu = nn.Linear(feat_latent_size + time_enc_size, manifold_size)
    self.y0_sigma = nn.Linear(feat_latent_size + time_enc_size, manifold_size)

    self.input_size = image_size
    self.n_frames_input = n_frames_input
    self.n_frames_output = n_frames_output
    self.feat_latent_size = feat_latent_size
    self.time_enc_size = time_enc_size
    self.t_enc_rnn_hidden_size = t_enc_rnn_hidden_size
    self.trans_rnn_hidden_size = trans_rnn_hidden_size
    self.manifold_size = manifold_size
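# Hedged sketch of the initial-condition heads above ("Option 2: fc"): a
# frame feature concatenated with the time encoding parameterizes a Gaussian
# over y0. Sizes are illustrative, and the concatenation is an assumption
# about what forward() feeds these layers.
import torch
import torch.nn as nn

batch, feat, tenc, manifold = 4, 16, 8, 3
y0_mu = nn.Linear(feat + tenc, manifold)
y0_sigma = nn.Linear(feat + tenc, manifold)
f0 = torch.randn(batch, feat)                  # first-frame feature (assumed)
T = torch.randn(batch, tenc)                   # time encoding
inp = torch.cat([f0, T], dim=1)
mu, sigma = y0_mu(inp), y0_sigma(inp)          # Gaussian parameters for y0
assert mu.shape == sigma.shape == (batch, manifold)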