def __init__(self, vp_value_count, output_shape, name='Full Network'):
    """
    Validates the arguments, then builds every sub-network of the Full Network.

    :param vp_value_count: (int) The number of values that identify the viewpoint.
                           Must be a member of self.VALID_VP_VALUE_COUNTS.
    :param output_shape: (5-tuple) The desired output shape for generated videos. Must match video input shape.
                         Index 2 (the frame count) must be a member of self.VALID_FRAME_COUNTS.
                         Documented legal values: (bsz, 3, 8, 112, 112) and (bsz, 3, 16, 112, 112)
    :param name: (str, optional) The name of the network (default 'Full Network').
    Raises:
        ValueError: if 'vp_value_count' is not a legal value count
        ValueError: if 'output_shape' does not contain a legal number of frames.
    """
    # Argument checks run before nn.Module is initialised, vp count first.
    if vp_value_count not in self.VALID_VP_VALUE_COUNTS:
        raise ValueError('Invalid number of vp values: %d' % vp_value_count)

    frame_count = output_shape[2]
    if frame_count not in self.VALID_FRAME_COUNTS:
        raise ValueError('Invalid number of frames in desired output: %d' % frame_count)

    super(FullNetwork, self).__init__()

    # Plain configuration values.
    self.net_name = name
    self.vp_value_count = vp_value_count
    self.output_shape = output_shape
    self.out_frames = frame_count

    # Sizes shared by the expander, transformer and generator below.
    rep_channels, rep_frames, rep_size = 256, 4, 14
    self.rep_channels = rep_channels
    self.rep_frames = rep_frames
    self.rep_size = rep_size

    # Sub-networks, created in a fixed order so parameter registration (and
    # any random initialisation) happens in the same sequence every time.
    # NOTE(review): vgg_weights_path / i3d_weights_path are not parameters of
    # this constructor; presumably module-level names - confirm.
    self.vgg = vgg16(pretrained=True, weights_path=vgg_weights_path)
    self.i3d = InceptionI3d(final_endpoint='Mixed_5c',
                            in_frames=frame_count,
                            pretrained=True,
                            weights_path=i3d_weights_path)
    self.exp = Expander(vp_value_count=vp_value_count,
                        out_frames=rep_frames,
                        out_size=rep_size)
    # The transformer's input is widened by vp_value_count channels.
    self.trans = Transformer(in_channels=rep_channels + vp_value_count,
                             out_channels=rep_channels)
    self.gen = Generator(in_channels=[rep_channels] * 2, out_frames=frame_count)
def __init__(self, vp_value_count, stdev, output_shape, pretrained=False, vgg_weights_path='',
             i3d_weights_path='', name='Full Network'):
    """
    Validates the arguments, then builds every sub-network of the Full Network.

    :param vp_value_count: (int) The number of values that identify the viewpoint.
                           Must be a member of self.VALID_VP_VALUE_COUNTS.
    :param stdev: Value stored on the instance and forwarded to KPPredictor as its 'stdev' argument.
    :param output_shape: (5-tuple) The desired output shape for generated videos. Must match video input shape.
                         Index 2 (the frame count) must be a member of self.VALID_FRAME_COUNTS.
                         Documented legal values: (bsz, 3, 8, 112, 112) and (bsz, 3, 16, 112, 112)
    :param pretrained: (bool, optional) Forwarded to both vgg16 and InceptionI3d (default False).
    :param vgg_weights_path: (str, optional) Forwarded to vgg16 as 'weights_path' (default '').
    :param i3d_weights_path: (str, optional) Forwarded to InceptionI3d as 'weights_path' (default '').
    :param name: (str, optional) The name of the network (default 'Full Network').
    Raises:
        ValueError: if 'vp_value_count' is not a legal value count
        ValueError: if 'output_shape' does not contain a legal number of frames.
    """
    # Argument checks run before nn.Module is initialised, vp count first.
    if vp_value_count not in self.VALID_VP_VALUE_COUNTS:
        raise ValueError('Invalid number of vp values: %d' % vp_value_count)

    frame_count = output_shape[2]
    if frame_count not in self.VALID_FRAME_COUNTS:
        raise ValueError('Invalid number of frames in desired output: %d' % frame_count)

    super(FullNetwork, self).__init__()

    # Plain configuration values.
    self.net_name = name
    self.vp_value_count = vp_value_count
    self.stdev = stdev
    self.output_shape = output_shape
    self.out_frames = frame_count

    # specs of various features
    app_feat = rep_feat = 128
    keypoint_count = 32
    self.app_feat = app_feat
    self.rep_feat = rep_feat
    self.rep_frames = 4
    self.rep_size = 14
    self.nkp = keypoint_count

    self.vgg = vgg16(pretrained=pretrained, weights_path=vgg_weights_path)
    self.i3d = InceptionI3d(final_endpoint='Mixed_5c',
                            in_frames=frame_count,
                            pretrained=pretrained,
                            weights_path=i3d_weights_path)
    self.exp = Expander(vp_value_count=vp_value_count)

    # 2D convs bringing every appearance encoding to app_feat channels, so
    # they can all be used in the same convGRU.
    conv2d_kwargs = dict(out_channels=app_feat, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    self.app_conv128 = nn.Conv2d(in_channels=128, **conv2d_kwargs)
    self.app_conv256a = nn.Conv2d(in_channels=256, **conv2d_kwargs)
    self.app_conv256b = nn.Conv2d(in_channels=256, **conv2d_kwargs)
    # Plain list of conv+ReLU pairs; the convs themselves are registered
    # through the attributes assigned just above.
    self.app_convs = [
        nn.Sequential(conv, nn.ReLU(inplace=True))
        for conv in (self.app_conv128, self.app_conv256a, self.app_conv256b)
    ]

    # 3D convs bringing every motion feature to rep_feat channels, so they
    # can all be used in the same trans net.
    conv3d_kwargs = dict(out_channels=rep_feat, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
    self.rep_conv64 = nn.Conv3d(in_channels=64, **conv3d_kwargs)
    self.rep_conv192 = nn.Conv3d(in_channels=192, **conv3d_kwargs)
    self.rep_conv256 = nn.Conv3d(in_channels=256, **conv3d_kwargs)
    self.rep_convs = [
        nn.Sequential(conv, nn.ReLU(inplace=True))
        for conv in (self.rep_conv64, self.rep_conv192, self.rep_conv256)
    ]

    # The transformer's input is widened by vp_value_count channels.
    self.trans = Transformer(in_channels=rep_feat + vp_value_count, out_channels=rep_feat)
    self.kpp = KPPredictor(in_channels=rep_feat, nkp=keypoint_count, stdev=stdev)
    self.vpp = VPPredictor(in_channels=256)
    self.gru = ConvGRU(input_dim=rep_feat,
                       hidden_dim=[app_feat],
                       kernel_size=(7, 7),
                       num_layers=1,
                       batch_first=True,
                       bias=False,
                       return_all_layers=False)
    self.gen = Generator(in_channels=[app_feat, keypoint_count], out_frames=frame_count)
def __init__(self, vp_value_count, output_shape, name='Full Network'):
    """
    Validates the arguments, then builds every sub-network of the Full Network.

    :param vp_value_count: (int) The number of values that identify the viewpoint.
                           Must be a member of self.VALID_VP_VALUE_COUNTS.
    :param output_shape: (5-tuple) The desired output shape for generated videos. Must match video input shape.
                         Index 2 (the frame count) must be a member of self.VALID_FRAME_COUNTS.
                         Documented legal values: (bsz, 3, 8, 112, 112) and (bsz, 3, 16, 112, 112)
    :param name: (str, optional) The name of the network (default 'Full Network').
    Raises:
        ValueError: if 'vp_value_count' is not a legal value count
        ValueError: if 'output_shape' does not contain a legal number of frames.
    """
    # Argument checks run before nn.Module is initialised, vp count first.
    if vp_value_count not in self.VALID_VP_VALUE_COUNTS:
        raise ValueError('Invalid number of vp values: %d' % vp_value_count)

    frame_count = output_shape[2]
    if frame_count not in self.VALID_FRAME_COUNTS:
        raise ValueError('Invalid number of frames in desired output: %d' % frame_count)

    super(FullNetwork, self).__init__()

    # Plain configuration values.
    self.net_name = name
    self.vp_value_count = vp_value_count
    self.output_shape = output_shape
    self.out_frames = frame_count

    self.rep_channels = 256
    self.rep_frames = 4
    self.rep_size = 14

    # NOTE(review): vgg_weights_path / i3d_weights_path are not parameters of
    # this constructor; presumably module-level names - confirm.
    self.vgg = vgg16(pretrained=True, weights_path=vgg_weights_path)
    self.i3d = InceptionI3d(final_endpoint='Mixed_5c',
                            in_frames=frame_count,
                            pretrained=True,
                            weights_path=i3d_weights_path)
    self.exp = Expander(vp_value_count=vp_value_count)

    # Kernel/stride/padding shared by every 2D conv in this constructor.
    conv2d_geometry = dict(kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))

    # 2D convs giving every appearance encoding 256 channels, so they can be
    # used in the same convLSTM.
    self.app_conv128 = nn.Conv2d(in_channels=128, out_channels=256, **conv2d_geometry)
    self.app_conv256a = nn.Conv2d(in_channels=256, out_channels=256, **conv2d_geometry)
    # NOTE(review): despite its name this conv takes 512 input channels.
    self.app_conv256b = nn.Conv2d(in_channels=512, out_channels=256, **conv2d_geometry)
    self.app_convs = [self.app_conv128, self.app_conv256a, self.app_conv256b]

    # convs for the initial hidden and current states of the convLSTM
    self.hconv = nn.Conv2d(in_channels=256, out_channels=128, **conv2d_geometry)
    self.cconv = nn.Conv2d(in_channels=256, out_channels=128, **conv2d_geometry)

    # 3D convs giving every motion feature 256 channels, so they can be used
    # in the same Trans Net. The lookup dict is keyed by input channel count.
    conv3d_kwargs = dict(out_channels=256, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
    self.rep_conv64 = nn.Conv3d(in_channels=64, **conv3d_kwargs)
    self.rep_conv192 = nn.Conv3d(in_channels=192, **conv3d_kwargs)
    self.rep_conv256 = nn.Conv3d(in_channels=256, **conv3d_kwargs)
    self.rep_convs = dict([
        (64, self.rep_conv64),
        (192, self.rep_conv192),
        (256, self.rep_conv256),
    ])

    # The transformer's input is widened by vp_value_count channels.
    self.trans = Transformer(in_channels=256 + vp_value_count, out_channels=128)
    self.conv_lstm = ConvLSTM(input_dim=128,
                              hidden_dim=[128],
                              kernel_size=(3, 3),
                              num_layers=1,
                              batch_first=True,
                              bias=False,
                              return_all_layers=False)
    self.gen = Generator(in_channels=[128], out_frames=frame_count)
def __init__(self, vp_value_count, output_shape, name='Full Network'):
    """
    Validates the arguments, then builds every sub-network of the Full Network.

    :param vp_value_count: (int) The number of values that identify the viewpoint.
                           Must be a member of self.VALID_VP_VALUE_COUNTS.
    :param output_shape: (5-tuple) The desired output shape for generated videos. Must match video input shape.
                         Index 2 (the frame count) must be a member of self.VALID_FRAME_COUNTS.
                         Documented legal values: (bsz, 3, 8, 112, 112) and (bsz, 3, 16, 112, 112)
    :param name: (str, optional) The name of the network (default 'Full Network').
    Raises:
        ValueError: if 'vp_value_count' is not a legal value count
        ValueError: if 'output_shape' does not contain a legal number of frames.
    """
    # Argument checks run before nn.Module is initialised, vp count first.
    if vp_value_count not in self.VALID_VP_VALUE_COUNTS:
        raise ValueError('Invalid number of vp values: %d' % vp_value_count)
    if output_shape[2] not in self.VALID_FRAME_COUNTS:
        raise ValueError('Invalid number of frames in desired output: %d' % output_shape[2])

    super(FullNetwork, self).__init__()

    # params
    self.net_name = name
    self.vp_value_count = vp_value_count
    self.output_shape = output_shape
    self.out_frames = output_shape[2]
    self.rep_feat = 128
    self.app_feat = 256

    # networks
    # NOTE(review): vgg_weights_path / i3d_weights_path are not parameters of
    # this constructor; presumably module-level names - confirm.
    self.vgg = vgg16(pretrained=True, weights_path=vgg_weights_path)
    self.i3d = InceptionI3d(final_endpoint='Mixed_5c', in_frames=self.out_frames,
                            pretrained=True, weights_path=i3d_weights_path)
    self.exp = Expander(vp_value_count=self.vp_value_count)
    # The transformer's input is widened by vp_value_count channels.
    self.trans = Transformer(in_channels=self.rep_feat + self.vp_value_count, out_channels=self.rep_feat)
    self.gen = Generator(in_channels=[self.app_feat, self.rep_feat], out_frames=self.out_frames)

    # One ConvLSTM per in_shape side length (56, 28, 14).
    #
    # Each ConvLSTM is bound to its own attribute BEFORE going into the lookup
    # dict. nn.Module only registers sub-modules that are assigned directly as
    # attributes; a module that lives solely inside a plain dict is skipped by
    # parameters(), state_dict(), .to()/.cuda() and train()/eval(), so it
    # would never be optimised, saved or moved to the right device.
    # NOTE(review): this adds conv_lstm56/28/14 entries to state_dict();
    # checkpoints written before this change do not contain them, so load
    # those with strict=False.
    self.conv_lstm56 = ConvLSTM(input_dim=self.rep_feat, hidden_dim=[self.app_feat], kernel_size=(3, 3),
                                num_layers=1, in_shape=(56, 56), batch_first=True, bias=False,
                                return_all_layers=False)
    self.conv_lstm28 = ConvLSTM(input_dim=self.rep_feat, hidden_dim=[self.app_feat], kernel_size=(3, 3),
                                num_layers=1, in_shape=(28, 28), batch_first=True, bias=False,
                                return_all_layers=False)
    self.conv_lstm14 = ConvLSTM(input_dim=self.rep_feat, hidden_dim=[self.app_feat], kernel_size=(3, 3),
                                num_layers=1, in_shape=(14, 14), batch_first=True, bias=False,
                                return_all_layers=False)
    # Integer-keyed lookup kept so existing self.conv_lstms[56] style access
    # keeps working.
    self.conv_lstms = {
        56: self.conv_lstm56,
        28: self.conv_lstm28,
        14: self.conv_lstm14
    }

    # convs
    # 2D convs bringing appearance encodings to app_feat channels; the lookup
    # dict is keyed by input channel count.
    self.app_conv128 = nn.Conv2d(in_channels=128, out_channels=self.app_feat,
                                 kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    self.app_conv256 = nn.Conv2d(in_channels=256, out_channels=self.app_feat,
                                 kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    self.app_conv512 = nn.Conv2d(in_channels=512, out_channels=self.app_feat,
                                 kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    self.app_convs = {
        128: self.app_conv128,
        256: self.app_conv256,
        512: self.app_conv512
    }

    # NOTE(review): presumably the convs for the ConvLSTM initial hidden and
    # cell states - confirm against forward().
    self.hconv = nn.Conv2d(in_channels=self.app_feat, out_channels=128,
                           kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    self.cconv = nn.Conv2d(in_channels=self.app_feat, out_channels=128,
                           kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))

    # 3D convs bringing motion features to rep_feat channels; the lookup dict
    # is keyed by input channel count.
    self.rep_conv64 = nn.Conv3d(in_channels=64, out_channels=self.rep_feat,
                                kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
    self.rep_conv192 = nn.Conv3d(in_channels=192, out_channels=self.rep_feat,
                                 kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
    self.rep_conv256 = nn.Conv3d(in_channels=256, out_channels=self.rep_feat,
                                 kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
    self.rep_convs = {
        64: self.rep_conv64,
        192: self.rep_conv192,
        256: self.rep_conv256
    }