def __init__(self, vp_value_count, output_shape, vgg_weights_path='', i3d_weights_path='', name='Full Network'):
    """
    Initializes the Full Network.
    :param vp_value_count: (int) The number of values that identify the viewpoint.
    :param output_shape: (5-tuple) The desired output shape for generated videos. Must match video input shape.
                         Legal values: (bsz, 3, 8, 112, 112) and (bsz, 3, 16, 112, 112)
    :param name: (str, optional) The name of the network (default 'Full Network').
    Raises:
        ValueError: if 'vp_value_count' is not a legal value count
        ValueError: if 'output_shape' does not contain a legal number of frames.
    """
    if vp_value_count not in self.VALID_VP_VALUE_COUNTS:
        raise ValueError('Invalid number of vp values: %d' % vp_value_count)
    if output_shape[2] not in self.VALID_FRAME_COUNTS:
        raise ValueError('Invalid number of frames in desired output: %d' % output_shape[2])

    super(FullNetwork, self).__init__()

    self.net_name = name
    self.vp_value_count = vp_value_count
    self.output_shape = output_shape
    self.out_frames = output_shape[2]
    self.rep_channels = 256
    self.rep_frames = 4
    self.rep_size = 14

    self.vgg = vgg16(pretrained=True, weights_path=vgg_weights_path)
    self.i3d = InceptionI3d(final_endpoint='Mixed_5c', in_frames=self.out_frames,
                            pretrained=True, weights_path=i3d_weights_path)
    self.exp = Expander(vp_value_count=self.vp_value_count, out_frames=self.rep_frames, out_size=self.rep_size)
    self.trans = Transformer(in_channels=self.rep_channels + self.vp_value_count, out_channels=self.rep_channels)
    self.gen = Generator(in_channels=[self.rep_channels, self.rep_channels], out_frames=self.out_frames)
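# Usage sketch (illustrative, not from the source). It assumes the class-level
# VALID_VP_VALUE_COUNTS / VALID_FRAME_COUNTS admit the values below, per the docstring,
# and the weight paths are hypothetical placeholders:
#
#     net = FullNetwork(vp_value_count=1, output_shape=(2, 3, 16, 112, 112),
#                       vgg_weights_path='weights/vgg16.pt',      # hypothetical path
#                       i3d_weights_path='weights/i3d_rgb.pt')    # hypothetical path
#     FullNetwork(vp_value_count=1, output_shape=(2, 3, 7, 112, 112))  # raises ValueError (7 frames)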
def __init__(self, num_classes=92, num_queries=100, backbone=None, pos_encoder=None,
             transformer=None, num_encoder_layers=6, num_decoder_layers=6,
             return_intermediate_dec=True, **kwargs):
    super().__init__(**kwargs)
    self.num_queries = num_queries

    # fall back to the default ResNet-50 backbone when none is supplied, mirroring
    # the `transformer or ...` / `pos_encoder or ...` pattern below (the original
    # accepted `backbone` but ignored it)
    self.backbone = backbone or ResNet50Backbone(name='backbone')
    self.transformer = transformer or Transformer(
        num_encoder_layers=num_encoder_layers,
        num_decoder_layers=num_decoder_layers,
        return_intermediate_dec=return_intermediate_dec,
        name='transformer'
    )
    self.model_dim = self.transformer.model_dim

    self.pos_encoder = pos_encoder or PositionEmbeddingSine(
        num_pos_features=self.model_dim // 2, normalize=True)

    self.input_proj = tf.keras.layers.Conv2D(self.model_dim, kernel_size=1, name='input_proj')

    self.query_embed = FixedEmbedding((num_queries, self.model_dim), name='query_embed')

    self.class_embed = Linear(num_classes, name='class_embed')

    self.bbox_embed_linear1 = Linear(self.model_dim, name='bbox_embed_0')
    self.bbox_embed_linear2 = Linear(self.model_dim, name='bbox_embed_1')
    self.bbox_embed_linear3 = Linear(4, name='bbox_embed_2')
    self.activation = tf.keras.layers.ReLU()
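# Usage sketch (illustrative; the class name `DETRModel` is hypothetical, since this
# excerpt only shows __init__ — substitute the real tf.keras.Model subclass):
#
#     model = DETRModel(num_classes=92, num_queries=100)
#     images = tf.random.uniform((1, 512, 512, 3))   # NHWC batch; shape is assumed
#     outputs = model(images)                        # forward pass via the model's call()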
def __init__(self, vp_value_count, stdev, output_shape, pretrained=False,
             vgg_weights_path='', i3d_weights_path='', name='Full Network'):
    """
    Initializes the Full Network.
    :param vp_value_count: (int) The number of values that identify the viewpoint.
    :param stdev: (float) The standard deviation used by the keypoint predictor.
    :param output_shape: (5-tuple) The desired output shape for generated videos. Must match video input shape.
                         Legal values: (bsz, 3, 8, 112, 112) and (bsz, 3, 16, 112, 112)
    :param pretrained: (bool, optional) Whether to load pretrained VGG-16 and I3D weights (default False).
    :param vgg_weights_path: (str, optional) Path to the pretrained VGG-16 weights (default '').
    :param i3d_weights_path: (str, optional) Path to the pretrained I3D weights (default '').
    :param name: (str, optional) The name of the network (default 'Full Network').
    Raises:
        ValueError: if 'vp_value_count' is not a legal value count
        ValueError: if 'output_shape' does not contain a legal number of frames.
    """
    if vp_value_count not in self.VALID_VP_VALUE_COUNTS:
        raise ValueError('Invalid number of vp values: %d' % vp_value_count)
    if output_shape[2] not in self.VALID_FRAME_COUNTS:
        raise ValueError('Invalid number of frames in desired output: %d' % output_shape[2])

    super(FullNetwork, self).__init__()

    self.net_name = name
    self.vp_value_count = vp_value_count
    self.stdev = stdev
    self.output_shape = output_shape
    self.out_frames = output_shape[2]

    # specs of various features
    self.app_feat = 128
    self.rep_feat = 128
    self.rep_frames = 4
    self.rep_size = 14
    self.nkp = 32

    self.vgg = vgg16(pretrained=pretrained, weights_path=vgg_weights_path)
    self.i3d = InceptionI3d(final_endpoint='Mixed_5c', in_frames=self.out_frames,
                            pretrained=pretrained, weights_path=i3d_weights_path)
    self.exp = Expander(vp_value_count=self.vp_value_count)

    # convs to make all appearance encodings have the same number of channels,
    # so they can be used in the same convGRU
    self.app_conv128 = nn.Conv2d(in_channels=128, out_channels=self.app_feat, kernel_size=(3, 3),
                                 stride=(1, 1), padding=(1, 1))
    self.app_conv256a = nn.Conv2d(in_channels=256, out_channels=self.app_feat, kernel_size=(3, 3),
                                  stride=(1, 1), padding=(1, 1))
    self.app_conv256b = nn.Conv2d(in_channels=256, out_channels=self.app_feat, kernel_size=(3, 3),
                                  stride=(1, 1), padding=(1, 1))
    # a plain list works here: each conv is already registered as a submodule
    # through the attribute assignments above
    self.app_convs = [
        nn.Sequential(self.app_conv128, nn.ReLU(inplace=True)),
        nn.Sequential(self.app_conv256a, nn.ReLU(inplace=True)),
        nn.Sequential(self.app_conv256b, nn.ReLU(inplace=True))
    ]

    # convs to make all motion features have the same number of channels,
    # so they can be used in the same trans net
    self.rep_conv64 = nn.Conv3d(in_channels=64, out_channels=self.rep_feat, kernel_size=(3, 3, 3),
                                stride=(1, 1, 1), padding=(1, 1, 1))
    self.rep_conv192 = nn.Conv3d(in_channels=192, out_channels=self.rep_feat, kernel_size=(3, 3, 3),
                                 stride=(1, 1, 1), padding=(1, 1, 1))
    self.rep_conv256 = nn.Conv3d(in_channels=256, out_channels=self.rep_feat, kernel_size=(3, 3, 3),
                                 stride=(1, 1, 1), padding=(1, 1, 1))
    self.rep_convs = [
        nn.Sequential(self.rep_conv64, nn.ReLU(inplace=True)),
        nn.Sequential(self.rep_conv192, nn.ReLU(inplace=True)),
        nn.Sequential(self.rep_conv256, nn.ReLU(inplace=True))
    ]

    self.trans = Transformer(in_channels=self.rep_feat + self.vp_value_count, out_channels=self.rep_feat)
    self.kpp = KPPredictor(in_channels=self.rep_feat, nkp=self.nkp, stdev=self.stdev)
    self.vpp = VPPredictor(in_channels=256)
    self.gru = ConvGRU(input_dim=self.rep_feat, hidden_dim=[self.app_feat], kernel_size=(7, 7),
                       num_layers=1, batch_first=True, bias=False, return_all_layers=False)
    self.gen = Generator(in_channels=[self.app_feat, self.nkp], out_frames=self.out_frames)
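# Usage sketch (illustrative, not from the source); the weight paths and the
# stdev value of 0.1 are assumptions, not values taken from the repository:
#
#     net = FullNetwork(vp_value_count=1, stdev=0.1, output_shape=(2, 3, 16, 112, 112),
#                       pretrained=True,
#                       vgg_weights_path='weights/vgg16.pt',     # hypothetical path
#                       i3d_weights_path='weights/i3d_rgb.pt')   # hypothetical path
#     # stdev is forwarded to the keypoint predictor (KPPredictor)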
def __init__(self, vp_value_count, output_shape, vgg_weights_path='', i3d_weights_path='', name='Full Network'):
    """
    Initializes the Full Network.
    :param vp_value_count: (int) The number of values that identify the viewpoint.
    :param output_shape: (5-tuple) The desired output shape for generated videos. Must match video input shape.
                         Legal values: (bsz, 3, 8, 112, 112) and (bsz, 3, 16, 112, 112)
    :param name: (str, optional) The name of the network (default 'Full Network').
    Raises:
        ValueError: if 'vp_value_count' is not a legal value count
        ValueError: if 'output_shape' does not contain a legal number of frames.
    """
    if vp_value_count not in self.VALID_VP_VALUE_COUNTS:
        raise ValueError('Invalid number of vp values: %d' % vp_value_count)
    if output_shape[2] not in self.VALID_FRAME_COUNTS:
        raise ValueError('Invalid number of frames in desired output: %d' % output_shape[2])

    super(FullNetwork, self).__init__()

    self.net_name = name
    self.vp_value_count = vp_value_count
    self.output_shape = output_shape
    self.out_frames = output_shape[2]
    self.rep_channels = 256
    self.rep_frames = 4
    self.rep_size = 14

    self.vgg = vgg16(pretrained=True, weights_path=vgg_weights_path)
    self.i3d = InceptionI3d(final_endpoint='Mixed_5c', in_frames=self.out_frames,
                            pretrained=True, weights_path=i3d_weights_path)
    self.exp = Expander(vp_value_count=self.vp_value_count)

    # convs to make all appearance encodings have the same number of channels,
    # so they can be used in the same convLSTM
    self.app_conv128 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=(3, 3),
                                 stride=(1, 1), padding=(1, 1))
    self.app_conv256a = nn.Conv2d(in_channels=256, out_channels=256, kernel_size=(3, 3),
                                  stride=(1, 1), padding=(1, 1))
    # note: despite the name, this branch consumes the 512-channel appearance encoding
    self.app_conv256b = nn.Conv2d(in_channels=512, out_channels=256, kernel_size=(3, 3),
                                  stride=(1, 1), padding=(1, 1))
    self.app_convs = [
        self.app_conv128,
        self.app_conv256a,
        self.app_conv256b
    ]

    # convs for the initial hidden and current states of the convLSTM
    self.hconv = nn.Conv2d(in_channels=256, out_channels=128, kernel_size=(3, 3),
                           stride=(1, 1), padding=(1, 1))
    self.cconv = nn.Conv2d(in_channels=256, out_channels=128, kernel_size=(3, 3),
                           stride=(1, 1), padding=(1, 1))

    # convs to make all motion features have the same number of channels,
    # so they can be used in the same Trans Net
    self.rep_conv64 = nn.Conv3d(in_channels=64, out_channels=256, kernel_size=(3, 3, 3),
                                stride=(1, 1, 1), padding=(1, 1, 1))
    self.rep_conv192 = nn.Conv3d(in_channels=192, out_channels=256, kernel_size=(3, 3, 3),
                                 stride=(1, 1, 1), padding=(1, 1, 1))
    self.rep_conv256 = nn.Conv3d(in_channels=256, out_channels=256, kernel_size=(3, 3, 3),
                                 stride=(1, 1, 1), padding=(1, 1, 1))
    self.rep_convs = {
        64: self.rep_conv64,
        192: self.rep_conv192,
        256: self.rep_conv256
    }

    self.trans = Transformer(in_channels=256 + self.vp_value_count, out_channels=128)
    self.conv_lstm = ConvLSTM(input_dim=128, hidden_dim=[128], kernel_size=(3, 3), num_layers=1,
                              batch_first=True, bias=False, return_all_layers=False)
    self.gen = Generator(in_channels=[128], out_frames=self.out_frames)
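# Usage sketch (illustrative, not from the source). The channel-keyed dict lets the
# forward pass pick the 3D conv matching each intermediate I3D feature:
#
#     net = FullNetwork(vp_value_count=1, output_shape=(2, 3, 8, 112, 112))
#     conv = net.rep_convs[192]   # 3D conv for a 192-channel motion feature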
def __init__(self, vp_value_count, output_shape, vgg_weights_path='', i3d_weights_path='', name='Full Network'):
    """
    Initializes the Full Network.
    :param vp_value_count: (int) The number of values that identify the viewpoint.
    :param output_shape: (5-tuple) The desired output shape for generated videos. Must match video input shape.
                         Legal values: (bsz, 3, 8, 112, 112) and (bsz, 3, 16, 112, 112)
    :param name: (str, optional) The name of the network (default 'Full Network').
    Raises:
        ValueError: if 'vp_value_count' is not a legal value count
        ValueError: if 'output_shape' does not contain a legal number of frames.
    """
    if vp_value_count not in self.VALID_VP_VALUE_COUNTS:
        raise ValueError('Invalid number of vp values: %d' % vp_value_count)
    if output_shape[2] not in self.VALID_FRAME_COUNTS:
        raise ValueError('Invalid number of frames in desired output: %d' % output_shape[2])

    super(FullNetwork, self).__init__()

    # params
    self.net_name = name
    self.vp_value_count = vp_value_count
    self.output_shape = output_shape
    self.out_frames = output_shape[2]
    self.rep_feat = 128
    self.app_feat = 256

    # networks
    self.vgg = vgg16(pretrained=True, weights_path=vgg_weights_path)
    self.i3d = InceptionI3d(final_endpoint='Mixed_5c', in_frames=self.out_frames,
                            pretrained=True, weights_path=i3d_weights_path)
    self.exp = Expander(vp_value_count=self.vp_value_count)
    self.trans = Transformer(in_channels=self.rep_feat + self.vp_value_count, out_channels=self.rep_feat)
    self.gen = Generator(in_channels=[self.app_feat, self.rep_feat], out_frames=self.out_frames)
    # nn.ModuleDict (keyed by the feature map size as a string) registers the ConvLSTMs
    # as submodules; a plain dict would hide their parameters from the optimizer and .to()
    self.conv_lstms = nn.ModuleDict({
        '56': ConvLSTM(input_dim=self.rep_feat, hidden_dim=[self.app_feat], kernel_size=(3, 3),
                       num_layers=1, in_shape=(56, 56), batch_first=True, bias=False,
                       return_all_layers=False),
        '28': ConvLSTM(input_dim=self.rep_feat, hidden_dim=[self.app_feat], kernel_size=(3, 3),
                       num_layers=1, in_shape=(28, 28), batch_first=True, bias=False,
                       return_all_layers=False),
        '14': ConvLSTM(input_dim=self.rep_feat, hidden_dim=[self.app_feat], kernel_size=(3, 3),
                       num_layers=1, in_shape=(14, 14), batch_first=True, bias=False,
                       return_all_layers=False)
    })

    # convs
    self.app_conv128 = nn.Conv2d(in_channels=128, out_channels=self.app_feat, kernel_size=(3, 3),
                                 stride=(1, 1), padding=(1, 1))
    self.app_conv256 = nn.Conv2d(in_channels=256, out_channels=self.app_feat, kernel_size=(3, 3),
                                 stride=(1, 1), padding=(1, 1))
    self.app_conv512 = nn.Conv2d(in_channels=512, out_channels=self.app_feat, kernel_size=(3, 3),
                                 stride=(1, 1), padding=(1, 1))
    # plain dicts are fine for these lookups: each conv is already registered as a
    # submodule through the attribute assignments above
    self.app_convs = {
        128: self.app_conv128,
        256: self.app_conv256,
        512: self.app_conv512
    }

    self.hconv = nn.Conv2d(in_channels=self.app_feat, out_channels=128, kernel_size=(3, 3),
                           stride=(1, 1), padding=(1, 1))
    self.cconv = nn.Conv2d(in_channels=self.app_feat, out_channels=128, kernel_size=(3, 3),
                           stride=(1, 1), padding=(1, 1))

    self.rep_conv64 = nn.Conv3d(in_channels=64, out_channels=self.rep_feat, kernel_size=(3, 3, 3),
                                stride=(1, 1, 1), padding=(1, 1, 1))
    self.rep_conv192 = nn.Conv3d(in_channels=192, out_channels=self.rep_feat, kernel_size=(3, 3, 3),
                                 stride=(1, 1, 1), padding=(1, 1, 1))
    self.rep_conv256 = nn.Conv3d(in_channels=256, out_channels=self.rep_feat, kernel_size=(3, 3, 3),
                                 stride=(1, 1, 1), padding=(1, 1, 1))
    self.rep_convs = {
        64: self.rep_conv64,
        192: self.rep_conv192,
        256: self.rep_conv256
    }
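# Usage sketch (illustrative, not from the source). With the nn.ModuleDict above,
# ConvLSTMs are looked up by the feature map's spatial size as a string key:
#
#     net = FullNetwork(vp_value_count=1, output_shape=(2, 3, 16, 112, 112))
#     lstm = net.conv_lstms[str(28)]   # ConvLSTM sized for 28x28 feature maps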