def create_acvideo_discriminator(clips, actions, ndf=64, norm_layer='instance', use_noise=False, noise_sigma=None):
    """Build an action-conditioned 3-D convolutional video discriminator.

    Consecutive frames are paired along the channel axis, the actions are
    broadcast-concatenated onto every spatial location, and the result is run
    through four strided 3-D conv layers (the last one produces the logits).

    Args:
        clips: time-major clip tensor; values are rescaled from [0, 1] to
            [-1, 1] before use. (Assumes 5-D time-major layout — TODO confirm
            against callers.)
        actions: per-step action vectors, tiled over the spatial dims via
            `tile_concat`.
        ndf: base number of discriminator filters; doubled at each layer.
        norm_layer: name of the normalization layer resolved by
            `ops.get_norm_layer`; applied to every conv layer except the first.
        use_noise: if true, `noise` injects Gaussian noise before each of the
            first three convs.
        noise_sigma: stddev forwarded to `noise`.

    Returns:
        The list of per-layer activations (last entry is the logits), each
        transposed back to time-major.
    """
    norm_layer = ops.get_norm_layer(norm_layer)
    # Pad only the two spatial dims; time and channel dims stay untouched.
    paddings = [[0, 0], [0, 0], [1, 1], [1, 1], [0, 0]]

    # Rescale pixels to [-1, 1], pair each frame with its successor, and
    # append the (spatially tiled) actions as extra channels.
    clips = clips * 2 - 1
    clip_pairs = tf.concat([clips[:-1], clips[1:]], axis=-1)
    clip_pairs = tile_concat([clip_pairs, actions[..., None, None, :]], axis=-1)
    clip_pairs = tf_utils.transpose_batch_time(clip_pairs)

    layers = []
    net = clip_pairs
    # Layers 1-3: noise -> padded strided conv -> (norm) -> leaky relu.
    # The first layer deliberately skips normalization.
    for depth, filters in enumerate([ndf, ndf * 2, ndf * 4]):
        with tf.variable_scope("acvideo_layer_%d" % (depth + 1)):
            net = noise(net, use_noise, noise_sigma)
            net = conv3d(tf.pad(net, paddings), filters,
                         kernel_size=(3, 4, 4), strides=(1, 2, 2),
                         padding='VALID', use_bias=False)
            if depth > 0:
                net = norm_layer(net)
            net = lrelu(net, 0.2)
            layers.append(net)
    # Final layer: single-channel logits, no noise/norm/activation.
    with tf.variable_scope("acvideo_layer_4"):
        logits = conv3d(tf.pad(net, paddings), 1,
                        kernel_size=(3, 4, 4), strides=(1, 2, 2),
                        padding='VALID', use_bias=False)
        layers.append(logits)
    return nest.map_structure(tf_utils.transpose_batch_time, layers)
def video_sn_discriminator(clips, ndf=64):
    """Build a spectrally-normalized 3-D convolutional video discriminator.

    Seven spectral-norm conv layers with leaky-relu activations, followed by a
    spectral-norm dense layer that flattens each example to a single logit.

    Args:
        clips: clip tensor; converted with `transpose_batch_time` before use,
            and the leading static dimension of the result is used as the
            batch size for the final flatten (assumes it is statically known —
            TODO confirm).
        ndf: base number of discriminator filters.

    Returns:
        The list of per-layer activations (last entry is the logits), each
        passed back through `transpose_batch_time`.
    """
    clips = tf_utils.transpose_batch_time(clips)
    batch_size = clips.shape[0].value
    # Pad time and both spatial dims by one on each side; batch/channel stay.
    paddings = [[0, 0], [1, 1], [1, 1], [1, 1], [0, 0]]

    def conv3d(inputs, *args, **kwargs):
        # Local wrapper: every conv here is padded, VALID, spectral-normed.
        kwargs.setdefault('padding', 'VALID')
        kwargs.setdefault('use_spectral_norm', True)
        return ops.conv3d(tf.pad(inputs, paddings), *args, **kwargs)

    # (scope name, filters, kernel_size, strides) for each conv layer.
    conv_specs = [
        ("sn_conv0_0", ndf, 3, 1),
        ("sn_conv0_1", ndf * 2, 4, (1, 2, 2)),
        ("sn_conv1_0", ndf * 2, 3, 1),
        ("sn_conv1_1", ndf * 4, 4, (1, 2, 2)),
        ("sn_conv2_0", ndf * 4, 3, 1),
        ("sn_conv2_1", ndf * 8, 4, 2),
        ("sn_conv3_0", ndf * 8, 3, 1),
    ]
    layers = []
    net = clips
    for scope_name, filters, kernel_size, strides in conv_specs:
        with tf.variable_scope(scope_name):
            net = lrelu(
                conv3d(net, filters, kernel_size=kernel_size, strides=strides),
                0.1)
            layers.append(net)
    # Flatten everything after the batch dim and project to a single logit.
    with tf.variable_scope("sn_fc4"):
        logits = dense(tf.reshape(net, [batch_size, -1]), 1,
                       use_spectral_norm=True)
        layers.append(logits)
    layers = nest.map_structure(tf_utils.transpose_batch_time, layers)
    return layers
def create_video_discriminator(clips, ndf=64, norm_layer='instance'):
    """Build a plain 3-D convolutional video discriminator.

    Three spatially-strided conv layers (normalization skipped on the first)
    followed by a final conv that collapses to single-channel logits.

    Args:
        clips: clip tensor; converted with `transpose_batch_time` before use.
            (Assumes a 5-D layout whose dim 1 is the time extent after the
            transpose — TODO confirm.)
        ndf: base number of discriminator filters; doubled at each layer.
        norm_layer: name of the normalization layer resolved by
            `ops.get_norm_layer`.

    Returns:
        The list of per-layer activations (last entry is the logits), each
        transposed back via `transpose_batch_time`.
    """
    norm_layer = ops.get_norm_layer(norm_layer)
    # Pad only the two spatial dims; time and channel dims stay untouched.
    paddings = [[0, 0], [0, 0], [1, 1], [1, 1], [0, 0]]
    clips = tf_utils.transpose_batch_time(clips)

    layers = []
    net = clips
    # Layers 1-3: padded spatially-strided conv -> (norm) -> leaky relu.
    # The first layer deliberately skips normalization.
    for depth, filters in enumerate([ndf, ndf * 2, ndf * 4]):
        with tf.variable_scope("video_layer_%d" % (depth + 1)):
            net = conv3d(tf.pad(net, paddings), filters,
                         kernel_size=4, strides=(1, 2, 2), padding='VALID')
            if depth > 0:
                net = norm_layer(net)
            net = lrelu(net, 0.2)
            layers.append(net)
    with tf.variable_scope("video_layer_4"):
        # Shrink the temporal kernel when fewer than 4 steps remain, so the
        # VALID conv still fits.
        time_extent = net.shape[1].value
        kernel_size = (time_extent, 4, 4) if time_extent < 4 else 4
        logits = conv3d(net, 1, kernel_size=kernel_size, strides=1,
                        padding='VALID')
        layers.append(logits)
    return nest.map_structure(tf_utils.transpose_batch_time, layers)
def where_axis1(cond, x, y):
    """Select between `x` and `y` with `cond` applied along axis 1.

    `tf.where` matches `cond` against the leading axis, so both candidates
    are transposed with `transpose_batch_time` first, selected, and the
    result transposed back.
    """
    x_swapped = transpose_batch_time(x)
    y_swapped = transpose_batch_time(y)
    selected = tf.where(cond, x_swapped, y_swapped)
    return transpose_batch_time(selected)