def block(x, res):  # res = 2..resolution_log2
    """Build one discriminator stage: two 3x3 convolutions plus an optional skip path.

    The first convolution produces ``nf(res - 1)`` feature maps; the second is
    called with ``down=True`` and produces ``nf(res - 2)`` feature maps.  When
    ``architecture == 'resnet'`` the stage input is also passed through a 1x1
    ``down=True`` convolution and averaged with the main path using a
    ``1 / sqrt(2)`` scale.

    ``nf``, ``act``, ``architecture`` and ``resample_kernel`` are free names
    resolved from the enclosing scope.

    Args:
        x: Input activations of the stage.
        res: log2 of the stage resolution.

    Returns:
        The output activations of the stage.
    """
    stage_input = x

    with tf.variable_scope('Conv0'):
        conv0 = conv2d_layer(x, fmaps=nf(res - 1), kernel=3)
        x = apply_bias_act(conv0, act=act)

    with tf.variable_scope('Conv1_down'):
        conv1 = conv2d_layer(
            x,
            fmaps=nf(res - 2),
            kernel=3,
            down=True,
            resample_kernel=resample_kernel)
        x = apply_bias_act(conv1, act=act)

    # Only the residual architecture adds the shortcut branch.
    if architecture != 'resnet':
        return x

    with tf.variable_scope('Skip'):
        shortcut = conv2d_layer(
            stage_input,
            fmaps=nf(res - 2),
            kernel=1,
            down=True,
            resample_kernel=resample_kernel)
        x = (x + shortcut) * (1 / np.sqrt(2))
    return x
def block(x, res):  # res = 3..resolution_log2
    """Build one upsampling stage: an ``up=True`` layer, a plain layer, optional skip.

    Both layers are created through the ``layer`` helper with
    ``nf(res - 1)`` feature maps and consecutive ``layer_idx`` values
    (``res*2-5`` and ``res*2-4``).  When ``architecture == 'resnet'`` the
    stage input goes through a 1x1 ``up=True`` convolution and is averaged
    with the main path using a ``1 / sqrt(2)`` scale.

    ``layer``, ``nf``, ``architecture`` and ``resample_kernel`` are free
    names resolved from the enclosing scope.

    Args:
        x: Input activations of the stage.
        res: log2 of the stage resolution.

    Returns:
        The output activations of the stage.
    """
    stage_input = x
    up_layer_idx = res * 2 - 5
    conv_layer_idx = res * 2 - 4

    with tf.variable_scope('Conv0_up'):
        x = layer(x, layer_idx=up_layer_idx, fmaps=nf(res - 1), kernel=3, up=True)

    with tf.variable_scope('Conv1'):
        x = layer(x, layer_idx=conv_layer_idx, fmaps=nf(res - 1), kernel=3)

    # Only the residual architecture adds the shortcut branch.
    if architecture != 'resnet':
        return x

    with tf.variable_scope('Skip'):
        shortcut = conv2d_layer(
            stage_input,
            fmaps=nf(res - 1),
            kernel=1,
            up=True,
            resample_kernel=resample_kernel)
        x = (x + shortcut) * (1 / np.sqrt(2))
    return x
def block(x, res):  # res = 2..resolution_log2
    """Build one discriminator stage, with label-conditioned attention at ``res == 4``.

    Every stage runs a 3x3 convolution followed by a 3x3 ``down=True``
    convolution.  At ``res == 4`` a spatial attention map is then computed
    from a 1-channel projection of the features concatenated with a dense
    projection of ``dlabel``; the softmaxed map re-weights a second
    1-channel projection of the features, which is expanded back to the
    feature-map count and added to ``x`` scaled by a zero-initialised
    ``attention_gamma`` variable.  With ``architecture == 'resnet'`` a 1x1
    ``down=True`` shortcut is averaged in afterwards.

    ``nf``, ``act``, ``architecture``, ``resample_kernel``, ``dlabel`` and
    ``label_mapping_fmaps`` are free names resolved from the enclosing scope.

    Args:
        x: Input activations of the stage.
        res: log2 of the stage resolution.

    Returns:
        ``(x, attention_map)``; ``attention_map`` is ``tf.constant(0)`` for
        every stage other than ``res == 4``.
    """
    # Placeholder returned by stages that do not compute attention.
    attention_map = tf.constant(0)
    stage_input = x

    with tf.variable_scope('Conv0'):
        conv0 = conv2d_layer(x, fmaps=nf(res - 1), kernel=3)
        x = apply_bias_act(conv0, act=act)

    with tf.variable_scope('Conv1_down'):
        conv1 = conv2d_layer(
            x,
            fmaps=nf(res - 2),
            kernel=3,
            down=True,
            resample_kernel=resample_kernel)
        x = apply_bias_act(conv1, act=act)

    if res == 4:
        fmap_height = x.shape[2]
        fmap_width = x.shape[3]
        num_positions = fmap_height * fmap_width

        with tf.variable_scope('fmap_attention'):
            fmap_attention = conv2d_layer(x, fmaps=1, kernel=1)
            fmap_attention = tf.reshape(fmap_attention, [-1, num_positions])

        with tf.variable_scope('label_attention'):
            label_attention = dense_layer(dlabel, fmaps=label_mapping_fmaps)

        with tf.variable_scope('combine_attention'):
            joint_features = tf.concat([fmap_attention, label_attention], axis=-1)
            attention_map = dense_layer(joint_features, fmaps=num_positions)

        with tf.variable_scope('x_reduced_channels'):
            x_reduced_channels = conv2d_layer(x, fmaps=1, kernel=1)

        # Normalise over all spatial positions, then restore the 2-D layout.
        attention_map = tf.nn.softmax(attention_map, axis=-1)
        attention_map = tf.reshape(attention_map, [-1, 1, fmap_height, fmap_width])
        combine = x_reduced_channels * attention_map

        with tf.variable_scope('x_increase_channels'):
            x_increase_channels = conv2d_layer(combine, fmaps=x.shape[1], kernel=1)

        with tf.variable_scope('Gamma_Attention'):
            # Zero initial value: the attention term contributes nothing at the start.
            gamma = tf.get_variable(
                shape=[],
                initializer=tf.initializers.zeros(),
                name='attention_gamma')
        x = x + x_increase_channels * gamma

    if architecture == 'resnet':
        with tf.variable_scope('Skip'):
            shortcut = conv2d_layer(
                stage_input,
                fmaps=nf(res - 2),
                kernel=1,
                down=True,
                resample_kernel=resample_kernel)
            x = (x + shortcut) * (1 / np.sqrt(2))

    return x, attention_map
def fromrgb(x, y, res):  # res = 2..resolution_log2
    """Project image ``y`` to feature maps and merge it into ``x``.

    A 1x1 convolution with ``nf(res - 1)`` output feature maps is applied to
    ``y``, followed by bias and the ``act`` activation.  ``nf`` and ``act``
    are free names resolved from the enclosing scope.

    Args:
        x: Existing activations to add the projection to, or ``None``.
        y: Image tensor to project.
        res: log2 of the current resolution.

    Returns:
        The projection alone when ``x`` is ``None``, otherwise ``x`` plus
        the projection.
    """
    with tf.variable_scope('FromRGB'):
        projected = conv2d_layer(y, fmaps=nf(res - 1), kernel=1)
        projected = apply_bias_act(projected, act=act)
        if x is None:
            return projected
        return x + projected
def D_stylegan2(
    images_in,                          # First input: Images [minibatch, channel, height, width].
    labels_in,                          # Second input: Labels [minibatch, label_size].
    num_channels        = 3,            # Number of input color channels. Overridden based on dataset.
    resolution          = 1024,         # Input resolution. Overridden based on dataset.
    label_size          = 0,            # Dimensionality of the labels, 0 if no labels. Overridden based on dataset.
    fmap_base           = 16 << 10,     # Overall multiplier for the number of feature maps.
    fmap_decay          = 1.0,          # log2 feature map reduction when doubling the resolution.
    fmap_min            = 1,            # Minimum number of feature maps in any layer.
    fmap_max            = 512,          # Maximum number of feature maps in any layer.
    architecture        = 'resnet',     # Architecture: 'orig', 'skip', 'resnet'.
    nonlinearity        = 'lrelu',      # Activation function: 'relu', 'lrelu', etc.
    mbstd_group_size    = 4,            # Group size for the minibatch standard deviation layer, 0 = disable.
    mbstd_num_features  = 1,            # Number of features for the minibatch standard deviation layer.
    dtype               = 'float32',    # Data type to use for activations and outputs.
    resample_kernel     = [1, 3, 3, 1], # Low-pass filter to apply when resampling activations. None = no filtering.
    **_kwargs):                         # Ignore unrecognized keyword args.
    """Build the discriminator graph and return per-sample scores.

    The network walks from ``resolution`` down to 8x8 with one ``block`` per
    stage, then applies the 4x4 head (optional minibatch-stddev layer, 3x3
    convolution, dense layer) and a dense output layer.  When labels are
    present, the output has one unit per label and is reduced to a single
    score by a dot product with ``tf.floor(labels_in)``, so fractional
    (interpolated) label entries are ignored.

    Returns:
        Tensor named ``scores_out`` with dtype ``dtype``.
    """
    resolution_log2 = int(np.log2(resolution))
    assert resolution == 2**resolution_log2 and resolution >= 4

    def nf(stage):
        # Feature-map count for a stage, clamped to [fmap_min, fmap_max].
        unclamped = int(fmap_base / (2.0**(stage * fmap_decay)))
        return np.clip(unclamped, fmap_min, fmap_max)

    assert architecture in ['orig', 'skip', 'resnet']
    act = nonlinearity
    is_skip = architecture == 'skip'

    images_in.set_shape([None, num_channels, resolution, resolution])
    labels_in.set_shape([None, label_size])
    images_in = tf.cast(images_in, dtype)
    labels_in = tf.cast(labels_in, dtype)

    # Building blocks for main layers.
    def downsample(y):
        with tf.variable_scope('Downsample'):
            return downsample_2d(y, k=resample_kernel)

    def fromrgb(x, y, res):  # res = 2..resolution_log2
        with tf.variable_scope('FromRGB'):
            projected = conv2d_layer(y, fmaps=nf(res - 1), kernel=1)
            projected = apply_bias_act(projected, act=act)
            if x is None:
                return projected
            return x + projected

    def block(x, res):  # res = 2..resolution_log2
        stage_input = x
        with tf.variable_scope('Conv0'):
            x = conv2d_layer(x, fmaps=nf(res - 1), kernel=3)
            x = apply_bias_act(x, act=act)
        with tf.variable_scope('Conv1_down'):
            x = conv2d_layer(
                x,
                fmaps=nf(res - 2),
                kernel=3,
                down=True,
                resample_kernel=resample_kernel)
            x = apply_bias_act(x, act=act)
        if architecture != 'resnet':
            return x
        with tf.variable_scope('Skip'):
            shortcut = conv2d_layer(
                stage_input,
                fmaps=nf(res - 2),
                kernel=1,
                down=True,
                resample_kernel=resample_kernel)
            x = (x + shortcut) * (1 / np.sqrt(2))
        return x

    # Main layers.
    x = None
    y = images_in
    for res in range(resolution_log2, 2, -1):
        side = 2**res
        with tf.variable_scope('%dx%d' % (side, side)):
            # The image is injected once at full resolution, or at every stage for 'skip'.
            if is_skip or res == resolution_log2:
                x = fromrgb(x, y, res)
            x = block(x, res)
            if is_skip:
                y = downsample(y)

    # Final layers.
    with tf.variable_scope('4x4'):
        if is_skip:
            x = fromrgb(x, y, 2)
        if mbstd_group_size > 1:
            with tf.variable_scope('MinibatchStddev'):
                x = minibatch_stddev_layer(x, mbstd_group_size, mbstd_num_features)
        with tf.variable_scope('Conv'):
            x = conv2d_layer(x, fmaps=nf(1), kernel=3)
            x = apply_bias_act(x, act=act)
        with tf.variable_scope('Dense0'):
            x = dense_layer(x, fmaps=nf(0))
            x = apply_bias_act(x, act=act)

    # Output layer with label conditioning from "Which Training Methods for GANs do actually Converge?"
    num_labels = labels_in.shape[1]
    with tf.variable_scope('Output'):
        x = apply_bias_act(dense_layer(x, fmaps=max(num_labels, 1)))
        if num_labels > 0:
            # Ignore interpolated labels [1, 0, 0, 0.3, 0.7] -> [1, 0, 0, 0, 0]
            hard_labels = tf.floor(labels_in)
            x = tf.reduce_sum(x * hard_labels, axis=1, keepdims=True)
    scores_out = x

    # Output.
    assert scores_out.dtype == tf.as_dtype(dtype)
    scores_out = tf.identity(scores_out, name='scores_out')
    return scores_out

#----------------------------------------------------------------------------
def D_stylegan2(
    images_in,                          # First input: Images [minibatch, channel, height, width].
    labels_in,                          # Second input: Labels [minibatch, label_size].
    num_channels        = 3,            # Number of input color channels. Overridden based on dataset.
    resolution          = 256,          # Input resolution. Overridden based on dataset.
    label_size          = 127,          # Dimensionality of the labels, 0 if no labels. Overridden based on dataset.
    fmap_base           = 16 << 10,     # Overall multiplier for the number of feature maps.
    fmap_decay          = 1.0,          # log2 feature map reduction when doubling the resolution.
    fmap_min            = 1,            # Minimum number of feature maps in any layer.
    fmap_max            = 512,          # Maximum number of feature maps in any layer.
    architecture        = 'resnet',     # Architecture: 'orig', 'skip', 'resnet'.
    nonlinearity        = 'lrelu',      # Activation function: 'relu', 'lrelu', etc.
    mbstd_group_size    = 4,            # Group size for the minibatch standard deviation layer, 0 = disable.
    mbstd_num_features  = 1,            # Number of features for the minibatch standard deviation layer.
    dtype               = 'float32',    # Data type to use for activations and outputs.
    resample_kernel     = [1, 3, 3, 1], # Low-pass filter to apply when resampling activations. None = no filtering.
    dlabel_size         = 32,           # Size of the mapped label vector produced by D_mapping_label.
    cutoff_layer        = 7,            # log2 resolution of the stage whose attention internals are returned.
    **_kwargs):                         # Ignore unrecognized keyword args.
    """Build a discriminator with label-conditioned self-attention in stages 4..7.

    Stages with ``3 < res < 8`` add an attention branch computed on a
    downsampled copy of the features; its output is upsampled and added to
    the stage output, scaled by a zero-initialised ``attention_gamma``.

    Returns:
        When the main loop reaches ``res == cutoff_layer`` the function
        returns whatever ``block`` returns for that stage, which for an
        attention stage is the tuple ``(v_x_upsample, gamma, x,
        attention_map_h_multiply_reshape)``.  Otherwise it returns the
        ``scores_out`` tensor of per-sample scores.
    """
    resolution_log2 = int(np.log2(resolution))
    assert resolution == 2**resolution_log2 and resolution >= 4

    def nf(stage):
        # Feature-map count for a stage, clamped to [fmap_min, fmap_max].
        unclamped = int(fmap_base / (2.0**(stage * fmap_decay)))
        return np.clip(unclamped, fmap_min, fmap_max)

    assert architecture in ['orig', 'skip', 'resnet']
    act = nonlinearity
    is_skip = architecture == 'skip'

    dlabel = D_mapping_label(
        labels_in=labels_in,
        label_size=label_size,
        dlabel_size=dlabel_size)

    images_in.set_shape([None, num_channels, resolution, resolution])
    labels_in.set_shape([None, label_size])
    dlabel.set_shape([None, dlabel_size])
    images_in = tf.cast(images_in, dtype)
    labels_in = tf.cast(labels_in, dtype)
    dlabel = tf.cast(dlabel, dtype)

    # Building blocks for main layers.
    def fromrgb(x, y, res):  # res = 2..resolution_log2
        with tf.variable_scope('FromRGB'):
            projected = conv2d_layer(y, fmaps=nf(res - 1), kernel=1)
            projected = apply_bias_act(projected, act=act)
            if x is None:
                return projected
            return x + projected

    def downsample(y):
        with tf.variable_scope('Downsample'):
            return downsample_2d(y, k=resample_kernel)

    def block(x, res):  # res = 2..resolution_log2
        stage_input = x
        with tf.variable_scope('Conv0'):
            x = conv2d_layer(x, fmaps=nf(res - 1), kernel=3)
            x = apply_bias_act(x, act=act)
        with tf.variable_scope('Conv1_down'):
            x = conv2d_layer(
                x,
                fmaps=nf(res - 2),
                kernel=3,
                down=True,
                resample_kernel=resample_kernel)
            x = apply_bias_act(x, act=act)

        if 3 < res < 8:
            with tf.variable_scope('Downsample'):
                x_downsample = downsample_2d(x)
            height = x_downsample.shape[2]
            width = x_downsample.shape[3]
            num_positions = height * width
            c_reduced = 1
            label_mapping_fmaps = 16

            # "f" branch: features + label -> [batch, positions, c_reduced].
            with tf.variable_scope('F_Attention'):
                f_x = conv2d_layer(x_downsample, fmaps=1, kernel=1)
                f_x = tf.reshape(f_x, [-1, num_positions])
            with tf.variable_scope('Label_F_Attention'):
                label_f = dense_layer(dlabel, fmaps=label_mapping_fmaps)
                label_f = apply_bias_act(label_f) + 1
            with tf.variable_scope('F_concat_Attention'):
                f_joint = tf.concat([f_x, label_f], axis=-1)
                f_x_s = dense_layer(f_joint, fmaps=c_reduced * num_positions)
                f_x_s = tf.reshape(f_x_s, [-1, c_reduced, num_positions])
                f_x_s = tf.transpose(f_x_s, perm=[0, 2, 1])

            # "g" branch: features + label -> [batch, c_reduced, positions].
            with tf.variable_scope('G_Attention'):
                g_x = conv2d_layer(x_downsample, fmaps=1, kernel=1)
                g_x = tf.reshape(g_x, [-1, num_positions])
            with tf.variable_scope('Label_G_Attention'):
                label_g = dense_layer(dlabel, fmaps=label_mapping_fmaps)
                label_g = apply_bias_act(label_g) + 1
            with tf.variable_scope('G_concat_Attention'):
                g_joint = tf.concat([g_x, label_g], axis=-1)
                g_x_s = dense_layer(g_joint, fmaps=c_reduced * num_positions)
                g_x_s = tf.reshape(g_x_s, [-1, c_reduced, num_positions])

            # "h" branch: the values that the attention map re-weights.
            with tf.variable_scope('H_Attention'):
                h_x = conv2d_layer(x_downsample, fmaps=c_reduced, kernel=1)
                h_x = tf.reshape(h_x, [-1, c_reduced, num_positions])

            f_g_multiply = tf.matmul(f_x_s, g_x_s)
            attention_map = tf.nn.softmax(f_g_multiply, axis=-1)
            attention_transposed = tf.transpose(attention_map, [0, 2, 1])
            attention_map_h_multiply = tf.matmul(h_x, attention_transposed)
            attention_map_h_multiply_reshape = tf.reshape(
                attention_map_h_multiply, [-1, c_reduced, height, width])

            with tf.variable_scope('V_Attention'):
                v_x = conv2d_layer(
                    attention_map_h_multiply_reshape,
                    fmaps=x_downsample.shape[1],
                    kernel=1)
            with tf.variable_scope('Upsample'):
                v_x_upsample = upsample_2d(v_x)
            with tf.variable_scope('Gamma_Attention'):
                # Zero initial value: the attention term contributes nothing at the start.
                gamma = tf.get_variable(
                    shape=[],
                    initializer=tf.initializers.zeros(),
                    name='attention_gamma')
            x = x + v_x_upsample * gamma

            # NOTE(review): placed inside the attention branch because the returned
            # tensors only exist here -- confirm against the original layout.
            if res == cutoff_layer:
                return v_x_upsample, gamma, x, attention_map_h_multiply_reshape

        if architecture == 'resnet':
            with tf.variable_scope('Skip'):
                shortcut = conv2d_layer(
                    stage_input,
                    fmaps=nf(res - 2),
                    kernel=1,
                    down=True,
                    resample_kernel=resample_kernel)
                x = (x + shortcut) * (1 / np.sqrt(2))
        return x

    # Main layers.
    x = None
    y = images_in
    for res in range(resolution_log2, 2, -1):
        side = 2**res
        with tf.variable_scope('%dx%d' % (side, side)):
            if is_skip or res == resolution_log2:
                x = fromrgb(x, y, res)
            # Stop at the cutoff stage and hand its result straight to the caller.
            if res == cutoff_layer:
                return block(x, res)
            x = block(x, res)
            if is_skip:
                y = downsample(y)

    # Final layers.
    with tf.variable_scope('4x4'):
        if is_skip:
            x = fromrgb(x, y, 2)
        if mbstd_group_size > 1:
            with tf.variable_scope('MinibatchStddev'):
                x = minibatch_stddev_layer(x, mbstd_group_size, mbstd_num_features)
        with tf.variable_scope('Conv'):
            x = conv2d_layer(x, fmaps=nf(1), kernel=3)
            x = apply_bias_act(x, act=act)
        with tf.variable_scope('Dense0'):
            x = dense_layer(x, fmaps=nf(0))
            x = apply_bias_act(x, act=act)

    # Output layer with label conditioning from "Which Training Methods for GANs do actually Converge?"
    num_labels = labels_in.shape[1]
    with tf.variable_scope('Output'):
        x = apply_bias_act(dense_layer(x, fmaps=max(num_labels, 1)))
        if num_labels > 0:
            x = tf.reduce_sum(x * labels_in, axis=1, keepdims=True)
    scores_out = x

    # Output.
    assert scores_out.dtype == tf.as_dtype(dtype)
    scores_out = tf.identity(scores_out, name='scores_out')
    return scores_out

#----------------------------------------------------------------------------
def block(x, res):  # res = 2..resolution_log2
    """Build one discriminator stage, with label-conditioned self-attention for 3 < res < 8.

    Every stage runs a 3x3 convolution followed by a 3x3 ``down=True``
    convolution.  Attention stages then work on a downsampled copy of the
    features:

    * ``f`` and ``g`` each combine a 1-channel projection of the features
      with a dense projection of ``dlabel`` and are reshaped to
      ``[-1, positions, c_reduced]`` and ``[-1, c_reduced, positions]``;
    * their matrix product is softmaxed over the last axis and applied to
      the ``h`` projection;
    * the result is expanded by a 1x1 convolution, upsampled, and added to
      ``x`` scaled by the zero-initialised ``attention_gamma`` variable.

    ``nf``, ``act``, ``architecture``, ``resample_kernel``, ``dlabel`` and
    ``cutoff_layer`` are free names resolved from the enclosing scope.

    Args:
        x: Input activations of the stage.
        res: log2 of the stage resolution.

    Returns:
        ``(v_x_upsample, gamma, x, attention_map_h_multiply_reshape)`` for an
        attention stage with ``res == cutoff_layer``; otherwise the stage
        output tensor.
    """
    stage_input = x

    with tf.variable_scope('Conv0'):
        x = conv2d_layer(x, fmaps=nf(res - 1), kernel=3)
        x = apply_bias_act(x, act=act)

    with tf.variable_scope('Conv1_down'):
        x = conv2d_layer(
            x,
            fmaps=nf(res - 2),
            kernel=3,
            down=True,
            resample_kernel=resample_kernel)
        x = apply_bias_act(x, act=act)

    if 3 < res < 8:
        with tf.variable_scope('Downsample'):
            x_downsample = downsample_2d(x)
        height = x_downsample.shape[2]
        width = x_downsample.shape[3]
        num_positions = height * width
        c_reduced = 1
        label_mapping_fmaps = 16

        # "f" branch: features + label -> [batch, positions, c_reduced].
        with tf.variable_scope('F_Attention'):
            f_x = conv2d_layer(x_downsample, fmaps=1, kernel=1)
            f_x = tf.reshape(f_x, [-1, num_positions])
        with tf.variable_scope('Label_F_Attention'):
            label_f = dense_layer(dlabel, fmaps=label_mapping_fmaps)
            label_f = apply_bias_act(label_f) + 1
        with tf.variable_scope('F_concat_Attention'):
            f_joint = tf.concat([f_x, label_f], axis=-1)
            f_x_s = dense_layer(f_joint, fmaps=c_reduced * num_positions)
            f_x_s = tf.reshape(f_x_s, [-1, c_reduced, num_positions])
            f_x_s = tf.transpose(f_x_s, perm=[0, 2, 1])

        # "g" branch: features + label -> [batch, c_reduced, positions].
        with tf.variable_scope('G_Attention'):
            g_x = conv2d_layer(x_downsample, fmaps=1, kernel=1)
            g_x = tf.reshape(g_x, [-1, num_positions])
        with tf.variable_scope('Label_G_Attention'):
            label_g = dense_layer(dlabel, fmaps=label_mapping_fmaps)
            label_g = apply_bias_act(label_g) + 1
        with tf.variable_scope('G_concat_Attention'):
            g_joint = tf.concat([g_x, label_g], axis=-1)
            g_x_s = dense_layer(g_joint, fmaps=c_reduced * num_positions)
            g_x_s = tf.reshape(g_x_s, [-1, c_reduced, num_positions])

        # "h" branch: the values that the attention map re-weights.
        with tf.variable_scope('H_Attention'):
            h_x = conv2d_layer(x_downsample, fmaps=c_reduced, kernel=1)
            h_x = tf.reshape(h_x, [-1, c_reduced, num_positions])

        f_g_multiply = tf.matmul(f_x_s, g_x_s)
        attention_map = tf.nn.softmax(f_g_multiply, axis=-1)
        attention_transposed = tf.transpose(attention_map, [0, 2, 1])
        attention_map_h_multiply = tf.matmul(h_x, attention_transposed)
        attention_map_h_multiply_reshape = tf.reshape(
            attention_map_h_multiply, [-1, c_reduced, height, width])

        with tf.variable_scope('V_Attention'):
            v_x = conv2d_layer(
                attention_map_h_multiply_reshape,
                fmaps=x_downsample.shape[1],
                kernel=1)
        with tf.variable_scope('Upsample'):
            v_x_upsample = upsample_2d(v_x)
        with tf.variable_scope('Gamma_Attention'):
            # Zero initial value: the attention term contributes nothing at the start.
            gamma = tf.get_variable(
                shape=[],
                initializer=tf.initializers.zeros(),
                name='attention_gamma')
        x = x + v_x_upsample * gamma

        # NOTE(review): placed inside the attention branch because the returned
        # tensors only exist here -- confirm against the original layout.
        if res == cutoff_layer:
            return v_x_upsample, gamma, x, attention_map_h_multiply_reshape

    if architecture == 'resnet':
        with tf.variable_scope('Skip'):
            shortcut = conv2d_layer(
                stage_input,
                fmaps=nf(res - 2),
                kernel=1,
                down=True,
                resample_kernel=resample_kernel)
            x = (x + shortcut) * (1 / np.sqrt(2))
    return x
def D_stylegan2(
    images_in,                          # First input: Images [minibatch, channel, height, width].
    labels_in,                          # Second input: Labels [minibatch, label_size].
    num_channels        = 3,            # Number of input color channels. Overridden based on dataset.
    resolution          = 256,          # Input resolution. Overridden based on dataset.
    label_size          = 127,          # Dimensionality of the labels, 0 if no labels. Overridden based on dataset.
    fmap_base           = 16 << 10,     # Overall multiplier for the number of feature maps.
    fmap_decay          = 1.0,          # log2 feature map reduction when doubling the resolution.
    fmap_min            = 1,            # Minimum number of feature maps in any layer.
    fmap_max            = 512,          # Maximum number of feature maps in any layer.
    architecture        = 'resnet',     # Architecture: 'orig', 'skip', 'resnet'.
    nonlinearity        = 'lrelu',      # Activation function: 'relu', 'lrelu', etc.
    mbstd_group_size    = 4,            # Group size for the minibatch standard deviation layer, 0 = disable.
    mbstd_num_features  = 1,            # Number of features for the minibatch standard deviation layer.
    dtype               = 'float32',    # Data type to use for activations and outputs.
    resample_kernel     = [1, 3, 3, 1], # Low-pass filter to apply when resampling activations. None = no filtering.
    components          = dnnlib.EasyDict(),    # Container for sub-networks. Retained between calls.
    mapping_label_func  = 'D_mapping_label',    # Name of the module-level build function for the label-mapping sub-network.
    dlabel_size         = 64,           # Passed to the label-mapping sub-network as dlabel_size.
    use_attention_downsampling = False, # Not read anywhere in this function.
    label_mapping_fmaps = 32,           # Width of the dense label projection inside the attention block.
    output_fmap_res     = 3,            # log2 resolution of the stage whose outputs are returned; must be in 3..log2(resolution).
    **_kwargs):                         # Ignore unrecognized keyword args.
    """Build the discriminator and return its raw output plus intermediate tensors.

    Labels are first mapped through a ``tflib.Network`` stored in
    ``components.mapping_label`` (created on first use from the module-level
    function named by ``mapping_label_func``).  The stage with ``res == 4``
    adds a label-conditioned spatial attention term to its features.

    Returns:
        Tuple ``(x, fmap_output, attention_map_out)``:

        * ``x`` -- output of the final dense ``Output`` layer, with
          ``max(label_size, 1)`` units.  It is NOT reduced against
          ``labels_in`` and is not wrapped in a ``scores_out`` identity.
        * ``fmap_output`` -- activations produced by the stage with
          ``res == output_fmap_res``.
        * ``attention_map_out`` -- that same stage's attention map; this is
          ``tf.constant(0)`` unless ``output_fmap_res == 4``, because only
          the ``res == 4`` stage computes attention.

    Raises:
        AssertionError: if ``resolution`` is not a power of two >= 4, if
            ``architecture`` is unknown, or if ``output_fmap_res`` does not
            match any stage built by the main loop.
    """
    resolution_log2 = int(np.log2(resolution))
    assert resolution == 2**resolution_log2 and resolution >= 4

    def nf(stage):
        # Feature-map count for a stage, clamped to [fmap_min, fmap_max].
        return np.clip(int(fmap_base / (2.0**(stage * fmap_decay))), fmap_min, fmap_max)

    assert architecture in ['orig', 'skip', 'resnet']
    # The main loop only visits res = resolution_log2 .. 3.  Any other value would
    # leave fmap_output / attention_map_out unbound at the return statement, so
    # reject it up front instead of failing after the whole graph has been built.
    assert 3 <= output_fmap_res <= resolution_log2, (
        'output_fmap_res must be in [3, %d], got %r' % (resolution_log2, output_fmap_res))
    act = nonlinearity

    # Label-mapping sub-network; cached in `components` so repeated calls reuse it.
    if 'mapping_label' not in components:
        components.mapping_label = tflib.Network(
            'D_mapping_label',
            func_name=globals()[mapping_label_func],
            label_size=label_size,
            dlabel_size=dlabel_size)
    dlabel = components.mapping_label.get_output_for(labels_in)
    dlabel = tf.cast(dlabel, dtype)

    images_in.set_shape([None, num_channels, resolution, resolution])
    labels_in.set_shape([None, label_size])
    images_in = tf.cast(images_in, dtype)
    labels_in = tf.cast(labels_in, dtype)

    # Building blocks for main layers.
    def fromrgb(x, y, res):  # res = 2..resolution_log2
        with tf.variable_scope('FromRGB'):
            t = apply_bias_act(conv2d_layer(y, fmaps=nf(res - 1), kernel=1), act=act)
            return t if x is None else x + t

    def downsample(y):
        with tf.variable_scope('Downsample'):
            return downsample_2d(y, k=resample_kernel)

    def block(x, res):  # res = 2..resolution_log2
        # Placeholder returned by stages that do not compute attention.
        attention_map = tf.constant(0)
        t = x
        with tf.variable_scope('Conv0'):
            x = apply_bias_act(conv2d_layer(x, fmaps=nf(res - 1), kernel=3), act=act)
        with tf.variable_scope('Conv1_down'):
            x = apply_bias_act(
                conv2d_layer(x, fmaps=nf(res - 2), kernel=3, down=True,
                             resample_kernel=resample_kernel),
                act=act)

        if res == 4:
            with tf.variable_scope('fmap_attention'):
                fmap_attention = conv2d_layer(x, fmaps=1, kernel=1)
                fmap_attention = tf.reshape(fmap_attention, [-1, x.shape[2] * x.shape[3]])
            with tf.variable_scope('label_attention'):
                label_attention = dense_layer(dlabel, fmaps=label_mapping_fmaps)
            with tf.variable_scope('combine_attention'):
                attention_map = dense_layer(
                    tf.concat([fmap_attention, label_attention], axis=-1),
                    fmaps=x.shape[2] * x.shape[3])
            with tf.variable_scope('x_reduced_channels'):
                x_reduced_channels = conv2d_layer(x, fmaps=1, kernel=1)
            # Normalise over all spatial positions, then restore the 2-D layout.
            attention_map = tf.nn.softmax(attention_map, axis=-1)
            attention_map = tf.reshape(attention_map, [-1, 1, x.shape[2], x.shape[3]])
            combine = x_reduced_channels * attention_map
            with tf.variable_scope('x_increase_channels'):
                x_increase_channels = conv2d_layer(combine, fmaps=x.shape[1], kernel=1)
            with tf.variable_scope('Gamma_Attention'):
                # Zero initial value: the attention term contributes nothing at the start.
                gamma = tf.get_variable(shape=[], initializer=tf.initializers.zeros(),
                                        name='attention_gamma')
            x = x + x_increase_channels * gamma

        if architecture == 'resnet':
            with tf.variable_scope('Skip'):
                t = conv2d_layer(t, fmaps=nf(res - 2), kernel=1, down=True,
                                 resample_kernel=resample_kernel)
                x = (x + t) * (1 / np.sqrt(2))
        return x, attention_map

    # Main layers.
    x = None
    y = images_in
    for res in range(resolution_log2, 2, -1):
        with tf.variable_scope('%dx%d' % (2**res, 2**res)):
            if architecture == 'skip' or res == resolution_log2:
                x = fromrgb(x, y, res)
            x, attention_map = block(x, res)
            if res == output_fmap_res:
                fmap_output = x
                attention_map_out = attention_map
            if architecture == 'skip':
                y = downsample(y)

    # Final layers.
    with tf.variable_scope('4x4'):
        if architecture == 'skip':
            x = fromrgb(x, y, 2)
        if mbstd_group_size > 1:
            with tf.variable_scope('MinibatchStddev'):
                x = minibatch_stddev_layer(x, mbstd_group_size, mbstd_num_features)
        with tf.variable_scope('Conv'):
            x = apply_bias_act(conv2d_layer(x, fmaps=nf(1), kernel=3), act=act)
        with tf.variable_scope('Dense0'):
            x = apply_bias_act(dense_layer(x, fmaps=nf(0)), act=act)

    # Output layer with label conditioning from "Which Training Methods for GANs do actually Converge?"
    with tf.variable_scope('Output'):
        x = apply_bias_act(dense_layer(x, fmaps=max(labels_in.shape[1], 1)))

    # NOTE: this variant returns here.  The label projection
    # (reduce_sum(x * labels_in)) and the 'scores_out' identity that used to follow
    # this return could never execute and have been removed.
    return x, fmap_output, attention_map_out

#----------------------------------------------------------------------------