def construct_latent_tower(self, images):
    """Builds the convolutional latent tower of the stochastic model.

    The first `latent_num_frames` entries of `images` (all input + target
    frames when that hparam is 0) are unstacked along axis 0, concatenated
    on axis 3 and passed through three conv + batch-norm stages. Two
    parallel stride-2 convolutions then produce the latent mean and std;
    `hparams.latent_std_min` is added to the std output.

    Outside of TRAIN mode the tower is still built, but zero tensors shaped
    like its outputs are returned instead.

    Args:
      images: tensor of ground truth image sequences.

    Returns:
      A `(mean, std)` pair of tensors.
    """
    hparams = self.hparams
    conv_size = self.tinyify([32, 64, 64])

    with tf.variable_scope("latent", reuse=tf.AUTO_REUSE):
        # Limiting the number of frames here allows more predicted frames at
        # inference time.
        num_frames = hparams.latent_num_frames
        if num_frames == 0:
            # Use all frames by default.
            num_frames = (hparams.video_num_input_frames +
                          hparams.video_num_target_frames)
        tf.logging.info("Creating latent tower with %d frames." % num_frames)

        frames = tf.unstack(images[:num_frames], axis=0)
        net = tf.concat(frames, 3)

        # (stage number, conv strides, pad input to even size first)
        stages = ((1, (2, 2), True), (2, (2, 2), True), (3, (1, 1), False))
        for stage, strides, pad_first in stages:
            if pad_first:
                net = common_layers.make_even_size(net)
            net = tfl.conv2d(
                net, conv_size[stage - 1], [3, 3],
                strides=strides,
                padding="SAME",
                activation=tf.nn.relu,
                name="latent_conv%d" % stage)
            net = tfcl.batch_norm(
                net,
                updates_collections=None,
                is_training=self.is_training,
                scope="latent_bn%d" % stage)

        channels = hparams.latent_channels
        mean = tfl.conv2d(
            net, channels, [3, 3],
            strides=(2, 2), padding="SAME", activation=None,
            name="latent_mean")
        std = tfl.conv2d(
            net, channels, [3, 3],
            strides=(2, 2), padding="SAME", activation=tf.nn.relu,
            name="latent_std")
        std = std + hparams.latent_std_min

        # No latent tower at inference time, just standard gaussian.
        if hparams.mode == tf.estimator.ModeKeys.TRAIN:
            return mean, std
        return tf.zeros_like(mean), tf.zeros_like(std)
def construct_latent_tower(self, images):
    """Builds convolutional latent tower for stochastic model.

    All frames in `images` are concatenated on axis 3 and passed through
    three conv + batch-norm stages. Two parallel stride-2 convolutions then
    produce the latent mean and std; `hparams.latent_std_min` is added to
    the std output. Standard-normal samples are always drawn: one per
    predicted timestep (`len(images) - 1`) when `hparams.multi_latent` is
    set, otherwise a single sample shaped like the mean.

    Args:
      images: list of ground truth image tensors, one per timestep.

    Returns:
      A `(latent_mean, latent_std, samples)` triple. In TRAIN mode the first
      two entries are the tower outputs; in any other mode they are `None`
      and only the standard-normal `samples` carry information.
    """
    sequence_length = len(images)

    with tf.variable_scope("latent"):
        images = tf.concat(images, 3)

        x = images
        x = common_layers.make_even_size(x)
        x = slim.conv2d(x, 32, [3, 3], stride=2, scope="latent_conv1")
        x = slim.batch_norm(x, scope="latent_bn1")
        x = common_layers.make_even_size(x)
        x = slim.conv2d(x, 64, [3, 3], stride=2, scope="latent_conv2")
        x = slim.batch_norm(x, scope="latent_bn2")
        x = slim.conv2d(x, 64, [3, 3], stride=1, scope="latent_conv3")
        x = slim.batch_norm(x, scope="latent_bn3")

        nc = self.hparams.latent_channels
        mean = slim.conv2d(x, nc, [3, 3], stride=2, activation_fn=None,
                           scope="latent_mean")
        std = slim.conv2d(x, nc, [3, 3], stride=2, scope="latent_std")
        std += self.hparams.latent_std_min

        if self.hparams.multi_latent:
            # timestep x batch_size x latent_size
            # Build the shape dynamically (as the single-latent branch does)
            # so this works when the static shape of `mean` is only partially
            # known, e.g. an unknown batch dimension.
            sample_shape = tf.concat(
                [[sequence_length - 1], tf.shape(mean)], axis=0)
            samples = tf.random_normal(sample_shape, 0, 1, dtype=tf.float32)
        else:
            # batch_size x latent_size
            samples = tf.random_normal(tf.shape(mean), 0, 1,
                                       dtype=tf.float32)

    if self.hparams.mode == tf.estimator.ModeKeys.TRAIN:
        return mean, std, samples
    # No latent tower at inference time, just standard gaussian.
    return None, None, samples
def encode(self, inputs, target_space, hparams, features=None, losses=None):
    """Runs strided down-convolutions on `inputs`, then the parent encoder.

    The `hparams` argument is ignored in favour of `self.hparams`, which is
    also what gets forwarded to the parent `encode`.

    Args:
      inputs: tensor fed to the first down-convolution.
      target_space: forwarded unchanged to the parent `encode`.
      hparams: unused; overridden by `self.hparams`.
      features: forwarded unchanged to the parent `encode`.
      losses: forwarded unchanged to the parent `encode`.

    Returns:
      The `(encoder_output, encoder_decoder_attention_bias)` pair produced
      by the parent `encode`.
    """
    with tf.variable_scope("downstride"):
        hparams = self.hparams
        conv_kwargs = dict(
            strides=(2, 2),
            padding="SAME",
            activation=common_layers.belu)
        net = inputs
        # Each step: pad to even size, halve the resolution, normalise.
        for step in range(hparams.num_compress_steps):
            net = common_layers.make_even_size(net)
            net = tf.layers.conv2d(
                net, hparams.hidden_size, (4, 4),
                name="conv_%d" % step, **conv_kwargs)
            net = common_layers.layer_norm(net)

    parent = super(TransformerSketch, self)
    output, attention_bias = parent.encode(
        net, target_space, hparams, features=features, losses=losses)
    return output, attention_bias
def conv_latent_tower(images, time_axis, latent_channels=1, min_logvar=-5,
                      is_training=False, random_latent=False,
                      tiny_mode=False):
    """Builds convolutional latent tower for stochastic model.

    The frames of `images` are unstacked along `time_axis`, concatenated on
    axis 3 and passed through three conv + layer-norm stages. Two parallel
    stride-2 convolutions then produce the latent mean and log variance;
    `min_logvar` is added to the (ReLU-activated) log-variance output.

    When `is_training` is false, or when `random_latent` is true, zero
    tensors shaped like the tower outputs are returned instead (the tower is
    still built in both cases).

    Args:
      images: tensor of ground truth image sequences
      time_axis: the time axis in images tensor
      latent_channels: number of latent channels
      min_logvar: minimum value for log_var
      is_training: whether or not it is training mode
      random_latent: whether or not generate random latents. May be a Python
        bool or a boolean scalar tensor.
      tiny_mode: whether or not it is tiny_mode

    Returns:
      latent_mean: predicted latent mean
      latent_logvar: predicted latent log variance
    """
    conv_size = tinyify([32, 64, 64], tiny_mode)
    with tf.variable_scope("latent", reuse=tf.AUTO_REUSE):
        images = tf.to_float(images)
        images = tf.unstack(images, axis=time_axis)
        images = tf.concat(images, axis=3)

        x = images
        x = common_layers.make_even_size(x)
        x = tfl.conv2d(x, conv_size[0], [3, 3], strides=(2, 2),
                       padding="SAME", activation=tf.nn.relu,
                       name="latent_conv1")
        x = tfcl.layer_norm(x)
        x = tfl.conv2d(x, conv_size[1], [3, 3], strides=(2, 2),
                       padding="SAME", activation=tf.nn.relu,
                       name="latent_conv2")
        x = tfcl.layer_norm(x)
        x = tfl.conv2d(x, conv_size[2], [3, 3], strides=(1, 1),
                       padding="SAME", activation=tf.nn.relu,
                       name="latent_conv3")
        x = tfcl.layer_norm(x)

        nc = latent_channels
        mean = tfl.conv2d(x, nc, [3, 3], strides=(2, 2),
                          padding="SAME", activation=None,
                          name="latent_mean")
        # The layer keeps its "latent_std" name although it holds a log
        # variance; renaming it would change the variable names.
        logv = tfl.conv2d(x, nc, [3, 3], strides=(2, 2),
                          padding="SAME", activation=tf.nn.relu,
                          name="latent_std")
        logvar = logv + min_logvar

        # No latent tower at inference time, just standard gaussian.
        if not is_training:
            return tf.zeros_like(mean), tf.zeros_like(logvar)

        # TF1's tf.cond rejects Python bools (including the default
        # random_latent=False), so resolve those statically.
        if isinstance(random_latent, bool):
            if random_latent:
                return tf.zeros_like(mean), tf.zeros_like(logvar)
            return mean, logvar

        # No latent in the first phase
        ret_mean, ret_logvar = tf.cond(
            random_latent,
            lambda: (tf.zeros_like(mean), tf.zeros_like(logvar)),
            lambda: (mean, logvar))
        return ret_mean, ret_logvar
def construct_latent_tower(self, images):
    """Builds the convolutional latent tower of the stochastic model.

    The first `hparams.latent_num_frames` entries of `images` are
    concatenated on axis 3 and passed through three conv + batch-norm
    stages. Two parallel stride-2 convolutions then produce the latent mean
    and std; `hparams.latent_std_min` is added to the std output.

    Outside of TRAIN mode the tower is still built, but zero tensors shaped
    like its outputs are returned instead.

    Args:
      images: ground truth image sequence, sliced to its first
        `latent_num_frames` entries.

    Returns:
      A `(mean, std)` pair of tensors.
    """
    conv_size = self.tinyify([32, 64, 64])

    def conv_bn(inputs, index, stride):
        """One slim conv followed by batch norm, named by 1-based index."""
        outputs = slim.conv2d(
            inputs, conv_size[index - 1], [3, 3],
            stride=stride, scope="latent_conv%d" % index)
        return slim.batch_norm(outputs, scope="latent_bn%d" % index)

    with tf.variable_scope("latent", reuse=tf.AUTO_REUSE):
        # Using only a prefix of the frames allows more predicted frames at
        # inference time.
        frames = images[:self.hparams.latent_num_frames]
        net = tf.concat(frames, 3)

        net = conv_bn(common_layers.make_even_size(net), 1, 2)
        net = conv_bn(common_layers.make_even_size(net), 2, 2)
        net = conv_bn(net, 3, 1)

        channels = self.hparams.latent_channels
        mean = slim.conv2d(
            net, channels, [3, 3],
            stride=2, activation_fn=None, scope="latent_mean")
        std = slim.conv2d(net, channels, [3, 3], stride=2,
                          scope="latent_std")
        std = std + self.hparams.latent_std_min

        # No latent tower at inference time, just standard gaussian.
        if self.hparams.mode == tf.estimator.ModeKeys.TRAIN:
            return mean, std
        return tf.zeros_like(mean), tf.zeros_like(std)
def construct_latent_tower(self, images):
    """Builds the convolutional latent tower of the stochastic model.

    Takes the first `hparams.latent_num_frames` entries of `images`,
    concatenates them on axis 3 and applies three conv + batch-norm stages,
    followed by two parallel stride-2 convolutions for the latent mean and
    std (`hparams.latent_std_min` is added to the std).

    In any mode other than TRAIN, zeros shaped like the tower outputs are
    returned; the tower itself is built regardless.

    Args:
      images: ground truth image sequence.

    Returns:
      A `(mean, std)` pair of tensors.
    """
    hparams = self.hparams
    conv_size = self.tinyify([32, 64, 64])

    # Per stage: (stride, whether to pad the input to an even size first).
    stage_specs = [(2, True), (2, True), (1, False)]

    with tf.variable_scope("latent", reuse=tf.AUTO_REUSE):
        # this allows more predicted frames at inference time
        net = tf.concat(images[:hparams.latent_num_frames], 3)

        for number, (stride, pad_first) in enumerate(stage_specs, start=1):
            if pad_first:
                net = common_layers.make_even_size(net)
            net = slim.conv2d(net, conv_size[number - 1], [3, 3],
                              stride=stride,
                              scope="latent_conv%d" % number)
            net = slim.batch_norm(net, scope="latent_bn%d" % number)

        num_channels = hparams.latent_channels
        latent_mean = slim.conv2d(net, num_channels, [3, 3], stride=2,
                                  activation_fn=None, scope="latent_mean")
        latent_std = slim.conv2d(net, num_channels, [3, 3], stride=2,
                                 scope="latent_std")
        latent_std = latent_std + hparams.latent_std_min

        is_train = hparams.mode == tf.estimator.ModeKeys.TRAIN
        if not is_train:
            # No latent tower at inference time, just standard gaussian.
            return tf.zeros_like(latent_mean), tf.zeros_like(latent_std)
        return latent_mean, latent_std
def inject_latent(self, layer, features, filters):
    """Adds a discrete latent, derived from the target frame, to `layer`.

    In PREDICT mode the latent bits are random +/-1 values. Otherwise they
    are computed from `features["cur_target_frame"]` and
    `features["inputs"]`, squashed through a tanh bottleneck and binarised
    with a straight-through estimator; in TRAIN mode bits are additionally
    sign-flipped at random according to `hparams.bottleneck_noise`.

    Args:
      layer: tensor the latent is added to; its last dimension sets the
        output width of the "unbottleneck" projection.
      features: dict providing "cur_target_frame" and "inputs" (only read
        outside of PREDICT mode).
      filters: ignored; `hparams.hidden_size` is used instead.

    Returns:
      A `(layer + z, 0.0)` pair, where `z` is the projected latent.
    """
    del filters  # The width always comes from hparams.
    hparams = self.hparams
    out_channels = common_layers.shape_list(layer)[-1]
    width = hparams.hidden_size
    kernel = (4, 4)

    if hparams.mode == tf.estimator.ModeKeys.PREDICT:
        # No target frame available: draw the bits uniformly at random.
        dims = common_layers.shape_list(layer)
        if hparams.full_latent_tower:
            bits_shape = dims[:-1] + [hparams.bottleneck_bits]
        else:
            bits_shape = dims[:-3] + [1, 1, hparams.bottleneck_bits]
        uniform = tf.random_uniform(bits_shape)
        bits = 2.0 * tf.to_float(tf.less(0.5, uniform)) - 1.0
        projected = tf.layers.dense(bits, out_channels, name="unbottleneck")
        return layer + projected, 0.0

    # Embed.
    stacked = tf.concat(
        [features["cur_target_frame"], features["inputs"]], axis=-1)
    net = tf.layers.dense(
        stacked, width,
        name="latent_embed",
        bias_initializer=tf.random_normal_initializer(stddev=0.01))
    net = common_attention.add_timing_signal_nd(net)

    if not hparams.full_latent_tower:
        net = common_layers.double_discriminator(net)
        net = tf.expand_dims(tf.expand_dims(net, axis=1), axis=1)
    else:
        for step in range(hparams.num_compress_steps):
            with tf.variable_scope("latent_downstride%d" % step):
                net = common_layers.make_even_size(net)
                if step < hparams.filter_double_steps:
                    width *= 2
                net = common_attention.add_timing_signal_nd(net)
                net = tf.layers.conv2d(
                    net, width, kernel,
                    activation=common_layers.belu,
                    strides=(2, 2),
                    padding="SAME")
                net = common_layers.layer_norm(net)

    pre_bits = tf.layers.dense(net, hparams.bottleneck_bits,
                               name="bottleneck")
    net = tf.tanh(pre_bits)
    # Forward pass yields hard +/-1 bits; gradients flow through the tanh.
    hard = 2.0 * tf.to_float(tf.less(0.0, net)) - 1.0
    bits = net + tf.stop_gradient(hard - net)

    if hparams.mode == tf.estimator.ModeKeys.TRAIN:
        flips = tf.random_uniform(common_layers.shape_list(net))
        flips = 2.0 * tf.to_float(
            tf.less(hparams.bottleneck_noise, flips)) - 1.0
        bits = bits * flips

    projected = tf.layers.dense(bits, out_channels, name="unbottleneck")
    return layer + projected, 0.0
def make_even_size(self, x):
    """Pads `x` so that its size is even.

    When `self.is1d` is false this defers to
    `common_layers.make_even_size`. In the 1-D case only axis 1 is
    considered: `x` is returned untouched if its static length there is
    known and even, otherwise it is padded to a length divisible by 2.

    Args:
      x: tensor to pad.

    Returns:
      `x` itself or a padded version of it.
    """
    if self.is1d:
        length = x.get_shape().as_list()[1]
        if length is None or length % 2 != 0:
            # Unknown or odd length: pad axis 1 up to a multiple of 2.
            x, _ = common_layers.pad_to_same_length(
                x, x, final_length_divisible_by=2, axis=1)
        return x
    return common_layers.make_even_size(x)
def basic_conv_net(self, images, conv_size, scope):
    """Stack of stride-2 convolutions, each followed by layer norm.

    A ReLU is applied between consecutive layers (i.e. before every
    convolution except the first), and the input of every convolution is
    padded to an even size.

    Args:
      images: input tensor.
      conv_size: per-layer output channel counts; passed through
        `self.tinyify` before use.
      scope: name of the variable scope (created with AUTO_REUSE).

    Returns:
      The output tensor of the last layer norm (or `images` if there are no
      layers).
    """
    channel_counts = self.tinyify(conv_size)
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        net = images
        for index, channels in enumerate(channel_counts):
            is_first = index == 0
            if not is_first:
                net = tf.nn.relu(net)
            net = common_layers.make_even_size(net)
            net = tfl.conv2d(
                net, channels, [3, 3],
                strides=(2, 2),
                activation=None,
                padding="SAME",
                name="conv%d" % index)
            net = tfcl.layer_norm(net)
    return net
def encode(self, inputs, target_space, hparams, features=None, losses=None):
    """Down-samples `inputs` with strided convolutions, then encodes them.

    `hparams.num_compress_steps` rounds of even-size padding, a 4x4 stride-2
    convolution and layer norm are applied before the result is handed to
    the parent class's `encode`. The `hparams` argument is replaced by
    `self.hparams`.

    Args:
      inputs: tensor to down-sample and encode.
      target_space: passed through to the parent `encode`.
      hparams: unused; `self.hparams` is used and forwarded instead.
      features: passed through to the parent `encode`.
      losses: passed through to the parent `encode`.

    Returns:
      `(encoder_output, encoder_decoder_attention_bias)` from the parent
      `encode`.
    """
    with tf.variable_scope("downstride"):
        hparams = self.hparams
        kernel_size = (4, 4)
        stride = (2, 2)
        downsampled = inputs
        # Down-convolutions.
        for layer_index in range(hparams.num_compress_steps):
            padded = common_layers.make_even_size(downsampled)
            convolved = tf.layers.conv2d(
                padded,
                hparams.hidden_size,
                kernel_size,
                strides=stride,
                padding="SAME",
                activation=common_layers.belu,
                name="conv_%d" % layer_index)
            downsampled = common_layers.layer_norm(convolved)

    result = super(TransformerSketch, self).encode(
        downsampled, target_space, hparams,
        features=features, losses=losses)
    encoder_output, encoder_decoder_attention_bias = result
    return encoder_output, encoder_decoder_attention_bias
def construct_model(self, images, actions, rewards, k=-1, num_masks=10,
                    cdna=True, dna=False, context_frames=2):
    """Build convolutional lstm video predictor using CDNA, or DNA.

    Args:
      images: list of tensors of ground truth image sequences
        there should be a 4D image ?xWxHxC for each timestep
      actions: list of action tensors
        each action should be in the shape ?x1xZ
      rewards: list of reward tensors
        each reward should be in the shape ?x1xZ
      k: constant used for scheduled sampling. -1 to feed in own prediction.
      num_masks: the number of different pixel motion predictions (and
        the number of masks for each of those predictions)
      cdna: True to use Convoluational Dynamic Neural Advection (CDNA)
      dna: True to use Dynamic Neural Advection (DNA)
      context_frames: number of ground truth frames to pass in before
        feeding in own predictions

    Returns:
      gen_images: predicted future image frames
      gen_rewards: predicted future rewards
      latent_mean: first output of the latent tower, or None when
        `hparams.stochastic_model` is off
      latent_std: second output of the latent tower (used below as
        `exp(latent_std / 2)` when sampling), or None when
        `hparams.stochastic_model` is off

    Raises:
      ValueError: if more than one network option specified or more than 1
        mask specified for DNA model.
    """
    # Each image is being used twice, in latent tower and main tower.
    # This is to make sure we are using the *same* image for both, ...
    # ... given how TF queues work.
    images = [tf.identity(image) for image in images]

    if cdna + dna != 1:
        raise ValueError("More than one, or no network option specified.")

    img_height, img_width, color_channels = self.hparams.problem.frame_shape
    batch_size = common_layers.shape_list(images[0])[0]

    # Predicted images and rewards.
    gen_rewards, gen_images = [], []

    if k == -1:
        feedself = True
    else:
        # Scheduled sampling:
        # Calculate number of ground-truth frames to pass in.
        iter_num = tf.train.get_global_step()
        # TODO(mbz): what should it be if it's undefined?
        if iter_num is None:
            iter_num = _LARGE_STEP_NUMBER
        num_ground_truth = tf.to_int32(
            tf.round(
                tf.to_float(batch_size) *
                (k / (k + tf.exp(tf.to_float(iter_num) / tf.to_float(k))))))
        feedself = False

    # LSTM state sizes and states.
    lstm_size = np.array([32, 32, 64, 64, 128, 64, 32], dtype=np.int32)
    lstm_state = [None] * 7

    # Latent tower.
    # Bind the outputs up front so the final `return` is well-defined for
    # deterministic (non-stochastic) models as well.
    latent_mean, latent_std, samples = None, None, None
    if self.hparams.stochastic_model:
        latent_tower_outputs = self.construct_latent_tower(images)
        latent_mean, latent_std, samples = latent_tower_outputs

    # Main tower
    layer_norm = tf.contrib.layers.layer_norm
    lstm_func = self.conv_lstm_2d

    steps = zip(images[:-1], actions[:-1], rewards[:-1])
    for timestep, (image, action, reward) in enumerate(steps):

        done_warm_start = len(gen_images) > context_frames - 1

        with tf.variable_scope("main", reuse=tf.AUTO_REUSE):
            if feedself and done_warm_start:
                # Feed in generated image.
                prev_image = gen_images[-1]
                prev_reward = gen_rewards[-1]
            elif done_warm_start:
                # Scheduled sampling
                prev_image = self.scheduled_sample(
                    image, gen_images[-1], batch_size, num_ground_truth)
                prev_reward = self.scheduled_sample(
                    reward, gen_rewards[-1], batch_size, num_ground_truth)
            else:
                # Always feed in ground_truth
                prev_image = image
                prev_reward = reward

            prev_image = common_layers.make_even_size(prev_image)
            enc0 = slim.layers.conv2d(
                prev_image,
                32, [5, 5],
                stride=2,
                scope="scale1_conv1",
                normalizer_fn=layer_norm,
                normalizer_params={"scope": "layer_norm1"})

            hidden1, lstm_state[0] = lstm_func(
                enc0, lstm_state[0], lstm_size[0], scope="state1")
            hidden1 = layer_norm(hidden1, scope="layer_norm2")
            hidden2, lstm_state[1] = lstm_func(
                hidden1, lstm_state[1], lstm_size[1], scope="state2")
            hidden2 = layer_norm(hidden2, scope="layer_norm3")
            hidden2 = common_layers.make_even_size(hidden2)
            enc1 = slim.layers.conv2d(
                hidden2, hidden2.get_shape()[3], [3, 3], stride=2,
                scope="conv2")

            hidden3, lstm_state[2] = lstm_func(
                enc1, lstm_state[2], lstm_size[2], scope="state3")
            hidden3 = layer_norm(hidden3, scope="layer_norm4")
            hidden4, lstm_state[3] = lstm_func(
                hidden3, lstm_state[3], lstm_size[3], scope="state4")
            hidden4 = layer_norm(hidden4, scope="layer_norm5")
            hidden4 = common_layers.make_even_size(hidden4)
            enc2 = slim.layers.conv2d(
                hidden4, hidden4.get_shape()[3], [3, 3], stride=2,
                scope="conv3")

            # Pass in reward and action.
            emb_action = self.encode_to_shape(action, enc2.get_shape())
            emb_reward = self.encode_to_shape(prev_reward, enc2.get_shape())
            enc2 = tf.concat(axis=3, values=[enc2, emb_action, emb_reward])

            # Setup latent
            if self.hparams.stochastic_model:
                latent = samples
                if self.hparams.multi_latent:
                    latent = samples[timestep]
                if self.hparams.mode == tf.estimator.ModeKeys.TRAIN:
                    # TODO(mbz): put 1st stage of training back in if
                    # necessary
                    latent = latent_mean + tf.exp(latent_std / 2.0) * latent
                with tf.control_dependencies([latent]):
                    enc2 = tf.concat([enc2, latent], 3)

            enc3 = slim.layers.conv2d(
                enc2, hidden4.get_shape()[3], [1, 1], stride=1,
                scope="conv4")

            hidden5, lstm_state[4] = lstm_func(
                enc3, lstm_state[4], lstm_size[4],
                scope="state5")  # last 8x8
            hidden5 = layer_norm(hidden5, scope="layer_norm6")
            enc4 = slim.layers.conv2d_transpose(
                hidden5, hidden5.get_shape()[3], 3, stride=2,
                scope="convt1")

            enc1_shape = common_layers.shape_list(enc1)
            # Cut to shape.
            enc4 = enc4[:, :enc1_shape[1], :enc1_shape[2], :]
            hidden6, lstm_state[5] = lstm_func(
                enc4, lstm_state[5], lstm_size[5], scope="state6")  # 16x16
            hidden6 = layer_norm(hidden6, scope="layer_norm7")
            # Skip connection.
            hidden6 = tf.concat(axis=3, values=[hidden6, enc1])  # both 16x16

            enc5 = slim.layers.conv2d_transpose(
                hidden6, hidden6.get_shape()[3], 3, stride=2,
                scope="convt2")
            enc0_shape = common_layers.shape_list(enc0)
            # Cut to shape.
            enc5 = enc5[:, :enc0_shape[1], :enc0_shape[2], :]
            hidden7, lstm_state[6] = lstm_func(
                enc5, lstm_state[6], lstm_size[6], scope="state7")  # 32x32
            hidden7 = layer_norm(hidden7, scope="layer_norm8")

            # Skip connection.
            hidden7 = tf.concat(axis=3, values=[hidden7, enc0])  # both 32x32

            enc6 = slim.layers.conv2d_transpose(
                hidden7,
                hidden7.get_shape()[3],
                3,
                stride=2,
                scope="convt3",
                activation_fn=None,
                normalizer_fn=layer_norm,
                normalizer_params={"scope": "layer_norm9"})

            if dna:
                # Using largest hidden state for predicting untied conv
                # kernels.
                enc7 = slim.layers.conv2d_transpose(
                    enc6,
                    self.hparams.dna_kernel_size**2,
                    1,
                    stride=1,
                    scope="convt4",
                    activation_fn=None)
            else:
                # Using largest hidden state for predicting a new image
                # layer.
                enc7 = slim.layers.conv2d_transpose(
                    enc6,
                    color_channels,
                    1,
                    stride=1,
                    scope="convt4",
                    activation_fn=None)
                # This allows the network to also generate one image from
                # scratch, which is useful when regions of the image become
                # unoccluded.
                transformed = [tf.nn.sigmoid(enc7)]

            if cdna:
                # cdna_input = tf.reshape(hidden5, [int(batch_size), -1])
                cdna_input = tf.contrib.layers.flatten(hidden5)
                transformed += self.cdna_transformation(
                    prev_image, cdna_input, num_masks, int(color_channels))
            elif dna:
                # Only one mask is supported (more should be unnecessary).
                if num_masks != 1:
                    raise ValueError(
                        "Only one mask is supported for DNA model.")
                transformed = [self.dna_transformation(prev_image, enc7)]

            masks = slim.layers.conv2d_transpose(
                enc6, num_masks + 1, 1, stride=1, scope="convt7",
                activation_fn=None)
            masks = tf.reshape(
                tf.nn.softmax(tf.reshape(masks, [-1, num_masks + 1])), [
                    batch_size,
                    int(img_height),
                    int(img_width), num_masks + 1
                ])
            mask_list = tf.split(
                axis=3, num_or_size_splits=num_masks + 1, value=masks)
            # The first mask weights the previous image; the remaining ones
            # weight the transformed candidates.
            output = mask_list[0] * prev_image
            for layer, mask in zip(transformed, mask_list[1:]):
                output += layer * mask
            gen_images.append(output)

            p_reward = self.reward_prediction(hidden5)
            p_reward = self.decode_to_shape(p_reward, reward.shape)

            gen_rewards.append(p_reward)

    return gen_images, gen_rewards, latent_mean, latent_std
def next_frame(self, frames, actions, rewards, target_frame,
               internal_states, video_extra):
    """Predicts next-frame logits (and optionally a reward) from `frames`.

    The frames are concatenated on the last axis, embedded, down-sampled by
    `hparams.num_compress_steps` strided convolutions, passed through
    `inject_latent` and `middle_network`, then up-sampled again with skip
    connections to the matching down-sampling inputs.

    Args:
      frames: list of frame tensors to condition on.
      actions: list of action tensors; only the last one is used, and only
        when `self.has_actions` is true.
      rewards: unused.
      target_frame: forwarded to `inject_latent`.
      internal_states: forwarded to (and replaced by the result of)
        `middle_network`.
      video_extra: unused.

    Returns:
      A `(logits, reward_pred, extra_loss, internal_states)` tuple;
      `reward_pred` is None when `self.has_rewards` is false.
    """
    del rewards, video_extra  # Unused.

    hparams = self.hparams
    num_steps = hparams.num_compress_steps
    channels = hparams.hidden_size
    kernel = (4, 4)

    # Embed the inputs.
    stacked = tf.concat(frames, axis=-1)
    stacked_shape = common_layers.shape_list(stacked)
    # A non-zero bias initializer guards the edge case of uniform inputs.
    net = tf.layers.dense(
        stacked, channels,
        name="inputs_embed",
        bias_initializer=tf.random_normal_initializer(stddev=0.01))
    net = common_attention.add_timing_signal_nd(net)

    # Down-stride, remembering the input of every step for the skips.
    skips = [net]
    for step in range(num_steps):
        with tf.variable_scope("downstride%d" % step):
            skips.append(net)
            net = tf.nn.dropout(net, 1.0 - self.hparams.dropout)
            net = common_layers.make_even_size(net)
            if step < hparams.filter_double_steps:
                channels *= 2
            net = common_attention.add_timing_signal_nd(net)
            net = tf.layers.conv2d(
                net, channels, kernel,
                activation=common_layers.belu,
                strides=(2, 2),
                padding="SAME")
            net = common_layers.layer_norm(net)

    # Add embedded action if present.
    if self.has_actions:
        action = actions[-1]
        net = common_video.inject_additional_input(
            net, action, "action_enc", hparams.action_injection)

    # Inject latent if present. Only for stochastic models.
    net, extra_loss = self.inject_latent(net, frames, target_frame)

    mid_summary = tf.reduce_mean(net, axis=[1, 2], keepdims=True)

    net, internal_states = self.middle_network(net, internal_states)

    # Up-convolve, consuming the skips from last to first.
    skips = skips[::-1]
    first_halving_step = num_steps - hparams.filter_double_steps
    for step in range(num_steps):
        with tf.variable_scope("upstride%d" % step):
            net = tf.nn.dropout(net, 1.0 - self.hparams.dropout)
            if self.has_actions:
                net = common_video.inject_additional_input(
                    net, action, "action_enc", hparams.action_injection)
            if step >= first_halving_step:
                channels //= 2
            net = tf.layers.conv2d_transpose(
                net, channels, kernel,
                activation=common_layers.belu,
                strides=(2, 2),
                padding="SAME")
            skip = skips[step]
            skip_shape = common_layers.shape_list(skip)
            # Crop to the skip's spatial size before adding it.
            net = net[:, :skip_shape[1], :skip_shape[2], :]
            net = common_layers.layer_norm(net + skip)
            net = common_attention.add_timing_signal_nd(net)

    # Cut down to original size.
    net = net[:, :stacked_shape[1], :stacked_shape[2], :]
    final_summary = tf.reduce_mean(net, axis=[1, 2], keepdims=True)

    num_logits = hparams.problem.num_channels
    if self.is_per_pixel_softmax:
        num_logits = num_logits * 256
    net = tf.layers.dense(net, num_logits, name="logits")

    # No reward prediction if not needed.
    if not self.has_rewards:
        return net, None, extra_loss, internal_states

    # Reward prediction based on middle and final logits.
    reward_features = tf.concat([mid_summary, final_summary], axis=-1)
    reward_pred = tf.layers.dense(reward_features, 128, name="reward_pred")
    reward_pred = tf.nn.relu(reward_pred)
    # Remove the two extra dims, one at a time.
    for _ in range(2):
        reward_pred = tf.squeeze(reward_pred, axis=1)
    return net, reward_pred, extra_loss, internal_states
def body_single(self, features):
    """Predicts target logits (and optionally a reward) from `features`.

    `features["inputs"]` is embedded, down-sampled by strided convolutions,
    passed through `inject_latent` and a stack of convolutions, then
    up-sampled with skip connections to the matching down-sampling inputs.

    Args:
      features: dict with "inputs"; "input_action" and "target_reward" are
        optional and enable action injection and reward prediction.

    Returns:
      The logits tensor alone when "target_reward" is not in `features`
      (the latent's extra loss is not returned in that case); otherwise a
      `({"targets": logits, "target_reward": reward_pred}, extra_loss)`
      pair.
    """
    hparams = self.hparams
    channels = hparams.hidden_size
    mid_kernel = (3, 3)
    stride_kernel = (4, 4)

    # Embed the inputs.
    input_dims = common_layers.shape_list(features["inputs"])
    # A non-zero bias initializer guards the edge case of uniform inputs.
    net = tf.layers.dense(
        features["inputs"], channels,
        name="inputs_embed",
        bias_initializer=tf.random_normal_initializer(stddev=0.01))
    net = common_attention.add_timing_signal_nd(net)

    # Down-stride, remembering the input of every step for the skips.
    skips = [net]
    for step in range(hparams.num_compress_steps):
        with tf.variable_scope("downstride%d" % step):
            skips.append(net)
            net = common_layers.make_even_size(net)
            if step < hparams.filter_double_steps:
                channels *= 2
            net = common_attention.add_timing_signal_nd(net)
            net = tf.layers.conv2d(
                net, channels, stride_kernel,
                activation=common_layers.belu,
                strides=(2, 2),
                padding="SAME")
            net = common_layers.layer_norm(net)

    # Add embedded action if present.
    if "input_action" in features:
        action = features["input_action"][:, -1, :]
        net = self.inject_additional_input(
            net, action, "action_enc", hparams.action_injection)

    net, extra_loss = self.inject_latent(net, features, channels)

    # Run a stack of convolutions; all but the first are residual.
    for depth in range(hparams.num_hidden_layers):
        with tf.variable_scope("layer%d" % depth):
            branch = tf.nn.dropout(net, 1.0 - hparams.dropout)
            branch = tf.layers.conv2d(
                branch, channels, mid_kernel,
                activation=common_layers.belu,
                strides=(1, 1),
                padding="SAME")
            if depth != 0:
                net = common_layers.layer_norm(net + branch)
            else:
                net = branch

    # Up-convolve, consuming the skips from last to first.
    skips = skips[::-1]
    for step in range(hparams.num_compress_steps):
        with tf.variable_scope("upstride%d" % step):
            if "input_action" in features:
                net = self.inject_additional_input(
                    net, action, "action_enc", hparams.action_injection)
            if (step >=
                    hparams.num_compress_steps - hparams.filter_double_steps):
                channels //= 2
            net = tf.layers.conv2d_transpose(
                net, channels, stride_kernel,
                activation=common_layers.belu,
                strides=(2, 2),
                padding="SAME")
            skip = skips[step]
            skip_dims = common_layers.shape_list(skip)
            # Crop to the skip's spatial size before adding it.
            net = net[:, :skip_dims[1], :skip_dims[2], :]
            net = common_layers.layer_norm(net + skip)
            net = common_attention.add_timing_signal_nd(net)

    # Cut down to original size.
    net = net[:, :input_dims[1], :input_dims[2], :]

    num_logits = hparams.problem.num_channels
    if self.is_per_pixel_softmax:
        num_logits = num_logits * 256
    net = tf.layers.dense(net, num_logits, name="logits")

    # Reward prediction if needed.
    if "target_reward" not in features:
        return net
    pooled = tf.reduce_mean(net, axis=[1, 2], keepdims=True)
    reward_pred = tf.expand_dims(pooled, axis=3)  # Add a fake channels dim.
    return {"targets": net, "target_reward": reward_pred}, extra_loss
def next_frame(self, frames, actions, rewards, target_frame,
               internal_states, video_extra):
    """Predicts next-frame logits (and optionally a reward) from `frames`.

    The frames (plus a slice of the first internal state when
    `hparams.concat_internal_states` is set and states are given) are
    concatenated on the last axis, embedded, down-sampled by strided
    convolutions, passed through `inject_latent` and `middle_network`, then
    up-sampled with skip connections to the matching down-sampling inputs.

    Args:
      frames: list of frame tensors to condition on.
      actions: list of action tensors; only the last one is used.
      rewards: unused.
      target_frame: forwarded to `inject_latent`.
      internal_states: recurrent state; may be None. Updated by
        `update_internal_states_early` (when concatenation is enabled) and
        by `middle_network`.
      video_extra: unused.

    Returns:
      A `(logits, reward_pred, extra_loss, internal_states)` tuple;
      `reward_pred` is None when `self.has_rewards` is false.
    """
    del rewards, video_extra  # Unused.

    hparams = self.hparams
    channels = hparams.hidden_size
    kernel = (4, 4)
    action = actions[-1]

    # Stack the inputs.
    to_stack = frames
    if internal_states is not None and hparams.concat_internal_states:
        # Use the first part of the first internal state if asked to
        # concatenate.
        batch = common_layers.shape_list(frames[0])[0]
        state_slice = internal_states[0][0][:batch, :, :, :]
        to_stack = frames + [state_slice]
    stacked = tf.concat(to_stack, axis=-1)
    stacked_shape = common_layers.shape_list(stacked)

    # Update internal states early if requested.
    if hparams.concat_internal_states:
        internal_states = self.update_internal_states_early(
            internal_states, frames)

    # A non-zero bias initializer guards the edge case of uniform inputs.
    net = tf.layers.dense(
        stacked, channels,
        name="inputs_embed",
        bias_initializer=tf.random_normal_initializer(stddev=0.01))
    net = common_attention.add_timing_signal_nd(net)

    # Down-stride, remembering the input of every step for the skips.
    skips = [net]
    for step in range(hparams.num_compress_steps):
        with tf.variable_scope("downstride%d" % step):
            skips.append(net)
            net = tf.nn.dropout(net, 1.0 - self.hparams.dropout)
            net = common_layers.make_even_size(net)
            if step < hparams.filter_double_steps:
                channels *= 2
            net = common_attention.add_timing_signal_nd(net)
            net = tf.layers.conv2d(
                net, channels, kernel,
                activation=common_layers.belu,
                strides=(2, 2),
                padding="SAME")
            net = common_layers.layer_norm(net)

    # Add embedded action if present.
    if self.has_actions:
        net = common_video.inject_additional_input(
            net, action, "action_enc", hparams.action_injection)

    # Inject latent if present. Only for stochastic models.
    net, extra_loss = self.inject_latent(net, frames, target_frame, action)

    mid_summary = tf.reduce_mean(net, axis=[1, 2], keepdims=True)

    net, internal_states = self.middle_network(net, internal_states)

    # Up-convolve, consuming the skips from last to first.
    skips = skips[::-1]
    for step in range(hparams.num_compress_steps):
        with tf.variable_scope("upstride%d" % step):
            net = tf.nn.dropout(net, 1.0 - self.hparams.dropout)
            if self.has_actions:
                net = common_video.inject_additional_input(
                    net, action, "action_enc", hparams.action_injection)
            if (step >=
                    hparams.num_compress_steps - hparams.filter_double_steps):
                channels //= 2
            net = tf.layers.conv2d_transpose(
                net, channels, kernel,
                activation=common_layers.belu,
                strides=(2, 2),
                padding="SAME")
            skip = skips[step]
            skip_shape = common_layers.shape_list(skip)
            # Crop to the skip's spatial size before adding it.
            net = net[:, :skip_shape[1], :skip_shape[2], :]
            net = common_layers.layer_norm(net + skip)
            net = common_attention.add_timing_signal_nd(net)

    # Cut down to original size.
    net = net[:, :stacked_shape[1], :stacked_shape[2], :]
    final_summary = tf.reduce_mean(net, axis=[1, 2], keepdims=True)

    num_logits = hparams.problem.num_channels
    if self.is_per_pixel_softmax:
        num_logits = num_logits * 256
    net = tf.layers.dense(net, num_logits, name="logits")

    # No reward prediction if not needed.
    if not self.has_rewards:
        return net, None, extra_loss, internal_states

    # Reward prediction based on middle and final logits.
    reward_features = tf.concat([mid_summary, final_summary], axis=-1)
    reward_pred = tf.layers.dense(reward_features, 128, name="reward_pred")
    reward_pred = tf.nn.relu(reward_pred)
    # Remove the two extra dims, one at a time.
    for _ in range(2):
        reward_pred = tf.squeeze(reward_pred, axis=1)
    return net, reward_pred, extra_loss, internal_states
def construct_model(self, images, actions, states, k=-1, use_state=False,
                    num_masks=10, cdna=True, dna=False, context_frames=2):
    """Build convolutional lstm video predictor using CDNA, or DNA.

    Args:
      images: tensor of ground truth image sequences
      actions: tensor of action sequences
      states: tensor of ground truth state sequences
      k: constant used for scheduled sampling. -1 to feed in own prediction.
      use_state: True to include state and action in prediction
      num_masks: the number of different pixel motion predictions (and
        the number of masks for each of those predictions)
      cdna: True to use Convolutional Dynamic Neural Advection (CDNA)
      dna: True to use Dynamic Neural Advection (DNA)
      context_frames: number of ground truth frames to pass in before
        feeding in own predictions

    Returns:
      gen_images: predicted future image frames
      gen_states: predicted future states
      latent_mean: latent mean from the latent tower, or None when
        `hparams.stochastic_model` is off
      latent_std: latent std output of the latent tower, or None when
        `hparams.stochastic_model` is off

    Raises:
      ValueError: if more than one network option specified or more than 1
        mask specified for DNA model.
    """
    # Each image is being used twice, in latent tower and main tower.
    # This is to make sure we are using the *same* image for both, ...
    # ... given how TF queues work.
    images = [tf.identity(image) for image in images]

    if cdna + dna != 1:
        raise ValueError("More than one, or no network option specified.")

    img_height, img_width, color_channels = self.hparams.problem.frame_shape
    batch_size = common_layers.shape_list(images[0])[0]
    lstm_func = self.basic_conv_lstm_cell

    # Generated robot states and images.
    gen_states, gen_images = [], []
    current_state = states[0]

    if k == -1:
        feedself = True
    else:
        # Scheduled sampling:
        # Calculate number of ground-truth frames to pass in.
        iter_num = tf.train.get_or_create_global_step()
        num_ground_truth = tf.to_int32(
            tf.round(
                tf.to_float(batch_size) *
                (k / (k + tf.exp(tf.to_float(iter_num) / tf.to_float(k))))))
        feedself = False

    # LSTM state sizes and states.
    lstm_size = np.int32(np.array([32, 32, 64, 64, 128, 64, 32]))
    lstm_state1, lstm_state2, lstm_state3, lstm_state4 = None, None, None, None
    lstm_state5, lstm_state6, lstm_state7 = None, None, None

    # Latent tower. The outputs default to None so that the final `return`
    # is well defined for deterministic models as well; previously these
    # names were only bound inside the `if`, which made the return statement
    # raise UnboundLocalError whenever `stochastic_model` was off.
    latent_mean, latent_std, samples = None, None, None
    if self.hparams.stochastic_model:
        latent_tower_outputs = self.construct_latent_tower(images)
        latent_mean, latent_std, samples = latent_tower_outputs

    # Main tower
    timestep = 0
    layer_norm = tf.contrib.layers.layer_norm

    for image, action in zip(images[:-1], actions[:-1]):
        # Reuse variables after the first timestep.
        reuse = bool(gen_images)

        done_warm_start = len(gen_images) > context_frames - 1
        with slim.arg_scope(
            [
                lstm_func, slim.layers.conv2d, slim.layers.fully_connected,
                layer_norm, slim.layers.conv2d_transpose
            ],
            reuse=reuse):

            if feedself and done_warm_start:
                # Feed in generated image.
                prev_image = gen_images[-1]
            elif done_warm_start:
                # Scheduled sampling
                # NOTE(review): this passes hparams.batch_size while the mask
                # reshape below uses the batch size read from the input
                # tensor; confirm the two always agree.
                prev_image = self.scheduled_sample(
                    image, gen_images[-1], self.hparams.batch_size,
                    num_ground_truth)
            else:
                # Always feed in ground_truth
                prev_image = image

            # Predicted state is always fed back in
            state_action = tf.concat(axis=1, values=[action, current_state])

            prev_image = common_layers.make_even_size(prev_image)
            enc0 = slim.layers.conv2d(
                prev_image,
                32, [5, 5],
                stride=2,
                scope="scale1_conv1",
                normalizer_fn=layer_norm,
                normalizer_params={"scope": "layer_norm1"})

            hidden1, lstm_state1 = lstm_func(
                enc0, lstm_state1, lstm_size[0], scope="state1")
            hidden1 = layer_norm(hidden1, scope="layer_norm2")
            hidden2, lstm_state2 = lstm_func(
                hidden1, lstm_state2, lstm_size[1], scope="state2")
            hidden2 = layer_norm(hidden2, scope="layer_norm3")
            hidden2 = common_layers.make_even_size(hidden2)
            enc1 = slim.layers.conv2d(
                hidden2, hidden2.get_shape()[3], [3, 3], stride=2,
                scope="conv2")

            hidden3, lstm_state3 = lstm_func(
                enc1, lstm_state3, lstm_size[2], scope="state3")
            hidden3 = layer_norm(hidden3, scope="layer_norm4")
            hidden4, lstm_state4 = lstm_func(
                hidden3, lstm_state4, lstm_size[3], scope="state4")
            hidden4 = layer_norm(hidden4, scope="layer_norm5")
            hidden4 = common_layers.make_even_size(hidden4)
            enc2 = slim.layers.conv2d(
                hidden4, hidden4.get_shape()[3], [3, 3], stride=2,
                scope="conv3")

            # Pass in state and action.
            smear = tf.reshape(
                state_action,
                [-1, 1, 1, int(common_layers.shape_list(state_action)[1])])
            enc2_shape = common_layers.shape_list(enc2)
            smear = tf.tile(
                smear, [1, enc2_shape[1], enc2_shape[2], 1])
            if use_state:
                enc2 = tf.concat(axis=3, values=[enc2, smear])

            # Setup latent
            if self.hparams.stochastic_model:
                latent = samples
                if self.hparams.multi_latent:
                    latent = samples[timestep]
                if self.hparams.mode == tf.estimator.ModeKeys.TRAIN:
                    # TODO(mbz): put 1st stage of training back in if necessary
                    latent = latent_mean + tf.exp(latent_std / 2.0) * latent
                with tf.control_dependencies([latent]):
                    enc2 = tf.concat([enc2, latent], 3)

            enc3 = slim.layers.conv2d(
                enc2, hidden4.get_shape()[3], [1, 1], stride=1, scope="conv4")

            hidden5, lstm_state5 = lstm_func(
                enc3, lstm_state5, lstm_size[4], scope="state5")  # last 8x8
            hidden5 = layer_norm(hidden5, scope="layer_norm6")
            enc4 = slim.layers.conv2d_transpose(
                hidden5, hidden5.get_shape()[3], 3, stride=2, scope="convt1")

            enc1_shape = common_layers.shape_list(enc1)
            enc4 = enc4[:, :enc1_shape[1], :enc1_shape[2], :]  # Cut to shape.
            hidden6, lstm_state6 = lstm_func(
                enc4, lstm_state6, lstm_size[5], scope="state6")  # 16x16
            hidden6 = layer_norm(hidden6, scope="layer_norm7")
            # Skip connection.
            hidden6 = tf.concat(axis=3, values=[hidden6, enc1])  # both 16x16

            enc5 = slim.layers.conv2d_transpose(
                hidden6, hidden6.get_shape()[3], 3, stride=2, scope="convt2")
            enc0_shape = common_layers.shape_list(enc0)
            enc5 = enc5[:, :enc0_shape[1], :enc0_shape[2], :]  # Cut to shape.
            hidden7, lstm_state7 = lstm_func(
                enc5, lstm_state7, lstm_size[6], scope="state7")  # 32x32
            hidden7 = layer_norm(hidden7, scope="layer_norm8")

            # Skip connection.
            hidden7 = tf.concat(axis=3, values=[hidden7, enc0])  # both 32x32

            enc6 = slim.layers.conv2d_transpose(
                hidden7,
                hidden7.get_shape()[3],
                3,
                stride=2,
                scope="convt3",
                activation_fn=None,
                normalizer_fn=layer_norm,
                normalizer_params={"scope": "layer_norm9"})

            if dna:
                # Using largest hidden state for predicting untied conv
                # kernels.
                enc7 = slim.layers.conv2d_transpose(
                    enc6,
                    self.hparams.dna_kernel_size**2,
                    1,
                    stride=1,
                    scope="convt4",
                    activation_fn=None)
            else:
                # Using largest hidden state for predicting a new image layer.
                enc7 = slim.layers.conv2d_transpose(
                    enc6,
                    color_channels,
                    1,
                    stride=1,
                    scope="convt4",
                    activation_fn=None)
                # This allows the network to also generate one image from
                # scratch, which is useful when regions of the image become
                # unoccluded.
                transformed = [tf.nn.sigmoid(enc7)]

            if cdna:
                # cdna_input = tf.reshape(hidden5, [int(batch_size), -1])
                cdna_input = tf.contrib.layers.flatten(hidden5)
                transformed += self.cdna_transformation(
                    prev_image, cdna_input, num_masks, int(color_channels))
            elif dna:
                # Only one mask is supported (more should be unnecessary).
                if num_masks != 1:
                    raise ValueError(
                        "Only one mask is supported for DNA model.")
                transformed = [self.dna_transformation(prev_image, enc7)]

            masks = slim.layers.conv2d_transpose(
                enc6, num_masks + 1, 1,
                stride=1, scope="convt7", activation_fn=None)
            # Softmax over the mask channel, then restore the image layout.
            masks = tf.reshape(
                tf.nn.softmax(tf.reshape(masks, [-1, num_masks + 1])),
                [batch_size,
                 int(img_height),
                 int(img_width), num_masks + 1])
            mask_list = tf.split(
                axis=3, num_or_size_splits=num_masks + 1, value=masks)
            # The first mask weights the previous image; the rest weight the
            # transformed images.
            output = mask_list[0] * prev_image
            for layer, mask in zip(transformed, mask_list[1:]):
                output += layer * mask
            gen_images.append(output)

            current_state = slim.layers.fully_connected(
                state_action,
                int(current_state.get_shape()[1]),
                scope="state_pred",
                activation_fn=None)
        gen_states.append(current_state)
        timestep += 1

    return gen_images, gen_states, latent_mean, latent_std
def next_frame(self, frames, actions, rewards, target_frame,
               internal_states, video_extra):
    """Predict the next frame, plus optional policy/value and reward heads.

    Frames are standardized, stacked on the channel axis, embedded, compressed
    with stride-2 convolutions, passed through `self.middle_network` and
    expanded again with transposed convolutions and skip connections. When
    `hparams.do_autoregressive_rnn` is set, the output logits are additionally
    produced / refined by an LSTM that predicts the sub-pixel bits.

    Args:
      frames: list of input frame tensors.
      actions: sequence of actions; only the last one is used.
      rewards: unused.
      target_frame: the frame to predict (used for the latent and, in
        training, as the autoregressive RNN target).
      internal_states: recurrent state forwarded to `self.middle_network`, or
        None.
      video_extra: unused.

    Returns:
      A tuple `(x, reward_pred, policy_pred, value_pred, extra_loss,
      internal_states)`. `reward_pred` is None when `self.has_rewards` is
      false; `policy_pred` and `value_pred` are None when `self.has_actions`
      is false.
    """
    del rewards, video_extra

    hparams = self.hparams
    filters = hparams.hidden_size
    kernel2 = (4, 4)
    action = actions[-1]
    activation_fn = common_layers.belu
    if self.hparams.activation_fn == "relu":
        activation_fn = tf.nn.relu

    # Normalize frames.
    frames = [common_layers.standardize_images(f) for f in frames]

    # Stack the inputs.
    if internal_states is not None and hparams.concat_internal_states:
        # Use the first part of the first internal state if asked to
        # concatenate.
        batch_size = common_layers.shape_list(frames[0])[0]
        internal_state = internal_states[0][0][:batch_size, :, :, :]
        stacked_frames = tf.concat(frames + [internal_state], axis=-1)
    else:
        stacked_frames = tf.concat(frames, axis=-1)
    inputs_shape = common_layers.shape_list(stacked_frames)

    # Update internal states early if requested.
    if hparams.concat_internal_states:
        internal_states = self.update_internal_states_early(
            internal_states, frames)

    # Using non-zero bias initializer below for edge cases of uniform inputs.
    x = tf.layers.dense(
        stacked_frames, filters, name="inputs_embed",
        bias_initializer=tf.random_normal_initializer(stddev=0.01))
    x = common_attention.add_timing_signal_nd(x)

    # Down-stride.
    layer_inputs = [x]
    for i in range(hparams.num_compress_steps):
        with tf.variable_scope("downstride%d" % i):
            layer_inputs.append(x)
            x = tf.nn.dropout(x, 1.0 - self.hparams.dropout)
            x = common_layers.make_even_size(x)
            if i < hparams.filter_double_steps:
                filters *= 2
            x = common_attention.add_timing_signal_nd(x)
            x = tf.layers.conv2d(x, filters, kernel2,
                                 activation=activation_fn,
                                 strides=(2, 2), padding="SAME")
            x = common_layers.layer_norm(x)

    # Policy and value heads on the flattened bottleneck features.
    if self.has_actions:
        with tf.variable_scope("policy"):
            x_flat = tf.layers.flatten(x)
            policy_pred = tf.layers.dense(
                x_flat, self.hparams.problem.num_actions)
            value_pred = tf.layers.dense(x_flat, 1)
            value_pred = tf.squeeze(value_pred, axis=-1)
    else:
        policy_pred, value_pred = None, None

    # Add embedded action if present.
    if self.has_actions:
        x = common_video.inject_additional_input(
            x, action, "action_enc", hparams.action_injection)

    # Inject latent if present. Only for stochastic models.
    norm_target_frame = common_layers.standardize_images(target_frame)
    x, extra_loss = self.inject_latent(x, frames, norm_target_frame, action)

    x_mid = tf.reduce_mean(x, axis=[1, 2], keepdims=True)
    x, internal_states = self.middle_network(x, internal_states)

    # Up-convolve.
    layer_inputs = list(reversed(layer_inputs))
    for i in range(hparams.num_compress_steps):
        with tf.variable_scope("upstride%d" % i):
            x = tf.nn.dropout(x, 1.0 - self.hparams.dropout)
            if self.has_actions:
                x = common_video.inject_additional_input(
                    x, action, "action_enc", hparams.action_injection)
            if i >= hparams.num_compress_steps - hparams.filter_double_steps:
                filters //= 2
            x = tf.layers.conv2d_transpose(
                x, filters, kernel2, activation=activation_fn,
                strides=(2, 2), padding="SAME")
            y = layer_inputs[i]
            shape = common_layers.shape_list(y)
            # Crop to the skip connection's spatial size before adding.
            x = x[:, :shape[1], :shape[2], :]
            x = common_layers.layer_norm(x + y)
            x = common_attention.add_timing_signal_nd(x)

    # Cut down to original size.
    x = x[:, :inputs_shape[1], :inputs_shape[2], :]
    x_fin = tf.reduce_mean(x, axis=[1, 2], keepdims=True)
    if hparams.do_autoregressive_rnn:
        # If enabled, we predict the target frame autoregressively using rnns.
        # To this end, the current prediction is flattened into one long
        # sequence of sub-pixels, and so is the target frame. Each sub-pixel
        # (RGB value, from 0 to 255) is predicted with an RNN. To avoid doing
        # as many steps as width * height * channels, we only use a number of
        # pixels back, as many as hparams.autoregressive_rnn_lookback.
        with tf.variable_scope("autoregressive_rnn"):
            batch_size = common_layers.shape_list(frames[0])[0]
            # Height, width, channels and lookback are the constants we need.
            h, w = inputs_shape[1], inputs_shape[2]  # 105, 80 on Atari games
            c = hparams.problem.num_channels
            lookback = hparams.autoregressive_rnn_lookback
            # The frame is cut into chunks of `lookback` pixels, so lookback
            # has to divide the pixel count (the old message said the
            # reverse).
            assert (h * w) % lookback == 0, (
                "Lookback must divide the number of pixels.")
            m = (h * w) // lookback  # Batch size multiplier for the RNN.
            # These are logits that will be used as inputs to the RNN.
            rnn_inputs = tf.layers.dense(x, c * 64, name="rnn_inputs")
            # They are of shape [batch_size, h, w, c, 64], reshaping now.
            rnn_inputs = tf.reshape(
                rnn_inputs, [batch_size * m, lookback * c, 64])
            # Same for the target frame.
            rnn_target = tf.reshape(
                target_frame, [batch_size * m, lookback * c])
            # Construct rnn starting state: flatten rnn_inputs, apply a relu
            # layer.
            rnn_start_state = tf.nn.relu(
                tf.layers.dense(tf.nn.relu(tf.layers.flatten(rnn_inputs)),
                                256, name="rnn_start_state"))
            # Our RNN function API is on bits, each subpixel has 8 bits.
            total_num_bits = lookback * c * 8
            # We need to provide RNN targets as bits (due to the API).
            rnn_target_bits = discretization.int_to_bit(rnn_target, 8)
            rnn_target_bits = tf.reshape(
                rnn_target_bits, [batch_size * m, total_num_bits])
            if self.is_training:
                # Run the RNN in training mode, add its loss to the losses.
                rnn_predict, rnn_loss = discretization.predict_bits_with_lstm(
                    rnn_start_state, 128, total_num_bits,
                    target_bits=rnn_target_bits, extra_inputs=rnn_inputs)
                extra_loss += rnn_loss
                # We still use non-RNN predictions too in order to guide the
                # network.
                x = tf.layers.dense(x, c * 256, name="logits")
                x = tf.reshape(x, [batch_size, h, w, c, 256])
                rnn_predict = tf.reshape(
                    rnn_predict, [batch_size, h, w, c, 256])
                # Mix non-RNN and RNN predictions so that after warmup the
                # RNN is 90%.
                x = tf.reshape(
                    tf.nn.log_softmax(x), [batch_size, h, w, c * 256])
                rnn_predict = tf.nn.log_softmax(rnn_predict)
                rnn_predict = tf.reshape(
                    rnn_predict, [batch_size, h, w, c * 256])
                alpha = 0.9 * common_layers.inverse_lin_decay(
                    hparams.autoregressive_rnn_warmup_steps)
                x = alpha * rnn_predict + (1.0 - alpha) * x
            else:
                # In prediction mode, run the RNN without any targets.
                bits, _ = discretization.predict_bits_with_lstm(
                    rnn_start_state, 128, total_num_bits,
                    extra_inputs=rnn_inputs,
                    temperature=0.0
                )  # No sampling from this RNN, just greedy.
                # The output is in bits, get back the predicted pixels.
                bits = tf.reshape(bits, [batch_size * m, lookback * c, 8])
                ints = discretization.bit_to_int(tf.maximum(bits, 0), 8)
                ints = tf.reshape(ints, [batch_size, h, w, c])
                x = tf.reshape(
                    tf.one_hot(ints, 256), [batch_size, h, w, c * 256])
    elif self.is_per_pixel_softmax:
        x = tf.layers.dense(x, hparams.problem.num_channels * 256,
                            name="logits")
    else:
        x = tf.layers.dense(x, hparams.problem.num_channels, name="logits")

    reward_pred = None
    if self.has_rewards:
        # Reward prediction based on middle and final logits.
        reward_pred = tf.concat([x_mid, x_fin], axis=-1)
        reward_pred = tf.nn.relu(
            tf.layers.dense(reward_pred, 128, name="reward_pred"))
        reward_pred = tf.squeeze(reward_pred, axis=1)  # Remove extra dims
        reward_pred = tf.squeeze(reward_pred, axis=1)  # Remove extra dims

    return x, reward_pred, policy_pred, value_pred, extra_loss, internal_states
def inject_latent(self, layer, features, filters):
    """Inject a discrete (+1/-1 bit) latent into `layer`.

    When predicting, the bits are either drawn uniformly at random
    (`hparams.full_latent_tower`) or sampled byte by byte from an LSTM that is
    conditioned on `layer`. Otherwise the bits are computed from the target
    frame and the inputs, and (when not using the full latent tower) the LSTM
    is trained to predict them.

    Args:
      layer: tensor the latent is added to; its last dimension fixes the size
        of the projected latent.
      features: dict that must contain "cur_target_frame" and "inputs"
        (only read when not predicting).
      filters: ignored; it is deleted immediately and replaced by
        `hparams.hidden_size`.

    Returns:
      A tuple `(layer_with_latent, pred_loss)`. `pred_loss` is 0.0 when
      predicting or when `hparams.full_latent_tower` is set; otherwise it is
      the cross-entropy of the LSTM's byte predictions.
    """
    del filters
    hparams = self.hparams
    final_filters = common_layers.shape_list(layer)[-1]
    filters = hparams.hidden_size
    kernel = (4, 4)
    layer_shape = common_layers.shape_list(layer)
    batch_size = layer_shape[0]
    state_size = hparams.latent_predictor_state_size
    # These three objects are created once and shared by the predicting and
    # the training branches below.
    lstm_cell = tf.contrib.rnn.LSTMCell(state_size)
    discrete_predict = tf.layers.Dense(256, name="discrete_predict")
    discrete_embed = tf.layers.Dense(state_size, name="discrete_embed")

    def add_d(layer, d):
        # Project the latent `d` to `final_filters` channels and merge it into
        # `layer`: plain addition, or sigmoid gating followed by a second
        # additive projection when hparams.complex_addn is set.
        z_mul = tf.layers.dense(d, final_filters, name="unbottleneck_mul")
        if not hparams.complex_addn:
            return layer + z_mul
        layer *= tf.nn.sigmoid(z_mul)
        z_add = tf.layers.dense(d, final_filters, name="unbottleneck_add")
        layer += z_add
        return layer

    if self.is_predicting:
        if hparams.full_latent_tower:
            # One independent uniform draw per spatial position and bit.
            rand = tf.random_uniform(
                layer_shape[:-1] + [hparams.bottleneck_bits])
        else:
            # Sample the latent one byte (8 bits) at a time from the LSTM,
            # whose initial input and state are dense projections of the
            # flattened layer.
            layer_pred = tf.reshape(
                layer, [batch_size, prod(layer_shape[1:])])
            prediction = tf.layers.dense(layer_pred, state_size, name="istate")
            c_state = tf.layers.dense(layer_pred, state_size, name="cstate")
            m_state = tf.layers.dense(layer_pred, state_size, name="mstate")
            state = (c_state, m_state)
            outputs = []
            for i in range(hparams.bottleneck_bits // 8):
                output, state = lstm_cell(prediction, state)
                discrete_logits = discrete_predict(output)
                discrete_samples = common_layers.sample_with_temperature(
                    discrete_logits, hparams.latent_predictor_temperature)
                outputs.append(tf.expand_dims(discrete_samples, axis=1))
                # The sampled byte, embedded, is the next LSTM input.
                prediction = discrete_embed(
                    tf.one_hot(discrete_samples, 256))
            outputs = tf.concat(outputs, axis=1)
            outputs = discretization.int_to_bit(outputs, 8)
            rand = tf.reshape(
                outputs, [batch_size, 1, 1, hparams.bottleneck_bits])
        # Threshold at 0.5 and map {False, True} to {-1.0, +1.0}.
        d = 2.0 * tf.to_float(tf.less(0.5, rand)) - 1.0
        return add_d(layer, d), 0.0

    # Embed.
    frames = tf.concat([features["cur_target_frame"], features["inputs"]],
                       axis=-1)
    x = tf.layers.dense(
        frames, filters, name="latent_embed",
        bias_initializer=tf.random_normal_initializer(stddev=0.01))
    x = common_attention.add_timing_signal_nd(x)

    if hparams.full_latent_tower:
        # Stride-2 convolutional tower over the embedded frames.
        for i in range(hparams.num_compress_steps):
            with tf.variable_scope("latent_downstride%d" % i):
                x = common_layers.make_even_size(x)
                if i < hparams.filter_double_steps:
                    filters *= 2
                x = common_attention.add_timing_signal_nd(x)
                x = tf.layers.conv2d(x, filters, kernel,
                                     activation=common_layers.belu,
                                     strides=(2, 2), padding="SAME")
                x = common_layers.layer_norm(x)
    else:
        x = common_layers.double_discriminator(x)
        # Insert two singleton axes after the batch axis.
        x = tf.expand_dims(tf.expand_dims(x, axis=1), axis=1)
    x = tf.layers.dense(x, hparams.bottleneck_bits, name="bottleneck")
    x0 = tf.tanh(x)
    # Straight-through binarization: the forward value is +1/-1, the gradient
    # flows through x0.
    d = x0 + tf.stop_gradient(2.0 * tf.to_float(tf.less(0.0, x0)) - 1.0 - x0)
    pred_loss = 0.0
    if not hparams.full_latent_tower:
        # Train the LSTM to predict the latent bytes from `layer`, feeding it
        # the embedded true bytes as inputs.
        d_pred = tf.reshape(tf.maximum(tf.stop_gradient(d), 0),
                            [batch_size, hparams.bottleneck_bits // 8, 8])
        d_int = discretization.bit_to_int(d_pred, 8)
        tf.summary.histogram("d_int", tf.reshape(d_int, [-1]))
        d_hot = tf.one_hot(d_int, 256, axis=-1)
        d_pred = discrete_embed(d_hot)
        layer_pred = tf.reshape(layer, [batch_size, prod(layer_shape[1:])])
        prediction0 = tf.layers.dense(layer_pred, state_size, name="istate")
        c_state = tf.layers.dense(layer_pred, state_size, name="cstate")
        m_state = tf.layers.dense(layer_pred, state_size, name="mstate")
        # Step i consumes pred[:, i]: the initial projection first, then the
        # embedded bytes; the last embedded byte is never fed.
        pred = tf.concat([tf.expand_dims(prediction0, axis=1), d_pred], axis=1)
        state = (c_state, m_state)
        outputs = []
        for i in range(hparams.bottleneck_bits // 8):
            output, state = lstm_cell(pred[:, i, :], state)
            outputs.append(tf.expand_dims(output, axis=1))
        outputs = tf.concat(outputs, axis=1)
        d_int_pred = discrete_predict(outputs)
        pred_loss = tf.losses.sparse_softmax_cross_entropy(
            logits=d_int_pred, labels=d_int)
        pred_loss = tf.reduce_mean(pred_loss)
    if hparams.mode == tf.estimator.ModeKeys.TRAIN:
        # Training-time noise: Gaussian noise before the tanh, then a sign
        # flip wherever a uniform draw is not above hparams.bottleneck_noise.
        x += tf.truncated_normal(
            common_layers.shape_list(x), mean=0.0, stddev=0.2)
        x = tf.tanh(x)
        noise = tf.random_uniform(common_layers.shape_list(x))
        noise = 2.0 * tf.to_float(tf.less(hparams.bottleneck_noise, noise)) - 1.0
        x *= noise
        d = x + tf.stop_gradient(2.0 * tf.to_float(tf.less(0.0, x)) - 1.0 - x)
        # Per batch element, use the binarized d with probability p and the
        # continuous x otherwise.
        # NOTE(review): p presumably ramps up over discrete_warmup_steps;
        # confirm against common_layers.inverse_lin_decay.
        p = common_layers.inverse_lin_decay(hparams.discrete_warmup_steps)
        d = tf.where(tf.less(tf.random_uniform([batch_size]), p), d, x)
    return add_d(layer, d), pred_loss
def bottom_part_tower(self, input_image, input_reward, action, latent,
                      lstm_state, lstm_size, conv_size, concat_latent=False):
    """Encoder half shared by the predictive towers.

    The main prediction tower and the reward prediction tower currently use
    the same architecture; whether they also share weights is controlled by
    the TF variable scope the caller builds this in.

    Args:
      input_image: the current image.
      input_reward: the current reward, or None to skip reward injection.
      action: the action taken by the agent, or None to skip action injection.
      latent: the latent vector, or None.
      lstm_state: the current internal states of the conv lstms; entries 0-4
        are overwritten in place with the new states.
      lstm_size: the size of lstms.
      conv_size: the size of convolutions.
      concat_latent: whether or not to concatenate the latent at every step.

    Returns:
      - the output of the partial network.
      - intermediate outputs (enc0, enc1) for skip connections.
    """
    conv_lstm = common_video.conv_lstm_2d

    def add_latent(tensor):
        # Delegates to common_video.tile_and_concat with this call's latent
        # settings.
        return common_video.tile_and_concat(
            tensor, latent, concat_latent=concat_latent)

    # First stride-2 convolution.
    input_image = common_layers.make_even_size(input_image)
    image_with_latent = add_latent(input_image)
    enc0 = tfl.conv2d(
        image_with_latent,
        conv_size[0],
        [5, 5],
        strides=(2, 2),
        activation=tf.nn.relu,
        padding="SAME",
        name="scale1_conv1")
    enc0 = tfcl.layer_norm(enc0, scope="layer_norm1")

    # LSTM 1 and 2.
    hidden1, lstm_state[0] = conv_lstm(
        enc0, lstm_state[0], lstm_size[0], name="state1")
    hidden1 = add_latent(hidden1)
    hidden1 = tfcl.layer_norm(hidden1, scope="layer_norm2")

    hidden2, lstm_state[1] = conv_lstm(
        hidden1, lstm_state[1], lstm_size[1], name="state2")
    hidden2 = tfcl.layer_norm(hidden2, scope="layer_norm3")
    hidden2 = common_layers.make_even_size(hidden2)

    # Second stride-2 convolution.
    enc1 = tfl.conv2d(
        hidden2,
        hidden2.get_shape()[3],
        [3, 3],
        strides=(2, 2),
        padding="SAME",
        activation=tf.nn.relu,
        name="conv2")
    enc1 = add_latent(enc1)

    # LSTM 3 and 4.
    hidden3, lstm_state[2] = conv_lstm(
        enc1, lstm_state[2], lstm_size[2], name="state3")
    hidden3 = add_latent(hidden3)
    hidden3 = tfcl.layer_norm(hidden3, scope="layer_norm4")

    hidden4, lstm_state[3] = conv_lstm(
        hidden3, lstm_state[3], lstm_size[3], name="state4")
    hidden4 = add_latent(hidden4)
    hidden4 = tfcl.layer_norm(hidden4, scope="layer_norm5")
    hidden4 = common_layers.make_even_size(hidden4)

    # Third stride-2 convolution.
    enc2 = tfl.conv2d(
        hidden4,
        hidden4.get_shape()[3],
        [3, 3],
        strides=(2, 2),
        padding="SAME",
        activation=tf.nn.relu,
        name="conv3")

    # Optional conditioning on action, reward and (non-tiled) latent.
    if action is not None:
        enc2 = self.inject_additional_input(
            enc2, action, "action_enc", self.hparams.concatenate_actions)
    if input_reward is not None:
        enc2 = self.inject_additional_input(enc2, input_reward, "reward_enc")
    if latent is not None and not concat_latent:
        with tf.control_dependencies([latent]):
            enc2 = tf.concat([enc2, latent], axis=3)

    enc3 = tfl.conv2d(
        enc2,
        hidden4.get_shape()[3],
        [1, 1],
        strides=(1, 1),
        padding="SAME",
        activation=tf.nn.relu,
        name="conv4")

    # LSTM 5.
    hidden5, lstm_state[4] = conv_lstm(
        enc3, lstm_state[4], lstm_size[4], name="state5")  # last 8x8
    hidden5 = tfcl.layer_norm(hidden5, scope="layer_norm6")
    hidden5 = add_latent(hidden5)

    skips = (enc0, enc1)
    return hidden5, skips
def construct_predictive_tower(
        self, input_image, input_reward, action, lstm_state, latent):
    """Build one step of the main tower: next image and reward prediction.

    Args:
      input_image: the current image.
      input_reward: the current reward; also fixes the shape of the predicted
        reward.
      action: the action taken by the agent.
      lstm_state: list of conv-lstm states; entries 0-6 are overwritten in
        place with the new states.
      latent: latent tensor concatenated to the bottleneck, or None.

    Returns:
      A tuple `(output, p_reward, lstm_state)`.

    Raises:
      ValueError: if the DNA model is selected with `num_masks != 1`.
    """
    norm = tf.contrib.layers.layer_norm
    conv_lstm = self.conv_lstm_2d
    batch_size = common_layers.shape_list(input_image)[0]

    # Number of different pixel motion predictions, which is also the number
    # of masks for those predictions.
    num_masks = self.hparams.num_masks

    lstm_size = self.tinyify([32, 32, 64, 64, 128, 64, 32])
    conv_size = self.tinyify([32])

    img_height, img_width, color_channels = self.hparams.problem.frame_shape

    with tf.variable_scope("main", reuse=tf.AUTO_REUSE):
        # ---- Encoder ----
        input_image = common_layers.make_even_size(input_image)
        enc0 = slim.layers.conv2d(
            input_image,
            conv_size[0],
            [5, 5],
            stride=2,
            scope="scale1_conv1",
            normalizer_fn=norm,
            normalizer_params={"scope": "layer_norm1"})

        hidden1, lstm_state[0] = conv_lstm(
            enc0, lstm_state[0], lstm_size[0], scope="state1")
        hidden1 = norm(hidden1, scope="layer_norm2")
        hidden2, lstm_state[1] = conv_lstm(
            hidden1, lstm_state[1], lstm_size[1], scope="state2")
        hidden2 = norm(hidden2, scope="layer_norm3")
        hidden2 = common_layers.make_even_size(hidden2)
        enc1 = slim.layers.conv2d(
            hidden2, hidden2.get_shape()[3], [3, 3], stride=2, scope="conv2")

        hidden3, lstm_state[2] = conv_lstm(
            enc1, lstm_state[2], lstm_size[2], scope="state3")
        hidden3 = norm(hidden3, scope="layer_norm4")
        hidden4, lstm_state[3] = conv_lstm(
            hidden3, lstm_state[3], lstm_size[3], scope="state4")
        hidden4 = norm(hidden4, scope="layer_norm5")
        hidden4 = common_layers.make_even_size(hidden4)
        enc2 = slim.layers.conv2d(
            hidden4, hidden4.get_shape()[3], [3, 3], stride=2, scope="conv3")

        # ---- Bottleneck: pass in reward, action and the optional latent ----
        emb_action = self.encode_to_shape(
            action, enc2.get_shape(), "action_enc")
        emb_reward = self.encode_to_shape(
            input_reward, enc2.get_shape(), "reward_enc")
        enc2 = tf.concat([enc2, emb_action, emb_reward], axis=3)

        if latent is not None:
            with tf.control_dependencies([latent]):
                enc2 = tf.concat([enc2, latent], 3)

        enc3 = slim.layers.conv2d(
            enc2, hidden4.get_shape()[3], [1, 1], stride=1, scope="conv4")

        hidden5, lstm_state[4] = conv_lstm(
            enc3, lstm_state[4], lstm_size[4], scope="state5")  # last 8x8
        hidden5 = norm(hidden5, scope="layer_norm6")

        # ---- Decoder with skip connections ----
        enc4 = slim.layers.conv2d_transpose(
            hidden5, hidden5.get_shape()[3], 3, stride=2, scope="convt1")
        enc1_shape = common_layers.shape_list(enc1)
        enc4 = enc4[:, :enc1_shape[1], :enc1_shape[2], :]  # Cut to shape.

        hidden6, lstm_state[5] = conv_lstm(
            enc4, lstm_state[5], lstm_size[5], scope="state6")  # 16x16
        hidden6 = norm(hidden6, scope="layer_norm7")
        hidden6 = tf.concat([hidden6, enc1], axis=3)  # skip, both 16x16

        enc5 = slim.layers.conv2d_transpose(
            hidden6, hidden6.get_shape()[3], 3, stride=2, scope="convt2")
        enc0_shape = common_layers.shape_list(enc0)
        enc5 = enc5[:, :enc0_shape[1], :enc0_shape[2], :]  # Cut to shape.

        hidden7, lstm_state[6] = conv_lstm(
            enc5, lstm_state[6], lstm_size[6], scope="state7")  # 32x32
        hidden7 = norm(hidden7, scope="layer_norm8")
        hidden7 = tf.concat([hidden7, enc0], axis=3)  # skip, both 32x32

        enc6 = slim.layers.conv2d_transpose(
            hidden7,
            hidden7.get_shape()[3],
            3,
            stride=2,
            scope="convt3",
            activation_fn=None,
            normalizer_fn=norm,
            normalizer_params={"scope": "layer_norm9"})

        # ---- Output heads ----
        model_options = self.hparams.model_options
        use_dna = model_options == "DNA"
        use_cdna = model_options == "CDNA"

        # DNA predicts untied conv kernels from the largest hidden state;
        # every other option predicts a new image layer.
        if use_dna:
            enc7_depth = self.hparams.dna_kernel_size**2
        else:
            enc7_depth = color_channels
        enc7 = slim.layers.conv2d_transpose(
            enc6, enc7_depth, 1, stride=1, scope="convt4", activation_fn=None)

        # Lets the network also generate one image from scratch, which is
        # useful when regions of the image become unoccluded.
        transformed = [tf.nn.sigmoid(enc7)]

        if use_cdna:
            cdna_input = tf.contrib.layers.flatten(hidden5)
            transformed += self.cdna_transformation(
                input_image, cdna_input, num_masks, int(color_channels))
        elif use_dna:
            # Only one mask is supported (more should be unnecessary).
            if num_masks != 1:
                raise ValueError("Only one mask is supported for DNA model.")
            transformed = [self.dna_transformation(input_image, enc7)]

        mask_depth = num_masks + 1
        masks = slim.layers.conv2d_transpose(
            enc6, mask_depth, 1, stride=1, scope="convt7", activation_fn=None)
        flat_masks = tf.nn.softmax(tf.reshape(masks, [-1, mask_depth]))
        masks = tf.reshape(
            flat_masks,
            [batch_size, int(img_height), int(img_width), mask_depth])
        mask_list = tf.split(
            axis=3, num_or_size_splits=mask_depth, value=masks)

        # Blend: first mask weights the input image, the rest weight the
        # transformed images.
        output = mask_list[0] * input_image
        for layer, mask in zip(transformed, mask_list[1:]):
            output = output + layer * mask

        p_reward = self.reward_prediction(hidden5)
        p_reward = self.decode_to_shape(
            p_reward, input_reward.shape, "reward_dec")

        return output, p_reward, lstm_state
def body(self, features):
    """Run the conv encoder / hidden stack / decoder on `features["inputs"]`.

    Args:
      features: dict with "inputs"; an optional "input_action" gates the
        encoded features, and the presence of "target_reward" switches on the
        reward output.

    Returns:
      The decoded tensor, or, when "target_reward" is in `features`, a dict
      with "targets" (that tensor) and "target_reward" (its spatial mean).
    """
    hparams = self.hparams
    num_filters = hparams.hidden_size
    conv_kernel, stride_kernel = (3, 3), (4, 4)
    num_steps = hparams.num_compress_steps

    # Embed the inputs (non-zero bias init for edge cases of uniform inputs).
    inputs = features["inputs"]
    inputs_shape = common_layers.shape_list(inputs)
    net = tf.layers.dense(
        inputs,
        num_filters,
        name="inputs_embed",
        bias_initializer=tf.random_normal_initializer(stddev=0.01))
    net = common_attention.add_timing_signal_nd(net)

    # Encoder: remember the input of each stride-2 step for the skips.
    skips = [net]
    for step in range(num_steps):
        with tf.variable_scope("downstride%d" % step):
            skips.append(net)
            net = common_layers.make_even_size(net)
            if step < hparams.filter_double_steps:
                num_filters *= 2
            net = tf.layers.conv2d(
                net,
                num_filters,
                stride_kernel,
                activation=common_layers.belu,
                strides=(2, 2),
                padding="SAME")
            net = common_layers.layer_norm(net)

    # Multiply by the embedded last action, if one was given.
    if "input_action" in features:
        last_action = features["input_action"][:, -1, :]
        action = tf.reshape(last_action, [-1, 1, 1, hparams.hidden_size])
        action_mask = tf.layers.dense(action, num_filters, name="action_mask")
        mask_shape = common_layers.shape_list(net)[:-1] + [num_filters]
        zeros_mask = tf.zeros(mask_shape, dtype=tf.float32)
        net *= action_mask + zeros_mask

    # Stack of same-resolution convolutions; residual from the second on.
    for layer_idx in range(hparams.num_hidden_layers):
        with tf.variable_scope("layer%d" % layer_idx):
            branch = tf.layers.conv2d(
                net,
                num_filters,
                conv_kernel,
                activation=common_layers.belu,
                strides=(1, 1),
                padding="SAME")
            branch = tf.nn.dropout(branch, 1.0 - hparams.dropout)
            if layer_idx == 0:
                net = branch
            else:
                net = common_layers.layer_norm(net + branch)

    # Decoder: consume the saved encoder inputs from last to first.
    first_halving_step = num_steps - hparams.filter_double_steps
    for step in range(num_steps):
        with tf.variable_scope("upstride%d" % step):
            if step >= first_halving_step:
                num_filters //= 2
            net = tf.layers.conv2d_transpose(
                net,
                num_filters,
                stride_kernel,
                activation=common_layers.belu,
                strides=(2, 2),
                padding="SAME")
            skip = skips[-1 - step]
            skip_shape = common_layers.shape_list(skip)
            net = net[:, :skip_shape[1], :skip_shape[2], :]
            net = common_layers.layer_norm(net + skip)
            net = common_attention.add_timing_signal_nd(net)

    # Cut down to original size.
    net = net[:, :inputs_shape[1], :inputs_shape[2], :]

    # Reward prediction if needed.
    if "target_reward" in features:
        reward_pred = tf.reduce_mean(net, axis=[1, 2], keepdims=True)
        return {"targets": net, "target_reward": reward_pred}
    return net
def conv_latent_tower(images, time_axis, latent_channels=1, min_logvar=-5,
                      is_training=False, random_latent=False,
                      tiny_mode=False, small_mode=False):
    """Builds convolutional latent tower for stochastic model.

    At training time this tower generates a latent distribution (mean and
    log variance) conditioned on the entire video. This latent variable will
    be fed to the main tower as an extra variable to be used for future frames
    prediction. When not training, or when `random_latent` is true, the tower
    output is replaced by zeros for both the mean and the log variance, i.e.
    the parameters of a standard gaussian.

    Args:
      images: tensor of ground truth image sequences
      time_axis: the time axis in images tensor
      latent_channels: number of latent channels
      min_logvar: minimum value for log_var
      is_training: whether or not it is training mode
      random_latent: whether or not to return the zero (standard gaussian)
        parameters instead of the predicted ones. May be a boolean tensor or a
        Python bool.
      tiny_mode: whether or not it is tiny_mode. tiny_mode sets the number
        of conv channels to 1 at each layer. useful for testing the
        integration tests.
      small_mode: whether or not it is small_mode. small mode is the same
        model with less conv and lstm layers and also lower number of
        channels. suitable for videos with less complexity and testing.

    Returns:
      latent_mean: predicted latent mean
      latent_logvar: predicted latent log variance
    """
    conv_size = tinyify([32, 64, 64], tiny_mode, small_mode)
    with tf.variable_scope("latent", reuse=tf.AUTO_REUSE):
        # Fold the time axis into the channel axis.
        images = tf.to_float(images)
        images = tf.unstack(images, axis=time_axis)
        images = tf.concat(images, axis=3)

        x = images
        x = common_layers.make_even_size(x)
        x = tfl.conv2d(x, conv_size[0], [3, 3], strides=(2, 2),
                       padding="SAME", activation=tf.nn.relu,
                       name="latent_conv1")
        x = tfcl.layer_norm(x)
        if not small_mode:
            x = tfl.conv2d(x, conv_size[1], [3, 3], strides=(2, 2),
                           padding="SAME", activation=tf.nn.relu,
                           name="latent_conv2")
            x = tfcl.layer_norm(x)
        x = tfl.conv2d(x, conv_size[2], [3, 3], strides=(1, 1),
                       padding="SAME", activation=tf.nn.relu,
                       name="latent_conv3")
        x = tfcl.layer_norm(x)

        nc = latent_channels
        mean = tfl.conv2d(x, nc, [3, 3], strides=(2, 2),
                          padding="SAME", activation=None, name="latent_mean")
        logv = tfl.conv2d(x, nc, [3, 3], strides=(2, 2),
                          padding="SAME", activation=tf.nn.relu,
                          name="latent_std")
        # The relu output is >= 0, so min_logvar is a lower bound.
        logvar = logv + min_logvar

        # No latent tower at inference time, just standard gaussian.
        if not is_training:
            return tf.zeros_like(mean), tf.zeros_like(logvar)

        # tf.cond rejects a Python bool predicate, and the default value of
        # `random_latent` is exactly that, so resolve plain bools statically.
        if isinstance(random_latent, bool):
            if random_latent:
                return tf.zeros_like(mean), tf.zeros_like(logvar)
            return mean, logvar

        # No latent in the first phase
        ret_mean, ret_logvar = tf.cond(
            random_latent,
            lambda: (tf.zeros_like(mean), tf.zeros_like(logvar)),
            lambda: (mean, logvar))

        return ret_mean, ret_logvar
def body(self, features):
    """Runs the down-stride / conv stack / up-stride network on the inputs.

    Args:
      features: dict of tensors. Reads "inputs"; reads "input_action" when
        present; the presence of "target_reward" selects the return form.

    Returns:
      The output tensor cut back to the spatial size of the inputs, or, when
      "target_reward" is in `features`, a dict with that tensor under
      "targets" and its mean over axes 1 and 2 under "target_reward".
    """
    hparams = self.hparams
    num_filters = hparams.hidden_size
    conv_kernel, stride_kernel = (3, 3), (4, 4)

    # Embed the inputs.
    frames = features["inputs"]
    original_shape = common_layers.shape_list(frames)
    # The bias initializer is non-zero to cover the edge case of uniform
    # inputs.
    net = tf.layers.dense(
        frames, num_filters, name="inputs_embed",
        bias_initializer=tf.random_normal_initializer(stddev=0.01))
    net = common_attention.add_timing_signal_nd(net)

    # Down-stride, remembering the input of every step for the skip
    # connections of the up-convolution below.
    skips = [net]
    for step in range(hparams.num_compress_steps):
        with tf.variable_scope("downstride%d" % step):
            skips.append(net)
            net = common_layers.make_even_size(net)
            if step < hparams.filter_double_steps:
                num_filters *= 2
            net = tf.layers.conv2d(
                net, num_filters, stride_kernel,
                activation=common_layers.belu,
                strides=(2, 2), padding="SAME")
            net = common_layers.layer_norm(net)

    # Add embedded action if present.
    if "input_action" in features:
        last_action = tf.reshape(
            features["input_action"][:, -1, :],
            [-1, 1, 1, hparams.hidden_size])
        action_mask = tf.layers.dense(
            last_action, num_filters, name="action_mask")
        # Adding zeros of the full feature-map shape broadcasts the mask.
        broadcast_zeros = tf.zeros(
            common_layers.shape_list(net)[:-1] + [num_filters],
            dtype=tf.float32)
        net = net * (action_mask + broadcast_zeros)

    # Run a stack of convolutions; all layers but the first are residual.
    for layer_idx in range(hparams.num_hidden_layers):
        with tf.variable_scope("layer%d" % layer_idx):
            branch = tf.layers.conv2d(
                net, num_filters, conv_kernel,
                activation=common_layers.belu,
                strides=(1, 1), padding="SAME")
            branch = tf.nn.dropout(branch, 1.0 - hparams.dropout)
            if layer_idx == 0:
                net = branch
            else:
                net = common_layers.layer_norm(net + branch)

    # Up-convolve, consuming the saved inputs from the deepest one outwards.
    skips = skips[::-1]
    first_halving_step = (
        hparams.num_compress_steps - hparams.filter_double_steps)
    for step in range(hparams.num_compress_steps):
        with tf.variable_scope("upstride%d" % step):
            if step >= first_halving_step:
                num_filters //= 2
            net = tf.layers.conv2d_transpose(
                net, num_filters, stride_kernel,
                activation=common_layers.belu,
                strides=(2, 2), padding="SAME")
            skip = skips[step]
            skip_shape = common_layers.shape_list(skip)
            # Crop to the skip's spatial size before the residual add.
            net = net[:, :skip_shape[1], :skip_shape[2], :]
            net = common_layers.layer_norm(net + skip)
            net = common_attention.add_timing_signal_nd(net)

    # Cut down to original size.
    net = net[:, :original_shape[1], :original_shape[2], :]

    # Reward prediction if needed.
    if "target_reward" in features:
        reward_pred = tf.reduce_mean(net, axis=[1, 2], keepdims=True)
        return {"targets": net, "target_reward": reward_pred}
    return net
def construct_predictive_tower(
        self, input_image, input_reward, action, lstm_state, latent):
    """Builds one step of the main conv-LSTM prediction tower.

    Args:
      input_image: the current image tensor.
      input_reward: the current reward tensor; its `.shape` is the shape the
        predicted reward is decoded to.
      action: the action tensor, embedded via `self.encode_to_shape`.
      lstm_state: list with the states of the seven conv LSTMs; updated in
        place and also returned.
      latent: optional latent tensor concatenated on axis 3 at the
        bottleneck; skipped when None.

    Returns:
      A tuple (output, p_reward, lstm_state).

    Raises:
      ValueError: if the model option is "DNA" and `num_masks` is not 1.
    """
    hparams = self.hparams
    norm = tf.contrib.layers.layer_norm
    batch_size = common_layers.shape_list(input_image)[0]
    # Number of pixel-motion predictions, and of masks for those predictions.
    num_masks = hparams.num_masks

    lstm_size = self.tinyify([32, 32, 64, 64, 128, 64, 32])
    conv_size = self.tinyify([32])

    img_height, img_width, color_channels = hparams.problem.frame_shape
    model_options = hparams.model_options

    def lstm_step(inputs, index, scope):
        # Runs conv LSTM `index` and stores its new state back in the list.
        output, lstm_state[index] = self.conv_lstm_2d(
            inputs, lstm_state[index], lstm_size[index], scope=scope)
        return output

    with tf.variable_scope("main", reuse=tf.AUTO_REUSE):
        input_image = common_layers.make_even_size(input_image)
        enc0 = slim.layers.conv2d(
            input_image, conv_size[0], [5, 5], stride=2,
            scope="scale1_conv1",
            normalizer_fn=norm,
            normalizer_params={"scope": "layer_norm1"})

        hidden1 = norm(lstm_step(enc0, 0, "state1"), scope="layer_norm2")
        hidden2 = norm(lstm_step(hidden1, 1, "state2"), scope="layer_norm3")
        hidden2 = common_layers.make_even_size(hidden2)
        enc1 = slim.layers.conv2d(
            hidden2, hidden2.get_shape()[3], [3, 3], stride=2, scope="conv2")

        hidden3 = norm(lstm_step(enc1, 2, "state3"), scope="layer_norm4")
        hidden4 = norm(lstm_step(hidden3, 3, "state4"), scope="layer_norm5")
        hidden4 = common_layers.make_even_size(hidden4)
        enc2 = slim.layers.conv2d(
            hidden4, hidden4.get_shape()[3], [3, 3], stride=2, scope="conv3")

        # Pass in reward and action.
        bottleneck_shape = enc2.get_shape()
        emb_action = self.encode_to_shape(action, bottleneck_shape)
        emb_reward = self.encode_to_shape(input_reward, bottleneck_shape)
        enc2 = tf.concat(axis=3, values=[enc2, emb_action, emb_reward])

        if latent is not None:
            with tf.control_dependencies([latent]):
                enc2 = tf.concat([enc2, latent], 3)

        enc3 = slim.layers.conv2d(
            enc2, hidden4.get_shape()[3], [1, 1], stride=1, scope="conv4")

        hidden5 = norm(lstm_step(enc3, 4, "state5"), scope="layer_norm6")
        enc4 = slim.layers.conv2d_transpose(
            hidden5, hidden5.get_shape()[3], 3, stride=2, scope="convt1")

        # Cut to the shape of the matching encoder feature map.
        enc1_shape = common_layers.shape_list(enc1)
        enc4 = enc4[:, :enc1_shape[1], :enc1_shape[2], :]
        hidden6 = norm(lstm_step(enc4, 5, "state6"), scope="layer_norm7")
        # Skip connection.
        hidden6 = tf.concat(axis=3, values=[hidden6, enc1])

        enc5 = slim.layers.conv2d_transpose(
            hidden6, hidden6.get_shape()[3], 3, stride=2, scope="convt2")

        # Cut to the shape of the matching encoder feature map.
        enc0_shape = common_layers.shape_list(enc0)
        enc5 = enc5[:, :enc0_shape[1], :enc0_shape[2], :]
        hidden7 = norm(lstm_step(enc5, 6, "state7"), scope="layer_norm8")
        # Skip connection.
        hidden7 = tf.concat(axis=3, values=[hidden7, enc0])

        enc6 = slim.layers.conv2d_transpose(
            hidden7, hidden7.get_shape()[3], 3, stride=2, scope="convt3",
            activation_fn=None,
            normalizer_fn=norm,
            normalizer_params={"scope": "layer_norm9"})

        if model_options == "DNA":
            # Largest hidden state predicts the untied conv kernels.
            enc7 = slim.layers.conv2d_transpose(
                enc6, hparams.dna_kernel_size**2, 1, stride=1,
                scope="convt4", activation_fn=None)
        else:
            # Largest hidden state predicts a new image layer.
            enc7 = slim.layers.conv2d_transpose(
                enc6, color_channels, 1, stride=1,
                scope="convt4", activation_fn=None)
            # Lets the network also generate one image from scratch, which
            # is useful when regions of the image become unoccluded.
            transformed = [tf.nn.sigmoid(enc7)]

        if model_options == "CDNA":
            cdna_input = tf.contrib.layers.flatten(hidden5)
            transformed += self.cdna_transformation(
                input_image, cdna_input, num_masks, int(color_channels))
        elif model_options == "DNA":
            # Only one mask is supported (more should be unnecessary).
            if num_masks != 1:
                raise ValueError("Only one mask is supported for DNA model.")
            transformed = [self.dna_transformation(input_image, enc7)]

        mask_count = num_masks + 1
        masks = slim.layers.conv2d_transpose(
            enc6, mask_count, 1, stride=1, scope="convt7",
            activation_fn=None)
        # Softmax over the mask dimension, then back to image layout.
        flat_masks = tf.nn.softmax(tf.reshape(masks, [-1, mask_count]))
        masks = tf.reshape(
            flat_masks,
            [batch_size, int(img_height), int(img_width), mask_count])
        mask_list = tf.split(
            axis=3, num_or_size_splits=mask_count, value=masks)

        # First mask weights the input image, the rest weight the
        # transformed images.
        output = mask_list[0] * input_image
        for candidate, mask in zip(transformed, mask_list[1:]):
            output += candidate * mask

        p_reward = self.reward_prediction(hidden5)
        p_reward = self.decode_to_shape(p_reward, input_reward.shape)

        return output, p_reward, lstm_state
def inject_latent(self, layer, inputs, target, action):
    """Inject a deterministic latent based on the target frame.

    Args:
      layer: the tensor the latent bits are injected into.
      inputs: list of input frame tensors; concatenated with `target` on the
        last axis to form the latent tower's input.
      target: the target frame tensor.
      action: optional action tensor injected into the latent tower input;
        skipped when None.

    Returns:
      A tuple (result, pred_loss). `result` is `layer` with the latent bits
      added. `pred_loss` is the loss returned by
      `discretization.predict_bits_with_lstm` during training without the
      full latent tower, and 0.0 otherwise.
    """
    hparams = self.hparams
    final_filters = common_layers.shape_list(layer)[-1]
    filters = hparams.hidden_size
    kernel = (4, 4)
    layer_shape = common_layers.shape_list(layer)

    def add_bits(layer, bits):
        # Project the bits to the layer's depth and combine them with it.
        z_mul = tfl.dense(bits, final_filters, name="unbottleneck_mul")
        if not hparams.complex_addn:
            return layer + z_mul
        layer *= tf.nn.sigmoid(z_mul)
        z_add = tfl.dense(bits, final_filters, name="unbottleneck_add")
        layer += z_add
        return layer

    if not self.is_training:
        if hparams.full_latent_tower:
            # Uniformly random bits in {-1, +1}.
            rand = tf.random_uniform(
                layer_shape[:-1] + [hparams.bottleneck_bits])
            bits = 2.0 * tf.to_float(tf.less(0.5, rand)) - 1.0
        else:
            bits, _ = discretization.predict_bits_with_lstm(
                layer, hparams.latent_predictor_state_size,
                hparams.bottleneck_bits,
                temperature=hparams.latent_predictor_temperature)
            bits = tf.expand_dims(tf.expand_dims(bits, axis=1), axis=2)
        return add_bits(layer, bits), 0.0

    # Embed.
    frames = tf.concat(inputs + [target], axis=-1)
    x = tfl.dense(
        frames, filters, name="latent_embed",
        bias_initializer=tf.random_normal_initializer(stddev=0.01))
    x = common_attention.add_timing_signal_nd(x)

    # Add embedded action if present.
    if action is not None:
        x = common_video.inject_additional_input(
            x, action, "action_enc_latent", hparams.action_injection)

    if hparams.full_latent_tower:
        for i in range(hparams.num_compress_steps):
            with tf.variable_scope("latent_downstride%d" % i):
                x = common_layers.make_even_size(x)
                if i < hparams.filter_double_steps:
                    filters *= 2
                x = common_attention.add_timing_signal_nd(x)
                x = tfl.conv2d(x, filters, kernel,
                               activation=common_layers.belu,
                               strides=(2, 2), padding="SAME")
                x = common_layers.layer_norm(x)
    else:
        x = common_layers.double_discriminator(x)
        x = tf.expand_dims(tf.expand_dims(x, axis=1), axis=1)

    bits, bits_clean = discretization.tanh_discrete_bottleneck(
        x, hparams.bottleneck_bits, hparams.bottleneck_noise,
        hparams.discretize_warmup_steps, hparams.mode)

    # The full latent tower has no bit predictor and therefore no predictor
    # loss; default to 0.0 (the value returned at inference) so the return
    # below never hits an unbound local.
    pred_loss = 0.0
    if not hparams.full_latent_tower:
        _, pred_loss = discretization.predict_bits_with_lstm(
            layer, hparams.latent_predictor_state_size,
            hparams.bottleneck_bits, target_bits=bits_clean)
        # Mix bits from latent with predicted bits on forward pass as noise.
        if hparams.latent_rnn_max_sampling > 0.0:
            with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                bits_pred, _ = discretization.predict_bits_with_lstm(
                    layer, hparams.latent_predictor_state_size,
                    hparams.bottleneck_bits,
                    temperature=hparams.latent_predictor_temperature)
                bits_pred = tf.expand_dims(
                    tf.expand_dims(bits_pred, axis=1), axis=2)
            # Be bits_pred on the forward pass but bits on the backward one.
            bits_pred = bits_clean + tf.stop_gradient(bits_pred - bits_clean)
            # Select which bits to take from pred sampling with bit_p
            # probability.
            which_bit = tf.random_uniform(common_layers.shape_list(bits))
            bit_p = common_layers.inverse_lin_decay(
                hparams.latent_rnn_warmup_steps)
            bit_p *= hparams.latent_rnn_max_sampling
            bits = tf.where(which_bit < bit_p, bits_pred, bits)

    res = add_bits(layer, bits)
    # During training, sometimes skip the latent to help action-conditioning.
    res_p = common_layers.inverse_lin_decay(
        hparams.latent_rnn_warmup_steps / 2)
    res_p *= hparams.latent_use_max_probability
    res_rand = tf.random_uniform([layer_shape[0]])
    res = tf.where(res_rand < res_p, res, layer)
    return res, pred_loss
def inject_latent(self, layer, inputs, target, action):
    """Inject a deterministic latent based on the target frame.

    Args:
      layer: the tensor the latent bits are injected into.
      inputs: list of input frame tensors; concatenated with `target` on the
        last axis to form the latent tower's input.
      target: the target frame tensor.
      action: optional action tensor injected into the latent tower input;
        skipped when None.

    Returns:
      A tuple (result, pred_loss). `result` is `layer` with the latent bits
      added. `pred_loss` is the loss returned by
      `discretization.predict_bits_with_lstm` during training without the
      full latent tower, and 0.0 otherwise.
    """
    hparams = self.hparams
    final_filters = common_layers.shape_list(layer)[-1]
    filters = hparams.hidden_size
    kernel = (4, 4)
    layer_shape = common_layers.shape_list(layer)
    # belu unless the hparams explicitly ask for relu.
    activation_fn = common_layers.belu
    if hparams.activation_fn == "relu":
        activation_fn = tf.nn.relu

    def add_bits(layer, bits):
        # Project the bits to the layer's depth and combine them with it.
        z_mul = tfl.dense(bits, final_filters, name="unbottleneck_mul")
        if not hparams.complex_addn:
            return layer + z_mul
        layer *= tf.nn.sigmoid(z_mul)
        z_add = tfl.dense(bits, final_filters, name="unbottleneck_add")
        layer += z_add
        return layer

    if not self.is_training:
        if hparams.full_latent_tower:
            # Uniformly random bits in {-1, +1}.
            rand = tf.random_uniform(
                layer_shape[:-1] + [hparams.bottleneck_bits])
            bits = 2.0 * tf.to_float(tf.less(0.5, rand)) - 1.0
        else:
            bits, _ = discretization.predict_bits_with_lstm(
                layer, hparams.latent_predictor_state_size,
                hparams.bottleneck_bits,
                temperature=hparams.latent_predictor_temperature)
            bits = tf.expand_dims(tf.expand_dims(bits, axis=1), axis=2)
        return add_bits(layer, bits), 0.0

    # Embed.
    frames = tf.concat(inputs + [target], axis=-1)
    x = tfl.dense(
        frames, filters, name="latent_embed",
        bias_initializer=tf.random_normal_initializer(stddev=0.01))
    x = common_attention.add_timing_signal_nd(x)

    # Add embedded action if present.
    if action is not None:
        x = common_video.inject_additional_input(
            x, action, "action_enc_latent", hparams.action_injection)

    if hparams.full_latent_tower:
        for i in range(hparams.num_compress_steps):
            with tf.variable_scope("latent_downstride%d" % i):
                x = common_layers.make_even_size(x)
                if i < hparams.filter_double_steps:
                    filters *= 2
                x = common_attention.add_timing_signal_nd(x)
                x = tfl.conv2d(x, filters, kernel,
                               activation=activation_fn,
                               strides=(2, 2), padding="SAME")
                x = common_layers.layer_norm(x)
    else:
        x = common_layers.double_discriminator(x)
        x = tf.expand_dims(tf.expand_dims(x, axis=1), axis=1)

    bits, bits_clean = discretization.tanh_discrete_bottleneck(
        x, hparams.bottleneck_bits, hparams.bottleneck_noise,
        hparams.discretize_warmup_steps, hparams.mode)

    # The full latent tower has no bit predictor and therefore no predictor
    # loss; default to 0.0 (the value returned at inference) so the return
    # below never hits an unbound local.
    pred_loss = 0.0
    if not hparams.full_latent_tower:
        _, pred_loss = discretization.predict_bits_with_lstm(
            layer, hparams.latent_predictor_state_size,
            hparams.bottleneck_bits, target_bits=bits_clean)
        # Mix bits from latent with predicted bits on forward pass as noise.
        if hparams.latent_rnn_max_sampling > 0.0:
            with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                bits_pred, _ = discretization.predict_bits_with_lstm(
                    layer, hparams.latent_predictor_state_size,
                    hparams.bottleneck_bits,
                    temperature=hparams.latent_predictor_temperature)
                bits_pred = tf.expand_dims(
                    tf.expand_dims(bits_pred, axis=1), axis=2)
            # Be bits_pred on the forward pass but bits on the backward one.
            bits_pred = bits_clean + tf.stop_gradient(bits_pred - bits_clean)
            # Select which bits to take from pred sampling with bit_p
            # probability.
            which_bit = tf.random_uniform(common_layers.shape_list(bits))
            bit_p = common_layers.inverse_lin_decay(
                hparams.latent_rnn_warmup_steps)
            bit_p *= hparams.latent_rnn_max_sampling
            bits = tf.where(which_bit < bit_p, bits_pred, bits)

    res = add_bits(layer, bits)
    # During training, sometimes skip the latent to help action-conditioning.
    res_p = common_layers.inverse_lin_decay(
        hparams.latent_rnn_warmup_steps / 2)
    res_p *= hparams.latent_use_max_probability
    res_rand = tf.random_uniform([layer_shape[0]])
    res = tf.where(res_rand < res_p, res, layer)
    return res, pred_loss
def bottom_part_tower(self, input_image, input_reward, action, latent,
                      lstm_state, lstm_size, conv_size, concat_latent=False):
    """Builds the bottom (encoder) part of the predictive towers.

    In the current (early) design the main prediction tower and the reward
    prediction tower share this architecture. The surrounding TF scope
    decides whether the two towers also share weights.

    Args:
      input_image: the current image.
      input_reward: the current reward; not injected when None.
      action: the action taken by the agent; not injected when None.
      latent: the latent vector.
      lstm_state: list with the current internal states of the conv LSTMs;
        the entries used here are updated in place.
      lstm_size: the sizes of the LSTMs.
      conv_size: the sizes of the convolutions.
      concat_latent: whether or not to concatenate the latent at every step.

    Returns:
      A tuple of:
        - the output of the partial network,
        - the intermediate outputs (enc0, enc1) for skip connections,
        - the index of the next unused LSTM layer.
    """
    def merge_latent(tensor):
        # Hands the tensor to tile_and_concat with this call's latent setup.
        return common_video.tile_and_concat(
            tensor, latent, concat_latent=concat_latent)

    def conv_lstm(index, tensor, name):
        # Runs conv LSTM `index` and stores its new state back in the list.
        output, lstm_state[index] = common_video.conv_lstm_2d(
            tensor, lstm_state[index], lstm_size[index], name=name)
        return output

    input_image = common_layers.make_even_size(input_image)
    enc0 = tfl.conv2d(
        merge_latent(input_image),
        conv_size[0], [5, 5], strides=(2, 2),
        activation=tf.nn.relu, padding="SAME", name="scale1_conv1")
    enc0 = tfcl.layer_norm(enc0, scope="layer_norm1")

    next_layer = 0
    hidden1 = conv_lstm(next_layer, enc0, "state1")
    hidden1 = merge_latent(hidden1)
    hidden1 = tfcl.layer_norm(hidden1, scope="layer_norm2")
    next_layer += 1

    hidden2 = conv_lstm(next_layer, hidden1, "state2")
    hidden2 = tfcl.layer_norm(hidden2, scope="layer_norm3")
    hidden2 = common_layers.make_even_size(hidden2)
    enc1 = tfl.conv2d(
        hidden2, hidden2.get_shape()[3], [3, 3], strides=(2, 2),
        padding="SAME", activation=tf.nn.relu, name="conv2")
    enc1 = merge_latent(enc1)
    next_layer += 1

    if self.hparams.small_mode:
        # Small mode skips the third and fourth LSTMs and the third conv.
        hidden4, enc2 = hidden2, enc1
    else:
        hidden3 = conv_lstm(next_layer, enc1, "state3")
        hidden3 = merge_latent(hidden3)
        hidden3 = tfcl.layer_norm(hidden3, scope="layer_norm4")
        next_layer += 1

        hidden4 = conv_lstm(next_layer, hidden3, "state4")
        hidden4 = merge_latent(hidden4)
        hidden4 = tfcl.layer_norm(hidden4, scope="layer_norm5")
        hidden4 = common_layers.make_even_size(hidden4)
        enc2 = tfl.conv2d(
            hidden4, hidden4.get_shape()[3], [3, 3], strides=(2, 2),
            padding="SAME", activation=tf.nn.relu, name="conv3")
        next_layer += 1

    if action is not None:
        enc2 = common_video.inject_additional_input(
            enc2, action, "action_enc", self.hparams.action_injection)
    if input_reward is not None:
        enc2 = common_video.inject_additional_input(
            enc2, input_reward, "reward_enc")
    # With concat_latent the latent has already been merged at every step,
    # so it is only appended here in the non-concat case.
    if not concat_latent and latent is not None:
        with tf.control_dependencies([latent]):
            enc2 = tf.concat([enc2, latent], axis=3)

    enc3 = tfl.conv2d(
        enc2, hidden4.get_shape()[3], [1, 1], strides=(1, 1),
        padding="SAME", activation=tf.nn.relu, name="conv4")

    hidden5 = conv_lstm(next_layer, enc3, "state5")
    hidden5 = tfcl.layer_norm(hidden5, scope="layer_norm6")
    hidden5 = merge_latent(hidden5)
    next_layer += 1

    return hidden5, (enc0, enc1), next_layer
def inject_latent(self, layer, inputs, target):
    """Inject a deterministic latent based on the target frame.

    Args:
      layer: the tensor the latent bits are injected into.
      inputs: list of input frame tensors; concatenated with `target` on the
        last axis to form the latent tower's input.
      target: the target frame tensor.

    Returns:
      A tuple (result, pred_loss). `result` is `layer` with the latent bits
      added. `pred_loss` is the loss returned by
      `discretization.predict_bits_with_lstm` during training without the
      full latent tower, and 0.0 otherwise.
    """
    hparams = self.hparams
    final_filters = common_layers.shape_list(layer)[-1]
    filters = hparams.hidden_size
    kernel = (4, 4)
    layer_shape = common_layers.shape_list(layer)

    def add_bits(layer, bits):
        # Project the bits to the layer's depth and combine them with it.
        z_mul = tfl.dense(bits, final_filters, name="unbottleneck_mul")
        if not hparams.complex_addn:
            return layer + z_mul
        layer *= tf.nn.sigmoid(z_mul)
        z_add = tfl.dense(bits, final_filters, name="unbottleneck_add")
        layer += z_add
        return layer

    if not self.is_training:
        if hparams.full_latent_tower:
            # Uniformly random bits in {-1, +1}.
            rand = tf.random_uniform(
                layer_shape[:-1] + [hparams.bottleneck_bits])
            bits = 2.0 * tf.to_float(tf.less(0.5, rand)) - 1.0
        else:
            bits, _ = discretization.predict_bits_with_lstm(
                layer, hparams.latent_predictor_state_size,
                hparams.bottleneck_bits,
                temperature=hparams.latent_predictor_temperature)
            bits = tf.expand_dims(tf.expand_dims(bits, axis=1), axis=2)
        return add_bits(layer, bits), 0.0

    # Embed.
    frames = tf.concat(inputs + [target], axis=-1)
    x = tfl.dense(
        frames, filters, name="latent_embed",
        bias_initializer=tf.random_normal_initializer(stddev=0.01))
    x = common_attention.add_timing_signal_nd(x)

    if hparams.full_latent_tower:
        for i in range(hparams.num_compress_steps):
            with tf.variable_scope("latent_downstride%d" % i):
                x = common_layers.make_even_size(x)
                if i < hparams.filter_double_steps:
                    filters *= 2
                x = common_attention.add_timing_signal_nd(x)
                x = tfl.conv2d(x, filters, kernel,
                               activation=common_layers.belu,
                               strides=(2, 2), padding="SAME")
                x = common_layers.layer_norm(x)
    else:
        x = common_layers.double_discriminator(x)
        x = tf.expand_dims(tf.expand_dims(x, axis=1), axis=1)

    bits, bits_clean = discretization.tanh_discrete_bottleneck(
        x, hparams.bottleneck_bits, hparams.bottleneck_noise,
        hparams.discretize_warmup_steps, hparams.mode)

    # The full latent tower has no bit predictor and therefore no predictor
    # loss; default to 0.0 (the value returned at inference) so the return
    # below never hits an unbound local.
    pred_loss = 0.0
    if not hparams.full_latent_tower:
        _, pred_loss = discretization.predict_bits_with_lstm(
            layer, hparams.latent_predictor_state_size,
            hparams.bottleneck_bits, target_bits=bits_clean)

    return add_bits(layer, bits), pred_loss
def network(self):
    """Builds the down-stride / middle / up-stride network over the states.

    Reads `self.states_ph`, `self.actions_oph` and `self.target_states`, and
    is configured by `self.hidden_size`, `self.layers`, `self.n_envs`,
    `self.dropout_p`, `self.activation_fn`, `self.depth`, `self.is_policy`,
    `self.action_dim` and `self.has_rewards`.

    Returns:
      A tuple (x, reward_pred, policy_pred, value_pred):
        x: output of the final dense layer named "logits", with
          `self.depth` units.
        reward_pred: reward head output, or None when `self.has_rewards`
          is false.
        policy_pred: policy head output with `self.action_dim` units, or
          None when `self.is_policy` is false.
        value_pred: value head output with its last axis squeezed, or None
          when `self.is_policy` is false.
    """
    def middle_network(layer):
        # Run a stack of convolutions.
        x = layer
        kernel1 = (3, 3)
        filters = common_layers.shape_list(x)[-1]
        for i in range(2):
            with tf.variable_scope("layer%d" % i):
                # Dropout is applied to the conv input, with a hard-coded
                # second argument of 1.0 - 0.5.
                y = tf.nn.dropout(x, 1.0 - 0.5)
                y = tf.layers.conv2d(y, filters, kernel1,
                                     activation=self.activation_fn,
                                     strides=(1, 1), padding="SAME")
                # The first layer replaces x; the second is residual.
                if i == 0:
                    x = y
                else:
                    x = common_layers.layer_norm(x + y)
        return x

    # NOTE(review): batch_size is not used anywhere below.
    batch_size = tf.shape(self.states_ph)[0]
    filters = self.hidden_size
    kernel2 = (4, 4)
    action = self.actions_oph  # [0] NOTE - might remove this

    # Normalize states. With several environments each slice along the
    # first axis is standardized on its own and the results are re-stacked.
    if (self.n_envs > 1):
        states = [
            common_layers.standardize_images(self.states_ph[i, :, :, :])
            for i in range(self.n_envs)
        ]
        stacked_states = tf.stack(states)
    else:
        stacked_states = common_layers.standardize_images(self.states_ph)
    inputs_shape = common_layers.shape_list(stacked_states)

    # Using non-zero bias initializer below for edge cases of uniform inputs.
    x = tf.layers.dense(
        stacked_states, filters, name="inputs_embed",
        bias_initializer=tf.random_normal_initializer(stddev=0.01))
    x = common_attention.add_timing_signal_nd(x)

    # Down-stride. The input of every step is saved for the skip
    # connections used while up-convolving.
    layer_inputs = [x]
    for i in range(self.layers):
        with tf.variable_scope("downstride%d" % i):
            layer_inputs.append(x)
            x = tf.nn.dropout(x, 1.0 - self.dropout_p)
            x = common_layers.make_even_size(x)
            # Only the first two steps double the filter count.
            if i < 2:
                filters *= 2
            x = common_attention.add_timing_signal_nd(x)
            x = tf.layers.conv2d(x, filters, kernel2,
                                 activation=self.activation_fn,
                                 strides=(2, 2), padding="SAME")
            x = common_layers.layer_norm(x)

    # Optional policy and value heads on the flattened bottleneck.
    if self.is_policy:
        with tf.variable_scope("policy"):
            x_flat = tf.layers.flatten(x)
            policy_pred = tf.layers.dense(x_flat, self.action_dim)
            value_pred = tf.layers.dense(x_flat, 1)
            value_pred = tf.squeeze(value_pred, axis=-1)
    else:
        policy_pred, value_pred = None, None

    # The action is injected unconditionally; the `self.has_actions` guard
    # is commented out.
    #if self.has_actions:
    x = inject_additional_input(x, action, "action_enc", "multi_additive")

    # Inject latent if present. Only for stochastic models.
    # NOTE(review): target_states is computed but not used below, and no
    # latent is injected in this function.
    target_states = common_layers.standardize_images(self.target_states)

    # Spatial mean of the bottleneck, used later by the reward head.
    x_mid = tf.reduce_mean(x, axis=[1, 2], keepdims=True)
    x = middle_network(x)

    # Up-convolve.
    layer_inputs = list(reversed(layer_inputs))
    for i in range(self.layers):
        with tf.variable_scope("upstride%d" % i):
            x = tf.nn.dropout(x, 1.0 - 0.1)
            # Only the last two steps halve the filter count, mirroring
            # the doubling on the way down.
            if i >= self.layers - 2:
                filters //= 2
            x = tf.layers.conv2d_transpose(x, filters, kernel2,
                                           activation=self.activation_fn,
                                           strides=(2, 2), padding="SAME")
            y = layer_inputs[i]
            shape = common_layers.shape_list(y)
            # Crop to the skip input's spatial size before the residual add.
            x = x[:, :shape[1], :shape[2], :]
            x = common_layers.layer_norm(x + y)
            x = common_attention.add_timing_signal_nd(x)

    # Cut down to original size.
    x = x[:, :inputs_shape[1], :inputs_shape[2], :]
    # Spatial mean of the pre-logits features, used by the reward head.
    x_fin = tf.reduce_mean(x, axis=[1, 2], keepdims=True)

    x = tf.layers.dense(x, self.depth, name="logits")

    reward_pred = None
    if self.has_rewards:
        # Reward prediction based on the middle and final spatial means.
        reward_pred = tf.concat([x_mid, x_fin], axis=-1)
        reward_pred = tf.nn.relu(
            tf.layers.dense(reward_pred, 128, name="reward_pred"))
        # Both means kept axes 1 and 2 (keepdims=True); squeezing axis 1
        # twice removes them.
        reward_pred = tf.squeeze(reward_pred, axis=1)  # Remove extra dims
        reward_pred = tf.squeeze(reward_pred, axis=1)  # Remove extra dims

    return x, reward_pred, policy_pred, value_pred