def _predict(self, input_data, n_frames_input, n_frames_predict, batch_size, is_training, params, encoded, input_images, predict_images, action_sequence, abs_action_sequence):
    """ Runs the prediction in the latent space.

    Rolls out the high-level RNN for ``FLAGS.n_segments`` steps. The initial
    input is built from the last past encoding (``encoded["past"][-1]``) and
    the goal latent gathered from ``encoded["future"]`` at
    ``input_data.goal_timestep``.

    Requires ``FLAGS.separate_attention_key`` to be set (asserted below).

    Only ``self``, ``input_data``, ``is_training`` and ``encoded`` are read in
    the visible body; the remaining parameters are unused here.

    Returns:
        An ``AttrDict`` with the single key ``"seq_future"``.

    NOTE(review): ``decoder_rnn_output`` is never assigned in this body, so the
    final assignment raises ``NameError`` unless the name exists at module
    scope. It looks like a decoding stage is missing - confirm against the
    full file.
    NOTE(review): the statement nesting below was reconstructed from a
    flattened source line; confirm whether the debug loops belong inside the
    ``name_scope`` block.
    """
    predicted = AttrDict()
    assert FLAGS.separate_attention_key
    with tf.name_scope("high_level_rnn"):
        # build initial input dict
        goal_latent = utils.batchwise_gather(tensor=encoded["future"], idxs=input_data.goal_timestep, batch_dim=1)
        first_input = tf.concat([encoded.past[-1], goal_latent], axis=-1)
        high_level_rnn_initial_state = self._high_level_rnn_init_state_getter(first_input)
        # The MLP output is split along its last axis at encoded_img_size:
        # the leading part is the attention key, the rest the next prior dists.
        init_tensor = self.init_mlp(first_input)
        high_level_initial_input = AttrDict({"frame": encoded["past"][-1], "dt": None, "next_prior_dists": init_tensor[:, self.encoded_img_size:], "prior_dists": None, "inference_dists": None, "z_sample": None, "attention_weights": None})
        high_level_initial_input["attention_key"] = init_tensor[:, :self.encoded_img_size]
        # Get predictions for all future times.
        high_level_rnn_output, _ = self.high_level_rnn(
            initial_input=high_level_initial_input,  # The inference latents are not passed here
            initial_state=high_level_rnn_initial_state,
            rollout_len=FLAGS.n_segments,
            use_inference=is_training,
            inference_attention_keys=self.get_attention_keys(encoded),
            predict_dt=False,
            goal_latent=goal_latent)
        # Log the shape of every RNN input / output entry through debug().
        for key, value in high_level_initial_input.items():
            debug("High-level RNN", "input", "high_level_rnn_input[{}]".format(key), "(None)" if value is None else shape(value))
        for key, value in high_level_rnn_output.items():
            debug("High-level RNN", "output", "high_level_rnn_output[{}]".format(key), shape(value))
        if FLAGS.debug:
            print()
    # NOTE(review): decoder_rnn_output is not defined anywhere in this block.
    predicted["seq_future"] = decoder_rnn_output
    return predicted
def time_distributed_dense_layer(inputs, output_units, bias=True, activation=None, dropout=None,
                                 scope='time-distributed-dense-layer', reuse=False):
    """
    Apply one dense layer, with weights shared across timesteps, to a rank-3 tensor.

    Args:
        inputs: Tensor of shape [batch size, max sequence length, input_units]
            (the einsum below requires exactly three axes).
        output_units: Number of output units.
        bias: When truthy, a learned bias of shape [output_units] is added.
        activation: Optional callable applied to the affine output.
        dropout: Dropout keep prob, handed to tf.nn.dropout as its second
            argument; dropout is skipped when this is falsy.
        scope: Name of the variable scope holding 'weights' and 'biases'.
        reuse: Forwarded to tf.variable_scope.

    Returns:
        Tensor of shape [batch size, max sequence length, output_units].
    """
    with tf.variable_scope(scope, reuse=reuse):
        input_units = shape(inputs, -1)
        weights = tf.get_variable(
            name='weights',
            shape=[input_units, output_units],
            initializer=tf.contrib.layers.variance_scaling_initializer(),
        )
        # Contract the feature axis of every timestep against the shared weights.
        outputs = tf.einsum('ijk,kl->ijl', inputs, weights)

        if bias:
            biases = tf.get_variable(
                name='biases',
                shape=[output_units],
                initializer=tf.constant_initializer(),
            )
            outputs = outputs + biases

        if activation:
            outputs = activation(outputs)
        if dropout:
            outputs = tf.nn.dropout(outputs, dropout)
        return outputs
def _decode_images(self, input_data, n_frames_input, is_training, params, encoded, predicted, skips, input_images,
                   predict_images):
    """Decode the predicted latent sequence ``predicted["seq_future"]`` into frames.

    With image input, the latents get two trailing singleton axes and go
    through ``self._build_image_decoder``. Otherwise coordinates are decoded
    with ``self.conv_decoder`` and rendered through ``self._render_fcn``
    (wrapped in ``tf.py_func``), then reshaped to the first two coordinate
    dimensions followed by ``self._render_shape``.

    Returns:
        dict with ``"pred_frames"`` (decoded frames) and ``"pred_coords"``
        (decoded coordinates, or ``None`` when the model has image input).
    """
    latent_seq = predicted["seq_future"]
    has_images = self._has_image_input
    coords = None

    # Decoding phase
    with tf.name_scope("image_decoder"):
        if has_images:
            latent_maps = tf.expand_dims(latent_seq, axis=-1)
            latent_maps = tf.expand_dims(latent_maps, axis=-1)
            frames = self._build_image_decoder(
                latent_maps,
                skips["predict"],
                is_training,
                decoder_phase="future",
                last_input_frame=input_images[-1],
                use_recursive_image=self._use_recursive_image)
        else:
            coords = snt.BatchApply(self.conv_decoder)(latent_seq, is_training)
            rendered = tf.py_func(self._render_fcn, [coords], tf.float32)
            # py_func loses static shape information, so restore it explicitly.
            frames = tf.reshape(rendered, shape(coords)[:2] + self._render_shape)

    return {"pred_frames": frames, "pred_coords": coords}
def dense_layer(inputs, output_units, bias=True, activation=None, dropout=None, scope='dense-layer', reuse=False):
    """
    Apply a dense layer to a 2D tensor.

    Args:
        inputs: Tensor of shape [batch size, input_units].
        output_units: Number of output units.
        bias: When truthy, a learned bias of shape [output_units] is added.
        activation: Optional callable applied to the affine output.
        dropout: Dropout keep prob, handed to tf.nn.dropout as its second
            argument; dropout is skipped when this is falsy.
        scope: Name of the variable scope holding 'weights' and 'biases'.
        reuse: Forwarded to tf.variable_scope.

    Returns:
        Tensor of shape [batch size, output_units].
    """
    with tf.variable_scope(scope, reuse=reuse):
        input_units = shape(inputs, -1)
        weights = tf.get_variable(
            name='weights',
            shape=[input_units, output_units],
            initializer=tf.contrib.layers.variance_scaling_initializer(),
        )
        outputs = tf.matmul(inputs, weights)

        if bias:
            biases = tf.get_variable(
                name='biases',
                shape=[output_units],
                initializer=tf.constant_initializer(),
            )
            outputs = outputs + biases

        if activation:
            outputs = activation(outputs)
        if dropout:
            outputs = tf.nn.dropout(outputs, dropout)
        return outputs
def K(self, tau):
    """Build the dense covariance matrix with and without the noise term.

    For each of the ``self.Q`` components, the Kronecker product of the
    coregionalisation matrix ``self.coregs.getMat(q)`` and the Toeplitz matrix
    built from row ``q`` of ``model.kernels.covFun(tau)`` is formed; the
    products are summed. The noisy version adds ``(1 / self.gamma) * I`` of
    size ``N * self.C`` (cast to complex64), where ``N = shape(tau)[0]``.

    NOTE(review): the covariance function is read from the global ``model``,
    not from ``self`` - confirm this is intended.

    Returns:
        Tuple ``(noise_free, noisy)`` of dense tensors.
    """
    n_points = shape(tau)[0]

    coreg_ops = [
        tf.linalg.LinearOperatorFullMatrix(self.coregs.getMat(q))
        for q in range(self.Q)
    ]

    cov = model.kernels.covFun(tau)
    kernel_ops = [
        tf.linalg.LinearOperatorFullMatrix(toeplitz(cov[row]))
        for row in range(shape(cov)[0])
    ]

    # One dense Kronecker product per component, indexed by component number.
    dense_terms = [
        tf.linalg.LinearOperatorKronecker([coreg_ops[q], kernel_ops[q]]).to_dense()
        for q in range(self.Q)
    ]
    noise_free = tf.add_n(dense_terms)

    noise = tf.cast(
        tf.scalar_mul(1. / self.gamma, tf.eye(n_points * self.C)),
        tf.complex64)
    return noise_free, noise_free + noise
def dense_word_embedding_from_chars(chars, embed_dim, bias=True, scope='dense-word-embed', reuse=False):
    """
    Build word embeddings by projecting every character with a shared dense
    layer and max-pooling over the character axis.

    Args:
        chars: Tensor of shape [batch_size, word sequence length, char sequence length, alphabet size].
            It is cast to float32 before the projection.
        embed_dim: Dimension of word embeddings. Integer.
        bias: When truthy, a learned bias of shape [embed_dim] is added before pooling.
        scope: Name of the variable scope holding 'weights' and 'biases'.
        reuse: Forwarded to tf.variable_scope.

    Returns:
        Sequence of embedding vectors. Tensor of shape [batch_size, word sequence length, embed_dim].
    """
    with tf.variable_scope(scope, reuse=reuse):
        char_features = tf.cast(chars, tf.float32)
        alphabet_size = shape(char_features, -1)
        weights = tf.get_variable(
            name='weights',
            shape=[alphabet_size, embed_dim],
            initializer=tf.contrib.layers.variance_scaling_initializer(),
        )
        projected = tf.einsum('ijkl,lm->ijkm', char_features, weights)

        if bias:
            biases = tf.get_variable(
                name='biases',
                shape=[embed_dim],
                initializer=tf.constant_initializer(),
            )
            projected = projected + biases

        # Axis 2 is the character axis: keep the strongest response per word.
        return tf.reduce_max(projected, 2)
def dense_layer(inputs, output_units, bias=True, activation=None, dropout=None, scope='dense-layer', reuse=False):
    """
    Apply a dense layer to a 2D tensor.

    Args:
        inputs: Tensor of shape [batch size, input_units].
        output_units: Number of output units.
        bias: When truthy, a learned bias of shape [output_units] is added.
        activation: Optional callable applied to the affine output.
        dropout: Dropout keep prob, handed to tf.nn.dropout as its second
            argument; dropout is skipped when this is falsy.
        scope: Name of the variable scope holding 'weights' and 'biases'.
        reuse: Forwarded to tf.variable_scope.

    Returns:
        Tensor of shape [batch size, output_units].
    """
    with tf.variable_scope(scope, reuse=reuse):
        input_units = shape(inputs, -1)
        weights = tf.get_variable(
            name='weights',
            shape=[input_units, output_units],
            initializer=tf.contrib.layers.variance_scaling_initializer(),
        )
        outputs = tf.matmul(inputs, weights)

        if bias:
            biases = tf.get_variable(
                name='biases',
                shape=[output_units],
                initializer=tf.constant_initializer(),
            )
            outputs = outputs + biases

        if activation:
            outputs = activation(outputs)
        if dropout:
            outputs = tf.nn.dropout(outputs, dropout)
        return outputs
def dense(a_enc, b_enc, bias=True, activation=None, dropout=None, scope='dense', reuse=False):
    """
    Compare the encoded representations a_enc and b_enc using a learnable
    parameterized function in the form of a dense layer applied to the
    concatenation of a_enc and b_enc.

    Args:
        a_enc: Encoded representation of sequence a. Tensor of shape [batch_size, input_units].
        b_enc: Encoded representation of sequence b. Tensor of shape [batch_size, input_units].
        bias: When truthy, a learned scalar bias is added.
        activation: Activation function.
        dropout: Dropout keep prob. Float. Dropout is skipped when falsy.
        scope: Name of the variable scope holding 'weights' and 'biases'.
        reuse: Forwarded to tf.variable_scope.

    Returns:
        Tensor of shape [batch size].
    """
    with tf.variable_scope(scope, reuse=reuse):
        inputs = tf.concat([a_enc, b_enc], axis=1)
        W = tf.get_variable(
            name='weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[shape(inputs, -1), 1]
        )
        z = tf.matmul(inputs, W)
        if bias:
            # Separate local name: the bias must not shadow the b_enc input.
            b = tf.get_variable(
                name='biases',
                initializer=tf.constant_initializer(),
                shape=[1]
            )
            z = z + b
        z = activation(z) if activation else z
        z = tf.nn.dropout(z, dropout) if dropout else z
        # Squeeze only the unit output axis. A bare tf.squeeze would also drop
        # the batch axis for a batch of one and return a scalar.
        return tf.squeeze(z, axis=[1])
def time_distributed_dense_layer(inputs, output_units, bias=True, activation=None, dropout=None,
                                 scope='time-distributed-dense-layer', reuse=False):
    """
    Apply one dense layer, with weights shared across timesteps, to a rank-3 tensor.

    Args:
        inputs: Tensor of shape [batch size, max sequence length, input_units]
            (the einsum below requires exactly three axes).
        output_units: Number of output units.
        bias: When truthy, a learned bias of shape [output_units] is added.
        activation: Optional callable applied to the affine output.
        dropout: Dropout keep prob, handed to tf.nn.dropout as its second
            argument; dropout is skipped when this is falsy.
        scope: Name of the variable scope holding 'weights' and 'biases'.
        reuse: Forwarded to tf.variable_scope.

    Returns:
        Tensor of shape [batch size, max sequence length, output_units].
    """
    with tf.variable_scope(scope, reuse=reuse):
        input_units = shape(inputs, -1)
        weights = tf.get_variable(
            name='weights',
            shape=[input_units, output_units],
            initializer=tf.contrib.layers.variance_scaling_initializer(),
        )
        # Contract the feature axis of every timestep against the shared weights.
        outputs = tf.einsum('ijk,kl->ijl', inputs, weights)

        if bias:
            biases = tf.get_variable(
                name='biases',
                shape=[output_units],
                initializer=tf.constant_initializer(),
            )
            outputs = outputs + biases

        if activation:
            outputs = activation(outputs)
        if dropout:
            outputs = tf.nn.dropout(outputs, dropout)
        return outputs
def run_irl(world, car, reward, theta, data):
    """Fit ``theta`` by maximising an objective built from ``car``'s trajectory reward.

    The objective is ``g . H^-1 . g + log det(-H)``, where ``g`` and ``H`` are
    the gradient and Hessian of the reward with respect to ``car.traj.u``, and
    ``H`` has ``0.1 * I`` subtracted from it first. The final value of
    ``theta`` is printed.

    Args:
        world: Object exposing ``cars``; each car's ``traj.x0`` / ``traj.u``
            shared variables are overwritten while iterating ``data``.
        car: The car whose trajectory reward is differentiated.
        reward: Reward expression passed to ``car.traj.reward``.
        theta: Shared variable being optimised.
        data: Iterable of dicts with keys ``'x0'`` and ``'u'``, zipped against
            ``world.cars``.
    """
    def gen():
        # Load one recorded data point at a time into the cars' trajectory
        # variables, yielding after each point so the caller can evaluate
        # the objective at that point.
        # NOTE(review): nesting reconstructed from a flattened source line -
        # confirm the yield is per data point.
        for point in data:
            for c, x0, u in zip(world.cars, point['x0'], point['u']):
                c.traj.x0.set_value(x0)
                for cu, uu in zip(c.traj.u, u):
                    cu.set_value(uu)
            yield

    r = car.traj.reward(reward)
    g = utils.grad(r, car.traj.u)
    H = utils.hessian(r, car.traj.u)
    I = tt.eye(utils.shape(H)[0])
    # Subtract a small multiple of the identity from the Hessian before
    # inverting it and taking det(-H).
    reg = utils.vector(1)
    reg.set_value([1e-1])
    H = H - reg[0] * I
    L = tt.dot(g, tt.dot(tn.MatrixInverse()(H), g)) + tt.log(tn.Det()(-H))
    # Walk the data once up front; this leaves the last point loaded.
    for _ in gen():
        pass
    optimizer = utils.Maximizer(L, [theta],
                                gen=gen,
                                method='gd',
                                eps=0.1,
                                debug=True,
                                iters=1000,
                                inf_ignore=10)
    optimizer.maximize()
    # Parenthesised so the call parses on Python 3 and prints the same on Python 2.
    print(theta.get_value())
def argmax_attentive_matching(a, b, a_lengths, b_lengths, max_seq_len, attention_func=dot_attention,
                              attention_func_kwargs=None):
    """
    Matches each vector in a with the single vector in b that receives the
    highest attention weight. The attention matrix is computed using
    attention_func.

    Args:
        a: Input sequence a. Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b. Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a. Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b. Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b. Integer.
        attention_func: Function used to calculate attention matrix. Can be one of the following:
            multiplicative_attention, additive_attention, concat_attention, dot_attention,
            or cosine_attention.
        attention_func_kwargs: Keyword arguments to pass to attention_func.
            ``None`` (the default) means no extra keyword arguments.

    Returns:
        Tensor of shape [batch_size, max_seq_len, input_size] consisting of the matching vectors for
        each timestep in a.
    """
    # None instead of a shared mutable {} default; behaves the same for callers.
    extra_kwargs = attention_func_kwargs or {}
    attn = attention_func(a, b, a_lengths, b_lengths, max_seq_len, **extra_kwargs)

    # Position in b with the largest attention weight for every timestep of a.
    b_match_idx = tf.argmax(attn, axis=2)
    # Pair every matched position with its batch index for gather_nd
    # (int64 to match the dtype tf.argmax returns by default).
    batch_index = tf.tile(
        tf.expand_dims(tf.range(shape(b, 0), dtype=tf.int64), 1),
        (1, max_seq_len))
    b_idx = tf.stack([batch_index, b_match_idx], axis=2)
    return tf.gather_nd(b, b_idx)
def temporal_convolution_layer(inputs, output_units, convolution_width, bias=True, activation=None,
                               dropout=None, scope='time-distributed-conv-layer', reuse=False):
    """
    Convolve over the temporal axis of sequence data (stride 1, 'SAME' padding).

    Args:
        inputs: Tensor of shape [batch size, max sequence length, input_units].
        output_units: Output channels for convolution.
        convolution_width: Number of timesteps (words) to use in convolution.
        bias: When truthy, a learned bias of shape [output_units] is added.
        activation: Optional callable applied to the convolution output.
        dropout: Dropout keep prob, handed to tf.nn.dropout as its second
            argument; dropout is skipped when this is falsy.
        scope: Name of the variable scope holding 'weights' and 'biases'.
        reuse: Forwarded to tf.variable_scope.

    Returns:
        Tensor of shape [batch size, max sequence length, output_units].
    """
    with tf.variable_scope(scope, reuse=reuse):
        input_units = shape(inputs, 2)
        kernel = tf.get_variable(
            name='weights',
            shape=[convolution_width, input_units, output_units],
            initializer=tf.contrib.layers.variance_scaling_initializer(),
        )
        outputs = tf.nn.convolution(inputs, kernel, padding='SAME', strides=[1])

        if bias:
            biases = tf.get_variable(
                name='biases',
                shape=[output_units],
                initializer=tf.constant_initializer(),
            )
            outputs = outputs + biases

        if activation:
            outputs = activation(outputs)
        if dropout:
            outputs = tf.nn.dropout(outputs, dropout)
        return outputs
def dense_word_embedding_from_chars(chars, embed_dim, bias=True, scope='dense-word-embed', reuse=False):
    """
    Build word embeddings by projecting every character with a shared dense
    layer and max-pooling over the character axis.

    Args:
        chars: Tensor of shape [batch_size, word sequence length, char sequence length, alphabet size].
            It is cast to float32 before the projection.
        embed_dim: Dimension of word embeddings. Integer.
        bias: When truthy, a learned bias of shape [embed_dim] is added before pooling.
        scope: Name of the variable scope holding 'weights' and 'biases'.
        reuse: Forwarded to tf.variable_scope.

    Returns:
        Sequence of embedding vectors. Tensor of shape [batch_size, word sequence length, embed_dim].
    """
    with tf.variable_scope(scope, reuse=reuse):
        char_features = tf.cast(chars, tf.float32)
        alphabet_size = shape(char_features, -1)
        weights = tf.get_variable(
            name='weights',
            shape=[alphabet_size, embed_dim],
            initializer=tf.contrib.layers.variance_scaling_initializer(),
        )
        projected = tf.einsum('ijkl,lm->ijkm', char_features, weights)

        if bias:
            biases = tf.get_variable(
                name='biases',
                shape=[embed_dim],
                initializer=tf.constant_initializer(),
            )
            projected = projected + biases

        # Axis 2 is the character axis: keep the strongest response per word.
        return tf.reduce_max(projected, 2)
def update_parameters(self, loss):
    """
    Build the training op and store it in ``self.step``.

    When ``self.regularization_constant`` is non-zero, the sum of the L2 norms
    of all trainable variables (scaled by that constant) is added to the loss.
    Gradients are clipped element-wise to ``[-self.grad_clip, self.grad_clip]``
    before being applied. With ``self.enable_parameter_averaging`` set,
    ``self.step`` also updates the moving averages in ``self.ema`` after the
    optimizer step.

    Finally the names and shapes of all global / trainable variables and the
    total trainable parameter count are logged.

    :param loss: scalar loss tensor to minimise.
    :return: None; the result is the ``self.step`` attribute.
    """
    if self.regularization_constant != 0:
        # Sum over trainable variables of sqrt(sum(param ** 2)), i.e. the sum
        # of per-variable L2 norms; pushes each norm towards zero.
        # NOTE(review): the gradient of sqrt is undefined at exactly zero, so
        # an all-zero variable may produce NaN gradients here - confirm.
        l2_norm = tf.reduce_sum([
            tf.sqrt(tf.reduce_sum(tf.square(param)))
            for param in tf.trainable_variables()
        ])
        loss = loss + self.regularization_constant * l2_norm

    optimizer = self.get_optimizer(self.learning_rate_var)
    # List of (gradient, variable) pairs.
    grads = optimizer.compute_gradients(loss)
    # Clip every gradient element to [-grad_clip, grad_clip]. Variables that
    # do not influence the loss come back with a None gradient; skip them
    # instead of crashing inside clip_by_value.
    clipped = [(tf.clip_by_value(g, -self.grad_clip, self.grad_clip), v_)
               for g, v_ in grads
               if g is not None]
    # Apply the gradient update.
    step = optimizer.apply_gradients(clipped, global_step=self.global_step)

    if self.enable_parameter_averaging:
        # Create the averaging op under the control dependency so it really
        # runs after the optimizer step. Creating it beforehand only ties the
        # group op to the step, leaving the averaging itself unordered.
        with tf.control_dependencies([step]):
            maintain_averages_op = self.ema.apply(tf.trainable_variables())
        self.step = tf.group(maintain_averages_op)
    else:
        self.step = step

    logging.info('all parameters:')
    logging.info(
        pp.pformat([(var.name, shape(var)) for var in tf.global_variables()]))

    logging.info('trainable parameters:')
    logging.info(
        pp.pformat([(var.name, shape(var))
                    for var in tf.trainable_variables()]))

    logging.info('trainable parameter count:')
    # Total number of trainable scalars: product of each variable's shape,
    # summed. The builtin sum avoids feeding a generator to np.sum, which
    # NumPy has deprecated.
    logging.info(
        str(sum(np.prod(shape(var)) for var in tf.trainable_variables())))
def logEvidence(self, tau, y):
    """Compute a Gaussian log-evidence style quantity for ``y`` given ``tau``.

    As written: ``-0.5 * N * C * log(2 * pi) - 0.5 * logdet(K)`` minus half of
    a data-fit term built from ``y`` reshaped to ``[N * self.C, W]`` and
    ``solve(K, y)``, where ``N = tf.size(tau)`` and ``W = shape(y)[2]``.

    NOTE(review): several things here need confirming before this is relied on:
      * if ``self.K`` returns a pair of matrices, ``tf.real`` receives a tuple
        rather than one covariance matrix - which one is intended?
      * the literal ``1`` is the third positional argument of ``tf.multiply``
        (its ``name``), not the reduction axis of ``tf.reduce_sum``;
      * ``tf.solve`` is not part of the TF 1.x public API that provides
        ``tf.linalg.logdet`` (``tf.matrix_solve`` / ``tf.linalg.solve``);
      * ``N`` is an integer tensor multiplied by Python floats;
      * ``pi`` must be supplied by the enclosing module.
    """
    K = tf.real(self.K(tau))
    logdetK = tf.linalg.logdet(K)
    N = tf.size(tau)
    W = shape(y)[2]
    #y = reshape(y,[N*self.C,W])
    y2 = tf.reshape(y, [N * self.C, W])
    # Normalisation constant and log-determinant terms.
    res = -0.5 * N * self.C * tf.log(2 * pi) - 0.5 * logdetK
    # Data-fit term. NOTE(review): the trailing 1 binds to tf.multiply, see docstring.
    res2 = res - 0.5 * tf.reduce_sum(tf.multiply(y2, tf.solve(K, y2), 1))
    return res2
def _net(self):
    """Build the two-source separation network on top of ``self.x_mixed``.

    A stack of ``self.n_layer`` GRU cells (``self.hidden_size`` units each)
    runs over the mixture. Two ReLU dense heads, 'y_hat_src1' and
    'y_hat_src2', project the RNN output back to the size of the mixture's
    third axis. Each head is then turned into a ratio mask (head divided by
    the sum of both heads plus machine epsilon) and multiplied with the
    mixture.

    Returns:
        Tuple ``(y_tilde_src1, y_tilde_src2)`` of masked mixtures.
    """
    # RNN and dense layers
    stacked_gru = MultiRNNCell(
        [GRUCell(self.hidden_size) for _ in range(self.n_layer)])
    output_rnn, rnn_state = tf.nn.dynamic_rnn(stacked_gru, self.x_mixed, dtype=tf.float32)
    input_size = shape(self.x_mixed)[2]

    y_hat_src1 = tf.layers.dense(
        inputs=output_rnn, units=input_size, activation=tf.nn.relu, name='y_hat_src1')
    y_hat_src2 = tf.layers.dense(
        inputs=output_rnn, units=input_size, activation=tf.nn.relu, name='y_hat_src2')

    # time-freq masking layer; eps keeps the division finite when both heads are 0.
    total = y_hat_src1 + y_hat_src2 + np.finfo(float).eps
    y_tilde_src1 = y_hat_src1 / total * self.x_mixed
    y_tilde_src2 = y_hat_src2 / total * self.x_mixed
    return y_tilde_src1, y_tilde_src2
def get_high_kl_keyframes(frames, inference_dists, prior_dists):
    """Pick ``FLAGS.n_segments`` keyframes per sequence at local maxima of the KL divergence.

    Both distribution tensors are split in half along the last axis and the
    halves are passed as the two arguments of ``Gaussian``. The KL divergence
    between inference and prior is summed over the last axis. Positions that
    are not strict local maxima along axis 0 (including the first and last
    position) are zeroed, and the indices of the largest remaining values are
    taken and sorted in ascending order.

    Assumes axis 0 is time and axis 1 is batch - TODO confirm against callers.

    Returns:
        Tuple ``(gathered_kfs, kf_idxs_binary, kl)``: the frames gathered at
        the selected indices, a multi-hot indicator of the selected indices
        with depth ``shape(frames)[0]``, and the unfiltered summed KL.
    """
    half = int(shape(inference_dists)[-1] / 2)
    batch_size = shape(inference_dists)[1]
    n_kfs = FLAGS.n_segments

    inference = Gaussian(inference_dists[..., :half], inference_dists[..., half:])
    prior = Gaussian(prior_dists[..., :half], prior_dists[..., half:])
    kl = tf.reduce_sum(inference.kl_divergence(prior), axis=-1)

    # filter with maxima: an interior step counts when it beats both neighbours
    interior = kl[1:-1]
    maxima = tf.logical_and(interior > kl[2:], interior > kl[:-2])
    # Pad one "not a maximum" row at each end so the mask lines up with kl.
    edge = tf.zeros([1] + shape(maxima)[1:], dtype=maxima.dtype)
    mask = tf.concat([edge, maxima, edge], axis=0)
    filtered_kl = kl * tf.cast(mask, dtype=kl.dtype)

    ranked = tf.contrib.framework.argsort(filtered_kl, axis=0, direction='DESCENDING')
    kf_idxs = tf.contrib.framework.sort(ranked[:n_kfs], axis=0)

    one_hot = tf.one_hot(kf_idxs, depth=shape(frames)[0], axis=0)
    kf_idxs_binary = tf.reduce_sum(one_hot, axis=1)

    # Pair every selected index with its batch index for gather_nd.
    batch_idxs = tf.tile(tf.expand_dims(tf.range(batch_size), axis=0), [n_kfs, 1])
    gather_idxs = tf.reshape(tf.stack((kf_idxs, batch_idxs), axis=-1), (-1, 2))
    flat_kfs = tf.gather_nd(frames, gather_idxs)
    gathered_kfs = tf.reshape(flat_kfs, [n_kfs, batch_size] + shape(flat_kfs)[1:])
    return gathered_kfs, kf_idxs_binary, kl
def run(self, input, state, inference_seq, use_inference, inference_attention_keys=None, attention_idxs=None,
        predict_dt=True, step=None, z_sequence=None, goal_latent=None):
    """
    Execute one step of the high-level core.

    The latent ``z`` is sampled from the attended inference distribution when
    ``use_inference`` is truthy, otherwise from the prior. It is appended to
    the squeezed current frame encoding and the goal latent and fed to the
    base core.

    :param input: dict-like with "frame" and "next_prior_dists"; "attention_key" is read
        when ``FLAGS.separate_attention_key`` is set.
    :param state: recurrent state handed to the base core.
    :param inference_seq: passed via kwargs, non-optional
    :param use_inference: passed via kwargs, non-optional
    :param inference_attention_keys: forwarded to ``_attend_inference``.
    :param attention_idxs: forwarded to ``_attend_inference``.
    :param predict_dt: forwarded to ``_format_output``.
    :param step: Step index in the LSTM execution.
    :param z_sequence: when given, ``z_sequence[step]`` replaces the sampled latent.
    :param goal_latent: appended to the core input after the frame encoding.
    :return: tuple ``(output, new_state)``.
    """
    flags = tf.flags.FLAGS
    # NOTE(review): squeeze without an axis drops every size-1 dimension - confirm intended.
    current_frame_enc = tf.squeeze(input["frame"])
    learned_prior = input["next_prior_dists"]

    if flags.hl_learned_prior:
        prior = learned_prior
    else:
        prior = utils.get_fixed_prior(shape(learned_prior))

    if flags.separate_attention_key:
        attention_query = input["attention_key"]
    else:
        attention_query = current_frame_enc
    inference_dists, attention_weights = self._attend_inference(
        attention_query, inference_seq, inference_attention_keys, attention_idxs, step)

    # The first half of the last axis is used as the mean, the second half is
    # exponentiated to give the standard deviation.
    dist_dim = int(self._prior_latent_size / 2)
    source_dists = inference_dists if use_inference else prior
    mu = source_dists[:, :dist_dim]
    std_dev = tf.exp(source_dists[:, dist_dim:])

    if z_sequence is None:
        z = self._sample(mu, std_dev)
    else:
        z = z_sequence[step]

    lstm_input = self.append(self.append(z, current_frame_enc), goal_latent)
    lstm_output, new_state = self._base_core(lstm_input, state)

    output = self._format_output(lstm_output, prior, inference_dists, z, predict_dt, attention_weights)
    return output, new_state
def _setup_output(self, input_data, n_frames_input, action_sequence, encoded, predicted, decoded, is_training, abs_action_sequence):
    """ Extends the parent's model output with re-inferred distributions.

    The decoded low-level frames (or coordinates, without image input) are
    passed through ``self.conv_encoder`` again. The inference RNN is then run
    on the last past encoding followed by the re-encoded sequence, and its
    outputs from index 1 onward are stored under
    ``model_output["inference_dists_reencode"]``.

    Returns:
        The parent's ``model_output`` with the extra key added.
    """
    model_output = super(StochasticSingleStepPredictorKFDetect, self)._setup_output(input_data, n_frames_input, action_sequence, encoded, predicted, decoded, is_training, abs_action_sequence)
    # Batch size is read from axis 1 of the decoded frames.
    batch_size = shape(model_output["decoded_low_level_frames"])[1]
    # reencode generate frames and compute inference distributions (mostly for test time)
    if not self._has_image_input:
        reencoded_seq = snt.BatchApply(self.conv_encoder)(model_output["decoded_low_level_coords"], False)
    else:
        reencoded_seq, _ = snt.BatchApply(
            self.conv_encoder)(self._output_activation(model_output["decoded_low_level_frames"]), False)
        # Drop the two trailing axes when the encoding is rank 5.
        # NOTE(review): nesting of this check was reconstructed from a
        # flattened source line - confirm it belongs to the image branch.
        if len(shape(reencoded_seq)) == 5:
            reencoded_seq = reencoded_seq[:, :, :, 0, 0]
    with tf.name_scope("reinfer_rnn"):
        inference_rnn_init_state = self._inference_rnn_init_state_getter(batch_size)
        # Prepend the last past encoding so the RNN starts from the observed context.
        inference_rnn_output, _ = self.inference_rnn(
            input_seq=tf.concat((encoded["past"][-1:], reencoded_seq), axis=0),
            initial_state=inference_rnn_init_state)
    # Skip the output that belongs to the prepended past frame.
    model_output["inference_dists_reencode"] = inference_rnn_output[1:]
    return model_output
def _compute_cdna_kernels(self, input):
    """Turn an activation tensor into spatially normalised CDNA kernels.

    The input is flattened per batch element and passed through
    ``self._cdna_kernel_layer``. The result is reshaped to
    ``(batch, cdna_kernel_size, cdna_kernel_size, num_cdna_kernels)``,
    clamped from below at ``RELU_SHIFT`` and divided by its sum over the two
    spatial axes.

    Returns:
        Kernel tensor whose entries sum to one over axes 1 and 2.
    """
    batch_size = shape(input)[0]

    # flatten input activations
    flat_activation = tf.reshape(input, [batch_size, -1])
    kernels = self._cdna_kernel_layer(flat_activation)

    kernel_shape = (batch_size, self.cdna_kernel_size, self.cdna_kernel_size, self.num_cdna_kernels)
    kernels = tf.reshape(kernels, kernel_shape)

    # do not append identity kernel, as we only use last input image and it is skipped to the end anyways
    # Lower-bound every entry at RELU_SHIFT (strictly positive assuming RELU_SHIFT > 0).
    kernels = tf.nn.relu(kernels - RELU_SHIFT) + RELU_SHIFT

    # normalize spatially
    spatial_sum = tf.reduce_sum(kernels, axis=[1, 2], keep_dims=True)
    return kernels / spatial_sum
def perform_matching(self, mention_emb, question_emb):
    """Score every mention embedding against the question embedding.

    The operator is selected by ``self.general_config.matching_op``:

    * ``"matmul"``: dot product between each mention and the question.
    * ``"concat"``: the question is expanded along the mention axis,
      concatenated with the mentions and passed through
      ``self.concat_projector``.

    Args:
        mention_emb: Tensor of shape [batch, mention, emb].
        question_emb: Tensor of shape [batch, emb].

    Returns:
        Logits tensor of shape [batch, mention].

    Raises:
        AssertionError: for any other ``matching_op``. It is raised explicitly
            so the check survives ``python -O``, where a bare ``assert`` is
            stripped and the function would silently return ``None``.
    """
    matching_op = self.general_config.matching_op

    if matching_op == "matmul":
        # [batch, mention, emb] * [batch, emb, 1] ==> [batch, mention, 1]
        return mention_emb.matmul(question_emb.unsqueeze(dim=2)).squeeze(dim=2)

    if matching_op == "concat":
        mention_max_num = utils.shape(mention_emb, 1)
        tiled_question = question_emb.unsqueeze(dim=1).expand(-1, mention_max_num, -1)
        combined_emb = torch.cat([mention_emb, tiled_question], dim=2)  # [batch, mention, emb]
        return self.concat_projector(combined_emb).squeeze(dim=2)  # [batch, mention]

    raise AssertionError("Unsupported matching_op: {}".format(matching_op))
def concat_attention(a, b, a_lengths, b_lengths, max_seq_len, hidden_units=150, scope='concat-attention',
                     reuse=False):
    """
    For sequences a and b of lengths a_lengths and b_lengths, computes an attention matrix attn,
    where attn(i, j) = dot(v, tanh(W*[a_i; b_j])). v is a learnable vector and W is a learnable
    matrix. The rows of attn are softmax normalized.

    Args:
        a: Input sequence a. Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b. Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a. Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b. Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b. Integer.
        hidden_units: Number of hidden units. Integer.
        scope: Name of the variable scope holding 'matmul_weights' and 'dot_weights'.
        reuse: Forwarded to tf.variable_scope.

    Returns:
        Attention matrix. Tensor of shape [batch_size, max_seq_len, max_seq_len].
    """
    with tf.variable_scope(scope, reuse=reuse):
        # tf.concat does not broadcast: every axis except the concat axis must
        # match. Tile both sequences to [batch, max_seq_len, max_seq_len,
        # input_size], so position (i, j) holds a_i and b_j, before joining.
        a = tf.tile(tf.expand_dims(a, 2), (1, 1, max_seq_len, 1))
        b = tf.tile(tf.expand_dims(b, 1), (1, max_seq_len, 1, 1))
        c = tf.concat([a, b], axis=3)
        W = tf.get_variable(
            name='matmul_weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[shape(c, -1), hidden_units]
        )
        cW = tf.einsum('ijkl,lm->ijkm', c, W)
        v = tf.get_variable(
            name='dot_weights',
            initializer=tf.ones_initializer(),
            shape=[hidden_units]
        )
        logits = tf.einsum('ijkl,l->ijk', tf.nn.tanh(cW), v)
        # Subtract the row maximum before exponentiating for numerical stability.
        logits = logits - tf.expand_dims(tf.reduce_max(logits, axis=2), 2)
        attn = tf.exp(logits)
        attn = mask_attention_weights(attn, a_lengths, b_lengths, max_seq_len)
        # The small constant avoids division by zero when a row sums to zero.
        return attn / tf.expand_dims(tf.reduce_sum(attn, axis=2) + 1e-10, 2)
def run_irl(world, car, reward, theta, data):
    """Fit ``theta`` by maximising an objective built from ``car``'s trajectory reward.

    The objective is ``g . H^-1 . g + log det(-H)``, where ``g`` and ``H`` are
    the gradient and Hessian of the reward with respect to ``car.traj.u``, and
    ``H`` has ``0.1 * I`` subtracted from it first. The final value of
    ``theta`` is printed.

    Args:
        world: Object exposing ``cars``; each car's ``traj.x0`` / ``traj.u``
            shared variables are overwritten while iterating ``data``.
        car: The car whose trajectory reward is differentiated.
        reward: Reward expression passed to ``car.traj.reward``.
        theta: Shared variable being optimised.
        data: Iterable of dicts with keys ``'x0'`` and ``'u'``, zipped against
            ``world.cars``.
    """
    def gen():
        # Load one recorded data point at a time into the cars' trajectory
        # variables, yielding after each point so the caller can evaluate
        # the objective at that point.
        # NOTE(review): nesting reconstructed from a flattened source line -
        # confirm the yield is per data point.
        for point in data:
            for c, x0, u in zip(world.cars, point['x0'], point['u']):
                c.traj.x0.set_value(x0)
                for cu, uu in zip(c.traj.u, u):
                    cu.set_value(uu)
            yield

    r = car.traj.reward(reward)
    g = utils.grad(r, car.traj.u)
    H = utils.hessian(r, car.traj.u)
    I = tt.eye(utils.shape(H)[0])
    # Subtract a small multiple of the identity from the Hessian before
    # inverting it and taking det(-H).
    reg = utils.vector(1)
    reg.set_value([1e-1])
    H = H-reg[0]*I
    L = tt.dot(g, tt.dot(tn.MatrixInverse()(H), g))+tt.log(tn.Det()(-H))
    # Walk the data once up front; this leaves the last point loaded.
    for _ in gen():
        pass
    optimizer = utils.Maximizer(L, [theta], gen=gen, method='gd', eps=0.1, debug=True, iters=1000, inf_ignore=10)
    optimizer.maximize()
    # Parenthesised so the call parses on Python 3 and prints the same on Python 2.
    print(theta.get_value())
def dists_keyframe_to_first_segment(propagated_distributions, n_loss_frames):
    """
    Convert per-segment keyframe distributions into distributions over the
    first frame of each segment.

    The last segment is discarded. Every remaining distribution is cropped or
    zero-padded along axis 1 to exactly ``n_loss_frames`` entries, the results
    are stacked, and each is shifted right by one frame (a zero is inserted at
    the front, the last entry is dropped). A one-hot distribution on frame 0
    is prepended for the first segment.

    :param propagated_distributions: list of tensors batchsize x frames_in_distribution, len = segments
    :param n_loss_frames: number of frames every distribution is cropped / padded to
    :return: tensor of shape [segments, batchsize, n_loss_frames]
    """
    batch_size = shape(propagated_distributions[0])[0]

    # Pad the distributions for every segment so that they are the same size
    # this only doubles the computation needed for the loss, but we don't need nested loops
    same_size = []
    for dist in propagated_distributions[:-1]:  # discard the last segment
        length = dist.get_shape().as_list()[1]
        if length > n_loss_frames:
            dist = dist[:, :n_loss_frames]
        if length < n_loss_frames:
            dist = tf.pad(dist, [[0, 0], [0, n_loss_frames - length]])
        same_size.append(dist)
    stacked = tf.stack(same_size)

    # Transform the distributions for keyframes to distributions of first frames in a segment
    shifted = tf.pad(stacked[:, :, :-1], [[0, 0], [0, 0], [1, 0]])

    # Add the first segment: all probability mass on frame 0.
    first_segment = tf.concat(
        [tf.ones([1, batch_size, 1]), tf.zeros([1, batch_size, n_loss_frames - 1])],
        axis=2)
    return tf.concat([first_segment, shifted], axis=0)
def _net(self):
    """Build the two-source separation network on top of ``self.x_mixed``.

    A stack of ``self.n_layer`` GRU cells (``self.hidden_size`` units each)
    runs over the mixture. Two ReLU dense heads, 'y_hat_src1' and
    'y_hat_src2', project the RNN output back to the size of the mixture's
    third axis. Each head is then turned into a ratio mask (head divided by
    the sum of both heads plus machine epsilon) and multiplied with the
    mixture.

    Returns:
        Tuple ``(y_tilde_src1, y_tilde_src2)`` of masked mixtures.
    """
    # RNN and dense layers
    stacked_gru = MultiRNNCell(
        [GRUCell(self.hidden_size) for _ in range(self.n_layer)])
    output_rnn, rnn_state = tf.nn.dynamic_rnn(stacked_gru,
                                              self.x_mixed,
                                              dtype=tf.float32)
    input_size = shape(self.x_mixed)[2]

    y_hat_src1 = tf.layers.dense(inputs=output_rnn,
                                 units=input_size,
                                 activation=tf.nn.relu,
                                 name='y_hat_src1')
    y_hat_src2 = tf.layers.dense(inputs=output_rnn,
                                 units=input_size,
                                 activation=tf.nn.relu,
                                 name='y_hat_src2')

    # time-freq masking layer; eps keeps the division finite when both heads are 0.
    total = y_hat_src1 + y_hat_src2 + np.finfo(float).eps
    y_tilde_src1 = y_hat_src1 / total * self.x_mixed
    y_tilde_src2 = y_hat_src2 / total * self.x_mixed
    return y_tilde_src1, y_tilde_src2
def concat_attention(a, b, a_lengths, b_lengths, max_seq_len, hidden_units=150, scope='concat-attention',
                     reuse=False):
    """
    For sequences a and b of lengths a_lengths and b_lengths, computes an attention matrix attn,
    where attn(i, j) = dot(v, tanh(W*[a_i; b_j])). v is a learnable vector and W is a learnable
    matrix. The rows of attn are softmax normalized.

    Args:
        a: Input sequence a. Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b. Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a. Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b. Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b. Integer.
        hidden_units: Number of hidden units. Integer.
        scope: Name of the variable scope holding 'matmul_weights' and 'dot_weights'.
        reuse: Forwarded to tf.variable_scope.

    Returns:
        Attention matrix. Tensor of shape [batch_size, max_seq_len, max_seq_len].
    """
    with tf.variable_scope(scope, reuse=reuse):
        # tf.concat does not broadcast: every axis except the concat axis must
        # match. Tile both sequences to [batch, max_seq_len, max_seq_len,
        # input_size], so position (i, j) holds a_i and b_j, before joining.
        a = tf.tile(tf.expand_dims(a, 2), (1, 1, max_seq_len, 1))
        b = tf.tile(tf.expand_dims(b, 1), (1, max_seq_len, 1, 1))
        c = tf.concat([a, b], axis=3)
        W = tf.get_variable(
            name='matmul_weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[shape(c, -1), hidden_units]
        )
        cW = tf.einsum('ijkl,lm->ijkm', c, W)
        v = tf.get_variable(
            name='dot_weights',
            initializer=tf.ones_initializer(),
            shape=[hidden_units]
        )
        logits = tf.einsum('ijkl,l->ijk', tf.nn.tanh(cW), v)
        # Subtract the row maximum before exponentiating for numerical stability.
        logits = logits - tf.expand_dims(tf.reduce_max(logits, axis=2), 2)
        attn = tf.exp(logits)
        attn = mask_attention_weights(attn, a_lengths, b_lengths, max_seq_len)
        # The small constant avoids division by zero when a row sums to zero.
        return attn / tf.expand_dims(tf.reduce_sum(attn, axis=2) + 1e-10, 2)
def temporal_convolution_layer(inputs, output_units, convolution_width, bias=True, activation=None,
                               dropout=None, scope='time-distributed-conv-layer', reuse=False):
    """
    Convolve over the temporal axis of sequence data (stride 1, 'SAME' padding).

    Args:
        inputs: Tensor of shape [batch size, max sequence length, input_units].
        output_units: Output channels for convolution.
        convolution_width: Number of timesteps (words) to use in convolution.
        bias: When truthy, a learned bias of shape [output_units] is added.
        activation: Optional callable applied to the convolution output.
        dropout: Dropout keep prob, handed to tf.nn.dropout as its second
            argument; dropout is skipped when this is falsy.
        scope: Name of the variable scope holding 'weights' and 'biases'.
        reuse: Forwarded to tf.variable_scope.

    Returns:
        Tensor of shape [batch size, max sequence length, output_units].
    """
    with tf.variable_scope(scope, reuse=reuse):
        input_units = shape(inputs, 2)
        kernel = tf.get_variable(
            name='weights',
            shape=[convolution_width, input_units, output_units],
            initializer=tf.contrib.layers.variance_scaling_initializer(),
        )
        outputs = tf.nn.convolution(inputs, kernel, padding='SAME', strides=[1])

        if bias:
            biases = tf.get_variable(
                name='biases',
                shape=[output_units],
                initializer=tf.constant_initializer(),
            )
            outputs = outputs + biases

        if activation:
            outputs = activation(outputs)
        if dropout:
            outputs = tf.nn.dropout(outputs, dropout)
        return outputs
def dense(a_enc, b_enc, bias=True, activation=None, dropout=None, scope='dense', reuse=False):
    """
    Compare the encoded representations a_enc and b_enc using a learnable
    parameterized function in the form of a dense layer applied to the
    concatenation of a_enc and b_enc.

    Args:
        a_enc: Encoded representation of sequence a. Tensor of shape [batch_size, input_units].
        b_enc: Encoded representation of sequence b. Tensor of shape [batch_size, input_units].
        bias: When truthy, a learned scalar bias is added.
        activation: Activation function.
        dropout: Dropout keep prob. Float. Dropout is skipped when falsy.
        scope: Name of the variable scope holding 'weights' and 'biases'.
        reuse: Forwarded to tf.variable_scope.

    Returns:
        Tensor of shape [batch size].
    """
    with tf.variable_scope(scope, reuse=reuse):
        inputs = tf.concat([a_enc, b_enc], axis=1)
        W = tf.get_variable(
            name='weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[shape(inputs, -1), 1])
        z = tf.matmul(inputs, W)
        if bias:
            # Separate local name: the bias must not shadow the b_enc input.
            b = tf.get_variable(name='biases',
                                initializer=tf.constant_initializer(),
                                shape=[1])
            z = z + b
        z = activation(z) if activation else z
        z = tf.nn.dropout(z, dropout) if dropout else z
        # Squeeze only the unit output axis. A bare tf.squeeze would also drop
        # the batch axis for a batch of one and return a scalar.
        return tf.squeeze(z, axis=[1])
def argmax_attentive_matching(a, b, a_lengths, b_lengths, max_seq_len, attention_func=dot_attention,
                              attention_func_kwargs=None):
    """
    Matches each vector in a with the single vector in b that receives the
    highest attention weight. The attention matrix is computed using
    attention_func.

    Args:
        a: Input sequence a. Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b. Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a. Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b. Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b. Integer.
        attention_func: Function used to calculate attention matrix. Can be one of the following:
            multiplicative_attention, additive_attention, concat_attention, dot_attention,
            or cosine_attention.
        attention_func_kwargs: Keyword arguments to pass to attention_func.
            ``None`` (the default) means no extra keyword arguments.

    Returns:
        Tensor of shape [batch_size, max_seq_len, input_size] consisting of the matching vectors for
        each timestep in a.
    """
    # None instead of a shared mutable {} default; behaves the same for callers.
    extra_kwargs = attention_func_kwargs or {}
    attn = attention_func(a, b, a_lengths, b_lengths, max_seq_len, **extra_kwargs)

    # Position in b with the largest attention weight for every timestep of a.
    b_match_idx = tf.argmax(attn, axis=2)
    # Pair every matched position with its batch index for gather_nd
    # (int64 to match the dtype tf.argmax returns by default).
    batch_index = tf.tile(
        tf.expand_dims(tf.range(shape(b, 0), dtype=tf.int64), 1),
        (1, max_seq_len))
    b_idx = tf.stack([batch_index, b_match_idx], axis=2)
    return tf.gather_nd(b, b_idx)
def _net(self):
    """Build the 'flat-R-VAE' encoder / decoder graph.

    Encoder: a stacked bidirectional RNN over ``self.x_mixed``. The final
    forward and backward hidden states are concatenated and projected to the
    mean and (softplus) scale of a diagonal Gaussian ``q_z``.

    Decoder: a sample ``z`` from ``q_z`` is tiled along axis 1 of
    ``self.x_drums`` and concatenated to it, then decoded with a
    ``Seq2SeqLstmDecoder`` driven by a ``TrainingHelper``. The decoder's
    initial state is derived from ``z``.

    Side effects: sets ``self.x_length``, ``self.x_length_b`` and
    ``self.input_shape_``.

    Returns:
        Tuple ``(flat_rnn_output, q_z)``.
    """
    hparams = CONFIG_MAP['flat-R-VAE'].hparams

    # RNN and dense layers
    cells_fw = []
    cells_bw = []
    for layer_size in hparams.enc_rnn_size:
        cells_fw.append(
            lstm_utils.rnn_cell([layer_size], hparams.dropout_keep_prob, hparams.residual_encoder))
        cells_bw.append(
            lstm_utils.rnn_cell([layer_size], hparams.dropout_keep_prob, hparams.residual_encoder))

    _, states_fw, states_bw = rnn.stack_bidirectional_dynamic_rnn(
        cells_fw, cells_bw, self.x_mixed, dtype=tf.float32)

    # Note we access the outputs (h) from the states since the backward
    # outputs are reversed to the input order in the returned outputs.
    last_h = tf.concat([states_fw[-1][-1].h, states_bw[-1][-1].h], 1)

    mu = tf.layers.dense(
        last_h,
        hparams.z_size,
        name='encoder/mu',
        kernel_initializer=tf.random_normal_initializer(stddev=0.001))
    sigma = tf.layers.dense(
        last_h,
        hparams.z_size,
        activation=tf.nn.softplus,
        name='encoder/sigma',
        kernel_initializer=tf.random_normal_initializer(stddev=0.001))

    q_z = ds.MultivariateNormalDiag(loc=mu, scale_diag=sigma)
    z = q_z.sample()
    # Repeat the latent along axis 1 of x_drums so it can be concatenated per step.
    repeated_z = tf.tile(tf.expand_dims(z, axis=1), [1, tf.shape(self.x_drums)[1], 1])

    dec_cell = lstm_utils.rnn_cell(
        hparams.dec_rnn_size,
        hparams.dropout_keep_prob,
        hparams.residual_decoder,
        True)
    x_input = tf.concat([self.x_drums, repeated_z], axis=2)

    # NOTE(review): the size of axis 2 of x_mixed is used both as the
    # per-example sequence length and as the output projection width - confirm.
    self.x_length = shape(self.x_mixed)[2]
    self.x_length_b = np.array(hparams.batch_size * [self.x_length]).astype(np.int32)

    helper = seq2seq.TrainingHelper(x_input, self.x_length_b)
    output_layer = layers_core.Dense(self.x_length, name='output_projection')
    initial_state = lstm_utils.initial_cell_state_from_embedding(
        dec_cell, z, name='decoder/z_to_initial_state')
    self.input_shape_ = helper.inputs.shape[2:][0].value

    decoder = lstm_utils.Seq2SeqLstmDecoder(dec_cell,
                                            helper,
                                            initial_state=initial_state,
                                            input_shape=self.input_shape_,
                                            output_layer=output_layer)
    final_output, _, _ = seq2seq.dynamic_decode(decoder, swap_memory=True, scope='decoder')

    flat_rnn_output = flatten_maybe_padded_sequences(final_output.rnn_output, self.x_length_b)
    return flat_rnn_output, q_z
problems = read_file(args.filename) x = [] y = [] data = [] heuristics_options = { "hc": run_hill_climbing, "hcr": run_hill_climbing_reduced, "sa": run_simulated_annealing } for p in problems: start = time.clock() initial_state = shape(p) s = Sudoku(initial_state) s.random_fill() final_state, steps, score = heuristics_options[args.heuristic](args, s) x.append(steps) y.append(score) tclock = time.clock() - start if args.verbose: print("execution time:", tclock) if args.heuristic == "sa": data.append((p, args.heuristic, array2str(final_state.flatten()), s.cost_function(final_state), len(steps), tclock)) else: data.append((p, args.heuristic, array2str(final_state.flatten()),
def forward(self, batch):
    """Score answer candidates for a batch and optionally compute the loss.

    Passage and question token ids are embedded and encoded, mention
    embeddings are matched against the question embedding (once directly and
    once more after every graph-encoding step, when graph encoding is
    enabled), and the per-mention scores are pooled into a distribution over
    candidates.

    Args:
        batch: Mapping that is read with the keys ``'passage_ids'``,
            ``'question_ids'``, ``'question_lens'``, ``'ids'``,
            ``'mention_starts'``, ``'mention_ends'``, ``'mention_types'``,
            ``'mention_nums'``, ``'candidates'``, ``'candidate_num'`` and
            ``'candidate_appear_num'``; additionally ``'edges'`` and
            ``'edge_nums'`` when graph encoding is 'GCN' or 'GRN'; and
            optionally ``'refs'``.

    Returns:
        ``{'predictions': predictions}`` when ``batch`` has no ``'refs'``
        entry (or it is ``None``); otherwise a dict with the keys
        ``'predictions'``, ``'loss'`` and ``'right_count'``.

    Raises:
        AssertionError: If an internal consistency check fails (passage
            length not divisible by the chunk count, NaNs in a distribution,
            a mask marking padded positions as valid, or a prediction that
            points at a padded candidate).
    """
    # The passage is encoded as this many equally sized chunks
    # (presumably one per supporting document -- TODO confirm).
    passage_chunks = 10
    uses_elmo = self.general_config.embedding_model.find('elmo') >= 0

    if uses_elmo:
        batch_size, passage_max_len, other = list(
            batch['passage_ids'].size())
    else:
        batch_size, passage_max_len = list(batch['passage_ids'].size())
    assert passage_max_len % passage_chunks == 0

    chunk_len = passage_max_len // passage_chunks
    if uses_elmo:
        passage_ids = batch['passage_ids'].view(
            batch_size * passage_chunks, chunk_len,
            other)  # [batch*10, passage/10, other]
    else:
        passage_ids = batch['passage_ids'].view(
            batch_size * passage_chunks,
            chunk_len)  # [batch*10, passage/10]

    passage_repre = self.get_repre(
        passage_ids)  # [batch*10, passage/10, elmo_emb]
    passage_repre, _ = self.passage_encoder(
        passage_repre)  # [batch*10, passage/10, lstm_emb]
    emb_size = utils.shape(passage_repre, 2)
    # Stitch the chunks back together into one sequence per example.
    passage_repre = passage_repre.contiguous().view(
        batch_size, passage_max_len, emb_size)

    question_repre = self.get_repre(
        batch['question_ids'])  # [batch, question, elmo_emb]
    question_repre, _ = self.question_encoder(
        question_repre)  # [batch, question, lstm_emb]

    # modeling question
    # The whole question is treated as a single "mention" spanning
    # positions 0 .. question_len - 1 with type 0.
    batch_size = len(batch['ids'])
    # Create helper tensors on the device of the encoded passage instead of
    # unconditionally calling .cuda(), so the model also runs on CPU.
    device = passage_repre.device
    question_starts = torch.zeros(
        batch_size, 1, dtype=torch.long, device=device)  # [batch, 1]
    question_ends = batch['question_lens'].view(batch_size,
                                                1) - 1  # [batch, 1]
    question_types = torch.zeros(
        batch_size, 1, dtype=torch.long, device=device)  # [batch, 1]
    question_mask_float = torch.ones(
        batch_size, 1, dtype=torch.float, device=device)  # [batch, 1]
    question_emb = self.get_mention_embedding(
        question_repre, question_starts, question_ends, question_types,
        question_mask_float).squeeze(dim=1)  # [batch, emb]

    # modeling mentions
    mention_starts = batch['mention_starts']
    mention_ends = batch['mention_ends']
    mention_types = batch['mention_types']
    mention_nums = batch['mention_nums']
    mention_max_num = utils.shape(mention_starts, 1)
    mention_mask = utils.sequence_mask(mention_nums, mention_max_num)
    mention_emb = self.get_mention_embedding(passage_repre, mention_starts,
                                             mention_ends, mention_types,
                                             mention_mask.float())

    if self.general_config.mention_compress_size > 0:
        question_emb = self.mention_compressor(question_emb)
        mention_emb = self.mention_compressor(mention_emb)

    # One matching result before graph encoding, plus one per graph step.
    matching_results = []
    rst_seq = self.perform_matching(mention_emb, question_emb)
    matching_results.append(rst_seq)

    # graph encoding
    if self.general_config.graph_encoding in ('GCN', 'GRN'):
        edges = batch['edges']  # [batch, mention, edge]
        edge_nums = batch['edge_nums']  # [batch, mention]
        edge_max_num = utils.shape(edges, 2)
        edge_mask = utils.sequence_mask(
            edge_nums.view(batch_size * mention_max_num),
            edge_max_num).view(batch_size, mention_max_num,
                               edge_max_num)  # [batch, mention, edge]
        # A padded mention must not own any valid edge.
        assert not (edge_mask &
                    (~mention_mask.unsqueeze(dim=2))).any().item()

        for _ in range(self.general_config.graph_encoding_steps):
            mention_emb_new = self.graph_encoder(mention_emb,
                                                 mention_mask.float(), edges,
                                                 edge_mask.float())
            if self.general_config.graph_residual:
                mention_emb = mention_emb_new + mention_emb
            else:
                mention_emb = mention_emb_new
            rst_graph = self.perform_matching(mention_emb, question_emb)
            matching_results.append(rst_graph)

    if len(matching_results) > 1:
        assert len(matching_results
                   ) == self.general_config.graph_encoding_steps + 1
        matching_results = torch.stack(
            matching_results, dim=2)  # [batch, mention, graph_step+1]
        logits = self.matching_integrater(matching_results).squeeze(
            dim=2)  # [batch, mention]
    else:
        assert len(matching_results) == 1
        logits = matching_results[0]  # [batch, mention]

    candidates, candidate_num, candidate_appear_num = \
        batch['candidates'], batch['candidate_num'], batch['candidate_appear_num']
    _, cand_max_num, cand_pos_max_num = list(candidates.size())
    candidate_mask = utils.sequence_mask(candidate_num,
                                         cand_max_num)  # [batch, cand]
    candidate_appear_mask = utils.sequence_mask(
        candidate_appear_num.view(batch_size * cand_max_num),
        cand_pos_max_num).view(batch_size, cand_max_num,
                               cand_pos_max_num)  # [batch, cand, pos]
    # A padded candidate must not own any valid appearance.
    assert not (candidate_appear_mask &
                (~candidate_mask.unsqueeze(dim=2))).any().item()

    # Several alternative ways of obtaining 'candidate_appear_dist' were
    # experimented with here (softmax over the gathered logits plus a
    # log-mask; clamped exp of the gathered logits with renormalisation;
    # softmax followed by masking, clamping and renormalisation).  The
    # implementation below is the original one and the one in use.
    mention_dist = F.softmax(logits, dim=1)
    if utils.contain_nan(mention_dist):
        print(logits)
        print(mention_dist)
        assert False
    candidate_appear_dist = utils.batch_gather(
        mention_dist, candidates) * candidate_appear_mask.float()
    # Sum the probability mass of every appearance of a candidate.
    candidate_dist = candidate_appear_dist.sum(
        dim=2) * candidate_mask.float()
    candidate_dist = utils.clip_and_normalize(candidate_dist, 1e-6)
    assert not utils.contain_nan(candidate_dist)

    candidate_logits = candidate_dist.log()  # [batch, cand]
    predictions = candidate_logits.argmax(dim=1)  # [batch]
    # The arg-max must never land on a padded candidate slot.
    if not (predictions < candidate_num).all().item():
        print(candidate_dist)
        print(candidate_num)
        assert False

    if 'refs' not in batch or batch['refs'] is None:
        return {'predictions': predictions}

    refs = batch['refs']
    loss = nn.CrossEntropyLoss()(candidate_logits, refs)
    right_count = (predictions == refs).sum()
    return {
        'predictions': predictions,
        'loss': loss,
        'right_count': right_count
    }
def is_wrapped_call_expression(node):
    '''Return the result of matching ``node`` against the wrapped-call shape.

    Corresponds to 'call_expression(ensure_native_compatibility=True)'.
    '''
    wrapped_call_path = (
        'expression.condition_expression__alternate.call_expression'
    )
    return utils.shape(node, wrapped_call_path)
def _predict(self, input_data, n_frames_input, n_frames_predict, batch_size, is_training, params, encoded, input_images, predict_images, action_sequence, abs_action_sequence, z_sequence=None, infer_z_inputs=None, infer_n_zs=None):
    """ Runs the prediction in the latent space.

    Builds, in this order: an optional encoder RNN over the past encodings,
    an inference RNN over the last past encoding followed by
    ``encoded["future_complete"]``, an optional re-inference of the first
    latents of ``z_sequence``, the decoder RNN roll-out, and (when the model
    is not action conditioned) two action regressors.

    Args:
      input_data, params, input_images, predict_images, abs_action_sequence:
        Not read by this method body.
      n_frames_input: Split index into ``action_sequence``; the encoder gets
        ``action_sequence[:n_frames_input]``, the decoder the remainder.
      n_frames_predict: Passed to the decoder RNN as ``rollout_len``.
      batch_size: Passed to the RNN initial-state getters.
      is_training: Passed to the decoder RNN as ``use_inference`` and to the
        conv encoder when ``infer_z_inputs`` is given.
      encoded: Mapping read with the keys ``"past"`` and
        ``"future_complete"`` (assumed time-major, i.e. axis 0 is time --
        TODO confirm).
      action_sequence: Only read when ``self._action_conditioned`` is true.
      z_sequence: Forwarded to the decoder RNN; partially replaced when
        ``infer_z_inputs`` is given.
      infer_z_inputs: Optional inputs from which latents are inferred.
      infer_n_zs: Cast to bool and used as the ``tf.where`` condition that
        selects inferred latents over the given ``z_sequence``.

    Returns:
      dict with the keys ``"seq_past"``, ``"seq_future"``, ``"z_sample"``,
      ``"inference_dists"`` and ``"prior_dists"``; ``"regressed_actions"``
      and ``"regressed_actions_z"`` are added only when the model is not
      action conditioned.
    """
    predicted = dict()

    if shape(encoded["past"])[0] > 1:
        with tf.name_scope("encoder_rnn"):
            encoder_rnn_init_state = self._encoder_rnn_init_state_getter(batch_size)
            encoder_kwargs = {"actions": action_sequence[:n_frames_input]} if self._action_conditioned else {}
            # All past encodings except the last one are consumed here; the
            # last one becomes the first decoder input further down.
            encoder_rnn_output, encoder_rnn_final_state = self.encoder_rnn(
                input_seq=encoded["past"][:-1],
                initial_state=encoder_rnn_init_state,
                **encoder_kwargs)
        debug("Encoder RNN", "input", "encoded[past]", shape(encoded["past"]))
        debug("Encoder RNN", "output", "encoder_rnn_output", shape(encoder_rnn_output))
        if FLAGS.debug:
            print()
        predicted["seq_past"] = encoder_rnn_output
    else:
        # if we condition only on one image we do not need a separate encoding RNN
        encoder_rnn_final_state = self._encoder_rnn_init_state_getter(batch_size)
        predicted["seq_past"] = None

    with tf.name_scope("inference_rnn"):
        inference_rnn_init_state = self._inference_rnn_init_state_getter(batch_size)
        # The last past encoding is prepended to the future encodings, so the
        # output has one more step than encoded["future_complete"].
        inference_rnn_output, _ = self.inference_rnn(
            input_seq=tf.concat((encoded["past"][-1:], encoded["future_complete"]), axis=0),
            initial_state=inference_rnn_init_state)
    debug("Inference RNN", "input", "encoded[future_complete]", shape(encoded["future_complete"]))
    debug("Inference RNN", "output", "inference_rnn_output", shape(inference_rnn_output))
    if FLAGS.debug:
        print()

    if infer_z_inputs is not None:
        # infer the first N latents of the z_sequence
        with tf.name_scope("infer_initial_zs"):
            encoded_z_inputs, _ = snt.BatchApply(self.conv_encoder)(infer_z_inputs, is_training)
            # Drops the two trailing axes by taking index 0 (presumably
            # singleton spatial dims of the conv output -- TODO confirm).
            encoded_z_inputs = encoded_z_inputs[:,:,:,0,0]
            # NOTE(review): this rebinds inference_rnn_output, which is also
            # what the decoder below receives as inference_seq -- confirm
            # that replacing the earlier inference output is intended.
            inference_rnn_output, _ = self.inference_rnn(
                input_seq=encoded_z_inputs,
                initial_state=inference_rnn_init_state)
            # NOTE(review): z_sequence is read here, so it presumably must be
            # supplied whenever infer_z_inputs is -- verify against callers.
            z_dim = shape(z_sequence)[-1]
            z_sequence = tf.where(tf.cast(infer_n_zs, tf.bool), inference_rnn_output[1:, :, :z_dim], z_sequence)

    with tf.name_scope("decoder_rnn"):
        decoder_rnn_initial_input = {"frame": encoded["past"][-1],
                                     "prior_dists": None,
                                     "inference_dists": None,
                                     "z_sample": None}
        # The name encoder_kwargs is reused; these are the decoder's kwargs.
        encoder_kwargs = {"actions": action_sequence[n_frames_input:]} if self._action_conditioned else {}
        decoder_rnn_output, _ = self.decoder_rnn(
            initial_input=decoder_rnn_initial_input,
            initial_state=encoder_rnn_final_state,
            inference_seq=inference_rnn_output[1:],  # shift by one to get output of next frame
            use_inference=is_training,
            rollout_len=n_frames_predict,
            z_sequence=z_sequence,
            **encoder_kwargs)
        predicted_frames = decoder_rnn_output["frame"]
    debug("Decoder RNN", "output", "decoder_rnn_output", shape(predicted_frames))
    if FLAGS.debug:
        print()

    if not self._action_conditioned:
        with tf.name_scope("action_regression"):
            assert FLAGS.train_action_regressor, "Need action regressor flag = True for SSSP!"
            # Pair every frame with its predecessor (the last past encoding
            # precedes the first predicted frame) along the feature axis.
            act_reg_input = tf.concat((tf.concat((encoded["past"][-1:], predicted_frames[:-1]), axis=0),
                                       predicted_frames), axis=-1)
            # stop_gradient keeps the regressor from back-propagating into
            # the tensors it is fed.
            regressed_actions = snt.BatchApply(self.action_discriminator)(tf.stop_gradient(act_reg_input))
        with tf.name_scope("z_action_regression"):
            # Same as above, but pairs the preceding frame with the sampled
            # latent instead of the following frame.
            act_reg_input = tf.concat((tf.concat((encoded["past"][-1:], predicted_frames[:-1]), axis=0),
                                       decoder_rnn_output["z_sample"]), axis=-1)
            regressed_actions_z = snt.BatchApply(self.z_action_discriminator)(tf.stop_gradient(act_reg_input))
        predicted["regressed_actions"] = regressed_actions
        predicted["regressed_actions_z"] = regressed_actions_z

    predicted["seq_future"] = predicted_frames
    predicted["z_sample"] = decoder_rnn_output["z_sample"]
    predicted["inference_dists"] = decoder_rnn_output["inference_dists"]
    predicted["prior_dists"] = decoder_rnn_output["prior_dists"]
    return predicted