def _top_k_sample(logits, ignore_ids=None, num_samples=1, k=10):
    """
    Does top-k sampling. If ignore_ids is given, those ids are masked out (a very large
    constant is subtracted from their logits) both for the returned probabilities and for
    the sampling itself, so they are never predicted.

    :param logits: [batch_size, vocab_size] tensor
    :param ignore_ids: [vocab_size] one-hot representation of the indices we'd like to ignore
                       and never predict, like padding maybe
    :param num_samples: number of samples to draw for every batch row
    :param k: how many of the most probable tokens to keep, either a python int or a
              [batch_size] integer vector
    :return: dict with
             'probs': [batch_size, vocab_size] softmax of the (masked) logits
             'sample': [batch_size, num_samples] sampled token ids
    # TODO FIGURE OUT HOW TO DO THIS ON TPUS. IT'S HELLA SLOW RIGHT NOW, DUE TO ARGSORT I THINK
    """
    # NOTE: the scope is still called 'top_p_sample' (shared with the top-p sampler); it is
    # kept unchanged so that existing op names do not move.
    with tf.variable_scope('top_p_sample'):
        batch_size, vocab_size = get_shape_list(logits, expected_rank=2)

        # Apply the ignore mask once and use the masked logits everywhere below. Previously
        # only `probs` saw the mask, so an ignored id could still be sampled when k was large.
        if ignore_ids is None:
            masked_logits = logits
        else:
            masked_logits = logits - tf.cast(ignore_ids[None], tf.float32) * 1e10
        probs = tf.nn.softmax(masked_logits, axis=-1)

        # [batch_size, vocab_perm]: token ids ordered from most to least probable.
        indices = tf.argsort(probs, direction='DESCENDING')

        # Everything at sorted position >= k is excluded.
        # result will be [batch_size, vocab_perm]
        k_expanded = k if isinstance(k, int) else k[:, None]
        exclude_mask = tf.range(vocab_size)[None] >= k_expanded

        # Sample in the sorted space, then map the sampled positions back to token ids.
        logits_to_use = (tf.batch_gather(masked_logits, indices)
                         - tf.cast(exclude_mask, tf.float32) * 1e10)
        sample_perm = tf.random.categorical(logits=logits_to_use, num_samples=num_samples)
        sample = tf.batch_gather(indices, sample_perm)

    return {
        'probs': probs,
        'sample': sample,
    }
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         top_k_indices, truncation_factor):
    """Return the masked-LM log-probs and probs restricted to `top_k_indices`.

    The hidden states at `positions` are gathered, passed through the
    "cls/predictions/transform" dense + layer-norm block, projected with
    `output_weights` (transposed) plus an output-only bias, and normalised with
    softmax / log-softmax. Only the entries selected by `top_k_indices`
    (reshaped to [rows, truncation_factor]) are returned.

    Returns:
      (top_k_log_probs_student, top_k_probs_student), each gathered with
      `tf.batch_gather` from the full-vocabulary distributions.
    """
    hidden = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # One extra non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            hidden = tf.layers.dense(
                hidden,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            hidden = modeling.layer_norm(hidden)

        # The output weights are shared with the input embeddings; only the
        # per-token bias is specific to the output layer.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.nn.bias_add(
            tf.matmul(hidden, output_weights, transpose_b=True), output_bias)

        log_probs_student = tf.nn.log_softmax(logits, axis=-1)
        probs_student = tf.nn.softmax(logits, axis=-1)

        # One row of `truncation_factor` vocabulary indices per prediction row.
        num_rows = tf.shape(log_probs_student)[0]
        top_k_indices = tf.reshape(top_k_indices, [num_rows, truncation_factor])

        top_k_log_probs_student = tf.batch_gather(log_probs_student, top_k_indices)
        top_k_probs_student = tf.batch_gather(probs_student, top_k_indices)

    return top_k_log_probs_student, top_k_probs_student
def keypoints_flip_left_right(keypoints, swap_index=None):
    """Mirror keypoints horizontally and optionally swap left/right keypoint slots.

    `keypoints` is unstacked along its last axis into x and y. Where both x and y
    are non-negative, x is replaced with `1.0 - x` (presumably normalised
    coordinates -- confirm with callers); other entries keep their x untouched.

    Args:
      keypoints: tensor whose last axis holds (x, y); assumed to be
        [X, N, 2] with a statically known N -- TODO confirm.
      swap_index: optional iterable of (a, b) keypoint index pairs that trade
        places after the flip.

    Returns:
      Tensor stacked back as (x, y) on the last axis.
    """
    x, y = tf.unstack(keypoints, axis=-1)
    flippable = tf.logical_and(x >= 0, y >= 0)
    x = tf.where(flippable, 1.0 - x, x)

    if swap_index is not None:
        # Symmetric lookup table: a -> b and b -> a.
        partner = {}
        for a, b in swap_index:
            partner[a] = b
            partner[b] = a

        X, N, _ = btf.combined_static_and_dynamic_shape(keypoints)
        # Keypoints without a partner map onto themselves.
        order = [partner.get(i, i) for i in range(N)]
        order = tf.convert_to_tensor(order, dtype=tf.int32)
        order = tf.tile(tf.reshape(order, [1, N]), [X, 1])

        x = tf.batch_gather(x, order)
        y = tf.batch_gather(y, order)

    return tf.stack([x, y], axis=-1)
def _head(self, torso_output):
    """Build the value and policy heads on top of the torso output.

    `torso_output` is a pair (features, level_name). A linear 'baseline' layer
    predicts one normalised value per game; the un-normalised value is
    `self._std * normalised + self._mean`. For every sample the entry belonging to
    its `level_name` is selected, and an action is sampled from the
    'policy_logits' linear layer.

    Returns:
      PopArtAgentOutput(new_action, policy_logits, un_normalized_vf, normalized_vf)
    """
    features, level_name = torso_output
    flat_size = tf.shape(features)[0]

    normalized_per_game = snt.Linear(self._number_of_games, name='baseline')(features)
    unnormalized_per_game = self._std * normalized_per_game + self._mean

    # Give level_name a time axis and a trailing index axis.
    level_name = tf.reshape(level_name, [-1, 1, 1])
    leading = tf.shape(level_name)[0]

    # Separate the time and batch dimensions. The time length is inferred (-1)
    # because it can differ between initialisation, learner and actors.
    per_game_shape = [leading, -1, self._number_of_games]
    normalized_vf = tf.reshape(normalized_per_game, per_game_shape)
    un_normalized_vf = tf.reshape(unnormalized_per_game, per_game_shape)

    # Repeat the level index along the time dimension, then pick that game's value.
    level_name = tf.tile(level_name, [1, tf.shape(normalized_vf)[1], 1])
    normalized_vf = tf.batch_gather(normalized_vf, level_name)        # (batch_size, time, 1)
    un_normalized_vf = tf.batch_gather(un_normalized_vf, level_name)  # (batch_size, time, 1)

    # Flatten back, because Sonnet's BatchApply expects a batch_size * time dimension.
    normalized_vf = tf.reshape(normalized_vf, [flat_size])
    un_normalized_vf = tf.reshape(un_normalized_vf, [flat_size])

    # Sample an action from the policy.
    policy_logits = snt.Linear(self._num_actions, name='policy_logits')(features)
    sampled = tf.random.categorical(policy_logits, num_samples=1, dtype=tf.int32)
    new_action = tf.squeeze(sampled, 1, name='new_action')

    return PopArtAgentOutput(new_action, policy_logits, un_normalized_vf, normalized_vf)
def mask_completeness_loss(logit_1, logit_2, logit_3, relation_12, relation_23):
    """Mean squared deviation of (L1 + L2 + L3) from 1.

    `logit_2` is aligned to `logit_1` through `relation_12`; `logit_3` is aligned
    through the composition of `relation_23` with `relation_12`. Both alignments
    use `tf.batch_gather`.
    """
    with tf.name_scope('mask_completeness_loss'):
        relation_13 = tf.batch_gather(relation_23, relation_12)
        aligned_2 = tf.batch_gather(logit_2, relation_12)
        aligned_3 = tf.batch_gather(logit_3, relation_13)
        residual = logit_1 + aligned_2 + aligned_3 - 1
        loss = tf.reduce_mean(residual ** 2)
    return loss
def kl_divergence(self, alpha, alpha_prior, i_perm=None, wrt='Dirichlet-Marginals'):
    """
    Computes the KL divergence between the Kumaraswamy q distributions and the Beta
    distributions derived from the Dirichlet prior.

    :param alpha: posterior approximation Dirichlet parameters
    :param alpha_prior: prior Dirichlet parameters
    :param i_perm: random permutation indices used during sampling procedure
    :param wrt: that which the KL divergence is with respect to, either Dirichlet marginal
                or Beta stick breaks
    :return: KL divergence summed over axis 1 (one value per batch row)
    """
    assert wrt in {'Dirichlet-Marginals', 'Beta-Sticks'}

    # Apply the permutation if one was provided: the prior is tiled to one row per
    # posterior row and both are reordered with the same indices.
    if i_perm is not None:
        alpha_prior = self.__parameter_rank_check(alpha_prior)
        tile_multiples = tf.stack((tf.shape(alpha)[0], 1))
        alpha_prior = tf.tile(alpha_prior, tile_multiples)
        alpha = tf.batch_gather(alpha, i_perm)
        alpha_prior = tf.batch_gather(alpha_prior, i_perm)

    if wrt == 'Dirichlet-Marginals':
        # Marginal q(pi; a', b') approximation parameters.
        a_prime = self.__parameter_rank_check(alpha)
        b_prime = tf.reduce_sum(a_prime, axis=1, keepdims=True) - a_prime
        # Marginal p(pi; a, b) prior parameters.
        a_prior = self.__parameter_rank_check(alpha_prior)
        b_prior = tf.reduce_sum(a_prior, axis=1, keepdims=True) - a_prior
    else:
        # Stick-breaking Beta parameters for both q and p.
        a_prime, b_prime = self.__stick_break_parameters(alpha)
        a_prior, b_prior = self.__stick_break_parameters(alpha_prior)

    # Closed-form part of the KL divergence.
    digamma_term = (a_prime - a_prior) / a_prime * (
        -np.euler_gamma - tf.digamma(b_prime) - 1 / b_prime)
    log_term = tf.log(a_prime * b_prime)
    prior_lbeta = tf.lbeta(tf.stack((a_prior, b_prior), axis=-1))
    kl = digamma_term + log_term + prior_lbeta - (b_prime - 1) / b_prime

    # Truncated series part: self.M terms.
    for m in range(1, self.M + 1):
        beta_args = tf.stack((m / a_prime, b_prime), axis=-1)
        B = tf.exp(tf.lbeta(beta_args))
        kl += (b_prior - 1) * b_prime / (m + a_prime * b_prime) * B

    # Sum over the dimensions.
    return tf.reduce_sum(kl, axis=1)
def sample_is(self, x, n=1):
    """Draw `n` importance-sampled outputs for `x` with their log-weights.

    Component indices are sampled from a categorical whose logits are
    `self._is_function(gate logits)`; the returned weights are the gate logits
    minus the (gradient-stopped) importance logits, gathered at the sampled
    indices.

    Returns:
      (samples, weights). With n == 1 the leading sample axis is indexed away.
    """
    mixture_distribution = self._gate.conditional_mixture_distribution(x)
    mixture_components = self._experts.conditional_components_distribution(x)

    # One draw from every component; the sampled index selects among them below.
    y = mixture_components.sample(n)

    is_logits = self._is_function(mixture_distribution.logits)
    idx = ds.Categorical(logits=is_logits).sample(n)

    # TODO check if we should not renormalize mixture.logits - tf.stop_... (and which axis)
    log_ratio = mixture_distribution.logits - tf.stop_gradient(is_logits)
    weights = tf.transpose(tf.batch_gather(log_ratio, tf.transpose(idx)))

    chosen = tf.batch_gather(y, idx[:, :, None])
    if n == 1:
        return chosen[0, :, 0], weights[0]
    return chosen[:, :, 0], weights
def vote_reg_loss(seed_xyz, vote_xyz, seed_inds, vote_label, vote_label_mask):
    """Vote regression loss: masked mean of the closest vote / GT-vote Huber distance.

    Shapes as documented by the original author:
      seed_inds        (B, 512)
      seed_xyz         (B, 512, 3/C)                seed points
      vote_xyz         (B, 512 * vote_factor, 3/C)  predicted votes
      vote_label       (B, N, 9)                    GT_VOTE_FACTOR offsets per point
      vote_label_mask  (B, N)

    `vote_factor` and `GT_VOTE_FACTOR` are read from module scope.

    Returns:
      Scalar tensor named 'vote_loss'.
    """
    batch_size = tf.shape(seed_xyz)[0]
    num_seed = tf.shape(seed_xyz)[1]

    # Pick the labels belonging to the seed points (same as a 3-D torch.gather).
    seed_gt_votes_mask = tf.cast(tf.batch_gather(vote_label_mask, seed_inds), dtype=tf.float32)
    # GT offsets -> absolute positions; seed_xyz is repeated 3 times along the last axis
    # (presumably once per GT vote -- confirm GT_VOTE_FACTOR == 3).
    seed_gt_votes = tf.batch_gather(vote_label, seed_inds) + tf.tile(seed_xyz, [1, 1, 3])

    vote_xyz_reshape = tf.reshape(vote_xyz, [batch_size * num_seed, vote_factor, 3])
    seed_gt_votes_reshape = tf.reshape(seed_gt_votes, [batch_size * num_seed, GT_VOTE_FACTOR, 3])

    # Pairwise Huber distance between every predicted vote and every GT vote of a seed.
    diff = tf.expand_dims(vote_xyz_reshape, 2) - tf.expand_dims(seed_gt_votes_reshape, 1)
    dist2center = tf.reduce_sum(
        tf.losses.huber_loss(labels=tf.zeros_like(diff), predictions=diff,
                             reduction=tf.losses.Reduction.NONE),
        axis=-1)

    # Minimum over predicted votes, then over GT votes: closest pair per seed.
    dist2 = tf.reduce_min(dist2center, axis=1)
    vote_dist = tf.reduce_min(dist2, axis=1)
    vote_dist = tf.reshape(vote_dist, [batch_size, num_seed])

    # Masked mean. The epsilon only guards against an all-zero mask; it used to be added
    # to every mask element before summing, which biased the denominator by
    # 1e-6 * B * num_seed.
    vote_loss = tf.reduce_sum(vote_dist * seed_gt_votes_mask) / (
        tf.reduce_sum(seed_gt_votes_mask) + 1e-6)
    vote_loss = tf.identity(vote_loss, 'vote_loss')
    return vote_loss
def get_ecdf(
        sample: tf.Tensor,
        weights: Optional[tf.Tensor] = None) -> Tuple[tf.Tensor, tf.Tensor]:
    """
    Get the empirical CDF of a weighted sample.

    The sample is sorted along axis 0, the weights are reordered the same way and
    their cumulative sum is normalised by its last entry. Missing weights default
    to ones. Graph-time assertions check that `sample` and `weights` have the same
    shape and that the total weight is strictly positive.

    :return: (sorted sample values, normalised cumulative weights)
    """
    if weights is None:
        weights = tf.ones_like(sample)

    shapes_match = tf.assert_equal(tf.shape(sample), tf.shape(weights))
    with tf.control_dependencies([shapes_match]):
        order = tf.contrib.framework.argsort(sample, axis=0)

    # batch_gather indexes the last axis, hence the transposes around it.
    order_t = _T(order)
    x = _T(tf.batch_gather(_T(sample), order_t))
    w = _T(tf.batch_gather(_T(weights), order_t))

    w_cumsum = tf.cumsum(w, axis=0)
    smallest_wsum = tf.reduce_min(w_cumsum[-1])
    total_positive = tf.assert_greater(smallest_wsum, tf.zeros_like(smallest_wsum))
    with tf.control_dependencies([total_positive]):
        w_cumsum = w_cumsum / w_cumsum[-1]

    return x, w_cumsum
def __init__(self, sess, n_features, lr=0.01):
    """Build the Q critic: online and target networks, TD loss and train op.

    Args:
      sess: TensorFlow session, stored on the instance.
      n_features: width of the state placeholder.
      lr: learning rate of the Adam optimizer.
    """
    self.sess = sess

    # Inputs: state, bootstrap value q_a_, reward and chosen action index.
    self.s = tf.placeholder(tf.float32, [None, n_features], "state")
    self.q_a_ = tf.placeholder(tf.float32, [None, 1], "q_a_")
    self.r = tf.placeholder(tf.float32, [None, 1], 'r')
    self.a = tf.placeholder(tf.int32, [None, 1], 'act')

    # Online network ("Critic") and target network ("Target").
    self.q = self.build_net("Critic")
    self.q_target = self.build_net("Target")

    self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Target')
    self.params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic')
    # Ops copying every online variable into its target counterpart.
    self.replace_target_op = [
        tf.assign(target_var, online_var)
        for target_var, online_var in zip(self.t_params, self.params)
    ]

    # Q-value of the action that was taken, for both networks.
    self.q_a = tf.batch_gather(self.q, self.a)
    self.q_a_target = tf.batch_gather(self.q_target, self.a)

    with tf.variable_scope('squared_TD_error'):
        # One-step TD error with a fixed discount of 0.8.
        self.td_error = self.r + 0.8 * self.q_a_ - self.q_a
        self.loss = tf.reduce_mean(0.5 * tf.square(self.td_error))

    with tf.variable_scope('train'):
        optimizer = tf.train.AdamOptimizer(lr)
        self.train_op = optimizer.minimize(self.loss)
def beam_search_image(self, sentence, beam_width, num_classes):
    """Beam-search decode captions and return `self.select(total_sentence, value)`.

    Image features, the input embedding, the LSTM state and the last output are
    replicated `beam_width` times. For `hp.maxlen - 1` steps the decoder scores
    every (beam, class) continuation and the best `beam_width` are kept.

    Args:
      sentence: embedding of the first input token (tiled across beams here).
      beam_width: number of beams kept per batch element.
      num_classes: size of the output vocabulary.
    """
    # Replicate everything across the beam axis.
    self.feature_beam = tf.tile(
        tf.expand_dims(self.feature, axis=1), [1, beam_width, 1, 1])
    sentence = tf.tile(
        tf.expand_dims(sentence, axis=1), [1, beam_width, 1])  # batch x beam x hidden

    # Every beam starts with token id 2 (presumably the start token -- confirm).
    total_sentence = tf.ones((self.batch_size, beam_width, 1), dtype=tf.int32) * 2

    beam_states = []
    for state in self.last_state:
        tiled = tf.tile(tf.expand_dims(state, axis=1), [1, beam_width, 1])
        beam_states.append(tf.reshape(tiled, [-1, hp.lstm_units]))  # (batch x beam) x lstm_dim
    self.last_state = beam_states
    self.last_output = tf.tile(
        tf.expand_dims(self.last_output, axis=1), [1, beam_width, 1])  # same tiling as above

    # Only the first beam is live initially: log(1) = 0, log(0) = -inf for the rest.
    value = tf.log([[1.] + [0.] * (beam_width - 1)])
    mask = tf.ones((self.batch_size, beam_width))

    for _ in range(hp.maxlen - 1):
        alpha = self.attention(self.last_output, self.feature_beam)  # batch x beam x 196
        image_attention = tf.reduce_sum(
            self.feature_beam * tf.expand_dims(alpha, -1), axis=-2)  # batch x beam x 1024
        if self._selector:
            image_attention = self.selector(image_attention, self.last_output)

        lstm_input = tf.reshape(
            tf.concat((image_attention, sentence), axis=-1),
            [-1, hp.hidden_units_cap + 1024])
        output, state = self.lstm(lstm_input, self.last_state)
        output = tf.reshape(output, [self.batch_size, beam_width, hp.lstm_units])

        expanded_output = tf.concat(
            [output, sentence, image_attention], axis=-1)  # batch x beam x features
        logits = tf.nn.log_softmax(self.decode(expanded_output))

        # Finished beams (mask == 0) contribute nothing to the running score.
        sum_logprob = (tf.expand_dims(value, axis=2)
                       + logits * tf.expand_dims(mask, axis=2))
        flat_scores = tf.reshape(sum_logprob, [-1, beam_width * num_classes])
        value, index = tf.nn.top_k(flat_scores, k=beam_width)  # batch x beam

        ids = index % num_classes        # chosen class, batch x beam
        pre_ids = index // num_classes   # parent beam, batch x beam

        sentence = tf.nn.embedding_lookup(self.lookup_table, ids)
        pre_sentence = tf.batch_gather(total_sentence, pre_ids)  # batch x beam x len
        new_word = tf.expand_dims(ids, axis=2)                   # batch x beam x 1
        total_sentence = tf.concat(
            [pre_sentence, new_word], axis=2)                    # batch x beam x (len + 1)

        # First factor: had the parent already finished? Second factor: does it finish
        # now (id 3)? A value of 0 means finished.
        mask = tf.batch_gather(mask, pre_ids) * tf.to_float(tf.not_equal(ids, 3))

        # Carried into the next iteration.
        # NOTE(review): output/state are not reordered by `pre_ids` -- confirm intended.
        self.last_output = output
        self.last_state = state

    preds = self.select(total_sentence, value)
    return preds
def extrac_subject_two(self, output, subject_ids):
    """Pick the subject's vector representations out of `output` using `subject_ids`.

    The first column of `subject_ids` indexes the start vector and the remaining
    column(s) the end vector; both are gathered per batch row.

    Returns:
      (start, end); per the original note, start has shape (batch_size, 1, 768).
    """
    start_index = subject_ids[:, :1]  # index of the subject start
    end_index = subject_ids[:, 1:]
    start = tf.batch_gather(output, start_index)
    end = tf.batch_gather(output, end_index)
    return start, end
def _top_p_sample(logits, ignore_ids=None, num_samples=1, p=0.9):
    """
    Does top-p sampling. If ignore_ids is given, those ids are masked out (a very large
    constant is subtracted from their logits) both for the returned probabilities and for
    the sampling itself, so they are never predicted.

    :param logits: [batch_size, vocab_size] tensor
    :param ignore_ids: [vocab_size] one-hot representation of the indices we'd like to ignore
                       and never predict, like padding maybe
    :param num_samples: number of samples to draw for every batch row
    :param p: topp threshold to use, either a python number or a [batch_size] vector
    :return: dict with
             'probs': [batch_size, vocab_size] softmax of the (masked) logits
             'sample': [batch_size, num_samples] sampled token ids
    # TODO FIGURE OUT HOW TO DO THIS ON TPUS. IT'S HELLA SLOW RIGHT NOW, DUE TO ARGSORT I THINK
    """
    with tf.variable_scope('top_p_sample'):
        batch_size, vocab_size = get_shape_list(logits, expected_rank=2)

        # Apply the ignore mask once and reuse it. The sorted-space sampling below used to
        # read the raw logits, so ignored ids could slip through whenever the cumulative
        # probability never reached p (p close to 1, float rounding).
        if ignore_ids is None:
            masked_logits = logits
        else:
            masked_logits = logits - tf.cast(ignore_ids[None], tf.float32) * 1e10
        probs = tf.nn.softmax(masked_logits, axis=-1)

        # Python ints are accepted as well as floats (an int used to crash on p[:, None]).
        p_is_number = isinstance(p, (int, float))

        if p_is_number and p > 0.999999:
            # Don't do top-p sampling in this case
            print("Top-p sampling DISABLED", flush=True)
            return {
                'probs': probs,
                'sample': tf.random.categorical(
                    logits=masked_logits, num_samples=num_samples, dtype=tf.int32),
            }

        # [batch_size, vocab_perm]: token ids ordered from most to least probable.
        indices = tf.argsort(probs, direction='DESCENDING')
        cumulative_probabilities = tf.math.cumsum(
            tf.batch_gather(probs, indices), axis=-1, exclusive=False)

        # Find the top pth index to cut off. Careful, we don't want to cut off everything:
        # sorted position 0 is always kept. Result is [batch_size, vocab_perm].
        p_expanded = p if p_is_number else p[:, None]
        exclude_mask = tf.logical_not(
            tf.logical_or(cumulative_probabilities < p_expanded,
                          tf.range(vocab_size)[None] < 1))

        # Sample in the sorted space, then map the sampled positions back to token ids.
        # (The alternative -- unsorting the mask first and sampling in vocabulary order --
        # is not used.)
        logits_to_use = (tf.batch_gather(masked_logits, indices)
                         - tf.cast(exclude_mask, tf.float32) * 1e10)
        sample_perm = tf.random.categorical(logits=logits_to_use, num_samples=num_samples)
        sample = tf.batch_gather(indices, sample_perm)

    return {
        'probs': probs,
        'sample': sample,
    }
def myBeamSearch(batch_states, sequence_length, k, begin_id, end_id):
    '''
    Beam search: given initial states and a sequence length, return the k highest-scoring
    id sequences together with their scores.

    input:
        batch_states: (batch, dim) tensor with a statically known batch size
        sequence_length: int, number of decoding steps
        k: int, beam width
        begin_id: id fed as the first input of every beam
        end_id: id marking the end of a sequence
    output:
        sequence_ids: (batch, k, sequence_length) int32
        sequence_score: (batch, k) float32

    * A (batch, k) mask tracks finished beams: once a sequence ends with end_id its score
      is no longer updated.
    * tf.batch_gather is used instead of tf.gather because every sample gathers different
      positions (gather works on the first axis, batch_gather on the second).
    * tf.nn.top_k takes (batch, n) and returns the k largest values and their indices,
      both (batch, k).
    * Scores returned by update_one_step are added up, so they are assumed to be
      log-probabilities -- TODO confirm.
    '''
    batch_size = batch_states.shape.as_list()[0]

    # Initial states and inputs, replicated for every beam.
    states = tf.tile(tf.expand_dims(batch_states, axis=1), (1, k, 1))
    inputs = tf.tile([[begin_id]], (batch_size, k))

    # Empty id sequences; int32 so they can be concatenated with the top_k indices.
    sequence_ids = tf.zeros(shape=(batch_size, k, 0), dtype=tf.int32)
    # Only the first beam starts live. With k identical zero-score beams, top_k would keep
    # k copies of the same continuation and the beams would never diverge.
    sequence_score = tf.tile([[0.0] + [-1e9] * (k - 1)], (batch_size, 1))
    mask = tf.ones((batch_size, k), dtype=tf.float32)

    for _ in range(sequence_length):
        # Feed the k states/inputs to the RNN: new states and per-class scores.
        new_states, now_score = update_one_step(
            states, inputs)  # (batch, k, states), (batch, k, num_class)
        num_classes = now_score.shape.as_list()[-1]

        # Finished beams add nothing; live beams add the score of every continuation.
        now_score = now_score * tf.expand_dims(mask, axis=-1)
        all_score = now_score + tf.expand_dims(sequence_score, axis=-1)

        # Best k of the k * num_class candidates.
        sequence_score, indexs = tf.nn.top_k(
            tf.reshape(all_score, (batch_size, k * num_classes)), k=k)

        # A flat index is beam * num_classes + class.
        beam_ids = indexs // num_classes
        inputs = indexs % num_classes

        # Carry over the states and histories of the selected parent beams.
        states = tf.batch_gather(new_states, beam_ids)
        sequence_ids = tf.concat(
            [tf.batch_gather(sequence_ids, beam_ids), tf.expand_dims(inputs, axis=-1)],
            axis=-1)

        # A beam is live only if its parent was live and it did not just emit end_id.
        mask = tf.multiply(
            tf.cast(tf.not_equal(inputs, end_id), dtype=tf.float32),
            tf.batch_gather(mask, beam_ids))

    return sequence_ids, sequence_score
def call(self, inputs, state=None):
    """One recurrent step: score phrases, keep the k best per sentence, update `rh`.

    `state` unpacks into (s1_tm, s2_tm, s1_mask, s2_mask, rh_tm). Per the original
    notes the sentence states arrive flattened as (B, Lx * dim) and rh is (B, dim).
    `inputs` is not read by this method.

    Returns:
      (rh, DoubleStateTuple(s1_tm, s2_tm, s1_mask, s2_mask, rh)) with the sentence
      states flattened again and the masks squeezed back.
    """
    s1_tm, s2_tm, s1_mask, s2_mask, rh_tm = state
    k = self.k

    # Un-flatten the sentence states and give the masks a trailing axis.
    s1_tm = tf.reshape(s1_tm, [-1, self.sent1_length, self.dim])  # (B, L1, dim)
    s2_tm = tf.reshape(s2_tm, [-1, self.sent2_length, self.dim])  # (B, L2, dim)
    s1_mask = tf.expand_dims(s1_mask, axis=2)
    s2_mask = tf.expand_dims(s2_mask, axis=2)

    s1_score, s1_mask = self.get_phrase(s1_tm, self.sent1_length, s1_mask, rh_tm)  # (B, L1, 1)
    s2_score, s2_mask = self.get_phrase(s2_tm, self.sent2_length, s2_mask, rh_tm)  # (B, L2, 1)

    # Select the k highest-scoring positions of each sentence.
    s1_kmax_values, s1_kmax_index = tf.nn.top_k(tf.squeeze(s1_score, axis=2), k=k)
    s2_kmax_values, s2_kmax_index = tf.nn.top_k(tf.squeeze(s2_score, axis=2), k=k)

    # Normalise the kept scores so that each row sums to one.
    s1_kmax_values = s1_kmax_values / tf.reduce_sum(s1_kmax_values, axis=1, keepdims=True)
    s2_kmax_values = s2_kmax_values / tf.reduce_sum(s2_kmax_values, axis=1, keepdims=True)

    s1_kmax = tf.batch_gather(s1_tm, s1_kmax_index)
    s2_kmax = tf.batch_gather(s2_tm, s2_kmax_index)

    # Outer product of the two score vectors, one matrix per batch row.
    score_matrix_kmax = tf.keras.backend.batch_dot(
        tf.expand_dims(s1_kmax_values, axis=2),
        tf.expand_dims(s2_kmax_values, axis=2),
        [2, 2])

    # Entries at or below the threshold are zeroed.
    threshold = 0.08
    too_small = tf.less_equal(score_matrix_kmax, threshold)
    score_matrix_kmax = tf.keras.backend.switch(
        too_small, tf.zeros_like(score_matrix_kmax), score_matrix_kmax)

    vec_matrix_kmax = self.get_vec_matrix(s1_kmax, k, s2_kmax, k)
    score_matrix_kmax = tf.expand_dims(score_matrix_kmax, axis=3)
    phrase_vec_kmax = self.get_cnn_feature(score_matrix_kmax * vec_matrix_kmax)

    rh, _ = self.r_cell(phrase_vec_kmax, rh_tm)

    # Restore the flat layout expected in the recurrent state.
    s1_tm = tf.reshape(s1_tm, [-1, self.sent1_length * self.dim])  # (B, L1 * dim)
    s2_tm = tf.reshape(s2_tm, [-1, self.sent2_length * self.dim])  # (B, L2 * dim)
    s1_mask = tf.squeeze(s1_mask, axis=2)
    s2_mask = tf.squeeze(s2_mask, axis=2)

    # (An extra threshold-based mask update existed here once; it is disabled.)
    state = [s1_tm, s2_tm, s1_mask, s2_mask, rh]
    return rh, DoubleStateTuple(*state)
def compute_topk_scores_and_seq(sequences, scores, scores_to_gather, flags,
                                beam_size, prefix="default"):
    """Gather the top k=beam_size sequences according to `scores`.

    Used to grow both the alive and the finished sets: the best `beam_size`
    entries are picked by `scores`, and `sequences`, `scores_to_gather` and
    `flags` are gathered at those positions.

    For easy introspection with tfdbg, three ops get names prefixed by `prefix`:
      - _topk_seq: the returned topk_seq tensor.
      - _topk_flags: the returned topk_finished_flags tensor.
      - _topk_scores: the returned topk_gathered_scores tensor.

    Args:
      sequences: Tensor of sequences to gather from.
        [batch_size, beam_size, seq_length]
      scores: Tensor of scores used to compute the top k. [batch_size, beam_size]
      scores_to_gather: Tensor of scores to return. [batch_size, beam_size].
        Differs from `scores` because grow_alive needs log_probs back while
        grow_finished needs the length-penalized scores.
      flags: Tensor of bools saying whether a sequence has reached EOS.
      beam_size: int
      prefix: string prefixed to the names of the ops that are created.

    Returns:
      Tuple of
      (topk_seq [batch_size, beam_size, decode_length],
       topk_gathered_scores [batch_size, beam_size],
       topk_finished_flags [batch_size, beam_size],
       topk_indexes)
    """
    _, topk_indexes = top_k_with_unique(scores, k=beam_size)

    def _gather(values, suffix):
        # Explicit op name so clients can watch the node with tfdbg.
        return tf.batch_gather(values, topk_indexes, name=prefix + suffix)

    topk_seq = _gather(sequences, "_topk_seq")
    topk_flags = _gather(flags, "_topk_flags")
    topk_gathered_scores = _gather(scores_to_gather, "_topk_scores")
    return topk_seq, topk_gathered_scores, topk_flags, topk_indexes
def sample_step(tokens, ignore_ids, news_config, batch_size=1, p_for_topp=0.95, cache=None,
                do_topk=False):
    """
    Helper function that samples from grover for a single step

    :param tokens: [batch_size, n_ctx_b] tokens that we will predict from
    :param ignore_ids: [n_vocab] mask of the tokens we don't want to predict
    :param news_config: config for the GroverModel
    :param batch_size: batch size to use
    :param p_for_topp: top-p or top-k threshold. With do_topk it is cast to int32 and handed
                       to _top_k_sample as `k` (presumably it must then be a [batch_size]
                       vector, since a non-int `k` is indexed as k[:, None] -- confirm)
    :param cache: [batch_size, news_config.num_hidden_layers, 2, news_config.num_attention_heads,
                   n_ctx_a, news_config.hidden_size // news_config.num_attention_heads] OR, None
    :param do_topk: sample with top-k instead of top-p
    :return: dict with
             'new_tokens', size [batch_size]
             'new_probs', also size [batch_size]
             'new_probs_all', softmax over the unmasked next-token logits
             'prev_probs', probability the model assigned to each already-given next token
             'new_cache', size [batch_size, news_config.num_hidden_layers, 2, n_ctx_b,
                          news_config.num_attention_heads,
                          news_config.hidden_size // news_config.num_attention_heads]
    """
    model = GroverModel(
        config=news_config,
        is_training=False,
        input_ids=tokens,
        reuse=tf.AUTO_REUSE,
        scope='newslm',
        chop_off_last_token=False,
        do_cache=True,
        cache=cache,
    )

    # Only the vocabulary size is needed; the leading dimension is batch * seq_length.
    _, vocab_size = get_shape_list(model.logits_flat, expected_rank=2)

    # Probability of token t+1 as predicted at position t.
    prev_probs = tf.exp(tf.squeeze(
        tf.batch_gather(model.log_probs[:, :-1], tokens[:, 1:, None]), axis=2))

    # Extract the logits of the FINAL position.
    logits = tf.reshape(model.logits_flat, [batch_size, -1, vocab_size])
    next_logits = logits[:, -1]

    if do_topk:
        # ignore_ids is forwarded here too; the top-k path used to drop it, so masked
        # tokens could be predicted.
        sample_info = _top_k_sample(next_logits, ignore_ids=ignore_ids, num_samples=1,
                                    k=tf.cast(p_for_topp, dtype=tf.int32))
    else:
        sample_info = _top_p_sample(next_logits, ignore_ids=ignore_ids, num_samples=1,
                                    p=p_for_topp)

    new_tokens = tf.squeeze(sample_info['sample'], 1)
    new_probs = tf.squeeze(tf.batch_gather(sample_info['probs'], sample_info['sample']), 1)
    return {
        'new_tokens': new_tokens,
        'new_probs': new_probs,
        # `axis` replaces the deprecated `dim` alias, matching the softmax calls elsewhere.
        'new_probs_all': tf.nn.softmax(next_logits, axis=-1),
        'prev_probs': prev_probs,
        'new_cache': model.new_kvs,
    }
def _transpose_and_gather(feat, ind):
    """Move axis 1 of `feat` to the end, flatten the middle axes and gather rows at `ind`.

    `feat` is transposed with perm (0, 2, 3, 1), reshaped to
    [batch, -1, last_dim] and indexed per batch row with `ind` (cast to int32).
    """
    channels_last = tf.transpose(feat, perm=(0, 2, 3, 1))
    dynamic_shape = tf.shape(channels_last)
    flattened = tf.reshape(channels_last, (dynamic_shape[0], -1, dynamic_shape[-1]))
    return tf.batch_gather(flattened, tf.cast(ind, tf.int32))
def __init__(self, model, lr, train_config, num_classes=9):
    """Build the policy-gradient training graph around `model` and open a session.

    Args:
      model: network exposing `.input` (states) and `.output` (action probabilities).
      lr: initial learning rate.
      train_config: dict providing 'decay_step' and 'decay_factor' for the
        staircase exponential learning-rate decay.
      num_classes: stored on the instance as `self.num_classes`.
    """
    self.num_classes = num_classes
    self.model = model
    self.tf_input_states = model.input
    self.tf_action_probs = model.output

    # Fed at training time: the action that was executed and its return.
    self.tf_executed_actions = tf.placeholder(dtype=tf.int32, shape=[None, 1])
    self.tf_returns = tf.placeholder(dtype=tf.float32, shape=[None, 1])

    # Objective: mean of return * log pi(executed action). It is maximised by
    # minimising its negative below.
    tf_executed_probs = tf.batch_gather(self.tf_action_probs, self.tf_executed_actions)
    self.tf_L = tf.reduce_mean(self.tf_returns * tf.log(tf_executed_probs))

    self.global_step = tf.train.get_or_create_global_step()
    self.learning_rate = tf.train.exponential_decay(
        lr,
        self.global_step,
        train_config['decay_step'],
        train_config['decay_factor'],
        staircase=True)

    # Run the collected UPDATE_OPS before every training step.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.train_op = optimizer.minimize(-self.tf_L, global_step=self.global_step)

    # Session that grows its GPU memory on demand.
    gpu_ops = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(gpu_options=gpu_ops)
    self.sess = tf.Session(config=config)

    # initialization
    self.sess.run(tf.initializers.global_variables())
def gaussian_mixture_approximate_mode(gm):
    """Returns the mean of the most probable mixture component."""
    # Index of the most likely component, with a trailing axis for batch_gather.
    best_component = gm.mixture_distribution.mode()[Ellipsis, None]
    component_means = gm.components_distribution.mean()
    # Pick that component's mean and drop the singleton component axis.
    selected = tf.batch_gather(component_means, best_component)
    return tf.squeeze(selected, axis=-2)
def _batch_gather_with_broadcast(params, indices, axis):
    """Like batch_gather, but broadcasts to the left of axis."""
    # batch_gather assumes...
    #   params.shape  = [A1,...,AN, B1,...,BM]
    #   indices.shape = [A1,...,AN, C]
    # and produces
    #   [A1,...,AN, C, B1,...,BM]
    # Here the dims left of `axis` in params and left of the rightmost dim in
    # indices only need to broadcast against each other, e.g.
    #   params.shape  = [A1,...,AN, B1,...,BM]
    #   indices.shape = [a1,...,aN, C],
    # where ai broadcasts with Ai.
    params_shape = tf.shape(params)
    indices_shape = tf.shape(indices)

    # Broadcast of [A1,...,AN] and [a1,...,aN].
    leading_bcast_shape = tf.broadcast_dynamic_shape(
        params_shape[:axis], indices_shape[:-1])

    # Adding zeros of the target shape materialises the broadcast.
    full_params_shape = tf.concat((leading_bcast_shape, params_shape[axis:]), axis=0)
    full_indices_shape = tf.concat((leading_bcast_shape, indices_shape[-1:]), axis=0)
    params = params + tf.zeros(full_params_shape, dtype=params.dtype)
    indices = indices + tf.zeros(full_indices_shape, dtype=indices.dtype)

    return tf.batch_gather(params, indices)
def _tf_sample_neg(batch_size: "tf.Tensor", all_bs: "tf.Tensor",
                   neg_ids: "tf.Tensor") -> "tf.Tensor":
    """Sample negative examples for given indices"""
    # Give every batch row its own copy of all candidates, then pick by index.
    candidates = tf.expand_dims(all_bs, 0)
    per_row_candidates = tf.tile(candidates, (batch_size, 1, 1))
    return tf.batch_gather(per_row_candidates, neg_ids)
def __call__(self, features, detection_priors, classes, is_training):
    """Generate instance masks from FPN features and detection priors.

    This corresponds to the Fig. 5-6 of the ShapeMask paper at
    https://arxiv.org/pdf/1904.03239.pdf

    Args:
      features: a float Tensor holding the instance feature crops, with
        `num_downsample_channels` as its last dimension.
      detection_priors: a float Tensor with the detection prior of every
        instance. A trailing axis is appended here before the dense projection.
      classes: an int Tensor of shape [batch_size, num_instances] of instance
        classes.
      is_training: a bool indicating whether in training mode.

    Returns:
      mask_outputs: instance mask prediction for the given class of every
        instance.
      NOTE(review): the 5-D transpose below implies the decoder output is
        [batch_size, num_instances, height, width, num_classes] -- confirm.
    """
    with tf.variable_scope('coarse_mask', reuse=tf.AUTO_REUSE):
        # Transform detection priors to have the same dimension as features.
        prior_features = tf.layers.dense(
            tf.expand_dims(detection_priors, axis=-1),
            self._num_downsample_channels)
        features = features + prior_features

        mask_logits = self.decoder_net(features, is_training)

        # Gather the logits of the right input class: move the class axis next to
        # the instance axis, pick one class per instance, drop the singleton axis.
        class_major_logits = tf.transpose(mask_logits, [0, 1, 4, 2, 3])
        selected = tf.batch_gather(class_major_logits, tf.expand_dims(classes, -1))
        mask_logits = tf.squeeze(selected, axis=2)
        return mask_logits
def nearest_neighbor_interpolation(
        train_points,
        train_values,
        query_points,
):
    """Performs nearest neighbor interpolation.

    Args:
      train_points: `[batch, train_count, dim]` tensor of known locations.
      train_values: tensor of values attached to `train_points`, indexed along
        axis 1 (`[batch, train_count, ...]`).
      query_points: `[batch, query_count, dim]` tensor of locations to evaluate.

    Returns:
      For every query point, the value of its closest train point (squared
      Euclidean distance): `[batch, query_count, ...]`.
    """
    displacement_vectors = (query_points[:, :, tf.newaxis, :]
                            - train_points[:, tf.newaxis, :, :])

    # Squared distance between all `train_points` and `query_points`.
    # `displacement_length` has shape `[batch, query_count, train_count]`.
    displacement_length = tf.reduce_sum(displacement_vectors**2, -1)

    # Index of the closest train point along the `train_count` axis; int32 is
    # requested directly instead of casting afterwards.
    # `query_indices` has shape `[batch, query_count]`.
    query_indices = tf.math.argmin(displacement_length, axis=-1, output_type=tf.int32)

    # The leftover debug prints of the graph tensors were removed.
    return tf.batch_gather(train_values, query_indices)
def reverse(self):
    """Return the graph with all edges reversed; built once and then cached.

    For every graph in the batch the real edges (the first `num_edges` rows) are
    flipped and lexicographically sorted, then padded back to `max_num_edges`.
    Edge attributes, when present, are reordered with the same permutation and
    masked. The result keeps a back-reference to this graph (`reversed=self`).
    """
    if self._reversed is not None:
        return self._reversed

    max_num_edges = self.max_num_edges

    def fn(params):
        edges, num_nodes, num_edges = params
        true_edges = edges[:num_edges, :]
        # (E, 2) and (E)
        reverse, indices = tf_lexsort(tf.reverse(true_edges, axis=[-1]))
        padded_reverse = tf_pad_axis_to(reverse, -2, max_num_edges)
        padded_indices = tf_pad_axis_to(indices, -1, max_num_edges)
        return padded_reverse, padded_indices

    reverse_edges, reverse_indices = tf.map_fn(
        fn,
        (self.edges, self.num_nodes, self.num_edges),
        dtype=(tf.int32, tf.int32)
    )

    reverse_edge_attrs = None
    if self.edge_attrs is not None:
        reverse_edge_attrs = self.mask_edge_info(
            tf.batch_gather(self.edge_attrs, reverse_indices),
            ndims=1
        )

    # Store the result: the cache checked at the top was never populated before, so
    # every call added a fresh copy of these ops to the graph.
    self._reversed = BaseRuntimeGraph(
        edges=reverse_edges,
        node_mask=tf.cast(self.node_mask, tf.int32),
        center_mask=tf.cast(self.center_mask, tf.int32),
        edge_mask=tf.cast(self.edge_mask, tf.int32),
        dense=(self.dense_adjacency is not None),
        node_attrs=self.node_attrs,
        edge_attrs=reverse_edge_attrs,
        reversed=self
    )
    return self._reversed
def call(self, inputs, length, dropout=None, attention_dropout=None, use_2d=False):
    """Embed `inputs`, run the transformer and pool according to `self.mode`.

    Modes:
      "last_token": gather the hidden state at index `length` of every sequence.
      "attention": aggregate all positions with `self.aggregation`, masked by
        `tf.sequence_mask(length, seq_len)`.
    Any other mode returns None.

    With `use_2d` the activations are handled as
    [batch * seq_len, embedding_size] between the layers.
    """
    shape = gpt2.get_tensor_shape(inputs)
    batch, seq_len = shape[0], shape[1]
    embedding_size = self.embedding.embedding_size

    x = self.embedding(inputs)
    if use_2d:
        x = tf.reshape(x, [batch * seq_len, embedding_size])
    x = self.transformer(inputs=x,
                         dropout=dropout,
                         attention_dropout=attention_dropout,
                         use_2d=use_2d,
                         shape=shape)

    if self.mode == "last_token":
        if use_2d:
            # Back to [batch, seq_len, embedding_size] so positions can be indexed.
            x = tf.reshape(x, [batch, seq_len, embedding_size])
        result = tf.batch_gather(x, tf.expand_dims(length, 1))
        if use_2d:
            result = tf.squeeze(result, 1)
        return result

    if self.mode == "attention":
        mask = tf.sequence_mask(length, seq_len)
        return self.aggregation(inputs=x,
                                mask=mask,
                                attention_dropout=attention_dropout,
                                use_2d=use_2d,
                                shape=shape)

    return None
def build_rs_graph(self):
    """Build the random-shooting planner graph and store it in `self.optimal_action`.

    For every observation `self.n_candidates` random action sequences of length
    `self.horizon` are rolled out through the dynamics model; the first action of
    the sequence with the highest discounted return is selected.
    """
    # FIXME: not sure if it works for batch_size > 1 (num_rollouts > 1)
    num_rollouts = tf.shape(self.obs_ph)[0] * self.n_candidates
    act = tf.random.uniform(
        shape=[self.horizon, num_rollouts, self.action_space_dims],
        minval=self.env.action_space.low,
        maxval=self.env.action_space.high)

    # Equivalent to np.repeat: n_candidates copies of every observation.
    tiled_obs = tf.tile(tf.expand_dims(self.obs_ph, -1), [1, self.n_candidates, 1])
    observation = tf.reshape(tiled_obs, [-1, self.obs_space_dims])

    returns = 0  # becomes (batch_size * n_candidates,)
    for t in range(self.horizon):
        action_t = act[t]
        # Deterministic prediction; sampling from the predicted distribution is not used.
        next_observation = self.dynamics_model.predict_sym(observation, action_t)
        assert self.reward_model is None
        rewards = self.unwrapped_env.tf_reward(observation, action_t, next_observation)
        returns += self.discount ** t * rewards
        observation = next_observation

    # Candidates of one observation are contiguous, so they are laid out batch-major.
    # (An earlier candidate-major variant that reshaped to (n_candidates, -1) and
    # transposed is no longer used.)
    returns = tf.reshape(returns, (-1, self.n_candidates))  # (batch_size, n_candidates)
    cand_a = tf.reshape(
        act[0],
        [-1, self.n_candidates, self.action_space_dims])  # (batch_size, n_candidates, act_dims)
    idx = tf.reshape(tf.argmax(returns, axis=1), [-1, 1])  # (batch_size, 1)
    self.optimal_action = tf.squeeze(tf.batch_gather(cand_a, idx), axis=1)
def get_params(self, c, b, m):
    """Compute the conditional linear weight and bias, reordered by `m * (1 - b)`.

    A low-rank weight correction and a bias correction are predicted from the
    concatenation of `c`, `b` and `m` (via `self.wnn` and `self.bnn`), added to
    the shared parameters `self.w` / `self.b`, and then both are re-indexed so
    that entries where `m * (1 - b)` is largest come first.

    Args:
        c: conditioning tensor; its leading dimension is the batch size.
        b: tensor concatenated with `c` and `m` along axis 1
            (presumably a binary mask -- confirm against the caller).
        m: tensor concatenated with `c` and `b` along axis 1
            (presumably a binary mask -- confirm against the caller).

    Returns:
        A `(weight, bias)` tuple of reordered tensors.
    """
    batch = tf.shape(c)[0]
    dim = self.hps.dimension
    rank = self.hps.linear_rank
    if rank <= 0:
        # A non-positive rank means "use full rank".
        rank = dim

    cond = tf.concat([c, b, m], axis=1)

    # Low-rank weight correction: two factors predicted jointly, then multiplied.
    factors = self.wnn(cond)
    left, right = tf.split(factors, 2, axis=1)
    left = tf.reshape(left, [batch, dim, rank])
    right = tf.reshape(right, [batch, rank, dim])
    weight_delta = tf.matmul(left, right)
    bias_delta = self.bnn(cond)

    weight = weight_delta + self.w
    bias = bias_delta + self.b

    # reorder
    query = m * (1 - b)
    order = tf.contrib.framework.argsort(
        query, direction='DESCENDING', stable=True)
    # Rows of diag(query) gathered in sorted order; rows whose query entry is
    # zero stay all-zero, so those entries are dropped by the products below.
    t = tf.batch_gather(tf.matrix_diag(query), order)
    rows_reordered = tf.matmul(t, weight)
    t_transposed = tf.transpose(t, perm=[0, 2, 1])
    weight = tf.matmul(rows_reordered, t_transposed)
    bias_column = tf.expand_dims(bias, axis=-1)
    bias = tf.squeeze(tf.matmul(t, bias_column), axis=-1)

    return weight, bias
def sample(self, num_samples=1):
    """Draw samples from the truncated rejection-sampling distribution.

    For simplicity the full budget of `self.T` proposals is drawn for every
    requested sample up front; the first accepted proposal in each row is
    returned, with the last proposal always counted as accepted.

    Args:
      num_samples: integer, number of samples to draw.

    Returns:
      samples: Tensor of samples from the distribution,
        [num_samples] + data_dim
    """
    flat_proposals = self.proposal.sample(num_samples * self.T)
    proposals = tf.reshape(
        flat_proposals, [num_samples, self.T] + self.data_dim)

    flat_logits = self.logit_accept_fn(flat_proposals)
    logits = tf.reshape(flat_logits, [num_samples, self.T])

    # Accept/reject decisions for all but the final proposal ...
    accepted = tfd.Bernoulli(logits=logits[:, :-1]).sample()
    # ... and a forced accept on the final one so every row terminates.
    forced_accept = tf.ones([num_samples, 1], dtype=accepted.dtype)
    accepted = tf.concat([accepted, forced_accept], axis=-1)

    # `accepted` is binary, so the top-1 entry of each row is an accepted
    # proposal; its index is used as the first nonzero accept.
    _, first_accept = tf.math.top_k(accepted, k=1, sorted=False)

    chosen = tf.batch_gather(proposals, first_accept)
    return tf.squeeze(chosen, axis=1)  # Squeeze the selected dim
def _interpolate(self, xy1, xy2, points2):
    """Interpolate `points2` onto `xy1` by inverse-distance weighting.

    For every point in `xy1` the three nearest points in `xy2` are found and
    their features in `points2` are averaged with weights proportional to
    1 / distance.

    Args:
        xy1: query coordinates, [batch, n1, dims].
        xy2: reference coordinates, [batch, n2, dims]; needs n2 >= 3.
        points2: features attached to `xy2`, [batch, n2, channels]; the
            channel dimension must be statically known.

    Returns:
        Interpolated features for `xy1`, [batch, n1, channels].
    """
    batch_size = tf.shape(xy1)[0]
    ndataset1 = tf.shape(xy1)[1]
    eps = 1e-6

    # Pairwise distances via ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2.
    cross = tf.matmul(xy1, xy2, transpose_b=True)
    norm1 = tf.reduce_sum(xy1 * xy1, axis=-1, keepdims=True)
    norm2 = tf.reduce_sum(xy2 * xy2, axis=-1, keepdims=True)
    sq_dist = norm1 - 2 * cross + tf.linalg.matrix_transpose(norm2)
    # Floating-point cancellation can push the expanded form slightly below
    # zero; clamp so the sqrt never produces NaN.
    dist_mat = tf.sqrt(tf.maximum(sq_dist, 0.0) + eps)

    # top_k picks the largest entries, so search on the negated distances to
    # get the 3 nearest neighbours ...
    neg_dist, idx = tf.math.top_k(tf.negative(dist_mat), k=3)
    # ... and negate back. Clamping the still-negated values (as the previous
    # code did) forced every distance to 1e-10 and made all weights equal.
    dist = tf.maximum(tf.negative(neg_dist), 1e-10)

    # Normalised inverse-distance weights; the sum broadcasts over the
    # neighbour axis.
    inv_dist = 1.0 / dist
    weight = inv_dist / tf.reduce_sum(inv_dist, axis=2, keepdims=True)

    # Gather neighbour features with a flat [batch, n1 * 3] index, then
    # restore the [batch, n1, 3, channels] layout.
    idx = tf.reshape(idx, (batch_size, -1))
    nn_points = tf.batch_gather(points2, idx)
    nn_points = tf.reshape(
        nn_points,
        (batch_size, ndataset1, 3, points2.get_shape()[-1].value))

    interpolated_points = tf.reduce_sum(
        weight[..., tf.newaxis] * nn_points, axis=-2)
    return interpolated_points