def precision_recall(num_gbboxes, num_detections, tp, fp, scores,
                     dtype=tf.float64, scope=None):
    """Build precision and recall curves from scored detections.

    Detections are ranked by decreasing score; the running totals of true
    and false positives along that ranking give precision and recall at
    every rank.

    Args:
        num_gbboxes: Number of ground-truth boxes (recall denominator), or
            a dict of such values keyed like `scores`.
        num_detections: Number of detections to keep (`k` of `top_k`), or a
            dict of such values.
        tp: True-positive indicators aligned with `scores`, or a dict.
        fp: False-positive indicators aligned with `scores`, or a dict.
        scores: Detection scores, or a dict of scores. When this is a dict,
            all other inputs are indexed with the keys of `num_gbboxes`.
        dtype: Type used for the cumulative sums and the divisions.
        scope: Optional name scope (ignored for dict inputs, where each key
            gets its own 'precision_recall_<key>' scope).

    Returns:
        `(precision, recall)`; for dict inputs, two dicts keyed like
        `num_gbboxes`.
    """
    # Dictionary inputs: recurse once per key and return dictionaries.
    if isinstance(scores, dict):
        precisions = {}
        recalls = {}
        for key in num_gbboxes.keys():
            key_scope = 'precision_recall_%s' % key
            precisions[key], recalls[key] = precision_recall(
                num_gbboxes[key], num_detections[key], tp[key], fp[key],
                scores[key], dtype, key_scope)
        return precisions, recalls

    with tf.name_scope(scope, 'precision_recall',
                       [num_gbboxes, num_detections, tp, fp, scores]):
        # Rank detections from highest to lowest score.
        scores, order = tf.nn.top_k(scores, k=num_detections, sorted=True)
        ranked_tp = tf.gather(tp, order)
        ranked_fp = tf.gather(fp, order)
        # Running counts along the ranking.
        cum_tp = tf.cumsum(tf.cast(ranked_tp, dtype), axis=0)
        cum_fp = tf.cumsum(tf.cast(ranked_fp, dtype), axis=0)
        # `_safe_div` is a helper defined outside this block.
        recall = _safe_div(cum_tp, tf.cast(num_gbboxes, dtype), 'recall')
        precision = _safe_div(cum_tp, cum_tp + cum_fp, 'precision')
        return tf.tuple([precision, recall])
def logits_to_epsilon_bounds(logits, images):
    """Return the CDF interval covered by each observed value.

    The logits are soft-maxed over groups of 256 entries, and the
    cumulative distribution is taken along axis 4. For every entry of
    `images` the exclusive and inclusive cumulative sums at that value are
    picked out.

    Args:
        logits: Logits tensor; assumed to have the 256 categories on axis 4
            (the code reshapes to [-1, 256] and accumulates on axis 4).
        images: Integer values in [0, 256) used as one-hot indices.

    Returns:
        `(cdf_lower, cdf_upper)`: the CDF just below and at each value.
    """
    flat_logits = tf.reshape(logits, [-1, 256])
    probs = tf.reshape(tf.nn.softmax(flat_logits), tf.shape(logits))
    cdf_before = tf.cumsum(probs, axis=4, exclusive=True)
    cdf_through = tf.cumsum(probs, axis=4, exclusive=False)
    # Selecting by multiplying with a one-hot mask and summing: a plain
    # (if wasteful) way of gathering one value per position.
    selector = tf.one_hot(images, 256)
    cdf_lower = tf.reduce_sum(cdf_before * selector, reduction_indices=[4])
    cdf_upper = tf.reduce_sum(cdf_through * selector, reduction_indices=[4])
    return cdf_lower, cdf_upper
def lovasz_grad(gt_sorted):
    """Gradient of the Lovasz extension with respect to sorted errors.

    Args:
        gt_sorted: Ground-truth labels ordered consistently with the sorted
            errors (see Alg. 1 of the Lovasz-Softmax paper).

    Returns:
        Tensor holding the first Jaccard value followed by the successive
        differences of the Jaccard sequence.
    """
    total = tf.reduce_sum(gt_sorted)
    intersection = total - tf.cumsum(gt_sorted)
    union = total + tf.cumsum(1. - gt_sorted)
    jaccard = 1. - intersection / union
    # Turn the cumulative sequence into per-step increments.
    first = jaccard[0:1]
    increments = jaccard[1:] - jaccard[:-1]
    return tf.concat((first, increments), 0)
def specgrams_to_melspecgrams(self, specgrams):
    """Map linear-frequency specgrams onto the mel scale.

    Args:
        specgrams: Tensor of log magnitudes and instantaneous frequencies,
            shape [batch, time, freq, 2].

    Returns:
        melspecgrams: Tensor of log magnitudes and instantaneous
        frequencies, shape [batch, time, freq, 2], with mel scaling of
        frequencies. The input is returned untouched when no mel
        downscaling is configured.
    """
    if self._mel_downscale is None:
        return specgrams

    log_magnitude = specgrams[:, :, :, 0]
    inst_freq = specgrams[:, :, :, 1]

    # Squared magnitude, and phase recovered by integrating the
    # instantaneous frequency along axis -2.
    squared_mag = tf.exp(2.0 * log_magnitude)
    phase = tf.cumsum(inst_freq * np.pi, axis=-2)

    to_mel = tf.to_float(self._linear_to_mel_matrix())
    log_mel_mag2 = self._safe_log(tf.tensordot(squared_mag, to_mel, 1))
    mel_phase = tf.tensordot(phase, to_mel, 1)
    mel_inst_freq = spectral_ops.instantaneous_frequency(mel_phase)

    channels = [
        log_mel_mag2[:, :, :, tf.newaxis],
        mel_inst_freq[:, :, :, tf.newaxis],
    ]
    return tf.concat(channels, axis=-1)
def _compareGradient(self, shape, axis, exclusive, reverse):
    """Check the gradient of `tf.cumsum` against a numerical estimate.

    Args:
        shape: Shape the 50 test values are reshaped to.
        axis: Axis passed to `tf.cumsum`.
        exclusive: `exclusive` flag passed to `tf.cumsum`.
        reverse: `reverse` flag passed to `tf.cumsum`.
    """
    values = np.arange(0, 50).reshape(shape).astype(np.float64)
    with self.test_session():
        inp = tf.convert_to_tensor(values)
        out = tf.cumsum(inp, axis, exclusive, reverse)
        jacobians = tf.test.compute_gradient(
            inp, shape, out, shape, x_init_value=values, delta=1)
    jacob_t, jacob_n = jacobians
    self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
def boolean_mask(boxlist, indicator, fields=None, scope=None,
                 use_static_shapes=False, indicator_sum=None):
    """Select the boxes flagged by `indicator` and return them as a BoxList.

    By default every extra field stored in the boxlist is filtered the same
    way (indexing into the first dimension); `fields` restricts this to a
    subset.

    Args:
        boxlist: BoxList holding N boxes.
        indicator: a rank-1 boolean tensor.
        fields: (optional) list of fields to also gather from. If None
            (default), all fields are gathered from. Pass an empty list to
            only gather the box coordinates.
        scope: name scope.
        use_static_shapes: Whether to use an implementation with static
            shape guarantees.
        indicator_sum: An integer containing the sum of the `indicator`
            vector. Only required if `use_static_shapes` is True.

    Returns:
        subboxlist: a BoxList corresponding to the subset of the input
        BoxList specified by indicator.

    Raises:
        ValueError: if `indicator` is not a rank-1 boolean tensor, if
            `indicator_sum` is missing/zero/non-int in static-shape mode,
            or if a requested field is absent from `boxlist`.
    """
    with tf.name_scope(scope, 'BooleanMask'):
        if indicator.shape.ndims != 1:
            raise ValueError('indicator should have rank 1')
        if indicator.dtype != tf.bool:
            raise ValueError('indicator should be a boolean tensor')

        if not use_static_shapes:
            # Dynamic path: mask the boxes, then each requested field.
            kept_boxes = tf.boolean_mask(boxlist.get(), indicator)
            subboxlist = box_list.BoxList(kept_boxes)
            if fields is None:
                fields = boxlist.get_extra_fields()
            for field in fields:
                if not boxlist.has_field(field):
                    raise ValueError(
                        'boxlist must contain all specified fields')
                kept_field = tf.boolean_mask(
                    boxlist.get_field(field), indicator)
                subboxlist.add_field(field, kept_field)
            return subboxlist

        # Static path: the number of selected boxes must be a Python int.
        if not indicator_sum or not isinstance(indicator_sum, int):
            raise ValueError('`indicator_sum` must be a of type int')
        selected = tf.to_float(indicator)
        # 1-based rank of each selected position, 0 for unselected ones.
        ranks = tf.cast(
            tf.multiply(tf.cumsum(selected), selected), dtype=tf.int32)
        # Row i is one-hot at its output slot; unselected rows (index -1)
        # are all zeros.
        one_hot_selector = tf.one_hot(
            ranks - 1, indicator_sum, dtype=tf.float32)
        positions = tf.to_float(tf.range(tf.shape(indicator)[0]))
        sampled_indices = tf.cast(
            tf.tensordot(positions, one_hot_selector, axes=[0, 0]),
            dtype=tf.int32)
        return gather(boxlist, sampled_indices, use_static_shapes=True)
def weights_concatenated(labels):
    """Give weight 1.0 to the "target" words of concatenated labels.

    The labels look like:
      source English I love you . ID1 target French Je t'aime . ID1
      source English the cat ID1 target French le chat ID1 source English ...

    Only the words of the target text (including the closing ID1 symbol)
    get weight 1.0; the source text and the two boilerplate tokens that
    open each sentence get 0.0. In the example the weighted words are:
      Je t'aime . ID1 le chat ID1

    Args:
        labels: a Tensor of token ids in which id 1 ends a sentence. The
            padding spec used below has four entries, so it is treated as
            rank 4 with the sequence on axis 1.

    Returns:
        a float Tensor of 0.0/1.0 weights.
    """
    is_eos = tf.to_int32(tf.equal(labels, 1))
    # Index of the sentence each token belongs to (EOS counted afterwards).
    sentence_num = tf.cumsum(is_eos, axis=1, exclusive=True)
    # Odd-numbered sentences are the target halves.
    in_target = tf.equal(tf.mod(sentence_num, 2), 1)
    # A token is past the boilerplate if the token two steps earlier is in
    # the same sentence. The +1 keeps real sentence ids distinct from the
    # zero padding.
    sentence_id = sentence_num + 1
    padded = tf.pad(sentence_id, [[0, 0], [2, 0], [0, 0], [0, 0]])
    two_steps_back = padded[:, :-2, :, :]
    nonboilerplate = tf.equal(sentence_id, two_steps_back)
    return tf.to_float(tf.logical_and(nonboilerplate, in_target))
def unwrap(p, discont=np.pi, axis=-1):
    """Unwrap a cyclical phase tensor.

    Mirrors `numpy.unwrap`: consecutive differences along `axis` are
    wrapped into [-pi, pi], and the accumulated correction is added back to
    `p`. Jumps smaller than `discont` are left uncorrected.

    Args:
        p: Phase tensor. Its static shape must be known (it is read with
            `get_shape().as_list()`).
        discont: Float, size of the cyclic discontinuity.
        axis: Axis along which to unwrap.

    Returns:
        unwrapped: Unwrapped tensor of same size as input.
    """
    # `diff` is a helper defined outside this block.
    dd = diff(p, axis=axis)
    ddmod = tf.mod(dd + np.pi, 2.0 * np.pi) - np.pi
    # A wrapped value of exactly -pi that came from a positive jump should
    # be +pi instead.
    at_neg_pi = tf.logical_and(tf.equal(ddmod, -np.pi), tf.greater(dd, 0))
    ddmod = tf.where(at_neg_pi, tf.ones_like(ddmod) * np.pi, ddmod)
    ph_correct = ddmod - dd
    # No correction for jumps below the threshold. The previous code
    # stored this result in `ddmod`, which was never read again, so a
    # non-default `discont` had no effect.
    small_jump = tf.less(tf.abs(dd), discont)
    ph_correct = tf.where(small_jump, tf.zeros_like(ph_correct), ph_correct)
    ph_cumsum = tf.cumsum(ph_correct, axis=axis)

    # The first element has no predecessor, hence a zero correction.
    shape = p.get_shape().as_list()
    shape[axis] = 1
    ph_cumsum = tf.concat(
        [tf.zeros(shape, dtype=p.dtype), ph_cumsum], axis=axis)
    unwrapped = p + ph_cumsum
    return unwrapped
def _get_values_from_start_and_end(self, input_tensor, num_start_samples,
                                   num_end_samples, total_num_samples):
    """Take the leading and trailing entries of `input_tensor`.

    Args:
        input_tensor: An int32 tensor of shape [N] to be sliced.
        num_start_samples: Number of examples to take from the beginning of
            the input tensor.
        num_end_samples: Number of examples to take from the end of the
            input tensor.
        total_num_samples: Sum of `num_start_samples` and
            `num_end_samples`. This should be a scalar.

    Returns:
        A tensor containing the first `num_start_samples` and last
        `num_end_samples` entries of `input_tensor`.
    """
    length = tf.shape(input_tensor)[0]
    at_start = tf.less(tf.range(length), num_start_samples)
    at_end = tf.greater_equal(tf.range(length), length - num_end_samples)
    keep = tf.cast(tf.logical_or(at_start, at_end), tf.int32)
    # 1-based output slot of every kept position, 0 elsewhere.
    slots = tf.multiply(tf.cumsum(keep), keep)
    # Dropped positions map to index -1, i.e. an all-zero one-hot row.
    selector = tf.one_hot(slots - 1, total_num_samples, dtype=tf.int32)
    return tf.tensordot(input_tensor, selector, axes=[0, 0])
def __init__(self, state_size, num_timesteps, mixing_coeff=0.5,
             prior_mode_mean=1, sigma_min=1e-5, variance=1.0,
             dtype=tf.float32, random_seed=None, trainable=True,
             init_bs_to_zero=False, graph_collection_name="P_VARS"):
    """Store the hyperparameters and create the per-timestep bias variables.

    Args:
        state_size: Length of each bias vector.
        num_timesteps: Number of bias variables to create.
        mixing_coeff: Stored on the instance as `mixing_coeff`.
        prior_mode_mean: Stored on the instance as `prior_mode_mean`.
        sigma_min: Stored on the instance as `sigma_min`.
        variance: Stored on the instance as `variance`.
        dtype: Type of the created variables.
        random_seed: Seed for the uniform initializers (unused when
            `init_bs_to_zero` is set).
        trainable: Whether the bias variables are trainable.
        init_bs_to_zero: Initialize the biases to zero instead of
            uniformly at random.
        graph_collection_name: Extra graph collection the variables are
            added to, next to GLOBAL_VARIABLES.
    """
    self.state_size = state_size
    self.num_timesteps = num_timesteps
    self.sigma_min = sigma_min
    self.dtype = dtype
    self.variance = variance
    self.mixing_coeff = mixing_coeff
    self.prior_mode_mean = prior_mode_mean

    # `range` instead of `xrange`, so this also runs on Python 3.
    if init_bs_to_zero:
        initializers = [tf.zeros_initializer for _ in range(num_timesteps)]
    else:
        initializers = [tf.random_uniform_initializer(seed=random_seed)
                        for _ in range(num_timesteps)]

    self.bs = [
        tf.get_variable(
            shape=[state_size],
            dtype=self.dtype,
            name="b_%d" % (t + 1),
            initializer=initializers[t],
            collections=[tf.GraphKeys.GLOBAL_VARIABLES,
                         graph_collection_name],
            trainable=trainable)
        for t in range(num_timesteps)
    ]
    # Bs[t] is the sum of bs[t:] (reverse cumulative sum over timesteps).
    self.Bs = tf.cumsum(self.bs, reverse=True, axis=0)
def melspecgrams_to_specgrams(self, melspecgrams):
    """Map mel-scaled specgrams back to linear frequency.

    Args:
        melspecgrams: Tensor of log magnitudes and instantaneous
            frequencies, shape [batch, time, freq, 2], mel scaling of
            frequencies.

    Returns:
        specgrams: Tensor of log magnitudes and instantaneous frequencies,
        shape [batch, time, freq, 2]. The input is returned untouched when
        no mel downscaling is configured.
    """
    if self._mel_downscale is None:
        return melspecgrams

    log_mel_mag2 = melspecgrams[:, :, :, 0]
    mel_inst_freq = melspecgrams[:, :, :, 1]

    to_linear = tf.to_float(self._mel_to_linear_matrix())
    squared_mag = tf.tensordot(tf.exp(log_mel_mag2), to_linear, 1)
    # The mel channel stores log(magnitude^2); halve it for log magnitude.
    log_magnitude = 0.5 * self._safe_log(squared_mag)

    # Integrate the instantaneous frequency into phase, project it to the
    # linear scale, then differentiate again.
    mel_phase = tf.cumsum(mel_inst_freq * np.pi, axis=-2)
    phase = tf.tensordot(mel_phase, to_linear, 1)
    inst_freq = spectral_ops.instantaneous_frequency(phase)

    channels = [
        log_magnitude[:, :, :, tf.newaxis],
        inst_freq[:, :, :, tf.newaxis],
    ]
    return tf.concat(channels, axis=-1)
def _precision_recall(n_gbboxes, n_detections, scores, tp, fp, scope=None):
    """Build precision and recall curves from scored detections.

    Args:
        n_gbboxes: Number of ground-truth boxes (recall denominator).
        n_detections: Number of detections to keep (`k` of `top_k`).
        scores: Detection scores.
        tp: True-positive indicators aligned with `scores`.
        fp: False-positive indicators aligned with `scores`.
        scope: Optional name scope.

    Returns:
        `(precision, recall)` as float64 tensors, one value per rank of the
        score-sorted detections.
    """
    with tf.name_scope(scope, 'prec_rec', [n_gbboxes, scores, tp, fp]):
        # Rank detections from highest to lowest score.
        scores, order = tf.nn.top_k(scores, k=n_detections, sorted=True)
        ranked_tp = tf.gather(tp, order)
        ranked_fp = tf.gather(fp, order)
        # Running counts along the ranking, accumulated in float64.
        dtype = tf.float64
        cum_tp = tf.cumsum(tf.cast(ranked_tp, dtype), axis=0)
        cum_fp = tf.cumsum(tf.cast(ranked_fp, dtype), axis=0)
        # `_safe_div` is a helper defined outside this block.
        recall = _safe_div(cum_tp, tf.cast(n_gbboxes, dtype), 'recall')
        precision = _safe_div(cum_tp, cum_tp + cum_fp, 'precision')
        return tf.tuple([precision, recall])
def reconstruction_loss(self, x_input, x_target, x_length, z=None):
    """Compute the per-sequence reconstruction loss.

    Args:
        x_input: Batch of decoder input sequences for teacher forcing,
            sized `[batch_size, max(x_length), output_depth]`.
        x_target: Batch of expected output sequences to compute loss
            against, sized `[batch_size, max(x_length), output_depth]`.
        x_length: Length of input/output sequences, sized `[batch_size]`.
        z: (Optional) Latent vectors. Required if model is conditional.
            Sized `[n, z_size]`.

    Returns:
        r_loss: The reconstruction loss for each sequence in the batch.
        metric_map: Map from metric name to tf.metrics return values for
            logging.
        truths: Ground truth labels.
        predictions: Predicted labels.
    """
    # The batch size must be statically known; it drives a Python loop.
    batch_size = x_input.shape[0].value

    has_z = z is not None
    if z is None:
        # Zero-width placeholder so the concat/tile below still work.
        z = tf.zeros([batch_size, 0])
    repeated_z = tf.tile(
        tf.expand_dims(z, axis=1), [1, tf.shape(x_input)[1], 1])

    sampling_probability_static = tensor_util.constant_value(
        self._sampling_probability)
    if sampling_probability_static == 0.0:
        # Teacher forcing: feed the ground truth (plus z) at every step.
        x_input = tf.concat([x_input, repeated_z], axis=2)
        helper = seq2seq.TrainingHelper(x_input, x_length)
    else:
        # Scheduled sampling: sometimes feed the model's own output.
        auxiliary = repeated_z if has_z else None
        helper = seq2seq.ScheduledOutputTrainingHelper(
            inputs=x_input,
            sequence_length=x_length,
            auxiliary_inputs=auxiliary,
            sampling_probability=self._sampling_probability,
            next_inputs_fn=self._sample)

    decoder_outputs = self._decode(z, helper=helper, x_input=x_input)

    flat_x_target = flatten_maybe_padded_sequences(x_target, x_length)
    flat_rnn_output = flatten_maybe_padded_sequences(
        decoder_outputs.rnn_output, x_length)
    flat_loss, metric_map, truths, predictions = (
        self._flat_reconstruction_loss(flat_x_target, flat_rnn_output))

    # Sum the flattened loss over each sequence: sequence i occupies the
    # range [bounds[i], bounds[i + 1]) of the flattened tensor.
    bounds = tf.concat([(0,), tf.cumsum(x_length)], axis=0)
    per_sequence = []
    for i in range(batch_size):
        begin, end = bounds[i], bounds[i + 1]
        per_sequence.append(tf.reduce_sum(flat_loss[begin:end]))
    r_loss = tf.stack(per_sequence)
    return r_loss, metric_map, truths, predictions
def remap_keys(sparse_tensor):
    """Replace the batch index of a SparseTensor's entries by stored keys.

    According to the original comments, the last entry of every batch row
    is a key that was concatenated onto the row in `decode_example`. Those
    key entries are removed, and the first index column of the remaining
    entries is replaced by the key of their row.

    Args:
        sparse_tensor: SparseTensor whose `indices` have the batch index in
            column 0 and the key/value position in column 1.

    Returns:
        A SparseTensor with the remapped indices, the surviving values and
        the original `dense_shape`.
    """
    bad_indices = sparse_tensor.indices
    bad_values = sparse_tensor.values
    batch_ids = bad_indices[:, 0]

    # Entries per batch row, minus the appended key entry.
    size = tf.segment_sum(
        data=tf.ones_like(batch_ids, dtype=tf.int64),
        segment_ids=batch_ids) - 1
    # Number of batch rows present (may be less than the nominal batch
    # size for a partially full batch).
    length = tf.shape(size, out_type=tf.int64)[0]

    # Row positions, within `indices`, of the appended key entries:
    # cumulative data counts plus one extra slot per preceding row.
    cum = tf.cumsum(size)
    length_range = tf.range(start=0, limit=length, delta=1, dtype=tf.int64)
    cum_range = cum + length_range

    # The keys themselves (column 1 of the key rows).
    gathered_indices = tf.squeeze(tf.gather(bad_indices, cum_range)[:, 1])

    # Mark every row of `indices` that is NOT a key row: tile the row
    # numbers against the key-row positions, compare, reduce with "any",
    # then negate.
    row_numbers = tf.range(
        tf.shape(bad_indices, out_type=tf.int64)[0], dtype=tf.int64)
    tile_multiples = tf.concat(
        [tf.ones(tf.shape(tf.shape(row_numbers)), dtype=tf.int64),
         tf.shape(cum_range, out_type=tf.int64)],
        axis=0)
    row_numbers_tiled = tf.tile(
        tf.expand_dims(row_numbers, -1), tile_multiples)
    is_data_row = tf.logical_not(
        tf.reduce_any(tf.equal(row_numbers_tiled, cum_range), -1))

    selected_indices = tf.boolean_mask(
        tensor=bad_indices, mask=is_data_row, axis=0)
    selected_values = tf.boolean_mask(
        tensor=bad_values, mask=is_data_row, axis=0)

    def repeat_key(i):
        # Key of batch row i, repeated once per data entry of that row.
        return tf.tile(
            input=tf.expand_dims(gathered_indices[i], -1),
            multiples=tf.expand_dims(size[i], -1))

    # The rows are jagged, so the repeated keys are concatenated row by
    # row in a while loop, starting from row 0.
    def loop_body(i, tensor_grow):
        grown = tf.concat(values=[tensor_grow, repeat_key(i)], axis=0)
        return i + 1, grown

    _, result = tf.while_loop(
        lambda i, tensor_grow: i < length,
        loop_body,
        [tf.constant(1, dtype=tf.int64), repeat_key(0)])

    # New indices: (key, original second column).
    selected_indices_fixed = tf.concat(
        [tf.expand_dims(result, -1),
         tf.expand_dims(selected_indices[:, 1], -1)],
        axis=1)

    remapped_sparse_tensor = tf.SparseTensor(
        indices=selected_indices_fixed,
        values=selected_values,
        dense_shape=sparse_tensor.dense_shape)
    return remapped_sparse_tensor
def crappy_plot(val, levels):
    """Rasterize integer curves into a 0/1 "plot" tensor.

    For every position the curve is drawn from the lowest to the highest of
    the value itself and the midpoints towards its left and right
    neighbours, so consecutive points are visually connected.

    Args:
        val: Integer tensor of curve values, with the curve running along
            axis 1; the size of axis 1 must be statically known.
        levels: Number of discrete levels (depth of the one-hot encoding).

    Returns:
        Tensor with a new `levels` axis inserted at position 1, holding 1
        for the levels in [low, high) at each position and 0 elsewhere.
    """
    x_len = val.get_shape().as_list()[1]
    # Neighbour values, repeating the edge value at the borders.
    # `tf.concat` takes (values, axis) since TensorFlow 1.0; the original
    # call used the older (axis, values) order.
    left_val = tf.concat((val[:, 0:1], val[:, 0:x_len - 1]), 1)
    right_val = tf.concat((val[:, 1:], val[:, x_len - 1:]), 1)
    left_mean = (val + left_val) // 2
    right_mean = (val + right_val) // 2
    low_val = tf.minimum(tf.minimum(left_mean, right_mean), val)
    high_val = tf.maximum(tf.maximum(left_mean, right_mean), val + 1)
    # +1 at `low` and -1 at `high`, accumulated along the level axis,
    # yields ones exactly on [low, high).
    edges = (tf.one_hot(low_val, levels, axis=1)
             - tf.one_hot(high_val, levels, axis=1))
    return tf.cumsum(edges, axis=1)
def __init__(self, requests, expert_capacity):
    """Create a TruncatingDispatcher.

    Args:
        requests: a boolean `Tensor` of shape `[batch, length, num_experts]`.
            Alternatively, a float or int Tensor containing zeros and ones.
        expert_capacity: a Scalar - maximum number of examples per expert
            per batch element.

    Returns:
        a TruncatingDispatcher
    """
    self._requests = tf.to_float(requests)
    self._expert_capacity = expert_capacity
    capacity_f = tf.to_float(expert_capacity)
    self._batch, self._length, self._num_experts = tf.unstack(
        tf.shape(self._requests), num=3)

    # [batch, length, num_experts]: how many earlier positions already
    # requested the same expert.
    position_in_expert = tf.cumsum(self._requests, axis=1, exclusive=True)
    # [batch, length, num_experts]: requests that still fit the capacity.
    within_capacity = tf.to_float(tf.less(position_in_expert, capacity_f))
    self._gates = self._requests * within_capacity

    batch_index = tf.reshape(
        tf.to_float(tf.range(self._batch)), [self._batch, 1, 1])
    length_index = tf.reshape(
        tf.to_float(tf.range(self._length)), [1, self._length, 1])
    expert_index = tf.reshape(
        tf.to_float(tf.range(self._num_experts)), [1, 1, self._num_experts])

    # Position in a Tensor with shape
    # [batch * num_experts * expert_capacity].
    slots_per_batch = tf.to_float(self._num_experts) * capacity_f
    flat_position = (
        position_in_expert
        + batch_index * slots_per_batch
        + expert_index * capacity_f)

    # Scatter (length position + 1) into its slot; rejected requests
    # contribute 0, so an empty slot stays at 0.
    scattered = tf.unsorted_segment_sum(
        data=tf.reshape((length_index + 1.0) * self._gates, [-1]),
        segment_ids=tf.to_int32(tf.reshape(flat_position, [-1])),
        num_segments=self._batch * self._num_experts * expert_capacity)
    # [batch, num_experts, expert_capacity]
    scattered = tf.reshape(
        scattered, [self._batch, self._num_experts, expert_capacity])

    # Each element is 0.0 (empty slot) or 1.0 (filled slot).
    self._nonpadding = tf.minimum(scattered, 1.0)
    # Undo the +1 shift; each element is an integer in [0, length).
    length_positions = tf.nn.relu(scattered - 1.0)
    # [batch, num_experts, expert_capacity], with values in
    # [0, batch * length).
    batch_offset = (
        tf.reshape(tf.to_float(tf.range(self._batch)), [-1, 1, 1])
        * tf.to_float(self._length))
    self._flat_indices = tf.to_int32(length_positions + batch_offset)
    self._indices = tf.to_int32(length_positions)
def systematic_resampling(log_weights, states, n, b):
    """Resample states with systematic resampling.

    A single uniform draw per batch element is shared by all `n` strata of
    the weight CDF, which selects one ancestor per stratum.

    Args:
        log_weights: A (n x b) Tensor representing a batch of b logits for
            n-ary Categorical distribution.
        states: A list of (b*n x d) Tensors that will be resampled from the
            groups of every n-th row.
        n: Number of categories (particles) per batch element.
        b: Batch size.

    Returns:
        resampled_states: A list of (b*n x d) Tensors resampled via
            systematic resampling.
        log_probs: A (n x b) Tensor of the log probabilities of the
            ancestry decisions.
        resampling_parameters: The Tensor of parameters of the resampling
            distribution.
        ancestors: An (n x b) Tensor of integral indices (stored as floats)
            representing the ancestry decisions.
        resampling_dist: The distribution object for resampling.
    """
    log_weights = tf.convert_to_tensor(log_weights)
    states = [tf.convert_to_tensor(state) for state in states]

    # Work batch-major: [b, n], repeated once per stratum -> [b, n, n].
    log_weights = tf.transpose(log_weights, perm=[1, 0])
    probs = tf.nn.softmax(
        tf.tile(tf.expand_dims(log_weights, axis=1), [1, n, 1]))
    # CDF with a leading zero: [b, n, n + 1].
    cdfs = tf.concat(
        [tf.zeros((b, n, 1), dtype=probs.dtype), tf.cumsum(probs, axis=2)],
        2)

    # Stratum i covers [i / n, (i + 1) / n); rescale the CDF so that each
    # stratum maps to [0, 1].
    bins = tf.range(n, dtype=probs.dtype) / n
    bins = tf.tile(tf.reshape(bins, [1, -1, 1]), [b, 1, n + 1])
    strat_cdfs = tf.minimum(tf.maximum((cdfs - bins) * n, 0.0), 1.0)
    resampling_parameters = strat_cdfs[:, :, 1:] - strat_cdfs[:, :, :-1]
    resampling_dist = tf.contrib.distributions.Categorical(
        probs=resampling_parameters, allow_nan_stats=True)

    # One uniform draw per batch element, shared across strata.
    U = tf.random_uniform((b, 1, 1), dtype=probs.dtype)
    # Inverse-CDF lookup: count the CDF entries lying below the draw.
    ancestors = tf.stop_gradient(
        tf.reduce_sum(tf.to_float(U > strat_cdfs[:, :, 1:]), axis=-1))
    log_probs = resampling_dist.log_prob(ancestors)

    # Back to the caller's (n x b) layout.
    ancestors = tf.transpose(ancestors, perm=[1, 0])
    log_probs = tf.transpose(log_probs, perm=[1, 0])

    # Row of (ancestor a, batch element j) in a (b*n x d) state is
    # a * b + j. `tf.gather` needs integer indices, so the float-valued
    # positions are cast first; the values are exact integers.
    offset = tf.expand_dims(tf.range(b, dtype=probs.dtype), 0)
    ancestor_inds = tf.to_int32(tf.reshape(ancestors * b + offset, [-1]))
    resampled_states = [tf.gather(state, ancestor_inds) for state in states]
    return (resampled_states, log_probs, resampling_parameters, ancestors,
            resampling_dist)
def compute(x):
    """Running class frequencies over one window of `onehot`.

    `onehot`, `window` and `num_classes` come from the enclosing scope.

    Args:
        x: Triple `(batch, start, end)` selecting rows `start:end` of
            `onehot[batch]`.

    Returns:
        Cumulative class counts of the window (zero-padded up to `window`
        rows when shorter), each row divided by its 1-based position.
    """
    batch, start, end = x[0], x[1], x[2]
    rows = onehot[batch][start:end]
    missing = tf.maximum(window - (end - start), 0)
    padded_onehot = tf.concat(
        [rows, tf.zeros([missing, num_classes])], axis=0)
    running_counts = tf.cumsum(padded_onehot)
    # Column vector 1..window, so row i is divided by i + 1.
    steps = tf.expand_dims(tf.range(1, window + 1), -1)
    return running_counts / tf.cast(steps, running_counts.dtype)
def _effective_sample_size_single_state(states, filter_beyond_lag,
                                        filter_threshold):
    """ESS computation for one single Tensor argument.

    Args:
        states: Tensor of samples; axis 0 indexes the sequence.
        filter_beyond_lag: Maximum lag of the auto-correlation that is
            used; `None` means the full sequence.
        filter_threshold: If not `None`, auto-correlation values from the
            first lag that falls below this threshold onwards are zeroed.

    Returns:
        Tensor with the effective sample size, reduced over axis 0.
    """
    with tf.name_scope(
            'effective_sample_size_single_state',
            values=[states, filter_beyond_lag, filter_threshold]):
        states = tf.convert_to_tensor(states, name='states')
        dt = states.dtype

        # filter_beyond_lag == None ==> auto_corr is the full sequence.
        auto_corr = stats.auto_correlation(
            states, axis=0, max_lags=filter_beyond_lag)

        if filter_threshold is not None:
            filter_threshold = tf.convert_to_tensor(
                filter_threshold, dtype=dt, name='filter_threshold')
            # Build a mask that is 1 up to (excluding) the first lag whose
            # auto-correlation is below the threshold and 0 afterwards,
            # i.e. [1, 1, ..., 0, 0, ...] along dimension zero.
            # Example: auto_corr = [1, 0.5, 0.0, 0.3], threshold = 0.2
            #   below        = [0, 0, 1, 0]
            #   running hits = [0, 0, 1, 1]
            #   mask         = [1, 1, 0, 0]
            below = tf.cast(auto_corr < filter_threshold, dtype=dt)
            running_hits = tf.cumsum(below, axis=0)
            mask = tf.maximum(1. - running_hits, 0.)
            auto_corr = auto_corr * mask

        # With R[k] := auto_corr[k, ...],
        # ESS = N / {1 + 2 * Sum_{k=1}^N (N - k) / N * R[k]}
        #     = N / {-1 + 2 * Sum_{k=0}^N (N - k) / N * R[k]} (R[0] = 1)
        #     approx N / {-1 + 2 * Sum_{k=0}^M (N - k) / N * R[k]}
        # where M is the filter_beyond_lag truncation point chosen above.
        n = _axis_size(states, axis=0)
        k = tf.range(0., _axis_size(auto_corr, axis=0))
        nk_factor = (n - k) / n

        # Give the factor shape [M, 1, ..., 1] so it broadcasts against
        # auto_corr; use the static rank when it is known.
        static_rank = auto_corr.shape.ndims
        if static_rank is None:
            new_shape = tf.concat(
                ([-1], tf.ones([tf.rank(auto_corr) - 1], dtype=tf.int32)),
                axis=0)
        else:
            new_shape = [-1] + [1] * (static_rank - 1)
        nk_factor = tf.reshape(nk_factor, new_shape)

        weighted_sum = tf.reduce_sum(nk_factor * auto_corr, axis=0)
        return n / (-1 + 2 * weighted_sum)
def _compare(self, x, axis, reverse, use_gpu=False):
    """Check `tf.cumsum` against a NumPy reference.

    Args:
        x: Input array.
        axis: Axis along which to accumulate.
        reverse: Whether to accumulate from the end of the axis.
        use_gpu: Whether to run the TensorFlow op on GPU.
    """
    # NumPy has no reversed cumsum: flip, accumulate, flip back.
    # `numpy_reverse` is a helper defined outside this block.
    np_out = x
    if reverse:
        np_out = numpy_reverse(np_out, axis)
    np_out = np.cumsum(np_out, axis=axis)
    if reverse:
        np_out = numpy_reverse(np_out, axis)

    with self.test_session(use_gpu=use_gpu):
        # `reverse` must be passed by keyword: the third positional
        # parameter of `tf.cumsum` is `exclusive`.
        tf_out = tf.cumsum(x, axis, reverse=reverse).eval()

    self.assertAllClose(np_out, tf_out)
def shift_values(values, discount, rollout, final_values=0.0):
    """Shift values up by `rollout` steps along axis 0, with discounting.

    Rows that would be read from beyond the last row are computed from
    `final_values` instead.

    Args:
        values: Rank-2 tensor; axis 0 is shifted.
        discount: Discount factor.
        rollout: Number of steps to shift by.
        final_values: Value(s) standing in for rows beyond the end.

    Returns:
        Tensor with `discount ** rollout * values[rollout:]` followed by
        `rollout` rows derived from `final_values`.
    """
    # Counts down rollout - 1, ..., 1, 0 along axis 0.
    ones = tf.ones_like(values[:rollout, :])
    steps_left = tf.cumsum(ones, 0, exclusive=True, reverse=True)
    tail = tf.expand_dims(final_values, 0) * discount ** steps_left
    head = discount ** rollout * values[rollout:, :]
    return tf.concat([head, tail], 0)
def infer_length(seq, eos_ix, time_major=False, dtype=tf.int32):
    """Compute sequence lengths from output indices and the EOS code.

    A sequence's length counts every token up to and including its first
    EOS; a sequence without EOS gets the full time extent.

    :param seq: tf matrix [time,batch] if time_major else [batch,time]
    :param eos_ix: integer index of end-of-sentence token
    :param time_major: whether time is the first axis of `seq`
    :param dtype: type used for the counts and the result
    :returns: lengths, vector of shape [batch] and type `dtype`
    """
    time_axis = 0 if time_major else 1
    is_eos = tf.cast(tf.equal(seq, eos_ix), dtype)
    # Number of EOS tokens strictly before each position.
    eos_before = tf.cumsum(is_eos, axis=time_axis, exclusive=True)
    # Positions not preceded by any EOS belong to the sequence.
    inside = tf.cast(tf.equal(eos_before, 0), dtype)
    return tf.reduce_sum(inside, axis=time_axis)
def _subsample_selection_to_desired_neg_pos_ratio(
        self, indices, match, max_negatives_per_positive,
        min_negatives_per_image=0):
    """Subsample a collection of selected indices to a desired neg:pos ratio.

    Takes a subset of M indices (indexing into a larger collection of N
    anchors, M<N) labeled positive/negative through a Match object (matched
    indices are positive, unmatched ones negative). All positives are kept,
    together with up to the first K negatives, where
    K=floor(max_negatives_per_positive * num_positives), but at least
    `min_negatives_per_image`.

    For example, if indices=[2, 4, 5, 7, 9, 10] (indexing into 12 anchors),
    with positives=[2, 5], negatives=[4, 7, 9, 10] and
    max_negatives_per_positive=1, the returned subset is [2, 4, 5, 7].

    Args:
        indices: An integer tensor of shape [M] representing a collection
            of selected anchor indices.
        match: A matcher.Match object encoding the match between anchors
            and groundtruth boxes for a given image, with rows of the Match
            object corresponding to groundtruth boxes and columns
            corresponding to anchors.
        max_negatives_per_positive: (float) maximum number of negatives for
            each positive anchor.
        min_negatives_per_image: minimum number of negative anchors for a
            given image. Allows sampling negatives in an image without any
            positive anchors.

    Returns:
        selected_indices: An integer tensor of shape [M'] representing a
            collection of selected anchor indices with M' <= M.
        num_positives: An integer tensor representing the number of
            positive examples in the selected set of indices.
        num_negatives: An integer tensor representing the number of
            negative examples in the selected set of indices.
    """
    is_positive = tf.gather(match.matched_column_indicator(), indices)
    is_negative = tf.gather(match.unmatched_column_indicator(), indices)
    num_positives = tf.reduce_sum(tf.to_int32(is_positive))

    ratio_budget = tf.to_int32(
        max_negatives_per_positive * tf.to_float(num_positives))
    max_negatives = tf.maximum(min_negatives_per_image, ratio_budget)

    # The running count of negatives stays within the budget exactly up to
    # the K-th negative.
    negatives_so_far = tf.cumsum(tf.to_int32(is_negative))
    within_budget = tf.less_equal(negatives_so_far, max_negatives)

    kept_positions = tf.where(tf.logical_or(is_positive, within_budget))
    num_negatives = tf.size(kept_positions) - num_positives
    selected_indices = tf.reshape(tf.gather(indices, kept_positions), [-1])
    return (selected_indices, num_positives, num_negatives)
def get_train_choice(state_ph, var_dict, random_t, mask, dropout_keep_prob):
    """Build the sampled (training) and greedy choices from Q scores.

    Args:
        state_ph: State input forwarded to `get_q`.
        var_dict: Variables forwarded to `get_q`.
        random_t: Tensor added to the scores before the greedy argmax.
        mask: 0/1 tensor multiplied into the weights; entries with 0 get
            zero sampling weight.
        dropout_keep_prob: Dropout keep probability forwarded to `get_q`.

    Returns:
        score: Raw output of `get_q`.
        weight: Masked sampling weights derived from the clipped scores.
        train_choice: Index per row sampled proportionally to `weight`.
        cal_choice: Argmax per row of the masked `score + random_t`.
    """
    # `get_q` is defined outside this block.
    score = get_q(state_ph, var_dict, dropout_keep_prob)

    # Sampling weights: clip, shift to be non-negative, normalise by the
    # global maximum, then squeeze into [0.05, 1] before masking.
    clipped = tf.maximum(score, -2.5)
    clipped = tf.minimum(clipped, 1.5)
    shifted = clipped - tf.reduce_min(clipped)
    shifted = shifted + 0.00001 * mask
    scaled = shifted / tf.reduce_max(shifted)
    scaled = scaled * (1-0.05)
    scaled = scaled + 0.05
    weight = scaled * mask

    # Inverse-CDF sampling: entry j owns the interval [low_j, high_j) of
    # the normalised cumulative weights; pick the one containing r.
    weight_sum = tf.reduce_sum(weight, reduction_indices=[1])
    upper = tf.cumsum(weight, axis=1, exclusive=False)
    lower = tf.cumsum(weight, axis=1, exclusive=True)
    row_total = tf.reshape(weight_sum, [-1, 1])
    upper_norm = upper / row_total
    lower_norm = lower / row_total
    r = tf.random_uniform(tf.shape(row_total), dtype=tf.float32)
    below_upper = tf.less(r, upper_norm)
    above_lower = tf.less_equal(lower_norm, r)
    hit = tf.to_float(tf.logical_and(below_upper, above_lower))
    train_choice = tf.argmax(hit, dimension=1)

    # Greedy choice on the perturbed scores; adding `mask` afterwards
    # lifts every allowed entry above the masked-out (zero) entries.
    perturbed = score + random_t
    perturbed = perturbed - tf.reduce_min(perturbed)
    perturbed = perturbed * mask
    perturbed = perturbed + mask
    cal_choice = tf.argmax(perturbed, dimension=1)

    return score, weight, train_choice, cal_choice
def test_readme_example(self):
    """Round-trip random data through range encode/decode."""
    data = tf.random.uniform((128, 128), 0, 10, dtype=tf.int32)

    # Build the CDF from the data's own histogram, with a leading zero.
    # The counts add up to 128 * 128 = 2**14, presumably why precision=14.
    histogram = tf.bincount(data, minlength=10, maxlength=10)
    cdf = tf.pad(tf.cumsum(histogram, exclusive=False), [[1, 0]])
    cdf = tf.reshape(cdf, [1, 1, -1])

    data = tf.cast(data, tf.int16)
    encoded = range_coding_ops.range_encode(data, cdf, precision=14)
    decoded = range_coding_ops.range_decode(
        encoded, tf.shape(data), cdf, precision=14)

    with self.cached_session() as sess:
        original, roundtrip = sess.run((data, decoded))
        self.assertAllEqual(original, roundtrip)
def _randomize(coeffs, radixes, seed=None):
    """Applies the Owen (2017) randomization to the coefficients.

    Args:
        coeffs: Digit coefficients; the last axis indexes the coefficient
            position.
        radixes: Radix of each dimension.
        seed: Seed for the random permutations.

    Returns:
        The permuted coefficients, cast back to the dtype of `coeffs`.
    """
    given_dtype = coeffs.dtype
    coeffs = tf.to_int32(coeffs)
    num_coeffs = tf.shape(coeffs)[-1]
    radixes = tf.reshape(tf.to_int32(radixes), shape=[-1])

    stream = distributions.SeedStream(seed, salt='MCMCSampleHaltonSequence2')
    # `_get_permutations` is defined outside this block; its result is
    # flattened so that one gather can index all permutations.
    flat_perms = tf.reshape(
        _get_permutations(num_coeffs, radixes, seed=stream()), shape=[-1])

    # Offset of every (dimension, coefficient position) pair in the flat
    # permutation table: the start of the dimension's radix block plus one
    # full set of blocks per coefficient position.
    radix_sum = tf.reduce_sum(radixes)
    block_starts = tf.reshape(
        tf.cumsum(radixes, exclusive=True), shape=[-1, 1])
    offsets = block_starts + tf.range(num_coeffs) * radix_sum

    permuted_coeffs = tf.gather(flat_perms, coeffs + offsets)
    return tf.cast(permuted_coeffs, dtype=given_dtype)
def call(self, inputs, mask=None):
    """Emit a ramp from `self.start` to `self.stop` along axis 1.

    Args:
        inputs: Input tensor; only its shape and dtype are used.
        mask: Optional mask with the last axis of `inputs` removed. When
            absent, every position counts as valid.

    Returns:
        Tensor with a trailing axis of size 1 holding, at each valid
        position, the linearly interpolated value between `self.start` and
        `self.stop`; masked positions are 0.
    """
    if mask is not None:
        mask = K.cast(mask, K.dtype(inputs))
    else:
        # All-ones mask built from the input, dropping its last axis.
        mask = 1 + K.sum(K.zeros_like(inputs), axis=-1)

    # Number of valid steps minus one, clamped to at least 1 so that the
    # division below is safe.
    last_index = K.sum(mask, axis=1) - 1
    last_index = K.maximum(last_index, K.ones_like(last_index))
    last_index = K.expand_dims(last_index)

    # 0-based rank of each valid position.
    rank = tf.cumsum(mask, axis=1) - 1
    ramp = self.start + (self.stop - self.start) * rank / last_index
    return K.expand_dims(mask * ramp)
def __init__(self, state_size, num_timesteps, sigma_min=1e-5,
             dtype=tf.float32):
    """Create the per-timestep variables and linear modules.

    Args:
        state_size: Length of every per-timestep vector variable and
            output size of each linear module.
        num_timesteps: Number of timesteps to create parameters for.
        sigma_min: Stored on the instance as `sigma_min`.
        dtype: Type of the created variables.
    """
    self.state_size = state_size
    self.num_timesteps = num_timesteps
    self.sigma_min = sigma_min
    self.dtype = dtype

    def zero_vectors(prefix):
        # One zero-initialised [state_size] variable per timestep, named
        # "<prefix>_1" ... "<prefix>_<num_timesteps>".
        # `range` instead of `xrange`, so this also runs on Python 3.
        return [
            tf.get_variable(
                shape=[state_size],
                dtype=self.dtype,
                name="%s_%d" % (prefix, t + 1),
                initializer=tf.zeros_initializer)
            for t in range(num_timesteps)
        ]

    self.bs = zero_vectors("b")
    # Bs[t] is the sum of bs[t:] (reverse cumulative sum over timesteps).
    self.Bs = tf.cumsum(self.bs, reverse=True, axis=0)
    self.q_mus = [
        snt.Linear(output_size=state_size) for _ in range(num_timesteps)
    ]
    self.q_sigmas = zero_vectors("q_sigma")
    self.r_mus = zero_vectors("r_mu")
    self.r_sigmas = zero_vectors("r_sigma")
def bottleneck(self, x):  # pylint: disable=arguments-differ
    """Discretize `x` and, in training, add position-dependent bit flips.

    Args:
        x: Tensor to pass through the bottleneck.

    Returns:
        `(x, loss)`: the bottleneck output and the bottleneck loss. With
        `hparams.unordered` set, this defers to the parent implementation.
    """
    hparams = self.hparams
    if hparams.unordered:
        return super(AutoencoderOrderedDiscrete, self).bottleneck(x)

    # Disable the built-in noise for the inner call; ordered noise is
    # added below. The finally clause restores the shared hparams even if
    # the inner call raises.
    noise = hparams.bottleneck_noise
    hparams.bottleneck_noise = 0.0
    try:
        x, loss = discretization.parametrized_bottleneck(x, hparams)
    finally:
        hparams.bottleneck_noise = noise

    if hparams.mode == tf.estimator.ModeKeys.TRAIN:
        # Choose p such that p^bottleneck_bits = 1 - noise / 2, i.e.
        # log(p) * bottleneck_bits = log(1 - noise / 2).
        log_p = tf.log(1 - float(noise) / 2) / float(hparams.bottleneck_bits)
        # Along the last axis, position k (1-based) gets 1 - p^k: the
        # flip probability grows with the position.
        noise_mask = 1.0 - tf.exp(tf.cumsum(tf.zeros_like(x) + log_p, axis=-1))
        # Uniform samples to compare against the per-position threshold.
        ordered_noise = tf.random_uniform(tf.shape(x))
        # 1.0 where the bit is kept (probability p^k), 0.0 where flipped.
        ordered_noise = tf.to_float(tf.less(noise_mask, ordered_noise))
        # Map {0, 1} to {-1, +1} and flip the selected entries of x.
        x *= 2.0 * ordered_noise - 1
    return x, loss
def specgrams_to_stfts(self, specgrams):
  """Turns log-magnitude / frequency spectrograms back into complex STFTs.

  Args:
    specgrams: Tensor of shape [batch, time, freq, 2]; channel 0 is the log
      magnitude, channel 1 is the instantaneous frequency (or the phase when
      `self._ifreq` is falsy), both scaled by pi on use.

  Returns:
    stfts: Complex64 tensor of stft, shape [batch, time, freq, 1].
  """
  log_magnitude, angle_channel = specgrams[:, :, :, 0], specgrams[:, :, :, 1]
  magnitude = tf.exp(log_magnitude)

  angle = angle_channel * np.pi
  if self._ifreq:
    # Integrate along axis -2 (time, once the channel axis has been sliced
    # away) to turn instantaneous frequency into phase.
    angle = tf.cumsum(angle, axis=-2)

  stfts = spectral_ops.polar2rect(magnitude, angle)
  return stfts[:, :, :, tf.newaxis]
keep_dims=False, name=None) 对tensor中各个元素求逻辑‘与’ # ‘x’ is # [[True, True] # [False, False]] tf.reduce_all(x) ==> False tf.reduce_all(x, 0) ==> [False, False] tf.reduce_all(x, 1) ==> [True, False] tf.reduce_any(input_tensor, reduction_indices=None, keep_dims=False, name=None) 对tensor中各个元素求逻辑‘或’ tf.accumulate_n(inputs, shape=None, tensor_dtype=None, name=None) 计算一系列tensor的和 # tensor ‘a’ is [[1, 2], [3, 4]] # tensor ‘b’ is [[5, 0], [0, 6]] tf.accumulate_n([a, b, a]) ==> [[7, 4], [6, 14]] tf.cumsum(x, axis=0, exclusive=False, reverse=False, name=None) 求累积和 tf.cumsum([a, b, c]) ==> [a, a + b, a + b + c] tf.cumsum([a, b, c], exclusive=True) ==> [0, a, a + b] tf.cumsum([a, b, c], reverse=True) ==> [a + b + c, b + c, c] tf.cumsum([a, b, c], exclusive=True, reverse=True) ==> [b + c, c, 0] 六、分割(Segmentation) tf.segment_sum(data, segment_ids, name=None) 根据segment_ids的分段计算各个片段的和 其中segment_ids为一个size与data第一维相同的tensor 其中id为int型数据,最大id不大于size c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]]) tf.segment_sum(c, tf.constant([0, 0, 1])) ==>[[0 0 0 0] [5 6 7 8]] 上面例子分为[0,1]两个id,对相同id的data相应数据进行求和, 并放入结果的相应id中,
def _compute_loss(self, logits, k=3, beta=0.1):
    """Compute optimization loss.

    The loss blends the usual sparse softmax cross entropy with an
    experimental top-k term:

        crossent = (1 - beta) * crossent_orig + beta * crossent_impl

    where `crossent_impl` is the mean of two softmax cross entropies, one
    driven by top-k false negatives and one by top-k false positives. The
    blended value is masked by sequence length, summed and divided by
    `self.batch_size`.

    Args:
        logits: Rank-3 logits tensor; its last (index 2) dimension is the
            vocabulary. The cumulative counts below accumulate along axis 0.
            NOTE(review): this presumes time-major
            [max_time, batch_size, vocab] logits -- confirm for
            batch-major use.
        k: Number of top entries compared between the remaining-target
            counts and the logits.
        beta: Weight of the experimental term in the blend.

    Returns:
        Scalar loss tensor.
    """
    target_output = self.iterator.target_output
    if self.time_major:
        target_output = tf.transpose(target_output)
    max_time = self.get_max_time(target_output)

    depth = logits.get_shape()[2]
    vocab_size = tf.cast(depth, tf.int32)
    full_shape = [max_time, self.batch_size, vocab_size]

    # Running count of every class predicted strictly before each step:
    # cumulative one-hot argmax, shifted down by one step along axis 0.
    preds = tf.argmax(logits, axis=2)
    one_hot_preds = tf.one_hot(preds, depth)
    cum_preds = tf.cumsum(one_hot_preds, axis=0)
    cum_preds = tf.slice(cum_preds, [0, 0, 0],
                         [max_time - 1, self.batch_size, vocab_size])
    cum_preds = tf.concat(
        [tf.zeros([1, self.batch_size, vocab_size]), cum_preds], axis=0)

    # Per-class target counts, repeated for every step, minus what has
    # already been predicted (clamped at zero).
    one_hot_targets = tf.one_hot(target_output, depth)
    dist = tf.reduce_sum(one_hot_targets, axis=0, keep_dims=True)
    rep_dist = tf.tile(dist, [max_time, 1, 1])
    target_dist = tf.subtract(rep_dist, cum_preds)
    target_dist = tf.maximum(target_dist, tf.zeros(full_shape))

    # Force the count of class index 2 to 1 at every step (the original
    # comment calls this "setting eos count to 1").
    target_dist_first = tf.slice(target_dist, [0, 0, 0],
                                 [max_time, self.batch_size, 2])
    target_dist_last = tf.slice(
        target_dist, [0, 0, 3],
        [max_time, self.batch_size, vocab_size - 3])
    target_dist = tf.concat(
        [target_dist_first,
         tf.ones([max_time, self.batch_size, 1])], axis=2)
    target_dist = tf.concat([target_dist, target_dist_last], axis=2)

    # Precision-at-k style term:
    #   loss at false negatives = -log(p)
    #   loss at false positives = -log(1 - p)
    _, top_k_targets = tf.nn.top_k(target_dist, k=k)
    _, top_k_predicted = tf.nn.top_k(logits, k=k)
    false_negatives = tf.sets.set_difference(top_k_targets, top_k_predicted)
    false_positives = tf.sets.set_difference(top_k_predicted, top_k_targets)

    # NOTE(review): sparse_tensor_to_dense pads with 0 by default, so padded
    # slots are one-hot encoded as class 0 below -- confirm this is intended.
    dense_fn = tf.sparse_tensor_to_dense(false_negatives)
    one_hot_fn = tf.one_hot(indices=dense_fn, depth=depth, on_value=1.0)
    target_k = tf.reduce_sum(one_hot_fn, axis=2)

    dense_fp = tf.sparse_tensor_to_dense(false_positives)
    one_hot_fp = tf.one_hot(indices=dense_fp, depth=depth, on_value=1.0)
    predicted_k = tf.reduce_sum(one_hot_fp, axis=2)

    one_minus_logits = tf.subtract(tf.ones(full_shape), logits)

    crossent_fn = tf.nn.softmax_cross_entropy_with_logits(
        labels=target_k, logits=logits)
    crossent_fp = tf.nn.softmax_cross_entropy_with_logits(
        labels=predicted_k, logits=one_minus_logits)
    crossent_impl = (crossent_fn + crossent_fp) / 2.0

    crossent_orig = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=target_output, logits=logits)
    target_weights = tf.sequence_mask(
        self.iterator.target_sequence_length, max_time, dtype=logits.dtype)
    if self.time_major:
        target_weights = tf.transpose(target_weights)

    # Weighted combination of the original cross entropy and the
    # experimental one.
    crossent = (1.0 - beta) * crossent_orig + beta * crossent_impl

    loss = tf.reduce_sum(crossent * target_weights) / tf.to_float(
        self.batch_size)
    return loss
def initialize(self, sess, summary_writer, omega=5):
    """Build the placeholders, objective, summaries and training op.

    Args:
        sess: Session; stored on the instance, passed to `self.model`, and
            its graph is added to `summary_writer`.
        summary_writer: Summary writer; stored on the instance and passed to
            `self.model`.
        omega: Forwarded to `self.model` as `initial_omega`.
    """
    self.summary_writer = summary_writer
    self.sess = sess

    obs_dim = (None, self.env.observation_space_size)

    # Placeholders.
    self.mask = U.get_placeholder("mask", self.dtype, (None, 1))
    self.observations_ph = U.get_placeholder(
        dtype=self.dtype, name="obs", shape=obs_dim)
    # One-hot encoding of the action.
    self.actions_one_hot_ph = U.get_placeholder(
        name="action_one_hot",
        dtype=self.dtype,
        shape=(None, self.env.action_space_size))
    # Per the original notes: a (-1, 0, +1) or (-1, +1) valued tensor holding
    # either the action actually taken or all possible actions.
    self.actions_ph = U.get_placeholder(
        name="action", dtype=self.dtype, shape=(None, self.env.n_actions))
    self.rewards_ph = U.get_placeholder(
        dtype=self.dtype, name="rewards", shape=(None, 1))
    self.returns_ph = U.get_placeholder(
        name="returns", dtype=self.dtype, shape=(None, ))
    self.timesteps_ph = U.get_placeholder(
        name="timestep", dtype=self.dtype, shape=(None, ))
    # Next state, centered on the previous one.
    self.next_states_ph = U.get_placeholder(
        name="next_states", dtype=self.dtype, shape=obs_dim)

    self.optimizer = get_tf_optimizer("adam")

    # Policy and model outputs; only the probability outputs are used below.
    theta = np.random.rand()
    policy_tf, _ = self.policy(self.observations_ph, theta)
    _, model_prob_tf = self.model(
        self.observations_ph,
        self.actions_ph,
        self.next_states_ph,
        initial_omega=omega,
        actions_one_hot=self.actions_one_hot_ph,
        sess=sess,
        summary_writer=summary_writer)

    # Probability of the taken action under the policy and under the model.
    taken_policy_prob = tf.reduce_sum(
        policy_tf * self.actions_one_hot_ph, axis=1, keepdims=True)
    taken_model_prob = tf.reduce_sum(
        model_prob_tf * self.actions_one_hot_ph, axis=1, keepdims=True)
    log_prob = tf.log(taken_model_prob * taken_policy_prob + 1e-20)

    def per_trajectory(column):
        # Transpose the column, cut it into n_trajectories pieces along
        # axis 1 and stack the pieces as rows: one row per trajectory.
        pieces = tf.split(tf.transpose(column), self.n_trajectories, axis=1)
        return tf.concat(pieces, axis=0)

    traj_log_probs = per_trajectory(log_prob)
    traj_mask = per_trajectory(self.mask)
    traj_rewards = per_trajectory(self.rewards_ph)

    # Cumulative log-probability from step 0 to t along each trajectory,
    # masked, then weighted by the reward at each step.
    masked_cum_log_probs = tf.multiply(
        tf.cumsum(traj_log_probs, axis=1), traj_mask)
    weighted = tf.multiply(masked_cum_log_probs, traj_rewards)
    # Sum over timesteps, then mean over trajectories.
    per_traj_total = tf.reduce_sum(
        weighted, axis=1, name="Final_sum_over_timesteps")
    objective = tf.reduce_mean(per_traj_total, axis=0)

    self.grad = tf.gradients(
        objective, self.model.trainable_vars + self.policy.trainable_vars)

    # Reward and timestep statistics.
    self.sum_reward = tf.reduce_sum(traj_rewards, axis=1)
    self.mean_reward = tf.reduce_mean(self.sum_reward)
    self.mean_timesteps = tf.reduce_mean(self.timesteps_ph)

    # Scalar summaries for plotting.
    return_summary = tf.summary.scalar(
        "Return", tf.reduce_mean(self.returns_ph))
    timestep_summary = tf.summary.scalar(
        "Timesteps", tf.reduce_mean(self.timesteps_ph))
    omega_summary = tf.summary.scalar(
        "Omega", tf.norm(self.model.get_omega()))

    self.summary_writer.add_graph(sess.graph)
    self.summarize = tf.summary.merge(
        [return_summary, timestep_summary, omega_summary])

    # The optimizer minimizes, so negate the objective to maximize it.
    self.minimize_op = self.optimizer.minimize(
        -objective,
        var_list=self.model.trainable_vars + self.policy.trainable_vars)

    self.policy_tf = policy_tf
    self.model_tf = model_prob_tf
    self.log_prob = log_prob
def _log_prob(self, x):
    """Return the scalar score of the index sequence `x`.

    Gathers `self.probs` at the indices in `x` (last axis), takes the reverse
    cumulative sum `W` of the gathered values, and returns
    `sum(wz - log(W))`.

    NOTE(review): if this is meant to be a Plackett-Luce style log-likelihood
    over raw probabilities, the first term would be `log(wz)` rather than
    `wz` -- confirm what `self.probs` holds before relying on the value.

    Args:
        x: Integer index tensor used to gather from `self.probs`.

    Returns:
        Scalar tensor.
    """
    wz = tf.gather(self.probs, x, axis=-1)
    # W[i] = wz[i] + wz[i + 1] + ... along the default cumsum axis (0).
    W = tf.cumsum(wz, reverse=True)
    return tf.reduce_sum(wz - tf.math.log(W))
'outputs': [
    {
        'name': 'coll',
        # yhat: maps the pre-activation to the prediction (a sigmoid here).
        # :param pre_yhat: [batch_size, H]
        # :param obs_vec: {name : [batch_size, 1]} (unused by this lambda)
        'yhat': lambda pre_yhat, obs_vec: tf.nn.sigmoid(pre_yhat),
        # yhat_label: 1.0 from the first step at which the running sum of
        # target_obs_vec['coll'] along axis 1 reaches 1.0, else 0.0.
        # Only target_obs_vec is used by this lambda.
        # :param rewards: [batch_size, N]
        # :param dones: [batch_size, N+1]
        # :param goals: {name: [batch_size, 1]}
        # :param future_goals: not described in the original notes -- TODO document
        # :param target_obs_vec: indexed by name below, so presumably
        #     {name: [batch_size, N]} -- TODO confirm
        # :param gamma: scalar
        'yhat_label': lambda rewards, dones, goals, future_goals, target_obs_vec, gamma: \
            tf.cast(tf.cumsum(target_obs_vec['coll'], axis=1) >= 1.0, tf.float32),
        # yhat training cost
        'yhat_loss': 'xentropy',  # <mse / huber / xentropy>
        'yhat_loss_weight': 1.0,  # how much to weight this loss compared to other losses
        'yhat_loss_use_pre': True,  # use the pre-activation for the loss? needed for xentropy
        'yhat_loss_xentropy_posweight': 10.0,  # larger value --> false negatives cost more
        # bhat
        # :param pre_bhat: [batch_size, H]
        # :param obs_vec: {name: [batch_size, 1]}
        'bhat': None,
        # bhat_label
        # :param rewards: [batch_size, N]
        # :param dones: [batch_size, N+1]
def calculate(self, quantity: tf.Tensor):
    """Return the cumulative sum of `quantity` scaled by zero.

    The zero factor is created with `quantity`'s own dtype: a bare
    `tf.constant(0)` is int32 and `tf.multiply` rejects mixed dtypes, which
    made the original fail for any non-int32 input.

    Args:
        quantity: Numeric tensor.

    Returns:
        Tensor with the shape and dtype of `quantity`: the cumulative sum
        (along axis 0, the `tf.cumsum` default) of `quantity * 0`.
    """
    zero = tf.constant(0, dtype=quantity.dtype)
    each = tf.multiply(quantity, zero)
    total = tf.cumsum(each)
    return total
def render_nd_bboxes_tf_spreading(elems, target_shape, ndim=2):
    """Render axis-aligned boxes into a dense tensor via scatter + cumsum.

    elems: tensor of size [..., n_boxes, 2*ndim + val_dim], where in the last
        dimension, there are packed edge coordinates and values (of val_dim)
        to be filled in the specified box. The first `ndim` entries are the
        near-edge coordinate of each axis, the next `ndim` the far-edge
        coordinates. Only rank 2 ([n, coordinates]) and rank 3
        ([batch, n, coordinates]) inputs are accepted.
        NOTE(review): the coordinates are used directly as `tf.scatter_nd`
        indices, so presumably `elems` must be an integer tensor -- confirm.
    target_shape: list/tuple of ndim entries.
    ndim: number of spatial dimensions of the boxes.

    returns: rendered image of size [elems(...), target_shape..., val_dim]
        ('elems(...)' usually means batch_size)
    """
    assert_shape_ndim = tf.Assert(tf.equal(tf.size(target_shape), ndim),
                                  [target_shape])
    assert_nonempty_data = tf.Assert(
        tf.greater(tf.shape(elems)[-1], 2 * ndim), [elems])

    with tf.control_dependencies([assert_shape_ndim, assert_nonempty_data]):
        # Idea: write +value / -value at the corners of every box, then take
        # a cumulative sum along each spatial axis. In 2-D:
        #
        #   +v .... -v        ++++++        ++++++
        #   ........          ......        ++++++
        #   ........    ->    ......   ->   ++++++
        #   -v .... +v        ------        ++++++
        #
        # In general a corner gets +value when it uses an even number of
        # far-edge coordinates and -value when that number is odd. Because
        # the cumulative sums are inclusive, the value written at a far-edge
        # coordinate cancels the fill from that index on.

        # The rank is read from the static shape, i.e. at graph-construction
        # time, not at run time.
        el_ndim = len(elems.shape)
        assert el_ndim >= 2 and el_ndim <= 3, "elements should be in the form of [batch, n, coordinates] or [n, " \
                                              "coordinates]"

        if el_ndim == 3:
            # Batched input: build a column holding the batch id of every box
            # so it can be prepended to the scatter indices.
            bboxes_per_batch = tf.shape(elems)[1]
            batch_size = tf.shape(elems)[0]  # should be the same as image_input.shape[0]
            index_to_batch = tf.tile(
                tf.expand_dims(tf.range(batch_size), -1),
                (1, bboxes_per_batch))
            index_to_batch = tf.reshape(index_to_batch, (-1, 1))
        else:
            index_to_batch = None

        val_vector_size = tf.shape(elems)[-1] - 2 * ndim

        corners_lists = []
        corners_values = []
        for corner in itertools.product([0, 1], repeat=ndim):
            plus = sum(corner) % 2 == 0
            # For axis i pick the near (c == 0) or far (c == 1) coordinate
            # out of the packed [near..., far...] layout.
            id_from_corner = [i + ndim * c for i, c in enumerate(corner)]
            corner_coord = tf.gather(elems[..., 0:2 * ndim],
                                     id_from_corner,
                                     axis=-1)
            # Last dimension is == val_vector_size.
            corner_value = elems[..., 2 * ndim:] * (1 if plus else -1)
            if index_to_batch is not None:
                # Batched call: flatten everything into one long list for
                # scatter_nd and prepend the batch ids. Each corner has
                # `ndim` coordinates (this used to be hard-coded to 2, which
                # broke every ndim != 2).
                corner_coord = tf.concat(
                    [index_to_batch,
                     tf.reshape(corner_coord, (-1, ndim))],
                    axis=-1)
                corner_value = tf.reshape(corner_value,
                                          (-1, val_vector_size))
            corners_lists.append(corner_coord)
            corners_values.append(corner_value)

        indices = tf.concat(corners_lists, axis=0)
        updates = tf.concat(corners_values, axis=0)
        shape = tf.concat(
            [tf.shape(elems)[:-2], target_shape, [val_vector_size]], axis=0)

        dense = tf.scatter_nd(
            indices,
            updates,
            shape=shape,
        )

        for dim in range(ndim):
            # Start from the axis before the last one: the last axis is the
            # value dimension, and any leading axes are batch dimensions.
            dense = tf.cumsum(dense,
                              axis=-2 - dim,
                              exclusive=False,
                              reverse=False,
                              name=None)
        return dense
def learn(
        make_env,
        make_policy,
        *,
        n_episodes,
        horizon,
        delta,
        gamma,
        max_iters,
        sampler=None,
        use_natural_gradient=False,  # can be 'exact', 'approximate'
        fisher_reg=1e-2,
        iw_method='is',
        iw_norm='none',
        bound='J',
        line_search_type='parabola',
        save_weights=False,
        improvement_tol=0.,
        center_return=False,
        render_after=None,
        max_offline_iters=100,
        callback=None,
        clipping=False,
        entropy='none',
        positive_return=False,
        reward_clustering='none'):
    """Build the importance-weighted objective graph and run the training loop.

    The function builds a TensorFlow graph for an importance-weighted return
    estimate plus an optional penalty (`bound`), then alternates between
    sampling `n_episodes` trajectories and optimising the bound offline with
    `optimize_offline`. Nothing is returned; results are reported through
    `logger` and, optionally, `checkpoint.pkl`.

    Args:
        make_env: Callable returning the environment.
        make_policy: Callable `(name, ob_space, ac_space) -> policy`; called
            for 'pi' and 'oldpi'.
        n_episodes: Episodes per batch.
        horizon: Steps per episode; the flat batch holds
            `horizon * n_episodes` samples.
        delta: Confidence parameter used by every bound except 'J'.
        gamma: Discount factor.
        max_iters: The loop stops as soon as the iteration counter reaches
            this value, so `max_iters - 1` optimisation iterations run.
        sampler: Object with `collect(theta)`; when None a sequential sampler
            around `traj_segment_generator` is created.
        use_natural_gradient: When truthy, a conjugate-gradient natural
            gradient is passed to `optimize_offline`. Needs `iwn`, which is
            only built for `iw_method == 'is'` (NameError otherwise).
        fisher_reg: Damping added to the Fisher-vector product.
        iw_method: 'pdis', 'is' or 'rbis'.
        iw_norm: For 'is' only: 'none', 'sn' or 'regression'.
        bound: 'J', 'std-d2', 'max-d2', 'max-ess', 'std-ess', 'pdis-max-d2'
            or 'pdis-mean-d2'. The '*-d2' / '*-ess' bounds use tensors built
            only for 'is' / 'rbis'; the 'pdis-*' bounds use tensors built
            only for 'pdis'. Other combinations raise NameError.
        line_search_type: 'binary' or 'parabola'.
        save_weights: When true, logs the weights and pickles `theta` to
            'checkpoint.pkl' every iteration.
        improvement_tol: Accepted but not used in this function.
        center_return: Subtract the mean from episode returns and rewards.
        render_after: Render every `render_after` iterations if the
            environment has `render`.
        max_offline_iters: Forwarded to `optimize_offline`.
        callback: Called as `callback(locals(), globals())` each iteration,
            so the local variable names below are part of its contract.
        clipping: Clip step rewards to [-1, 1].
        entropy: 'none' or '<scheme>:<v1>:<v2>' with scheme 'step', 'lin' or
            'exp'. 'exp' uses `iw`, which is only built for
            `iw_method == 'is'` (NameError otherwise).
        positive_return: Accepted but not used in this function.
        reward_clustering: 'none', 'global:N', 'batch:N' or
            'manual:N:min:max'.

    Raises:
        ValueError: Unknown `line_search_type`.
        NotImplementedError: Unknown `iw_method`, `iw_norm` or `bound`.
        Exception: Unknown reward clustering or entropy scheme.
    """
    np.set_printoptions(precision=3)
    max_samples = horizon * n_episodes

    if line_search_type == 'binary':
        line_search = line_search_binary
    elif line_search_type == 'parabola':
        line_search = line_search_parabola
    else:
        raise ValueError()

    # Building the environment
    env = make_env()
    ob_space = env.observation_space
    ac_space = env.action_space

    # Building the policy
    pi = make_policy('pi', ob_space, ac_space)
    oldpi = make_policy('oldpi', ob_space, ac_space)

    # Only variables whose second name-scope component starts with 'pol' are
    # optimised.
    all_var_list = pi.get_trainable_variables()
    var_list = [
        v for v in all_var_list if v.name.split('/')[1].startswith('pol')
    ]

    shapes = [U.intprod(var.get_shape().as_list()) for var in var_list]
    n_parameters = sum(shapes)

    # Placeholders
    ob_ = ob = U.get_placeholder_cached(name='ob')
    ac_ = pi.pdtype.sample_placeholder([max_samples], name='ac')
    mask_ = tf.placeholder(dtype=tf.float32, shape=(max_samples), name='mask')
    rew_ = tf.placeholder(dtype=tf.float32, shape=(max_samples), name='rew')
    disc_rew_ = tf.placeholder(dtype=tf.float32,
                               shape=(max_samples),
                               name='disc_rew')
    gradient_ = tf.placeholder(dtype=tf.float32,
                               shape=(n_parameters, 1),
                               name='gradient')
    iter_number_ = tf.placeholder(dtype=tf.int32, name='iter_number')
    losses_with_name = []

    # Policy densities
    target_log_pdf = pi.pd.logp(ac_)
    behavioral_log_pdf = oldpi.pd.logp(ac_)
    log_ratio = target_log_pdf - behavioral_log_pdf

    # Split operations: flat [max_samples] -> [n_episodes, horizon]
    disc_rew_split = tf.stack(tf.split(disc_rew_ * mask_, n_episodes))
    rew_split = tf.stack(tf.split(rew_ * mask_, n_episodes))
    log_ratio_split = tf.stack(tf.split(log_ratio * mask_, n_episodes))
    target_log_pdf_split = tf.stack(
        tf.split(target_log_pdf * mask_, n_episodes))
    behavioral_log_pdf_split = tf.stack(
        tf.split(behavioral_log_pdf * mask_, n_episodes))
    mask_split = tf.stack(tf.split(mask_, n_episodes))

    # Renyi divergence
    emp_d2_split = tf.stack(
        tf.split(pi.pd.renyi(oldpi.pd, 2) * mask_, n_episodes))
    emp_d2_cum_split = tf.reduce_sum(emp_d2_split, axis=1)
    empirical_d2 = tf.reduce_mean(tf.exp(emp_d2_cum_split))

    # Return
    ep_return = tf.reduce_sum(mask_split * disc_rew_split, axis=1)
    if clipping:
        rew_split = tf.clip_by_value(rew_split, -1, 1)

    if center_return:
        ep_return = ep_return - tf.reduce_mean(ep_return)
        rew_split = rew_split - (tf.reduce_sum(rew_split) /
                                 (tf.reduce_sum(mask_split) + 1e-24))

    discounter = [pow(gamma, i) for i in range(0, horizon)]  # Decreasing gamma
    discounter_tf = tf.constant(discounter)
    # Recomputed from the (possibly clipped / centered) step rewards.
    disc_rew_split = rew_split * discounter_tf

    return_mean = tf.reduce_mean(ep_return)
    return_std = U.reduce_std(ep_return)
    return_max = tf.reduce_max(ep_return)
    return_min = tf.reduce_min(ep_return)
    return_abs_max = tf.reduce_max(tf.abs(ep_return))
    return_step_max = tf.reduce_max(tf.abs(rew_split))  # Max step reward
    return_step_mean = tf.abs(tf.reduce_mean(rew_split))
    positive_step_return_max = tf.maximum(0.0, tf.reduce_max(rew_split))
    negative_step_return_max = tf.maximum(0.0, tf.reduce_max(-rew_split))
    return_step_maxmin = tf.abs(positive_step_return_max -
                                negative_step_return_max)

    # Reward clustering: quantise episode returns into N intervals.
    rew_clustering_options = reward_clustering.split(':')
    if reward_clustering == 'none':
        pass  # Do nothing
    elif rew_clustering_options[0] == 'global':
        assert len(
            rew_clustering_options
        ) == 2, "Reward clustering: Provide the correct number of parameters"
        N = int(rew_clustering_options[1])
        tf.add_to_collection(
            'prints',
            tf.Print(ep_return, [ep_return], 'ep_return', summarize=20))
        # Running min / max of the return, kept across batches.
        global_rew_min = tf.Variable(float('+inf'), trainable=False)
        global_rew_max = tf.Variable(float('-inf'), trainable=False)
        rew_min = tf.reduce_min(ep_return)
        rew_max = tf.reduce_max(ep_return)
        global_rew_min = tf.assign(global_rew_min,
                                   tf.minimum(global_rew_min, rew_min))
        global_rew_max = tf.assign(global_rew_max,
                                   tf.maximum(global_rew_max, rew_max))
        interval_size = (global_rew_max - global_rew_min) / N
        ep_return = tf.floordiv(ep_return, interval_size) * interval_size
    elif rew_clustering_options[0] == 'batch':
        assert len(
            rew_clustering_options
        ) == 2, "Reward clustering: Provide the correct number of parameters"
        N = int(rew_clustering_options[1])
        rew_min = tf.reduce_min(ep_return)
        rew_max = tf.reduce_max(ep_return)
        interval_size = (rew_max - rew_min) / N
        ep_return = tf.floordiv(ep_return, interval_size) * interval_size
    elif rew_clustering_options[0] == 'manual':
        assert len(
            rew_clustering_options
        ) == 4, "Reward clustering: Provide the correct number of parameters"
        N, rew_min, rew_max = map(int, rew_clustering_options[1:])
        interval_size = (rew_max - rew_min) / N
        # Clip to avoid overflow and cluster
        ep_return = tf.clip_by_value(ep_return, rew_min, rew_max)
        ep_return = tf.floordiv(ep_return, interval_size) * interval_size
    else:
        raise Exception('Unrecognized reward clustering scheme.')

    losses_with_name.extend([(return_mean, 'InitialReturnMean'),
                             (return_max, 'InitialReturnMax'),
                             (return_min, 'InitialReturnMin'),
                             (return_std, 'InitialReturnStd'),
                             (empirical_d2, 'EmpiricalD2'),
                             (return_step_max, 'ReturnStepMax'),
                             (return_step_maxmin, 'ReturnStepMaxmin')])

    if iw_method == 'pdis':
        # log_ratio_split cumulative sum
        log_ratio_cumsum = tf.cumsum(log_ratio_split, axis=1)
        # Exponentiate
        ratio_cumsum = tf.exp(log_ratio_cumsum)
        # Multiply by the step-wise reward (not episode)
        ratio_reward = ratio_cumsum * disc_rew_split
        # Average on episodes
        ratio_reward_per_episode = tf.reduce_sum(ratio_reward, axis=1)
        w_return_mean = tf.reduce_sum(ratio_reward_per_episode,
                                      axis=0) / n_episodes
        # Get d2(w0:t) with mask
        d2_w_0t = tf.exp(tf.cumsum(emp_d2_split,
                                   axis=1)) * mask_split  # LEAVE THIS OUTSIDE
        # Sum d2(w0:t) over timesteps
        episode_d2_0t = tf.reduce_sum(d2_w_0t, axis=1)
        # Sample variance
        J_sample_variance = (1 / (n_episodes - 1)) * tf.reduce_sum(
            tf.square(ratio_reward_per_episode - w_return_mean))
        losses_with_name.append((J_sample_variance, 'J_sample_variance'))
        losses_with_name.extend([(tf.reduce_max(ratio_cumsum), 'MaxIW'),
                                 (tf.reduce_min(ratio_cumsum), 'MinIW'),
                                 (tf.reduce_mean(ratio_cumsum), 'MeanIW'),
                                 (U.reduce_std(ratio_cumsum), 'StdIW')])
        losses_with_name.extend([(tf.reduce_max(d2_w_0t), 'MaxD2w0t'),
                                 (tf.reduce_min(d2_w_0t), 'MinD2w0t'),
                                 (tf.reduce_mean(d2_w_0t), 'MeanD2w0t'),
                                 (U.reduce_std(d2_w_0t), 'StdD2w0t')])
    elif iw_method == 'is':
        iw = tf.exp(tf.reduce_sum(log_ratio_split, axis=1))
        if iw_norm == 'none':
            iwn = iw / n_episodes
            w_return_mean = tf.reduce_sum(iwn * ep_return)
            J_sample_variance = (1 / (n_episodes - 1)) * tf.reduce_sum(
                tf.square(iw * ep_return - w_return_mean))
            losses_with_name.append((J_sample_variance, 'J_sample_variance'))
        elif iw_norm == 'sn':
            iwn = iw / tf.reduce_sum(iw)
            w_return_mean = tf.reduce_sum(iwn * ep_return)
        elif iw_norm == 'regression':
            iwn = iw / n_episodes
            mean_iw = tf.reduce_mean(iw)
            beta = tf.reduce_sum(
                (iw - mean_iw) * ep_return * iw) / (tf.reduce_sum(
                    (iw - mean_iw)**2) + 1e-24)
            w_return_mean = tf.reduce_mean(iw * ep_return - beta * (iw - 1))
        else:
            raise NotImplementedError()
        ess_classic = tf.linalg.norm(iw, 1)**2 / tf.linalg.norm(iw, 2)**2
        sqrt_ess_classic = tf.linalg.norm(iw, 1) / tf.linalg.norm(iw, 2)
        ess_renyi = n_episodes / empirical_d2
        losses_with_name.extend([(tf.reduce_max(iwn), 'MaxIWNorm'),
                                 (tf.reduce_min(iwn), 'MinIWNorm'),
                                 (tf.reduce_mean(iwn), 'MeanIWNorm'),
                                 (U.reduce_std(iwn), 'StdIWNorm'),
                                 (tf.reduce_max(iw), 'MaxIW'),
                                 (tf.reduce_min(iw), 'MinIW'),
                                 (tf.reduce_mean(iw), 'MeanIW'),
                                 (U.reduce_std(iw), 'StdIW'),
                                 (ess_classic, 'ESSClassic'),
                                 (ess_renyi, 'ESSRenyi')])
    elif iw_method == 'rbis':
        # Get pdfs for episodes
        target_log_pdf_episode = tf.reduce_sum(target_log_pdf_split, axis=1)
        behavioral_log_pdf_episode = tf.reduce_sum(behavioral_log_pdf_split,
                                                   axis=1)
        # Normalize the log-probabilities (to avoid overflows as far as
        # possible)
        normalization_factor = tf.reduce_mean(
            tf.stack([target_log_pdf_episode, behavioral_log_pdf_episode]))
        target_norm_log_pdf_episode = target_log_pdf_episode - normalization_factor
        behavioral_norm_log_pdf_episode = behavioral_log_pdf_episode - normalization_factor
        # Exponentiate
        target_pdf_episode = tf.clip_by_value(
            tf.cast(tf.exp(target_norm_log_pdf_episode), tf.float64), 1e-300,
            1e+300)
        behavioral_pdf_episode = tf.clip_by_value(
            tf.cast(tf.exp(behavioral_norm_log_pdf_episode), tf.float64),
            1e-300, 1e+300)
        tf.add_to_collection(
            'asserts',
            tf.assert_positive(target_pdf_episode,
                               name='target_pdf_positive'))
        tf.add_to_collection(
            'asserts',
            tf.assert_positive(behavioral_pdf_episode,
                               name='behavioral_pdf_positive'))
        # Compute the merging matrix (reward-clustering) and the number of
        # clusters
        reward_unique, reward_indexes = tf.unique(ep_return)
        episode_clustering_matrix = tf.cast(
            tf.one_hot(reward_indexes, n_episodes), tf.float64)
        max_index = tf.reduce_max(reward_indexes) + 1
        trajectories_per_cluster = tf.reduce_sum(episode_clustering_matrix,
                                                 axis=0)[:max_index]
        tf.add_to_collection(
            'asserts',
            tf.assert_positive(tf.reduce_sum(episode_clustering_matrix,
                                             axis=0)[:max_index],
                               name='clustering_matrix'))
        # Get the clustered pdfs
        clustered_target_pdf = tf.matmul(
            tf.reshape(target_pdf_episode, (1, -1)),
            episode_clustering_matrix)[0][:max_index]
        clustered_behavioral_pdf = tf.matmul(
            tf.reshape(behavioral_pdf_episode, (1, -1)),
            episode_clustering_matrix)[0][:max_index]
        tf.add_to_collection(
            'asserts',
            tf.assert_positive(clustered_target_pdf,
                               name='clust_target_pdf_positive'))
        tf.add_to_collection(
            'asserts',
            tf.assert_positive(clustered_behavioral_pdf,
                               name='clust_behavioral_pdf_positive'))
        # Compute the J, weighting each cluster by its cardinality and
        # normalising by the number of episodes.
        ratio_clustered = clustered_target_pdf / clustered_behavioral_pdf
        ratio_reward = tf.cast(ratio_clustered,
                               tf.float32) * reward_unique * tf.cast(
                                   trajectories_per_cluster, tf.float32)
        w_return_mean = tf.reduce_sum(ratio_reward) / tf.cast(
            n_episodes, tf.float32)
        # Divergences
        ess_classic = tf.linalg.norm(ratio_reward, 1)**2 / tf.linalg.norm(
            ratio_reward, 2)**2
        sqrt_ess_classic = tf.linalg.norm(ratio_reward, 1) / tf.linalg.norm(
            ratio_reward, 2)
        ess_renyi = n_episodes / empirical_d2
        # Summaries
        losses_with_name.extend([(tf.reduce_max(ratio_clustered), 'MaxIW'),
                                 (tf.reduce_min(ratio_clustered), 'MinIW'),
                                 (tf.reduce_mean(ratio_clustered), 'MeanIW'),
                                 (U.reduce_std(ratio_clustered), 'StdIW'),
                                 (1 - (max_index / n_episodes),
                                  'RewardCompression'),
                                 (ess_classic, 'ESSClassic'),
                                 (ess_renyi, 'ESSRenyi')])
    else:
        raise NotImplementedError()

    if bound == 'J':
        bound_ = w_return_mean
    elif bound == 'std-d2':
        bound_ = w_return_mean - tf.sqrt(
            (1 - delta) / (delta * ess_renyi)) * return_std
    elif bound == 'max-d2':
        var_estimate = tf.sqrt(
            (1 - delta) / (delta * ess_renyi)) * return_abs_max
        bound_ = w_return_mean - var_estimate
    elif bound == 'max-ess':
        bound_ = w_return_mean - tf.sqrt(
            (1 - delta) / delta) / sqrt_ess_classic * return_abs_max
    elif bound == 'std-ess':
        bound_ = w_return_mean - tf.sqrt(
            (1 - delta) / delta) / sqrt_ess_classic * return_std
    elif bound == 'pdis-max-d2':
        # Discount factor
        if gamma >= 1:
            discounter = [
                float(1 + 2 * (horizon - t - 1)) for t in range(0, horizon)
            ]
        else:

            def f(t):
                return pow(gamma, 2 * t) + (
                    2 * pow(gamma, t) *
                    (pow(gamma, t + 1) - pow(gamma, horizon))) / (1 - gamma)

            discounter = [f(t) for t in range(0, horizon)]
        discounter_tf = tf.constant(discounter)
        mean_episode_d2 = tf.reduce_sum(
            d2_w_0t, axis=0) / (tf.reduce_sum(mask_split, axis=0) + 1e-24)
        discounted_d2 = mean_episode_d2 * discounter_tf  # Discounted d2
        discounted_total_d2 = tf.reduce_sum(discounted_d2,
                                            axis=0)  # Sum over time
        bound_ = w_return_mean - tf.sqrt(
            (1 - delta) * discounted_total_d2 /
            (delta * n_episodes)) * return_step_max
    elif bound == 'pdis-mean-d2':
        # Discount factor
        if gamma >= 1:
            discounter = [
                float(1 + 2 * (horizon - t - 1)) for t in range(0, horizon)
            ]
        else:

            def f(t):
                return pow(gamma, 2 * t) + (
                    2 * pow(gamma, t) *
                    (pow(gamma, t + 1) - pow(gamma, horizon))) / (1 - gamma)

            discounter = [f(t) for t in range(0, horizon)]
        discounter_tf = tf.constant(discounter)
        mean_episode_d2 = tf.reduce_sum(
            d2_w_0t, axis=0) / (tf.reduce_sum(mask_split, axis=0) + 1e-24)
        discounted_d2 = mean_episode_d2 * discounter_tf  # Discounted d2
        discounted_total_d2 = tf.reduce_sum(discounted_d2,
                                            axis=0)  # Sum over time
        bound_ = w_return_mean - tf.sqrt(
            (1 - delta) * discounted_total_d2 /
            (delta * n_episodes)) * return_step_mean
    else:
        raise NotImplementedError()

    # Policy entropy for exploration
    ent = pi.pd.entropy()
    meanent = tf.reduce_mean(ent)
    losses_with_name.append((meanent, 'MeanEntropy'))
    # Add policy entropy bonus
    if entropy != 'none':
        scheme, v1, v2 = entropy.split(':')
        if scheme == 'step':
            # Coefficient v1 for the first v2 iterations, then 0.
            entcoeff = tf.cond(iter_number_ < int(v2), lambda: float(v1),
                               lambda: float(0.0))
            losses_with_name.append((entcoeff, 'EntropyCoefficient'))
            entbonus = entcoeff * meanent
            bound_ = bound_ + entbonus
        elif scheme == 'lin':
            # Linear interpolation from v1 to v2 over max_iters, floored at 0.
            ip = tf.cast(iter_number_ / max_iters, tf.float32)
            entcoeff_decay = tf.maximum(
                0.0,
                float(v2) + (float(v1) - float(v2)) * (1.0 - ip))
            losses_with_name.append((entcoeff_decay, 'EntropyCoefficient'))
            entbonus = entcoeff_decay * meanent
            bound_ = bound_ + entbonus
        elif scheme == 'exp':
            # Uses `iw`, which only exists for iw_method == 'is'.
            ent_f = tf.exp(
                -tf.abs(tf.reduce_mean(iw) - 1) * float(v2)) * float(v1)
            losses_with_name.append((ent_f, 'EntropyCoefficient'))
            bound_ = bound_ + ent_f * meanent
        else:
            raise Exception('Unrecognized entropy scheme.')

    losses_with_name.append((w_return_mean, 'ReturnMeanIW'))
    losses_with_name.append((bound_, 'Bound'))
    losses, loss_names = map(list, zip(*losses_with_name))

    if use_natural_gradient:
        # Uses `iwn`, which only exists for iw_method == 'is'.
        p = tf.placeholder(dtype=tf.float32, shape=[None])
        target_logpdf_episode = tf.reduce_sum(target_log_pdf_split *
                                              mask_split,
                                              axis=1)
        grad_logprob = U.flatgrad(
            tf.stop_gradient(iwn) * target_logpdf_episode, var_list)
        dot_product = tf.reduce_sum(grad_logprob * p)
        hess_logprob = U.flatgrad(dot_product, var_list)
        # The input list mirrors `args` (plus the leading vector `p`) so the
        # positional call `compute_linear_operator(x, *args)` below lines up;
        # the previous 5-input list did not match the 6-element `args`.
        compute_linear_operator = U.function(
            [p, ob_, ac_, rew_, disc_rew_, mask_, iter_number_],
            [-hess_logprob])

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])

    assert_ops = tf.group(*tf.get_collection('asserts'))
    print_ops = tf.group(*tf.get_collection('prints'))

    compute_lossandgrad = U.function(
        [ob_, ac_, rew_, disc_rew_, mask_, iter_number_],
        losses + [U.flatgrad(bound_, var_list), assert_ops, print_ops])
    compute_grad = U.function(
        [ob_, ac_, rew_, disc_rew_, mask_, iter_number_],
        [U.flatgrad(bound_, var_list), assert_ops, print_ops])
    compute_bound = U.function(
        [ob_, ac_, rew_, disc_rew_, mask_, iter_number_],
        [bound_, assert_ops, print_ops])
    compute_losses = U.function(
        [ob_, ac_, rew_, disc_rew_, mask_, iter_number_], losses)

    set_parameter = U.SetFromFlat(var_list)
    get_parameter = U.GetFlat(var_list)

    if sampler is None:
        seg_gen = traj_segment_generator(pi,
                                         env,
                                         n_episodes,
                                         horizon,
                                         stochastic=True)
        sampler = type("SequentialSampler", (object, ),
                       {"collect": lambda self, _: seg_gen.__next__()})()

    U.initialize()

    # Starting optimizing
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=n_episodes)
    rewbuffer = deque(maxlen=n_episodes)

    while True:
        iters_so_far += 1

        if render_after is not None and iters_so_far % render_after == 0:
            if hasattr(env, 'render'):
                render(env, pi, horizon)

        if callback:
            callback(locals(), globals())

        if iters_so_far >= max_iters:
            print('Finised...')
            break

        logger.log('********** Iteration %i ************' % iters_so_far)

        theta = get_parameter()

        with timed('sampling'):
            seg = sampler.collect(theta)

        add_disc_rew(seg, gamma)

        lens, rets = seg['ep_lens'], seg['ep_rets']
        lenbuffer.extend(lens)
        rewbuffer.extend(rets)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)

        args = ob, ac, rew, disc_rew, mask, iter_number = seg['ob'], seg[
            'ac'], seg['rew'], seg['disc_rew'], seg['mask'], iters_so_far

        # The behavioural policy becomes a copy of the current target policy.
        assign_old_eq_new()

        def evaluate_loss():
            loss = compute_bound(*args)
            return loss[0]

        def evaluate_gradient():
            gradient = compute_grad(*args)
            return gradient[0]

        if use_natural_gradient:

            def evaluate_fisher_vector_prod(x):
                return compute_linear_operator(x, *args)[0] + fisher_reg * x

            def evaluate_natural_gradient(g):
                return cg(evaluate_fisher_vector_prod,
                          g,
                          cg_iters=10,
                          verbose=0)
        else:
            evaluate_natural_gradient = None

        with timed('summaries before'):
            logger.record_tabular("Iteration", iters_so_far)
            logger.record_tabular("InitialBound", evaluate_loss())
            logger.record_tabular("EpLenMean", np.mean(lenbuffer))
            logger.record_tabular("EpRewMean", np.mean(rewbuffer))
            logger.record_tabular("EpThisIter", len(lens))
            logger.record_tabular("EpisodesSoFar", episodes_so_far)
            logger.record_tabular("TimestepsSoFar", timesteps_so_far)
            logger.record_tabular("TimeElapsed", time.time() - tstart)

            if save_weights:
                logger.record_tabular('Weights', str(get_parameter()))
                import pickle
                # Context manager so the checkpoint is flushed and closed on
                # every iteration instead of leaking one handle per save.
                with open('checkpoint.pkl', 'wb') as file:
                    pickle.dump(theta, file)

        with timed("offline optimization"):
            theta, improvement = optimize_offline(
                theta,
                set_parameter,
                line_search,
                evaluate_loss,
                evaluate_gradient,
                evaluate_natural_gradient,
                max_offline_ite=max_offline_iters)

        set_parameter(theta)

        with timed('summaries after'):
            meanlosses = np.array(compute_losses(*args))
            for (lossname, lossval) in zip(loss_names, meanlosses):
                logger.record_tabular(lossname, lossval)

        logger.dump_tabular()

    env.close()
def _static_subsample(self, indicator, batch_size, labels):
    """Returns subsampled minibatch.

    Args:
      indicator: boolean tensor of shape [N] whose True entries can be sampled.
        N should be a compile time constant.
      batch_size: desired batch size. This scalar cannot be None.
      labels: boolean tensor of shape [N] denoting positive(=True) and negative
        (=False) examples. N should be a compile time constant.

    Returns:
      sampled_idx_indicator: boolean tensor of shape [N], True for entries which
        are sampled. It ensures the length of output of the subsample is always
        batch_size, even when number of examples set to True in indicator is
        less than batch_size.

    Raises:
      ValueError: if `indicator` or `labels` does not have a fully defined
        static shape, or if `batch_size` is not a Python int.
    """
    # Check if indicator and labels have a static size.
    if not indicator.shape.is_fully_defined():
        raise ValueError(
            'indicator must be static in shape when is_static is True')
    if not labels.shape.is_fully_defined():
        raise ValueError(
            'labels must be static in shape when is_static is True')
    if not isinstance(batch_size, int):
        raise ValueError(
            'batch_size has to be an integer when is_static is True.')

    input_length = tf.shape(indicator)[0]

    # Set the number of examples set True in indicator to be at least
    # batch_size: the first False entries are switched on until the running
    # count of False entries exceeds the shortfall.
    num_true_sampled = tf.reduce_sum(tf.cast(indicator, tf.float32))
    additional_false_sample = tf.less_equal(
        tf.cumsum(tf.cast(tf.logical_not(indicator), tf.float32)),
        batch_size - num_true_sampled)
    indicator = tf.logical_or(indicator, additional_false_sample)

    # Shuffle indicator and label. Need to store the permutation to restore the
    # order post sampling.
    permutation = tf.random_shuffle(tf.range(input_length))
    indicator = ops.matmul_gather_on_zeroth_axis(
        tf.cast(indicator, tf.float32), permutation)
    labels = ops.matmul_gather_on_zeroth_axis(
        tf.cast(labels, tf.float32), permutation)

    # index (starting from 1) when indicator is True, 0 when False
    indicator_idx = tf.where(
        tf.cast(indicator, tf.bool),
        tf.range(1, input_length + 1),
        tf.zeros(input_length, tf.int32))

    # Replace -1 for negative, +1 for positive labels
    signed_label = tf.where(
        tf.cast(labels, tf.bool),
        tf.ones(input_length, tf.int32),
        tf.scalar_mul(-1, tf.ones(input_length, tf.int32)))

    # negative of index for negative label, positive index for positive label,
    # 0 when indicator is False.
    signed_indicator_idx = tf.multiply(indicator_idx, signed_label)
    sorted_signed_indicator_idx = tf.nn.top_k(
        signed_indicator_idx, input_length, sorted=True).values

    [num_positive_samples, num_negative_samples
     ] = self._get_num_pos_neg_samples(sorted_signed_indicator_idx,
                                       batch_size)

    sampled_idx = self._get_values_from_start_and_end(
        sorted_signed_indicator_idx, num_positive_samples,
        num_negative_samples, batch_size)

    # Shift the indices to start from 0 and remove any samples that are set as
    # False.
    sampled_idx = tf.abs(sampled_idx) - tf.ones(batch_size, tf.int32)
    sampled_idx = tf.multiply(
        tf.cast(tf.greater_equal(sampled_idx, tf.constant(0)), tf.int32),
        sampled_idx)

    sampled_idx_indicator = tf.cast(
        tf.reduce_sum(tf.one_hot(sampled_idx, depth=input_length), axis=0),
        tf.bool)

    # project back the order based on stored permutations
    reprojections = tf.one_hot(permutation,
                               depth=input_length,
                               dtype=tf.float32)
    return tf.cast(
        tf.tensordot(tf.cast(sampled_idx_indicator, tf.float32),
                     reprojections,
                     axes=[0, 0]), tf.bool)
def row_lengths_to_splits(row_lengths):
    """Turn per-row lengths into split offsets.

    The running total of `row_lengths` is taken and one padding element is
    inserted in front of it along the first dimension, so the result is one
    entry longer than the input.
    """
    running_total = tf.cumsum(row_lengths)
    # One element of padding before the data, none after it.
    pad_front_only = [[1, 0]]
    return tf.pad(running_total, pad_front_only)
def refine_stage(self,
                 input_img_batch,
                 gtboxes_batch_r,
                 gthead_quadrant,
                 gt_smooth_label,
                 box_pred_list,
                 cls_prob_list,
                 proposal_list,
                 angle_cls_list,
                 feature_pyramid,
                 gpu_id,
                 pos_threshold,
                 neg_threshold,
                 stage,
                 proposal_filter=False):
    """Run one refinement stage: decode boxes, re-sample features, predict again.

    For every pyramid level the previous stage's regression output is decoded
    against its proposals, the decoded box centres (divided by the level
    stride) are handed to `self.refine_feature_op`, and the resulting feature
    maps are fed to `self.refine_net`. When `self.is_training` is true the
    refinement losses are added to `self.losses_dict`.

    Args:
        input_img_batch: image batch, only forwarded to
            `self.add_anchor_img_smry`.
        gtboxes_batch_r, gthead_quadrant, gt_smooth_label: ground-truth inputs
            forwarded to `refinebox_target_layer` through `tf.py_func`.
        box_pred_list, cls_prob_list, proposal_list, angle_cls_list: per-level
            sequences, iterated in lockstep with `cfgs.ANCHOR_STRIDE` and
            `cfgs.LEVEL`. An entry of `angle_cls_list` may be None.
        feature_pyramid: mapping indexed by the entries of `cfgs.LEVEL`.
        gpu_id, pos_threshold, neg_threshold: forwarded to
            `refinebox_target_layer`.
        stage: value formatted into the variable-scope, refine-net and loss
            names of this stage.
        proposal_filter: when true, only one anchor per location is kept (the
            one whose best class probability is highest).

    Returns:
        Tuple `(refine_box_pred_list, refine_cls_prob_list, refine_boxes_list,
        refine_head_cls_list, refine_angle_cls_list)`.
    """
    # NOTE(review): the block nesting below (what sits inside which `with` /
    # `if`) is the most plausible reading of the statement order -- confirm
    # against the intended layout, in particular whether `self.refine_net`
    # is meant to be created inside this variable scope.
    with tf.variable_scope('refine_feature_pyramid{}'.format(stage)):
        refine_feature_pyramid = {}
        refine_boxes_list = []
        # refine_boxes_angle_list = []
        for box_pred, cls_prob, proposal, angle_prob, stride, level in \
                zip(box_pred_list, cls_prob_list, proposal_list, angle_cls_list, cfgs.ANCHOR_STRIDE, cfgs.LEVEL):
            if proposal_filter:
                # Regroup the flat predictions per location so one anchor
                # can be picked for each location.
                box_pred = tf.reshape(
                    box_pred, [-1, self.num_anchors_per_location, 5])
                proposal = tf.reshape(proposal, [
                    -1, self.num_anchors_per_location,
                    5 if self.method == 'R' else 4
                ])
                cls_prob = tf.reshape(
                    cls_prob,
                    [-1, self.num_anchors_per_location, cfgs.CLASS_NUM])
                # Best class probability per anchor, then the best anchor
                # per location.
                cls_max_prob = tf.reduce_max(cls_prob, axis=-1)
                box_pred_argmax = tf.cast(
                    tf.reshape(tf.argmax(cls_max_prob, axis=-1), [-1, 1]),
                    tf.int32)
                # cumsum of ones minus one yields 0, 1, 2, ... i.e. the row
                # index paired with each selected anchor for gather_nd.
                indices = tf.cast(
                    tf.cumsum(tf.ones_like(box_pred_argmax), axis=0),
                    tf.int32) - tf.constant(1, tf.int32)
                indices = tf.concat([indices, box_pred_argmax], axis=-1)
                box_pred = tf.reshape(tf.gather_nd(box_pred, indices),
                                      [-1, 5])
                proposal = tf.reshape(tf.gather_nd(proposal, indices),
                                      [-1, 5 if self.method == 'R' else 4])
                # NOTE(review): the reshape above keys on `self.method` while
                # this branch keys on `cfgs.METHOD` -- presumably the same
                # setting; confirm.
                if cfgs.METHOD == 'H':
                    # Rewrite 4-value proposals as five values
                    # (x_c, y_c, w, h, theta) with theta fixed to -90.
                    # Presumably the input layout is corner coordinates --
                    # TODO confirm.
                    x_c = (proposal[:, 2] + proposal[:, 0]) / 2
                    y_c = (proposal[:, 3] + proposal[:, 1]) / 2
                    h = proposal[:, 2] - proposal[:, 0] + 1
                    w = proposal[:, 3] - proposal[:, 1] + 1
                    theta = -90 * tf.ones_like(x_c)
                    proposal = tf.transpose(
                        tf.stack([x_c, y_c, w, h, theta]))
                    if cfgs.ANGLE_RANGE == 180:
                        proposal = coordinate90_2_180_tf(proposal,
                                                         is_radian=False,
                                                         change_range=True)
                # bboxes = bbox_transform.rbbox_transform_inv(boxes=proposal, deltas=box_pred)
                # if cfgs.ANGLE_RANGE == 180:
                #     bboxes = coordinate90_2_180_tf(bboxes, is_radian=False, change_range=True)
            else:
                box_pred = tf.reshape(box_pred, [-1, 5])
                proposal = tf.reshape(proposal, [-1, 5])
            # NOTE(review): placed after the if/else because both branches
            # need `bboxes` below and the copy inside the filter branch is
            # commented out -- confirm.
            bboxes = bbox_transform.rbbox_transform_inv(boxes=proposal,
                                                        deltas=box_pred)
            if angle_prob is not None:
                # Replace the regressed angle by one derived from the
                # arg-max angle class, scaled by cfgs.OMEGA.
                angle_cls = tf.cast(
                    tf.argmax(tf.sigmoid(angle_prob), axis=1), tf.float32)
                angle_cls = (tf.reshape(angle_cls, [
                    -1,
                ]) * -1 - 0.5) * cfgs.OMEGA
                x, y, w, h, theta = tf.unstack(bboxes, axis=1)
                bboxes_angle = tf.transpose(
                    tf.stack([x, y, w, h, angle_cls]))
                refine_boxes_list.append(bboxes_angle)
                center_point = bboxes_angle[:, :2] / stride
            else:
                center_point = bboxes[:, :2] / stride
                refine_boxes_list.append(bboxes)
            refine_feature_pyramid[level] = self.refine_feature_op(
                points=center_point,
                feature_map=feature_pyramid[level],
                name=level)
        refine_box_pred_list, refine_cls_score_list, refine_cls_prob_list, refine_head_cls_list, refine_angle_cls_list = self.refine_net(
            refine_feature_pyramid, 'refine_net{}'.format(stage))
        # Flatten the per-level outputs into single tensors for the loss.
        refine_box_pred = tf.concat(refine_box_pred_list, axis=0)
        refine_cls_score = tf.concat(refine_cls_score_list, axis=0)
        # refine_cls_prob = tf.concat(refine_cls_prob_list, axis=0)
        refine_boxes = tf.concat(refine_boxes_list, axis=0)
        refine_head_cls = tf.concat(refine_head_cls_list, axis=0)
        refine_angle_cls = tf.concat(refine_angle_cls_list, axis=0)
    if self.is_training:
        with tf.variable_scope('build_refine_loss{}'.format(stage)):
            # Target assignment is delegated to refinebox_target_layer via
            # tf.py_func; all six outputs are declared float32.
            refine_labels, refine_target_delta, refine_box_states, refine_target_boxes, refine_target_head_quadrant, refine_target_smooth_label = tf.py_func(
                func=refinebox_target_layer,
                inp=[
                    gtboxes_batch_r, gthead_quadrant, gt_smooth_label,
                    refine_boxes, pos_threshold, neg_threshold, gpu_id
                ],
                Tout=[
                    tf.float32, tf.float32, tf.float32, tf.float32,
                    tf.float32, tf.float32
                ])
            if cfgs.ANGLE_RANGE == 180:
                # Boxes are converted before being drawn in the summary;
                # the unconverted `refine_boxes` is still what the losses use.
                refine_boxes_ = tf.py_func(coordinate_present_convert,
                                           inp=[refine_boxes, 1],
                                           Tout=[tf.float32])
                refine_boxes_ = tf.reshape(refine_boxes_, [-1, 5])
                self.add_anchor_img_smry(input_img_batch, refine_boxes_,
                                         refine_box_states, 1)
            else:
                self.add_anchor_img_smry(input_img_batch, refine_boxes,
                                         refine_box_states, 1)
            refine_cls_loss = losses.focal_loss(refine_labels,
                                                refine_cls_score,
                                                refine_box_states)
            if cfgs.USE_IOU_FACTOR:
                refine_reg_loss = losses.iou_smooth_l1_loss_(
                    refine_target_delta,
                    refine_box_pred,
                    refine_box_states,
                    refine_target_boxes,
                    refine_boxes,
                    is_refine=True)
                # refine_reg_loss = losses.iou_smooth_l1_loss_1(refine_box_pred,
                #                                               refine_box_states, refine_target_boxes,
                #                                               refine_boxes, is_refine=True)
            else:
                refine_reg_loss = losses.smooth_l1_loss(
                    refine_target_delta, refine_box_pred, refine_box_states)
            if cfgs.DATASET_NAME.startswith('DOTA'):
                # DOTA datasets use the class-specific head loss, restricted
                # to the listed class ids.
                head_cls_loss = losses.head_specific_cls_focal_loss(
                    refine_target_head_quadrant,
                    refine_head_cls,
                    refine_box_states,
                    refine_labels,
                    specific_cls=[6, 7, 8, 9, 10, 11])
            else:
                head_cls_loss = losses.head_focal_loss(
                    refine_target_head_quadrant, refine_head_cls,
                    refine_box_states)
            angle_cls_loss = losses.angle_focal_loss(
                refine_target_smooth_label, refine_angle_cls,
                refine_box_states)
            # Register the weighted losses under stage-specific keys.
            self.losses_dict['refine_cls_loss{}'.format(
                stage)] = refine_cls_loss * cfgs.CLS_WEIGHT
            self.losses_dict['refine_reg_loss{}'.format(
                stage)] = refine_reg_loss * cfgs.REG_WEIGHT
            self.losses_dict['head_cls_loss{}'.format(
                stage)] = head_cls_loss * cfgs.HEAD_CLS_WEIGHT
            self.losses_dict['angle_cls_loss{}'.format(
                stage)] = angle_cls_loss * cfgs.ANGLE_CLS_WEIGHT
    return refine_box_pred_list, refine_cls_prob_list, refine_boxes_list, refine_head_cls_list, refine_angle_cls_list
def __init__(self, config):
    """Constructs a new RNN.

    Builds the whole graph (input/target placeholders, recurrent unrolling,
    student forcing, error and minimizer), creates a session and runs the
    global variable initializer.

    Args:
        config: The configuration parameters, indexed by key.

            Keys read directly by this constructor:
              unique_name: Define the unique name of this lstm; also used as
                the outermost variable scope.
              seed: Represents the seed for this model.
              num_input: The number of input units per step.
              num_output: The number of output units per step.
              rec_num_layers: Added to rec_num_layers_teacher_forcing to
                size the second dimension of the input placeholder.
              rec_num_layers_teacher_forcing: Sizes the input and target
                placeholders and how many trailing hidden states are kept.
              rec_num_layers_student_forcing: Number of extra steps in which
                the model is fed its own output.
              add_variance: If truthy, Gaussian noise (stddev 0.01) is added
                to the input whenever `training_time` is fed as True.
              distance_model: If truthy, outputs are treated as increments
                that are accumulated on top of the last input.
              lr_rate: The initial learning rate
              lr_decay_steps: The steps until a decay should happen
              lr_decay_rate: How much should the learning rate be reduced

            Keys presumably consumed by the helper methods called from here
            (not read in this constructor -- confirm):
              num_hidden: The number of units in the hidden layer.
              num_cells: The number of cells per layer
              num_layers: Define number of time-step unfolds.
              clip_norm: The norm, to which a gradient should be clipped
              batch_size: This represents the batch size used for training.
              minimizer: Select the appropriate minimizer
              momentum: The momentum if the minimizer is momentum
    """

    # Save configuration
    self.config = config
    self.name = config['unique_name']

    # The graph-level seed is only picked up by ops created *after* it is
    # set, so it has to be fixed before any part of the graph is built.
    tf.set_random_seed(self.config['seed'])

    # create a new session for running the model
    # the session is not visible to the callee
    self.sess = tf.Session()

    # use the name of the model as a first variable scope
    with tf.variable_scope(config['unique_name']):

        # ------------------------ INITIALIZATION ----------------------------

        # create initializers and use xavier initialization for the weights
        # and use a bias of zero
        self.bias_initializer = tf.constant_initializer(0.0)
        self.weights_initializer = \
            tf.contrib.layers.variance_scaling_initializer(
                1.0, 'FAN_AVG', True, config['seed'])

        # just a placeholder indicating whether this is training time or not
        self.training_time = tf.placeholder(tf.bool, None,
                                            name="training_time")

        # create preprocess and postprocess network. Both will be
        # modeled as a highway network
        self.pre_highway_network = self.get_preprocess_network()
        self.post_highway_network = self.get_postprocess_network()

        # initialize all cells
        self.cells = self.__init_all_cells()

        # ----------------------- VARIABLES & PLACEHOLDER ---------------------

        self.global_step = tf.Variable(0, trainable=False,
                                       name='global_step')

        # X and Y Tensor
        self.x = tf.placeholder(
            tf.float32,
            [config['num_input'],
             config['rec_num_layers']
             + config['rec_num_layers_teacher_forcing'],
             None],
            name="input")
        self.y = tf.placeholder(
            tf.float32,
            [config['num_output'],
             config['rec_num_layers_student_forcing']
             + config['rec_num_layers_teacher_forcing'] + 1,
             None],
            name="target")

        # --------------------------------- GRAPH -----------------------------

        # define the memory state: every cell's state is tiled along its
        # second axis by the dynamic size of the input's last dimension
        self.h = [tf.tile(cell.get_hidden_state(), [1, tf.shape(self.x)[2]])
                  for cell in self.cells]

        # the noisy input is only selected while training_time is True, and
        # only when add_variance is configured
        normalized_x = self.x + tf.random_normal(tf.shape(self.x), 0.0, 0.01)
        to_use_x = tf.cond(self.training_time,
                           lambda: tf.identity(normalized_x),
                           lambda: tf.identity(self.x)) \
            if config['add_variance'] else self.x

        # unstack the input to a list, so it can be easier processed
        unstacked_x = tf.unstack(to_use_x, axis=1)

        # create all 3 components of the network, from preprocess, recurrent
        # and postprocess parts of the network.
        processed_unstacked_x = self.get_input_to_hidden_network(unstacked_x)
        lst_h = self.get_hidden_to_hidden_network(
            config, processed_unstacked_x, self.h)
        cutted_lst_h = lst_h[-(config['rec_num_layers_teacher_forcing'] + 1):]

        # create the outputs for each element in the list
        lst_output = self.get_hidden_to_output_network(cutted_lst_h)

        # apply some student forcing: feed the latest output back in as the
        # next input
        for _ in range(config['rec_num_layers_student_forcing']):
            added_model = lst_output[-1] + (
                0 if not config['distance_model'] else unstacked_x[-1])
            unstacked_x.append(added_model)
            processed_self_x_in = self.get_input_to_hidden_network(
                [added_model])
            h = self.get_hidden_to_hidden_network(
                config, processed_self_x_in, cutted_lst_h[-1])
            lst_output.append(self.get_hidden_to_output_network(h)[0])

        # define the target y
        self.target_y = tf.stack(lst_output, axis=1, name="target_y")
        if config['distance_model']:
            self.target_y = tf.cumsum(self.target_y, axis=1) \
                + tf.expand_dims(unstacked_x[-1], axis=1)

        # first of create the reduced squared error
        err = self.target_y - self.y
        squared_err = tf.pow(err, 2)

        # So far we have got the model
        self.error = 0.5 * tf.reduce_mean(
            tf.reduce_sum(squared_err, axis=[0, 1]))
        self.single_absolute_error = tf.reduce_sum(
            tf.reduce_mean(tf.abs(err), axis=1), axis=1)

        # create the decaying learning rate
        self.learning_rate = tf.train.exponential_decay(
            self.config['lr_rate'],
            self.global_step,
            self.config['lr_decay_steps'],
            self.config['lr_decay_rate'],
            staircase=False)

        # create the minimizer
        self.minimizer = self.create_minimizer(
            self.learning_rate, self.error, self.global_step)

        # init if not restored
        init = tf.global_variables_initializer()
        self.sess.run(init)
def get_cum_graph_size(node):
    """Return the cumulative-size offset of the graph that `node` falls in.

    `graph_sizes` is taken from the enclosing scope. Its exclusive running
    sum gives one offset per graph; the last offset that is not greater than
    `node` is returned.
    NOTE(review): presumably `node` is a scalar node index -- confirm.
    """
    cum_graph_sizes = tf.cumsum(graph_sizes, exclusive=True)
    # 1 for every graph whose offset is <= node; the count minus one is the
    # index of the graph containing the node.
    indicator_if_smaller = tf.cast(node - cum_graph_sizes >= 0, tf.int32)
    graph_id = tf.reduce_sum(indicator_if_smaller) - 1
    # Reuse the offsets computed above instead of building a second cumsum.
    return cum_graph_sizes[graph_id]
def call(self, X):
    """Project `X` with the kernel and accumulate the levels along axis 1.

    `X` is optionally prefixed with a time channel running from -1 to 1,
    multiplied with `self.kernel`, and reshaped to
    `[-1, len_examples, len_tensors, units]`. Level 0 is the (cumulative)
    sum of slice 0; every further level is built from products of later
    slices with exclusive cumulative sums of the previous terms.

    Returns:
        The levels stacked on axis -2 when `self.return_levels` is set,
        otherwise their sum. With `self.return_sequences` the accumulation
        over axis 1 is kept as a running sum instead of being reduced.
    """
    dtype = X.dtype
    batch = tf.shape(X)[0]

    if self.add_time:
        steps = tf.cast(tf.shape(X)[1], dtype)
        time_channel = tf.tile(
            tf.range(steps, dtype=dtype)[None, :, None], [batch, 1, 1])
        # Rescale 0..steps-1 to the interval [-1, 1].
        time_channel = time_channel * (2. / (steps - 1.))
        time_channel = time_channel - 1.
        X = tf.concat((time_channel, X), axis=-1)

    flat_inputs = tf.reshape(X, [-1, self.num_features])
    flat_kernel = tf.reshape(self.kernel, [self.num_features, -1])
    M = tf.reshape(
        tf.matmul(flat_inputs, flat_kernel),
        [-1, self.len_examples, self.len_tensors, self.units])

    # do final differencing
    if self.difference:
        first_step = tf.zeros_like(M[:, :1])
        M = tf.concat((first_step, M[:, 1:] - M[:, :-1]), axis=1)

    def _collapse(values):
        # Running sum when sequences are requested, plain sum otherwise.
        if self.return_sequences:
            return tf.cumsum(values, reverse=self.reverse, axis=1)
        return tf.reduce_sum(values, axis=1)

    def _advance(increment, terms, depth):
        # Next set of terms from the current ones and one slice of M.
        following = np.empty((depth), dtype=tf.Tensor)
        following[0] = increment * tf.cumsum(tf.add_n(terms.tolist()),
                                             reverse=self.reverse,
                                             exclusive=True,
                                             axis=1)
        for j in range(1, depth):
            following[j] = 1 / tf.cast(
                j + 1, dtype=dtype) * increment * terms[j - 1]
        return following

    Y = [_collapse(M[..., 0, :])]

    if self.recursive_tensors:
        # One slice of M per level; terms carry over between levels.
        terms = np.asarray([M[..., 0, :]])
        for level in range(1, self.num_levels):
            terms = _advance(M[..., level, :], terms,
                             min(level + 1, self.order))
            Y.append(_collapse(tf.add_n(terms.tolist())))
    else:
        # Every level consumes its own run of consecutive slices of M.
        slice_idx = 1
        for level in range(1, self.num_levels):
            terms = np.asarray([M[..., slice_idx, :]])
            slice_idx += 1
            for step in range(1, level + 1):
                terms = _advance(M[..., slice_idx, :], terms,
                                 min(step + 1, self.order))
                slice_idx += 1
            Y.append(_collapse(tf.add_n(terms.tolist())))

    if self.return_levels:
        return tf.stack(Y, axis=-2)
    return tf.add_n(Y)
def EffectiveSampleSize(states,
                        filter_beyond_lag=300,
                        filter_threshold=0.05,
                        center=True,
                        normalize=True):
    """Effective sample size of a single `Tensor` of states.

    The auto-correlation R[k] of `states` along axis 0 is obtained from
    `SanitizedAutoCorrelation` (at most `filter_beyond_lag` lags), averaged
    over axis 1, optionally truncated at the first lag whose magnitude drops
    below `filter_threshold`, and combined as

        ESS = N / (1 + 2 * sum_{k >= 1} (N - k) / N * R[k])

    where N is the size of axis 0 of `states`.
    """

    def _leading_size(x):
        """Size of axis 0 of `x`, cast to the dtype of `x`."""
        first_dim = tf.gather(tf.shape(x), 0)
        return tf.cast(tf.reduce_prod(first_dim), x.dtype)

    scope_values = [states, filter_beyond_lag, filter_threshold]
    with tf.name_scope("effective_sample_size_single_state",
                       values=scope_values):
        states = tf.convert_to_tensor(states, name="states")
        state_dtype = states.dtype

        # filter_beyond_lag == None ==> the full sequence of lags is used.
        rho = SanitizedAutoCorrelation(states,
                                       axis=0,
                                       center=center,
                                       normalize=normalize,
                                       max_lags=filter_beyond_lag)
        rho = tf.reduce_mean(rho, 1)

        if filter_threshold is not None:
            threshold = tf.convert_to_tensor(filter_threshold,
                                             dtype=state_dtype,
                                             name="filter_threshold")
            # Keep lags only up to (not including) the first one whose
            # magnitude is under the threshold. Worked example with
            # rho = [1, 0.5, 0.0, 0.3] and threshold 0.2:
            #   under the threshold -> [0, 0, 1, 0]
            #   running count       -> [0, 0, 1, 1]
            #   max(1 - count, 0)   -> [1, 1, 0, 0]
            under = tf.cast(tf.abs(rho) < threshold, dtype=state_dtype)
            under_so_far = tf.cumsum(under, axis=0)
            keep = tf.maximum(1. - under_so_far, 0.)
            rho *= keep

        # Weights (N - k) / N, reshaped to [M, 1, ..., 1] so that they
        # broadcast against rho.
        num_states = _leading_size(states)
        lags = tf.range(0., _leading_size(rho))
        lag_weights = (num_states - lags) / num_states

        static_rank = rho.shape.ndims
        if static_rank is None:
            column_shape = tf.concat(
                ([-1], tf.ones([tf.rank(rho) - 1], dtype=tf.int32)), axis=0)
        else:
            column_shape = [-1] + [1] * (static_rank - 1)
        lag_weights = tf.reshape(lag_weights, column_shape)

        # Lag 0 is left out of the sum; it is accounted for by the 1.0.
        weighted_tail = lag_weights[1:, Ellipsis] * rho[1:, Ellipsis]
        return num_states / (1.0 + 2 * tf.reduce_sum(weighted_tail, axis=0))
def safe_cumprod(x, **kwargs):
    """Cumulative product evaluated as exp(cumsum(log(x))) to avoid underflow.

    `x` is clipped to [1e-10, 1] before the logarithm is taken. Any keyword
    arguments are passed on to `tf.cumsum`.
    """
    clipped = tf.clip_by_value(x, 1e-10, 1)
    log_running_sum = tf.cumsum(tf.log(clipped), **kwargs)
    return tf.exp(log_running_sum)
def build(self, for_deploy, variants=""):
    """Build the encoder / VAE / attention-decoder graph.

    Three graph flavours are produced:
      * `for_deploy` falsy: training graph with scheduled-sampling decoder
        and the loss in `self.loss`.
      * `for_deploy` truthy and `variants == "score"`: scoring graph that
        sums the log-probabilities of the given decoder inputs.
      * otherwise: beam-search decoding graph with
        `sum(self.conf.beam_splits)` beams.

    Args:
        for_deploy: selects between the training graph and a deploy graph.
        variants: `"score"` selects the scoring graph when deploying.

    Returns:
        dict with the keys "loss", "inputs", "outputs" and either
        "debug_outputs" (training graph) or "visualize" (deploy graphs).
    """
    conf = self.conf
    dtype = self.dtype
    self.beam_size = 1 if (not for_deploy or variants == "score") else sum(
        self.conf.beam_splits)

    graphlg.info("Creating placeholders...")
    self.enc_str_inps = tf.placeholder(tf.string,
                                       shape=(None, conf.input_max_len),
                                       name="enc_inps")
    self.enc_lens = tf.placeholder(tf.int32, shape=[None], name="enc_lens")
    self.dec_str_inps = tf.placeholder(
        tf.string, shape=[None, conf.output_max_len + 2], name="dec_inps")
    self.dec_lens = tf.placeholder(tf.int32, shape=[None], name="dec_lens")
    self.down_wgts = tf.placeholder(tf.float32,
                                    shape=[None],
                                    name="down_wgts")

    with tf.name_scope("TableLookup"):
        # lookup tables: string -> id for inputs, id -> string for outputs
        self.in_table = lookup.MutableHashTable(key_dtype=tf.string,
                                                value_dtype=tf.int64,
                                                default_value=UNK_ID,
                                                shared_name="in_table",
                                                name="in_table",
                                                checkpoint=True)
        self.out_table = lookup.MutableHashTable(key_dtype=tf.int64,
                                                 value_dtype=tf.string,
                                                 default_value="_UNK",
                                                 shared_name="out_table",
                                                 name="out_table",
                                                 checkpoint=True)
        self.enc_inps = self.in_table.lookup(self.enc_str_inps)
        self.dec_inps = self.in_table.lookup(self.dec_str_inps)

    # Create encode graph and get attn states
    graphlg.info("Creating embeddings and embedding enc_inps.")
    with ops.device("/cpu:0"):
        self.embedding = variable_scope.get_variable(
            "embedding", [conf.output_vocab_size, conf.embedding_size])

    with tf.name_scope("Embed") as scope:
        dec_inps = tf.slice(self.dec_inps, [0, 0],
                            [-1, conf.output_max_len + 1])
        with ops.device("/cpu:0"):
            self.emb_inps = embedding_lookup_unique(self.embedding,
                                                    self.enc_inps)
            emb_dec_inps = embedding_lookup_unique(self.embedding, dec_inps)

    # output projector (w, b); the input width of w follows whatever feeds
    # the projection (extra output layer, bidirectional or plain cell)
    with tf.variable_scope("OutProj"):
        if conf.out_layer_size:
            w = tf.get_variable(
                "proj_w", [conf.out_layer_size, conf.output_vocab_size],
                dtype=dtype)
        elif conf.bidirectional:
            w = tf.get_variable(
                "proj_w", [conf.num_units * 2, conf.output_vocab_size],
                dtype=dtype)
        else:
            w = tf.get_variable("proj_w",
                                [conf.num_units, conf.output_vocab_size],
                                dtype=dtype)
        b = tf.get_variable("proj_b", [conf.output_vocab_size], dtype=dtype)

    graphlg.info("Creating dynamic rnn...")
    self.enc_outs, self.enc_states, mem_size, enc_state_size = DynRNN(
        conf.cell_model,
        conf.num_units,
        conf.num_layers,
        self.emb_inps,
        self.enc_lens,
        keep_prob=1.0,
        bidi=conf.bidirectional,
        name_scope="DynRNNEncoder")
    batch_size = tf.shape(self.enc_outs)[0]

    # Do vae on every encoder state and append the latent z to it
    final_enc_states = []
    KLDs = 0.0
    for each in self.enc_states:
        z, KLD, l2 = CreateVAE([each],
                               self.conf.enc_latent_dim,
                               name_scope="VAE")
        if isinstance(each, LSTMStateTuple):
            final_enc_states.append(
                LSTMStateTuple(each.c, tf.concat([each.h, z], 1)))
        else:
            # Fixed: this branch used to append to the undefined name
            # `final_enc_state` and failed for non-LSTM encoder states.
            final_enc_states.append(tf.concat([z, each], 1))
        KLDs += KLD / self.conf.num_layers

    with tf.name_scope("DynRNNDecode") as scope:
        # NOTE(review): the inner `as scope` rebinds `scope`, so the
        # dynamic_decode calls below receive the "ShapeToBeam" scope name.
        # Left untouched because it influences graph naming -- confirm
        # whether this is intended.
        with tf.name_scope("ShapeToBeam") as scope:
            # Replicate memory, lengths and initial states once per beam.
            beam_memory = tf.reshape(
                tf.tile(self.enc_outs, [1, 1, self.beam_size]),
                [-1, conf.input_max_len, mem_size])
            beam_memory_lens = tf.squeeze(
                tf.reshape(
                    tf.tile(tf.expand_dims(self.enc_lens, 1),
                            [1, self.beam_size]), [-1, 1]), 1)

            def _to_beam(t):
                return tf.reshape(tf.tile(t, [1, self.beam_size]),
                                  [-1, int(t.get_shape()[1])])

            beam_init_states = tf.contrib.framework.nest.map_structure(
                _to_beam, final_enc_states)

        max_mem_size = self.conf.input_max_len + self.conf.output_max_len + 2
        cell = AttnCell(cell_model=conf.cell_model,
                        num_units=mem_size,
                        num_layers=conf.num_layers,
                        attn_type=self.conf.attention,
                        memory=beam_memory,
                        mem_lens=beam_memory_lens,
                        max_mem_size=max_mem_size,
                        addmem=self.conf.addmem,
                        keep_prob=conf.keep_prob,
                        dtype=tf.float32,
                        name_scope="AttnCell")

        dec_init_state = DecStateInit(all_enc_states=beam_init_states,
                                      decoder_cell=cell,
                                      batch_size=batch_size * self.beam_size,
                                      init_type="each2each")

        if not for_deploy:
            hp_train = helper.ScheduledEmbeddingTrainingHelper(
                inputs=emb_dec_inps,
                sequence_length=self.dec_lens,
                embedding=self.embedding,
                sampling_probability=self.conf.sample_prob,
                out_proj=(w, b))
            output_layer = layers_core.Dense(
                self.conf.out_layer_size,
                use_bias=True) if self.conf.out_layer_size else None
            my_decoder = basic_decoder.BasicDecoder(
                cell=cell,
                helper=hp_train,
                initial_state=dec_init_state,
                output_layer=output_layer)
            cell_outs, final_state = decoder.dynamic_decode(
                decoder=my_decoder,
                impute_finished=False,
                maximum_iterations=conf.output_max_len + 1,
                scope=scope)
        elif variants == "score":
            # NOTE(review): `zero_attn_states` is not defined anywhere in
            # this method; unless it is a module-level name this branch
            # raises NameError -- confirm.
            dec_init_state = zero_attn_states
            hp_train = helper.ScheduledEmbeddingTrainingHelper(
                inputs=emb_dec_inps,
                sequence_length=self.dec_lens,
                embedding=self.embedding,
                sampling_probability=0.0,
                out_proj=(w, b))
            output_layer = layers_core.Dense(
                self.conf.out_layer_size,
                use_bias=True) if self.conf.out_layer_size else None
            my_decoder = score_decoder.ScoreDecoder(
                cell=cell,
                helper=hp_train,
                out_proj=(w, b),
                initial_state=dec_init_state,
                output_layer=output_layer)
            cell_outs, final_state = decoder.dynamic_decode(
                decoder=my_decoder,
                scope=scope,
                maximum_iterations=self.conf.output_max_len,
                impute_finished=False)
        else:
            hp_infer = helper.GreedyEmbeddingHelper(
                embedding=self.embedding,
                start_tokens=tf.ones(shape=[batch_size * self.beam_size],
                                     dtype=tf.int32),
                end_token=EOS_ID,
                out_proj=(w, b))
            output_layer = layers_core.Dense(
                self.conf.out_layer_size,
                use_bias=True) if self.conf.out_layer_size else None
            my_decoder = beam_decoder.BeamDecoder(
                cell=cell,
                helper=hp_infer,
                out_proj=(w, b),
                initial_state=dec_init_state,
                beam_splits=self.conf.beam_splits,
                max_res_num=self.conf.max_res_num,
                output_layer=output_layer)
            cell_outs, final_state = decoder.dynamic_decode(
                decoder=my_decoder,
                scope=scope,
                maximum_iterations=self.conf.output_max_len,
                impute_finished=True)

    if not for_deploy:
        outputs = cell_outs.rnn_output
        # Output projected to logits
        L = tf.shape(outputs)[1]
        outputs = tf.reshape(outputs, [-1, int(w.shape[0])])
        outputs = tf.matmul(outputs, w) + b
        logits = tf.reshape(outputs, [-1, L, int(w.shape[1])])

        # branch 1 for debugging, doesn't have to be called
        with tf.name_scope("DebugOutputs") as scope:
            self.outputs = tf.argmax(logits, axis=2)
            self.outputs = tf.reshape(self.outputs, [-1, L])
            self.outputs = self.out_table.lookup(
                tf.cast(self.outputs, tf.int64))

        with tf.name_scope("Loss") as scope:
            # Targets are the decoder inputs shifted left by one step.
            tars = tf.slice(self.dec_inps, [0, 1], [-1, L])
            # Reverse cumsum of a one-hot at dec_lens: ones up to and
            # including that position, zeros afterwards.
            wgts = tf.cumsum(tf.one_hot(self.dec_lens, L),
                             axis=1,
                             reverse=True)

            #wgts = wgts * tf.expand_dims(self.down_wgts, 1)
            self.loss = loss.sequence_loss(logits=logits,
                                           targets=tars,
                                           weights=wgts,
                                           average_across_timesteps=False,
                                           average_across_batch=False)
            example_losses = tf.reduce_sum(self.loss, 1)

            batch_wgt = tf.reduce_sum(self.down_wgts)
            see_KLD = tf.reduce_sum(KLDs * self.down_wgts) / batch_wgt
            see_loss = tf.reduce_sum(example_losses / tf.cast(
                self.dec_lens, tf.float32) * self.down_wgts) / batch_wgt

            # not average over length
            self.loss = tf.reduce_sum(
                (example_losses + self.conf.kld_ratio * KLDs) *
                self.down_wgts) / batch_wgt

        with tf.name_scope(self.model_kind):
            tf.summary.scalar("loss", see_loss)
            tf.summary.scalar("kld", see_KLD)

        graph_nodes = {
            "loss": self.loss,
            "inputs": {},
            "outputs": {},
            "debug_outputs": self.outputs
        }

    elif variants == "score":
        L = tf.shape(cell_outs.logprobs)[1]
        # Pick the log-probability of each given target token and sum over
        # vocabulary, then over time.
        one_hot = tf.one_hot(tf.slice(self.dec_inps, [0, 1], [-1, L]),
                             depth=self.conf.output_vocab_size,
                             axis=-1,
                             on_value=1.0,
                             off_value=0.0)
        outputs = tf.reduce_sum(cell_outs.logprobs * one_hot, 2)
        outputs = tf.reduce_sum(outputs, axis=1)
        inputs = {
            "enc_inps:0": self.enc_str_inps,
            "enc_lens:0": self.enc_lens,
            "dec_inps:0": self.dec_str_inps,
            "dec_lens:0": self.dec_lens
        }
        graph_nodes = {
            "loss": None,
            "inputs": inputs,
            "outputs": {
                "logprobs": outputs
            },
            "visualize": None
        }
    else:
        L = tf.shape(cell_outs.beam_ends)[1]
        beam_symbols = cell_outs.beam_symbols
        beam_parents = cell_outs.beam_parents

        beam_ends = cell_outs.beam_ends
        beam_end_parents = cell_outs.beam_end_parents
        beam_end_probs = cell_outs.beam_end_probs
        alignments = cell_outs.alignments

        beam_ends = tf.reshape(tf.transpose(beam_ends, [0, 2, 1]), [-1, L])
        beam_end_parents = tf.reshape(
            tf.transpose(beam_end_parents, [0, 2, 1]), [-1, L])
        beam_end_probs = tf.reshape(
            tf.transpose(beam_end_probs, [0, 2, 1]), [-1, L])

        ## Creating tail_ids
        batch_size = tf.Print(batch_size, [batch_size],
                              message="VAERNN2 batch")

        # Row i holds i * beam_size; subtracting it below makes the parent
        # ids relative to their own batch entry.
        batch_offset = tf.expand_dims(
            tf.cumsum(
                tf.ones([batch_size, self.beam_size], dtype=tf.int32) *
                self.beam_size,
                axis=0,
                exclusive=True), 2)
        offset2 = tf.expand_dims(
            tf.cumsum(
                tf.ones([batch_size, self.beam_size * 2], dtype=tf.int32) *
                self.beam_size,
                axis=0,
                exclusive=True), 2)

        out_len = tf.shape(beam_symbols)[1]
        self.beam_symbol_strs = tf.reshape(
            self.out_table.lookup(tf.cast(beam_symbols, tf.int64)),
            [batch_size, self.beam_size, -1])
        self.beam_parents = tf.reshape(
            beam_parents, [batch_size, self.beam_size, -1]) - batch_offset

        self.beam_ends = tf.reshape(beam_ends,
                                    [batch_size, self.beam_size * 2, -1])
        self.beam_end_parents = tf.reshape(
            beam_end_parents,
            [batch_size, self.beam_size * 2, -1]) - offset2
        self.beam_end_probs = tf.reshape(
            beam_end_probs, [batch_size, self.beam_size * 2, -1])
        self.beam_attns = tf.reshape(
            alignments, [batch_size, self.beam_size, out_len, -1])

        inputs = {
            "enc_inps:0": self.enc_str_inps,
            "enc_lens:0": self.enc_lens
        }
        outputs = {
            "beam_symbols": self.beam_symbol_strs,
            "beam_parents": self.beam_parents,
            "beam_ends": self.beam_ends,
            "beam_end_parents": self.beam_end_parents,
            "beam_end_probs": self.beam_end_probs,
            "beam_attns": self.beam_attns
        }

        graph_nodes = {
            "loss": None,
            "inputs": inputs,
            "outputs": outputs,
            # z is the latent of the last encoder state processed above
            "visualize": {
                "z": z
            }
        }
    return graph_nodes
def monotonic_attention(p_choose_i, previous_attention, mode):
    """Turn choosing probabilities into a monotonic attention distribution.

    Monotonic attention reads the input strictly left to right: once an input
    element has been attended to at one output step, earlier elements can no
    longer be attended to at later output steps. For background see "Online
    and Linear-Time Attention by Enforcing Monotonic Alignments".

    Args:
      p_choose_i: Probability of choosing each input/memory element, shape
        (batch_size, input_sequence_length). Every entry is the result of a
        sigmoid and therefore lies in [0, 1].
      previous_attention: Attention distribution of the previous output step,
        shape (batch_size, input_sequence_length). For the first output step
        each row should be [1, 0, 0, ..., 0].
      mode: One of 'recursive', 'parallel' or 'hard'.
        * 'recursive' evaluates the recurrence with tf.scan. Slowest, but
          exact, general and numerically stable.
        * 'parallel' uses the closed-form solution built from cumulative sums
          and products. Faster, but relies on clipping, so the result is not
          exact; long inputs or entries of p_choose_i very close to 0 or 1
          make this worse.
        * 'hard' assumes every entry of p_choose_i is exactly 0 or 1 and uses
          a cheaper exact formula.

    Returns:
      Tensor of shape (batch_size, input_sequence_length) holding the
      attention distribution of every sequence in the batch.

    Raises:
      ValueError: mode is not one of 'recursive', 'parallel', 'hard'.
    """
    if mode == "recursive":
        batch_size = tf.shape(p_choose_i)[0]
        # [1, 1 - p_choose_i[0], 1 - p_choose_i[1], ..., 1 - p_choose_i[-2]]
        shifted_1mp_choose_i = tf.concat(
            [tf.ones((batch_size, 1)), 1 - p_choose_i[:, :-1]], 1)

        def _step(q_prev, decay_and_carry):
            # q[i] = (1 - p_choose_i[i]) * q[i - 1] + previous_attention[i];
            # the reshape reminds TF of the shape between loop iterations.
            return tf.reshape(
                decay_and_carry[0]*q_prev + decay_and_carry[1], (batch_size,))

        # Scan runs over the sequence axis, hence the transposes.
        scan_inputs = [tf.transpose(shifted_1mp_choose_i),
                       tf.transpose(previous_attention)]
        q = tf.scan(_step, scan_inputs, tf.zeros((batch_size,)))
        # attention[i] = p_choose_i[i] * q[i]
        return p_choose_i*tf.transpose(q)

    if mode == "parallel":
        # safe_cumprod computes cumprod in logspace with numeric checks
        cumprod_1mp_choose_i = safe_cumprod(
            1 - p_choose_i, axis=1, exclusive=True)
        # Clip the divisor to avoid divide-by-zero
        safe_divisor = tf.clip_by_value(cumprod_1mp_choose_i, 1e-10, 1.)
        rescaled_history = tf.cumsum(previous_attention / safe_divisor, axis=1)
        # Closed-form solution of the recurrence
        return p_choose_i*cumprod_1mp_choose_i*rescaled_history

    if mode == "hard":
        # Drop every probability before the index chosen at the last step
        reachable = p_choose_i*tf.cumsum(previous_attention, axis=1)
        # The exclusive cumprod removes everything after the first chosen
        # index:
        #   reachable                               = [0, 0, 0, 1, 1, 0, 1, 1]
        #   cumprod(1 - reachable, exclusive=True)  = [1, 1, 1, 1, 0, 0, 0, 0]
        #   product of the two                      = [0, 0, 0, 1, 0, 0, 0, 0]
        return reachable*tf.cumprod(1 - reachable, axis=1, exclusive=True)

    raise ValueError("mode must be 'recursive', 'parallel', or 'hard'.")
def __init__(
        self,
        num_symbols,
        num_qwords,  # modify
        num_embed_units,
        num_units,
        num_layers,
        is_train,
        vocab=None,
        embed=None,
        question_data=True,
        learning_rate=0.5,
        learning_rate_decay_factor=0.95,
        max_gradient_norm=5.0,
        num_samples=512,
        max_length=30,
        use_lstm=False,
        use_bidrnn=False):
    """Build the TensorFlow graph of the attention-based seq2seq model.

    Creates the input placeholders, the string-to-index vocabulary table, the
    embedding table and the RNN encoder, then either the training decoder with
    its loss and SGD update (``is_train`` true) or the inference decoder
    (``is_train`` false), and finally a ``tf.train.Saver``.

    Args:
      num_symbols: Size of the symbol table and number of embedding rows.
      num_qwords: Forwarded to ``output_projection_layer``.
      num_embed_units: Width of a randomly initialised embedding table.
      num_units: Size of each RNN cell.
      num_layers: Number of stacked RNN cells.
      is_train: Selects the training graph (loss, gradients, update op) or
        the inference graph (decoder distribution, generated symbols).
      vocab: Initial value of the ``symbols`` variable when ``is_train``.
      embed: Initial value for the embedding table, or None to let
        ``tf.get_variable`` initialise a [num_symbols, num_embed_units] table.
      question_data: Forwarded to ``output_projection_layer``.
      learning_rate: Initial value of the learning-rate variable.
      learning_rate_decay_factor: Multiplier applied by
        ``learning_rate_decay_op``.
      max_gradient_norm: Global-norm clipping threshold for the gradients.
      num_samples: Forwarded to ``output_projection_layer``.
      max_length: Forwarded to the inference decoder function.
      use_lstm: Use ``LSTMCell`` instead of ``GRUCell``.
      use_bidrnn: Encode with ``multi_bidirectional_rnn`` instead of
        ``dynamic_rnn``.
    """
    self.posts = tf.placeholder(tf.string, shape=(None, None))  # batch*len
    # NOTE: `shape=(None)` is just `None` (no trailing comma), i.e. a
    # placeholder of unknown shape rather than an explicit rank-1 shape.
    self.posts_length = tf.placeholder(tf.int32, shape=(None))  # batch
    self.responses = tf.placeholder(tf.string, shape=(None, None))  # batch*len
    self.responses_length = tf.placeholder(tf.int32, shape=(None))  # batch
    self.keyword_tensor = tf.placeholder(
        tf.float32,
        shape=(None, 3, None))  # (batch * len) * 3 * numsymbol, not used in STD
    self.word_type = tf.placeholder(tf.int32, shape=(None))  # (batch * len)

    # build the vocab table (string to index)
    if is_train:
        self.symbols = tf.Variable(vocab, trainable=False, name="symbols")
    else:
        # Dummy contents of the right size (presumably overwritten when a
        # checkpoint is restored -- verify against the caller).
        self.symbols = tf.Variable(np.array(['.'] * num_symbols),
                                   name="symbols")
    # The second positional argument of tf.Variable is `trainable`.
    self.symbol2index = HashTable(KeyValueTensorInitializer(
        self.symbols,
        tf.Variable(
            np.array([i for i in range(num_symbols)], dtype=np.int32),
            False)),
        default_value=UNK_ID, name="symbol2index")

    # string2index for post and response
    self.posts_input = self.symbol2index.lookup(self.posts)  # batch*len
    self.responses_target = self.symbol2index.lookup(
        self.responses)  # batch*len

    batch_size, decoder_len = tf.shape(self.responses)[0], tf.shape(
        self.responses)[1]
    # Decoder input: drop the last column of responses_target and put GO_ID
    # in front of it.
    self.responses_input = tf.concat([
        tf.ones([batch_size, 1], dtype=tf.int32) * GO_ID,
        tf.split(self.responses_target, [decoder_len - 1, 1], 1)[0]
    ], 1)  # batch*len
    # Reverse cumsum of a one-hot at position (length - 1) yields ones up to
    # and including that position and zeros after it.
    self.decoder_mask = tf.reshape(
        tf.cumsum(tf.one_hot(self.responses_length - 1, decoder_len),
                  reverse=True, axis=1),
        [-1, decoder_len])  # batch * len

    print "embedding..."
    # build the embedding table (index to vector)
    if embed is None:
        # initialize the embedding randomly
        self.embed = tf.get_variable('embed', [num_symbols, num_embed_units],
                                     tf.float32)
    else:
        # initialize the embedding by pre-trained word vectors
        self.embed = tf.get_variable('embed', dtype=tf.float32,
                                     initializer=embed)

    self.encoder_input = tf.nn.embedding_lookup(
        self.embed, self.posts_input)  # batch*len*unit
    self.decoder_input = tf.nn.embedding_lookup(self.embed,
                                                self.responses_input)
    print "embedding finished"

    # NOTE(review): `[cell] * num_layers` repeats the SAME cell object; how
    # that is treated depends on the TensorFlow version in use -- confirm.
    if use_lstm:
        cell = MultiRNNCell([LSTMCell(num_units)] * num_layers)
    else:
        cell = MultiRNNCell([GRUCell(num_units)] * num_layers)

    # for bidirectional rnn, not used in STD in final experiment
    if use_bidrnn:
        if use_lstm:
            encoder_cell = LSTMCell
        else:
            encoder_cell = GRUCell
        # rnn encoder
        # `num_units / 2` is integer division only under Python 2.
        encoder_output, encoder_state = multi_bidirectional_rnn(
            encoder_cell, num_units / 2, num_layers, self.encoder_input,
            self.posts_length)
    else:
        # rnn encoder
        encoder_output, encoder_state = dynamic_rnn(cell,
                                                    self.encoder_input,
                                                    self.posts_length,
                                                    dtype=tf.float32,
                                                    scope="encoder")

    # get output projection function
    output_fn, sampled_sequence_loss = output_projection_layer(
        num_units, num_symbols, num_qwords, num_samples, question_data)

    print "encoder_output.shape:", encoder_output.get_shape()

    # get attention function
    attention_keys, attention_values, attention_score_fn, attention_construct_fn \
        = attention_decoder_fn.prepare_attention(encoder_output, 'luong', num_units)

    # get decoding loop function
    decoder_fn_train = attention_decoder_fn.attention_decoder_fn_train(
        encoder_state, attention_keys, attention_values, attention_score_fn,
        attention_construct_fn)
    decoder_fn_inference = attention_decoder_fn.attention_decoder_fn_inference(
        output_fn, self.keyword_tensor, encoder_state, attention_keys,
        attention_values, attention_score_fn, attention_construct_fn,
        self.embed, GO_ID, EOS_ID, max_length, num_symbols)

    if is_train:
        # rnn decoder
        self.decoder_output, _, _ = dynamic_rnn_decoder(
            cell, decoder_fn_train, self.decoder_input,
            self.responses_length, scope="decoder")
        # calculate the loss of decoder
        self.decoder_loss, self.ppl_loss = sampled_sequence_loss(
            self.decoder_output, self.responses_target, self.decoder_mask,
            self.keyword_tensor, self.word_type)

        # building graph finished and get all parameters
        self.params = tf.trainable_variables()

        for item in tf.trainable_variables():
            print item.name, item.get_shape()

        # initialize the training process
        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False, dtype=tf.float32)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        # calculate the gradient of parameters
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        gradients = tf.gradients(self.decoder_loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, max_gradient_norm)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)
    else:
        # rnn decoder
        self.decoder_distribution, _, _ = dynamic_rnn_decoder(
            cell, decoder_fn_inference, scope="decoder")
        print("self.decoder_distribution.shape():",
              self.decoder_distribution.get_shape())
        # Debug hook: logs the sum of the distribution when it is evaluated.
        self.decoder_distribution = tf.Print(self.decoder_distribution, [
            "distribution.shape()",
            tf.reduce_sum(self.decoder_distribution)
        ])

        # generating the response
        # The first two symbol columns are split off before the argmax and
        # the index is shifted back by 2 (for removing UNK).
        self.generation_index = tf.argmax(
            tf.split(self.decoder_distribution, [2, num_symbols - 2], 2)[1],
            2) + 2  # for removing UNK
        self.generation = tf.nn.embedding_lookup(self.symbols,
                                                 self.generation_index)

        self.params = tf.trainable_variables()

    self.saver = tf.train.Saver(tf.global_variables(),
                                write_version=tf.train.SaverDef.V2,
                                max_to_keep=3,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
def coords_several_sequences():
    """Map ``join_charcaters_fn`` over the [start, end) span of every sequence.

    The spans come from the running total of ``sequence_lengths`` (taken from
    the enclosing scope): each end is a cumulative length and each start is
    the previous end, with 0 for the first sequence.
    """
    ends = tf.cumsum(sequence_lengths)
    starts = tf.concat([[0], ends[:-1]], axis=0)
    # One row per sequence: (start, end), as int32.
    spans = tf.cast(tf.stack([starts, ends], axis=1), dtype=tf.int32)
    return tf.map_fn(join_charcaters_fn, spans, dtype=tf.string)
def stochastic(self):
    """Sample one index per row of ``self.probability`` by inverting its CDF.

    The probabilities are accumulated along the last axis, one uniform
    threshold is drawn per leading-axis entry, and the arg-max of the boolean
    "CDF exceeds threshold" mask is returned.

    Returns:
      The tensor produced by ``tf.argmax`` over the last axis.
    """
    cdf = tf.cumsum(self.probability, axis=-1)
    # Draw the thresholds in the CDF's own dtype: the float32 default cannot
    # be compared against a float64 CDF.
    # assumes `self.probability` has a statically known leading dimension
    # (required by `len`) -- TODO confirm for graph-mode callers.
    thresholds = tf.random.uniform((len(cdf), 1), dtype=cdf.dtype)
    # NOTE(review): this relies on argmax returning the first True position;
    # TensorFlow does not document a tie-breaking guarantee.
    return tf.argmax(cdf > thresholds, axis=-1)
def init_evaluation_model(stabNet, sample_num):
    """Build the evaluation graph for ``stabNet`` and return its tensors.

    Registers the evaluation placeholders in ``stabNet.inputs``, runs the
    path finder on the frame pair (last frame of ``inputs['IU']``, ``Iu``),
    accumulates the previous paths, predicts the smoothed path and the
    warping field, and warps the current frame in three variants.

    Args:
      stabNet: Model object; this function reads ``h``, ``w``, ``c``,
        ``F_dim``, ``inputs`` and calls ``flatten_seq``, ``unflatten_seq``,
        ``seq_to_channel`` and ``get_reuse`` on it.  ``inputs['IU']`` must
        already exist -- it is read here but not created.
      sample_num: The history placeholders hold ``sample_num - 2`` steps.

    Returns:
      ``collections.OrderedDict`` mapping names to the tensors built here.
    """
    outputs = collections.OrderedDict()
    with tf.variable_scope('stabNet'):
        STN = ProjectiveTransformer([stabNet.h, stabNet.w])
        # (re-created; the dictionary made above is discarded unused)
        outputs = collections.OrderedDict()

        # Current unstable frame and the histories of the previous steps.
        stabNet.inputs['Iu'] = tf.placeholder(
            'float32', [None, 1, stabNet.h, stabNet.w, stabNet.c],
            name='input_U')
        stabNet.inputs['U_t_1_seq'] = tf.placeholder(
            'float32', [None, sample_num - 2, stabNet.h, stabNet.w, 2],
            name='input_U_t_1_seq')
        stabNet.inputs['S_t_1_seq'] = tf.placeholder(
            'float32', [None, sample_num - 2, stabNet.h, stabNet.w, 2],
            name='input_S_t_1_seq')
        stabNet.inputs['B_t_1'] = tf.placeholder(
            'float32', [None, stabNet.h, stabNet.w, 2], name='input_B_t_1')
        stabNet.inputs['B_t_1_H'] = tf.placeholder(
            'float32', [None, stabNet.h, stabNet.w, 2], name='input_B_t_1_H')

        # find path for Iuu
        # Pair of frames: the last frame of inputs['IU'] followed by Iu.
        outputs['Iuu'] = tf.concat([
            tf.expand_dims(stabNet.inputs['IU'][:, -1, :, :, :], axis=1),
            stabNet.inputs['Iu']
        ], axis=1)
        outputs['src_Iuu_seq_flat'] = stabNet.flatten_seq(
            outputs['Iuu'][:, 1:, :, :, :])  # t
        outputs['trg_Iuu_seq_flat'] = stabNet.flatten_seq(
            outputs['Iuu'][:, :-1, :, :, :])  # t_1
        outputs['Iuu_concat'] = tf.concat(
            [outputs['src_Iuu_seq_flat'], outputs['trg_Iuu_seq_flat']],
            axis=3)  # t_1, t

        sample_num_uu = 1
        outputs['U_t_seq_flat_H'] = pathFinder(outputs['Iuu_concat'],
                                               stabNet.F_dim,
                                               False,
                                               stabNet.get_reuse('pathFinder'),
                                               scope='pathFinder')
        outputs['U_t_seq_flat'] = STN.H2OF(
            tf.ones_like(outputs['src_Iuu_seq_flat']),
            outputs['U_t_seq_flat_H'])
        outputs['U_t_seq'] = stabNet.unflatten_seq(outputs['U_t_seq_flat'],
                                                   sample_num_uu)
        outputs['U_t'] = outputs['U_t_seq'][:, 0, :, :, :]

        ## 2. P_t_1
        # Running sum of the S history along axis 1; its last step is P_t_1.
        outputs['P_t_1_seq'] = tf.cumsum(stabNet.inputs['S_t_1_seq'], axis=1)
        outputs['P_t_1'] = outputs['P_t_1_seq'][:, -1, :, :, :]

        ## 3. C_t_1
        # Running sum of the U history along axis 1; its last step is C_t_1.
        outputs['C_t_1_seq'] = tf.cumsum(stabNet.inputs['U_t_1_seq'], axis=1)
        outputs['C_t_1'] = outputs['C_t_1_seq'][:, -1, :, :, :]

        outputs['B_t_1_cumsum'] = outputs['P_t_1'] - outputs['C_t_1']

        ####################
        ## PATH SMOOTHING ##
        ####################
        with tf.variable_scope('pathSmoother'):
            outputs['U_t_seq_c'] = stabNet.seq_to_channel(
                tf.concat([stabNet.inputs['U_t_1_seq'], outputs['U_t_seq']],
                          axis=1))
            outputs['S_t_1_seq_c'] = stabNet.seq_to_channel(
                stabNet.inputs['S_t_1_seq'])
            outputs['S_t_pred_H'] = pathPredictor(
                tf.concat([outputs['S_t_1_seq_c'], outputs['U_t_seq_c']],
                          axis=3),
                stabNet.F_dim,
                False,
                stabNet.get_reuse('pathPredictor'),
                scope='pathPredictor')
            outputs['S_t_pred'] = STN.H2OF(
                tf.ones_like(stabNet.inputs['Iu'][:, 0, :, :, :]),
                outputs['S_t_pred_H'])
            outputs['S_t_pred_seq'] = tf.expand_dims(outputs['S_t_pred'],
                                                     axis=1)

            outputs['IUu_seq_c'] = stabNet.seq_to_channel(
                tf.concat([
                    tf.expand_dims(stabNet.inputs['IU'][:, -1, :, :, :],
                                   axis=1),
                    stabNet.inputs['Iu']
                ], axis=1))
            # The reuse key is 'pathUpdater' but the variable scope is named
            # 'pathRefiner'.
            outputs['B_t_pred_H'] = pathUpdater(
                tf.concat([
                    tf.stop_gradient(
                        outputs['S_t_pred']), stabNet.inputs['B_t_1'],
                    outputs['U_t'], outputs['IUu_seq_c']
                ], axis=3),
                stabNet.F_dim,
                False,
                stabNet.get_reuse('pathUpdater'),
                scope='pathRefiner')
            outputs['B_t'] = STN.H2OF(
                tf.ones_like(stabNet.inputs['Iu'][:, 0, :, :, :]),
                outputs['B_t_pred_H'])
            outputs['B_t_H'] = -1 * (outputs['S_t_pred'] - outputs['U_t'])

        #############
        ## WARPING ##
        #############
        # B_t = S_t - U_t + B_t_1 = S_t - U_t + (P_t_1 - C_t_1) - C_0
        outputs['B_t_cumsum'] = -1 * (outputs['S_t_pred'] - outputs['U_t'] +
                                      outputs['B_t_1_cumsum'])

        # Drop the singleton sequence axis of the input frame.
        outputs['Iu'] = tf.reshape(stabNet.inputs['Iu'],
                                   [-1, stabNet.h, stabNet.w, stabNet.c])
        with tf.variable_scope('STN'):
            # Same frame warped with the three alternative fields.
            outputs['Is_pred'] = tf_warp(outputs['Iu'], outputs['B_t'])
            outputs['Is_pred_H'] = tf_warp(outputs['Iu'], outputs['B_t_H'])
            outputs['Is_pred_cumsum'] = tf_warp(outputs['Iu'],
                                                outputs['B_t_cumsum'])
    return outputs
def __init__(self, is_training=True, clue_level=1):
    """Build the encoder-decoder graph with a position-copy scoring head.

    The graph consists of an embedding + attention encoder over ``self.x``,
    a decoder over the right-shifted ``self.y``, a "copy" head that scores
    every source position for every target step, and -- when training --
    the label-smoothed cross-entropy loss and the Adam update.

    Args:
      is_training: If true, inputs come from ``get_batch_data()`` and the
        loss, optimizer and summaries are created; otherwise the inputs are
        placeholders.
      clue_level: Upper bound on the values of ``self.m`` that are exposed
        through the encoder mask.  During training it is replaced by a
        Poisson(1) sample.
    """
    self.graph = tf.Graph()
    with self.graph.as_default():
        if is_training:
            self.x, self.y, self.xloc, self.yloc, self.m, self.num_batch = get_batch_data(
            )  # (N, T)
        else:  # inference
            self.x = tf.placeholder(tf.int32, shape=(None, hp.x_maxlen))
            self.y = tf.placeholder(tf.int32, shape=(None, hp.y_maxlen))
            self.xloc = tf.placeholder(tf.int32, shape=(None, hp.x_maxlen))
            self.yloc = tf.placeholder(tf.int32, shape=(None, hp.y_maxlen))
            self.m = tf.placeholder(tf.int32, shape=(None, hp.x_maxlen))
        self.clue_level = clue_level

        # define decoder inputs
        # Shift y right by one step and start every row with id 2.
        self.decoder_inputs = tf.concat(
            (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)  # 2:<S>

        # Load vocabulary
        src2idx, idx2src = load_src_vocab()
        des2idx, idx2des = load_des_vocab()
        self.hidden_units = hp.hidden_units

        # Encoder
        with tf.variable_scope("encoder"):
            ## Embedding
            self.enc = embedding(self.x,
                                 vocab_size=len(src2idx),
                                 num_units=self.hidden_units,
                                 scale=True,
                                 scope="enc_embed")
            if is_training:
                # Randomise how many clue levels are revealed per step.
                self.clue_level = tf.random_poisson(shape=[1],
                                                    lam=1,
                                                    dtype=tf.int32)
            # 1.0 where 1 <= m <= clue_level, else 0.0; appended to the
            # embedding as one extra feature channel.
            self.enc_mask = tf.expand_dims(
                tf.cast(
                    tf.logical_and(tf.greater_equal(self.m, 1),
                                   tf.less_equal(self.m, self.clue_level)),
                    tf.float32), 2)
            self.enc = tf.concat([self.enc, self.enc_mask], axis=2)
            # Every later layer (decoder included) uses the widened size.
            self.hidden_units += 1

            ## Positional Encoding
            if hp.sinusoid:
                self.enc += positional_encoding(
                    self.x,
                    num_units=self.hidden_units,
                    zero_pad=False,
                    scale=False,
                    scope="enc_pe")
            else:
                self.enc += embedding(tf.tile(
                    tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                    [tf.shape(self.x)[0], 1]),
                                      vocab_size=hp.x_maxlen,
                                      num_units=self.hidden_units,
                                      zero_pad=False,
                                      scale=False,
                                      scope="enc_pe")
            tf.add_to_collection('explain_input', self.enc)

            ## Dropout
            self.enc = tf.layers.dropout(
                self.enc,
                rate=hp.dropout_rate,
                training=tf.convert_to_tensor(is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ### Multihead Attention
                    self.enc = multihead_attention(
                        queries=self.enc,
                        keys=self.enc,
                        num_units=self.hidden_units,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=is_training,
                        causality=False)

                    ### Feed Forward
                    self.enc = feedforward(self.enc,
                                           num_units=[
                                               4 * self.hidden_units,
                                               self.hidden_units
                                           ])

        # Decoder
        with tf.variable_scope("decoder"):
            ## Embedding
            self.dec = embedding(self.decoder_inputs,
                                 vocab_size=len(des2idx),
                                 num_units=self.hidden_units,
                                 scale=True,
                                 scope="dec_embed")

            ## Positional Encoding
            if hp.sinusoid:
                # NOTE(review): unlike the encoder call, this one passes
                # vocab_size -- confirm against positional_encoding's
                # signature.
                self.dec += positional_encoding(
                    self.decoder_inputs,
                    vocab_size=hp.y_maxlen,
                    num_units=self.hidden_units,
                    zero_pad=False,
                    scale=False,
                    scope="dec_pe")
            else:
                self.dec += embedding(tf.tile(
                    tf.expand_dims(
                        tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                    [tf.shape(self.decoder_inputs)[0], 1]),
                                      vocab_size=hp.y_maxlen,
                                      num_units=self.hidden_units,
                                      zero_pad=False,
                                      scale=False,
                                      scope="dec_pe")
            tf.add_to_collection('explain_input', self.dec)

            ## Dropout
            # dec_word keeps the pre-block representation; every block's
            # self-attention uses it as keys.
            self.dec_word = self.dec = tf.layers.dropout(
                self.dec,
                rate=hp.dropout_rate,
                training=tf.convert_to_tensor(is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ## Multihead Attention ( self-attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.dec_word,
                        num_units=self.hidden_units,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=is_training,
                        causality=True,
                        scope="self_attention")

                    ## Multihead Attention ( vanilla attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.enc,
                        num_units=self.hidden_units,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=is_training,
                        causality=False,
                        scope="vanilla_attention")

                    ## Feed Forward
                    with tf.variable_scope(
                            "num_blocks_fc_dec_{}".format(i)):
                        self.dec = feedforward(self.dec,
                                               num_units=[
                                                   4 * self.hidden_units,
                                                   self.hidden_units
                                               ])

        # Copy head: score every source position against every target step.
        self.loc_enc = self.enc
        self.loc_logits = attention_matrix(queries=self.loc_enc,
                                           keys=self.dec,
                                           num_units=self.hidden_units,
                                           dropout_rate=hp.dropout_rate,
                                           is_training=is_training,
                                           causality=False,
                                           scope="copy_matrix")

        xloc_vec = tf.one_hot(self.xloc,
                              depth=hp.y_maxlen,
                              dtype=tf.float32)
        yloc_vec = tf.one_hot(self.yloc,
                              depth=hp.y_maxlen,
                              dtype=tf.float32)
        # loc_label[n, t, s] is 1 where yloc[n, t] == xloc[n, s] (and both
        # are valid one-hot indices).
        loc_label = tf.matmul(yloc_vec, tf.transpose(xloc_vec, [0, 2, 1]))
        # Exclusive running sum over target steps: the matches recorded at
        # strictly earlier steps.
        self.loc_label_history = tf.cumsum(loc_label,
                                           axis=1,
                                           exclusive=True)

        # Final linear projection
        self.loc_logits = tf.transpose(self.loc_logits, [0, 2, 1])
        # Combine the attention score and the history into one logit via a
        # dense layer over the stacked pair.
        self.loc_logits = tf.stack(
            [self.loc_logits, self.loc_label_history], axis=3)
        self.loc_logits = tf.squeeze(tf.layers.dense(self.loc_logits, 1),
                                     axis=[3])

        # Source positions holding id 0 get a large negative logit.
        x_masks = tf.tile(tf.expand_dims(tf.equal(self.x, 0), 1),
                          [1, hp.y_maxlen, 1])
        paddings = tf.ones_like(self.loc_logits) * (-1e6)
        self.loc_logits = tf.where(x_masks, paddings,
                                   self.loc_logits)  # (N, T_q, T_k)

        # Vocabulary logits followed by the per-source-position copy logits.
        self.logits = tf.layers.dense(self.dec, len(des2idx))
        self.final_logits = tf.concat([self.logits, self.loc_logits],
                                      axis=2)
        tf.add_to_collection('explain_output', self.final_logits)

        self.preds = tf.to_int32(tf.argmax(self.final_logits, axis=-1))
        # 1.0 for target positions whose id is not 0.
        self.istarget = tf.to_float(tf.not_equal(self.y, 0))

        if is_training:
            label = tf.one_hot(self.y,
                               depth=len(des2idx),
                               dtype=tf.float32)
            # A special case, when copy is open, we should not need unk label
            # (index 1 of the vocabulary is cleared wherever a copy target
            # exists for that step).
            unk_pos = label[:, :, 1]
            copy_pos = tf.sign(tf.reduce_sum(loc_label, axis=2))
            fix_pos = unk_pos * copy_pos
            fix_label = tf.expand_dims(label[:, :, 1] - fix_pos, axis=2)
            label = tf.concat(
                [label[:, :, :1], fix_label, label[:, :, 2:]], axis=-1)
            self.final_label = tf.concat([label, loc_label], axis=2)

            # Loss
            # Among the admissible labels pick the one with the highest
            # logit (inadmissible entries are pushed down by -1e6) and use
            # it as the single training target.
            self.min_logit_loc = min_logit_loc = tf.argmax(
                self.final_logits + (-1e6) * (1.0 - self.final_label),
                axis=-1)
            self.min_label = tf.one_hot(min_logit_loc,
                                        depth=len(des2idx) + hp.x_maxlen,
                                        dtype=tf.float32)
            # Output classes per example: vocabulary size plus the source
            # positions whose id is not 0.
            vocab_count = len(des2idx) + hp.x_maxlen - tf.reduce_sum(
                tf.cast(tf.equal(self.x, 0), dtype=tf.int32), axis=-1)
            self.y_smoothed = label_smoothing_mask(self.min_label,
                                                   vocab_count)
            self.loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=self.final_logits, labels=self.y_smoothed)
            # Average over positions flagged by istarget only.
            self.mean_loss = tf.reduce_sum(
                self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

            # Training Scheme
            self.global_step = tf.Variable(0,
                                           name='global_step',
                                           trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                    beta1=0.9,
                                                    beta2=0.98,
                                                    epsilon=1e-8)
            self.train_op = self.optimizer.minimize(
                self.mean_loss, global_step=self.global_step)

            # Summary
            tf.summary.scalar('mean_loss', self.mean_loss)
            self.merged = tf.summary.merge_all()
def conditional_answer_layer(size, encoded_question, question_length, encoded_support, support_length,
                             correct_start, support2question, answer2support, is_eval, topk=1,
                             max_span_size=10000, bilinear=False):
    """Score span starts, then span ends conditioned on the chosen starts.

    Start scores are computed for every support from the question state.
    The start used for conditioning is the top-k prediction when ``is_eval``
    is true and the gold ``(answer2support, correct_start)`` pair otherwise.
    End scores are computed for the supports those starts belong to.

    Args:
      size: Width of the hidden projections.
      encoded_question, question_length: Inputs of ``compute_question_state``.
      encoded_support: Encoded supports, indexed as [support, token, feature].
      support_length: Lengths used to build the padding mask.
      correct_start, answer2support: Gold start positions and their supports.
      support2question: Question index of every support.
      is_eval: Boolean tensor choosing the evaluation branch of ``tf.cond``.
      topk: Number of start candidates kept per question at evaluation time.
      max_span_size: Largest allowed offset of an end from its start (eval).
      bilinear: Use bilinear scoring instead of the ReLU feed-forward scorer.

    Returns:
      The 5-tuple produced by ``tf.cond``: start scores, end scores, document
      indices, start pointers and predicted end pointers.
    """
    # One question vector per support document.
    q_per_support = tf.gather(
        compute_question_state(encoded_question, question_length),
        support2question)

    # ---- start scores ----
    if bilinear:
        start_query = tf.layers.dense(q_per_support, size, name="hidden_start")
        start_scores = tf.einsum('ik,ijk->ij', start_query, encoded_support)
    else:
        static_input = tf.concat(
            [tf.expand_dims(q_per_support, 1) * encoded_support, encoded_support], 2)
        start_q_proj = tf.layers.dense(q_per_support, size, name="hidden_start_1")
        start_tok_proj = tf.layers.dense(
            static_input, size, use_bias=False, name="hidden_start_2")
        start_hidden = start_tok_proj + tf.expand_dims(start_q_proj, 1)
        start_scores = tf.squeeze(
            tf.layers.dense(tf.nn.relu(start_hidden), 1, use_bias=False, name="start_scores"),
            [2])

    support_mask = misc.mask_for_lengths(support_length)
    start_scores = start_scores + support_mask
    max_support_length = tf.shape(start_scores)[1]

    # Index of every support within the group of supports of its question.
    _, _, supports_per_question = tf.unique_with_counts(support2question)
    first_support = tf.cumsum(supports_per_question, exclusive=True)
    doc_idx_for_support = (tf.range(tf.shape(support2question)[0])
                           - tf.gather(first_support, support2question))

    def _predicted_starts():
        return segment_top_k(start_scores, support2question, topk)[:2]

    def _gold_starts():
        return tf.expand_dims(answer2support, 1), tf.expand_dims(correct_start, 1)

    doc_idx, start_pointer = tf.cond(is_eval, _predicted_starts, _gold_starts)
    doc_idx_flat = tf.reshape(doc_idx, [-1])
    start_pointer = tf.reshape(start_pointer, [-1])

    # Encoded token at every selected start position.
    start_state = tf.gather_nd(encoded_support, tf.stack([doc_idx_flat, start_pointer], 1))
    start_state.set_shape([None, size])

    support_for_start = tf.gather(encoded_support, doc_idx_flat)
    q_for_start = tf.gather(q_per_support, doc_idx_flat)
    end_query_input = tf.concat([q_for_start, start_state], 1)

    # ---- end scores ----
    if bilinear:
        end_query = tf.layers.dense(end_query_input, size, name="hidden_end")
        end_scores = tf.einsum('ik,ijk->ij', end_query, support_for_start)
    else:
        end_input = tf.concat(
            [tf.expand_dims(start_state, 1) * support_for_start,
             tf.gather(static_input, doc_idx_flat)], 2)
        end_q_proj = tf.layers.dense(end_query_input, size, name="hidden_end_1")
        end_tok_proj = tf.layers.dense(
            end_input, size, use_bias=False, name="hidden_end_2")
        end_hidden = end_tok_proj + tf.expand_dims(end_q_proj, 1)
        end_scores = tf.squeeze(
            tf.layers.dense(tf.nn.relu(end_hidden), 1, use_bias=False, name="end_scores"),
            [2])

    end_scores = end_scores + tf.gather(support_mask, doc_idx_flat)

    def _train_outputs():
        best_end = tf.argmax(end_scores, axis=1, output_type=tf.int32)
        return start_scores, end_scores, doc_idx, start_pointer, best_end

    def _eval_outputs():
        # [num_questions * topk, support_length]
        # Masks restrict the end to the window [start, start + max_span_size).
        before_start = misc.mask_for_lengths(
            tf.cast(start_pointer, tf.int32), max_support_length, mask_right=False)
        beyond_span = misc.mask_for_lengths(
            tf.cast(start_pointer + max_span_size, tf.int32), max_support_length)
        masked_end_scores = end_scores + before_start + beyond_span
        best_end = tf.argmax(masked_end_scores, axis=1, output_type=tf.int32)
        return (start_scores, masked_end_scores,
                tf.gather(doc_idx_for_support, doc_idx_flat), start_pointer, best_end)

    return tf.cond(is_eval, _eval_outputs, _train_outputs)
def call(self,
         inputs: tf.Tensor,
         states: Optional[States] = None,
         output_states: bool = True
         ) -> Union[tf.Tensor, Tuple[tf.Tensor, States]]:
    """Applies (optionally streaming) global average pooling to `inputs`.

    Args:
      inputs: An input `tf.Tensor`.
      states: A `dict` of states; entries whose keys match this layer's state
        names replace the layer's buffer and frame count.
      output_states: A `bool`. If True, returns the output tensor together
        with the updated states; otherwise returns only the output tensor.

    Returns:
      An output `tf.Tensor` (and the states if `output_states=True`).
      With `causal=True` the frame dimension is kept, giving shape
      `[batch_size, num_frames, 1, 1, channels]` when `keepdims=True`: every
      frame holds the cumulative average up to and including that frame, as
      if frames were fed one at a time. With `causal=False` the result has
      shape `[batch_size, 1, 1, 1, channels]` when `keepdims=True` and equals
      `tf.keras.layers.GlobalAveragePooling3D` (plus the optional buffer
      carried in `states`).

    Raises:
      ValueError: If using 'channels_first' data format.
    """
    new_states = {} if states is None else dict(states)

    if tf.keras.backend.image_data_format() == 'channels_first':
        raise ValueError('"channels_first" mode is unsupported.')

    # Running sum carried across calls. Shape: [batch_size, 1, 1, 1, channels]
    running_sum = new_states.get(self._state_name, None)
    if running_sum is None:
        running_sum = tf.zeros_like(inputs[:, :1, :1, :1], dtype=inputs.dtype)
        new_states[self._state_name] = running_sum

    # Count the frames seen across calls so that the streaming average is
    # taken over all of them, not only over the current chunk.
    num_frames = tf.shape(inputs)[1]
    frames_before = new_states.get(self._frame_count_name, 0)
    frames_total = frames_before + num_frames
    new_states[self._frame_count_name] = frames_total

    if not self._causal:
        # Sum over frames, average over the spatial dimensions, then add the
        # carried sum. Without a carried buffer this is plain global average
        # pooling. Shape: [batch_size, 1, 1, 1, channels]
        spatial_size = tf.shape(inputs)[2] * tf.shape(inputs)[3]
        pooled = tf.reduce_sum(inputs, axis=(1, 2, 3), keepdims=True)
        pooled = pooled / tf.cast(spatial_size, pooled.dtype)
        pooled = pooled + running_sum
        new_states[self._state_name] = pooled
        pooled = pooled / tf.cast(frames_total, pooled.dtype)
    else:
        # Average the spatial dimensions first (cheaper), then accumulate
        # along time on top of the carried sum.
        pooled = tf.reduce_mean(inputs, axis=[2, 3], keepdims=True)
        pooled = tf.cumsum(pooled, axis=1) + running_sum
        # The last frame's sum is the buffer for the next call.
        # Shape: [batch_size, 1, 1, 1, channels]
        new_states[self._state_name] = pooled[:, -1:]
        # The divisor grows by one per frame: cumulative averages rather than
        # one global average.
        divisors = tf.reshape(tf.range(num_frames) + frames_before + 1,
                              [1, num_frames, 1, 1, 1])
        # Shape: [batch_size, num_frames, 1, 1, channels]
        pooled = pooled / tf.cast(divisors, pooled.dtype)

    if not self._keepdims:
        pooled = tf.squeeze(pooled, axis=(1, 2, 3))

    if output_states:
        return pooled, new_states
    return pooled
def make_ids_type_ids(input_ids, sep_id=102):
    """Derive alternating 0/1 segment ids from the separator positions.

    Each position receives the number of separators strictly before it along
    axis 1, modulo 2, so a separator still belongs to the segment it closes
    and the id flips for the token after it.

    Args:
      input_ids: Integer token ids with the sequence on axis 1.
      sep_id: Id of the separator token.

    Returns:
      An int32 tensor of the same shape as ``input_ids`` holding 0 or 1.
    """
    ids = tf.convert_to_tensor(input_ids)
    # tf.equal is element-wise under every tensor-equality mode, whereas `==`
    # on tensors can be an object-identity test; comparing in the ids' own
    # dtype also lets int64 ids work.
    is_sep = tf.cast(tf.equal(ids, tf.cast(sep_id, ids.dtype)), tf.int32)
    return tf.cumsum(is_sep, axis=1, exclusive=True) % 2
def build_core_signals(self):
    """Create the shared placeholders and derived tensors in ``self._signals``.

    Most per-step signals are laid out as (cfg.T, batch, ...).  After the
    shared signals are registered, every object in ``self.rl_objects`` is
    asked to build its own signals on top of them.
    """
    self._signals['mask'] = tf.placeholder(
        tf.float32, shape=(cfg.T, None, 1), name="_mask")
    self._signals['done'] = tf.placeholder(
        tf.float32, shape=(cfg.T, None, 1), name="_done")

    # One more observation than steps, so that every step has a successor.
    self._signals['all_obs'] = tf.placeholder(
        tf.float32,
        shape=(cfg.T + 1 if cfg.T is not None else None, None) + self.obs_shape,
        name="_all_obs")

    # observations that we learn about
    self._signals['obs'] = tf.identity(
        self._signals['all_obs'][:-1, ...], name="_obs")

    # observations that we use as targets
    self._signals['target_obs'] = tf.identity(
        self._signals['all_obs'][1:, ...], name="_target_obs")

    self._signals['actions'] = tf.placeholder(
        tf.float32, shape=(cfg.T, None) + self.action_shape, name="_actions")
    self._signals['gamma'] = tf.constant(self.gamma)
    self._signals['batch_size'] = tf.shape(self._signals['obs'])[1]
    self._signals['batch_size_float'] = tf.cast(
        self._signals['batch_size'], tf.float32)

    self._signals['rewards'] = tf.placeholder(
        tf.float32, shape=(cfg.T, None, 1), name="_rewards")
    # Undiscounted reward-to-go: reverse cumulative sum over the time axis.
    self._signals['returns'] = tf.cumsum(
        self._signals['rewards'], axis=0, reverse=True, name="_returns")
    self._signals['reward_per_ep'] = tf.reduce_mean(
        tf.reduce_sum(self._signals['rewards'], axis=0),
        name="_reward_per_ep")
    self.add_recorded_values(reward_per_ep=self._signals['reward_per_ep'])

    self._signals['mode'] = tf.placeholder(tf.string, ())
    self._signals['weights'] = tf.placeholder(
        tf.float32, shape=(cfg.T, None, 1), name="_weights")

    # Discounted returns via a (T, T) discount matrix contracted with the
    # rewards over the time axis.
    T = tf.shape(self._signals['mask'])[0]
    discount_matrix = tf_discount_matrix(self.gamma, T)
    discounted_returns = tf.tensordot(
        discount_matrix, self._signals['rewards'], axes=1,
        name="_discounted_returns")
    self._signals['discounted_returns'] = discounted_returns

    mean_returns = masked_mean(
        discounted_returns, self._signals['mask'], axis=1, keepdims=True)
    # Broadcast the per-step mean back to the shape of the returns.
    mean_returns += tf.zeros_like(discounted_returns)
    self._signals['average_discounted_returns'] = mean_returns

    # off-policy
    # Named "_mu_utils" in line with the "_<key>" convention of every other
    # signal; it used to share the name "_mu_log_probs" with the placeholder
    # below, so TensorFlow silently renamed the second one.
    self._signals['mu_utils'] = tf.placeholder(
        tf.float32, shape=(cfg.T, None,) + self.mu.param_shape,
        name="_mu_utils")
    self._signals['mu_exploration'] = tf.placeholder(
        tf.float32, shape=(None,), name="_mu_exploration")
    self._signals['mu_log_probs'] = tf.placeholder(
        tf.float32, shape=(cfg.T, None, 1), name="_mu_log_probs")

    for obj in self.rl_objects:
        obj.build_core_signals(self)
def make_strs_type_ids(input_strs):
    """Derive alternating 0/1 segment ids from the '[SEP]' token positions.

    Each position receives the number of '[SEP]' tokens strictly before it
    along axis 1, modulo 2, so a '[SEP]' still belongs to the segment it
    closes and the id flips for the token after it.

    Args:
      input_strs: String tokens with the sequence on axis 1.

    Returns:
      An int32 tensor of the same shape as ``input_strs`` holding 0 or 1.
    """
    tokens = tf.convert_to_tensor(input_strs)
    # tf.equal is element-wise under every tensor-equality mode, whereas `==`
    # on tensors can be an object-identity test.
    is_sep = tf.cast(tf.equal(tokens, tf.constant('[SEP]')), tf.int32)
    return tf.cumsum(is_sep, axis=1, exclusive=True) % 2
def __init__(self, lr, batch_size, dimension, util_train, util_test, campaign, reg_lambda, nn=False):
    """Build the Cox model graph (linear or one-hidden-layer) and its session.

    Args:
      lr: Learning rate of the gradient-descent optimizer.
      batch_size: Stored on the instance and used in the output directory
        name.
      dimension: Number of input features (rows of the first weight matrix).
      util_train: Training data helper; must provide ``get_data_amt()``.
      util_test: Test data helper; must provide ``get_data_amt()``.
      campaign: Name used in the output directory path.
      reg_lambda: Stored on the instance and used in the output directory
        name.
      nn: If true, use a ReLU hidden layer of 20 units instead of the linear
        estimator.
    """
    # hyperparameters
    self.lr = lr
    self.batch_size = batch_size
    self.util_train = util_train
    self.util_test = util_test
    self.reg_lambda = reg_lambda
    self.train_data_amt = util_train.get_data_amt()
    self.test_data_amt = util_test.get_data_amt()

    # output dir
    model_name = "{}_{}_{}".format(self.lr, self.reg_lambda, self.batch_size)
    if nn:
        self.output_dir = "output/coxnn/{}/{}/".format(campaign, model_name)
    else:
        self.output_dir = "output/cox/{}/{}/".format(campaign, model_name)
    if not os.path.exists(self.output_dir):
        os.makedirs(self.output_dir)

    # reset graph
    tf.reset_default_graph()

    # placeholders, sorted value
    self.X = tf.sparse_placeholder(tf.float64)
    self.z = tf.placeholder(tf.float64)
    self.b = tf.placeholder(tf.float64)
    self.y = tf.placeholder(tf.float64)

    # computation graph, linear estimator or neural network
    if nn:
        hidden_size = 20
        self.w1 = tf.Variable(initial_value=tf.truncated_normal(
            shape=[dimension, hidden_size], dtype=tf.float64), name='w1')
        self.w2 = tf.Variable(initial_value=tf.truncated_normal(
            shape=[hidden_size, 1], dtype=tf.float64), name='w2')
        self.hidden_values = tf.nn.relu(
            tf.sparse_tensor_dense_matmul(self.X, self.w1))
        self.index = tf.matmul(self.hidden_values, self.w2)
        # The first row of each weight matrix is left out of the penalty.
        self.reg = tf.nn.l2_loss(self.w1[1:, ]) + tf.nn.l2_loss(
            self.w2[1:, ])
    else:
        self.w = tf.Variable(initial_value=tf.truncated_normal(
            shape=[dimension, 1], dtype=tf.float64), name='w')
        self.index = tf.sparse_tensor_dense_matmul(self.X, self.w)
        self.reg = tf.reduce_sum(tf.abs(self.w[1:, ]))

    self.multiple_times = tf.exp(self.index)
    # Reverse cumulative sum of exp(index) over the rows.
    # assumes the rows are pre-sorted so that this is each sample's risk-set
    # total -- TODO confirm against the data pipeline.
    risk_total = tf.cumsum(self.multiple_times, reverse=True)
    # Only a lower bound guards the log.  The earlier clip to [1e-8, 1.0]
    # also capped the total at 1, which zeroed this term and its gradient
    # whenever the total exceeded 1.
    log_risk_total = tf.log(tf.maximum(risk_total, 1e-8))
    # NOTE(review): the penalty is added with weight 1; `reg_lambda` is not
    # applied here -- confirm whether that is intended.
    self.loss = -tf.reduce_sum(
        (self.index - log_risk_total) * self.y) + self.reg
    self.optimizer = tf.train.GradientDescentOptimizer(self.lr)
    self.train_step = self.optimizer.minimize(self.loss)

    # for test h0
    self.base = self.z * self.y + self.b * (1 - self.y)
    self.candidate = (1 / tf.cumsum(tf.exp(self.index), reverse=True)) * self.y

    # session initialization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)
    tf.global_variables_initializer().run(session=self.sess)
maxval=10, dtype=tf.float64, seed=[4321, 0])) y = tf.math.sin(x * 3) xnew = tf.cast(tf.linspace(-1.0, 11.0, 1001), tf.float64) spline = geoml.interpolation.CubicSpline(x, y) ynew = spline.interpolate(xnew) ynew_d1 = spline.interpolate_d1(xnew) plt.plot(xnew.numpy(), ynew.numpy(), "-r") plt.plot(x.numpy(), y.numpy(), "ok") plt.plot(xnew.numpy(), ynew_d1.numpy(), "-g") plt.plot(x.numpy(), spline.d.numpy(), "og") y_mono = tf.cumsum(tf.math.abs(y)) spline_mono = geoml.interpolation.MonotonicCubicSpline(x, y_mono) ynew_mono = spline_mono.interpolate(xnew) ynew_mono_d1 = spline_mono.interpolate_d1(xnew) plt.plot(xnew.numpy(), ynew_mono.numpy(), "-r") plt.plot(x.numpy(), y_mono.numpy(), "ok") plt.plot(xnew.numpy(), ynew_mono_d1.numpy(), "-g") plt.plot(x.numpy(), spline_mono.d.numpy(), "og") plt.hlines(0, -1, 11, linestyles="dashed") setup = """ import numpy as np import tensorflow as tf import geoml