def testNegativeBinomialSample(self):
  """Checks NegativeBinomial sampling: shape, non-negativity, and moments.

  Draws 100k samples per distribution and verifies the sample mean and
  variance against scipy's nbinom within 2% relative tolerance.
  """
  with self.cached_session() as sess:
    probs = [.3, .9]
    total_count = [4., 11.]
    n = int(100e3)
    negbinom = negative_binomial.NegativeBinomial(
        total_count=total_count, probs=probs)

    samples = negbinom.sample(n, seed=12345)
    self.assertEqual([n, 2], samples.get_shape())

    sample_mean = math_ops.reduce_mean(samples, axis=0)
    sample_var = math_ops.reduce_mean(
        (samples - sample_mean[array_ops.newaxis, ...])**2., axis=0)
    sample_min = math_ops.reduce_min(samples)
    [sample_mean_, sample_var_, sample_min_] = sess.run([
        sample_mean, sample_var, sample_min])

    # BUG FIX: `np.bool` was a deprecated alias removed in NumPy 1.24;
    # `np.bool_` is the supported boolean scalar type.
    self.assertAllEqual(np.ones(sample_min_.shape, dtype=np.bool_),
                        sample_min_ >= 0.0)

    # scipy parameterizes nbinom by (n, p) with p the success probability,
    # hence the 1 - probs[i] conversion.
    for i in range(2):
      self.assertAllClose(sample_mean_[i],
                          stats.nbinom.mean(total_count[i], 1 - probs[i]),
                          atol=0., rtol=.02)
      self.assertAllClose(sample_var_[i],
                          stats.nbinom.var(total_count[i], 1 - probs[i]),
                          atol=0., rtol=.02)
def grow_tree_from_stats_summaries(stats_summary_list):
  """Updates ensemble based on the best gains from stats summaries."""
  # NOTE(review): `node_ids`, `tree_hparams`, `max_splits`, `tree_ensemble`
  # and `num_features` come from the enclosing scope — confirm at call site.
  # Compute the best split (node ids, gains, thresholds, child contributions)
  # per feature, over the node-id range observed in this batch.
  (node_ids_per_feature, gains_list, thresholds_list,
   left_node_contribs_list, right_node_contribs_list) = (
       boosted_trees_ops.calculate_best_gains_per_feature(
           node_id_range=array_ops.stack([
               math_ops.reduce_min(node_ids),
               math_ops.reduce_max(node_ids)
           ]),
           stats_summary_list=stats_summary_list,
           l1=tree_hparams.l1,
           l2=tree_hparams.l2,
           tree_complexity=tree_hparams.tree_complexity,
           max_splits=max_splits))
  # Apply the chosen splits to the ensemble; pruning is disabled here.
  grow_op = boosted_trees_ops.update_ensemble(
      # Confirm if local_tree_ensemble or tree_ensemble should be used.
      tree_ensemble.resource_handle,
      feature_ids=math_ops.range(0, num_features, dtype=dtypes.int32),
      node_ids=node_ids_per_feature,
      gains=gains_list,
      thresholds=thresholds_list,
      left_node_contribs=left_node_contribs_list,
      right_node_contribs=right_node_contribs_list,
      learning_rate=tree_hparams.learning_rate,
      max_depth=tree_hparams.max_depth,
      pruning_mode=boosted_trees_ops.PruningMode.NO_PRUNING)
  return grow_op
def testGradient4(self):
  """Checks the gradient of a full (all-axes) reduce_min numerically."""
  shape = [2, 3, 4, 2]
  x = np.arange(1.0, 49.0).reshape(shape).astype(np.float64)
  with self.test_session():
    t = ops.convert_to_tensor(x)
    reduced = math_ops.reduce_min(t)
    # delta=1 is safe here: every input value is distinct, so the argmin
    # (and hence the gradient) is insensitive to the perturbation size.
    jacob_t, jacob_n = gradient_checker.compute_gradient(
        t, shape, reduced, [1], x_init_value=x, delta=1)
    self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
def my_rnn(alphabetEnc, cell, inputs, initial_state=None, dtype=None,
           sequence_length=None, scope=None):
  """Unrolls `cell` over `inputs`, feeding `alphabetEnc[t]` at each step.

  Args:
    alphabetEnc: Sequence indexed by time step; its element is passed to the
      cell alongside the input at that step.
    cell: An instance of `RNNCell`.
    inputs: A non-empty list of input tensors, one per time step.
    initial_state: Optional initial state tensor.
    dtype: State dtype; required if `initial_state` is not provided.
    sequence_length: Optional int tensor of per-example sequence lengths.
    scope: Optional variable scope name (defaults to "RNN").

  Returns:
    A pair `(outputs, state)`: the list of per-step outputs and the final
    state.

  Raises:
    TypeError: If `cell` is not an RNNCell or `inputs` is not a list.
    ValueError: If `inputs` is empty, or no dtype is given without an
      initial state.
  """
  if not isinstance(cell, rnn_cell.RNNCell):
    raise TypeError("cell must be an instance of RNNCell")
  if not isinstance(inputs, list):
    raise TypeError("inputs must be a list")
  if not inputs:
    raise ValueError("inputs must not be empty")

  outputs = []
  with vs.variable_scope(scope or "RNN"):
    fixed_batch_size = inputs[0].get_shape().with_rank_at_least(1)[0]
    if fixed_batch_size.value:
      batch_size = fixed_batch_size.value
    else:
      batch_size = array_ops.shape(inputs[0])[0]
    if initial_state is not None:
      state = initial_state
    else:
      if not dtype:
        raise ValueError("If no initial_state is provided, dtype must be.")
      state = cell.zero_state(batch_size, dtype)

    # BUG FIX: after to_int32, `sequence_length` is a Tensor, and
    # `if sequence_length:` raises TypeError (a Tensor's truth value is
    # ambiguous). Compare against None explicitly instead.
    if sequence_length is not None:
      sequence_length = math_ops.to_int32(sequence_length)
      # Prepare variables
      zero_output = array_ops.zeros(
          array_ops.pack([batch_size, cell.output_size]), inputs[0].dtype)
      zero_output.set_shape(
          tensor_shape.TensorShape([fixed_batch_size.value,
                                    cell.output_size]))
      min_sequence_length = math_ops.reduce_min(sequence_length)
      max_sequence_length = math_ops.reduce_max(sequence_length)

    for time, input_ in enumerate(inputs):
      if time > 0:
        vs.get_variable_scope().reuse_variables()
      # pylint: disable=cell-var-from-loop
      call_cell = lambda: cell([input_, alphabetEnc[time]], state)
      # pylint: enable=cell-var-from-loop
      if sequence_length is not None:
        # Skip computation for finished sequences; emit zero output instead.
        (output, state) = _rnn_step(
            time, sequence_length, min_sequence_length, max_sequence_length,
            zero_output, state, call_cell)
      else:
        (output, state) = call_cell()
      outputs.append(output)
    return (outputs, state)
def initialize_graph(self, features, update_statistics=True):
  """Create any ops needed to provide input statistics.

  Should be called before statistics are requested.

  Args:
    features: A dictionary, the output of a `TimeSeriesInputFn` (with keys
      TrainEvalFeatures.TIMES and TrainEvalFeatures.VALUES).
    update_statistics: Whether `features` should be used to update adaptive
      statistics. Typically True for training and false for evaluation.
  Returns:
    An InputStatistics object composed of Variables, which will be updated
    based on mini-batches of data if requested.
  """
  if (TrainEvalFeatures.TIMES in features
      and TrainEvalFeatures.VALUES in features):
    times = features[TrainEvalFeatures.TIMES]
    values = features[TrainEvalFeatures.VALUES]
  else:
    # times and values may not be available, for example during prediction. We
    # still need to retrieve our variables so that they can be read from, even
    # if we're not going to update them.
    times = None
    values = None
  # Create/retrieve variables representing input statistics, initialized
  # without data to avoid deadlocking if variables are initialized before
  # queue runners are started.
  with variable_scope.variable_scope("input_statistics", use_resource=True):
    statistics = self._create_variable_statistics_object()
  with variable_scope.variable_scope(
      "input_statistics_auxiliary", use_resource=True):
    # Secondary statistics, necessary for the incremental computation of the
    # primary statistics (e.g. counts and sums for computing a mean
    # incrementally).
    auxiliary_variables = self._AdaptiveInputAuxiliaryStatistics(
        num_features=self._num_features, dtype=self._dtype)
  if update_statistics and times is not None and values is not None:
    # If we have times and values from mini-batch input, create update ops to
    # take the new data into account.
    assign_op = self._update_statistics_from_mini_batch(
        statistics, auxiliary_variables, times, values)
    with ops.control_dependencies([assign_op]):
      # Re-wrap each statistic in an identity so that reads are ordered
      # after the update op via the control dependency above.
      stat_variables = nest.pack_sequence_as(statistics, [
          array_ops.identity(tensor) for tensor in nest.flatten(statistics)
      ])
      # Since start time updates have a race condition, ensure that the
      # reported start time is at least as low as the lowest time in this
      # mini-batch. The start time should converge on the correct value
      # eventually even with the race condition, but for example state space
      # models have an assertion which could fail without this
      # post-processing.
      return stat_variables._replace(start_time=gen_math_ops.minimum(
          stat_variables.start_time, math_ops.reduce_min(times)))
  else:
    return statistics
def testLargeFeed(self):
  """Feeds a large (10000 x 3000) constant array through an RPC session."""
  server = self._cached_server
  with session.Session(server.target, config=self._useRPCConfig()) as sess:
    # A ~114MB feed, every element 0.5 — large enough to exercise the
    # RPC transfer path.
    feed_val = np.full([10000, 3000], 0.5, dtype=np.float32)
    p = array_ops.placeholder(dtypes.float32, shape=[10000, 3000])
    min_t = math_ops.reduce_min(p)
    max_t = math_ops.reduce_max(p)
    min_val, max_val = sess.run([min_t, max_t], feed_dict={p: feed_val})
    self.assertEqual(0.5, min_val)
    self.assertEqual(0.5, max_val)
def _ones_diag(self):
  """Returns the diagonal of this operator as all ones."""
  if not self.shape.is_fully_defined():
    # Dynamic shapes: the diagonal length is the smaller of the two
    # trailing (matrix) dimensions, computed at graph run time.
    diag_shape = array_ops.concat(
        [self.batch_shape_tensor(),
         [math_ops.reduce_min(self.shape_tensor()[-2:])]],
        axis=0)
  else:
    # Static shapes: compute the diagonal length in Python.
    diag_len = min(self.domain_dimension.value, self.range_dimension.value)
    diag_shape = self.batch_shape.concatenate([diag_len])
  return array_ops.ones(shape=diag_shape, dtype=self.dtype)
def refresh_shortlist():
  """Update the shortlist with the highest scores in id_to_score."""
  # NOTE(review): `self` is captured from the enclosing scope here.
  new_scores, new_ids = nn_ops.top_k(self.id_to_score, self.shortlist_size)
  smallest_new_score = math_ops.reduce_min(new_scores)
  # Scores equal to float32.min are sentinels (empty slots); only entries
  # strictly above that count toward the valid length.
  new_length = math_ops.reduce_sum(
      math_ops.to_int32(math_ops.greater(new_scores, dtypes.float32.min)))
  # sl_ids layout: [valid_length, id_0, id_1, ...].
  u1 = self.sl_ids.assign(
      math_ops.to_int64(array_ops.concat([[new_length], new_ids], 0)))
  # sl_scores layout: [smallest_kept_score, score_0, score_1, ...].
  u2 = self.sl_scores.assign(
      array_ops.concat([[smallest_new_score], new_scores], 0))
  self.last_ops = [u1, u2]
  return control_flow_ops.group(u1, u2)
def element_to_bucket_id(*args):
  """Return int64 id of the length bucket for this element."""
  seq_length = element_length_func(*args)

  # Bucket i covers [lower[i], upper[i]); the ends are padded with int32
  # extremes so every length falls into exactly one bucket.
  bounds = list(bucket_boundaries)
  lower = [np.iinfo(np.int32).min] + bounds
  upper = bounds + [np.iinfo(np.int32).max]
  in_bucket = math_ops.logical_and(
      math_ops.less_equal(lower, seq_length),
      math_ops.less(seq_length, upper))
  # Exactly one condition holds; reduce_min extracts its index.
  return math_ops.reduce_min(array_ops.where(in_bucket))
def func_body(iteration, gt_cluster_score):
  """Per each cluster, compute the average travel distance."""
  # NOTE(review): `labels`, `unique_class_ids` and `pairwise_distances`
  # are captured from the enclosing scope.
  # Select the examples belonging to the current cluster.
  mask = math_ops.equal(labels, unique_class_ids[iteration])
  this_cluster_ids = array_ops.where(mask)
  # Restrict the pairwise distance matrix to this cluster's rows and
  # columns (gather rows, transpose, gather again, transpose back).
  pairwise_distances_subset = array_ops.transpose(
      array_ops.gather(
          array_ops.transpose(
              array_ops.gather(pairwise_distances, this_cluster_ids)),
          this_cluster_ids))
  # Negated smallest column sum: the best total within-cluster distance
  # achievable by a single representative point.
  this_cluster_score = -1.0 * math_ops.reduce_min(
      math_ops.reduce_sum(
          pairwise_distances_subset, axis=0))
  return iteration + 1, gt_cluster_score + this_cluster_score
def compute_facility_energy(pairwise_distances, centroid_ids):
  """Compute the average travel distance to the assigned centroid.

  Args:
    pairwise_distances: 2-D Tensor of pairwise distances.
    centroid_ids: 1-D Tensor of indices.

  Returns:
    facility_energy: dtypes.float32 scalar.
  """
  # Rows of the distance matrix corresponding to the chosen centroids.
  centroid_rows = array_ops.gather(pairwise_distances, centroid_ids)
  # For each example, the distance to its closest centroid.
  nearest_distances = math_ops.reduce_min(centroid_rows, axis=0)
  # Negated total, so that larger is better.
  return -1.0 * math_ops.reduce_sum(nearest_distances)
def _compare(self, x, reduction_axes, keep_dims, use_gpu=False):
  """Checks tf reduce_min against a NumPy amin reference."""
  # Build the NumPy reference, reducing one axis at a time from the last
  # requested axis backwards so earlier axis indices stay valid.
  expected = x
  if reduction_axes is None:
    expected = np.amin(expected, keepdims=keep_dims)
  else:
    for axis in reversed(reduction_axes):
      expected = np.amin(expected, axis=axis, keepdims=keep_dims)
  with self.test_session(use_gpu=use_gpu):
    if reduction_axes is not None:
      reduction_axes = np.array(reduction_axes).astype(np.int32)
    tf_ans = math_ops.reduce_min(x, reduction_axes, keep_dims)
    out = tf_ans.eval()
    self.assertAllClose(expected, out)
    self.assertShapeEqual(expected, tf_ans)
def grow_tree(self, stats_summaries_list, feature_ids_list,
              last_layer_nodes_range):
  """Accumulates stats over batches and grows a tree layer when ready.

  Stats summaries are pushed into per-feature-group conditional
  accumulators; once every accumulator has received at least
  `self._n_batches_per_layer` batches, the accumulated summaries are taken
  out and a layer is grown. Only the chief grows the tree; other workers
  return a no-op after contributing their stats.

  Args:
    stats_summaries_list: Per feature group, a list of stats-summary
      tensors, stacked along axis 0 before accumulation.
    feature_ids_list: Per feature group, the collection of feature ids
      (its length determines the accumulator shape).
    last_layer_nodes_range: Forwarded to
      `self._grow_tree_from_stats_summaries`.

  Returns:
    An op that grows the tree once enough batches are accumulated (chief),
    or a no-op (non-chief workers).
  """
  # For not in memory situation, we need to accumulate enough of batches first
  # before proceeding with building a tree layer.
  max_splits = _get_max_splits(self._tree_hparams)

  # Prepare accumulators.
  accumulators = []
  dependencies = []
  for i, feature_ids in enumerate(feature_ids_list):
    stats_summaries = stats_summaries_list[i]
    accumulator = data_flow_ops.ConditionalAccumulator(
        dtype=dtypes.float32,
        # The stats consist of grads and hessians (the last dimension).
        shape=[len(feature_ids), max_splits, self._bucket_size_list[i], 2],
        shared_name='numeric_stats_summary_accumulator_' + str(i))
    accumulators.append(accumulator)

    apply_grad = accumulator.apply_grad(
        array_ops.stack(stats_summaries, axis=0), self._stamp_token)
    dependencies.append(apply_grad)

  # Grow the tree if enough batches is accumulated.
  with ops.control_dependencies(dependencies):
    if not self._is_chief:
      return control_flow_ops.no_op()

    # The layer is ready only when ALL accumulators have enough batches.
    min_accumulated = math_ops.reduce_min(
        array_ops.stack([acc.num_accumulated() for acc in accumulators]))

    def grow_tree_from_accumulated_summaries_fn():
      """Updates tree with the best layer from accumulated summaries."""
      # Take out the accumulated summaries from the accumulator and grow.
      stats_summaries_list = []
      stats_summaries_list = [
          array_ops.unstack(accumulator.take_grad(1), axis=0)
          for accumulator in accumulators
      ]
      grow_op = self._grow_tree_from_stats_summaries(
          stats_summaries_list, feature_ids_list, last_layer_nodes_range)
      return grow_op

    grow_model = control_flow_ops.cond(
        math_ops.greater_equal(min_accumulated, self._n_batches_per_layer),
        grow_tree_from_accumulated_summaries_fn,
        control_flow_ops.no_op,
        name='wait_until_n_batches_accumulated')
    return grow_model
def _assert_non_singular(self):
  """Private default implementation of _assert_non_singular."""
  logging.warn(
      "Using (possibly slow) default implementation of assert_non_singular."
      " Requires conversion to a dense matrix and O(N^3) operations.")
  if self._can_use_cholesky():
    return self.assert_positive_definite()
  # Fall back to a condition-number check via a full SVD.
  singular_values = linalg_ops.svd(self.to_dense(), compute_uv=False)
  # TODO(langmore) Add .eig and .cond as methods.
  largest = math_ops.reduce_max(singular_values, axis=-1)
  smallest = math_ops.reduce_min(singular_values, axis=-1)
  condition_number = largest / smallest
  return check_ops.assert_less(
      condition_number,
      self._max_condition_number_to_be_non_singular(),
      message="Singular matrix up to precision epsilon.")
def grow_not_in_mem():
  """Accumulates the data and grows a layer when ready."""
  # NOTE(review): `feature_ids_list`, `stats_summaries_list`, `max_splits`,
  # `bucket_size_list`, `stamp_token`, `config`, `n_batches_per_layer` and
  # `grow_tree_from_stats_summaries` are captured from the enclosing scope.
  accumulators = []
  dependencies = []
  for i, feature_ids in enumerate(feature_ids_list):
    stats_summaries = stats_summaries_list[i]
    accumulator = data_flow_ops.ConditionalAccumulator(
        dtype=dtypes.float32,
        # The stats consist of grads and hessians (the last dimension).
        shape=[len(feature_ids), max_splits, bucket_size_list[i], 2],
        shared_name='numeric_stats_summary_accumulator_' + str(i))
    accumulators.append(accumulator)

    apply_grad = accumulator.apply_grad(
        array_ops.stack(stats_summaries, axis=0), stamp_token)
    dependencies.append(apply_grad)

  def grow_tree_from_accumulated_summaries_fn():
    """Updates tree with the best layer from accumulated summaries."""
    # Take out the accumulated summaries from the accumulator and grow.
    # (The empty-list assignment below is immediately overwritten.)
    stats_summaries_list = []
    stats_summaries_list = [
        array_ops.unstack(accumulator.take_grad(1), axis=0)
        for accumulator in accumulators
    ]
    grow_op = grow_tree_from_stats_summaries(stats_summaries_list,
                                             feature_ids_list)
    return grow_op

  with ops.control_dependencies(dependencies):
    # Only the chief grows the tree; other workers only contribute stats.
    if config.is_chief:
      # The layer is ready only when ALL accumulators have enough batches.
      min_accumulated = math_ops.reduce_min(
          array_ops.stack(
              [acc.num_accumulated() for acc in accumulators]))

      grow_model = control_flow_ops.cond(
          math_ops.greater_equal(min_accumulated, n_batches_per_layer),
          grow_tree_from_accumulated_summaries_fn,
          control_flow_ops.no_op,
          name='wait_until_n_batches_accumulated')
      return grow_model
    else:
      return control_flow_ops.no_op()
def masked_maximum(data, mask, dim=1):
  """Computes the axis wise maximum over chosen elements.

  Args:
    data: 2-D float `Tensor` of size [n, m].
    mask: 2-D Boolean `Tensor` of size [n, m].
    dim: The dimension over which to compute the maximum.

  Returns:
    masked_maximums: N-D `Tensor`.
      The maximized dimension is of size 1 after the operation.
  """
  # Shift values so masked-out entries (multiplied by 0) cannot exceed any
  # kept entry, then shift the result back.
  axis_minimums = math_ops.reduce_min(data, dim, keepdims=True)
  shifted = math_ops.multiply(data - axis_minimums, mask)
  masked_maximums = (
      math_ops.reduce_max(shifted, dim, keepdims=True) + axis_minimums)
  return masked_maximums
def masked_minimum(data, mask, dim=1):
  """Computes the axis wise minimum over chosen elements.

  Args:
    data: 2-D float `Tensor` of size [n, m].
    mask: 2-D Boolean `Tensor` of size [n, m].
    dim: The dimension over which to compute the minimum.

  Returns:
    masked_minimums: N-D `Tensor`.
      The minimized dimension is of size 1 after the operation.
  """
  # Shift values so masked-out entries (multiplied by 0) cannot fall below
  # any kept entry, then shift the result back.
  axis_maximums = math_ops.reduce_max(data, dim, keepdims=True)
  shifted = math_ops.multiply(data - axis_maximums, mask)
  masked_minimums = (
      math_ops.reduce_min(shifted, dim, keepdims=True) + axis_maximums)
  return masked_minimums
def _assert_non_singular(self):
  """Private default implementation of _assert_non_singular."""
  logging.warn(
      "Using (possibly slow) default implementation of assert_non_singular."
      " Requires conversion to a dense matrix and O(N^3) operations.")
  if self._can_use_cholesky():
    return self.assert_positive_definite()
  # No Cholesky available: bound the condition number via a full SVD of
  # the cached dense representation.
  singular_values = linalg_ops.svd(
      self._get_cached_dense_matrix(), compute_uv=False)
  # TODO(langmore) Add .eig and .cond as methods.
  largest = math_ops.reduce_max(singular_values, axis=-1)
  smallest = math_ops.reduce_min(singular_values, axis=-1)
  condition_number = largest / smallest
  return check_ops.assert_less(
      condition_number,
      self._max_condition_number_to_be_non_singular(),
      message="Singular matrix up to precision epsilon.")
def grow_not_in_mem():
  """Accumulates the data and grows a layer when ready.

  Pushes this batch's stats summaries into shared conditional accumulators,
  then (on the chief only) grows a tree layer once every accumulator has
  received at least `n_batches_per_layer` batches. Non-chief workers return
  a no-op after contributing their stats.

  Returns:
    An op: either the conditional grow op (chief) or a no-op (non-chief).
  """
  accumulators = []
  dependencies = []
  for i, feature_ids in enumerate(feature_ids_list):
    stats_summaries = stats_summaries_list[i]
    accumulator = data_flow_ops.ConditionalAccumulator(
        dtype=dtypes.float32,
        # The stats consist of grads and hessians (the last dimension).
        shape=[len(feature_ids), max_splits, bucket_size_list[i], 2],
        shared_name='numeric_stats_summary_accumulator_' + str(i))
    accumulators.append(accumulator)

    apply_grad = accumulator.apply_grad(
        array_ops.stack(stats_summaries, axis=0), stamp_token)
    dependencies.append(apply_grad)

  def grow_tree_from_accumulated_summaries_fn():
    """Updates tree with the best layer from accumulated summaries."""
    # Take out the accumulated summaries from the accumulator and grow.
    stats_summaries_list = [
        array_ops.unstack(accumulator.take_grad(1), axis=0)
        for accumulator in accumulators
    ]
    grow_op = grow_tree_from_stats_summaries(stats_summaries_list,
                                             feature_ids_list)
    return grow_op

  with ops.control_dependencies(dependencies):
    if config.is_chief:
      # The layer is ready only when ALL accumulators have enough batches.
      min_accumulated = math_ops.reduce_min(
          array_ops.stack(
              [acc.num_accumulated() for acc in accumulators]))

      grow_model = control_flow_ops.cond(
          math_ops.greater_equal(min_accumulated, n_batches_per_layer),
          grow_tree_from_accumulated_summaries_fn,
          control_flow_ops.no_op,
          name='wait_until_n_batches_accumulated')
      return grow_model
    else:
      # BUG FIX: the original fell through here and returned None for
      # non-chief workers; return an explicit no-op instead (matching the
      # sibling implementation of this function elsewhere in the codebase).
      return control_flow_ops.no_op()
def test_cond(self):
  """Checks `operator.cond()` against the SVD-based condition number."""
  # NOTE(review): `shapes_info`, `dtype` and `use_placeholder` come from the
  # enclosing (likely generated-test) scope — confirm at the template site.
  with self.test_session(graph=ops.Graph()) as sess:
    # svd does not work with zero dimensional matrices, so we'll
    # skip if
    if 0 in shapes_info.shape[-2:]:
      return

    # ROCm platform does not yet support complex types
    if test.is_built_with_rocm() and \
        ((dtype == dtypes.complex64) or (dtype == dtypes.complex128)):
      return

    sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
    # Ensure self-adjoint and PD so we get finite condition numbers.
    operator, mat = self.operator_and_matrix(
        shapes_info, dtype, use_placeholder=use_placeholder,
        ensure_self_adjoint_and_pd=True)
    # Eigenvalues are real, so we'll cast these to float64 and sort
    # for comparison.
    op_cond = operator.cond()
    s = math_ops.abs(linalg_ops.svd(mat, compute_uv=False))
    # Reference condition number: ratio of extreme singular values.
    mat_cond = math_ops.reduce_max(s, axis=-1) / math_ops.reduce_min(
        s, axis=-1)
    op_cond_v, mat_cond_v = sess.run([op_cond, mat_cond])

    # Tolerances loosen with lower precision dtypes.
    atol_override = {
        dtypes.float16: 1e-2,
        dtypes.float32: 1e-3,
        dtypes.float64: 1e-6,
        dtypes.complex64: 1e-3,
        dtypes.complex128: 1e-6,
    }
    rtol_override = {
        dtypes.float16: 1e-2,
        dtypes.float32: 1e-3,
        dtypes.float64: 1e-4,
        dtypes.complex64: 1e-3,
        dtypes.complex128: 1e-6,
    }
    atol = atol_override[dtype]
    rtol = rtol_override[dtype]
    self.assertAllClose(op_cond_v, mat_cond_v, atol=atol, rtol=rtol)
def _minimum_mean(samples, envelope, low, name=None):
  """Returns a stochastic lower bound on the mean of a scalar distribution.

  The idea is that if the true CDF is within an `eps`-envelope of the
  empirical CDF of the samples, and the support is bounded below, then
  the mean is bounded below as well.  In symbols,

  ```none
  sup_x(|F_n(x) - F(x)|) < eps
  ```

  The 0th dimension of `samples` is interpreted as independent and
  identically distributed samples.  The remaining dimensions are
  broadcast together with `envelope` and `low`, and operated on
  separately.

  Args:
    samples: Floating-point tensor of samples from the distribution(s)
      of interest.  Entries are assumed IID across the 0th dimension.
      The other dimensions must broadcast with `envelope` and `low`.
    envelope: Floating-point tensor of sizes of admissible CDF
      envelopes (i.e., the `eps` above).
    low: Floating-point tensor of lower bounds on the distributions'
      supports.
    name: A name for this operation (optional).

  Returns:
    bound: Floating-point tensor of lower bounds on the true means.

  Raises:
    InvalidArgumentError: If some `sample` is found to be smaller than
      the corresponding `low`.
  """
  with ops.name_scope(name, "minimum_mean", [samples, envelope, low]):
    samples = ops.convert_to_tensor(samples, name="samples")
    envelope = ops.convert_to_tensor(envelope, name="envelope")
    low = ops.convert_to_tensor(low, name="low")

    xmin = math_ops.reduce_min(samples, axis=[-1])
    msg = "Given sample minimum value falls below expectations"
    check_op = check_ops.assert_greater_equal(xmin, low, message=msg)
    with ops.control_dependencies([check_op]):
      # A lower bound on the mean is the negated upper bound on the mean
      # of the negated distribution (support bounded above by -low).
      return - _do_maximum_mean(-samples, envelope, -low)
def _MatrixSetDiagGrad(op, grad):
  """Gradient for MatrixSetDiag.

  The input gradient is the output gradient with its diagonal zeroed
  (the diagonal of the input does not flow to the output), and the diag
  gradient is the diagonal of the output gradient.
  """
  input_shape = op.inputs[0].get_shape().merge_with(grad.get_shape())
  diag_shape = op.inputs[1].get_shape()
  batch_shape = input_shape[:-2].merge_with(diag_shape[:-1])
  matrix_shape = input_shape[-2:]
  if batch_shape.is_fully_defined() and matrix_shape.is_fully_defined():
    # Static shapes: diagonal length is the smaller matrix dimension.
    diag_shape = batch_shape.as_list() + [min(matrix_shape.as_list())]
  else:
    # Dynamic shapes: derive batch/matrix shapes from `grad` at run time.
    with ops.colocate_with(grad):
      grad_shape = array_ops.shape(grad)
      grad_rank = array_ops.rank(grad)
      batch_shape = array_ops.slice(grad_shape, [0], [grad_rank - 2])
      matrix_shape = array_ops.slice(grad_shape, [grad_rank - 2], [2])
      min_dim = math_ops.reduce_min(matrix_shape)
      diag_shape = array_ops.concat([batch_shape, [min_dim]], 0)
  grad_input = array_ops.matrix_set_diag(
      grad, array_ops.zeros(diag_shape, dtype=grad.dtype))
  grad_diag = array_ops.matrix_diag_part(grad)
  return (grad_input, grad_diag)
def _minimum_mean(samples, envelope, low, name=None):
  """Returns a stochastic lower bound on the mean of a scalar distribution.

  If the true CDF lies within an `eps`-envelope of the empirical CDF of
  `samples` (`sup_x(|F_n(x) - F(x)|) < eps`), and the support is bounded
  below by `low`, then the true mean is bounded below by the returned value.

  The 0th dimension of `samples` holds IID draws; remaining dimensions
  broadcast with `envelope` and `low` and are bounded independently.

  Args:
    samples: Floating-point tensor of samples from the distribution(s) of
      interest. Entries are assumed IID across the 0th dimension. The other
      dimensions must broadcast with `envelope` and `low`.
    envelope: Floating-point tensor of admissible CDF envelope sizes
      (the `eps` above).
    low: Floating-point tensor of lower bounds on the distributions'
      supports.
    name: A name for this operation (optional).

  Returns:
    bound: Floating-point tensor of lower bounds on the true means.

  Raises:
    InvalidArgumentError: If some `sample` is found to be smaller than the
      corresponding `low`.
  """
  with ops.name_scope(name, "minimum_mean", [samples, envelope, low]):
    samples = ops.convert_to_tensor(samples, name="samples")
    envelope = ops.convert_to_tensor(envelope, name="envelope")
    low = ops.convert_to_tensor(low, name="low")

    sample_min = math_ops.reduce_min(samples, axis=[-1])
    msg = "Given sample minimum value falls below expectations"
    bounds_check = check_ops.assert_greater_equal(
        sample_min, low, message=msg)
    with ops.control_dependencies([bounds_check]):
      # Reduce to the maximum-mean problem by negating everything.
      return -_do_maximum_mean(-samples, envelope, -low)
def histogram(self, x, value_range=None, nbins=None, name=None):
  """Return histogram of values.

  Given the tensor `values`, this operation returns a rank 1 histogram
  counting the number of entries in `values` that fell into every bin. The
  bins are equal width and determined by the arguments `value_range` and
  `nbins`.

  Args:
    x: 1D numeric `Tensor` of items to count.
    value_range: Shape [2] `Tensor`. `new_values <= value_range[0]` will be
      mapped to `hist[0]`, `values >= value_range[1]` will be mapped to
      `hist[-1]`. Must be same dtype as `x`.
    nbins: Scalar `int32 Tensor`. Number of histogram bins.
    name: Python `str` name prefixed to Ops created by this class.

  Returns:
    counts: 1D `Tensor` of counts, i.e.,
      `counts[i] = sum{ edges[i-1] <= values[j] < edges[i] : j }`.
    edges: 1D `Tensor` characterizing intervals used for counting.
  """
  with ops.name_scope(name, "histogram", [x]):
    x = ops.convert_to_tensor(x, name="x")
    if value_range is None:
      # Default range spans the data; +1 so the max lands in the last bin.
      value_range = [
          math_ops.reduce_min(x), 1 + math_ops.reduce_max(x)
      ]
    value_range = ops.convert_to_tensor(value_range, name="value_range")
    lo = value_range[0]
    hi = value_range[1]
    if nbins is None:
      # NOTE(review): math_ops.to_int32 is the deprecated spelling of
      # math_ops.cast(..., dtypes.int32) — consider updating.
      nbins = math_ops.to_int32(hi - lo)
    delta = (hi - lo) / math_ops.cast(
        nbins, dtype=value_range.dtype.base_dtype)
    # Lower edge of each bin.
    edges = math_ops.range(
        start=lo, limit=hi, delta=delta, dtype=x.dtype.base_dtype)
    counts = histogram_ops.histogram_fixed_width(
        x, value_range=value_range, nbins=nbins)
    return counts, edges
def _rolling_window_indices(size, rw_size, num_valid_entries):
  """Returns the rolling windows indices and mask for valid ones.

  When size = 3, rw_size = 2, returns [[0, 1], [1, 2], [2, 0]]. When size = 2,
  rw_size = 3, returns [[0, 1, 0], [1, 0, 1]].

  When num_valid_entries = 2, the first returns [[0, 1], [1, 0], [0, 1]] and
  the first 2 are valid with mask as [True, True, False].

  Args:
    size: A scalar int `Tensor` for the size.
    rw_size: A scalr int `Tensor` for the rw_size.
    num_valid_entries: A 1-D `Tensor` with shape [batch_size] representing the
      number of valid entries for each instance in a batch.

  Returns:
    A tuple of Tensors (batch_rw_indices, batch_indices_mask). The first has
    shape [batch_size, size, rw_size] and the second has shape
    [batch_size, size].
  """
  with ops.name_scope(None, 'rolling_window_indices',
                      (size, rw_size, num_valid_entries)):
    # shape = [size, rw_size] with value [[0, 1, ...], [1, 2, ...], ...].
    rw_indices = array_ops.expand_dims(math_ops.range(rw_size),
                                       0) + array_ops.expand_dims(
                                           math_ops.range(size), 1)
    # shape = [batch_size, size, rw_size]. Make batch_size copies.
    batch_rw_indices = array_ops.gather(
        array_ops.expand_dims(rw_indices, 0),
        array_ops.zeros_like(num_valid_entries),
        axis=0)
    # Mark the first n indices as valid where n = num_valid_entries.
    # A window is valid iff its smallest index is still below n.
    batch_indices_mask = math_ops.less(
        math_ops.reduce_min(batch_rw_indices, axis=2),
        array_ops.reshape(num_valid_entries, [-1, 1]))
    # Mod the indices to the range of num_valid_entries.
    # Guard against mod-by-zero when an instance has no valid entries.
    num_valid_entries = array_ops.where(
        math_ops.less(num_valid_entries, 1),
        array_ops.ones_like(num_valid_entries), num_valid_entries)
    batch_rw_indices = math_ops.mod(
        batch_rw_indices, array_ops.reshape(num_valid_entries, [-1, 1, 1]))
    return batch_rw_indices, batch_indices_mask
def _MatrixSetDiagGrad(op, grad):
  """Gradient for MatrixSetDiag.

  The gradient w.r.t. the input matrix is `grad` with its diagonal zeroed
  out; the gradient w.r.t. the diagonal argument is the diagonal of `grad`.
  """
  full_shape = op.inputs[0].get_shape().merge_with(grad.get_shape())
  diag_static_shape = op.inputs[1].get_shape()
  batch_static = full_shape[:-2].merge_with(diag_static_shape[:-1])
  matrix_static = full_shape[-2:]
  if batch_static.is_fully_defined() and matrix_static.is_fully_defined():
    # All shapes known statically: build the diag shape in Python.
    diag_shape = batch_static.as_list() + [min(matrix_static.as_list())]
  else:
    # Some dimension is dynamic: derive the shapes from `grad` at run time.
    with ops.colocate_with(grad):
      g_shape = array_ops.shape(grad)
      g_rank = array_ops.rank(grad)
      dyn_batch = array_ops.slice(g_shape, [0], [g_rank - 2])
      dyn_matrix = array_ops.slice(g_shape, [g_rank - 2], [2])
      shortest_dim = math_ops.reduce_min(dyn_matrix)
      diag_shape = array_ops.concat([dyn_batch, [shortest_dim]], 0)
  zero_diag = array_ops.zeros(diag_shape, dtype=grad.dtype)
  grad_input = array_ops.matrix_set_diag(grad, zero_diag)
  grad_diag = array_ops.matrix_diag_part(grad)
  return (grad_input, grad_diag)
def normalize_for_graph_lstm(tensor): """Normalizes Tensor to range [-0.5, 0.5]. Scales a Tensor uniformly to fit within [-0.5, 0.5]^n. Additionally, each dimension is shifted to be centred around [0]^n i.e. the origin, in a way that data extends the same distance in positive and negative direction. In other words, the mean between maximum and minimum value of each dimension is shifted to zero. The undo_scaling op undoes scaling, but does not undo shifting. The unnormalize op does both, but is currently unused. Returns: The normalized Tensor, and an op to undo normalization. Example usage: ``` normalized_tensor, undo_scaling = normalize_for_graph_lstm(input_tensor) normalized_output_tensor = some_op(normalized_tensor) output_tensor = undo_scaling(normalized_output_tensor) ``` """ # tensor is normalized to range[-0.5, 0.5] # this function assumes tensors with shape [ batch_size, number_of_nodes, output_size ] assert (len(tensor.shape) == 3) # compute maximum and minimum joint position value in each dimension max_dim = math_ops.reduce_max(tensor, axis=1, keepdims=True) min_dim = math_ops.reduce_min(tensor, axis=1, keepdims=True) diff_dim = math_ops.subtract(max_dim, min_dim) # get normalizing factor as maximum difference within all dimensions max_diff = math_ops.reduce_max(diff_dim, axis=2, keepdims=True) normalized_tensor = math_ops.divide(tensor - min_dim - diff_dim / 2, max_diff) # return output rescaled and shifted to original position def unnormalize(tensor): return math_ops.multiply(tensor, max_diff) + diff_dim / 2 + min_dim # return output only rescaled, centered around 0 def undo_scaling(tensor): return math_ops.multiply(tensor, max_diff) return normalized_tensor, undo_scaling
def histogram(self, x, value_range=None, nbins=None, name=None):
  """Count `x` into `nbins` equal-width bins and return counts and edges.

  Args:
    x: 1D numeric `Tensor` of items to count.
    value_range: Shape [2] `Tensor`. Values at or below `value_range[0]`
      are counted in the first bin; values at or above `value_range[1]`
      are counted in the last. Must be same dtype as `x`. Defaults to
      `[min(x), 1 + max(x)]`.
    nbins: Scalar `int32 Tensor`. Number of histogram bins. Defaults to
      the integer span of `value_range`.
    name: Python `str` name prefixed to Ops created by this class.

  Returns:
    counts: 1D `Tensor` of per-bin counts, i.e.,
      `counts[i] = sum{ edges[i-1] <= values[j] < edges[i] : j }`.
    edges: 1D `Tensor` of bin lower edges.
  """
  with ops.name_scope(name, "histogram", [x]):
    x = ops.convert_to_tensor(x, name="x")
    if value_range is None:
      # Cover the data fully; +1 so the maximum lands inside the last bin.
      value_range = [math_ops.reduce_min(x), 1 + math_ops.reduce_max(x)]
    value_range = ops.convert_to_tensor(value_range, name="value_range")
    lo, hi = value_range[0], value_range[1]
    if nbins is None:
      nbins = math_ops.cast(hi - lo, dtypes.int32)
    bin_width = (hi - lo) / math_ops.cast(
        nbins, dtype=value_range.dtype.base_dtype)
    edges = math_ops.range(
        start=lo, limit=hi, delta=bin_width, dtype=x.dtype.base_dtype)
    counts = histogram_ops.histogram_fixed_width(
        x, value_range=value_range, nbins=nbins)
    return counts, edges
def _prepare_and_validate_params(labels, predictions, weights=None, topn=None):
  """Prepares and validates the parameters.

  Args:
    labels: A `Tensor` of the same shape as `predictions`. A value >= 1 means
      a relevant example.
    predictions: A `Tensor` with shape [batch_size, list_size]. Each value is
      the ranking score of the corresponding example.
    weights: A `Tensor` of the same shape of predictions or [batch_size, 1].
      The former case is per-example and the latter case is per-list.
    topn: A cutoff for how many examples to consider for this metric.

  Returns:
    (labels, predictions, weights, topn) ready to be used for metric
    calculation.
  """
  labels = ops.convert_to_tensor(labels)
  predictions = ops.convert_to_tensor(predictions)
  weights = 1.0 if weights is None else ops.convert_to_tensor(weights)
  # Broadcast weights to per-example shape via ones_like.
  example_weights = array_ops.ones_like(labels) * weights
  predictions.get_shape().assert_is_compatible_with(
      example_weights.get_shape())
  predictions.get_shape().assert_is_compatible_with(labels.get_shape())
  predictions.get_shape().assert_has_rank(2)
  if topn is None:
    topn = array_ops.shape(predictions)[1]

  # All labels should be >= 0. Invalid entries are reset.
  is_label_valid = utils.is_label_valid(labels)
  labels = array_ops.where(is_label_valid, labels,
                           array_ops.zeros_like(labels))
  # Invalid predictions are pushed slightly below the per-list minimum so
  # they always rank last.
  predictions = array_ops.where(
      is_label_valid, predictions,
      -1e-6 * array_ops.ones_like(predictions) + math_ops.reduce_min(
          predictions, axis=1, keepdims=True))
  return labels, predictions, example_weights, topn
def hardest_negative_dist(labels, embeddings, margin=1.2, squared=False): """Build the triplet loss over a batch of embeddings. For each anchor, we get the hardest positive and hardest negative to form a triplet. Args: labels: labels of the batch, of size (batch_size,) embeddings: tensor of shape (batch_size, embed_dim) margin: margin for triplet loss squared: Boolean. If true, output is the pairwise squared euclidean distance matrix. If false, output is the pairwise euclidean distance matrix. Returns: triplet_loss: scalar tensor containing the triplet loss """ # Get the pairwise distance matrix pairwise_dist = _pairwise_distances(embeddings, squared=squared) mask_anchor_negative = _get_anchor_negative_triplet_mask(labels) mask_anchor_negative = tf.to_float(mask_anchor_negative) maximum = math_ops.reduce_max(pairwise_dist, keepdims=True) result = math_ops.reduce_min(math_ops.multiply(pairwise_dist - maximum, mask_anchor_negative), keepdims=True) + maximum return result
def do_filter(self, estimated_state, estimated_state_covariance,
              predicted_observation, predicted_observation_covariance,
              observation, observation_model, observation_noise):
  """Convenience function for scoring predictions.

  Scores a prediction against an observation, and computes the updated
  posterior over states.

  Shapes given below for arguments are for single-model Kalman filtering
  (e.g. KalmanFilter). For ensembles, prior_state and prior_state_var are
  same-length tuples of values corresponding to each model.

  Args:
    estimated_state: A prior mean over states [batch size x state dimension]
    estimated_state_covariance: Covariance of state prior [batch size x D x
        D], with D depending on the Kalman filter implementation (typically
        the state dimension).
    predicted_observation: A prediction for the observed value, such as that
        returned by observed_from_state. A [batch size x num features]
        Tensor.
    predicted_observation_covariance: A covariance matrix corresponding to
        `predicted_observation`, a [batch size x num features x num features]
        Tensor.
    observation: The observed value corresponding to the predictions
        given [batch size x observation dimension]
    observation_model: The [batch size x observation dimension x model state
        dimension] Tensor indicating how a particular state is mapped to
        (pre-noise) observations for each part of the batch.
    observation_noise: A [batch size x observation dimension x observation
        dimension] Tensor or [observation dimension x observation dimension]
        Tensor with covariance matrices to use for each part of the batch (a
        two-dimensional input will be broadcast).
  Returns:
    posterior_state, posterior_state_var: Posterior mean and
        covariance, updated versions of prior_state and
        prior_state_var.
    log_prediction_prob: Log probability of the observations under the priors,
        suitable for optimization (should be maximized).
  """
  # Symmetrize the covariance: accumulated floating-point error can make it
  # slightly asymmetric, which the Cholesky factorization below rejects.
  symmetrized_observation_covariance = 0.5 * (
      predicted_observation_covariance + array_ops.matrix_transpose(
          predicted_observation_covariance))
  instability_message = (
      "This may occur due to numerically unstable filtering when there is "
      "a large difference in posterior variances, or when inferences are "
      "near-deterministic. Considering tuning the "
      "'filtering_maximum_posterior_variance_ratio' or "
      "'filtering_minimum_posterior_variance' parameters in your "
      "StateSpaceModelConfiguration, or tuning the transition matrix.")
  symmetrized_observation_covariance = numerics.verify_tensor_all_finite(
      symmetrized_observation_covariance,
      "Predicted observation covariance was not finite. {}".format(
          instability_message))
  # A valid covariance matrix must have a non-negative diagonal; guard the
  # Cholesky with a runtime assertion so failures are diagnosable.
  diag = array_ops.matrix_diag_part(symmetrized_observation_covariance)
  min_diag = math_ops.reduce_min(diag)
  non_negative_assert = control_flow_ops.Assert(
      min_diag >= 0.,
      [("The predicted observation covariance "
        "has a negative diagonal entry. {}").format(instability_message),
       min_diag])
  with ops.control_dependencies([non_negative_assert]):
    observation_covariance_cholesky = linalg_ops.cholesky(
        symmetrized_observation_covariance)
  # Score the observation under the predictive distribution.
  log_prediction_prob = math_utils.mvn_tril_log_prob(
      loc=predicted_observation,
      scale_tril=observation_covariance_cholesky,
      x=observation)
  (posterior_state,
   posterior_state_var) = self.posterior_from_prior_state(
       prior_state=estimated_state,
       prior_state_var=estimated_state_covariance,
       observation=observation,
       observation_model=observation_model,
       predicted_observations=(predicted_observation,
                               predicted_observation_covariance),
       observation_noise=observation_noise)
  return (posterior_state, posterior_state_var, log_prediction_prob)
def LastValueQuantize(inputs,
                      per_channel=False,
                      init_min=-6.0,
                      init_max=6.0,
                      vars_collection=None,
                      name_prefix='LastValueQuant',
                      reuse=None,
                      is_training=True,
                      num_bits=8,
                      narrow_range=False,
                      symmetric=False):
  """Adds a layer that collects quantization ranges as last input ranges.

  LastValueQuantize creates variables called 'min' and 'max', representing the
  interval used for quantization and clamping.

  Args:
    inputs: a tensor containing values to be quantized.
    per_channel: (Optional) a boolean specifying whether to use different
      quantization ranges per output channel.
    init_min: a float scalar, the initial value for variable min.
    init_max: a float scalar, the initial value for variable max.
    vars_collection: (Optional) collection where to store variables for
      quantization interval ends.
    name_prefix: name_prefix for created nodes.
    reuse: whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.
    is_training: Whether the op is applied to a training or eval graph.
    num_bits: Number of bits to use for quantization, must be between 2 and 8.
    narrow_range: Whether to use the narrow quantization range
      [1; 2^num_bits - 1] or wide range [0; 2^num_bits - 1].
    symmetric: If true, use symmetric quantization limits instead of training
      the minimum and maximum of each quantization range separately.
  Returns:
    a tensor containing quantized values.
  """
  with variable_scope.variable_scope(
      None, default_name=name_prefix, values=[inputs], reuse=reuse) as scope:
    scope.set_partitioner(None)
    input_shape = inputs.get_shape()
    input_dim = len(input_shape)
    if per_channel:
      # Only support quantizing 1-, 2- and 4-dimensional tensors.
      assert input_dim in [1, 2, 4], ('Expected 1D, 2D or 4D input, was: %s in '
                                      ' scope: %s' % (input_shape, name_prefix))
      min_max_shape = [input_shape[-1]]
    else:
      min_max_shape = []

    vars_collections = [vars_collection] if vars_collection else []
    min_var = _ModelVariable(
        'min',
        shape=min_max_shape,
        initializer=init_ops.constant_initializer(init_min),
        collections=vars_collections,
        trainable=False)
    max_var = _ModelVariable(
        'max',
        shape=min_max_shape,
        initializer=init_ops.constant_initializer(init_max),
        collections=vars_collections,
        trainable=False)
    if not is_training:
      # Eval graphs quantize with the ranges learned during training; no
      # range updates are performed.
      return _FakeQuantWithMinMaxVars(
          inputs,
          min_var,
          max_var,
          per_channel=per_channel,
          num_bits=num_bits,
          narrow_range=narrow_range)

    # Compute the observed range of this batch. (The original code duplicated
    # the `per_channel` dispatch for min and max; merged here into a single
    # branch with identical behavior and op names.)
    if per_channel:
      if input_dim >= 2:
        # Reduce over all dimensions except the channel (last) dimension.
        reduce_dims = [0] if input_dim == 2 else [0, 1, 2]
        batch_min = math_ops.reduce_min(
            inputs, reduction_indices=reduce_dims, name='BatchMin')
        batch_max = math_ops.reduce_max(
            inputs, reduction_indices=reduce_dims, name='BatchMax')
      else:
        # 1D input: each element already is its own per-channel statistic.
        batch_min = inputs
        batch_max = inputs
    else:
      batch_min = math_ops.reduce_min(inputs, name='BatchMin')
      batch_max = math_ops.reduce_max(inputs, name='BatchMax')

    if symmetric:
      if narrow_range:
        min_max_ratio = -1
      else:
        # In two's complement notation, the negative range is slightly larger
        # than the positive range.
        min_max_ratio = -((1 << num_bits) - 2) / (1 << num_bits)
      # TFLite requires that 0.0 is always in the [min; max] range. Because
      # batch_min <= batch_max, it follows that range_min <= 0 <= range_max.
      range_min = math_ops.minimum(batch_min, batch_max / min_max_ratio)
      range_max = math_ops.maximum(batch_max, batch_min * min_max_ratio)
    else:
      # TFLite requires that 0.0 is always in the [min; max] range.
      range_min = math_ops.minimum(batch_min, 0.0)
      range_max = math_ops.maximum(batch_max, 0.0)

    # Remember the ranges ("last value" semantics: each batch overwrites them).
    assign_min = state_ops.assign(min_var, range_min, name='AssignMinLast')
    assign_max = state_ops.assign(max_var, range_max, name='AssignMaxLast')

    return _FakeQuantWithMinMaxVars(
        inputs,
        assign_min,
        assign_max,
        per_channel=per_channel,
        num_bits=num_bits,
        narrow_range=narrow_range)
def bucket_by_sequence_length(input_length,
                              tensors,
                              batch_size,
                              bucket_boundaries,
                              num_threads=1,
                              capacity=32,
                              shapes=None,
                              dynamic_pad=False,
                              allow_smaller_final_batch=False,
                              keep_input=None,
                              shared_name=None,
                              name=None):
  """Lazy bucketing of inputs according to their length.

  Dispatches to `tf.contrib.training.bucket` after determining which bucket
  `input_length` falls into. `len(bucket_boundaries) + 1` buckets are used:
  one extra for `input_length < bucket_boundaries[0]` and one for
  `input_length >= bucket_boundaries[-1]`.

  Args:
    input_length: `int32` scalar `Tensor`, the sequence length of tensors.
    tensors: The list or dictionary of tensors, representing a single element,
      to bucket. Nested lists are not supported.
    batch_size: The new batch size pulled from the queue (python int or int32
      scalar).
    bucket_boundaries: int list, increasing non-negative numbers. The edges of
      the buckets to use when bucketing tensors.
    num_threads: An integer. The number of threads enqueuing `tensors`.
    capacity: An integer. The maximum number of minibatches in the top queue,
      and also the maximum number of elements within each bucket.
    shapes: (Optional) The shapes for each example. Defaults to the inferred
      shapes for `tensors`.
    dynamic_pad: Boolean. Allow variable dimensions in input shapes; tensors
      within a batch are padded on dequeue to a common shape.
    allow_smaller_final_batch: (Optional) Boolean. If `True`, allow the final
      batches to be smaller if there are insufficient items left in the
      queues.
    keep_input: (Optional). A `bool` scalar Tensor acting as a filter: when it
      evaluates `True` the element is added to its bucket, otherwise it is
      dropped. Defaults to always keeping the input.
    shared_name: (Optional). If set, the queues will be shared under the given
      name across multiple sessions.
    name: (Optional) A name for the operations.

  Returns:
    A tuple `(sequence_length, outputs)` where `sequence_length` is a 1-D
    `Tensor` of size `batch_size` and `outputs` is a list or dictionary of
    batched, bucketed, outputs corresponding to elements of `tensors`.

  Raises:
    TypeError: if `bucket_boundaries` is not a list of python integers.
    ValueError: if `bucket_boundaries` is empty or contains non-increasing
      values.
  """
  tensor_list = _as_tensor_list(tensors)
  if not isinstance(bucket_boundaries, (list, tuple)):
    raise TypeError(
        "bucket_boundaries must be a list or tuple, but received: %s" %
        bucket_boundaries)
  if not bucket_boundaries:
    raise ValueError("bucket_boundaries must not be empty")
  # Validate that consecutive boundaries are integers and strictly increasing.
  for lower, upper in zip(bucket_boundaries, bucket_boundaries[1:]):
    if not isinstance(lower, int) or not isinstance(upper, int):
      raise TypeError(
          "bucket boundaries must be integers, but saw: %s and %s" %
          (lower, upper))
    if lower >= upper:
      raise ValueError(
          "Buckets must contain sequential increasing lengths, but saw: "
          "%d before %d" % (lower, upper))

  with ops.name_scope(name, "bucket_by_sequence_length",
                      [input_length] + tensor_list) as name:
    input_length = ops.convert_to_tensor(
        input_length, dtype=dtypes.int32, name="input_length")
    # Bucket b is selected when buckets_min[b] <= input_length <
    # buckets_max[b]; padding the boundary lists with int32 min/max covers
    # the two open-ended buckets at each end.
    buckets_min = [np.iinfo(np.int32).min] + list(bucket_boundaries)
    buckets_max = list(bucket_boundaries) + [np.iinfo(np.int32).max]
    in_bucket = math_ops.logical_and(
        math_ops.less_equal(buckets_min, input_length),
        math_ops.less(input_length, buckets_max))
    which_bucket = math_ops.to_int32(
        math_ops.reduce_min(array_ops.where(in_bucket)))

    if shapes is not None:
      # `input_length` is prepended to the tensor list, so its (scalar) shape
      # must be prepended as well.
      shapes = [tensor_shape.scalar()] + shapes

    _, dequeued = bucket(
        tensors=[input_length] + tensor_list,
        which_bucket=which_bucket,
        batch_size=batch_size,
        num_buckets=len(bucket_boundaries) + 1,
        num_threads=num_threads,
        capacity=capacity,
        shapes=shapes,
        dynamic_pad=dynamic_pad,
        allow_smaller_final_batch=allow_smaller_final_batch,
        keep_input=keep_input,
        shared_name=shared_name)

    return (dequeued[0], _as_original_type(tensors, dequeued[1:]))
def _dynamic_rnn_loop(cell,
                      inputs,
                      initial_state,
                      parallel_iterations,
                      swap_memory,
                      sequence_length=None,
                      dtype=None):
  """Internal implementation of Dynamic RNN.

  Args:
    cell: An instance of RNNCell.
    inputs: A `Tensor` of shape [time, batch_size, input_size], or a nested
      tuple of such elements.
    initial_state: A `Tensor` of shape `[batch_size, state_size]`, or if
      `cell.state_size` is a tuple, then this should be a tuple of
      tensors having shapes `[batch_size, s] for s in cell.state_size`.
    parallel_iterations: Positive Python int.
    swap_memory: A Python boolean
    sequence_length: (optional) An `int32` `Tensor` of shape [batch_size].
    dtype: (optional) Expected dtype of output. If not specified, inferred
      from initial_state.

  Returns:
    Tuple `(final_outputs, final_state)`.
    final_outputs:
      A `Tensor` of shape `[time, batch_size, cell.output_size]`.  If
      `cell.output_size` is a (possibly nested) tuple of ints or `TensorShape`
      objects, then this returns a (possibly nested) tuple of Tensors matching
      the corresponding shapes.
    final_state:
      A `Tensor`, or possibly nested tuple of Tensors, matching in length
      and shapes to `initial_state`.

  Raises:
    ValueError: If the input depth cannot be inferred via shape inference
      from the inputs.
  """
  state = initial_state
  assert isinstance(parallel_iterations, int), "parallel_iterations must be int"

  state_size = cell.state_size

  flat_input = nest.flatten(inputs)
  flat_output_size = nest.flatten(cell.output_size)

  # Construct an initial output: dynamic (runtime) dims come from the input's
  # shape tensor; static dims are cross-checked against every input below.
  input_shape = array_ops.shape(flat_input[0])
  time_steps = input_shape[0]
  batch_size = input_shape[1]

  inputs_got_shape = tuple(input_.get_shape().with_rank_at_least(3)
                           for input_ in flat_input)

  const_time_steps, const_batch_size = inputs_got_shape[0].as_list()[:2]

  for shape in inputs_got_shape:
    if not shape[2:].is_fully_defined():
      raise ValueError(
          "Input size (depth of inputs) must be accessible via shape inference,"
          " but saw value None.")
    got_time_steps = shape[0].value
    got_batch_size = shape[1].value
    if const_time_steps != got_time_steps:
      raise ValueError(
          "Time steps is not the same for all the elements in the input in a "
          "batch.")
    if const_batch_size != got_batch_size:
      raise ValueError(
          "Batch_size is not the same for all the elements in the input.")

  # Prepare dynamic conditional copying of state & output
  def _create_zero_arrays(size):
    # Builds a zero tensor of shape [batch_size] + size in the state's dtype.
    size = _state_size_with_prefix(size, prefix=[batch_size])
    return array_ops.zeros(
        array_ops.stack(size), rnn._infer_state_dtype(dtype, state))

  flat_zero_output = tuple(
      _create_zero_arrays(output) for output in flat_output_size)
  zero_output = nest.pack_sequence_as(structure=cell.output_size,
                                      flat_sequence=flat_zero_output)

  if sequence_length is not None:
    min_sequence_length = math_ops.reduce_min(sequence_length)
    max_sequence_length = math_ops.reduce_max(sequence_length)

  time = array_ops.constant(0, dtype=dtypes.int32, name="time")

  with ops.name_scope("dynamic_rnn") as scope:
    base_name = scope

  # clear_after_read=False: the previous step's output is re-read inside
  # _time_step (see f2 below), so reads must not invalidate entries.
  def _create_ta(name, dtype):
    return tensor_array_ops.TensorArray(dtype=dtype,
                                        size=time_steps,
                                        tensor_array_name=base_name + name,
                                        clear_after_read=False)

  output_ta = tuple(
      _create_ta("output_%d" % i, rnn._infer_state_dtype(dtype, state))
      for i in range(len(flat_output_size)))
  input_ta = tuple(
      _create_ta("input_%d" % i, flat_input[0].dtype)
      for i in range(len(flat_input)))

  input_ta = tuple(
      ta.unstack(input_) for ta, input_ in zip(input_ta, flat_input))

  def _time_step(time, output_ta_t, state):
    """Take a time step of the dynamic RNN.

    Args:
      time: int32 scalar Tensor.
      output_ta_t: List of `TensorArray`s that represent the output.
      state: nested tuple of vector tensors that represent the state.

    Returns:
      The tuple (time + 1, output_ta_t with updated flow, new_state).
    """
    input_t = tuple(ta.read(time) for ta in input_ta)
    # Restore some shape information
    for input_, shape in zip(input_t, inputs_got_shape):
      input_.set_shape(shape[1:])

    input_t = nest.pack_sequence_as(structure=inputs, flat_sequence=input_t)
    call_cell = lambda: cell(input_t, state)

    # Unlike the stock dynamic_rnn, the "zero output" copied through for
    # finished sequences is the previous step's output (f2), except at
    # time 0 where the true zero output is used (f1).
    def f1():
      return zero_output

    def f2():
      return tuple(ta.read(tf.subtract(time, 1)) for ta in output_ta_t)

    cur_zero_output = tf.cond(tf.less(time, 1), f1, f2)

    if sequence_length is not None:
      (output, new_state) = rnn._rnn_step(
          time=time,
          sequence_length=sequence_length,
          min_sequence_length=min_sequence_length,
          max_sequence_length=max_sequence_length,
          # NOTE(review): cur_zero_output copies the previous output (not
          # zeros) past a sequence's end — confirm this is intentional.
          zero_output=cur_zero_output,
          state=state,
          call_cell=call_cell,
          state_size=state_size,
          skip_conditionals=True)
    else:
      (output, new_state) = call_cell()

    # Pack state if using state tuples
    output = nest.flatten(output)

    output_ta_t = tuple(
        ta.write(time, out) for ta, out in zip(output_ta_t, output))

    return (time + 1, output_ta_t, new_state)

  _, output_final_ta, final_state = control_flow_ops.while_loop(
      cond=lambda time, *_: time < time_steps,
      body=_time_step,
      loop_vars=(time, output_ta, state),
      parallel_iterations=parallel_iterations,
      swap_memory=swap_memory)

  # Unpack final output if not using output tuples.
  final_outputs = tuple(ta.stack() for ta in output_final_ta)

  # Restore some shape information
  for output, output_size in zip(final_outputs, flat_output_size):
    shape = _state_size_with_prefix(
        output_size, prefix=[const_time_steps, const_batch_size])
    output.set_shape(shape)

  final_outputs = nest.pack_sequence_as(structure=cell.output_size,
                                        flat_sequence=final_outputs)

  return (final_outputs, final_state)
def testEmptyGradients(self):
  # The gradient of reduce_min over an empty (0-row) input must be exact.
  with self.test_session():
    empty_input = array_ops.zeros([0, 3])
    reduced = math_ops.reduce_min(empty_input, [1])
    grad_error = gradient_checker.compute_gradient_error(
        empty_input, [0, 3], reduced, [0])
    self.assertEqual(grad_error, 0)
def rnn_mem(cell, inputs, question, state, m_input_size, m_size,
            initial_state=None, dtype=None, sequence_length=None, scope=None):
  """Runs a static RNN whose cell also consumes question/memory arguments.

  Variant of the classic static `rnn` loop: each step invokes
  `cell(input_, question, state, m_input_size, m_size)` instead of
  `cell(input_, state)`.

  NOTE(review): the `state` parameter is unconditionally overwritten below
  (by `initial_state` or `cell.zero_state`), so the value passed in is never
  used — confirm whether it is vestigial.

  Args:
    cell: An instance of RNNCell accepting the extended call signature above.
    inputs: A non-empty list of input tensors, one per time step.
    question: Extra tensor forwarded to every cell call — presumably a
      question/memory embedding; verify against the cell implementation.
    state: Unused (see NOTE above).
    m_input_size: Forwarded to the cell; semantics defined by the cell.
    m_size: Forwarded to the cell; semantics defined by the cell.
    initial_state: (optional) Initial state for the RNN.
    dtype: (optional) Data type for the zero initial state; required when
      `initial_state` is not given.
    sequence_length: (optional) int vector of per-example lengths; enables
      the dynamic early-stop/state-propagation path via `rnn._rnn_step`.
    scope: VariableScope for the created subgraph; defaults to "RNN".

  Returns:
    A pair (outputs, state): the per-step outputs and the final state.

  Raises:
    TypeError: if `cell` is not an RNNCell or `inputs` is not a list.
    ValueError: if `inputs` is empty, the input depth cannot be inferred, or
      neither `initial_state` nor `dtype` is provided.
  """
  if not isinstance(cell, rnn_cell.RNNCell):
    raise TypeError("cell must be an instance of RNNCell")
  if not isinstance(inputs, list):
    raise TypeError("inputs must be a list")
  if not inputs:
    raise ValueError("inputs must not be empty")
  outputs = []
  # Create a new scope in which the caching device is either
  # determined by the parent scope, or is set to place the cached
  # Variable using the same placement as for the rest of the RNN.
  with vs.variable_scope(scope or "RNN") as varscope:
    if varscope.caching_device is None:
      varscope.set_caching_device(lambda op: op.device)

    # Temporarily avoid EmbeddingWrapper and seq2seq badness
    # TODO(lukaszkaiser): remove EmbeddingWrapper
    if inputs[0].get_shape().ndims != 1:
      (fixed_batch_size, input_size) = inputs[0].get_shape().with_rank(2)
      if input_size.value is None:
        raise ValueError(
            "Input size (second dimension of inputs[0]) must be accessible via "
            "shape inference, but saw value None.")
    else:
      fixed_batch_size = inputs[0].get_shape().with_rank_at_least(1)[0]

    # Prefer the statically known batch size; fall back to a runtime value.
    if fixed_batch_size.value:
      batch_size = fixed_batch_size.value
    else:
      batch_size = array_ops.shape(inputs[0])[0]
    if initial_state is not None:
      state = initial_state
    else:
      if not dtype:
        raise ValueError("If no initial_state is provided, dtype must be.")
      state = cell.zero_state(batch_size, dtype)

    if sequence_length is not None:
      # Prepare variables
      sequence_length = math_ops.to_int32(sequence_length)
      zero_output = array_ops.zeros(
          array_ops.pack([batch_size, cell.output_size]), inputs[0].dtype)
      zero_output.set_shape(
          tensor_shape.TensorShape([fixed_batch_size.value, cell.output_size]))
      min_sequence_length = math_ops.reduce_min(sequence_length)
      max_sequence_length = math_ops.reduce_max(sequence_length)

    for time, input_ in enumerate(inputs):
      # All steps after the first share variables.
      if time > 0:
        vs.get_variable_scope().reuse_variables()
      # pylint: disable=cell-var-from-loop
      call_cell = lambda: cell(input_, question, state, m_input_size, m_size)
      # pylint: enable=cell-var-from-loop
      if sequence_length is not None:
        (output, state) = rnn._rnn_step(time, sequence_length,
                                        min_sequence_length,
                                        max_sequence_length, zero_output,
                                        state, call_cell)
      else:
        (output, state) = call_cell()

      outputs.append(output)

    return (outputs, state)
def test(self):
  # Reducing over the labeled 'channel' axis must equal a raw reduce_min on
  # axis 1 with the channel axis dropped from the label list.
  expected = core.LabeledTensor(
      math_ops.reduce_min(self.original_lt.tensor, 1),
      [self.a0, self.a2, self.a3])
  actual = ops.reduce_min(self.original_lt, {'channel'})
  self.assertLabeledTensorsEqual(actual, expected)
def rnn(cell, inputs, initial_state=None, dtype=None,
        sequence_length=None, scope=None):
  """Creates a recurrent neural network specified by RNNCell `cell`.

  The simplest form of RNN network generated is:
    state = cell.zero_state(...)
    outputs = []
    for input_ in inputs:
      output, state = cell(input_, state)
      outputs.append(output)
    return (outputs, state)

  However, a few other options are available:

  An initial state can be provided.
  If the sequence_length vector is provided, dynamic calculation is performed.
  This method of calculation does not compute the RNN steps past the maximum
  sequence length of the minibatch (thus saving computational time),
  and properly propagates the state at an example's sequence length
  to the final state output.

  The dynamic calculation performed is, at time t for batch row b,
    (output, state)(b, t) =
      (t >= sequence_length(b))
        ? (zeros(cell.output_size), states(b, sequence_length(b) - 1))
        : cell(input(b, t), state(b, t - 1))

  Args:
    cell: An instance of RNNCell.
    inputs: A length T list of inputs, each a tensor of shape
      [batch_size, input_size].
    initial_state: (optional) An initial state for the RNN.
      If `cell.state_size` is an integer, this must be
      a tensor of appropriate type and shape `[batch_size x cell.state_size]`.
      If `cell.state_size` is a tuple, this should be a tuple of
      tensors having shapes `[batch_size, s] for s in cell.state_size`.
    dtype: (optional) The data type for the initial state.  Required if
      initial_state is not provided.
    sequence_length: Specifies the length of each sequence in inputs.
      An int32 or int64 vector (tensor) size `[batch_size]`, values in `[0, T)`.
    scope: VariableScope for the created subgraph; defaults to "RNN".

  Returns:
    A pair (outputs, state) where:
      - outputs is a length T list of outputs (one for each input)
      - state is the final state

  Raises:
    TypeError: If `cell` is not an instance of RNNCell.
    ValueError: If `inputs` is `None` or an empty list, or if the input depth
      (column size) cannot be inferred from inputs via shape inference.
  """
  if not isinstance(cell, rnn_cell.RNNCell):
    raise TypeError("cell must be an instance of RNNCell")
  if not isinstance(inputs, list):
    raise TypeError("inputs must be a list")
  if not inputs:
    raise ValueError("inputs must not be empty")

  outputs = []
  # Create a new scope in which the caching device is either
  # determined by the parent scope, or is set to place the cached
  # Variable using the same placement as for the rest of the RNN.
  with vs.variable_scope(scope or "RNN") as varscope:
    if varscope.caching_device is None:
      varscope.set_caching_device(lambda op: op.device)

    # Temporarily avoid EmbeddingWrapper and seq2seq badness
    # TODO(lukaszkaiser): remove EmbeddingWrapper
    if inputs[0].get_shape().ndims != 1:
      input_shape = inputs[0].get_shape().with_rank_at_least(2)
      input_shape[1:].assert_is_fully_defined()
      (fixed_batch_size, input_size) = input_shape[0], input_shape[1:]
      if input_size[0].value is None:
        raise ValueError(
            "Input size (second dimension of inputs[0]) must be accessible via "
            "shape inference, but saw value None.")
    else:
      fixed_batch_size = inputs[0].get_shape().with_rank_at_least(1)[0]

    # Prefer the statically known batch size; fall back to a runtime value.
    if fixed_batch_size.value:
      batch_size = fixed_batch_size.value
    else:
      batch_size = array_ops.shape(inputs[0])[0]
    if initial_state is not None:
      state = initial_state
    else:
      if not dtype:
        raise ValueError("If no initial_state is provided, "
                         "dtype must be specified")
      state = cell.zero_state(batch_size, dtype)

    if sequence_length is not None:
      # Prepare variables
      sequence_length = math_ops.to_int32(sequence_length)
      # convert int to TensorShape if necessary
      output_size = _state_size_with_prefix(cell.output_size,
                                            prefix=[batch_size])
      zero_output = array_ops.zeros(array_ops.pack(output_size),
                                    inputs[0].dtype)
      zero_output_shape = _state_size_with_prefix(
          cell.output_size, prefix=[fixed_batch_size.value])
      zero_output.set_shape(tensor_shape.TensorShape(zero_output_shape))
      min_sequence_length = math_ops.reduce_min(sequence_length)
      max_sequence_length = math_ops.reduce_max(sequence_length)

    for time, input_ in enumerate(inputs):
      # All steps after the first share variables.
      if time > 0:
        vs.get_variable_scope().reuse_variables()
      # pylint: disable=cell-var-from-loop
      call_cell = lambda: cell(input_, state)
      # pylint: enable=cell-var-from-loop
      if sequence_length is not None:
        (output, state) = _rnn_step(
            time=time,
            sequence_length=sequence_length,
            min_sequence_length=min_sequence_length,
            max_sequence_length=max_sequence_length,
            zero_output=zero_output,
            state=state,
            call_cell=call_cell,
            state_size=cell.state_size,
        )
      else:
        (output, state) = call_cell()

      outputs.append(output)

    return (outputs, state)
def test(self):
  # ops.reduce_min over {'channel'} should agree with math_ops.reduce_min on
  # axis 1, relabeled with the remaining axes.
  remaining_axes = [self.a0, self.a2, self.a3]
  result_lt = ops.reduce_min(self.original_lt, {'channel'})
  golden_lt = core.LabeledTensor(
      math_ops.reduce_min(self.original_lt.tensor, 1), remaining_axes)
  self.assertLabeledTensorsEqual(result_lt, golden_lt)
def all_reduce_indexed_slices(
    self,
    input_slices: indexed_slices.IndexedSlices,
    options: Optional[collective_util.Options] = None
) -> indexed_slices.IndexedSlices:
  """All-reduce an IndexedSlices.

  This method must be called inside a tf.function.

  Args:
    input_slices: an IndexedSlices.
    options: an optional tf.distribute.experimental.CommunicationOptions. If
      provided, it overrides the default options.

  Returns:
    The reduced IndexedSlices.

  Raises:
    RuntimeError: if called in eager mode.
  """
  if context.executing_eagerly():
    raise RuntimeError(
        'all_reduce_indexed_slices is not supported in eager mode.')

  # Current CollectiveAllGather implementations require input IndexedSlices to
  # have consistent length across the board, we handle the reduction of
  # IndexedSlices as follows:
  #   1. Gather the lengths of IndexedSlices from all participants.
  #   2. If they have consistent length, apply all_gather.
  #   3. Otherwise pad IndexedSlices to be the same length across all
  #      participants and apply all_gather.
  options = self._options.merge(options)
  with ops.device(self._device):

    def all_gather_indexed_slices(
        all_gather_fn: Callable[
            [core.TensorLike, Optional[collective_util.Options]], core.Tensor]
    ) -> indexed_slices.IndexedSlices:
      """Use all_gather_fn to aggregate `IndexedSlices`."""
      all_values = all_gather_fn(input_slices.values, options)
      # Add control dependency to order the all-gather.
      if (options.implementation ==
          collective_util.CommunicationImplementation.NCCL):
        control = [all_values]
      else:
        control = []
      with ops.control_dependencies(control):
        all_indices = all_gather_fn(input_slices.indices, options)
      return indexed_slices.IndexedSlices(
          values=all_values,
          indices=all_indices,
          dense_shape=input_slices.dense_shape)

    # Exchange per-replica lengths first; they decide the fast or padded path.
    length = array_ops.shape(input_slices.indices)
    all_lengths = self._all_gather(length, options)

    def all_gather_with_padding(
        input_tensor: core.TensorLike,
        options: Optional[collective_util.Options]) -> core.Tensor:
      """all_gather tensors of different sizes using padding."""
      max_length = math_ops.reduce_max(all_lengths)
      padded_tensor = _pad_util(input_tensor, max_length)
      all_padded_tensors = self._all_gather(padded_tensor, options)
      # Strip each participant's padding back off using its true length.
      split_tensors = []
      for i in range(self._group_size):
        start_pos = i * max_length
        split_tensors.append(
            all_padded_tensors[start_pos:start_pos + all_lengths[i]])
      return array_ops.concat(split_tensors, 0)

    # All lengths equal (max == min) -> plain all_gather; otherwise pad.
    return control_flow_ops.cond(
        math_ops.equal(
            math_ops.reduce_max(all_lengths),
            math_ops.reduce_min(all_lengths)),
        lambda: all_gather_indexed_slices(self._all_gather),
        lambda: all_gather_indexed_slices(all_gather_with_padding))
def all_reduce_indexed_slices(self,
                              input_slices,
                              communication_hint='AUTO',
                              timeout=0):
  """All-reduce an IndexedSlices.

  This method must be called inside a tf.function.

  Args:
    input_slices: an IndexedSlices.
    communication_hint: string providing hint to runtime for choosing
      collective implementation.
    timeout: a float. The timeout in seconds.

  Returns:
    The reduced IndexedSlices.

  Raises:
    RuntimeError: if called in eager mode.
  """
  if context.executing_eagerly():
    raise RuntimeError(
        'all_reduce_indexed_slices in eager mode is not supported')

  # Current CollectiveAllGather implementations require input IndexedSlices to
  # have consistent length across the board, we handle the reduction of
  # IndexedSlices as follows:
  #   1. Gather the lengths of IndexedSlices from all participants.
  #   2. If they have consistent length, apply all_gather.
  #   3. Otherwise convert IndexedSlices to dense tensors and apply
  #      all_reduce.
  with ops.device(self._device):

    def all_gather():
      """Use all_gather to aggregate `IndexedSlices`."""
      all_values = self._all_gather(
          input_slices.values, communication_hint, timeout=timeout)
      # Add control dependency to order the all-gather.
      control = [all_values] if communication_hint == 'NCCL' else []
      with ops.control_dependencies(control):
        all_indices = self._all_gather(
            input_slices.indices, communication_hint, timeout=timeout)
      return ops.IndexedSlices(
          values=all_values,
          indices=all_indices,
          dense_shape=input_slices.dense_shape)

    def densify_and_all_reduce():
      """Use all_reduce to aggregate `IndexedSlices`."""
      densified = ops.convert_to_tensor(input_slices)
      reduced = self.all_reduce(
          densified, communication_hint=communication_hint, timeout=timeout)
      # We have to convert dense grad to IndexedSlice because all_reduce()
      # and all_gather() must have the same return type as required by
      # control_flow_ops.cond.
      return ops.IndexedSlices(
          values=reduced,
          indices=math_ops.range(array_ops.shape(reduced)[0]),
          dense_shape=input_slices.dense_shape)

    # Exchange per-replica lengths; equal lengths (max == min) allow the
    # cheaper all_gather path, otherwise densify and all_reduce.
    length = array_ops.shape(input_slices.indices)
    all_lengths = self._all_gather(
        length, communication_hint, timeout=timeout)
    return control_flow_ops.cond(
        math_ops.equal(
            math_ops.reduce_max(all_lengths),
            math_ops.reduce_min(all_lengths)),
        all_gather, densify_and_all_reduce)
def call(self, inputs, count_weights=None):
  """Encodes `inputs` according to the layer's output mode.

  Args:
    inputs: Dense or sparse int tensor of token indices; lists/ndarrays are
      converted and rank-1 inputs are expanded to rank 2.
    count_weights: Optional per-token weights; only valid when
      `output_mode == COUNT`.

  Returns:
    A [batch, max_tokens] tensor (sparse when `self._sparse`) of tf-idf,
    binary, or count encodings depending on `self._output_mode`.

  Raises:
    ValueError: if `count_weights` is passed with a non-COUNT output mode, or
      `sparse=True` is combined with tf-idf output.
    RuntimeError: if the layer was built with `max_tokens=None` and `adapt()`
      has not been called.
  """
  if isinstance(inputs, (list, np.ndarray)):
    inputs = ops.convert_to_tensor_v2_with_dispatch(inputs)
  if inputs.shape.rank == 1:
    inputs = array_ops.expand_dims(inputs, 1)

  if count_weights is not None and self._output_mode != COUNT:
    raise ValueError(
        "count_weights is not used in `output_mode='tf-idf'`, "
        "or `output_mode='binary'`. Please pass a single input.")
  self._called = True
  if self._max_tokens is None:
    raise RuntimeError(
        "If you construct a `CategoryEncoding` layer with "
        "`max_tokens=None`, you need to call `adapt()` "
        "on it before using it")
  else:
    out_depth = self._max_tokens

  if self._output_mode == TFIDF:
    # If the input is a sparse tensor, we densify it with the default value of
    # -1. Because -1 is ignored by one_hot, this effectively drops the non-set
    # positions from the output encoding.
    if self._sparse:
      raise ValueError("`sparse=True` with `output_mode=tfidf` "
                       "is not supported.")
    if isinstance(inputs, sparse_tensor.SparseTensor):
      inputs = sparse_ops.sparse_tensor_to_dense(inputs, default_value=-1)
    one_hot_data = array_ops.one_hot(inputs, depth=out_depth)
    counts = math_ops.reduce_sum(one_hot_data, axis=1)
    tf_idf_data = math_ops.multiply(counts, self.tf_idf_weights)
    tf_idf_data.set_shape(tensor_shape.TensorShape((None, out_depth)))
    return tf_idf_data

  binary_output = (self._output_mode == BINARY)
  if isinstance(inputs, sparse_tensor.SparseTensor):
    max_value = math_ops.reduce_max(inputs.values)
    min_value = math_ops.reduce_min(inputs.values)
  else:
    max_value = math_ops.reduce_max(inputs)
    min_value = math_ops.reduce_min(inputs)
  condition = math_ops.logical_and(
      math_ops.greater_equal(
          math_ops.cast(out_depth, max_value.dtype), max_value),
      math_ops.greater_equal(min_value, math_ops.cast(0, min_value.dtype)))
  # BUG FIX: the Assert op's return value was previously discarded, so the
  # range check was dead in graph mode. Wiring it in as a control dependency
  # of the bincount ops makes the check actually execute.
  assert_op = control_flow_ops.Assert(condition, [
      "Input values must be in the range 0 <= values < max_tokens"
      " with max_tokens={}".format(out_depth)
  ])
  with ops.control_dependencies([assert_op]):
    if self._sparse:
      result = bincount_ops.sparse_bincount(
          inputs,
          weights=count_weights,
          minlength=out_depth,
          maxlength=out_depth,
          axis=-1,
          binary_output=binary_output)
      result = math_ops.cast(result, K.floatx())
      batch_size = array_ops.shape(result)[0]
      result = sparse_tensor.SparseTensor(
          indices=result.indices,
          values=result.values,
          dense_shape=[batch_size, out_depth])
      return result
    else:
      result = bincount_ops.bincount(
          inputs,
          weights=count_weights,
          minlength=out_depth,
          maxlength=out_depth,
          dtype=K.floatx(),
          axis=-1,
          binary_output=binary_output)
      result.set_shape(tensor_shape.TensorShape((None, out_depth)))
      return result
def testEmptyGradients(self):
  """reduce_min over an empty tensor must have exact (zero) gradient error."""
  with self.cached_session():
    empty_input = array_ops.zeros([0, 3])
    reduced = math_ops.reduce_min(empty_input, [1])
    err = gradient_checker.compute_gradient_error(
        empty_input, [0, 3], reduced, [0])
    self.assertEqual(err, 0)
def _dynamic_rnn_loop(cell,
                      inputs,
                      initial_state,
                      parallel_iterations,
                      swap_memory,
                      sequence_length=None,
                      dtype=None):
  """Internal implementation of Dynamic RNN.

  Args:
    cell: An instance of RNNCell.
    inputs: A `Tensor` of shape [time, batch_size, input_size], or a nested
      tuple of such elements.
    initial_state: A `Tensor` of shape `[batch_size, state_size]`, or if
      `cell.state_size` is a tuple, then this should be a tuple of
      tensors having shapes `[batch_size, s] for s in cell.state_size`.
    parallel_iterations: Positive Python int.
    swap_memory: A Python boolean
    sequence_length: (optional) An `int32` `Tensor` of shape [batch_size].
    dtype: (optional) Expected dtype of output. If not specified, inferred from
      initial_state.

  Returns:
    Tuple `(final_outputs, final_state)`.
    final_outputs:
      A `Tensor` of shape `[time, batch_size, cell.output_size]`.  If
      `cell.output_size` is a (possibly nested) tuple of ints or `TensorShape`
      objects, then this returns a (possibly nested) tuple of Tensors matching
      the corresponding shapes.
    final_state:
      A `Tensor`, or possibly nested tuple of Tensors, matching in length
      and shapes to `initial_state`.

  Raises:
    ValueError: If the input depth cannot be inferred via shape inference
      from the inputs.
  """
  state = initial_state
  assert isinstance(parallel_iterations, int), "parallel_iterations must be int"

  state_size = cell.state_size

  # Flatten possibly-nested input/output structures so they can be stored in
  # flat tuples of TensorArrays below.
  flat_input = nest.flatten(inputs)
  flat_output_size = nest.flatten(cell.output_size)

  # Construct an initial output
  input_shape = array_ops.shape(flat_input[0])
  time_steps = input_shape[0]
  batch_size = input_shape[1]

  inputs_got_shape = tuple(input_.get_shape().with_rank_at_least(3)
                           for input_ in flat_input)

  const_time_steps, const_batch_size = inputs_got_shape[0].as_list()[:2]

  # All flattened inputs must agree on static time and batch dimensions, and
  # the depth (everything past dim 2) must be statically known.
  for shape in inputs_got_shape:
    if not shape[2:].is_fully_defined():
      raise ValueError(
          "Input size (depth of inputs) must be accessible via shape inference,"
          " but saw value None.")
    got_time_steps = shape[0].value
    got_batch_size = shape[1].value
    if const_time_steps != got_time_steps:
      raise ValueError(
          "Time steps is not the same for all the elements in the input in a "
          "batch.")
    if const_batch_size != got_batch_size:
      raise ValueError(
          "Batch_size is not the same for all the elements in the input.")

  # Prepare dynamic conditional copying of state & output
  def _create_zero_arrays(size):
    # Zero-filled output used when a row of the batch is past its sequence
    # length (see _rnn_step).
    size = _state_size_with_prefix(size, prefix=[batch_size])
    return array_ops.zeros(
        array_ops.stack(size), _infer_state_dtype(dtype, state))

  flat_zero_output = tuple(_create_zero_arrays(output)
                           for output in flat_output_size)
  zero_output = nest.pack_sequence_as(structure=cell.output_size,
                                      flat_sequence=flat_zero_output)

  if sequence_length is not None:
    min_sequence_length = math_ops.reduce_min(sequence_length)
    max_sequence_length = math_ops.reduce_max(sequence_length)

  time = array_ops.constant(0, dtype=dtypes.int32, name="time")

  with ops.name_scope("dynamic_rnn") as scope:
    base_name = scope

  def _create_ta(name, dtype):
    return tensor_array_ops.TensorArray(dtype=dtype,
                                        size=time_steps,
                                        tensor_array_name=base_name + name)

  # One TensorArray per flattened output / input element.
  output_ta = tuple(_create_ta("output_%d" % i,
                               _infer_state_dtype(dtype, state))
                    for i in range(len(flat_output_size)))
  input_ta = tuple(_create_ta("input_%d" % i, flat_input[0].dtype)
                   for i in range(len(flat_input)))

  input_ta = tuple(ta.unstack(input_)
                   for ta, input_ in zip(input_ta, flat_input))

  def _time_step(time, output_ta_t, state):
    """Take a time step of the dynamic RNN.

    Args:
      time: int32 scalar Tensor.
      output_ta_t: List of `TensorArray`s that represent the output.
      state: nested tuple of vector tensors that represent the state.

    Returns:
      The tuple (time + 1, output_ta_t with updated flow, new_state).
    """
    input_t = tuple(ta.read(time) for ta in input_ta)
    # Restore some shape information
    for input_, shape in zip(input_t, inputs_got_shape):
      input_.set_shape(shape[1:])

    input_t = nest.pack_sequence_as(structure=inputs, flat_sequence=input_t)
    call_cell = lambda: cell(input_t, state)

    if sequence_length is not None:
      # Copy state through / emit zeros for finished sequences.
      (output, new_state) = _rnn_step(
          time=time,
          sequence_length=sequence_length,
          min_sequence_length=min_sequence_length,
          max_sequence_length=max_sequence_length,
          zero_output=zero_output,
          state=state,
          call_cell=call_cell,
          state_size=state_size,
          skip_conditionals=True)
    else:
      (output, new_state) = call_cell()

    # Pack state if using state tuples
    output = nest.flatten(output)

    output_ta_t = tuple(
        ta.write(time, out) for ta, out in zip(output_ta_t, output))

    return (time + 1, output_ta_t, new_state)

  _, output_final_ta, final_state = control_flow_ops.while_loop(
      cond=lambda time, *_: time < time_steps,
      body=_time_step,
      loop_vars=(time, output_ta, state),
      parallel_iterations=parallel_iterations,
      swap_memory=swap_memory)

  # Unpack final output if not using output tuples.
  final_outputs = tuple(ta.stack() for ta in output_final_ta)

  # Restore some shape information
  for output, output_size in zip(final_outputs, flat_output_size):
    shape = _state_size_with_prefix(
        output_size, prefix=[const_time_steps, const_batch_size])
    output.set_shape(shape)

  final_outputs = nest.pack_sequence_as(
      structure=cell.output_size, flat_sequence=final_outputs)

  return (final_outputs, final_state)
def _update_statistics_from_mini_batch(
    self, statistics, auxiliary_variables, times, values):
  """Given mini-batch input, update `statistics` and `auxiliary_variables`.

  Args:
    statistics: Statistics object (moments, start time, observation count)
      whose variables are assigned by the returned ops.
    auxiliary_variables: Running accumulators (sums, counts, max time seen)
      used to derive `statistics`.
    times: Integer time tensor; assumed shape [batch, window] — axis 1 is
      reduced per-chunk below. TODO confirm against callers.
    values: Feature values; assumed shape [batch, window, features]. TODO
      confirm against callers.

  Returns:
    A grouped op performing all per-chunk statistics updates.
  """
  values = math_ops.cast(values, self._dtype)
  # The density (measured in times per observation) that we see in each part
  # of the mini-batch.
  batch_inter_observation_duration = (math_ops.cast(
      math_ops.reduce_max(times, axis=1) - math_ops.reduce_min(times, axis=1),
      self._dtype) / math_ops.cast(
          array_ops.shape(times)[1] - 1, self._dtype))
  # Co-locate updates with their variables to minimize race conditions when
  # updating statistics.
  with ops.colocate_with(auxiliary_variables.max_time_seen):
    # There is a race condition if this value is being updated from multiple
    # workers. However, it should eventually reach the correct value if the
    # last chunk is presented enough times.
    max_time_seen_assign = state_ops.assign(
        auxiliary_variables.max_time_seen,
        gen_math_ops.maximum(auxiliary_variables.max_time_seen,
                             math_ops.reduce_max(times)))
  with ops.colocate_with(auxiliary_variables.chunk_count):
    chunk_count_assign = state_ops.assign_add(auxiliary_variables.chunk_count,
                                              array_ops.shape(
                                                  times,
                                                  out_type=dtypes.int64)[0])
  with ops.colocate_with(auxiliary_variables.inter_observation_duration_sum):
    inter_observation_duration_assign = state_ops.assign_add(
        auxiliary_variables.inter_observation_duration_sum,
        math_ops.reduce_sum(batch_inter_observation_duration))
  with ops.colocate_with(auxiliary_variables.example_count):
    example_count_assign = state_ops.assign_add(
        auxiliary_variables.example_count,
        array_ops.size(times, out_type=dtypes.int64))
  # Note: These mean/variance updates assume that all points are equally
  # likely, which is not true if _chunks_ are sampled uniformly from the space
  # of all possible contiguous chunks, since points at the start and end of
  # the series are then members of fewer chunks. For series which are much
  # longer than the chunk size (the usual/expected case), this effect becomes
  # irrelevant.
  with ops.colocate_with(auxiliary_variables.overall_feature_sum):
    overall_feature_sum_assign = state_ops.assign_add(
        auxiliary_variables.overall_feature_sum,
        math_ops.reduce_sum(values, axis=[0, 1]))
  with ops.colocate_with(auxiliary_variables.overall_feature_sum_of_squares):
    overall_feature_sum_of_squares_assign = state_ops.assign_add(
        auxiliary_variables.overall_feature_sum_of_squares,
        math_ops.reduce_sum(values**2, axis=[0, 1]))
  per_chunk_aux_updates = control_flow_ops.group(
      max_time_seen_assign, chunk_count_assign,
      inter_observation_duration_assign, example_count_assign,
      overall_feature_sum_assign, overall_feature_sum_of_squares_assign)
  # Derived statistics are computed only after all accumulator updates above
  # have run, so they see the freshly-updated sums/counts.
  with ops.control_dependencies([per_chunk_aux_updates]):
    example_count_float = math_ops.cast(auxiliary_variables.example_count,
                                        self._dtype)
    new_feature_mean = (auxiliary_variables.overall_feature_sum /
                        example_count_float)
    overall_feature_mean_update = state_ops.assign(
        statistics.overall_feature_moments.mean, new_feature_mean)
    overall_feature_var_update = state_ops.assign(
        statistics.overall_feature_moments.variance,
        # De-biased n / (n - 1) variance correction
        example_count_float / (example_count_float - 1.) *
        (auxiliary_variables.overall_feature_sum_of_squares /
         example_count_float - new_feature_mean**2))
    # TODO(b/35675805): Remove this cast
    min_time_batch = math_ops.cast(math_ops.argmin(times[:, 0]), dtypes.int32)
    def series_start_updates():
      # If this is the lowest-time chunk that we have seen so far, update
      # series start moments to reflect that. Note that these statistics are
      # "best effort", as there are race conditions in the update (however,
      # they should eventually converge if the start of the series is
      # presented enough times).
      mean, variance = nn.moments(
          values[min_time_batch, :self._starting_variance_window_size],
          axes=[0])
      return control_flow_ops.group(
          state_ops.assign(statistics.series_start_moments.mean, mean),
          state_ops.assign(statistics.series_start_moments.variance,
                           variance))
    with ops.colocate_with(statistics.start_time):
      series_start_update = control_flow_ops.cond(
          # Update moments whenever we even match the lowest time seen so far,
          # to ensure that series start statistics are eventually updated to
          # their correct values, despite race conditions (i.e. eventually
          # statistics.start_time will reflect the global lowest time, and
          # given that we will eventually update the series start moments to
          # their correct values).
          math_ops.less_equal(times[min_time_batch, 0],
                              statistics.start_time),
          series_start_updates,
          control_flow_ops.no_op)
      with ops.control_dependencies([series_start_update]):
        # There is a race condition if this update is performed in parallel on
        # multiple workers. Since models may be sensitive to being presented
        # with times before the putative start time, the value of this
        # variable is post-processed above to guarantee that each worker is
        # presented with a start time which is at least as low as the lowest
        # time in its current mini-batch.
        start_time_update = state_ops.assign(statistics.start_time,
                                             gen_math_ops.minimum(
                                                 statistics.start_time,
                                                 math_ops.reduce_min(times)))
    inter_observation_duration_estimate = (
        auxiliary_variables.inter_observation_duration_sum / math_ops.cast(
            auxiliary_variables.chunk_count, self._dtype))
    # Estimate the total number of observations as:
    #   (end time - start time + 1) * average intra-chunk time density
    total_observation_count_update = state_ops.assign(
        statistics.total_observation_count,
        math_ops.cast(
            gen_math_ops.round(
                math_ops.cast(auxiliary_variables.max_time_seen -
                              statistics.start_time + 1, self._dtype) /
                inter_observation_duration_estimate), dtypes.int64))
    per_chunk_stat_updates = control_flow_ops.group(
        overall_feature_mean_update, overall_feature_var_update,
        series_start_update, start_time_update,
        total_observation_count_update)
  return per_chunk_stat_updates
def _train_op_fn(loss):
  """Run one training iteration.

  Closure over (among others): `training_state_cache`, `tree_ids`, `node_ids`,
  `logits`, `labels`, `feature_ids_list`, `bucket_size_list`,
  `input_feature_list`, `tree_hparams`, `tree_ensemble`, `train_op`,
  `stamp_token`, `config` — all defined in the enclosing scope.

  Args:
    loss: Scalar loss tensor to differentiate when no closed-form
      gradient/hessian function is supplied.

  Returns:
    A grouped `train_op` performing cache insertion, accumulation and/or
    ensemble growth.
  """
  if training_state_cache:
    # Cache current logits so the next step can resume from them.
    train_op.append(training_state_cache.insert(tree_ids, node_ids, logits))
  if closed_form_grad_and_hess_fn:
    gradients, hessians = closed_form_grad_and_hess_fn(logits, labels)
  else:
    # Hessians are obtained by differentiating the gradients a second time.
    gradients = gradients_impl.gradients(loss, logits, name='Gradients')[0]
    hessians = gradients_impl.gradients(
        gradients, logits, name='Hessians')[0]

  # One stats summary per feature, grouped per feature-id bucket group.
  stats_summaries_list = []
  for i, feature_ids in enumerate(feature_ids_list):
    num_buckets = bucket_size_list[i]
    summaries = [
        array_ops.squeeze(
            boosted_trees_ops.make_stats_summary(
                node_ids=node_ids,
                gradients=gradients,
                hessians=hessians,
                bucketized_features_list=[input_feature_list[f]],
                max_splits=max_splits,
                num_buckets=num_buckets),
            axis=0) for f in feature_ids
    ]
    stats_summaries_list.append(summaries)

  accumulators = []

  def grow_tree_from_stats_summaries(stats_summaries_list, feature_ids_list):
    """Updates ensemble based on the best gains from stats summaries."""
    node_ids_per_feature = []
    gains_list = []
    thresholds_list = []
    left_node_contribs_list = []
    right_node_contribs_list = []
    all_feature_ids = []

    assert len(stats_summaries_list) == len(feature_ids_list)

    # Gather the best split candidates across every feature group.
    for i, feature_ids in enumerate(feature_ids_list):
      (numeric_node_ids_per_feature, numeric_gains_list,
       numeric_thresholds_list, numeric_left_node_contribs_list,
       numeric_right_node_contribs_list) = (
           boosted_trees_ops.calculate_best_gains_per_feature(
               node_id_range=last_layer_nodes_range,
               stats_summary_list=stats_summaries_list[i],
               l1=tree_hparams.l1,
               l2=tree_hparams.l2,
               tree_complexity=tree_hparams.tree_complexity,
               min_node_weight=tree_hparams.min_node_weight,
               max_splits=max_splits))

      all_feature_ids += feature_ids
      node_ids_per_feature += numeric_node_ids_per_feature
      gains_list += numeric_gains_list
      thresholds_list += numeric_thresholds_list
      left_node_contribs_list += numeric_left_node_contribs_list
      right_node_contribs_list += numeric_right_node_contribs_list

    grow_op = boosted_trees_ops.update_ensemble(
        # Confirm if local_tree_ensemble or tree_ensemble should be used.
        tree_ensemble.resource_handle,
        feature_ids=all_feature_ids,
        node_ids=node_ids_per_feature,
        gains=gains_list,
        thresholds=thresholds_list,
        left_node_contribs=left_node_contribs_list,
        right_node_contribs=right_node_contribs_list,
        learning_rate=tree_hparams.learning_rate,
        max_depth=tree_hparams.max_depth,
        pruning_mode=boosted_trees_ops.PruningMode.NO_PRUNING)
    return grow_op

  if train_in_memory and is_single_machine:
    # Single-machine in-memory mode: grow the tree immediately.
    train_op.append(distribute_lib.increment_var(global_step))
    train_op.append(
        grow_tree_from_stats_summaries(stats_summaries_list,
                                       feature_ids_list))
  else:
    # Distributed mode: accumulate stats across workers; only the chief grows
    # the tree once enough batches have been accumulated.
    dependencies = []

    for i, feature_ids in enumerate(feature_ids_list):
      stats_summaries = stats_summaries_list[i]
      accumulator = data_flow_ops.ConditionalAccumulator(
          dtype=dtypes.float32,
          # The stats consist of grads and hessians (the last dimension).
          shape=[len(feature_ids), max_splits, bucket_size_list[i], 2],
          shared_name='numeric_stats_summary_accumulator_' + str(i))
      accumulators.append(accumulator)

      apply_grad = accumulator.apply_grad(
          array_ops.stack(stats_summaries, axis=0), stamp_token)
      dependencies.append(apply_grad)

    def grow_tree_from_accumulated_summaries_fn():
      """Updates the tree with the best layer from accumulated summaries."""
      # Take out the accumulated summaries from the accumulator and grow.
      stats_summaries_list = []
      stats_summaries_list = [
          array_ops.unstack(accumulator.take_grad(1), axis=0)
          for accumulator in accumulators
      ]
      grow_op = grow_tree_from_stats_summaries(stats_summaries_list,
                                               feature_ids_list)
      return grow_op

    with ops.control_dependencies(dependencies):
      train_op.append(distribute_lib.increment_var(global_step))
      if config.is_chief:
        # Grow only once every accumulator has seen >= n_batches_per_layer.
        min_accumulated = math_ops.reduce_min(
            array_ops.stack([acc.num_accumulated() for acc in accumulators]))

        train_op.append(
            control_flow_ops.cond(
                math_ops.greater_equal(min_accumulated, n_batches_per_layer),
                grow_tree_from_accumulated_summaries_fn,
                control_flow_ops.no_op,
                name='wait_until_n_batches_accumulated'))

  return control_flow_ops.group(train_op, name='train_op')
def testMinGradient(self):
  """Checks reduce_min gradients on a concatenation of duplicated inputs."""
  src = constant_op.constant([1.0], dtype=dtypes.float32)
  duplicated = array_ops.concat([src, src], 0)
  minimum = math_ops.reduce_min(duplicated)
  with self.cached_session():
    err = gradient_checker.compute_gradient_error(src, [1], minimum, [])
    self.assertLess(err, 1e-4)
def rnn(cell, inputs, initial_state=None, dtype=None,
        sequence_length=None, scope=None):
  """Creates a recurrent neural network specified by RNNCell "cell".

  The simplest form of RNN network generated is:
    state = cell.zero_state(...)
    outputs = []
    for input_ in inputs:
      output, state = cell(input_, state)
      outputs.append(output)
    return (outputs, state)

  However, a few other options are available:

  An initial state can be provided.
  If the sequence_length vector is provided, dynamic calculation is performed.
  This method of calculation does not compute the RNN steps past the maximum
  sequence length of the minibatch (thus saving computational time),
  and properly propagates the state at an example's sequence length
  to the final state output.

  The dynamic calculation performed is, at time t for batch row b,
    (output, state)(b, t) =
      (t >= sequence_length(b))
        ? (zeros(cell.output_size), states(b, sequence_length(b) - 1))
        : cell(input(b, t), state(b, t - 1))

  Args:
    cell: An instance of RNNCell.
    inputs: A length T list of inputs, each a tensor of shape
      [batch_size, cell.input_size].
    initial_state: (optional) An initial state for the RNN.  This must be
      a tensor of appropriate type and shape [batch_size x cell.state_size].
    dtype: (optional) The data type for the initial state.  Required if
      initial_state is not provided.
    sequence_length: Specifies the length of each sequence in inputs.
      An int32 or int64 vector (tensor) size [batch_size].  Values in [0, T).
    scope: VariableScope for the created subgraph; defaults to "RNN".

  Returns:
    A pair (outputs, state) where:
      outputs is a length T list of outputs (one for each input)
      state is the final state

  Raises:
    TypeError: If "cell" is not an instance of RNNCell.
    ValueError: If inputs is None or an empty list, or if the input depth
      cannot be inferred from inputs via shape inference.
  """
  # NOTE(review): the isinstance check uses `BaseCell` while the error message
  # says RNNCell — confirm BaseCell is the intended RNNCell base class here.
  if not isinstance(cell, BaseCell):
    raise TypeError("cell must be an instance of RNNCell")
  if not isinstance(inputs, list):
    raise TypeError("inputs must be a list")
  if not inputs:
    raise ValueError("inputs must not be empty")

  outputs = []
  # Create a new scope in which the caching device is either
  # determined by the parent scope, or is set to place the cached
  # Variable using the same placement as for the rest of the RNN.
  with vs.variable_scope(scope or "RNN") as varscope:
    if varscope.caching_device is None:
      varscope.set_caching_device(lambda op: op.device)

    # Temporarily avoid EmbeddingWrapper and seq2seq badness
    # TODO(lukaszkaiser): remove EmbeddingWrapper
    if inputs[0].get_shape().ndims != 1:
      (fixed_batch_size, input_size) = inputs[0].get_shape().with_rank(2)
      if input_size.value is None:
        raise ValueError(
            "Input size (second dimension of inputs[0]) must be accessible via "
            "shape inference, but saw value None.")
    else:
      fixed_batch_size = inputs[0].get_shape().with_rank_at_least(1)[0]

    # Prefer the static batch size; fall back to the dynamic one.
    if fixed_batch_size.value:
      batch_size = fixed_batch_size.value
    else:
      batch_size = array_ops.shape(inputs[0])[0]
    if initial_state is not None:
      state = initial_state
    else:
      if not dtype:
        raise ValueError(
            "If no initial_state is provided, dtype must be.")
      state = cell.zero_state(batch_size, dtype)

    if sequence_length is not None:
      sequence_length = math_ops.to_int32(sequence_length)

    if sequence_length is not None:  # Prepare variables
      # zero_output is emitted for rows past their sequence length.
      zero_output = array_ops.zeros(
          array_ops.pack([batch_size, cell.output_size]), inputs[0].dtype)
      zero_output.set_shape(
          tensor_shape.TensorShape(
              [fixed_batch_size.value, cell.output_size]))
      min_sequence_length = math_ops.reduce_min(sequence_length)
      max_sequence_length = math_ops.reduce_max(sequence_length)

    for time, input_ in enumerate(inputs):
      if time > 0:
        vs.get_variable_scope().reuse_variables()
      # pylint: disable=cell-var-from-loop
      call_cell = lambda: cell(input_, state)
      # pylint: enable=cell-var-from-loop
      if sequence_length is not None:
        (output, state) = _rnn_step(
            time, sequence_length, min_sequence_length, max_sequence_length,
            zero_output, state, call_cell)
      else:
        (output, state) = call_cell()

      outputs.append(output)

    return (outputs, state)
def attention_RNN(encoder_outputs,
                  encoder_state,
                  num_decoder_symbols,
                  sequence_length,
                  num_heads=1,
                  dtype=dtypes.float32,
                  use_attention=True,
                  loop_function=None,
                  scope=None):
  """Projects encoder outputs to decoder logits, optionally with attention.

  Args:
    encoder_outputs: Length-T list of encoder output tensors, each
      [batch_size, output_size].
    encoder_state: Final encoder state, used to seed the initial attention.
    num_decoder_symbols: Size of the output logit dimension.
    sequence_length: int32/int64 vector [batch_size] of valid lengths, or
      None (only used by the non-attention path).
    num_heads: Number of attention heads (must be >= 1 when attention is on).
    dtype: dtype for the zero-initialized attention vectors.
    use_attention: If True, run the attention model; otherwise a plain
      per-step linear projection.
    loop_function: Unused; kept for interface compatibility.
    scope: VariableScope name; defaults to "attention_RNN" /
      "non-attention_RNN".

  Returns:
    A pair (attention_encoder_outputs, sequence_attention_weights):
      attention_encoder_outputs: length-T list of [batch, num_decoder_symbols]
        logits.
      sequence_attention_weights: length-T list of attention weights when
        attention is used, otherwise an empty list.

  Raises:
    ValueError: If num_heads < 1 with attention enabled, or if the attention
      state shape cannot be inferred.
  """
  if use_attention:
    print ('Use the attention RNN model')
    if num_heads < 1:
      raise ValueError("With less than 1 heads, use a non-attention decoder.")

    with variable_scope.variable_scope(scope or "attention_RNN"):
      output_size = encoder_outputs[0].get_shape()[1].value
      top_states = [array_ops.reshape(e, [-1, 1, output_size])
                    for e in encoder_outputs]
      attention_states = array_ops.concat(top_states, 1)
      if not attention_states.get_shape()[1:2].is_fully_defined():
        raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                         % attention_states.get_shape())

      batch_size = array_ops.shape(top_states[0])[0]  # Needed for reshaping.
      attn_length = attention_states.get_shape()[1].value
      attn_size = attention_states.get_shape()[2].value

      # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape
      # before.
      hidden = array_ops.reshape(
          attention_states, [-1, attn_length, 1, attn_size])
      hidden_features = []
      v = []
      attention_vec_size = attn_size  # Size of query vectors for attention.
      for a in xrange(num_heads):
        k = variable_scope.get_variable("AttnW_%d" % a,
                                        [1, 1, attn_size, attention_vec_size])
        hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
        v.append(variable_scope.get_variable("AttnV_%d" % a,
                                             [attention_vec_size]))

      def attention(query):
        """Put attention masks on hidden using hidden_features and query."""
        attn_weights = []
        ds = []  # Results of attention reads will be stored here.
        for i in xrange(num_heads):
          with variable_scope.variable_scope("Attention_%d" % i):
            y = rnn_cell._linear(query, attention_vec_size, True)
            y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
            # Attention mask is a softmax of v^T * tanh(...).
            s = math_ops.reduce_sum(
                v[i] * math_ops.tanh(hidden_features[i] + y), [2, 3])
            a = nn_ops.softmax(s)
            attn_weights.append(a)
            # Now calculate the attention-weighted vector d.
            d = math_ops.reduce_sum(
                array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                [1, 2])
            ds.append(array_ops.reshape(d, [-1, attn_size]))
        return attn_weights, ds

      batch_attn_size = array_ops.stack([batch_size, attn_size])
      attns = [array_ops.zeros(batch_attn_size, dtype=dtype)
               for _ in xrange(num_heads)]
      for a in attns:  # Ensure the second shape of attention vectors is set.
        a.set_shape([None, attn_size])

      # loop through the encoder_outputs
      attention_encoder_outputs = list()
      sequence_attention_weights = list()
      for i in xrange(len(encoder_outputs)):
        if i > 0:
          variable_scope.get_variable_scope().reuse_variables()
        if i == 0:
          # Seed the first attention query from the encoder state.
          with variable_scope.variable_scope("Initial_Decoder_Attention"):
            initial_state = rnn_cell._linear(encoder_state, output_size, True)
          attn_weights, ds = attention(initial_state)
        else:
          attn_weights, ds = attention(encoder_outputs[i])
        output = array_ops.concat([ds[0], encoder_outputs[i]], 1)
        # NOTE: here we temporarily assume num_head = 1
        with variable_scope.variable_scope("AttnRnnOutputProjection"):
          logit = rnn_cell._linear(output, num_decoder_symbols, True)
        attention_encoder_outputs.append(logit)
        # NOTE: here we temporarily assume num_head = 1
        sequence_attention_weights.append(attn_weights[0])
        # NOTE: here we temporarily assume num_head = 1
  else:
    print ('Use the NON attention RNN model')
    with variable_scope.variable_scope(scope or "non-attention_RNN"):
      attention_encoder_outputs = list()
      sequence_attention_weights = list()

      # copy over logits once out of sequence_length
      if encoder_outputs[0].get_shape().ndims != 1:
        (fixed_batch_size,
         output_size) = encoder_outputs[0].get_shape().with_rank(2)
      else:
        fixed_batch_size = (
            encoder_outputs[0].get_shape().with_rank_at_least(1)[0])

      if fixed_batch_size.value:
        batch_size = fixed_batch_size.value
      else:
        batch_size = array_ops.shape(encoder_outputs[0])[0]

      if sequence_length is not None:
        sequence_length = math_ops.to_int32(sequence_length)
      if sequence_length is not None:  # Prepare variables
        # zero_logit is emitted for rows past their sequence length.
        zero_logit = array_ops.zeros(
            array_ops.pack([batch_size, num_decoder_symbols]),
            encoder_outputs[0].dtype)
        zero_logit.set_shape(
            tensor_shape.TensorShape([fixed_batch_size.value,
                                      num_decoder_symbols]))
        min_sequence_length = math_ops.reduce_min(sequence_length)
        max_sequence_length = math_ops.reduce_max(sequence_length)

      for time, input_ in enumerate(encoder_outputs):
        if time > 0:
          variable_scope.get_variable_scope().reuse_variables()
        # pylint: disable=cell-var-from-loop
        generate_logit = lambda: rnn_cell._linear(
            encoder_outputs[time], num_decoder_symbols, True)
        # pylint: enable=cell-var-from-loop
        if sequence_length is not None:
          logit = _step(
              time, sequence_length, min_sequence_length,
              max_sequence_length, zero_logit, generate_logit)
        else:
          # BUG FIX: previously `logit = generate_logit` appended the lambda
          # object itself; it must be called to produce the logit tensor.
          logit = generate_logit()
        attention_encoder_outputs.append(logit)

  return attention_encoder_outputs, sequence_attention_weights
def _dynamic_rnn_loop(cell, inputs, initial_state, ff_keep_prob,
                      recur_keep_prob, parallel_iterations, swap_memory,
                      sequence_length=None):
  """Internal implementation of Dynamic RNN.

  Args:
    cell: An instance of RNNCell.
    inputs: A `Tensor` of shape [time, batch_size, depth].
    initial_state: A `Tensor` of shape [batch_size, depth].
    ff_keep_prob: Feed-forward (input) dropout keep probability; dropout is
      applied only when this is a Tensor or a Python value < 1.
    recur_keep_prob: Recurrent (state) dropout keep probability; same rule.
    parallel_iterations: Positive Python int.
    swap_memory: A Python boolean
    sequence_length: (optional) An `int32` `Tensor` of shape [batch_size].

  Returns:
    Tuple (final_outputs, final_state).
    final_outputs:
      A `Tensor` of shape [time, batch_size, depth]`.
    final_state:
      A `Tensor` of shape [batch_size, depth].

  Raises:
    ValueError: If the input depth cannot be inferred via shape inference
      from the inputs.
  """
  state = initial_state
  assert isinstance(parallel_iterations, int), "parallel_iterations must be int"

  # Construct an initial output
  input_shape = array_ops.shape(inputs)
  (time_steps, batch_size, _) = array_ops.unpack(input_shape, 3)

  inputs_got_shape = inputs.get_shape().with_rank(3)
  (const_time_steps, const_batch_size,
   const_depth) = inputs_got_shape.as_list()

  if const_depth is None:
    raise ValueError(
        "Input size (depth of inputs) must be accessible via shape inference, "
        "but saw value None.")

  # Prepare dynamic conditional copying of state & output
  zero_output = array_ops.zeros(
      array_ops.pack([batch_size, cell.output_size]), inputs.dtype)
  if sequence_length is not None:
    min_sequence_length = math_ops.reduce_min(sequence_length)
    max_sequence_length = math_ops.reduce_max(sequence_length)

  time = array_ops.constant(0, dtype=dtypes.int32, name="time")

  with ops.name_scope("dynamic_rnn", None, []) as scope:
    base_name = scope

  output_ta = tensor_array_ops.TensorArray(dtype=inputs.dtype,
                                           size=time_steps,
                                           tensor_array_name=base_name +
                                           "output")
  input_ta = tensor_array_ops.TensorArray(dtype=inputs.dtype,
                                          size=time_steps,
                                          tensor_array_name=base_name +
                                          "input")

  # Feed-forward dropout: same mask for every time step (noise_shape keeps
  # the time dimension at 1).
  if isinstance(ff_keep_prob, ops.Tensor) or ff_keep_prob < 1:
    inputs = nn_ops.dropout(inputs, ff_keep_prob,
                            noise_shape=array_ops.pack(
                                [1, batch_size, const_depth]))
  input_ta = input_ta.unpack(inputs)

  # Recurrent dropout: drop only the output slice of the state; the remaining
  # state slices (e.g. an LSTM cell state) keep an all-ones mask.
  if isinstance(recur_keep_prob, ops.Tensor) or recur_keep_prob < 1:
    ones = array_ops.ones(array_ops.pack([batch_size, cell.output_size]),
                          inputs.dtype)
    state_dropout = nn_ops.dropout(ones, recur_keep_prob)
    state_dropout = array_ops.concat(
        1,
        [ones] * (cell.state_size // cell.output_size - 1) + [state_dropout])
  else:
    state_dropout = 1.

  def _time_step(time, state, output_ta_t):
    """Take a time step of the dynamic RNN.

    Args:
      time: int32 scalar Tensor.
      state: Vector.
      output_ta_t: `TensorArray`, the output with existing flow.

    Returns:
      The tuple (time + 1, new_state, output_ta_t with updated flow).
    """
    input_t = input_ta.read(time)
    # Restore some shape information
    input_t.set_shape([const_batch_size, const_depth])

    # The recurrent dropout mask is applied to the state fed into the cell.
    call_cell = lambda: cell(input_t, state * state_dropout)

    if sequence_length is not None:
      (output, new_state) = _rnn_step(time=time,
                                      sequence_length=sequence_length,
                                      min_sequence_length=min_sequence_length,
                                      max_sequence_length=max_sequence_length,
                                      zero_output=zero_output,
                                      state=state,
                                      call_cell=call_cell,
                                      skip_conditionals=True)
    else:
      (output, new_state) = call_cell()

    output_ta_t = output_ta_t.write(time, output)

    return (time + 1, new_state, output_ta_t)

  (_, final_state, output_final_ta) = control_flow_ops.while_loop(
      cond=lambda time, _1, _2: time < time_steps,
      body=_time_step,
      loop_vars=(time, state, output_ta),
      parallel_iterations=parallel_iterations,
      swap_memory=swap_memory)

  final_outputs = output_final_ta.pack()

  # Restore some shape information
  final_outputs.set_shape(
      [const_time_steps, const_batch_size, cell.output_size])

  return (final_outputs, final_state)
def _min_matrix_dim_tensor(self):
  """Smaller of the two trailing (matrix) dimensions, as a scalar tensor."""
  trailing_dims = self.shape_tensor()[-2:]
  return math_ops.reduce_min(trailing_dims)
def masked_minimum(data, mask, dim=1):
  """Minimum of `data` along `dim`, considering only entries where mask != 0.

  Shifts `data` by its per-axis maximum so masked-out entries (multiplied by
  zero) can never win the minimum, then shifts the result back.
  """
  row_max = math_ops.reduce_max(data, dim, keepdims=True)
  shifted = math_ops.multiply(data - row_max, mask)
  shifted_min = math_ops.reduce_min(shifted, dim, keepdims=True)
  return shifted_min + row_max
def tpu_fn(x, y):
  """Feeds shifted/scaled inputs through outside compilation and reduces."""
  shifted = x + 7.0
  doubled = y * 2.0
  c, d, e = tpu.outside_compilation(outside_fn, shifted, doubled)
  total = math_ops.reduce_max(c) + math_ops.reduce_min(d)
  return total + math_ops.reduce_sum(e)
def static_rnn(cell, inputs, initial_state=None, dtype=None,
               sequence_length=None, scope=None):
  """Creates a recurrent neural network specified by RNNCell `cell`.

  The simplest form of RNN network generated is:

  ```python
    state = cell.zero_state(...)
    outputs = []
    for input_ in inputs:
      output, state = cell(input_, state)
      outputs.append(output)
    return (outputs, state)
  ```

  However, a few other options are available:

  An initial state can be provided.
  If the sequence_length vector is provided, dynamic calculation is performed.
  This method of calculation does not compute the RNN steps past the maximum
  sequence length of the minibatch (thus saving computational time),
  and properly propagates the state at an example's sequence length
  to the final state output.

  The dynamic calculation performed is, at time `t` for batch row `b`,

  ```python
    (output, state)(b, t) =
      (t >= sequence_length(b))
        ? (zeros(cell.output_size), states(b, sequence_length(b) - 1))
        : cell(input(b, t), state(b, t - 1))
  ```

  Args:
    cell: An instance of RNNCell.
    inputs: A length T list of inputs, each a `Tensor` of shape
      `[batch_size, input_size]`, or a nested tuple of such elements.
    initial_state: (optional) An initial state for the RNN.
      If `cell.state_size` is an integer, this must be
      a `Tensor` of appropriate type and shape `[batch_size, cell.state_size]`.
      If `cell.state_size` is a tuple, this should be a tuple of
      tensors having shapes `[batch_size, s] for s in cell.state_size`.
    dtype: (optional) The data type for the initial state and expected output.
      Required if initial_state is not provided or RNN state has a
      heterogeneous dtype.
    sequence_length: Specifies the length of each sequence in inputs.
      An int32 or int64 vector (tensor) size `[batch_size]`, values in `[0, T)`.
    scope: VariableScope for the created subgraph; defaults to "rnn".

  Returns:
    A tuple (outputs, state, wt, fi, ti) where:
    - outputs is a length T list of outputs (one for each input), or a nested
      tuple of such elements.
    - state is the final state.
    - wt, fi, ti are extra per-step values propagated out of `_rnn_step`
      (or the cell call) for the last time step.  NOTE(review): their
      semantics are defined by `_rnn_step` / the cell implementation, which
      are not visible here -- confirm and document them at their source.

  Raises:
    TypeError: If `cell` is not an instance of RNNCell.
    ValueError: If `inputs` is `None` or an empty list, or if the input depth
      (column size) cannot be inferred from inputs via shape inference.
  """
  if not isinstance(cell, RNNCell):
    raise TypeError("cell must be an instance of RNNCell")
  if not nest.is_sequence(inputs):
    raise TypeError("inputs must be a sequence")
  if not inputs:
    raise ValueError("inputs must not be empty")

  outputs = []
  # Create a new scope in which the caching device is either
  # determined by the parent scope, or is set to place the cached
  # Variable using the same placement as for the rest of the RNN.
  with vs.variable_scope(scope or "rnn") as varscope:
    if varscope.caching_device is None:
      varscope.set_caching_device(lambda op: op.device)

    # Obtain the first sequence of the input.  `inputs` may be arbitrarily
    # nested, so drill down to the first leaf tensor.
    first_input = inputs
    while nest.is_sequence(first_input):
      first_input = first_input[0]

    # Temporarily avoid EmbeddingWrapper and seq2seq badness
    # TODO(lukaszkaiser): remove EmbeddingWrapper
    if first_input.get_shape().ndims != 1:
      # Rank >= 2 inputs: verify every flattened input has a static depth and
      # a batch size consistent with the others.
      input_shape = first_input.get_shape().with_rank_at_least(2)
      fixed_batch_size = input_shape[0]

      flat_inputs = nest.flatten(inputs)
      for flat_input in flat_inputs:
        input_shape = flat_input.get_shape().with_rank_at_least(2)
        batch_size, input_size = input_shape[0], input_shape[1:]
        fixed_batch_size.merge_with(batch_size)
        for i, size in enumerate(input_size):
          if size.value is None:
            raise ValueError(
                "Input size (dimension %d of inputs) must be accessible via "
                "shape inference, but saw value None." % i)
    else:
      # Rank-1 inputs (e.g. pre-embedding ids): only the batch size is needed.
      fixed_batch_size = first_input.get_shape().with_rank_at_least(1)[0]

    # Prefer the static batch size; fall back to the dynamic one.
    if fixed_batch_size.value:
      batch_size = fixed_batch_size.value
    else:
      batch_size = array_ops.shape(first_input)[0]
    if initial_state is not None:
      state = initial_state
    else:
      if not dtype:
        raise ValueError("If no initial_state is provided, "
                         "dtype must be specified")
      state = cell.zero_state(batch_size, dtype)

    if sequence_length is not None:  # Prepare variables
      sequence_length = ops.convert_to_tensor(
          sequence_length, name="sequence_length")
      if sequence_length.get_shape().ndims not in (None, 1):
        raise ValueError(
            "sequence_length must be a vector of length batch_size")

      def _create_zero_output(output_size):
        """Builds a zero output tensor with as much static shape as possible."""
        # convert int to TensorShape if necessary
        size = _state_size_with_prefix(output_size, prefix=[batch_size])
        output = array_ops.zeros(
            array_ops.stack(size), _infer_state_dtype(dtype, state))
        shape = _state_size_with_prefix(
            output_size, prefix=[fixed_batch_size.value])
        output.set_shape(tensor_shape.TensorShape(shape))
        return output

      output_size = cell.output_size
      flat_output_size = nest.flatten(output_size)
      flat_zero_output = tuple(
          _create_zero_output(size) for size in flat_output_size)
      zero_output = nest.pack_sequence_as(
          structure=output_size, flat_sequence=flat_zero_output)

      sequence_length = math_ops.to_int32(sequence_length)
      min_sequence_length = math_ops.reduce_min(sequence_length)
      max_sequence_length = math_ops.reduce_max(sequence_length)

    for time, input_ in enumerate(inputs):
      # Reuse the cell's variables for every step after the first.
      if time > 0:
        varscope.reuse_variables()
      # pylint: disable=cell-var-from-loop
      call_cell = lambda: cell(input_, state)
      # pylint: enable=cell-var-from-loop
      if sequence_length is not None:
        # NOTE(review): `_rnn_step` here yields three extra values beyond the
        # standard (output, state) pair; see the Returns note above.
        (output, state, wt, fi, ti) = _rnn_step(
            time=time,
            sequence_length=sequence_length,
            min_sequence_length=min_sequence_length,
            max_sequence_length=max_sequence_length,
            zero_output=zero_output,
            state=state,
            call_cell=call_cell,
            state_size=cell.state_size)
      else:
        (output, state, wt, fi, ti) = call_cell()

      outputs.append(output)

    # Only the last step's wt/fi/ti survive the loop and are returned.
    return (outputs, state, wt, fi, ti)
def _finish(self, state): var_dtype = self._variables[0].dtype.base_dtype # Update global step. global_step = self._get_global_step(state) update_global_step = state_ops.assign_add(global_step, 1.) # Update the first moment estimate. beta1 = state.get_hyper("beta1", dtype=var_dtype) moment1 = self._get_moment1(state) flat_grad = self._get_flat_grad(state) # moment1_t := beta1 * moment1_{t-1} + (1 - beta1) * flat_grad_t update_moment1 = moment1.assign(beta1 * moment1 + (1. - beta1) * flat_grad) # Update the gradient buffer. window = state.get_hyper("window") grad_buffer = self._get_grad_buffer(state) next_grad_index = math_ops.floormod( math_ops.to_int32(update_global_step - 1.), window) # grad_buffer[(t-1) % window] := moment1_t update_grad_buffer = state_ops.scatter_update(grad_buffer, next_grad_index, update_moment1) # Compute the update step. eps = state.get_hyper("eps", dtype=var_dtype) svd_eps = state.get_hyper("svd_eps", dtype=var_dtype) sigma_eps = state.get_hyper("sigma_eps", dtype=var_dtype) lr = state.get_hyper("lr", dtype=var_dtype) denom = math_ops.sqrt( math_ops.minimum( ops.convert_to_tensor(update_global_step), ops.convert_to_tensor(math_ops.cast(window, dtype=var_dtype)))) moment1_2d = array_ops.expand_dims(update_moment1, -1) # m = grad_buffer^T / sqrt(min(t, window)) # m has shape [model dimension, window], where model dimension is the sum # of the dimensions of the flattened variables. m = array_ops.transpose(math_ops.divide(update_grad_buffer, denom)) # sigma, u, _ = SVD(m^Tm + I * svd_eps) mm = math_ops.matmul(m, m, transpose_a=True) damping = math_ops.cast(linalg_ops.eye(window), dtype=var_dtype) * svd_eps sigma, u, _ = linalg_ops.svd(mm + damping) sigma_sqrt = math_ops.sqrt(sigma) sigma_sqrt_min = math_ops.reduce_min(sigma_sqrt) # sigma_sqrt_inv = 1 / (\sqrt{sigma} + sigma_eps) ^ 3 # We add sigma_eps to alleviate numerical instability. # Note that (m^Tm)^(-3/2) = u diag(sigma_sqrt_inv) u^T. 
sigma_sqrt_inv = math_ops.divide( math_ops.cast(1.0, dtype=var_dtype), math_ops.pow(sigma_sqrt + sigma_eps, 3)) # In full matrix AdaGrad, the update step computes (mm^T)^(-1/2)g, where the # inversion of a model dimension by model dimension matrix is needed. To # speed up this computation we calculate the following instead: # m(m^Tm)^(-3/2)m^T moment1 = m u diag(sigma_sqrt_inv) u^T m^T moment1. new_step = array_ops.expand_dims( array_ops.zeros(flat_grad.get_shape(), dtype=var_dtype), -1) head = math_ops.matmul( m, math_ops.matmul( u, math_ops.matmul( array_ops.diag(sigma_sqrt_inv), math_ops.matmul(u, math_ops.matmul(m, moment1_2d, transpose_a=True), transpose_a=True)))) # When inverting (mm^t)^(1/2), we also add epsilon * I regularization for # degenerate cases. We expand ((mm^t)^(1/2) + epsilon * I)^(-1) using # Woodbury's identity. # For full derivation please see paper at # https://arxiv.org/pdf/1806.02958.pdf tail = moment1_2d - math_ops.matmul( m, math_ops.matmul( u, math_ops.matmul( array_ops.diag( math_ops.divide(math_ops.cast(1.0, dtype=var_dtype), sigma)), math_ops.matmul(u, math_ops.matmul( m, moment1_2d, transpose_a=True), transpose_a=True)))) scaled_tail = math_ops.divide(tail, sigma_sqrt_min) update_new_step = control_flow_ops.cond( sigma_sqrt_min > eps, lambda: math_ops.add(head, scaled_tail), lambda: math_ops.add(new_step, head)) # Update each variable. update_step = [] for var in self._variables: dim = self.shape_dict[var.name] start_index = self.index_dict[var.name] end_index = start_index + dim var_update_correct_shape = array_ops.reshape( update_new_step[start_index:end_index], var.get_shape()) var_updated = state_ops.assign_sub(var, lr * var_update_correct_shape) update_step.append(var_updated) return control_flow_ops.group(update_step)
def _calculate_acceptance_probs_with_mixing(initial_probs, target_probs):
  """Computes per-class acceptance rates and a dataset mixing ratio.

  We assume an example is drawn *either* directly from the original data
  distribution (with probability `m`) *or* from a rejection-sampled version
  of it, where an example of class `i` is accepted with probability `a_i`.

  Writing `ratio_i = t_i / p_i` (target proportion over initial proportion
  for class `i`), the analysis of repeated rejections (a geometric series
  over the per-draw rejection probability `F = 1 - sum_i(p_i * a_i)`) shows
  that the resulting minibatch distribution matches `target_probs`, while
  rejecting as little data as possible, when:

  ```
    m   = min_i ratio_i
    a_i = (ratio_i - m) / (max_i ratio_i - m)
  ```

  All `p_i` and `t_i` lie in [0, 1] and each sums to 1, so every `a_i` is a
  valid probability in [0, 1].

  Args:
    initial_probs: A Tensor of the initial probability distribution, given or
      estimated.
    target_probs: A Tensor of the desired class proportions.

  Returns:
    (A 1D Tensor with the per-class acceptance probabilities, the desired
    probability of pull from the original distribution.)
  """
  ratios = _get_target_to_initial_ratio(initial_probs, target_probs)
  # Probability of sampling straight from the original, unfiltered dataset.
  mixing_prob = math_ops.reduce_min(ratios)
  peak_ratio = math_ops.reduce_max(ratios)

  # TODO(joelshor): Simplify fraction, if possible.
  acceptance_probs = (ratios - mixing_prob) / (peak_ratio - mixing_prob)
  return acceptance_probs, mixing_prob
def _dynamic_rnn_loop(cell, inputs, initial_state, parallel_iterations,
                      swap_memory, sequence_length=None):
  """Internal implementation of Dynamic RNN.

  Args:
    cell: An instance of RNNCell.
    inputs: A `Tensor` of shape [time, batch_size, input_size].
    initial_state: A `Tensor` of shape `[batch_size, state_size]`, or if
      `cell.state_size` is a tuple, then this should be a tuple of
      tensors having shapes `[batch_size, s] for s in cell.state_size`.
    parallel_iterations: Positive Python int.
    swap_memory: A Python boolean
    sequence_length: (optional) An `int32` `Tensor` of shape [batch_size].

  Returns:
    Tuple `(final_outputs, final_state)`.
    final_outputs:
      A `Tensor` of shape `[time, batch_size, cell.output_size]`.
    final_state:
      A `Tensor` matrix, or tuple of such matrices, matching in length
      and shapes to `initial_state`.

  Raises:
    ValueError: If the input depth cannot be inferred via shape inference
      from the inputs.
  """
  state = initial_state
  assert isinstance(parallel_iterations, int), "parallel_iterations must be int"

  # Construct an initial output
  input_shape = array_ops.shape(inputs)
  time_steps = input_shape[0]
  batch_size = input_shape[1]

  inputs_got_shape = inputs.get_shape().with_rank_at_least(3).as_list()
  const_time_steps = inputs_got_shape[0]
  const_batch_size = inputs_got_shape[1]
  # const_depth is a (possibly multi-dim) list of trailing dimensions.
  const_depth = inputs_got_shape[2:]

  if const_depth is None:
    raise ValueError(
        "Input size (depth of inputs) must be accessible via shape inference, "
        "but saw value None.")

  # Prepare dynamic conditional copying of state & output
  zeros_size = _state_size_with_prefix(cell.output_size, prefix=[batch_size])
  zero_output = array_ops.zeros(array_ops.pack(zeros_size), inputs.dtype)

  if sequence_length is not None:
    min_sequence_length = math_ops.reduce_min(sequence_length)
    max_sequence_length = math_ops.reduce_max(sequence_length)

  time = array_ops.constant(0, dtype=dtypes.int32, name="time")

  state_size = cell.state_size
  state_is_tuple = _is_sequence(state_size)
  # while_loop needs a flat tuple of loop vars, so flatten tuple state here
  # and re-pack it inside the body.
  state = _unpacked_state(state) if state_is_tuple else (state,)

  with ops.op_scope([], "dynamic_rnn") as scope:
    base_name = scope

  output_ta = tensor_array_ops.TensorArray(
      dtype=inputs.dtype, size=time_steps,
      tensor_array_name=base_name + "output")
  input_ta = tensor_array_ops.TensorArray(
      dtype=inputs.dtype, size=time_steps,
      tensor_array_name=base_name + "input")
  input_ta = input_ta.unpack(inputs)

  def _time_step(time, output_ta_t, *state):
    """Take a time step of the dynamic RNN.

    Args:
      time: int32 scalar Tensor.
      output_ta_t: `TensorArray`, the output with existing flow.
      *state: List of vector tensors.

    Returns:
      The tuple (time + 1, output_ta_t with updated flow) + new_state.
    """
    input_t = input_ta.read(time)
    # Restore some shape information
    input_t.set_shape([const_batch_size] + const_depth)

    # Pack state back up for use by cell
    state = (_packed_state(structure=state_size, state=state)
             if state_is_tuple else state[0])

    call_cell = lambda: cell(input_t, state)

    if sequence_length is not None:
      (output, new_state) = _rnn_step(
          time=time,
          sequence_length=sequence_length,
          min_sequence_length=min_sequence_length,
          max_sequence_length=max_sequence_length,
          zero_output=zero_output,
          state=state,
          call_cell=call_cell,
          state_size=state_size,
          skip_conditionals=True)
    else:
      (output, new_state) = call_cell()

    # Pack state if using state tuples
    new_state = (tuple(_unpacked_state(new_state))
                 if state_is_tuple else (new_state,))

    output_ta_t = output_ta_t.write(time, output)

    return (time + 1, output_ta_t) + new_state

  final_loop_vars = control_flow_ops.while_loop(
      cond=lambda time, *_: time < time_steps,
      body=_time_step,
      loop_vars=(time, output_ta) + tuple(state),
      parallel_iterations=parallel_iterations,
      swap_memory=swap_memory)

  # loop var 0 is the final time; 1 is the output TensorArray; the rest is
  # the (flattened) final state.
  (output_final_ta, final_state) = (final_loop_vars[1], final_loop_vars[2:])

  final_outputs = output_final_ta.pack()

  # Restore some shape information
  final_outputs_size = _state_size_with_prefix(
      cell.output_size, prefix=[const_time_steps, const_batch_size])
  final_outputs.set_shape(final_outputs_size)

  # Unpack final state if not using state tuples.
  final_state = (_packed_state(structure=cell.state_size, state=final_state)
                 if state_is_tuple else final_state[0])

  return (final_outputs, final_state)
def _dynamic_rnn_loop(
    cell, inputs, initial_state, parallel_iterations, swap_memory,
    sequence_length=None):
  """Internal implementation of Dynamic RNN.

  Args:
    cell: An instance of RNNCell.
    inputs: A `Tensor` of shape [time, batch_size, depth].
    initial_state: A `Tensor` of shape [batch_size, depth].
    parallel_iterations: Positive Python int.
    swap_memory: A Python boolean
    sequence_length: (optional) An `int32` `Tensor` of shape [batch_size].

  Returns:
    Tuple (final_outputs, final_state).
    final_outputs:
      A `Tensor` of shape [time, batch_size, depth]`.
    final_state:
      A `Tensor` of shape [batch_size, depth].

  Raises:
    ValueError: If the input depth cannot be inferred via shape inference
      from the inputs.
  """
  state = initial_state
  assert isinstance(parallel_iterations, int), "parallel_iterations must be int"

  # Construct an initial output
  input_shape = array_ops.shape(inputs)
  (time_steps, batch_size, _) = array_ops.unpack(input_shape, 3)

  inputs_got_shape = inputs.get_shape().with_rank(3)
  (const_time_steps, const_batch_size, const_depth) = inputs_got_shape.as_list()

  if const_depth is None:
    raise ValueError(
        "Input size (depth of inputs) must be accessible via shape inference, "
        "but saw value None.")

  # Prepare dynamic conditional copying of state & output
  zero_output = array_ops.zeros(
      array_ops.pack([batch_size, cell.output_size]), inputs.dtype)

  if sequence_length is not None:
    min_sequence_length = math_ops.reduce_min(sequence_length)
    max_sequence_length = math_ops.reduce_max(sequence_length)

  time = array_ops.constant(0, dtype=dtypes.int32, name="time")

  with ops.op_scope([], "dynamic_rnn") as scope:
    base_name = scope

  output_ta = tensor_array_ops.TensorArray(
      dtype=inputs.dtype, size=time_steps,
      tensor_array_name=base_name + "output")
  input_ta = tensor_array_ops.TensorArray(
      dtype=inputs.dtype, size=time_steps,
      tensor_array_name=base_name + "input")

  input_ta = input_ta.unpack(inputs)

  def _time_step(time, state, output_ta_t):
    """Take a time step of the dynamic RNN.

    Args:
      time: int32 scalar Tensor.
      state: Vector.
      output_ta_t: `TensorArray`, the output with existing flow.

    Returns:
      The tuple (time + 1, new_state, output_ta_t with updated flow).
    """
    input_t = input_ta.read(time)
    # Restore some shape information
    input_t.set_shape([const_batch_size, const_depth])

    call_cell = lambda: cell(input_t, state)

    if sequence_length is not None:
      (output, new_state) = _rnn_step(
          time=time, sequence_length=sequence_length,
          min_sequence_length=min_sequence_length,
          max_sequence_length=max_sequence_length,
          zero_output=zero_output, state=state,
          call_cell=call_cell, skip_conditionals=True)
    else:
      (output, new_state) = call_cell()

    output_ta_t = output_ta_t.write(time, output)

    return (time + 1, new_state, output_ta_t)

  # NOTE(review): uses the legacy `control_flow_ops.While` API rather than
  # `while_loop` -- presumably this predates that rename; confirm against
  # the TF version this file targets before modernizing.
  (unused_final_time, final_state, output_final_ta) = control_flow_ops.While(
      cond=lambda time, _1, _2: time < time_steps,
      body=_time_step,
      loop_vars=(time, state, output_ta),
      parallel_iterations=parallel_iterations,
      swap_memory=swap_memory)

  final_outputs = output_final_ta.pack()

  # Restore some shape information
  final_outputs.set_shape([
      const_time_steps, const_batch_size, cell.output_size])

  return (final_outputs, final_state)