def sample(self, seed=None):
  """Sample a matrix with the given spectrum.

  Args:
    seed: if set, use a constant random number generator to produce a
      sample; otherwise use built-in TensorFlow random numbers.

  Returns:
    The sampled matrix.
  """
  dims = self._spectrum.shape[0]
  if seed is not None:
    rand = contrib_stateless.stateless_random_uniform(
        shape=[dims, dims],
        dtype=tf.float32,
        # Arbitrary offset on seed to prevent overlap of random state.
        seed=[seed + 1233, seed + 341]) * 2 - 1
  else:
    rand = tf.random_uniform([dims, dims], -1., 1., dtype=tf.float32)
  q, r = tf.qr(rand, full_matrices=True)
  # Multiply by the sign of the diagonal to ensure a uniform distribution.
  q *= tf.sign(tf.matrix_diag_part(r))
  # Return q D q^T, where D is the diagonal matrix containing the spectrum.
  return tf.matmul(tf.matmul(q, tf.diag(self._spectrum)), q, transpose_b=True)
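# A minimal NumPy sketch (illustrative, not part of the original class) of
# the construction used in sample() above: take the QR decomposition of a
# random square matrix, flip the columns of Q by the sign of diag(R) so that
# Q is Haar (uniformly) distributed over the orthogonal group, then form
# Q diag(spectrum) Q^T. The eigenvalues of the result are exactly the
# requested spectrum.
import numpy as np

def sample_with_spectrum(spectrum, rng):
  dims = len(spectrum)
  rand = rng.uniform(-1., 1., size=(dims, dims))
  q, r = np.linalg.qr(rand)
  q *= np.sign(np.diag(r))  # Scales each column j by sign(r[j, j]).
  return q @ np.diag(spectrum) @ q.T

mat = sample_with_spectrum(np.array([3., 2., 1.]), np.random.default_rng(0))
assert np.allclose(np.linalg.eigvalsh(mat), [1., 2., 3.])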
def testRandomNormalIsFinite(self):
  with self.test_session() as sess, self.test_scope():
    for dtype in self._random_types():
      seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
      x = stateless.stateless_random_normal(
          shape=[10000], seed=seed_t, dtype=dtype)
      y = sess.run(x, {seed_t: [0x12345678, 0xabcdef12]})
      self.assertTrue(np.all(np.isfinite(y)))
def dropout_layer(seed, signal, keep_prob=0.5, training=False):
  # Returns (seed, signal).
  # Consume one two-element stateless seed; pass the rest downstream.
  s, seed = seed[:2], seed[2:]
  rand = stateless_random_uniform(tf.shape(signal), s)
  mask = tf.to_float(rand < keep_prob)
  return seed, tf.cond(
      training,
      lambda: (signal * mask) / tf.sqrt(keep_prob),
      lambda: signal)
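# Hypothetical usage of dropout_layer (assumes TF 1.x with
# stateless_random_uniform imported from tf.contrib.stateless): the seed
# vector is threaded through the network so each dropout layer consumes a
# fresh two-element stateless seed and reruns are reproducible.
seed = tf.constant([1, 2, 3, 4], dtype=tf.int32)  # Seeds for two layers.
signal = tf.ones([8, 32])
is_training = tf.constant(True)
seed, signal = dropout_layer(seed, signal, keep_prob=0.5, training=is_training)
seed, signal = dropout_layer(seed, signal, keep_prob=0.5, training=is_training)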
def testRandomUniformIsInRange(self):
  with self.cached_session() as sess, self.test_scope():
    for dtype in self._random_types():
      seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
      x = stateless.stateless_random_uniform(
          shape=[1000], seed=seed_t, dtype=dtype)
      y = sess.run(x, {seed_t: [0x12345678, 0xabcdef12]})
      self.assertTrue(np.all(y >= 0))
      self.assertTrue(np.all(y < 1))
def testDistributionOfStatelessRandomUniform(self):
  """Use Pearson's Chi-squared test to test for uniformity."""
  with self.test_session() as sess, self.test_scope():
    for dtype in self._random_types():
      seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
      n = 1000
      x = stateless.stateless_random_uniform(
          shape=[n], seed=seed_t, dtype=dtype)
      y = sess.run(x, {seed_t: [565656, 121212]})
      # Tests that the values are distributed amongst 10 bins with equal
      # probability. 16.92 is the Chi^2 value for 9 degrees of freedom with
      # p=0.05. This test is probabilistic and would be flaky if the random
      # seed were not fixed.
      self.assertTrue(self._chi_squared(y, 10) < 16.92)
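# A plausible implementation of the _chi_squared helper used above (the
# actual helper is not shown in this snippet): Pearson's statistic over
# equally spaced bins of [0, 1).
def _chi_squared(self, x, bins):
  counts, _ = np.histogram(x, bins=bins, range=(0., 1.))
  expected = float(len(x)) / bins
  return np.sum((counts - expected)**2 / expected)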
def _draw_n_rademacher_samples(n, seed=None):
  """Draws n Rademacher samples."""
  if seed is None:
    return tf.where(
        tf.random_uniform([n], dtype=settings.float_type) <= 0.5,
        tf.ones([n], dtype=settings.float_type),
        -1. * tf.ones([n], dtype=settings.float_type))
  else:
    return tf.where(
        stateless.stateless_random_uniform(
            [n], dtype=settings.float_type, seed=seed) <= 0.5,
        tf.ones([n], dtype=settings.float_type),
        -1. * tf.ones([n], dtype=settings.float_type))
def _draw_n_sparse_gaussian_samples(n, s, seed=None):
  """Draws n sparse Gaussian samples.

  Each sample is distributed as P(X = N(0, 1)) = 1/s, P(X = 0) = 1 - 1/s.
  """
  s = tf.cast(s, settings.float_type)
  if seed is None:
    return tf.where(
        tf.random_uniform([n], dtype=settings.float_type) <= 1. / s,
        tf.random_normal([n], dtype=settings.float_type),
        tf.zeros([n], dtype=settings.float_type))
  else:
    # Note: the same stateless seed drives both the selection mask and the
    # normal draws.
    return tf.where(
        stateless.stateless_random_uniform(
            [n], dtype=settings.float_type, seed=seed) <= 1. / s,
        stateless.stateless_random_normal(
            [n], dtype=settings.float_type, seed=seed),
        tf.zeros([n], dtype=settings.float_type))
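# Quick NumPy check (illustrative only) of the sparse Gaussian's moments:
# since X is N(0, 1) with probability 1/s and 0 otherwise, E[X] = 0 and
# E[X^2] = 1/s.
import numpy as np

rng = np.random.default_rng(0)
s, n = 4.0, 100000
x = np.where(rng.uniform(size=n) <= 1. / s, rng.normal(size=n), 0.)
assert abs(x.mean()) < 0.02
assert abs((x**2).mean() - 1. / s) < 0.02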
def test_gbm_euler_step_running_max_bridge_is_deterministic(self):
  drift = 0.2
  vol = 0.1
  t = 0.2
  dt = 0.01
  num_samples = 8
  key = 1337
  states_and_max = [tf.ones([num_samples])] * 2
  eps_t = contrib_stateless.stateless_random_normal(
      shape=[num_samples], seed=[2 * key, int(t / dt)])
  u_t = contrib_stateless.stateless_random_uniform(
      shape=[num_samples], seed=[2 * key + 1, int(t / dt)])
  next_states_and_max = dynamics.gbm_euler_step_running_max(
      states_and_max,
      drift,
      vol,
      t,
      dt,
      simulate_bridge=True,
      random_normal_op=lambda: eps_t,
      random_uniform_op=lambda: u_t)
  next_states_and_max_bis = dynamics.gbm_euler_step_running_max(
      states_and_max, drift, vol, t, dt, simulate_bridge=True, key=key)
  with self.session() as session:
    next_states_and_max_eval, next_states_and_max_bis_eval = session.run(
        (next_states_and_max, next_states_and_max_bis))
  next_states_eval, next_max_eval = next_states_and_max_eval
  next_states_bis_eval, next_max_bis_eval = next_states_and_max_bis_eval
  self.assertEqual(next_states_eval.shape, (num_samples,))
  self.assertEqual(next_states_bis_eval.shape, (num_samples,))
  self.assertEqual(next_max_eval.shape, (num_samples,))
  self.assertEqual(next_max_bis_eval.shape, (num_samples,))
  self.assertAllClose(next_states_eval, next_states_bis_eval)
  self.assertAllClose(next_max_eval, next_max_bis_eval)
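# For reference, a minimal sketch of the Euler step being tested (an
# illustration, not the dynamics library's implementation): the GBM update
# is S_{t+dt} = S_t * (1 + drift*dt + vol*sqrt(dt)*eps), where eps is drawn
# from a stateless normal keyed on (2*key, t/dt) so that reruns with the
# same key reproduce the same increment.
def gbm_euler_step_sketch(states, drift, vol, t, dt, key):
  eps_t = contrib_stateless.stateless_random_normal(
      shape=tf.shape(states), seed=[2 * key, int(t / dt)])
  return states * (1. + drift * dt + vol * (dt**0.5) * eps_t)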
def train(self, loss, predictions_dict, labels):
  """Grows a new tree and adds it to the ensemble.

  Args:
    loss: A scalar tensor representing average loss of examples.
    predictions_dict: Dictionary of Rank 2 `Tensor` representing information
      about predictions per example.
    labels: Rank 2 `Tensor` representing labels per example.

  Returns:
    An op that adds a new tree to the ensemble.

  Raises:
    ValueError: if inputs are not valid.
  """
  # Get the worker device from input dependencies.
  input_deps = (self._dense_floats + self._sparse_float_indices +
                self._sparse_int_indices)
  worker_device = input_deps[0].device

  # Get tensors relevant for training and form the loss.
  predictions = predictions_dict[PREDICTIONS]
  partition_ids = predictions_dict[PARTITION_IDS]
  ensemble_stamp = predictions_dict[ENSEMBLE_STAMP]
  gradients = gradients_impl.gradients(
      loss,
      predictions,
      name="Gradients",
      colocate_gradients_with_ops=False,
      gate_gradients=0,
      aggregation_method=None)[0]
  strategy = self._learner_config.multi_class_strategy
  class_id = -1

  # Handle different multiclass strategies.
  if strategy == learner_pb2.LearnerConfig.TREE_PER_CLASS:
    # We build one vs rest trees.
    gradient_shape = tensor_shape.scalar()
    hessian_shape = tensor_shape.scalar()
    if self._logits_dimension == 1:
      # We have only 1 score, gradients is of shape [batch, 1].
      hessians = gradients_impl.gradients(
          gradients,
          predictions,
          name="Hessian",
          colocate_gradients_with_ops=False,
          gate_gradients=0,
          aggregation_method=None)[0]
      squeezed_gradients = array_ops.squeeze(gradients, axis=[1])
      squeezed_hessians = array_ops.squeeze(hessians, axis=[1])
    else:
      hessian_list = self._diagonal_hessian(gradients, predictions)
      # Assemble hessian list into a tensor.
      hessians = array_ops.stack(hessian_list, axis=1)
      # Choose the class for which the tree is built (one vs rest).
      class_id = math_ops.to_int32(
          predictions_dict[NUM_TREES_ATTEMPTED] % self._logits_dimension)
      # Use class id tensor to get the column with that index from gradients
      # and hessians.
      squeezed_gradients = array_ops.squeeze(
          _get_column_by_index(gradients, class_id))
      squeezed_hessians = array_ops.squeeze(
          _get_column_by_index(hessians, class_id))
  else:
    # Other multiclass strategies.
    gradient_shape = tensor_shape.TensorShape([self._logits_dimension])
    if strategy == learner_pb2.LearnerConfig.FULL_HESSIAN:
      hessian_shape = tensor_shape.TensorShape(
          [self._logits_dimension, self._logits_dimension])
      hessian_list = self._full_hessian(gradients, predictions)
    else:
      # Diagonal hessian strategy.
      hessian_shape = tensor_shape.TensorShape([self._logits_dimension])
      hessian_list = self._diagonal_hessian(gradients, predictions)
    squeezed_gradients = gradients
    hessians = array_ops.stack(hessian_list, axis=1)
    squeezed_hessians = hessians

  # Get the weights for each example for quantiles calculation.
  weights = self._get_weights(hessian_shape, squeezed_hessians)

  regularization_config = self._learner_config.regularization
  min_node_weight = self._learner_config.constraints.min_node_weight

  # Create all handlers ensuring resources are evenly allocated across PS.
  fc_name_idx = 0
  handlers = []
  init_stamp_token = constant_op.constant(0, dtype=dtypes.int64)
  with ops.device(self._get_replica_device_setter(worker_device)):
    # Create handlers for dense float columns.
    for dense_float_column_idx in range(len(self._dense_floats)):
      fc_name = self._fc_names[fc_name_idx]
      handlers.append(
          ordinal_split_handler.DenseSplitHandler(
              l1_regularization=regularization_config.l1,
              l2_regularization=regularization_config.l2,
              tree_complexity_regularization=(
                  regularization_config.tree_complexity),
              min_node_weight=min_node_weight,
              feature_column_group_id=dense_float_column_idx,
              epsilon=0.01,
              num_quantiles=100,
              dense_float_column=self._dense_floats[dense_float_column_idx],
              name=fc_name,
              gradient_shape=gradient_shape,
              hessian_shape=hessian_shape,
              multiclass_strategy=strategy,
              init_stamp_token=init_stamp_token))
      fc_name_idx += 1

    # Create handlers for sparse float columns.
    for sparse_float_column_idx in range(len(self._sparse_float_indices)):
      fc_name = self._fc_names[fc_name_idx]
      handlers.append(
          ordinal_split_handler.SparseSplitHandler(
              l1_regularization=regularization_config.l1,
              l2_regularization=regularization_config.l2,
              tree_complexity_regularization=(
                  regularization_config.tree_complexity),
              min_node_weight=min_node_weight,
              feature_column_group_id=sparse_float_column_idx,
              epsilon=0.01,
              num_quantiles=100,
              sparse_float_column=sparse_tensor.SparseTensor(
                  self._sparse_float_indices[sparse_float_column_idx],
                  self._sparse_float_values[sparse_float_column_idx],
                  self._sparse_float_shapes[sparse_float_column_idx]),
              name=fc_name,
              gradient_shape=gradient_shape,
              hessian_shape=hessian_shape,
              multiclass_strategy=strategy,
              init_stamp_token=init_stamp_token))
      fc_name_idx += 1

    # Create handlers for sparse int columns.
    for sparse_int_column_idx in range(len(self._sparse_int_indices)):
      fc_name = self._fc_names[fc_name_idx]
      handlers.append(
          categorical_split_handler.EqualitySplitHandler(
              l1_regularization=regularization_config.l1,
              l2_regularization=regularization_config.l2,
              tree_complexity_regularization=(
                  regularization_config.tree_complexity),
              min_node_weight=min_node_weight,
              feature_column_group_id=sparse_int_column_idx,
              sparse_int_column=sparse_tensor.SparseTensor(
                  self._sparse_int_indices[sparse_int_column_idx],
                  self._sparse_int_values[sparse_int_column_idx],
                  self._sparse_int_shapes[sparse_int_column_idx]),
              name=fc_name,
              gradient_shape=gradient_shape,
              hessian_shape=hessian_shape,
              multiclass_strategy=strategy,
              init_stamp_token=init_stamp_token))
      fc_name_idx += 1

    # Create steps accumulator.
    steps_accumulator = stats_accumulator_ops.StatsAccumulator(
        stamp_token=0,
        gradient_shape=tensor_shape.scalar(),
        hessian_shape=tensor_shape.scalar(),
        name="StepsAccumulator")
    # Create bias stats accumulator.
    bias_stats_accumulator = stats_accumulator_ops.StatsAccumulator(
        stamp_token=0,
        gradient_shape=gradient_shape,
        hessian_shape=hessian_shape,
        name="BiasAccumulator")

    # Create ensemble stats variables.
    num_layer_examples = variables.Variable(
        initial_value=array_ops.zeros([], dtypes.int64),
        name="num_layer_examples",
        trainable=False)
    num_layer_steps = variables.Variable(
        initial_value=array_ops.zeros([], dtypes.int64),
        name="num_layer_steps",
        trainable=False)
    num_layers = variables.Variable(
        initial_value=array_ops.zeros([], dtypes.int64),
        name="num_layers",
        trainable=False)
    active_tree = variables.Variable(
        initial_value=array_ops.zeros([], dtypes.int64),
        name="active_tree",
        trainable=False)
    active_layer = variables.Variable(
        initial_value=array_ops.zeros([], dtypes.int64),
        name="active_layer",
        trainable=False)

    # Create ensemble stats summaries.
    summary.scalar("layer_stats/num_examples", num_layer_examples)
    summary.scalar("layer_stats/num_steps", num_layer_steps)
    summary.scalar("ensemble_stats/active_tree", active_tree)
    summary.scalar("ensemble_stats/active_layer", active_layer)

    # Update bias stats.
    stats_update_ops = []
    continue_centering = variables.Variable(
        initial_value=self._center_bias,
        name="continue_centering",
        trainable=False)
    stats_update_ops.append(
        control_flow_ops.cond(
            continue_centering,
            self._make_update_bias_stats_fn(ensemble_stamp, predictions,
                                            gradients,
                                            bias_stats_accumulator),
            control_flow_ops.no_op))

    # Update handler stats.
    handler_reads = {}
    for handler in handlers:
      handler_reads[handler] = handler.scheduled_reads()
    handler_results = batch_ops_utils.run_handler_scheduled_ops(
        handler_reads, ensemble_stamp, worker_device)
    per_handler_updates = {}
    # Two values per handler. First one is if the handler is active for the
    # current layer. The second one is if the handler is going to be active
    # for the next layer.
    subsampling_type = self._learner_config.WhichOneof("feature_fraction")
    if subsampling_type == "feature_fraction_per_level":
      seed = predictions_dict[NUM_LAYERS_ATTEMPTED]
      active_handlers_current_layer = stateless.stateless_random_uniform(
          shape=[len(handlers)], seed=[seed, 1])
      active_handlers_next_layer = stateless.stateless_random_uniform(
          shape=[len(handlers)], seed=[seed + 1, 1])
      active_handlers = array_ops.stack(
          [active_handlers_current_layer, active_handlers_next_layer], axis=1)
      active_handlers = (
          active_handlers < self._learner_config.feature_fraction_per_level)
    elif subsampling_type == "feature_fraction_per_tree":
      seed = predictions_dict[NUM_TREES_ATTEMPTED]
      active_handlers_current_layer = stateless.stateless_random_uniform(
          shape=[len(handlers)], seed=[seed, 2])
      active_handlers_current_layer = (
          active_handlers_current_layer <
          self._learner_config.feature_fraction_per_tree)
      # All handlers stay active for the next layer when subsampling per tree.
      active_handlers = array_ops.stack(
          [active_handlers_current_layer,
           array_ops.ones([len(handlers)], dtype=dtypes.bool)],
          axis=1)
    else:
      active_handlers = array_ops.ones([len(handlers), 2], dtype=dtypes.bool)

    # Prepare empty gradients and hessians when handlers are not ready.
    empty_hess_shape = [1] + hessian_shape.as_list()
    empty_grad_shape = [1] + gradient_shape.as_list()

    empty_gradients = constant_op.constant(
        [], dtype=dtypes.float32, shape=empty_grad_shape)
    empty_hessians = constant_op.constant(
        [], dtype=dtypes.float32, shape=empty_hess_shape)

    for handler_idx in range(len(handlers)):
      handler = handlers[handler_idx]
      is_active = active_handlers[handler_idx]
      updates, scheduled_updates = handler.update_stats(
          ensemble_stamp, partition_ids, squeezed_gradients,
          squeezed_hessians, empty_gradients, empty_hessians, weights,
          is_active, handler_results[handler])
      stats_update_ops.append(updates)
      per_handler_updates[handler] = scheduled_updates

    update_results = batch_ops_utils.run_handler_scheduled_ops(
        per_handler_updates, ensemble_stamp, worker_device)
    for update in update_results.values():
      stats_update_ops += update

    # Accumulate a step after updating stats.
    batch_size = math_ops.cast(array_ops.shape(labels)[0], dtypes.float32)
    with ops.control_dependencies(stats_update_ops):
      add_step_op = steps_accumulator.add(ensemble_stamp, [0], [[0, 0]],
                                          [batch_size], [1.0])

    # Determine learning rate.
    learning_rate_tuner = self._learner_config.learning_rate_tuner.WhichOneof(
        "tuner")
    if learning_rate_tuner == "fixed" or learning_rate_tuner == "dropout":
      tuner = getattr(self._learner_config.learning_rate_tuner,
                      learning_rate_tuner)
      learning_rate = tuner.learning_rate
    else:
      # TODO(nponomareva, soroush): do the line search.
      raise ValueError("Line search learning rate is not yet supported.")

    # After adding the step, decide if further processing is needed.
    ensemble_update_ops = [add_step_op]
    with ops.control_dependencies([add_step_op]):
      if self._is_chief:
        dropout_seed = predictions_dict[NUM_TREES_ATTEMPTED]

        # Get accumulated steps and examples for the current layer.
        _, _, _, _, acc_examples, acc_steps = steps_accumulator.serialize()
        acc_examples = math_ops.cast(acc_examples[0], dtypes.int64)
        acc_steps = math_ops.cast(acc_steps[0], dtypes.int64)
        ensemble_update_ops.append(num_layer_examples.assign(acc_examples))
        ensemble_update_ops.append(num_layer_steps.assign(acc_steps))

        # Determine whether we need to update the tree ensemble.
        examples_per_layer = self._examples_per_layer
        if callable(examples_per_layer):
          examples_per_layer = examples_per_layer(active_layer)
        ensemble_update_ops.append(
            control_flow_ops.cond(
                acc_examples >= examples_per_layer,
                self._make_update_ensemble_fn(
                    ensemble_stamp, steps_accumulator, bias_stats_accumulator,
                    continue_centering, learning_rate, handlers, num_layers,
                    active_tree, active_layer, dropout_seed, class_id),
                control_flow_ops.no_op))

    # Calculate the loss to be reported.
    # Note: the loss is computed from predictions with dropout applied, so
    # its value can look staggering across steps when the dropout ratio is
    # high; refer to eval_loss instead when judging convergence.
    return control_flow_ops.group(*ensemble_update_ops)
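# The feature-fraction branches above exploit the key property of stateless
# RNGs: the sample is a pure function of the seed, so seeding with the
# number of layers (or trees) attempted lets every worker compute an
# identical feature-subsampling mask with no shared RNG state. A standalone
# sketch of the idea (names are illustrative, not this class's API):
def active_feature_mask(num_handlers, layer_counter, feature_fraction):
  # Same layer_counter => same mask, on every worker and on every rerun.
  probs = stateless.stateless_random_uniform(
      shape=[num_handlers], seed=[layer_counter, 1])
  return probs < feature_fraction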
def train(self, loss, predictions_dict, labels):
  """Grows a new tree and adds it to the ensemble.

  Args:
    loss: A scalar tensor representing average loss of examples.
    predictions_dict: Dictionary of Rank 2 `Tensor` representing information
      about predictions per example.
    labels: Rank 2 `Tensor` representing labels per example.

  Returns:
    An op that adds a new tree to the ensemble.

  Raises:
    ValueError: if inputs are not valid.
  """
  # Get the worker device from input dependencies.
  input_deps = (self._dense_floats + self._sparse_float_indices +
                self._sparse_int_indices)
  worker_device = input_deps[0].device

  # Get tensors relevant for training and form the loss.
  predictions = predictions_dict[PREDICTIONS]
  partition_ids = predictions_dict[PARTITION_IDS]
  ensemble_stamp = predictions_dict[ENSEMBLE_STAMP]
  gradients = gradients_impl.gradients(
      loss,
      predictions,
      name="Gradients",
      colocate_gradients_with_ops=False,
      gate_gradients=0,
      aggregation_method=None)[0]
  strategy = self._learner_config.multi_class_strategy
  num_classes = self._learner_config.num_classes
  class_id = -1

  # Handle different multiclass strategies.
  if strategy == learner_pb2.LearnerConfig.TREE_PER_CLASS:
    # We build one vs rest trees.
    gradient_shape = tensor_shape.scalar()
    hessian_shape = tensor_shape.scalar()
    if num_classes == 2:
      # We have only 1 score, gradients is of shape [batch, 1].
      hessians = gradients_impl.gradients(
          gradients,
          predictions,
          name="Hessian",
          colocate_gradients_with_ops=False,
          gate_gradients=0,
          aggregation_method=None)[0]
      squeezed_gradients = array_ops.squeeze(gradients, axis=[1])
      squeezed_hessians = array_ops.squeeze(hessians, axis=[1])
    else:
      hessian_list = self._diagonal_hessian(gradients, predictions)
      # Assemble hessian list into a tensor.
      hessians = array_ops.stack(hessian_list, axis=1)
      # Choose the class for which the tree is built (one vs rest).
      class_id = predictions_dict[NUM_TREES_ATTEMPTED] % num_classes
      class_id = math_ops.to_int32(class_id)
      # Use class id tensor to get the column with that index from gradients
      # and hessians.
      squeezed_gradients = array_ops.squeeze(
          _get_column_by_index(gradients, class_id))
      squeezed_hessians = array_ops.squeeze(
          _get_column_by_index(hessians, class_id))
  else:
    # Other multiclass strategies.
    gradient_shape = tensor_shape.TensorShape([num_classes])
    if strategy == learner_pb2.LearnerConfig.FULL_HESSIAN:
      hessian_shape = tensor_shape.TensorShape([num_classes, num_classes])
      hessian_list = self._full_hessian(gradients, predictions)
    else:
      # Diagonal hessian strategy.
      hessian_shape = tensor_shape.TensorShape([num_classes])
      hessian_list = self._diagonal_hessian(gradients, predictions)
    squeezed_gradients = gradients
    hessians = array_ops.stack(hessian_list, axis=1)
    squeezed_hessians = hessians

  # Get the weights for each example for quantiles calculation.
  weights = self._get_weights(hessian_shape, squeezed_hessians)

  regularization_config = self._learner_config.regularization
  min_node_weight = self._learner_config.constraints.min_node_weight

  # Create all handlers ensuring resources are evenly allocated across PS.
  fc_name_idx = 0
  handlers = []
  init_stamp_token = constant_op.constant(0, dtype=dtypes.int64)
  with ops.device(self._get_replica_device_setter(worker_device)):
    # Create handlers for dense float columns.
    for dense_float_column_idx in range(len(self._dense_floats)):
      fc_name = self._fc_names[fc_name_idx]
      handlers.append(
          ordinal_split_handler.DenseSplitHandler(
              l1_regularization=regularization_config.l1,
              l2_regularization=regularization_config.l2,
              tree_complexity_regularization=(
                  regularization_config.tree_complexity),
              min_node_weight=min_node_weight,
              feature_column_group_id=dense_float_column_idx,
              epsilon=0.01,
              num_quantiles=100,
              dense_float_column=self._dense_floats[dense_float_column_idx],
              name=fc_name,
              gradient_shape=gradient_shape,
              hessian_shape=hessian_shape,
              multiclass_strategy=strategy,
              init_stamp_token=init_stamp_token))
      fc_name_idx += 1

    # Create handlers for sparse float columns.
    for sparse_float_column_idx in range(len(self._sparse_float_indices)):
      fc_name = self._fc_names[fc_name_idx]
      handlers.append(
          ordinal_split_handler.SparseSplitHandler(
              l1_regularization=regularization_config.l1,
              l2_regularization=regularization_config.l2,
              tree_complexity_regularization=(
                  regularization_config.tree_complexity),
              min_node_weight=min_node_weight,
              feature_column_group_id=sparse_float_column_idx,
              epsilon=0.01,
              num_quantiles=100,
              sparse_float_column=sparse_tensor.SparseTensor(
                  self._sparse_float_indices[sparse_float_column_idx],
                  self._sparse_float_values[sparse_float_column_idx],
                  self._sparse_float_shapes[sparse_float_column_idx]),
              name=fc_name,
              gradient_shape=gradient_shape,
              hessian_shape=hessian_shape,
              multiclass_strategy=strategy,
              init_stamp_token=init_stamp_token))
      fc_name_idx += 1

    # Create handlers for sparse int columns.
    for sparse_int_column_idx in range(len(self._sparse_int_indices)):
      fc_name = self._fc_names[fc_name_idx]
      handlers.append(
          categorical_split_handler.EqualitySplitHandler(
              l1_regularization=regularization_config.l1,
              l2_regularization=regularization_config.l2,
              tree_complexity_regularization=(
                  regularization_config.tree_complexity),
              min_node_weight=min_node_weight,
              feature_column_group_id=sparse_int_column_idx,
              sparse_int_column=sparse_tensor.SparseTensor(
                  self._sparse_int_indices[sparse_int_column_idx],
                  self._sparse_int_values[sparse_int_column_idx],
                  self._sparse_int_shapes[sparse_int_column_idx]),
              name=fc_name,
              gradient_shape=gradient_shape,
              hessian_shape=hessian_shape,
              multiclass_strategy=strategy,
              init_stamp_token=init_stamp_token))
      fc_name_idx += 1

    # Create steps accumulator.
    steps_accumulator = stats_accumulator_ops.StatsAccumulator(
        stamp_token=0,
        gradient_shape=tensor_shape.scalar(),
        hessian_shape=tensor_shape.scalar(),
        name="StepsAccumulator")
    # Create bias stats accumulator.
    bias_stats_accumulator = stats_accumulator_ops.StatsAccumulator(
        stamp_token=0,
        gradient_shape=gradient_shape,
        hessian_shape=hessian_shape,
        name="BiasAccumulator")

    # Create ensemble stats variables.
    num_layer_examples = variables.Variable(
        initial_value=array_ops.zeros([], dtypes.int64),
        name="num_layer_examples",
        trainable=False)
    num_layer_steps = variables.Variable(
        initial_value=array_ops.zeros([], dtypes.int64),
        name="num_layer_steps",
        trainable=False)
    num_layers = variables.Variable(
        initial_value=array_ops.zeros([], dtypes.int64),
        name="num_layers",
        trainable=False)
    active_tree = variables.Variable(
        initial_value=array_ops.zeros([], dtypes.int64),
        name="active_tree",
        trainable=False)
    active_layer = variables.Variable(
        initial_value=array_ops.zeros([], dtypes.int64),
        name="active_layer",
        trainable=False)

    # Create ensemble stats summaries.
    summary.scalar("layer_stats/num_examples", num_layer_examples)
    summary.scalar("layer_stats/num_steps", num_layer_steps)
    summary.scalar("ensemble_stats/active_tree", active_tree)
    summary.scalar("ensemble_stats/active_layer", active_layer)

    # Update bias stats.
    stats_update_ops = []
    continue_centering = variables.Variable(
        initial_value=self._center_bias,
        name="continue_centering",
        trainable=False)
    stats_update_ops.append(
        control_flow_ops.cond(
            continue_centering,
            self._make_update_bias_stats_fn(ensemble_stamp, predictions,
                                            gradients,
                                            bias_stats_accumulator),
            control_flow_ops.no_op))

    # Update handler stats.
    handler_reads = {}
    for handler in handlers:
      handler_reads[handler] = handler.scheduled_reads()
    handler_results = batch_ops_utils.run_handler_scheduled_ops(
        handler_reads, ensemble_stamp, worker_device)
    per_handler_updates = {}
    # Two values per handler. First one is if the handler is active for the
    # current layer. The second one is if the handler is going to be active
    # for the next layer.
    subsampling_type = self._learner_config.WhichOneof("feature_fraction")
    if subsampling_type == "feature_fraction_per_level":
      seed = predictions_dict[NUM_LAYERS_ATTEMPTED]
      active_handlers_current_layer = stateless.stateless_random_uniform(
          shape=[len(handlers)], seed=[seed, 1])
      active_handlers_next_layer = stateless.stateless_random_uniform(
          shape=[len(handlers)], seed=[seed + 1, 1])
      active_handlers = array_ops.stack(
          [active_handlers_current_layer, active_handlers_next_layer], axis=1)
      active_handlers = (
          active_handlers < self._learner_config.feature_fraction_per_level)
    elif subsampling_type == "feature_fraction_per_tree":
      seed = predictions_dict[NUM_TREES_ATTEMPTED]
      active_handlers_current_layer = stateless.stateless_random_uniform(
          shape=[len(handlers)], seed=[seed, 2])
      active_handlers_current_layer = (
          active_handlers_current_layer <
          self._learner_config.feature_fraction_per_tree)
      # All handlers stay active for the next layer when subsampling per tree.
      active_handlers = array_ops.stack(
          [active_handlers_current_layer,
           array_ops.ones([len(handlers)], dtype=dtypes.bool)],
          axis=1)
    else:
      active_handlers = array_ops.ones([len(handlers), 2], dtype=dtypes.bool)

    # Prepare empty gradients and hessians when handlers are not ready.
    empty_hess_shape = [1] + hessian_shape.as_list()
    empty_grad_shape = [1] + gradient_shape.as_list()

    empty_gradients = constant_op.constant(
        [], dtype=dtypes.float32, shape=empty_grad_shape)
    empty_hessians = constant_op.constant(
        [], dtype=dtypes.float32, shape=empty_hess_shape)

    for handler_idx in range(len(handlers)):
      handler = handlers[handler_idx]
      is_active = active_handlers[handler_idx]
      updates, scheduled_updates = handler.update_stats(
          ensemble_stamp, partition_ids, squeezed_gradients,
          squeezed_hessians, empty_gradients, empty_hessians, weights,
          is_active, handler_results[handler])
      stats_update_ops.append(updates)
      per_handler_updates[handler] = scheduled_updates

    update_results = batch_ops_utils.run_handler_scheduled_ops(
        per_handler_updates, ensemble_stamp, worker_device)
    for update in update_results.values():
      stats_update_ops += update

    # Accumulate a step after updating stats.
    batch_size = math_ops.cast(array_ops.shape(labels)[0], dtypes.float32)
    with ops.control_dependencies(stats_update_ops):
      add_step_op = steps_accumulator.add(ensemble_stamp, [0], [0],
                                          [batch_size], [1.0])

    # Determine learning rate.
    learning_rate_tuner = self._learner_config.learning_rate_tuner.WhichOneof(
        "tuner")
    if learning_rate_tuner == "fixed" or learning_rate_tuner == "dropout":
      tuner = getattr(self._learner_config.learning_rate_tuner,
                      learning_rate_tuner)
      learning_rate = tuner.learning_rate
    else:
      # TODO(nponomareva, soroush): do the line search.
      raise ValueError("Line search learning rate is not yet supported.")

    # After adding the step, decide if further processing is needed.
    ensemble_update_ops = [add_step_op]
    with ops.control_dependencies([add_step_op]):
      if self._is_chief:
        dropout_seed = predictions_dict[NUM_TREES_ATTEMPTED]

        # Get accumulated steps and examples for the current layer.
        _, _, _, _, acc_examples, acc_steps = steps_accumulator.serialize()
        acc_examples = math_ops.cast(acc_examples[0], dtypes.int64)
        acc_steps = math_ops.cast(acc_steps[0], dtypes.int64)
        ensemble_update_ops.append(num_layer_examples.assign(acc_examples))
        ensemble_update_ops.append(num_layer_steps.assign(acc_steps))

        # Determine whether we need to update the tree ensemble.
        examples_per_layer = self._examples_per_layer
        if callable(examples_per_layer):
          examples_per_layer = examples_per_layer(active_layer)
        ensemble_update_ops.append(
            control_flow_ops.cond(
                acc_examples >= examples_per_layer,
                self._make_update_ensemble_fn(
                    ensemble_stamp, steps_accumulator, bias_stats_accumulator,
                    continue_centering, learning_rate, handlers, num_layers,
                    active_tree, active_layer, dropout_seed, class_id),
                control_flow_ops.no_op))

    # Calculate the loss to be reported, using the predictions without
    # dropout.
    return control_flow_ops.group(*ensemble_update_ops)
def update_stats(self, loss, predictions_dict):
  """Update the accumulators with stats from this batch.

  Args:
    loss: A scalar tensor representing average loss of examples.
    predictions_dict: Dictionary of Rank 2 `Tensor` representing information
      about predictions per example.

  Returns:
    Stats update ops for the accumulators, and a dict of the training state.

  Raises:
    ValueError: if inputs are not valid.
  """
  # Get the worker device from input dependencies.
  input_deps = (self._dense_floats + self._sparse_float_indices +
                self._sparse_int_indices)
  worker_device = input_deps[0].device

  # Create ensemble stats variables.
  num_layer_examples = variables.Variable(
      initial_value=array_ops.zeros([], dtypes.int64),
      name="num_layer_examples",
      trainable=False)
  num_layer_steps = variables.Variable(
      initial_value=array_ops.zeros([], dtypes.int64),
      name="num_layer_steps",
      trainable=False)
  num_layers = variables.Variable(
      initial_value=array_ops.zeros([], dtypes.int64),
      name="num_layers",
      trainable=False)
  active_tree = variables.Variable(
      initial_value=array_ops.zeros([], dtypes.int64),
      name="active_tree",
      trainable=False)
  active_layer = variables.Variable(
      initial_value=array_ops.zeros([], dtypes.int64),
      name="active_layer",
      trainable=False)
  # Variable that becomes false once bias centering is done.
  continue_centering = variables.Variable(
      initial_value=self._center_bias,
      name="continue_centering",
      trainable=False)
  # Create bias stats accumulator.
  bias_stats_accumulator = stats_accumulator_ops.StatsAccumulator(
      stamp_token=0,
      gradient_shape=self._gradient_shape,
      hessian_shape=self._hessian_shape,
      name="BiasAccumulator")
  # Create steps accumulator.
  steps_accumulator = stats_accumulator_ops.StatsAccumulator(
      stamp_token=0,
      gradient_shape=tensor_shape.scalar(),
      hessian_shape=tensor_shape.scalar(),
      name="StepsAccumulator")

  # Get tensors relevant for training and form the loss.
  predictions = predictions_dict[PREDICTIONS]
  partition_ids = predictions_dict[PARTITION_IDS]
  ensemble_stamp = predictions_dict[ENSEMBLE_STAMP]
  gradients = gradients_impl.gradients(
      loss,
      predictions,
      name="Gradients",
      colocate_gradients_with_ops=False,
      gate_gradients=0,
      aggregation_method=None)[0]
  strategy = self._learner_config.multi_class_strategy
  class_id = self._get_class_id(predictions_dict)

  # Handle different multiclass strategies.
  if strategy == learner_pb2.LearnerConfig.TREE_PER_CLASS:
    # We build one vs rest trees.
    if self._logits_dimension == 1:
      # We have only 1 score, gradients is of shape [batch, 1].
      hessians = gradients_impl.gradients(
          gradients,
          predictions,
          name="Hessian",
          colocate_gradients_with_ops=False,
          gate_gradients=0,
          aggregation_method=None)[0]
      squeezed_gradients = array_ops.squeeze(gradients, axis=[1])
      squeezed_hessians = array_ops.squeeze(hessians, axis=[1])
    else:
      hessian_list = self._diagonal_hessian(gradients, predictions)
      # Assemble hessian list into a tensor.
      hessians = array_ops.stack(hessian_list, axis=1)
      # Use class id tensor to get the column with that index from gradients
      # and hessians.
      squeezed_gradients = array_ops.squeeze(
          _get_column_by_index(gradients, class_id))
      squeezed_hessians = array_ops.squeeze(
          _get_column_by_index(hessians, class_id))
  else:
    # Other multiclass strategies.
    if strategy == learner_pb2.LearnerConfig.FULL_HESSIAN:
      hessian_list = self._full_hessian(gradients, predictions)
    else:
      # Diagonal hessian strategy.
      hessian_list = self._diagonal_hessian(gradients, predictions)
    squeezed_gradients = gradients
    hessians = array_ops.stack(hessian_list, axis=1)
    squeezed_hessians = hessians

  # Get the weights for each example for quantiles calculation.
  weights = self._get_weights(self._hessian_shape, squeezed_hessians)

  # Create all handlers ensuring resources are evenly allocated across PS.
  fc_name_idx = 0
  handlers = []
  init_stamp_token = constant_op.constant(0, dtype=dtypes.int64)
  l1_regularization = constant_op.constant(
      self._learner_config.regularization.l1, dtypes.float32)
  l2_regularization = constant_op.constant(
      self._learner_config.regularization.l2, dtypes.float32)
  tree_complexity_regularization = constant_op.constant(
      self._learner_config.regularization.tree_complexity, dtypes.float32)
  min_node_weight = constant_op.constant(
      self._learner_config.constraints.min_node_weight, dtypes.float32)
  epsilon = 0.01
  num_quantiles = 100
  strategy_tensor = constant_op.constant(strategy)
  with ops.device(self._get_replica_device_setter(worker_device)):
    # Create handlers for dense float columns.
    for dense_float_column_idx in range(len(self._dense_floats)):
      fc_name = self._fc_names[fc_name_idx]
      handlers.append(
          ordinal_split_handler.DenseSplitHandler(
              l1_regularization=l1_regularization,
              l2_regularization=l2_regularization,
              tree_complexity_regularization=tree_complexity_regularization,
              min_node_weight=min_node_weight,
              feature_column_group_id=dense_float_column_idx,
              epsilon=epsilon,
              num_quantiles=num_quantiles,
              dense_float_column=self._dense_floats[dense_float_column_idx],
              name=fc_name,
              gradient_shape=self._gradient_shape,
              hessian_shape=self._hessian_shape,
              multiclass_strategy=strategy_tensor,
              init_stamp_token=init_stamp_token))
      fc_name_idx += 1

    # Create handlers for sparse float columns.
    for sparse_float_column_idx in range(len(self._sparse_float_indices)):
      fc_name = self._fc_names[fc_name_idx]
      handlers.append(
          ordinal_split_handler.SparseSplitHandler(
              l1_regularization=l1_regularization,
              l2_regularization=l2_regularization,
              tree_complexity_regularization=tree_complexity_regularization,
              min_node_weight=min_node_weight,
              feature_column_group_id=sparse_float_column_idx,
              epsilon=epsilon,
              num_quantiles=num_quantiles,
              sparse_float_column=sparse_tensor.SparseTensor(
                  self._sparse_float_indices[sparse_float_column_idx],
                  self._sparse_float_values[sparse_float_column_idx],
                  self._sparse_float_shapes[sparse_float_column_idx]),
              name=fc_name,
              gradient_shape=self._gradient_shape,
              hessian_shape=self._hessian_shape,
              multiclass_strategy=strategy_tensor,
              init_stamp_token=init_stamp_token))
      fc_name_idx += 1

    # Create handlers for sparse int columns.
    for sparse_int_column_idx in range(len(self._sparse_int_indices)):
      fc_name = self._fc_names[fc_name_idx]
      handlers.append(
          categorical_split_handler.EqualitySplitHandler(
              l1_regularization=l1_regularization,
              l2_regularization=l2_regularization,
              tree_complexity_regularization=tree_complexity_regularization,
              min_node_weight=min_node_weight,
              feature_column_group_id=sparse_int_column_idx,
              sparse_int_column=sparse_tensor.SparseTensor(
                  self._sparse_int_indices[sparse_int_column_idx],
                  self._sparse_int_values[sparse_int_column_idx],
                  self._sparse_int_shapes[sparse_int_column_idx]),
              name=fc_name,
              gradient_shape=self._gradient_shape,
              hessian_shape=self._hessian_shape,
              multiclass_strategy=strategy_tensor,
              init_stamp_token=init_stamp_token))
      fc_name_idx += 1

    # Create ensemble stats summaries.
summary.scalar("layer_stats/num_examples", num_layer_examples) summary.scalar("layer_stats/num_steps", num_layer_steps) summary.scalar("ensemble_stats/active_tree", active_tree) summary.scalar("ensemble_stats/active_layer", active_layer) # Update bias stats. stats_update_ops = [] stats_update_ops.append( control_flow_ops.cond( continue_centering, self._make_update_bias_stats_fn( ensemble_stamp, predictions, gradients, bias_stats_accumulator), control_flow_ops.no_op)) # Update handler stats. handler_reads = collections.OrderedDict() for handler in handlers: handler_reads[handler] = handler.scheduled_reads() handler_results = batch_ops_utils.run_handler_scheduled_ops( handler_reads, ensemble_stamp, worker_device) per_handler_updates = collections.OrderedDict() # Two values per handler. First one is if the handler is active for the # current layer. The second one is if the handler is going to be active # for the next layer. subsampling_type = self._learner_config.WhichOneof("feature_fraction") if subsampling_type == "feature_fraction_per_level": seed = predictions_dict[NUM_LAYERS_ATTEMPTED] active_handlers_current_layer = stateless.stateless_random_uniform( shape=[len(handlers)], seed=[seed, 1]) active_handlers_next_layer = stateless.stateless_random_uniform( shape=[len(handlers)], seed=[seed + 1, 1]) active_handlers = array_ops.stack( [active_handlers_current_layer, active_handlers_next_layer], axis=1) active_handlers = ( active_handlers < self._learner_config.feature_fraction_per_level) elif subsampling_type == "feature_fraction_per_tree": seed = predictions_dict[NUM_TREES_ATTEMPTED] active_handlers_current_layer = stateless.stateless_random_uniform( shape=[len(handlers)], seed=[seed, 2]) active_handlers_current_layer = ( active_handlers_current_layer < self._learner_config.feature_fraction_per_tree) active_handlers = array_ops.stack( [ active_handlers_current_layer, array_ops.ones([len(handlers)], dtype=dtypes.bool) ], axis=1) else: active_handlers = array_ops.ones([len(handlers), 2], dtype=dtypes.bool) if self._learner_config.constraints.max_number_of_unique_feature_columns: target = ( self._learner_config.constraints.max_number_of_unique_feature_columns) def _feature_selection_active_handlers(): # The active list for current and the next iteration. used_handlers = array_ops.reshape(predictions_dict[USED_HANDLERS_MASK], [-1, 1]) used_handlers = array_ops.concat([used_handlers, used_handlers], axis=1) return math_ops.logical_and(used_handlers, active_handlers) active_handlers = ( control_flow_ops.cond(predictions_dict[NUM_USED_HANDLERS] >= target, _feature_selection_active_handlers, lambda: active_handlers)) # Prepare empty gradients and hessians when handlers are not ready. 
    empty_hess_shape = [1] + self._hessian_shape.as_list()
    empty_grad_shape = [1] + self._gradient_shape.as_list()

    empty_gradients = constant_op.constant(
        [], dtype=dtypes.float32, shape=empty_grad_shape)
    empty_hessians = constant_op.constant(
        [], dtype=dtypes.float32, shape=empty_hess_shape)

    active_handlers = array_ops.unstack(active_handlers, axis=0)
    for handler_idx in range(len(handlers)):
      handler = handlers[handler_idx]
      is_active = active_handlers[handler_idx]
      updates, scheduled_updates = handler.update_stats(
          ensemble_stamp, partition_ids, squeezed_gradients,
          squeezed_hessians, empty_gradients, empty_hessians, weights,
          is_active, handler_results[handler])
      stats_update_ops.append(updates)
      per_handler_updates[handler] = scheduled_updates

    update_results = batch_ops_utils.run_handler_scheduled_ops(
        per_handler_updates, ensemble_stamp, worker_device)
    for update in update_results.values():
      stats_update_ops += update

    training_state = {
        _NUM_LAYER_EXAMPLES: num_layer_examples,
        _NUM_LAYER_STEPS: num_layer_steps,
        _NUM_LAYERS: num_layers,
        _ACTIVE_TREE: active_tree,
        _ACTIVE_LAYER: active_layer,
        _CONTINUE_CENTERING: continue_centering,
        _BIAS_STATS_ACCUMULATOR: bias_stats_accumulator,
        _STEPS_ACCUMULATOR: steps_accumulator,
        _HANDLERS: handlers
    }
    return stats_update_ops, training_state