Example #1
    def sample(self, seed=None):
        """Sample a matrix with the given spectrum.

        Args:
          seed: if seed is set, use a constant random number generator to
            produce a sample; otherwise use the built-in TensorFlow random
            numbers.

        Returns:
          The sampled matrix.
        """

        dims = self._spectrum.shape[0]
        if seed is not None:
            rand = contrib_stateless.stateless_random_uniform(
                shape=[dims, dims],
                dtype=tf.float32,
                # Arbitrary offset on seed to prevent overlap of random state.
                seed=[seed + 1233, seed + 341]) * 2 - 1
        else:
            rand = tf.random_uniform([dims, dims], -1., 1., dtype=tf.float32)
        q, r = tf.qr(rand, full_matrices=True)

        # Multiply by the sign of the diagonal to ensure a uniform distribution.
        q *= tf.sign(tf.matrix_diag_part(r))

        # qDq^T where D is a diagonal matrix containing the spectrum
        return tf.matmul(tf.matmul(q, tf.diag(self._spectrum)),
                         q,
                         transpose_b=True)
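For reference, the same construction can be sanity-checked outside the graph. The following is a minimal NumPy sketch (hypothetical, not part of the example above) that draws Q via QR with the same sign correction and verifies that Q D Q^T has the requested spectrum:

import numpy as np

def sample_with_spectrum(spectrum, rng):
    """Sample a random symmetric matrix whose eigenvalues equal `spectrum`."""
    dims = len(spectrum)
    rand = rng.uniform(-1.0, 1.0, size=(dims, dims))
    q, r = np.linalg.qr(rand)
    # Flipping each column by the sign of R's diagonal makes Q Haar (uniformly) distributed.
    q *= np.sign(np.diag(r))
    return q @ np.diag(spectrum) @ q.T

spectrum = np.array([3.0, 1.0, 0.5])
matrix = sample_with_spectrum(spectrum, np.random.default_rng(0))
print(np.allclose(np.sort(np.linalg.eigvalsh(matrix)), np.sort(spectrum)))  # True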
 def testRandomNormalIsFinite(self):
   with self.test_session() as sess, self.test_scope():
     for dtype in self._random_types():
       seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
       x = stateless.stateless_random_uniform(
           shape=[10000], seed=seed_t, dtype=dtype)
       y = sess.run(x, {seed_t: [0x12345678, 0xabcdef12]})
       self.assertTrue(np.all(np.isfinite(y)))
Example #3
def dropout_layer(seed, signal, keep_prob=0.5, training=False):
    # return seed, signal
    s, seed = seed[:2], seed[2:]
    rand = stateless_random_uniform(tf.shape(signal), s)
    mask = tf.to_float(rand < keep_prob)
    return seed, tf.cond(
        training,
        lambda: (signal * mask) / tf.sqrt(keep_prob),
        lambda: signal)
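Each call consumes the first two entries of `seed` and returns the remainder, so stacking several of these layers stays deterministic as long as the initial seed vector is long enough. A hedged TF1-style usage sketch (assuming the same `stateless_random_uniform` import that `dropout_layer` above relies on):

import tensorflow as tf

# Four seed entries cover two dropout layers (two entries per call).
seed = tf.constant([11, 12, 21, 22], dtype=tf.int32)
signal = tf.ones([4, 8])
is_training = tf.constant(True)

seed, signal = dropout_layer(seed, signal, keep_prob=0.5, training=is_training)
seed, signal = dropout_layer(seed, signal, keep_prob=0.5, training=is_training)

Note that this variant rescales the kept activations by 1/sqrt(keep_prob); the more common inverted-dropout convention divides by keep_prob instead.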
Example #5
 def testRandomUniformIsInRange(self):
   with self.cached_session() as sess, self.test_scope():
     for dtype in self._random_types():
       seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
       x = stateless.stateless_random_uniform(
           shape=[1000], seed=seed_t, dtype=dtype)
       y = sess.run(x, {seed_t: [0x12345678, 0xabcdef12]})
       self.assertTrue(np.all(y >= 0))
       self.assertTrue(np.all(y < 1))
 def testDistributionOfStatelessRandomUniform(self):
   """Use Pearson's Chi-squared test to test for uniformity."""
   with self.test_session() as sess, self.test_scope():
     for dtype in self._random_types():
       seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
       n = 1000
       x = stateless.stateless_random_uniform(
           shape=[n], seed=seed_t, dtype=dtype)
       y = sess.run(x, {seed_t: [565656, 121212]})
       # Tests that the values are distributed amongst 10 bins with equal
       # probability. 16.92 is the Chi^2 value for 9 degrees of freedom with
       # p=0.05. This test is probabilistic and would be flaky if the random
       # seed were not fixed.
       self.assertTrue(self._chi_squared(y, 10) < 16.92)
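The _chi_squared helper is not shown in these examples. A plausible minimal implementation, consistent with the call self._chi_squared(y, 10) above (assuming values in [0, 1) and equal-width bins; written here as a standalone function):

import numpy as np

def _chi_squared(x, bins):
    """Pearson chi-squared statistic of `x` against a uniform distribution on [0, 1)."""
    counts, _ = np.histogram(x, bins=bins, range=(0.0, 1.0))
    expected = len(x) / float(bins)
    return np.sum((counts - expected) ** 2 / expected)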
Example #7
def _draw_n_rademacher_samples(n, seed=None):
    """
    Draws n Rademacher samples.
    """
    if seed is None:
        return tf.where(
            tf.random_uniform([n], dtype=settings.float_type) <= 0.5,
            tf.ones([n], dtype=settings.float_type),
            -1. * tf.ones([n], dtype=settings.float_type))
    else:
        return tf.where(
            stateless.stateless_random_uniform(
                [n], dtype=settings.float_type, seed=seed) <= 0.5,
            tf.ones([n], dtype=settings.float_type),
            -1. * tf.ones([n], dtype=settings.float_type))
Example #8
def _draw_n_sparse_gaussian_samples(n, s, seed=None):
    """
    Draws n sparse Gaussian samples, i.e. P(X ~ N(0, 1)) = 1/s and
    P(X = 0) = 1 - 1/s.
    """
    s = tf.cast(s, settings.float_type)
    if seed is None:
        return tf.where(
            tf.random_uniform([n], dtype=settings.float_type) <= 1. / s,
            tf.random_normal([n], dtype=settings.float_type),
            tf.zeros([n], dtype=settings.float_type))
    else:
        return tf.where(
            stateless.stateless_random_uniform(
                [n], dtype=settings.float_type, seed=seed) <= 1. / s,
            stateless.stateless_random_normal([n],
                                              dtype=settings.float_type,
                                              seed=seed),
            tf.zeros([n], dtype=settings.float_type))
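Note that the same `seed` drives both the uniform mask and the normal values above, so the two draws share the same underlying random bits and are not statistically independent. If independent draws are wanted, one option is to derive two distinct sub-seeds, e.g. (a hedged sketch; _split_seed is hypothetical, and settings/stateless are the same modules used above):

def _split_seed(seed):
    """Derive two distinct [2]-shaped stateless seeds from one by offsetting the second entry."""
    return [seed[0], seed[1]], [seed[0], seed[1] + 1]


def _draw_n_sparse_gaussian_samples_independent(n, s, seed):
    """Like the example above, but with separate seeds for the mask and the values."""
    s = tf.cast(s, settings.float_type)
    mask_seed, value_seed = _split_seed(seed)
    return tf.where(
        stateless.stateless_random_uniform(
            [n], dtype=settings.float_type, seed=mask_seed) <= 1. / s,
        stateless.stateless_random_normal(
            [n], dtype=settings.float_type, seed=value_seed),
        tf.zeros([n], dtype=settings.float_type))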
    def test_gbm_euler_step_running_max_bridge_is_deterministic(self):
        drift = 0.2
        vol = 0.1
        t = 0.2
        dt = 0.01
        num_samples = 8
        key = 1337

        states_and_max = [tf.ones([num_samples])] * 2
        eps_t = contrib_stateless.stateless_random_normal(
            shape=[num_samples], seed=[2 * key, int(t / dt)])
        u_t = contrib_stateless.stateless_random_uniform(
            shape=[num_samples], seed=[2 * key + 1, int(t / dt)])

        next_states_and_max = dynamics.gbm_euler_step_running_max(
            states_and_max,
            drift,
            vol,
            t,
            dt,
            simulate_bridge=True,
            random_normal_op=lambda: eps_t,
            random_uniform_op=lambda: u_t)
        next_states_and_max_bis = dynamics.gbm_euler_step_running_max(
            states_and_max, drift, vol, t, dt, simulate_bridge=True, key=key)

        with self.session() as session:
            next_states_and_max_eval, next_states_and_max_bis_eval = session.run(
                (next_states_and_max, next_states_and_max_bis))

        next_states_eval, next_max_eval = next_states_and_max_eval
        next_states_bis_eval, next_max_bis_eval = next_states_and_max_bis_eval

        self.assertEqual(next_states_eval.shape, (num_samples, ))
        self.assertEqual(next_states_bis_eval.shape, (num_samples, ))

        self.assertEqual(next_max_eval.shape, (num_samples, ))
        self.assertEqual(next_max_bis_eval.shape, (num_samples, ))

        self.assertAllClose(next_states_eval, next_states_bis_eval)
        self.assertAllClose(next_max_eval, next_max_bis_eval)
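The seed pairs above follow a (stream, step) convention: the normal increments use stream 2 * key and the bridge uniforms use stream 2 * key + 1, both indexed by the time step int(t / dt). A small hypothetical helper (not part of the dynamics module) that makes the convention explicit:

def _step_seeds(key, t, dt):
    """Return the (normal_seed, uniform_seed) pairs for one step of the stream keyed by `key`."""
    step = int(t / dt)
    return [2 * key, step], [2 * key + 1, step]


normal_seed, uniform_seed = _step_seeds(key=1337, t=0.2, dt=0.01)
# These match the seed pairs built inline in the test above.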
Example #10
    def train(self, loss, predictions_dict, labels):
        """Grows a new tree and adds it to the ensemble.

        Args:
          loss: A scalar tensor representing average loss of examples.
          predictions_dict: Dictionary of Rank 2 `Tensor` representing
              information about predictions per example.
          labels: Rank 2 `Tensor` representing labels per example.

        Returns:
          An op that adds a new tree to the ensemble.

        Raises:
          ValueError: if inputs are not valid.
        """
        # Get the worker device from input dependencies.
        input_deps = (self._dense_floats + self._sparse_float_indices +
                      self._sparse_int_indices)
        worker_device = input_deps[0].device

        # Get tensors relevant for training and form the loss.
        predictions = predictions_dict[PREDICTIONS]
        partition_ids = predictions_dict[PARTITION_IDS]
        ensemble_stamp = predictions_dict[ENSEMBLE_STAMP]
        gradients = gradients_impl.gradients(loss,
                                             predictions,
                                             name="Gradients",
                                             colocate_gradients_with_ops=False,
                                             gate_gradients=0,
                                             aggregation_method=None)[0]
        strategy = self._learner_config.multi_class_strategy

        class_id = -1
        # Handle different multiclass strategies.
        if strategy == learner_pb2.LearnerConfig.TREE_PER_CLASS:
            # We build one vs rest trees.
            gradient_shape = tensor_shape.scalar()
            hessian_shape = tensor_shape.scalar()

            if self._logits_dimension == 1:
                # We have only 1 score, gradients is of shape [batch, 1].
                hessians = gradients_impl.gradients(
                    gradients,
                    predictions,
                    name="Hessian",
                    colocate_gradients_with_ops=False,
                    gate_gradients=0,
                    aggregation_method=None)[0]

                squeezed_gradients = array_ops.squeeze(gradients, axis=[1])
                squeezed_hessians = array_ops.squeeze(hessians, axis=[1])
            else:
                hessian_list = self._diagonal_hessian(gradients, predictions)
                # Assemble hessian list into a tensor.
                hessians = array_ops.stack(hessian_list, axis=1)

                # Choose the class for which the tree is built (one vs rest).
                class_id = math_ops.to_int32(
                    predictions_dict[NUM_TREES_ATTEMPTED] %
                    self._logits_dimension)

                # Use class id tensor to get the column with that index from gradients
                # and hessians.
                squeezed_gradients = array_ops.squeeze(
                    _get_column_by_index(gradients, class_id))
                squeezed_hessians = array_ops.squeeze(
                    _get_column_by_index(hessians, class_id))
        else:
            # Other multiclass strategies.
            gradient_shape = tensor_shape.TensorShape([self._logits_dimension])

            if strategy == learner_pb2.LearnerConfig.FULL_HESSIAN:
                hessian_shape = tensor_shape.TensorShape(
                    ([self._logits_dimension, self._logits_dimension]))
                hessian_list = self._full_hessian(gradients, predictions)
            else:
                # Diagonal hessian strategy.
                hessian_shape = tensor_shape.TensorShape(
                    ([self._logits_dimension]))
                hessian_list = self._diagonal_hessian(gradients, predictions)

            squeezed_gradients = gradients
            hessians = array_ops.stack(hessian_list, axis=1)
            squeezed_hessians = hessians

        # Get the weights for each example for quantile calculation.
        weights = self._get_weights(hessian_shape, squeezed_hessians)

        regularization_config = self._learner_config.regularization
        min_node_weight = self._learner_config.constraints.min_node_weight
        # Create all handlers ensuring resources are evenly allocated across PS.
        fc_name_idx = 0
        handlers = []
        init_stamp_token = constant_op.constant(0, dtype=dtypes.int64)
        with ops.device(self._get_replica_device_setter(worker_device)):
            # Create handlers for dense float columns
            for dense_float_column_idx in range(len(self._dense_floats)):
                fc_name = self._fc_names[fc_name_idx]
                handlers.append(
                    ordinal_split_handler.DenseSplitHandler(
                        l1_regularization=regularization_config.l1,
                        l2_regularization=regularization_config.l2,
                        tree_complexity_regularization=(
                            regularization_config.tree_complexity),
                        min_node_weight=min_node_weight,
                        feature_column_group_id=dense_float_column_idx,
                        epsilon=0.01,
                        num_quantiles=100,
                        dense_float_column=self._dense_floats[
                            dense_float_column_idx],
                        name=fc_name,
                        gradient_shape=gradient_shape,
                        hessian_shape=hessian_shape,
                        multiclass_strategy=strategy,
                        init_stamp_token=init_stamp_token))
                fc_name_idx += 1

            # Create handlers for sparse float columns.
            for sparse_float_column_idx in range(
                    len(self._sparse_float_indices)):
                fc_name = self._fc_names[fc_name_idx]
                handlers.append(
                    ordinal_split_handler.SparseSplitHandler(
                        l1_regularization=regularization_config.l1,
                        l2_regularization=regularization_config.l2,
                        tree_complexity_regularization=(
                            regularization_config.tree_complexity),
                        min_node_weight=min_node_weight,
                        feature_column_group_id=sparse_float_column_idx,
                        epsilon=0.01,
                        num_quantiles=100,
                        sparse_float_column=sparse_tensor.SparseTensor(
                            self._sparse_float_indices[
                                sparse_float_column_idx],
                            self._sparse_float_values[sparse_float_column_idx],
                            self._sparse_float_shapes[sparse_float_column_idx]
                        ),
                        name=fc_name,
                        gradient_shape=gradient_shape,
                        hessian_shape=hessian_shape,
                        multiclass_strategy=strategy,
                        init_stamp_token=init_stamp_token))
                fc_name_idx += 1

            # Create handlers for sparse int columns.
            for sparse_int_column_idx in range(len(self._sparse_int_indices)):
                fc_name = self._fc_names[fc_name_idx]
                handlers.append(
                    categorical_split_handler.EqualitySplitHandler(
                        l1_regularization=regularization_config.l1,
                        l2_regularization=regularization_config.l2,
                        tree_complexity_regularization=(
                            regularization_config.tree_complexity),
                        min_node_weight=min_node_weight,
                        feature_column_group_id=sparse_int_column_idx,
                        sparse_int_column=sparse_tensor.SparseTensor(
                            self._sparse_int_indices[sparse_int_column_idx],
                            self._sparse_int_values[sparse_int_column_idx],
                            self._sparse_int_shapes[sparse_int_column_idx]),
                        name=fc_name,
                        gradient_shape=gradient_shape,
                        hessian_shape=hessian_shape,
                        multiclass_strategy=strategy,
                        init_stamp_token=init_stamp_token))
                fc_name_idx += 1

            # Create steps accumulator.
            steps_accumulator = stats_accumulator_ops.StatsAccumulator(
                stamp_token=0,
                gradient_shape=tensor_shape.scalar(),
                hessian_shape=tensor_shape.scalar(),
                name="StepsAccumulator")

            # Create bias stats accumulator.
            bias_stats_accumulator = stats_accumulator_ops.StatsAccumulator(
                stamp_token=0,
                gradient_shape=gradient_shape,
                hessian_shape=hessian_shape,
                name="BiasAccumulator")

            # Create ensemble stats variables.
            num_layer_examples = variables.Variable(
                initial_value=array_ops.zeros([], dtypes.int64),
                name="num_layer_examples",
                trainable=False)
            num_layer_steps = variables.Variable(
                initial_value=array_ops.zeros([], dtypes.int64),
                name="num_layer_steps",
                trainable=False)
            num_layers = variables.Variable(
                initial_value=array_ops.zeros([], dtypes.int64),
                name="num_layers",
                trainable=False)
            active_tree = variables.Variable(
                initial_value=array_ops.zeros([], dtypes.int64),
                name="active_tree",
                trainable=False)
            active_layer = variables.Variable(
                initial_value=array_ops.zeros([], dtypes.int64),
                name="active_layer",
                trainable=False)

        # Create ensemble stats summaries.
        summary.scalar("layer_stats/num_examples", num_layer_examples)
        summary.scalar("layer_stats/num_steps", num_layer_steps)
        summary.scalar("ensemble_stats/active_tree", active_tree)
        summary.scalar("ensemble_stats/active_layer", active_layer)

        # Update bias stats.
        stats_update_ops = []
        continue_centering = variables.Variable(
            initial_value=self._center_bias,
            name="continue_centering",
            trainable=False)
        stats_update_ops.append(
            control_flow_ops.cond(
                continue_centering,
                self._make_update_bias_stats_fn(ensemble_stamp, predictions,
                                                gradients,
                                                bias_stats_accumulator),
                control_flow_ops.no_op))

        # Update handler stats.
        handler_reads = {}
        for handler in handlers:
            handler_reads[handler] = handler.scheduled_reads()

        handler_results = batch_ops_utils.run_handler_scheduled_ops(
            handler_reads, ensemble_stamp, worker_device)
        per_handler_updates = {}
        # Two values per handler. First one is if the handler is active for the
        # current layer. The second one is if the handler is going to be active
        # for the next layer.
        subsampling_type = self._learner_config.WhichOneof("feature_fraction")
        if subsampling_type == "feature_fraction_per_level":
            seed = predictions_dict[NUM_LAYERS_ATTEMPTED]
            active_handlers_current_layer = stateless.stateless_random_uniform(
                shape=[len(handlers)], seed=[seed, 1])
            active_handlers_next_layer = stateless.stateless_random_uniform(
                shape=[len(handlers)], seed=[seed + 1, 1])
            active_handlers = array_ops.stack(
                [active_handlers_current_layer, active_handlers_next_layer],
                axis=1)
            active_handlers = (active_handlers <
                               self._learner_config.feature_fraction_per_level)
        elif subsampling_type == "feature_fraction_per_tree":
            seed = predictions_dict[NUM_TREES_ATTEMPTED]
            active_handlers_current_layer = stateless.stateless_random_uniform(
                shape=[len(handlers)], seed=[seed, 2])
            active_handlers_current_layer = (
                active_handlers_current_layer <
                self._learner_config.feature_fraction_per_tree)
            active_handlers = array_ops.stack(
                [
                    active_handlers_current_layer,
                    array_ops.ones([len(handlers)], dtype=dtypes.bool)
                ],
                axis=1)
        else:
            active_handlers = array_ops.ones([len(handlers), 2],
                                             dtype=dtypes.bool)

        # Prepare empty gradients and hessians when handlers are not ready.
        empty_hess_shape = [1] + hessian_shape.as_list()
        empty_grad_shape = [1] + gradient_shape.as_list()

        empty_gradients = constant_op.constant([],
                                               dtype=dtypes.float32,
                                               shape=empty_grad_shape)
        empty_hessians = constant_op.constant([],
                                              dtype=dtypes.float32,
                                              shape=empty_hess_shape)

        for handler_idx in range(len(handlers)):
            handler = handlers[handler_idx]
            is_active = active_handlers[handler_idx]
            updates, scheduled_updates = handler.update_stats(
                ensemble_stamp, partition_ids, squeezed_gradients,
                squeezed_hessians, empty_gradients, empty_hessians, weights,
                is_active, handler_results[handler])
            stats_update_ops.append(updates)
            per_handler_updates[handler] = scheduled_updates

        update_results = batch_ops_utils.run_handler_scheduled_ops(
            per_handler_updates, ensemble_stamp, worker_device)
        for update in update_results.values():
            stats_update_ops += update
        # Accumulate a step after updating stats.
        batch_size = math_ops.cast(array_ops.shape(labels)[0], dtypes.float32)
        with ops.control_dependencies(stats_update_ops):
            add_step_op = steps_accumulator.add(ensemble_stamp, [0], [[0, 0]],
                                                [batch_size], [1.0])

        # Determine learning rate.
        learning_rate_tuner = self._learner_config.learning_rate_tuner.WhichOneof(
            "tuner")
        if learning_rate_tuner == "fixed" or learning_rate_tuner == "dropout":
            tuner = getattr(self._learner_config.learning_rate_tuner,
                            learning_rate_tuner)
            learning_rate = tuner.learning_rate
        else:
            # TODO (nponomareva, soroush) do the line search. id:498 gh:499
            raise ValueError("Line search learning rate is not yet supported.")

        # After adding the step, decide if further processing is needed.
        ensemble_update_ops = [add_step_op]
        with ops.control_dependencies([add_step_op]):
            if self._is_chief:
                dropout_seed = predictions_dict[NUM_TREES_ATTEMPTED]

                # Get accumulated steps and examples for the current layer.
                _, _, _, _, acc_examples, acc_steps = (
                    steps_accumulator.serialize())
                acc_examples = math_ops.cast(acc_examples[0], dtypes.int64)
                acc_steps = math_ops.cast(acc_steps[0], dtypes.int64)
                ensemble_update_ops.append(
                    num_layer_examples.assign(acc_examples))
                ensemble_update_ops.append(num_layer_steps.assign(acc_steps))
                # Determine whether we need to update tree ensemble.
                examples_per_layer = self._examples_per_layer
                if callable(examples_per_layer):
                    examples_per_layer = examples_per_layer(active_layer)
                ensemble_update_ops.append(
                    control_flow_ops.cond(
                        acc_examples >= examples_per_layer,
                        self._make_update_ensemble_fn(
                            ensemble_stamp, steps_accumulator,
                            bias_stats_accumulator, continue_centering,
                            learning_rate, handlers, num_layers, active_tree,
                            active_layer, dropout_seed, class_id),
                        control_flow_ops.no_op))

        # Calculate the loss to be reported.
        # Note: the loss is computed from predictions that include dropout, so
        # the reported value may look noisy across steps when the dropout ratio
        # is high. Refer to eval_loss instead when assessing convergence.
        return control_flow_ops.group(*ensemble_update_ops)
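The feature-fraction branches above draw a deterministic per-handler mask from a stateless uniform keyed on the number of layers attempted (per level) or trees attempted (per tree). A condensed, hedged sketch of just that masking step, using the public tf.random.stateless_uniform rather than the contrib module (standalone names, not the gbdt module's API):

import tensorflow as tf

def feature_fraction_mask(num_handlers, seed, fraction):
    """[num_handlers, 2] bool mask; column 0 is the current layer, column 1 the next layer."""
    current_layer = tf.random.stateless_uniform([num_handlers], seed=[seed, 1])
    next_layer = tf.random.stateless_uniform([num_handlers], seed=[seed + 1, 1])
    # A handler stays active when its uniform draw falls below the configured fraction.
    return tf.stack([current_layer, next_layer], axis=1) < fraction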
Example #11
  def train(self, loss, predictions_dict, labels):
    """Grows a new tree and adds it to the ensemble.

    Args:
      loss: A scalar tensor representing average loss of examples.
      predictions_dict: Dictionary of Rank 2 `Tensor` representing information
          about predictions per example.
      labels: Rank 2 `Tensor` representing labels per example.

    Returns:
      An op that adds a new tree to the ensemble.

    Raises:
      ValueError: if inputs are not valid.
    """
    # Get the worker device from input dependencies.
    input_deps = (self._dense_floats + self._sparse_float_indices +
                  self._sparse_int_indices)
    worker_device = input_deps[0].device

    # Get tensors relevant for training and form the loss.
    predictions = predictions_dict[PREDICTIONS]
    partition_ids = predictions_dict[PARTITION_IDS]
    ensemble_stamp = predictions_dict[ENSEMBLE_STAMP]
    gradients = gradients_impl.gradients(
        loss,
        predictions,
        name="Gradients",
        colocate_gradients_with_ops=False,
        gate_gradients=0,
        aggregation_method=None)[0]
    strategy = self._learner_config.multi_class_strategy
    num_classes = self._learner_config.num_classes

    class_id = -1
    # Handle different multiclass strategies.
    if strategy == learner_pb2.LearnerConfig.TREE_PER_CLASS:
      # We build one vs rest trees.
      gradient_shape = tensor_shape.scalar()
      hessian_shape = tensor_shape.scalar()

      if num_classes == 2:
        # We have only 1 score, gradients is of shape [batch, 1].
        hessians = gradients_impl.gradients(
            gradients,
            predictions,
            name="Hessian",
            colocate_gradients_with_ops=False,
            gate_gradients=0,
            aggregation_method=None)[0]

        squeezed_gradients = array_ops.squeeze(gradients, axis=[1])
        squeezed_hessians = array_ops.squeeze(hessians, axis=[1])
      else:
        hessian_list = self._diagonal_hessian(gradients, predictions)
        # Assemble hessian list into a tensor.
        hessians = array_ops.stack(hessian_list, axis=1)

        # Choose the class for which the tree is built (one vs rest).
        class_id = predictions_dict[NUM_TREES_ATTEMPTED] % num_classes
        class_id = math_ops.to_int32(class_id)

        # Use class id tensor to get the column with that index from gradients
        # and hessians.
        squeezed_gradients = array_ops.squeeze(
            _get_column_by_index(gradients, class_id))
        squeezed_hessians = array_ops.squeeze(
            _get_column_by_index(hessians, class_id))
    else:
      # Other multiclass strategies.
      gradient_shape = tensor_shape.TensorShape([num_classes])

      if strategy == learner_pb2.LearnerConfig.FULL_HESSIAN:
        hessian_shape = tensor_shape.TensorShape(([num_classes, num_classes]))
        hessian_list = self._full_hessian(gradients, predictions)
      else:
        # Diagonal hessian strategy.
        hessian_shape = tensor_shape.TensorShape(([num_classes]))
        hessian_list = self._diagonal_hessian(gradients, predictions)

      squeezed_gradients = gradients
      hessians = array_ops.stack(hessian_list, axis=1)
      squeezed_hessians = hessians

    # Get the weights for each example for quantile calculation.
    weights = self._get_weights(hessian_shape, squeezed_hessians)

    regularization_config = self._learner_config.regularization
    min_node_weight = self._learner_config.constraints.min_node_weight
    # Create all handlers ensuring resources are evenly allocated across PS.
    fc_name_idx = 0
    handlers = []
    init_stamp_token = constant_op.constant(0, dtype=dtypes.int64)
    with ops.device(self._get_replica_device_setter(worker_device)):
      # Create handlers for dense float columns
      for dense_float_column_idx in range(len(self._dense_floats)):
        fc_name = self._fc_names[fc_name_idx]
        handlers.append(
            ordinal_split_handler.DenseSplitHandler(
                l1_regularization=regularization_config.l1,
                l2_regularization=regularization_config.l2,
                tree_complexity_regularization=(
                    regularization_config.tree_complexity),
                min_node_weight=min_node_weight,
                feature_column_group_id=dense_float_column_idx,
                epsilon=0.01,
                num_quantiles=100,
                dense_float_column=self._dense_floats[dense_float_column_idx],
                name=fc_name,
                gradient_shape=gradient_shape,
                hessian_shape=hessian_shape,
                multiclass_strategy=strategy,
                init_stamp_token=init_stamp_token))
        fc_name_idx += 1

      # Create handlers for sparse float columns.
      for sparse_float_column_idx in range(len(self._sparse_float_indices)):
        fc_name = self._fc_names[fc_name_idx]
        handlers.append(
            ordinal_split_handler.SparseSplitHandler(
                l1_regularization=regularization_config.l1,
                l2_regularization=regularization_config.l2,
                tree_complexity_regularization=(
                    regularization_config.tree_complexity),
                min_node_weight=min_node_weight,
                feature_column_group_id=sparse_float_column_idx,
                epsilon=0.01,
                num_quantiles=100,
                sparse_float_column=sparse_tensor.SparseTensor(
                    self._sparse_float_indices[sparse_float_column_idx],
                    self._sparse_float_values[sparse_float_column_idx],
                    self._sparse_float_shapes[sparse_float_column_idx]),
                name=fc_name,
                gradient_shape=gradient_shape,
                hessian_shape=hessian_shape,
                multiclass_strategy=strategy,
                init_stamp_token=init_stamp_token))
        fc_name_idx += 1

      # Create handlers for sparse int columns.
      for sparse_int_column_idx in range(len(self._sparse_int_indices)):
        fc_name = self._fc_names[fc_name_idx]
        handlers.append(
            categorical_split_handler.EqualitySplitHandler(
                l1_regularization=regularization_config.l1,
                l2_regularization=regularization_config.l2,
                tree_complexity_regularization=(
                    regularization_config.tree_complexity),
                min_node_weight=min_node_weight,
                feature_column_group_id=sparse_int_column_idx,
                sparse_int_column=sparse_tensor.SparseTensor(
                    self._sparse_int_indices[sparse_int_column_idx],
                    self._sparse_int_values[sparse_int_column_idx],
                    self._sparse_int_shapes[sparse_int_column_idx]),
                name=fc_name,
                gradient_shape=gradient_shape,
                hessian_shape=hessian_shape,
                multiclass_strategy=strategy,
                init_stamp_token=init_stamp_token))
        fc_name_idx += 1

      # Create steps accumulator.
      steps_accumulator = stats_accumulator_ops.StatsAccumulator(
          stamp_token=0,
          gradient_shape=tensor_shape.scalar(),
          hessian_shape=tensor_shape.scalar(),
          name="StepsAccumulator")

      # Create bias stats accumulator.
      bias_stats_accumulator = stats_accumulator_ops.StatsAccumulator(
          stamp_token=0,
          gradient_shape=gradient_shape,
          hessian_shape=hessian_shape,
          name="BiasAccumulator")

      # Create ensemble stats variables.
      num_layer_examples = variables.Variable(
          initial_value=array_ops.zeros([], dtypes.int64),
          name="num_layer_examples",
          trainable=False)
      num_layer_steps = variables.Variable(
          initial_value=array_ops.zeros([], dtypes.int64),
          name="num_layer_steps",
          trainable=False)
      num_layers = variables.Variable(
          initial_value=array_ops.zeros([], dtypes.int64),
          name="num_layers",
          trainable=False)
      active_tree = variables.Variable(
          initial_value=array_ops.zeros([], dtypes.int64),
          name="active_tree",
          trainable=False)
      active_layer = variables.Variable(
          initial_value=array_ops.zeros([], dtypes.int64),
          name="active_layer",
          trainable=False)

    # Create ensemble stats summaries.
    summary.scalar("layer_stats/num_examples", num_layer_examples)
    summary.scalar("layer_stats/num_steps", num_layer_steps)
    summary.scalar("ensemble_stats/active_tree", active_tree)
    summary.scalar("ensemble_stats/active_layer", active_layer)

    # Update bias stats.
    stats_update_ops = []
    continue_centering = variables.Variable(
        initial_value=self._center_bias,
        name="continue_centering",
        trainable=False)
    stats_update_ops.append(
        control_flow_ops.cond(continue_centering,
                              self._make_update_bias_stats_fn(
                                  ensemble_stamp, predictions, gradients,
                                  bias_stats_accumulator),
                              control_flow_ops.no_op))

    # Update handler stats.
    handler_reads = {}
    for handler in handlers:
      handler_reads[handler] = handler.scheduled_reads()

    handler_results = batch_ops_utils.run_handler_scheduled_ops(
        handler_reads, ensemble_stamp, worker_device)
    per_handler_updates = {}
    # Two values per handler. First one is if the handler is active for the
    # current layer. The second one is if the handler is going to be active
    # for the next layer.
    subsampling_type = self._learner_config.WhichOneof("feature_fraction")
    if subsampling_type == "feature_fraction_per_level":
      seed = predictions_dict[NUM_LAYERS_ATTEMPTED]
      active_handlers_current_layer = stateless.stateless_random_uniform(
          shape=[len(handlers)], seed=[seed, 1])
      active_handlers_next_layer = stateless.stateless_random_uniform(
          shape=[len(handlers)], seed=[seed + 1, 1])
      active_handlers = array_ops.stack(
          [active_handlers_current_layer, active_handlers_next_layer], axis=1)
      active_handlers = (active_handlers <
                         self._learner_config.feature_fraction_per_level)
    elif subsampling_type == "feature_fraction_per_tree":
      seed = predictions_dict[NUM_TREES_ATTEMPTED]
      active_handlers_current_layer = stateless.stateless_random_uniform(
          shape=[len(handlers)], seed=[seed, 2])
      active_handlers_current_layer = (
          active_handlers_current_layer <
          self._learner_config.feature_fraction_per_tree)
      active_handlers = array_ops.stack(
          [
              active_handlers_current_layer,
              array_ops.ones([len(handlers)], dtype=dtypes.bool)
          ],
          axis=1)
    else:
      active_handlers = array_ops.ones([len(handlers), 2], dtype=dtypes.bool)

    # Prepare empty gradients and hessians when handlers are not ready.
    empty_hess_shape = [1] + hessian_shape.as_list()
    empty_grad_shape = [1] + gradient_shape.as_list()

    empty_gradients = constant_op.constant(
        [], dtype=dtypes.float32, shape=empty_grad_shape)
    empty_hessians = constant_op.constant(
        [], dtype=dtypes.float32, shape=empty_hess_shape)

    for handler_idx in range(len(handlers)):
      handler = handlers[handler_idx]
      is_active = active_handlers[handler_idx]
      updates, scheduled_updates = handler.update_stats(
          ensemble_stamp, partition_ids, squeezed_gradients, squeezed_hessians,
          empty_gradients, empty_hessians, weights, is_active,
          handler_results[handler])
      stats_update_ops.append(updates)
      per_handler_updates[handler] = scheduled_updates

    update_results = batch_ops_utils.run_handler_scheduled_ops(
        per_handler_updates, ensemble_stamp, worker_device)
    for update in update_results.values():
      stats_update_ops += update
    # Accumulate a step after updating stats.
    batch_size = math_ops.cast(array_ops.shape(labels)[0], dtypes.float32)
    with ops.control_dependencies(stats_update_ops):
      add_step_op = steps_accumulator.add(ensemble_stamp, [0], [0],
                                          [batch_size], [1.0])

    # Determine learning rate.
    learning_rate_tuner = self._learner_config.learning_rate_tuner.WhichOneof(
        "tuner")
    if learning_rate_tuner == "fixed" or learning_rate_tuner == "dropout":
      tuner = getattr(self._learner_config.learning_rate_tuner,
                      learning_rate_tuner)
      learning_rate = tuner.learning_rate
    else:
      # TODO(nponomareva, soroush) do the line search.
      raise ValueError("Line search learning rate is not yet supported.")

    # After adding the step, decide if further processing is needed.
    ensemble_update_ops = [add_step_op]
    with ops.control_dependencies([add_step_op]):
      if self._is_chief:
        dropout_seed = predictions_dict[NUM_TREES_ATTEMPTED]

        # Get accumulated steps and examples for the current layer.
        _, _, _, _, acc_examples, acc_steps = steps_accumulator.serialize()
        acc_examples = math_ops.cast(acc_examples[0], dtypes.int64)
        acc_steps = math_ops.cast(acc_steps[0], dtypes.int64)
        ensemble_update_ops.append(num_layer_examples.assign(acc_examples))
        ensemble_update_ops.append(num_layer_steps.assign(acc_steps))
        # Determine whether we need to update tree ensemble.
        examples_per_layer = self._examples_per_layer
        if callable(examples_per_layer):
          examples_per_layer = examples_per_layer(active_layer)
        ensemble_update_ops.append(
            control_flow_ops.cond(
                acc_examples >= examples_per_layer,
                self._make_update_ensemble_fn(
                    ensemble_stamp, steps_accumulator, bias_stats_accumulator,
                    continue_centering, learning_rate, handlers, num_layers,
                    active_tree, active_layer, dropout_seed, class_id),
                control_flow_ops.no_op))

    # Calculate the loss to be reported - use the predictions without dropout.
    return control_flow_ops.group(*ensemble_update_ops)
Example #12
  def update_stats(self, loss, predictions_dict):
    """Update the accumulators with stats from this batch.

    Args:
      loss: A scalar tensor representing average loss of examples.
      predictions_dict: Dictionary of Rank 2 `Tensor` representing information
          about predictions per example.

    Returns:
      A list of stats update ops and a dict holding the training state.

    Raises:
      ValueError: if inputs are not valid.
    """
    # Get the worker device from input dependencies.
    input_deps = (
        self._dense_floats + self._sparse_float_indices +
        self._sparse_int_indices)
    worker_device = input_deps[0].device
    # Create ensemble stats variables.
    num_layer_examples = variables.Variable(
        initial_value=array_ops.zeros([], dtypes.int64),
        name="num_layer_examples",
        trainable=False)
    num_layer_steps = variables.Variable(
        initial_value=array_ops.zeros([], dtypes.int64),
        name="num_layer_steps",
        trainable=False)
    num_layers = variables.Variable(
        initial_value=array_ops.zeros([], dtypes.int64),
        name="num_layers",
        trainable=False)
    active_tree = variables.Variable(
        initial_value=array_ops.zeros([], dtypes.int64),
        name="active_tree",
        trainable=False)
    active_layer = variables.Variable(
        initial_value=array_ops.zeros([], dtypes.int64),
        name="active_layer",
        trainable=False)
    # Variable that becomes false once bias centering is done.
    continue_centering = variables.Variable(
        initial_value=self._center_bias,
        name="continue_centering",
        trainable=False)
    # Create bias stats accumulator.
    bias_stats_accumulator = stats_accumulator_ops.StatsAccumulator(
        stamp_token=0,
        gradient_shape=self._gradient_shape,
        hessian_shape=self._hessian_shape,
        name="BiasAccumulator")
    # Create steps accumulator.
    steps_accumulator = stats_accumulator_ops.StatsAccumulator(
        stamp_token=0,
        gradient_shape=tensor_shape.scalar(),
        hessian_shape=tensor_shape.scalar(),
        name="StepsAccumulator")

    # Get tensors relevant for training and form the loss.
    predictions = predictions_dict[PREDICTIONS]
    partition_ids = predictions_dict[PARTITION_IDS]
    ensemble_stamp = predictions_dict[ENSEMBLE_STAMP]
    gradients = gradients_impl.gradients(
        loss,
        predictions,
        name="Gradients",
        colocate_gradients_with_ops=False,
        gate_gradients=0,
        aggregation_method=None)[0]
    strategy = self._learner_config.multi_class_strategy

    class_id = self._get_class_id(predictions_dict)
    # Handle different multiclass strategies.
    if strategy == learner_pb2.LearnerConfig.TREE_PER_CLASS:
      # We build one vs rest trees.
      if self._logits_dimension == 1:
        # We have only 1 score, gradients is of shape [batch, 1].
        hessians = gradients_impl.gradients(
            gradients,
            predictions,
            name="Hessian",
            colocate_gradients_with_ops=False,
            gate_gradients=0,
            aggregation_method=None)[0]

        squeezed_gradients = array_ops.squeeze(gradients, axis=[1])
        squeezed_hessians = array_ops.squeeze(hessians, axis=[1])
      else:
        hessian_list = self._diagonal_hessian(gradients, predictions)
        # Assemble hessian list into a tensor.
        hessians = array_ops.stack(hessian_list, axis=1)
        # Use class id tensor to get the column with that index from gradients
        # and hessians.
        squeezed_gradients = array_ops.squeeze(
            _get_column_by_index(gradients, class_id))
        squeezed_hessians = array_ops.squeeze(
            _get_column_by_index(hessians, class_id))
    else:
      # Other multiclass strategies.
      if strategy == learner_pb2.LearnerConfig.FULL_HESSIAN:
        hessian_list = self._full_hessian(gradients, predictions)
      else:
        # Diagonal hessian strategy.
        hessian_list = self._diagonal_hessian(gradients, predictions)

      squeezed_gradients = gradients
      hessians = array_ops.stack(hessian_list, axis=1)
      squeezed_hessians = hessians

    # Get the weights for each example for quantile calculation.
    weights = self._get_weights(self._hessian_shape, squeezed_hessians)

    # Create all handlers ensuring resources are evenly allocated across PS.
    fc_name_idx = 0
    handlers = []
    init_stamp_token = constant_op.constant(0, dtype=dtypes.int64)
    l1_regularization = constant_op.constant(
        self._learner_config.regularization.l1, dtypes.float32)
    l2_regularization = constant_op.constant(
        self._learner_config.regularization.l2, dtypes.float32)
    tree_complexity_regularization = constant_op.constant(
        self._learner_config.regularization.tree_complexity, dtypes.float32)
    min_node_weight = constant_op.constant(
        self._learner_config.constraints.min_node_weight, dtypes.float32)
    epsilon = 0.01
    num_quantiles = 100
    strategy_tensor = constant_op.constant(strategy)
    with ops.device(self._get_replica_device_setter(worker_device)):
      # Create handlers for dense float columns
      for dense_float_column_idx in range(len(self._dense_floats)):
        fc_name = self._fc_names[fc_name_idx]
        handlers.append(
            ordinal_split_handler.DenseSplitHandler(
                l1_regularization=l1_regularization,
                l2_regularization=l2_regularization,
                tree_complexity_regularization=tree_complexity_regularization,
                min_node_weight=min_node_weight,
                feature_column_group_id=dense_float_column_idx,
                epsilon=epsilon,
                num_quantiles=num_quantiles,
                dense_float_column=self._dense_floats[dense_float_column_idx],
                name=fc_name,
                gradient_shape=self._gradient_shape,
                hessian_shape=self._hessian_shape,
                multiclass_strategy=strategy_tensor,
                init_stamp_token=init_stamp_token))
        fc_name_idx += 1

      # Create handlers for sparse float columns.
      for sparse_float_column_idx in range(len(self._sparse_float_indices)):
        fc_name = self._fc_names[fc_name_idx]
        handlers.append(
            ordinal_split_handler.SparseSplitHandler(
                l1_regularization=l1_regularization,
                l2_regularization=l2_regularization,
                tree_complexity_regularization=tree_complexity_regularization,
                min_node_weight=min_node_weight,
                feature_column_group_id=sparse_float_column_idx,
                epsilon=epsilon,
                num_quantiles=num_quantiles,
                sparse_float_column=sparse_tensor.SparseTensor(
                    self._sparse_float_indices[sparse_float_column_idx],
                    self._sparse_float_values[sparse_float_column_idx],
                    self._sparse_float_shapes[sparse_float_column_idx]),
                name=fc_name,
                gradient_shape=self._gradient_shape,
                hessian_shape=self._hessian_shape,
                multiclass_strategy=strategy_tensor,
                init_stamp_token=init_stamp_token))
        fc_name_idx += 1

      # Create handlers for sparse int columns.
      for sparse_int_column_idx in range(len(self._sparse_int_indices)):
        fc_name = self._fc_names[fc_name_idx]
        handlers.append(
            categorical_split_handler.EqualitySplitHandler(
                l1_regularization=l1_regularization,
                l2_regularization=l2_regularization,
                tree_complexity_regularization=tree_complexity_regularization,
                min_node_weight=min_node_weight,
                feature_column_group_id=sparse_int_column_idx,
                sparse_int_column=sparse_tensor.SparseTensor(
                    self._sparse_int_indices[sparse_int_column_idx],
                    self._sparse_int_values[sparse_int_column_idx],
                    self._sparse_int_shapes[sparse_int_column_idx]),
                name=fc_name,
                gradient_shape=self._gradient_shape,
                hessian_shape=self._hessian_shape,
                multiclass_strategy=strategy_tensor,
                init_stamp_token=init_stamp_token))
        fc_name_idx += 1

    # Create ensemble stats summaries.
    summary.scalar("layer_stats/num_examples", num_layer_examples)
    summary.scalar("layer_stats/num_steps", num_layer_steps)
    summary.scalar("ensemble_stats/active_tree", active_tree)
    summary.scalar("ensemble_stats/active_layer", active_layer)

    # Update bias stats.
    stats_update_ops = []

    stats_update_ops.append(
        control_flow_ops.cond(
            continue_centering,
            self._make_update_bias_stats_fn(
                ensemble_stamp, predictions, gradients,
                bias_stats_accumulator), control_flow_ops.no_op))

    # Update handler stats.
    handler_reads = collections.OrderedDict()
    for handler in handlers:
      handler_reads[handler] = handler.scheduled_reads()

    handler_results = batch_ops_utils.run_handler_scheduled_ops(
        handler_reads, ensemble_stamp, worker_device)
    per_handler_updates = collections.OrderedDict()
    # Two values per handler. First one is if the handler is active for the
    # current layer. The second one is if the handler is going to be active
    # for the next layer.
    subsampling_type = self._learner_config.WhichOneof("feature_fraction")
    if subsampling_type == "feature_fraction_per_level":
      seed = predictions_dict[NUM_LAYERS_ATTEMPTED]
      active_handlers_current_layer = stateless.stateless_random_uniform(
          shape=[len(handlers)], seed=[seed, 1])
      active_handlers_next_layer = stateless.stateless_random_uniform(
          shape=[len(handlers)], seed=[seed + 1, 1])
      active_handlers = array_ops.stack(
          [active_handlers_current_layer, active_handlers_next_layer], axis=1)
      active_handlers = (
          active_handlers < self._learner_config.feature_fraction_per_level)
    elif subsampling_type == "feature_fraction_per_tree":
      seed = predictions_dict[NUM_TREES_ATTEMPTED]
      active_handlers_current_layer = stateless.stateless_random_uniform(
          shape=[len(handlers)], seed=[seed, 2])
      active_handlers_current_layer = (
          active_handlers_current_layer <
          self._learner_config.feature_fraction_per_tree)
      active_handlers = array_ops.stack(
          [
              active_handlers_current_layer,
              array_ops.ones([len(handlers)], dtype=dtypes.bool)
          ],
          axis=1)
    else:
      active_handlers = array_ops.ones([len(handlers), 2], dtype=dtypes.bool)

    if self._learner_config.constraints.max_number_of_unique_feature_columns:
      target = (
          self._learner_config.constraints.max_number_of_unique_feature_columns)

      def _feature_selection_active_handlers():
        # The active list for current and the next iteration.
        used_handlers = array_ops.reshape(predictions_dict[USED_HANDLERS_MASK],
                                          [-1, 1])
        used_handlers = array_ops.concat([used_handlers, used_handlers], axis=1)
        return math_ops.logical_and(used_handlers, active_handlers)

      active_handlers = (
          control_flow_ops.cond(predictions_dict[NUM_USED_HANDLERS] >= target,
                                _feature_selection_active_handlers,
                                lambda: active_handlers))

    # Prepare empty gradients and hessians when handlers are not ready.
    empty_hess_shape = [1] + self._hessian_shape.as_list()
    empty_grad_shape = [1] + self._gradient_shape.as_list()

    empty_gradients = constant_op.constant(
        [], dtype=dtypes.float32, shape=empty_grad_shape)
    empty_hessians = constant_op.constant(
        [], dtype=dtypes.float32, shape=empty_hess_shape)

    active_handlers = array_ops.unstack(active_handlers, axis=0)
    for handler_idx in range(len(handlers)):
      handler = handlers[handler_idx]
      is_active = active_handlers[handler_idx]
      updates, scheduled_updates = handler.update_stats(
          ensemble_stamp, partition_ids, squeezed_gradients, squeezed_hessians,
          empty_gradients, empty_hessians, weights, is_active,
          handler_results[handler])
      stats_update_ops.append(updates)
      per_handler_updates[handler] = scheduled_updates

    update_results = batch_ops_utils.run_handler_scheduled_ops(
        per_handler_updates, ensemble_stamp, worker_device)
    for update in update_results.values():
      stats_update_ops += update

    training_state = {
        _NUM_LAYER_EXAMPLES: num_layer_examples,
        _NUM_LAYER_STEPS: num_layer_steps,
        _NUM_LAYERS: num_layers,
        _ACTIVE_TREE: active_tree,
        _ACTIVE_LAYER: active_layer,
        _CONTINUE_CENTERING: continue_centering,
        _BIAS_STATS_ACCUMULATOR: bias_stats_accumulator,
        _STEPS_ACCUMULATOR: steps_accumulator,
        _HANDLERS: handlers
    }
    return stats_update_ops, training_state
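When constraints.max_number_of_unique_feature_columns is set, the cond above keeps new feature columns from activating once the cap is reached by intersecting the random mask with the handlers already in use. A standalone, hedged sketch of that capping step (hypothetical names, public TF ops):

import tensorflow as tf

def cap_active_handlers(active_handlers, used_handlers_mask, num_used_handlers, target):
    """Restrict `active_handlers` ([num_handlers, 2] bool) to already-used handlers once the cap is hit."""
    # Duplicate the [num_handlers] bool mask into the (current layer, next layer) columns.
    used = tf.stack([used_handlers_mask, used_handlers_mask], axis=1)
    return tf.cond(
        num_used_handlers >= target,
        lambda: tf.logical_and(used, active_handlers),
        lambda: active_handlers)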