def test_freeze_ensemble(self, subnetwork_fns, features, want_nodes, bias=0):
  with tf.Graph().as_default() as g, self.test_session(graph=g) as sess:
    freezer = _EnsembleFreezer()
    filename = os.path.join(self.test_subdirectory, "frozen.pbtxt")
    features = {
        k: tf.constant(features[k], name=k) for k in sorted(features.keys())
    }
    weighted_subnetworks = [fn(features) for fn in subnetwork_fns]
    bias = tf.constant(bias, name="bias")
    sess.run(tf.global_variables_initializer())
    freezer.freeze_ensemble(
        filename=filename,
        weighted_subnetworks=weighted_subnetworks,
        bias=bias,
        features=features)

    # Parse the frozen MetaGraphDef and verify it contains exactly the
    # expected nodes.
    with tf.gfile.FastGFile(filename, "rb") as f:
      meta_graph_def = tf.MetaGraphDef()
      meta_graph_def.ParseFromString(f.read())
      nodes = []
      consts = []
      for node_def in meta_graph_def.graph_def.node:
        nodes.append(node_def.name)
        if node_def.op == "Const":
          consts.append(node_def.name)
      self.assertEqual(want_nodes, nodes)
def test_load_frozen_ensemble(self,
                              subnetwork_fns,
                              features_to_freeze,
                              features_to_load,
                              want_logits,
                              bias=0,
                              features_placeholder=None):
  freezer = _EnsembleFreezer()
  filename = os.path.join(self.test_subdirectory, "frozen.pbtxt")
  bias_value = bias

  # First freeze the ensemble.
  with tf.Graph().as_default() as g, self.test_session(graph=g) as sess:
    bias = tf.constant(bias, name="bias")
    if features_to_freeze is not None:
      features_to_freeze = tu.tensor_features(features_to_freeze)
    elif features_placeholder is not None:
      features_to_freeze = {
          k: tf.placeholder(dtype=tf.float32, name=k, shape=shape)
          for k, shape in features_placeholder.items()
      }
    features_to_freeze = freezer.wrapped_features(features_to_freeze)
    weighted_subnetworks = []
    for name, fn in subnetwork_fns:
      with tf.variable_scope("adanet/iteration_0/ensemble_{}/".format(name)):
        weighted_subnetworks.append(fn(features_to_freeze))
    init = tf.group(tf.global_variables_initializer(),
                    tf.local_variables_initializer())
    sess.run(init)
    freezer.freeze_ensemble(
        filename=filename,
        weighted_subnetworks=weighted_subnetworks,
        bias=bias,
        features=features_to_freeze)

  # Load the frozen ensemble into a new graph with potentially different
  # features than those used when saving.
  with tf.Graph().as_default() as g, self.test_session(graph=g) as sess:
    features_to_load = tu.tensor_features(features_to_load)
    features_to_load = freezer.wrapped_features(features_to_load)
    frozen_ensemble, frozen_bias, _ = freezer.load_frozen_ensemble(
        filename=filename, features=features_to_load)
    want_ensemble = [fn(features_to_load) for (_, fn) in subnetwork_fns]
    init = tf.group(tf.global_variables_initializer(),
                    tf.local_variables_initializer())
    sess.run(init)
    want_ensemble = sess.run(want_ensemble)
    frozen_ensemble = sess.run(frozen_ensemble)

    # The loaded weighted subnetworks should match the freshly built ones
    # field-for-field, and the frozen bias should keep its original value.
    self.assertEqual([w.name for w in want_ensemble],
                     [w.name for w in frozen_ensemble])
    self.assertAllClose(
        [(w.logits, w.weight, w.subnetwork) for w in want_ensemble],
        [(w.logits, w.weight, w.subnetwork) for w in frozen_ensemble])
    self.assertAllEqual(bias_value, sess.run(frozen_bias))
def test_wrapped_features_placeholder(self):
  freezer = _EnsembleFreezer()
  features = {"x": tf.placeholder(dtype=tf.float32, shape=[], name="foo")}
  got = freezer.wrapped_features(features)
  with self.test_session() as sess:
    self.assertAllClose(
        sess.run(features, feed_dict={features["x"]: 1.}),
        sess.run(got, feed_dict={got["x"]: 1.}))
def test_wrapped_features_sparse_tensors(self):
  freezer = _EnsembleFreezer()
  features = {
      "x":
          tf.SparseTensor(
              indices=[[0, 0], [0, 1]], values=[-1., 1.], dense_shape=[1, 2])
  }
  got = freezer.wrapped_features(features)
  with self.test_session() as sess:
    self.assertAllClose(sess.run(features), sess.run(got))
def test_load_frozen_ensemble_colocation_bug(self, features_to_freeze,
                                             features_to_load):
  """Test colocation bug b/74595432."""

  freezer = _EnsembleFreezer()
  filename = os.path.join(self.test_subdirectory, "frozen.pbtxt")

  # First freeze the ensemble.
  with tf.Graph().as_default() as g, self.test_session(graph=g) as sess:
    features_to_freeze = tu.tensor_features(features_to_freeze)
    features_to_freeze = freezer.wrapped_features(features_to_freeze)
    # Colocate an identity op with each feature to reproduce the colocation
    # constraints that triggered the bug.
    for k, feature in features_to_freeze.items():
      with tf.colocate_with(feature):
        if isinstance(feature, tf.SparseTensor):
          feature = tf.SparseTensor(
              tf.identity(feature.indices, name="colocated_indices"),
              tf.identity(feature.values, name="colocated_values"),
              tf.identity(feature.dense_shape, name="colocated_dense_shape"))
        else:
          feature = tf.identity(feature, name="colocated")
      features_to_freeze[k] = feature
    weighted_subnetworks = [_dnn_subnetwork_fn()(features_to_freeze)]
    init = tf.group(tf.global_variables_initializer(),
                    tf.local_variables_initializer())
    sess.run(init)
    freezer.freeze_ensemble(
        filename=filename,
        weighted_subnetworks=weighted_subnetworks,
        bias=tf.constant(0, name="bias"),
        features=features_to_freeze)

  # Verify that repeatedly freezing and reloading frozen ensembles works.
  features_to_load_copy = features_to_load.copy()
  for _ in range(5):
    with tf.Graph().as_default() as g, self.test_session(graph=g) as sess:
      features_to_load = tu.tensor_features(features_to_load_copy)
      features_to_load = freezer.wrapped_features(features_to_load)
      frozen_ensemble, frozen_bias, _ = freezer.load_frozen_ensemble(
          filename=filename, features=features_to_load)
      init = tf.group(tf.global_variables_initializer(),
                      tf.local_variables_initializer())
      sess.run(init)
      # Re-freeze using the features from the current graph; the tensors in
      # `features_to_freeze` belong to the first graph and cannot be reused.
      freezer.freeze_ensemble(
          filename=filename,
          weighted_subnetworks=frozen_ensemble,
          bias=frozen_bias,
          features=features_to_load)
def test_wrapped_features_sparse_placeholder(self):
  freezer = _EnsembleFreezer()
  features = {
      "x":
          tf.sparse_placeholder(dtype=tf.float32, shape=[None, 2], name="foo")
  }
  got = freezer.wrapped_features(features)
  value = tf.SparseTensorValue(
      indices=[[0, 0], [0, 1]], values=[-1., 1.], dense_shape=[1, 2])
  with self.test_session() as sess:
    self.assertAllClose(
        sess.run(features, feed_dict={features["x"]: value}),
        sess.run(got, feed_dict={got["x"]: value}))
def test_freeze_ensemble_error(self, subnetwork_fns, bad_persisted_tensors):
  with tf.Graph().as_default() as g, self.test_session(graph=g) as sess:
    freezer = _EnsembleFreezer()
    filename = os.path.join(self.test_subdirectory, "frozen.pbtxt")
    features = {"x": tf.constant([[-1., 1.]], name="features")}
    weighted_subnetworks = [fn(features) for fn in subnetwork_fns]
    # Inject persisted tensors that the freezer should reject.
    for weighted_subnetwork in weighted_subnetworks:
      weighted_subnetwork.subnetwork.persisted_tensors.update(
          bad_persisted_tensors)
    bias = tf.constant(0, name="bias")
    sess.run(tf.global_variables_initializer())
    with self.assertRaises(ValueError):
      freezer.freeze_ensemble(
          filename=filename,
          weighted_subnetworks=weighted_subnetworks,
          bias=bias,
          features=features)
def __init__(self,
             head,
             subnetwork_generator,
             max_iteration_steps,
             mixture_weight_type=MixtureWeightType.SCALAR,
             mixture_weight_initializer=None,
             warm_start_mixture_weights=False,
             adanet_lambda=0.,
             adanet_beta=0.,
             evaluator=None,
             report_materializer=None,
             use_bias=False,
             replicate_ensemble_in_training=False,
             adanet_loss_decay=.9,
             worker_wait_timeout_secs=7200,
             model_dir=None,
             report_dir=None,
             config=None):
  """Initializes an `Estimator`.

  Regarding the options for `mixture_weight_type`:

  A `SCALAR` mixture weight is a rank-0 tensor. It performs an element-wise
  multiplication with its subnetwork's logits. This mixture weight is the
  simplest to learn, the quickest to train, and the most likely to generalize
  well.

  A `VECTOR` mixture weight is a tensor of shape [k] where k is the
  ensemble's logits dimension as defined by `head`. It is similar to `SCALAR`
  in that it performs an element-wise multiplication with its subnetwork's
  logits, but is more flexible in learning a subnetwork's preferences per
  class.

  A `MATRIX` mixture weight is a tensor of shape [a, b] where a is the number
  of outputs from the subnetwork's `last_layer` and b is the number of
  outputs from the ensemble's `logits`. This weight matrix-multiplies the
  subnetwork's `last_layer`. This mixture weight offers the most flexibility
  and expressivity, allowing subnetworks to have outputs of different
  dimensionalities. However, it also has the most trainable parameters (a*b),
  and is therefore the most sensitive to learning rates and regularization.
  A short illustrative sketch of how each type combines subnetwork outputs
  follows this constructor.

  Args:
    head: A `tf.contrib.estimator.Head` instance for computing loss and
      evaluation metrics for every candidate.
    subnetwork_generator: The `adanet.subnetwork.Generator` which defines the
      candidate subnetworks to train and evaluate at every AdaNet iteration.
    max_iteration_steps: Total number of steps for which to train candidates
      per iteration. If `OutOfRange` or `StopIteration` occurs in the middle,
      training stops before `max_iteration_steps` steps.
    mixture_weight_type: The `adanet.MixtureWeightType` defining which
      mixture weight type to learn in the linear combination of subnetwork
      outputs.
    mixture_weight_initializer: The initializer for mixture_weights. When
      `None`, the default depends on `mixture_weight_type`. `SCALAR`
      initializes to 1/N where N is the number of subnetworks in the
      ensemble, giving a uniform average. `VECTOR` initializes each entry to
      1/N where N is the number of subnetworks in the ensemble, giving a
      uniform average. `MATRIX` uses `tf.zeros_initializer`.
    warm_start_mixture_weights: Whether, at the beginning of an iteration, to
      initialize the mixture weights of the subnetworks from the previous
      ensemble to their learned value at the previous iteration, as opposed
      to retraining them from scratch. Takes precedence over the value for
      `mixture_weight_initializer` for subnetworks from previous iterations.
    adanet_lambda: Float multiplier 'lambda' for applying L1 regularization
      to subnetworks' mixture weights 'w' in the ensemble proportional to
      their complexity. See Equation (4) in the AdaNet paper.
    adanet_beta: Float L1 regularization multiplier 'beta' to apply equally
      to all subnetworks' weights 'w' in the ensemble regardless of their
      complexity. See Equation (4) in the AdaNet paper.
    evaluator: An `Evaluator` for comparing `Ensemble` instances in
      evaluation mode using the training set, or a holdout set. When `None`,
      they are compared using a moving average of their `Ensemble`'s AdaNet
      loss during training.
    report_materializer: A `ReportMaterializer` for materializing a
      `Builder`'s `subnetwork.Report`s into `subnetwork.MaterializedReport`s.
      These reports are made available to the `Generator` at the next
      iteration, so that it can adapt its search space. When `None`, the
      `Generator`'s `generate_candidates` method will receive empty lists for
      its `previous_ensemble_reports` and `all_reports` arguments.
    use_bias: Whether to add a bias term to the ensemble's logits. Adding a
      bias allows the ensemble to learn a shift in the data, often leading to
      more stable training and better predictions.
    replicate_ensemble_in_training: Whether to freeze a copy of the ensembled
      subnetworks' subgraphs in training mode in addition to prediction mode.
      A copy of the subnetworks' subgraphs is always saved in prediction mode
      so that at prediction time, the ensemble and composing subnetworks are
      all in prediction mode. This argument only affects the outputs of the
      frozen subnetworks in the ensemble. When `False`, the frozen
      subnetworks in the ensemble are in prediction mode during candidate
      training, so training-only ops like dropout are not applied to them.
      When `True`, the frozen subnetworks are in training mode during
      candidate training, so they do apply training-only ops like dropout.
      However, `True` also doubles the amount of disk space required to store
      the frozen ensembles, and lengthens the preparation stage between
      boosting iterations. This argument is useful for regularizing the
      learning of mixture weights, or for making training-only side inputs
      available in subsequent iterations. For most use cases, this should be
      `False`.
    adanet_loss_decay: Float decay for the exponential moving average of the
      AdaNet objective throughout training. This moving average is a
      data-driven way of tracking the best candidate with only the training
      set.
    worker_wait_timeout_secs: Float number of seconds for workers to wait for
      the chief to prepare the next iteration during distributed training.
      This is needed to prevent workers from waiting indefinitely for a chief
      that may have crashed or been shut down. When the timeout is exceeded,
      the worker exits the train loop. In situations where the chief job is
      much slower than the worker jobs, this timeout should be increased.
    model_dir: Directory to save model parameters, graph, etc. This can also
      be used to load checkpoints from the directory into an estimator to
      continue training a previously saved model.
    report_dir: Directory where the `adanet.subnetwork.MaterializedReport`s
      materialized by `report_materializer` will be saved. If
      `report_materializer` is `None`, nothing is saved. If `None` or an
      empty string, defaults to "<model_dir>/report".
    config: `RunConfig` object to configure the runtime settings.

  Returns:
    An `Estimator` instance.

  Raises:
    ValueError: If `subnetwork_generator` is `None`.
    ValueError: If `max_iteration_steps` is <= 0.
  """

  # TODO: Add argument to specify how many frozen graph
  # checkpoints to keep.

  if subnetwork_generator is None:
    raise ValueError("subnetwork_generator can't be None.")
  if max_iteration_steps <= 0.:
    raise ValueError("max_iteration_steps must be > 0.")

  self._adanet_loss_decay = adanet_loss_decay

  # Overwrite superclass's assert that members are not overwritten in order
  # to overwrite public methods. Note that we are doing something that is not
  # explicitly supported by the Estimator API and may break in the future.
  tf.estimator.Estimator._assert_members_are_not_overridden = staticmethod(
      lambda _: None)

  self._ensemble_builder = _EnsembleBuilder(
      head=head,
      mixture_weight_type=mixture_weight_type,
      mixture_weight_initializer=mixture_weight_initializer,
      warm_start_mixture_weights=warm_start_mixture_weights,
      adanet_lambda=adanet_lambda,
      adanet_beta=adanet_beta,
      use_bias=use_bias)
  candidate_builder = _CandidateBuilder(
      max_steps=max_iteration_steps,
      adanet_loss_decay=self._adanet_loss_decay)
  self._iteration_builder = _IterationBuilder(candidate_builder,
                                              self._ensemble_builder)
  self._freezer = _EnsembleFreezer()
  self._evaluation_checkpoint_path = None
  self._evaluator = evaluator
  self._report_materializer = report_materializer
  self._replicate_ensemble_in_training = replicate_ensemble_in_training
  self._worker_wait_timeout_secs = worker_wait_timeout_secs
  self._evaluation_name = None
  self._inside_adanet_training_loop = False

  # This `Estimator` is responsible for bookkeeping across iterations, and
  # for training the subnetworks in both a local and distributed setting.
  # Subclassing improves future-proofing against new private methods being
  # added to `tf.estimator.Estimator` that are expected to be callable by
  # external functions, such as in b/110435640.
  super(Estimator, self).__init__(
      model_fn=self._model_fn,
      params={
          self._Keys.SUBNETWORK_GENERATOR: subnetwork_generator,
      },
      config=config,
      model_dir=model_dir)

  # This is defined after the base Estimator's init so that report_accessor
  # can use the same temporary model_dir as the underlying Estimator even if
  # model_dir is not provided.
  report_dir = report_dir or os.path.join(self._model_dir, "report")
  self._report_accessor = _ReportAccessor(report_dir)
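# The following stand-alone sketch (not part of the library) illustrates how
# the three mixture weight types described in `Estimator.__init__` combine a
# subnetwork's outputs into ensemble logits, and where `adanet_lambda` and
# `adanet_beta` enter the complexity-regularized objective referenced above
# (Equation (4) of the AdaNet paper). The function and argument names below
# are hypothetical and chosen only for illustration.
def _illustrate_mixture_weight(last_layer, logits, weight, adanet_lambda=0.,
                               adanet_beta=0., complexity=1.):
  """Returns (weighted_logits, complexity_penalty) for one subnetwork."""
  if weight.shape.ndims == 2:
    # MATRIX: a weight of shape [a, b] matrix-multiplies the subnetwork's
    # last_layer (shape [batch, a]) to produce ensemble logits ([batch, b]).
    weighted_logits = tf.matmul(last_layer, weight)
  else:
    # SCALAR (rank 0) or VECTOR (shape [k]): element-wise multiplication with
    # the subnetwork's logits (shape [batch, k]).
    weighted_logits = logits * weight
  # L1 penalty on the mixture weight: scaled by the subnetwork's complexity
  # through lambda, and applied uniformly through beta.
  complexity_penalty = ((adanet_lambda * complexity + adanet_beta) *
                        tf.reduce_sum(tf.abs(weight)))
  return weighted_logits, complexity_penalty
# The ensemble's logits are the sum of each subnetwork's weighted_logits
# (plus an optional bias when `use_bias=True`), and the AdaNet objective adds
# the summed complexity penalties to the training loss.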
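# A minimal usage sketch of the constructor documented above, kept as a
# comment so it never runs at import time. `MyGenerator` and `train_input_fn`
# are hypothetical user-defined objects; everything else uses arguments from
# the docstring. This is an illustration, not part of the library.
#
#   import adanet
#   import tensorflow as tf
#
#   estimator = adanet.Estimator(
#       head=tf.contrib.estimator.binary_classification_head(),
#       subnetwork_generator=MyGenerator(),  # adanet.subnetwork.Generator
#       max_iteration_steps=1000,
#       mixture_weight_type=adanet.MixtureWeightType.SCALAR,
#       adanet_lambda=0.001,
#       model_dir="/tmp/adanet_model")
#   estimator.train(input_fn=train_input_fn, max_steps=5000)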
def test_wrapped_features_tensors(self):
  freezer = _EnsembleFreezer()
  features = {"x": tf.constant([1, 2], name="foo")}
  got = freezer.wrapped_features(features)
  with self.test_session() as sess:
    self.assertAllClose(sess.run(features), sess.run(got))
def test_wrapped_features_none(self):
  freezer = _EnsembleFreezer()
  got = freezer.wrapped_features(None)
  self.assertIsNone(got)