def test_independent_sample_temperature(self):
  """Sampling with a non-unit temperature stays valid and rescales grads."""
  choices = schema.OneOf(['foo', 'bar', 'baz'], basic_specs.OP_TAG)
  temperature = tf.placeholder_with_default(
      tf.constant(5.0, tf.float32), shape=(), name='temperature')
  sampled, info = controller.independent_sample(
      choices, temperature=temperature)
  log_prob = info['sample_log_prob']

  with self.cached_session() as sess:
    sess.run(tf.global_variables_initializer())

    # The sampled mask must still be one-hot when the temperature (5.0 by
    # default here) differs from 1.
    self.assertOneHot(sess.run(sampled.mask))

    # Before any training step the probabilities are initialized to a uniform
    # distribution, so neither the sample log-probability nor the entropy
    # should be affected by the temperature.
    self.assertAlmostEqual(sess.run(log_prob), math.log(1 / 3))
    self.assertAlmostEqual(sess.run(info['entropy']), math.log(3))

    # Gradients are expected to be multiplied by (1 / temperature).
    # The OneOf has three choices: the gradient for the selected one is
    # positive and the other two are negative. Because the selected choice
    # can differ between session runs, we compare the max, which always
    # corresponds to the selected choice.
    variables = tf.trainable_variables()
    self.assertLen(variables, 1)
    log_prob_grad = tf.gradients(log_prob, variables)[0]

    def max_grad_at(value):
      """Returns the largest gradient entry with the temperature overridden."""
      return np.max(sess.run(log_prob_grad, {temperature: value}))

    grad_at_1 = max_grad_at(1.0)
    grad_at_5 = max_grad_at(5.0)
    self.assertAlmostEqual(grad_at_1 / 5, grad_at_5)
def test_independent_sample_hierarchical(self):
  """Checks entropy and log-prob of a nested OneOf sampled hierarchically.

  With `hierarchical=True`, the asserted values only count the outer choice
  plus the inner OneOf that the outer mask selected: log(2) + log(3) when the
  first branch is chosen and log(2) + log(4) when the second one is.
  """
  structure = schema.OneOf(
      [
          schema.OneOf(['a', 'b', 'c'], basic_specs.OP_TAG),
          schema.OneOf(['d', 'e', 'f', 'g'], basic_specs.OP_TAG),
      ], basic_specs.OP_TAG)
  rl_structure, dist_info = controller.independent_sample(
      structure,
      increase_ops_probability=0,
      increase_filters_probability=0,
      hierarchical=True)

  tensors = {
      'outer_mask': rl_structure.mask,
      'entropy': dist_info['entropy'],
      'sample_log_prob': dist_info['sample_log_prob'],
  }

  self.evaluate(tf.global_variables_initializer())

  # Evaluate several times so that both outer branches are likely to be
  # exercised; each evaluation draws a fresh sample.
  for _ in range(10):
    values = self.evaluate(tensors)
    if np.all(values['outer_mask'] == np.array([1, 0])):
      self.assertAlmostEqual(values['entropy'], math.log(2) + math.log(3))
      self.assertAlmostEqual(values['sample_log_prob'],
                             math.log(1 / 2) + math.log(1 / 3))
    elif np.all(values['outer_mask'] == np.array([0, 1])):
      self.assertAlmostEqual(values['entropy'], math.log(2) + math.log(4))
      self.assertAlmostEqual(values['sample_log_prob'],
                             math.log(1 / 2) + math.log(1 / 4))
    else:
      # TestCase.fail() accepts a single message argument, so the mask must
      # be formatted into the string before the call. Wrapping it in a tuple
      # keeps the array from being interpreted as a sequence of format args.
      self.fail('Unexpected outer_mask: %s' % (values['outer_mask'],))
def test_independent_sample_increase_ops_probability_1(self):
  """With increase_ops_probability=1 the op mask is uniform, log-prob is 0."""
  op_choice = schema.OneOf(['foo', 'bar', 'baz'], basic_specs.OP_TAG)
  sampled, info = controller.independent_sample(
      op_choice, increase_ops_probability=1.0)

  self.evaluate(tf.global_variables_initializer())

  # Every one of the three ops is expected to receive equal weight.
  expected_mask = [1 / 3, 1 / 3, 1 / 3]
  actual_mask = self.evaluate(sampled.mask)
  self.assertAllClose(actual_mask, expected_mask)

  log_prob = self.evaluate(info['sample_log_prob'])
  self.assertEqual(log_prob, 0)
def test_independent_sample_increase_ops_does_not_affect_ops(self):
  """An OP_TAG choice is still sampled when increase_filters_probability=1."""
  op_choice = schema.OneOf([42, 64], basic_specs.OP_TAG)
  sampled, info = controller.independent_sample(
      op_choice, increase_filters_probability=1.0)

  self.evaluate(tf.global_variables_initializer())

  # The mask must be a regular one-hot sample over the two choices...
  actual_mask = self.evaluate(sampled.mask)
  self.assertOneHot(actual_mask)

  # ...and its log-probability must be that of a uniform two-way choice.
  log_prob = self.evaluate(info['sample_log_prob'])
  self.assertAlmostEqual(log_prob, math.log(1 / 2))
def test_independent_sample_increase_filters_probability_0(self):
  """With increase_filters_probability=0 filters are sampled uniformly."""
  filter_choice = schema.OneOf([4, 12, 8], basic_specs.FILTERS_TAG)
  sampled, info = controller.independent_sample(
      filter_choice, increase_filters_probability=0.0)

  self.evaluate(tf.global_variables_initializer())

  # A one-hot mask over the three filter sizes is expected.
  actual_mask = self.evaluate(sampled.mask)
  self.assertOneHot(actual_mask)

  # The log-probability must match a uniform three-way choice.
  log_prob = self.evaluate(info['sample_log_prob'])
  self.assertAlmostEqual(log_prob, math.log(1 / 3))
def test_independent_sample_increase_filters_probability_1_big_space(self):
  """With increase_filters_probability=1 the last of 100 choices is picked."""
  # 100 choices make it unlikely that random sampling would land on the
  # expected entry by chance.
  num_choices = 100
  filter_choice = schema.OneOf(
      list(range(num_choices)), basic_specs.FILTERS_TAG)
  sampled, info = controller.independent_sample(
      filter_choice, increase_filters_probability=1.0)

  self.evaluate(tf.global_variables_initializer())

  # Only the final entry (the value 99) should be selected.
  expected_mask = [0] * (num_choices - 1) + [1]
  actual_mask = self.evaluate(sampled.mask)
  self.assertAllClose(actual_mask, expected_mask)

  log_prob = self.evaluate(info['sample_log_prob'])
  self.assertEqual(log_prob, 0)
def test_independent_sample_increase_filters_probability_1(self):
  """increase_filters_probability=1 picks 12 even with unsorted choices."""
  # The choices are deliberately not in sorted order: the expected mask
  # selects the middle entry (12) rather than the last one.
  filter_choice = schema.OneOf([4, 12, 8], basic_specs.FILTERS_TAG)
  sampled, info = controller.independent_sample(
      filter_choice, increase_filters_probability=1.0)

  self.evaluate(tf.global_variables_initializer())

  expected_mask = [0, 1, 0]
  actual_mask = self.evaluate(sampled.mask)
  self.assertAllClose(actual_mask, expected_mask)

  log_prob = self.evaluate(info['sample_log_prob'])
  self.assertEqual(log_prob, 0)
def main(argv):
  """Samples architectures whose estimated cost is in range; prints JSON.

  Reads `ssd`, `min_cost`, `max_cost` and `num_samples` from FLAGS, repeatedly
  samples architectures, keeps those with min_cost <= cost <= max_cost, and
  writes the accepted samples to stdout as indented JSON.

  Args:
    argv: Command-line arguments; anything beyond the program name is an
      error.

  Raises:
    app.UsageError: If more than one element is present in `argv`.
  """
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  ssd = FLAGS.ssd
  cost_lower = FLAGS.min_cost
  cost_upper = FLAGS.max_cost
  target_count = FLAGS.num_samples

  # Build the graph: sample a spec, then derive its indices and its cost.
  sampled_spec, _ = controller.independent_sample(
      mobile_classifier_factory.get_model_spec(ssd))
  indices_op = search_space_utils.tf_indices(sampled_spec)
  features_op = mobile_cost_model.coupled_tf_features(sampled_spec)
  cost_op = cost_model_lib.estimate_cost(features_op, ssd)

  progress_template = (
      'generated {:d} samples, found {:d} / {:d} valid architectures')

  accepted = []
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    attempts = 0
    while len(accepted) < target_count:
      attempts += 1
      # Each run of the session draws a new sample.
      sampled_indices, sampled_cost = sess.run([indices_op, cost_op])

      if cost_lower <= sampled_cost <= cost_upper:
        record = {
            'indices': ':'.join(map(str, sampled_indices)),
            'cost': float(sampled_cost),
        }
        accepted.append(record)

      # Report progress every 100 attempts and once more when finished.
      finished = len(accepted) == target_count
      if attempts % 100 == 0 or finished:
        print(progress_template.format(attempts, len(accepted), target_count))

  # Emit indented JSON so the result is (hopefully) easy for both computers
  # and humans to read, followed by a trailing newline.
  json.dump(accepted, sys.stdout, indent=2)
  print()
def test_independent_sample_not_hierarchical(self):
  """Checks a nested OneOf with hierarchical=False counts every OneOf.

  The asserted entropy and log-probability include the outer two-way choice
  and both inner choices (three-way and four-way) on every evaluation.
  """
  inner_three = schema.OneOf(['a', 'b', 'c'], basic_specs.OP_TAG)
  inner_four = schema.OneOf(['d', 'e', 'f', 'g'], basic_specs.OP_TAG)
  nested = schema.OneOf([inner_three, inner_four], basic_specs.OP_TAG)

  unused_sampled, info = controller.independent_sample(
      nested,
      increase_ops_probability=0,
      increase_filters_probability=0,
      hierarchical=False)

  fetches = {
      'entropy': info['entropy'],
      'sample_log_prob': info['sample_log_prob'],
  }

  self.evaluate(tf.global_variables_initializer())

  expected_entropy = math.log(2) + math.log(3) + math.log(4)
  expected_log_prob = math.log(1/2) + math.log(1/3) + math.log(1/4)

  # The expected values do not depend on which sample is drawn, so the same
  # check is repeated over several evaluations.
  for _ in range(10):
    results = self.evaluate(fetches)
    self.assertAlmostEqual(results['entropy'], expected_entropy)
    self.assertAlmostEqual(results['sample_log_prob'], expected_log_prob)
def test_independent_sample_basic(self):
  """Checks keys, shapes, entropy, log-prob and logits for a flat dict."""
  structure = {
      'filters': schema.OneOf([48], basic_specs.FILTERS_TAG),
      'opA': schema.OneOf(['foo', 'bar', 'baz'], basic_specs.OP_TAG),
      'opB': schema.OneOf(['blah', 'yatta'], basic_specs.OP_TAG),
      'other': schema.OneOf(['W', 'X', 'Y', 'Z'], 'some_other_tag'),
  }

  rl_structure, dist_info = controller.independent_sample(structure)

  def choices_by_key(spec):
    """Maps every key of `spec` to the choices of its OneOf."""
    return {key: oneof.choices for (key, oneof) in spec.items()}

  def tags_by_key(spec):
    """Maps every key of `spec` to the tag of its OneOf."""
    return {key: oneof.tag for (key, oneof) in spec.items()}

  # The sampled structure must mirror the input: same keys, choices and tags.
  self.assertItemsEqual(structure.keys(), rl_structure.keys())
  self.assertEqual(choices_by_key(structure), choices_by_key(rl_structure))
  self.assertEqual(tags_by_key(structure), tags_by_key(rl_structure))

  # Each mask has one entry per available choice.
  expected_mask_sizes = [('opA', 3), ('opB', 2), ('filters', 1), ('other', 4)]
  for key, size in expected_mask_sizes:
    self.assertEqual(rl_structure[key].mask.shape, [size])

  self.evaluate(tf.global_variables_initializer())

  entropy = dist_info['entropy']
  self.assertEqual(entropy.shape, [])
  self.assertEqual(entropy.dtype, tf.float32)

  # Initially, all the logits are zero, so the entropy of a distribution with
  # N possible choices is log(N). We sum up the entropies of four different
  # distributions, for opA, opB, filters, and other.
  expected_entropy = math.log(1) + math.log(2) + math.log(3) + math.log(4)
  self.assertAlmostEqual(self.evaluate(entropy), expected_entropy)

  sample_log_prob = dist_info['sample_log_prob']
  self.assertEqual(sample_log_prob.shape, [])
  self.assertEqual(sample_log_prob.dtype, tf.float32)
  expected_log_prob = (
      math.log(1) + math.log(1 / 2) + math.log(1 / 3) + math.log(1 / 4))
  self.assertAlmostEqual(self.evaluate(sample_log_prob), expected_log_prob)

  # The controller will visit the elements of 'structure' in sorted order
  # (based on their keys). So op_indices_0 will correspond to opA, and
  # op_indices_1 will correspond to opB. All variables are initialized to 0.
  logits_by_tag = dist_info['logits_by_tag']
  self.assertItemsEqual(
      logits_by_tag.keys(),
      [
          'op_indices_0',
          'op_indices_1',
          'filters_indices_0',
          'some_other_tag_0'
      ])
  expected_tag_sizes = [
      ('filters_indices_0', 1),
      ('op_indices_0', 3),
      ('op_indices_1', 2),
      ('some_other_tag_0', 4),
  ]
  for tag_name, size in expected_tag_sizes:
    self.assertEqual(logits_by_tag[tag_name].shape, [size])

  # Repeat, but with logits grouped by path instead of tag.
  logits_by_path = dist_info['logits_by_path']
  self.assertItemsEqual(logits_by_path, ['filters', 'opA', 'opB', 'other'])
  expected_path_sizes = [('filters', 1), ('opA', 3), ('opB', 2), ('other', 4)]
  for path, size in expected_path_sizes:
    self.assertEqual(logits_by_path[path].shape, [size])
def model_fn(features, labels, mode, params):
  """Construct a TPUEstimatorSpec for a model.

  Builds a graph in which one evaluation of the returned `train_op` performs
  a training step on the shared model weights, measures accuracy on a
  validation batch, and then applies a REINFORCE update to the RL controller.

  Args:
    features: Dictionary with 'train' and 'validation' entries. Each is
      transposed with permutation [3, 0, 1, 2] before use (HWCN -> NHWC).
    labels: Dictionary with 'train' and 'validation' entries. The 'train'
      entry is passed as one-hot labels to the softmax cross-entropy loss and
      the 'validation' entry is arg-maxed along axis 1.
    mode: A tf.estimator.ModeKeys value. Only TRAIN is supported.
    params: Dictionary of hyper-parameters. Keys read by this function:
      'use_bfloat16', 'ssd', 'checkpoint_dir', 'increase_ops_warmup_steps',
      'increase_filters_warmup_steps', 'enable_cost_model',
      'model_weight_decay', 'model_learning_rate', 'max_global_step',
      'model_warmup_steps', 'model_momentum', 'use_gradient_sync_barrier',
      'rl_reward_function', 'rl_cost_model_target', 'rl_cost_model_exponent',
      'rl_baseline_momentum', 'rl_entropy_regularization', 'rl_delay_steps',
      'rl_learning_rate', 'use_exponential_rl_learning_rate_schedule' and
      'tpu_iterations_per_loop'.

  Returns:
    A tf.estimator.tpu.TPUEstimatorSpec whose loss is the model loss, whose
    train_op is the RL controller update, and whose host_call writes the
    TensorBoard scalars.

  Raises:
    NotImplementedError: If `mode` is not tf.estimator.ModeKeys.TRAIN.
  """
  if mode != tf.estimator.ModeKeys.TRAIN:
    # NOTE: the conversion must be spelled `{!r}`; `{:!r}` is parsed as a
    # format spec of "!r" and makes str.format() raise ValueError instead of
    # letting us raise the intended NotImplementedError.
    raise NotImplementedError(
        'Expected that mode == TRAIN, but got {!r}'.format(mode))

  # Data was transposed from NHWC to HWCN on the host side. Transpose it back.
  # This transposition will be optimized away by the XLA compiler. It serves
  # as a hint to the compiler that it should expect the input data to come
  # in HWCN format rather than NHWC.
  train_features = tf.transpose(features['train'], [3, 0, 1, 2])
  validation_features = tf.transpose(features['validation'], [3, 0, 1, 2])

  if params['use_bfloat16'] == 'ontpu':
    train_features = tf.cast(train_features, tf.bfloat16)
    validation_features = tf.cast(validation_features, tf.bfloat16)

  global_step = tf.train.get_global_step()

  # Randomly sample a network architecture.
  # The empty scope only captures `rl_scope`, which is reused below as the
  # name for the controller and to look up its variables and update ops.
  with tf.variable_scope('rl_controller') as rl_scope:
    pass

  model_spec = mobile_classifier_factory.get_model_spec(params['ssd'])

  # Save the (unsampled) model spec next to the checkpoints.
  tf.io.gfile.makedirs(params['checkpoint_dir'])
  model_spec_filename = os.path.join(
      params['checkpoint_dir'], 'model_spec.json')
  with tf.io.gfile.GFile(model_spec_filename, 'w') as handle:
    handle.write(schema_io.serialize(model_spec))

  increase_ops_prob = custom_layers.linear_decay(
      global_step, params['increase_ops_warmup_steps'])
  increase_filters_prob = custom_layers.linear_decay(
      global_step, params['increase_filters_warmup_steps'])
  model_spec, dist_info = controller.independent_sample(
      model_spec,
      increase_ops_probability=increase_ops_prob,
      increase_filters_probability=increase_filters_prob,
      name=rl_scope)

  if params['enable_cost_model']:
    cost_model_features = mobile_cost_model.coupled_tf_features(model_spec)
    estimated_cost = cost_model_lib.estimate_cost(
        cost_model_features, params['ssd'])

  # We divide the regularization strength by 2 for backwards compatibility with
  # the deprecated tf.contrib.layers.l2_regularizer() function, which was used
  # in our published experiments.
  kernel_regularizer = tf.keras.regularizers.l2(
      params['model_weight_decay'] / 2)

  # Set up the basic TensorFlow training/inference graph.
  model = mobile_classifier_factory.get_model_for_search(
      model_spec,
      kernel_regularizer=kernel_regularizer)
  model.build(train_features.shape)

  with tf.name_scope('training'):
    model_logits, _ = model.apply(train_features, training=True)
    # Cast back to float32 (effectively only when using use_bfloat16 is true).
    model_logits = tf.cast(model_logits, tf.float32)

    model_empirical_loss = tf.losses.softmax_cross_entropy(
        onehot_labels=labels['train'],
        logits=model_logits,
        label_smoothing=0.1)
    model_regularization_loss = model.regularization_loss()
    model_loss = model_empirical_loss + model_regularization_loss

    # Set up the model weight training logic.
    model_learning_rate = custom_layers.cosine_decay_with_linear_warmup(
        peak_learning_rate=params['model_learning_rate'],
        global_step=global_step,
        max_global_step=params['max_global_step'],
        warmup_steps=params['model_warmup_steps'])

    model_optimizer = tf.tpu.CrossShardOptimizer(
        tf.train.RMSPropOptimizer(
            model_learning_rate,
            decay=0.9,
            momentum=params['model_momentum'],
            epsilon=1.0))

    model_vars = model.trainable_variables()
    model_update_ops = model.updates()
    with tf.control_dependencies(model_update_ops):
      grads_and_vars = model_optimizer.compute_gradients(
          model_loss, var_list=model_vars)
      if params['use_gradient_sync_barrier']:
        # Force all gradients to be computed before any are applied.
        grads_and_vars = _grads_and_vars_barrier(grads_and_vars)

      # NOTE: We do not pass `global_step` to apply_gradients(), so the global
      # step is not incremented by `model_optimizer`. The global_step will be
      # incremented later on, when we update the RL controller weights. If we
      # incremented it here too, we'd end up incrementing the global_step twice
      # at each training step.
      model_op = model_optimizer.apply_gradients(grads_and_vars)

  if params['use_gradient_sync_barrier']:
    # Finish computing gradients for the shared model weights before we
    # start on the RL update step.
    #
    # NOTE: The barrier above forces TensorFlow to finish computing grads
    # for all of the trainable variables before any of the grads can be
    # consumed. So while the call to with_data_dependencies() here only
    # explicitly depends on grads_and_vars[0][0], the call implicitly forces
    # TensorFlow to finish computing the gradients for *all* trainable
    # variables before computing the validation features.
    validation_features = layers.with_data_dependencies(
        [grads_and_vars[0][0]], [validation_features])[0]

  with tf.name_scope('validation'):
    # Estimate the model accuracy on a batch of examples from the validation
    # set. Force this logic to run after the model optimization step.
    with tf.control_dependencies([model_op]):
      validation_logits, _ = model.apply(
          validation_features, training=False)

    # NOTE(b/130311965): An earlier version of this code cast validation_logits
    # from bfloat16 to float32 before applying an argmax when the --use_bfloat16
    # flag was true. As of cl/240923609, this caused XLA to compute incorrect
    # model accuracies. Please avoid casting from bfloat16 to float32 before
    # taking the argmax.
    is_prediction_correct = tf.equal(
        tf.argmax(validation_logits, axis=1),
        tf.argmax(labels['validation'], axis=1))
    validation_accuracy = tf.reduce_mean(
        tf.cast(is_prediction_correct, tf.float32))

  # Estimate the reward for the current network architecture and update the
  # reward to incorporate the cost of the network architecture.
  if params['enable_cost_model']:
    rl_stats = search_space_utils.reward_for_single_cost_model(
        validation_accuracy,
        rl_reward_function=params['rl_reward_function'],
        estimated_cost=estimated_cost,
        rl_cost_model_target=params['rl_cost_model_target'],
        rl_cost_model_exponent=params['rl_cost_model_exponent'])
    rl_cost_ratio = rl_stats['rl_cost_ratio']
    rl_reward = rl_stats['rl_reward']
    rl_cost_adjustment = rl_stats['rl_cost_adjustment']
  else:
    rl_reward = validation_accuracy

  # Compute a baseline. We first take a cross-replica sum of the rewards
  # for all the TPU shards, then incorporate the result into an exponential
  # moving average. Within a single batch, each TPU shard will select a
  # different set of op masks from the RL controller. Each shard will basically
  # evaluate a different candidate architecture in our search space.

  # Count the number of TPU shards (cores) used for training.
  num_tpu_shards = tf.tpu.cross_replica_sum(
      tf.ones(shape=(), dtype=rl_reward.dtype))
  rl_step_baseline = tf.tpu.cross_replica_sum(rl_reward)
  rl_step_baseline = rl_step_baseline / num_tpu_shards

  rl_baseline = custom_layers.update_exponential_moving_average(
      rl_step_baseline,
      momentum=params['rl_baseline_momentum'])

  # Apply a REINFORCE update to the RL controller.
  log_prob = dist_info['sample_log_prob']
  rl_advantage = rl_reward - rl_baseline
  rl_empirical_loss = -tf.stop_gradient(rl_advantage) * log_prob

  # We set rl_entropy_loss proportional to (-entropy) so that minimizing the
  # loss will lead to an entropy that is as large as possible.
  rl_entropy = dist_info['entropy']
  rl_entropy_loss = -params['rl_entropy_regularization'] * rl_entropy

  # We use an RL learning rate of 0 for the first N epochs of training. See
  # Appendix A of FBNet. (https://arxiv.org/pdf/1812.03443.pdf). Although they
  # don't mention it explicitly, there are some indications that ProxylessNAS
  # (https://openreview.net/forum?id=HylVB3AqYm) might also be doing this.
  enable_rl_optimizer = tf.cast(
      tf.greater_equal(global_step, params['rl_delay_steps']),
      tf.float32)
  rl_learning_rate = params['rl_learning_rate'] * enable_rl_optimizer

  if params['use_exponential_rl_learning_rate_schedule']:
    # rl_learning_rate_progress will be 0 when the RL controller starts
    # learning and 1 when the search ends.
    rl_learning_rate_progress = tf.nn.relu(tf.div(
        tf.cast(global_step - params['rl_delay_steps'], tf.float32),
        max(1, params['max_global_step'] - params['rl_delay_steps'])))
    # exponentially increase the RL learning rate over time.
    rl_learning_rate_multiplier = tf.pow(10.0, rl_learning_rate_progress)
    rl_learning_rate = rl_learning_rate * rl_learning_rate_multiplier

  rl_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, rl_scope.name)
  with tf.control_dependencies(rl_update_ops):
    # In order to evaluate train_op, we must first evaluate validation_accuracy.
    # And to evaluate validation_accuracy, we must first evaluate model_op. So
    # running this op will perform a step of model training followed by
    # a step of RL controller training.
    if params['use_gradient_sync_barrier']:
      transform_grads_fn = _grads_and_vars_barrier
    else:
      transform_grads_fn = None

    train_op = tpu_optimizer_ops.apply_adam(
        rl_empirical_loss,
        regularization_loss=rl_entropy_loss,
        global_step=global_step,
        var_list=tf.trainable_variables(rl_scope.name),
        learning_rate=rl_learning_rate,
        beta1=0.0,
        beta2=0.999,
        epsilon=1e-8,
        transform_grads_fn=transform_grads_fn)

  # TensorBoard logging
  tensorboard_scalars = collections.OrderedDict([
      ('model/loss', model_loss),
      ('model/empirical_loss', model_empirical_loss),
      ('model/regularization_loss', model_regularization_loss),
      ('model/learning_rate', model_learning_rate),
      ('rlcontroller/empirical_loss', rl_empirical_loss),
      ('rlcontroller/entropy_loss', rl_entropy_loss),
      ('rlcontroller/validation_accuracy', validation_accuracy),
      ('rlcontroller/reward', rl_reward),
      ('rlcontroller/step_baseline', rl_step_baseline),
      ('rlcontroller/baseline', rl_baseline),
      ('rlcontroller/advantage', rl_advantage),
      ('rlcontroller/log_prob', log_prob),
  ])

  # The cost-related scalars only exist when the cost model is enabled.
  if params['enable_cost_model']:
    tensorboard_scalars['rlcontroller/estimated_cost'] = estimated_cost
    tensorboard_scalars['rlcontroller/cost_ratio'] = rl_cost_ratio
    tensorboard_scalars[
        'rlcontroller/cost_adjustment'] = rl_cost_adjustment

  tensorboard_scalars['rlcontroller/learning_rate'] = rl_learning_rate
  tensorboard_scalars['rlcontroller/increase_ops_prob'] = increase_ops_prob
  tensorboard_scalars['rlcontroller/increase_filters_prob'] = (
      increase_filters_prob)

  # Log the values of all the choices made by the RL controller.
  for name_i, logits_i in dist_info['logits_by_path'].items():
    assert len(logits_i.shape) == 1, logits_i
    for j in range(int(logits_i.shape[0])):
      key = 'rlpathlogits/{:s}/{:d}'.format(name_i, j)
      tensorboard_scalars[key] = logits_i[j]

  for name_i, logits_i in dist_info['logits_by_tag'].items():
    assert len(logits_i.shape) == 1, logits_i
    for j in range(int(logits_i.shape[0])):
      key = 'rltaglogits/{:s}/{:d}'.format(name_i, j)
      tensorboard_scalars[key] = logits_i[j]

  # NOTE: host_call only works on rank-1 tensors. There's also a fairly
  # large performance penalty if we try to pass too many distinct tensors
  # from the TPU to the host at once. We avoid these problems by (i) calling
  # tf.stack to merge all of the float32 scalar values into a single rank-1
  # tensor that can be sent to the host relatively cheaply and (ii) reshaping
  # the remaining values from scalars to rank-1 tensors.
  def host_call_fn(step, scalar_values):
    """Writes one summary per stacked scalar; returns the summary ops."""
    values = tf.unstack(scalar_values)
    with tf2.summary.create_file_writer(
        params['checkpoint_dir']).as_default():
      # Only record when the step is a multiple of tpu_iterations_per_loop.
      with tf2.summary.record_if(
          tf.math.equal(step[0] % params['tpu_iterations_per_loop'], 0)):
        # Keys and stacked values share the OrderedDict's iteration order.
        for key, value in zip(list(tensorboard_scalars.keys()), values):
          tf2.summary.scalar(key, value, step=step[0])
        return tf.summary.all_v2_summary_ops()

  host_call_values = tf.stack(list(tensorboard_scalars.values()))
  host_call = (
      host_call_fn,
      [tf.reshape(global_step, [1]), host_call_values])

  # Construct the estimator specification.
  return tf.estimator.tpu.TPUEstimatorSpec(
      mode=mode,
      loss=model_loss,
      train_op=train_op,
      host_call=host_call)