def validate_train_hook_name(self, test_hook_name, expected_hook_name, **kwargs):
    """Assert that one hook name resolves to exactly one hook of the expected class.

    Args:
        test_hook_name: Hook name handed (as a one-element list) to
            `hooks_helper.get_train_hooks`.
        expected_hook_name: Lower-cased class name the returned hook must have.
        **kwargs: Forwarded unchanged to `hooks_helper.get_train_hooks`.
    """
    hooks = hooks_helper.get_train_hooks([test_hook_name], **kwargs)
    self.assertEqual(len(hooks), 1)

    hook = hooks[0]
    self.assertIsInstance(hook, tf.train.SessionRunHook)

    actual_name = hook.__class__.__name__.lower()
    self.assertEqual(actual_name, expected_hook_name)
def main(argv):
    """Train and evaluate the estimator in cycles, export it, then remove model_dir.

    Args:
        argv: Full command-line argument list; ``argv[0]`` is skipped before
            parsing.
    """
    flags = ArgParser().parse_args(args=argv[1:])

    # When no data format was requested, pick one depending on whether
    # TensorFlow was built with CUDA.
    if flags.data_format is not None:
        data_format = flags.data_format
    elif tf.test.is_built_with_cuda():
        data_format = 'channels_first'
    else:
        data_format = 'channels_last'

    estimator_params = {
        'data_format': data_format,
        'image_size': flags.image_size,
    }
    classifier = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=flags.model_dir,
        params=estimator_params)

    def train_input_fn():
        """Return the cached, shuffled, batched and repeated training dataset."""
        dataset = prepare_dataset_pair(flags.data_root, 'train', 10)
        dataset = dataset.cache()
        dataset = dataset.shuffle(buffer_size=50000)
        dataset = dataset.batch(flags.batch_size)
        return dataset.repeat(flags.epochs_between_evals)

    def eval_input_fn():
        """Return `get_next()` of a one-shot iterator over the batched test set."""
        batched = prepare_dataset_pair(flags.data_root, 'test', 10).batch(
            flags.batch_size)
        return batched.make_one_shot_iterator().get_next()

    train_hooks = hooks_helper.get_train_hooks(
        flags.hooks, batch_size=flags.batch_size)

    # Train and evaluate model.
    num_cycles = flags.train_epochs // flags.epochs_between_evals
    for _ in range(num_cycles):
        classifier.train(input_fn=train_input_fn, hooks=train_hooks)
        eval_results = classifier.evaluate(input_fn=eval_input_fn)
        print('\nEvaluation results:\n\t%s\n' % eval_results)

    # Export the model
    image_shape = [None, flags.image_size, flags.image_size]
    image = tf.placeholder(tf.float32, image_shape)
    serving_input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(
        {'image': image})
    classifier.export_savedmodel(flags.export_dir, serving_input_fn)

    # The training directory is discarded once the export has been written.
    shutil.rmtree(flags.model_dir)
def main(argv):
    """Build the phase-2 estimator from a phase-1 checkpoint and evaluate it.

    Parses the command line, looks up the latest checkpoint under
    ``<phase_one>/<method>/rate<rate>`` and passes its path to the selected
    model-conversion ``model_fn`` through the Estimator ``params``. The
    estimator, whose ``model_dir`` is ``<output_path>/<method>/rate<rate>/``,
    is trained with ``max_steps=1`` and then evaluated once.

    Args:
        argv: Full command-line argument list; ``argv[0]`` is skipped before
            parsing.
    """
    parser = ResnetArgParser()
    parser.add_argument(
        '--model_class', '-mc', default='cifar10',
        help="[default: %(default)s] The model you are performing experiment on.",
        metavar='<MC>')
    parser.add_argument(
        '--output_path', '-op', default='/tmp/output',
        help="[default: %(default)s] The location of the estimator model after phase2.",
        metavar='<OP>')
    parser.add_argument(
        '--phase_one', '-pz', default='/tmp/models/cifar10/phase1',
        help="[default: %(default)s] The directory where we stored the results from phase1",
        metavar='<PZ>')

    # Set defaults that are reasonable for this model.
    parser.set_defaults(data_dir='/tmp/cifar10_data',
                        resnet_size=32,
                        batch_size=128,
                        version=2,
                        output_path='/tmp/models/cifar10/phase2',
                        method='cp',
                        scope='cp',
                        rate=0.15,
                        rate_decay='flat')

    flags = parser.parse_args(args=argv[1:])

    # Define the parameters we need for each experiment.
    if flags.model_class == 'cifar10':
        input_fn = cifar_input_fn
        model_conversion_fn = cifar10_model_conversion_fn
    else:
        input_fn = imagenet_input_fn
        model_conversion_fn = imagenet_model_conversion_fn

    data_dir = flags.data_dir
    batch_size = flags.batch_size
    method = flags.method
    compression_rate = flags.rate

    # Latest phase-1 checkpoint for this method / compression rate.
    checkpoint_dir = '%s/%s/rate%s' % (flags.phase_one, method, compression_rate)
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)

    session_config = tf.ConfigProto(device_count={'GPU': 1},
                                    inter_op_parallelism_threads=5,
                                    intra_op_parallelism_threads=10,
                                    allow_soft_placement=True)
    run_config = tf.estimator.RunConfig().replace(
        save_checkpoints_secs=1e9, session_config=session_config)

    # Discard any previous output for this configuration; otherwise create
    # the output directory.
    model_output_dir = "%s/%s/rate%s/" % (flags.output_path, method,
                                          compression_rate)
    if os.path.exists(model_output_dir):
        shutil.rmtree(model_output_dir)
    else:
        os.makedirs(model_output_dir)

    classifier = tf.estimator.Estimator(
        model_fn=model_conversion_fn,
        model_dir=model_output_dir,
        config=run_config,
        params={
            'resnet_size': flags.resnet_size,
            'data_format': None,
            'batch_size': batch_size,
            'multi_gpu': flags.multi_gpu,
            'version': flags.version,
            'checkpoint': checkpoint_file,
            'method': method,
            'scope': flags.scope,
            'rate': compression_rate,
            'rate_decay': flags.rate_decay,
        })

    train_hooks = hooks_helper.get_train_hooks(flags.hooks,
                                               batch_size=batch_size)

    def input_fn_train():
        """Training input pipeline for the selected model class."""
        return input_fn(True, data_dir, batch_size, 1, 10, False)

    def input_fn_eval():
        """Evaluation input pipeline for the selected model class."""
        # Use the input function chosen from --model_class so evaluation reads
        # the same kind of data the model was built for, instead of always
        # reading CIFAR-10.
        return input_fn(False, data_dir, batch_size, 1, 10, False)

    # A single step is enough to have the estimator write its checkpoint.
    classifier.train(input_fn=input_fn_train, hooks=train_hooks, max_steps=1)
    print("phase2 model saved to %s" % model_output_dir)

    eval_results = classifier.evaluate(input_fn=input_fn_eval, steps=None)
    print(eval_results)
def main(argv):
    """Re-save a trained ResNet checkpoint under this project's variable scopes.

    Stage 1 restores the latest checkpoint in ``flags.model_dir``, reads the
    values of its non-Momentum global variables (skipping the first one),
    assigns them in order to the variables of a freshly built
    ``ImagenetModel`` and saves those to an intermediate checkpoint at
    ``<inter_store>/<filename>``.

    Stage 2 builds an Estimator around ``imagenet_model_conversion_fn`` with
    that intermediate checkpoint in its ``params`` and trains it with
    ``max_steps=1`` using ``flags.output_path`` as ``model_dir``.

    Args:
        argv: Full command-line argument list; ``argv[0]`` is skipped before
            parsing.

    Raises:
        ValueError: If no checkpoint can be found in ``flags.model_dir``.
    """
    parser = ResnetArgParser()
    parser.add_argument(
        '--output_path', '-op', default='/tmp/cifar10_model_tensor_based',
        help="[default: %(default)s] The location of the tensorized model of phase0.",
        metavar='<OP>')
    parser.add_argument(
        '--inter_store', '-is', default='/tmp/intermediate_storage/',
        help="[default: %(default)s] The tmp location of intermediate results",
        metavar='<IS>')

    # Set defaults that are reasonable for this model.
    parser.set_defaults(filename='normal_weights.ckpt',
                        method='normal',
                        scope='normal',
                        rate_decay='flat')

    flags = parser.parse_args(args=argv[1:])

    # Save the weights from the original resnet model to our model with
    # modified scopes. The variable names are changed; the two models are
    # assumed to have the same structure.
    source_checkpoint = tf.train.latest_checkpoint(flags.model_dir)
    if source_checkpoint is None:
        # latest_checkpoint returns None when nothing is found; fail with a
        # clear message instead of a TypeError on the string concatenation.
        raise ValueError(
            'No checkpoint found in model_dir: %r' % (flags.model_dir,))

    saver = tf.train.import_meta_graph(source_checkpoint + ".meta")
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        saver.restore(sess, source_checkpoint)
        restored_vars = [
            v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
            if 'Momentum' not in v.name
        ]
        # The first variable is skipped (presumably global_step -- TODO
        # confirm).
        var_p_values = [sess.run(v) for v in restored_vars[1:]]

    tf.reset_default_graph()

    model = ImagenetModel(flags.resnet_size, flags.data_format,
                          version=flags.version)
    dataset = input_fn(is_training=False,
                       data_dir=flags.data_dir,
                       batch_size=flags.batch_size)
    iterator = dataset.make_initializable_iterator()
    next_element = iterator.get_next()
    # The output tensor itself is unused; the call adds the model's variables
    # to the default graph so they can be collected below.
    model(next_element[0], False)

    # Intermediate storage for the renamed weights.
    checkpoint_file = flags.inter_store + "/" + flags.filename
    var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Values are matched to variables purely by position.
        for index, variable in enumerate(var_list):
            sess.run(variable.assign(var_p_values[index]))
        new_saver = tf.train.Saver(var_list)
        new_saver.save(sess, checkpoint_file)

    # Load the weights above (with modified names) into our resnet model and
    # save it via the estimator.
    session_config = tf.ConfigProto(inter_op_parallelism_threads=5,
                                    intra_op_parallelism_threads=10,
                                    allow_soft_placement=True)
    run_config = tf.estimator.RunConfig().replace(
        save_checkpoints_secs=1e9, session_config=session_config)

    # Discard any previous output; otherwise create the output directory.
    output_model_path = flags.output_path
    if os.path.exists(output_model_path):
        shutil.rmtree(output_model_path)
    else:
        os.makedirs(output_model_path)

    classifier = tf.estimator.Estimator(
        model_fn=imagenet_model_conversion_fn,
        model_dir=output_model_path,
        config=run_config,
        params={
            'resnet_size': flags.resnet_size,
            'data_format': flags.data_format,
            'batch_size': flags.batch_size,
            'multi_gpu': True,
            'version': flags.version,
            'checkpoint': checkpoint_file,
            'method': flags.method,
            'scope': flags.scope,
            'rate': flags.rate,
            'rate_decay': flags.rate_decay,
        })

    train_hooks = hooks_helper.get_train_hooks(flags.hooks,
                                               batch_size=flags.batch_size)

    def input_fn_train():
        """Training input pipeline used for the single conversion step."""
        return input_fn(True, flags.data_dir, flags.batch_size, 1, 10, False)

    classifier.train(input_fn=input_fn_train, hooks=train_hooks, max_steps=1)
def test_raise_in_invalid_names(self):
    # Requesting these hook names is expected to raise ValueError.
    invalid_names = ['StepCounterHook', 'StopAtStepHook']
    self.assertRaises(
        ValueError,
        hooks_helper.get_train_hooks,
        invalid_names,
        batch_size=256)
def test_raise_in_non_list_names(self):
    # Passing one comma-separated string instead of a list of names is
    # expected to raise ValueError.
    names_as_string = 'LoggingTensorHook, ProfilerHook'
    self.assertRaises(
        ValueError,
        hooks_helper.get_train_hooks,
        names_as_string,
        batch_size=256)
def resnet_main(flags, model_function, input_function):
    """Alternate training and evaluation cycles for an Estimator-based model.

    Runs ``flags.train_epochs // flags.epochs_between_evals`` cycles. Each
    cycle trains and then evaluates, and progress is both printed and written
    to ``<model_dir>/<method>_<rate>.log``.

    Args:
        flags: Parsed command-line arguments read throughout this function.
        model_function: ``model_fn`` handed to the Estimator; wrapped with
            ``replicate_model_fn`` when ``flags.multi_gpu`` is set.
        input_function: Called positionally as ``(True|False, data_dir,
            batch_size, epoch count, num_parallel_calls, multi_gpu)``; the
            first argument is ``True`` for training and ``False`` for
            evaluation.
    """
    # Using the Winograd non-fused algorithms provides a small performance boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    # The log file lives inside model_dir, so the directory must exist first.
    if not os.path.exists(flags.model_dir):
        os.makedirs(flags.model_dir)
    log_path = '%s/%s_%s.log' % (flags.model_dir, flags.method, flags.rate)
    logging.basicConfig(level=logging.INFO,
                        datefmt='%m-%d %H:%M',
                        filename=log_path,
                        filemode='a+')
    logging.info("Starting end to end training...")

    if flags.multi_gpu:
        validate_batch_size_for_multi_gpu(flags.batch_size)

        # There are two steps required if using multi-GPU: (1) wrap the
        # model_fn, and (2) wrap the optimizer. The first happens here, and
        # (2) happens in the model_fn itself when the optimizer is defined.
        model_function = tf.contrib.estimator.replicate_model_fn(
            model_function, loss_reduction=tf.losses.Reduction.MEAN)

    # Create session config based on values of inter_op_parallelism_threads
    # and intra_op_parallelism_threads. Note that we default to having
    # allow_soft_placement = True, which is required for multi-GPU and not
    # harmful for other modes.
    session_config = tf.ConfigProto(
        inter_op_parallelism_threads=flags.inter_op_parallelism_threads,
        intra_op_parallelism_threads=flags.intra_op_parallelism_threads,
        allow_soft_placement=True)

    # Set up a RunConfig to save checkpoint and set session config.
    run_config = tf.estimator.RunConfig().replace(
        save_checkpoints_secs=1e9, session_config=session_config)

    model_params = {
        'resnet_size': flags.resnet_size,
        'data_format': flags.data_format,
        'batch_size': flags.batch_size,
        'multi_gpu': flags.multi_gpu,
        'version': flags.version,
        'method': flags.method,
        'scope': flags.scope,
        'rate': flags.rate,
        'rate_decay': flags.rate_decay,
    }
    classifier = tf.estimator.Estimator(model_fn=model_function,
                                        model_dir=flags.model_dir,
                                        config=run_config,
                                        params=model_params)

    def input_fn_train():
        """Training input covering one train/eval cycle's worth of epochs."""
        return input_function(True, flags.data_dir, flags.batch_size,
                              flags.epochs_between_evals,
                              flags.num_parallel_calls, flags.multi_gpu)

    def input_fn_eval():
        """Evaluation input: a single pass."""
        return input_function(False, flags.data_dir, flags.batch_size, 1,
                              flags.num_parallel_calls, flags.multi_gpu)

    num_cycles = flags.train_epochs // flags.epochs_between_evals
    for cycle in range(num_cycles):
        train_hooks = hooks_helper.get_train_hooks(
            flags.hooks, batch_size=flags.batch_size)
        current_epoch = (cycle + 1) * flags.epochs_between_evals

        start_message = ('Starting a training cycle up to epoch %d'
                         % current_epoch)
        print(start_message)
        logging.info(start_message)

        classifier.train(input_fn=input_fn_train,
                         hooks=train_hooks,
                         max_steps=flags.max_train_steps)

        eval_message = 'Starting to evaluate.'
        print(eval_message)
        logging.info(eval_message)

        # Evaluate the model and print results.
        # flags.max_train_steps is generally associated with testing and
        # profiling. As a result it is frequently called with synthetic data,
        # which will iterate forever. Passing steps=flags.max_train_steps
        # allows the eval (which is generally unimportant in those
        # circumstances) to terminate. Note that eval will run for
        # max_train_steps each loop, regardless of the global_step count.
        eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                           steps=flags.max_train_steps)

        result_message = ("Testing accuracy on epoch %d: %s"
                          % (current_epoch, eval_results))
        print(result_message)
        logging.info(result_message)