def testTrainResults(self):
  samples = mnist_benchmark.MakeSamplesFromTrainOutput(
      self.metadata, self.contents, 0)
  for s in samples:
    print(s)
  golden = [
      Sample('Loss', 5.7193503, '',
             {'num_examples_per_epoch': 1251.1,
              'epoch': 3.197186475901207,
              'elapsed seconds': 0,
              'step': 4000}),
      Sample('Global Steps Per Second', 1.4384171428571428,
             'global_steps/sec',
             {'num_examples_per_epoch': 1251.1,
              'epoch': 3.197186475901207,
              'elapsed seconds': 0,
              'step': 4000}),
      Sample('Examples Per Second', 1472.9414285714283, 'examples/sec',
             {'num_examples_per_epoch': 1251.1,
              'epoch': 3.197186475901207,
              'elapsed seconds': 0,
              'step': 4000})
  ]
  self.assertEqual(samples, golden)
def testTrainResults(self):
  samples = mnist_benchmark.MakeSamplesFromTrainOutput(
      self.metadata_input, self.contents, 0)
  golden = [
      Sample('Loss', 0.09562386, '', self.metadata_output),
      Sample('Global Steps Per Second', 217.69966666666664,
             'global_steps/sec', self.metadata_output),
      Sample('Examples Per Second', 222924.33333333334,
             'examples/sec', self.metadata_output)
  ]
  self.assertEqual(samples, golden)
def testTrainResults(self):
  samples = mnist_benchmark.MakeSamplesFromTrainOutput(
      self.metadata_input, self.contents, 0)
  golden = [
      Sample('Loss', 3.6859958, '', self.metadata_output),
      Sample('Global Steps Per Second', 3.6699466666666667,
             'global_steps/sec', self.metadata_output),
      Sample('Examples Per Second', 3758.023333333333,
             'examples/sec', self.metadata_output)
  ]
  self.assertEqual(samples, golden)
def testTrainResults(self):
  samples = mnist_benchmark.MakeSamplesFromTrainOutput(
      self.metadata_input, self.contents, 0)
  golden = [
      Sample('Loss', 5.7193503, '', self.metadata_output),
      Sample('Global Steps Per Second', 1.4384171428571428,
             'global_steps/sec', self.metadata_output),
      Sample('Examples Per Second', 1472.9414285714283,
             'examples/sec', self.metadata_output)
  ]
  self.assertEqual(samples, golden)
def testTrainResults(self):
  samples = mnist_benchmark.MakeSamplesFromTrainOutput(
      self.metadata, self.contents, 0)
  golden = [
      Sample('Loss', 0.09562386, '',
             {'num_examples_per_epoch': 1251.1, 'step': 2000,
              'elapsed seconds': 0, 'epoch': 1.5985932379506036}),
      Sample('Global Steps Per Second', 217.69966666666664,
             'global_steps/sec',
             {'num_examples_per_epoch': 1251.1, 'step': 2000,
              'elapsed seconds': 0, 'epoch': 1.5985932379506036}),
      Sample('Examples Per Second', 222924.33333333334, 'examples/sec',
             {'num_examples_per_epoch': 1251.1, 'step': 2000,
              'elapsed seconds': 0, 'epoch': 1.5985932379506036})
  ]
  self.assertEqual(samples, golden)
def testTrainResults(self):
  samples = mnist_benchmark.MakeSamplesFromTrainOutput(
      self.metadata, self.contents, 0)
  golden = [
      Sample('Loss', 3.6859958, '',
             {'epoch': 4.000479577971386, 'elapsed seconds': 0,
              'num_examples_per_epoch': 1251.1, 'step': 5005}),
      Sample('Global Steps Per Second', 3.6699466666666667,
             'global_steps/sec',
             {'epoch': 4.000479577971386, 'elapsed seconds': 0,
              'num_examples_per_epoch': 1251.1, 'step': 5005}),
      Sample('Examples Per Second', 3758.023333333333, 'examples/sec',
             {'epoch': 4.000479577971386, 'elapsed seconds': 0,
              'num_examples_per_epoch': 1251.1, 'step': 5005})
  ]
  self.assertEqual(samples, golden)
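# The golden metadata in the tests above is internally consistent: 'epoch'
# is simply 'step' divided by 'num_examples_per_epoch' (e.g.
# 4000 / 1251.1 == 3.197186475901207 and 5005 / 1251.1 == 4.000479577971386).
# A minimal sketch of that relationship, assuming MakeSamplesFromTrainOutput
# derives its metadata this way; the helper below is a hypothetical
# illustration, not the benchmark's actual code.
def _TrainSampleMetadata(step, num_examples_per_epoch, elapsed_seconds):
  """Reconstructs the per-sample metadata dict seen in the goldens above."""
  return {
      'step': step,
      'num_examples_per_epoch': num_examples_per_epoch,
      'epoch': step / float(num_examples_per_epoch),
      'elapsed seconds': elapsed_seconds,
  }

# Sanity check against the golden values in the first test:
assert abs(_TrainSampleMetadata(4000, 1251.1, 0)['epoch']
           - 3.197186475901207) < 1e-9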
def Run(benchmark_spec): """Run ResNet on the cluster. Args: benchmark_spec: The benchmark specification. Contains all data that is required to run the benchmark. Returns: A list of sample.Sample objects. """ _UpdateBenchmarkSpecWithFlags(benchmark_spec) vm = benchmark_spec.vms[0] resnet_benchmark_script = 'resnet_main.py' resnet_benchmark_cmd = ( '{env_cmd} && cd tpu/models/official/resnet && ' 'python {script} ' '--use_tpu={use_tpu} ' '--data_dir={data_dir} ' '--model_dir={model_dir} ' '--resnet_depth={depth} ' '--train_batch_size={train_batch_size} ' '--eval_batch_size={eval_batch_size} ' '--iterations_per_loop={iterations} ' '--data_format={data_format} ' '--precision={precision} ' '--skip_host_call={skip_host_call} ' '--num_train_images={num_train_images} ' '--num_eval_images={num_eval_images}'.format( env_cmd=benchmark_spec.env_cmd, script=resnet_benchmark_script, use_tpu=bool(benchmark_spec.tpus), data_dir=benchmark_spec.data_dir, model_dir=benchmark_spec.model_dir, depth=benchmark_spec.depth, train_batch_size=benchmark_spec.train_batch_size, eval_batch_size=benchmark_spec.eval_batch_size, iterations=benchmark_spec.iterations, data_format=benchmark_spec.data_format, precision=benchmark_spec.precision, skip_host_call=benchmark_spec.skip_host_call, num_train_images=benchmark_spec.num_train_images, num_eval_images=benchmark_spec.num_eval_images )) if FLAGS.tf_device == 'gpu': resnet_benchmark_cmd = '{env} {cmd}'.format( env=tensorflow.GetEnvironmentVars(vm), cmd=resnet_benchmark_cmd) samples = [] metadata = _CreateMetadataDict(benchmark_spec) elapsed_seconds = 0 steps_per_eval = benchmark_spec.steps_per_eval train_steps = benchmark_spec.train_steps for step in range(steps_per_eval, train_steps + steps_per_eval, steps_per_eval): step = min(step, train_steps) resnet_benchmark_cmd_step = '{cmd} --train_steps={step}'.format( cmd=resnet_benchmark_cmd, step=step) if benchmark_spec.mode in ('train', 'train_and_eval'): if benchmark_spec.tpus: tpu = benchmark_spec.tpu_groups['train'].GetName() num_cores = '--num_cores={}'.format( benchmark_spec.tpu_groups['train'].GetNumShards()) else: tpu = num_cores = '' resnet_benchmark_train_cmd = ( '{cmd} --tpu={tpu} --mode=train {num_cores}'.format( cmd=resnet_benchmark_cmd_step, tpu=tpu, num_cores=num_cores)) start = time.time() stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_train_cmd, should_log=True) elapsed_seconds += (time.time() - start) samples.extend(mnist_benchmark.MakeSamplesFromTrainOutput( metadata, stdout + stderr, elapsed_seconds, step)) if benchmark_spec.mode in ('train_and_eval', 'eval'): if benchmark_spec.tpus: tpu = benchmark_spec.tpu_groups['eval'].GetName() num_cores = '--num_cores={}'.format( benchmark_spec.tpu_groups['eval'].GetNumShards()) else: tpu = num_cores = '' resnet_benchmark_eval_cmd = ( '{cmd} --tpu={tpu} --mode=eval {num_cores}'.format( cmd=resnet_benchmark_cmd_step, tpu=tpu, num_cores=num_cores)) stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_eval_cmd, should_log=True) samples.extend(MakeSamplesFromEvalOutput( metadata, stdout + stderr, elapsed_seconds)) return samples
def Run(benchmark_spec): """Run Inception V3 on the cluster. Args: benchmark_spec: The benchmark specification. Contains all data that is required to run the benchmark. Returns: A list of sample.Sample objects. """ _UpdateBenchmarkSpecWithFlags(benchmark_spec) vm = benchmark_spec.vms[0] inception3_benchmark_script = ( 'tpu/models/experimental/inception/inception_v3.py') inception3_benchmark_cmd = ( '{env_cmd} && python {script} ' '--learning_rate={learning_rate} ' '--iterations={iterations} ' '--use_tpu={use_tpu} ' '--use_data={use_data} ' '--train_steps_per_eval={steps_per_eval} ' '--data_dir={data_dir} ' '--model_dir={model_dir} ' '--save_checkpoints_secs={save_checkpoints_secs} ' '--train_batch_size={train_batch_size} ' '--eval_batch_size={eval_batch_size} ' '--precision={precision}'.format( env_cmd=benchmark_spec.env_cmd, script=inception3_benchmark_script, learning_rate=benchmark_spec.learning_rate, iterations=benchmark_spec.iterations, use_tpu=bool(benchmark_spec.tpus), use_data=benchmark_spec.use_data, steps_per_eval=benchmark_spec.steps_per_eval, data_dir=benchmark_spec.data_dir, model_dir=benchmark_spec.model_dir, save_checkpoints_secs=benchmark_spec.save_checkpoints_secs, train_batch_size=benchmark_spec.train_batch_size, eval_batch_size=benchmark_spec.eval_batch_size, precision=benchmark_spec.precision)) if FLAGS.tf_device == 'gpu': inception3_benchmark_cmd = '{env} {cmd}'.format( env=tensorflow.GetEnvironmentVars(vm), cmd=inception3_benchmark_cmd) samples = [] metadata = _CreateMetadataDict(benchmark_spec) elapsed_seconds = 0 steps_per_eval = benchmark_spec.steps_per_eval train_steps = benchmark_spec.train_steps for step in range(steps_per_eval, train_steps + steps_per_eval, steps_per_eval): step = min(step, train_steps) inception3_benchmark_cmd_step = '{cmd} --train_steps={step}'.format( cmd=inception3_benchmark_cmd, step=step) if benchmark_spec.mode in ('train', 'train_and_eval'): if benchmark_spec.tpus: tpu = benchmark_spec.tpu_groups['train'].GetName() num_shards = '--num_shards={}'.format( benchmark_spec.tpu_groups['train'].GetNumShards()) else: tpu = num_shards = '' inception3_benchmark_train_cmd = ( '{cmd} --tpu={tpu} --mode=train {num_shards}'.format( cmd=inception3_benchmark_cmd_step, tpu=tpu, num_shards=num_shards)) start = time.time() stdout, stderr = vm.RobustRemoteCommand( inception3_benchmark_train_cmd, should_log=True) elapsed_seconds += (time.time() - start) samples.extend( mnist_benchmark.MakeSamplesFromTrainOutput( metadata, stdout + stderr, elapsed_seconds, step)) if benchmark_spec.mode in ('train_and_eval', 'eval'): if benchmark_spec.tpus: tpu = benchmark_spec.tpu_groups['eval'].GetName() num_shards = '--num_shards={}'.format( benchmark_spec.tpu_groups['eval'].GetNumShards()) else: tpu = num_shards = '' inception3_benchmark_eval_cmd = ( '{cmd} --tpu={tpu} --mode=eval {num_shards}'.format( cmd=inception3_benchmark_cmd_step, tpu=tpu, num_shards=num_shards)) stdout, stderr = vm.RobustRemoteCommand( inception3_benchmark_eval_cmd, should_log=True) samples.extend( resnet_benchmark.MakeSamplesFromEvalOutput( metadata, stdout + stderr, elapsed_seconds)) return samples
def Run(benchmark_spec): """Run ResNet on the cluster. Args: benchmark_spec: The benchmark specification. Contains all data that is required to run the benchmark. Returns: A list of sample.Sample objects. """ _UpdateBenchmarkSpecWithFlags(benchmark_spec) vm = benchmark_spec.vms[0] if benchmark_spec.tpus: resnet_benchmark_script = 'resnet_main.py' resnet_benchmark_cmd = ( '{env_cmd} && ' 'cd tpu/models && ' 'export PYTHONPATH=$(pwd) &&' 'cd official/resnet && ' 'python {script} ' '--use_tpu={use_tpu} ' '--data_dir={data_dir} ' '--model_dir={model_dir} ' '--resnet_depth={depth} ' '--train_batch_size={train_batch_size} ' '--eval_batch_size={eval_batch_size} ' '--iterations_per_loop={iterations} ' '--data_format={data_format} ' '--precision={precision} ' '--skip_host_call={skip_host_call} ' '--num_train_images={num_train_images} ' '--num_eval_images={num_eval_images}'.format( env_cmd=benchmark_spec.env_cmd, script=resnet_benchmark_script, use_tpu=bool(benchmark_spec.tpus), data_dir=benchmark_spec.data_dir, model_dir=benchmark_spec.model_dir, depth=benchmark_spec.depth, train_batch_size=benchmark_spec.train_batch_size, eval_batch_size=benchmark_spec.eval_batch_size, iterations=benchmark_spec.iterations, data_format=benchmark_spec.data_format, precision=benchmark_spec.precision, skip_host_call=benchmark_spec.skip_host_call, num_train_images=benchmark_spec.num_train_images, num_eval_images=benchmark_spec.num_eval_images)) else: resnet_benchmark_script = 'imagenet_main.py' resnet_benchmark_cmd = ('{env_cmd} && ' 'cd models && ' 'export PYTHONPATH=$(pwd) && ' 'cd official/r1/resnet && ' 'python {script} ' '--data_dir=/data/imagenet ' '--model_dir={model_dir} ' '--resnet_size={resnet_size} ' '--batch_size={batch_size} ' '--data_format={data_format} '.format( env_cmd=benchmark_spec.env_cmd, script=resnet_benchmark_script, model_dir=benchmark_spec.model_dir, resnet_size=benchmark_spec.depth, batch_size=benchmark_spec.train_batch_size, data_format=benchmark_spec.data_format)) precision = '{precision}'.format(precision=benchmark_spec.precision) if precision == 'bfloat16': resnet_benchmark_cmd = '{cmd} --dtype=fp16'.format( cmd=resnet_benchmark_cmd) else: resnet_benchmark_cmd = '{cmd} --dtype=fp32'.format( cmd=resnet_benchmark_cmd) if nvidia_driver.CheckNvidiaGpuExists(vm): resnet_benchmark_cmd = '{env} {cmd} --num_gpus={num_gpus}'.format( env=tensorflow.GetEnvironmentVars(vm), cmd=resnet_benchmark_cmd, num_gpus=nvidia_driver.QueryNumberOfGpus(vm)) samples = [] metadata = _CreateMetadataDict(benchmark_spec) elapsed_seconds = 0 steps_per_eval = benchmark_spec.steps_per_eval train_steps = benchmark_spec.train_steps for step in range(steps_per_eval, train_steps + steps_per_eval, steps_per_eval): step = min(step, train_steps) resnet_benchmark_cmd_step = '{cmd} --train_steps={step}'.format( cmd=resnet_benchmark_cmd, step=step) if benchmark_spec.mode in ('train', 'train_and_eval'): if benchmark_spec.tpus: tpu = benchmark_spec.tpu_groups['train'].GetName() num_cores = '--num_cores={}'.format( benchmark_spec.tpu_groups['train'].GetNumShards()) resnet_benchmark_train_cmd = ( '{cmd} --tpu={tpu} --mode=train {num_cores}'.format( cmd=resnet_benchmark_cmd_step, tpu=tpu, num_cores=num_cores)) else: resnet_benchmark_train_cmd = ( '{cmd} --max_train_steps={max_train_steps} ' '--train_epochs={train_epochs} --noeval_only'.format( cmd=resnet_benchmark_cmd, train_epochs=benchmark_spec.epochs_per_eval, max_train_steps=step)) start = time.time() stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_train_cmd, 
should_log=True) elapsed_seconds += (time.time() - start) samples.extend(mnist_benchmark.MakeSamplesFromTrainOutput( metadata, stdout + stderr, elapsed_seconds, step)) if benchmark_spec.mode in ('train_and_eval', 'eval'): if benchmark_spec.tpus: tpu = benchmark_spec.tpu_groups['eval'].GetName() num_cores = '--num_cores={}'.format( benchmark_spec.tpu_groups['eval'].GetNumShards()) resnet_benchmark_eval_cmd = ( '{cmd} --tpu={tpu} --mode=eval {num_cores}'.format( cmd=resnet_benchmark_cmd_step, tpu=tpu, num_cores=num_cores)) else: resnet_benchmark_eval_cmd = ('{cmd} --eval_only'.format( cmd=resnet_benchmark_cmd)) stdout, stderr = vm.RobustRemoteCommand(resnet_benchmark_eval_cmd, should_log=True) samples.extend( MakeSamplesFromEvalOutput( metadata, stdout + stderr, elapsed_seconds, use_tpu=bool(benchmark_spec.tpus))) return samples
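# MakeSamplesFromEvalOutput itself is not shown in this section. As a rough
# sketch of what such a parser could look like -- assuming the eval log
# contains a TF Estimator summary line of the form
# "Saving dict for global step N: loss = X, top_1_accuracy = Y, ..." --
# the regex and helper name below are assumptions, not the benchmark's
# actual implementation.
import re

def _ParseEvalMetrics(output):
  """Extracts 'name = value' pairs from an Estimator eval summary line."""
  match = re.search(r'Saving dict for global step \d+: (.*)', output)
  if not match:
    return {}
  return dict((name, float(value)) for name, value in
              re.findall(r'(\w+) = ([-+eE.\d]+)', match.group(1)))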