# Example no. 1 (score: 0)
 def testEvalResults(self):
   """Checks that eval output is parsed into the expected Sample list."""
   expected = [
       Sample('Eval Loss', 3.86324, '', self.metadata_output),
       Sample('Top 1 Accuracy', 32.751465, '%', self.metadata_output),
       Sample('Top 5 Accuracy', 58.825684, '%', self.metadata_output),
   ]
   actual = resnet_benchmark.MakeSamplesFromEvalOutput(
       self.metadata_input, self.contents, 0)
   self.assertEqual(actual, expected)
 def testEvalResults(self):
   """Checks that eval output yields loss/top-1/top-5 samples with metadata."""
   samples = resnet_benchmark.MakeSamplesFromEvalOutput(
       self.metadata, self.contents, 0)
   # All three golden samples carry the same expected metadata payload.
   expected_metadata = {
       'epoch': 4.000479577971386,
       'elapsed_seconds': 0,
       'step': 5005,
       'num_examples_per_epoch': 1251.1,
   }
   golden = [
       Sample('Eval Loss', 3.86324, '', expected_metadata),
       Sample('Top 1 Accuracy', 32.751465, '%', expected_metadata),
       Sample('Top 5 Accuracy', 58.825684, '%', expected_metadata),
   ]
   self.assertEqual(samples, golden)
# Example no. 3 (score: 0)
def Run(benchmark_spec):
    """Run Inception V3 on the cluster.

    Builds the benchmark command line once, then runs training and/or
    evaluation in chunks of ``steps_per_eval`` steps, collecting samples
    parsed from the command output after each chunk.

    Args:
      benchmark_spec: The benchmark specification. Contains all data that is
          required to run the benchmark.

    Returns:
      A list of sample.Sample objects.
    """
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    vm = benchmark_spec.vms[0]
    script_path = 'tpu/models/experimental/inception/inception_v3.py'
    base_cmd = (
        '{env_cmd} && python {script} '
        '--learning_rate={learning_rate} '
        '--iterations={iterations} '
        '--use_tpu={use_tpu} '
        '--use_data={use_data} '
        '--train_steps_per_eval={steps_per_eval} '
        '--data_dir={data_dir} '
        '--model_dir={model_dir} '
        '--save_checkpoints_secs={save_checkpoints_secs} '
        '--train_batch_size={train_batch_size} '
        '--eval_batch_size={eval_batch_size} '
        '--precision={precision}'.format(
            env_cmd=benchmark_spec.env_cmd,
            script=script_path,
            learning_rate=benchmark_spec.learning_rate,
            iterations=benchmark_spec.iterations,
            use_tpu=bool(benchmark_spec.tpus),
            use_data=benchmark_spec.use_data,
            steps_per_eval=benchmark_spec.steps_per_eval,
            data_dir=benchmark_spec.data_dir,
            model_dir=benchmark_spec.model_dir,
            save_checkpoints_secs=benchmark_spec.save_checkpoints_secs,
            train_batch_size=benchmark_spec.train_batch_size,
            eval_batch_size=benchmark_spec.eval_batch_size,
            precision=benchmark_spec.precision))
    if FLAGS.tf_device == 'gpu':
        # On GPU the TensorFlow environment variables must prefix the command.
        base_cmd = '{env} {cmd}'.format(
            env=tensorflow.GetEnvironmentVars(vm), cmd=base_cmd)

    def _tpu_args(group):
        # Returns (tpu_name, num_shards_flag) for the given TPU group, or a
        # pair of empty strings when the benchmark runs without TPUs.
        if benchmark_spec.tpus:
            tpu_group = benchmark_spec.tpu_groups[group]
            return tpu_group.GetName(), '--num_shards={}'.format(
                tpu_group.GetNumShards())
        return '', ''

    samples = []
    metadata = _CreateMetadataDict(benchmark_spec)
    elapsed_seconds = 0
    steps_per_eval = benchmark_spec.steps_per_eval
    train_steps = benchmark_spec.train_steps
    for raw_step in range(steps_per_eval, train_steps + steps_per_eval,
                          steps_per_eval):
        # Clamp the final chunk so we never exceed the total step budget.
        step = min(raw_step, train_steps)
        step_cmd = '{cmd} --train_steps={step}'.format(
            cmd=base_cmd, step=step)
        if benchmark_spec.mode in ('train', 'train_and_eval'):
            tpu, num_shards = _tpu_args('train')
            train_cmd = '{cmd} --tpu={tpu} --mode=train {num_shards}'.format(
                cmd=step_cmd, tpu=tpu, num_shards=num_shards)
            start = time.time()
            stdout, stderr = vm.RobustRemoteCommand(
                train_cmd, should_log=True)
            # Only training wall time is accumulated; eval time is excluded
            # (presumably intentional — matches the original accounting).
            elapsed_seconds += (time.time() - start)
            samples.extend(
                mnist_benchmark.MakeSamplesFromTrainOutput(
                    metadata, stdout + stderr, elapsed_seconds, step))
        if benchmark_spec.mode in ('train_and_eval', 'eval'):
            tpu, num_shards = _tpu_args('eval')
            eval_cmd = '{cmd} --tpu={tpu} --mode=eval {num_shards}'.format(
                cmd=step_cmd, tpu=tpu, num_shards=num_shards)
            stdout, stderr = vm.RobustRemoteCommand(
                eval_cmd, should_log=True)
            samples.extend(
                resnet_benchmark.MakeSamplesFromEvalOutput(
                    metadata, stdout + stderr, elapsed_seconds))
    return samples