Example #1
    def testAutoProfiling(self):
        ops.reset_default_graph()
        time_dir = os.path.join(test.get_temp_dir(), 'time')
        memory_dir = os.path.join(test.get_temp_dir(), 'memory')
        profile_dir = os.path.join(test.get_temp_dir(), 'dir/dir2/profile')
        # TODO(xpan): Should we create parent directory for them?
        gfile.MkDir(time_dir)
        gfile.MkDir(memory_dir)

        time_opts = (builder(builder.time_and_memory()).with_file_output(
            os.path.join(time_dir, 'profile')).select(['micros']).build())
        memory_opts = (builder(builder.time_and_memory()).with_file_output(
            os.path.join(memory_dir, 'profile')).select(['bytes']).build())

        time_steps = [2, 3]
        memory_steps = [1, 3]
        dump_steps = [3, 4]

        x = lib.BuildSmallModel()
        with profile_context.ProfileContext(profile_dir,
                                            trace_steps=[1, 2, 3],
                                            dump_steps=[3, 4]) as pctx:
            pctx.add_auto_profiling('scope', time_opts, time_steps)
            pctx.add_auto_profiling('scope', memory_opts, memory_steps)

            self._trainLoop(x, 10, time_dir, time_steps, memory_dir,
                            memory_steps, profile_dir, dump_steps)
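
The test above comes from TensorFlow's profile_context test suite. As a rough sketch of the same auto-profiling pattern outside a test harness (the build_model helper, the /tmp/profile directory, and the step numbers below are illustrative placeholders, not part of the original test), a TF 1.x graph-mode script might look like this:

import os
import tensorflow as tf
from tensorflow.python.profiler import option_builder
from tensorflow.python.profiler import profile_context

# Hypothetical model; any graph-building function would do here.
def build_model():
    w = tf.Variable(tf.random_normal([32, 32]))
    return tf.reduce_sum(tf.matmul(w, w))

profile_dir = '/tmp/profile'  # placeholder output directory
tf.gfile.MakeDirs(profile_dir)

opts = (option_builder.ProfileOptionBuilder(
            option_builder.ProfileOptionBuilder.time_and_memory())
        .with_file_output(os.path.join(profile_dir, 'time'))
        .select(['micros'])
        .build())

loss = build_model()
with profile_context.ProfileContext(profile_dir,
                                    trace_steps=[1, 2, 3],
                                    dump_steps=[3]) as pctx:
    # Write a 'scope' report at the chosen steps, mirroring the test above.
    pctx.add_auto_profiling('scope', opts, [2, 3])
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for _ in range(5):
            sess.run(loss)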
Example #2
  def testComplexCodeView(self):
    ops.reset_default_graph()
    outfile = os.path.join(test.get_temp_dir(), 'dump')
    opts = (builder(builder.trainable_variables_parameter())
            .with_file_output(outfile)
            .with_accounted_types(['.*'])
            .with_node_names(show_name_regexes=
                             ['.*model_analyzer_testlib.py.*'])
            .account_displayed_op_only(False)
            .select(['params', 'float_ops']).build())

    with profile_context.ProfileContext(test.get_temp_dir(),
                                        trace_steps=[],
                                        dump_steps=[]) as pctx:
      with session.Session() as sess:
        x = lib.BuildFullModel()

        sess.run(variables.global_variables_initializer())
        pctx.trace_next_step()
        _ = sess.run(x)
        tfprof_node = pctx.profiler.profile_python(options=opts)

        # pylint: disable=line-too-long
        with gfile.Open(outfile, 'r') as f:
          lines = f.read().split('\n')
          self.assertGreater(len(lines), 5)
          result = '\n'.join([l[:min(len(l), 80)] for l in lines])
          self.assertTrue(
              compat.as_text(lib.CheckAndRemoveDoc(result))
              .startswith('node name | # parameters | # float_ops'))

        self.assertLess(0, tfprof_node.total_exec_micros)
        self.assertEqual(2844, tfprof_node.total_parameters)
        self.assertLess(168800, tfprof_node.total_float_ops)
        self.assertEqual(8, len(tfprof_node.children))
        self.assertEqual('_TFProfRoot', tfprof_node.name)
        self.assertEqual(
            'model_analyzer_testlib.py:63:BuildFullModel',
            tfprof_node.children[0].name)
        self.assertEqual(
            'model_analyzer_testlib.py:63:BuildFullModel (gradient)',
            tfprof_node.children[1].name)
        self.assertEqual(
            'model_analyzer_testlib.py:67:BuildFullModel',
            tfprof_node.children[2].name)
        self.assertEqual(
            'model_analyzer_testlib.py:67:BuildFullModel (gradient)',
            tfprof_node.children[3].name)
        self.assertEqual(
            'model_analyzer_testlib.py:69:BuildFullModel',
            tfprof_node.children[4].name)
        self.assertEqual(
            'model_analyzer_testlib.py:70:BuildFullModel',
            tfprof_node.children[5].name)
        self.assertEqual(
            'model_analyzer_testlib.py:70:BuildFullModel (gradient)',
            tfprof_node.children[6].name)
        self.assertEqual(
            'model_analyzer_testlib.py:72:BuildFullModel',
            tfprof_node.children[7].name)
Example #3
    def testBasics(self):
        ops.reset_default_graph()
        outfile = os.path.join(test.get_temp_dir(), "dump")
        opts = builder(
            builder.time_and_memory()).with_file_output(outfile).build()

        x = lib.BuildFullModel()

        profile_str = None
        profile_step50 = os.path.join(test.get_temp_dir(), "profile_50")
        with profile_context.ProfileContext(test.get_temp_dir()) as pctx:
            pctx.add_auto_profiling("op",
                                    options=opts,
                                    profile_steps=[15, 50, 100])
            with session.Session() as sess:
                sess.run(variables.global_variables_initializer())
                total_steps = 101 if test.is_gpu_available() else 50
                for i in range(total_steps):
                    sess.run(x)
                    if i == 14 or i == 99:
                        self.assertTrue(gfile.Exists(outfile))
                        gfile.Remove(outfile)
                    if i == 49:
                        self.assertTrue(gfile.Exists(profile_step50))
                        with gfile.Open(outfile, "r") as f:
                            profile_str = f.read()
                        gfile.Remove(outfile)

        with lib.ProfilerFromFile(
                os.path.join(test.get_temp_dir(), "profile_50")) as profiler:
            profiler.profile_operations(options=opts)
            with gfile.Open(outfile, "r") as f:
                self.assertEqual(profile_str, f.read())
Example #4
  def testComplexCodeView(self):
    ops.reset_default_graph()
    outfile = os.path.join(test.get_temp_dir(), 'dump')
    opts = (builder(builder.trainable_variables_parameter())
            .with_file_output(outfile)
            .with_accounted_types(['.*'])
            .with_node_names(show_name_regexes=
                             ['.*model_analyzer_testlib.py.*'])
            .account_displayed_op_only(False)
            .select(['params', 'float_ops']).build())

    with profile_context.ProfileContext(test.get_temp_dir(),
                                        trace_steps=[],
                                        dump_steps=[]) as pctx:
      with session.Session() as sess:
        x = lib.BuildFullModel()

        sess.run(variables.global_variables_initializer())
        pctx.trace_next_step()
        _ = sess.run(x)
        tfprof_node = pctx.profiler.profile_python(options=opts)

        # pylint: disable=line-too-long
        with gfile.Open(outfile, 'r') as f:
          lines = f.read().split('\n')
          result = '\n'.join([l[:min(len(l), 80)] for l in lines])
          self.assertEqual(compat.as_bytes('node name | # parameters | # float_ops\n_TFProfRoot (--/2.84k params, --/168.85k flops)\n  model_analyzer_testlib.py:63:BuildFullModel (0/1.80k params, 0/45.37k flops)\n    model_analyzer_testlib.py:40:BuildSmallModel (0/0 params, 0/0 flops)\n    model_analyzer_testlib.py:44:BuildSmallModel (0/4 params, 0/8 flops)\n    model_analyzer_testlib.py:48:BuildSmallModel (0/648 params, 0/1.30k flops)\n    model_analyzer_testlib.py:49:BuildSmallModel (0/0 params, 0/23.33k flops)\n    model_analyzer_testlib.py:53:BuildSmallModel (0/1.15k params, 0/2.30k flops)\n    model_analyzer_testlib.py:54:BuildSmallModel (0/0 params, 0/18.43k flops)\n  model_analyzer_testlib.py:63:BuildFullModel (gradient) (0/0 params, 0/67.39k f\n    model_analyzer_testlib.py:49:BuildSmallModel (gradient) (0/0 params, 0/46.66\n    model_analyzer_testlib.py:54:BuildSmallModel (gradient) (0/0 params, 0/20.74\n  model_analyzer_testlib.py:67:BuildFullModel (0/1.04k params, 0/18.57k flops)\n  model_analyzer_testlib.py:67:BuildFullModel (gradient) (0/0 params, 0/37.00k f\n  model_analyzer_testlib.py:69:BuildFullModel (0/0 params, 0/0 flops)\n  model_analyzer_testlib.py:70:BuildFullModel (0/0 params, 0/258 flops)\n  model_analyzer_testlib.py:70:BuildFullModel (gradient) (0/0 params, 0/129 flop\n  model_analyzer_testlib.py:72:BuildFullModel (0/0 params, 0/141 flops)\n'),
                           compat.as_bytes(result))

        self.assertLess(0, tfprof_node.total_exec_micros)
        self.assertEqual(2844, tfprof_node.total_parameters)
        self.assertEqual(168854, tfprof_node.total_float_ops)
        self.assertEqual(8, len(tfprof_node.children))
        self.assertEqual('_TFProfRoot', tfprof_node.name)
        self.assertEqual(
            'model_analyzer_testlib.py:63:BuildFullModel',
            tfprof_node.children[0].name)
        self.assertEqual(
            'model_analyzer_testlib.py:63:BuildFullModel (gradient)',
            tfprof_node.children[1].name)
        self.assertEqual(
            'model_analyzer_testlib.py:67:BuildFullModel',
            tfprof_node.children[2].name)
        self.assertEqual(
            'model_analyzer_testlib.py:67:BuildFullModel (gradient)',
            tfprof_node.children[3].name)
        self.assertEqual(
            'model_analyzer_testlib.py:69:BuildFullModel',
            tfprof_node.children[4].name)
        self.assertEqual(
            'model_analyzer_testlib.py:70:BuildFullModel',
            tfprof_node.children[5].name)
        self.assertEqual(
            'model_analyzer_testlib.py:70:BuildFullModel (gradient)',
            tfprof_node.children[6].name)
        self.assertEqual(
            'model_analyzer_testlib.py:72:BuildFullModel',
            tfprof_node.children[7].name)
Example #5
  def testDisabled(self):
    ops.reset_default_graph()
    x = lib.BuildFullModel()
    with profile_context.ProfileContext(test.get_temp_dir(),
                                        enabled=False) as pctx:
      with session.Session() as sess:
        self.evaluate(variables.global_variables_initializer())
        for _ in range(10):
          self.evaluate(x)
      self.assertTrue(pctx.profiler is None)
      self.assertTrue(
          getattr(session.BaseSession, "profile_context", None) is None)

    with profile_context.ProfileContext(test.get_temp_dir()) as pctx:
      with session.Session() as sess:
        self.evaluate(variables.global_variables_initializer())
        for _ in range(10):
          self.evaluate(x)
      self.assertFalse(pctx.profiler is None)
      self.assertFalse(
          getattr(session.BaseSession, "profile_context", None) is None)
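
As the test above suggests, profiling can be switched off without changing the surrounding training code by passing enabled=False. A minimal sketch of that switch (the PROFILE_ENABLED environment variable and the output directory are illustrative assumptions, not part of the library):

import os
from tensorflow.python.profiler import profile_context

# Hypothetical switch: only profile when PROFILE_ENABLED=1 is set.
enabled = os.environ.get('PROFILE_ENABLED', '0') == '1'
with profile_context.ProfileContext('/tmp/profile_dir', enabled=enabled) as pctx:
    # When enabled=False, pctx.profiler is None, so guard any direct use of it.
    if pctx.profiler is not None:
        pass  # e.g. pctx.add_auto_profiling(...) calls would go here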
Example #6
    def testBasics(self):
        ops.reset_default_graph()
        outfile = os.path.join(test.get_temp_dir(), "dump")
        opts = builder(
            builder.time_and_memory()).with_file_output(outfile).build()

        x = lib.BuildFullModel()

        profile_str = None
        profile_step100 = os.path.join(test.get_temp_dir(), "profile_100")
        with profile_context.ProfileContext(test.get_temp_dir()) as pctx:
            pctx.add_auto_profiling("op",
                                    options=opts,
                                    profile_steps=[15, 50, 100])
            with session.Session() as sess:
                sess.run(variables.global_variables_initializer())
                total_steps = 101
                for i in range(total_steps):
                    sess.run(x)
                    if i == 14 or i == 49:
                        self.assertTrue(gfile.Exists(outfile))
                        gfile.Remove(outfile)
                    if i == 99:
                        self.assertTrue(gfile.Exists(profile_step100))
                        with gfile.Open(outfile, "r") as f:
                            profile_str = f.read()
                        gfile.Remove(outfile)

            self.assertEqual(set([15, 50, 100]),
                             set(pctx.get_profiles("op").keys()))

        with lib.ProfilerFromFile(
                os.path.join(test.get_temp_dir(), "profile_100")) as profiler:
            profiler.profile_operations(options=opts)
            with gfile.Open(outfile, "r") as f:

                if test.is_built_with_rocm():
                    # The profiler output in ROCm mode includes an extra warning
                    # about the lack of stream tracing in ROCm, so skip that
                    # warning when doing the diff in ROCm mode.
                    profile_str = "\n".join(profile_str.split("\n")[7:])

                self.assertEqual(profile_str, f.read())
Example #7
  def testAutoTracingInDebugMode(self):
    ops.reset_default_graph()
    x = lib.BuildFullModel()

    with profile_context.ProfileContext(test.get_temp_dir(), debug=True):
      with session.Session() as sess:
        self.evaluate(variables.global_variables_initializer())
        for _ in range(10):
          self.evaluate(x)
          for f in gfile.ListDirectory(test.get_temp_dir()):
            # Warm up, no tracing.
            self.assertFalse("run_meta" in f)
        self.evaluate(x)
        self.assertTrue(
            gfile.Exists(os.path.join(test.get_temp_dir(), "run_meta_11")))
        gfile.Remove(os.path.join(test.get_temp_dir(), "run_meta_11"))
        # Trace already fetched, so later steps write no new run_meta files.
        self.evaluate(x)
        for f in gfile.ListDirectory(test.get_temp_dir()):
          self.assertFalse("run_meta" in f)
Example #8
    def testSelectEverythingDetail(self):
        ops.reset_default_graph()
        dev = '/device:GPU:0' if test.is_gpu_available() else '/device:CPU:0'
        outfile = os.path.join(test.get_temp_dir(), 'dump')
        opts = (builder(
            builder.trainable_variables_parameter()).with_file_output(
                outfile).with_accounted_types(['.*']).select([
                    'micros', 'bytes', 'params', 'float_ops', 'occurrence',
                    'device', 'op_types', 'input_shapes'
                ]).build())

        with profile_context.ProfileContext(test.get_temp_dir(),
                                            trace_steps=[],
                                            dump_steps=[]) as pctx:
            with session.Session() as sess, ops.device(dev):
                x = lib.BuildSmallModel()

                sess.run(variables.global_variables_initializer())
                pctx.trace_next_step()
                pctx.dump_next_step()
                _ = sess.run(x)

                pctx.profiler.profile_name_scope(options=opts)

                with gfile.Open(outfile, 'r') as f:
                    # pylint: disable=line-too-long
                    dump_str = lib.CheckAndRemoveDoc(f.read())
                    outputs = dump_str.split('\n')

                    self.assertEqual(
                        outputs[0],
                        'node name | # parameters | # float_ops | requested bytes | total execution time | accelerator execution time | cpu execution time | assigned devices | op types | op count (run|defined) | input shapes'
                    )
                    for o in outputs[1:]:
                        if o.find('Conv2D ') > 0:
                            metrics = o[o.find('(') + 1:o.find(')')].split(',')
                            # Make sure time is profiled.
                            gap = 1 if test.is_gpu_available() else 2
                            for i in range(3, 6, gap):
                                mat = re.search('(.*)[um]s/(.*)[um]s',
                                                metrics[i])
                                self.assertGreater(float(mat.group(1)), 0.0)
                                self.assertGreater(float(mat.group(2)), 0.0)
                            # Make sure device is profiled.
                            if test.is_gpu_available():
                                self.assertTrue(metrics[6].find('gpu') > 0)
                                self.assertFalse(metrics[6].find('cpu') > 0)
                            else:
                                self.assertFalse(metrics[6].find('gpu') > 0)
                                self.assertTrue(metrics[6].find('cpu') > 0)
                            # Make sure float_ops is profiled.
                            mat = re.search('(.*)k/(.*)k flops',
                                            metrics[1].strip())
                            self.assertGreater(float(mat.group(1)), 0.0)
                            self.assertGreater(float(mat.group(2)), 0.0)
                            # Make sure op_count is profiled.
                            self.assertEqual(metrics[8].strip(), '1/1|1/1')
                            # Make sure input_shapes is profiled.
                            self.assertEqual(metrics[9].strip(),
                                             '0:2x6x6x3|1:3x3x3x6')

                        if o.find('DW (3x3x3x6') > 0:
                            metrics = o[o.find('(') + 1:o.find(')')].split(',')
                            mat = re.search('(.*)/(.*) params',
                                            metrics[1].strip())
                            self.assertGreater(float(mat.group(1)), 0.0)
                            self.assertGreater(float(mat.group(2)), 0.0)
                    # pylint: enable=line-too-long

        # Test that profiler restored from profile file gives the same result.
        gfile.Remove(outfile)
        profile_file = os.path.join(test.get_temp_dir(), 'profile_1')
        with lib.ProfilerFromFile(profile_file) as profiler:
            profiler.profile_name_scope(options=opts)
            with gfile.Open(outfile, 'r') as f:
                self.assertEqual(dump_str, lib.CheckAndRemoveDoc(f.read()))
Example #9
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=2000 // hvd.size()),
        tf.train.LoggingTensorHook(tensors={
            'step': global_step,
            'loss': loss
        },
                                   every_n_iter=100)
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train,
                                                     y_train,
                                                     batch_size=100)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    builder = option_builder.ProfileOptionBuilder
    opts1 = builder(builder.time_and_memory()).\
            order_by('micros').\
            with_max_depth(10).\
            with_file_output("./pctx/opts1-rank-%d" % hvd.rank()).\
            build()
    opts2 = builder.trainable_variables_parameter()
    # with profile_context.ProfileContext("./pctx",
    #                                     trace_steps=range(100, 110),
    #                                     dump_steps=[110]) as pctx:
    with profile_context.ProfileContext("./pctx") as pctx:
        pctx.add_auto_profiling('op', opts1, [800, 900, 1000])
        pctx.add_auto_profiling('scope', opts2, [1000])
        with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                               hooks=hooks,
                                               config=config) as mon_sess:
            while not mon_sess.should_stop():
                # Run a training step synchronously.
                image_, label_ = next(training_batch_generator)
                mon_sess.run(train_op,
                             feed_dict={
                                 image: image_,
                                 label: label_
                             })
        pctx.profiler.advise(options=model_analyzer.ALL_ADVICE)
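
The final advise call above runs every built-in checker via model_analyzer.ALL_ADVICE. Because the options argument is a dict keyed by checker name, the advice pass can also be narrowed to a subset; a sketch, meant to sit inside the same with block in place of the ALL_ADVICE call (checker names taken from model_analyzer's ALL_ADVICE):

# Restrict the advice pass to specific checkers instead of ALL_ADVICE.
pctx.profiler.advise(options={
    'ExpensiveOperationChecker': {},
    'AcceleratorUtilizationChecker': {},
})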