예제 #1
0
파일: train.py 프로젝트: chinatian/glow
def main(hps):
    """Set up Horovod, data and model, then train or run inference.

    :param hps: hyper-parameter object. ``seed``, ``logdir`` and ``inference``
        are read here; ``train_its``, ``test_its`` and ``full_test_its`` are
        written to it.
    """
    # Initialize Horovod.
    hvd.init()

    # Create tensorflow session
    sess = tensorflow_session()

    # Seed TensorFlow and NumPy. The Horovod rank is part of the seed, so
    # each worker gets a different value.
    seed = hvd.rank() + hvd.size() * hps.seed
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Get data and set train_its and valid_its
    train_iterator, test_iterator, data_init = get_data(hps, sess)
    hps.train_its, hps.test_its, hps.full_test_its = get_its(hps)

    # Create log dir. There is no rank guard here, so every process calling
    # main() executes this; an exists()/mkdir() pair can race between them.
    # makedirs(exist_ok=True) tolerates a directory that already exists and
    # also creates missing parent directories.
    logdir = os.path.abspath(hps.logdir) + "/"
    os.makedirs(logdir, exist_ok=True)

    # Create model (the local name ``model`` is rebound from the module to
    # the model instance).
    import model
    model = model.model(sess, hps, train_iterator, test_iterator, data_init)

    # Initialize visualization functions
    visualise = init_visualizations(hps, model, logdir)

    if not hps.inference:
        # Perform training
        train(sess, model, hps, logdir, visualise)
    else:
        infer(sess, model, hps, test_iterator)
def main(unused_argv):
    """Train and evaluate the MNIST Estimator across Horovod workers.

    :param unused_argv: ignored.
    """
    # Horovod: initialize Horovod.
    hvd.init()

    # Each rank reads MNIST from its own 'MNIST-data-<rank>' directory.
    dataset = learn.datasets.mnist.read_data_sets(
        'MNIST-data-%d' % hvd.rank())
    train_images = dataset.train.images
    train_targets = np.asarray(dataset.train.labels, dtype=np.int32)
    test_images = dataset.test.images
    test_targets = np.asarray(dataset.test.labels, dtype=np.int32)

    # Horovod: expose only the GPU matching this process's local rank.
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    session_config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: only rank 0 gets a model directory, so the other workers
    # cannot corrupt its checkpoints.
    if hvd.rank() == 0:
        model_dir = './mnist_convnet_model'
    else:
        model_dir = None

    # Build the Estimator.
    run_config = tf.estimator.RunConfig(session_config=session_config)
    classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn, model_dir=model_dir, config=run_config)

    # Log the "softmax_tensor" values under the label "probabilities".
    logging_hook = tf.train.LoggingTensorHook(
        tensors={"probabilities": "softmax_tensor"}, every_n_iter=500)

    # Horovod: broadcast the initial variable state from rank 0 so that all
    # workers start from the same weights.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Training input pipeline.
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_images},
        y=train_targets,
        batch_size=100,
        num_epochs=None,
        shuffle=True)

    # Horovod: the step budget is divided by the number of workers.
    steps_per_worker = 20000 // hvd.size()
    classifier.train(
        input_fn=train_input_fn,
        steps=steps_per_worker,
        hooks=[logging_hook, bcast_hook])

    # Single pass over the test set, then print the metrics.
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": test_images},
        y=test_targets,
        num_epochs=1,
        shuffle=False)
    print(classifier.evaluate(input_fn=eval_input_fn))
예제 #3
0
파일: train.py 프로젝트: chinatian/glow
def get_data(hps, sess):
    """Fill in dataset-dependent hyper-parameters and build the iterators.

    Defaults for ``image_size``, ``n_test`` and ``data_dir`` are only applied
    when the corresponding field holds its sentinel (-1 or ""). Several
    derived fields (``n_y``, ``rnd_crop``, ``local_batch_*``,
    ``direct_iterator``) are written to ``hps``.

    :param hps: hyper-parameter object, read and modified in place.
    :param sess: session handed to the direct-iterator data loader.
    :return: tuple ``(train_iterator, test_iterator, data_init)``.
    """
    problem = hps.problem

    if hps.image_size == -1:
        default_image_size = {
            'mnist': 32, 'cifar10': 32, 'imagenet-oord': 64,
            'imagenet': 256, 'celeba': 256, 'lsun_realnvp': 64, 'lsun': 256}
        hps.image_size = default_image_size[problem]

    if hps.n_test == -1:
        default_n_test = {
            'mnist': 10000, 'cifar10': 10000, 'imagenet-oord': 50000,
            'imagenet': 50000, 'celeba': 3000,
            'lsun_realnvp': 300*hvd.size(), 'lsun': 300*hvd.size()}
        hps.n_test = default_n_test[problem]

    n_classes = {
        'mnist': 10, 'cifar10': 10, 'imagenet-oord': 1000,
        'imagenet': 1000, 'celeba': 1, 'lsun_realnvp': 1, 'lsun': 1}
    hps.n_y = n_classes[problem]

    if hps.data_dir == "":
        default_data_dir = {
            'mnist': None, 'cifar10': None,
            'imagenet-oord': '/mnt/host/imagenet-oord-tfr',
            'imagenet': '/mnt/host/imagenet-tfr',
            'celeba': '/mnt/host/celeba-reshard-tfr',
            'lsun_realnvp': '/mnt/host/lsun_realnvp',
            'lsun': '/mnt/host/lsun'}
        hps.data_dir = default_data_dir[problem]

    # Random cropping is enabled for 'lsun_realnvp' only.
    hps.rnd_crop = (problem == 'lsun_realnvp')

    if hps.category:
        hps.data_dir = hps.data_dir + ('/%s' % hps.category)

    # Use anchor_size to rescale batch size based on image_size
    anchor = hps.anchor_size
    image_area = hps.image_size * hps.image_size
    hps.local_batch_train = hps.n_batch_train * anchor * anchor // image_area
    # Fixed lookup from train batch size to test batch size; a train batch
    # size that is not a key of this table raises KeyError.
    test_batch_for_train_batch = {64: 50, 32: 25, 16: 10, 8: 5, 4: 2, 2: 2, 1: 1}
    hps.local_batch_test = test_batch_for_train_batch[hps.local_batch_train]
    hps.local_batch_init = hps.n_batch_init * anchor * anchor // image_area

    print("Rank {} Batch sizes Train {} Test {} Init {}".format(
        hvd.rank(), hps.local_batch_train, hps.local_batch_test, hps.local_batch_init))

    if problem in ('imagenet-oord', 'imagenet', 'celeba', 'lsun_realnvp', 'lsun'):
        hps.direct_iterator = True
        import data_loaders.get_data as v
        iterators = v.get_data(
            sess, hps.data_dir, hvd.size(), hvd.rank(), hps.pmap, hps.fmap,
            hps.local_batch_train, hps.local_batch_test, hps.local_batch_init,
            hps.image_size, hps.rnd_crop)
    elif problem in ('mnist', 'cifar10'):
        hps.direct_iterator = False
        import data_loaders.get_mnist_cifar as v
        iterators = v.get_data(
            hps.problem, hvd.size(), hvd.rank(), hps.dal,
            hps.local_batch_train, hps.local_batch_test, hps.local_batch_init,
            hps.image_size)
    else:
        raise Exception()

    train_iterator, test_iterator, data_init = iterators
    return train_iterator, test_iterator, data_init
예제 #4
0
파일: rnn.py 프로젝트: rwth-i6/returnn
def init_backend_engine():
  """
  Selects the backend engine from the global ``config`` and performs backend-specific setup.

  For Theano, the monkey patches from ``TheanoUtil`` are applied.
  For TensorFlow, the device may be overridden via the ``TF_DEVICE`` env var, Horovod is
  optionally set up (config option ``use_horovod``), and thread pools / device listing are initialized.

  :raises NotImplementedError: if neither Theano nor TensorFlow is selected
  """
  BackendEngine.select_engine(config=config)
  if BackendEngine.is_theano_selected():
    print("Theano:", describe_theano_version(), file=log.v3)
    import TheanoUtil
    TheanoUtil.monkey_patches()
  elif BackendEngine.is_tensorflow_selected():
    print("TensorFlow:", describe_tensorflow_version(), file=log.v3)
    if get_tensorflow_version_tuple()[0] == 0:
      print("Warning: TF <1.0 is not supported and likely broken.", file=log.v2)
    # The TF_DEVICE env var, if set and non-empty, overrides the "device" config option.
    if os.environ.get("TF_DEVICE"):
      print("Devices: Use %s via TF_DEVICE instead of %s." % (
        os.environ.get("TF_DEVICE"), config.opt_typed_value("device")), file=log.v4)
      config.set("device", os.environ.get("TF_DEVICE"))
    if config.is_true("use_horovod"):
      import socket
      # noinspection PyPackageRequirements,PyUnresolvedReferences
      import horovod.tensorflow as hvd
      from TFUtil import init_horovod
      init_horovod()  # make sure it is initialized
      if "gpu" in config.value("device", "") or os.environ.get("CUDA_VISIBLE_DEVICES", ""):
        # We assume that we want to use a GPU.
        # Restrict the TF session to the single GPU whose index equals the Horovod local rank.
        gpu_opts = config.typed_dict.setdefault("tf_session_opts", {}).setdefault("gpu_options", {})
        assert "visible_device_list" not in gpu_opts
        gpu_opts["visible_device_list"] = str(hvd.local_rank())
        print("Horovod: Hostname %s, pid %i, using GPU %s." % (
          socket.gethostname(), os.getpid(), gpu_opts["visible_device_list"]), file=log.v3)
      else:
        if hvd.rank() == 0:  # Don't spam in all ranks.
          print("Horovod: Not using GPU.", file=log.v3)
      # An empty/unset "horovod_reduce_type" defaults to "grad" and is written back to the config.
      horovod_reduce_type = config.value("horovod_reduce_type", "")
      if horovod_reduce_type == "":
        horovod_reduce_type = "grad"
        config.set("horovod_reduce_type", horovod_reduce_type)
      else:
        assert horovod_reduce_type in ["grad", "param"], "config option 'horovod_reduce_type' invalid"
      if hvd.rank() == 0:  # Don't spam in all ranks.
        print("Horovod: Reduce type:", horovod_reduce_type, file=log.v3)
    from TFUtil import debug_register_better_repr, setup_tf_thread_pools, print_available_devices
    tf_session_opts = config.typed_value("tf_session_opts", {})
    assert isinstance(tf_session_opts, dict)
    # This must be done after the Horovod logic, such that we only touch the devices we are supposed to touch.
    setup_tf_thread_pools(log_file=log.v3, tf_session_opts=tf_session_opts)
    # Print available devices. Also make sure that get_tf_list_local_devices uses the correct TF session opts.
    print_available_devices(tf_session_opts=tf_session_opts, file=log.v2)
    debug_register_better_repr()
  else:
    raise NotImplementedError
예제 #5
0
def main(_):
    """Train the MNIST conv model for 100 global steps with Horovod.

    :param _: ignored.
    """
    # Initialize Horovod.
    hvd.init()

    # Each rank reads MNIST from its own 'MNIST-data-<rank>' directory.
    dataset = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())

    # Graph inputs and model.
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.contrib.learn.ModeKeys.TRAIN)

    # RMSProp wrapped in Horovod's distributed optimizer.
    optimizer = hvd.DistributedOptimizer(tf.train.RMSPropOptimizer(0.01))

    global_step = tf.contrib.framework.get_or_create_global_step()
    train_op = optimizer.minimize(loss, global_step=global_step)

    # The broadcast hook copies the initial variable state from rank 0 to
    # every other process so that all workers start identically.
    broadcast_hook = hvd.BroadcastGlobalVariablesHook(0)
    stop_hook = tf.train.StopAtStepHook(last_step=100)
    logging_hook = tf.train.LoggingTensorHook(
        tensors={'step': global_step, 'loss': loss}, every_n_iter=10)
    hooks = [broadcast_hook, stop_hook, logging_hook]

    # Expose only the GPU matching this process's local rank.
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    session_config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Only rank 0 writes checkpoints.
    if hvd.rank() == 0:
        checkpoint_dir = './checkpoints'
    else:
        checkpoint_dir = None

    # MonitoredTrainingSession handles initialization, checkpoint
    # restore/save and shutdown.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=session_config) as session:
        while not session.should_stop():
            # One synchronous training step on a batch of 100 examples.
            batch_images, batch_labels = dataset.train.next_batch(100)
            session.run(train_op,
                        feed_dict={image: batch_images, label: batch_labels})
예제 #6
0
파일: train.py 프로젝트: chinatian/glow
def get_its(hps):
    """Compute iteration counts for training, testing and full validation.

    :param hps: hyper-parameter object; reads ``n_train``, ``n_test``,
        ``n_batch_train`` and ``local_batch_test``.
    :return: tuple ``(train_its, test_its, full_test_its)``.
    :raises AssertionError: if ``n_test`` is not a multiple of
        ``local_batch_test * hvd.size()``.
    """
    # These run for a fixed amount of time. As anchored batch is smaller,
    # we've actually seen fewer examples.
    global_train_batch = hps.n_batch_train * hvd.size()
    train_its = int(np.ceil(hps.n_train / global_train_batch))
    test_its = int(np.ceil(hps.n_test / global_train_batch))
    train_epoch = train_its * global_train_batch

    # Full validation run: the test set must split evenly across workers.
    is_root = hvd.rank() == 0
    if is_root:
        print(hps.n_test, hps.local_batch_test, hvd.size())
    global_test_batch = hps.local_batch_test * hvd.size()
    assert hps.n_test % global_test_batch == 0
    full_test_its = hps.n_test // global_test_batch

    if is_root:
        print("Train epoch size: " + str(train_epoch))
    return train_its, test_its, full_test_its
예제 #7
0
파일: tfops.py 프로젝트: chinatian/glow
def add_edge_padding(x, filter_size):
    """Zero-pad ``x`` on axes 1 and 2 and append a channel marking the border.

    The appended channel is 1 at padded positions and 0 elsewhere. The mask
    tensor is cached in a TF collection keyed by padding and padded shape.

    :param x: rank-4 tensor (the paddings list has four entries).
    :param filter_size: sequence ``(height, width)``; the height must be odd.
    :return: ``x`` itself for a 1x1 filter, otherwise the padded tensor with
        one extra channel on axis 3.
    :raises AssertionError: if ``filter_size[0]`` is even.
    """
    assert filter_size[0] % 2 == 1
    if filter_size[0] == 1 and filter_size[1] == 1:
        return x
    a = (filter_size[0] - 1) // 2  # vertical padding size
    b = (filter_size[1] - 1) // 2  # horizontal padding size

    x = tf.pad(x, [[0, 0], [a, a], [b, b], [0, 0]])
    name = "_".join([str(dim) for dim in [a, b, *int_shape(x)[1:3]]])
    pads = tf.get_collection(name)
    if not pads:
        if hvd.rank() == 0:
            print("Creating pad", name)
        pad = np.zeros([1] + int_shape(x)[1:3] + [1], dtype='float32')
        # Guard each direction: with a padding of 0 the slice ``-0:`` is the
        # whole axis, which would mark every position as border.
        if a > 0:
            pad[:, :a, :, 0] = 1.
            pad[:, -a:, :, 0] = 1.
        if b > 0:
            pad[:, :, :b, 0] = 1.
            pad[:, :, -b:, 0] = 1.
        pad = tf.convert_to_tensor(pad)
        tf.add_to_collection(name, pad)
    else:
        pad = pads[0]
    # Repeat the single-example mask along the (dynamic) batch axis.
    pad = tf.tile(pad, [tf.shape(x)[0], 1, 1, 1])
    x = tf.concat([x, pad], axis=3)
    return x
예제 #8
0
    def test_horovod_allreduce_error(self):
        """Allreduce must fail when ranks send tensors whose shapes or
        number of dimensions differ."""
        hvd.init()
        rank, size = hvd.rank(), hvd.size()

        # Nothing to check with a single worker.
        if size == 1:
            return

        with self.test_session() as session:
            # Case 1: same number of dimensions, different extent per rank.
            tf.set_random_seed(1234)
            shape = [17 + rank] * 3
            mismatched = tf.random_uniform(shape, -1.0, 1.0)
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.allreduce(mismatched))

            # Case 2: same element count, different number of dimensions.
            tf.set_random_seed(1234)
            shape = [17, 23 * 57] if rank == 0 else [17, 23, 57]
            mismatched = tf.random_uniform(shape, -1.0, 1.0)
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.allreduce(mismatched))
예제 #9
0
    def test_horovod_broadcast(self):
        """Test that the broadcast correctly broadcasts 1D, 2D, 3D tensors."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        with self.test_session() as session:
            dtypes = [tf.uint8, tf.int8, tf.uint16, tf.int16,
                      tf.int32, tf.int64, tf.float32, tf.float64,
                      tf.bool]
            dims = [1, 2, 3]
            root_ranks = list(range(size))
            # No try/except around the loop body: swallowing exceptions here
            # would hide assertion failures and make the test pass vacuously.
            for dtype, dim, root_rank in itertools.product(dtypes, dims, root_ranks):
                tensor = tf.ones([17] * dim) * rank
                root_tensor = tf.ones([17] * dim) * root_rank
                if dtype == tf.bool:
                    # Reduce to 0/1 before casting to bool.
                    tensor = tensor % 2
                    root_tensor = root_tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                root_tensor = tf.cast(root_tensor, dtype=dtype)
                broadcasted_tensor = hvd.broadcast(tensor, root_rank)
                # Compare as int32 so every dtype goes through the same
                # equality op.
                self.assertTrue(
                    session.run(tf.reduce_all(tf.equal(
                        tf.cast(root_tensor, tf.int32), tf.cast(broadcasted_tensor, tf.int32)))),
                    "hvd.broadcast produces incorrect broadcasted tensor")
예제 #10
0
파일: train.py 프로젝트: chinatian/glow
    def draw_samples(epoch):
        """Sample a grid of images at several temperatures and save them.

        Only rank 0 does any work. One PNG per temperature is written to
        ``logdir`` as ``epoch_<epoch>_sample_<index>.png``.

        :param epoch: value inserted into the output file names.
        """
        if hvd.rank() != 0:
            return

        rows = 10 if hps.image_size <= 64 else 4
        cols = rows
        n_batch = rows*cols
        # Labels cycle 0..cols-1 along each row, wrapped to the class count.
        y = np.asarray([_y % hps.n_y for _y in (
            list(range(cols)) * rows)], dtype='int32')

        # temperatures = [0., .25, .5, .626, .75, .875, 1.] #previously
        temperatures = [0., .25, .5, .6, .7, .8, .9, 1.]

        # Draw every batch first, then write the images, so no file is
        # written unless all sampling calls succeeded.
        x_samples = [sample_batch(y, [temperature] * n_batch)
                     for temperature in temperatures]

        for i, x_sample in enumerate(x_samples):
            x_sample = np.reshape(
                x_sample, (n_batch, hps.image_size, hps.image_size, 3))
            graphics.save_raster(x_sample, logdir +
                                 'epoch_{}_sample_{}.png'.format(epoch, i))
예제 #11
0
    def test_horovod_broadcast_grad(self):
        """Check the gradient of the broadcast op."""
        hvd.init()
        rank, size = hvd.rank(), hvd.size()

        # Nothing to check with a single worker.
        if size == 1:
            return

        with self.test_session(config=self.config) as session:
            # As of TensorFlow v1.9, gradients are not supported on
            # integer tensors, so only float dtypes are exercised.
            float_dtypes = [tf.float32, tf.float64]
            ranks_of_tensor = [1, 2, 3]
            candidate_roots = list(range(size))
            cases = itertools.product(
                float_dtypes, ranks_of_tensor, candidate_roots)
            for dtype, dim, root_rank in cases:
                shape = [5] * dim
                tensor = tf.ones(shape) * rank
                if dtype == tf.bool:
                    tensor = tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                broadcasted_tensor = hvd.broadcast(tensor, root_rank)

                grad_ys = tf.ones(shape)
                grad = tf.gradients(broadcasted_tensor, tensor, grad_ys)[0]
                grad_out = session.run(grad)

                # Expected gradient: ``size`` on the root rank, 0 elsewhere.
                if rank == root_rank:
                    c = size
                else:
                    c = 0
                expected = np.ones(shape) * c
                err = np.linalg.norm(expected - grad_out)
                self.assertLess(err, 0.00000001,
                                "gradient %s differs from expected %s, "
                                "error: %s" % (grad_out, expected, str(err)))
예제 #12
0
파일: tfops.py 프로젝트: chinatian/glow
def print_act_stats(x, _str=""):
    """Optionally attach a ``tf.Print`` op reporting activation statistics.

    Returns ``x`` untouched when ``do_print_act_stats`` is false or on any
    Horovod rank other than 0. Otherwise mean and variance are computed over
    every axis except the last (a rank-1 tensor is reduced over its only
    axis), and the min/mean/max of the means and of the standard deviations
    are printed.

    :param x: tensor with a statically known rank.
    :param _str: label placed in brackets in front of the tensor name.
    :return: ``x``, or ``x`` passed through ``tf.Print``.
    """
    if not do_print_act_stats:
        return x
    if hvd.rank() != 0:
        return x
    ndims = len(x.get_shape())
    # Ranks 1, 2 and 4 yield [0], [0] and [0, 1, 2]. Deriving the axes from
    # the rank also covers other ranks instead of leaving the moments
    # undefined.
    axes = [0] if ndims == 1 else list(range(ndims - 1))
    x_mean, x_var = tf.nn.moments(x, axes, keep_dims=True)
    x_std = tf.sqrt(x_var)
    stats = [tf.reduce_min(x_mean), tf.reduce_mean(x_mean), tf.reduce_max(x_mean),
             tf.reduce_min(x_std), tf.reduce_mean(x_std), tf.reduce_max(x_std)]
    return tf.Print(x, stats, "["+_str+"] "+x.name)
예제 #13
0
    def test_horovod_broadcast_rank_error(self):
        """Broadcast must fail when the ranks disagree on the root rank."""
        hvd.init()
        rank, size = hvd.rank(), hvd.size()

        # Nothing to check with a single worker.
        if size == 1:
            return

        with self.test_session() as session:
            payload = tf.ones([17] * 3, dtype=tf.float32)
            # Every rank names itself as root, so the roots differ.
            broadcast_op = hvd.broadcast(payload, rank)
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(broadcast_op)
예제 #14
0
    def test_horovod_allgather_variable_size(self):
        """Allgather must gather 1D, 2D and 3D tensors correctly even when
        their first dimension differs between ranks."""
        hvd.init()
        rank, size = hvd.rank(), hvd.size()

        with self.test_session() as session:
            # Support tests up to MPI Size of 35
            if size > 35:
                return

            dtypes = [tf.uint8, tf.int8, tf.uint16, tf.int16,
                      tf.int32, tf.int64, tf.float32, tf.float64,
                      tf.bool]
            # First-dimension extent contributed by each rank.
            tensor_sizes = ([17, 32, 81, 12, 15, 23, 22] * 5)[:size]
            expected_size = sum(tensor_sizes)

            for dtype, dim in itertools.product(dtypes, [1, 2, 3]):
                trailing = [17] * (dim - 1)

                tensor = tf.ones([tensor_sizes[rank]] + trailing) * rank
                if dtype == tf.bool:
                    tensor = tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                gathered = hvd.allgather(tensor)

                gathered_tensor = session.run(gathered)
                self.assertEqual(list(gathered_tensor.shape),
                                 [expected_size] + trailing)

                for i in range(size):
                    # Rows contributed by rank i start after the rows of
                    # all lower ranks.
                    begin = [sum(tensor_sizes[:i])] + [0] * (dim - 1)
                    rank_size = [tensor_sizes[i]] + trailing
                    rank_tensor = tf.slice(gathered, begin, rank_size)
                    self.assertEqual(list(rank_tensor.shape), rank_size)
                    # tf.equal() does not support tf.uint16 as of TensorFlow 1.2,
                    # so need to cast rank_tensor to tf.int32.
                    value = i % 2 if dtype == tf.bool else i
                    self.assertTrue(
                        session.run(tf.reduce_all(
                            tf.equal(tf.cast(rank_tensor, tf.int32), value))),
                        "hvd.allgather produces incorrect gathered tensor")
예제 #15
0
    def _setup_graph(self):
        """Build the predictors and evaluation dataflows for the configured
        trainer type."""
        num_gpu = cfg.TRAIN.NUM_GPUS
        if cfg.TRAINER == 'replicated':
            # Use two predictor threads per GPU to get better throughput
            self.num_predictor = num_gpu * 2
            self.predictors = [
                self._build_coco_predictor(idx % num_gpu)
                for idx in range(self.num_predictor)]
            self.dataflows = [
                get_eval_dataflow(shard=idx, num_shards=self.num_predictor)
                for idx in range(self.num_predictor)]
            return

        # Only eval on the first machine.
        # Alternatively, can eval on all ranks and use allgather, but allgather sometimes hangs
        self._horovod_run_eval = (hvd.rank() == hvd.local_rank())
        if self._horovod_run_eval:
            self.predictor = self._build_coco_predictor(0)
            self.dataflow = get_eval_dataflow(
                shard=hvd.local_rank(), num_shards=hvd.local_size())

        # Allreduce op stored as ``self.barrier`` for later use.
        self.barrier = hvd.allreduce(tf.random_normal(shape=[1]))
예제 #16
0
    def test_horovod_broadcast_error(self):
        """Broadcast must fail when a dimension other than the first
        differs between the tensors being broadcast."""
        hvd.init()
        rank, size = hvd.rank(), hvd.size()

        # Nothing to check with a single worker.
        if size == 1:
            return

        with self.test_session() as session:
            # The second dimension depends on the rank, so shapes disagree.
            shape = [17, 10 * (rank + 1), 17]
            payload = tf.ones(shape, dtype=tf.float32) * rank
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.broadcast(payload, 0))
예제 #17
0
    def test_horovod_broadcast_type_error(self):
        """Broadcast must fail when the processes broadcast tensors of
        different dtypes."""
        hvd.init()
        rank, size = hvd.rank(), hvd.size()

        # Nothing to check with a single worker.
        if size == 1:
            return

        with self.test_session() as session:
            # Even ranks send int32, odd ranks send float32.
            if rank % 2 == 0:
                dtype = tf.int32
            else:
                dtype = tf.float32
            payload = tf.ones([17] * 3, dtype=dtype) * rank
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.broadcast(payload, 0))
예제 #18
0
    def test_horovod_allreduce_type_error(self):
        """Allreduce must fail when ranks send tensors of different
        dtypes."""
        hvd.init()
        rank, size = hvd.rank(), hvd.size()

        # Nothing to check with a single worker.
        if size == 1:
            return

        with self.test_session() as session:
            # Same shape on every rank; even ranks use int32, odd ranks
            # use float32.
            if rank % 2 == 0:
                dtype = tf.int32
            else:
                dtype = tf.float32
            payload = tf.ones([17] * 3, dtype=dtype)
            with self.assertRaises(tf.errors.FailedPreconditionError):
                session.run(hvd.allreduce(payload))
예제 #19
0
파일: Log.py 프로젝트: rwth-i6/returnn
 def init_by_config(self, config):
   """
   Reads the log options from the config and initializes the log with them.
   With ``use_horovod`` enabled, ``.horovod-<rank>-<size>`` is inserted before
   the extension of every log filename.

   :param Config.Config config:
   """
   log_files = config.list('log', [])
   verbosity = config.int_list('log_verbosity', [])
   formatter = config.list('log_format', [])
   if config.is_true("use_horovod"):
     # noinspection PyPackageRequirements,PyUnresolvedReferences
     import horovod.tensorflow as hvd
     from TFUtil import init_horovod
     init_horovod()  # make sure it is initialized
     log_files = [
       stem + (".horovod-%i-%i%s" % (hvd.rank(), hvd.size(), ext))
       for stem, ext in map(os.path.splitext, log_files)]
   self.initialize(logs=log_files, verbosity=verbosity, formatter=formatter)
예제 #20
0
    def test_horovod_allgather_grad(self):
        """Check the gradient of the allgather op."""
        hvd.init()
        rank, size = hvd.rank(), hvd.size()

        with self.test_session(config=self.config) as session:
            # As of TensorFlow v1.9, gradients are not supported on
            # integer tensors, so only float dtypes are exercised.
            float_dtypes = [tf.float32, tf.float64]
            # First-dimension extent contributed by each rank.
            tensor_sizes = ([3, 2, 7, 4, 6, 8, 10] * 5)[:size]

            for dtype, dim in itertools.product(float_dtypes, [1, 2, 3]):
                trailing = [17] * (dim - 1)
                local_shape = [tensor_sizes[rank]] + trailing

                tensor = tf.ones(local_shape) * rank
                if dtype == tf.bool:
                    tensor = tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                gathered = hvd.allgather(tensor)

                # Upstream gradient: the block for rank r is filled with r.
                grad_ys = tf.concat(
                    [tf.ones([tensor_size] + trailing) * r
                     for r, tensor_size in enumerate(tensor_sizes)],
                    axis=0)

                grad = tf.gradients(gathered, tensor, grad_ys)[0]
                grad_out = session.run(grad)

                expected = np.ones(local_shape) * rank * size
                err = np.linalg.norm(expected - grad_out)
                self.assertLess(err, 0.00000001,
                                "gradient %s differs from expected %s, "
                                "error: %s" %
                                (grad_out, expected, str(err)))
예제 #21
0
    def _eval(self):
        """Run COCO evaluation, write the results as JSON and report the
        scores to the trainer's monitors.

        In the Horovod case, processes with ``_horovod_run_eval`` set write a
        partial result file; after the barrier only rank 0 merges the files
        and continues.
        """
        logdir = args.logdir

        def partial_path(part):
            # Per-shard result file for the current global step.
            return os.path.join(
                logdir, 'outputs{}-part{}.json'.format(self.global_step, part))

        if cfg.TRAINER == 'replicated':
            with ThreadPoolExecutor(max_workers=self.num_predictor, thread_name_prefix='EvalWorker') as executor, \
                    tqdm.tqdm(total=sum([df.size() for df in self.dataflows])) as pbar:
                futures = [
                    executor.submit(eval_coco, dataflow, pred, pbar)
                    for dataflow, pred in zip(self.dataflows, self.predictors)]
                per_worker = [fut.result() for fut in futures]
                all_results = list(itertools.chain.from_iterable(per_worker))
        else:
            if self._horovod_run_eval:
                local_results = eval_coco(self.dataflow, self.predictor)
                with open(partial_path(hvd.local_rank()), 'w') as f:
                    json.dump(local_results, f)
            self.barrier.eval()
            if hvd.rank() > 0:
                return
            # Rank 0 merges one file per local rank and deletes each file
            # after reading it.
            all_results = []
            for k in range(hvd.local_size()):
                shard_file = partial_path(k)
                with open(shard_file, 'r') as f:
                    shard_results = json.load(f)
                all_results.extend(shard_results)
                os.unlink(shard_file)

        output_file = os.path.join(
            logdir, 'outputs{}.json'.format(self.global_step))
        with open(output_file, 'w') as f:
            json.dump(all_results, f)
        try:
            scores = print_evaluation_scores(output_file)
            for metric, value in scores.items():
                self.trainer.monitors.put_scalar(metric, value)
        except Exception:
            logger.exception("Exception in COCO evaluation.")
예제 #22
0
    def test_horovod_allgather(self):
        """Allgather must gather 1D, 2D and 3D tensors correctly."""
        hvd.init()
        rank, size = hvd.rank(), hvd.size()

        with self.test_session() as session:
            dtypes = [tf.uint8, tf.int8, tf.uint16, tf.int16,
                      tf.int32, tf.int64, tf.float32, tf.float64,
                      tf.bool]
            for dtype, dim in itertools.product(dtypes, [1, 2, 3]):
                tensor = tf.ones([17] * dim) * rank
                if dtype == tf.bool:
                    tensor = tensor % 2
                tensor = tf.cast(tensor, dtype=dtype)
                gathered = hvd.allgather(tensor)

                gathered_tensor = session.run(gathered)
                self.assertEqual(list(gathered_tensor.shape),
                                 [17 * size] + [17] * (dim - 1))

                for i in range(size):
                    # Each rank contributes 17 rows; take rank i's slab.
                    begin = [i * 17] + [0] * (dim - 1)
                    extent = [17] + [-1] * (dim - 1)
                    rank_tensor = tf.slice(gathered_tensor, begin, extent)
                    self.assertEqual(list(rank_tensor.shape), [17] * dim)
                    # tf.equal() does not support tf.uint16 as of TensorFlow 1.2,
                    # so need to cast rank_tensor to tf.int32.
                    value = i % 2 if dtype == tf.bool else i
                    self.assertTrue(
                        session.run(tf.reduce_all(
                            tf.equal(tf.cast(rank_tensor, tf.int32), value))),
                        "hvd.allgather produces incorrect gathered tensor")
예제 #23
0
def log(s, nl=True):
    """Print ``s`` on Horovod rank 0 only.

    :param s: object to print.
    :param nl: when true, terminate the output with a newline; otherwise
        print nothing after ``s``.
    """
    if hvd.rank() == 0:
        terminator = '\n' if nl else ''
        print(s, end=terminator)
def main(argv=None):
    '''Train a VAE on MNIST with Horovod; rank 0 then evaluates and plots.

    The function's docstring is replaced at runtime by the module docstring,
    which is also handed to the CLI parser as its description.

    :param argv: Optional extra command-line arguments. They are appended to
        ``sys.argv`` before ``parser_`` runs (presumably ``parser_`` parses
        ``sys.argv`` -- confirm against its definition).
    '''
    main.__doc__ = __doc__
    # list.extend() returns None, so it cannot be used as the value of a
    # conditional expression: extend sys.argv first, then alias it.
    if argv is not None:
        sys.argv.extend(argv)
    argv = sys.argv
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)

    # NOTE(review): hvd.init() is not called in this function; presumably it
    # runs at module import time -- confirm before reusing main() elsewhere.
    nranks_per_gpu = args.nranks_per_gpu
    local_rank = hvd.local_rank()
    # Several ranks may share one GPU: map the local rank to a GPU index.
    gpu_local_rank = local_rank // nranks_per_gpu
    print('local_rank, GPU_LOCAL_RANK: {}, {}'.format(
        local_rank, gpu_local_rank))

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.visible_device_list = str(gpu_local_rank)
    K.set_session(tf.Session(config=config))

    # input image dimensions
    img_rows, img_cols, img_chns = 28, 28, 1
    # number of convolutional filters to use
    filters = 64
    # convolution kernel size
    num_conv = 3

    hvdsize = hvd.size()

    batch_size = 128  # 100
    if K.image_data_format() == 'channels_first':
        original_img_size = (img_chns, img_rows, img_cols)
    else:
        original_img_size = (img_rows, img_cols, img_chns)
    latent_dim = 2
    intermediate_dim = 128
    epsilon_std = 1.0
    epochs = args.epochs  # 5

    # train the VAE on MNIST digits
    (x_train, _), (x_test, y_test) = mnist.load_data()

    # Scale pixels to [0, 1] and add the channel axis expected by the backend.
    x_train = x_train.astype('float32') / 255.
    x_train = x_train.reshape((x_train.shape[0],) + original_img_size)
    x_test = x_test.astype('float32') / 255.
    x_test = x_test.reshape((x_test.shape[0],) + original_img_size)

    if hvd.rank() == 0:
        print('x_train.shape:', x_train.shape)

    train_samples = x_train.shape[0]
    # steps_per_epoch = train_samples // batch_size // hvdsize
    speedupopt = args.speedup
    if speedupopt == SpeedupOpts.imgspersec:
        # Every rank runs a full epoch worth of steps.
        steps_per_epoch = train_samples // batch_size
    else:
        # Split the epoch across ranks.
        # NOTE(review): round(x + 0.5) approximates a ceiling, but its result
        # on exact .5 ties differs between Python 2 and 3 -- confirm intent.
        steps_per_epoch = int(round(
            float(train_samples) / batch_size / hvdsize + 0.5))

    # Create the dataset and its associated one-shot iterator.
    buffer_size = 10000
    dataset = Dataset.from_tensor_slices(x_train)
    dataset = dataset.repeat()
    dataset = dataset.shuffle(buffer_size)
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    x_train_batch = iterator.get_next()

    ldict = make_shared_layers_dict(
        img_chns, img_rows, img_cols, batch_size, filters,
        num_conv, intermediate_dim, latent_dim, epsilon_std)
    # ldict is a dictionary that holds all layers. Since these layers are
    # instantiated once, they are shared amongst vae, encoder, and generator.

    # The training model reads its input directly from the dataset tensor.
    x = Input(tensor=x_train_batch)
    vae = make_vae(ldict, x)
    # :  :type vae: Model

    lr = 0.001  # * hvdsize
    opt = tf.train.RMSPropOptimizer(lr)
    # Add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)  # , use_locking=True)
    opt = TFOptimizer(opt)

    # opt = RMSprop(lr)
    # Add Horovod Distributed Optimizer.
    # opt = hvd_keras.DistributedOptimizer(opt)  # , use_locking=True)

    # loss=None: presumably make_vae attaches the loss to the model itself
    # -- confirm against make_vae.
    vae.compile(optimizer=opt, loss=None)
    if hvd.rank() == 0:
        vae.summary()

    # Timing callbacks are only installed on rank 0.
    callbacks = []
    if hvd.rank() == 0:
        callbacks += [BatchTiming(), SamplesPerSec(batch_size * hvdsize)]

    # Start every rank from rank 0's variable values.
    sess = K.get_session()
    sess.run(hvd.broadcast_global_variables(0))

    # Fit the model using data from the TF data tensors.
    vae.fit(steps_per_epoch=steps_per_epoch, epochs=epochs,
            callbacks=callbacks)

    if hvd.rank() == 0:
        # Rebuild the VAE on a regular placeholder input (sharing the layers
        # in ldict) so it can be evaluated on the numpy test set.
        x = Input(shape=original_img_size)
        vae_val = make_vae(ldict, x)
        vae_val.compile(optimizer=opt, loss=None)
        loss = vae_val.evaluate(x=x_test, y=None, batch_size=batch_size)
        print('\n\nVAE VALIDATION LOSS: {}'.format(loss))

        # Encoder: image -> latent mean.
        x = Input(shape=original_img_size)
        z_mean, _ = get_encoded(ldict, x)
        encoder = Model(x, z_mean)
        # :  :type encoder: Model

        # Generator: latent sample -> decoded image.
        decoder_input = Input(shape=(latent_dim,))
        x_decoded_mean_squash = get_decoded(ldict, decoder_input)
        generator = Model(decoder_input, x_decoded_mean_squash)
        # :  :type generator: Model

        # display a 2D plot of the digit classes in the latent space
        x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
        plt.figure(figsize=(6, 6))
        plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=y_test)
        plt.colorbar()
        # plt.show()
        plt.savefig('vae_scatter.ps')
        plt.close()

        # display a 2D manifold of the digits
        n = 15  # figure with 15x15 digits
        digit_size = 28
        figure = np.zeros((digit_size * n, digit_size * n))
        # Linearly spaced coordinates on the unit square were transformed
        # through the inverse CDF (ppf) of the Gaussian
        # To produce values of the latent variables z, since the prior of the
        # latent space is Gaussian
        grid_x = norm.ppf(np.linspace(0.05, 0.95, n))
        grid_y = norm.ppf(np.linspace(0.05, 0.95, n))

        for i, yi in enumerate(grid_x):
            for j, xi in enumerate(grid_y):
                z_sample = np.array([[xi, yi]])
                # Repeat the single latent point to fill a whole batch; only
                # the first decoded image is used below.
                z_sample = np.tile(z_sample, batch_size).reshape(batch_size, 2)
                x_decoded = generator.predict(z_sample, batch_size=batch_size)
                digit = x_decoded[0].reshape(digit_size, digit_size)
                figure[i * digit_size: (i + 1) * digit_size,
                       j * digit_size: (j + 1) * digit_size] = digit

        plt.figure(figsize=(10, 10))
        plt.imshow(figure, cmap='Greys_r')
        # plt.show()
        plt.savefig('vae_digit.ps')
        plt.close()

    K.clear_session()
예제 #25
0
 def test_horovod_rank(self):
     """Verify hvd.rank() agrees with the rank reported by the MPI environment."""
     # Read the reference rank from the environment before initialising Horovod.
     expected_rank, _unused_size = mpi_env_rank_and_size()
     hvd.init()
     self.assertEqual(expected_rank, hvd.rank())
def main(_):
    """Run BERT pre-training and/or evaluation with a tf.estimator.Estimator.

    Behaviour is driven entirely by ``FLAGS``. With ``FLAGS.horovod`` set,
    Horovod is initialised, each process is pinned to the GPU matching its
    local rank, the learning rate is multiplied by the number of workers and
    only rank 0 saves checkpoints, reports loss and runs evaluation.

    Args:
        _: Unused positional argument supplied by the app runner.

    Raises:
        ValueError: If neither ``do_train`` nor ``do_eval`` is set, if both
            ``use_fp16`` and ``manual_fp16`` are set, or if Horovod is used
            with fewer input files than workers.
    """
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path)

    # Reject inconsistent flag combinations before mutating the environment,
    # initialising Horovod or creating any directories.
    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")
    if FLAGS.use_fp16 and FLAGS.manual_fp16:
        raise ValueError(
            "AMP and Manual Mixed Precision Training are both activated! Error"
        )

    if FLAGS.use_fp16:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"

    use_hvd = FLAGS.horovod
    if use_hvd:
        import horovod.tensorflow as hvd
        hvd.init()

    # Derived once so the rest of the function does not repeat the
    # "horovod or not" conditionals. `hvd` is only bound when use_hvd is true;
    # the conditional expressions below never evaluate it otherwise.
    hvd_or_none = hvd if use_hvd else None
    num_workers = hvd.size() if use_hvd else 1
    is_chief = not use_hvd or hvd.rank() == 0

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    tf.io.gfile.makedirs(FLAGS.output_dir)

    input_files = []
    for input_file_dir in FLAGS.input_files_dir.split(","):
        input_files.extend(tf.io.gfile.glob(os.path.join(input_file_dir, "*")))

    # Each worker needs at least one input file of its own.
    if use_hvd and len(input_files) < num_workers:
        raise ValueError("Input Files must be sharded")

    config = tf.compat.v1.ConfigProto()
    if use_hvd:
        # Pin this process to the GPU matching its local rank.
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        if is_chief:
            tf.compat.v1.logging.info("***** Configuaration *****")
            for key in FLAGS.__flags.keys():
                tf.compat.v1.logging.info('  {}: {}'.format(
                    key, getattr(FLAGS, key)))
            tf.compat.v1.logging.info("**************************")

    if FLAGS.use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
        config.graph_options.rewrite_options.memory_optimization = rewriter_config_pb2.RewriterConfig.NO_MEM_OPT

    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir,
        session_config=config,
        # Only the chief (rank 0, or the single process) writes checkpoints.
        save_checkpoints_steps=(FLAGS.save_checkpoints_steps
                                if is_chief else None),
        # This variable controls how often estimator reports examples/sec.
        # Default value is every 100 steps.
        # When --report_loss is True, we set to very large value to prevent
        # default info reporting from estimator.
        # Ideally we should set it to None, but that does not work.
        log_step_count_steps=10000 if FLAGS.report_loss else 100)

    model_fn = model_fn_builder(bert_config=bert_config,
                                init_checkpoint=FLAGS.init_checkpoint,
                                # Learning rate scales with the worker count
                                # (num_workers is 1 without Horovod).
                                learning_rate=FLAGS.learning_rate * num_workers,
                                num_train_steps=FLAGS.num_train_steps,
                                num_warmup_steps=FLAGS.num_warmup_steps,
                                use_one_hot_embeddings=False,
                                hvd=hvd_or_none)

    training_hooks = []
    if FLAGS.report_loss and is_chief:
        global_batch_size = (FLAGS.train_batch_size *
                             FLAGS.num_accumulation_steps * num_workers)
        training_hooks.append(
            _LogSessionRunHook(global_batch_size, FLAGS.num_accumulation_steps,
                               dllogging, FLAGS.display_loss_steps))
    if use_hvd and num_workers > 1:
        # Start all workers from rank 0's variable values.
        training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))

    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

    if FLAGS.do_train:
        tf.compat.v1.logging.info("***** Running training *****")
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        train_input_fn = input_fn_builder(
            input_files=input_files,
            batch_size=FLAGS.train_batch_size,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=True,
            hvd=hvd_or_none)

        estimator.train(input_fn=train_input_fn,
                        hooks=training_hooks,
                        max_steps=FLAGS.num_train_steps)

    if FLAGS.do_eval and is_chief:
        tf.compat.v1.logging.info("***** Running evaluation *****")
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_files = []
        for eval_file_dir in FLAGS.eval_files_dir.split(","):
            eval_files.extend(
                tf.io.gfile.glob(os.path.join(eval_file_dir, "*")))

        eval_input_fn = input_fn_builder(
            input_files=eval_files,
            batch_size=FLAGS.eval_batch_size,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False,
            hvd=hvd_or_none)

        # The hook exposes count, skipped and total_time after evaluation.
        eval_timer = LogEvalRunHook(FLAGS.eval_batch_size)
        eval_hooks = [eval_timer]
        eval_start_time = time.time()
        result = estimator.evaluate(input_fn=eval_input_fn,
                                    steps=FLAGS.max_eval_steps,
                                    hooks=eval_hooks)

        eval_time_elapsed = time.time() - eval_start_time
        eval_time_wo_overhead = eval_timer.total_time

        # Sentences that count towards throughput (skipped batches excluded).
        num_sentences = (eval_timer.count -
                         eval_timer.skipped) * FLAGS.eval_batch_size

        ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead

        tf.compat.v1.logging.info("-----------------------------")
        tf.compat.v1.logging.info(
            "Total Inference Time = %0.2f for Sentences = %d",
            eval_time_elapsed, eval_timer.count * FLAGS.eval_batch_size)
        tf.compat.v1.logging.info(
            "Total Inference Time W/O Overhead = %0.2f for Sentences = %d",
            eval_time_wo_overhead, num_sentences)
        tf.compat.v1.logging.info("Summary Inference Statistics on EVAL set")
        tf.compat.v1.logging.info("Batch size = %d", FLAGS.eval_batch_size)
        tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
        tf.compat.v1.logging.info("Precision = %s",
                                  "fp16" if FLAGS.use_fp16 else "fp32")
        tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f",
                                  ss_sentences_per_second)
        dllogging.logger.log(step=(),
                             data={"throughput_val": ss_sentences_per_second},
                             verbosity=Verbosity.DEFAULT)
        tf.compat.v1.logging.info("-----------------------------")

        # Persist the evaluation metrics next to the checkpoints.
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
            tf.compat.v1.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.compat.v1.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
예제 #27
0
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for Estimator.

        Builds the classification graph for the requested ``mode`` and wraps
        it in a ``tf.estimator.EstimatorSpec``:

        * TRAIN: loss plus the train op from ``optimization.create_optimizer``.
        * EVAL: loss plus the metrics from ``metric_fn``.
        * PREDICT: the class probabilities.

        When ``FLAGS.use_trt`` is set and the mode is not TRAIN, a frozen
        TF-TRT graph is imported instead of calling ``create_model``.

        Args:
            features: Dict holding the "input_ids", "input_mask",
                "segment_ids" and "label_ids" tensors.
            labels: Unused; labels are read from ``features["label_ids"]``.
            mode: A ``tf.estimator.ModeKeys`` value.
            params: Unused.

        Returns:
            The ``tf.estimator.EstimatorSpec`` for ``mode``.
        """
        def metric_fn(per_example_loss, label_ids, logits):
            """Return the eval metric ops: MCC for "cola", else accuracy/loss."""
            predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
            if task_name == "cola":
                FN, FN_op = tf.metrics.false_negatives(labels=label_ids,
                                                       predictions=predictions)
                FP, FP_op = tf.metrics.false_positives(labels=label_ids,
                                                       predictions=predictions)
                TP, TP_op = tf.metrics.true_positives(labels=label_ids,
                                                      predictions=predictions)
                TN, TN_op = tf.metrics.true_negatives(labels=label_ids,
                                                      predictions=predictions)

                # Matthews correlation coefficient from the confusion counts.
                MCC = (TP * TN - FP * FN) / ((TP + FP) * (TP + FN) *
                                             (TN + FP) * (TN + FN))**0.5
                # The update op refreshes all four counters.
                MCC_op = tf.group(FN_op, TN_op, TP_op, FP_op,
                                  tf.identity(MCC, name="MCC"))
                return {"MCC": (MCC, MCC_op)}
            else:
                accuracy = tf.metrics.accuracy(labels=label_ids,
                                               predictions=predictions)
                loss = tf.metrics.mean(values=per_example_loss)
                return {
                    "eval_accuracy": accuracy,
                    "eval_loss": loss,
                }

        tf.compat.v1.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            tf.compat.v1.logging.info("  name = %s, shape = %s",
                                      name, features[name].shape)

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        if not is_training and FLAGS.use_trt:
            # Inference through a frozen TF-TRT graph: wire the feature
            # tensors to the graph inputs and fetch its outputs by name.
            trt_graph = get_frozen_tftrt_model(bert_config, input_ids.shape,
                                               num_labels,
                                               use_one_hot_embeddings,
                                               init_checkpoint)
            (total_loss, per_example_loss, logits,
             probabilities) = tf.import_graph_def(
                 trt_graph,
                 input_map={
                     'input_ids': input_ids,
                     'input_mask': input_mask,
                     'segment_ids': segment_ids,
                     'label_ids': label_ids
                 },
                 return_elements=[
                     'loss/cls_loss:0', 'loss/cls_per_example_loss:0',
                     'loss/cls_logits:0', 'loss/cls_probabilities:0'
                 ],
                 name='')
            if mode == tf.estimator.ModeKeys.PREDICT:
                predictions = {"probabilities": probabilities}
                output_spec = tf.estimator.EstimatorSpec(
                    mode=mode, predictions=predictions)
            elif mode == tf.estimator.ModeKeys.EVAL:
                eval_metric_ops = metric_fn(per_example_loss, label_ids,
                                            logits)
                output_spec = tf.estimator.EstimatorSpec(
                    mode=mode,
                    loss=total_loss,
                    eval_metric_ops=eval_metric_ops)
            return output_spec
        (total_loss, per_example_loss, logits,
         probabilities) = create_model(bert_config, is_training, input_ids,
                                       input_mask, segment_ids, label_ids,
                                       num_labels, use_one_hot_embeddings)

        tvars = tf.trainable_variables()
        initialized_variable_names = {}
        # With Horovod only rank 0 loads the checkpoint here.
        if init_checkpoint and (hvd is None or hvd.rank() == 0):
            (assignment_map, initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(
                 tvars, init_checkpoint)
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        if FLAGS.verbose_logging:
            tf.compat.v1.logging.info("**** Trainable Variables ****")
            for var in tvars:
                init_string = ""
                if var.name in initialized_variable_names:
                    init_string = ", *INIT_FROM_CKPT*"
                tf.compat.v1.logging.info("  name = %s, shape = %s%s",
                                          var.name, var.shape, init_string)

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:

            train_op = optimization.create_optimizer(
                total_loss, learning_rate, num_train_steps, num_warmup_steps,
                hvd, False, FLAGS.use_fp16, FLAGS.num_accumulation_steps)

            output_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                     loss=total_loss,
                                                     train_op=train_op)
        elif mode == tf.estimator.ModeKeys.EVAL:
            eval_metric_ops = metric_fn(per_example_loss, label_ids, logits)
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode, loss=total_loss, eval_metric_ops=eval_metric_ops)
        else:
            output_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                     predictions=probabilities)
        return output_spec
예제 #28
0
    tf.keras.layers.Dense(10, activation='softmax')
])

# Adam optimizer, integer-label cross-entropy loss, accuracy as the metric.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Horovod: broadcast initial variable states from rank 0 to all other
# processes, so every worker starts from the same weights whether training
# begins from random initialisation or from a restored checkpoint.
callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]

# Horovod: only worker 0 writes checkpoints, so other workers cannot corrupt
# them.
if hvd.rank() == 0:
    callbacks += [keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5')]

# Progress output is shown on rank 0 only.
model.fit(
    x_train,
    y_train,
    batch_size=64,
    callbacks=callbacks,
    epochs=5,
    verbose=(1 if hvd.rank() == 0 else 0),
    validation_data=(x_test, y_test),
)

# evaluate() returns the loss first, followed by the compiled metric.
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
예제 #29
0
파일: mnist_horovod.py 프로젝트: zhcf/nauta
def main(_):
    """Train an MNIST classifier with Horovod and export a servable model.

    Steps, in graph-construction order:

    1. Initialise Horovod and load a per-rank copy of MNIST.
    2. Build the inference network and export its meta graph to
       ``graph.meta`` before any loss/optimizer ops are added.
    3. Add loss, accuracy, summaries and the Horovod-wrapped optimizer.
    4. Train inside a ``MonitoredTrainingSession``; rank 0 additionally
       saves summaries and checkpoints and publishes metrics.
    5. On rank 0, re-import the exported meta graph, restore the latest
       checkpoint and write a SavedModel under ``EXPERIMENT_OUTPUT_PATH``.

    Args:
        _: Unused positional argument.
    """
    hvd.init()

    # Read/download local dataset. Different copy for each process.
    mnist = tf.contrib.learn.datasets.mnist.read_data_sets(
        "mnist_data_{}".format(hvd.rank()))

    # Name images placeholder to be able to retrieve it from saved meta graph.
    images_placeholder = tf.placeholder(tf.float32, [None, 784],
                                        name=INPUT_NAME)

    # Defaults to 1.0; training feeds 0.5 below.
    # NOTE(review): presumably a dropout keep-probability consumed by
    # build_net -- confirm against build_net.
    dense_dropout_placeholder = tf.placeholder_with_default(1.0, [])
    labels_placeholder = tf.placeholder(tf.int64, [None])
    logits, scores, predictions = build_net(images_placeholder,
                                            dense_dropout_placeholder)

    # Export the meta graph now, before any loss, optimizer or Horovod ops
    # are added: the exported graph then contains only the inference network,
    # which is what gets served later.
    tf.train.export_meta_graph("graph.meta", as_text=True)

    loss = tf.losses.softmax_cross_entropy(tf.one_hot(labels_placeholder, 10),
                                           logits)
    accuracy = tf.reduce_mean(
        tf.cast(tf.equal(predictions, labels_placeholder), tf.float32))

    # Define summary ops to save summaries for later use in tensorboard.
    tf.summary.scalar("accuracy", accuracy)
    tf.summary.scalar("loss", loss)
    summary_op = tf.summary.merge_all()

    # Horovod: adjust learning rate based on number of workers.
    optimizer = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    global_step = tf.contrib.framework.get_or_create_global_step()

    # Wrap standard optimizer in Horovod distributed one.
    train = hvd.DistributedOptimizer(optimizer).minimize(
        loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of workers.
        tf.train.StopAtStepHook(last_step=2000 // hvd.size()),
        # Log the step and loss every 10 iterations.
        tf.train.LoggingTensorHook(tensors={
            'step': global_step,
            'loss': loss
        },
                                   every_n_iter=10),
    ]

    # Only master saves summaries.
    if hvd.rank() == 0:
        hooks += [
            # Summaries are written every step to the "tensorboard" directory
            # under EXPERIMENT_OUTPUT_PATH.
            tf.train.SummarySaverHook(save_steps=1,
                                      output_dir=os.path.join(
                                          EXPERIMENT_OUTPUT_PATH,
                                          "tensorboard"),
                                      summary_op=summary_op)
        ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them. Checkpoints go to the "checkpoints" directory under
    # EXPERIMENT_OUTPUT_PATH; every other rank passes None.
    checkpoint_dir = os.path.join(EXPERIMENT_OUTPUT_PATH,
                                  "checkpoints") if hvd.rank() == 0 else None

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks) as mon_sess:
        while not mon_sess.should_stop():
            # One training step on a batch of 64 examples.
            images, labels = mnist.train.next_batch(64)
            _, loss_val, accuracy_val, global_step_val = mon_sess.run(
                [train, loss, accuracy, global_step],
                feed_dict={
                    images_placeholder: images,
                    labels_placeholder: labels,
                    dense_dropout_placeholder: 0.5
                })

            # Only master publishes metrics.
            if hvd.rank() == 0:
                # Values are stringified before being handed to publish().
                publish({
                    "loss": str(loss_val),
                    "accuracy": str(accuracy_val),
                    "global_step": str(global_step_val)
                })

    # Save servable model only from Horovod master.
    if hvd.rank() == 0:
        # Create a new graph to import the previously exported one.
        with tf.Graph().as_default():
            # Import previously saved meta graph.
            restorer = tf.train.import_meta_graph("graph.meta")
            with tf.Session() as session:
                # Load the trained weights from the newest checkpoint.
                checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
                restorer.restore(session, checkpoint_file)

                # Look up the images placeholder and scores op by the names
                # INPUT_NAME and SCORES_NAME.
                images_placeholder = tf.get_default_graph().get_tensor_by_name(
                    INPUT_NAME + ":0")
                scores = tf.get_default_graph().get_tensor_by_name(
                    SCORES_NAME + ":0")

                # Save servable model to EXPERIMENT_OUTPUT_PATH to make it accessible to the user.
                builder = tf.saved_model.builder.SavedModelBuilder(
                    os.path.join(EXPERIMENT_OUTPUT_PATH, "models", "00001"))

                # Predict signature: images in, scores out.
                prediction_signature = (
                    tf.saved_model.signature_def_utils.build_signature_def(
                        inputs={
                            MODEL_INPUT_NAME:
                            tf.saved_model.utils.build_tensor_info(
                                images_placeholder)
                        },
                        outputs={
                            MODEL_OUTPUT_NAME:
                            tf.saved_model.utils.build_tensor_info(scores)
                        },
                        method_name=tf.saved_model.signature_constants.
                        PREDICT_METHOD_NAME))

                builder.add_meta_graph_and_variables(
                    session, [tf.saved_model.tag_constants.SERVING],
                    signature_def_map={
                        MODEL_SIGNATURE_NAME: prediction_signature
                    },
                    main_op=tf.tables_initializer(),
                    strip_default_attrs=True)

                builder.save()
예제 #30
0
def train_ctl(model_func, params):
    """Train (and optionally validate/export) a Keras model with a custom loop.

    The loop is distributed with Horovod: gradients are averaged through
    ``hvd.DistributedGradientTape`` and only rank 0 prints, writes summaries,
    saves checkpoints and exports the final model.

    Args:
        model_func: Callable invoked as
            ``model_func(num_classes=..., batch_size=...)`` that returns the
            Keras model to train.
        params: Mapping that must provide the keys ``image_width``,
            ``image_height``, ``image_format``, ``distort_color``,
            ``momentum``, ``loss_scale``, ``data_dir``, ``data_idx_dir``,
            ``batch_size``, ``num_iter``, ``iter_unit`` (``'epoch'`` or
            ``'batch'``), ``log_dir``, ``export_dir``, ``tensorboard_dir``,
            ``display_every``, ``precision``, ``dali_mode`` and ``use_xla``.

    Raises:
        AssertionError: If ``iter_unit`` is neither ``'epoch'`` nor
            ``'batch'``, if ``export_dir``/``tensorboard_dir``/``log_dir`` is
            set but does not exist, or if ``log_dir`` is set without
            ``data_dir``.
        ValueError: If ``dali_mode`` is enabled without ``data_dir``.
    """
    image_width = params['image_width']
    image_height = params['image_height']
    image_format = params['image_format']
    distort_color = params['distort_color']
    momentum = params['momentum']
    loss_scale = params['loss_scale']
    data_dir = params['data_dir']
    data_idx_dir = params['data_idx_dir']
    batch_size = params['batch_size']
    num_iter = params['num_iter']
    iter_unit = params['iter_unit']
    log_dir = params['log_dir']
    export_dir = params['export_dir']
    tensorboard_dir = params['tensorboard_dir']
    display_every = params['display_every']
    precision = params['precision']
    dali_mode = params['dali_mode']
    use_xla = params['use_xla']

    # Without a data directory the sample counts fall back to fixed values.
    if data_dir is not None:
        file_format = os.path.join(data_dir, '%s-*')
        train_files = sorted(tf.io.gfile.glob(file_format % 'train'))
        valid_files = sorted(tf.io.gfile.glob(file_format % 'validation'))
        num_train_samples = common.get_num_records(train_files)
        num_valid_samples = common.get_num_records(valid_files)
    else:
        num_train_samples = 1281982
        num_valid_samples = 5000

    train_idx_files = None
    valid_idx_files = None
    if data_idx_dir is not None:
        file_format = os.path.join(data_idx_dir, '%s-*')
        train_idx_files = sorted(tf.io.gfile.glob(file_format % 'train'))
        valid_idx_files = sorted(tf.io.gfile.glob(file_format % 'validation'))

    # Steps are counted per worker, so divide by the global batch size.
    if iter_unit.lower() == 'epoch':
        num_epochs = num_iter
        nstep_per_epoch = num_train_samples // (batch_size * hvd.size())
        nstep_per_valid = num_valid_samples // (batch_size * hvd.size())
    else:
        assert iter_unit.lower() == 'batch'
        num_epochs = 1
        nstep_per_epoch = min(num_iter,
                              num_train_samples // (batch_size * hvd.size()))
        nstep_per_valid = min(10,
                              num_valid_samples // (batch_size * hvd.size()))

    if export_dir:
        assert os.path.exists(export_dir)
        save_format = export_dir + "/saved_model_rn50.h5"

    if use_xla:
        tf.config.optimizer.set_jit(True)

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   'GPU')

    # Only rank 0 owns a summary writer; every other rank keeps None.
    if tensorboard_dir and hvd.rank() == 0:
        assert os.path.exists(tensorboard_dir)
        summary_writer = tf.summary.create_file_writer(tensorboard_dir)
    else:
        summary_writer = None

    if precision == 'fp16':
        policy = keras.mixed_precision.experimental.Policy(
            'mixed_float16', loss_scale)
        keras.mixed_precision.experimental.set_policy(policy)

    lr_schedule = common.create_piecewise_constant_decay_with_warmup(
        batch_size=batch_size * hvd.size(),
        epoch_size=num_train_samples,
        warmup_epochs=common.LR_SCHEDULE[0][1],
        boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
        multipliers=list(p[0] for p in common.LR_SCHEDULE),
        compute_lr_on_cpu=True)
    opt = keras.optimizers.SGD(learning_rate=lr_schedule, momentum=momentum)

    backend.set_image_data_format(image_format)
    dtype = 'float16' if precision == 'fp16' else 'float32'
    backend.set_floatx(dtype)
    model = model_func(num_classes=image_processing.NUM_CLASSES,
                       batch_size=batch_size)

    loss_func = keras.losses.SparseCategoricalCrossentropy()

    train_top1 = tf.keras.metrics.SparseTopKCategoricalAccuracy(
        k=1, name='train_top1')
    train_top5 = tf.keras.metrics.SparseTopKCategoricalAccuracy(
        k=5, name='train_top5')

    val_loss = tf.keras.metrics.Mean(name='val_loss', dtype=tf.float32)

    val_top1 = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=1,
                                                              name='val_top1')
    val_top5 = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5,
                                                              name='val_top5')

    if log_dir:
        # We save check points only when using the real data.
        assert data_dir, "--data_dir cannot be empty when using --log_dir"
        assert os.path.exists(log_dir)
        ckpt = tf.train.Checkpoint(epoch=tf.Variable(0),
                                   optimizer=opt,
                                   net=model)
        manager = tf.train.CheckpointManager(ckpt,
                                             log_dir,
                                             max_to_keep=3,
                                             checkpoint_name="model-ckpt")

    @tf.function
    def train_step(inputs, first_batch):
        """Run one optimisation step and return the unscaled loss."""
        images, labels = inputs

        with tf.GradientTape() as tape:
            predictions = model(images, training=True)
            loss = loss_func(labels, predictions)
            loss += tf.reduce_sum(model.losses)
            # Keep the unscaled value: it is what the caller accumulates.
            loss_copy = loss
            # Scale the losses
            if precision == 'fp16':
                loss = loss * tf.cast(loss_scale, loss.dtype)

        tape = hvd.DistributedGradientTape(tape)

        old_grads = tape.gradient(loss, model.trainable_variables)

        # Unscale the grads
        if precision == 'fp16':
            loss_scale_reciprocal = 1. / loss_scale
            grads = [
                g * tf.cast(loss_scale_reciprocal, g.dtype)
                if g is not None else None for g in old_grads
            ]
        else:
            grads = old_grads

        opt.apply_gradients(zip(grads, model.trainable_variables))

        train_top1.update_state(labels, predictions)
        train_top5.update_state(labels, predictions)

        # On the first batch, copy rank 0's model and optimizer variables to
        # every other worker.
        if hvd.size() > 1 and first_batch:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(opt.variables(), root_rank=0)

        return loss_copy

    @tf.function
    def valid_step(inputs):
        """Evaluate one validation batch and update the validation metrics."""
        images, labels = inputs
        predictions = model(images, training=False)
        loss = loss_func(labels, predictions)

        val_loss.update_state(loss)
        val_top1.update_state(labels, predictions)
        val_top5.update_state(labels, predictions)

    if data_dir is not None:
        num_preproc_threads = 4 if dali_mode else 10
        train_input = image_processing.image_set(
            train_files,
            batch_size,
            image_height,
            image_width,
            training=True,
            distort_color=distort_color,
            deterministic=False,
            num_threads=num_preproc_threads,
            use_dali=dali_mode,
            idx_filenames=train_idx_files)

        valid_input = image_processing.image_set(
            valid_files,
            batch_size,
            image_height,
            image_width,
            training=False,
            distort_color=False,
            deterministic=False,
            num_threads=num_preproc_threads,
            use_dali=dali_mode,
            idx_filenames=valid_idx_files)
    else:
        if dali_mode:
            raise ValueError("Must provide --data_dir if Dali is enabled")
        else:
            train_input = image_processing.fake_image_set(
                batch_size, image_height, image_width)

    global_steps = 0
    log_steps = display_every
    try:

        initial_epoch = 0
        if log_dir:
            ckpt.restore(manager.latest_checkpoint)
            if manager.latest_checkpoint:
                if hvd.rank() == 0:
                    print("Restored from {}".format(manager.latest_checkpoint))
                # Parse the checkpoint number from the file name only, and
                # take its trailing number: digits inside the directory part
                # of the path (e.g. ".../rn50/model-ckpt-3") must not be
                # mistaken for the epoch.
                ckpt_basename = os.path.basename(manager.latest_checkpoint)
                initial_epoch = max(
                    int(re.findall(r'\d+', ckpt_basename)[-1]),
                    initial_epoch)
            else:
                if hvd.rank() == 0:
                    print("Initializing from scratch.")

        # Training Loop
        for epoch in range(num_epochs):
            # Skip epochs that the restored checkpoint already covers.
            if epoch < initial_epoch:
                continue
            # on_epoch_begin
            epoch_start = time.time()

            total_loss = 0.0
            num_batches = 0
            train_top1.reset_states()
            train_top5.reset_states()

            if not dali_mode:
                train_iter = iter(train_input)
            for _ in range(nstep_per_epoch):
                # on_batch_begin
                global_steps += 1
                if global_steps == 1:
                    start_time = time.time()

                # Trace only the very first step, and only on rank 0.
                if global_steps == 1 and hvd.rank() == 0 and summary_writer:
                    tf.summary.trace_on(graph=True, profiler=True)

                if not dali_mode:
                    x = next(train_iter)
                else:
                    x = train_input.get_device_minibatches()
                total_loss += train_step(x, global_steps == 1)

                if global_steps == 1 and hvd.rank() == 0 and summary_writer:
                    with summary_writer.as_default():
                        tf.summary.trace_export(
                            name="train_step",
                            step=0,
                            profiler_outdir=tensorboard_dir)

                # on_batch_end
                if global_steps % log_steps == 0:
                    timestamp = time.time()
                    elapsed_time = timestamp - start_time
                    examples_per_second = \
                        (batch_size * hvd.size() * log_steps) / elapsed_time
                    if hvd.rank() == 0:
                        print("global_step: %d images_per_sec: %.1f" %
                              (global_steps, examples_per_second))
                    start_time = timestamp
                num_batches += 1

            train_loss = total_loss / num_batches

            # on_epoch_end
            epoch_run_time = time.time() - epoch_start
            if hvd.rank() == 0:
                print("epoch: %d time_taken: %.1f" % (epoch, epoch_run_time))

            # Validation only runs when real data is available.
            if data_dir is not None:
                val_loss.reset_states()
                val_top1.reset_states()
                val_top5.reset_states()

                if not dali_mode:
                    test_iter = iter(valid_input)
                for _ in range(nstep_per_valid):
                    if not dali_mode:
                        x = next(test_iter)
                    else:
                        x = valid_input.get_device_minibatches()
                    valid_step(x)

            if log_dir:
                ckpt.epoch.assign_add(1)
                if hvd.rank() == 0:
                    save_path = manager.save()
                    print("Saved checkpoint for epoch {}: {}".format(
                        int(ckpt.epoch), save_path))

            if hvd.rank() == 0:
                output_str = (
                    "loss: {} - top1: {} - top5: {} - val_loss: {} - "
                    "val_top1: {} - val_top5: {}")
                print(
                    output_str.format(train_loss, train_top1.result(),
                                      train_top5.result(), val_loss.result(),
                                      val_top1.result(), val_top5.result()))

            if hvd.rank() == 0 and summary_writer:
                with summary_writer.as_default():
                    tf.summary.scalar('train_loss', train_loss, global_steps)
                    tf.summary.scalar('train_top1', train_top1.result(),
                                      global_steps)
                    tf.summary.scalar('train_top5', train_top5.result(),
                                      global_steps)
                    tf.summary.scalar('val_loss', val_loss.result(),
                                      global_steps)
                    tf.summary.scalar('val_top1', val_top1.result(),
                                      global_steps)
                    tf.summary.scalar('val_top5', val_top5.result(),
                                      global_steps)

    except KeyboardInterrupt:
        print("Keyboard interrupt")
    finally:
        # Close the writer on every exit path, including an interrupted run.
        if hvd.rank() == 0 and summary_writer:
            summary_writer.close()

    if export_dir and hvd.rank() == 0:
        model.save(save_format)
        print(f"The model is saved to {save_format}")
def test_hvd():
    """Initialise Horovod and print this process's rank, local rank and size."""
    import horovod.tensorflow as hvd
    hvd.init()
    rank = hvd.rank()
    local_rank = hvd.local_rank()
    world_size = hvd.size()
    print('rank', rank, 'local', local_rank, 'size', world_size)
예제 #32
0
def main(_):
    """Train the MNIST model across Horovod workers with a monitored session.

    Args:
        _: Unused positional argument.
    """
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem.
    # makedirs(exist_ok=True) tolerates another worker creating the directory
    # first and also creates a missing ~/.keras parent, which a plain
    # os.mkdir would fail on.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    os.makedirs(cache_dir, exist_ok=True)

    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=10000 // hvd.size()),
        tf.train.LoggingTensorHook(tensors={
            'step': global_step,
            'loss': loss
        },
                                   every_n_iter=100),
        tf.train.ProfilerHook(save_steps=1000,
                              output_dir="./phook",
                              show_memory=True)
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train,
                                                     y_train,
                                                     batch_size=100)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
예제 #33
0
def main(_):
    """Run BERT classifier training, evaluation and/or prediction.

    Behaviour is driven by module-level ``FLAGS``. With ``FLAGS.horovod`` the
    training examples are sharded across ranks, while evaluation and
    prediction run on the master process (rank 0) only.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If none of ``do_train``/``do_eval``/``do_predict`` is set,
            if ``max_seq_length`` exceeds the model's
            ``max_position_embeddings``, or if ``task_name`` is unknown.
        AssertionError: If the number of written prediction lines differs
            from the number of prediction examples.
    """
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

    if FLAGS.horovod:
        hvd.init()
    if FLAGS.use_fp16:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"

    processors = {'consensus': ConsensusProcessor}

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    # The message names all three flags that the condition checks.
    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict` "
            "must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    task_name = FLAGS.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    tf.io.gfile.makedirs(FLAGS.output_dir)

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    # NOTE(review): this value is never read in this function; kept only in
    # case the tf.contrib attribute access is relied upon - confirm and remove.
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    # Single-process defaults; overridden below when Horovod is enabled.
    master_process = True
    training_hooks = []
    global_batch_size = FLAGS.train_batch_size
    hvd_rank = 0

    config = tf.compat.v1.ConfigProto()
    if FLAGS.horovod:
        global_batch_size = FLAGS.train_batch_size * hvd.size()
        master_process = (hvd.rank() == 0)
        hvd_rank = hvd.rank()
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        if hvd.size() > 1:
            training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))

    if FLAGS.use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
    # Only the master process gets a model_dir and a checkpoint interval.
    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir if master_process else None,
        session_config=config,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps
        if master_process else None,
        keep_checkpoint_max=1)

    if master_process:
        tf.compat.v1.logging.info("***** Configuration *****")
        for key in FLAGS.__flags.keys():
            tf.compat.v1.logging.info('  {}: {}'.format(
                key, getattr(FLAGS, key)))
        tf.compat.v1.logging.info("**************************")

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    # Appended last so that training_hooks[-1] below is this timing hook.
    training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank))

    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / global_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        start_index = 0
        end_index = len(train_examples)
        tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")]

        if FLAGS.horovod:
            tmp_filenames = [
                os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i))
                for i in range(hvd.size())
            ]
            # Split the examples into contiguous shards; the first
            # `remainder` ranks each take one extra example.
            num_examples_per_rank = len(train_examples) // hvd.size()
            remainder = len(train_examples) % hvd.size()
            if hvd.rank() < remainder:
                start_index = hvd.rank() * (num_examples_per_rank + 1)
                end_index = start_index + num_examples_per_rank + 1
            else:
                start_index = hvd.rank() * num_examples_per_rank + remainder
                end_index = start_index + (num_examples_per_rank)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list) + 1,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate
                                if not FLAGS.horovod else FLAGS.learning_rate *
                                hvd.size(),
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_one_hot_embeddings=False,
                                hvd=None if not FLAGS.horovod else hvd,
                                use_fp16=FLAGS.use_fp16)

    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

    if FLAGS.do_train:
        # Each rank writes only its own shard to its own record file.
        filed_based_convert_examples_to_features(
            train_examples[start_index:end_index], label_list,
            FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank])
        tf.compat.v1.logging.info("***** Running training *****")
        tf.compat.v1.logging.info("  Num examples = %d", len(train_examples))
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.compat.v1.logging.info("  Num steps = %d", num_train_steps)
        tf.compat.v1.logging.info("  Num of labels = %d", len(label_list))
        train_input_fn = file_based_input_fn_builder(
            input_file=tmp_filenames,
            batch_size=FLAGS.train_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            hvd=None if not FLAGS.horovod else hvd)

        train_start_time = time.time()
        estimator.train(input_fn=train_input_fn,
                        max_steps=num_train_steps,
                        hooks=training_hooks)
        train_time_elapsed = time.time() - train_start_time
        train_time_wo_overhead = training_hooks[-1].total_time
        avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed
        ss_sentences_per_second = (
            num_train_steps - training_hooks[-1].skipped
        ) * global_batch_size * 1.0 / train_time_wo_overhead

        if master_process:
            tf.compat.v1.logging.info("-----------------------------")
            tf.compat.v1.logging.info(
                "Total Training Time = %0.2f for Sentences = %d",
                train_time_elapsed, num_train_steps * global_batch_size)
            tf.compat.v1.logging.info(
                "Total Training Time W/O Overhead = %0.2f for Sentences = %d",
                train_time_wo_overhead,
                (num_train_steps - training_hooks[-1].skipped) *
                global_batch_size)
            tf.compat.v1.logging.info(
                "Throughput Average (sentences/sec) with overhead = %0.2f",
                avg_sentences_per_second)
            tf.compat.v1.logging.info(
                "Throughput Average (sentences/sec) = %0.2f",
                ss_sentences_per_second)
            tf.compat.v1.logging.info("-----------------------------")

    if FLAGS.do_eval and master_process:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        filed_based_convert_examples_to_features(eval_examples, label_list,
                                                 FLAGS.max_seq_length,
                                                 tokenizer, eval_file)

        tf.compat.v1.logging.info("***** Running evaluation *****")
        tf.compat.v1.logging.info(
            "  Num examples = %d (%d actual, %d padding)", len(eval_examples),
            num_actual_eval_examples,
            len(eval_examples) - num_actual_eval_examples)
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
        # This tells the estimator to run through the entire set.
        eval_steps = None
        eval_drop_remainder = False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            batch_size=FLAGS.eval_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)
        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
            tf.compat.v1.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.compat.v1.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
    if FLAGS.do_predict and master_process:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        filed_based_convert_examples_to_features(predict_examples, label_list,
                                                 FLAGS.max_seq_length,
                                                 tokenizer, predict_file)
        tf.compat.v1.logging.info("***** Running prediction*****")
        tf.compat.v1.logging.info(
            "  Num examples = %d (%d actual, %d padding)",
            len(predict_examples), num_actual_predict_examples,
            len(predict_examples) - num_actual_predict_examples)
        tf.compat.v1.logging.info("  Batch size = %d",
                                  FLAGS.predict_batch_size)

        predict_drop_remainder = False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            batch_size=FLAGS.predict_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        eval_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)]
        eval_start_time = time.time()

        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        with tf.io.gfile.GFile(output_predict_file, "w") as writer:
            num_written_lines = 0
            tf.compat.v1.logging.info("***** Predict results *****")
            for prediction in estimator.predict(input_fn=predict_input_fn,
                                                hooks=eval_hooks,
                                                yield_single_examples=True):
                probabilities = prediction["probabilities"]
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples

        eval_time_elapsed = time.time() - eval_start_time
        eval_time_wo_overhead = eval_hooks[-1].total_time

        time_list = eval_hooks[-1].time_list
        time_list.sort()
        num_sentences = (eval_hooks[-1].count -
                         eval_hooks[-1].skipped) * FLAGS.predict_batch_size

        def _latency_at(fraction):
            # Largest latency among the fastest `fraction` of the sorted
            # timings. Always keep at least one sample so that short runs
            # (e.g. a single timed batch) do not call max() on an empty slice.
            cutoff = max(int(len(time_list) * fraction), 1)
            return max(time_list[:cutoff])

        avg = np.mean(time_list)
        cf_50 = _latency_at(0.50)
        cf_90 = _latency_at(0.90)
        cf_95 = _latency_at(0.95)
        cf_99 = _latency_at(0.99)
        cf_100 = _latency_at(1)
        ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead

        tf.compat.v1.logging.info("-----------------------------")
        tf.compat.v1.logging.info(
            "Total Inference Time = %0.2f for Sentences = %d",
            eval_time_elapsed, eval_hooks[-1].count * FLAGS.predict_batch_size)
        tf.compat.v1.logging.info(
            "Total Inference Time W/O Overhead = %0.2f for Sentences = %d",
            eval_time_wo_overhead,
            (eval_hooks[-1].count - eval_hooks[-1].skipped) *
            FLAGS.predict_batch_size)
        tf.compat.v1.logging.info("Summary Inference Statistics")
        tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size)
        tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
        tf.compat.v1.logging.info("Precision = %s",
                                  "fp16" if FLAGS.use_fp16 else "fp32")
        tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f",
                                  cf_50 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f",
                                  cf_90 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f",
                                  cf_95 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f",
                                  cf_99 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f",
                                  cf_100 * 1000)
        tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
        tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f",
                                  ss_sentences_per_second)
        tf.compat.v1.logging.info("-----------------------------")
예제 #34
0
    def model_fn(features, labels, mode, params):
        """Build the EstimatorSpec for the requested estimator mode.

        Args:
            features: Mapping with ``input_ids``, ``input_mask``,
                ``segment_ids``, ``label_ids`` and optionally
                ``is_real_example``.
            labels: Not read here; labels come from ``features["label_ids"]``.
            mode: A ``tf.estimator.ModeKeys`` value.
            params: Not read here.

        Returns:
            A ``tf.estimator.EstimatorSpec`` carrying the train op (TRAIN),
            the accuracy/loss metrics (EVAL), or the class probabilities
            (any other mode).
        """
        log = tf.compat.v1.logging

        log.info("*** Features ***")
        for feature_name in sorted(features.keys()):
            log.info("  name = %s, shape = %s" %
                     (feature_name, features[feature_name].shape))

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]

        # Every example counts as real unless the input supplies its own mask.
        if "is_real_example" in features:
            is_real_example = tf.cast(features["is_real_example"],
                                      dtype=tf.float32)
        else:
            is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        model_outputs = create_model(bert_config, is_training, input_ids,
                                     input_mask, segment_ids, label_ids,
                                     num_labels, use_one_hot_embeddings)
        total_loss, per_example_loss, logits, probabilities = model_outputs

        tvars = tf.trainable_variables()
        initialized_variable_names = {}
        # Weights are loaded from the checkpoint only without Horovod or on
        # rank 0.
        load_from_checkpoint = init_checkpoint and (hvd is None
                                                    or hvd.rank() == 0)
        if load_from_checkpoint:
            assignment_map, initialized_variable_names = (
                modeling.get_assignment_map_from_checkpoint(
                    tvars, init_checkpoint))
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        log.info("**** Trainable Variables ****")
        for var in tvars:
            init_string = (", *INIT_FROM_CKPT*"
                           if var.name in initialized_variable_names else "")
            log.info("  name = %s, shape = %s%s", var.name, var.shape,
                     init_string)

        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op = optimization.create_optimizer(total_loss, learning_rate,
                                                     num_train_steps,
                                                     num_warmup_steps, hvd,
                                                     False, use_fp16)
            return tf.estimator.EstimatorSpec(mode=mode,
                                              loss=total_loss,
                                              train_op=train_op)

        if mode == tf.estimator.ModeKeys.EVAL:
            # Accuracy and mean loss, both weighted by the real-example mask.
            predictions = tf.argmax(logits, axis=-1, output_type=tf.int64)
            accuracy = tf.compat.v1.metrics.accuracy(labels=label_ids,
                                                     predictions=predictions,
                                                     weights=is_real_example)
            mean_loss = tf.compat.v1.metrics.mean(values=per_example_loss,
                                                  weights=is_real_example)
            eval_metric_ops = {
                "eval_accuracy": accuracy,
                "eval_loss": mean_loss,
            }
            return tf.estimator.EstimatorSpec(mode=mode,
                                              loss=total_loss,
                                              eval_metric_ops=eval_metric_ops)

        return tf.estimator.EstimatorSpec(
            mode=mode, predictions={"probabilities": probabilities})
예제 #35
0
파일: train.py 프로젝트: yww2567/tensorpack
        logger.warn(
            "TF<1.6 has a bug which may lead to crash in FasterRCNN if you're unlucky."
        )

    args = parser.parse_args()
    if args.config:
        cfg.update_args(args.config)
    register_coco(cfg.DATA.BASEDIR)  # add COCO datasets to the registry
    register_balloon(
        cfg.DATA.BASEDIR)  # add the demo balloon datasets to the registry

    # Setup logger ...
    is_horovod = cfg.TRAINER == 'horovod'
    if is_horovod:
        hvd.init()
        logger.info("Horovod Rank={}, Size={}".format(hvd.rank(), hvd.size()))

    if not is_horovod or hvd.rank() == 0:
        logger.set_logger_dir(args.logdir, 'd')
    logger.info("Environment Information:\n" + collect_env_info())

    finalize_configs(is_training=True)

    # Compute the training schedule from the number of GPUs ...
    stepnum = cfg.TRAIN.STEPS_PER_EPOCH
    # warmup is step based, lr is epoch based
    init_lr = cfg.TRAIN.WARMUP_INIT_LR * min(8. / cfg.TRAIN.NUM_GPUS, 1.)
    warmup_schedule = [(0, init_lr), (cfg.TRAIN.WARMUP, cfg.TRAIN.BASE_LR)]
    warmup_end_epoch = cfg.TRAIN.WARMUP * 1. / stepnum
    lr_schedule = [(int(warmup_end_epoch + 0.5), cfg.TRAIN.BASE_LR)]
예제 #36
0

def maybe_download(filename, expected_bytes):
    """Ensure ``filename`` exists locally and has exactly ``expected_bytes`` bytes.

    If the path does not exist yet, it is fetched from the module-level
    ``url`` via ``urllib.request.urlretrieve``. The size of the resulting
    file is then compared with ``expected_bytes``.

    Args:
        filename: Local path to check (and to download to when missing).
        expected_bytes: Required file size in bytes.

    Returns:
        The path of the verified local file.

    Raises:
        Exception: If the file size differs from ``expected_bytes``; the
            actual size is printed before raising.
    """
    if os.path.exists(filename):
        local_path = filename
    else:
        # urlretrieve returns (local_filename, headers); only the path is used.
        local_path, _headers = urllib.request.urlretrieve(url, filename)

    actual_bytes = os.stat(local_path).st_size
    if actual_bytes != expected_bytes:
        print(actual_bytes)
        raise Exception(
            'Failed to verify ' + url + '. Can you get to it with a browser?')

    print('Found and verified', local_path)
    return local_path

# The Horovod rank is embedded in the local file name, so each rank checks /
# downloads its own copy; 31344016 is the expected size in bytes.
filename = maybe_download('text8-%d.zip' % hvd.rank(), 31344016)


# Read the data into a list of strings.
def read_data(filename):
    """Return the whitespace-separated tokens of a zip archive's first member.

    Args:
        filename: Path to a zip archive; only the first entry listed by
            ``namelist()`` is read.

    Returns:
        A list of strings obtained by converting the member's bytes with
        ``tf.compat.as_str`` and splitting on whitespace.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    return tf.compat.as_str(raw_bytes).split()

# Load the corpus as a flat list of word strings and report its length.
vocabulary = read_data(filename)
print('Data size', len(vocabulary))

# Step 2: Build the dictionary and replace rare words with UNK token.
# Vocabulary size constant for the dictionary-building step announced above.
vocabulary_size = 50000
예제 #37
0
파일: main.py 프로젝트: JoelRuhe/sarUNET
def main(args, config):
    """Build the denoising graph and run the train/test loop.

    The function builds a TF1 graph consisting of a float32 placeholder for
    a batch of volumes, a fixed additive noise tensor, the network returned
    by ``forward``, a mean-squared-error loss and an Adam optimizer (wrapped
    in ``hvd.DistributedOptimizer`` when ``args.horovod`` is set). It then
    creates image/loss summaries and runs ``args.epochs`` epochs of training
    followed by testing inside a ``tf.Session``.

    Args:
        args: Namespace of run options. Attributes read here: ``horovod``,
            ``batch_size``, ``dataset_root``, ``image_size``,
            ``image_channels``, ``scratch_path``, ``data_format``,
            ``loss_fn``, ``learning_rate``, ``clip_value_min``,
            ``clip_value_max``, ``train_size``, ``epochs`` and
            ``logging_interval`` (``forward`` receives the whole namespace).
        config: Session configuration passed to ``tf.Session(config=...)``.

    Raises:
        ValueError: If ``args.loss_fn`` is not ``"mean_squared_error"``.
    """
    if args.horovod:
        verbose = hvd.rank() == 0
        local_rank = hvd.local_rank()
    else:
        verbose = True
        local_rank = 0

    global_batch_size = args.batch_size * hvd.size(
    ) if args.horovod else args.batch_size

    timestamp = time.strftime("%Y-%m-%d_%H:%M:%S", time.gmtime())
    logdir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'runs',
                          timestamp)

    global_step = 0

    tf.reset_default_graph()

    # ------------------------------------------------------------------------------------------#
    # DATASET

    data_path = os.path.join(args.dataset_root,
                             f'{args.image_size}x{args.image_size}/')

    # retrieve dataset; only local rank 0 is asked to copy files
    npy_data = NumpyPathDataset(data_path,
                                args.scratch_path,
                                copy_files=local_rank == 0,
                                is_correct_phase=True)

    dataset = tf.data.Dataset.from_tensor_slices(npy_data.scratch_files)

    if args.horovod:
        # Dataset transformations return a new dataset; the result must be
        # kept or the shard is silently dropped.
        # NOTE(review): `dataset` is not consumed below -- batches are loaded
        # directly from `npy_data` -- so this currently has no training effect.
        dataset = dataset.shard(hvd.size(), hvd.rank())

    if args.data_format == "NCDHW":
        current_shape = [
            args.batch_size, args.image_channels, args.image_size // 4,
            args.image_size, args.image_size
        ]
    else:
        current_shape = [
            args.batch_size, args.image_size // 4, args.image_size,
            args.image_size, args.image_channels
        ]

    real_image_input = tf.placeholder(shape=current_shape, dtype=tf.float32)

    # ------------------ NOISE ----------------

    # The noise is a NumPy array created once at graph-construction time, so
    # the same noise pattern is added to every batch fed through the graph.
    rand_batch1 = np.random.rand(*real_image_input.shape) * 0.5
    noise_black_patches1 = rand_batch1.copy()

    # x_input = image_input + tf.random.normal(shape=image_input.shape) * args.noise_strength
    # x_input = image_input + tf.random.gamma(shape=x_input.shape, alpha=0.05)
    # x_input = x_input + tf.random.uniform(shape=x_input.shape) * args.noise_strength
    # x_input = x_input + tf.random.poisson(lam=0.5, shape=x_input.shape)

    # add box_sampler noise which mimics conebeam noise:
    # 100 sampled boxes per batch element are zeroed out in the noise array
    for i in range(real_image_input.shape[0]):
        for _ in range(100):
            arr_slices = uniform_box_sampler(noise_black_patches1,
                                             min_width=(1, 1, 1, 3, 3),
                                             max_width=(1, 1, 3, 6, 6))[0]

            noise_black_patches1[arr_slices] = 0

    x_input = real_image_input + noise_black_patches1
    y = real_image_input

    # ------------------ NETWORK ----------------

    prediction = forward(x_input, args)

    # ------------------ OPTIM -----------------
    # Compare by value: `is` tests object identity and is unreliable for
    # strings that come from argument parsing.
    if args.loss_fn == "mean_squared_error":
        loss = tf.losses.mean_squared_error(labels=y, predictions=prediction)
    else:
        # Fail immediately instead of hitting an undefined `loss` below.
        raise ValueError("Choose one of the available args.loss_fn")

    # Scale the learning rate by the number of Horovod workers.
    lr_scaler = hvd.size() if args.horovod else 1
    optimizer = tf.train.AdamOptimizer(args.learning_rate * lr_scaler)

    if args.horovod:
        optimizer = hvd.DistributedOptimizer(optimizer)

    train_step = optimizer.minimize(loss)

    # ------------- SUMMARIES -------------
    # Only the first element of the batch is visualised.
    if args.data_format == "NCDHW":
        train_input = tf.transpose(x_input[0], (1, 2, 3, 0))
        prediction_input = tf.transpose(prediction[0], (1, 2, 3, 0))
        real_input = tf.transpose(y[0], (1, 2, 3, 0))
    else:
        train_input = tf.transpose(x_input[0], (0, 1, 2, 3))
        prediction_input = tf.transpose(prediction[0], (0, 1, 2, 3))
        real_input = tf.transpose(y[0], (0, 1, 2, 3))

    prediction_input = tf.clip_by_value(prediction_input,
                                        clip_value_min=args.clip_value_min,
                                        clip_value_max=args.clip_value_max)

    # transform images into grid
    shape = train_input.get_shape().as_list()
    image_shape = shape[1:3]
    print(shape)
    print(image_shape)
    # number of grid columns: largest power of two <= sqrt(shape[0])
    grid_cols = int(2**np.floor(np.log(np.sqrt(shape[0])) / np.log(2)))
    grid_rows = shape[0] // grid_cols
    grid_shape = [grid_rows, grid_cols]
    train_input = image_grid(train_input,
                             grid_shape,
                             image_shape=shape[1:3],
                             num_channels=shape[-1])

    shape = prediction_input.get_shape().as_list()
    grid_cols = int(2**np.floor(np.log(np.sqrt(shape[0])) / np.log(2)))
    grid_rows = shape[0] // grid_cols
    grid_shape = [grid_rows, grid_cols]
    prediction_input = image_grid(prediction_input,
                                  grid_shape,
                                  image_shape=shape[1:3],
                                  num_channels=shape[-1])

    shape = real_input.get_shape().as_list()
    grid_cols = int(2**np.floor(np.log(np.sqrt(shape[0])) / np.log(2)))
    grid_rows = shape[0] // grid_cols
    grid_shape = [grid_rows, grid_cols]
    real_input = image_grid(real_input,
                            grid_shape,
                            image_shape=shape[1:3],
                            num_channels=shape[-1])

    with tf.variable_scope("train_summaries"):
        train_loss = tf.summary.scalar('train_loss', loss)
        train_imageNoise = tf.summary.image('train_imageNoise', train_input)
        train_imageRemake = tf.summary.image('train_imageRemake',
                                             prediction_input)
        train_imageReal = tf.summary.image('train_imageReal', real_input)

        image_summary_train = tf.summary.merge(
            [train_loss, train_imageReal, train_imageRemake, train_imageNoise])

    with tf.variable_scope("test_summaries"):
        test_loss = tf.summary.scalar('test_loss', loss)
        test_imageNoise = tf.summary.image('test_imageNoise', train_input)
        test_imageRemake = tf.summary.image('test_imageRemake',
                                            prediction_input)
        test_imageReal = tf.summary.image('test_imageReal', real_input)

        image_summary_test = tf.summary.merge(
            [test_loss, test_imageNoise, test_imageRemake, test_imageReal])

    # -------------- SESSION -------------

    with tf.Session(config=config) as sess:

        # tf.initialize_all_variables is deprecated in favour of this call.
        sess.run(tf.global_variables_initializer())
        # NOTE(review): with Horovod the initial variables are not broadcast
        # from rank 0 here, so workers may start from different weights --
        # confirm whether hvd.broadcast_global_variables(0) is needed.

        # `writer` exists only on the verbose process; every use is guarded.
        if verbose:
            writer = tf.summary.FileWriter(logdir=logdir,
                                           graph=sess.graph,
                                           session=sess)

        # calculate percentage testset and trainingset
        train_size = int(len(npy_data) * args.train_size)
        test_size = int(len(npy_data) * (1 - args.train_size) + 1)

        num_train_steps = train_size // global_batch_size
        num_test_steps = test_size // global_batch_size

        for epoch in range(args.epochs):
            epoch_loss_train = 0
            epoch_loss_test = 0

            # TRAINING
            for i in range(num_train_steps):

                # prepare trainingbatch
                # NOTE(review): the lower bound is a step count
                # (num_test_steps), not a sample count (test_size) -- confirm
                # the intended train/test split.
                batch_loc = np.random.randint(num_test_steps,
                                              len(npy_data) - args.batch_size)
                batch_paths = npy_data[batch_loc:batch_loc + args.batch_size]
                # np.stack needs a sequence; passing a generator is
                # deprecated and rejected by newer NumPy releases.
                batch = np.stack([np.load(path) for path in batch_paths])
                batch = batch[:, np.newaxis, ...].astype(np.float32) / 1024 - 1
                if args.data_format == "NDHWC":
                    batch = np.transpose(batch, (0, 2, 3, 4, 1))

                _, summary, c = sess.run(
                    [train_step, image_summary_train, loss],
                    feed_dict={real_image_input: batch})

                if i % args.logging_interval == 0 and verbose:
                    global_step = (epoch * num_train_steps *
                                   global_batch_size) + i * global_batch_size
                    writer.add_summary(summary, global_step)
                    writer.flush()
                    # NOTE(review): the loss is accumulated only on logged
                    # steps but later divided by num_train_steps.
                    epoch_loss_train += c

            # TESTING
            for i in range(num_test_steps):

                # prepare testbatch
                # NOTE(review): the upper bound uses num_test_steps (a step
                # count) rather than test_size -- confirm.
                batch_loc = np.random.randint(0,
                                              num_test_steps - args.batch_size)
                batch_paths = npy_data[batch_loc:batch_loc + args.batch_size]
                batch = np.stack([np.load(path) for path in batch_paths])
                batch = batch[:, np.newaxis, ...].astype(np.float32) / 1024 - 1

                if args.data_format == "NDHWC":
                    batch = np.transpose(batch, (0, 2, 3, 4, 1))

                c = sess.run(loss, feed_dict={real_image_input: batch})

                if i % args.logging_interval == 0 and verbose:
                    epoch_loss_test += c

            if verbose:
                # writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='loss_test', simple_value=epoch_loss_test / num_test_steps)]), global_step)
                # The test image summary is computed on the last batch loaded.
                test_image_summary = sess.run(
                    image_summary_test, feed_dict={real_image_input: batch})
                writer.add_summary(test_image_summary, global_step)
                writer.flush()

            if verbose:
                print(f'Epoch [{epoch}/{args.epochs}]\t'
                      f'Train Loss: {epoch_loss_train / num_train_steps}\t'
                      f'Test Loss: {epoch_loss_test / num_test_steps}\t')
예제 #38
0
def main(_):
    """Configure a ``TPUEstimator`` from ``FLAGS`` and run training and/or eval.

    Horovod is imported and initialised only when ``FLAGS.horovod`` is set;
    in that case the learning rate and the reported global batch size are
    multiplied by ``hvd.size()``, checkpoints are saved only on rank 0, and
    evaluation runs only on rank 0. Evaluation results are logged and written
    to ``eval_results.txt`` inside ``FLAGS.output_dir``.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If neither ``FLAGS.do_train`` nor ``FLAGS.do_eval`` is set.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    if not (FLAGS.do_train or FLAGS.do_eval):
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    use_horovod = FLAGS.horovod
    if use_horovod:
        import horovod.tensorflow as hvd
        hvd.init()

    # NOTE(review): `bert_config_file` is a name defined outside this
    # function (not FLAGS.bert_config_file) -- confirm it is in scope.
    bert_config = modeling.BertConfig.from_json_file(bert_config_file.name)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    # Expand every comma-separated glob pattern into concrete input files.
    input_files = [
        matched_path
        for input_pattern in FLAGS.input_file.split(",")
        for matched_path in tf.gfile.Glob(input_pattern)
    ]

    tf.logging.info("*** Input Files ***")
    for input_file in input_files:
        tf.logging.info("  %s" % input_file)

    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    else:
        tpu_cluster_resolver = None

    session_config = tf.ConfigProto()
    if use_horovod:
        # One GPU per process, selected by the local rank.
        session_config.gpu_options.visible_device_list = str(hvd.local_rank())
    if FLAGS.use_xla:
        session_config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    # Checkpoints are saved only without Horovod or on Horovod rank 0.
    if not use_horovod or hvd.rank() == 0:
        checkpoint_steps = FLAGS.save_checkpoints_steps
    else:
        checkpoint_steps = None

    # This variable controls how often estimator reports examples/sec.
    # Default value is every 100 steps.
    # When --report_loss is True, we set to very large value to prevent
    # default info reporting from estimator.
    # Ideally we should set it to None, but that does not work.
    if FLAGS.report_loss:
        log_every_n_steps = 10000
    else:
        log_every_n_steps = 100

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    tpu_config = tf.contrib.tpu.TPUConfig(
        iterations_per_loop=FLAGS.iterations_per_loop,
        num_shards=FLAGS.num_tpu_cores,
        per_host_input_for_training=is_per_host)
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        session_config=session_config,
        save_checkpoints_steps=checkpoint_steps,
        tpu_config=tpu_config,
        log_step_count_steps=log_every_n_steps)

    # The learning rate is multiplied by the worker count under Horovod.
    if use_horovod:
        effective_learning_rate = FLAGS.learning_rate * hvd.size()
    else:
        effective_learning_rate = FLAGS.learning_rate
    hvd_module = hvd if use_horovod else None

    model_fn = model_fn_builder(bert_config=bert_config,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=effective_learning_rate,
                                num_train_steps=FLAGS.num_train_steps,
                                num_warmup_steps=FLAGS.num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu,
                                disable_nsp=FLAGS.disable_nsp,
                                hvd=hvd_module)

    training_hooks = []
    if use_horovod and hvd.size() > 1:
        training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
    if FLAGS.report_loss:
        if use_horovod:
            global_batch_size = FLAGS.train_batch_size * hvd.size()
            hook_rank = hvd.rank()
        else:
            global_batch_size = FLAGS.train_batch_size
            hook_rank = -1
        training_hooks.append(
            _LogSessionRunHook(global_batch_size, 1, hook_rank))

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size)

    if FLAGS.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        train_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=True,
            hvd=hvd_module)
        estimator.train(input_fn=train_input_fn,
                        hooks=training_hooks,
                        max_steps=FLAGS.num_train_steps)

    if FLAGS.do_eval and (not use_horovod or hvd.rank() == 0):
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False,
            hvd=hvd_module)

        result = estimator.evaluate(input_fn=eval_input_fn,
                                    steps=FLAGS.max_eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                value_text = str(result[key])
                tf.logging.info("  %s = %s", key, value_text)
                writer.write("%s = %s\n" % (key, value_text))
예제 #39
0
def main(_):
    """Run a DeepFM Estimator task (train / eval / infer / export) under Horovod.

    Reads its configuration from ``FLAGS``, initialises Horovod, builds a
    ``tf.estimator.Estimator`` around ``model_fn`` (with a model directory
    only on Horovod rank 0) and dispatches on ``FLAGS.task_type``. For the
    ``train`` and ``export`` task types the model is additionally exported
    as a SavedModel from rank 0.

    Args:
        _: Unused positional argument. Note that the name ``_`` is reused as
            a loop variable in the file-mode training loop below.

    Raises:
        KeyError: If the ``SM_CHANNELS`` environment variable is not set.
    """
    # liangaws: debug-print the arguments SageMaker passes to this program.
    import sys
    print(sys.argv)

    #liangaws: initialize Horovod.
    hvd.init()

    #------check Arguments------
    # Default the date directory to yesterday's date (YYYYMMDD).
    if FLAGS.dt_dir == "":
        FLAGS.dt_dir = (date.today() + timedelta(-1)).strftime('%Y%m%d')
    #FLAGS.model_dir = FLAGS.model_dir + FLAGS.dt_dir
    #FLAGS.data_dir  = FLAGS.data_dir + FLAGS.dt_dir

    print('task_type ', FLAGS.task_type)
    print('model_dir ', FLAGS.model_dir)
    print('data_dir ', FLAGS.data_dir)
    print('dt_dir ', FLAGS.dt_dir)
    print('num_epochs ', FLAGS.num_epochs)
    print('feature_size ', FLAGS.feature_size)
    print('field_size ', FLAGS.field_size)
    print('embedding_size ', FLAGS.embedding_size)
    print('batch_size ', FLAGS.batch_size)
    print('deep_layers ', FLAGS.deep_layers)
    print('dropout ', FLAGS.dropout)
    print('loss_type ', FLAGS.loss_type)
    print('optimizer ', FLAGS.optimizer)
    print('learning_rate ', FLAGS.learning_rate)
    print('batch_norm_decay ', FLAGS.batch_norm_decay)
    print('batch_norm ', FLAGS.batch_norm)
    print('l2_reg ', FLAGS.l2_reg)

    #------init Envs------
    # liangaws: glob.glob collects the names of all training files under
    # data_dir into a list; per the original author's note, that list can be
    # passed straight to TextLineDataset.
    tr_files = glob.glob("%s/tr*libsvm" % FLAGS.data_dir)
    random.shuffle(tr_files)
    print("tr_files:", tr_files)
    va_files = glob.glob("%s/va*libsvm" % FLAGS.data_dir)
    print("va_files:", va_files)
    te_files = glob.glob("%s/te*libsvm" % FLAGS.data_dir)
    print("te_files:", te_files)

    # Best-effort removal of an existing model directory; failures are only
    # printed, not raised.
    if FLAGS.clear_existing_model:
        try:
            shutil.rmtree(FLAGS.model_dir)
        except Exception as e:
            print(e, "at clear_existing_model")
        else:
            print("existing model cleaned at %s" % FLAGS.model_dir)

    # liangaws: the call that sets up the parameter-server style distributed
    # training environment is commented out, because this training
    # environment is controlled by SageMaker.
    #set_dist_env()

    #------build Tasks------
    model_params = {
        "field_size": FLAGS.field_size,
        "feature_size": FLAGS.feature_size,
        "embedding_size": FLAGS.embedding_size,
        "learning_rate": FLAGS.learning_rate,
        "batch_norm_decay": FLAGS.batch_norm_decay,
        "l2_reg": FLAGS.l2_reg,
        "deep_layers": FLAGS.deep_layers,
        "dropout": FLAGS.dropout
    }

    # liangaws: the config setting below is disabled (kept as a bare string
    # literal) and is not used for now.
    """ 
    config = tf.estimator.RunConfig().replace(session_config = tf.ConfigProto(device_count={'GPU':0, 'CPU':FLAGS.num_threads}),
            log_step_count_steps=FLAGS.log_steps, save_summary_steps=FLAGS.log_steps)
    """

    # liangaws: set the checkpoint period and the maximum number of checkpoints
    #config = tf.estimator.RunConfig().replace(save_checkpoints_secs = 5,
    #                                          keep_checkpoint_max = 5, #log_step_count_steps=FLAGS.log_steps, save_summary_steps=FLAGS.log_steps)

    # liangaws: with Horovod, pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # liangaws: with Horovod, save checkpoints only on worker 0 to prevent other workers from corrupting them.
    print('current horovod rank is ', hvd.rank())
    print('input model dir is ', FLAGS.model_dir)

    print("host is ", FLAGS.hosts)
    print('current host is ', FLAGS.current_host)

    # Only rank 0 gets a real model_dir; the other ranks pass model_dir=None.
    if hvd.rank() == 0:
        DeepFM = tf.estimator.Estimator(
            model_fn=model_fn,
            model_dir=FLAGS.model_dir,
            params=model_params,
            config=tf.estimator.RunConfig().replace(session_config=config))
    else:
        DeepFM = tf.estimator.Estimator(
            model_fn=model_fn,
            model_dir=None,
            params=model_params,
            config=tf.estimator.RunConfig().replace(session_config=config))

    # liangaws: with Horovod, BroadcastGlobalVariablesHook broadcasts initial variable states from rank 0 to all other processes. This is necessary to ensure consistent initialization of all workers when training is started with random weights or restored from a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # liangaws: to run several Horovod worker processes on one machine in
    # SageMaker pipe mode, the SageMaker estimator's fit call must be given
    # multiple channels -- at least one channel per worker on the machine.
    # The names of all channels are read from the SM_CHANNELS environment
    # variable set by SageMaker; each worker then reads from its own channel.
    # The order of the channel names differs from the order used in the fit
    # call. For example, for the three channels {'training':train_s3,
    # 'training-2':train2_s3, 'evaluation': validate_s3} SageMaker sets the
    # variable to ['evaluation', 'training', 'training-2'], i.e. the last
    # channel 'evaluation' comes first in SM_CHANNELS and the other channels
    # keep their original order.
    channel_names = json.loads(os.environ['SM_CHANNELS'])
    print("channel name", channel_names)
    print("first channel", channel_names[0])
    print("last channel name", channel_names[-1])
    # The first listed channel is used as the evaluation channel.
    eval_channel = channel_names[0]

    if FLAGS.task_type == 'train':
        # liangaws: add the hook to the TrainSpec (disabled variant kept
        # below as a bare string literal)
        """
        train_spec = tf.estimator.TrainSpec(input_fn=lambda: input_fn(tr_files, channel='training', num_epochs=FLAGS.num_epochs, batch_size=FLAGS.batch_size), hooks=[bcast_hook])
        eval_spec = tf.estimator.EvalSpec(input_fn=lambda: input_fn(va_files, channel='evaluation', num_epochs=1, batch_size=FLAGS.batch_size), steps=None, start_delay_secs=1000, throttle_secs=1200)
        tf.estimator.train_and_evaluate(DeepFM, train_spec, eval_spec)
        
        """
        if FLAGS.pipe_mode == 0:  #file mode
            # One train call (and, on rank 0, one evaluate call) per epoch.
            for _ in range(FLAGS.num_epochs):
                DeepFM.train(input_fn=lambda: input_fn(
                    tr_files, num_epochs=1, batch_size=FLAGS.batch_size),
                             hooks=[bcast_hook])
                if hvd.rank() == 0:  # only the Horovod master needs to evaluate the model
                    DeepFM.evaluate(input_fn=lambda: input_fn(
                        va_files, num_epochs=1, batch_size=FLAGS.batch_size))
        else:  #pipe mode
            # liangaws: with horovod + pipe mode, when a worker enters
            # input_fn a second time during training, continuing to read the
            # same FIFO through PipeModeDataset causes problems.
            """
            train_spec = tf.estimator.TrainSpec(input_fn=lambda: input_fn(channel=channel_names[1 + hvd.local_rank()], num_epochs=FLAGS.num_epochs, batch_size=FLAGS.batch_size), hooks=[bcast_hook])
            eval_spec = tf.estimator.EvalSpec(input_fn=lambda: input_fn(channel=eval_channel, num_epochs=1, batch_size=FLAGS.batch_size), steps=None, start_delay_secs=1000, throttle_secs=1200)
            tf.estimator.train_and_evaluate(DeepFM, train_spec, eval_spec)
        
            """
            # Each local worker reads the channel at index 1 + local rank
            # (index 0 is used as the evaluation channel above).
            DeepFM.train(input_fn=lambda: input_fn(
                channel=channel_names[1 + hvd.local_rank()],
                num_epochs=FLAGS.num_epochs,
                batch_size=FLAGS.batch_size),
                         hooks=[bcast_hook])
            if hvd.rank() == 0:  # only the Horovod master needs to evaluate the model
                DeepFM.evaluate(
                    input_fn=lambda: input_fn(channel=eval_channel,
                                              num_epochs=1,
                                              batch_size=FLAGS.batch_size))

    elif FLAGS.task_type == 'eval':
        DeepFM.evaluate(input_fn=lambda: input_fn(
            va_files, num_epochs=1, batch_size=FLAGS.batch_size))
    elif FLAGS.task_type == 'infer':
        preds = DeepFM.predict(input_fn=lambda: input_fn(
            te_files, num_epochs=1, batch_size=FLAGS.batch_size),
                               predict_keys="prob")
        # One predicted probability per line in <data_dir>/pred.txt.
        with open(FLAGS.data_dir + "/pred.txt", "w") as fo:
            for prob in preds:
                fo.write("%f\n" % (prob['prob']))
    # liangaws: changed so that the model is saved when the task type is
    # either train or export
    if FLAGS.task_type == 'export' or FLAGS.task_type == 'train':
        #feature_spec = tf.feature_column.make_parse_example_spec(feature_columns)
        #feature_spec = {
        #    'feat_ids': tf.FixedLenFeature(dtype=tf.int64, shape=[None, FLAGS.field_size]),
        #    'feat_vals': tf.FixedLenFeature(dtype=tf.float32, shape=[None, FLAGS.field_size])
        #}
        #serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
        # Raw-tensor serving inputs: two placeholders of shape
        # [None, FLAGS.field_size] (int64 ids and float32 values).
        feature_spec = {
            'feat_ids':
            tf.placeholder(dtype=tf.int64,
                           shape=[None, FLAGS.field_size],
                           name='feat_ids'),
            'feat_vals':
            tf.placeholder(dtype=tf.float32,
                           shape=[None, FLAGS.field_size],
                           name='feat_vals')
        }
        serving_input_receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(
            feature_spec)

        # liangaws: with Horovod, save model and history only on worker 0 (i.e. master)
        if hvd.rank() == 0:
            DeepFM.export_savedmodel(FLAGS.servable_model_dir,
                                     serving_input_receiver_fn)
예제 #40
0
    def dataset_fn(self,
                   batch_size,
                   training,
                   input_shape,
                   mask_shape,
                   num_threads,
                   use_gpu_prefetch,
                   normalize_data_method,
                   only_defective_images,
                   augment_data,
                   seed=None):
        """Build the ``tf.data`` input pipeline from the dataset's CSV listing.

        Each CSV row (after the header) is parsed into an input image name, a
        mask image name (may be empty) and an integer label. Images are
        decoded as PNG, resized, cast to float32 and optionally normalised.
        The pipeline is cached, shuffled/repeated (and sharded across Horovod
        workers) when ``training`` is true, otherwise just repeated; it is
        then mapped, batched with ``drop_remainder=True`` and prefetched.

        Args:
            batch_size: Number of examples per batch.
            training: Selects the data directory/CSV via ``_get_data_dirs``
                and enables shuffling and Horovod sharding.
            input_shape: Static shape set on the input image; its first two
                entries are the resize size, its last the channel count.
            mask_shape: Same as ``input_shape`` but for the mask image; also
                the shape of the all-zero mask used when no mask name is given.
            num_threads: ``num_parallel_calls`` for the map step.
            use_gpu_prefetch: When true, append a ``prefetch_to_device``
                stage targeting ``/gpu:0``.
            normalize_data_method: ``None``, ``"zero_centered"``
                (``x / 127.5 - 1``) or ``"zero_one"`` (``x / 255``) for the
                input image; masks always use ``"zero_one"``.
            only_defective_images: Keep only CSV lines whose last character
                is not ``"0"``.
            augment_data: Apply a random horizontal flip and a random number
                of 90-degree rotations, identically to image and mask.
            seed: Optional seed for shuffling and augmentation ops.

        Returns:
            A ``tf.data.Dataset`` yielding ``((input_image, mask_image), label)``
            batches.
        """

        super(DAGM2007_Dataset, self).dataset_fn(
            batch_size=batch_size,
            training=training,
            input_shape=input_shape,
            mask_shape=mask_shape,
            num_threads=num_threads,
            use_gpu_prefetch=use_gpu_prefetch,
            normalize_data_method=
            normalize_data_method,  # [None, "zero_centered", "zero_one"]
            only_defective_images=only_defective_images,
            augment_data=augment_data,
            seed=seed)

        shuffle_buffer_size = 10000

        def decode_csv(line):
            """Turn one CSV line into ``((input_image, mask_image), label)``."""

            input_image_name, image_mask_name, label = tf.decode_csv(
                line, record_defaults=[[""], [""], [0]], field_delim=',')

            def decode_image(filepath, resize_shape, normalize_data_method):
                """Read, decode, resize and normalise a single PNG file."""
                image_content = tf.read_file(filepath)

                # image = tf.image.decode_image(image_content, channels=resize_shape[-1])
                image = tf.image.decode_png(contents=image_content,
                                            channels=resize_shape[-1],
                                            dtype=tf.uint8)

                # NOTE(review): preserve_aspect_ratio=True may yield a size
                # other than resize_shape[:2] for non-matching aspect ratios,
                # while set_shape below asserts the full shape -- confirm the
                # source images always match the target aspect ratio.
                image = tf.image.resize_images(
                    image,
                    size=resize_shape[:2],
                    method=tf.image.ResizeMethod.
                    BILINEAR,  # [BILINEAR, NEAREST_NEIGHBOR, BICUBIC, AREA]
                    align_corners=False,
                    preserve_aspect_ratio=True)

                image.set_shape(resize_shape)
                image = tf.cast(image, tf.float32)

                if normalize_data_method == "zero_centered":
                    image = tf.divide(image, 127.5) - 1

                elif normalize_data_method == "zero_one":
                    image = tf.divide(image, 255.0)

                return image

            input_image = decode_image(
                filepath=tf.strings.join([image_dir, input_image_name],
                                         separator='/'),
                resize_shape=input_shape,
                normalize_data_method=normalize_data_method,
            )

            # Rows without a mask file name get an all-zero mask.
            mask_image = tf.cond(
                tf.equal(image_mask_name, ""),
                true_fn=lambda: tf.zeros(mask_shape, dtype=tf.float32),
                false_fn=lambda: decode_image(
                    filepath=tf.strings.join([mask_image_dir, image_mask_name],
                                             separator='/'),
                    resize_shape=mask_shape,
                    normalize_data_method="zero_one",
                ),
            )

            if augment_data:

                if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
                    LOGGER.log("Using data augmentation ...")

                #input_image = tf.image.per_image_standardization(input_image)

                # The same random decision is applied to image and mask so
                # they stay aligned.
                horizontal_flip = tf.random_uniform(shape=(), seed=seed) > 0.5
                input_image = tf.cond(
                    horizontal_flip,
                    lambda: tf.image.flip_left_right(input_image),
                    lambda: input_image)
                mask_image = tf.cond(
                    horizontal_flip,
                    lambda: tf.image.flip_left_right(mask_image),
                    lambda: mask_image)

                # NOTE(review): maxval is exclusive for integer dtypes, so
                # k is drawn from {0, 1, 2} and a 270-degree rotation never
                # occurs -- confirm whether maxval=4 was intended.
                n_rots = tf.random_uniform(shape=(),
                                           dtype=tf.int32,
                                           minval=0,
                                           maxval=3,
                                           seed=seed)
                input_image = tf.image.rot90(input_image, k=n_rots)
                mask_image = tf.image.rot90(mask_image, k=n_rots)

            label = tf.cast(label, tf.int32)

            return (input_image, mask_image), label

        # image_dir / mask_image_dir are captured by decode_csv, which only
        # runs later when the map step below is applied.
        image_dir, csv_file = self._get_data_dirs(training=training)

        mask_image_dir = os.path.join(image_dir, "Label")

        dataset = tf.data.TextLineDataset(csv_file)

        dataset = dataset.skip(1)  # Skip CSV Header

        if only_defective_images:
            dataset = dataset.filter(
                lambda line: tf.not_equal(tf.strings.substr(line, -1, 1), "0"))

        # Cache the (filtered) CSV lines, not the decoded images.
        dataset = dataset.cache()

        if training:

            dataset = dataset.apply(
                tf.data.experimental.shuffle_and_repeat(
                    buffer_size=shuffle_buffer_size, seed=seed))

            if hvd_utils.is_using_hvd():
                dataset = dataset.shard(hvd.size(), hvd.rank())

        else:
            dataset = dataset.repeat()

        dataset = dataset.apply(
            tf.data.experimental.map_and_batch(
                map_func=decode_csv,
                num_parallel_calls=num_threads,
                batch_size=batch_size,
                drop_remainder=True,
            ))

        dataset = dataset.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)

        if use_gpu_prefetch:
            # Dataset.apply returns a new dataset; the result must be kept,
            # otherwise the GPU prefetch stage is silently dropped. This is
            # the last transformation of the pipeline.
            dataset = dataset.apply(
                tf.data.experimental.prefetch_to_device(
                    device="/gpu:0", buffer_size=batch_size * 8))

        return dataset
예제 #41
0
def main(_):
    """Train, evaluate and/or run prediction with a BERT classifier.

    Which phases run is selected by ``FLAGS.do_train``, ``FLAGS.do_eval`` and
    ``FLAGS.do_predict``. When ``FLAGS.horovod`` is set, Horovod is
    initialised, the training examples are split into one contiguous slice per
    rank, and evaluation/prediction run on rank 0 only.

    Files written directly by this function (inside ``FLAGS.output_dir``):
    ``eval_results.txt`` (evaluation metrics) and ``test_results.tsv``
    (tab-separated prediction values). The ``*.tf_record`` paths are handed to
    ``file_based_convert_examples_to_features``.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If none of the ``do_*`` flags is set, if
            ``FLAGS.max_seq_length`` exceeds the model's
            ``max_position_embeddings``, or if ``FLAGS.task_name`` is not one
            of the known processors.
    """
    # Lazy XLA compilation causes memory fragmentation for BERT, leading to OOM.
    os.environ[
        "TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false"

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path)

    if FLAGS.horovod:
        hvd.init()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "xnli": XnliProcessor,
    }

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.io.gfile.makedirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    # Single-process defaults; overridden below when Horovod is enabled.
    master_process = True
    training_hooks = []
    global_batch_size = FLAGS.train_batch_size * FLAGS.num_accumulation_steps
    hvd_rank = 0

    config = tf.compat.v1.ConfigProto()
    if FLAGS.horovod:

        tf.compat.v1.logging.info("Multi-GPU training with TF Horovod")
        tf.compat.v1.logging.info("hvd.size() = %d hvd.rank() = %d",
                                  hvd.size(), hvd.rank())
        global_batch_size = (FLAGS.train_batch_size *
                             FLAGS.num_accumulation_steps * hvd.size())
        master_process = (hvd.rank() == 0)
        hvd_rank = hvd.rank()
        # Restrict this process to the GPU matching its local rank.
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        if hvd.size() > 1:
            training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
    if FLAGS.use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1

    # Only the master process gets a model directory and checkpoint schedule.
    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir if master_process else None,
        session_config=config,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps
        if master_process else None,
        keep_checkpoint_max=1)

    if master_process:
        tf.compat.v1.logging.info("***** Configuaration *****")
        for key in FLAGS.__flags.keys():
            tf.compat.v1.logging.info('  {}: {}'.format(
                key, getattr(FLAGS, key)))
        tf.compat.v1.logging.info("**************************")

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    # Must stay the LAST training hook: the timing code below reads
    # training_hooks[-1].total_time / .skipped.
    training_hooks.append(LogTrainRunHook(global_batch_size, hvd_rank))

    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / global_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        start_index = 0
        end_index = len(train_examples)
        tmp_filenames = [os.path.join(FLAGS.output_dir, "train.tf_record")]

        if FLAGS.horovod:
            # One record file per rank; each rank converts a contiguous slice.
            tmp_filenames = [
                os.path.join(FLAGS.output_dir, "train.tf_record{}".format(i))
                for i in range(hvd.size())
            ]
            num_examples_per_rank = len(train_examples) // hvd.size()
            remainder = len(train_examples) % hvd.size()
            # The first `remainder` ranks take one extra example each so the
            # slices cover every example exactly once.
            if hvd.rank() < remainder:
                start_index = hvd.rank() * (num_examples_per_rank + 1)
                end_index = start_index + num_examples_per_rank + 1
            else:
                start_index = hvd.rank() * num_examples_per_rank + remainder
                end_index = start_index + (num_examples_per_rank)

    # Under Horovod the learning rate is multiplied by the number of workers.
    model_fn = model_fn_builder(task_name=task_name,
                                bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate
                                if not FLAGS.horovod else FLAGS.learning_rate *
                                hvd.size(),
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_one_hot_embeddings=False,
                                hvd=None if not FLAGS.horovod else hvd)

    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

    if FLAGS.do_train:

        file_based_convert_examples_to_features(
            train_examples[start_index:end_index], label_list,
            FLAGS.max_seq_length, tokenizer, tmp_filenames[hvd_rank])

        tf.compat.v1.logging.info("***** Running training *****")
        tf.compat.v1.logging.info("  Num examples = %d", len(train_examples))
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.compat.v1.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=tmp_filenames,
            batch_size=FLAGS.train_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True,
            hvd=None if not FLAGS.horovod else hvd)

        train_start_time = time.time()
        estimator.train(input_fn=train_input_fn,
                        max_steps=num_train_steps,
                        hooks=training_hooks)
        train_time_elapsed = time.time() - train_start_time
        train_time_wo_overhead = training_hooks[-1].total_time
        avg_sentences_per_second = num_train_steps * global_batch_size * 1.0 / train_time_elapsed
        ss_sentences_per_second = (
            num_train_steps - training_hooks[-1].skipped
        ) * global_batch_size * 1.0 / train_time_wo_overhead

        if master_process:
            tf.compat.v1.logging.info("-----------------------------")
            tf.compat.v1.logging.info(
                "Total Training Time = %0.2f for Sentences = %d",
                train_time_elapsed, num_train_steps * global_batch_size)
            tf.compat.v1.logging.info(
                "Total Training Time W/O Overhead = %0.2f for Sentences = %d",
                train_time_wo_overhead,
                (num_train_steps - training_hooks[-1].skipped) *
                global_batch_size)
            tf.compat.v1.logging.info(
                "Throughput Average (sentences/sec) with overhead = %0.2f",
                avg_sentences_per_second)
            tf.compat.v1.logging.info(
                "Throughput Average (sentences/sec) = %0.2f",
                ss_sentences_per_second)
            tf.compat.v1.logging.info("-----------------------------")

    if FLAGS.do_eval and master_process:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)

        tf.compat.v1.logging.info("***** Running evaluation *****")
        tf.compat.v1.logging.info("  Num examples = %d", len(eval_examples))
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_drop_remainder = False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            batch_size=FLAGS.eval_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)]
        eval_start_time = time.time()
        result = estimator.evaluate(input_fn=eval_input_fn, hooks=eval_hooks)

        eval_time_elapsed = time.time() - eval_start_time

        time_list = eval_hooks[-1].time_list
        time_list.sort()
        # Removing outliers (init/warmup) in throughput computation: after the
        # ascending sort, the slowest 1% of entries are dropped.
        eval_time_wo_overhead = sum(time_list[:int(len(time_list) * 0.99)])
        # Evaluation batches are sized by eval_batch_size (see eval_input_fn),
        # so the sentence count must use it, not the predict batch size.
        num_sentences = (int(len(time_list) * 0.99)) * FLAGS.eval_batch_size

        # Latency at each level is the largest value in the fastest X% of
        # the sorted timings.
        avg = np.mean(time_list)
        cf_50 = max(time_list[:int(len(time_list) * 0.50)])
        cf_90 = max(time_list[:int(len(time_list) * 0.90)])
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])
        ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead

        tf.compat.v1.logging.info("-----------------------------")
        tf.compat.v1.logging.info(
            "Total Inference Time = %0.2f for Sentences = %d",
            eval_time_elapsed, eval_hooks[-1].count * FLAGS.eval_batch_size)
        tf.compat.v1.logging.info(
            "Total Inference Time W/O Overhead = %0.2f for Sentences = %d",
            eval_time_wo_overhead, num_sentences)
        tf.compat.v1.logging.info("Summary Inference Statistics on EVAL set")
        tf.compat.v1.logging.info("Batch size = %d", FLAGS.eval_batch_size)
        tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
        tf.compat.v1.logging.info("Precision = %s",
                                  "fp16" if FLAGS.use_fp16 else "fp32")
        tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f",
                                  cf_50 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f",
                                  cf_90 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f",
                                  cf_95 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f",
                                  cf_99 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f",
                                  cf_100 * 1000)
        tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
        tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f",
                                  ss_sentences_per_second)
        # NOTE(review): this is the evaluation throughput but the key says
        # "throughput_train"; kept as-is because log consumers may depend on
        # the key name - confirm before renaming.
        dllogging.logger.log(
            step=(),
            data={"throughput_train": ss_sentences_per_second},
            verbosity=Verbosity.DEFAULT)
        tf.compat.v1.logging.info("-----------------------------")

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
            tf.compat.v1.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                dllogging.logger.log(step=(),
                                     data={key: float(result[key])},
                                     verbosity=Verbosity.DEFAULT)
                tf.compat.v1.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict and master_process:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, predict_file)

        tf.compat.v1.logging.info("***** Running prediction*****")
        tf.compat.v1.logging.info("  Num examples = %d", len(predict_examples))
        tf.compat.v1.logging.info("  Batch size = %d",
                                  FLAGS.predict_batch_size)

        predict_drop_remainder = False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            batch_size=FLAGS.predict_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        predict_hooks = [LogEvalRunHook(FLAGS.predict_batch_size)]
        predict_start_time = time.time()

        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        with tf.io.gfile.GFile(output_predict_file, "w") as writer:
            tf.compat.v1.logging.info("***** Predict results *****")
            # One tab-separated output line per item yielded by predict().
            for prediction in estimator.predict(input_fn=predict_input_fn,
                                                hooks=predict_hooks,
                                                yield_single_examples=False):
                output_line = "\t".join(
                    str(class_probability)
                    for class_probability in prediction) + "\n"
                writer.write(output_line)

        predict_time_elapsed = time.time() - predict_start_time
        predict_time_wo_overhead = predict_hooks[-1].total_time

        time_list = predict_hooks[-1].time_list
        time_list.sort()
        num_sentences = (predict_hooks[-1].count -
                         predict_hooks[-1].skipped) * FLAGS.predict_batch_size

        # Same latency summary as in the evaluation phase above.
        avg = np.mean(time_list)
        cf_50 = max(time_list[:int(len(time_list) * 0.50)])
        cf_90 = max(time_list[:int(len(time_list) * 0.90)])
        cf_95 = max(time_list[:int(len(time_list) * 0.95)])
        cf_99 = max(time_list[:int(len(time_list) * 0.99)])
        cf_100 = max(time_list[:int(len(time_list) * 1)])
        ss_sentences_per_second = num_sentences * 1.0 / predict_time_wo_overhead

        tf.compat.v1.logging.info("-----------------------------")
        tf.compat.v1.logging.info(
            "Total Inference Time = %0.2f for Sentences = %d",
            predict_time_elapsed,
            predict_hooks[-1].count * FLAGS.predict_batch_size)
        tf.compat.v1.logging.info(
            "Total Inference Time W/O Overhead = %0.2f for Sentences = %d",
            predict_time_wo_overhead,
            (predict_hooks[-1].count - predict_hooks[-1].skipped) *
            FLAGS.predict_batch_size)
        tf.compat.v1.logging.info("Summary Inference Statistics on TEST SET")
        tf.compat.v1.logging.info("Batch size = %d", FLAGS.predict_batch_size)
        tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
        tf.compat.v1.logging.info("Precision = %s",
                                  "fp16" if FLAGS.use_fp16 else "fp32")
        tf.compat.v1.logging.info("Latency Confidence Level 50 (ms) = %0.2f",
                                  cf_50 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 90 (ms) = %0.2f",
                                  cf_90 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 95 (ms) = %0.2f",
                                  cf_95 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 99 (ms) = %0.2f",
                                  cf_99 * 1000)
        tf.compat.v1.logging.info("Latency Confidence Level 100 (ms) = %0.2f",
                                  cf_100 * 1000)
        tf.compat.v1.logging.info("Latency Average (ms) = %0.2f", avg * 1000)
        tf.compat.v1.logging.info("Throughput Average (sentences/sec) = %0.2f",
                                  ss_sentences_per_second)
        dllogging.logger.log(step=(),
                             data={"throughput_val": ss_sentences_per_second},
                             verbosity=Verbosity.DEFAULT)
        tf.compat.v1.logging.info("-----------------------------")
예제 #42
0
파일: train.py 프로젝트: chinatian/glow
def train(sess, model, hps, logdir, visualise):
    """Run the epoch loop: train, periodically validate, sample and log.

    Args:
        sess: Session object; its graph is finalized before the first step.
        model: Object exposing ``train(lr)``, ``test()`` and ``save(path)``.
        hps: Hyper-parameter namespace. Reads ``epochs``, ``train_its``,
            ``full_test_its``, ``lr``, ``n_train``, ``epochs_warmup``,
            ``n_batch_train``, ``local_batch_train``, ``verbose``,
            ``epochs_full_valid`` and ``epochs_full_sample``.
        logdir: Path prefix (joined by plain string concatenation) for the
            ``train.txt`` / ``test.txt`` logs and the best-loss checkpoint.
        visualise: Callable invoked with the epoch number on sampling epochs.
    """

    def on_root():
        # Logging and checkpointing are restricted to Horovod rank 0.
        return hvd.rank() == 0

    _print(hps)
    _print('Starting training. Logging to', logdir)
    _print('epoch n_processed n_images ips dtrain dtest dsample dtot train_results test_results msg')

    # No graph modifications are allowed past this point.
    sess.graph.finalize()
    n_processed = 0
    n_images = 0
    train_time = 0.0
    test_loss_best = 999999

    if on_root():
        train_logger = ResultLogger(logdir + "train.txt", **hps.__dict__)
        test_logger = ResultLogger(logdir + "test.txt", **hps.__dict__)

    tcurr = time.time()
    for epoch in range(1, hps.epochs):

        epoch_start = time.time()

        step_results = []
        for _ in range(hps.train_its):

            # Learning rate ramps linearly from 0 over the first
            # hps.epochs_warmup epochs' worth of processed examples.
            warmup_fraction = n_processed / (hps.n_train * hps.epochs_warmup)
            lr = hps.lr * min(1., warmup_fraction)

            # One synchronous training step.
            step_start = time.time()
            step_results.append(model.train(lr))
            if hps.verbose and on_root():
                _print(n_processed, time.time() - step_start, step_results[-1])
                sys.stdout.flush()

            # Images seen wrt anchor resolution
            n_processed += hvd.size() * hps.n_batch_train
            # Actual images seen at current resolution
            n_images += hvd.size() * hps.local_batch_train

        train_results = np.mean(np.asarray(step_results), axis=0)

        train_secs = time.time() - epoch_start
        ips = (hps.train_its * hvd.size() * hps.local_batch_train) / train_secs
        train_time += train_secs

        if on_root():
            train_logger.log(
                epoch=epoch,
                n_processed=n_processed,
                n_images=n_images,
                train_time=int(train_time),
                **process_results(train_results))

        # Report on every epoch below 10, every 10th epoch below 50, and on
        # every full-validation epoch; all other epochs stop here.
        if not (epoch < 10 or (epoch < 50 and epoch % 10 == 0)
                or epoch % hps.epochs_full_valid == 0):
            continue

        test_results = []
        msg = ''

        valid_start = time.time()
        # (Polyak parameter swap around evaluation is disabled.)

        if epoch % hps.epochs_full_valid == 0:
            # Full validation run
            test_results = [model.test() for _ in range(hps.full_test_its)]
            test_results = np.mean(np.asarray(test_results), axis=0)

            if on_root():
                test_logger.log(
                    epoch=epoch,
                    n_processed=n_processed,
                    n_images=n_images,
                    **process_results(test_results))

                # Checkpoint whenever the first test metric improves.
                if test_results[0] < test_loss_best:
                    test_loss_best = test_results[0]
                    model.save(logdir+"model_best_loss.ckpt")
                    msg += ' *'

        valid_secs = time.time() - valid_start

        # Sample
        sample_start = time.time()
        if epoch in (1, 10) or epoch % hps.epochs_full_sample == 0:
            visualise(epoch)
        sample_secs = time.time() - sample_start

        if on_root():
            # Wall time since the previous report line.
            since_last_report = time.time() - tcurr
            tcurr = time.time()
            timings = "{:.1f} {:.1f} {:.1f} {:.1f} {:.1f}".format(
                ips, train_secs, valid_secs, sample_secs, since_last_report)
            _print(epoch, n_processed, n_images, timings,
                   train_results, test_results, msg)

    if on_root():
        _print("Finished!")
예제 #43
0
def main():
    """Parse the command line, build a ``Runner`` and execute the run type.

    The positional ``run`` argument selects one of ``train_and_eval``,
    ``train``, ``eval``, ``infer``, ``export`` or ``score``. When
    ``--chief_host`` is given, a ``TF_CONFIG`` environment variable describing
    the cluster is exported first. With ``--horovod``, Horovod is imported and
    initialised and rank 0 acts as chief.

    Raises:
        ValueError: If ``--chief_host`` is set with a run type other than
            ``train_and_eval``.
    """
    cli_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    add = cli_parser.add_argument

    # --- General options -------------------------------------------------
    add(
        "-v",
        "--version",
        action="version",
        version="OpenNMT-tf %s" % __version__,
    )
    add(
        "run",
        choices=[
            "train_and_eval",
            "train",
            "eval",
            "infer",
            "export",
            "score",
        ],
        help="Run type.",
    )
    add(
        "--config",
        required=True,
        nargs="+",
        help="List of configuration files.",
    )
    add(
        "--auto_config",
        default=False,
        action="store_true",
        help="Enable automatic configuration values.",
    )
    add(
        "--model_type",
        default="",
        choices=list(classes_in_module(catalog, public_only=True)),
        help="Model type from the catalog.",
    )
    add(
        "--model",
        default="",
        help="Custom model configuration file.",
    )
    add(
        "--run_dir",
        default="",
        help="If set, model_dir will be created relative to this location.",
    )
    add(
        "--data_dir",
        default="",
        help="If set, data files are expected to be relative to this location.",
    )

    # --- Inference / scoring / export options ----------------------------
    add(
        "--features_file",
        default=[],
        nargs="+",
        help="Run inference on this file.",
    )
    add(
        "--predictions_file",
        default="",
        help=(
            "File used to save predictions. If not set, predictions are printed "
            "on the standard output."
        ),
    )
    add(
        "--log_prediction_time",
        default=False,
        action="store_true",
        help="Logs some prediction time metrics.",
    )
    add(
        "--checkpoint_path",
        default=None,
        help=(
            "Checkpoint or directory to use for inference or export "
            "(when a directory is set, the latest checkpoint is used)."
        ),
    )
    add(
        "--source_scope",
        default=None,
        help=("Checkpoint scope name to restore to model."),
    )
    add(
        "--target_scope",
        default=None,
        help=("target scope name to restore to model."),
    )
    add(
        "--export_dir_base",
        default=None,
        help="The base directory of the exported model.",
    )

    # --- Device and distribution options ---------------------------------
    add(
        "--num_gpus",
        type=int,
        default=1,
        help="Number of GPUs to use for in-graph replication.",
    )
    add(
        "--chief_host",
        default="",
        help="hostname:port of the chief worker (for distributed training).",
    )
    add(
        "--worker_hosts",
        default="",
        help=(
            "Comma-separated list of hostname:port of workers "
            "(for distributed training)."
        ),
    )
    add(
        "--ps_hosts",
        default="",
        help=(
            "Comma-separated list of hostname:port of parameter servers "
            "(for distributed training)."
        ),
    )
    add(
        "--task_type",
        default="chief",
        choices=["chief", "worker", "ps", "evaluator"],
        help="Type of the task to run (for distributed training).",
    )
    add(
        "--task_index",
        type=int,
        default=0,
        help="ID of the task (for distributed training).",
    )
    add(
        "--horovod",
        default=False,
        action="store_true",
        help="Enable Horovod support for this run.",
    )

    # --- Runtime / session options ---------------------------------------
    add(
        "--log_level",
        default="INFO",
        choices=["DEBUG", "ERROR", "FATAL", "INFO", "WARN"],
        help="Logs verbosity.",
    )
    add("--seed", type=int, default=None, help="Random seed.")
    add(
        "--gpu_allow_growth",
        default=False,
        action="store_true",
        help="Allocate GPU memory dynamically.",
    )
    add(
        "--intra_op_parallelism_threads",
        type=int,
        default=0,
        help=(
            "Number of intra op threads (0 means the system picks "
            "an appropriate number)."
        ),
    )
    add(
        "--inter_op_parallelism_threads",
        type=int,
        default=0,
        help=(
            "Number of inter op threads (0 means the system picks "
            "an appropriate number)."
        ),
    )
    add(
        "--session_config",
        default=None,
        help=(
            "Path to a file containing a tf.ConfigProto message in text format "
            "and used to create the TensorFlow sessions."
        ),
    )
    opts = cli_parser.parse_args()

    tf.logging.set_verbosity(getattr(tf.logging, opts.log_level))

    # Export the cluster description when a chief host is given.
    if opts.chief_host:
        if opts.run != "train_and_eval":
            raise ValueError(
                "Distributed training is only supported with the train_and_eval run type"
            )
        cluster_spec = {
            "chief": [opts.chief_host],
            "worker": opts.worker_hosts.split(","),
            "ps": opts.ps_hosts.split(","),
        }
        task_spec = {"type": opts.task_type, "index": opts.task_index}
        os.environ["TF_CONFIG"] = json.dumps({
            "cluster": cluster_spec,
            "task": task_spec,
        })

    # Initialize Horovod if requested; otherwise the chief is decided by the
    # task type.
    if opts.horovod:
        import horovod.tensorflow as hvd
        hvd.init()
        is_chief = hvd.rank() == 0
    else:
        hvd = None
        is_chief = opts.task_type == "chief"

    # Load and merge run configurations, then apply the path prefixes.
    config = load_config(opts.config)
    if opts.run_dir:
        config["model_dir"] = os.path.join(opts.run_dir, config["model_dir"])
    if opts.data_dir:
        config["data"] = _prefix_paths(opts.data_dir, config["data"])

    # Only the chief creates the model directory.
    if is_chief and not tf.gfile.Exists(config["model_dir"]):
        tf.logging.info("Creating model directory %s", config["model_dir"])
        tf.gfile.MakeDirs(config["model_dir"])

    model = load_model(
        config["model_dir"],
        model_file=opts.model,
        model_name=opts.model_type,
        serialize_model=is_chief,
    )

    gpu_options = tf.GPUOptions(allow_growth=opts.gpu_allow_growth)
    session_config = tf.ConfigProto(
        intra_op_parallelism_threads=opts.intra_op_parallelism_threads,
        inter_op_parallelism_threads=opts.inter_op_parallelism_threads,
        gpu_options=gpu_options,
    )
    # A user-supplied text-format ConfigProto is merged into the flag-derived one.
    if opts.session_config is not None:
        with open(opts.session_config, "rb") as session_config_file:
            text_format.Merge(session_config_file.read(), session_config)

    runner = Runner(
        model,
        config,
        seed=opts.seed,
        num_devices=opts.num_gpus,
        session_config=session_config,
        auto_config=opts.auto_config,
        hvd=hvd,
    )

    # Dispatch on the requested run type.
    run_type = opts.run
    if run_type == "train_and_eval":
        runner.train_and_evaluate(
            checkpoint_path=opts.checkpoint_path,
            source_scope=opts.source_scope,
            target_scope=opts.target_scope,
        )
    elif run_type == "train":
        runner.train(
            checkpoint_path=opts.checkpoint_path,
            source_scope=opts.source_scope,
            target_scope=opts.target_scope,
        )
    elif run_type == "eval":
        runner.evaluate(checkpoint_path=opts.checkpoint_path)
    elif run_type == "infer":
        features = opts.features_file
        if not features:
            cli_parser.error("--features_file is required for inference.")
        if len(features) == 1:
            # A single file is passed as a plain path rather than a list.
            features = features[0]
        runner.infer(
            features,
            predictions_file=opts.predictions_file,
            checkpoint_path=opts.checkpoint_path,
            log_time=opts.log_prediction_time,
        )
    elif run_type == "export":
        runner.export(
            checkpoint_path=opts.checkpoint_path,
            export_dir_base=opts.export_dir_base,
        )
    elif run_type == "score":
        if not opts.features_file:
            cli_parser.error("--features_file is required for scoring.")
        runner.score(
            opts.features_file,
            opts.predictions_file,
            checkpoint_path=opts.checkpoint_path,
        )
예제 #44
0
def main():
    """Entry point of the command-line interface.

    Parses the command line, configures logging, TensorFlow threading and GPU
    visibility (optionally through Horovod), loads the run configuration and
    the model, then dispatches to the ``Runner`` method that matches the
    selected run type.
    """
    # Options shared by every run type.
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-v",
                        "--version",
                        action="version",
                        version="OpenNMT-tf %s" % __version__)
    parser.add_argument("--config",
                        required=True,
                        nargs="+",
                        help="List of configuration files.")
    parser.add_argument(
        "--auto_config",
        default=False,
        action="store_true",
        help="Enable automatic configuration values.",
    )
    parser.add_argument(
        "--model_type",
        default="",
        choices=list(sorted(catalog.list_model_names_from_catalog())),
        help="Model type from the catalog.",
    )
    parser.add_argument("--model",
                        default="",
                        help="Custom model configuration file.")
    parser.add_argument(
        "--run_dir",
        default="",
        help="If set, model_dir will be created relative to this location.",
    )
    parser.add_argument(
        "--data_dir",
        default="",
        help="If set, data files are expected to be relative to this location.",
    )
    parser.add_argument(
        "--checkpoint_path",
        default=None,
        help=("Specific checkpoint or model directory to load "
              "(when a directory is set, the latest checkpoint is used)."),
    )
    parser.add_argument(
        "--log_level",
        default="INFO",
        choices=["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"],
        help="Logs verbosity.",
    )
    parser.add_argument("--seed", type=int, default=None, help="Random seed.")
    parser.add_argument(
        "--gpu_allow_growth",
        default=False,
        action="store_true",
        help="Allocate GPU memory dynamically.",
    )
    parser.add_argument(
        "--intra_op_parallelism_threads",
        type=int,
        default=0,
        help=("Number of intra op threads (0 means the system picks "
              "an appropriate number)."),
    )
    parser.add_argument(
        "--inter_op_parallelism_threads",
        type=int,
        default=0,
        help=("Number of inter op threads (0 means the system picks "
              "an appropriate number)."),
    )
    parser.add_argument(
        "--mixed_precision",
        default=False,
        action="store_true",
        help="Enable mixed precision.",
    )
    parser.add_argument(
        "--eager_execution",
        default=False,
        action="store_true",
        help="Enable TensorFlow eager execution.",
    )

    # One sub-command per run type; the selected name is stored in
    # args.run_type and a run type must always be given.
    subparsers = parser.add_subparsers(help="Run type.", dest="run_type")
    subparsers.required = True
    parser_train = subparsers.add_parser("train", help="Training.")
    parser_train.add_argument(
        "--with_eval",
        default=False,
        action="store_true",
        help="Enable automatic evaluation.",
    )
    parser_train.add_argument(
        "--num_gpus",
        type=int,
        default=1,
        help="Number of GPUs to use for in-graph replication.",
    )
    parser_train.add_argument(
        "--horovod",
        default=False,
        action="store_true",
        help="Enable Horovod training mode.",
    )

    parser_eval = subparsers.add_parser("eval", help="Evaluation.")
    parser_eval.add_argument("--features_file",
                             nargs="+",
                             default=None,
                             help="Input features files.")
    parser_eval.add_argument("--labels_file",
                             default=None,
                             help="Output labels files.")

    parser_infer = subparsers.add_parser("infer", help="Inference.")
    parser_infer.add_argument("--features_file",
                              nargs="+",
                              required=True,
                              help="Run inference on this file.")
    parser_infer.add_argument(
        "--predictions_file",
        default="",
        help=
        ("File used to save predictions. If not set, predictions are printed "
         "on the standard output."),
    )
    parser_infer.add_argument(
        "--log_prediction_time",
        default=False,
        action="store_true",
        help="Logs some prediction time metrics.",
    )

    parser_export = subparsers.add_parser("export", help="Model export.")
    parser_export.add_argument(
        "--output_dir",
        "--export_dir",
        required=True,
        help="The directory of the exported model.",
    )
    parser_export.add_argument(
        "--format",
        "--export_format",
        choices=exporters.list_exporters(),
        default="saved_model",
        help="Format of the exported model.",
    )

    parser_score = subparsers.add_parser("score", help="Scoring.")
    parser_score.add_argument("--features_file",
                              nargs="+",
                              required=True,
                              help="Features file.")
    parser_score.add_argument("--predictions_file",
                              default=None,
                              help="Predictions to score.")

    parser_average_checkpoints = subparsers.add_parser(
        "average_checkpoints", help="Checkpoint averaging.")
    parser_average_checkpoints.add_argument(
        "--output_dir",
        required=True,
        help="The output directory for the averaged checkpoint.",
    )
    parser_average_checkpoints.add_argument(
        "--max_count",
        type=int,
        default=8,
        help="The maximal number of checkpoints to average.",
    )

    parser_update_vocab = subparsers.add_parser(
        "update_vocab", help="Update model vocabularies in checkpoint.")
    parser_update_vocab.add_argument(
        "--output_dir",
        required=True,
        help="The output directory for the updated checkpoint.",
    )
    parser_update_vocab.add_argument("--src_vocab",
                                     default=None,
                                     help="Path to the new source vocabulary.")
    parser_update_vocab.add_argument("--tgt_vocab",
                                     default=None,
                                     help="Path to the new target vocabulary.")

    # When using an option that takes multiple values just before the run type,
    # the run type is treated as a value of this option. To fix this issue, we
    # inject a placeholder option just before the run type to clearly separate it.
    parser.add_argument("--placeholder",
                        action="store_true",
                        help=argparse.SUPPRESS)
    run_types = set(subparsers.choices.keys())
    args = sys.argv[1:]
    # Only the first token equal to a run type name gets the placeholder.
    for i, arg in enumerate(args):
        if arg in run_types:
            args.insert(i, "--placeholder")
            break

    args = parser.parse_args(args)
    # A single features file is passed on as a plain string rather than as a
    # one-element list. Not every run type defines --features_file, hence the
    # hasattr check.
    if (hasattr(args, "features_file") and args.features_file
            and len(args.features_file) == 1):
        args.features_file = args.features_file[0]

    _initialize_logging(getattr(logging, args.log_level))
    tf.config.threading.set_intra_op_parallelism_threads(
        args.intra_op_parallelism_threads)
    tf.config.threading.set_inter_op_parallelism_threads(
        args.inter_op_parallelism_threads)

    if args.eager_execution:
        tf.config.run_functions_eagerly(True)

    gpus = tf.config.list_physical_devices(device_type="GPU")
    # --horovod only exists on the "train" sub-command, hence the hasattr
    # check. Horovod is imported lazily so it is only needed when requested.
    if hasattr(args, "horovod") and args.horovod:
        import horovod.tensorflow as hvd

        hvd.init()
        is_master = hvd.rank() == 0
        if gpus:
            # Restrict this process to the GPU indexed by its local rank.
            local_gpu = gpus[hvd.local_rank()]
            tf.config.set_visible_devices(local_gpu, device_type="GPU")
            gpus = [local_gpu]
    else:
        # Without Horovod this single process acts as the master.
        hvd = None
        is_master = True

    if args.gpu_allow_growth:
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, enable=True)

    # Load and merge run configurations.
    config = load_config(args.config)
    if args.run_dir:
        config["model_dir"] = os.path.join(args.run_dir, config["model_dir"])
    if args.data_dir:
        config["data"] = _prefix_paths(args.data_dir, config["data"])

    # Only the master process creates the model directory.
    if is_master and not tf.io.gfile.exists(config["model_dir"]):
        tf.get_logger().info("Creating model directory %s",
                             config["model_dir"])
        tf.io.gfile.makedirs(config["model_dir"])

    model = load_model(
        config["model_dir"],
        model_file=args.model,
        model_name=args.model_type,
        serialize_model=is_master,
        as_builder=True,
    )
    runner = Runner(
        model,
        config,
        auto_config=args.auto_config,
        mixed_precision=args.mixed_precision,
        seed=args.seed,
    )

    # Dispatch to the Runner method matching the selected run type.
    if args.run_type == "train":
        runner.train(
            num_devices=args.num_gpus,
            with_eval=args.with_eval,
            checkpoint_path=args.checkpoint_path,
            hvd=hvd,
        )
    elif args.run_type == "eval":
        metrics = runner.evaluate(
            checkpoint_path=args.checkpoint_path,
            features_file=args.features_file,
            labels_file=args.labels_file,
        )
        print(metrics)
    elif args.run_type == "infer":
        runner.infer(
            args.features_file,
            predictions_file=args.predictions_file,
            checkpoint_path=args.checkpoint_path,
            log_time=args.log_prediction_time,
        )
    elif args.run_type == "export":
        runner.export(
            args.output_dir,
            checkpoint_path=args.checkpoint_path,
            exporter=exporters.make_exporter(args.format),
        )
    elif args.run_type == "score":
        runner.score(
            args.features_file,
            args.predictions_file,
            checkpoint_path=args.checkpoint_path,
        )
    elif args.run_type == "average_checkpoints":
        runner.average_checkpoints(args.output_dir, max_count=args.max_count)
    elif args.run_type == "update_vocab":
        runner.update_vocab(args.output_dir,
                            src_vocab=args.src_vocab,
                            tgt_vocab=args.tgt_vocab)
예제 #45
0
def main():
    """Parse the benchmark options, assemble the run parameters and time the I/O.

    The parameter dictionary is loaded from the JSON file named by
    ``--input_flags`` (``input_flags.json`` when the option is absent) and is
    then overridden by any command-line option that was explicitly given.
    The result is handed to ``benchmark_io``.

    Raises:
        ValueError: If the command line contains unrecognised arguments.
    """
    tf.set_random_seed(1234)
    np.random.seed(4321)

    # initiate horovod
    hvd.init()

    cmdline = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # (option, default, type, help) of every value-taking option, listed in
    # registration order.
    value_options = (
        ('--num_batches', 2, int, """number of each minibatch."""),
        ('--batch_size', None, int, """Size of each minibatch."""),
        ('--log_frequency', None, int, """Logging frequency."""),
        ('--max_steps', None, int, """Maximum steps."""),
        ('--network_config', None, str, """Neural net architecture."""),
        ('--data_dir', None, str, """Data directory [train/test]."""),
        ('--checkpt_dir', None, str, """Checkpoint directory."""),
        ('--input_flags', None, str, """Input json."""),
        ('--hyper_params', None, str, """Hyper parameters."""),
        ('--ilr', None, float,
         """Initial learning rate ( hyper parameter)."""),
        ('--epochs_per_decay', None, float,
         """Number of epochs per lr decay ( hyper parameter)."""),
        ('--scaling', None, float, """Scaling (hyper parameter)."""),
        ('--bn_decay', None, float,
         """Batch norm decay (hyper parameter)."""),
        ('--save_epochs', 0.5, float,
         """Number of epochs to save checkpoint. """),
        ('--mode', 'train', str,
         """train or eval (:validates from checkpoint)"""),
        ('--cpu_threads', 10, int, """cpu threads per rank"""),
    )
    for option, default, value_type, help_text in value_options:
        cmdline.add_argument(option,
                             default=default,
                             type=value_type,
                             help=help_text)

    # Boolean switches; all of them default to None (i.e. "not given").
    bool_options = (
        ('--fp16', """Train with half-precision."""),
        ('--fp32', """Train with single-precision."""),
        ('--restart', """Restart training from checkpoint."""),
        ('--nvme', """Copy data to burst buffer."""),
    )
    for option, help_text in bool_options:
        add_bool_argument(cmdline, option, default=None, help=help_text)

    FLAGS, unknown_args = cmdline.parse_known_args()
    if unknown_args:
        # Only rank 0 reports the offending arguments; every rank raises.
        if hvd.rank() == 0:
            for bad_arg in unknown_args:
                print('<ERROR> Unknown command line arg: %s' % bad_arg)
        raise ValueError('Invalid command line arg(s)')

    # Load input flags
    flags_path = FLAGS.input_flags
    if flags_path is None:
        flags_path = 'input_flags.json'
    params = io_utils.get_dict_from_json(flags_path)
    params['input_flags'] = flags_path

    params['start_time'] = time.time()
    params['cmdline'] = 'unknown'

    # Options copied verbatim into params under their own name when given.
    same_name_overrides = ('batch_size', 'log_frequency', 'max_steps',
                           'network_config', 'data_dir', 'checkpt_dir',
                           'hyper_params')
    for name in same_name_overrides:
        value = getattr(FLAGS, name)
        if value is not None:
            params[name] = value

    # NOTE(review): the switches below only test "is not None", so any
    # explicitly supplied value (whatever its truthiness) triggers the
    # override -- confirm against add_bool_argument.
    if FLAGS.fp16 is not None:
        params['IMAGE_FP16'] = True
    if FLAGS.fp32 is not None:
        params['IMAGE_FP16'] = False
    if FLAGS.restart is not None:
        params['restart'] = True
    if FLAGS.save_epochs is not None:
        params['epochs_per_saving'] = FLAGS.save_epochs
    # Any other --mode value leaves params['mode'] as loaded from the JSON.
    if FLAGS.mode in ('train', 'eval'):
        params['mode'] = FLAGS.mode
    if FLAGS.cpu_threads is not None:
        params['IO_threads'] = FLAGS.cpu_threads

    if FLAGS.nvme is not None:
        params = nvme_staging(params['data_dir'], params, mode=params['mode'])

    benchmark_io(params, filetype='lmdb', num_batches=FLAGS.num_batches)
예제 #46
0
import json
import os

import horovod.tensorflow as hvd

if __name__ == '__main__':
    # Every Horovod process writes a small JSON file named after its rank,
    # containing that rank and the total number of processes.
    hvd.init()

    rank = hvd.rank()
    output_path = os.path.join('/opt/ml/model/rank-%s' % rank)

    with open(output_path, 'w+') as f:
        basic_info = {'rank': rank, 'size': hvd.size()}

        json.dump(basic_info, f)
        print('Saved file "rank-%s": %s' % (rank, basic_info))

def main(argv=None):
    # Horovod data-parallel ResNet50 training on TFRecord input tensors,
    # followed by an evaluation pass that only rank 0 performs.
    #
    # argv: optional extra command-line arguments; when given they are
    #       appended to sys.argv before the CLI parser runs.
    #
    # NOTE: this function deliberately has no docstring of its own because
    # main.__doc__ is overwritten with the module docstring below and then
    # passed to _parser.

    # Initialize Horovod.
    hvd.init()

    # Pin GPU to be used to process local rank (one GPU per process)
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    session_config.gpu_options.visible_device_list = str(hvd.local_rank())
    KB.set_session(tf.Session(config=session_config))

    ngpus = hvd.size()

    main.__doc__ = __doc__
    if argv is not None:
        sys.argv.extend(argv)
    desc = main.__doc__
    # CLI parser
    args = _parser(desc)

    num_devices_tfrecord = 1
    # Image dimensions. Gets resized if not match.
    img_height, img_width = 224, 224
    distort_color = args.distort_color
    data_dir = args.datadir
    batch_size = args.batch_size
    epochs = args.epochs
    imgs_per_epoch = args.imgs_per_epoch

    # Fit the model using data from the TFRecord data tensors.
    make_minibatches = RecordInputImagenetPreprocessor.device_minibatches
    train_images, train_labels, nrecords = make_minibatches(
        num_devices_tfrecord, data_dir, batch_size,
        img_height, img_width, distort_color, val=False)
    # A single device is requested, so only the first entry is used.
    train_images = train_images[0]

    # Keras' categorical cross-entropy expects one-hot targets.
    nclasses = 1000
    train_onehot = tf.one_hot(train_labels[0], nclasses)

    # A non-positive imgs_per_epoch means "use every record".
    nimgs_to_use = imgs_per_epoch if imgs_per_epoch > 0 else nrecords
    # Each epoch's steps are split evenly across the Horovod workers.
    steps_per_epoch = nimgs_to_use // batch_size // hvd.size()

    images = Input(tensor=train_images)
    model = ResNet50(input_tensor=images, weights=None)

    is_root = hvd.rank() == 0
    if is_root:
        model.summary()

        print('Num images: {}'.format(nrecords))

        if nimgs_to_use < nrecords:
            print('Using {} images per epoch'.format(nimgs_to_use))

    # Wrap the Keras optimizer in the Horovod distributed optimizer.
    opt = hvd_keras.DistributedOptimizer(KO.Adam())

    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  target_tensors=[train_onehot])

    # Broadcast variables from rank 0 to all other processes.
    KB.get_session().run(hvd.broadcast_global_variables(0))

    # Timing callbacks are only attached on rank 0.
    callbacks = (
        [BatchTiming(), SamplesPerSec(ngpus * batch_size)]
        if is_root else [])

    # The inputs are tensors already wired into the model, so fit() only
    # needs the step and epoch counts.
    start_time = time.time()
    model.fit(
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        callbacks=callbacks,
        verbose=1)
    elapsed_time = time.time() - start_time

    if is_root:
        print('[{}] finished in {} s'
              .format('TRAINING', round(elapsed_time, 3)))

        val_images, val_labels, nrecords_val = make_minibatches(
            num_devices_tfrecord, data_dir, batch_size,
            img_height, img_width, distort_color, val=True)
        val_images = val_images[0]
        val_onehot = tf.one_hot(val_labels[0], nclasses)

        steps_per_epoch_val = nrecords_val // batch_size

        # Re-point the same model at the validation tensors and recompile it
        # with an accuracy metric.
        images_val = Input(tensor=val_images)
        model.layers[0] = KL.InputLayer(input_tensor=images_val)
        model.compile(
            loss='categorical_crossentropy',
            optimizer=opt,
            metrics=['accuracy'],
            target_tensors=[val_onehot])
        loss = model.evaluate(x=None, y=None, steps=steps_per_epoch_val)

        print('\nNum images evaluated, steps: {}, {}'.
              format(nrecords_val, steps_per_epoch_val))
        print('\nTest loss, acc: {}'.format(loss))

    KB.clear_session()  # do this for Horovod
예제 #48
0
파일: train.py 프로젝트: sAZmed/tensorpack
    # Remaining command-line options of the training script.
    parser.add_argument('--load', help='Load a model to start training from. It overwrites BACKBONE.WEIGHTS')
    parser.add_argument('--logdir', help='Log directory. Will remove the old one if already exists.',
                        default='train_log/maskrcnn')
    parser.add_argument('--config', help="A list of KEY=VALUE to overwrite those defined in config.py", nargs='+')

    args = parser.parse_args()
    # Apply the KEY=VALUE overrides given on the command line, if any.
    if args.config:
        cfg.update_args(args.config)
    register_coco(cfg.DATA.BASEDIR)  # add COCO datasets to the registry
    register_balloon(cfg.DATA.BASEDIR)  # add the demo balloon datasets to the registry

    # Setup logging ...
    # Horovod is only initialised when it is the configured trainer; in that
    # case only rank 0 sets up the log directory.
    is_horovod = cfg.TRAINER == 'horovod'
    if is_horovod:
        hvd.init()
    if not is_horovod or hvd.rank() == 0:
        # NOTE(review): 'd' presumably selects the "delete existing directory"
        # action described in the --logdir help text -- confirm.
        logger.set_logger_dir(args.logdir, 'd')
    logger.info("Environment Information:\n" + collect_env_info())

    finalize_configs(is_training=True)

    # Create model
    MODEL = ResNetFPNModel() if cfg.MODE_FPN else ResNetC4Model()

    # Compute the training schedule from the number of GPUs ...
    stepnum = cfg.TRAIN.STEPS_PER_EPOCH
    # warmup is step based, lr is epoch based
    # The factor is 8 / NUM_GPUS capped at 1, so the initial warmup learning
    # rate only shrinks when more than 8 GPUs are used.
    init_lr = cfg.TRAIN.WARMUP_INIT_LR * min(8. / cfg.TRAIN.NUM_GPUS, 1.)
    warmup_schedule = [(0, init_lr), (cfg.TRAIN.WARMUP, cfg.TRAIN.BASE_LR)]
    # Convert the warmup length from steps to (fractional) epochs, then round
    # to the nearest whole epoch for the epoch-based schedule.
    warmup_end_epoch = cfg.TRAIN.WARMUP * 1. / stepnum
    lr_schedule = [(int(warmup_end_epoch + 0.5), cfg.TRAIN.BASE_LR)]
예제 #49
0
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for Estimator.

        Builds the start/end span-prediction graph and returns the
        `EstimatorSpec` for the requested mode. Model and optimisation
        settings (`bert_config`, `init_checkpoint`, `learning_rate`,
        `num_train_steps`, `num_warmup_steps`, `use_one_hot_embeddings`,
        `use_fp16`, `hvd`) are read from the enclosing scope.

        Args:
            features: Dict of input tensors. Must contain "unique_ids",
                "input_ids", "input_mask" and "segment_ids"; in TRAIN mode
                also "start_positions" and "end_positions".
            labels: Unused.
            mode: A `tf.estimator.ModeKeys` value; only TRAIN and PREDICT
                are supported.
            params: Unused.

        Returns:
            A `tf.estimator.EstimatorSpec` carrying the loss and train op
            (TRAIN) or the prediction tensors (PREDICT).

        Raises:
            ValueError: If `mode` is neither TRAIN nor PREDICT (and the
                TF-TRT inference path is not taken).
        """
        if FLAGS.verbose_logging:
            tf.logging.info("*** Features ***")
            for name in sorted(features.keys()):
                tf.logging.info("  name = %s, shape = %s", name,
                                features[name].shape)

        unique_ids = features["unique_ids"]
        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        if not is_training and FLAGS.use_trt:
            # Inference through a frozen TF-TRT graph: import it with the
            # input tensors mapped in and take the two logits outputs.
            trt_graph = get_frozen_tftrt_model(bert_config, input_ids.shape,
                                               use_one_hot_embeddings,
                                               init_checkpoint)
            (start_logits, end_logits) = tf.import_graph_def(
                trt_graph,
                input_map={
                    'input_ids': input_ids,
                    'input_mask': input_mask,
                    'segment_ids': segment_ids
                },
                return_elements=['unstack:0', 'unstack:1'],
                name='')
            predictions = {
                "unique_ids": unique_ids,
                "start_logits": start_logits,
                "end_logits": end_logits,
            }
            # Plain EstimatorSpec, like every other branch of this function.
            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=predictions)

        (start_logits, end_logits) = create_model(
            bert_config=bert_config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            segment_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)

        tvars = tf.trainable_variables()

        initialized_variable_names = {}
        # With Horovod only rank 0 restores from the checkpoint.
        if init_checkpoint and (hvd is None or hvd.rank() == 0):
            (assignment_map, initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(
                 tvars, init_checkpoint)

            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        if FLAGS.verbose_logging:
            tf.logging.info("**** Trainable Variables ****")
            for var in tvars:
                init_string = ""
                if var.name in initialized_variable_names:
                    init_string = ", *INIT_FROM_CKPT*"
                tf.logging.info(" %d name = %s, shape = %s%s",
                                0 if hvd is None else hvd.rank(), var.name,
                                var.shape, init_string)

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            seq_length = modeling.get_shape_list(input_ids)[1]

            def compute_loss(logits, positions):
                """Mean cross-entropy of `logits` against gold `positions`."""
                one_hot_positions = tf.one_hot(positions,
                                               depth=seq_length,
                                               dtype=tf.float32)
                log_probs = tf.nn.log_softmax(logits, axis=-1)
                loss = -tf.reduce_mean(
                    tf.reduce_sum(one_hot_positions * log_probs, axis=-1))
                return loss

            start_positions = features["start_positions"]
            end_positions = features["end_positions"]

            start_loss = compute_loss(start_logits, start_positions)
            end_loss = compute_loss(end_logits, end_positions)

            # The total loss is the average of the start and end losses.
            total_loss = (start_loss + end_loss) / 2.0

            train_op = optimization.create_optimizer(
                total_loss, learning_rate, num_train_steps, num_warmup_steps,
                hvd, False, use_fp16, FLAGS.num_accumulation_steps)

            output_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                     loss=total_loss,
                                                     train_op=train_op)
        elif mode == tf.estimator.ModeKeys.PREDICT:
            predictions = {
                "unique_ids": unique_ids,
                "start_logits": start_logits,
                "end_logits": end_logits,
            }
            output_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                     predictions=predictions)
        else:
            raise ValueError("Only TRAIN and PREDICT modes are supported: %s" %
                             (mode))

        return output_spec
예제 #50
0
def input_fn(filenames='',
             channel='training',
             batch_size=32,
             num_epochs=1,
             perform_shuffle=False):
    """Build the libsvm-format input pipeline for one Horovod worker.

    Args:
        filenames: A file name or list of file names read in file mode
            (``FLAGS.pipe_mode == 0``).
        channel: Channel name handed to ``PipeModeDataset`` in pipe mode.
        batch_size: Number of examples per batch; incomplete final batches
            are dropped.
        num_epochs: The data is repeated this many times when greater than 1.
        perform_shuffle: Shuffle with a 256-element buffer (file mode only).

    Returns:
        In file mode, the ``(features, labels)`` tensors of a one-shot
        iterator, where ``features`` has the keys "feat_ids" and "feat_vals".
        In pipe mode, the batched dataset itself.
    """
    print('Parsing', filenames)

    def decode_libsvm(line):
        """Parse one ``label id:val id:val ...`` line into (features, label)."""
        tokens = tf.string_split([line], ' ')
        label = tf.string_to_number(tokens.values[0], out_type=tf.float32)
        # Split every "id:val" token and arrange the pieces as two columns.
        pairs = tf.string_split(tokens.values[1:], ':')
        id_vals = tf.reshape(pairs.values, pairs.dense_shape)
        raw_ids, raw_vals = tf.split(id_vals, num_or_size_splits=2, axis=1)
        feat_ids = tf.string_to_number(raw_ids, out_type=tf.int32)
        feat_vals = tf.string_to_number(raw_vals, out_type=tf.float32)
        return {"feat_ids": feat_ids, "feat_vals": feat_vals}, label

    print("pipe mode ", FLAGS.pipe_mode)

    if FLAGS.pipe_mode != 0:
        print("-------enter into pipe mode branch!------------")
        dataset = PipeModeDataset(channel, record_format='TextLine')
        number_host = len(FLAGS.hosts)
        # liangaws: with Horovod + pipe mode, when a training instance runs
        # several workers each worker needs its own channel, so the data of
        # every channel should be split in advance. Sharding one channel
        # across instances is only needed when there are several training
        # instances and each of them runs several worker processes.
        if number_host > 1 and hvd.size() > number_host:
            # liangaws: under SageMaker Horovod, current-host turned out to
            # be identical everywhere, so the host index is derived from the
            # rank instead.
            host_index = hvd.rank() // FLAGS.worker_per_host
            dataset = dataset.shard(number_host, host_index)

        if num_epochs > 1:
            dataset = dataset.repeat(num_epochs)

        dataset = dataset.prefetch(500000)
        dataset = dataset.map(decode_libsvm, num_parallel_calls=10)
        return dataset.batch(batch_size, drop_remainder=True)

    # File mode: extract lines from the input files using the Dataset API;
    # either one filename or a list of filenames can be passed.
    # liangaws: this assumes SageMaker uses S3 FullyReplicated, i.e. every
    # channel's data is copied to each training instance, so the data is
    # sharded directly by worker rank.
    dataset = tf.data.TextLineDataset(filenames).shard(hvd.size(), hvd.rank())

    # multi-thread pre-process then prefetch
    dataset = dataset.map(decode_libsvm, num_parallel_calls=10).prefetch(500000)
    if perform_shuffle:
        dataset = dataset.shuffle(buffer_size=256)

    if num_epochs > 1:
        dataset = dataset.repeat(num_epochs)

    # Batch size to use
    dataset = dataset.batch(batch_size, drop_remainder=True)

    features, labels = dataset.make_one_shot_iterator().get_next()
    return features, labels
예제 #51
0
파일: train.py 프로젝트: chinatian/glow
def _print(*args, **kwargs):
    """Forward to ``print`` on Horovod rank 0 only; do nothing elsewhere."""
    if hvd.rank() != 0:
        return
    print(*args, **kwargs)
예제 #52
0
def log_training_step(opts, model, file_writer, x, y, loss, pred, step,
                      metrics, optimizer, steptime, epoch):
    """Print progress and write TensorBoard summaries for one training step.

    Work is done only in the process where both ``hvd.local_rank()`` and
    ``hvd.rank()`` are 0; every other process returns immediately.

    Args:
        opts: options object; only ``steps_per_epoch`` is read here.
        model: model whose ``layers`` are scanned for weights to histogram.
        file_writer: summary writer used as the default writer and flushed.
        x: input batch, logged as an image after scaling by 255.
        y: target batch; reduced with ``argmax`` over its last axis.
        loss: loss value for this step.
        pred: predictions passed to the metrics and logged as an image.
        step: current step; cast to int64 for every summary.
        metrics: 3-tuple ``(loss_metric, miou_metric, auc_metric)``; the
            first entry is not used in this function.
        optimizer: optimizer whose ``_hyper`` entries are logged as scalars.
        steptime: duration of the step, logged as a scalar.
        epoch: current epoch, used only in the printed line.

    Returns:
        None.
    """
    if hvd.local_rank() != 0 or hvd.rank() != 0:
        return

    # Reduce y over its last axis and re-append a trailing axis of size 1
    # so it can be fed to the metrics and logged as a mask image.
    y = tf.argmax(y, axis=-1)[..., None]
    _, miou_metric, auc_metric = metrics

    miou_metric.update_state(y, pred)
    auc_metric.update_state(y, pred)

    # The metric state does not change again below, so read it once.
    miou_value = miou_metric.result().numpy()
    auc_value = auc_metric.result().numpy()

    # Training Prints
    tf.print('\nEpoch:', epoch, 'Step', step, '/', opts.steps_per_epoch,
             ': loss', loss, ': miou', miou_value, ': auc', auc_value, '\n')

    with file_writer.as_default():
        image = tf.cast(255 * x, tf.uint8)
        mask = tf.cast(255 * y, tf.uint8)
        summary_predictions = tf.cast(tf.expand_dims(pred * 255, axis=-1),
                                      tf.uint8)
        step64 = tf.cast(step, tf.int64)

        tf.summary.scalar('Training StepTime', steptime, step=step64)

        image_summaries = (
            ('Train_image', image),
            ('Train_mask', mask),
            ('Train_prediction', summary_predictions),
        )
        for tag, tensor in image_summaries:
            tf.summary.image(tag, tensor, step=step64, max_outputs=2)

        tf.summary.scalar('Training Loss', loss, step=step64)
        tf.summary.scalar('Training mIoU', miou_value, step=step64)
        tf.summary.scalar('Training AUC', auc_value, step=step64)

        # Logging the optimizer's hyperparameters
        for key, hyper_value in optimizer._hyper.items():
            tf.summary.scalar(key, hyper_value.numpy(), step=step64)

        # Histogram the first weight of every layer that has weights;
        # layers with an empty weight list are skipped.
        for layer in model.layers:
            layer_weights = layer.weights
            if not layer_weights:
                continue
            first_weight = layer_weights[0]
            tf.summary.histogram('%s' % first_weight.name,
                                 first_weight,
                                 step=step64)

    file_writer.flush()
    return
예제 #53
0
파일: rnn.py 프로젝트: akshatdewan/returnn
def init_backend_engine():
    """
    Select the backend engine from the global ``config`` and set it up.

    For Theano, the version is logged and ``TheanoUtil.monkey_patches()`` is
    applied. For TensorFlow, the version is logged, the ``TF_DEVICE``
    environment variable (if set) overrides the ``device`` config option,
    Horovod is optionally initialized (``use_horovod``), the TF thread pools
    are set up, the available devices are printed, and distributed TF is
    optionally initialized (``distributed_tf``).

    This function mutates the global ``config`` (``device``,
    ``horovod_reduce_type``, and ``tf_session_opts.gpu_options``).

    :raises NotImplementedError: if neither Theano nor TensorFlow is selected
    """
    BackendEngine.select_engine(config=config)
    if BackendEngine.is_theano_selected():
        print("Theano:", describe_theano_version(), file=log.v3)
        import TheanoUtil
        TheanoUtil.monkey_patches()
    elif BackendEngine.is_tensorflow_selected():
        print("TensorFlow:", describe_tensorflow_version(), file=log.v3)
        # Major version 0 (i.e. TF < 1.0) only produces a warning, not an error.
        if get_tensorflow_version_tuple()[0] == 0:
            print("Warning: TF <1.0 is not supported and likely broken.",
                  file=log.v2)
        # The TF_DEVICE environment variable takes precedence over the
        # "device" config option; the config is overwritten accordingly.
        if os.environ.get("TF_DEVICE"):
            print("Devices: Use %s via TF_DEVICE instead of %s." %
                  (os.environ.get("TF_DEVICE"),
                   config.opt_typed_value("device")),
                  file=log.v4)
            config.set("device", os.environ.get("TF_DEVICE"))
        if config.is_true("use_horovod"):
            import socket
            # noinspection PyPackageRequirements,PyUnresolvedReferences
            import horovod.tensorflow as hvd
            from TFUtil import init_horovod
            init_horovod()  # make sure it is initialized
            # GPU mode is assumed when the device option mentions "gpu" or
            # when CUDA_VISIBLE_DEVICES is set to a non-empty value.
            if "gpu" in config.value("device", "") or os.environ.get(
                    "CUDA_VISIBLE_DEVICES", ""):
                # We assume that we want to use a GPU.
                # Create tf_session_opts["gpu_options"] in the config if it
                # does not exist yet, and get a reference to it.
                gpu_opts = config.typed_dict.setdefault("tf_session_opts",
                                                        {}).setdefault(
                                                            "gpu_options", {})
                # A user-provided visible_device_list would conflict with the
                # per-process pinning below.
                assert "visible_device_list" not in gpu_opts
                # Pin this process to the GPU whose index is its local rank.
                gpu_opts["visible_device_list"] = str(hvd.local_rank())
                print("Horovod: Hostname %s, pid %i, using GPU %s." %
                      (socket.gethostname(), os.getpid(),
                       gpu_opts["visible_device_list"]),
                      file=log.v3)
            else:
                if hvd.rank() == 0:  # Don't spam in all ranks.
                    print("Horovod: Not using GPU.", file=log.v3)
            # Default the reduce type to "grad" and write the default back
            # into the config; otherwise only "grad" and "param" are accepted.
            horovod_reduce_type = config.value("horovod_reduce_type", "")
            if horovod_reduce_type == "":
                horovod_reduce_type = "grad"
                config.set("horovod_reduce_type", horovod_reduce_type)
            else:
                assert horovod_reduce_type in [
                    "grad", "param"
                ], "config option 'horovod_reduce_type' invalid"
            if hvd.rank() == 0:  # Don't spam in all ranks.
                print("Horovod: Reduce type:",
                      horovod_reduce_type,
                      file=log.v3)
        from TFUtil import debug_register_better_repr, setup_tf_thread_pools, print_available_devices
        # Read the session options only now, so that any gpu_options set by
        # the Horovod branch above are included.
        tf_session_opts = config.typed_value("tf_session_opts", {})
        assert isinstance(tf_session_opts, dict)
        # This must be done after the Horovod logic, such that we only touch the devices we are supposed to touch.
        setup_tf_thread_pools(log_file=log.v3, tf_session_opts=tf_session_opts)
        # Print available devices. Also make sure that get_tf_list_local_devices uses the correct TF session opts.
        print_available_devices(tf_session_opts=tf_session_opts, file=log.v2)
        debug_register_better_repr()
        if config.is_true("distributed_tf"):
            import TFDistributed
            TFDistributed.init_distributed_tf(config)
    else:
        # Neither Theano nor TensorFlow was selected.
        raise NotImplementedError
예제 #54
0
def get_model_and_optimizer(opts):
    """Load or build the model and create the optimizer.

    Args:
        opts: options object. Fields read here: ``evaluate``, ``model_dir``,
            ``image_size``, ``horovod``, ``fp16_allreduce`` (Horovod only),
            ``optimizer`` (``'Adam'`` or ``'SGD'``), ``base_lr``,
            ``epsilon`` (Adam), ``momentum`` and ``nesterov`` (SGD).

    Returns:
        Tuple ``(model, opt, compression)``. ``compression`` is a
        ``hvd.Compression`` choice when ``opts.horovod`` is set and ``None``
        otherwise. The optimizer is returned unwrapped: this function does
        not apply ``hvd.DistributedOptimizer``.

    Raises:
        AssertionError: if ``opts.evaluate`` is set without ``opts.model_dir``.
        NotImplementedError: if ``opts.optimizer`` is neither Adam nor SGD.
    """
    if opts.evaluate:
        assert opts.model_dir, "WARNING: Please provide --model_dir when --evaluate"

    if opts.model_dir:
        print(f'Resuming model from {opts.model_dir}...')
        model = tf.keras.models.load_model(opts.model_dir)
    else:
        model = Deeplabv3(input_shape=(opts.image_size, opts.image_size, 3),
                          classes=2,
                          backbone='xception',
                          opts=opts)

    if opts.horovod:
        # Horovod: (optional) compression algorithm.
        compression = (hvd.Compression.fp16
                       if opts.fp16_allreduce else hvd.Compression.none)
        # Horovod: scale the learning rate by the number of workers.
        learning_rate = opts.base_lr * hvd.size()
    else:
        compression = None
        learning_rate = opts.base_lr

    # The optimizer is built once; only the learning rate differs between
    # the Horovod and single-process paths.
    if opts.optimizer == 'Adam':
        opt = tf.optimizers.Adam(learning_rate, epsilon=opts.epsilon)
    elif opts.optimizer == 'SGD':
        opt = tf.optimizers.SGD(learning_rate, opts.momentum, opts.nesterov)
    else:
        raise NotImplementedError(
            'Only SGD and Adam are supported for now')

    # Only query the Horovod rank when Horovod is in use: hvd.rank() fails
    # if hvd.init() was never called, which would break the non-Horovod path.
    is_chief = not opts.horovod or hvd.rank() == 0

    if is_chief:
        print("Compiling model...")

    model.layers[0].build(input_shape=(None, opts.image_size, opts.image_size,
                                       3))

    if is_chief:
        model.summary()

    return model, opt, compression
예제 #55
0
def test(args):
    """Build the vectorised environments for this Horovod worker and run ``learn``.

    The function initialises Horovod, seeds with the worker rank, enters a
    TensorFlow session pinned to the local-rank GPU, creates ``args.nenvs``
    sub-process environments, overwrites a number of hyper-parameters on
    ``args`` with fixed values, and finally calls ``learn(env, args, True)``.

    Note that ``args`` is mutated in place (``args.policy`` is replaced by a
    policy class, and many coefficient fields are overwritten).

    NOTE(review): ``os`` is used below but not imported in this function;
    this assumes a module-level ``import os`` -- confirm.
    """
    import filelock
    # The imports of gym and goexplore_py are serialised across processes
    # through a file lock.
    with filelock.FileLock('/tmp/robotstify.lock'):
        import gym
        import sys  # NOTE(review): not referenced anywhere in this function.
        try:
            import goexplore_py.complex_fetch_env
        except Exception:
            # Best effort: only a message is printed. If the import failed,
            # the 'fetch' branches below will raise NameError on goexplore_py.
            print('Could not import complex_fetch_env, is goexplore_py in PYTHONPATH?')

    import tensorflow as tf
    import horovod.tensorflow as hvd
    hvd.init()
    print('initialized worker %d' % hvd.rank(), flush=True)
    from baselines.common import set_global_seeds
    set_global_seeds(hvd.rank())
    from baselines import bench
    from baselines.common import set_global_seeds
    from atari_reset.wrappers import VecFrameStack, VideoWriter, my_wrapper,\
        EpsGreedyEnv, StickyActionEnv, NoopResetEnv, SubprocVecEnv, PreventSlugEnv, FetchSaveEnv, TanhWrap
    from atari_reset.ppo import learn
    from atari_reset.policies import CnnPolicy, GRUPolicy, FFPolicy

    # Seeded a second time, after the imports above.
    set_global_seeds(hvd.rank())
    ncpu = 2
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    # Pin this process to the GPU whose index is its Horovod local rank.
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    # The session is entered and never exited in this function.
    tf.Session(config=config).__enter__()

    # NOTE(review): max_noops is never read below; NoopResetEnv is passed
    # the literal 30 instead.
    max_noops = 30 if args.noops else 0
    print('SAVE PATH', args.save_path)

    def make_env(rank):
        """Return a zero-argument factory that builds the environment for ``rank``."""
        def env_fn():
            """Create and wrap one environment according to ``args``."""
            if args.game == 'fetch':
                assert args.fetch_target_location is not None, 'For now, we require a target location for fetch'
                # Forward every ``fetch_*`` argument to ComplexFetchEnv with
                # the prefix stripped. ``fetch_type`` is turned into a
                # ``model_file`` name and ``fetch_total_timestep`` is skipped.
                kwargs = {}
                dargs = vars(args)
                for attr in dargs:
                    if attr.startswith('fetch_'):
                        if attr == 'fetch_type':
                            kwargs['model_file'] = f'teleOp_{args.fetch_type}.xml'
                        elif attr != 'fetch_total_timestep':
                            kwargs[attr[len('fetch_'):]] = dargs[attr]

                env = goexplore_py.complex_fetch_env.ComplexFetchEnv(
                    **kwargs
                )
            elif args.game == 'fetch_dumb':
                # NOTE(review): goexplore_py.dumb_fetch_env is not imported in
                # this function; verify it is importable as an attribute here.
                env = goexplore_py.dumb_fetch_env.ComplexFetchEnv()
            else:
                env = gym.make(args.game + 'NoFrameskip-v4')
                if args.seed_env:
                    env.seed(0)
                # if args.unlimited_score:
                #     # This removes the TimeLimit wrapper around the env
                #     env = env.env
                # env = PreventSlugEnv(env)
            # change for long runs
            # env._max_episode_steps *= 1000
            env = bench.Monitor(env, "{}.monitor.json".format(rank), allow_early_resets=True)
            # Video writing is disabled: the leading ``False`` makes this
            # branch unreachable.
            if False and rank%nenvs == 0 and hvd.local_rank()==0:
                os.makedirs(args.save_path + '/vids/' + args.game, exist_ok=True)
                videofile_prefix = args.save_path + '/vids/' + args.game
                env = VideoWriter(env, videofile_prefix)
            if 'fetch' not in args.game:
                if args.noops:
                    os.makedirs(args.save_path, exist_ok=True)
                    env = NoopResetEnv(env, 30, nenvs, args.save_path, num_per_noop=args.num_per_noop, unlimited_score=args.unlimited_score)
                    env = my_wrapper(env, clip_rewards=True, sticky=args.sticky)
                if args.epsgreedy:
                    env = EpsGreedyEnv(env)
            else:
                os.makedirs(f'{args.save_path}', exist_ok=True)
                env = FetchSaveEnv(env, rank=rank, n_ranks=nenvs, save_path=f'{args.save_path}/', demo_path=args.demo)
                env = TanhWrap(env)
            # def print_rec(e):
            #     print(e.__class__.__name__)
            #     if hasattr(e, 'env'):
            #         print_rec(e.env)
            # import time
            # import random
            # time.sleep(random.random() * 10)
            # print('\tSHOWING STUFF')
            # print_rec(env)
            # print('\n\n\n')
            return env
        return env_fn

    # ``nenvs`` is assigned after make_env is defined; env_fn looks it up
    # only when it is called, so the closure sees this value.
    nenvs = args.nenvs
    # Environment ranks are made unique across workers by offsetting with
    # nenvs * hvd.rank().
    env = SubprocVecEnv([make_env(i + nenvs * hvd.rank()) for i in range(nenvs)])
    # Fetch games use a frame stack of 1, all other games a stack of 4.
    env = VecFrameStack(env, 1 if 'fetch' in args.game else 4)

    if 'fetch' in args.game:
        print('Fetch environment, using the feedforward policy.')
        args.policy = FFPolicy
    else:
        # Replace the policy name by the corresponding class; any name other
        # than 'cnn' or 'gru' raises KeyError.
        args.policy = {'cnn': CnnPolicy, 'gru': GRUPolicy}[args.policy]

    # Fixed settings written onto args before calling learn(); these
    # overwrite whatever values the caller passed in for these fields.
    args.sil_pg_weight_by_value = False
    args.sil_vf_relu = False
    args.sil_vf_coef = 0
    args.sil_coef = 0
    args.sil_ent_coef = 0
    args.ent_coef = 0
    args.vf_coef = 0
    args.cliprange = 1
    args.l2_coef = 0
    args.adam_epsilon = 1e-8
    args.gamma = 0.99
    args.lam = 0.10
    args.scale_rewards = 1.0
    args.sil_weight_success_rate = True
    args.norm_adv = 1.0
    args.log_interval = 1
    args.save_interval = 100
    args.subtract_rew_avg = True
    args.clip_rewards = False
    # NOTE(review): the third positional argument presumably switches learn()
    # into a test/evaluation mode -- confirm against atari_reset.ppo.learn.
    learn(env, args, True)
예제 #56
0
            weight_decay=RUNNING_CONFIG.weight_decay,
            learning_rate=RUNNING_CONFIG.learning_rate,
            learning_rate_decay_factor=RUNNING_CONFIG.
            learning_rate_decay_factor,
            learning_rate_decay_steps=RUNNING_CONFIG.learning_rate_decay_steps,
            rmsprop_decay=RUNNING_CONFIG.rmsprop_decay,
            rmsprop_momentum=RUNNING_CONFIG.rmsprop_momentum,
            use_auto_loss_scaling=FLAGS.use_auto_loss_scaling,
            augment_data=RUNNING_CONFIG.augment_data,
            is_benchmark=RUNNING_CONFIG.exec_mode == 'training_benchmark')

    if RUNNING_CONFIG.exec_mode in [
            "train_and_evaluate", 'evaluate', 'inference_benchmark'
    ]:

        if RUNNING_CONFIG.exec_mode == 'inference_benchmark' and hvd_utils.is_using_hvd(
        ):
            raise NotImplementedError(
                "Only single GPU inference is implemented.")

        elif not hvd_utils.is_using_hvd() or hvd.rank() == 0:

            runner.evaluate(
                iter_unit=RUNNING_CONFIG.iter_unit if
                RUNNING_CONFIG.exec_mode != "train_and_evaluate" else "epoch",
                num_iter=RUNNING_CONFIG.num_iter
                if RUNNING_CONFIG.exec_mode != "train_and_evaluate" else 1,
                warmup_steps=RUNNING_CONFIG.warmup_steps,
                batch_size=RUNNING_CONFIG.batch_size,
                is_benchmark=RUNNING_CONFIG.exec_mode == 'inference_benchmark')
        batch_inputs, batch_labels = generate_batch(
            batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()
        _, loss_val = session.run([train_op, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step ', step, ': ', average_loss)
            run.log("Loss", average_loss)
            average_loss = 0
    final_embeddings = normalized_embeddings.eval()

    # Evaluate similarity in the end on worker 0.
    if hvd.rank() == 0:
        sim = similarity.eval()
        for i in xrange(valid_size):
            valid_word = reverse_dictionary[valid_examples[i]]
            top_k = 8  # number of nearest neighbors
            nearest = (-sim[i, :]).argsort()[1:top_k + 1]
            log_str = 'Nearest to %s:' % valid_word
            for k in xrange(top_k):
                close_word = reverse_dictionary[nearest[k]]
                log_str = '%s %s,' % (log_str, close_word)
            print(log_str)
def main(argv=None):
    """Train and evaluate a CIFAR-10 Keras model with Horovod.

    The steps are: parse the CLI, initialise Horovod, pin a GPU to this
    process, load and normalise CIFAR-10, build and compile the model with a
    Horovod-wrapped RMSProp optimizer, broadcast the initial variables from
    rank 0, train with or without data augmentation, and finally evaluate on
    rank 0.

    At runtime this docstring is replaced by the module docstring (first
    statement below), which is then used as the CLI description.

    Args:
        argv: optional list of extra command-line arguments. When given,
            they are appended to ``sys.argv`` before the CLI is parsed.
    """
    main.__doc__ = __doc__
    # Fold caller-supplied arguments into sys.argv (presumably what parser_
    # reads -- confirm). The original one-line conditional assigned the
    # result of list.extend(), which is always None, back to ``argv``.
    if argv is not None:
        sys.argv.extend(argv)
    desc = main.__doc__  # .format(os.path.basename(__file__))
    # CLI parser
    args = parser_(desc)

    # Initialize Horovod.
    hvd.init()

    # Device-placement logging and soft placement are switched on together,
    # either by the module-level _DEVPROF flag or by --logdevp (debugging).
    debug_placement = bool(_DEVPROF or args.logdevp)

    # Several ranks may share one GPU: map the local rank to a GPU index.
    nranks_per_gpu = args.nranks_per_gpu
    local_rank = hvd.local_rank()
    gpu_local_rank = local_rank // nranks_per_gpu
    print('local_rank, GPU_LOCAL_RANK: {}, {}'.format(
        local_rank, gpu_local_rank))

    # Pin GPU to be used to process local rank
    config = tf.ConfigProto(log_device_placement=debug_placement,
                            allow_soft_placement=debug_placement)
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(gpu_local_rank)
    KB.set_session(tf.Session(config=config))

    hvdsize = hvd.size()

    # Optional checkpoint path; None when the option is absent.
    checkpt = getattr(args, 'checkpt', None)

    batch_size = args.batch_size
    num_classes = 10
    epochs = args.epochs
    data_augmentation = args.aug

    datadir = getattr(args, 'datadir', None)

    # The data, shuffled and split between train and test sets:
    if datadir is not None:
        (x_train, y_train), (x_test, y_test) = cifar10_load_data(datadir)
    else:
        (x_train, y_train), (x_test, y_test) = cifar10.load_data()
    train_samples = x_train.shape[0]
    test_samples = x_test.shape[0]
    # Each worker runs 1 / hvdsize of the batches of an epoch.
    steps_per_epoch = train_samples // batch_size // hvdsize
    test_batches = test_samples // batch_size
    print(train_samples, 'train samples')
    print(test_samples, 'test samples')

    # Scale the image data to [0, 1].
    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255

    # Convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes).squeeze()
    y_test = to_categorical(y_test, num_classes).squeeze()

    # Timing callbacks are attached on rank 0 only.
    callbacks = []
    if hvd.rank() == 0:
        callbacks += [BatchTiming(), SamplesPerSec(batch_size * hvdsize)]

    print(x_train.shape, 'train shape')
    model = make_model(x_train.shape, num_classes, checkpt)

    # Scale the base learning rate by the number of workers.
    lr = 0.0001 * hvdsize
    opt = tf.train.RMSPropOptimizer(lr)
    # Add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)  # , use_locking=True)
    opt = TFOptimizer(opt)  # Required for tf.train based optimizers

    # NOTE: the Keras session has to be fetched after the optimizer has been
    # created (see the broadcast below).

    # Let's train the model using RMSprop
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    if hvd.rank() == 0:
        model.summary()

    # Start every worker from rank 0's variable values.
    KB.get_session().run(hvd.broadcast_global_variables(0))
    if not data_augmentation:
        print('Not using data augmentation.')

        train_gen = ImageDataGenerator()
        test_gen = ImageDataGenerator()
        # Train the model. The training will randomly sample 1 / N batches of
        # training data and 3 / N batches of validation data on every worker,
        # where N is the number of workers. Over-sampling of validation data
        # helps to increase probability that every validation example will be
        # evaluated.
        start_time = time.time()
        model.fit_generator(
            train_gen.flow(x_train, y_train, batch_size=batch_size),
            steps_per_epoch=steps_per_epoch,
            callbacks=callbacks,
            epochs=epochs,
            verbose=hvd.rank() == 0,
            validation_data=test_gen.flow(x_test, y_test,
                                          batch_size=batch_size),
            validation_steps=3 * test_batches // hvdsize)

    else:
        print('Using real-time data augmentation.')
        # This will do preprocessing and realtime data augmentation:
        datagen = ImageDataGenerator(
            featurewise_center=False,  # set input mean to 0 over the dataset
            samplewise_center=False,  # set each sample mean to 0
            # divide inputs by std of the dataset
            featurewise_std_normalization=False,
            samplewise_std_normalization=False,  # divide each input by its std
            zca_whitening=False,  # apply ZCA whitening
            # randomly rotate images in the range (degrees, 0 to 180)
            rotation_range=0,
            # randomly shift images horizontally (fraction of total width)
            width_shift_range=0.1,
            # randomly shift images vertically (fraction of total height)
            height_shift_range=0.1,
            horizontal_flip=True,  # randomly flip images
            vertical_flip=False)  # randomly flip images

        # Compute quantities required for feature-wise normalization
        # (std, mean, and principal components if ZCA whitening is applied).
        datagen.fit(x_train)

        start_time = time.time()
        # Fit the model on the batches generated by datagen.flow().
        model.fit_generator(
            datagen.flow(x_train, y_train, batch_size=batch_size),
            steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            validation_data=(x_test, y_test),
            verbose=hvd.rank() == 0,
            callbacks=callbacks)

    # Timing report and final evaluation happen on rank 0 only.
    if hvd.rank() == 0:
        elapsed_time = time.time() - start_time
        print('[{}] finished in {} s'
              .format('TRAINING', round(elapsed_time, 3)))

        metrics = model.evaluate(x=x_test, y=y_test, batch_size=batch_size)
        print('\nCIFAR VALIDATION LOSS, ACC: {}, {}'.format(*metrics))

    KB.clear_session()
예제 #59
0
def save_adv(image_orig, image_adv, label, target_label, logits, data_obj):
    """Write one batch of original and adversarial images to a TFRecord file.

    The file is named ``rank-<hvd rank>-<step>`` inside ``data_obj.save_dir``
    and ``data_obj.self.step`` is incremented once per call. Each example
    stores the encoded original image, the adversarial image as raw float
    bytes and as floor/ceil/round encodings, four distance norms between the
    rounded adversarial and the original image, and the labels and logits.

    Args:
        image_orig: batch of original images; transposed with ``[0, 2, 3, 1]``
            (presumably NCHW -> NHWC -- confirm against the caller).
        image_adv: batch of adversarial images, same layout as ``image_orig``.
        label: original labels; ``len(label)`` is the number of examples.
        target_label: adversarial target labels.
        logits: per-example model outputs, stored under ``label/pred``.
        data_obj: holder providing ``trainer.sess``, ``self.step``,
            ``save_dir``, ``op_img_compressed``, ``op_img_raw_data`` and
            ``image_size``.

    Returns:
        The five array arguments, unchanged, as a tuple.
    """
    session = data_obj.trainer.sess
    output_filename = 'rank-%.5d-%.5d' % (hvd.rank(), data_obj.self.step)
    output_file = os.path.join(data_obj.save_dir, output_filename)

    # The context manager closes the record file even if encoding or
    # serialisation raises part-way through the batch.
    with tf.python_io.TFRecordWriter(output_file) as writer:
        data_obj.self.step += 1
        count = len(label)

        # Move axis 1 to the end and rescale with the module-level
        # IMAGE_SCALE (presumably mapping [-1, 1] to [0, 255] -- confirm).
        out_image_orig = (np.transpose(image_orig, [0, 2, 3, 1]) +
                          1.0) / IMAGE_SCALE
        out_image_adv = (np.transpose(image_adv, [0, 2, 3, 1]) +
                         1.0) / IMAGE_SCALE
        out_image_orig = np.clip(out_image_orig, 0, 255).round()
        out_image_adv = np.clip(out_image_adv, 0, 255)

        out_image_adv_float = out_image_adv
        out_image_adv_floor = np.floor(out_image_adv)
        out_image_adv_ceil = np.ceil(out_image_adv)
        out_image_adv_round = np.round(out_image_adv)

        # Per-example distances between the rounded adversarial image and
        # the original, computed on values divided by 255.
        diff_data = (out_image_adv_round - out_image_orig) / 255
        diff_data = diff_data.reshape([count, -1])
        dist_l0 = np.linalg.norm(diff_data, 0, axis=1)
        dist_l1 = np.linalg.norm(diff_data, 1, axis=1)
        dist_l2 = np.linalg.norm(diff_data, 2, axis=1)
        dist_l_inf = np.linalg.norm(diff_data, np.inf, axis=1)

        # convert image to uint8 type
        out_image_adv_floor = out_image_adv_floor.astype(np.uint8)
        out_image_adv_ceil = out_image_adv_ceil.astype(np.uint8)
        out_image_adv_round = out_image_adv_round.astype(np.uint8)

        _img_compressed = data_obj.op_img_compressed
        _img_raw_data = data_obj.op_img_raw_data
        _image_size = data_obj.image_size

        def _encoded(image):
            """Run the image-compression op on one image and wrap the bytes."""
            return _bytes_feature(
                session.run(_img_compressed,
                            feed_dict={_img_raw_data: image}))

        for i in range(count):
            new_feature_map = {
                "image/orig": _encoded(out_image_orig[i]),
                "image/float":
                _bytes_feature(out_image_adv_float[i].tobytes()),
                "image/floor": _encoded(out_image_adv_floor[i]),
                "image/ceil": _encoded(out_image_adv_ceil[i]),
                "image/round": _encoded(out_image_adv_round[i]),
                "image/shape":
                _int64_feature([_image_size, _image_size, 3]),
                "diff/l0": _float_feature(dist_l0[i]),
                "diff/l1": _float_feature(dist_l1[i]),
                "diff/l2": _float_feature(dist_l2[i]),
                "diff/l_inf": _float_feature(dist_l_inf[i]),
                "label/adv": _int64_feature(target_label[i]),
                "label/orig": _int64_feature(label[i]),
                "label/pred": _float_feature(logits[i])
            }

            example = tf.train.Example(features=tf.train.Features(
                feature=new_feature_map))
            writer.write(example.SerializeToString())

    return image_orig, image_adv, label, target_label, logits
예제 #60
0
파일: main.py 프로젝트: JoelRuhe/sarUNET
                        type=float,
                        help="percentage of the size of training set, "
                        "eg: 0.9 (90%)")
    args = parser.parse_args()

    print('------------------ RUN CONFIRURATION --------------------\n')
    print('KEY\t\t\tVALUE')
    for arg in vars(args):
        print(f'{arg:<20}\t{getattr(args, arg):<40}')
    print('---------------------------------------------------------\n')

    assert float(np.log2(args.image_size)) == int(np.log2(args.image_size))

    if args.horovod:
        hvd.init()
        np.random.seed(args.seed + hvd.rank())
        tf.random.set_random_seed(args.seed + hvd.rank())
        random.seed(args.seed + hvd.rank())

        print(f"Rank {hvd.rank()}:{hvd.local_rank()} reporting!")

    else:
        np.random.seed(args.seed)
        tf.random.set_random_seed(args.seed)
        random.seed(args.seed)

    gopts = tf.GraphOptions(place_pruned_graph=True)
    config = tf.ConfigProto(graph_options=gopts, allow_soft_placement=True)

    if args.gpu:
        config.gpu_options.allow_growth = True