示例#1
0
    def __init__(self, average=True):
        """
        Args:
            average (bool): whether to average or sum the gradients across processes.
        """
        import byteps.tensorflow as bps
        self.hvd = bps  # BytePS has the same interface as Horovod
        self.hvd.allreduce = bps.push_pull  # https://github.com/bytedance/byteps/issues/8
        # TODO bootstrap env vars
        bps.init()
        self.is_chief = bps.rank() == 0

        self._local_rank = bps.local_rank()
        self._rank = bps.rank()
        self._average = average

        self._compression = None
        self._has_compression = False
        logger.info("[BytePSTrainer] local rank={}".format(self._local_rank))
        SingleCostTrainer.__init__(self)
示例#2
0
    def __init__(self, average=True):
        """
        Args:
            average (bool): whether to average or sum the gradients across processes.
        """
        import byteps.tensorflow as bps
        self.hvd = bps  # BytePS has the same interface as Horovod
        self.hvd.allreduce = bps.push_pull  # https://github.com/bytedance/byteps/issues/8
        assert os.environ.get("DMLC_ROLE", None) == "worker"
        assert "DMLC_WORKER_ID" in os.environ and "DMLC_NUM_WORKER" in os.environ
        bps.init()
        self.is_chief = bps.rank() == 0

        self._local_rank = bps.local_rank()
        self._rank = bps.rank()
        self._average = average

        self._compression = None
        self._has_compression = False
        logger.info("[BytePSTrainer] local rank={}".format(self._local_rank))
        SingleCostTrainer.__init__(self)
示例#3
0
parser.add_argument('--num-warmup-batches', type=int, default=10,
                    help='number of warm-up batches that don\'t count towards benchmark')
parser.add_argument('--num-batches-per-iter', type=int, default=10,
                    help='number of batches per benchmark iteration')
parser.add_argument('--num-iters', type=int, default=10000,
                    help='number of benchmark iterations')

parser.add_argument('--eager', action='store_true', default=False,
                    help='enables eager execution')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')

args = parser.parse_args()
args.cuda = not args.no_cuda

bps.init()

# BytePS: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
if args.cuda:
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(bps.local_rank())
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    config.gpu_options.allow_growth = False
    config.gpu_options.visible_device_list = ''

if args.eager:
    tf.enable_eager_execution(config)

# Set up standard model.
示例#4
0
def main(_):
    # BytePS: initialize BytePS.
    bps.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % bps.rank())

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    # BytePS: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * bps.size())

    # BytePS: add BytePS Distributed Optimizer.
    opt = bps.DistributedOptimizer(opt)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # BytePS: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        bps.BroadcastGlobalVariablesHook(0),

        # BytePS: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=200000 // bps.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # BytePS: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(bps.local_rank())

    # BytePS: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if bps.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train,
                                                     y_train, batch_size=100)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
示例#5
0
def main(_):
    bps.init()
    tf.logging.set_verbosity(tf.logging.INFO)

    bert_config = modeling.BertConfig(256)

    model_fn = model_fn_builder(bert_config=bert_config,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=FLAGS.num_train_steps,
                                num_warmup_steps=FLAGS.num_warmup_steps)

    max_seq_length = FLAGS.max_seq_length
    max_predictions_per_seq = FLAGS.max_predictions_per_seq

    with tf.name_scope("input"):
        input_ids = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_seq_length], dtype=tf.int32)
        input_mask = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_seq_length], dtype=tf.int32)
        segment_ids = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_seq_length], dtype=tf.int32)
        masked_lm_positions = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_predictions_per_seq],
            dtype=tf.int32)
        masked_lm_ids = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_predictions_per_seq],
            dtype=tf.int32)
        masked_lm_weights = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_predictions_per_seq],
            dtype=tf.float32)
        next_sentence_labels = tf.placeholder(
            shape=[FLAGS.train_batch_size, 1], dtype=tf.int32)

    features = {
        "input_ids": input_ids,
        "input_mask": input_mask,
        "segment_ids": segment_ids,
        "masked_lm_positions": masked_lm_positions,
        "masked_lm_ids": masked_lm_ids,
        "masked_lm_weights": masked_lm_weights,
        "next_sentence_labels": next_sentence_labels
    }

    train_op = model_fn(features, None, None, None)

    infer_shape_ops = add_infer_shape_ops()

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        bps.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=205 // bps.size()),
    ]

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(bps.local_rank())

    training_batch_generator = train_input_generator(features)

    with tf.train.MonitoredTrainingSession(hooks=hooks,
                                           config=config) as mon_sess:
        mon_sess = TimelineSession(mon_sess, infer_shape_ops)
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            feed_dict = next(training_batch_generator)
            mon_sess.run([train_op], feed_dict=feed_dict)