Example #1
    def begin(self):
        self._sync_op = BroadcastGlobalVariablesOp()

        self._step = 0
        self._step_place = tf.placeholder(dtype=tf.int32, shape=())
        self._sync_step_op = all_reduce(self._step_place, op='max')
        self._resize_op, self._new_size_op = self._build_resize_op(
            self._schedule, self._step_place)
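
A minimal sketch of how a hook like this typically consumes the ops it built in begin(); the tf.train.SessionRunHook base class and the after_create_session call site are assumptions, not shown in the snippet:

import tensorflow as tf

class ElasticHook(tf.train.SessionRunHook):
    # begin() as in Example #1 ...

    def after_create_session(self, session, coord):
        # every worker adopts the largest step any peer has reached,
        # then aligns its variables with the broadcast state
        self._step = session.run(
            self._sync_step_op, feed_dict={self._step_place: self._step})
        session.run(self._sync_op)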
Example #2
    def begin(self):
        self._step = 0
        self._trained_samples = 0
        self._trained_samples_place = tf.placeholder(dtype=tf.int32, shape=())
        self._sync_offset_op = all_reduce(self._trained_samples_place,
                                          op='max')

        self._sync_state_op = BroadcastGlobalVariablesOp()
        self._resize_op = resize_cluster_from_url()
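
As in Example #1, a hedged sketch of the counterpart method that would consume these ops; the hook point is an assumption based on the begin() above:

    def after_create_session(self, session, coord):
        # a worker that joins mid-training adopts the largest sample
        # offset any peer has reached instead of restarting its epoch,
        # then aligns its variables with the broadcast state
        self._trained_samples = session.run(
            self._sync_offset_op,
            feed_dict={self._trained_samples_place: self._trained_samples})
        session.run(self._sync_state_op)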
Example #3
    def begin(self):
        self._kungfu_step = tf.Variable(0, trainable=False, dtype=tf.int64)
        self._advance = tf.assign_add(self._kungfu_step, 1)
        self._sync_op = BroadcastGlobalVariablesOp()
        ckpt = _get_init_step()
        self._init_kungfu_step = tf.assign(self._kungfu_step, int(ckpt))
        self._resize_op = self._build_resize_op(self._schedule, int(ckpt))
        self._reset_global_step = tf.assign(tf.train.get_global_step(),
                                            int(ckpt))
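
_get_init_step is not shown on this page; a hypothetical stand-in consistent with how it is used above (the environment variable name is invented for illustration):

import os

def _get_init_step():
    # hypothetical: read the step to resume from after a restart
    return os.getenv('INIT_STEP', '0')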
Example #4
def test_sync_sgd():
    x = tf.Variable(tf.ones([], tf.float32))
    y = x * x
    optimizer = tf.train.GradientDescentOptimizer(0.1)
    optimizer = SynchronousSGDOptimizer(optimizer)
    train_op = optimizer.minimize(y)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(BroadcastGlobalVariablesOp())
        for _ in range(2):
            sess.run(train_op)
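
The expected value after these two steps can be checked in closed form: each step applies x <- x - 0.1 * d(x*x)/dx = 0.8 * x, and since every worker holds the same x here, the averaged gradient equals the local one. A self-contained sketch with the assertion added:

import numpy as np
import tensorflow as tf
from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer

def test_sync_sgd_values():
    x = tf.Variable(tf.ones([], tf.float32))
    y = x * x
    optimizer = SynchronousSGDOptimizer(tf.train.GradientDescentOptimizer(0.1))
    train_op = optimizer.minimize(y)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(BroadcastGlobalVariablesOp())
        for _ in range(2):
            sess.run(train_op)
        assert np.isclose(sess.run(x), 0.64)  # 1.0 -> 0.8 -> 0.64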
Example #5
def train_mnist(sess,
                x,
                y_,
                train_op,
                test_op,
                optimizer,
                dataset,
                n_epochs=1,
                batch_size=5000):

    log_period = 100

    # get the cluster size and this node's rank
    # (assumes: from kungfu import current_cluster_size, current_rank)
    n_shards = current_cluster_size()
    shard_id = current_rank()

    # calculate number of datapoints per node
    training_set_size = dataset['training_set']['x'].shape[0]
    shard_size = training_set_size // n_shards
    step_per_epoch = shard_size // batch_size
    n_steps = step_per_epoch * n_epochs
    print('step_per_epoch: %d, %d steps in total' % (step_per_epoch, n_steps))

    # KungFu: Each replica is responsible for a data shard.
    offset = batch_size * shard_id

    sess.run(tf.global_variables_initializer())

    # KungFu: broadcast the global variable
    from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
    sess.run(BroadcastGlobalVariablesOp())

    print('training')
    # train the model with all batches allocated to the node
    for step in range(n_steps):
        xs = dataset['training_set']['x'][offset:offset + batch_size]
        y_s = dataset['training_set']['y'][offset:offset + batch_size]
        offset = (offset + batch_size * n_shards) % training_set_size
        sess.run(train_op, {
            x: xs,
            y_: y_s,
        })
        # periodically log training-batch and validation accuracy
        if step % log_period == 0:
            training_acc_dataset = dict()
            training_acc_dataset['x'] = xs
            training_acc_dataset['y'] = y_s
            result = test_mnist(sess, x, y_, test_op, training_acc_dataset)
            print('training accuracy: %f' % result)
            result = test_mnist(sess, x, y_, test_op,
                                dataset['validation_set'])
            print('validation accuracy: %f' % result)
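
test_mnist is not part of this snippet; a hypothetical helper consistent with the calls above, assuming test_op computes accuracy for a feed:

def test_mnist(sess, x, y_, test_op, dataset):
    # evaluate the accuracy op over the whole split in a single feed
    return sess.run(test_op, {x: dataset['x'], y_: dataset['y']})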
Example #6
def test_pair_averaging():
    x = tf.Variable(tf.ones([], tf.float32))
    y = x * x
    optimizer = tf.train.GradientDescentOptimizer(0.1)
    optimizer = PairAveragingOptimizer(optimizer)
    train_op = optimizer.minimize(y)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(BroadcastGlobalVariablesOp())
        for _ in range(2):
            sess.run(train_op)
        # FIXME: check values
    run_barrier()
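
Both tests above only exercise the distributed code paths when several workers run in parallel; KungFu ships the kungfu-run launcher for that. A typical invocation (the script name is assumed):

kungfu-run -np 4 python3 test_optimizers.py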
Example #7
    # Results
    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)
    log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
    log_final_result(img_sec_mean, img_sec_conf)


loss = loss_function()
train_opt = opt.minimize(loss)

if tf.executing_eagerly():
    with tf.device(device):
        run(lambda: opt.minimize(loss_function,
                                 var_list=model.trainable_variables))
else:
    init = tf.global_variables_initializer()
    bcast_op = None
    if args.kf_optimizer:
        from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
        bcast_op = BroadcastGlobalVariablesOp()
    with tf.Session(config=config) as session:
        from kungfu._utils import measure
        duration, _ = measure(lambda: session.run(init))
        log('init took %.3fs' % (duration))
        if bcast_op:
            duration, _ = measure(lambda: session.run(bcast_op))
            log('bcast_op took %.3fs' % (duration))
        run(lambda: session.run(train_opt))
        if barrier_op is not None:  # barrier_op is defined in the full script
            session.run(barrier_op)
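
kungfu._utils.measure is used above to time the init and broadcast ops; judging by its call sites, it behaves roughly like this sketch:

import time

def measure(f):
    # time a thunk and return (duration_in_seconds, result)
    t0 = time.time()
    result = f()
    return time.time() - t0, result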
Example #8
    log('Running benchmark...')
    img_secs = []
    for x in range(args.num_iters):
        time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / time
        log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device))
        img_secs.append(img_sec)

    # Results
    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)
    log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))


loss = loss_function()
train_opt = opt.minimize(loss)

if tf.executing_eagerly():
    with tf.device(device):
        run(lambda: opt.minimize(loss_function,
                                 var_list=model.trainable_variables))
else:
    init = tf.global_variables_initializer()
    with tf.Session(config=config) as session:
        session.run(init)
        if args.kf_optimizer:
            from kungfu.tensorflow.initializer import \
                BroadcastGlobalVariablesOp
            session.run(BroadcastGlobalVariablesOp())
        run(lambda: session.run(train_opt))
Example #9
    def begin(self):
        global_step = tf.train.get_or_create_global_step()
        new_global_step = all_reduce(global_step, op='max')
        self._sync_step_op = tf.assign(global_step, new_global_step)
        from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
        self._sync_state_op = BroadcastGlobalVariablesOp()
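
A hedged sketch of the matching call site, assuming this begin() belongs to a tf.train.SessionRunHook; the step sync runs before the state broadcast so all workers resume from the same point:

    def after_create_session(self, session, coord):
        session.run(self._sync_step_op)   # adopt the max global step
        session.run(self._sync_state_op)  # then broadcast all variables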
Example #10
    def before_train(self):
        self._size_place = tf.placeholder(dtype=tf.uint32, shape=[])
        self._resize_op = resize(self._size_place)
        self._sync_op = BroadcastGlobalVariablesOp()
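
A sketch of how these ops might be driven later in training; the method name and call protocol are assumptions for illustration:

    def _scale_to(self, sess, new_size):
        # ask the KungFu runtime to resize the cluster, then re-broadcast
        # variables so newly joined workers start from identical state
        sess.run(self._resize_op, feed_dict={self._size_place: new_size})
        sess.run(self._sync_op)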
Example #11
def parallel_train(training_dataset, kungfu_option):
    from kungfu import current_cluster_size, current_rank
    from kungfu.tensorflow.optimizers import (
        PairAveragingOptimizer, SynchronousAveragingOptimizer,
        SynchronousSGDOptimizer)

    ds = training_dataset.shuffle(buffer_size=4096)
    ds = ds.shard(num_shards=current_cluster_size(), index=current_rank())
    ds = ds.repeat(n_epoch)
    ds = ds.map(_map_fn, num_parallel_calls=4)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(buffer_size=1)

    iterator = ds.make_one_shot_iterator()
    one_element = iterator.get_next()
    net, total_loss, log_tensors = make_model(*one_element,
                                              is_train=True,
                                              reuse=False)
    x_ = net.img  # net input
    last_conf = net.last_conf  # net output
    last_paf = net.last_paf  # net output
    confs_ = net.confs  # GT
    pafs_ = net.pafs  # GT
    mask = net.m1  # mask1, GT
    # net.m2 = m2                 # mask2, GT
    stage_losses = net.stage_losses
    l2_loss = net.l2_loss

    global_step = tf.Variable(1, trainable=False)
    # scaled_lr = lr_init * current_cluster_size()  # Horovod: scale the learning rate linearly
    scaled_lr = lr_init  # Linear scaling rule is not working in openpose training.
    with tf.variable_scope('learning_rate'):
        lr_v = tf.Variable(scaled_lr, trainable=False)

    opt = tf.train.MomentumOptimizer(lr_v, 0.9)

    # KungFu
    if kungfu_option == 'sync-sgd':
        opt = SynchronousSGDOptimizer(opt)
    elif kungfu_option == 'async-sgd':
        opt = PairAveragingOptimizer(opt)
    elif kungfu_option == 'sma':
        opt = SynchronousAveragingOptimizer(opt)
    else:
        raise RuntimeError('Unknown distributed training optimizer.')

    train_op = opt.minimize(total_loss, global_step=global_step)
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = True

    # Add variable initializer.
    init = tf.global_variables_initializer()

    # KungFu
    from kungfu.tensorflow.initializer import BroadcastGlobalVariablesOp
    bcast = BroadcastGlobalVariablesOp()

    global n_step, lr_decay_every_step
    n_step = n_step // current_cluster_size() + 1  # KungFu
    lr_decay_every_step = lr_decay_every_step // current_cluster_size() + 1  # KungFu

    # Start training
    with tf.Session(config=config) as sess:
        init.run()
        bcast.run()  # KungFu
        print('Worker{}: Initialized'.format(current_rank()))
        print(
            'Worker{}: Start - n_step: {} batch_size: {} lr_init: {} lr_decay_every_step: {}'
            .format(current_rank(), n_step, batch_size, lr_init,
                    lr_decay_every_step))

        # restore pre-trained weights
        try:
            # tl.files.load_and_assign_npz(sess, os.path.join(model_path, 'pose.npz'), net)
            tl.files.load_and_assign_npz_dict(sess=sess,
                                              name=os.path.join(
                                                  model_path, 'pose.npz'))
        except Exception:
            print("no pre-trained model")

        # train until the end
        while True:
            step = sess.run(global_step)
            if step == n_step:
                break

            tic = time.time()
            if step != 0 and (step % lr_decay_every_step == 0):
                new_lr_decay = lr_decay_factor**(step // lr_decay_every_step)
                sess.run(tf.assign(lr_v, scaled_lr * new_lr_decay))

            [_, _loss, _stage_losses, _l2, conf_result, paf_result] = \
                sess.run([train_op, total_loss, stage_losses, l2_loss, last_conf, last_paf])

            # tstring = time.strftime('%d-%m %H:%M:%S', time.localtime(time.time()))
            lr = sess.run(lr_v)
            print(
                'Worker{}: Total Loss at iteration {} / {} is: {} Learning rate {:10e} l2_loss {:10e} Took: {}s'
                .format(current_rank(), step, n_step, _loss, lr, _l2,
                        time.time() - tic))
            for ix, ll in enumerate(_stage_losses):
                print('Worker{}: Network#{} For Branch {} Loss: {}'.format(
                    current_rank(), ix, ix % 2 + 1, ll))

            # save intermediate results and model
            if current_rank() == 0:  # KungFu
                if (step != 0) and (step % save_interval == 0):
                    # save some results
                    [
                        img_out, confs_ground, pafs_ground, conf_result,
                        paf_result, mask_out
                    ] = sess.run(
                        [x_, confs_, pafs_, last_conf, last_paf, mask])
                    draw_results(img_out, confs_ground, conf_result,
                                 pafs_ground, paf_result, mask_out,
                                 'train_%d_' % step)

                    # save model
                    # tl.files.save_npz(
                    #    net.all_params, os.path.join(model_path, 'pose' + str(step) + '.npz'), sess=sess)
                    # tl.files.save_npz(net.all_params, os.path.join(model_path, 'pose.npz'), sess=sess)
                    tl.files.save_npz_dict(net.all_params,
                                           os.path.join(
                                               model_path,
                                               'pose' + str(step) + '.npz'),
                                           sess=sess)
                    tl.files.save_npz_dict(net.all_params,
                                           os.path.join(
                                               model_path, 'pose.npz'),
                                           sess=sess)
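
Like the other examples, this trainer is started once per worker under the kungfu-run launcher; a typical invocation (the script name and worker count are assumed, and kungfu_option is passed however the surrounding script parses its arguments):

kungfu-run -np 4 python3 train.py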