def apply_gradients(self, apply_grads_func, grads_and_vars, **kwargs):
    gradients, variables = list(zip(*grads_and_vars))
    # Run the reshape strategy before the gradient exchange.
    if self._reshape_strategy:
        reshape_strategy(1)
    else:
        reshape_strategy(0)
    if self._nccl:
        # FIXME: KungFu currently schedules NCCL operations in the order of
        # the given gradients. This order is sub-optimal compared to the
        # topological sorting order of the dataflow. We get around this issue
        # by fusing all gradients; we still need to figure out how to obtain
        # the optimal topological sorting order from TensorFlow.
        if self._nccl_fusion:
            fused_grad = fuse(gradients)
            summed_fused_gradients = group_nccl_all_reduce([fused_grad])
            summed_gradients = defuse(summed_fused_gradients[0],
                                      [g.shape for g in gradients])
        else:
            summed_gradients = group_nccl_all_reduce(gradients)
    else:
        summed_gradients = group_all_reduce(gradients)
    reduced_grads = map_maybe(lambda g: g / self._num_workers,
                              summed_gradients)
    # Re-zip gradients and variables: grads_and_vars can only be unzipped once.
    reduced_grads_and_vars = zip(reduced_grads, variables)
    return apply_grads_func(reduced_grads_and_vars, **kwargs)
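The fuse/defuse pair above works around NCCL's scheduling order by flattening every gradient into one buffer, reducing it with a single NCCL all-reduce, and splitting the result back per variable. Below is a minimal stand-in sketch of that round trip; the real helpers ship with KungFu, and `fuse_sketch`/`defuse_sketch` are illustrative names only:

import numpy as np
import tensorflow as tf

def fuse_sketch(tensors):
    # Flatten each gradient and concatenate into a single 1-D tensor,
    # so one all-reduce covers the whole set.
    return tf.concat([tf.reshape(t, [-1]) for t in tensors], axis=0)

def defuse_sketch(fused, shapes):
    # Split the reduced buffer back into per-variable gradients.
    sizes = [int(np.prod(s)) for s in shapes]
    return [tf.reshape(f, s) for f, s in zip(tf.split(fused, sizes), shapes)]

# Round trip on two toy gradients:
grads = [tf.ones((2, 3)), tf.ones((4,))]
restored = defuse_sketch(fuse_sketch(grads), [g.shape for g in grads])
assert all(r.shape == g.shape for r, g in zip(restored, grads))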
def benchmark_step(first_batch):
    # Run the reshape strategy before the gradient exchange.
    reshape_strategy(reshape)
    # Gradient calculation and updates.
    with tf.GradientTape() as tape:
        probs = model(data, training=True)
        loss = tf.losses.categorical_crossentropy(target, probs)
    gradients = tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(gradients, model.trainable_variables))
    # On the first batch, broadcast the model and optimizer state from
    # worker 0 so that all workers start from identical values.
    if first_batch:
        from kungfu.tensorflow.initializer import broadcast_variables
        broadcast_variables(model.variables)
        broadcast_variables(opt.variables())
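For context, `opt` in this step is expected to average gradients across workers inside `apply_gradients`. A plausible setup follows KungFu's documented optimizer-wrapping pattern; the learning rate and the choice of SGD here are assumptions, not taken from this repository:

import tensorflow as tf
from kungfu.tensorflow.optimizers import SynchronousSGDOptimizer

# Wrap a plain Keras optimizer so that apply_gradients performs
# all-reduce gradient averaging across the cluster.
opt = SynchronousSGDOptimizer(tf.optimizers.SGD(learning_rate=0.01))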
def run(benchmark_step):
    # Warm-up
    log('Running warmup...')
    timeit.timeit(benchmark_step, number=args.num_warmup_batches)

    # Benchmark
    log('Running benchmark...')
    img_secs = []
    iteration_time = []
    for x in range(args.num_iters):
        # Run the reshape strategy once per iteration (direct call: the
        # benchmark runs eagerly, so there is no session to drive it).
        reshape_strategy(reshape)
        time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / time
        log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device))
        log('iteration time: %.3f' % time)
        img_secs.append(img_sec)
        iteration_time.append(time)

    # Results
    log('mean iteration time: %.3f' % np.mean(iteration_time))
    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)
    log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
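As a sanity check on the reported numbers, the throughput formula above works out as follows; the figures are illustrative, not measured results:

batch_size = 32            # illustrative values only
num_batches_per_iter = 10
elapsed = 2.0              # seconds for the timed block
img_sec = batch_size * num_batches_per_iter / elapsed  # 160.0 images/sec
# The reported interval is mean +/- 1.96 * std, i.e. an approximate
# 95% confidence interval assuming normally distributed samples.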
val_acc_metric = tf.metrics.SparseCategoricalAccuracy()

best_val_acc = 0
time_log = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = f"tensorboard-logs/{args.name}/{time_log}"
summary_writer = tf.summary.create_file_writer(log_dir, flush_millis=10000)
step = 0
with summary_writer.as_default():
    for epoch in range(NUM_EPOCHS):
        print('Start of epoch %d' % (epoch + 1, ))
        for batch, (images, labels) in enumerate(train_dataset.take(NUM_STEPS)):
            # Run the reshape strategy before apply_gradients (and therefore
            # before AllReduce is called in KungFu).
            reshape_strategy(reshape)
            t0 = time.time()
            probs, loss_value = training_step(mnist_model, opt, images,
                                              labels, batch == 0)
            print('training step %d, took %s' %
                  (step, show_duration(time.time() - t0)))
            step += 1

            # Update the training metric.
            train_acc_metric(labels, probs)

            # Log the loss metric every third step, only on the 0th worker.
            if step % 3 == 0 and current_rank() == 0:
                # print('Training step #%d\tLoss: %.6f' % (step, loss_value))
best_val_acc = 0
time_log = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = f"tensorboard-logs/{args.name}/{time_log}"
summary_writer = tf.summary.create_file_writer(log_dir, flush_millis=10000)
step = 0
with summary_writer.as_default():
    for epoch in range(NUM_EPOCHS):
        print('Start of epoch %d' % (epoch + 1, ))
        for batch, (images, labels) in enumerate(train_dataset.take(NUM_STEPS)):
            # Run the reshape strategy before apply_gradients (and therefore
            # before AllReduce is called in KungFu). The return value reports
            # whether this worker is kept in the new cluster configuration.
            keep = reshape_strategy()
            if not keep:
                print('reshape strategy failed: this worker was not kept')
            t0 = time.time()
            probs, loss_value = training_step(mnist_model, opt, images,
                                              labels, batch == 0)
            print('training step %d, took %s' %
                  (step, show_duration(time.time() - t0)))
            step += 1

            # Update the training metric.
            train_acc_metric(labels, probs)

            # Log the loss metric periodically, only on the 0th worker.
def show_duration(duration):
    if duration < 60:
        return '%.2fs' % duration
    sec = int(duration)
    mm, ss = sec // 60, sec % 60
    if duration < 3600:
        return '%dm%ds' % (mm, ss)
    return '%dh%dm%ds' % (mm // 60, mm % 60, ss)


# x = tf.Variable(tf.ones([], dtype=tf.int32))
x = tf.ones((10, 1), dtype=tf.int32)
print(x.numpy())

steps = 10
mean_time = []
for i in range(steps):
    # Run the reshape strategy before AllReduce to bypass a straggler node.
    t1 = time.time()
    keep = reshape_strategy(debug=False)
    iteration_time = time.time() - t1
    print('reshape took %s' % show_duration(iteration_time))
    t0 = time.time()
    v = all_reduce(x)
    print('all reduce step %d, took %s' % (i, show_duration(time.time() - t0)))
    mean_time.append(iteration_time)
    if not keep:
        # This worker was removed from the cluster; stop the loop.
        break
print(np.mean(mean_time))
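For reference, the formatter above renders durations as follows:

show_duration(42.5)    # -> '42.50s'
show_duration(125)     # -> '2m5s'
show_duration(3725)    # -> '1h2m5s'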