def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    total_batch_size,
    num_shards,
    expname,
    explog,
):
    '''
    Run one epoch of training, evaluate on the test model and log a summary.

    Args:
        args: parsed options; ``num_epochs`` and ``epoch_size`` are read.
        epoch: index of the epoch being run.
        train_model: model whose net is run once per training iteration.
        test_model: model whose net is run for evaluation, or ``None`` to
            skip evaluation (test accuracy is then reported as -1).
        total_batch_size: batch size used to derive the iteration count,
            the throughput figure and the processed-image count.
        num_shards: divisor applied to the epoch size when computing the
            number of iterations.
        expname: not used by this function.
        explog: object whose ``log`` method receives the epoch summary.

    Returns:
        ``epoch + 1``.

    Raises:
        AssertionError: if the last fetched training loss is not below 40.
    '''
    log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
    epoch_iters = int(args.epoch_size / total_batch_size / num_shards)

    # Training metrics are read from the first device only.
    prefix = "{}_{}".format(
        train_model._device_prefix, train_model._devices[0])

    for i in range(epoch_iters):
        # This timeout is required (temporarily) since CUDA-NCCL
        # operators might deadlock when synchronizing between GPUs.
        # The first iteration is given a longer allowance.
        timeout = 600.0 if i == 0 else 60.0
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1

        fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)"
        log.info(fmt.format(i + 1, epoch_iters, epoch, total_batch_size / dt))
        accuracy = workspace.FetchBlob(prefix + '/accuracy')
        loss = workspace.FetchBlob(prefix + '/loss')
        train_fmt = "Training loss: {}, accuracy: {}"
        log.info(train_fmt.format(loss, accuracy))

    num_images = epoch * epoch_iters * total_batch_size
    accuracy = workspace.FetchBlob(prefix + '/accuracy')
    loss = workspace.FetchBlob(prefix + '/loss')
    learning_rate = workspace.FetchBlob(
        data_parallel_model.GetLearningRateBlobNames(train_model)[0])

    test_accuracy = 0
    if test_model is not None:
        # Run 100 iters of testing, averaging accuracy over every device.
        ntests = 0
        for _ in range(100):
            workspace.RunNet(test_model.net.Proto().name)
            for g in test_model._devices:
                # ``.item()`` replaces np.asscalar, which newer NumPy
                # releases no longer provide.
                test_accuracy += workspace.FetchBlob(
                    "{}_{}".format(test_model._device_prefix, g)
                    + '/accuracy').item()
                ntests += 1
        test_accuracy /= ntests
    else:
        test_accuracy = -1

    explog.log(
        input_count=num_images,
        # ``i`` is the last training-loop index, so at least one training
        # iteration must have run for this to be defined.
        batch_count=(i + epoch * epoch_iters),
        additional_values={
            'accuracy': accuracy,
            'loss': loss,
            'learning_rate': learning_rate,
            'epoch': epoch,
            'test_accuracy': test_accuracy,
        })
    assert loss < 40, "Exploded gradients :("
    return epoch + 1
def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    total_batch_size,
    num_shards,
    expname,
    explog,
    best_accuracy,
):
    '''
    Run one epoch of the trainer and track the best test accuracy so far.
    TODO: add checkpointing here.

    Args:
        args: parsed options; ``num_epochs``, ``epoch_size`` and
            ``test_iters`` are read.
        epoch: index of the epoch being run.
        train_model: model whose net is run once per training iteration.
        test_model: model whose net is run for evaluation, or ``None`` to
            skip evaluation (test accuracy is then reported as -1).
        total_batch_size: batch size used to derive the iteration count,
            the throughput figure and the processed-image count.
        num_shards: divisor applied to the epoch size when computing the
            number of iterations.
        expname: not used by this function.
        explog: object whose ``log`` method receives the epoch summary.
        best_accuracy: best test accuracy seen before this epoch.

    Returns:
        Tuple ``(epoch + 1, best_accuracy)`` where ``best_accuracy`` is
        raised to this epoch's test accuracy if that is higher.

    Raises:
        AssertionError: if the last fetched training loss is not below 40.
    '''
    # TODO: add loading from checkpoint
    log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
    epoch_iters = int(args.epoch_size / total_batch_size / num_shards)

    # Training metrics are read from the first device only.
    prefix = "{}_{}".format(
        train_model._device_prefix, train_model._devices[0])

    for i in range(epoch_iters):
        # This timeout is required (temporarily) since CUDA-NCCL
        # operators might deadlock when synchronizing between GPUs.
        timeout = 600.0 if i == 0 else 60.0
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1

        fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)"
        log.info(fmt.format(i + 1, epoch_iters, epoch, total_batch_size / dt))
        accuracy = workspace.FetchBlob(prefix + '/accuracy')
        loss = workspace.FetchBlob(prefix + '/loss')
        train_fmt = "Training loss: {}, accuracy: {}"
        log.info(train_fmt.format(loss, accuracy))

    num_images = epoch * epoch_iters * total_batch_size
    accuracy = workspace.FetchBlob(prefix + '/accuracy')
    loss = workspace.FetchBlob(prefix + '/loss')
    learning_rate = workspace.FetchBlob(
        data_parallel_model.GetLearningRateBlobNames(train_model)[0]
    )

    test_accuracy = 0
    if test_model is not None:
        # Run args.test_iters iterations of testing, averaging accuracy
        # over every device.
        ntests = 0
        for _ in range(args.test_iters):
            workspace.RunNet(test_model.net.Proto().name)
            for g in test_model._devices:
                # ``.item()`` replaces np.asscalar, which newer NumPy
                # releases no longer provide.
                test_accuracy += workspace.FetchBlob(
                    "{}_{}".format(test_model._device_prefix, g)
                    + '/accuracy'
                ).item()
                ntests += 1
        test_accuracy /= ntests
    else:
        test_accuracy = -1

    if test_accuracy > best_accuracy:
        best_accuracy = test_accuracy

    explog.log(
        input_count=num_images,
        # ``i`` is the last training-loop index, so at least one training
        # iteration must have run for this to be defined.
        batch_count=(i + epoch * epoch_iters),
        additional_values={
            'accuracy': accuracy,
            'loss': loss,
            'learning_rate': learning_rate,
            'epoch': epoch,
            'test_accuracy': test_accuracy,
            'best_accuracy': best_accuracy,
        }
    )
    assert loss < 40, "Exploded gradients :("

    # TODO: add checkpointing
    return epoch + 1, best_accuracy
def RunEpoch(
    args,
    epoch,
    train_model,
    test_model,
    total_batch_size,
    num_shards,
    expname,
    explog,
):
    '''
    Run one epoch of the trainer, reporting windowed throughput statistics.
    TODO: add checkpointing here.

    Args:
        args: parsed options; ``num_epochs``, ``epoch_size`` and
            ``notify_frequency`` are read.
        epoch: index of the epoch being run.
        train_model: model whose net is run once per training iteration.
        test_model: model whose net is run for evaluation, or ``None`` to
            skip evaluation (test accuracy is then reported as -1).
        total_batch_size: batch size used to derive the iteration count,
            the throughput figures and the processed-image count.
        num_shards: divisor applied to the epoch size when computing the
            number of iterations.
        expname: not used by this function.
        explog: object whose ``log`` method receives the epoch summary.

    Returns:
        ``epoch + 1``.

    Raises:
        AssertionError: if the last fetched training loss is not below 40.
    '''
    # TODO: add loading from checkpoint
    log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
    epoch_iters = int(args.epoch_size / total_batch_size / num_shards)

    ts = time.time()
    drop = 10  # iterations up to and including this index are not timed
    max_speed = 0.0  # renamed from ``max`` to stop shadowing the builtin
    spans = []  # per-iteration run times for iterations after ``drop``
    updateEvery = args.notify_frequency
    # Training metrics are read from the first device only.
    prefix = "{}_{}".format(
        train_model._device_prefix, train_model._devices[0])

    for i in range(epoch_iters):
        # This timeout is required (temporarily) since CUDA-NCCL
        # operators might deadlock when synchronizing between GPUs.
        timeout = 3600  # 3600.0 if i == 0 else 60.0
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1

        if i > drop:
            spans.append(dt)

        # ignore the first 10 iterations
        if i == drop:
            # reset timer
            ts = time.time()

        if (i - drop) % updateEvery == 0 and i > drop:
            fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec), max = {:.2f}, avg = {:.2f}, median = {}. medthru = {}, avgthru = {}"
            te = time.time()
            td = te - ts
            # Throughput over the window since the timer was last reset.
            currSpeed = updateEvery * total_batch_size / td
            if max_speed < currSpeed:
                max_speed = currSpeed
            log.info(
                fmt.format(i + 1, epoch_iters, epoch, currSpeed, max_speed,
                           np.mean(spans), np.median(spans),
                           1. / np.median(spans), 1. / np.mean(spans)))
            ts = time.time()

        # NOTE(review): the nesting of this per-iteration loss/accuracy
        # logging could not be recovered with certainty; it is kept at
        # loop level here -- confirm.
        accuracy = workspace.FetchBlob(prefix + '/accuracy')
        loss = workspace.FetchBlob(prefix + '/loss')
        train_fmt = "Training loss: {}, accuracy: {}"
        log.info(train_fmt.format(loss, accuracy))

    num_images = epoch * epoch_iters * total_batch_size
    accuracy = workspace.FetchBlob(prefix + '/accuracy')
    loss = workspace.FetchBlob(prefix + '/loss')
    learning_rate = workspace.FetchBlob(
        data_parallel_model.GetLearningRateBlobNames(train_model)[0])

    test_accuracy = 0
    if test_model is not None:
        # Run 100 iters of testing, averaging accuracy over every device.
        ntests = 0
        for _ in range(100):
            workspace.RunNet(test_model.net.Proto().name)
            for g in test_model._devices:
                # ``.item()`` replaces np.asscalar, which newer NumPy
                # releases no longer provide.
                test_accuracy += workspace.FetchBlob(
                    "{}_{}".format(test_model._device_prefix, g)
                    + '/accuracy').item()
                ntests += 1
        test_accuracy /= ntests
    else:
        test_accuracy = -1

    explog.log(
        input_count=num_images,
        # ``i`` is the last training-loop index, so at least one training
        # iteration must have run for this to be defined.
        batch_count=(i + epoch * epoch_iters),
        additional_values={
            'accuracy': accuracy,
            'loss': loss,
            'learning_rate': learning_rate,
            'epoch': epoch,
            'test_accuracy': test_accuracy,
        })
    assert loss < 40, "Exploded gradients :("

    # TODO: add checkpointing
    print("accuracy = %s. test_acc = %s. loss = %s" %
          (accuracy, test_accuracy, loss))
    return epoch + 1
def RunEpoch(args, epoch, train_model, test_model, total_batch_size,
             num_shards, explog, plt_kernel):
    '''
    Run one epoch of the trainer, or an LFW verification pass.
    TODO: add checkpointing here.

    The work done depends on ``args.test_data_type``:

    * ``'VAL'``: train for one epoch, evaluate on ``test_model`` and log
      a summary through ``explog``.
    * ``'LFW'`` (with ``args.load_model_path`` set): run ``test_model``
      over the LFW pairs and log the verification metrics.
    * ``'MEGAFACE'`` (with ``args.load_model_path`` set): nothing is done.

    Args:
        args: parsed options; ``test_data_type``, ``num_epochs``,
            ``epoch_size``, ``load_model_path`` and ``feature_dim`` are read.
        epoch: index of the epoch being run.
        train_model: model whose net is run once per training iteration.
        test_model: model whose net is run for evaluation.
        total_batch_size: batch size used for iteration/batch counts.
        num_shards: divisor applied to the epoch size when computing the
            number of training iterations.
        explog: object whose ``log`` method receives the epoch summary.
        plt_kernel: passed through to ``display_activation_map`` when
            ``DEBUG_TRAINING`` is set.

    Returns:
        Tuple ``(epoch + 1, epoch_loss, epoch_accuracy)``. The two lists hold
        the per-iteration training loss and accuracy and are empty unless
        the ``'VAL'`` branch ran.

    Raises:
        AssertionError: if the training loss is not below 40, or if the
            number of LFW test images is not a multiple of the batch size.
    '''
    # TODO: add loading from checkpoint

    # Bound up front so the final return works for every test_data_type;
    # previously they existed only on the 'VAL' path and any other path
    # failed with UnboundLocalError.
    epoch_loss = []
    epoch_accuracy = []

    if args.test_data_type == 'VAL':
        log.info("Starting epoch {}/{}".format(epoch, args.num_epochs))
        epoch_iters = int(args.epoch_size / total_batch_size / num_shards)
        # Training metrics are read from the first device only.
        prefix = "{}_{}".format(
            train_model._device_prefix, train_model._devices[0])

        for i in range(epoch_iters):
            # This timeout is required (temporarily) since CUDA-NCCL
            # operators might deadlock when synchronizing between GPUs.
            timeout = 600.0 if i == 0 else 60.0
            with timeout_guard.CompleteInTimeOrDie(timeout):
                t1 = time.time()
                workspace.RunNet(train_model.net.Proto().name)
                t2 = time.time()
                dt = t2 - t1

            fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)"
            log.info(
                fmt.format(i + 1, epoch_iters, epoch, total_batch_size / dt))
            accuracy = workspace.FetchBlob(prefix + '/accuracy')
            loss = workspace.FetchBlob(prefix + '/loss')
            train_fmt = "Training loss: {}, accuracy: {}"
            log.info(train_fmt.format(loss, accuracy))
            epoch_loss.append(loss)
            epoch_accuracy.append(accuracy)

        num_images = epoch * epoch_iters * total_batch_size
        accuracy = workspace.FetchBlob(prefix + '/accuracy')
        loss = workspace.FetchBlob(prefix + '/loss')
        learning_rate = workspace.FetchBlob(
            data_parallel_model.GetLearningRateBlobNames(train_model)[0]
        )

        test_accuracy = 0
        if test_model is not None:
            # Run 100 iters of testing, averaging over every device.
            ntests = 0
            for _ in range(100):
                workspace.RunNet(test_model.net.Proto().name)
                for g in test_model._devices:
                    # ``.item()`` replaces np.asscalar, which newer NumPy
                    # releases no longer provide.
                    test_accuracy += workspace.FetchBlob(
                        "{}_{}".format(test_model._device_prefix, g)
                        + '/accuracy'
                    ).item()
                    ntests += 1
            test_accuracy /= ntests
        else:
            test_accuracy = -1

        explog.log(
            input_count=num_images,
            # ``i`` is the last training-loop index, so at least one
            # training iteration must have run for this to be defined.
            batch_count=(i + epoch * epoch_iters),
            additional_values={
                'accuracy': accuracy,
                'loss': loss,
                'learning_rate': learning_rate,
                'epoch': epoch,
                'test_accuracy': test_accuracy,
            }
        )
        assert loss < 40, "Exploded gradients :("

        if DEBUG_TRAINING:
            device_name = "{}_{}".format(
                test_model._device_prefix, test_model._devices[0])
            display_activation_map(plt_kernel, channel=0, batch_num=16,
                                   device_name=device_name)
            plt.pause(0.001)

    # lfw verification test
    elif args.test_data_type == 'LFW' and args.load_model_path is not None:
        lfw_pairs = os.path.join(
            os.path.abspath('../dataset'), 'lfw_pairs.txt')
        if not os.path.exists(lfw_pairs):
            log.error('There is no lfw_pairs.txt in folder dataset/lfw!!!')
        else:
            actual_issame = lfw.get_issame_list(lfw.read_pairs(lfw_pairs))
            # Two images per pair.
            num_test_images = len(actual_issame) * 2
            assert num_test_images % total_batch_size == 0, (
                'The number of lfw test images must be interger multiple of the test bach size')
            num_batches = num_test_images // total_batch_size
            emb_array = np.zeros((num_test_images, args.feature_dim))
            for _ in range(num_batches):
                workspace.RunNet(test_model.net.Proto().name)
                for g in test_model._devices:
                    device = '{}_{}'.format(test_model._device_prefix, g)
                    label = workspace.FetchBlob(device + '/label')
                    embedding = workspace.FetchBlob(device + '/fc5')
                    # The fetched labels are used as row indices into the
                    # embedding table.
                    emb_array[label] = embedding
            _, _, test_accuracy, test_val, val_std, far = lfw.evaluate(
                emb_array, actual_issame, nrof_folds=10)
            log.info('Accuracy: %1.3f+-%1.3f' %
                     (np.mean(test_accuracy), np.std(test_accuracy)))
            log.info('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f' %
                     (test_val, val_std, far))

    # megaface verification test
    elif (args.test_data_type == 'MEGAFACE'
          and args.load_model_path is not None):
        pass

    return epoch + 1, epoch_loss, epoch_accuracy
def run_model(self, devices, gpu):
    '''
    Helper function for test_equiv

    Builds a small FC model parallelized over ``devices``, feeds it ten
    seeded random batches and returns the fetched ``fc_w`` blob of device 0.
    ``gpu`` selects GPU devices; when false the model is built with
    ``cpu_device=True`` and ``shared_model=True``.
    '''
    def _build_forward(model, loss_scale):
        fc_out = model.FC(
            "data", "fc", 16, 1,
            ("ConstantFill", {}), ("ConstantFill", {}),
        )
        flat = model.FlattenToVec(fc_out, "fc_fl")
        activated = model.Sigmoid(flat, "sigm")
        dist = model.SquaredL2Distance([activated, "label"], "sq")
        avg_loss = model.AveragedLoss(dist, "loss")
        scaled_loss = model.Scale(avg_loss, scale=loss_scale)

        # For testing explicit sync
        model.param_init_net.UniformFill([], ["sync_num"], shape=[1])
        return [scaled_loss]

    def _build_optimizer(model):
        return optimizer.build_sgd(
            model,
            0.1,
            policy="fixed",
            max_gradient_norm=5.0,
            allow_lr_injection=True,
        )

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(order="NHWC", name="test{}".format(devices))
    use_cpu = not gpu
    data_parallel_model.Parallelize(
        model,
        input_builder_fun=lambda _model: None,
        forward_pass_builder_fun=_build_forward,
        optimizer_builder_fun=_build_optimizer,
        devices=devices,
        cpu_device=use_cpu,
        shared_model=use_cpu,
    )
    data_parallel_model.AddBlobSync(model, ["sync_num"])

    # Light test for LR names
    self.assertGreater(
        len(data_parallel_model.GetLearningRateBlobNames(model)), 0)

    # Each run has same input, independent of number of gpus
    np.random.seed(2603)
    batch_size = 64
    for step in range(10):
        full_data = np.random.rand(batch_size, 16)
        full_labels = np.round(full_data[:, 0])
        per_device = batch_size // len(devices)

        # Give every device its own contiguous slice of the batch.
        for idx, dev in enumerate(devices):
            lo = idx * per_device
            hi = lo + per_device
            dev_data = full_data[lo:hi, :].astype(np.float32)
            dev_labels = full_labels[lo:hi].astype(np.float32)
            scope = "{}_{}".format(model._device_prefix, dev)
            with core.DeviceScope(
                    core.DeviceOption(model._device_type, dev)):
                workspace.FeedBlob(scope + "/data", dev_data)
                workspace.FeedBlob(scope + "/label", dev_labels)

        # Nets are initialised and created only once, after the first feed.
        if step == 0:
            workspace.RunNetOnce(model.param_init_net)
            workspace.CreateNet(model.net)

        sync_value = step * 2
        workspace.FeedBlob(
            "{}_0/sync_num".format(model._device_prefix),
            np.array([sync_value]).astype(np.float32),
            device_option=core.DeviceOption(model._device_type, 0),
        )
        workspace.RunNet(model.net.Proto().name)

        # Test AddBlobSync
        for dev in model._devices:
            blob_name = "{}_{}/sync_num".format(model._device_prefix, dev)
            synced = workspace.FetchBlob(blob_name)[0]
            self.assertTrue(abs(synced - sync_value) < 0.01)

    return workspace.FetchBlob("{}_0/fc_w".format(model._device_prefix))
def RunEpoch(args, epoch, train_model, test_model, explog,
             elapsed_training_time):
    """
    Run a training epoch on the training model, and then compute the accuracy
    on a test model.

    :param args: the script's parameters; ``epoch_count``, ``epoch_size``,
        ``test_epoch_size``, ``batch_size``, ``num_shards``,
        ``per_device_optimization`` and ``target_accuracy`` are read
    :param epoch: the current epoch's zero-based count (logged as ``epoch + 1``)
    :param train_model: the model on which training will be performed
    :param test_model: the model on which testing will be performed, or
        ``None`` to skip testing (test accuracies then stay at 0)
    :param explog: the log object wrapping the file
    :param elapsed_training_time: training time accumulated before this epoch
    :return: tuple ``(elapsed_training_time, on_target)``: the accumulated
        time including this epoch's net run time, and whether the top-1
        test accuracy reached ``args.target_accuracy``
    :raises AssertionError: if the last fetched training loss is not below 40
    """
    log.info("Starting epoch {}/{}".format(epoch + 1, args.epoch_count))
    epoch_iters = int(args.epoch_size / args.batch_size / args.num_shards)
    test_epoch_iters = int(
        args.test_epoch_size / args.batch_size / args.num_shards)
    # Training metrics are read from the first device only.
    prefix = "{}_{}".format(
        train_model._device_prefix, train_model._devices[0])
    total_time = 0.

    for i in range(epoch_iters):
        # This timeout is required (temporarily) since CUDA-NCCL
        # operators might deadlock when synchronizing between GPUs.
        timeout = 600 if i == 0 else 300
        with timeout_guard.CompleteInTimeOrDie(timeout):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1
            total_time += dt

        # Log the time it took to run the current batch
        fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)"
        log.info(
            fmt.format(i + 1, epoch_iters, epoch + 1, args.batch_size / dt))

        # Get the accuracy and loss for this particular device
        accuracy = workspace.FetchBlob(prefix + '/accuracy')
        loss = workspace.FetchBlob(prefix + '/loss')

        # Write the training loss and accuracy for this batch
        log.info("Training loss: {}, accuracy: {}".format(loss, accuracy))

    # Compute the total number of images processed up to the end of this
    # epoch; get the accuracy and the loss
    num_images = (epoch + 1) * epoch_iters * args.batch_size
    accuracy = workspace.FetchBlob(prefix + '/accuracy')
    loss = workspace.FetchBlob(prefix + '/loss')
    try:
        learning_rate = workspace.FetchBlob(
            (prefix if args.per_device_optimization else '')
            + data_parallel_model.GetLearningRateBlobNames(train_model)[0])
    except AttributeError:
        log.error(
            "The learning rate could not be found on this peer; this is likely due to the "
            "--per_device_optimization=True option.")
        learning_rate = 'unknown'

    # Prepare the parameters required for testing
    test_accuracy = 0
    test_accuracy_top5 = 0
    if test_model is not None:
        ntests = 0
        for _ in range(test_epoch_iters):
            workspace.RunNet(test_model.net.Proto().name)
            # Aggregate the accuracy across all the devices involved in
            # testing. ``.item()`` replaces np.asscalar, which newer NumPy
            # releases no longer provide.
            for g in test_model._devices:
                device = "{}_{}".format(test_model._device_prefix, g)
                test_accuracy += workspace.FetchBlob(
                    device + '/accuracy').item()
                test_accuracy_top5 += workspace.FetchBlob(
                    device + '/accuracy_top5').item()
                ntests += 1
        # Compute the average test_accuracy and the average top-5 test
        # accuracy across a test epoch, and across all devices involved in it
        test_accuracy /= ntests
        test_accuracy_top5 /= ntests

    # Log the results to stdout, update total training time
    elapsed_training_time += total_time
    on_target = test_accuracy >= args.target_accuracy
    log.info(
        "Finished testing on epoch {}. Obtained:\nAccuracy (Local - Training): {}\n"
        "Loss (Local - Training): {}\nTop-1 Acc: {}\nTop-5 Acc: {}\nOn target: {}\n Elapsed training time: {}"
        .format(epoch + 1, accuracy, loss, test_accuracy, test_accuracy_top5,
                on_target, elapsed_training_time))

    # Log this epoch's results
    explog.log(
        input_count=num_images,
        batch_count=((epoch + 1) * epoch_iters),
        additional_values={
            'accuracy': accuracy,
            'loss': loss,
            'learning_rate': learning_rate,
            'epoch': epoch + 1,
            'top1_test_accuracy': test_accuracy,
            'top5_test_accuracy': test_accuracy_top5,
            'target_accuracy': args.target_accuracy,
            'on_target': on_target,
            'elapsed_training_time': elapsed_training_time,
        })
    assert loss < 40, "Exploded gradients"
    return elapsed_training_time, on_target
labels_device) if i == 0 and e == 0: workspace.RunNetOnce(train_model.param_init_net) workspace.CreateNet(train_model.net) workspace.RunNetOnce(test_model.param_init_net) workspace.CreateNet(test_model.net, overwrite=True) workspace.RunNetOnce(deploy_model.param_init_net) workspace.CreateNet(deploy_model.net, overwrite=True) workspace.RunNet(train_model.net.Proto().name) loss_sum += workspace.FetchBlob("gpu_0/loss") correct += workspace.FetchBlob("gpu_0/accuracy") time_ep = time.time() - time_ep lr = workspace.FetchBlob( data_parallel_model.GetLearningRateBlobNames(train_model)[0]) values = [ e + 1, lr, loss_sum / batch_num, correct / batch_num, test_res['loss'], test_res['accuracy'], time_ep, ] table = tabulate.tabulate([values], columns, tablefmt='simple', floatfmt='8.4f') if e % 25 == 0: