Example #1
def main(_):
    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu
    set_memory_growth()

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)

    cfg = load_yaml(FLAGS.cfg_path)
    uv_weight_mask = cv2.imread(cfg['uv_weight_mask']) / 255.

    # define network
    generator = PRNet_Model(cfg['input_size'], cfg['ch_size'])
    generator.summary(line_length=80)

    # load dataset
    train_dataset = load_dataset(cfg,
                                 shuffle=True,
                                 num_workers=cfg['num_workers'])

    # define optimizer
    learning_rate_G = MultiStepLR(cfg['lr_G'], cfg['lr_steps'], cfg['lr_rate'])
    optimizer_G = tf.keras.optimizers.Adam(learning_rate=learning_rate_G,
                                           beta_1=cfg['adam_beta1_G'],
                                           beta_2=cfg['adam_beta2_G'])

    # define losses function
    loss_fn = WeightedMSE(uv_weight_mask)

    # load checkpoint
    checkpoint_dir = './checkpoints/' + cfg['sub_name']
    checkpoint = tf.train.Checkpoint(step=tf.Variable(0, name='step'),
                                     optimizer_G=optimizer_G,
                                     model=generator)
    manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                         directory=checkpoint_dir,
                                         max_to_keep=3)
    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        print('[*] load ckpt from {} at step {}.'.format(
            manager.latest_checkpoint, checkpoint.step.numpy()))
    else:
        if cfg['pretrain_name'] is not None:
            pretrain_dir = './checkpoints/' + cfg['pretrain_name']
            if tf.train.latest_checkpoint(pretrain_dir):
                checkpoint.restore(tf.train.latest_checkpoint(pretrain_dir))
                checkpoint.step.assign(0)
                print("[*] training from pretrain model {}.".format(
                    pretrain_dir))
            else:
                print(
                    "[*] cannot find pretrain model {}.".format(pretrain_dir))
        else:
            print("[*] training from scratch.")

    # define training step function
    @tf.function
    def train_step(img, pos):
        with tf.GradientTape() as tape_G:
            pre = generator(img, training=True)

            losses_G = {}
            losses_G['pixel'] = loss_fn(pos, pre)
            total_loss_G = tf.add_n([l for l in losses_G.values()])

        grads_G = tape_G.gradient(total_loss_G, generator.trainable_variables)
        optimizer_G.apply_gradients(zip(grads_G,
                                        generator.trainable_variables))

        return total_loss_G, losses_G

    # training loop
    summary_writer = tf.summary.create_file_writer('./logs/' + cfg['sub_name'])
    niter = int(cfg['train_dataset']['num_samples'] * cfg['epoch'] /
                cfg['batch_size'])
    prog_bar = ProgressBar(niter, checkpoint.step.numpy())
    remain_steps = max(niter - checkpoint.step.numpy(), 0)

    for sample in take(remain_steps, train_dataset):
        checkpoint.step.assign_add(1)
        steps = checkpoint.step.numpy()
        img, pos = sample['Image'], sample['Posmap']

        total_loss_G, losses_G = train_step(img, pos)

        prog_bar.update_gan(total_loss_G.numpy(),
                            optimizer_G.lr(steps).numpy())

        if steps % cfg['log_steps'] == 0:
            with summary_writer.as_default():
                tf.summary.scalar('loss_G/total_loss',
                                  total_loss_G,
                                  step=steps)
                for k, l in losses_G.items():
                    tf.summary.scalar('loss_G/{}'.format(k), l, step=steps)

                tf.summary.scalar('learning_rate_G',
                                  optimizer_G.lr(steps),
                                  step=steps)

        if steps % cfg['save_steps'] == 0:
            manager.save()
            print('\n[*] save ckpt file at {}'.format(
                manager.latest_checkpoint))

    print("\n[*] training done!")
Example #2
def main(_):
    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)
    set_memory_growth()

    cfg = load_yaml(FLAGS.cfg_path)

    # define network
    model = RetinaFaceModel(cfg, training=True)
    model.summary(line_length=80)

    # define prior box
    priors = prior_box((cfg['input_size'], cfg['input_size']),
                       cfg['min_sizes'], cfg['steps'], cfg['clip'])

    # load dataset
    train_dataset = load_dataset(cfg, priors, shuffle=True)

    # define optimizer
    steps_per_epoch = cfg['dataset_len'] // cfg['batch_size']
    learning_rate = MultiStepWarmUpLR(
        initial_learning_rate=cfg['init_lr'],
        lr_steps=[e * steps_per_epoch for e in cfg['lr_decay_epoch']],
        lr_rate=cfg['lr_rate'],
        warmup_steps=cfg['warmup_epoch'] * steps_per_epoch,
        min_lr=cfg['min_lr'])
    optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate,
                                        momentum=0.9,
                                        nesterov=True)

    # define losses function
    multi_box_loss = MultiBoxLoss()

    # load checkpoint
    checkpoint_dir = '/content/drive/My Drive/Colab/checkpoints/' + cfg[
        'sub_name']
    checkpoint = tf.train.Checkpoint(step=tf.Variable(0, name='step'),
                                     optimizer=optimizer,
                                     model=model)
    manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                         directory=checkpoint_dir,
                                         max_to_keep=3)
    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        print('[*] load ckpt from {} at step {}.'.format(
            manager.latest_checkpoint, checkpoint.step.numpy()))
    else:
        print("[*] training from scratch.")

    # define training step function
    @tf.function
    def train_step(inputs, labels):
        with tf.GradientTape() as tape:
            predictions = model(inputs, training=True)

            losses = {}
            losses['reg'] = tf.reduce_sum(model.losses)
            losses['loc'], losses['landm'], losses['class'] = \
                multi_box_loss(labels, predictions)
            total_loss = tf.add_n([l for l in losses.values()])

        grads = tape.gradient(total_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        return total_loss, losses

    # training loop
    summary_writer = tf.summary.create_file_writer('./logs/' + cfg['sub_name'])
    remain_steps = max(
        steps_per_epoch * cfg['epoch'] - checkpoint.step.numpy(), 0)
    prog_bar = ProgressBar(steps_per_epoch,
                           checkpoint.step.numpy() % steps_per_epoch)

    for inputs, labels in train_dataset.take(remain_steps):
        checkpoint.step.assign_add(1)
        steps = checkpoint.step.numpy()

        total_loss, losses = train_step(inputs, labels)

        prog_bar.update("epoch={}/{}, loss={:.4f}, lr={:.1e}".format(
            ((steps - 1) // steps_per_epoch) + 1, cfg['epoch'],
            total_loss.numpy(),
            optimizer.lr(steps).numpy()))

        if steps % 10 == 0:
            with summary_writer.as_default():
                tf.summary.scalar('loss/total_loss', total_loss, step=steps)
                for k, l in losses.items():
                    tf.summary.scalar('loss/{}'.format(k), l, step=steps)
                tf.summary.scalar('learning_rate',
                                  optimizer.lr(steps),
                                  step=steps)

        if steps % cfg['save_steps'] == 0:
            manager.save()
            print("\n[*] save ckpt file at {}".format(
                manager.latest_checkpoint))

    manager.save()
    print("\n[*] training done! save ckpt file at {}".format(
        manager.latest_checkpoint))
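This RetinaFace script assumes a MultiStepWarmUpLR schedule: a linear warm-up over the first warmup_epoch epochs, then step decay at the configured epoch boundaries, floored at min_lr. A minimal sketch under those assumptions is below; the repository's exact warm-up curve may differ.

import tensorflow as tf

class MultiStepWarmUpLR(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, initial_learning_rate, lr_steps, lr_rate,
                 warmup_steps=0, min_lr=0.0):
        super().__init__()
        self.initial_learning_rate = initial_learning_rate
        self.lr_steps = lr_steps
        self.lr_rate = lr_rate
        self.warmup_steps = warmup_steps
        self.min_lr = min_lr

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        # linear warm-up from min_lr up to the initial learning rate
        warmup_lr = self.min_lr + (self.initial_learning_rate - self.min_lr) * \
            step / tf.maximum(float(self.warmup_steps), 1.0)
        # multi-step decay afterwards: multiply by lr_rate at each boundary passed
        num_decays = tf.reduce_sum(tf.cast(
            step >= tf.constant(self.lr_steps, dtype=tf.float32), tf.float32))
        decayed_lr = self.initial_learning_rate * tf.pow(self.lr_rate, num_decays)
        lr = tf.where(step < float(self.warmup_steps), warmup_lr, decayed_lr)
        return tf.maximum(lr, self.min_lr)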
Example #3
def main(_):
    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)
    set_memory_growth()

    cfg = load_yaml(FLAGS.cfg_path)

    # define training step function
    @tf.function
    def train_step(inputs, labels, drop_path_prob):
        with tf.GradientTape() as tape:
            logits, logits_aux = model((inputs, drop_path_prob), training=True)

            losses = {}
            losses['reg'] = tf.reduce_sum(model.losses)
            losses['ce'] = criterion(labels, logits)
            losses['ce_auxiliary'] = \
                cfg['auxiliary_weight'] * criterion(labels, logits_aux)
            total_loss = tf.add_n([l for l in losses.values()])

        grads = tape.gradient(total_loss, model.trainable_variables)
        grads = [(tf.clip_by_norm(grad, cfg['grad_clip'])) for grad in grads]
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        return logits, total_loss, losses

    # Used to store the final accuracy for every arch
    final_acc = pd.DataFrame(data=None, columns=['arch_name', 'acc'])

    loop_num = 50

    if Debug:
        # debugpy.wait_for_client()
        loop_num = 1
    # define network
    for arch_num in range(loop_num):
        # read the arch
        arch = str(f"{cfg['sub_name']}_{arch_num}")
        cfg['arch'] = arch

        model = CifarModel(cfg, training=True, file_name=FLAGS.file_name)
        if Debug:
            model.summary(line_length=80)
            print("param size = {:f}MB".format(count_parameters_in_MB(model)))

        # load dataset
        train_dataset = load_cifar10_dataset(
            cfg['batch_size'],
            split='train',
            shuffle=True,
            drop_remainder=True,
            using_normalize=cfg['using_normalize'],
            using_crop=cfg['using_crop'],
            using_flip=cfg['using_flip'],
            using_cutout=cfg['using_cutout'],
            cutout_length=cfg['cutout_length'])
        val_dataset = load_cifar10_dataset(
            cfg['val_batch_size'],
            split='test',
            shuffle=False,
            drop_remainder=False,
            using_normalize=cfg['using_normalize'],
            using_crop=False,
            using_flip=False,
            using_cutout=False)

        # define optimizer
        steps_per_epoch = cfg['dataset_len'] // cfg['batch_size']
        learning_rate = CosineAnnealingLR(initial_learning_rate=cfg['init_lr'],
                                          t_period=cfg['epoch'] *
                                          steps_per_epoch,
                                          lr_min=cfg['lr_min'])
        optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate,
                                            momentum=cfg['momentum'])

        # define losses function
        criterion = CrossEntropyLoss()

        # load checkpoint
        checkpoint_dir = './checkpoints/' + arch
        checkpoint = tf.train.Checkpoint(step=tf.Variable(0, name='step'),
                                         optimizer=optimizer,
                                         model=model)
        manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                             directory=checkpoint_dir,
                                             max_to_keep=3)
        if manager.latest_checkpoint:
            checkpoint.restore(manager.latest_checkpoint)
            print('[*] load ckpt from {} at step {}.'.format(
                manager.latest_checkpoint, checkpoint.step.numpy()))
        else:
            print("[*] training from scratch.")

        # training loop
        summary_writer = tf.summary.create_file_writer('./logs/' +
                                                       cfg['sub_name'])
        total_steps = steps_per_epoch * cfg['epoch']
        remain_steps = max(total_steps - checkpoint.step.numpy(), 0)
        prog_bar = ProgressBar(steps_per_epoch,
                               checkpoint.step.numpy() % steps_per_epoch)

        train_acc = AvgrageMeter()
        val_acc = AvgrageMeter()
        best_acc = 0.
        for inputs, labels in train_dataset.take(remain_steps):
            checkpoint.step.assign_add(1)
            drop_path_prob = cfg['drop_path_prob'] * (
                tf.cast(checkpoint.step, tf.float32) / total_steps)
            steps = checkpoint.step.numpy()
            epochs = ((steps - 1) // steps_per_epoch) + 1

            logits, total_loss, losses = train_step(inputs, labels,
                                                    drop_path_prob)
            train_acc.update(
                accuracy(logits.numpy(), labels.numpy())[0], cfg['batch_size'])

            prog_bar.update(
                "epoch={}/{}, loss={:.4f}, acc={:.2f}, lr={:.2e}".format(
                    epochs, cfg['epoch'], total_loss.numpy(), train_acc.avg,
                    optimizer.lr(steps).numpy()))

            if steps % cfg['val_steps'] == 0 and steps > 1:
                print("\n[*] validate...", end='')
                val_acc.reset()
                for inputs_val, labels_val in val_dataset:
                    logits_val, _ = model((inputs_val, tf.constant([0.])))
                    val_acc.update(
                        accuracy(logits_val.numpy(), labels_val.numpy())[0],
                        inputs_val.shape[0])

                if val_acc.avg > best_acc:
                    best_acc = val_acc.avg
                    model.save_weights(
                        f"checkpoints/{cfg['sub_name']}/best.ckpt")

                val_str = " val acc {:.2f}%, best acc {:.2f}%"
                print(val_str.format(val_acc.avg, best_acc), end='')

            if steps % 10 == 0:
                with summary_writer.as_default():
                    tf.summary.scalar('acc/train', train_acc.avg, step=steps)
                    tf.summary.scalar('acc/val', val_acc.avg, step=steps)

                    tf.summary.scalar('loss/total_loss',
                                      total_loss,
                                      step=steps)
                    for k, l in losses.items():
                        tf.summary.scalar('loss/{}'.format(k), l, step=steps)
                    tf.summary.scalar('learning_rate',
                                      optimizer.lr(steps),
                                      step=steps)

            if steps % cfg['save_steps'] == 0:
                manager.save()
                print("\n[*] save ckpt file at {}".format(
                    manager.latest_checkpoint))

            if steps % steps_per_epoch == 0:
                train_acc.reset()

        manager.save()
        print("\n[*] training one arch done! save ckpt file at {}".format(
            manager.latest_checkpoint))
        final_acc.loc[arch_num] = list([arch, best_acc])
    print("Whole training ended, the best result is :")
    print("\t", final_acc.iloc[final_acc['acc'].idxmax()])
Example #4
def main(_):
    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)
    set_memory_growth()

    cfg = load_yaml(FLAGS.cfg_path)

    # define network
    generator = RRDB_Model(cfg['input_size'], cfg['ch_size'], cfg['network_G'])
    generator.summary(line_length=80)
    discriminator = DiscriminatorVGG128(cfg['gt_size'], cfg['ch_size'])
    discriminator.summary(line_length=80)

    # load dataset
    train_dataset = load_dataset(cfg, 'train_dataset', shuffle=False)

    # define optimizer
    learning_rate_G = MultiStepLR(cfg['lr_G'], cfg['lr_steps'], cfg['lr_rate'])
    learning_rate_D = MultiStepLR(cfg['lr_D'], cfg['lr_steps'], cfg['lr_rate'])
    optimizer_G = tf.keras.optimizers.Adam(learning_rate=learning_rate_G,
                                           beta_1=cfg['adam_beta1_G'],
                                           beta_2=cfg['adam_beta2_G'])
    optimizer_D = tf.keras.optimizers.Adam(learning_rate=learning_rate_D,
                                           beta_1=cfg['adam_beta1_D'],
                                           beta_2=cfg['adam_beta2_D'])

    # define losses function
    pixel_loss_fn = PixelLoss(criterion=cfg['pixel_criterion'])
    fea_loss_fn = ContentLoss(criterion=cfg['feature_criterion'])
    gen_loss_fn = GeneratorLoss(gan_type=cfg['gan_type'])
    dis_loss_fn = DiscriminatorLoss(gan_type=cfg['gan_type'])

    # load checkpoint
    checkpoint_dir = './checkpoints/' + cfg['sub_name']
    checkpoint = tf.train.Checkpoint(step=tf.Variable(0, name='step'),
                                     optimizer_G=optimizer_G,
                                     optimizer_D=optimizer_D,
                                     model=generator,
                                     discriminator=discriminator)
    manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                         directory=checkpoint_dir,
                                         max_to_keep=3)
    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        print('[*] load ckpt from {} at step {}.'.format(
            manager.latest_checkpoint, checkpoint.step.numpy()))
    else:
        if cfg['pretrain_name'] is not None:
            pretrain_dir = './checkpoints/' + cfg['pretrain_name']
            if tf.train.latest_checkpoint(pretrain_dir):
                checkpoint.restore(tf.train.latest_checkpoint(pretrain_dir))
                checkpoint.step.assign(0)
                print("[*] training from pretrain model {}.".format(
                    pretrain_dir))
            else:
                print(
                    "[*] cannot find pretrain model {}.".format(pretrain_dir))
        else:
            print("[*] training from scratch.")

    # define training step function
    @tf.function
    def train_step(lr, hr):
        with tf.GradientTape(persistent=True) as tape:
            sr = generator(lr, training=True)
            hr_output = discriminator(hr, training=True)
            sr_output = discriminator(sr, training=True)

            losses_G = {}
            losses_D = {}
            losses_G['reg'] = tf.reduce_sum(generator.losses)
            losses_D['reg'] = tf.reduce_sum(discriminator.losses)
            losses_G['pixel'] = cfg['w_pixel'] * pixel_loss_fn(hr, sr)
            losses_G['feature'] = cfg['w_feature'] * fea_loss_fn(hr, sr)
            losses_G['gan'] = cfg['w_gan'] * gen_loss_fn(hr_output, sr_output)
            losses_D['gan'] = dis_loss_fn(hr_output, sr_output)
            total_loss_G = tf.add_n([l for l in losses_G.values()])
            total_loss_D = tf.add_n([l for l in losses_D.values()])

        grads_G = tape.gradient(total_loss_G, generator.trainable_variables)
        grads_D = tape.gradient(total_loss_D,
                                discriminator.trainable_variables)
        optimizer_G.apply_gradients(zip(grads_G,
                                        generator.trainable_variables))
        optimizer_D.apply_gradients(
            zip(grads_D, discriminator.trainable_variables))

        return total_loss_G, total_loss_D, losses_G, losses_D

    # training loop
    summary_writer = tf.summary.create_file_writer('./logs/' + cfg['sub_name'])
    prog_bar = ProgressBar(cfg['niter'], checkpoint.step.numpy())
    remain_steps = max(cfg['niter'] - checkpoint.step.numpy(), 0)

    for lr, hr in train_dataset.take(remain_steps):
        checkpoint.step.assign_add(1)
        steps = checkpoint.step.numpy()

        total_loss_G, total_loss_D, losses_G, losses_D = train_step(lr, hr)

        prog_bar.update(
            "loss_G={:.4f}, loss_D={:.4f}, lr_G={:.1e}, lr_D={:.1e}".format(
                total_loss_G.numpy(), total_loss_D.numpy(),
                optimizer_G.lr(steps).numpy(),
                optimizer_D.lr(steps).numpy()))

        if steps % 10 == 0:
            with summary_writer.as_default():
                tf.summary.scalar('loss_G/total_loss',
                                  total_loss_G,
                                  step=steps)
                tf.summary.scalar('loss_D/total_loss',
                                  total_loss_D,
                                  step=steps)
                for k, l in losses_G.items():
                    tf.summary.scalar('loss_G/{}'.format(k), l, step=steps)
                for k, l in losses_D.items():
                    tf.summary.scalar('loss_D/{}'.format(k), l, step=steps)

                tf.summary.scalar('learning_rate_G',
                                  optimizer_G.lr(steps),
                                  step=steps)
                tf.summary.scalar('learning_rate_D',
                                  optimizer_D.lr(steps),
                                  step=steps)

        if steps % cfg['save_steps'] == 0:
            manager.save()
            print("\n[*] save ckpt file at {}".format(
                manager.latest_checkpoint))

    print("\n [*] training done!")
Example #5
def main(_):
    '''
    Train for one epoch to get the supernet, then randomly sample 50
    architectures for fine-tuning. This structure is basically the same as
    train_search.py.
    TODO: Add PGD here and calculate FSP
    '''
    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)
    set_memory_growth()

    cfg = load_yaml(FLAGS.cfg_path)

    # define network
    sna = SearchNetArch(cfg)
    sna.model.summary(line_length=80)
    print("param size = {:f}MB".format(count_parameters_in_MB(sna.model)))

    # load dataset
    t_split = f"train[0%:{int(cfg['train_portion'] * 100)}%]"
    v_split = f"train[{int(cfg['train_portion'] * 100)}%:100%]"
    train_dataset = load_cifar10_dataset(
        cfg['batch_size'],
        split=t_split,
        shuffle=True,
        drop_remainder=True,
        using_normalize=cfg['using_normalize'],
        using_crop=cfg['using_crop'],
        using_flip=cfg['using_flip'],
        using_cutout=cfg['using_cutout'],
        cutout_length=cfg['cutout_length'])
    val_dataset = load_cifar10_dataset(cfg['batch_size'],
                                       split=v_split,
                                       shuffle=True,
                                       drop_remainder=True,
                                       using_normalize=cfg['using_normalize'],
                                       using_crop=cfg['using_crop'],
                                       using_flip=cfg['using_flip'],
                                       using_cutout=cfg['using_cutout'],
                                       cutout_length=cfg['cutout_length'])

    # define optimizer
    steps_per_epoch = int(cfg['dataset_len'] * cfg['train_portion'] //
                          cfg['batch_size'])
    learning_rate = CosineAnnealingLR(initial_learning_rate=cfg['init_lr'],
                                      t_period=cfg['epoch'] * steps_per_epoch,
                                      lr_min=cfg['lr_min'])
    optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate,
                                        momentum=cfg['momentum'])
    optimizer_arch = tf.keras.optimizers.Adam(
        learning_rate=cfg['arch_learning_rate'], beta_1=0.5, beta_2=0.999)

    # define losses function
    criterion = CrossEntropyLoss()

    # load checkpoint
    checkpoint_dir = './checkpoints/' + cfg['sub_name']
    checkpoint = tf.train.Checkpoint(step=tf.Variable(0, name='step'),
                                     optimizer=optimizer,
                                     optimizer_arch=optimizer_arch,
                                     model=sna.model,
                                     alphas_normal=sna.alphas_normal,
                                     alphas_reduce=sna.alphas_reduce,
                                     betas_normal=sna.betas_normal,
                                     betas_reduce=sna.betas_reduce)
    manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                         directory=checkpoint_dir,
                                         max_to_keep=3)
    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        print('[*] load ckpt from {} at step {}.'.format(
            manager.latest_checkpoint, checkpoint.step.numpy()))
    else:
        print("[*] training from scratch.")
    print(f"[*] searching model after {cfg['start_search_epoch']} epochs.")

    # define training step function for model
    @tf.function
    def train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logits = sna.model((inputs, *sna.arch_parameters), training=True)

            losses = {}
            losses['reg'] = tf.reduce_sum(sna.model.losses)
            losses['ce'] = criterion(labels, logits)
            total_loss = tf.add_n([l for l in losses.values()])

        grads = tape.gradient(total_loss, sna.model.trainable_variables)
        grads = [(tf.clip_by_norm(grad, cfg['grad_clip'])) for grad in grads]
        optimizer.apply_gradients(zip(grads, sna.model.trainable_variables))

        return logits, total_loss, losses

    # define training step function for arch_parameters
    @tf.function
    def train_step_arch(inputs, labels):
        with tf.GradientTape() as tape:
            logits = sna.model((inputs, *sna.arch_parameters), training=True)

            losses = {}
            losses['reg'] = cfg['arch_weight_decay'] * tf.add_n(
                [tf.reduce_sum(p**2) for p in sna.arch_parameters])
            losses['ce'] = criterion(labels, logits)
            total_loss = tf.add_n([l for l in losses.values()])

        grads = tape.gradient(total_loss, sna.arch_parameters)
        optimizer_arch.apply_gradients(zip(grads, sna.arch_parameters))

        return losses

    summary_writer = tf.summary.create_file_writer('./logs/' + cfg['sub_name'])

    print("[*] finished searching for one epoch")

    print("[*] Start sampling architetures")

    prog_bar = ProgressBar(50, 0)

    # Start sampling for 50 archs
    for geno_num in range(50):
        genotype = sna.get_genotype(random_search_flag=True)
        prog_bar.update(f"\n Sampled{geno_num}th arch: {genotype}")
        # print(f"\n Sampled {geno_num}th arch: {genotype}")
        with open(os.path.join('./logs', cfg['sub_name'],
                               'search_random_arch_genotype.py'), 'a') as f:
            f.write(f"\n{cfg['sub_name']}_{geno_num} = {genotype}\n")

    print("Sampling done!")
    # NOTE: leftover debug hook; this blocks until a debugpy client attaches.
    debugpy.wait_for_client()
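Both search scripts use a CrossEntropyLoss helper applied directly to raw logits. A minimal sketch is given below; whether the original applies label smoothing is not shown in these examples, so none is assumed.

import tensorflow as tf

class CrossEntropyLoss:
    """Softmax cross-entropy from logits, for one-hot or sparse labels."""
    def __call__(self, labels, logits):
        if labels.shape.rank == logits.shape.rank:
            # one-hot labels
            loss = tf.nn.softmax_cross_entropy_with_logits(
                labels=tf.cast(labels, logits.dtype), logits=logits)
        else:
            # sparse class-index labels
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=tf.cast(labels, tf.int32), logits=logits)
        return tf.reduce_mean(loss)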
Example #6
def main(_):
    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)
    set_memory_growth()

    cfg = load_yaml(FLAGS.cfg_path)

    # define network
    sna = SearchNetArch(cfg)
    sna.model.summary(line_length=80)
    print("param size = {:f}MB".format(count_parameters_in_MB(sna.model)))

    # load dataset
    t_split = f"train[0%:{int(cfg['train_portion'] * 100)}%]"
    v_split = f"train[{int(cfg['train_portion'] * 100)}%:100%]"
    train_dataset = load_cifar10_dataset(
        cfg['batch_size'],
        split=t_split,
        shuffle=True,
        drop_remainder=True,
        using_normalize=cfg['using_normalize'],
        using_crop=cfg['using_crop'],
        using_flip=cfg['using_flip'],
        using_cutout=cfg['using_cutout'],
        cutout_length=cfg['cutout_length'])
    val_dataset = load_cifar10_dataset(cfg['batch_size'],
                                       split=v_split,
                                       shuffle=True,
                                       drop_remainder=True,
                                       using_normalize=cfg['using_normalize'],
                                       using_crop=cfg['using_crop'],
                                       using_flip=cfg['using_flip'],
                                       using_cutout=cfg['using_cutout'],
                                       cutout_length=cfg['cutout_length'])

    # define optimizer
    steps_per_epoch = int(cfg['dataset_len'] * cfg['train_portion'] //
                          cfg['batch_size'])
    learning_rate = CosineAnnealingLR(initial_learning_rate=cfg['init_lr'],
                                      t_period=cfg['epoch'] * steps_per_epoch,
                                      lr_min=cfg['lr_min'])
    optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate,
                                        momentum=cfg['momentum'])
    optimizer_arch = tf.keras.optimizers.Adam(
        learning_rate=cfg['arch_learning_rate'], beta_1=0.5, beta_2=0.999)

    # define losses function
    criterion = CrossEntropyLoss()

    # load checkpoint
    checkpoint_dir = './checkpoints/' + cfg['sub_name']
    checkpoint = tf.train.Checkpoint(step=tf.Variable(0, name='step'),
                                     optimizer=optimizer,
                                     optimizer_arch=optimizer_arch,
                                     model=sna.model,
                                     alphas_normal=sna.alphas_normal,
                                     alphas_reduce=sna.alphas_reduce,
                                     betas_normal=sna.betas_normal,
                                     betas_reduce=sna.betas_reduce)
    manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                         directory=checkpoint_dir,
                                         max_to_keep=3)
    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        print('[*] load ckpt from {} at step {}.'.format(
            manager.latest_checkpoint, checkpoint.step.numpy()))
    else:
        print("[*] training from scratch.")
    print(f"[*] searching model after {cfg['start_search_epoch']} epochs.")

    # define training step function for model
    @tf.function
    def train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logits = sna.model((inputs, *sna.arch_parameters), training=True)

            losses = {}
            losses['reg'] = tf.reduce_sum(sna.model.losses)
            losses['ce'] = criterion(labels, logits)
            total_loss = tf.add_n([l for l in losses.values()])

        grads = tape.gradient(total_loss, sna.model.trainable_variables)
        grads = [(tf.clip_by_norm(grad, cfg['grad_clip'])) for grad in grads]
        optimizer.apply_gradients(zip(grads, sna.model.trainable_variables))

        return logits, total_loss, losses

    # define training step function for arch_parameters
    @tf.function
    def train_step_arch(inputs, labels):
        with tf.GradientTape() as tape:
            logits = sna.model((inputs, *sna.arch_parameters), training=True)

            losses = {}
            losses['reg'] = cfg['arch_weight_decay'] * tf.add_n(
                [tf.reduce_sum(p**2) for p in sna.arch_parameters])
            losses['ce'] = criterion(labels, logits)
            total_loss = tf.add_n([l for l in losses.values()])

        grads = tape.gradient(total_loss, sna.arch_parameters)
        optimizer_arch.apply_gradients(zip(grads, sna.arch_parameters))

        return losses

    # training loop
    summary_writer = tf.summary.create_file_writer('./logs/' + cfg['sub_name'])
    total_steps = steps_per_epoch * cfg['epoch']
    remain_steps = max(total_steps - checkpoint.step.numpy(), 0)
    prog_bar = ProgressBar(steps_per_epoch,
                           checkpoint.step.numpy() % steps_per_epoch)

    train_acc = AvgrageMeter()
    for inputs, labels in train_dataset.take(remain_steps):
        checkpoint.step.assign_add(1)
        steps = checkpoint.step.numpy()
        epochs = ((steps - 1) // steps_per_epoch) + 1

        if epochs > cfg['start_search_epoch']:
            inputs_val, labels_val = next(iter(val_dataset))
            arch_losses = train_step_arch(inputs_val, labels_val)

        logits, total_loss, losses = train_step(inputs, labels)
        train_acc.update(
            accuracy(logits.numpy(), labels.numpy())[0], cfg['batch_size'])

        prog_bar.update(
            "epoch={:d}/{:d}, loss={:.4f}, acc={:.2f}, lr={:.2e}".format(
                epochs, cfg['epoch'], total_loss.numpy(), train_acc.avg,
                optimizer.lr(steps).numpy()))

        if steps % 10 == 0:
            with summary_writer.as_default():
                tf.summary.scalar('acc/train', train_acc.avg, step=steps)

                tf.summary.scalar('loss/total_loss', total_loss, step=steps)
                for k, l in losses.items():
                    tf.summary.scalar('loss/{}'.format(k), l, step=steps)
                tf.summary.scalar('learning_rate',
                                  optimizer.lr(steps),
                                  step=steps)

                if epochs > cfg['start_search_epoch']:
                    for k, l in arch_losses.items():
                        tf.summary.scalar('arch_losses/{}'.format(k),
                                          l,
                                          step=steps)
                    tf.summary.scalar('arch_learning_rate',
                                      cfg['arch_learning_rate'],
                                      step=steps)

        if steps % cfg['save_steps'] == 0:
            manager.save()
            print("\n[*] save ckpt file at {}".format(
                manager.latest_checkpoint))

        if steps % steps_per_epoch == 0:
            train_acc.reset()
            if epochs > cfg['start_search_epoch']:
                genotype = sna.get_genotype()
                print(f"\nsearch arch: {genotype}")
                with open(os.path.join('./logs', cfg['sub_name'],
                                       'search_arch_genotype.py'), 'a') as f:
                    f.write(f"\n{cfg['sub_name']}_{epochs} = {genotype}\n")

    manager.save()
    print("\n[*] training done! save ckpt file at {}".format(
        manager.latest_checkpoint))
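The NAS scripts above drive SGD with a CosineAnnealingLR schedule: cosine decay from init_lr down to lr_min over t_period steps. A minimal sketch is shown below; the original may clamp or restart differently, so treat this as an assumption-level reference. This version simply holds lr_min once t_period is reached.

import math
import tensorflow as tf

class CosineAnnealingLR(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, initial_learning_rate, t_period, lr_min=0.0):
        super().__init__()
        self.initial_learning_rate = initial_learning_rate
        self.t_period = t_period
        self.lr_min = lr_min

    def __call__(self, step):
        # progress through the annealing period, clipped to [0, 1]
        frac = tf.minimum(tf.cast(step, tf.float32) / self.t_period, 1.0)
        cosine = 0.5 * (1.0 + tf.cos(math.pi * frac))
        return self.lr_min + (self.initial_learning_rate - self.lr_min) * cosine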
Example #7
def main(_):
    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)
    set_memory_growth()

    cfg = load_yaml(FLAGS.cfg_path)

    # define network
    if cfg['network_G']['name'] == 'RRDB':    # ESRGAN 4x
        model = RRDB_Model(None, cfg['ch_size'], cfg['network_G'])
    elif cfg['network_G']['name'] == 'RRDB_CIPLAB':
        model = RRDB_Model_16x(None, cfg['ch_size'], cfg['network_G'])
    elif cfg['network_G']['name'] == 'RFB_ESRGAN':
        model = RFB_Model_16x(None, cfg['ch_size'], cfg['network_G'])
    else:
        raise ValueError(
            "unknown network_G name: {}".format(cfg['network_G']['name']))
    model.summary(line_length=80)

    # load dataset
    train_dataset = load_dataset(cfg, 'train_dataset', shuffle=True)
    set5_dataset = load_val_dataset(cfg, 'set5')
    set14_dataset = load_val_dataset(cfg, 'set14')
    if 'DIV8K' in cfg['test_dataset']:
        DIV8K_val = load_val_dataset(
            cfg, 'DIV8K',
            crop_centor=cfg['test_dataset']['DIV8K_crop_centor'])

    # define optimizer
    learning_rate = MultiStepLR(cfg['lr'], cfg['lr_steps'], cfg['lr_rate'])
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate,
                                         beta_1=cfg['adam_beta1_G'],
                                         beta_2=cfg['adam_beta2_G'])

    # define losses function
    if cfg['cycle_mse']:
        pixel_loss_fn = PixelLossDown(criterion=cfg['pixel_criterion'], scale=cfg['scale'])
    else:
        pixel_loss_fn = PixelLoss(criterion=cfg['pixel_criterion'])
    # load checkpoint
    checkpoint_dir = cfg['log_dir'] + '/checkpoints'
    checkpoint = tf.train.Checkpoint(step=tf.Variable(0, name='step'),
                                     optimizer=optimizer,
                                     model=model)
    manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                         directory=checkpoint_dir,
                                         max_to_keep=3)
    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        print('[*] load ckpt from {} at step {}.'.format(
            manager.latest_checkpoint, checkpoint.step.numpy()))
    else:
        print("[*] training from scratch.")

    # define training step function
    @tf.function
    def train_step(lr, hr):
        with tf.GradientTape() as tape:
            sr = model(lr, training=True)

            losses = {}
            losses['reg'] = tf.reduce_sum(model.losses)
            losses['pixel'] = cfg['w_pixel'] * pixel_loss_fn(hr, sr)
            total_loss = tf.add_n([l for l in losses.values()])

        grads = tape.gradient(total_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        return total_loss, losses

    # training loop
    summary_writer = tf.summary.create_file_writer(cfg['log_dir']+'/logs')
    prog_bar = ProgressBar(cfg['niter'], checkpoint.step.numpy())
    remain_steps = max(cfg['niter'] - checkpoint.step.numpy(), 0)

    for _ in range(remain_steps):
        lr, hr = train_dataset()

        checkpoint.step.assign_add(1)
        steps = checkpoint.step.numpy()

        total_loss, losses = train_step(lr, hr)

        prog_bar.update("loss={:.4f}, lr={:.1e}".format(
            total_loss.numpy(), optimizer.lr(steps).numpy()))

        if steps % 10 == 0:
            with summary_writer.as_default():
                tf.summary.scalar(
                    'loss/total_loss', total_loss, step=steps)
                for k, l in losses.items():
                    tf.summary.scalar('loss/{}'.format(k), l, step=steps)
                tf.summary.scalar(
                    'learning_rate', optimizer.lr(steps), step=steps)

        if steps % cfg['save_steps'] == 0:
            manager.save()
            print("\n[*] save ckpt file at {}".format(
                manager.latest_checkpoint))

            # log results on test data
            set5_logs = evaluate_dataset(set5_dataset, model, cfg)
            set14_logs = evaluate_dataset(set14_dataset, model, cfg)
            if 'DIV8K' in cfg['test_dataset']:
                DIV8K_logs = evaluate_dataset(DIV8K_val, model, cfg)

            with summary_writer.as_default():
                if cfg['logging']['psnr']:
                    tf.summary.scalar('set5/psnr', set5_logs['psnr'], step=steps)
                    tf.summary.scalar('set14/psnr', set14_logs['psnr'], step=steps)
                    if 'DIV8K' in cfg['test_dataset']:
                        tf.summary.scalar('DIV8K/psnr', DIV8K_logs['psnr'], step=steps)

                if cfg['logging']['ssim']:
                    tf.summary.scalar('set5/ssim', set5_logs['ssim'], step=steps)
                    tf.summary.scalar('set14/ssim', set14_logs['ssim'], step=steps)
                    if 'DIV8K' in cfg['test_dataset']:
                        tf.summary.scalar('DIV8K/ssim', DIV8K_logs['ssim'], step=steps)

                if cfg['logging']['lpips']:
                    tf.summary.scalar('set5/lpips', set5_logs['lpips'], step=steps)
                    tf.summary.scalar('set14/lpips', set14_logs['lpips'], step=steps)
                    if 'DIV8K' in cfg['test_dataset']:
                        tf.summary.scalar('DIV8K/lpips', DIV8K_logs['lpips'], step=steps)

                if cfg['logging']['plot_samples']:
                    tf.summary.image("set5/samples", [set5_logs['samples']], step=steps)
                    tf.summary.image("set14/samples", [set14_logs['samples']], step=steps)
                    if 'DIV8K' in cfg['test_dataset']:
                        tf.summary.image("DIV8K/samples", [DIV8K_logs['samples']], step=steps)

    print("\n[*] training done!")
Example #8
def main(_):
    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)
    set_memory_growth()

    cfg = load_yaml(FLAGS.cfg_path)

    # define network
    model = RRDB_Model(cfg['input_size'], cfg['ch_size'], cfg['network_G'])
    model.summary(line_length=80)

    # load dataset
    train_dataset = load_dataset(cfg, 'train_dataset', shuffle=True)

    # define optimizer
    learning_rate = MultiStepLR(cfg['lr'], cfg['lr_steps'], cfg['lr_rate'])
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate,
                                         beta_1=cfg['adam_beta1_G'],
                                         beta_2=cfg['adam_beta2_G'])

    # define losses function
    pixel_loss_fn = PixelLoss(criterion=cfg['pixel_criterion'])

    # load checkpoint
    checkpoint_dir = './checkpoints/' + cfg['sub_name']
    checkpoint = tf.train.Checkpoint(step=tf.Variable(0, name='step'),
                                     optimizer=optimizer,
                                     model=model)
    manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                         directory=checkpoint_dir,
                                         max_to_keep=3)
    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        print('[*] load ckpt from {} at step {}.'.format(
            manager.latest_checkpoint, checkpoint.step.numpy()))
    else:
        print("[*] training from scratch.")

    # define training step function
    @tf.function
    def train_step(lr, hr):
        with tf.GradientTape() as tape:
            sr = model(lr, training=True)

            losses = {}
            losses['reg'] = tf.reduce_sum(model.losses)
            losses['pixel'] = cfg['w_pixel'] * pixel_loss_fn(hr, sr)
            total_loss = tf.add_n([l for l in losses.values()])

        grads = tape.gradient(total_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        return total_loss, losses

    # training loop
    summary_writer = tf.summary.create_file_writer('./logs/' + cfg['sub_name'])
    prog_bar = ProgressBar(cfg['niter'], checkpoint.step.numpy())
    remain_steps = max(cfg['niter'] - checkpoint.step.numpy(), 0)

    for lr, hr in train_dataset.take(remain_steps):
        checkpoint.step.assign_add(1)
        steps = checkpoint.step.numpy()

        total_loss, losses = train_step(lr, hr)

        prog_bar.update("loss={:.4f}, lr={:.1e}".format(
            total_loss.numpy(),
            optimizer.lr(steps).numpy()))

        if steps % 10 == 0:
            with summary_writer.as_default():
                tf.summary.scalar('loss/total_loss', total_loss, step=steps)
                for k, l in losses.items():
                    tf.summary.scalar('loss/{}'.format(k), l, step=steps)
                tf.summary.scalar('learning_rate',
                                  optimizer.lr(steps),
                                  step=steps)

        if steps % cfg['save_steps'] == 0:
            manager.save()
            print("\n[*] save ckpt file at {}".format(
                manager.latest_checkpoint))

    print("\n[*] training done!")
Example #9
def train_retinaface(cfg):

    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    if cfg['distributed']:
        import horovod.tensorflow as hvd
        # Initialize Horovod
        hvd.init()
    else:
        hvd = []
        os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    reset_random_seeds()

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)
    set_memory_growth(hvd)

    # define network
    model = RetinaFaceModel(cfg, training=True)
    model.summary(line_length=80)

    # define prior box
    priors = prior_box((cfg['input_size'], cfg['input_size']),
                       cfg['min_sizes'], cfg['steps'], cfg['clip'])

    # load dataset
    train_dataset = load_dataset(cfg, priors, 'train', hvd)
    if cfg['evaluation_during_training']:
        val_dataset = load_dataset(cfg, priors, 'val', [])

    # define optimizer
    if cfg['distributed']:
        init_lr = cfg['init_lr'] * hvd.size()
        min_lr = cfg['min_lr'] * hvd.size()
        steps_per_epoch = cfg['dataset_len'] // (cfg['batch_size'] * hvd.size())
    else:
        init_lr = cfg['init_lr']
        min_lr = cfg['min_lr']
        steps_per_epoch = cfg['dataset_len'] // cfg['batch_size']

    learning_rate = MultiStepWarmUpLR(
        initial_learning_rate=init_lr,
        lr_steps=[e * steps_per_epoch for e in cfg['lr_decay_epoch']],
        lr_rate=cfg['lr_rate'],
        warmup_steps=cfg['warmup_epoch'] * steps_per_epoch,
        min_lr=min_lr)

    optimizer = tf.keras.optimizers.SGD(
        learning_rate=learning_rate, momentum=0.9, nesterov=True)

    # define losses function
    multi_box_loss = MultiBoxLoss(num_class=cfg['num_class'])

    # load checkpoint
    checkpoint_dir = os.path.join(cfg['output_path'], 'checkpoints', cfg['sub_name'])
    checkpoint = tf.train.Checkpoint(epoch=tf.Variable(0, name='epoch'),
                                     optimizer=optimizer,
                                     model=model)
    manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                         directory=checkpoint_dir,
                                         max_to_keep=3)

    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'cfg.pickle'), 'wb') as handle:
        pickle.dump(cfg, handle, protocol=pickle.HIGHEST_PROTOCOL)

    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        print('[*] load ckpt from {}'.format(manager.latest_checkpoint))
    else:
        print("[*] training from scratch.")

    # define training step function
    @tf.function
    def train_step(inputs, labels, first_batch, epoch):
        with tf.GradientTape() as tape:
            predictions = model(inputs, training=True)

            losses = {}
            losses['reg'] = tf.reduce_sum(model.losses)
            losses['loc'], losses['landm'], losses['class'] = \
                multi_box_loss(labels, predictions)
            total_loss = tf.add_n([l for l in losses.values()])

        if cfg['distributed']:
            # Horovod: add Horovod Distributed GradientTape.
            tape = hvd.DistributedGradientTape(tape)

        grads = tape.gradient(total_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        if cfg['distributed'] and first_batch and epoch:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables(), root_rank=0)

        return total_loss, losses

    def test_step(inputs, img_name):
        _, img_height_raw, img_width_raw, _ = inputs.shape
        # pad input image to avoid unmatched shape problem
        img = inputs[0].numpy()
        # if img_name == '6_Funeral_Funeral_6_618':
        #     resize = 0.5 # this image is too big to avoid OOM problem
        #     img = cv2.resize(img, None, None, fx=resize, fy=resize,
        #                      interpolation=cv2.INTER_LINEAR)
        img, pad_params = pad_input_image(img, max_steps=max(cfg['steps']))
        input_img = img[np.newaxis, ...]
        predictions = model(input_img, training=False)
        outputs = pred_to_outputs(cfg, predictions, input_img.shape).numpy()
        # recover padding effect
        outputs = recover_pad_output(outputs, pad_params)

        bboxs = outputs[:, :4]
        confs = outputs[:, -1]
        pred_boxes = []
        for box, conf in zip(bboxs, confs):
            x = int(box[0] * img_width_raw)
            y = int(box[1] * img_height_raw)
            w = int(box[2] * img_width_raw) - int(box[0] * img_width_raw)
            h = int(box[3] * img_height_raw) - int(box[1] * img_height_raw)
            pred_boxes.append([x, y, w, h, conf])

        pred_boxes = np.array(pred_boxes).astype('float')

        return pred_boxes

    # training loop
    summary_writer = tf.summary.create_file_writer(os.path.join(cfg['output_path'], 'logs', cfg['sub_name']))
    prog_bar = ProgressBar(steps_per_epoch, 0)

    if cfg['evaluation_during_training']:
        widerface_eval_hard = WiderFaceEval(split='hard')

    for epoch in range(cfg['epoch']):
        try:
            actual_epoch = epoch + 1

            if cfg['distributed']:
                if hvd.rank() == 0:
                    print("\nStart of epoch %d" % (actual_epoch,))
            else:
                print("\nStart of epoch %d" % (actual_epoch,))

            checkpoint.epoch.assign_add(1)
            start_time = time.time()

            # Iterate over the batches of the dataset.
            for batch, (x_batch_train, y_batch_train, img_name) in enumerate(train_dataset):
                total_loss, losses = train_step(x_batch_train, y_batch_train, batch == 0, epoch == 0)

                if cfg['distributed']:
                    if hvd.rank() == 0:
                        # prog_bar.update("epoch={}/{}, loss={:.4f}, lr={:.1e}".format(
                        #     checkpoint.epoch.numpy(), cfg['epoch'], total_loss.numpy(), optimizer._decayed_lr(tf.float32)))
                        if batch % 100 == 0:
                            print("batch={}/{},  epoch={}/{}, loss={:.4f}, lr={:.1e}".format(
                                batch, steps_per_epoch, checkpoint.epoch.numpy(), cfg['epoch'], total_loss.numpy(), optimizer._decayed_lr(tf.float32)))
                else:
                    prog_bar.update("epoch={}/{}, loss={:.4f}, lr={:.1e}".format(
                        checkpoint.epoch.numpy(), cfg['epoch'], total_loss.numpy(), optimizer._decayed_lr(tf.float32)))

            # Display metrics at the end of each epoch.
            # train_acc = train_acc_metric.result()
            # print("\nTraining loss over epoch: %.4f" % (float(total_loss.numpy()),))

            if cfg['distributed']:
                if hvd.rank() == 0:
                    print("Time taken: %.2fs" % (time.time() - start_time))
                    manager.save()
                    print("\n[*] save ckpt file at {}".format(manager.latest_checkpoint))
            else:
                print("Time taken: %.2fs" % (time.time() - start_time))
                manager.save()
                print("\n[*] save ckpt file at {}".format(manager.latest_checkpoint))

            if cfg['evaluation_during_training']:
                # Run a validation loop at the end of each epoch.
                for batch, (x_batch_val, y_batch_val, img_name) in enumerate(val_dataset.take(500)):
                    if '/' in img_name.numpy()[0].decode():
                        img_name = img_name.numpy()[0].decode().split('/')[1].split('.')[0]
                    else:
                        img_name = []
                    pred_boxes = test_step(x_batch_val, img_name)
                    gt_boxes = labels_to_boxes(y_batch_val)
                    widerface_eval_hard.update(pred_boxes, gt_boxes, img_name)

                ap_hard = widerface_eval_hard.calculate_ap()
                widerface_eval_hard.reset()

                if cfg['distributed']:
                    if hvd.rank() == 0:
                        print("Validation acc: %.4f" % (float(ap_hard),))
                else:
                    print("Validation acc: %.4f" % (float(ap_hard),))

            def tensorboard_writer():
                with summary_writer.as_default():
                    tf.summary.scalar('loss/total_loss', total_loss, step=actual_epoch)
                    for k, l in losses.items():
                        tf.summary.scalar('loss/{}'.format(k), l, step=actual_epoch)
                    tf.summary.scalar('learning_rate', optimizer._decayed_lr(tf.float32), step=actual_epoch)
                    if cfg['evaluation_during_training']:
                        tf.summary.scalar('Val AP', ap_hard, step=actual_epoch)

            if cfg['distributed']:
                if hvd.rank() == 0:
                    tensorboard_writer()
            else:
                tensorboard_writer()

        except Exception as E:
            # swallow per-epoch errors and continue with the next epoch
            print(E)
            continue

    if cfg['distributed']:
        if hvd.rank() == 0:
            manager.save()
            print("\n[*] training done! save ckpt file at {}".format(
                manager.latest_checkpoint))
    else:
        manager.save()
        print("\n[*] training done! save ckpt file at {}".format(
            manager.latest_checkpoint))
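This distributed script passes hvd into set_memory_growth, unlike the single-GPU examples. A minimal sketch of a helper compatible with that call is below: it enables GPU memory growth and, when Horovod is active, pins each process to its local-rank GPU. The original repository's helper may differ in detail.

import tensorflow as tf

def set_memory_growth(hvd=None):
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        # allocate GPU memory on demand instead of grabbing it all up front
        tf.config.experimental.set_memory_growth(gpu, True)
    if hvd and gpus:
        # one GPU per Horovod process
        tf.config.experimental.set_visible_devices(
            gpus[hvd.local_rank()], 'GPU')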