def main(_):
    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu
    set_memory_growth()

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)

    cfg = load_yaml(FLAGS.cfg_path)
    uv_weight_mask = cv2.imread(cfg['uv_weight_mask']) / 255.

    # define network
    generator = PRNet_Model(cfg['input_size'], cfg['ch_size'])
    generator.summary(line_length=80)

    # load dataset
    train_dataset = load_dataset(cfg, shuffle=True,
                                 num_workers=cfg['num_workers'])

    # define optimizer
    learning_rate_G = MultiStepLR(cfg['lr_G'], cfg['lr_steps'], cfg['lr_rate'])
    optimizer_G = tf.keras.optimizers.Adam(learning_rate=learning_rate_G,
                                           beta_1=cfg['adam_beta1_G'],
                                           beta_2=cfg['adam_beta2_G'])

    # define losses function
    loss_fn = WeightedMSE(uv_weight_mask)

    # load checkpoint
    checkpoint_dir = './checkpoints/' + cfg['sub_name']
    checkpoint = tf.train.Checkpoint(step=tf.Variable(0, name='step'),
                                     optimizer_G=optimizer_G,
                                     model=generator)
    manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                         directory=checkpoint_dir,
                                         max_to_keep=3)
    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        print('[*] load ckpt from {} at step {}.'.format(
            manager.latest_checkpoint, checkpoint.step.numpy()))
    else:
        if cfg['pretrain_name'] is not None:
            pretrain_dir = './checkpoints/' + cfg['pretrain_name']
            if tf.train.latest_checkpoint(pretrain_dir):
                checkpoint.restore(tf.train.latest_checkpoint(pretrain_dir))
                checkpoint.step.assign(0)
                print("[*] training from pretrain model {}.".format(
                    pretrain_dir))
            else:
                print(
                    "[*] cannot find pretrain model {}.".format(pretrain_dir))
        else:
            print("[*] training from scratch.")

    # define training step function
    @tf.function
    def train_step(img, pos):
        with tf.GradientTape() as tape_G:
            pre = generator(img, training=True)

            losses_G = {}
            losses_G['pixel'] = loss_fn(pos, pre)
            total_loss_G = tf.add_n([l for l in losses_G.values()])

        grads_G = tape_G.gradient(total_loss_G, generator.trainable_variables)
        optimizer_G.apply_gradients(
            zip(grads_G, generator.trainable_variables))

        return total_loss_G, losses_G

    # training loop
    summary_writer = tf.summary.create_file_writer('./logs/' + cfg['sub_name'])
    niter = int(cfg['train_dataset']['num_samples'] * cfg['epoch'] /
                cfg['batch_size'])
    prog_bar = ProgressBar(niter, checkpoint.step.numpy())
    remain_steps = max(niter - checkpoint.step.numpy(), 0)

    for sample in take(remain_steps, train_dataset):
        checkpoint.step.assign_add(1)
        steps = checkpoint.step.numpy()
        img, pos = sample['Image'], sample['Posmap']

        total_loss_G, losses_G = train_step(img, pos)

        prog_bar.update_gan(total_loss_G.numpy(),
                            optimizer_G.lr(steps).numpy())

        if steps % cfg['log_steps'] == 0:
            with summary_writer.as_default():
                tf.summary.scalar('loss_G/total_loss', total_loss_G,
                                  step=steps)
                for k, l in losses_G.items():
                    tf.summary.scalar('loss_G/{}'.format(k), l, step=steps)
                tf.summary.scalar('learning_rate_G', optimizer_G.lr(steps),
                                  step=steps)

        if steps % cfg['save_steps'] == 0:
            manager.save()
            print('\n[*] save ckpt file at {}'.format(
                manager.latest_checkpoint))

    print("\n[*] training done!")
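# `MultiStepLR` above comes from this repo's own scheduler module and is not
# shown in this snippet. The following is only a minimal sketch, assuming it
# is a `tf.keras.optimizers.schedules.LearningRateSchedule` that can be called
# with the current step (which would explain the `optimizer_G.lr(steps)`
# calls): a piecewise-constant schedule decayed by `lr_rate` at each boundary
# in `lr_steps`. The class name with the `Sketch` suffix is hypothetical.
class MultiStepLRSketch(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, initial_lr, lr_steps, lr_rate):
        super().__init__()
        self.initial_lr = initial_lr   # e.g. 1e-4
        self.lr_steps = lr_steps       # decay boundaries, e.g. [100000, 200000]
        self.lr_rate = lr_rate         # decay factor per boundary, e.g. 0.5

    def __call__(self, step):
        # count how many boundaries have been passed and decay accordingly
        step = tf.cast(step, tf.float32)
        boundaries = tf.constant(self.lr_steps, dtype=tf.float32)
        num_decays = tf.reduce_sum(tf.cast(step >= boundaries, tf.float32))
        return self.initial_lr * tf.pow(self.lr_rate, num_decays)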
def main(_):
    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)

    set_memory_growth()

    cfg = load_yaml(FLAGS.cfg_path)

    # define network
    model = RetinaFaceModel(cfg, training=True)
    model.summary(line_length=80)

    # define prior box
    priors = prior_box((cfg['input_size'], cfg['input_size']),
                       cfg['min_sizes'], cfg['steps'], cfg['clip'])

    # load dataset
    train_dataset = load_dataset(cfg, priors, shuffle=True)

    # define optimizer
    steps_per_epoch = cfg['dataset_len'] // cfg['batch_size']
    learning_rate = MultiStepWarmUpLR(
        initial_learning_rate=cfg['init_lr'],
        lr_steps=[e * steps_per_epoch for e in cfg['lr_decay_epoch']],
        lr_rate=cfg['lr_rate'],
        warmup_steps=cfg['warmup_epoch'] * steps_per_epoch,
        min_lr=cfg['min_lr'])
    optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate,
                                        momentum=0.9, nesterov=True)

    # define losses function
    multi_box_loss = MultiBoxLoss()

    # load checkpoint
    checkpoint_dir = '/content/drive/My Drive/Colab/checkpoints/' + cfg[
        'sub_name']
    checkpoint = tf.train.Checkpoint(step=tf.Variable(0, name='step'),
                                     optimizer=optimizer,
                                     model=model)
    manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                         directory=checkpoint_dir,
                                         max_to_keep=3)
    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        print('[*] load ckpt from {} at step {}.'.format(
            manager.latest_checkpoint, checkpoint.step.numpy()))
    else:
        print("[*] training from scratch.")

    # define training step function
    @tf.function
    def train_step(inputs, labels):
        with tf.GradientTape() as tape:
            predictions = model(inputs, training=True)

            losses = {}
            losses['reg'] = tf.reduce_sum(model.losses)
            losses['loc'], losses['landm'], losses['class'] = \
                multi_box_loss(labels, predictions)
            total_loss = tf.add_n([l for l in losses.values()])

        grads = tape.gradient(total_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        return total_loss, losses

    # training loop
    summary_writer = tf.summary.create_file_writer('./logs/' + cfg['sub_name'])
    remain_steps = max(
        steps_per_epoch * cfg['epoch'] - checkpoint.step.numpy(), 0)
    prog_bar = ProgressBar(steps_per_epoch,
                           checkpoint.step.numpy() % steps_per_epoch)

    for inputs, labels in train_dataset.take(remain_steps):
        checkpoint.step.assign_add(1)
        steps = checkpoint.step.numpy()

        total_loss, losses = train_step(inputs, labels)

        prog_bar.update("epoch={}/{}, loss={:.4f}, lr={:.1e}".format(
            ((steps - 1) // steps_per_epoch) + 1, cfg['epoch'],
            total_loss.numpy(), optimizer.lr(steps).numpy()))

        if steps % 10 == 0:
            with summary_writer.as_default():
                tf.summary.scalar('loss/total_loss', total_loss, step=steps)
                for k, l in losses.items():
                    tf.summary.scalar('loss/{}'.format(k), l, step=steps)
                tf.summary.scalar('learning_rate', optimizer.lr(steps),
                                  step=steps)

        if steps % cfg['save_steps'] == 0:
            manager.save()
            print("\n[*] save ckpt file at {}".format(
                manager.latest_checkpoint))

    manager.save()
    print("\n[*] training done! save ckpt file at {}".format(
        manager.latest_checkpoint))
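# `prior_box` is the repo's anchor generator and its implementation is not
# shown here. Below is only a minimal SSD-style sketch of the idea, under the
# assumption that `min_sizes` is a list of square anchor sizes per feature
# level, `steps` is the stride of each level, and anchor centers/sizes are
# normalized to [0, 1]. The `_sketch` name is hypothetical.
import itertools
import numpy as np

def prior_box_sketch(image_sizes, min_sizes, steps, clip=False):
    img_h, img_w = image_sizes
    anchors = []
    for k, step in enumerate(steps):
        # grid resolution of this feature level
        fm_h, fm_w = int(np.ceil(img_h / step)), int(np.ceil(img_w / step))
        for i, j in itertools.product(range(fm_h), range(fm_w)):
            for min_size in min_sizes[k]:
                cx = (j + 0.5) * step / img_w
                cy = (i + 0.5) * step / img_h
                anchors.append([cx, cy, min_size / img_w, min_size / img_h])
    anchors = np.asarray(anchors, dtype='float32')
    if clip:
        anchors = np.clip(anchors, 0.0, 1.0)
    return anchors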
def main(_):
    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)

    set_memory_growth()

    cfg = load_yaml(FLAGS.cfg_path)

    # define training step function
    @tf.function
    def train_step(inputs, labels, drop_path_prob):
        with tf.GradientTape() as tape:
            logits, logits_aux = model((inputs, drop_path_prob), training=True)

            losses = {}
            losses['reg'] = tf.reduce_sum(model.losses)
            losses['ce'] = criterion(labels, logits)
            losses['ce_auxiliary'] = \
                cfg['auxiliary_weight'] * criterion(labels, logits_aux)
            total_loss = tf.add_n([l for l in losses.values()])

        grads = tape.gradient(total_loss, model.trainable_variables)
        grads = [(tf.clip_by_norm(grad, cfg['grad_clip'])) for grad in grads]
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        return logits, total_loss, losses

    # Used to store the final accuracy for every arch
    final_acc = pd.DataFrame(data=None, columns=['arch_name', 'acc'])

    loop_num = 50
    if Debug:
        # debugpy.wait_for_client()
        loop_num = 1

    # define network
    for arch_num in range(loop_num):
        # read the arch
        arch = str(f"{cfg['sub_name']}_{arch_num}")
        cfg['arch'] = arch
        model = CifarModel(cfg, training=True, file_name=FLAGS.file_name)
        if Debug:
            model.summary(line_length=80)
            print("param size = {:f}MB".format(count_parameters_in_MB(model)))

        # load dataset
        train_dataset = load_cifar10_dataset(
            cfg['batch_size'], split='train', shuffle=True,
            drop_remainder=True, using_normalize=cfg['using_normalize'],
            using_crop=cfg['using_crop'], using_flip=cfg['using_flip'],
            using_cutout=cfg['using_cutout'],
            cutout_length=cfg['cutout_length'])
        val_dataset = load_cifar10_dataset(
            cfg['val_batch_size'], split='test', shuffle=False,
            drop_remainder=False, using_normalize=cfg['using_normalize'],
            using_crop=False, using_flip=False, using_cutout=False)

        # define optimizer
        steps_per_epoch = cfg['dataset_len'] // cfg['batch_size']
        learning_rate = CosineAnnealingLR(
            initial_learning_rate=cfg['init_lr'],
            t_period=cfg['epoch'] * steps_per_epoch, lr_min=cfg['lr_min'])
        optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate,
                                            momentum=cfg['momentum'])

        # define losses function
        criterion = CrossEntropyLoss()

        # load checkpoint
        checkpoint_dir = './checkpoints/' + arch
        checkpoint = tf.train.Checkpoint(step=tf.Variable(0, name='step'),
                                         optimizer=optimizer,
                                         model=model)
        manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                             directory=checkpoint_dir,
                                             max_to_keep=3)
        if manager.latest_checkpoint:
            checkpoint.restore(manager.latest_checkpoint)
            print('[*] load ckpt from {} at step {}.'.format(
                manager.latest_checkpoint, checkpoint.step.numpy()))
        else:
            print("[*] training from scratch.")

        # training loop
        summary_writer = tf.summary.create_file_writer('./logs/' +
                                                       cfg['sub_name'])
        total_steps = steps_per_epoch * cfg['epoch']
        remain_steps = max(total_steps - checkpoint.step.numpy(), 0)
        prog_bar = ProgressBar(steps_per_epoch,
                               checkpoint.step.numpy() % steps_per_epoch)

        train_acc = AvgrageMeter()
        val_acc = AvgrageMeter()
        best_acc = 0.

        for inputs, labels in train_dataset.take(remain_steps):
            checkpoint.step.assign_add(1)
            drop_path_prob = cfg['drop_path_prob'] * (
                tf.cast(checkpoint.step, tf.float32) / total_steps)
            steps = checkpoint.step.numpy()
            epochs = ((steps - 1) // steps_per_epoch) + 1

            logits, total_loss, losses = train_step(inputs, labels,
                                                    drop_path_prob)
            train_acc.update(
                accuracy(logits.numpy(), labels.numpy())[0],
                cfg['batch_size'])

            prog_bar.update(
                "epoch={}/{}, loss={:.4f}, acc={:.2f}, lr={:.2e}".format(
                    epochs, cfg['epoch'], total_loss.numpy(), train_acc.avg,
                    optimizer.lr(steps).numpy()))

            if steps % cfg['val_steps'] == 0 and steps > 1:
                print("\n[*] validate...", end='')
                val_acc.reset()
                for inputs_val, labels_val in val_dataset:
                    logits_val, _ = model((inputs_val, tf.constant([0.])))
                    val_acc.update(
                        accuracy(logits_val.numpy(), labels_val.numpy())[0],
                        inputs_val.shape[0])

                if val_acc.avg > best_acc:
                    best_acc = val_acc.avg
                    model.save_weights(
                        f"checkpoints/{cfg['sub_name']}/best.ckpt")

                val_str = " val acc {:.2f}%, best acc {:.2f}%"
                print(val_str.format(val_acc.avg, best_acc), end='')

            if steps % 10 == 0:
                with summary_writer.as_default():
                    tf.summary.scalar('acc/train', train_acc.avg, step=steps)
                    tf.summary.scalar('acc/val', val_acc.avg, step=steps)
                    tf.summary.scalar('loss/total_loss', total_loss,
                                      step=steps)
                    for k, l in losses.items():
                        tf.summary.scalar('loss/{}'.format(k), l, step=steps)
                    tf.summary.scalar('learning_rate', optimizer.lr(steps),
                                      step=steps)

            if steps % cfg['save_steps'] == 0:
                manager.save()
                print("\n[*] save ckpt file at {}".format(
                    manager.latest_checkpoint))

            if steps % steps_per_epoch == 0:
                train_acc.reset()

        manager.save()
        print("\n[*] training one arch done! save ckpt file at {}".format(
            manager.latest_checkpoint))
        final_acc.loc[arch_num] = list([arch, best_acc])

    print("Whole training ended, the best result is :")
    print("\t", final_acc.iloc[final_acc['acc'].idxmax()])
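# `AvgrageMeter` and `accuracy` (names kept as spelled in the repo) come from
# the DARTS-style utils module and are not defined in this snippet. Below is
# only a minimal sketch of the assumed behaviour: a running average over
# batches, and a top-k accuracy in percent. The `_sketch` names and the
# handling of one-hot labels are assumptions.
import numpy as np

class AvgrageMeterSketch:
    def __init__(self):
        self.reset()

    def reset(self):
        self.avg, self.sum, self.cnt = 0.0, 0.0, 0

    def update(self, val, n=1):
        # `val` is a per-batch statistic, `n` the number of samples it covers
        self.sum += val * n
        self.cnt += n
        self.avg = self.sum / self.cnt


def accuracy_sketch(logits, labels, topk=(1,)):
    # logits: (N, num_classes) scores; labels: (N,) ints or (N, C) one-hot
    if labels.ndim > 1:
        labels = labels.argmax(axis=1)
    results = []
    for k in topk:
        topk_pred = np.argsort(-logits, axis=1)[:, :k]
        correct = (topk_pred == labels.reshape(-1, 1)).any(axis=1)
        results.append(100.0 * correct.mean())
    return results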
def main(_):
    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)

    set_memory_growth()

    cfg = load_yaml(FLAGS.cfg_path)

    # define network
    generator = RRDB_Model(cfg['input_size'], cfg['ch_size'],
                           cfg['network_G'])
    generator.summary(line_length=80)
    discriminator = DiscriminatorVGG128(cfg['gt_size'], cfg['ch_size'])
    discriminator.summary(line_length=80)

    # load dataset
    train_dataset = load_dataset(cfg, 'train_dataset', shuffle=False)

    # define optimizer
    learning_rate_G = MultiStepLR(cfg['lr_G'], cfg['lr_steps'],
                                  cfg['lr_rate'])
    learning_rate_D = MultiStepLR(cfg['lr_D'], cfg['lr_steps'],
                                  cfg['lr_rate'])
    optimizer_G = tf.keras.optimizers.Adam(learning_rate=learning_rate_G,
                                           beta_1=cfg['adam_beta1_G'],
                                           beta_2=cfg['adam_beta2_G'])
    optimizer_D = tf.keras.optimizers.Adam(learning_rate=learning_rate_D,
                                           beta_1=cfg['adam_beta1_D'],
                                           beta_2=cfg['adam_beta2_D'])

    # define losses function
    pixel_loss_fn = PixelLoss(criterion=cfg['pixel_criterion'])
    fea_loss_fn = ContentLoss(criterion=cfg['feature_criterion'])
    gen_loss_fn = GeneratorLoss(gan_type=cfg['gan_type'])
    dis_loss_fn = DiscriminatorLoss(gan_type=cfg['gan_type'])

    # load checkpoint
    checkpoint_dir = './checkpoints/' + cfg['sub_name']
    checkpoint = tf.train.Checkpoint(step=tf.Variable(0, name='step'),
                                     optimizer_G=optimizer_G,
                                     optimizer_D=optimizer_D,
                                     model=generator,
                                     discriminator=discriminator)
    manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                         directory=checkpoint_dir,
                                         max_to_keep=3)
    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        print('[*] load ckpt from {} at step {}.'.format(
            manager.latest_checkpoint, checkpoint.step.numpy()))
    else:
        if cfg['pretrain_name'] is not None:
            pretrain_dir = './checkpoints/' + cfg['pretrain_name']
            if tf.train.latest_checkpoint(pretrain_dir):
                checkpoint.restore(tf.train.latest_checkpoint(pretrain_dir))
                checkpoint.step.assign(0)
                print("[*] training from pretrain model {}.".format(
                    pretrain_dir))
            else:
                print(
                    "[*] cannot find pretrain model {}.".format(pretrain_dir))
        else:
            print("[*] training from scratch.")

    # define training step function
    @tf.function
    def train_step(lr, hr):
        with tf.GradientTape(persistent=True) as tape:
            sr = generator(lr, training=True)
            hr_output = discriminator(hr, training=True)
            sr_output = discriminator(sr, training=True)

            losses_G = {}
            losses_D = {}
            losses_G['reg'] = tf.reduce_sum(generator.losses)
            losses_D['reg'] = tf.reduce_sum(discriminator.losses)
            losses_G['pixel'] = cfg['w_pixel'] * pixel_loss_fn(hr, sr)
            losses_G['feature'] = cfg['w_feature'] * fea_loss_fn(hr, sr)
            losses_G['gan'] = cfg['w_gan'] * gen_loss_fn(hr_output, sr_output)
            losses_D['gan'] = dis_loss_fn(hr_output, sr_output)
            total_loss_G = tf.add_n([l for l in losses_G.values()])
            total_loss_D = tf.add_n([l for l in losses_D.values()])

        grads_G = tape.gradient(total_loss_G, generator.trainable_variables)
        grads_D = tape.gradient(total_loss_D,
                                discriminator.trainable_variables)
        optimizer_G.apply_gradients(
            zip(grads_G, generator.trainable_variables))
        optimizer_D.apply_gradients(
            zip(grads_D, discriminator.trainable_variables))

        return total_loss_G, total_loss_D, losses_G, losses_D

    # training loop
    summary_writer = tf.summary.create_file_writer('./logs/' + cfg['sub_name'])
    prog_bar = ProgressBar(cfg['niter'], checkpoint.step.numpy())
    remain_steps = max(cfg['niter'] - checkpoint.step.numpy(), 0)

    for lr, hr in train_dataset.take(remain_steps):
        checkpoint.step.assign_add(1)
        steps = checkpoint.step.numpy()

        total_loss_G, total_loss_D, losses_G, losses_D = train_step(lr, hr)

        prog_bar.update(
            "loss_G={:.4f}, loss_D={:.4f}, lr_G={:.1e}, lr_D={:.1e}".format(
                total_loss_G.numpy(), total_loss_D.numpy(),
                optimizer_G.lr(steps).numpy(), optimizer_D.lr(steps).numpy()))

        if steps % 10 == 0:
            with summary_writer.as_default():
                tf.summary.scalar('loss_G/total_loss', total_loss_G,
                                  step=steps)
                tf.summary.scalar('loss_D/total_loss', total_loss_D,
                                  step=steps)
                for k, l in losses_G.items():
                    tf.summary.scalar('loss_G/{}'.format(k), l, step=steps)
                for k, l in losses_D.items():
                    tf.summary.scalar('loss_D/{}'.format(k), l, step=steps)
                tf.summary.scalar('learning_rate_G', optimizer_G.lr(steps),
                                  step=steps)
                tf.summary.scalar('learning_rate_D', optimizer_D.lr(steps),
                                  step=steps)

        if steps % cfg['save_steps'] == 0:
            manager.save()
            print("\n[*] save ckpt file at {}".format(
                manager.latest_checkpoint))

    print("\n[*] training done!")
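# `GeneratorLoss` / `DiscriminatorLoss` are defined elsewhere in the repo and
# are parameterized by `gan_type`. For the relativistic average GAN used in
# the original ESRGAN paper (often configured as gan_type='ragan'), a minimal
# sketch of the assumed formulation is shown below; the function name and the
# 0.5 weighting are assumptions, not the repo's exact implementation.
def ragan_losses_sketch(hr_output, sr_output):
    # hr_output / sr_output: raw discriminator logits for real / generated batches
    bce = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    real_rel = hr_output - tf.reduce_mean(sr_output)  # D(real) relative to fakes
    fake_rel = sr_output - tf.reduce_mean(hr_output)  # D(fake) relative to reals

    # discriminator: push real_rel toward 1, fake_rel toward 0
    loss_d = 0.5 * (bce(tf.ones_like(real_rel), real_rel) +
                    bce(tf.zeros_like(fake_rel), fake_rel))
    # generator: the opposite assignment
    loss_g = 0.5 * (bce(tf.zeros_like(real_rel), real_rel) +
                    bce(tf.ones_like(fake_rel), fake_rel))
    return loss_g, loss_d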
def main(_):
    '''
    Train for one epoch to get the supernet, then randomly sample 50
    architectures for finetuning. This structure is basically the same as
    train_search.py.
    TODO: Add PGD here and calculate FSP.
    '''
    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)

    set_memory_growth()

    cfg = load_yaml(FLAGS.cfg_path)

    # define network
    sna = SearchNetArch(cfg)
    sna.model.summary(line_length=80)
    print("param size = {:f}MB".format(count_parameters_in_MB(sna.model)))

    # load dataset
    t_split = f"train[0%:{int(cfg['train_portion'] * 100)}%]"
    v_split = f"train[{int(cfg['train_portion'] * 100)}%:100%]"
    train_dataset = load_cifar10_dataset(
        cfg['batch_size'], split=t_split, shuffle=True, drop_remainder=True,
        using_normalize=cfg['using_normalize'], using_crop=cfg['using_crop'],
        using_flip=cfg['using_flip'], using_cutout=cfg['using_cutout'],
        cutout_length=cfg['cutout_length'])
    val_dataset = load_cifar10_dataset(
        cfg['batch_size'], split=v_split, shuffle=True, drop_remainder=True,
        using_normalize=cfg['using_normalize'], using_crop=cfg['using_crop'],
        using_flip=cfg['using_flip'], using_cutout=cfg['using_cutout'],
        cutout_length=cfg['cutout_length'])

    # define optimizer
    steps_per_epoch = int(cfg['dataset_len'] * cfg['train_portion'] //
                          cfg['batch_size'])
    learning_rate = CosineAnnealingLR(initial_learning_rate=cfg['init_lr'],
                                      t_period=cfg['epoch'] * steps_per_epoch,
                                      lr_min=cfg['lr_min'])
    optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate,
                                        momentum=cfg['momentum'])
    optimizer_arch = tf.keras.optimizers.Adam(
        learning_rate=cfg['arch_learning_rate'], beta_1=0.5, beta_2=0.999)

    # define losses function
    criterion = CrossEntropyLoss()

    # load checkpoint
    checkpoint_dir = './checkpoints/' + cfg['sub_name']
    checkpoint = tf.train.Checkpoint(step=tf.Variable(0, name='step'),
                                     optimizer=optimizer,
                                     optimizer_arch=optimizer_arch,
                                     model=sna.model,
                                     alphas_normal=sna.alphas_normal,
                                     alphas_reduce=sna.alphas_reduce,
                                     betas_normal=sna.betas_normal,
                                     betas_reduce=sna.betas_reduce)
    manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                         directory=checkpoint_dir,
                                         max_to_keep=3)
    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        print('[*] load ckpt from {} at step {}.'.format(
            manager.latest_checkpoint, checkpoint.step.numpy()))
    else:
        print("[*] training from scratch.")
    print(f"[*] searching model after {cfg['start_search_epoch']} epochs.")

    # define training step function for model
    @tf.function
    def train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logits = sna.model((inputs, *sna.arch_parameters), training=True)

            losses = {}
            losses['reg'] = tf.reduce_sum(sna.model.losses)
            losses['ce'] = criterion(labels, logits)
            total_loss = tf.add_n([l for l in losses.values()])

        grads = tape.gradient(total_loss, sna.model.trainable_variables)
        grads = [(tf.clip_by_norm(grad, cfg['grad_clip'])) for grad in grads]
        optimizer.apply_gradients(zip(grads, sna.model.trainable_variables))

        return logits, total_loss, losses

    # define training step function for arch_parameters
    @tf.function
    def train_step_arch(inputs, labels):
        with tf.GradientTape() as tape:
            logits = sna.model((inputs, *sna.arch_parameters), training=True)

            losses = {}
            losses['reg'] = cfg['arch_weight_decay'] * tf.add_n(
                [tf.reduce_sum(p**2) for p in sna.arch_parameters])
            losses['ce'] = criterion(labels, logits)
            total_loss = tf.add_n([l for l in losses.values()])

        grads = tape.gradient(total_loss, sna.arch_parameters)
        optimizer_arch.apply_gradients(zip(grads, sna.arch_parameters))

        return losses

    summary_writer = tf.summary.create_file_writer('./logs/' + cfg['sub_name'])

    print("[*] finished searching for one epoch")
    print("[*] Start sampling architectures")
    prog_bar = ProgressBar(50, 0)

    # Start sampling for 50 archs
    for geno_num in range(50):
        genotype = sna.get_genotype(random_search_flag=True)
        prog_bar.update(f"\n Sampled {geno_num}th arch: {genotype}")
        # print(f"\n Sampled {geno_num}th arch: {genotype}")
        f = open(
            os.path.join('./logs', cfg['sub_name'],
                         'search_random_arch_genotype.py'), 'a')
        f.write(f"\n{cfg['sub_name']}_{geno_num} = {genotype}\n")
        f.close()

    print("Sampling done!")
    debugpy.wait_for_client()
def main(_):
    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)

    set_memory_growth()

    cfg = load_yaml(FLAGS.cfg_path)

    # define network
    sna = SearchNetArch(cfg)
    sna.model.summary(line_length=80)
    print("param size = {:f}MB".format(count_parameters_in_MB(sna.model)))

    # load dataset
    t_split = f"train[0%:{int(cfg['train_portion'] * 100)}%]"
    v_split = f"train[{int(cfg['train_portion'] * 100)}%:100%]"
    train_dataset = load_cifar10_dataset(
        cfg['batch_size'], split=t_split, shuffle=True, drop_remainder=True,
        using_normalize=cfg['using_normalize'], using_crop=cfg['using_crop'],
        using_flip=cfg['using_flip'], using_cutout=cfg['using_cutout'],
        cutout_length=cfg['cutout_length'])
    val_dataset = load_cifar10_dataset(
        cfg['batch_size'], split=v_split, shuffle=True, drop_remainder=True,
        using_normalize=cfg['using_normalize'], using_crop=cfg['using_crop'],
        using_flip=cfg['using_flip'], using_cutout=cfg['using_cutout'],
        cutout_length=cfg['cutout_length'])

    # define optimizer
    steps_per_epoch = int(cfg['dataset_len'] * cfg['train_portion'] //
                          cfg['batch_size'])
    learning_rate = CosineAnnealingLR(initial_learning_rate=cfg['init_lr'],
                                      t_period=cfg['epoch'] * steps_per_epoch,
                                      lr_min=cfg['lr_min'])
    optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate,
                                        momentum=cfg['momentum'])
    optimizer_arch = tf.keras.optimizers.Adam(
        learning_rate=cfg['arch_learning_rate'], beta_1=0.5, beta_2=0.999)

    # define losses function
    criterion = CrossEntropyLoss()

    # load checkpoint
    checkpoint_dir = './checkpoints/' + cfg['sub_name']
    checkpoint = tf.train.Checkpoint(step=tf.Variable(0, name='step'),
                                     optimizer=optimizer,
                                     optimizer_arch=optimizer_arch,
                                     model=sna.model,
                                     alphas_normal=sna.alphas_normal,
                                     alphas_reduce=sna.alphas_reduce,
                                     betas_normal=sna.betas_normal,
                                     betas_reduce=sna.betas_reduce)
    manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                         directory=checkpoint_dir,
                                         max_to_keep=3)
    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        print('[*] load ckpt from {} at step {}.'.format(
            manager.latest_checkpoint, checkpoint.step.numpy()))
    else:
        print("[*] training from scratch.")
    print(f"[*] searching model after {cfg['start_search_epoch']} epochs.")

    # define training step function for model
    @tf.function
    def train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logits = sna.model((inputs, *sna.arch_parameters), training=True)

            losses = {}
            losses['reg'] = tf.reduce_sum(sna.model.losses)
            losses['ce'] = criterion(labels, logits)
            total_loss = tf.add_n([l for l in losses.values()])

        grads = tape.gradient(total_loss, sna.model.trainable_variables)
        grads = [(tf.clip_by_norm(grad, cfg['grad_clip'])) for grad in grads]
        optimizer.apply_gradients(zip(grads, sna.model.trainable_variables))

        return logits, total_loss, losses

    # define training step function for arch_parameters
    @tf.function
    def train_step_arch(inputs, labels):
        with tf.GradientTape() as tape:
            logits = sna.model((inputs, *sna.arch_parameters), training=True)

            losses = {}
            losses['reg'] = cfg['arch_weight_decay'] * tf.add_n(
                [tf.reduce_sum(p**2) for p in sna.arch_parameters])
            losses['ce'] = criterion(labels, logits)
            total_loss = tf.add_n([l for l in losses.values()])

        grads = tape.gradient(total_loss, sna.arch_parameters)
        optimizer_arch.apply_gradients(zip(grads, sna.arch_parameters))

        return losses

    # training loop
    summary_writer = tf.summary.create_file_writer('./logs/' + cfg['sub_name'])
    total_steps = steps_per_epoch * cfg['epoch']
    remain_steps = max(total_steps - checkpoint.step.numpy(), 0)
    prog_bar = ProgressBar(steps_per_epoch,
                           checkpoint.step.numpy() % steps_per_epoch)

    train_acc = AvgrageMeter()

    for inputs, labels in train_dataset.take(remain_steps):
        checkpoint.step.assign_add(1)
        steps = checkpoint.step.numpy()
        epochs = ((steps - 1) // steps_per_epoch) + 1

        if epochs > cfg['start_search_epoch']:
            inputs_val, labels_val = next(iter(val_dataset))
            arch_losses = train_step_arch(inputs_val, labels_val)

        logits, total_loss, losses = train_step(inputs, labels)
        train_acc.update(
            accuracy(logits.numpy(), labels.numpy())[0], cfg['batch_size'])

        prog_bar.update(
            "epoch={:d}/{:d}, loss={:.4f}, acc={:.2f}, lr={:.2e}".format(
                epochs, cfg['epoch'], total_loss.numpy(), train_acc.avg,
                optimizer.lr(steps).numpy()))

        if steps % 10 == 0:
            with summary_writer.as_default():
                tf.summary.scalar('acc/train', train_acc.avg, step=steps)

                tf.summary.scalar('loss/total_loss', total_loss, step=steps)
                for k, l in losses.items():
                    tf.summary.scalar('loss/{}'.format(k), l, step=steps)
                tf.summary.scalar('learning_rate', optimizer.lr(steps),
                                  step=steps)

                if epochs > cfg['start_search_epoch']:
                    for k, l in arch_losses.items():
                        tf.summary.scalar('arch_losses/{}'.format(k), l,
                                          step=steps)
                    tf.summary.scalar('arch_learning_rate',
                                      cfg['arch_learning_rate'], step=steps)

        if steps % cfg['save_steps'] == 0:
            manager.save()
            print("\n[*] save ckpt file at {}".format(
                manager.latest_checkpoint))

        if steps % steps_per_epoch == 0:
            train_acc.reset()

            if epochs > cfg['start_search_epoch']:
                genotype = sna.get_genotype()
                print(f"\nsearch arch: {genotype}")
                f = open(
                    os.path.join('./logs', cfg['sub_name'],
                                 'search_arch_genotype.py'), 'a')
                f.write(f"\n{cfg['sub_name']}_{epochs} = {genotype}\n")
                f.close()

    manager.save()
    print("\n[*] training done! save ckpt file at {}".format(
        manager.latest_checkpoint))
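# `CosineAnnealingLR` used above is the repo's own schedule and is not shown
# in this snippet. A minimal sketch, assuming a single cosine decay from
# `initial_learning_rate` down to `lr_min` over `t_period` steps (the `Sketch`
# suffix marks it as hypothetical, not the repo's implementation):
import math

class CosineAnnealingLRSketch(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, initial_learning_rate, t_period, lr_min=0.0):
        super().__init__()
        self.initial_learning_rate = initial_learning_rate
        self.t_period = t_period
        self.lr_min = lr_min

    def __call__(self, step):
        # lr = lr_min + 0.5 * (lr_max - lr_min) * (1 + cos(pi * t / T))
        t = tf.cast(step, tf.float32) / float(self.t_period)
        cos_term = 0.5 * (1.0 + tf.cos(math.pi * tf.minimum(t, 1.0)))
        return self.lr_min + (self.initial_learning_rate -
                              self.lr_min) * cos_term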
def main(_):
    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)

    set_memory_growth()

    cfg = load_yaml(FLAGS.cfg_path)

    # define network
    if cfg['network_G']['name'] == 'RRDB':  # ESRGAN 4x
        model = RRDB_Model(None, cfg['ch_size'], cfg['network_G'])
    elif cfg['network_G']['name'] == 'RRDB_CIPLAB':
        model = RRDB_Model_16x(None, cfg['ch_size'], cfg['network_G'])
    elif cfg['network_G']['name'] == 'RFB_ESRGAN':
        model = RFB_Model_16x(None, cfg['ch_size'], cfg['network_G'])
    model.summary(line_length=80)

    # load dataset
    train_dataset = load_dataset(cfg, 'train_dataset', shuffle=True)
    set5_dataset = load_val_dataset(cfg, 'set5')
    set14_dataset = load_val_dataset(cfg, 'set14')
    if 'DIV8K' in cfg['test_dataset']:
        DIV8K_val = load_val_dataset(
            cfg, 'DIV8K',
            crop_centor=cfg['test_dataset']['DIV8K_crop_centor'])

    # define optimizer
    learning_rate = MultiStepLR(cfg['lr'], cfg['lr_steps'], cfg['lr_rate'])
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate,
                                         beta_1=cfg['adam_beta1_G'],
                                         beta_2=cfg['adam_beta2_G'])

    # define losses function
    if cfg['cycle_mse']:
        pixel_loss_fn = PixelLossDown(criterion=cfg['pixel_criterion'],
                                      scale=cfg['scale'])
    else:
        pixel_loss_fn = PixelLoss(criterion=cfg['pixel_criterion'])

    # load checkpoint
    checkpoint_dir = cfg['log_dir'] + '/checkpoints'
    checkpoint = tf.train.Checkpoint(step=tf.Variable(0, name='step'),
                                     optimizer=optimizer,
                                     model=model)
    manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                         directory=checkpoint_dir,
                                         max_to_keep=3)
    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        print('[*] load ckpt from {} at step {}.'.format(
            manager.latest_checkpoint, checkpoint.step.numpy()))
    else:
        print("[*] training from scratch.")

    # define training step function
    @tf.function
    def train_step(lr, hr):
        with tf.GradientTape() as tape:
            sr = model(lr, training=True)

            losses = {}
            losses['reg'] = tf.reduce_sum(model.losses)
            losses['pixel'] = cfg['w_pixel'] * pixel_loss_fn(hr, sr)
            total_loss = tf.add_n([l for l in losses.values()])

        grads = tape.gradient(total_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        return total_loss, losses

    # training loop
    summary_writer = tf.summary.create_file_writer(cfg['log_dir'] + '/logs')
    prog_bar = ProgressBar(cfg['niter'], checkpoint.step.numpy())
    remain_steps = max(cfg['niter'] - checkpoint.step.numpy(), 0)

    for _ in range(remain_steps):
        lr, hr = train_dataset()
        checkpoint.step.assign_add(1)
        steps = checkpoint.step.numpy()

        total_loss, losses = train_step(lr, hr)

        prog_bar.update("loss={:.4f}, lr={:.1e}".format(
            total_loss.numpy(), optimizer.lr(steps).numpy()))

        if steps % 10 == 0:
            with summary_writer.as_default():
                tf.summary.scalar('loss/total_loss', total_loss, step=steps)
                for k, l in losses.items():
                    tf.summary.scalar('loss/{}'.format(k), l, step=steps)
                tf.summary.scalar('learning_rate', optimizer.lr(steps),
                                  step=steps)

        if steps % cfg['save_steps'] == 0:
            manager.save()
            print("\n[*] save ckpt file at {}".format(
                manager.latest_checkpoint))

            # log results on test data
            set5_logs = evaluate_dataset(set5_dataset, model, cfg)
            set14_logs = evaluate_dataset(set14_dataset, model, cfg)
            if 'DIV8K' in cfg['test_dataset']:
                DIV8K_logs = evaluate_dataset(DIV8K_val, model, cfg)

            with summary_writer.as_default():
                if cfg['logging']['psnr']:
                    tf.summary.scalar('set5/psnr', set5_logs['psnr'],
                                      step=steps)
                    tf.summary.scalar('set14/psnr', set14_logs['psnr'],
                                      step=steps)
                    if 'DIV8K' in cfg['test_dataset']:
                        tf.summary.scalar('DIV8K/psnr', DIV8K_logs['psnr'],
                                          step=steps)

                if cfg['logging']['ssim']:
                    tf.summary.scalar('set5/ssim', set5_logs['ssim'],
                                      step=steps)
                    tf.summary.scalar('set14/ssim', set14_logs['ssim'],
                                      step=steps)
                    if 'DIV8K' in cfg['test_dataset']:
                        tf.summary.scalar('DIV8K/ssim', DIV8K_logs['ssim'],
                                          step=steps)

                if cfg['logging']['lpips']:
                    tf.summary.scalar('set5/lpips', set5_logs['lpips'],
                                      step=steps)
                    tf.summary.scalar('set14/lpips', set14_logs['lpips'],
                                      step=steps)
                    if 'DIV8K' in cfg['test_dataset']:
                        tf.summary.scalar('DIV8K/lpips', DIV8K_logs['lpips'],
                                          step=steps)

                if cfg['logging']['plot_samples']:
                    tf.summary.image("set5/samples", [set5_logs['samples']],
                                     step=steps)
                    tf.summary.image("set14/samples", [set14_logs['samples']],
                                     step=steps)
                    if 'DIV8K' in cfg['test_dataset']:
                        tf.summary.image("DIV8K/samples",
                                         [DIV8K_logs['samples']], step=steps)

    print("\n[*] training done!")
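# `evaluate_dataset` is defined elsewhere in the repo and returns a dict with
# 'psnr', 'ssim', 'lpips' and 'samples' entries. Below is only a minimal
# sketch of the PSNR/SSIM part, assuming the validation loader yields (lr, hr)
# pairs with pixel values in [0, 255]; the function name and `max_val`
# convention are assumptions, not the repo's exact implementation.
def evaluate_psnr_ssim_sketch(dataset, model, max_val=255.0):
    psnr_vals, ssim_vals = [], []
    for lr, hr in dataset:
        sr = model(lr, training=False)
        sr = tf.clip_by_value(sr, 0.0, max_val)
        # per-image PSNR / SSIM from tf.image, averaged over the whole set
        psnr_vals.append(tf.image.psnr(sr, hr, max_val=max_val))
        ssim_vals.append(tf.image.ssim(sr, hr, max_val=max_val))
    return {'psnr': tf.reduce_mean(tf.concat(psnr_vals, axis=0)),
            'ssim': tf.reduce_mean(tf.concat(ssim_vals, axis=0))}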
def main(_):
    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)

    set_memory_growth()

    cfg = load_yaml(FLAGS.cfg_path)

    # define network
    model = RRDB_Model(cfg['input_size'], cfg['ch_size'], cfg['network_G'])
    model.summary(line_length=80)

    # load dataset
    train_dataset = load_dataset(cfg, 'train_dataset', shuffle=True)

    # define optimizer
    learning_rate = MultiStepLR(cfg['lr'], cfg['lr_steps'], cfg['lr_rate'])
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate,
                                         beta_1=cfg['adam_beta1_G'],
                                         beta_2=cfg['adam_beta2_G'])

    # define losses function
    pixel_loss_fn = PixelLoss(criterion=cfg['pixel_criterion'])

    # load checkpoint
    checkpoint_dir = './checkpoints/' + cfg['sub_name']
    checkpoint = tf.train.Checkpoint(step=tf.Variable(0, name='step'),
                                     optimizer=optimizer,
                                     model=model)
    manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                         directory=checkpoint_dir,
                                         max_to_keep=3)
    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        print('[*] load ckpt from {} at step {}.'.format(
            manager.latest_checkpoint, checkpoint.step.numpy()))
    else:
        print("[*] training from scratch.")

    # define training step function
    @tf.function
    def train_step(lr, hr):
        with tf.GradientTape() as tape:
            sr = model(lr, training=True)

            losses = {}
            losses['reg'] = tf.reduce_sum(model.losses)
            losses['pixel'] = cfg['w_pixel'] * pixel_loss_fn(hr, sr)
            total_loss = tf.add_n([l for l in losses.values()])

        grads = tape.gradient(total_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        return total_loss, losses

    # training loop
    summary_writer = tf.summary.create_file_writer('./logs/' + cfg['sub_name'])
    prog_bar = ProgressBar(cfg['niter'], checkpoint.step.numpy())
    remain_steps = max(cfg['niter'] - checkpoint.step.numpy(), 0)

    for lr, hr in train_dataset.take(remain_steps):
        checkpoint.step.assign_add(1)
        steps = checkpoint.step.numpy()

        total_loss, losses = train_step(lr, hr)

        prog_bar.update("loss={:.4f}, lr={:.1e}".format(
            total_loss.numpy(), optimizer.lr(steps).numpy()))

        if steps % 10 == 0:
            with summary_writer.as_default():
                tf.summary.scalar('loss/total_loss', total_loss, step=steps)
                for k, l in losses.items():
                    tf.summary.scalar('loss/{}'.format(k), l, step=steps)
                tf.summary.scalar('learning_rate', optimizer.lr(steps),
                                  step=steps)

        if steps % cfg['save_steps'] == 0:
            manager.save()
            print("\n[*] save ckpt file at {}".format(
                manager.latest_checkpoint))

    print("\n[*] training done!")
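# `PixelLoss` is the repo's reconstruction loss and is not defined in this
# snippet. A minimal sketch, assuming `criterion` selects between a
# mean-absolute ('l1') and a mean-squared ('l2') error between the HR target
# and the SR output; the `_sketch` name is hypothetical.
def pixel_loss_sketch(criterion='l1'):
    if criterion == 'l1':
        return lambda hr, sr: tf.reduce_mean(tf.abs(hr - sr))
    elif criterion == 'l2':
        return lambda hr, sr: tf.reduce_mean(tf.square(hr - sr))
    else:
        raise NotImplementedError(
            'Loss type {} is not recognized.'.format(criterion))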
def train_retinaface(cfg):
    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    if cfg['distributed']:
        import horovod.tensorflow as hvd

        # Initialize Horovod
        hvd.init()
    else:
        hvd = []
        os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    reset_random_seeds()

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)

    set_memory_growth(hvd)

    # define network
    model = RetinaFaceModel(cfg, training=True)
    model.summary(line_length=80)

    # define prior box
    priors = prior_box((cfg['input_size'], cfg['input_size']),
                       cfg['min_sizes'], cfg['steps'], cfg['clip'])

    # load dataset
    train_dataset = load_dataset(cfg, priors, 'train', hvd)
    if cfg['evaluation_during_training']:
        val_dataset = load_dataset(cfg, priors, 'val', [])

    # define optimizer
    if cfg['distributed']:
        init_lr = cfg['init_lr'] * hvd.size()
        min_lr = cfg['min_lr'] * hvd.size()
        steps_per_epoch = cfg['dataset_len'] // (cfg['batch_size'] *
                                                 hvd.size())
    else:
        init_lr = cfg['init_lr']
        min_lr = cfg['min_lr']
        steps_per_epoch = cfg['dataset_len'] // cfg['batch_size']

    learning_rate = MultiStepWarmUpLR(
        initial_learning_rate=init_lr,
        lr_steps=[e * steps_per_epoch for e in cfg['lr_decay_epoch']],
        lr_rate=cfg['lr_rate'],
        warmup_steps=cfg['warmup_epoch'] * steps_per_epoch,
        min_lr=min_lr)
    optimizer = tf.keras.optimizers.SGD(
        learning_rate=learning_rate, momentum=0.9, nesterov=True)

    # define losses function
    multi_box_loss = MultiBoxLoss(num_class=cfg['num_class'])

    # load checkpoint
    checkpoint_dir = os.path.join(cfg['output_path'], 'checkpoints',
                                  cfg['sub_name'])
    checkpoint = tf.train.Checkpoint(epoch=tf.Variable(0, name='epoch'),
                                     optimizer=optimizer,
                                     model=model)
    manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                         directory=checkpoint_dir,
                                         max_to_keep=3)
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'cfg.pickle'), 'wb') as handle:
        pickle.dump(cfg, handle, protocol=pickle.HIGHEST_PROTOCOL)

    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        print('[*] load ckpt from {}'.format(manager.latest_checkpoint))
    else:
        print("[*] training from scratch.")

    # define training step function
    @tf.function
    def train_step(inputs, labels, first_batch, epoch):
        with tf.GradientTape() as tape:
            predictions = model(inputs, training=True)

            losses = {}
            losses['reg'] = tf.reduce_sum(model.losses)
            losses['loc'], losses['landm'], losses['class'] = \
                multi_box_loss(labels, predictions)
            total_loss = tf.add_n([l for l in losses.values()])

        if cfg['distributed']:
            # Horovod: add Horovod Distributed GradientTape.
            tape = hvd.DistributedGradientTape(tape)

        grads = tape.gradient(total_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        if cfg['distributed'] and first_batch and epoch:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables(), root_rank=0)

        return total_loss, losses

    def test_step(inputs, img_name):
        _, img_height_raw, img_width_raw, _ = inputs.shape
        # pad input image to avoid unmatched shape problem
        img = inputs[0].numpy()
        # if img_name == '6_Funeral_Funeral_6_618':
        #     resize = 0.5  # this image is too big to avoid OOM problem
        #     img = cv2.resize(img, None, None, fx=resize, fy=resize,
        #                      interpolation=cv2.INTER_LINEAR)
        img, pad_params = pad_input_image(img, max_steps=max(cfg['steps']))
        input_img = img[np.newaxis, ...]

        predictions = model(input_img, training=False)
        outputs = pred_to_outputs(cfg, predictions, input_img.shape).numpy()

        # recover padding effect
        outputs = recover_pad_output(outputs, pad_params)

        bboxs = outputs[:, :4]
        confs = outputs[:, -1]

        pred_boxes = []
        for box, conf in zip(bboxs, confs):
            x = int(box[0] * img_width_raw)
            y = int(box[1] * img_height_raw)
            w = int(box[2] * img_width_raw) - int(box[0] * img_width_raw)
            h = int(box[3] * img_height_raw) - int(box[1] * img_height_raw)
            pred_boxes.append([x, y, w, h, conf])

        pred_boxes = np.array(pred_boxes).astype('float')

        return pred_boxes

    # training loop
    summary_writer = tf.summary.create_file_writer(
        os.path.join(cfg['output_path'], 'logs', cfg['sub_name']))
    prog_bar = ProgressBar(steps_per_epoch, 0)

    if cfg['evaluation_during_training']:
        widerface_eval_hard = WiderFaceEval(split='hard')

    for epoch in range(cfg['epoch']):
        try:
            actual_epoch = epoch + 1
            if cfg['distributed']:
                if hvd.rank() == 0:
                    print("\nStart of epoch %d" % (actual_epoch,))
            else:
                print("\nStart of epoch %d" % (actual_epoch,))

            checkpoint.epoch.assign_add(1)
            start_time = time.time()

            # Iterate over the batches of the dataset.
            for batch, (x_batch_train, y_batch_train, img_name) in enumerate(
                    train_dataset):
                total_loss, losses = train_step(x_batch_train, y_batch_train,
                                                batch == 0, epoch == 0)

                if cfg['distributed']:
                    if hvd.rank() == 0:
                        # prog_bar.update("epoch={}/{}, loss={:.4f}, lr={:.1e}".format(
                        #     checkpoint.epoch.numpy(), cfg['epoch'],
                        #     total_loss.numpy(),
                        #     optimizer._decayed_lr(tf.float32)))
                        if batch % 100 == 0:
                            print("batch={}/{}, epoch={}/{}, loss={:.4f}, "
                                  "lr={:.1e}".format(
                                      batch, steps_per_epoch,
                                      checkpoint.epoch.numpy(), cfg['epoch'],
                                      total_loss.numpy(),
                                      optimizer._decayed_lr(tf.float32)))
                else:
                    prog_bar.update(
                        "epoch={}/{}, loss={:.4f}, lr={:.1e}".format(
                            checkpoint.epoch.numpy(), cfg['epoch'],
                            total_loss.numpy(),
                            optimizer._decayed_lr(tf.float32)))

            # Display metrics at the end of each epoch.
            # train_acc = train_acc_metric.result()
            # print("\nTraining loss over epoch: %.4f"
            #       % (float(total_loss.numpy()),))

            if cfg['distributed']:
                if hvd.rank() == 0:
                    print("Time taken: %.2fs" % (time.time() - start_time))
                    manager.save()
                    print("\n[*] save ckpt file at {}".format(
                        manager.latest_checkpoint))
            else:
                print("Time taken: %.2fs" % (time.time() - start_time))
                manager.save()
                print("\n[*] save ckpt file at {}".format(
                    manager.latest_checkpoint))

            if cfg['evaluation_during_training']:
                # Run a validation loop at the end of each epoch.
                for batch, (x_batch_val, y_batch_val, img_name) in enumerate(
                        val_dataset.take(500)):
                    if '/' in img_name.numpy()[0].decode():
                        img_name = img_name.numpy()[0].decode().split(
                            '/')[1].split('.')[0]
                    else:
                        img_name = []

                    pred_boxes = test_step(x_batch_val, img_name)
                    gt_boxes = labels_to_boxes(y_batch_val)
                    widerface_eval_hard.update(pred_boxes, gt_boxes, img_name)

                ap_hard = widerface_eval_hard.calculate_ap()
                widerface_eval_hard.reset()

                if cfg['distributed']:
                    if hvd.rank() == 0:
                        print("Validation acc: %.4f" % (float(ap_hard),))
                else:
                    print("Validation acc: %.4f" % (float(ap_hard),))

            def tensorboard_writer():
                with summary_writer.as_default():
                    tf.summary.scalar('loss/total_loss', total_loss,
                                      step=actual_epoch)
                    for k, l in losses.items():
                        tf.summary.scalar('loss/{}'.format(k), l,
                                          step=actual_epoch)
                    tf.summary.scalar('learning_rate',
                                      optimizer._decayed_lr(tf.float32),
                                      step=actual_epoch)
                    if cfg['evaluation_during_training']:
                        tf.summary.scalar('Val AP', ap_hard,
                                          step=actual_epoch)

            if cfg['distributed']:
                if hvd.rank() == 0:
                    tensorboard_writer()
            else:
                tensorboard_writer()

        except Exception as E:
            print(E)
            continue

    if cfg['distributed']:
        if hvd.rank() == 0:
            manager.save()
            print("\n[*] training done! save ckpt file at {}".format(
                manager.latest_checkpoint))
    else:
        manager.save()
        print("\n[*] training done! save ckpt file at {}".format(
            manager.latest_checkpoint))
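# `set_memory_growth(hvd)` used above is a repo helper that is not shown in
# this snippet. A minimal sketch of the assumed behaviour: enable GPU memory
# growth and, when running under Horovod, pin each process to the GPU matching
# its local rank. The `_sketch` name is hypothetical.
def set_memory_growth_sketch(hvd=None):
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        # allocate GPU memory on demand instead of grabbing it all up front
        tf.config.experimental.set_memory_growth(gpu, True)
    if hvd and gpus:
        # one visible GPU per Horovod process
        tf.config.experimental.set_visible_devices(
            gpus[hvd.local_rank()], 'GPU')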