from lib.fpn.box_utils import bbox_loss import torch.backends.cudnn as cudnn from pycocotools.cocoeval import COCOeval from lib.pytorch_misc import optimistic_restore, clip_grad_norm from torch.optim.lr_scheduler import ReduceLROnPlateau cudnn.benchmark = True conf = ModelConfig() if conf.coco: train, val = CocoDetection.splits() val.ids = val.ids[:conf.val_size] train.ids = train.ids train_loader, val_loader = CocoDataLoader.splits( train, val, batch_size=conf.batch_size, num_workers=conf.num_workers, num_gpus=conf.num_gpus) else: train, val, _ = VG.splits(num_val_im=conf.val_size, filter_non_overlap=False, filter_empty_rels=False, use_proposals=conf.use_proposals) train_loader, val_loader = VGDataLoader.splits( train, val, batch_size=conf.batch_size, num_workers=conf.num_workers, num_gpus=conf.num_gpus) detector = ObjectDetector(
def _build_loaders(args):
    """Build the training dataset and the train/val data loaders.

    When ``args.coco`` is truthy the ``CocoDetection`` splits are used and
    the validation ids are truncated to ``args.val_size`` entries; otherwise
    the ``VG`` splits are used with ``num_val_im=args.val_size``.

    Returns:
        A ``(train, train_loader, val_loader)`` tuple.
    """
    if args.coco:
        train, val = CocoDetection.splits()
        val.ids = val.ids[:args.val_size]
        train_loader, val_loader = CocoDataLoader.splits(
            train, val,
            batch_size=args.batch_size,
            num_workers=args.num_workers,
            num_gpus=args.num_gpus)
    else:
        train, val, _ = VG.splits(
            num_val_im=args.val_size,
            filter_non_overlap=False,
            filter_empty_rels=False,
            use_proposals=args.use_proposals)
        train_loader, val_loader = VGDataLoader.splits(
            train, val,
            batch_size=args.batch_size,
            num_workers=args.num_workers,
            num_gpus=args.num_gpus)
    return train, train_loader, val_loader


def _log_losses(label, losses, history, summary_writer, t):
    """Print one group of losses and record them.

    Args:
        label: Prefix for the console line and the scalar tags
            (``"G"``, ``"D_obj"`` or ``"D_img"``).
        losses: Object whose ``items()`` yields ``(name, value)`` pairs.
        history: Mapping from loss name to a list; each value is appended.
        summary_writer: Writer receiving ``add_scalar("<label>_<name>", ...)``.
        t: Current iteration counter, used as the scalar step.
    """
    formatted = []
    for name, value in losses.items():
        formatted.append('[%s]: %.4f' % (name, value))
        history[name].append(value)
        summary_writer.add_scalar("%s_%s" % (label, name), value, t)
    print("%s: %s" % (label, ", ".join(formatted)))


def _evaluate_and_checkpoint(args, all_in_one_model, train_loader, val_loader,
                             checkpoint, summary_writer, t, epoch):
    """Run ``check_model`` on both loaders, then write two checkpoint files.

    ``<checkpoint_name>_with_model.pt`` holds the full ``checkpoint`` dict;
    ``<checkpoint_name>_no_model.pt`` holds the same dict without any model
    or optimizer state entries. Both are written into ``args.output_dir``.
    """
    print('checking on train')
    _, t_samples = check_model(args, train_loader, all_in_one_model)
    checkpoint['train_samples'].append(t_samples)
    checkpoint['checkpoint_ts'].append(t)
    for name, images in t_samples.items():
        summary_writer.add_image("train_%s" % name, images, t)

    print('checking on val')
    val_losses, val_samples = check_model(args, val_loader, all_in_one_model)
    checkpoint['val_samples'].append(val_samples)
    for name, images in val_samples.items():
        summary_writer.add_image("val_%s" % name, images, t)
    for k, v in val_losses.items():
        checkpoint['val_losses'][k].append(v)
        summary_writer.add_scalar("val_%s" % k, v, t)

    checkpoint['model_state'] = all_in_one_model.model.state_dict()
    if all_in_one_model.obj_discriminator is not None:
        checkpoint['d_obj_state'] = all_in_one_model.obj_discriminator.state_dict()
        checkpoint['d_obj_optim_state'] = all_in_one_model.optimizer_d_obj.state_dict()
    if all_in_one_model.img_discriminator is not None:
        checkpoint['d_img_state'] = all_in_one_model.img_discriminator.state_dict()
        checkpoint['d_img_optim_state'] = all_in_one_model.optimizer_d_img.state_dict()
    checkpoint['optim_state'] = all_in_one_model.optimizer.state_dict()
    checkpoint['counters']['t'] = t
    checkpoint['counters']['epoch'] = epoch

    checkpoint_path = os.path.join(args.output_dir,
                                   '%s_with_model.pt' % args.checkpoint_name)
    print('Saving checkpoint to ', checkpoint_path)
    torch.save(checkpoint, checkpoint_path)

    # Save another checkpoint without any model or optim state
    checkpoint_path = os.path.join(args.output_dir,
                                   '%s_no_model.pt' % args.checkpoint_name)
    key_blacklist = ('model_state', 'optim_state', 'model_best_state',
                     'd_obj_state', 'd_obj_optim_state', 'd_obj_best_state',
                     'd_img_state', 'd_img_optim_state', 'd_img_best_state')
    small_checkpoint = {k: v for k, v in checkpoint.items()
                        if k not in key_blacklist}
    torch.save(small_checkpoint, checkpoint_path)


def _train(args, summary_writer):
    """Build data and model, then train until ``args.num_iterations`` is hit.

    The iteration limit is only checked between epochs, so the epoch in
    which the counter reaches the limit is still run to completion.
    """
    train, train_loader, val_loader = _build_loaders(args)
    print(train.ind_to_classes)

    all_in_one_model = neural_motifs_sg2im_model(args, train.ind_to_classes)
    # Freeze the detector
    for _, param in all_in_one_model.detector.named_parameters():
        param.requires_grad = False
    all_in_one_model.cuda()
    gan_g_loss, gan_d_loss = get_gan_losses(args.gan_loss_type)

    t = all_in_one_model.t
    epoch = all_in_one_model.epoch
    checkpoint = all_in_one_model.checkpoint

    while t < args.num_iterations:
        epoch += 1
        print('Starting epoch %d' % epoch)
        progress = tqdm(train_loader, desc='Training Epoch %d' % epoch,
                        total=len(train_loader))
        for batch in progress:
            if t == args.eval_mode_after:
                print('switching to eval mode')
                all_in_one_model.model.eval()
                # NOTE(review): this optimizer is built over *all* parameters
                # of all_in_one_model; confirm that is intended if the
                # discriminators are registered as its submodules.
                all_in_one_model.optimizer = optim.Adam(
                    all_in_one_model.parameters(), lr=args.learning_rate)
            t += 1

            use_obj_d = all_in_one_model.obj_discriminator is not None
            use_img_d = all_in_one_model.img_discriminator is not None

            with timeit('forward', args.timing):
                # Indexing the model with a batch yields the forward result
                # (presumably through a custom __getitem__ -- confirm against
                # the model class).
                result = all_in_one_model[batch]
            objs = result.objs

            with timeit('loss', args.timing):
                total_loss, losses = calculate_model_losses(
                    args, result.imgs, result.imgs_pred)

            if use_obj_d:
                total_loss = add_loss(
                    total_loss,
                    F.cross_entropy(result.g_obj_scores_fake_crop, objs),
                    losses, 'ac_loss', args.ac_loss_weight)
                weight = args.discriminator_loss_weight * args.d_obj_weight
                total_loss = add_loss(
                    total_loss, gan_g_loss(result.g_scores_fake_crop),
                    losses, 'g_gan_obj_loss', weight)

            if use_img_d:
                weight = args.discriminator_loss_weight * args.d_img_weight
                total_loss = add_loss(
                    total_loss, gan_g_loss(result.g_scores_fake_img),
                    losses, 'g_gan_img_loss', weight)

            losses['total_loss'] = total_loss.item()
            if not math.isfinite(losses['total_loss']):
                # Skip every update, log and checkpoint for this iteration.
                print('WARNING: Got loss = NaN, not backpropping')
                continue

            with timeit('backward', args.timing):
                all_in_one_model.optimizer.zero_grad()
                total_loss.backward()
                all_in_one_model.optimizer.step()

            if use_obj_d:
                with timeit('d_obj loss', args.timing):
                    d_obj_losses = LossManager()
                    d_obj_gan_loss = gan_d_loss(result.d_scores_real_crop,
                                                result.d_scores_fake_crop)
                    d_obj_losses.add_loss(d_obj_gan_loss, 'd_obj_gan_loss')
                    d_obj_losses.add_loss(
                        F.cross_entropy(result.d_obj_scores_real_crop, objs),
                        'd_ac_loss_real')
                    d_obj_losses.add_loss(
                        F.cross_entropy(result.d_obj_scores_fake_crop, objs),
                        'd_ac_loss_fake')
                with timeit('d_obj backward', args.timing):
                    all_in_one_model.optimizer_d_obj.zero_grad()
                    d_obj_losses.total_loss.backward()
                    all_in_one_model.optimizer_d_obj.step()

            if use_img_d:
                with timeit('d_img loss', args.timing):
                    d_img_losses = LossManager()
                    d_img_gan_loss = gan_d_loss(result.d_scores_real_img,
                                                result.d_scores_fake_img)
                    d_img_losses.add_loss(d_img_gan_loss, 'd_img_gan_loss')
                with timeit('d_img backward', args.timing):
                    all_in_one_model.optimizer_d_img.zero_grad()
                    d_img_losses.total_loss.backward()
                    all_in_one_model.optimizer_d_img.step()

            if t % args.print_every == 0:
                print('t = %d / %d' % (t, args.num_iterations))
                _log_losses('G', losses, checkpoint['losses'],
                            summary_writer, t)
                checkpoint['losses_ts'].append(t)
                if use_obj_d:
                    _log_losses('D_obj', d_obj_losses, checkpoint['d_losses'],
                                summary_writer, t)
                if use_img_d:
                    _log_losses('D_img', d_img_losses, checkpoint['d_losses'],
                                summary_writer, t)

            if t % args.checkpoint_every == 0:
                _evaluate_and_checkpoint(
                    args, all_in_one_model, train_loader, val_loader,
                    checkpoint, summary_writer, t, epoch)


def main(args):
    """Entry point: validate ``args``, prepare the output dir, and train.

    Args:
        args: Parsed options. Read here: ``output_dir``; the helpers read
            the dataset, optimisation, logging and checkpoint options.

    The previous version called ``os._exit(0)`` right after loading the
    data, which made the whole training loop unreachable; that debugging
    exit has been removed. The summary writer is now closed even when
    training raises (assumes ``SummaryWriter`` exposes ``close()``, as the
    tensorboardX / torch.utils.tensorboard writers do).
    """
    print(args)
    check_args(args)
    os.makedirs(args.output_dir, exist_ok=True)

    summary_writer = SummaryWriter(args.output_dir)
    try:
        _train(args, summary_writer)
    finally:
        summary_writer.close()