def train(**kwargs):
    # first free all GPU memory
    t.cuda.empty_cache()

    """ Get options """
    opt = Config()
    print_options(opt)
    # overwrite options from the command line
    for k_, v_ in kwargs.items():
        setattr(opt, k_, v_)
    device = t.device('cuda') if opt.gpu else t.device('cpu')
    # TODO: visualization

    """ Dataset """
    dataset = create_dataset(opt)
    dataset_size = len(dataset)
    iter_per_epoch = int(dataset_size / opt.batch_size)
    print(f'loaded {dataset_size} images for training')

    """ Create Network Instances """
    model_names = ['netG_x', 'netG_y', 'netD_x', 'netD_y']
    netG_x = ResnetGenerator(opt)
    netG_y = ResnetGenerator(opt)
    # print(netG_x)
    netD_x = NLayerDiscriminator(opt)
    netD_y = NLayerDiscriminator(opt)
    # print(netD_x)
    if opt.gpu:
        netG_x.to(device)
        summary(netG_x, input_size=(3, opt.crop_size, opt.crop_size))
        netG_y.to(device)
        netD_x.to(device)
        summary(netD_x, input_size=(3, opt.crop_size, opt.crop_size))
        netD_y.to(device)

    """ Define optimizer and Loss """
    optimizer_g = t.optim.Adam(itertools.chain(netG_x.parameters(), netG_y.parameters()),
                               lr=opt.g_lr, betas=(opt.beta1, 0.999))
    optimizer_d = t.optim.Adam(itertools.chain(netD_x.parameters(), netD_y.parameters()),
                               lr=opt.d_lr, betas=(opt.beta1, 0.999))
    optimizers = [optimizer_g, optimizer_d]

    """
    Forward cycle loss:  lambda_A * ||G_B(G_A(A)) - A|| (Eqn. (2) in the paper)
    Backward cycle loss: lambda_B * ||G_A(G_B(B)) - B|| (Eqn. (2) in the paper)
    Identity loss (optional): lambda_identity * (||G_A(B) - B|| * lambda_B + ||G_B(A) - A|| * lambda_A)
    (Sec 5.2 "Photo generation from paintings" in the paper)
    """
    lambda_X = 10.0         # weight for cycle loss (A -> B -> A^)
    lambda_Y = 10.0         # weight for cycle loss (B -> A -> B^)
    lambda_identity = 0.5

    # define the GAN loss
    # it is an MSELoss() when initialized; it is only evaluated later, during iteration
    # criterionGAN = nn.MSELoss().to(device)
    criterionGAN = GANLoss(gan_mode='lsgan')
    # cycle loss
    criterionCycle = nn.L1Loss()
    # identity loss
    criterionIdt = nn.L1Loss()

    # loss meters
    loss_X_meter = MovingAverageValueMeter(opt.plot_every)
    loss_Y_meter = MovingAverageValueMeter(opt.plot_every)
    score_Dx_real_y = MovingAverageValueMeter(opt.plot_every)
    score_Dx_fake_y = MovingAverageValueMeter(opt.plot_every)
    losses = {}
    scores = {}

    """
    Use identity mapping. Setting lambda_identity other than 0 scales the weight of the
    identity mapping loss. For example, if the weight of the identity loss should be 10
    times smaller than the weight of the reconstruction loss, set lambda_identity = 0.1.
    """

    for epoch in range(opt.max_epochs):
        epoch_start_time = time.time()

        """ calculate losses, gradients, and update network weights; called in every iteration """
        for i, data in enumerate(dataset):
            real_x = data['A'].to(device)
            real_y = data['B'].to(device)

            ######################
            # X -> Y' -> X^ cycle
            ######################
            optimizer_g.zero_grad()  # set g_x and g_y gradients to zero

            fake_y = netG_x(real_x)          # X -> Y'
            prediction = netD_x(fake_y)      # netD_x provides feedback to netG_x
            loss_G_X = criterionGAN(prediction, True)

            # cycle consistency
            x_hat = netG_y(fake_y)           # Y' -> X^
            # Forward cycle loss: || G_y(G_x(real_x)) - real_x ||
            loss_cycle_X = criterionCycle(x_hat, real_x) * lambda_X

            # identity loss
            if lambda_identity > 0:
                # netG_x should be the identity if real_y is fed: ||netG_x(real_y) - real_y||
                idt_x = netG_x(real_y)
                loss_idt_x = criterionIdt(idt_x, real_y) * lambda_Y * lambda_identity
            else:
                loss_idt_x = 0.

            loss_X = loss_G_X + loss_cycle_X + loss_idt_x
            # fake_y is reused (not detached) by the netD_x update below, so the graph is retained
            loss_X.backward(retain_graph=True)
            optimizer_g.step()
            loss_X_meter.add(loss_X.item())

            ######################
            # Y -> X' -> Y^ cycle
            ######################
            optimizer_g.zero_grad()  # set g_x and g_y gradients to zero

            fake_x = netG_y(real_y)          # Y -> X'
            prediction = netD_y(fake_x)
            loss_G_Y = criterionGAN(prediction, True)
            # print(f'loss_G_Y = {round(float(loss_G_Y), 3)}')

            y_hat = netG_x(fake_x)           # Y -> X' -> Y^
            # Backward cycle loss: || G_x(G_y(real_y)) - real_y ||
            loss_cycle_Y = criterionCycle(y_hat, real_y) * lambda_Y

            # identity loss
            if lambda_identity > 0:
                # netG_y should be the identity if real_x is fed: ||netG_y(real_x) - real_x||
                idt_y = netG_y(real_x)
                loss_idt_y = criterionIdt(idt_y, real_x) * lambda_X * lambda_identity
            else:
                loss_idt_y = 0.

            loss_Y = loss_G_Y + loss_cycle_Y + loss_idt_y
            loss_Y.backward(retain_graph=True)
            optimizer_g.step()
            loss_Y_meter.add(loss_Y.item())

            ######################
            # netD_x
            ######################
            optimizer_d.zero_grad()

            # loss_real
            pred_real = netD_x(real_y)
            loss_D_x_real = criterionGAN(pred_real, True)
            score_Dx_real_y.add(float(pred_real.data.mean()))

            # loss_fake
            pred_fake = netD_x(fake_y)
            loss_D_x_fake = criterionGAN(pred_fake, False)
            score_Dx_fake_y.add(float(pred_fake.data.mean()))

            # loss and backward
            loss_D_x = (loss_D_x_real + loss_D_x_fake) * 0.5
            loss_D_x.backward()
            optimizer_d.step()

            ######################
            # netD_y
            ######################
            optimizer_d.zero_grad()

            # loss_real
            pred_real = netD_y(real_x)
            loss_D_y_real = criterionGAN(pred_real, True)

            # loss_fake
            pred_fake = netD_y(fake_x)
            loss_D_y_fake = criterionGAN(pred_fake, False)

            # loss and backward
            loss_D_y = (loss_D_y_real + loss_D_y_fake) * 0.5
            loss_D_y.backward()
            optimizer_d.step()

            # save snapshot
            if i % opt.plot_every == 0:
                filename = opt.name + '_snap_%03d_%05d.png' % (epoch, i)
                test_path = os.path.join(opt.checkpoint_path, filename)
                tv.utils.save_image(fake_y, test_path, normalize=True)
                print(f'{filename} saved.')

                losses['loss_X'] = loss_X_meter.value()[0]
                losses['loss_Y'] = loss_Y_meter.value()[0]
                scores['score_Dx_real_y'] = score_Dx_real_y.value()[0]
                scores['score_Dx_fake_y'] = score_Dx_fake_y.value()[0]
                print(losses)
                print(scores)
            # print(f'iteration {i} finished')

        # save model
        if epoch % opt.save_every == 0 or epoch == opt.max_epochs - 1:
            save_filename = f'{opt.name}_netG_{epoch}.pth'
            save_filepath = os.path.join(opt.model_path, save_filename)
            t.save(netG_x.state_dict(), save_filepath)
            print(f'model saved as {save_filename}')

        # epoch end logs
        epoch_time = int(time.time() - epoch_start_time)
        print_options(opt, epoch_log=True, epoch=epoch, time=epoch_time,
                      losses=losses, scores=scores)
        print()
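# ----------------------------------------------------------------------------
# Note: GANLoss(gan_mode='lsgan') used above comes from elsewhere in this
# project, so only its call signature is visible here. For reference, a
# minimal LSGAN criterion of that shape usually looks like the sketch below;
# the class name GANLossSketch and all details are assumptions, not the
# project's own code.
# ----------------------------------------------------------------------------
import torch
import torch.nn as nn

class GANLossSketch(nn.Module):
    """Wrap an MSE criterion (LSGAN) and build the target tensor on the fly."""

    def __init__(self, gan_mode='lsgan'):
        super().__init__()
        assert gan_mode == 'lsgan', 'this sketch only covers the lsgan branch'
        self.loss = nn.MSELoss()

    def forward(self, prediction, target_is_real):
        # Expand the scalar 1.0 / 0.0 label to the discriminator's output shape.
        if target_is_real:
            target = torch.ones_like(prediction)
        else:
            target = torch.zeros_like(prediction)
        return self.loss(prediction, target)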
def main(config, cuda): device = torch.device("cuda" if cuda and torch.cuda.is_available() else "cpu") if cuda: current_device = torch.cuda.current_device() print("Running on", torch.cuda.get_device_name(current_device)) else: print("Running on CPU") # Configuration CONFIG = Dict(yaml.load(open(config))) # Dataset dataset = CocoStuff10k( root=CONFIG.ROOT, split="train", image_size=513, crop_size=CONFIG.IMAGE.SIZE.TRAIN, scale=True, flip=True, ) # DataLoader loader = torch.utils.data.DataLoader( dataset=dataset, batch_size=CONFIG.BATCH_SIZE, num_workers=CONFIG.NUM_WORKERS, shuffle=True, ) loader_iter = iter(loader) # Model model = DeepLabV2_ResNet101_MSC(n_classes=CONFIG.N_CLASSES) state_dict = torch.load(CONFIG.INIT_MODEL) model.load_state_dict(state_dict, strict=False) # Skip "aspp" layer model = nn.DataParallel(model) model.to(device) # Optimizer optimizer = { "sgd": torch.optim.SGD( # cf lr_mult and decay_mult in train.prototxt params=[ { "params": get_lr_params(model.module, key="1x"), "lr": CONFIG.LR, "weight_decay": CONFIG.WEIGHT_DECAY, }, { "params": get_lr_params(model.module, key="10x"), "lr": 10 * CONFIG.LR, "weight_decay": CONFIG.WEIGHT_DECAY, }, { "params": get_lr_params(model.module, key="20x"), "lr": 20 * CONFIG.LR, "weight_decay": 0.0, }, ], momentum=CONFIG.MOMENTUM, ) }.get(CONFIG.OPTIMIZER) # Loss definition criterion = CrossEntropyLoss2d(ignore_index=CONFIG.IGNORE_LABEL) criterion.to(device) # TensorBoard Logger writer = SummaryWriter(CONFIG.LOG_DIR) loss_meter = MovingAverageValueMeter(20) model.train() model.module.scale.freeze_bn() for iteration in tqdm( range(1, CONFIG.ITER_MAX + 1), total=CONFIG.ITER_MAX, leave=False, dynamic_ncols=True, ): # Set a learning rate poly_lr_scheduler( optimizer=optimizer, init_lr=CONFIG.LR, iter=iteration - 1, lr_decay_iter=CONFIG.LR_DECAY, max_iter=CONFIG.ITER_MAX, power=CONFIG.POLY_POWER, ) # Clear gradients (ready to accumulate) optimizer.zero_grad() iter_loss = 0 for i in range(1, CONFIG.ITER_SIZE + 1): try: data, target = next(loader_iter) except: loader_iter = iter(loader) data, target = next(loader_iter) # Image data = data.to(device) # Propagate forward outputs = model(data) # Loss loss = 0 for output in outputs: # Resize target for {100%, 75%, 50%, Max} outputs target_ = resize_target(target, output.size(2)) target_ = target_.to(device) # Compute crossentropy loss loss += criterion(output, target_) # Backpropagate (just compute gradients wrt the loss) loss /= float(CONFIG.ITER_SIZE) loss.backward() iter_loss += float(loss) loss_meter.add(iter_loss) # Update weights with accumulated gradients optimizer.step() # TensorBoard if iteration % CONFIG.ITER_TF == 0: writer.add_scalar("train_loss", loss_meter.value()[0], iteration) for i, o in enumerate(optimizer.param_groups): writer.add_scalar("train_lr_group{}".format(i), o["lr"], iteration) # for name, param in model.named_parameters(): # name = name.replace('.', '/') # writer.add_histogram(name, param, iteration, bins="auto") # if param.requires_grad: # writer.add_histogram(name + '/grad', param.grad, iteration, bins="auto") # Save a model if iteration % CONFIG.ITER_SNAP == 0: torch.save( model.module.state_dict(), osp.join(CONFIG.SAVE_DIR, "checkpoint_{}.pth".format(iteration)), ) # Save a model if iteration % 100 == 0: torch.save( model.module.state_dict(), osp.join(CONFIG.SAVE_DIR, "checkpoint_current.pth"), ) torch.save( model.module.state_dict(), osp.join(CONFIG.SAVE_DIR, "checkpoint_final.pth") )
def train(config, cuda): # Auto-tune cuDNN torch.backends.cudnn.benchmark = True # Configuration device = get_device(cuda) CONFIG = Dict(yaml.load(open(config))) # Dataset 10k or 164k dataset = get_dataset(CONFIG.DATASET.NAME)( root=CONFIG.DATASET.ROOT, split=CONFIG.DATASET.SPLIT.TRAIN, base_size=CONFIG.IMAGE.SIZE.TRAIN.BASE, crop_size=CONFIG.IMAGE.SIZE.TRAIN.CROP, mean=(CONFIG.IMAGE.MEAN.B, CONFIG.IMAGE.MEAN.G, CONFIG.IMAGE.MEAN.R), warp=CONFIG.DATASET.WARP_IMAGE, scale=CONFIG.DATASET.SCALES, flip=True, ) # DataLoader loader = torch.utils.data.DataLoader( dataset=dataset, batch_size=CONFIG.SOLVER.BATCH_SIZE.TRAIN, num_workers=CONFIG.DATALOADER.NUM_WORKERS, shuffle=True, ) loader_iter = iter(loader) # Model model = setup_model(CONFIG.MODEL.INIT_MODEL, CONFIG.DATASET.N_CLASSES, train=True) model.to(device) # Optimizer optimizer = torch.optim.SGD( # cf lr_mult and decay_mult in train.prototxt params=[ { "params": get_params(model.module, key="1x"), "lr": CONFIG.SOLVER.LR, "weight_decay": CONFIG.SOLVER.WEIGHT_DECAY, }, { "params": get_params(model.module, key="10x"), "lr": 10 * CONFIG.SOLVER.LR, "weight_decay": CONFIG.SOLVER.WEIGHT_DECAY, }, { "params": get_params(model.module, key="20x"), "lr": 20 * CONFIG.SOLVER.LR, "weight_decay": 0.0, }, ], momentum=CONFIG.SOLVER.MOMENTUM, ) # Learning rate scheduler scheduler = PolynomialLR( optimizer=optimizer, step_size=CONFIG.SOLVER.LR_DECAY, iter_max=CONFIG.SOLVER.ITER_MAX, power=CONFIG.SOLVER.POLY_POWER, ) # Loss definition criterion = nn.CrossEntropyLoss(ignore_index=CONFIG.DATASET.IGNORE_LABEL) criterion.to(device) # TensorBoard logger writer = SummaryWriter(CONFIG.SOLVER.LOG_DIR) average_loss = MovingAverageValueMeter(CONFIG.SOLVER.AVERAGE_LOSS) # Freeze the batch norm pre-trained on COCO model.train() model.module.base.freeze_bn() for iteration in tqdm( range(1, CONFIG.SOLVER.ITER_MAX + 1), total=CONFIG.SOLVER.ITER_MAX, leave=False, dynamic_ncols=True, ): # Clear gradients (ready to accumulate) optimizer.zero_grad() loss = 0 for _ in range(CONFIG.SOLVER.ITER_SIZE): try: images, labels = next(loader_iter) except: loader_iter = iter(loader) images, labels = next(loader_iter) images = images.to(device) labels = labels.to(device) # Propagate forward logits = model(images) # Loss iter_loss = 0 for logit in logits: # Resize labels for {100%, 75%, 50%, Max} logits _, _, H, W = logit.shape labels_ = resize_labels(labels, shape=(H, W)) iter_loss += criterion(logit, labels_) # Backpropagate (just compute gradients wrt the loss) iter_loss /= CONFIG.SOLVER.ITER_SIZE iter_loss.backward() loss += float(iter_loss) average_loss.add(loss) # Update weights with accumulated gradients optimizer.step() # Update learning rate scheduler.step(epoch=iteration) # TensorBoard if iteration % CONFIG.SOLVER.ITER_TB == 0: writer.add_scalar("loss/train", average_loss.value()[0], iteration) for i, o in enumerate(optimizer.param_groups): writer.add_scalar("lr/group{}".format(i), o["lr"], iteration) if False: # This produces a large log file for name, param in model.named_parameters(): name = name.replace(".", "/") # Weight/gradient distribution writer.add_histogram(name, param, iteration, bins="auto") if param.requires_grad: writer.add_histogram(name + "/grad", param.grad, iteration, bins="auto") # Save a model if iteration % CONFIG.SOLVER.ITER_SAVE == 0: torch.save( model.module.state_dict(), osp.join(CONFIG.MODEL.SAVE_DIR, "checkpoint_{}.pth".format(iteration)), ) # To verify progress separately torch.save( model.module.state_dict(), osp.join(CONFIG.MODEL.SAVE_DIR, 
"checkpoint_current.pth"), ) torch.save( model.module.state_dict(), osp.join(CONFIG.MODEL.SAVE_DIR, "checkpoint_final.pth"), )
def train(self): torch.cuda.empty_cache() ###################### # Save / Load model ###################### if self.opt.continue_train: try: self.continue_from_latest_checkpoint() except CyganException as e: self.logger.error(e) self.opt.continue_train = False self.reset_save() else: self.reset_save() self.add_file_logger() ###################### # Dataset ###################### if self.opt.model == 'base': dataset = SteelyDataset(self.opt.genreA, self.opt.genreB, self.opt.phase, use_mix=False) else: dataset = SteelyDataset(self.opt.genreA, self.opt.genreB, self.opt.phase, use_mix=True) dataset_size = len(dataset) iter_num = int(dataset_size / self.opt.batch_size) self.logger.info( f'Dataset loaded, genreA: {self.opt.genreA}, genreB: {self.opt.genreB}, total size: {dataset_size}.' ) ###################### # Initiate ###################### lambda_A = 10.0 # weight for cycle loss (A -> B -> A^) lambda_B = 10.0 # weight for cycle loss (B -> A -> B^) lambda_identity = 0.5 criterionGAN = GANLoss(gan_mode='lsgan') criterionCycle = nn.L1Loss() criterionIdt = nn.L1Loss() GLoss_meter = MovingAverageValueMeter(self.opt.plot_every) DLoss_meter = MovingAverageValueMeter(self.opt.plot_every) CycleLoss_meter = MovingAverageValueMeter(self.opt.plot_every) # loss meters losses = {} scores = {} losses_dict = {'loss_G': [], 'loss_D': [], 'loss_C': [], 'epoch': []} ###################### # Start Training ###################### for epoch in range(self.opt.start_epoch, self.opt.max_epoch): loader = DataLoader(dataset, batch_size=self.opt.batch_size, shuffle=True, num_workers=self.opt.num_threads, drop_last=True) epoch_start_time = time.time() for i, data in enumerate(loader): real_A = torch.unsqueeze(data[:, 0, :, :], 1).to(self.device, dtype=torch.float) real_B = torch.unsqueeze(data[:, 1, :, :], 1).to(self.device, dtype=torch.float) gaussian_noise = torch.abs( torch.normal(mean=torch.zeros(self.opt.data_shape), std=self.opt.gaussian_std)).to( self.device, dtype=torch.float) if self.opt.model == 'base': ###################### # Generator ###################### fake_B = self.generator_A2B(real_A) # X -> Y' fake_A = self.generator_B2A(real_B) # Y -> X' fake_B_copy = copy.copy(fake_B) fake_A_copy = copy.copy(fake_A) DB_fake = self.discriminator_B( fake_B + gaussian_noise) # netD_x provide feedback to netG_x DA_fake = self.discriminator_A(fake_A + gaussian_noise) loss_G_A2B = criterionGAN(DB_fake, True) loss_G_B2A = criterionGAN(DA_fake, True) # cycle_consistence cycle_A = self.generator_B2A(fake_B) # Y' -> X^ cycle_B = self.generator_A2B(fake_A) # Y -> X' -> Y^ loss_cycle_A2B = criterionCycle(cycle_A, real_A) * lambda_A loss_cycle_B2A = criterionCycle(cycle_B, real_B) * lambda_B # identity loss if lambda_identity > 0: # netG_x should be identity if real_y is fed: ||netG_x(real_y) - real_y|| idt_A = self.generator_A2B(real_B) idt_B = self.generator_B2A(real_A) loss_idt_A = criterionIdt( idt_A, real_B) * lambda_A * lambda_identity loss_idt_B = criterionIdt( idt_B, real_A) * lambda_A * lambda_identity else: loss_idt_A = 0. loss_idt_B = 0. 
loss_idt = loss_idt_A + loss_idt_B self.GA2B_optimizer.zero_grad( ) # set g_x and g_y gradients to zero loss_A2B = loss_G_A2B + loss_cycle_A2B + loss_idt_A loss_A2B.backward(retain_graph=True) self.GA2B_optimizer.step() self.GB2A_optimizer.zero_grad( ) # set g_x and g_y gradients to zero loss_B2A = loss_G_B2A + loss_cycle_B2A + loss_idt_B loss_B2A.backward(retain_graph=True) self.GB2A_optimizer.step() cycle_loss = loss_cycle_A2B + loss_cycle_B2A CycleLoss_meter.add(cycle_loss.item()) loss_G = loss_G_A2B + loss_G_B2A + loss_idt GLoss_meter.add(loss_G.item()) ###################### # Sample ###################### fake_A_sample, fake_B_sample = (None, None) if self.opt.use_image_pool: [fake_A_sample, fake_B_sample] = self.pool([fake_A_copy, fake_B_copy]) ###################### # Discriminator ###################### # loss_real DA_real = self.discriminator_A(real_A + gaussian_noise) DB_real = self.discriminator_B(real_B + gaussian_noise) loss_DA_real = criterionGAN(DA_real, True) loss_DB_real = criterionGAN(DB_real, True) # loss fake if self.opt.use_image_pool: DA_fake_sample = self.discriminator_A(fake_A_sample + gaussian_noise) DB_fake_sample = self.discriminator_B(fake_B_sample + gaussian_noise) loss_DA_fake = criterionGAN(DA_fake_sample, False) loss_DB_fake = criterionGAN(DB_fake_sample, False) else: loss_DA_fake = criterionGAN(DA_fake, False) loss_DB_fake = criterionGAN(DB_fake, False) # loss and backward self.DA_optimizer.zero_grad() loss_DA = (loss_DA_real + loss_DA_fake) * 0.5 loss_DA.backward() self.DA_optimizer.step() self.DB_optimizer.zero_grad() loss_DB = (loss_DB_real + loss_DB_fake) * 0.5 loss_DB.backward() self.DB_optimizer.step() loss_D = loss_DA + loss_DB DLoss_meter.add(loss_D.item()) else: real_mixed = torch.unsqueeze(data[:, 2, :, :], 1).to(self.device, dtype=torch.float) ###################### # Generator ###################### fake_B = self.generator_A2B(real_A) # X -> Y' fake_A = self.generator_B2A(real_B) # Y -> X' fake_B_copy = fake_B.detach().clone() fake_A_copy = fake_A.detach().clone() DB_fake = self.discriminator_B( fake_B + gaussian_noise) # netD_x provide feedback to netG_x DA_fake = self.discriminator_A(fake_A + gaussian_noise) loss_G_A2B = criterionGAN(DB_fake, True) loss_G_B2A = criterionGAN(DA_fake, True) # cycle_consistence cycle_A = self.generator_B2A(fake_B) # Y' -> X^ cycle_B = self.generator_A2B(fake_A) # Y -> X' -> Y^ loss_cycle_A2B = criterionCycle(cycle_A, real_A) * lambda_A loss_cycle_B2A = criterionCycle(cycle_B, real_B) * lambda_B # identity loss if lambda_identity > 0: # netG_x should be identity if real_y is fed: ||netG_x(real_y) - real_y|| idt_A = self.generator_A2B(real_B) idt_B = self.generator_B2A(real_A) loss_idt_A = criterionIdt( idt_A, real_B) * lambda_A * lambda_identity loss_idt_B = criterionIdt( idt_B, real_A) * lambda_A * lambda_identity else: loss_idt_A = 0. loss_idt_B = 0. 
loss_idt = loss_idt_A + loss_idt_B self.GA2B_optimizer.zero_grad( ) # set g_x and g_y gradients to zero loss_A2B = loss_G_A2B + loss_cycle_A2B + loss_idt_A loss_A2B.backward(retain_graph=True) self.GA2B_optimizer.step() self.GB2A_optimizer.zero_grad( ) # set g_x and g_y gradients to zero loss_B2A = loss_G_B2A + loss_cycle_B2A + loss_idt_B loss_B2A.backward(retain_graph=True) self.GB2A_optimizer.step() cycle_loss = loss_cycle_A2B + loss_cycle_B2A CycleLoss_meter.add(cycle_loss.item()) loss_G = loss_G_A2B + loss_G_B2A + loss_idt GLoss_meter.add(loss_G.item()) ###################### # Sample ###################### fake_A_sample, fake_B_sample = (None, None) if self.opt.use_image_pool: [fake_A_sample, fake_B_sample] = self.pool([fake_A_copy, fake_B_copy]) ###################### # Discriminator ###################### # loss_real DA_real = self.discriminator_A(real_A + gaussian_noise) DB_real = self.discriminator_B(real_B + gaussian_noise) DA_real_all = self.discriminator_A_all(real_mixed + gaussian_noise) DB_real_all = self.discriminator_B_all(real_mixed + gaussian_noise) loss_DA_real = criterionGAN(DA_real, True) loss_DB_real = criterionGAN(DB_real, True) loss_DA_all_real = criterionGAN(DA_real_all, True) loss_DB_all_real = criterionGAN(DB_real_all, True) # loss fake if self.opt.use_image_pool: DA_fake_sample = self.discriminator_A(fake_A_sample + gaussian_noise) DB_fake_sample = self.discriminator_B(fake_B_sample + gaussian_noise) DA_fake_sample_all = self.discriminator_A_all( fake_A_sample + gaussian_noise) DB_fake_sample_all = self.discriminator_B_all( fake_B_sample + gaussian_noise) loss_DA_all_fake = criterionGAN( DA_fake_sample_all, False) loss_DB_all_fake = criterionGAN( DB_fake_sample_all, False) loss_DA_fake = criterionGAN(DA_fake_sample, False) loss_DB_fake = criterionGAN(DB_fake_sample, False) else: DA_fake_all = self.discriminator_A_all(fake_A_copy + gaussian_noise) DB_fake_all = self.discriminator_B_all(fake_B_copy + gaussian_noise) loss_DA_all_fake = criterionGAN(DA_fake_all, False) loss_DB_all_fake = criterionGAN(DB_fake_all, False) loss_DA_fake = criterionGAN(DA_fake, False) loss_DB_fake = criterionGAN(DB_fake, False) # loss and backward self.DA_optimizer.zero_grad() loss_DA = (loss_DA_real + loss_DA_fake) * 0.5 loss_DA.backward() self.DA_optimizer.step() self.DB_optimizer.zero_grad() loss_DB = (loss_DB_real + loss_DB_fake) * 0.5 loss_DB.backward() self.DB_optimizer.step() self.DA_all_optimizer.zero_grad() loss_DA_all = (loss_DA_all_real + loss_DA_all_fake) * 0.5 loss_DA_all.backward() self.DA_all_optimizer.step() self.DB_all_optimizer.zero_grad() loss_DB_all = (loss_DB_all_real + loss_DB_all_fake) * 0.5 loss_DB_all.backward() self.DB_all_optimizer.step() loss_D = loss_DA + loss_DB + loss_DB_all + loss_DA_all DLoss_meter.add(loss_D.item()) ###################### # Snapshot ###################### if i % self.opt.plot_every == 0: file_name = self.opt.name + '_snap_%03d_%05d.png' % ( epoch, i, ) # test_path = os.path.join(self.opt.checkpoint_path, file_name) # tv.utils.save_image(fake_B, test_path, normalize=True) # self.logger.info(f'Snapshot {file_name} saved.') losses['loss_C'] = float(CycleLoss_meter.value()[0]) losses['loss_G'] = float(GLoss_meter.value()[0]) losses['loss_D'] = float(DLoss_meter.value()[0]) self.logger.info(str(losses)) self.logger.info('Epoch {} progress: {:.2%}\n'.format( epoch, i / iter_num)) # save model if epoch % self.opt.save_every == 0 or epoch == self.opt.max_epoch - 1: self.save_model(epoch) ###################### # lr_scheduler 
###################### self.GA2B_scheduler.step(epoch) self.GB2A_scheduler.step(epoch) self.DA_scheduler.step(epoch) self.DB_scheduler.step(epoch) if self.opt.model != 'base': self.DA_all_scheduler.step(epoch) self.DB_all_scheduler.step(epoch) epoch_time = int(time.time() - epoch_start_time) ###################### # Logging ###################### self.logger.info( f'Epoch {epoch} finished, cost time {epoch_time}\n') self.logger.info(str(losses) + '\n\n') ###################### # Loss_Dict ###################### losses_dict['loss_C'].append(losses['loss_C']) losses_dict['loss_G'].append(losses['loss_G']) losses_dict['loss_D'].append(losses['loss_D']) losses_dict['epoch'].append(epoch) with open(self.opt.loss_save_path, 'w') as f: json.dump(losses_dict, f)
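# ----------------------------------------------------------------------------
# Note: self.pool (used when opt.use_image_pool is set) is defined elsewhere;
# it feeds the discriminators a mix of current and previously generated
# samples to reduce oscillation. The single-domain buffer below is the usual
# shape of such a pool; the name, the 50-sample capacity and the 50/50
# replacement rule are assumptions (the project's pool appears to handle both
# domains in a single call).
# ----------------------------------------------------------------------------
import random
import torch

class ImagePoolSketch:
    """Keep up to `pool_size` past fakes and return a mix of old and new ones."""

    def __init__(self, pool_size=50):
        self.pool_size = pool_size
        self.images = []

    def query(self, images):
        out = []
        for image in images:
            image = image.unsqueeze(0)
            if len(self.images) < self.pool_size:
                self.images.append(image)             # buffer not full: keep and return as-is
                out.append(image)
            elif random.random() > 0.5:
                idx = random.randrange(len(self.images))
                out.append(self.images[idx].clone())  # return an old fake ...
                self.images[idx] = image              # ... and store the new one
            else:
                out.append(image)                     # return the new fake unchanged
        return torch.cat(out, 0)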
        i]  # this is to retrieve the data from the dataset (easy method)
        img = img / 255
        loss = model.loss(img, bbox, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_value = loss.cpu().data.numpy()
        avg_loss.add(loss_value)
        ma20_loss.add(float(loss_value))
        print(
            '[epoch:{}] [batch:{}/{}] [sample_loss:{:.4f}] [avg_loss:{:.4f}] [ma20_loss:{:.4f}]'
            .format(epoch, i, len(trainval_dataset), loss_value,
                    avg_loss.value()[0], ma20_loss.value()[0]))

model.eval()
for i in range(len(test_dataset)):
    img, _, _ = test_dataset[i]
    imgx = img / 255
    bbox_out, class_out, prob_out = model.predict(imgx, prob_threshold=0.95)
    vis_bbox(img, bbox_out, class_out, prob_out, label_names=voc_bbox_label_names)
    plt.show()
    fig = plt.gcf()
    fig.set_size_inches(11, 5)
valid_f = open(text_file)
valid_d = valid_f.readlines()
valid_f.close()

train_iter = []
train_loss = []
i = 0
ma_loss = MovingAverageValueMeter(windowsize=500)
for s in train_d:
    i = i + 1
    t = s.strip().split(' ')
    t_iter = int(t[0])
    ma_loss.add(float(t[1]))
    if i % 500 == 0:
        train_iter.append(t_iter)
        train_loss.append(ma_loss.value()[0])

valid_iter = []
valid_loss = []
i = 0
for s in valid_d:
    i = i + 1
    if i >= 0:
        t = s.strip().split(' ')
        t_iter = int(t[0])
        t_loss = float(t[1])
        valid_iter.append(t_iter)
        valid_loss.append(t_loss)

# ==========
# plt.semilogx(x, b, marker='^', linewidth=0.5, color='k')
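# ----------------------------------------------------------------------------
# Note: the smoothing above relies on torchnet's MovingAverageValueMeter:
# add(value) pushes one scalar and value() returns the (mean, std) over the
# last `windowsize` values. A quick self-contained usage check, assuming
# torchnet is installed:
# ----------------------------------------------------------------------------
from torchnet.meter import MovingAverageValueMeter

meter = MovingAverageValueMeter(windowsize=3)
for v in (1.0, 2.0, 3.0, 4.0):
    meter.add(v)
mean, std = meter.value()
print(mean, std)  # mean of the last 3 values added, i.e. 3.0, and their std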
def main(config, cuda): # Configuration with open(config) as f: CONFIG = yaml.load(f) cuda = cuda and torch.cuda.is_available() # Dataset dataset = get_dataset(CONFIG['DATASET'])( root=CONFIG['ROOT'], split='train', image_size=(CONFIG['IMAGE']['SIZE']['TRAIN'], CONFIG['IMAGE']['SIZE']['TRAIN']), scale=True, flip=True, # preload=True ) # DataLoader loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=CONFIG['BATCH_SIZE'], num_workers=CONFIG['NUM_WORKERS'], shuffle=True) loader_iter = iter(loader) # Model model = DeepLabV2_ResNet101_MSC(n_classes=CONFIG['N_CLASSES']) state_dict = torch.load(CONFIG['INIT_MODEL']) model.load_state_dict(state_dict, strict=False) # Skip "aspp" layer if cuda: model.cuda() # Optimizer optimizer = { 'sgd': torch.optim.SGD( params=[ { 'params': get_1x_lr_params(model), 'lr': float(CONFIG['LR']) }, { 'params': get_10x_lr_params(model), 'lr': 10 * float(CONFIG['LR']) } # NOQA ], lr=float(CONFIG['LR']), momentum=float(CONFIG['MOMENTUM']), weight_decay=float(CONFIG['WEIGHT_DECAY'])), }.get(CONFIG['OPTIMIZER']) # Loss definition criterion = CrossEntropyLoss2d(ignore_index=CONFIG['IGNORE_LABEL']) if cuda: criterion.cuda() # TensorBoard Logger writer = SummaryWriter(CONFIG['LOG_DIR']) loss_meter = MovingAverageValueMeter(20) model.train() for iteration in tqdm(range(1, CONFIG['ITER_MAX'] + 1), total=CONFIG['ITER_MAX'], leave=False, dynamic_ncols=True): # Polynomial lr decay poly_lr_scheduler(optimizer=optimizer, init_lr=float(CONFIG['LR']), iter=iteration - 1, lr_decay_iter=CONFIG['LR_DECAY'], max_iter=CONFIG['ITER_MAX'], power=CONFIG['POLY_POWER']) optimizer.zero_grad() iter_loss = 0 for i in range(1, CONFIG['ITER_SIZE'] + 1): data, target = next(loader_iter) # Image data = data.cuda() if cuda else data data = Variable(data) # Forward propagation outputs = model(data) # Label target = resize_target(target, outputs[0].size(2)) target = target.cuda() if cuda else target target = Variable(target) # Aggregate losses for [100%, 75%, 50%, Max] loss = 0 for output in outputs: loss += criterion(output, target) loss /= CONFIG['ITER_SIZE'] iter_loss += loss.data[0] loss.backward() # Reload dataloader if ((iteration - 1) * CONFIG['ITER_SIZE'] + i) % len(loader) == 0: loader_iter = iter(loader) loss_meter.add(iter_loss) # Back propagation optimizer.step() # TensorBoard if iteration % CONFIG['ITER_TF'] == 0: writer.add_scalar('train_loss', loss_meter.value()[0], iteration) # Save a model if iteration % CONFIG['ITER_SNAP'] == 0: torch.save( model.state_dict(), osp.join(CONFIG['SAVE_DIR'], 'checkpoint_{}.pth.tar'.format(iteration))) # NOQA writer.add_text('log', 'Saved a model', iteration) torch.save(model.state_dict(), osp.join(CONFIG['SAVE_DIR'], 'checkpoint_final.pth.tar'))
def main(config, cuda, gpu): # Configuration CONFIG = Dict(yaml.load(open(config))) # CUDA check cuda = cuda and torch.cuda.is_available() if cuda: gpu_ids = [int(string) for string in gpu.split(',')] current_device = torch.cuda.current_device() print('Running on', torch.cuda.get_device_name(current_device), gpu_ids) # Dataset dataset = CocoStuff10k( root=CONFIG.ROOT, split='train', image_size=513, crop_size=CONFIG.IMAGE.SIZE.TRAIN, scale=True, flip=True, ) # DataLoader loader = torch.utils.data.DataLoader( dataset=dataset, batch_size=CONFIG.BATCH_SIZE, num_workers=CONFIG.NUM_WORKERS, shuffle=True, ) loader_iter = iter(loader) # Model model = DeepLabV2_ResNet101_MSC(n_classes=CONFIG.N_CLASSES) state_dict = torch.load(CONFIG.INIT_MODEL) model.load_state_dict(state_dict, strict=False) # Skip "aspp" layer model = nn.DataParallel(model, device_ids=gpu_ids) if cuda: model.cuda() # Optimizer optimizer = { 'sgd': torch.optim.SGD( # cf lr_mult and decay_mult in train.prototxt params=[{ 'params': get_lr_params(model.module, key='1x'), 'lr': CONFIG.LR, 'weight_decay': CONFIG.WEIGHT_DECAY }, { 'params': get_lr_params(model.module, key='10x'), 'lr': 10 * CONFIG.LR, 'weight_decay': CONFIG.WEIGHT_DECAY }, { 'params': get_lr_params(model.module, key='20x'), 'lr': 20 * CONFIG.LR, 'weight_decay': 0.0 }], momentum=CONFIG.MOMENTUM, ), }.get(CONFIG.OPTIMIZER) # Loss definition criterion = CrossEntropyLoss2d(ignore_index=CONFIG.IGNORE_LABEL) if cuda: criterion.cuda() # TensorBoard Logger writer = SummaryWriter(CONFIG.LOG_DIR) loss_meter = MovingAverageValueMeter(20) model.train() model.module.scale.freeze_bn() for iteration in tqdm( range(1, CONFIG.ITER_MAX + 1), total=CONFIG.ITER_MAX, leave=False, dynamic_ncols=True, ): # Set a learning rate poly_lr_scheduler( optimizer=optimizer, init_lr=CONFIG.LR, iter=iteration - 1, lr_decay_iter=CONFIG.LR_DECAY, max_iter=CONFIG.ITER_MAX, power=CONFIG.POLY_POWER, ) # Clear gradients (ready to accumulate) optimizer.zero_grad() iter_loss = 0 for i in range(1, CONFIG.ITER_SIZE + 1): data, target = next(loader_iter) # Image data = data.cuda() if cuda else data data = Variable(data) # Propagate forward outputs = model(data) # Loss loss = 0 for output in outputs: # Resize target for {100%, 75%, 50%, Max} outputs target_ = resize_target(target, output.size(2)) target_ = target_.cuda() if cuda else target_ target_ = Variable(target_) # Compute crossentropy loss loss += criterion(output, target_) # Backpropagate (just compute gradients wrt the loss) loss /= float(CONFIG.ITER_SIZE) loss.backward() iter_loss += loss.data[0] # Reload dataloader if ((iteration - 1) * CONFIG.ITER_SIZE + i) % len(loader) == 0: loader_iter = iter(loader) loss_meter.add(iter_loss) # Update weights with accumulated gradients optimizer.step() # TensorBoard if iteration % CONFIG.ITER_TF == 0: writer.add_scalar('train_loss', loss_meter.value()[0], iteration) for i, o in enumerate(optimizer.param_groups): writer.add_scalar('train_lr_group{}'.format(i), o['lr'], iteration) if iteration % 1000 != 0: continue for name, param in model.named_parameters(): name = name.replace('.', '/') writer.add_histogram(name, param, iteration, bins="auto") if param.requires_grad: writer.add_histogram(name + '/grad', param.grad, iteration, bins="auto") # Save a model if iteration % CONFIG.ITER_SNAP == 0: torch.save( model.module.state_dict(), osp.join(CONFIG.SAVE_DIR, 'checkpoint_{}.pth'.format(iteration)), ) # Save a model if iteration % 100 == 0: torch.save( model.module.state_dict(), osp.join(CONFIG.SAVE_DIR, 
'checkpoint_current.pth'), ) torch.save( model.module.state_dict(), osp.join(CONFIG.SAVE_DIR, 'checkpoint_final.pth'), )
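# ----------------------------------------------------------------------------
# Note: poly_lr_scheduler(...) used above is a project helper that is not
# shown in this excerpt. A plausible sketch with the same keyword arguments
# follows; the 1x/10x/20x multipliers mirror how the optimizer's parameter
# groups are built above, but the exact rule inside the real helper is an
# assumption.
# ----------------------------------------------------------------------------
def poly_lr_scheduler_sketch(optimizer, init_lr, iter, lr_decay_iter, max_iter, power):
    """Polynomially decay the base learning rate of the three parameter groups."""
    if iter % lr_decay_iter != 0 or iter > max_iter:
        return
    new_lr = init_lr * (1 - iter / max_iter) ** power
    for multiplier, group in zip([1, 10, 20], optimizer.param_groups):
        group['lr'] = multiplier * new_lr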
def train(self): torch.cuda.empty_cache() ###################### # Save / Load model ###################### if self.opt.continue_train: try: self.continue_from_latest_checkpoint() except Exception as e: self.logger.error(e) return else: self.reset_save() self.logger.add_file_logger(self.opt.log_path) ###################### # Dataset ###################### dataset = ClassifierDataset(self.opt.genreA, self.opt.genreB, 'train') test_dataset = ClassifierDataset(self.opt.genreA, self.opt.genreB, 'test') dataset_size = len(dataset) iter_num = int(dataset_size / self.opt.batch_size) plot_every = iter_num // 10 self.logger.info( f'Dataset loaded, genreA: {self.opt.genreA}, genreB: {self.opt.genreB}, total size: {dataset_size}.' ) ###################### # Initiate ###################### softmax_criterion = nn.BCELoss() Loss_meter = MovingAverageValueMeter(self.opt.plot_every) losses = {} ###################### # Start Training ###################### test_data = torch.from_numpy(test_dataset.get_data()).to( self.device, dtype=torch.float) gaussian_noise = torch.normal(mean=torch.zeros(test_data.shape), std=self.opt.gaussian_std).to( self.device, dtype=torch.float) # test_data += gaussian_noise real_test_label = torch.from_numpy(test_dataset.get_labels()).view( -1, 2).to(self.device, dtype=torch.float) for epoch in range(self.opt.start_epoch, self.opt.max_epoch): loader = DataLoader(dataset, batch_size=self.opt.batch_size, shuffle=True, num_workers=self.opt.num_threads, drop_last=True) epoch_start_time = time.time() for i, batch in enumerate(loader): data = batch[0].to(self.device, dtype=torch.float) real_label = batch[1].view(self.opt.batch_size, 2).to(self.device, dtype=torch.float) self.classifier_optimizer.zero_grad() estimate_train = self.classifier(data) loss = softmax_criterion(estimate_train, real_label) loss.backward() self.classifier_optimizer.step() Loss_meter.add(loss.item()) # test if i % plot_every == 0: with torch.no_grad(): estimate_test = self.classifier(test_data) estimate_test = nn.functional.softmax(estimate_test, dim=1) test_prediction = torch.argmax(estimate_test, 1).eq( torch.argmax(real_test_label, 1)) test_accuracy = torch.mean( test_prediction.type(torch.float32)).cpu() self.logger.info( 'Epoch {} progress {:.2%}: Loss: {}, Accuracy: {}\n'. format(epoch, i / iter_num, Loss_meter.value()[0], test_accuracy)) if epoch % self.opt.save_every == 0 or epoch == self.opt.max_epoch - 1: self.save_model(epoch) self.classifier_scheduler.step(epoch) epoch_time = int(time.time() - epoch_start_time) self.logger.info( f'Epoch {epoch} finished, cost time {epoch_time}\n')
def train():
    """Create the model and start the training."""
    # === 1. Configuration
    print(CONFIG_PATH)

    # === select which GPU you want to use
    # === here we assume 8 GPUs are available, with indices 0,1,2,3,...,7
    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(map(str, CONFIG.EXP.GPU_IDX))
    device = get_device(torch.cuda.is_available())
    cudnn.benchmark = True

    comment_init = ""
    writer = SummaryWriter(comment=comment_init)

    # Setup loss logger
    # === MovingAverageValueMeter(self, windowsize)
    # ===   - add(value): record a value
    # ===   - reset()
    # ===   - value(): return the moving average and the standard deviation
    average_loss = MovingAverageValueMeter(CONFIG.SOLVER.AVERAGE_LOSS)

    if not os.path.exists(CONFIG.MODEL.SAVE_PATH):
        os.makedirs(CONFIG.MODEL.SAVE_PATH)

    # Path to save models
    checkpoint_dir = os.path.join(
        CONFIG.EXP.OUTPUT_DIR,          # ./data
        "models",
        CONFIG.MODEL.NAME.lower(),      # DeepLabV2_ResNet101_MSC
        CONFIG.DATASET.SPLIT.TRAIN,     # train_aug
    )
    # === checkpoint_dir: ./data/DeepLabV2_ResNet101_MSC/train_aug
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    print("Checkpoint dst:", checkpoint_dir)

    # === 2. Dataloader ===
    trainloader = data.DataLoader(
        VOCDataSet(
            CONFIG.DATASET.DIRECTORY,
            CONFIG.DATASET.LIST_PATH,
            max_iters=CONFIG.SOLVER.ITER_MAX * CONFIG.SOLVER.BATCH_SIZE.TRAIN,
            crop_size=(CONFIG.IMAGE.SIZE.TRAIN, CONFIG.IMAGE.SIZE.TRAIN),
            scale=CONFIG.DATASET.RANDOM.SCALE,
            mirror=CONFIG.DATASET.RANDOM.MIRROR,
            mean=IMG_MEAN,
            label_path=CONFIG.DATASET.SEG_LABEL),  # for training
        batch_size=CONFIG.SOLVER.BATCH_SIZE.TRAIN,
        shuffle=True,
        num_workers=CONFIG.DATALOADER.NUM_WORKERS,
        pin_memory=True)
    # iter(dataloader) returns an iterator; batches can be fetched with next()
    # loader_iter = iter(trainloader)

    # === 3. Create network & weights ===
    print("Model:", CONFIG.MODEL.NAME)
    # model = DeepLabV2_ResNet101_MSC(n_classes=CONFIG.DATASET.N_CLASSES)
    model = DeepLabV2_DRN105_MSC(n_classes=CONFIG.DATASET.N_CLASSES)
    state_dict = torch.load(CONFIG.MODEL.INIT_MODEL)
    # model.base.load_state_dict(state_dict, strict=False)  # to skip ASPP
    print(" Init:", CONFIG.MODEL.INIT_MODEL)
    # === show the skipped weights
    for m in model.base.state_dict().keys():
        if m not in state_dict.keys():
            print(" Skip init:", m)
    # === DeepLabv2 = ResNet101 + ASPP
    # === model.base = DeepLabv2
    # === model = MSC(DeepLabv2)
    # model.base.load_state_dict(state_dict,
    #                            strict=False)  # strict=False to skip ASPP
    model = nn.DataParallel(model)  # multi-GPU
    model.to(device)                # put on GPU if available

    # === 4. Loss definition
    criterion = nn.CrossEntropyLoss(ignore_index=CONFIG.DATASET.IGNORE_LABEL)
    criterion.to(device)            # put on GPU if available

    # === 5. Optimizer ===
    optimizer = torch.optim.SGD(
        # cf lr_mult and decay_mult in train.prototxt
        params=[
            {
                "params": get_params(model.module, key="1x"),
                "lr": CONFIG.SOLVER.LR,
                "weight_decay": CONFIG.SOLVER.WEIGHT_DECAY,
            },
            {
                "params": get_params(model.module, key="10x"),
                "lr": 10 * CONFIG.SOLVER.LR,
                "weight_decay": CONFIG.SOLVER.WEIGHT_DECAY,
            },
            {
                "params": get_params(model.module, key="20x"),
                "lr": 20 * CONFIG.SOLVER.LR,
                "weight_decay": 0.0,
            },
        ],
        momentum=CONFIG.SOLVER.MOMENTUM,
    )

    # Learning rate scheduler
    scheduler = PolynomialLR(
        optimizer=optimizer,
        step_size=CONFIG.SOLVER.LR_DECAY,
        iter_max=CONFIG.SOLVER.ITER_MAX,
        power=CONFIG.SOLVER.POLY_POWER,
    )

    time_start = time.time()  # set start time

    # === training iteration ===
    for i_iter, batch in enumerate(trainloader, start=1):
        torch.set_grad_enabled(True)
        model.train()
        model.module.base.freeze_bn()
        optimizer.zero_grad()

        images, labels, _, _ = batch
        logits = model(images.to(device))  # <<<<<<<<<<<<<<<<<<<<

        # === Loss
        # === logits = [logits] + logits_pyramid + [logits_max]
        iter_loss = 0
        loss = 0
        for logit in logits:
            # Resize labels for {100%, 75%, 50%, Max} logits
            _, _, H, W = logit.shape
            labels_ = resize_labels(labels, size=(H, W))
            iter_loss += criterion(logit, labels_.to(device))

        # iter_loss /= CONFIG.SOLVER.ITER_SIZE
        iter_loss /= 4
        iter_loss.backward()
        loss += float(iter_loss)
        average_loss.add(loss)

        # Update weights with accumulated gradients
        optimizer.step()

        # Update learning rate
        scheduler.step(epoch=i_iter)

        # TensorBoard
        writer.add_scalar("loss", average_loss.value()[0], global_step=i_iter)

        print(
            'iter/max_iter = [{}/{}] completed, loss = {:4.3} time:{}'.format(
                i_iter, CONFIG.SOLVER.ITER_MAX, average_loss.value()[0],
                show_timing(time_start, time.time())))
        # print('iter = ', i_iter, 'of', args.num_steps, '',
        #       loss.data.cpu().numpy())

        # === save final model
        if i_iter >= CONFIG.SOLVER.ITER_MAX:
            print('save final model as...{}'.format(
                osp.join(CONFIG.MODEL.SAVE_PATH,
                         'VOC12_' + str(CONFIG.SOLVER.ITER_MAX) + '.pth')))
            torch.save(
                model.module.state_dict(),
                osp.join(CONFIG.MODEL.SAVE_PATH,
                         'VOC12_' + str(CONFIG.SOLVER.ITER_MAX) + '.pth'))
            break

        if i_iter % CONFIG.EXP.EVALUATE_ITER == 0:
            print("Evaluation....")
            evaluate_gpu(model, writer, i_iter)

        # === Save the model every 250 iterations (CONFIG.MODEL.SAVE_EVERY_ITER)
        # DataParallel prefixes every layer name with 'module.', so use
        # model.module.state_dict() here.
        # ======================================================================
        if i_iter % CONFIG.MODEL.SAVE_EVERY_ITER == 0:
            print('saving model ...')
            torch.save(
                model.module.state_dict(),
                osp.join(CONFIG.MODEL.SAVE_PATH,
                         'VOC12_{}.pth'.format(i_iter)))
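# ----------------------------------------------------------------------------
# Note: resize_labels(labels, size=(H, W)) shrinks the ground-truth maps to
# each logit resolution; it has to use nearest-neighbour sampling so that
# class ids are never blended. A sketch of such a helper is given below; the
# uint8 conversion (fewer than 256 classes, ignore label 255) is an assumption.
# ----------------------------------------------------------------------------
import numpy as np
import torch
from PIL import Image

def resize_labels_sketch(labels, size):
    """Nearest-neighbour resize of a batch of integer label maps to (H, W)."""
    resized = []
    for label in labels:
        label = Image.fromarray(label.cpu().numpy().astype(np.uint8))
        label = label.resize((size[1], size[0]), resample=Image.NEAREST)  # PIL takes (W, H)
        resized.append(np.asarray(label))
    return torch.LongTensor(np.stack(resized))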
def train(**kwargs): opt._parse(kwargs) image_folder_path = 'DataSets/images/' cvs_file_path = 'DataSets/labels.csv' dataset = DataSets(cvs_file_path, image_folder_path) data_size = len(dataset) indices = list(range(data_size)) split = int(np.floor(data_size * 0.2)) np.random.seed(42) np.random.shuffle(indices) train_indices, val_indices = indices[split:], indices[:split] train_sampler = torch.utils.data.SubsetRandomSampler(train_indices) valid_sampler = torch.utils.data.SubsetRandomSampler(val_indices) train_loader = torch.utils.data.DataLoader(dataset, batch_size=1, sampler=train_sampler) val_loader = torch.utils.data.DataLoader(dataset, batch_size=1, sampler=valid_sampler) print('load data') avg_loss = AverageValueMeter() ma20_loss = MovingAverageValueMeter(windowsize=20) faster_rcnn = FasterRCNNVGG16() print('model construct completed') start_epoch = 0 best_map = -100 trainer = FasterRCNNTrainer(faster_rcnn).cuda() optimizer = optim.SGD(trainer.faster_rcnn.parameters(), lr=opt.lr, momentum=0.9) scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5) if opt.load_path: print('load pretrained model from %s' % opt.load_path) checkpoint = torch.load(opt.load_path) start_epoch = checkpoint['epoch'] best_map = checkpoint['best_map'] trainer.faster_rcnn.load_state_dict(checkpoint['model_state']) optimizer.load_state_dict(checkpoint['optimizer_state']) print("> Loaded checkpoint '{}' (epoch {})".format( args.resume, start_epoch)) #trainer.vis.text(dataset.db.label_names, win='labels') # set tensor-board for visualization writer = SummaryWriter('runs/' + opt.log_root) for epoch in range(start_epoch, opt.epoch): trainer.train(mode=True) #must set as that in tranning for ii, (img, _, _, bbox_, label_, scale, _) in enumerate(train_loader): scale = at.scalar(scale) img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda() optimizer.zero_grad() loss = trainer.forward(img, bbox, label, scale) loss.total_loss.backward() optimizer.step() #print(loss) #print(loss.total_loss) loss_value = loss.total_loss.cpu().data.numpy() avg_loss.add(float(loss_value)) ma20_loss.add(float(loss_value)) print( '[epoch:{}/{}] [batch:{}/{}] [sample_loss:{:.4f}] [avg_loss:{:.4f}] [ma20_loss:{:.4f}]' .format(epoch, opt.epoch, ii + 1, len(train_loader), loss.total_loss.data, avg_loss.value()[0], ma20_loss.value()[0])) if (ii + 1) % opt.plot_every == 0: niter = epoch * len(train_loader) + ii writer.add_scalar('Train/Loss', ma20_loss.value()[0], niter) eval_result = eval(val_loader, faster_rcnn, test_num=opt.test_num) print(eval_result['map']) if eval_result['map'] > best_map: best_map = eval_result['map'] state = { "epoch": epoch + 1, "best_map": best_map, "model_state": trainer.faster_rcnn.state_dict(), "optimizer_state": optimizer.state_dict() } torch.save(state, opt.model_para) scheduler.step() state = { "epoch": epoch + 1, "best_map": best_map, "model_state": trainer.faster_rcnn.state_dict(), "optimizer_state": optimizer.state_dict() } torch.save(state, 'last_epoch.pkl') writer.close()
def main(config, cuda, excludeval, embedding, continue_from, nolog, inputmix, imagedataset, experimentid, nshot, ishot): frame = inspect.currentframe() args, _, _, values = inspect.getargvalues(frame) #print(values) #in case you want to save to the location of script you're running datadir = os.path.join( '/home/SharedData/omkar/zscoseg/yash_manas/data/datasets', imagedataset) if not nolog: #name the savedir, might add logs/ before the datetime for clarity if experimentid is None: savedir = time.strftime('%Y%m%d%H%M%S') else: savedir = experimentid #the full savepath is then: savepath = os.path.join('logs', imagedataset, savedir) #in case the folder has not been created yet / except already exists error: try: os.makedirs(savepath) print("Log dir:", savepath) except: pass if continue_from is None: #now join the path in save_screenshot: shutil.copytree('./libs/', savepath + '/libs') shutil.copy2(osp.abspath(inspect.stack()[0][1]), savepath) shutil.copy2(config, savepath) args_dict = {} for a in args: args_dict[a] = values[a] with open(savepath + '/args.json', 'w') as fp: json.dump(args_dict, fp) cuda = cuda and torch.cuda.is_available() device = torch.device("cuda" if cuda else "cpu") if cuda: current_device = torch.cuda.current_device() print("Running on", torch.cuda.get_device_name(current_device)) else: print("Running on CPU") # Configuration CONFIG = Dict(yaml.load(open(config), Loader=yaml.FullLoader)) visibility_mask = {} if excludeval: seen_classes = np.load(datadir + '/split/seen_cls.npy') else: seen_classes = np.asarray(np.concatenate([ np.load(datadir + '/split/seen_cls.npy'), np.load(datadir + '/split/val_cls.npy') ]), dtype=int) novel_classes = np.load(datadir + '/split/novel_cls.npy') seen_novel_classes = np.concatenate([seen_classes, novel_classes]) seen_map = np.array([-1] * 256) for i, n in enumerate(list(seen_classes)): seen_map[n] = i visibility_mask[0] = seen_map.copy() for i, n in enumerate(list(novel_classes)): visibility_mask[i + 1] = seen_map.copy() visibility_mask[i + 1][n] = seen_classes.shape[0] + i if excludeval: train = np.load(datadir + '/split/train_list.npy')[:-CONFIG.VAL_SIZE] else: train = np.load(datadir + '/split/train_list.npy') novelset = [] seenset = [] if inputmix == 'novel' or inputmix == 'both': inverse_dict = pickle.load( open(datadir + '/split/inverse_dict_train.pkl', 'rb')) for icls, key in enumerate(novel_classes): if (inverse_dict[key].size > 0): for v in inverse_dict[key][ishot * 20:ishot * 20 + nshot]: novelset.append((v, icls)) #print((v, icls)) if inputmix == 'both': seenset = [] inverse_dict = pickle.load( open(datadir + '/split/inverse_dict_train.pkl', 'rb')) for icls, key in enumerate(seen_classes): if (inverse_dict[key].size > 0): for v in inverse_dict[key][ishot * 20:ishot * 20 + nshot]: seenset.append(v) if inputmix == 'seen': seenset = range(train.shape[0]) sampler = RandomImageSampler(seenset, novelset) if inputmix == 'novel': visible_classes = seen_novel_classes if nshot is not None: nshot = str(nshot) + 'n' elif inputmix == 'seen': visible_classes = seen_classes if nshot is not None: nshot = str(nshot) + 's' elif inputmix == 'both': visible_classes = seen_novel_classes if nshot is not None: nshot = str(nshot) + 'b' print("Visible classes:", visible_classes.size, " \nClasses are: ", visible_classes, "\nTrain Images:", train.shape[0]) #a Dataset 10k or 164k dataset = get_dataset(CONFIG.DATASET)(train=train, test=None, root=CONFIG.ROOT, split=CONFIG.SPLIT.TRAIN, base_size=513, crop_size=CONFIG.IMAGE.SIZE.TRAIN, 
mean=(CONFIG.IMAGE.MEAN.B, CONFIG.IMAGE.MEAN.G, CONFIG.IMAGE.MEAN.R), warp=CONFIG.WARP_IMAGE, scale=(0.5, 1.5), flip=True, visibility_mask=visibility_mask) # DataLoader loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=CONFIG.BATCH_SIZE.TRAIN, num_workers=CONFIG.NUM_WORKERS, sampler=sampler) if embedding == 'word2vec': class_emb = pickle.load( open(datadir + '/word_vectors/word2vec.pkl', "rb")) elif embedding == 'fasttext': class_emb = pickle.load( open(datadir + '/word_vectors/fasttext.pkl', "rb")) elif embedding == 'fastnvec': class_emb = np.concatenate([ pickle.load(open(datadir + '/word_vectors/fasttext.pkl', "rb")), pickle.load(open(datadir + '/word_vectors/word2vec.pkl', "rb")) ], axis=1) else: print("invalid emb ", embedding) sys.exit() print((class_emb.shape)) class_emb = F.normalize(torch.tensor(class_emb), p=2, dim=1).cuda() loader_iter = iter(loader) DeepLab = DeepLabV2_ResNet101_MSC #import ipdb; ipdb.set_trace() state_dict = torch.load(CONFIG.INIT_MODEL) # Model load model = DeepLab(class_emb.shape[1], class_emb[visible_classes]) if continue_from is not None and continue_from > 0: print("Loading checkpoint: {}".format(continue_from)) #import ipdb; ipdb.set_trace() model = nn.DataParallel(model) state_file = osp.join(savepath, "checkpoint_{}.pth".format(continue_from)) if osp.isfile(state_file + '.tar'): state_dict = torch.load(state_file + '.tar') model.load_state_dict(state_dict['state_dict'], strict=True) elif osp.isfile(state_file): state_dict = torch.load(state_file) model.load_state_dict(state_dict, strict=True) else: print("Checkpoint {} not found".format(continue_from)) sys.exit() else: model.load_state_dict( state_dict, strict=False ) # make strict=True to debug if checkpoint is loaded correctly or not if performance is low model = nn.DataParallel(model) model.to(device) # Optimizer optimizer = { "sgd": torch.optim.SGD( # cf lr_mult and decay_mult in train.prototxt params=[{ "params": get_params(model.module, key="1x"), "lr": CONFIG.LR, "weight_decay": CONFIG.WEIGHT_DECAY, }, { "params": get_params(model.module, key="10x"), "lr": 10 * CONFIG.LR, "weight_decay": CONFIG.WEIGHT_DECAY, }, { "params": get_params(model.module, key="20x"), "lr": 20 * CONFIG.LR, "weight_decay": 0.0, }], momentum=CONFIG.MOMENTUM, ), "adam": torch.optim.Adam( # cf lr_mult and decay_mult in train.prototxt params=[{ "params": get_params(model.module, key="1x"), "lr": CONFIG.LR, "weight_decay": CONFIG.WEIGHT_DECAY, }, { "params": get_params(model.module, key="10x"), "lr": 10 * CONFIG.LR, "weight_decay": CONFIG.WEIGHT_DECAY, }, { "params": get_params(model.module, key="20x"), "lr": 20 * CONFIG.LR, "weight_decay": 0.0, }]) # Add any other optimizer }.get(CONFIG.OPTIMIZER) if 'optimizer' in state_dict: optimizer.load_state_dict(state_dict['optimizer']) print("Learning rate:", CONFIG.LR) # Loss definition criterion = nn.CrossEntropyLoss(ignore_index=-1) criterion.to(device) if not nolog: # TensorBoard Logger if continue_from is not None: writer = SummaryWriter( savepath + '/runs/fs_{}_{}_{}'.format(continue_from, nshot, ishot)) else: writer = SummaryWriter(savepath + '/runs') loss_meter = MovingAverageValueMeter(20) model.train() model.module.scale.freeze_bn() pbar = tqdm( range(1, CONFIG.ITER_MAX + 1), total=CONFIG.ITER_MAX, leave=False, dynamic_ncols=True, ) for iteration in pbar: # Set a learning rate poly_lr_scheduler( optimizer=optimizer, init_lr=CONFIG.LR, iter=iteration - 1, lr_decay_iter=CONFIG.LR_DECAY, max_iter=CONFIG.ITER_MAX, power=CONFIG.POLY_POWER, ) # Clear gradients 
(ready to accumulate) optimizer.zero_grad() iter_loss = 0 for i in range(1, CONFIG.ITER_SIZE + 1): try: data, target = next(loader_iter) except: loader_iter = iter(loader) data, target = next(loader_iter) # Image data = data.to(device) # Propagate forward outputs = model(data) # Loss loss = 0 for output in outputs: # Resize target for {100%, 75%, 50%, Max} outputs target_ = resize_target(target, output.size(2)) target_ = torch.tensor(target_).to(device) loss += criterion.forward(output, target_) # Backpropagate (just compute gradients wrt the loss) #print(loss) loss /= float(CONFIG.ITER_SIZE) loss.backward() iter_loss += float(loss) del data, target, outputs #print(iter_loss) pbar.set_postfix(loss="%.3f" % iter_loss) # Update weights with accumulated gradients optimizer.step() if not nolog: loss_meter.add(iter_loss) # TensorBoard if iteration % CONFIG.ITER_TB == 0: writer.add_scalar("train_loss", loss_meter.value()[0], iteration) for i, o in enumerate(optimizer.param_groups): writer.add_scalar("train_lr_group{}".format(i), o["lr"], iteration) if False: # This produces a large log file for name, param in model.named_parameters(): name = name.replace(".", "/") writer.add_histogram(name, param, iteration, bins="auto") if param.requires_grad: writer.add_histogram(name + "/grad", param.grad, iteration, bins="auto") # Save a model if continue_from is not None: if iteration in CONFIG.ITER_SAVE: torch.save( { 'iteration': iteration, 'state_dict': model.state_dict(), }, osp.join( savepath, "checkpoint_{}_{}_{}_{}.pth.tar".format( continue_from, nshot, ishot, iteration)), ) # Save a model (short term) [unnecessary for fewshot] if False and iteration % 100 == 0: torch.save( { 'iteration': iteration, 'state_dict': model.state_dict(), }, osp.join( savepath, "checkpoint_{}_{}_{}_current.pth.tar".format( continue_from, nshot, ishot)), ) print( osp.join( savepath, "checkpoint_{}_{}_{}_current.pth.tar".format( continue_from, nshot, ishot))) else: if iteration % CONFIG.ITER_SAVE == 0: torch.save( { 'iteration': iteration, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), }, osp.join(savepath, "checkpoint_{}.pth.tar".format(iteration)), ) # Save a model (short term) if iteration % 100 == 0: torch.save( { 'iteration': iteration, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), }, osp.join(savepath, "checkpoint_current.pth.tar"), ) torch.cuda.empty_cache() if not nolog: if continue_from is not None: torch.save( { 'iteration': iteration, 'state_dict': model.state_dict(), }, osp.join( savepath, "checkpoint_{}_{}_{}_{}.pth.tar".format( continue_from, nshot, ishot, CONFIG.ITER_MAX))) else: torch.save( { 'iteration': iteration, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), }, osp.join(savepath, "checkpoint_{}.pth.tar".format(CONFIG.ITER_MAX)))
def train(self): torch.cuda.empty_cache() ###################### # Save / Load model ###################### if self.opt.continue_train: try: self.continue_from_latest_checkpoint() except Exception as e: self.logger.error(e) self.opt.continue_train = False self.reset_save() else: self.reset_save() dataset = UnitRiffDataset(self.opt.dataset_name, self.opt.instr_type) dataset_size = len(dataset) self.logger.info( f'Dataset {self.opt.dataset_name} loaded, size {dataset_size}') ###################### # Initiate ###################### criterionGAN = nn.BCEWithLogitsLoss() GLoss_meter = MovingAverageValueMeter(self.opt.plot_every) DLoss_meter = MovingAverageValueMeter(self.opt.plot_every) losses = {} ###################### # Start Training ###################### for epoch in range(self.opt.start_epoch, self.opt.max_epoch): loader = DataLoader(dataset, batch_size=self.opt.batch_size, shuffle=True, num_workers=self.opt.num_threads, drop_last=False) epoch_start_time = time.time() for i, data in enumerate(loader): batch_size = data.size(0) # print(batch_size) real_label = torch.ones(size=[batch_size, 1], device=self.device) fake_label = torch.zeros(size=[batch_size, 1], device=self.device) seed = np.array([ generate_random_seed(1, self.opt.instr_type, pattern=self.opt.chord_type) for _ in range(batch_size) ]) # print(seed.shape) noise = torch.randn(batch_size, self.opt.seed_size, device=self.device) seed = torch.from_numpy(seed).to(device=self.device, dtype=torch.float) fake_data = self.generator(noise, seed, batch_size) D_fake = self.discriminator(fake_data, batch_size) real_data = torch.unsqueeze(data, 1).to(device=self.device, dtype=torch.float) D_real = self.discriminator(real_data, batch_size) # print(D_fake.shape) ###################### # Generator ###################### self.G_optimizer.zero_grad() loss_G = criterionGAN(D_fake, real_label) loss_G.backward(retain_graph=True) self.G_optimizer.step() self.G_optimizer.zero_grad() loss_G = criterionGAN(D_fake, real_label) loss_G.backward(retain_graph=True) self.G_optimizer.step() GLoss_meter.add(loss_G.item()) ###################### # Discriminator ###################### self.D_optimizer.zero_grad() loss_D_real = criterionGAN(D_real, real_label) loss_D_fake = criterionGAN(D_fake, fake_label) loss_D = 0.5 * loss_D_real + 0.5 * loss_D_fake loss_D.backward() self.D_optimizer.step() DLoss_meter.add(loss_D.item()) if epoch % self.opt.save_every == 0 or epoch == self.opt.max_epoch - 1: self.save_model(epoch) losses['loss_G'] = float(GLoss_meter.value()[0]) losses['loss_D'] = float(DLoss_meter.value()[0]) self.G_scheduler.step(epoch) self.D_scheduler.step(epoch) epoch_time = int(time.time() - epoch_start_time) self.logger.info( f'Epoch {epoch} finished, cost time {epoch_time}\n') self.logger.info(str(losses) + '\n\n')
def train(config_path, cuda):
    """
    Training DeepLab by v2 protocol
    """

    # Configuration
    CONFIG = Dict(yaml.load(config_path))
    device = get_device(cuda)
    torch.backends.cudnn.benchmark = True

    # Dataset
    dataset = get_dataset(CONFIG.DATASET.NAME)(
        root=CONFIG.DATASET.ROOT,
        split=CONFIG.DATASET.SPLIT.TRAIN,
        ignore_label=CONFIG.DATASET.IGNORE_LABEL,
        mean_bgr=(CONFIG.IMAGE.MEAN.B, CONFIG.IMAGE.MEAN.G, CONFIG.IMAGE.MEAN.R),
        augment=True,
        base_size=CONFIG.IMAGE.SIZE.BASE,
        crop_size=CONFIG.IMAGE.SIZE.TRAIN,
        scales=CONFIG.DATASET.SCALES,
        flip=True,
    )
    print(dataset)

    # DataLoader
    loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=CONFIG.SOLVER.BATCH_SIZE.TRAIN,
        num_workers=CONFIG.DATALOADER.NUM_WORKERS,
        shuffle=True,
    )
    loader_iter = iter(loader)

    # Model check
    print("Model:", CONFIG.MODEL.NAME)
    assert (
        CONFIG.MODEL.NAME == "DeepLabV2_ResNet101_MSC"
    ), 'Currently support only "DeepLabV2_ResNet101_MSC"'

    # Model setup
    model = eval(CONFIG.MODEL.NAME)(n_classes=CONFIG.DATASET.N_CLASSES)
    state_dict = torch.load(CONFIG.MODEL.INIT_MODEL)
    print("    Init:", CONFIG.MODEL.INIT_MODEL)
    for m in model.base.state_dict().keys():
        if m not in state_dict.keys():
            print("    Skip init:", m)
    model.base.load_state_dict(state_dict, strict=False)  # to skip ASPP
    model = nn.DataParallel(model)
    model.to(device)

    # Loss definition
    criterion = nn.CrossEntropyLoss(ignore_index=CONFIG.DATASET.IGNORE_LABEL)
    criterion.to(device)

    # Optimizer
    optimizer = torch.optim.SGD(
        # cf lr_mult and decay_mult in train.prototxt
        params=[
            {
                "params": get_params(model.module, key="1x"),
                "lr": CONFIG.SOLVER.LR,
                "weight_decay": CONFIG.SOLVER.WEIGHT_DECAY,
            },
            {
                "params": get_params(model.module, key="10x"),
                "lr": 10 * CONFIG.SOLVER.LR,
                "weight_decay": CONFIG.SOLVER.WEIGHT_DECAY,
            },
            {
                "params": get_params(model.module, key="20x"),
                "lr": 20 * CONFIG.SOLVER.LR,
                "weight_decay": 0.0,
            },
        ],
        momentum=CONFIG.SOLVER.MOMENTUM,
    )

    # Learning rate scheduler
    scheduler = PolynomialLR(
        optimizer=optimizer,
        step_size=CONFIG.SOLVER.LR_DECAY,
        iter_max=CONFIG.SOLVER.ITER_MAX,
        power=CONFIG.SOLVER.POLY_POWER,
    )

    # Setup loss logger
    writer = SummaryWriter(os.path.join(CONFIG.EXP.OUTPUT_DIR, "logs", CONFIG.EXP.ID))
    average_loss = MovingAverageValueMeter(CONFIG.SOLVER.AVERAGE_LOSS)

    # Path to save models
    checkpoint_dir = os.path.join(
        CONFIG.EXP.OUTPUT_DIR,
        "models",
        CONFIG.EXP.ID,
        CONFIG.MODEL.NAME.lower(),
        CONFIG.DATASET.SPLIT.TRAIN,
    )
    makedirs(checkpoint_dir)
    print("Checkpoint dst:", checkpoint_dir)

    # Freeze the batch norm pre-trained on COCO
    model.train()
    model.module.base.freeze_bn()

    for iteration in tqdm(
        range(1, CONFIG.SOLVER.ITER_MAX + 1),
        total=CONFIG.SOLVER.ITER_MAX,
        dynamic_ncols=True,
    ):
        # Clear gradients (ready to accumulate)
        optimizer.zero_grad()

        loss = 0
        for _ in range(CONFIG.SOLVER.ITER_SIZE):
            try:
                _, images, labels = next(loader_iter)
            except StopIteration:
                loader_iter = iter(loader)
                _, images, labels = next(loader_iter)

            # Propagate forward
            logits = model(images.to(device))

            # Loss
            iter_loss = 0
            for logit in logits:
                # Resize labels for {100%, 75%, 50%, Max} logits
                _, _, H, W = logit.shape
                labels_ = resize_labels(labels, size=(H, W))
                iter_loss += criterion(logit, labels_.to(device))

            # Propagate backward (just compute gradients wrt the loss)
            iter_loss /= CONFIG.SOLVER.ITER_SIZE
            iter_loss.backward()
            loss += float(iter_loss)

        average_loss.add(loss)

        # Update weights with accumulated gradients
        optimizer.step()

        # Update learning rate
        scheduler.step(epoch=iteration)

        # TensorBoard
        if iteration % CONFIG.SOLVER.ITER_TB == 0:
            writer.add_scalar("loss/train", average_loss.value()[0], iteration)
            for i, o in enumerate(optimizer.param_groups):
                writer.add_scalar("lr/group_{}".format(i), o["lr"], iteration)
            for i in range(torch.cuda.device_count()):
                writer.add_scalar(
                    "gpu/device_{}/memory_cached".format(i),
                    torch.cuda.memory_cached(i) / 1024 ** 3,
                    iteration,
                )

            if False:
                for name, param in model.module.base.named_parameters():
                    name = name.replace(".", "/")
                    # Weight/gradient distribution
                    writer.add_histogram(name, param, iteration, bins="auto")
                    if param.requires_grad:
                        writer.add_histogram(
                            name + "/grad", param.grad, iteration, bins="auto"
                        )

        # Save a model
        if iteration % CONFIG.SOLVER.ITER_SAVE == 0:
            torch.save(
                model.module.state_dict(),
                os.path.join(checkpoint_dir, "checkpoint_{}.pth".format(iteration)),
            )

    torch.save(
        model.module.state_dict(),
        os.path.join(checkpoint_dir, "checkpoint_final.pth")
    )
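# A minimal sketch of the polynomial decay that the PolynomialLR scheduler above is
# assumed to implement: lr = base_lr * (1 - iter / iter_max) ** power, applied every
# `step_size` iterations. This class is illustrative, not the project's actual code.
from torch.optim.lr_scheduler import _LRScheduler


class PolynomialLRSketch(_LRScheduler):
    def __init__(self, optimizer, step_size, iter_max, power, last_epoch=-1):
        self.step_size = step_size
        self.iter_max = iter_max
        self.power = power
        super().__init__(optimizer, last_epoch)

    def polynomial_decay(self, lr):
        return lr * (1 - float(self.last_epoch) / self.iter_max) ** self.power

    def get_lr(self):
        # Keep the current lr between decay points, and stop decaying past iter_max.
        if (
            (self.last_epoch == 0)
            or (self.last_epoch % self.step_size != 0)
            or (self.last_epoch > self.iter_max)
        ):
            return [group["lr"] for group in self.optimizer.param_groups]
        return [self.polynomial_decay(lr) for lr in self.base_lrs]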
def main(config, cuda):
    cuda = cuda and torch.cuda.is_available()
    device = torch.device("cuda" if cuda else "cpu")

    if cuda:
        current_device = torch.cuda.current_device()
        print("Running on", torch.cuda.get_device_name(current_device))
    else:
        print("Running on CPU")

    # Configuration
    CONFIG = Dict(yaml.load(open(config)))

    dataset = get_dataset(CONFIG.DATASET)(
        data_path=CONFIG.ROOT,
        crop_size=256,
        scale=(0.6, 0.8, 1., 1.2, 1.4),
        rotation=15,
        flip=True,
        mean=(CONFIG.IMAGE.MEAN.B, CONFIG.IMAGE.MEAN.G, CONFIG.IMAGE.MEAN.R),
    )

    """
    # Dataset 10k or 164k
    dataset = get_dataset(CONFIG.DATASET)(
        root=CONFIG.ROOT,
        split=CONFIG.SPLIT.TRAIN,
        base_size=513,
        crop_size=CONFIG.IMAGE.SIZE.TRAIN,
        mean=(CONFIG.IMAGE.MEAN.B, CONFIG.IMAGE.MEAN.G, CONFIG.IMAGE.MEAN.R),
        warp=CONFIG.WARP_IMAGE,
        scale=(0.5, 0.75, 1.0, 1.25, 1.5),
        flip=True,
    )
    """

    # DataLoader
    loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=CONFIG.BATCH_SIZE.TRAIN,
        num_workers=CONFIG.NUM_WORKERS,
        shuffle=True,
    )
    loader_iter = iter(loader)

    # Model
    model = DeepLabV3Plus_ResNet101_MSC(n_classes=CONFIG.N_CLASSES)
    state_dict = torch.load(CONFIG.INIT_MODEL)
    model.load_state_dict(state_dict, strict=False)  # Skip "aspp" layer
    model = nn.DataParallel(model)
    model.to(device)

    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)

    # Optimizer
    optimizer = torch.optim.Adam(
        params=get_params(model.module),
        lr=CONFIG.LR,
        weight_decay=CONFIG.WEIGHT_DECAY,
    )

    """
    # Optimizer
    optimizer = torch.optim.SGD(
        # cf lr_mult and decay_mult in train.prototxt
        params=[
            {
                "params": get_params(model.module, key="1x"),
                "lr": CONFIG.LR,
                "weight_decay": CONFIG.WEIGHT_DECAY,
            },
            {
                "params": get_params(model.module, key="10x"),
                "lr": 10 * CONFIG.LR,
                "weight_decay": CONFIG.WEIGHT_DECAY,
            },
            {
                "params": get_params(model.module, key="20x"),
                "lr": 20 * CONFIG.LR,
                "weight_decay": 0.0,
            },
        ],
        momentum=CONFIG.MOMENTUM,
    )
    """

    # Loss definition
    criterion = CrossEntropyLoss2d(ignore_index=CONFIG.IGNORE_LABEL)
    criterion.to(device)
    max_pooling_loss = MaxPoolingLoss(ratio=0.3, p=1.7, reduce=True)

    # TensorBoard Logger
    writer = SummaryWriter(CONFIG.LOG_DIR)
    loss_meter = MovingAverageValueMeter(20)

    model.train()
    model.module.scale.freeze_bn()

    for iteration in tqdm(
        range(1, CONFIG.ITER_MAX + 1),
        total=CONFIG.ITER_MAX,
        leave=False,
        dynamic_ncols=True,
    ):
        """
        # Set a learning rate
        poly_lr_scheduler(
            optimizer=optimizer,
            init_lr=CONFIG.LR,
            iter=iteration - 1,
            lr_decay_iter=CONFIG.LR_DECAY,
            max_iter=CONFIG.ITER_MAX,
            power=CONFIG.POLY_POWER,
        )
        """

        # Clear gradients (ready to accumulate)
        optimizer.zero_grad()

        iter_loss = 0
        for i in range(1, CONFIG.ITER_SIZE + 1):
            try:
                images, labels = next(loader_iter)
            except StopIteration:
                loader_iter = iter(loader)
                images, labels = next(loader_iter)

            images = images.to(device)
            labels = labels.to(device).unsqueeze(1).float()

            # Propagate forward
            logits = model(images)

            # Loss
            loss = 0
            for logit in logits:
                # Resize labels for {100%, 75%, 50%, Max} logits
                labels_ = F.interpolate(labels, logit.shape[2:], mode="nearest")
                labels_ = labels_.squeeze(1).long()
                # Compute NLL and MPL
                nll_loss = criterion(logit, labels_)
                # loss += nll_loss
                loss += max_pooling_loss(nll_loss)

            # Backpropagate (just compute gradients wrt the loss)
            loss /= float(CONFIG.ITER_SIZE)
            loss.backward()
            iter_loss += float(loss)

        loss_meter.add(iter_loss)

        # Update weights with accumulated gradients
        optimizer.step()

        if iteration % CONFIG.ITER_TB == 0:
            writer.add_scalar("train_loss", loss_meter.value()[0], iteration)
            for i, o in enumerate(optimizer.param_groups):
                writer.add_scalar("train_lr_group{}".format(i), o["lr"], iteration)

            gt_viz, images_viz, predicts_viz = make_vizs(
                images, labels_, logits,
                (CONFIG.IMAGE.MEAN.B, CONFIG.IMAGE.MEAN.G, CONFIG.IMAGE.MEAN.R))
            writer.add_image("gt/images", torch.from_numpy(images_viz[0]), iteration)
            writer.add_image("gt/labels", torch.from_numpy(gt_viz[0]), iteration)
            for i, predict_viz in enumerate(predicts_viz):
                writer.add_image("predict/" + str(i),
                                 torch.from_numpy(predict_viz[0]), iteration)

            if False:  # This produces a large log file
                for name, param in model.named_parameters():
                    name = name.replace(".", "/")
                    writer.add_histogram(name, param, iteration, bins="auto")
                    if param.requires_grad:
                        writer.add_histogram(name + "/grad", param.grad,
                                             iteration, bins="auto")

        # Save a model
        if iteration % CONFIG.ITER_SAVE == 0:
            torch.save(
                model.module.state_dict(),
                osp.join(CONFIG.SAVE_DIR, "checkpoint_{}.pth".format(iteration)),
            )

        # Save a model (short term)
        if iteration % 100 == 0:
            torch.save(
                model.module.state_dict(),
                osp.join(CONFIG.SAVE_DIR, "checkpoint_current.pth"),
            )

    torch.save(model.module.state_dict(),
               osp.join(CONFIG.SAVE_DIR, "checkpoint_final.pth"))
class DataManager:
    def __init__(self, imagedataset, datadir, inputmix, embedding, device):
        self.imagedataset = imagedataset
        self.datadir = datadir
        self.inputmix = inputmix
        self.embedding = embedding
        self.device = device

    def generateSavepath(self, experimentid):
        # name the savedir, might add logs/ before the datetime for clarity
        if experimentid is None:
            savedir = time.strftime('%Y%m%d%H%M%S')
        else:
            savedir = experimentid
        self.savepath = os.path.join('logs', self.imagedataset, savedir)
        return self.savepath

    # getter method
    def get_savepath(self):
        return self.savepath

    def generateTB(self, period):
        self.writer = SummaryWriter(self.savepath + '/runs')
        self.loss_meter = MovingAverageValueMeter(20)
        self.tb = period

    def get_writer(self):
        return self.writer

    def createDirectory(self, values, config, args):
        try:
            os.makedirs(self.savepath)
            # print("Log dir:", savepath)
        except:
            pass
        # now join the path in save_screenshot:
        if os.path.exists(self.savepath + '/libs'):
            shutil.rmtree(self.savepath + '/libs')
        shutil.copytree('./libs/', self.savepath + '/libs')
        shutil.copy2(osp.abspath(inspect.stack()[0][1]), self.savepath)
        shutil.copy2(config, self.savepath)
        args_dict = {}
        for a in args:
            args_dict[a] = values[a]
        with open(self.savepath + '/args.json', 'w') as fp:
            json.dump(args_dict, fp)

    def loadClasses(self, bkg):
        self.seen_classes = np.load(
            self.datadir + '/split/seen_cls.npy')  # only the seen classes
        if bkg:
            self.seen_classes = np.asarray(
                np.concatenate([np.array([0]), self.seen_classes]),
                dtype=int)  # seen classes + bkg
        self.novel_classes = np.load(self.datadir + '/split/novel_cls.npy')
        self.all_labels = np.genfromtxt(self.datadir + '/labels_2.txt',
                                        delimiter='\t',
                                        usecols=1,
                                        dtype='str')
        self.seen_classes = np.asarray(
            np.concatenate([self.seen_classes,
                            np.load(self.datadir + '/split/val_cls.npy')]),
            dtype=int)
        self.seen_novel_classes = np.concatenate(
            [self.seen_classes, self.novel_classes])

        self.to_ignore_classes = self.novel_classes

        if self.inputmix == 'seen':
            self.visible_classes = self.seen_classes
        else:
            self.visible_classes = self.seen_novel_classes

        print("Seen classes: ")
        print(self.seen_classes)
        print("all labels: ")
        print(self.all_labels)
        return (self.seen_classes, self.novel_classes, self.seen_novel_classes,
                self.to_ignore_classes, self.visible_classes, self.all_labels)

    def get_Classes(self):
        return (self.seen_classes, self.novel_classes, self.seen_novel_classes,
                self.to_ignore_classes, self.visible_classes, self.all_labels,
                self.visibility_mask)

    def loadData(self):
        self.train = np.load(self.datadir + '/split/train_list.npy')
        self.novelset = []
        self.seenset = []
        if self.inputmix == 'seen':
            self.seenset = range(self.train.shape[0])
        else:
            print("inputmix is not seen")
            exit()
        return self.train, self.seenset, self.novelset

    def get_data(self):
        return self.train, self.seenset, self.novelset

    def loadDatasets(self, CONFIG, bs):
        # Sampler
        sampler = MyDistributedSampler(
            self.seenset,
            self.novelset,
            num_replicas=torch.distributed.get_world_size(),
            rank=torch.distributed.get_rank())

        self.dataset = get_dataset(CONFIG.DATASET)(
            train=self.train,
            test=None,
            root=CONFIG.ROOT,
            transform=None,
            split=CONFIG.SPLIT.TRAIN,
            base_size=513,
            crop_size=CONFIG.IMAGE.SIZE.TRAIN,
            mean=(CONFIG.IMAGE.MEAN.B, CONFIG.IMAGE.MEAN.G, CONFIG.IMAGE.MEAN.R),
            warp=CONFIG.WARP_IMAGE,
            scale=(0.5, 1.5),
            flip=True,
            visibility_mask=self.visibility_mask,
        )

        random.seed(42)

        # DataLoader
        self.loader = torch.utils.data.DataLoader(
            dataset=self.dataset,
            batch_size=bs,
            num_workers=CONFIG.NUM_WORKERS,
            # num_workers=1,
            sampler=sampler,
            pin_memory=True)
        return self.dataset, self.loader

    def get_datasets(self):
        return self.dataset, self.loader

    def loadClassEmbs(self):
        # Word embeddings
        if self.embedding == 'word2vec':
            self.class_emb = pickle.load(
                open(self.datadir + '/word_vectors/word2vec.pkl', "rb"))
        elif self.embedding == 'fasttext':
            self.class_emb = pickle.load(
                open(self.datadir + '/word_vectors/fasttext.pkl', "rb"))
        elif self.embedding == 'fastnvec':
            self.class_emb = np.concatenate([
                pickle.load(
                    open(self.datadir + '/word_vectors/fasttext.pkl', "rb")),
                pickle.load(
                    open(self.datadir + '/word_vectors/word2vec.pkl', "rb"))
            ], axis=1)
        else:
            print("invalid emb ", self.embedding)
            exit()

        self.class_emb = F.normalize(torch.tensor(self.class_emb), p=2,
                                     dim=1).to(self.device)
        self.seen_class_emb = self.class_emb[self.seen_classes]
        self.to_ignore_class_emb = self.class_emb[self.to_ignore_classes]
        return self.class_emb, self.to_ignore_class_emb, self.seen_class_emb

    def get_clsEmbs(self):
        return self.class_emb, self.to_ignore_class_emb, self.seen_class_emb

    def loadClsMaps(self, bkg):
        self.seen_map = np.array([-1] * 256)
        for i, n in enumerate(list(self.seen_classes)):
            self.seen_map[n] = i

        self.all_map = np.array([-1] * 256)
        for i, n in enumerate(list(self.seen_classes)):
            self.all_map[n] = i
        for i, n in enumerate(self.to_ignore_classes, len(self.seen_classes)):
            self.all_map[n] = i

        self.inverse_map = np.array([-1] * 256)
        for i, n in enumerate(self.all_map):
            self.inverse_map[n] = i

        if bkg:
            for i, n in enumerate(self.to_ignore_classes):
                self.seen_map[n] = 0

        # used to tell which predictions are unseen and which are not
        # when computing the percentage
        self.cls_map_seen = np.array([0] * 256)
        for i, n in enumerate(self.to_ignore_classes):
            self.cls_map_seen[n] = 1

        self.cls_map = None
        self.cls_map = np.array([255] * 256)
        for i, n in enumerate(self.seen_classes):
            self.cls_map[n] = i

        # VISIBILITY MASK
        self.visibility_mask = {}
        self.visibility_mask[0] = self.seen_map.copy()
        print(self.visibility_mask[0])

        return self.seen_map, self.cls_map_seen, self.cls_map

    def getClsMaps(self):
        return self.seen_map, self.cls_map_seen, self.cls_map, self.inverse_map

    def savePerIteration(self, iter_loss, optimizer, model, iteration, save):
        self.loss_meter.add(iter_loss)

        # TensorBoard
        if iteration % self.tb == 0:
            self.writer.add_scalar("train_loss", self.loss_meter.value()[0],
                                   iteration)
            for i, o in enumerate(optimizer.param_groups):
                self.writer.add_scalar("train_lr_group{}".format(i), o["lr"],
                                       iteration)

        # Save a model (short term)
        if iteration > 0 and iteration % save == 0:
            print(
                "\nIteration: {} \nSaving (short term) model (iteration, state_dict, optimizer) ...\n"
                .format(iteration))
            with open(self.savepath + '/iteration.json', 'w') as fp:
                json.dump({'iteration': iteration}, fp)
            name = "checkpoint_current.pth.tar"
            if "voc" in self.savepath or iteration % 5000 == 0:
                name = "checkpoint_{}.pth.tar".format(iteration)
            torch.save(
                {
                    'iteration': iteration,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }, osp.join(self.savepath, name))

    def saveFinal(self, optimizer, model):
        torch.save(
            {
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }, osp.join(self.savepath, "checkpoint_final.pth.tar"))
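# A small usage sketch (illustrative only) of the 256-entry lookup arrays built in
# loadClsMaps above: each array maps a raw dataset label id (0..255) to a training
# id, so a whole label image can be remapped with one vectorized indexing operation.
# The class ids and label map below are hypothetical.
import numpy as np

seen_classes = np.array([1, 5, 9])            # hypothetical raw ids of seen classes
seen_map = np.full(256, -1, dtype=np.int64)
for i, n in enumerate(seen_classes):
    seen_map[n] = i                           # raw id -> contiguous training id

label_img = np.array([[1, 5], [9, 7]])        # hypothetical label map of raw ids
remapped = seen_map[label_img]                # -> [[0, 1], [2, -1]]; -1 marks "not visible"
print(remapped)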