def __init__(self, device, cfg_file, summaryFolder):
    """Load the YAML config and the xlsa17-format dataset splits, then build
    a class-attribute matrix reordered as [seen classes, unseen classes].

    Args:
        device: torch device the attribute matrix is moved to.
        cfg_file: path to a YAML config with 'model_hyp' and 'dataset' keys.
        summaryFolder: output folder, stored as-is on the instance.
    """
    self.device = device
    self.summaryFolder = summaryFolder
    with open(cfg_file, 'r') as f:
        self.cfg = yaml.safe_load(f)
    utils.init_seeds(self.cfg['model_hyp']['random_seed'])

    # Hoisted the repeated path expression into one variable.
    dataset_dir = './dataset/xlsa/' + self.cfg['dataset']['name']
    attSplit = loadmat(dataset_dir + '/att_splits.mat')
    res101 = loadmat(dataset_dir + '/res101.mat')

    # MATLAB indices/labels are 1-based; shift everything to 0-based.
    labels = res101['labels'].astype(int).squeeze() - 1
    seen_dataLoc = attSplit['test_seen_loc'].squeeze() - 1
    unseen_dataLoc = attSplit['test_unseen_loc'].squeeze() - 1
    seen_labels = labels[seen_dataLoc]
    unseen_labels = labels[unseen_dataLoc]
    self.seen_labels = np.unique(seen_labels)
    self.unseen_labels = np.unique(unseen_labels)
    self.clsname = [
        attSplit['allclasses_names'][i][0][0]
        for i in range(len(attSplit['allclasses_names']))
    ]

    # att is (att_feats, n_classes) in the .mat file; transpose to
    # (n_classes, att_feats) and record the attribute dimensionality.
    att_matrix = np.transpose(attSplit['att'])
    self.cfg['model_hyp']['att_feats'] = att_matrix.shape[1]

    # Reorder rows: seen-class attribute vectors first, unseen after.
    # NOTE(review): assumes len(seen)+len(unseen) equals the total number of
    # classes in att_matrix — confirm for this dataset's split files.
    self.attMatrix = att_matrix.copy()
    self.attMatrix[:len(self.seen_labels)] = att_matrix[self.seen_labels]
    self.attMatrix[len(self.seen_labels):] = att_matrix[self.unseen_labels]
    self.attMatrix = torch.FloatTensor(self.attMatrix).to(self.device)
    # Removed a dead trailing `pass` statement.
def main():
    """Parse CLI flags, optionally join a NCCL process group, seed, train."""
    arg_parser = argparse.ArgumentParser(description="FCOS")
    arg_parser.add_argument("--local_rank", type=int, default=0)
    arg_parser.add_argument("--start_epoch", type=int, default=1)
    arg_parser.add_argument("--dist", action="store_true")
    opts = arg_parser.parse_args()

    if opts.dist:
        # Pin this process to its GPU before initializing the NCCL group.
        torch.cuda.set_device(opts.local_rank)
        dist.init_process_group(backend="nccl", init_method="env://")
        utils.synchronize()

    utils.init_seeds(0)
    train(opts.dist, opts.start_epoch, opts.local_rank)
def init_pipeline(self):
    """Load the dataframe, seed RNG/CUDA, and print dataset summary stats."""
    df = self.load_data()
    utils.init_seeds()
    utils.init_cuda()

    print("Unique locations", np.unique(df[["lat", "long"]], axis=0).shape)
    # Print min/max pairs for each column of interest (output is identical
    # to the individual print calls this replaces).
    for label, column in (("latitude", "lat"),
                          ("longitude", "long"),
                          ("ts", "ts")):
        print("Min " + label, df[column].min())
        print("Max " + label, df[column].max())

    ts_span = df["ts"].max() - df["ts"].min()
    print("Number of days", ts_span // (60 * 60 * 24))
def train(cfg):
    """Train a YOLOv3 model on VOC-style data.

    Handles checkpoint/darknet weight loading, optional mixed precision and
    single-node DDP, multi-scale resizing, gradient accumulation, per-epoch
    mAP evaluation, and checkpointing to weights/.

    Args:
        cfg: config object with TRAIN.* hyperparameters (see usage below).
    """
    # Initialize
    init_seeds()
    image_size_min = 6.6  # 320 / 32 / 1.5
    image_size_max = 28.5  # 320 / 32 / 28.5
    if cfg.TRAIN.MULTI_SCALE:
        image_size_min = round(cfg.TRAIN.IMAGE_SIZE / 32 / 1.5)
        image_size_max = round(cfg.TRAIN.IMAGE_SIZE / 32 * 1.5)
        image_size = image_size_max * 32  # initiate with maximum multi_scale size
        print(f"Using multi-scale {image_size_min * 32} - {image_size}")

    # Remove previous results
    for files in glob.glob("results.txt"):
        os.remove(files)

    # Initialize model
    model = YOLOv3(cfg).to(device)

    # Optimizer
    optimizer = optim.SGD(model.parameters(),
                          lr=cfg.TRAIN.LR,
                          momentum=cfg.TRAIN.MOMENTUM,
                          weight_decay=cfg.TRAIN.DECAY,
                          nesterov=True)

    # Define the loss function calculation formula of the model
    compute_loss = YoloV3Loss(cfg)

    epoch = 0
    start_epoch = 0
    best_maps = 0.0
    context = None

    # Dataset: apply augmentation hyperparameters
    train_dataset = VocDataset(anno_file_type=cfg.TRAIN.DATASET,
                               image_size=cfg.TRAIN.IMAGE_SIZE,
                               cfg=cfg)
    # Dataloader
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=cfg.TRAIN.MINI_BATCH_SIZE,
                                  num_workers=cfg.TRAIN.WORKERS,
                                  shuffle=cfg.TRAIN.SHUFFLE,
                                  pin_memory=cfg.TRAIN.PIN_MENORY)

    if cfg.TRAIN.WEIGHTS.endswith(".pth"):
        state = torch.load(cfg.TRAIN.WEIGHTS, map_location=device)
        # load model: keep only tensors whose element count matches the
        # current architecture so partially-compatible checkpoints load.
        try:
            state["state_dict"] = {
                k: v
                for k, v in state["state_dict"].items()
                if model.state_dict()[k].numel() == v.numel()
            }
            model.load_state_dict(state["state_dict"], strict=False)
        except KeyError as e:
            error_msg = f"{cfg.TRAIN.WEIGHTS} is not compatible with {cfg.CONFIG_FILE}. "
            error_msg += f"Specify --weights `` or specify a --config-file "
            error_msg += f"compatible with {cfg.TRAIN.WEIGHTS}. \n"
            raise KeyError(error_msg) from e
        # load optimizer
        if state["optimizer"] is not None:
            optimizer.load_state_dict(state["optimizer"])
            best_maps = state["best_maps"]
        # load results
        if state.get("training_results") is not None:
            with open("results.txt", "w") as file:
                file.write(state["training_results"])  # write results.txt
        # BUGFIX: original read `state["batches"] + 1 // len(train_dataloader)`;
        # `//` binds tighter than `+`, so it divided only the literal 1 and
        # start_epoch was the raw batch count. Parenthesize the sum.
        start_epoch = (state["batches"] + 1) // len(train_dataloader)
        del state
    elif len(cfg.TRAIN.WEIGHTS) > 0:
        # possible weights are "*.weights", "yolov3-tiny.conv.15",
        # "darknet53.conv.74" etc.
        load_darknet_weights(model, cfg.TRAIN.WEIGHTS)
    else:
        print("Pre training model weight not loaded.")

    # Mixed precision training https://github.com/NVIDIA/apex
    if mixed_precision:
        # skip print amp info
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level="O1", verbosity=0)

    # source https://arxiv.org/pdf/1812.01187.pdf
    scheduler = CosineDecayLR(optimizer,
                              max_batches=cfg.TRAIN.MAX_BATCHES,
                              lr=cfg.TRAIN.LR,
                              warmup=cfg.TRAIN.WARMUP_BATCHES)

    # Initialize distributed training (single node, rank 0 only)
    if device.type != "cpu" and torch.cuda.device_count() > 1 \
            and torch.distributed.is_available():
        dist.init_process_group(
            backend="nccl",  # "distributed backend"
            # distributed training init method
            init_method="tcp://127.0.0.1:9999",
            # number of nodes for distributed training
            world_size=1,
            # distributed training node rank
            rank=0)
        model = torch.nn.parallel.DistributedDataParallel(model)
        model.backbone = model.module.backbone

    # Model EMA
    # TODO: ema = ModelEMA(model, decay=0.9998)

    # Start training
    batches_num = len(train_dataloader)  # number of batches
    # 'loss_GIOU', 'loss_Confidence', 'loss_Classification' 'loss'
    results = (0, 0, 0, 0)
    epochs = cfg.TRAIN.MAX_BATCHES // len(train_dataloader)
    print(f"Using {cfg.TRAIN.WORKERS} dataloader workers.")
    print(
        f"Starting training {cfg.TRAIN.MAX_BATCHES} batches for {epochs} epochs..."
    )
    start_time = time.time()
    for epoch in range(start_epoch, epochs):
        model.train()

        # init batches
        batches = 0
        mean_losses = torch.zeros(4)
        print("\n")
        print(("%10s" * 7) %
              ("Batch", "memory", "GIoU", "conf", "cls", "total",
               " image_size"))
        progress_bar = tqdm(enumerate(train_dataloader), total=batches_num)
        for index, (images, small_label_bbox, medium_label_bbox,
                    large_label_bbox, small_bbox, medium_bbox,
                    large_bbox) in progress_bar:
            # number integrated batches (since train start)
            batches = index + len(train_dataloader) * epoch
            scheduler.step(batches)

            images = images.to(device)
            small_label_bbox = small_label_bbox.to(device)
            medium_label_bbox = medium_label_bbox.to(device)
            large_label_bbox = large_label_bbox.to(device)
            small_bbox = small_bbox.to(device)
            medium_bbox = medium_bbox.to(device)
            large_bbox = large_bbox.to(device)

            # Hyper parameter Burn-in: freeze BN running stats until warmup
            # completes, re-enabling them exactly at the warmup boundary.
            if batches <= cfg.TRAIN.WARMUP_BATCHES:
                for m in model.named_modules():
                    # NOTE(review): named_modules() yields dotted attribute
                    # paths which rarely end with 'BatchNorm2d'; an
                    # isinstance check on m[1] looks intended — confirm
                    # against the model's actual module names.
                    if m[0].endswith('BatchNorm2d'):
                        m[1].track_running_stats = batches == cfg.TRAIN.WARMUP_BATCHES

            # Run model
            pred, raw = model(images)

            # Compute loss
            loss, loss_giou, loss_conf, loss_cls = compute_loss(
                pred, raw, small_label_bbox, medium_label_bbox,
                large_label_bbox, small_bbox, medium_bbox, large_bbox)

            # Compute gradient
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Optimize accumulated gradient.
            # BUGFIX: `%` and `//` share precedence and associate left-to-
            # right, so the original evaluated
            # (batches % BATCH_SIZE) // MINI_BATCH_SIZE == 0 and stepped on
            # an irregular schedule. Step every BATCH_SIZE/MINI_BATCH_SIZE
            # accumulated mini-batches as intended.
            if batches % (cfg.TRAIN.BATCH_SIZE // cfg.TRAIN.MINI_BATCH_SIZE) == 0:
                optimizer.step()
                optimizer.zero_grad()
                # TODO: ema.update(model)

            # Print batch results: update running mean of the 4 loss terms.
            loss_items = torch.tensor([loss_giou, loss_conf, loss_cls, loss])
            mean_losses = (mean_losses * index + loss_items) / (index + 1)
            memory = f"{torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0:.2f}G"
            context = ("%10s" * 2 + "%10.3g" * 5) % (
                "%g/%g" % (batches + 1, cfg.TRAIN.MAX_BATCHES), memory,
                *mean_losses, train_dataset.image_size)
            progress_bar.set_description(context)

            # Multi-Scale training: adjust img_size (67% - 150%) every
            # RESIZE_INTERVAL batches.
            if cfg.TRAIN.MULTI_SCALE:
                if batches % cfg.TRAIN.RESIZE_INTERVAL == 0:
                    train_dataset.image_size = random.randrange(
                        image_size_min, image_size_max + 1) * 32

            # Write Tensorboard results
            if tb_writer:
                # 'loss_GIOU', 'loss_Confidence', 'loss_Classification' 'loss'
                titles = ["GIoU", "Confidence", "Classification", "Train loss"]
                # NOTE(review): zip truncates to the 4 titles, so the
                # `results` tuple is never logged — confirm that is intended.
                for xi, title in zip(list(mean_losses) + list(results),
                                     titles):
                    tb_writer.add_scalar(title, xi, index)

        # Process epoch results
        # TODO: ema.update_attr(model)
        final_epoch = epoch + 1 == epochs

        # Calculate mAP (skip first epoch)
        maps = 0.
        if epoch > 0:
            # NOTE(review): `args` is not defined in this function's scope;
            # evaluate(cfg, args) will raise NameError — confirm the
            # intended evaluation signature.
            maps = evaluate(cfg, args)

        # Write epoch results
        with open("results.txt", "a") as f:
            # 'loss_GIOU', 'loss_Confidence', 'loss_Classification' 'loss', 'maps'
            f.write(context + "%10.3g" * 1 % maps)
            f.write("\n")

        # Update best mAP
        if maps > best_maps:
            best_maps = maps

        # Save training results
        with open("results.txt", 'r') as f:
            # Create checkpoint
            state = {
                'batches': batches,
                'best_maps': maps,
                'training_results': f.read(),
                'state_dict': model.state_dict(),
                'optimizer': None if final_epoch else optimizer.state_dict()
            }

        # Save last checkpoint
        torch.save(state, "weights/checkpoint.pth")

        # Save best checkpoint
        if best_maps == maps:
            state = {
                'batches': -1,
                'best_maps': None,
                'training_results': None,
                'state_dict': model.state_dict(),
                'optimizer': None
            }
            torch.save(state, "weights/model_best.pth")

        # Delete checkpoint
        del state

    print(f"{epoch - start_epoch} epochs completed "
          f"in {(time.time() - start_time) / 3600:.3f} hours.\n")
    if torch.cuda.device_count() > 1:
        dist.destroy_process_group()
    torch.cuda.empty_cache()
def main(args):
    """Train SRNTT: optional L1-only generator pretraining, then adversarial
    training (WGAN-GP) with reconstruction, perceptual, adversarial and
    texture losses, plus a PSNR/SSIM validation pass each epoch.

    Args:
        args: parsed CLI namespace (dataroot, batch_size, lr, epochs, etc.).
    """
    init_seeds(seed=args.seed)

    # split data
    files = list([f.stem for f in Path(args.dataroot).glob('map/*.npz')])
    train_files, val_files = train_test_split(files, test_size=0.1)

    # define dataloaders
    train_set = ReferenceDataset(train_files, args.dataroot)
    val_set = ReferenceDatasetEval(val_files, args.dataroot)
    train_loader = DataLoader(train_set, args.batch_size, shuffle=True,
                              num_workers=4)
    val_loader = DataLoader(val_set, args.batch_size, drop_last=True)

    # define networks
    netG = SRNTT(args.ngf, args.n_blocks, args.use_weights).to(device)
    netG.content_extractor.load_state_dict(torch.load(args.init_weight))
    if args.netD == 'image':
        netD = ImageDiscriminator(args.ndf).to(device)
    elif args.netD == 'patch':
        netD = Discriminator(args.ndf).to(device)
    else:
        # BUGFIX: netD was left unbound for any other value, producing an
        # opaque NameError much later; fail fast with a clear message.
        raise ValueError(f'unknown --netD value: {args.netD!r}')

    # define criteria
    criterion_rec = nn.L1Loss().to(device)
    criterion_per = PerceptualLoss().to(device)
    criterion_adv = AdversarialLoss().to(device)
    criterion_tex = TextureLoss(args.use_weights).to(device)

    # metrics
    criterion_psnr = PSNR(max_val=1., mode='Y')
    criterion_ssim = SSIM(window_size=11)

    # define optimizers; LR decays 10x at the halfway point.
    optimizer_G = optim.Adam(netG.parameters(), args.lr)
    optimizer_D = optim.Adam(netD.parameters(), args.lr)
    scheduler_G = StepLR(
        optimizer_G, int(args.n_epochs * len(train_loader) / 2), 0.1)
    scheduler_D = StepLR(
        optimizer_D, int(args.n_epochs * len(train_loader) / 2), 0.1)

    # for tensorboard
    writer = SummaryWriter(log_dir=f'runs/{args.pid}' if args.pid else None)

    if args.netG_pre is None:
        """ pretrain: generator only, pure L1 reconstruction loss """
        step = 0
        for epoch in range(1, args.n_epochs_init + 1):
            for i, batch in enumerate(train_loader, 1):
                img_hr = batch['img_hr'].to(device)
                img_lr = batch['img_lr'].to(device)
                maps = {k: v.to(device) for k, v in batch['maps'].items()}
                weights = batch['weights'].to(device)

                _, img_sr = netG(img_lr, maps, weights)

                """ train G """
                optimizer_G.zero_grad()
                g_loss = criterion_rec(img_sr, img_hr)
                g_loss.backward()
                optimizer_G.step()

                """ logging """
                writer.add_scalar('pre/g_loss', g_loss.item(), step)
                if step % args.display_freq == 0:
                    writer.add_images('pre/img_lr', img_lr.clamp(0, 1), step)
                    writer.add_images('pre/img_hr', img_hr.clamp(0, 1), step)
                    writer.add_images('pre/img_sr', img_sr.clamp(0, 1), step)
                log_txt = [
                    f'[Pre][Epoch{epoch}][{i}/{len(train_loader)}]',
                    f'G Loss: {g_loss.item()}'
                ]
                print(' '.join(log_txt))
                step += 1

                if args.debug:
                    break

            out_path = Path(writer.log_dir) / f'netG_pre{epoch:03}.pth'
            torch.save(netG.state_dict(), out_path)
    else:  # omit pre-training
        netG.load_state_dict(torch.load(args.netG_pre))
    if args.netD_pre:
        netD.load_state_dict(torch.load(args.netD_pre))

    """ train with all losses """
    step = 0
    for epoch in range(1, args.n_epochs + 1):
        """ training loop """
        netG.train()
        netD.train()
        for i, batch in enumerate(train_loader, 1):
            img_hr = batch['img_hr'].to(device)
            img_lr = batch['img_lr'].to(device)
            maps = {k: v.to(device) for k, v in batch['maps'].items()}
            weights = batch['weights'].to(device)

            _, img_sr = netG(img_lr, maps, weights)

            """ train D (G frozen) """
            optimizer_D.zero_grad()
            for p in netD.parameters():
                p.requires_grad = True
            for p in netG.parameters():
                p.requires_grad = False

            # compute WGAN loss
            d_out_real = netD(img_hr)
            d_loss_real = criterion_adv(d_out_real, True)
            d_out_fake = netD(img_sr.detach())
            d_loss_fake = criterion_adv(d_out_fake, False)
            d_loss = d_loss_real + d_loss_fake

            # gradient penalty (WGAN-GP, weight 10)
            gradient_penalty = compute_gp(netD, img_hr.data, img_sr.data)
            d_loss += 10 * gradient_penalty

            d_loss.backward()
            optimizer_D.step()

            """ train G (D frozen) """
            optimizer_G.zero_grad()
            for p in netD.parameters():
                p.requires_grad = False
            for p in netG.parameters():
                p.requires_grad = True

            # compute all losses
            loss_rec = criterion_rec(img_sr, img_hr)
            loss_per = criterion_per(img_sr, img_hr)
            loss_adv = criterion_adv(netD(img_sr), True)
            loss_tex = criterion_tex(img_sr, maps, weights)

            # optimize with combined d_loss
            g_loss = (loss_rec * args.lambda_rec +
                      loss_per * args.lambda_per +
                      loss_adv * args.lambda_adv +
                      loss_tex * args.lambda_tex)
            g_loss.backward()
            optimizer_G.step()

            """ logging """
            writer.add_scalar('train/g_loss', g_loss.item(), step)
            writer.add_scalar('train/loss_rec', loss_rec.item(), step)
            writer.add_scalar('train/loss_per', loss_per.item(), step)
            writer.add_scalar('train/loss_tex', loss_tex.item(), step)
            writer.add_scalar('train/loss_adv', loss_adv.item(), step)
            writer.add_scalar('train/d_loss', d_loss.item(), step)
            writer.add_scalar('train/d_real', d_loss_real.item(), step)
            writer.add_scalar('train/d_fake', d_loss_fake.item(), step)
            if step % args.display_freq == 0:
                writer.add_images('train/img_lr', img_lr, step)
                writer.add_images('train/img_hr', img_hr, step)
                writer.add_images('train/img_sr', img_sr.clamp(0, 1), step)
            log_txt = [
                f'[Train][Epoch{epoch}][{i}/{len(train_loader)}]',
                f'G Loss: {g_loss.item()}, D Loss: {d_loss.item()}'
            ]
            print(' '.join(log_txt))

            scheduler_G.step()
            scheduler_D.step()
            step += 1

            if args.debug:
                break

        """ validation loop """
        netG.eval()
        netD.eval()
        val_psnr, val_ssim = 0, 0
        tbar = tqdm(total=len(val_loader))
        for i, batch in enumerate(val_loader, 1):
            img_hr = batch['img_hr'].to(device)
            img_lr = batch['img_lr'].to(device)
            maps = {k: v.to(device) for k, v in batch['maps'].items()}
            weights = batch['weights'].to(device)

            with torch.no_grad():
                _, img_sr = netG(img_lr, maps, weights)

            val_psnr += criterion_psnr(img_hr, img_sr.clamp(0, 1)).item()
            val_ssim += criterion_ssim(img_hr, img_sr.clamp(0, 1)).item()

            tbar.update(1)

            if args.debug:
                break
        # BUGFIX: tbar was only closed in the loop's `else` clause, so the
        # progress bar leaked whenever --debug broke out early. Close it
        # unconditionally.
        tbar.close()

        val_psnr /= len(val_loader)
        val_ssim /= len(val_loader)
        writer.add_scalar('val/psnr', val_psnr, epoch)
        writer.add_scalar('val/ssim', val_ssim, epoch)
        print(f'[Val][Epoch{epoch}] PSNR:{val_psnr:.4f}, SSIM:{val_ssim:.4f}')

        netG_path = Path(writer.log_dir) / f'netG_{epoch:03}.pth'
        netD_path = Path(writer.log_dir) / f'netD_{epoch:03}.pth'
        torch.save(netG.state_dict(), netG_path)
        torch.save(netD.state_dict(), netD_path)
def main(args):
    """Active-learning driver (VAAL-style): build dataset-specific loaders,
    then repeatedly train task/VAE/discriminator models on the labeled pool,
    sample new points for labeling, and record accuracies.

    Args:
        args: parsed CLI namespace; dataset/budget fields are filled in here.
    """
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    init_seeds(seed=int(time.time()))
    kwargs = {'num_workers': 2, 'pin_memory': True} if args.cuda else {}

    print(args.dataset)
    if args.dataset == 'MNIST':
        test_dataloader = data.DataLoader(
            MNIST(args.data_path, args.run_folder,
                  transform=mnist_transformer()),
            batch_size=10000, shuffle=False, **kwargs)
        train_dataset = MNIST(args.data_path, args.run_folder, train=True,
                              transform=mnist_transformer(),
                              imbalance_ratio=args.imbalance_ratio)
        if args.imbalance_ratio == 100:
            args.num_images = 25711
        else:
            args.num_images = 50000
        args.budget = 125
        args.initial_budget = 125
        args.num_classes = 10
        args.num_channels = 1
        args.arch_scaler = 2
    elif args.dataset == 'SVHN':
        test_dataloader = data.DataLoader(
            SVHN(args.data_path, args.run_folder,
                 transform=svhn_transformer()),
            batch_size=5000, shuffle=False, **kwargs)
        train_dataset = SVHN(args.data_path, args.run_folder, train=True,
                             transform=svhn_transformer(),
                             imbalance_ratio=args.imbalance_ratio)
        if args.imbalance_ratio == 100:
            args.num_images = 318556
        else:
            args.num_images = 500000
        args.budget = 1250
        args.initial_budget = 1250
        args.num_classes = 10
        args.num_channels = 3
        args.arch_scaler = 1
    elif args.dataset == 'cifar10':
        test_dataloader = data.DataLoader(
            datasets.CIFAR10(args.data_path, download=True,
                             transform=cifar_transformer(), train=False),
            batch_size=args.batch_size, drop_last=False)
        train_dataset = CIFAR10(args.data_path)
        # NOTE(review): args.arch_scaler is not set on the cifar branches but
        # is read when building the VAE below — confirm the intended value.
        args.num_images = 50000
        args.budget = 2500
        args.initial_budget = 5000
        args.num_classes = 10
        args.num_channels = 3
    elif args.dataset == 'cifar100':
        test_dataloader = data.DataLoader(
            datasets.CIFAR100(args.data_path, download=True,
                              transform=cifar_transformer(), train=False),
            batch_size=args.batch_size, drop_last=False)
        train_dataset = CIFAR100(args.data_path)
        args.num_images = 50000
        args.budget = 2500
        args.initial_budget = 5000
        args.num_classes = 100
        args.num_channels = 3
    elif args.dataset == 'ImageNet':
        test_dataloader = data.DataLoader(
            ImageNet(args.data_path + '/val',
                     transform=imagenet_test_transformer()),
            batch_size=args.batch_size, shuffle=False, drop_last=False,
            **kwargs)
        if args.imbalance_ratio == 100:
            train_dataset = ImageNet(args.data_path + '/train_ir_100',
                                     transform=imagenet_train_transformer())
            args.num_images = 645770
        else:
            train_dataset = ImageNet(args.data_path + '/train',
                                     transform=imagenet_train_transformer())
            args.num_images = 1281167
        args.budget = 64000
        args.initial_budget = 64000
        args.num_classes = 1000
        args.num_channels = 3
        args.arch_scaler = 1
    else:
        raise NotImplementedError

    all_indices = set(np.arange(args.num_images))
    # BUGFIX: random.sample() rejects sets on Python >= 3.11 (deprecated
    # since 3.9); sample from an explicit sequence instead.
    initial_indices = random.sample(list(all_indices), args.initial_budget)
    sampler = data.sampler.SubsetRandomSampler(initial_indices)
    #print(args.batch_size, sampler)

    # dataset with labels available
    querry_dataloader = data.DataLoader(train_dataset, sampler=sampler,
                                        batch_size=args.batch_size,
                                        drop_last=False, **kwargs)
    print('Sampler size =', len(querry_dataloader))
    solver = Solver(args, test_dataloader)

    splits = range(1, 11)

    current_indices = list(initial_indices)
    accuracies = []

    for split in splits:
        print("Split =", split)
        # need to retrain all the models on the new images
        # re initialize and retrain the models
        #task_model = vgg.vgg16_bn(num_classes=args.num_classes)
        if args.dataset == 'MNIST':
            task_model = model.LeNet(num_classes=args.num_classes)
        elif args.dataset == 'SVHN':
            task_model = resnet.resnet10(num_classes=args.num_classes)
        elif args.dataset == 'ImageNet':
            task_model = resnet.resnet18(num_classes=args.num_classes)
        else:
            # BUGFIX: the original only printed a warning here and fell
            # through to an unbound `task_model` (NameError at train time);
            # keep the message but fail fast.
            print('WRONG DATASET!')
            raise NotImplementedError(args.dataset)

        # loading pretrained
        if args.pretrained:
            print("Loading pretrained model", args.pretrained)
            checkpoint = torch.load(args.pretrained)
            # copy all but last linear layers
            task_model.load_state_dict(
                {k: v for k, v in checkpoint['state_dict'].items()
                 if 'fc' not in k},
                strict=False)

        # NOTE(review): reconstructed as live code — solver.train() below
        # requires a bound `vae`; confirm against the original layout.
        vae = model.VAE(z_dim=args.latent_dim, nc=args.num_channels,
                        s=args.arch_scaler)
        discriminator = model.Discriminator(z_dim=args.latent_dim,
                                            s=args.arch_scaler)

        #print("Sampling starts")
        unlabeled_indices = np.setdiff1d(list(all_indices), current_indices)
        unlabeled_sampler = data.sampler.SubsetRandomSampler(unlabeled_indices)
        unlabeled_dataloader = data.DataLoader(train_dataset,
                                               sampler=unlabeled_sampler,
                                               batch_size=args.batch_size,
                                               drop_last=False, **kwargs)

        #print("Train starts")
        # train the models on the current data
        acc, vae, discriminator = solver.train(querry_dataloader, task_model,
                                               vae, discriminator,
                                               unlabeled_dataloader)

        print('Final accuracy with {}% of data is: {:.2f}'.format(
            int(split * 100.0 * args.budget / args.num_images), acc))
        accuracies.append(acc)

        sampled_indices = solver.sample_for_labeling(vae, discriminator,
                                                     unlabeled_dataloader)
        current_indices = list(current_indices) + list(sampled_indices)
        sampler = data.sampler.SubsetRandomSampler(current_indices)
        querry_dataloader = data.DataLoader(train_dataset, sampler=sampler,
                                            batch_size=args.batch_size,
                                            drop_last=False, **kwargs)

    torch.save(accuracies, os.path.join(args.out_path, args.log_name))
elif sche_type == "auto": return LambdaLR(optim, lambda x: (((1 + np.cos(x * np.pi / epochs)) / 2) ** 1.0) * 0.9 + 0.1) else: return None if __name__ == "__main__": params_file = 'params.yml' params_file = check_file(params_file) params = Params('params.yml') # load params params.save_dir = os.path.join(os.getcwd(), params.save_dir) os.makedirs(params.save_dir, exist_ok=True) # build ouput directory device = select_device(params.device, batch_size=params.batch_size) # build GPU env init_seeds(1) train_loader, val_loader = get_loaders(params.input_dir, params.num_classes, params.img_size, params.batch_size, params.num_workers) net, ckpt = get_model(params) net = nn.DataParallel(net).to(device, non_blocking=True) ''' This CrossEntropyLoss implementation comes with a softmax activation function, which is not suitable for this multi-label classification situation ''' # loss = nn.CrossEntropyLoss() loss = nn.BCEWithLogitsLoss() if params.multilabels else nn.CrossEntropyLoss() ''' Adam optimizer has fastest training speed, but with Familiarity to data, SGD is recommanded '''