def forward(self, data, label, pro, former_avg):
    # One CBO step: compute the weighted particle average, relax every particle
    # toward it, and inject Brownian noise.
    avg_list = self.avg(data, label, pro)
    synchronize(self.num_process)
    if self.avg_choice:
        dist.all_reduce(avg_list, op=dist.ReduceOp.SUM)
        weighted_avg = avg_list[:self.dim] / avg_list[-1:]
    else:
        # Keep only the contribution of the process holding the minimal loss.
        loss_value = avg_list[-1:].clone()
        dist.all_reduce(loss_value, op=dist.ReduceOp.MIN)
        if loss_value != avg_list[-1:]:
            avg_list[:self.dim] = torch.zeros(self.dim).double()
            avg_list[-1:] = 0.0
        else:
            avg_list[-1:] = 1.0
        synchronize(self.num_process)
        dist.all_reduce(avg_list, op=dist.ReduceOp.SUM)
        weighted_avg = avg_list[:self.dim] / avg_list[-1:]

    # Consensus test: mean squared drift of the average is below the tolerance.
    norm = torch.norm(weighted_avg - former_avg)
    M = weighted_avg.size(0)
    flag = bool(norm ** 2 / M <= self.tol)

    # Analytic relaxation toward the average: X <- avg + (X - avg) * exp(-drift * dt).
    weighted_avg_list = weighted_avg.expand(self.num, -1)
    self.X = weighted_avg_list + (self.X - weighted_avg_list) * math.exp(-self.drift * self.timestep)

    if self.noise_choice:
        # noise_term is kept for the commented anisotropic variant below.
        noise_term = torch.randn(self.num * self.num_process, self.dim)[
            self.rank * self.num:self.rank * self.num + self.num, ].double()
        # self.X += self.noise * math.sqrt(self.timestep) * (self.X - weighted_avg_list).mul(noise_term)
        bm = torch.randn(self.num * self.num_process, self.dim)[
            self.rank * self.num:self.rank * self.num + self.num, ].double()
        self.X += self.noise * math.sqrt(self.timestep) * bm

    flag_noise = False
    if flag:
        # Consensus reached: re-inject Brownian noise so the particles keep exploring.
        bm = torch.randn(self.num * self.num_process, self.dim)[
            self.rank * self.num:self.rank * self.num + self.num, ].double()
        self.X += self.noise * math.sqrt(self.timestep) * bm
        flag_noise = True
        if self.rank == 0:
            print("noise!")
    return weighted_avg, flag_noise
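# A hypothetical driver step for the forward() method above (the names `opt`, `tol`,
# `problem`, and the data variables are assumptions for illustration): repeat the step
# until the weighted average stops moving, as the experiment mains further below do.
former = weighted_avg.clone()
weighted_avg, noisy = opt.forward(data, label, problem, former)
if torch.norm(weighted_avg - former) ** 2 / weighted_avg.size(0) <= tol:
    print("consensus reached")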
def main():
    parser = argparse.ArgumentParser(description="RetinaNet")
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument("--start_epoch", type=int, default=1)
    parser.add_argument("--dist", action="store_true", default=False)
    args = parser.parse_args()

    if args.dist:
        torch.cuda.set_device(args.local_rank)
        dist.init_process_group(backend="nccl", init_method="env://")
        utils.synchronize()
    train(args.dist, args.start_epoch, args.local_rank)
def main(): # os.environ["CUDA_VISIBLE_DEVICES"]="1" parser = argparse.ArgumentParser(description="ATSS") parser.add_argument("--local_rank", type=int, default=0) parser.add_argument("--start_epoch", type=int, default=1) parser.add_argument("--dist", action="store_true") args = parser.parse_args() if (args.dist): torch.cuda.set_device(args.local_rank) dist.init_process_group(backend="nccl", init_method="env://") utils.synchronize() train(args.dist, args.start_epoch, args.local_rank)
def main(): # os.environ["CUDA_VISIBLE_DEVICES"]="0" parser = argparse.ArgumentParser(description="FCOS") parser.add_argument("--local_rank", type=int, default=0) gpu_nums = torch.cuda.device_count() is_dist = gpu_nums > 1 args = parser.parse_args() if (is_dist): torch.cuda.set_device(args.local_rank) dist.init_process_group(backend="nccl", init_method="env://") utils.synchronize() train(is_dist, args.local_rank)
def main_dist(uid: str, **kwargs):
    """
    uid is a unique identifier for the experiment name.
    It can be kept the same as a previous run; by default execution
    resumes from the latest saved model.
    **kwargs allows arbitrary cfg entries to be overridden.
    """
    cfg = conf
    num_gpus = torch.cuda.device_count()
    cfg.num_gpus = num_gpus

    if num_gpus > 1:
        if 'local_rank' in kwargs:
            # We are doing distributed parallel
            cfg.do_dist = True
            torch.cuda.set_device(kwargs['local_rank'])
            torch.distributed.init_process_group(
                backend="nccl", init_method="env://"
            )
            synchronize()
        else:
            # We are doing data parallel
            cfg.do_dist = False

    # Update the config depending on the command line args
    cfg = update_from_dict(cfg, kwargs, key_maps)

    # Freeze the cfg; it can no longer be changed
    cfg.freeze()
    # print(cfg)

    # Initialize learner
    learn = learner_init(uid, cfg)

    # Train or Test
    if not (cfg.only_val or cfg.only_test):
        learn.fit(epochs=cfg.epochs, lr=cfg.lr)
    else:
        if cfg.only_val:
            learn.testing(learn.data.valid_dl)
        if cfg.only_test:
            learn.testing(learn.data.test_dl)
def inference(model, data_loader, dataset_name, device='cuda', output_folder=None,
              expected_results=(), expected_results_sigma_tol=4):
    device = torch.device(device)
    num_devices = get_world_size()
    logger = logging.getLogger("RetinaNet.inference")
    dataset = data_loader.dataset
    logger.info("Start evaluation on {} dataset({} images).".format(dataset_name, len(dataset)))

    total_timer = Timer()
    inference_timer = Timer()
    total_timer.tic()
    predictions = compute_on_dataset(model, data_loader, device, inference_timer)
    # wait for all processes to complete before measuring the time
    synchronize()
    total_time = total_timer.toc()
    total_time_str = get_time_str(total_time)
    logger.info(
        "Total run time: {} ({} s / img per device, on {} devices)".format(
            total_time_str, total_time * num_devices / len(dataset), num_devices
        )
    )

    predictions = accumulate_predictions_from_multiple_gpus(predictions)
    if not is_main_process():
        return

    if output_folder:
        torch.save(predictions, os.path.join(output_folder, "predictions.pth"))

    extra_args = dict(
        expected_results=expected_results,
        expected_results_sigma_tol=expected_results_sigma_tol,
    )
    return evaluate(dataset=dataset, predictions=predictions,
                    output_folder=output_folder, **extra_args)
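# Many of the snippets here call utils.synchronize() / ptutil.synchronize() without
# showing its body. A minimal sketch of the usual maskrcnn-benchmark-style helper is
# given below; this is an assumption about the missing utility, not the verbatim source.
import torch.distributed as dist

def synchronize():
    """Barrier across all processes; a no-op when not running distributed."""
    if not dist.is_available():
        return
    if not dist.is_initialized():
        return
    if dist.get_world_size() == 1:
        return
    dist.barrier()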
input_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
])
data_kwargs = {'base_size': args.base_size, 'crop_size': args.crop_size,
               'transform': input_transform}
val_dataset = get_segmentation_dataset(args.dataset, split=args.split,
                                       mode=args.mode, **data_kwargs)
sampler = make_data_sampler(val_dataset, False, distributed)
batch_sampler = data.BatchSampler(sampler=sampler, batch_size=args.batch_size,
                                  drop_last=False)
val_data = data.DataLoader(val_dataset, shuffle=False, batch_sampler=batch_sampler,
                           num_workers=args.num_workers)

metric = SegmentationMetric(val_dataset.num_class)
metric = validate(model, val_data, metric, device)
ptutil.synchronize()
pixAcc, mIoU = ptutil.accumulate_metric(metric)
if ptutil.is_main_process():
    print('pixAcc: %.4f, mIoU: %.4f' % (pixAcc, mIoU))
def training(self):
    self.net.train()
    save_to_disk = ptutil.get_rank() == 0
    start_training_time = time.time()
    trained_time = 0
    mIoU = 0
    best_miou = 0
    tic = time.time()
    end = time.time()
    iteration, max_iter = 0, self.max_iter
    save_iter = self.per_iter * self.config.TRAIN.SAVE_EPOCH
    eval_iter = self.per_iter * self.config.TRAIN.EVAL_EPOCHS
    self.logger.info("Start training, total epochs {:3d} = total iteration: {:6d}"
                     .format(self.config.TRAIN.EPOCHS, max_iter))

    for i, (image, target) in enumerate(self.train_loader):
        iteration += 1
        self.scheduler.step()
        self.optimizer.zero_grad()
        image = image.to(self.device, dtype=self.dtype)
        target = target.to(self.device)
        if not self.config.DATASET.IMG_TRANSFORM:
            image = image.permute(0, 3, 1, 2)
        outputs = self.net(image)
        loss_dict = self.criterion(outputs, target)

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = ptutil.reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss = sum(loss for loss in loss_dict.values())
        if self.config.TRAIN.MIXED_PRECISION:
            with amp.scale_loss(loss, self.optimizer) as scale_loss:
                scale_loss.backward()
        else:
            loss.backward()
        self.optimizer.step()

        trained_time += time.time() - end
        end = time.time()
        if iteration % self.config.TRAIN.LOG_STEP == 0:
            eta_seconds = int((trained_time / iteration) * (max_iter - iteration))
            log_str = ["Iteration {:06d} , Lr: {:.5f}, Cost: {:.2f}s, Eta: {}"
                       .format(iteration, self.optimizer.param_groups[0]['lr'],
                               time.time() - tic,
                               str(datetime.timedelta(seconds=eta_seconds))),
                       "total_loss: {:.3f}".format(losses_reduced.item())]
            log_str = ', '.join(log_str)
            self.logger.info(log_str)
            tic = time.time()

        if save_to_disk and iteration % save_iter == 0:
            model_path = os.path.join(self.config.TRAIN.SAVE_DIR, "{}_{}_{}_iter_{:06d}.pth"
                                      .format(self.config.MODEL.NAME, self.config.TRAIN.SEG_LOSS,
                                              self.config.DATASET.NAME, iteration))
            ptutil.save_model(self.net, model_path, self.logger)

        # Evaluate during training to trace how the mIoU changes.
        if self.config.TRAIN.EVAL_EPOCHS > 0 and iteration % eval_iter == 0 and not iteration == max_iter:
            metrics = ptutil.validate(self.net, self.valid_loader, self.metric, self.device, self.config)
            ptutil.synchronize()
            pixAcc, mIoU = ptutil.accumulate_metric(metrics)
            if mIoU is not None and mIoU >= best_miou:
                best_miou = mIoU
                model_path = os.path.join(self.config.TRAIN.SAVE_DIR, "{}_{}_{}_best.pth"
                                          .format(self.config.MODEL.NAME, self.config.TRAIN.SEG_LOSS,
                                                  self.config.DATASET.NAME))
                ptutil.save_model(self.net, model_path, self.logger)
            if pixAcc is not None:
                self.logger.info('pixAcc: {:.4f}, mIoU: {:.4f}'.format(pixAcc, mIoU))
            self.net.train()

    if save_to_disk:
        model_path = os.path.join(self.config.TRAIN.SAVE_DIR, "{}_{}_{}_iter_{:06d}.pth"
                                  .format(self.config.MODEL.NAME, self.config.TRAIN.SEG_LOSS,
                                          self.config.DATASET.NAME, max_iter))
        ptutil.save_model(self.net, model_path, self.logger)

    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    self.logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))

    # eval after training
    if not self.config.TRAIN.SKIP_EVAL:
        metrics = ptutil.validate(self.net, self.valid_loader, self.metric, self.device, self.config)
        ptutil.synchronize()
        pixAcc, mIoU = ptutil.accumulate_metric(metrics)
        if pixAcc is not None:
            self.logger.info('After training, pixAcc: {:.4f}, mIoU: {:.4f}'.format(pixAcc, mIoU))
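# ptutil.reduce_loss_dict is used above but not shown. A sketch of the usual
# maskrcnn-benchmark-style implementation it presumably follows (an assumption,
# not this repo's verbatim code): reduce each loss onto rank 0 and average it,
# so the logged values reflect all GPUs.
import torch
import torch.distributed as dist

def reduce_loss_dict(loss_dict):
    world_size = dist.get_world_size() if dist.is_initialized() else 1
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        names = sorted(loss_dict.keys())        # fixed order across processes
        all_losses = torch.stack([loss_dict[k] for k in names], dim=0)
        dist.reduce(all_losses, dst=0)          # sum onto rank 0
        if dist.get_rank() == 0:
            all_losses /= world_size            # average on the logging rank
        return {k: v for k, v in zip(names, all_losses)}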
semart_val_dataloader = DataLoader(dataset=semart_val, batch_size=args.batch_size,
                                   drop_last=False, num_workers=4)
wpi_dataloader = DataLoader(dataset=wpi_data, batch_size=args.batch_size,
                            drop_last=False, num_workers=4)

if int(os.environ["WORLD_SIZE"]) > 1:
    torch.distributed.init_process_group(backend="nccl", init_method="env://")
    print("world size: {}".format(os.environ["WORLD_SIZE"]))
    print("rank: {}".format(args.local_rank))
    synchronize()

if int(os.environ["WORLD_SIZE"]) > 1:
    # The model must live on the target GPU before it is wrapped in DDP.
    combined_model = torch.nn.parallel.DistributedDataParallel(
        CombinedModel(len(vectorizer.vocabulary_), device, args.resnet, l2_norm=True).cuda(),
        device_ids=[args.local_rank], output_device=args.local_rank)
else:
    combined_model = CombinedModel(len(vectorizer.vocabulary_), device, args.resnet,
                                   l2_norm=True).to(device)
def main(rank, args): """ Parameters ---------- rank : int Subprocess id args : dict Configuration """ if rank == 0: t1 = time.time() set_random_seed(args['seed']) # Remove the line below will result in problems for multiprocess torch.set_num_threads(1) # Setup dataset and data loader dataset = MoleculeDataset(args['dataset'], args['order'], ['train', 'val'], subset_id=rank, n_subsets=args['num_processes']) # Note that currently the batch size for the loaders should only be 1. train_loader = DataLoader(dataset.train_set, batch_size=args['batch_size'], shuffle=True, collate_fn=dataset.collate) val_loader = DataLoader(dataset.val_set, batch_size=args['batch_size'], shuffle=True, collate_fn=dataset.collate) if rank == 0: try: from tensorboardX import SummaryWriter writer = SummaryWriter(args['log_dir']) except ImportError: print( 'If you want to use tensorboard, install tensorboardX with pip.' ) writer = None train_printer = Printer(args['nepochs'], len(dataset.train_set), args['batch_size'], writer) val_printer = Printer(args['nepochs'], len(dataset.val_set), args['batch_size']) else: val_printer = None # Initialize model model = DGMG(atom_types=dataset.atom_types, bond_types=dataset.bond_types, node_hidden_size=args['node_hidden_size'], num_prop_rounds=args['num_propagation_rounds'], dropout=args['dropout']) if args['num_processes'] == 1: from utils import Optimizer optimizer = Optimizer(args['lr'], Adam(model.parameters(), lr=args['lr'])) else: from utils import MultiProcessOptimizer optimizer = MultiProcessOptimizer( args['num_processes'], args['lr'], Adam(model.parameters(), lr=args['lr'])) if rank == 0: t2 = time.time() best_val_prob = 0 # Training for epoch in range(args['nepochs']): model.train() if rank == 0: print('Training') for i, data in enumerate(train_loader): log_prob = model(actions=data, compute_log_prob=True) prob = log_prob.detach().exp() loss_averaged = -log_prob prob_averaged = prob optimizer.backward_and_step(loss_averaged) if rank == 0: train_printer.update(epoch + 1, loss_averaged.item(), prob_averaged.item()) synchronize(args['num_processes']) # Validation val_log_prob = evaluate(epoch, model, val_loader, val_printer) if args['num_processes'] > 1: dist.all_reduce(val_log_prob, op=dist.ReduceOp.SUM) val_log_prob /= args['num_processes'] # Strictly speaking, the computation of probability here is different from what is # performed on the training set as we first take an average of log likelihood and then # take the exponentiation. By Jensen's inequality, the resulting value is then a # lower bound of the real probabilities. 
        val_prob = (-val_log_prob).exp().item()
        val_log_prob = val_log_prob.item()

        if val_prob >= best_val_prob:
            if rank == 0:
                torch.save({'model_state_dict': model.state_dict()},
                           args['checkpoint_dir'])
                print('Old val prob {:.10f} | new val prob {:.10f} | model saved'
                      .format(best_val_prob, val_prob))
            best_val_prob = val_prob
        elif epoch >= args['warmup_epochs']:
            optimizer.decay_lr()

        if rank == 0:
            print('Validation')
            if writer is not None:
                writer.add_scalar('validation_log_prob', val_log_prob, epoch)
                writer.add_scalar('validation_prob', val_prob, epoch)
                writer.add_scalar('lr', optimizer.lr, epoch)
            print('Validation log prob {:.4f} | prob {:.10f}'.format(
                val_log_prob, val_prob))

        synchronize(args['num_processes'])

    if rank == 0:
        t3 = time.time()
        print('It took {} to setup.'.format(datetime.timedelta(seconds=t2 - t1)))
        print('It took {} to finish training.'.format(
            datetime.timedelta(seconds=t3 - t2)))
        print('--------------------------------------------------------------------------')
        print('On average, an epoch takes {}.'.format(
            datetime.timedelta(seconds=(t3 - t2) / args['nepochs'])))
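# The molecule-generation and CBO snippets call synchronize(num_processes) with an
# explicit process count, unlike the no-argument barrier helper sketched earlier.
# A plausible sketch of that variant (an assumption, since the helper's body is
# not shown here):
import torch.distributed as dist

def synchronize(num_processes):
    """Block until all processes reach this point; skip the barrier for one process."""
    if num_processes > 1:
        dist.barrier()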
def main(rank, dev_id, args):
    set_seed()
    # Removing the line below results in problems for multiprocess training.
    if args['num_devices'] > 1:
        torch.set_num_threads(1)

    if dev_id == -1:
        args['device'] = torch.device('cpu')
    else:
        args['device'] = torch.device('cuda:{}'.format(dev_id))
        # Set current device
        torch.cuda.set_device(args['device'])

    train_set, val_set = load_dataset(args)
    get_center_subset(train_set, rank, args['num_devices'])
    train_loader = DataLoader(train_set, batch_size=args['batch_size'],
                              collate_fn=collate_center, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=args['batch_size'],
                            collate_fn=collate_center, shuffle=False)

    model = WLNReactionCenter(node_in_feats=args['node_in_feats'],
                              edge_in_feats=args['edge_in_feats'],
                              node_pair_in_feats=args['node_pair_in_feats'],
                              node_out_feats=args['node_out_feats'],
                              n_layers=args['n_layers'],
                              n_tasks=args['n_tasks']).to(args['device'])
    model.train()
    if rank == 0:
        print('# trainable parameters in the model: ', count_parameters(model))

    criterion = BCEWithLogitsLoss(reduction='sum')
    optimizer = Adam(model.parameters(), lr=args['lr'])
    if args['num_devices'] <= 1:
        from utils import Optimizer
        optimizer = Optimizer(model, args['lr'], optimizer,
                              max_grad_norm=args['max_norm'])
    else:
        from utils import MultiProcessOptimizer
        optimizer = MultiProcessOptimizer(args['num_devices'], model, args['lr'],
                                          optimizer, max_grad_norm=args['max_norm'])

    total_iter = 0
    rank_iter = 0
    grad_norm_sum = 0
    loss_sum = 0
    dur = []

    for epoch in range(args['num_epochs']):
        t0 = time.time()
        for batch_id, batch_data in enumerate(train_loader):
            total_iter += args['num_devices']
            rank_iter += 1
            batch_reactions, batch_graph_edits, batch_mol_graphs, \
                batch_complete_graphs, batch_atom_pair_labels = batch_data
            labels = batch_atom_pair_labels.to(args['device'])
            pred, biased_pred = reaction_center_prediction(
                args['device'], model, batch_mol_graphs, batch_complete_graphs)
            loss = criterion(pred, labels) / len(batch_reactions)
            loss_sum += loss.cpu().detach().data.item()
            grad_norm_sum += optimizer.backward_and_step(loss)

            if rank_iter % args['print_every'] == 0 and rank == 0:
                progress = 'Epoch {:d}/{:d}, iter {:d}/{:d} | ' \
                           'loss {:.4f} | grad norm {:.4f}'.format(
                               epoch + 1, args['num_epochs'], batch_id + 1,
                               len(train_loader), loss_sum / args['print_every'],
                               grad_norm_sum / args['print_every'])
                print(progress)
                grad_norm_sum = 0
                loss_sum = 0

            if total_iter % args['decay_every'] == 0:
                optimizer.decay_lr(args['lr_decay_factor'])
                if rank == 0:
                    if epoch >= 1:
                        dur.append(time.time() - t0)
                        print('Training time per {:d} iterations: {:.4f}'.format(
                            rank_iter, np.mean(dur)))
                    total_samples = total_iter * args['batch_size']
                    prediction_summary = \
                        'total samples {:d}, (epoch {:d}/{:d}, iter {:d}/{:d}) '.format(
                            total_samples, epoch + 1, args['num_epochs'],
                            batch_id + 1, len(train_loader)) + \
                        reaction_center_final_eval(args, args['top_ks_val'], model,
                                                   val_loader, easy=True)
                    print(prediction_summary)
                    with open(args['result_path'] + '/val_eval.txt', 'a') as f:
                        f.write(prediction_summary)
                    torch.save({'model_state_dict': model.state_dict()},
                               args['result_path'] + '/model_{:d}.pkl'.format(total_samples))
                    t0 = time.time()
                    model.train()

        synchronize(args['num_devices'])
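# MultiProcessOptimizer is imported from a local utils module that is not shown.
# A plausible sketch of the pattern it implements (an assumption; the signature is
# inferred from the call sites above): average the gradients across processes, clip
# them, then step, returning the gradient norm as backward_and_step does above.
import torch.distributed as dist
from torch.nn.utils import clip_grad_norm_

class MultiProcessOptimizer:
    def __init__(self, n_processes, model, lr, optimizer, max_grad_norm=None):
        self.n_processes = n_processes
        self.model = model
        self.lr = lr
        self.optimizer = optimizer
        self.max_grad_norm = max_grad_norm

    def backward_and_step(self, loss):
        self.optimizer.zero_grad()
        loss.backward()
        # Average gradients over all processes so every replica takes the same step.
        for p in self.model.parameters():
            if p.grad is not None:
                dist.all_reduce(p.grad.data, op=dist.ReduceOp.SUM)
                p.grad.data /= self.n_processes
        grad_norm = 0.0
        if self.max_grad_norm is not None:
            grad_norm = clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
        self.optimizer.step()
        return grad_norm

    def decay_lr(self, factor):
        self.lr *= factor
        for group in self.optimizer.param_groups:
            group['lr'] = self.lr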
def main(rank, args):
    set_random_seed(args['seed'])
    torch.set_num_threads(1)
    device = 'cuda' if (torch.cuda.is_available() and args['gpu']) else 'cpu'
    if rank == 0:
        print("Preparing data...")

    if args['dataset'] == "mnist":
        transform = transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))])
        data_train = datasets.MNIST(root="./data_cache/", transform=transform,
                                    train=True, download=True)
        data_test = datasets.MNIST(root="./data_cache/", transform=transform,
                                   train=False)
    else:
        dataset = Data(args['dataset'], args['repeat_time'], args['seed'],
                       args['n_dim'], args['sparsity'], rank, args['reg_fista'],
                       args['iteration'], args['tolerance_fista'])

    res_loss = torch.zeros(args['repeat_time']).double()
    res_acc = torch.zeros(args['repeat_time']).double()
    log_loss = list()
    log_loss_ave = list()
    log_acc = list()
    log_loss_train = list()
    log_loss_ave_train = list()
    log_acc_train = list()

    '''
    gisette: sparse logistic regression on the Gisette dataset,
    picture_reconstruction: optimizing the angles of the Radon transform of the
        Lenna image,
    compressive_sensing: the same numerical experiment as the one in the New PIHT
        paper by Zhang Xiaoqun,
    mnist: numerical experiments with optimizing neural networks on the MNIST dataset,
    ras: numerical experiments with the Rastrigin function optimization problem
    '''
    if args['dataset'] == "gisette":
        # preparing data...
        data_valid, label_valid = dataset.data_valid, dataset.label_valid
        data_valid, label_valid = data_valid.to(device), label_valid.to(device)
        data_train, label_train = dataset.data, dataset.label
        data_train, label_train = data_train.to(device), label_train.to(device)
        feature_dim = dataset.feature_dim
        trainloader = DataLoader(dataset, batch_size=args['batch_loss'], shuffle=True)
        weighted_avg = torch.zeros(1, feature_dim + 1).double().to(device)

        # initialize CBO optimizer
        if args['num_processes'] == 1 or device == 'cuda':
            optimizer = CBO_optimizer(num=args['num_particle'], dim=feature_dim + 1,
                                      drift=args['drift'], noise=args['noise'],
                                      temp=args['temperature'],
                                      timestep=args['timestep'],
                                      tol=args['tolerance'], seed=args['seed'],
                                      batch_avg=args['batch_avg'],
                                      avg_choice=args['avg_choice'],
                                      noise_choice=args['noise_choice'],
                                      device=device, lam_reg=args['reg'],
                                      batch_loss=args['batch_loss'])
        else:
            optimizer = MultiprocessCBO(num_process=args['num_processes'], rank=rank,
                                        num=args['num_particle'], dim=feature_dim + 1,
                                        drift=args['drift'], noise=args['noise'],
                                        temp=args['temperature'],
                                        timestep=args['timestep'],
                                        tol=args['tolerance'], seed=args['seed'],
                                        batch_avg=args['batch_avg'],
                                        avg_choice=args['avg_choice'],
                                        noise_choice=args['noise_choice'],
                                        device=device, lam_reg=args['reg'],
                                        batch_loss=args['batch_loss'])

        # optimizing...
if rank == 0: print("Training...") epo_flag = False time_start = time.time() for i in range(args['epoch']): for train_batch, batch in enumerate(trainloader): if rank == 0: t1 = time.time() data, label = batch_loader(batch) data, label = data.to(device), label.to(device) former = copy.deepcopy(weighted_avg) weighted_avg, noise_flag = optimizer.forward( data, label, args['problem'], weighted_avg) # synchronize(args['num_processes']) if rank == 0: t2 = time.time() # noise_flag: whether to introduce BM in CBO # log recent results ''' if noise_flag and rank == 0: write_res('./results/train_loss_NCBO_logistic_log_loss_' + str(args['num_particle']) + '.npy', log_loss_train) write_res('./results/train_loss_NCBO_logistic_log_acc_' + str(args['num_particle']) + '.npy', log_acc_train) write_res('./results/loss_NCBO_logistic_log_loss_' + str(args['num_particle']) + '.npy', log_loss) write_res('./results/loss_NCBO_ori_logistic_log_acc_' + str(args['num_particle']) + '.npy', log_acc) ''' # validation u = weighted_avg[:-1] v = weighted_avg[-1:] pred_train = batch_logistic_class(data_train, u, v) pred_train_cnt = (pred_train == label_train).sum().double() loss_train = test_logistic_l0(data_train, label_train, u, v, args['reg']) log_loss_train.append(loss_train.item()) log_acc_train.append(pred_train_cnt.item() / data_train.size(0)) pred = batch_logistic_class(data_valid, u, v) pred_cnt = (pred == label_valid).sum().double() loss = test_logistic_l0(data_valid, label_valid, u, v, args['reg']) log_loss.append(loss.item()) log_acc.append(pred_cnt.item() / data_valid.size(0)) if rank == 0: print( "epoch: {:d} | iter: {:d} | loss: {:f} | validation precision: {:f}" .format(i, train_batch, loss.item(), pred_cnt.item() / data_valid.size(0))) synchronize(args['num_processes']) if rank == 0: t3 = time.time() print("training time: {:f}".format(t2 - t1)) print("test time: {:f}".format(t3 - t2)) # check whether to reach the stopping criterion delta_error = torch.norm(weighted_avg - former) err = delta_error * delta_error / weighted_avg.size(0) if err <= args['tolerance_stop']: # print(former_avg) # average, variance = var(former_avg) # print(np.linalg.norm(former_avg), average, variance) if rank == 0: print("Consensus!") epo_flag = True break if epo_flag: optimizer._initialize_particles() break time_end = time.time() if rank == 0: print("total time: {:f}".format(time_end - time_start)) print(min(log_loss_train)) print(max(log_acc_train)) print(min(log_loss)) print(max(log_acc)) write_res( './results/train_loss_RCBO_logistic_log_loss_' + str(args['num_particle']) + '.npy', log_loss_train) write_res( './results/train_loss_RCBO_logistic_log_acc_' + str(args['num_particle']) + '.npy', log_acc_train) write_res( './results/loss_RCBO_logistic_log_loss_' + str(args['num_particle']) + '.npy', log_loss) write_res( './results/loss_RCBO_logistic_log_acc_' + str(args['num_particle']) + '.npy', log_acc) print("Results written!") sys.exit("break!") elif args['dataset'] == "picture_reconstruction": data, label, theta = dataset.data, dataset.label, dataset.theta weighted_avg = torch.zeros(theta.size(0)).double() # initialize CBO optimizer if args['num_processes'] == 1: optimizer = CBO_optimizer(num=args['num_particle'], dim=theta.size(0), drift=args['drift'], noise=args['noise'], temp=args['temperature'], timestep=args['timestep'], tol=args['tolerance'], seed=args['seed'], batch_avg=args['batch_avg'], avg_choice=args['avg_choice'], noise_choice=args['noise_choice'], device=device, lam_reg=args['reg'], batch_loss=args['batch_loss']) 
else: optimizer = MultiprocessCBO(num_process=args['num_processes'], rank=rank, num=args['num_particle'], dim=theta.size(0), drift=args['drift'], noise=args['noise'], temp=args['temperature'], timestep=args['timestep'], tol=args['tolerance'], seed=args['seed'], batch_avg=args['batch_avg'], avg_choice=args['avg_choice'], noise_choice=args['noise_choice'], device=device, lam_reg=args['reg'], batch_loss=args['batch_loss']) optimizer._normalize_particles() optimizer.X *= 179.0 # optimizing... if rank == 0: print("Training...") for i in range(args['epoch']): if rank == 0: t1 = time.time() former = copy.deepcopy(weighted_avg) weighted_avg, noise_flag = optimizer.forward( data, label, args['problem'], weighted_avg) synchronize(args['num_processes']) if rank == 0: t2 = time.time() # validation acc = theta_acc(weighted_avg, theta) loss = loss_recons(weighted_avg, data, label) log_loss.append(loss) log_acc.append(acc) if rank == 0: print( "epoch: {:d} | loss: {:f} | validation error: {:f}".format( i, loss, acc)) synchronize(args['num_processes']) if rank == 0: t3 = time.time() print("training time: {:f}".format(t2 - t1)) print("test time: {:f}".format(t3 - t2)) # check whether to reach the stopping criterion delta_error = torch.norm(weighted_avg - former) err = delta_error**2 / weighted_avg.size(0) print("error: ", err) if err <= args['tolerance_stop']: optimizer._initialize_particles() # print(former_avg) # average, variance = var(former_avg) # print(np.linalg.norm(former_avg), average, variance) if rank == 0: print("Consensus!") break if rank == 0: write_res( './results/pic_loss_' + str(args['num_particle']) + '.npy', log_loss) write_res( './results/pic_acc_' + str(args['num_particle']) + '.npy', log_acc) print("Results written!") sys.exit("break!") elif args['dataset'] == "compressive_sensing": # data for compressive sensing data, label, observe, initial, L = dataset.data, dataset.label, dataset.observe, dataset.initial, dataset.L n_dim = dataset.sig_dim # initialize CBO optimizer if rank == 0: print("initialize CBO optimizer...") if args['num_processes'] == 1: optimizer = CBO_optimizer(num=args['num_particle'], dim=n_dim, drift=args['drift'], noise=args['noise'], temp=args['temperature'], timestep=args['timestep'], tol=args['tolerance'], seed=args['seed'], batch_avg=args['batch_avg'], avg_choice=args['avg_choice'], noise_choice=args['noise_choice'], device=device, lam_reg=args['reg'], batch_loss=args['batch_loss']) else: optimizer = MultiprocessCBO(num_process=args['num_processes'], rank=rank, num=args['num_particle'], dim=n_dim, drift=args['drift'], noise=args['noise'], temp=args['temperature'], timestep=args['timestep'], tol=args['tolerance'], seed=args['seed'], batch_avg=args['batch_avg'], avg_choice=args['avg_choice'], noise_choice=args['noise_choice'], device=device, lam_reg=args['reg'], batch_loss=args['batch_loss']) # start training if rank == 0: print("start training...") for j in range(args['repeat_time']): A = data[j] sig = label[j] b = observe[j] # for initial value optimizer._initialize_particles(j) X_0 = initial[j] gamma = math.sqrt(2.0 * args['reg'] / (L[j] + args['timestep'])) print("L: ", L[j]) print("gamma: ", gamma) loss = loss_cs(X_0, A, b, args['reg']) acc = acc_cs(X_0, sig) if rank == 0: print("FISTA done!") print("FISTA loss: ", loss) print("FISTA accuracy: ", acc) # sys.exit("sys exit: FISTA done!") # customize initialization for CBO # optimizer._custom_initialization(X_0, args['cs_std']) # weighted_avg = X_0.squeeze(1) weighted_avg = torch.zeros(X_0.size(0)).double() # 
            synchronize(args['num_processes'])

            # optimizing...
            if rank == 0:
                print("training... | idx: {:d}".format(j))
            for epoch in range(args['epoch']):
                # CBO descent
                former = copy.deepcopy(weighted_avg)
                weighted_avg, noise_flag = optimizer.forward(
                    A, b, args['problem'], weighted_avg)
                # try a projection step after several CBO steps to preserve sparsity
                if epoch > 0 and epoch % 5 == 0:
                    # projection
                    proj_index = (weighted_avg >= gamma).double()
                    weighted_avg = weighted_avg * proj_index
                    # optimizer.X = optimizer.X * proj_index
                    print("projection!", (weighted_avg < gamma).sum().item())
                # log recent results
                if noise_flag and rank == 0:
                    write_res('./results/cs_log_loss_' + str(args['num_particle']) +
                              '.npy', log_loss)
                    write_res('./results/cs_log_acc_' + str(args['num_particle']) +
                              '.npy', log_acc)
                # validation
                loss = loss_cs(weighted_avg.unsqueeze(1), A, b, args['reg'])
                acc = acc_cs(weighted_avg.unsqueeze(1), sig)
                log_loss.append(loss)
                log_acc.append(acc)
                if rank == 0:
                    print("loss: ", loss)
                    print("idx: {:d} | epoch: {:d} | error: {:f}".format(
                        j, epoch, acc))
                # check whether the stopping criterion is reached
                delta_error = torch.norm(weighted_avg - former)
                err = delta_error ** 2 / weighted_avg.size(0)
                # print("delta error: ", delta_error)
                if err <= args['tolerance_stop']:
                    if rank == 0:
                        print("Consensus!")
                    break
                synchronize(args['num_processes'])

            if rank == 0:
                print("idx / n_data : {:d} / {:d} is done".format(
                    j, args['repeat_time']))
            optimizer._initialize_particles(j + 1)
            synchronize(args['num_processes'])
        sys.exit("sys exit: break!")

    elif args['dataset'] == "mnist":
        # preparing data...
        trainloader = DataLoader(data_train, batch_size=args['batch_loss'],
                                 shuffle=True)
        testloader = DataLoader(data_test, batch_size=args['batch_loss'],
                                shuffle=False)
        if args['problem'] == 'one_module':
            feature_dim = 784 * 10 + 10
        elif args['problem'] == 'two_module':
            feature_dim = 784 * 50 + 50 + 50 * 10 + 10
        weighted_avg = torch.zeros(1, feature_dim).double().to(device)

        # initialize CBO optimizer
        if args['num_processes'] == 1 or device == 'cuda':
            optimizer = CBO_optimizer(num=args['num_particle'], dim=feature_dim,
                                      drift=args['drift'], noise=args['noise'],
                                      temp=args['temperature'],
                                      timestep=args['timestep'],
                                      tol=args['tolerance'], seed=args['seed'],
                                      batch_avg=args['batch_avg'],
                                      avg_choice=args['avg_choice'],
                                      noise_choice=args['noise_choice'],
                                      device=device, lam_reg=args['reg'],
                                      batch_loss=args['batch_loss'])
        else:
            optimizer = MultiprocessCBO(num_process=args['num_processes'], rank=rank,
                                        num=args['num_particle'], dim=feature_dim,
                                        drift=args['drift'], noise=args['noise'],
                                        temp=args['temperature'],
                                        timestep=args['timestep'],
                                        tol=args['tolerance'], seed=args['seed'],
                                        batch_avg=args['batch_avg'],
                                        avg_choice=args['avg_choice'],
                                        noise_choice=args['noise_choice'],
                                        device=device, lam_reg=args['reg'],
                                        batch_loss=args['batch_loss'])

        # same initialization as the default weight initialization in PyTorch
        if args['problem'] == 'two_module':
            optimizer._kaiming_uniform_initialization()
        elif args['problem'] == 'one_module':
            optimizer._kaiming_1_uniform_initialization()

        # optimizing...
if rank == 0: print("Training...") epo_flag = False time_start = time.time() for i in range(args['epoch']): for train_batch, (data, label) in enumerate(trainloader): if rank == 0: t1 = time.time() former = copy.deepcopy(weighted_avg) data, label = data.to(device), label.to(device) weighted_avg, noise_flag = optimizer.forward( data, label, args['problem'], weighted_avg) # log recent results if (noise_flag and rank == 0) or (i % 5 == 0): write_res( './results_mnist/log_loss_train_' + str(args['num_particle']) + '.npy', log_loss_train) # write_res('./results_mnist/log_loss_ave_train_' + str(args['num_particle']) + '.npy', log_loss_ave_train) write_res( './results_mnist/log_acc_train' + str(args['num_particle']) + '.npy', log_acc_train) write_res( './results_mnist/nnew_noise_TM_log_loss_' + str(args['num_particle']) + '.npy', log_loss) write_res( './results_mnist/nnew_noise_TM_log_acc_' + str(args['num_particle']) + '.npy', log_acc) # synchronize(args['num_processes']) # print train loss & acc if args['problem'] == 'one_module': loss_train, acc_train = OneModule_test( data, label, weighted_avg, device) elif args['problem'] == 'two_module': loss_train, acc_train = TwoModule_test( data, label, weighted_avg, device) log_loss_train.append(loss_train) # log_loss_ave_train.append(train_loss_ave) log_acc_train.append(acc_train) if rank == 0: print( "epoch: {:d} | iter: {:d} | train loss: {:f} | train precision: {:f}" .format(i, train_batch, loss_train.item(), acc_train.item())) t2 = time.time() print("training time: {:f}".format(t2 - t1)) # validation loss_sum = 0 acc_sum = 0 loss_ave = 0 t3 = time.time() for val_batch, (data_val, label_val) in enumerate(testloader): data_val, label_val = data_val.to(device), label_val.to( device) if args['problem'] == 'one_module': loss_val, acc_val = OneModule_test( data_val, label_val, weighted_avg, device) elif args['problem'] == 'two_module': loss_val, acc_val = TwoModule_test( data_val, label_val, weighted_avg, device) loss_sum += loss_val.item() * label_val.size(0) acc_sum += acc_val.item() * label_val.size(0) loss_sum /= len(testloader.dataset) acc_sum /= len(testloader.dataset) log_loss.append(loss_sum) log_acc.append(acc_sum) t4 = time.time() if rank == 0: # print("loss: ", loss) print( "epoch: {:d} | loss: {:f} | validation precision: {:f}" .format(i, loss_sum, acc_sum)) print("test time: {:f}".format(t3 - t2)) synchronize(args['num_processes']) # check whether to reach the stopping criterion delta_error = torch.norm(weighted_avg - former) err = delta_error**2 / weighted_avg.size(0) if err <= args['tolerance_stop']: res_loss[0] = loss_sum res_acc[0] = acc_sum # print(former_avg) # average, variance = var(former_avg) # print(np.linalg.norm(former_avg), average, variance) if rank == 0: print("Consensus!") epo_flag = True break synchronize(args['num_processes']) if rank == 0: time_end = time.time() print('epoch time', time_end - time_start) # sys.exit("time!") write_res( './results_mnist/log_loss_train_' + str(args['num_particle']) + '.npy', log_loss_train) write_res( './results_mnist/log_acc_train' + str(args['num_particle']) + '.npy', log_acc_train) write_res( './results_mnist/nnew_noise_TM_log_loss_' + str(args['num_particle']) + '.npy', log_loss) write_res( './results_mnist/nnew_noise_TM_log_acc_' + str(args['num_particle']) + '.npy', log_acc) print("Results written!") sys.exit("break!") elif args['dataset'] == 'ras': # initilize... 
B = torch.tensor(args['B']).double() C = torch.tensor(args['C']).double() weighted_avg = torch.zeros(1, args['ras_dim']).double() # initialize CBO optimizer if args['num_processes'] == 1: optimizer = CBO_optimizer(num=args['num_particle'], dim=args['ras_dim'], drift=args['drift'], noise=args['noise'], temp=args['temperature'], timestep=args['timestep'], tol=args['tolerance'], seed=args['seed'], batch_avg=args['batch_avg'], avg_choice=args['avg_choice'], noise_choice=args['noise_choice'], device=device, lam_reg=args['reg'], batch_loss=args['batch_loss']) else: optimizer = MultiprocessCBO(num_process=args['num_processes'], rank=rank, num=args['num_particle'], dim=args['ras_dim'], drift=args['drift'], noise=args['noise'], temp=args['temperature'], timestep=args['timestep'], tol=args['tolerance'], seed=args['seed'], batch_avg=args['batch_avg'], avg_choice=args['avg_choice'], noise_choice=args['noise_choice'], device=device, lam_reg=args['reg'], batch_loss=args['batch_loss']) # optimizing... if rank == 0: print("Training...") log_gap_list = list() min_gap_list = list() success_sum = 0 for times in range(args['repeat_time']): log_gap = list() set_random_seed(times) optimizer._uniform_initialization_particles(times) acc_flag = False min_gap = 1e5 for i in range(args['epoch']): if rank == 0: t1 = time.time() former = copy.deepcopy(weighted_avg) weighted_avg, noise_flag = optimizer.forward( B, C, args['problem'], weighted_avg) flag_acc = 0 if rank == 0: t2 = time.time() # validation pred = rastrigin(weighted_avg, B, C) gap = torch.log(abs(pred - C)) log_gap.append(gap) min_gap = min(min_gap, torch.norm(weighted_avg - B)**2) if (abs(weighted_avg - B) < 0.25).all(): flag_acc = 1 acc_flag = True if rank == 0: # print("loss: ", loss) print( "time: {:d} | epoch: {:d} | loss: {:f} | success: {:d}" .format(times, i, gap.item(), flag_acc)) synchronize(args['num_processes']) if rank == 0: t3 = time.time() print("training time: {:f}".format(t2 - t1)) print("test time: {:f}".format(t3 - t2)) # if pred <= 0 or flag_acc: # break # print(weighted_avg) log_gap_list.append(log_gap) min_gap_list.append(min_gap) success_sum += flag_acc if rank == 0: if acc_flag: print("success!") else: print("failed!") write_res( './results/NNCBO_rastrigin_log_loss_' + str(args['num_particle']) + '_' + str(args['ras_dim']) + '.npy', log_gap_list) write_res( './results/NNCBO_rastrigin_min_gap_' + str(args['num_particle']) + '_' + str(args['ras_dim']) + '.npy', min_gap_list) print("Results written!") print(success_sum) sys.exit("break!")
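# rastrigin(x, B, C) is called in the ras branch above but not defined in this excerpt.
# The standard shifted Rastrigin objective it presumably evaluates is sketched below
# (an assumption, consistent with the success test |x - B| < 0.25 and the gap
# log|pred - C| used above): the global minimum value C is attained at x = B.
import math
import torch

def rastrigin(x, B, C):
    """Shifted Rastrigin objective with minimum value C at x = B."""
    d = x.size(-1)
    y = x - B
    return ((y ** 2) - 10.0 * torch.cos(2.0 * math.pi * y)).sum(dim=-1) + 10.0 * d + C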
async def main() -> None:
    semaphore = asyncio.Semaphore(value=3)
    await asyncio.gather(
        *(synchronize(semaphore)(fetch)(file, checkpoint=10000)
          for file in glob.glob(_path("../data/*.pgn"))))
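# The asyncio snippet above uses synchronize(semaphore) as a decorator factory that
# caps how many fetches run concurrently. A minimal sketch consistent with that call
# pattern (hypothetical; the real helper is not shown here):
import asyncio
import functools

def synchronize(semaphore: asyncio.Semaphore):
    def decorator(coro):
        @functools.wraps(coro)
        async def wrapper(*args, **kwargs):
            async with semaphore:  # at most `value` coroutines run at once
                return await coro(*args, **kwargs)
        return wrapper
    return decorator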
def training(self):
    self.seg_net.train()
    self.generator.train()
    self.feature_extracted.eval()
    for param in self.feature_extracted.parameters():
        param.requires_grad = False
    save_to_disk = ptutil.get_rank() == 0
    start_training_time = time.time()
    trained_time = 0
    best_miou = 0
    mean = torch.tensor([0.485, 0.456, 0.406]).float().cuda().view(1, 3, 1, 1)
    std = torch.tensor([0.229, 0.224, 0.225]).float().cuda().view(1, 3, 1, 1)
    tic = time.time()
    end = time.time()
    iteration, max_iter = 0, self.max_iter
    save_iter = self.per_iter * self.config.TRAIN.SAVE_EPOCH
    eval_iter = self.per_iter * self.config.TRAIN.EVAL_EPOCH
    # save_iter, eval_iter = 10, 10
    self.logger.info("Start training, total epochs {:3d} = total iteration: {:6d}"
                     .format(self.config.TRAIN.EPOCHS, max_iter))

    for i, (source_image, label) in enumerate(self.train_loader):
        iteration += 1
        self.scheduler.step()
        # self.optimizer.zero_grad()
        self.gen_scheduler.step()
        # self.gen_optimizer.zero_grad()
        source_image = source_image.to(self.device, dtype=self.dtype)
        label = label.to(self.device)
        try:
            _, batch = self.target_trainloader_iter.__next__()
        except StopIteration:
            self.target_trainloader_iter = enumerate(self.target_loader)
            _, batch = self.target_trainloader_iter.__next__()
        target_image = batch.to(self.device, dtype=self.dtype)

        if not self.config.DATASET.IMG_TRANSFORM:
            source_image = source_image.permute(0, 3, 1, 2)
            target_image = target_image.permute(0, 3, 1, 2)
            source_image_norm = ((source_image / 255) - mean) / std
            target_image_norm = ((target_image / 255) - mean) / std
        else:
            source_image_norm = source_image
            target_image_norm = target_image

        # AdaIN: re-normalize the source feature with the target feature statistics.
        source_feature = self.feature_extracted(source_image_norm)
        target_feature = self.feature_extracted(target_image_norm)
        target_feature_mean = torch.mean(target_feature, (2, 3), keepdim=True)
        target_feature_var = torch.std(target_feature, (2, 3), keepdim=True)
        source_feature_mean = torch.mean(source_feature, (2, 3), keepdim=True)
        source_feature_var = torch.std(source_feature, (2, 3), keepdim=True)
        adain_feature = ((source_feature - source_feature_mean) /
                         (source_feature_var + 0.00001)) * \
                        (target_feature_var + 0.00001) + target_feature_mean

        gen_image_norm = self.generator(adain_feature)
        gen_image = ((gen_image_norm * std) + mean) * 255
        gen_image_feature = self.feature_extracted(gen_image_norm)
        gen_image_feature_mean = torch.mean(gen_image_feature, (2, 3), keepdim=True)
        gen_image_feature_var = torch.std(gen_image_feature, (2, 3), keepdim=True)

        # Generator losses match: gen_image_feature <-> adain_feature,
        # gen_image_feature_mean <-> target_feature_mean,
        # gen_image_feature_var <-> target_feature_var.
        loss_feature_dict = self.gen_criterion(gen_image_feature, adain_feature)
        loss_mean_dict = self.gen_criterion(gen_image_feature_mean, target_feature_mean)
        loss_var_dict = self.gen_criterion(gen_image_feature_var, target_feature_var)

        loss_feature = sum(loss for loss in loss_feature_dict.values())
        loss_feature_dict_reduced = ptutil.reduce_loss_dict(loss_feature_dict)
        loss_feature_reduced = sum(loss for loss in loss_feature_dict_reduced.values())

        loss_mean = sum(loss for loss in loss_mean_dict.values())
        loss_mean_dict_reduced = ptutil.reduce_loss_dict(loss_mean_dict)
        loss_mean_reduced = sum(loss for loss in loss_mean_dict_reduced.values())

        loss_var = sum(loss for loss in loss_var_dict.values())
        loss_var_dict_reduced = ptutil.reduce_loss_dict(loss_var_dict)
        loss_var_reduced = sum(loss for loss in loss_var_dict_reduced.values())

        loss_gen = loss_feature + loss_mean + loss_var

        # train on the source image
        outputs = self.seg_net(source_image)
        source_seg_loss_dict = self.criterion(outputs, label)

        # train on the generated image
        gen_outputs = self.seg_net(gen_image)
        gen_seg_loss_dict = self.criterion(gen_outputs, label)

        # reduce losses over all GPUs for logging purposes
        outputs = outputs.detach()
        kl_loss_dict = self.kl_criterion(gen_outputs, outputs)
        source_seg_loss_dict_reduced = ptutil.reduce_loss_dict(source_seg_loss_dict)
        # print(type(loss_dict_reduced))
        source_seg_losses_reduced = sum(
            loss for loss in source_seg_loss_dict_reduced.values())
        source_seg_loss = sum(loss for loss in source_seg_loss_dict.values())
        # source_seg_loss.backward()
        gen_seg_loss_dict_reduced = ptutil.reduce_loss_dict(gen_seg_loss_dict)
        gen_seg_losses_reduced = sum(
            loss for loss in gen_seg_loss_dict_reduced.values())
        gen_seg_loss = sum(loss for loss in gen_seg_loss_dict.values())
        kl_loss_dict_reduced = ptutil.reduce_loss_dict(kl_loss_dict)
        kl_losses_reduced = sum(loss for loss in kl_loss_dict_reduced.values())
        kl_loss = sum(loss for loss in kl_loss_dict.values())
        loss_seg = source_seg_loss + gen_seg_loss + kl_loss * 10
        # loss_seg.backward(retain_graph=True)
        # loss = loss_gen + loss_seg
        # loss.backward()

        if self.config.TRAIN.MIXED_PRECISION:
            with amp.scale_loss(loss_gen, self.gen_optimizer, loss_id=1) as errGen_scale:
                errGen_scale.backward()
            with amp.scale_loss(loss_seg, self.optimizer, loss_id=2) as errSeg_scale:
                errSeg_scale.backward()
        else:
            loss = loss_gen + loss_seg
            loss.backward()

        # accumulate gradients over 8 iterations before stepping
        if iteration % 8 == 0:
            self.optimizer.step()
            self.gen_optimizer.step()
            self.optimizer.zero_grad()
            self.gen_optimizer.zero_grad()

        trained_time += time.time() - end
        end = time.time()
        if iteration % self.config.TRAIN.LOG_STEP == 0:
            eta_seconds = int((trained_time / iteration) * (max_iter - iteration))
            log_str = [
                "Iteration {:06d} , Lr: {:.5f}, Cost: {:.2f}s, Eta: {}".format(
                    iteration, self.optimizer.param_groups[0]['lr'],
                    time.time() - tic, str(datetime.timedelta(seconds=eta_seconds))),
                "source_seg_loss: {:.6f}, gen_seg_loss:{:.6f}, kl_loss:{:.6f}".format(
                    source_seg_losses_reduced.item(), gen_seg_losses_reduced.item(),
                    kl_losses_reduced.item() * 10),
                "feature_loss:{:.6f}, mean_loss:{:.6f}, var_loss:{:.6f}".format(
                    loss_feature_reduced.item(), loss_mean_reduced.item(),
                    loss_var_reduced.item())
            ]
            log_str = ', '.join(log_str)
            self.logger.info(log_str)
            tic = time.time()

        if save_to_disk and iteration % save_iter == 0:
            model_path = os.path.join(
                self.seg_dir, "{}_{}_{}_iter_{:06d}.pth".format(
                    self.config.MODEL.SEG_NET, self.config.TRAIN.SEG_LOSS,
                    self.config.DATASET.NAME, iteration))
            # self.save_model(model_path)
            ptutil.save_model(self.seg_net, model_path, self.logger)
            generator_path = os.path.join(
                self.generator_dir, '{}_{}_{}_iter_{:06d}.pth'.format(
                    self.config.MODEL.TARGET_GENERATOR, self.config.TRAIN.SEG_LOSS,
                    self.config.DATASET.NAME, iteration))
            # self.save_model_generator(generator_path)
            ptutil.save_model(self.generator, generator_path, self.logger)

        # Evaluate during training to trace the mIoU changes and see whether
        # performance improves.
        if self.config.TRAIN.EVAL_EPOCH > 0 and iteration % eval_iter == 0 and not iteration == max_iter:
            metrics = ptutil.validate(self.seg_net, self.valid_loader, self.metric,
                                      self.device, self.config)
            ptutil.synchronize()
            pixAcc, mIoU = ptutil.accumulate_metric(metrics)
            if mIoU is not None and mIoU >= best_miou:
                best_miou = mIoU
                model_path = os.path.join(
                    self.seg_dir, "{}_{}_{}_best.pth".format(
                        self.config.MODEL.SEG_NET, self.config.TRAIN.SEG_LOSS,
                        self.config.DATASET.NAME))
                ptutil.save_model(self.seg_net, model_path, self.logger)
                generator_path = os.path.join(
                    self.generator_dir, '{}_{}_{}_best.pth'.format(
                        self.config.MODEL.TARGET_GENERATOR, self.config.TRAIN.SEG_LOSS,
                        self.config.DATASET.NAME))
                ptutil.save_model(self.generator, generator_path, self.logger)
            if pixAcc is not None:
                self.logger.info('pixAcc: {:.4f}, mIoU: {:.4f}'.format(pixAcc, mIoU))
            self.seg_net.train()

    if save_to_disk:
        model_path = os.path.join(
            self.seg_dir, "{}_{}_{}_iter_{:06d}.pth".format(
                self.config.MODEL.SEG_NET, self.config.TRAIN.SEG_LOSS,
                self.config.DATASET.NAME, max_iter))
        ptutil.save_model(self.seg_net, model_path, self.logger)
        generator_path = os.path.join(
            self.generator_dir, '{}_{}_{}_iter_{:06d}.pth'.format(
                self.config.MODEL.TARGET_GENERATOR, self.config.TRAIN.SEG_LOSS,
                self.config.DATASET.NAME, max_iter))
        ptutil.save_model(self.generator, generator_path, self.logger)

    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    self.logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))

    # eval after training
    if not self.config.TRAIN.SKIP_EVAL:
        metrics = ptutil.validate(self.seg_net, self.valid_loader, self.metric,
                                  self.device, self.config)
        ptutil.synchronize()
        pixAcc, mIoU = ptutil.accumulate_metric(metrics)
        if pixAcc is not None:
            self.logger.info('After training, pixAcc: {:.4f}, mIoU: {:.4f}'.format(
                pixAcc, mIoU))
def training(self):
    self.net.train()
    save_to_disk = ptutil.get_rank() == 0
    start_training_time = time.time()
    trained_time = 0
    tic = time.time()
    end = time.time()
    iteration, max_iter = 0, self.args.max_iter
    save_iter = self.args.per_iter * self.args.save_epoch
    eval_iter = self.args.per_iter * self.args.eval_epoch
    # save_iter, eval_iter = self.args.per_iter * self.args.save_epoch, 10
    logger.info("Start training, total epochs {:3d} = total iteration: {:6d}"
                .format(self.args.epochs, max_iter))

    # TODO: add mixup
    for i, batch in enumerate(self.train_loader):
        iteration += 1
        self.scheduler.step()
        image = batch[0].to(self.device)
        fixed_targets = [batch[it].to(self.device) for it in range(1, 6)]
        gt_boxes = batch[6].to(self.device)
        self.optimizer.zero_grad()
        loss_dict = self.net(image, gt_boxes, *fixed_targets)

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = ptutil.reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss = sum(loss for loss in loss_dict.values())
        loss.backward()
        self.optimizer.step()

        trained_time += time.time() - end
        end = time.time()
        if iteration % args.log_step == 0:
            eta_seconds = int((trained_time / iteration) * (max_iter - iteration))
            log_str = ["Iteration {:06d} , Lr: {:.5f}, Cost: {:.2f}s, Eta: {}"
                       .format(iteration, self.optimizer.param_groups[0]['lr'],
                               time.time() - tic,
                               str(datetime.timedelta(seconds=eta_seconds))),
                       "total_loss: {:.3f}".format(losses_reduced.item())]
            for loss_name, loss_item in loss_dict_reduced.items():
                log_str.append("{}: {:.3f}".format(loss_name, loss_item.item()))
            log_str = ', '.join(log_str)
            logger.info(log_str)
            tic = time.time()

        if save_to_disk and iteration % save_iter == 0:
            model_path = os.path.join(self.args.save_dir, "{}_iter_{:06d}.pth"
                                      .format(self.save_prefix, iteration))
            self.save_model(model_path)

        # Evaluate during training to trace the mAP changes and see whether
        # performance improves.
        if self.args.eval_epoch > 0 and iteration % eval_iter == 0 and not iteration == max_iter:
            metrics = self.validate()
            ptutil.synchronize()
            names, values = ptutil.accumulate_metric(metrics)
            if names is not None:
                log_str = ['{}: {:.5f}'.format(k, v) for k, v in zip(names, values)]
                log_str = '\n'.join(log_str)
                logger.info(log_str)
            self.net.train()

    if save_to_disk:
        model_path = os.path.join(self.args.save_dir, "{}_iter_{:06d}.pth"
                                  .format(self.save_prefix, max_iter))
        self.save_model(model_path)

    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))
def training(self):
    self.net.train()
    save_to_disk = ptutil.get_rank() == 0
    start_training_time = time.time()
    trained_time = 0
    tic = time.time()
    end = time.time()
    iteration, max_iter = 0, self.args.max_iter
    save_iter = self.args.per_iter * self.args.save_epoch
    eval_iter = self.args.per_iter * self.args.eval_epochs
    # save_iter, eval_iter = 10, 10
    logger.info("Start training, total epochs {:3d} = total iteration: {:6d}"
                .format(self.args.epochs, max_iter))

    for i, (image, target) in enumerate(self.train_loader):
        iteration += 1
        self.scheduler.step()
        self.optimizer.zero_grad()
        image, target = image.to(self.device), target.to(self.device)
        outputs = self.net(image)
        loss_dict = self.criterion(outputs, target)

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = ptutil.reduce_loss_dict(loss_dict)
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

        loss = sum(loss for loss in loss_dict.values())
        loss.backward()
        self.optimizer.step()

        trained_time += time.time() - end
        end = time.time()
        if iteration % args.log_step == 0:
            eta_seconds = int((trained_time / iteration) * (max_iter - iteration))
            log_str = ["Iteration {:06d} , Lr: {:.5f}, Cost: {:.2f}s, Eta: {}"
                       .format(iteration, self.optimizer.param_groups[0]['lr'],
                               time.time() - tic,
                               str(datetime.timedelta(seconds=eta_seconds))),
                       "total_loss: {:.3f}".format(losses_reduced.item())]
            log_str = ', '.join(log_str)
            logger.info(log_str)
            tic = time.time()

        if save_to_disk and iteration % save_iter == 0:
            model_path = os.path.join(self.args.save_dir,
                                      "{}_iter_{:06d}.pth".format('LEDNet', iteration))
            self.save_model(model_path)

        # Evaluate during training to trace the mIoU changes and see whether
        # performance improves.
        if args.eval_epochs > 0 and iteration % eval_iter == 0 and not iteration == max_iter:
            metrics = self.validate()
            ptutil.synchronize()
            pixAcc, mIoU = ptutil.accumulate_metric(metrics)
            if pixAcc is not None:
                logger.info('pixAcc: {:.4f}, mIoU: {:.4f}'.format(pixAcc, mIoU))
            self.net.train()

    if save_to_disk:
        model_path = os.path.join(self.args.save_dir,
                                  "{}_iter_{:06d}.pth".format('LEDNet', max_iter))
        self.save_model(model_path)

    # compute training time
    total_training_time = int(time.time() - start_training_time)
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))

    # eval after training
    if not self.args.skip_eval:
        metrics = self.validate()
        ptutil.synchronize()
        pixAcc, mIoU = ptutil.accumulate_metric(metrics)
        if pixAcc is not None:
            logger.info('After training, pixAcc: {:.4f}, mIoU: {:.4f}'.format(
                pixAcc, mIoU))
def train(is_dist, start_epoch, local_rank):
    transforms = transform.build_transforms()
    coco_dataset = dataset.COCODataset(is_train=True, transforms=transforms)
    if is_dist:
        sampler = distributedGroupSampler(coco_dataset)
    else:
        sampler = groupSampler(coco_dataset)
    dataloader = build_dataloader(coco_dataset, sampler)

    batch_time_meter = utils.AverageMeter()
    cls_loss_meter = utils.AverageMeter()
    reg_loss_meter = utils.AverageMeter()
    losses_meter = utils.AverageMeter()

    model = retinanet(is_train=True)
    if start_epoch == 1:
        model.resnet.load_pretrained(pretrained_path[cfg.resnet_depth])
    else:
        utils.load_model(model, start_epoch - 1)
    model = model.cuda()
    if is_dist:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            broadcast_buffers=False)

    optimizer = solver.build_optimizer(model)
    scheduler = solver.scheduler(optimizer)

    model.train()
    logs = []
    for epoch in range(start_epoch, cfg.max_epochs + 1):
        if is_dist:
            dataloader.sampler.set_epoch(epoch - 1)
        scheduler.lr_decay(epoch)

        end_time = time.time()
        for iteration, datas in enumerate(dataloader, 1):
            scheduler.linear_warmup(epoch, iteration - 1)
            images = datas["images"]
            bboxes = datas["bboxes"]
            labels = datas["labels"]
            res_img_shape = datas["res_img_shape"]
            pad_img_shape = datas["pad_img_shape"]

            images = images.cuda()
            bboxes = [bbox.cuda() for bbox in bboxes]
            labels = [label.cuda() for label in labels]

            loss_dict = model(images, gt_bboxes=bboxes, gt_labels=labels,
                              res_img_shape=res_img_shape,
                              pad_img_shape=pad_img_shape)
            cls_loss = loss_dict["cls_loss"]
            reg_loss = loss_dict["reg_loss"]
            losses = cls_loss + reg_loss

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            batch_time_meter.update(time.time() - end_time)
            end_time = time.time()
            cls_loss_meter.update(cls_loss.item())
            reg_loss_meter.update(reg_loss.item())
            losses_meter.update(losses.item())

            if iteration % 50 == 0:
                if local_rank == 0:
                    res = "\t".join([
                        "Epoch: [%d/%d]" % (epoch, cfg.max_epochs),
                        "Iter: [%d/%d]" % (iteration, len(dataloader)),
                        "Time: %.3f (%.3f)" % (batch_time_meter.val, batch_time_meter.avg),
                        "cls_loss: %.4f (%.4f)" % (cls_loss_meter.val, cls_loss_meter.avg),
                        "reg_loss: %.4f (%.4f)" % (reg_loss_meter.val, reg_loss_meter.avg),
                        "Loss: %.4f (%.4f)" % (losses_meter.val, losses_meter.avg),
                        "lr: %.6f" % (optimizer.param_groups[0]["lr"]),
                    ])
                    print(res)
                    logs.append(res)
                batch_time_meter.reset()
                cls_loss_meter.reset()
                reg_loss_meter.reset()
                losses_meter.reset()

        if local_rank == 0:
            utils.save_model(model, epoch)
        if is_dist:
            utils.synchronize()

    if local_rank == 0:
        with open("logs.txt", "w") as f:
            for i in logs:
                f.write(i + "\n")