def run_experiment(alg_name, map_id, specs_id, num_times, steps, r_good, multi_ag, show_print, render):
    # configuration of testing params
    testing_params = TestingParameters()
    # configuration of learning params
    training_params = TrainingParameters()
    # Setting up the experiment
    tester = Tester(training_params, testing_params, map_id, specs_id, multi_ag)
    # Setting up the curriculum learner
    curriculum = CurriculumLearner(tester.specs, total_steps=steps, r_good=r_good)
    # Setting up the saver
    saver = Saver(alg_name, tester, curriculum)
    # Baseline 1 (Decentralized LTL DQN)
    if alg_name == "i-dqn-l":
        i_dqn_l.run_experiments(tester, curriculum, saver, num_times, show_print, render)
    else:
        # Otherwise fall back to the i-lpopl variant
        i_lpopl.run_experiments(tester, curriculum, saver, num_times, show_print, render)
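# A minimal command-line entry point that forwards its flags to run_experiment.
# This is a sketch only: the flag names, defaults, and help strings below are
# assumptions for illustration, not the repository's actual argument parser.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Run an i-dqn-l / i-lpopl experiment")
    parser.add_argument("--algorithm", default="i-lpopl", help='"i-dqn-l" or any other value for i-lpopl')
    parser.add_argument("--map", type=int, default=0, help="map id")
    parser.add_argument("--specs", type=int, default=0, help="specification set id")
    parser.add_argument("--num_times", type=int, default=1, help="independent runs per configuration")
    parser.add_argument("--steps", type=int, default=100000, help="total curriculum training steps")
    parser.add_argument("--r_good", type=float, default=0.9, help="reward threshold for a task to count as solved")
    parser.add_argument("--multi_ag", action="store_true", help="use the multi-agent variant")
    parser.add_argument("--show_print", action="store_true")
    parser.add_argument("--render", action="store_true")
    cli = parser.parse_args()

    run_experiment(cli.algorithm, cli.map, cli.specs, cli.num_times, cli.steps,
                   cli.r_good, cli.multi_ag, cli.show_print, cli.render)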
def __init__(self):
    super().__init__()
    now_time = time.strftime('%Y-%m-%d-%H-%M', time.localtime(time.time()))
    logger_path = os.path.join(
        self.args.training.save_dir,
        self.args.dataset.dataset_train,
        self.args.models.model_warpper,
        self.args.training.experiment_id,
        '%s.log' % now_time
    )
    set_logger_path(logger_path)
    logger.info(self.args)

    # Define Saver
    self.saver = Saver(self.args)

    # Define Tensorboard Summary
    self.summary = TensorboardSummary()
    self.writer = self.summary.create_summary(self.saver.experiment_dir, self.args.models)

    self.init_training_container()
    self.batchsize = self.args.training.batchsize
    self.reset_batchsize()

    self.evaluator = Evaluator()
    self.best = 0.0

    # show parameters to be trained
    logger.debug('\nTraining params:')
    for name, param in self.model.named_parameters():
        if param.requires_grad:
            logger.debug(name)
    logger.debug('\n')

    # Report the starting iteration (non-zero when fine-tuning from a checkpoint)
    logger.info('Starting iteration: %d' % self.start_it)
    logger.info('Total iterations: %d' % self.args.training.max_iter)
def main():
    args = parse_args()
    args.pretrain = False

    root_path = 'exps/exp_{}'.format(args.exp)
    if not os.path.exists(root_path):
        os.mkdir(root_path)
        os.mkdir(os.path.join(root_path, "log"))
        os.mkdir(os.path.join(root_path, "model"))

    base_lr = args.lr  # base learning rate

    train_dataset, val_dataset = build_dataset(args.dataset, args.data_root, args.train_list)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True,
                                               num_workers=args.num_workers, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=1,
                                             num_workers=args.num_workers, pin_memory=True)

    model = VNet(args.n_channels, args.n_classes).cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=0.0005)
    #scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.7)
    model = torch.nn.DataParallel(model)
    model.train()

    if args.resume is None:
        # Initialize from pre-trained weights, dropping checkpoint keys the current model does not have
        assert os.path.exists(args.load_path)
        state_dict = model.state_dict()
        print("Loading weights...")
        pretrain_state_dict = torch.load(args.load_path, map_location="cpu")['state_dict']
        for k in list(pretrain_state_dict.keys()):
            if k not in state_dict:
                del pretrain_state_dict[k]
        model.load_state_dict(pretrain_state_dict)
        print("Loaded weights")
    else:
        # Resume training from a full checkpoint (model + optimizer state)
        print("Resuming from {}".format(args.resume))
        checkpoint = torch.load(args.resume, map_location="cpu")
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        model.load_state_dict(checkpoint['state_dict'])

    logger = Logger(root_path)
    saver = Saver(root_path)

    for epoch in range(args.start_epoch, args.epochs):
        train(model, train_loader, optimizer, logger, args, epoch)
        validate(model, val_loader, optimizer, logger, saver, args, epoch)
        adjust_learning_rate(args, optimizer, epoch)
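# adjust_learning_rate is called at the end of every epoch in the training loops above
# but is not defined in this snippet.  The sketch below shows one common step-decay
# implementation; the attribute names lr_decay_rate / lr_decay_epochs and their defaults
# are assumptions, not the repository's actual schedule.
def adjust_learning_rate_sketch(args, optimizer, epoch):
    # Scale the base learning rate down by lr_decay_rate every lr_decay_epochs epochs
    decay_rate = getattr(args, "lr_decay_rate", 0.1)
    decay_epochs = getattr(args, "lr_decay_epochs", 30)
    lr = args.lr * (decay_rate ** (epoch // decay_epochs))
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr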
def main():
    args = parse_args()
    args.pretrain = args.turnon < 0  # pretraining mode when turnon is negative
    print("Using GPU: {}".format(args.local_rank))

    root_path = 'exps/exp_{}'.format(args.exp)
    if args.local_rank == 0 and not os.path.exists(root_path):
        os.mkdir(root_path)
        os.mkdir(os.path.join(root_path, "log"))
        os.mkdir(os.path.join(root_path, "model"))

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    train_dataset, val_dataset = build_dataset(args.dataset, args.data_root, args.train_list,
                                               sampling=args.sampling)

    args.world_size = len(args.gpu.split(","))
    if args.world_size > 1:
        # Multi-GPU: one process per GPU, synchronized via NCCL
        os.environ['MASTER_PORT'] = args.port
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group('nccl')
        device = torch.device('cuda:{}'.format(args.local_rank))
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, num_replicas=len(args.gpu.split(",")), rank=args.local_rank)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size,
                                               shuffle=(train_sampler is None), sampler=train_sampler,
                                               num_workers=args.num_workers, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=1,
                                             num_workers=args.num_workers, pin_memory=True)

    model = VNet(args.n_channels, args.n_classes, input_size=64, pretrain=True).cuda(args.local_rank)
    model_ema = VNet(args.n_channels, args.n_classes, input_size=64, pretrain=True).cuda(args.local_rank)
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=0.0005)

    if args.world_size > 1:
        model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank,
                    find_unused_parameters=True)
        model_ema = DDP(model_ema, device_ids=[args.local_rank], output_device=args.local_rank,
                        find_unused_parameters=True)
    model.train()
    # Initialize the EMA model with the same weights as the online model
    model_ema.load_state_dict(model.state_dict())
    print("Initialized weights")

    logger = Logger(root_path)
    saver = Saver(root_path, save_freq=args.save_freq)

    if args.sampling == 'default':
        contrast = RGBMoCo(128, K=4096, T=args.temperature).cuda(args.local_rank)
    elif args.sampling == 'layerwise':
        contrast = RGBMoCoNew(128, K=4096, T=args.temperature).cuda(args.local_rank)
    else:
        raise ValueError("unsupported sampling method")
    criterion = torch.nn.CrossEntropyLoss()

    for epoch in range(args.start_epoch, args.epochs):
        if train_sampler is not None:
            train_sampler.set_epoch(epoch)
        train(model, model_ema, train_loader, optimizer, logger, saver, args, epoch, contrast, criterion)
        validate(model_ema, val_loader, optimizer, logger, saver, args, epoch)
        adjust_learning_rate(args, optimizer, epoch)
                                           drop_last=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=1,
                                         num_workers=args.num_workers, pin_memory=True)

model = VNet(args.n_channels, args.n_classes, input_size=64, pretrain=True).cuda()
model_ema = VNet(args.n_channels, args.n_classes, input_size=64, pretrain=True).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=0.0005)

model = torch.nn.DataParallel(model)
model_ema = torch.nn.DataParallel(model_ema)
# Initialize the EMA model with the same weights as the online model
model_ema.load_state_dict(model.state_dict())
print("Model Initialized")

logger = Logger(root_path)
saver = Saver(root_path, save_freq=args.save_freq)

if args.sampling == 'default':
    contrast = RGBMoCo(128, K=4096, T=args.temperature).cuda()
else:
    raise ValueError("unsupported sampling method")
criterion = torch.nn.CrossEntropyLoss()

for epoch in range(args.start_epoch, args.epochs):
    pretrain_jigsaw(model, model_ema, train_loader, optimizer, logger, saver, args, epoch, contrast, criterion)
    adjust_learning_rate(args, optimizer, epoch)
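# The snippet above only initializes model_ema from model; the EMA weights are presumably
# kept in sync inside pretrain_jigsaw()/train() with a MoCo-style momentum rule.  The sketch
# below shows a typical momentum update; the function name and the momentum value m=0.999
# are assumptions for illustration, not code from this repository.
import torch

@torch.no_grad()
def momentum_update_sketch(model, model_ema, m=0.999):
    # Blend each EMA parameter toward the corresponding online parameter
    for p, p_ema in zip(model.parameters(), model_ema.parameters()):
        p_ema.data.mul_(m).add_(p.data, alpha=1.0 - m)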
def main():
    args = parse_args()
    args.pretrain = True
    print("Using GPU: {}".format(args.local_rank))

    root_path = 'exps/exp_{}'.format(args.exp)
    if not os.path.exists(root_path) and args.local_rank == 0:
        os.mkdir(root_path)
        os.mkdir(os.path.join(root_path, "log"))
        os.mkdir(os.path.join(root_path, "model"))

    base_lr = args.lr       # base learning rate
    batch_size = 1
    max_iterations = 40000
    cell_size = 96          # size of volume we crop patch from
    patch_size = 64
    puzzle_config = 3       # 2 or 3 for 2x2x2 or 3x3x3 puzzle
    puzzle_num = puzzle_config ** 3
    feature_len = 256
    # iter_num = 0
    sr_feature_size = 32

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    train_dataset, val_dataset = build_dataset(args)

    args.world_size = len(args.gpu.split(","))
    if args.world_size > 1:
        os.environ['MASTER_PORT'] = args.port
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group('nccl')
        device = torch.device('cuda:{}'.format(args.local_rank))
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, num_replicas=len(args.gpu.split(",")), rank=args.local_rank)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size,
                                               shuffle=(train_sampler is None), sampler=train_sampler,
                                               num_workers=args.num_workers, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=1,
                                             num_workers=args.num_workers, pin_memory=True)

    model = VNet(args.n_channels, args.n_classes, input_size=64, pretrain=True).cuda(args.local_rank)
    model_ema = VNet(args.n_channels, args.n_classes, input_size=64, pretrain=True).cuda(args.local_rank)

    assert os.path.exists(args.load_path)
    print("Loading weights...")
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=0.0005)
    #scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.7)

    if args.world_size > 1:
        model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank,
                    find_unused_parameters=True)
        model_ema = DDP(model_ema, device_ids=[args.local_rank], output_device=args.local_rank,
                        find_unused_parameters=True)
    model.train()

    # Load the same pre-trained weights into both the online and the EMA model
    pretrain_state_dict = torch.load(args.load_path, map_location="cpu")['state_dict']
    model.load_state_dict(pretrain_state_dict)
    model_ema.load_state_dict(pretrain_state_dict)
    print("Loaded weights")

    logger = Logger(root_path)
    saver = Saver(root_path, save_freq=args.save_freq)
    contrast = RGBMoCo(128, K=512).cuda(args.local_rank)
    criterion = torch.nn.CrossEntropyLoss()

    for epoch in range(args.start_epoch, args.epochs):
        if train_sampler is not None:
            train_sampler.set_epoch(epoch)
        adapt(model, model_ema, train_loader, optimizer, logger, saver, args, epoch, contrast, criterion)
        adjust_learning_rate(args, optimizer, epoch)
def main():
    args = parse_args()
    args.pretrain = False
    print("Using GPU: {}".format(args.local_rank))

    root_path = 'exps/exp_{}'.format(args.exp)
    if args.local_rank == 0 and not os.path.exists(root_path):
        os.mkdir(root_path)
        os.mkdir(os.path.join(root_path, "log"))
        os.mkdir(os.path.join(root_path, "model"))

    base_lr = args.lr  # base learning rate
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    train_dataset, val_dataset = build_dataset(args.dataset, args.data_root, args.train_list)

    args.world_size = len(args.gpu.split(","))
    if args.world_size > 1:
        os.environ['MASTER_PORT'] = args.port
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group('nccl')
        device = torch.device('cuda:{}'.format(args.local_rank))
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, num_replicas=len(args.gpu.split(",")), rank=args.local_rank)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size,
                                               shuffle=(train_sampler is None), sampler=train_sampler,
                                               num_workers=args.num_workers, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=1,
                                             num_workers=args.num_workers, pin_memory=True)

    model = VNet(args.n_channels, args.n_classes).cuda(args.local_rank)
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=0.0005)
    #scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.7)

    if args.world_size > 1:
        model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank,
                    find_unused_parameters=True)
    model.train()
    print("Model initialized")

    logger = Logger(root_path)
    saver = Saver(root_path)

    for epoch in range(args.start_epoch, args.epochs):
        if train_sampler is not None:
            # Reshuffle the distributed sampler each epoch
            train_sampler.set_epoch(epoch)
        train(model, train_loader, optimizer, logger, args, epoch)
        validate(model, val_loader, optimizer, logger, saver, args, epoch)
        adjust_learning_rate(args, optimizer, epoch)
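# The distributed mains above read args.local_rank and call
# torch.distributed.init_process_group('nccl'), which matches launching one worker
# process per GPU with the legacy torch.distributed.launch helper, for example:
#
#   python -m torch.distributed.launch --nproc_per_node=2 train_ddp.py \
#       --exp my_exp --gpu 0,1 --port 12355 --batch_size 4 --num_workers 4
#
# The script name, experiment id, and flag values here are placeholders, not the
# repository's actual invocation.  With a single GPU (args.world_size == 1) the
# DistributedSampler and DDP wrapping are skipped and the script runs as a plain
# single-process job.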