def get_allstrat_ram(args):
    from evaluation.eval_execution import get_solution_to_evaluate
    log_base = os.path.join(
        "data", "get_allstrat_ram",
        f"{args.platform}_{args.model_name}_{args.model_version}_{args.batch_size}_"
        f"{args.input_shape}_{args.strategy}_{args.buffer_mem_mb}_gradless_eagerfalse")
    os.makedirs(log_base, exist_ok=True)
    logger = setup_logger("get_allstrat_ram")

    # Load the Redis connection config from a .env file, if one is present.
    dotenv_location = dotenv.find_dotenv()
    if len(dotenv_location):
        logger.info(f"Loading dotenv config from {dotenv_location}")
        dotenv.load_dotenv(dotenv_location)
    else:
        logger.warning("Failed to load dotenv config!")

    # Report peak RAM and compute cost of the cached solution for every strategy.
    for strategy in SolveStrategy:
        if strategy == SolveStrategy.NOT_SPECIFIED:
            continue
        result, result_key = get_solution_to_evaluate(
            strategy, args.model_name, args.batch_size, args.platform,
            args.input_shape, args.model_version, args.buffer_mem_mb)
        if result:
            logger.info(
                f"For strategy {strategy.name}, peak ram is {result.peak_ram:.3E}, "
                f"compute is {result.cpu:.3E}")
        else:
            logger.warning(f"No solution for strategy {strategy.name}")
def run_single_model(args):
    log_base = os.path.join(
        "data", "run_single_model",
        f"{args.platform}_{args.model_name}_{args.model_version}_{args.batch_size}_"
        f"{args.input_shape}_{args.strategy}_{args.buffer_mem_mb}_gradless_eagerfalse")
    os.makedirs(log_base, exist_ok=True)
    logger = setup_logger("run_single_model")

    # Load the Redis connection config from a .env file, if one is present.
    dotenv_location = dotenv.find_dotenv()
    if len(dotenv_location):
        logger.info(f"Loading dotenv config from {dotenv_location}")
        dotenv.load_dotenv(dotenv_location)
    else:
        logger.warning("Failed to load dotenv config!")

    strategy = SolveStrategy(args.strategy)
    result, result_key, throughput = execute_one(
        log_base=log_base,
        solve_strategy=strategy,
        model_name=args.model_name,
        batch_size=args.batch_size,
        platform=args.platform,
        input_shape=args.input_shape,
        model_version=args.model_version,
        num_runs=args.num_runs,
        buffer_mem=args.buffer_mem_mb * 1000 * 1000)
    if result is None:
        logger.error("No result returned from execute_one")
        return

    metrics_single = dict(
        solve_strategy=strategy,
        model_name=args.model_name,
        batch_size=args.batch_size,
        platform=args.platform,
        input_shape=args.input_shape,
        model_version=args.model_version,
        num_runs=args.num_runs,
        result_key=result_key,
        buffer_mem=args.buffer_mem_mb * 1000 * 1000,
        throughput_it_per_s=throughput,
    )
    if strategy == SolveStrategy.OPTIMAL_ILP_GC:
        metrics_single["vars"] = result.ilp_num_variables
        metrics_single["constraints"] = result.ilp_num_constraints
        metrics_single["solve_time"] = result.solve_time_s

    output_file = os.path.join(log_base, "metrics.pickle")
    with open(output_file, "wb") as f:
        pickle.dump(metrics_single, f, protocol=pickle.HIGHEST_PROTOCOL)
    logger.info(f"Saved throughput metrics to {output_file}")
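# A hedged sketch, not from the source: both entry points above read the same
# attribute set off `args`. A minimal, hypothetical argparse wiring consistent
# with those accesses might look like this; flag names and defaults are assumptions.
def parse_args():
    import argparse
    p = argparse.ArgumentParser()
    p.add_argument("--platform", required=True)
    p.add_argument("--model-name", dest="model_name", required=True)
    p.add_argument("--model-version", dest="model_version", default="v1")
    p.add_argument("--batch-size", dest="batch_size", type=int, default=1)
    p.add_argument("--input-shape", dest="input_shape", default=None)
    p.add_argument("--strategy", default=None)
    p.add_argument("--buffer-mem-mb", dest="buffer_mem_mb", type=int, default=0)
    p.add_argument("--num-runs", dest="num_runs", type=int, default=16)
    return p.parse_args()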
def execute_one(log_base: str,
                solve_strategy: SolveStrategy,
                model_name: str,
                batch_size: int,
                platform: str,
                input_shape=None,
                model_version="v1",
                num_runs=16,
                buffer_mem: int = 0) -> Tuple[Optional[RSResult], str, float]:
    logger = setup_logger("eval_one")
    results_and_keys = get_solutions_to_evaluate(solve_strategy, model_name, batch_size,
                                                 platform, input_shape, model_version,
                                                 buffer_mem)
    if not results_and_keys:
        logger.info("No results found")
        return None, "", 0.0

    if not EAGER:
        tf1.disable_eager_execution()

    # Try candidate solutions in increasing order of compute cost; return the
    # first one that executes successfully.
    for result, result_key in results_and_keys:
        tf.keras.backend.clear_session()
        model = get_keras_model(model_name, input_shape=input_shape)
        tf2 = TF2ExtractorParams(model, batch_size=batch_size, log_base=log_base)
        loss_fn = categorical_cross_entropy  # TODO: vgg_unet may need a different loss
        graph = tf2.g  # TODO: test this vs. TensorSpec
        runner = TF2Runner(model,
                           graph,
                           result.schedule,
                           loss_fn=loss_fn,
                           eager=EAGER,
                           log_base=log_base,
                           batch_size=batch_size)
        try:
            throughput = evaluate_solved_model(result=result,
                                               runner=runner,
                                               warmup=10 if EAGER else 64,
                                               trials=num_runs,
                                               batch_size=batch_size)
            logger.info(
                f"Successfully executed model with predicted memory usage {result.peak_ram}, "
                f"predicted cpu {result.cpu}, actual throughput {throughput}")
            return result, result_key, throughput
        except Exception as e:
            logger.error("Error running model with predicted mem usage %s: %s",
                         result.peak_ram, e, exc_info=True)
            logger.error("Skipping result, going to next candidate.")
    return None, "", 0.0
def evaluate_solved_model(result: RSResult, runner: TF2Runner, warmup, trials, batch_size):
    logger = setup_logger("evaluate_solved_model")
    recompute_baseline = runner.tf_graph
    logger.debug("Warming up models")

    in_shape = runner.keras_model.input_shape
    out_shape = runner.keras_model.output_shape
    h = in_shape[1]
    w = in_shape[2]
    c = np.prod(out_shape[1:])  # number of classes, flattened from the output shape
    reshape_to = list(out_shape)
    reshape_to[0] = -1  # leave the batch dimension free
    logger.debug(f"Reshaping labels to {reshape_to}")

    for dat, lab in tqdm([
            random_batch(batch_size, img_h=h, img_w=w, num_classes=c)
            for _ in range(warmup)
    ], desc="Warmup"):
        lab = tf.reshape(lab, reshape_to)
        recompute_baseline(dat, lab)

    # Run the actual evaluation.
    timer = Timer("timer_recompute")
    for _ in tqdm(range(trials), desc="Profiling"):
        # TODO: Should we generate random batches on CPU and copy to GPU, inside the
        # timing loop? This would model the overhead of loading data, and bring
        # throughputs down to be more realistic.
        images, labels = random_batch(batch_size, img_h=h, img_w=w, num_classes=c)
        labels = tf.reshape(labels, reshape_to)
        with timer:
            loss, gradients = recompute_baseline(images, labels)
        # TODO: assert correctness of the model by applying gradients

    tput = trials / timer.elapsed
    logger.info(f"{result.solve_strategy} throughput: {tput:2.4} iters/s")
    return tput
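# A hedged sketch, not the repo's actual Timer: evaluate_solved_model treats Timer
# as a re-entrant context manager whose `elapsed` accumulates wall-clock seconds
# across every `with timer:` block, so that `trials / timer.elapsed` yields
# iterations per second. A minimal implementation consistent with that usage:
import time

class _SketchTimer:
    def __init__(self, name: str):
        self.name = name
        self.elapsed = 0.0  # total seconds accumulated across timed blocks

    def __enter__(self):
        self._start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.elapsed += time.perf_counter() - self._start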
def __init__(self,
             keras_model: tf.keras.models.Model,
             g: DFGraph,
             schedule: Schedule,
             loss_fn=categorical_cross_entropy,
             eager: bool = True,
             log_base: str = None,
             debug=False,
             batch_size=None):
    self.log_base = log_base
    self.logger = setup_logger("TF2Runner", os.path.join(log_base, 'TF2Runner.log'))
    self.debug = debug
    self.schedule = schedule
    self.eager = eager
    self.batch_size = batch_size
    self.loss_fn = loss_fn
    self.keras_model = keras_model
    self.g = g
    self.tf_graph = self._generate_tf_graph()
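# Hedged usage note: as seen in execute_one and evaluate_solved_model above, the
# compiled graph produced by _generate_tf_graph is consumed as a two-argument
# callable returning (loss, gradients):
#
#   runner = TF2Runner(model, graph, result.schedule,
#                      loss_fn=categorical_cross_entropy,
#                      eager=False, log_base=log_base, batch_size=batch_size)
#   loss, gradients = runner.tf_graph(images, labels)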
def main():
    args = parser()

    ### setup configs ###
    configfile = args.configfile
    with open(configfile) as f:
        configs = yaml.safe_load(f)

    ## path process (path definition, make directories)
    now = datetime.now().isoformat()
    log_dir = Path(configs['log_dir']) / now
    paths = Paths(log_dir=log_dir)

    ### setup logs and summary writer ###
    setup_logger(logfile=paths.logfile)
    writer = SummaryWriter(str(paths.summary_dir))

    ### setup GPU or CPU ###
    if configs['n_gpus'] > 0 and torch.cuda.is_available():
        logger.info('CUDA is available! using GPU...\n')
        device = torch.device('cuda')
    else:
        logger.info('using CPU...\n')
        device = torch.device('cpu')

    ### Dataset ###
    logger.info('preparing dataset...')
    dataset_name = configs['dataset']
    logger.info(f'==> dataset: {dataset_name}\n')

    if configs['dataset'] == 'cifar10':
        transform = transforms.Compose([
            # Resize expects a single size argument; pass (h, w) as a tuple.
            transforms.Resize((configs['img_size'], configs['img_size'])),
            transforms.ToTensor(),
            transforms.Normalize(configs['color_mean'], configs['color_std']),
        ])
        train_dataset = datasets.CIFAR10(root=configs['data_root'],
                                         train=True,
                                         transform=transform,
                                         download=True)
        test_dataset = datasets.CIFAR10(root=configs['data_root'],
                                        train=False,
                                        transform=transform,
                                        download=True)
    elif configs['dataset'] == 'custom':
        train_transform = DataTransforms(img_size=configs['img_size'],
                                         color_mean=configs['color_mean'],
                                         color_std=configs['color_std'],
                                         phase='train')
        test_transform = DataTransforms(img_size=configs['img_size'],
                                        color_mean=configs['color_mean'],
                                        color_std=configs['color_std'],
                                        phase='test')
        train_img_list, train_lbl_list, test_img_list, test_lbl_list = make_datapath_list(
            root=configs['data_root'])
        train_dataset = Dataset(train_img_list, train_lbl_list, transform=train_transform)
        test_dataset = Dataset(test_img_list, test_lbl_list, transform=test_transform)
    else:
        logger.debug('dataset is not supported')
        raise ValueError('dataset is not supported')

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=configs['batch_size'],
                                               shuffle=True,
                                               num_workers=8)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=configs['batch_size'],
                                              shuffle=False,
                                              num_workers=8)

    ### Network ###
    logger.info('preparing network...')
    network = CNNAutoEncoder(in_channels=configs['n_channels'], n_classes=configs['n_classes'])
    network = network.to(device)
    cnn_criterion = nn.CrossEntropyLoss()
    ae_criterion = nn.MSELoss()
    optimizer = optim.Adam(network.parameters(), lr=configs['lr'])

    if configs['resume']:
        # Load checkpoint
        logger.info('==> Resuming from checkpoint...\n')
        if not Path(configs['resume']).exists():
            logger.info('No checkpoint found !')
            raise ValueError('No checkpoint found !')
        ckpt = torch.load(configs['resume'])
        network.load_state_dict(ckpt['model_state_dict'])
        optimizer.load_state_dict(ckpt['optimizer_state_dict'])
        start_epoch = ckpt['epoch']
        loss = ckpt['loss']
    else:
        logger.info('==> Building model...\n')
        start_epoch = 0

    logger.info('model summary: ')
    summary(network, input_size=(configs['n_channels'], configs['img_size'], configs['img_size']))

    if configs["n_gpus"] > 1:
        network = nn.DataParallel(network)

    ### Metrics ###
    metrics = Metrics(n_classes=configs['n_classes'],
                      classes=configs['classes'],
                      writer=writer,
                      metrics_dir=paths.metrics_dir,
                      plot_confusion_matrix=plot_confusion_matrix)

    ### Train or Test ###
    kwargs = {
        'device': device,
        'network': network,
        'optimizer': optimizer,
        'criterions': (cnn_criterion, ae_criterion),
        'classification_loss_weight': configs['classification_loss_weight'],
        'autoencoder_loss_weight': configs['autoencoder_loss_weight'],
        'data_loaders': (train_loader, test_loader),
        'metrics': metrics,
        'writer': writer,
        'n_classes': configs['n_classes'],
        'save_ckpt_interval': configs['save_ckpt_interval'],
        'ckpt_dir': paths.ckpt_dir,
    }
    cnn_classifier = CNNClassifier(**kwargs)

    if args.inference:
        if not configs['resume']:
            logger.info('No checkpoint found for inference!')
        logger.info('mode: inference\n')
        cnn_classifier.test(epoch=start_epoch, inference=True)
    else:
        logger.info('mode: train\n')
        cnn_classifier.train(n_epochs=configs['n_epochs'], start_epoch=start_epoch)
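# A hedged example (all values hypothetical): the CNN main above reads only the
# config keys shown below from its YAML file. A minimal config consistent with
# those accesses might be:
import yaml

EXAMPLE_CNN_CONFIG = yaml.safe_load("""
log_dir: ./logs
data_root: ./data
dataset: cifar10
img_size: 32
n_channels: 3
n_classes: 10
classes: [plane, car, bird, cat, deer, dog, frog, horse, ship, truck]
color_mean: [0.4914, 0.4822, 0.4465]
color_std: [0.2470, 0.2435, 0.2616]
batch_size: 64
lr: 0.001
n_gpus: 1
n_epochs: 50
save_ckpt_interval: 5
classification_loss_weight: 1.0
autoencoder_loss_weight: 1.0
resume: ""
""")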
def main(args):
    with open(args.configfile) as f:
        configs = yaml.safe_load(f)

    ## path process (path definition, make directories)
    now = datetime.now().isoformat()
    log_dir = Path(configs['log_dir']) / now
    paths = Paths(log_dir=log_dir)

    ### setup logs and summary writer ###
    setup_logger(logfile=paths.logfile)
    writer = SummaryWriter(str(paths.summary_dir))

    ### setup GPU or CPU ###
    if configs['n_gpus'] > 0 and torch.cuda.is_available():
        logger.info('CUDA is available! using GPU...\n')
        device = torch.device('cuda')
    else:
        logger.info('using CPU...\n')
        device = torch.device('cpu')

    ### Dataset ###
    logger.info('preparing dataset...')
    data_root = configs['data_root']
    logger.info(f'==> dataset path: {data_root}\n')

    train_img_list, train_annot_list, test_img_list, test_annot_list = make_datapath_list(
        rootpath=data_root,
        train_data=configs['train_txt'],
        test_data=configs['test_txt'],
        img_extension=configs['img_extension'],
        anno_extension=configs['anno_extension'])

    train_transform = DataTransform(img_size=configs['img_size'],
                                    color_mean=configs['color_mean'],
                                    color_std=configs['color_std'],
                                    mode='train')
    test_transform = DataTransform(img_size=configs['img_size'],
                                   color_mean=configs['color_mean'],
                                   color_std=configs['color_std'],
                                   mode='test')

    train_dataset = VOCDataset(train_img_list,
                               train_annot_list,
                               transform=train_transform,
                               label_color_map=configs['label_color_map'])
    test_dataset = VOCDataset(test_img_list,
                              test_annot_list,
                              transform=test_transform,
                              label_color_map=configs['label_color_map'])

    ### DataLoader ###
    train_loader = DataLoader(train_dataset, batch_size=configs['batch_size'], shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=configs['batch_size'], shuffle=False)

    ### Network ###
    logger.info('preparing network...')
    network = PSPNet(n_classes=configs['n_classes'],
                     img_size=configs['img_size'],
                     img_size_8=configs['input_size_8'])
    network = network.to(device)
    criterion = PSPLoss(aux_weight=configs['aux_weight'])
    optimizer = optim.Adam(network.parameters(), lr=configs['lr'], weight_decay=configs['decay'])

    if configs['resume']:
        # Load checkpoint
        logger.info('==> Resuming from checkpoint...\n')
        if not Path(configs['resume']).exists():
            logger.info('No checkpoint found !')
            raise ValueError('No checkpoint found !')
        ckpt = torch.load(configs['resume'])
        network.load_state_dict(ckpt['model_state_dict'])
        optimizer.load_state_dict(ckpt['optimizer_state_dict'])
        start_epoch = ckpt['epoch']
        loss = ckpt['loss']
    else:
        logger.info('==> Building model...\n')
        start_epoch = 0

    logger.info('model summary: ')
    summary(network, input_size=(configs['n_channels'], configs['img_size'], configs['img_size']))

    if configs['n_gpus'] > 1:
        network = nn.DataParallel(network)

    ### Metrics ###
    metrics_cfg = {
        'n_classes': configs['n_classes'],
        'classes': configs['classes'],
        'img_size': configs['img_size'],
        'writer': writer,
        'metrics_dir': paths.metrics_dir,
    }
    metrics = Metrics(**metrics_cfg)

    ### Visualize Results ###
    vis_img = VisImage(n_classes=configs['n_classes'],
                       label_color_map=configs['label_color_map'])

    ### Train or Inference ###
    kwargs = {
        'device': device,
        'network': network,
        'optimizer': optimizer,
        'criterion': criterion,
        'data_loaders': (train_loader, test_loader),
        'metrics': metrics,
        'vis_img': vis_img,
        'img_size': configs['img_size'],
        'writer': writer,
        'save_ckpt_interval': configs['save_ckpt_interval'],
        'ckpt_dir': paths.ckpt_dir,
        'img_outdir': paths.img_outdir,
    }
    semantic_segmentation = SemanticSegmentation(**kwargs)

    if args.inference:
        if not configs['resume']:
            logger.info('No checkpoint found for inference!')
        logger.info('mode: inference\n')
        semantic_segmentation.test(epoch=start_epoch, inference=True)
    else:
        logger.info('mode: train\n')
        semantic_segmentation.train(n_epochs=configs['n_epochs'], start_epoch=start_epoch)
def main():
    args = parser()
    with open(args.configfile) as f:
        configs = yaml.safe_load(f)

    ## path process (path definition, make directories)
    now = datetime.now().isoformat()
    log_dir = Path(configs['log_dir']) / now
    paths = Paths(log_dir=log_dir)

    ### setup logs and summary writer ###
    setup_logger(logfile=paths.logfile)
    writer = SummaryWriter(str(paths.summary_dir))

    ### setup GPU or CPU ###
    if configs['n_gpus'] > 0 and torch.cuda.is_available():
        logger.info('CUDA is available! using GPU...\n')
        device = torch.device('cuda')
    else:
        logger.info('using CPU...\n')
        device = torch.device('cpu')

    ### Dataset ###
    logger.info('preparing dataset...')
    data_root = configs['data_root']
    logger.info(f'==> dataset path: {data_root}\n')

    train_img_list, train_annot_list, test_img_list, test_annot_list = make_datapath_list(
        rootpath=data_root,
        train_data=configs['train_txt'],
        test_data=configs['test_txt'])

    train_transform = DataTransform(img_size=configs['img_size'],
                                    color_mean=configs['color_mean'],
                                    mode='train')
    test_transform = DataTransform(img_size=configs['img_size'],
                                   color_mean=configs['color_mean'],
                                   mode='test')
    transform_annot = Anno_xml2list(configs['classes'])

    train_dataset = VOCDataset(train_img_list,
                               train_annot_list,
                               transform=train_transform,
                               transform_annot=transform_annot)
    test_dataset = VOCDataset(test_img_list,
                              test_annot_list,
                              transform=test_transform,
                              transform_annot=transform_annot)

    ### DataLoader ###
    train_loader = DataLoader(train_dataset,
                              batch_size=configs['batch_size'],
                              shuffle=True,
                              collate_fn=od_collate_fn)
    test_loader = DataLoader(test_dataset,
                             batch_size=configs['batch_size'],
                             shuffle=False,
                             collate_fn=od_collate_fn)

    ### Network ###
    logger.info('preparing network...')
    ssd_cfg = {
        'n_classes': configs['n_classes'],
        'img_size': configs['img_size'],
        'bbox_aspect_num': configs['bbox_aspect_num'],  # number of aspect ratios of dbox
        'feature_maps': configs['feature_maps'],  # feature map size of each source
        'steps': configs['steps'],  # size of dbox
        'min_sizes': configs['min_sizes'],  # size of dbox
        'max_sizes': configs['max_sizes'],  # size of dbox
        'aspect_ratios': configs['aspect_ratios'],  # aspect ratios
        'variances': configs['variances'],  # variances for decode
        'conf_thresh': configs['conf_thresh'],
        'top_k': configs['top_k'],
        'nms_thresh': configs['nms_thresh'],
        'device': device,
    }
    network = SSD(**ssd_cfg)
    network = network.to(device)
    criterion = MultiBoxLoss(jaccard_thresh=configs['jaccord_thresh'],
                             neg_pos=configs['neg_pos'],
                             device=device)
    optimizer = optim.SGD(network.parameters(), lr=configs['lr'], weight_decay=configs['decay'])

    def weights_init(m):
        # He initialization for conv weights; zero the biases.
        if isinstance(m, nn.Conv2d):
            init.kaiming_normal_(m.weight.data)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0.0)

    if configs['pretrained']:
        # Load pretrained model
        logger.info('==> Pretrained VGG...\n')
        if not Path(configs['pretrained']).exists():
            logger.info('No pretrained model found!')
            raise ValueError('No pretrained model found!')
        vgg_weights = torch.load(configs['pretrained'])
        network.vgg.load_state_dict(vgg_weights)
        network.extras.apply(weights_init)
        network.loc.apply(weights_init)
        network.conf.apply(weights_init)

    if configs['resume']:
        # Load checkpoint
        logger.info('==> Resuming from checkpoint...\n')
        if not Path(configs['resume']).exists():
            logger.info('No checkpoint found !')
            raise ValueError('No checkpoint found !')
        ckpt = torch.load(configs['resume'])
        network.load_state_dict(ckpt)
        start_epoch = 0
        # network.load_state_dict(ckpt['model_state_dict'])
        # optimizer.load_state_dict(ckpt['optimizer_state_dict'])
        # start_epoch = ckpt['epoch']
        # loss = ckpt['loss']
    else:
        logger.info('==> Building model...\n')
        start_epoch = 0

    # logging
    logger.info('model summary: ')
    summary(network, input_size=(configs['n_channels'], configs['img_size'], configs['img_size']))

    if configs["n_gpus"] > 1:
        network = nn.DataParallel(network)

    ### Metrics ###
    metrics_cfg = {
        'n_classes': configs['n_classes'],
        'classes': configs['classes'],
        'img_size': configs['img_size'],
        'confidence_level': configs['confidence_level'],
        'writer': writer,
        'metrics_dir': paths.metrics_dir,
        'imgs_dir': paths.img_outdir,
    }
    metrics = Metrics(**metrics_cfg)

    ### Visualize Results ###
    box_vis = BoxVis(configs['confidence_level'], configs['classes'],
                     configs['label_color_map'], configs['font_path'])

    ### Train or Inference ###
    kwargs = {
        'device': device,
        'network': network,
        'optimizer': optimizer,
        'criterion': criterion,
        'data_loaders': (train_loader, test_loader),
        'metrics': metrics,
        'box_vis': box_vis,
        'img_size': configs['img_size'],
        'writer': writer,
        'save_ckpt_interval': configs['save_ckpt_interval'],
        'ckpt_dir': paths.ckpt_dir,
        'img_outdir': paths.img_outdir,
    }
    object_detection = ObjectDetection(**kwargs)

    if args.inference:
        if not configs['resume']:
            logger.info('No checkpoint found for inference!')
        logger.info('mode: inference\n')
        object_detection.test(epoch=start_epoch, inference=True)
    else:
        logger.info('mode: train\n')
        object_detection.train(n_epochs=configs['n_epochs'], start_epoch=start_epoch)
def main():
    args = parser()

    ### setup configs ###
    configfile = args.configfile
    with open(configfile) as f:
        configs = yaml.safe_load(f)

    ## path process (path definition, make directories)
    now = datetime.now().isoformat()
    log_dir = Path(configs['log_dir']) / now
    paths = Paths(log_dir=log_dir)

    ### setup logs and summary writer ###
    setup_logger(logfile=paths.logfile)
    writer = SummaryWriter(str(paths.summary_dir))

    ### setup GPU or CPU ###
    if configs['n_gpus'] > 0 and torch.cuda.is_available():
        logger.info('CUDA is available! using GPU...\n')
        device = torch.device('cuda')
    else:
        logger.info('using CPU...\n')
        device = torch.device('cpu')

    ### Dataset ###
    logger.info('preparing dataset...')
    train_transform = DataTransforms(img_size=configs['img_size'],
                                     color_mean=configs['color_mean'],
                                     color_std=configs['color_std'],
                                     phase='train')
    test_transform = DataTransforms(img_size=configs['img_size'],
                                    color_mean=configs['color_mean'],
                                    color_std=configs['color_std'],
                                    phase='test')
    train_img_list, test_img_list = make_datapath_list(root=configs['data_root'])
    train_dataset = Dataset(train_img_list, transform=train_transform)
    test_dataset = Dataset(test_img_list, transform=test_transform)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=configs['batch_size'],
                                               shuffle=True,
                                               num_workers=8)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=configs['batch_size'],
                                              shuffle=False,
                                              num_workers=8)

    ### Network ###
    logger.info('preparing network...')
    network = VAE(in_channels=configs['n_channels'], h_dim=1024, z_dim=32, device=device)
    network = network.to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(network.parameters(), lr=configs['lr'])

    if configs['resume']:
        # Load checkpoint
        logger.info('==> Resuming from checkpoint...\n')
        if not Path(configs['resume']).exists():
            logger.info('No checkpoint found !')
            raise ValueError('No checkpoint found !')
        ckpt = torch.load(configs['resume'])
        network.load_state_dict(ckpt['model_state_dict'])
        optimizer.load_state_dict(ckpt['optimizer_state_dict'])
        start_epoch = ckpt['epoch']
        loss = ckpt['loss']
    else:
        logger.info('==> Building model...\n')
        start_epoch = 0

    logger.info('model summary: ')
    summary(network, input_size=(configs['n_channels'], configs['img_size'], configs['img_size']))

    if configs["n_gpus"] > 1:
        network = nn.DataParallel(network)

    ### Metrics ###
    metrics = Metrics(writer=writer, metrics_dir=paths.metrics_dir)

    ### Train or Test ###
    kwargs = {
        'device': device,
        'network': network,
        'optimizer': optimizer,
        'criterion': criterion,
        'data_loaders': (train_loader, test_loader),
        'metrics': metrics,
        'writer': writer,
        'save_ckpt_interval': configs['save_ckpt_interval'],
        'ckpt_dir': paths.ckpt_dir,
        'img_outdir': paths.img_outdir,
    }
    generalizer = Generalizer(**kwargs)

    if args.inference:
        if not configs['resume']:
            logger.info('No checkpoint found for inference!')
        logger.info('mode: inference\n')
        generalizer.test(epoch=start_epoch, inference=True)
    else:
        logger.info('mode: train\n')
        generalizer.train(n_epochs=configs['n_epochs'], start_epoch=start_epoch)
def get_solutions_to_evaluate(solve_strategy: SolveStrategy,
                              model_name: str,
                              batch_size: int,
                              platform: str,
                              input_shape=None,
                              model_version="v1",
                              buffer_mem: int = 0) -> List[Tuple[RSResult, str]]:
    """
    :param solve_strategy: solver strategy whose cached results to query
    :param model_name: name of the Keras model
    :param batch_size: training batch size
    :param platform: hardware platform whose memory budget applies
    :param input_shape: optional model input shape
    :param model_version: model version string
    :param buffer_mem: extra memory reserved on top of each solution's peak RAM
    :return: List of (RSResult, cache key) tuples that fit the platform's memory
        budget, sorted by increasing compute cost. Empty if no solution is cached
        or none fits under the budget.
    """
    logger = setup_logger("test_execution_get_solution")

    # Load all results for this configuration, regardless of budget
    key_prefix = RedisCache.make_key(platform=platform,
                                     model_name=model_name,
                                     model_version=model_version,
                                     batch_size=batch_size,
                                     input_shape=input_shape)
    cache = RedisCache(key_prefix=key_prefix)
    cost_file = f"b{batch_size}_{platform}.npy"
    logger.info(
        f"Querying results for SS={solve_strategy}, model_name={model_name}, bs={batch_size}, "
        f"platform={platform}, cost_file={cost_file}, key prefix={key_prefix}")
    results, keys = cache.read_results(solver=solve_strategy,
                                       cost_file=cost_file,
                                       model_name=model_name)
    if not results:
        logger.error(
            f"No solutions found in cache for SS={solve_strategy}, model_name={model_name}, "
            f"bs={batch_size}, platform={platform}, cost_file={cost_file}, key prefix={key_prefix}")
        return []

    # Filter results to those that abide by the budget
    platform_budget = platform_memory(platform)
    within_budget = []
    for result, key in zip(results, keys):
        if not result.peak_ram:
            logger.warning(f"Falsey peak ram? {result.peak_ram}")
            continue
        if result.peak_ram + buffer_mem <= platform_budget:
            within_budget.append((result, key))
    logger.info(
        f"Out of {len(results)} solver results, {len(within_budget)} had peak ram "
        f"<= {platform_budget} - {buffer_mem}")
    if not within_budget:
        logger.warning(
            f"While {len(results)} solutions were found in cache, no solutions are within budget")
        return []

    # Return solutions in increasing order of cost, so the first entry is the
    # minimum-compute solution within the budget.
    within_budget.sort(key=lambda r: r[0].cpu)
    return within_budget
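# Hedged usage sketch: because the returned list is sorted by ascending compute
# cost, the first entry is the cheapest in-budget solution. The model name and
# platform below are hypothetical placeholders.
#
#   candidates = get_solutions_to_evaluate(SolveStrategy.OPTIMAL_ILP_GC,
#                                          model_name="vgg16", batch_size=256,
#                                          platform="p32xlarge")
#   if candidates:
#       best_result, best_key = candidates[0]  # min-compute solution within budget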
def main():
    # argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--config_file', help='path of config file', default=None, type=str)
    # Note: `type=bool` would treat any non-empty string as True; use a flag instead.
    parser.add_argument('--clean_run', help='run from scratch', action='store_true')
    parser.add_argument('opts', help='modify arguments', default=None, nargs=argparse.REMAINDER)
    args = parser.parse_args()

    # config setup
    if args.config_file is not None:
        cfg.merge_from_file(args.config_file)
    if args.opts is not None:
        cfg.merge_from_list(args.opts)
    cfg.freeze()

    if args.clean_run:
        if os.path.exists(f'../experiments/{cfg.SYSTEM.EXP_NAME}'):
            shutil.rmtree(f'../experiments/{cfg.SYSTEM.EXP_NAME}')
        if os.path.exists(f'../experiments/runs/{cfg.SYSTEM.EXP_NAME}'):
            shutil.rmtree(f'../experiments/runs/{cfg.SYSTEM.EXP_NAME}')
        # Note: sleeping to make tensorboard delete its cache.
        time.sleep(5)

    search = defaultdict()
    search['lr'], search['momentum'], search['factor'], search['step_size'] = [True] * 4

    set_seeds(cfg)
    logdir, chk_dir = save_config(cfg.SAVE_ROOT, cfg)
    writer = SummaryWriter(log_dir=logdir)

    # setup logger
    logger_dir = Path(chk_dir).parent
    logger = setup_logger(cfg.SYSTEM.EXP_NAME, save_dir=logger_dir)

    # Model
    prediction_model = BaseModule(cfg)
    noise_model = NoiseModule(cfg)
    model = [prediction_model, noise_model]
    device = cfg.SYSTEM.DEVICE if torch.cuda.is_available() else 'cpu'

    # load the data
    train_loader = get_loader(cfg, 'train')
    val_loader = get_loader(cfg, 'val')

    prediction_model, noise_model = model
    prediction_model.to(device)

    lr = cfg.SOLVER.LR
    momentum = cfg.SOLVER.MOMENTUM
    weight_decay = cfg.SOLVER.WEIGHT_DECAY
    betas = cfg.SOLVER.BETAS
    step_size = cfg.SOLVER.STEP_SIZE
    decay_factor = cfg.SOLVER.FACTOR

    # Optimizer
    if cfg.SOLVER.OPTIMIZER == 'Adam':
        optimizer = optim.Adam(prediction_model.parameters(),
                               lr=lr,
                               weight_decay=weight_decay,
                               betas=betas)
    elif cfg.SOLVER.OPTIMIZER == 'SGD':
        optimizer = optim.SGD(prediction_model.parameters(),
                              lr=lr,
                              weight_decay=weight_decay,
                              momentum=momentum)

    if cfg.SOLVER.SCHEDULER == 'StepLR':
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=decay_factor)
    elif cfg.SOLVER.SCHEDULER == 'ReduceLROnPlateau':
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                         factor=cfg.SOLVER.FACTOR,
                                                         min_lr=cfg.SOLVER.MIN_LR,
                                                         patience=cfg.SOLVER.PAITENCE,
                                                         cooldown=cfg.SOLVER.COOLDOWN,
                                                         threshold=cfg.SOLVER.THRESHOLD,
                                                         eps=1e-24)

    # checkpointer
    chkpt = Checkpointer(prediction_model,
                         optimizer,
                         scheduler=scheduler,
                         save_dir=chk_dir,
                         logger=logger,
                         save_to_disk=True)
    offset = 0
    checkpointer = chkpt.load()
    if checkpointer:
        offset = checkpointer.pop('epoch')

    loader = [train_loader, val_loader]
    logger.info(f'Same optimizer: {scheduler.optimizer == optimizer}')
    logger.info(cfg)
    model = [prediction_model, noise_model]
    train(cfg, model, optimizer, scheduler, loader, chkpt, writer, offset)

    test_loader = get_loader(cfg, 'test')
    test(cfg, prediction_model, test_loader, writer, logger)
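# A hedged usage sketch (script name, config path, and override keys are
# assumptions): a typical invocation might be
#
#   python train.py --config_file ../configs/base.yaml --clean_run SOLVER.LR 0.01
#
# where --clean_run wipes the previous experiment directories and the trailing
# KEY VALUE pairs are captured by the `opts` REMAINDER positional and applied
# via cfg.merge_from_list(args.opts).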