예제 #1
0
def get_allstrat_ram(args):
    """Log peak RAM and compute cost for every cached solver strategy.

    Iterates over all ``SolveStrategy`` members (except ``NOT_SPECIFIED``),
    looks up the cached solution matching the CLI arguments, and logs its
    peak RAM and CPU cost (or a warning when no solution is cached).

    :param args: parsed CLI namespace with ``platform``, ``model_name``,
        ``model_version``, ``batch_size``, ``input_shape``, ``strategy``
        and ``buffer_mem_mb`` attributes.
    """
    # Imported lazily, presumably to avoid a circular import at module load
    # time — TODO confirm.
    from evaluation.eval_execution import get_solution_to_evaluate

    log_base = os.path.join(
        "data", "get_allstrat_ram",
        f"{args.platform}_{args.model_name}_{args.model_version}_{args.batch_size}_{args.input_shape}_{args.strategy}_{args.buffer_mem_mb}_gradless_eagerfalse"
    )
    os.makedirs(log_base, exist_ok=True)
    logger = setup_logger("get_allstrat_ram")

    # Load redis connection settings from a .env file, if one is present.
    dotenv_location = dotenv.find_dotenv()
    if len(dotenv_location):
        logger.info(f'Loading dotenv config from {dotenv_location}')
        dotenv.load_dotenv(dotenv_location)
    else:
        # logger.warn is a deprecated alias; logger.warning is the real API.
        logger.warning("Failed to load dotenv config!")

    for strategy in SolveStrategy:
        if strategy == SolveStrategy.NOT_SPECIFIED:
            continue
        result, result_key = get_solution_to_evaluate(
            strategy, args.model_name, args.batch_size, args.platform,
            args.input_shape, args.model_version, args.buffer_mem_mb)

        if result:
            logger.info(
                f"For strategy {strategy.name}, peak ram is {result.peak_ram:.3E}, compute is {result.cpu:.3E}"
            )
        else:
            logger.warning(f"no solution for strategy {strategy.name}")
예제 #2
0
def run_single_model(args):
    """Execute one (model, strategy) configuration and pickle its metrics.

    Runs ``execute_one`` for the strategy given on the command line and
    writes a ``metrics.pickle`` file with the run configuration and the
    measured throughput.

    :param args: parsed CLI namespace with ``platform``, ``model_name``,
        ``model_version``, ``batch_size``, ``input_shape``, ``strategy``,
        ``buffer_mem_mb`` and ``num_runs`` attributes.
    """
    log_base = os.path.join(
        "data", "run_single_model",
        f"{args.platform}_{args.model_name}_{args.model_version}_{args.batch_size}_{args.input_shape}_{args.strategy}_{args.buffer_mem_mb}_gradless_eagerfalse"
    )
    os.makedirs(log_base, exist_ok=True)
    logger = setup_logger("run_single_model")

    # Load redis connection settings from a .env file, if one is present.
    dotenv_location = dotenv.find_dotenv()
    if len(dotenv_location):
        logger.info(f'Loading dotenv config from {dotenv_location}')
        dotenv.load_dotenv(dotenv_location)
    else:
        # logger.warn is a deprecated alias; logger.warning is the real API.
        logger.warning("Failed to load dotenv config!")

    strategy = SolveStrategy(args.strategy)
    result, result_key, throughput = execute_one(
        log_base=log_base,
        solve_strategy=strategy,
        model_name=args.model_name,
        batch_size=args.batch_size,
        platform=args.platform,
        input_shape=args.input_shape,
        model_version=args.model_version,
        num_runs=args.num_runs,
        buffer_mem=args.buffer_mem_mb * 1000 * 1000)  # MB -> bytes

    if result is None:
        logger.error("No result returned from execute_one")
        return

    metrics_single = dict(
        solve_strategy=strategy,
        model_name=args.model_name,
        batch_size=args.batch_size,
        platform=args.platform,
        input_shape=args.input_shape,
        model_version=args.model_version,
        num_runs=args.num_runs,
        result_key=result_key,
        buffer_mem=args.buffer_mem_mb * 1000 * 1000,
        throughput_it_per_s=throughput,
    )

    # ILP runs also record solver statistics for later analysis.
    if strategy == SolveStrategy.OPTIMAL_ILP_GC:
        metrics_single["vars"] = result.ilp_num_variables
        metrics_single["constraints"] = result.ilp_num_constraints
        metrics_single["solve_time"] = result.solve_time_s

    output_file = os.path.join(log_base, "metrics.pickle")
    with open(output_file, "wb") as f:
        pickle.dump(metrics_single, f, protocol=pickle.HIGHEST_PROTOCOL)
    logger.info(f"Saved throughput metrics to {output_file}")
예제 #3
0
def execute_one(log_base: str,
                solve_strategy: SolveStrategy,
                model_name: str,
                batch_size: int,
                platform: str,
                input_shape=None,
                model_version="v1",
                num_runs=16,
                buffer_mem: int = 0) -> Tuple[Optional[RSResult], str, int]:
    """Run the cheapest runnable cached solution and measure its throughput.

    Candidates come back from ``get_solutions_to_evaluate`` already sorted by
    compute cost; each is tried in turn and the first one that executes
    successfully is returned.

    :param log_base: directory for logs/artifacts produced during the run.
    :param solve_strategy: which solver's cached results to evaluate.
    :param buffer_mem: extra memory headroom (bytes) required on the platform.
    :return: ``(result, result_key, throughput)`` for the first runnable
        candidate, or ``(None, "", 0)`` when none could be executed.
    """
    logger = setup_logger("eval_one")
    results_and_keys = get_solutions_to_evaluate(solve_strategy, model_name,
                                                 batch_size, platform,
                                                 input_shape, model_version,
                                                 buffer_mem)
    if not results_and_keys:
        logger.info("No results found")
        return None, "", 0

    if not EAGER:
        tf1.disable_eager_execution()
    for result, result_key in results_and_keys:
        # Start from a clean session so graphs from failed candidates don't
        # accumulate across iterations.
        tf.keras.backend.clear_session()
        model = get_keras_model(model_name, input_shape=input_shape)
        tf2 = TF2ExtractorParams(model,
                                 batch_size=batch_size,
                                 log_base=log_base)
        loss_fn = categorical_cross_entropy  # TODO: vgg_unet may need a different loss
        graph = tf2.g

        # TODO TEST THIS VS TENSORSPEC
        runner = TF2Runner(model,
                           graph,
                           result.schedule,
                           loss_fn=loss_fn,
                           eager=EAGER,
                           log_base=log_base,
                           batch_size=batch_size)

        try:
            throughput = evaluate_solved_model(result=result,
                                               runner=runner,
                                               warmup=10 if EAGER else 64,
                                               trials=num_runs,
                                               batch_size=batch_size)
            logger.info(
                f"Successfully executed model with predicted memory usage {result.peak_ram}, "
                f"predicted cpu {result.cpu}, actual throughput {throughput}")
            return result, result_key, throughput
        except Exception:
            # logger.exception records the full formatted traceback; the old
            # code logged only repr(e.__traceback__), which is unreadable.
            logger.exception("Error running model with predicted mem usage %s",
                             result.peak_ram)
            logger.error("Skipping result, going to next candidate.")
    return None, "", 0
예제 #4
0
def evaluate_solved_model(result: RSResult, runner: TF2Runner, warmup, trials,
                          batch_size):
    """Warm up then profile ``runner.tf_graph`` and return iterations/sec.

    :param result: the solver result being evaluated (used for logging only).
    :param runner: runner whose ``tf_graph`` callable executes one step.
    :param warmup: number of un-timed warmup batches.
    :param trials: number of timed batches used to compute throughput.
    :param batch_size: number of samples per random batch.
    :return: measured throughput in iterations per second.
    """
    logger = setup_logger("evaluate_solved_model")

    recompute_baseline = runner.tf_graph

    logger.debug("Warming up models")
    # Assumes the Keras model has image-like input (batch, h, w, ...) —
    # TODO confirm for non-image models.
    in_shape = runner.keras_model.input_shape
    out_shape = runner.keras_model.output_shape

    h = in_shape[1]
    w = in_shape[2]
    c = np.prod(out_shape[1:])
    reshape_to = list(out_shape)
    # Route debug output through the logger instead of a stray print().
    logger.debug("label reshape target: %s", reshape_to)
    reshape_to[0] = -1  # -1 lets TF infer the (possibly partial) batch dim
    for data in tqdm([
            random_batch(batch_size, img_h=h, img_w=w, num_classes=c)
            for _ in range(warmup)
    ],
                     desc="Warmup"):
        dat, lab = data
        lab = tf.reshape(lab, reshape_to)
        recompute_baseline(dat, lab)

    # run actual evaluation
    timer = Timer("timer_recompute")
    for i in tqdm(range(trials), desc="Profiling"):
        # TODO: Should we generate random batches on CPU and copy to GPU, inside the timing loop?
        #       This would model the overhead of loading data, and bring throughputs down to be
        #       more realistic

        images, labels = random_batch(batch_size,
                                      img_h=h,
                                      img_w=w,
                                      num_classes=c)
        labels = tf.reshape(labels, reshape_to)
        # Only the forward/backward execution is timed, not batch generation.
        with timer:
            loss, gradients = recompute_baseline(images, labels)

        # todo assert correctness of the model by applying gradients

    tput = trials / timer.elapsed
    logger.info(f"{result.solve_strategy} throughput: {tput :2.4} iters/s")
    return tput
예제 #5
0
    def __init__(self,
                 keras_model: tf.keras.models.Model,
                 g: DFGraph,
                 schedule: Schedule,
                 loss_fn=categorical_cross_entropy,
                 eager: bool = True,
                 log_base: str = None,
                 debug=False,
                 batch_size=None):
        """Build a runner that executes *schedule* over *keras_model*.

        :param keras_model: model whose forward/backward pass is rescheduled.
        :param g: dataflow graph extracted from the model.
        :param schedule: recomputation schedule to execute.
        :param loss_fn: loss used when building the training graph.
        :param eager: whether to run eagerly or build a tf.function graph.
        :param log_base: directory for the runner's log file; when None,
            logging falls back to a logger without a file handler.
        :param debug: enables extra debug behavior (used by subclass/helpers).
        :param batch_size: batch size baked into the generated graph.
        """
        self.log_base = log_base
        # Bug fix: log_base defaults to None, but os.path.join(None, ...)
        # raises TypeError — only build a file path when log_base is given.
        if log_base is not None:
            self.logger = setup_logger("TF2Runner",
                                       os.path.join(log_base, 'TF2Runner.log'))
        else:
            self.logger = setup_logger("TF2Runner")
        self.debug = debug
        self.schedule = schedule
        self.eager = eager
        self.batch_size = batch_size

        self.loss_fn = loss_fn
        self.keras_model = keras_model
        self.g = g

        # Built once up front; the generated graph is what callers execute.
        self.tf_graph = self._generate_tf_graph()
예제 #6
0
def main():
    """Train or evaluate a CNN auto-encoder classifier from a YAML config."""
    args = parser()

    ### setup configs ###
    configfile = args.configfile

    with open(configfile) as f:
        # Bug fix: plain yaml.load without a Loader is deprecated (an error in
        # PyYAML >= 6) and unsafe; the sibling scripts already use safe_load.
        configs = yaml.safe_load(f)

    ## path process (path definition, make directories)
    now = datetime.now().isoformat()
    log_dir = Path(configs['log_dir']) / now
    paths = Paths(log_dir=log_dir)

    ### setup logs and summary writer ###
    setup_logger(logfile=paths.logfile)

    writer = SummaryWriter(str(paths.summary_dir))

    ### setup GPU or CPU ###
    if configs['n_gpus'] > 0 and torch.cuda.is_available():
        logger.info('CUDA is available! using GPU...\n')
        device = torch.device('cuda')
    else:
        logger.info('using CPU...\n')
        device = torch.device('cpu')

    ### Dataset ###
    logger.info('preparing dataset...')
    dataset_name = configs['dataset']
    logger.info(f'==> dataset: {dataset_name}\n')

    if configs['dataset'] == 'cifar10':
        transform = transforms.Compose([
            # Bug fix: Resize takes one size argument (an (h, w) tuple); with
            # two positional ints the second was bound to `interpolation`.
            transforms.Resize((configs['img_size'], configs['img_size'])),
            transforms.ToTensor(),
            transforms.Normalize(configs['color_mean'], configs['color_std']),
        ])
        train_dataset = datasets.CIFAR10(root=configs['data_root'],
                                         train=True,
                                         transform=transform,
                                         download=True)
        test_dataset = datasets.CIFAR10(root=configs['data_root'],
                                        train=False,
                                        transform=transform,
                                        download=True)
    elif configs['dataset'] == 'custom':
        train_transform = DataTransforms(img_size=configs['img_size'],
                                         color_mean=configs['color_mean'],
                                         color_std=configs['color_std'],
                                         phase='train')
        test_transform = DataTransforms(img_size=configs['img_size'],
                                        color_mean=configs['color_mean'],
                                        color_std=configs['color_std'],
                                        phase='test')
        train_img_list, train_lbl_list, test_img_list, test_lbl_list = make_datapath_list(
            root=configs['data_root'])
        train_dataset = Dataset(train_img_list,
                                train_lbl_list,
                                transform=train_transform)
        test_dataset = Dataset(test_img_list,
                               test_lbl_list,
                               transform=test_transform)
    else:
        logger.debug('dataset is not supported')
        raise ValueError('dataset is not supported')

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=configs['batch_size'],
        shuffle=True,
        num_workers=8)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=configs['batch_size'],
                                              shuffle=False,
                                              num_workers=8)

    ### Network ###
    logger.info('preparing network...')

    network = CNNAutoEncoder(in_channels=configs['n_channels'],
                             n_classes=configs['n_classes'])

    network = network.to(device)
    cnn_criterion = nn.CrossEntropyLoss()
    ae_criterion = nn.MSELoss()
    optimizer = optim.Adam(network.parameters(), lr=configs['lr'])

    if configs['resume']:
        # Load checkpoint
        logger.info('==> Resuming from checkpoint...\n')
        if not Path(configs['resume']).exists():
            logger.info('No checkpoint found !')
            raise ValueError('No checkpoint found !')

        ckpt = torch.load(configs['resume'])
        network.load_state_dict(ckpt['model_state_dict'])
        optimizer.load_state_dict(ckpt['optimizer_state_dict'])
        start_epoch = ckpt['epoch']
        loss = ckpt['loss']
    else:
        logger.info('==> Building model...\n')
        start_epoch = 0

    logger.info('model summary: ')
    summary(network,
            input_size=(configs['n_channels'], configs['img_size'],
                        configs['img_size']))

    # DataParallel must wrap the model after loading state dicts, since the
    # checkpoint keys are saved without the `module.` prefix.
    if configs["n_gpus"] > 1:
        network = nn.DataParallel(network)

    ### Metrics ###
    metrics = Metrics(n_classes=configs['n_classes'],
                      classes=configs['classes'],
                      writer=writer,
                      metrics_dir=paths.metrics_dir,
                      plot_confusion_matrix=plot_confusion_matrix)

    ### Train or Test ###
    kwargs = {
        'device': device,
        'network': network,
        'optimizer': optimizer,
        'criterions': (cnn_criterion, ae_criterion),
        'classification_loss_weight': configs['classification_loss_weight'],
        'autoencoder_loss_weight': configs['autoencoder_loss_weight'],
        'data_loaders': (train_loader, test_loader),
        'metrics': metrics,
        'writer': writer,
        # NOTE(review): key name looks misspelled ('n_classses') — kept as-is
        # because CNNClassifier may expect exactly this kwarg; confirm there.
        'n_classses': configs['n_classes'],
        'save_ckpt_interval': configs['save_ckpt_interval'],
        'ckpt_dir': paths.ckpt_dir,
    }

    cnn_classifier = CNNClassifier(**kwargs)

    if args.inference:
        if not configs['resume']:
            logger.info('No checkpoint found for inference!')
        logger.info('mode: inference\n')
        cnn_classifier.test(epoch=start_epoch, inference=True)
    else:
        logger.info('mode: train\n')
        cnn_classifier.train(n_epochs=configs['n_epochs'],
                             start_epoch=start_epoch)
예제 #7
0
파일: main.py 프로젝트: ryoherisson/pspnet
def main(args):
    """Train or run inference for a PSPNet semantic-segmentation model.

    :param args: namespace with ``configfile`` (path to YAML config) and
        ``inference`` (bool) attributes.
    """
    with open(args.configfile) as f:
        configs = yaml.safe_load(f)

    ## path process (path definition, make directories)
    now = datetime.now().isoformat()
    log_dir = Path(configs['log_dir']) / now
    paths = Paths(log_dir=log_dir)

    ### setup logs and summary writer ###
    setup_logger(logfile=paths.logfile)

    writer = SummaryWriter(str(paths.summary_dir))

    ### setup GPU or CPU ###
    if configs['n_gpus'] > 0 and torch.cuda.is_available():
        logger.info('CUDA is available! using GPU...\n')
        device = torch.device('cuda')
    else:
        logger.info('using CPU...\n')
        device = torch.device('cpu')

    ### Dataset ###
    logger.info('preparing dataset...')
    data_root = configs['data_root']
    logger.info(f'==> dataset path: {data_root}\n')

    train_img_list, train_annot_list, test_img_list, test_annot_list = make_datapath_list(rootpath=data_root, train_data=configs['train_txt'], test_data=configs['test_txt'],
        img_extension=configs['img_extension'], anno_extension=configs['anno_extension'])

    train_transform = DataTransform(img_size=configs['img_size'], color_mean=configs['color_mean'], color_std=configs['color_std'], mode='train')
    test_transform = DataTransform(img_size=configs['img_size'], color_mean=configs['color_mean'], color_std=configs['color_std'], mode='test')

    train_dataset = VOCDataset(train_img_list, train_annot_list, transform=train_transform, label_color_map=configs['label_color_map'])
    test_dataset = VOCDataset(test_img_list, test_annot_list, transform=test_transform, label_color_map=configs['label_color_map'])

    ### DataLoader ###
    train_loader = DataLoader(train_dataset, batch_size=configs['batch_size'], shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=configs['batch_size'], shuffle=False)

    ### Network ###
    logger.info('preparing network...')

    network = PSPNet(n_classes=configs['n_classes'], img_size=configs['img_size'], img_size_8=configs['input_size_8'])
    network = network.to(device)
    criterion = PSPLoss(aux_weight=configs['aux_weight'])
    optimizer = optim.Adam(network.parameters(), lr=configs['lr'], weight_decay=configs['decay'])

    if configs['resume']:
        # Load checkpoint
        logger.info('==> Resuming from checkpoint...\n')
        if not Path(configs['resume']).exists():
            logger.info('No checkpoint found !')
            raise ValueError('No checkpoint found !')

        ckpt = torch.load(configs['resume'])
        network.load_state_dict(ckpt['model_state_dict'])
        optimizer.load_state_dict(ckpt['optimizer_state_dict'])
        start_epoch = ckpt['epoch']
        loss = ckpt['loss']
        # Bug fix: the original reset start_epoch to 0 right after reading it
        # from the checkpoint, so training always restarted at epoch 0.
    else:
        logger.info('==> Building model...\n')
        start_epoch = 0

    logger.info('model summary: ')
    summary(network, input_size=(configs['n_channels'], configs['img_size'], configs['img_size']))

    # DataParallel must wrap the model after state dicts are loaded, since
    # the checkpoint keys are saved without the `module.` prefix.
    if configs['n_gpus'] > 1:
        network = nn.DataParallel(network)

    ### Metrics ###
    metrics_cfg = {
        'n_classes': configs['n_classes'],
        'classes': configs['classes'],
        'img_size': configs['img_size'],
        'writer': writer,
        'metrics_dir': paths.metrics_dir,
    }

    metrics = Metrics(**metrics_cfg)

    ### Visualize Results ###
    vis_img = VisImage(n_classes=configs['n_classes'], label_color_map=configs['label_color_map'])

    ### Train or Inference ###
    kwargs = {
        'device': device,
        'network': network,
        'optimizer': optimizer,
        'criterion': criterion,
        'data_loaders': (train_loader, test_loader),
        'metrics': metrics,
        'vis_img': vis_img,
        'img_size': configs['img_size'],
        'writer': writer,
        'save_ckpt_interval': configs['save_ckpt_interval'],
        'ckpt_dir': paths.ckpt_dir,
        'img_outdir': paths.img_outdir,
    }

    semantic_segmentaion = SemanticSegmentation(**kwargs)

    if args.inference:
        if not configs['resume']:
            logger.info('No checkpoint found for inference!')
        logger.info('mode: inference\n')
        semantic_segmentaion.test(epoch=start_epoch, inference=True)
    else:
        logger.info('mode: train\n')
        semantic_segmentaion.train(n_epochs=configs['n_epochs'], start_epoch=start_epoch)
예제 #8
0
def main():
    """Train or run inference for an SSD object-detection model."""
    args = parser()

    with open(args.configfile) as f:
        configs = yaml.safe_load(f)

    ## path process (path definition, make directories)
    now = datetime.now().isoformat()
    log_dir = Path(configs['log_dir']) / now
    paths = Paths(log_dir=log_dir)

    ### setup logs and summary writer ###
    setup_logger(logfile=paths.logfile)

    writer = SummaryWriter(str(paths.summary_dir))

    ### setup GPU or CPU ###
    if configs['n_gpus'] > 0 and torch.cuda.is_available():
        logger.info('CUDA is available! using GPU...\n')
        device = torch.device('cuda')
    else:
        logger.info('using CPU...\n')
        device = torch.device('cpu')

    ### Dataset ###
    logger.info('preparing dataset...')
    data_root = configs['data_root']
    logger.info(f'==> dataset path: {data_root}\n')

    train_img_list, train_annot_list, test_img_list, test_annot_list = make_datapath_list(rootpath=data_root, train_data=configs['train_txt'], test_data=configs['test_txt'])

    train_transform = DataTransform(img_size=configs['img_size'], color_mean=configs['color_mean'], mode='train')
    test_transform = DataTransform(img_size=configs['img_size'], color_mean=configs['color_mean'], mode='test')
    transform_annot = Anno_xml2list(configs['classes'])

    train_dataset = VOCDataset(train_img_list, train_annot_list, transform=train_transform, transform_annot=transform_annot)
    test_dataset = VOCDataset(test_img_list, test_annot_list, transform=test_transform, transform_annot=transform_annot)

    ### DataLoader ###
    # od_collate_fn handles the variable number of boxes per image.
    train_loader = DataLoader(train_dataset, batch_size=configs['batch_size'], shuffle=True, collate_fn=od_collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=configs['batch_size'], shuffle=False, collate_fn=od_collate_fn)

    ### Network ###
    logger.info('preparing network...')

    ssd_cfg = {
        'n_classes': configs['n_classes'],
        'img_size': configs['img_size'],
        'bbox_aspect_num': configs['bbox_aspect_num'],  # number of aspect ratios of dbox
        'feature_maps': configs['feature_maps'],  # feature map size of each source
        'steps': configs['steps'],  # size of dbox
        'min_sizes': configs['min_sizes'],  # size of dbox
        'max_sizes': configs['max_sizes'],  # size of dbox
        'aspect_ratios': configs['aspect_ratios'],  # aspect ratios
        'variances': configs['variances'], # variances for decode
        'conf_thresh': configs['conf_thresh'],
        'top_k': configs['top_k'],
        'nms_thresh': configs['nms_thresh'],
        'device': device,
    }

    network = SSD(**ssd_cfg)
    network = network.to(device)
    # NOTE(review): config key 'jaccord_thresh' looks misspelled (jaccard);
    # kept as-is to match the existing config schema.
    criterion = MultiBoxLoss(jaccard_thresh=configs['jaccord_thresh'], neg_pos=configs['neg_pos'], device=device)
    optimizer = optim.SGD(network.parameters(), lr=configs['lr'], weight_decay=configs['decay'])

    def weights_init(m):
        # He initialization for conv layers; biases zeroed.
        if isinstance(m, nn.Conv2d):
            init.kaiming_normal_(m.weight.data)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0.0)

    if configs['pretrained']:
        # Load pretrained model
        logger.info('==> Pretrained VGG...\n')
        if not Path(configs['pretrained']).exists():
            # Bug fix: previously this only logged and then torch.load crashed
            # with an obscure error; fail fast like the resume branch does.
            logger.info('No pretrained model found!')
            raise ValueError('No pretrained model found!')

        vgg_weights = torch.load(configs['pretrained'])
        network.vgg.load_state_dict(vgg_weights)

    # The non-VGG heads are always freshly initialized.
    network.extras.apply(weights_init)
    network.loc.apply(weights_init)
    network.conf.apply(weights_init)

    if configs['resume']:
        # Load checkpoint
        logger.info('==> Resuming from checkpoint...\n')
        if not Path(configs['resume']).exists():
            logger.info('No checkpoint found !')
            raise ValueError('No checkpoint found !')

        # The checkpoint here is a raw state_dict (no optimizer/epoch/loss),
        # so training still restarts from epoch 0 after a resume.
        ckpt = torch.load(configs['resume'])
        network.load_state_dict(ckpt)
        start_epoch = 0
    else:
        logger.info('==> Building model...\n')
        start_epoch = 0

    # logging
    logger.info('model summary: ')
    logger.info(summary(network, input_size=(configs['n_channels'], configs['img_size'], configs['img_size'])))

    # DataParallel must wrap the model after state dicts are loaded, since
    # the checkpoint keys are saved without the `module.` prefix.
    if configs["n_gpus"] > 1:
        network = nn.DataParallel(network)

    ### Metrics ###
    metrics_cfg = {
        'n_classes': configs['n_classes'],
        'classes': configs['classes'],
        'img_size': configs['img_size'],
        'confidence_level': configs['confidence_level'],
        'writer': writer,
        'metrics_dir': paths.metrics_dir,
        'imgs_dir': paths.img_outdir,
    }

    metrics = Metrics(**metrics_cfg)

    ### Visualize Results ###
    box_vis = BoxVis(configs['confidence_level'], configs['classes'], configs['label_color_map'], configs['font_path'])

    ### Train or Inference ###
    kwargs = {
        'device': device,
        'network': network,
        'optimizer': optimizer,
        'criterion': criterion,
        'data_loaders': (train_loader, test_loader),
        'metrics': metrics,
        'box_vis': box_vis,
        'img_size': configs['img_size'],
        'writer': writer,
        'save_ckpt_interval': configs['save_ckpt_interval'],
        'ckpt_dir': paths.ckpt_dir,
        'img_outdir': paths.img_outdir,
    }

    object_detection = ObjectDetection(**kwargs)

    if args.inference:
        if not configs['resume']:
            logger.info('No checkpoint found for inference!')
        logger.info('mode: inference\n')
        object_detection.test(epoch=start_epoch, inference=True)
    else:
        logger.info('mode: train\n')
        object_detection.train(n_epochs=configs['n_epochs'], start_epoch=start_epoch)
예제 #9
0
def main():
    """Train or run inference for a VAE from a YAML config."""
    args = parser()

    ### setup configs ###
    configfile = args.configfile

    with open(configfile) as f:
        # Bug fix: plain yaml.load without a Loader is deprecated (an error in
        # PyYAML >= 6) and unsafe; the sibling scripts already use safe_load.
        configs = yaml.safe_load(f)

    ## path process (path definition, make directories)
    now = datetime.now().isoformat()
    log_dir = Path(configs['log_dir']) / now
    paths = Paths(log_dir=log_dir)

    ### setup logs and summary writer ###
    setup_logger(logfile=paths.logfile)

    writer = SummaryWriter(str(paths.summary_dir))

    ### setup GPU or CPU ###
    if configs['n_gpus'] > 0 and torch.cuda.is_available():
        logger.info('CUDA is available! using GPU...\n')
        device = torch.device('cuda')
    else:
        logger.info('using CPU...\n')
        device = torch.device('cpu')

    ### Dataset ###
    logger.info('preparing dataset...')

    train_transform = DataTransforms(img_size=configs['img_size'],
                                     color_mean=configs['color_mean'],
                                     color_std=configs['color_std'],
                                     phase='train')
    test_transform = DataTransforms(img_size=configs['img_size'],
                                    color_mean=configs['color_mean'],
                                    color_std=configs['color_std'],
                                    phase='test')
    train_img_list, test_img_list = make_datapath_list(
        root=configs['data_root'])
    train_dataset = Dataset(train_img_list, transform=train_transform)
    test_dataset = Dataset(test_img_list, transform=test_transform)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=configs['batch_size'],
        shuffle=True,
        num_workers=8)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=configs['batch_size'],
                                              shuffle=False,
                                              num_workers=8)

    ### Network ###
    logger.info('preparing network...')

    # h_dim/z_dim are fixed here rather than configurable — presumably tuned
    # for this dataset; consider moving them into the config.
    network = VAE(in_channels=configs['n_channels'],
                  h_dim=1024,
                  z_dim=32,
                  device=device)

    network = network.to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(network.parameters(), lr=configs['lr'])

    if configs['resume']:
        # Load checkpoint
        logger.info('==> Resuming from checkpoint...\n')
        if not Path(configs['resume']).exists():
            logger.info('No checkpoint found !')
            raise ValueError('No checkpoint found !')

        ckpt = torch.load(configs['resume'])
        network.load_state_dict(ckpt['model_state_dict'])
        optimizer.load_state_dict(ckpt['optimizer_state_dict'])
        start_epoch = ckpt['epoch']
        loss = ckpt['loss']
    else:
        logger.info('==> Building model...\n')
        start_epoch = 0

    logger.info('model summary: ')
    summary(network,
            input_size=(configs['n_channels'], configs['img_size'],
                        configs['img_size']))

    # DataParallel must wrap the model after state dicts are loaded, since
    # the checkpoint keys are saved without the `module.` prefix.
    if configs["n_gpus"] > 1:
        network = nn.DataParallel(network)

    ### Metrics ###
    metrics = Metrics(writer=writer, metrics_dir=paths.metrics_dir)

    ### Train or Test ###
    kwargs = {
        'device': device,
        'network': network,
        'optimizer': optimizer,
        'criterion': criterion,
        'data_loaders': (train_loader, test_loader),
        'metrics': metrics,
        'writer': writer,
        'save_ckpt_interval': configs['save_ckpt_interval'],
        'ckpt_dir': paths.ckpt_dir,
        'img_outdir': paths.img_outdir,
    }

    generalizer = Generalizer(**kwargs)

    if args.inference:
        if not configs['resume']:
            logger.info('No checkpoint found for inference!')
        logger.info('mode: inference\n')
        generalizer.test(epoch=start_epoch, inference=True)
    else:
        logger.info('mode: train\n')
        generalizer.train(n_epochs=configs['n_epochs'],
                          start_epoch=start_epoch)
예제 #10
0
def get_solutions_to_evaluate(
        solve_strategy: SolveStrategy,
        model_name: str,
        batch_size: int,
        platform: str,
        input_shape=None,
        model_version="v1",
        buffer_mem: int = 0) -> List[Tuple[RSResult, str]]:
    """Fetch cached solver results that fit within the platform memory budget.

    :param solve_strategy: which solver's cached results to query.
    :param model_name: model identifier used in the cache key.
    :param batch_size: batch size used in the cache key and cost file name.
    :param platform: target platform; determines the memory budget.
    :param input_shape: optional input shape component of the cache key.
    :param model_version: model version component of the cache key.
    :param buffer_mem: extra memory headroom (bytes) reserved on top of each
        solution's peak RAM.
    :return: list of ``(RSResult, cache key)`` pairs whose peak RAM plus
        ``buffer_mem`` fits the platform budget, sorted by increasing compute
        cost; empty list when nothing is cached or nothing fits.
    """
    logger = setup_logger("test_execution_get_solution")

    # Load all results for this configuration, regardless of budget
    key_prefix = RedisCache.make_key(platform=platform,
                                     model_name=model_name,
                                     model_version=model_version,
                                     batch_size=batch_size,
                                     input_shape=input_shape)
    cache = RedisCache(key_prefix=key_prefix)
    cost_file = f"b{batch_size}_{platform}.npy"
    # Bug fix: the original messages contained "model_name=f{model_name}" —
    # a stray 'f' printed literally before each interpolated value.
    logger.info(
        f"Querying results for SS={solve_strategy}, model_name={model_name}, bs={batch_size}, "
        f"platform={platform}, cost_file={cost_file}, key prefix={key_prefix}")
    results, keys = cache.read_results(solver=solve_strategy,
                                       cost_file=cost_file,
                                       model_name=model_name)
    if not results:
        logger.error(
            f"No solutions found in cache for SS={solve_strategy}, model_name={model_name}, "
            f"bs={batch_size}, platform={platform}, cost_file={cost_file}, key prefix={key_prefix}"
        )
        return []

    # Filter results to those that abide by the budget
    platform_budget = platform_memory(platform)
    within_budget = []
    for result, key in zip(results, keys):
        if not result.peak_ram:
            # logger.warn is a deprecated alias; logger.warning is the real API.
            logger.warning(f"Falsey peak ram? {result.peak_ram}")
            continue
        if result.peak_ram + buffer_mem <= platform_budget:
            within_budget.append((result, key))
    logger.info(
        f"Out of {len(results)} solver results, {len(within_budget)} had <= {platform_budget} - {buffer_mem} peak ram"
    )
    if not within_budget:
        logger.warning(
            f"While {len(results)} solutions were found in cache, no solutions are within budget"
        )
        return []

    # Return solutions in increasing order of cost. (A dead min-compute scan
    # that followed this return in the original has been removed.)
    within_budget.sort(key=lambda r: r[0].cpu)
    return within_budget
def main():
    """CLI entry point: build config, data loaders, model and optimizer,
    then train and finally evaluate on the test split."""
    # argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--config_file',
                        help='path of config file',
                        default=None,
                        type=str)
    # Bug fix: type=bool treats ANY non-empty string (including "False") as
    # True; a store_true flag gives the intended on/off behavior.
    parser.add_argument('--clean_run',
                        help='run from scratch',
                        action='store_true')
    parser.add_argument('opts',
                        help='modify arguments',
                        default=None,
                        nargs=argparse.REMAINDER)
    args = parser.parse_args()
    # config setup
    if args.config_file is not None:
        cfg.merge_from_file(args.config_file)
    if args.opts is not None: cfg.merge_from_list(args.opts)

    cfg.freeze()
    if args.clean_run:
        if os.path.exists(f'../experiments/{cfg.SYSTEM.EXP_NAME}'):
            shutil.rmtree(f'../experiments/{cfg.SYSTEM.EXP_NAME}')
        if os.path.exists(f'../experiments/runs/{cfg.SYSTEM.EXP_NAME}'):
            shutil.rmtree(f'../experiments/runs/{cfg.SYSTEM.EXP_NAME}')
            # Note!: Sleeping to make tensorboard delete it's cache.
            time.sleep(5)

    # Enable hyperparameter search on all four knobs.
    search = defaultdict()
    search['lr'], search['momentum'], search['factor'], search['step_size'] = [
        True
    ] * 4
    set_seeds(cfg)
    logdir, chk_dir = save_config(cfg.SAVE_ROOT, cfg)
    writer = SummaryWriter(log_dir=logdir)
    # setup logger
    logger_dir = Path(chk_dir).parent
    logger = setup_logger(cfg.SYSTEM.EXP_NAME, save_dir=logger_dir)
    # Model
    prediction_model = BaseModule(cfg)
    noise_model = NoiseModule(cfg)
    model = [prediction_model, noise_model]
    device = cfg.SYSTEM.DEVICE if torch.cuda.is_available() else 'cpu'
    # load the data
    train_loader = get_loader(cfg, 'train')
    val_loader = get_loader(cfg, 'val')
    prediction_model, noise_model = model
    prediction_model.to(device)
    lr = cfg.SOLVER.LR
    momentum = cfg.SOLVER.MOMENTUM
    weight_decay = cfg.SOLVER.WEIGHT_DECAY
    betas = cfg.SOLVER.BETAS
    step_size = cfg.SOLVER.STEP_SIZE
    decay_factor = cfg.SOLVER.FACTOR

    # Optimizer
    if cfg.SOLVER.OPTIMIZER == 'Adam':
        optimizer = optim.Adam(prediction_model.parameters(),
                               lr=lr,
                               weight_decay=weight_decay,
                               betas=betas)
    elif cfg.SOLVER.OPTIMIZER == 'SGD':
        optimizer = optim.SGD(prediction_model.parameters(),
                              lr=lr,
                              weight_decay=weight_decay,
                              momentum=momentum)
    else:
        # Fail fast: previously an unknown optimizer left `optimizer`
        # unbound and crashed later with a confusing NameError.
        raise ValueError(f'Unsupported optimizer: {cfg.SOLVER.OPTIMIZER}')
    if cfg.SOLVER.SCHEDULER == 'StepLR':
        scheduler = optim.lr_scheduler.StepLR(optimizer,
                                              step_size=step_size,
                                              gamma=decay_factor)
    elif cfg.SOLVER.SCHEDULER == 'ReduceLROnPlateau':
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            factor=cfg.SOLVER.FACTOR,
            min_lr=cfg.SOLVER.MIN_LR,
            patience=cfg.SOLVER.PAITENCE,
            cooldown=cfg.SOLVER.COOLDOWN,
            threshold=cfg.SOLVER.THRESHOLD,
            eps=1e-24)
    else:
        # Same reasoning as the optimizer check above.
        raise ValueError(f'Unsupported scheduler: {cfg.SOLVER.SCHEDULER}')
    # checkpointer
    chkpt = Checkpointer(prediction_model,
                         optimizer,
                         scheduler=scheduler,
                         save_dir=chk_dir,
                         logger=logger,
                         save_to_disk=True)
    offset = 0
    checkpointer = chkpt.load()
    # An empty dict means no checkpoint was found; otherwise resume from it.
    if not checkpointer == {}:
        offset = checkpointer.pop('epoch')
    loader = [train_loader, val_loader]
    print(f'Same optimizer, {scheduler.optimizer == optimizer}')
    print(cfg)
    model = [prediction_model, noise_model]
    train(cfg, model, optimizer, scheduler, loader, chkpt, writer, offset)
    test_loader = get_loader(cfg, 'test')
    test(cfg, prediction_model, test_loader, writer, logger)