Example #1
def main():
    # Build model.
    model = model_builder.build_model(cfg=cfg)

    # Read checkpoint.
    ckpt = torch.load(
        cfg.MODEL.PATH2CKPT,
        map_location=torch.device("cpu")) if cfg.GENERAL.RESUME else {}

    if cfg.GENERAL.RESUME:
        with utils.log_info(msg="Load pre-trained model.",
                            level="INFO",
                            state=True):
            model.load_state_dict(ckpt["model"])
    # Set device.
    model, device = utils.set_device(model, cfg.GENERAL.GPU)

    try:
        test_data_loader = data_loader.build_data_loader(
            cfg, cfg.DATA.DATASET, "test")
        generate(cfg=cfg,
                 model=model,
                 data_loader=test_data_loader,
                 device=device)
    except Exception as exc:
        utils.notify("Cannot build data loader for test set.", level="ERROR")
        raise ValueError("Failed to build data loader for test set.") from exc
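
The resume branch above loads the checkpoint onto the CPU before restoring the weights, so the file is not tied to the GPU it was saved from. A minimal, self-contained sketch of that pattern (the file name "toy.pth" and the tiny model are placeholders for this illustration, not names from the example):

import torch
import torch.nn as nn

# Save a checkpoint dict, then reload it onto the CPU and restore the weights.
model = nn.Linear(4, 2)
torch.save({"model": model.state_dict()}, "toy.pth")

ckpt = torch.load("toy.pth", map_location=torch.device("cpu"))
model.load_state_dict(ckpt["model"])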
Example #2
def main():
    # Set logger to record information.
    utils.check_env(cfg)
    logger = Logger(cfg)
    logger.log_info(cfg)
    metrics_handler = MetricsHandler(cfg.metrics)
    # utils.pack_code(cfg, logger=logger)

    # Build model.
    model = model_builder.build_model(cfg=cfg, logger=logger)
    optimizer = optimizer_helper.build_optimizer(cfg=cfg, model=model)
    lr_scheduler = lr_scheduler_helper.build_scheduler(cfg=cfg,
                                                       optimizer=optimizer)

    # Read checkpoint.
    ckpt = torch.load(cfg.model.path2ckpt) if cfg.gnrl.resume else {}
    if cfg.gnrl.resume:
        with logger.log_info(msg="Load pre-trained model.",
                             level="INFO",
                             state=True,
                             logger=logger):
            model.load_state_dict(ckpt["model"])
            optimizer.load_state_dict(ckpt["optimizer"])
            lr_scheduler.load_state_dict(ckpt["lr_scheduler"])

    # Set device.
    if cfg.gnrl.PIPLINE:
        model, device = utils.set_pipline(model, cfg)
    else:
        model, device = utils.set_device(model, cfg.gnrl.cuda)

    resume_epoch = ckpt["epoch"] if cfg.gnrl.resume else 0
    loss_fn = loss_fn_helper.build_loss_fn(cfg=cfg)

    # Prepare dataset.
    train_loaders, valid_loaders, test_loaders = dict(), dict(), dict()
    for dataset in cfg.data.datasets:
        if cfg.data[dataset].TRAIN:
            try:
                train_loaders[dataset] = data_loader.build_data_loader(
                    cfg, dataset, "train")
            except Exception:
                utils.notify(msg="Failed to build train loader of %s" %
                             dataset)
        if cfg.data[dataset].VALID:
            try:
                valid_loaders[dataset] = data_loader.build_data_loader(
                    cfg, dataset, "valid")
            except Exception:
                utils.notify(msg="Failed to build valid loader of %s" %
                             dataset)
        if cfg.data[dataset].TEST:
            try:
                test_loaders[dataset] = data_loader.build_data_loader(
                    cfg, dataset, "test")
            except Exception:
                utils.notify(msg="Failed to build test loader of %s" % dataset)

    # Train, evaluate model and save checkpoint.
    for epoch in range(1, cfg.train.max_epoch + 1):
        if resume_epoch >= epoch:
            continue

        eval_kwargs = {
            "epoch": epoch,
            "cfg": cfg,
            "model": model,
            "loss_fn": loss_fn,
            "device": device,
            "metrics_handler": metrics_handler,
            "logger": logger,
            "save": cfg.save.save,
        }
        train_kwargs = {
            "epoch": epoch,
            "cfg": cfg,
            "model": model,
            "loss_fn": loss_fn,
            "optimizer": optimizer,
            "device": device,
            "lr_scheduler": lr_scheduler,
            "metrics_handler": metrics_handler,
            "logger": logger,
        }
        ckpt_kwargs = {
            "epoch": epoch,
            "cfg": cfg,
            "model": model.state_dict(),
            "metrics_handler": metrics_handler,
            "optimizer": optimizer.state_dict(),
            "lr_scheduler": lr_scheduler.state_dict(),
        }

        for dataset in cfg.data.datasets:
            if cfg.data[dataset].TRAIN:
                utils.notify("Train on %s" % dataset)
                train_one_epoch(data_loader=train_loaders[dataset],
                                **train_kwargs)

        utils.save_ckpt(path2file=cfg.model.path2ckpt, **ckpt_kwargs)

        if epoch in cfg.gnrl.ckphs:
            utils.save_ckpt(path2file=os.path.join(
                cfg.model.ckpts,
                cfg.gnrl.id + "_" + str(epoch).zfill(5) + ".pth"),
                            **ckpt_kwargs)
            for dataset in cfg.data.datasets:
                if cfg.data[dataset].TEST:
                    utils.notify("Evaluating test set of %s" % dataset,
                                 logger=logger)
                    evaluate(data_loader=test_loaders[dataset],
                             phase="test",
                             **eval_kwargs)

        for dataset in cfg.data.datasets:
            if cfg.data[dataset].VALID:
                utils.notify("Evaluating valid set of %s" % dataset,
                             logger=logger)
                evaluate(data_loader=valid_loaders[dataset],
                         phase="valid",
                         **eval_kwargs)
    # End of train-valid for loop.

    eval_kwargs = {
        "epoch": epoch,
        "cfg": cfg,
        "model": model,
        "loss_fn": loss_fn,
        "device": device,
        "metrics_handler": metrics_handler,
        "logger": logger,
        "save": cfg.save.save,
    }

    for dataset in cfg.data.datasets:
        if cfg.data[dataset].VALID:
            utils.notify("Evaluating valid set of %s" % dataset, logger=logger)
            evaluate(data_loader=valid_loaders[dataset],
                     phase="valid",
                     **eval_kwargs)
    for dataset in cfg.data.datasets:
        if cfg.data[dataset].TEST:
            utils.notify("Evaluating test set of %s" % dataset, logger=logger)
            evaluate(data_loader=test_loaders[dataset],
                     phase="test",
                     **eval_kwargs)

    for dataset in cfg.data.datasets:
        if "train" in cfg.data[dataset].INFER:
            utils.notify("Inference on train set of %s" % dataset)
            inference(data_loader=train_loaders[dataset],
                      phase="infer_train",
                      **eval_kwargs)
        if "valid" in cfg.data[dataset].INFER:
            utils.notify("Inference on valid set of %s" % dataset)
            inference(data_loader=valid_loaders[dataset],
                      phase="infer_valid",
                      **eval_kwargs)
        if "test" in cfg.data[dataset].INFER:
            utils.notify("Inference on test set of %s" % dataset)
            inference(data_loader=test_loaders[dataset],
                      phase="infer_test",
                      **eval_kwargs)

    return None
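
Example #2 builds its checkpoints as a plain dict holding the finished epoch number next to the model, optimizer, and lr_scheduler state_dicts, and on resume it skips every epoch that is not greater than `resume_epoch`. A minimal sketch of that convention in plain PyTorch, with illustrative names (`net`, `opt`, "ckpt.pth") rather than the helpers used above:

import torch
import torch.nn as nn

net = nn.Linear(8, 1)
opt = torch.optim.SGD(net.parameters(), lr=0.1)
sched = torch.optim.lr_scheduler.StepLR(opt, step_size=10)

max_epoch, resume_epoch = 3, 0
for epoch in range(1, max_epoch + 1):
    if resume_epoch >= epoch:
        continue  # this epoch was already finished before the restart
    # ... train_one_epoch(...) would run here ...
    torch.save({
        "epoch": epoch,
        "model": net.state_dict(),
        "optimizer": opt.state_dict(),
        "lr_scheduler": sched.state_dict(),
    }, "ckpt.pth")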
Example #3
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu  # local rank, local machine cuda id
    args.local_rank = args.gpu
    args.batch_size = args.batch_size_per_gpu
    args.batch_size_total = args.batch_size * args.world_size
    # Rescale base lr following the linear scaling rule (256-sample baseline).
    args.lr_scheduler.base_lr *= max(1, args.batch_size_total // 256)

    # Set the random seed so the randomly sampled sub-networks are identical across processes.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.gpu is not None:
        torch.cuda.manual_seed(args.seed)

    global_rank = args.gpu + args.machine_rank * ngpus_per_node
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=global_rank)

    # Setup logging format.
    logging.setup_logging(args.logging_save_path, 'w')

    logger.info(
        f"Use GPU: {args.gpu}, machine rank {args.machine_rank}, "
        f"num_nodes {args.num_nodes}, gpu per node {ngpus_per_node}, "
        f"world size {args.world_size}")

    # synchronize is needed here to prevent a possible timeout after calling
    # init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    args.rank = comm.get_rank()  # global rank
    args.local_rank = args.gpu
    torch.cuda.set_device(args.gpu)

    # build model
    logger.info("=> creating model '{}'".format(args.arch))
    model = models.model_factory.create_model(args)
    model.cuda(args.gpu)

    # use sync batchnorm
    if getattr(args, 'sync_bn', False):
        model.apply(lambda m: setattr(m, 'need_sync', True))

    model = comm.get_parallel_model(model, args.gpu)  #local rank

    logger.info(model)

    criterion = loss_ops.CrossEntropyLossSmooth(args.label_smoothing).cuda(
        args.gpu)
    soft_criterion = loss_ops.AdaptiveLossSoft(args.alpha_min, args.alpha_max,
                                               args.iw_clip).cuda(args.gpu)

    if not getattr(args, 'inplace_distill', True):
        soft_criterion = None

    ## load dataset, train_sampler: distributed
    train_loader, val_loader, train_sampler = build_data_loader(args)
    args.n_iters_per_epoch = len(train_loader)

    logger.info(f'building optimizer and lr scheduler, '
                f'local rank {args.gpu}, global rank {args.rank}, '
                f'world_size {args.world_size}')
    optimizer = build_optimizer(args, model)
    lr_scheduler = build_lr_scheduler(args, optimizer)

    # optionally resume from a checkpoint
    if args.resume:
        saver.load_checkpoints(args, model, optimizer, lr_scheduler, logger)

    logger.info(args)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        args.curr_epoch = epoch
        logger.info('Training lr {}'.format(lr_scheduler.get_lr()[0]))

        # train for one epoch
        acc1, acc5 = train_epoch(epoch, model, train_loader, optimizer,
                                 criterion, args,
                                 soft_criterion=soft_criterion,
                                 lr_scheduler=lr_scheduler)

        if comm.is_master_process() or args.distributed:
            # validate supernet model
            validate(train_loader, val_loader, model, criterion, args)

        if comm.is_master_process():
            # save checkpoints
            saver.save_checkpoint(
                args.checkpoint_save_path,
                model,
                optimizer,
                lr_scheduler,
                args,
                epoch,
            )
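
The base-lr rescaling at the top of `main_worker` follows the linear scaling heuristic: the learning rate is multiplied by how many 256-sample reference batches fit into the total batch size, and the factor is clamped to at least 1. A small worked example with made-up numbers:

def rescale_base_lr(base_lr, batch_size_per_gpu, world_size):
    # Same arithmetic as in main_worker, packaged as a standalone function.
    batch_size_total = batch_size_per_gpu * world_size
    return base_lr * max(1, batch_size_total // 256)

print(rescale_base_lr(0.1, 128, 8))  # 1024 total -> 0.1 * 4 = 0.4
print(rescale_base_lr(0.1, 32, 4))   # 128 total  -> factor clamped to 1 -> 0.1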
Example #4
run_args = parser.parse_args()

if __name__ == '__main__':
    args = setup(run_args.config_file)
    args.model = run_args.model
    args.gpu = run_args.gpu

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    args.__dict__['active_subnet'] = args.__dict__['pareto_models'][args.model]
    print(args.active_subnet)

    train_loader, val_loader, train_sampler = build_data_loader(args)

    ## init static attentivenas model with weights inherited from the supernet
    model = models.model_factory.create_model(args)

    model.to(args.gpu)
    model.eval()

    # bn running stats calibration following Slimmable (https://arxiv.org/abs/1903.05134)
    # please consider trying a different random seed if you see a small accuracy drop
    with torch.no_grad():
        model.reset_running_stats_for_calibration()
        for batch_idx, (images, _) in enumerate(train_loader):
            if batch_idx >= args.post_bn_calibration_batch_num:
                break
            images = images.cuda(args.gpu, non_blocking=True)
            model(images)  # forward pass updates the BN running statistics
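
The calibration loop above relies on `reset_running_stats_for_calibration()`, a method of the example's model that clears the BatchNorm buffers so they can be re-estimated from a few forward passes. A self-contained approximation using only standard `torch.nn` calls (plain `reset_running_stats()` stands in for the model-specific helper):

import torch
import torch.nn as nn

net = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.BatchNorm2d(8), nn.ReLU())

# Reset the BN running statistics, then re-estimate them from a handful of
# batches; no_grad keeps weights untouched while the BN buffers update.
for m in net.modules():
    if isinstance(m, nn.BatchNorm2d):
        m.reset_running_stats()

net.train()  # BN updates its running stats only in train mode
with torch.no_grad():
    for _ in range(10):
        images = torch.randn(4, 3, 32, 32)
        net(images)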
Example #5
def main():
    # Set logger to record information.
    logger = Logger(cfg)
    logger.log_info(cfg)
    metrics_logger = Metrics()
    utils.pack_code(cfg, logger=logger)

    # Build model.
    model = model_builder.build_model(cfg=cfg, logger=logger)

    # Read checkpoint.
    ckpt = torch.load(cfg.MODEL.PATH2CKPT) if cfg.GENERAL.RESUME else {}

    if cfg.GENERAL.RESUME:
        model.load_state_dict(ckpt["model"])
    resume_epoch = ckpt["epoch"] if cfg.GENERAL.RESUME else 0
    if cfg.GENERAL.RESUME:
        optimizer = ckpt["optimizer"]
    else:
        optimizer = optimizer_helper.build_optimizer(cfg=cfg, model=model)
    # lr_scheduler = ckpt["lr_scheduler"] if cfg.GENERAL.RESUME else lr_scheduler_helper.build_scheduler(cfg=cfg, optimizer=optimizer)
    lr_scheduler = lr_scheduler_helper.build_scheduler(cfg=cfg,
                                                       optimizer=optimizer)
    lr_scheduler.sychronize(resume_epoch)
    if cfg.GENERAL.RESUME:
        loss_fn = ckpt["loss_fn"]
    else:
        loss_fn = loss_fn_helper.build_loss_fn(cfg=cfg)

    # Set device.
    model, device = utils.set_device(model, cfg.GENERAL.GPU)

    # Prepare dataset.
    if cfg.GENERAL.TRAIN:
        try:
            train_data_loader = data_loader.build_data_loader(
                cfg, cfg.DATA.DATASET, "train")
        except Exception:
            logger.log_info("Cannot build train dataset.")
    if cfg.GENERAL.VALID:
        try:
            valid_data_loader = data_loader.build_data_loader(
                cfg, cfg.DATA.DATASET, "valid")
        except Exception:
            logger.log_info("Cannot build valid dataset.")
    if cfg.GENERAL.TEST:
        try:
            test_data_loader = data_loader.build_data_loader(
                cfg, cfg.DATA.DATASET, "test")
        except Exception:
            logger.log_info("Cannot build test dataset.")

    # Train, evaluate model and save checkpoint.
    for epoch in range(cfg.TRAIN.MAX_EPOCH):
        if resume_epoch >= epoch:
            continue

        try:
            train_one_epoch(
                epoch=epoch,
                cfg=cfg,
                model=model,
                data_loader=train_data_loader,
                device=device,
                loss_fn=loss_fn,
                optimizer=optimizer,
                lr_scheduler=lr_scheduler,
                metrics_logger=metrics_logger,
                logger=logger,
            )
        except Exception:
            logger.log_info("Failed to train model.")

        optimizer.zero_grad()
        with torch.no_grad():
            utils.save_ckpt(
                path2file=os.path.join(
                    cfg.MODEL.CKPT_DIR,
                    cfg.GENERAL.ID + "_" + str(epoch).zfill(3) + ".pth"),
                logger=logger,
                model=model.state_dict(),
                epoch=epoch,
                optimizer=optimizer,
                lr_scheduler=lr_scheduler,  # NOTE Need attribdict>=0.0.5
                loss_fn=loss_fn,
                metrics=metrics_logger,
            )
        try:
            evaluate(
                epoch=epoch,
                cfg=cfg,
                model=model,
                data_loader=valid_data_loader,
                device=device,
                loss_fn=loss_fn,
                metrics_logger=metrics_logger,
                phase="valid",
                logger=logger,
                save=cfg.SAVE.SAVE,
            )
        except Exception:
            logger.log_info("Failed to evaluate model.")

        with torch.no_grad():
            utils.save_ckpt(
                path2file=os.path.join(
                    cfg.MODEL.CKPT_DIR,
                    cfg.GENERAL.ID + "_" + str(epoch).zfill(3) + ".pth"),
                logger=logger,
                model=model.state_dict(),
                epoch=epoch,
                optimizer=optimizer,
                lr_scheduler=lr_scheduler,  # NOTE Need attribdict>=0.0.5
                loss_fn=loss_fn,
                metrics=metrics_logger,
            )

    # If test set has target images, evaluate and save them, otherwise just try to generate output images.
    if cfg.DATA.DATASET == "DualPixelNTIRE2021":
        try:
            generate(
                cfg=cfg,
                model=model,
                data_loader=valid_data_loader,
                device=device,
                phase="valid",
                logger=logger,
            )
        except Exception:
            logger.log_info(
                "Failed to generate output images of valid set of NTIRE2021.")
    try:
        evaluate(
            epoch=epoch,
            cfg=cfg,
            model=model,
            data_loader=test_data_loader,
            device=device,
            loss_fn=loss_fn,
            metrics_logger=metrics_logger,
            phase="test",
            logger=logger,
            save=True,
        )
    except Exception:
        logger.log_info("Failed to test model, trying to generate images instead.")
        try:
            generate(
                cfg=cfg,
                model=model,
                data_loader=test_data_loader,
                device=device,
                phase="test",
                logger=logger,
            )
        except Exception:
            logger.log_info("Cannot generate output images of test set.")
    return None
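
Note that `save_ckpt` here receives the optimizer, lr_scheduler, and loss_fn objects themselves, whereas Example #2 stores their `state_dict()`s. The state_dict route is the more portable convention, since the objects are rebuilt from the config on resume; a short sketch with illustrative names:

import torch
import torch.nn as nn

net = nn.Linear(4, 2)
opt = torch.optim.Adam(net.parameters(), lr=1e-3)
sched = torch.optim.lr_scheduler.StepLR(opt, step_size=5)

# Save only state_dicts instead of pickling whole optimizer/scheduler objects.
torch.save({
    "model": net.state_dict(),
    "optimizer": opt.state_dict(),
    "lr_scheduler": sched.state_dict(),
}, "ckpt_000.pth")

# On resume: rebuild the objects, then restore their state.
ckpt = torch.load("ckpt_000.pth", map_location="cpu")
net.load_state_dict(ckpt["model"])
opt.load_state_dict(ckpt["optimizer"])
sched.load_state_dict(ckpt["lr_scheduler"])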
Example #6
def eval_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu  # local rank, local machine cuda id
    args.local_rank = args.gpu
    args.batch_size = args.batch_size_per_gpu

    global_rank = args.gpu + args.machine_rank * ngpus_per_node
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=global_rank)

    # Setup logging format.
    logging.setup_logging("stdout.log", 'w')

    # synchronize is needed here to prevent a possible timeout after calling
    # init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    args.rank = comm.get_rank()  # global rank
    torch.cuda.set_device(args.gpu)

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    # build the supernet
    logger.info("=> creating model '{}'".format(args.arch))
    model = models.model_factory.create_model(args)
    model.cuda(args.gpu)
    model = comm.get_parallel_model(model, args.gpu)  #local rank

    # define loss function (criterion)
    criterion = nn.CrossEntropyLoss().cuda()

    ## load dataset, train_sampler: distributed
    train_loader, val_loader, train_sampler = build_data_loader(args)

    assert args.resume
    # Reload model weights from the pretrained supernet checkpoint.
    model.module.load_weights_from_pretrained_models(args.resume)

    if train_sampler:
        train_sampler.set_epoch(0)

    targeted_min_flops = args.evo_search.targeted_min_flops
    targeted_max_flops = args.evo_search.targeted_max_flops

    # run evolutionary search
    parent_popu = []
    for idx in range(args.evo_search.parent_popu_size):
        if idx == 0:
            cfg = model.module.sample_min_subnet()
        else:
            cfg = model.module.sample_active_subnet_within_range(
                targeted_min_flops, targeted_max_flops)
        cfg['net_id'] = f'net_{idx % args.world_size}_evo_0_{idx}'
        parent_popu.append(cfg)

    pareto_global = {}
    for evo in range(args.evo_search.evo_iter):
        # partition the set of candidate sub-networks
        # and send them to each GPU for parallel evaluation

        # sub-networks to be evaluated on GPU {args.rank}
        my_subnets_to_be_evaluated = {}
        n_evaluated = len(parent_popu) // args.world_size * args.world_size
        for cfg in parent_popu[:n_evaluated]:
            if cfg['net_id'].startswith(f'net_{args.rank}_'):
                my_subnets_to_be_evaluated[cfg['net_id']] = cfg

        # aggregating all evaluation results
        eval_results = attentive_nas_eval.validate(
            my_subnets_to_be_evaluated,
            train_loader,
            val_loader,
            model,
            criterion,
            args,
            logger,
        )

        # update the Pareto frontier
        # in this case, we search the best FLOPs vs. accuracy trade-offs
        for cfg in eval_results:
            f = round(
                cfg['flops'] / args.evo_search.step) * args.evo_search.step
            if f not in pareto_global or pareto_global[f]['acc1'] < cfg['acc1']:
                pareto_global[f] = cfg

        # next batch of sub-networks to be evaluated
        parent_popu = []
        # mutate
        for idx in range(args.evo_search.mutate_size):
            while True:
                old_cfg = random.choice(list(pareto_global.values()))
                cfg = model.module.mutate_and_reset(
                    old_cfg, prob=args.evo_search.mutate_prob)
                flops = model.module.compute_active_subnet_flops()
                if targeted_min_flops <= flops <= targeted_max_flops:
                    break
            cfg['net_id'] = f'net_{idx % args.world_size}_evo_{evo}_mutate_{idx}'
            parent_popu.append(cfg)

        # cross over
        for idx in range(args.evo_search.crossover_size):
            while True:
                cfg1 = random.choice(list(pareto_global.values()))
                cfg2 = random.choice(list(pareto_global.values()))
                cfg = model.module.crossover_and_reset(cfg1, cfg2)
                flops = model.module.compute_active_subnet_flops()
                if targeted_min_flops <= flops <= targeted_max_flops:
                    break
            cfg['net_id'] = f'net_{idx % args.world_size}_evo_{evo}_crossover_{idx}'
            parent_popu.append(cfg)
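
The Pareto-frontier update in the loop above buckets every evaluated sub-network by its FLOPs rounded to the nearest `evo_search.step` and keeps the candidate with the best top-1 accuracy in each bucket. A standalone sketch of that bookkeeping with made-up numbers:

def update_pareto(pareto_global, eval_results, step):
    # Same bucketing logic as in eval_worker: round FLOPs to the nearest step,
    # keep the most accurate candidate per bucket.
    for cfg in eval_results:
        f = round(cfg['flops'] / step) * step
        if f not in pareto_global or pareto_global[f]['acc1'] < cfg['acc1']:
            pareto_global[f] = cfg
    return pareto_global

pareto = {}
results = [
    {'net_id': 'a', 'flops': 412, 'acc1': 76.1},
    {'net_id': 'b', 'flops': 397, 'acc1': 76.5},  # same 400-bucket, higher acc1
    {'net_id': 'c', 'flops': 615, 'acc1': 78.0},
]
update_pareto(pareto, results, step=100)
print(sorted(pareto))         # [400, 600]
print(pareto[400]['net_id'])  # 'b'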