예제 #1
0
파일: train.py 프로젝트: yrpang/mindspore
def train():
    """Run the adversarial training loop.

    Configuration comes from ``get_args("train")``. The function builds the
    dataset, two generators and two discriminators, hands the four networks
    to ``load_ckpt``, and then performs one generator step followed by one
    discriminator step for every batch. Generated images are passed through
    an ``ImagePool`` before they reach the discriminator step.

    When ``args.need_profiler`` is set, a profiler is created up front and
    training stops after the first epoch, once ``profiler.analyse()`` ran.
    """
    args = get_args("train")
    if args.need_profiler:
        # Local import: the profiler module is only needed on this path.
        from mindspore.profiler.profiling import Profiler
        profiler = Profiler(output_path=args.outputs_dir,
                            is_detail=True,
                            is_show_op_path=True)

    dataset = create_dataset(args)
    gen_a = get_generator(args)
    gen_b = get_generator(args)
    dis_a = get_discriminator(args)
    dis_b = get_discriminator(args)
    load_ckpt(args, gen_a, gen_b, dis_a, dis_b)
    pool_a = ImagePool(args.pool_size)
    pool_b = ImagePool(args.pool_size)
    generator = Generator(gen_a, gen_b, args.lambda_idt > 0)

    dis_loss = DiscriminatorLoss(args, dis_a, dis_b)
    gen_loss = GeneratorLoss(args, generator, dis_a, dis_b)
    gen_optimizer = nn.Adam(generator.trainable_params(),
                            get_lr(args),
                            beta1=args.beta1)
    dis_optimizer = nn.Adam(dis_loss.trainable_params(),
                            get_lr(args),
                            beta1=args.beta1)

    step_g = TrainOneStepG(gen_loss, generator, gen_optimizer)
    step_d = TrainOneStepD(dis_loss, dis_optimizer)

    # The same iterator object is reused for every epoch.
    batches = dataset.create_dict_iterator()
    reporter = Reporter(args)
    reporter.info('==========start training===============')
    for _ in range(args.max_epoch):
        reporter.epoch_start()
        for batch in batches:
            real_a = batch["image_A"]
            real_b = batch["image_B"]
            gen_out = step_g(real_a, real_b)
            # The first two generator outputs are the generated images.
            fake_a, fake_b = gen_out[0], gen_out[1]
            dis_out = step_d(real_a, real_b,
                             pool_a.query(fake_a),
                             pool_b.query(fake_b))
            reporter.step_end(gen_out, dis_out)
            reporter.visualizer(real_a, real_b, fake_a, fake_b)
        reporter.epoch_end(step_g)
        if args.need_profiler:
            # Profiling runs cover a single epoch only.
            profiler.analyse()
            break

    reporter.info('==========end training===============')
예제 #2
0
def predict():
    """Translate the ``testA`` and ``testB`` image sets and save the results.

    Configuration comes from ``get_args("predict")``. Images read from
    ``testA`` are fed through ``G_A`` and written to
    ``<outputs_dir>/predict/fake_B``; images read from ``testB`` are fed
    through ``G_B`` and written to ``<outputs_dir>/predict/fake_A``. Each
    output keeps the file name reported by the dataset.
    """
    args = get_args("predict")
    G_A = get_generator(args)
    G_B = get_generator(args)
    # Use BatchNorm2d with batchsize=1, affine=False, training=True instead
    # of InstanceNorm2d, i.e. normalise with the real mean and variance
    # rather than the moving mean and moving variance of BatchNorm2d.
    G_A.set_train(True)
    G_B.set_train(True)
    load_ckpt(args, G_A, G_B)

    imgs_out = os.path.join(args.outputs_dir, "predict")
    fake_A_dir = os.path.join(imgs_out, "fake_A")
    fake_B_dir = os.path.join(imgs_out, "fake_B")
    # makedirs also creates the intermediate "predict" directory; exist_ok
    # replaces the racy "check, then create" sequence.
    os.makedirs(fake_A_dir, exist_ok=True)
    os.makedirs(fake_B_dir, exist_ok=True)

    # ---- A -> B ----------------------------------------------------------
    args.data_dir = 'testA'
    ds = create_dataset(args)
    reporter = Reporter(args)
    reporter.start_predict("A to B")
    for data in ds.create_dict_iterator(output_numpy=True):
        img_A = Tensor(data["image"])
        path_A = str(data["image_name"][0], encoding="utf-8")
        fake_B = G_A(img_A)
        save_image(fake_B, os.path.join(fake_B_dir, path_A))
    # Report the output directory instead of the loop variable: the loop
    # variable is unbound when the dataset yields nothing.
    reporter.info('save fake_B at %s', fake_B_dir)
    reporter.end_predict()

    # ---- B -> A ----------------------------------------------------------
    args.data_dir = 'testB'
    ds = create_dataset(args)
    # The reporter was built for the first dataset; refresh its size from
    # args after the second dataset was created.
    reporter.dataset_size = args.dataset_size
    reporter.start_predict("B to A")
    for data in ds.create_dict_iterator(output_numpy=True):
        img_B = Tensor(data["image"])
        path_B = str(data["image_name"][0], encoding="utf-8")
        fake_A = G_B(img_B)
        save_image(fake_A, os.path.join(fake_A_dir, path_B))
    reporter.info('save fake_A at %s', fake_A_dir)
    reporter.end_predict()
예제 #3
0
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                            model.parameters()),
                                     lr=lr,
                                     weight_decay=config.weight_decay)
    elif config.optim == "SGD":
        optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad,
                                           model.parameters()),
                                    lr=lr,
                                    momentum=config.momentum,
                                    weight_decay=config.weight_decay)

    # Iteration the trainer starts from; stays 0 unless a checkpoint is resumed.
    start_iter = 0
    if config.resume:
        print("Loading the trained params and the state of optimizer...")
        # The value returned by load_ckpt is used as the starting iteration.
        start_iter = load_ckpt(config.resume,
                               [("model", model)],
                               [("optimizer", optimizer)])
        # Overwrite the learning rate of every param group with the
        # currently configured `lr` after the optimizer was handed to
        # load_ckpt.
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr
        print("Starting from iter ", start_iter)

    trainer = Trainer(start_iter, config, device, model, dataset_train,
                      dataset_val, criterion, optimizer, experiment=experiment)
    # With comet enabled, the training loop runs inside the experiment's
    # train() context; otherwise it runs directly.
    if config.comet:
        with experiment.train():
            trainer.iterate()
    else:
        trainer.iterate()

# Set the configuration for testing
elif config.mode == "test":
예제 #4
0
def main(config):
    """Evaluate the stored checkpoint of one experiment case on the test split.

    Loads ``<RESULT_ROOT_DIR>/<case_num>/checkpoint.pt`` into a freshly built
    model, evaluates it on the test split and prints the accuracy together
    with the average wall-clock time spent per test sample.

    Args:
        config: Mapping providing the keys ``case_num``, ``dataset``,
            ``normalization``, ``batch_size``, ``model_name``, ``alpha``,
            ``beta``, ``phi``, ``loss_fn`` and ``kernel_size``.
    """
    # str() so that the case number works for both the path concatenation
    # and the report below, whether it is configured as an int or a string.
    case_name = str(config['case_num'])

    DATASET = config['dataset']
    NORMALIZATION = config['normalization']
    BATCH_SIZE = config['batch_size']

    MODEL_NAME = config['model_name']
    ALPHA = config['alpha']
    BETA = config['beta']
    PHI = config['phi']
    LOSS_FN = config['loss_fn']
    KERNEL_SIZE = config['kernel_size']

    result_dir = RESULT_ROOT_DIR + '/' + case_name
    ckpt_path = result_dir + '/' + 'checkpoint.pt'

    # ---- data -----------------------------------------------------------
    data_fname, data_dim = select_data(DATASET)
    data_path = '../data/' + data_fname

    # Pass the configured normalization explicitly so the test data is
    # prepared the same way the training script prepares its splits.
    data_test = NLUDataset(data_path,
                           mode='test',
                           normalization=NORMALIZATION,
                           random_seed=42)
    dataloader_test = DataLoader(data_test,
                                 batch_size=BATCH_SIZE,
                                 shuffle=True,
                                 num_workers=4)

    classes = data_test.labels
    num_classes = len(classes)

    # ---- model ----------------------------------------------------------
    device = ('cuda' if torch.cuda.is_available() else 'cpu')
    if device == 'cuda':
        print('Using GPU, %s' % torch.cuda.get_device_name(0))

    net = select_model(MODEL_NAME, data_dim, KERNEL_SIZE, num_classes, ALPHA,
                       BETA, PHI)
    net.to(device)
    loss_fn = select_loss(LOSS_FN)

    # ---- evaluation -----------------------------------------------------
    # load_ckpt also returns the best validation accuracy; it is not needed
    # here.
    net, _ = load_ckpt(ckpt_path, net)

    start_time = time.time()
    test_acc, _ = evaluate(dataloader_test, device, net, loss_fn)
    elapsed = time.time() - start_time
    # max(..., 1) keeps the report from dividing by zero on an empty split.
    time_per_input = elapsed / max(len(data_test), 1)

    print('########################################################')
    print('# Test accuracy of %s: %.4f' % (case_name, test_acc))
    print("# Average %.6f s to process one input" % (time_per_input))
    print('########################################################')
예제 #5
0
def main():
    """Run architecture-search training, optionally on several GPUs.

    Steps, in order: parse the command line, prepare the experiment
    directory and logger, load the YAML configuration (applying command-line
    overrides), build the network, wrap it in ``DataParallel`` or
    ``DistributedDataParallel`` when more than one GPU is requested, create
    the data loaders, the optimizer and the LR scheduler, optionally resume
    from a checkpoint, and finally train / evaluate / save once per epoch.

    Raises:
        ValueError: If ``config.train.scheduler_name`` is neither
            ``"MultiStepLR"`` nor ``"CosineAnnealingLR"``.
    """
    arg = args()

    # Validate the experiment path before creating anything on disk.
    assert arg.exp_name.split(
        '/')[0] == 'o', "'o' is the directory of experiment, --exp_name o/..."
    # exist_ok avoids a check-then-create race when several processes start
    # at the same time.
    os.makedirs(arg.exp_name, exist_ok=True)
    output_dir = arg.exp_name

    if arg.local_rank == 0:
        save_scripts_in_exp_dir(output_dir)

    logger = logging_set(output_dir, arg.local_rank)
    logger.info(arg)
    logger.info(
        '\n================ experient name:[{}] ===================\n'.format(
            arg.exp_name))
    os.environ["CUDA_VISIBLE_DEVICES"] = arg.gpu

    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    np.random.seed(0)
    torch.manual_seed(0)

    # safe_load works without an explicit Loader argument and only builds
    # plain Python data; the context manager closes the file.
    with open(arg.cfg, 'r') as cfg_file:
        config = edict(yaml.safe_load(cfg_file))

    if arg.search:
        assert arg.search in [
            'None', 'sync', 'random', 'second_order_gradient',
            'first_order_gradient'
        ]
        config.train.arch_search_strategy = arg.search

    if arg.batchsize:
        logger.info("update batchsize to {}".format(arg.batchsize))
        config.train.batchsize = arg.batchsize

    config.num_workers = arg.num_workers

    with os.popen(
            'nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader'
    ) as smi:
        gpu_memory = smi.read()
    print('GPU memory : \ntotal | used\n', gpu_memory)

    logger.info(
        '------------------------------ configuration ---------------------------'
    )
    logger.info(
        '\n==> available {} GPUs , use numbers are {} device is {}\n'.format(
            torch.cuda.device_count(), os.environ["CUDA_VISIBLE_DEVICES"],
            torch.cuda.current_device()))
    logger.info(pprint.pformat(config))
    logger.info(
        '------------------------------- -------- ----------------------------'
    )

    # Best evaluation result so far; replaced when a checkpoint is resumed.
    best = 0

    criterion = MSELoss()

    Arch = bulid_up_network(config, criterion)

    if config.train.arch_search_strategy == 'random':

        logger.info("==>random seed is {}".format(config.train.random_seed))
        np.random.seed(config.train.random_seed)
        torch.manual_seed(config.train.random_seed)

        Arch.arch_parameters_random_search()

    if arg.param_flop:
        Arch._print_info()

    # arg.gpu is the CUDA_VISIBLE_DEVICES string, so anything longer than a
    # single character is treated as a multi-GPU request.
    use_multi_gpu = len(arg.gpu) > 1
    if use_multi_gpu:
        if arg.distributed:
            torch.distributed.init_process_group(backend="nccl")
            local_rank = torch.distributed.get_rank()
            torch.cuda.set_device(local_rank)
            device = torch.device("cuda", local_rank)
            Arch.to(device)

            Arch = torch.nn.parallel.DistributedDataParallel(
                Arch,
                device_ids=[local_rank],
                output_device=local_rank,
                find_unused_parameters=True)
            logger.info("local rank = {}".format(local_rank))
        else:
            Arch = torch.nn.DataParallel(Arch).cuda()
    else:
        Arch = Arch.cuda()

    # Both parallel wrappers expose the wrapped network as ``.module``; the
    # search helper and the checkpoint functions work on the bare network.
    bare_arch = Arch.module if use_multi_gpu else Arch
    Search = Search_Arch(bare_arch, config)

    search_strategy = config.train.arch_search_strategy

    if not arg.distributed:
        train_queue, arch_queue, valid_queue = Dataloaders(
            search_strategy, config, arg)
    else:
        # In distributed mode Dataloaders additionally returns the training
        # sampler, whose epoch is set at the start of every epoch below.
        train_queue, \
        arch_queue, \
        valid_queue, \
        train_sampler_dist, = Dataloaders(search_strategy, config, arg)
    # Note: if the search strategy is `None` or `SYNC`, the arch_queue is None!

    logger.info(
        "\nNeural Architecture Search strategy is {}".format(search_strategy))
    assert search_strategy in [
        'first_order_gradient', 'random', 'None', 'second_order_gradient',
        'sync'
    ]

    if search_strategy == 'sync':
        # arch_parameters is also registered to model's parameters
        # so the weight-optimizer will also update the arch_parameters
        logger.info(
            "sync: The arch_parameters is also optimized by weight-optmizer synchronously"
        )
        optimizer = torch.optim.Adam(
            Arch.parameters(),
            lr=config.train.w_lr_cosine_begin,
        )
    else:
        # if search strategy is None, random, second_order_gradient and so on
        # the arch_parameters will be filtered by the weight-optimizer
        optimizer = torch.optim.Adam(
            filter_arch_parameters(Arch),
            lr=config.train.w_lr_cosine_begin,
        )

    if config.train.scheduler_name == "MultiStepLR":
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.train.LR_STEP, config.train.LR_FACTOR)
    elif config.train.scheduler_name == "CosineAnnealingLR":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=config.train.epoch_end,
            eta_min=config.train.w_lr_cosine_end)
    else:
        # Fail here with a clear message instead of a NameError later on.
        raise ValueError("unsupported config.train.scheduler_name: {!r}".format(
            config.train.scheduler_name))

    logger.info(
        "\n=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+= training +=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=="
    )
    begin, end = config.train.epoch_begin, config.train.epoch_end

    if arg.load_ckpt:
        begin, best = load_ckpt(bare_arch, optimizer, scheduler, output_dir,
                                logger)

    for epoch in range(begin, end):

        # Read the learning rate from the optimizer: it always holds the
        # value currently in effect, whereas scheduler.get_lr() is only
        # meant to be called from inside scheduler.step().
        lr = optimizer.param_groups[0]['lr']
        logger.info(
            '==>time:({})--training...... current learning rate is {:.7f}'.
            format(datetime.datetime.now(), lr))

        if arg.distributed:
            train_sampler_dist.set_epoch(epoch)

        train(
            epoch,
            train_queue,
            arch_queue,
            Arch,
            Search,
            criterion,
            optimizer,
            lr,
            search_strategy,
            output_dir,
            logger,
            config,
            arg,
        )
        scheduler.step()

        # In distributed mode only the process with local rank 0 evaluates
        # and saves.
        if not arg.distributed or arg.local_rank == 0:

            eval_results = evaluate(Arch, valid_queue, config, output_dir)

            best = save_model(epoch, best, eval_results, bare_arch, optimizer,
                              scheduler, output_dir, logger)
예제 #6
0
파일: train.py 프로젝트: zeta1999/ESANet
def train_main():
    """Train a segmentation model and validate it after every epoch.

    Creates a time-stamped checkpoint directory, stores the parsed arguments
    and the command line there, prepares the data loaders, builds the model,
    the loss functions, the optimizer and a one-cycle LR schedule (stepped
    once per epoch), optionally resumes from ``args.last_ckpt``, and then
    runs the train/validate loop. Per-epoch metrics go to ``logs.csv``; the
    best mIoU values are written to ``finished.txt`` at the end.
    """
    args = parse_args()

    # directory for storing weights and other training related files
    training_starttime = datetime.now().strftime("%d_%m_%Y-%H_%M_%S-%f")
    ckpt_dir = os.path.join(args.results_dir, args.dataset,
                            f'checkpoints_{training_starttime}')
    os.makedirs(ckpt_dir, exist_ok=True)
    os.makedirs(os.path.join(ckpt_dir, 'confusion_matrices'), exist_ok=True)

    with open(os.path.join(ckpt_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)

    with open(os.path.join(ckpt_dir, 'argsv.txt'), 'w') as f:
        f.write(' '.join(sys.argv))
        f.write('\n')

    # when using multi scale supervision the label needs to be downsampled.
    label_downsampling_rates = [8, 16, 32]

    # data preparation ---------------------------------------------------------
    data_loaders = prepare_data(args, ckpt_dir)

    if args.valid_full_res:
        train_loader, valid_loader, valid_loader_full_res = data_loaders
    else:
        train_loader, valid_loader = data_loaders
        valid_loader_full_res = None

    cameras = train_loader.dataset.cameras
    n_classes_without_void = train_loader.dataset.n_classes_without_void
    if args.class_weighting != 'None':
        class_weighting = train_loader.dataset.compute_class_weights(
            weight_mode=args.class_weighting,
            c=args.c_for_logarithmic_weighting)
    else:
        class_weighting = np.ones(n_classes_without_void)

    # model building -----------------------------------------------------------
    model, device = build_model(args, n_classes=n_classes_without_void)

    if args.freeze > 0:
        print('Freeze everything but the output layer(s).')
        for name, param in model.named_parameters():
            if 'out' not in name:
                param.requires_grad = False

    # loss, optimizer, learning rate scheduler, csvlogger  ----------

    # loss functions (only loss_function_train is really needed.
    # The other loss functions are just there to compare valid loss to
    # train loss)
    loss_function_train = \
        utils.CrossEntropyLoss2d(weight=class_weighting, device=device)

    pixel_sum_valid_data = valid_loader.dataset.compute_class_weights(
        weight_mode='linear')
    pixel_sum_valid_data_weighted = \
        np.sum(pixel_sum_valid_data * class_weighting)
    loss_function_valid = utils.CrossEntropyLoss2dForValidData(
        weight=class_weighting,
        weighted_pixel_sum=pixel_sum_valid_data_weighted,
        device=device)
    loss_function_valid_unweighted = \
        utils.CrossEntropyLoss2dForValidDataUnweighted(device=device)

    optimizer = get_optimizer(args, model)

    # in this script lr_scheduler.step() is only called once per epoch
    lr_scheduler = OneCycleLR(optimizer,
                              max_lr=[i['lr'] for i in optimizer.param_groups],
                              total_steps=args.epochs,
                              div_factor=25,
                              pct_start=0.1,
                              anneal_strategy='cos',
                              final_div_factor=1e4)

    # load checkpoint if parameter last_ckpt is provided
    if args.last_ckpt:
        # NOTE(review): ckpt_dir was created for this run just above, so a
        # relative last_ckpt points into a new directory; only an absolute
        # path (which os.path.join returns unchanged) can refer to an
        # earlier run - confirm this is intended.
        ckpt_path = os.path.join(ckpt_dir, args.last_ckpt)
        epoch_last_ckpt, best_miou, best_miou_epoch = \
            load_ckpt(model, optimizer, ckpt_path, device)
        start_epoch = epoch_last_ckpt + 1
    else:
        start_epoch = 0
        best_miou = 0
        best_miou_epoch = 0

    valid_split = valid_loader.dataset.split

    # build the log keys for the csv log file and for the web logger
    log_keys = [f'mIoU_{valid_split}']
    if args.valid_full_res:
        log_keys.append(f'mIoU_{valid_split}_full-res')
        best_miou_full_res = 0
        # Initialised together with the score so that the summary written
        # at the end never reads an unbound name when the full-resolution
        # mIoU does not improve in any epoch.
        best_miou_full_res_epoch = 0

    log_keys_for_csv = log_keys.copy()

    # mIoU for each camera
    for camera in cameras:
        log_keys_for_csv.append(f'mIoU_{valid_split}_{camera}')
        if args.valid_full_res:
            log_keys_for_csv.append(f'mIoU_{valid_split}_full-res_{camera}')

    log_keys_for_csv.append('epoch')
    # one learning-rate column per optimizer param group
    for i in range(len(optimizer.param_groups)):
        log_keys_for_csv.append('lr_{}'.format(i))
    log_keys_for_csv.extend(['loss_train_total', 'loss_train_full_size'])
    for rate in label_downsampling_rates:
        log_keys_for_csv.append('loss_train_down_{}'.format(rate))
    log_keys_for_csv.extend([
        'time_training', 'time_validation', 'time_confusion_matrix',
        'time_forward', 'time_post_processing', 'time_copy_to_gpu'
    ])

    valid_names = [valid_split]
    if args.valid_full_res:
        valid_names.append(valid_split + '_full-res')
    for valid_name in valid_names:
        # iou for every class
        for i in range(n_classes_without_void):
            log_keys_for_csv.append(f'IoU_{valid_name}_class_{i}')
        log_keys_for_csv.append(f'loss_{valid_name}')
        if loss_function_valid_unweighted is not None:
            log_keys_for_csv.append(f'loss_{valid_name}_unweighted')

    csvlogger = CSVLogger(log_keys_for_csv,
                          os.path.join(ckpt_dir, 'logs.csv'),
                          append=True)

    # one confusion matrix per camera and one for whole valid data
    confusion_matrices = dict()
    for camera in cameras:
        confusion_matrices[camera] = \
            ConfusionMatrixTensorflow(n_classes_without_void)
    # Built exactly once, and also when there are no cameras at all.
    confusion_matrices['all'] = \
        ConfusionMatrixTensorflow(n_classes_without_void)

    # start training -----------------------------------------------------------
    for epoch in range(int(start_epoch), args.epochs):
        # unfreeze
        if args.freeze == epoch and args.finetune is None:
            print('Unfreezing')
            for param in model.parameters():
                param.requires_grad = True

        logs = train_one_epoch(model,
                               train_loader,
                               device,
                               optimizer,
                               loss_function_train,
                               epoch,
                               lr_scheduler,
                               args.modality,
                               label_downsampling_rates,
                               debug_mode=args.debug)

        # validation after every epoch -----------------------------------------
        miou, logs = validate(model,
                              valid_loader,
                              device,
                              cameras,
                              confusion_matrices,
                              args.modality,
                              loss_function_valid,
                              logs,
                              ckpt_dir,
                              epoch,
                              loss_function_valid_unweighted,
                              debug_mode=args.debug)

        if args.valid_full_res:
            miou_full_res, logs = validate(model,
                                           valid_loader_full_res,
                                           device,
                                           cameras,
                                           confusion_matrices,
                                           args.modality,
                                           loss_function_valid,
                                           logs,
                                           ckpt_dir,
                                           epoch,
                                           loss_function_valid_unweighted,
                                           add_log_key='_full-res',
                                           debug_mode=args.debug)

        logs.pop('time', None)
        csvlogger.write_logs(logs)

        # save weights
        print(miou['all'])
        save_current_checkpoint = False
        if miou['all'] > best_miou:
            best_miou = miou['all']
            best_miou_epoch = epoch
            save_current_checkpoint = True

        if args.valid_full_res and miou_full_res['all'] > best_miou_full_res:
            best_miou_full_res = miou_full_res['all']
            best_miou_full_res_epoch = epoch
            save_current_checkpoint = True

        # don't save weights for the first 10 epochs as mIoU is likely getting
        # better anyway
        if epoch >= 10 and save_current_checkpoint:
            save_ckpt(ckpt_dir, model, optimizer, epoch)

        # save / overwrite latest weights (useful for resuming training)
        save_ckpt_every_epoch(ckpt_dir, model, optimizer, epoch, best_miou,
                              best_miou_epoch)

    # write a finish file with the best miou values to get a quick overview
    # of the training result
    with open(os.path.join(ckpt_dir, 'finished.txt'), 'w') as f:
        f.write('best miou: {}\n'.format(best_miou))
        f.write('best miou epoch: {}\n'.format(best_miou_epoch))
        if args.valid_full_res:
            f.write(f'best miou full res: {best_miou_full_res}\n')
            f.write(f'best miou full res epoch: {best_miou_full_res_epoch}\n')

    print("Training completed ")
예제 #7
0
파일: export.py 프로젝트: yrpang/mindspore
# limitations under the License.
# ============================================================================
"""export file."""
import numpy as np

from mindspore import context, Tensor
from mindspore.train.serialization import export
from src.models import get_generator
from src.utils import get_args, load_ckpt

# Parsed at import time: the device target below is taken from these args.
args = get_args("export")

context.set_context(mode=context.GRAPH_MODE, device_target=args.platform)

if __name__ == '__main__':
    G_A = get_generator(args)
    G_B = get_generator(args)
    # BatchNorm2d (batch size 1, affine=False, training=True) stands in for
    # InstanceNorm2d, so both generators stay in training mode and normalise
    # with the real mean and variance instead of the moving statistics.
    for network in (G_A, G_B):
        network.set_train(True)
    load_ckpt(args, G_A, G_B)

    # One random image in [-1, 1) serves as the example input for both
    # exports.
    example_shape = [1, 3, args.image_size, args.image_size]
    example_values = np.random.uniform(-1.0, 1.0, size=example_shape)
    example_input = Tensor(example_values.astype(np.float32))

    # NOTE(review): G_A is written to the "_BtoA" file and G_B to "_AtoB";
    # confirm this matches the direction each generator translates in.
    export_targets = (
        (G_A, f"{args.file_name}_BtoA"),
        (G_B, f"{args.file_name}_AtoB"),
    )
    for network, target_name in export_targets:
        export(network,
               example_input,
               file_name=target_name,
               file_format=args.file_format)
예제 #8
0
def main(config):
    """Train one experiment case and return its accuracy on the test split.

    The model is trained for ``max_epoch`` epochs. After every epoch it is
    evaluated on the validation split and a checkpoint is written whenever
    the validation accuracy improves. After training, the checkpoint is
    loaded again and evaluated on the test split.

    Args:
        config: Mapping providing the keys ``case_num``, ``dataset``,
            ``normalization``, ``batch_size``, ``max_epoch``, ``optimizer``,
            ``learning_rate``, ``lr_step``, ``lr_decay``, ``l2_decay``,
            ``model_name``, ``alpha``, ``beta``, ``phi``, ``loss_fn`` and
            ``kernel_size``.

    Returns:
        The test accuracy reported by ``evaluate`` for the reloaded
        checkpoint.
    """
    CASE_NUM = config['case_num']

    DATASET = config['dataset']
    NORMALIZATION = config['normalization']

    BATCH_SIZE = config['batch_size']
    MAX_EPOCH = config['max_epoch']
    OPTIM_TYPE = config['optimizer']
    LR = config['learning_rate']
    LR_STEP = config['lr_step']
    LR_DECAY = config['lr_decay']
    L2_DECAY = config['l2_decay']

    MODEL_NAME = config['model_name']
    ALPHA = config['alpha']
    BETA = config['beta']
    PHI = config['phi']
    LOSS_FN = config['loss_fn']
    KERNEL_SIZE = config['kernel_size']

    # `args` is the module-level command-line namespace.
    result_dir = make_dir(RESULT_ROOT_DIR,
                          str(CASE_NUM),
                          overwrite=args.overwrite)
    ckpt_path = result_dir + '/' + 'checkpoint.pt'

    # =============================================== Select data and construct
    data_fname, data_dim = select_data(DATASET)
    data_path = '../data/' + data_fname

    def _make_split(mode):
        """Build the dataset and its data loader for one split."""
        dataset = NLUDataset(data_path,
                             mode=mode,
                             normalization=NORMALIZATION,
                             random_seed=42)
        loader = DataLoader(dataset,
                            batch_size=BATCH_SIZE,
                            shuffle=True,
                            num_workers=4)
        return dataset, loader

    data_train, dataloader_train = _make_split('train')
    _, dataloader_valid = _make_split('valid')
    _, dataloader_test = _make_split('test')

    classes = data_train.labels
    num_classes = len(classes)

    # =============================================== Initialize model and optimizer
    device = ('cuda' if torch.cuda.is_available() else 'cpu')
    if device == 'cuda':
        print('Using GPU, %s' % torch.cuda.get_device_name(0))

    net = select_model(MODEL_NAME, data_dim, KERNEL_SIZE, num_classes, ALPHA,
                       BETA, PHI)
    net.to(device)
    loss_fn = select_loss(LOSS_FN)
    optimizer = select_optimizer(OPTIM_TYPE, net.parameters(), LR, L2_DECAY)
    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          step_size=LR_STEP,
                                          gamma=LR_DECAY)

    # =============================================== Train
    it = 0
    # Validation results are keyed by the iteration counter `it`.
    train_losses, valid_losses, valid_accs = {}, {}, {}
    best_validation_acc = 0
    log_term = 5

    for epoch in range(MAX_EPOCH):
        # ------------------------------------------------ One epoch start
        one_epoch_start = time.time()
        # Read the learning rate from the optimizer: it always holds the
        # value currently in effect, whereas scheduler.get_lr() is only
        # meant to be called from inside scheduler.step().
        print('Epoch {} / Learning Rate: {:.0e}'.format(
            epoch, optimizer.param_groups[0]['lr']))
        # ------------------------------------------------ Train
        train_losses, it, net, optimizer, scheduler = train_1epoch(
            dataloader_train,
            device,
            train_losses,
            it,
            net,
            loss_fn,
            optimizer,
            scheduler,
            log_every=log_term)
        # ------------------------------------------------ Validation
        valid_acc, valid_loss = evaluate(dataloader_valid, device, net,
                                         loss_fn)
        valid_losses[it] = valid_loss
        valid_accs[it] = valid_acc
        # ------------------------------------------------ Save model
        saved = ''
        if valid_acc > best_validation_acc:
            best_validation_acc = valid_acc
            # The value returned by save_ckpt is appended to the log line.
            saved = save_ckpt(ckpt_path, net, best_validation_acc)
        print('Epoch {} / Valid loss: {:.4f}, Valid acc: {:.4f} {}'.format(
            epoch, valid_loss, valid_acc, saved))
        # ------------------------------------------------ One epoch end
        curr_time = time.time()
        print("One epoch time = %.2f s" % (curr_time - one_epoch_start))
        print('#------------------------------------------------------#')

    save_train_log(result_dir, train_losses, valid_losses, valid_accs,
                   best_validation_acc)

    # =============================================== Test
    # Evaluate the saved checkpoint rather than the last-epoch weights.
    net, best_validation_acc = load_ckpt(ckpt_path, net)
    test_acc, _ = evaluate(dataloader_test, device, net, loss_fn)

    return test_acc
예제 #9
0
파일: train.py 프로젝트: zhangrj91/PoseNFS
def main():
    """Run architecture-search training in a single process.

    Steps, in order: parse the command line, prepare the experiment
    directory and logger, load the YAML configuration (applying command-line
    overrides), build the network (wrapped in ``DataParallel`` when more
    than one GPU is requested), create the data loaders, the optimizer and
    the LR scheduler, optionally resume from a checkpoint, and finally
    train / evaluate / save once per epoch. With ``arg.visualize`` set, a
    heatmap for one fixed validation image is produced every fifth epoch.

    Raises:
        ValueError: If ``config.train.scheduler_name`` is neither
            ``"MultiStepLR"`` nor ``"CosineAnnealingLR"``.
    """
    arg = args()

    # Validate the experiment path before creating anything on disk.
    assert arg.exp_name.split(
        '/')[0] == 'o', "'o' is the directory of experiment, --exp_name o/..."
    os.makedirs(arg.exp_name, exist_ok=True)
    output_dir = arg.exp_name

    save_scripts_in_exp_dir(output_dir)

    logger = logging_set(output_dir)

    logger.info(
        '\n================ experient name:[{}] ===================\n'.format(
            arg.exp_name))
    os.environ["CUDA_VISIBLE_DEVICES"] = arg.gpu

    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    np.random.seed(0)
    torch.manual_seed(0)

    # safe_load works without an explicit Loader argument and only builds
    # plain Python data; the context manager closes the file.
    with open(arg.cfg, 'r') as cfg_file:
        config = edict(yaml.safe_load(cfg_file))

    if arg.search:
        assert arg.search in [
            'None', 'sync', 'random', 'second_order_gradient',
            'first_order_gradient'
        ]
        config.train.arch_search_strategy = arg.search

    if arg.batchsize:
        logger.info("update batchsize to {}".format(arg.batchsize))
        config.train.batchsize = arg.batchsize

    config.num_workers = arg.num_workers

    with os.popen(
            'nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader'
    ) as smi:
        gpu_memory = smi.read()
    print('GPU memory : \ntotal | used\n', gpu_memory)

    logger.info(
        '------------------------------ configuration ---------------------------'
    )
    logger.info(
        '\n==> available {} GPUs , use numbers are {} device is {}\n'.format(
            torch.cuda.device_count(), os.environ["CUDA_VISIBLE_DEVICES"],
            torch.cuda.current_device()))
    logger.info(pprint.pformat(config))
    logger.info(
        '------------------------------- -------- ----------------------------'
    )

    criterion = MSELoss()

    Arch = bulid_up_network(config, criterion)

    if config.train.arch_search_strategy == 'random':

        logger.info("==>random seed is {}".format(config.train.random_seed))
        np.random.seed(config.train.random_seed)
        torch.manual_seed(config.train.random_seed)
        Arch.arch_parameters_random_search()

    if arg.param_flop:
        Arch._print_info()

    # arg.gpu is the CUDA_VISIBLE_DEVICES string, so anything longer than a
    # single character is treated as a multi-GPU request.
    use_multi_gpu = len(arg.gpu) > 1
    if use_multi_gpu:
        Arch = torch.nn.DataParallel(Arch).cuda()
    else:
        Arch = Arch.cuda()

    # DataParallel exposes the wrapped network as ``.module``; the search
    # helper and the checkpoint functions work on the bare network.
    bare_arch = Arch.module if use_multi_gpu else Arch
    Search = Search_Arch(bare_arch, config)

    search_strategy = config.train.arch_search_strategy
    train_queue, arch_queue, valid_queue = Dataloaders(search_strategy, config,
                                                       arg)
    # Note: if the search strategy is `None` or `SYNC`, the arch_queue is None!

    logger.info(
        "\nNeural Architecture Search strategy is {}".format(search_strategy))
    assert search_strategy in [
        'first_order_gradient', 'random', 'None', 'second_order_gradient',
        'sync'
    ]

    if search_strategy == 'sync':
        # arch_parameters is also registered to model's parameters
        # so the weight-optimizer will also update the arch_parameters
        logger.info(
            "sync: The arch_parameters is also optimized by weight-optmizer synchronously"
        )
        optimizer = torch.optim.Adam(
            Arch.parameters(),
            lr=config.train.w_lr_cosine_begin,
        )
    else:
        # if search strategy is None, random, second_order_gradient and so on
        # the arch_parameters will be filtered by the weight-optimizer
        optimizer = torch.optim.Adam(
            filter_arch_parameters(Arch),
            lr=config.train.w_lr_cosine_begin,
        )

    if config.train.scheduler_name == "MultiStepLR":
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.train.LR_STEP, config.train.LR_FACTOR)
    elif config.train.scheduler_name == "CosineAnnealingLR":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=config.train.epoch_end,
            eta_min=config.train.w_lr_cosine_end)
    else:
        # Fail here with a clear message instead of a NameError later on.
        raise ValueError("unsupported config.train.scheduler_name: {!r}".format(
            config.train.scheduler_name))

    # Best evaluation result so far; replaced when a checkpoint is resumed.
    best = 0

    logger.info(
        "\n=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+= training +=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=="
    )
    begin, end = config.train.epoch_begin, config.train.epoch_end

    if arg.load_ckpt:
        begin, best = load_ckpt(bare_arch, optimizer, scheduler, output_dir,
                                logger)

    for epoch in range(begin, end):

        # Read the learning rate from the optimizer: it always holds the
        # value currently in effect, whereas scheduler.get_lr() is only
        # meant to be called from inside scheduler.step().
        lr = optimizer.param_groups[0]['lr']
        logger.info(
            '==>time:({})--training...... current learning rate is {:.7f}'.
            format(datetime.datetime.now(), lr))

        train(
            epoch,
            train_queue,
            arch_queue,
            Arch,
            Search,
            criterion,
            optimizer,
            lr,
            search_strategy,
            output_dir,
            logger,
            config,
            arg,
        )
        scheduler.step()

        eval_results = evaluate(Arch, valid_queue, config, output_dir)
        best = save_model(epoch, best, eval_results, bare_arch, optimizer,
                          scheduler, output_dir, logger)

        # visualize the heatmap of one fixed validation image
        if arg.visualize and epoch % 5 == 0:
            for i in range(len(valid_queue.dataset)):
                # Fetch each sample only once; indexing the dataset may load
                # data on every access.
                sample = valid_queue.dataset[i]
                if sample[1] != 185250:  # choose an image_id
                    continue
                print(sample[1])

                img = sample[0].unsqueeze(0)
                # NOTE(review): Arch was moved to CUDA above while img comes
                # straight from the dataset - confirm the forward pass
                # accepts it as is.
                output = Arch(img)
                print(img.size(), output.size())
                visualize_heatamp(img, output, 'heatmaps', show_img=False)
                break