Example #1
def main():
    """Main function"""

    # Initialization
    args = parse_args()
    rank, n_ranks = init_workers(args.distributed_backend)

    # Load configuration
    config = load_config(args.config)

    # Prepare output directory
    output_dir = os.path.expandvars(args.output_dir if args.output_dir is not None
                                    else config['output_dir'])
    os.makedirs(output_dir, exist_ok=True)

    # Setup logging
    log_file = os.path.join(output_dir, 'out_%i.log' % rank)
    config_logging(verbose=args.verbose, log_file=log_file)
    logging.info('Initialized rank %i out of %i', rank, n_ranks)
    if rank == 0:
        logging.info('Configuration: %s', config)

    # Load the datasets
    is_distributed = args.distributed_backend is not None
    train_data_loader, valid_data_loader = get_data_loaders(
        distributed=is_distributed, **config['data_config'])

    # Load the trainer
    gpu = (rank % args.ranks_per_node) if args.rank_gpu else args.gpu
    if gpu is not None:
        logging.info('Using GPU %i', gpu)
    trainer = get_trainer(name=config['trainer'], distributed=is_distributed,
                          rank=rank, output_dir=output_dir, gpu=gpu)
    # Build the model
    trainer.build_model(**config['model_config'])
    if rank == 0:
        trainer.print_model_summary()

    # Run the training
    summary = trainer.train(train_data_loader=train_data_loader,
                            valid_data_loader=valid_data_loader,
                            **config['train_config'])
    trainer.write_summaries()

    # Print some conclusions
    logging.info('Finished training')
    logging.info('Train samples %g time %g s rate %g samples/s',
                 np.mean(summary['train_samples']),
                 np.mean(summary['train_time']),
                 np.mean(summary['train_rate']))
    if valid_data_loader is not None:
        logging.info('Valid samples %g time %g s rate %g samples/s',
                     np.mean(summary['valid_samples']),
                     np.mean(summary['valid_time']),
                     np.mean(summary['valid_rate']))

    logging.info('All done!')
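
The helpers parse_args, init_workers, and config_logging are not shown in these examples. A minimal sketch of what init_workers might look like, assuming a PyTorch distributed backend chosen on the command line and rank/world-size variables provided by the job launcher (the names and environment variables here are assumptions, not the original implementation):

import os
import torch.distributed as dist

def init_workers(backend=None):
    """Initialize distributed workers and return (rank, n_ranks). Sketch only."""
    if backend is None:
        # Serial run: a single worker with rank 0.
        return 0, 1
    # RANK/WORLD_SIZE are assumed to be set by the launcher (e.g. torchrun or SLURM).
    rank = int(os.environ.get('RANK', 0))
    n_ranks = int(os.environ.get('WORLD_SIZE', 1))
    dist.init_process_group(backend=backend, rank=rank, world_size=n_ranks)
    return rank, n_ranks
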
Example #2
def main():
    """Main function"""

    # Parse the command line
    args = parse_args()

    # Initialize distributed workers
    rank, n_ranks = init_workers(args.distributed)

    # Load configuration
    config = load_config(args.config, output_dir=args.output_dir,
                         n_ranks=n_ranks, crayai=args.crayai)
    config = update_config(config, args)
    os.makedirs(config['output_dir'], exist_ok=True)

    # Setup logging
    config_logging(verbose=args.verbose, output_dir=config['output_dir'],
                   append=args.resume, rank=rank)
    logging.info('Initialized rank %i out of %i', rank, n_ranks)
    if args.show_config and (rank == 0):
        logging.info('Command line config: %s', args)
    if rank == 0:
        logging.info('Configuration: %s', config)
        logging.info('Saving job outputs to %s', config['output_dir'])
        if args.distributed is not None:
            logging.info('Using distributed mode: %s', args.distributed)

    # Reproducible training
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(args.seed + 10)

    # Save configuration in the output directory
    if rank == 0:
        save_config(config)

    # Load the datasets
    is_distributed = (args.distributed is not None)
    # Workaround because multi-process I/O not working with MPI backend
    if args.distributed in ['ddp-mpi', 'cray']:
        if rank == 0:
            logging.info('Disabling I/O workers because of MPI issue')
        config['data']['n_workers'] = 0
    train_data_loader, valid_data_loader = get_data_loaders(
        distributed=is_distributed, rank=rank, n_ranks=n_ranks, **config['data'])
    logging.info('Loaded %g training samples', len(train_data_loader.dataset))
    if valid_data_loader is not None:
        logging.info('Loaded %g validation samples', len(valid_data_loader.dataset))

    # Load the trainer
    gpu = (rank % args.ranks_per_node) if args.rank_gpu else args.gpu
    if gpu is not None:
        logging.info('Choosing GPU %s', gpu)
    trainer = get_trainer(distributed_mode=args.distributed,
                          output_dir=config['output_dir'],
                          rank=rank, n_ranks=n_ranks,
                          gpu=gpu, pbt_checkpoint=args.pbt_checkpoint,
                          **config['trainer'])

    # Build the model and optimizer
    model_config = config.get('model', {})
    optimizer_config = config.get('optimizer', {})
    logging.debug("Building model")
    trainer.build_model(optimizer_config=optimizer_config, **model_config)
    if rank == 0:
        trainer.print_model_summary()

    # Checkpoint resume
    if args.resume:
        trainer.load_checkpoint()

    # Run the training
    logging.debug("Training")
    summary = trainer.train(train_data_loader=train_data_loader,
                            valid_data_loader=valid_data_loader,
                            **config['training'])

    # Print some conclusions
    n_train_samples = len(train_data_loader.sampler)
    logging.info('Finished training')
    train_time = summary.train_time.mean()
    logging.info('Train samples %g time %g s rate %g samples/s',
                 n_train_samples, train_time, n_train_samples / train_time)
    if valid_data_loader is not None:
        n_valid_samples = len(valid_data_loader.sampler)
        valid_time = summary.valid_time.mean()
        logging.info('Valid samples %g time %g s rate %g samples/s',
                     n_valid_samples, valid_time, n_valid_samples / valid_time)

    # Drop to IPython interactive shell
    if args.interactive and (rank == 0):
        logging.info('Starting IPython interactive session')
        import IPython
        IPython.embed()

    if rank == 0:
        if args.crayai:
            print("FoM: %e" % summary['valid_loss'][0])
        logging.info('All done!')
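
The seeding block above is what makes runs reproducible. A small sketch wrapping the same calls in a reusable helper; the helper name is illustrative and not part of the original script:

import numpy as np
import torch

def set_deterministic(seed):
    """Make training as reproducible as possible (same calls as the example above)."""
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True   # force deterministic cuDNN kernels
    torch.backends.cudnn.benchmark = False      # disable non-deterministic autotuning
    np.random.seed(seed + 10)                   # offset so the NumPy stream differs from torch's
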
Example #3
def main():
    """Main function"""

    # Parse the command line
    args = parse_args()

    # Setup logging
    log_format = '%(asctime)s %(levelname)s %(message)s'
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=log_level, format=log_format)
    logging.info('Initializing')
    if args.show_config:
        logging.info('Command line config: %s', args)

    # Initialize MPI
    if args.distributed:
        dist.init_process_group(backend='mpi')
        logging.info('MPI rank %i out of %i', dist.get_rank(),
                     dist.get_world_size())

    # Load configuration
    with open(args.config) as f:
        config = yaml.safe_load(f)
    if not args.distributed or (dist.get_rank() == 0):
        logging.info('Configuration: %s', config)
    data_config = config['data_config']
    model_config = config.get('model_config', {})
    train_config = config['train_config']

    # Load the datasets
    train_data_loader, valid_data_loader = get_data_loaders(
        distributed=args.distributed, **data_config)
    logging.info('Loaded %g training samples', len(train_data_loader.dataset))
    if valid_data_loader is not None:
        logging.info('Loaded %g validation samples',
                     len(valid_data_loader.dataset))
    print('train_data')
    print(train_data_loader.dataset)
    if valid_data_loader is not None:
        print('valid_data')
        print(valid_data_loader.dataset)

    # Load the trainer
    experiment_config = config['experiment_config']
    output_dir = experiment_config.pop('output_dir', None)
    if args.distributed and dist.get_rank() != 0:
        output_dir = None
    trainer = get_trainer(distributed=args.distributed,
                          output_dir=output_dir,
                          device=args.device,
                          **experiment_config)
    # Build the model
    trainer.build_model(**model_config)
    if not args.distributed or (dist.get_rank() == 0):
        trainer.print_model_summary()
    print('model')
    print(trainer)

    # Run the training
    summary = trainer.train(train_data_loader=train_data_loader,
                            valid_data_loader=valid_data_loader,
                            **train_config)
    if not args.distributed or (dist.get_rank() == 0):
        trainer.write_summaries()
    print('summary')
    print(summary)

    # Print some conclusions
    n_train_samples = len(train_data_loader.sampler)
    logging.info('Finished training')
    train_time = np.mean(summary['train_time'])
    logging.info('Train samples %g time %gs rate %g samples/s',
                 n_train_samples, train_time, n_train_samples / train_time)
    if valid_data_loader is not None:
        n_valid_samples = len(valid_data_loader.sampler)
        valid_time = np.mean(summary['valid_time'])
        logging.info('Valid samples %g time %g s rate %g samples/s',
                     n_valid_samples, valid_time, n_valid_samples / valid_time)

    # Drop to IPython interactive shell
    if args.interactive:
        logging.info('Starting IPython interactive session')
        import IPython
        IPython.embed()

    logging.info('All done!')
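
This script expects the YAML file to provide data_config, model_config, train_config, and experiment_config sections. A hedged sketch of such a file, loaded the same way; the concrete keys and values are illustrative only, not the original configuration:

import yaml

example_config = yaml.safe_load("""
data_config:
  batch_size: 128          # hypothetical keys forwarded to get_data_loaders
model_config:
  hidden_dim: 64           # hypothetical keys forwarded to trainer.build_model
train_config:
  n_epochs: 10             # hypothetical keys forwarded to trainer.train
experiment_config:
  output_dir: ./run0
""")
print(example_config['train_config'])
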
Example #4
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='SWD')

    # Mandatory Arguments
    parser.add_argument('--dataset', type=str, default="cifar10",
                        help="The dataset to consider")

    parser.add_argument('--lr', type=float, default=.1, metavar='LR',
                        help='learning rate (default: .1) (negative -> Adam)')

    parser.add_argument('--batch-size', type=int, default=128, metavar='N',
                        help='input batch size for training (default: 128)')

    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')

    parser.add_argument('--epochs', type=int, default=300, metavar='N',
                        help='number of epochs to train')

    parser.add_argument('--ft-epochs', type=int, default=150, metavar='N',
                        help='number of fine-tuning epochs')
    
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')

    parser.add_argument("--seed", type=int, default=random.randint(0,1000000),
                        help = "random seed to initialize.")

    parser.add_argument("--cutout", action="store_true",
                        help = "perform cutout")

    parser.add_argument("--mixup", action="store_true",
                        help = "perform mixup")

    parser.add_argument("--cutmix", action="store_true",
                        help = "perform cutmix")

    parser.add_argument("--auto-augment", action="store_true",
                        help = "perform auto_augment")

    parser.add_argument('--feature-maps', type=int, default=64,
                        help='Total feature_maps')

    parser.add_argument('--wd', default = "5e-4", type=float,
                        help='Weight decay')

    parser.add_argument('--a', default = "-1", type=float,
                        help='Parameter a')

    parser.add_argument('--width', default="0.1", type=float, help="parameter width")

    parser.add_argument('--half', action='store_true',
                        help='Half precision')

    ## ----------------------------------------------------------------------------------------
    args = parser.parse_args()
    
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    
    torch.manual_seed(args.seed)

    train_loader, test_loader, metadata = get_data_loaders(args)
    model = ResNet20(args).to(device)

    if args.half:
        model.half()  # convert to half precision
        for layer in model.modules():
            if isinstance(layer, torch.nn.BatchNorm2d):
                layer.float()

    n_params = torch.sum(torch.LongTensor([elt.numel() for elt in model.parameters()])).item()
    print(str(n_params) + " parameters maximum with " + str(args.feature_maps) + " feature maps")

    if args.lr > 0:
        optimizer = optim.SGD(model.parameters(), lr = args.lr, momentum = 0.9, weight_decay = 0)
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, [args.epochs // 3, 2 * args.epochs // 3], gamma=0.1)
    else:
        optimizer = optim.Adam(model.parameters())
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, [2 * args.epochs // 3], gamma=0.1)

    for epoch in range(args.epochs):
        train_data = train(model, epoch, args, device, train_loader, optimizer)
        test_data = test(model, args, device, test_loader)
        display_progress(epoch, train_data, test_data)
        torch.save(model, "/tmp/best_model.pt")
        scheduler.step()

    values = None
    for parameters in model.parameters():
        if values is None:
            if args.half:
                values = torch.abs(parameters.data.view(-1).half())
            else:
                values = torch.abs(parameters.data.view(-1))
        else:
            if args.half:
                values = torch.cat([values,torch.abs(parameters.data.view(-1).half())], dim=0)
            else:
                values = torch.cat([values,torch.abs(parameters.data.view(-1))], dim=0)
    values = torch.sort(values)[0]
    print("sorted {:d} values".format(values.shape[0]))
    
    perfs = []
    perfs_ft = []
    ths = []
    prunes = [500, 800, 900, 950, 980, 990, 995, 998, 999]
    for i in prunes:
        print("Testing with pruning {:3d}/1000...                       ".format(i),end='')
        model = torch.load("/tmp/best_model.pt")
        th = values[((i * values.shape[0]) // 1000)]
        ths.append(th.item())
        print(str(th.item()) + " ", end='')
        for parameters in model.parameters():
            if args.half:
                parameters.data = parameters.data * (torch.abs(parameters.data) >= th).half()
            else:
                parameters.data = parameters.data * (torch.abs(parameters.data) >= th).float()
        masks = []
        res = test(model, args, device, test_loader)
        perfs.append(res["test_acc"])
        print(str(res["test_acc"]) + " ", end='')
        for parameters in model.parameters():
            if args.half:
                masks.append((torch.abs(parameters.data) >= th).half())
            else:
                masks.append((torch.abs(parameters.data) >= th).float())
        print("tuning")

        if args.lr > 0:
            optimizer = optim.SGD(model.parameters(), lr = args.lr, momentum = 0.9, weight_decay = 0)
            scheduler = optim.lr_scheduler.MultiStepLR(optimizer, [args.ft_epochs // 3, 2 * args.ft_epochs // 3], gamma=0.1)
        else:
            optimizer = optim.Adam(model.parameters())
            scheduler = optim.lr_scheduler.MultiStepLR(optimizer, [2 * args.ft_epochs // 3], gamma=0.1)

        for epoch in range(args.epochs, args.epochs + args.ft_epochs):
            train_data = train(model, epoch, args, device, train_loader, optimizer, masks = masks)
            test_data = test(model, args, device, test_loader)
            scheduler.step()
        res = test(model, args, device, test_loader)
        perfs_ft.append(res["test_acc"])
        print(" " + str(res["test_acc"]))
    
    values = {
        "\"dataset\": \"{:s}\"": args.dataset,
        "\"wd\": {:f}": args.wd,
        "\"a\": {:f}": args.a,
        "\"width\": {:f}": args.width,
        "\"epochs\": {:d}": args.epochs,
        "\"ft-epochs\": {:d}": args.ft_epochs,
        "\"feature_maps\": {:d}": args.feature_maps,
        "\"auto_augment\": {:b}": args.auto_augment,
        "\"cutout\": {:b}": args.cutout,
        "\"mixup\": {:b}": args.mixup,
        "\"cutmix\": {:b}": args.cutmix,
        "\"seed\": {:d}" : args.seed,
        "\"training_loss\": {:f}": train_data["train_loss"],
        "\"training_acc\": {:f}": train_data["train_acc"],
        "\"test_loss\": {:f}": test_data["test_loss"],
        "\"test_acc\": {:f}": test_data["test_acc"],
        "\"nparams\": {:d}": n_params,
        "\"ths\": {:s}": str(ths),
        "\"perfs\": {:s}": str(perfs),
        "\"perfs_ft\": {:s}": str(perfs_ft)
    }
    file_output = open("results.txt","a")
    file_output.write("results.append({")
    for key in values.keys():
        file_output.write(key.format(values[key]) + ", ")
    file_output.write("})\n")
    file_output.close()
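
The fine-tuning loop above passes masks into train() so that pruned weights stay at zero. That train() implementation is not shown; a minimal sketch of how the masks could be applied after each optimizer step (an assumption about train(), not the original code):

import torch

def apply_masks(model, masks):
    """Re-zero pruned parameters; intended to be called right after optimizer.step()."""
    if masks is None:
        return
    with torch.no_grad():
        for parameters, mask in zip(model.parameters(), masks):
            parameters.mul_(mask)
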
Example #5
    parser.add_argument("-temperature",
                        "--temperature",
                        type=float,
                        default=10.0)
    parser.add_argument("-distil-weight",
                        "--distil-weight",
                        type=float,
                        default=10.0)
    args = parser.parse_args()
    print(args)
    args.no_shuffle = True

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    torch.manual_seed(args.seed)

    train_loader, test_loader, metadata = get_data_loaders(args)

    teacher_model = ResNet.fromFile(args.teacher)
    teacher_model.eval()

    model = get_model(args, metadata)

    print("N parameters : ", model.n_parameters)
    if args.resume is not None:
        model.load_state_dict(torch.load(args.resume)["state_dict"])

    teacher_model = teacher_model.to(device)
    model = model.to(device)

    scheduler = None
    if args.optimizer == "sgd":
Example #6
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='SWD')

    # Mandatory Arguments
    parser.add_argument('--dataset',
                        type=str,
                        default="cifar10",
                        help="The dataset to consider")

    parser.add_argument('--lr',
                        type=float,
                        default=.1,
                        metavar='LR',
                        help='learning rate (default: .1) (negative -> Adam)')

    parser.add_argument('--batch-size',
                        type=int,
                        default=128,
                        metavar='N',
                        help='input batch size for training (default: 128)')

    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')

    parser.add_argument('--epochs',
                        type=int,
                        default=300,
                        metavar='N',
                        help='number of epochs to train')

    parser.add_argument('--model',
                        type=str,
                        default="resnet18",
                        choices=list(dict_models.keys()),
                        help='model to train')

    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')

    parser.add_argument("--seed",
                        type=int,
                        default=random.randint(0, 1000000),
                        help="random seed to initialize.")

    parser.add_argument("--cutout", action="store_true", help="perform cutout")

    parser.add_argument("--mixup", action="store_true", help="perform mixup")

    parser.add_argument("--cutmix", action="store_true", help="perform cutmix")

    parser.add_argument("--auto-augment",
                        action="store_true",
                        help="perform auto_augment")

    parser.add_argument('--feature-maps',
                        type=int,
                        default=64,
                        help='Total feature_maps')

    parser.add_argument('--wd',
                        default="5e-4",
                        type=float,
                        help='Weight decay')

    parser.add_argument('--temp-init',
                        default="1",
                        type=float,
                        help='Initial importance of binarization')

    parser.add_argument('--temp-final',
                        default="1e2",
                        type=float,
                        help='Final importance of binarization')

    parser.add_argument('-l', default=-1, type=int, help='l')
    parser.add_argument('-c', default=-1, type=int, help='c')

    parser.add_argument('--output',
                        default="results.txt",
                        type=str,
                        help="Output file to write on")

    parser.add_argument('--half', action='store_true', help='Half precision')

    ## ----------------------------------------------------------------------------------------
    args = parser.parse_args()

    if args.c == -1 and args.l == -1:
        raise Exception("Set c or l")
    elif args.c != -1 and args.l != -1:
        raise Exception("Set only one of c or l")
    if not os.path.isdir("result"):
        os.makedirs("result")
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    torch.manual_seed(args.seed)

    train_loader, test_loader, metadata = get_data_loaders(args)

    model = dict_models[args.model](
        args, num_classes=metadata["n_classes"]).to(device)

    if args.half:
        model.half()  # convert to half precision
        for layer in model.modules():
            if isinstance(layer, torch.nn.BatchNorm2d):
                layer.float()

    n_params = torch.sum(
        torch.LongTensor([elt.numel() for elt in model.parameters()])).item()
    print(
        str(n_params) + " parameters maximum with " + str(args.feature_maps) +
        " feature maps")

    if args.lr > 0:
        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr,
                              momentum=0.9,
                              weight_decay=args.wd)
        scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer, [args.epochs // 3, 2 * args.epochs // 3], gamma=0.1)
    else:
        optimizer = optim.Adam(model.parameters())
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                                   [2 * args.epochs // 3],
                                                   gamma=0.1)

    for epoch in range(args.epochs):
        train_data = train(model, epoch, args, device, train_loader, optimizer)
        test_data = test(model, epoch, args, device, test_loader)
        display_progress(epoch, train_data, test_data)
        scheduler.step()

    values = {
        "\"model\": \"{:s}\"": args.model,
        "\"dataset\": \"{:s}\"": args.dataset,
        "\"wd\": {:f}": args.wd,
        "\"epochs\": {:d}": args.epochs,
        "\"feature_maps\": {:d}": args.feature_maps,
        "\"auto_augment\": {:b}": args.auto_augment,
        "\"cutout\": {:b}": args.cutout,
        "\"mixup\": {:b}": args.mixup,
        "\"cutmix\": {:b}": args.cutmix,
        "\"seed\": {:d}": args.seed,
        "\"training_loss\": {:f}": train_data["train_loss"],
        "\"training_acc\": {:f}": train_data["train_acc"],
        "\"test_loss\": {:f}": test_data["test_loss"],
        "\"test_acc\": {:f}": test_data["test_acc"],
        "\"nb_ops\": {:f}": params.nb_ops,
        "\"nparams\": {:d}": n_params,
        "\"temp_init\": {:f}": args.temp_init,
        "\"temp_final\": {:f}": args.temp_final,
        "\"l\": {:f}": args.l,
        "\"c\": {:f}": args.c
    }
    file_output = open(args.output, "a")
    file_output.write("results.append({")
    for key in values.keys():
        file_output.write(key.format(values[key]) + ", ")
    file_output.write("})\n")
    file_output.close()
    filename = "result/"
    for key, value in sorted(values.items()):
        if "training_loss" not in key and "training_acc" not in key and "test_loss" not in key and "test_acc" not in key and "nparams" not in key:
            filename += "{}_".format(value)


#            print(key,value)
    filename = "{}.pt".format(filename[:-1])
    torch.save(model, filename)
Example #7
    os.makedirs(out_dir, exist_ok=True)

    logging.basicConfig(filename=(out_dir + "/process_log.log"),
                        level=logging.INFO,
                        format=logfilename)
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

    logging.info("=" * 20)
    logging.info("=" * 20)
    logging.info("=" * 20)
    logging.info("\nStarting...")
    logging.info("Config:")
    logging.info(cfg)

    train_data_loader, valid_data_loader = get_data_loaders(cfg['train'])
    logging.info('Loaded %g training samples', len(train_data_loader.dataset))
    if valid_data_loader is not None:
        logging.info('Loaded %g validation samples',
                     len(valid_data_loader.dataset))

    # Load the trainer
    trainer = GNNTrainer(cfg['trainer'],
                         output_dir=out_dir,
                         device=args_in.device,
                         train_loader=train_data_loader)
    # Build the model and optimizer
    trainer.build_model(**cfg.get('model', {}))
    trainer.print_model_summary()
    # Run the training
    summary = trainer.train(train_data_loader=train_data_loader,
Example #8
def main():
    """Main function"""

    # Initialization
    args = parse_args()
    rank, n_ranks = init_workers(args.distributed)

    # Load configuration
    config = load_config(args.config)
    data_config = config['data_config']
    model_config = config.get('model_config', {})
    train_config = config['train_config']

    # Prepare output directory
    output_dir = config.get('output_dir', None)
    if output_dir is not None:
        output_dir = os.path.expandvars(output_dir)
        os.makedirs(output_dir, exist_ok=True)

    # Setup logging
    log_file = (os.path.join(output_dir, 'out_%i.log' %
                             rank) if output_dir is not None else None)
    config_logging(verbose=args.verbose, log_file=log_file)
    logging.info('Initialized rank %i out of %i', rank, n_ranks)
    if rank == 0:
        logging.info('Configuration: %s', config)

    # Load the datasets
    train_data_loader, valid_data_loader = get_data_loaders(
        distributed=args.distributed, **data_config)

    # Load the trainer
    trainer = get_trainer(name=config['trainer'],
                          distributed=args.distributed,
                          rank=rank,
                          output_dir=output_dir,
                          device=args.device)
    # Build the model
    trainer.build_model(**model_config)
    if rank == 0:
        trainer.print_model_summary()

    # Run the training
    summary = trainer.train(train_data_loader=train_data_loader,
                            valid_data_loader=valid_data_loader,
                            **train_config)
    if output_dir is not None:
        trainer.write_summaries()

    # Print some conclusions
    n_train_samples = len(train_data_loader.sampler)
    logging.info('Finished training')
    train_time = np.mean(summary['train_time'])
    logging.info('Train samples %g time %g s rate %g samples/s',
                 n_train_samples, train_time, n_train_samples / train_time)
    if valid_data_loader is not None:
        n_valid_samples = len(valid_data_loader.sampler)
        valid_time = np.mean(summary['valid_time'])
        logging.info('Valid samples %g time %g s rate %g samples/s',
                     n_valid_samples, valid_time, n_valid_samples / valid_time)

    # Drop to IPython interactive shell
    if args.interactive and rank == 0:
        logging.info('Starting IPython interactive session')
        import IPython
        IPython.embed()

    logging.info('All done!')
Example #9
def distributed_node(process_id, args):

    print('===>>> Process', process_id, 'in Node', args.node_rank)
    total_time = TimeLapse()
    total_time.start()

    args.process_id = process_id
    loggers = initialise_loggers(args)
    set_training_environment(args)

    train_loader, val_loader, test_loader = get_data_loaders(args)

    checkpoint = recover_saved_session(args)

    model, optimizer, criterion, saved_loggers = build_model(args, checkpoint)

    if saved_loggers is not None:
        loggers = saved_loggers
    else:
        get_model_metrics(model, args, loggers)

    ### TODO
    ### This is needed to add new fields to partially trained runs, because restoring
    ### saved_loggers drops any keys that were not saved. It could be handled automatically
    ### by comparing a fresh, empty loggers dict with the restored one and adding the
    ### missing keys (or removing keys that are no longer used).
    if 'epoch_number' not in loggers: loggers['epoch_number'] = ListMeter()

    #print_loggers(loggers)

    print(
        '===>>> Model has been trained for {} epochs out of {} required'.format(
            args.start_epoch, args.epochs))

    for epoch in range(args.start_epoch, args.epochs):

        ###args.epoch = epoch
        loggers['last_epoch'] = epoch

        adjust_lr(optimizer, args, loggers)

        train(model, train_loader, optimizer, criterion, args, loggers)
        evaluate(model, val_loader, criterion, args, loggers)
        if test_loader is not None:
            evaluate(model,
                     test_loader,
                     criterion,
                     epoch,
                     args,
                     loggers,
                     validation=False)

        # Save checkpoint only in master process
        if args.node_rank == 0 and args.process_id == 0:

            is_best = loggers['epoch_val_performance'].val > loggers[
                'best_val_performance']
            if is_best:
                loggers['best_val_performance'] = loggers[
                    'epoch_val_performance'].val
                loggers['train_best_val_performance'] = loggers[
                    'epoch_train_performance'].val
                if test_loader is not None:
                    loggers['test_best_val_performance'] = loggers[
                        'epoch_test_performance'].val
                loggers['epoch_best_val_performance'] = loggers['last_epoch']

            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'amp': amp.state_dict(),
                    'loggers': loggers,
                }, is_best, args)
            save_loggers(loggers, args)
            #print_loggers(loggers)

    total_time.stop()
    # TODO: figure out how to recover the elapsed time if the process is interrupted
    # before reaching the final epoch, where the total time is computed
    loggers['total_time'] = loggers['total_time'] + total_time.time()
    save_loggers(loggers, args)
    save_summary(loggers, args)
    #print('This session time:', total_time.time())
    print('Total time for ', args.epochs, 'epochs:', loggers['total_time'])
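distributed_node takes a process_id as its first argument, which matches the calling convention of torch.multiprocessing.spawn. A sketch of how a per-node launcher might invoke it; the argument names here (such as processes_per_node) are assumptions, not the original launcher:

import torch.multiprocessing as mp

def launch(args):
    # spawn calls distributed_node(process_id, args) once per local process on this node.
    mp.spawn(distributed_node, args=(args,), nprocs=args.processes_per_node)
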
Example #10
def main():
    """Main function"""

    # Initialization
    args = parse_args()
    rank, n_ranks = init_workers(args.distributed_backend)

    # Load configuration
    config = load_config(args)

    # Prepare output directory
    output_dir = config.get('output_dir', None)
    if output_dir is not None:
        output_dir = os.path.expandvars(output_dir)
        os.makedirs(output_dir, exist_ok=True)

    # Setup logging
    log_file = (os.path.join(output_dir, 'out_%i.log' %
                             rank) if output_dir is not None else None)
    config_logging(verbose=args.verbose, log_file=log_file, append=args.resume)
    logging.info('Initialized rank %i out of %i', rank, n_ranks)
    try_barrier()
    if rank == 0:
        logging.info('Configuration: %s', config)

    # Load the datasets
    distributed = args.distributed_backend is not None
    train_data_loader, valid_data_loader = get_data_loaders(
        distributed=distributed, **config['data'])

    # Load the trainer
    gpu = (rank % args.ranks_per_node) if args.rank_gpu else args.gpu
    if gpu is not None:
        logging.info('Using GPU %i', gpu)
    trainer = get_trainer(name=config['trainer'],
                          distributed=distributed,
                          rank=rank,
                          output_dir=output_dir,
                          gpu=gpu)

    # Build the model and optimizer
    trainer.build(config)

    # Resume from checkpoint
    if args.resume:
        trainer.load_checkpoint()

    # Run the training
    summary = trainer.train(train_data_loader=train_data_loader,
                            valid_data_loader=valid_data_loader,
                            **config['train'])

    # Print some conclusions
    try_barrier()
    n_train_samples = len(train_data_loader.sampler)
    logging.info('Finished training')
    train_time = np.mean(summary['train_time'])
    logging.info('Train samples %g time %g s rate %g samples/s',
                 n_train_samples, train_time, n_train_samples / train_time)
    if valid_data_loader is not None:
        n_valid_samples = len(valid_data_loader.sampler)
        valid_time = np.mean(summary['valid_time'])
        logging.info('Valid samples %g time %g s rate %g samples/s',
                     n_valid_samples, valid_time, n_valid_samples / valid_time)

    logging.info('All done!')
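
try_barrier is not shown in this example; a plausible sketch is a barrier that becomes a no-op when no process group has been initialized (an assumption, not the original helper):

import torch.distributed as dist

def try_barrier():
    """Synchronize all ranks if a process group is available, otherwise do nothing."""
    if dist.is_available() and dist.is_initialized():
        dist.barrier()
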
Example #11
def main():
    """Main function"""

    # Parse the command line
    args = parse_args()
    # Initialize MPI
    rank, n_ranks = init_workers(args.distributed)

    # Load configuration
    config = load_config(args.config)
    output_dir = os.path.expandvars(config.get('output_dir', None))
    if rank == 0:
        os.makedirs(output_dir, exist_ok=True)
    else:
        output_dir = None

    # Setup logging
    config_logging(verbose=args.verbose, output_dir=output_dir)
    logging.info('Initialized rank %i out of %i', rank, n_ranks)
    if args.show_config and (rank == 0):
        logging.info('Command line config: %s', args)
    if rank == 0:
        logging.info('Configuration: %s', config)
        logging.info('Saving job outputs to %s', output_dir)

    # Load the datasets
    train_data_loader, valid_data_loader = get_data_loaders(
        distributed=args.distributed, **config['data'])
    logging.info('Loaded %g training samples', len(train_data_loader.dataset))
    if valid_data_loader is not None:
        logging.info('Loaded %g validation samples',
                     len(valid_data_loader.dataset))

    # Load the trainer
    trainer = get_trainer(distributed=args.distributed,
                          output_dir=output_dir,
                          device=args.device,
                          **config['trainer'])
    # Build the model and optimizer
    trainer.build_model(**config.get('model', {}))
    if rank == 0:
        trainer.print_model_summary()

    # Run the training
    summary = trainer.train(train_data_loader=train_data_loader,
                            valid_data_loader=valid_data_loader,
                            **config['training'])
    if rank == 0:
        trainer.write_summaries()

    # Print some conclusions
    n_train_samples = len(train_data_loader.sampler)
    logging.info('Finished training')
    train_time = np.mean(summary['train_time'])
    logging.info('Train samples %g time %g s rate %g samples/s',
                 n_train_samples, train_time, n_train_samples / train_time)
    if valid_data_loader is not None:
        n_valid_samples = len(valid_data_loader.sampler)
        valid_time = np.mean(summary['valid_time'])
        logging.info('Valid samples %g time %g s rate %g samples/s',
                     n_valid_samples, valid_time, n_valid_samples / valid_time)

    # Drop to IPython interactive shell
    if args.interactive and (rank == 0):
        logging.info('Starting IPython interactive session')
        import IPython
        IPython.embed()

    if rank == 0:
        logging.info('All done!')