예제 #1
0
    def _load_lr_finder(self):
        """Create the learning-rate finder and keep it on ``self.lr_finder``.

        ``LRFinder`` is imported from ``utils.lr_finder`` inside the method, so
        the import only happens when this method is actually called. The finder
        receives the parsed arguments, the data loaders, the model, the
        criterion and the optimizer already stored on the instance.
        """
        print('loading lr finder...')

        from utils.lr_finder import LRFinder

        finder_inputs = (
            self.args,
            self.data_loader,
            self.model,
            self.criterion,
            self.optimizer,
        )
        self.lr_finder = LRFinder(*finder_inputs)

        print('lr finder load finished!')
예제 #2
0
 def get_good_lr(self, model, model_file):
     """Run an LR search for ``model`` on the stored training data.

     Builds an ``LRFinder`` from ``model`` and ``model_file`` and calls its
     ``find`` method on ``self.x_train`` / ``self.y_train`` with a fixed
     search range (1e-6 to 10), 2 epochs and 300 batches.

     Returns:
         Whatever ``LRFinder.find`` returns (called with
         ``return_model=False``).
     """
     finder = LRFinder(model, model_file)
     search_options = dict(
         start_lr=0.000001,
         end_lr=10,
         batch_size=self.batch_size,
         epochs=2,
         num_batches=300,
         return_model=False,
     )
     return finder.find(self.x_train, self.y_train, **search_options)
예제 #3
0
 def lr_finder(self, end_lr=10, num_iter=100, img_path='./'):
     """Run an LR range test on the training loader and save the plot.

     Args:
         end_lr: upper learning rate forwarded to ``range_test``.
         num_iter: number of iterations forwarded to ``range_test``.
         img_path: destination passed to ``savefig``.
     """
     print('Start finding LR')

     # The finder is always placed on the first CUDA device.
     finder = LRFinder(self.model, self.optimizer, self.criterion,
                       device="cuda:0")
     finder.range_test(self.train_loader, end_lr=end_lr, num_iter=num_iter)

     # Draw the curve, write it out, then clear the figure
     # (savefig/clf are presumably matplotlib.pyplot helpers -- confirm).
     finder.plot()
     savefig(img_path)
     clf()
예제 #4
0
파일: train.py 프로젝트: gouskos/weaver
def main():
    """Command-line entry point for training, prediction and ONNX export.

    Parses the command line, builds the data loaders and the network described
    by ``--data-config`` / ``--network-config``, and then runs exactly one of:

    * ONNX export (``--export-onnx``): load the weights found at
      ``--model-prefix``, write the ONNX file and return.
    * learning-rate finder (``--lr-finder``): run a range test on the training
      loader, save ``lr_finder.png`` and return.
    * training (default): train for ``--num-epochs`` epochs, writing per-epoch
      model/optimizer state files and keeping a copy of the best-accuracy one.
    * prediction (``--predict``): evaluate on ``--data-test`` and, if
      ``--predict-output`` is set, write the outputs to that file.

    Raises:
        NotImplementedError: if ``--use-amp`` is given (not supported yet).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-c',
                        '--data-config',
                        type=str,
                        default='data/ak15_points_pf_sv_v0.yaml',
                        help='data config YAML file')
    parser.add_argument('-i',
                        '--data-train',
                        nargs='*',
                        default=[],
                        help='training files')
    parser.add_argument('-t',
                        '--data-test',
                        nargs='*',
                        default=[],
                        help='testing files')
    parser.add_argument(
        '--data-fraction',
        type=float,
        default=1,
        help=
        'fraction of events to load from each file; for training, the events are randomly selected for each epoch'
    )
    parser.add_argument(
        '--data-dilation',
        type=int,
        default=1,
        help=
        'reduce number of file by a factor of `d` for training. NOT recommended in general - use `--data-fraction` instead.'
    )
    parser.add_argument(
        '--files-per-fetch',
        type=int,
        default=20,
        help=
        'number of files to load each time; shuffling is done within these events, so choose a number large enough to get events from all classes'
    )
    parser.add_argument('--train-val-split',
                        type=float,
                        default=0.8,
                        help='training/validation split fraction')
    parser.add_argument(
        '--demo',
        action='store_true',
        default=False,
        help=
        'quickly test the setup by running over only a small number of events')
    parser.add_argument(
        '--lr-finder',
        type=str,
        default=None,
        help=
        'run learning rate finder instead of the actual training; format: ``start_lr, end_lr, num_iters``'
    )
    parser.add_argument(
        '-n',
        '--network-config',
        type=str,
        default='networks/particle_net_pfcand_sv.py',
        help=
        'network architecture configuration file; the path must be relative to the current dir'
    )
    parser.add_argument(
        '--network-option',
        nargs=2,
        action='append',
        default=[],
        help=
        'options to pass to the model class constructor, e.g., `--network-option use_counts False`'
    )
    parser.add_argument(
        '-m',
        '--model-prefix',
        type=str,
        default='test_output/model_name',
        help=
        'path to save or load the model; for training, this will be used as a prefix; for testing, this should be the full path including extension'
    )
    parser.add_argument('--num-epochs',
                        type=int,
                        default=20,
                        help='number of epochs')
    parser.add_argument(
        '--optimizer',
        type=str,
        default='ranger',
        choices=['adam', 'ranger'],  # TODO: add more
        help='optimizer for the training')
    # argparse %-formats help strings, so a literal `%` must be written `%%`;
    # an unescaped `%d` makes help formatting fail.
    parser.add_argument(
        '--load-epoch',
        type=int,
        default=None,
        help=
        'used to resume interrupted training, load model and optimizer state saved in the `epoch-%%d_state.pt` and `epoch-%%d_optimizer.pt` files'
    )
    parser.add_argument('--start-lr',
                        type=float,
                        default=5e-3,
                        help='start learning rate')
    parser.add_argument(
        '--lr-steps',
        type=str,
        default='10,20',
        help=
        'steps to reduce the lr; currently only used when setting `--optimizer` to adam'
    )
    parser.add_argument('--batch-size',
                        type=int,
                        default=128,
                        help='batch size')
    parser.add_argument(
        '--use-amp',
        action='store_true',
        default=False,
        help='use mixed precision training (fp16); NOT WORKING YET')
    # Double-quoted so the `''` in the text survives; inside single quotes it
    # silently turned into an implicit concatenation and vanished from the help.
    parser.add_argument(
        '--gpus',
        type=str,
        default='0',
        help=
        "device for the training/testing; to use CPU, set to empty string (''); to use multiple gpu, set it as a comma separated list, e.g., `1,2,3,4`"
    )
    parser.add_argument(
        '--num-workers',
        type=int,
        default=2,
        help=
        'number of threads to load the dataset; memory consuption and disk access load increases (~linearly) with this numbers'
    )
    parser.add_argument('--predict',
                        action='store_true',
                        default=False,
                        help='run prediction instead of training')
    parser.add_argument(
        '--predict-output',
        type=str,
        help=
        'path to save the prediction output, support `.root` and `.awkd` format'
    )
    parser.add_argument(
        '--export-onnx',
        type=str,
        default=None,
        help=
        'export the PyTorch model to ONNX model and save it at the given path (path must ends w/ .onnx); '
        'needs to set `--data-config`, `--network-config`, and `--model-prefix` (requires the full model path)'
    )

    args = parser.parse_args()
    _logger.info(args)

    # Mixed precision is rejected up front, so nothing below needs an AMP path.
    if args.use_amp:
        raise NotImplementedError(
            'mixed precision training (`--use-amp`) is not supported yet')

    if args.data_dilation > 1:
        _logger.warning(
            'Use of `data-dilation` is not recomended in general -- consider using `data-fraction` instead.'
        )

    # training/testing mode
    training_mode = not args.predict

    # device: the first listed GPU is the primary device, the rest are only
    # used through DataParallel further down
    if args.gpus:
        gpus = [int(i) for i in args.gpus.split(',')]
        dev = torch.device(gpus[0])
    else:
        gpus = None
        dev = torch.device('cpu')

    # load data
    if training_mode:
        filelist = sorted(sum([glob.glob(f) for f in args.data_train], []))
        # np.random.seed(1)
        np.random.shuffle(filelist)
        if args.demo:
            # demo mode: cap the file list and shrink the per-file load
            filelist = filelist[:20]
            _logger.info(filelist)
            args.data_fraction = 0.1
            args.files_per_fetch = 5
        # Training and validation datasets are built from the same file list;
        # they differ only in the range handed to `partial_load`.
        train_data = SimpleIterDataset(filelist,
                                       args.data_config,
                                       for_training=True,
                                       partial_load=((0, args.train_val_split),
                                                     args.data_fraction),
                                       dilation=args.data_dilation,
                                       files_per_fetch=args.files_per_fetch)
        val_data = SimpleIterDataset(filelist,
                                     args.data_config,
                                     for_training=True,
                                     partial_load=((args.train_val_split, 1),
                                                   args.data_fraction),
                                     dilation=args.data_dilation,
                                     files_per_fetch=args.files_per_fetch)
        train_loader = DataLoader(train_data,
                                  num_workers=args.num_workers,
                                  batch_size=args.batch_size,
                                  drop_last=True,
                                  pin_memory=True)
        val_loader = DataLoader(val_data,
                                num_workers=args.num_workers,
                                batch_size=args.batch_size,
                                drop_last=True,
                                pin_memory=True)
        data_config = train_data.config
    else:
        filelist = sorted(sum([glob.glob(f) for f in args.data_test], []))
        test_data = SimpleIterDataset(filelist,
                                      args.data_config,
                                      for_training=False,
                                      files_per_fetch=1)
        test_loader = DataLoader(test_data,
                                 num_workers=args.num_workers,
                                 batch_size=args.batch_size,
                                 drop_last=False,
                                 pin_memory=True)
        data_config = test_data.config

    # model: the network config path is turned into a dotted module name
    network_module = import_module(
        args.network_config.replace('.py', '').replace('/', '.'))
    # option values arrive as strings and are parsed as Python literals
    network_options = {k: ast.literal_eval(v) for k, v in args.network_option}
    if args.export_onnx:
        network_options['for_inference'] = True
    model, model_info = network_module.get_model(data_config,
                                                 **network_options)
    _logger.info(model)

    # export to ONNX
    if args.export_onnx:
        assert (args.export_onnx.endswith('.onnx'))
        model_path = args.model_prefix
        _logger.info('Exporting model %s to ONNX' % model_path)
        model.load_state_dict(torch.load(model_path, map_location='cpu'))
        model = model.cpu()
        model.eval()

        # `os.makedirs('')` raises, so only create a directory when the output
        # path actually contains one (a bare file name targets the cwd).
        onnx_dir = os.path.dirname(args.export_onnx)
        if onnx_dir:
            os.makedirs(onnx_dir, exist_ok=True)
        # dummy all-ones inputs, one per declared input name
        inputs = tuple(
            torch.ones(model_info['input_shapes'][k], dtype=torch.float32)
            for k in model_info['input_names'])
        torch.onnx.export(model,
                          inputs,
                          args.export_onnx,
                          input_names=model_info['input_names'],
                          output_names=model_info['output_names'],
                          dynamic_axes=model_info.get('dynamic_axes', None),
                          opset_version=11)
        _logger.info('ONNX model saved to %s', args.export_onnx)
        return

    # note: we should always save/load the state_dict of the original model, not the one wrapped by nn.DataParallel
    # so we do not convert it to nn.DataParallel now
    model = model.to(dev)

    # loss function: fall back to cross entropy when the network module does
    # not provide `get_loss`
    try:
        loss_func = network_module.get_loss(data_config, **network_options)
        _logger.info(loss_func)
    except AttributeError:
        loss_func = torch.nn.CrossEntropyLoss()
        _logger.warning(
            'Loss function not defined in %s. Will use `torch.nn.CrossEntropyLoss()` by default.',
            args.network_config)

    if training_mode:
        # optimizer & learning rate
        # (no scheduler is built in lr-finder mode; that mode returns before
        # the training loop, which is the only place `scheduler` is used)
        if args.optimizer == 'adam':
            opt = torch.optim.Adam(model.parameters(), lr=args.start_lr)
            if args.lr_finder is None:
                lr_steps = [int(x) for x in args.lr_steps.split(',')]
                scheduler = torch.optim.lr_scheduler.MultiStepLR(
                    opt, milestones=lr_steps, gamma=0.1)
        else:
            from utils.nn.optimizer.ranger import Ranger
            opt = Ranger(model.parameters(), lr=args.start_lr)
            if args.lr_finder is None:
                # decay on every one of the last ~30% of the epochs, with a
                # per-step factor chosen so the total reduction is 0.01
                lr_decay_epochs = max(1, int(args.num_epochs * 0.3))
                lr_decay_rate = 0.01**(1. / lr_decay_epochs)
                scheduler = torch.optim.lr_scheduler.MultiStepLR(
                    opt,
                    milestones=list(
                        range(args.num_epochs - lr_decay_epochs,
                              args.num_epochs)),
                    gamma=lr_decay_rate)

        # TODO: mixed precision training (`--use-amp` is rejected right after
        # argument parsing, so there is no AMP initialisation here)

        # load previous training and resume if `--load-epoch` is set
        if args.load_epoch is not None:
            _logger.info('Resume training from epoch %d' % args.load_epoch)
            model_state = torch.load(args.model_prefix +
                                     '_epoch-%d_state.pt' % args.load_epoch,
                                     map_location=dev)
            model.load_state_dict(model_state)
            opt_state = torch.load(args.model_prefix +
                                   '_epoch-%d_optimizer.pt' % args.load_epoch,
                                   map_location=dev)
            opt.load_state_dict(opt_state)

        # multi-gpu
        if gpus is not None and len(gpus) > 1:
            # model becomes `torch.nn.DataParallel` w/ model.module being the original `torch.nn.Module`
            model = torch.nn.DataParallel(model, device_ids=gpus)
        model = model.to(dev)

        # lr finder: keep it after all other setups
        if args.lr_finder is not None:
            start_lr, end_lr, num_iter = args.lr_finder.replace(' ',
                                                                '').split(',')
            from utils.lr_finder import LRFinder
            lr_finder = LRFinder(model,
                                 opt,
                                 loss_func,
                                 device=dev,
                                 input_names=train_data.config.input_names,
                                 label_names=train_data.config.label_names)
            lr_finder.range_test(train_loader,
                                 start_lr=float(start_lr),
                                 end_lr=float(end_lr),
                                 num_iter=int(num_iter))
            # to inspect the loss-learning rate graph
            lr_finder.plot(output='lr_finder.png')
            return

        # training loop
        best_valid_acc = 0
        for epoch in range(args.num_epochs):
            # when resuming, skip the epochs up to and including the loaded one
            if args.load_epoch is not None:
                if epoch <= args.load_epoch:
                    continue
            print('-' * 50)
            _logger.info('Epoch #%d training' % epoch)
            train(model, loss_func, opt, scheduler, train_loader, dev)
            if args.model_prefix:
                dirname = os.path.dirname(args.model_prefix)
                if dirname and not os.path.exists(dirname):
                    os.makedirs(dirname)
                # always store the unwrapped module's weights
                state_dict = model.module.state_dict() if isinstance(
                    model, torch.nn.DataParallel) else model.state_dict()
                torch.save(state_dict,
                           args.model_prefix + '_epoch-%d_state.pt' % epoch)
                torch.save(
                    opt.state_dict(),
                    args.model_prefix + '_epoch-%d_optimizer.pt' % epoch)

            _logger.info('Epoch #%d validating' % epoch)
            valid_acc = evaluate(model, val_loader, dev, loss_func=loss_func)
            if valid_acc > best_valid_acc:
                best_valid_acc = valid_acc
                if args.model_prefix:
                    shutil.copy2(
                        args.model_prefix + '_epoch-%d_state.pt' % epoch,
                        args.model_prefix + '_best_acc_state.pt')
                    torch.save(model, args.model_prefix + '_best_acc_full.pt')
            _logger.info(
                'Epoch #%d: Current validation acc: %.5f (best: %.5f)' %
                (epoch, valid_acc, best_valid_acc))
    else:
        # run prediction
        if args.model_prefix.endswith('.onnx'):
            _logger.info('Loading model %s for eval' % args.model_prefix)
            from utils.nn.tools import evaluate_onnx
            test_acc, scores, labels, observers = evaluate_onnx(
                args.model_prefix, test_loader)
        else:
            # a bare prefix means "use the best-accuracy state saved by training"
            model_path = args.model_prefix if args.model_prefix.endswith(
                '.pt') else args.model_prefix + '_best_acc_state.pt'
            _logger.info('Loading model %s for eval' % model_path)
            model.load_state_dict(torch.load(model_path, map_location=dev))
            if gpus is not None and len(gpus) > 1:
                model = torch.nn.DataParallel(model, device_ids=gpus)
            model = model.to(dev)
            test_acc, scores, labels, observers = evaluate(model,
                                                           test_loader,
                                                           dev,
                                                           for_training=False)
        _logger.info('Test acc %.5f' % test_acc)

        if args.predict_output:
            # same guard as for the ONNX path: a bare file name has no
            # directory component and `os.makedirs('')` would raise
            output_dir = os.path.dirname(args.predict_output)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
            if args.predict_output.endswith('.root'):
                from utils.data.fileio import _write_root
                output = {}
                # one boolean branch and one score branch per label value
                for idx, label_name in enumerate(data_config.label_value):
                    output[label_name] = (
                        labels[data_config.label_names[0]] == idx)
                    output['score_' + label_name] = scores[:, idx]
                # remaining labels and observers are copied through when 1-d
                for k, v in labels.items():
                    if k == data_config.label_names[0]:
                        continue
                    if v.ndim > 1:
                        _logger.warning('Ignoring %s, not a 1d array.', k)
                        continue
                    output[k] = v
                for k, v in observers.items():
                    if v.ndim > 1:
                        _logger.warning('Ignoring %s, not a 1d array.', k)
                        continue
                    output[k] = v
                _write_root(args.predict_output, output)
            else:
                import awkward
                output = {'scores': scores}
                output.update(labels)
                output.update(observers)
                awkward.save(args.predict_output, output, mode='w')

            _logger.info('Written output to %s' % args.predict_output)
예제 #5
0
    def run(self):
        """Compile the model, run the LR finder and save its plots and data.

        Steps, in order: print the GPU device name, optionally load weights
        from ``config.checkpoint``, build the optimizer named in the config,
        compile the model, run ``LRFinder.find_generator`` over the training
        flow, then write four PNG plots and ``lr_loss.csv`` into
        ``config.experiment_dir``.
        """
        # check GPU
        print("Device: ", tf.test.gpu_device_name())

        cfg = self.config

        # load from checkpoint
        if cfg.checkpoint:
            self.model.load_weights(cfg.checkpoint, by_name=True)

        # create optimizer: the class is resolved from its dotted name
        optimizer_cls = pydoc.locate(cfg.optimizer.name)
        optimizer = optimizer_cls(**cfg.optimizer.args)

        # load data
        train_flow = self.data.train

        # compilation arguments
        compile_args = {"loss": cfg.loss, "metrics": ["accuracy"]}
        self.model.compile(optimizer=optimizer, **compile_args)

        # spread the requested iterations over whole passes of the flow
        epochs = ceil(cfg.iterations / len(train_flow))
        steps_per_epoch = cfg.iterations // epochs

        lr_finder = LRFinder(self.model)
        lr_finder.find_generator(
            train_flow,
            start_lr=cfg.start_lr,
            end_lr=cfg.end_lr,
            epochs=epochs,
            steps_per_epoch=steps_per_epoch,
            callbacks=self.callbacks,
            workers=cfg.workers,
            verbose=cfg.verbose,
        )

        def _save_current_figure(name_template):
            # name_template has one `{}` slot for the iteration count
            filename = name_template.format(cfg.iterations)
            plt.savefig(os.path.join(cfg.experiment_dir, filename))

        lr_finder.plot_loss(n_skip_beginning=10, n_skip_end=5)
        _save_current_figure("loss_{}it.png")

        lr_finder.plot_loss_change(
            sma=20, n_skip_beginning=10, n_skip_end=5, y_lim=(-0.01, 0.01)
        )
        _save_current_figure("loss_change_{}it.png")

        lr_finder.plot_exp_loss()
        _save_current_figure("exp_loss_{}it.png")

        lr_finder.plot_exp_loss_change()
        _save_current_figure("exp_loss_change_{}it.png")

        # raw (lr, loss) pairs for later inspection
        history = pd.DataFrame({"lr": lr_finder.lrs, "loss": lr_finder.losses})
        history.to_csv(os.path.join(cfg.experiment_dir, "lr_loss.csv"))
예제 #6
0
def main(args):
    """Run training and/or testing according to the parsed ``args``.

    The function returns early for the utility modes (``io_test``, ``print``,
    ``profile``, ``export_onnx`` and ``lr_finder``). Otherwise, unless
    ``args.predict`` is set, it trains for ``args.num_epochs`` epochs, saving
    per-epoch model/optimizer states and tracking the best validation metric.
    When ``args.data_test`` is non-empty it then evaluates every test loader
    and, if ``args.predict_output`` is set, writes the outputs to disk.
    """
    arg_lines = '\n - '.join(str(item) for item in args.__dict__.items())
    _logger.info('args:\n - %s', arg_lines)

    if args.file_fraction < 1:
        _logger.warning(
            'Use of `file-fraction` is not recommended in general -- prefer using `data-fraction` instead.'
        )

    # classification/regression mode: bind `train`/`evaluate` to the matching
    # implementations
    if args.regression_mode:
        _logger.info('Running in regression mode')
        from utils.nn.tools import train_regression as train
        from utils.nn.tools import evaluate_regression as evaluate
    else:
        _logger.info('Running in classification mode')
        from utils.nn.tools import train_classification as train
        from utils.nn.tools import evaluate_classification as evaluate

    # training/testing mode
    training_mode = not args.predict

    # device: first listed GPU is the primary device, CPU when none is given
    gpus = [int(i) for i in args.gpus.split(',')] if args.gpus else None
    dev = torch.device('cpu') if gpus is None else torch.device(gpus[0])

    # load data
    if training_mode:
        (train_loader, val_loader, data_config, train_input_names,
         train_label_names) = train_load(args)
    else:
        test_loaders, data_config = test_load(args)

    if args.io_test:
        if training_mode:
            data_loader = train_loader
        else:
            # test loaders are stored as factories; build the first one
            first_factory = list(test_loaders.values())[0]
            data_loader = first_factory()
        iotest(args, data_loader)
        return

    model, model_info, network_module, network_options = model_setup(
        args, data_config)

    if args.print:
        return

    if args.profile:
        profile(args, model, model_info, device=dev)
        return

    # export to ONNX
    if args.export_onnx:
        onnx(args, model, data_config, model_info)
        return

    tb = None
    if args.tensorboard:
        from utils.nn.tools import TensorboardHelper
        tb = TensorboardHelper(tb_comment=args.tensorboard,
                               tb_custom_fn=args.tensorboard_custom_fn)

    # note: we should always save/load the state_dict of the original model, not the one wrapped by nn.DataParallel
    # so we do not convert it to nn.DataParallel now
    orig_model = model

    if training_mode:
        model = orig_model.to(dev)

        # loss function: fall back to cross entropy when the network module
        # does not provide `get_loss`
        try:
            loss_func = network_module.get_loss(data_config, **network_options)
            _logger.info('Using loss function %s with options %s' %
                         (loss_func, network_options))
        except AttributeError:
            loss_func = torch.nn.CrossEntropyLoss()
            _logger.warning(
                'Loss function not defined in %s. Will use `torch.nn.CrossEntropyLoss()` by default.',
                args.network_config)

        # optimizer & learning rate
        opt, scheduler = optim(args, model, dev)

        # multi-gpu
        use_data_parallel = gpus is not None and len(gpus) > 1
        if use_data_parallel:
            # model becomes `torch.nn.DataParallel` w/ model.module being the original `torch.nn.Module`
            model = torch.nn.DataParallel(model, device_ids=gpus)
        model = model.to(dev)

        # lr finder: keep it after all other setups
        if args.lr_finder is not None:
            lr_spec = args.lr_finder.replace(' ', '')
            start_lr, end_lr, num_iter = lr_spec.split(',')
            from utils.lr_finder import LRFinder
            lr_finder = LRFinder(model,
                                 opt,
                                 loss_func,
                                 device=dev,
                                 input_names=train_input_names,
                                 label_names=train_label_names)
            lr_finder.range_test(train_loader,
                                 start_lr=float(start_lr),
                                 end_lr=float(end_lr),
                                 num_iter=int(num_iter))
            # to inspect the loss-learning rate graph
            lr_finder.plot(output='lr_finder.png')
            return

        scaler = None
        if args.use_amp:
            from torch.cuda.amp import GradScaler
            scaler = GradScaler()

        # training loop
        # regression minimises the metric, classification maximises it
        best_valid_metric = np.inf if args.regression_mode else 0
        for epoch in range(args.num_epochs):
            # when resuming, skip epochs up to and including the loaded one
            if args.load_epoch is not None and epoch <= args.load_epoch:
                continue
            print('-' * 50)
            _logger.info('Epoch #%d training' % epoch)
            train(model,
                  loss_func,
                  opt,
                  scheduler,
                  train_loader,
                  dev,
                  epoch,
                  steps_per_epoch=args.steps_per_epoch,
                  grad_scaler=scaler,
                  tb_helper=tb)
            if args.model_prefix:
                dirname = os.path.dirname(args.model_prefix)
                if dirname and not os.path.exists(dirname):
                    os.makedirs(dirname)
                # always store the unwrapped module's weights
                if isinstance(model, torch.nn.DataParallel):
                    state_dict = model.module.state_dict()
                else:
                    state_dict = model.state_dict()
                epoch_state_path = args.model_prefix + '_epoch-%d_state.pt' % epoch
                epoch_opt_path = args.model_prefix + '_epoch-%d_optimizer.pt' % epoch
                torch.save(state_dict, epoch_state_path)
                torch.save(opt.state_dict(), epoch_opt_path)

            _logger.info('Epoch #%d validating' % epoch)
            valid_metric = evaluate(model,
                                    val_loader,
                                    dev,
                                    epoch,
                                    loss_func=loss_func,
                                    steps_per_epoch=args.steps_per_epoch_val,
                                    tb_helper=tb)
            if args.regression_mode:
                is_best_epoch = valid_metric < best_valid_metric
            else:
                is_best_epoch = valid_metric > best_valid_metric
            if is_best_epoch:
                best_valid_metric = valid_metric
                if args.model_prefix:
                    shutil.copy2(
                        args.model_prefix + '_epoch-%d_state.pt' % epoch,
                        args.model_prefix + '_best_epoch_state.pt')
                    torch.save(model,
                               args.model_prefix + '_best_epoch_full.pt')
            _logger.info(
                'Epoch #%d: Current validation metric: %.5f (best: %.5f)' %
                (epoch, valid_metric, best_valid_metric),
                color='bold')

    if args.data_test:
        if training_mode:
            # drop the training loaders before building the test ones
            del train_loader, val_loader
            test_loaders, data_config = test_load(args)

        is_onnx_model = args.model_prefix.endswith('.onnx')

        if not is_onnx_model:
            model = orig_model.to(dev)
            # a bare prefix means "use the best-epoch state saved by training"
            if args.model_prefix.endswith('.pt'):
                model_path = args.model_prefix
            else:
                model_path = args.model_prefix + '_best_epoch_state.pt'
            _logger.info('Loading model %s for eval' % model_path)
            model.load_state_dict(torch.load(model_path, map_location=dev))
            if gpus is not None and len(gpus) > 1:
                model = torch.nn.DataParallel(model, device_ids=gpus)
            model = model.to(dev)

        for name, make_test_loader in test_loaders.items():
            test_loader = make_test_loader()
            # run prediction
            if is_onnx_model:
                _logger.info('Loading model %s for eval' % args.model_prefix)
                from utils.nn.tools import evaluate_onnx
                test_metric, scores, labels, observers = evaluate_onnx(
                    args.model_prefix, test_loader)
            else:
                test_metric, scores, labels, observers = evaluate(
                    model,
                    test_loader,
                    dev,
                    epoch=None,
                    for_training=False,
                    tb_helper=tb)
            _logger.info('Test metric %.5f' % test_metric, color='bold')
            del test_loader

            if not args.predict_output:
                continue

            # a bare file name is placed under `<model dir>/predict_output/`
            if '/' not in args.predict_output:
                args.predict_output = os.path.join(
                    os.path.dirname(args.model_prefix), 'predict_output',
                    args.predict_output)
            os.makedirs(os.path.dirname(args.predict_output), exist_ok=True)

            # named test sets get `_<name>` inserted before the extension
            if name != '':
                base, ext = os.path.splitext(args.predict_output)
                output_path = base + '_' + name + ext
            else:
                output_path = args.predict_output

            if output_path.endswith('.root'):
                save_root(args, output_path, data_config, scores, labels,
                          observers)
            else:
                save_awk(args, output_path, scores, labels, observers)
            _logger.info('Written output to %s' % output_path, color='bold')
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Enable Multi-GPU training
if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
    net = nn.DataParallel(net)

# The only augmentation is a resize to imsize x imsize.
augs_train = iaa.Sequential([
    iaa.Scale((imsize, imsize), 0),
])


db_train = dataloader.SurfaceNormalsDataset(
    input_dir='data/datasets/train/milk-bottles-train/resized-files/preprocessed-rgb-imgs',
    label_dir='data/datasets/train/milk-bottles-train/resized-files/preprocessed-camera-normals',
    transform=augs_train,
    input_only=None,
)


trainLoader = DataLoader(db_train, batch_size=p['trainBatchSize'], shuffle=True, num_workers=32, drop_last=True)


# %matplotlib inline

# Run the range test on the device selected above. The finder used to be
# hard-coded to "cuda", which ignored the CPU fallback computed at the top.
lr_finder = LRFinder(net, optimizer, criterion, device=device)
lr_finder.range_test(trainLoader, end_lr=1, num_iter=100)
lr_finder.plot()
plt.show()
예제 #8
0

# In[10]:


# DataLoader settings used for the LR range test
params = dict(batch_size=128, shuffle=True, num_workers=8)

# Model, optimizer (starting from a very small lr) and L1 criterion
model = DenseNet().to(device)
optimizer = AdamW(model.parameters(),
                  lr=1e-7,
                  eps=1e-8,
                  weight_decay=1e-4)
criterion = nn.L1Loss()
train_dataloader_lr = data.DataLoader(LANL_Dataset_LR(train_df), **params)

# Run the range test, show the curve, then call reset() on the finder
lr_find = LRFinder(model, optimizer, criterion)
lr_find.range_test(train_dataloader_lr)
lr_find.plot()
lr_find.reset()


# In[12]:


def LANL_train(model,
               dataloaders,
               optimizer,
               criterion=nn.L1Loss(),
               num_epochs=1000,
               patience=300,
               snapshot_path='./snapshots',
예제 #9
0
 def lr_search(self, model, batches, val_batches, end_lr, num_iter, step_mode, log):
     """Run an LR range test for ``model`` and plot the resulting curve.

     The finder is built from ``model``, ``model.optimizer`` and an MSE
     loss on the ``"cuda"`` device. ``batches`` is the training loader and
     ``val_batches`` is forwarded as ``val_loader``; ``end_lr``,
     ``num_iter`` and ``step_mode`` are passed straight to ``range_test``.

     NOTE(review): ``log`` is accepted but never read here, and the plot is
     always drawn with ``log_lr=False`` -- confirm whether that is intended.
     """
     optimizer = model.optimizer
     criterion = nn.MSELoss()
     finder = LRFinder(model, optimizer, criterion, device="cuda")

     range_test_options = {
         'val_loader': val_batches,
         'end_lr': end_lr,
         'num_iter': num_iter,
         'step_mode': step_mode,
     }
     finder.range_test(batches, **range_test_options)
     finder.plot(log_lr=False)
예제 #10
0
class Processor():
    """Assemble and run one phase of a CIFAR-100 experiment.

    ``args.phase`` selects what is built and what :meth:`start` runs. The
    phases handled here are ``'train'``, ``'test'``, ``'visualize'`` and
    ``'lr_finder'``. The constructor saves the arguments to
    ``<work_dir>/config.yaml`` and then builds, in order: the logger, the
    data loaders, the model, the evaluator, the criterion/optimizer and the
    phase-specific runner object.
    """

    # Channel-wise statistics handed to transforms.Normalize for the
    # train and test pipelines.
    _NORM_MEAN = (0.485, 0.456, 0.406)
    _NORM_STD = (0.229, 0.224, 0.225)

    # Key prefix found in state dicts saved from a torch.nn.DataParallel
    # wrapped model.
    _DATA_PARALLEL_PREFIX = 'module.'

    def __init__(self, args):
        """Build every component required by ``args.phase``.

        Args:
            args: Namespace-like object holding the run configuration
                (``phase``, ``model``, ``work_dir``, ``batch_size`` ...).
        """
        self.args = args
        self._save_arg()

        # Only the train/visualize phases create a TensorBoard writer, but
        # the tester is also handed ``self.writer``. Default it to None so
        # the 'test' phase no longer fails with AttributeError.
        self.writer = None
        if self.args.phase in ('train', 'visualize'):
            self._load_logger()

        # NOTE(review): no loader is created for the 'lr_finder' phase, so
        # LRFinder receives an empty dict -- confirm whether it needs
        # the training loader.
        self.data_loader = {}
        if self.args.phase == 'train':
            self._load_train_data()
            self._load_test_data()
        elif self.args.phase == 'test':
            self._load_test_data()
        elif self.args.phase == 'visualize':
            self._load_visualize_data()

        self._load_model()
        self._load_evaluator()

        if self.args.phase in ('train', 'lr_finder'):
            self._load_criterion()
            self._load_optimizer()

        if self.args.phase == 'train':
            self._load_trainer()
        elif self.args.phase == 'lr_finder':
            self._load_lr_finder()
        elif self.args.phase == 'test':
            self._load_tester()
        elif self.args.phase == 'visualize':
            self._load_visualizer()

    def _load_logger(self):
        """Create a TensorBoard writer under ``runs/<model>/<iso timestamp>``."""
        from tensorboardX import SummaryWriter
        self.writer = SummaryWriter(
            log_dir=os.path.join('runs', self.args.model,
                                 datetime.now().isoformat()))

    def _make_cifar100_loader(self, train, transform, shuffle):
        """Return a DataLoader over the CIFAR-100 split stored in ``./data``.

        Args:
            train: True for the training split, False for the test split.
            transform: torchvision transform applied to every image.
            shuffle: Whether the loader shuffles the samples.
        """
        import torchvision

        dataset = torchvision.datasets.CIFAR100(root='./data',
                                                train=train,
                                                transform=transform)
        return torch.utils.data.DataLoader(dataset,
                                           batch_size=self.args.batch_size,
                                           shuffle=shuffle,
                                           num_workers=self.args.workers,
                                           pin_memory=True)

    def _load_train_data(self):
        """Register the augmented, shuffled training loader as ``data_loader['train']``."""
        print('loading train data...')

        import torchvision.transforms as transforms

        train_transform = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(15),
            transforms.ToTensor(),
            transforms.Normalize(self._NORM_MEAN, self._NORM_STD)
        ])
        self.data_loader['train'] = self._make_cifar100_loader(
            train=True, transform=train_transform, shuffle=True)

        print('train data load finished!')

    def _load_test_data(self):
        """Register the normalized, unshuffled test loader as ``data_loader['test']``."""
        print('loading test data...')

        import torchvision.transforms as transforms

        test_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(self._NORM_MEAN, self._NORM_STD)
        ])
        self.data_loader['test'] = self._make_cifar100_loader(
            train=False, transform=test_transform, shuffle=False)

        print('test data load finished!')

    def _load_visualize_data(self):
        """Register the un-normalized test split as ``data_loader['visualize']``."""
        print('loading visualize data...')

        import torchvision.transforms as transforms

        # No normalization here: images are only converted to tensors.
        visualize_transform = transforms.Compose([transforms.ToTensor()])
        self.data_loader['visualize'] = self._make_cifar100_loader(
            train=False, transform=visualize_transform, shuffle=False)

        print('visualize data load finished!')

    @staticmethod
    def _strip_data_parallel_prefix(state_dict):
        """Return a copy of ``state_dict`` without a leading ``module.`` on keys.

        Keys that do not start with the prefix are kept unchanged, so both
        wrapped and plain checkpoints can be loaded.
        """
        prefix = Processor._DATA_PARALLEL_PREFIX
        stripped = collections.OrderedDict()
        for key, value in state_dict.items():
            if key.startswith(prefix):
                key = key[len(prefix):]
            stripped[key] = value
        return stripped

    def _load_model(self):
        """Instantiate the network and optionally restore a checkpoint.

        Sets ``self.model`` and ``self.policies`` (the model parameters).
        When ``args.resume`` points to an existing file, its ``state_dict``
        is loaded and ``args.start_epoch`` is set from the checkpoint.

        Raises:
            ValueError: If ``args.model`` is neither ``'resnet'`` nor
                ``'densenet'``.
        """
        print('loading model...')

        if self.args.model == 'resnet':
            from models.resnet import resnet
            self.model = resnet(**self.args.model_args)
        elif self.args.model == 'densenet':
            from models.densenet import densenet
            self.model = densenet(**self.args.model_args)
        else:
            # Previously this fell through and failed below with an
            # AttributeError on ``self.model``.
            raise ValueError("unsupported model '{}'".format(self.args.model))

        self.policies = self.model.parameters()

        if self.args.resume:
            if os.path.isfile(self.args.resume):
                print(("=> loading checkpoint '{}'".format(self.args.resume)))
                # Deserialize onto the CPU so GPU-saved checkpoints also
                # open on CPU-only hosts; load_state_dict then copies the
                # values into the model's own parameters.
                checkpoint = torch.load(self.args.resume, map_location='cpu')
                state_dict = self._strip_data_parallel_prefix(
                    checkpoint['state_dict'])
                self.args.start_epoch = checkpoint['epoch']
                self.model.load_state_dict(state_dict)
                print(("=> loaded checkpoint '{}' (epoch {})".format(
                    self.args.resume, checkpoint['epoch'])))
            else:
                print(("=> no checkpoint found at '{}'".format(
                    self.args.resume)))

        print('model load finished!')

    def _load_evaluator(self):
        """Create the project ``Evaluator`` and store it as ``self.evaluator``."""
        print('loading evaluator...')

        from utils.evaluator import Evaluator
        self.evaluator = Evaluator()

        print('evaluator load finished!')

    def _load_criterion(self):
        """Use cross-entropy as the training loss (``self.criterion``)."""
        print('loading criterion...')

        self.criterion = nn.CrossEntropyLoss()

        print('criterion load finished!')

    def _load_optimizer(self):
        """Create ``self.optimizer`` and ``self.scheduler`` from ``args``.

        ``self.scheduler`` is None unless ``args.scheduler`` is
        ``'step_lr'`` or ``'multi_step_lr'``.

        Raises:
            ValueError: If ``args.optimizer`` names an unsupported optimizer.
        """
        print('loading optimizer...')

        import torch.optim as optim

        if self.args.optimizer == 'adadelta':
            self.optimizer = optim.Adadelta(self.policies,
                                            lr=self.args.lr,
                                            weight_decay=self.args.wd)
        elif self.args.optimizer == 'adagrad':
            self.optimizer = optim.Adagrad(self.policies,
                                           lr=self.args.lr,
                                           weight_decay=self.args.wd)
        elif self.args.optimizer == 'adam':
            self.optimizer = optim.Adam(self.policies,
                                        lr=self.args.lr,
                                        weight_decay=self.args.wd)
        elif self.args.optimizer == 'rmsprop':
            self.optimizer = optim.RMSprop(self.policies,
                                           lr=self.args.lr,
                                           momentum=self.args.momentum,
                                           weight_decay=self.args.wd)
        elif self.args.optimizer == 'sgd':
            self.optimizer = optim.SGD(self.policies,
                                       lr=self.args.lr,
                                       momentum=self.args.momentum,
                                       dampening=0,
                                       nesterov=self.args.nesterov,
                                       weight_decay=self.args.wd)
        elif self.args.optimizer == 'adabound':
            import adabound
            self.optimizer = adabound.AdaBound(self.policies,
                                               lr=self.args.lr,
                                               final_lr=self.args.final_lr)
        else:
            # Previously ``self.optimizer`` was silently left undefined.
            raise ValueError("unsupported optimizer '{}'".format(
                self.args.optimizer))

        # Always define the attribute: the trainer is handed
        # ``self.scheduler`` even when no scheduler was selected.
        self.scheduler = None
        if self.args.scheduler == 'step_lr':
            self.scheduler = optim.lr_scheduler.StepLR(
                self.optimizer,
                step_size=self.args.step_size,
                gamma=0.2,
                last_epoch=-1)
        elif self.args.scheduler == 'multi_step_lr':
            self.scheduler = optim.lr_scheduler.MultiStepLR(
                self.optimizer,
                milestones=self.args.milestones,
                gamma=0.2,
                last_epoch=-1)

        print('optimizer load finished!')

    def _load_trainer(self):
        """Create the project ``Trainer`` from the components built so far."""
        print('loading trainer...')

        from trainer import Trainer
        self.trainer = Trainer(self.args, self.writer, self.data_loader,
                               self.model, self.evaluator, self.criterion,
                               self.scheduler, self.optimizer)

        print('trainer load finished!')

    def _load_lr_finder(self):
        """Create the project ``LRFinder`` used by the ``'lr_finder'`` phase."""
        print('loading lr finder...')

        from utils.lr_finder import LRFinder
        self.lr_finder = LRFinder(self.args, self.data_loader, self.model,
                                  self.criterion, self.optimizer)

        print('lr finder load finished!')

    def _load_tester(self):
        """Create the project ``Tester``; ``self.writer`` is None in the test phase."""
        print('loading tester...')

        from tester import Tester
        self.tester = Tester(self.args, self.writer, self.data_loader,
                             self.model, self.evaluator)

        print('tester load finished!')

    def _load_visualizer(self):
        """Create the project ``Visualizer`` for the ``'visualize'`` phase."""
        print('loading visualizer...')

        from utils.visualizer import Visualizer
        self.visualizer = Visualizer(self.writer, self.model, self.args.mode)

        print('visualizer load finished!')

    def start(self):
        """Run the phase selected by ``args.phase``.

        Enables cuDNN benchmark mode, logs the parameters, then starts the
        trainer, LR finder or tester, or visualizes the batch whose index
        equals ``args.sample_idx``.

        Raises:
            ValueError: If ``args.phase`` is not a supported phase.
        """
        import torch.backends.cudnn as cudnn
        cudnn.benchmark = True
        self._print_log('Parameters:\n{}\n'.format(str(vars(self.args))))
        if self.args.phase == 'train':
            self.trainer.start()
        elif self.args.phase == 'lr_finder':
            self.lr_finder.start()
        elif self.args.phase == 'test':
            self.tester.start()
        elif self.args.phase == 'visualize':
            for batch_idx, (image, _) in enumerate(
                    self.data_loader['visualize'], 0):
                if batch_idx == self.args.sample_idx:
                    self.visualizer.start(image)
                    # Batch indices are unique, so nothing after this one
                    # can match; stop instead of draining the loader.
                    break
        else:
            raise ValueError("unsupported phase '{}'".format(self.args.phase))

    def _print_log(self, s):
        """Print ``s`` and, if ``args.print_log`` is set, append it to ``<work_dir>/log.txt``."""
        print(s)
        if self.args.print_log:
            with open('{}/log.txt'.format(self.args.work_dir), 'a') as f:
                print(s, file=f)

    def _save_arg(self):
        """Dump the run arguments to ``<work_dir>/config.yaml``, creating the directory if needed."""
        arg_dict = vars(self.args)
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(self.args.work_dir, exist_ok=True)
        with open('{}/config.yaml'.format(self.args.work_dir), 'w') as f:
            yaml.dump(arg_dict, f)
예제 #11
0
파일: train.py 프로젝트: AlexDeMoor/weaver
def main(args):
    """Entry point: run an I/O test, ONNX export, LR range test, training or prediction.

    What runs is decided by ``args``:

    * ``args.io_test``: run ``iotest`` on the selected loader and return.
    * ``args.export_onnx``: run ``onnx`` on the freshly built model and return.
    * ``args.predict`` unset (training mode): build loss/optimizer, optionally
      resume from ``args.load_epoch``, optionally run the LR finder and
      return, otherwise train for ``args.num_epochs`` epochs with per-epoch
      checkpoints and validation.
    * ``args.predict`` set: call ``predict_model`` on the test loader.

    Args:
        args: Parsed command-line arguments.
    """
    _logger.info(args)

    if args.file_fraction < 1:
        _logger.warning(
            'Use of `file-fraction` is not recommended in general -- prefer using `data-fraction` instead.'
        )

    # training/testing mode
    training_mode = not args.predict

    # device
    # `args.gpus` is a comma-separated list of device indices; the first one
    # becomes the primary device, an empty value selects the CPU.
    if args.gpus:
        gpus = [int(i) for i in args.gpus.split(',')]
        dev = torch.device(gpus[0])
    else:
        gpus = None
        dev = torch.device('cpu')

    # load data
    if training_mode:
        train_loader, val_loader, data_config, train_input_names, train_label_names = train_load(
            args)
    else:
        test_loader, data_config = test_load(args)

    # I/O test only: exercise the data loader and stop before building a model
    if args.io_test:
        data_loader = train_loader if training_mode else test_loader
        iotest(args, data_loader)
        return

    model, model_info, network_module = _model(args, data_config)

    # export to ONNX
    if args.export_onnx:
        onnx(args, model, data_config, model_info)
        return

    # note: we should always save/load the state_dict of the original model, not the one wrapped by nn.DataParallel
    # so we do not convert it to nn.DataParallel now
    model = model.to(dev)

    if training_mode:
        # loss function
        # NOTE(review): `network_options` is not assigned anywhere in this
        # function; presumably it is a module-level name -- confirm, since a
        # NameError here would not be caught by `except AttributeError`.
        try:
            loss_func = network_module.get_loss(data_config, **network_options)
            _logger.info(loss_func)
        except AttributeError:
            loss_func = torch.nn.CrossEntropyLoss()
            _logger.warning(
                'Loss function not defined in %s. Will use `torch.nn.CrossEntropyLoss()` by default.',
                args.network_config)

        # optimizer & learning rate
        opt, scheduler = optim(args, model)

        # load previous training and resume if `--load-epoch` is set
        if args.load_epoch is not None:
            _logger.info('Resume training from epoch %d' % args.load_epoch)
            model_state = torch.load(args.model_prefix +
                                     '_epoch-%d_state.pt' % args.load_epoch,
                                     map_location=dev)
            model.load_state_dict(model_state)
            opt_state = torch.load(args.model_prefix +
                                   '_epoch-%d_optimizer.pt' % args.load_epoch,
                                   map_location=dev)
            opt.load_state_dict(opt_state)

        # multi-gpu
        if gpus is not None and len(gpus) > 1:
            model = torch.nn.DataParallel(
                model, device_ids=gpus
            )  # model becomes `torch.nn.DataParallel` w/ model.module being the original `torch.nn.Module`
        model = model.to(dev)

        # lr finder: keep it after all other setups
        # `args.lr_finder` is parsed as "start_lr,end_lr,num_iter" (spaces
        # are stripped before splitting on commas).
        if args.lr_finder is not None:
            start_lr, end_lr, num_iter = args.lr_finder.replace(' ',
                                                                '').split(',')
            from utils.lr_finder import LRFinder
            lr_finder = LRFinder(model,
                                 opt,
                                 loss_func,
                                 device=dev,
                                 input_names=train_input_names,
                                 label_names=train_label_names)
            lr_finder.range_test(train_loader,
                                 start_lr=float(start_lr),
                                 end_lr=float(end_lr),
                                 num_iter=int(num_iter))
            lr_finder.plot(output='lr_finder.png'
                           )  # to inspect the loss-learning rate graph
            return

        # gradient scaler is only created when `args.use_amp` is set
        if args.use_amp:
            from torch.cuda.amp import GradScaler
            scaler = GradScaler()
        else:
            scaler = None

        # training loop
        best_valid_acc = 0
        for epoch in range(args.num_epochs):
            # when resuming, skip every epoch up to and including the loaded one
            if args.load_epoch is not None:
                if epoch <= args.load_epoch:
                    continue
            print('-' * 50)
            _logger.info('Epoch #%d training' % epoch)
            train(model,
                  loss_func,
                  opt,
                  scheduler,
                  train_loader,
                  dev,
                  grad_scaler=scaler)
            # save per-epoch model and optimizer state when a prefix is given
            if args.model_prefix:
                dirname = os.path.dirname(args.model_prefix)
                if dirname and not os.path.exists(dirname):
                    os.makedirs(dirname)
                # unwrap nn.DataParallel so the saved keys belong to the
                # underlying module
                state_dict = model.module.state_dict() if isinstance(
                    model, torch.nn.DataParallel) else model.state_dict()
                torch.save(state_dict,
                           args.model_prefix + '_epoch-%d_state.pt' % epoch)
                torch.save(
                    opt.state_dict(),
                    args.model_prefix + '_epoch-%d_optimizer.pt' % epoch)

            _logger.info('Epoch #%d validating' % epoch)
            valid_acc = evaluate(model, val_loader, dev, loss_func=loss_func)
            # on a new best validation accuracy, copy this epoch's state file
            # and additionally save the whole model object
            if valid_acc > best_valid_acc:
                best_valid_acc = valid_acc
                if args.model_prefix:
                    shutil.copy2(
                        args.model_prefix + '_epoch-%d_state.pt' % epoch,
                        args.model_prefix + '_best_acc_state.pt')
                    torch.save(model, args.model_prefix + '_best_acc_full.pt')
            _logger.info(
                'Epoch #%d: Current validation acc: %.5f (best: %.5f)' %
                (epoch, valid_acc, best_valid_acc))
    else:
        # run prediction
        predict_model(args, test_loader, model, dev, data_config, gpus)