def main():
    args = EigenmetricRegressionSolver.get_eigenmetric_regression_arguments()
    model = AlbertForEigenmetricRegression.from_scratch(
        num_labels=len(ALBERT_EIGENMETRICS),
        top_comment_pretrained_model_name_or_path=args.top_comment_pretrained_model_name_or_path,
        post_pretrained_model_name_or_path=args.post_pretrained_model_name_or_path,
        classifier_dropout_prob=args.classifier_dropout_prob,
        meta_data_size=len(ALBERT_META_FEATURES),
        subreddit_pretrained_path=args.subreddit_pretrained_path,
        num_subreddit_embeddings=NUM_SUBREDDIT_EMBEDDINGS,
        subreddit_embeddings_size=SUBREDDIT_EMBEDDINGS_SIZE)
    if args.freeze_alberts:
        model = model.freeze_bert()

    save_dict = {
        "model_construct_params_dict": model.param_dict(),
        "state_dict": model.state_dict()
    }

    logx.initialize(logdir=args.output_dir,
                    coolname=True,
                    tensorboard=False,
                    no_timestamp=False,
                    eager_flush=True)

    logx.save_model(save_dict, metric=0, epoch=0, higher_better=False)
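The checkpoint saved above bundles the constructor arguments with the weights, so the model can later be rebuilt without re-specifying its hyperparameters. A minimal reload sketch, not part of the original example: it assumes from_scratch accepts the keys returned by param_dict() and that the file is named last_checkpoint_ep0.pth, the pattern runx uses in the later examples.

import torch

# Hypothetical reload of the checkpoint written above; the filename and the
# **construct-params unpacking are assumptions, not shown in the original.
ckpt = torch.load(f"{args.output_dir}/last_checkpoint_ep0.pth", map_location="cpu")
model = AlbertForEigenmetricRegression.from_scratch(**ckpt["model_construct_params_dict"])
model.load_state_dict(ckpt["state_dict"])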
Example #2
    def __init__(self, input_dir: Path, output_dir: Path, model: nn.Module,
                 device: torch.device, per_gpu_batch_size: int, n_gpu: int,
                 batch_size: int, learning_rate: float, weight_decay: float,
                 n_epoch: int, seed: int, **kwargs):
        # construct param dict
        self.construct_param_dict = OrderedDict({
            "input_dir": str(input_dir),
            "output_dir": str(output_dir),
            "learning_rate": learning_rate,
            "n_epoch": n_epoch,
            "per_gpu_batch_size": per_gpu_batch_size,
            "weight_decay": weight_decay,
            "seed": seed
        })

        self.subreddit_embedding_device = torch.device("cuda")

        # build log
        logx.initialize(logdir=output_dir,
                        coolname=True,
                        tensorboard=True,
                        no_timestamp=False,
                        hparams={
                            "solver_hparams": self.state_dict(),
                            "model_hparams": model.param_dict()
                        },
                        eager_flush=True)
        # arguments
        self.input_dir = input_dir
        self.output_dir = output_dir

        # training utilities
        self.model = model

        # data utilities
        self.train_dataloader = kwargs.pop("train_dataloader", None)
        self.dev_dataloader = kwargs.pop("dev_dataloader", None)
        self.batch_size = batch_size

        self.n_epoch = n_epoch
        self.seed = seed
        # device
        self.device = device
        self.n_gpu = n_gpu
        logx.msg(f'Number of GPU: {self.n_gpu}.')

        self.criterion = nn.MSELoss()

        # optimizer and scheduler
        if self.train_dataloader:
            self.optimizer, self.scheduler = self.get_optimizer(
                named_parameters=self.model.named_parameters(),
                learning_rate=learning_rate,
                weight_decay=weight_decay,
                train_dataloader=self.train_dataloader,
                n_epoch=n_epoch)
        # set up random seeds and model location
        self.setup()
Example #3
    def __init__(self, input_dir, output_dir, model, device,
                 per_gpu_batch_size, n_gpu, batch_size, learning_rate,
                 weight_decay, n_epoch, seed, top_k, **kwargs):
        # construct param dict
        self.construct_param_dict = OrderedDict({
            "input_dir": str(input_dir),
            "output_dir": str(output_dir),
            "learning_rate": learning_rate,
            "n_epoch": n_epoch,
            "per_gpu_batch_size": per_gpu_batch_size,
            "weight_decay": weight_decay,
            "seed": seed,
            "top_k": top_k,
        })

        # build log
        logx.initialize(logdir=output_dir,
                        coolname=True,
                        tensorboard=True,
                        no_timestamp=False,
                        hparams={
                            "solver_construct_dict": self.construct_param_dict,
                            "model_construct_dict": model.model_construct_dict
                        },
                        eager_flush=True)
        # arguments
        self.record_training_loss_per_epoch = kwargs.pop(
            "record_training_loss_per_epoch", False)
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.top_k = top_k
        # training utilities
        self.model = model

        # data utilities
        self.train_dataloader = kwargs.pop("train_dataloader", None)
        self.dev_dataloader = kwargs.pop("dev_dataloader", None)
        self.batch_size = batch_size

        self.n_epoch = n_epoch
        self.seed = seed
        # device
        self.device = device
        self.n_gpu = n_gpu
        logx.msg(f'Number of GPU: {self.n_gpu}.')

        self.criterion = kl_div_add_mse_loss

        # optimizer and scheduler
        if self.train_dataloader:
            self.optimizer, self.scheduler = self.get_optimizer(
                named_parameters=self.model.named_parameters(),
                learning_rate=learning_rate,
                weight_decay=weight_decay,
                train_dataloader=self.train_dataloader,
                n_epoch=n_epoch)
        # set up random seeds and model location
        self.setup()
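kl_div_add_mse_loss, used as the criterion above, is defined elsewhere in this project. Purely as an illustration of what such a combined objective could look like, here is a sketch under assumed weighting, reduction, and softmax handling (not the project's actual definition):

import torch.nn.functional as F

def kl_div_add_mse_loss_sketch(pred_logits, target_probs, mse_weight=1.0):
    # Hypothetical combination: KL divergence between the predicted and target
    # distributions plus a weighted MSE term on the probabilities.
    kl = F.kl_div(F.log_softmax(pred_logits, dim=-1), target_probs,
                  reduction="batchmean")
    mse = F.mse_loss(F.softmax(pred_logits, dim=-1), target_probs)
    return kl + mse_weight * mse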
Example #4
    def __init__(self, input_dir: Path, output_dir: Path, model_construct_params_dict: OrderedDict, **kwargs):
        """
        api for xgboost
        :param args: model arguments
        :param input_dir: the path to the input directory
        :param output_dir: the path to save checkpoints
        (this parameter is also in args; add it as an explicit parameter to remind of this side effect)
        """

        self.construct_param_dict = \
            OrderedDict({
                "input_dir": str(input_dir),
                "output_dir": str(output_dir),
                "model_construct_params_dict": model_construct_params_dict,
            })

        # build log


        self.input_dir = input_dir
        self.output_dir = output_dir
        self.data = kwargs.pop("data", None)
        if model_construct_params_dict['xgb_model']:
            logx.initialize(logdir=output_dir,
                            coolname=True,
                            tensorboard=False,
                            no_timestamp=False,
                            hparams={"solver_hparams": self.construct_param_dict},
                            eager_flush=True)
            logx.msg(f"loaded models from {model_construct_params_dict['xgb_model']}")
            self.bst = xgb.Booster({'nthread': 32})
            self.bst.load_model(model_construct_params_dict['xgb_model'])
            logx.msg(f"loaded pretrained model from {model_construct_params_dict['xgb_model']}")
        else:
            logx.initialize(logdir=output_dir,
                            coolname=True,
                            tensorboard=True,
                            no_timestamp=False,
                            hparams={"solver_hparams": self.construct_param_dict},
                            eager_flush=True)
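Per the docstring, this constructor either resumes a saved booster (when model_construct_params_dict['xgb_model'] is set) or starts fresh. A hedged usage sketch; the class name XgboostSolver and the paths are illustrative, not taken from the project:

from collections import OrderedDict
from pathlib import Path

# Hypothetical instantiation: xgb_model=None takes the train-from-scratch branch
# above, while a file path would load the pretrained booster instead.
solver = XgboostSolver(
    input_dir=Path("data/processed"),
    output_dir=Path("runs/xgb"),
    model_construct_params_dict=OrderedDict({"xgb_model": None}))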
Example #5
def main():
    args = parser.parse_args()

    logx.initialize(logdir=args.logdir, tensorboard=True, hparams=vars(args))

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker,
                 nprocs=ngpus_per_node,
                 args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)
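mp.spawn passes the process index as the first positional argument, followed by the unpacked args=(...) tuple above. A minimal compatible main_worker skeleton; the rank arithmetic mirrors the standard PyTorch distributed example and is an assumption, not taken from this script:

def main_worker(gpu, ngpus_per_node, args):
    # `gpu` is the local process index injected by mp.spawn, or args.gpu when
    # main_worker is called directly in the non-distributed branch above.
    args.gpu = gpu
    if args.distributed:
        # hypothetical global rank: node rank * GPUs per node + local index
        args.rank = args.rank * ngpus_per_node + gpu
    # model construction, data loading, and the training loop would follow here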
Example #6
def train():
    for time in range(5):
        logx.initialize(get_logdir("../runs"),
                        tensorboard=True,
                        coolname=False)

        model.load_state_dict(
            torch.load("..\\runs\exp10\last_checkpoint_ep0.pth")
            ['state_dict'])  # warmup

        dataset_train = TrainDataset(
            '../' + cfg.root_folder +
            '/five_fold/train_kfold_{}.csv'.format(time),
            '../' + cfg.root_folder + '/train/', train_transform)
        train_loader = DataLoader(dataset_train,
                                  batch_size=cfg.bs,
                                  shuffle=True)
        test_data = TrainDataset(
            '../' + cfg.root_folder +
            '/five_fold/test_kfold_{}.csv'.format(time),
            '../' + cfg.root_folder + '/train/',
        )
        test_load = DataLoader(test_data, batch_size=cfg.bs, shuffle=False)

        # train
        for epoch in range(cfg.epoch):
            loss_epoch = 0
            total = 0
            correct = 0
            for i, (x, y) in enumerate(train_loader, 1):
                x, y = x.to(device), y.to(device)
                y_hat = model(x)
                # compute running accuracy
                total += x.size(0)
                _, predict = torch.max(y_hat.data, dim=1)
                correct += (predict == y).sum().item()

                # loss
                loss = criterion(y_hat, y)
                loss_epoch += loss.item()
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # progress logging
                if i % 30 == 0:
                    print(
                        'epoch:%d,  enumerate:%d,  loss_avg:%f,  now_acc:%f' %
                        (epoch, i, loss_epoch / i, correct / total))

            # log per-epoch metrics
            train_loss = loss_epoch / i
            train_acc = (correct / total) * 100
            logx.metric('train', {'loss': train_loss, 'acc': train_acc}, epoch)

            # validation
            # accuracy on the dev set
            correct = 0
            total = 0
            val_loss = 0
            with torch.no_grad():
                for i, (img, label) in enumerate(test_load, 1):
                    img, label = img.to(device), label.to(device)
                    output = model(img)
                    loss = criterion(output, label)
                    val_loss += loss.cpu().item()
                    _, predicted = torch.max(output.data, dim=1)  # (max value, index)
                    total += img.size(0)
                    correct += (predicted == label).sum().item()
            val_acc = (100 * correct / total)
            val_loss /= i
            logx.metric('val', {'loss': val_loss, 'acc': val_acc}, epoch)
            # per-epoch loss and other metrics
            print(
                'epoch over; train_loss:%f, val_loss:%f, train_acc=%f, val_acc:%f'
                % (train_loss, val_loss, train_acc, val_acc))
            logx.save_model({
                'state_dict': model.state_dict(),
                'epoch': epoch
            },
                            val_acc,
                            higher_better=True,
                            epoch=epoch,
                            delete_old=True)
            scheduler.step()
Example #7

if __name__ == '__main__':
    args = Opts().init()

    # load the dataset
    train_loader, n_train, properties = get_dataset(args=args, flag='train')
    val_loader, n_val, _ = get_dataset(args=args, flag='val')
    mean, std = properties[0], properties[1]

    # criterion
    criterion = nn.CrossEntropyLoss() if args.n_classes > 1 else args.loss_function

    # initialize the information
    logx.initialize(logdir=args.dir_log, coolname=True, tensorboard=True)
    logx.msg('Start training...\n')

    table = PrettyTable(["key", "value"])
    table.align = 'l'
    infos = {
        'vis': args.vis,
        'seed': args.seed,
        'epoch': args.epochs,
        'data aug': args.aug,
        'resume': args.resume,
        'optimizer': args.optim,
        'dataset': args.dataset,
        'training size': n_train,
        'validation size': n_val,
        'learning rate': args.lr,
Example #8
def main():
    """
    Main Function
    """
    if AutoResume:
        AutoResume.init()

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=False,
                    hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    auto_resume_details = None
    if AutoResume:
        auto_resume_details = AutoResume.get_resume_details()

    if auto_resume_details:
        checkpoint_fn = auto_resume_details.get("RESUME_FILE", None)
        checkpoint = torch.load(checkpoint_fn,
                                map_location=torch.device('cpu'))
        args.result_dir = auto_resume_details.get("TENSORBOARD_DIR", None)
        args.start_epoch = int(auto_resume_details.get("EPOCH", None)) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = ("Found details of a requested auto-resume: checkpoint={}"
               " tensorboard={} at epoch {}")
        logx.msg(msg.format(checkpoint_fn, args.result_dir, args.start_epoch))
    elif args.resume:
        checkpoint = torch.load(args.resume, map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = "Resuming from: checkpoint={}, epoch {}, arch {}"
        logx.msg(msg.format(args.resume, args.start_epoch, args.arch))
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        msg = "Loading weights from: checkpoint={}".format(args.snapshot)
        logx.msg(msg)

    net = network.get_net(args, criterion)
    optim, scheduler = get_optimizer(args, net)

    net = network.wrap_network_in_dataparallel(net, args.apex)

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        restore_net(net, checkpoint)

    if args.init_decoder:
        net.module.init_mods()

    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        scheduler.step(args.start_epoch)

    if args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        # validate(val_loader, net, criterion=None, optim=None, epoch=0,
        #          calc_metrics=False, dump_assets=args.dump_assets,
        #          dump_all_images=True)
        if not os.path.exists(args.result_dir + 'image_2/'):
            os.mkdir(args.result_dir + 'image_2/')
        if not os.path.exists(args.result_dir + 'image_3/'):
            os.mkdir(args.result_dir + 'image_3/')

        num_image = 7481
        for idx in tqdm(range(num_image)):
            sample_idx = "%06d" % idx
            eval_minibatch(sample_idx, "image_2/", net, args)
            eval_minibatch(sample_idx, "image_3/", net, args)

        return 0
    elif args.eval is not None:
        raise ValueError('unknown eval option {}'.format(args.eval))
Example #9
import os

import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
# from torch.cuda.amp import autocast, GradScaler
from torchvision.datasets import ImageFolder
from runx.logx import logx

from utils import Config, get_model
from dataset import Caltech, get_tfms

config = Config()
logdir = config.exp_name
logx.initialize(logdir, coolname=True, tensorboard=True)


def train_epoch(epoch):
    model.train()
    losses = 0.0
    total, correct = 0.0, 0.0
    for step, (x, y) in enumerate(train_loader):
        x, y = x.to(config.device), y.to(config.device)
        out = model(x)
        loss = criterion(out, y)
        losses += loss.cpu().detach().numpy()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
Example #10
from data_loader import Train_Dataset
import torch
import torch.nn as nn
from torch import optim
from torchvision import transforms, models
from torch.utils.data import DataLoader
import argparse
from runx.logx import logx
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logx.initialize("./logs/exp9", coolname=True, tensorboard=True)


def train(model, loader, optimizer, scheduler=None):
    print('now LR: ' + str(optimizer.param_groups[0]['lr']))
    model.train()
    total, correct, train_loss = 0.0, 0.0, 0.0
    for step, data in enumerate(loader):
        x, y = data
        x, y = x.to(device), y.to(device)
        out = model(x)
        # print("size:", x.size(), y.size(), out.size())

        loss = nn.CrossEntropyLoss()(out, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if scheduler:
            scheduler.step()
Example #11
def main():
    """
    Main Function
    """
    rank = args.rank
    cfg.GLOBAL_RANK = rank
    args.gpus = torch.cuda.device_count()
    device = torch.device("cpu")
    loc_dist = True if args.gpus > 1 else False
    loc_rank = rank % args.gpus
    args.gpu = loc_rank
    args.local_rank = loc_rank
    if loc_dist:
        device = "cuda:" + str(loc_rank)
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "19500"
        os.environ["NCCL_SOCKET_IFNAME"] = "ib"
        torch.cuda.set_device(device)
        torch.distributed.init_process_group(backend="nccl",
                                             rank=loc_rank,
                                             world_size=args.gpus)
        # torch.cuda.set_device(device)
    elif args.gpus == 1:
        args.gpus = torch.cuda.device_count()
        device = "cuda:0"
        args.local_rank = 0
        torch.cuda.set_device(device)

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=True,
                    hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    #     args.ngpu = torch.cuda.device_count()
    #     args.best_record = {'mean_iu': -1, 'epoch': 0}

    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    cwd = os.getcwd()
    sz = ht.MPI_WORLD.size
    filename = cwd + "/citys-heat-checkpoint-" + str(sz) + ".pth.tar"
    if args.resume and os.path.isfile(filename):
        checkpoint = torch.load(filename, map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        logx.msg(f"Resuming from: checkpoint={args.resume}, "
                 f"epoch {args.start_epoch}, arch {args.arch}")
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        logx.msg(f"Loading weights from: checkpoint={args.snapshot}")

    net = network.get_net(args, criterion)
    net = net.to(device)
    # args.lr = (1. / args.world_size * (5 * (args.world_size - 1) / 6.)) * 0.0125 * args.world_size
    optim, scheduler = get_optimizer(args, net)

    # the scheduler in this code is only run at the end of each epoch
    # todo: make heat an option not this whole file
    # if args.heat:
    dp_optim = ht.optim.DASO(
        local_optimizer=optim,
        total_epochs=args.max_epoch,
        max_global_skips=4,
    )
    #if args.no_cycling:
    dp_optim.disable_cycling(global_skips=args.batch_skip,
                             batches_to_wait=args.gs)
    # this is where the network is wrapped with DDDP (w/apex) or DP
    htnet = ht.nn.DataParallelMultiGPU(net,
                                       comm=ht.MPI_WORLD,
                                       optimizer=dp_optim)

    if args.summary:
        print(str(net))
        from thop import profile
        img = torch.randn(1, 3, 1024, 2048).cuda()
        mask = torch.randn(1, 1, 1024, 2048).cuda()
        macs, params = profile(net, inputs={'images': img, 'gts': mask})
        print0(f'macs {macs} params {params}')
        sys.exit()

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
        dp_optim.stability.load_dict(checkpoint["skip_stable"])
    if args.restore_net:
        #restore_net(net, checkpoint)
        htnet.load_state_dict(checkpoint["state_dict"])
        #dp_optim.module.load_state_dist(checkpoint["state_dict"])
    # htnet = ht.nn.DataParallelMultiGPU(net, ht.MPI_WORLD, dp_optim)

    if args.init_decoder:
        net.module.init_mods()

    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        # TODO: need a loss value for the restart at a certain epoch...
        scheduler.step(args.start_epoch)

    # There are 4 options for evaluation:
    #  --eval val                           just run validation
    #  --eval val --dump_assets             dump all images and assets
    #  --eval folder                        just dump all basic images
    #  --eval folder --dump_assets          dump all images and assets
    # todo: HeAT fixes -- not urgent --
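    # Example invocations of these modes (hypothetical: the entry-point script
    # name is assumed; the flags themselves come from the options listed above):
    #   python train.py --eval val --dump_assets --result_dir runs/eval
    #   python train.py --eval folder --result_dir runs/dump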
    if args.eval == 'val':
        if args.dump_topn:
            validate_topn(val_loader, net, criterion_val, optim, 0, args)
        else:
            validate(val_loader,
                     net,
                     criterion=criterion_val,
                     optim=optim,
                     epoch=0,
                     dump_assets=args.dump_assets,
                     dump_all_images=args.dump_all_images,
                     calc_metrics=not args.no_metrics)
        return 0
    elif args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        validate(val_loader,
                 net,
                 criterion=None,
                 optim=None,
                 epoch=0,
                 calc_metrics=False,
                 dump_assets=args.dump_assets,
                 dump_all_images=True)
        return 0
    elif args.eval is not None:
        raise ValueError('unknown eval option {}'.format(args.eval))

    scaler = amp.GradScaler()
    if dp_optim.comm.rank == 0:
        print("scheduler", args.lr_schedule)
    dp_optim.add_scaler(scaler)

    nodes = str(int(dp_optim.comm.size / torch.cuda.device_count()))
    cwd = os.getcwd()
    fname = cwd + "/" + nodes + "-heat-citys-benchmark"
    if args.resume and rank == 0 and os.path.isfile(fname + ".pkl"):
        with open(fname + ".pkl", "rb") as f:
            out_dict = pickle.load(f)
    else:
        out_dict = {
            "epochs": [],
            nodes + "-avg-batch-time": [],
            nodes + "-total-train-time": [],
            nodes + "-train-loss": [],
            nodes + "-val-loss": [],
            nodes + "-val-iou": [],
            nodes + "-val-time": [],
        }
        print0("Output dict:", fname)

    for epoch in range(args.start_epoch, args.max_epoch):
        # todo: HeAT fixes -- possible conflict between processes
        update_epoch(epoch)

        if args.only_coarse:  # default: false
            train_obj.only_coarse()
            train_obj.build_epoch()
        elif args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.disable_coarse()
                train_obj.build_epoch()
            else:
                train_obj.build_epoch()
        else:
            pass

        ls, bt, btt = train(train_loader, htnet, dp_optim, epoch, scaler)
        dp_optim.epoch_loss_logic(ls, loss_globally_averaged=True)

        # if epoch % args.val_freq == 0:
        vls, iu, vtt = validate(val_loader, htnet, criterion_val, dp_optim,
                                epoch)
        if args.lr_schedule == "plateau":
            if dp_optim.comm.rank == 0:
                print("loss", ls, 'best:',
                      scheduler.best * (1. - scheduler.threshold),
                      scheduler.num_bad_epochs)
            scheduler.step(ls)  # val_loss)
        else:
            scheduler.step()

        if args.rank == 0:
            save_checkpoint({
                "epoch": epoch + 1,
                "arch": args.arch,
                "state_dict": htnet.state_dict(),
                "optimizer": optim.state_dict(),
                "skip_stable": dp_optim.stability.get_dict()
            })

        out_dict["epochs"].append(epoch)
        out_dict[nodes + "-train-loss"].append(ls)
        out_dict[nodes + "-avg-batch-time"].append(bt)
        out_dict[nodes + "-total-train-time"].append(btt)
        out_dict[nodes + "-val-loss"].append(vls)
        out_dict[nodes + "-val-iou"].append(iu)
        out_dict[nodes + "-val-time"].append(vtt)

        if args.rank == 0:
            save_obj(out_dict, fname)

    if args.rank == 0:
        print("\nRESULTS\n")
        import pandas as pd
        df = pd.DataFrame.from_dict(out_dict).set_index("epochs")
        with pd.option_context("display.max_rows", None, "display.max_columns",
                               None):
            # more options can be specified also
            print(df)
        if args.benchmarking:
            try:
                fulldf = pd.read_csv(cwd + "/heat-bench-results.csv")
                fulldf = pd.concat([df, fulldf], axis=1)
            except FileNotFoundError:
                fulldf = df
            fulldf.to_csv(cwd + "/heat-bench-results.csv")
Example #12
def main():
    # Settings
    args = get_args()

    # cuda and devices
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    # kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    # init logx
    logx.initialize(logdir=args.logdir,
                    coolname=True,
                    tensorboard=True,
                    hparams=vars(args))

    # random seed
    set_random_seed(args.seed)

    # dataset and dataloader
    mean = [0.5, 0.5, 0.5]
    std = [0.5, 0.5, 0.5]
    normalize = transforms.Normalize(mean, std)
    transform = transforms.Compose([transforms.ToTensor(), normalize])

    train_dataset = Dynamic_Scenes_Dataset(root_dir=args.dataset_dir,
                                           is_training=True,
                                           crop=True,
                                           crop_size=(256, 256))
    val_dataset = Dynamic_Scenes_Dataset(root_dir=args.dataset_dir,
                                         is_training=False,
                                         crop=False)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=8,
                              pin_memory=True)
    val_loader = DataLoader(val_dataset,
                            batch_size=args.test_batch_size,
                            shuffle=False,
                            num_workers=1,
                            pin_memory=True)

    # model architecture
    if args.model_arch == 0:
        model = AHDRNet()
    elif args.model_arch == 1:
        model = AHDR(6, 5, 64, 32)
    else:
        logx.msg("This model is not yet implemented!\n")
        return

    if args.init_weights:
        init_parameters(model)
    if args.load:
        model.load_state_dict(torch.load(args.load, map_location=device))
        logx.msg(f'Model loaded from {args.load}')
    model.to(device)
    # # log graph
    # dummy_input = torch.from_numpy(np.random.rand(1, 6, 256, 256)).float().to(device)
    # logx.add_graph(model, input_to_model=(dummy_input, dummy_input, dummy_input))

    # loss function and optimizer
    if args.loss_func == 0:
        criterion = nn.L1Loss()
    elif args.loss_func == 1:
        criterion = nn.MSELoss()
    else:
        logx.msg("Error loss functions.\n")
        return
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    num_parameters = sum(
        torch.numel(parameter) for parameter in model.parameters())

    logx.msg(f'''Starting training:
        Model Paras:     {num_parameters}
        Epochs:          {args.epochs}
        Batch size:      {args.batch_size}
        Loss function:   {args.loss_func}
        Learning rate:   {args.lr}
        Training size:   {len(train_loader)}
        Device:          {device.type}
        Dataset dir:     {args.dataset_dir}
        ''')

    for epoch in range(1, args.epochs + 1):
        adjust_learning_rate(args, optimizer, epoch)
        train(args, model, device, train_loader, optimizer, epoch, criterion)
        validation(args, model, device, val_loader, optimizer, epoch,
                   criterion)
Example #13
def main():
    """
    Main Function
    """
    rank = args.rank
    cfg.GLOBAL_RANK = rank
    args.gpus = torch.cuda.device_count()
    device = torch.device("cpu")
    hvd.init()

    torch.manual_seed(999999)
    #if args.cuda:
    args.cuda = True
    # Horovod: pin GPU to local rank.
    torch.cuda.set_device(hvd.local_rank())
    #torch.cuda.manual_seed(args.seed)

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=True,
                    hparams=vars(args),
                    global_rank=args.global_rank)
    #print("vefore assert and infer")
    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    #     args.ngpu = torch.cuda.device_count()
    #     args.best_record = {'mean_iu': -1, 'epoch': 0}
    #print("before datasets / loss")
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    cwd = os.getcwd()
    sz = ht.MPI_WORLD.size
    filename = cwd + "/citys-hvd-checkpoint-" + str(sz) + ".pth.tar"
    if args.resume and os.path.isfile(filename):
        checkpoint = torch.load(filename, map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        logx.msg(f"Resuming from: checkpoint={args.resume}, " \
                 f"epoch {args.start_epoch}, arch {args.arch}")
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        logx.msg(f"Loading weights from: checkpoint={args.snapshot}")

    # todo: HeAT fixes -- urgent -- DDDP / optim / scheduler
    net = network.get_net(args, criterion)
    # net = net.to(device)

    # todo: optim -> direct wrap after this, scheduler stays the same?
    optim, scheduler = get_optimizer(args, net)

    # if args.fp16:
    #     net, optim = amp.initialize(net, optim, opt_level=args.amp_opt_level)
    compression = hvd.Compression.fp16  # if args.fp16_allreduce else hvd.Compression.none

    optim = hvd.DistributedOptimizer(
        optim,
        named_parameters=net.named_parameters(),
        compression=compression,
        backward_passes_per_step=1,  # args.batches_per_allreduce,
        op=hvd.Average,
        gradient_predivide_factor=1.0,  # args.gradient_predivide_factor)
    )
    #print("after hvd optimizer setup")

    if args.summary:
        print(str(net))
        from thop import profile
        img = torch.randn(1, 3, 1024, 2048).cuda()
        mask = torch.randn(1, 1, 1024, 2048).cuda()
        macs, params = profile(net, inputs={'images': img, 'gts': mask})
        print0(f'macs {macs} params {params}')
        sys.exit()

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        #net.load_state_dict(checkpoint["state_dict"])
        restore_net(net, checkpoint)

    if args.init_decoder:
        net.module.init_mods()

    torch.cuda.empty_cache()
    #print("before parameter broadcasts")
    #hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    #hvd.broadcast_optimizer_state(optim, root_rank=0)

    if args.start_epoch != 0:
        # TODO: need a loss value for the restart at a certain epoch...
        scheduler.step(args.start_epoch)

    #net = net.cuda()
    # There are 4 options for evaluation:
    #  --eval val                           just run validation
    #  --eval val --dump_assets             dump all images and assets
    #  --eval folder                        just dump all basic images
    #  --eval folder --dump_assets          dump all images and assets
    # todo: HeAT fixes -- not urgent --
    # if args.eval == 'val':
    #     if args.dump_topn:
    #         validate_topn(val_loader, net, criterion_val, optim, 0, args)
    #     else:
    #         validate(val_loader, net, criterion=criterion_val, optim=optim, epoch=0,
    #                  dump_assets=args.dump_assets,
    #                  dump_all_images=args.dump_all_images,
    #                  calc_metrics=not args.no_metrics)
    #     return 0
    # elif args.eval == 'folder':
    #     # Using a folder for evaluation means to not calculate metrics
    #     validate(val_loader, net, criterion=None, optim=None, epoch=0,
    #              calc_metrics=False, dump_assets=args.dump_assets,
    #              dump_all_images=True)
    #     return 0
    # elif args.eval is not None:
    #     raise 'unknown eval option {}'.format(args.eval)

    scaler = None  #amp.GradScaler()
    args.amp = False  #True

    nodes = str(int(hvd.size() / torch.cuda.device_count()))
    cwd = os.getcwd()
    fname = cwd + "/" + nodes + "-hvd-citys-benchmark"
    if args.resume and rank == 0 and os.path.isfile(fname + ".pkl"):
        with open(fname + ".pkl", "rb") as f:
            out_dict = pickle.load(f)
    else:
        out_dict = {
            "epochs": [],
            nodes + "-avg-batch-time": [],
            nodes + "-total-train-time": [],
            nodes + "-train-loss": [],
            nodes + "-val-loss": [],
            nodes + "-val-iou": [],
            nodes + "-val-time": [],
        }
        print0("Output dict:", fname)
    # train_losses, train_btimes, train_ttime = [], [], []
    # val_losses, val_iu, val_ttime = [], [], []

    for epoch in range(args.start_epoch, args.max_epoch):
        # todo: HeAT fixes -- possible conflict between processes
        update_epoch(epoch)

        if args.only_coarse:  # default: false
            train_obj.only_coarse()
            train_obj.build_epoch()
        elif args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.disable_coarse()
                train_obj.build_epoch()
            else:
                train_obj.build_epoch()
        else:
            pass

        ls, bt, btt = train(train_loader, net, optim, epoch, scaler)
        # dp_optim.epoch_loss_logic(ls, loss_globally_averaged=True)

        # if epoch % args.val_freq == 0:
        vls, iu, vtt = validate(val_loader, net, criterion_val, optim, epoch)
        if args.lr_schedule == "plateau":
            scheduler.step(ls)  # val_loss)
        else:
            scheduler.step()

        if args.rank == 0:
            save_checkpoint({
                "epoch": epoch + 1,
                "arch": args.arch,
                "state_dict": net.state_dict(),
                "optimizer": optim.state_dict(),
                # "skip_stable": optim.stability.get_dict()
            })

        out_dict["epochs"].append(epoch)
        out_dict[nodes + "-train-loss"].append(ls)
        out_dict[nodes + "-avg-batch-time"].append(bt)
        out_dict[nodes + "-total-train-time"].append(btt)
        out_dict[nodes + "-val-loss"].append(vls)
        out_dict[nodes + "-val-iou"].append(iu)
        out_dict[nodes + "-val-time"].append(vtt)
        if args.rank == 0:
            save_obj(out_dict, fname)

    if args.rank == 0:
        print("\nRESULTS\n")
        import pandas as pd
        df = pd.DataFrame.from_dict(out_dict).set_index("epochs")
        with pd.option_context("display.max_rows", None, "display.max_columns",
                               None):
            # more options can be specified also
            print(df)
        if args.benchmarking:
            try:
                fulldf = pd.read_csv(cwd + "/hvd-bench-results.csv")
                fulldf = pd.concat([df, fulldf], axis=1)
            except FileNotFoundError:
                fulldf = df
            fulldf.to_csv(cwd + "/hvd-bench-results.csv")
Example #14
            correct += (pred == y).squeeze().sum().cpu().numpy()
    valid_acc = correct / total
    print("valid accuracy", valid_acc)
    logx.metric('val', {
        'loss': valid_loss,
        'accuracy': valid_acc
    },
                epoch=epoch)
    return valid_acc


for k in range(5):
    print("training for fold {}".format(k))
    start_epoch = 0
    logx.initialize(os.path.join(log_dir, 'fold_{}'.format(k)),
                    coolname=True,
                    tensorboard=True)
    # data loading
    desc_train = os.path.join(cfg.ds_folder, 'new_train_{}.csv'.format(k))
    desc_valid = os.path.join(cfg.ds_folder, 'new_valid_{}.csv'.format(k))

    train_data = TrainDataset(desc_train,
                              data_folder=os.path.join(cfg.ds_folder,
                                                       "train/"),
                              transform=transform_train)
    valid_data = TrainDataset(desc_valid,
                              data_folder=os.path.join(cfg.ds_folder,
                                                       "train/"),
                              transform=transform_test)

    # build the DataLoaders
Example #15
def main():
    """
    Main Function
    """
    if AutoResume:
        AutoResume.init()

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=True, hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    train_loader, val_loader, train_obj = \
        datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    auto_resume_details = None
    if AutoResume:
        auto_resume_details = AutoResume.get_resume_details()

    if auto_resume_details:
        checkpoint_fn = auto_resume_details.get("RESUME_FILE", None)
        checkpoint = torch.load(checkpoint_fn,
                                map_location=torch.device('cpu'))
        args.result_dir = auto_resume_details.get("TENSORBOARD_DIR", None)
        args.start_epoch = int(auto_resume_details.get("EPOCH", None)) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = ("Found details of a requested auto-resume: checkpoint={}"
               " tensorboard={} at epoch {}")
        logx.msg(msg.format(checkpoint_fn, args.result_dir,
                            args.start_epoch))
    elif args.resume:
        checkpoint = torch.load(args.resume,
                                map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = "Resuming from: checkpoint={}, epoch {}, arch {}"
        logx.msg(msg.format(args.resume, args.start_epoch, args.arch))
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH', cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        msg = "Loading weights from: checkpoint={}".format(args.snapshot)
        logx.msg(msg)

    net = network.get_net(args, criterion)
    optim, scheduler = get_optimizer(args, net)

    if args.fp16:
        net, optim = amp.initialize(net, optim, opt_level=args.amp_opt_level)

    net = network.wrap_network_in_dataparallel(net, args.apex)

    if args.summary:
        print(str(net))
        from pytorchOpCounter.thop import profile
        img = torch.randn(1, 3, 1024, 2048).cuda()
        mask = torch.randn(1, 1, 1024, 2048).cuda()
        macs, params = profile(net, inputs={'images': img, 'gts': mask})
        print(f'macs {macs} params {params}')
        sys.exit()

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        restore_net(net, checkpoint)

    if args.init_decoder:
        net.module.init_mods()

    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        scheduler.step(args.start_epoch)

    # There are 4 options for evaluation:
    #  --eval val                           just run validation
    #  --eval val --dump_assets             dump all images and assets
    #  --eval folder                        just dump all basic images
    #  --eval folder --dump_assets          dump all images and assets
    if args.eval == 'val':

        if args.dump_topn:
            validate_topn(val_loader, net, criterion_val, optim, 0, args)
        else:
            validate(val_loader, net, criterion=criterion_val, optim=optim, epoch=0,
                     dump_assets=args.dump_assets,
                     dump_all_images=args.dump_all_images,
                     calc_metrics=not args.no_metrics)
        return 0
    elif args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        validate(val_loader, net, criterion=None, optim=None, epoch=0,
                 calc_metrics=False, dump_assets=args.dump_assets,
                 dump_all_images=True)
        return 0
    elif args.eval is not None:
        raise ValueError('unknown eval option {}'.format(args.eval))

    for epoch in range(args.start_epoch, args.max_epoch):
        update_epoch(epoch)

        if args.only_coarse:
            train_obj.only_coarse()
            train_obj.build_epoch()
            if args.apex:
                train_loader.sampler.set_num_samples()

        elif args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.disable_coarse()
                train_obj.build_epoch()
                if args.apex:
                    train_loader.sampler.set_num_samples()
            else:
                train_obj.build_epoch()
        else:
            pass

        train(train_loader, net, optim, epoch)

        if args.apex:
            train_loader.sampler.set_epoch(epoch + 1)

        if epoch % args.val_freq == 0:
            validate(val_loader, net, criterion_val, optim, epoch)

        scheduler.step()

        if check_termination(epoch):
            return 0
Example #16
def main():
    """
    Main Function
    """
    if AutoResume:
        AutoResume.init()

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=True,
                    hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    train_loader, val_loader, train_obj = \
        datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    auto_resume_details = None
    if AutoResume:
        auto_resume_details = AutoResume.get_resume_details()

    if auto_resume_details:
        checkpoint_fn = auto_resume_details.get("RESUME_FILE", None)
        checkpoint = torch.load(checkpoint_fn,
                                map_location=torch.device('cpu'))
        args.result_dir = auto_resume_details.get("TENSORBOARD_DIR", None)
        args.start_epoch = int(auto_resume_details.get("EPOCH", None)) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = ("Found details of a requested auto-resume: checkpoint={}"
               " tensorboard={} at epoch {}")
        logx.msg(msg.format(checkpoint_fn, args.result_dir, args.start_epoch))
    elif args.resume:
        checkpoint = torch.load(args.resume, map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = "Resuming from: checkpoint={}, epoch {}, arch {}"
        logx.msg(msg.format(args.resume, args.start_epoch, args.arch))
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        msg = "Loading weights from: checkpoint={}".format(args.snapshot)
        logx.msg(msg)

    #define the NASA optimizer parameter
    iter_tot = len(train_loader) * args.max_epoch
    #    tau = args.tau_factor/sqrt(iter_tot)
    tau = 1
    net = network.get_net(args, criterion)
    k = 1
    #    optim, scheduler = get_optimizer(args, net)
    optim, scheduler = get_optimizer(args, net, tau, k)
    # Visualize feature maps
    # activation = {}
    # def get_activation(name):
    #     def hook(model, input, output):
    #         activation[name] = output.detach()
    #     return hook

    # net.layer[0].register_forward_hook(get_activation('conv1'))
    # data, _ = dataset[0]
    # data.unsqueeze_(0)
    # output = model(data)

    # act = activation['conv1'].squeeze()
    # fig, axarr = plt.subplots(act.size(0))
    # for idx in range(act.size(0)):
    #     axarr[idx].imshow(act[idx])

    if args.fp16:
        net, optim = amp.initialize(net, optim, opt_level=args.amp_opt_level)

    net = network.wrap_network_in_dataparallel(net, args.apex)

    if args.summary:

        from thop import profile
        img = torch.randn(1, 3, 640, 640).cuda()
        mask = torch.randn(1, 1, 640, 640).cuda()
        macs, params = profile(net, inputs={'images': img, 'gts': mask})
        print(f'macs {macs} params {params}')
        sys.exit()

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        restore_net(net, checkpoint)

    if args.init_decoder:
        net.module.init_mods()

    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        scheduler.step(args.start_epoch)

    # There are 4 options for evaluation:
    #  --eval val                           just run validation
    #  --eval val --dump_assets             dump all images and assets
    #  --eval folder                        just dump all basic images
    #  --eval folder --dump_assets          dump all images and assets

    if args.eval == 'test':
        validate(val_loader,
                 net,
                 criterion=None,
                 optim=None,
                 epoch=0,
                 calc_metrics=False,
                 dump_assets=args.dump_assets,
                 dump_all_images=True,
                 testing=True,
                 grid=city)

        return 0

    if args.eval == 'val':

        if args.dump_topn:
            validate_topn(val_loader, net, criterion_val, optim, 0, args)
        else:
            validate(val_loader,
                     net,
                     criterion=criterion_val,
                     optim=optim,
                     epoch=0,
                     dump_assets=args.dump_assets,
                     dump_all_images=args.dump_all_images,
                     calc_metrics=not args.no_metrics)
        return 0
    elif args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        validate(val_loader,
                 net,
                 criterion=criterion_val,
                 optim=optim,
                 epoch=0,
                 calc_metrics=False,
                 dump_assets=args.dump_assets,
                 dump_all_images=True)
        return 0
    elif args.eval is not None:
        raise ValueError('unknown eval option {}'.format(args.eval))

    for epoch in range(args.start_epoch, args.max_epoch):
        update_epoch(epoch)

        if args.only_coarse:
            train_obj.only_coarse()
            train_obj.build_epoch()
            if args.apex:
                train_loader.sampler.set_num_samples()

        elif args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.disable_coarse()
                train_obj.build_epoch()
                if args.apex:
                    train_loader.sampler.set_num_samples()
            else:
                train_obj.build_epoch()
        else:
            pass

        train(train_loader, net, optim, epoch)

        if args.apex:
            train_loader.sampler.set_epoch(epoch + 1)

        if epoch % args.val_freq == 0:
            validate(val_loader, net, criterion_val, optim, epoch)

        scheduler.step()

        if check_termination(epoch):
            return 0
Example #17
#         logx.metric('val', metric_valid, epoch)
#         if valid_acc > best_accuracy:
#             best_accuracy = valid_acc
#             torch.save({'state_dict': my_model.state_dict()}, './logs/exp13/fold' + str(i) + '/highest_valid_acc.pth')
#         logx.save_model({'state_dict': my_model.state_dict()}, valid_loss, epoch, higher_better=False, delete_old=True)
#         print("current_acc:{0}, best_acc:{1}".format(valid_acc, best_accuracy))
#
#     print('------------------------')

i = 2
my_model = models.resnext50_32x4d(pretrained=False, zero_init_residual=True)
# my_model = models.resnet50(pretrained=False)
my_model = my_model.to(device)
optimizer = optim.Adam(my_model.parameters())
logx.initialize("./logs/exp14/resnext50+da+normlize",
                coolname=True,
                tensorboard=True)
train_dataset = Train_Dataset('./data/train_' + str(i) + '.csv',
                              './data/train',
                              transform=train_transformer)
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
valid_dataset = Train_Dataset('./data/valid_' + str(i) + '.csv',
                              './data/train',
                              transform=train_transformer)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=64)
print('model: ' + str(i) + ' || train_dataset: train_' + str(i) +
      ' || valid_dataset: valid_' + str(i))

best_accuracy = 0
for epoch in range(epochs):
    print('model: ' + str(i) + '||epoch: ' + str(epoch))
Example #18
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size',
                        type=int,
                        default=64,
                        metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=20,
                        metavar='N',
                        help='number of epochs to train (default: 20)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.5,
                        metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model',
                        action='store_true',
                        default=False,
                        help='For Saving the current Model')
    parser.add_argument('--logdir',
                        type=str,
                        default=None,
                        help='target log directory')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    logx.initialize(logdir=args.logdir,
                    coolname=True,
                    tensorboard=True,
                    hparams=vars(args))

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(datasets.MNIST(
        '../data',
        train=True,
        download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])),
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               **kwargs)
    test_loader = torch.utils.data.DataLoader(datasets.MNIST(
        '../data',
        train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])),
                                              batch_size=args.test_batch_size,
                                              shuffle=True,
                                              **kwargs)

    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(args, model, device, test_loader, epoch, optimizer)
Example #19
from data_loader_2 import Train_Dataset
import torch
import torch.nn as nn
from torch import optim
from torchvision import transforms, models
from torch.utils.data import DataLoader
import argparse
from runx.logx import logx
from loss import CrossEntropyLabelSmooth
import pandas as pd

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
logx.initialize("./logs/exp14/resnet50", coolname=True, tensorboard=True)


def train(model, loader, optimizer):
    # print('now LR: ' + str(optimizer.param_groups[0]['lr']))
    model.train()
    total, correct, train_loss = 0.0, 0.0, 0.0
    c = nn.CrossEntropyLoss().to(device)
    for step, data in enumerate(loader):
        x, y = data
        x, y = x.to(device), y.to(device)
        out = model(x)
        # print("size:", x.size(), y.size(), out.size())

        loss = c(out, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()