def main():
    """Construct a from-scratch ALBERT eigenmetric-regression model and save it
    as an untrained epoch-0 checkpoint under ``args.output_dir``."""
    args = EigenmetricRegressionSolver.get_eigenmetric_regression_arguments()

    # Assemble the constructor arguments once, then unpack them.
    model_kwargs = dict(
        num_labels=len(ALBERT_EIGENMETRICS),
        top_comment_pretrained_model_name_or_path=args.top_comment_pretrained_model_name_or_path,
        post_pretrained_model_name_or_path=args.post_pretrained_model_name_or_path,
        classifier_dropout_prob=args.classifier_dropout_prob,
        meta_data_size=len(ALBERT_META_FEATURES),
        subreddit_pretrained_path=args.subreddit_pretrained_path,
        num_subreddit_embeddings=NUM_SUBREDDIT_EMBEDDINGS,
        subreddit_embeddings_size=SUBREDDIT_EMBEDDINGS_SIZE,
    )
    model = AlbertForEigenmetricRegression.from_scratch(**model_kwargs)

    # Optionally freeze the ALBERT encoders so only the head would train later.
    if args.freeze_alberts:
        model = model.freeze_bert()

    checkpoint = {
        "model_construct_params_dict": model.param_dict(),
        "state_dict": model.state_dict()
    }
    logx.initialize(logdir=args.output_dir,
                    coolname=True,
                    tensorboard=False,
                    no_timestamp=False,
                    eager_flush=True)
    # metric=0 / epoch=0 are placeholders: this is an initial, untrained snapshot.
    logx.save_model(checkpoint, metric=0, epoch=0, higher_better=False)
def __init__(self, input_dir: Path, output_dir: Path, model: nn.Module,
             device: torch.device, per_gpu_batch_size: int, n_gpu: int,
             batch_size: int, learning_rate: float, weight_decay: float,
             n_epoch: int, seed: int, **kwargs):
    """Set up a regression training solver: record hyper-parameters, initialize
    runx logging, store data loaders, create the MSE loss, and (when a training
    loader is supplied) build the optimizer and scheduler.

    :param input_dir: directory containing the input data
    :param output_dir: directory where logs/checkpoints are written
    :param model: the model to train or evaluate
    :param device: device the solver should run on
    :param per_gpu_batch_size: batch size per GPU (recorded in hparams only)
    :param n_gpu: number of GPUs available
    :param batch_size: total batch size
    :param learning_rate: optimizer learning rate
    :param weight_decay: optimizer weight decay
    :param n_epoch: number of training epochs
    :param seed: random seed (applied in ``self.setup()``)
    :param kwargs: optional ``train_dataloader`` and ``dev_dataloader``
    """
    # construct param dict
    # Serializable record of the constructor arguments (used for hparam logging
    # and checkpoint reconstruction).
    self.construct_param_dict = OrderedDict({
        "input_dir": str(input_dir),
        "output_dir": str(output_dir),
        "learning_rate": learning_rate,
        "n_epoch": n_epoch,
        "per_gpu_batch_size": per_gpu_batch_size,
        "weight_decay": weight_decay,
        "seed": seed
    })
    # NOTE(review): hard-coded to "cuda" instead of the `device` argument —
    # confirm this is intentional; it will fail on CPU-only hosts.
    self.subreddit_embedding_device = torch.device("cuda")
    # build log
    logx.initialize(logdir=output_dir,
                    coolname=True,
                    tensorboard=True,
                    no_timestamp=False,
                    hparams={
                        "solver_hparams": self.state_dict(),
                        "model_hparams": model.param_dict()
                    },
                    eager_flush=True)
    # arguments
    self.input_dir = input_dir
    self.output_dir = output_dir
    # training utilities
    self.model = model
    # data utilities
    # Loaders are optional: an inference-only solver passes neither.
    self.train_dataloader = kwargs.pop("train_dataloader", None)
    self.dev_dataloader = kwargs.pop("dev_dataloader", None)
    self.batch_size = batch_size
    self.n_epoch = n_epoch
    self.seed = seed
    # device
    self.device = device
    self.n_gpu = n_gpu
    logx.msg(f'Number of GPU: {self.n_gpu}.')
    # Plain mean-squared-error objective for the regression targets.
    self.criterion = nn.MSELoss()
    # optimizer and scheduler — only built when there is data to train on
    if self.train_dataloader:
        self.optimizer, self.scheduler = self.get_optimizer(
            named_parameters=self.model.named_parameters(),
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            train_dataloader=self.train_dataloader,
            n_epoch=n_epoch)
    # set up random seeds and model location
    self.setup()
def __init__(self, input_dir, output_dir, model, device, per_gpu_batch_size,
             n_gpu, batch_size, learning_rate, weight_decay, n_epoch, seed,
             top_k, **kwargs):
    """Set up a top-k training solver: record hyper-parameters, initialize runx
    logging, store data loaders, pick the KL-divergence + MSE loss, and (when a
    training loader is supplied) build the optimizer and scheduler.

    :param input_dir: directory containing the input data
    :param output_dir: directory where logs/checkpoints are written
    :param model: the model to train or evaluate
    :param device: device the solver should run on
    :param per_gpu_batch_size: batch size per GPU (recorded in hparams only)
    :param n_gpu: number of GPUs available
    :param batch_size: total batch size
    :param learning_rate: optimizer learning rate
    :param weight_decay: optimizer weight decay
    :param n_epoch: number of training epochs
    :param seed: random seed (applied in ``self.setup()``)
    :param top_k: k used by the top-k objective/evaluation
    :param kwargs: optional ``train_dataloader``, ``dev_dataloader``,
        ``record_training_loss_per_epoch``
    """
    # construct param dict
    # Serializable record of the constructor arguments (used for hparam logging
    # and checkpoint reconstruction).
    self.construct_param_dict = OrderedDict({
        "input_dir": str(input_dir),
        "output_dir": str(output_dir),
        "learning_rate": learning_rate,
        "n_epoch": n_epoch,
        "per_gpu_batch_size": per_gpu_batch_size,
        "weight_decay": weight_decay,
        "seed": seed,
        "top_k": top_k,
    })
    # build log
    logx.initialize(logdir=output_dir,
                    coolname=True,
                    tensorboard=True,
                    no_timestamp=False,
                    hparams={
                        "solver_construct_dict": self.construct_param_dict,
                        "model_construct_dict": model.model_construct_dict
                    },
                    eager_flush=True)
    # arguments
    # When True, per-epoch training loss is also recorded (off by default).
    self.record_training_loss_per_epoch = kwargs.pop(
        "record_training_loss_per_epoch", False)
    self.input_dir = input_dir
    self.output_dir = output_dir
    self.top_k = top_k
    # training utilities
    self.model = model
    # data utilities
    # Loaders are optional: an inference-only solver passes neither.
    self.train_dataloader = kwargs.pop("train_dataloader", None)
    self.dev_dataloader = kwargs.pop("dev_dataloader", None)
    self.batch_size = batch_size
    self.n_epoch = n_epoch
    self.seed = seed
    # device
    self.device = device
    self.n_gpu = n_gpu
    logx.msg(f'Number of GPU: {self.n_gpu}.')
    # Combined KL-divergence + MSE objective (project helper).
    self.criterion = kl_div_add_mse_loss
    # optimizer and scheduler — only built when there is data to train on
    if self.train_dataloader:
        self.optimizer, self.scheduler = self.get_optimizer(
            named_parameters=self.model.named_parameters(),
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            train_dataloader=self.train_dataloader,
            n_epoch=n_epoch)
    # set up random seeds and model location
    self.setup()
def __init__(self, input_dir: Path, output_dir: Path,
             model_construct_params_dict: OrderedDict, **kwargs):
    """
    XGBoost solver wrapper.

    :param input_dir: the path to the input directory
    :param output_dir: the path to save checkpoints (this parameter is also in
        the construct params; kept explicit to remind of this side effect)
    :param model_construct_params_dict: model construction parameters; when its
        'xgb_model' entry is truthy, that pretrained booster is loaded and
        tensorboard logging is disabled
    :param kwargs: optional ``data`` payload
    """
    self.construct_param_dict = \
        OrderedDict({
            "input_dir": str(input_dir),
            "output_dir": str(output_dir),
            "model_construct_params_dict": model_construct_params_dict,
        })
    self.input_dir = input_dir
    self.output_dir = output_dir
    self.data = kwargs.pop("data", None)

    pretrained = model_construct_params_dict['xgb_model']
    # Tensorboard is only useful when training from scratch; a pretrained
    # booster run logs without it.
    logx.initialize(logdir=output_dir,
                    coolname=True,
                    tensorboard=not bool(pretrained),
                    no_timestamp=False,
                    hparams={"solver_hparams": self.construct_param_dict},
                    eager_flush=True)
    if pretrained:
        logx.msg(f"loaded models from {pretrained}")
        self.bst = xgb.Booster({'nthread': 32})
        self.bst.load_model(pretrained)
        logx.msg(f"loaded pretrained model from {pretrained}")
def main():
    """Parse arguments, seed RNGs if requested, and launch training either as a
    single worker or as one spawned process per GPU."""
    args = parser.parse_args()
    logx.initialize(logdir=args.logdir, tensorboard=True, hparams=vars(args))

    # Deterministic mode: seed every RNG and warn about the speed cost.
    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    # In env:// rendezvous mode, the world size comes from the environment.
    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])
    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    gpus_per_node = torch.cuda.device_count()
    if not args.multiprocessing_distributed:
        # Single-process path: run the worker directly in this process.
        main_worker(args.gpu, gpus_per_node, args)
        return

    # One process per GPU on this node; scale the world size accordingly and
    # let torch.multiprocessing fan out main_worker.
    args.world_size = gpus_per_node * args.world_size
    mp.spawn(main_worker, nprocs=gpus_per_node, args=(gpus_per_node, args))
def train():
    """Run 5-fold training: for each fold, warm-start from a fixed checkpoint,
    train for cfg.epoch epochs, validate, log metrics via runx, and save the
    best checkpoint by validation accuracy."""
    for time in range(5):
        # Fresh log directory per fold.
        logx.initialize(get_logdir("../runs"), tensorboard=True, coolname=False)
        # Warm-start every fold from the same pretrained checkpoint.
        # NOTE(review): backslashes in this Windows-style path rely on them not
        # forming escape sequences — confirm, a raw string would be safer.
        model.load_state_dict(
            torch.load("..\\runs\exp10\last_checkpoint_ep0.pth")
            ['state_dict'])  # warmup
        dataset_train = TrainDataset(
            '../' + cfg.root_folder +
            '/five_fold/train_kfold_{}.csv'.format(time),
            '../' + cfg.root_folder + '/train/', train_transform)
        train_loader = DataLoader(dataset_train,
                                  batch_size=cfg.bs,
                                  shuffle=True)
        test_data = TrainDataset(
            '../' + cfg.root_folder +
            '/five_fold/test_kfold_{}.csv'.format(time),
            '../' + cfg.root_folder + '/train/',
        )
        test_load = DataLoader(test_data, batch_size=cfg.bs, shuffle=False)
        # train
        for epoch in range(cfg.epoch):
            loss_epoch = 0
            total = 0
            correct = 0
            # enumerate from 1 so `i` can divide running sums directly.
            for i, (x, y) in enumerate(train_loader, 1):
                x, y = x.to(device), y.to(device)
                y_hat = model(x)
                # running accuracy
                total += x.size(0)
                _, predict = torch.max(y_hat.data, dim=1)
                correct += (predict == y).sum().item()
                # loss
                loss = criterion(y_hat, y)
                loss_epoch += loss.item()
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                # progress printout every 30 batches
                if i % 30 == 0:
                    print(
                        'epoch:%d, enumerate:%d, loss_avg:%f, now_acc:%f' %
                        (epoch, i, loss_epoch / i, correct / total))
            # per-epoch metrics; `i` is the last batch index from the loop above
            train_loss = loss_epoch / i
            train_acc = (correct / total) * 100
            logx.metric('train', {'loss': train_loss, 'acc': train_acc}, epoch)
            # validation accuracy on the held-out fold
            correct = 0
            total = 0
            val_loss = 0
            with torch.no_grad():
                for i, (img, label) in enumerate(test_load, 1):
                    img, label = img.to(device), label.to(device)
                    output = model(img)
                    loss = criterion(output, label)
                    val_loss += loss.cpu().item()
                    _, predicted = torch.max(output.data, dim=1)  # max value, index
                    total += img.size(0)
                    correct += (predicted == label).sum().item()
            val_acc = (100 * correct / total)
            val_loss /= i
            logx.metric('val', {'loss': val_loss, 'acc': val_acc}, epoch)
            # epoch loss and other metrics
            print(
                'epoch over; train_loss:%f, val_loss:%f, train_acc=%f, val_acc:%f'
                % (train_loss, val_loss, train_acc, val_acc))
            # keep only the best checkpoint by validation accuracy
            logx.save_model({
                'state_dict': model.state_dict(),
                'epoch': epoch
            },
                            val_acc,
                            higher_better=True,
                            epoch=epoch,
                            delete_old=True)
            scheduler.step()
if __name__ == '__main__': args = Opts().init() # load the dataset train_loader, n_train, properties = get_dataset(args=args, flag='train') val_loader, n_val, _ = get_dataset(args=args, flag='val') mean, std = properties[0], properties[1] # criterion criterion = nn.CrossEntropyLoss( ) if args.n_classes > 1 else args.loss_function # initialize the information logx.initialize(logdir=args.dir_log, coolname=True, tensorboard=True) logx.msg('Start training...\n') table = PrettyTable(["key", "value"]) table.align = 'l' infos = { 'vis': args.vis, 'seed': args.seed, 'epoch': args.epochs, 'data aug': args.aug, 'resume': args.resume, 'optimizer': args.optim, 'dataset': args.dataset, 'training size': n_train, 'validation size': n_val, 'learning rate': args.lr,
def main():
    """Main Function.

    Sets up logging, data loaders, loss, network, and optimizer; restores
    state from an auto-resume record, an explicit resume checkpoint, or a
    snapshot; then either dumps folder-evaluation images (``--eval folder``)
    or falls through for training.
    """
    if AutoResume:
        AutoResume.init()

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=False,
                    hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    auto_resume_details = None
    if AutoResume:
        auto_resume_details = AutoResume.get_resume_details()

    if auto_resume_details:
        # Auto-resume takes precedence: restore everything from its record.
        checkpoint_fn = auto_resume_details.get("RESUME_FILE", None)
        checkpoint = torch.load(checkpoint_fn,
                                map_location=torch.device('cpu'))
        args.result_dir = auto_resume_details.get("TENSORBOARD_DIR", None)
        args.start_epoch = int(auto_resume_details.get("EPOCH", None)) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = ("Found details of a requested auto-resume: checkpoint={}"
               " tensorboard={} at epoch {}")
        logx.msg(msg.format(checkpoint_fn, args.result_dir,
                            args.start_epoch))
    elif args.resume:
        # Explicit resume: restore net + optimizer and continue the epoch count.
        checkpoint = torch.load(args.resume,
                                map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = "Resuming from: checkpoint={}, epoch {}, arch {}"
        logx.msg(msg.format(args.resume, args.start_epoch, args.arch))
    elif args.snapshot:
        # Snapshot: restore weights only, training state starts fresh.
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        msg = "Loading weights from: checkpoint={}".format(args.snapshot)
        logx.msg(msg)

    net = network.get_net(args, criterion)
    optim, scheduler = get_optimizer(args, net)
    net = network.wrap_network_in_dataparallel(net, args.apex)

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        restore_net(net, checkpoint)
    if args.init_decoder:
        net.module.init_mods()
    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        scheduler.step(args.start_epoch)

    if args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        # validate(val_loader, net, criterion=None, optim=None, epoch=0,
        #          calc_metrics=False, dump_assets=args.dump_assets,
        #          dump_all_images=True)
        # makedirs(exist_ok=True) is race-free and also creates parents,
        # replacing the original exists()/mkdir() pair.
        os.makedirs(args.result_dir + 'image_2/', exist_ok=True)
        os.makedirs(args.result_dir + 'image_3/', exist_ok=True)
        num_image = 7481
        for idx in tqdm(range(num_image)):
            sample_idx = "%06d" % idx
            eval_minibatch(sample_idx, "image_2/", net, args)
            eval_minibatch(sample_idx, "image_3/", net, args)
        return 0
    elif args.eval is not None:
        # Fixed: raising a plain string is a TypeError in Python 3; raise a
        # real exception carrying the message instead.
        raise ValueError('unknown eval option {}'.format(args.eval))
import os import torch from torch.utils.data import DataLoader import torch.nn as nn import torch.optim as optim # from torch.cuda.amp import autocast, GradScaler from torchvision.datasets import ImageFolder from runx.logx import logx from utils import Config, get_model from dataset import Caltech, get_tfms config = Config() logdir = config.exp_name logx.initialize(logdir, coolname=True, tensorboard=True) def train_epoch(epoch): model.train() losses = 0.0 total, correct = 0.0, 0.0 for step, (x, y) in enumerate(train_loader): x, y = x.to(config.device), y.to(config.device) out = model(x) loss = criterion(out, y) losses += loss.cpu().detach().numpy() optimizer.zero_grad() loss.backward() optimizer.step()
from data_loader import Train_Dataset import torch import torch.nn as nn from torch import optim from torchvision import transforms, models from torch.utils.data import DataLoader import argparse from runx.logx import logx import pandas as pd device = torch.device("cuda" if torch.cuda.is_available() else "cpu") logx.initialize("./logs/exp9", coolname=True, tensorboard=True) def train(model, loader, optimizer, scheduler=None): print('now LR: ' + str(optimizer.param_groups[0]['lr'])) model.train() total, correct, train_loss = 0.0, 0.0, 0.0 for step, data in enumerate(loader): x, y = data x, y = x.to(device), y.to(device) out = model(x) # print("size:", x.size(), y.size(), out.size()) loss = nn.CrossEntropyLoss()(out, y) optimizer.zero_grad() loss.backward() optimizer.step() if scheduler: scheduler.step()
def main():
    """Main Function.

    HeAT/DASO distributed training entry point: pins this process to a GPU,
    initializes NCCL when multiple GPUs are present, restores checkpoints,
    wraps the network and optimizer with HeAT's DASO data-parallel machinery,
    optionally runs evaluation, then trains while collecting per-epoch
    benchmark timings into a pickled/CSV results table.
    """
    rank = args.rank
    cfg.GLOBAL_RANK = rank
    args.gpus = torch.cuda.device_count()
    device = torch.device("cpu")
    # Distributed only when this node has more than one GPU.
    loc_dist = True if args.gpus > 1 else False
    loc_rank = rank % args.gpus
    args.gpu = loc_rank
    args.local_rank = loc_rank
    if loc_dist:
        device = "cuda:" + str(loc_rank)
        # Single-node rendezvous over localhost; NCCL over InfiniBand.
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "19500"
        os.environ["NCCL_SOCKET_IFNAME"] = "ib"
        torch.cuda.set_device(device)
        torch.distributed.init_process_group(backend="nccl",
                                             rank=loc_rank,
                                             world_size=args.gpus)
        # torch.cuda.set_device(device)
    elif args.gpus == 1:
        args.gpus = torch.cuda.device_count()
        device = "cuda:0"
        args.local_rank = 0
        torch.cuda.set_device(device)

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=True,
                    hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    # args.ngpu = torch.cuda.device_count()
    # args.best_record = {'mean_iu': -1, 'epoch': 0}
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    # Checkpoint filename is keyed on the MPI world size.
    cwd = os.getcwd()
    sz = ht.MPI_WORLD.size
    filename = cwd + "/citys-heat-checkpoint-" + str(sz) + ".pth.tar"
    if args.resume and os.path.isfile(filename):
        checkpoint = torch.load(filename,
                                map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        logx.msg(f"Resuming from: checkpoint={args.resume}, "
                 f"epoch {args.start_epoch}, arch {args.arch}")
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        logx.msg(f"Loading weights from: checkpoint={args.snapshot}")

    net = network.get_net(args, criterion)
    net = net.to(device)
    #
    # Linear LR scaling rule for the data-parallel world size.
    args.lr = (1. / args.world_size *
               (5 * (args.world_size - 1) / 6.)) * 0.0125 * args.world_size
    optim, scheduler = get_optimizer(args, net)
    # the scheduler in this code is only run at the end of each epoch
    # todo: make heat an option not this whole file
    # if args.heat:
    dp_optim = ht.optim.DASO(
        local_optimizer=optim,
        total_epochs=args.max_epoch,
        max_global_skips=4,
    )
    #if args.no_cycling:
    dp_optim.disable_cycling(global_skips=args.batch_skip,
                             batches_to_wait=args.gs)
    # this is where the network is wrapped with DDDP (w/apex) or DP
    htnet = ht.nn.DataParallelMultiGPU(net, comm=ht.MPI_WORLD,
                                       optimizer=dp_optim)

    # --summary: print model stats (MACs/params) and exit.
    if args.summary:
        print(str(net))
        from thop import profile
        img = torch.randn(1, 3, 1024, 2048).cuda()
        mask = torch.randn(1, 1, 1024, 2048).cuda()
        macs, params = profile(net, inputs={'images': img, 'gts': mask})
        print0(f'macs {macs} params {params}')
        sys.exit()

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
        # DASO's skip-stability state is checkpointed alongside the optimizer.
        dp_optim.stability.load_dict(checkpoint["skip_stable"])
    if args.restore_net:
        #restore_net(net, checkpoint)
        htnet.load_state_dict(checkpoint["state_dict"])
        #dp_optim.module.load_state_dist(checkpoint["state_dict"])
        # htnet = ht.nn.DataParallelMultiGPU(net, ht.MPI_WORLD, dp_optim)
    if args.init_decoder:
        net.module.init_mods()
    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        # TODO: need a loss value for the restart at a certain epoch...
        scheduler.step(args.start_epoch)

    # There are 4 options for evaluation:
    #  --eval val                           just run validation
    #  --eval val --dump_assets             dump all images and assets
    #  --eval folder                        just dump all basic images
    #  --eval folder --dump_assets          dump all images and assets
    # todo: HeAT fixes -- not urgent --
    if args.eval == 'val':
        if args.dump_topn:
            validate_topn(val_loader, net, criterion_val, optim, 0, args)
        else:
            validate(val_loader, net, criterion=criterion_val, optim=optim,
                     epoch=0,
                     dump_assets=args.dump_assets,
                     dump_all_images=args.dump_all_images,
                     calc_metrics=not args.no_metrics)
        return 0
    elif args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        validate(val_loader, net, criterion=None, optim=None, epoch=0,
                 calc_metrics=False, dump_assets=args.dump_assets,
                 dump_all_images=True)
        return 0
    elif args.eval is not None:
        # NOTE(review): raising a str is a TypeError in Python 3 — this should
        # be e.g. `raise ValueError(...)`.
        raise 'unknown eval option {}'.format(args.eval)

    scaler = amp.GradScaler()
    if dp_optim.comm.rank == 0:
        print("scheduler", args.lr_schedule)
    dp_optim.add_scaler(scaler)

    # Benchmark bookkeeping: one results dict per node count, resumable from
    # a pickle written by rank 0.
    nodes = str(int(dp_optim.comm.size / torch.cuda.device_count()))
    cwd = os.getcwd()
    fname = cwd + "/" + nodes + "-heat-citys-benchmark"
    if args.resume and rank == 0 and os.path.isfile(fname + ".pkl"):
        with open(fname + ".pkl", "rb") as f:
            out_dict = pickle.load(f)
    else:
        out_dict = {
            "epochs": [],
            nodes + "-avg-batch-time": [],
            nodes + "-total-train-time": [],
            nodes + "-train-loss": [],
            nodes + "-val-loss": [],
            nodes + "-val-iou": [],
            nodes + "-val-time": [],
        }
    print0("Output dict:", fname)

    for epoch in range(args.start_epoch, args.max_epoch):
        # todo: HeAT fixes -- possible conflict between processes
        update_epoch(epoch)
        if args.only_coarse:  # default: false
            train_obj.only_coarse()
            train_obj.build_epoch()
        elif args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.disable_coarse()
                train_obj.build_epoch()
            else:
                train_obj.build_epoch()
        else:
            pass
        ls, bt, btt = train(train_loader, htnet, dp_optim, epoch, scaler)
        dp_optim.epoch_loss_logic(ls, loss_globally_averaged=True)
        # if epoch % args.val_freq == 0:
        vls, iu, vtt = validate(val_loader, htnet, criterion_val, dp_optim,
                                epoch)
        if args.lr_schedule == "plateau":
            if dp_optim.comm.rank == 0:
                print("loss", ls, 'best:',
                      scheduler.best * (1. - scheduler.threshold),
                      scheduler.num_bad_epochs)
            scheduler.step(ls)  # val_loss)
        else:
            scheduler.step()
        if args.rank == 0:
            save_checkpoint({
                "epoch": epoch + 1,
                "arch": args.arch,
                "state_dict": htnet.state_dict(),
                "optimizer": optim.state_dict(),
                "skip_stable": dp_optim.stability.get_dict()
            })
        out_dict["epochs"].append(epoch)
        out_dict[nodes + "-train-loss"].append(ls)
        out_dict[nodes + "-avg-batch-time"].append(bt)
        out_dict[nodes + "-total-train-time"].append(btt)
        out_dict[nodes + "-val-loss"].append(vls)
        out_dict[nodes + "-val-iou"].append(iu)
        out_dict[nodes + "-val-time"].append(vtt)
        if args.rank == 0:
            save_obj(out_dict, fname)

    # Rank 0 prints the full results table and optionally appends to a
    # cross-run benchmark CSV.
    if args.rank == 0:
        print("\nRESULTS\n")
        import pandas as pd
        df = pd.DataFrame.from_dict(out_dict).set_index("epochs")
        with pd.option_context("display.max_rows", None,
                               "display.max_columns",
                               None):  # more options can be specified also
            print(df)
        if args.benchmarking:
            try:
                fulldf = pd.read_csv(cwd + "/heat-bench-results.csv")
                fulldf = pd.concat([df, fulldf], axis=1)
            except FileNotFoundError:
                fulldf = df
            fulldf.to_csv(cwd + "/heat-bench-results.csv")
def main():
    """Train an AHDR HDR-merging network: build loaders, select architecture
    and loss from args, then run the train/validation loop for args.epochs."""
    # Settings
    args = get_args()
    # cuda and devices
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    # kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    # init logx
    logx.initialize(logdir=args.logdir,
                    coolname=True,
                    tensorboard=True,
                    hparams=vars(args))
    # random seed
    set_random_seed(args.seed)
    # dataset and dataloader
    # NOTE(review): `normalize`/`transform` are built but never passed to the
    # datasets below — confirm whether they are dead code.
    mean = [0.5, 0.5, 0.5]
    std = [0.5, 0.5, 0.5]
    normalize = transforms.Normalize(mean, std)
    transform = transforms.Compose([transforms.ToTensor(), normalize])
    train_dataset = Dynamic_Scenes_Dataset(root_dir=args.dataset_dir,
                                           is_training=True,
                                           crop=True,
                                           crop_size=(256, 256))
    val_dataset = Dynamic_Scenes_Dataset(root_dir=args.dataset_dir,
                                         is_training=False,
                                         crop=False)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=8,
                              pin_memory=True)
    val_loader = DataLoader(val_dataset,
                            batch_size=args.test_batch_size,
                            shuffle=False,
                            num_workers=1,
                            pin_memory=True)
    # model architecture (0: AHDRNet, 1: AHDR; anything else aborts)
    if args.model_arch == 0:
        model = AHDRNet()
    elif args.model_arch == 1:
        model = AHDR(6, 5, 64, 32)
    else:
        logx.msg("This model is not yet implemented!\n")
        return
    if args.init_weights:
        init_parameters(model)
    if args.load:
        model.load_state_dict(torch.load(args.load, map_location=device))
        logx.msg(f'Model loaded from {args.load}')
    model.to(device)
    # # log graph
    # dummy_input = torch.from_numpy(np.random.rand(1, 6, 256, 256)).float().to(device)
    # logx.add_graph(model, input_to_model=(dummy_input, dummy_input, dummy_input))
    # loss function and optimizer (0: L1, 1: MSE; anything else aborts)
    if args.loss_func == 0:
        criterion = nn.L1Loss()
    elif args.loss_func == 1:
        criterion = nn.MSELoss()
    else:
        logx.msg("Error loss functions.\n")
        return
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    num_parameters = sum(
        torch.numel(parameter) for parameter in model.parameters())
    logx.msg(f'''Starting training:
        Model Paras:     {num_parameters}
        Epochs:          {args.epochs}
        Batch size:      {args.batch_size}
        Loss function:   {args.loss_func}
        Learning rate:   {args.lr}
        Training size:   {len(train_loader)}
        Device:          {device.type}
        Dataset dir:     {args.dataset_dir}
    ''')
    for epoch in range(1, args.epochs + 1):
        adjust_learning_rate(args, optimizer, epoch)
        train(args, model, device, train_loader, optimizer, epoch, criterion)
        validation(args, model, device, val_loader, optimizer, epoch,
                   criterion)
def main():
    """Main Function.

    Horovod distributed training entry point: pins this process to its local
    GPU, wraps the optimizer in hvd.DistributedOptimizer, restores checkpoints,
    then trains while collecting per-epoch benchmark timings into a
    pickled/CSV results table.
    """
    rank = args.rank
    cfg.GLOBAL_RANK = rank
    args.gpus = torch.cuda.device_count()
    device = torch.device("cpu")
    hvd.init()
    torch.manual_seed(999999)
    #if args.cuda:
    args.cuda = True
    # Horovod: pin GPU to local rank.
    torch.cuda.set_device(hvd.local_rank())
    #torch.cuda.manual_seed(args.seed)

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=True,
                    hparams=vars(args),
                    global_rank=args.global_rank)
    #print("vefore assert and infer")
    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    # args.ngpu = torch.cuda.device_count()
    # args.best_record = {'mean_iu': -1, 'epoch': 0}
    #print("before datasets / loss")
    train_loader, val_loader, train_obj = datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    # Checkpoint filename is keyed on the MPI world size.
    cwd = os.getcwd()
    sz = ht.MPI_WORLD.size
    filename = cwd + "/citys-hvd-checkpoint-" + str(sz) + ".pth.tar"
    if args.resume and os.path.isfile(filename):
        checkpoint = torch.load(filename,
                                map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        logx.msg(f"Resuming from: checkpoint={args.resume}, " \
                 f"epoch {args.start_epoch}, arch {args.arch}")
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        logx.msg(f"Loading weights from: checkpoint={args.snapshot}")

    # todo: HeAT fixes -- urgent -- DDDP / optim / scheduler
    net = network.get_net(args, criterion)
    # net = net.to(device)
    # todo: optim -> direct wrap after this, scheduler stays the same?
    optim, scheduler = get_optimizer(args, net)
    # if args.fp16:
    #     net, optim = amp.initialize(net, optim, opt_level=args.amp_opt_level)
    # NOTE(review): compression is unconditionally fp16 here — the original
    # fp16_allreduce switch is commented out.
    compression = hvd.Compression.fp16  # if args.fp16_allreduce else hvd.Compression.none
    optim = hvd.DistributedOptimizer(
        optim,
        named_parameters=net.named_parameters(),
        compression=compression,
        backward_passes_per_step=1,  # args.batches_per_allreduce,
        op=hvd.Average,
        gradient_predivide_factor=1.0,  # args.gradient_predivide_factor)
    )
    #print("after hvd optimizer setup")

    # --summary: print model stats (MACs/params) and exit.
    if args.summary:
        print(str(net))
        from thop import profile
        img = torch.randn(1, 3, 1024, 2048).cuda()
        mask = torch.randn(1, 1, 1024, 2048).cuda()
        macs, params = profile(net, inputs={'images': img, 'gts': mask})
        print0(f'macs {macs} params {params}')
        sys.exit()

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        #net.loat_state_dict(checkpoint["state_dict"])
        restore_net(net, checkpoint)
    if args.init_decoder:
        net.module.init_mods()
    torch.cuda.empty_cache()
    #print("before parameter broadcasts")
    #hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    #hvd.broadcast_optimizer_state(optim, root_rank=0)

    if args.start_epoch != 0:
        # TODO: need a loss value for the restart at a certain epoch...
        scheduler.step(args.start_epoch)
    #net = net.cuda()

    # There are 4 options for evaluation:
    #  --eval val                           just run validation
    #  --eval val --dump_assets             dump all images and assets
    #  --eval folder                        just dump all basic images
    #  --eval folder --dump_assets          dump all images and assets
    # todo: HeAT fixes -- not urgent --
    # if args.eval == 'val':
    #     if args.dump_topn:
    #         validate_topn(val_loader, net, criterion_val, optim, 0, args)
    #     else:
    #         validate(val_loader, net, criterion=criterion_val, optim=optim, epoch=0,
    #                  dump_assets=args.dump_assets,
    #                  dump_all_images=args.dump_all_images,
    #                  calc_metrics=not args.no_metrics)
    #     return 0
    # elif args.eval == 'folder':
    #     # Using a folder for evaluation means to not calculate metrics
    #     validate(val_loader, net, criterion=None, optim=None, epoch=0,
    #              calc_metrics=False, dump_assets=args.dump_assets,
    #              dump_all_images=True)
    #     return 0
    # elif args.eval is not None:
    #     raise 'unknown eval option {}'.format(args.eval)

    # AMP is disabled in this variant.
    scaler = None  #amp.GradScaler()
    args.amp = False  #True

    # Benchmark bookkeeping: one results dict per node count, resumable from
    # a pickle written by rank 0.
    nodes = str(int(hvd.size() / torch.cuda.device_count()))
    cwd = os.getcwd()
    fname = cwd + "/" + nodes + "-hvd-citys-benchmark"
    if args.resume and rank == 0 and os.path.isfile(fname + ".pkl"):
        with open(fname + ".pkl", "rb") as f:
            out_dict = pickle.load(f)
    else:
        out_dict = {
            "epochs": [],
            nodes + "-avg-batch-time": [],
            nodes + "-total-train-time": [],
            nodes + "-train-loss": [],
            nodes + "-val-loss": [],
            nodes + "-val-iou": [],
            nodes + "-val-time": [],
        }
    print0("Output dict:", fname)
    # train_losses, train_btimes, train_ttime = [], [], []
    # val_losses, val_iu, val_ttime = [], [], []

    for epoch in range(args.start_epoch, args.max_epoch):
        # todo: HeAT fixes -- possible conflict between processes
        update_epoch(epoch)
        if args.only_coarse:  # default: false
            train_obj.only_coarse()
            train_obj.build_epoch()
        elif args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.disable_coarse()
                train_obj.build_epoch()
            else:
                train_obj.build_epoch()
        else:
            pass
        ls, bt, btt = train(train_loader, net, optim, epoch, scaler)
        # dp_optim.epoch_loss_logic(ls, loss_globally_averaged=True)
        # if epoch % args.val_freq == 0:
        vls, iu, vtt = validate(val_loader, net, criterion_val, optim, epoch)
        if args.lr_schedule == "plateau":
            scheduler.step(ls)  # val_loss)
        else:
            scheduler.step()
        if args.rank == 0:
            save_checkpoint({
                "epoch": epoch + 1,
                "arch": args.arch,
                "state_dict": net.state_dict(),
                "optimizer": optim.state_dict(),
                # "skip_stable": optim.stability.get_dict()
            })
        out_dict["epochs"].append(epoch)
        out_dict[nodes + "-train-loss"].append(ls)
        out_dict[nodes + "-avg-batch-time"].append(bt)
        out_dict[nodes + "-total-train-time"].append(btt)
        out_dict[nodes + "-val-loss"].append(vls)
        out_dict[nodes + "-val-iou"].append(iu)
        out_dict[nodes + "-val-time"].append(vtt)
        if args.rank == 0:
            save_obj(out_dict, fname)

    # Rank 0 prints the full results table and optionally appends to a
    # cross-run benchmark CSV.
    if args.rank == 0:
        print("\nRESULTS\n")
        import pandas as pd
        df = pd.DataFrame.from_dict(out_dict).set_index("epochs")
        with pd.option_context("display.max_rows", None,
                               "display.max_columns",
                               None):  # more options can be specified also
            print(df)
        if args.benchmarking:
            try:
                fulldf = pd.read_csv(cwd + "/hvd-bench-results.csv")
                fulldf = pd.concat([df, fulldf], axis=1)
            except FileNotFoundError:
                fulldf = df
            fulldf.to_csv(cwd + "/hvd-bench-results.csv")
correct += (pred == y).squeeze().sum().cpu().numpy() valid_acc = correct / total print("valid accuracy", valid_acc) logx.metric('val', { 'loss': valid_loss, 'accuracy': valid_acc }, epoch=epoch) return valid_acc for k in range(5): print("training for fold {}".format(k)) start_epoch = 0 logx.initialize(os.path.join(log_dir, 'fold_{}'.format(k)), coolname=True, tensorboard=True) # 数据加载 desc_train = os.path.join(cfg.ds_folder, 'new_train_{}.csv'.format(k)) desc_valid = os.path.join(cfg.ds_folder, 'new_valid_{}.csv'.format(k)) train_data = TrainDataset(desc_train, data_folder=os.path.join(cfg.ds_folder, "train/"), transform=transform_train) valid_data = TrainDataset(desc_valid, data_folder=os.path.join(cfg.ds_folder, "train/"), transform=transform_test) # 构建DataLoader
def main():
    """Main Function.

    Semantic-segmentation training entry point: sets up logging, loaders,
    loss, network, and optimizer; restores from auto-resume / resume /
    snapshot; runs evaluation when ``--eval`` is given; otherwise trains for
    args.max_epoch epochs with periodic validation.
    """
    if AutoResume:
        AutoResume.init()

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=True,
                    hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    train_loader, val_loader, train_obj = \
        datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    auto_resume_details = None
    if AutoResume:
        auto_resume_details = AutoResume.get_resume_details()

    if auto_resume_details:
        # Auto-resume takes precedence: restore everything from its record.
        checkpoint_fn = auto_resume_details.get("RESUME_FILE", None)
        checkpoint = torch.load(checkpoint_fn,
                                map_location=torch.device('cpu'))
        args.result_dir = auto_resume_details.get("TENSORBOARD_DIR", None)
        args.start_epoch = int(auto_resume_details.get("EPOCH", None)) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = ("Found details of a requested auto-resume: checkpoint={}"
               " tensorboard={} at epoch {}")
        logx.msg(msg.format(checkpoint_fn, args.result_dir,
                            args.start_epoch))
    elif args.resume:
        # Explicit resume: restore net + optimizer and continue the epoch count.
        checkpoint = torch.load(args.resume,
                                map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = "Resuming from: checkpoint={}, epoch {}, arch {}"
        logx.msg(msg.format(args.resume, args.start_epoch, args.arch))
    elif args.snapshot:
        # Snapshot: restore weights only, training state starts fresh.
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        msg = "Loading weights from: checkpoint={}".format(args.snapshot)
        logx.msg(msg)

    net = network.get_net(args, criterion)
    optim, scheduler = get_optimizer(args, net)

    if args.fp16:
        net, optim = amp.initialize(net, optim, opt_level=args.amp_opt_level)

    net = network.wrap_network_in_dataparallel(net, args.apex)

    # --summary: print model stats (MACs/params) and exit.
    if args.summary:
        print(str(net))
        from pytorchOpCounter.thop import profile
        img = torch.randn(1, 3, 1024, 2048).cuda()
        mask = torch.randn(1, 1, 1024, 2048).cuda()
        macs, params = profile(net, inputs={'images': img, 'gts': mask})
        print(f'macs {macs} params {params}')
        sys.exit()

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        restore_net(net, checkpoint)
    if args.init_decoder:
        net.module.init_mods()
    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        scheduler.step(args.start_epoch)

    # There are 4 options for evaluation:
    #  --eval val                           just run validation
    #  --eval val --dump_assets             dump all images and assets
    #  --eval folder                        just dump all basic images
    #  --eval folder --dump_assets          dump all images and assets
    if args.eval == 'val':
        if args.dump_topn:
            validate_topn(val_loader, net, criterion_val, optim, 0, args)
        else:
            validate(val_loader, net, criterion=criterion_val, optim=optim,
                     epoch=0,
                     dump_assets=args.dump_assets,
                     dump_all_images=args.dump_all_images,
                     calc_metrics=not args.no_metrics)
        return 0
    elif args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        validate(val_loader, net, criterion=None, optim=None, epoch=0,
                 calc_metrics=False, dump_assets=args.dump_assets,
                 dump_all_images=True)
        return 0
    elif args.eval is not None:
        # Fixed: raising a plain string is a TypeError in Python 3; raise a
        # real exception carrying the message instead.
        raise ValueError('unknown eval option {}'.format(args.eval))

    for epoch in range(args.start_epoch, args.max_epoch):
        update_epoch(epoch)

        if args.only_coarse:
            train_obj.only_coarse()
            train_obj.build_epoch()
            if args.apex:
                train_loader.sampler.set_num_samples()
        elif args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.disable_coarse()
                train_obj.build_epoch()
                if args.apex:
                    train_loader.sampler.set_num_samples()
            else:
                train_obj.build_epoch()
        else:
            pass

        train(train_loader, net, optim, epoch)

        if args.apex:
            train_loader.sampler.set_epoch(epoch + 1)

        if epoch % args.val_freq == 0:
            validate(val_loader, net, criterion_val, optim, epoch)

        scheduler.step()

        if check_termination(epoch):
            return 0
def main():
    """
    Main entry point (NASA-optimizer variant): train or evaluate the net.

    Same flow as the sibling driver, but passes extra ``tau``/``k``
    parameters to :func:`get_optimizer`, uses 640x640 inputs for the
    ``--summary`` profile, and adds an ``--eval test`` dump mode.

    Reads all configuration from the module-level ``args`` namespace.
    Returns 0 on eval-only runs or on requested termination.
    """
    if AutoResume:
        AutoResume.init()

    assert args.result_dir is not None, 'need to define result_dir arg'
    logx.initialize(logdir=args.result_dir,
                    tensorboard=True, hparams=vars(args),
                    global_rank=args.global_rank)

    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    prep_experiment(args)
    train_loader, val_loader, train_obj = \
        datasets.setup_loaders(args)
    criterion, criterion_val = get_loss(args)

    auto_resume_details = None
    if AutoResume:
        auto_resume_details = AutoResume.get_resume_details()

    # Exactly one of the three resume paths below may run; each one that
    # does binds `checkpoint` and sets the restore_* flags it needs.
    if auto_resume_details:
        checkpoint_fn = auto_resume_details.get("RESUME_FILE", None)
        checkpoint = torch.load(checkpoint_fn,
                                map_location=torch.device('cpu'))
        args.result_dir = auto_resume_details.get("TENSORBOARD_DIR", None)
        args.start_epoch = int(auto_resume_details.get("EPOCH", None)) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = ("Found details of a requested auto-resume: checkpoint={}"
               " tensorboard={} at epoch {}")
        logx.msg(msg.format(checkpoint_fn, args.result_dir,
                            args.start_epoch))
    elif args.resume:
        checkpoint = torch.load(args.resume,
                                map_location=torch.device('cpu'))
        args.arch = checkpoint['arch']
        args.start_epoch = int(checkpoint['epoch']) + 1
        args.restore_net = True
        args.restore_optimizer = True
        msg = "Resuming from: checkpoint={}, epoch {}, arch {}"
        logx.msg(msg.format(args.resume, args.start_epoch, args.arch))
    elif args.snapshot:
        if 'ASSETS_PATH' in args.snapshot:
            args.snapshot = args.snapshot.replace('ASSETS_PATH',
                                                  cfg.ASSETS_PATH)
        checkpoint = torch.load(args.snapshot,
                                map_location=torch.device('cpu'))
        args.restore_net = True
        msg = "Loading weights from: checkpoint={}".format(args.snapshot)
        logx.msg(msg)

    # define the NASA optimizer parameter
    iter_tot = len(train_loader) * args.max_epoch
    # tau = args.tau_factor/sqrt(iter_tot)
    tau = 1
    net = network.get_net(args, criterion)
    k = 1
    # optim, scheduler = get_optimizer(args, net)
    optim, scheduler = get_optimizer(args, net, tau, k)

    # Visualize feature maps (kept disabled; hook-based activation capture)
    # activation = {}
    # def get_activation(name):
    #     def hook(model, input, output):
    #         activation[name] = output.detach()
    #     return hook
    # net.layer[0].register_forward_hook(get_activation('conv1'))
    # data, _ = dataset[0]
    # data.unsqueeze_(0)
    # output = model(data)
    # act = activation['conv1'].squeeze()
    # fig, axarr = plt.subplots(act.size(0))
    # for idx in range(act.size(0)):
    #     axarr[idx].imshow(act[idx])

    if args.fp16:
        net, optim = amp.initialize(net, optim, opt_level=args.amp_opt_level)

    net = network.wrap_network_in_dataparallel(net, args.apex)

    if args.summary:
        # Report MAC/param counts for a 640x640 input, then exit.
        from thop import profile
        img = torch.randn(1, 3, 640, 640).cuda()
        mask = torch.randn(1, 1, 640, 640).cuda()
        macs, params = profile(net, inputs={'images': img, 'gts': mask})
        print(f'macs {macs} params {params}')
        sys.exit()

    if args.restore_optimizer:
        restore_opt(optim, checkpoint)
    if args.restore_net:
        restore_net(net, checkpoint)

    if args.init_decoder:
        net.module.init_mods()
    torch.cuda.empty_cache()

    if args.start_epoch != 0:
        scheduler.step(args.start_epoch)

    # There are 4 options for evaluation:
    #   --eval val                           just run validation
    #   --eval val --dump_assets             dump all images and assets
    #   --eval folder                        just dump all basic images
    #   --eval folder --dump_assets          dump all images and assets
    if args.eval == 'test':
        # NOTE(review): `city` is not defined in this chunk — presumably a
        # module-level grid object; confirm it exists before using this path.
        validate(val_loader, net, criterion=None, optim=None, epoch=0,
                 calc_metrics=False, dump_assets=args.dump_assets,
                 dump_all_images=True, testing=True, grid=city)
        return 0

    if args.eval == 'val':
        if args.dump_topn:
            validate_topn(val_loader, net, criterion_val, optim, 0, args)
        else:
            validate(val_loader, net, criterion=criterion_val, optim=optim,
                     epoch=0, dump_assets=args.dump_assets,
                     dump_all_images=args.dump_all_images,
                     calc_metrics=not args.no_metrics)
        return 0
    elif args.eval == 'folder':
        # Using a folder for evaluation means to not calculate metrics
        validate(val_loader, net, criterion=criterion_val, optim=optim,
                 epoch=0, calc_metrics=False, dump_assets=args.dump_assets,
                 dump_all_images=True)
        return 0
    elif args.eval is not None:
        # BUGFIX: raising a bare string is a TypeError in Python 3 and hides
        # this message; raise a proper exception instead.
        raise ValueError('unknown eval option {}'.format(args.eval))

    for epoch in range(args.start_epoch, args.max_epoch):
        update_epoch(epoch)

        if args.only_coarse:
            train_obj.only_coarse()
            train_obj.build_epoch()
            if args.apex:
                train_loader.sampler.set_num_samples()
        elif args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.disable_coarse()
                train_obj.build_epoch()
                if args.apex:
                    train_loader.sampler.set_num_samples()
            else:
                train_obj.build_epoch()
        else:
            pass

        train(train_loader, net, optim, epoch)

        if args.apex:
            train_loader.sampler.set_epoch(epoch + 1)

        if epoch % args.val_freq == 0:
            validate(val_loader, net, criterion_val, optim, epoch)

        scheduler.step()

        if check_termination(epoch):
            return 0
# logx.metric('val', metric_valid, epoch) # if valid_acc > best_accuracy: # best_accuracy = valid_acc # torch.save({'state_dict}': my_model.state_dict()}, './logs/exp13/fold' + str(i) + '/highest_valid_acc.pth') # logx.save_model({'state_dict}': my_model.state_dict()}, valid_loss, epoch, higher_better=False, delete_old=True) # print("current_acc:{0}, best_acc:{1}".format(valid_acc, best_accuracy)) # # print('------------------------') i = 2 my_model = models.resnext50_32x4d(pretrained=False, zero_init_residual=True) # my_model = models.resnet50(pretrained=False) my_model = my_model.to(device) optimizer = optim.Adam(my_model.parameters()) logx.initialize("./logs/exp14/resnext50+da+normlize", coolname=True, tensorboard=True) train_dataset = Train_Dataset('./data/train_' + str(i) + '.csv', './data/train', transform=train_transformer) train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True) valid_dataset = Train_Dataset('./data/valid_' + str(i) + '.csv', './data/train', transform=train_transformer) valid_loader = DataLoader(dataset=valid_dataset, batch_size=64) print('model: ' + str(i) + ' || train_dataset: train_' + str(i) + ' || valid_dataset: valid_' + str(i)) best_accuracy = 0 for epoch in range(epochs): print('model: ' + str(i) + '||epoch: ' + str(epoch))
def main():
    """
    MNIST training example: parse CLI args, build data loaders and a small
    network, then run the train/test loop for the requested epoch count.

    Side effects: downloads MNIST under ../data on first run and writes
    runx logs/TensorBoard events under --logdir.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    # BUGFIX: help text said "(default: 10)" while the actual default is 20.
    parser.add_argument('--epochs', type=int, default=20, metavar='N',
                        help='number of epochs to train (default: 20)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval', type=int, default=10, metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    parser.add_argument('--logdir', type=str, default=None,
                        help='target log directory')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    logx.initialize(logdir=args.logdir, coolname=True, tensorboard=True,
                    hparams=vars(args))

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    # Worker/pinning options only help when feeding a CUDA device.
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(datasets.MNIST(
        '../data',
        train=True,
        download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            # MNIST channel mean/std normalization.
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])),
        batch_size=args.batch_size,
        shuffle=True,
        **kwargs)
    test_loader = torch.utils.data.DataLoader(datasets.MNIST(
        '../data',
        train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])),
        batch_size=args.test_batch_size,
        shuffle=True,
        **kwargs)

    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=args.lr,
                          momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(args, model, device, test_loader, epoch, optimizer)
from data_loader_2 import Train_Dataset
import torch
import torch.nn as nn
from torch import optim
from torchvision import transforms, models
from torch.utils.data import DataLoader
import argparse
from runx.logx import logx
from loss import CrossEntropyLabelSmooth
import pandas as pd

# All models/tensors below run on the first CUDA device when available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# runx logging: events written under ./logs/exp14/resnet50.
logx.initialize("./logs/exp14/resnet50", coolname=True, tensorboard=True)


def train(model, loader, optimizer):
    """Run one training pass of `model` over `loader`.

    For each batch: forward, cross-entropy loss against the labels, then
    a standard zero_grad/backward/step optimizer update on `device`.

    NOTE(review): total/correct/train_loss are initialized but never
    updated in the visible span — this function almost certainly
    continues (accuracy/loss accounting) beyond this chunk boundary.
    """
    # print('now LR: ' + str(optimizer.param_groups[0]['lr']))
    model.train()
    total, correct, train_loss = 0.0, 0.0, 0.0
    # Plain cross-entropy here, even though CrossEntropyLabelSmooth is
    # imported above — presumably an experiment toggle; confirm.
    c = nn.CrossEntropyLoss().to(device)
    for step, data in enumerate(loader):
        x, y = data
        x, y = x.to(device), y.to(device)
        out = model(x)
        # print("size:", x.size(), y.size(), out.size())
        loss = c(out, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()