def main():
    # Load a configuration
    prog_args = arg_parse()

    if prog_args.gpu:
        os.environ["CUDA_VISIBLE_DEVICES"] = prog_args.cuda
        print("CUDA", prog_args.cuda)
    else:
        print("Using CPU")

    # Configure the logging directory
    if prog_args.writer:
        path = os.path.join(prog_args.logdir, io_utils.gen_explainer_prefix(prog_args))
        if os.path.isdir(path) and prog_args.clean_log:
            print('Removing existing log dir: ', path)
            if not input(
                "Are you sure you want to remove this directory? (y/n): "
            ).lower().strip()[:1] == "y":
                sys.exit(1)
            shutil.rmtree(path)
        writer = SummaryWriter(path)
    else:
        writer = None

    # Load a model checkpoint
    ckpt = io_utils.load_ckpt(prog_args)
    cg_dict = ckpt["cg"]  # get computation graph
    input_dim = cg_dict["feat"].shape[2]
    num_classes = cg_dict["pred"].shape[2]
    print("Loaded model from {}".format(prog_args.ckptdir))
    print("input dim: ", input_dim, "; num classes: ", num_classes)

    # Determine the explainer mode
    graph_mode = (
        prog_args.graph_mode
        or prog_args.multigraph_class >= 0
        or prog_args.graph_idx >= 0
    )

    # Build the model
    print("Method: ", prog_args.method)
    if graph_mode:
        # Explain a graph-level prediction
        model = models.GcnEncoderGraph(
            input_dim=input_dim,
            hidden_dim=prog_args.hidden_dim,
            embedding_dim=prog_args.output_dim,
            label_dim=num_classes,
            num_layers=prog_args.num_gc_layers,
            bn=prog_args.bn,
            args=prog_args,
        )
    else:
        if prog_args.dataset == "ppi_essential":
            # Class weights in the CE loss for handling imbalanced label classes
            prog_args.loss_weight = torch.tensor([1.0, 5.0], dtype=torch.float).cuda()
        # Explain a node-level prediction
        model = models.GcnEncoderNode(
            input_dim=input_dim,
            hidden_dim=prog_args.hidden_dim,
            embedding_dim=prog_args.output_dim,
            label_dim=num_classes,
            num_layers=prog_args.num_gc_layers,
            bn=prog_args.bn,
            args=prog_args,
        )
    if prog_args.gpu:
        model = model.cuda()

    # Load the state_dict (obtained by model.state_dict() when saving the checkpoint)
    model.load_state_dict(ckpt["model_state"])

    # Create the explainer
    explainer = explain.Explainer(
        model=model,
        adj=cg_dict["adj"],
        feat=cg_dict["feat"],
        label=cg_dict["label"],
        pred=cg_dict["pred"],
        train_idx=cg_dict["train_idx"],
        args=prog_args,
        writer=writer,
        print_training=True,
        graph_mode=graph_mode,
        graph_idx=prog_args.graph_idx,
    )

    # TODO: The API should be cleaner. Define exactly which modes are supported;
    # each mode could even move to its own method (or file).
    if prog_args.explain_node is not None:
        explainer.explain(prog_args.explain_node, unconstrained=False)
    elif graph_mode:
        if prog_args.multigraph_class >= 0:
            print(cg_dict["label"])
            # Only run for graphs whose label matches multigraph_class
            labels = cg_dict["label"].numpy()
            graph_indices = []
            for i, l in enumerate(labels):
                if l == prog_args.multigraph_class:
                    graph_indices.append(i)
                if len(graph_indices) > 30:
                    break
            print("Graph indices for label ", prog_args.multigraph_class,
                  " : ", graph_indices)
            explainer.explain_graphs(graph_indices=graph_indices)
        elif prog_args.graph_idx == -1:
            # Run for a customized set of indices
            explainer.explain_graphs(graph_indices=[1, 2, 3, 4])
        else:
            explainer.explain(
                node_idx=0,
                graph_idx=prog_args.graph_idx,
                graph_mode=True,
                unconstrained=False,
            )
        io_utils.plot_cmap_tb(writer, "tab20", 20, "tab20_cmap")
    else:
        if prog_args.multinode_class >= 0:
            print(cg_dict["label"])
            # Only run for nodes whose label matches multinode_class
            labels = cg_dict["label"][0]  # already a numpy matrix
            node_indices = []
            for i, l in enumerate(labels):
                if len(node_indices) > 4:
                    break
                if l == prog_args.multinode_class:
                    node_indices.append(i)
            print("Node indices for label ", prog_args.multinode_class,
                  " : ", node_indices)
            explainer.explain_nodes(node_indices, prog_args)
        else:
            # Explain a set of nodes
            masked_adj = explainer.explain_nodes_gnn_stats(
                range(400, 700, 5), prog_args)
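# The label-matching loops above are linear scans with an early break. When
# cg_dict["label"] converts to a numpy array, the same selection can be done
# in one vectorized step. A minimal sketch (illustrative, not part of the
# original script), keeping roughly the first 30 matching graphs:
import numpy as np

labels = cg_dict["label"].numpy()
graph_indices = np.nonzero(labels == prog_args.multigraph_class)[0][:30].tolist()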
def main(args):
    # Global model
    # BaseManager.register('ReplayMemory', ReplayMemory)
    manager = BaseManager()
    manager.start()
    global_replayMemory = manager.ReplayMemory(args.capacity)

    global_wolp_ddpg = DDPG(global_replayMemory, None, None, None, args, 1)
    actor_optimizer = SharedAdam(global_wolp_ddpg.actor.parameters(), lr=args.actor_lr)
    critic_optimizer = SharedAdam(global_wolp_ddpg.critic.parameters(), lr=args.critic_lr)
    global_wolp_ddpg.share_memory()

    writer = SummaryWriter('%s/logs' % args.logPath)

    # Create train and test workers
    train_work_list = []
    train_finish_list = []
    train_queue = []
    for i in range(args.train_worker_nums):
        queue = mp.Queue(int(1e6))
        train_finish = mp.Value('i', 0)
        train_queue.append(queue)
        train_finish_list.append(train_finish)
        train_work_list.append(
            train_woker(i, global_wolp_ddpg, global_replayMemory, queue,
                        actor_optimizer, critic_optimizer, args, train_finish))

    test_queue = mp.Queue(int(1e6))
    test_finish = mp.Value('i', 0)
    test = test_woker(args.train_worker_nums, global_wolp_ddpg, test_queue, args,
                      train_finish_list, test_finish)

    # Start the workers
    for i in range(args.train_worker_nums):
        train_work_list[i].start()
    test.start()

    # Visualize
    while True:
        if test_finish.value == 1:
            break
        # Only read the training queues once every one of them has data
        any_queue_empty = False
        for i in range(len(train_work_list)):
            any_queue_empty |= train_queue[i].empty()
        if not any_queue_empty:
            reward_dict = {}
            mean = 0
            step = 0
            for i in range(len(train_work_list)):
                data = train_queue[i].get()
                reward_dict['work_%d' % data[0]] = data[1]
                step = data[2]
                mean += data[1]
            reward_dict['avg'] = mean / len(train_work_list)
            writer.add_scalars('reward', reward_dict, step)
        if not test_queue.empty():
            data = test_queue.get()
            writer.add_scalar('test_work_%d_reward' % data[0], data[1], data[2])

    # Join the workers on stop
    for i in range(args.train_worker_nums):
        train_work_list[i].join()
    test.join()
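# Note that multiprocessing.Queue.empty() is documented as unreliable under
# concurrency, so the polling loop above can occasionally block on get().
# A hedged alternative sketch using non-blocking get_nowait() in place of the
# empty() check (queue and writer names reused from main() above):
import queue as queue_lib

try:
    data = test_queue.get_nowait()
    writer.add_scalar('test_work_%d_reward' % data[0], data[1], data[2])
except queue_lib.Empty:
    pass  # no test result ready yet; fall through and poll again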
cfg = get_train_config(config_file='config/train_config.yaml')
os.environ['CUDA_VISIBLE_DEVICES'] = cfg['cuda_devices']

time_TrainStart = str(int(time.time()))
the_ckpt_root = cfg['ckpt_root'] + cfg['step'] + time_TrainStart + '/'
os.mkdir(the_ckpt_root)
shutil.copyfile('./config/train_config.yaml', the_ckpt_root + 'train_config.yaml')

# tensorboardX
with open('training_log.json', 'a') as f:
    f.write('--------------------------\n')
    f.write('PID: ' + str(os.getpid()) + '\n')
    f.write('PWD: ' + the_ckpt_root + '\n')
    f.write(str(json.dumps(cfg, indent=2)) + '\n')
writer = SummaryWriter('runs/' + cfg['step'] + time_TrainStart + '_' + str(os.getpid()))

######################################
# Multi-modal dataset
if cfg['modality'] == 'mesh':
    data_set = {
        x: ModelNet40(cfg=cfg['dataset'], part=x)
        for x in ['train', 'test']
    }
elif cfg['modality'] == 'view':
    data_set = {
        x: mv_ModelNet40(cfg=cfg['dataset'], part=x)
        for x in ['train', 'test']
    }
elif cfg['modality'] == 'meshview':
    data_set = {
        x: mesh_mv_ModelNet40(cfg=cfg['dataset'], part=x)
        for x in ['train', 'test']
def main_run(dataset, trainDataset, valDataset, outDir, stackSize, trainBatchSize,
             valBatchSize, numEpochs, lr1, decay_factor, decay_step):
    if dataset == 'gtea61':
        num_classes = 61
    elif dataset == 'gtea71':
        num_classes = 71
    elif dataset == 'gtea_gaze':
        num_classes = 44
    elif dataset == 'egtea':
        num_classes = 106
    else:
        print('Dataset not found')
        sys.exit()

    best_accuracy = 0  # highest validation accuracy seen so far

    # Dir for saving models and log files
    model_folder = os.path.join('./', outDir, dataset, 'flow_surfaceNormals_fm')
    # Create the dir
    if os.path.exists(model_folder):
        print('Dir {} exists!'.format(model_folder))
        sys.exit()
    os.makedirs(model_folder)

    # Log files
    writer = SummaryWriter(model_folder)
    train_log_loss = open((model_folder + '/train_log_loss.txt'), 'w')
    train_log_acc = open((model_folder + '/train_log_acc.txt'), 'w')
    val_log_loss = open((model_folder + '/val_log_loss.txt'), 'w')
    val_log_acc = open((model_folder + '/val_log_acc.txt'), 'w')

    valInstances = 0

    # Data loaders
    normalize = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    spatial_transform = Compose([
        Scale(256),
        RandomHorizontalFlip(),
        MultiScaleCornerCrop([1, 0.875, 0.75, 0.65625], 224),
        ToTensor(),
        normalize
    ])
    vid_seq_train = makeDataset(trainDataset,
                                spatial_transform=spatial_transform,
                                seqLen=stackSize,
                                fmt='.png')
    train_loader = torch.utils.data.DataLoader(vid_seq_train,
                                               batch_size=trainBatchSize,
                                               shuffle=True,
                                               num_workers=4,
                                               pin_memory=True)
    if valDataset is not None:
        vid_seq_val = makeDataset(valDataset,
                                  spatial_transform=Compose([
                                      Scale(256),
                                      CenterCrop(224),
                                      ToTensor(),
                                      normalize
                                  ]),
                                  seqLen=stackSize,
                                  fmt='.png')
        val_loader = torch.utils.data.DataLoader(vid_seq_val,
                                                 batch_size=valBatchSize,
                                                 shuffle=False,
                                                 num_workers=2,
                                                 pin_memory=True)
        valInstances = vid_seq_val.__len__()
    trainInstances = vid_seq_train.__len__()
    print('Number of samples in the dataset: training = {} | validation = {}'.format(
        trainInstances, valInstances))

    model = flow_resnet34(True, channels=3 * stackSize, num_classes=num_classes)
    model.train(True)
    train_params = list(model.parameters())

    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(DEVICE)
    print(DEVICE)

    loss_fn = nn.CrossEntropyLoss()
    optimizer_fn = torch.optim.SGD(train_params, lr=lr1, momentum=0.9, weight_decay=5e-4)
    optim_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer_fn,
                                                           milestones=decay_step,
                                                           gamma=decay_factor)

    train_iter = 0
    print("Start training")
    for epoch in range(numEpochs):
        epoch_loss = 0
        numCorrTrain = 0
        trainSamples = float(0)
        iterPerEpoch = 0
        model.train(True)
        writer.add_scalar('lr', optimizer_fn.param_groups[0]['lr'], epoch + 1)
        for i, (inputs, targets) in enumerate(train_loader):
            train_iter += 1
            iterPerEpoch += 1
            optimizer_fn.zero_grad()
            inputVariable = Variable(inputs.to(DEVICE))
            labelVariable = Variable(targets.to(DEVICE))
            trainSamples += inputs.size(0)
            output_label, _ = model(inputVariable)
            loss = loss_fn(output_label, labelVariable)
            loss.backward()
            optimizer_fn.step()
            _, predicted = torch.max(output_label.data, 1)
            numCorrTrain += (predicted == targets.to(DEVICE)).sum()
            epoch_loss += loss.item()
        optim_scheduler.step()

        avg_loss = epoch_loss / iterPerEpoch
        trainAccuracy = (numCorrTrain / trainSamples) * 100
        print('Train: Epoch = {} | Loss = {} | Accuracy = {}'.format(
            epoch + 1, avg_loss, trainAccuracy))
        writer.add_scalar('train/epoch_loss', avg_loss, epoch + 1)
        writer.add_scalar('train/accuracy', trainAccuracy, epoch + 1)
        train_log_loss.write('Training loss after {} epoch = {}\n'.format(epoch + 1, avg_loss))
        train_log_acc.write('Training accuracy after {} epoch = {}\n'.format(epoch + 1, trainAccuracy))

        if valDataset is not None:
            if (epoch + 1) % 1 == 0:
                model.train(False)
                val_loss_epoch = 0
                val_iter = 0
                val_samples = float(0)
                numCorr = 0
                for j, (inputs, targets) in enumerate(val_loader):
                    val_iter += 1
                    val_samples += inputs.size(0)
                    with torch.no_grad():
                        inputVariable = Variable(inputs.to(DEVICE))
                        labelVariable = Variable(targets.to(DEVICE))
                        output_label, _ = model(inputVariable)
                        val_loss = loss_fn(output_label, labelVariable)
                        val_loss_epoch += val_loss.item()
                        _, predicted = torch.max(output_label.data, 1)
                        numCorr += (predicted == targets.to(DEVICE)).sum()
                val_accuracy = (numCorr / val_samples) * 100
                avg_val_loss = val_loss_epoch / val_iter
                print('Validation: Epoch = {} | Loss = {} | Accuracy = {}'.format(
                    epoch + 1, avg_val_loss, val_accuracy))
                writer.add_scalar('val/epoch_loss', avg_val_loss, epoch + 1)
                writer.add_scalar('val/accuracy', val_accuracy, epoch + 1)
                val_log_loss.write('Val Loss after {} epochs = {}\n'.format(epoch + 1, avg_val_loss))
                val_log_acc.write('Val Accuracy after {} epochs = {}%\n'.format(epoch + 1, val_accuracy))
                # Save the model whenever validation accuracy improves
                if val_accuracy > best_accuracy:
                    save_path_model = (model_folder + '/model_flow_state_dict.pth')
                    torch.save(model.state_dict(), save_path_model)
                    best_accuracy = val_accuracy
        else:
            if (epoch + 1) % 10 == 0:
                save_path_model = (model_folder + '/model_flow_state_dict_epoch' +
                                   str(epoch + 1) + '.pth')
                torch.save(model.state_dict(), save_path_model)

    train_log_loss.close()
    train_log_acc.close()
    val_log_acc.close()
    val_log_loss.close()
    writer.export_scalars_to_json(model_folder + "/all_scalars.json")
    writer.close()
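# A note on the accuracy computation above: (predicted == targets).sum()
# returns a zero-dimensional tensor, so trainAccuracy is itself a tensor.
# A small sketch of an explicitly scalar variant (names reused from the loop):
correct = (predicted == targets.to(DEVICE)).sum().item()  # plain Python int
accuracy = 100.0 * correct / max(1.0, trainSamples)       # plain float, easy to log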
                        action='store_false')
    parser.add_argument('--use-extra-linear', action='store_true')
    args = parser.parse_args()

    # MAIN
    # ======================
    hps = HParameters()
    d = args.__dict__
    d['use_cuda'] = not d['use_cpu']
    hps.load_from_args(d)
    hps.writer = SummaryWriter(logdir=hps.output_dir)

    print("Parameters:")
    print("----------------------------------------------------------------------")
    print(hps)

    if hps.train:
        train(hps)
    else:
        results = [['No', 'Split', 'Mean F-score']]
        for i, split_filename in enumerate(hps.splits):
            f_score = eval_split(hps, split_filename, data_dir=hps.output_dir)
            results.append([i + 1, split_filename,
# from pybullet_envs.bullet.racecarGymEnv import RacecarGymEnv
# from pybullet_envs.bullet.kukaGymEnv import KukaGymEnv
import time

from evaluator import Evaluator
from ddpg import DDPG
from util import *
from tensorboardX import SummaryWriter
from observation_processor import queue
from multi import fastenv
# from llll import Subprocess

gym.undo_logger_setup()

writer = SummaryWriter()


def train(num_iterations, agent, env, evaluate, validate_interval, output,
          window_length, max_episode_length=None, debug=False, visualize=False,
          traintimes=None, resume=None):
    if resume is not None:
        print('load weight')
        agent.load_weights(output)
        agent.memory.load(output)

    # Save the replay memory before exiting on Ctrl-C
    def sigint_handler(signum, frame):
        print('memory saving...')
        agent.memory.save(output)
        print('done')
        exit()
    signal.signal(signal.SIGINT, sigint_handler)
parser.add_argument('--lr_decay', type=float, default=5e-5)
parser.add_argument('--max_iter', type=int, default=160000)
parser.add_argument('--batch_size', type=int, default=8)
parser.add_argument('--style_weight', type=float, default=10.0)
parser.add_argument('--content_weight', type=float, default=1.0)
parser.add_argument('--n_threads', type=int, default=16)
parser.add_argument('--save_model_interval', type=int, default=10000)
parser.add_argument('--save_image_interval', type=int, default=100)
args = parser.parse_args()

device = torch.device('cuda')
save_dir = Path(args.save_dir)
save_dir.mkdir(exist_ok=True, parents=True)
log_dir = Path(args.log_dir)
log_dir.mkdir(exist_ok=True, parents=True)
writer = SummaryWriter(log_dir=str(log_dir))

decoder = model.decoder
vgg = model.vgg

vgg.load_state_dict(torch.load(args.vgg))
vgg = nn.Sequential(*list(vgg.children())[:31])
network = model.Net(vgg, decoder)
network.train()
network.to(device)

content_tf = train_transform()
style_tf = train_transform()

content_dataset = FlatFolderDataset(args.content_dir, content_tf)
style_dataset = FlatFolderDataset(args.style_dir, style_tf)
def main():
    args, cfg = parse_config()
    cfg.ROOT_DIR = Path(cfg.DATA_CONFIG.DATA_PATH)
    if args.launcher == 'none':
        dist_train = False
        total_gpus = 1
    else:
        total_gpus, cfg.LOCAL_RANK = getattr(common_utils, 'init_dist_%s' % args.launcher)(
            args.tcp_port, args.local_rank, backend='nccl'
        )
        dist_train = True
        print('total gpu num: %d' % total_gpus)

    if args.batch_size is None:
        args.batch_size = cfg.OPTIMIZATION.BATCH_SIZE_PER_GPU
    else:
        assert args.batch_size % total_gpus == 0, 'Batch size should be divisible by the number of gpus'
        args.batch_size = args.batch_size // total_gpus
    args.epochs = cfg.OPTIMIZATION.NUM_EPOCHS if args.epochs is None else args.epochs

    if args.fix_random_seed:
        common_utils.set_random_seed(666)

    output_dir = cfg.ROOT_DIR / 'output' / cfg.EXP_GROUP_PATH / cfg.TAG / args.extra_tag
    ckpt_dir = output_dir / 'ckpt'
    output_dir.mkdir(parents=True, exist_ok=True)
    ckpt_dir.mkdir(parents=True, exist_ok=True)

    log_file = output_dir / ('log_train_%s.txt' % datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
    logger = common_utils.create_logger(log_file, rank=cfg.LOCAL_RANK)  # log to file

    logger.info('**********************Start logging**********************')
    gpu_list = os.environ['CUDA_VISIBLE_DEVICES'] if 'CUDA_VISIBLE_DEVICES' in os.environ.keys() else 'ALL'
    logger.info('CUDA_VISIBLE_DEVICES=%s' % gpu_list)

    if dist_train:
        logger.info('total_batch_size: %d' % (total_gpus * args.batch_size))
    for key, val in vars(args).items():
        logger.info('{:16} {}'.format(key, val))
    log_config_to_file(cfg, logger=logger)
    if cfg.LOCAL_RANK == 0:
        os.system('cp %s %s' % (args.cfg_file, output_dir))

    tb_log = SummaryWriter(log_dir=str(output_dir / 'tensorboard')) if cfg.LOCAL_RANK == 0 else None

    # -----------------------create dataloader & network & optimizer---------------------------
    train_set, train_loader, train_sampler = build_dataloader(
        dataset_cfg=cfg.DATA_CONFIG,
        class_names=cfg.CLASS_NAMES,
        batch_size=args.batch_size,
        dist=dist_train, workers=args.workers,
        logger=logger,
        training=True,
        merge_all_iters_to_one_epoch=args.merge_all_iters_to_one_epoch,
        total_epochs=args.epochs
    )

    model = build_network(model_cfg=cfg.MODEL, num_class=len(cfg.CLASS_NAMES), dataset=train_set)
    if args.sync_bn:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model.cuda()

    # The adam_onecycle optimizer?
    optimizer = build_optimizer(model, cfg.OPTIMIZATION)

    # Load checkpoint if it is possible
    start_epoch = it = 0
    last_epoch = -1
    if args.pretrained_model is not None:
        model.load_params_from_file(filename=args.pretrained_model, to_cpu=dist_train, logger=logger)

    if args.ckpt is not None:
        it, start_epoch = model.load_params_with_optimizer(args.ckpt, to_cpu=dist_train,
                                                           optimizer=optimizer, logger=logger)
        last_epoch = start_epoch + 1
    else:
        ckpt_list = glob.glob(str(ckpt_dir / '*checkpoint_epoch_*.pth'))
        if len(ckpt_list) > 0:
            ckpt_list.sort(key=os.path.getmtime)
            it, start_epoch = model.load_params_with_optimizer(
                ckpt_list[-1], to_cpu=dist_train, optimizer=optimizer, logger=logger
            )
            last_epoch = start_epoch + 1

    model.train()  # before wrapping in DistributedDataParallel, to support freezing some parameters
    if dist_train:
        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[cfg.LOCAL_RANK % torch.cuda.device_count()])
    logger.info(model)

    # Learning rate: the lr_scheduler updates optimizer.lr (and optimizer.mom) via step();
    # the heavy lifting is in the optimizer built above.
    lr_scheduler, lr_warmup_scheduler = build_scheduler(
        optimizer, total_iters_each_epoch=len(train_loader), total_epochs=args.epochs,
        last_epoch=last_epoch, optim_cfg=cfg.OPTIMIZATION
    )

    # -----------------------start training---------------------------
    logger.info('**********************Start training %s/%s(%s)**********************'
                % (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))
    train_model(
        model,
        optimizer,
        train_loader,
        model_func=model_fn_decorator(),
        lr_scheduler=lr_scheduler,
        optim_cfg=cfg.OPTIMIZATION,
        start_epoch=start_epoch,
        total_epochs=args.epochs,
        start_iter=it,
        rank=cfg.LOCAL_RANK,
        tb_log=tb_log,
        ckpt_save_dir=ckpt_dir,
        train_sampler=train_sampler,
        lr_warmup_scheduler=lr_warmup_scheduler,
        ckpt_save_interval=args.ckpt_save_interval,
        max_ckpt_save_num=args.max_ckpt_save_num,
        merge_all_iters_to_one_epoch=args.merge_all_iters_to_one_epoch
    )
    logger.info('**********************End training %s/%s(%s)**********************\n\n\n'
                % (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))

    logger.info('**********************Start evaluation %s/%s(%s)**********************'
                % (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))
    test_set, test_loader, sampler = build_dataloader(
        dataset_cfg=cfg.DATA_CONFIG,
        class_names=cfg.CLASS_NAMES,
        batch_size=args.batch_size,
        dist=dist_train, workers=args.workers, logger=logger, training=False
    )
    eval_output_dir = output_dir / 'eval' / 'eval_with_train'
    eval_output_dir.mkdir(parents=True, exist_ok=True)
    args.start_epoch = max(args.epochs - 10, 0)  # Only evaluate the last 10 epochs

    repeat_eval_ckpt(
        model.module if dist_train else model,
        test_loader, args, eval_output_dir, logger, ckpt_dir,
        dist_test=dist_train
    )
    logger.info('**********************End evaluation %s/%s(%s)**********************'
                % (cfg.EXP_GROUP_PATH, cfg.TAG, args.extra_tag))
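# Worked example for the batch-size arithmetic above: launching with
# --batch_size 32 on 4 GPUs passes the divisibility assert, each process then
# loads 32 // 4 = 8 samples, and the logged total_batch_size is 4 * 8 = 32.
# A tiny illustrative check of that invariant:
total_gpus, requested = 4, 32
assert requested % total_gpus == 0
per_gpu = requested // total_gpus         # 8 samples per process
assert total_gpus * per_gpu == requested  # 32 samples per optimizer step overall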
def main():
    """Create the model and start the training."""
    device = torch.device("cuda" if not args.cpu else "cpu")

    w, h = map(int, args.input_size.split(','))
    input_size = (w, h)

    w, h = map(int, args.input_size_target.split(','))
    input_size_target = (w, h)

    cudnn.enabled = True

    # Create network
    if args.model == 'DeepLab':
        model = DeeplabMulti(num_classes=args.num_classes)
        if args.restore_from[:4] == 'http':
            saved_state_dict = model_zoo.load_url(args.restore_from)
        else:
            saved_state_dict = torch.load(args.restore_from)

        new_params = model.state_dict().copy()
        for i in saved_state_dict:
            # Example key: Scale.layer5.conv2d_list.3.weight
            i_parts = i.split('.')
            if not args.num_classes == 19 or not i_parts[1] == 'layer5':
                new_params['.'.join(i_parts[1:])] = saved_state_dict[i]
        model.load_state_dict(new_params)

    model.train()
    model.to(device)

    cudnn.benchmark = True

    # Init discriminators
    model_D1 = FCDiscriminator(num_classes=args.num_classes).to(device)
    model_D2 = FCDiscriminator(num_classes=args.num_classes).to(device)

    model_D1.train()
    model_D1.to(device)

    model_D2.train()
    model_D2.to(device)

    if not os.path.exists(args.snapshot_dir):
        os.makedirs(args.snapshot_dir)

    trainloader = data.DataLoader(
        GTA5DataSet(args.data_dir, args.data_list,
                    max_iters=args.num_steps * args.iter_size * args.batch_size,
                    crop_size=input_size,
                    scale=args.random_scale, mirror=args.random_mirror, mean=IMG_MEAN),
        batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, pin_memory=True)
    trainloader_iter = enumerate(trainloader)

    targetloader = data.DataLoader(
        cityscapesDataSet(args.data_dir_target, args.data_list_target,
                          max_iters=args.num_steps * args.iter_size * args.batch_size,
                          crop_size=input_size_target,
                          scale=False, mirror=args.random_mirror, mean=IMG_MEAN,
                          set=args.set),
        batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, pin_memory=True)
    targetloader_iter = enumerate(targetloader)

    # Implement model.optim_parameters(args) to handle different models' lr setting
    optimizer = optim.SGD(model.optim_parameters(args),
                          lr=args.learning_rate, momentum=args.momentum,
                          weight_decay=args.weight_decay)
    optimizer.zero_grad()

    optimizer_D1 = optim.Adam(model_D1.parameters(), lr=args.learning_rate_D, betas=(0.9, 0.99))
    optimizer_D1.zero_grad()

    optimizer_D2 = optim.Adam(model_D2.parameters(), lr=args.learning_rate_D, betas=(0.9, 0.99))
    optimizer_D2.zero_grad()

    if args.gan == 'Vanilla':
        bce_loss = torch.nn.BCEWithLogitsLoss()
    elif args.gan == 'LS':
        bce_loss = torch.nn.MSELoss()
    seg_loss = torch.nn.CrossEntropyLoss(ignore_index=255)

    interp = nn.Upsample(size=(input_size[1], input_size[0]),
                         mode='bilinear', align_corners=True)
    interp_target = nn.Upsample(size=(input_size_target[1], input_size_target[0]),
                                mode='bilinear', align_corners=True)

    # Labels for adversarial training
    source_label = 0
    target_label = 1

    # Set up tensorboard
    if args.tensorboard:
        if not os.path.exists(args.log_dir):
            os.makedirs(args.log_dir)
        writer = SummaryWriter(args.log_dir)

    for i_iter in range(args.num_steps):

        loss_seg_value1 = 0
        loss_adv_target_value1 = 0
        loss_D_value1 = 0

        loss_seg_value2 = 0
        loss_adv_target_value2 = 0
        loss_D_value2 = 0

        optimizer.zero_grad()
        adjust_learning_rate(optimizer, i_iter)

        optimizer_D1.zero_grad()
        optimizer_D2.zero_grad()
        adjust_learning_rate_D(optimizer_D1, i_iter)
        adjust_learning_rate_D(optimizer_D2, i_iter)

        for sub_i in range(args.iter_size):

            # train G

            # don't accumulate grads in D
            for param in model_D1.parameters():
                param.requires_grad = False
            for param in model_D2.parameters():
                param.requires_grad = False

            # train with source
            _, batch = trainloader_iter.__next__()

            images, labels, _, _ = batch
            images = images.to(device)
            labels = labels.long().to(device)

            pred1, pred2 = model(images)
            pred1 = interp(pred1)
            pred2 = interp(pred2)

            loss_seg1 = seg_loss(pred1, labels)
            loss_seg2 = seg_loss(pred2, labels)
            loss = loss_seg2 + args.lambda_seg * loss_seg1

            # proper normalization
            loss = loss / args.iter_size
            loss.backward()
            loss_seg_value1 += loss_seg1.item() / args.iter_size
            loss_seg_value2 += loss_seg2.item() / args.iter_size

            # train with target
            _, batch = targetloader_iter.__next__()
            images, _, _ = batch
            images = images.to(device)

            pred_target1, pred_target2 = model(images)
            pred_target1 = interp_target(pred_target1)
            pred_target2 = interp_target(pred_target2)

            D_out1 = model_D1(F.softmax(pred_target1, dim=1))
            D_out2 = model_D2(F.softmax(pred_target2, dim=1))

            # Fool the discriminators: target predictions labeled as source
            loss_adv_target1 = bce_loss(
                D_out1, torch.FloatTensor(D_out1.data.size()).fill_(source_label).to(device))
            loss_adv_target2 = bce_loss(
                D_out2, torch.FloatTensor(D_out2.data.size()).fill_(source_label).to(device))
            loss = args.lambda_adv_target1 * loss_adv_target1 + args.lambda_adv_target2 * loss_adv_target2
            loss = loss / args.iter_size
            loss.backward()
            loss_adv_target_value1 += loss_adv_target1.item() / args.iter_size
            loss_adv_target_value2 += loss_adv_target2.item() / args.iter_size

            # train D

            # bring back requires_grad
            for param in model_D1.parameters():
                param.requires_grad = True
            for param in model_D2.parameters():
                param.requires_grad = True

            # train with source
            pred1 = pred1.detach()
            pred2 = pred2.detach()

            D_out1 = model_D1(F.softmax(pred1, dim=1))
            D_out2 = model_D2(F.softmax(pred2, dim=1))

            loss_D1 = bce_loss(
                D_out1, torch.FloatTensor(D_out1.data.size()).fill_(source_label).to(device))
            loss_D2 = bce_loss(
                D_out2, torch.FloatTensor(D_out2.data.size()).fill_(source_label).to(device))

            loss_D1 = loss_D1 / args.iter_size / 2
            loss_D2 = loss_D2 / args.iter_size / 2

            loss_D1.backward()
            loss_D2.backward()

            loss_D_value1 += loss_D1.item()
            loss_D_value2 += loss_D2.item()

            # train with target
            pred_target1 = pred_target1.detach()
            pred_target2 = pred_target2.detach()

            D_out1 = model_D1(F.softmax(pred_target1, dim=1))
            D_out2 = model_D2(F.softmax(pred_target2, dim=1))

            loss_D1 = bce_loss(
                D_out1, torch.FloatTensor(D_out1.data.size()).fill_(target_label).to(device))
            loss_D2 = bce_loss(
                D_out2, torch.FloatTensor(D_out2.data.size()).fill_(target_label).to(device))

            loss_D1 = loss_D1 / args.iter_size / 2
            loss_D2 = loss_D2 / args.iter_size / 2

            loss_D1.backward()
            loss_D2.backward()

            loss_D_value1 += loss_D1.item()
            loss_D_value2 += loss_D2.item()

        optimizer.step()
        optimizer_D1.step()
        optimizer_D2.step()

        if args.tensorboard:
            scalar_info = {
                'loss_seg1': loss_seg_value1,
                'loss_seg2': loss_seg_value2,
                'loss_adv_target1': loss_adv_target_value1,
                'loss_adv_target2': loss_adv_target_value2,
                'loss_D1': loss_D_value1,
                'loss_D2': loss_D_value2,
            }
            if i_iter % 10 == 0:
                for key, val in scalar_info.items():
                    writer.add_scalar(key, val, i_iter)

        print('exp = {}'.format(args.snapshot_dir))
        print('iter = {0:8d}/{1:8d}, loss_seg1 = {2:.3f} loss_seg2 = {3:.3f} '
              'loss_adv1 = {4:.3f}, loss_adv2 = {5:.3f} loss_D1 = {6:.3f} loss_D2 = {7:.3f}'.format(
                  i_iter, args.num_steps, loss_seg_value1, loss_seg_value2,
                  loss_adv_target_value1, loss_adv_target_value2, loss_D_value1, loss_D_value2))

        if i_iter >= args.num_steps_stop - 1:
            print('save model ...')
            torch.save(model.state_dict(),
                       osp.join(args.snapshot_dir, 'GTA5_' + str(args.num_steps_stop) + '.pth'))
            torch.save(model_D1.state_dict(),
                       osp.join(args.snapshot_dir, 'GTA5_' + str(args.num_steps_stop) + '_D1.pth'))
            torch.save(model_D2.state_dict(),
                       osp.join(args.snapshot_dir, 'GTA5_' + str(args.num_steps_stop) + '_D2.pth'))
            break

        if i_iter % args.save_pred_every == 0 and i_iter != 0:
            print('taking snapshot ...')
            torch.save(model.state_dict(),
                       osp.join(args.snapshot_dir, 'GTA5_' + str(i_iter) + '.pth'))
            torch.save(model_D1.state_dict(),
                       osp.join(args.snapshot_dir, 'GTA5_' + str(i_iter) + '_D1.pth'))
            torch.save(model_D2.state_dict(),
                       osp.join(args.snapshot_dir, 'GTA5_' + str(i_iter) + '_D2.pth'))

    if args.tensorboard:
        writer.close()
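# Side note on the adversarial label tensors above: torch.FloatTensor(...).fill_()
# allocates on the CPU and then moves the tensor to the device. A minimal
# equivalent sketch with torch.full_like, which allocates directly with the
# output's shape, dtype, and device (names reused from the loop above):
adv_label = torch.full_like(D_out1, source_label)
loss_adv_target1 = bce_loss(D_out1, adv_label)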
def main():
    global args
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    np.random.seed(args.seed)
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)

    # global cuda; cuda = torch.device('cuda')  # uncomment this if only gpu
    # added by Shahab
    global cuda
    if torch.cuda.is_available():
        cuda = torch.device('cuda')
    else:
        cuda = torch.device('cpu')

    ### dpc model ###
    if args.model == 'dpc-rnn':
        model = DPC_RNN(sample_size=args.img_dim,
                        num_seq=args.num_seq,
                        seq_len=args.seq_len,
                        network=args.net,
                        pred_step=args.pred_step)
    elif args.model == 'dpc-plus':
        model = DPC_Plus(sample_size=args.img_dim,
                         num_seq=args.num_seq,
                         seq_len=args.seq_len,
                         network=args.net,
                         pred_step=args.pred_step)
    else:
        raise ValueError('wrong model!')

    model = nn.DataParallel(model)
    model = model.to(cuda)

    global criterion
    criterion = nn.CrossEntropyLoss()
    global criterion_aux
    global temperature
    temperature = 1

    if args.wandb:
        wandb.init(f"CPC {args.prefix}", config=args)
        wandb.watch(model)

    ### optimizer ###
    if args.train_what == 'last':
        for name, param in model.module.resnet.named_parameters():
            param.requires_grad = False
    else:
        pass  # train all layers

    print('\n===========Check Grad============')
    for name, param in model.named_parameters():
        print(name, param.requires_grad)
    print('=================================\n')

    params = model.parameters()
    optimizer = optim.Adam(params, lr=args.lr, weight_decay=args.wd)

    # Set up additional criterions
    if args.target == 'obj_categ' and (args.dataset == 'tdw' or args.dataset == 'cifar10'):
        criterion_aux = nn.CrossEntropyLoss()
    elif args.target == 'self_motion':
        criterion_aux = nn.MSELoss(reduction='sum')
        # criterion_aux = nn.L1Loss(reduction='sum')
    elif args.target == 'act_recog' and args.dataset == 'ucf101':
        criterion_aux = nn.CrossEntropyLoss()
    else:
        raise NotImplementedError(
            f"{args.target} is not a valid target variable or the selected dataset doesn't support this target variable"
        )

    args.old_lr = None
    best_acc = 0
    best_loss = 1e10
    global iteration
    iteration = 0

    ### restart training ###
    global img_path
    img_path, model_path = set_path(args)
    if os.path.exists(os.path.join(img_path, 'last.pth.tar')):
        args.resume = os.path.join(img_path, 'last.pth.tar')

    if args.resume:
        if os.path.isfile(args.resume):
            args.old_lr = float(re.search('_lr(.+?)_', args.resume).group(1))
            print("=> loading resumed checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location=torch.device('cpu'))
            args.start_epoch = checkpoint['epoch']
            iteration = checkpoint['iteration']
            # best_acc = checkpoint['best_acc']
            best_loss = checkpoint['best_loss']
            model.load_state_dict(checkpoint['state_dict'])
            if not args.reset_lr:  # if lr was not reset, load the old optimizer state
                optimizer.load_state_dict(checkpoint['optimizer'])
            else:
                print('==== Change lr from %f to %f ====' % (args.old_lr, args.lr))
            print("=> loaded resumed checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("[Warning] no checkpoint found at '{}'".format(args.resume))

    if args.pretrain:
        if os.path.isfile(args.pretrain):
            print("=> loading pretrained checkpoint '{}'".format(args.pretrain))
            checkpoint = torch.load(args.pretrain, map_location=torch.device('cpu'))
            model = neq_load_customized(model, checkpoint['state_dict'])
            print("=> loaded pretrained checkpoint '{}' (epoch {})".format(
                args.pretrain, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.pretrain))

    ### load data ###
    if args.dataset == 'ucf101':
        # designed for ucf101: short side 256, random crop to 224x224, then scale to 128x128
        transform = transforms.Compose([
            RandomHorizontalFlip(consistent=True),
            RandomCrop(size=224, consistent=True),
            Scale(size=(args.img_dim, args.img_dim)),
            RandomGray(consistent=False, p=0.5),
            ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.25, p=1.0),
            ToTensor(),
            Normalize()
        ])
    elif args.dataset == 'catcam':
        # same pipeline as ucf101: short side 256, random crop to 224x224, then scale to 128x128
        transform = transforms.Compose([
            RandomHorizontalFlip(consistent=True),
            RandomCrop(size=224, consistent=True),
            Scale(size=(args.img_dim, args.img_dim)),
            RandomGray(consistent=False, p=0.5),
            ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.25, p=1.0),
            ToTensor(),
            Normalize()
        ])
    elif args.dataset == 'k400':
        # designed for kinetics400: short side 150, random crop to 128x128
        transform = transforms.Compose([
            RandomSizedCrop(size=args.img_dim, consistent=True, p=1.0),
            RandomHorizontalFlip(consistent=True),
            RandomGray(consistent=False, p=0.5),
            ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.25, p=1.0),
            ToTensor(),
            Normalize()
        ])
    elif args.dataset == 'airsim':
        transform = transforms.Compose([
            RandomHorizontalFlip(consistent=True),
            RandomCrop(size=112, consistent=True),
            Scale(size=(args.img_dim, args.img_dim)),
            RandomGray(consistent=False, p=0.5),
            ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.25, p=1.0),
            ToTensor(),
            Normalize()
        ])
    elif args.dataset == 'tdw':
        transform = transforms.Compose([
            # RandomHorizontalFlip(consistent=True),
            # RandomCrop(size=128, consistent=True),
            Scale(size=(args.img_dim, args.img_dim)),
            # RandomGray(consistent=False, p=0.5),
            # ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.25, p=1.0),
            ToTensor(),
            Normalize(mean=[0.5036, 0.4681, 0.4737],
                      std=[0.2294, 0.2624, 0.2830])
        ])

    train_loader = get_data(transform, 'train')
    val_loader = get_data(transform, 'val')

    # setup tools
    global de_normalize
    de_normalize = denorm()
    global writer_train
    try:  # old version
        writer_val = SummaryWriter(log_dir=os.path.join(img_path, 'val'))
        writer_train = SummaryWriter(log_dir=os.path.join(img_path, 'train'))
    except TypeError:  # tensorboardX v1.7 uses `logdir` instead
        writer_val = SummaryWriter(logdir=os.path.join(img_path, 'val'))
        writer_train = SummaryWriter(logdir=os.path.join(img_path, 'train'))

    ### main loop ###
    save_checkpoint_freq = args.save_checkpoint_freq
    for epoch in range(args.start_epoch, args.epochs):
        train_loss, train_acc, train_accuracy_list, train_loss_hd = train(
            train_loader, model, optimizer, epoch)
        val_loss, val_acc, val_accuracy_list, val_loss_hd = validate(
            val_loader, model, epoch)

        if args.wandb:
            wandb.log({
                "epoch": epoch,
                "cpc train loss": train_loss,
                "cpc train accuracy top1": train_accuracy_list[0],
                "cpc val loss": val_loss,
                "cpc val accuracy top1": val_accuracy_list[0],
                "heading train loss": train_loss_hd,
                "heading val loss": val_loss_hd
            })

        # save curves
        writer_train.add_scalar('global/loss', train_loss, epoch)
        writer_train.add_scalar('global/accuracy', train_acc, epoch)
        writer_val.add_scalar('global/loss', val_loss, epoch)
        writer_val.add_scalar('global/accuracy', val_acc, epoch)
        writer_train.add_scalar('accuracy/top1', train_accuracy_list[0], epoch)
        writer_train.add_scalar('accuracy/top3', train_accuracy_list[1], epoch)
        writer_train.add_scalar('accuracy/top5', train_accuracy_list[2], epoch)
        writer_val.add_scalar('accuracy/top1', val_accuracy_list[0], epoch)
        writer_val.add_scalar('accuracy/top3', val_accuracy_list[1], epoch)
        writer_val.add_scalar('accuracy/top5', val_accuracy_list[2], epoch)

        # save checkpoint
        is_best_loss = (val_loss + val_loss_hd) < best_loss
        best_loss = min(val_loss + val_loss_hd, best_loss)
        # is_best = val_acc > best_acc; best_acc = max(val_acc, best_acc)
        save_this = (epoch % save_checkpoint_freq == 0)

        save_checkpoint(
            {
                'epoch': epoch + 1,
                'net': args.net,
                'state_dict': model.state_dict(),
                'best_loss': best_loss,
                # 'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
                'iteration': iteration
            },
            is_best_loss,
            filename=os.path.join(model_path, 'epoch%s.pth.tar' % str(epoch + 1)),
            keep_all=save_this)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'net': args.net,
                'state_dict': model.state_dict(),
                'best_loss': best_loss,
                # 'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
                'iteration': iteration
            },
            is_best_loss,
            filename=os.path.join(model_path, 'last.pth.tar'),
            keep_all=save_this)

    print('Training from ep %d to ep %d finished' % (args.start_epoch, args.epochs))
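# The two save_checkpoint calls above differ only in the target filename.
# A hedged refactor sketch (assuming save_checkpoint's signature exactly as
# used above) that builds the state dict once and writes both files:
state = {
    'epoch': epoch + 1,
    'net': args.net,
    'state_dict': model.state_dict(),
    'best_loss': best_loss,
    'optimizer': optimizer.state_dict(),
    'iteration': iteration,
}
for fname in ('epoch%s.pth.tar' % str(epoch + 1), 'last.pth.tar'):
    save_checkpoint(state, is_best_loss,
                    filename=os.path.join(model_path, fname),
                    keep_all=save_this)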
def main():
    # Arguments
    parser = argparse.ArgumentParser(
        description='High Quality Monocular Depth Estimation via Transfer Learning')
    parser.add_argument('--epochs', default=20, type=int, help='number of total epochs to run')
    parser.add_argument('--lr', '--learning-rate', default=0.0001, type=float, help='initial learning rate')
    parser.add_argument('--bs', default=4, type=int, help='batch size')
    args = parser.parse_args()

    # Create model
    model = Model().cuda()
    print('Model created.')

    # Training parameters
    optimizer = torch.optim.Adam(model.parameters(), args.lr)
    batch_size = args.bs
    prefix = 'densenet_' + str(batch_size)

    # Load data
    train_loader, test_loader = getTrainingTestingData(batch_size=batch_size)

    # Logging
    writer = SummaryWriter(comment='{}-lr{}-e{}-bs{}'.format(prefix, args.lr, args.epochs, args.bs),
                           flush_secs=30)

    # Loss
    l1_criterion = nn.L1Loss()

    # Start training...
    for epoch in range(args.epochs):
        batch_time = AverageMeter()
        losses = AverageMeter()
        N = len(train_loader)

        # Switch to train mode
        model.train()

        end = time.time()

        for i, sample_batched in enumerate(train_loader):
            optimizer.zero_grad()

            # Prepare sample and target
            image = torch.autograd.Variable(sample_batched['image'].cuda())
            depth = torch.autograd.Variable(sample_batched['depth'].cuda(non_blocking=True))

            # Normalize depth
            depth_n = DepthNorm(depth)

            # Predict
            output = model(image)

            # Compute the loss
            l_depth = l1_criterion(output, depth_n)
            l_ssim = torch.clamp((1 - ssim(output, depth_n, val_range=1000.0 / 10.0)) * 0.5, 0, 1)
            loss = (1.0 * l_ssim) + (0.1 * l_depth)

            # Update step
            losses.update(loss.data.item(), image.size(0))
            loss.backward()
            optimizer.step()

            # Measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            eta = str(datetime.timedelta(seconds=int(batch_time.val * (N - i))))

            # Log progress
            niter = epoch * N + i
            if i % 5 == 0:
                # Print to console
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.sum:.3f})\t'
                      'ETA {eta}\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})'
                      .format(epoch, i, N, batch_time=batch_time, loss=losses, eta=eta))

                # Log to tensorboard
                writer.add_scalar('Train/Loss', losses.val, niter)

            if i % 300 == 0:
                LogProgress(model, writer, test_loader, niter)

        # Record the epoch's intermediate results
        LogProgress(model, writer, test_loader, niter)
        writer.add_scalar('Train/Loss.avg', losses.avg, epoch)
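# The loop above relies on an AverageMeter helper whose definition is not
# shown in this excerpt. A minimal sketch of the usual implementation (an
# assumption that matches the .val / .avg / .sum / .update(value, n) usage):
class AverageMeter(object):
    """Tracks the most recent value and a running (weighted) average."""

    def __init__(self):
        self.val = 0.0   # last value seen
        self.sum = 0.0   # weighted sum of all values
        self.count = 0   # total weight
        self.avg = 0.0   # running average

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count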
def train(FLAGS):
    """
    FLAGS:
        saveto: str
        reload: store_true
        config_path: str
        pretrain_path: str, default=""
        model_name: str
        log_path: str
    """
    # Write the training log to a file.
    write_log_to_file(
        os.path.join(FLAGS.log_path, "%s.log" % time.strftime("%Y%m%d-%H%M%S")))

    GlobalNames.USE_GPU = FLAGS.use_gpu
    if GlobalNames.USE_GPU:
        CURRENT_DEVICE = "cuda:0"
    else:
        CURRENT_DEVICE = "cpu"

    config_path = os.path.abspath(FLAGS.config_path)
    with open(config_path.strip()) as f:
        configs = yaml.load(f)

    INFO(pretty_configs(configs))

    # Add default configs
    configs = default_configs(configs)
    data_configs = configs['data_configs']
    model_configs = configs['model_configs']
    optimizer_configs = configs['optimizer_configs']
    training_configs = configs['training_configs']

    GlobalNames.SEED = training_configs['seed']
    set_seed(GlobalNames.SEED)

    best_model_prefix = os.path.join(
        FLAGS.saveto, FLAGS.model_name + GlobalNames.MY_BEST_MODEL_SUFFIX)

    timer = Timer()

    # ================================================================================== #
    # Load Data

    INFO('Loading data...')
    timer.tic()

    # Generate source and target dictionaries
    vocab_src = Vocabulary(**data_configs["vocabularies"][0])
    vocab_tgt = Vocabulary(**data_configs["vocabularies"][1])

    train_batch_size = training_configs["batch_size"] * max(1, training_configs["update_cycle"])
    train_buffer_size = training_configs["buffer_size"] * max(1, training_configs["update_cycle"])

    train_bitext_dataset = ZipDataset(
        TextLineDataset(
            data_path=data_configs['train_data'][0],
            vocabulary=vocab_src,
            max_len=data_configs['max_len'][0],
        ),
        TextLineDataset(
            data_path=data_configs['train_data'][1],
            vocabulary=vocab_tgt,
            max_len=data_configs['max_len'][1],
        ),
        shuffle=training_configs['shuffle'])

    valid_bitext_dataset = ZipDataset(
        TextLineDataset(
            data_path=data_configs['valid_data'][0],
            vocabulary=vocab_src,
        ),
        TextLineDataset(
            data_path=data_configs['valid_data'][1],
            vocabulary=vocab_tgt,
        ))

    training_iterator = DataIterator(
        dataset=train_bitext_dataset,
        batch_size=train_batch_size,
        use_bucket=training_configs['use_bucket'],
        buffer_size=train_buffer_size,
        batching_func=training_configs['batching_key'])

    valid_iterator = DataIterator(
        dataset=valid_bitext_dataset,
        batch_size=training_configs['valid_batch_size'],
        use_bucket=True,
        buffer_size=100000,
        numbering=True)

    bleu_scorer = SacreBLEUScorer(
        reference_path=data_configs["bleu_valid_reference"],
        num_refs=data_configs["num_refs"],
        lang_pair=data_configs["lang_pair"],
        sacrebleu_args=training_configs["bleu_valid_configs"]['sacrebleu_args'],
        postprocess=training_configs["bleu_valid_configs"]['postprocess'])

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    lrate = optimizer_configs['learning_rate']
    is_early_stop = False

    # ================================ Begin ======================================== #
    # Build Model & Optimizer
    # We do the steps below one after another:
    # 1. build model & criterion
    # 2. move model & criterion to GPU if needed
    # 3. load pre-trained model if needed
    # 4. build optimizer
    # 5. build learning rate scheduler if needed
    # 6. load checkpoints if needed

    # 0. Initial
    model_collections = Collections()
    checkpoint_saver = Saver(
        save_prefix="{0}.ckpt".format(os.path.join(FLAGS.saveto, FLAGS.model_name)),
        num_max_keeping=training_configs['num_kept_checkpoints'])
    best_model_saver = Saver(
        save_prefix=best_model_prefix,
        num_max_keeping=training_configs['num_kept_best_model'])

    # 1. Build Model & Criterion
    INFO('Building model...')
    timer.tic()
    nmt_model = build_model(n_src_vocab=vocab_src.max_n_words,
                            n_tgt_vocab=vocab_tgt.max_n_words,
                            **model_configs)
    INFO(nmt_model)

    critic = NMTCriterion(label_smoothing=model_configs['label_smoothing'])
    INFO(critic)
    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # 2. Move to GPU
    if GlobalNames.USE_GPU:
        nmt_model = nmt_model.cuda()
        critic = critic.cuda()

    # 3. Load pretrained model if needed
    load_pretrained_model(nmt_model, FLAGS.pretrain_path,
                          exclude_prefix=None, device=CURRENT_DEVICE)

    # 4. Build optimizer
    INFO('Building Optimizer...')
    optim = Optimizer(name=optimizer_configs['optimizer'],
                      model=nmt_model,
                      lr=lrate,
                      grad_clip=optimizer_configs['grad_clip'],
                      optim_args=optimizer_configs['optimizer_params'])

    # 5. Build scheduler for optimizer if needed
    if optimizer_configs['schedule_method'] is not None:
        if optimizer_configs['schedule_method'] == "loss":
            scheduler = ReduceOnPlateauScheduler(
                optimizer=optim, **optimizer_configs["scheduler_configs"])
        elif optimizer_configs['schedule_method'] == "noam":
            scheduler = NoamScheduler(optimizer=optim,
                                      **optimizer_configs['scheduler_configs'])
        else:
            WARN("Unknown scheduler name {0}. Do not use lr_scheduling.".format(
                optimizer_configs['schedule_method']))
            scheduler = None
    else:
        scheduler = None

    # 6. Build moving average
    if training_configs['moving_average_method'] is not None:
        ma = MovingAverage(
            moving_average_method=training_configs['moving_average_method'],
            named_params=nmt_model.named_parameters(),
            alpha=training_configs['moving_average_alpha'])
    else:
        ma = None

    INFO('Done. Elapsed time {0}'.format(timer.toc()))

    # Reload from the latest checkpoint
    if FLAGS.reload:
        checkpoint_saver.load_latest(model=nmt_model,
                                     optim=optim,
                                     lr_scheduler=scheduler,
                                     collections=model_collections,
                                     ma=ma)

    # ================================================================================== #
    # Prepare training

    eidx = model_collections.get_collection("eidx", [0])[-1]
    uidx = model_collections.get_collection("uidx", [0])[-1]
    bad_count = model_collections.get_collection("bad_count", [0])[-1]
    oom_count = model_collections.get_collection("oom_count", [0])[-1]

    summary_writer = SummaryWriter(log_dir=FLAGS.log_path)

    cum_samples = 0
    cum_words = 0
    best_valid_loss = 1.0 * 1e10  # Max Float
    saving_files = []

    # Timer for computing speed
    timer_for_speed = Timer()
    timer_for_speed.tic()

    INFO('Begin training...')

    while True:
        summary_writer.add_scalar("Epoch", (eidx + 1), uidx)

        # Build iterator and progress bar
        training_iter = training_iterator.build_generator()
        training_progress_bar = tqdm(desc=' - (Epoch %d) ' % eidx,
                                     total=len(training_iterator),
                                     unit="sents")
        for batch in training_iter:
            uidx += 1

            if scheduler is None:
                pass
            elif optimizer_configs["schedule_method"] == "loss":
                scheduler.step(metric=best_valid_loss)
            else:
                scheduler.step(global_step=uidx)

            seqs_x, seqs_y = batch
            n_samples_t = len(seqs_x)
            n_words_t = sum(len(s) for s in seqs_y)
            cum_samples += n_samples_t
            cum_words += n_words_t
            training_progress_bar.update(n_samples_t)

            optim.zero_grad()
            try:
                # Prepare data
                for seqs_x_t, seqs_y_t in split_shard(
                        seqs_x, seqs_y, split_size=training_configs['update_cycle']):
                    x, y = prepare_data(seqs_x_t, seqs_y_t, cuda=GlobalNames.USE_GPU)
                    loss = compute_forward(
                        model=nmt_model,
                        critic=critic,
                        seqs_x=x,
                        seqs_y=y,
                        eval=False,
                        normalization=n_samples_t,
                        norm_by_words=training_configs["norm_by_words"])
                optim.step()
            except RuntimeError as e:
                if 'out of memory' in str(e):
                    print('| WARNING: ran out of memory, skipping batch')
                    oom_count += 1
                    optim.zero_grad()
                else:
                    raise e

            if ma is not None and eidx >= training_configs['moving_average_start_epoch']:
                ma.step()

            # ================================================================================== #
            # Display some information
            if should_trigger_by_steps(uidx, eidx,
                                       every_n_step=training_configs['disp_freq']):
                # words per second and sents per second
                words_per_sec = cum_words / (timer.toc(return_seconds=True))
                sents_per_sec = cum_samples / (timer.toc(return_seconds=True))
                lrate = list(optim.get_lrate())[0]

                summary_writer.add_scalar("Speed(words/sec)", scalar_value=words_per_sec, global_step=uidx)
                summary_writer.add_scalar("Speed(sents/sec)", scalar_value=sents_per_sec, global_step=uidx)
                summary_writer.add_scalar("lrate", scalar_value=lrate, global_step=uidx)
                summary_writer.add_scalar("oom_count", scalar_value=oom_count, global_step=uidx)

                # Reset timer
                timer.tic()
                cum_words = 0
                cum_samples = 0

            # ================================================================================== #
            # Saving checkpoints
            if should_trigger_by_steps(uidx, eidx,
                                       every_n_step=training_configs['save_freq'],
                                       debug=FLAGS.debug):
                model_collections.add_to_collection("uidx", uidx)
                model_collections.add_to_collection("eidx", eidx)
                model_collections.add_to_collection("bad_count", bad_count)

                if not is_early_stop:
                    checkpoint_saver.save(global_step=uidx,
                                          model=nmt_model,
                                          optim=optim,
                                          lr_scheduler=scheduler,
                                          collections=model_collections,
                                          ma=ma)

            # ================================================================================== #
            # Loss Validation & Learning rate annealing
            if should_trigger_by_steps(global_step=uidx, n_epoch=eidx,
                                       every_n_step=training_configs['loss_valid_freq'],
                                       debug=FLAGS.debug):
                if ma is not None:
                    origin_state_dict = deepcopy(nmt_model.state_dict())
                    nmt_model.load_state_dict(ma.export_ma_params(), strict=False)

                valid_loss = loss_validation(
                    model=nmt_model,
                    critic=critic,
                    valid_iterator=valid_iterator,
                )

                model_collections.add_to_collection("history_losses", valid_loss)
                min_history_loss = np.array(
                    model_collections.get_collection("history_losses")).min()

                summary_writer.add_scalar("loss", valid_loss, global_step=uidx)
                summary_writer.add_scalar("best_loss", min_history_loss, global_step=uidx)

                best_valid_loss = min_history_loss

                if ma is not None:
                    nmt_model.load_state_dict(origin_state_dict)
                    del origin_state_dict

            # ================================================================================== #
            # BLEU Validation & Early Stop
            if should_trigger_by_steps(global_step=uidx, n_epoch=eidx,
                                       every_n_step=training_configs['bleu_valid_freq'],
                                       min_step=training_configs['bleu_valid_warmup'],
                                       debug=FLAGS.debug):
                if ma is not None:
                    origin_state_dict = deepcopy(nmt_model.state_dict())
                    nmt_model.load_state_dict(ma.export_ma_params(), strict=False)

                valid_bleu = bleu_validation(
                    uidx=uidx,
                    valid_iterator=valid_iterator,
                    batch_size=training_configs["bleu_valid_batch_size"],
                    model=nmt_model,
                    bleu_scorer=bleu_scorer,
                    vocab_tgt=vocab_tgt,
                    valid_dir=FLAGS.valid_path,
                    max_steps=training_configs["bleu_valid_configs"]["max_steps"],
                    beam_size=training_configs["bleu_valid_configs"]["beam_size"],
                    alpha=training_configs["bleu_valid_configs"]["alpha"])

                model_collections.add_to_collection(key="history_bleus", value=valid_bleu)
                best_valid_bleu = float(
                    np.array(model_collections.get_collection("history_bleus")).max())

                summary_writer.add_scalar("bleu", valid_bleu, uidx)
                summary_writer.add_scalar("best_bleu", best_valid_bleu, uidx)

                # If the model reaches a new best valid BLEU score
                if valid_bleu >= best_valid_bleu:
                    bad_count = 0
                    if is_early_stop is False:
                        # 1. save the best model
                        torch.save(nmt_model.state_dict(), best_model_prefix + ".final")
                        # 2. record the several best models
                        best_model_saver.save(global_step=uidx, model=nmt_model)
                else:
                    bad_count += 1
                    # At least one epoch should be traversed
                    if bad_count >= training_configs['early_stop_patience'] and eidx > 0:
                        is_early_stop = True
                        WARN("Early Stop!")

                summary_writer.add_scalar("bad_count", bad_count, uidx)

                if ma is not None:
                    nmt_model.load_state_dict(origin_state_dict)
                    del origin_state_dict

                INFO("{0} Loss: {1:.2f} BLEU: {2:.2f} lrate: {3:6f} patience: {4}"
                     .format(uidx, valid_loss, valid_bleu, lrate, bad_count))

        training_progress_bar.close()

        eidx += 1
        if eidx > training_configs["max_epochs"]:
            break
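# Both validation blocks above repeat the same swap/validate/restore dance
# around the moving average. A hedged sketch of that pattern as a context
# manager (ma.export_ma_params() is used exactly as above; the helper itself
# is illustrative and not part of the original code):
from contextlib import contextmanager
from copy import deepcopy

@contextmanager
def ma_swapped_in(model, ma):
    """Temporarily load moving-average parameters into `model`."""
    if ma is None:
        yield
        return
    backup = deepcopy(model.state_dict())
    model.load_state_dict(ma.export_ma_params(), strict=False)
    try:
        yield
    finally:
        model.load_state_dict(backup)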
    return parser


if __name__ == "__main__":
    args = get_parser().parse_args()
    name = "%s" % args.comment.replace('/', '_')
    try:
        args.device = [int(item) for item in args.device.split(',')]
    except AttributeError:
        args.device = [int(args.device)]
    setup_runtime(seed=42, cuda_dev_id=args.device)
    print(args, flush=True)
    print()
    print(name, flush=True)

    writer = SummaryWriter('./runs/%s/%s' % (args.data, name))
    writer.add_text(
        'args',
        " \n".join(['%s %s' % (arg, getattr(args, arg)) for arg in vars(args)]))

    # Set up model and train_loader
    print('Commencing!', flush=True)
    model, train_loader = return_model_loader(args)
    train_loader = RotationDataLoader(args.imagenet_path, is_validation=False,
                                      crop_size=224, batch_size=args.batch_size,
                                      num_workers=args.workers,
                                      shuffle=True)
# From the total number of iterations, derive how many passes over the dataset (epochs) are needed
epochs = int(args.iters // len(dataloader))

print("[*] Start training model based on MSE loss.")
print(f"[*] Generator pre-training for {epochs} epochs.")

# Write the header of the training-loss CSV log.
if args.start_epoch == 0:
    with open(f"FSRCNN_{args.upscale_factor}x_Loss.csv", "w+") as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(["Epoch", "MSE Loss"])

# Creates a GradScaler once at the beginning of training.
scaler = amp.GradScaler()

# Start writing the training log.
writer = SummaryWriter("logs")
print("Run `tensorboard --logdir=./logs` to view the training log.")

for epoch in range(args.start_epoch, epochs):
    progress_bar = tqdm(enumerate(dataloader), total=len(dataloader))
    avg_loss = 0.
    for iteration, (input, target) in progress_bar:
        optimizer.zero_grad()

        lr, hr = input.to(device), target.to(device)

        # Runs the forward pass with autocasting.
        # with amp.autocast():
        sr = model(lr)
        loss = criterion(sr, hr)
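# The GradScaler above is created but the autocast context is commented out,
# and the excerpt ends before the backward pass. For reference, a minimal
# sketch of the canonical torch.cuda.amp step (names reused from the loop;
# this is not the original file's continuation):
with amp.autocast():                  # run the forward pass in mixed precision
    sr = model(lr)
    loss = criterion(sr, hr)
scaler.scale(loss).backward()         # scale the loss to avoid fp16 underflow
scaler.step(optimizer)                # unscales gradients, then optimizer.step()
scaler.update()                       # adjust the scale factor for the next step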
def train_net(cfg): # Enable the inbuilt cudnn auto-tuner to find the best algorithm to use torch.backends.cudnn.benchmark = True # Set up data augmentation IMG_SIZE = cfg.CONST.IMG_H, cfg.CONST.IMG_W CROP_SIZE = cfg.CONST.CROP_IMG_H, cfg.CONST.CROP_IMG_W train_transforms = utils.data_transforms.Compose([ utils.data_transforms.RandomCrop(IMG_SIZE, CROP_SIZE), utils.data_transforms.RandomBackground( cfg.TRAIN.RANDOM_BG_COLOR_RANGE), utils.data_transforms.ColorJitter(cfg.TRAIN.BRIGHTNESS, cfg.TRAIN.CONTRAST, cfg.TRAIN.SATURATION), utils.data_transforms.RandomNoise(cfg.TRAIN.NOISE_STD), utils.data_transforms.Normalize(mean=cfg.DATASET.MEAN, std=cfg.DATASET.STD), utils.data_transforms.RandomFlip(), utils.data_transforms.RandomPermuteRGB(), utils.data_transforms.ToTensor(), ]) val_transforms = utils.data_transforms.Compose([ utils.data_transforms.CenterCrop(IMG_SIZE, CROP_SIZE), utils.data_transforms.RandomBackground(cfg.TEST.RANDOM_BG_COLOR_RANGE), utils.data_transforms.Normalize(mean=cfg.DATASET.MEAN, std=cfg.DATASET.STD), utils.data_transforms.ToTensor(), ]) # Set up data loader train_dataset_loader = utils.data_loaders.DATASET_LOADER_MAPPING[ cfg.DATASET.TRAIN_DATASET](cfg) val_dataset_loader = utils.data_loaders.DATASET_LOADER_MAPPING[ cfg.DATASET.TEST_DATASET](cfg) train_data_loader = torch.utils.data.DataLoader( dataset=train_dataset_loader.get_dataset( utils.data_loaders.DatasetType.TRAIN, cfg.CONST.N_VIEWS_RENDERING, train_transforms), batch_size=cfg.CONST.BATCH_SIZE, num_workers=cfg.TRAIN.NUM_WORKER, pin_memory=True, shuffle=True, drop_last=True) val_data_loader = torch.utils.data.DataLoader( dataset=val_dataset_loader.get_dataset( utils.data_loaders.DatasetType.VAL, cfg.CONST.N_VIEWS_RENDERING, val_transforms), batch_size=1, num_workers=1, pin_memory=True, shuffle=False) # Set up networks encoder = Encoder(cfg) decoder = Decoder(cfg) refiner = Refiner(cfg) merger = Merger(cfg) print('[DEBUG] %s Parameters in Encoder: %d.' % (dt.now(), utils.network_utils.count_parameters(encoder))) print('[DEBUG] %s Parameters in Decoder: %d.' % (dt.now(), utils.network_utils.count_parameters(decoder))) print('[DEBUG] %s Parameters in Refiner: %d.' % (dt.now(), utils.network_utils.count_parameters(refiner))) print('[DEBUG] %s Parameters in Merger: %d.' 
% (dt.now(), utils.network_utils.count_parameters(merger))) # Initialize weights of networks encoder.apply(utils.network_utils.init_weights) decoder.apply(utils.network_utils.init_weights) refiner.apply(utils.network_utils.init_weights) merger.apply(utils.network_utils.init_weights) # Set up solver if cfg.TRAIN.POLICY == 'adam': encoder_solver = torch.optim.Adam(filter(lambda p: p.requires_grad, encoder.parameters()), lr=cfg.TRAIN.ENCODER_LEARNING_RATE, betas=cfg.TRAIN.BETAS) decoder_solver = torch.optim.Adam(decoder.parameters(), lr=cfg.TRAIN.DECODER_LEARNING_RATE, betas=cfg.TRAIN.BETAS) refiner_solver = torch.optim.Adam(refiner.parameters(), lr=cfg.TRAIN.REFINER_LEARNING_RATE, betas=cfg.TRAIN.BETAS) merger_solver = torch.optim.Adam(merger.parameters(), lr=cfg.TRAIN.MERGER_LEARNING_RATE, betas=cfg.TRAIN.BETAS) elif cfg.TRAIN.POLICY == 'sgd': encoder_solver = torch.optim.SGD(filter(lambda p: p.requires_grad, encoder.parameters()), lr=cfg.TRAIN.ENCODER_LEARNING_RATE, momentum=cfg.TRAIN.MOMENTUM) decoder_solver = torch.optim.SGD(decoder.parameters(), lr=cfg.TRAIN.DECODER_LEARNING_RATE, momentum=cfg.TRAIN.MOMENTUM) refiner_solver = torch.optim.SGD(refiner.parameters(), lr=cfg.TRAIN.REFINER_LEARNING_RATE, momentum=cfg.TRAIN.MOMENTUM) merger_solver = torch.optim.SGD(merger.parameters(), lr=cfg.TRAIN.MERGER_LEARNING_RATE, momentum=cfg.TRAIN.MOMENTUM) else: raise Exception('[FATAL] %s Unknown optimizer %s.' % (dt.now(), cfg.TRAIN.POLICY)) # Set up learning rate scheduler to decay learning rates dynamically encoder_lr_scheduler = torch.optim.lr_scheduler.MultiStepLR( encoder_solver, milestones=cfg.TRAIN.ENCODER_LR_MILESTONES, gamma=cfg.TRAIN.GAMMA) decoder_lr_scheduler = torch.optim.lr_scheduler.MultiStepLR( decoder_solver, milestones=cfg.TRAIN.DECODER_LR_MILESTONES, gamma=cfg.TRAIN.GAMMA) refiner_lr_scheduler = torch.optim.lr_scheduler.MultiStepLR( refiner_solver, milestones=cfg.TRAIN.REFINER_LR_MILESTONES, gamma=cfg.TRAIN.GAMMA) merger_lr_scheduler = torch.optim.lr_scheduler.MultiStepLR( merger_solver, milestones=cfg.TRAIN.MERGER_LR_MILESTONES, gamma=cfg.TRAIN.GAMMA) if torch.cuda.is_available(): encoder = torch.nn.DataParallel(encoder).cuda() decoder = torch.nn.DataParallel(decoder).cuda() refiner = torch.nn.DataParallel(refiner).cuda() merger = torch.nn.DataParallel(merger).cuda() # Set up loss functions bce_loss = torch.nn.BCELoss() # Load pretrained model if exists init_epoch = 0 best_iou = -1 best_epoch = -1 if 'WEIGHTS' in cfg.CONST and cfg.TRAIN.RESUME_TRAIN: print('[INFO] %s Recovering from %s ...' % (dt.now(), cfg.CONST.WEIGHTS)) checkpoint = torch.load(cfg.CONST.WEIGHTS) init_epoch = checkpoint['epoch_idx'] best_iou = checkpoint['best_iou'] best_epoch = checkpoint['best_epoch'] encoder.load_state_dict(checkpoint['encoder_state_dict']) decoder.load_state_dict(checkpoint['decoder_state_dict']) if cfg.NETWORK.USE_REFINER: refiner.load_state_dict(checkpoint['refiner_state_dict']) if cfg.NETWORK.USE_MERGER: merger.load_state_dict(checkpoint['merger_state_dict']) print( '[INFO] %s Recover complete. Current epoch #%d, Best IoU = %.4f at epoch #%d.' 
% (dt.now(), init_epoch, best_iou, best_epoch))

    # Summary writer for TensorBoard
    output_dir = os.path.join(cfg.DIR.OUT_PATH, '%s', dt.now().isoformat())
    log_dir = output_dir % 'logs'
    ckpt_dir = output_dir % 'checkpoints'
    train_writer = SummaryWriter(os.path.join(log_dir, 'train'))
    val_writer = SummaryWriter(os.path.join(log_dir, 'test'))

    # Training loop
    for epoch_idx in range(init_epoch, cfg.TRAIN.NUM_EPOCHES):
        # Tick / tock
        epoch_start_time = time()

        # Batch average metrics
        batch_time = utils.network_utils.AverageMeter()
        data_time = utils.network_utils.AverageMeter()
        encoder_losses = utils.network_utils.AverageMeter()
        refiner_losses = utils.network_utils.AverageMeter()

        # switch models to training mode
        encoder.train()
        decoder.train()
        merger.train()
        refiner.train()

        batch_end_time = time()
        n_batches = len(train_data_loader)
        for batch_idx, (taxonomy_names, sample_names, rendering_images,
                        ground_truth_volumes) in enumerate(train_data_loader):
            # Measure data time
            data_time.update(time() - batch_end_time)

            # Get data from data loader
            rendering_images = utils.network_utils.var_or_cuda(rendering_images)
            ground_truth_volumes = utils.network_utils.var_or_cuda(ground_truth_volumes)

            # Train the encoder, decoder, refiner, and merger
            image_features = encoder(rendering_images)
            raw_features, generated_volumes = decoder(image_features)

            if cfg.NETWORK.USE_MERGER and epoch_idx >= cfg.TRAIN.EPOCH_START_USE_MERGER:
                generated_volumes = merger(raw_features, generated_volumes)
            else:
                generated_volumes = torch.mean(generated_volumes, dim=1)
            encoder_loss = bce_loss(generated_volumes, ground_truth_volumes) * 10

            if cfg.NETWORK.USE_REFINER and epoch_idx >= cfg.TRAIN.EPOCH_START_USE_REFINER:
                generated_volumes = refiner(generated_volumes)
                refiner_loss = bce_loss(generated_volumes, ground_truth_volumes) * 10
            else:
                refiner_loss = encoder_loss

            # Gradient descent
            encoder.zero_grad()
            decoder.zero_grad()
            refiner.zero_grad()
            merger.zero_grad()
            if cfg.NETWORK.USE_REFINER and epoch_idx >= cfg.TRAIN.EPOCH_START_USE_REFINER:
                encoder_loss.backward(retain_graph=True)
                refiner_loss.backward()
            else:
                encoder_loss.backward()
            encoder_solver.step()
            decoder_solver.step()
            refiner_solver.step()
            merger_solver.step()

            # Append loss to average metrics
            encoder_losses.update(encoder_loss.item())
            refiner_losses.update(refiner_loss.item())
            # Append loss to TensorBoard
            n_itr = epoch_idx * n_batches + batch_idx
            train_writer.add_scalar('EncoderDecoder/BatchLoss', encoder_loss.item(), n_itr)
            train_writer.add_scalar('Refiner/BatchLoss', refiner_loss.item(), n_itr)

            # Tick / tock
            batch_time.update(time() - batch_end_time)
            batch_end_time = time()
            print('[INFO] %s [Epoch %d/%d][Batch %d/%d] BatchTime = %.3f (s) DataTime = %.3f (s) EDLoss = %.4f RLoss = %.4f'
                  % (dt.now(), epoch_idx + 1, cfg.TRAIN.NUM_EPOCHES, batch_idx + 1, n_batches,
                     batch_time.val, data_time.val, encoder_loss.item(), refiner_loss.item()))

        # Append epoch loss to TensorBoard
        train_writer.add_scalar('EncoderDecoder/EpochLoss', encoder_losses.avg, epoch_idx + 1)
        train_writer.add_scalar('Refiner/EpochLoss', refiner_losses.avg, epoch_idx + 1)

        # Adjust learning rate
        encoder_lr_scheduler.step()
        decoder_lr_scheduler.step()
        refiner_lr_scheduler.step()
        merger_lr_scheduler.step()

        # Tick / tock
        epoch_end_time = time()
        print('[INFO] %s Epoch [%d/%d] EpochTime = %.3f (s) EDLoss = %.4f RLoss = %.4f'
              % (dt.now(), epoch_idx + 1, cfg.TRAIN.NUM_EPOCHES,
                 epoch_end_time - epoch_start_time, encoder_losses.avg, refiner_losses.avg))

        # Update rendering views (takes effect from the next epoch onwards)
        if cfg.TRAIN.UPDATE_N_VIEWS_RENDERING:
            n_views_rendering = random.randint(1, cfg.CONST.N_VIEWS_RENDERING)
            train_data_loader.dataset.set_n_views_rendering(n_views_rendering)
            print('[INFO] %s Epoch [%d/%d] Update #RenderingViews to %d'
                  % (dt.now(), epoch_idx + 2, cfg.TRAIN.NUM_EPOCHES, n_views_rendering))

        # Validate the training models
        iou = test_net(cfg, epoch_idx + 1, output_dir, val_data_loader, val_writer,
                       encoder, decoder, refiner, merger)

        # Save weights to file
        if (epoch_idx + 1) % cfg.TRAIN.SAVE_FREQ == 0:
            if not os.path.exists(ckpt_dir):
                os.makedirs(ckpt_dir)
            utils.network_utils.save_checkpoints(
                cfg, os.path.join(ckpt_dir, 'ckpt-epoch-%04d.pth' % (epoch_idx + 1)),
                epoch_idx + 1, encoder, encoder_solver, decoder, decoder_solver,
                refiner, refiner_solver, merger, merger_solver, best_iou, best_epoch)
        if iou > best_iou:
            if not os.path.exists(ckpt_dir):
                os.makedirs(ckpt_dir)
            best_iou = iou
            best_epoch = epoch_idx + 1
            utils.network_utils.save_checkpoints(
                cfg, os.path.join(ckpt_dir, 'best-ckpt.pth'), epoch_idx + 1,
                encoder, encoder_solver, decoder, decoder_solver,
                refiner, refiner_solver, merger, merger_solver, best_iou, best_epoch)

    # Close SummaryWriter for TensorBoard
    train_writer.close()
    val_writer.close()
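# The loop above relies on utils.network_utils.AverageMeter for running batch
# statistics. A minimal sketch of a meter with the interface used here
# (.update(val), .val, .avg) -- the repo's own class may differ in details:
class SketchAverageMeter:
    def __init__(self):
        self.val = 0.0   # most recent value
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0   # running mean of all values seen so far

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count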
def main():
    global args, best_prec1
    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.FullLoader)  # explicit Loader required by PyYAML >= 5
    for k, v in config['common'].items():
        setattr(args, k, v)

    torch.cuda.manual_seed(int(time.time()) % 1000)

    # create model
    print("=> creating model '{}'".format(args.arch))
    if args.arch.startswith('inception_v3'):
        print('inception_v3 with aux_logits')
        image_size = 341
        input_size = 299
        student_model = models.__dict__[args.arch](aux_logits=True,
                                                   num_classes=1000,
                                                   pretrained=args.pretrained)
    else:
        image_size = 256
        input_size = 226
        student_model = models.__dict__[args.arch](
            num_classes=args.num_classes,
            pretrained=args.pretrained,
            avgpool_size=input_size // 32)  # integer division: the pool size must be an int
    student_model.cuda()

    student_params = list(student_model.parameters())
    student_optimizer = torch.optim.Adam(student_model.parameters(), args.base_lr * 0.1)

    args.save_path = "checkpoint/" + args.exp_name
    if not osp.exists(args.save_path):
        os.mkdir(args.save_path)
    tb_logger = SummaryWriter(args.save_path)
    logger = create_logger('global_logger', args.save_path + '/log.txt')
    for key, val in vars(args).items():
        logger.info("{:16} {}".format(key, val))

    criterion = nn.CrossEntropyLoss()
    print("Build network")
    last_iter = -1
    best_prec1 = 0
    load_state(args.save_path + "/ckptmodel_best.pth.tar", student_model)
    cudnn.benchmark = True

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    se_normalize = se_transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                           std=[0.229, 0.224, 0.225])
    border_value = int(np.mean([0.485, 0.456, 0.406]) * 255 + 0.5)
    test_aug = se_transforms.ImageAugmentation(
        True, 0, rot_std=0.0, scale_u_range=[0.75, 1.333],
        affine_std=0, scale_x_range=None, scale_y_range=None)

    val_dataset = NormalDataset(
        args.val_root, args.val_source,
        transform=transforms.Compose([
            se_transforms.ScaleAndCrop((input_size, input_size), args.padding, False,
                                       np.array([0.485, 0.456, 0.406]),
                                       np.array([0.229, 0.224, 0.225]))
        ]),
        is_train=False, args=args)
    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False,
                            num_workers=args.workers)

    val_multi_dataset = NormalDataset(
        args.val_root, args.val_source,
        transform=transforms.Compose([
            se_transforms.ScaleCropAndAugmentAffineMultiple(
                16, (input_size, input_size), args.padding, True, test_aug, border_value,
                np.array([0.485, 0.456, 0.406]), np.array([0.229, 0.224, 0.225]))
        ]),
        is_train=False, args=args)
    val_multi_loader = DataLoader(val_multi_dataset, batch_size=1, shuffle=False,
                                  num_workers=args.workers)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        student_optimizer, args.lr_steps, args.lr_gamma)
    # logger.info('{}'.format(args))

    validate(val_loader, student_model, criterion)
    validate_multi(val_multi_loader, student_model, criterion)
def forward(self, x):
    return self.net(x)


def calc_target(net, local_reward, next_state):
    # terminal transition: the TD target is just the immediate reward
    if next_state is None:
        return local_reward
    state_v = torch.tensor([next_state], dtype=torch.float32)
    next_q_v = net(state_v)
    best_q = next_q_v.max(dim=1)[0].item()
    return local_reward + GAMMA * best_q


if __name__ == "__main__":
    env = gym.make("CartPole-v0")
    writer = SummaryWriter(comment="-cartpole-dqn")

    net = DQN(env.observation_space.shape[0], env.action_space.n)
    print(net)

    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=EPSILON_START)
    agent = ptan.agent.DQNAgent(net, selector,
                                preprocessor=ptan.agent.float32_preprocessor)
    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=GAMMA)
    replay_buffer = ptan.experience.ExperienceReplayBuffer(exp_source, REPLAY_BUFFER)
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
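# The setup above implies a training step like the following sketch: sample
# ExperienceFirstLast transitions from replay_buffer, build TD targets with
# calc_target, and regress Q(s, a) onto them. BATCH_SIZE is an assumed
# constant, not taken from the source; everything else comes from the snippet.
BATCH_SIZE = 32  # assumption

def sketch_training_step():
    batch = replay_buffer.sample(BATCH_SIZE)
    states_v = torch.tensor([e.state for e in batch], dtype=torch.float32)
    actions_v = torch.tensor([e.action for e in batch], dtype=torch.long)
    targets_v = torch.tensor([calc_target(net, e.reward, e.last_state)
                              for e in batch], dtype=torch.float32)
    optimizer.zero_grad()
    q_v = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
    loss_v = torch.nn.functional.mse_loss(q_v, targets_v)
    loss_v.backward()
    optimizer.step()
    return loss_v.item()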
def __init__(self, params: dict, dataloader: Dataset, token_probs: torch.tensor, student: nn.Module, teacher: nn.Module): logger.info('Initializing Distiller') self.params = params self.dump_path = params.dump_path self.multi_gpu = params.multi_gpu self.fp16 = params.fp16 self.student = student self.teacher = teacher self.dataloader = dataloader if self.params.n_gpu > 1: self.dataloader.split() self.get_iterator(seed=params.seed) self.temperature = params.temperature assert self.temperature > 0. self.alpha_ce = params.alpha_ce self.alpha_mlm = params.alpha_mlm self.alpha_mse = params.alpha_mse assert self.alpha_ce >= 0. assert self.alpha_mlm >= 0. assert self.alpha_mse >= 0. assert self.alpha_ce + self.alpha_mlm + self.alpha_mse > 0. self.mlm_mask_prop = params.mlm_mask_prop assert 0.0 <= self.mlm_mask_prop <= 1.0 assert params.word_mask + params.word_keep + params.word_rand == 1.0 self.pred_probs = torch.FloatTensor([params.word_mask, params.word_keep, params.word_rand]) self.pred_probs = self.pred_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else self.pred_probs self.token_probs = token_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else token_probs if self.fp16: self.pred_probs = self.pred_probs.half() self.token_probs = self.token_probs.half() self.epoch = 0 self.n_iter = 0 self.n_total_iter = 0 self.n_sequences_epoch = 0 self.total_loss_epoch = 0 self.last_loss = 0 self.last_loss_ce = 0 self.last_loss_mlm = 0 self.last_loss_mse = 0 self.ce_loss_fct = nn.KLDivLoss(reduction='batchmean') self.mlm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1) self.mse_loss_fct = nn.MSELoss(reduction='sum') logger.info('--- Initializing model optimizer') assert params.gradient_accumulation_steps >= 1 self.num_steps_epoch = int(len(self.dataloader) / params.batch_size) + 1 num_train_optimization_steps = int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1 warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in student.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad], 'weight_decay': params.weight_decay}, {'params': [p for n, p in student.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad], 'weight_decay': 0.0} ] logger.info("------ Number of trainable parameters (student): %i" % sum([p.numel() for p in self.student.parameters() if p.requires_grad])) logger.info("------ Number of parameters (student): %i" % sum([p.numel() for p in self.student.parameters()])) self.optimizer = AdamW(optimizer_grouped_parameters, lr=params.learning_rate, eps=params.adam_epsilon, betas=(0.9, 0.98)) self.scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps) if self.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") logger.info(f"Using fp16 training: {self.params.fp16_opt_level} level") self.student, self.optimizer = amp.initialize(self.student, self.optimizer, opt_level=self.params.fp16_opt_level) self.teacher = self.teacher.half() if self.multi_gpu: if self.fp16: from apex.parallel import DistributedDataParallel logger.info("Using apex.parallel.DistributedDataParallel for distributed training.") self.student = DistributedDataParallel(self.student) else: from torch.nn.parallel import DistributedDataParallel logger.info("Using 
nn.parallel.DistributedDataParallel for distributed training.") self.student = DistributedDataParallel(self.student, device_ids=[params.local_rank], output_device=params.local_rank) self.is_master = params.is_master if self.is_master: logger.info('--- Initializing Tensorboard') self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, 'log', 'train')) self.tensorboard.add_text(tag='config', text_string=str(self.params), global_step=0)
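# For reference, the temperature-scaled soft-target term that self.alpha_ce,
# self.temperature and self.ce_loss_fct (KLDivLoss, reduction='batchmean')
# prepare above is conventionally computed as follows. This is a sketch of the
# standard recipe, not the Distiller's actual train step (which also restricts
# the loss to masked positions):
def sketch_distillation_ce(s_logits, t_logits, temperature, ce_loss_fct):
    import torch.nn.functional as F
    loss_ce = ce_loss_fct(F.log_softmax(s_logits / temperature, dim=-1),
                          F.softmax(t_logits / temperature, dim=-1))
    return loss_ce * temperature ** 2  # gradient rescaling from Hinton et al. (2015)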
def main():
    # Load the training set
    print('Loading dataset ...\n')
    dataset_train = Dataset(train=True)
    loader_train = DataLoader(dataset=dataset_train, num_workers=4,
                              batch_size=opt.batchSize, shuffle=True)
    print("# of training samples: %d\n" % int(len(dataset_train)))

    # Build the model
    net = DnCNN(channels=1, num_of_layers=17)
    net.apply(weights_init_kaiming)  # weight initialization
    # Use GPU
    device_ids = [0]
    model = nn.DataParallel(net, device_ids=device_ids).cuda()
    # criterion.cuda()
    # Define the loss and the optimizer
    criterion = nn.MSELoss(reduction='sum')  # replaces the deprecated size_average=False
    optimizer = optim.Adam(model.parameters(), lr=opt.lr)

    # Use tensorboardX to visualize training curves and metrics
    time_now = datetime.now().isoformat()
    if not os.path.exists(opt.log_dir):
        os.mkdir(opt.log_dir)
    writer = SummaryWriter(log_dir=os.path.join(opt.log_dir, time_now))

    step = 0
    for epoch in range(opt.epochs):
        # Set the learning rate
        if epoch < opt.milestone:
            current_lr = opt.lr
        else:
            # current_lr = opt.lr / 10.  # the original DnCNN recipe decays the LR here
            current_lr = opt.lr
        for param_group in optimizer.param_groups:
            param_group["lr"] = current_lr
        print('learning rate %f' % current_lr)

        # Start training
        total_loss = 0
        psnr_train = 0
        for i, data in enumerate(loader_train, 0):
            # training step
            model.train()
            model.zero_grad()
            optimizer.zero_grad()
            img_train = data
            noise = torch.FloatTensor(img_train.size()).normal_(mean=0, std=opt.noiseL / 255.)
            imgn_train = img_train + noise
            # print(imgn_train.shape)
            img_train, imgn_train = Variable(img_train.cuda()), Variable(imgn_train.cuda())
            noise = Variable(noise.cuda())
            out_train = model(imgn_train)
            loss = criterion(out_train, noise) / (imgn_train.size()[0] * 2)
            loss.backward()
            optimizer.step()

            # Accumulate the loss, compute PSNR, and report
            out_train = torch.clamp(imgn_train - out_train, 0., 1.)
            psnr_train += batch_PSNR(out_train, img_train, 1.)
            total_loss += loss.item()
            print("[epoch %d][%d/%d] loss: %.4f PSNR_train: %.4f"
                  % (epoch + 1, i + 1, len(loader_train),
                     total_loss / (i + 1), psnr_train / (i + 1)))
            writer.add_scalar('loss', total_loss / (i + 1), step)
            writer.add_scalar('PSNR on training data', psnr_train / (i + 1), step)

            # Save training images and the model
            step += 1
            if step % 500 == 0:
                if not os.path.exists(opt.image_path):
                    os.mkdir(opt.image_path)
                cv2.imwrite(opt.image_path + '/' + "{}_pred.jpg".format(step), save_image(out_train))
                cv2.imwrite(opt.image_path + '/' + "{}_input.jpg".format(step), save_image(imgn_train))
                cv2.imwrite(opt.image_path + '/' + "{}_gt.jpg".format(step), save_image(img_train))
                if not os.path.exists(opt.save_model):
                    os.makedirs(opt.save_model)
                torch.save(model.state_dict(), os.path.join(opt.save_model, 'net.pth'))
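# batch_PSNR above is assumed to average per-image PSNR over the batch with a
# data range of 1.0; a torch-only sketch consistent with that usage:
def sketch_batch_psnr(img, imclean, data_range=1.):
    # per-image mean squared error over all pixels
    mse = ((img - imclean) ** 2).view(img.size(0), -1).mean(dim=1)
    psnr = 10.0 * torch.log10((data_range ** 2) / mse)
    return psnr.mean().item()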
    loss = cse
    loss.backward()
    optimizerLstm.step()
    optimizerE.step()
    return [cse.data.cpu().numpy()]

#################################################################################################################
# TRAINING

# train_eval = test_data.get_eval_data(True)
# proto1 = test_data.get_eval_data(False, 2, session1_probe=list(range(4, 9 + 1)), session2_probe=list(range(4, 6 + 1)))
# proto2 = test_data.get_eval_data(False, 2, session1_probe=list(range(10, 12 + 1)))
# proto3 = test_data.get_eval_data(False, 2, session2_probe=list(range(7, 9 + 1)))
# proto4 = test_data.get_eval_data(False, 2, session2_probe=list(range(10, 12 + 1)))
proto5 = test_data.get_eval_data(False, 2, cross_session=True)

writer = SummaryWriter('%s/logs/%s' % (opt.savedir, opt.signature))

itr = opt.siter
while True:
    # netE.train()
    # netD.train()
    # lstm.train()
    # im_cond1, im_cond2, lb = next(training_batch_generator1)
    # print(lb)
    #
    # losses1 = train_main(im_cond1, im_cond2, im_cond1, lb)
    # write_tfboard(losses1, itr, name='EDLoss')
    #
    # losses3 = train_lstm(im_cond1, lb)
    # write_tfboard(losses3, itr, name='LstmLoss')
    # print(itr)
    pass  # loop body is commented out in this excerpt
def train(args, snapshot_path): base_lr = args.base_lr train_data_path = args.root_path batch_size = args.batch_size max_iterations = args.max_iterations num_classes = 2 def create_model(ema=False): # Network definition net = net_factory_3d(net_type=args.model, in_chns=1, class_num=num_classes) model = net.cuda() if ema: for param in model.parameters(): param.detach_() return model model = create_model() ema_model = create_model(ema=True) db_train = BraTS2019(base_dir=train_data_path, split='train', num=None, transform=transforms.Compose([ RandomRotFlip(), RandomCrop(args.patch_size), ToTensor(), ])) def worker_init_fn(worker_id): random.seed(args.seed + worker_id) labeled_idxs = list(range(0, args.labeled_num)) unlabeled_idxs = list(range(args.labeled_num, args.total_labeled_num)) batch_sampler = TwoStreamBatchSampler(labeled_idxs, unlabeled_idxs, batch_size, batch_size - args.labeled_bs) trainloader = DataLoader(db_train, batch_sampler=batch_sampler, num_workers=4, pin_memory=True, worker_init_fn=worker_init_fn) model.train() ema_model.train() optimizer = optim.SGD(model.parameters(), lr=base_lr, momentum=0.9, weight_decay=0.0001) ce_loss = CrossEntropyLoss() dice_loss = losses.DiceLoss(2) writer = SummaryWriter(snapshot_path + '/log') logging.info("{} iterations per epoch".format(len(trainloader))) iter_num = 0 max_epoch = max_iterations // len(trainloader) + 1 best_performance = 0.0 iterator = tqdm(range(max_epoch), ncols=70) for epoch_num in iterator: for i_batch, sampled_batch in enumerate(trainloader): volume_batch, label_batch = sampled_batch['image'], sampled_batch[ 'label'] volume_batch, label_batch = volume_batch.cuda(), label_batch.cuda() labeled_volume_batch = volume_batch[:args.labeled_bs] unlabeled_volume_batch = volume_batch[args.labeled_bs:] # ICT mix factors ict_mix_factors = np.random.beta(args.ict_alpha, args.ict_alpha, size=(args.labeled_bs // 2, 1, 1, 1, 1)) ict_mix_factors = torch.tensor(ict_mix_factors, dtype=torch.float).cuda() unlabeled_volume_batch_0 = unlabeled_volume_batch[0:1, ...] unlabeled_volume_batch_1 = unlabeled_volume_batch[1:2, ...] 
# Mix images batch_ux_mixed = unlabeled_volume_batch_0 * \ (1.0 - ict_mix_factors) + \ unlabeled_volume_batch_1 * ict_mix_factors input_volume_batch = torch.cat( [labeled_volume_batch, batch_ux_mixed], dim=0) outputs = model(input_volume_batch) outputs_soft = torch.softmax(outputs, dim=1) with torch.no_grad(): ema_output_ux0 = torch.softmax( ema_model(unlabeled_volume_batch_0), dim=1) ema_output_ux1 = torch.softmax( ema_model(unlabeled_volume_batch_1), dim=1) batch_pred_mixed = ema_output_ux0 * \ (1.0 - ict_mix_factors) + ema_output_ux1 * ict_mix_factors loss_ce = ce_loss(outputs[:args.labeled_bs], label_batch[:args.labeled_bs][:]) loss_dice = dice_loss(outputs_soft[:args.labeled_bs], label_batch[:args.labeled_bs].unsqueeze(1)) supervised_loss = 0.5 * (loss_dice + loss_ce) consistency_weight = get_current_consistency_weight(iter_num // 150) consistency_loss = torch.mean( (outputs_soft[args.labeled_bs:] - batch_pred_mixed)**2) loss = supervised_loss + consistency_weight * consistency_loss optimizer.zero_grad() loss.backward() optimizer.step() update_ema_variables(model, ema_model, args.ema_decay, iter_num) lr_ = base_lr * (1.0 - iter_num / max_iterations)**0.9 for param_group in optimizer.param_groups: param_group['lr'] = lr_ iter_num = iter_num + 1 writer.add_scalar('info/lr', lr_, iter_num) writer.add_scalar('info/total_loss', loss, iter_num) writer.add_scalar('info/loss_ce', loss_ce, iter_num) writer.add_scalar('info/loss_dice', loss_dice, iter_num) writer.add_scalar('info/consistency_loss', consistency_loss, iter_num) writer.add_scalar('info/consistency_weight', consistency_weight, iter_num) logging.info( 'iteration %d : loss : %f, loss_ce: %f, loss_dice: %f' % (iter_num, loss.item(), loss_ce.item(), loss_dice.item())) writer.add_scalar('loss/loss', loss, iter_num) if iter_num % 20 == 0: image = volume_batch[0, 0:1, :, :, 20:61:10].permute(3, 0, 1, 2).repeat(1, 3, 1, 1) grid_image = make_grid(image, 5, normalize=True) writer.add_image('train/Image', grid_image, iter_num) image = outputs_soft[0, 1:2, :, :, 20:61:10].permute(3, 0, 1, 2).repeat(1, 3, 1, 1) grid_image = make_grid(image, 5, normalize=False) writer.add_image('train/Predicted_label', grid_image, iter_num) image = label_batch[0, :, :, 20:61:10].unsqueeze(0).permute( 3, 0, 1, 2).repeat(1, 3, 1, 1) grid_image = make_grid(image, 5, normalize=False) writer.add_image('train/Groundtruth_label', grid_image, iter_num) if iter_num > 0 and iter_num % 200 == 0: model.eval() avg_metric = test_all_case(model, args.root_path, test_list="val.txt", num_classes=2, patch_size=args.patch_size, stride_xy=32, stride_z=32) if avg_metric[:, 0].mean() > best_performance: best_performance = avg_metric[:, 0].mean() save_mode_path = os.path.join( snapshot_path, 'iter_{}_dice_{}.pth'.format( iter_num, round(best_performance, 4))) save_best = os.path.join( snapshot_path, '{}_best_model.pth'.format(args.model)) torch.save(model.state_dict(), save_mode_path) torch.save(model.state_dict(), save_best) writer.add_scalar('info/val_dice_score', avg_metric[0, 0], iter_num) writer.add_scalar('info/val_hd95', avg_metric[0, 1], iter_num) logging.info('iteration %d : dice_score : %f hd95 : %f' % (iter_num, avg_metric[0, 0].mean(), avg_metric[0, 1].mean())) model.train() if iter_num % 3000 == 0: save_mode_path = os.path.join(snapshot_path, 'iter_' + str(iter_num) + '.pth') torch.save(model.state_dict(), save_mode_path) logging.info("save model to {}".format(save_mode_path)) if iter_num >= max_iterations: break if iter_num >= max_iterations: iterator.close() break 
writer.close() return "Training Finished!"
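# update_ema_variables and get_current_consistency_weight above follow the
# standard mean-teacher recipe; a sketch under that assumption (the 0.1
# weight and the 200-step ramp-up length are placeholders, not values from
# the source):
def sketch_update_ema_variables(model, ema_model, alpha, global_step):
    alpha = min(1 - 1 / (global_step + 1), alpha)  # ramp up the EMA decay early on
    for ema_param, param in zip(ema_model.parameters(), model.parameters()):
        ema_param.data.mul_(alpha).add_(param.data, alpha=1 - alpha)

def sketch_consistency_weight(step, consistency=0.1, rampup_length=200.0):
    # sigmoid-shaped ramp-up from Laine & Aila (2017)
    phase = 1.0 - np.clip(step, 0.0, rampup_length) / rampup_length
    return consistency * float(np.exp(-5.0 * phase * phase))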
def ddp_train_nerf(rank, args): ###### set up multi-processing setup(rank, args.world_size) ###### set up logger logger = logging.getLogger(__package__) setup_logger() ###### decide chunk size according to gpu memory logger.info('gpu_mem: {}'.format( torch.cuda.get_device_properties(rank).total_memory)) if torch.cuda.get_device_properties(rank).total_memory / 1e9 > 14: logger.info('setting batch size according to 24G gpu') args.N_rand = 1024 args.chunk_size = 8192 else: logger.info('setting batch size according to 12G gpu') args.N_rand = 512 args.chunk_size = 4096 ###### Create log dir and copy the config file if rank == 0: os.makedirs(os.path.join(args.basedir, args.expname), exist_ok=True) f = os.path.join(args.basedir, args.expname, 'args.txt') with open(f, 'w') as file: for arg in sorted(vars(args)): attr = getattr(args, arg) file.write('{} = {}\n'.format(arg, attr)) if args.config is not None: f = os.path.join(args.basedir, args.expname, 'config.txt') with open(f, 'w') as file: file.write(open(args.config, 'r').read()) torch.distributed.barrier() ray_samplers = load_data_split(args.datadir, args.scene, split='train', try_load_min_depth=args.load_min_depth) val_ray_samplers = load_data_split(args.datadir, args.scene, split='validation', try_load_min_depth=args.load_min_depth, skip=args.testskip) # write training image names for autoexposure if args.optim_autoexpo: f = os.path.join(args.basedir, args.expname, 'train_images.json') with open(f, 'w') as file: img_names = [ ray_samplers[i].img_path for i in range(len(ray_samplers)) ] json.dump(img_names, file, indent=2) ###### create network and wrap in ddp; each process should do this start, models = create_nerf(rank, args) ##### important!!! # make sure different processes sample different rays np.random.seed((rank + 1) * 777) # make sure different processes have different perturbations in depth samples torch.manual_seed((rank + 1) * 777) ##### only main process should do the logging if rank == 0: writer = SummaryWriter( os.path.join(args.basedir, 'summaries', args.expname)) # start training what_val_to_log = 0 # helper variable for parallel rendering of a image what_train_to_log = 0 for global_step in range(start + 1, start + 1 + args.N_iters): time0 = time.time() scalars_to_log = OrderedDict() ### Start of core optimization loop scalars_to_log['resolution'] = ray_samplers[0].resolution_level # randomly sample rays and move to device i = np.random.randint(low=0, high=len(ray_samplers)) ray_batch = ray_samplers[i].random_sample(args.N_rand, center_crop=False) for key in ray_batch: if torch.is_tensor(ray_batch[key]): ray_batch[key] = ray_batch[key].to(rank) # forward and backward dots_sh = list(ray_batch['ray_d'].shape[:-1]) # number of rays all_rets = [] # results on different cascade levels for m in range(models['cascade_level']): optim = models['optim_{}'.format(m)] net = models['net_{}'.format(m)] # sample depths N_samples = models['cascade_samples'][m] if m == 0: # foreground depth fg_far_depth = intersect_sphere(ray_batch['ray_o'], ray_batch['ray_d']) # [...,] fg_near_depth = ray_batch['min_depth'] # [..., ] step = (fg_far_depth - fg_near_depth) / (N_samples - 1) fg_depth = torch.stack( [fg_near_depth + i * step for i in range(N_samples)], dim=-1) # [..., N_samples] fg_depth = perturb_samples( fg_depth) # random perturbation during training # background depth bg_depth = torch.linspace(0., 1., N_samples).view([ 1, ] * len(dots_sh) + [ N_samples, ]).expand(dots_sh + [ N_samples, ]).to(rank) bg_depth = perturb_samples( bg_depth) # 
random perturbation during training else: # sample pdf and concat with earlier samples fg_weights = ret['fg_weights'].clone().detach() fg_depth_mid = .5 * (fg_depth[..., 1:] + fg_depth[..., :-1] ) # [..., N_samples-1] fg_weights = fg_weights[..., 1:-1] # [..., N_samples-2] fg_depth_samples = sample_pdf(bins=fg_depth_mid, weights=fg_weights, N_samples=N_samples, det=False) # [..., N_samples] fg_depth, _ = torch.sort( torch.cat((fg_depth, fg_depth_samples), dim=-1)) # sample pdf and concat with earlier samples bg_weights = ret['bg_weights'].clone().detach() bg_depth_mid = .5 * (bg_depth[..., 1:] + bg_depth[..., :-1]) bg_weights = bg_weights[..., 1:-1] # [..., N_samples-2] bg_depth_samples = sample_pdf(bins=bg_depth_mid, weights=bg_weights, N_samples=N_samples, det=False) # [..., N_samples] bg_depth, _ = torch.sort( torch.cat((bg_depth, bg_depth_samples), dim=-1)) optim.zero_grad() ret = net(ray_batch['ray_o'], ray_batch['ray_d'], fg_far_depth, fg_depth, bg_depth, img_name=ray_batch['img_name']) all_rets.append(ret) rgb_gt = ray_batch['rgb'].to(rank) if 'autoexpo' in ret: scale, shift = ret['autoexpo'] scalars_to_log['level_{}/autoexpo_scale'.format( m)] = scale.item() scalars_to_log['level_{}/autoexpo_shift'.format( m)] = shift.item() # rgb_gt = scale * rgb_gt + shift rgb_pred = (ret['rgb'] - shift) / scale rgb_loss = img2mse(rgb_pred, rgb_gt) loss = rgb_loss + args.lambda_autoexpo * ( torch.abs(scale - 1.) + torch.abs(shift)) else: rgb_loss = img2mse(ret['rgb'], rgb_gt) loss = rgb_loss scalars_to_log['level_{}/loss'.format(m)] = rgb_loss.item() scalars_to_log['level_{}/pnsr'.format(m)] = mse2psnr( rgb_loss.item()) loss.backward() optim.step() # # clean unused memory # torch.cuda.empty_cache() ### end of core optimization loop dt = time.time() - time0 scalars_to_log['iter_time'] = dt ### only main process should do the logging if rank == 0 and (global_step % args.i_print == 0 or global_step < 10): logstr = '{} step: {} '.format(args.expname, global_step) for k in scalars_to_log: logstr += ' {}: {:.6f}'.format(k, scalars_to_log[k]) writer.add_scalar(k, scalars_to_log[k], global_step) logger.info(logstr) ### each process should do this; but only main process merges the results if global_step % args.i_img == 0 or global_step == start + 1: #### critical: make sure each process is working on the same random image time0 = time.time() idx = what_val_to_log % len(val_ray_samplers) log_data = render_single_image(rank, args.world_size, models, val_ray_samplers[idx], args.chunk_size) what_val_to_log += 1 dt = time.time() - time0 if rank == 0: # only main process should do this logger.info( 'Logged a random validation view in {} seconds'.format(dt)) log_view_to_tb(writer, global_step, log_data, gt_img=val_ray_samplers[idx].get_img(), mask=None, prefix='val/') time0 = time.time() idx = what_train_to_log % len(ray_samplers) log_data = render_single_image(rank, args.world_size, models, ray_samplers[idx], args.chunk_size) what_train_to_log += 1 dt = time.time() - time0 if rank == 0: # only main process should do this logger.info( 'Logged a random training view in {} seconds'.format(dt)) log_view_to_tb(writer, global_step, log_data, gt_img=ray_samplers[idx].get_img(), mask=None, prefix='train/') del log_data torch.cuda.empty_cache() if rank == 0 and (global_step % args.i_weights == 0 and global_step > 0): # saving checkpoints and logging fpath = os.path.join(args.basedir, args.expname, 'model_{:06d}.pth'.format(global_step)) to_save = OrderedDict() for m in range(models['cascade_level']): name = 
'net_{}'.format(m) to_save[name] = models[name].state_dict() name = 'optim_{}'.format(m) to_save[name] = models[name].state_dict() torch.save(to_save, fpath) # clean up for multi-processing cleanup()
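# sample_pdf above performs NeRF-style hierarchical sampling (inverse-CDF over
# the coarse weights); a condensed sketch of the det=False path, assuming
# torch >= 1.6 for torch.searchsorted (the repo's own version also handles the
# deterministic branch):
def sketch_sample_pdf(bins, weights, N_samples):
    weights = weights + 1e-5                          # avoid division by zero
    pdf = weights / weights.sum(dim=-1, keepdim=True)
    cdf = torch.cumsum(pdf, dim=-1)
    cdf = torch.cat([torch.zeros_like(cdf[..., :1]), cdf], dim=-1)  # same length as bins
    u = torch.rand(list(cdf.shape[:-1]) + [N_samples], device=cdf.device)
    inds = torch.searchsorted(cdf, u, right=True)
    below = torch.clamp(inds - 1, min=0)
    above = torch.clamp(inds, max=cdf.shape[-1] - 1)
    cdf_below = torch.gather(cdf, -1, below)
    cdf_above = torch.gather(cdf, -1, above)
    denom = torch.where(cdf_above - cdf_below < 1e-5,
                        torch.ones_like(cdf_below), cdf_above - cdf_below)
    t = (u - cdf_below) / denom
    bins_below = torch.gather(bins, -1, below)
    bins_above = torch.gather(bins, -1, above)
    return bins_below + t * (bins_above - bins_below)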
def __init__(self, args):
    # initialise name of the file (optional prefix + seed + start time)
    cql_ext = '_cql' if 'use_cql' in args and args.use_cql else ''
    if hasattr(args, 'output_file_prefix'):
        self.output_name = args.output_file_prefix + cql_ext + \
            '__' + str(args.seed) + '__' + \
            datetime.datetime.now().strftime('%d_%m_%H_%M_%S')
    else:
        self.output_name = str(args.seed) + '__' + \
            datetime.datetime.now().strftime('%d_%m_%H_%M_%S')

    # get path to log directory (and create it if necessary)
    try:
        log_dir = args.results_log_dir
    except AttributeError:
        log_dir = args['results_log_dir']
    if log_dir is None:
        log_dir = os.path.abspath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir))
        log_dir = os.path.join(log_dir, 'logs')
    if not os.path.exists(log_dir):
        try:
            os.mkdir(log_dir)
        except FileNotFoundError:
            # the parent directory is missing as well -- create it first
            dir_path_head, dir_path_tail = os.path.split(log_dir)
            if len(dir_path_tail) == 0:
                dir_path_head, dir_path_tail = os.path.split(dir_path_head)
            os.mkdir(dir_path_head)
            os.mkdir(log_dir)

    # create a subdirectory for the environment
    try:
        env_dir = os.path.join(log_dir, '{}'.format(args.env_name))
    except AttributeError:
        env_dir = os.path.join(log_dir, '{}'.format(args["env_name"]))
    if not os.path.exists(env_dir):
        os.makedirs(env_dir)

    # create a subdirectory for the exp_label (usually the method name)
    # exp_dir = os.path.join(env_dir, exp_label)
    # if not os.path.exists(exp_dir):
    #     os.makedirs(exp_dir)

    # finally, get full path of where results are stored
    self.full_output_folder = os.path.join(env_dir, self.output_name)
    self.writer = SummaryWriter(self.full_output_folder)
    print('logging under', self.full_output_folder)

    with open(os.path.join(self.full_output_folder, 'online_config.json'), 'w') as f:
        try:
            config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        except TypeError:
            # args is already a plain dict
            config = args
        config.update(device=ptu.device.type)
        json.dump(config, f, indent=2)
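# The directory-creation fallback above predates reliance on os.makedirs'
# exist_ok flag; a condensed equivalent (Python >= 3.2), shown as a sketch:
def sketch_ensure_dir(path):
    # creates the directory and any missing parents; no error if it already exists
    os.makedirs(path, exist_ok=True)
    return path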
import os
import torch
import torch.nn as nn
import numpy as np
from tensorboardX import SummaryWriter
from config import AugmentConfig
import utils
from models.augment_cnn import AugmentCNN

config = AugmentConfig()

device = torch.device("cuda")

# tensorboard
writer = SummaryWriter(log_dir=os.path.join(config.path, "tb"))
writer.add_text('config', config.as_markdown(), 0)

logger = utils.get_logger(os.path.join(config.path, "{}.log".format(config.name)))
config.print_params(logger.info)


def main():
    logger.info("Logger is set - training start")

    # set default gpu device id
    torch.cuda.set_device(config.gpus[0])

    # set seed
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
def main(): opt = parse_opt() print('Arguments:') for k in opt.__dict__.keys(): print(' ', k, ':', str(opt.__dict__[k])) if opt.load is not None: logger = SummaryWriter(opt.load) else: #logger = SummaryWriter(comment=opt.comment) logger = SummaryWriter(logdir=os.path.join(opt.savedir, opt.exp_name)) print('Log files saved to', logger.file_writer.get_logdir()) for k in opt.__dict__.keys(): logger.add_text(k, str(opt.__dict__[k])) # get and save the version of the code being run repo = git.Repo(search_parent_directories=True) sha = repo.head.object.hexsha logger.add_text("git_sha", sha) dataset_dict = load_dataset(opt) model, optimizer = create_model_and_optimizer( opt, dataset_dict["train"].get_all_texts()) if opt.load is not None: print("loading from: %s" % opt.load) loaded_dict = \ torch.load(logger.file_writer.get_logdir()+"/latest_checkpoint.pth") model.load_state_dict(loaded_dict["model_state_dict"]) initial_it = loaded_dict["it"] for g in optimizer.param_groups: print('learning rate(s):') print(g["lr"]) # g['lr'] *= opt.learning_rate_decay else: initial_it = 0 if opt.inspect: import IPython IPython.embed() if not opt.test_only: final_it = train_loop(opt, logger, dataset_dict, model, optimizer, initial_it) else: final_it = initial_it run_eval(opt, logger, dataset_dict, model, final_it + int(opt.final_eval_on_test), eval_on_test=opt.final_eval_on_test) if opt.dataset == "fashioniq": print('Generating FashionIQ submission...') test_retrieval.predict(opt, model, dataset_dict["test"], filter_categories=True) print('done') logger.close()
def train(config):
    ## set up summary writer
    writer = SummaryWriter(config['output_path'])

    # set up early stop
    early_stop_engine = EarlyStopping(config["early_stop_patience"])

    ## set loss
    class_num = config["network"]["params"]["class_num"]
    class_criterion = nn.CrossEntropyLoss()
    transfer_criterion = config["loss"]["name"]
    center_criterion = config["loss"]["discriminant_loss"](
        num_classes=class_num,
        feat_dim=config["network"]["params"]["bottleneck_dim"])
    loss_params = config["loss"]

    ## prepare data
    dsets = {}
    dset_loaders = {}

    # sample without replacement: 70% train / 10% validation / 20% test
    pristine_indices = torch.randperm(len(pristine_x))
    # train
    pristine_x_train = pristine_x[pristine_indices[:int(np.floor(.7 * len(pristine_x)))]]
    pristine_y_train = pristine_y[pristine_indices[:int(np.floor(.7 * len(pristine_x)))]]
    # validation split -- passed to the test functions during training
    pristine_x_valid = pristine_x[pristine_indices[int(np.floor(.7 * len(pristine_x))):int(np.floor(.8 * len(pristine_x)))]]
    pristine_y_valid = pristine_y[pristine_indices[int(np.floor(.7 * len(pristine_x))):int(np.floor(.8 * len(pristine_x)))]]
    # held-out test split for the evaluation script
    pristine_x_test = pristine_x[pristine_indices[int(np.floor(.8 * len(pristine_x))):]]
    pristine_y_test = pristine_y[pristine_indices[int(np.floor(.8 * len(pristine_x))):]]

    noisy_indices = torch.randperm(len(noisy_x))
    # train
    noisy_x_train = noisy_x[noisy_indices[:int(np.floor(.7 * len(noisy_x)))]]
    noisy_y_train = noisy_y[noisy_indices[:int(np.floor(.7 * len(noisy_x)))]]
    # validation split -- passed to the test functions during training
    noisy_x_valid = noisy_x[noisy_indices[int(np.floor(.7 * len(noisy_x))):int(np.floor(.8 * len(noisy_x)))]]
    noisy_y_valid = noisy_y[noisy_indices[int(np.floor(.7 * len(noisy_x))):int(np.floor(.8 * len(noisy_x)))]]
    # held-out test split for the evaluation script
    noisy_x_test = noisy_x[noisy_indices[int(np.floor(.8 * len(noisy_x))):]]
    noisy_y_test = noisy_y[noisy_indices[int(np.floor(.8 * len(noisy_x))):]]

    dsets["source"] = TensorDataset(pristine_x_train, pristine_y_train)
    dsets["target"] = TensorDataset(noisy_x_train, noisy_y_train)
    dsets["source_valid"] = TensorDataset(pristine_x_valid, pristine_y_valid)
    dsets["target_valid"] = TensorDataset(noisy_x_valid, noisy_y_valid)
    dsets["source_test"] = TensorDataset(pristine_x_test, pristine_y_test)
    dsets["target_test"] = TensorDataset(noisy_x_test, noisy_y_test)

    # training loaders; batch size 36 follows the original training configuration
    dset_loaders["source"] = DataLoader(dsets["source"], batch_size=36, shuffle=True, num_workers=1)
    dset_loaders["target"] = DataLoader(dsets["target"], batch_size=36, shuffle=True, num_workers=1)
    # evaluation loaders use a small batch size, following the original test setup
    dset_loaders["source_valid"] = DataLoader(dsets["source_valid"], batch_size=4, shuffle=True, num_workers=1)
    dset_loaders["target_valid"] = DataLoader(dsets["target_valid"], batch_size=4, shuffle=True, num_workers=1)
    dset_loaders["source_test"] = DataLoader(dsets["source_test"], batch_size=4, shuffle=True, num_workers=1)
    dset_loaders["target_test"] = DataLoader(dsets["target_test"], batch_size=4, shuffle=True, num_workers=1)

    config['out_file'].write("dataset sizes: source={}, target={}\n".format(
        len(dsets["source"]), len(dsets["target"])))  # TODO: change this too

    ## set base network
    net_config = config["network"]
    base_network = net_config["name"](**net_config["params"])
    use_gpu = torch.cuda.is_available()
    if use_gpu:
        base_network = base_network.cuda()

    ## collect parameters
    if "DeepMerge" in args.net:
        parameter_list = [{"params": base_network.parameters(), "lr_mult": 1, 'decay_mult': 2}]
    elif net_config["params"]["new_cls"]:
        if net_config["params"]["use_bottleneck"]:
            parameter_list = [{"params": base_network.feature_layers.parameters(), "lr_mult": 1, 'decay_mult': 2},
                              {"params": base_network.bottleneck.parameters(), "lr_mult": 10, 'decay_mult': 2},
                              {"params": base_network.fc.parameters(), "lr_mult": 10, 'decay_mult': 2}]
        else:
            parameter_list = [{"params": base_network.feature_layers.parameters(), "lr_mult": 1, 'decay_mult': 2},
                              {"params": base_network.fc.parameters(), "lr_mult": 10, 'decay_mult': 2}]
    else:
        parameter_list = [{"params": base_network.parameters(), "lr_mult": 1, 'decay_mult': 2}]

    ## add additional network for some methods
    class_weight = torch.from_numpy(np.array([1.0] * class_num))
    if use_gpu:
        class_weight = class_weight.cuda()
    parameter_list.append({"params": center_criterion.parameters(), "lr_mult": 10, 'decay_mult': 1})

    ## set optimizer
    optimizer_config = config["optimizer"]
    optimizer = optim_dict[optimizer_config["type"]](parameter_list, **(optimizer_config["optim_params"]))
    param_lr = []
    for param_group in optimizer.param_groups:
        param_lr.append(param_group["lr"])
    schedule_param = optimizer_config["lr_param"]
    lr_scheduler = lr_schedule.schedule_dict[optimizer_config["lr_type"]]

    ## train
    len_train_source = len(dset_loaders["source"]) - 1
    len_train_target = len(dset_loaders["target"]) - 1
    len_valid_source = len(dset_loaders["source_valid"]) - 1
    len_valid_target = len(dset_loaders["target_valid"]) - 1
    transfer_loss_value = classifier_loss_value = total_loss_value = 0.0
    best_acc = 0.0

    for i in range(config["num_iterations"]):
        if i % config["test_interval"] == 0:
            base_network.train(False)
            if config['loss']['ly_type'] == "cosine":
                temp_acc, _ = image_classification_test(dset_loaders, 'source_valid',
                                                        base_network, gpu=use_gpu)
                train_acc, _ = image_classification_test(dset_loaders, 'source',
                                                         base_network, gpu=use_gpu)
            elif config['loss']['ly_type'] == "euclidean":
                temp_acc, _ = distance_classification_test(dset_loaders, 'source_valid',
                                                           base_network,
                                                           center_criterion.centers.detach(),
                                                           gpu=use_gpu)
                train_acc, _ = distance_classification_test(dset_loaders, 'source',
                                                            base_network, gpu=use_gpu)
            else:
                raise ValueError("no test method for cls loss: {}".format(config['loss']['ly_type']))

            snapshot_obj = {
                'step': i,
                "base_network": base_network.state_dict(),
                'valid accuracy': temp_acc,
                'train accuracy': train_acc,
            }
            snapshot_obj['center_criterion'] = center_criterion.state_dict()
            if temp_acc > best_acc:
                best_acc = temp_acc
                # save best model
                torch.save(snapshot_obj, osp.join(config["output_path"], "best_model.pth.tar"))

            log_str = "iter: {:05d}, {} validation accuracy: {:.5f}, {} training accuracy: {:.5f}\n".format(
                i, config['loss']['ly_type'], temp_acc, config['loss']['ly_type'], train_acc)
            config["out_file"].write(log_str)
            config["out_file"].flush()
            writer.add_scalar("validation accuracy", temp_acc, i)
            writer.add_scalar("training accuracy", train_acc, i)

            if early_stop_engine.is_stop_training(temp_acc):
                config["out_file"].write("no improvement after {}, stop training at step {}\n".format(
                    config["early_stop_patience"], i))
                break

        if (i + 1) % config["snapshot_interval"] == 0:
            torch.save(snapshot_obj,
                       osp.join(config["output_path"], "iter_{:05d}_model.pth.tar".format(i)))

        ## train one iter
        base_network.train(True)
        optimizer = lr_scheduler(param_lr, optimizer, i, **schedule_param)
        optimizer.zero_grad()
        if i % len_train_source == 0:
            iter_source = iter(dset_loaders["source"])
        if i % len_train_target == 0:
            iter_target = iter(dset_loaders["target"])
        try:
            inputs_source, labels_source = next(iter_source)
            inputs_target, labels_target = next(iter_target)
        except StopIteration:
            # restart the exhausted iterators and fetch a fresh batch
            iter_source = iter(dset_loaders["source"])
            iter_target = iter(dset_loaders["target"])
            inputs_source, labels_source = next(iter_source)
            inputs_target, labels_target = next(iter_target)

        if use_gpu:
            inputs_source, inputs_target, labels_source = \
                Variable(inputs_source).cuda(), Variable(inputs_target).cuda(), \
                Variable(labels_source).cuda()
        else:
            inputs_source, inputs_target, labels_source = Variable(inputs_source), \
                Variable(inputs_target), Variable(labels_source)

        inputs = torch.cat((inputs_source, inputs_target), dim=0)
        source_batch_size = inputs_source.size(0)

        if config['loss']['ly_type'] == 'cosine':
            features, logits = base_network(inputs)
            source_logits = logits.narrow(0, 0, source_batch_size)
        elif config['loss']['ly_type'] == 'euclidean':
            features, _ = base_network(inputs)
            logits = -1.0 * loss.distance_to_centroids(features, center_criterion.centers.detach())
            source_logits = logits.narrow(0, 0, source_batch_size)

        transfer_loss = transfer_criterion(features[:source_batch_size], features[source_batch_size:])

        # source domain classification task loss
        classifier_loss = class_criterion(source_logits, labels_source.long())

        # fisher loss on labeled source domain
        fisher_loss, fisher_intra_loss, fisher_inter_loss, center_grad = center_criterion(
            features.narrow(0, 0, int(inputs.size(0) / 2)), labels_source,
            inter_class=loss_params["inter_type"],
            intra_loss_weight=loss_params["intra_loss_coef"],
            inter_loss_weight=loss_params["inter_loss_coef"])

        # entropy minimization loss
        em_loss = loss.EntropyLoss(nn.Softmax(dim=1)(logits))

        total_loss = loss_params["trade_off"] * transfer_loss \
            + fisher_loss \
            + loss_params["em_loss_coef"] * em_loss \
            + classifier_loss

        total_loss.backward()
        if center_grad is not None:
            # clear mmc_loss
            center_criterion.centers.grad.zero_()
            # Manually assign centers gradients other than using autograd
            center_criterion.centers.backward(center_grad)

        optimizer.step()

        if i % config["log_iter"] == 0:
            config['out_file'].write(
                'iter {}: train total loss={:0.4f}, train transfer loss={:0.4f}, train classifier loss={:0.4f}, '
                'train entropy min loss={:0.4f}, '
                'train fisher loss={:0.4f}, train intra-group fisher loss={:0.4f}, train inter-group fisher loss={:0.4f}\n'
                .format(
                    i,
                    total_loss.data.cpu(),
                    transfer_loss.data.cpu().float().item(),
                    classifier_loss.data.cpu().float().item(),
                    em_loss.data.cpu().float().item(),
                    fisher_loss.cpu().float().item(),
                    fisher_intra_loss.cpu().float().item(),
                    fisher_inter_loss.cpu().float().item(),
                ))
            config['out_file'].flush()
            writer.add_scalar("training total loss", total_loss.data.cpu().float().item(), i)
            writer.add_scalar("training transfer loss", transfer_loss.data.cpu().float().item(), i)
            writer.add_scalar("training classifier loss", classifier_loss.data.cpu().float().item(), i)
            writer.add_scalar("training entropy minimization loss", em_loss.data.cpu().float().item(), i)
            writer.add_scalar("training total fisher loss", fisher_loss.data.cpu().float().item(), i)
            writer.add_scalar("training intra-group fisher", fisher_intra_loss.data.cpu().float().item(), i)
            writer.add_scalar("training inter-group fisher", fisher_inter_loss.data.cpu().float().item(), i)

        # validation pass (no gradient updates)
        base_network.eval()
        with torch.no_grad():
            if i % len_valid_source == 0:
                iter_source = iter(dset_loaders["source_valid"])
            if i % len_valid_target == 0:
                iter_target = iter(dset_loaders["target_valid"])
            try:
                inputs_source, labels_source = next(iter_source)
                inputs_target, labels_target = next(iter_target)
            except StopIteration:
                # restart the exhausted iterators and fetch a fresh batch
                iter_source = iter(dset_loaders["source_valid"])
                iter_target = iter(dset_loaders["target_valid"])
                inputs_source, labels_source = next(iter_source)
                inputs_target, labels_target = next(iter_target)

            if use_gpu:
                inputs_source, inputs_target, labels_source = \
                    Variable(inputs_source).cuda(), Variable(inputs_target).cuda(), \
                    Variable(labels_source).cuda()
            else:
                inputs_source, inputs_target, labels_source = Variable(inputs_source), \
                    Variable(inputs_target), Variable(labels_source)

            inputs = torch.cat((inputs_source, inputs_target), dim=0)
            source_batch_size = inputs_source.size(0)

            if config['loss']['ly_type'] == 'cosine':
                features, logits = base_network(inputs)
                source_logits = logits.narrow(0, 0, source_batch_size)
            elif config['loss']['ly_type'] == 'euclidean':
                features, _ = base_network(inputs)
                logits = -1.0 * loss.distance_to_centroids(features, center_criterion.centers.detach())
                source_logits = logits.narrow(0, 0, source_batch_size)

            transfer_loss = transfer_criterion(features[:source_batch_size], features[source_batch_size:])

            # source domain classification task loss
            classifier_loss = class_criterion(source_logits, labels_source.long())

            # fisher loss on labeled source domain
            fisher_loss, fisher_intra_loss, fisher_inter_loss, center_grad = center_criterion(
                features.narrow(0, 0, int(inputs.size(0) / 2)), labels_source,
                inter_class=loss_params["inter_type"],
                intra_loss_weight=loss_params["intra_loss_coef"],
                inter_loss_weight=loss_params["inter_loss_coef"])

            # entropy minimization loss
            em_loss = loss.EntropyLoss(nn.Softmax(dim=1)(logits))

            # final loss
            total_loss = loss_params["trade_off"] * transfer_loss \
                + fisher_loss \
                + loss_params["em_loss_coef"] * em_loss \
                + classifier_loss
            # no backward pass in eval mode

            if i % config["log_iter"] == 0:
                config['out_file'].write(
                    'iter {} valid transfer loss={:0.4f}, valid classifier loss={:0.4f}, '
                    'valid entropy min loss={:0.4f}, '
                    'valid fisher loss={:0.4f}, valid intra-group fisher loss={:0.4f}, valid inter-group fisher loss={:0.4f}\n'
                    .format(
                        i,
                        transfer_loss.data.cpu().float().item(),
                        classifier_loss.data.cpu().float().item(),
                        em_loss.data.cpu().float().item(),
                        fisher_loss.cpu().float().item(),
                        fisher_intra_loss.cpu().float().item(),
                        fisher_inter_loss.cpu().float().item(),
                    ))
                config['out_file'].flush()
                writer.add_scalar("validation total loss", total_loss.data.cpu().float().item(), i)
                writer.add_scalar("validation transfer loss", transfer_loss.data.cpu().float().item(), i)
                writer.add_scalar("validation classifier loss", classifier_loss.data.cpu().float().item(), i)
                writer.add_scalar("validation entropy minimization loss", em_loss.data.cpu().float().item(), i)
                writer.add_scalar("validation total fisher loss", fisher_loss.data.cpu().float().item(), i)
                writer.add_scalar("validation intra-group fisher", fisher_intra_loss.data.cpu().float().item(), i)
                writer.add_scalar("validation inter-group fisher", fisher_inter_loss.data.cpu().float().item(), i)

    return best_acc
def train( train_file, valid_file, test_file, cate_file, item_count, dataset = "book", batch_size = 128, maxlen = 100, test_iter = 50, model_type = 'DNN', lr = 0.001, max_iter = 100, patience = 20 ): exp_name = get_exp_name(dataset, model_type, batch_size, lr, maxlen) best_model_path = "best_model/" + exp_name + '/' gpu_options = tf.GPUOptions(allow_growth=True) writer = SummaryWriter('runs/' + exp_name) item_cate_map = load_item_cate(cate_file) with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess: train_data = DataIterator(train_file, batch_size, maxlen, train_flag=0) valid_data = DataIterator(valid_file, batch_size, maxlen, train_flag=1) model = get_model(dataset, model_type, item_count, batch_size, maxlen) sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) print('training begin') sys.stdout.flush() start_time = time.time() iter = 0 try: loss_sum = 0.0 trials = 0 for src, tgt in train_data: data_iter = prepare_data(src, tgt) loss = model.train(sess, list(data_iter) + [lr]) loss_sum += loss iter += 1 if iter % test_iter == 0: metrics = evaluate_full(sess, valid_data, model, best_model_path, batch_size, item_cate_map) log_str = 'iter: %d, train loss: %.4f' % (iter, loss_sum / test_iter) if metrics != {}: log_str += ', ' + ', '.join(['valid ' + key + ': %.6f' % value for key, value in metrics.items()]) print(exp_name) print(log_str) writer.add_scalar('train/loss', loss_sum / test_iter, iter) if metrics != {}: for key, value in metrics.items(): writer.add_scalar('eval/' + key, value, iter) if 'recall' in metrics: recall = metrics['recall'] global best_metric if recall > best_metric: best_metric = recall model.save(sess, best_model_path) trials = 0 else: trials += 1 if trials > patience: break loss_sum = 0.0 test_time = time.time() print("time interval: %.4f min" % ((test_time-start_time)/60.0)) sys.stdout.flush() if iter >= max_iter * 1000: break except KeyboardInterrupt: print('-' * 89) print('Exiting from training early') model.restore(sess, best_model_path) metrics = evaluate_full(sess, valid_data, model, best_model_path, batch_size, item_cate_map, save=False) print(', '.join(['valid ' + key + ': %.6f' % value for key, value in metrics.items()])) test_data = DataIterator(test_file, batch_size, maxlen, train_flag=2) metrics = evaluate_full(sess, test_data, model, best_model_path, batch_size, item_cate_map, save=False) print(', '.join(['test ' + key + ': %.6f' % value for key, value in metrics.items()]))
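# evaluate_full is assumed to report recall over the validation users (the
# 'recall' key checked above); the per-user core of such a metric, sketched
# (item_cate_map is only needed for the diversity metric, not for recall):
def sketch_recall(pred_items, true_items):
    # fraction of the user's held-out items that appear in the top-N predictions
    return len(set(pred_items) & set(true_items)) / max(len(true_items), 1)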
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, qw_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)  # clip gradients
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
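# util.EMA above keeps an exponential moving average of the weights and swaps
# them in for evaluation (ema.assign / ema.resume). A sketch of that contract,
# matching how it is called in the loop; the repo's own class may differ in
# details:
class SketchEMA:
    def __init__(self, model, decay):
        self.decay = decay
        self.shadow = {n: p.data.clone() for n, p in model.named_parameters()
                       if p.requires_grad}
        self.backup = {}

    def __call__(self, model, num_updates):
        # bias-corrected decay, smaller early in training
        decay = min(self.decay, (1.0 + num_updates) / (10.0 + num_updates))
        for n, p in model.named_parameters():
            if p.requires_grad:
                self.shadow[n] = (1.0 - decay) * p.data + decay * self.shadow[n]

    def assign(self, model):
        # swap the EMA weights in for evaluation
        for n, p in model.named_parameters():
            if p.requires_grad:
                self.backup[n] = p.data
                p.data = self.shadow[n]

    def resume(self, model):
        # restore the training weights
        for n, p in model.named_parameters():
            if p.requires_grad:
                p.data = self.backup[n]
        self.backup = {}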
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss, i + len(train_loader) * epoch)

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}".format(output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path)

            iteration += 1
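# save_checkpoint / load_checkpoint are assumed to round-trip the model, the
# optimizer state and the iteration counter; a minimal sketch consistent with
# how they are called above (the repo's own versions may store extra fields):
def sketch_save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    torch.save({'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate,
                'iteration': iteration}, filepath)

def sketch_load_checkpoint(checkpoint_path, model, optimizer):
    ckpt = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(ckpt['model'])
    optimizer.load_state_dict(ckpt['optimizer'])
    return model, optimizer, ckpt['iteration']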
def __init__(self, log_dir, fps):
    self._log_dir = log_dir
    self.fps = fps
    logger.info('Logging training data to: ' + log_dir)
    self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1)
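# Methods that typically accompany an __init__ like this one (a sketch of the
# assumed interface; the original class may expose more):
def sketch_log_scalar(self, scalar, name, step):
    self._summ_writer.add_scalar('{}'.format(name), scalar, step)

def sketch_log_video(self, video_frames, name, step):
    # video_frames: tensor shaped [N, T, C, H, W], values in [0, 1]
    self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=self.fps)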