def main_worker():
    opt = parse_opts()
    print(opt)

    seed = 1
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # CUDA for PyTorch
    device = torch.device(f"cuda:{opt.gpu}" if opt.use_cuda else "cpu")

    # tensorboard
    summary_writer = tensorboardX.SummaryWriter(log_dir='tf_logs')

    # defining model
    encoder_cnn, decoder_rnn = generate_model(opt, device)

    # get data loaders
    train_loader, val_loader = get_loaders(opt)

    # optimizer
    crnn_params = list(encoder_cnn.parameters()) + \
        list(decoder_rnn.parameters())
    optimizer = torch.optim.Adam(crnn_params,
                                 lr=opt.lr_rate,
                                 weight_decay=opt.weight_decay)
    # scheduler = lr_scheduler.ReduceLROnPlateau(
    #     optimizer, 'min', patience=opt.lr_patience)
    criterion = nn.CrossEntropyLoss()

    # resume model
    if opt.resume_path:
        start_epoch = resume_model(opt, encoder_cnn, decoder_rnn, optimizer)
    else:
        start_epoch = 1

    # start training
    for epoch in range(start_epoch, opt.n_epochs + 1):
        train_loss, train_acc = train_epoch(encoder_cnn, decoder_rnn,
                                            train_loader, criterion,
                                            optimizer, epoch,
                                            opt.log_interval, device)
        val_loss, val_acc = val_epoch(encoder_cnn, decoder_rnn, val_loader,
                                      criterion, device)

        # saving weights to checkpoint
        if epoch % opt.save_interval == 0:
            # scheduler.step(val_loss)
            # write summary
            summary_writer.add_scalar('losses/train_loss', train_loss,
                                      global_step=epoch)
            summary_writer.add_scalar('losses/val_loss', val_loss,
                                      global_step=epoch)
            summary_writer.add_scalar('acc/train_acc', train_acc * 100,
                                      global_step=epoch)
            summary_writer.add_scalar('acc/val_acc', val_acc * 100,
                                      global_step=epoch)

            state = {'epoch': epoch,
                     'encoder_state_dict': encoder_cnn.state_dict(),
                     'decoder_state_dict': decoder_rnn.state_dict(),
                     'optimizer_state_dict': optimizer.state_dict()}
            torch.save(state,
                       os.path.join('snapshots',
                                    f'{opt.model}-Epoch-{epoch}-Loss-{val_loss}.pth'))
            print("Epoch {} model saved!\n".format(epoch))
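# The checkpoint saved above stores 'epoch', 'encoder_state_dict', 'decoder_state_dict' and
# 'optimizer_state_dict'. A minimal sketch of what the resume_model() helper used here
# presumably does (an assumption for illustration; the project's actual helper may differ,
# and torch is assumed to be imported as elsewhere in this file):
def resume_model_sketch(opt, encoder_cnn, decoder_rnn, optimizer):
    """Restore encoder/decoder/optimizer state and return the next epoch to run."""
    checkpoint = torch.load(opt.resume_path, map_location='cpu')
    encoder_cnn.load_state_dict(checkpoint['encoder_state_dict'])
    decoder_rnn.load_state_dict(checkpoint['decoder_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    return checkpoint['epoch'] + 1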
def main():
    # init or load model
    print("init model with input shape", config["input_shape"])
    model = NvNet(config=config, input_shape=config["input_shape"],
                  seg_outChans=config["n_labels"])
    parameters = model.parameters()
    optimizer = optim.Adam(parameters,
                           lr=config["initial_learning_rate"],
                           weight_decay=config["L2_norm"])
    start_epoch = 1
    if config["VAE_enable"]:
        loss_function = CombinedLoss(k1=config["loss_k1_weight"],
                                     k2=config["loss_k2_weight"])
    else:
        loss_function = SoftDiceLoss()

    # data generator
    print("data generating")
    training_data = BratsDataset(phase="train", config=config)
    train_loader = torch.utils.data.DataLoader(dataset=training_data,
                                               batch_size=config["batch_size"],
                                               shuffle=True,
                                               pin_memory=True)
    validation_data = BratsDataset(phase="validate", config=config)
    validation_loader = torch.utils.data.DataLoader(dataset=validation_data,
                                                    batch_size=config["batch_size"],
                                                    shuffle=True,
                                                    pin_memory=True)
    train_logger = Logger(model_name=config["model_file"],
                          header=['epoch', 'loss', 'acc', 'lr'])

    if config["cuda_devices"] is not None:
        model = model.cuda()
        loss_function = loss_function.cuda()

    # if not config["overwrite"] and os.path.exists(config["model_file"]) or os.path.exists(config["saved_model_file"]):
    #     model, start_epoch, optimizer = load_old_model(model, optimizer, saved_model_path=config["saved_model_file"])
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min',
                                               factor=config["lr_decay"],
                                               patience=config["patience"])

    print("training on label:{}".format(config["labels"]))
    for i in range(start_epoch, config["epochs"]):
        train_epoch(epoch=i, data_loader=train_loader, model=model,
                    model_name=config["model_file"], criterion=loss_function,
                    optimizer=optimizer, opt=config, epoch_logger=train_logger)
        val_loss = val_epoch(epoch=i, data_loader=validation_loader, model=model,
                             criterion=loss_function, opt=config,
                             optimizer=optimizer, logger=train_logger)
        scheduler.step(val_loss)
def main_worker(opt): # opt.device = torch.device('cuda') model = RACNN(num_classes=opt.num_classes) model = model.to(opt.device) # model = torch.nn.DataParallel(model,device_ids=[0,1]) print(model) cls_params = list(model.b1.parameters()) + list( model.b2.parameters()) + list(model.classifier1.parameters()) + list( model.classifier2.parameters()) apn_params = list(model.apn.parameters()) # optimizer = model.parameters() criterion = CrossEntropyLoss().to(opt.device) (train_loader, train_logger, optimizer_cls, optimizer_apn, scheduler) = get_train_utils(opt, cls_params, apn_params) val_loader, val_logger = get_val_utils(opt) test_sample, _ = next(iter(val_loader)) tb_writer = SummaryWriter(log_dir=opt.result_path) pretrainAPN(train_loader, optimizer_apn, opt, model, tb_writer) # model = model.to(opt.device) for i in range(opt.begin_epoch, opt.n_epochs + 1): cls_train_epoch(i, train_loader, model, criterion, optimizer_cls, opt.device, train_logger, tb_writer) apn_train_epoch(i, train_loader, model, optimizer_apn, opt.device, tb_writer) if i % opt.checkpoint == 0: save_file_path = opt.result_path / 'save_{}.pth'.format(i) save_checkpoint(save_file_path, i, model, optimizer_cls, optimizer_apn, scheduler) # if i % 5 == 0: prev_val_loss = val_epoch(i, val_loader, model, criterion, opt.device, val_logger, tb_writer) if opt.lr_scheduler == 'multistep': scheduler.step() elif opt.lr_scheduler == 'plateau': scheduler.step(prev_val_loss) test_sample = test_sample.to(opt.device) _, _, _, crops = model(test_sample) img = crops[0].data # pic_path = str(opt.result_path)+'/samples/' save_img( img, path='/home/zhaoliu/car_brand/racnn/results/samples/iter_{}@2x.jpg' .format(i), annotation=f' 2xstep = {i}')
def main(): opt = parse_opts() opt.device_ids = list(range(device_count())) local2global_path(opt) model, parameters = generate_model(opt) criterion = get_loss(opt) criterion = criterion.cuda() optimizer = get_optim(opt, parameters) writer = SummaryWriter(logdir=opt.log_path) # train spatial_transform = get_spatial_transform(opt, 'train') temporal_transform = TSN(seq_len=opt.seq_len, snippet_duration=opt.snippet_duration, center=False) target_transform = ClassLabel() training_data = get_training_set(opt, spatial_transform, temporal_transform, target_transform) train_loader = get_data_loader(opt, training_data, shuffle=True) # validation spatial_transform = get_spatial_transform(opt, 'test') temporal_transform = TSN(seq_len=opt.seq_len, snippet_duration=opt.snippet_duration, center=False) target_transform = ClassLabel() validation_data = get_validation_set(opt, spatial_transform, temporal_transform, target_transform) val_loader = get_data_loader(opt, validation_data, shuffle=False) for i in range(1, opt.n_epochs + 1): train_epoch(i, train_loader, model, criterion, optimizer, opt, training_data.class_names, writer) val_epoch(i, val_loader, model, criterion, opt, writer, optimizer) writer.close()
def main_worker(opt):
    model = generate_model(opt)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
    model = make_data_parallel(model, opt.distributed, opt.device)

    val_loader, val_logger = get_val_utils(opt)
    criterion = CrossEntropyLoss().to(opt.device)

    prev_val_loss, val_acc = val_epoch(0, val_loader, model, criterion,
                                       opt.device, val_logger, None,
                                       opt.distributed)
    print('Acc ({acc:.3f})'.format(acc=val_acc))
def main(): opt = set_opts() model = load_pretrained_resnet101(opt) train_loader, val_loader, test_loader, test_data = get_ucf_data(opt) criterion = nn.CrossEntropyLoss() if not opt.no_cuda: criterion = criterion.cuda() # get fine-tune parameters (we fine-tune all of them) parameters = get_fine_tuning_parameters(model, opt.ft_begin_index) optimizer = optim.SGD(parameters, lr=opt.learning_rate, momentum=opt.momentum, dampening=opt.dampening, weight_decay=opt.weight_decay, nesterov=opt.nesterov) scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=opt.lr_patience) train_logger = Logger(os.path.join(opt.result_path, 'train.log'), ['epoch', 'loss', 'acc', 'lr']) train_batch_logger = Logger( os.path.join(opt.result_path, 'train_batch.log'), ['epoch', 'batch', 'iter', 'loss', 'acc', 'lr']) val_logger = Logger(os.path.join(opt.result_path, 'val.log'), ['epoch', 'loss', 'acc']) # training for i in range(opt.begin_epoch, opt.n_epochs + 1): train_epoch(i, train_loader, model, criterion, optimizer, opt, train_logger, train_batch_logger) validation_loss = val_epoch(i, val_loader, model, criterion, opt, val_logger) scheduler.step(validation_loss) # testing test_results, all_output_buffer = final_test(test_loader, model, opt, test_data.class_names)
CenterCrop(opt.sample_size), ToTensor(opt.norm_value), norm_method ]) temporal_transform = LoopPadding(opt.sample_duration) target_transform = ClassLabel() validation_data = get_validation_set( opt, spatial_transform, temporal_transform, target_transform) val_loader = torch.utils.data.DataLoader( validation_data, batch_size=opt.batch_size, shuffle=False, num_workers=opt.n_threads, pin_memory=True) val_logger = Logger( os.path.join(opt.result_path, 'val.log'), ['epoch', 'loss', 'acc']) validation_loss = val_epoch(opt.begin_epoch, val_loader, model, criterion, opt, val_logger) # similarity model for testing sim_model = None if opt.resume_path_sim != '': opt.n_finetune_classes = opt.n_classes sim_model, _ = generate_sim_model(opt) print('loading similarity model checkpoint {}'.format(opt.resume_path_sim)) checkpoint = torch.load(opt.resume_path_sim) print(opt.arch, checkpoint['arch']) assert opt.arch == checkpoint['arch'] if not opt.no_cuda: sim_model.module.load_state_dict(checkpoint['state_dict']) else: sim_model.load_state_dict(checkpoint['state_dict']) sim_model.eval()
def main(opt): place = fluid.CPUPlace() if opt.no_cuda else fluid.CUDAPlace(0) with fluid.dygraph.guard(place): print(place) random.seed(opt.manual_seed) np.random.seed(opt.manual_seed) prog = fluid.default_main_program() prog.global_seed(opt.manual_seed) os.environ['PYTHONHASHSEED'] = str(opt.manual_seed) model = generate_model(opt) if opt.pretrain_path: model = load_pretrained_model(model, opt.pretrain_path, opt.model, opt.n_finetune_classes) if opt.resume_path is not None: model = resume_model(opt.resume_path, model) if opt.pretrain_path: parameters = get_fine_tuning_parameters(model, opt.ft_begin_module) else: parameters = model.parameters() if not opt.no_train: (train_loader, train_logger, train_batch_logger, optimizer, scheduler) = get_train_utils(opt, parameters) if opt.resume_path is not None: opt.begin_epoch, optimizer, scheduler = resume_train_utils( opt.resume_path, optimizer, scheduler) if opt.overwrite_milestones: scheduler.milestones = opt.multistep_milestones if not opt.no_val: val_loader, val_logger = get_val_utils(opt) best_acc = 0.88 for epoch in range(opt.begin_epoch, opt.n_epochs + 1): if not opt.no_train: train_epoch(epoch, train_loader, model, optimizer, scheduler, train_logger, train_batch_logger) if epoch % opt.checkpoint == 0: save_file_path = str( opt.result_path) + 'save_{}_{}_{}'.format( epoch, opt.train_crop, opt.batch_size) save_checkpoint(save_file_path, model, optimizer) if not opt.no_val: prev_val_loss, val_acc = val_epoch(epoch, val_loader, model, val_logger) if not opt.no_train and opt.lr_scheduler == 'multistep': scheduler.epoch() elif not opt.no_train and opt.lr_scheduler == 'plateau': scheduler.step(prev_val_loss) if not opt.no_val: if val_acc > best_acc: best_acc = val_acc save_file_path = str( opt.result_path) + 'save_{}_{}_best_val_acc'.format( epoch, opt.train_crop) save_checkpoint(save_file_path, model, optimizer) if not opt.no_train: current_lr = optimizer.current_step_lr() print("current val_loss is %s, current lr is %s" % (prev_val_loss.numpy()[0], current_lr)) if opt.inference: inference_loader, inference_class_names = get_inference_utils(opt) inference_result_path = opt.result_path / '{}_{}.json'.format( opt.inference_subset, opt.train_crop) inference.inference(inference_loader, model, inference_result_path, inference_class_names, opt.inference_no_average, opt.output_topk)
def main():
    opt = parse_opts()
    ecd_name, cls_name = opt.model_name.split('-')
    ecd_model = get_encoder_net(ecd_name)
    cls_model = get_end_net(cls_name)
    cfg.encoder_model = ecd_name
    cfg.classification_model = cls_name
    if opt.debug:
        cfg.debug = opt.debug
    else:
        if opt.tensorboard == 'TEST':
            cfg.tensorboard = opt.model_name
        else:
            cfg.tensorboard = opt.tensorboard
        cfg.flag = opt.flag
    model = cls_model(cfg, encoder=CNNencoder(
        cfg, ecd_model(pretrained=True, path=opt.encoder_model)))
    cfg.video_path = os.path.join(cfg.root_path, cfg.video_path)
    cfg.annotation_path = os.path.join(cfg.root_path, cfg.annotation_path)
    cfg.list_all_member()
    torch.manual_seed(cfg.manual_seed)

    print('##########################################')
    print('####### model (single-GPU only)')
    print('##########################################')
    model = model.cuda()
    print(model)
    criterion = nn.CrossEntropyLoss()
    if cfg.cuda:
        criterion = criterion.cuda()
    norm_method = Normalize([0, 0, 0], [1, 1, 1])

    print('##########################################')
    print('####### train')
    print('##########################################')
    assert cfg.train_crop in ['random', 'corner', 'center']
    if cfg.train_crop == 'random':
        crop_method = (cfg.scales, cfg.sample_size)
    elif cfg.train_crop == 'corner':
        crop_method = MultiScaleCornerCrop(cfg.scales, cfg.sample_size)
    elif cfg.train_crop == 'center':
        crop_method = MultiScaleCornerCrop(cfg.scales, cfg.sample_size,
                                           crop_positions=['c'])
    spatial_transform = Compose([
        crop_method,
        RandomHorizontalFlip(),
        ToTensor(cfg.norm_value), norm_method
    ])
    temporal_transform = TemporalRandomCrop(cfg.sample_duration)
    target_transform = ClassLabel()
    training_data = get_training_set(cfg, spatial_transform,
                                     temporal_transform, target_transform)
    train_loader = torch.utils.data.DataLoader(training_data,
                                               batch_size=cfg.batch_size,
                                               shuffle=True,
                                               num_workers=cfg.n_threads,
                                               drop_last=False,
                                               pin_memory=True)
    optimizer = model.get_optimizer(lr1=cfg.lr, lr2=cfg.lr2)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min',
                                               patience=cfg.lr_patience)

    print('##########################################')
    print('####### val')
    print('##########################################')
    spatial_transform = Compose([
        Scale(cfg.sample_size),
        CenterCrop(cfg.sample_size),
        ToTensor(cfg.norm_value), norm_method
    ])
    temporal_transform = LoopPadding(cfg.sample_duration)
    target_transform = ClassLabel()
    validation_data = get_validation_set(cfg, spatial_transform,
                                         temporal_transform, target_transform)
    val_loader = torch.utils.data.DataLoader(validation_data,
                                             batch_size=cfg.batch_size,
                                             shuffle=False,
                                             num_workers=cfg.n_threads,
                                             drop_last=False,
                                             pin_memory=True)

    print('##########################################')
    print('####### run')
    print('##########################################')
    if cfg.debug:
        logger = None
    else:
        path = get_log_dir(cfg.logdir, name=cfg.tensorboard, flag=cfg.flag)
        logger = Logger(logdir=path)
        cfg.save_config(path)
    for i in range(cfg.begin_epoch, cfg.n_epochs + 1):
        train_epoch(i, train_loader, model, criterion, optimizer, cfg, logger)
        validation_loss = val_epoch(i, val_loader, model, criterion, cfg, logger)
        scheduler.step(validation_loss)
def main():
    opt = parse_opts()
    ecd_name, cls_name = opt.model_name.split('-')
    cfg.encoder_model = ecd_name
    cfg.classification_model = cls_name
    if opt.debug:
        cfg.debug = opt.debug
    else:
        if opt.tensorboard == 'TEST':
            cfg.tensorboard = opt.model_name
        else:
            cfg.tensorboard = opt.tensorboard
        cfg.flag = opt.flag
    model, parameters = get_model(2)
    cfg.video_path = os.path.join(cfg.root_path, cfg.video_path)
    cfg.annotation_path = os.path.join(cfg.root_path, cfg.annotation_path)
    cfg.list_all_member()
    torch.manual_seed(cfg.manual_seed)

    print('##########################################')
    print('####### model (single-GPU only)')
    print('##########################################')
    print(model)
    criterion = nn.CrossEntropyLoss()
    if cfg.cuda:
        criterion = criterion.cuda()

    print('##########################################')
    print('####### train')
    print('##########################################')
    training_data = FaceRecognition(cfg, '/share5/public/lijianwei/faces/',
                                    TemporalRandomCrop(14),
                                    train_spatial_transform)
    train_loader = torch.utils.data.DataLoader(training_data,
                                               batch_size=cfg.batch_size,
                                               shuffle=True,
                                               num_workers=cfg.n_threads,
                                               drop_last=False,
                                               pin_memory=True)
    optimizer = torch.optim.SGD(parameters,
                                lr=cfg.lr,
                                momentum=0.9,
                                dampening=0.9,
                                weight_decay=1e-3)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min',
                                               patience=cfg.lr_patience)

    print('##########################################')
    print('####### val')
    print('##########################################')
    validation_data = FaceRecognition(cfg, '/share5/public/lijianwei/faces/',
                                      TemporalRandomCrop(14),
                                      val_spatial_transform,
                                      phase='val')
    val_loader = torch.utils.data.DataLoader(validation_data,
                                             batch_size=cfg.batch_size,
                                             shuffle=False,
                                             num_workers=cfg.n_threads,
                                             drop_last=False,
                                             pin_memory=True)

    print('##########################################')
    print('####### run')
    print('##########################################')
    if cfg.debug:
        logger = None
    else:
        path = get_log_dir(cfg.logdir, name=cfg.tensorboard, flag=cfg.flag)
        logger = Logger(logdir=path)
        cfg.save_config(path)
    for i in range(cfg.begin_epoch, cfg.n_epochs + 1):
        train_epoch(i, train_loader, model, criterion, optimizer, cfg, logger)
        validation_loss = val_epoch(i, val_loader, model, criterion, cfg, logger)
        scheduler.step(validation_loss)
def main(): """ input: outp1 concat with 4 modalities; target: difference between outp1 and the GT :return: """ # init or load model print("init model with input shape", config["input_shape"]) model = AttentionVNet(config=config) parameters = model.parameters() optimizer = optim.Adam(parameters, lr=config["initial_learning_rate"], weight_decay=config["L2_norm"]) start_epoch = 1 if config["VAE_enable"]: loss_function = CombinedLoss(combine=config["combine"], k1=config["loss_k1_weight"], k2=config["loss_k2_weight"]) else: loss_function = SoftDiceLoss(combine=config["combine"]) with open('valid_list.txt', 'r') as f: val_list = f.read().splitlines() with open('train_list.txt', 'r') as f: tr_list = f.read().splitlines() config["training_patients"] = tr_list config["validation_patients"] = val_list preprocessor = stage2net_preprocessor(config, patch_size=patch_size) # data_generator print("data generating") training_data = PatchDataset(phase="train", config=config, preprocessor=preprocessor) valildation_data = PatchDataset(phase="validate", config=config, preprocessor=preprocessor) train_logger = Logger(model_name=config["model_name"] + '.h5', header=['epoch', 'loss', 'wt-dice', 'tc-dice', 'et-dice', 'lr']) if not config["overwrite"] and config["saved_model_file"] is not None: if not os.path.exists(config["saved_model_file"]): raise Exception("Invalid model path!") model, start_epoch, optimizer_resume = load_old_model(model, optimizer, saved_model_path=config["saved_model_file"]) parameters = model.parameters() optimizer = optim.Adam(parameters, lr=optimizer_resume.param_groups[0]["lr"], weight_decay=optimizer_resume.param_groups[0]["weight_decay"]) if config["cuda_devices"] is not None: model = model.cuda() model = nn.DataParallel(model) # multi-gpu training for state in optimizer.state.values(): for k, v in state.items(): if isinstance(v, torch.Tensor): state[k] = v.cuda() scheduler = lr_scheduler.LambdaLR(optimizer=optimizer, lr_lambda=poly_lr_scheduler_multi) # scheduler = lr_scheduler.LambdaLR(optimizer=optimizer, lr_lambda=poly_lr_scheduler) max_val_TC_dice = 0. max_val_ET_dice = 0. max_val_AVG_dice = 0. 
for i in range(start_epoch, config["epochs"]): train_epoch(epoch=i, data_set=training_data, model=model, criterion=loss_function, optimizer=optimizer, opt=config, logger=train_logger) val_loss, WT_dice, TC_dice, ET_dice = val_epoch(epoch=i, data_set=valildation_data, model=model, criterion=loss_function, opt=config, optimizer=optimizer, logger=train_logger) scheduler.step() dices = np.array([WT_dice, TC_dice, ET_dice]) AVG_dice = dices.mean() save_flag = False if config["checkpoint"] and TC_dice > max_val_TC_dice: max_val_TC_dice = TC_dice save_flag = True if config["checkpoint"] and ET_dice > max_val_ET_dice: max_val_ET_dice = ET_dice save_flag = True if config["checkpoint"] and AVG_dice > max_val_AVG_dice: max_val_AVG_dice = AVG_dice save_flag = True if save_flag: save_dir = config["result_path"] if not os.path.exists(save_dir): os.makedirs(save_dir) save_states_path = os.path.join(save_dir, 'epoch_{0}_val_loss_{1:.4f}_TC_{2:.4f}_ET_{3:.4f}_AVG_{4:.4f}.pth'.format(i, val_loss, TC_dice, ET_dice, AVG_dice)) if config["cuda_devices"] is not None: state_dict = model.module.state_dict() else: state_dict = model.state_dict() states = { 'epoch': i, 'state_dict': state_dict, 'optimizer': optimizer.state_dict(), } torch.save(states, save_states_path) save_model_path = os.path.join(save_dir, "best_model.pth") if os.path.exists(save_model_path): os.system("rm "+ save_model_path) torch.save(model, save_model_path) print("batch {0:d} finished, validation loss:{1:.4f}; TC:{2:.4f}, ET:{3:.4f}, AVG:{4:.4f}".format(i, val_loss, TC_dice, ET_dice, AVG_dice))
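# The LambdaLR scheduler built above multiplies the base learning rate by whatever
# poly_lr_scheduler_multi(epoch) returns at each scheduler.step(). A minimal sketch of a
# polynomial-decay factor of that shape (an assumption, not the author's exact implementation;
# the max_epochs and power values below are placeholders, in the real code max_epochs would
# presumably come from config["epochs"]):
def poly_lr_factor_sketch(epoch, max_epochs=300, power=0.9):
    """Return the multiplicative LR factor (1 - epoch/max_epochs) ** power for LambdaLR."""
    return max(0.0, 1.0 - epoch / max_epochs) ** power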
def main_worker(index, opt): random.seed(opt.manual_seed) np.random.seed(opt.manual_seed) torch.manual_seed(opt.manual_seed) if index >= 0 and opt.device.type == 'cuda': opt.device = torch.device(f'cuda:{index}') opt.is_master_node = not opt.distributed or opt.dist_rank == 0 model = generate_model(opt) print('after generating model:', model.fc.in_features, ':', model.fc.out_features) print('feature weights:', model.fc.weight.shape, ':', model.fc.bias.shape) if opt.resume_path is not None: model = resume_model(opt.resume_path, opt.arch, model) print('after resume model:', model.fc.in_features, ':', model.fc.out_features) print('feature weights:', model.fc.weight.shape, ':', model.fc.bias.shape) # summary(model, input_size=(3, 112, 112)) # if opt.pretrain_path: # model = load_pretrained_model(model, opt.pretrain_path, opt.model, # opt.n_finetune_classes) print('after pretrained model:', model.fc.in_features, ':', model.fc.out_features) print('feature weights:', model.fc.weight.shape, ':', model.fc.bias.shape) print(torch_summarize(model)) # parameters = model.parameters() # for name, param in model.named_parameters(): # if param.requires_grad: # print(name, param.data) # summary(model, (3, 112, 112)) # return # print('model parameters shape', parameters.shape) (train_loader, train_sampler, train_logger, train_batch_logger, optimizer, scheduler) = get_train_utils(opt, model.parameters()) for i, (inputs, targets) in enumerate(train_loader): print('input shape:', inputs.shape) print('targets shape:', targets.shape) outputs = model(inputs) print("output shape", outputs.shape) model_arch = make_dot(outputs, params=dict(model.named_parameters())) print(model_arch) model_arch.render("/apollo/data/model.png", format="png") # Source(model_arch).render('/apollo/data/model.png') # print("generating /apollo/data/model.png") break # make_dot(yhat, params=dict(list(model.named_parameters()))).render("rnn_torchviz", format="png") return if opt.batchnorm_sync: assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.' 
model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) if opt.pretrain_path: model = load_pretrained_model(model, opt.pretrain_path, opt.model, opt.n_finetune_classes) if opt.resume_path is not None: model = resume_model(opt.resume_path, opt.arch, model) model = make_data_parallel(model, opt.distributed, opt.device) if opt.pretrain_path: parameters = get_fine_tuning_parameters(model, opt.ft_begin_module) else: parameters = model.parameters() if opt.is_master_node: print(model) criterion = CrossEntropyLoss().to(opt.device) if not opt.no_train: (train_loader, train_sampler, train_logger, train_batch_logger, optimizer, scheduler) = get_train_utils(opt, parameters) if opt.resume_path is not None: opt.begin_epoch, optimizer, scheduler = resume_train_utils( opt.resume_path, opt.begin_epoch, optimizer, scheduler) if opt.overwrite_milestones: scheduler.milestones = opt.multistep_milestones if not opt.no_val: val_loader, val_logger = get_val_utils(opt) if opt.tensorboard and opt.is_master_node: from torch.utils.tensorboard import SummaryWriter if opt.begin_epoch == 1: tb_writer = SummaryWriter(log_dir=opt.result_path) else: tb_writer = SummaryWriter(log_dir=opt.result_path, purge_step=opt.begin_epoch) else: tb_writer = None prev_val_loss = None for i in range(opt.begin_epoch, opt.n_epochs + 1): if not opt.no_train: if opt.distributed: train_sampler.set_epoch(i) current_lr = get_lr(optimizer) train_epoch(i, train_loader, model, criterion, optimizer, opt.device, current_lr, train_logger, train_batch_logger, tb_writer, opt.distributed) if i % opt.checkpoint == 0 and opt.is_master_node: save_file_path = opt.result_path / 'save_{}.pth'.format(i) save_checkpoint(save_file_path, i, opt.arch, model, optimizer, scheduler) if not opt.no_val: prev_val_loss = val_epoch(i, val_loader, model, criterion, opt.device, val_logger, tb_writer, opt.distributed) if not opt.no_train and opt.lr_scheduler == 'multistep': scheduler.step() elif not opt.no_train and opt.lr_scheduler == 'plateau': scheduler.step(prev_val_loss) if opt.inference: inference_loader, inference_class_names = get_inference_utils(opt) inference_result_path = opt.result_path / '{}.json'.format( opt.inference_subset) inference.inference(inference_loader, model, inference_result_path, inference_class_names, opt.inference_no_average, opt.output_topk)
if name == prev_name: param.data = prev_param.data print('{}: {}({}) -> {}({})'.format( i, prev_name, prev_param.shape, name, param.shape)) if name == 'module.exp.weight': param.requires_grad = False check = True if not check: raise if opt.init_path: check = False print('initilize checkpoint {}'.format(opt.init_path)) checkpoint = torch.load(opt.init_path) for (name, param), i in zip(model.named_parameters(), range(opt.init_level)): for (prev_name, prev_param) in checkpoint['state_dict'].items(): if name == prev_name: param.data = prev_param.data print('{}: {}({}) -> {}({})'.format( i, prev_name, prev_param.shape, name, param.shape)) check = True if not check: raise model.to(device) print('run') validation_loss = val_epoch(1, val_loader, model, criterion, opt, val_logger, device)
for i in range(opt.begin_epoch, opt.n_epochs + 1): if not opt.no_train: cudnn.benchmark = True training_metrics = train_epoch(i, train_loader, model, criterion, optimizer, opt, train_logger, writer) for k, v in training_metrics.items(): stats_dict[k][i] = v if i % opt.checkpoint == 0: save_file_path = os.path.join(opt.save_path, 'train_' + str(i) + '_model.pth') states = { 'epoch': i, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(),} torch.save(states, save_file_path) if not opt.no_val and i % opt.val_per_epoches == 0: test_metrics = val_epoch(i, val_loader, model, criterion, opt, val_logger, writer) for k, v in test_metrics.items(): stats_dict[k][i] = v scheduler.step() writer.close() save_stats_dir = os.path.join(opt.save_path, 'stats') if not os.path.exists(save_stats_dir): os.makedirs(save_stats_dir) with open(os.path.join(save_stats_dir, 'training_stats.npz'), 'wb') as f: np.savez(f, **stats_dict)
        val_logger = Logger(os.path.join(result_dir_name, 'val.log'), [
            'epoch', 'loss', 'acc-top1', 'acc-top5', 'batch-time', 'epoch-time'
        ])
    else:
        val_logger = Logger(
            os.path.join(result_dir_name, 'val.log'),
            ['epoch', 'loss', 'acc-top1', 'batch-time', 'epoch-time'])

    # load weights from a saved checkpoint file, if one exists
    if opt.resume_path:
        path = os.path.join(result_dir_name, opt.resume_path)
        print('loading checkpoint {}'.format(path))
        checkpoint = torch.load(path)
        # check that the architectures match
        assert opt.arch == checkpoint['arch']
        # pick up the epoch count from the previous run
        opt.begin_epoch = checkpoint['epoch']
        # load the parameters
        model.load_state_dict(checkpoint['state_dict'])
        if not opt.no_train:
            optimizer.load_state_dict(checkpoint['optimizer'])
            optimizer.param_groups[0]['lr'] = opt.learning_rate

    print('run')
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            train_epoch(i, train_loader, model, criterion, optimizer, opt,
                        train_logger, result_dir_name, device)
        if not opt.no_val:
            val_epoch(i, val_loader, model, criterion, opt, val_logger, device)
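# Logger(path, header) in scripts like the one above is written to once per epoch or batch
# with a dict keyed by the header columns (see the .log({...}) calls later in this file).
# A minimal sketch of such a tab-separated logger, which is an assumption rather than the
# project's actual utility class:
import csv

class TSVLoggerSketch:
    def __init__(self, path, header):
        self.file = open(path, 'w', newline='')
        self.header = header
        self.writer = csv.writer(self.file, delimiter='\t')
        self.writer.writerow(header)

    def log(self, values):
        # values: dict keyed by header columns, e.g. {'epoch': 1, 'loss': 0.7, 'acc': 0.5}
        self.writer.writerow([values[col] for col in self.header])
        self.file.flush()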
def main():
    args = parse_command()
    # if more than one GPU is available, train on all of them
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        args.batch_size = args.batch_size * torch.cuda.device_count()
    else:
        print("Let's use", torch.cuda.current_device())

    train_loader, val_loader = create_dataloader(args)
    print('train size:', len(train_loader))
    print('val size:', len(val_loader))

    # create results folder, if not already exists
    output_directory = utils.get_output_directory_run(args)
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    log_path = os.path.join(
        output_directory, 'logs',
        datetime.now().strftime('%b%d_%H-%M-%S') + '_' + socket.gethostname())
    if os.path.isdir(log_path):
        shutil.rmtree(log_path)
    os.makedirs(log_path)
    logger = SummaryWriter(log_path)

    torch.manual_seed(args.manual_seed)
    # define the model
    model = resnext.resnet101(num_classes=args.n_classes,
                              shortcut_type=args.resnet_shortcut,
                              cardinality=args.resnext_cardinality,
                              sample_size=args.sample_size,
                              sample_duration=args.sample_duration)
    model = model.cuda()
    model = nn.DataParallel(model, device_ids=None)
    if args.pretrain_path:
        print('loading pretrained model {}'.format(args.pretrain_path))
        pretrain = torch.load(args.pretrain_path)
        model.load_state_dict(pretrain['state_dict'])
        del pretrain  # free the loaded checkpoint

    # parameters = get_fine_tuning_parameters(model, args.ft_begin_index)
    train_params = [{
        'params': resnext.get_1x_lr_params(model),
        'lr': args.lr
    }, {
        'params': resnext.get_10x_lr_params(model),
        'lr': args.lr * 10
    }]

    # loss function
    criterion = nn.CrossEntropyLoss()
    if torch.cuda.is_available():
        criterion = criterion.cuda()

    # optimizer
    args.nesterov = False
    if args.nesterov:
        dampening = 0
    else:
        dampening = args.dampening
    optimizer = optim.SGD(train_params,
                          lr=args.learning_rate,
                          momentum=args.momentum,
                          dampening=dampening,
                          weight_decay=args.weight_decay,
                          nesterov=args.nesterov)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min',
                                               patience=args.lr_patience)

    print('run')
    for i in range(args.begin_epoch, args.n_epochs + 1):
        train_epoch(i, train_loader, model, criterion, optimizer, logger)
        validation_loss = val_epoch(i, val_loader, model, criterion,
                                    output_directory, logger)
        if i % args.checkpoint == 0:
            save_file_path = os.path.join(output_directory,
                                          'save_{}.pth'.format(i))
            states = {
                'epoch': i + 1,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            torch.save(states, save_file_path)
        scheduler.step(validation_loss)
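# get_1x_lr_params / get_10x_lr_params above split the DataParallel-wrapped network into
# backbone parameters (base LR) and head parameters (10x LR). A minimal sketch of such a
# split, under the assumption that the classifier is the submodule named 'fc' (the real
# resnext helpers may partition the model differently):
def get_1x_lr_params_sketch(model):
    # backbone: everything except the final classifier
    for name, param in model.named_parameters():
        if not name.startswith('module.fc'):
            yield param

def get_10x_lr_params_sketch(model):
    # head: the final classifier, trained with a 10x larger learning rate
    for name, param in model.named_parameters():
        if name.startswith('module.fc'):
            yield param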
print('loading checkpoint {}'.format(opt.resume_path)) checkpoint = torch.load(opt.resume_path) assert opt.arch == checkpoint['arch'] opt.begin_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) if not opt.no_train: optimizer.load_state_dict(checkpoint['optimizer']) print('run') for i in range(opt.begin_epoch, opt.n_epochs + 1): if not opt.no_train: train_epoch(i, train_loader, model, criterion, optimizer, opt, train_logger, train_batch_logger) if not opt.no_val: validation_loss = val_epoch(i, val_loader, model, criterion, opt, val_logger) if not opt.no_train and not opt.no_val: scheduler.step(validation_loss) if opt.test: spatial_transform = Compose([ Scale(int(opt.sample_size / opt.scale_in_test)), CornerCrop(opt.sample_size, opt.crop_position_in_test), ToTensor(opt.norm_value), norm_method ]) temporal_transform = LoopPadding(opt.sample_duration) target_transform = VideoID() test_data = get_test_set(opt, spatial_transform, temporal_transform, target_transform)
def main(): # init or load model print("init model with input shape", config["input_shape"]) if config["attention"]: model = AttentionVNet(config=config) else: model = NvNet(config=config) parameters = model.parameters() optimizer = optim.Adam(parameters, lr=config["initial_learning_rate"], weight_decay=config["L2_norm"]) start_epoch = 1 if config["VAE_enable"]: loss_function = CombinedLoss(new_loss=config["new_SoftDiceLoss"], k1=config["loss_k1_weight"], k2=config["loss_k2_weight"], alpha=config["focal_alpha"], gamma=config["focal_gamma"], focal_enable=config["focal_enable"]) else: loss_function = SoftDiceLoss(new_loss=config["new_SoftDiceLoss"]) with open('valid_list_v2.txt', 'r') as f: val_list = f.read().splitlines() # with open('train_list.txt', 'r') as f: with open('train_list_v2.txt', 'r') as f: tr_list = f.read().splitlines() config["training_patients"] = tr_list config["validation_patients"] = val_list # data_generator print("data generating") training_data = BratsDataset(phase="train", config=config) # x = training_data[0] # for test valildation_data = BratsDataset(phase="validate", config=config) train_logger = Logger( model_name=config["model_name"] + '.h5', header=['epoch', 'loss', 'wt-dice', 'tc-dice', 'et-dice', 'lr']) if not config["overwrite"] and config["saved_model_file"] is not None: if not os.path.exists(config["saved_model_file"]): raise Exception("Invalid model path!") model, start_epoch, optimizer_resume = load_old_model( model, optimizer, saved_model_path=config["saved_model_file"]) parameters = model.parameters() optimizer = optim.Adam( parameters, lr=optimizer_resume.param_groups[0]["lr"], weight_decay=optimizer_resume.param_groups[0]["weight_decay"]) if config["cuda_devices"] is not None: model = model.cuda() loss_function = loss_function.cuda() model = nn.DataParallel(model) # multi-gpu training for state in optimizer.state.values(): for k, v in state.items(): if isinstance(v, torch.Tensor): state[k] = v.cuda() # scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=config["lr_decay"], patience=config["patience"]) scheduler = lr_scheduler.LambdaLR( optimizer=optimizer, lr_lambda=poly_lr_scheduler) # can't restore lr correctly max_val_WT_dice = 0. max_val_AVG_dice = 0. 
for i in range(start_epoch, config["epochs"]): train_epoch(epoch=i, data_set=training_data, model=model, criterion=loss_function, optimizer=optimizer, opt=config, logger=train_logger) val_loss, WT_dice, TC_dice, ET_dice = val_epoch( epoch=i, data_set=valildation_data, model=model, criterion=loss_function, opt=config, optimizer=optimizer, logger=train_logger) scheduler.step() # scheduler.step(val_loss) dices = np.array([WT_dice, TC_dice, ET_dice]) AVG_dice = dices.mean() if config["checkpoint"] and (WT_dice > max_val_WT_dice or AVG_dice > max_val_AVG_dice or WT_dice >= 0.912): max_val_WT_dice = WT_dice max_val_AVG_dice = AVG_dice # save_dir = os.path.join(config["result_path"], config["model_file"].split("/")[-1].split(".h5")[0]) save_dir = config["result_path"] if not os.path.exists(save_dir): os.makedirs(save_dir) save_states_path = os.path.join( save_dir, 'epoch_{0}_val_loss_{1:.4f}_WTdice_{2:.4f}_AVGDice:{3:.4f}.pth' .format(i, val_loss, WT_dice, AVG_dice)) if config["cuda_devices"] is not None: state_dict = model.module.state_dict() else: state_dict = model.state_dict() states = { 'epoch': i, 'state_dict': state_dict, 'optimizer': optimizer.state_dict(), } torch.save(states, save_states_path) save_model_path = os.path.join(save_dir, "best_model.pth") if os.path.exists(save_model_path): os.system("rm " + save_model_path) torch.save(model, save_model_path) print( "batch {0:d} finished, validation loss:{1:.4f}; WTDice:{2:.4f}; AVGDice:{3:.4f}" .format(i, val_loss, WT_dice, AVG_dice))
def main(): opt = parse_opts() print(opt) seed = 0 random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) # torch.backends.cudnn.deterministic = True # torch.backends.cudnn.benchmark = False # CUDA for PyTorch use_cuda = torch.cuda.is_available() device = torch.device(f"cuda:{opt.gpu}" if use_cuda else "cpu") train_transform = transforms.Compose([ #transforms.RandomCrop(32, padding=3), transforms.Resize((256, 256)), transforms.RandomHorizontalFlip(0.5), transforms.ColorJitter(brightness=[0.2,1]), GaussianNoise(0.5), # transforms.RandomRotation(10), transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[ 0.229, 0.224, 0.225]) ]) test_transform = transforms.Compose([ #transforms.RandomCrop(32, padding=3), transforms.Resize((256, 256)), transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[ 0.229, 0.224, 0.225]) ]) training_data = torchvision.datasets.ImageFolder( opt.dataset_path) traindataset = MyLazyDataset(training_data, train_transform) valdataset = MyLazyDataset(training_data,test_transform) # Create the index splits for training, validation and test train_size = 0.8 num_train = len(training_data) indices = list(range(num_train)) split = int(np.floor(train_size * num_train)) split2 = int(np.floor((train_size+(1-train_size)/2) * num_train)) np.random.shuffle(indices) train_idx, valid_idx, test_idx = indices[:split], indices[split:split2], indices[split2:] traindata = Subset(traindataset, indices=train_idx) valdata = Subset(valdataset, indices=valid_idx) train_loader = torch.utils.data.DataLoader(traindata, batch_size=opt.batch_size, shuffle=True, num_workers=0) val_loader = torch.utils.data.DataLoader(valdata, batch_size=opt.batch_size, shuffle=True, num_workers=0) print(f'Number of training examples: {len(train_loader.dataset)}') print(f'Number of validation examples: {len(val_loader.dataset)}') # tensorboard summary_writer = tensorboardX.SummaryWriter(log_dir='tf_logs') # define model # model = ResidualNet("ImageNet", opt.depth, opt.num_classes, "CBAM") model = ViT( image_size = 256, patch_size = 32, num_classes = 2, dim = 1024, depth = 6, heads = 8, mlp_dim = 2048, dropout = 0.1, emb_dropout = 0.1 ) if opt.resume_path: checkpoint = torch.load(opt.resume_path) model.load_state_dict(checkpoint['model_state_dict']) epoch = checkpoint['epoch'] print("Model Restored from Epoch {}".format(epoch)) opt.start_epoch = epoch + 1 model.to(device) criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters(), weight_decay=opt.wt_decay) scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=opt.lr_patience) if opt.resume_path: checkpoint = torch.load(opt.resume_path) optimizer.load_state_dict(checkpoint['optimizer_state_dict']) scheduler.load_state_dict(checkpoint['scheduler_state_dict']) th = 100000 # start training for epoch in range(opt.start_epoch, opt.epochs+1): # train, test model train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device, opt) val_loss, val_acc = val_epoch(model, val_loader, criterion, device) scheduler.step(val_loss) lr = optimizer.param_groups[0]['lr'] # saving weights to checkpoint if (epoch) % opt.save_interval == 0: # write summary summary_writer.add_scalar( 'losses/train_loss', train_loss, global_step=epoch) summary_writer.add_scalar( 'losses/val_loss', val_loss, global_step=epoch) summary_writer.add_scalar( 'acc/train_acc', train_acc, global_step=epoch) summary_writer.add_scalar( 'acc/val_acc', val_acc, global_step=epoch) summary_writer.add_scalar( 'lr_rate', 
lr, global_step=epoch) state = {'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), 'scheduler_state_dict':scheduler.state_dict()} if val_loss < th: torch.save(state, os.path.join('./snapshots', f'{opt.dataset}_model.pth')) print("Epoch {} model saved!\n".format(epoch)) th = val_loss
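# GaussianNoise(0.5) in the training transform above is not a torchvision transform; it sits
# before ToTensor, so it must operate on PIL images. A minimal sketch of an additive-noise
# transform with that interface (assumptions: the constructor argument is treated here as the
# probability of applying noise, and the noise std is a separate placeholder; the project's
# version may interpret the argument differently):
import random
import numpy as np
from PIL import Image

class GaussianNoiseSketch:
    def __init__(self, p=0.5, std=10.0):
        self.p = p        # probability of applying noise
        self.std = std    # noise standard deviation in 8-bit pixel units

    def __call__(self, img):
        if random.random() > self.p:
            return img
        arr = np.asarray(img).astype(np.float32)
        arr += np.random.normal(0.0, self.std, arr.shape)
        return Image.fromarray(np.clip(arr, 0, 255).astype(np.uint8))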
def main_worker(index, opt): random.seed(opt.manual_seed) np.random.seed(opt.manual_seed) torch.manual_seed(opt.manual_seed) if index >= 0 and opt.device.type == 'cuda': opt.device = torch.device(f'cuda:{index}') if opt.distributed: opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index dist.init_process_group(backend='nccl', init_method=opt.dist_url, world_size=opt.world_size, rank=opt.dist_rank) opt.batch_size = int(opt.batch_size / opt.ngpus_per_node) opt.n_threads = int( (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node) opt.is_master_node = not opt.distributed or opt.dist_rank == 0 model = generate_model(opt) if opt.batchnorm_sync: assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.' model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) if opt.pretrain_path: model = load_pretrained_model(model, opt.pretrain_path, opt.model, opt.n_finetune_classes) if opt.dropout: n_classes = opt.n_classes if opt.pretrain_path is not None: n_classes = opt.n_finetune_classes model = replace_fc_layer(model=model, dropout_factor=opt.dropout_factor, n_classes=n_classes) if opt.resume_path is not None: model = resume_model(opt.resume_path, opt.arch, model) model = make_data_parallel(model, opt.distributed, opt.device) if opt.pretrain_path: parameters = get_fine_tuning_parameters(model, opt.ft_begin_module) else: parameters = model.parameters() if opt.is_master_node: print(model) if opt.labelsmoothing: criterion = LabelSmoothingCrossEntropy().to(opt.device) else: criterion = CrossEntropyLoss().to(opt.device) if not opt.no_train: (train_loader, train_sampler, train_logger, train_batch_logger, optimizer, scheduler) = get_train_utils(opt, parameters) if opt.resume_path is not None: opt.begin_epoch, optimizer, scheduler = resume_train_utils( opt.resume_path, opt.begin_epoch, optimizer, scheduler) if opt.overwrite_milestones: scheduler.milestones = opt.multistep_milestones if not opt.no_val: val_loader, val_logger = get_val_utils(opt) if opt.tensorboard and opt.is_master_node: from torch.utils.tensorboard import SummaryWriter if opt.begin_epoch == 1: tb_writer = SummaryWriter(log_dir=opt.result_path) else: tb_writer = SummaryWriter(log_dir=opt.result_path, purge_step=opt.begin_epoch) else: tb_writer = None if opt.lr_finder and not opt.no_train and not opt.no_val: print( "Performing Learning Rate Search\nWith Leslie Smith's approach...") lr_finder = LRFinder(model, optimizer, criterion, device=opt.device) lr_finder.range_test(train_loader, val_loader=val_loader, start_lr=opt.learning_rate, end_lr=opt.lrf_end_lr, num_iter=opt.lrf_num_it, step_mode=opt.lrf_mode) lr_finder.plot(log_lr=False) with (opt.result_path / 'lr_search.json').open('w') as results_file: json.dump(lr_finder.history, results_file, default=json_serial) lr_finder.reset() return prev_val_loss = None for i in range(opt.begin_epoch, opt.n_epochs + 1): if not opt.no_train: if opt.distributed: train_sampler.set_epoch(i) #current_lr = get_lr(optimizer) train_epoch(i, train_loader, model, criterion, optimizer, opt.device, train_logger, train_batch_logger, scheduler, opt.lr_scheduler, tb_writer, opt.distributed) if i % opt.checkpoint == 0 and opt.is_master_node: save_file_path = opt.result_path / 'save_{}.pth'.format(i) save_checkpoint(save_file_path, i, opt.arch, model, optimizer, scheduler) if not opt.no_val: prev_val_loss = val_epoch(i, val_loader, model, criterion, opt.device, val_logger, tb_writer, opt.distributed) if not opt.no_train and opt.lr_scheduler == 'multistep': scheduler.step() elif not 
opt.no_train and opt.lr_scheduler == 'plateau': scheduler.step(prev_val_loss) elif not opt.no_train and opt.lr_scheduler == 'cosineannealing': scheduler.step() if opt.inference: inference_loader, inference_class_names = get_inference_utils(opt) inference_result_path = opt.result_path / '{}.json'.format( opt.inference_subset) inference.inference(inference_loader, model, inference_result_path, inference_class_names, opt.inference_no_average, opt.output_topk)
def main():
    # convert input images into an hdf5 file
    if config["overwrite"] or not os.path.exists(config["data_file"]):
        training_files, subject_ids = fetch_training_data_files(return_subject_ids=True)
        write_data_to_file(training_files, config["data_file"],
                           image_shape=config["image_shape"],
                           subject_ids=subject_ids)

    # init or load model
    print("init model with input shape", config["input_shape"])
    model = NvNet(config=config)
    parameters = model.parameters()
    optimizer = optim.Adam(parameters,
                           lr=config["initial_learning_rate"],
                           weight_decay=config["L2_norm"])
    start_epoch = 1
    if config["VAE_enable"]:
        loss_function = CombinedLoss(k1=config["loss_k1_weight"],
                                     k2=config["loss_k2_weight"])
    else:
        loss_function = SoftDiceLoss()

    # data generator
    print("data generating")
    training_data = BratsDataset(phase="train", config=config)
    validation_data = BratsDataset(phase="validate", config=config)

    train_logger = Logger(model_name=config["model_file"],
                          header=['epoch', 'loss', 'acc', 'lr'])

    if config["cuda_devices"] is not None:
        # model = nn.DataParallel(model)  # multi-gpu training
        model = model.cuda()
        loss_function = loss_function.cuda()

    if not config["overwrite"] and config["saved_model_file"] is not None:
        if not os.path.exists(config["saved_model_file"]):
            raise Exception("Invalid model path!")
        model, start_epoch, optimizer = load_old_model(model, optimizer,
                                                       saved_model_path=config["saved_model_file"])
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min',
                                               factor=config["lr_decay"],
                                               patience=config["patience"])

    print("training on label:{}".format(config["labels"]))
    max_val_acc = 0.
    for i in range(start_epoch, config["epochs"]):
        train_epoch(epoch=i, data_set=training_data, model=model,
                    criterion=loss_function, optimizer=optimizer,
                    opt=config, logger=train_logger)
        val_loss, val_acc = val_epoch(epoch=i, data_set=validation_data, model=model,
                                      criterion=loss_function, opt=config,
                                      optimizer=optimizer, logger=train_logger)
        scheduler.step(val_loss)
        if config["checkpoint"] and val_acc > max_val_acc:
            max_val_acc = val_acc
            save_dir = os.path.join(config["result_path"],
                                    config["model_file"].split("/")[-1].split(".h5")[0])
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            save_states_path = os.path.join(
                save_dir,
                'epoch_{0}_val_loss_{1:.4f}_acc_{2:.4f}.pth'.format(i, val_loss, val_acc))
            states = {
                'epoch': i + 1,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            torch.save(states, save_states_path)
            save_model_path = os.path.join(save_dir, "best_model_file.pth")
            if os.path.exists(save_model_path):
                os.system("rm " + save_model_path)
            torch.save(model, save_model_path)
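# load_old_model() above is expected to return (model, start_epoch, optimizer). Matching the
# 'epoch' / 'state_dict' / 'optimizer' keys saved in the checkpoint dict above, a minimal
# sketch (an assumption; the real helper may differ, and torch is assumed to be imported as
# elsewhere in this file) is:
def load_old_model_sketch(model, optimizer, saved_model_path):
    checkpoint = torch.load(saved_model_path, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return model, checkpoint['epoch'], optimizer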
def main_worker(index, opt): random.seed(opt.manual_seed) np.random.seed(opt.manual_seed) torch.manual_seed(opt.manual_seed) if index >= 0 and opt.device.type == 'cuda': opt.device = torch.device(f'cuda:{index}') if opt.distributed: opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index dist.init_process_group(backend='nccl', init_method=opt.dist_url, world_size=opt.world_size, rank=opt.dist_rank) opt.batch_size = int(opt.batch_size / opt.ngpus_per_node) opt.n_threads = int( (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node) opt.is_master_node = not opt.distributed or opt.dist_rank == 0 model = generate_model(opt) if opt.pretrain_path: model = load_pretrained_model(model, opt.pretrain_path, opt.model, opt.n_finetune_classes) model = make_data_parallel(model, opt.distributed, opt.device) if opt.pretrain_path: parameters = get_fine_tuning_parameters(model, opt.ft_begin_module) else: parameters = model.parameters() if opt.is_master_node: print(model) criterion = CrossEntropyLoss().to(opt.device) if not opt.no_train: (train_loader, train_sampler, train_logger, train_batch_logger, optimizer, scheduler) = get_train_utils(opt, parameters) if not opt.no_val: val_loader, val_logger = get_val_utils(opt) if opt.resume_path is not None: if not opt.no_train: opt.begin_epoch, model, optimizer, scheduler = resume( opt.resume_path, opt.arch, opt.begin_epoch, model, optimizer, scheduler) if opt.overwrite_milestones: scheduler.milestones = opt.multistep_milestones else: opt.begin_epoch, model, _, _ = resume(opt.resume_path, opt.arch, opt.begin_epoch, model) if opt.tensorboard and opt.is_master_node: from torch.utils.tensorboard import SummaryWriter if opt.begin_epoch == 1: tb_writer = SummaryWriter(log_dir=opt.result_path) else: tb_writer = SummaryWriter(log_dir=opt.result_path, purge_step=opt.begin_epoch) else: tb_writer = None prev_val_loss = None for i in range(opt.begin_epoch, opt.n_epochs + 1): if not opt.no_train: if opt.distributed: train_sampler.set_epoch(i) current_lr = get_lr(optimizer) train_epoch(i, train_loader, model, criterion, optimizer, opt.device, current_lr, train_logger, train_batch_logger, tb_writer, opt.distributed) if i % opt.checkpoint == 0 and opt.is_master_node: save_file_path = opt.result_path / 'save_{}.pth'.format(i) save_checkpoint(save_file_path, i, opt.arch, model, optimizer, scheduler) if not opt.no_val: prev_val_loss = val_epoch(i, val_loader, model, criterion, opt.device, val_logger, tb_writer, opt.distributed) if not opt.no_train and opt.lr_scheduler == 'multistep': scheduler.step() elif not opt.no_train and opt.lr_scheduler == 'plateau': scheduler.step(prev_val_loss) if opt.inference: inference_loader, inference_class_names = get_inference_utils(opt) inference_result_path = opt.result_path / '{}.json'.format( opt.inference_subset) inference.inference(inference_loader, model, inference_result_path, inference_class_names, opt.inference_no_average, opt.output_topk)
def main_worker(index, opt): random.seed(opt.manual_seed) np.random.seed(opt.manual_seed) torch.manual_seed(opt.manual_seed) if index >= 0 and opt.device.type == 'cuda': # opt.device = torch.device(f'cuda:{index}') opt.device = torch.device('cuda:{}'.format(index)) if opt.distributed: opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index dist.init_process_group(backend='nccl', init_method=opt.dist_url, world_size=opt.world_size, rank=opt.dist_rank) opt.batch_size = int(opt.batch_size / opt.ngpus_per_node) opt.n_threads = int( (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node) opt.is_master_node = not opt.distributed or opt.dist_rank == 0 model = generate_model(opt) if opt.batchnorm_sync: assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.' model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) if opt.pretrain_path: model = load_pretrained_model(model, opt.pretrain_path, opt.model, opt.n_finetune_classes, opt.strg) if opt.strg: model = STRG(model, nclass=opt.n_classes, nrois=opt.nrois) rpn = RPN(nrois=opt.nrois) rpn = make_data_parallel(rpn, opt.distributed, opt.device) else: rpn = None if opt.resume_path is not None: model = resume_model(opt.resume_path, opt.arch, model) model = make_data_parallel(model, opt.distributed, opt.device) # if opt.pretrain_path: # parameters = get_fine_tuning_parameters(model, opt.ft_begin_module) # else: parameters = model.parameters() if opt.is_master_node: print(model) criterion = CrossEntropyLoss().to(opt.device) if not opt.no_train: (train_loader, train_sampler, train_logger, train_batch_logger, optimizer, scheduler) = get_train_utils(opt, parameters) if opt.resume_path is not None: opt.begin_epoch, optimizer, scheduler = resume_train_utils( opt.resume_path, opt.begin_epoch, optimizer, scheduler) if opt.overwrite_milestones: scheduler.milestones = opt.multistep_milestones if not opt.no_val: val_loader, val_logger = get_val_utils(opt) if opt.tensorboard and opt.is_master_node: #from torch.utils.tensorboard import SummaryWriter from tensorboardX import SummaryWriter if opt.begin_epoch == 1: tb_writer = SummaryWriter(log_dir=opt.result_path) else: tb_writer = SummaryWriter(log_dir=opt.result_path, purge_step=opt.begin_epoch) else: tb_writer = None if opt.wandb: name = str(opt.result_path) wandb.init( project='strg', name=name, config=opt, dir=name, # resume=str(opt.resume_path) != '', sync_tensorboard=True) prev_val_loss = None for i in range(opt.begin_epoch, opt.n_epochs + 1): if not opt.no_train: if opt.distributed: train_sampler.set_epoch(i) current_lr = get_lr(optimizer) train_epoch(i, train_loader, model, criterion, optimizer, opt.device, current_lr, train_logger, train_batch_logger, tb_writer, opt.distributed, rpn=rpn, det_interval=opt.det_interval, nrois=opt.nrois) if i % opt.checkpoint == 0 and opt.is_master_node: save_file_path = opt.result_path / 'save_{}.pth'.format(i) save_checkpoint(save_file_path, i, opt.arch, model, optimizer, scheduler) if not opt.no_val: prev_val_loss = val_epoch(i, val_loader, model, criterion, opt.device, val_logger, tb_writer, opt.distributed, rpn=rpn, det_interval=opt.det_interval, nrois=opt.nrois) if not opt.no_train and opt.lr_scheduler == 'multistep': scheduler.step() elif not opt.no_train and opt.lr_scheduler == 'plateau': scheduler.step(prev_val_loss) if opt.inference: inference_loader, inference_class_names = get_inference_utils(opt) inference_result_path = opt.result_path / '{}.json'.format( opt.inference_subset) inference.inference(inference_loader, model, 
inference_result_path, inference_class_names, opt.inference_no_average, opt.output_topk)
def main(): # Print config print(config) # init or load model model = NvNet(config=config) optimizer = optim.Adam(model.parameters(), lr=config["initial_learning_rate"], weight_decay = config["L2_norm"]) start_epoch = 1 if config["VAE_enable"]: loss_function = CombinedLoss(k1=config["loss_k1_weight"], k2=config["loss_k2_weight"]) else: loss_function = SoftDiceLoss() # data_generator print("Loading BraTS dataset...") training_data = BratsDataset(phase="train", config=config) validation_data = BratsDataset(phase="validate", config=config) train_logger = Logger(model_name=config["model_file"],header=['epoch', 'loss', 'acc', 'lr']) if config["cuda_devices"] is not None: #gpu_list = list(range(0, 2)) #model = nn.DataParallel(model, gpu_list) # multi-gpu training model = model.cuda() loss_function = loss_function.cuda() # model = model.to(device=device) # move the model parameters to CPU/GPU # loss_function = loss_function.to(device=device) if not config["overwrite"] and config["saved_model_file"] is not None: if not os.path.exists(config["saved_model_file"]): raise Exception("Invalid model path!") model, start_epoch, optimizer = load_old_model(model, optimizer, saved_model_path=config["saved_model_file"]) scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=config["lr_decay"], patience=config["patience"]) #model = torch.load("checkpoint_models/run0/best_model_file_24.pth") print("training on label:{}".format(config["labels"])) max_val_acc = 0. for i in range(start_epoch,config["epochs"]): train_epoch(epoch=i, data_set=training_data, model=model, criterion=loss_function, optimizer=optimizer, opt=config, logger=train_logger) val_loss, val_acc = val_epoch(epoch=i, data_set=validation_data, model=model, criterion=loss_function, opt=config, optimizer=optimizer, logger=train_logger) scheduler.step(val_loss) if config["checkpoint"] and val_acc >= max_val_acc - 0.10: max_val_acc = val_acc save_dir = os.path.join(config["result_path"], config["model_file"].split("/")[-1].split(".h5")[0]) if not os.path.exists(save_dir): os.makedirs(save_dir) save_states_path = os.path.join(save_dir,'epoch_{0}_val_loss_{1:.4f}_acc_{2:.4f}.pth'.format(i, val_loss, val_acc)) states = { 'epoch': i + 1, # 'state_dict': model.state_dict(), 'encoder': model.encoder.state_dict(), 'decoder': model.decoder.state_dict(), 'vae': model.vae.state_dict(), 'optimizer': optimizer.state_dict(), } torch.save(states, save_states_path) save_model_path = os.path.join(save_dir, "best_model_file_{0}.pth".format(i)) if os.path.exists(save_model_path): os.system("rm "+save_model_path) # torch.save(model, save_model_path) torch.save(states, save_model_path)
def main(): opt = parse_opts() print(opt) #pdb.set_trace() if not os.path.exists(opt.result_path): os.mkdir(opt.result_path) with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file: json.dump(vars(opt), opt_file) device = torch.device("cuda" if opt.use_cuda else "cpu") # Read Phenotype csv = pd.read_csv(opt.csv_dir) if opt.cross_val: for fold in range(5): # change back to 5 train_ID = dd.io.load(os.path.join(opt.MAT_dir, opt.splits))[fold]['X_train'] val_ID = dd.io.load(os.path.join(opt.MAT_dir, opt.splits))[fold]['X_test'] # ==========================================================================# # 1. Network Initialization # # ==========================================================================# torch.manual_seed(opt.manual_seed) if opt.architecture == 'ResNet': kwargs = { 'inchn': opt.win_size, 'sample_size': opt.sample_size, 'sample_duration': opt.sample_duration, 'num_classes': opt.n_classes } model = resnet10(**kwargs).to(device) elif opt.architecture == 'NC3D': model = MyNet(opt.win_size, opt.nb_filter, opt.batch_size).to(device) elif opt.architecture == 'CRNN': model = CNN_LSTM(opt.win_size, opt.nb_filter, opt.batch_size, opt.sample_size, opt.sample_duration, opt.rep).to(device) else: print('Architecture is not available.') raise LookupError print(model) model_parameters = filter(lambda p: p.requires_grad, model.parameters()) num_params = sum([np.prod(p.size()) for p in model_parameters]) print('number of trainable parameters:', num_params) device = torch.device( 'cuda' if torch.cuda.is_available() else 'cpu') class_weights = torch.FloatTensor(opt.weights).to(device) criterion = nn.CrossEntropyLoss(weight=class_weights) criterion.to(device) # ==========================================================================# # 2. Setup Dataloading Paramters # # ==========================================================================# '''load subjects ID''' ID = csv['SUB_ID'].values win_size = opt.win_size # num of channel input T = opt.sample_duration # total length of fMRI num_rep = T // win_size # num of repeat the ID # ==========================================================================# # 3. 
Training and Validation # # ==========================================================================# if opt.architecture == 'ResNet': training_data = fMRIDataset(opt.datadir, win_size, train_ID, T, csv) elif opt.architecture == 'NC3D': training_data = fMRIDataset_2C(opt.datadir, train_ID) elif opt.architecture == 'CRNN': training_data = fMRIDataset_CRNN(opt.datadir, win_size, train_ID, T, csv) else: print('Architecture is not available.') raise LookupError train_loader = torch.utils.data.DataLoader( training_data, batch_size=opt.batch_size, shuffle=True, num_workers=opt.n_threads, pin_memory=False) log_path = os.path.join(opt.result_path, str(fold)) if not os.path.exists(log_path): os.mkdir(log_path) train_logger = Logger(os.path.join(log_path, 'train.log'), ['epoch', 'loss', 'acc', 'lr']) train_batch_logger = Logger( os.path.join(log_path, 'train_batch.log'), ['epoch', 'batch', 'iter', 'loss', 'acc', 'lr']) '''optimization''' if opt.nesterov: dampening = 0 else: dampening = opt.dampening if opt.optimizer == 'sgd': optimizer = optim.SGD(model.parameters(), lr=opt.learning_rate, momentum=opt.momentum, dampening=dampening, weight_decay=opt.weight_decay, nesterov=opt.nesterov) elif opt.optimizer == 'adam': optimizer = optim.Adam(model.parameters(), lr=opt.learning_rate, weight_decay=opt.weight_decay) elif opt.optimizer == 'adadelta': optimizer = optim.Adadelta(model.parameters(), lr=opt.learning_rate, weight_decay=opt.weight_decay) scheduler = lr_scheduler.ReduceLROnPlateau( optimizer, 'min', patience=opt.lr_patience) scheduler = lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.2) if not opt.no_val: if opt.architecture == 'ResNet': validation_data = fMRIDataset(opt.datadir, win_size, val_ID, T, csv) elif opt.architecture == 'NC3D': validation_data = fMRIDataset_2C(opt.datadir, val_ID) elif opt.architecture == 'CRNN': validation_data = fMRIDataset_CRNN(opt.datadir, win_size, val_ID, T, csv) val_loader = torch.utils.data.DataLoader( validation_data, batch_size=opt.n_val_samples, shuffle=False, num_workers=opt.n_threads, pin_memory=False) val_logger = Logger(os.path.join(log_path, 'val.log'), ['epoch', 'loss', 'acc']) if opt.resume_path: print('loading checkpoint {}'.format(opt.resume_path)) checkpoint = torch.load(opt.resume_path) # assert opt.arch == checkpoint['arch'] opt.begin_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) if not opt.no_train: optimizer.load_state_dict(checkpoint['optimizer']) print('run') best_loss = 1e4 for i in range(opt.begin_epoch, opt.n_epochs + 1): if not opt.no_train: train_epoch(i, train_loader, model, criterion, optimizer, opt, log_path, train_logger, train_batch_logger) if not opt.no_val: validation_loss = val_epoch(i, val_loader, model, criterion, opt, val_logger) if validation_loss < best_loss: best_loss = validation_loss best_model_wts = copy.deepcopy(model.state_dict()) torch.save( best_model_wts, os.path.join(log_path, str(fold) + '_best.pth')) if not opt.no_train and not opt.no_val: #scheduler.step(validation_loss) scheduler.step() model_wts = copy.deepcopy(model.state_dict()) torch.save( model_wts, os.path.join(log_path, str(fold) + '_epoch_' + str(i) + '.pth')) # =========================================================================# # 4. 
Testing # # =========================================================================# if opt.test: model = MyNet(opt.win_size, opt.nb_filter, opt.batch_size).to(device) model.load_state_dict( torch.load(os.path.join(log_path, str(fold) + '_best.pth'))) test_details_logger = Logger( os.path.join(log_path, 'test_details.log'), ['sub_id', 'pos', 'neg']) test_logger = Logger(os.path.join(log_path, 'test.log'), [ 'fold', 'real_Y', 'pred_Y', 'acc', 'sen', 'spec', 'ppv', 'npv' ]) real_Y = [] pred_Y = [] model.eval() if opt.no_val: if opt.architecture == 'ResNet': validation_data = fMRIDataset(opt.datadir, win_size, val_ID, T, csv) elif opt.architecture == 'NC3D': validation_data = fMRIDataset_2C(opt.datadir, val_ID) elif opt.architecture == 'CRNN': validation_data = fMRIDataset_CRNN( opt.datadir, win_size, val_ID, T, csv) test_loader = torch.utils.data.DataLoader( validation_data, batch_size=146 + 1 - opt.s_sz, shuffle=False, num_workers=opt.n_threads, pin_memory=False) with torch.no_grad(): for i, (inputs, targets) in enumerate(test_loader): real_Y.append(targets[0]) inputs, targets = inputs.to(device), targets.to(device) inputs = Variable(inputs).float() targets = Variable(targets).long() outputs = model(inputs) rest = np.argmax(outputs.detach().cpu().numpy(), axis=1) pos = np.sum(rest == targets.detach().cpu().numpy()) neg = len(rest) - pos print('pos:', pos, ' and neg:', neg) test_details_logger.log({ 'sub_id': val_ID[i * 142], 'pos': pos, 'neg': neg }) if np.sum(rest == 1) >= np.sum(rest == 0): pred_Y.append(1) else: pred_Y.append(0) TP, FP, TN, FN = perf_measure(real_Y, pred_Y) acc = (TP + TN) / (TP + TN + FP + FN) sen = TP / (TP + FN) spec = TN / (TN + FP) ppv = TP / (TP + FP) npv = TN / (TN + FN) test_logger.log({ 'fold': fold, 'real_Y': real_Y, 'pred_Y': pred_Y, 'acc': acc, 'sen': sen, 'spec': spec, 'ppv': ppv, 'npv': npv }) else: fold = opt.fold train_ID = dd.io.load(os.path.join(opt.MAT_dir, opt.splits))[fold]['X_train'] val_ID = dd.io.load(os.path.join(opt.MAT_dir, opt.splits))[fold]['X_test'] # ==========================================================================# # 1. Network Initialization # # ==========================================================================# torch.manual_seed(opt.manual_seed) if opt.architecture == 'ResNet': kwargs = { 'inchn': opt.win_size, 'sample_size': opt.sample_size, 'sample_duration': opt.sample_duration, 'num_classes': opt.n_classes } model = resnet10(**kwargs).to(device) elif opt.architecture == 'NC3D': model = MyNet(opt.win_size, opt.nb_filter, opt.batch_size).to(device) elif opt.architecture == 'CRNN': model = CNN_LSTM(opt.win_size, opt.nb_filter, opt.batch_size, opt.s_sz, opt.sample_duration, opt.rep).to(device) else: print('Architecture is not available.') raise LookupError print(model) model_parameters = filter(lambda p: p.requires_grad, model.parameters()) num_params = sum([np.prod(p.size()) for p in model_parameters]) print('number of trainable parameters:', num_params) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') class_weights = torch.FloatTensor(opt.weights).to(device) criterion = nn.CrossEntropyLoss(weight=class_weights) criterion.to(device) # ==========================================================================# # 2. 
Setup Dataloading Paramters # # ==========================================================================# '''load subjects ID''' win_size = opt.win_size # num of channel input T = opt.sample_duration # total length of fMRI # ==========================================================================# # 3. Training and Validation # # ==========================================================================# # repeat the ID, in order to visit all the volumes in fMRI, this will be input to the dataloader if opt.architecture == 'ResNet': training_data = fMRIDataset(opt.datadir, opt.s_sz, train_ID, T, csv, opt.rep) elif opt.architecture == 'NC3D': training_data = fMRIDataset_2C(opt.datadir, train_ID) elif opt.architecture == 'CRNN': training_data = fMRIDataset_CRNN(opt.datadir, opt.s_sz, train_ID, T, csv, opt.rep) train_loader = torch.utils.data.DataLoader( training_data, batch_size=opt.batch_size, shuffle=True, #num_workers=opt.n_threads, pin_memory=True) log_path = opt.result_path print('log_path', log_path) if not os.path.exists(log_path): os.mkdir(log_path) train_logger = Logger(os.path.join(log_path, 'train.log'), ['epoch', 'loss', 'acc', 'lr']) train_batch_logger = Logger( os.path.join(log_path, 'train_batch.log'), ['epoch', 'batch', 'iter', 'loss', 'acc', 'lr']) '''optimization''' if opt.nesterov: dampening = 0 else: dampening = opt.dampening if opt.optimizer == 'sgd': optimizer = optim.SGD(model.parameters(), lr=opt.learning_rate, momentum=opt.momentum, dampening=dampening, weight_decay=opt.weight_decay, nesterov=opt.nesterov) elif opt.optimizer == 'adam': optimizer = optim.Adam(model.parameters(), lr=opt.learning_rate, weight_decay=opt.weight_decay) elif opt.optimizer == 'adadelta': optimizer = optim.Adadelta(model.parameters(), lr=opt.learning_rate, weight_decay=opt.weight_decay) # scheduler = lr_scheduler.ReduceLROnPlateau( # optimizer, 'min', patience=opt.lr_patience) scheduler = lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.1) if not opt.no_val: if opt.architecture == 'ResNet': validation_data = fMRIDataset(opt.datadir, opt.s_sz, val_ID, T, csv, opt.rep) elif opt.architecture == 'NC3D': validation_data = fMRIDataset_2C(opt.datadir, val_ID) elif opt.architecture == 'CRNN': validation_data = fMRIDataset_CRNN(opt.datadir, opt.s_sz, val_ID, T, csv, opt.rep) val_loader = torch.utils.data.DataLoader( validation_data, batch_size=opt.n_val_samples, shuffle=False, #num_workers=opt.n_threads, pin_memory=True) val_logger = Logger(os.path.join(log_path, 'val.log'), ['epoch', 'loss', 'acc']) if opt.resume_path: print('loading checkpoint {}'.format(opt.resume_path)) checkpoint = torch.load(opt.resume_path) # assert opt.arch == checkpoint['arch'] opt.begin_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) if not opt.no_train: optimizer.load_state_dict(checkpoint['optimizer']) print('run') for i in range(opt.begin_epoch, opt.n_epochs + 1): if not opt.no_train: train_epoch(i, train_loader, model, criterion, optimizer, opt, log_path, train_logger, train_batch_logger) if not opt.no_val: # when epoch is greater then 5, we start to do validation validation_loss = val_epoch(i, val_loader, model, criterion, opt, val_logger) if not opt.no_train and not opt.no_val: scheduler.step(validation_loss) # =========================================================================# # 4. 
Testing # # =========================================================================# if opt.test: test_details_logger = Logger( os.path.join(opt.result_path, 'test_details.log'), ['sub_id', 'pos', 'neg']) test_logger = Logger(os.path.join(opt.result_path, 'test.log'), [ 'fold', 'real_Y', 'pred_Y', 'acc', 'sen', 'spec', 'ppv', 'npv' ]) real_Y = [] pred_Y = [] model.eval() test_loader = torch.utils.data.DataLoader( validation_data, batch_size=142, shuffle=False, num_workers=opt.n_threads, pin_memory=False) with torch.no_grad(): for i, (inputs, targets) in enumerate(test_loader): real_Y.append(targets) inputs, targets = inputs.to(device), targets.to(device) inputs = Variable(inputs).float() targets = Variable(targets).long() outputs = model(inputs) rest = np.argmax(outputs.detach().cpu().numpy(), axis=1) pred_Y.append(outputs.detach().cpu().numpy()) pos = np.sum(rest == targets.detach().cpu().numpy()) neg = len(rest) - pos #print('pos:', pos, ' and neg:', neg) test_details_logger.log({ 'sub_id': val_ID[i * 142], 'pos': pos, 'neg': neg }) TP, FP, TN, FN = perf_measure(real_Y, pred_Y) acc = (TP + TN) / (TP + TN + FP + FN) sen = TP / (TP + FN) spec = TN / (TN + FP) ppv = TP / (TP + FP) npv = TN / (TN + FN) test_logger.log({ 'fold': fold, 'real_Y': real_Y, 'pred_Y': pred_Y, 'acc': acc, 'sen': sen, 'spec': spec, 'ppv': ppv, 'npv': npv })
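Both test blocks unpack `perf_measure(real_Y, pred_Y)` into TP, FP, TN, FN before computing accuracy, sensitivity, specificity, PPV, and NPV. The helper itself is not shown; a minimal sketch for binary labels, assuming it returns the counts in that order:

def perf_measure(y_true, y_pred):
    # Confusion-matrix counts for binary labels (1 = positive class),
    # returned in the order the callers above unpack: TP, FP, TN, FN.
    TP = FP = TN = FN = 0
    for t, p in zip(y_true, y_pred):
        if p == 1 and t == 1:
            TP += 1
        elif p == 1 and t == 0:
            FP += 1
        elif p == 0 and t == 0:
            TN += 1
        else:
            FN += 1
    return TP, FP, TN, FN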
sample_duration=sample_duration, attention=attention, num_classes=num_classes).to(device) # model = r2plus1d_18(pretrained=True, num_classes=num_classes).to(device) # Run the model parallelly if torch.cuda.device_count() > 1: logger.info("Using {} GPUs".format(torch.cuda.device_count())) model = nn.DataParallel(model) # Create loss criterion & optimizer criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters(), lr=learning_rate) # Start training logger.info("Training Started".center(60, '#')) for epoch in range(epochs): # Train the model train_epoch(model, criterion, optimizer, train_loader, device, epoch, logger, log_interval, writer) # Validate the model val_epoch(model, criterion, val_loader, device, epoch, logger, writer) # Save model torch.save( model.state_dict(), os.path.join(model_path, "slr_cnn3d_epoch{:03d}.pth".format(epoch + 1))) logger.info("Epoch {} Model Saved".format(epoch + 1).center(60, '#')) logger.info("Training Finished".center(60, '#'))
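Each epoch above writes a bare `state_dict` to `slr_cnn3d_epoch{...}.pth`. A minimal sketch of how such a file could be loaded back, allowing for the `module.` prefix that `nn.DataParallel` adds (an assumption about how the weights were written, not code from the project):

import torch
from torch import nn

def load_epoch_checkpoint(model: nn.Module, checkpoint_path: str) -> nn.Module:
    # The files saved above contain only model weights (no optimizer/epoch).
    state_dict = torch.load(checkpoint_path, map_location="cpu")
    # If training ran under nn.DataParallel, keys are prefixed with "module.";
    # strip it so a bare model accepts them.
    state_dict = {k.replace("module.", "", 1): v for k, v in state_dict.items()}
    model.load_state_dict(state_dict)
    return model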
def main_worker(index, opt): random.seed(opt.manual_seed) np.random.seed(opt.manual_seed) torch.manual_seed(opt.manual_seed) if index >= 0 and opt.device.type == 'cuda': opt.device = torch.device(f'cuda:{index}') if opt.distributed: opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index dist.init_process_group(backend='nccl', init_method=opt.dist_url, world_size=opt.world_size, rank=opt.dist_rank) opt.batch_size = int(opt.batch_size / opt.ngpus_per_node) # opt.n_threads = int( # (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node) opt.is_master_node = not opt.distributed or opt.dist_rank == 0 model = genarate_model(opt) if opt.batchnorm_sync: assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.' model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) if opt.distributed: model = make_data_parallel(model, opt.device) else: model.to(opt.device) # model = nn.DataParallel(model).cuda() print('Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0)) if opt.is_master_node: print(model) parameters = model.parameters() criterion = CrossEntropyLoss().to(opt.device) (train_loader, train_sampler, train_logger, train_batch_logger, optimizer, scheduler) = get_train_utils(opt, parameters) val_loader, val_logger = get_val_utils(opt) if opt.tensorboard and opt.is_master_node: from torch.utils.tensorboard import SummaryWriter if opt.begin_epoch == 1: tb_writer = SummaryWriter(log_dir=opt.result_path) else: tb_writer = SummaryWriter(log_dir=opt.result_path, purge_step=opt.begin_epoch) else: tb_writer = None print('Data loading finished') for i in range(opt.begin_epoch, opt.n_epochs + 1): if not opt.no_train: if opt.distributed: train_sampler.set_epoch(i) # train_sampler2.set_epoch(i) current_lr = get_lr(optimizer) train_epoch(i, train_loader, model, criterion, optimizer, opt.device, current_lr, train_logger, train_batch_logger, opt.is_master_node, tb_writer, opt.distributed) if i % opt.checkpoint == 0 and opt.is_master_node: save_file_path = opt.result_path / 'save_{}.pth'.format(i) save_checkpoint(save_file_path, i, model, optimizer, scheduler) if not opt.no_val: prev_val_loss = val_epoch(i, val_loader, model, criterion, opt.device, val_logger, opt.is_master_node, tb_writer, opt.distributed) if not opt.no_train and opt.lr_scheduler == 'multistep': scheduler.step() elif not opt.no_train and opt.lr_scheduler == 'plateau': scheduler.step(prev_val_loss)
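`get_lr` is called above to log the current learning rate but is not defined in this snippet; a minimal sketch, assuming all parameter groups share one schedule:

def get_lr(optimizer):
    # Report the learning rate of the first parameter group; adequate when
    # every group follows the same schedule, as in the loops above.
    for param_group in optimizer.param_groups:
        return param_group["lr"]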
def main_worker(index, opt): random.seed(opt.manual_seed) np.random.seed(opt.manual_seed) torch.manual_seed(opt.manual_seed) if index >= 0 and opt.device.type == 'cuda': opt.device = torch.device(f'cuda:{index}') if opt.distributed: opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index dist.init_process_group(backend='nccl', init_method=opt.dist_url, world_size=opt.world_size, rank=opt.dist_rank) opt.batch_size = int(opt.batch_size / opt.ngpus_per_node) opt.n_threads = int( (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node) opt.is_master_node = not opt.distributed or opt.dist_rank == 0 model = generate_model(opt) if opt.batchnorm_sync: assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.' model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) if opt.pretrain_path: model = load_pretrained_model(model, opt.pretrain_path, opt.model, opt.n_finetune_classes) if opt.resume_path is not None: model = resume_model(opt.resume_path, opt.arch, model) print('resume model from ', opt.resume_path) print('model after resume:', model) # save model to current running id # mlflow.pytorch.log_model(model, "action_model") # model_path = mlflow.get_artifact_uri("action_model") # print('mlflow action model path: ', model_path) # model = mlflow.pytorch.load_model(model_path) if opt.ml_tag_name != '' and opt.ml_tag_value != '': # mlflow.set_tag("test_tag", 'inference_test') mlflow.set_tag(opt.ml_tag_name, opt.ml_tag_value) # load from previous published model version if opt.ml_model_name != '' and opt.ml_model_version != '': # model_name = 'action_model' # model_version = '1' model_uri = "models:/{}/{}".format(opt.ml_model_name, opt.ml_model_version) model = mlflow.pytorch.load_model(model_uri) model = make_data_parallel(model, opt.distributed, opt.device) if opt.pretrain_path: parameters = get_fine_tuning_parameters(model, opt.ft_begin_module) else: parameters = model.parameters() if opt.is_master_node: print(model) criterion = CrossEntropyLoss().to(opt.device) if not opt.no_train: (train_loader, train_sampler, train_logger, train_batch_logger, optimizer, scheduler) = get_train_utils(opt, parameters) if opt.resume_path is not None: opt.begin_epoch, optimizer, scheduler = resume_train_utils( opt.resume_path, opt.begin_epoch, optimizer, scheduler) if opt.overwrite_milestones: scheduler.milestones = opt.multistep_milestones if not opt.no_val: val_loader, val_logger = get_val_utils(opt) if opt.tensorboard and opt.is_master_node: from torch.utils.tensorboard import SummaryWriter if opt.begin_epoch == 1: tb_writer = SummaryWriter(log_dir=opt.result_path) else: tb_writer = SummaryWriter(log_dir=opt.result_path, purge_step=opt.begin_epoch) else: tb_writer = None prev_val_loss = None for i in range(opt.begin_epoch, opt.n_epochs + 1): if not opt.no_train: if opt.distributed: train_sampler.set_epoch(i) current_lr = get_lr(optimizer) train_epoch(i, train_loader, model, criterion, optimizer, opt.device, current_lr, train_logger, train_batch_logger, tb_writer, opt.distributed) if i % opt.checkpoint == 0 and opt.is_master_node: save_file_path = opt.result_path / 'save_{}.pth'.format(i) save_checkpoint(save_file_path, i, opt.arch, model, optimizer, scheduler) if opt.ml_model_name != '': mlflow.pytorch.log_model(model, opt.ml_model_name) if not opt.no_val: prev_val_loss = val_epoch(i, val_loader, model, criterion, opt.device, val_logger, tb_writer, opt.distributed) mlflow.log_metric("loss", prev_val_loss) if not opt.no_train and opt.lr_scheduler == 'multistep': scheduler.step() elif 
not opt.no_train and opt.lr_scheduler == 'plateau': scheduler.step(prev_val_loss) if opt.inference: inference_loader, inference_class_names = get_inference_utils(opt) inference_result_path = opt.result_path / '{}.json'.format( opt.inference_subset) inference.inference(inference_loader, model, inference_result_path, inference_class_names, opt.inference_no_average, opt.output_topk)
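`save_checkpoint` bundles the training state written every `opt.checkpoint` epochs; a minimal sketch matching the six-argument call in the snippet above (the key names are assumptions, chosen to line up with what the resume code reads back):

import torch

def save_checkpoint(save_file_path, epoch, arch, model, optimizer, scheduler):
    # Unwrap DataParallel/DistributedDataParallel so the keys stay portable.
    if hasattr(model, "module"):
        model_state_dict = model.module.state_dict()
    else:
        model_state_dict = model.state_dict()
    save_states = {
        "epoch": epoch,
        "arch": arch,
        "state_dict": model_state_dict,
        "optimizer": optimizer.state_dict(),
        "scheduler": scheduler.state_dict(),
    }
    torch.save(save_states, save_file_path)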
def main(): opt = parse_opts() print(opt) seed = 0 random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False # CUDA for PyTorch use_cuda = torch.cuda.is_available() device = torch.device(f"cuda:{opt.gpu}" if use_cuda else "cpu") train_transform = transforms.Compose([ #transforms.RandomCrop(32, padding=3), transforms.Resize((opt.img_H, opt.img_W)), transforms.RandomHorizontalFlip(), transforms.RandomRotation(10), transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[ 0.229, 0.224, 0.225]) ]) test_transform = transforms.Compose([ #transforms.RandomCrop(32, padding=3), transforms.Resize((opt.img_H, opt.img_W)), transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[ 0.229, 0.224, 0.225]) ]) training_data = get_training_set(opt, train_transform) validation_data = get_validation_set(opt, test_transform) n_train_examples = int(len(training_data)*0.8) n_valid_examples = len(training_data) - n_train_examples # split data training_data, validation_data = torch.utils.data.random_split(training_data, [n_train_examples, n_valid_examples]) train_loader = torch.utils.data.DataLoader(training_data, batch_size=opt.batch_size, shuffle=True, num_workers=1) val_loader = torch.utils.data.DataLoader(validation_data, batch_size=opt.batch_size, shuffle=True, num_workers=1) print(f'Number of training examples: {len(train_loader.dataset)}') print(f'Number of validation examples: {len(val_loader.dataset)}') # tensorboard summary_writer = tensorboardX.SummaryWriter(log_dir='tf_logs') # define model model = resnet18(num_classes=opt.num_classes) # if torch.cuda.device_count() > 1: # print("Let's use", torch.cuda.device_count(), "GPUs!") # model = nn.DataParallel(model) model = model.to(device) if opt.nesterov: dampening = 0 else: dampening = opt.dampening #define optimizer and criterion # optimizer = optim.Adam(model.parameters()) # optimizer = optim.SGD( # model.parameters(), # lr=opt.learning_rate, # momentum=opt.momentum, # dampening=dampening, # weight_decay=opt.weight_decay, # nesterov=opt.nesterov) # scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=opt.lr_patience) # criterion = nn.CrossEntropyLoss() # define optimizer and criterion optimizer = optim.Adam(model.parameters()) # loss function criterion = BCEWithLogitsLoss() # resume model, optimizer if already exists if opt.resume_path: checkpoint = torch.load(opt.resume_path) model.load_state_dict(checkpoint['model_state_dict']) optimizer.load_state_dict(checkpoint['optimizer_state_dict']) epoch = checkpoint['epoch'] print("Model Restored from Epoch {}".format(epoch)) start_epoch = epoch + 1 else: start_epoch = 1 # start training #th = 10000 for epoch in range(start_epoch, opt.epochs+1): val_loss, val_mAP = val_epoch(model, val_loader, criterion, device, opt)
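The resume branch above expects `epoch`, `model_state_dict`, and `optimizer_state_dict` keys, but the code that writes them is not shown here; a minimal sketch of a matching save call, with an assumed output path:

import torch

def save_resume_checkpoint(model, optimizer, epoch, path="snapshots/last.pth"):
    # Keys mirror what the resume branch above reads back; the path is a
    # placeholder, not the project's actual snapshot location.
    torch.save({
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
    }, path)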
print('loading checkpoint {}'.format(opt.resume_path)) checkpoint = torch.load(opt.resume_path) assert opt.arch == checkpoint['arch'] opt.begin_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) if not opt.no_train: optimizer.load_state_dict(checkpoint['optimizer']) print('run') for i in range(opt.begin_epoch, opt.n_epochs + 1): if not opt.no_train: train_epoch(i, train_loader, model, criterion, optimizer, opt, train_logger, train_batch_logger) if not opt.no_val: validation_loss = val_epoch(i, val_loader, model, criterion, opt, val_logger) if not opt.no_train and not opt.no_val: scheduler.step(validation_loss) if opt.test: spatial_transform = Compose([ Scale(int(opt.sample_size / opt.scale_in_test)), CornerCrop(opt.sample_size, opt.crop_position_in_test), ToTensor(opt.norm_value), norm_method ]) temporal_transform = LoopPadding(opt.sample_duration) target_transform = VideoID() test_data = get_test_set(opt, spatial_transform, temporal_transform, target_transform)
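The following fragment resumes mid-way through a dict comprehension over `checkpoint['state_dict'].items()`; the cut-off key expression cannot be recovered from the text, but the usual shape of such a remap (here stripping a `module.` prefix purely as an illustration) looks like this:

import torch

def remap_state_dict_keys(checkpoint_path):
    # Illustrative only: stripping nn.DataParallel's "module." prefix is one
    # common reason to rebuild a checkpoint's state_dict key by key.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    return {k.replace("module.", "", 1): v
            for k, v in checkpoint["state_dict"].items()}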
for k, v in checkpoint['state_dict'].items() } model.load_state_dict(checkpoint_dict) if not opt.no_train: optimizer.load_state_dict(checkpoint['optimizer']) print('run') for i in range(opt.begin_epoch, opt.n_epochs + 1): if not opt.no_train: train_epoch(i, train_loader, model, [sourceImagenet, sourcePlaces], criterion, optimizer, opt, train_logger, train_batch_logger) if not opt.no_val: validation_loss = val_epoch(i, val_loader, model, [sourceImagenet, sourcePlaces], criterion, opt, val_logger) if not opt.no_train and not opt.no_val: scheduler.step(validation_loss) if opt.test: spatial_transform = Compose([ Scale(int(opt.sample_size / opt.scale_in_test)), CornerCrop(opt.sample_size, opt.crop_position_in_test), ToTensor(opt.norm_value), norm_method ]) temporal_transform = LoopPadding(opt.sample_duration) target_transform = VideoID() test_data = get_test_set(opt, spatial_transform, temporal_transform,