def train(settings=None):
    """Main script for training the semi-supervised GAN.

    Builds the labeled/unlabeled training loader and the validation loader,
    optionally restores the discriminator and generator trainers when
    ``settings.load_model_path`` is set, and then alternates discriminator and
    generator updates until ``settings.number_of_epochs`` epochs have run.
    Scalars and images are written to a training and a validation summary
    writer inside a per-trial directory, which is also handed to
    ``save_trainer`` periodically and once more when training ends.

    :param settings: Run configuration. A default ``Settings()`` is created
        when the argument is omitted or falsy.
    :return: The trial directory created for this run.
    """
    if not settings:
        settings = Settings()

    # Data pipelines. Validation skips the random horizontal flip.
    train_transform = torchvision.transforms.Compose([
        transforms.RandomlySelectPatchAndRescale(),
        transforms.RandomHorizontalFlip(),
        transforms.NegativeOneToOneNormalizeImage(),
        transforms.NumpyArraysToTorchTensors(),
    ])
    validation_transform = torchvision.transforms.Compose([
        transforms.RandomlySelectPatchAndRescale(),
        transforms.NegativeOneToOneNormalizeImage(),
        transforms.NumpyArraysToTorchTensors(),
    ])

    train_dataset = CrowdDatasetWithUnlabeled(settings.train_dataset_path, 'train',
                                              transform=train_transform)
    train_dataset_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=settings.batch_size,
        shuffle=True,
        num_workers=settings.number_of_data_loader_workers)
    validation_dataset = CrowdDataset(settings.validation_dataset_path, 'validation',
                                      transform=validation_transform)
    validation_dataset_loader = torch.utils.data.DataLoader(
        validation_dataset,
        batch_size=settings.batch_size,
        shuffle=False,
        num_workers=settings.number_of_data_loader_workers)

    # Networks and optimizers.
    gan = GAN()
    gpu(gan)
    D = gan.D
    G = gan.G
    discriminator_optimizer = Adam(D.parameters())
    generator_optimizer = Adam(G.parameters())

    step = 0
    epoch = 0
    if settings.load_model_path:
        # The epoch/step counters are taken from the discriminator trainer;
        # the generator's copies are ignored.
        d_model_state_dict, d_optimizer_state_dict, epoch, step = load_trainer(
            prefix='discriminator', settings=settings)
        D.load_state_dict(d_model_state_dict)
        discriminator_optimizer.load_state_dict(d_optimizer_state_dict)
        # Restored optimizer state is overridden with the current settings.
        discriminator_optimizer.param_groups[0].update(
            {'lr': settings.learning_rate, 'weight_decay': settings.weight_decay})

        g_model_state_dict, g_optimizer_state_dict, _, _ = load_trainer(
            prefix='generator', settings=settings)
        G.load_state_dict(g_model_state_dict)
        generator_optimizer.load_state_dict(g_optimizer_state_dict)
        generator_optimizer.param_groups[0].update({'lr': settings.learning_rate})

    running_scalars = defaultdict(float)
    validation_running_scalars = defaultdict(float)
    running_example_count = 0

    # Logging destinations.
    datetime_string = datetime.datetime.now().strftime("y%Ym%md%dh%Hm%Ms%S")
    trial_directory = os.path.join(settings.log_directory,
                                   settings.trial_name + ' ' + datetime_string)
    os.makedirs(trial_directory, exist_ok=True)
    summary_writer = SummaryWriter(os.path.join(trial_directory, 'train'))
    validation_summary_writer = SummaryWriter(os.path.join(trial_directory, 'validation'))

    print('Starting training...')
    step_time_start = datetime.datetime.now()
    while epoch < settings.number_of_epochs:
        for examples, unlabeled_examples in train_dataset_loader:
            unlabeled_images = unlabeled_examples[0]

            # Real image discriminator processing.
            discriminator_optimizer.zero_grad()
            images, labels, _ = examples
            images, labels = Variable(gpu(images)), Variable(gpu(labels))
            batch_len = images.data.shape[0]
            predicted_labels, predicted_counts = D(images)
            real_feature_layer = D.feature_layer
            true_counts = labels.sum(1).sum(1)
            count_error = predicted_counts - true_counts
            density_loss = torch.abs(predicted_labels - labels).pow(
                settings.loss_order).sum(1).sum(1).mean()
            count_loss = torch.abs(count_error).pow(settings.loss_order).mean()
            loss = count_loss + (density_loss * 10)
            loss.backward()
            running_scalars['Labeled/Loss'] += loss.data[0]
            running_scalars['Labeled/Count Loss'] += count_loss.data[0]
            running_scalars['Labeled/Density Loss'] += density_loss.data[0]
            running_scalars['Labeled/Count ME'] += count_error.mean().data[0]

            # Unlabeled.
            _ = D(gpu(images))
            labeled_feature_layer = D.feature_layer
            _ = D(gpu(Variable(unlabeled_images)))
            unlabeled_feature_layer = D.feature_layer
            unlabeled_loss = feature_distance_loss(
                unlabeled_feature_layer, labeled_feature_layer,
                scale=False) * settings.unlabeled_loss_multiplier
            unlabeled_loss.backward()

            # Fake.
            _ = D(gpu(Variable(unlabeled_images)))
            unlabeled_feature_layer = D.feature_layer
            mixture = MixtureModel([norm(-settings.mean_offset, 1),
                                    norm(settings.mean_offset, 1)])
            z = torch.from_numpy(mixture.rvs(size=[batch_len, 100]).astype(np.float32))
            fake_examples = G(gpu(Variable(z)))
            # The generator output is detached so this pass only trains D.
            _ = D(fake_examples.detach())
            fake_feature_layer = D.feature_layer
            fake_loss = feature_distance_loss(
                unlabeled_feature_layer, fake_feature_layer,
                order=1).neg() * settings.fake_loss_multiplier
            fake_loss.backward()

            # Feature norm loss.
            _ = D(gpu(Variable(unlabeled_images)))
            unlabeled_feature_layer = D.feature_layer
            feature_norm_loss = (unlabeled_feature_layer.norm(dim=1).mean() - 1).pow(2)
            feature_norm_loss.backward()

            # Gradient penalty.
            if settings.gradient_penalty_on:
                alpha = gpu(Variable(torch.rand(2)))
                alpha = alpha / alpha.sum(0)
                real_part = alpha[0] * gpu(Variable(unlabeled_images, requires_grad=True))
                fake_part = alpha[1] * gpu(Variable(fake_examples.detach().data,
                                                    requires_grad=True))
                interpolates = (real_part + fake_part)
                _ = D(interpolates)
                interpolates_predictions = D.feature_layer
                gradients = torch.autograd.grad(
                    outputs=interpolates_predictions,
                    inputs=interpolates,
                    grad_outputs=gpu(torch.ones(interpolates_predictions.size())),
                    create_graph=True,
                    only_inputs=True)[0]
                gradient_penalty = (((gradients.norm(dim=1) - 1) ** 2).mean()
                                    * settings.gradient_penalty_multiplier)
                gradient_penalty.backward()

            # Discriminator update.
            discriminator_optimizer.step()

            # Generator.
            # NOTE(review): `step % 1` is always 0 for an integer step, so the
            # generator is updated on every iteration.
            if step % 1 == 0:
                generator_optimizer.zero_grad()
                _ = D(gpu(Variable(unlabeled_images)))
                unlabeled_feature_layer = D.feature_layer.detach()
                z = torch.randn(batch_len, 100)
                fake_examples = G(gpu(Variable(z)))
                _ = D(fake_examples)
                fake_feature_layer = D.feature_layer
                generator_loss = feature_distance_loss(unlabeled_feature_layer,
                                                       fake_feature_layer)
                generator_loss.backward()
                generator_optimizer.step()

            running_example_count += images.size()[0]

            if step % settings.summary_step_period == 0 and step != 0:
                # Training summaries.
                comparison_image = viewer.create_crowd_images_comparison_grid(
                    cpu(images), cpu(labels), cpu(predicted_labels))
                summary_writer.add_image('Comparison', comparison_image, global_step=step)
                fake_images_image = torchvision.utils.make_grid(fake_examples.data[:9], nrow=3)
                summary_writer.add_image('Fake', fake_images_image, global_step=step)
                print('\rStep {}, {}...'.format(step, datetime.datetime.now() - step_time_start),
                      end='')
                step_time_start = datetime.datetime.now()
                # NOTE(review): the accumulated values are per-batch means but
                # are divided by an example count here (and by the dataset
                # length for validation below) — confirm the intended scale.
                for name, running_scalar in running_scalars.items():
                    mean_scalar = running_scalar / running_example_count
                    summary_writer.add_scalar(name, mean_scalar, global_step=step)
                    running_scalars[name] = 0
                running_example_count = 0

                # Validation summaries. `images`, `labels` and
                # `predicted_labels` are rebound to the validation batches.
                for validation_examples in validation_dataset_loader:
                    images, labels, _ = validation_examples
                    images, labels = Variable(gpu(images)), Variable(gpu(labels))
                    predicted_labels, predicted_counts = D(images)
                    true_counts = labels.sum(1).sum(1)
                    count_error = predicted_counts - true_counts
                    density_loss = torch.abs(predicted_labels - labels).pow(
                        settings.loss_order).sum(1).sum(1).mean()
                    count_loss = torch.abs(count_error).pow(settings.loss_order).mean()
                    count_mae = torch.abs(count_error).mean()
                    count_me = count_error.mean()
                    validation_running_scalars['Labeled/Density Loss'] += density_loss.data[0]
                    validation_running_scalars['Labeled/Count Loss'] += count_loss.data[0]
                    validation_running_scalars['Test/Count MAE'] += count_mae.data[0]
                    validation_running_scalars['Labeled/Count ME'] += count_me.data[0]
                comparison_image = viewer.create_crowd_images_comparison_grid(
                    cpu(images), cpu(labels), cpu(predicted_labels))
                validation_summary_writer.add_image('Comparison', comparison_image,
                                                    global_step=step)
                for name, running_scalar in validation_running_scalars.items():
                    mean_scalar = running_scalar / len(validation_dataset)
                    validation_summary_writer.add_scalar(name, mean_scalar, global_step=step)
                    validation_running_scalars[name] = 0

            step += 1

        epoch += 1
        if epoch != 0 and epoch % settings.save_epoch_period == 0:
            save_trainer(trial_directory, D, discriminator_optimizer, epoch, step,
                         prefix='discriminator')
            save_trainer(trial_directory, G, generator_optimizer, epoch, step,
                         prefix='generator')

    # Final save, regardless of the periodic save schedule.
    save_trainer(trial_directory, D, discriminator_optimizer, epoch, step,
                 prefix='discriminator')
    save_trainer(trial_directory, G, generator_optimizer, epoch, step, prefix='generator')
    print('Finished Training')
    return trial_directory
def main(parser_data):
    """Train and evaluate a detection model on a VOC-style dataset.

    A checkpoint holding the model, optimizer, scheduler and epoch is written
    to ``./save_weights`` after every epoch. Loss, learning-rate and mAP
    curves are plotted once training has finished.

    Args:
        parser_data: Parsed command-line options. The attributes read are
            ``device``, ``data_path``, ``batch_size``, ``resume``,
            ``start_epoch`` and ``epochs``. ``start_epoch`` is overwritten
            when a checkpoint is resumed.

    Raises:
        FileNotFoundError: If ``<data_path>/VOCdevkit`` does not exist.
    """
    device = torch.device(parser_data.device if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    # torch.save does not create missing directories, so make sure the
    # checkpoint folder exists before the first epoch finishes.
    os.makedirs("save_weights", exist_ok=True)

    data_transform = {
        "train": transforms.Compose([transforms.ToTensor(),
                                     transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    VOC_root = parser_data.data_path
    # check voc root
    if not os.path.exists(os.path.join(VOC_root, "VOCdevkit")):
        raise FileNotFoundError("VOCdevkit dose not in path:'{}'.".format(VOC_root))

    # load train data set
    train_data_set = VOC2012DataSet(VOC_root, data_transform["train"], True)

    # Note: collate_fn is custom because every sample consists of an image
    # and its targets, so the default batching cannot be used.
    batch_size = parser_data.batch_size
    # os.cpu_count() may return None; fall back to a single CPU in that case.
    nw = min([os.cpu_count() or 1, batch_size if batch_size > 1 else 0, 8])  # number of workers
    print('Using %g dataloader workers' % nw)
    train_data_loader = torch.utils.data.DataLoader(train_data_set,
                                                    batch_size=batch_size,
                                                    shuffle=True,
                                                    num_workers=nw,
                                                    collate_fn=train_data_set.collate_fn)

    # load validation data set
    val_data_set = VOC2012DataSet(VOC_root, data_transform["val"], False)
    val_data_set_loader = torch.utils.data.DataLoader(val_data_set,
                                                      batch_size=batch_size,
                                                      shuffle=False,
                                                      num_workers=nw,
                                                      collate_fn=val_data_set.collate_fn)

    # create model
    # NOTE(review): the original comment described num_classes as
    # "background + 20 classes", but 5 is passed — confirm against the dataset.
    model = create_model(num_classes=5)
    model.to(device)

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005,
                                momentum=0.9, weight_decay=0.0005)

    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=2,
                                                   gamma=0.5)

    # If a checkpoint from a previous run is given, continue from it.
    if parser_data.resume != "":
        # map_location keeps a GPU-saved checkpoint loadable when this run
        # has fallen back to the CPU.
        checkpoint = torch.load(parser_data.resume, map_location=device)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        parser_data.start_epoch = checkpoint['epoch'] + 1
        print("the training process from epoch{}...".format(parser_data.start_epoch))

    train_loss = []
    learning_rate = []
    val_mAP = []

    for epoch in range(parser_data.start_epoch, parser_data.epochs):
        # train for one epoch, printing every 50 iterations
        utils.train_one_epoch(model, optimizer, train_data_loader,
                              device, epoch, train_loss=train_loss, train_lr=learning_rate,
                              print_freq=50, warmup=True)
        # update the learning rate
        lr_scheduler.step()

        # evaluate on the test dataset
        utils.evaluate(model, val_data_set_loader, device=device, mAP_list=val_mAP)

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch}
        torch.save(save_files, "./save_weights/resNetFpn-model-{}.pth".format(epoch))

    # plot loss and lr curve
    if len(train_loss) != 0 and len(learning_rate) != 0:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if len(val_mAP) != 0:
        from plot_curve import plot_map
        plot_map(val_mAP)
def main():
    """Train a detection model on VOC in two stages and plot the curves.

    Stage one freezes the whole backbone and trains the remaining parameters
    for 5 epochs, then saves ``./save_weights/pretrain.pth``. Stage two
    unfreezes every backbone layer except those whose parameter names start
    with "0" to "3", trains for 20 epochs with a step learning-rate schedule,
    and writes a full checkpoint for every epoch above 10.

    Raises:
        FileNotFoundError: If ``./VOCdevkit`` does not exist.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    # Create the weights directory if it does not exist yet.
    os.makedirs("save_weights", exist_ok=True)

    data_transform = {
        "train": transforms.Compose([transforms.ToTensor(),
                                     transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    VOC_root = "./"
    # check voc root
    if not os.path.exists(os.path.join(VOC_root, "VOCdevkit")):
        raise FileNotFoundError("VOCdevkit dose not in path:'{}'.".format(VOC_root))

    # load train data set
    train_data_set = VOC2012DataSet(VOC_root, data_transform["train"], True)
    # Note: collate_fn is custom because every sample consists of an image
    # and its targets, so the default batching cannot be used.
    batch_size = 8
    # os.cpu_count() may return None; fall back to a single CPU in that case.
    nw = min([os.cpu_count() or 1, batch_size if batch_size > 1 else 0, 8])  # number of workers
    print('Using %g dataloader workers' % nw)
    train_data_loader = torch.utils.data.DataLoader(train_data_set,
                                                    batch_size=batch_size,
                                                    shuffle=True,
                                                    num_workers=nw,
                                                    collate_fn=train_data_set.collate_fn)

    # load validation data set
    val_data_set = VOC2012DataSet(VOC_root, data_transform["val"], False)
    val_data_set_loader = torch.utils.data.DataLoader(val_data_set,
                                                      batch_size=batch_size,
                                                      shuffle=False,
                                                      num_workers=nw,
                                                      collate_fn=val_data_set.collate_fn)

    # create model, num_classes equals background + 20 classes
    model = create_model(num_classes=21)
    model.to(device)

    train_loss = []
    learning_rate = []
    val_mAP = []

    # ------------------------------------------------------------------ #
    # Stage 1: freeze the feature-extraction backbone and train the RPN
    # and the final prediction heads for 5 epochs.
    # ------------------------------------------------------------------ #
    for param in model.backbone.parameters():
        param.requires_grad = False

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005,
                                momentum=0.9, weight_decay=0.0005)

    num_epochs = 5
    for epoch in range(num_epochs):
        # train for one epoch, printing every 50 iterations
        utils.train_one_epoch(model, optimizer, train_data_loader,
                              device, epoch, print_freq=50,
                              train_loss=train_loss, train_lr=learning_rate)

        # evaluate on the test dataset
        utils.evaluate(model, val_data_set_loader, device=device, mAP_list=val_mAP)

    torch.save(model.state_dict(), "./save_weights/pretrain.pth")

    # ------------------------------------------------------------------ #
    # Stage 2: unfreeze the backbone and continue training the network.
    # ------------------------------------------------------------------ #
    # Keep the lowest backbone layers frozen.
    for name, parameter in model.backbone.named_parameters():
        split_name = name.split(".")[0]
        parameter.requires_grad = split_name not in ["0", "1", "2", "3"]

    # define optimizer (rebuilt so it picks up the newly trainable weights)
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005,
                                momentum=0.9, weight_decay=0.0005)
    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=5,
                                                   gamma=0.33)
    num_epochs = 20
    for epoch in range(num_epochs):
        # train for one epoch, printing every 50 iterations
        utils.train_one_epoch(model, optimizer, train_data_loader,
                              device, epoch, print_freq=50,
                              train_loss=train_loss, train_lr=learning_rate)
        # update the learning rate
        lr_scheduler.step()

        # evaluate on the test dataset
        utils.evaluate(model, val_data_set_loader, device=device, mAP_list=val_mAP)

        # save weights
        if epoch > 10:
            save_files = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'epoch': epoch}
            torch.save(save_files, "./save_weights/mobile-model-{}.pth".format(epoch))

    # plot loss and lr curve
    if len(train_loss) != 0 and len(learning_rate) != 0:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if len(val_mAP) != 0:
        from plot_curve import plot_map
        plot_map(val_mAP)
def main(parser_data):
    """Train and evaluate a detection model on the X-ray dataset.

    A checkpoint holding the model, optimizer, scheduler and epoch is written
    to ``./save_weights`` after every epoch.

    Args:
        parser_data: Parsed command-line options. The attributes read are
            ``device``, ``data_path``, ``resume``, ``start_epoch`` and
            ``epochs``. ``start_epoch`` is overwritten when a checkpoint is
            resumed.
    """
    import os  # local import: only needed for creating the checkpoint folder

    device = torch.device(
        parser_data.device if torch.cuda.is_available() else "cpu")
    print(device)

    # torch.save does not create missing directories, so make sure the
    # checkpoint folder exists before the first epoch finishes.
    os.makedirs("save_weights", exist_ok=True)

    data_transform = {
        "train": transforms.Compose(
            [transforms.ToTensor(), transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    root = parser_data.data_path

    # load train data set
    train_data_set = XRayDataset(root=root,
                                 transform=data_transform["train"],
                                 train_set=True)
    # Note: collate_fn is custom because every sample consists of an image
    # and its targets, so the default batching cannot be used.
    train_data_loader = torch.utils.data.DataLoader(
        train_data_set,
        batch_size=4,
        shuffle=True,
        num_workers=0,
        collate_fn=utils.collate_fn)

    # load validation data set
    val_data_set = XRayDataset(root=root,
                               transform=data_transform["val"],
                               train_set=False)
    val_data_set_loader = torch.utils.data.DataLoader(
        val_data_set,
        batch_size=2,
        shuffle=False,
        num_workers=0,
        collate_fn=utils.collate_fn)

    # create model, num_classes equals background + 5 classes
    model = create_model(num_classes=6)
    print(model)

    model.to(device)

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=0.005,
                                momentum=0.9,
                                weight_decay=0.0005)

    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=5,
                                                   gamma=0.33)

    # If a checkpoint from a previous run is given, continue from it.
    if parser_data.resume != "":
        # map_location keeps a GPU-saved checkpoint loadable when this run
        # has fallen back to the CPU.
        checkpoint = torch.load(parser_data.resume, map_location=device)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        parser_data.start_epoch = checkpoint['epoch'] + 1
        print("the training process from epoch{}...".format(
            parser_data.start_epoch))

    for epoch in range(parser_data.start_epoch, parser_data.epochs):
        # train for one epoch, printing every 50 iterations
        utils.train_one_epoch(model,
                              optimizer,
                              train_data_loader,
                              device,
                              epoch,
                              print_freq=50,
                              warmup=True)
        # update the learning rate
        lr_scheduler.step()

        # evaluate on the test dataset
        utils.evaluate(model, val_data_set_loader, device=device)

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch
        }
        torch.save(save_files,
                   "./save_weights/resNetFpn-model-{}.pth".format(epoch))
def main():
    """Train a CIFAR-10/100 classifier with the ISDA loss and validate it.

    Configuration is read from the module-level ``args`` and
    ``training_configurations``. The function (re)initialises the globals
    ``best_prec1``, ``val_acc`` and ``class_num``, builds the data loaders and
    the selected network, optionally resumes from ``args.resume``, and then
    trains and validates once per epoch. A checkpoint is saved and the
    accuracy history is written to ``accuracy_file`` after every epoch.

    Raises:
        AssertionError: If ``args.dataset`` is not ``'cifar10'`` or
            ``'cifar100'``, or if ``args.resume`` is set but is not a file.
        ValueError: If ``args.model`` (or the ``cardinality`` /
            ``widen_factor`` chosen for it) is not supported.
    """
    global best_prec1
    best_prec1 = 0

    global val_acc
    val_acc = []

    global class_num
    class_num = 10 if args.dataset == 'cifar10' else 100

    normalize = transforms.Normalize(
        mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
        std=[x / 255.0 for x in [63.0, 62.1, 66.7]])

    if args.augment:
        if args.autoaugment:
            print('Autoaugment')
        elif args.cutout:
            print('Cutout')
        else:
            print('Standrad Augmentation!')

        # Shared prefix: reflect-pad by 4 pixels, random 32x32 crop, flip.
        train_steps = [
            transforms.ToTensor(),
            transforms.Lambda(lambda x: F.pad(x.unsqueeze(0), (4, 4, 4, 4),
                                              mode='reflect').squeeze()),
            transforms.ToPILImage(),
            transforms.RandomCrop(32),
            transforms.RandomHorizontalFlip(),
        ]
        if args.autoaugment:
            train_steps.append(CIFAR10Policy())
        train_steps.append(transforms.ToTensor())
        # Cutout is part of both the autoaugment and the cutout pipelines.
        if args.autoaugment or args.cutout:
            train_steps.append(Cutout(n_holes=args.n_holes, length=args.length))
        train_steps.append(normalize)
        transform_train = transforms.Compose(train_steps)
    else:
        transform_train = transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])

    transform_test = transforms.Compose([transforms.ToTensor(), normalize])

    kwargs = {'num_workers': 1, 'pin_memory': True}
    assert (args.dataset == 'cifar10' or args.dataset == 'cifar100')
    dataset_class = getattr(datasets, args.dataset.upper())
    batch_size = training_configurations[args.model]['batch_size']
    train_loader = torch.utils.data.DataLoader(
        dataset_class('../data', train=True, download=True,
                      transform=transform_train),
        batch_size=batch_size, shuffle=True, **kwargs)
    val_loader = torch.utils.data.DataLoader(
        dataset_class('../data', train=False, transform=transform_test),
        batch_size=batch_size, shuffle=True, **kwargs)

    # create model
    if args.model == 'resnet':
        # getattr replaces eval(): same attribute lookup, no code execution.
        model = getattr(networks.resnet,
                        'resnet' + str(args.layers) + '_cifar')(dropout_rate=args.droprate)
    elif args.model == 'se_resnet':
        model = getattr(networks.se_resnet,
                        'resnet' + str(args.layers) + '_cifar')(dropout_rate=args.droprate)
    elif args.model == 'wideresnet':
        model = networks.wideresnet.WideResNet(args.layers, class_num,
                                               args.widen_factor, dropRate=args.droprate)
    elif args.model == 'se_wideresnet':
        model = networks.se_wideresnet.WideResNet(args.layers, class_num,
                                                  args.widen_factor, dropRate=args.droprate)
    elif args.model == 'densenet_bc':
        model = networks.densenet_bc.DenseNet(
            growth_rate=args.growth_rate,
            block_config=(int((args.layers - 4) / 6),) * 3,
            compression=args.compression_rate,
            num_init_features=24,
            bn_size=args.bn_size,
            drop_rate=args.droprate,
            small_inputs=True,
            efficient=False)
    elif args.model == 'shake_pyramidnet':
        model = networks.shake_pyramidnet.PyramidNet(dataset=args.dataset,
                                                     depth=args.layers,
                                                     alpha=args.alpha,
                                                     num_classes=class_num,
                                                     bottleneck=True)
    elif args.model == 'resnext':
        if args.cardinality == 8:
            model = networks.resnext.resnext29_8_64(class_num)
        elif args.cardinality == 16:
            model = networks.resnext.resnext29_16_64(class_num)
        else:
            raise ValueError(
                'Unsupported resnext cardinality: {}'.format(args.cardinality))
    elif args.model == 'shake_shake':
        if args.widen_factor == 112:
            model = networks.shake_shake.shake_resnet26_2x112d(class_num)
        elif args.widen_factor == 32:
            model = networks.shake_shake.shake_resnet26_2x32d(class_num)
        elif args.widen_factor == 96:
            # NOTE(review): widen_factor 96 builds the 2x32d network, exactly
            # as the original code did — looks like a copy/paste slip; confirm
            # whether a 2x96d constructor was intended.
            model = networks.shake_shake.shake_resnet26_2x32d(class_num)
        else:
            raise ValueError(
                'Unsupported shake_shake widen_factor: {}'.format(args.widen_factor))
    elif args.model == 'shake_shake_x':
        model = networks.shake_shake.shake_resnext29_2x4x64d(class_num)
    else:
        # Fail here with a clear message instead of a NameError further down.
        raise ValueError('Unsupported model: {}'.format(args.model))

    if not os.path.isdir(check_point):
        mkdir_p(check_point)

    fc = Full_layer(int(model.feature_num), class_num)

    print('Number of final features: {}'.format(int(model.feature_num)))
    print('Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()])
        + sum([p.data.nelement() for p in fc.parameters()])))

    cudnn.benchmark = True

    # define loss function (criterion) and optimizer
    isda_criterion = ISDALoss(int(model.feature_num), class_num).cuda()
    ce_criterion = nn.CrossEntropyLoss().cuda()

    model_config = training_configurations[args.model]
    optimizer = torch.optim.SGD(
        [{'params': model.parameters()},
         {'params': fc.parameters()}],
        lr=model_config['initial_learning_rate'],
        momentum=model_config['momentum'],
        nesterov=model_config['nesterov'],
        weight_decay=model_config['weight_decay'])

    model = torch.nn.DataParallel(model).cuda()
    fc = nn.DataParallel(fc).cuda()

    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isfile(
            args.resume), 'Error: no checkpoint directory found!'
        args.checkpoint = os.path.dirname(args.resume)
        # NOTE(review): the checkpoint contains a pickled criterion object, so
        # only load checkpoints from trusted sources.
        checkpoint = torch.load(args.resume)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        fc.load_state_dict(checkpoint['fc'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        isda_criterion = checkpoint['isda_criterion']
        val_acc = checkpoint['val_acc']
        best_prec1 = checkpoint['best_acc']
        np.savetxt(accuracy_file, np.array(val_acc))
    else:
        start_epoch = 0

    for epoch in range(start_epoch, model_config['epochs']):

        adjust_learning_rate(optimizer, epoch + 1)

        # train for one epoch
        train(train_loader, model, fc, isda_criterion, optimizer, epoch)

        # evaluate on validation set
        prec1 = validate(val_loader, model, fc, ce_criterion, epoch)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)

        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'fc': fc.state_dict(),
            'best_acc': best_prec1,
            'optimizer': optimizer.state_dict(),
            'isda_criterion': isda_criterion,
            'val_acc': val_acc,
        }, is_best, checkpoint=check_point)
        print('Best accuracy: ', best_prec1)
        np.savetxt(accuracy_file, np.array(val_acc))

    print('Best accuracy: ', best_prec1)
    # Average over the last (up to) 10 recorded accuracies; dividing by the
    # real count keeps the value correct for runs shorter than 10 epochs.
    recent_acc = val_acc[-10:]
    print('Average accuracy',
          sum(recent_acc) / len(recent_acc) if len(recent_acc) > 0 else 0.0)
    np.savetxt(accuracy_file, np.array(val_acc))
def build_model(self):
    """Create the data loaders, networks, losses and optimizers on ``self``.

    Reads ``self.img_size``, ``self.dataset``, ``self.batch_size``,
    ``self.ch``, ``self.n_res``, ``self.light``, ``self.lr`` and
    ``self.weight_decay``. Sets the ``trainA/trainB/testA/testB`` paths and
    loaders, the two generators, the four discriminators, the loss callables,
    ``G_optim``, ``D_optim`` and ``Rho_clipper``.
    """
    # ---- DataLoader ----
    # NOTE(review): an earlier TODO here said RandomCrop had been removed
    # because of API differences, but RandomCrop is present below — confirm
    # which is intended.
    train_transform = [
        transforms.RandomHorizontalFlip(),
        transforms.Resize((self.img_size + 30, self.img_size + 30)),
        transforms.RandomCrop(self.img_size),
        transforms.ToArray(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
        transforms.ToTensor()
    ]

    test_transform = [
        transforms.Resize((self.img_size, self.img_size)),
        transforms.ToArray(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
        transforms.ToTensor()
    ]

    self.trainA = os.path.join('dataset', self.dataset, 'trainA')
    self.trainB = os.path.join('dataset', self.dataset, 'trainB')
    self.testA = os.path.join('dataset', self.dataset, 'testA')
    self.testB = os.path.join('dataset', self.dataset, 'testB')
    self.trainA_loader = DataLoader(self.trainA, batch_size=self.batch_size,
                                    transforms=train_transform, shuffle=True)
    self.trainB_loader = DataLoader(self.trainB, batch_size=self.batch_size,
                                    transforms=train_transform, shuffle=True)
    self.testA_loader = DataLoader(self.testA, batch_size=1,
                                   transforms=test_transform, shuffle=False)
    self.testB_loader = DataLoader(self.testB, batch_size=1,
                                   transforms=test_transform, shuffle=False)

    # ---- Define Generator, Discriminator ----
    self.genA2B = ResnetGenerator(input_nc=3, output_nc=3, ngf=self.ch,
                                  n_blocks=self.n_res, img_size=self.img_size,
                                  light=self.light)
    self.genB2A = ResnetGenerator(input_nc=3, output_nc=3, ngf=self.ch,
                                  n_blocks=self.n_res, img_size=self.img_size,
                                  light=self.light)
    # 7-layer discriminators (disG*) and 5-layer discriminators (disL*).
    self.disGA = Discriminator(input_nc=3, ndf=self.ch, n_layers=7)
    self.disGB = Discriminator(input_nc=3, ndf=self.ch, n_layers=7)
    self.disLA = Discriminator(input_nc=3, ndf=self.ch, n_layers=5)
    self.disLB = Discriminator(input_nc=3, ndf=self.ch, n_layers=5)

    # ---- Define Loss ----
    self.L1_loss = dygraph.L1Loss()
    self.MSE_loss = layers.mse_loss
    # BCELoss should be called with Normalize=True, used separately.
    self.BCELoss = bce_Loss

    # ---- Trainer ----
    # The chained parameters are materialised into lists so the optimizers
    # hold a reusable collection rather than a one-shot iterator.
    # NOTE(review): the PyTorch reference this was ported from used Adam's
    # `weight_decay`; L1Decay is kept here as in the existing port — confirm
    # that L1 (not L2) regularisation is intended.
    generator_parameters = list(itertools.chain(self.genA2B.parameters(),
                                                self.genB2A.parameters()))
    self.G_optim = optimizer.Adam(
        learning_rate=self.lr,
        beta1=0.5,
        beta2=0.999,
        parameter_list=generator_parameters,
        regularization=fluid.regularizer.L1Decay(self.weight_decay))

    # All four discriminators must be optimised. Previously only disGA and
    # disLB were passed, so disGB and disLA never received updates.
    discriminator_parameters = list(itertools.chain(self.disGA.parameters(),
                                                    self.disGB.parameters(),
                                                    self.disLA.parameters(),
                                                    self.disLB.parameters()))
    self.D_optim = optimizer.Adam(
        learning_rate=self.lr,
        beta1=0.5,
        beta2=0.999,
        parameter_list=discriminator_parameters,
        regularization=fluid.regularizer.L1Decay(self.weight_decay))

    # ---- Rho clipper constraining the value of rho in AdaILN and ILN ----
    self.Rho_clipper = RhoClipper(0, 1)
def main():
    """Train a detection model on the hand dataset in two stages.

    Stage one freezes the whole backbone and trains the remaining parameters
    for 5 epochs, then saves ``./save_weights/pretrain.pth``. Stage two
    unfreezes every backbone layer except those whose parameter names start
    with "0" to "3", trains for 2000 epochs with a step learning-rate
    schedule, and writes a full checkpoint to ``./save_hand_weights`` for
    every epoch above 10.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(device)

    # Create every directory that is written to below. torch.save does not
    # create missing folders, and previously only "save_weights_mobielnet_hand"
    # (which nothing saves into) was created. It is still created so existing
    # side effects are unchanged.
    for weights_dir in ("save_weights_mobielnet_hand", "save_weights", "save_hand_weights"):
        os.makedirs(weights_dir, exist_ok=True)

    data_transform = {
        "train": transforms.Compose(
            [transforms.ToTensor(), transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    VOC_root = os.getcwd()

    # load train data set
    train_data_set = HandDatset(VOC_root, data_transform["train"], True)
    # Note: collate_fn is custom because every sample consists of an image
    # and its targets, so the default batching cannot be used.
    train_data_loader = torch.utils.data.DataLoader(
        train_data_set,
        batch_size=32,
        shuffle=True,
        num_workers=4,
        collate_fn=utils.collate_fn)

    # load validation data set
    val_data_set = HandDatset(VOC_root, data_transform["val"], False)
    val_data_set_loader = torch.utils.data.DataLoader(
        val_data_set,
        batch_size=16,
        shuffle=False,
        num_workers=4,
        collate_fn=utils.collate_fn)

    # create model
    # NOTE(review): the original comment described num_classes as
    # "background + 20 classes", but 12 is passed — confirm against the dataset.
    model = create_model(num_classes=12)
    print(model)

    model.to(device)

    # ------------------------------------------------------------------ #
    # Stage 1: freeze the feature-extraction backbone and train the RPN
    # and the final prediction heads for 5 epochs.
    # ------------------------------------------------------------------ #
    # NOTE(review): the original comment here said "do not freeze", but the
    # loop below does freeze the backbone — confirm which is intended.
    for param in model.backbone.parameters():
        param.requires_grad = False

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=0.005,
                                momentum=0.9,
                                weight_decay=0.0005)

    num_epochs = 5
    for epoch in range(num_epochs):
        # train for one epoch, printing every 50 iterations
        utils.train_one_epoch(model,
                              optimizer,
                              train_data_loader,
                              device,
                              epoch,
                              print_freq=50)

        # evaluate on the test dataset
        utils.evaluate(model, val_data_set_loader, device=device)

    torch.save(model.state_dict(), "./save_weights/pretrain.pth")

    # ------------------------------------------------------------------ #
    # Stage 2: unfreeze the backbone and continue training the network.
    # ------------------------------------------------------------------ #
    # Keep the lowest backbone layers frozen.
    for name, parameter in model.backbone.named_parameters():
        split_name = name.split(".")[0]
        parameter.requires_grad = split_name not in ["0", "1", "2", "3"]

    # define optimizer (rebuilt so it picks up the newly trainable weights)
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=0.005,
                                momentum=0.9,
                                weight_decay=0.0005)
    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=5,
                                                   gamma=0.33)
    num_epochs = 2000
    for epoch in range(num_epochs):
        # train for one epoch, printing every 50 iterations
        utils.train_one_epoch(model,
                              optimizer,
                              train_data_loader,
                              device,
                              epoch,
                              print_freq=50,
                              warmup=True)
        # update the learning rate
        lr_scheduler.step()

        # evaluate on the test dataset
        utils.evaluate(model, val_data_set_loader, device=device)

        # save weights
        if epoch > 10:
            save_files = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'epoch': epoch
            }
            torch.save(save_files,
                       "./save_hand_weights/mobile-model-{}.pth".format(epoch))
def main():
    """Train (or only evaluate) a video re-identification model.

    Configuration is read from the module-level ``args``. Training is
    image-based: every tracklet of the training split is decomposed into
    single images. Evaluation runs on video query/gallery loaders. Standard
    output is redirected to a log file inside ``args.save_dir``. A checkpoint
    is saved after every evaluation.
    """
    torch.manual_seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    use_gpu = torch.cuda.is_available()
    if args.use_cpu:
        use_gpu = False

    log_name = 'log_test.txt' if args.evaluate else 'log_train.txt'
    sys.stdout = Logger(osp.join(args.save_dir, log_name))
    print("==========\nArgs:{}\n==========".format(args))

    if use_gpu:
        print("Currently using GPU {}".format(args.gpu_devices))
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(args.seed)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    print("Initializing dataset {}".format(args.dataset))
    dataset = data_manager.init_vidreid_dataset(root=args.root, name=args.dataset)

    transform_train = T.Compose([
        T.Random2DTranslation(args.height, args.width),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    transform_test = T.Compose([
        T.Resize((args.height, args.width)),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    pin_memory = True if use_gpu else False

    # decompose tracklets into images for image-based training
    new_train = [
        (img_path, pid, camid)
        for img_paths, pid, camid in dataset.train
        for img_path in img_paths
    ]

    trainloader = DataLoader(
        ImageDataset(new_train, transform=transform_train),
        sampler=RandomIdentitySampler(new_train, num_instances=args.num_instances),
        batch_size=args.train_batch, num_workers=args.workers,
        pin_memory=pin_memory, drop_last=True,
    )

    queryloader = DataLoader(
        VideoDataset(dataset.query, seq_len=args.seq_len, sample='evenly',
                     transform=transform_test),
        batch_size=args.test_batch, shuffle=False, num_workers=args.workers,
        pin_memory=pin_memory, drop_last=False,
    )

    galleryloader = DataLoader(
        VideoDataset(dataset.gallery, seq_len=args.seq_len, sample='evenly',
                     transform=transform_test),
        batch_size=args.test_batch, shuffle=False, num_workers=args.workers,
        pin_memory=pin_memory, drop_last=False,
    )

    print("Initializing model: {}".format(args.arch))
    model = models.init_model(name=args.arch, num_classes=dataset.num_train_pids,
                              loss={'xent', 'htri'})
    print("Model size: {:.3f} M".format(count_num_param(model)))

    criterion_xent = CrossEntropyLabelSmooth(num_classes=dataset.num_train_pids,
                                             use_gpu=use_gpu)
    criterion_htri = TripletLoss(margin=args.margin)

    optimizer = init_optim(args.optim, model.parameters(), args.lr, args.weight_decay)
    scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=args.stepsize,
                                         gamma=args.gamma)

    # Checkpoints written on a GPU must be remapped when running on the CPU
    # (for example with args.use_cpu); None keeps the default behaviour.
    map_location = None if use_gpu else 'cpu'

    if args.load_weights:
        # load pretrained weights but ignore layers that don't match in size
        print("Loading pretrained weights from '{}'".format(args.load_weights))
        checkpoint = torch.load(args.load_weights, map_location=map_location)
        pretrain_dict = checkpoint['state_dict']
        model_dict = model.state_dict()
        pretrain_dict = {k: v for k, v in pretrain_dict.items()
                         if k in model_dict and model_dict[k].size() == v.size()}
        model_dict.update(pretrain_dict)
        model.load_state_dict(model_dict)

    if args.resume:
        if osp.isfile(args.resume):
            checkpoint = torch.load(args.resume, map_location=map_location)
            model.load_state_dict(checkpoint['state_dict'])
            args.start_epoch = checkpoint['epoch']
            rank1 = checkpoint['rank1']
            print("Loaded checkpoint from '{}'".format(args.resume))
            print("- start_epoch: {}\n- rank1: {}".format(args.start_epoch, rank1))
        else:
            print("=> No checkpoint found at '{}'".format(args.resume))

    if use_gpu:
        model = nn.DataParallel(model).cuda()

    if args.evaluate:
        print("Evaluate only")
        # Positional arguments match the training-time call further down:
        # (model, queryloader, galleryloader, pool, use_gpu). The pool argument
        # was missing here, which shifted use_gpu into its slot.
        distmat = test(model, queryloader, galleryloader, args.pool, use_gpu,
                       return_distmat=True)
        if args.vis_ranked_res:
            visualize_ranked_results(
                distmat, dataset,
                save_dir=osp.join(args.save_dir, 'ranked_results'),
                topk=20,
            )
        return

    start_time = time.time()
    train_time = 0
    best_rank1 = -np.inf
    best_epoch = 0
    print("==> Start training")

    for epoch in range(args.start_epoch, args.max_epoch):
        start_train_time = time.time()
        train(epoch, model, criterion_xent, criterion_htri, optimizer, trainloader, use_gpu)
        train_time += round(time.time() - start_train_time)

        scheduler.step()

        # Evaluate on the configured schedule, and always after the final
        # epoch. The parentheses spell out the and/or precedence.
        scheduled_eval = ((epoch + 1) > args.start_eval
                          and args.eval_step > 0
                          and (epoch + 1) % args.eval_step == 0)
        if scheduled_eval or (epoch + 1) == args.max_epoch:
            print("==> Test")
            rank1 = test(model, queryloader, galleryloader, args.pool, use_gpu)
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1
                best_epoch = epoch + 1

            # Save the unwrapped module's weights so the checkpoint loads
            # into a model that is not wrapped in DataParallel.
            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()
            save_checkpoint({
                'state_dict': state_dict,
                'rank1': rank1,
                # Number of completed epochs. Resuming uses this value as the
                # start epoch, so storing `epoch` would repeat a finished one.
                'epoch': epoch + 1,
            }, is_best, osp.join(args.save_dir, 'checkpoint_ep' + str(epoch + 1) + '.pth.tar'))

    print("==> Best Rank-1 {:.1%}, achieved at epoch {}".format(best_rank1, best_epoch))

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    train_time = str(datetime.timedelta(seconds=train_time))
    print("Finished. Total elapsed time (h:m:s): {}. Training time (h:m:s): {}.".format(
        elapsed, train_time))
def train(epochs):
    """Train the ``DINetwok`` model with an L1 reconstruction loss.

    The model receives a low/high image pair and is optimised so that its
    first output matches the ground-truth image. The learning rate is
    adjusted once per epoch through ``adjust_learning_rate`` and a checkpoint
    is written every 50 epochs.

    Args:
        epochs: Number of epochs to run; the loop counts from 1 to ``epochs``.
    """
    device = torch.device('cuda')

    # Hyper-parameters shared with adjust_learning_rate().
    param = {
        'lr': 0.001,
        'max_epoch': 60,
        'total_epoch': epochs,
        'lr_pow': 0.95,
    }
    param['running_lr'] = param['lr']

    train_file = 'Dataset05/train_file.txt'
    gt_root = 'Dataset05/training_aug/groundtruth'
    left_high_root = 'Dataset05/training_aug/left_high'
    right_low_root = 'Dataset05/training_aug/right_low'

    # Read the sample list under a context manager so the handle is closed
    # (the file used to stay open for the whole training run).
    with open(train_file) as list_file:
        image_names = [line.strip() for line in list_file]

    crit = nn.L1Loss()
    model = DINetwok().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=param['lr'])
    model.train()

    dataset = EnhanceDataset(
        left_high_root,
        right_low_root,
        gt_root,
        image_names,
        transform=transforms.Compose([
            transforms.RandomCrop(120),
            transforms.RandomHorizontalFlip(),
            transforms.RandomVerticalFlip(),
            transforms.RandomRotation(),
            transforms.ToTensor(),
        ]),
    )
    training_data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=16, shuffle=True, num_workers=2)

    # Timestamp handed to save_checkpoint() together with the epoch.
    time_str = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

    for epoch in range(1, epochs + 1):
        for iteration, (low, high, target) in enumerate(training_data_loader):
            low = low.type(torch.cuda.FloatTensor)
            high = high.type(torch.cuda.FloatTensor)
            target = target.type(torch.cuda.FloatTensor)

            # Only the first output is supervised; the second output is
            # returned by the model but not used in the loss.
            final, lstm_branck = model(low, high)
            loss = crit(final, target)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if iteration % 2 == 0:
                print(
                    "===> Epoch[{}]({}/{}): Loss: {:.10f}; lr:{:.10f}".format(
                        epoch, iteration, len(training_data_loader),
                        loss.item(), param['running_lr']))

        adjust_learning_rate(optimizer, epoch, param)
        print("Epochs={}, lr={}".format(epoch,
                                        optimizer.param_groups[0]["lr"]))
        if epoch % 50 == 0:
            save_checkpoint(model, epoch, time_str)
def main():
    """Run the CIFAR-10 architecture-parameter search loop.

    Each epoch samples 20 candidate block selections from the current
    architecture parameters, measures an accuracy for each one with
    ``ValAcc`` and feeds the accuracies to
    ``train_architecture_parameters``.

    Weight training (``train_weight_parameters``) and the Ray-based parallel
    evaluation are disabled in this version; candidates are evaluated
    sequentially on a single shared model.
    """
    global args, best_prec1
    args = parser.parse_args()
    print(args)

    cudnn.benchmark = True

    normalize = transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
                                     std=[0.2473, 0.2434, 0.2610])
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    transform_val = transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])

    trainset = torchvision.datasets.CIFAR10(
        root='./data', train=True, download=True, transform=transform_train)
    valset = torchvision.datasets.CIFAR10(
        root='./data', train=False, download=True, transform=transform_val)

    train_loader = torch.utils.data.DataLoader(
        trainset, batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(
        valset, batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    blocks = get_blocks(cifar10=True)
    model = SampleNet(num_classes=10, blocks=blocks)
    model.architecture_parameters = read_architecture_parameters()
    architecture_parameters = model.architecture_parameters

    # NOTE: the criterion stays on the CPU (the .cuda() variant is disabled).
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    num_candidates = 20

    for epoch in range(args.epochs):
        adjust_learning_rate(optimizer, epoch)

        # Weight-training phase (the actual training call is disabled).
        model.is_train_weight = True
        print('train_weight_parameters')

        # Sample the candidate architectures for this epoch.
        index_candidate_blocks = [
            get_index_candidate_blocks(architecture_parameters)
            for _ in range(num_candidates)
        ]

        model.is_train_weight = False

        # Evaluate each candidate in turn. The previous code appended the
        # *same* model object 20 times, so every evaluator saw only the last
        # candidate's `index_candidate_block`. Selecting the candidate right
        # before building and querying its evaluator keeps the shared model
        # while giving every accuracy its own candidate.
        acc_results = []
        for candidate in index_candidate_blocks:
            model.index_candidate_block = candidate
            acc_results.append(ValAcc(model).get_acc())

        print('train_architecture_parameters')
        architecture_parameters = train_architecture_parameters(
            architecture_parameters, index_candidate_blocks, acc_results)
def main(parser_data):
    """Train and evaluate the detection model on PASCAL VOC 2012.

    Args:
        parser_data: Parsed command-line options. The attributes read here
            are ``device``, ``data_path``, ``aspect_ratio_group_factor``,
            ``batch_size``, ``num_classes``, ``resume``, ``start_epoch`` and
            ``epochs``.

    Raises:
        FileNotFoundError: If ``VOCdevkit`` is missing under ``data_path``.
    """
    device = torch.device(
        parser_data.device if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    # File that collects the per-epoch evaluation metrics.
    results_file = "results{}.txt".format(
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    data_transform = {
        "train": transforms.Compose(
            [transforms.ToTensor(), transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    VOC_root = parser_data.data_path
    # check voc root
    if os.path.exists(os.path.join(VOC_root, "VOCdevkit")) is False:
        raise FileNotFoundError(
            "VOCdevkit dose not in path:'{}'.".format(VOC_root))

    # load train data set
    # VOCdevkit -> VOC2012 -> ImageSets -> Main -> train.txt
    train_dataset = VOCDataSet(VOC_root, "2012", data_transform["train"],
                               "train.txt")

    batch_size = parser_data.batch_size

    # Optionally build batches from images with similar aspect ratios.
    # The options are read from `parser_data`, not a module-level `args`,
    # so the function also works when it is called from other code.
    train_sampler = None
    train_batch_sampler = None
    if parser_data.aspect_ratio_group_factor >= 0:
        train_sampler = torch.utils.data.RandomSampler(train_dataset)
        # Bin index of every image's aspect ratio.
        group_ids = create_aspect_ratio_groups(
            train_dataset, k=parser_data.aspect_ratio_group_factor)
        # Every batch is drawn from a single aspect-ratio bin.
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids,
                                                  batch_size)

    # number of workers
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])
    print('Using %g dataloader workers' % nw)

    # A custom collate_fn is required because every sample is an
    # (image, target) pair that the default collate cannot stack.
    if train_sampler is not None:
        # Grouped sampling needs `batch_sampler` instead of `batch_size`.
        train_data_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_sampler=train_batch_sampler,
            pin_memory=True,
            num_workers=nw,
            collate_fn=train_dataset.collate_fn)
    else:
        train_data_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=True,
            pin_memory=True,
            num_workers=nw,
            collate_fn=train_dataset.collate_fn)

    # load validation data set
    # VOCdevkit -> VOC2012 -> ImageSets -> Main -> val.txt
    val_dataset = VOCDataSet(VOC_root, "2012", data_transform["val"],
                             "val.txt")
    val_data_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=1,
        shuffle=False,
        pin_memory=True,
        num_workers=nw,
        collate_fn=val_dataset.collate_fn)

    # create model (per the original note, num_classes excludes background)
    model = create_model(num_classes=parser_data.num_classes)
    model.to(device)

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=0.005,
                                momentum=0.9,
                                weight_decay=0.0005)

    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.33)

    # Continue from a previous run when a checkpoint path is given.
    if parser_data.resume != "":
        checkpoint = torch.load(parser_data.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        parser_data.start_epoch = checkpoint['epoch'] + 1
        print("the training process from epoch{}...".format(
            parser_data.start_epoch))

    # torch.save() does not create missing directories.
    os.makedirs("./save_weights", exist_ok=True)

    train_loss = []
    learning_rate = []
    val_map = []

    for epoch in range(parser_data.start_epoch, parser_data.epochs):
        # train for one epoch, printing every 50 iterations
        mean_loss, lr = utils.train_one_epoch(model,
                                              optimizer,
                                              train_data_loader,
                                              device,
                                              epoch,
                                              print_freq=50,
                                              warmup=True)
        train_loss.append(mean_loss.item())
        learning_rate.append(lr)

        # update the learning rate
        lr_scheduler.step()

        # evaluate on the test dataset
        coco_info = utils.evaluate(model, val_data_loader, device=device)

        # Append the COCO metrics, the mean loss and the learning rate.
        with open(results_file, "a") as f:
            result_info = [
                str(round(i, 4)) for i in coco_info + [mean_loss.item()]
            ] + [str(round(lr, 6))]
            txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
            f.write(txt + "\n")

        val_map.append(coco_info[1])  # pascal mAP

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch
        }
        torch.save(save_files,
                   "./save_weights/resNetFpn-model-{}.pth".format(epoch))

    # plot loss and lr curve
    if train_loss and learning_rate:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if val_map:
        from plot_curve import plot_map
        plot_map(val_map)
def main(parser_data):
    """Train and evaluate the detection model on the VOC07+12 data set.

    Args:
        parser_data: Parsed command-line options. The attributes read here
            are ``device``, ``batch_size``, ``num_classes``, ``resume``,
            ``start_epoch`` and ``epochs``.
    """
    device = torch.device(
        parser_data.device if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    # File that collects the per-epoch evaluation metrics; the batch size is
    # prepended to the file name.
    results_file = str(parser_data.batch_size) + "results{}.txt".format(
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    data_transform = {
        "train": transforms.Compose([transforms.ToTensor(),
                                     transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    print("loading data...")
    # NOTE(review): the data root is hard-coded and `parser_data.data_path`
    # is ignored (the VOCdevkit existence check is disabled as well).
    VOC_root = '/remote-home/xymou/NNDL/datasets/voc/'

    # load train data set
    train_data_set = VOC0712DataSet(VOC_root, data_transform["train"],
                                    "train.txt")

    # A custom collate_fn is required because every sample is an
    # (image, target) pair that the default collate cannot stack.
    batch_size = parser_data.batch_size
    # number of workers
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])
    print('Using %g dataloader workers' % nw)
    train_data_loader = torch.utils.data.DataLoader(
        train_data_set,
        batch_size=batch_size,
        shuffle=True,
        pin_memory=True,
        num_workers=nw,
        collate_fn=train_data_set.collate_fn)
    print("traindata loaded")

    # load validation data set; collate with the validation set's own
    # collate_fn instead of borrowing the training set's.
    val_data_set = VOC0712DataSet(VOC_root, data_transform["val"], "val.txt")
    val_data_set_loader = torch.utils.data.DataLoader(
        val_data_set,
        batch_size=batch_size,
        shuffle=False,
        pin_memory=True,
        num_workers=nw,
        collate_fn=val_data_set.collate_fn)

    # create model; num_classes equals background + the object classes
    model = create_model(num_classes=parser_data.num_classes + 1,
                         device=device)
    model.to(device)

    # define optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9,
                                weight_decay=0.0005)

    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.33)

    # Continue from a previous run when a checkpoint path is given.
    if parser_data.resume != "":
        checkpoint = torch.load(parser_data.resume, map_location=device)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        parser_data.start_epoch = checkpoint['epoch'] + 1
        print("the training process from epoch{}...".format(
            parser_data.start_epoch))

    # torch.save() does not create missing directories.
    os.makedirs("./save_weights", exist_ok=True)

    train_loss = []
    learning_rate = []
    val_map = []

    for epoch in range(parser_data.start_epoch, parser_data.epochs):
        # train for one epoch, printing every 50 iterations
        mean_loss, lr = utils.train_one_epoch(model, optimizer,
                                              train_data_loader,
                                              device=device, epoch=epoch,
                                              print_freq=50, warmup=True)
        train_loss.append(mean_loss.item())
        learning_rate.append(lr)

        # update the learning rate
        lr_scheduler.step()

        # evaluate on the test dataset
        coco_info = utils.evaluate(model, val_data_set_loader, device=device)

        # Append the COCO metrics, the mean loss and the learning rate.
        # The learning rate keeps 6 decimals: with lr=0.005 and gamma=0.33
        # the decayed values are no longer distinguishable at 4 decimals.
        with open(results_file, "a") as f:
            result_info = [
                str(round(i, 4)) for i in coco_info + [mean_loss.item()]
            ] + [str(round(lr, 6))]
            txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
            f.write(txt + "\n")

        val_map.append(coco_info[1])  # pascal mAP

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch}
        torch.save(save_files,
                   "./save_weights/resNetFpn-model-{}.pth".format(epoch))

    # plot loss and lr curve
    if train_loss and learning_rate:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if val_map:
        from plot_curve import plot_map
        plot_map(val_map)
# Parse the YAML configuration file into a Config object.
with open(config_file, "r") as ymlfile:
    yml_file = yaml.safe_load(ymlfile)
cfg = Config(yml_file)

np.random.seed(0)  # Deterministic random
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Set up dataset and dataloader
print('Loading dataset...')

# Both collections are built the same way: a random crop followed by tensor
# conversion and random flips / transposition. Each collection gets its own
# freshly constructed transform pipeline.
_datasets = {}
for _collection in ('train', 'validation'):
    _pipeline = transforms.Compose([
        trf.RandomCrop(cfg.patch_size),
        trf.ToTensor(),
        trf.RandomHorizontalFlip(p=0.5),
        trf.RandomVerticalFlip(p=0.5),
        trf.RandomTranspose(p=0.5),
    ])
    _datasets[_collection] = LTSIDDataset(
        cfg.input_dir,
        cfg.truth_dir,
        preprocess=cfg.preprocess,
        preprocess_dir=cfg.preprocess_dir,
        collection=_collection,
        transforms=_pipeline,
    )
train_dataset = _datasets['train']
validation_dataset = _datasets['validation']

# Both loaders shuffle their samples.
train_loader = DataLoader(train_dataset,
                          batch_size=cfg.batch_size,
                          shuffle=True)
validation_loader = DataLoader(validation_dataset,
                               batch_size=cfg.batch_size,
                               shuffle=True)
def train_each_teacher(num_epoch, train_data, train_label, test_data,
                       test_label, save_path):
    """Train one teacher classifier and save its weights.

    Args:
        num_epoch: Number of training epochs.
        train_data: Training samples passed to ``ImageDataset``.
        train_label: Labels for ``train_data``.
        test_data: Evaluation samples passed to ``ImageDataset``.
        test_label: Labels for ``test_data``.
        save_path: Destination of the saved ``state_dict``.
    """
    import datetime  # local import: only needed to format the elapsed time

    torch.manual_seed(config.seed)
    print('len of train_data in network', len(train_data))
    os.environ['CUDA_VISIBLE_DEVICES'] = config.gpu_devices
    print('it is training now')

    use_gpu = torch.cuda.is_available()
    if config.use_cpu:
        use_gpu = False
    print('whether evaluate', config.evaluate)

    if use_gpu:
        print("Currently using GPU {}".format(config.gpu_devices))
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(config.seed)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    if config.dataset == 'mnist':
        transform_train = T.Compose([
            T.Random2DTranslation(config.height, config.width),
            T.ToTensor(),
            T.Normalize(mean=[0.1307], std=[0.3081]),
        ])
        transform_test = T.Compose([
            T.Resize((config.height, config.width)),
            T.ToTensor(),
            T.Normalize(mean=[0.1307], std=[0.3081]),
        ])
    else:
        # Every other data set: 32x32 padded random crop plus horizontal
        # flip for training, three-channel normalisation for both splits.
        transform_train = T.Compose([
            T.ToPILImage(),
            T.RandomCrop(32, padding=4),
            T.RandomHorizontalFlip(),
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225]),
        ])
        transform_test = T.Compose([
            T.ToPILImage(),
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225]),
        ])

    pin_memory = True if use_gpu else False
    print('train_data', len(train_data), 'train_label', len(train_label))

    trainloader = DataLoader(
        ImageDataset(train_data, label=train_label,
                     transform=transform_train),
        batch_size=config.train_batch,
        shuffle=True,
        num_workers=config.workers,
        pin_memory=pin_memory,
        drop_last=True,
    )
    testloader = DataLoader(
        ImageDataset(test_data, label=test_label, transform=transform_test),
        batch_size=config.test_batch,
        shuffle=False,
        num_workers=config.workers,
        pin_memory=pin_memory,
        drop_last=False,
    )

    model = models.init_model(name=config.arch, num_classes=config.nb_labels,
                              loss={'xent'}, use_gpu=use_gpu)
    if use_gpu:
        model = nn.DataParallel(model).cuda()
    criterion = torch.nn.CrossEntropyLoss()

    print("==> Start training")
    start_time = time.time()

    for epoch in range(num_epoch):
        # The optimizer is rebuilt every epoch with the scheduled learning
        # rate (this also resets the SGD momentum buffers).
        lr = learning_rate(config.lr, epoch)
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9,
                              weight_decay=0.0005)
        print('\n=> Training Epoch #%d, LR=%.4f' % (epoch, lr))
        train(epoch, model, criterion, optimizer, trainloader, use_gpu)
        rank1 = test(model, testloader, use_gpu)

    # Final evaluation; also covers the num_epoch == 0 case.
    rank1 = test(model, testloader, use_gpu)

    # Unwrap DataParallel so the weights load into a plain model.
    if use_gpu:
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()
    print('save model', save_path)
    torch.save(state_dict, save_path)

    # Format the duration as h:m:s, as the message promises (it used to
    # print the raw number of seconds).
    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    print("Finished. Training time (h:m:s): {}.".format(elapsed))
def __init__(self):
    """Prepare data statistics, data loaders, the network and its optimizer.

    Steps performed:
      1. Compute the global pixel mean / std over every non-blacklisted
         ``.npy`` volume in ``preprocesspath``.
      2. Build the train / test transforms using those statistics.
      3. Read the nodule CSV, split the nodules into train and test by
         whether the series id belongs to the ``subset<fold>`` directory, and
         build a raw-pixel feature vector per nodule (centre crop + diameter).
      4. Create the data loaders.
      5. Build the network (or restore it from ``ckpt.t7`` when
         ``args.resume`` is set), the loss and the SGD optimizer.
    """
    self.best_acc = 0
    self.best_acc_gbt = 0
    self.start_epoch = 0
    self.use_cuda = torch.cuda.is_available()

    # --- 1. global pixel statistics --------------------------------------
    npy_names = [
        fname for fname in os.listdir(preprocesspath)
        if fname.endswith('.npy') and fname[:-4] not in blklst
    ]
    pixvlu, npix = 0, 0
    for fname in npy_names:
        data = np.load(os.path.join(preprocesspath, fname))
        pixvlu += np.sum(data)
        npix += np.prod(data.shape)
    pixmean = pixvlu / float(npix)
    # Second pass: accumulate squared deviations from the mean.
    pixvlu = 0
    for fname in npy_names:
        data = np.load(os.path.join(preprocesspath, fname)) - pixmean
        pixvlu += np.sum(data * data)
    pixstd = np.sqrt(pixvlu / float(npix))
    print('pixmean:%.3f, pixstd:%.3f' % (pixmean, pixstd))
    logging.info('mean ' + str(pixmean) + ' std ' + str(pixstd))

    # --- 2. data transforms ----------------------------------------------
    logging.info('==> Preparing data..')
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.RandomYFlip(),
        transforms.RandomZFlip(),
        transforms.ZeroOut(4),
        transforms.ToTensor(),
        transforms.Normalize(pixmean, pixstd),
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(pixmean, pixstd),
    ])

    # --- 3. data lists ---------------------------------------------------
    self.trfnamelst = []
    trlabellst = []
    trfeatlst = []
    self.tefnamelst = []
    telabellst = []
    tefeatlst = []

    dataframe = pd.read_csv(csvfilepath, names=[
        'seriesuid', 'coordX', 'coordY', 'coordZ', 'diameter_mm', 'malignant'
    ])
    # Column names are supplied explicitly, so the file's first row is
    # read as data and skipped with [1:].
    alllst = dataframe['seriesuid'].tolist()[1:]
    labellst = dataframe['malignant'].tolist()[1:]
    dimlst = dataframe['diameter_mm'].tolist()[1:]

    # Series ids of the held-out fold (set for O(1) membership tests).
    teidlst = {
        fname[:-4]
        for fname in os.listdir(luna16path + '/subset' + str(fold) + '/')
        if fname.endswith('.mhd')
    }

    mxd = 0  # largest diameter over all rows, used to scale that feature
    for srsid, label, d in zip(alllst, labellst, dimlst):
        mxd = max(abs(float(d)), mxd)
        if srsid in blklst:
            continue
        npy_path = os.path.join(preprocesspath, srsid + '.npy')
        if not os.path.exists(npy_path):
            continue

        # Crop the central CROPSIZE^3 cube as the raw-pixel feature.
        # Floor division keeps the slice bounds integral on Python 3
        # (plain `/` produces floats there, which cannot index an array).
        data = np.load(npy_path)
        bgx = data.shape[0] // 2 - CROPSIZE // 2
        bgy = data.shape[1] // 2 - CROPSIZE // 2
        bgz = data.shape[2] // 2 - CROPSIZE // 2
        data = np.array(data[bgx:bgx + CROPSIZE,
                             bgy:bgy + CROPSIZE,
                             bgz:bgz + CROPSIZE])
        feat = np.hstack((np.reshape(data, (-1, )) / 255, float(d)))

        if srsid.split('-')[0] in teidlst:
            self.tefnamelst.append(srsid + '.npy')
            telabellst.append(int(label))
            tefeatlst.append(feat)
        else:
            self.trfnamelst.append(srsid + '.npy')
            trlabellst.append(int(label))
            trfeatlst.append(feat)

    # Scale the diameter (last feature entry) by the maximum diameter.
    for feat in trfeatlst:
        feat[-1] /= mxd
    for feat in tefeatlst:
        feat[-1] /= mxd

    # --- 4. data loaders -------------------------------------------------
    trainset = lunanod(preprocesspath, self.trfnamelst, trlabellst,
                       trfeatlst, train=True, download=True,
                       transform=transform_train)
    self.trainloader = torch.utils.data.DataLoader(
        trainset, batch_size=16, shuffle=True, num_workers=30)
    testset = lunanod(preprocesspath, self.tefnamelst, telabellst,
                      tefeatlst, train=False, download=True,
                      transform=transform_test)
    self.testloader = torch.utils.data.DataLoader(
        testset, batch_size=16, shuffle=False, num_workers=30)

    # --- 5. model, loss, optimizer ---------------------------------------
    if args.resume:
        # Load checkpoint and keep its bookkeeping on the instance (these
        # values used to be assigned to locals and silently discarded).
        logging.info('==> Resuming from checkpoint..')
        checkpoint = torch.load(savemodelpath + 'ckpt.t7')
        self.net = checkpoint['net']
        self.best_acc = checkpoint['acc']
        self.start_epoch = checkpoint['epoch']
    else:
        logging.info('==> Building model..')
        self.net = dpn3d.DPN92_3D()

    if self.use_cuda:
        self.net.cuda()
        self.net = torch.nn.DataParallel(
            self.net, device_ids=list(range(torch.cuda.device_count())))
        cudnn.benchmark = False

    self.criterion = nn.CrossEntropyLoss()
    self.optimizer = optim.SGD(self.net.parameters(), lr=args.lr,
                               momentum=0.9, weight_decay=5e-4)
def build_model(self):
    """Create the data loaders, networks, losses and optimizers.

    Reads ``self.img_size``, ``self.dataset``, ``self.batch_size``,
    ``self.ch``, ``self.n_res``, ``self.light`` and ``self.lr``; stores the
    loaders, the two generators, the four discriminators, the loss
    functions and the two Adam optimizers on ``self``.
    """
    # ---- DataLoader ------------------------------------------------------
    train_transform = [
        transforms.RandomHorizontalFlip(),
        # Resize 30 px larger than the target, then random-crop back down.
        transforms.Resize((self.img_size + 30, self.img_size + 30)),
        transforms.RandomCrop(self.img_size),
        transforms.ToArray(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
        transforms.ToTensor()
    ]
    test_transform = [
        transforms.Resize((self.img_size, self.img_size)),
        transforms.ToArray(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
        transforms.ToTensor()
    ]

    self.trainA = os.path.join('dataset', self.dataset, 'trainA')
    self.trainB = os.path.join('dataset', self.dataset, 'trainB')
    self.testA = os.path.join('dataset', self.dataset, 'testA')
    self.testB = os.path.join('dataset', self.dataset, 'testB')

    self.trainA_loader = DataLoader(self.trainA,
                                    batch_size=self.batch_size,
                                    transforms=train_transform,
                                    shuffle=True)
    self.trainB_loader = DataLoader(self.trainB,
                                    batch_size=self.batch_size,
                                    transforms=train_transform,
                                    shuffle=True)
    self.testA_loader = DataLoader(self.testA,
                                   batch_size=1,
                                   transforms=test_transform,
                                   shuffle=False)
    self.testB_loader = DataLoader(self.testB,
                                   batch_size=1,
                                   transforms=test_transform,
                                   shuffle=False)

    # ---- Define Generator, Discriminator ---------------------------------
    self.genA2B = ResnetGenerator(input_nc=3, output_nc=3, ngf=self.ch,
                                  n_blocks=self.n_res,
                                  img_size=self.img_size, light=self.light)
    self.genB2A = ResnetGenerator(input_nc=3, output_nc=3, ngf=self.ch,
                                  n_blocks=self.n_res,
                                  img_size=self.img_size, light=self.light)
    # Two discriminators per domain, with 7 and 5 layers respectively.
    self.disGA = Discriminator(input_nc=3, ndf=self.ch, n_layers=7)
    self.disGB = Discriminator(input_nc=3, ndf=self.ch, n_layers=7)
    self.disLA = Discriminator(input_nc=3, ndf=self.ch, n_layers=5)
    self.disLB = Discriminator(input_nc=3, ndf=self.ch, n_layers=5)

    # ---- Define Loss -----------------------------------------------------
    self.L1_loss = dygraph.L1Loss()
    self.MSE_loss = layers.mse_loss
    # BCELoss should be called with Normalize=True, use separately
    self.BCELoss = bce_loss

    # ---- Trainer ---------------------------------------------------------
    self.G_optim = optimizer.Adam(
        learning_rate=self.lr, beta1=0.5, beta2=0.999,
        parameter_list=self.genA2B.parameters() + self.genB2A.parameters())
    # All four discriminators must be optimised. Previously only disGA and
    # disLB were listed, so disGB and disLA never received updates.
    self.D_optim = optimizer.Adam(
        learning_rate=self.lr, beta1=0.5, beta2=0.999,
        parameter_list=(self.disGA.parameters() + self.disGB.parameters() +
                        self.disLA.parameters() + self.disLB.parameters()))
sys.stdout = Logger(osp.join(PATH, 'log_train.txt')) print("Dataset is being initialized") dataset = dataset_manager.init_img_dataset( root='data', name=dataset_name, split_id=split_id, cuhk03_labeled=cuhk03_labeled, cuhk03_classic_split=cuhk03_classic_split, ) tfms_train = tfms.Compose([ tfms.Random2DTranslation(height, width), tfms.RandomHorizontalFlip(), tfms.ToTensor(), tfms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), ]) tfms_test = tfms.Compose([ tfms.Resize(size=(height, width)), tfms.ToTensor(), tfms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), ]) pin_memory = True trainloader = DataLoader( ImageDataset(dataset.train, transform=tfms_train), sampler=RandomIdentitySampler(dataset.train, num_instances=num_instances),
# Build the index -> class-name lookup from the class-name -> index JSON file.
category_index = {}
try:
    # Read the JSON file under a context manager so the handle is closed
    # (it used to be opened and never closed).
    with open('./pascal_voc_classes.json', 'r') as json_file:
        class_dict = json.load(json_file)
    # Swap keys and values: later code looks classes up by index, not name.
    category_index = {v: k for k, v in class_dict.items()}
except Exception as e:
    print(e)
    exit(-1)

data_transform = {
    "train": transforms.Compose(
        [transforms.ToTensor(), transforms.RandomHorizontalFlip(0.5)]),
    "val": transforms.Compose([transforms.ToTensor()])
}

# load train data set
# VOC2012DataSet is the custom data set: the first argument is the directory
# that contains VOC (os.getcwd() for the current directory), the second is
# the preprocessing applied to the training split.
train_data_set = VOC2012DataSet(os.getcwd(), data_transform["train"],
                                "train.txt")
print(len(train_data_set))  # number of training samples

# Randomly sample 5 images.
for index in random.sample(range(0, len(train_data_set)), k=5):
    # Indexing returns (img, target) via the data set's __getitem__.
    img, target = train_data_set[index]
    # Preprocessing produced a tensor; convert it back to a PIL image.
    img = ts.ToPILImage()(img)
def main():
    """Train (or only evaluate) a video re-identification model.

    Everything is driven by the module-level ``args``: output directory and
    logging, data loaders, optional pretrained-weight loading, the
    cross-entropy + triplet criteria, Adam with an optional StepLR schedule,
    periodic evaluation and periodic checkpointing.
    """
    # Output naming. `save_prefix` is built from the already arch-prefixed
    # `save_dir`, so the statement order matters.
    args.save_dir = args.arch + '_' + args.save_dir
    args.save_prefix = args.arch + '_' + args.save_dir

    torch.manual_seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    use_gpu = torch.cuda.is_available() and not args.use_cpu

    # append date with save_dir
    args.save_dir = '../scratch/' + '_'.join(
        [utils.get_currenttime_prefix(), args.dataset, args.save_dir])
    if args.pretrained_model is not None:
        # Write next to the pretrained weights instead.
        args.save_dir = os.path.dirname(args.pretrained_model)
    if not osp.exists(args.save_dir):
        os.makedirs(args.save_dir)

    log_name = 'log_test.txt' if args.evaluate else 'log_train.txt'
    sys.stdout = Logger(osp.join(args.save_dir, log_name))
    print("==========\nArgs:{}\n==========".format(args))

    if not use_gpu:
        print("Currently using CPU (GPU is highly recommended)")
    else:
        print("Currently using GPU {}".format(args.gpu_devices))
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(args.seed)

    print("Initializing dataset {}".format(args.dataset))
    dataset = data_manager.init_dataset(name=args.dataset)

    # ---- transforms ------------------------------------------------------
    transform_train = T.Compose([
        T.Random2DTranslation(args.height, args.width),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    transform_test = T.Compose([
        T.Resize((args.height, args.width)),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # ---- data loaders ----------------------------------------------------
    pin_memory = bool(use_gpu)

    train_set = VideoDataset(dataset.train, seq_len=args.seq_len,
                             sample=args.data_selection,
                             transform=transform_train)
    train_sampler = RandomIdentitySampler(dataset.train,
                                          num_instances=args.num_instances)
    trainloader = DataLoader(
        train_set,
        sampler=train_sampler,
        batch_size=args.train_batch,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=True,
    )

    # Query and gallery loaders share every option except the data split.
    eval_loader_options = dict(
        batch_size=args.test_batch,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=False,
    )
    queryloader = DataLoader(
        VideoDataset(dataset.query, seq_len=args.seq_len, sample='dense',
                     transform=transform_test),
        **eval_loader_options)
    galleryloader = DataLoader(
        VideoDataset(dataset.gallery, seq_len=args.seq_len, sample='dense',
                     transform=transform_test),
        **eval_loader_options)

    # ---- model -----------------------------------------------------------
    print("Initializing model: {}".format(args.arch))
    model = models.init_model(name=args.arch,
                              num_classes=dataset.num_train_pids,
                              seq_len=args.seq_len)

    # pretrained model loading
    if args.pretrained_model is not None:
        if not os.path.exists(args.pretrained_model):
            raise IOError("Can't find pretrained model: {}".format(
                args.pretrained_model))
        print("Loading checkpoint from '{}'".format(args.pretrained_model))
        pretrained_state = torch.load(args.pretrained_model)['state_dict']
        print(len(pretrained_state), ' keys in pretrained model')

        current_model_state = model.state_dict()
        # Keep only tensors whose name and shape match the current model.
        matching_state = {}
        for key, val in pretrained_state.items():
            if key not in current_model_state:
                continue
            if val.size() == current_model_state[key].size():
                matching_state[key] = val
        pretrained_state = matching_state
        print(len(pretrained_state),
              ' keys in pretrained model are available in current model')
        current_model_state.update(pretrained_state)
        model.load_state_dict(current_model_state)

    parameter_count = sum(p.numel() for p in model.parameters())
    print("Model size: {:.5f}M".format(parameter_count / 1000000.0))

    if use_gpu:
        model = nn.DataParallel(model).cuda()

    # ---- losses / optimizer ----------------------------------------------
    criterion_xent = CrossEntropyLabelSmooth(
        num_classes=dataset.num_train_pids, use_gpu=use_gpu)
    criterion_htri = TripletLoss(margin=args.margin)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)
    if args.stepsize > 0:
        scheduler = lr_scheduler.StepLR(optimizer,
                                        step_size=args.stepsize,
                                        gamma=args.gamma)
    start_epoch = args.start_epoch

    if args.evaluate:
        print("Evaluate only")
        test(model, queryloader, galleryloader, use_gpu)
        return

    # ---- training loop ---------------------------------------------------
    start_time = time.time()
    best_rank1 = -np.inf
    is_first_time = True

    for epoch in range(start_epoch, args.max_epoch):
        epoch_number = epoch + 1
        print("==> Epoch {}/{}".format(epoch_number, args.max_epoch))

        train(model, criterion_xent, criterion_htri, optimizer, trainloader,
              use_gpu)

        if args.stepsize > 0:
            scheduler.step()

        # Defaults used when this epoch is not evaluated.
        rank1 = 'NA'
        is_best = False

        # Evaluate every `eval_step` epochs and always after the last one.
        periodic_eval = (args.eval_step > 0
                         and epoch_number % args.eval_step == 0)
        if periodic_eval or epoch_number == args.max_epoch:
            print("==> Test")
            rank1 = test(model, queryloader, galleryloader, use_gpu)
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1

        # save the model as required
        if epoch_number % args.save_step == 0:
            # Unwrap DataParallel so the weights load into a plain model.
            source = model.module if use_gpu else model
            state_dict = source.state_dict()
            checkpoint_path = osp.join(
                args.save_dir,
                args.save_prefix + 'checkpoint_ep' + str(epoch_number) +
                '.pth.tar')
            save_checkpoint(
                {
                    'state_dict': state_dict,
                    'rank1': rank1,
                    'epoch': epoch,
                }, is_best, checkpoint_path)

        is_first_time = False
        if not is_first_time:
            utils.disable_all_print_once()

    total_seconds = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=total_seconds))
    print("Finished. Total elapsed time (h:m:s): {}".format(elapsed))
def main():
    """Train (or only evaluate) an image re-identification model.

    Driven by the module-level ``args``: logging, data loaders with random
    erasing augmentation, cross-entropy + triplet criteria, an optimizer
    wrapped in a ``LambdaLR`` schedule built by ``init_lr_schedule``,
    optional resume, periodic evaluation with a checkpoint after every
    evaluation, and a final timing summary.
    """
    torch.manual_seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    use_gpu = torch.cuda.is_available() and not args.use_cpu

    log_name = 'log_test.txt' if args.evaluate else 'log_train.txt'
    sys.stdout = Logger(osp.join(args.save_dir, log_name))
    print("==========\nArgs:{}\n==========".format(args))

    if not use_gpu:
        print("Currently using CPU (GPU is highly recommended)")
    else:
        print("Currently using GPU {}".format(args.gpu_devices))
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(args.seed)

    print("Initializing dataset {}".format(args.dataset))
    dataset = data_manager.init_img_dataset(
        root=args.root,
        name=args.dataset,
        split_id=args.split_id,
        cuhk03_labeled=args.cuhk03_labeled,
        cuhk03_classic_split=args.cuhk03_classic_split,
    )

    # ---- transforms ------------------------------------------------------
    image_size = (args.height, args.width)
    transform_train = T.Compose([
        T.Resize(image_size),
        T.RandomHorizontalFlip(p=0.5),
        # Pad by 10 px, then random-crop back to the target size.
        T.Pad(10),
        T.RandomCrop([args.height, args.width]),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        torchvision.transforms.RandomErasing(p=0.5,
                                             scale=(0.02, 0.4),
                                             ratio=(0.3, 3.33),
                                             value=(0.4914, 0.4822, 0.4465)),
    ])
    transform_test = T.Compose([
        T.Resize(image_size),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # ---- data loaders ----------------------------------------------------
    pin_memory = bool(use_gpu)

    train_set = ImageDataset(dataset.train, transform=transform_train)
    train_sampler = RandomIdentitySampler2(dataset.train,
                                           batch_size=args.train_batch,
                                           num_instances=args.num_instances)
    trainloader = DataLoader(
        train_set,
        sampler=train_sampler,
        batch_size=args.train_batch,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=True,
    )

    # Query and gallery loaders share every option except the data split.
    eval_loader_options = dict(
        batch_size=args.test_batch,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=False,
    )
    queryloader = DataLoader(
        ImageDataset(dataset.query, transform=transform_test),
        **eval_loader_options)
    galleryloader = DataLoader(
        ImageDataset(dataset.gallery, transform=transform_test),
        **eval_loader_options)

    # ---- model, losses, optimizer ----------------------------------------
    print("Initializing model: {}".format(args.arch))
    model = models.init_model(name=args.arch,
                              num_classes=dataset.num_train_pids,
                              loss={'xent', 'htri'})
    parameter_count = sum(p.numel() for p in model.parameters())
    print("Model size: {:.5f}M".format(parameter_count / 1000000.0))

    criterion_xent = CrossEntropyLabelSmooth(
        num_classes=dataset.num_train_pids, use_gpu=use_gpu)
    criterion_htri = TripletLoss(margin=args.margin)

    optimizer = init_optim(args.optim, model.parameters(), args.lr,
                           args.weight_decay)

    # Learning-rate schedule: modify it here; see the corresponding args.*
    # options for the hyper-parameters.
    current_schedule = init_lr_schedule(schedule=args.schedule,
                                        warm_up_epoch=args.warm_up_epoch,
                                        half_cos_period=args.half_cos_period,
                                        lr_milestone=args.lr_milestone,
                                        gamma=args.gamma,
                                        stepsize=args.stepsize)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
                                                  lr_lambda=current_schedule)

    start_epoch = args.start_epoch

    if args.resume:
        print("Loading checkpoint from '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        model.load_state_dict(checkpoint['state_dict'])
        start_epoch = checkpoint['epoch']

    if use_gpu:
        model = nn.DataParallel(model).cuda()

    if args.evaluate:
        print("Evaluate only")
        test(model, queryloader, galleryloader, use_gpu)
        return

    # ---- training loop ---------------------------------------------------
    start_time = time.time()
    train_time = 0
    best_rank1 = -np.inf
    best_epoch = 0
    print("==> Start training")

    for epoch in range(start_epoch, args.max_epoch):
        epoch_number = epoch + 1

        start_train_time = time.time()
        train(epoch, model, criterion_xent, criterion_htri, optimizer,
              trainloader, use_gpu)
        train_time += round(time.time() - start_train_time)

        if args.schedule:
            scheduler.step()

        # Evaluate every `eval_step` epochs once past `start_eval`, and
        # always after the final epoch.
        periodic_eval = (epoch_number > args.start_eval
                         and args.eval_step > 0
                         and epoch_number % args.eval_step == 0)
        if not (periodic_eval or epoch_number == args.max_epoch):
            continue

        print("==> Test")
        rank1 = test(model, queryloader, galleryloader, use_gpu)
        is_best = rank1 > best_rank1
        if is_best:
            best_rank1 = rank1
            best_epoch = epoch_number

        # Unwrap DataParallel so the weights load into a plain model.
        source = model.module if use_gpu else model
        state_dict = source.state_dict()
        checkpoint_path = osp.join(
            args.save_dir, 'checkpoint_ep' + str(epoch_number) + '.pth.tar')
        save_checkpoint(
            {
                'state_dict': state_dict,
                'rank1': rank1,
                'epoch': epoch,
            }, is_best, checkpoint_path)

    print("==> Best Rank-1 {:.1%}, achieved at epoch {}".format(
        best_rank1, best_epoch))

    total_seconds = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=total_seconds))
    train_time = str(datetime.timedelta(seconds=train_time))
    print(
        "Finished. Total elapsed time (h:m:s): {}. Training time (h:m:s): {}."
        .format(elapsed, train_time))
def main():
    """Train, or only evaluate, an image re-ID model with cross-entropy and center loss.

    All settings are read from the module-level ``args`` namespace. The
    function seeds torch, redirects ``sys.stdout`` to a ``Logger`` file in
    ``args.save_dir``, builds the train/query/gallery loaders, optionally
    restores model weights from ``args.resume`` and then either runs a single
    ``test`` pass (``args.evaluate``) or trains up to ``args.max_epoch``,
    writing a checkpoint after every evaluation.

    Returns:
        None.
    """
    torch.manual_seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    use_gpu = torch.cuda.is_available()
    if args.use_cpu:
        use_gpu = False

    # From here on everything printed goes through the Logger object.
    if not args.evaluate:
        sys.stdout = Logger(osp.join(args.save_dir, 'log_train.txt'))
    else:
        sys.stdout = Logger(osp.join(args.save_dir, 'log_test.txt'))
    print("==========\nArgs:{}\n==========".format(args))

    if use_gpu:
        print("Currently using GPU {}".format(args.gpu_devices))
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(args.seed)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    print("Initializing dataset {}".format(args.dataset))
    dataset = data_manager.init_img_dataset(
        root=args.root,
        name=args.dataset,
        split_id=args.split_id,
        cuhk03_labeled=args.cuhk03_labeled,
        cuhk03_classic_split=args.cuhk03_classic_split,
    )

    transform_train = T.Compose([
        T.Random2DTranslation(args.height, args.width),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    transform_test = T.Compose([
        T.Resize((args.height, args.width)),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Pinned host memory is only requested when batches will go to a GPU.
    pin_memory = use_gpu

    trainloader = DataLoader(
        ImageDataset(dataset.train, transform=transform_train),
        batch_size=args.train_batch,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=True,
    )

    queryloader = DataLoader(
        ImageDataset(dataset.query, transform=transform_test),
        batch_size=args.test_batch,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=False,
    )

    galleryloader = DataLoader(
        ImageDataset(dataset.gallery, transform=transform_test),
        batch_size=args.test_batch,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=False,
    )

    print("Initializing model: {}".format(args.arch))
    model = models.init_model(name=args.arch,
                              num_classes=dataset.num_train_pids,
                              loss={'cent'})
    print("Model size: {:.5f}M".format(
        sum(p.numel() for p in model.parameters()) / 1000000.0))

    criterion_xent = CrossEntropyLabelSmooth(
        num_classes=dataset.num_train_pids, use_gpu=use_gpu)
    criterion_cent = CenterLoss(num_classes=dataset.num_train_pids,
                                feat_dim=model.feat_dim,
                                use_gpu=use_gpu)

    # The center-loss parameters get their own optimizer and learning rate.
    optimizer_model = torch.optim.Adam(model.parameters(),
                                       lr=args.lr,
                                       weight_decay=args.weight_decay)
    optimizer_cent = torch.optim.SGD(criterion_cent.parameters(),
                                     lr=args.lr_cent)

    # The scheduler only exists (and is only stepped) when stepsize > 0.
    if args.stepsize > 0:
        scheduler = lr_scheduler.StepLR(optimizer_model,
                                        step_size=args.stepsize,
                                        gamma=args.gamma)
    start_epoch = args.start_epoch

    if args.resume:
        print("Loading checkpoint from '{}'".format(args.resume))
        # Without a GPU, tensors that were saved from CUDA must be remapped to
        # CPU storage or torch.load raises; with a GPU keep the default.
        checkpoint = torch.load(args.resume,
                                map_location=None if use_gpu else 'cpu')
        model.load_state_dict(checkpoint['state_dict'])
        # NOTE(review): checkpoints written below store the 0-based index of
        # the epoch that just finished, so resuming repeats that epoch.
        start_epoch = checkpoint['epoch']

    if use_gpu:
        model = nn.DataParallel(model).cuda()

    if args.evaluate:
        print("Evaluate only")
        test(model, queryloader, galleryloader, use_gpu)
        return

    start_time = time.time()
    train_time = 0
    best_rank1 = -np.inf
    best_epoch = 0
    print("==> Start training")

    for epoch in range(start_epoch, args.max_epoch):
        start_train_time = time.time()
        train(epoch, model, criterion_xent, criterion_cent, optimizer_model,
              optimizer_cent, trainloader, use_gpu)
        train_time += round(time.time() - start_train_time)

        if args.stepsize > 0:
            scheduler.step()

        # Evaluate every eval_step epochs once past start_eval, and always
        # after the final epoch.
        periodic_eval = ((epoch + 1) > args.start_eval
                         and args.eval_step > 0
                         and (epoch + 1) % args.eval_step == 0)
        if periodic_eval or (epoch + 1) == args.max_epoch:
            print("==> Test")
            rank1 = test(model, queryloader, galleryloader, use_gpu)
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1
                best_epoch = epoch + 1

            # On GPU the model is wrapped in DataParallel; save the inner
            # module's weights so the keys carry no wrapper prefix.
            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()
            save_checkpoint(
                {
                    'state_dict': state_dict,
                    'rank1': rank1,
                    'epoch': epoch,
                }, is_best,
                osp.join(args.save_dir,
                         'checkpoint_ep' + str(epoch + 1) + '.pth.tar'))

    print("==> Best Rank-1 {:.1%}, achieved at epoch {}".format(
        best_rank1, best_epoch))

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    train_time = str(datetime.timedelta(seconds=train_time))
    print(
        "Finished. Total elapsed time (h:m:s): {}. Training time (h:m:s): {}."
        .format(elapsed, train_time))
def main(args):
    """Train a torchvision video model on Kinetics-400, or only evaluate it.

    Args:
        args: Namespace of run options. The attributes read here include
            ``apex``, ``output_dir``, ``device``, ``data_path``, ``train_dir``,
            ``val_dir``, ``cache_dataset``, ``distributed``, ``clip_len``,
            ``clips_per_video``, ``batch_size``, ``workers``, ``model``,
            ``pretrained``, ``sync_bn``, ``lr``, ``world_size``, ``momentum``,
            ``weight_decay``, ``lr_warmup_epochs``, ``lr_milestones``,
            ``lr_gamma``, ``resume``, ``test_only``, ``start_epoch``,
            ``epochs`` and ``print_freq``. ``args.start_epoch`` is overwritten
            when a checkpoint is resumed.

    Raises:
        RuntimeError: If ``args.apex`` is set while running on Python 2 or
            while ``amp`` is ``None``.
    """
    # Mixed precision via apex needs Python 3 and a successfully imported amp.
    if args.apex and sys.version_info < (3, 0):
        raise RuntimeError(
            "Apex currently only supports Python 3. Aborting.")
    if args.apex and amp is None:
        raise RuntimeError(
            "Failed to import apex. Please install apex from https://www.github.com/nvidia/apex "
            "to enable mixed-precision training.")

    if args.output_dir:
        utils.mkdir(args.output_dir)

    utils.init_distributed_mode(args)
    print(args)
    print("torch version: ", torch.__version__)
    print("torchvision version: ", torchvision.__version__)

    device = torch.device(args.device)
    torch.backends.cudnn.benchmark = True

    # ---- datasets -------------------------------------------------------
    print("Loading data")
    train_root = os.path.join(args.data_path, args.train_dir)
    val_root = os.path.join(args.data_path, args.val_dir)
    normalize = T.Normalize(mean=[0.43216, 0.394666, 0.37645],
                            std=[0.22803, 0.22145, 0.216989])

    print("Loading training data")
    load_started = time.time()
    train_cache = _get_cache_path(train_root)
    transform_train = torchvision.transforms.Compose([
        T.ToFloatTensorInZeroOne(),
        T.Resize((128, 171)),
        T.RandomHorizontalFlip(),
        normalize,
        T.RandomCrop((112, 112))
    ])

    if not (args.cache_dataset and os.path.exists(train_cache)):
        # No usable cache: index the videos now and optionally cache them.
        if args.distributed:
            print("It is recommended to pre-compute the dataset cache "
                  "on a single-gpu first, as it will be faster")
        dataset = torchvision.datasets.Kinetics400(
            train_root,
            frames_per_clip=args.clip_len,
            step_between_clips=1,
            transform=transform_train,
            frame_rate=15)
        if args.cache_dataset:
            print("Saving dataset_train to {}".format(train_cache))
            utils.mkdir(os.path.dirname(train_cache))
            utils.save_on_master((dataset, train_root), train_cache)
    else:
        print("Loading dataset_train from {}".format(train_cache))
        dataset, _ = torch.load(train_cache)
        # The cached object carries its own transform; install the current one.
        dataset.transform = transform_train

    print("Took", time.time() - load_started)

    print("Loading validation data")
    val_cache = _get_cache_path(val_root)
    transform_test = torchvision.transforms.Compose([
        T.ToFloatTensorInZeroOne(),
        T.Resize((128, 171)),
        normalize,
        T.CenterCrop((112, 112))
    ])

    if not (args.cache_dataset and os.path.exists(val_cache)):
        if args.distributed:
            print("It is recommended to pre-compute the dataset cache "
                  "on a single-gpu first, as it will be faster")
        dataset_test = torchvision.datasets.Kinetics400(
            val_root,
            frames_per_clip=args.clip_len,
            step_between_clips=1,
            transform=transform_test,
            frame_rate=15)
        if args.cache_dataset:
            print("Saving dataset_test to {}".format(val_cache))
            utils.mkdir(os.path.dirname(val_cache))
            utils.save_on_master((dataset_test, val_root), val_cache)
    else:
        print("Loading dataset_test from {}".format(val_cache))
        dataset_test, _ = torch.load(val_cache)
        dataset_test.transform = transform_test

    # ---- samplers and loaders --------------------------------------------
    print("Creating data loaders")
    train_sampler = RandomClipSampler(dataset.video_clips,
                                      args.clips_per_video)
    test_sampler = UniformClipSampler(dataset_test.video_clips,
                                      args.clips_per_video)
    if args.distributed:
        train_sampler = DistributedSampler(train_sampler)
        test_sampler = DistributedSampler(test_sampler)

    # Both loaders share every option except the dataset and its sampler.
    shared_loader_options = dict(batch_size=args.batch_size,
                                 num_workers=args.workers,
                                 pin_memory=True,
                                 collate_fn=collate_fn)
    data_loader = torch.utils.data.DataLoader(dataset,
                                              sampler=train_sampler,
                                              **shared_loader_options)
    data_loader_test = torch.utils.data.DataLoader(dataset_test,
                                                   sampler=test_sampler,
                                                   **shared_loader_options)

    # ---- model, loss, optimizer, schedule --------------------------------
    print("Creating model")
    build_model = torchvision.models.video.__dict__[args.model]
    model = build_model(pretrained=args.pretrained)
    model.to(device)
    if args.distributed and args.sync_bn:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    criterion = nn.CrossEntropyLoss()

    # The base learning rate is multiplied by the world size.
    scaled_lr = args.lr * args.world_size
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=scaled_lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    if args.apex:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.apex_opt_level)

    # convert scheduler to be per iteration, not per epoch, for warmup that
    # lasts between different epochs
    iters_per_epoch = len(data_loader)
    warmup_iters = args.lr_warmup_epochs * iters_per_epoch
    lr_milestones = [iters_per_epoch * m for m in args.lr_milestones]
    lr_scheduler = WarmupMultiStepLR(optimizer,
                                     milestones=lr_milestones,
                                     gamma=args.lr_gamma,
                                     warmup_iters=warmup_iters,
                                     warmup_factor=1e-5)

    # Keep a handle on the unwrapped model for state-dict save/load.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.resume:
        restored = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(restored['model'])
        optimizer.load_state_dict(restored['optimizer'])
        lr_scheduler.load_state_dict(restored['lr_scheduler'])
        # The checkpoint records the last finished epoch; continue after it.
        args.start_epoch = restored['epoch'] + 1

    if args.test_only:
        evaluate(model, criterion, data_loader_test, device=device)
        return

    # ---- training loop ---------------------------------------------------
    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        train_one_epoch(model, criterion, optimizer, lr_scheduler,
                        data_loader, device, epoch, args.print_freq,
                        args.apex)
        evaluate(model, criterion, data_loader_test, device=device)
        if not args.output_dir:
            continue
        checkpoint = {
            'model': model_without_ddp.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch,
            'args': args
        }
        # One per-epoch snapshot plus a rolling "latest" file.
        for filename in ('model_{}.pth'.format(epoch), 'checkpoint.pth'):
            utils.save_on_master(checkpoint,
                                 os.path.join(args.output_dir, filename))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    """Train a detection model on COCO and log metrics after every epoch.

    Each epoch runs ``utils.train_one_epoch`` and ``utils.evaluate``, appends
    one line of metrics to a timestamped ``results*.txt`` file in the current
    directory and saves a checkpoint under ``./save_weights``. After training,
    the loss/learning-rate and mAP curves are plotted.

    Args:
        args: Namespace of run options. Attributes read here: ``device``,
            ``data_path``, ``batch_size``, ``num_classes``, ``lr``,
            ``momentum``, ``weight_decay``, ``amp``, ``lr_steps``,
            ``lr_gamma``, ``resume``, ``start_epoch`` and ``epochs``.
            ``args.start_epoch`` is overwritten when resuming.
    """
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")
    print("Using {} device training.".format(device.type))

    # File that collects the per-epoch COCO metrics.
    results_file = "results{}.txt".format(
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    data_transform = {
        "train": transforms.Compose([transforms.ToTensor(),
                                     transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    COCO_root = args.data_path

    # load train data set
    # coco2017 -> annotations -> instances_train2017.json
    train_data_set = CocoDetection(COCO_root, "train", data_transform["train"])

    # A custom collate_fn is used because each sample holds an image and its
    # targets, which the default batching cannot combine.
    batch_size = args.batch_size
    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to 1 so min() never compares None with an int.
    nw = min([os.cpu_count() or 1,
              batch_size if batch_size > 1 else 0,
              8])  # number of workers
    print('Using %g dataloader workers' % nw)
    train_data_loader = torch.utils.data.DataLoader(
        train_data_set,
        batch_size=batch_size,
        shuffle=True,
        pin_memory=True,
        num_workers=nw,
        collate_fn=train_data_set.collate_fn)

    # load validation data set
    # coco2017 -> annotations -> instances_val2017.json
    val_data_set = CocoDetection(COCO_root, "val", data_transform["val"])
    val_data_set_loader = torch.utils.data.DataLoader(
        val_data_set,
        batch_size=batch_size,
        shuffle=False,
        pin_memory=True,
        num_workers=nw,
        collate_fn=train_data_set.collate_fn)

    # create model; num_classes equals background + 80 classes
    model = create_model(num_classes=args.num_classes + 1)
    # print(model)

    model.to(device)

    train_loss = []
    learning_rate = []
    val_map = []

    # define optimizer over the trainable parameters only
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    scaler = torch.cuda.amp.GradScaler() if args.amp else None

    # learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    # If a resume path is given, continue training from that checkpoint.
    if args.resume:
        # If map_location is missing, torch.load will first load the module to CPU
        # and then copy each parameter to where it was saved,
        # which would result in all processes on the same machine using the same set of devices.
        # The checkpoint holds the weights plus optimizer and scheduler state.
        checkpoint = torch.load(args.resume, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1
        if args.amp and "scaler" in checkpoint:
            scaler.load_state_dict(checkpoint["scaler"])

    # torch.save does not create missing directories; make sure the
    # checkpoint folder exists before the first (expensive) epoch runs.
    os.makedirs("./save_weights", exist_ok=True)

    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch, printing every 50 iterations
        mean_loss, lr = utils.train_one_epoch(model, optimizer,
                                              train_data_loader, device,
                                              epoch, print_freq=50,
                                              warmup=True, scaler=scaler)
        train_loss.append(mean_loss.item())
        learning_rate.append(lr)

        # update the learning rate
        lr_scheduler.step()

        # evaluate on the test dataset
        coco_info = utils.evaluate(model, val_data_set_loader, device=device)

        # write into txt
        with open(results_file, "a") as f:
            # One line per epoch: COCO metrics, mean loss, learning rate.
            result_info = [str(round(i, 4))
                           for i in coco_info + [mean_loss.item()]
                           ] + [str(round(lr, 6))]
            txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
            f.write(txt + "\n")

        val_map.append(coco_info[1])  # pascal mAP

        # save weights
        save_files = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_scheduler': lr_scheduler.state_dict(),
            'epoch': epoch}
        if args.amp:
            save_files["scaler"] = scaler.state_dict()
        torch.save(save_files, "./save_weights/model_{}.pth".format(epoch))

    # plot loss and lr curve
    if train_loss and learning_rate:
        from plot_curve import plot_loss_and_lr
        plot_loss_and_lr(train_loss, learning_rate)

    # plot mAP curve
    if val_map:
        from plot_curve import plot_map
        plot_map(val_map)
def main():
    """Train, or only evaluate, an image re-ID model with label-smoothed cross-entropy.

    All settings are read from the module-level ``args`` namespace. The
    function seeds torch, redirects ``sys.stdout`` to a ``Logger`` file in
    ``args.save_dir``, builds the train/query/gallery loaders (optionally
    LMDB-backed), optionally loads pretrained weights and/or a resume
    checkpoint, and then either evaluates once (``args.evaluate``) or trains.
    Training may start with ``args.fixbase_epoch`` warm-up epochs that only
    optimize ``model.classifier`` before all layers are trained.

    Returns:
        None.
    """
    torch.manual_seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    use_gpu = torch.cuda.is_available()
    if args.use_cpu:
        use_gpu = False
    # Checkpoints saved from CUDA must be remapped when no GPU is in use,
    # otherwise torch.load raises; with a GPU keep torch.load's default.
    load_location = None if use_gpu else 'cpu'

    # From here on everything printed goes through the Logger object.
    if not args.evaluate:
        sys.stdout = Logger(osp.join(args.save_dir, 'log_train.txt'))
    else:
        sys.stdout = Logger(osp.join(args.save_dir, 'log_test.txt'))
    print("==========\nArgs:{}\n==========".format(args))

    if use_gpu:
        print("Currently using GPU {}".format(args.gpu_devices))
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(args.seed)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    print("Initializing dataset {}".format(args.dataset))
    dataset = data_manager.init_imgreid_dataset(
        root=args.root,
        name=args.dataset,
        split_id=args.split_id,
        cuhk03_labeled=args.cuhk03_labeled,
        cuhk03_classic_split=args.cuhk03_classic_split,
        use_lmdb=args.use_lmdb,
    )

    transform_train = T.Compose([
        T.Random2DTranslation(args.height, args.width),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    transform_test = T.Compose([
        T.Resize((args.height, args.width)),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Pinned host memory is only requested when batches will go to a GPU.
    pin_memory = use_gpu

    trainloader = DataLoader(
        ImageDataset(dataset.train,
                     transform=transform_train,
                     use_lmdb=args.use_lmdb,
                     lmdb_path=dataset.train_lmdb_path),
        batch_size=args.train_batch,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=True,
    )

    queryloader = DataLoader(
        ImageDataset(dataset.query,
                     transform=transform_test,
                     use_lmdb=args.use_lmdb,
                     lmdb_path=dataset.query_lmdb_path),
        batch_size=args.test_batch,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=False,
    )

    galleryloader = DataLoader(
        ImageDataset(dataset.gallery,
                     transform=transform_test,
                     use_lmdb=args.use_lmdb,
                     lmdb_path=dataset.gallery_lmdb_path),
        batch_size=args.test_batch,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=False,
    )

    print("Initializing model: {}".format(args.arch))
    model = models.init_model(name=args.arch,
                              num_classes=dataset.num_train_pids,
                              loss={'xent'},
                              use_gpu=use_gpu)
    print("Model size: {:.3f} M".format(count_num_param(model)))

    criterion = CrossEntropyLabelSmooth(num_classes=dataset.num_train_pids,
                                        use_gpu=use_gpu)
    optimizer = init_optim(args.optim, model.parameters(), args.lr,
                           args.weight_decay)
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=args.stepsize,
                                         gamma=args.gamma)

    # Warm-up phase: a temporary optimizer over the classifier only. If the
    # model has no classifier module, the warm-up is disabled.
    if args.fixbase_epoch > 0:
        if hasattr(model, 'classifier') and isinstance(model.classifier,
                                                       nn.Module):
            optimizer_tmp = init_optim(args.optim,
                                       model.classifier.parameters(),
                                       args.fixbase_lr, args.weight_decay)
        else:
            print(
                "Warn: model has no attribute 'classifier' and fixbase_epoch is reset to 0"
            )
            args.fixbase_epoch = 0

    if args.load_weights:
        # load pretrained weights but ignore layers that don't match in size
        print("Loading pretrained weights from '{}'".format(args.load_weights))
        checkpoint = torch.load(args.load_weights, map_location=load_location)
        pretrain_dict = checkpoint['state_dict']
        model_dict = model.state_dict()
        pretrain_dict = {
            k: v
            for k, v in pretrain_dict.items()
            if k in model_dict and model_dict[k].size() == v.size()
        }
        model_dict.update(pretrain_dict)
        model.load_state_dict(model_dict)

    if args.resume:
        if osp.isfile(args.resume):
            checkpoint = torch.load(args.resume, map_location=load_location)
            model.load_state_dict(checkpoint['state_dict'])
            # NOTE(review): checkpoints written below store the 0-based index
            # of the epoch that just finished, so resuming repeats that epoch.
            args.start_epoch = checkpoint['epoch']
            rank1 = checkpoint['rank1']
            print("Loaded checkpoint from '{}'".format(args.resume))
            print("- start_epoch: {}\n- rank1: {}".format(
                args.start_epoch, rank1))
        else:
            # A missing file is reported but does not abort the run.
            print("=> No checkpoint found at '{}'".format(args.resume))

    if use_gpu:
        model = nn.DataParallel(model).cuda()

    if args.evaluate:
        print("Evaluate only")
        distmat = test(model,
                       queryloader,
                       galleryloader,
                       use_gpu,
                       return_distmat=True)
        if args.vis_ranked_res:
            visualize_ranked_results(
                distmat,
                dataset,
                save_dir=osp.join(args.save_dir, 'ranked_results'),
                topk=20,
            )
        return

    start_time = time.time()
    train_time = 0
    best_rank1 = -np.inf
    best_epoch = 0
    print("==> Start training")

    if args.fixbase_epoch > 0:
        print(
            "Train classifier for {} epochs while keeping base network frozen".
            format(args.fixbase_epoch))

        for epoch in range(args.fixbase_epoch):
            start_train_time = time.time()
            train(epoch,
                  model,
                  criterion,
                  optimizer_tmp,
                  trainloader,
                  use_gpu,
                  freeze_bn=True)
            train_time += round(time.time() - start_train_time)

        del optimizer_tmp
        print("Now open all layers for training")

    for epoch in range(args.start_epoch, args.max_epoch):
        start_train_time = time.time()
        train(epoch, model, criterion, optimizer, trainloader, use_gpu)
        train_time += round(time.time() - start_train_time)

        scheduler.step()

        # Evaluate every eval_step epochs once past start_eval, and always
        # after the final epoch.
        periodic_eval = ((epoch + 1) > args.start_eval
                         and args.eval_step > 0
                         and (epoch + 1) % args.eval_step == 0)
        if periodic_eval or (epoch + 1) == args.max_epoch:
            print("==> Test")
            rank1 = test(model, queryloader, galleryloader, use_gpu)
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1
                best_epoch = epoch + 1

            # On GPU the model is wrapped in DataParallel; save the inner
            # module's weights so the keys carry no wrapper prefix.
            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()
            save_checkpoint(
                {
                    'state_dict': state_dict,
                    'rank1': rank1,
                    'epoch': epoch,
                }, is_best,
                osp.join(args.save_dir,
                         'checkpoint_ep' + str(epoch + 1) + '.pth.tar'))

    print("==> Best Rank-1 {:.1%}, achieved at epoch {}".format(
        best_rank1, best_epoch))

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    train_time = str(datetime.timedelta(seconds=train_time))
    print(
        "Finished. Total elapsed time (h:m:s): {}. Training time (h:m:s): {}.".
        format(elapsed, train_time))
def main(args):
    """Train a detection model on PASCAL VOC 2012, optionally distributed.

    Each epoch runs ``utils.train_one_epoch`` and ``utils.evaluate``. Ranks
    -1 and 0 append the metrics to a timestamped ``results*.txt`` file, and a
    checkpoint is written through ``save_on_master`` when ``args.output_dir``
    is set. After training, ranks -1 and 0 plot the loss/learning-rate and mAP
    curves.

    Args:
        args: Namespace of run options. Attributes read here: ``device``,
            ``data_path``, ``distributed``, ``aspect_ratio_group_factor``,
            ``batch_size``, ``workers``, ``gpu``, ``lr``, ``momentum``,
            ``weight_decay``, ``lr_steps``, ``lr_gamma``, ``resume``,
            ``test_only``, ``start_epoch``, ``epochs``, ``print_freq``,
            ``rank`` and ``output_dir``. ``args.start_epoch`` is overwritten
            when resuming.

    Raises:
        FileNotFoundError: If ``<args.data_path>/VOCdevkit`` does not exist.
    """
    init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    # File that collects the per-epoch COCO-style metrics.
    results_file = "results{}.txt".format(
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    # Data loading code
    print("Loading data")

    data_transform = {
        "train": transforms.Compose(
            [transforms.ToTensor(), transforms.RandomHorizontalFlip(0.5)]),
        "val": transforms.Compose([transforms.ToTensor()])
    }

    VOC_root = args.data_path
    # check voc root
    if not os.path.exists(os.path.join(VOC_root, "VOCdevkit")):
        raise FileNotFoundError(
            "VOCdevkit dose not in path:'{}'.".format(VOC_root))

    # load train data set
    # VOCdevkit -> VOC2012 -> ImageSets -> Main -> train.txt
    train_data_set = VOC2012DataSet(VOC_root, data_transform["train"],
                                    "train.txt")

    # load validation data set
    # VOCdevkit -> VOC2012 -> ImageSets -> Main -> val.txt
    val_data_set = VOC2012DataSet(VOC_root, data_transform["val"], "val.txt")

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_data_set)
        test_sampler = torch.utils.data.distributed.DistributedSampler(
            val_data_set)
    else:
        train_sampler = torch.utils.data.RandomSampler(train_data_set)
        test_sampler = torch.utils.data.SequentialSampler(val_data_set)

    if args.aspect_ratio_group_factor >= 0:
        # Bin every training image by aspect ratio so that batches are drawn
        # from within one bin.
        group_ids = create_aspect_ratio_groups(
            train_data_set, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids,
                                                  args.batch_size)
    else:
        train_batch_sampler = torch.utils.data.BatchSampler(train_sampler,
                                                            args.batch_size,
                                                            drop_last=True)

    data_loader = torch.utils.data.DataLoader(
        train_data_set,
        batch_sampler=train_batch_sampler,
        num_workers=args.workers,
        collate_fn=train_data_set.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(
        val_data_set,
        batch_size=1,
        sampler=test_sampler,
        num_workers=args.workers,
        collate_fn=train_data_set.collate_fn)

    print("Creating model")
    model = create_model(num_classes=21, device=device)
    model.to(device)

    # Keep a handle on the unwrapped model for state-dict save/load.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.lr_steps, gamma=args.lr_gamma)

    # If a resume path is given, continue training from that checkpoint.
    if args.resume:
        # If map_location is missing, torch.load will first load the module to CPU
        # and then copy each parameter to where it was saved,
        # which would result in all processes on the same machine using the same set of devices.
        # The checkpoint holds the weights plus optimizer and scheduler state.
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    if args.test_only:
        utils.evaluate(model, data_loader_test, device=device)
        return

    # The checkpoint writer does not create missing directories; make sure
    # the target exists before the first (expensive) epoch runs. exist_ok
    # keeps this safe when several processes reach this line.
    if args.output_dir:
        os.makedirs(args.output_dir, exist_ok=True)

    train_loss = []
    learning_rate = []
    val_map = []

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        mean_loss, lr = utils.train_one_epoch(model,
                                              optimizer,
                                              data_loader,
                                              device,
                                              epoch,
                                              args.print_freq,
                                              warmup=True)
        train_loss.append(mean_loss.item())
        learning_rate.append(lr)

        # update learning rate
        lr_scheduler.step()

        # evaluate after every epoch
        coco_info = utils.evaluate(model, data_loader_test, device=device)
        val_map.append(coco_info[1])  # pascal mAP

        # Only ranks -1 and 0 write the results file.
        if args.rank in [-1, 0]:
            # write into txt
            with open(results_file, "a") as f:
                # One line per epoch: metrics, mean loss, learning rate.
                result_info = [
                    str(round(i, 4))
                    for i in coco_info + [mean_loss.item(), lr]
                ]
                txt = "epoch:{} {}".format(epoch, ' '.join(result_info))
                f.write(txt + "\n")

        if args.output_dir:
            # Weights are saved through save_on_master.
            save_on_master(
                {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'args': args,
                    'epoch': epoch
                }, os.path.join(args.output_dir,
                                'model_{}.pth'.format(epoch)))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))

    if args.rank in [-1, 0]:
        # plot loss and lr curve
        if train_loss and learning_rate:
            from plot_curve import plot_loss_and_lr
            plot_loss_and_lr(train_loss, learning_rate)

        # plot mAP curve
        if val_map:
            from plot_curve import plot_map
            plot_map(val_map)
def main():
    """Train a video re-ID model configured through the module-level ``cfg``.

    The run writes into a fresh timestamped sub-directory of
    ``cfg.OUTPUT_DIR`` (``cfg.OUTPUT_DIR`` itself is overwritten with that
    path), seeds torch/random/numpy from ``cfg.RANDOM_SEED``, redirects
    ``sys.stdout`` to a ``Logger`` file, builds the train/query/gallery
    loaders and then trains for ``cfg.SOLVER.MAX_EPOCHS`` epochs with a
    label-smoothed cross-entropy loss and a triplet loss. Weights are written
    only after an evaluation whose rank-1 beats the best seen so far.

    NOTE(review): the original indentation of this function was lost; the
    nesting below is a reconstruction. The spots marked NOTE(review) are the
    ones where more than one nesting is plausible -- confirm against the
    upstream file.
    """
    # Timestamp that names this run's output directory.
    runId = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    cfg.OUTPUT_DIR = os.path.join(cfg.OUTPUT_DIR, runId)
    if not os.path.exists(cfg.OUTPUT_DIR):
        # NOTE(review): os.mkdir creates a single level only, so this raises
        # if the parent cfg.OUTPUT_DIR does not exist yet.
        os.mkdir(cfg.OUTPUT_DIR)
    print(cfg.OUTPUT_DIR)
    torch.manual_seed(cfg.RANDOM_SEED)
    random.seed(cfg.RANDOM_SEED)
    np.random.seed(cfg.RANDOM_SEED)
    os.environ['CUDA_VISIBLE_DEVICES'] = cfg.MODEL.DEVICE_ID

    # GPU mode needs both CUDA availability and cfg.MODEL.DEVICE == "cuda".
    use_gpu = torch.cuda.is_available() and cfg.MODEL.DEVICE == "cuda"
    # From here on everything printed goes through the Logger object.
    if not cfg.EVALUATE_ONLY:
        sys.stdout = Logger(osp.join(cfg.OUTPUT_DIR, 'log_train.txt'))
    else:
        sys.stdout = Logger(osp.join(cfg.OUTPUT_DIR, 'log_test.txt'))

    print("==========\nConfigs:{}\n==========".format(cfg))

    if use_gpu:
        print("Currently using GPU {}".format(cfg.MODEL.DEVICE_ID))
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(cfg.RANDOM_SEED)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    print("Initializing dataset {}".format(cfg.DATASETS.NAME))

    dataset = data_manager.init_dataset(root=cfg.DATASETS.ROOT_DIR,
                                        name=cfg.DATASETS.NAME)
    print("Initializing model: {}".format(cfg.MODEL.NAME))

    # NOTE(review): it is not recoverable whether the init_model call sits
    # inside this branch or after it. If it is inside, ``model`` is undefined
    # for any other cfg.MODEL.ARCH value.
    if cfg.MODEL.ARCH == 'video_baseline':
        torch.backends.cudnn.benchmark = False
        # num_classes is hard-coded to 625 rather than taken from the dataset.
        model = models.init_model(name=cfg.MODEL.ARCH,
                                  num_classes=625,
                                  pretrain_choice=cfg.MODEL.PRETRAIN_CHOICE,
                                  last_stride=cfg.MODEL.LAST_STRIDE,
                                  neck=cfg.MODEL.NECK,
                                  model_name=cfg.MODEL.NAME,
                                  neck_feat=cfg.TEST.NECK_FEAT,
                                  model_path=cfg.MODEL.PRETRAIN_PATH)
    print("Model size: {:.5f}M".format(
        sum(p.numel() for p in model.parameters()) / 1000000.0))

    # Training-time augmentation pipeline.
    transform_train = T.Compose([
        T.Resize(cfg.INPUT.SIZE_TRAIN),
        T.RandomHorizontalFlip(p=cfg.INPUT.PROB),
        T.Pad(cfg.INPUT.PADDING),
        T.RandomCrop(cfg.INPUT.SIZE_TRAIN),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        T.RandomErasing(probability=cfg.INPUT.RE_PROB,
                        mean=cfg.INPUT.PIXEL_MEAN)
    ])
    # Evaluation pipeline: resize and normalize only.
    transform_test = T.Compose([
        T.Resize(cfg.INPUT.SIZE_TEST),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    pin_memory = True if use_gpu else False

    # Overrides whatever worker count the config specified: every loader
    # below is built with num_workers=0.
    cfg.DATALOADER.NUM_WORKERS = 0

    trainloader = DataLoader(VideoDataset(
        dataset.train,
        seq_len=cfg.DATASETS.SEQ_LEN,
        sample=cfg.DATASETS.TRAIN_SAMPLE_METHOD,
        transform=transform_train,
        dataset_name=cfg.DATASETS.NAME),
                             sampler=RandomIdentitySampler(
                                 dataset.train,
                                 num_instances=cfg.DATALOADER.NUM_INSTANCE),
                             batch_size=cfg.SOLVER.SEQS_PER_BATCH,
                             num_workers=cfg.DATALOADER.NUM_WORKERS,
                             pin_memory=pin_memory,
                             drop_last=True)

    queryloader = DataLoader(VideoDataset(
        dataset.query,
        seq_len=cfg.DATASETS.SEQ_LEN,
        sample=cfg.DATASETS.TEST_SAMPLE_METHOD,
        transform=transform_test,
        max_seq_len=cfg.DATASETS.TEST_MAX_SEQ_NUM,
        dataset_name=cfg.DATASETS.NAME),
                             batch_size=cfg.TEST.SEQS_PER_BATCH,
                             shuffle=False,
                             num_workers=cfg.DATALOADER.NUM_WORKERS,
                             pin_memory=pin_memory,
                             drop_last=False)

    galleryloader = DataLoader(
        VideoDataset(dataset.gallery,
                     seq_len=cfg.DATASETS.SEQ_LEN,
                     sample=cfg.DATASETS.TEST_SAMPLE_METHOD,
                     transform=transform_test,
                     max_seq_len=cfg.DATASETS.TEST_MAX_SEQ_NUM,
                     dataset_name=cfg.DATASETS.NAME),
        batch_size=cfg.TEST.SEQS_PER_BATCH,
        shuffle=False,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
        pin_memory=pin_memory,
        drop_last=False,
    )

    # NOTE(review): the nesting of the next five statements is a best guess.
    # As reconstructed, the DataParallel wrap and the .cuda() move only happen
    # when cfg.MODEL.SYN_BN is set, while the checkpoint code further down
    # reads ``model.module`` whenever use_gpu is true -- confirm.
    if cfg.MODEL.SYN_BN:
        if use_gpu:
            model = nn.DataParallel(model)
        if cfg.SOLVER.FP_16:
            model = apex.parallel.convert_syncbn_model(model)
        model.cuda()

    start_time = time.time()
    # Losses: label-smoothed cross-entropy and triplet loss.
    xent = CrossEntropyLabelSmooth(num_classes=dataset.num_train_pids)
    tent = TripletLoss(cfg.SOLVER.MARGIN)

    optimizer = make_optimizer(cfg, model)

    scheduler = WarmupMultiStepLR(optimizer, cfg.SOLVER.STEPS,
                                  cfg.SOLVER.GAMMA, cfg.SOLVER.WARMUP_FACTOR,
                                  cfg.SOLVER.WARMUP_ITERS,
                                  cfg.SOLVER.WARMUP_METHOD)
    # metrics = test(model, queryloader, galleryloader, cfg.TEST.TEMPORAL_POOL_METHOD, use_gpu)
    no_rise = 0  # evaluations since rank-1 last improved
    best_rank1 = 0
    start_epoch = 0
    for epoch in range(start_epoch, cfg.SOLVER.MAX_EPOCHS):
        # if no_rise == 10:
        #     break
        # The scheduler is stepped before the epoch's training pass.
        scheduler.step()
        print("noriase:", no_rise)
        print("==> Epoch {}/{}".format(epoch + 1, cfg.SOLVER.MAX_EPOCHS))
        print("current lr:", scheduler.get_lr()[0])

        train(model, trainloader, xent, tent, optimizer, use_gpu)
        # Evaluate every EVAL_PERIOD epochs and after the last epoch, but
        # never when EVAL_PERIOD is not positive.
        if cfg.SOLVER.EVAL_PERIOD > 0 and (
            (epoch + 1) % cfg.SOLVER.EVAL_PERIOD == 0 or
            (epoch + 1) == cfg.SOLVER.MAX_EPOCHS):
            print("==> Test")

            metrics = test(model, queryloader, galleryloader,
                           cfg.TEST.TEMPORAL_POOL_METHOD, use_gpu)
            rank1 = metrics[0]
            if rank1 > best_rank1:
                best_rank1 = rank1
                no_rise = 0
            else:
                # No improvement: count it and skip the checkpoint below.
                no_rise += 1
                continue

            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()
            # The file name carries the rank-1 value and the 1-based epoch.
            torch.save(
                state_dict,
                osp.join(
                    cfg.OUTPUT_DIR, "rank1_" + str(rank1) + '_checkpoint_ep' +
                    str(epoch + 1) + '.pth'))
            # best_p = osp.join(cfg.OUTPUT_DIR, "rank1_" + str(rank1) + '_checkpoint_ep' + str(epoch + 1) + '.pth')

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    print("Finished. Total elapsed time (h:m:s): {}".format(elapsed))
data = np.load(os.path.join(preprocesspath, fname))-pixmean pixvlu += np.sum(data * data) pixstd = np.sqrt(pixvlu / float(npix)) # pixstd /= 255 print(pixmean, pixstd) print('mean '+str(pixmean)+' std '+str(pixstd)) # Datatransforms print('==> Preparing data..') # Random Crop, Zero out, x z flip, scale, transform_test = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((pixmean), (pixstd)), ]) transform_train = transforms.Compose([ # transforms.RandomScale(range(28, 38)), transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.RandomYFlip(), transforms.RandomZFlip(), transforms.ZeroOut(4), transforms.ToTensor(), transforms.Normalize((pixmean), (pixstd)), # need to cal mean and std, revise norm func ]) from dataloadernp import lunanod import pandas as pd import logging # fold = 1 # gbtdepth = 3 savemodelpath = './detcls-'+str(fold)+'new/' if not os.path.isdir(savemodelpath): os.mkdir(savemodelpath) logging.basicConfig(filename=savemodelpath+'detclslog-'+str(fold), level=logging.INFO)
def main():
    """Train and/or evaluate the image re-identification model described by ``config``.

    Reads every setting from the module-level ``config`` object and the
    module-level ``data_mean`` / ``data_std`` values. The function:

    1. seeds torch, selects the visible GPUs and redirects ``sys.stdout``
       to a ``Logger``;
    2. builds the training loader (driven by ``ClassSampler``) and the
       query / gallery evaluation loaders;
    3. builds the model, the loss selected by ``config.loss_type``, the
       optimizer and a ``MultiStepLR`` scheduler;
    4. optionally restores shape-compatible weights from ``config.resume``
       (this also overwrites ``config.start_epoch``);
    5. either runs a single evaluation (``config.evaluate``) or trains for
       ``config.max_epoch`` epochs, evaluating periodically and always
       after the final epoch, and saving a checkpoint after each
       evaluation.

    Raises:
        KeyError: if ``config.loss_type`` is not one of ``'xent'``,
            ``'xent_triplet'`` or ``'xent_tripletv2'``.
    """
    torch.manual_seed(1)
    os.environ['CUDA_VISIBLE_DEVICES'] = config.gpu_devices
    use_gpu = torch.cuda.is_available()
    sys.stdout = Logger(config.save_dir, config.checkpoint_suffix, config.evaluate)
    print("\n==========\nArgs:")
    config.print_parameter()
    print("==========\n")

    if use_gpu:
        print("Currently using GPU {}".format(config.gpu_devices))
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(1)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    print("Initializing dataset {}".format(config.dataset))
    dataset = data_manager.init_imgreid_dataset(name=config.dataset, root=config.data_root)

    pin_memory = use_gpu

    def make_eval_loader(samples, data_type):
        # All evaluation loaders share the same sequential, keep-everything settings.
        return DataLoader(
            ImageDatasetWCL(samples, data_type=data_type, merge_h=256, merge_w=256,
                            mean_std=[data_mean, data_std]),
            batch_size=config.test_batch,
            shuffle=False,
            num_workers=config.workers,
            pin_memory=pin_memory,
            drop_last=False,
        )

    train_batch_sampler = ClassSampler(dataset.train,
                                       sample_cls_cnt=config.sample_cls_cnt,
                                       each_cls_cnt=config.each_cls_cnt)
    # NOTE(review): the training dataset receives the whole `dataset` object
    # while the sampler and the evaluation loaders receive individual splits --
    # confirm that ImageDatasetWCL expects this for data_type='train'.
    trainloader = DataLoader(
        ImageDatasetWCL(dataset, data_type='train', merge_h=256, merge_w=256,
                        mean_std=[data_mean, data_std]),
        batch_sampler=train_batch_sampler,
        num_workers=config.workers,
        pin_memory=pin_memory,
    )
    queryloader = make_eval_loader(dataset.query, 'query')
    galleryloader = make_eval_loader(dataset.gallery, 'gallery')

    if config.dataset == 'vehicleid':
        # The train_query / train_gallery splits are not touched for this dataset.
        train_query_loader = None
        train_gallery_loader = None
    else:
        train_query_loader = make_eval_loader(dataset.train_query, 'train_query')
        train_gallery_loader = make_eval_loader(dataset.train_gallery, 'train_gallery')

    print("Initializing model: {}".format(config.arch))
    model = init_model(name=config.arch, num_classes=dataset.num_train_pids,
                       loss_type=config.loss_type)
    print("Model size: {:.3f} M".format(count_num_param(model)))

    if config.loss_type == 'xent':
        criterion = [nn.CrossEntropyLoss(), nn.CrossEntropyLoss()]
    elif config.loss_type == 'xent_triplet':
        criterion = XentTripletLoss(
            margin=config.margin,
            triplet_selector=RandomNegativeTripletSelector(margin=config.margin),
            each_cls_cnt=config.each_cls_cnt,
            n_class=config.sample_cls_cnt)
    elif config.loss_type == 'xent_tripletv2':
        criterion = XentTripletLossV2(
            margin=config.margin,
            triplet_selector=RandomNegativeTripletSelectorV2(margin=config.margin),
            each_cls_cnt=config.each_cls_cnt,
            n_class=config.sample_cls_cnt)
    else:
        raise KeyError("Unsupported loss: {}".format(config.loss_type))

    optimizer = init_optim(config.optim, model.parameters(), config.lr, config.weight_decay)
    scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=config.stepsize,
                                         gamma=config.gamma)

    if config.resume is not None:
        if check_isfile(config.resume):
            checkpoint = torch.load(config.resume)
            model_dict = model.state_dict()
            # Only restore tensors whose name and shape match the current model,
            # so a checkpoint with a different head can still be loaded.
            pretrain_dict = {
                k: v
                for k, v in checkpoint['state_dict'].items()
                if k in model_dict and model_dict[k].size() == v.size()
            }
            model_dict.update(pretrain_dict)
            model.load_state_dict(model_dict)
            config.start_epoch = checkpoint['epoch']
            rank1 = checkpoint['rank1']
            mAP = checkpoint.get('mAP', 0)
            print("Loaded checkpoint from '{}'".format(config.resume))
            print("- start_epoch: {}\n- rank1: {}\n- mAP: {}".format(
                config.start_epoch, rank1, mAP))

    if use_gpu:
        model = nn.DataParallel(model).cuda()

    if config.evaluate:
        print("Evaluate only")
        test_model(model, queryloader, galleryloader, train_query_loader,
                   train_gallery_loader, use_gpu, config.test_batch,
                   config.loss_type, config.euclidean_distance_loss)
        return

    start_time = time.time()
    train_time = 0
    best_rank1 = -np.inf
    best_map = 0
    best_epoch = 0

    for epoch in range(config.start_epoch, config.max_epoch):
        print("==> Start training")
        start_train_time = time.time()
        # NOTE(review): the scheduler is stepped before the epoch's optimizer
        # updates. PyTorch >= 1.1 documents the opposite order; confirm the
        # targeted PyTorch version before moving this call.
        scheduler.step()
        print('epoch:', epoch, 'lr:', scheduler.get_lr())
        train(epoch, model, criterion, optimizer, trainloader,
              config.loss_type, config.print_freq)
        train_time += round(time.time() - start_train_time)

        is_periodic_eval = (epoch >= config.start_eval
                            and config.eval_step > 0
                            and epoch % config.eval_step == 0)
        # `epoch` never reaches config.max_epoch inside range(), so the final
        # epoch has to be detected with epoch + 1.
        is_last_epoch = (epoch + 1) == config.max_epoch
        if is_periodic_eval or is_last_epoch:
            print("==> Test")
            rank1, mAP = test_model(model, queryloader, galleryloader,
                                    train_query_loader, train_gallery_loader,
                                    use_gpu, config.test_batch, config.loss_type,
                                    config.euclidean_distance_loss)
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1
                best_map = mAP
                best_epoch = epoch + 1

            # Save the unwrapped weights, without the DataParallel `module.` prefix.
            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()
            save_checkpoint(
                {
                    'state_dict': state_dict,
                    'rank1': rank1,
                    'mAP': mAP,
                    'epoch': epoch + 1,
                },
                is_best,
                use_gpu_suo=False,
                fpath=osp.join(
                    config.save_dir,
                    'checkpoint_ep' + str(epoch + 1) + config.checkpoint_suffix + '.pth.tar'))

    print("==> Best Rank-1 {:.2%}, mAP {:.2%}, achieved at epoch {}".format(
        best_rank1, best_map, best_epoch))

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    train_time = str(datetime.timedelta(seconds=train_time))
    print(
        "Finished. Total elapsed time (h:m:s): {}. Training time (h:m:s): {}.".
        format(elapsed, train_time))
def get_transform(train):
    """Build the transform pipeline applied to each sample.

    The pipeline always starts with ``T.ToTensor()``. When ``train`` is
    truthy, ``T.RandomHorizontalFlip(0.5)`` is appended after it.

    Args:
        train: truthy to include the random horizontal flip.

    Returns:
        The ``T.Compose`` object wrapping the selected transforms.
    """
    pipeline = [T.ToTensor()]
    if train:
        pipeline.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(pipeline)
def train(self):
    """Configure and launch one training run for ``self.model_name``.

    Sets up the run logger and a TensorBoard ``SummaryWriter`` under
    ``./out_models/<model>_<task>_<job>``, builds the DataParallel model,
    the cross-entropy loss, the optimizer chosen by ``self.optimizers`` and
    a ``MultiStepLR`` scheduler, creates the train/val/test loaders with
    their augmentation pipelines, and hands everything to
    ``self.train_model``.

    Raises:
        ValueError: if ``self.optimizers`` is neither ``'SGD'`` nor
            ``'Adam'``. This is checked first, so no directory, logger or
            model is created for a run that cannot start.
    """
    if self.optimizers not in ('SGD', 'Adam'):
        raise ValueError(
            "Unsupported optimizer: {!r} (expected 'SGD' or 'Adam')".format(self.optimizers))

    path = './out_models/' + self.model_name + '_' + self.task_name + '_' + self.job_id

    ## get logger
    logger = self.get_logger(self.model_name, self.task_name, self.job_id, path)
    logger.info("Task Name : {}".format(self.task_name))
    logger.info("Backbone_name : {}".format(self.model_name))
    logger.info("input_shape : ({},{}.{})".format(self.input_shape[0], self.input_shape[1], self.input_shape[2]))
    logger.info("num_epochs : {}".format(self.num_epochs))
    logger.info("resume_from : {}".format(self.resume_from))
    logger.info("pretrained : {}".format(self.pretrained))

    ## tensorboard writer
    log_dir = os.path.join(path, "tensorboard_log")
    # makedirs also creates `path` itself if get_logger did not.
    os.makedirs(log_dir, exist_ok=True)
    writer = SummaryWriter(log_dir)

    ## get model of train
    net = get_model(self.model_name)
    net = torch.nn.DataParallel(net, device_ids=self.device_ids)
    net = net.cuda(device=self.device_ids[0])

    ## loss
    criterion = nn.CrossEntropyLoss()

    ## optimizer
    if self.optimizers == 'SGD':
        optimizer = optim.SGD(net.parameters(), lr=self.init_lr, momentum=0.9,
                              weight_decay=self.weight_decay)
    else:  # 'Adam' -- validated at the top of the method
        optimizer = optim.Adam(net.parameters(), lr=self.init_lr,
                               weight_decay=self.weight_decay)

    milestones = [80, 150, 200, 300]
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=0.1)
    logger.info("============opti===========")
    logger.info("Optimizer:{}".format(self.optimizers))
    logger.info("lr:{}".format(self.init_lr))
    logger.info("weight_decay:{}".format(self.weight_decay))
    logger.info("lr_scheduler: MultiStepLR")
    logger.info("milestones:{}".format(milestones))

    ## augmentation (no Normalize step is applied in either pipeline)
    crop_size = int(self.input_shape[-1])

    ## train aug
    transform_train = transforms.Compose([
        transforms.RandomCrop(crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.RandomBrightness(brightness=self.brightness,
                                    brightness_ratio=self.brightness_ratio),
        transforms.RandomBlur(blur_ratio=self.blur_ratio),
        transforms.RandomRotation(degrees=self.degrees, rotation_ratio=0.1),
        transforms.ColorJitter(brightness=self.color_brightnesss,
                               contrast=self.color_contrast,
                               saturation=self.color_saturation,
                               hue=0),
        transforms.ToTensor(),
    ])

    ## test aug
    transform_test = transforms.Compose([
        transforms.CenterCrop(crop_size),
        transforms.ToTensor(),
    ])
    logger.info("============aug===========")
    logger.info("brightness:{}".format(self.brightness))
    logger.info("brightness_ratio:{}".format(self.brightness_ratio))
    logger.info("blur_ratio:{}".format(self.blur_ratio))
    logger.info("degrees:{}".format(self.degrees))
    logger.info("color_brightnesss:{}".format(self.color_brightnesss))
    logger.info("color_contrast:{}".format(self.color_contrast))
    logger.info("color_saturation:{}".format(self.color_saturation))

    ## prepare data
    print('==> Preparing data..')
    # NOTE(review): `DataLoader` here is called with split/transform and its
    # result is used as the dataset of torch.utils.data.DataLoader, so it is
    # presumably a project dataset class despite its name -- confirm.
    n_devices = len(self.device_ids)
    trainset = DataLoader(split='Training', transform=transform_train)
    trainloader = torch.utils.data.DataLoader(
        trainset, batch_size=self.batch_size * n_devices, shuffle=True)
    Valset = DataLoader(split='Valing', transform=transform_test)
    Valloader = torch.utils.data.DataLoader(
        Valset, batch_size=64 * n_devices, shuffle=False)
    Testset = DataLoader(split='Testing', transform=transform_test)
    Testloader = torch.utils.data.DataLoader(
        Testset, batch_size=64 * n_devices, shuffle=False)

    ## train
    logger.info("====== Training !!!======")
    self.train_model(net, criterion, optimizer, scheduler, trainloader,
                     Valloader, Testloader, logger, writer, path)
    logger.info("======Finsh Training !!!======")
    # NOTE(review): the best_* names are not assigned in this method; they are
    # presumably module-level globals updated by train_model -- confirm.
    logger.info("best_val_acc_epoch: %d, best_val_acc: %0.3f" % (best_Val_acc_epoch, best_Val_acc))
    logger.info("best_test_acc_epoch: %d, best_test_acc: %0.3f" % (best_Test_acc_epoch, best_Test_acc))