def freezenet(net, freeze_base_net, freeze_net, base_net_lr, extra_layers_lr):
    """Freeze the requested sub-networks and build the SGD parameter groups.

    The learning rates are explicit arguments; groups without an 'lr' key
    fall back to the optimizer's default learning rate.
    """
    if freeze_base_net:
        logging.info("Freeze base net.")
        freeze_net_layers(net.base_net)
        # Train the extra layers at their own rate and the prediction heads
        # at the optimizer default.
        params = [
            {'params': itertools.chain(net.source_layer_add_ons.parameters(),
                                       net.extras.parameters()),
             'lr': extra_layers_lr},
            {'params': itertools.chain(net.regression_headers.parameters(),
                                       net.classification_headers.parameters())},
        ]
    elif freeze_net:
        # Freeze everything except the prediction heads.
        freeze_net_layers(net.base_net)
        freeze_net_layers(net.source_layer_add_ons)
        freeze_net_layers(net.extras)
        params = itertools.chain(net.regression_headers.parameters(),
                                 net.classification_headers.parameters())
        logging.info("Freeze all the layers except prediction heads.")
    else:
        # Train everything, with per-group learning rates.
        params = [
            {'params': net.base_net.parameters(), 'lr': base_net_lr},
            {'params': itertools.chain(net.source_layer_add_ons.parameters(),
                                       net.extras.parameters()),
             'lr': extra_layers_lr},
            {'params': itertools.chain(net.regression_headers.parameters(),
                                       net.classification_headers.parameters())},
        ]
    return net, params
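
# `freeze_net_layers` is imported from the surrounding project in every
# snippet here. A minimal sketch of the behaviour it is assumed to have
# (disable gradients for all parameters of a module), plus a hypothetical
# call site for `freezenet` with made-up learning rates:
def freeze_net_layers(module):
    # Frozen parameters receive no gradients and are never updated.
    for param in module.parameters():
        param.requires_grad = False

# net, params = freezenet(net, freeze_base_net=True, freeze_net=False,
#                         base_net_lr=1e-3, extra_layers_lr=1e-3)
# optimizer = torch.optim.SGD(params, lr=1e-3, momentum=0.9, weight_decay=5e-4)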
logging.info(val_dataset)
logging.info("validation dataset size: {}".format(len(val_dataset)))
val_loader = DataLoader(val_dataset, args.batch_size,
                        num_workers=args.num_workers, shuffle=False)

logging.info("Build network.")
net = create_net(num_classes)
min_loss = -10000.0
last_epoch = args.last_epoch - 1
base_net_lr = args.base_net_lr if args.base_net_lr is not None else args.lr
extra_layers_lr = args.extra_layers_lr if args.extra_layers_lr is not None else args.lr

if args.freeze_base_net:
    logging.info("Freeze base net.")
    freeze_net_layers(net.base_net)
    # Only the prediction heads and extra layers stay trainable.
    params = [
        {'params': itertools.chain(
            net.regression_headers.parameters(),
            net.classification_headers.parameters(),
            net.extra_layers.parameters())}
    ]
elif args.freeze_net:
    freeze_net_layers(net.base_net)
    freeze_net_layers(net.extra_layers)
    params = itertools.chain(net.regression_headers.parameters(),
                             net.classification_headers.parameters())
    logging.info("Freeze all the layers except prediction heads.")
else:
    params = [
        {'params': net.base_net.parameters(), 'lr': base_net_lr},
        {'params': net.extra_layers.parameters(), 'lr': extra_layers_lr},
        {'params': itertools.chain(net.regression_headers.parameters(),
                                   net.classification_headers.parameters())},
    ]
logging.info("validation dataset size: {}".format(len(val_dataset))) val_loader = DataLoader(val_dataset, args.batch_size, num_workers=args.num_workers, shuffle=False) logging.info("Build network.") net = create_net(num_classes) min_loss = -10000.0 last_epoch = -1 base_net_lr = args.base_net_lr if args.base_net_lr is not None else args.lr extra_layers_lr = args.extra_layers_lr if args.extra_layers_lr is not None else args.lr if args.freeze_base_net: logging.info("Freeze base net.") freeze_net_layers(net.base_net) params = itertools.chain(net.source_layer_add_ons.parameters(), net.extras.parameters(), net.regression_headers.parameters(), net.classification_headers.parameters()) params = [{ 'params': itertools.chain(net.source_layer_add_ons.parameters(), net.extras.parameters()), 'lr': extra_layers_lr }, { 'params': itertools.chain(net.regression_headers.parameters(), net.classification_headers.parameters()) }]
def optim_and_model_initial(args, net, timer, config, DEVICE):
    # net = create_net(num_classes)
    last_epoch = -1
    base_net_lr = (args['Training_hyperparam']['base_net_lr']
                   if args['Training_hyperparam']['base_net_lr'] != "None"
                   else args['Training_hyperparam']['lr'])
    extra_layers_lr = (args['Training_hyperparam']['extra_layers_lr']
                       if args['Training_hyperparam']['extra_layers_lr'] != "None"
                       else args['Training_hyperparam']['lr'])

    # Build the optimizer parameter groups according to the freeze flags.
    if args['flow_control']['freeze_base_net']:
        logging.info("Freeze base net.")
        freeze_net_layers(net.base_net)
        params = [
            {'params': itertools.chain(net.source_layer_add_ons.parameters(),
                                       net.extras.parameters()),
             'lr': extra_layers_lr},
            {'params': itertools.chain(net.regression_headers.parameters(),
                                       net.classification_headers.parameters())},
        ]
    elif args['flow_control']['freeze_net']:
        freeze_net_layers(net.base_net)
        freeze_net_layers(net.source_layer_add_ons)
        freeze_net_layers(net.extras)
        params = itertools.chain(net.regression_headers.parameters(),
                                 net.classification_headers.parameters())
        logging.info("Freeze all the layers except prediction heads.")
    else:
        params = [
            {'params': net.base_net.parameters(), 'lr': base_net_lr},
            {'params': itertools.chain(net.source_layer_add_ons.parameters(),
                                       net.extras.parameters()),
             'lr': extra_layers_lr},
            {'params': itertools.chain(net.regression_headers.parameters(),
                                       net.classification_headers.parameters())},
        ]

    # Initialize weights: resume a checkpoint, start from a backbone, or
    # start from a pretrained SSD.
    timer.start("Load Model")
    if args['flow_control']['resume']:
        logging.info("Resume from the model {}".format(args['flow_control']['resume']))
        net.load(args['flow_control']['resume'])
    elif args['flow_control']['base_net']:
        logging.info("Init from base net {}".format(args['flow_control']['base_net']))
        net.init_from_base_net(args['flow_control']['base_net'])
    elif args['flow_control']['pretrained_ssd']:
        logging.info("Init from pretrained ssd {}".format(args['flow_control']['pretrained_ssd']))
        net.init_from_pretrained_ssd(args['flow_control']['pretrained_ssd'])
    logging.info('Took {:.2f} seconds to load the model.'.format(timer.end("Load Model")))

    # net.to(DEVICE)  # single-device placement replaced by DataParallel
    net = nn.DataParallel(net).cuda()

    neg_pos_ratio = 3  # hard negative mining keeps 3 negatives per positive
    criterion = MultiboxLoss(config.priors, iou_threshold=0.5,
                             neg_pos_ratio=neg_pos_ratio,
                             center_variance=0.1, size_variance=0.2,
                             device=DEVICE)
    optimizer = torch.optim.SGD(
        params,
        lr=args['Training_hyperparam']['lr'],
        momentum=args['Training_hyperparam']['momentum'],
        weight_decay=args['Training_hyperparam']['weighted_decay'])
    logging.info("Learning rate: {}, Base net learning rate: {}, ".format(
        args['Training_hyperparam']['lr'], base_net_lr) +
        "Extra Layers learning rate: {}.".format(extra_layers_lr))

    if args['Training_hyperparam']['lr_scheduler'] == 'multi-step':
        logging.info("Uses MultiStepLR scheduler.")
        milestones = [
            int(v.strip()) for v in args['Training_hyperparam']
            ['lr_scheduler_param']['multi-step']['milestones'].split(",")
        ]
        scheduler = MultiStepLR(
            optimizer, milestones=milestones,
            gamma=args['Training_hyperparam']['lr_scheduler_param']['multi-step']['gamma'],
            last_epoch=last_epoch)
    elif args['Training_hyperparam']['lr_scheduler'] == 'cosine':
        logging.info("Uses CosineAnnealingLR scheduler.")
        scheduler = CosineAnnealingLR(
            optimizer,
            float(args['Training_hyperparam']['lr_scheduler_param']['cosine']['t_max']),
            last_epoch=last_epoch)
    else:
        logging.fatal("Unsupported Scheduler: {}.".format(
            args['Training_hyperparam']['lr_scheduler']))
        parser.print_help(sys.stderr)
        sys.exit(1)

    logging.info("Start training from epoch {}.".format(last_epoch + 1))
    return net, criterion, optimizer, scheduler
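
# `optim_and_model_initial` indexes a nested config dict; a hypothetical
# example of the expected shape (keys taken from the accesses above, values
# purely illustrative -- note the "None" string sentinels the function tests
# for, and the string-encoded milestone list):
example_args = {
    'Training_hyperparam': {
        'lr': 1e-2, 'base_net_lr': "None", 'extra_layers_lr': "None",
        'momentum': 0.9, 'weighted_decay': 5e-4,
        'lr_scheduler': 'cosine',
        'lr_scheduler_param': {
            'multi-step': {'milestones': "80,100", 'gamma': 0.1},
            'cosine': {'t_max': 120},
        },
    },
    'flow_control': {
        'freeze_base_net': False, 'freeze_net': False,
        'resume': None, 'base_net': None, 'pretrained_ssd': None,
    },
}
# net, criterion, optimizer, scheduler = optim_and_model_initial(
#     example_args, net, timer, config, DEVICE)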
dataset_type="test") log.info(val_dataset) log.info("validation dataset size: {}".format(len(val_dataset))) val_loader = DataLoader(val_dataset, args.batch_size, num_workers=args.num_workers, shuffle=False) log.info("Build network.") net = create_net(num_classes) min_loss = -10000.0 last_epoch = -1 # freeze_base_net: log.info("Freeze base net..") freeze_net_layers(net.base_net) params = itertools.chain(net.source_layer_add_ons.parameters(), net.extras.parameters(), net.regression_headers.parameters(), net.classification_headers.parameters()) # log.info("params 1 = "+str(params)) params = [{ 'params': itertools.chain(net.source_layer_add_ons.parameters(), net.extras.parameters()), 'lr': args.extra_layers_lr }, { 'params': itertools.chain(net.regression_headers.parameters(), net.classification_headers.parameters())
def main(args):
    DEVICE = torch.device("cuda:0" if torch.cuda.is_available() and args.use_cuda
                          else "cpu")
    # DEVICE = torch.device("cpu")  # force CPU for debugging
    if args.use_cuda and torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True
        logging.info("Use Cuda.")

    timer = Timer()
    logging.info(args)

    # Pick the network factory and its prior-box config.
    if args.net == 'vgg16-ssd':
        create_net = create_vgg_ssd
        config = vgg_ssd_config
    elif args.net == 'mb1-ssd':
        create_net = create_mobilenetv1_ssd
        config = mobilenetv1_ssd_config
    elif args.net == 'mb1-ssd-lite':
        create_net = create_mobilenetv1_ssd_lite
        config = mobilenetv1_ssd_config
    elif args.net == 'sq-ssd-lite':
        create_net = create_squeezenet_ssd_lite
        config = squeezenet_ssd_config
    elif args.net == 'mb2-ssd-lite':
        create_net = lambda num: create_mobilenetv2_ssd_lite(
            num, width_mult=args.mb2_width_mult)
        config = mobilenetv1_ssd_config
    else:
        logging.fatal("The net type is wrong.")
        parser.print_help(sys.stderr)
        sys.exit(1)

    train_transform = TrainAugmentation(config.image_size, config.image_mean,
                                        config.image_std)
    target_transform = MatchPrior(config.priors, config.center_variance,
                                  config.size_variance, 0.5)
    test_transform = TestTransform(config.image_size, config.image_mean,
                                   config.image_std)

    logging.info("Prepare training datasets.")
    datasets = []
    for dataset_path in args.datasets:
        if args.dataset_type == 'voc':
            dataset = VOCDataset(dataset_path, transform=train_transform,
                                 target_transform=target_transform)
            label_file = os.path.join(args.checkpoint_folder,
                                      "voc-model-labels.txt")
            store_labels(label_file, dataset.class_names)
            num_classes = len(dataset.class_names)
        elif args.dataset_type == 'open_images':
            dataset = OpenImagesDataset(dataset_path,
                                        transform=train_transform,
                                        target_transform=target_transform,
                                        dataset_type="train",
                                        balance_data=args.balance_data)
            label_file = os.path.join(args.checkpoint_folder,
                                      "open-images-model-labels.txt")
            store_labels(label_file, dataset.class_names)
            logging.info(dataset)
            num_classes = len(dataset.class_names)
        elif args.dataset_type == 'coco':
            # CocoDetection(root, annFile, transform=None, target_transform=None)
            dataset = CocoDetection(
                "/home/wenyen4desh/datasets/coco/train2017",
                "/home/wenyen4desh/datasets/coco/annotations/instances_train2017.json",
                transform=train_transform,
                target_transform=target_transform)
            label_file = os.path.join(args.checkpoint_folder,
                                      "open-images-model-labels.txt")
            store_labels(label_file, dataset.class_names)
            logging.info(dataset)
            num_classes = len(dataset.class_names)
        else:
            raise ValueError("Dataset type {} is not supported.".format(
                args.dataset_type))
        datasets.append(dataset)

    logging.info("Stored labels into file {}.".format(label_file))
    train_dataset = ConcatDataset(datasets)
    logging.info("Train dataset size: {}".format(len(train_dataset)))
    train_loader = DataLoader(train_dataset, args.batch_size,
                              num_workers=args.num_workers, shuffle=True)

    logging.info("Prepare Validation datasets.")
    if args.dataset_type == "voc":
        val_dataset = VOCDataset(args.validation_dataset,
                                 transform=test_transform,
                                 target_transform=target_transform,
                                 is_test=True)
    elif args.dataset_type == 'open_images':
        val_dataset = OpenImagesDataset(dataset_path,
                                        transform=test_transform,
                                        target_transform=target_transform,
                                        dataset_type="test")
        logging.info(val_dataset)
    elif args.dataset_type == "coco":
        val_dataset = CocoDetection(
            "/home/wenyen4desh/datasets/coco/val2017",
            "/home/wenyen4desh/datasets/coco/annotations/instances_val2017.json",
            transform=test_transform,
            target_transform=target_transform)
        logging.info(val_dataset)
    logging.info("validation dataset size: {}".format(len(val_dataset)))
    val_loader = DataLoader(val_dataset, args.batch_size,
                            num_workers=args.num_workers, shuffle=False)

    logging.info("Build network.")
    net = create_net(num_classes)
    min_loss = -10000.0
    last_epoch = -1

    base_net_lr = args.base_net_lr if args.base_net_lr is not None else args.lr
    extra_layers_lr = (args.extra_layers_lr if args.extra_layers_lr is not None
                       else args.lr)
    if args.freeze_base_net:
        logging.info("Freeze base net.")
        freeze_net_layers(net.base_net)
        params = [
            {'params': itertools.chain(net.source_layer_add_ons.parameters(),
                                       net.extras.parameters()),
             'lr': extra_layers_lr},
            {'params': itertools.chain(net.regression_headers.parameters(),
                                       net.classification_headers.parameters())},
        ]
    elif args.freeze_net:
        freeze_net_layers(net.base_net)
        freeze_net_layers(net.source_layer_add_ons)
        freeze_net_layers(net.extras)
        params = itertools.chain(net.regression_headers.parameters(),
                                 net.classification_headers.parameters())
        logging.info("Freeze all the layers except prediction heads.")
    else:
        params = [
            {'params': net.base_net.parameters(), 'lr': base_net_lr},
            {'params': itertools.chain(net.source_layer_add_ons.parameters(),
                                       net.extras.parameters()),
             'lr': extra_layers_lr},
            {'params': itertools.chain(net.regression_headers.parameters(),
                                       net.classification_headers.parameters())},
        ]

    timer.start("Load Model")
    if args.resume:
        logging.info("Resume from the model {}".format(args.resume))
        net.load(args.resume)
    elif args.base_net:
        logging.info("Init from base net {}".format(args.base_net))
        net.init_from_base_net(args.base_net)
    elif args.pretrained_ssd:
        logging.info("Init from pretrained ssd {}".format(args.pretrained_ssd))
        net.init_from_pretrained_ssd(args.pretrained_ssd)
    logging.info('Took {:.2f} seconds to load the model.'.format(
        timer.end("Load Model")))

    net.to(DEVICE)
    criterion = MultiboxLoss(config.priors, iou_threshold=0.5, neg_pos_ratio=3,
                             center_variance=0.1, size_variance=0.2,
                             device=DEVICE)
    optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum,
                                weight_decay=args.weight_decay)
    logging.info("Learning rate: {}, Base net learning rate: {}, ".format(
        args.lr, base_net_lr) +
        "Extra Layers learning rate: {}.".format(extra_layers_lr))

    if args.scheduler == 'multi-step':
        logging.info("Uses MultiStepLR scheduler.")
        milestones = [int(v.strip()) for v in args.milestones.split(",")]
        scheduler = MultiStepLR(optimizer, milestones=milestones, gamma=0.1,
                                last_epoch=last_epoch)
    elif args.scheduler == 'cosine':
        logging.info("Uses CosineAnnealingLR scheduler.")
        scheduler = CosineAnnealingLR(optimizer, args.t_max,
                                      last_epoch=last_epoch)
    else:
        logging.fatal("Unsupported Scheduler: {}.".format(args.scheduler))
        parser.print_help(sys.stderr)
        sys.exit(1)

    logging.info("Start training from epoch {}.".format(last_epoch + 1))
    for epoch in range(last_epoch + 1, args.num_epochs):
        scheduler.step()
        train(train_loader, net, criterion, optimizer, device=DEVICE,
              debug_steps=args.debug_steps, epoch=epoch)
        if epoch % args.validation_epochs == 0 or epoch == args.num_epochs - 1:
            val_loss, val_regression_loss, val_classification_loss = test(
                val_loader, net, criterion, DEVICE)
            logging.info(
                "Epoch: {}, ".format(epoch) +
                "Validation Loss: {:.4f}, ".format(val_loss) +
                "Validation Regression Loss {:.4f}, ".format(val_regression_loss) +
                "Validation Classification Loss: {:.4f}".format(
                    val_classification_loss))
            model_path = os.path.join(
                args.checkpoint_folder,
                "{}-Epoch-{}-Loss-{}.pth".format(args.net, epoch, val_loss))
            net.save(model_path)
            logging.info("Saved model {}".format(model_path))
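
# main() calls `train(...)` and `test(...)` helpers from the surrounding
# project. A minimal sketch of the assumed `train` contract (the real helper
# may differ); `test` is assumed to return (loss, regression_loss,
# classification_loss) averaged over the loader:
def train(loader, net, criterion, optimizer, device, debug_steps=100, epoch=-1):
    net.train(True)
    running_loss = 0.0
    for i, (images, boxes, labels) in enumerate(loader):
        images = images.to(device)
        boxes, labels = boxes.to(device), labels.to(device)
        optimizer.zero_grad()
        confidence, locations = net(images)  # class scores + box offsets
        regression_loss, classification_loss = criterion(
            confidence, locations, labels, boxes)
        loss = regression_loss + classification_loss  # SSD multibox objective
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i and i % debug_steps == 0:
            logging.info("Epoch: {}, Step: {}, Average Loss: {:.4f}".format(
                epoch, i, running_loss / debug_steps))
            running_loss = 0.0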
    def setup_and_start_training(self):
        logging.basicConfig(
            stream=sys.stdout, level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        DEVICE = torch.device("cuda:0" if torch.cuda.is_available()
                              and self.system_dict["params"]["use_cuda"] else "cpu")
        if self.system_dict["params"]["use_cuda"] and torch.cuda.is_available():
            torch.backends.cudnn.benchmark = True
            logging.info("Using gpu.")
        else:
            logging.info("Using cpu.")

        timer = Timer()
        logging.info(self.system_dict)

        # Pick the network factory and its prior-box config.
        if self.system_dict["params"]["net"] == 'vgg16-ssd':
            create_net = create_vgg_ssd
            config = vgg_ssd_config
        elif self.system_dict["params"]["net"] == 'mb1-ssd':
            create_net = create_mobilenetv1_ssd
            config = mobilenetv1_ssd_config
        elif self.system_dict["params"]["net"] == 'mb1-ssd-lite':
            create_net = create_mobilenetv1_ssd_lite
            config = mobilenetv1_ssd_config
        elif self.system_dict["params"]["net"] == 'sq-ssd-lite':
            create_net = create_squeezenet_ssd_lite
            config = squeezenet_ssd_config
        elif self.system_dict["params"]["net"] == 'mb2-ssd-lite':
            create_net = lambda num: create_mobilenetv2_ssd_lite(
                num, width_mult=self.system_dict["params"]["mb2_width_mult"])
            config = mobilenetv1_ssd_config
        else:
            logging.fatal("The net type is wrong.")
            sys.exit(1)

        train_transform = TrainAugmentation(config.image_size, config.image_mean,
                                            config.image_std)
        target_transform = MatchPrior(config.priors, config.center_variance,
                                      config.size_variance, 0.5)
        test_transform = TestTransform(config.image_size, config.image_mean,
                                       config.image_std)

        logging.info("Prepare training datasets.")
        datasets = []
        dataset = VOCDataset(
            self.system_dict["dataset"]["train"]["img_dir"],
            self.system_dict["dataset"]["train"]["label_dir"],
            transform=train_transform,
            target_transform=target_transform,
            label_file=self.system_dict["params"]["label_file"])
        label_file = self.system_dict["params"]["label_file"]
        # store_labels(label_file, dataset.class_names)
        num_classes = len(dataset.class_names)
        datasets.append(dataset)
        logging.info(f"Stored labels into file {label_file}.")
        train_dataset = ConcatDataset(datasets)
        logging.info("Train dataset size: {}".format(len(train_dataset)))
        train_loader = DataLoader(
            train_dataset,
            self.system_dict["params"]["batch_size"],
            num_workers=self.system_dict["params"]["num_workers"],
            shuffle=True)

        if self.system_dict["dataset"]["val"]["status"]:
            val_dataset = VOCDataset(
                self.system_dict["dataset"]["val"]["img_dir"],
                self.system_dict["dataset"]["val"]["label_dir"],
                transform=test_transform,
                target_transform=target_transform,
                is_test=True,
                label_file=self.system_dict["params"]["label_file"])
            logging.info("validation dataset size: {}".format(len(val_dataset)))
            val_loader = DataLoader(
                val_dataset,
                self.system_dict["params"]["batch_size"],
                num_workers=self.system_dict["params"]["num_workers"],
                shuffle=False)

        logging.info("Build network.")
        net = create_net(num_classes)
        min_loss = -10000.0
        last_epoch = -1

        base_net_lr = (self.system_dict["params"]["base_net_lr"]
                       if self.system_dict["params"]["base_net_lr"] is not None
                       else self.system_dict["params"]["lr"])
        extra_layers_lr = (self.system_dict["params"]["extra_layers_lr"]
                           if self.system_dict["params"]["extra_layers_lr"] is not None
                           else self.system_dict["params"]["lr"])

        if self.system_dict["params"]["freeze_base_net"]:
            logging.info("Freeze base net.")
            freeze_net_layers(net.base_net)
            params = [
                {'params': itertools.chain(net.source_layer_add_ons.parameters(),
                                           net.extras.parameters()),
                 'lr': extra_layers_lr},
                {'params': itertools.chain(net.regression_headers.parameters(),
                                           net.classification_headers.parameters())},
            ]
        elif self.system_dict["params"]["freeze_net"]:
            freeze_net_layers(net.base_net)
            freeze_net_layers(net.source_layer_add_ons)
            freeze_net_layers(net.extras)
            params = itertools.chain(net.regression_headers.parameters(),
                                     net.classification_headers.parameters())
            logging.info("Freeze all the layers except prediction heads.")
        else:
            params = [
                {'params': net.base_net.parameters(), 'lr': base_net_lr},
                {'params': itertools.chain(net.source_layer_add_ons.parameters(),
                                           net.extras.parameters()),
                 'lr': extra_layers_lr},
                {'params': itertools.chain(net.regression_headers.parameters(),
                                           net.classification_headers.parameters())},
            ]

        timer.start("Load Model")
        resume = self.system_dict["params"]["resume"]
        base_net = self.system_dict["params"]["base_net"]
        pretrained_ssd = self.system_dict["params"]["pretrained_ssd"]
        if resume:
            logging.info(f"Resume from the model {resume}")
            net.load(resume)
        elif base_net:
            logging.info(f"Init from base net {base_net}")
            net.init_from_base_net(base_net)
        elif pretrained_ssd:
            logging.info(f"Init from pretrained ssd {pretrained_ssd}")
            net.init_from_pretrained_ssd(pretrained_ssd)
        logging.info(f'Took {timer.end("Load Model"):.2f} seconds to load the model.')

        net.to(DEVICE)
        criterion = MultiboxLoss(config.priors, iou_threshold=0.5,
                                 neg_pos_ratio=3, center_variance=0.1,
                                 size_variance=0.2, device=DEVICE)
        optimizer = torch.optim.SGD(
            params,
            lr=self.system_dict["params"]["lr"],
            momentum=self.system_dict["params"]["momentum"],
            weight_decay=self.system_dict["params"]["weight_decay"])
        lr = self.system_dict["params"]["lr"]
        logging.info(f"Learning rate: {lr}, Base net learning rate: {base_net_lr}, "
                     f"Extra Layers learning rate: {extra_layers_lr}.")

        # Default LR milestones at 1/3 and 2/3 of training when none are given.
        if not self.system_dict["params"]["milestones"]:
            self.system_dict["params"]["milestones"] = "{},{}".format(
                int(self.system_dict["params"]["num_epochs"] / 3),
                int(2 * self.system_dict["params"]["num_epochs"] / 3))

        if self.system_dict["params"]["scheduler"] == 'multi-step':
            logging.info("Uses MultiStepLR scheduler.")
            milestones = [
                int(v.strip())
                for v in self.system_dict["params"]["milestones"].split(",")
            ]
            scheduler = MultiStepLR(optimizer, milestones=milestones,
                                    gamma=0.1, last_epoch=last_epoch)
        elif self.system_dict["params"]["scheduler"] == 'cosine':
            logging.info("Uses CosineAnnealingLR scheduler.")
            scheduler = CosineAnnealingLR(optimizer,
                                          self.system_dict["params"]["t_max"],
                                          last_epoch=last_epoch)
        # Note: any other scheduler name leaves `scheduler` undefined.

        logging.info(f"Start training from epoch {last_epoch + 1}.")
        net_name = self.system_dict["params"]["net"]
        for epoch in range(last_epoch + 1,
                           self.system_dict["params"]["num_epochs"]):
            scheduler.step()
            self.base_train(
                train_loader, net, criterion, optimizer, device=DEVICE,
                debug_steps=self.system_dict["params"]["debug_steps"],
                epoch=epoch)
            if (self.system_dict["dataset"]["val"]["status"]
                    and (epoch % self.system_dict["params"]["validation_epochs"] == 0
                         or epoch == self.system_dict["params"]["num_epochs"] - 1)):
                val_loss, val_regression_loss, val_classification_loss = \
                    self.base_test(val_loader, net, criterion, DEVICE)
                logging.info(
                    f"Epoch: {epoch}, "
                    f"Validation Loss: {val_loss:.4f}, "
                    f"Validation Regression Loss {val_regression_loss:.4f}, "
                    f"Validation Classification Loss: {val_classification_loss:.4f}")
                model_path = os.path.join(
                    self.system_dict["params"]["checkpoint_folder"],
                    f"{net_name}-Epoch-{epoch}-Loss-{val_loss}.pth")
                net.save(model_path)
                logging.info(f"Saved model {model_path}")
            if not self.system_dict["dataset"]["val"]["status"]:
                model_path = os.path.join(
                    self.system_dict["params"]["checkpoint_folder"],
                    f"{net_name}-Epoch-{epoch}.pth")
                net.save(model_path)
                logging.info(f"Saved model {model_path}")
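
# The default-milestone logic above drops the LR at 1/3 and 2/3 of training;
# a quick check of that arithmetic for a hypothetical 30-epoch run:
num_epochs = 30
default_milestones = "{},{}".format(int(num_epochs / 3), int(2 * num_epochs / 3))
assert default_milestones == "10,20"  # MultiStepLR scales the LR by gamma=0.1 at each

# Hypothetical driver (the wrapper class name `Detector` is an assumption;
# only `system_dict` and `setup_and_start_training` appear in the snippet):
# trainer = Detector()
# trainer.system_dict["params"]["net"] = "mb2-ssd-lite"
# trainer.system_dict["params"]["use_cuda"] = True
# trainer.setup_and_start_training()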