def train(cfg, args):
    """Build, optionally distribute, checkpoint-restore, and train an SSD model.

    Returns the model produced by ``do_train``.
    """
    logger = logging.getLogger('SSD.trainer')

    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)

    # Linear-scaling rule: base LR grows with GPU count while iteration-based
    # milestones and the iteration budget shrink by the same factor.
    lr = cfg.SOLVER.LR * args.num_gpus  # scale by num gpus
    optimizer = make_optimizer(cfg, model, lr)
    milestones = [step // args.num_gpus for step in cfg.SOLVER.LR_STEPS]
    scheduler = make_lr_scheduler(cfg, optimizer, milestones)

    arguments = {"iteration": 0}
    # Only rank 0 writes checkpoints to disk.
    save_to_disk = dist_util.get_rank() == 0
    checkpointer = CheckPointer(model, optimizer, scheduler, cfg.OUTPUT_DIR,
                                save_to_disk, logger)
    arguments.update(checkpointer.load())

    max_iter = cfg.SOLVER.MAX_ITER // args.num_gpus
    train_loader = make_data_loader(cfg, is_train=True,
                                    distributed=args.distributed,
                                    max_iter=max_iter,
                                    start_iter=arguments['iteration'])

    return do_train(cfg, model, train_loader, optimizer, scheduler,
                    checkpointer, device, arguments, args)
def train(cfg, args):
    """Assemble the SSD training pipeline without running the training loop.

    NOTE(review): the final ``do_train`` call is deliberately commented out —
    this variant returns the (possibly checkpoint-restored) model untrained
    and is used to study the data-loading path in isolation.
    """
    # Factory-style logger lookup; logging configuration happens elsewhere.
    logger = logging.getLogger('SSD.trainer')
    # Build the detection model.
    model = build_detection_model(cfg)
    # Pick the device and move the model onto it.
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)

    # Learning rate, optimizer, and LR schedule — annealing-style: large
    # steps early, smaller steps later.
    lr = cfg.SOLVER.LR * args.num_gpus  # scale by num gpus
    optimizer = make_optimizer(cfg, model, lr)
    milestones = [step // args.num_gpus for step in cfg.SOLVER.LR_STEPS]
    scheduler = make_lr_scheduler(cfg, optimizer, milestones)

    arguments = {"iteration": 0}
    save_to_disk = dist_util.get_rank() == 0
    # Resume training state from a checkpoint when one exists.
    checkpointer = CheckPointer(model, optimizer, scheduler, cfg.OUTPUT_DIR,
                                save_to_disk, logger)
    extra_checkpoint_data = checkpointer.load()
    arguments.update(extra_checkpoint_data)

    # Dataset loading through the torch data pipeline; the model build above
    # can be treated as a black box while studying this part.
    max_iter = cfg.SOLVER.MAX_ITER // args.num_gpus
    train_loader = make_data_loader(cfg, is_train=True,
                                    distributed=args.distributed,
                                    max_iter=max_iter,
                                    start_iter=arguments['iteration'])

    # Training intentionally disabled while inspecting data loading:
    # model = do_train(cfg, model, train_loader, optimizer, scheduler,
    #                  checkpointer, device, arguments, args)
    return model
def start_train(cfg):
    """Single-process SSD training: build, restore latest checkpoint, train.

    Returns the model produced by ``do_train``.
    """
    logger = logging.getLogger('SSD.trainer')

    model = torch_utils.to_cuda(SSDDetector(cfg))

    lr = cfg.SOLVER.LR
    optimizer = make_optimizer(cfg, model, lr)
    milestones = list(cfg.SOLVER.LR_STEPS)
    scheduler = make_lr_scheduler(cfg, optimizer, milestones)

    arguments = {"iteration": 0}
    # Single-process setup: this process always owns checkpoint writing.
    save_to_disk = True
    checkpointer = CheckPointer(cfg, model, optimizer, scheduler,
                                cfg.OUTPUT_DIR, save_to_disk, logger)
    arguments.update(checkpointer.load())

    max_iter = cfg.SOLVER.MAX_ITER
    train_loader = make_data_loader(cfg, is_train=True, max_iter=max_iter,
                                    start_iter=arguments['iteration'])

    return do_train(cfg, model, train_loader, optimizer, scheduler,
                    checkpointer, arguments)
def train(cfg, args):
    """Build, checkpoint-restore, and train an SSD model, printing the
    trainable-parameter count before the training loop starts.

    Returns the model produced by ``do_train``.

    Cleanup: removed large blocks of dead commented-out profiling code
    (thop/ptflops/torchprofile experiments) and switched the parameter count
    from ``named_parameters()`` (unused name) to ``parameters()``.
    """
    logger = logging.getLogger('SSD.trainer')

    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)

    # Linear-scaling rule: LR scales up with GPU count, iteration-based
    # quantities scale down.
    lr = cfg.SOLVER.LR * args.num_gpus  # scale by num gpus
    optimizer = make_optimizer(cfg, model, lr)
    milestones = [step // args.num_gpus for step in cfg.SOLVER.LR_STEPS]
    scheduler = make_lr_scheduler(cfg, optimizer, milestones)

    arguments = {"iteration": 0}
    save_to_disk = dist_util.get_rank() == 0
    checkpointer = CheckPointer(model, optimizer, scheduler, cfg.OUTPUT_DIR,
                                save_to_disk, logger)
    extra_checkpoint_data = checkpointer.load()
    arguments.update(extra_checkpoint_data)

    max_iter = cfg.SOLVER.MAX_ITER // args.num_gpus
    train_loader = make_data_loader(cfg, is_train=True,
                                    distributed=args.distributed,
                                    max_iter=max_iter,
                                    start_iter=arguments['iteration'])

    # Report trainable parameter count (stdout, as in the original).
    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(n_params)

    model = do_train(cfg, model, train_loader, optimizer, scheduler,
                     checkpointer, device, arguments, args)
    return model
def train(cfg, args):
    """Quantization-aware SSD training.

    After restoring ``args.ckpt``, runs a statistics pass (``do_run``) to
    collect activation statistics, then freezes a mean + 3*std clipping value
    into every ``ReLU`` and enables its quantized mode before handing off to
    ``do_train``.

    FIX: the two statistic-pass messages were emitted through the root logger
    (``logging.info``) while everything else in this function uses the
    ``'SSD.trainer'`` logger; depending on root configuration they could be
    dropped or formatted differently. They now go through ``logger``.
    """
    logger = logging.getLogger('SSD.trainer')

    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    if args.distributed:
        # Synchronize BN statistics across processes before wrapping in DDP.
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)

    lr = cfg.SOLVER.LR * args.num_gpus  # scale by num gpus
    optimizer = make_optimizer(cfg, model, lr)
    milestones = [step // args.num_gpus for step in cfg.SOLVER.LR_STEPS]
    scheduler = make_lr_scheduler(cfg, optimizer, milestones)

    arguments = {"iteration": 0}
    save_to_disk = dist_util.get_rank() == 0
    checkpointer = CheckPointer(model, optimizer, scheduler, cfg.OUTPUT_DIR,
                                save_to_disk, logger)
    extra_checkpoint_data = checkpointer.load(args.ckpt)
    arguments.update(extra_checkpoint_data)

    max_iter = cfg.SOLVER.MAX_ITER // args.num_gpus
    train_loader = make_data_loader(cfg, is_train=True,
                                    distributed=args.distributed,
                                    max_iter=max_iter,
                                    start_iter=arguments['iteration'])

    logger.info('==>Start statistic')
    do_run(cfg, model, distributed=args.distributed)
    logger.info('==>End statistic')

    # Freeze collected statistics into each ReLU and switch it to quantized
    # mode. NOTE(review): assumes do_run attached running_mean/running_std
    # and a clip parameter `c` to these modules — confirm against do_run.
    for ops in model.modules():
        if isinstance(ops, torch.nn.ReLU):
            ops.collectStats = False
            # Alternative Laplace-based clip kept from the original:
            # ops.c.data = ops.running_mean + (ops.running_b * laplace[args.actBitwidth])
            ops.c.data = ops.running_mean + (3 * ops.running_std)
            ops.quant = True
    torch.cuda.empty_cache()

    model = do_train(cfg, model, train_loader, optimizer, scheduler,
                     checkpointer, device, arguments, args)
    return model
def train(cfg: CfgNode, args: Namespace, output_dir: Path,
          model_manager: Dict[str, Any], freeze_non_sigma: bool = False):
    """Resume SSD training from the best checkpoint of an experiment dir.

    Builds the model, restores the experiment's best checkpoint, prints the
    model structure, and runs ``do_train`` with the given output directory
    and model manager. Returns the trained model.
    """
    logger = logging.getLogger('SSD.trainer')

    model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)

    # LR scales with GPU count; milestones and iteration budget scale down.
    lr = cfg.SOLVER.LR * args.num_gpus  # scale by num gpus
    optimizer = make_optimizer(cfg, model, lr)
    milestones = [step // args.num_gpus for step in cfg.SOLVER.LR_STEPS]
    scheduler = make_lr_scheduler(cfg, optimizer, milestones)

    arguments = {"iteration": 0}
    save_to_disk = dist_util.get_rank() == 0
    checkpointer = CheckPointer(model, optimizer, scheduler, cfg.OUTPUT_DIR,
                                save_to_disk, logger)
    # Resume from the best checkpoint recorded for this experiment.
    resume_from = checkpointer.get_best_from_experiment_dir(cfg)
    arguments.update(checkpointer.load(f=resume_from))

    max_iter = cfg.SOLVER.MAX_ITER // args.num_gpus
    train_loader = make_data_loader(cfg, is_train=True,
                                    distributed=args.distributed,
                                    max_iter=max_iter,
                                    start_iter=arguments['iteration'])

    print_model(model)

    return do_train(cfg, model, train_loader, optimizer, scheduler,
                    checkpointer, device, arguments, args, output_dir,
                    model_manager)
def train(cfg, args):
    """Build, checkpoint-restore, and train an SSD model (debug variant that
    prints the data loader's batch size before training).

    Returns the model produced by ``do_train``.
    """
    logger = logging.getLogger('SSD.trainer')
    model = build_detection_model(cfg)  # build the model
    # Device comes from cfg — keeps file config separate from CLI args.
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
        # model = nn.DataParallel(model)

    lr = cfg.SOLVER.LR * args.num_gpus  # scale by num gpus
    optimizer = make_optimizer(cfg, model, lr)  # build the optimizer
    milestones = [step // args.num_gpus for step in cfg.SOLVER.LR_STEPS]
    scheduler = make_lr_scheduler(cfg, optimizer, milestones)

    arguments = {"iteration": 0}
    save_to_disk = dist_util.get_rank() == 0
    # Checkpoint save/load helper; save_dir enables writing to disk.
    checkpointer = CheckPointer(model, optimizer, scheduler,
                                save_dir=cfg.OUTPUT_DIR,
                                save_to_disk=save_to_disk,
                                logger=logger)
    # Load model state (explicitly not using the latest checkpoint).
    extra_checkpoint_data = checkpointer.load(f='', use_latest=False)
    arguments.update(extra_checkpoint_data)

    # Build the training data loader.
    max_iter = cfg.SOLVER.MAX_ITER // args.num_gpus
    train_loader = make_data_loader(cfg, is_train=True,
                                    distributed=args.distributed,
                                    max_iter=max_iter,
                                    start_iter=arguments['iteration'])
    print("dataloader: ", train_loader.batch_size)
    # exit(1232)

    # Run the training loop.
    model = do_train(cfg, model, train_loader, optimizer, scheduler,
                     checkpointer, device, arguments, args)
    return model
def active_train(cfg, args):
    """Active-learning training loop for SSD.

    Trains on an initial labeled pool, then for ``args.query_step`` rounds:
    queries the strategy for new samples to label, adds them to the labeled
    pool, refits, scores, and appends per-round metrics to a CSV file.

    Returns the underlying detection model (``model.model``).

    FIX: the log message 'Adding labeled samples to train dataset' was split
    across a physical line break inside a single-quoted string literal —
    a syntax error. Also removed the unused local ``max_iter`` and the
    redundant ``checkpointer = None`` pre-assignment.
    """
    logger = logging.getLogger("SSD.trainer")

    raw_model = build_detection_model(cfg)
    device = torch.device(cfg.MODEL.DEVICE)
    raw_model.to(device)

    lr = cfg.SOLVER.LR * args.num_gpus
    optimizer = make_optimizer(cfg, raw_model, lr)
    milestones = [step // args.num_gpus for step in cfg.SOLVER.LR_STEPS]
    scheduler = make_lr_scheduler(cfg, optimizer, milestones)

    arguments = {"iteration": 0}
    save_to_disk = dist_util.get_rank() == 0
    checkpointer = CheckPointer(raw_model, optimizer, scheduler,
                                args.model_dir, save_to_disk, logger)

    # Build the raw training dataset (the query loader slices it into
    # labeled/unlabeled pools).
    is_train = True
    train_transform = build_transforms(cfg, is_train=is_train)
    target_transform = build_target_transform(cfg) if is_train else None
    dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST
    datasets = build_dataset(dataset_list, transform=train_transform,
                             target_transform=target_transform,
                             is_train=is_train)

    logger.info('Creating query loader...')
    query_loader = QueryLoader(datasets[0], args, cfg)

    logger.info('Creating al model...')
    strategy = get_strategy(args.strategy)
    model = ALModel(raw_model, strategy, optimizer, device, scheduler,
                    arguments, args, checkpointer, cfg)

    # --- initial training on the seed pool --------------------------------
    logger.info(f'Training on initial data with size {args.init_size}...')
    n_bbox = query_loader.len_annotations()
    t1 = time.time()
    model.fit(query_loader.get_labeled_loader())
    init_time = time.time() - t1

    logger.info('Scoring after initial training...')
    score = model.score()
    logger.info(f'SCORE : {score:.4f}')
    # CSV row: strategy, params, round, score, train_time, active_time,
    # total_time, labeled-pool size, bbox count.
    fields = [args.strategy, {}, 0, score, init_time, 0, init_time,
              len(query_loader), n_bbox]
    save_to_csv(args.filename, fields)

    # --- active-learning rounds -------------------------------------------
    for step in range(args.query_step):
        logger.info(f'STEP NUMBER {step}')
        logger.info('Querying assets to label')
        t1 = time.time()
        query_idx = model.query(
            unlabeled_loader=query_loader.get_unlabeled_loader(),
            cfg=cfg, args=args, step=step,
            n_instances=args.query_size,
            length_ds=len(datasets[0]))
        logger.info('Adding labeled samples to train dataset')
        query_loader.add_to_labeled(query_idx, step + 1)
        t2 = time.time()

        logger.info('Fitting with new data...')
        model.fit(query_loader.get_labeled_loader())
        total_time = time.time() - t1
        train_time = time.time() - t2
        active_time = total_time - train_time

        logger.info('Scoring model...')
        score = model.score()
        n_bbox = query_loader.len_annotations()
        fields = [args.strategy, {}, step + 1, score, train_time,
                  active_time, total_time, len(query_loader), n_bbox]
        save_to_csv(args.filename, fields)
        logger.info(f'SCORE : {score:.4f}')

    return model.model