def main(args):
    """Evaluation entry point: build the model, load the nvdata validation split,
    optionally restore a checkpoint, and run evaluation."""
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    # IPython.embed()
    os.system("sudo chmod -R 777 /home/shuxuang/.cache/")

    model, criterion, postprocessors = build_model(args)  # use the same model as the DETR paper on COCO
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    # optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
    #                               weight_decay=args.weight_decay)
    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    # dataset_train = build_dataset(image_set='train', args=args)
    # dataset_val = build_dataset(image_set='val', args=args)
    # modify the dataset from coco to nvdata
    # home_dir = os.environ["HOME"]
    # # on local
    # dataset_train_ = build_nvdataset(dataset_root=[
    #     os.path.join(os.environ["HOME"], 'datasets/annotation_sql_nvidia'),
    #     os.path.join(os.environ["HOME"], 'datasets/frames_nvidia')],
    #     mode='train')
    # dataset_val = build_nvdataset(dataset_root=[
    #     os.path.join(os.environ["HOME"], 'datasets/test'),
    #     os.path.join(os.environ["HOME"], 'datasets/frames_nvidia')],
    #     mode='test', camera=args.camera)
    dataset_val = build_nvdataset(
        dataset_root=[
            os.path.join(os.environ["HOME"], 'datasets/test'),
            os.path.join(os.environ["HOME"], 'datasets/test')],
        mode='test',
        camera=args.camera)
    # indices_50k = np.load(os.path.join(os.environ["HOME"], 'datasets/id_1_criterion_Max_SSD_num_labels_50000.npy'))
    # # on maglev
    # dataset_train_ = build_nvdataset(dataset_root=[args.dataset_root_sql, args.dataset_root_img],
    #                                  mode='train')
    # dataset_val = build_nvdataset(dataset_root=[args.dataset_root_test, args.dataset_root_sql],
    #                               mode='test', camera=args.camera)
    # indices_50k = np.load(os.path.join(args.root_indices))
    # dataset_train = Subset(dataset_train_, indices_50k)
    print("Validation samples: %d" % (len(dataset_val)))
    # IPython.embed()

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    # args.resume = os.path.join(os.environ["HOME"], 'datasets/exps_detr_base/checkpoint0299.pth')
    # args.resume = '/home/shuxuang/datasets/exps_detr_base/checkpoint0299.pth'
    print(args.resume)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            print('Loading model: %s' % args.resume)
            checkpoint = torch.load(args.resume, map_location='cpu')
        print('Loaded model from epoch %d' % (checkpoint['epoch'] + 1))
        model_without_ddp.load_state_dict(checkpoint['model'])

    if args.eval:
        if args.dataset_file == 'nvdata':
            evaluate(model, dataset_val, postprocessors, device)
        else:
            evaluate_5classes(model, dataset_val, postprocessors, device)
    return model, dataset_val, postprocessors, device

# NOTE: this redefines main() above; if both versions are kept in one module,
# only the last definition takes effect.
def main(args):
    """Visualization entry point: build the model, load the nvdata test split,
    restore <resume>/checkpoint.pth, and visualize predicted boxes."""
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)
    # IPython.embed()

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    # IPython.embed()
    os.system("sudo chmod -R 777 /home/shuxuang/.cache/")

    model, criterion, postprocessors = build_model(args)  # use the same model as the DETR paper on COCO
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    dataset_val = build_nvdataset(
        dataset_root=[
            os.path.join(os.environ["HOME"], 'datasets/detection-f'),  # test
            os.path.join(os.environ["HOME"], 'datasets/frames_nvidia')],
        mode='test',
        camera=args.camera)
    # dataset_val = build_nvdataset(dataset_root=[args.dataset_root_test, args.dataset_root_sql],
    #                               mode='test', camera=args.camera)
    print("Validation samples: %d" % (len(dataset_val)))
    IPython.embed()  # interactive inspection of the dataset
    # compute how many boxes in the test dataset for each image
    # accumulate_bboxes_numbers(dataset_val)

    # dataset_train_ = build_nvdataset(dataset_root=[
    #     os.path.join(os.environ["HOME"], 'datasets/annotation_sql_nvidia'),
    #     os.path.join(os.environ["HOME"], 'datasets/frames_nvidia')],
    #     mode='train', camera=args.camera)
    # indices_50k = np.load(os.path.join(os.environ["HOME"], 'datasets/id_1_criterion_Max_SSD_num_labels_50000.npy'))
    # dataset_train = Subset(dataset_train_, indices_50k)
    # print("Train samples: %d" % (len(dataset_train_)))
    # print(len(dataset_val))

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    # args.resume = os.path.join(os.environ["HOME"], 'datasets/exps_detr_base/checkpoint0299.pth')
    # args.resume = '/home/shuxuang/datasets/exps_detr_base/checkpoint0299.pth'
    log_path = args.resume
    log = os.path.join(log_path, 'log.txt')
    # read_log(log)
    # IPython.embed()
    args.resume = os.path.join(log_path, 'checkpoint.pth')
    print(args.resume)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            print('Loading model: %s' % args.resume)
            checkpoint = torch.load(args.resume, map_location='cpu')
        print('Loaded model from epoch %d' % checkpoint['epoch'])
        model_without_ddp.load_state_dict(checkpoint['model'])

    if args.eval:
        vis_bboxes(model, dataset_val, postprocessors, device)
        # inference_time(model, dataset_val, postprocessors, device)
    return model, dataset_val, postprocessors, device

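# The inference_time() helper referenced above is not shown in this file. The sketch
# below is an assumed, minimal latency probe, not the project's actual implementation;
# it presumes dataset_val[i] yields a (CHW image tensor, target) pair and that the
# model accepts a batched tensor directly.
def inference_time_sketch(model, dataset_val, postprocessors, device, num_samples=100):
    """Rough per-image forward-pass timing (hypothetical helper)."""
    model.eval()
    timings = []
    with torch.no_grad():
        for idx in range(min(num_samples, len(dataset_val))):
            img, _ = dataset_val[idx]
            img = img.unsqueeze(0).to(device)
            if device.type == 'cuda':
                torch.cuda.synchronize()  # make sure timing covers the full forward pass
            start = time.time()
            _ = model(img)
            if device.type == 'cuda':
                torch.cuda.synchronize()
            timings.append(time.time() - start)
    print('mean forward time: %.4f s over %d images'
          % (sum(timings) / len(timings), len(timings)))
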
# NOTE: this redefines main() above; if all versions are kept in one module,
# only this last definition takes effect.
def main(args):
    """Training entry point: build the model, optimizer, and nvdata loaders,
    then run the training loop and periodically save checkpoints."""
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    # IPython.embed()
    # os.system("sudo chmod -R 777 /home/shuxuang/.cache/")

    model, criterion, postprocessors = build_model(args)  # use the same model as the DETR paper on COCO
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    # separate learning rates for the backbone and the rest of the model
    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    # dataset_train = build_dataset(image_set='train', args=args)
    # dataset_val = build_dataset(image_set='val', args=args)
    # modify the dataset from coco to nvdata
    # home_dir = os.environ["HOME"]
    # dataset_train_ = build_nvdataset(dataset_root=[
    #     os.path.join(os.environ["HOME"], 'datasets/annotation_sql_nvidia'),
    #     os.path.join(os.environ["HOME"], 'datasets/frames_nvidia')],
    #     mode='train')
    # dataset_val = build_nvdataset(dataset_root=[
    #     os.path.join(os.environ["HOME"], 'datasets/test'),
    #     os.path.join(os.environ["HOME"], 'datasets/frames_nvidia')],
    #     mode='test')
    # indices_50k = np.load(os.path.join(os.environ["HOME"], 'datasets/id_1_criterion_Max_SSD_num_labels_50000.npy'))
    dataset_train = build_nvdataset(
        dataset_root=[args.dataset_root_sql, args.dataset_root_img],
        mode='train',
        camera=args.camera)
    dataset_val = build_nvdataset(
        dataset_root=[args.dataset_root_test, args.dataset_root_test],
        mode='test',
        camera=args.camera)
    if args.root_indices is not None:
        # train on a fixed 50k subset selected by the indices file
        indices_50k = np.load(os.path.join(args.root_indices))
        # indices_50k = np.load(os.path.join(os.environ["HOME"], 'datasets/id_1_criterion_Max_SSD_num_labels_50000.npy'))
        dataset_train = Subset(dataset_train, indices_50k)
    # IPython.embed()
    print("Train samples: %d" % (len(dataset_train)))

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)

    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val, args.batch_size,
                                 sampler=sampler_val,
                                 drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    # if args.dataset_file == "coco_panoptic":
    #     # We also evaluate AP during panoptic training, on original coco DS
    #     coco_val = datasets.coco.build("val", args)
    #     base_ds = get_coco_api_from_dataset(coco_val)
    # elif args.dataset_file == "nvdata":
    #     coco_val = datasets.coco.build("val", args)
    #     base_ds = get_coco_api_from_dataset(coco_val)
    # else:
    #     base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    # if args.eval:
    #     test_stats, coco_evaluator = evaluate_nvdata(model, criterion, postprocessors,
    #                                                  data_loader_val, base_ds, device, args.output_dir)
    #     if args.output_dir:
    #         utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth")
    #     return
    # if args.eval:
    #     evaluate(model, dataset_val, postprocessors, device)

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch, args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 50 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 50 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        # test_stats, coco_evaluator = evaluate_nvdata(
        #     model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir
        # )
        # log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
        #              **{f'test_{k}': v for k, v in test_stats.items()},
        #              'epoch': epoch,
        #              'n_parameters': n_parameters}
        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            # if coco_evaluator is not None:
            #     (output_dir / 'eval').mkdir(exist_ok=True)
            #     if "bbox" in coco_evaluator.coco_eval:
            #         filenames = ['latest.pth']
            #         if epoch % 50 == 0:
            #             filenames.append(f'{epoch:03}.pth')
            #         for name in filenames:
            #             torch.save(coco_evaluator.coco_eval["bbox"].eval,
            #                        output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))

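# The training loop above appends one JSON object per epoch to <output_dir>/log.txt,
# and the visualization entry point references a read_log() helper that is not shown
# in this file. The sketch below is an assumed implementation for loading those logs;
# the name and return format are illustrative only.
def read_log_sketch(log_file):
    """Parse the per-epoch JSON lines written to log.txt into a list of dicts."""
    stats = []
    with open(log_file, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                stats.append(json.loads(line))
    return stats
# e.g. losses = [s.get('train_loss') for s in read_log_sketch('log.txt')]
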
def load_model(args):
    """Build the model and the nvdata validation set, load a fixed checkpoint,
    and return (model, dataset_val, postprocessors, device) for downstream use."""
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    # IPython.embed()
    os.system("sudo chmod -R 777 /home/shuxuang/.cache/")

    model, criterion, postprocessors = build_model(args)  # use the same model as the DETR paper on COCO
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    # optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
    #                               weight_decay=args.weight_decay)
    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    # dataset_train = build_dataset(image_set='train', args=args)
    # dataset_val = build_dataset(image_set='val', args=args)
    # modify the dataset from coco to nvdata
    # home_dir = os.environ["HOME"]
    # # on local
    # dataset_train_ = build_nvdataset(dataset_root=[
    #     os.path.join(os.environ["HOME"], 'datasets/annotation_sql_nvidia'),
    #     os.path.join(os.environ["HOME"], 'datasets/frames_nvidia')],
    #     mode='train')
    dataset_val = build_nvdataset(
        dataset_root=[
            os.path.join(os.environ["HOME"], 'datasets/test'),
            os.path.join(os.environ["HOME"], 'datasets/frames_nvidia')],
        mode='test')
    # indices_50k = np.load(os.path.join(os.environ["HOME"], 'datasets/id_1_criterion_Max_SSD_num_labels_50000.npy'))
    # # on maglev
    # dataset_train_ = build_nvdataset(dataset_root=[args.dataset_root_sql, args.dataset_root_img],
    #                                  mode='train')
    # dataset_val = build_nvdataset(dataset_root=[args.dataset_root_test, args.dataset_root_sql],
    #                               mode='test')
    # indices_50k = np.load(os.path.join(args.root_indices))
    # dataset_train = Subset(dataset_train_, indices_50k)
    print("Validation samples: %d" % (len(dataset_val)))
    # IPython.embed()

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    # args.resume = os.path.join(os.environ["HOME"], 'datasets/exps_detr_base/checkpoint0299.pth')
    # NOTE: hard-coded checkpoint path; this overrides any --resume value passed in.
    args.resume = '/home/shuxuang/datasets/exps_detr_base/checkpoint0299.pth'
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            print('Loading model: %s' % args.resume)
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
    # if args.eval:
    #     evaluate(model, dataset_val, postprocessors, device)
    return model, dataset_val, postprocessors, device


# if __name__ == '__main__':
#     parser = argparse.ArgumentParser('DETR training and evaluation script',
#                                      parents=[get_args_parser()])
#     args = parser.parse_args()
#     if args.output_dir:
#         Path(args.output_dir).mkdir(parents=True, exist_ok=True)
#     main(args)

# CUDA_VISIBLE_DEVICES=1 dazel run //sandbox/williamz/detr:train_with_eval -- --eval
# test:
# CUDA_VISIBLE_DEVICES=1 dazel run //sandbox/williamz/detr:eval -- --eval --resume /home/shuxuang/experiments/train_output/checkpoint.pth
# get info:
# maglev workflows get 0cf25940-3f00-5c2c-a8e8-1571e986513b
# maglev volumes mount -n train-outputs -v 4977fea-0998-4d5b-b557-ff17605f2098 -p /home/shuxuang/experiments/train_output
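
# Minimal usage sketch for load_model() above. The postprocessor call follows the
# upstream DETR convention (postprocessors['bbox'] maps raw outputs plus per-image
# (height, width) sizes to scores/labels/boxes); whether the nvdata fork keeps that
# exact interface, and the (image, target) structure of dataset_val, are assumptions.
def demo_single_image(args, index=0, score_thresh=0.5):
    """Run one validation image through the model and print the detection count."""
    model, dataset_val, postprocessors, device = load_model(args)
    model.eval()

    img, target = dataset_val[index]
    with torch.no_grad():
        outputs = model(img.unsqueeze(0).to(device))
        # use the transformed image size here; the original size would be needed to
        # map boxes back onto the raw frame
        target_sizes = torch.tensor([img.shape[-2:]], device=device)
        results = postprocessors['bbox'](outputs, target_sizes)[0]

    keep = results['scores'] > score_thresh
    print('detections above %.2f: %d' % (score_thresh, int(keep.sum().item())))
    return results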