def detect(cfgfile, weightfile, imgfile):
    """Run single-image detection with a torch checkpoint and plot the result.

    cfgfile    -- darknet .cfg describing the network
    weightfile -- torch state_dict checkpoint path
    imgfile    -- image to run detection on
    """
    model = Darknet(cfgfile)
    model.load_state_dict(torch.load(weightfile))
    print('Loading weights from %s... Done!' % (weightfile))

    # Pick the class-name file matching the (hard-coded) class count.
    num_classes = 20
    if num_classes == 80:
        namesfile = 'data/coco.names'
    elif num_classes == 20:
        namesfile = 'data/voc.names'
    else:
        namesfile = 'data/names'

    use_cuda = 1
    if use_cuda:
        model.cuda()

    bgr_img = cv2.imread(imgfile)   # OpenCV array fed to the detector
    pil_img = Image.open(imgfile)   # PIL image used only for plotting

    t0 = time.time()
    boxes, scale = do_detect(model, bgr_img, 0.5, 0.4, use_cuda)
    t1 = time.time()
    print('%s: Predicted in %f seconds.' % (imgfile, (t1 - t0)))

    class_names = load_class_names(namesfile)
    plot_boxes(pil_img, boxes, 'predictions.jpg', class_names, scale=scale)
def load_model(model_config_file, weight_file, frame_size):
    """Build a Darknet model in inference mode, restore a checkpoint, and
    move it to the GPU in eval mode.

    NOTE(review): frame_size is currently unused here — kept for interface
    compatibility with callers; confirm before removing.
    """
    net = Darknet(model_config_file, inference=True)
    # Checkpoints are stored as {'state_dict': ...}; map straight onto CUDA.
    state = torch.load(weight_file, map_location=torch.device('cuda'))['state_dict']
    net.load_state_dict(state)
    net.eval()
    net.cuda()
    return net
def detect_cv2_camera(cfgfile, weightfile):
    """Run YOLO detection on an RTSP stream and display frames until 'q'.

    cfgfile    -- darknet .cfg describing the network
    weightfile -- torch checkpoint (when args.torch) or darknet .weights file
    """
    import cv2
    m = Darknet(cfgfile)
    # mot_tracker = Sort()
    m.print_network()
    # FIX: weights used to be loaded twice (an unconditional m.load_weights()
    # preceded this branch). Load exactly once, in the right format.
    if args.torch:
        m.load_state_dict(torch.load(weightfile))
    else:
        m.load_weights(weightfile)
    print('Loading weights from %s... Done!' % (weightfile))

    if use_cuda:
        m.cuda()

    # cap = cv2.VideoCapture(0)
    cap = cv2.VideoCapture('rtsp://192.168.1.75:8554/mjpeg/1')
    # cap = cv2.VideoCapture("./test.mp4")
    cap.set(3, 1280)  # CAP_PROP_FRAME_WIDTH
    cap.set(4, 720)   # CAP_PROP_FRAME_HEIGHT
    print("Starting the YOLO loop...")

    # Map the model's class count to the matching names file.
    num_classes = m.num_classes
    if num_classes == 20:
        namesfile = 'data/voc.names'
    elif num_classes == 80:
        namesfile = 'data/coco.names'
    else:
        namesfile = 'data/x.names'
    class_names = load_class_names(namesfile)

    while True:
        ret, img = cap.read()
        if not ret:
            # FIX: stream dropped — exit cleanly instead of crashing in resize.
            break
        sized = cv2.resize(img, (m.width, m.height))
        sized = cv2.cvtColor(sized, cv2.COLOR_BGR2RGB)

        start = time.time()
        boxes = do_detect(m, sized, 0.4, 0.6, use_cuda)
        if boxes is not None:
            # tracked_object = mot_tracker.update(tensorQ)
            finish = time.time()
            print('Predicted in %f seconds.' % (finish - start))
            result_img = plot_boxes_cv2(img, boxes[0], savename=None, class_names=class_names)
            cv2.imshow('Yolo demo', result_img)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()
def detect_cv2_camera(cfgfile, weightfile):
    """Run YOLO detection on the default webcam and display frames until 'q'.

    cfgfile    -- darknet .cfg describing the network
    weightfile -- torch checkpoint (when args.torch) or darknet .weights file
    """
    import cv2
    m = Darknet(cfgfile)
    m.print_network()
    if args.torch:
        m.load_state_dict(torch.load(weightfile))
    else:
        m.load_weights(weightfile)
    print('Loading weights from %s... Done!' % (weightfile))

    if use_cuda:
        m.cuda()

    cap = cv2.VideoCapture(0)
    # cap = cv2.VideoCapture("./test.mp4")
    cap.set(3, 1280)  # CAP_PROP_FRAME_WIDTH
    cap.set(4, 720)   # CAP_PROP_FRAME_HEIGHT
    print("Starting the YOLO loop...")

    # Map the model's class count to the matching names file.
    num_classes = m.num_classes
    if num_classes == 20:
        namesfile = 'data/voc.names'
    elif num_classes == 80:
        namesfile = 'data/coco.names'
    else:
        namesfile = 'data/x.names'
    class_names = load_class_names(namesfile)

    while True:
        ret, img = cap.read()
        if not ret:
            # FIX: camera gone — previously crashed inside cv2.resize(None, ...).
            break
        sized = cv2.resize(img, (m.width, m.height))
        sized = cv2.cvtColor(sized, cv2.COLOR_BGR2RGB)

        start = time.time()
        boxes = do_detect(m, sized, 0.4, 0.6, use_cuda)
        finish = time.time()
        print('Predicted in %f seconds.' % (finish - start))

        result_img = plot_boxes_cv2(img, boxes[0], savename=None, class_names=class_names)
        cv2.imshow('Yolo demo', result_img)
        # FIX: the loop previously never exited, so cap.release() was unreachable.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()  # FIX: tear down the display window on exit
def detect_cv2(cfgfile, weightfile, imgfile):
    """Interactive loop: ask for an image index, run detection, save/count boxes.

    cfgfile    -- darknet .cfg describing the network
    weightfile -- torch checkpoint (when args.torch) or darknet .weights file
    imgfile    -- only used in the timing printout (images are read from the
                  hard-coded ../fotos_geladeira_4/ directory)
    """
    import cv2
    m = Darknet(cfgfile)
    m.print_network()
    # FIX: weights used to be loaded twice (an unconditional m.load_weights()
    # preceded this branch, which also breaks when weightfile is a .pth).
    if args.torch:
        m.load_state_dict(torch.load(weightfile))
    else:
        m.load_weights(weightfile)
    print('Loading weights from %s... Done!' % (weightfile))

    if use_cuda:
        m.cuda()

    # Map the model's class count to the matching names file.
    num_classes = m.num_classes
    if num_classes == 20:
        namesfile = 'data/voc.names'
    elif num_classes == 80:
        namesfile = 'data/coco.names'
    else:
        namesfile = 'data/x.names'
    class_names = load_class_names(namesfile)

    while True:
        val = input("\n numero da imagem: ")
        pred_init_time = time.time()
        named_file = "../fotos_geladeira_4/opencv_frame_" + val + ".png"
        print(named_file)
        img = cv2.imread(named_file)
        # img = cv2.imread(imgfile)
        sized = cv2.resize(img, (m.width, m.height))
        sized = cv2.cvtColor(sized, cv2.COLOR_BGR2RGB)

        # First pass warms up CUDA; only the second run's timing is reported.
        for i in range(2):
            start = time.time()
            boxes = do_detect(m, sized, 0.4, 0.6, use_cuda)
            finish = time.time()
            if i == 1:
                print('%s: Predicted in %f seconds.' % (imgfile, (finish - start)))

        plot_boxes_cv2(img, boxes[0], savename='predictions.jpg', class_names=class_names)
        count_total_in_image(boxes[0], class_names)
        print("\n Total inference time {0} seconds".format(time.time() - pred_init_time))
def load_model(opts, frame_size):
    """Build the frame-size-specific YOLOv4 network, restore its checkpoint,
    and prepare it for gradient-free inference.

    opts       -- options namespace (model_config_dir, weights_dir, gpu_id, no_cuda)
    frame_size -- input resolution selecting yolov4_<size>.cfg / .pth
    """
    cfg_path = f"{opts.model_config_dir}/yolov4_{frame_size}.cfg"
    net = Darknet(cfg_path, inference=True)

    ckpt_path = os.path.join(opts.weights_dir, f"yolov4_{frame_size}.pth")
    ckpt = torch.load(ckpt_path, map_location=f"cuda:{opts.gpu_id}")
    net.load_state_dict(ckpt['state_dict'])
    net.eval()

    if not opts.no_cuda:
        net.cuda(opts.gpu_id)

    # Drop gradient buffers — this model is inference-only.
    for p in net.parameters():
        p.grad = None
    return net
def detect(cfgfile, weightfile, imgfile):
    """Run single-image detection, loading only checkpoint tensors whose
    shapes match the model's parameters (a best-effort partial load).

    cfgfile    -- darknet .cfg describing the network
    weightfile -- torch checkpoint path
    imgfile    -- image to run detection on
    """
    m = Darknet(cfgfile)
    checkpoint = torch.load(weightfile)
    model_dict = m.state_dict()
    pretrained_dict = checkpoint
    keys = list(pretrained_dict.keys())
    # Walk the checkpoint keys in parallel with the model keys, copying a
    # tensor only when the shapes agree.
    # NOTE(review): when a shape mismatch occurs, `i` is NOT advanced, so the
    # two key sequences shift relative to each other — subsequent matches may
    # pair unrelated layers that merely share a shape. Confirm this is the
    # intended partial-load behavior.
    i = 0
    for k, v in model_dict.items():
        # FIX: bound-check `i` — previously an IndexError was raised when the
        # model had more parameters than the checkpoint supplied.
        if i < len(keys) and v.size() == pretrained_dict[keys[i]].size():
            model_dict[k] = pretrained_dict[keys[i]]
            i = i + 1
    m.load_state_dict(model_dict)
    print('Loading weights from %s... Done!' % (weightfile))

    namesfile = 'data/mydata.names'

    use_cuda = 1
    if use_cuda:
        m.cuda()

    input_img = cv2.imread(imgfile)
    # orig_img = Image.open(imgfile).convert('RGB')

    start = time.time()
    boxes, scale = do_detect(m, input_img, 0.5, 0.4, use_cuda)
    finish = time.time()
    print('%s: Predicted in %f seconds.' % (imgfile, (finish - start)))

    class_names = load_class_names(namesfile)
    # draw_boxes(input_img,boxes,scale=scale)
    plot_boxes_cv2(input_img, boxes, 'predictions1.jpg', class_names=class_names, scale=scale)
def train(model, device, config, epochs=5, batch_size=1, save_cp=True, log_step=20, img_scale=0.5):
    """Train `model` on the YOLO dataset described by `config`, evaluating
    with COCO metrics and checkpointing after every epoch.

    Args:
        model: YOLO network to optimize (already moved to `device`).
        device: torch.device used for tensors and the model.
        config: namespace holding dataset paths, batch/subdivision sizes,
            optimizer settings, and checkpoint options.
        epochs: number of passes over the training set.
        batch_size: unused here — the effective loader batch is
            config.batch // config.subdivisions.  # NOTE(review): confirm
        save_cp: whether to write a checkpoint after every epoch.
        log_step: TensorBoard logging interval, in optimizer steps.
        img_scale: unused in this body.  # NOTE(review): confirm
    """
    train_dataset = Yolo_dataset(config.train_label, config, train=True)
    val_dataset = Yolo_dataset(config.val_label, config, train=False)

    n_train = len(train_dataset)
    n_val = len(val_dataset)

    # One loader batch is a "subdivision"; gradients accumulate over
    # config.subdivisions loader batches before each optimizer step.
    train_loader = DataLoader(train_dataset, batch_size=config.batch // config.subdivisions, shuffle=True,
                              num_workers=8, pin_memory=True, drop_last=True, collate_fn=collate)

    val_loader = DataLoader(val_dataset, batch_size=config.batch // config.subdivisions, shuffle=True,
                            num_workers=8, pin_memory=True, drop_last=True, collate_fn=val_collate)

    writer = SummaryWriter(
        log_dir=config.TRAIN_TENSORBOARD_DIR,
        filename_suffix=
        f'OPT_{config.TRAIN_OPTIMIZER}_LR_{config.learning_rate}_BS_{config.batch}_Sub_{config.subdivisions}_Size_{config.width}',
        comment=
        f'OPT_{config.TRAIN_OPTIMIZER}_LR_{config.learning_rate}_BS_{config.batch}_Sub_{config.subdivisions}_Size_{config.width}'
    )
    # writer.add_images('legend',
    #                   torch.from_numpy(train_dataset.label2colorlegend2(cfg.DATA_CLASSES).transpose([2, 0, 1])).to(
    #                       device).unsqueeze(0))
    max_itr = config.TRAIN_EPOCHS * n_train  # total iteration budget (not used below)
    # global_step = cfg.TRAIN_MINEPOCH * n_train
    global_step = 0
    logging.info(f'''Starting training:
        Epochs:          {epochs}
        Batch size:      {config.batch}
        Subdivisions:    {config.subdivisions}
        Learning rate:   {config.learning_rate}
        Training size:   {n_train}
        Validation size: {n_val}
        Checkpoints:     {save_cp}
        Device:          {device.type}
        Images size:     {config.width}
        Optimizer:       {config.TRAIN_OPTIMIZER}
        Dataset classes: {config.classes}
        Train label path:{config.train_label}
        Pretrained:
    ''')

    # learning rate setup
    def burnin_schedule(i):
        # LR factor by iteration: quartic warm-up during burn_in, then a
        # step schedule (1.0 -> 0.1 -> 0.01) at the config.steps boundaries.
        if i < config.burn_in:
            factor = pow(i / config.burn_in, 4)
        elif i < config.steps[0]:
            factor = 1.0
        elif i < config.steps[1]:
            factor = 0.1
        else:
            factor = 0.01
        return factor

    if config.TRAIN_OPTIMIZER.lower() == 'adam':
        optimizer = optim.Adam(
            model.parameters(),
            lr=config.learning_rate / config.batch,  # per-sample LR scaling
            betas=(0.9, 0.999),
            eps=1e-08,
        )
    elif config.TRAIN_OPTIMIZER.lower() == 'sgd':
        optimizer = optim.SGD(
            params=model.parameters(),
            lr=config.learning_rate / config.batch,
            momentum=config.momentum,
            weight_decay=config.decay,
        )
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, burnin_schedule)
    criterion = Yolo_loss(device=device, batch=config.batch // config.subdivisions, n_classes=config.classes)
    # scheduler = ReduceLROnPlateau(optimizer, mode='max', verbose=True, patience=6, min_lr=1e-7)
    # scheduler = CosineAnnealingWarmRestarts(optimizer, 0.001, 1e-6, 20)

    save_prefix = 'Yolov4_epoch'
    saved_models = deque()
    model.train()
    for epoch in range(epochs):
        # model.train()
        epoch_loss = 0
        epoch_step = 0

        with tqdm(total=n_train, desc=f'Epoch {epoch + 1}/{epochs}', unit='img', ncols=50) as pbar:
            for i, batch in enumerate(train_loader):
                global_step += 1
                epoch_step += 1
                images = batch[0]
                bboxes = batch[1]

                images = images.to(device=device, dtype=torch.float32)
                bboxes = bboxes.to(device=device)

                bboxes_pred = model(images)
                loss, loss_xy, loss_wh, loss_obj, loss_cls, loss_l2 = criterion(bboxes_pred, bboxes)
                # loss = loss / config.subdivisions
                loss.backward()

                epoch_loss += loss.item()

                # Gradient accumulation: step once per `subdivisions` batches.
                if global_step % config.subdivisions == 0:
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()

                if global_step % (log_step * config.subdivisions) == 0:
                    writer.add_scalar('train/Loss', loss.item(), global_step)
                    writer.add_scalar('train/loss_xy', loss_xy.item(), global_step)
                    writer.add_scalar('train/loss_wh', loss_wh.item(), global_step)
                    writer.add_scalar('train/loss_obj', loss_obj.item(), global_step)
                    writer.add_scalar('train/loss_cls', loss_cls.item(), global_step)
                    writer.add_scalar('train/loss_l2', loss_l2.item(), global_step)
                    # LR is logged unscaled by batch; get_lr() is deprecated in
                    # newer torch in favor of get_last_lr().  # NOTE(review)
                    writer.add_scalar('lr', scheduler.get_lr()[0] * config.batch, global_step)
                    pbar.set_postfix(**{
                        'loss (batch)': loss.item(),
                        'loss_xy': loss_xy.item(),
                        'loss_wh': loss_wh.item(),
                        'loss_obj': loss_obj.item(),
                        'loss_cls': loss_cls.item(),
                        'loss_l2': loss_l2.item(),
                        'lr': scheduler.get_lr()[0] * config.batch
                    })
                    logging.debug('Train step_{}: loss : {},loss xy : {},loss wh : {},'
                                  'loss obj : {},loss cls : {},loss l2 : {},lr : {}'
                                  .format(global_step, loss.item(), loss_xy.item(),
                                          loss_wh.item(), loss_obj.item(),
                                          loss_cls.item(), loss_l2.item(),
                                          scheduler.get_lr()[0] * config.batch))

                pbar.update(images.shape[0])

            # End-of-epoch evaluation on a fresh inference-mode copy of the
            # weights (leaves `model` itself in train mode).
            if cfg.use_darknet_cfg:
                eval_model = Darknet(cfg.cfgfile, inference=True)
            else:
                eval_model = Yolov4(cfg.pretrained, n_classes=cfg.classes, inference=True)
            # eval_model = Yolov4(yolov4conv137weight=None, n_classes=config.classes, inference=True)
            # Unwrap DataParallel if present.
            if torch.cuda.device_count() > 1:
                eval_model.load_state_dict(model.module.state_dict())
            else:
                eval_model.load_state_dict(model.state_dict())
            eval_model.to(device)
            evaluator = evaluate(eval_model, val_loader, config, device)
            del eval_model

            # COCO bbox metrics: AP, AP50, AP75, AP small/medium/large,
            # then AR at 1/10/100 detections and AR small/medium/large.
            stats = evaluator.coco_eval['bbox'].stats
            writer.add_scalar('train/AP', stats[0], global_step)
            writer.add_scalar('train/AP50', stats[1], global_step)
            writer.add_scalar('train/AP75', stats[2], global_step)
            writer.add_scalar('train/AP_small', stats[3], global_step)
            writer.add_scalar('train/AP_medium', stats[4], global_step)
            writer.add_scalar('train/AP_large', stats[5], global_step)
            writer.add_scalar('train/AR1', stats[6], global_step)
            writer.add_scalar('train/AR10', stats[7], global_step)
            writer.add_scalar('train/AR100', stats[8], global_step)
            writer.add_scalar('train/AR_small', stats[9], global_step)
            writer.add_scalar('train/AR_medium', stats[10], global_step)
            writer.add_scalar('train/AR_large', stats[11], global_step)

            if save_cp:
                try:
                    # os.mkdir(config.checkpoints)
                    os.makedirs(config.checkpoints, exist_ok=True)
                    logging.info('Created checkpoint directory')
                except OSError:
                    pass
                save_path = os.path.join(config.checkpoints, f'{save_prefix}{epoch + 1}.pth')
                torch.save(model.state_dict(), save_path)
                logging.info(f'Checkpoint {epoch + 1} saved !')
                saved_models.append(save_path)
                # Keep only the newest keep_checkpoint_max checkpoints on disk.
                if len(saved_models) > config.keep_checkpoint_max > 0:
                    model_to_remove = saved_models.popleft()
                    try:
                        os.remove(model_to_remove)
                    except:
                        logging.info(f'failed to remove {model_to_remove}')

    writer.close()
# Script entry: build the model from config, optionally load pretrained
# weights, and launch training (saving state on Ctrl-C).
cfg = get_args(**Cfg)
os.environ["CUDA_VISIBLE_DEVICES"] = cfg.gpu
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
logging.info(f'Using device {device}')

if cfg.use_darknet_cfg:
    # Darknet-cfg path: network structure from .cfg, weights in darknet format.
    model = Darknet(cfg.cfgfile)
    if cfg.pretrained:
        model.load_weights(cfg.pretrained)
else:
    model = Yolov4(cfg.pretrained, n_classes=cfg.classes)

if cfg.load:
    # Resume from a torch state_dict checkpoint.
    pretrained_dict = torch.load(cfg.load, map_location=torch.device('cuda'))
    model.load_state_dict(pretrained_dict)

if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)
model.to(device=device)

try:
    train(
        model=model,
        config=cfg,
        epochs=cfg.TRAIN_EPOCHS,
        device=device,
    )
except KeyboardInterrupt:
    # Preserve progress when training is interrupted manually.
    torch.save(model.state_dict(), 'INTERRUPTED.pth')
    logging.info('Saved interrupt')
def train(model, device, config, epochs=5, batch_size=1, save_cp=True, log_step=20, img_scale=0.5):
    """Train (or only evaluate) a YOLOv4 model according to `config`.

    Args:
        model: YOLO network to optimize (already moved to `device`).
        device: torch.device used for tensors and the model.
        config: namespace with dataset paths, batch/subdivision sizes,
            optimizer settings, and evaluation flags.
        epochs: number of passes over the training set.
        batch_size: unused — the effective loader batch is
            config.batch // config.subdivisions.  # NOTE(review): confirm
        save_cp: whether to write a checkpoint after every epoch.
        log_step: TensorBoard logging interval, in optimizer steps.
        img_scale: unused in this body.  # NOTE(review): confirm
    """
    # TODO: add resume support. What does resume need?
    # Everything in config, everything in yolov4-custom.cfg, the weights,
    # the epoch index, and where the learning rate currently stands.
    # Build datasets.
    # config.train_label is the path of the label text file, e.g. data/coins.txt.
    train_dataset = Yolo_dataset(config.train_label, config, train=True)
    val_dataset = Yolo_dataset(config.val_label, config, train=False)
    # Dataset lengths.
    n_train = len(train_dataset)
    n_val = len(val_dataset)
    # Build dataloaders.
    # With pin_memory=False, num_workers=0 (main process only): works fine.
    # With pin_memory=True,  num_workers=8: hangs.
    # With pin_memory=False, num_workers=8: hangs.
    # With pin_memory=True,  num_workers=0: works fine.
    # Conclusion: the hang comes from num_workers > 0 (worker processes).
    # The dataset loads images with OpenCV, and some OpenCV functions spawn
    # their own threads by default; threads nested inside loader workers can
    # deadlock (whether it hangs may depend on the operating system).
    # Fix 1 (recommended): call cv2.setNumThreads(0) right after importing cv2
    #   in the dataset module to disable OpenCV's internal threading.
    # Fix 2 (not recommended): load/preprocess with PIL — slower than OpenCV.
    train_loader = DataLoader(train_dataset, batch_size=config.batch // config.subdivisions, shuffle=True,
                              num_workers=8, pin_memory=True, drop_last=True)
    val_loader = DataLoader(val_dataset, batch_size=config.batch // config.subdivisions, shuffle=False,
                            num_workers=8, pin_memory=True, drop_last=False, collate_fn=val_collate)

    if config.only_evaluate or config.evaluate_when_train:
        # Ground-truth COCO-style json built from the validation loader.
        tgtFile = makeTgtJson(val_loader, config.categories)

    writer = SummaryWriter(log_dir=config.TRAIN_TENSORBOARD_DIR,
                           filename_suffix=f'OPT_{config.TRAIN_OPTIMIZER}_LR_{config.learning_rate}_BS_{config.batch}_Sub_{config.subdivisions}_Size_{config.width}',
                           comment=f'OPT_{config.TRAIN_OPTIMIZER}_LR_{config.learning_rate}_BS_{config.batch}_Sub_{config.subdivisions}_Size_{config.width}')

    # Maximum number of iterations.
    max_itr = config.TRAIN_EPOCHS * n_train
    # Global iteration counter.
    global_step = 0
    logging.info(f'''Starting training:
        Epochs:          {epochs}
        Batch size:      {config.batch}
        Subdivisions:    {config.subdivisions}
        Learning rate:   {config.learning_rate}
        Training size:   {n_train}
        Validation size: {n_val}
        Checkpoints:     {save_cp}
        Device:          {device.type}
        Images size:     {config.width}
        Optimizer:       {config.TRAIN_OPTIMIZER}
        Dataset classes: {config.classes}
        Train label path:{config.train_label}
        Pretrained: {config.pretrainedWeight is not None or config.Pretrained is not None}
    ''')

    if config.only_evaluate:
        # Evaluation-only mode: rebuild the network from the darknet cfg and
        # copy the current weights into it.
        if config.use_darknet_cfg:
            eval_model = Darknet(config.cfgfile)
        else:
            raise NotImplementedError
        # Handle the torch.nn.DataParallel wrapper if present.
        if torch.cuda.device_count() > 1:
            eval_model.load_state_dict(model.module.state_dict())
        else:
            eval_model.load_state_dict(model.state_dict())
        eval_model.to(device)
        eval_model.eval()
        resFile = evaluate(eval_model, config.val_label, config.dataset_dir, device == torch.device("cuda"))
        if resFile is None:
            debugPrint("detect 0 boxes in the val set")
            return
        cocoEvaluate(tgtFile, resFile)
        return

    # learning rate setup
    # Custom LR schedule: ramp up first, then step down in stages.
    def burnin_schedule(i):
        # i is the iteration index, not the epoch index.
        if i < config.burn_in:
            # Quartic ramp-up phase; factor multiplies the base LR.
            factor = pow(i / config.burn_in, 4)
        elif i < config.steps[0]:
            # Stage one.
            factor = 1.0
        elif i < config.steps[1]:
            # Stage two.
            factor = 0.1
        else:
            # Stage three.
            factor = 0.01
        return factor

    if config.TRAIN_OPTIMIZER.lower() == 'adam':
        # Adam is the default.
        optimizer = optim.Adam(
            model.parameters(),
            lr=config.learning_rate / config.batch,  # effective LR = configured LR / batch size
            betas=(0.9, 0.999),  # Adam-specific; defaults are fine
            eps=1e-08,  # Adam-specific; defaults are fine
        )
    elif config.TRAIN_OPTIMIZER.lower() == 'sgd':
        optimizer = optim.SGD(
            params=model.parameters(),
            lr=config.learning_rate / config.batch,
            momentum=config.momentum,
            weight_decay=config.decay,
        )
    # PyTorch's dedicated interface for LR scheduling.
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, burnin_schedule)
    # Loss module placed after the YOLO network (the network itself only maps
    # an image to three output tensors); this module holds no learnable weights.
    criterion = Yolo_loss(device=device, batch=config.batch // config.subdivisions, n_classes=config.classes)

    save_prefix = 'Yolov4_epoch'
    saved_models = deque()
    for epoch in range(epochs):
        epoch_loss = 0
        epoch_step = 0
        model.train()
        logging.info("===Train===")
        for i, batch in enumerate(train_loader):
            global_step += 1
            epoch_step += 1
            images = batch[0]
            bboxes = batch[1]

            images = images.to(device=device, dtype=torch.float32)
            bboxes = bboxes.to(device=device)

            bboxes_pred = model(images)
            loss, loss_xy, loss_wh, loss_obj, loss_cls, loss_l2 = criterion(bboxes_pred, bboxes)
            loss.backward()

            epoch_loss += loss.item()

            # Gradient accumulation: step once per `subdivisions` batches.
            if global_step % config.subdivisions == 0:
                optimizer.step()
                scheduler.step()
                model.zero_grad()

            logging.info("Epoch:[{:3}/{}],step:[{:3}/{}],total loss:{:.2f}|lr:{:.5f}".format(
                epoch + 1, epochs, i + 1, len(train_loader), loss.item(), scheduler.get_last_lr()[0]))

            if global_step % (log_step * config.subdivisions) == 0:
                # log_step defaults to 20 and is counted in optimizer steps.
                writer.add_scalar('train/Loss', loss.item(), global_step)
                writer.add_scalar('train/loss_xy', loss_xy.item(), global_step)
                writer.add_scalar('train/loss_wh', loss_wh.item(), global_step)
                writer.add_scalar('train/loss_obj', loss_obj.item(), global_step)
                writer.add_scalar('train/loss_cls', loss_cls.item(), global_step)
                writer.add_scalar('train/loss_l2', loss_l2.item(), global_step)
                writer.add_scalar('lr', scheduler.get_last_lr()[0] * config.batch, global_step)
                logging.debug('Train step_{}: loss : {},loss xy : {},loss wh : {},'
                              'loss obj : {},loss cls : {},loss l2 : {},lr : {}'
                              .format(global_step, loss.item(), loss_xy.item(),
                                      loss_wh.item(), loss_obj.item(),
                                      loss_cls.item(), loss_l2.item(),
                                      scheduler.get_last_lr()[0] * config.batch))

        if save_cp:  # True
            # Create the checkpoints directory.
            if not os.path.exists(config.checkpoints):
                os.makedirs(config.checkpoints, exist_ok=True)  # exist_ok=True tolerates an existing directory; with exist_ok=False an existing directory raises
                logging.info('Created checkpoint directory')
            save_path = os.path.join(config.checkpoints, f'{save_prefix}{epoch + 1}.weights')
            # Handle the torch.nn.DataParallel special case.
            if torch.cuda.device_count() > 1:
                model.module.save_weights(save_path)
            else:
                model.save_weights(save_path)
            logging.info(f'Checkpoint {epoch + 1} saved !')
            # Keep only the newest keep_checkpoint_max checkpoints,
            # automatically deleting older ones.
            saved_models.append(save_path)
            if len(saved_models) > config.keep_checkpoint_max > 0:
                model_to_remove = saved_models.popleft()
                try:
                    os.remove(model_to_remove)
                except:
                    logging.info(f'failed to remove {model_to_remove}')

        if config.evaluate_when_train:
            try:
                model.eval()
                resFile = evaluate(model, config.val_label, config.dataset_dir,
                                   device == torch.device("cuda"), config.width, config.height)
                if resFile is None:
                    continue
                stats = cocoEvaluate(tgtFile, resFile)
                logging.info("===Val===")
                logging.info("Epoch:[{:3}/{}],AP:{:.3f}|AP50:{:.3f}|AP75:{:.3f}|APs:{:.3f}|APm:{:.3f}|APl:{:.3f}".format(
                    epoch + 1, epochs, stats[0], stats[1], stats[2], stats[3], stats[4], stats[5]))
                logging.info("Epoch:[{:3}/{}],AR1:{:.3f}|AR10:{:.3f}|AR100:{:.3f}|ARs:{:.3f}|ARm:{:.3f}|ARl:{:.3f}".format(
                    epoch + 1, epochs, stats[6], stats[7], stats[8], stats[9], stats[10], stats[11]))
                writer.add_scalar('train/AP', stats[0], global_step)
                writer.add_scalar('train/AP50', stats[1], global_step)
                writer.add_scalar('train/AP75', stats[2], global_step)
                writer.add_scalar('train/AP_small', stats[3], global_step)
                writer.add_scalar('train/AP_medium', stats[4], global_step)
                writer.add_scalar('train/AP_large', stats[5], global_step)
                writer.add_scalar('train/AR1', stats[6], global_step)
                writer.add_scalar('train/AR10', stats[7], global_step)
                writer.add_scalar('train/AR100', stats[8], global_step)
                writer.add_scalar('train/AR_small', stats[9], global_step)
                writer.add_scalar('train/AR_medium', stats[10], global_step)
                writer.add_scalar('train/AR_large', stats[11], global_step)
            except Exception as e:
                # Evaluation failures must not kill the training run.
                debugPrint("evaluate meets an exception, here is the exception info:")
                traceback.print_exc()
                debugPrint("ignore error in evaluate and continue training")

    writer.close()
# Script entry: initialise logging/config, build the model, restore weights
# by file extension, and launch training.
logging = init_logger(log_dir='log')
cfg = get_args(**Cfg)
assert cfg.batch_size >= cfg.subdivisions, 'Batch size should be >= subdivisions'
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
logging.info(f'Using device {device}')

# Initialise model
model = Darknet(cfg.cfgfile)
start_epoch = cfg.start_epoch
# Choose the loading path by checkpoint format:
#   .weights -> darknet binary format; .pth -> torch checkpoint dict.
_, extension = os.path.splitext(cfg.pretrained)
if extension == '.weights':
    model.load_weights(cfg.pretrained)
elif extension == '.pth':
    ckpt = torch.load(cfg.pretrained)
    model.load_state_dict(ckpt['state_dict'])
    if 'epoch' in ckpt:
        # Resume the epoch counter stored in the checkpoint.
        start_epoch = ckpt['epoch']
# if torch.cuda.device_count() > 1:
#     model = torch.nn.DataParallel(model)
model.to(device=device)

try:
    train(model=model,
          config=cfg,
          epochs=cfg.num_epochs,
          device=device,
          start_epoch=start_epoch,
          batch_size=cfg.batch_size)
except KeyboardInterrupt:
# NOTE(review): this chunk is truncated here — the KeyboardInterrupt handler
# body continues beyond this view.
    # Tail of an enclosing function whose definition lies outside this chunk:
    # hands the configured logger back to the caller.
    return logging


if __name__ == "__main__":
    # Evaluation entry point: load the model and the ground-truth annotations.
    logging = init_logger(log_dir='log')
    cfg = get_args(**Cfg)
    os.environ["CUDA_VISIBLE_DEVICES"] = cfg.gpu
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logging.info(f'Using device {device}')

    model = Darknet(cfg.model_config, inference=True)
    model.print_network()
    # Checkpoint is a dict holding 'state_dict'; mapped straight onto CUDA.
    checkpoint = torch.load(
        cfg.weights_file, map_location=torch.device('cuda'))
    model.load_state_dict(checkpoint['state_dict'])
    # model.load_weights(cfg.weights_file)
    model.eval()  # set model away from training
    # if torch.cuda.device_count() > 1:
    #     model = torch.nn.DataParallel(model)
    model.to(device=device)

    # Ground-truth annotations must be a valid JSON file; abort otherwise.
    annotations_file_path = cfg.gt_annotations_path
    with open(annotations_file_path) as annotations_file:
        try:
            annotations = json.load(annotations_file)
        except:
            print("annotations file not a json")
            exit()
# Continuation of an argv-parsing chain whose start lies outside this chunk.
elif len(sys.argv) == 7:
    n_classes = int(sys.argv[1])
    weightfile = sys.argv[2]
    imgfile = sys.argv[3]
    height = sys.argv[4]  # NOTE(review): left as a string, unlike width — confirm whether int() was intended
    width = int(sys.argv[5])
    namesfile = int(sys.argv[6])  # NOTE(review): int() applied to a names-file path looks like a bug — it should probably stay a string
else:
    print('Usage: ')
    print(' python models.py num_classes weightfile imgfile namefile')

# model = Yolov4(yolov4conv137weight=None, n_classes=n_classes, inference=True)
model = Darknet('../cfg/yolov4.cfg', inference=True)

# Restore the torch checkpoint (stored as {'state_dict': ...}).
pretrained_dict = torch.load(weightfile, map_location=torch.device('cuda'))
model.load_state_dict(pretrained_dict['state_dict'])
# model.load_weights(weightfile)

use_cuda = True
if use_cuda:
    model.cuda()

# Accept either a single image path or a directory of images.
if os.path.isdir(imgfile):
    names = os.listdir(imgfile)
    abs_names = list(map(lambda x: os.path.join(imgfile, x), names))
else:
    abs_names = [imgfile]

for i, abs_name in tqdm(enumerate(abs_names)):
    img = cv2.imread(abs_name)
    # Inference input size is 416*416 does not mean training size is the same
def train(
        model,
        device,
        config,
        epochs=5,
        save_cp=True,
        log_step=20,
):
    """Train the BEV-flat YOLOv4 variant, evaluating every other epoch and
    checkpointing when the validation loss improves.

    Args:
        model: BEV YOLO network to optimize (already moved to `device`).
        device: torch.device used for tensors and the model.
        config: namespace holding dataset, batch/subdivision, optimizer and
            checkpoint settings.
        epochs: number of passes over the training set.
        save_cp: whether to write checkpoints on validation improvement.
        log_step: TensorBoard logging interval, in optimizer steps.

    Fixes over the previous revision:
      * `min_eval_loss` is initialised once before the epoch loop — it used to
        be reset to +inf every epoch, so every evaluation saved a checkpoint.
      * the objectness loss is accumulated into `eval_loss_obj` — it used to be
        added to `eval_loss_rot`, leaving `eval_loss_obj` always 0 and
        inflating `eval_loss_rot`.
      * the Val debug log no longer calls `.item()` on plain Python floats
        (which raised AttributeError the first time it ran).
      * validation predictions come from `eval_model` (in eval mode) rather
        than the train-mode `model`.
      * `global_step`/`epoch_step` are no longer incremented by the validation
        loop, so the gradient-accumulation boundary stays aligned.
    """
    # Get dataloaders
    train_dataset = Yolo_BEV_dataset(config, split="train")
    val_dataset = Yolo_BEV_dataset(config, split="val")

    train_loader = DataLoader(
        train_dataset,
        batch_size=config.batch // config.subdivisions,
        shuffle=True,
        num_workers=8,
        pin_memory=True,
        drop_last=True,
        collate_fn=collate,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=config.batch // config.subdivisions,
        shuffle=True,
        num_workers=8,
        pin_memory=True,
        drop_last=True,
        collate_fn=collate,
    )

    # define summary writer
    writer = SummaryWriter(
        log_dir=config.TRAIN_TENSORBOARD_DIR,
        filename_suffix=
        f"OPT_{config.TRAIN_OPTIMIZER}_LR_{config.learning_rate}_BS_{config.batch}_Sub_{config.subdivisions}_Size_{config.width}",
        comment=
        f"OPT_{config.TRAIN_OPTIMIZER}_LR_{config.learning_rate}_BS_{config.batch}_Sub_{config.subdivisions}_Size_{config.width}",
    )

    # log
    n_train = len(train_dataset)
    n_val = len(val_dataset)
    global_step = 0
    logging.info(f"""Starting training:
        Epochs:          {config.epochs}
        Batch size:      {config.batch}
        Subdivisions:    {config.subdivisions}
        Learning rate:   {config.learning_rate}
        Training size:   {n_train}
        Validation size: {n_val}
        Checkpoints:     {save_cp}
        Device:          {device.type}
        Input height:    {config.height}
        Input width:     {config.width}
        Optimizer:       {config.TRAIN_OPTIMIZER}
        Dataset classes: {config.classes}
    """)

    # learning rate setup
    def burnin_schedule(i):
        # Quartic warm-up for burn_in iterations, then a step schedule
        # (1.0 -> 0.1 -> 0.01) at the config.steps boundaries.
        if i < config.burn_in:
            factor = pow(i / config.burn_in, 4)
        elif i < config.steps[0]:
            factor = 1.0
        elif i < config.steps[1]:
            factor = 0.1
        else:
            factor = 0.01
        return factor

    # optimizer + scheduler
    if config.TRAIN_OPTIMIZER.lower() == "adam":
        optimizer = optim.Adam(
            model.parameters(),
            lr=config.learning_rate / config.batch,
            betas=(0.9, 0.999),
            eps=1e-08,
        )
    elif config.TRAIN_OPTIMIZER.lower() == "sgd":
        optimizer = optim.SGD(
            params=model.parameters(),
            lr=config.learning_rate / config.batch,
            momentum=config.momentum,
            weight_decay=config.decay,
        )
    # scheduler multiplies learning rate by a factor calculated on iteration
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, burnin_schedule)

    # loss function
    criterion = Yolo_loss(
        cfg=config,
        device=device,
    )

    # start training
    save_prefix = "Yolov4_BEV_flat_epoch"
    saved_models = deque()
    min_eval_loss = math.inf  # FIX: track best validation loss across epochs
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        epoch_step = 0

        with tqdm(total=n_train, desc=f"Epoch {epoch + 1}/{epochs}", unit="img", ncols=75) as pbar:
            for i, batch in enumerate(train_loader):
                # get batch
                global_step += 1
                epoch_step += 1
                images = batch[0].float().to(device=device)
                labels = batch[1]

                # compute loss
                preds = model(images)[0]
                loss, loss_xy, loss_wl, loss_rot, loss_obj, loss_noobj = criterion(preds, labels)
                loss.backward()
                epoch_loss += loss.item()

                # update weights once per `subdivisions` accumulated batches
                if global_step % config.subdivisions == 0:
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()

                # log
                if global_step % (log_step * config.subdivisions) == 0:
                    writer.add_scalar("train/Loss", loss.item(), global_step)
                    writer.add_scalar("train/loss_xy", loss_xy.item(), global_step)
                    writer.add_scalar("train/loss_wl", loss_wl.item(), global_step)
                    writer.add_scalar("train/loss_rot", loss_rot.item(), global_step)
                    writer.add_scalar("train/loss_obj", loss_obj.item(), global_step)
                    writer.add_scalar("train/loss_noobj", loss_noobj.item(), global_step)
                    writer.add_scalar("lr", scheduler.get_lr()[0] * config.batch, global_step)
                    pbar.set_postfix({
                        "loss (batch)": loss.item(),
                        "loss_xy": loss_xy.item(),
                        "loss_wl": loss_wl.item(),
                        "loss_rot": loss_rot.item(),
                        "loss_obj": loss_obj.item(),
                        "loss_noobj": loss_noobj.item(),
                        "lr": scheduler.get_lr()[0] * config.batch,
                    })
                    logging.debug(
                        "Train step_{}: loss : {},loss xy : {},loss wl : {},"
                        "loss rot : {},loss obj : {},loss noobj : {},lr : {}".format(
                            global_step,
                            loss.item(),
                            loss_xy.item(),
                            loss_wl.item(),
                            loss_rot.item(),
                            loss_obj.item(),
                            loss_noobj.item(),
                            scheduler.get_lr()[0] * config.batch,
                        ))

                pbar.update(images.shape[0])

        # evaluate model every other epoch
        if epoch % 2 == 0:
            # Fresh inference-mode copy so `model` stays in train mode.
            # NOTE(review): reads the global `cfg` (not `config`) as before.
            eval_model = Darknet(cfg.cfgfile, inference=True, model_type="BEV_flat")
            if torch.cuda.device_count() > 1:
                eval_model.load_state_dict(model.module.state_dict())
            else:
                eval_model.load_state_dict(model.state_dict())
            eval_model.to(device)
            eval_model.eval()

            eval_loss = 0.0
            eval_loss_xy = 0.0
            eval_loss_wl = 0.0
            eval_loss_rot = 0.0
            eval_loss_obj = 0.0
            eval_loss_noobj = 0.0
            with tqdm(total=n_val, desc=f"Eval {(epoch + 1) // 2}", unit="img", ncols=75) as epbar:
                for batch in val_loader:
                    images = batch[0].float().to(device=device)
                    labels = batch[1]

                    # FIX: run the eval-mode copy, without building a graph.
                    with torch.no_grad():
                        labels_pred = eval_model(images)[0]
                    loss, loss_xy, loss_wl, loss_rot, loss_obj, loss_noobj = criterion(labels_pred, labels)
                    eval_loss += loss.item()
                    eval_loss_xy += loss_xy.item()
                    eval_loss_wl += loss_wl.item()
                    eval_loss_rot += loss_rot.item()
                    eval_loss_obj += loss_obj.item()  # FIX: was added to eval_loss_rot
                    eval_loss_noobj += loss_noobj.item()

                    epbar.update(images.shape[0])

            # log (accumulators are plain floats — no .item())
            logging.debug(
                "Val step_{}: loss : {},loss xy : {},loss wl : {},"
                "loss rot : {},loss obj : {},loss noobj : {},lr : {}".format(
                    global_step,
                    eval_loss,
                    eval_loss_xy,
                    eval_loss_wl,
                    eval_loss_rot,
                    eval_loss_obj,
                    eval_loss_noobj,
                    scheduler.get_lr()[0] * config.batch,
                ))

            del eval_model

            # save checkpoint only when the validation loss improved
            if save_cp and eval_loss < min_eval_loss:
                min_eval_loss = eval_loss
                try:
                    os.makedirs(config.checkpoints, exist_ok=True)
                    logging.info("Created checkpoint directory")
                except OSError:
                    pass
                save_path = os.path.join(config.checkpoints, f"{save_prefix}{epoch + 1}.pth")
                torch.save(model.state_dict(), save_path)
                logging.info(f"Checkpoint {epoch + 1} saved !")
                saved_models.append(save_path)
                # keep only the newest keep_checkpoint_max checkpoints
                if len(saved_models) > config.keep_checkpoint_max > 0:
                    model_to_remove = saved_models.popleft()
                    try:
                        os.remove(model_to_remove)
                    except OSError:
                        logging.info(f"failed to remove {model_to_remove}")

    writer.close()
# Script entry for BEV training: build the model, load backbone or full
# weights, freeze the backbone, and launch training.
os.environ["CUDA_VISIBLE_DEVICES"] = cfg.gpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.info(f"Using device {device}")
cfg.device = device

# load model and push to device
model = Darknet(cfg.cfgfile, model_type="BEV_flat")
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)
model.to(device=device)

# load weights
if cfg.load is None:
    # Fresh run: only the darknet backbone (first 53 layers) is restored.
    model.load_weights(cfg.backbone, cut_off=53)
else:
    model.load_state_dict(cfg.load)  # NOTE(review): if cfg.load is a checkpoint *path*, this needs torch.load(cfg.load) first — confirm

# freeze backbone
# NOTE(review): when wrapped in DataParallel, freeze_layers/load_weights are
# attributes of model.module, not the wrapper — confirm the multi-GPU path.
model.freeze_layers([i for i in range(54)])

try:
    train(
        model=model,
        config=cfg,
        epochs=cfg.epochs,
        device=device,
    )
except KeyboardInterrupt:
    # Preserve progress when training is interrupted manually.
    torch.save(model.state_dict(), "checkpoints/INTERRUPTED.pth")
    logging.info("Saved interrupt to checkpoints/INTERRUPTED.pth")
try:
# NOTE(review): chunk is truncated here — the body of this `try` continues
# beyond this view.