def load_pretrained_model(model, pretrained_model):
    """Copy matching parameters from a checkpoint file into ``model``.

    Parameters missing from the checkpoint, or whose shape differs from the
    model's, are skipped with a warning; everything else is written back via
    ``model.set_dict``.

    Args:
        model (nn.Layer): The model to receive the parameters.
        pretrained_model (str): Path to a ``paddle.save``-style params file.

    Raises:
        ValueError: If ``pretrained_model`` does not exist on disk.
    """
    if not os.path.exists(pretrained_model):
        raise ValueError(
            'The pretrained model directory is not Found: {}'.format(
                pretrained_model))

    para_state_dict = paddle.load(pretrained_model)
    model_state_dict = model.state_dict()
    num_params_loaded = 0

    for name in model_state_dict.keys():
        if name not in para_state_dict:
            logger.warning("{} is not in pretrained model".format(name))
            continue
        # Shapes must agree exactly; compare as plain lists to ignore
        # tuple-vs-list representation differences.
        if list(para_state_dict[name].shape) != list(
                model_state_dict[name].shape):
            logger.warning(
                "[SKIP] Shape of pretrained params {} doesn't match.(Pretrained: {}, Actual: {})"
                .format(name, para_state_dict[name].shape,
                        model_state_dict[name].shape))
            continue
        model_state_dict[name] = para_state_dict[name]
        num_params_loaded += 1

    model.set_dict(model_state_dict)
    logger.info("There are {}/{} variables loaded into {}.".format(
        num_params_loaded, len(model_state_dict), model.__class__.__name__))
def main(args):
    """Evaluation entry point: build the config, load weights, run evaluate().

    Args:
        args: Parsed CLI namespace providing ``cfg``, ``model_path``,
            ``num_workers`` and ``output_dir``.

    Raises:
        RuntimeError: If the config declares no validation dataset.
        ValueError: If the validation dataset is empty.
    """
    paddle.set_device("gpu")
    cfg = Config(args.cfg)

    val_dataset = cfg.val_dataset
    if val_dataset is None:
        raise RuntimeError(
            'The verification dataset is not specified in the configuration file.'
        )
    if len(val_dataset) == 0:
        raise ValueError(
            'The length of val_dataset is 0. Please check if your dataset is valid'
        )

    config_banner = '\n---------------Config Information---------------\n'
    config_banner += str(cfg)
    config_banner += '------------------------------------------------'
    logger.info(config_banner)

    model = cfg.model
    if args.model_path:
        load_pretrained_model(model, args.model_path)
        logger.info('Loaded trained params of model successfully')

    evaluate(model,
             val_dataset,
             num_workers=args.num_workers,
             output_dir=args.output_dir)
def main(args):
    """Single-image inference entry point: detect 3D boxes and draw them.

    Runs the model on ``args.input_path``, filters low-score detections,
    decodes 3D box corners and writes the visualization to
    ``args.output_path``.
    """
    paddle.set_device("gpu")
    cfg = Config(args.cfg)

    model = cfg.model
    model.eval()
    if args.model_path:
        load_pretrained_model(model, args.model_path)
        logger.info('Loaded trained params of model successfully')

    # Hard-coded camera intrinsics for the demo image; the model consumes
    # the inverse intrinsics plus the resize ratio as camera info.
    K = np.array([[[2055.56, 0, 939.658], [0, 2055.56, 641.072], [0, 0, 1]]],
                 np.float32)
    K_inverse = paddle.to_tensor(np.linalg.inv(K))

    img, ori_img_size, output_size = get_img(args.input_path)
    ratio = paddle.to_tensor(get_ratio(ori_img_size, output_size))
    cam_info = [K_inverse, ratio]

    total_pred = model(img, cam_info)

    # Keep only detections whose score (last column) exceeds 0.25.
    keep_idx = paddle.nonzero(total_pred[:, -1] > 0.25)
    total_pred = paddle.gather(total_pred, keep_idx)

    if total_pred.shape[0] > 0:
        # Dimensions come out as (l, h, w); roll to match encode_box3d's
        # expected ordering.
        pred_dimensions = total_pred[:, 6:9].roll(shifts=1, axis=1)
        pred_rotys = total_pred[:, 12]
        pred_locations = total_pred[:, 9:12]
        bbox_3d = encode_box3d(pred_rotys, pred_dimensions, pred_locations,
                               paddle.to_tensor(K), (1280, 1920))
    else:
        bbox_3d = total_pred

    img_draw = cv2.imread(args.input_path)
    for det_idx in range(bbox_3d.shape[0]):
        corners = bbox_3d[det_idx].transpose([1, 0]).numpy()
        img_draw = draw_box_3d(img_draw, corners)
    cv2.imwrite(args.output_path, img_draw)
def resume(model, optimizer, resume_model):
    """Restore model and optimizer state from a checkpoint directory.

    Args:
        model (nn.Layer): The model whose parameters are restored.
        optimizer (paddle.optimizer.Optimizer): The optimizer whose state is
            restored.
        resume_model (str|None): Checkpoint directory named ``..._<iter>``;
            ``None`` means nothing to resume.

    Returns:
        int|None: The iteration number parsed from the directory name, or
            ``None`` when ``resume_model`` is ``None``.

    Raises:
        ValueError: If ``resume_model`` does not exist on disk.
    """
    if resume_model is None:
        logger.info('No model needed to resume.')
        return

    logger.info('Resume model from {}'.format(resume_model))
    if not os.path.exists(resume_model):
        raise ValueError(
            'Directory of the model needed to resume is not Found: {}'.
            format(resume_model))

    resume_model = os.path.normpath(resume_model)
    model.set_state_dict(
        paddle.load(os.path.join(resume_model, 'model.pdparams')))
    optimizer.set_state_dict(
        paddle.load(os.path.join(resume_model, 'model.pdopt')))
    # Directory is expected to end in "_<iter>"; recover the iteration count.
    return int(resume_model.split('_')[-1])
def main(args):
    """Training entry point: build the config and launch train().

    Args:
        args: Parsed CLI namespace providing ``cfg``, ``learning_rate``,
            ``iters``, ``batch_size``, ``save_dir``, ``resume_model``,
            ``save_interval``, ``log_iters``, ``num_workers`` and
            ``keep_checkpoint_max``.

    Raises:
        RuntimeError: If the config declares no training dataset.
        ValueError: If the training dataset is empty.
    """
    paddle.set_device("gpu")
    cfg = Config(args.cfg,
                 learning_rate=args.learning_rate,
                 iters=args.iters,
                 batch_size=args.batch_size)

    train_dataset = cfg.train_dataset
    if train_dataset is None:
        raise RuntimeError(
            'The training dataset is not specified in the configuration file.')
    elif len(train_dataset) == 0:
        raise ValueError(
            'The length of train_dataset is 0. Please check if your dataset is valid'
        )
    # Evaluation during training is currently disabled; restore
    # `cfg.val_dataset if args.do_eval else None` to re-enable it.
    val_dataset = None

    msg = '\n---------------Config Information---------------\n'
    msg += str(cfg)
    msg += '------------------------------------------------'
    logger.info(msg)

    train(cfg.model,
          train_dataset,
          val_dataset=val_dataset,
          optimizer=cfg.optimizer,
          loss_computation=cfg.loss,
          save_dir=args.save_dir,
          iters=cfg.iters,
          batch_size=cfg.batch_size,
          resume_model=args.resume_model,
          save_interval=args.save_interval,
          log_iters=args.log_iters,
          num_workers=args.num_workers,
          keep_checkpoint_max=args.keep_checkpoint_max)
def evaluate(model,
             eval_dataset,
             num_workers=0,
             output_dir="./output",
             print_detail=True):
    """Run batch-size-1 inference over ``eval_dataset`` and report KITTI metrics.

    Collects one prediction array per image id, then hands the full mapping
    to ``kitti_evaluation`` which writes results under ``output_dir``.

    Args:
        model (nn.Layer): The detection model to evaluate.
        eval_dataset (paddle.io.Dataset): Validation dataset; each sample is
            a (images, targets, image_ids) triple.
        num_workers (int, optional): Num workers for data loader. Default: 0.
        output_dir (str, optional): Directory for evaluation output.
            Default: "./output".
        print_detail (bool, optional): Whether to log progress details.
            Default: True.
    """
    model.eval()

    sampler = paddle.io.BatchSampler(eval_dataset,
                                     batch_size=1,
                                     shuffle=False,
                                     drop_last=False)
    loader = paddle.io.DataLoader(
        eval_dataset,
        batch_sampler=sampler,
        num_workers=num_workers,
        return_list=True,
    )
    total_iters = len(loader)

    if print_detail:
        logger.info(
            "Start evaluating (total_samples={}, total_iters={})...".format(
                len(eval_dataset), total_iters))
        progbar_val = progbar.Progbar(target=total_iters, verbose=1)

    reader_timer = TimeAverager()
    batch_timer = TimeAverager()
    tic = time.time()

    predictions = {}
    with paddle.no_grad():
        for step, sample in enumerate(loader):
            reader_timer.record(time.time() - tic)
            images, targets, image_ids = sample[0], sample[1], sample[2]

            output = model(images, targets).numpy()
            # With batch_size=1 there is one id per step, but iterate to
            # stay correct if the sampler ever batches more.
            for img_id in image_ids:
                predictions[img_id] = output

            batch_timer.record(time.time() - tic, num_samples=len(targets))
            if print_detail:
                progbar_val.update(step + 1,
                                   [('batch_cost', batch_timer.get_average()),
                                    ('reader cost',
                                     reader_timer.get_average())])
            reader_timer.reset()
            batch_timer.reset()
            tic = time.time()

    kitti_evaluation(eval_dataset, predictions, output_dir=output_dir)
def train(model,
          train_dataset,
          val_dataset=None,
          optimizer=None,
          loss_computation=None,
          save_dir='output',
          iters=10000,
          batch_size=2,
          resume_model=None,
          save_interval=1000,
          log_iters=10,
          num_workers=0,
          keep_checkpoint_max=5):
    """
    Launch training.

    Args:
        model(nn.Layer): The model to train.
        train_dataset (paddle.io.Dataset): Used to read and process training datasets.
        val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets.
        optimizer (paddle.optimizer.Optimizer): The optimizer.
        loss_computation (nn.Layer): A loss function returning a dict of named losses.
        save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'.
        iters (int, optional): How many iters to train the model. Default: 10000.
        batch_size (int, optional): Mini batch size of one gpu or cpu. Default: 2.
        resume_model (str, optional): The path of resume model.
        save_interval (int, optional): How many iters to save a model snapshot once during training. Default: 1000.
        log_iters (int, optional): Display logging information at every log_iters. Default: 10.
        num_workers (int, optional): Num workers for data loader. Default: 0.
        keep_checkpoint_max (int, optional): Maximum number of checkpoints to save. Default: 5.
    """
    model.train()
    nranks = paddle.distributed.ParallelEnv().nranks
    local_rank = paddle.distributed.ParallelEnv().local_rank

    start_iter = 0
    if resume_model is not None:
        start_iter = resume(model, optimizer, resume_model)

    # Make sure save_dir exists as a directory (replace a plain file if any).
    if not os.path.isdir(save_dir):
        if os.path.exists(save_dir):
            os.remove(save_dir)
        os.makedirs(save_dir)

    if nranks > 1:
        # Initialize parallel environment if not done.
        if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized(
        ):
            paddle.distributed.init_parallel_env()
            ddp_model = paddle.DataParallel(model)
        else:
            ddp_model = paddle.DataParallel(model)

    batch_sampler = paddle.io.DistributedBatchSampler(train_dataset,
                                                      batch_size=batch_size,
                                                      shuffle=True,
                                                      drop_last=True)
    loader = paddle.io.DataLoader(
        train_dataset,
        batch_sampler=batch_sampler,
        num_workers=num_workers,
        return_list=True,
    )

    # VisualDL log
    log_writer = LogWriter(save_dir)

    avg_loss = 0.0
    avg_loss_dict = {}
    iters_per_epoch = len(batch_sampler)
    reader_cost_averager = TimeAverager()
    batch_cost_averager = TimeAverager()
    save_models = deque()
    batch_start = time.time()

    iter = start_iter
    while iter < iters:
        for data in loader:
            iter += 1
            if iter > iters:
                break
            reader_cost_averager.record(time.time() - batch_start)

            images = data[0]
            targets = data[1]

            if nranks > 1:
                predictions = ddp_model(images)
            else:
                predictions = model(images)

            loss_dict = loss_computation(predictions, targets)
            # Total loss is the sum of all named loss components.
            loss = sum(loss for loss in loss_dict.values())
            loss.backward()
            optimizer.step()

            lr = optimizer.get_lr()
            if isinstance(optimizer._learning_rate,
                          paddle.optimizer.lr.LRScheduler):
                optimizer._learning_rate.step()
            model.clear_gradients()

            avg_loss += loss.numpy()[0]  # get the value
            if len(avg_loss_dict) == 0:
                avg_loss_dict = {k: v.numpy()[0] for k, v in loss_dict.items()}
            else:
                for key, value in loss_dict.items():
                    avg_loss_dict[key] += value.numpy()[0]

            batch_cost_averager.record(time.time() - batch_start,
                                       num_samples=batch_size)

            if (iter) % log_iters == 0 and local_rank == 0:
                avg_loss /= log_iters
                for key, value in avg_loss_dict.items():
                    avg_loss_dict[key] /= log_iters
                remain_iters = iters - iter
                avg_train_batch_cost = batch_cost_averager.get_average()
                avg_train_reader_cost = reader_cost_averager.get_average()
                eta = calculate_eta(remain_iters, avg_train_batch_cost)
                logger.info(
                    "[TRAIN] epoch={}, iter={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.5f} | ETA {}"
                    .format((iter - 1) // iters_per_epoch + 1, iter, iters,
                            avg_loss, lr, avg_train_batch_cost,
                            avg_train_reader_cost, eta))

                ######################### VisualDL Log ##########################
                log_writer.add_scalar('Train/loss', avg_loss, iter)
                # Record all losses if there are more than 2 losses.
                for key, value in avg_loss_dict.items():
                    log_tag = 'Train/' + key
                    log_writer.add_scalar(log_tag, value, iter)
                log_writer.add_scalar('Train/lr', lr, iter)
                log_writer.add_scalar('Train/batch_cost', avg_train_batch_cost,
                                      iter)
                log_writer.add_scalar('Train/reader_cost',
                                      avg_train_reader_cost, iter)
                #################################################################

                avg_loss = 0.0
                # BUG FIX: the original code reset `avg_loss_list` here, a
                # variable never read anywhere, so `avg_loss_dict` kept its
                # divided-down values and accumulated on top of them, making
                # every per-key loss logged after the first window wrong.
                avg_loss_dict = {}
                reader_cost_averager.reset()
                batch_cost_averager.reset()

            if (iter % save_interval == 0
                    or iter == iters) and local_rank == 0:
                current_save_dir = os.path.join(save_dir,
                                                "iter_{}".format(iter))
                if not os.path.isdir(current_save_dir):
                    os.makedirs(current_save_dir)
                paddle.save(model.state_dict(),
                            os.path.join(current_save_dir, 'model.pdparams'))
                paddle.save(optimizer.state_dict(),
                            os.path.join(current_save_dir, 'model.pdopt'))
                save_models.append(current_save_dir)
                # Evict the oldest checkpoint once the cap is exceeded.
                if len(save_models) > keep_checkpoint_max > 0:
                    model_to_remove = save_models.popleft()
                    shutil.rmtree(model_to_remove)

            batch_start = time.time()

    # Sleep for half a second to let dataloader release resources.
    time.sleep(0.5)
    log_writer.close()