def train(config_path,
          model_dir,
          result_path=None,
          create_folder=False,
          display_step=50,
          summary_step=5,
          pretrained_path=None,
          pretrained_include=None,
          pretrained_exclude=None,
          freeze_include=None,
          freeze_exclude=None,
          multi_gpu=False,
          measure_time=False,
          resume=False):
    """train a VoxelNet model specified by a config file.

    Args:
        config_path: path to a TrainEvalPipelineConfig text proto, or an
            already-parsed config object (see the ``isinstance`` branch).
        model_dir: directory for checkpoints, logs and the config backup.
        result_path: directory for eval outputs; defaults to model_dir/results.
        create_folder: if True and model_dir exists, create a fresh folder
            via torchplus instead of reusing it.
        display_step: log training metrics every N global steps.
        summary_step: unused in this function body.
        pretrained_path / pretrained_include / pretrained_exclude: optional
            checkpoint to warm-start from, with parameter-name filters.
        freeze_include / freeze_exclude: name filters passed to
            freeze_params_v2 after warm-start.
        multi_gpu: wrap net in DataParallel and scale batch size / workers.
        measure_time: collect and print per-stage forward timings.
        resume: must be True to reuse an existing model_dir.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_dir = str(Path(model_dir).resolve())
    if create_folder:
        if Path(model_dir).exists():
            model_dir = torchplus.train.create_folder(model_dir)
    model_dir = Path(model_dir)
    # refuse to silently overwrite a previous run
    if not resume and model_dir.exists():
        raise ValueError("model dir exists and you don't specify resume.")
    model_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    config_file_bkp = "pipeline.config"
    if isinstance(config_path, str):
        # directly provide a config object. this usually used
        # when you want to train with several different parameters in
        # one script.
        config = pipeline_pb2.TrainEvalPipelineConfig()
        with open(config_path, "r") as f:
            proto_str = f.read()
            text_format.Merge(proto_str, config)
    else:
        config = config_path
        proto_str = text_format.MessageToString(config, indent=2)
    # keep a backup of the effective config next to the checkpoints
    with (model_dir / config_file_bkp).open("w") as f:
        f.write(proto_str)

    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config

    net = build_network(model_cfg, measure_time).to(device)
    # if train_cfg.enable_mixed_precision:
    #     net.half()
    #     net.metrics_to_float()
    #     net.convert_norm_to_float(net)
    target_assigner = net.target_assigner
    voxel_generator = net.voxel_generator
    print("num parameters:", len(list(net.parameters())))
    # restore net weights (if a checkpoint exists) BEFORE building the
    # optimizer so the lr scheduler sees the right global step
    torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    if pretrained_path is not None:
        # warm-start: copy only parameters whose name passes the filters
        # AND whose shape matches the current model
        model_dict = net.state_dict()
        pretrained_dict = torch.load(pretrained_path)
        pretrained_dict = filter_param_dict(pretrained_dict,
                                            pretrained_include,
                                            pretrained_exclude)
        new_pretrained_dict = {}
        for k, v in pretrained_dict.items():
            if k in model_dict and v.shape == model_dict[k].shape:
                new_pretrained_dict[k] = v
        print("Load pretrained parameters:")
        for k, v in new_pretrained_dict.items():
            print(k, v.shape)
        model_dict.update(new_pretrained_dict)
        net.load_state_dict(model_dict)
        freeze_params_v2(dict(net.named_parameters()), freeze_include,
                         freeze_exclude)
        # warm-started runs restart step/metric counters from zero
        net.clear_global_step()
        net.clear_metrics()
    if multi_gpu:
        net_parallel = torch.nn.DataParallel(net)
    else:
        net_parallel = net
    optimizer_cfg = train_cfg.optimizer
    loss_scale = train_cfg.loss_scale_factor
    fastai_optimizer = optimizer_builder.build(
        optimizer_cfg, net, mixed=False, loss_scale=loss_scale)
    if loss_scale < 0:
        # negative factor in the config means "let apex choose dynamically"
        loss_scale = "dynamic"
    if train_cfg.enable_mixed_precision:
        max_num_voxels = input_cfg.preprocess.max_number_of_voxels * input_cfg.batch_size
        assert max_num_voxels < 65535, "spconv fp16 training only support this"
        from apex import amp
        net, amp_optimizer = amp.initialize(net, fastai_optimizer,
                                            opt_level="O2",
                                            keep_batchnorm_fp32=True,
                                            loss_scale=loss_scale)
        net.metrics_to_float()
    else:
        amp_optimizer = fastai_optimizer
    # NOTE(review): this restores the pre-amp optimizer object while the
    # loop below steps amp_optimizer — presumably they share state when
    # mixed precision is enabled; confirm against torchplus/amp behavior.
    torchplus.train.try_restore_latest_checkpoints(model_dir,
                                                   [fastai_optimizer])
    lr_scheduler = lr_scheduler_builder.build(optimizer_cfg, amp_optimizer,
                                              train_cfg.steps)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32

    if multi_gpu:
        num_gpu = torch.cuda.device_count()
        print(f"MULTI-GPU: use {num_gpu} gpu")
        collate_fn = merge_second_batch_multigpu
    else:
        collate_fn = merge_second_batch
        num_gpu = 1

    ######################
    # PREPARE INPUT
    ######################
    dataset = input_reader_builder.build(
        input_cfg,
        model_cfg,
        training=True,
        voxel_generator=voxel_generator,
        target_assigner=target_assigner,
        multi_gpu=multi_gpu)
    eval_dataset = input_reader_builder.build(
        eval_input_cfg,
        model_cfg,
        training=False,
        voxel_generator=voxel_generator,
        target_assigner=target_assigner)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=input_cfg.batch_size * num_gpu,
        shuffle=True,
        num_workers=input_cfg.preprocess.num_workers * num_gpu,
        pin_memory=False,
        collate_fn=collate_fn,
        worker_init_fn=_worker_init_fn,
        drop_last=not multi_gpu)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_input_cfg.batch_size, # only support multi-gpu train
        shuffle=False,
        num_workers=eval_input_cfg.preprocess.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)

    ######################
    # TRAINING
    ######################
    model_logging = SimpleModelLog(model_dir)
    model_logging.open()
    model_logging.log_text(proto_str + "\n", 0, tag="config")
    start_step = net.get_global_step()
    total_step = train_cfg.steps
    t = time.time()
    steps_per_eval = train_cfg.steps_per_eval
    clear_metrics_every_epoch = train_cfg.clear_metrics_every_epoch

    amp_optimizer.zero_grad()
    step_times = []
    step = start_step
    try:
        while True:
            if clear_metrics_every_epoch:
                net.clear_metrics()
            for example in dataloader:
                lr_scheduler.step(net.get_global_step())
                time_metrics = example["metrics"]
                example.pop("metrics")
                example_torch = example_convert_to_torch(example, float_dtype)
                batch_size = example["anchors"].shape[0]
                ret_dict = net_parallel(example_torch)
                cls_preds = ret_dict["cls_preds"]
                # .mean() collapses per-GPU values when DataParallel is used
                loss = ret_dict["loss"].mean()
                cls_loss_reduced = ret_dict["cls_loss_reduced"].mean()
                loc_loss_reduced = ret_dict["loc_loss_reduced"].mean()
                cls_pos_loss = ret_dict["cls_pos_loss"].mean()
                cls_neg_loss = ret_dict["cls_neg_loss"].mean()
                loc_loss = ret_dict["loc_loss"]
                cls_loss = ret_dict["cls_loss"]
                cared = ret_dict["cared"]
                labels = example_torch["labels"]
                if train_cfg.enable_mixed_precision:
                    # apex amp scales the loss to avoid fp16 underflow
                    with amp.scale_loss(loss, amp_optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                torch.nn.utils.clip_grad_norm_(net.parameters(), 10.0)
                amp_optimizer.step()
                amp_optimizer.zero_grad()
                net.update_global_step()
                net_metrics = net.update_metrics(cls_loss_reduced,
                                                 loc_loss_reduced, cls_preds,
                                                 labels, cared)
                step_time = (time.time() - t)
                step_times.append(step_time)
                t = time.time()
                metrics = {}
                # pos/neg counts are taken from the FIRST sample of the batch
                num_pos = int((labels > 0)[0].float().sum().cpu().numpy())
                num_neg = int((labels == 0)[0].float().sum().cpu().numpy())
                if 'anchors_mask' not in example_torch:
                    num_anchors = example_torch['anchors'].shape[1]
                else:
                    num_anchors = int(example_torch['anchors_mask'][0].sum())
                global_step = net.get_global_step()
                if global_step % display_step == 0:
                    if measure_time:
                        for name, val in net.get_avg_time_dict().items():
                            print(f"avg {name} time = {val * 1000:.3f} ms")
                    # per-regression-target localization loss, batch-averaged
                    loc_loss_elem = [
                        float(loc_loss[:, :, i].sum().detach().cpu().numpy() /
                              batch_size) for i in range(loc_loss.shape[-1])
                    ]
                    metrics["runtime"] = {
                        "step": global_step,
                        "steptime": np.mean(step_times),
                    }
                    metrics["runtime"].update(time_metrics[0])
                    step_times = []
                    metrics.update(net_metrics)
                    metrics["loss"]["loc_elem"] = loc_loss_elem
                    metrics["loss"]["cls_pos_rt"] = float(
                        cls_pos_loss.detach().cpu().numpy())
                    metrics["loss"]["cls_neg_rt"] = float(
                        cls_neg_loss.detach().cpu().numpy())
                    if model_cfg.use_direction_classifier:
                        dir_loss_reduced = ret_dict["dir_loss_reduced"].mean()
                        metrics["loss"]["dir_rt"] = float(
                            dir_loss_reduced.detach().cpu().numpy())
                    metrics["misc"] = {
                        "num_vox": int(example_torch["voxels"].shape[0]),
                        "num_pos": int(num_pos),
                        "num_neg": int(num_neg),
                        "num_anchors": int(num_anchors),
                        "lr": float(amp_optimizer.lr),
                        "mem_usage": psutil.virtual_memory().percent,
                    }
                    model_logging.log_metrics(metrics, global_step)
                if global_step % steps_per_eval == 0:
                    # periodic checkpoint + full evaluation pass
                    torchplus.train.save_models(model_dir,
                                                [net, amp_optimizer],
                                                net.get_global_step())
                    net.eval()
                    result_path_step = result_path / f"step_{net.get_global_step()}"
                    result_path_step.mkdir(parents=True, exist_ok=True)
                    model_logging.log_text("#################################",
                                           global_step)
                    model_logging.log_text("# EVAL", global_step)
                    model_logging.log_text("#################################",
                                           global_step)
                    model_logging.log_text("Generate output labels...",
                                           global_step)
                    t = time.time()
                    detections = []
                    prog_bar = ProgressBar()
                    net.clear_timer()
                    # ceil(len(dataset) / batch_size) progress ticks
                    prog_bar.start(
                        (len(eval_dataset) + eval_input_cfg.batch_size - 1) //
                        eval_input_cfg.batch_size)
                    for example in iter(eval_dataloader):
                        example = example_convert_to_torch(
                            example, float_dtype)
                        detections += net(example)
                        prog_bar.print_bar()
                    sec_per_ex = len(eval_dataset) / (time.time() - t)
                    model_logging.log_text(
                        f'generate label finished({sec_per_ex:.2f}/s). start eval:',
                        global_step)
                    result_dict = eval_dataset.dataset.evaluation(
                        detections, str(result_path_step))
                    for k, v in result_dict["results"].items():
                        model_logging.log_text("Evaluation {}".format(k),
                                               global_step)
                        model_logging.log_text(v, global_step)
                    model_logging.log_metrics(result_dict["detail"],
                                              global_step)
                    with open(result_path_step / "result.pkl", 'wb') as f:
                        pickle.dump(detections, f)
                    net.train()
                step += 1
                if step >= total_step:
                    break
            if step >= total_step:
                break
    except Exception as e:
        # dump the offending sample's metadata, checkpoint, then re-raise
        print(json.dumps(example["metadata"], indent=2))
        model_logging.log_text(str(e), step)
        model_logging.log_text(json.dumps(example["metadata"], indent=2),
                               step)
        torchplus.train.save_models(model_dir, [net, amp_optimizer], step)
        raise e
    finally:
        model_logging.close()
    # save model before exit
    torchplus.train.save_models(model_dir, [net, amp_optimizer],
                                net.get_global_step())
def train(config_path,
          model_dir,
          result_path=None,
          create_folder=False,
          display_step=50,
          pretrained_path=None,
          pretrained_include=None,
          pretrained_exclude=None,
          freeze_include=None,
          freeze_exclude=None,
          multi_gpu=False,
          measure_time=False,
          resume=False):
    """train a PointPillars model specified by a config file.

    Args:
        config_path: path to (or parsed object of) a TrainEvalPipelineConfig.
        model_dir: directory for checkpoints, logs and the config backup.
        result_path: directory for eval outputs; defaults to model_dir/results.
        create_folder: if True and model_dir exists, create a fresh folder.
        display_step: log training metrics every N global steps.
        pretrained_path / pretrained_include / pretrained_exclude: optional
            warm-start checkpoint with parameter-name filters.
        freeze_include / freeze_exclude: name filters for freezing parameters.
        multi_gpu: forwarded to the input reader (loader itself uses 1 GPU).
        measure_time: unused in this function body.
        resume: must be True to reuse an existing model_dir.
    """
    torch.cuda.empty_cache()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_dir = str(Path(model_dir).resolve())
    if create_folder:
        if Path(model_dir).exists():
            model_dir = torchplus.train.create_folder(model_dir)
    model_dir = Path(model_dir)
    if not resume and model_dir.exists():
        raise ValueError("model dir exists and you don't specify resume.")
    model_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    config, proto_str = load_config(model_dir, config_path)
    input_cfg = config.train_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config

    target_assigner_cfg = model_cfg.target_assigner
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    box_coder.custom_ndim = target_assigner._anchor_generators[0].custom_ndim
    net = PointPillarsNet(1, voxel_generator.grid_size,
                          target_assigner.num_anchors_per_location,
                          target_assigner.box_coder.code_size,
                          with_distance=False).to(device)
    kaiming_init(net, 1.0)
    # loss head is a separate module holding global step + metrics state
    net_loss = build_net_loss(model_cfg, target_assigner).to(device)
    net_loss.clear_global_step()
    net_loss.clear_metrics()
    # print("num parameters:", len(list(net.parameters())))
    load_pretrained_model(net, pretrained_path, pretrained_include,
                          pretrained_exclude, freeze_include, freeze_exclude)
    if resume:
        torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    amp_optimizer, lr_scheduler = create_optimizer(model_dir, train_cfg, net)

    collate_fn = merge_second_batch
    num_gpu = 1
    # FIX: float_dtype was referenced below but never assigned, raising
    # NameError on the first batch. This loop always trains in fp32
    # (plain loss.backward(), no amp), so use torch.float32.
    float_dtype = torch.float32

    ######################
    # PREPARE INPUT
    ######################
    dataset = input_reader_builder.build(input_cfg,
                                         model_cfg,
                                         training=True,
                                         voxel_generator=voxel_generator,
                                         target_assigner=target_assigner,
                                         multi_gpu=multi_gpu)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=input_cfg.batch_size * num_gpu,
        shuffle=True,
        num_workers=input_cfg.preprocess.num_workers * num_gpu,
        pin_memory=False,
        collate_fn=collate_fn,
        worker_init_fn=_worker_init_fn,
        drop_last=not multi_gpu)

    ######################
    # TRAINING
    ######################
    model_logging = SimpleModelLog(model_dir)
    model_logging.open()
    model_logging.log_text(proto_str + "\n", 0, tag="config")
    start_step = net_loss.get_global_step()
    total_step = train_cfg.steps
    t = time.time()
    steps_per_eval = train_cfg.steps_per_eval
    clear_metrics_every_epoch = train_cfg.clear_metrics_every_epoch

    amp_optimizer.zero_grad()
    step_times = []
    step = start_step
    best_mAP = 0
    epoch = 0
    net.train()
    net_loss.train()
    try:
        while True:
            if clear_metrics_every_epoch:
                net_loss.clear_metrics()
            for example in dataloader:
                lr_scheduler.step(net_loss.get_global_step())
                time_metrics = example["metrics"]
                example.pop("metrics")
                example_torch = example_convert_to_torch(example, float_dtype)
                batch_size = example_torch["anchors"].shape[0]
                coors = example_torch["coordinates"]
                # build pillar features from raw voxels, then reshape to the
                # dense layout the network expects
                input_features = compute_model_input(
                    voxel_generator.voxel_size,
                    voxel_generator.point_cloud_range,
                    with_distance=False,
                    voxels=example_torch['voxels'],
                    num_voxels=example_torch['num_points'],
                    coors=coors)
                # input_features = reshape_input(batch_size, input_features, coors, voxel_generator.grid_size)
                input_features = reshape_input1(input_features)
                net.batch_size = batch_size
                preds_list = net(input_features, coors)
                ret_dict = net_loss(example_torch, preds_list)
                cls_preds = ret_dict["cls_preds"]
                loss = ret_dict["loss"].mean()
                cls_loss_reduced = ret_dict["cls_loss_reduced"].mean()
                loc_loss_reduced = ret_dict["loc_loss_reduced"].mean()
                cls_pos_loss = ret_dict["cls_pos_loss"].mean()
                cls_neg_loss = ret_dict["cls_neg_loss"].mean()
                loc_loss = ret_dict["loc_loss"]
                cls_loss = ret_dict["cls_loss"]
                cared = ret_dict["cared"]
                labels = example_torch["labels"]
                loss.backward()
                torch.nn.utils.clip_grad_norm_(net.parameters(), 10.0)
                amp_optimizer.step()
                amp_optimizer.zero_grad()
                net_loss.update_global_step()
                net_metrics = net_loss.update_metrics(cls_loss_reduced,
                                                      loc_loss_reduced,
                                                      cls_preds, labels,
                                                      cared)
                step_time = (time.time() - t)
                step_times.append(step_time)
                t = time.time()
                metrics = {}
                # pos/neg counts from the FIRST sample of the batch only
                num_pos = int((labels > 0)[0].float().sum().cpu().numpy())
                num_neg = int((labels == 0)[0].float().sum().cpu().numpy())
                if 'anchors_mask' not in example_torch:
                    num_anchors = example_torch['anchors'].shape[1]
                else:
                    num_anchors = int(example_torch['anchors_mask'][0].sum())
                global_step = net_loss.get_global_step()
                if global_step % display_step == 0:
                    loc_loss_elem = [
                        float(loc_loss[:, :, i].sum().detach().cpu().numpy() /
                              batch_size) for i in range(loc_loss.shape[-1])
                    ]
                    metrics["runtime"] = {
                        "step": global_step,
                        "steptime": np.mean(step_times),
                    }
                    metrics["runtime"].update(time_metrics[0])
                    step_times = []
                    metrics.update(net_metrics)
                    metrics["loss"]["loc_elem"] = loc_loss_elem
                    metrics["loss"]["cls_pos_rt"] = float(
                        cls_pos_loss.detach().cpu().numpy())
                    metrics["loss"]["cls_neg_rt"] = float(
                        cls_neg_loss.detach().cpu().numpy())
                    if model_cfg.use_direction_classifier:
                        dir_loss_reduced = ret_dict["dir_loss_reduced"].mean()
                        metrics["loss"]["dir_rt"] = float(
                            dir_loss_reduced.detach().cpu().numpy())
                    metrics["misc"] = {
                        "num_vox": int(example_torch["voxels"].shape[0]),
                        "num_pos": int(num_pos),
                        "num_neg": int(num_neg),
                        "num_anchors": int(num_anchors),
                        "lr": float(amp_optimizer.lr),
                        "mem_usage": psutil.virtual_memory().percent,
                    }
                    model_logging.log_metrics(metrics, global_step)
                step += 1
                # NOTE(review): "epoch" is incremented per iteration, so a
                # checkpoint + eval runs every 2 iterations and training
                # stops after 101 — looks like a short-experiment setting;
                # confirm against the intended schedule.
                epoch += 1
                if epoch % 2 == 0:
                    global_step = net_loss.get_global_step()
                    torchplus.train.save_models(model_dir,
                                                [net, amp_optimizer],
                                                global_step)
                    net.eval()
                    net_loss.eval()
                    best_mAP = evaluate(net, net_loss, best_mAP,
                                        voxel_generator, target_assigner,
                                        config, model_logging, model_dir,
                                        result_path)
                    net.train()
                    net_loss.train()
                if epoch > 100:
                    break
            if epoch > 100:
                break
    except Exception as e:
        # dump the offending sample's metadata, checkpoint, then re-raise
        print(json.dumps(example["metadata"], indent=2))
        model_logging.log_text(str(e), step)
        model_logging.log_text(json.dumps(example["metadata"], indent=2),
                               step)
        torchplus.train.save_models(model_dir, [net, amp_optimizer], step)
        raise e
    finally:
        model_logging.close()
    # save model before exit
    torchplus.train.save_models(model_dir, [net, amp_optimizer],
                                net_loss.get_global_step())
pred['scores'] = scores pred['label_preds'] = labels return pred # In[9]: ckpt_path = "/home/ags/second_test/all_fhd.30/voxelnet-29369.tckpt" net = build_network(config.model.second).to(device).float().eval() net.load_state_dict(torch.load(ckpt_path)) eval_input_cfg = config.eval_input_reader eval_input_cfg.dataset.kitti_root_path = root_path eval_input_cfg.dataset.kitti_info_path = info_path dataset = input_reader_builder.build( eval_input_cfg, config.model.second, training=False, voxel_generator=net.voxel_generator, target_assigner=net.target_assigner) #.dataset batch_size = 4 num_workers = 4 dataloader = torch.utils.data.DataLoader( dataset, batch_size=batch_size, # only support multi-gpu train shuffle=False, num_workers=num_workers, pin_memory=False, collate_fn=merge_second_batch) target_assigner = net.target_assigner
def evaluate(config_path,
             model_dir,
             result_path=None,
             predict_test=False,
             ckpt_path=None,
             ref_detfile=None,
             pickle_result=True):
    """Evaluate a trained SECOND model on (a subset of) the KITTI dataset.

    Args:
        config_path: path to a TrainEvalPipelineConfig text proto.
        model_dir: directory holding checkpoints (and default result dir).
        result_path: output directory; defaults under model_dir.
        predict_test: if True, only write predictions (no GT metrics).
        ckpt_path: explicit checkpoint; otherwise latest in model_dir.
        ref_detfile: unused in this function body.
        pickle_result: if True keep annos in memory and pickle them;
            otherwise write KITTI label files and re-read them for metrics.
    """
    # Evaluate on a subset of kitti dataset
    # Setup parameters
    model_dir = pathlib.Path(model_dir)
    if predict_test:
        result_name = 'predict_test'
    else:
        result_name = 'eval_results'
    if result_path is None:
        result_path = model_dir / result_name
    else:
        result_path = pathlib.Path(result_path)
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)

    input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    class_names = list(input_cfg.class_names)
    center_limit_range = model_cfg.post_center_limit_range
    ######################
    # BUILD VOXEL GENERATOR
    ######################
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    # Build the NN in GPU mode
    net = second_builder.build(model_cfg, voxel_generator, target_assigner)
    net.cuda()
    # Further net settings
    if train_cfg.enable_mixed_precision:
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    # Restore old checkpoint if possible
    if ckpt_path is None:
        torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    else:
        torchplus.train.restore(ckpt_path, net)
    # Dataset build for easy usage
    eval_dataset = input_reader_builder.build(input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=input_cfg.batch_size,
        shuffle=False,
        num_workers=input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)
    # Further variable setup
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    # Setup network for evaluation
    net.eval()
    # Further variable setup
    result_path_step = result_path / f"step_{net.get_global_step()}"
    result_path_step.mkdir(parents=True, exist_ok=True)
    t = time.time()
    dt_annos = []
    global_set = None
    print()
    print("Generate output labels...")
    bar = ProgressBar()
    bar.start(len(eval_dataset) // input_cfg.batch_size + 1)
    # Predict each sample info and reformat data as needed
    for example in iter(eval_dataloader):
        example = example_convert_to_torch(example, float_dtype)
        if pickle_result:
            dt_annos += _predict_kitti_to_anno(net, example, class_names,
                                               center_limit_range,
                                               model_cfg.lidar_input,
                                               global_set)
        else:
            _predict_kitti_to_file(net, example, result_path_step,
                                   class_names, center_limit_range,
                                   model_cfg.lidar_input)
        bar.print_bar()  # Update progress
        # NOTE(review): this break stops after the FIRST batch, so only a
        # handful of samples are evaluated — looks like a debugging
        # leftover; remove it to evaluate the full dataset.
        break

    sec_per_example = len(eval_dataset) / (time.time() - t)
    print(f'generate label finished({sec_per_example:.2f}/s). start eval:')
    print(f"avg forward time per example: {net.avg_forward_time:.3f}")
    print(f"avg postprocess time per example: {net.avg_postprocess_time:.3f}")
    # Store the data (in a format specified by user)
    if not predict_test:
        gt_annos = [info["annos"] for info in eval_dataset.dataset.kitti_infos]
        if not pickle_result:
            dt_annos = kitti.get_label_annos(
                result_path_step)  # FIXME: Not sure what is this step
        result = get_official_eval_result(gt_annos, dt_annos, class_names)
        print(result)
        result = get_coco_eval_result(gt_annos, dt_annos, class_names)
        print(result)
        if pickle_result:
            with open(result_path_step / "result.pkl", 'wb') as f:
                pickle.dump(dt_annos, f)
def evaluate(
        config_path,
        model_dir=None,
        result_path=None,
        ckpt_path=None,
        measure_time=False,
        batch_size=None,
        slice_size_perc=100,  # 42 is good with benchmarking
        min_slice_overlap_perc=2,
        deadline_sec=0.5,
        method=0,
        calc_AP=True,
        calc_AP_from_detections_path=None,
        **kwargs):
    """Don't support pickle_result anymore. if you want to generate kitti label file,
    please use kitti_anno_to_label_file and convert_detection_to_kitti_annos
    in second.data.kitti_dataset.

    Runs deadline-aware ("imprecise") inference via imprecise_pp, dumps the
    detections/eval dict to the working directory, and optionally computes
    AP — either from the fresh detections or, when
    calc_AP_from_detections_path is given, from previously pickled
    detections in parallel worker processes.

    NOTE(review): model_dir defaults to None but is passed straight to
    Path(model_dir) below, which would raise — callers apparently always
    supply it; confirm before relying on the default.
    """
    # assert len(kwargs) == 0
    model_dir = str(Path(model_dir).resolve())
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    result_name = 'eval_results'
    if result_path is None:
        model_dir = Path(model_dir)
        result_path = model_dir / result_name
    else:
        result_path = Path(result_path)
    if isinstance(config_path, str):
        # directly provide a config object. this usually used
        # when you want to eval with several different parameters in
        # one script.
        config = pipeline_pb2.TrainEvalPipelineConfig()
        with open(config_path, "r") as f:
            proto_str = f.read()
            text_format.Merge(proto_str, config)
    else:
        config = config_path

    input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config

    net = build_network(model_cfg, measure_time=measure_time).to(device)
    if train_cfg.enable_mixed_precision:
        net.half()
        print("half inference!")
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    target_assigner = net.target_assigner
    voxel_generator = net.voxel_generator

    if ckpt_path is None:
        assert model_dir is not None
        torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    else:
        torchplus.train.restore(ckpt_path, net)

    # inference only — disable autograd bookkeeping on all parameters
    print('Setting all model parameters to no grad')
    for param in net.parameters():
        param.requires_grad = False

    batch_size = batch_size or input_cfg.batch_size
    eval_dataset = input_reader_builder.build(input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0, #input_cfg.preprocess.num_workers,
        pin_memory=True,
        collate_fn=merge_second_batch)

    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32

    # report parameter + buffer memory footprint of the model
    mem_params = sum([
        param.nelement() * param.element_size() for param in net.parameters()
    ])
    mem_bufs = sum(
        [buf.nelement() * buf.element_size() for buf in net.buffers()])
    mem = mem_params + mem_bufs
    print('Memory requirement is: ', mem // 1024, ' kbytes')

    net.eval()
    #print('Last state of the network parameters:')
    #for k, v in dict(net.named_parameters()).items():
    #    print(k, v.shape, 'requires_grad:', v.requires_grad)
    deadline_ms = round(deadline_sec * 1000)
    # NOTE(review): `== None` should idiomatically be `is None`
    if calc_AP_from_detections_path == None:
        print("Generate output labels...")
        t = time.time()
        # imprecise_pp drives inference under the per-sample deadline
        ipp = imprecise_pp(net, eval_dataloader, deadline_sec,
                           slice_size_perc, min_slice_overlap_perc, method,
                           float_dtype, batch_size, True)
        eval_dict, detections = ipp.run_evaluation()
        if eval_dict is None and detections is None:
            # calibration-only mode: nothing to score
            print('Calibration done, exiting')
            return
        sec_per_example = len(eval_dataset) / (time.time() - t)
        print(f'generate label finished({sec_per_example:.2f}/s). start eval:')
        print('After forward, memory_allocated is: ',
              torch.cuda.memory_allocated() // 1024, ' kbytes')
        print('After forward, max_memory_allocated is: ',
              torch.cuda.max_memory_allocated() // 1024, ' kbytes')

        # Print these for humans
        max_len = 0
        for name in net.get_time_dict_stats().keys():
            max_len = max(len(name), max_len)
        print((" " * max_len), "Min\tAvrg\t95perc\t99perc\tMax")
        for name, val in net.get_time_dict_stats().items():
            spaces = " " * (max_len - len(name) + 1)
            print(f"{name}{spaces}{val[0]:.2f}\t{val[1]:.2f}"
                  f"\t{val[2]:.2f}\t{val[3]:.2f}\t{val[4]:.2f} ms")

        print('Dumping detections')
        with open(
                f"detections_m{method}_d{deadline_ms}_s{slice_size_perc}.pickle",
                'wb') as handle:
            pickle.dump(detections, handle, protocol=pickle.HIGHEST_PROTOCOL)

        if calc_AP:
            print('Calculating AP')
            t = time.time()
            result_path_step = result_path / f"step_{net.get_global_step()}"
            result_path_step.mkdir(parents=True, exist_ok=True)
            result_dict = eval_dataset.dataset.evaluation(
                detections, str(result_path_step))
            result_dict['mAP'] = ipp.calc_nusc_mAP(result_dict)
            eval_dict['eval_results_dict'] = result_dict
            for k, v in result_dict["results"].items():
                print("Evaluation {}".format(k))
                print(v)
            elapsed_time = (time.time() - t)
            print(f"Calculating AP took {elapsed_time:.2f} seconds")
            print('Dumping evaluation dictionary file')
            with open(
                    f"eval_dict_m{method}_d{deadline_ms}_s{slice_size_perc}.json",
                    'w') as handle:
                json.dump(eval_dict, handle, indent=4)
    else:  # calc_AP_from_detections_path
        print("Calculate evaluation results from available detections...")
        result_path_step = result_path / f"step_{net.get_global_step()}"
        result_path_step.mkdir(parents=True, exist_ok=True)
        # free GPU memory before forking worker processes
        del eval_dataloader
        del net
        torch.cuda.empty_cache()
        mp.set_start_method('spawn')  # needed
        with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
            futs = []
            paths = glob.glob(calc_AP_from_detections_path +
                              '/detections_*.pickle')
            for i, dets_path in enumerate(paths):
                futs.append(
                    executor.submit(calc_AP_from_dets, dets_path,
                                    str(result_path_step), eval_dataset,
                                    i + 1, len(paths)))
            concurrent.futures.wait(futs)  # is this necessary?
    # EVALUATION END
    print('Done')
def train(
        config_path,  # path to the pipeline config file
        model_dir,  # directory where the model is saved
        result_path=None,
        create_folder=False,
        display_step=50,  # how often training metrics are printed
        summary_step=5,  # summary interval (unused in this body)
        pickle_result=True):
    """train a VoxelNet model specified by a config file.

    Alternates train phases of train_cfg.steps_per_eval steps with a full
    KITTI evaluation, logging to model_dir/log.txt and TensorBoard.
    pickle_result=True keeps detection annos in memory and pickles them;
    otherwise KITTI label files are written and re-read for metrics.
    """
    if create_folder:
        if pathlib.Path(model_dir).exists():
            model_dir = torchplus.train.create_folder(model_dir)

    model_dir = pathlib.Path(model_dir)  # wrap the string into a Path
    # parents=True creates missing parents; exist_ok=True tolerates existing dir
    model_dir.mkdir(parents=True, exist_ok=True)
    eval_checkpoint_dir = model_dir / 'eval_checkpoints'  # permanent eval-time checkpoints
    eval_checkpoint_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    config_file_bkp = "pipeline.config"
    config = pipeline_pb2.TrainEvalPipelineConfig()  # empty config message
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)  # populate config from the text proto
    shutil.copyfile(config_path, str(model_dir / config_file_bkp))  # keep a config backup

    # four config sections: train input, eval input, model, train settings
    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    class_names = list(input_cfg.class_names)  # classes to train on
    ######################
    # BUILD VOXEL GENERATOR
    ######################
    voxel_generator = voxel_builder.build(
        model_cfg.voxel_generator)  # voxelizer instance built from config
    ######################
    # BUILD TARGET ASSIGNER
    ######################
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]  # bird's-eye-view range
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    ######################
    # BUILD NET
    ######################
    center_limit_range = model_cfg.post_center_limit_range
    net = second_builder.build(model_cfg, voxel_generator,
                               target_assigner)  # the network itself is built here
    net.cuda()  # move to GPU
    # net_train = torch.nn.DataParallel(net).cuda()
    print("num_trainable parameters:", len(list(net.parameters())))
    # for n, p in net.named_parameters():
    #     print(n, p.shape)
    ######################
    # BUILD OPTIMIZER
    ######################
    # we need global_step to create lr_scheduler, so restore net first.
    torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    gstep = net.get_global_step() - 1
    optimizer_cfg = train_cfg.optimizer
    if train_cfg.enable_mixed_precision:
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    optimizer = optimizer_builder.build(optimizer_cfg, net.parameters())
    if train_cfg.enable_mixed_precision:
        loss_scale = train_cfg.loss_scale_factor
        mixed_optimizer = torchplus.train.MixedPrecisionWrapper(
            optimizer, loss_scale)
    else:
        mixed_optimizer = optimizer
    # must restore optimizer AFTER using MixedPrecisionWrapper
    torchplus.train.try_restore_latest_checkpoints(model_dir,
                                                   [mixed_optimizer])
    lr_scheduler = lr_scheduler_builder.build(optimizer_cfg, optimizer, gstep)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    ######################
    # PREPARE INPUT
    ######################
    # datasets follow the torch Dataset protocol so DataLoader can wrap them
    dataset = input_reader_builder.build(
        input_cfg,
        model_cfg,
        training=True,
        voxel_generator=voxel_generator,
        target_assigner=target_assigner)
    eval_dataset = input_reader_builder.build(eval_input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner)

    def _worker_init_fn(worker_id):
        # give each dataloader worker a distinct time-based RNG seed
        time_seed = np.array(time.time(), dtype=np.int32)
        np.random.seed(time_seed + worker_id)
        print(f"WORKER {worker_id} seed:", np.random.get_state()[1][0])

    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=input_cfg.batch_size,
        shuffle=True,
        num_workers=input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch,
        worker_init_fn=_worker_init_fn)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_input_cfg.batch_size,
        shuffle=False,
        num_workers=eval_input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)
    data_iter = iter(dataloader)  # manual iterator so epochs can be restarted inline

    ######################
    # TRAINING
    ######################
    log_path = model_dir / 'log.txt'  # plain-text training log
    logf = open(log_path, 'a')
    logf.write(proto_str)
    logf.write("\n")
    summary_dir = model_dir / 'summary'
    summary_dir.mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(str(summary_dir))

    total_step_elapsed = 0
    remain_steps = train_cfg.steps - net.get_global_step()
    t = time.time()  # training start time
    ckpt_start_time = t
    # one loop = one train phase of steps_per_eval steps followed by an eval
    total_loop = train_cfg.steps // train_cfg.steps_per_eval + 1
    # total_loop = remain_steps // train_cfg.steps_per_eval + 1
    clear_metrics_every_epoch = train_cfg.clear_metrics_every_epoch

    if train_cfg.steps % train_cfg.steps_per_eval == 0:
        total_loop -= 1
    mixed_optimizer.zero_grad()
    try:
        for _ in range(total_loop):
            # last phase may be shorter than steps_per_eval
            if total_step_elapsed + train_cfg.steps_per_eval > train_cfg.steps:
                steps = train_cfg.steps % train_cfg.steps_per_eval
            else:
                steps = train_cfg.steps_per_eval  # eval interval in steps
            for step in range(steps):
                lr_scheduler.step()  # update learning rate
                try:
                    example = next(data_iter)  # fetch next batch
                except StopIteration:
                    # dataset exhausted: restart the iterator (new epoch)
                    print("end epoch")
                    if clear_metrics_every_epoch:
                        net.clear_metrics()
                    data_iter = iter(dataloader)
                    example = next(data_iter)
                example_torch = example_convert_to_torch(
                    example, float_dtype)  # convert numpy batch to tensors

                batch_size = example["anchors"].shape[0]

                ret_dict = net(example_torch)  # forward pass: predictions + losses

                # box_preds = ret_dict["box_preds"]
                cls_preds = ret_dict["cls_preds"]
                loss = ret_dict["loss"].mean()
                cls_loss_reduced = ret_dict["cls_loss_reduced"].mean()
                loc_loss_reduced = ret_dict["loc_loss_reduced"].mean()
                cls_pos_loss = ret_dict["cls_pos_loss"]
                cls_neg_loss = ret_dict["cls_neg_loss"]
                loc_loss = ret_dict["loc_loss"]
                cls_loss = ret_dict["cls_loss"]
                dir_loss_reduced = ret_dict["dir_loss_reduced"]
                cared = ret_dict["cared"]
                labels = example_torch["labels"]
                if train_cfg.enable_mixed_precision:  # False in the provided configs
                    loss *= loss_scale
                loss.backward()  # backpropagate
                torch.nn.utils.clip_grad_norm_(
                    net.parameters(), 10.0)  # gradient clipping against explosion
                mixed_optimizer.step()  # apply parameter update
                mixed_optimizer.zero_grad()  # reset gradients
                net.update_global_step()  # advance global step counter
                net_metrics = net.update_metrics(cls_loss_reduced,
                                                 loc_loss_reduced, cls_preds,
                                                 labels, cared)

                step_time = (time.time() - t)  # end timing for this step
                t = time.time()  # start timing the next step
                metrics = {}
                # pos/neg counts from the FIRST sample of the batch only
                num_pos = int((labels > 0)[0].float().sum().cpu().numpy())
                num_neg = int((labels == 0)[0].float().sum().cpu().numpy())
                if 'anchors_mask' not in example_torch:
                    num_anchors = example_torch['anchors'].shape[1]
                else:
                    num_anchors = int(example_torch['anchors_mask'][0].sum())
                global_step = net.get_global_step()
                if global_step % display_step == 0:  # log every display_step steps
                    loc_loss_elem = [
                        float(loc_loss[:, :, i].sum().detach().cpu().numpy() /
                              batch_size) for i in range(loc_loss.shape[-1])
                    ]
                    metrics["step"] = global_step
                    metrics["steptime"] = step_time
                    metrics.update(net_metrics)
                    # NOTE(review): this overwrites any "loss" entry that
                    # net_metrics may have contributed above — confirm
                    # update_metrics does not return a "loss" key.
                    metrics["loss"] = {}
                    metrics["loss"]["loc_elem"] = loc_loss_elem
                    metrics["loss"]["cls_pos_rt"] = float(
                        cls_pos_loss.detach().cpu().numpy())
                    metrics["loss"]["cls_neg_rt"] = float(
                        cls_neg_loss.detach().cpu().numpy())
                    # if unlabeled_training:
                    #     metrics["loss"]["diff_rt"] = float(
                    #         diff_loc_loss_reduced.detach().cpu().numpy())
                    if model_cfg.use_direction_classifier:
                        metrics["loss"]["dir_rt"] = float(
                            dir_loss_reduced.detach().cpu().numpy())
                    metrics["num_vox"] = int(example_torch["voxels"].shape[0])
                    metrics["num_pos"] = int(num_pos)
                    metrics["num_neg"] = int(num_neg)
                    metrics["num_anchors"] = int(num_anchors)
                    metrics["lr"] = float(
                        mixed_optimizer.param_groups[0]['lr'])
                    metrics["image_idx"] = example['image_idx'][0]
                    flatted_metrics = flat_nested_json_dict(metrics)
                    flatted_summarys = flat_nested_json_dict(metrics, "/")
                    # scalars go to TensorBoard; lists become scalar groups
                    for k, v in flatted_summarys.items():
                        if isinstance(v, (list, tuple)):
                            v = {str(i): e for i, e in enumerate(v)}
                            writer.add_scalars(k, v, global_step)
                        else:
                            writer.add_scalar(k, v, global_step)
                    # build one human-readable log line
                    metrics_str_list = []
                    for k, v in flatted_metrics.items():
                        if isinstance(v, float):
                            metrics_str_list.append(f"{k}={v:.3}")
                        elif isinstance(v, (list, tuple)):
                            if v and isinstance(v[0], float):
                                v_str = ', '.join([f"{e:.3}" for e in v])
                                metrics_str_list.append(f"{k}=[{v_str}]")
                            else:
                                metrics_str_list.append(f"{k}={v}")
                        else:
                            metrics_str_list.append(f"{k}={v}")
                    log_str = ', '.join(metrics_str_list)
                    print(log_str, file=logf)
                    print(log_str)
                ckpt_elasped_time = time.time(
                ) - ckpt_start_time  # wall time since the last checkpoint
                if ckpt_elasped_time > train_cfg.save_checkpoints_secs:
                    torchplus.train.save_models(model_dir, [net, optimizer],
                                                net.get_global_step())
                    ckpt_start_time = time.time()
            total_step_elapsed += steps
            torchplus.train.save_models(model_dir, [net, optimizer],
                                        net.get_global_step())
            # Ensure that all evaluation points are saved forever
            torchplus.train.save_models(eval_checkpoint_dir,
                                        [net, optimizer],
                                        net.get_global_step(),
                                        max_to_keep=100)
            # evaluation phase
            net.eval()  # built-in Module method: sets training=False
            result_path_step = result_path / f"step_{net.get_global_step()}"
            result_path_step.mkdir(parents=True, exist_ok=True)
            print("#################################")
            print("#################################", file=logf)
            print("# EVAL")
            print("# EVAL", file=logf)
            print("#################################")
            print("#################################", file=logf)
            print("Generate output labels...")
            print("Generate output labels...", file=logf)
            t = time.time()  # start eval timer
            dt_annos = []
            prog_bar = ProgressBar()
            prog_bar.start(len(eval_dataset) // eval_input_cfg.batch_size + 1)
            for example in iter(eval_dataloader):  # feed eval data
                example = example_convert_to_torch(example, float_dtype)
                if pickle_result:  # True by default
                    dt_annos += predict_kitti_to_anno(net, example,
                                                      class_names,
                                                      center_limit_range,
                                                      model_cfg.lidar_input)
                else:
                    _predict_kitti_to_file(net, example, result_path_step,
                                           class_names, center_limit_range,
                                           model_cfg.lidar_input)
                prog_bar.print_bar()

            sec_per_ex = len(eval_dataset) / (time.time() - t)  # examples per second
            print(f"avg forward time per example: {net.avg_forward_time:.3f}")
            print(
                f"avg postprocess time per example: {net.avg_postprocess_time:.3f}"
            )
            net.clear_time_metrics()
            print(f'generate label finished({sec_per_ex:.2f}/s). start eval:')
            print(f'generate label finished({sec_per_ex:.2f}/s). start eval:',
                  file=logf)
            gt_annos = [
                info["annos"] for info in eval_dataset.dataset.kitti_infos
            ]
            if not pickle_result:
                dt_annos = kitti.get_label_annos(result_path_step)
            # official KITTI evaluation metrics
            result, mAPbbox, mAPbev, mAP3d, mAPaos = get_official_eval_result(
                gt_annos, dt_annos, class_names, return_data=True)
            print(result, file=logf)
            print(result)
            writer.add_text('eval_result', result, global_step)

            for i, class_name in enumerate(class_names):  # eval scalars to record
                writer.add_scalar('bev_ap:{}'.format(class_name),
                                  mAPbev[i, 1, 0], global_step)
                writer.add_scalar('3d_ap:{}'.format(class_name),
                                  mAP3d[i, 1, 0], global_step)
                writer.add_scalar('aos_ap:{}'.format(class_name),
                                  mAPaos[i, 1, 0], global_step)
            writer.add_scalar('bev_map', np.mean(mAPbev[:, 1, 0]),
                              global_step)
            writer.add_scalar('3d_map', np.mean(mAP3d[:, 1, 0]), global_step)
            writer.add_scalar('aos_map', np.mean(mAPaos[:, 1, 0]),
                              global_step)

            result = get_coco_eval_result(gt_annos, dt_annos, class_names)
            print(result, file=logf)
            print(result)
            if pickle_result:
                with open(result_path_step / "result.pkl", 'wb') as f:
                    pickle.dump(dt_annos, f)
            writer.add_text('eval_result', result, global_step)
            net.train()
    except Exception as e:
        # checkpoint on failure, close the log, then re-raise
        torchplus.train.save_models(model_dir, [net, optimizer],
                                    net.get_global_step())
        logf.close()
        raise e
    # save model before exit
    torchplus.train.save_models(model_dir, [net, optimizer],
                                net.get_global_step())
    logf.close()
def detect(config_path,
           model_dir=None,
           result_path=None,
           ckpt_path=None,
           ref_detfile=None,
           pickle_result=True,
           measure_time=False,
           batch_size=None):
    """Run a trained model over the eval split and dump detections.

    Args:
        config_path: path to a pipeline config file, or an already-parsed
            TrainEvalPipelineConfig object.
        model_dir: model directory; used to locate the latest checkpoint when
            ``ckpt_path`` is None and as the parent of the default result dir.
        result_path: output directory; defaults to ``model_dir / 'eval_results'``.
        ckpt_path: explicit checkpoint file; overrides the latest-checkpoint lookup.
        ref_detfile: unused in this function body.
        pickle_result: if True, pickle all annos to result.pkl; otherwise write
            KITTI-format label files.
        measure_time: if True, collect and print per-stage timing statistics.
        batch_size: eval batch size; defaults to the config's batch size.
    """
    result_name = 'eval_results'
    if result_path is None:
        model_dir = pathlib.Path(model_dir)
        result_path = model_dir / result_name
    else:
        result_path = pathlib.Path(result_path)
    if isinstance(config_path, str):
        # directly provide a config object. this usually used
        # when you want to eval with several different parameters in
        # one script.
        config = pipeline_pb2.TrainEvalPipelineConfig()
        with open(config_path, "r") as f:
            proto_str = f.read()
            text_format.Merge(proto_str, config)
    else:
        config = config_path
    input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    center_limit_range = model_cfg.post_center_limit_range
    ######################
    # BUILD VOXEL GENERATOR
    ######################
    net = build_network(model_cfg, measure_time=measure_time).cuda()
    if train_cfg.enable_mixed_precision:
        # fp16 inference path: halve weights and keep metrics/norm layers in fp32.
        net.half()
        print("half inference!")
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    target_assigner = net.target_assigner
    voxel_generator = net.voxel_generator
    class_names = target_assigner.classes
    if ckpt_path is None:
        # fall back to the newest checkpoint found in model_dir
        assert model_dir is not None
        torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    else:
        torchplus.train.restore(ckpt_path, net)
    batch_size = batch_size or input_cfg.batch_size
    eval_dataset = input_reader_builder.build(input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,  # input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    net.eval()
    result_path_step = result_path  #/ f"step_{net.get_global_step()}"
    result_path_step.mkdir(parents=True, exist_ok=True)
    t = time.time()
    dt_annos = []
    print("Generate output labels...")
    bar = ProgressBar()
    # ceil(len(dataset) / batch_size) iterations
    bar.start((len(eval_dataset) + batch_size - 1) // batch_size)
    prep_example_times = []
    prep_times = []
    t2 = time.time()
    for example in iter(eval_dataloader):
        if measure_time:
            prep_times.append(time.time() - t2)
            t1 = time.time()
            # synchronize so the timers below measure GPU work, not queueing
            torch.cuda.synchronize()
        example = example_convert_to_torch(example, float_dtype)
        if measure_time:
            torch.cuda.synchronize()
            prep_example_times.append(time.time() - t1)
        dt_annos += predict_to_kitti_label(net, example, class_names,
                                           center_limit_range,
                                           model_cfg.lidar_input)
        # print(json.dumps(net.middle_feature_extractor.middle_conv.sparity_dict))
        bar.print_bar()
        if measure_time:
            t2 = time.time()
    sec_per_example = len(eval_dataset) / (time.time() - t)
    print(f'generate label finished({sec_per_example:.2f}/s). start eval:')
    if measure_time:
        print(
            f"avg example to torch time: {np.mean(prep_example_times) * 1000:.3f} ms"
        )
        print(f"avg prep time: {np.mean(prep_times) * 1000:.3f} ms")
        for name, val in net.get_avg_time_dict().items():
            print(f"avg {name} time = {val * 1000:.3f} ms")
    if pickle_result:
        print('Frames analyzed:' + str(len(dt_annos)))
        with open(result_path_step / "result.pkl", 'wb') as f:
            pickle.dump(dt_annos, f)
    else:
        kitti_anno_to_label_file(dt_annos, result_path_step)
def train(config_path,
          model_dir,
          use_fusion=False,
          use_ft=False,
          use_second_stage=False,
          use_endtoend=False,
          result_path=None,
          create_folder=False,
          display_step=50,
          summary_step=5,
          local_rank=0,
          pickle_result=True,
          patchs=None):
    """train a VoxelNet model specified by a config file.

    Args:
        config_path: path to a pipeline config file (text protobuf).
        model_dir: directory for checkpoints, logs and tensorboard summaries.
        use_fusion: load camera-fusion pretrained first-stage / FPN weights.
        use_ft: unused in this function body.
        use_second_stage: build the two-stage network variant.
        use_endtoend: build the end-to-end network variant (takes precedence
            over use_second_stage when both are set, matching prior behavior).
        result_path: eval output dir; defaults to ``model_dir / 'results'``.
        create_folder: create a fresh numbered model dir if it already exists.
        display_step: print/log metrics every this many global steps.
        summary_step: unused in this function body.
        local_rank: unused in this function body.
        pickle_result: pickle eval annos instead of writing label files.
        patchs: list of "attr = value" strings exec'd against ``config``.
    """
    if create_folder:
        if pathlib.Path(model_dir).exists():
            model_dir = torchplus.train.create_folder(model_dir)
    patchs = patchs or []
    model_dir = pathlib.Path(model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    config_file_bkp = "pipeline.config"
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)
    for patch in patchs:
        patch = "config." + patch
        # SECURITY NOTE: exec on caller-supplied patch strings executes
        # arbitrary code; only pass trusted patches.
        exec(patch)
    # keep a copy of the config next to the checkpoints for reproducibility
    shutil.copyfile(config_path, str(model_dir / config_file_bkp))
    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    ######################
    # BUILD VOXEL GENERATOR
    ######################
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    ######################
    # BUILD TARGET ASSIGNER
    ######################
    # bird's-eye-view range: [xmin, ymin, xmax, ymax] from the 6-element pc range
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    class_names = target_assigner.classes
    ######################
    # BUILD NET
    ######################
    center_limit_range = model_cfg.post_center_limit_range
    # BUGFIX: the original code used two independent `if` statements, so a
    # use_second_stage-only run built the 2-stage net and then immediately
    # overwrote it with the plain second_builder net in the dangling `else`.
    # The elif-chain below keeps the original end-to-end precedence while
    # making the second-stage-only path actually use the 2-stage net.
    if use_endtoend:
        net = second_endtoend_builder.build(model_cfg, voxel_generator,
                                            target_assigner)
    elif use_second_stage:
        net = second_2stage_builder.build(model_cfg, voxel_generator,
                                          target_assigner)
    else:
        net = second_builder.build(model_cfg, voxel_generator, target_assigner)
    net.cuda()
    # import pdb; pdb.set_trace()
    print("num_trainable parameters:", len(list(net.parameters())))
    ######################
    if use_second_stage or use_endtoend:
        # warm-start the first stage from a pretrained checkpoint, skipping
        # the global step counter and direction-classifier weights
        if use_fusion:
            # pth_name = 'pre_weight/8020/voxelnet-20130.tckpt'
            pth_name = 'pre_weight/first_stage/fusion_split/voxelnet-35210.tckpt'
            for i in range(30):
                print(
                    '################## load Fusion First stage weight complete #######################'
                )
        else:
            pth_name = 'pre_weight/first_stage/lidaronly/voxelnet-30950.tckpt'
            for i in range(30):
                print(
                    '################## load LiDAR Only First stage weight complete #######################'
                )
        res_pre_weights = torch.load(pth_name)
        new_res_state_dict = OrderedDict()
        model_dict = net.state_dict()
        for k, v in res_pre_weights.items():
            if 'global_step' not in k:
                if 'dir' not in k:
                    new_res_state_dict[k] = v
        model_dict.update(new_res_state_dict)
        net.load_state_dict(model_dict)
    ############ load FPN18 pre-weight #############
    if (use_fusion and not use_second_stage and not use_endtoend):
        # if True: # or (use_endtoend and use_fusion):
        fpn_depth = 18
        pth_name = 'pre_weight/FPN' + str(fpn_depth) + '_retinanet_968.pth'
        res_pre_weights = torch.load(pth_name)
        new_res_state_dict = OrderedDict()
        model_dict = net.state_dict()
        for k, v in res_pre_weights['state_dict'].items():
            # drop the RetinaNet heads; remap backbone names into our rpn
            if ('regressionModel' not in k) and ('classificationModel'
                                                 not in k):
                name = k.replace('module', 'rpn')
                new_res_state_dict[name] = v
        model_dict.update(new_res_state_dict)
        net.load_state_dict(model_dict)
        for i in range(30):
            print('!!!!!!!!!!!!!!!!!! load FPN' + str(fpn_depth) +
                  ' weight complete !!!!!!!!!!!!!!!!!!')
    ################################################
    # BUILD OPTIMIZER
    #####################
    # we need global_step to create lr_scheduler, so restore net first.
    torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    gstep = net.get_global_step() - 1
    optimizer_cfg = train_cfg.optimizer
    if train_cfg.enable_mixed_precision:
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    loss_scale = train_cfg.loss_scale_factor
    mixed_optimizer = optimizer_builder.build(
        optimizer_cfg,
        net,
        mixed=train_cfg.enable_mixed_precision,
        loss_scale=loss_scale)
    optimizer = mixed_optimizer
    """
    if train_cfg.enable_mixed_precision:
        mixed_optimizer = torchplus.train.MixedPrecisionWrapper(
            optimizer, loss_scale)
    else:
        mixed_optimizer = optimizer
    """
    # must restore optimizer AFTER using MixedPrecisionWrapper
    torchplus.train.try_restore_latest_checkpoints(model_dir,
                                                   [mixed_optimizer])
    lr_scheduler = lr_scheduler_builder.build(optimizer_cfg, optimizer,
                                              train_cfg.steps)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    ######################
    # PREPARE INPUT
    ######################
    dataset = input_reader_builder.build(input_cfg,
                                         model_cfg,
                                         training=True,
                                         voxel_generator=voxel_generator,
                                         target_assigner=target_assigner)
    eval_dataset = input_reader_builder.build(eval_input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner)

    def _worker_init_fn(worker_id):
        # give each dataloader worker a distinct time-based RNG seed
        time_seed = np.array(time.time(), dtype=np.int32)
        np.random.seed(time_seed + worker_id)
        print(f"WORKER {worker_id} seed:", np.random.get_state()[1][0])

    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=input_cfg.batch_size,
        shuffle=True,
        num_workers=input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch,
        worker_init_fn=_worker_init_fn)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_input_cfg.batch_size,
        shuffle=False,
        num_workers=eval_input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)
    data_iter = iter(dataloader)
    ######################
    # TRAINING
    ######################
    training_detail = []
    log_path = model_dir / 'log.txt'
    training_detail_path = model_dir / 'log.json'
    if training_detail_path.exists():
        with open(training_detail_path, 'r') as f:
            training_detail = json.load(f)
    logf = open(log_path, 'a')
    logf.write(proto_str)
    logf.write("\n")
    summary_dir = model_dir / 'summary'
    summary_dir.mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(str(summary_dir))
    total_step_elapsed = 0
    remain_steps = train_cfg.steps - net.get_global_step()
    t = time.time()
    ckpt_start_time = t
    # one eval pass after every steps_per_eval training steps
    total_loop = train_cfg.steps // train_cfg.steps_per_eval + 1
    # total_loop = remain_steps // train_cfg.steps_per_eval + 1
    clear_metrics_every_epoch = train_cfg.clear_metrics_every_epoch
    if train_cfg.steps % train_cfg.steps_per_eval == 0:
        total_loop -= 1
    mixed_optimizer.zero_grad()
    try:
        for _ in range(total_loop):
            if total_step_elapsed + train_cfg.steps_per_eval > train_cfg.steps:
                # last, possibly-partial segment
                steps = train_cfg.steps % train_cfg.steps_per_eval
            else:
                steps = train_cfg.steps_per_eval
            for step in range(steps):
                lr_scheduler.step(net.get_global_step())
                try:
                    example = next(data_iter)
                except StopIteration:
                    # dataset exhausted: restart a new epoch
                    print("end epoch")
                    if clear_metrics_every_epoch:
                        net.clear_metrics()
                    data_iter = iter(dataloader)
                    example = next(data_iter)
                example_torch = example_convert_to_torch(example, float_dtype)
                batch_size = example["anchors"].shape[0]
                ret_dict = net(example_torch)
                # box_preds = ret_dict["box_preds"]
                cls_preds = ret_dict["cls_preds"]
                loss = ret_dict["loss"].mean()
                cls_loss_reduced = ret_dict["cls_loss_reduced"].mean()
                loc_loss_reduced = ret_dict["loc_loss_reduced"].mean()
                cls_pos_loss = ret_dict["cls_pos_loss"]
                cls_neg_loss = ret_dict["cls_neg_loss"]
                loc_loss = ret_dict["loc_loss"]
                cls_loss = ret_dict["cls_loss"]
                dir_loss_reduced = ret_dict["dir_loss_reduced"]
                cared = ret_dict["cared"]
                # idx_offset = ret_dict["idx_offset"]
                # labels = example_torch["labels"]
                if use_second_stage or use_endtoend:
                    # two-stage/e2e nets re-assign labels internally
                    labels = ret_dict["labels"]
                else:
                    labels = example_torch["labels"]
                if train_cfg.enable_mixed_precision:
                    loss *= loss_scale
                loss.backward()
                # import pdb; pdb.set_trace()
                torch.nn.utils.clip_grad_norm_(net.parameters(), 10.0)
                mixed_optimizer.step()
                mixed_optimizer.zero_grad()
                net.update_global_step()
                net_metrics = net.update_metrics(cls_loss_reduced,
                                                 loc_loss_reduced, cls_preds,
                                                 labels, cared)
                step_time = (time.time() - t)
                t = time.time()
                metrics = {}
                num_pos = int((labels > 0)[0].float().sum().cpu().numpy())
                num_neg = int((labels == 0)[0].float().sum().cpu().numpy())
                if 'anchors_mask' not in example_torch:
                    num_anchors = example_torch['anchors'].shape[1]
                else:
                    num_anchors = int(example_torch['anchors_mask'][0].sum())
                global_step = net.get_global_step()
                # print(step)
                if global_step % display_step == 0:
                    loc_loss_elem = [
                        float(loc_loss[:, :, i].sum().detach().cpu().numpy() /
                              batch_size) for i in range(loc_loss.shape[-1])
                    ]
                    metrics["type"] = "step_info"
                    metrics["step"] = global_step
                    metrics["steptime"] = step_time
                    metrics.update(net_metrics)
                    metrics["loss"] = {}
                    metrics["loss"]["loc_elem"] = loc_loss_elem
                    metrics["loss"]["cls_pos_rt"] = float(
                        cls_pos_loss.detach().cpu().numpy())
                    metrics["loss"]["cls_neg_rt"] = float(
                        cls_neg_loss.detach().cpu().numpy())
                    if model_cfg.use_direction_classifier:
                        metrics["loss"]["dir_rt"] = float(
                            dir_loss_reduced.detach().cpu().numpy())
                    metrics["num_vox"] = int(example_torch["voxels"].shape[0])
                    metrics["num_pos"] = int(num_pos)
                    metrics["num_neg"] = int(num_neg)
                    metrics["num_anchors"] = int(num_anchors)
                    # metrics["idx_offset_mean"] = float(idx_offset.mean().detach().cpu().numpy())
                    # metrics["idx_offset_sum"] = float(idx_offset.sum().detach().cpu().numpy())
                    # metrics["lr"] = float(
                    #     mixed_optimizer.param_groups[0]['lr'])
                    metrics["lr"] = float(optimizer.lr)
                    metrics["image_idx"] = example['image_idx'][0]
                    training_detail.append(metrics)
                    flatted_metrics = flat_nested_json_dict(metrics)
                    flatted_summarys = flat_nested_json_dict(metrics, "/")
                    for k, v in flatted_summarys.items():
                        if isinstance(v, (list, tuple)):
                            v = {str(i): e for i, e in enumerate(v)}
                            if type(v) != str and ('loc_elem' not in k):
                                writer.add_scalars(k, v, global_step)
                        else:
                            if (type(v) != str) and ('loc_elem' not in k):
                                writer.add_scalar(k, v, global_step)
                    metrics_str_list = []
                    for k, v in flatted_metrics.items():
                        if isinstance(v, float):
                            metrics_str_list.append(f"{k}={v:.3}")
                        elif isinstance(v, (list, tuple)):
                            if v and isinstance(v[0], float):
                                v_str = ', '.join([f"{e:.3}" for e in v])
                                metrics_str_list.append(f"{k}=[{v_str}]")
                            else:
                                metrics_str_list.append(f"{k}={v}")
                        else:
                            metrics_str_list.append(f"{k}={v}")
                    log_str = ', '.join(metrics_str_list)
                    print(log_str, file=logf)
                    print(log_str)
                ckpt_elasped_time = time.time() - ckpt_start_time
                if ckpt_elasped_time > train_cfg.save_checkpoints_secs:
                    torchplus.train.save_models(model_dir, [net, optimizer],
                                                net.get_global_step())
                    ckpt_start_time = time.time()
            total_step_elapsed += steps
            torchplus.train.save_models(model_dir, [net, optimizer],
                                        net.get_global_step())
            net.eval()
            result_path_step = result_path / f"step_{net.get_global_step()}"
            result_path_step.mkdir(parents=True, exist_ok=True)
            print("#################################")
            print("#################################", file=logf)
            print("# EVAL")
            print("# EVAL", file=logf)
            print("#################################")
            print("#################################", file=logf)
            print("Generate output labels...")
            print("Generate output labels...", file=logf)
            t = time.time()
            dt_annos = []
            prog_bar = ProgressBar()
            net.clear_timer()
            prog_bar.start(
                (len(eval_dataset) + eval_input_cfg.batch_size - 1) //
                eval_input_cfg.batch_size)
            for example in iter(eval_dataloader):
                example = example_convert_to_torch(example, float_dtype)
                if pickle_result:
                    dt_annos += predict_kitti_to_anno(net, example,
                                                      class_names,
                                                      center_limit_range,
                                                      model_cfg.lidar_input)
                else:
                    _predict_kitti_to_file(net, example, result_path_step,
                                           class_names, center_limit_range,
                                           model_cfg.lidar_input)
                prog_bar.print_bar()
            sec_per_ex = len(eval_dataset) / (time.time() - t)
            print(f'generate label finished({sec_per_ex:.2f}/s). start eval:')
            print(f'generate label finished({sec_per_ex:.2f}/s). start eval:',
                  file=logf)
            gt_annos = [
                info["annos"] for info in eval_dataset.dataset.kitti_infos
            ]
            if not pickle_result:
                dt_annos = kitti.get_label_annos(result_path_step)
            # result = get_official_eval_result_v2(gt_annos, dt_annos, class_names)
            # print(json.dumps(result, indent=2), file=logf)
            result = get_official_eval_result(gt_annos, dt_annos, class_names)
            print(result, file=logf)
            print(result)
            # parse the first (car, IoU 0.7) section of the textual result and
            # push per-difficulty APs to tensorboard
            result_1 = result.split("\n")[:5]
            result_2 = result.split("\n")[10:15]
            result_3 = result.split("\n")[20:25]
            emh = ['0_easy', '1_mod', '2_hard']
            result_save = result_1
            for i in range(len(result_save) - 1):
                save_targ = result_save[i + 1]
                name_val = save_targ.split(':')[0].split(' ')[0]
                value_val = save_targ.split(':')[1:]
                for ev in range(3):
                    each_val = value_val[0].split(',')[ev]
                    merge_txt = 'AP_kitti/car_70/' + name_val + '/' + emh[ev]
                    writer.add_scalar(merge_txt, float(each_val), global_step)
            if pickle_result:
                with open(result_path_step / "result.pkl", 'wb') as f:
                    pickle.dump(dt_annos, f)
            writer.add_text('eval_result', result, global_step)
            net.train()
    except Exception as e:
        # save a checkpoint on any failure so progress is not lost
        torchplus.train.save_models(model_dir, [net, optimizer],
                                    net.get_global_step())
        logf.close()
        raise e
    # save model before exit
    torchplus.train.save_models(model_dir, [net, optimizer],
                                net.get_global_step())
    logf.close()
def detect(scene_token, config_path, ckpt_path, info_path, root_path,
           result_path):
    """Run inference over one dataset split and write thresholded detections
    to ``result_path/detections_<scene_token>.json``.

    Args:
        scene_token: identifier embedded in the output file name.
        config_path: pipeline config file path.
        ckpt_path: checkpoint (state_dict) file to load into the network.
        info_path: pickled info file (dict with an 'infos' list keyed by token).
        root_path: dataset root directory, patched into the eval input config.
        result_path: output directory (plain string, joined with '+').
    """
    ### Read Config file
    torch.set_num_threads(2)
    #config_path = "configs/nuscenes/all.pp.lowa_large_range_v2.config"
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)
    input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    # config_tool.change_detection_range_v2(model_cfg, [-50, -50, 50, 50])
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    ### Build Network, Target Assigner and Voxel Generator
    #info_path = '/home/itiv/Desktop/lyft-dataset/infos_val.pkl'
    #root_path = '/home/itiv/Desktop/lyft-dataset'
    with open(info_path, 'rb') as f:
        infos = pickle.load(f)
    # index the info records by their sample token for fast lookup later
    token2info = {}
    for info in infos['infos']:
        token2info[info['token']] = info
    #ckpt_path = "/home/itiv/Desktop/repo/scenarios_in_CarMaker/BA_Daniel/Lyft-Detector/second.pytorch/second/model/model_large_range_v2/voxelnet-33445.tckpt"
    net = build_network(config.model.second).to(device).float().eval()
    net.load_state_dict(torch.load(ckpt_path))
    eval_input_cfg = config.eval_input_reader
    # point the eval reader at the caller-supplied dataset location
    eval_input_cfg.dataset.kitti_root_path = root_path
    eval_input_cfg.dataset.kitti_info_path = info_path
    dataset = input_reader_builder.build(
        eval_input_cfg,
        config.model.second,
        training=False,
        voxel_generator=net.voxel_generator,
        target_assigner=net.target_assigner)  #.dataset
    batch_size = 2
    num_workers = 2
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             num_workers=num_workers,
                                             pin_memory=False,
                                             collate_fn=merge_second_batch)
    target_assigner = net.target_assigner
    voxel_generator = net.voxel_generator
    classes = target_assigner.classes
    detections = []
    #tk0 = prog_bar(dataloader, total=len(dataloader))
    tk0 = (dataloader)
    for idx, examples in enumerate(tk0):
        #print(idx)
        #print(examples)
        try:
            example_torch = example_convert_to_torch(examples, device=device)
            detections += net(example_torch)
        except Exception as e:
            # NOTE(review): debugging leftover — drops into an interactive
            # pdb session on any inference error; will hang non-interactive
            # runs. Consider re-raising instead.
            print(e)
            import pdb
            pdb.set_trace()
    threshold = 0.2
    first_sample_token = detections[0]['metadata']['token']
    dict_detections = {"results": {}}
    for idx, pred in enumerate((detections)):
        # keep only boxes above the score threshold, then convert to the
        # submission dict format keyed by sample token
        pred = thresholded_pred(pred, threshold)
        #token = tokens[idx]['token']
        token = pred['metadata']['token']
        dict_detections['results'].update(
            get_pred_dict(pred, token, classes, token2info))
        #pred_str = get_pred_str(pred, token)
        #predStrings.append(pred_str)
        #index = df[df['Id'] == token].index[0]
        #df.loc[index, 'PredictionString'] = pred_str
    #df.to_csv(f'final.csv', index=False)
    #print(dict_detections)
    #path_to_result = f'/home/itiv/Desktop/lyft-dataset/detections-largev2.json'
    with open(result_path + '/detections_' + scene_token + '.json',
              'w') as fp:
        json.dump(dict_detections, fp)
def helper_tune_target_assigner(config_path):
    """Report class/anchor assignment statistics for threshold tuning.

    Iterates the training dataset once and prints, per class, how many
    ground-truth boxes were seen and how many anchors were assigned to
    that class, as JSON.

    NOTE(review): this definition is shadowed by a later function of the
    same name in this module.
    """
    if isinstance(config_path, str):
        # directly provide a config object. this usually used
        # when you want to train with several different parameters in
        # one script.
        config = pipeline_pb2.TrainEvalPipelineConfig()
        with open(config_path, "r") as cfg_file:
            proto_str = cfg_file.read()
            text_format.Merge(proto_str, config)
    else:
        config = config_path
        proto_str = text_format.MessageToString(config, indent=2)
    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    net = build_network(model_cfg, False)
    target_assigner = net.target_assigner
    voxel_generator = net.voxel_generator
    dataset = input_reader_builder.build(input_cfg,
                                         model_cfg,
                                         training=True,
                                         voxel_generator=voxel_generator,
                                         target_assigner=target_assigner,
                                         multi_gpu=False)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=1,
                                             shuffle=True,
                                             num_workers=0,
                                             pin_memory=False,
                                             collate_fn=merge_second_batch,
                                             worker_init_fn=_worker_init_fn,
                                             drop_last=False)
    classes = target_assigner.classes
    class_count = dict.fromkeys(classes, 0)
    anchor_count = dict.fromkeys(classes, 0)
    for example in dataloader:
        # count ground-truth boxes per class name
        for name in example["gt_names"]:
            class_count[name] += 1
        # labels encode class ids starting at 1; 0 is background
        labels = example['labels']
        for label_id, cls in enumerate(classes, start=1):
            anchor_count[cls] += int(np.sum(labels == label_id))
    print(json.dumps(class_count, indent=2))
    print(json.dumps(anchor_count, indent=2))
def helper_tune_target_assigner(config_path,
                                target_rate=None,
                                update_freq=200,
                                update_delta=0.01,
                                num_tune_epoch=5):
    """get information of target assign to tune thresholds in anchor generator.

    When ``target_rate`` (a dict of class name -> desired anchors-per-gt
    ratio) is given, runs ``num_tune_epoch`` passes over the training data,
    nudging each class's anchor-generator match/unmatch thresholds by
    ``update_delta`` every time more than ``update_freq`` ground-truth boxes
    of that class have been seen, so the observed assignment rate moves
    toward the target. Afterwards (or immediately when ``target_rate`` is
    None) it measures statistics on up to ~100 batches and prints per-class
    gt/anchor counts plus the average voxel generation time.
    """
    if isinstance(config_path, str):
        # directly provide a config object. this usually used
        # when you want to train with several different parameters in
        # one script.
        config = pipeline_pb2.TrainEvalPipelineConfig()
        with open(config_path, "r") as f:
            proto_str = f.read()
            text_format.Merge(proto_str, config)
    else:
        config = config_path
        proto_str = text_format.MessageToString(config, indent=2)
    input_cfg = config.train_input_reader
    model_cfg = config.model.second
    # optionally relocate the dataset via the DATA_ROOT environment variable
    data_root = os.environ.get('DATA_ROOT')
    if data_root and osp.exists(data_root):
        train_info_filename = osp.basename(input_cfg.dataset.kitti_info_path)
        input_cfg.dataset.kitti_root_path = data_root
        input_cfg.dataset.kitti_info_path = osp.join(data_root,
                                                     train_info_filename)
        if input_cfg.preprocess.database_sampler.database_info_path:
            db_info_filename = osp.basename(
                input_cfg.preprocess.database_sampler.database_info_path)
            input_cfg.preprocess.database_sampler.database_info_path = osp.join(
                data_root, db_info_filename)
    net = build_network(model_cfg, False)
    target_assigner = net.target_assigner
    voxel_generator = net.voxel_generator
    dataset = input_reader_builder.build(input_cfg,
                                         model_cfg,
                                         training=True,
                                         voxel_generator=voxel_generator,
                                         target_assigner=target_assigner,
                                         multi_gpu=False)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=0,
                                             pin_memory=False,
                                             collate_fn=merge_second_batch,
                                             worker_init_fn=_worker_init_fn,
                                             drop_last=False)
    # *_tune counters drive the threshold adjustment; the plain counters are
    # reused for the final measurement pass below.
    class_count = {}
    anchor_count = {}
    class_count_tune = {}
    anchor_count_tune = {}
    for c in target_assigner.classes:
        class_count[c] = 0
        anchor_count[c] = 0
        class_count_tune[c] = 0
        anchor_count_tune[c] = 0
    step = 0
    classes = target_assigner.classes
    if target_rate is None:
        # nothing to tune toward: skip the tuning epochs entirely
        num_tune_epoch = 0
    for epoch in range(num_tune_epoch):
        print(f'{epoch + 1} / {num_tune_epoch} tune epochs')
        prog_bar = ProgressBar()
        prog_bar.start(len(dataloader))
        for example in dataloader:
            gt_names = example["gt_names"]
            for name in gt_names:
                class_count_tune[name] += 1
            # labels use class ids starting at 1 (0 = background)
            labels = example['labels']
            for i in range(1, len(classes) + 1):
                anchor_count_tune[classes[i - 1]] += int(np.sum(labels == i))
            if target_rate is not None:
                for name, rate in target_rate.items():
                    if class_count_tune[name] > update_freq:
                        # calc rate
                        current_rate = anchor_count_tune[
                            name] / class_count_tune[name]
                        if current_rate > rate:
                            # too many anchors per gt: raise thresholds
                            target_assigner._anchor_generators[classes.index(
                                name)].match_threshold += update_delta
                            target_assigner._anchor_generators[classes.index(
                                name)].unmatch_threshold += update_delta
                        else:
                            # too few anchors per gt: lower thresholds
                            target_assigner._anchor_generators[classes.index(
                                name)].match_threshold -= update_delta
                            target_assigner._anchor_generators[classes.index(
                                name)].unmatch_threshold -= update_delta
                        # restart the window after each adjustment
                        anchor_count_tune[name] = 0
                        class_count_tune[name] = 0
            step += 1
            prog_bar.print_bar()
    # measurement pass with the (possibly tuned) thresholds
    for c in target_assigner.classes:
        class_count[c] = 0
        anchor_count[c] = 0
    total_voxel_gene_time = 0
    count = 0
    prog_bar = ProgressBar()
    prog_bar.start(len(dataloader))
    for example in dataloader:
        gt_names = example["gt_names"]
        total_voxel_gene_time += example["metrics"][0]["voxel_gene_time"]
        for name in gt_names:
            class_count[name] += 1
        labels = example['labels']
        for i in range(1, len(classes) + 1):
            anchor_count[classes[i - 1]] += int(np.sum(labels == i))
        prog_bar.print_bar()
        count += 1
        # sample at most ~100 batches for the statistics
        if count > 100:
            break
    print("avg voxel gene time", total_voxel_gene_time / count)
    print(json.dumps(class_count, indent=2))
    print(json.dumps(anchor_count, indent=2))
    if target_rate is not None:
        # show the final tuned thresholds for the targeted classes
        for ag in target_assigner._anchor_generators:
            if ag.class_name in target_rate:
                print(ag.class_name, ag.match_threshold, ag.unmatch_threshold)
def train(config_path,
          model_dir,
          result_path=None,
          create_folder=False,
          display_step=50,
          summary_step=5,
          pickle_result=True,
          resume=False):
    """train a VoxelNet model specified by a config file.

    Runs the full train/eval loop: builds the network, optimizer and data
    pipelines from the pipeline config, restores any existing checkpoints in
    ``model_dir``, then alternates ``train_cfg.steps_per_eval`` training
    steps with a full evaluation pass, saving checkpoints periodically and
    on exit (including on exceptions).

    Args:
        config_path: path to a pipeline config file, or an already-parsed
            ``TrainEvalPipelineConfig`` object.
        model_dir: directory for checkpoints, logs and the config backup.
        result_path: where eval results are written; defaults to
            ``model_dir/results``.
        create_folder: if True and ``model_dir`` exists, create a fresh
            numbered folder instead of reusing it.
        display_step: log metrics every this many global steps.
        summary_step: unused here (kept for interface compatibility).
        pickle_result: if True, pickle detection annos; otherwise write
            KITTI-format label files.
        resume: must be True to reuse an existing ``model_dir``.

    Raises:
        ValueError: if ``model_dir`` exists and ``resume`` is False.
    """
    if create_folder:
        if pathlib.Path(model_dir).exists():
            model_dir = torchplus.train.create_folder(model_dir)
    model_dir = pathlib.Path(model_dir)
    if not resume and model_dir.exists():
        raise ValueError("model dir exists and you don't specify resume.")
    model_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    config_file_bkp = "pipeline.config"
    if isinstance(config_path, str):
        # directly provide a config object. this usually used
        # when you want to train with several different parameters in
        # one script.
        config = pipeline_pb2.TrainEvalPipelineConfig()
        with open(config_path, "r") as f:
            proto_str = f.read()
            text_format.Merge(proto_str, config)
    else:
        config = config_path
        proto_str = text_format.MessageToString(config, indent=2)
    # Back up the effective config next to the checkpoints.
    with (model_dir / config_file_bkp).open("w") as f:
        f.write(proto_str)
    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    net = build_network(model_cfg).cuda()
    if train_cfg.enable_mixed_precision:
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    target_assigner = net.target_assigner
    voxel_generator = net.voxel_generator
    class_names = target_assigner.classes
    # net_train = torch.nn.DataParallel(net).cuda()
    print("num_trainable parameters:", len(list(net.parameters())))
    # for n, p in net.named_parameters():
    #     print(n, p.shape)
    ######################
    # BUILD OPTIMIZER
    ######################
    # we need global_step to create lr_scheduler, so restore net first.
    torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    gstep = net.get_global_step() - 1
    optimizer_cfg = train_cfg.optimizer
    loss_scale = train_cfg.loss_scale_factor
    mixed_optimizer = optimizer_builder.build(
        optimizer_cfg,
        net,
        mixed=train_cfg.enable_mixed_precision,
        loss_scale=loss_scale)
    optimizer = mixed_optimizer
    center_limit_range = model_cfg.post_center_limit_range
    """
    if train_cfg.enable_mixed_precision:
        mixed_optimizer = torchplus.train.MixedPrecisionWrapper(
            optimizer, loss_scale)
    else:
        mixed_optimizer = optimizer
    """
    # must restore optimizer AFTER using MixedPrecisionWrapper
    torchplus.train.try_restore_latest_checkpoints(model_dir,
                                                   [mixed_optimizer])
    lr_scheduler = lr_scheduler_builder.build(optimizer_cfg, optimizer,
                                              train_cfg.steps)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    ######################
    # PREPARE INPUT
    ######################
    dataset = input_reader_builder.build(input_cfg,
                                         model_cfg,
                                         training=True,
                                         voxel_generator=voxel_generator,
                                         target_assigner=target_assigner)
    eval_dataset = input_reader_builder.build(eval_input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=input_cfg.batch_size,
                                             shuffle=True,
                                             num_workers=input_cfg.num_workers,
                                             pin_memory=False,
                                             collate_fn=merge_second_batch,
                                             worker_init_fn=_worker_init_fn)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_input_cfg.batch_size,
        shuffle=False,
        num_workers=eval_input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)
    data_iter = iter(dataloader)
    ######################
    # TRAINING
    ######################
    training_detail = []
    log_path = model_dir / 'log.txt'
    training_detail_path = model_dir / 'log.json'
    # Resume the JSON metrics log if it already exists.
    if training_detail_path.exists():
        with open(training_detail_path, 'r') as f:
            training_detail = json.load(f)
    logf = open(log_path, 'a')
    logf.write(proto_str)
    logf.write("\n")
    summary_dir = model_dir / 'summary'
    summary_dir.mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(str(summary_dir))
    total_step_elapsed = 0
    remain_steps = train_cfg.steps - net.get_global_step()
    t = time.time()
    ckpt_start_time = t
    # One outer loop iteration = one train segment followed by one eval.
    total_loop = train_cfg.steps // train_cfg.steps_per_eval + 1
    # total_loop = remain_steps // train_cfg.steps_per_eval + 1
    clear_metrics_every_epoch = train_cfg.clear_metrics_every_epoch
    if train_cfg.steps % train_cfg.steps_per_eval == 0:
        total_loop -= 1
    mixed_optimizer.zero_grad()
    try:
        for _ in range(total_loop):
            # The final segment may be shorter than steps_per_eval.
            if total_step_elapsed + train_cfg.steps_per_eval > train_cfg.steps:
                steps = train_cfg.steps % train_cfg.steps_per_eval
            else:
                steps = train_cfg.steps_per_eval
            for step in range(steps):
                lr_scheduler.step(net.get_global_step())
                try:
                    example = next(data_iter)
                except StopIteration:
                    # Dataset exhausted: start a new epoch.
                    print("end epoch")
                    if clear_metrics_every_epoch:
                        net.clear_metrics()
                    data_iter = iter(dataloader)
                    example = next(data_iter)
                example_torch = example_convert_to_torch(example, float_dtype)
                batch_size = example["anchors"].shape[0]
                ret_dict = net(example_torch)
                # box_preds = ret_dict["box_preds"]
                cls_preds = ret_dict["cls_preds"]
                loss = ret_dict["loss"].mean()
                cls_loss_reduced = ret_dict["cls_loss_reduced"].mean()
                loc_loss_reduced = ret_dict["loc_loss_reduced"].mean()
                cls_pos_loss = ret_dict["cls_pos_loss"]
                cls_neg_loss = ret_dict["cls_neg_loss"]
                loc_loss = ret_dict["loc_loss"]
                cls_loss = ret_dict["cls_loss"]
                dir_loss_reduced = ret_dict["dir_loss_reduced"]
                cared = ret_dict["cared"]
                labels = example_torch["labels"]
                # Static loss scaling for fp16; the mixed optimizer
                # un-scales gradients internally.
                if train_cfg.enable_mixed_precision:
                    loss *= loss_scale
                loss.backward()
                torch.nn.utils.clip_grad_norm_(net.parameters(), 10.0)
                mixed_optimizer.step()
                mixed_optimizer.zero_grad()
                net.update_global_step()
                net_metrics = net.update_metrics(cls_loss_reduced,
                                                 loc_loss_reduced, cls_preds,
                                                 labels, cared)
                step_time = (time.time() - t)
                t = time.time()
                metrics = {}
                # pos/neg counts are taken from the first sample of the batch.
                num_pos = int((labels > 0)[0].float().sum().cpu().numpy())
                num_neg = int((labels == 0)[0].float().sum().cpu().numpy())
                if 'anchors_mask' not in example_torch:
                    num_anchors = example_torch['anchors'].shape[1]
                else:
                    num_anchors = int(example_torch['anchors_mask'][0].sum())
                global_step = net.get_global_step()
                if global_step % display_step == 0:
                    loc_loss_elem = [
                        float(loc_loss[:, :, i].sum().detach().cpu().numpy() /
                              batch_size) for i in range(loc_loss.shape[-1])
                    ]
                    metrics["type"] = "step_info"
                    metrics["step"] = global_step
                    metrics["steptime"] = step_time
                    metrics.update(net_metrics)
                    metrics["loss"] = {}
                    metrics["loss"]["loc_elem"] = loc_loss_elem
                    metrics["loss"]["cls_pos_rt"] = float(
                        cls_pos_loss.detach().cpu().numpy())
                    metrics["loss"]["cls_neg_rt"] = float(
                        cls_neg_loss.detach().cpu().numpy())
                    if model_cfg.use_direction_classifier:
                        metrics["loss"]["dir_rt"] = float(
                            dir_loss_reduced.detach().cpu().numpy())
                    metrics["num_vox"] = int(example_torch["voxels"].shape[0])
                    metrics["num_pos"] = int(num_pos)
                    metrics["num_neg"] = int(num_neg)
                    metrics["num_anchors"] = int(num_anchors)
                    # metrics["lr"] = float(
                    #     mixed_optimizer.param_groups[0]['lr'])
                    metrics["lr"] = float(optimizer.lr)
                    if "image_info" in example['metadata'][0]:
                        metrics["image_idx"] = example['metadata'][0][
                            "image_info"]['image_idx']
                    training_detail.append(metrics)
                    flatted_summarys = flat_nested_json_dict(metrics, "/")
                    """
                    for k, v in flatted_summarys.items():
                        if isinstance(v, (list, tuple)):
                            v = {str(i): e for i, e in enumerate(v)}
                            writer.add_scalars(k, v, global_step)
                        else:
                            writer.add_scalar(k, v, global_step)
                    """
                    log_str = metric_to_str(metrics)
                    print(log_str, file=logf)
                    print(log_str)
                # Time-based checkpointing.
                ckpt_elasped_time = time.time() - ckpt_start_time
                if ckpt_elasped_time > train_cfg.save_checkpoints_secs:
                    torchplus.train.save_models(model_dir, [net, optimizer],
                                                net.get_global_step())
                    ckpt_start_time = time.time()
            total_step_elapsed += steps
            torchplus.train.save_models(model_dir, [net, optimizer],
                                        net.get_global_step())
            # ---- evaluation pass ----
            net.eval()
            result_path_step = result_path / f"step_{net.get_global_step()}"
            result_path_step.mkdir(parents=True, exist_ok=True)
            print("#################################")
            print("#################################", file=logf)
            print("# EVAL")
            print("# EVAL", file=logf)
            print("#################################")
            print("#################################", file=logf)
            print("Generate output labels...")
            print("Generate output labels...", file=logf)
            t = time.time()
            dt_annos = []
            prog_bar = ProgressBar()
            net.clear_timer()
            prog_bar.start(
                (len(eval_dataset) + eval_input_cfg.batch_size - 1) //
                eval_input_cfg.batch_size)
            for example in iter(eval_dataloader):
                example = example_convert_to_torch(example, float_dtype)
                dt_annos += predict_to_kitti_label(net, example, class_names,
                                                   center_limit_range,
                                                   model_cfg.lidar_input)
                prog_bar.print_bar()
            sec_per_ex = len(eval_dataset) / (time.time() - t)
            print(f'generate label finished({sec_per_ex:.2f}/s). start eval:')
            print(f'generate label finished({sec_per_ex:.2f}/s). start eval:',
                  file=logf)
            result_official, result_coco = eval_dataset.dataset.evaluation(
                dt_annos)
            print(result_official)
            print(result_official, file=logf)
            print(result_coco)
            print(result_coco, file=logf)
            if pickle_result:
                with open(result_path_step / "result.pkl", 'wb') as f:
                    pickle.dump(dt_annos, f)
            else:
                kitti_anno_to_label_file(dt_annos, result_path_step)
            writer.add_text('eval_result', result_official, global_step)
            writer.add_text('eval_result coco', result_coco, global_step)
            net.train()
    except Exception as e:
        # Save a checkpoint before re-raising so progress is not lost.
        torchplus.train.save_models(model_dir, [net, optimizer],
                                    net.get_global_step())
        logf.close()
        raise e
    # save model before exit
    torchplus.train.save_models(model_dir, [net, optimizer],
                                net.get_global_step())
    logf.close()
def train(config_path,
          model_dir,
          result_path=None,
          create_folder=False,
          display_step=50,
          summary_step=5,
          pickle_result=True,
          patchs=None):
    """Train the 2D/3D late-fusion layer on top of a frozen SECOND network.

    Builds a pretrained inference network plus a trainable ``fusion_layer``,
    then alternates ``train_cfg.steps_per_eval`` fusion-training steps with a
    full evaluation pass.  Only the fusion layer is optimized; the 3D
    detector is used for inference.  Checkpoints of the fusion layer are
    saved periodically and on exit (including on exceptions).

    Args:
        config_path: path to a pipeline config file.
        model_dir: directory for checkpoints and logs.
        result_path: where eval results are written; defaults to
            ``model_dir/results``.
        create_folder: if True and ``model_dir`` exists, create a fresh
            numbered folder instead of reusing it.
        display_step: log the running classification loss every this many
            global steps.
        summary_step: unused here (kept for interface compatibility).
        pickle_result: if True, evaluate in-memory and pickle annos;
            otherwise write KITTI-format label files.
        patchs: optional config patches; defaults to an empty list.

    Fix vs. previous revision: the mixed-precision branch scaled an
    undefined name ``loss`` (a copy-paste remnant), which raised NameError
    whenever ``enable_mixed_precision`` was on; it now scales
    ``cls_losses_reduced``, the tensor actually backpropagated.
    """
    # Fixed seeds for reproducible fusion training.
    torch.manual_seed(3)
    np.random.seed(3)
    if create_folder:
        if pathlib.Path(model_dir).exists():
            model_dir = torchplus.train.create_folder(model_dir)
    patchs = patchs or []
    model_dir = pathlib.Path(model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)
    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    detection_2d_path = config.train_config.detection_2d_path
    print("2d detection path:", detection_2d_path)
    center_limit_range = model_cfg.post_center_limit_range
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    class_names = target_assigner.classes
    # Frozen, pretrained 3D detector used only for inference.
    net = build_inference_net('./configs/car.fhd.config', '../model_dir')
    fusion_layer = fusion.fusion()
    fusion_layer.cuda()
    optimizer_cfg = train_cfg.optimizer
    if train_cfg.enable_mixed_precision:
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    loss_scale = train_cfg.loss_scale_factor
    # Only the fusion layer's parameters are optimized.
    mixed_optimizer = optimizer_builder.build(
        optimizer_cfg,
        fusion_layer,
        mixed=train_cfg.enable_mixed_precision,
        loss_scale=loss_scale)
    optimizer = mixed_optimizer
    # must restore optimizer AFTER using MixedPrecisionWrapper
    torchplus.train.try_restore_latest_checkpoints(model_dir,
                                                   [mixed_optimizer])
    lr_scheduler = lr_scheduler_builder.build(optimizer_cfg, optimizer,
                                              train_cfg.steps)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    ######################
    # PREPARE INPUT
    ######################
    dataset = input_reader_builder.build(input_cfg,
                                         model_cfg,
                                         training=True,
                                         voxel_generator=voxel_generator,
                                         target_assigner=target_assigner)
    eval_dataset = input_reader_builder.build(
        eval_input_cfg,
        model_cfg,
        training=True,  # if running for test, here it needs to be False
        voxel_generator=voxel_generator,
        target_assigner=target_assigner)

    def _worker_init_fn(worker_id):
        # Re-seed each dataloader worker so augmentation differs per worker.
        time_seed = np.array(time.time(), dtype=np.int32)
        np.random.seed(time_seed + worker_id)
        print(f"WORKER {worker_id} seed:", np.random.get_state()[1][0])

    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=input_cfg.batch_size,
                                             shuffle=True,
                                             num_workers=input_cfg.num_workers,
                                             pin_memory=False,
                                             collate_fn=merge_second_batch,
                                             worker_init_fn=_worker_init_fn)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_input_cfg.batch_size,
        shuffle=False,
        num_workers=eval_input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)
    data_iter = iter(dataloader)
    ######################
    # TRAINING
    ######################
    focal_loss = SigmoidFocalClassificationLoss()
    cls_loss_sum = 0
    training_detail = []
    log_path = model_dir / 'log.txt'
    training_detail_path = model_dir / 'log.json'
    if training_detail_path.exists():
        with open(training_detail_path, 'r') as f:
            training_detail = json.load(f)
    logf = open(log_path, 'a')
    logf.write(proto_str)
    logf.write("\n")
    summary_dir = model_dir / 'summary'
    summary_dir.mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(str(summary_dir))
    total_step_elapsed = 0
    remain_steps = train_cfg.steps - net.get_global_step()
    t = time.time()
    ckpt_start_time = t
    total_loop = train_cfg.steps // train_cfg.steps_per_eval + 1
    # total_loop = remain_steps // train_cfg.steps_per_eval + 1
    clear_metrics_every_epoch = train_cfg.clear_metrics_every_epoch
    # The detector is pretrained; restart its step counter for this run.
    net.set_global_step(torch.tensor([0]))
    if train_cfg.steps % train_cfg.steps_per_eval == 0:
        total_loop -= 1
    mixed_optimizer.zero_grad()
    try:
        for _ in range(total_loop):
            if total_step_elapsed + train_cfg.steps_per_eval > train_cfg.steps:
                steps = train_cfg.steps % train_cfg.steps_per_eval
            else:
                steps = train_cfg.steps_per_eval
            for step in range(steps):
                lr_scheduler.step(net.get_global_step())
                try:
                    example = next(data_iter)
                except StopIteration:
                    print("end epoch")
                    if clear_metrics_every_epoch:
                        net.clear_metrics()
                    data_iter = iter(dataloader)
                    example = next(data_iter)
                example_torch = example_convert_to_torch(example, float_dtype)
                batch_size = example["anchors"].shape[0]
                all_3d_output_camera_dict, all_3d_output, top_predictions, fusion_input, tensor_index = net(
                    example_torch, detection_2d_path)
                d3_gt_boxes = example_torch["d3_gt_boxes"][0, :, :]
                if d3_gt_boxes.shape[0] == 0:
                    # No ground truth in this frame: every anchor is negative.
                    # NOTE(review): 70400 is presumably the fixed anchor count
                    # of the car.fhd config — confirm against the config.
                    target_for_fusion = np.zeros((1, 70400, 1))
                    positives = torch.zeros(1, 70400).type(torch.float32).cuda()
                    negatives = torch.zeros(1, 70400).type(torch.float32).cuda()
                    negatives[:, :] = 1
                else:
                    d3_gt_boxes_camera = box_torch_ops.box_lidar_to_camera(
                        d3_gt_boxes, example_torch['rect'][0, :],
                        example_torch['Trv2c'][0, :])
                    d3_gt_boxes_camera_bev = d3_gt_boxes_camera[:, [
                        0, 2, 3, 5, 6
                    ]]
                    ###### predicted bev boxes
                    pred_3d_box = all_3d_output_camera_dict[0]["box3d_camera"]
                    pred_bev_box = pred_3d_box[:, [0, 2, 3, 5, 6]]
                    # iou_bev = bev_box_overlap(d3_gt_boxes_camera_bev.detach().cpu().numpy(),
                    #     pred_bev_box.detach().cpu().numpy(), criterion=-1)
                    iou_bev = d3_box_overlap(
                        d3_gt_boxes_camera.detach().cpu().numpy(),
                        pred_3d_box.squeeze().detach().cpu().numpy(),
                        criterion=-1)
                    iou_bev_max = np.amax(iou_bev, axis=0)
                    # IoU >= 0.7 -> positive, IoU <= 0.5 -> negative,
                    # in between -> ignored (weight 0).
                    target_for_fusion = ((iou_bev_max >= 0.7) * 1).reshape(
                        1, -1, 1)
                    positive_index = ((iou_bev_max >= 0.7) * 1).reshape(1, -1)
                    positives = torch.from_numpy(positive_index).type(
                        torch.float32).cuda()
                    negative_index = ((iou_bev_max <= 0.5) * 1).reshape(1, -1)
                    negatives = torch.from_numpy(negative_index).type(
                        torch.float32).cuda()
                cls_preds, flag = fusion_layer(fusion_input.cuda(),
                                               tensor_index.cuda())
                one_hot_targets = torch.from_numpy(target_for_fusion).type(
                    torch.float32).cuda()
                negative_cls_weights = negatives.type(torch.float32) * 1.0
                cls_weights = negative_cls_weights + 1.0 * positives.type(
                    torch.float32)
                pos_normalizer = positives.sum(1, keepdim=True).type(
                    torch.float32)
                cls_weights /= torch.clamp(pos_normalizer, min=1.0)
                if flag == 1:
                    cls_losses = focal_loss._compute_loss(
                        cls_preds, one_hot_targets,
                        cls_weights.cuda())  # [N, M]
                    cls_losses_reduced = cls_losses.sum(
                    ) / example_torch['labels'].shape[0]
                    cls_loss_sum = cls_loss_sum + cls_losses_reduced
                    if train_cfg.enable_mixed_precision:
                        # BUGFIX: previously `loss *= loss_scale`, but no
                        # `loss` variable exists in this function — scale the
                        # loss actually backpropagated below.
                        cls_losses_reduced = cls_losses_reduced * loss_scale
                    cls_losses_reduced.backward()
                    mixed_optimizer.step()
                    mixed_optimizer.zero_grad()
                net.update_global_step()
                step_time = (time.time() - t)
                t = time.time()
                metrics = {}
                global_step = net.get_global_step()
                if global_step % display_step == 0:
                    print("now it is", global_step, "steps",
                          " and the cls_loss is :",
                          cls_loss_sum / display_step, "learning_rate: ",
                          float(optimizer.lr), file=logf)
                    print("now it is", global_step, "steps",
                          " and the cls_loss is :",
                          cls_loss_sum / display_step, "learning_rate: ",
                          float(optimizer.lr))
                    cls_loss_sum = 0
                ckpt_elasped_time = time.time() - ckpt_start_time
                if ckpt_elasped_time > train_cfg.save_checkpoints_secs:
                    torchplus.train.save_models(model_dir,
                                                [fusion_layer, optimizer],
                                                net.get_global_step())
                    ckpt_start_time = time.time()
            total_step_elapsed += steps
            torchplus.train.save_models(model_dir, [fusion_layer, optimizer],
                                        net.get_global_step())
            # ---- evaluation pass ----
            fusion_layer.eval()
            net.eval()
            result_path_step = result_path / f"step_{net.get_global_step()}"
            result_path_step.mkdir(parents=True, exist_ok=True)
            print("#################################")
            print("#################################", file=logf)
            print("# EVAL")
            print("# EVAL", file=logf)
            print("#################################")
            print("#################################", file=logf)
            print("Generate output labels...")
            print("Generate output labels...", file=logf)
            t = time.time()
            dt_annos = []
            prog_bar = ProgressBar()
            net.clear_timer()
            prog_bar.start(
                (len(eval_dataset) + eval_input_cfg.batch_size - 1) //
                eval_input_cfg.batch_size)
            val_loss_final = 0
            for example in iter(eval_dataloader):
                example = example_convert_to_torch(example, float_dtype)
                if pickle_result:
                    dt_annos_i, val_losses = predict_kitti_to_anno(
                        net, detection_2d_path, fusion_layer, example,
                        class_names, center_limit_range, model_cfg.lidar_input)
                    dt_annos += dt_annos_i
                    val_loss_final = val_loss_final + val_losses
                else:
                    _predict_kitti_to_file(net, detection_2d_path, example,
                                           result_path_step, class_names,
                                           center_limit_range,
                                           model_cfg.lidar_input)
                prog_bar.print_bar()
            sec_per_ex = len(eval_dataset) / (time.time() - t)
            print("validation_loss:", val_loss_final / len(eval_dataloader))
            print("validation_loss:",
                  val_loss_final / len(eval_dataloader),
                  file=logf)
            print(f'generate label finished({sec_per_ex:.2f}/s). start eval:')
            print(f'generate label finished({sec_per_ex:.2f}/s). start eval:',
                  file=logf)
            gt_annos = [
                info["annos"] for info in eval_dataset.dataset.kitti_infos
            ]
            if not pickle_result:
                dt_annos = kitti.get_label_annos(result_path_step)
            # result = get_official_eval_result_v2(gt_annos, dt_annos, class_names)
            result = get_official_eval_result(gt_annos, dt_annos, class_names)
            print(result, file=logf)
            print(result)
            writer.add_text('eval_result', json.dumps(result, indent=2),
                            global_step)
            result = get_coco_eval_result(gt_annos, dt_annos, class_names)
            print(result, file=logf)
            print(result)
            if pickle_result:
                with open(result_path_step / "result.pkl", 'wb') as f:
                    pickle.dump(dt_annos, f)
            writer.add_text('eval_result', result, global_step)
            # net.train()
            fusion_layer.train()
    except Exception as e:
        # Save the fusion layer before re-raising so progress is not lost.
        torchplus.train.save_models(model_dir, [fusion_layer, optimizer],
                                    net.get_global_step())
        logf.close()
        raise e
    # save model before exit
    torchplus.train.save_models(model_dir, [fusion_layer, optimizer],
                                net.get_global_step())
    logf.close()
def onnx_model_generate(config_path,
                        model_dir,
                        result_path=None,
                        predict_test=False,
                        ckpt_path=None):
    """Restore a trained SECOND network and run the ONNX export path.

    Builds the network from the config, restores weights (latest checkpoint
    in ``model_dir`` or an explicit ``ckpt_path``), then feeds the first
    example batch of the eval set to ``export_onnx`` and returns.

    Args:
        config_path: path to a pipeline config file.
        model_dir: directory holding checkpoints (and the default result dir).
        result_path: where results are written; defaults to
            ``model_dir/eval_results`` (or ``predict_test``).
        predict_test: selects the default result directory name.
        ckpt_path: optional explicit checkpoint to restore instead of the
            latest one in ``model_dir``.

    Returns:
        0 after exporting from the first batch; ``None`` if the eval
        dataloader is empty.

    Fix vs. previous revision: ``bar.print_bar()`` was placed *after* the
    ``return 0`` inside the loop and was therefore unreachable dead code;
    the progress tick now happens before returning.
    """
    model_dir = pathlib.Path(model_dir)
    if predict_test:
        result_name = 'predict_test'
    else:
        result_name = 'eval_results'
    if result_path is None:
        result_path = model_dir / result_name
    else:
        result_path = pathlib.Path(result_path)
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)
    input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    class_names = list(input_cfg.class_names)
    center_limit_range = model_cfg.post_center_limit_range
    ##########################
    ## Build Voxel Generator
    ##########################
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    net = second_builder.build(model_cfg, voxel_generator, target_assigner, 1)
    net.cuda()
    if train_cfg.enable_mixed_precision:
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    if ckpt_path is None:
        torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    else:
        torchplus.train.restore(ckpt_path, net)
    eval_dataset = input_reader_builder.build(input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=1,
        pin_memory=False,
        collate_fn=merge_second_batch)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    net.eval()
    result_path_step = result_path / f"step_{net.get_global_step()}"
    result_path_step.mkdir(parents=True, exist_ok=True)
    dt_annos = []
    global_set = None
    print("Generate output labels...")
    bar = ProgressBar()
    bar.start(len(eval_dataset) // input_cfg.batch_size + 1)
    for example in iter(eval_dataloader):
        example = example_convert_to_torch(example, float_dtype)
        # export_onnx consumes the example as a positional tuple.
        # NOTE(review): indices 8/9 are assumed to be the numpy image-shape
        # and calibration entries of the example dict — confirm against
        # the input reader's key order.
        example_tuple = list(example.values())
        batch_image_shape = example_tuple[8]
        example_tuple[8] = torch.from_numpy(example_tuple[8])
        example_tuple[9] = torch.from_numpy(example_tuple[9])
        dt_annos = export_onnx(net, example_tuple, class_names,
                               batch_image_shape, center_limit_range,
                               model_cfg.lidar_input, global_set)
        bar.print_bar()
        # A single batch is enough to trace/export the model.
        return 0
# NOTE(review): this span is a fragment — the enclosing `def` (which binds
# `cfg`, `input_cfg`, `eval_input_cfg`, `model_cfg`, `voxel_generator`,
# `target_assigner`) and the tail of the DataLoader(...) call lie outside
# this chunk; the call below is intentionally left open exactly as found.
float_dtype = torch.float32
if cfg.multi_gpu:
    num_gpu = torch.cuda.device_count()
    print(f"MULTI-GPU: use {num_gpu} gpu")
    # multi-gpu batches need a collate fn that keeps per-gpu sub-batches.
    collate_fn = merge_second_batch_multigpu
else:
    collate_fn = merge_second_batch
    num_gpu = 1
######################
# PREPARE INPUT
######################
dataset = input_reader_builder.build(input_cfg,
                                     model_cfg,
                                     training=True,
                                     voxel_generator=voxel_generator,
                                     target_assigner=target_assigner,
                                     multi_gpu=cfg.multi_gpu)
eval_dataset = input_reader_builder.build(eval_input_cfg,
                                          model_cfg,
                                          training=False,
                                          voxel_generator=voxel_generator,
                                          target_assigner=target_assigner)
# batch size and worker count scale with the number of GPUs.
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=input_cfg.batch_size * num_gpu,
    shuffle=True,
    num_workers=input_cfg.preprocess.num_workers * num_gpu,
    pin_memory=False,
    collate_fn=collate_fn,
def predict(config_path,
            model_dir,
            result_path=None,
            predict_test=False,
            ckpt_path=None,
            ref_detfile=None,
            pickle_result=True,
            bb_save_dir=None,
            pub_bb=None,
            pub_lidar=None):
    '''
    Setup network and provide useful output

    Restores a trained network, runs inference over the eval set, and
    optionally dumps each frame's predicted 3D boxes as one CSV per image
    into ``bb_save_dir``.

    Args:
        config_path: path to a pipeline config file.
        model_dir: directory holding checkpoints (and default result dir).
        result_path: where results are written; defaults to
            ``model_dir/eval_results`` (or ``predict_test``).
        predict_test: selects the default result directory name.
        ckpt_path: optional explicit checkpoint instead of the latest one.
        ref_detfile: unused here (kept for interface compatibility).
        pickle_result: unused here (kept for interface compatibility).
        bb_save_dir: if set, write per-frame bounding-box CSVs here.
        pub_bb, pub_lidar: optional publishers for visualization; the
            publishing code is currently commented out (see FIXMEs below).
    '''
    ####################
    # SETUP PARAMETERS #
    ####################
    model_dir = pathlib.Path(model_dir)
    if predict_test:
        result_name = 'predict_test'
    else:
        result_name = 'eval_results'
    if result_path is None:
        result_path = model_dir / result_name
    else:
        result_path = pathlib.Path(result_path)
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)
    # TODO: include this program as a function call in the localization/mapping code as needed
    # TODO: use whole pointcloud data instead of reduced pointcloud
    # TODO: [Done] store data in respective pcd and bounding box (csv) files
    # TODO: [Done] create a cpp file to read and show (n number of) pcd files with respective bounding boxes
    #       > [Done] Check if pcl_viewer can open pcd
    #       > [Done] Check if pcl_viewer can be called from a cpp program for vizualization
    #       > [Done] Check if that cpp program can also show a bounding box
    input_cfg = config.eval_input_reader  # Read the config file data into useful structures
    model_cfg = config.model.second  # Read the config file data into useful structures
    train_cfg = config.train_config  # Read the config file data into useful structures
    class_names = list(input_cfg.class_names)
    center_limit_range = model_cfg.post_center_limit_range
    #########################
    # BUILD VOXEL GENERATOR #
    #########################
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    #####################
    # NETWORK GENERATOR #
    #####################
    # Build the NN in GPU mode
    net = second_builder.build(model_cfg, voxel_generator, target_assigner)
    net.cuda()
    # Standard conversion approach if using FloatingPoint16 instead of
    # FloatingPoint32 type of tensor
    if train_cfg.enable_mixed_precision:
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    # Restore old checkpoint if possible
    if ckpt_path is None:
        torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    else:
        torchplus.train.restore(ckpt_path, net)
    # Setup network for evaluation mode
    net.eval()
    #####################
    # DATASET GENERATOR #
    #####################
    # Dataset build for easy usage
    eval_dataset = input_reader_builder.build(input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=input_cfg.batch_size,
        shuffle=False,
        num_workers=input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)
    # Further variable setup
    result_path_step = result_path / f"step_{net.get_global_step()}"
    result_path_step.mkdir(parents=True, exist_ok=True)
    t = time.time()
    dt_annos = []
    global_set = None
    print()
    print("Generate output labels...")
    bar = ProgressBar()
    bar.start(len(eval_dataset) // input_cfg.batch_size + 1)
    #################
    # NETWORK USAGE #
    #################
    # Predict a set of 'num_workers' samples, get info and reformat data as needed
    # temp_count = 0
    for example in iter(eval_dataloader):
        # pprint.pprint(example, width=1)
        # for key, value in example.items():
        #     print(key)
        #     print(np.shape(value))
        example = example_convert_to_torch(example, float_dtype)
        print(example['image_idx'])
        # pprint.pprint(example, width=1)
        # for key, value in example.items():
        #     print(key)
        #     print(np.shape(value))
        # if pickle_result:
        # NOTE: Predict network output
        # start_time = time.time()
        predictions_dicts = net(example)
        # # Save copy of data if user requested
        # if save_pcd:
        #     np.fromfile(str(v_path), dtype=np.float32, count=-1).reshape([-1, 4])
        # # Publish original data
        # if pub_lidar:
        #     data = PointCloud2()
        #     # FIXME: Extract pointclound info from 'example' (use original kitti data file if needed) > publish
        #     pub_lidar.publish(data)
        # # Publish network output
        # if pub_bb:
        #     data = MarkerArray()
        #     # FIXME: Create a wireframe 3D bounding box and, if possible, a transluscent 3D cuboid as well > publish
        #     pub_bb.publish(data)
        # print('Network predict time: {}'.format(time.time()-start_time))
        # pprint.pprint(predictions_dicts[0])
        # for key, value in predictions_dicts[0].items():
        #     print(key)
        #     print(np.shape(value))
        if bb_save_dir:
            save_path = pathlib.Path(bb_save_dir)
            save_path.mkdir(
                parents=True, exist_ok=True
            )  # create directory (and its parents) if non-existent
            for pred_dict in predictions_dicts:
                if pred_dict['box3d_lidar'] is not None:
                    bb_lidar = pred_dict['box3d_lidar'].detach().cpu().numpy()
                else:
                    # placeholder row so the CSV always has the 7 columns
                    bb_lidar = [[
                        'temp', 'temp', 'temp', 'temp', 'temp', 'temp', 'temp'
                    ]]
                df = pd.DataFrame(bb_lidar)
                df.columns = ['x', 'y', 'z', 'w', 'l', 'h', 't']
                filename = save_path.joinpath(
                    str(pred_dict['image_idx']) + '.csv')
                filename.write_text(df.to_csv(index=False))
def train(
        config_path: Union[str, Path, pipeline.TrainEvalPipelineConfig],
        model_dir: Union[str, Path],
        data_root_path: Union[str, Path],
        result_path: Optional[Union[str, Path]] = None,
        display_step: int = 50,
        pretrained_path=None,
        pretrained_include=None,
        pretrained_exclude=None,
        freeze_include=None,
        freeze_exclude=None,
        measure_time: bool = False,
        resume: bool = False,
):
    """Train a VoxelNet/SECOND model specified by a pipeline config.

    Args:
        config_path: Path to a text-format pipeline config, or an
            already-parsed ``TrainEvalPipelineConfig`` object (useful when
            sweeping parameters from one script).
        model_dir: Directory for checkpoints, logs and the config backup.
        data_root_path: Dataset root, substituted into the config by
            ``read_pipeline_config``.
        result_path: Where per-eval detection results are written;
            defaults to ``model_dir / "results"``.
        display_step: Log training metrics every this many global steps.
        pretrained_path / pretrained_include / pretrained_exclude:
            Optional checkpoint to warm-start from, with name filters
            applied by ``filter_param_dict``.
        freeze_include / freeze_exclude: Name filters for parameters to
            freeze via ``freeze_params_v2``.
        measure_time: Enable per-module timing inside the network.
        resume: Must be True to reuse an existing ``model_dir``.

    Raises:
        ValueError: If ``model_dir`` exists and ``resume`` is False.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_dir = real_path(model_dir, check_exists=False)
    if not resume and model_dir.exists():
        raise ValueError("model dir exists and you don't specify resume.")
    model_dir.mkdir(parents=True, exist_ok=True)
    model_dir = Path(model_dir)
    if result_path is None:
        result_path = model_dir / "results"
    else:
        result_path = assert_real_path(result_path, mkdir=True)
    config_file_bkp = DEFAULT_CONFIG_FILE_NAME
    if isinstance(config_path, pipeline.TrainEvalPipelineConfig):
        # directly provide a config object. this usually used
        # when you want to train with several different parameters in
        # one script.
        config = config_path
        proto_str = text_format.MessageToString(config, use_short_repeated_primitives=True, indent=2)
    else:
        config_path = assert_real_path(config_path)
        data_root_path = assert_real_path(data_root_path)
        config = read_pipeline_config(config_path, data_root_path)
        # Copy the contents of config_path to config_file_bkp verbatim without passing it through the protobuf parser.
        with open(str(config_path), "r") as f:
            proto_str = f.read()
    # Back up the config next to the checkpoints for reproducibility.
    with (model_dir / config_file_bkp).open("w") as f:
        f.write(proto_str)
    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    net = build_network(model_cfg, measure_time).to(device)
    if train_cfg.enable_mixed_precision:
        # net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    target_assigner = net.target_assigner
    voxel_generator = net.voxel_generator
    # print("num parameters:", len(list(net.parameters())))
    print("num parameters (million): ", count_parameters(net) * 1e-6)
    # Restore the network BEFORE building the lr scheduler so the restored
    # global step is available.
    torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    if pretrained_path is not None:
        # Warm-start: copy only parameters whose name AND shape match.
        model_dict = net.state_dict()
        pretrained_dict = torch.load(pretrained_path)
        pretrained_dict = filter_param_dict(pretrained_dict, pretrained_include, pretrained_exclude)
        new_pretrained_dict = {}
        for k, v in pretrained_dict.items():
            if k in model_dict and v.shape == model_dict[k].shape:
                new_pretrained_dict[k] = v
        print("Load pretrained parameters:")
        for k, v in new_pretrained_dict.items():
            print(k, v.shape)
        model_dict.update(new_pretrained_dict)
        net.load_state_dict(model_dict)
        freeze_params_v2(dict(net.named_parameters()), freeze_include, freeze_exclude)
        # Pretraining resets the step counter and metrics for a fresh run.
        net.clear_global_step()
        net.clear_metrics()
    optimizer_cfg = train_cfg.optimizer
    loss_scale = train_cfg.loss_scale_factor
    fastai_optimizer = optimizer_builder.build(
        optimizer_cfg, net, mixed=False, loss_scale=loss_scale)
    if loss_scale < 0:
        # Negative scale means dynamic loss scaling; note loss_scale is not
        # consumed after this point in this (non-amp) variant.
        loss_scale = "dynamic"
    amp_optimizer = fastai_optimizer
    torchplus.train.try_restore_latest_checkpoints(model_dir, [amp_optimizer])
    float_dtype = torch.float32
    collate_fn = merge_second_batch
    num_gpu = 1

    ######################
    # PREPARE INPUT
    ######################
    def get_train_dataloader(input_cfg, model_cfg, voxel_generator,
                             target_assigner, multi_gpu, num_gpu, collate_fn,
                             _worker_init_fn):
        # Rebuilt each time the dataset needs resetting (see reset_ds_epoch).
        dataset = input_reader_builder.build(
            input_cfg,
            model_cfg,
            training=True,
            voxel_generator=voxel_generator,
            target_assigner=target_assigner,
            multi_gpu=multi_gpu)
        dataloader = torch.utils.data.DataLoader(
            dataset,
            batch_size=input_cfg.batch_size * num_gpu,
            shuffle=True,
            num_workers=input_cfg.preprocess.num_workers * num_gpu,
            pin_memory=True,
            collate_fn=collate_fn,
            worker_init_fn=_worker_init_fn,
            drop_last=not multi_gpu)
        return dataloader

    eval_dataset = input_reader_builder.build(
        eval_input_cfg,
        model_cfg,
        training=False,
        voxel_generator=voxel_generator,
        target_assigner=target_assigner)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_input_cfg.batch_size,  # only support multi-gpu train
        shuffle=False,
        num_workers=eval_input_cfg.preprocess.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)

    ######################
    # TRAINING
    ######################
    model_logging = SimpleModelLog(model_dir)
    model_logging.open()
    model_logging.log_text(proto_str + "\n", 0, tag="config")
    # NOTE: in this variant train_cfg.steps/steps_per_eval are interpreted as
    # EPOCH counts and converted to step counts once the dataset size is known.
    epochs = train_cfg.steps
    epochs_per_eval = train_cfg.steps_per_eval
    clear_metrics_every_epoch = train_cfg.clear_metrics_every_epoch
    amp_optimizer.zero_grad()
    step_times = []
    eval_times = []
    t = time.time()
    reset_ds_epoch = False
    run_once = True
    # Respect an externally-configured MLflow experiment if one is set.
    if not (os.getenv("MLFLOW_EXPERIMENT_ID") or os.getenv("MLFLOW_EXPERIMENT_NAME")):
        mlflow.set_experiment("object_detection")
    try:
        while True:
            if run_once or reset_ds_epoch:
                # NOTE(review): `multi_gpu` and `_worker_init_fn` are not
                # defined in this function — presumably module-level names;
                # confirm against the rest of the file.
                dataloader = get_train_dataloader(input_cfg, model_cfg, voxel_generator, target_assigner, multi_gpu, num_gpu, collate_fn, _worker_init_fn)
                # Convert epoch budgets into absolute step budgets.
                total_step = int(np.ceil((len(dataloader.dataset) / dataloader.batch_size) * epochs))
                steps_per_eval = int(np.floor((len(dataloader.dataset) / dataloader.batch_size) * epochs_per_eval))
                # Mutates the in-memory config so downstream consumers see
                # step counts instead of epoch counts.
                train_cfg.steps = int(total_step)
                train_cfg.steps_per_eval = int(steps_per_eval)
                lr_scheduler = lr_scheduler_builder.build(optimizer_cfg, amp_optimizer, total_step)
                print(f"\nnumber of samples: {len(dataloader.dataset)}\ntotal_steps: {total_step}\nsteps_per_eval: {steps_per_eval}")
                run_once = False
            if clear_metrics_every_epoch:
                net.clear_metrics()
            for example in dataloader:
                lr_scheduler.step(net.get_global_step())
                time_metrics = example["metrics"]
                example.pop("metrics")
                example_torch = example_convert_to_torch(example, float_dtype)
                batch_size = example["anchors"].shape[0]
                ret_dict = net(example_torch)
                cls_preds = ret_dict["cls_preds"]
                loss = ret_dict["loss"].mean()
                cls_loss_reduced = ret_dict["cls_loss_reduced"].mean()
                loc_loss_reduced = ret_dict["loc_loss_reduced"].mean()
                cls_pos_loss = ret_dict["cls_pos_loss"].mean()
                cls_neg_loss = ret_dict["cls_neg_loss"].mean()
                loc_loss = ret_dict["loc_loss"]
                # cls_loss = ret_dict["cls_loss"]
                cared = ret_dict["cared"]
                labels = example_torch["labels"]
                loss.backward()
                torch.nn.utils.clip_grad_norm_(net.parameters(), 30.0)
                # torch.nn.utils.clip_grad_norm_(amp.master_params(amp_optimizer), 10.0)
                amp_optimizer.step()
                amp_optimizer.zero_grad()
                net.update_global_step()
                global_step = net.get_global_step()
                net_metrics = net.update_metrics(cls_loss_reduced,
                                                 loc_loss_reduced, cls_preds,
                                                 labels, cared)
                step_time = (time.time() - t)
                step_times.append(step_time)
                t = time.time()
                metrics = {}
                # Positive/negative anchor counts from the FIRST sample only.
                num_pos = int((labels > 0)[0].float().sum().cpu().numpy())
                num_neg = int((labels == 0)[0].float().sum().cpu().numpy())
                if 'anchors_mask' not in example_torch:
                    num_anchors = example_torch['anchors'].shape[1]
                else:
                    num_anchors = int(example_torch['anchors_mask'][0].sum())
                if global_step % display_step == 0:
                    if measure_time:
                        for name, val in net.get_avg_time_dict().items():
                            print(f"avg {name} time = {val * 1000:.3f} ms")
                    # Per-element localization loss, averaged over the batch.
                    loc_loss_elem = [
                        float(loc_loss[:, :, i].sum().detach().cpu().numpy() /
                              batch_size) for i in range(loc_loss.shape[-1])
                    ]
                    # ETA estimate from the mean step time since the last log.
                    total_seconds = ((total_step - global_step) * np.mean(step_times))
                    if len(eval_times) != 0:
                        eval_seconds = ((epochs / epochs_per_eval) - len(eval_times)) * np.mean(eval_times)
                        total_seconds += eval_seconds
                    next_eval_seconds = (steps_per_eval - (global_step % steps_per_eval)) * np.mean(step_times)
                    metrics["runtime"] = {
                        "step": global_step,
                        "steptime": np.mean(step_times),
                        "ETA": seconds_to_eta(total_seconds),
                        "eval_ETA": seconds_to_eta(next_eval_seconds),
                    }
                    metrics["runtime"].update(time_metrics[0])
                    step_times = []
                    metrics.update(net_metrics)
                    metrics["loss"]["loc_elem"] = loc_loss_elem
                    metrics["loss"]["cls_pos_rt"] = float(
                        cls_pos_loss.detach().cpu().numpy())
                    metrics["loss"]["cls_neg_rt"] = float(
                        cls_neg_loss.detach().cpu().numpy())
                    if model_cfg.use_direction_classifier:
                        dir_loss_reduced = ret_dict["dir_loss_reduced"].mean()
                        metrics["loss"]["dir_rt"] = float(
                            dir_loss_reduced.detach().cpu().numpy())
                    metrics["misc"] = {
                        "num_vox": int(example_torch["voxels"].shape[0]),
                        "num_pos": int(num_pos),
                        "num_neg": int(num_neg),
                        "num_anchors": int(num_anchors),
                        "lr": float(amp_optimizer.lr),
                        "mem_usage": psutil.virtual_memory().percent,
                    }
                    model_logging.log_metrics(metrics, global_step)
                # if global_step % steps_per_eval != 0 and global_step % 1000 == 0:
                #     torchplus.train.save_models(model_dir, [net, amp_optimizer], net.get_global_step())
                if global_step % steps_per_eval == 0:
                    # Checkpoint, then run a full evaluation pass.
                    torchplus.train.save_models(model_dir, [net, amp_optimizer], global_step)
                    net.eval()
                    result_path_step = result_path / f"step_{global_step}"
                    result_path_step.mkdir(parents=True, exist_ok=True)
                    model_logging.log_text("#################################", global_step)
                    model_logging.log_text("# EVAL", global_step)
                    model_logging.log_text("#################################", global_step)
                    model_logging.log_text("Generate output labels...", global_step)
                    t = time.time()
                    detections = []
                    prog_bar = ProgressBar()
                    net.clear_timer()
                    prog_bar.start((len(eval_dataset) + eval_input_cfg.batch_size - 1) // eval_input_cfg.batch_size)
                    for example in iter(eval_dataloader):
                        example = example_convert_to_torch(example, float_dtype)
                        detections += net(example)
                        prog_bar.print_bar()
                    sec_per_ex = len(eval_dataset) / (time.time() - t)
                    eval_times.append((time.time() - t))
                    model_logging.log_text(f'generate label finished({sec_per_ex:.2f}/s). start eval:', global_step)
                    result_dict = eval_dataset.dataset.evaluation(detections, result_path_step)
                    if result_dict is None:
                        raise RuntimeError("eval_dataset.dataset.evaluation() returned None")
                    for k, v in result_dict["results"].items():
                        model_logging.log_text("Evaluation {}".format(k), global_step)
                        model_logging.log_text(v, global_step)
                    model_logging.log_metrics(result_dict["detail"], global_step)
                    with open(result_path_step / "result.pkl", 'wb') as f:
                        pickle.dump(detections, f)
                    net.train()
                if global_step >= total_step:
                    break
            if net.get_global_step() >= total_step:
                break
    except Exception as e:
        # Best-effort crash dump: log the offending example's metadata (if a
        # batch was in flight) and save a checkpoint before re-raising.
        if 'example' in locals():
            print(json.dumps(example["metadata"], indent=2))
        global_step = net.get_global_step()
        model_logging.log_text(str(e), global_step)
        if 'example' in locals():
            model_logging.log_text(json.dumps(example["metadata"], indent=2), global_step)
        torchplus.train.save_models(model_dir, [net, amp_optimizer], global_step)
        raise e
    finally:
        model_logging.close()
    # Only reached on clean completion (exceptions re-raise above).
    torchplus.train.save_models(model_dir, [net, amp_optimizer], net.get_global_step())

    def _save_checkpoint_info(file_path, config_filename, checkpoint_filename):
        # Write a small YAML manifest pairing the config with its checkpoint.
        from yaml import dump
        with open(file_path, "w") as config_info_file:
            checkpoint_info = {
                "config": config_filename,
                "checkpoint": checkpoint_filename
            }
            dump(checkpoint_info, config_info_file, default_flow_style=False)

    # Publish the final artifacts (manifest, config, last checkpoint) to MLflow.
    ckpt_info_path = str(model_dir / "checkpoint_info.yaml")
    latest_ckpt_filename = "voxelnet-{}.tckpt".format(net.get_global_step())
    _save_checkpoint_info(ckpt_info_path, config_file_bkp, latest_ckpt_filename)
    mlflow.log_artifact(ckpt_info_path, "model")
    mlflow.log_artifact(str(model_dir / config_file_bkp), "model")
    mlflow.log_artifact(str(model_dir / latest_ckpt_filename), "model")
def main(config_path, lc_horizon, num_examples, model_dir, ckpt_path=None,
         **kwargs):
    """Benchmark single-example inference latency over `num_examples` scenes.

    Don't support pickle_result anymore. if you want to generate kitti label file,
    please use kitti_anno_to_label_file and convert_detection_to_kitti_annos
    in second.data.kitti_dataset.

    Args:
        config_path: Path to a text-format pipeline config, or a parsed
            config object.
        lc_horizon: Horizon value written into
            ``eval_input_reader.cum_lc_wrapper`` and used to pick the
            dataset index per scene.
        num_examples: Number of scenes to time.
        model_dir: Directory holding checkpoints (used when ``ckpt_path``
            is None).
        ckpt_path: Explicit checkpoint to restore instead of the latest one.
        **kwargs: Must be empty; kept only to reject stale CLI arguments.

    Returns:
        Tuple ``(mean, interval)``: mean per-example wall time in seconds
        and its 95% confidence half-width.
    """
    assert len(kwargs) == 0
    model_dir = str(Path(model_dir).resolve())
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if isinstance(config_path, str):
        # directly provide a config object. this usually used
        # when you want to eval with several different parameters in
        # one script.
        config = pipeline_pb2.TrainEvalPipelineConfig()
        with open(config_path, "r") as f:
            proto_str = f.read()
            text_format.Merge(proto_str, config)
    else:
        config = config_path
    input_cfg = config.eval_input_reader
    # Inject the requested horizon into the eval reader config in place.
    input_cfg.cum_lc_wrapper.lc_horizon = lc_horizon
    model_cfg = config.model.second
    train_cfg = config.train_config
    net = build_network(model_cfg, measure_time=False).to(device)
    if train_cfg.enable_mixed_precision:
        net.half()
        print("half inference!")
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    target_assigner = net.target_assigner
    voxel_generator = net.voxel_generator
    if ckpt_path is None:
        assert model_dir is not None
        torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    else:
        torchplus.train.restore(ckpt_path, net)
    # Latency is measured one example at a time.
    batch_size = 1
    eval_dataset = input_reader_builder.build(input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner,
                                              net=net)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    net.eval()
    t = time.time()
    detections = []
    print("Generate output labels...")
    bar = ProgressBar()
    bar.start((len(eval_dataset) + batch_size - 1) // batch_size)
    prep_example_times = []
    prep_times = []
    t2 = time.time()
    times = []
    for scene_id in trange(num_examples):
        idx = eval_dataset.scene_id_and_step_to_idx(scene_id, lc_horizon)
        # Synchronize around the timed region so GPU work is fully counted.
        torch.cuda.synchronize()
        b_ex_time = time.time()
        example = eval_dataset[idx]
        example = merge_second_batch([example])
        example = example_convert_to_torch(example, float_dtype)
        with torch.no_grad():
            detections = net(example)
        torch.cuda.synchronize()
        e_ex_time = time.time()
        # Free tensors promptly to keep memory flat across iterations.
        del example, detections
        times.append(e_ex_time - b_ex_time)
    times = np.array(times)
    mean = times.mean()
    interval = 1.96 * times.std() / np.sqrt(
        len(times))  # 95% confidence interval
    return mean, interval
def train(config_path,
          model_dir,
          result_path=None,
          create_folder=False,
          display_step=50,
          summary_step=5,
          resume=False):
    """Train a VoxelNet model specified by a config file (FCOS-head variant).

    This variant reads FCOS-style losses (``total_loss``, ``loss_cls``,
    ``loss_reg``) from the network's return dict and evaluates once per
    ``steps_per_eval`` chunk rather than inside the step loop.

    Args:
        config_path: Path to a text-format pipeline config, or a parsed
            config object.
        model_dir: Directory for checkpoints, logs and the config backup.
        result_path: Where evaluation results go; defaults to
            ``model_dir / 'results'``.
        create_folder: If True and ``model_dir`` exists, create a fresh
            sibling folder instead of reusing it.
        display_step: Log metrics every this many global steps.
        summary_step: Unused in this variant; kept for CLI compatibility.
        resume: Must be True to reuse an existing ``model_dir``.

    Raises:
        ValueError: If ``model_dir`` exists and ``resume`` is False.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if create_folder:
        if pathlib.Path(model_dir).exists():
            model_dir = torchplus.train.create_folder(model_dir)
    model_dir = pathlib.Path(model_dir)
    if not resume and model_dir.exists():
        raise ValueError("model dir exists and you don't specify resume.")
    model_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    config_file_bkp = "pipeline.config"
    if isinstance(config_path, str):
        # directly provide a config object. this usually used
        # when you want to train with several different parameters in
        # one script.
        config = pipeline_pb2.TrainEvalPipelineConfig()
        with open(config_path, "r") as f:
            proto_str = f.read()
            text_format.Merge(proto_str, config)
    else:
        config = config_path
        proto_str = text_format.MessageToString(config, indent=2)
    # Back up the effective config next to the checkpoints.
    with (model_dir / config_file_bkp).open("w") as f:
        f.write(proto_str)
    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    net = build_network(model_cfg).to(device)
    if train_cfg.enable_mixed_precision:
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    target_assigner = net.target_assigner
    voxel_generator = net.voxel_generator
    class_names = target_assigner.classes
    # net_train = torch.nn.DataParallel(net).cuda()
    print("num_trainable parameters:", len(list(net.parameters())))
    # for n, p in net.named_parameters():
    #     print(n, p.shape)
    ######################
    # BUILD OPTIMIZER
    ######################
    # we need global_step to create lr_scheduler, so restore net first.
    torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    gstep = net.get_global_step() - 1
    optimizer_cfg = train_cfg.optimizer
    loss_scale = train_cfg.loss_scale_factor
    mixed_optimizer = optimizer_builder.build(
        optimizer_cfg,
        net,
        mixed=train_cfg.enable_mixed_precision,
        loss_scale=loss_scale)
    optimizer = mixed_optimizer
    center_limit_range = model_cfg.post_center_limit_range
    """
    if train_cfg.enable_mixed_precision:
        mixed_optimizer = torchplus.train.MixedPrecisionWrapper(
            optimizer, loss_scale)
    else:
        mixed_optimizer = optimizer
    """
    # must restore optimizer AFTER using MixedPrecisionWrapper
    torchplus.train.try_restore_latest_checkpoints(model_dir,
                                                   [mixed_optimizer])
    lr_scheduler = lr_scheduler_builder.build(optimizer_cfg, optimizer,
                                              train_cfg.steps)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    ######################
    # PREPARE INPUT
    ######################
    dataset = input_reader_builder.build(input_cfg,
                                         model_cfg,
                                         training=True,
                                         voxel_generator=voxel_generator,
                                         target_assigner=target_assigner)
    eval_dataset = input_reader_builder.build(eval_input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=input_cfg.batch_size,
        shuffle=True,
        num_workers=input_cfg.preprocess.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch,
        worker_init_fn=_worker_init_fn)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_input_cfg.batch_size,
        shuffle=False,
        num_workers=eval_input_cfg.preprocess.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)
    data_iter = iter(dataloader)
    print(data_iter)
    ######################
    # TRAINING
    ######################
    model_logging = SimpleModelLog(model_dir)
    model_logging.open()
    model_logging.log_text(proto_str + "\n", 0, tag="config")
    total_step_elapsed = 0
    remain_steps = train_cfg.steps - net.get_global_step()
    t = time.time()
    ckpt_start_time = t
    steps_per_eval = train_cfg.steps_per_eval
    # Split the total step budget into chunks of steps_per_eval, evaluating
    # after each chunk; the +1 covers a possible remainder chunk.
    total_loop = train_cfg.steps // train_cfg.steps_per_eval + 1
    clear_metrics_every_epoch = train_cfg.clear_metrics_every_epoch
    if train_cfg.steps % train_cfg.steps_per_eval == 0:
        total_loop -= 1
    mixed_optimizer.zero_grad()
    try:
        for _ in range(total_loop):
            if total_step_elapsed + train_cfg.steps_per_eval > train_cfg.steps:
                steps = train_cfg.steps % train_cfg.steps_per_eval
            else:
                steps = train_cfg.steps_per_eval
            for step in range(steps):
                lr_scheduler.step(net.get_global_step())
                try:
                    example = next(data_iter)
                except StopIteration:
                    # Dataset exhausted: start a new epoch.
                    print("end epoch")
                    if clear_metrics_every_epoch:
                        net.clear_metrics()
                    data_iter = iter(dataloader)
                    example = next(data_iter)
                example_torch = example_convert_to_torch(example, float_dtype)
                #batch_size = example["anchors"].shape[0]
                ret_dict = net(example_torch)
                # FCOS
                losses = ret_dict['total_loss']
                loss_cls = ret_dict["loss_cls"]
                loss_reg = ret_dict["loss_reg"]
                cls_preds = ret_dict['cls_preds']
                labels = ret_dict["labels"]
                # NOTE(review): `cared` is assigned from "labels", not a
                # separate "cared" key — confirm this is intentional for the
                # FCOS head (other variants use ret_dict["cared"]).
                cared = ret_dict["labels"]
                optimizer.zero_grad()
                losses.backward()
                #torch.nn.utils.clip_grad_norm_(net.parameters(), 1)
                # optimizer_step is for updating the parameter, so clip before update
                optimizer.step()
                net.update_global_step()
                #need to unpack the [0] for fpn
                net_metrics = net.update_metrics(loss_cls, loss_reg,
                                                 cls_preds[0], labels, cared)
                step_time = (time.time() - t)
                t = time.time()
                metrics = {}
                global_step = net.get_global_step()
                #print log
                if global_step % display_step == 0:
                    metrics["runtime"] = {
                        "step": global_step,
                        "steptime": step_time,
                    }
                    metrics.update(net_metrics)
                    metrics["misc"] = {
                        "num_vox": int(example_torch["voxels"].shape[0]),
                        "lr": float(optimizer.lr),
                    }
                    model_logging.log_metrics(metrics, global_step)
                    # NOTE(review): checkpoint cadence is tied to display_step
                    # here and ckpt_elasped_time is computed but unused —
                    # confirm the original indentation/intent.
                    ckpt_elasped_time = time.time() - ckpt_start_time
                    torchplus.train.save_models(model_dir, [net, optimizer],
                                                net.get_global_step())
            total_step_elapsed += steps
            # Checkpoint, then run a full evaluation pass for this chunk.
            torchplus.train.save_models(model_dir, [net, optimizer],
                                        net.get_global_step())
            net.eval()
            result_path_step = result_path / f"step_{net.get_global_step()}"
            result_path_step.mkdir(parents=True, exist_ok=True)
            model_logging.log_text("#################################",
                                   global_step)
            model_logging.log_text("# EVAL", global_step)
            model_logging.log_text("#################################",
                                   global_step)
            model_logging.log_text("Generate output labels...", global_step)
            t = time.time()
            detections = []
            prog_bar = ProgressBar()
            net.clear_timer()
            prog_bar.start(
                (len(eval_dataset) + eval_input_cfg.batch_size - 1) //
                eval_input_cfg.batch_size)
            for example in iter(eval_dataloader):
                example = example_convert_to_torch(example, float_dtype)
                with torch.no_grad():
                    detections += net(example)
                prog_bar.print_bar()
            sec_per_ex = len(eval_dataset) / (time.time() - t)
            model_logging.log_text(
                f'generate label finished({sec_per_ex:.2f}/s). start eval:',
                global_step)
            result_dict = eval_dataset.dataset.evaluation(
                detections, str(result_path_step))
            for k, v in result_dict["results"].items():
                model_logging.log_text("Evaluation {}".format(k), global_step)
                model_logging.log_text(v, global_step)
            model_logging.log_metrics(result_dict["detail"], global_step)
            with open(result_path_step / "result.pkl", 'wb') as f:
                pickle.dump(detections, f)
            net.train()
        '''
        new version of evaluation while trainging
        # do the evaluation while traingingi
        if global_step % steps_per_eval == 0:
            torchplus.train.save_models(model_dir, [net, optimizer],
                                        net.get_global_step())
            net.eval()
            result_path_step = result_path / f"step_{net.get_global_step()}"
            result_path_step.mkdir(parents=True, exist_ok=True)
            model_logging.log_text("#################################",
                                   global_step)
            model_logging.log_text("# EVAL", global_step)
            model_logging.log_text("#################################",
                                   global_step)
            model_logging.log_text("Generate output labels...", global_step)
            t = time.time()
            detections = []
            prog_bar = ProgressBar()
            net.clear_timer()
            prog_bar.start((len(eval_dataset) + eval_input_cfg.batch_size - 1)
                           // eval_input_cfg.batch_size)
            for example in iter(eval_dataloader):
                example = example_convert_to_torch(example, float_dtype)
                with torch.no_grad():
                    detections += net(example)
                prog_bar.print_bar()
            sec_per_ex = len(eval_dataset) / (time.time() - t)
            model_logging.log_text(
                f'generate label finished({sec_per_ex:.2f}/s). start eval:',
                global_step)
            result_dict = eval_dataset.dataset.evaluation(
                detections, str(result_path_step))
            for k, v in result_dict["results"].items():
                model_logging.log_text("Evaluation {}".format(k), global_step)
                model_logging.log_text(v, global_step)
            model_logging.log_metrics(result_dict["detail"], global_step)
            with open(result_path_step / "result.pkl", 'wb') as f:
                pickle.dump(detections, f)
            net.train()
        '''
    except Exception as e:
        print("trainging error")
        raise e
    finally:
        model_logging.close()
        # save model before exit
        torchplus.train.save_models(model_dir, [net, optimizer],
                                    net.get_global_step())
def evaluate(config_path,
             model_dir,
             result_path=None,
             predict_test=False,
             ckpt_path=None,
             ref_detfile=None,
             pickle_result=True,
             measure_time=False,
             batch_size=None):
    """Run KITTI-style evaluation of a trained SECOND model.

    Args:
        config_path: Path to a text-format pipeline config.
        model_dir: Directory holding checkpoints (used when ``ckpt_path``
            is None).
        result_path: Output directory; defaults to ``model_dir`` plus
            'predict_test' or 'eval_results' depending on ``predict_test``.
        predict_test: If True, only write predictions (no ground truth
            comparison).
        ckpt_path: Explicit checkpoint to restore instead of the latest.
        ref_detfile: Unused here; kept for CLI compatibility.
        pickle_result: If True, accumulate annos in memory and pickle them;
            otherwise write per-example KITTI label files.
        measure_time: Print detailed per-stage timing statistics.
        batch_size: Overrides the eval batch size from the config.
    """
    model_dir = pathlib.Path(model_dir)
    if predict_test:
        result_name = 'predict_test'
    else:
        result_name = 'eval_results'
    if result_path is None:
        result_path = model_dir / result_name
    else:
        result_path = pathlib.Path(result_path)
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)
    input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    center_limit_range = model_cfg.post_center_limit_range
    ######################
    # BUILD VOXEL GENERATOR
    ######################
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    # Bird's-eye-view extent: [xmin, ymin, xmax, ymax] of the 3D range.
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    class_names = target_assigner.classes
    net = second_builder.build(model_cfg,
                               voxel_generator,
                               target_assigner,
                               measure_time=measure_time)
    net.cuda()
    if ckpt_path is None:
        torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    else:
        torchplus.train.restore(ckpt_path, net)
    if train_cfg.enable_mixed_precision:
        net.half()
        print("half inference!")
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    batch_size = batch_size or input_cfg.batch_size
    eval_dataset = input_reader_builder.build(
        input_cfg,
        model_cfg,
        training=False,
        voxel_generator=voxel_generator,
        target_assigner=target_assigner)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,  # input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    net.eval()
    result_path_step = result_path / f"step_{net.get_global_step()}"
    result_path_step.mkdir(parents=True, exist_ok=True)
    t = time.time()
    dt_annos = []
    global_set = None
    print("Generate output labels...")
    bar = ProgressBar()
    bar.start((len(eval_dataset) + batch_size - 1) // batch_size)
    prep_example_times = []
    prep_times = []
    t2 = time.time()
    for example in iter(eval_dataloader):
        if measure_time:
            # Time between iterations = data-loading/preprocessing cost.
            prep_times.append(time.time() - t2)
            t1 = time.time()
            torch.cuda.synchronize()
        example = example_convert_to_torch(example, float_dtype)
        if measure_time:
            torch.cuda.synchronize()
            prep_example_times.append(time.time() - t1)
        if pickle_result:
            dt_annos += predict_kitti_to_anno(net, example, class_names,
                                              center_limit_range,
                                              model_cfg.lidar_input,
                                              global_set)
        else:
            _predict_kitti_to_file(net, example, result_path_step,
                                   class_names, center_limit_range,
                                   model_cfg.lidar_input)
        # print(json.dumps(net.middle_feature_extractor.middle_conv.sparity_dict))
        bar.print_bar()
        if measure_time:
            t2 = time.time()
    sec_per_example = len(eval_dataset) / (time.time() - t)
    print(f'generate label finished({sec_per_example:.2f}/s). start eval:')
    if measure_time:
        print(
            f"avg example to torch time: {np.mean(prep_example_times) * 1000:.3f} ms"
        )
        print(f"avg prep time: {np.mean(prep_times) * 1000:.3f} ms")
        for name, val in net.get_avg_time_dict().items():
            print(f"avg {name} time = {val * 1000:.3f} ms")
    if not predict_test:
        # Compare against ground truth with both the official KITTI metric
        # and the COCO-style metric.
        gt_annos = [info["annos"] for info in eval_dataset.dataset.kitti_infos]
        if not pickle_result:
            # Reload what was written to disk so both paths score identically.
            dt_annos = kitti.get_label_annos(result_path_step)
        result = get_official_eval_result(gt_annos, dt_annos, class_names)
        # print(json.dumps(result, indent=2))
        print(result)
        result = get_coco_eval_result(gt_annos, dt_annos, class_names)
        print(result)
        if pickle_result:
            with open(result_path_step / "result.pkl", 'wb') as f:
                pickle.dump(dt_annos, f)
def evaluate_from_result(config_path,
                         result_path_step=None,
                         measure_time=False,
                         batch_size=None,
                         use_detections_kitti=False,
                         **kwargs):
    """Re-score previously pickled detections without re-running inference.

    Don't support pickle_result anymore. if you want to generate kitti label file,
    please use kitti_anno_to_label_file and convert_detection_to_kitti_annos
    in second.data.kitti_dataset.

    Args:
        config_path: Path to a text-format pipeline config, or a parsed
            config object.
        result_path_step: Directory containing 'result.pkl' (or
            'detections_kitti.pkl'); also receives the evaluation outputs.
            Required.
        measure_time: Passed through to ``build_network``.
        batch_size: Overrides the eval batch size from the config.
        use_detections_kitti: If True, load 'detections_kitti.pkl' and score
            via ``evaluation_from_kitti_dets``; otherwise load 'result.pkl'
            and score via ``evaluation``.
        **kwargs: Must be empty; rejects stale CLI arguments.
    """
    assert len(kwargs) == 0
    assert result_path_step is not None
    result_path_step = Path(result_path_step)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if isinstance(config_path, str):
        # directly provide a config object. this usually used
        # when you want to eval with several different parameters in
        # one script.
        config = pipeline_pb2.TrainEvalPipelineConfig()
        with open(config_path, "r") as f:
            proto_str = f.read()
            text_format.Merge(proto_str, config)
    else:
        config = config_path
    input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    # The network is built only so the dataset wrapper can be constructed
    # consistently; no inference is run in this function.
    net = build_network(model_cfg, measure_time=measure_time).to(device)
    if train_cfg.enable_mixed_precision:
        net.half()
        print("half inference!")
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    target_assigner = net.target_assigner
    voxel_generator = net.voxel_generator
    batch_size = batch_size or input_cfg.batch_size
    eval_dataset = input_reader_builder.build(input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=input_cfg.preprocess.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    net.eval()
    if use_detections_kitti:
        # NOTE: pickle.load on these files assumes they were produced by this
        # project — do not point this at untrusted data.
        with open(result_path_step / "detections_kitti.pkl", 'rb') as f:
            detections = pickle.load(f)
        result_dict = eval_dataset.dataset.evaluation_from_kitti_dets(
            detections, str(result_path_step))
    else:
        with open(result_path_step / "result.pkl", 'rb') as f:
            detections = pickle.load(f)
        result_dict = eval_dataset.dataset.evaluation(detections,
                                                      str(result_path_step))
    if result_dict is not None:
        for k, v in result_dict["results"].items():
            print("Evaluation {}".format(k))
            print(v)
        with open(result_path_step / "result_kitti.pkl", 'wb') as f:
            pickle.dump(result_dict["result_kitti"], f)
        with open(result_path_step / "result_evaluation.pkl", 'wb') as f:
            pickle.dump(result_dict["results"], f)
def helper_tune_target_assigner(config_path,
                                target_rate=None,
                                update_freq=200,
                                update_delta=0.01,
                                num_tune_epoch=5):
    """get information of target assign to tune thresholds in anchor generator.

    If ``target_rate`` is given (mapping class name -> desired anchors per
    ground-truth box), iteratively nudge each class's match/unmatch
    thresholds by ``update_delta`` every ``update_freq`` observed boxes so
    the observed anchors-per-box rate approaches the target. Then run one
    plain pass to report final per-class counts and timing.

    Args:
        config_path: Path to a text-format pipeline config, or a parsed
            config object.
        target_rate: Optional dict of class name -> target anchor rate;
            when None, the tuning loop is skipped entirely.
        update_freq: Number of boxes of a class between threshold updates.
        update_delta: Step size applied to both thresholds per update.
        num_tune_epoch: Number of tuning passes over the dataset.
    """
    if isinstance(config_path, str):
        # directly provide a config object. this usually used
        # when you want to train with several different parameters in
        # one script.
        config = pipeline_pb2.TrainEvalPipelineConfig()
        with open(config_path, "r") as f:
            proto_str = f.read()
            text_format.Merge(proto_str, config)
    else:
        config = config_path
        proto_str = text_format.MessageToString(config, indent=2)
    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    net = build_network(model_cfg, False)
    # if train_cfg.enable_mixed_precision:
    #     net.half()
    #     net.metrics_to_float()
    #     net.convert_norm_to_float(net)
    target_assigner = net.target_assigner
    voxel_generator = net.voxel_generator
    dataset = input_reader_builder.build(input_cfg,
                                         model_cfg,
                                         training=True,
                                         voxel_generator=voxel_generator,
                                         target_assigner=target_assigner,
                                         multi_gpu=False)
    # batch_size=1 / num_workers=0 so label counts map 1:1 to samples.
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=0,
                                             pin_memory=False,
                                             collate_fn=merge_second_batch,
                                             worker_init_fn=_worker_init_fn,
                                             drop_last=False)
    class_count = {}          # per-class ground-truth box count (final pass)
    anchor_count = {}         # per-class assigned-anchor count (final pass)
    class_count_tune = {}     # rolling per-class box count during tuning
    anchor_count_tune = {}    # rolling per-class anchor count during tuning
    for c in target_assigner.classes:
        class_count[c] = 0
        anchor_count[c] = 0
        class_count_tune[c] = 0
        anchor_count_tune[c] = 0
    step = 0
    classes = target_assigner.classes
    if target_rate is None:
        num_tune_epoch = 0
    for epoch in range(num_tune_epoch):
        for example in dataloader:
            gt_names = example["gt_names"]
            for name in gt_names:
                class_count_tune[name] += 1
            labels = example['labels']
            # Label value i corresponds to classes[i - 1]; 0 is background.
            for i in range(1, len(classes) + 1):
                anchor_count_tune[classes[i - 1]] += int(np.sum(labels == i))
            if target_rate is not None:
                for name, rate in target_rate.items():
                    if class_count_tune[name] > update_freq:
                        # calc rate
                        current_rate = anchor_count_tune[
                            name] / class_count_tune[name]
                        # Too many anchors per box -> raise thresholds;
                        # too few -> lower them. Mutates the assigner's
                        # anchor generators in place.
                        if current_rate > rate:
                            target_assigner._anchor_generators[classes.index(
                                name)].match_threshold += update_delta
                            target_assigner._anchor_generators[classes.index(
                                name)].unmatch_threshold += update_delta
                        else:
                            target_assigner._anchor_generators[classes.index(
                                name)].match_threshold -= update_delta
                            target_assigner._anchor_generators[classes.index(
                                name)].unmatch_threshold -= update_delta
                        # Restart the rolling window after each adjustment.
                        anchor_count_tune[name] = 0
                        class_count_tune[name] = 0
            step += 1
    # Final measurement pass with the (possibly tuned) thresholds.
    for c in target_assigner.classes:
        class_count[c] = 0
        anchor_count[c] = 0
    total_voxel_gene_time = 0
    count = 0
    for example in dataloader:
        gt_names = example["gt_names"]
        total_voxel_gene_time += example["metrics"][0]["voxel_gene_time"]
        count += 1
        for name in gt_names:
            class_count[name] += 1
        labels = example['labels']
        for i in range(1, len(classes) + 1):
            anchor_count[classes[i - 1]] += int(np.sum(labels == i))
    print("avg voxel gene time", total_voxel_gene_time / count)
    print(json.dumps(class_count, indent=2))
    print(json.dumps(anchor_count, indent=2))
    if target_rate is not None:
        for ag in target_assigner._anchor_generators:
            if ag.class_name in target_rate:
                print(ag.class_name, ag.match_threshold, ag.unmatch_threshold)
def train(config_path,
          model_dir,
          result_path=None,
          create_folder=False,
          display_step=50,
          summary_step=5,
          pickle_result=True):
    """train a VoxelNet model specified by a config file.

    Builds the voxel generator, target assigner, network and optimizer from
    the protobuf pipeline config, restores the latest checkpoint from
    ``model_dir`` if one exists, then runs the training loop: per-step
    forward/backward with gradient clipping, periodic metric logging to
    ``log.txt`` and TensorBoard, and periodic checkpointing. The model is
    always checkpointed before the function exits, including on error.

    Args:
        config_path: path to a text-format TrainEvalPipelineConfig proto.
        model_dir: directory for checkpoints, logs and summaries.
        result_path: where evaluation results would go; defaults to
            ``model_dir / 'results'``.
        create_folder: if True and ``model_dir`` exists, create a fresh
            folder instead of reusing it.
        display_step: log metrics every this many global steps.
        summary_step: unused here; kept for interface compatibility.
        pickle_result: unused here (in-training evaluation is disabled);
            kept for interface compatibility.
    """
    if create_folder:
        if pathlib.Path(model_dir).exists():
            model_dir = torchplus.train.create_folder(model_dir)

    model_dir = pathlib.Path(model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)
    # checkpoints taken at evaluation points are kept forever in their own dir
    eval_checkpoint_dir = model_dir / 'eval_checkpoints'
    eval_checkpoint_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    config_file_bkp = "pipeline.config"
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)
    # keep a copy of the config next to the checkpoints for reproducibility
    shutil.copyfile(config_path, str(model_dir / config_file_bkp))
    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    class_names = list(input_cfg.class_names)
    ######################
    # BUILD VOXEL GENERATOR
    ######################
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    ######################
    # BUILD TARGET ASSIGNER
    ######################
    # bird's-eye-view range: [xmin, ymin, xmax, ymax] of point_cloud_range
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    ######################
    # BUILD NET
    ######################
    center_limit_range = model_cfg.post_center_limit_range
    net = second_builder.build(model_cfg, voxel_generator, target_assigner,
                               input_cfg.batch_size)
    net.cuda()
    print("num_trainable parameters:", len(list(net.parameters())))
    ######################
    # BUILD OPTIMIZER
    ######################
    # we need global_step to create lr_scheduler, so restore net first.
    torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    gstep = net.get_global_step() - 1
    optimizer_cfg = train_cfg.optimizer
    if train_cfg.enable_mixed_precision:
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    optimizer = optimizer_builder.build(optimizer_cfg, net.parameters())
    if train_cfg.enable_mixed_precision:
        loss_scale = train_cfg.loss_scale_factor
        mixed_optimizer = torchplus.train.MixedPrecisionWrapper(
            optimizer, loss_scale)
    else:
        mixed_optimizer = optimizer
    # must restore optimizer AFTER using MixedPrecisionWrapper
    torchplus.train.try_restore_latest_checkpoints(model_dir,
                                                   [mixed_optimizer])
    lr_scheduler = lr_scheduler_builder.build(optimizer_cfg, optimizer, gstep)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    ######################
    # PREPARE INPUT
    ######################
    dataset = input_reader_builder.build(
        input_cfg,
        model_cfg,
        training=True,
        voxel_generator=voxel_generator,
        target_assigner=target_assigner)
    eval_dataset = input_reader_builder.build(
        eval_input_cfg,
        model_cfg,
        training=False,
        voxel_generator=voxel_generator,
        target_assigner=target_assigner)

    def _worker_init_fn(worker_id):
        # reseed each dataloader worker so augmentation differs per worker
        time_seed = np.array(time.time(), dtype=np.int32)
        np.random.seed(time_seed + worker_id)
        print(f"WORKER {worker_id} seed:", np.random.get_state()[1][0])

    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=input_cfg.batch_size,
        shuffle=True,
        num_workers=input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch,
        worker_init_fn=_worker_init_fn)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_input_cfg.batch_size,
        shuffle=False,
        num_workers=eval_input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)
    data_iter = iter(dataloader)
    ######################
    # TRAINING
    ######################
    log_path = model_dir / 'log.txt'
    logf = open(log_path, 'a')
    logf.write(proto_str)
    logf.write("\n")
    summary_dir = model_dir / 'summary'
    summary_dir.mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(str(summary_dir))
    total_step_elapsed = 0
    t = time.time()
    ckpt_start_time = t
    # one outer-loop iteration per evaluation interval
    total_loop = train_cfg.steps // train_cfg.steps_per_eval + 1
    clear_metrics_every_epoch = train_cfg.clear_metrics_every_epoch
    if train_cfg.steps % train_cfg.steps_per_eval == 0:
        total_loop -= 1
    mixed_optimizer.zero_grad()
    try:
        for _ in range(total_loop):
            if total_step_elapsed + train_cfg.steps_per_eval > train_cfg.steps:
                steps = train_cfg.steps % train_cfg.steps_per_eval
            else:
                steps = train_cfg.steps_per_eval
            for step in range(steps):
                lr_scheduler.step()
                try:
                    example = next(data_iter)
                except StopIteration:
                    # dataset exhausted: optionally reset metrics, restart
                    print("end epoch")
                    if clear_metrics_every_epoch:
                        net.clear_metrics()
                    data_iter = iter(dataloader)
                    example = next(data_iter)
                example_torch = example_convert_to_torch(example, float_dtype)
                batch_size = example["anchors"].shape[0]
                # training example layout:
                # [0:'voxels', 1:'num_points', 2:'coordinates', 3:'rect',
                #  4:'Trv2c', 5:'P2', 6:'anchors', 7:'anchors_mask',
                #  8:'labels', 9:'reg_targets', 10:'reg_weights',
                #  11:'image_idx', 12:'image_shape']
                example_tuple = list(example_torch.values())
                example_tuple[11] = torch.from_numpy(example_tuple[11])
                example_tuple[12] = torch.from_numpy(example_tuple[12])
                assert 13 == len(example_tuple), \
                    "something wrong with training input size!"

                # split the padded voxel tensor into per-channel pillar
                # tensors shaped (1, 1, P, N) expected by the network
                pillar_x = example_tuple[0][:, :, 0].unsqueeze(0).unsqueeze(0)
                pillar_y = example_tuple[0][:, :, 1].unsqueeze(0).unsqueeze(0)
                pillar_z = example_tuple[0][:, :, 2].unsqueeze(0).unsqueeze(0)
                pillar_i = example_tuple[0][:, :, 3].unsqueeze(0).unsqueeze(0)
                num_points_per_pillar = example_tuple[1].float().unsqueeze(0)

                # Find distance of x, y, and z from pillar center.
                # NOTE(review): the 0.16 voxel size and 0.08 / -39.6 offsets
                # are hard-coded for xyres_16.proto
                # (x_offset = vx/2 + pc_range[0], y_offset = vy/2 + pc_range[1]);
                # they must change if the voxel config changes.
                coors_x = example_tuple[2][:, 3].float()
                coors_y = example_tuple[2][:, 2].float()
                x_sub = coors_x.unsqueeze(1) * 0.16 + 0.08
                y_sub = coors_y.unsqueeze(1) * 0.16 + -39.6
                # broadcast each pillar's center offset over its (up to) 100
                # points
                ones = torch.ones([1, 100], dtype=torch.float32,
                                  device=pillar_x.device)
                x_sub_shaped = torch.mm(x_sub, ones).unsqueeze(0).unsqueeze(0)
                y_sub_shaped = torch.mm(y_sub, ones).unsqueeze(0).unsqueeze(0)

                num_points_for_a_pillar = pillar_x.size()[3]
                # mask flags real (non-padding) points within each pillar
                mask = get_paddings_indicator(num_points_per_pillar,
                                              num_points_for_a_pillar, axis=0)
                mask = mask.permute(0, 2, 1)
                mask = mask.unsqueeze(1)
                mask = mask.type_as(pillar_x)

                coors = example_tuple[2]
                anchors = example_tuple[6]
                labels = example_tuple[8]
                reg_targets = example_tuple[9]

                # renamed from `input` to avoid shadowing the builtin
                net_input = [pillar_x, pillar_y, pillar_z, pillar_i,
                             num_points_per_pillar, x_sub_shaped, y_sub_shaped,
                             mask, coors, anchors, labels, reg_targets]

                # ret_dict layout:
                # [0:loss, 1:cls_loss, 2:loc_loss, 3:cls_pos_loss,
                #  4:cls_neg_loss, 5:cls_preds, 6:dir_loss_reduced,
                #  7:cls_loss_reduced, 8:loc_loss_reduced, 9:cared]
                ret_dict = net(net_input)
                assert 10 == len(ret_dict), \
                    "something wrong with training output size!"

                cls_preds = ret_dict[5]
                loss = ret_dict[0].mean()
                cls_loss_reduced = ret_dict[7].mean()
                loc_loss_reduced = ret_dict[8].mean()
                cls_pos_loss = ret_dict[3]
                cls_neg_loss = ret_dict[4]
                loc_loss = ret_dict[2]
                cls_loss = ret_dict[1]
                dir_loss_reduced = ret_dict[6]
                cared = ret_dict[9]
                labels = example_tuple[8]
                if train_cfg.enable_mixed_precision:
                    loss *= loss_scale
                loss.backward()
                torch.nn.utils.clip_grad_norm_(net.parameters(), 10.0)
                mixed_optimizer.step()
                mixed_optimizer.zero_grad()
                net.update_global_step()
                net_metrics = net.update_metrics(cls_loss_reduced,
                                                 loc_loss_reduced, cls_preds,
                                                 labels, cared)

                step_time = (time.time() - t)
                t = time.time()
                metrics = {}
                # counts are taken from the first sample of the batch only
                num_pos = int((labels > 0)[0].float().sum().cpu().numpy())
                num_neg = int((labels == 0)[0].float().sum().cpu().numpy())
                num_anchors = int(example_tuple[7][0].sum())
                global_step = net.get_global_step()
                if global_step % display_step == 0:
                    # per-regression-channel localization loss, batch-averaged
                    loc_loss_elem = [
                        float(loc_loss[:, :, i].sum().detach().cpu().numpy() /
                              batch_size) for i in range(loc_loss.shape[-1])
                    ]
                    metrics["step"] = global_step
                    metrics["steptime"] = step_time
                    metrics.update(net_metrics)
                    metrics["loss"] = {}
                    metrics["loss"]["loc_elem"] = loc_loss_elem
                    metrics["loss"]["cls_pos_rt"] = float(
                        cls_pos_loss.detach().cpu().numpy())
                    metrics["loss"]["cls_neg_rt"] = float(
                        cls_neg_loss.detach().cpu().numpy())
                    if model_cfg.use_direction_classifier:
                        metrics["loss"]["dir_rt"] = float(
                            dir_loss_reduced.detach().cpu().numpy())
                    metrics["num_vox"] = int(example_tuple[0].shape[0])
                    metrics["num_pos"] = int(num_pos)
                    metrics["num_neg"] = int(num_neg)
                    metrics["num_anchors"] = int(num_anchors)
                    metrics["lr"] = float(
                        mixed_optimizer.param_groups[0]['lr'])
                    metrics["image_idx"] = example_tuple[11][0]
                    flatted_metrics = flat_nested_json_dict(metrics)
                    flatted_summarys = flat_nested_json_dict(metrics, "/")
                    for k, v in flatted_summarys.items():
                        if isinstance(v, (list, tuple)):
                            v = {str(i): e for i, e in enumerate(v)}
                            writer.add_scalars(k, v, global_step)
                        else:
                            writer.add_scalar(k, v, global_step)
                    metrics_str_list = []
                    for k, v in flatted_metrics.items():
                        if isinstance(v, float):
                            metrics_str_list.append(f"{k}={v:.3}")
                        elif isinstance(v, (list, tuple)):
                            if v and isinstance(v[0], float):
                                v_str = ', '.join([f"{e:.3}" for e in v])
                                metrics_str_list.append(f"{k}=[{v_str}]")
                            else:
                                metrics_str_list.append(f"{k}={v}")
                        else:
                            metrics_str_list.append(f"{k}={v}")
                    log_str = ', '.join(metrics_str_list)
                    print(log_str, file=logf)
                    print(log_str)
                ckpt_elasped_time = time.time() - ckpt_start_time
                if ckpt_elasped_time > train_cfg.save_checkpoints_secs:
                    torchplus.train.save_models(model_dir, [net, optimizer],
                                                net.get_global_step())
                    ckpt_start_time = time.time()
            total_step_elapsed += steps
            torchplus.train.save_models(model_dir, [net, optimizer],
                                        net.get_global_step())
            # Ensure that all evaluation points are saved forever
            torchplus.train.save_models(eval_checkpoint_dir, [net, optimizer],
                                        net.get_global_step(), max_to_keep=100)
            # NOTE(review): in-training evaluation is disabled (dead
            # commented-out code removed); `eval_dataloader`, `class_names`
            # and `center_limit_range` are kept for when it is restored.
    except Exception:
        # checkpoint before surfacing the failure; bare raise keeps the
        # original traceback
        torchplus.train.save_models(model_dir, [net, optimizer],
                                    net.get_global_step())
        logf.close()
        raise
    # save model before exit
    torchplus.train.save_models(model_dir, [net, optimizer],
                                net.get_global_step())
    logf.close()
def evaluate(config_path,
             model_dir,
             result_path=None,
             predict_test=False,
             ckpt_path=None,
             ref_detfile=None,
             pickle_result=True,
             angle_deg=0.0):
    """Run a trained model over the eval dataset and dump its detections.

    Restores the latest checkpoint from ``model_dir`` (or ``ckpt_path`` if
    given), runs inference over the eval split, and either pickles the
    KITTI-style annotations (``pickle_result=True``) or writes per-frame
    KITTI label files to ``result_path / f"step_{global_step}"``.

    Args:
        config_path: path to a text-format TrainEvalPipelineConfig proto.
        model_dir: directory holding checkpoints; default root for results.
        result_path: overrides where results are written.
        predict_test: write under 'predict_test' and skip loading gt annos.
        ckpt_path: explicit checkpoint to restore instead of the latest.
        ref_detfile: unused here; kept for interface compatibility.
        pickle_result: pickle annos vs. write KITTI label files.
        angle_deg: rotation passed to the input reader; also encoded in the
            result pickle file name.
    """
    model_dir = pathlib.Path(model_dir)
    if predict_test:
        result_name = 'predict_test'
    else:
        result_name = 'eval_results'
    if result_path is None:
        result_path = model_dir / result_name
    else:
        result_path = pathlib.Path(result_path)
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)

    input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    class_names = list(input_cfg.class_names)
    center_limit_range = model_cfg.post_center_limit_range
    ######################
    # BUILD VOXEL GENERATOR
    ######################
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    # bird's-eye-view range: [xmin, ymin, xmax, ymax] of point_cloud_range
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)

    net = second_builder.build(model_cfg, voxel_generator, target_assigner)
    net.cuda()
    if train_cfg.enable_mixed_precision:
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)

    if ckpt_path is None:
        torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    else:
        torchplus.train.restore(ckpt_path, net)

    eval_dataset = input_reader_builder.build(input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner,
                                              angle_deg=angle_deg)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=input_cfg.batch_size,
        shuffle=False,
        num_workers=input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)

    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32

    net.eval()
    # results for each restored checkpoint land in their own step folder
    result_path_step = result_path / f"step_{net.get_global_step()}"
    result_path_step.mkdir(parents=True, exist_ok=True)
    t = time.time()
    dt_annos = []
    global_set = None
    print("Generate output labels...")
    # bar = ProgressBar()
    # bar.start(len(eval_dataset) // input_cfg.batch_size + 1)
    for example in prog_bar(iter(eval_dataloader)):
        example = example_convert_to_torch(example, float_dtype)
        if pickle_result:
            dt_annos += predict_kitti_to_anno(net, example, class_names,
                                              center_limit_range,
                                              model_cfg.lidar_input,
                                              global_set)
        else:
            _predict_kitti_to_file(net, example, result_path_step,
                                   class_names, center_limit_range,
                                   model_cfg.lidar_input)
        # bar.print_bar()

    sec_per_example = len(eval_dataset) / (time.time() - t)
    print(f'generate label finished({sec_per_example:.2f}/s). start eval:')

    print(f"avg forward time per example: {net.avg_forward_time:.3f}")
    print(f"avg postprocess time per example: {net.avg_postprocess_time:.3f}")
    if not predict_test:
        gt_annos = [info["annos"] for info in eval_dataset.dataset.kitti_infos]
        if not pickle_result:
            dt_annos = kitti.get_label_annos(result_path_step)
        # result = get_official_eval_result(gt_annos, dt_annos, class_names)
        # print(result)
        # result = get_coco_eval_result(gt_annos, dt_annos, class_names)
        # print(result)
        if pickle_result:
            # NOTE(review): "%03d" truncates a fractional angle_deg to int
            with open(result_path_step / ("result_%03d.pkl" % angle_deg),
                      'wb') as f:
                pickle.dump(dt_annos, f)
def train(config_path,
          model_dir,
          result_path=None,
          create_folder=False,
          display_step=50,
          summary_step=5,
          pickle_result=True):
    """train a VoxelNet model specified by a config file.

    Builds the voxel generator, target assigner, network and optimizer from
    the protobuf pipeline config, restores the latest checkpoint from
    ``model_dir`` if one exists, then runs the training loop: per-step
    forward/backward with gradient clipping, periodic metric logging to
    ``log.txt`` and TensorBoard, and periodic checkpointing. The model is
    always checkpointed before the function exits, including on error.

    Args:
        config_path: path to a text-format TrainEvalPipelineConfig proto.
        model_dir: directory for checkpoints, logs and summaries.
        result_path: where evaluation results would go; defaults to
            ``model_dir / 'results'``.
        create_folder: if True and ``model_dir`` exists, create a fresh
            folder instead of reusing it.
        display_step: log metrics every this many global steps.
        summary_step: unused here; kept for interface compatibility.
        pickle_result: unused here (this variant performs no in-training
            evaluation); kept for interface compatibility.
    """
    if create_folder:
        if pathlib.Path(model_dir).exists():
            model_dir = torchplus.train.create_folder(model_dir)

    model_dir = pathlib.Path(model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)
    # checkpoints taken at evaluation points are kept forever in their own dir
    eval_checkpoint_dir = model_dir / 'eval_checkpoints'
    eval_checkpoint_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    config_file_bkp = "pipeline.config"
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)
    # keep a copy of the config next to the checkpoints for reproducibility
    shutil.copyfile(config_path, str(model_dir / config_file_bkp))
    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    class_names = list(input_cfg.class_names)
    #########################
    # Build Voxel Generator
    #########################
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    #########################
    # Build Target Assigner
    #########################
    # bird's-eye-view range: [xmin, ymin, xmax, ymax] of point_cloud_range
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    ######################
    # Build NetWork
    ######################
    center_limit_range = model_cfg.post_center_limit_range
    net = second_builder.build(model_cfg, voxel_generator, target_assigner,
                               input_cfg.batch_size)
    net.cuda()
    print("num_trainable parameters:", len(list(net.parameters())))
    ######################
    # Build Optimizer
    ######################
    # we need global_step to create lr_scheduler, so restore net first.
    torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    gstep = net.get_global_step() - 1
    optimizer_cfg = train_cfg.optimizer
    if train_cfg.enable_mixed_precision:
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    optimizer = optimizer_builder.build(optimizer_cfg, net.parameters())
    if train_cfg.enable_mixed_precision:
        loss_scale = train_cfg.loss_scale_factor
        mixed_optimizer = torchplus.train.MixedPrecisionWrapper(
            optimizer, loss_scale)
    else:
        mixed_optimizer = optimizer
    # must restore optimizer AFTER using MixedPrecisionWrapper
    torchplus.train.try_restore_latest_checkpoints(model_dir,
                                                   [mixed_optimizer])
    lr_scheduler = lr_scheduler_builder.build(optimizer_cfg, optimizer, gstep)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    ######################
    # Prepare Input
    ######################
    dataset = input_reader_builder.build(input_cfg,
                                         model_cfg,
                                         training=True,
                                         voxel_generator=voxel_generator,
                                         target_assigner=target_assigner)
    eval_dataset = input_reader_builder.build(eval_input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner)

    def _worker_init_fn(worker_id):
        # reseed each dataloader worker so augmentation differs per worker
        time_seed = np.array(time.time(), dtype=np.int32)
        np.random.seed(time_seed + worker_id)
        print(f"WORKER {worker_id} seed:", np.random.get_state()[1][0])

    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=input_cfg.batch_size,
                                             shuffle=True,
                                             num_workers=input_cfg.num_workers,
                                             pin_memory=False,
                                             collate_fn=merge_second_batch,
                                             worker_init_fn=_worker_init_fn)
    # NOTE(review): eval_dataloader is built but unused in this variant;
    # kept for when in-training evaluation is restored.
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_input_cfg.batch_size,
        shuffle=False,
        num_workers=eval_input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)
    data_iter = iter(dataloader)
    ######################
    # Training
    ######################
    log_path = model_dir / 'log.txt'
    logf = open(log_path, 'a')
    logf.write(proto_str)
    logf.write("\n")
    summary_dir = model_dir / 'summary'
    summary_dir.mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(str(summary_dir))
    total_step_elapsed = 0
    t = time.time()
    ckpt_start_time = t
    # one outer-loop iteration per evaluation interval
    total_loop = train_cfg.steps // train_cfg.steps_per_eval + 1
    clear_metrics_every_epoch = train_cfg.clear_metrics_every_epoch
    if train_cfg.steps % train_cfg.steps_per_eval == 0:
        total_loop -= 1
    mixed_optimizer.zero_grad()
    try:
        for _ in range(total_loop):
            if total_step_elapsed + train_cfg.steps_per_eval > train_cfg.steps:
                steps = train_cfg.steps % train_cfg.steps_per_eval
            else:
                steps = train_cfg.steps_per_eval
            for step in range(steps):
                lr_scheduler.step()
                try:
                    example = next(data_iter)
                except StopIteration:
                    # dataset exhausted: optionally reset metrics, restart
                    print("end epoch")
                    if clear_metrics_every_epoch:
                        net.clear_metrics()
                    data_iter = iter(dataloader)
                    example = next(data_iter)
                example_torch = example_convert_to_torch(example, float_dtype)
                batch_size = example["anchors"].shape[0]
                # training example layout:
                # [0:'voxels', 1:'num_points', 2:'coordinates', 3:'rect',
                #  4:'Trv2c', 5:'P2', 6:'anchors', 7:'anchors_mask',
                #  8:'labels', 9:'reg_targets', 10:'reg_weights',
                #  11:'image_idx', 12:'image_shape']
                example_tuple = list(example_torch.values())
                example_tuple[11] = torch.from_numpy(example_tuple[11])
                example_tuple[12] = torch.from_numpy(example_tuple[12])
                assert 13 == len(example_tuple), \
                    "something wrong with training input size!"

                # split the padded voxel tensor into per-channel pillar
                # tensors shaped (1, 1, P, N) expected by the network
                pillar_x = example_tuple[0][:, :, 0].unsqueeze(0).unsqueeze(0)
                pillar_y = example_tuple[0][:, :, 1].unsqueeze(0).unsqueeze(0)
                pillar_z = example_tuple[0][:, :, 2].unsqueeze(0).unsqueeze(0)
                pillar_i = example_tuple[0][:, :, 3].unsqueeze(0).unsqueeze(0)
                num_points_per_pillar = example_tuple[1].float().unsqueeze(0)
                ################################################################
                # Find distance of x, y, z from pillar center.
                # NOTE(review): the 0.16 voxel size and 0.08 / -39.6 offsets
                # are hard-coded for xyres_16.proto
                # (x_offset = vx/2 + pc_range[0], y_offset = vy/2 + pc_range[1]);
                # they must change if the voxel config changes.
                ################################################################
                coors_x = example_tuple[2][:, 3].float()
                coors_y = example_tuple[2][:, 2].float()
                x_sub = coors_x.unsqueeze(1) * 0.16 + 0.08
                y_sub = coors_y.unsqueeze(1) * 0.16 - 39.6
                # broadcast each pillar's center offset over its (up to) 100
                # points
                ones = torch.ones([1, 100],
                                  dtype=torch.float32,
                                  device=pillar_x.device)
                x_sub_shaped = torch.mm(x_sub, ones).unsqueeze(0).unsqueeze(0)
                y_sub_shaped = torch.mm(y_sub, ones).unsqueeze(0).unsqueeze(0)

                num_points_for_a_pillar = pillar_x.size()[3]
                # mask flags real (non-padding) points within each pillar
                mask = get_paddings_indicator(num_points_per_pillar,
                                              num_points_for_a_pillar,
                                              axis=0)
                mask = mask.permute(0, 2, 1)
                mask = mask.unsqueeze(1)
                mask = mask.type_as(pillar_x)

                coors = example_tuple[2]
                anchors = example_tuple[6]
                labels = example_tuple[8]
                reg_targets = example_tuple[9]

                # renamed from `input` to avoid shadowing the builtin
                net_input = [
                    pillar_x, pillar_y, pillar_z, pillar_i,
                    num_points_per_pillar, x_sub_shaped, y_sub_shaped, mask,
                    coors, anchors, labels, reg_targets
                ]

                # ret_dict layout:
                # [0:loss, 1:cls_loss, 2:loc_loss, 3:cls_pos_loss,
                #  4:cls_neg_loss, 5:cls_preds, 6:dir_loss_reduced,
                #  7:cls_loss_reduced, 8:loc_loss_reduced, 9:cared]
                ret_dict = net(net_input)
                assert 10 == len(ret_dict), \
                    "something wrong with training output size!"

                cls_preds = ret_dict[5]
                loss = ret_dict[0].mean()
                cls_loss_reduced = ret_dict[7].mean()
                loc_loss_reduced = ret_dict[8].mean()
                cls_pos_loss = ret_dict[3]
                cls_neg_loss = ret_dict[4]
                loc_loss = ret_dict[2]
                cls_loss = ret_dict[1]
                dir_loss_reduced = ret_dict[6]
                cared = ret_dict[9]
                labels = example_tuple[8]
                if train_cfg.enable_mixed_precision:
                    loss *= loss_scale
                loss.backward()
                torch.nn.utils.clip_grad_norm_(net.parameters(), 10.0)
                mixed_optimizer.step()
                mixed_optimizer.zero_grad()
                net.update_global_step()
                net_metrics = net.update_metrics(cls_loss_reduced,
                                                 loc_loss_reduced, cls_preds,
                                                 labels, cared)

                step_time = (time.time() - t)
                t = time.time()
                metrics = {}
                # counts are taken from the first sample of the batch only
                num_pos = int((labels > 0)[0].float().sum().cpu().numpy())
                num_neg = int((labels == 0)[0].float().sum().cpu().numpy())
                num_anchors = int(example_tuple[7][0].sum())
                global_step = net.get_global_step()
                if global_step % display_step == 0:
                    # per-regression-channel localization loss, batch-averaged
                    loc_loss_elem = [
                        float(loc_loss[:, :, i].sum().detach().cpu().numpy() /
                              batch_size) for i in range(loc_loss.shape[-1])
                    ]
                    metrics["step"] = global_step
                    metrics["steptime"] = step_time
                    metrics.update(net_metrics)
                    metrics["loss"] = {}
                    metrics["loss"]["loc_elem"] = loc_loss_elem
                    metrics["loss"]["cls_pos_rt"] = float(
                        cls_pos_loss.detach().cpu().numpy())
                    metrics["loss"]["cls_neg_rt"] = float(
                        cls_neg_loss.detach().cpu().numpy())
                    if model_cfg.use_direction_classifier:
                        metrics["loss"]["dir_rt"] = float(
                            dir_loss_reduced.detach().cpu().numpy())
                    metrics["num_vox"] = int(example_tuple[0].shape[0])
                    metrics["num_pos"] = int(num_pos)
                    metrics["num_neg"] = int(num_neg)
                    metrics["num_anchors"] = int(num_anchors)
                    metrics["lr"] = float(
                        mixed_optimizer.param_groups[0]['lr'])
                    metrics["image_idx"] = example_tuple[11][0]
                    flatted_metrics = flat_nested_json_dict(metrics)
                    flatted_summarys = flat_nested_json_dict(metrics, "/")
                    for k, v in flatted_summarys.items():
                        if isinstance(v, (list, tuple)):
                            v = {str(i): e for i, e in enumerate(v)}
                            writer.add_scalars(k, v, global_step)
                        else:
                            writer.add_scalar(k, v, global_step)
                    metrics_str_list = []
                    for k, v in flatted_metrics.items():
                        if isinstance(v, float):
                            metrics_str_list.append(f"{k}={v:.3}")
                        elif isinstance(v, (list, tuple)):
                            if v and isinstance(v[0], float):
                                v_str = ', '.join([f"{e:.3}" for e in v])
                                metrics_str_list.append(f"{k}=[{v_str}]")
                            else:
                                metrics_str_list.append(f"{k}={v}")
                        else:
                            metrics_str_list.append(f"{k}={v}")
                    log_str = ', '.join(metrics_str_list)
                    print(log_str, file=logf)
                    print(log_str)
                ckpt_elasped_time = time.time() - ckpt_start_time
                if ckpt_elasped_time > train_cfg.save_checkpoints_secs:
                    torchplus.train.save_models(model_dir, [net, optimizer],
                                                net.get_global_step())
                    ckpt_start_time = time.time()
            total_step_elapsed += steps
            torchplus.train.save_models(model_dir, [net, optimizer],
                                        net.get_global_step())
            # Ensure that all evaluation points are saved forever
            torchplus.train.save_models(eval_checkpoint_dir, [net, optimizer],
                                        net.get_global_step(), max_to_keep=100)
    except Exception:
        # checkpoint before surfacing the failure; bare raise keeps the
        # original traceback
        torchplus.train.save_models(model_dir, [net, optimizer],
                                    net.get_global_step())
        logf.close()
        raise
    # save model before exit
    torchplus.train.save_models(model_dir, [net, optimizer],
                                net.get_global_step())
    logf.close()
def train(config_path,
          model_dir,
          result_path=None,
          create_folder=False,
          display_step=50,
          summary_step=5,
          pickle_result=True):
    """train a VoxelNet model specified by a config file.

    NOTE(review): this redefines ``train`` — an earlier ``train`` exists above
    in this file; at import time this later definition shadows it.

    Args:
        config_path: path to a pipeline_pb2.TrainEvalPipelineConfig text proto.
        model_dir: directory for checkpoints, logs and tensorboard summaries.
        result_path: directory for eval results; defaults to model_dir/'results'.
        create_folder: if True and model_dir exists, create a fresh numbered folder.
        display_step: log/summary metrics every this many steps.
        summary_step: unused in this body — TODO confirm whether dead parameter.
        pickle_result: if True keep detections in memory and pickle them;
            otherwise write KITTI label files per example.
    """
    if create_folder:
        if pathlib.Path(model_dir).exists():
            model_dir = torchplus.train.create_folder(model_dir)

    model_dir = pathlib.Path(model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)
    # separate dir so per-eval checkpoints survive the rolling cleanup of model_dir
    eval_checkpoint_dir = model_dir / 'eval_checkpoints'
    eval_checkpoint_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    config_file_bkp = "pipeline.config"
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)
    # keep a copy of the exact config used for this run next to the checkpoints
    shutil.copyfile(config_path, str(model_dir / config_file_bkp))
    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    class_names = list(input_cfg.class_names)
    ######################
    # BUILD VOXEL GENERATOR
    ######################
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    ######################
    # BUILD TARGET ASSIGNER
    ######################
    # bird's-eye-view range: [xmin, ymin, xmax, ymax] of the point cloud range
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    ######################
    # BUILD NET
    ######################
    center_limit_range = model_cfg.post_center_limit_range
    net = second_builder.build(model_cfg, voxel_generator, target_assigner)
    net.cuda()
    # net_train = torch.nn.DataParallel(net).cuda()
    print("num_trainable parameters:", len(list(net.parameters())))
    # for n, p in net.named_parameters():
    #     print(n, p.shape)
    ######################
    # BUILD OPTIMIZER
    ######################
    # we need global_step to create lr_scheduler, so restore net first.
    torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    gstep = net.get_global_step() - 1
    optimizer_cfg = train_cfg.optimizer
    if train_cfg.enable_mixed_precision:
        # manual fp16: halve the net, keep metrics and norm layers in fp32
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    optimizer = optimizer_builder.build(optimizer_cfg, net.parameters())
    if train_cfg.enable_mixed_precision:
        loss_scale = train_cfg.loss_scale_factor
        mixed_optimizer = torchplus.train.MixedPrecisionWrapper(
            optimizer, loss_scale)
    else:
        mixed_optimizer = optimizer
    # must restore optimizer AFTER using MixedPrecisionWrapper
    torchplus.train.try_restore_latest_checkpoints(model_dir,
                                                   [mixed_optimizer])
    lr_scheduler = lr_scheduler_builder.build(optimizer_cfg, optimizer, gstep)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    ######################
    # PREPARE INPUT
    ######################

    dataset = input_reader_builder.build(input_cfg,
                                         model_cfg,
                                         training=True,
                                         voxel_generator=voxel_generator,
                                         target_assigner=target_assigner)
    eval_dataset = input_reader_builder.build(eval_input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner)

    def _worker_init_fn(worker_id):
        # seed each dataloader worker differently so random augmentation
        # does not repeat across workers
        time_seed = np.array(time.time(), dtype=np.int32)
        np.random.seed(time_seed + worker_id)
        print(f"WORKER {worker_id} seed:", np.random.get_state()[1][0])

    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=input_cfg.batch_size,
                                             shuffle=True,
                                             num_workers=input_cfg.num_workers,
                                             pin_memory=False,
                                             collate_fn=merge_second_batch,
                                             worker_init_fn=_worker_init_fn)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_input_cfg.batch_size,
        shuffle=False,
        num_workers=eval_input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)
    data_iter = iter(dataloader)

    ######################
    # TRAINING
    ######################
    log_path = model_dir / 'log.txt'
    logf = open(log_path, 'a')
    logf.write(proto_str)
    logf.write("\n")
    summary_dir = model_dir / 'summary'
    summary_dir.mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(str(summary_dir))

    total_step_elapsed = 0
    # NOTE(review): remain_steps is only referenced by the commented-out
    # total_loop line below — effectively dead.
    remain_steps = train_cfg.steps - net.get_global_step()
    t = time.time()
    ckpt_start_time = t

    # outer loop runs once per evaluation interval
    total_loop = train_cfg.steps // train_cfg.steps_per_eval + 1
    # total_loop = remain_steps // train_cfg.steps_per_eval + 1
    clear_metrics_every_epoch = train_cfg.clear_metrics_every_epoch

    if train_cfg.steps % train_cfg.steps_per_eval == 0:
        total_loop -= 1
    mixed_optimizer.zero_grad()
    try:
        for _ in range(total_loop):
            # last loop iteration may be a partial eval interval
            if total_step_elapsed + train_cfg.steps_per_eval > train_cfg.steps:
                steps = train_cfg.steps % train_cfg.steps_per_eval
            else:
                steps = train_cfg.steps_per_eval
            for step in tqdm(range(steps)):
                # NOTE(review): scheduler.step() before optimizer.step()
                # follows the pre-1.1 PyTorch convention — confirm against
                # the torch version this repo pins.
                lr_scheduler.step()
                try:
                    example = next(data_iter)
                except StopIteration:
                    # dataset exhausted: restart the epoch
                    print("end epoch")
                    if clear_metrics_every_epoch:
                        net.clear_metrics()
                    data_iter = iter(dataloader)
                    example = next(data_iter)
                example_torch = example_convert_to_torch(example, float_dtype)

                batch_size = example["anchors"].shape[0]

                ret_dict = net(example_torch)

                # box_preds = ret_dict["box_preds"]
                cls_preds = ret_dict["cls_preds"]
                loss = ret_dict["loss"].mean()
                cls_loss_reduced = ret_dict["cls_loss_reduced"].mean()
                loc_loss_reduced = ret_dict["loc_loss_reduced"].mean()
                cls_pos_loss = ret_dict["cls_pos_loss"]
                cls_neg_loss = ret_dict["cls_neg_loss"]
                loc_loss = ret_dict["loc_loss"]
                cls_loss = ret_dict["cls_loss"]
                dir_loss_reduced = ret_dict["dir_loss_reduced"]
                cared = ret_dict["cared"]
                labels = example_torch["labels"]
                if train_cfg.enable_mixed_precision:
                    # manual loss scaling; MixedPrecisionWrapper presumably
                    # unscales gradients before stepping — verify in torchplus
                    loss *= loss_scale
                loss.backward()
                torch.nn.utils.clip_grad_norm_(net.parameters(), 10.0)
                mixed_optimizer.step()
                mixed_optimizer.zero_grad()
                net.update_global_step()
                net_metrics = net.update_metrics(cls_loss_reduced,
                                                 loc_loss_reduced, cls_preds,
                                                 labels, cared)

                step_time = (time.time() - t)
                t = time.time()
                metrics = {}
                # pos/neg anchor counts of the FIRST sample in the batch only
                num_pos = int((labels > 0)[0].float().sum().cpu().numpy())
                num_neg = int((labels == 0)[0].float().sum().cpu().numpy())
                if 'anchors_mask' not in example_torch:
                    num_anchors = example_torch['anchors'].shape[1]
                else:
                    num_anchors = int(example_torch['anchors_mask'][0].sum())
                global_step = net.get_global_step()
                if global_step % display_step == 0:
                    # per-regression-target localization loss, averaged over batch
                    loc_loss_elem = [
                        float(loc_loss[:, :, i].sum().detach().cpu().numpy() /
                              batch_size) for i in range(loc_loss.shape[-1])
                    ]
                    metrics["step"] = global_step
                    metrics["steptime"] = step_time
                    metrics.update(net_metrics)
                    metrics["loss"] = {}
                    metrics["loss"]["loc_elem"] = loc_loss_elem
                    metrics["loss"]["cls_pos_rt"] = float(
                        cls_pos_loss.detach().cpu().numpy())
                    metrics["loss"]["cls_neg_rt"] = float(
                        cls_neg_loss.detach().cpu().numpy())
                    # if unlabeled_training:
                    #     metrics["loss"]["diff_rt"] = float(
                    #         diff_loc_loss_reduced.detach().cpu().numpy())
                    if model_cfg.use_direction_classifier:
                        metrics["loss"]["dir_rt"] = float(
                            dir_loss_reduced.detach().cpu().numpy())
                    metrics["num_vox"] = int(example_torch["voxels"].shape[0])
                    metrics["num_pos"] = int(num_pos)
                    metrics["num_neg"] = int(num_neg)
                    metrics["num_anchors"] = int(num_anchors)
                    metrics["lr"] = float(
                        mixed_optimizer.param_groups[0]['lr'])
                    metrics["image_idx"] = example['image_idx'][0]
                    flatted_metrics = _flat_nested_json_dict_to_py_dict(
                        metrics)
                    flatted_summarys = _flat_nested_json_dict_to_py_dict(
                        metrics, "/")
                    # tensorboard: lists become tagged scalar groups
                    for k, v in flatted_summarys.items():
                        if isinstance(v, (list, tuple)):
                            v = {str(i): e for i, e in enumerate(v)}
                            writer.add_scalars(k, v, global_step)
                        else:
                            writer.add_scalar(k, v, global_step)
                    # console/logfile: one comma-separated line per display step
                    metrics_str_list = []
                    for k, v in flatted_metrics.items():
                        if isinstance(v, float):
                            metrics_str_list.append(f"{k}={v:.3}")
                        elif isinstance(v, (list, tuple)):
                            if v and isinstance(v[0], float):
                                v_str = ', '.join([f"{e:.3}" for e in v])
                                metrics_str_list.append(f"{k}=[{v_str}]")
                            else:
                                metrics_str_list.append(f"{k}={v}")
                        else:
                            metrics_str_list.append(f"{k}={v}")
                    log_str = ', '.join(metrics_str_list)
                    print(log_str, file=logf)
                    print(log_str)
                    print()
                # time-based checkpointing (not step-based)
                ckpt_elasped_time = time.time() - ckpt_start_time
                if ckpt_elasped_time > train_cfg.save_checkpoints_secs:
                    torchplus.train.save_models(model_dir, [net, optimizer],
                                                net.get_global_step())
                    ckpt_start_time = time.time()
            total_step_elapsed += steps
            torchplus.train.save_models(model_dir, [net, optimizer],
                                        net.get_global_step())

            # Ensure that all evaluation points are saved forever
            torchplus.train.save_models(eval_checkpoint_dir, [net, optimizer],
                                        net.get_global_step(),
                                        max_to_keep=100)

            net.eval()
            result_path_step = result_path / f"step_{net.get_global_step()}"
            result_path_step.mkdir(parents=True, exist_ok=True)
            print("#################################")
            print("#################################", file=logf)
            print("# EVAL")
            print("# EVAL", file=logf)
            print("#################################")
            print("#################################", file=logf)
            print("Generate output labels...")
            print("Generate output labels...", file=logf)
            t = time.time()
            dt_annos = []
            prog_bar = ProgressBar()
            prog_bar.start(len(eval_dataset) // eval_input_cfg.batch_size + 1)
            for example in iter(eval_dataloader):
                example = example_convert_to_torch(example, float_dtype)
                if pickle_result:
                    dt_annos += _predict_kitti_to_anno(
                        net, example, class_names, center_limit_range,
                        model_cfg.lidar_input)
                else:
                    _predict_kitti_to_file(net, example, result_path_step,
                                           class_names, center_limit_range,
                                           model_cfg.lidar_input)
                prog_bar.print_bar()

            sec_per_ex = len(eval_dataset) / (time.time() - t)
            print(f"avg forward time per example: {net.avg_forward_time:.3f}")
            print(
                f"avg postprocess time per example: {net.avg_postprocess_time:.3f}"
            )
            net.clear_time_metrics()
            print(f'generate label finished({sec_per_ex:.2f}/s). start eval:')
            print(f'generate label finished({sec_per_ex:.2f}/s). start eval:',
                  file=logf)
            gt_annos = [
                info["annos"] for info in eval_dataset.dataset.kitti_infos
            ]
            if not pickle_result:
                dt_annos = kitti.get_label_annos(result_path_step)
            # NOTE(review): mAPbbox is unpacked but never used below.
            result, mAPbbox, mAPbev, mAP3d, mAPaos = get_official_eval_result(
                gt_annos, dt_annos, class_names, return_data=True)
            print(result, file=logf)
            print(result)
            writer.add_text('eval_result', result, global_step)
            # index [i, 1, 0]: presumably (class, moderate difficulty, first
            # overlap threshold) — confirm against the eval code's axis order
            for i, class_name in enumerate(class_names):
                writer.add_scalar('bev_ap:{}'.format(class_name),
                                  mAPbev[i, 1, 0], global_step)
                writer.add_scalar('3d_ap:{}'.format(class_name),
                                  mAP3d[i, 1, 0], global_step)
                writer.add_scalar('aos_ap:{}'.format(class_name),
                                  mAPaos[i, 1, 0], global_step)
            writer.add_scalar('bev_map', np.mean(mAPbev[:, 1, 0]), global_step)
            writer.add_scalar('3d_map', np.mean(mAP3d[:, 1, 0]), global_step)
            writer.add_scalar('aos_map', np.mean(mAPaos[:, 1, 0]), global_step)
            result = get_coco_eval_result(gt_annos, dt_annos, class_names)
            print(result, file=logf)
            print(result)
            if pickle_result:
                with open(result_path_step / "result.pkl", 'wb') as f:
                    pickle.dump(dt_annos, f)
            writer.add_text('eval_result', result, global_step)
            net.train()
    except Exception as e:
        # best-effort save on any failure, then re-raise
        torchplus.train.save_models(model_dir, [net, optimizer],
                                    net.get_global_step())
        logf.close()
        raise e
    # save model before exit
    torchplus.train.save_models(model_dir, [net, optimizer],
                                net.get_global_step())
    logf.close()
def evaluate(config_path,
             model_dir,
             result_path=None,
             predict_test=False,
             ckpt_path=None,
             ref_detfile=None,
             pickle_result=True):
    """Evaluate a trained model on the eval split of a KITTI-style dataset.

    NOTE(review): this file contains several ``evaluate`` definitions; the
    last one defined wins at import time.

    Args:
        config_path: path to a TrainEvalPipelineConfig text proto, or an
            already-parsed config object.
        model_dir: directory holding checkpoints (used when ckpt_path is None).
        result_path: output directory; defaults to model_dir/<result_name>.
        predict_test: if True, only generate predictions (no metric computation).
        ckpt_path: explicit checkpoint to restore instead of the latest one.
        ref_detfile: unused in this body — TODO confirm whether dead parameter.
        pickle_result: if True collect annos in memory; else write label files.
    """
    model_dir = str(Path(model_dir).resolve())
    if predict_test:
        result_name = 'predict_test'
    else:
        result_name = 'eval_results'
    if result_path is None:
        model_dir = Path(model_dir)
        result_path = model_dir / result_name
    else:
        result_path = pathlib.Path(result_path)

    if isinstance(config_path, str):
        config = pipeline_pb2.TrainEvalPipelineConfig()
        with open(config_path, "r") as f:
            proto_str = f.read()
            text_format.Merge(proto_str, config)
    else:
        # config object passed directly
        config = config_path

    input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    class_names = list(input_cfg.class_names)
    center_limit_range = model_cfg.post_center_limit_range
    #########################
    # Build Voxel Generator
    #########################
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    # bird's-eye-view range: [xmin, ymin, xmax, ymax]
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)

    net = second_builder.build(model_cfg, voxel_generator, target_assigner,
                               input_cfg.batch_size)
    net.cuda()
    if train_cfg.enable_mixed_precision:
        # inference in fp16, metrics/norm layers kept fp32
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)

    if ckpt_path is None:
        torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    else:
        torchplus.train.restore(ckpt_path, net)

    eval_dataset = input_reader_builder.build(input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=input_cfg.batch_size,
        shuffle=False,
        num_workers=input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)

    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32

    net.eval()
    result_path_step = result_path / f"step_{net.get_global_step()}"
    result_path_step.mkdir(parents=True, exist_ok=True)
    t = time.time()
    dt_annos = []
    global_set = None
    print("Generate output labels...")
    bar = ProgressBar()
    bar.start(len(eval_dataset) // input_cfg.batch_size + 1)

    for example in iter(eval_dataloader):
        # eval example [0: 'voxels', 1: 'num_points', 2: 'coordinates', 3: 'rect'
        # 4: 'Trv2c', 5: 'P2', 6: 'anchors', 7: 'anchors_mask'
        # 8: 'image_idx', 9: 'image_shape']
        example = example_convert_to_torch(example, float_dtype)
        example_tuple = list(example.values())
        # image_idx / image_shape stay numpy in example_convert_to_torch;
        # promote them to tensors here
        example_tuple[8] = torch.from_numpy(example_tuple[8])
        example_tuple[9] = torch.from_numpy(example_tuple[9])

        # NOTE(review): partial final batches are skipped entirely, so the
        # trailing examples are never evaluated — confirm this is intended
        # (likely required by a fixed-batch-size model build).
        if (example_tuple[6].size()[0] != input_cfg.batch_size):
            continue

        if pickle_result:
            dt_annos += predict_kitti_to_anno(net, example_tuple, class_names,
                                              center_limit_range,
                                              model_cfg.lidar_input,
                                              global_set)
        else:
            _predict_kitti_to_file(net, example, result_path_step,
                                   class_names, center_limit_range,
                                   model_cfg.lidar_input)
        bar.print_bar()

    sec_per_example = len(eval_dataset) / (time.time() - t)
    print(f'generate label finished({sec_per_example:.2f}/s). start eval:')

    print(f"avg forward time per example: {net.avg_forward_time:.3f}")
    print(f"avg postprocess time per example: {net.avg_postprocess_time:.3f}")
    if not predict_test:
        gt_annos = [info["annos"] for info in eval_dataset.dataset.kitti_infos]
        # NOTE(review): drops the last GT anno when the count is odd —
        # presumably to mirror the skipped partial batch above; verify this
        # keeps dt_annos and gt_annos aligned one-to-one.
        if (len(gt_annos) % 2 != 0):
            del gt_annos[-1]
        if not pickle_result:
            dt_annos = kitti.get_label_annos(result_path_step)
        result = get_official_eval_result(gt_annos, dt_annos, class_names)
        print(result)
        result = get_coco_eval_result(gt_annos, dt_annos, class_names)
        print(result)
        if pickle_result:
            with open(result_path_step / "result.pkl", 'wb') as f:
                pickle.dump(dt_annos, f)
def evaluate(config_path,
             model_dir=None,
             result_path=None,
             ckpt_path=None,
             measure_time=False,
             batch_size=None,
             **kwargs):
    """Don't support pickle_result anymore. if you want to generate kitti label file,
    please use kitti_anno_to_label_file and convert_detection_to_kitti_annos
    in second.data.kitti_dataset.

    Runs the network over the eval split, pickles the raw detection list to
    result_path/step_<N>/result.pkl and prints the dataset's own evaluation
    metrics. Optionally reports per-stage timing when measure_time is True.

    NOTE(review): this redefines ``evaluate`` from earlier in the file.
    """
    # reject unknown keyword arguments (e.g. the removed pickle_result)
    assert len(kwargs) == 0
    model_dir = str(Path(model_dir).resolve())
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    result_name = 'eval_results'
    if result_path is None:
        model_dir = Path(model_dir)
        result_path = model_dir / result_name
    else:
        result_path = Path(result_path)
    if isinstance(config_path, str):
        # directly provide a config object. this usually used
        # when you want to eval with several different parameters in
        # one script.
        config = pipeline_pb2.TrainEvalPipelineConfig()
        with open(config_path, "r") as f:
            proto_str = f.read()
            text_format.Merge(proto_str, config)
    else:
        config = config_path

    input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config

    net = build_network(model_cfg, measure_time=measure_time).to(device)
    if train_cfg.enable_mixed_precision:
        net.half()
        print("half inference!")
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    target_assigner = net.target_assigner
    voxel_generator = net.voxel_generator

    if ckpt_path is None:
        assert model_dir is not None
        torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    else:
        torchplus.train.restore(ckpt_path, net)
    # caller may override the configured batch size
    batch_size = batch_size or input_cfg.batch_size
    eval_dataset = input_reader_builder.build(
        input_cfg,
        model_cfg,
        training=False,
        voxel_generator=voxel_generator,
        target_assigner=target_assigner)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=input_cfg.preprocess.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)

    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32

    net.eval()
    result_path_step = result_path / f"step_{net.get_global_step()}"
    result_path_step.mkdir(parents=True, exist_ok=True)
    t = time.time()
    detections = []
    print("Generate output labels...")
    bar = ProgressBar()
    # ceil(len / batch_size) batches
    bar.start((len(eval_dataset) + batch_size - 1) // batch_size)
    prep_example_times = []
    prep_times = []
    t2 = time.time()
    for example in iter(eval_dataloader):
        if measure_time:
            # dataloader/collate time since the previous iteration
            prep_times.append(time.time() - t2)
            # synchronize so CUDA work doesn't blur the timing boundaries
            torch.cuda.synchronize()
            t1 = time.time()
        example = example_convert_to_torch(example, float_dtype)
        if measure_time:
            torch.cuda.synchronize()
            prep_example_times.append(time.time() - t1)

        with torch.no_grad():
            detections += net(example)
        bar.print_bar()
        if measure_time:
            t2 = time.time()

    sec_per_example = len(eval_dataset) / (time.time() - t)
    print(f'generate label finished({sec_per_example:.2f}/s). start eval:')
    if measure_time:
        print(
            f"avg example to torch time: {np.mean(prep_example_times) * 1000:.3f} ms"
        )
        print(f"avg prep time: {np.mean(prep_times) * 1000:.3f} ms")
    for name, val in net.get_avg_time_dict().items():
        print(f"avg {name} time = {val * 1000:.3f} ms")
    # raw detections are always pickled; label-file export moved elsewhere
    with open(result_path_step / "result.pkl", 'wb') as f:
        pickle.dump(detections, f)
    result_dict = eval_dataset.dataset.evaluation(detections,
                                                  str(result_path_step))
    if result_dict is not None:
        for k, v in result_dict["results"].items():
            print("Evaluation {}".format(k))
            print(v)
def evaluate(net,
             net_loss,
             best_mAP,
             voxel_generator,
             target_assigner,
             config,
             model_logging,
             model_dir,
             result_path=None):
    """Run one evaluation pass of a PointPillars-style net and track best mAP.

    Builds an eval dataloader from `config`, computes model inputs per batch,
    runs `net` + `net_loss` to produce detections, evaluates them with the
    dataset's own metric, and saves the weights to
    model_dir/best_pointpillars.pth whenever the mAP improves.

    NOTE(review): this redefines ``evaluate`` from earlier in this file.

    Args:
        net: the backbone/detection network (its batch_size attribute is set
            per batch below).
        net_loss: loss/postprocessing module; also the global-step holder.
        best_mAP: best mAP observed so far; compared against this run.
        voxel_generator: provides voxel_size / point_cloud_range for input prep.
        target_assigner: passed through to the input reader builder.
        config: full TrainEvalPipelineConfig.
        model_logging: logger with log_text / log_metrics(global_step) methods.
        model_dir: directory to write best_pointpillars.pth into.
        result_path: base results dir; step subdir name is derived from it.

    Returns:
        The (possibly updated) best mAP.
    """
    torch.cuda.empty_cache()
    global_step = net_loss.get_global_step()
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    # BUGFIX: `float_dtype` was previously read as a free name that is never
    # defined in this function (NameError at runtime unless an unrelated
    # global exists). Derive it from the config exactly like the sibling
    # train/evaluate functions in this file do.
    if config.train_config.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    eval_dataset = input_reader_builder.build(eval_input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_input_cfg.batch_size,  # only support multi-gpu train
        shuffle=False,
        num_workers=eval_input_cfg.preprocess.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)

    result_path_step = result_path / f"step_{global_step}"
    # result_path_step.mkdir(parents=True, exist_ok=True)
    # NOTE(review): the mkdir above is commented out — dataset.evaluation may
    # try to write into a non-existent directory; confirm downstream behavior.
    model_logging.log_text("#################################", global_step)
    model_logging.log_text("# EVAL", global_step)
    model_logging.log_text("#################################", global_step)
    model_logging.log_text("Generate output labels...", global_step)
    t = time.time()
    detections = []
    prog_bar = ProgressBar()
    # ceil(len / batch_size) batches
    prog_bar.start((len(eval_dataset) + eval_input_cfg.batch_size - 1) //
                   eval_input_cfg.batch_size)
    for example in iter(eval_dataloader):
        example = example_convert_to_torch(example, float_dtype)
        batch_size = example["anchors"].shape[0]
        coors = example["coordinates"]
        # build per-voxel input features from raw voxels + coordinates
        input_features = compute_model_input(voxel_generator.voxel_size,
                                             voxel_generator.point_cloud_range,
                                             with_distance=False,
                                             voxels=example['voxels'],
                                             num_voxels=example['num_points'],
                                             coors=coors)
        # input_features = reshape_input(batch_size, input_features, coors, voxel_generator.grid_size)
        input_features = reshape_input1(input_features)
        net.batch_size = batch_size
        preds_list = net(input_features, coors)
        # net_loss performs postprocessing and yields per-example detections
        detections += net_loss(example, preds_list)
        prog_bar.print_bar()

    sec_per_ex = len(eval_dataset) / (time.time() - t)
    model_logging.log_text(
        f'generate label finished({sec_per_ex:.2f}/s). start eval:',
        global_step)
    result_dict = eval_dataset.dataset.evaluation(detections,
                                                  str(result_path_step))
    # keep a standalone copy of the best-performing weights
    if result_dict['mAp'] > best_mAP:
        best_mAP = result_dict['mAp']
        ckpt_path = Path(model_dir) / "best_pointpillars.pth"
        torch.save(net.state_dict(), ckpt_path)
    for k, v in result_dict["results"].items():
        model_logging.log_text("Evaluation {}".format(k), global_step)
        model_logging.log_text(v, global_step)
    model_logging.log_text("mAP {}".format(result_dict['mAp']), global_step)
    model_logging.log_text("best_mAP {}".format(best_mAP), global_step)
    model_logging.log_metrics(result_dict["detail"], global_step)
    # with open(result_path_step / "result.pkl", 'wb') as f:
    #     pickle.dump(detections, f)
    return best_mAP