def log_function(model_dir, config_path):
    model_logging = SimpleModelLog(model_dir)
    model_logging.open()
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
    text_format.Merge(proto_str, config)
    model_logging.log_text(proto_str + "\n", 0, tag="config")
    return model_logging
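# Usage sketch for log_function (paths are hypothetical). Parsing the config
# here only validates it; the raw text is what gets logged. The caller owns
# the returned logger and is responsible for closing it:
#
#   model_logging = log_function("./models/car_pp", "configs/car.pp.config")
#   model_logging.log_text("training started", 0)
#   model_logging.close()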
def train(config_path,
          model_dir,
          result_path=None,
          create_folder=False,
          display_step=50,
          summary_step=5,
          pretrained_path=None,
          pretrained_include=None,
          pretrained_exclude=None,
          freeze_include=None,
          freeze_exclude=None,
          multi_gpu=False,
          measure_time=False,
          resume=False):
    """train a VoxelNet model specified by a config file."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # create dir for saving training states
    model_dir = str(Path(model_dir).resolve())
    if create_folder:
        if Path(model_dir).exists():
            model_dir = torchplus.train.create_folder(model_dir)
    model_dir = Path(model_dir)
    if not resume and model_dir.exists():
        raise ValueError("model dir exists and you don't specify resume.")
    model_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    # load config file
    config_file_bkp = "pipeline.config"
    if isinstance(config_path, str):
        config = pipeline_pb2.TrainEvalPipelineConfig()
        with open(config_path, "r") as f:
            proto_str = f.read()
        text_format.Merge(proto_str, config)
    else:
        # directly provide a config object. this is usually used when you
        # want to train with several different parameters in one script.
        config = config_path
        proto_str = text_format.MessageToString(config, indent=2)
    with (model_dir / config_file_bkp).open("w") as f:
        f.write(proto_str)

    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config

    net = build_network(model_cfg, measure_time).to(device)
    # if train_cfg.enable_mixed_precision:
    #     net.half()
    #     net.metrics_to_float()
    #     net.convert_norm_to_float(net)
    target_assigner = net.target_assigner
    voxel_generator = net.voxel_generator
    print("num parameters:", len(list(net.parameters())))
    torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    if pretrained_path is not None:
        # load pretrained params, keeping only tensors whose name and shape
        # match the current model.
        model_dict = net.state_dict()
        pretrained_dict = torch.load(pretrained_path)
        pretrained_dict = filter_param_dict(pretrained_dict,
                                            pretrained_include,
                                            pretrained_exclude)
        new_pretrained_dict = {}
        for k, v in pretrained_dict.items():
            if k in model_dict and v.shape == model_dict[k].shape:
                new_pretrained_dict[k] = v
        print("Load pretrained parameters:")
        for k, v in new_pretrained_dict.items():
            print(k, v.shape)
        model_dict.update(new_pretrained_dict)
        net.load_state_dict(model_dict)
        freeze_params_v2(dict(net.named_parameters()), freeze_include,
                         freeze_exclude)
        net.clear_global_step()
        net.clear_metrics()
    if multi_gpu:
        net_parallel = torch.nn.DataParallel(net)
    else:
        net_parallel = net

    optimizer_cfg = train_cfg.optimizer
    loss_scale = train_cfg.loss_scale_factor
    fastai_optimizer = optimizer_builder.build(optimizer_cfg,
                                               net,
                                               mixed=False,
                                               loss_scale=loss_scale)
    if loss_scale < 0:
        loss_scale = "dynamic"
    if train_cfg.enable_mixed_precision:
        max_num_voxels = input_cfg.preprocess.max_number_of_voxels * input_cfg.batch_size
        assert max_num_voxels < 65535, "spconv fp16 training only support this"
        from apex import amp
        net, amp_optimizer = amp.initialize(net,
                                            fastai_optimizer,
                                            opt_level="O2",
                                            keep_batchnorm_fp32=True,
                                            loss_scale=loss_scale)
        net.metrics_to_float()
    else:
        amp_optimizer = fastai_optimizer
    torchplus.train.try_restore_latest_checkpoints(model_dir,
                                                   [fastai_optimizer])
    lr_scheduler = lr_scheduler_builder.build(optimizer_cfg, amp_optimizer,
                                              train_cfg.steps)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32

    if multi_gpu:
        num_gpu = torch.cuda.device_count()
        print(f"MULTI-GPU: use {num_gpu} gpu")
        collate_fn = merge_second_batch_multigpu
    else:
        collate_fn = merge_second_batch
        num_gpu = 1

    ######################
    # PREPARE INPUT
    ######################
    dataset = input_reader_builder.build(input_cfg,
                                         model_cfg,
                                         training=True,
                                         voxel_generator=voxel_generator,
                                         target_assigner=target_assigner,
                                         multi_gpu=multi_gpu)
    eval_dataset = input_reader_builder.build(eval_input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=input_cfg.batch_size * num_gpu,
        shuffle=True,
        num_workers=input_cfg.preprocess.num_workers * num_gpu,
        pin_memory=False,
        collate_fn=collate_fn,
        worker_init_fn=_worker_init_fn,
        drop_last=not multi_gpu)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_input_cfg.batch_size,  # only support multi-gpu train
        shuffle=False,
        num_workers=eval_input_cfg.preprocess.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)

    ######################
    # TRAINING
    ######################
    model_logging = SimpleModelLog(model_dir)
    model_logging.open()
    model_logging.log_text(proto_str + "\n", 0, tag="config")
    start_step = net.get_global_step()
    total_step = train_cfg.steps
    t = time.time()
    steps_per_eval = train_cfg.steps_per_eval
    clear_metrics_every_epoch = train_cfg.clear_metrics_every_epoch

    amp_optimizer.zero_grad()
    step_times = []
    step = start_step
    try:
        while True:
            if clear_metrics_every_epoch:
                net.clear_metrics()
            for example in dataloader:
                lr_scheduler.step(net.get_global_step())
                time_metrics = example["metrics"]
                example.pop("metrics")
                example_torch = example_convert_to_torch(example, float_dtype)
                batch_size = example["anchors"].shape[0]
                ret_dict = net_parallel(example_torch)
                cls_preds = ret_dict["cls_preds"]
                loss = ret_dict["loss"].mean()
                cls_loss_reduced = ret_dict["cls_loss_reduced"].mean()
                loc_loss_reduced = ret_dict["loc_loss_reduced"].mean()
                cls_pos_loss = ret_dict["cls_pos_loss"].mean()
                cls_neg_loss = ret_dict["cls_neg_loss"].mean()
                loc_loss = ret_dict["loc_loss"]
                cls_loss = ret_dict["cls_loss"]
                cared = ret_dict["cared"]
                labels = example_torch["labels"]
                if train_cfg.enable_mixed_precision:
                    with amp.scale_loss(loss, amp_optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                torch.nn.utils.clip_grad_norm_(net.parameters(), 10.0)
                amp_optimizer.step()
                amp_optimizer.zero_grad()
                net.update_global_step()
                net_metrics = net.update_metrics(cls_loss_reduced,
                                                 loc_loss_reduced, cls_preds,
                                                 labels, cared)
                step_time = (time.time() - t)
                step_times.append(step_time)
                t = time.time()
                metrics = {}
                num_pos = int((labels > 0)[0].float().sum().cpu().numpy())
                num_neg = int((labels == 0)[0].float().sum().cpu().numpy())
                if 'anchors_mask' not in example_torch:
                    num_anchors = example_torch['anchors'].shape[1]
                else:
                    num_anchors = int(example_torch['anchors_mask'][0].sum())
                global_step = net.get_global_step()
                if global_step % display_step == 0:
                    if measure_time:
                        for name, val in net.get_avg_time_dict().items():
                            print(f"avg {name} time = {val * 1000:.3f} ms")
                    loc_loss_elem = [
                        float(loc_loss[:, :, i].sum().detach().cpu().numpy() /
                              batch_size) for i in range(loc_loss.shape[-1])
                    ]
                    metrics["runtime"] = {
                        "step": global_step,
                        "steptime": np.mean(step_times),
                    }
                    metrics["runtime"].update(time_metrics[0])
                    step_times = []
                    metrics.update(net_metrics)
                    metrics["loss"]["loc_elem"] = loc_loss_elem
                    metrics["loss"]["cls_pos_rt"] = float(
                        cls_pos_loss.detach().cpu().numpy())
                    metrics["loss"]["cls_neg_rt"] = float(
                        cls_neg_loss.detach().cpu().numpy())
                    if model_cfg.use_direction_classifier:
                        dir_loss_reduced = ret_dict["dir_loss_reduced"].mean()
                        metrics["loss"]["dir_rt"] = float(
                            dir_loss_reduced.detach().cpu().numpy())
                    metrics["misc"] = {
                        # "num_vox": int(example_torch["voxels"].shape[0]),
                        "num_pos": int(num_pos),
                        "num_neg": int(num_neg),
                        "num_anchors": int(num_anchors),
                        "lr": float(amp_optimizer.lr),
                        "mem_usage": psutil.virtual_memory().percent,
                    }
                    model_logging.log_metrics(metrics, global_step)
                if global_step % steps_per_eval == 0:
                    torchplus.train.save_models(model_dir,
                                                [net, amp_optimizer],
                                                net.get_global_step())
                    net.eval()
                    result_path_step = result_path / f"step_{net.get_global_step()}"
                    result_path_step.mkdir(parents=True, exist_ok=True)
                    model_logging.log_text("#################################",
                                           global_step)
                    model_logging.log_text("# EVAL", global_step)
                    model_logging.log_text("#################################",
                                           global_step)
                    model_logging.log_text("Generate output labels...",
                                           global_step)
                    t = time.time()
                    detections = []
                    prog_bar = ProgressBar()
                    net.clear_timer()
                    prog_bar.start(
                        (len(eval_dataset) + eval_input_cfg.batch_size - 1) //
                        eval_input_cfg.batch_size)
                    for example in iter(eval_dataloader):
                        example = example_convert_to_torch(example, float_dtype)
                        detections += net(example)
                        prog_bar.print_bar()
                    sec_per_ex = len(eval_dataset) / (time.time() - t)
                    model_logging.log_text(
                        f'generate label finished({sec_per_ex:.2f}/s). start eval:',
                        global_step)
                    result_dict = eval_dataset.dataset.evaluation(
                        detections, str(result_path_step))
                    for k, v in result_dict["results"].items():
                        model_logging.log_text("Evaluation {}".format(k),
                                               global_step)
                        model_logging.log_text(v, global_step)
                    model_logging.log_metrics(result_dict["detail"],
                                              global_step)
                    with open(result_path_step / "result.pkl", 'wb') as f:
                        pickle.dump(detections, f)
                    net.train()
                step += 1
                if step >= total_step:
                    break
            if step >= total_step:
                break
    except Exception as e:
        print(json.dumps(example["metadata"], indent=2))
        model_logging.log_text(str(e), step)
        model_logging.log_text(json.dumps(example["metadata"], indent=2), step)
        torchplus.train.save_models(model_dir, [net, amp_optimizer], step)
        raise e
    finally:
        model_logging.close()
        torchplus.train.save_models(model_dir, [net, amp_optimizer],
                                    net.get_global_step())
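# The pretrained-loading block in train() above delegates name filtering to
# `filter_param_dict` and `freeze_params_v2`, which are defined elsewhere in
# this repo. A minimal sketch of the filtering idea, assuming include/exclude
# are comma-separated name substrings (the real helper's semantics may differ):
def _filter_param_dict_sketch(state_dict, include=None, exclude=None):
    includes = include.split(",") if include else []
    excludes = exclude.split(",") if exclude else []
    filtered = {}
    for k, v in state_dict.items():
        if includes and not any(s in k for s in includes):
            continue  # keep only params whose name matches an include pattern
        if any(s in k for s in excludes):
            continue  # drop params whose name matches an exclude pattern
        filtered[k] = v
    return filtered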
def train(config_path,
          model_dir,
          result_path=None,
          create_folder=False,
          display_step=50,
          summary_step=5,
          resume=False):
    """train a VoxelNet model specified by a config file."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if create_folder:
        if pathlib.Path(model_dir).exists():
            model_dir = torchplus.train.create_folder(model_dir)
    model_dir = pathlib.Path(model_dir)
    if not resume and model_dir.exists():
        raise ValueError("model dir exists and you don't specify resume.")
    model_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    config_file_bkp = "pipeline.config"
    if isinstance(config_path, str):
        config = pipeline_pb2.TrainEvalPipelineConfig()
        with open(config_path, "r") as f:
            proto_str = f.read()
        text_format.Merge(proto_str, config)
    else:
        # directly provide a config object. this is usually used when you
        # want to train with several different parameters in one script.
        config = config_path
        proto_str = text_format.MessageToString(config, indent=2)
    with (model_dir / config_file_bkp).open("w") as f:
        f.write(proto_str)

    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config

    net = build_network(model_cfg).to(device)
    if train_cfg.enable_mixed_precision:
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    target_assigner = net.target_assigner
    voxel_generator = net.voxel_generator
    class_names = target_assigner.classes
    # net_train = torch.nn.DataParallel(net).cuda()
    print("num trainable parameters:", len(list(net.parameters())))
    # for n, p in net.named_parameters():
    #     print(n, p.shape)

    ######################
    # BUILD OPTIMIZER
    ######################
    # we need global_step to create lr_scheduler, so restore net first.
    torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    gstep = net.get_global_step() - 1
    optimizer_cfg = train_cfg.optimizer
    loss_scale = train_cfg.loss_scale_factor
    mixed_optimizer = optimizer_builder.build(
        optimizer_cfg,
        net,
        mixed=train_cfg.enable_mixed_precision,
        loss_scale=loss_scale)
    optimizer = mixed_optimizer
    center_limit_range = model_cfg.post_center_limit_range
    """
    if train_cfg.enable_mixed_precision:
        mixed_optimizer = torchplus.train.MixedPrecisionWrapper(
            optimizer, loss_scale)
    else:
        mixed_optimizer = optimizer
    """
    # must restore optimizer AFTER using MixedPrecisionWrapper
    torchplus.train.try_restore_latest_checkpoints(model_dir,
                                                   [mixed_optimizer])
    lr_scheduler = lr_scheduler_builder.build(optimizer_cfg, optimizer,
                                              train_cfg.steps)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32

    ######################
    # PREPARE INPUT
    ######################
    dataset = input_reader_builder.build(input_cfg,
                                         model_cfg,
                                         training=True,
                                         voxel_generator=voxel_generator,
                                         target_assigner=target_assigner)
    eval_dataset = input_reader_builder.build(eval_input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=input_cfg.batch_size,
        shuffle=True,
        num_workers=input_cfg.preprocess.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch,
        worker_init_fn=_worker_init_fn)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_input_cfg.batch_size,
        shuffle=False,
        num_workers=eval_input_cfg.preprocess.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)
    data_iter = iter(dataloader)
    print(data_iter)

    ######################
    # TRAINING
    ######################
    model_logging = SimpleModelLog(model_dir)
    model_logging.open()
    model_logging.log_text(proto_str + "\n", 0, tag="config")
    total_step_elapsed = 0
    remain_steps = train_cfg.steps - net.get_global_step()
    t = time.time()
    ckpt_start_time = t
    steps_per_eval = train_cfg.steps_per_eval
    total_loop = train_cfg.steps // train_cfg.steps_per_eval + 1
    clear_metrics_every_epoch = train_cfg.clear_metrics_every_epoch
    if train_cfg.steps % train_cfg.steps_per_eval == 0:
        total_loop -= 1
    mixed_optimizer.zero_grad()
    try:
        for _ in range(total_loop):
            if total_step_elapsed + train_cfg.steps_per_eval > train_cfg.steps:
                steps = train_cfg.steps % train_cfg.steps_per_eval
            else:
                steps = train_cfg.steps_per_eval
            for step in range(steps):
                lr_scheduler.step(net.get_global_step())
                try:
                    example = next(data_iter)
                except StopIteration:
                    print("end epoch")
                    if clear_metrics_every_epoch:
                        net.clear_metrics()
                    data_iter = iter(dataloader)
                    example = next(data_iter)
                example_torch = example_convert_to_torch(example, float_dtype)
                # batch_size = example["anchors"].shape[0]
                ret_dict = net(example_torch)  # FCOS
                losses = ret_dict['total_loss']
                loss_cls = ret_dict["loss_cls"]
                loss_reg = ret_dict["loss_reg"]
                cls_preds = ret_dict['cls_preds']
                labels = ret_dict["labels"]
                cared = ret_dict["labels"]  # reuses labels as the "cared" mask
                optimizer.zero_grad()
                losses.backward()
                # torch.nn.utils.clip_grad_norm_(net.parameters(), 1)
                # optimizer.step() updates the parameters, so clip before it.
                optimizer.step()
                net.update_global_step()
                # need to unpack the [0] for fpn
                net_metrics = net.update_metrics(loss_cls, loss_reg,
                                                 cls_preds[0], labels, cared)
                step_time = (time.time() - t)
                t = time.time()
                metrics = {}
                global_step = net.get_global_step()
                # print log
                if global_step % display_step == 0:
                    metrics["runtime"] = {
                        "step": global_step,
                        "steptime": step_time,
                    }
                    metrics.update(net_metrics)
                    metrics["misc"] = {
                        "num_vox": int(example_torch["voxels"].shape[0]),
                        "lr": float(optimizer.lr),
                    }
                    model_logging.log_metrics(metrics, global_step)
            ckpt_elapsed_time = time.time() - ckpt_start_time
            torchplus.train.save_models(model_dir, [net, optimizer],
                                        net.get_global_step())
            total_step_elapsed += steps

            torchplus.train.save_models(model_dir, [net, optimizer],
                                        net.get_global_step())
            net.eval()
            result_path_step = result_path / f"step_{net.get_global_step()}"
            result_path_step.mkdir(parents=True, exist_ok=True)
            model_logging.log_text("#################################",
                                   global_step)
            model_logging.log_text("# EVAL", global_step)
            model_logging.log_text("#################################",
                                   global_step)
            model_logging.log_text("Generate output labels...", global_step)
            t = time.time()
            detections = []
            prog_bar = ProgressBar()
            net.clear_timer()
            prog_bar.start(
                (len(eval_dataset) + eval_input_cfg.batch_size - 1) //
                eval_input_cfg.batch_size)
            for example in iter(eval_dataloader):
                example = example_convert_to_torch(example, float_dtype)
                with torch.no_grad():
                    detections += net(example)
                prog_bar.print_bar()
            sec_per_ex = len(eval_dataset) / (time.time() - t)
            model_logging.log_text(
                f'generate label finished({sec_per_ex:.2f}/s). start eval:',
                global_step)
            result_dict = eval_dataset.dataset.evaluation(
                detections, str(result_path_step))
            for k, v in result_dict["results"].items():
                model_logging.log_text("Evaluation {}".format(k), global_step)
                model_logging.log_text(v, global_step)
            model_logging.log_metrics(result_dict["detail"], global_step)
            with open(result_path_step / "result.pkl", 'wb') as f:
                pickle.dump(detections, f)
            net.train()
            '''
            new version of evaluation while training
            # do the evaluation while training
            if global_step % steps_per_eval == 0:
                torchplus.train.save_models(model_dir, [net, optimizer],
                                            net.get_global_step())
                net.eval()
                result_path_step = result_path / f"step_{net.get_global_step()}"
                result_path_step.mkdir(parents=True, exist_ok=True)
                model_logging.log_text("#################################", global_step)
                model_logging.log_text("# EVAL", global_step)
                model_logging.log_text("#################################", global_step)
                model_logging.log_text("Generate output labels...", global_step)
                t = time.time()
                detections = []
                prog_bar = ProgressBar()
                net.clear_timer()
                prog_bar.start((len(eval_dataset) + eval_input_cfg.batch_size - 1)
                               // eval_input_cfg.batch_size)
                for example in iter(eval_dataloader):
                    example = example_convert_to_torch(example, float_dtype)
                    with torch.no_grad():
                        detections += net(example)
                    prog_bar.print_bar()
                sec_per_ex = len(eval_dataset) / (time.time() - t)
                model_logging.log_text(
                    f'generate label finished({sec_per_ex:.2f}/s). start eval:',
                    global_step)
                result_dict = eval_dataset.dataset.evaluation(
                    detections, str(result_path_step))
                for k, v in result_dict["results"].items():
                    model_logging.log_text("Evaluation {}".format(k), global_step)
                    model_logging.log_text(v, global_step)
                model_logging.log_metrics(result_dict["detail"], global_step)
                with open(result_path_step / "result.pkl", 'wb') as f:
                    pickle.dump(detections, f)
                net.train()
            '''
    except Exception as e:
        print("training error")
        raise e
    finally:
        model_logging.close()
        # save model before exit
        torchplus.train.save_models(model_dir, [net, optimizer],
                                    net.get_global_step())
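# Both train() variants above pass `_worker_init_fn` to their DataLoaders.
# In SECOND-style repos this reseeds numpy per dataloader worker so random
# augmentation differs between workers; a minimal sketch (assumption: the
# real function may derive its base seed differently, e.g. from time; `np`
# is already imported at module level in this file):
def _worker_init_fn_sketch(worker_id):
    # offset the current numpy seed by the worker id so every worker
    # draws from its own RNG stream
    np.random.seed(np.random.get_state()[1][0] + worker_id)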
def train(
        config_path: Union[str, Path, pipeline.TrainEvalPipelineConfig],
        model_dir: Union[str, Path],
        data_root_path: Union[str, Path],
        result_path: Optional[Union[str, Path]] = None,
        display_step: int = 50,
        pretrained_path=None,
        pretrained_include=None,
        pretrained_exclude=None,
        freeze_include=None,
        freeze_exclude=None,
        measure_time: bool = False,
        resume: bool = False,
):
    """train a VoxelNet model specified by a config file."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_dir = real_path(model_dir, check_exists=False)
    if not resume and model_dir.exists():
        raise ValueError("model dir exists and you don't specify resume.")
    model_dir.mkdir(parents=True, exist_ok=True)
    model_dir = Path(model_dir)
    if result_path is None:
        result_path = model_dir / "results"
    else:
        result_path = assert_real_path(result_path, mkdir=True)
    config_file_bkp = DEFAULT_CONFIG_FILE_NAME
    if isinstance(config_path, pipeline.TrainEvalPipelineConfig):
        # directly provide a config object. this is usually used when you
        # want to train with several different parameters in one script.
        config = config_path
        proto_str = text_format.MessageToString(
            config, use_short_repeated_primitives=True, indent=2)
    else:
        config_path = assert_real_path(config_path)
        data_root_path = assert_real_path(data_root_path)
        config = read_pipeline_config(config_path, data_root_path)
        # Copy the contents of config_path to config_file_bkp verbatim,
        # without passing it through the protobuf parser.
        with open(str(config_path), "r") as f:
            proto_str = f.read()
    with (model_dir / config_file_bkp).open("w") as f:
        f.write(proto_str)

    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config

    net = build_network(model_cfg, measure_time).to(device)
    if train_cfg.enable_mixed_precision:
        # net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    target_assigner = net.target_assigner
    voxel_generator = net.voxel_generator
    # print("num parameters:", len(list(net.parameters())))
    print("num parameters (million): ", count_parameters(net) * 1e-6)
    torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    if pretrained_path is not None:
        model_dict = net.state_dict()
        pretrained_dict = torch.load(pretrained_path)
        pretrained_dict = filter_param_dict(pretrained_dict,
                                            pretrained_include,
                                            pretrained_exclude)
        new_pretrained_dict = {}
        for k, v in pretrained_dict.items():
            if k in model_dict and v.shape == model_dict[k].shape:
                new_pretrained_dict[k] = v
        print("Load pretrained parameters:")
        for k, v in new_pretrained_dict.items():
            print(k, v.shape)
        model_dict.update(new_pretrained_dict)
        net.load_state_dict(model_dict)
        freeze_params_v2(dict(net.named_parameters()), freeze_include,
                         freeze_exclude)
        net.clear_global_step()
        net.clear_metrics()

    optimizer_cfg = train_cfg.optimizer
    loss_scale = train_cfg.loss_scale_factor
    fastai_optimizer = optimizer_builder.build(optimizer_cfg,
                                               net,
                                               mixed=False,
                                               loss_scale=loss_scale)
    if loss_scale < 0:
        loss_scale = "dynamic"
    amp_optimizer = fastai_optimizer
    torchplus.train.try_restore_latest_checkpoints(model_dir, [amp_optimizer])
    float_dtype = torch.float32
    collate_fn = merge_second_batch
    num_gpu = 1
    # this variant only supports single-GPU training; `multi_gpu` was
    # referenced below but never defined in the original.
    multi_gpu = False

    ######################
    # PREPARE INPUT
    ######################
    def get_train_dataloader(input_cfg, model_cfg, voxel_generator,
                             target_assigner, multi_gpu, num_gpu, collate_fn,
                             _worker_init_fn):
        dataset = input_reader_builder.build(
            input_cfg,
            model_cfg,
            training=True,
            voxel_generator=voxel_generator,
            target_assigner=target_assigner,
            multi_gpu=multi_gpu)
        dataloader = torch.utils.data.DataLoader(
            dataset,
            batch_size=input_cfg.batch_size * num_gpu,
            shuffle=True,
            num_workers=input_cfg.preprocess.num_workers * num_gpu,
            pin_memory=True,
            collate_fn=collate_fn,
            worker_init_fn=_worker_init_fn,
            drop_last=not multi_gpu)
        return dataloader

    eval_dataset = input_reader_builder.build(
        eval_input_cfg,
        model_cfg,
        training=False,
        voxel_generator=voxel_generator,
        target_assigner=target_assigner)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_input_cfg.batch_size,  # only support multi-gpu train
        shuffle=False,
        num_workers=eval_input_cfg.preprocess.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)

    ######################
    # TRAINING
    ######################
    model_logging = SimpleModelLog(model_dir)
    model_logging.open()
    model_logging.log_text(proto_str + "\n", 0, tag="config")
    epochs = train_cfg.steps
    epochs_per_eval = train_cfg.steps_per_eval
    clear_metrics_every_epoch = train_cfg.clear_metrics_every_epoch

    amp_optimizer.zero_grad()
    step_times = []
    eval_times = []
    t = time.time()
    reset_ds_epoch = False
    run_once = True
    if not (os.getenv("MLFLOW_EXPERIMENT_ID")
            or os.getenv("MLFLOW_EXPERIMENT_NAME")):
        mlflow.set_experiment("object_detection")
    try:
        while True:
            if run_once or reset_ds_epoch:
                dataloader = get_train_dataloader(input_cfg, model_cfg,
                                                  voxel_generator,
                                                  target_assigner, multi_gpu,
                                                  num_gpu, collate_fn,
                                                  _worker_init_fn)
                total_step = int(
                    np.ceil((len(dataloader.dataset) / dataloader.batch_size) *
                            epochs))
                steps_per_eval = int(
                    np.floor((len(dataloader.dataset) / dataloader.batch_size) *
                             epochs_per_eval))
                train_cfg.steps = int(total_step)
                train_cfg.steps_per_eval = int(steps_per_eval)
                lr_scheduler = lr_scheduler_builder.build(
                    optimizer_cfg, amp_optimizer, total_step)
                print(f"\nnumber of samples: {len(dataloader.dataset)}"
                      f"\ntotal_steps: {total_step}"
                      f"\nsteps_per_eval: {steps_per_eval}")
                run_once = False
            if clear_metrics_every_epoch:
                net.clear_metrics()
            for example in dataloader:
                lr_scheduler.step(net.get_global_step())
                time_metrics = example["metrics"]
                example.pop("metrics")
                example_torch = example_convert_to_torch(example, float_dtype)
                batch_size = example["anchors"].shape[0]
                ret_dict = net(example_torch)
                cls_preds = ret_dict["cls_preds"]
                loss = ret_dict["loss"].mean()
                cls_loss_reduced = ret_dict["cls_loss_reduced"].mean()
                loc_loss_reduced = ret_dict["loc_loss_reduced"].mean()
                cls_pos_loss = ret_dict["cls_pos_loss"].mean()
                cls_neg_loss = ret_dict["cls_neg_loss"].mean()
                loc_loss = ret_dict["loc_loss"]
                # cls_loss = ret_dict["cls_loss"]
                cared = ret_dict["cared"]
                labels = example_torch["labels"]
                loss.backward()
                torch.nn.utils.clip_grad_norm_(net.parameters(), 30.0)
                # torch.nn.utils.clip_grad_norm_(amp.master_params(amp_optimizer), 10.0)
                amp_optimizer.step()
                amp_optimizer.zero_grad()
                net.update_global_step()
                global_step = net.get_global_step()
                net_metrics = net.update_metrics(cls_loss_reduced,
                                                 loc_loss_reduced, cls_preds,
                                                 labels, cared)
                step_time = (time.time() - t)
                step_times.append(step_time)
                t = time.time()
                metrics = {}
                num_pos = int((labels > 0)[0].float().sum().cpu().numpy())
                num_neg = int((labels == 0)[0].float().sum().cpu().numpy())
                if 'anchors_mask' not in example_torch:
                    num_anchors = example_torch['anchors'].shape[1]
                else:
                    num_anchors = int(example_torch['anchors_mask'][0].sum())
                if global_step % display_step == 0:
                    if measure_time:
                        for name, val in net.get_avg_time_dict().items():
                            print(f"avg {name} time = {val * 1000:.3f} ms")
                    loc_loss_elem = [
                        float(loc_loss[:, :, i].sum().detach().cpu().numpy() /
                              batch_size) for i in range(loc_loss.shape[-1])
                    ]
                    total_seconds = ((total_step - global_step) *
                                     np.mean(step_times))
                    if len(eval_times) != 0:
                        eval_seconds = ((epochs / epochs_per_eval) -
                                        len(eval_times)) * np.mean(eval_times)
                        total_seconds += eval_seconds
                    next_eval_seconds = (steps_per_eval -
                                         (global_step % steps_per_eval)) * np.mean(step_times)
                    metrics["runtime"] = {
                        "step": global_step,
                        "steptime": np.mean(step_times),
                        "ETA": seconds_to_eta(total_seconds),
                        "eval_ETA": seconds_to_eta(next_eval_seconds),
                    }
                    metrics["runtime"].update(time_metrics[0])
                    step_times = []
                    metrics.update(net_metrics)
                    metrics["loss"]["loc_elem"] = loc_loss_elem
                    metrics["loss"]["cls_pos_rt"] = float(
                        cls_pos_loss.detach().cpu().numpy())
                    metrics["loss"]["cls_neg_rt"] = float(
                        cls_neg_loss.detach().cpu().numpy())
                    if model_cfg.use_direction_classifier:
                        dir_loss_reduced = ret_dict["dir_loss_reduced"].mean()
                        metrics["loss"]["dir_rt"] = float(
                            dir_loss_reduced.detach().cpu().numpy())
                    metrics["misc"] = {
                        "num_vox": int(example_torch["voxels"].shape[0]),
                        "num_pos": int(num_pos),
                        "num_neg": int(num_neg),
                        "num_anchors": int(num_anchors),
                        "lr": float(amp_optimizer.lr),
                        "mem_usage": psutil.virtual_memory().percent,
                    }
                    model_logging.log_metrics(metrics, global_step)
                # if global_step % steps_per_eval != 0 and global_step % 1000 == 0:
                #     torchplus.train.save_models(model_dir, [net, amp_optimizer],
                #                                 net.get_global_step())
                if global_step % steps_per_eval == 0:
                    torchplus.train.save_models(model_dir,
                                                [net, amp_optimizer],
                                                global_step)
                    net.eval()
                    result_path_step = result_path / f"step_{global_step}"
                    result_path_step.mkdir(parents=True, exist_ok=True)
                    model_logging.log_text("#################################",
                                           global_step)
                    model_logging.log_text("# EVAL", global_step)
                    model_logging.log_text("#################################",
                                           global_step)
                    model_logging.log_text("Generate output labels...",
                                           global_step)
                    t = time.time()
                    detections = []
                    prog_bar = ProgressBar()
                    net.clear_timer()
                    prog_bar.start(
                        (len(eval_dataset) + eval_input_cfg.batch_size - 1) //
                        eval_input_cfg.batch_size)
                    for example in iter(eval_dataloader):
                        example = example_convert_to_torch(example, float_dtype)
                        detections += net(example)
                        prog_bar.print_bar()
                    sec_per_ex = len(eval_dataset) / (time.time() - t)
                    eval_times.append((time.time() - t))
                    model_logging.log_text(
                        f'generate label finished({sec_per_ex:.2f}/s). start eval:',
                        global_step)
                    result_dict = eval_dataset.dataset.evaluation(
                        detections, result_path_step)
                    if result_dict is None:
                        raise RuntimeError(
                            "eval_dataset.dataset.evaluation() returned None")
                    for k, v in result_dict["results"].items():
                        model_logging.log_text("Evaluation {}".format(k),
                                               global_step)
                        model_logging.log_text(v, global_step)
                    model_logging.log_metrics(result_dict["detail"],
                                              global_step)
                    with open(result_path_step / "result.pkl", 'wb') as f:
                        pickle.dump(detections, f)
                    net.train()
                if global_step >= total_step:
                    break
            if net.get_global_step() >= total_step:
                break
    except Exception as e:
        if 'example' in locals():
            print(json.dumps(example["metadata"], indent=2))
        global_step = net.get_global_step()
        model_logging.log_text(str(e), global_step)
        if 'example' in locals():
            model_logging.log_text(json.dumps(example["metadata"], indent=2),
                                   global_step)
        torchplus.train.save_models(model_dir, [net, amp_optimizer],
                                    global_step)
        raise e
    finally:
        model_logging.close()
        torchplus.train.save_models(model_dir, [net, amp_optimizer],
                                    net.get_global_step())

    def _save_checkpoint_info(file_path, config_filename, checkpoint_filename):
        from yaml import dump
        with open(file_path, "w") as config_info_file:
            checkpoint_info = {
                "config": config_filename,
                "checkpoint": checkpoint_filename
            }
            dump(checkpoint_info, config_info_file, default_flow_style=False)

    ckpt_info_path = str(model_dir / "checkpoint_info.yaml")
    latest_ckpt_filename = "voxelnet-{}.tckpt".format(net.get_global_step())
    _save_checkpoint_info(ckpt_info_path, config_file_bkp,
                          latest_ckpt_filename)
    mlflow.log_artifact(ckpt_info_path, "model")
    mlflow.log_artifact(str(model_dir / config_file_bkp), "model")
    mlflow.log_artifact(str(model_dir / latest_ckpt_filename), "model")
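# For reference, `_save_checkpoint_info` above produces a two-key YAML file in
# block style (default_flow_style=False); PyYAML sorts keys alphabetically by
# default, so the file looks like this (step count is illustrative):
#
#   checkpoint: voxelnet-29360.tckpt
#   config: pipeline.config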
        worker_init_fn=_worker_init_fn,
        drop_last=not cfg.multi_gpu)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_input_cfg.batch_size,  # only support multi-gpu train
        shuffle=False,
        num_workers=eval_input_cfg.preprocess.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)

    ######################
    # TRAINING
    ######################
    model_logging = SimpleModelLog(cfg.model_dir)
    model_logging.open()
    model_logging.log_text(proto_str + "\n", 0, tag="config")
    start_step = net.get_global_step()
    total_step = train_cfg.steps
    t = time.time()
    steps_per_eval = train_cfg.steps_per_eval
    clear_metrics_every_epoch = train_cfg.clear_metrics_every_epoch

    amp_optimizer.zero_grad()
    step_times = []
    step = start_step
    try:
        while True:
            if clear_metrics_every_epoch:
                net.clear_metrics()
            for example in dataloader:
                lr_scheduler.step(net.get_global_step())
class SolverWrapper: def __init__(self, train_net, test_net, pretrain = None, prefix = "pp", model_dir=None, config_path=None, ### Solver Params ### solver_type='ADAM', weight_decay=0.001, lr_policy='step', warmup_step=0, warmup_start_lr=0, lr_ratio=1, end_ratio=1, base_lr=0.002, max_lr=0.002, momentum = 0.9, max_momentum = 0, cycle_steps=1856, gamma=0.8, #0.1 for lr_policy stepsize=100, test_iter=3769, test_interval=50, #set test_interval to 999999999 if not it will auto run validation max_iter=1e5, iter_size=1, snapshot=9999, display=1, random_seed=0, debug_info=False, create_prototxt=True, args=None): """Initialize the SolverWrapper.""" self.test_net = test_net self.solver_param = caffe_pb2.SolverParameter() self.solver_param.train_net = train_net self.solver_param.test_initialization = False self.solver_param.display = display self.solver_param.warmup_step = warmup_step self.solver_param.warmup_start_lr = warmup_start_lr self.solver_param.lr_ratio = lr_ratio self.solver_param.end_ratio = end_ratio self.solver_param.base_lr = base_lr self.solver_param.max_lr = max_lr self.solver_param.cycle_steps = cycle_steps self.solver_param.max_momentum = max_momentum self.solver_param.lr_policy = lr_policy # "fixed" #exp self.solver_param.gamma = gamma self.solver_param.stepsize = stepsize self.solver_param.display = display self.solver_param.max_iter = max_iter self.solver_param.iter_size = iter_size self.solver_param.snapshot = snapshot self.solver_param.snapshot_prefix = os.path.join(model_dir, prefix) self.solver_param.random_seed = random_seed self.solver_param.solver_mode = caffe_pb2.SolverParameter.GPU if solver_type is 'SGD': print("[Info] SGD Solver >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>") self.solver_param.solver_type = caffe_pb2.SolverParameter.SGD elif solver_type is 'ADAM': print("[Info] ADAM Solver >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>") self.solver_param.solver_type = caffe_pb2.SolverParameter.ADAM self.solver_param.momentum = momentum self.solver_param.momentum2 = 0.999 self.solver_param.weight_decay = weight_decay self.solver_param.debug_info = debug_info if create_prototxt: solver_prototxt = get_prototxt(self.solver_param, os.path.join(model_dir, 'solver.prototxt')) print(solver_prototxt) self.solver = caffe.get_solver(solver_prototxt) self.test_interval = test_interval '''Model config parameter Initialization''' self.args = args self.model_dir, self.config_path = model_dir, config_path _, eval_input_cfg, model_cfg, train_cfg = load_config(self.model_dir, self.config_path) voxel_generator, self.target_assigner = build_network(model_cfg) self.dataloader, self.eval_dataset = load_dataloader(eval_input_cfg, model_cfg, voxel_generator, self.target_assigner, args = args) self.model_cfg = model_cfg # NOTE: Could have problem, if eval no good check here self._box_coder=self.target_assigner.box_coder classes_cfg = model_cfg.target_assigner.class_settings self._num_class = len(classes_cfg) self._encode_background_as_zeros = model_cfg.encode_background_as_zeros self._nms_class_agnostic=model_cfg.nms_class_agnostic self._use_multi_class_nms=[c.use_multi_class_nms for c in classes_cfg] self._nms_pre_max_sizes=[c.nms_pre_max_size for c in classes_cfg] self._multiclass_nms=all(self._use_multi_class_nms) self._use_sigmoid_score=model_cfg.use_sigmoid_score self._num_anchor_per_loc=self.target_assigner.num_anchors_per_location self._use_rotate_nms=[c.use_rotate_nms for c in classes_cfg] #False for pillar, True for second self._nms_post_max_sizes=[c.nms_post_max_size for c in classes_cfg] 
#300 for pillar, 100 for second self._nms_score_thresholds=[c.nms_score_threshold for c in classes_cfg] # 0.4 in submit, but 0.3 can get better hard performance #pillar use 0.05, second 0.3 self._nms_iou_thresholds=[c.nms_iou_threshold for c in classes_cfg] ## NOTE: double check #pillar use 0.5, second use 0.01 self._post_center_range=list(model_cfg.post_center_limit_range) ## NOTE: double check self._use_direction_classifier=model_cfg.use_direction_classifier ## NOTE: double check path = pretrain["path"] weight = pretrain["weight"] skip_layer = pretrain["skip_layer"] #list skip layer name if path != None and weight != None: self.load_pretrained_caffe_weight(path, weight, skip_layer) #self.model_logging = log_function(self.model_dir, self.config_path) ################################Log##################################### self.model_logging = SimpleModelLog(self.model_dir) self.model_logging.open() config = pipeline_pb2.TrainEvalPipelineConfig() with open(self.config_path, "r") as f: proto_str = f.read() text_format.Merge(proto_str, config) self.model_logging.log_text(proto_str + "\n", 0, tag="config") self.model_logging.close() ######################################################################## #Log loss ######################################################################## self.log_loss_path = Path(self.model_dir) / f'log_loss.txt' ######################################################################## def load_pretrained_caffe_weight(self, path, weight_path, skip_layer): assert isinstance(skip_layer, list) #pass skip list name inlist print("### Start loading pretrained caffe weights") old_proto_path = os.path.join(path, "train.prototxt") old_weight_path = os.path.join(path, weight_path) print("### Load old caffe model") old_net = caffe.Net(old_proto_path, old_weight_path, caffe.TRAIN) print("### Start loading model layers") for layer in old_net.params.keys(): if layer in skip_layer: print("### Skipped layer: " + layer) continue param_length = len(old_net.params[layer]) print("# Loading layer: " + layer) for index in range(param_length): try: self.solver.net.params[layer][index].data[...] = old_net.params[layer][index].data[...] except Exception as e: print(e) print("!! 
Cannot load layer: " + layer) continue print("### Finish loading pretrained model") def eval_model(self): self.model_logging.open() #logging cur_iter = self.solver.iter # if self.args["segmentation"]: # self.segmentation_evaluation(cur_iter) # else: self.object_detection_evaluation(cur_iter) self.model_logging.close() def train_model(self): cur_iter = self.solver.iter while cur_iter < self.solver_param.max_iter: for i in range(self.test_interval): #####For Restrore check if cur_iter + i >= self.solver_param.max_iter: break self.solver.step(1) if (self.solver.iter-1) % self.solver_param.display == 0: with open(self.log_loss_path, "a") as f: lr = self.solver.lr cls_loss = self.solver.net.blobs['cls_loss'].data[...][0] reg_loss = self.solver.net.blobs['reg_loss'].data[...][0] f.write("steps={},".format(self.solver.iter-1)) f.write("lr={:.8f},".format(lr)) f.write("cls_loss={:.3f},".format(cls_loss)) f.write("reg_loss={:.3f}".format(reg_loss)) f.write("\n") sut.plot_graph(self.log_loss_path, self.model_dir) self.eval_model() sut.clear_caffemodel(self.model_dir, 8) #KEPP Last 8 cur_iter += self.test_interval def lr_finder(self): lr_finder_path = Path(self.model_dir) / f'log_lrf.txt' for _ in range(self.solver_param.max_iter): self.solver.step(1) if (self.solver.iter-1) % self.solver_param.display == 0: with open(lr_finder_path, "a") as f: lr = self.solver.lr cls_loss = self.solver.net.blobs['cls_loss'].data[...][0] reg_loss = self.solver.net.blobs['reg_loss'].data[...][0] f.write("steps={},".format(self.solver.iter-1)) f.write("lr={:.8f},".format(lr)) f.write("cls_loss={:.3f},".format(cls_loss)) f.write("reg_loss={:.3f}".format(reg_loss)) f.write("\n") sut.plot_graph(lr_finder_path, self.model_dir, name='Finder') def demo(self): print("[Info] Initialize test net\n") test_net = caffe.Net(self.test_net, caffe.TEST) test_net.share_with(self.solver.net) print("[Info] Loaded train net weights \n") data_dir = "./debug_tool/experiment/data/2011_09_26_drive_0009_sync/velodyne_points/data" point_cloud_files = os.listdir(data_dir) point_cloud_files.sort() obj_detections = [] # Voxel generator pc_range = self.model_cfg.voxel_generator.point_cloud_range class_settings = self.model_cfg.target_assigner.class_settings[0] size = class_settings.anchor_generator_range.sizes rotations = class_settings.anchor_generator_range.rotations anchor_ranges = np.array(class_settings.anchor_generator_range.anchor_ranges) voxel_size = np.array(self.model_cfg.voxel_generator.voxel_size) out_size_factor = self.model_cfg.middle_feature_extractor.downsample_factor point_cloud_range = np.array(pc_range) grid_size = ( point_cloud_range[3:] - point_cloud_range[:3]) / voxel_size grid_size = np.round(grid_size).astype(np.int64) feature_map_size = grid_size[:2] // out_size_factor feature_map_size = [*feature_map_size, 1][::-1] for file in tqdm(point_cloud_files): file_path = os.path.join(data_dir, file) # with open(file_path, "rb") as f: # points = f.read() points = np.fromfile(file_path, dtype = np.float32).reshape(-1,4) # NOTE: Prior seg preprocessing ### points = box_np_ops.remove_out_pc_range_points(points, pc_range) # Data sampling seg_keep_points = 20000 points = PointRandomChoiceV2(points, seg_keep_points) #Repeat sample according points distance points = np.expand_dims(points, 0) ### # Anchor Generator anchors = box_np_ops.create_anchors_3d_range(feature_map_size, anchor_ranges, size, rotations) # input test_net.blobs['top_prev'].reshape(*points.shape) test_net.blobs['top_prev'].data[...] 
= points test_net.forward() # segmentation output try: seg_preds = test_net.blobs['seg_output'].data[...].squeeze() points = np.squeeze(points) pred_thresh = 0.5 pd_points = points[seg_preds >= pred_thresh,:] with open(os.path.join('./debug_tool/experiment',"pd_points.pkl") , 'ab') as f: pickle.dump(pd_points,f) except Exception as e: pass with open(os.path.join('./debug_tool/experiment',"points.pkl") , 'ab') as f: pickle.dump(points,f) # Bounding box output cls_preds = test_net.blobs['f_cls_preds'].data[...] box_preds = test_net.blobs['f_box_preds'].data[...] preds_dict = {"box_preds":box_preds, "cls_preds":cls_preds} example = {"anchors": np.expand_dims(anchors, 0)} example = example_convert_to_torch(example, torch.float32) preds_dict = example_convert_to_torch(preds_dict, torch.float32) obj_detections += self.predict(example, preds_dict) pd_boxes = obj_detections[-1]["box3d_lidar"].cpu().detach().numpy() with open(os.path.join('./debug_tool/experiment',"pd_boxes.pkl") , 'ab') as f: pickle.dump(pd_boxes,f) ############################################################################ # For object evaluation ############################################################################ def object_detection_evaluation(self, global_step): print("[Info] Initialize test net\n") test_net = caffe.Net(self.test_net, caffe.TEST) test_net.share_with(self.solver.net) print("[Info] Loaded train net weights \n") data_iter=iter(self.dataloader) obj_detections = [] seg_detections = [] t = time.time() model_dir = str(Path(self.model_dir).resolve()) model_dir = Path(model_dir) result_path = model_dir / 'results' result_path_step = result_path / f"step_{global_step}" result_path_step.mkdir(parents=True, exist_ok=True) for i in tqdm(range(len(data_iter))): example = next(data_iter) # points = example['seg_points'] # Pointseg # voxels = example['voxels'] # coors = example['coordinates'] # coors = example['coordinates'] # num_points = example['num_points'] # test_net.blobs['top_prev'].reshape(*points.shape) # test_net.blobs['top_prev'].data[...] = points # test_net.forward() # test_net.blobs['top_lat_feats'].reshape(*(voxels.squeeze()).shape) # test_net.blobs['top_lat_feats'].data[...] = voxels.squeeze() # voxels = voxels.squeeze() # with open(os.path.join('./debug',"points.pkl") , 'ab') as f: # pickle.dump(voxels,f) # voxels = voxels[cls_out,:] # # print("selected voxels", voxels.shape) # with open(os.path.join('./debug',"seg_points.pkl") , 'ab') as f: # pickle.dump(voxels,f) # NOTE: For voxel seg net # seg_points = example['seg_points'] # Pointseg # coords = example['coords'] # coords_center = example['coords_center'] # p2voxel_idx = example['p2voxel_idx'] # test_net.blobs['seg_points'].reshape(*seg_points.shape) # test_net.blobs['seg_points'].data[...] = seg_points # test_net.blobs['coords'].reshape(*coords.shape) # test_net.blobs['coords'].data[...] = coords # test_net.blobs['p2voxel_idx'].reshape(*p2voxel_idx.shape) # test_net.blobs['p2voxel_idx'].data[...] = p2voxel_idx ## # NOTE: For prior seg voxels = example['seg_points'] test_net.blobs['top_prev'].reshape(*voxels.shape) test_net.blobs['top_prev'].data[...] = voxels test_net.forward() ## cls_preds = test_net.blobs['f_cls_preds'].data[...] box_preds = test_net.blobs['f_box_preds'].data[...] 
# seg_preds = test_net.blobs['seg_output'].data[...].squeeze() # feat_map = test_net.blobs['p2fm'].data[...].squeeze().reshape(5,-1).transpose() # feat_map = feat_map[(feat_map != 0).any(-1)] # Reverse coordinate for anchor generator # anchor generated from generator shape (n_anchors, 7) # needed to expand dim for prediction # example["anchors"] = np.expand_dims(anchors, 0) # preds_dict = {"box_preds":box_preds.reshape(1,-1,7), "cls_preds":cls_preds.reshape(1,-1,1)} # example["seg_points"] = voxels preds_dict = {"box_preds":box_preds, "cls_preds":cls_preds} example = example_convert_to_torch(example, torch.float32) preds_dict = example_convert_to_torch(preds_dict, torch.float32) obj_detections += self.predict(example, preds_dict) # seg_detections += self.seg_predict(np.arange(0.5, 0.75, 0.05), seg_preds, example, result_path_step, vis=False) ################ visualization ##################### pd_boxes = obj_detections[-1]["box3d_lidar"].cpu().detach().numpy() with open(os.path.join(result_path_step,"pd_boxes.pkl") , 'ab') as f: pickle.dump(pd_boxes,f) self.model_logging.log_text( f'\nEval at step ---------> {global_step:.2f}:\n', global_step) # Object detection evaluation result_dict = self.eval_dataset.dataset.evaluation(obj_detections, str(result_path_step)) for k, v in result_dict["results"].items(): self.model_logging.log_text("Evaluation {}".format(k), global_step) self.model_logging.log_text(v, global_step) self.model_logging.log_metrics(result_dict["detail"], global_step) # Class segmentation prediction # result_dict = self.total_segmentation_result(seg_detections) # for k, v in result_dict["results"].items(): # self.model_logging.log_text("Evaluation {}".format(k), global_step) # self.model_logging.log_text(v, global_step) # self.model_logging.log_metrics(result_dict["detail"], global_step) def predict(self, example, preds_dict): """start with v1.6.0, this function don't contain any kitti-specific code. Returns: predict: list of pred_dict. pred_dict: { box3d_lidar: [N, 7] 3d box. scores: [N] label_preds: [N] metadata: meta-data which contains dataset-specific information. for kitti, it contains image idx (label idx), for nuscenes, sample_token is saved in it. 
} """ batch_size = example['anchors'].shape[0] # NOTE: for voxel seg net # batch_size = example['coords_center'].shape[0] # batch_size = example['seg_points'].shape[0] if "metadata" not in example or len(example["metadata"]) == 0: meta_list = [None] * batch_size else: meta_list = example["metadata"] batch_anchors = example["anchors"].view(batch_size, -1, example["anchors"].shape[-1]) # NOTE: for voxel seg net # batch_anchors = example["coords_center"].view(batch_size, -1, example["coords_center"].shape[-1]) # batch_anchors = example["seg_points"].view(batch_size, -1, example["seg_points"].shape[-1]) if "anchors_mask" not in example: batch_anchors_mask = [None] * batch_size else: batch_anchors_mask = example["anchors_mask"].view(batch_size, -1) t = time.time() batch_box_preds = preds_dict["box_preds"] batch_cls_preds = preds_dict["cls_preds"] batch_box_preds = batch_box_preds.view(batch_size, -1, self._box_coder.code_size) num_class_with_bg = self._num_class if not self._encode_background_as_zeros: num_class_with_bg = self._num_class + 1 batch_cls_preds = batch_cls_preds.view(batch_size, -1, num_class_with_bg) # NOTE: Original decoding batch_box_preds = self._box_coder.decode_torch(batch_box_preds, batch_anchors) # NOTE: For voxel seg net and point wise prediction # batch_box_preds = box_np_ops.fcos_box_decoder_v2_torch(batch_anchors, # batch_box_preds) if self._use_direction_classifier: batch_dir_preds = preds_dict["dir_cls_preds"] batch_dir_preds = batch_dir_preds.view(batch_size, -1, self._num_direction_bins) else: batch_dir_preds = [None] * batch_size predictions_dicts = [] post_center_range = None if len(self._post_center_range) > 0: post_center_range = torch.tensor( self._post_center_range, dtype=batch_box_preds.dtype, device=batch_box_preds.device).float() for box_preds, cls_preds, dir_preds, a_mask, meta in zip( batch_box_preds, batch_cls_preds, batch_dir_preds, batch_anchors_mask, meta_list): if a_mask is not None: box_preds = box_preds[a_mask] cls_preds = cls_preds[a_mask] box_preds = box_preds.float() cls_preds = cls_preds.float() if self._use_direction_classifier: if a_mask is not None: dir_preds = dir_preds[a_mask] dir_labels = torch.max(dir_preds, dim=-1)[1] if self._encode_background_as_zeros: # this don't support softmax assert self._use_sigmoid_score is True total_scores = torch.sigmoid(cls_preds) else: # encode background as first element in one-hot vector if self._use_sigmoid_score: total_scores = torch.sigmoid(cls_preds)[..., 1:] else: total_scores = F.softmax(cls_preds, dim=-1)[..., 1:] # Apply NMS in birdeye view if self._use_rotate_nms: nms_func = box_torch_ops.rotate_nms else: nms_func = box_torch_ops.nms feature_map_size_prod = batch_box_preds.shape[ 1] // self.target_assigner.num_anchors_per_location if self._multiclass_nms: assert self._encode_background_as_zeros is True boxes_for_nms = box_preds[:, [0, 1, 3, 4, 6]] if not self._use_rotate_nms: box_preds_corners = box_torch_ops.center_to_corner_box2d( boxes_for_nms[:, :2], boxes_for_nms[:, 2:4], boxes_for_nms[:, 4]) boxes_for_nms = box_torch_ops.corner_to_standup_nd( box_preds_corners) selected_boxes, selected_labels, selected_scores = [], [], [] selected_dir_labels = [] scores = total_scores boxes = boxes_for_nms selected_per_class = [] score_threshs = self._nms_score_thresholds pre_max_sizes = self._nms_pre_max_sizes post_max_sizes = self._nms_post_max_sizes iou_thresholds = self._nms_iou_thresholds for class_idx, score_thresh, pre_ms, post_ms, iou_th in zip( range(self._num_class), score_threshs, pre_max_sizes, 
post_max_sizes, iou_thresholds): if self._nms_class_agnostic: class_scores = total_scores.view( feature_map_size_prod, -1, self._num_class)[..., class_idx] class_scores = class_scores.contiguous().view(-1) class_boxes_nms = boxes.view(-1, boxes_for_nms.shape[-1]) class_boxes = box_preds class_dir_labels = dir_labels else: anchors_range = self.target_assigner.anchors_range(class_idx) class_scores = total_scores.view( -1, self._num_class)[anchors_range[0]:anchors_range[1], class_idx] class_boxes_nms = boxes.view(-1, boxes_for_nms.shape[-1])[anchors_range[0]:anchors_range[1], :] class_scores = class_scores.contiguous().view(-1) class_boxes_nms = class_boxes_nms.contiguous().view( -1, boxes_for_nms.shape[-1]) class_boxes = box_preds.view(-1, box_preds.shape[-1])[anchors_range[0]:anchors_range[1], :] class_boxes = class_boxes.contiguous().view( -1, box_preds.shape[-1]) if self._use_direction_classifier: class_dir_labels = dir_labels.view(-1)[anchors_range[0]:anchors_range[1]] class_dir_labels = class_dir_labels.contiguous( ).view(-1) if score_thresh > 0.0: class_scores_keep = class_scores >= score_thresh if class_scores_keep.shape[0] == 0: selected_per_class.append(None) continue class_scores = class_scores[class_scores_keep] if class_scores.shape[0] != 0: if score_thresh > 0.0: class_boxes_nms = class_boxes_nms[ class_scores_keep] class_boxes = class_boxes[class_scores_keep] class_dir_labels = class_dir_labels[ class_scores_keep] keep = nms_func(class_boxes_nms, class_scores, pre_ms, post_ms, iou_th) if keep.shape[0] != 0: selected_per_class.append(keep) else: selected_per_class.append(None) else: selected_per_class.append(None) selected = selected_per_class[-1] if selected is not None: selected_boxes.append(class_boxes[selected]) selected_labels.append( torch.full([class_boxes[selected].shape[0]], class_idx, dtype=torch.int64, device=box_preds.device)) if self._use_direction_classifier: selected_dir_labels.append( class_dir_labels[selected]) selected_scores.append(class_scores[selected]) selected_boxes = torch.cat(selected_boxes, dim=0) selected_labels = torch.cat(selected_labels, dim=0) selected_scores = torch.cat(selected_scores, dim=0) if self._use_direction_classifier: selected_dir_labels = torch.cat(selected_dir_labels, dim=0) else: # get highest score per prediction, than apply nms # to remove overlapped box. if num_class_with_bg == 1: top_scores = total_scores.squeeze(-1) top_labels = torch.zeros( total_scores.shape[0], device=total_scores.device, dtype=torch.long) else: top_scores, top_labels = torch.max( total_scores, dim=-1) if self._nms_score_thresholds[0] > 0.0: top_scores_keep = top_scores >= self._nms_score_thresholds[0] top_scores = top_scores.masked_select(top_scores_keep) print("nms_thres is {} selected {} cars ".format(self._nms_score_thresholds, len(top_scores))) if top_scores.shape[0] != 0: if self._nms_score_thresholds[0] > 0.0: box_preds = box_preds[top_scores_keep] if self._use_direction_classifier: dir_labels = dir_labels[top_scores_keep] top_labels = top_labels[top_scores_keep] boxes_for_nms = box_preds[:, [0, 1, 3, 4, 6]] if not self._use_rotate_nms: box_preds_corners = box_torch_ops.center_to_corner_box2d( boxes_for_nms[:, :2], boxes_for_nms[:, 2:4], boxes_for_nms[:, 4]) boxes_for_nms = box_torch_ops.corner_to_standup_nd( box_preds_corners) # the nms in 3d detection just remove overlap boxes. 
selected = nms_func( boxes_for_nms, top_scores, pre_max_size=self._nms_pre_max_sizes[0], post_max_size=self._nms_post_max_sizes[0], iou_threshold=self._nms_iou_thresholds[0], ) else: selected = [] # if selected is not None: selected_boxes = box_preds[selected] print("IoU_thresh is {} remove {} overlap".format(self._nms_iou_thresholds, (len(box_preds)-len(selected_boxes)))) #Eval debug if "gt_num" in example: eval_idx = example['metadata'][0]['image_idx'] eval_obj_num = example['gt_num'] detetion_error = eval_obj_num-len(selected_boxes) print("Eval img_{} have {} Object, detected {} Object, error {} ".format(eval_idx, eval_obj_num, len(selected_boxes), detetion_error)) if self._use_direction_classifier: selected_dir_labels = dir_labels[selected] selected_labels = top_labels[selected] selected_scores = top_scores[selected] # finally generate predictions. if selected_boxes.shape[0] != 0: box_preds = selected_boxes scores = selected_scores label_preds = selected_labels if self._use_direction_classifier: dir_labels = selected_dir_labels period = (2 * np.pi / self._num_direction_bins) dir_rot = box_torch_ops.limit_period( box_preds[..., 6] - self._dir_offset, self._dir_limit_offset, period) box_preds[ ..., 6] = dir_rot + self._dir_offset + period * dir_labels.to( box_preds.dtype) final_box_preds = box_preds final_scores = scores final_labels = label_preds if post_center_range is not None: mask = (final_box_preds[:, :3] >= post_center_range[:3]).all(1) mask &= (final_box_preds[:, :3] <= post_center_range[3:]).all(1) predictions_dict = { "box3d_lidar": final_box_preds[mask], "scores": final_scores[mask], "label_preds": label_preds[mask], "metadata": meta, } else: predictions_dict = { "box3d_lidar": final_box_preds, "scores": final_scores, "label_preds": label_preds, "metadata": meta, } else: dtype = batch_box_preds.dtype device = batch_box_preds.device predictions_dict = { "box3d_lidar": torch.zeros([0, box_preds.shape[-1]], dtype=dtype, device=device), "scores": torch.zeros([0], dtype=dtype, device=device), "label_preds": torch.zeros([0], dtype=top_labels.dtype, device=device), "metadata": meta, } predictions_dicts.append(predictions_dict) return predictions_dicts ############################################################################ # For segmentation evaluation ############################################################################ def segmentation_evaluation(self, global_step): print("Initialize test net") test_net = caffe.Net(self.test_net, caffe.TEST) print("Load train net weights") test_net.share_with(self.solver.net) _, eval_input_cfg, model_cfg, train_cfg = load_config(self.model_dir, self.config_path) voxel_generator, self.target_assigner = build_network(model_cfg) ## TODO: dataloader, _= load_dataloader(eval_input_cfg, model_cfg, voxel_generator, self.target_assigner, args = self.args) data_iter=iter(dataloader) model_dir = str(Path(self.model_dir).resolve()) model_dir = Path(model_dir) result_path = model_dir / 'results' result_path_step = result_path / f"step_{global_step}" result_path_step.mkdir(parents=True, exist_ok=True) detections = [] detections_voc = [] detections_05 = [] for i in tqdm(range(len(data_iter))): example = next(data_iter) points = example['seg_points'] test_net.blobs['top_prev'].reshape(*points.shape) test_net.blobs['top_prev'].data[...] 
= points test_net.forward() #seg_cls_pred output shape (1,1,1,16000) # seg_cls_pred = test_net.blobs["output"].data[...].squeeze() seg_cls_pred = test_net.blobs['seg_output'].data[...].squeeze() detections += self.seg_predict(np.arange(0.5, 0.75, 0.05), seg_cls_pred, example, result_path_step, vis=False) # detections_voc += self.seg_predict([0.1, 0.3, 0.5, 0.7, 0.9], seg_cls_pred, example, result_path_step, vis=False) # detections_05 += self.seg_predict([0.5], seg_cls_pred, example, result_path_step, vis=False) result_dict = self.total_segmentation_result(detections) # result_dict_voc = self.total_segmentation_result(detections_voc) # result_dict_05 = self.total_segmentation_result(detections_05) self.model_logging.log_text( f'\nEval at step ---------> {global_step:.2f}:\n', global_step) for k, v in result_dict["results"].items(): self.model_logging.log_text("Evaluation {}".format(k), global_step) self.model_logging.log_text(v, global_step) self.model_logging.log_metrics(result_dict["detail"], global_step) # print("\n") # for k, v in result_dict_voc["results"].items(): # self.model_logging.log_text("Evaluation VOC {}".format(k), global_step) # self.model_logging.log_text(v, global_step) # self.model_logging.log_metrics(result_dict_voc["detail"], global_step) # print("\n") # for k, v in result_dict_05["results"].items(): # self.model_logging.log_text("Evaluation 0.5 {}".format(k), global_step) # self.model_logging.log_text(v, global_step) # self.model_logging.log_metrics(result_dict_05["detail"], global_step) def seg_predict(self, thresh_range, pred, example, result_path_step, vis=False): # pred = 1 / (1 + np.exp(-pred)) #sigmoid gt = example['seg_labels'] ############### Params ############### eps = 1e-5 cls_thresh_range = thresh_range pos_class = 1 # Car list_score = [] cls_thresh_list = [] ############### Params ############### pred, gt = np.array(pred), np.array(gt) gt = np.squeeze(gt) labels = np.unique(gt) ##################Traverse cls_thresh################################### for cls_thresh in cls_thresh_range: scores = {} _pred = np.where(pred>cls_thresh, 1, 0) TPs = np.sum((gt == pos_class) * (_pred == pos_class)) TNs = np.sum((gt != pos_class) * (_pred != pos_class)) FPs = np.sum((gt != pos_class) * (_pred == pos_class)) FNs = np.sum((gt == pos_class) * (_pred != pos_class)) TargetTotal= np.sum(gt == pos_class) scores['accuracy'] = TPs / (TargetTotal + eps) scores['class_iou'] = TPs / ((TPs + FNs + FPs) + eps) scores['precision'] = TPs / ((TPs + FPs) + eps) cls_thresh_list.append(scores) ###################Found best cls_thresh################################ thresh_accuracy=[] thresh_class_iou=[] thresh_precision=[] max_class_iou = 0 max_class_iou_thresh = 0 for thresh, cls_list in zip(cls_thresh_range, cls_thresh_list): accuracy = cls_list['accuracy'] class_iou = cls_list['class_iou'] precision = cls_list['precision'] thresh_accuracy.append(accuracy) thresh_class_iou.append(class_iou) thresh_precision.append(precision) if class_iou > max_class_iou: max_class_iou = class_iou max_class_iou_thresh = thresh scores['accuracy'] = np.mean(np.array(thresh_accuracy)) scores['class_iou'] = np.mean(np.array(thresh_class_iou)) scores['precision'] = np.mean(np.array(thresh_precision)) scores['best_thresh'] = max_class_iou_thresh #choose the max_thresh for seg ############################pred_thresh################################# pred_thresh = self._nms_score_thresholds[0] points = example['seg_points'] points = np.squeeze(points) pd_points = points[pred >= pred_thresh] with 
def seg_predict(self, thresh_range, pred, example, result_path_step, vis=False):
    # pred = 1 / (1 + np.exp(-pred))  # sigmoid
    gt = example['seg_labels']
    ############### Params ###############
    eps = 1e-5
    cls_thresh_range = thresh_range
    pos_class = 1  # Car
    list_score = []
    cls_thresh_list = []
    ############### Params ###############
    pred, gt = np.array(pred), np.array(gt)
    gt = np.squeeze(gt)
    labels = np.unique(gt)
    ################## Traverse cls_thresh ##################
    for cls_thresh in cls_thresh_range:
        scores = {}
        _pred = np.where(pred > cls_thresh, 1, 0)
        TPs = np.sum((gt == pos_class) * (_pred == pos_class))
        TNs = np.sum((gt != pos_class) * (_pred != pos_class))
        FPs = np.sum((gt != pos_class) * (_pred == pos_class))
        FNs = np.sum((gt == pos_class) * (_pred != pos_class))
        target_total = np.sum(gt == pos_class)
        # note: this is recall (TP / positives), kept under the 'accuracy' key
        scores['accuracy'] = TPs / (target_total + eps)
        scores['class_iou'] = TPs / (TPs + FNs + FPs + eps)
        scores['precision'] = TPs / (TPs + FPs + eps)
        cls_thresh_list.append(scores)
    ################## Find the best cls_thresh ##################
    thresh_accuracy = []
    thresh_class_iou = []
    thresh_precision = []
    max_class_iou = 0
    max_class_iou_thresh = 0
    for thresh, cls_list in zip(cls_thresh_range, cls_thresh_list):
        accuracy = cls_list['accuracy']
        class_iou = cls_list['class_iou']
        precision = cls_list['precision']
        thresh_accuracy.append(accuracy)
        thresh_class_iou.append(class_iou)
        thresh_precision.append(precision)
        if class_iou > max_class_iou:
            max_class_iou = class_iou
            max_class_iou_thresh = thresh
    # `scores` still aliases the last per-threshold dict; its fields are
    # overwritten with the across-threshold means here.
    scores['accuracy'] = np.mean(np.array(thresh_accuracy))
    scores['class_iou'] = np.mean(np.array(thresh_class_iou))
    scores['precision'] = np.mean(np.array(thresh_precision))
    scores['best_thresh'] = max_class_iou_thresh  # IoU-maximizing threshold
    ################## pred_thresh ##################
    pred_thresh = self._nms_score_thresholds[0]
    points = np.squeeze(example['seg_points'])
    pd_points = points[pred >= pred_thresh]
    # note: the predicted foreground points are appended to "gt_points.pkl"
    with open(os.path.join(result_path_step, "gt_points.pkl"), 'ab') as f:
        pickle.dump(pd_points, f)
    if vis:
        image_idx = example['image_idx']
        gt_boxes = example['gt_boxes']
        with open(os.path.join(result_path_step, "image_idx.pkl"), 'ab') as f:
            pickle.dump(image_idx, f)
        with open(os.path.join(result_path_step, "points.pkl"), 'ab') as f:
            pickle.dump(points, f)
        with open(os.path.join(result_path_step, "gt_boxes.pkl"), 'ab') as f:
            pickle.dump(gt_boxes, f)
    list_score.append(scores)
    return list_score

def total_segmentation_result(self, detections):
    avg_accuracy = []
    avg_class_iou = []
    avg_precision = []
    avg_thresh = []
    for det in detections:
        avg_accuracy.append(det['accuracy'])
        avg_class_iou.append(det['class_iou'])
        avg_precision.append(det['best_thresh'] * 0 + det['precision'])
        avg_thresh.append(det['best_thresh'])
    # average over non-zero entries only (frames without cars contribute zeros)
    avg_accuracy = np.sum(np.array(avg_accuracy)) / np.sum(np.array(avg_accuracy) != 0)
    avg_class_iou = np.sum(np.array(avg_class_iou)) / np.sum(np.array(avg_class_iou) != 0)
    avg_precision = np.sum(np.array(avg_precision)) / np.sum(np.array(avg_precision) != 0)
    avg_thresh = np.sum(np.array(avg_thresh)) / np.sum(np.array(avg_thresh) != 0)
    print('-------------------- Summary --------------------')
    result_dict = {}
    result_dict['results'] = {
        "Summary": 'Threshold: {:.3f} \n'.format(avg_thresh) +
                   'Accuracy: {:.3f} \n'.format(avg_accuracy) +
                   'Car IoU: {:.3f} \n'.format(avg_class_iou) +
                   'Precision: {:.3f} \n'.format(avg_precision)
    }
    result_dict['detail'] = {
        "Threshold": avg_thresh,
        "Accuracy": avg_accuracy,
        "Car IoU": avg_class_iou,
        "Precision": avg_precision,
    }
    return result_dict
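# total_segmentation_result above averages each metric only over frames where
# it is non-zero, so frames with no car points (which contribute zeros) do not
# drag the summary down. A tiny illustration of that convention; the helper
# name and the sample values are made up for this example:
def _nonzero_mean_sketch(values):
    arr = np.asarray(values, dtype=np.float64)
    # e.g. [0.7, 0.0, 0.5] -> 1.2 / 2 = 0.6, whereas a plain mean gives 0.4
    return np.sum(arr) / np.sum(arr != 0)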
def train(config_path,
          model_dir,
          result_path=None,
          create_folder=False,
          display_step=50,
          pretrained_path=None,
          pretrained_include=None,
          pretrained_exclude=None,
          freeze_include=None,
          freeze_exclude=None,
          multi_gpu=False,
          measure_time=False,
          resume=False):
    """Train a PointPillars model specified by a config file."""
    torch.cuda.empty_cache()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_dir = str(Path(model_dir).resolve())
    if create_folder:
        if Path(model_dir).exists():
            model_dir = torchplus.train.create_folder(model_dir)
    model_dir = Path(model_dir)
    if not resume and model_dir.exists():
        raise ValueError("model dir exists and resume is not specified.")
    model_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    config, proto_str = load_config(model_dir, config_path)
    input_cfg = config.train_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    target_assigner_cfg = model_cfg.target_assigner
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    box_coder.custom_ndim = target_assigner._anchor_generators[0].custom_ndim
    net = PointPillarsNet(1,
                          voxel_generator.grid_size,
                          target_assigner.num_anchors_per_location,
                          target_assigner.box_coder.code_size,
                          with_distance=False).to(device)
    kaiming_init(net, 1.0)
    net_loss = build_net_loss(model_cfg, target_assigner).to(device)
    net_loss.clear_global_step()
    net_loss.clear_metrics()
    # print("num parameters:", len(list(net.parameters())))
    load_pretrained_model(net, pretrained_path, pretrained_include,
                          pretrained_exclude, freeze_include, freeze_exclude)
    if resume:
        torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    amp_optimizer, lr_scheduler = create_optimizer(model_dir, train_cfg, net)
    collate_fn = merge_second_batch
    num_gpu = 1
    ######################
    # PREPARE INPUT
    ######################
    dataset = input_reader_builder.build(input_cfg,
                                         model_cfg,
                                         training=True,
                                         voxel_generator=voxel_generator,
                                         target_assigner=target_assigner,
                                         multi_gpu=multi_gpu)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=input_cfg.batch_size * num_gpu,
        shuffle=True,
        num_workers=input_cfg.preprocess.num_workers * num_gpu,
        pin_memory=False,
        collate_fn=collate_fn,
        worker_init_fn=_worker_init_fn,
        drop_last=not multi_gpu)
    ######################
    # TRAINING
    ######################
    model_logging = SimpleModelLog(model_dir)
    model_logging.open()
    model_logging.log_text(proto_str + "\n", 0, tag="config")
    start_step = net_loss.get_global_step()
    total_step = train_cfg.steps
    t = time.time()
    steps_per_eval = train_cfg.steps_per_eval
    clear_metrics_every_epoch = train_cfg.clear_metrics_every_epoch
    # float_dtype was never set in this version of train(); default to fp32
    float_dtype = torch.float32
    amp_optimizer.zero_grad()
    step_times = []
    step = start_step
    best_mAP = 0
    epoch = 0
    net.train()
    net_loss.train()
    try:
        while True:
            if clear_metrics_every_epoch:
                net_loss.clear_metrics()
            for example in dataloader:
                lr_scheduler.step(net_loss.get_global_step())
                time_metrics = example["metrics"]
                example.pop("metrics")
                example_torch = example_convert_to_torch(example, float_dtype)
                batch_size = example_torch["anchors"].shape[0]
                coors = example_torch["coordinates"]
                input_features = compute_model_input(
                    voxel_generator.voxel_size,
                    voxel_generator.point_cloud_range,
                    with_distance=False,
                    voxels=example_torch['voxels'],
                    num_voxels=example_torch['num_points'],
                    coors=coors)
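                # Note: compute_model_input (above) builds the per-point pillar
                # features from the raw voxels. In the standard PointPillars
                # encoding with with_distance=False this yields 9 channels per
                # point (x, y, z, reflectance, offsets to the pillar's point
                # mean, and offsets to the pillar center). Stated here from the
                # paper's convention as a reading aid, not verified against
                # this repo's compute_model_input.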
                # input_features = reshape_input(batch_size, input_features,
                #                                coors, voxel_generator.grid_size)
                input_features = reshape_input1(input_features)
                net.batch_size = batch_size
                preds_list = net(input_features, coors)
                ret_dict = net_loss(example_torch, preds_list)
                cls_preds = ret_dict["cls_preds"]
                loss = ret_dict["loss"].mean()
                cls_loss_reduced = ret_dict["cls_loss_reduced"].mean()
                loc_loss_reduced = ret_dict["loc_loss_reduced"].mean()
                cls_pos_loss = ret_dict["cls_pos_loss"].mean()
                cls_neg_loss = ret_dict["cls_neg_loss"].mean()
                loc_loss = ret_dict["loc_loss"]
                cls_loss = ret_dict["cls_loss"]
                cared = ret_dict["cared"]
                labels = example_torch["labels"]
                loss.backward()
                torch.nn.utils.clip_grad_norm_(net.parameters(), 10.0)
                amp_optimizer.step()
                amp_optimizer.zero_grad()
                net_loss.update_global_step()
                net_metrics = net_loss.update_metrics(cls_loss_reduced,
                                                      loc_loss_reduced,
                                                      cls_preds, labels, cared)
                step_time = (time.time() - t)
                step_times.append(step_time)
                t = time.time()
                metrics = {}
                num_pos = int((labels > 0)[0].float().sum().cpu().numpy())
                num_neg = int((labels == 0)[0].float().sum().cpu().numpy())
                if 'anchors_mask' not in example_torch:
                    num_anchors = example_torch['anchors'].shape[1]
                else:
                    num_anchors = int(example_torch['anchors_mask'][0].sum())
                global_step = net_loss.get_global_step()
                if global_step % display_step == 0:
                    loc_loss_elem = [
                        float(loc_loss[:, :, i].sum().detach().cpu().numpy() /
                              batch_size) for i in range(loc_loss.shape[-1])
                    ]
                    metrics["runtime"] = {
                        "step": global_step,
                        "steptime": np.mean(step_times),
                    }
                    metrics["runtime"].update(time_metrics[0])
                    step_times = []
                    metrics.update(net_metrics)
                    metrics["loss"]["loc_elem"] = loc_loss_elem
                    metrics["loss"]["cls_pos_rt"] = float(
                        cls_pos_loss.detach().cpu().numpy())
                    metrics["loss"]["cls_neg_rt"] = float(
                        cls_neg_loss.detach().cpu().numpy())
                    if model_cfg.use_direction_classifier:
                        dir_loss_reduced = ret_dict["dir_loss_reduced"].mean()
                        metrics["loss"]["dir_rt"] = float(
                            dir_loss_reduced.detach().cpu().numpy())
                    metrics["misc"] = {
                        "num_vox": int(example_torch["voxels"].shape[0]),
                        "num_pos": int(num_pos),
                        "num_neg": int(num_neg),
                        "num_anchors": int(num_anchors),
                        "lr": float(amp_optimizer.lr),
                        "mem_usage": psutil.virtual_memory().percent,
                    }
                    model_logging.log_metrics(metrics, global_step)
                step += 1
            epoch += 1
            if epoch % 2 == 0:
                global_step = net_loss.get_global_step()
                torchplus.train.save_models(model_dir, [net, amp_optimizer],
                                            global_step)
                net.eval()
                net_loss.eval()
                best_mAP = evaluate(net, net_loss, best_mAP, voxel_generator,
                                    target_assigner, config, model_logging,
                                    model_dir, result_path)
                net.train()
                net_loss.train()
                if epoch > 100:
                    break
            if epoch > 100:
                break
    except Exception as e:
        print(json.dumps(example["metadata"], indent=2))
        model_logging.log_text(str(e), step)
        model_logging.log_text(json.dumps(example["metadata"], indent=2), step)
        torchplus.train.save_models(model_dir, [net, amp_optimizer], step)
        raise e
    finally:
        model_logging.close()
        torchplus.train.save_models(model_dir, [net, amp_optimizer],
                                    net_loss.get_global_step())
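# A minimal command-line entry point for the trainer above. The repo's actual
# dispatch is not shown in this excerpt; this sketch assumes the google-fire
# style used by SECOND-family codebases (e.g.
# `python train.py train --config_path=... --model_dir=...`) and is an
# illustration, not the confirmed CLI of this file.
if __name__ == '__main__':
    import fire
    fire.Fire()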