def train_model(): """Model training loop.""" model, weights_file, start_iter, checkpoints, output_dir = create_model() if 'final' in checkpoints: # The final model was found in the output directory, so nothing to do return checkpoints setup_model_for_training(model, weights_file, output_dir) training_stats = TrainingStats(model) CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) mc = pylibmc.Client(["127.0.0.1:11212"], binary=True, behaviors={ "tcp_nodelay": True, "ketama": True }) for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER): if model.roi_data_loader.has_stopped(): handle_critical_error(model, 'roi_data_loader failed') training_stats.IterTic() workspace.RunNet(model.net.Proto().name) freeze_fastrcnn_label = [] freeze_fastrcnn_label.append(workspace.FetchBlob('gpu_0/cls_score')) freeze_fastrcnn_label.append(workspace.FetchBlob('gpu_0/bbox_pred')) while True: if (mc.get('freeze_fastrcnn_label_s') == 'yidu'): break mc.replace('freeze_fastrcnn_label', freeze_fastrcnn_label) mc.replace('freeze_fastrcnn_label_s', 'weidu') training_stats.IterToc() training_stats.UpdateIterStats() if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter: checkpoints[cur_iter] = os.path.join( output_dir, 'model_iter{}.pkl'.format(cur_iter)) #nu.save_model_to_weights_file(checkpoints[cur_iter], model) if cur_iter == start_iter + training_stats.LOG_PERIOD: # Reset the iteration timer to remove outliers from the first few # SGD iterations training_stats.ResetIterTimer() if np.isnan(training_stats.iter_total_loss): handle_critical_error(model, 'Loss is NaN') # Save the final model checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl') #nu.save_model_to_weights_file(checkpoints['final'], model) # Shutdown data loading threads model.roi_data_loader.shutdown() return checkpoints
def train_model(): """Model training loop.""" model, weights_file, start_iter, checkpoints, output_dir = create_model() if 'final' in checkpoints: # The final model was found in the output directory, so nothing to do return checkpoints with SummaryWriter(log_dir=get_output_dir(cfg.TRAIN.DATASETS) + "/events") as writer: writer.write_graph([model.net]) logger = logging.getLogger(__name__) setup_model_for_training(model, weights_file, output_dir) training_stats = TrainingStats(model) CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) error_count = 0 for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER): if model.roi_data_loader.has_stopped(): handle_critical_error(model, 'roi_data_loader failed') training_stats.IterTic() lr = model.UpdateWorkspaceLr(cur_iter, lr_policy.get_lr_at_iter(cur_iter)) try: workspace.RunNet(model.net.Proto().name) except: error_count += 1 logger.warn("Error in iter {}, error count: {}".format( cur_iter, error_count)) if not cfg.CONTINUE_ON_ERROR: raise if cur_iter == start_iter: nu.print_net(model) training_stats.IterToc() training_stats.UpdateIterStats() training_stats.LogIterStats(cur_iter, lr) if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter: checkpoints[cur_iter] = os.path.join( output_dir, 'model_iter{}.pkl'.format(cur_iter)) nu.save_model_to_weights_file(checkpoints[cur_iter], model) if cur_iter == start_iter + cfg.TRAIN.EPOCH_PERIOD: # Reset the iteration timer to remove outliers from the first few # SGD iterations training_stats.ResetIterTimer() if np.isnan(training_stats.iter_total_loss): handle_critical_error(model, 'Loss is NaN') # Save the final model checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl') nu.save_model_to_weights_file(checkpoints['final'], model) # Shutdown data loading threads model.roi_data_loader.shutdown() return checkpoints
def train_model(use_tfboard=False):
    """Model training loop."""
    logger = logging.getLogger(__name__)
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints
    if use_tfboard:
        from c2board.writer import SummaryWriter
        tblogger = SummaryWriter(output_dir)
        tblogger.write_graph(model)
    setup_model_for_training(model, weights_file, output_dir)
    training_stats = TrainingStats(model, tblogger if use_tfboard else None)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)
    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter, lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)
        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter)
            )
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)
        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()
        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()
        if use_tfboard:
            # Log the input images for each GPU to tensorboard
            for gpu_id in range(cfg.NUM_GPUS):
                tblogger.append_image("gpu_{}/data".format(gpu_id))
            tblogger.write_summaries(cur_iter)
    if use_tfboard:
        tblogger.close()
    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
def train_model(): """Model training loop.""" logger = logging.getLogger(__name__) model, weights_file, start_iter, checkpoints, output_dir = create_model() if 'final' in checkpoints: # The final model was found in the output directory, so nothing to do return checkpoints setup_model_for_training(model, weights_file, output_dir) training_stats = TrainingStats(model) CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER): training_stats.IterTic() lr = model.UpdateWorkspaceLr(cur_iter, lr_policy.get_lr_at_iter(cur_iter)) workspace.RunNet(model.net.Proto().name) if cur_iter == start_iter: nu.print_net(model) training_stats.IterToc() training_stats.UpdateIterStats() training_stats.LogIterStats(cur_iter, lr) if cur_iter == start_iter: #data_to_save = ['gpu_0/mask_emb_logits','gpu_0/mask_emb_labels','gpu_0/mask_emb_prob','gpu_0/person_mask','gpu_0/body_mask_labels','gpu_0/fg_emb','gpu_0/mask_fcn_emb','gpu_0/fg_emb_normed','gpu_0/bg_emb_normed'] # data_to_save = ['gpu_0/mask_emb_logits','gpu_0/mask_emb_labels','gpu_0/mask_emb_prob','gpu_0/mask_fcn_logits','gpu_0/masks_int32','gpu_0/fg_emb','gpu_0/mask_fcn_emb','gpu_0/fg_emb_normed','gpu_0/bg_emb_normed'] # data_to_save = ['gpu_0/data','gpu_0/body_uv_rois','gpu_0/body_masks_wrt_box','gpu_0/body_mask_labels'] # data_to_save = ['gpu_0/data', 'gpu_0/mask_rois', 'gpu_0/inter_masks_int32', 'gpu_0/masks_int32'] data_to_save = [ 'gpu_0/data', 'gpu_0/keypoint_rois', 'gpu_0/inter_keypoint_int32', 'gpu_0/keypoint_locations_int32' ] #data = [workspace.FetchBlob(k) for k in data_to_save] #cPickle.dump(data,open('inter_kps_data.pkl','wb')) if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter: checkpoints[cur_iter] = os.path.join( output_dir, 'model_iter{}.pkl'.format(cur_iter)) nu.save_model_to_weights_file(checkpoints[cur_iter], model) if cur_iter == start_iter + training_stats.LOG_PERIOD: # Reset the iteration timer to remove outliers from the first few # SGD iterations training_stats.ResetIterTimer() if np.isnan(training_stats.iter_total_loss): logger.critical('Loss is NaN, exiting...') model.roi_data_loader.shutdown() envu.exit_on_error() # Save the final model checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl') nu.save_model_to_weights_file(checkpoints['final'], model) # Shutdown data loading threads model.roi_data_loader.shutdown() return checkpoints
def train_model(): """Model training loop.""" model, weights_file, start_iter, checkpoints, output_dir = create_model() if 'final' in checkpoints: # The final model was found in the output directory, so nothing to do return checkpoints setup_model_for_training(model, weights_file, output_dir) training_stats = TrainingStats(model) CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) TEST_PERIOD = int(cfg.TRAIN.TEST_ITERS) for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER): if model.roi_data_loader.has_stopped(): handle_critical_error(model, 'roi_data_loader failed') training_stats.IterTic() lr = model.UpdateWorkspaceLr(cur_iter, lr_policy.get_lr_at_iter(cur_iter)) workspace.RunNet(model.net.Proto().name) if cur_iter == start_iter: nu.print_net(model) training_stats.IterToc() training_stats.UpdateIterStats() training_stats.LogIterStats(cur_iter, lr) if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter: print("================SAVING MODEL=================") checkpoints[cur_iter] = os.path.join( output_dir, 'model_iter{}.pkl'.format(cur_iter)) nu.save_model_to_weights_file(checkpoints[cur_iter], model) if (cur_iter + 1) % TEST_PERIOD == 0 and cur_iter > start_iter: print("================RUN INFERENCE==================") checkpoints[cur_iter] = os.path.join( output_dir, 'model_iter{}.pkl'.format(cur_iter)) nu.save_model_to_weights_file(checkpoints[cur_iter], model) run_inference(checkpoints[cur_iter], multi_gpu_testing=False, check_expected_results=True) if cur_iter == start_iter + training_stats.LOG_PERIOD: # Reset the iteration timer to remove outliers from the first few # SGD iterations training_stats.ResetIterTimer() if np.isnan(training_stats.iter_total_loss): handle_critical_error(model, 'Loss is NaN') # Save the final model print("=====================FINAL=======================") checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl') nu.save_model_to_weights_file(checkpoints['final'], model) # Shutdown data loading threads model.roi_data_loader.shutdown() return checkpoints
def train_model(): """Model training loop.""" logger = logging.getLogger(__name__) model, weights_file, start_iter, checkpoints, output_dir = create_model() if 'final' in checkpoints: # The final model was found in the output directory, so nothing to do return checkpoints setup_model_for_training(model, weights_file, output_dir) training_stats = TrainingStats(model) CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER): training_stats.IterTic() lr = model.UpdateWorkspaceLr(cur_iter, lr_policy.get_lr_at_iter(cur_iter)) workspace.RunNet(model.net.Proto().name) # np.save('DensePoseData/image.npy', workspace.FetchBlob('gpu_0/data')) # # np.save('DensePoseData/output.npy',workspace.FetchBlob('conv1')) # np.save('DensePoseData/outputgpu.npy',workspace.FetchBlob('gpu_0/conv1')) if cur_iter == start_iter: nu.print_net(model) training_stats.IterToc() training_stats.UpdateIterStats() training_stats.LogIterStats(cur_iter, lr) if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter: logger.info("\n\nCheckpoint Reached....Saving model \n\n") checkpoints[cur_iter] = os.path.join( output_dir, 'model_iter{}.pkl'.format(cur_iter)) nu.save_model_to_weights_file(checkpoints[cur_iter], model) if cur_iter == start_iter + training_stats.LOG_PERIOD: # Reset the iteration timer to remove outliers from the first few # SGD iterations training_stats.ResetIterTimer() if np.isnan(training_stats.iter_total_loss): logger.critical('Loss is NaN, exiting...') model.roi_data_loader.shutdown() envu.exit_on_error() # Save the final model checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl') nu.save_model_to_weights_file(checkpoints['final'], model) # Shutdown data loading threads model.roi_data_loader.shutdown() return checkpoints
def train_model(): """Model training loop.""" model, weights_file, start_iter, checkpoints, output_dir = create_model() if 'final' in checkpoints: # The final model was found in the output directory, so nothing to do return checkpoints setup_model_for_training(model, weights_file, output_dir) writer = SummaryWriter(log_dir=output_dir) training_stats = TrainingStats(model, writer) CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER): if model.roi_data_loader.has_stopped(): handle_critical_error(model, 'roi_data_loader failed') training_stats.IterTic() lr = model.UpdateWorkspaceLr(cur_iter, lr_policy.get_lr_at_iter(cur_iter)) workspace.RunNet(model.net.Proto().name) if cur_iter == start_iter: nu.print_net(model) training_stats.IterToc() training_stats.UpdateIterStats(cur_iter) training_stats.LogIterStats(cur_iter, lr) writer.add_scalar('learning_rate', lr, cur_iter) if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter: checkpoints[cur_iter] = os.path.join( output_dir, 'model_iter{}.pkl'.format(cur_iter)) nu.save_model_to_weights_file(checkpoints[cur_iter], model) if cur_iter == start_iter + training_stats.LOG_PERIOD: # Reset the iteration timer to remove outliers from the first few # SGD iterations training_stats.ResetIterTimer() if np.isnan(training_stats.iter_total_loss): handle_critical_error(model, 'Loss is NaN') # Save the final model checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl') nu.save_model_to_weights_file(checkpoints['final'], model) # save train loss and metric state_file = os.path.join(output_dir, 'training_state.json') training_stats.SaveTrainingStates(state_file) # Shutdown data loading threads model.roi_data_loader.shutdown() return checkpoints
def train_model(max_iters, roidb, pretrained_weight):
    """Model training loop."""
    logger = logging.getLogger(__name__)
    model, weights_file, start_iter, checkpoints, output_dir = create_model(
        pretrained_weight)
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints
    setup_model_for_training(model, weights_file, output_dir, roidb)
    training_stats = TrainingStats(model)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)
    for cur_iter in range(start_iter, max_iters):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter, lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)
        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)
        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()
        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()
    # Save the final model
    checkpoints[max_iters - 1] = os.path.join(
        output_dir, 'model_iter{}.pkl'.format(max_iters - 1))
    nu.save_model_to_weights_file(checkpoints[max_iters - 1], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
def train_model():
    """Model training loop."""
    try:
        logger = logging.getLogger(__name__)
        model, weights_file, start_iter, checkpoints, output_dir = create_model()
        if 'final' in checkpoints:
            # The final model was found in the output directory, so nothing to do
            return checkpoints
        setup_model_for_training(model, weights_file, output_dir)
        training_stats = TrainingStats(model, cfg.TRAIN.LOG_PERIOD)
        CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)
        # Initialize an empty in-memory training log
        json_train_log = []
        ########################################################
        # Early exit: only set up the model and data loader, then stop
        # before any training iterations are run.
        model.roi_data_loader.shutdown()
        return 0
        ########################################################
    except Exception as e:
        with open("/output/prep_log.txt", "a") as f:
            f.write("\n" + output_dir + " failed to start training \n" + str(e))
        exit()
    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter, lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr, json_train_log)
        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)
        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()
        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()
    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Save the training log file
    log_path = os.path.join(output_dir, 'train_log.json')
    json_train_log = {
        'info': {
            'batch_size': cfg.TRAIN.IMS_PER_BATCH,
            'num_gpus': cfg.NUM_GPUS,
            'max_iterations': cfg.SOLVER.MAX_ITER,
            'datasets': cfg.TRAIN.DATASETS
        },
        'data': json_train_log
    }
    with open(log_path, 'w') as f:
        json.dump(json_train_log, f)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
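
# Illustrative sketch (not part of the original source): reading back the
# train_log.json written above. The top-level 'info'/'data' layout matches the
# dict constructed at the end of train_model(); the per-entry keys inside
# 'data' depend on what LogIterStats appended to json_train_log and are not
# shown here, so they are treated as opaque. The helper name is hypothetical.
def _load_train_log(log_path):
    import json
    with open(log_path) as f:
        log = json.load(f)
    print('datasets: {}, batch size: {}, gpus: {}'.format(
        log['info']['datasets'], log['info']['batch_size'],
        log['info']['num_gpus']))
    return log['data']  # list of per-iteration log entries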
def main():
    # Initialize C2
    workspace.GlobalInit(
        ['caffe2', '--caffe2_log_level=0', '--caffe2_gpu_memory_tracking=1'])
    # Set up logging and load config options
    logger = setup_logging(__name__)
    logging.getLogger('detectron.roi_data.loader').setLevel(logging.INFO)
    args = parse_args()
    logger.info('Called with args:')
    logger.info(args)
    if args.cfg_file is not None:
        merge_cfg_from_file(args.cfg_file)
    if args.opts is not None:
        merge_cfg_from_list(args.opts)
    assert_and_infer_cfg()
    smi_output, cuda_ver, cudnn_ver = c2_utils.get_nvidia_info()
    logger.info("cuda version : {}".format(cuda_ver))
    logger.info("cudnn version: {}".format(cudnn_ver))
    logger.info("nvidia-smi output:\n{}".format(smi_output))
    logger.info('Training with config:')
    logger.info(pprint.pformat(cfg))
    # Note that while we set the numpy random seed network training will not be
    # deterministic in general. There are sources of non-determinism that cannot
    # be removed with a reasonable execution-speed tradeoff (such as certain
    # non-deterministic cudnn functions).
    np.random.seed(cfg.RNG_SEED)
    # Test model
    logger.info("creating test model ...")
    test_model = test_engine.initialize_model_from_cfg(cfg.TEST.WEIGHTS, gpu_id=0)
    logger.info("created test model ...")
    # NOTE: `root` and `output_dir` are assumed to be defined at module scope.
    train_data = DataLoader(root, "train_id.txt", cfg, test_model, is_train=True)
    # Create the training model
    model, weights_file, start_iter, checkpoints = create_model(
        True, cfg, output_dir)
    # Print the current workspace blobs
    print(workspace.Blobs())
    # Create the input blobs
    blob_names = ['data_stage2', 'gt_label_stage2']
    for gpu_id in range(cfg.NUM_GPUS):
        with c2_utils.NamedCudaScope(gpu_id):
            for blob_name in blob_names:
                workspace.CreateBlob(core.ScopedName(blob_name))
    # Override random weight initialization with weights from a saved model
    if weights_file:
        nu.initialize_gpu_from_weights_file(model, weights_file, gpu_id=0)
    # Even if we're randomly initializing we still need to synchronize
    # parameters across GPUs
    nu.broadcast_parameters(model)
    workspace.CreateNet(model.net)
    logger.info('Outputs saved to: {:s}'.format(os.path.abspath(output_dir)))
    dump_proto_files(model, output_dir)
    writer = SummaryWriter(log_dir=output_dir)
    training_stats = TrainingStats(model, writer)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)
    logger.info("start train ...")
    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        # Feed the next batch of stage-2 data
        # print("{} iter starting feed data...".format(cur_iter))
        data_stage2, gt_label = train_data.next_batch()
        with c2_utils.NamedCudaScope(gpu_id):
            workspace.FeedBlob(core.ScopedName('data_stage2'), data_stage2)
            workspace.FeedBlob(core.ScopedName('gt_label_stage2'), gt_label)
        # print("workspace.RunNet(model.net.Proto().name)")
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter, lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats(cur_iter)
        training_stats.LogIterStats(cur_iter, lr)
        writer.add_scalar('learning_rate', lr, cur_iter)
        # print("end of RunNet")
        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)
        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()
        if np.isnan(training_stats.iter_total_loss):
            handle_critical_error(model, 'Loss is NaN')
    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # save train loss and metric
    state_file = os.path.join(output_dir, 'training_state.json')
    training_stats.SaveTrainingStates(state_file)
    # Execute the training run
    checkpoints = detectron.utils.train.train_model()
    # Test the trained model
    # NOTE: `test_model` is shadowed above by the Caffe2 test model; this call
    # relies on the tools/train_net.py helper of the same name.
    if not args.skip_test:
        test_model(checkpoints['final'], args.multi_gpu_testing, args.opts)
def train_model(): """Model training loop.""" model, weights_file, start_iter, checkpoints, output_dir = create_model() if 'final' in checkpoints: # The final model was found in the output directory, so nothing to do return checkpoints setup_model_for_training(model, weights_file, output_dir, start_iter) training_stats = TrainingStats(model) num_iter_per_epoch = model.roi_data_loader.get_num_iter_per_epoch() if cfg.REID.TRIPLET_LOSS and cfg.REID.TRIPLET_LOSS_CROSS: num_iter_per_epoch_triplet = model.roi_data_loader.get_num_iter_per_epoch_triplet( ) # CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) CHECKPOINT_PERIOD = cfg.TRAIN.SNAPSHOT_ITERS for cur_iter in range(start_iter * num_iter_per_epoch, cfg.SOLVER.MAX_ITER * num_iter_per_epoch): cur_ep = int(cur_iter / num_iter_per_epoch) if cfg.REID.TRIPLET_LOSS and cfg.REID.TRIPLET_LOSS_CROSS: if cur_ep > cfg.REID.TRIPLET_LOSS_START and cur_ep % 2 == 1: if cur_iter % num_iter_per_epoch > num_iter_per_epoch_triplet: continue reid_utils.set_loss_scale(model, 1) else: reid_utils.set_loss_scale(model, 0) if model.roi_data_loader.has_stopped(): handle_critical_error(model, 'roi_data_loader failed') training_stats.IterTic() # lr = model.UpdateWorkspaceLr(cur_iter, lr_policy.get_lr_at_iter(cur_iter)) lr = model.UpdateWorkspaceLr( cur_ep, lr_policy.get_lr_at_iter(cur_iter, cur_ep, num_iter_per_epoch)) workspace.RunNet(model.net.Proto().name) if cfg.REID.TRIPLET_LOSS and cfg.REID.TRIPLET_LOSS_CROSS: if cur_ep > cfg.REID.TRIPLET_LOSS_START and cur_ep % 2 == 1: # check input for i in range(cfg.NUM_GPUS): data = workspace.FetchBlob('gpu_{}/{}'.format( i, 'labels_int32')) id_unique, id_counts = np.unique(data, return_counts=True) assert id_counts.shape[0] == cfg.REID.P, id_counts for id_count in id_counts: assert id_count == cfg.REID.K, id_count if cur_iter == start_iter: nu.print_net(model) training_stats.IterToc() training_stats.UpdateIterStats() training_stats.LogIterStats(cur_iter, lr) if cur_ep % CHECKPOINT_PERIOD == 0 and cur_iter == num_iter_per_epoch * ( cur_ep + 1) - 1 and cur_iter > start_iter: # checkpoints[cur_iter] = os.path.join( # output_dir, 'model_iter{}.pkl'.format(cur_iter) # ) # nu.save_model_to_weights_file(checkpoints[cur_iter], model) checkpoints[cur_ep] = os.path.join( output_dir, 'model_epoch{}.pkl'.format(cur_ep + 1)) nu.save_model_to_weights_file(checkpoints[cur_ep], model) if cur_iter == start_iter + training_stats.LOG_PERIOD: # Reset the iteration timer to remove outliers from the first few # SGD iterations training_stats.ResetIterTimer() if np.isnan(training_stats.iter_total_loss): handle_critical_error(model, 'Loss is NaN') # Save the final model checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl') nu.save_model_to_weights_file(checkpoints['final'], model) # Shutdown data loading threads model.roi_data_loader.shutdown() return checkpoints
def train_model(): """Model training loop.""" logger = logging.getLogger(__name__) model, weights_file, start_iter, checkpoints, output_dir = create_model() if 'final' in checkpoints: # The final model was found in the output directory, so nothing to do return checkpoints setup_model_for_training(model, weights_file, output_dir) training_stats = TrainingStats(model) CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) #graph = net_drawer.GetPydotGraph(model.net.Proto().op, "mnist", rankdir="LR") #graph.write_png('graph.png') #max_iter = cfg.SOLVER.MAX_ITER if cfg.SOLVER.MAX_EPOCH == -1 else 10**8 #print(max_iter) cur_iter = start_iter #for cur_iter in range(start_iter, max_iter): while True: training_stats.IterTic() if cfg.SOLVER.MAX_EPOCH == -1: lr = model.UpdateWorkspaceLr(cur_iter, lr_policy.get_lr_at_iter(cur_iter)) else: lr = model.UpdateWorkspaceLr( training_stats.cur_epoch, lr_policy.get_lr_at_epoch(training_stats)) workspace.RunNet(model.net.Proto().name) if cur_iter == start_iter: nu.print_net(model) training_stats.IterToc() training_stats.UpdateIterStats() training_stats.LogIterStats(cur_iter, lr) if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter: checkpoints[cur_iter] = os.path.join( output_dir, 'model_iter{}.pkl'.format(cur_iter)) nu.save_model_to_weights_file(checkpoints[cur_iter], model) if cur_iter == start_iter + training_stats.LOG_PERIOD: # Reset the iteration timer to remove outliers from the first few # SGD iterations training_stats.ResetIterTimer() if np.isnan(training_stats.iter_total_loss): logger.critical('Loss is NaN, exiting...') model.roi_data_loader.shutdown() envu.exit_on_error() if cfg.SOLVER.MAX_EPOCH == -1 and cur_iter == cfg.SOLVER.MAX_ITER: break if cfg.SOLVER.MAX_EPOCH != -1 and training_stats.cur_epoch == cfg.SOLVER.MAX_EPOCH + 1: break cur_iter += 1 # Save the final model checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl') nu.save_model_to_weights_file(checkpoints['final'], model) # Shutdown data loading threads model.roi_data_loader.shutdown() return checkpoints
def train_model(): """Model training loop.""" logger = logging.getLogger(__name__) model, weights_file, start_iter, checkpoints, output_dir = create_model( ) #for create model if 'final' in checkpoints: # The final model was found in the output directory, so nothing to do return checkpoints if 0: output_dir = '/home/icubic/daily_work/code/Detectron/train/coco_2014_train_ET_PH_part/generalized_rcnn_multi/' #output_dir = output_dir + '_101' setup_model_for_training(model, weights_file, output_dir) training_stats = TrainingStats(model) uuuu = model.roi_data_loader._blobs_queue_name CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) print('------------train.py') for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER): training_stats.IterTic() lr = model.UpdateWorkspaceLr(cur_iter, lr_policy.get_lr_at_iter(cur_iter)) #aaa_debug = workspace.FetchBlob('gpu_0/data') #bbb_debug = workspace.FetchBlob('gpu_0/conv1_w') #ccc_debug = workspace.FetchBlob('gpu_0/'+uuuu) try: workspace.RunNet(model.net.Proto().name) if 0: #import detectron.utils.blob as blob_utils inputs = [workspace.FetchBlob("gpu_0/rpn_rois_fpn2"),workspace.FetchBlob("gpu_0/rpn_rois_fpn3"),workspace.FetchBlob("gpu_0/rpn_rois_fpn4"),workspace.FetchBlob("gpu_0/rpn_rois_fpn5"), \ workspace.FetchBlob("gpu_0/rpn_rois_fpn6"),workspace.FetchBlob("gpu_0/rpn_roi_probs_fpn2"),workspace.FetchBlob("gpu_0/rpn_roi_probs_fpn3"),workspace.FetchBlob("gpu_0/rpn_roi_probs_fpn4"), \ workspace.FetchBlob("gpu_0/rpn_roi_probs_fpn5"),workspace.FetchBlob("gpu_0/rpn_roi_probs_fpn6"),workspace.FetchBlob("gpu_0/roidb"),workspace.FetchBlob("gpu_0/im_info"),\ ] rois = collect(inputs, True) #inputs.append(workspace.FetchBlob("gpu_0/rpn_rois_fpn2")) im_info = inputs[-1] im_scales = im_info[:, 2] roidb = blob_utils.deserialize(inputs[-2]) # For historical consistency with the original Faster R-CNN # implementation we are *not* filtering crowd proposals. # This choice should be investigated in the future (it likely does # not matter). 
json_dataset.add_proposals(roidb, rois, im_scales, crowd_thresh=0) roidb_utils.add_bbox_regression_targets(roidb) # Compute training labels for the RPN proposals; also handles # distributing the proposals over FPN levels output_blob_names = fast_rcnn_roi_data.get_fast_rcnn_blob_names( ) blobs = {k: [] for k in output_blob_names} fast_rcnn_roi_data.add_fast_rcnn_blobs(blobs, im_scales, roidb) for i, k in enumerate(output_blob_names): blob_utils.py_op_copy_blob(blobs[k], outputs[i]) #if (np.sum(bb == 1))>0: # print('cc') except: aa = workspace.FetchBlob("gpu_0/rpn_rois_fpn2") aaa_debug = workspace.FetchBlob('gpu_0/data') print('aaaaaerror') #print("blobs:\n{}".format(workspace.Blobs())) #print('train.py aaaaaaaa_debug') if 1: aaa = workspace.FetchBlob("gpu_0/data") # nchw #img = aaa[1].copy() # BGR HWC -> CHW 12 #transform_img = img.swapaxes(0, 1).swapaxes(1, 2) #cv2.imshow("image0 ", transform_img[:, :, (2, 1, 0)]) #cv2.waitKey(0) #cv2.destroyAllWindows() #cv2.imshow('/home/icubic/daily_work/code/Detectron/aaa.png', aaa[0]) aaa_debug = workspace.FetchBlob('gpu_0/data') bbb_debug = workspace.FetchBlob('gpu_0/conv1_w') ccc_debug = workspace.FetchBlob('gpu_0/' + uuuu) ddd_debug = workspace.FetchBlob('gpu_0/roidb') eee_debug = workspace.FetchBlob('gpu_0/im_info') #print("Fetched data:\n{}".format(workspace.FetchBlob("gpu_0/data"))) if cur_iter == start_iter: nu.print_net(model) training_stats.IterToc() training_stats.UpdateIterStats() training_stats.LogIterStats(cur_iter, lr) if (cur_iter + 1) % ( CHECKPOINT_PERIOD / 4 ) == 0 and cur_iter > start_iter: #((cur_iter + 1) % (CHECKPOINT_PERIOD/1) == 0 and (cur_iter > start_iter and cur_iter < 50000)) or ((cur_iter + 1) % (CHECKPOINT_PERIOD/8) == 0 and cur_iter > 50000): checkpoints[cur_iter] = os.path.join( output_dir, 'model_iter_50_{}.pkl'.format(cur_iter)) nu.save_model_to_weights_file(checkpoints[cur_iter], model) if cur_iter == start_iter + training_stats.LOG_PERIOD: # Reset the iteration timer to remove outliers from the first few # SGD iterations training_stats.ResetIterTimer() if np.isnan(training_stats.iter_total_loss): logger.critical('Loss is NaN, exiting...') model.roi_data_loader.shutdown() envu.exit_on_error() # Save the final model checkpoints['final'] = os.path.join(output_dir, 'model_final_50.pkl') nu.save_model_to_weights_file(checkpoints['final'], model) # Shutdown data loading threads model.roi_data_loader.shutdown() return checkpoints
def train_model(): """Model training loop.""" start_time = time.time() model, weights_file, start_iter, checkpoints, output_dir = create_model() if 'final' in checkpoints: # The final model was found in the output directory, so nothing to do return checkpoints setup_model_for_training(model, weights_file, output_dir) training_stats = TrainingStats(model) CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) if model.train and cfg.TRAIN.PADA: if not hasattr(model, 'class_weight_db'): model.class_weight_db = pada.ClassWeightDB() model.class_weight_db.setup(model.roi_data_loader) if cfg.TRAIN.DA_FADE_IN: model.da_fade_in = pada.DAScaleFading(cfg.SOLVER.MAX_ITER) # if cfg.INTERRUPTING: # source_set_size = len(model.roi_data_loader._roidb) # if cfg.TRAIN.DOMAIN_ADAPTATION: # source_ims_per_batch = cfg.NUM_GPUS * (cfg.TRAIN.IMS_PER_BATCH//2) # else: # source_ims_per_batch = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH # CHECKPOINT_PERIOD = int(1.0 + source_set_size / (source_ims_per_batch * cfg.NUM_GPUS)) # print("Checkpoint period, and interruption, set for after {} batches".format(CHECKPOINT_PERIOD)) for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER): # print('iter:',cur_iter) # print(model.roi_data_loader._cur,list(model.roi_data_loader._perm)[:10]) if model.roi_data_loader.has_stopped(): handle_critical_error(model, 'roi_data_loader failed') training_stats.IterTic() lr = model.UpdateWorkspaceLr(cur_iter, lr_policy.get_lr_at_iter(cur_iter)) workspace.RunNet(model.net.Proto().name) if cur_iter == start_iter: nu.print_net(model) # blob_summary(['conv{}_{}'.format(i,j) for i in [1,2,3,4,5] for j in [1,2,3] if not ((j==3) and (i < 3))]) training_stats.IterToc() training_stats.UpdateIterStats() training_stats.LogIterStats(cur_iter, lr) if (cur_iter) % (training_stats.LOG_PERIOD * 50) == 0: print_conf_matrix(model.class_weight_db.conf_matrix) pool2 = workspace.FetchBlob('gpu_0/pool2').astype(float) print('pool2 max: {}'.format(pool2.max())) # blob_summary(['conv3_1_w','conv3_1_w_grad','conv3_1_b','conv5_3','da_fc7','da_conv_2','dc_ip3','dc_ip3_w','dc_ip2_w_grad']) blob_summary([ 'conv3_1_w', 'conv3_1_w_grad', 'conv3_1_b', 'da_conv_2', 'dc_ip3', 'dc_ip3_w', 'dc_ip2_w_grad' ]) # light if cfg.INTERRUPTING and time.time() - start_time > cfg.THRESH_TIME: checkpoints[cur_iter] = os.path.join( output_dir, 'model_iter{}.pkl'.format(cur_iter)) nu.save_model_to_weights_file(checkpoints[cur_iter], model, cur_iter=cur_iter) # stop this process and restart to continue form the checkpoint. 
model.roi_data_loader.shutdown() if cfg.TRAIN.DOMAIN_ADAPTATION: # triggers target data loader to stop: with open('./TargetDataLoaderProcess/read.txt', 'w') as f: f.write(str(0)) f.flush() os.fsync(f.fileno()) # wait a bit for it to stop: time.sleep(5) # enqueue new job: os.system('sbatch run.job') return checkpoints if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter: checkpoints[cur_iter] = os.path.join( output_dir, 'model_iter{}.pkl'.format(cur_iter)) nu.save_model_to_weights_file(checkpoints[cur_iter], model, cur_iter=cur_iter) if cur_iter == start_iter + training_stats.LOG_PERIOD: # Reset the iteration timer to remove outliers from the first few # SGD iterations training_stats.ResetIterTimer() v = training_stats.iter_total_loss + model.class_weight_db.avg_pada_weight if training_stats.iter_total_loss > 4: # print('Loss is {}'.format(training_stats.iter_total_loss)) pool2 = workspace.FetchBlob('gpu_0/pool2').astype(float) print('pool2 max: {}'.format(pool2.max())) blob_summary([ 'conv3_1_w', 'conv3_1_w_grad', 'conv3_1_b', 'conv5_3', 'da_fc7', 'da_conv_2', 'dc_ip3', 'dc_ip3_w', 'dc_ip2_w_grad' ]) if np.isnan(v) or v == np.infty or v == -np.infty: nu.print_net(model) blobs = workspace.Blobs() print() print("Current blobs in the workspace:\n{}".format( '\n'.join(blobs))) print() for blob in blobs: print("Fetched {}:\n{}".format(blob, workspace.FetchBlob(blob))) print() blob_summary([ 'conv3_1_w', 'conv3_1_b', 'conv5_3', 'da_fc7', 'da_conv_2', 'dc_ip3', 'dc_ip3_w', 'dc_ip2_w_grad' ]) blob_summary() handle_critical_error(model, 'Loss is {}'.format(v)) # Save the final model checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl') nu.save_model_to_weights_file(checkpoints['final'], model, cur_iter=cur_iter) # Shutdown data loading threads model.roi_data_loader.shutdown() return checkpoints