def train_model(use_tfboard=False):
    """Model training loop."""
    logger = logging.getLogger(__name__)
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints
    if use_tfboard:
        from c2board.writer import SummaryWriter
        tblogger = SummaryWriter(output_dir)
        tblogger.write_graph(model)
    setup_model_for_training(model, weights_file, output_dir)
    training_stats = TrainingStats(model, tblogger if use_tfboard else None)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)
    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(
            cur_iter, lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)
        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter)
            )
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)
        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()
        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()
        if use_tfboard:
            # Guarded: `tblogger` only exists when use_tfboard is True
            for gpu_id in range(cfg.NUM_GPUS):
                tblogger.append_image('gpu_{}/data'.format(gpu_id))
            tblogger.write_summaries(cur_iter)
    if use_tfboard:
        tblogger.close()
    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
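

# A worked example of the CHECKPOINT_PERIOD arithmetic shared by the variants
# in this file (the values below are illustrative; in training they come from
# cfg.TRAIN.SNAPSHOT_ITERS and cfg.NUM_GPUS):
def _example_checkpoint_iters(snapshot_iters=20000, num_gpus=4,
                              max_iter=20000, start_iter=0):
    """Sketch: the cur_iter values at which a checkpoint is saved."""
    checkpoint_period = int(snapshot_iters / num_gpus)  # 5000 here
    # Saves trigger when (cur_iter + 1) is a multiple of the period, so for
    # these values the saved iterations are 4999, 9999, 14999, 19999.
    return [i for i in range(start_iter, max_iter)
            if (i + 1) % checkpoint_period == 0 and i > start_iter]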


def train_model():
    """Model training loop."""
    logger = logging.getLogger(__name__)
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints
    setup_model_for_training(model, weights_file, output_dir)
    training_stats = TrainingStats(model)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)
    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(
            cur_iter, lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)
        if cur_iter == start_iter:
            # Debugging hook: blobs to fetch from the workspace for offline
            # inspection of the keypoint training data
            data_to_save = [
                'gpu_0/data',
                'gpu_0/keypoint_rois',
                'gpu_0/inter_keypoint_int32',
                'gpu_0/keypoint_locations_int32'
            ]
            # To dump them:
            #   data = [workspace.FetchBlob(k) for k in data_to_save]
            #   cPickle.dump(data, open('inter_kps_data.pkl', 'wb'))
        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)
        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()
        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()
    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
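

# A minimal sketch of the blob-dumping hook that the variant above keeps in
# comments: fetch named blobs from the Caffe2 workspace and pickle them for
# offline inspection. `dump_blobs` and its arguments are illustrative names
# (the original comment used Python 2's cPickle); the blob names must exist
# in the workspace when this is called.
import pickle

from caffe2.python import workspace


def dump_blobs(blob_names, path):
    """Fetch each named blob and pickle the mapping name -> ndarray."""
    data = {name: workspace.FetchBlob(name) for name in blob_names}
    with open(path, 'wb') as f:
        pickle.dump(data, f)


# Usage, mirroring the data_to_save list above:
# dump_blobs(['gpu_0/data', 'gpu_0/keypoint_rois'], 'inter_kps_data.pkl')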


def train_model():
    """Model training loop."""
    logger = logging.getLogger(__name__)
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints
    setup_model_for_training(model, weights_file, output_dir)
    training_stats = TrainingStats(model)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)
    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(
            cur_iter, lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        # Disabled debugging hook: snapshot input/activation blobs to disk
        #   np.save('DensePoseData/image.npy', workspace.FetchBlob('gpu_0/data'))
        #   np.save('DensePoseData/outputgpu.npy', workspace.FetchBlob('gpu_0/conv1'))
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)
        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            logger.info('Checkpoint reached; saving model')
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)
        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()
        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()
    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints


def train_model(max_iters, roidb, pretrained_weight):
    """Model training loop."""
    logger = logging.getLogger(__name__)
    model, weights_file, start_iter, checkpoints, output_dir = create_model(
        pretrained_weight)
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints
    setup_model_for_training(model, weights_file, output_dir, roidb)
    training_stats = TrainingStats(model)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)
    for cur_iter in range(start_iter, max_iters):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(
            cur_iter, lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)
        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)
        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()
        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()
    # Save the final model
    checkpoints[max_iters - 1] = os.path.join(
        output_dir, 'model_iter{}.pkl'.format(max_iters - 1))
    nu.save_model_to_weights_file(checkpoints[max_iters - 1], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
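

# The variant above stores its final snapshot as 'model_iter{max_iters-1}.pkl'
# rather than 'model_final.pkl', so downstream code has to find the newest
# iteration checkpoint itself. A small sketch, assuming only the naming
# scheme used above (`latest_checkpoint` is an illustrative helper):
import glob
import os
import re


def latest_checkpoint(output_dir):
    """Return the model_iter*.pkl path with the highest iteration, or None."""
    paths = glob.glob(os.path.join(output_dir, 'model_iter*.pkl'))

    def iter_of(path):
        m = re.search(r'model_iter(\d+)\.pkl$', path)
        return int(m.group(1)) if m else -1

    return max(paths, key=iter_of) if paths else None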


def train_model():
    """Model training loop."""
    logger = logging.getLogger(__name__)
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints
    setup_model_for_training(model, weights_file, output_dir)
    training_stats = TrainingStats(model)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)
    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(
            cur_iter, lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)
        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter)
            )
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)
        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()
        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()
    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints


def train_model():
    """Model training loop."""
    logger = logging.getLogger(__name__)
    output_dir = ''  # so the except handler below can reference it safely
    try:
        model, weights_file, start_iter, checkpoints, output_dir = \
            create_model()
        if 'final' in checkpoints:
            # The final model was found in the output directory, so nothing
            # to do
            return checkpoints
        setup_model_for_training(model, weights_file, output_dir)
        training_stats = TrainingStats(model, cfg.TRAIN.LOG_PERIOD)
        CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)
        # Initialize the in-memory training log; LogIterStats appends to it
        # and it is serialized to JSON after training
        json_train_log = []
    except Exception as e:
        with open('/output/prep_log.txt', 'a') as f:
            f.write('\n' + output_dir + ' failed to start training \n' + str(e))
        exit()
    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(
            cur_iter, lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr, json_train_log)
        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)
        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()
        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()
    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Save the training log file
    log_path = os.path.join(output_dir, 'train_log.json')
    json_train_log = {
        'info': {
            'batch_size': cfg.TRAIN.IMS_PER_BATCH,
            'num_gpus': cfg.NUM_GPUS,
            'max_iterations': cfg.SOLVER.MAX_ITER,
            'datasets': cfg.TRAIN.DATASETS
        },
        'data': json_train_log
    }
    with open(log_path, 'w') as f:
        json.dump(json_train_log, f)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
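

# A small sketch of reading back the train_log.json written by the variant
# above, assuming only the schema it assembles ({'info': {...}, 'data': [...]};
# the per-iteration record format is whatever LogIterStats appends):
import json
import os


def load_train_log(output_dir):
    """Return the (info, data) pair from a saved train_log.json."""
    with open(os.path.join(output_dir, 'train_log.json')) as f:
        log = json.load(f)
    return log['info'], log['data']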


def train_model():
    """Model training loop."""
    logger = logging.getLogger(__name__)
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints
    setup_model_for_training(model, weights_file, output_dir)
    training_stats = TrainingStats(model)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)
    # This variant supports an epoch-based budget: when cfg.SOLVER.MAX_EPOCH
    # is set (!= -1), training runs until the epoch count is reached instead
    # of a fixed iteration count, hence the open-ended while loop.
    cur_iter = start_iter
    while True:
        training_stats.IterTic()
        if cfg.SOLVER.MAX_EPOCH == -1:
            lr = model.UpdateWorkspaceLr(
                cur_iter, lr_policy.get_lr_at_iter(cur_iter))
        else:
            lr = model.UpdateWorkspaceLr(
                training_stats.cur_epoch,
                lr_policy.get_lr_at_epoch(training_stats))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)
        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)
        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()
        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()
        # Terminate on the iteration budget, or one past the last epoch
        if cfg.SOLVER.MAX_EPOCH == -1 and cur_iter == cfg.SOLVER.MAX_ITER:
            break
        if (cfg.SOLVER.MAX_EPOCH != -1
                and training_stats.cur_epoch == cfg.SOLVER.MAX_EPOCH + 1):
            break
        cur_iter += 1
    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
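

# The two stopping conditions of the epoch-aware variant above, restated as a
# standalone predicate (a sketch; `cur_epoch` stands for whatever epoch
# counter TrainingStats maintains in that fork):
def _should_stop(cur_iter, cur_epoch, max_iter, max_epoch):
    """True once the iteration budget (epoch mode off, max_epoch == -1) or
    the epoch budget (epoch mode on) is exhausted."""
    if max_epoch == -1:
        return cur_iter == max_iter
    return cur_epoch == max_epoch + 1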


def train_model():
    """Model training loop."""
    logger = logging.getLogger(__name__)
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints
    # Optional hard-coded output_dir override used during local debugging:
    # output_dir = ('/home/icubic/daily_work/code/Detectron/train/'
    #               'coco_2014_train_ET_PH_part/generalized_rcnn_multi/')
    setup_model_for_training(model, weights_file, output_dir)
    training_stats = TrainingStats(model)
    blobs_queue_name = model.roi_data_loader._blobs_queue_name
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)
    # Disabled debugging path: re-run the FPN proposal collection and label
    # computation in Python so the RPN outputs can be inspected. Note that
    # `outputs` (the Python-op output blobs) is not defined in this scope,
    # which is one reason the block must stay disabled.
    DEBUG_PROPOSALS = False
    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(
            cur_iter, lr_policy.get_lr_at_iter(cur_iter))
        try:
            workspace.RunNet(model.net.Proto().name)
            if DEBUG_PROPOSALS:
                inputs = [
                    workspace.FetchBlob('gpu_0/rpn_rois_fpn2'),
                    workspace.FetchBlob('gpu_0/rpn_rois_fpn3'),
                    workspace.FetchBlob('gpu_0/rpn_rois_fpn4'),
                    workspace.FetchBlob('gpu_0/rpn_rois_fpn5'),
                    workspace.FetchBlob('gpu_0/rpn_rois_fpn6'),
                    workspace.FetchBlob('gpu_0/rpn_roi_probs_fpn2'),
                    workspace.FetchBlob('gpu_0/rpn_roi_probs_fpn3'),
                    workspace.FetchBlob('gpu_0/rpn_roi_probs_fpn4'),
                    workspace.FetchBlob('gpu_0/rpn_roi_probs_fpn5'),
                    workspace.FetchBlob('gpu_0/rpn_roi_probs_fpn6'),
                    workspace.FetchBlob('gpu_0/roidb'),
                    workspace.FetchBlob('gpu_0/im_info'),
                ]
                rois = collect(inputs, True)
                im_info = inputs[-1]
                im_scales = im_info[:, 2]
                roidb = blob_utils.deserialize(inputs[-2])
                # For historical consistency with the original Faster R-CNN
                # implementation we are *not* filtering crowd proposals. This
                # choice should be investigated in the future (it likely does
                # not matter).
                json_dataset.add_proposals(roidb, rois, im_scales, crowd_thresh=0)
                roidb_utils.add_bbox_regression_targets(roidb)
                # Compute training labels for the RPN proposals; also handles
                # distributing the proposals over FPN levels
                output_blob_names = fast_rcnn_roi_data.get_fast_rcnn_blob_names()
                blobs = {k: [] for k in output_blob_names}
                fast_rcnn_roi_data.add_fast_rcnn_blobs(blobs, im_scales, roidb)
                for i, k in enumerate(output_blob_names):
                    blob_utils.py_op_copy_blob(blobs[k], outputs[i])
        except Exception:
            # This variant deliberately swallows the error and keeps training,
            # fetching the failing batch first so it can be inspected
            rpn_rois_fpn2 = workspace.FetchBlob('gpu_0/rpn_rois_fpn2')
            data = workspace.FetchBlob('gpu_0/data')
            logger.error('RunNet failed at iteration {}'.format(cur_iter))
        # Per-iteration debugging fetches (enabled in this variant); the data
        # blob is NCHW in BGR channel order
        data = workspace.FetchBlob('gpu_0/data')
        conv1_w = workspace.FetchBlob('gpu_0/conv1_w')
        queue_blob = workspace.FetchBlob('gpu_0/' + blobs_queue_name)
        roidb_blob = workspace.FetchBlob('gpu_0/roidb')
        im_info = workspace.FetchBlob('gpu_0/im_info')
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)
        # This variant checkpoints four times as often as the others
        if ((cur_iter + 1) % (CHECKPOINT_PERIOD // 4) == 0
                and cur_iter > start_iter):
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter_50_{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)
        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()
        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()
    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final_50.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
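

# A sketch of the visualization that the commented-out cv2 code in the
# variant above performed: pull one image out of the NCHW 'gpu_0/data' batch
# and convert it back to HWC for display. The blob name comes from the code
# above; any pixel-mean normalization applied by the data pipeline is
# ignored here, so colors may look off.
import cv2
import numpy as np
from caffe2.python import workspace


def show_input_image(index=0):
    """Display image `index` of the current input batch."""
    data = workspace.FetchBlob('gpu_0/data')   # NCHW, BGR channel order
    img = data[index].transpose(1, 2, 0)       # CHW -> HWC
    cv2.imshow('gpu_0/data[{}]'.format(index), img.astype(np.uint8))
    cv2.waitKey(0)
    cv2.destroyAllWindows()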