Example #1
def train_model():
    """Model training loop."""
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints

    setup_model_for_training(model, weights_file, output_dir)
    training_stats = TrainingStats(model)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

    # Memcached client used to hand per-iteration outputs to another process.
    mc = pylibmc.Client(["127.0.0.1:11212"],
                        binary=True,
                        behaviors={
                            "tcp_nodelay": True,
                            "ketama": True
                        })

    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        if model.roi_data_loader.has_stopped():
            handle_critical_error(model, 'roi_data_loader failed')
        training_stats.IterTic()

        workspace.RunNet(model.net.Proto().name)

        # Fetch the detection-head outputs so they can be handed off to an
        # external consumer via memcached.
        freeze_fastrcnn_label = []
        freeze_fastrcnn_label.append(workspace.FetchBlob('gpu_0/cls_score'))
        freeze_fastrcnn_label.append(workspace.FetchBlob('gpu_0/bbox_pred'))

        # Busy-wait until the consumer marks the previous payload as read
        # ('yidu'), then publish the new blobs and flag them as unread
        # ('weidu').
        while mc.get('freeze_fastrcnn_label_s') != 'yidu':
            pass
        mc.replace('freeze_fastrcnn_label', freeze_fastrcnn_label)
        mc.replace('freeze_fastrcnn_label_s', 'weidu')

        training_stats.IterToc()
        training_stats.UpdateIterStats()

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            #nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            handle_critical_error(model, 'Loss is NaN')

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    #nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
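
The busy-wait in Example #1 is one half of a producer/consumer handshake over memcached: the training loop publishes cls_score and bbox_pred once the peer has flagged the previous payload as read ('yidu'), then marks the new payload unread ('weidu'). Below is a minimal sketch of the consumer side, assuming the producer above, a memcached server on 127.0.0.1:11212, and the key and flag names taken from the producer; everything else (key seeding, polling interval) is an assumption. Note that memcached's default 1 MB item size limit may need to be raised for large blobs.

import time

import pylibmc

# Hypothetical consumer peer for Example #1 (not part of the original code):
# it polls the same memcached instance, reads each published payload, and
# flips the flag back so the producer can continue.
mc = pylibmc.Client(["127.0.0.1:11212"],
                    binary=True,
                    behaviors={"tcp_nodelay": True, "ketama": True})

# Seed the keys so the producer's mc.replace() calls have something to
# replace, and signal that it may publish the first payload.
mc.set('freeze_fastrcnn_label', [])
mc.set('freeze_fastrcnn_label_s', 'yidu')

while True:
    # Wait until the producer marks a fresh payload as unread ('weidu').
    if mc.get('freeze_fastrcnn_label_s') != 'weidu':
        time.sleep(0.01)
        continue
    cls_score, bbox_pred = mc.get('freeze_fastrcnn_label')
    # ... consume cls_score / bbox_pred here ...
    # Mark the payload as read ('yidu') so the producer publishes the next one.
    mc.set('freeze_fastrcnn_label_s', 'yidu')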
Example #2
def train_model():
    """Model training loop."""
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints

    with SummaryWriter(log_dir=get_output_dir(cfg.TRAIN.DATASETS) +
                       "/events") as writer:
        writer.write_graph([model.net])

    logger = logging.getLogger(__name__)
    setup_model_for_training(model, weights_file, output_dir)
    training_stats = TrainingStats(model)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)
    error_count = 0

    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        if model.roi_data_loader.has_stopped():
            handle_critical_error(model, 'roi_data_loader failed')
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter,
                                     lr_policy.get_lr_at_iter(cur_iter))
        try:
            workspace.RunNet(model.net.Proto().name)
        except Exception:
            error_count += 1
            logger.warning("Error in iter {}, error count: {}".format(
                cur_iter, error_count))
            if not cfg.CONTINUE_ON_ERROR:
                raise

        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + cfg.TRAIN.EPOCH_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            handle_critical_error(model, 'Loss is NaN')

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
Example #3
def train_model(use_tfboard=False):
    """Model training loop."""
    logger = logging.getLogger(__name__)
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints

    if use_tfboard:
        from c2board.writer import SummaryWriter
        tblogger = SummaryWriter(output_dir)
        tblogger.write_graph(model)

    setup_model_for_training(model, weights_file, output_dir)
    training_stats = TrainingStats(model, tblogger if use_tfboard else None)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter, lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter)
            )
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()

        if use_tfboard:
            # Log input images and scalar summaries to TensorBoard.
            for gpu_id in range(cfg.NUM_GPUS):
                tblogger.append_image("gpu_{}/data".format(gpu_id))
            tblogger.write_summaries(cur_iter)

    if use_tfboard:
        tblogger.close()

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
Example #4
def train_model():
    """Model training loop."""
    logger = logging.getLogger(__name__)
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints

    setup_model_for_training(model, weights_file, output_dir)
    training_stats = TrainingStats(model)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter,
                                     lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)
        if cur_iter == start_iter:
            # Debugging hook: candidate blob lists for dumping first-iteration
            # outputs; the actual fetch/pickle calls below are commented out.
            #data_to_save = ['gpu_0/mask_emb_logits','gpu_0/mask_emb_labels','gpu_0/mask_emb_prob','gpu_0/person_mask','gpu_0/body_mask_labels','gpu_0/fg_emb','gpu_0/mask_fcn_emb','gpu_0/fg_emb_normed','gpu_0/bg_emb_normed']
            # data_to_save = ['gpu_0/mask_emb_logits','gpu_0/mask_emb_labels','gpu_0/mask_emb_prob','gpu_0/mask_fcn_logits','gpu_0/masks_int32','gpu_0/fg_emb','gpu_0/mask_fcn_emb','gpu_0/fg_emb_normed','gpu_0/bg_emb_normed']
            # data_to_save = ['gpu_0/data','gpu_0/body_uv_rois','gpu_0/body_masks_wrt_box','gpu_0/body_mask_labels']
            # data_to_save = ['gpu_0/data', 'gpu_0/mask_rois', 'gpu_0/inter_masks_int32', 'gpu_0/masks_int32']
            data_to_save = [
                'gpu_0/data', 'gpu_0/keypoint_rois',
                'gpu_0/inter_keypoint_int32', 'gpu_0/keypoint_locations_int32'
            ]
            #data = [workspace.FetchBlob(k) for k in data_to_save]
            #cPickle.dump(data,open('inter_kps_data.pkl','wb'))

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
Example #5
def train_model():
    """Model training loop."""
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints

    setup_model_for_training(model, weights_file, output_dir)
    training_stats = TrainingStats(model)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)
    TEST_PERIOD = int(cfg.TRAIN.TEST_ITERS)

    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        if model.roi_data_loader.has_stopped():
            handle_critical_error(model, 'roi_data_loader failed')
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter,
                                     lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            print("================SAVING MODEL=================")
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)
        if (cur_iter + 1) % TEST_PERIOD == 0 and cur_iter > start_iter:
            print("================RUN INFERENCE==================")
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)
            run_inference(checkpoints[cur_iter],
                          multi_gpu_testing=False,
                          check_expected_results=True)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            handle_critical_error(model, 'Loss is NaN')

    # Save the final model
    print("=====================FINAL=======================")
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
Example #6
def train_model():
    """Model training loop."""
    logger = logging.getLogger(__name__)
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints

    setup_model_for_training(model, weights_file, output_dir)
    training_stats = TrainingStats(model)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter,
                                     lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        # np.save('DensePoseData/image.npy', workspace.FetchBlob('gpu_0/data'))
        # # np.save('DensePoseData/output.npy',workspace.FetchBlob('conv1'))
        # np.save('DensePoseData/outputgpu.npy',workspace.FetchBlob('gpu_0/conv1'))

        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            logger.info("\n\nCheckpoint Reached....Saving model \n\n")
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
Example #7
def train_model():
    """Model training loop."""
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints

    setup_model_for_training(model, weights_file, output_dir)
    writer = SummaryWriter(log_dir=output_dir)
    training_stats = TrainingStats(model, writer)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        if model.roi_data_loader.has_stopped():
            handle_critical_error(model, 'roi_data_loader failed')
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter,
                                     lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats(cur_iter)
        training_stats.LogIterStats(cur_iter, lr)
        writer.add_scalar('learning_rate', lr, cur_iter)

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            handle_critical_error(model, 'Loss is NaN')

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # save train loss and metric
    state_file = os.path.join(output_dir, 'training_state.json')
    training_stats.SaveTrainingStates(state_file)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
Example #8
def train_model(max_iters, roidb, pretrained_weight):
    """Model training loop."""
    logger = logging.getLogger(__name__)
    model, weights_file, start_iter, checkpoints, output_dir = create_model(
        pretrained_weight)
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints

    setup_model_for_training(model, weights_file, output_dir, roidb)
    training_stats = TrainingStats(model)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

    for cur_iter in range(start_iter, max_iters):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter,
                                     lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()

    # Save the final model
    checkpoints[max_iters - 1] = os.path.join(
        output_dir, 'model_iter{}.pkl'.format(max_iters - 1))
    nu.save_model_to_weights_file(checkpoints[max_iters - 1], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
Example #9
def train_model():
    """Model training loop."""
    try:
        logger = logging.getLogger(__name__)
        model, weights_file, start_iter, checkpoints, output_dir = \
            create_model()
        if 'final' in checkpoints:
            # The final model was found in the output directory, so nothing to do
            return checkpoints

        setup_model_for_training(model, weights_file, output_dir)
        training_stats = TrainingStats(model, cfg.TRAIN.LOG_PERIOD)
        CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

        # initialize empty log file
        json_train_log = []

        ########################################################
        # Debug short-circuit: stop after setup, before the training loop
        # below ever runs.
        model.roi_data_loader.shutdown()
        return 0
        ########################################################
    except Exception as e:
        with open("/output/prep_log.txt", "a") as f:
            f.write("\n" + output_dir + " failed to start training \n" +
                    str(e))
        exit()

    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter,
                                     lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr, json_train_log)

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)

    # Save training log-file
    log_path = os.path.join(output_dir, 'train_log.json')
    json_train_log = {
        'info': {
            'batch_size': cfg.TRAIN.IMS_PER_BATCH,
            'num_gpus': cfg.NUM_GPUS,
            'max_iterations': cfg.SOLVER.MAX_ITER,
            'datasets': cfg.TRAIN.DATASETS
        },
        'data': json_train_log
    }
    with open(log_path, 'w') as f:
        json.dump(json_train_log, f)

    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
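
Example #9 writes its training log to train_log.json as a dict with an 'info' header and a 'data' list. The sketch below reads that file back, assuming only the structure visible above (what each 'data' entry contains depends on what LogIterStats appends and is not shown in the source); the path is a placeholder.

import json
import os

# Placeholder path: in Example #9 the file lives in the run's output_dir.
log_path = os.path.join('/path/to/output_dir', 'train_log.json')

with open(log_path) as f:
    log = json.load(f)

info = log['info']
print('datasets:       {}'.format(info['datasets']))
print('batch size:     {}'.format(info['batch_size']))
print('num gpus:       {}'.format(info['num_gpus']))
print('max iterations: {}'.format(info['max_iterations']))
print('logged entries: {}'.format(len(log['data'])))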
Example #10
def main():
    # Initialize C2
    workspace.GlobalInit(
        ['caffe2', '--caffe2_log_level=0', '--caffe2_gpu_memory_tracking=1'])
    # Set up logging and load config options
    logger = setup_logging(__name__)
    logging.getLogger('detectron.roi_data.loader').setLevel(logging.INFO)
    args = parse_args()
    logger.info('Called with args:')
    logger.info(args)
    if args.cfg_file is not None:
        merge_cfg_from_file(args.cfg_file)
    if args.opts is not None:
        merge_cfg_from_list(args.opts)
    assert_and_infer_cfg()
    smi_output, cuda_ver, cudnn_ver = c2_utils.get_nvidia_info()
    logger.info("cuda version : {}".format(cuda_ver))
    logger.info("cudnn version: {}".format(cudnn_ver))
    logger.info("nvidia-smi output:\n{}".format(smi_output))
    logger.info('Training with config:')
    logger.info(pprint.pformat(cfg))
    # Note that while we set the numpy random seed network training will not be
    # deterministic in general. There are sources of non-determinism that cannot
    # be removed with a reasonble execution-speed tradeoff (such as certain
    # non-deterministic cudnn functions).
    np.random.seed(cfg.RNG_SEED)
    # Create the test model used by the data loader below
    logger.info("create test model ...")
    test_model = test_engine.initialize_model_from_cfg(cfg.TEST.WEIGHTS,
                                                       gpu_id=0)
    logger.info("created test model ...")
    train_data = DataLoader(root,
                            "train_id.txt",
                            cfg,
                            test_model,
                            is_train=True)
    # create the training model
    model, weights_file, start_iter, checkpoints = create_model(
        True, cfg, output_dir)
    # test blob
    print(workspace.Blobs())
    # create input blob
    blob_names = ['data_stage2', 'gt_label_stage2']
    for gpu_id in range(cfg.NUM_GPUS):
        with c2_utils.NamedCudaScope(gpu_id):
            for blob_name in blob_names:
                workspace.CreateBlob(core.ScopedName(blob_name))
    # Override random weight initialization with weights from a saved model
    if weights_file:
        nu.initialize_gpu_from_weights_file(model, weights_file, gpu_id=0)
    # Even if we're randomly initializing we still need to synchronize
    # parameters across GPUs
    nu.broadcast_parameters(model)
    workspace.CreateNet(model.net)

    logger.info('Outputs saved to: {:s}'.format(os.path.abspath(output_dir)))
    dump_proto_files(model, output_dir)

    writer = SummaryWriter(log_dir=output_dir)
    training_stats = TrainingStats(model, writer)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)
    logger.info("start train ...")
    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        # feed data
        # print("{} iter starting feed data...".format(cur_iter))
        data_stage2, gt_label = train_data.next_batch()
        # NOTE: gpu_id here is whatever value was left over from the
        # blob-creation loop above (cfg.NUM_GPUS - 1); the batch is fed to
        # that GPU's scoped blobs.
        with c2_utils.NamedCudaScope(gpu_id):
            workspace.FeedBlob(core.ScopedName('data_stage2'), data_stage2)
            workspace.FeedBlob(core.ScopedName('gt_label_stage2'), gt_label)

        # print("workspace.RunNet(model.net.Proto().name)")
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter,
                                     lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats(cur_iter)
        training_stats.LogIterStats(cur_iter, lr)
        writer.add_scalar('learning_rate', lr, cur_iter)

        # print("end of RunNet")
        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            handle_critical_error(model, 'Loss is NaN')

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # save train loss and metric
    state_file = os.path.join(output_dir, 'training_state.json')
    training_stats.SaveTrainingStates(state_file)
    # Execute the standard Detectron training run
    checkpoints = detectron.utils.train.train_model()
    # Test the trained model
    # NOTE: 'test_model' was rebound above to a model instance, shadowing the
    # usual test_model() helper, so this call would need that helper under a
    # different name.
    if not args.skip_test:
        test_model(checkpoints['final'], args.multi_gpu_testing, args.opts)
Example #11
def train_model():
    """Model training loop."""
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints

    setup_model_for_training(model, weights_file, output_dir, start_iter)
    training_stats = TrainingStats(model)

    num_iter_per_epoch = model.roi_data_loader.get_num_iter_per_epoch()
    if cfg.REID.TRIPLET_LOSS and cfg.REID.TRIPLET_LOSS_CROSS:
        num_iter_per_epoch_triplet = \
            model.roi_data_loader.get_num_iter_per_epoch_triplet()

    # CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)
    CHECKPOINT_PERIOD = cfg.TRAIN.SNAPSHOT_ITERS

    for cur_iter in range(start_iter * num_iter_per_epoch,
                          cfg.SOLVER.MAX_ITER * num_iter_per_epoch):
        cur_ep = int(cur_iter / num_iter_per_epoch)
        if cfg.REID.TRIPLET_LOSS and cfg.REID.TRIPLET_LOSS_CROSS:
            if cur_ep > cfg.REID.TRIPLET_LOSS_START and cur_ep % 2 == 1:
                if cur_iter % num_iter_per_epoch > num_iter_per_epoch_triplet:
                    continue
                reid_utils.set_loss_scale(model, 1)
            else:
                reid_utils.set_loss_scale(model, 0)

        if model.roi_data_loader.has_stopped():
            handle_critical_error(model, 'roi_data_loader failed')
        training_stats.IterTic()

        # lr = model.UpdateWorkspaceLr(cur_iter, lr_policy.get_lr_at_iter(cur_iter))
        lr = model.UpdateWorkspaceLr(
            cur_ep,
            lr_policy.get_lr_at_iter(cur_iter, cur_ep, num_iter_per_epoch))

        workspace.RunNet(model.net.Proto().name)

        if cfg.REID.TRIPLET_LOSS and cfg.REID.TRIPLET_LOSS_CROSS:
            if cur_ep > cfg.REID.TRIPLET_LOSS_START and cur_ep % 2 == 1:
                # check input
                for i in range(cfg.NUM_GPUS):
                    data = workspace.FetchBlob('gpu_{}/{}'.format(
                        i, 'labels_int32'))
                    id_unique, id_counts = np.unique(data, return_counts=True)
                    assert id_counts.shape[0] == cfg.REID.P, id_counts
                    for id_count in id_counts:
                        assert id_count == cfg.REID.K, id_count

        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)

        if (cur_ep % CHECKPOINT_PERIOD == 0
                and cur_iter == num_iter_per_epoch * (cur_ep + 1) - 1
                and cur_iter > start_iter):
            # checkpoints[cur_iter] = os.path.join(
            # output_dir, 'model_iter{}.pkl'.format(cur_iter)
            # )
            # nu.save_model_to_weights_file(checkpoints[cur_iter], model)
            checkpoints[cur_ep] = os.path.join(
                output_dir, 'model_epoch{}.pkl'.format(cur_ep + 1))
            nu.save_model_to_weights_file(checkpoints[cur_ep], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            handle_critical_error(model, 'Loss is NaN')

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
Example #12
def train_model():
    """Model training loop."""
    logger = logging.getLogger(__name__)
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints

    setup_model_for_training(model, weights_file, output_dir)
    training_stats = TrainingStats(model)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)
    #graph = net_drawer.GetPydotGraph(model.net.Proto().op, "mnist", rankdir="LR")
    #graph.write_png('graph.png')

    #max_iter = cfg.SOLVER.MAX_ITER if cfg.SOLVER.MAX_EPOCH == -1 else 10**8
    #print(max_iter)

    cur_iter = start_iter
    #for cur_iter in range(start_iter, max_iter):
    while True:
        training_stats.IterTic()
        if cfg.SOLVER.MAX_EPOCH == -1:
            lr = model.UpdateWorkspaceLr(cur_iter,
                                         lr_policy.get_lr_at_iter(cur_iter))
        else:
            lr = model.UpdateWorkspaceLr(
                training_stats.cur_epoch,
                lr_policy.get_lr_at_epoch(training_stats))

        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()

        if cfg.SOLVER.MAX_EPOCH == -1 and cur_iter == cfg.SOLVER.MAX_ITER:
            break

        if cfg.SOLVER.MAX_EPOCH != -1 and training_stats.cur_epoch == cfg.SOLVER.MAX_EPOCH + 1:
            break

        cur_iter += 1

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
Example #13
def train_model():
    """Model training loop."""
    logger = logging.getLogger(__name__)
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints
    if 0:
        output_dir = '/home/icubic/daily_work/code/Detectron/train/coco_2014_train_ET_PH_part/generalized_rcnn_multi/'
    #output_dir = output_dir + '_101'
    setup_model_for_training(model, weights_file, output_dir)
    training_stats = TrainingStats(model)
    uuuu = model.roi_data_loader._blobs_queue_name
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)
    print('------------train.py')
    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter,
                                     lr_policy.get_lr_at_iter(cur_iter))
        #aaa_debug = workspace.FetchBlob('gpu_0/data')
        #bbb_debug = workspace.FetchBlob('gpu_0/conv1_w')
        #ccc_debug = workspace.FetchBlob('gpu_0/'+uuuu)
        try:
            workspace.RunNet(model.net.Proto().name)

            if 0:
                #import detectron.utils.blob as blob_utils
                inputs = [workspace.FetchBlob("gpu_0/rpn_rois_fpn2"),workspace.FetchBlob("gpu_0/rpn_rois_fpn3"),workspace.FetchBlob("gpu_0/rpn_rois_fpn4"),workspace.FetchBlob("gpu_0/rpn_rois_fpn5"), \
                          workspace.FetchBlob("gpu_0/rpn_rois_fpn6"),workspace.FetchBlob("gpu_0/rpn_roi_probs_fpn2"),workspace.FetchBlob("gpu_0/rpn_roi_probs_fpn3"),workspace.FetchBlob("gpu_0/rpn_roi_probs_fpn4"), \
                          workspace.FetchBlob("gpu_0/rpn_roi_probs_fpn5"),workspace.FetchBlob("gpu_0/rpn_roi_probs_fpn6"),workspace.FetchBlob("gpu_0/roidb"),workspace.FetchBlob("gpu_0/im_info"),\
                          ]
                rois = collect(inputs, True)
                #inputs.append(workspace.FetchBlob("gpu_0/rpn_rois_fpn2"))
                im_info = inputs[-1]
                im_scales = im_info[:, 2]
                roidb = blob_utils.deserialize(inputs[-2])
                # For historical consistency with the original Faster R-CNN
                # implementation we are *not* filtering crowd proposals.
                # This choice should be investigated in the future (it likely does
                # not matter).
                json_dataset.add_proposals(roidb,
                                           rois,
                                           im_scales,
                                           crowd_thresh=0)
                roidb_utils.add_bbox_regression_targets(roidb)
                # Compute training labels for the RPN proposals; also handles
                # distributing the proposals over FPN levels
                output_blob_names = fast_rcnn_roi_data.get_fast_rcnn_blob_names(
                )
                blobs = {k: [] for k in output_blob_names}
                fast_rcnn_roi_data.add_fast_rcnn_blobs(blobs, im_scales, roidb)
                for i, k in enumerate(output_blob_names):
                    blob_utils.py_op_copy_blob(blobs[k], outputs[i])
            #if (np.sum(bb == 1))>0:
            #   print('cc')
        except Exception:
            # Debug fallback: fetch a few blobs for inspection when RunNet
            # fails.
            aa = workspace.FetchBlob("gpu_0/rpn_rois_fpn2")
            aaa_debug = workspace.FetchBlob('gpu_0/data')
            print('RunNet failed at iter {}'.format(cur_iter))
        #print("blobs:\n{}".format(workspace.Blobs()))
        #print('train.py   aaaaaaaa_debug')
        if 1:
            # Debug: fetch a handful of blobs for inspection in a debugger.
            aaa = workspace.FetchBlob("gpu_0/data")  # NCHW
            #img = aaa[1].copy()
            # BGR HWC -> CHW  12
            #transform_img = img.swapaxes(0, 1).swapaxes(1, 2)

            #cv2.imshow("image0 ", transform_img[:, :, (2, 1, 0)])

            #cv2.waitKey(0)
            #cv2.destroyAllWindows()
            #cv2.imshow('/home/icubic/daily_work/code/Detectron/aaa.png', aaa[0])
            aaa_debug = workspace.FetchBlob('gpu_0/data')
            bbb_debug = workspace.FetchBlob('gpu_0/conv1_w')
            ccc_debug = workspace.FetchBlob('gpu_0/' + uuuu)
            ddd_debug = workspace.FetchBlob('gpu_0/roidb')
            eee_debug = workspace.FetchBlob('gpu_0/im_info')
            #print("Fetched data:\n{}".format(workspace.FetchBlob("gpu_0/data")))
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)

        # Checkpoint four times per configured snapshot period. Alternative
        # (disabled) schedule: every CHECKPOINT_PERIOD until iter 50000, then
        # every CHECKPOINT_PERIOD/8 afterwards.
        if (cur_iter + 1) % (CHECKPOINT_PERIOD // 4) == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter_50_{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final_50.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
Example #14
def train_model():
    """Model training loop."""
    start_time = time.time()

    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints

    setup_model_for_training(model, weights_file, output_dir)
    training_stats = TrainingStats(model)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

    if model.train and cfg.TRAIN.PADA:
        if not hasattr(model, 'class_weight_db'):
            model.class_weight_db = pada.ClassWeightDB()
        model.class_weight_db.setup(model.roi_data_loader)
    if cfg.TRAIN.DA_FADE_IN:
        model.da_fade_in = pada.DAScaleFading(cfg.SOLVER.MAX_ITER)

    # if cfg.INTERRUPTING:
    #     source_set_size = len(model.roi_data_loader._roidb)
    #     if cfg.TRAIN.DOMAIN_ADAPTATION:
    #         source_ims_per_batch = cfg.NUM_GPUS * (cfg.TRAIN.IMS_PER_BATCH//2)
    #     else:
    #         source_ims_per_batch = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH
    #     CHECKPOINT_PERIOD = int(1.0 + source_set_size / (source_ims_per_batch * cfg.NUM_GPUS))
    #     print("Checkpoint period, and interruption, set for after {} batches".format(CHECKPOINT_PERIOD))

    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        # print('iter:',cur_iter)
        # print(model.roi_data_loader._cur,list(model.roi_data_loader._perm)[:10])

        if model.roi_data_loader.has_stopped():
            handle_critical_error(model, 'roi_data_loader failed')
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter,
                                     lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
            # blob_summary(['conv{}_{}'.format(i,j) for i in [1,2,3,4,5] for j in [1,2,3] if not ((j==3) and (i < 3))])
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)

        if (cur_iter) % (training_stats.LOG_PERIOD * 50) == 0:
            print_conf_matrix(model.class_weight_db.conf_matrix)
            pool2 = workspace.FetchBlob('gpu_0/pool2').astype(float)
            print('pool2 max: {}'.format(pool2.max()))
            # blob_summary(['conv3_1_w','conv3_1_w_grad','conv3_1_b','conv5_3','da_fc7','da_conv_2','dc_ip3','dc_ip3_w','dc_ip2_w_grad'])
            blob_summary([
                'conv3_1_w', 'conv3_1_w_grad', 'conv3_1_b', 'da_conv_2',
                'dc_ip3', 'dc_ip3_w', 'dc_ip2_w_grad'
            ])  # light

        if cfg.INTERRUPTING and time.time() - start_time > cfg.THRESH_TIME:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter],
                                          model,
                                          cur_iter=cur_iter)

            # stop this process and restart to continue form the checkpoint.
            model.roi_data_loader.shutdown()

            if cfg.TRAIN.DOMAIN_ADAPTATION:
                # triggers target data loader to stop:
                with open('./TargetDataLoaderProcess/read.txt', 'w') as f:
                    f.write(str(0))
                    f.flush()
                    os.fsync(f.fileno())

            # wait a bit for it to stop:
            time.sleep(5)

            # enqueue new job:
            os.system('sbatch run.job')

            return checkpoints

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter],
                                          model,
                                          cur_iter=cur_iter)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        # Health value: total loss plus the average PADA weight (assumes
        # cfg.TRAIN.PADA is enabled so model.class_weight_db exists).
        v = training_stats.iter_total_loss + model.class_weight_db.avg_pada_weight
        if training_stats.iter_total_loss > 4:
            # print('Loss is {}'.format(training_stats.iter_total_loss))
            pool2 = workspace.FetchBlob('gpu_0/pool2').astype(float)
            print('pool2 max: {}'.format(pool2.max()))
            blob_summary([
                'conv3_1_w', 'conv3_1_w_grad', 'conv3_1_b', 'conv5_3',
                'da_fc7', 'da_conv_2', 'dc_ip3', 'dc_ip3_w', 'dc_ip2_w_grad'
            ])

        if not np.isfinite(v):
            nu.print_net(model)
            blobs = workspace.Blobs()
            print()
            print("Current blobs in the workspace:\n{}".format(
                '\n'.join(blobs)))
            print()
            for blob in blobs:
                print("Fetched {}:\n{}".format(blob,
                                               workspace.FetchBlob(blob)))
                print()
            blob_summary([
                'conv3_1_w', 'conv3_1_b', 'conv5_3', 'da_fc7', 'da_conv_2',
                'dc_ip3', 'dc_ip3_w', 'dc_ip2_w_grad'
            ])
            blob_summary()
            handle_critical_error(model, 'Loss is {}'.format(v))

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'],
                                  model,
                                  cur_iter=cur_iter)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
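
Example #14 adds a wall-clock budget on top of the usual loop: once cfg.THRESH_TIME is exceeded it checkpoints, tells the target-domain data loader to stop via ./TargetDataLoaderProcess/read.txt, re-queues itself with sbatch run.job, and returns. The sketch below isolates that interruption pattern with hypothetical names (train_one_iter, save_checkpoint, requeue_cmd); only the overall shape (time budget, then checkpoint, then requeue, then exit) comes from the source.

import subprocess
import time

def run_with_time_budget(train_one_iter, save_checkpoint, max_iter,
                         start_iter=0, thresh_time=3.5 * 3600,
                         requeue_cmd=('sbatch', 'run.job')):
    """Run a training loop, checkpointing and requeueing before a time limit.

    All callables and the requeue command are hypothetical placeholders.
    """
    start_time = time.time()
    for cur_iter in range(start_iter, max_iter):
        train_one_iter(cur_iter)
        if time.time() - start_time > thresh_time:
            save_checkpoint(cur_iter)
            # Submit a fresh job that resumes from the checkpoint just saved.
            subprocess.call(requeue_cmd)
            return False  # interrupted; training continues in the next job
    save_checkpoint(max_iter - 1)
    return True  # finished within the time budget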