import cv2
import numpy as np
import torch

# Project-local helpers assumed to be importable in this module:
# apply_affine_transform, camera_projection, add_joints_to_img,
# make_joint_img, scale_img, BaseDataset, VunetOrg.


def project_onto_image_plane(
        poses,
        image_sizes,
        color,
        text: str,
        org: tuple,
        extrinsics,
        intrinsics,
        dataset: BaseDataset,
        target_size: tuple = None,
        background_color=None,
        synth_model: VunetOrg = None,
        app_img=None,
        thickness=3,
        font_size=2,
        crop=False,
        target_width=128,
        yield_mask=False,
        overlay=False,
        cond_id=None):
    """

    :param poses:
    :param image_sizes:
    :param extrinsics:
    :param intrinsics:
    :param color: the color in which to display the generated poses
    :param joint_model:
    :param target_size:
    :param app_img:
    :return:
    """
    # remember whether the caller supplied a two-color pair before it gets
    # expanded to the full per-limb list below
    custom_pair = color is not None and len(color) == 2
    if color is None:
        # default per-limb palette: red/blue
        color = [[255, 0, 0], [255, 0, 0], [0, 0, 255], [0, 0, 255],
                 [0, 0, 255], [255, 0, 0], [0, 0, 255], [0, 0, 255],
                 [0, 0, 255], [0, 0, 255], [0, 0, 255], [255, 0, 0],
                 [0, 0, 255], [0, 0, 255], [255, 0, 0], [255, 0, 0]]
        assert len(color) == len(dataset.joint_model.total_relative_joints)
    elif custom_pair:
        # expand a [color_a, color_b] pair to the full per-limb list
        color = [
            color[0], color[0], color[1], color[1], color[1], color[0],
            color[1], color[1], color[1], color[1], color[1], color[0],
            color[1], color[1], color[0], color[0]
        ]
        assert len(color) == len(dataset.joint_model.total_relative_joints)

    if overlay and not custom_pair:
        # brighter blue palette for better visibility in overlays, unless a
        # custom color pair was supplied
        color = [[255, 0, 0], [255, 0, 0], [0, 102, 255], [0, 102, 255],
                 [0, 102, 255], [255, 0, 0], [0, 102, 255], [0, 102, 255],
                 [0, 102, 255], [0, 102, 255], [0, 102, 255], [255, 0, 0],
                 [0, 102, 255], [0, 102, 255], [255, 0, 0], [255, 0, 0]]
    yield_mask &= crop  # crop boundaries only exist when cropping is enabled
    masks = None

    if background_color is None:
        background_color = 0

    if synth_model is not None:
        assert app_img is not None, "app_img is required when synth_model is set"
        img_size = [synth_model.spatial_size, synth_model.spatial_size]
        size_arr = np.full((1, 2), synth_model.spatial_size, dtype=float)
        dev = app_img.device

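    # Render one frame per pose: project into the image plane, draw the
    # skeleton, and optionally synthesize an RGB frame with the VUNet.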
    imgs = []
    imgs_rgb = []
    for id_count, (p, s) in enumerate(zip(poses, image_sizes)):
        pose_c = apply_affine_transform(p, extrinsics)
        pose_i = camera_projection(pose_c, intrinsics)

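        # draw the projected 2D skeleton onto a blank canvas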
        img = add_joints_to_img(
            np.full((s[0], s[1], 3),
                    fill_value=background_color,
                    dtype=np.uint8),
            pose_i[dataset.joint_model.kps_to_use]
            if dataset.small_joint_model else pose_i,
            dataset.joint_model.total_relative_joints,
            color_kps=color,
            color_joints=color,
        )
        # build the caption per frame so the COND/PRED suffix does not
        # accumulate across iterations
        frame_text = text
        if cond_id is not None:
            frame_text += "; COND" if id_count < cond_id else "; PRED"

        img = cv2.putText(img,
                          frame_text,
                          org,
                          cv2.FONT_HERSHEY_SIMPLEX,
                          font_size, (0, 0, 0),
                          thickness=thickness)
        if synth_model is not None:
            # rescale joints from the canvas resolution to the synthesis
            # model's spatial resolution
            joint_scaling = size_arr / np.expand_dims(s, axis=0)
            pose_i_rescaled = pose_i * joint_scaling
            stickman = make_joint_img(
                img_size,
                pose_i_rescaled,
                dataset.joint_model,
                line_colors=dataset.line_colors,
                scale_factor=dataset.stickman_scale,
            )
            stickman = (dataset.stickman_transforms(stickman).unsqueeze(
                dim=0).to(dev))
            # transfer the appearance of app_img onto the stickman pose
            with torch.no_grad():
                rgb_img = synth_model.transfer(app_img, stickman)
            # NCHW float -> NHWC uint8 for numpy/OpenCV post-processing
            rgb_img = ((scale_img(rgb_img) * 255.0).permute(
                (0, 2, 3, 1)).detach().cpu().numpy().astype(np.uint8))
            if crop:
                # crop a target_width-wide window centered on the pose,
                # clamped to the image bounds; width is axis 2 of NHWC
                kps = pose_i[
                    dataset.joint_model.
                    kps_to_use] if dataset.small_joint_model else pose_i
                kps_rescaled = kps * joint_scaling
                center_kps = np.mean(kps_rescaled, axis=0)
                min_x = int(
                    min(max(0, center_kps[0] - target_width // 2),
                        rgb_img.shape[2] - target_width))
                max_x = min_x + target_width
                rgb_img = rgb_img[:, :, min_x:max_x]
            imgs_rgb.append(rgb_img.squeeze(0))

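        # optionally resize the stickman frame; keep the scale factor so the
        # keypoints can be mapped into the resized image for cropping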
        if target_size is not None:
            scaling = np.asarray(img.shape[:2], dtype=float) / np.asarray(
                target_size, dtype=float)
            img = cv2.resize(img, target_size, interpolation=cv2.INTER_NEAREST)
        else:
            scaling = np.ones(2, dtype=float)

        if crop:
            kps = pose_i[dataset.joint_model.
                         kps_to_use] if dataset.small_joint_model else pose_i
            kps_rescaled = kps / np.expand_dims(scaling, axis=0)
            center_kps = np.mean(kps_rescaled, axis=0)
            # window of target_width pixels centered on the pose, clamped to
            # the image bounds
            min_x = int(
                min(max(0, center_kps[0] - target_width // 2),
                    img.shape[1] - target_width))
            max_x = min_x + target_width
            img = img[:, min_x:max_x]
            # record the crop boundaries of this frame
            bounds = np.expand_dims(np.asarray([min_x, max_x]), axis=0)
            masks = bounds if masks is None else np.concatenate(
                [masks, bounds], axis=0)

        imgs.append(img)

    if synth_model is not None and overlay:
        # composite stickman over RGB: wherever the stickman canvas is pure
        # white (assumes a white background_color), the synthesized RGB pixel
        # shows through, so the drawn joints stay on top
        overlays = []
        for stick, r in zip(imgs, imgs_rgb):
            o = np.where(np.expand_dims(np.all(stick == 255, axis=2), axis=-1),
                         r, stick)
            overlays.append(o)

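    # assemble the outputs: stickman frames first, then RGB frames (zeros
    # stand in when no synthesis model is given, keeping the output structure
    # stable), followed by optional masks and overlays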
    outs = [np.stack(imgs, axis=0)]
    if synth_model is not None:
        outs.append(np.stack(imgs_rgb, axis=0))
    else:
        outs.append(np.zeros_like(outs[0]))

    if yield_mask:
        outs.append(masks)

    if overlay:
        assert synth_model is not None, "overlay requires synth_model"
        outs.append(np.stack(overlays, axis=0))

    return outs
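

# A minimal usage sketch, assuming a configured BaseDataset and camera
# parameters; cam_extrinsics, cam_intrinsics, and my_dataset below are
# hypothetical placeholders that only illustrate the call signature:
#
#     poses = np.random.rand(8, 17, 3)        # 8 frames of 17 3D joints
#     image_sizes = np.full((8, 2), 256)      # (height, width) per frame
#     stickmen, rgb = project_onto_image_plane(
#         poses,
#         image_sizes,
#         color=None,                         # use the default palette
#         text="sample",
#         org=(10, 30),                       # caption anchor in pixels
#         extrinsics=cam_extrinsics,          # hypothetical world->camera
#         intrinsics=cam_intrinsics,          # hypothetical camera matrix
#         dataset=my_dataset,                 # a BaseDataset instance
#     )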