示例#1
0
def load_figures_json(filename: str) -> Dict[str, List[Figure]]:
    """Load per-page figures from a JSON file.

    The file is read with ``file_util.read_json`` and is expected to hold
    a mapping whose values are lists of figure dictionaries; every such
    dictionary is converted with ``Figure.from_dict``.

    :param str filename: path to the JSON file to read.

    :returns: a dict with the same keys as the JSON mapping, each mapped
      to the list of ``Figure`` objects built from that key's entries.
    """
    figures_by_page = {}
    for page, page_dicts in file_util.read_json(filename).items():
        figures_by_page[page] = [
            Figure.from_dict(figure_dict) for figure_dict in page_dicts
        ]
    return figures_by_page
def extract_figures_json(pdf_path, page_image_paths, pdffigures_output,
                         output_directory):
    """Extract information about figures to JSON and save to disk.

    Runs the figure detector over the rendered page images, pairs each
    detected figure box with a pdffigures caption on the same page, and
    writes the resulting ``PdfDetectionResult`` as JSON.

    :param str pdf_path: path to the PDF from which to extract
      figures. Its base name (without extension) prefixes the output
      file name.
    :param page_image_paths: paths to the rendered page images, one per
      page, in page order (index 0 is page 0).
    :param pdffigures_output: raw pdffigures output; captions are
      obtained from it via ``pdffigures_wrapper.get_captions`` and it is
      stored unchanged in the result.
    :param str output_directory: directory in which the JSON file is
      written.

    :returns: path to the JSON file containing the detection results.
    """
    page_images_array = np.array(
        [imread(page_image_path) for page_image_path in page_image_paths])
    detector = get_detector()
    figure_boxes_by_page = detector.get_detections(page_images_array)
    pdffigures_captions = pdffigures_wrapper.get_captions(
        pdffigures_output=pdffigures_output,
        target_dpi=settings.DEFAULT_INFERENCE_DPI)

    figures_by_page = []
    for page_num, page_image in enumerate(page_images_array):
        figure_boxes = figure_boxes_by_page[page_num]
        pf_page_captions = [
            caption for caption in pdffigures_captions
            if caption.page == page_num
        ]
        caption_boxes = [
            caption.caption_boundary for caption in pf_page_captions
        ]
        figure_indices, caption_indices = figure_utils.pair_boxes(
            figure_boxes, caption_boxes)
        # Padding is proportional to the shorter side of the page image.
        pad_pixels = PAD_FACTOR * min(page_image.shape[:2])
        for (figure_idx, caption_idx) in zip(figure_indices, caption_indices):
            caption = pf_page_captions[caption_idx]
            # Grow the detected box, clamp it to the page, then trim
            # whitespace at its edges.
            figure_boundary = figure_boxes[figure_idx].expand_box(
                pad_pixels).crop_to_page(
                    page_image.shape).crop_whitespace_edges(page_image)
            figures_by_page.append(
                Figure(figure_boundary=figure_boundary,
                       caption_boundary=caption_boxes[caption_idx],
                       caption_text=caption.caption_text,
                       name=caption.name,
                       figure_type=caption.figure_type,
                       page=page_num))

    pdf_detection_result = PdfDetectionResult(
        pdf=pdf_path,
        figures=figures_by_page,
        dpi=settings.DEFAULT_INFERENCE_DPI,
        raw_detected_boxes=figure_boxes_by_page,
        raw_pdffigures_output=pdffigures_output)

    # Strip the real extension rather than a fixed four characters, so
    # names without a three-letter extension are not truncated. For
    # ``*.pdf`` inputs the resulting file name is unchanged.
    pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
    output_path = os.path.join(output_directory,
                               pdf_stem + 'deepfigures-results.json')
    file_util.write_json_atomic(output_path,
                                pdf_detection_result.to_dict(),
                                indent=2,
                                sort_keys=True)
    return output_path
def detect_figures(
        pdf: str, pdffigures_captions: List[CaptionOnly],
        detector: TensorboxCaptionmaskDetector,
        conf_threshold: float) -> Tuple[List[Figure], List[List[BoxClass]]]:
    """Detect figures in a PDF and pair them with pdffigures captions.

    The PDF is rendered to page images at ``settings.DEFAULT_INFERENCE_DPI``.
    When the detector is not configured for 3 image channels, an extra
    channel is appended to each page tensor and the caption boundaries of
    that page are painted into it with ``CAPTION_CHANNEL_MASK``.

    :param pdf: path to the PDF to process.
    :param pdffigures_captions: captions for the whole document; each one
      carries the 0-indexed ``page`` it belongs to.
    :param detector: detector used to obtain figure boxes per page.
    :param conf_threshold: confidence threshold forwarded to
      ``detector.get_detections``.

    :returns: a tuple ``(figures, figure_boxes_by_page)`` where
      ``figures`` is the flat list of figure/caption pairs across all
      pages and ``figure_boxes_by_page`` is the detector's raw output.
    """
    page_image_files = pdf_renderer.render(pdf,
                                           dpi=settings.DEFAULT_INFERENCE_DPI)

    # Page numbers are always 0 indexed. Split the captions per page once
    # so that mask painting and box pairing use the same per-page lists.
    captions_per_page = [
        [cap for cap in pdffigures_captions if cap.page == page_num]
        for page_num in range(len(page_image_files))
    ]

    page_tensors = []
    for page_image_file, page_captions in zip(page_image_files,
                                              captions_per_page):
        page_im = image_util.read_tensor(page_image_file)
        if detector.hypes['image_channels'] == 3:
            page_tensors.append(page_im)
            continue
        # Append one trailing channel filled with the background value.
        im_with_mask = np.pad(page_im,
                              pad_width=[(0, 0), (0, 0), (0, 1)],
                              mode='constant',
                              constant_values=CAPTION_CHANNEL_BACKGROUND)
        # Only this page's captions are painted; painting every caption
        # of the document would mark regions that belong to other pages.
        for caption in page_captions:
            (x1, y1, x2, y2) = caption.caption_boundary.get_rounded()
            im_with_mask[y1:y2, x1:x2, 3] = CAPTION_CHANNEL_MASK
        page_tensors.append(im_with_mask)

    figure_boxes_by_page = detector.get_detections(
        page_tensors, conf_threshold=conf_threshold)

    figures_by_page = []
    for page_num, pf_page_captions in enumerate(captions_per_page):
        figure_boxes = figure_boxes_by_page[page_num]
        caption_boxes = [cap.caption_boundary for cap in pf_page_captions]
        (figure_indices,
         caption_indices) = figure_utils.pair_boxes(figure_boxes,
                                                    caption_boxes)
        figures_by_page.extend([
            Figure(
                figure_boundary=figure_boxes[figure_idx],
                caption_boundary=caption_boxes[caption_idx],
                caption_text=pf_page_captions[caption_idx].caption_text,
                name=pf_page_captions[caption_idx].name,
                figure_type=pf_page_captions[caption_idx].figure_type,
                page=page_num,
            ) for (figure_idx,
                   caption_idx) in zip(figure_indices, caption_indices)
        ])
    return figures_by_page, figure_boxes_by_page