def load_figures_json(filename: str) -> Dict[str, List[Figure]]:
    """Load figures from a JSON file, grouped under the file's page keys.

    The file is read with ``file_util.read_json`` and must contain a mapping
    whose values are lists of figure dicts; every dict is converted with
    ``Figure.from_dict``.

    :param str filename: path of the JSON file to read.
    :returns: mapping from each key of the JSON object to the list of
      ``Figure`` objects built from its entries.
    """
    figures_by_page = {}
    for page_key, figure_dicts in file_util.read_json(filename).items():
        figures_by_page[page_key] = [
            Figure.from_dict(figure_dict) for figure_dict in figure_dicts
        ]
    return figures_by_page
def extract_figures_json(pdf_path, page_image_paths, pdffigures_output,
                         output_directory):
    """Extract information about figures to JSON and save to disk.

    Figures are detected on every page image, paired with the captions
    reported by pdffigures for the same page, and the paired figures are
    written together with the raw detections to a JSON file.

    :param str pdf_path: path to the PDF from which to extract figures.
    :param page_image_paths: paths of the page images, in page order; each
      one is loaded with ``imread``.
    :param pdffigures_output: pdffigures output for the PDF; captions are
      read from it and it is stored unchanged in the result.
    :param str output_directory: directory in which the results JSON file
      is written.

    :returns: path to the JSON file containing the detection results.
    """
    # NOTE(review): stacking with np.array presumably relies on every page
    # image having the same shape -- confirm against the renderer.
    page_images_array = np.array(
        [imread(page_image_path) for page_image_path in page_image_paths])
    detector = get_detector()
    figure_boxes_by_page = detector.get_detections(page_images_array)
    pdffigures_captions = pdffigures_wrapper.get_captions(
        pdffigures_output=pdffigures_output,
        target_dpi=settings.DEFAULT_INFERENCE_DPI)

    figures_by_page = []
    for page_num, page_image in enumerate(page_images_array):
        figure_boxes = figure_boxes_by_page[page_num]
        pf_page_captions = [
            caption for caption in pdffigures_captions
            if caption.page == page_num
        ]
        caption_boxes = [
            caption.caption_boundary for caption in pf_page_captions
        ]
        figure_indices, caption_indices = figure_utils.pair_boxes(
            figure_boxes, caption_boxes)
        # Padding is proportional to the shorter side of the page image.
        pad_pixels = PAD_FACTOR * min(page_image.shape[:2])
        for figure_idx, caption_idx in zip(figure_indices, caption_indices):
            caption = pf_page_captions[caption_idx]
            # Grow the detected box, clamp it to the page, then trim any
            # blank border that the padding introduced.
            figure_boundary = (
                figure_boxes[figure_idx]
                .expand_box(pad_pixels)
                .crop_to_page(page_image.shape)
                .crop_whitespace_edges(page_image)
            )
            figures_by_page.append(
                Figure(figure_boundary=figure_boundary,
                       caption_boundary=caption_boxes[caption_idx],
                       caption_text=caption.caption_text,
                       name=caption.name,
                       figure_type=caption.figure_type,
                       page=page_num))

    pdf_detection_result = PdfDetectionResult(
        pdf=pdf_path,
        figures=figures_by_page,
        dpi=settings.DEFAULT_INFERENCE_DPI,
        raw_detected_boxes=figure_boxes_by_page,
        raw_pdffigures_output=pdffigures_output)

    # Strip the real extension instead of blindly dropping four characters,
    # so names that do not end in ".pdf" keep their full stem. The stem is
    # joined directly to the suffix (no separator), so the file name for a
    # "*.pdf" input is unchanged.
    pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
    output_path = os.path.join(output_directory,
                               pdf_stem + 'deepfigures-results.json')
    file_util.write_json_atomic(output_path,
                                pdf_detection_result.to_dict(),
                                indent=2,
                                sort_keys=True)
    return output_path
def detect_figures(
    pdf: str,
    pdffigures_captions: List[CaptionOnly],
    detector: TensorboxCaptionmaskDetector,
    conf_threshold: float
) -> Tuple[List[Figure], List[List[BoxClass]]]:
    """Detect figures in a PDF and pair them with pdffigures captions.

    The PDF is rendered at ``settings.DEFAULT_INFERENCE_DPI``, each page is
    read as a tensor and passed to ``detector``; the detected boxes of every
    page are then matched to that page's captions with
    ``figure_utils.pair_boxes``.

    :param pdf: path to the PDF to process.
    :param pdffigures_captions: captions for the whole document; each
      caption's ``page`` is compared with the 0-indexed page number.
    :param detector: detector whose ``hypes['image_channels']`` selects
      between plain page tensors (3) and tensors with an extra caption-mask
      channel.
    :param conf_threshold: confidence threshold forwarded to
      ``detector.get_detections``.
    :returns: a tuple ``(figures, figure_boxes_by_page)``: the flat list of
      figures that were paired with a caption, and the raw detections as
      returned by the detector, indexed by page.
    """
    page_image_files = pdf_renderer.render(
        pdf, dpi=settings.DEFAULT_INFERENCE_DPI)

    page_tensors = []
    for page_num, page_image_file in enumerate(page_image_files):
        page_im = image_util.read_tensor(page_image_file)
        if detector.hypes['image_channels'] == 3:
            page_tensors.append(page_im)
        else:
            # Append a fourth channel, initialised to the background value,
            # that marks where captions are on this page.
            im_with_mask = np.pad(page_im,
                                  pad_width=[(0, 0), (0, 0), (0, 1)],
                                  mode='constant',
                                  constant_values=CAPTION_CHANNEL_BACKGROUND)
            for caption in pdffigures_captions:
                # Only captions that belong to this page may be drawn into
                # its mask; boxes from other pages would mark regions that
                # hold no caption here.
                if caption.page != page_num:
                    continue
                (x1, y1, x2, y2) = caption.caption_boundary.get_rounded()
                im_with_mask[y1:y2, x1:x2, 3] = CAPTION_CHANNEL_MASK
            page_tensors.append(im_with_mask)

    figure_boxes_by_page = detector.get_detections(
        page_tensors, conf_threshold=conf_threshold)

    figures_by_page = []
    # Page numbers are always 0 indexed
    for page_num in range(len(page_image_files)):
        figure_boxes = figure_boxes_by_page[page_num]
        pf_page_captions = [
            cap for cap in pdffigures_captions if cap.page == page_num
        ]
        caption_boxes = [cap.caption_boundary for cap in pf_page_captions]
        (figure_indices, caption_indices) = figure_utils.pair_boxes(
            figure_boxes, caption_boxes)
        figures_by_page.extend([
            Figure(
                figure_boundary=figure_boxes[figure_idx],
                caption_boundary=caption_boxes[caption_idx],
                caption_text=pf_page_captions[caption_idx].caption_text,
                name=pf_page_captions[caption_idx].name,
                figure_type=pf_page_captions[caption_idx].figure_type,
                page=page_num,
            )
            for (figure_idx, caption_idx)
            in zip(figure_indices, caption_indices)
        ])
    return figures_by_page, figure_boxes_by_page