def extract(self, pdf_path, output_directory):
    """Run the figure extraction pipeline on a single PDF.

    Build a ``FigureExtraction`` for ``pdf_path`` with
    ``output_directory`` as its parent directory, create its results
    directory, copy the PDF into it, render the pages at two
    resolutions, run pdffigures2 and the deepfigures detector, and
    record the location of every output on the returned instance.

    Parameters
    ----------
    pdf_path : str
        The path to the PDF to process.
    output_directory : str
        The directory in which the extraction results are saved.

    Returns
    -------
    FigureExtraction
        The instance for the PDF at ``pdf_path``, with its
        ``low_res_rendering_paths``, ``hi_res_rendering_paths``,
        ``pdffigures_output_path`` and ``deepfigures_json_path``
        attributes assigned by this method.
    """
    extraction = FigureExtraction(
        pdf_path=pdf_path,
        parent_directory=output_directory)

    # Create the results directory. ``os.makedirs`` is called without
    # ``exist_ok``, so a directory that already exists raises here.
    results_dir = extraction.paths['BASE']
    os.makedirs(results_dir)

    # Every later step works on the copy that lives inside the results
    # directory rather than on the caller's original file.
    copied_pdf = extraction.paths['PDF_PATH']
    shutil.copy(pdf_path, copied_pdf)

    # The renderer class is looked up from settings and instantiated
    # with no arguments.
    renderer_factory = settings_utils.import_setting(
        settings.DEEPFIGURES_PDF_RENDERER)
    renderer = renderer_factory()

    def render_pages(dpi):
        # Render the copied PDF into the results directory at ``dpi``.
        return renderer.render(
            pdf_path=copied_pdf,
            output_dir=results_dir,
            dpi=dpi)

    # Low-res pages first (these are what the detector receives below),
    # then the hi-res pages.
    extraction.low_res_rendering_paths = render_pages(
        settings.DEFAULT_INFERENCE_DPI)
    extraction.hi_res_rendering_paths = render_pages(
        settings.DEFAULT_CROPPED_IMG_DPI)

    # Extract captions from the PDF using pdffigures2.
    extraction.pdffigures_output_path = (
        pdffigures_wrapper.pdffigures_extractor.extract(
            pdf_path=copied_pdf,
            output_dir=results_dir))

    # Run deepfigures (the neural network detector) on the low-res page
    # images together with the pdffigures2 output.
    extraction.deepfigures_json_path = detection.extract_figures_json(
        pdf_path=copied_pdf,
        page_image_paths=extraction.low_res_rendering_paths,
        pdffigures_output=extraction.pdffigures_output_path,
        output_directory=results_dir)

    return extraction
def convert_pdf_paths_to_images(input_dir, output_dir, temp,
                                cpu_count=cpu_count(), max_pages=500):
    """Map ``convert_pdf_to_images`` over the PDFs found in ``input_dir``.

    Collect every path matching ``*.pdf`` directly inside ``input_dir``,
    log how many were found, and hand each path to
    ``convert_pdf_to_images`` through a process pool. The values
    returned by the workers are discarded.

    Parameters
    ----------
    input_dir : str
        The directory that is globbed for ``*.pdf`` files.
    output_dir : str
        Forwarded to ``convert_pdf_to_images`` as ``_output_dir``.
    temp : str
        Forwarded to ``convert_pdf_to_images`` as ``_temp_dir``.
    cpu_count : int
        The number of worker processes given to ``Pool``. The default
        is the result of ``cpu_count()`` evaluated once, when this
        function is defined.
    max_pages : int
        Forwarded to ``convert_pdf_to_images`` as ``max_pages``.

    Returns
    -------
    None
    """
    # A single renderer instance is created here and bound into the
    # worker callable that is sent to the pool.
    renderer_factory = settings_utils.import_setting(
        settings.DEEPFIGURES_PDF_RENDERER)
    renderer = renderer_factory()

    pdf_pattern = os.path.join(input_dir, '*.pdf')
    pdf_paths = glob.glob(pdf_pattern)

    summary = "Total paths obtained: {path_count}".format(
        path_count=len(pdf_paths))
    logger.info(summary)

    # Bind everything except the PDF path, which ``pool.map`` supplies
    # as the single positional argument.
    convert_one = partial(
        convert_pdf_to_images,
        _output_dir=output_dir,
        _pdf_renderer=renderer,
        _temp_dir=temp,
        max_pages=max_pages)

    with Pool(cpu_count) as pool:
        pool.map(convert_one, pdf_paths)
import numpy as np import tensorflow as tf from deepfigures import settings from deepfigures.extraction.datamodels import (BoxClass, Figure, PdfDetectionResult, CaptionOnly) from deepfigures.extraction import (figure_utils, pdffigures_wrapper) from deepfigures.extraction.pdffigures_wrapper import pdffigures_extractor from deepfigures.utils import (file_util, image_util, settings_utils) import train from vendor.tensorboxresnet.tensorboxresnet.utils import train_utils CAPTION_CHANNEL_BACKGROUND = 255 CAPTION_CHANNEL_MASK = 0 pdf_renderer = settings_utils.import_setting( settings.DEEPFIGURES_PDF_RENDERER)() class TensorboxCaptionmaskDetector(object): """Interface for using the neural network model to detect figures. Instantiating this class creates a tensorflow session object as the self.sess attribute. When done using the instance, remember to close the session; however, do not open and close sessions every time you extract a figure because the added overhead will very negatively affect performance. """ def __init__( self, save_dir, iteration,