Exemplo n.º 1
0
    def extract(self, pdf_path, output_directory):
        """Run the figure-extraction pipeline on a single PDF.

        A ``FigureExtraction`` record is built for ``pdf_path`` with
        ``output_directory`` as its parent directory. The record's base
        directory is created and the PDF is copied into it; every later
        step is pointed at that copy and is given the base directory as
        its output location.

        Parameters
        ----------
        pdf_path : str
            The path to the PDF to process.
        output_directory : str
            The directory in which to save the results from extraction.

        Returns
        -------
        FigureExtraction
            The record for the PDF at ``pdf_path``, with
            ``low_res_rendering_paths``, ``hi_res_rendering_paths``,
            ``pdffigures_output_path`` and ``deepfigures_json_path``
            assigned.

        Raises
        ------
        OSError
            If the extraction results directory already exists, since
            ``os.makedirs`` is called without ``exist_ok``.
        """
        extraction = FigureExtraction(
            pdf_path=pdf_path,
            parent_directory=output_directory,
        )
        paths = extraction.paths

        # The results directory must exist before anything is written.
        base_dir = paths['BASE']
        os.makedirs(base_dir)

        # Work on a private copy of the PDF stored next to the results.
        copied_pdf = paths['PDF_PATH']
        shutil.copy(pdf_path, copied_pdf)

        # The renderer implementation is selected through the settings.
        renderer_factory = settings_utils.import_setting(
            settings.DEEPFIGURES_PDF_RENDERER)
        renderer = renderer_factory()

        # Page images at the DPI configured for inference.
        extraction.low_res_rendering_paths = renderer.render(
            pdf_path=copied_pdf,
            output_dir=base_dir,
            dpi=settings.DEFAULT_INFERENCE_DPI,
        )

        # Page images at the DPI configured for cropped figure images.
        extraction.hi_res_rendering_paths = renderer.render(
            pdf_path=copied_pdf,
            output_dir=base_dir,
            dpi=settings.DEFAULT_CROPPED_IMG_DPI,
        )

        # Caption extraction via the pdffigures wrapper.
        caption_extractor = pdffigures_wrapper.pdffigures_extractor
        extraction.pdffigures_output_path = caption_extractor.extract(
            pdf_path=copied_pdf,
            output_dir=base_dir,
        )

        # Figure detection on the inference-DPI page images, combined
        # with the pdffigures output produced above.
        extraction.deepfigures_json_path = detection.extract_figures_json(
            pdf_path=copied_pdf,
            page_image_paths=extraction.low_res_rendering_paths,
            pdffigures_output=extraction.pdffigures_output_path,
            output_directory=base_dir,
        )

        return extraction
Exemplo n.º 2
0
def convert_pdf_paths_to_images(input_dir,
                                output_dir,
                                temp,
                                cpu_count=cpu_count(),
                                max_pages=500):
    """Apply ``convert_pdf_to_images`` to every PDF in a directory.

    All paths matching ``*.pdf`` directly inside ``input_dir`` are
    collected, their count is logged, and ``convert_pdf_to_images`` is
    mapped over them with a process pool. A single renderer instance,
    selected through ``settings.DEEPFIGURES_PDF_RENDERER``, is passed to
    every call.

    Parameters
    ----------
    input_dir : str
        Directory that is searched (non-recursively) for ``*.pdf`` files.
    output_dir : str
        Forwarded to ``convert_pdf_to_images`` as ``_output_dir``.
    temp : str
        Forwarded to ``convert_pdf_to_images`` as ``_temp_dir``.
    cpu_count : int
        Number of worker processes in the pool. The default is computed
        once, when the function is defined. Inside the body this
        parameter shadows the imported ``cpu_count`` function.
    max_pages : int
        Forwarded to ``convert_pdf_to_images`` as ``max_pages``.

    Returns
    -------
    None
        The results of the pool map are discarded.
    """
    renderer_factory = settings_utils.import_setting(
        settings.DEEPFIGURES_PDF_RENDERER)
    renderer = renderer_factory()

    pdf_pattern = os.path.join(input_dir, '*.pdf')
    found_pdfs = glob.glob(pdf_pattern)

    message = "Total paths obtained: {path_count}".format(
        path_count=len(found_pdfs))
    logger.info(message)

    with Pool(cpu_count) as workers:
        # Bind everything except the PDF path, which the pool supplies.
        convert_one = partial(
            convert_pdf_to_images,
            _output_dir=output_dir,
            _pdf_renderer=renderer,
            _temp_dir=temp,
            max_pages=max_pages,
        )
        workers.map(convert_one, found_pdfs)
import numpy as np
import tensorflow as tf

from deepfigures import settings
from deepfigures.extraction.datamodels import (BoxClass, Figure,
                                               PdfDetectionResult, CaptionOnly)
from deepfigures.extraction import (figure_utils, pdffigures_wrapper)
from deepfigures.extraction.pdffigures_wrapper import pdffigures_extractor
from deepfigures.utils import (file_util, image_util, settings_utils)
import train
from vendor.tensorboxresnet.tensorboxresnet.utils import train_utils

# Values for the caption channel: one for the background and one for the
# caption mask.
# NOTE(review): presumably 8-bit pixel intensities (white background,
# black mask) -- confirm against the code that builds the channel.
CAPTION_CHANNEL_BACKGROUND = 255
CAPTION_CHANNEL_MASK = 0

# Module-level PDF renderer, created once when this module is imported:
# the object that ``settings_utils.import_setting`` returns for
# ``settings.DEEPFIGURES_PDF_RENDERER`` is called with no arguments.
pdf_renderer = settings_utils.import_setting(
    settings.DEEPFIGURES_PDF_RENDERER)()


class TensorboxCaptionmaskDetector(object):
    """Interface for using the neural network model to detect figures.

    Instantiating this class creates a tensorflow session object as the
    self.sess attribute. When done using the instance, remember to close
    the session; however, do not open and close sessions every time you
    extract a figure because the added overhead will very negatively
    affect performance.
    """
    def __init__(
        self,
        save_dir,
        iteration,