예제 #1
0
def parse_img(img_obj, labels):
    """OCR one PDF image object with Tesseract and collect its text labels.

    The image is exported to the current directory, Tesseract is run in
    hOCR mode on it (Dutch language data, output base name ``out``), and
    every text box found in the resulting ``./out.html`` is converted with
    ``convert_label_bbox`` and appended to ``labels``.

    Args:
        img_obj: Image object to export; its ``name`` and ``bbox``
            attributes are read here (presumably a pdfminer ``LTImage`` --
            confirm against the caller).
        labels: List that receives the converted labels (mutated in place).

    Returns:
        None. Problems are reported through ``logging.error``.
    """
    from pdfminer.image import ImageWriter
    from subprocess import call
    from pdfminer.pdftypes import PDFNotImplementedError
    from os import remove, path

    hocr_path = "./out.html"

    try:
        logging.info("Writing " + img_obj.name)
        image_writer = ImageWriter(".")
        # TODO this is not thread safe... (so run with "-p 1" for now)
        output_filename = image_writer.export_image(img_obj)
        # TODO rename the exported image to a unique name
        logging.info("Written " + output_filename)
        logging.info("Calling Tesseract hOCR")
        call(["tesseract", output_filename, "out", "-l", "nld", "hocr"])
        # TODO remove(output_filename)

        if path.isfile(hocr_path):
            # TODO fix race condition
            from extracthocr import get_bbox_page, get_bbox_texts

            try:
                # The page bounding box is the same for every label, so it
                # is looked up once, and only when at least one label
                # exists (as before, no lookup happens for an empty page).
                page_bbox = None
                for label in get_bbox_texts(hocr_path):
                    if page_bbox is None:
                        page_bbox = get_bbox_page(hocr_path)
                    labels.append(
                        convert_label_bbox(label, img_obj.bbox, page_bbox))
            finally:
                # Always drop the hOCR file so that a failure while parsing
                # it cannot leak a stale result into the next image.
                remove(hocr_path)
        else:
            logging.error("Image object kon niet verwerkt worden door Tesseract: " + output_filename + ".")
    except PDFNotImplementedError as e:
        logging.error(
            "Image object kon niet verwerkt worden: "
            + str(e)
            + ". Mogelijk helpt het opslaan vam de PDF met ondersteuning voor Acrobat 4.x en later."
        )
예제 #2
0
 def get_text(self):
     """Run OCR on this image and return the recognised text.

     The image is exported into the ``temp`` directory, read back and
     passed to ``image_to_string`` with the Finnish language data. If the
     exported file cannot be opened as an image, an 8-bit greyscale image
     is built directly from the raw stream data instead.

     Returns:
         The recognised text, or an empty string when the image stream
         cannot be exported at all.
     """
     image_writer = ImageWriter("temp")
     try:
         temp_name = image_writer.export_image(self._image_obj)
     except PDFNotImplementedError:
         # No filter method available for this stream
         # https://github.com/euske/pdfminer/issues/99
         return u""
     # Keep the file path in its own variable: the fallback below creates
     # an image object, and the path is still needed for the cleanup.
     temp_path = "temp/" + temp_name
     try:
         try:
             text = image_to_string(Image.open(temp_path), lang="fin")
         except IOError:
             # PdfMiner did not return an image
             # Let's try to create one ourselves
             # TODO: Create proper color_mode values from ColorSpace
             # Most of the times "L" will create something good enough
             # for OCR, though
             fallback_image = Image.frombuffer("L",
                                               self._image_obj.srcsize,
                                               self._stream.get_data(), "raw",
                                               "L", 0, 1)
             text = image_to_string(fallback_image, lang="fin")
     finally:
         try:
             unlink(temp_path)
         except OSError:
             # The exported file may not exist (that can be exactly why the
             # fallback ran); a missing temp file must not hide the result.
             pass
     return text
예제 #3
0
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Full argument vector. ``argv[0]`` is the program name; the
            rest are getopt-style options followed by one or more PDF
            file names.

    Returns:
        ``100`` (after printing the usage line) when the options cannot be
        parsed, no input file is given, or the output type is unknown.
        ``None`` after a successful conversion.
    """
    import getopt

    def usage():
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
            ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
            ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]'
            ' [-t text|html|xml|tag] [-c codec] [-s scale]'
            ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:R:St:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = b''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # "-n" is recorded in a flag and applied after the loop, so that layout
    # options given after "-n" no longer fail on a None ``laparams``.
    layout_analysis = True
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # Page numbers are 1-based on the command line, 0-based here.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v.encode('ascii')
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            layout_analysis = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-S':
            stripcontrol = True
        elif k == '-t':
            outtype = v
        elif k == '-c':
            encoding = v
        elif k == '-s':
            scale = float(v)
    if not layout_analysis:
        laparams = None
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # No explicit type: guess it from the output file extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before the output file is created/truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()
    if outfile:
        outfp = open(outfile, 'w', encoding=encoding)
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr,
                                   outfp,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr,
                                  outfp,
                                  laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   outfp,
                                   scale=scale,
                                   layoutmode=layoutmode,
                                   laparams=laparams,
                                   imagewriter=imagewriter,
                                   debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp)
        for fname in args:
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp,
                                              pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        device.close()
    finally:
        # Close only the file opened here; sys.stdout belongs to the caller.
        if outfp is not sys.stdout:
            outfp.close()
    return
예제 #4
0
def extract_text(
    files=[],
    outfile=None,
    _py2_no_more_posargs=None,
    no_laparams=False,
    all_texts=None,
    detect_vertical=None,  # LAParams
    word_margin=None,
    char_margin=None,
    line_margin=None,
    boxes_flow=None,  # LAParams
    output_type="text",
    codec="utf-8",
    strip_control=False,
    maxpages=0,
    page_numbers=None,
    password="",
    scale=1.0,
    rotation=0,
    layoutmode="normal",
    output_dir=None,
    debug=False,
    disable_caching=False,
    **other
):
    """Extract the text of several PDF files into one output file.

    Args:
        files: Paths of the PDF files to read (never mutated here).
        outfile: Path of the output file, opened in binary write mode.
        _py2_no_more_posargs: Guard slot; must stay ``None``. It catches
            callers passing too many positional arguments.
        no_laparams: When true, ``laparams=None`` is passed on.
        all_texts, detect_vertical, word_margin, char_margin, line_margin,
            boxes_flow: Optional overrides copied onto the ``LAParams``
            object; ``None`` keeps the library default.
        output_type: Requested output type. If it is ``"text"`` and
            ``outfile`` ends in ``.htm``/``.html``/``.xml``/``.tag``, the
            matching type is used instead.
        codec, strip_control, maxpages, page_numbers, password, scale,
            rotation, layoutmode, output_dir, debug, disable_caching:
            Forwarded unchanged to
            ``pdfminer.high_level.extract_text_to_fp``.
        **other: Accepted so that callers may pass extra keywords; unused.

    Raises:
        ValueError: Too many positional arguments, no input files, or no
            output file.
    """
    if _py2_no_more_posargs is not None:
        raise ValueError("Too many positional arguments passed.")
    if not files:
        raise ValueError("Must provide files to work upon!")
    if not outfile:
        # The default (None) used to fail later with an AttributeError on
        # ``outfile.endswith``; fail early with a clear message instead.
        raise ValueError("Must provide an output file to write to!")

    # Build an LAParams object carrying the overrides that were actually
    # given, unless layout analysis was switched off.
    if no_laparams:
        laparams = None
    else:
        laparams = pdfminer.layout.LAParams()
        overrides = (
            ("all_texts", all_texts),
            ("detect_vertical", detect_vertical),
            ("word_margin", word_margin),
            ("char_margin", char_margin),
            ("line_margin", line_margin),
            ("boxes_flow", boxes_flow),
        )
        for name, value in overrides:
            if value is not None:
                setattr(laparams, name, value)

    if output_type == "text" and outfile != "-":
        for override, alttype in (
            (".htm", "html"),
            (".html", "html"),
            (".xml", "xml"),
            (".tag", "tag"),
        ):
            if outfile.endswith(override):
                output_type = alttype

    # NOTE(review): an outfile of "-" is opened as a regular file named
    # "-", exactly as before; it is not mapped to stdout here.
    with open(outfile, "wb") as outfp:
        for fname in files:
            with open(fname, "rb") as fp:
                # Explicit keywords instead of **locals(): only the
                # documented parameters are forwarded. ``output_dir`` is
                # passed through; the callee is expected to create its own
                # image writer from it (pdfminer.six behaviour -- confirm
                # for the installed version).
                pdfminer.high_level.extract_text_to_fp(
                    fp,
                    outfp=outfp,
                    output_type=output_type,
                    codec=codec,
                    laparams=laparams,
                    maxpages=maxpages,
                    page_numbers=page_numbers,
                    password=password,
                    scale=scale,
                    rotation=rotation,
                    layoutmode=layoutmode,
                    output_dir=output_dir,
                    strip_control=strip_control,
                    debug=debug,
                    disable_caching=disable_caching,
                )
예제 #5
0
def convert_pdf2txt(args=None):
    """Parse pdf2txt-style command-line arguments and run ``extract_text``.

    Args:
        args: Argument list handed to ``argparse``; ``None`` makes argparse
            read ``sys.argv[1:]``.

    Returns:
        ``0`` after ``extract_text`` has run. Invalid arguments make
        argparse exit the process via ``SystemExit``.
    """
    import argparse
    P = argparse.ArgumentParser(description=__doc__)
    P.add_argument("files",
                   type=str,
                   default=None,
                   nargs="+",
                   help="Files to process.")
    P.add_argument("-d",
                   "--debug",
                   default=False,
                   action="store_true",
                   help="Debug output.")
    P.add_argument(
        "-p",
        "--pagenos",
        type=str,
        help=
        "Comma-separated list of page numbers to parse. Included for legacy applications, use --page-numbers for more idiomatic argument entry."
    )
    P.add_argument(
        "--page-numbers",
        type=int,
        default=None,
        nargs="+",
        help=
        "Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used."
    )
    P.add_argument("-m",
                   "--maxpages",
                   type=int,
                   default=0,
                   help="Maximum pages to parse")
    P.add_argument("-P",
                   "--password",
                   type=str,
                   default="",
                   help="Decryption password for PDF")
    P.add_argument("-o",
                   "--outfile",
                   type=str,
                   default="-",
                   help="Output file (default/'-' is stdout)")
    P.add_argument("-t",
                   "--output_type",
                   type=str,
                   default="text",
                   help="Output type: text|html|xml|tag (default is text)")
    P.add_argument("-c",
                   "--codec",
                   type=str,
                   default="utf-8",
                   help="Text encoding")
    P.add_argument("-s", "--scale", type=float, default=1.0, help="Scale")
    P.add_argument("-A",
                   "--all-texts",
                   default=None,
                   action="store_true",
                   help="LAParams all texts")
    P.add_argument("-V",
                   "--detect-vertical",
                   default=None,
                   action="store_true",
                   help="LAParams detect vertical")
    P.add_argument("-W",
                   "--word-margin",
                   type=float,
                   default=None,
                   help="LAParams word margin")
    P.add_argument("-M",
                   "--char-margin",
                   type=float,
                   default=None,
                   help="LAParams char margin")
    P.add_argument("-L",
                   "--line-margin",
                   type=float,
                   default=None,
                   help="LAParams line margin")
    P.add_argument("-F",
                   "--boxes-flow",
                   type=float,
                   default=None,
                   help="LAParams boxes flow")
    P.add_argument("-Y",
                   "--layoutmode",
                   default="normal",
                   type=str,
                   help="HTML Layout Mode")
    P.add_argument("-n",
                   "--no-laparams",
                   default=False,
                   action="store_true",
                   help="Pass None as LAParams")
    P.add_argument("-R", "--rotation", default=0, type=int, help="Rotation")
    P.add_argument("-O",
                   "--output-dir",
                   default=None,
                   help="Output directory for images")
    P.add_argument("-C",
                   "--disable-caching",
                   default=False,
                   action="store_true",
                   help="Disable caching")
    P.add_argument("-S",
                   "--strip-control",
                   default=False,
                   action="store_true",
                   help="Strip control in XML mode")
    A = P.parse_args(args=args)

    # Page numbers are 1-based on the command line and 0-based internally.
    # --page-numbers wins over the legacy --pagenos, as the help promises
    # (previously --pagenos silently overwrote it).
    if A.page_numbers:
        A.page_numbers = {x - 1 for x in A.page_numbers}
    elif A.pagenos:
        A.page_numbers = {int(x) - 1 for x in A.pagenos.split(",")}

    if A.output_dir:
        # NOTE(review): the writer object itself is not used below; it is
        # still constructed so that any side effect of creating it is kept
        # (presumably it prepares the directory -- confirm).
        ImageWriter(A.output_dir)

    if six.PY2 and sys.stdin.encoding:
        A.password = A.password.decode(sys.stdin.encoding)

    # Derive the output type from the file extension when left at "text".
    if A.output_type == "text" and A.outfile != "-":
        for override, alttype in ((".htm", "html"), (".html", "html"),
                                  (".xml", "xml"), (".tag", "tag")):
            if A.outfile.endswith(override):
                A.output_type = alttype

    if A.outfile == "-":
        # Writing to stdout: the codec is forced to UTF-8 whenever stdout
        # reports an encoding.
        if sys.stdout.encoding is not None:
            A.codec = 'utf-8'
    else:
        # Create/truncate the output file up front, as before, so that an
        # unwritable path fails before any extraction work. The handle is
        # closed at once instead of being leaked.
        with open(A.outfile, "wb"):
            pass

    outfp = extract_text(**vars(A))
    # extract_text may hand back the stream it wrote to; close it if so.
    if outfp is not None:
        outfp.close()
    return 0
예제 #6
0
    def extract_text(
            files=[],
            outfile=[],
            _py2_no_more_posargs=None,  # Bloody Python2 needs a shim
            no_laparams=False,
            all_texts=None,
            detect_vertical=None,  # LAParams
            word_margin=None,
            char_margin=None,
            line_margin=None,
            boxes_flow=None,  # LAParams
            output_type='text',
            codec='utf-8',
            strip_control=False,
            maxpages=0,
            page_numbers=None,
            password="",
            scale=1.0,
            rotation=0,
            layoutmode='normal',
            output_dir=None,
            debug=False,
            disable_caching=False,
            **other):
        """Convert resume PDFs to text files, pairing inputs with outputs.

        ``files[i]`` is read from ``Original_Resumes/`` and written to
        ``textresume/outfile[i]``.

        Args:
            files: Input PDF file names, relative to ``Original_Resumes/``.
            outfile: Output file names, relative to ``textresume/``. With
                the default empty list nothing is converted.
            _py2_no_more_posargs: Guard slot; must stay ``None``.
            no_laparams: When true, ``laparams=None`` is passed on.
            all_texts, detect_vertical, word_margin, char_margin,
                line_margin, boxes_flow: Optional overrides copied onto the
                ``LAParams`` object; ``None`` keeps the library default.
            output_type, codec, strip_control, maxpages, page_numbers,
                password, scale, rotation, layoutmode, output_dir, debug,
                disable_caching: Forwarded unchanged to
                ``pdfminer.high_level.extract_text_to_fp``.
            **other: Accepted for call compatibility; unused.

        Returns:
            None.

        Raises:
            ValueError: Too many positional arguments were passed.
            IndexError: ``outfile`` has more entries than ``files``.
        """
        if _py2_no_more_posargs is not None:
            raise ValueError("Too many positional arguments passed.")
        # The "must provide files" check is intentionally disabled in this
        # variant, and so is the extension-based output type override.

        # Build an LAParams object carrying the overrides that were given,
        # unless layout analysis was switched off.
        if no_laparams:
            laparams = None
        else:
            laparams = pdfminer.layout.LAParams()
            overrides = (
                ("all_texts", all_texts),
                ("detect_vertical", detect_vertical),
                ("word_margin", word_margin),
                ("char_margin", char_margin),
                ("line_margin", line_margin),
                ("boxes_flow", boxes_flow),
            )
            for name, value in overrides:
                if value is not None:
                    setattr(laparams, name, value)

        if outfile == []:
            # NOTE(review): with no output names the original only selected
            # stdout and returned without extracting; that no-op is kept.
            return

        for index, outfi in enumerate(outfile):
            fname = files[index]
            # Context managers guarantee that each output file is flushed
            # and closed; the handles used to be left open.
            with open('textresume/' + outfi, "w") as outfp:
                with open('Original_Resumes/' + fname, "rb") as fp:
                    # Explicit keywords instead of **locals(): only the
                    # documented parameters are forwarded. ``output_dir``
                    # is passed through; the callee is expected to create
                    # its own image writer from it (pdfminer.six behaviour
                    # -- confirm for the installed version).
                    pdfminer.high_level.extract_text_to_fp(
                        fp,
                        outfp=outfp,
                        output_type=output_type,
                        codec=codec,
                        laparams=laparams,
                        maxpages=maxpages,
                        page_numbers=page_numbers,
                        password=password,
                        scale=scale,
                        rotation=rotation,
                        layoutmode=layoutmode,
                        output_dir=output_dir,
                        strip_control=strip_control,
                        debug=debug,
                        disable_caching=disable_caching,
                    )
        return
예제 #7
0
def read_chars(file,
               pages=[],
               laycntrl={},
               codec='utf-8',
               strip_control=False,
               password='',
               caching=True,
               maxpages=0,
               rotation=0,
               image_dir=''):
    """ Reads a file in pdf format.

    Use **pdfminer** to read a pdf-file into **Python**.

    Args:
        file (str): A string providing the location of the file. The
            extension must be ``.pdf`` (compared case-insensitively).
        pages (list[int]): A list giving the numbers of the
            pages to be extracted, by default (default is `[]`) all
            pages are extracted.
        laycntrl (dict): Keyword arguments passed on to `LAParams`
            (default is `{}`, `None` is accepted as well).
        codec (str): A string giving the codec (default is 'utf-8').
        strip_control (bool): (default is `False`) not used in XML2Converter.
        password (str): A string giving the password (default is '').
        caching (bool): (default is `True`)
        maxpages (int): (default is `0`)
        rotation (int): (default is `0`) added to the rotation of each page.
        image_dir (str): (default is `''`) directory handed to
            `ImageWriter`; it is created when missing. An empty string
            disables the image writer.

    Returns:
        PdfDoc: An object of type `PdfDoc`.

    Raises:
        IOError: If the file does not have a ``.pdf`` extension or does
            not exist.

    """
    extension = os.path.splitext(file)[1]
    if extension.lower() != ".pdf":
        raise IOError("PDF-file expected got '%s'!" % (extension, ))

    if not os.path.exists(file):
        raise IOError("Could not find PDF-file '%s'!" % (file, ))

    if len(image_dir) == 0:
        imagewriter = None
    else:
        if not os.path.exists(image_dir):
            # makedirs also creates missing parent directories.
            os.makedirs(image_dir)
        imagewriter = ImageWriter(image_dir)

    rsrcmgr = PDFResourceManager(caching=caching)
    laparams = LAParams(**(laycntrl or {}))

    device = XML2Converter(rsrcmgr,
                           codec=codec,
                           laparams=laparams,
                           imagewriter=imagewriter,
                           stripcontrol=strip_control)

    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # An empty page selection makes get_pages yield every page, so no
    # separate counting pass is needed. The old counting pass parsed the
    # document twice and did not pass the password.
    selected_pages = pages if pages else None

    with open(file, 'rb') as con:
        for page in PDFPage.get_pages(con,
                                      selected_pages,
                                      maxpages=maxpages,
                                      password=password,
                                      caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)

    return PdfDoc(device.doc)
예제 #8
0
def main(argv):
    """Command-line entry point: convert PDF files to text-like output.

    Parses the options, publishes them in the module globals ``options``
    and ``debuglevel``, builds the converter selected with ``-t`` and runs
    every page of every input file through it.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name and is
            skipped. If ``argv`` is empty or ``None``, argparse falls back
            to ``sys.argv``.

    Returns:
        None. Invalid arguments make argparse exit via ``SystemExit``.
    """
    global debuglevel
    global options

    # Always False: the optparse compatibility shim below is disabled.
    using_optparse = False

    parser = ArgumentParser(prog='pdf2txt.py',
                            description='Convert pdf to txt',
                            formatter_class=ArgumentDefaultsHelpFormatter)

    if using_optparse:
        DEBUG(3, 'using optparse')
        parser.add_argument = parser.add_option
        parser.parse_known_args = parser.parse_args
        parser.disable_interspersed_args()

    parser.add_argument('-d',
                        dest='debuglevel',
                        action='count',
                        default=0,
                        help='Debug (repeat for more verbose debugging)')

    parser.add_argument(
        '-p',
        '--pages',
        dest='pagenos',
        action='store',
        type=str,
        default='',
        help=
        'Specifies the comma-separated list of the page numbers to be extracted. Page numbers start at one. By default, it extracts text from all the pages.'
    )

    parser.add_argument('-c',
                        '--codec',
                        dest='codec',
                        action='store',
                        type=str,
                        default='utf-8',
                        help='Specifies the output codec.')

    parser.add_argument(
        '-t',
        '--type',
        dest='outtype',
        action='store',
        type=str,
        default='shape',
        choices=['text', 'html', 'xml', 'tag', 'shape'],
        help='Specifies the output format, one of: shape, text, html, xml, tag'
    )

    parser.add_argument(
        '-m',
        dest='maxpages',
        action='store',
        type=int,
        default=0,
        help=
        'Specifies the maximum number of pages to extract. By default (0), it extracts all the pages in a document.'
    )

    parser.add_argument(
        '-P',
        '--password',
        dest='password',
        action='store',
        type=str,
        default='',
        help='Provides the user password to access PDF contents.')

    parser.add_argument(
        '-o',
        '--output',
        dest='outfile',
        action='store',
        type=str,
        default=None,
        help=
        'Specifies the output file name. By default, it prints the extracted contents to stdout in text format.'
    )

    parser.add_argument(
        '-C',
        '--no-caching',
        dest='caching',
        action='store_false',
        default=True,
        help=
        'Suppress object caching. This will reduce the memory consumption but also slows down the process.'
    )

    parser.add_argument('-n',
                        '--no-layout',
                        dest='layout',
                        action='store_false',
                        default=True,
                        help='Suppress layout analysis.')

    parser.add_argument('--show-pageno',
                        dest='show_pageno',
                        action='store_true',
                        default=False,
                        help='Show page numbers.')

    parser.add_argument(
        '-A',
        '--analyze-all',
        dest='all_texts',
        action='store_true',
        default=False,
        help=
        'Forces to perform layout analysis for all the text strings, including text contained in figures.'
    )

    parser.add_argument('-V',
                        '--detect-vertical',
                        dest='detect_vertical',
                        action='store_true',
                        default=False,
                        help='Allows vertical writing detection.')

    parser.add_argument(
        '-M',
        dest='char_margin',
        action='store',
        type=float,
        default=2.0,
        help=
        'Two text chunks whose distance is closer than the char_margin (shown as M) is considered continuous and get grouped into one.'
    )

    parser.add_argument(
        '-L',
        dest='line_margin',
        action='store',
        type=float,
        default=0.5,
        help=
        'Two lines whose distance is closer than the line_margin (L) is grouped as a text box, which is a rectangular area that contains a "cluster" of text portions.'
    )

    parser.add_argument(
        '-W',
        dest='word_margin',
        action='store',
        type=float,
        default=0.1,
        help=
        'It may be required to insert blank characters (spaces) as necessary if the distance between two words is greater than the word_margin (W), as a blank between words might not be represented as a space, but indicated by the positioning of each word.'
    )

    parser.add_argument(
        '-F',
        dest='boxes_flow',
        action='store',
        type=float,
        default=0.5,
        help=
        'Specifies how much a horizontal and vertical position of a text matters when determining a text order. The value should be within the range of -1.0 (only horizontal position matters) to +1.0 (only vertical position matters).'
    )

    parser.add_argument(
        '-Y',
        '--layout-mode',
        dest='layoutmode',
        action='store',
        type=str,
        default='normal',
        choices=['exact', 'normal', 'loose'],
        help=
        'Specifies how the page layout should be preserved. (Currently only applies to HTML format.) One of: exact, normal, loose.'
    )

    parser.add_argument('-O',
                        '--image-writer',
                        dest='imagewriter',
                        action='store',
                        type=str,
                        default=None,
                        help='imagewriter')

    parser.add_argument('-R',
                        '--rotation',
                        dest='rotation',
                        action='store',
                        type=int,
                        default=0,
                        help='rotation')

    parser.add_argument('-S',
                        '--strip-control',
                        dest='stripcontrol',
                        action='store_true',
                        default=False,
                        help='stripcontrol')

    parser.add_argument(
        '-s',
        dest='scale',
        action='store',
        type=float,
        default=1,
        help='Specifies the output scale. Can be used in HTML format only.')

    parser.add_argument(
        '--draw-lines',
        dest='draw_lines',
        action='store_true',
        help=
        "Draw crude page representation, coloured TextLines (= short pieces of text). Valid only for the `shape' output."
    )

    parser.add_argument(
        '--draw-boxes',
        dest='draw_boxes',
        action='store_true',
        help=
        "Draw crude page representation, coloured TextBoxes (= grouped text lines). Valid only for the `shape' output."
    )

    parser.add_argument(
        '--draw-blocks',
        dest='draw_blocks',
        action='store_true',
        help=
        "Draw crude page representation, coloured TextBlocks (= grouped TextBoxes). Valid only for the `shape' output."
    )

    parser.add_argument(
        '--shear-limit',
        dest='shear_limit',
        action='store',
        default=0.1,
        type=float,
        help=
        "If the text is sheared above this limit, reject it. Valid only for the `shape' output."
    )

    parser.add_argument(
        '--rotation-limit',
        dest='rotation_limit',
        action='store',
        default=2,
        type=float,
        help=
        "If the text is rotated above this angle (in degrees), reject it. Valid only for the `shape' output."
    )

    parser.add_argument(
        '--line-height-diff',
        dest='line_height_diff',
        action='store',
        type=float,
        default=0.1,
        help=
        'Two lines whose vertical sizes differ more than this ratio are not to be considered of the same paragraph (but e.g. one of them is a heading).'
    )

    parser.add_argument('--heading-before',
                        dest='heading_before',
                        action='store',
                        type=str,
                        default='',
                        help='String to put before each heading, e.g. <h1>')

    parser.add_argument('--heading-after',
                        dest='heading_after',
                        action='store',
                        type=str,
                        default='',
                        help='String to put after each heading, e.g. </h1>')

    parser.add_argument(
        '--box-separator',
        dest='box_separator',
        action='store',
        type=str,
        default=r'\n\n',
        help=
        r'Separate boxes with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.'
    )

    parser.add_argument(
        '--block-separator',
        dest='block_separator',
        action='store',
        type=str,
        default=r'\n\n',
        help=
        r'Separate blocks with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.'
    )

    parser.add_argument(
        '--indent-separator',
        dest='indent_separator',
        action='store',
        type=str,
        default=r'\n\n',
        help=
        r'Separate indented lines with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.'
    )

    parser.add_argument(
        '--indent-string',
        dest='indent_string',
        action='store',
        type=str,
        default=r'\t',
        help=
        r'Put this string in front of indented lines. Use \n for new line, \t for TAB, other escape sequences are not recognized.'
    )

    parser.add_argument(
        '--indent-limit',
        dest='indent_limit',
        action='store',
        type=float,
        default=3,
        help=
        'If the line is indented more then this (approximately characters), it will separated by --indent-separator from the previous one.'
    )

    parser.add_argument(
        '--page-separator',
        dest='page_separator',
        action='store',
        type=str,
        default=r'\n\n',
        help=
        r'Separate pages with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.'
    )

    parser.add_argument(
        '--norm-whitespace',
        dest='norm_whitespace',
        action='store_true',
        default=False,
        help=
        'Normalize whitespace (remove duplicate spaces, replace end of lines with spaces).'
    )

    parser.add_argument(
        '--print-stats',
        dest='print_stats',
        action='store_true',
        default=False,
        help=
        'Instead of the text, output some simple statistics about the file.')

    parser.add_argument(
        '--max-blocks',
        dest='max_blocks',
        action='store',
        default=0,
        type=int,
        help=
        'If there is more than this blocks per page, do not return any text. Use to discriminate abnormal files (run --print-stats first to find out the number of boxes per "normal" file). 0 means no limit. 50 is maybe a good value.'
    )

    parser.add_argument(
        '--max-textlines',
        dest='max_textlines',
        action='store',
        default=0,
        type=int,
        help=
        'If there is more than this textlines per any block, do not return any text. Use to discriminate abnormal files (run --print-stats first to find out the number of boxes per "normal" page). 0 means no limit. 18 is maybe a good value.'
    )

    parser.add_argument(
        '--line-height-method',
        dest='line_height_method',
        action='store',
        type=str,
        default='bbox',
        choices=['bbox', 'mean', 'median'],
        help=
        'Method to calculate height of line (relevant if there are characters with uneven height). bbox takes the bounding box (rectangle encompassing the line), mean the arithmetic mean of the height of all the characters, median is the median of the height of all the characters. Use mean or median if there are outlier characters, e.g. one big character at the beginning of line.'
    )

    parser.add_argument(dest='pdffile',
                        help='List of PDF files to go through',
                        default=None,
                        nargs='+')

    # Honour the argv given to main() (it used to be ignored in favour of
    # sys.argv); an empty/None argv keeps the old sys.argv behaviour.
    args, rest = parser.parse_known_args(argv[1:] if argv else None)

    debuglevel = debug = args.debuglevel
    DEBUG(3, 'args:', str(args))
    DEBUG(3, 'rest:', str(rest))

    DEBUG(3, 'optparse:', using_optparse)

    pagenos = set()
    if args.pagenos:
        # Page numbers are 1-based on the command line, 0-based internally.
        pagenos.update(int(x) - 1 for x in args.pagenos.split(','))
    maxpages = args.maxpages
    outfile = args.outfile
    password = args.password
    caching = args.caching
    showpageno = args.show_pageno

    # Layout parameters exist only while layout analysis is enabled.
    if args.layout:
        laparams = LAParams()
        if args.all_texts:
            laparams.all_texts = True
        if args.detect_vertical:
            laparams.detect_vertical = True
        laparams.char_margin = args.char_margin
        laparams.line_margin = args.line_margin
        laparams.word_margin = args.word_margin
        laparams.boxes_flow = args.boxes_flow
    else:
        laparams = None
    layoutmode = args.layoutmode

    imagewriter = None
    if args.imagewriter:
        imagewriter = ImageWriter(args.imagewriter)

    rotation = args.rotation
    stripcontrol = args.stripcontrol
    outtype = args.outtype
    codec = args.codec
    scale = args.scale

    # Separators arrive with literal backslash sequences; decode them.
    args.box_separator = unescape_string(args.box_separator)
    args.block_separator = unescape_string(args.block_separator)
    args.indent_separator = unescape_string(args.indent_separator)
    args.indent_string = unescape_string(args.indent_string)

    args.page_separator = unescape_string(args.page_separator)

    options = args

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if outfile:
        # open() replaces the Python-2-only file() builtin.
        outfp = open(outfile, 'w')
        DEBUG(2, 'output goes to', outfile)
    else:
        outfp = sys.stdout
        DEBUG(2, 'output goes to stdout')
    try:
        if outtype == 'shape':
            device = ShapeTextConverter(rsrcmgr,
                                        outfp,
                                        codec=codec,
                                        laparams=laparams,
                                        showpageno=showpageno,
                                        imagewriter=imagewriter)
        elif outtype == 'text':
            device = TextConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr,
                                  outfp,
                                  codec=codec,
                                  laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   scale=scale,
                                   layoutmode=layoutmode,
                                   laparams=laparams,
                                   imagewriter=imagewriter,
                                   debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            # Not reachable through the parser ("choices" rejects unknown
            # types); kept as a guard that needs no external helper.
            parser.error('unknown output type: %s' % outtype)
        for fname in options.pdffile:
            DEBUG(2, 'processing', fname)
            # The with-block closes the input even if a page fails.
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp,
                                              pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        device.close()
    finally:
        # Close only the file opened here; sys.stdout belongs to the caller.
        if outfp is not sys.stdout:
            outfp.close()
    DEBUG(2, 'finished.')

    return
예제 #9
0
def main(argv):
    """Convert every PDF matched by the given file names or wildcard specs.

    Each matched PDF gets its own output file next to the input, with the
    input's extension replaced by the one for the output type (``htm``,
    ``tag``, ``txt`` or ``xml``).

    The ``-o`` value is never opened for writing; it is only consulted to
    infer the output type when ``-t`` is given an empty value.

    :param argv: full argument vector; ``argv[0]`` is only used in the usage
        message.
    :returns: ``100`` after printing the usage text when the arguments cannot
        be parsed, no file spec is given or the output type is unknown;
        otherwise ``None``.
    """
    import getopt
    import os.path

    def usage():
        print('Syntax:\npdf2htm.exe SourcePDF\n where the parameter is either a file name or\na wildcard spec like\n*.pdf\nEnclose it with quotes if it contains a space\n\nAdditional options are supported with named command line parameters as follows:')
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
              ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
              ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
              ' [-t text|html|xml|tag] [-c codec] [-s scale]'
              ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = 'tag'
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # -n is remembered and applied after the loop so that layout options
    # appearing later on the command line do not hit a None object.
    use_laparams = True
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': use_laparams = False
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    if not use_laparams:
        laparams = None
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # Only reachable with an explicit empty -t value.
        outtype = 'tag'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'

    extensions = {'html': 'htm', 'tag': 'tag', 'text': 'txt', 'xml': 'xml'}
    if outtype not in extensions:
        # Reject unknown types up front instead of failing on the lookup below.
        return usage()
    ext = '.' + extensions[outtype]

    for fname in args:
        matches = glob.glob(fname)
        print('Converting ' + str(len(matches)) + ' from ' + fname + ' to ' + outtype + ' format')
        for pdf in matches:
            target = os.path.splitext(pdf)[0] + ext
            print(target)
            with open(target, 'wb') as outfp:
                if outtype == 'text':
                    device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                           imagewriter=imagewriter)
                elif outtype == 'xml':
                    device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                          imagewriter=imagewriter)
                elif outtype == 'html':
                    device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                           layoutmode=layoutmode, laparams=laparams,
                                           imagewriter=imagewriter)
                else:  # 'tag'; outtype was validated above
                    device = TagExtractor(rsrcmgr, outfp, codec=codec)
                device.showpageno = False
                try:
                    with open(pdf, 'rb') as fp:
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        for page in PDFPage.get_pages(fp, pagenos,
                                                      maxpages=maxpages, password=password,
                                                      caching=caching, check_extractable=True):
                            page.rotate = (page.rotate + rotation) % 360
                            interpreter.process_page(page)
                finally:
                    # Close the device while the output file is still open.
                    device.close()

        print('Done')
    return
예제 #10
0
def extract_text(
        files=[],
        outfile='-',
        no_laparams=False,
        all_texts=None,
        detect_vertical=None,  # LAParams
        word_margin=None,
        char_margin=None,
        line_margin=None,
        boxes_flow=None,  # LAParams
        output_type='text',
        codec='utf-8',
        strip_control=False,
        maxpages=0,
        page_numbers=None,
        password="",
        scale=1.0,
        rotation=0,
        layoutmode='normal',
        output_dir=None,
        debug=False,
        disable_caching=False,
        **kwargs):
    """Extract the contents of each PDF in ``files`` into a single output stream.

    ``outfile`` of ``"-"`` selects ``sys.stdout``; any other value is opened in
    binary write mode. When ``output_type`` is left at ``"text"`` and
    ``outfile`` ends in ``.htm``/``.html``/``.xml``/``.tag``, the output type is
    taken from that extension. The layout arguments (``all_texts`` ...
    ``boxes_flow``) are copied onto a fresh ``LAParams`` when not ``None``;
    ``no_laparams`` disables layout analysis altogether. The remaining
    arguments are handed to ``pdfminer.high_level.extract_text_to_fp``.

    :returns: the output stream, left open for the caller.
    :raises DeprecationWarning: if ``_py2_no_more_posargs`` is passed.
    :raises ValueError: if ``files`` is empty.
    """
    if '_py2_no_more_posargs' in kwargs:
        raise DeprecationWarning(
            'The `_py2_no_more_posargs` will be removed on January, 2020. At '
            'that moment pdfminer.six will stop supporting Python 2. Please '
            'upgrade to Python 3. For more information see '
            'https://github.com/pdfminer/pdfminer.six/issues/194')

    if not files:
        raise ValueError("Must provide files to work upon!")

    # Build an LAParams carrying only the layout values that were explicitly
    # supplied; None means "no layout analysis".
    if no_laparams:
        laparams = None
    else:
        laparams = pdfminer.layout.LAParams()
        overrides = {
            "all_texts": all_texts,
            "detect_vertical": detect_vertical,
            "word_margin": word_margin,
            "char_margin": char_margin,
            "line_margin": line_margin,
            "boxes_flow": boxes_flow,
        }
        for name, value in overrides.items():
            if value is not None:
                setattr(laparams, name, value)

    if output_type == "text" and outfile != "-":
        for override, alttype in ((".htm", "html"), (".html", "html"),
                                  (".xml", "xml"), (".tag", "tag")):
            if outfile.endswith(override):
                output_type = alttype

    if outfile == "-":
        outfp = sys.stdout
        if outfp.encoding is not None:
            codec = 'utf-8'
    else:
        outfp = open(outfile, "wb")

    try:
        for fname in files:
            with open(fname, "rb") as fp:
                # Pass the options by name rather than splatting locals().
                pdfminer.high_level.extract_text_to_fp(
                    fp,
                    outfp,
                    output_type=output_type,
                    codec=codec,
                    laparams=laparams,
                    maxpages=maxpages,
                    page_numbers=page_numbers,
                    password=password,
                    scale=scale,
                    rotation=rotation,
                    layoutmode=layoutmode,
                    output_dir=output_dir,
                    strip_control=strip_control,
                    debug=debug,
                    disable_caching=disable_caching)
    except Exception:
        # The caller never receives the stream on failure, so release it here.
        if outfp is not sys.stdout:
            outfp.close()
        raise
    return outfp
예제 #11
0
def ConvertPdf(pdfpath, outfp, opts={}):
    """Convert the PDF at ``pdfpath`` and write the result to ``outfp``.

    :param pdfpath: path of the PDF file to read.
    :param outfp: already opened output stream; it is left open on return.
    :param opts: pdf2txt-style options, either as an iterable of
        ``(flag, value)`` pairs (the shape ``getopt`` produces) or as a mapping
        from flag to value. ``-o`` is accepted but ignored because the output
        stream is supplied by the caller.
    :returns: ``True`` once every selected page has been processed.
    :raises ValueError: if ``-t`` names an unknown output type. Valid types are
        ``txt`` (default; ``text`` is accepted as an alias), ``xml``, ``html``
        and ``tag``.
    """
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfdevice import PDFDevice, TagExtractor
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
    from pdfminer.cmapdb import CMapDB
    from pdfminer.layout import LAParams
    from pdfminer.image import ImageWriter

    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # -n is applied after the loop so later layout flags never touch None.
    use_laparams = True

    # Iterating a mapping directly would yield bare keys and mis-unpack them.
    pairs = opts.items() if hasattr(opts, 'items') else opts
    for (k, v) in pairs:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-C': caching = False
        elif k == '-n': use_laparams = False
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    if not use_laparams:
        laparams = None
    #
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'txt'
    if outtype in ('txt', 'text'):
        device = TextConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr,
                              outfp,
                              codec=codec,
                              laparams=laparams,
                              imagewriter=imagewriter)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               scale=scale,
                               layoutmode=layoutmode,
                               laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        raise ValueError('unknown output type: %r' % (outtype,))

    try:
        with open(pdfpath, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp,
                                          pagenos,
                                          maxpages=maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
    finally:
        device.close()

    return True
예제 #12
0
    def get_html_tree(self) -> str:
        """Build the HTML document for all pages and return it as pretty XML.

        The document gets ``ocr-*`` meta tags in its head and one
        ``div.ocr_page`` per page in ``self.elems``. Within a page the
        clustered boxes from ``self.tree`` are ordered with ``column_order``
        and rendered as tables, figures (with JPEG/BMP images inlined as
        base64 data URIs) or generic elements. The created ``Document`` is
        also stored on ``self.doc``.

        Images are exported into a temporary directory that is removed again
        before this method returns.
        """
        doc = Document()
        self.doc = doc
        html = doc.createElement("html")
        doc.appendChild(html)
        head = doc.createElement("head")
        html.appendChild(head)
        # meta
        meta = doc.createElement("meta")
        head.appendChild(meta)
        meta.setAttribute("name", "ocr-system")
        meta.setAttribute("content",
                          f"Converted from PDF by pdftotree {__version__}")
        meta = doc.createElement("meta")
        head.appendChild(meta)
        meta.setAttribute("name", "ocr-capabilities")
        meta.setAttribute("content",
                          "ocr_page ocr_table ocrx_block ocrx_line ocrx_word")
        meta = doc.createElement("meta")
        head.appendChild(meta)
        meta.setAttribute("name", "ocr-number-of-pages")
        meta.setAttribute("content", f"{len(self.elems.keys())}")
        # body
        body = doc.createElement("body")
        html.appendChild(body)

        # Exported images are only needed until they are inlined, so the
        # folder holding them is cleaned up when the block exits.
        with tempfile.TemporaryDirectory() as dirname:
            imagewriter = ImageWriter(dirname)
            for page_num in self.elems.keys():  # 1-based
                boxes: List[Tuple[str, float, float, float, float]] = []
                for clust in self.tree[page_num]:
                    for (_pnum, _pwidth, _pheight, top, left, bottom,
                         right) in self.tree[page_num][clust]:
                        boxes.append((clust.lower().replace(" ", "_"), top,
                                      left, bottom, right))
                page = doc.createElement("div")
                page.setAttribute("class", "ocr_page")
                page.setAttribute("id", f"page_{page_num}")
                width = int(self.elems[page_num].layout.width)
                height = int(self.elems[page_num].layout.height)
                page.setAttribute(
                    "title",
                    f"bbox 0 0 {width} {height}; ppageno {page_num-1}",
                )
                body.appendChild(page)
                # TODO: We need to detect columns and sort accordingly.
                boxes.sort(key=cmp_to_key(column_order))

                for box in boxes:
                    if box[0] == "table":
                        table = box[1:]  # bbox
                        table_element = self.get_html_table(table, page_num)
                        page.appendChild(table_element)
                    elif box[0] == "figure":
                        elems: List[LTTextLine] = get_mentions_within_bbox(
                            box, self.elems[page_num].figures)
                        fig_element = doc.createElement("figure")
                        page.appendChild(fig_element)
                        top, left, bottom, right = [int(i) for i in box[1:]]
                        fig_element.setAttribute(
                            "title", f"bbox {left} {top} {right} {bottom}")
                        for img in [img for elem in elems for img in elem]:
                            if not isinstance(img, LTImage):
                                continue
                            filename = imagewriter.export_image(img)
                            # Decide on the media type first so files that
                            # will be skipped are never read and encoded.
                            if filename.endswith("jpg"):
                                mediatype = "jpeg"
                            elif filename.endswith("bmp"):
                                mediatype = "bmp"
                            else:
                                logger.info(
                                    f"Skipping an unknown type image: {filename}.")
                                continue
                            with open(os.path.join(dirname, filename),
                                      "rb") as f:
                                encoded = b64encode(f.read()).decode("ascii")
                            logger.info(
                                f"Embedding a known type image: {filename}.")
                            img_element = doc.createElement("img")
                            fig_element.appendChild(img_element)
                            img_element.setAttribute("title",
                                                     bbox2str(img.bbox))
                            img_element.setAttribute(
                                "src",
                                f"data:image/{mediatype};base64,{encoded}")
                    else:
                        element = self.get_html_others(box[0], box[1:],
                                                       page_num)
                        page.appendChild(element)
        return doc.toprettyxml()
예제 #13
0
def convert(argv):
    """Convert PDFs from ``pdfs/`` and save a Braille rendering of the result.

    The positional arguments are file names inside the ``pdfs/`` folder; all
    of them are converted into one output file. Unless ``-o`` says otherwise
    that file is ``inputs/<name>.txt``, where ``<name>`` is the first file
    name with spaces removed and its extension dropped. The converted file is
    then read back, passed through ``brl.translate`` and
    ``brl.toUnicodeSymbols`` and written to ``results/<name>-Braille.txt``.

    :param argv: full argument vector; ``argv[0]`` is only used in the usage
        message.
    :returns: ``100`` after printing the usage text when the arguments cannot
        be parsed, no file is given or the output type is unknown; otherwise
        ``None``.
    """
    import os.path

    def usage():
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
            ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
            ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
            ' [-t text|html|xml|tag] [-c codec] [-s scale]'
            ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # Name the outputs after the first input file, not after whatever sits
    # in sys.argv[1], which is an option flag whenever options are used.
    stem = os.path.splitext(args[0].replace(' ', ''))[0]
    default_txt = 'inputs/' + stem + '.txt'

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = default_txt
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # -n is applied after the loop so later layout flags never touch None.
    use_laparams = True
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': use_laparams = False
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    if not use_laparams:
        laparams = None
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outtype not in ('text', 'xml', 'html', 'tag'):
        # Checked before the output is opened so a bad -t cannot leave a
        # truncated, unclosed file behind.
        return usage()

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr,
                                  outfp,
                                  codec=codec,
                                  laparams=laparams,
                                  imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   scale=scale,
                                   layoutmode=layoutmode,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        else:  # 'tag'
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open('pdfs/' + fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp,
                                                  pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Never close the interpreter's own stdout.
        if outfp is not sys.stdout:
            outfp.close()

    # read converted file (the one actually written above, honouring -o)
    with open(outfile or default_txt, "r") as converted:
        output = brl.translate(converted.read())
    # convert into Grade 2 Braille unicode
    x = brl.toUnicodeSymbols(output, flatten=True)
    # save to results folder in .txt format
    with open("results/" + stem + "-Braille.txt", "w") as text_file:
        text_file.write(x.encode(codec))
예제 #14
0
def _lines_until_blank(lines, start):
    """Return ``lines[start]`` plus the lines after it up to the next empty line.

    The line at ``start`` is always included, even when it is empty. An empty
    list is returned when ``start`` lies past the end of ``lines``.
    """
    taken = []
    for offset, line in enumerate(lines[start:]):
        if offset and line == '':
            break
        taken.append(line)
    return taken


def _experience_lines(lines, start):
    """Collect one experience entry beginning at ``lines[start]``.

    Collection stops at the first later line that contains ``-`` (its two
    leading ``-``-separated parts are printed as start/end) or that is empty.
    """
    taken = []
    for offset, line in enumerate(lines[start:]):
        if offset:
            if '-' in line:
                k = line.split('-')
                print('start:', k[0], 'end:', k[1])
                break
            if line == '':
                break
        taken.append(line)
    return taken


def _parse_profile_lines(lines):
    """Split the text lines of a LinkedIn profile export into its sections.

    Returns the tuple ``(contact, linkedin, summary, skills, certifications,
    languages, exp_dict, edu_dict)``. The list sections include their heading
    line; the two dicts are built by zipping fixed key names with the
    collected experience and education lines.
    """
    skills = []
    languages = []
    summary = []
    certifications = []
    contact = []
    linkedin = []
    experience = []
    education = []
    exp_dict = {}
    edu_dict = {}

    # enumerate() gives the position of *this* line; looking the text up with
    # list.index() would always land on the first identical line.
    for idx, line in enumerate(lines):
        if line == 'Contact':
            contact.extend(s.strip() for s in _lines_until_blank(lines, idx))
        if 'www.linkedin.com' in line:
            linkedin.extend(_lines_until_blank(lines, idx))
            if len(linkedin) >= 2:
                # The profile URL wraps over two lines; glue them together.
                linkedin = [linkedin[0] + linkedin[1].strip()]
        if line == 'Top Skills':
            skills.extend(_lines_until_blank(lines, idx))
        if 'Certifications' in line:
            certifications.extend(_lines_until_blank(lines, idx))
        if 'Summary' in line:
            summary.extend(_lines_until_blank(lines, idx))
        if line == 'Languages':
            languages.extend(_lines_until_blank(lines, idx))
        if line == 'Experience':
            # Skip the heading and the line right after it.
            experience.extend(_experience_lines(lines, idx + 2))
            exp_dict = dict(zip(
                ["company", "position", "period", "place", "description"],
                experience))
        if line == 'Education':
            education.extend(_lines_until_blank(lines, idx + 1))
            edu_dict = dict(zip(["school", "degree"], education))

    return (contact, linkedin, summary, skills, certifications, languages,
            exp_dict, edu_dict)


def main(argv):
    """Extract the text of the given PDFs and print the parsed profile sections.

    The text of all input files is concatenated, stripped of a fixed set of
    noise strings, split into lines and parsed into contact, LinkedIn URL,
    summary, skills, certifications, languages, experience and education,
    which are printed to stdout.

    Only ``-P``, ``-O``, ``-p``, ``-m``, ``-C``, ``-n`` and the layout options
    influence the extraction; ``-d``, ``-o``, ``-t``, ``-c``, ``-s``, ``-R``,
    ``-Y`` and ``-S`` are accepted but have no effect.

    :param argv: full argument vector; ``argv[0]`` is only used in the usage
        message.
    :returns: ``100`` after printing the usage text when the arguments cannot
        be parsed or no input file is given; otherwise ``None``.
    """
    import getopt

    def usage():
        print ('usage: %s [-P password] [-o output] [-t text|html|xml|tag]'
               ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
               ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
               ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
               ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # input option
    password = b''
    pagenos = set()
    maxpages = 0
    # output option
    imagewriter = None
    caching = True
    laparams = LAParams()
    # -n is applied after the loop so later layout flags never touch None.
    use_laparams = True
    for (k, v) in opts:
        if k == '-P': password = v.encode('ascii')
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-C': caching = False
        elif k == '-n': use_laparams = False
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
    if not use_laparams:
        laparams = None

    retstr = io.StringIO()
    rsrcmgr = PDFResourceManager(caching=caching)
    device = TextConverter(rsrcmgr, retstr, laparams=laparams, imagewriter=imagewriter)
    try:
        for fname in args:
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos,
                                              maxpages=maxpages, password=password,
                                              caching=caching, check_extractable=True):
                    interpreter.process_page(page)
        # Read the buffer once, after all pages; this also yields an empty
        # string (rather than a list) when nothing was processed.
        data = retstr.getvalue()
    finally:
        device.close()
        retstr.close()

    # Strip layout noise, in this exact order.
    for noise in ("\xa0", "\uf0da", "\x0c", "• ", "* ", "(LinkedIn)",
                  " (LinkedIn)", "\uf0a7", "(Mobile)", "-       "):
        data = data.replace(noise, "")

    (contact, linkedin, summary, skills, certifications, languages,
     exp_dict, edu_dict) = _parse_profile_lines(data.split('\n'))

    print('###############')
    print(contact,'\n',linkedin,'\n',summary,'\n',skills,'\n',certifications,'\n',languages,'\n',exp_dict,'\n',edu_dict)
    return
예제 #15
0
def extract_text(
        files=[],
        outfile='-',
        _py2_no_more_posargs=None,  # Bloody Python2 needs a shim
        no_laparams=False,
        all_texts=None,
        detect_vertical=None,  # LAParams
        word_margin=None,
        char_margin=None,
        line_margin=None,
        boxes_flow=None,  # LAParams
        output_type='text',
        codec='utf-8',
        strip_control=False,
        maxpages=0,
        page_numbers=None,
        password="",
        scale=1.0,
        rotation=0,
        layoutmode='normal',
        output_dir=None,
        debug=False,
        disable_caching=False,
        **other):
    """Write the text of each PDF to the output and print its horizontal text boxes.

    Every file is processed twice: first with a ``TextConverter`` that writes
    to the output stream (honouring ``page_numbers``, ``maxpages``,
    ``password`` and ``rotation``), then with a ``PDFPageAggregator`` whose
    ``LTTextBoxHorizontal`` elements are printed for all pages.

    ``outfile`` of ``"-"`` selects standard output (its binary buffer); any
    other value is opened in binary write mode. ``output_type``,
    ``strip_control``, ``scale``, ``layoutmode``, ``debug`` and ``other`` are
    accepted but not used by this implementation; the output is always
    produced by ``TextConverter``.

    :returns: the output stream, left open for the caller.
    :raises ValueError: if more than two positional arguments are passed or
        ``files`` is empty.
    """
    if _py2_no_more_posargs is not None:
        raise ValueError("Too many positional arguments passed.")
    if not files:
        raise ValueError("Must provide files to work upon!")

    # Build an LAParams carrying only the layout values that were explicitly
    # supplied; None means "no layout analysis".
    if no_laparams:
        laparams = None
    else:
        laparams = pdfminer.layout.LAParams()
        overrides = {
            "all_texts": all_texts,
            "detect_vertical": detect_vertical,
            "word_margin": word_margin,
            "char_margin": char_margin,
            "line_margin": line_margin,
            "boxes_flow": boxes_flow,
        }
        for name, value in overrides.items():
            if value is not None:
                setattr(laparams, name, value)

    imagewriter = None
    if output_dir:
        imagewriter = ImageWriter(output_dir)

    if outfile == "-":
        if sys.stdout.encoding is not None:
            codec = 'utf-8'
        # Pick the binary stream once, before any converter is created, so
        # the first file and all later files write to the same stream.
        outfp = sys.stdout.buffer
    else:
        outfp = open(outfile, "wb")

    caching = not disable_caching
    for fname in files:
        with open(fname, "rb") as infp:
            rsrcmgr = PDFResourceManager_new(caching=caching)

            # Pass 1: plain text into the output stream.
            text_device = TextConverter(rsrcmgr,
                                        outfp,
                                        codec=codec,
                                        laparams=laparams,
                                        imagewriter=imagewriter)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, text_device)
                for page in PDFPage.get_pages(infp,
                                              page_numbers,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
            finally:
                text_device.close()

            # Pass 2: print every horizontal text box of every page. The
            # password is needed again because the document is re-opened.
            layout_device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, layout_device)
                for page in PDFPage.get_pages(infp,
                                              password=password,
                                              caching=caching):
                    interpreter.process_page(page)
                    # receive the LTPage object for the page.
                    layout = layout_device.get_result()
                    for element in layout:
                        if isinstance(element, LTTextBoxHorizontal):
                            print(element.get_text())
            finally:
                layout_device.close()

    return outfp