def pdf_to_html(path):
    """Render a PDF to HTML in memory and return its font table.

    Every page of the PDF is processed by an ``HTMLConverter`` (layout
    analysis with ``all_texts=True``) that writes into a ``BytesIO`` buffer.
    The buffer contents are then handed to ``extract_font_table``.

    Args:
        path: Path of the PDF file; it is opened in binary mode.

    Returns:
        Whatever ``extract_font_table`` returns for the generated HTML.
    """
    manager = PDFResourceManager()
    retstr = BytesIO()
    layout = LAParams(all_texts=True)
    device = HTMLConverter(manager, retstr, laparams=layout)
    interpreter = PDFPageInterpreter(manager, device)

    # The context manager releases the PDF handle even when a page fails to
    # process; previously an exception left the file open.
    with open(path, 'rb') as pdf_file:
        for page in PDFPage.get_pages(pdf_file, check_extractable=True):
            interpreter.process_page(page)

    # NOTE(review): the buffer is read *before* device.close(), so anything
    # the converter emits on close is not part of ``text``.  Left as-is
    # because what extract_font_table expects is not visible here.
    text = retstr.getvalue()

    device.close()
    retstr.close()

    return extract_font_table(text)
示例#2
0
def convertPDF(fname, pages=None):
    """Convert a PDF file to HTML and return the generated markup.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers handed to
            ``PDFPage.get_pages``; for a falsy value an empty set is passed.

    Returns:
        The contents of the ``StringIO`` buffer the converter wrote to.
    """
    pagenos = set(pages) if pages else set()
    caching = True
    rotation = 0  # extra rotation, in degrees, applied to every page

    outfp = StringIO()
    rsrcmgr = PDFResourceManager(caching=caching)
    device = HTMLConverter(rsrcmgr,
                           outfp,
                           codec='utf-8',
                           scale=1,
                           layoutmode='normal',
                           laparams=LAParams(),
                           imagewriter=None)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # open() replaces the Python-2-only file() builtin, and the context
    # manager closes the PDF even if processing a page raises.
    with open(fname, 'rb') as fp:
        for page in PDFPage.get_pages(fp,
                                      pagenos,
                                      maxpages=0,
                                      password='',
                                      caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
    device.close()

    text = outfp.getvalue()
    outfp.close()
    return text
示例#3
0
    def convert_pdf_to_html(self):
        """Convert the repository's temporary PDF to HTML and keep it on ``self.text``.

        Reads ``self.path + 'parliament/repository/temp.pdf'``, processes every
        page through an ``HTMLConverter`` that writes into a ``BytesIO``
        buffer, and assigns the buffer contents to ``self.text``.  Nothing is
        returned.
        """
        rsrcmgr = PDFResourceManager()
        retstr = BytesIO()
        device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Context manager: the PDF handle is released even if a page fails.
        with open(self.path + 'parliament/repository/temp.pdf', 'rb') as fp:
            for page in PDFPage.get_pages(fp,
                                          set(),
                                          maxpages=0,
                                          password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)

        device.close()
        self.text = retstr.getvalue()
        retstr.close()
def pdf_to_text(path):
    """Extract cleaned text from a PDF via an intermediate HTML rendering.

    The PDF is rendered with ``HTMLConverter`` into a ``BytesIO`` buffer, the
    markup is parsed with BeautifulSoup, and the parsed document's ``.text``
    is passed through ``clean_string``.

    Args:
        path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The result of ``clean_string`` applied to the parsed HTML's text.
    """
    manager = PDFResourceManager()
    retstr = BytesIO()
    layout = LAParams(all_texts=True)
    device = HTMLConverter(manager, retstr,
                           layoutmode='normal', laparams=layout,
                           imagewriter=None)
    interpreter = PDFPageInterpreter(manager, device)

    # Context manager: the PDF handle is released even if a page fails.
    with open(path, 'rb') as pdf_file:
        for page in PDFPage.get_pages(pdf_file, check_extractable=True):
            interpreter.process_page(page)

    # NOTE(review): read before device.close() on purpose -- output the
    # converter adds on close would otherwise end up in the extracted text.
    text = retstr.getvalue()
    device.close()
    retstr.close()

    from BeautifulSoup import BeautifulSoup
    parsed_html = BeautifulSoup(text)
    return clean_string(parsed_html.text)
示例#5
0
    def convert_pdf_to_html(self, fname, pages=None, skip_first=True):
        """Convert a PDF file to HTML, optionally skipping the leading pages.

        Progress (the index of each converted page) is printed to stdout.

        Args:
            fname: Path of the PDF file; it is opened in binary mode.
            pages: Optional iterable of page numbers handed to
                ``PDFPage.get_pages``; for a falsy value an empty set is
                passed.
            skip_first: When true, the pages at enumeration indexes 0 and 1
                (the first two pages yielded) are not converted.

        Returns:
            The contents of the ``io.BytesIO`` buffer the converter wrote to.
        """
        pagenums = set(pages) if pages else set()
        manager = PDFResourceManager()
        output = io.BytesIO()
        converter = HTMLConverter(manager, output, codec='utf-8',
                                  laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, converter)

        # Context manager: the PDF handle is released even if a page fails.
        with open(fname, 'rb') as infile:
            print('Processing Page # :', end=' ')
            pdf_pages = PDFPage.get_pages(infile, pagenums, caching=True,
                                          check_extractable=True)
            for i, page in enumerate(pdf_pages):
                if skip_first and i in (0, 1):
                    continue
                print(i, end=',')
                interpreter.process_page(page)

        # Close the converter *before* reading the buffer: HTMLConverter
        # writes the document footer on close, which the old order dropped.
        converter.close()
        convertedPDF = output.getvalue()
        output.close()
        return convertedPDF
def extract_pdf_page(filename):
    """Convert a PDF to HTML, save it under ``HTML_PATH`` and return the markup.

    The output lands in ``HTML_PATH/<stem>/<stem>.html`` where ``<stem>`` is
    the PDF's file name without extension; the folder is created if needed.

    Args:
        filename: Path of the PDF file; it is opened in binary mode.

    Returns:
        The contents of the ``io.StringIO`` buffer the converter wrote to.
    """
    stem = Path(filename).stem
    # Destination folder and file for the generated HTML.
    target_dir = Path(HTML_PATH, stem)
    target_dir.mkdir(parents=True, exist_ok=True)
    target_path = target_dir / (stem + ".html")

    # NOTE(review): the converter gets a text buffer without an explicit
    # codec -- confirm the installed pdfminer version writes str here.
    buffer = io.StringIO()
    resources = PDFResourceManager()
    converter = HTMLConverter(resources, buffer, laparams=LAParams())

    # Render every page of the PDF into the buffer.
    with open(filename, 'rb') as pdf:
        renderer = PDFPageInterpreter(resources, converter)
        for pdf_page in PDFPage.get_pages(pdf, caching=True,
                                          check_extractable=True):
            renderer.process_page(pdf_page)
    converter.close()

    html = buffer.getvalue()
    with open(target_path, 'w', encoding="utf-8") as out:
        out.write(html)
    buffer.close()

    return html
示例#7
0
def pdf_to_html(scraped_pdf_data):
    """Convert PDF data held in memory to HTML and return the markup.

    The data is wrapped in a ``StringIO`` object, fed to pdfminer's
    ``process_pdf`` with an ``HTMLConverter`` (scale 2, custom layout
    margins), and the converter's output buffer is returned.

    Args:
        scraped_pdf_data: The PDF content to convert.

    Returns:
        The contents of the output ``StringIO`` buffer.
    """
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    from pdfminer.pdfdevice import PDFDevice
    from pdfminer.converter import HTMLConverter
    from pdfminer.layout import LAParams

    import StringIO

    # File-like wrapper around the PDF data, rewound to the beginning.
    pdf_stream = StringIO.StringIO()
    pdf_stream.write(scraped_pdf_data)
    pdf_stream.seek(0)
    html_stream = StringIO.StringIO()

    resources = PDFResourceManager()
    layout = LAParams(char_margin=0.5,
                      line_margin=0.5,
                      word_margin=0.3,
                      boxes_flow=0)
    converter = HTMLConverter(resources, html_stream,
                              layoutmode='normal', scale=2, laparams=layout)
    process_pdf(resources, converter, pdf_stream)
    converter.close()

    html = html_stream.getvalue()
    html_stream.close()
    pdf_stream.close()
    return html
示例#8
0
def pdfTotxt(filepath, outpath):
    """Convert the PDF at ``filepath`` and write the result to ``outpath``.

    Despite the name, the output is produced by ``HTMLConverter``.  Any
    exception raised along the way is caught and reported on stdout instead
    of being propagated.

    Args:
        filepath: Path of the source PDF; it is opened in binary mode.
        outpath: Path of the file the converter output is written to.
    """
    try:
        # Both handles are closed on every exit path, including errors;
        # open() replaces the Python-2-only file() builtin.
        with open(filepath, 'rb') as fp, open(outpath, 'w') as outfp:
            # Resource manager that stores shared resources;
            # caching=False turns caching off.
            rsrcmgr = PDFResourceManager(caching=False)
            # Device object that renders the pages.
            device = HTMLConverter(rsrcmgr,
                                   outfp,
                                   codec='utf-8',
                                   laparams=LAParams(),
                                   imagewriter=None)
            # Interpreter that feeds page content to the device.
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp,
                                          pagenos=set(),
                                          maxpages=0,
                                          password='',
                                          caching=False,
                                          check_extractable=True):
                page.rotate = page.rotate % 360
                interpreter.process_page(page)
            # Finish the converter while the output file is still open.
            device.close()
    except Exception as e:
        # Best-effort conversion: report the failure rather than raise.
        # The message is now actually formatted into the string.
        print("Exception:%s" % e)
示例#9
0
 def html(self):
     """Convert this object's PDF to HTML via a cache file and return the markup.

     The PDF named by ``self.__filename`` is rendered into a uniquely named
     file under ``cache/html/``, which is then read back as UTF-8 text.
     Errors raised while processing pages are suppressed, so the result may
     be incomplete.

     Returns:
         ``None`` when ``self.__filename`` is not an existing file.
         Otherwise the text read back from the cache file, or ``True`` if
         the cache file is no longer present when it is read back.
     """
     if not os.path.isfile(self.__filename):
         return None

     output_file = 'cache/html/' + str(uuid.uuid4()) + '.html'
     # exist_ok avoids the check-then-create race of a separate exists() test.
     os.makedirs(os.path.dirname(output_file), exist_ok=True)

     # Both handles are released on every exit path.
     with open(output_file, 'wb') as outfp, \
             open(self.__filename, 'rb') as fp:
         device = HTMLConverter(PDFResourceManager(),
                                outfp,
                                codec='utf-8',
                                laparams=LAParams(),
                                layoutmode='normal',
                                text_colors={})
         rsrcmgr = device.rsrcmgr if hasattr(device, 'rsrcmgr') else None
         if rsrcmgr is None:
             # Fall back to a fresh manager shared with the interpreter.
             rsrcmgr = PDFResourceManager()
         try:
             interpreter = PDFPageInterpreter(rsrcmgr, device)
             for page in PDFPage.get_pages(fp, None, maxpages=0):
                 interpreter.process_page(page)
         except Exception:
             # Deliberately best-effort: keep whatever was rendered so far.
             # (Narrowed from a bare except so Ctrl-C still works.)
             pass
         device.close()

     html = True
     if os.path.isfile(output_file):
         # Context manager: the read-back handle used to be left open.
         with open(output_file, "r", encoding='utf-8') as cached:
             html = cached.read()
     return html
示例#10
0
def parse_html(file_name):
    """Convert a PDF to HTML and return the text collected by ``TextReciver``.

    Args:
        file_name: Path of the PDF file; it is opened in binary mode.

    Returns:
        The ``text`` attribute of the ``TextReciver`` instance the
        converter wrote to.
    """
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    imagewriter = None
    rotation = 0
    codec = 'utf-8'
    caching = True

    rsrcmgr = PDFResourceManager(caching=caching)
    outfp = TextReciver()
    device = HTMLConverter(rsrcmgr, outfp, codec=codec, laparams=LAParams(),
                           imagewriter=imagewriter)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # open() replaces the Python-2-only file() builtin; the context manager
    # closes the PDF even if processing a page raises.
    with open(file_name, 'rb') as fp:
        for page in PDFPage.get_pages(fp, pagenos,
                                      maxpages=maxpages, password=password,
                                      caching=caching, check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
    device.close()
    return outfp.text
示例#11
0
def process_pdf(in_path, out_path):
    """
    Processes a PDF and extracts its contents to HTML.

    Args:
        in_path: The full path to the source PDF file.
        out_path: The full path to the destination HTML file.
    """
    page_numbers = set()

    # Source/destination handles; the context manager closes both even when
    # conversion fails.  open() replaces the Python-2-only file() builtin.
    with open(in_path, 'rb') as in_file, open(out_path, 'w') as out_file:
        # Set up the resource manager, device, and interpreter
        res_mgr = PDFResourceManager()
        device = HTMLConverter(res_mgr, out_file, codec='utf-8',
                               laparams=LAParams(), imagewriter=None)
        interpreter = PDFPageInterpreter(res_mgr, device)

        for page in PDFPage.get_pages(in_file, page_numbers,
                                      maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)

        # Finish the converter while the destination file is still open.
        device.close()
def convert_pdf_to_html(path):
    """Convert the PDF at ``path`` to HTML and return the markup.

    Layout analysis uses ``char_margin=3.5`` and ``all_texts=True``.

    Args:
        path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The contents of the ``StringIO`` buffer the converter wrote to.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams(char_margin=3.5, all_texts=True)
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # open() replaces the Python-2-only file() builtin; the context manager
    # closes the PDF even if processing a page raises.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp,
                                      set(),
                                      maxpages=0,  # is for all
                                      password="",
                                      caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)
    device.close()

    # Named ``html`` so the builtin ``str`` is no longer shadowed.
    html = retstr.getvalue()
    retstr.close()
    return html
示例#13
0
def convert_pdf_to_html(path):
    """Convert the PDF at ``path`` to HTML and return it as text.

    Args:
        path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The converter output, decoded with the same codec (UTF-8) that was
        given to the converter.
    """
    codec = 'utf-8'
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = HTMLConverter(rsrcmgr, retstr, codec=codec, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Context manager: the PDF handle is released even if a page fails.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp,
                                      set(),
                                      maxpages=0,  # is for all
                                      password="",
                                      caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)
    device.close()

    html_bytes = retstr.getvalue()
    retstr.close()
    # str() of a bytes object yields its "b'...'" repr on Python 3, not the
    # markup itself, so decode explicitly instead.
    return html_bytes.decode(codec)
示例#14
0
def pdftohtml(page):
    """Render a single PDF page to an HTML string.

    Images are not written anywhere: the image writer given to the converter
    looks each image up through ``db.get_imgbyhash`` using the MD5 of its raw
    stream data and returns a reference string instead.

    Args:
        page: The page object to pass to ``PDFPageInterpreter.process_page``.

    Returns:
        The converter output decoded as UTF-8.
    """
    output = BytesIO()
    manager = PDFResourceManager()

    class imagewriter(object):
        """Image-writer stand-in that maps images to database references."""

        @staticmethod
        def export_image(img):
            """Return ``"<tabname>.<id>"`` for a known image, else ``"undefined"``."""
            if not img.stream:
                return "undefined"

            imhash = md5(img.stream.get_rawdata()).hexdigest()
            imgobj = db.get_imgbyhash(imhash)

            # Compare by value: ``is not`` against a string literal tests
            # object identity, which is not a reliable equality check.
            if imgobj != "undefined":
                return imgobj["tabname"] + "." + str(imgobj["id"])
            return "undefined"

    converter = HTMLConverter(manager,
                              output,
                              laparams=LAParams(),
                              imagewriter=imagewriter)
    interpreter = PDFPageInterpreter(manager, converter)

    interpreter.process_page(page)
    converter.close()
    text = output.getvalue().decode("utf-8")
    output.close()

    return text
    def read_pages(self, path, html=False,
                   laparams=None, maxpages=0, page_numbers=None,
                   password="", scale=1.0, rotation=0, layoutmode='normal',
                   output_dir=None, strip_control=False, debug=False,
                   disable_caching=False, **kwargs):
        """Convert a PDF page by page and return one string per page.

        Each page gets its own ``StringIO`` buffer and its own converter:
        ``HTMLConverter`` when ``html`` is true, ``TextConverter`` otherwise.

        NOTE(review): ``maxpages``, ``page_numbers``, ``password``,
        ``output_dir``, ``strip_control``, ``debug``, ``disable_caching`` and
        ``**kwargs`` are accepted but not used by this body -- all pages are
        always read with caching on.  Confirm whether that is intended.

        Args:
            path: Path of the PDF file; it is opened in binary mode.
            html: Selects HTML output instead of plain text.
            laparams: Passed to the converter as ``laparams``.
            scale: Passed to ``HTMLConverter`` (HTML mode only).
            rotation: Degrees added to each page's rotation, modulo 360.
            layoutmode: Passed to ``HTMLConverter`` (HTML mode only).

        Returns:
            A list with the buffer contents for each page, in page order.
        """
        resources = PDFResourceManager(caching=True)
        rendered = []
        with open(path, "rb") as pdf:
            for pdf_page in PDFPage.get_pages(pdf, None, maxpages=0,
                                              check_extractable=True):
                pdf_page.rotate = (pdf_page.rotate + rotation) % 360
                buf = StringIO()
                if not html:
                    converter = TextConverter(resources, buf, codec=None,
                                              laparams=laparams)
                else:
                    converter = HTMLConverter(resources, buf, codec=None,
                                              scale=scale,
                                              layoutmode=layoutmode,
                                              laparams=laparams)

                PDFPageInterpreter(resources, converter).process_page(pdf_page)
                # The page text is captured before the converter is closed.
                rendered.append(buf.getvalue())
                converter.close()

        return rendered
def readPDF(pdfFile):
    """Convert an already-open PDF file object to HTML.

    Prints the progress markers ``stage1`` .. ``stage4`` to stdout while it
    runs.  The file object itself is not closed here.

    Args:
        pdfFile: File object handed to ``PDFPage.get_pages``.

    Returns:
        The contents of the ``BytesIO`` buffer the converter wrote to.
    """
    resources = PDFResourceManager()
    layout = LAParams()

    html_buffer = BytesIO()
    print("stage1")
    html_device = HTMLConverter(resources, html_buffer, codec='utf-8',
                                laparams=layout)

    renderer = PDFPageInterpreter(resources, html_device)
    print("stage2")
    pdf_pages = PDFPage.get_pages(pdfFile, set(), maxpages=0, password="",
                                  caching=True, check_extractable=True)
    for pdf_page in pdf_pages:
        renderer.process_page(pdf_page)
    html_device.close()
    print("stage3")
    convertedPDF = html_buffer.getvalue()
    print("stage4")
    html_buffer.close()
    return convertedPDF
示例#17
0
    def pdf_para_html(self, path):
        """Convert the PDF at ``path`` to HTML and return the markup.

        The pdfminer and ``cStringIO`` imports are done inside the method.

        Args:
            path: Path of the PDF file; it is opened in binary mode.

        Returns:
            The contents of the ``StringIO`` buffer the converter wrote to.
        """
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
        from pdfminer.converter import HTMLConverter
        from pdfminer.layout import LAParams
        from pdfminer.pdfpage import PDFPage
        from cStringIO import StringIO

        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # open() replaces the file() builtin; the context manager closes the
        # PDF even if processing a page raises.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(),
                                          maxpages=0,  # is for all
                                          password="", caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        device.close()

        # Named ``html`` so the builtin ``str`` is no longer shadowed.
        html = retstr.getvalue()
        retstr.close()
        return html
def pdf_to_text(path):
    """Convert a PDF to HTML, save it to ``Output.txt`` and return it.

    Despite the name, the output is produced by ``HTMLConverter``.

    Args:
        path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The contents of the ``BytesIO`` buffer the converter wrote to.
    """
    manager = PDFResourceManager(caching=True)
    retstr = BytesIO()
    device = HTMLConverter(manager, retstr, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, device)

    # Context manager: the PDF handle is released even if a page fails.
    with open(path, 'rb') as filepath:
        for page in PDFPage.get_pages(filepath,
                                      set(),
                                      maxpages=0,
                                      caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)
    device.close()

    text = retstr.getvalue()
    retstr.close()

    # Write the raw bytes: str() of a bytes object produces its "b'...'"
    # repr on Python 3, which is what used to land in the file.
    with open("Output.txt", "wb") as text_file:
        text_file.write(text)
    return text
def convertPDFToHTMLPage(bookPath):
    """Convert a PDF to an HTML file written next to it.

    The output path is ``bookPath`` with its extension replaced by
    ``.html``.  The output path is printed when the conversion is done;
    nothing is returned.

    Args:
        bookPath: Path of the PDF file; it is opened in binary mode.
    """
    import os

    rotation = 0  # extra rotation, in degrees, applied to every page

    # splitext swaps only the real extension.  The former
    # ``replace('.pdf', '.html')`` left the path unchanged when it did not
    # contain '.pdf', so the input itself was opened for writing and wiped.
    outfile = os.path.splitext(bookPath)[0] + '.html'

    # open() replaces the Python-2-only file() builtin; both handles are
    # closed even if conversion fails.
    with open(bookPath, 'rb') as fp, open(outfile, 'w') as outfp:
        rsrcmgr = PDFResourceManager()
        device = HTMLConverter(rsrcmgr,
                               outfp,
                               codec='utf-8',
                               scale=1,
                               layoutmode='normal',
                               laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, password="",
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)

        # Finish the converter while the output file is still open.
        device.close()

    # Same text the old print statement produced (two spaces after ':').
    print("HTML output written to :  %s" % outfile)
示例#20
0
文件: castorama.py 项目: JdeH/pytigon
def convert(fp):
    """Convert an open PDF file object to HTML and return the markup.

    The output is collected in a ``StringIO2`` buffer whose ``encoding``
    attribute is set to ``'utf-8'``.  ``fp`` is not closed here.

    Args:
        fp: File object handed to ``process_pdf``.

    Returns:
        The contents of the ``StringIO2`` buffer.
    """
    resources = PDFResourceManager(caching=False)

    html_buffer = StringIO2()
    html_buffer.encoding = 'utf-8'

    html_device = HTMLConverter(resources, html_buffer,
                                scale=1, layoutmode='normal',
                                laparams=LAParams(),
                                outdir=None, debug=False)

    # An empty page set is passed, together with maxpages=0.
    process_pdf(resources, html_device, fp, set(),
                maxpages=0, password='',
                caching=False, check_extractable=True)
    html_device.close()

    return html_buffer.getvalue()
示例#21
0
def convertPDF(fname, pages=None):
    """Render a PDF file as HTML and hand back the markup.

    Args:
        fname: Path of the PDF file; it is opened in binary mode.
        pages: Optional iterable of page numbers for
            ``PDFPage.get_pages``; an empty set is used when it is falsy.

    Returns:
        The contents of the ``StringIO`` buffer the converter wrote to.
    """
    if pages:
        pagenos = set(pages)
    else:
        pagenos = set()
    caching = True
    rotation = 0  # degrees added to each page's own rotation

    outfp = StringIO()
    rsrcmgr = PDFResourceManager(caching=caching)
    device = HTMLConverter(rsrcmgr, outfp, codec='utf-8', scale=1,
                           layoutmode='normal', laparams=LAParams(),
                           imagewriter=None)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # ``file()`` only exists on Python 2; ``open()`` in a with-block also
    # guarantees the handle is released when a page raises.
    with open(fname, 'rb') as fp:
        for page in PDFPage.get_pages(fp, pagenos,
                                      maxpages=0, password='',
                                      caching=caching, check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
    device.close()

    text = outfp.getvalue()
    outfp.close()
    return text
示例#22
0
 def readText(self,path, outtype='text', opts={}):
     """Convert the PDF at ``path`` to HTML and print the result.

     The layout parameters and then the generated markup are printed to
     stdout; nothing is returned.  The class-level ``debug`` attributes of
     ``CMapDB``, ``PDFResourceManager``, ``PDFPageInterpreter`` and
     ``PDFDevice`` are set to ``self.debug`` as a side effect.

     Args:
         path: Path of the PDF file; it is opened in binary mode.
         outtype: Accepted for compatibility; it has no effect on the
             conversion performed here.
         opts: Iterable of ``(flag, value)`` pairs, or a dict mapping flag
             to value.  Flags that influence the conversion:
             ``-p`` (comma-separated 1-based page numbers), ``-m``
             (maxpages), ``-P`` (password), ``-c`` (codec), ``-n`` (no
             layout analysis) and ``-A``/``-V``/``-M``/``-L``/``-W``/``-F``
             (layout parameters).  ``-d``, ``-o``, ``-Y``, ``-O``, ``-t``
             and ``-s`` are accepted but have no effect.  The default is
             never mutated.
     """
     pagenos = set()
     maxpages = 0
     # Defined up front: it used to be bound only by ``-P``, so every call
     # without that flag failed with a NameError further down.
     password = ''
     codec = 'utf-8'
     laparams = LAParams()

     # A dict iterates over its keys only, which broke the (k, v) unpacking.
     pairs = opts.items() if isinstance(opts, dict) else opts
     for (k, v) in pairs:
         if k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
         elif k == '-m': maxpages = int(v)
         elif k == '-P': password = v
         elif k == '-n': laparams = None
         elif k == '-A': laparams.all_texts = True
         elif k == '-V': laparams.detect_vertical = True
         elif k == '-M': laparams.char_margin = float(v)
         elif k == '-L': laparams.line_margin = float(v)
         elif k == '-W': laparams.word_margin = float(v)
         elif k == '-F': laparams.boxes_flow = float(v)
         elif k == '-c': codec = v
         elif k in ('-d', '-o', '-Y', '-O', '-t', '-s'):
             # Recognised for compatibility; nothing below consumes them.
             pass
     print(laparams)

     CMapDB.debug = self.debug
     PDFResourceManager.debug = self.debug
     PDFPageInterpreter.debug = self.debug
     PDFDevice.debug = self.debug

     rsrcmgr = PDFResourceManager()
     outfp = StringIO()
     device = HTMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
     # open() replaces the Python-2-only file() builtin; the context manager
     # closes the PDF even if conversion fails.
     with open(path, 'rb') as fp:
         process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                     password=password, check_extractable=True)
     device.close()
     print(outfp.getvalue())
     outfp.close()
     return
示例#23
0
def convert_pdf_to_html(input, output="temp.html"):
    """
    Convert a PDF file to HTML.

    The class-level ``debug`` attributes of ``PDFDocument``, ``PDFParser``,
    ``CMapDB`` and ``PDFPageInterpreter`` are set to 0 as a side effect.

    :param input: PDF File to be converted,
    :param output:  output filename, default is temp.html; when it is falsy
        the HTML is written to ``sys.stdout`` instead
    :return: doesn't return anything
    """
    debug = 0
    password = b''
    pagenos = set()
    maxpages = 0
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    encoding = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    if output:
        outfp = open(output, 'w', encoding=encoding)
    else:
        outfp = sys.stdout

    try:
        # NOTE(review): no ``codec`` is passed while ``outfp`` is a text
        # stream -- confirm the installed pdfminer version writes str here.
        device = HTMLConverter(rsrcmgr,
                               outfp,
                               scale=scale,
                               layoutmode=layoutmode,
                               laparams=laparams,
                               imagewriter=imagewriter,
                               debug=debug)

        with open(input, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp,
                                          pagenos,
                                          maxpages=maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        device.close()
    finally:
        # Close only what this function opened: closing sys.stdout would
        # break every later write to stdout in the process.  The finally
        # also releases the output file when conversion fails.
        if outfp is not sys.stdout:
            outfp.close()
    return
def lerPDF(arquivo):
    """Convert an open PDF file object to HTML and return the markup.

    ``arquivo`` is handed to ``process_pdf`` and is not closed here.

    Args:
        arquivo: File object of the PDF to convert.

    Returns:
        The contents of the ``StringIO`` buffer the converter wrote to.
    """
    html_buffer = StringIO()
    resources = PDFResourceManager()
    converter = HTMLConverter(resources, html_buffer, laparams=LAParams())

    process_pdf(resources, converter, arquivo)
    converter.close()

    html = html_buffer.getvalue()
    html_buffer.close()
    return html
示例#25
0
 def to_html(self, fp):
     """Render ``fp`` to HTML through ``self._process`` and return the markup.

     The converter is configured from ``self.options`` (codec, scale,
     layoutmode, laparams) and shares ``self.resmgr``.

     Args:
         fp: Passed unchanged to ``self._process`` together with the
             converter.

     Returns:
         The contents of the ``StringIO`` buffer the converter wrote to.
     """
     html_buffer = StringIO.StringIO()
     resources = self.resmgr
     settings = self.options
     converter = HTMLConverter(resources, html_buffer,
                               codec=settings.codec,
                               scale=settings.scale,
                               layoutmode=settings.layoutmode,
                               laparams=settings.laparams,
                               outdir=None)
     self._process(fp, converter)
     converter.close()

     markup = html_buffer.getvalue()
     html_buffer.close()
     return markup
示例#26
0
 def to_html(self, fp):
     """Produce HTML for ``fp`` and return it as the buffer's contents.

     An ``HTMLConverter`` bound to ``self.resmgr`` is created with the
     codec, scale, layout mode and layout parameters found on
     ``self.options``; ``self._process`` then drives it with ``fp``.

     Args:
         fp: Forwarded to ``self._process``.

     Returns:
         Whatever the ``StringIO`` output buffer holds once the converter
         has been closed.
     """
     sink = StringIO.StringIO()
     html_device = HTMLConverter(
         self.resmgr,
         sink,
         codec=self.options.codec,
         scale=self.options.scale,
         layoutmode=self.options.layoutmode,
         laparams=self.options.laparams,
         outdir=None,
     )
     self._process(fp, html_device)
     html_device.close()
     # Read the buffer before releasing it.
     rendered = sink.getvalue()
     sink.close()
     return rendered
示例#27
0
    def transform_file(self, pdfpath):
        """Convert a PDF to HTML and wrap the result in a stub document dict.

        The path is logged at debug level through ``self.LOGGER``.  Layout
        analysis uses ``self.laparams``.  Exceptions are not propagated; they
        are reported inside the returned dict.

        Args:
            pdfpath: Path of the PDF file; it is opened in binary mode.

        Returns:
            On success a dict with ``"markup"`` (``innerHTML`` and
            ``innerText``), ``"workflow"``, ``"__text_size"`` (length of the
            parsed text) and ``"timestamp"``.  On any exception a dict with
            ``"error"`` (the exception message), ``"workflow"`` and
            ``"__text_size"`` set to -1.
        """
        try:
            self.LOGGER.debug(pdfpath)
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                                   laparams=self.laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # The with-block closes the PDF even when a page raises; since
            # the exception is swallowed below, the handle used to stay
            # open.  open() also replaces the file() builtin.
            with open(pdfpath, 'rb') as fp:
                # NOTE check_extractable seems to allow overriding text
                # extraction locks
                for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                              password="", caching=True,
                                              check_extractable=False):
                    interpreter.process_page(page)
            device.close()

            html = retstr.getvalue()
            # otherwise html is str at this point, not unicode
            html = html.decode('utf8')
            retstr.close()
            soup = BeautifulSoup(html)
            text_size = len(soup.text)
            stub_data = {
                "markup": {
                    "innerHTML": unicode(html),
                    "innerText": unicode(soup.text)
                },
                "workflow": {
                    "is_stub": True
                },
                "__text_size": text_size,
                # __fields are ignored by kibana
                "timestamp": datetime.now()
            }
        except Exception as e:
            # Deliberate catch-all: callers get an error stub, not a raise.
            stub_data = {
                "error": str(e),
                "workflow": {
                    "is_stub": True
                },
                "__text_size": -1
            }
        return stub_data
示例#28
0
def convert_pdf(path):
    """Convert the PDF at ``path`` to HTML and return the markup.

    Args:
        path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The contents of the ``StringIO`` buffer the converter wrote to.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())

    # open() replaces the Python-2-only file() builtin; the context manager
    # closes the PDF even if conversion fails.
    with open(path, 'rb') as fp:
        process_pdf(rsrcmgr, device, fp)
    device.close()

    # Named ``html`` so the builtin ``str`` is no longer shadowed.
    html = retstr.getvalue()
    retstr.close()
    return html
示例#29
0
def extract_price_from_pdf(file_name):
    """Find prices and their vertical positions in a PDF's HTML rendering.

    The PDF is converted with ``HTMLConverter`` into a ``TextReciver``; the
    collected text is then scanned for lines that carry position information
    and end with a price, together with directly following short lines that
    also end with a price.

    NOTE(review): the visible code builds ``pos_list`` (a list of
    ``(ypos, price)`` tuples) but never returns it -- confirm whether a
    ``return pos_list`` is missing.

    Args:
        file_name: Path of the PDF file; it is opened in binary mode.
    """
    pagenos = set()
    imagewriter = None
    rotation = 0
    codec = 'utf-8'
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    outfp = TextReciver()
    device = HTMLConverter(rsrcmgr, outfp, codec=codec, laparams=LAParams(),
                           imagewriter=imagewriter)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Read the file.  open() replaces the Python-2-only file() builtin and
    # the context manager closes the PDF even if a page fails.
    with open(file_name, 'rb') as fp:
        for page in PDFPage.get_pages(fp,
                                      pagenos,
                                      caching=True,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
    device.close()

    # Find all lines that end with a price and include position
    # information. Also find all following lines that include prices
    # but no new location (shorter 100 characters)
    matches = re.finditer(r'(.*left.*[0-9]{1,2}\.[0-9]{1,2} )'
                          r'(\n<br>.{0,100}[0-9]{1,2}\.[0-9]{1,2} *)*',
                          outfp.text)
    price_pattern = re.compile(r'[0-9]{1,2}\.[0-9]{1,2}')

    pos_list = []
    for m in matches:
        line_group = m.group().split('\n')

        # Extract the position information from the string
        pos_string = re.findall('(.*top:)([0-9]+)(px)', line_group[0])[0]
        ypos = pos_string[1]

        # Iterate over all lines and extract the price.  ``ypos`` carries
        # over between lines, so each line adds its index to the running
        # position.
        for i, price_text in enumerate(line_group):
            # Searching the reversed line makes the first hit the *last*
            # price on the line; the hit is flipped back before parsing.
            price = float(price_pattern.findall(price_text[::-1])[0][::-1])
            ypos = int(ypos) + i
            pos_list.append((ypos, price))
示例#30
0
def get_html_agenda_pdfminer(agendaloc):
    """Convert a PDF agenda to HTML using pdfminer.

    pdfminer doesn't give very clean output, so this is optional and the
    pdfminer imports are done inside the function instead of at module
    level.
    Probably better: pdftohtml -c -s -i -noframes abc.pdf abc.html

    Args:
        agendaloc: Local path of the PDF.  If opening it raises
            ``FileNotFoundError`` it is fetched with ``requests.get``
            instead.

    Returns:
        The contents of the ``io.BytesIO`` buffer the converter wrote to.
    """
    # Plain local imports; the previous try/except UnboundLocalError
    # construct ended up executing these on every call as well.
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.converter import HTMLConverter

    try:
        fh = open(agendaloc, 'rb')
    except FileNotFoundError:
        response = requests.get(agendaloc, stream=True)
        # response.raw supposedly gives a file handle,
        # but it's not seekable and pdfminer needs to seek.
        fh = io.BytesIO(response.content)

    try:
        resource_manager = PDFResourceManager()

        # The fake file object needs to be StringIO for TextConverter,
        # BytesIO for HTMLConverter.
        fake_file_handle = io.BytesIO()
        converter = HTMLConverter(resource_manager,
                                  fake_file_handle,
                                  laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resource_manager, converter)

        for page in PDFPage.get_pages(fh, caching=True,
                                      check_extractable=True):
            page_interpreter.process_page(page)

        # Close the converter *before* reading the buffer: HTMLConverter
        # writes the document footer on close, which the old order dropped.
        converter.close()
        text = fake_file_handle.getvalue()
        fake_file_handle.close()
    finally:
        # Release the input handle even when conversion fails.
        fh.close()

    return text
示例#31
0
def extract_text_from_pdf(pdf_path):
    """Extract a PDF's text and save it next to the PDF as a ``.txt`` file.

    The PDF is rendered to HTML in memory, the markup is reduced to text
    with ``html_text.extract_text``, and the text is written to
    ``pdf_path`` with its extension replaced by ``.txt``.  Nothing is
    returned.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.
    """
    import os

    resource_manager = PDFResourceManager()
    fake_file_handle = io.BytesIO()
    converter = HTMLConverter(resource_manager, fake_file_handle)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True,
                                      check_extractable=True):
            page_interpreter.process_page(page)
        # NOTE(review): read before converter.close() on purpose -- output
        # the converter adds on close would end up in the extracted text.
        html = fake_file_handle.getvalue().decode()
    converter.close()
    fake_file_handle.close()

    text = html_text.extract_text(html)

    # splitext swaps only the real extension.  The former
    # ``replace('.pdf', '.txt')`` left the path unchanged when it did not
    # contain '.pdf', so the PDF itself was overwritten with the text.
    txt_path = os.path.splitext(pdf_path)[0] + '.txt'
    # Context manager: the output handle used to leak if write() failed.
    with open(txt_path, 'w') as out_file:
        out_file.write(text)
示例#32
0
def extract_text_from_pdf(pdf_path):
    """Convert a PDF to HTML and return the converter output.

    Despite the name, the value returned is the ``HTMLConverter`` output,
    not plain text.  As a side effect the module-level global ``path`` is
    set to ``pdf_path``.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The contents of the ``io.BytesIO`` buffer when non-empty,
        otherwise ``None``.
    """
    global path
    path = pdf_path

    resource_manager = PDFResourceManager()
    fake_file_handle = io.BytesIO()
    converter = HTMLConverter(resource_manager, fake_file_handle)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True,
                                      check_extractable=True):
            page_interpreter.process_page(page)

    # Close the converter *before* reading the buffer: HTMLConverter writes
    # the document footer on close, which the old order dropped.
    converter.close()
    text = fake_file_handle.getvalue()
    fake_file_handle.close()

    if text:
        return text
    return None
def pdf_to_html(path):
    """Render a PDF file to HTML.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The bytes written by ``HTMLConverter`` for all pages, captured
        before the converter is closed.
    """
    manager = PDFResourceManager()
    retstr = BytesIO()
    device = HTMLConverter(manager, retstr,
                           laparams=LAParams(all_texts=True))
    try:
        interpreter = PDFPageInterpreter(manager, device)
        # The context manager closes the PDF even when a page fails.
        with open(path, 'rb') as pdf_file:
            for page in PDFPage.get_pages(pdf_file, check_extractable=True):
                interpreter.process_page(page)
        # Evaluated before the ``finally`` clause closes the buffer.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()
示例#34
0
def get_soup(codebook_path: str):
    """Render a PDF to HTML and parse the result with BeautifulSoup.

    Args:
        codebook_path: Path of the PDF file to read.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend,
        or None when the converter produced no output.
    """
    resource_manager = PDFResourceManager()
    # HTMLConverter encodes its output with the given codec, so the sink
    # has to be a bytes buffer; a StringIO rejects those writes.
    file_handle = io.BytesIO()
    converter = HTMLConverter(resource_manager, file_handle, codec='utf-8')
    try:
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        with open(codebook_path, 'rb') as file:
            for page in PDFPage.get_pages(file):
                page_interpreter.process_page(page)
        # Read before converter.close(), as in the original flow.
        text = file_handle.getvalue().decode('utf-8')
    finally:
        converter.close()
        file_handle.close()

    if text:
        return BeautifulSoup(text, features='html.parser')
    return None
示例#35
0
def convert(i, doc_list, path1, path2):
    """Convert one PDF from a list to an HTML file.

    Args:
        i: Index into ``doc_list`` of the document to convert.
        doc_list: Sequence of PDF file names.
        path1: Prefix concatenated in front of the input file name.
        path2: Prefix concatenated in front of the output file name.

    Returns:
        None. The HTML bytes are written to ``path2 + <html name>``.
    """
    filePDF = doc_list[i]
    print("Working with doc: {}".format(filePDF))
    # Replace only the last "pdf" (the extension); a plain str.replace would
    # also rewrite any "pdf" appearing earlier in the file name.
    fileHTML = 'html'.join(filePDF.rsplit('pdf', 1))
    pathin = path1 + filePDF
    pathout = path2 + fileHTML

    manager = PDFResourceManager()
    # BytesIO is used because the converter writes encoded bytes.
    output = io.BytesIO()
    converter = HTMLConverter(manager,
                              output,
                              codec='utf-8',
                              laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(manager, converter)
        with open(pathin, 'rb') as infile:
            # An empty page-number set selects every page.
            for page in PDFPage.get_pages(infile,
                                          set(),
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        convertedPDF = output.getvalue()
    finally:
        # Release the converter and buffer even if a page fails.
        converter.close()
        output.close()

    with open(pathout, "wb") as fileConverted:
        fileConverted.write(convertedPDF)

    print("Done with file: {} numbered: {}".format(fileHTML, i))
示例#36
0
def convert_pdf_to_html(path):
    """Render all pages of a PDF file to HTML.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The content of the output buffer after the converter was closed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    try:
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # open() replaces the Python-2-only file() builtin.
            with open(path, 'rb') as fp:
                pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)
        finally:
            # Closed before reading the buffer, as in the original order.
            device.close()
        html = retstr.getvalue()
    finally:
        retstr.close()
    return html
    def convert_pdf_to_txt(self):
        """Render every file in ``self.allfiles`` to HTML text.

        Each name in ``self.allfiles`` is resolved against ``self.path`` and
        converted with ``HTMLConverter``. Progress information is printed
        for every document.

        Returns:
            A list with one decoded (UTF-8) HTML string per input file, in
            the order of ``self.allfiles``.
        """
        pdf_folder = Path(self.path)
        codec = 'utf-8'
        list_txt = list()
        for files in self.allfiles:
            rsrcmgr = PDFResourceManager()
            retstr = BytesIO()
            device = HTMLConverter(rsrcmgr,
                                   retstr,
                                   codec=codec,
                                   laparams=LAParams())
            try:
                print(files)
                # The context manager closes the PDF even if a page fails.
                with open(pdf_folder / files, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp,
                                                  set(),
                                                  maxpages=0,
                                                  password="",
                                                  caching=True,
                                                  check_extractable=True):
                        interpreter.process_page(page)

                text = retstr.getvalue().decode('utf-8')
                print("Longueur du doc : ")
                print(len(text))
                print("Début du doc : ")
                # NOTE(review): the slice starts at index 1, so the first
                # character is not shown - confirm this is intended.
                print(text[1:100])
                list_txt.append(text)
                print(len(list_txt))
            finally:
                device.close()
                retstr.close()

        return list_txt
示例#38
0
def convert(fname, pages=None):
    """Render selected pages of a PDF file to HTML.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers passed to
            ``PDFPage.get_pages``; a falsy value selects an empty set.

    Returns:
        The content of the output buffer after the converter was closed.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = HTMLConverter(manager, output, laparams=LAParams())
    try:
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # open() replaces the Python-2-only file() builtin.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        return output.getvalue()
    finally:
        # The original wrote ``output.close`` without parentheses, which
        # never closed the buffer.
        output.close()
示例#39
0
def parse(in_stream, out_stream):
    """Convert a PDF file to an HTML file using ``process_pdf``.

    Args:
        in_stream: Path of the PDF file, opened in binary mode.
        out_stream: Path of the HTML file, opened as UTF-8 text with
            encoding errors ignored.

    Returns:
        ``out_stream`` unchanged.
    """
    debug = False
    codec = 'utf-8'
    caching = True
    if debug:
        set_debug_logging()

    rsrcmgr = PDFResourceManager(caching=caching)
    with io.open(out_stream, 'wt', encoding=codec, errors='ignore') as outfp:
        device = HTMLConverter(rsrcmgr,
                               outfp,
                               scale=1,
                               layoutmode='normal',
                               laparams=LAParams(),
                               outdir=None,
                               debug=debug)
        try:
            with io.open(in_stream, 'rb') as fp:
                # Empty page set and maxpages=0 are the original settings.
                process_pdf(rsrcmgr,
                            device,
                            fp,
                            set(),
                            maxpages=0,
                            password='',
                            caching=caching,
                            check_extractable=True)
        finally:
            # Close the device while the output file is still open.
            device.close()
    return out_stream
示例#40
0
def convert_pdf_to_html(path, save=True):
    """Render a PDF to HTML and either save it or return it.

    Args:
        path: Path of the PDF file to read.
        save: When true, write the HTML to ``get_html_path(path)`` and
            return that path; otherwise return the HTML itself.

    Returns:
        The saved file path, the HTML text, or None when the PDF raised
        ``PDFSyntaxError`` or had fewer than two pages.
    """
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import HTMLConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfparser import PDFSyntaxError
    # works with PDFMiner version 20140328
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = HTMLConverter(rsrcmgr, retstr, codec="utf-8",
                           laparams=LAParams())
    pagecount = 0
    try:
        try:
            with open(path, "rb") as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # maxpages=0 ensures all pages are processed.
                pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True)
                for page in pages:
                    pagecount += 1
                    interpreter.process_page(page)
        except PDFSyntaxError:
            print("Invalid PDF %s" % (path,))
            return None
        finally:
            # Runs on both paths, so the device no longer leaks when the
            # PDF is rejected.
            device.close()
        text = retstr.getvalue()
    finally:
        retstr.close()

    # NOTE(review): this threshold also rejects single-page PDFs - confirm
    # that this is intended.
    if pagecount < 2:
        print("No content! %s" % (path,))
        return None
    if save:
        savepath = get_html_path(path)
        with open(savepath, "w") as out:
            out.write(text)
        return savepath
    return text
示例#41
0
def extract_price_from_pdf(file_name):
    """Extract price values from the HTML rendering of a PDF.

    The PDF is rendered with ``HTMLConverter`` into a ``TextReciver`` and
    the collected ``text`` is scanned for lines that contain ``left`` and
    end in a price-like number, plus any following ``<br>`` lines that also
    end in one. Each price is paired with a vertical position derived from
    the ``top:<n>px`` value of the first line of its group.

    Args:
        file_name: Path of the PDF file to read.

    Returns:
        A tuple of prices (floats) ordered by their computed position, or
        an empty tuple when no price was found.
    """
    rotation = 0
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    outfp = TextReciver()
    device = HTMLConverter(rsrcmgr, outfp, codec='utf-8',
                           laparams=LAParams(), imagewriter=None)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # open() replaces the Python-2-only file() builtin.
        with open(file_name, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(),
                                          maxpages=0, password='',
                                          caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
    finally:
        device.close()

    # Raw strings keep the patterns identical while avoiding invalid
    # escape sequences such as "\." in ordinary string literals.
    matches = re.finditer(
        r'(.*left.*[0-9]{1,2}\.[0-9]{1,2} )'
        r'(\n<br>.{0,100}[0-9]{1,2}\.[0-9]{1,2} *)*',
        outfp.text)
    pos_list = []
    for m in matches:
        line_group = m.group().split('\n')
        # Searching the reversed string picks the last number of the match,
        # i.e. the digits directly before "px".
        top_match = re.findall(r'.*top:[0-9]+px', line_group[0])[0]
        ypos = re.findall(r'[0-9]+', top_match[::-1])[0][::-1]
        for i, price in enumerate(line_group):
            if len(price):
                # Same reversal trick: take the last price-like number.
                p = float(re.findall(r'[0-9]{1,2}\.[0-9]{1,2}',
                                     price[::-1])[0][::-1])
                ypos = int(ypos) + i

                pos_list.append((ypos, p))

    if not pos_list:
        # zip(*[]) yields nothing, so the unpacking below would fail.
        return ()

    pos_list.sort()
    pos, price_list = zip(*pos_list)

    return price_list
示例#42
0
 def get_html(self, path):  # Pulls html from PDF instead of plain text
     """Render all pages of a PDF file to HTML.

     Args:
         path: Path of the PDF file; ".pdf" is appended when the path does
             not already end with it.

     Returns:
         The content of the output buffer after the converter was closed.
     """
     if not path.endswith(".pdf"):
         path = path + ".pdf"
     rsrcmgr = PDFResourceManager()
     retstr = StringIO()
     device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                            laparams=LAParams())
     try:
         try:
             interpreter = PDFPageInterpreter(rsrcmgr, device)
             # open() replaces the Python-2-only file() builtin.
             with open(path, 'rb') as fp:
                 pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                           caching=True, check_extractable=True)
                 for page in pages:
                     interpreter.process_page(page)
         finally:
             # Closed before reading the buffer, as in the original order.
             device.close()
         result = retstr.getvalue()
     finally:
         retstr.close()
     return result
示例#43
0
def convert_pdf_to_html(url):
    """Download a PDF from a URL and render it to HTML.

    A HEAD request checks the ``content-type`` header first; the document
    is only downloaded and converted when it contains ``application/pdf``.

    Args:
        url: Address of the document.

    Returns:
        The HTML produced by ``HTMLConverter`` (UTF-8 encoded), or None
        when the URL does not report a PDF content type.
    """
    # io.BytesIO exists on Python 2 and 3 and matches both the downloaded
    # payload and the encoded output the converter writes.
    from io import BytesIO

    r = requests.head(url)
    # .get() avoids a KeyError when the server omits the header.
    if 'application/pdf' not in r.headers.get("content-type", ""):
        return None

    r = requests.get(url)

    # Wrap the downloaded bytes in a file-like object for the parser.
    memory_file = BytesIO(r.content)
    parser = PDFParser(memory_file)
    # The document object stores the document structure.
    document = PDFDocument(parser)

    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    try:
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
        finally:
            # Closed before reading the buffer, as in the original order.
            device.close()
        return retstr.getvalue()
    finally:
        retstr.close()
示例#44
0
class PDF2Txt:
	"""Convert a PDF file to a text, XML or HTML file.

	``output_type`` selects the pdfminer converter used by ``convert``:
	'text', 'xml' or 'html'.
	"""

	def __init__(self,pdffile,outfile,output_type='text'):
		"""Store the paths and output type and silence pdfminer debugging.

		Args:
			pdffile: Path of the PDF file to read.
			outfile: Path of the file to write.
			output_type: 'text' (default), 'xml' or 'html'.
		"""
		# The debug level is set on the pdfminer classes themselves.
		PDFDocument.debug = 0
		PDFParser.debug = 0
		CMapDB.debug = 0
		PDFResourceManager.debug = 0
		PDFPageInterpreter.debug = 0
		PDFDevice.debug = 0
		self.rsrcmgr = PDFResourceManager(caching=True)
		self.outtype = output_type
		self.outfile = outfile
		self.pdffile = pdffile

	def convert(self):
		"""Run the conversion and write the result to ``self.outfile``.

		Prints a message and exits the process with status -1 when the
		output type is not supported. The check happens before the output
		file is opened, so an unsupported type no longer creates or
		truncates that file.
		"""
		if self.outtype not in ('text', 'xml', 'html'):
			print('Formato de salida no soportado')
			sys.exit(-1)

		# open() replaces the Python-2-only file() builtin.
		with open(self.outfile, 'w') as outfp:
			if self.outtype == 'text':
				self.device = TextConverter(self.rsrcmgr, outfp, codec='utf-8',
					laparams=LAParams(), imagewriter=None)
			elif self.outtype == 'xml':
				self.device = XMLConverter(self.rsrcmgr, outfp, codec='utf-8',
					laparams=LAParams(), imagewriter=None)
			else:
				self.device = HTMLConverter(self.rsrcmgr, outfp, codec='utf-8',
					scale=1, layoutmode='normal', laparams=LAParams(),
					imagewriter=None)
			try:
				with open(self.pdffile, 'rb') as fp:
					interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
					for page in PDFPage.get_pages(fp, set(), caching=True,
							check_extractable=True):
						page.rotate = (page.rotate) % 360
						interpreter.process_page(page)
			finally:
				# Close the device while the output file is still open.
				self.device.close()
		print("Archivo %s creado en base a %s" % (self.outfile, self.pdffile))
示例#45
0
class PDFHandler(object):
	'''Read page counts and page contents from PDF files.

	Every public method takes a file name and, optionally, an already open
	file-like object (``fobj``). When ``fobj`` is given it is used instead
	of opening ``fname``; it is rewound with ``seek(0)`` afterwards and is
	not closed.'''

	def __init__(self):
		'''Set the default options and build the text and HTML converters.'''
		# debug option
		self.setdebug(0)
		# only first page
		self.pagenos=set([0])
		self.pageno = 1
		self.outfp = stdmodel()
		self.codec = 'utf-8'
		self.showpageno = True
		self.scale = 1
		self.password = ''
		self.maxpages = 0
		self.rotation = 0
		self.imagewriter = None
		self.laparams = LAParams()
		self.layoutmode = 'normal'
		# ResourceManager facilitates reuse of shared resources such as fonts
		# and images so that large objects are not allocated multiple times.
		# Caching is disabled here: the original author noted that leaving it
		# at the default True causes problems.
		self.caching = False
		self.rsrcmgr = PDFResourceManager(caching=self.caching)

		# Main converters; both write into the shared self.outfp.
		self.device = TextConverter(self.rsrcmgr, self.outfp, codec=self.codec,
			laparams=self.laparams, imagewriter=self.imagewriter)

		self.htmldevice = HTMLConverter(self.rsrcmgr, self.outfp, codec=self.codec, scale=self.scale,
			layoutmode=self.layoutmode, laparams=self.laparams,
			imagewriter=self.imagewriter)

	def reset(self,html=False):
		'''Rebuild the resource manager and one converter.

		With ``html`` true the HTML converter is closed and recreated,
		otherwise the text converter.'''
		self.rsrcmgr = PDFResourceManager(caching=self.caching)

		if html:
			self.htmldevice.close()
			self.htmldevice = HTMLConverter(self.rsrcmgr, self.outfp, codec=self.codec, scale=self.scale,
				layoutmode=self.layoutmode, laparams=self.laparams,
				imagewriter=self.imagewriter)
		else:
			self.device.close()
			self.device = TextConverter(self.rsrcmgr, self.outfp, codec=self.codec,
				laparams=self.laparams, imagewriter=self.imagewriter)

	def setdebug(self,value):
		'''Set the debug level on this handler and on the pdfminer classes.'''
		# The original always stored 0 and ignored ``value``.
		self.debug = value
		PDFResourceManager.debug = self.debug
		PDFPageInterpreter.debug = self.debug

	def _open(self, fname, fobj):
		'''Return ``fobj`` when given, else open ``fname`` for binary reading.'''
		if fobj:
			return fobj
		return open(fname, 'rb')

	def _release(self, fp, fobj):
		'''Rewind a caller-supplied object, or close a file opened here.'''
		if fobj:
			fp.seek(0)
		else:
			fp.close()

	def _extract(self, fname, pagenos, html, fobj):
		'''Process the selected zero-based pages and return the output.

		Returns the content collected in ``self.outfp`` (which is reset
		afterwards), or "" when any exception occurs.'''
		fp = self._open(fname, fobj)
		try:
			device = self.htmldevice if html else self.device
			interpreter = PDFPageInterpreter(self.rsrcmgr, device)

			for page in PDFPage.get_pages(fp, pagenos, maxpages=self.maxpages,
				password=self.password, caching=self.caching, check_extractable=False):

				page.rotate = (page.rotate+self.rotation) % 360
				interpreter.process_page(page)
			self._release(fp, fobj)
			outstr=self.outfp.get()
			self.outfp.reset()
			return outstr
		except Exception:
			# Best effort: discard partial output and report "no content".
			self.outfp.reset()
			self._release(fp, fobj)
			return ""

	def GetPageNumber(self,fname,fobj=None):
		'''Return the total number of pages, or 0 when reading fails.'''
		fp = self._open(fname, fobj)
		try:
			pageno=0
			for page in PDFPage.get_pages(fp, set(), maxpages=0,
				password=self.password, caching=self.caching, check_extractable=False):
				pageno+=1
			self._release(fp, fobj)
			return pageno
		except Exception as e:
			print(e)
			self._release(fp, fobj)
			print("Error Reading PDF page number..")
			return 0

	def FastCheck(self,fname,fobj=None):
		'''Return True when page iteration starts without raising.'''
		fp = self._open(fname, fobj)
		try:
			for page in PDFPage.get_pages(fp, set([0]), maxpages=1,
				password=self.password, caching=self.caching, check_extractable=False):
				break
			self._release(fp, fobj)
			return True
		except Exception:
			self._release(fp, fobj)
			print("Error Reading PDF page number.. %s" % (fname,))
			return False

	def GetSinglePage(self,fname,pageno=1,html=False,fobj=None):
		'''Return the contents of one page (1-based, default first page).

		Returns "" when an exception occurs.'''
		return self._extract(fname, set([pageno-1]), html, fobj)

	def GetPages(self,fname,pagenos=(1,),html=False,fobj=None):
		'''Return the contents of several pages (1-based page numbers).

		Defaults to the first page. Returns "" when an exception occurs.'''
		return self._extract(fname, set(i-1 for i in pagenos), html, fobj)

	def GetAllPages(self,fname,html=False,fobj=None):
		'''Return the contents of all pages, or "" when an exception occurs.'''
		return self._extract(fname, set(), html, fobj)
def readPDF2HTML(pdfFile, opts=None):
    """Render a PDF, given as an open file-like object, to HTML.

    Args:
        pdfFile: Object with a ``read()`` method returning the PDF data.
        opts: Optional iterable of ``(flag, value)`` pairs, or a dict
            mapping flag to value. Recognised flags:
            ``-p`` comma-separated 1-based page numbers, ``-m`` max pages,
            ``-P`` password, ``-n`` disable layout analysis, ``-A``/``-V``
            set ``all_texts``/``detect_vertical``, ``-M``/``-L``/``-W``/``-F``
            set char/line/word margin and boxes flow, ``-Y`` layout mode,
            ``-c`` codec, ``-s`` scale. ``-d``, ``-o``, ``-O`` and ``-t``
            are accepted but have no effect here.

    Returns:
        The content of the output buffer, read before the converter is
        closed.

    Raises:
        PDFTextExtractionNotAllowed: When the document forbids extraction.
    """
    # Defaults are defined before the options are read; the original
    # updated several of these names before they existed and overwrote
    # ``codec`` and ``pagenos`` after parsing.
    pagenos = set()
    maxpages = 0
    password = ''
    codec = 'utf-8'
    scale = 1
    layoutmode = 'normal'
    laparams = LAParams()

    if opts is None:
        pairs = ()
    elif isinstance(opts, dict):
        pairs = opts.items()
    else:
        pairs = opts

    for (k, v) in pairs:
        if k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    # Work on an in-memory copy of the PDF.
    fp = StringIO(pdfFile.read())
    retstr = StringIO()
    try:
        parser = PDFParser(fp)
        document = PDFDocument(parser, password=password)
        # Refuse documents that do not allow text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # The resource manager stores shared resources.
        rsrcmgr = PDFResourceManager()
        device = HTMLConverter(rsrcmgr, retstr, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password):
                interpreter.process_page(page)
            # NOTE(review): the buffer is read before device.close(), as in
            # the original, so anything written on close is not returned -
            # confirm this is intended.
            content = retstr.getvalue()
        finally:
            device.close()
        return content
    finally:
        fp.close()
        retstr.close()
示例#47
0
def pdf_gettext(filepath, reserve):
	"""Extract text from the first and the last page of a PDF.

	Both pages are rendered to HTML through a temporary file, copied to
	'firstout.html' and 'lastout.html' in the current directory and handed
	to ``html_textparser`` together with the result lists.

	Args:
		filepath: Path of the PDF file to read.
		reserve: Not used by this function.

	Returns:
		A tuple ``(first, last)`` of the lists filled by
		``html_textparser``; ``last`` stays empty for a one-page document.

	Raises:
		Exception: When the document does not allow text extraction.
	"""
	import tempfile

	password = ''
	codec = 'utf-8'
	caching = True
	firstout = 'firstout.html'
	lastout = 'lastout.html'
	firstpage = None
	lastpage = None
	first = []
	last = []

	rsrcmgr = PDFResourceManager(caching=caching)
	outfp = tempfile.TemporaryFile(mode='w+t', encoding=codec)
	try:
		device = HTMLConverter(rsrcmgr, outfp, scale=1, layoutmode='normal',
			laparams=LAParams())
		try:
			with open(filepath, 'rb') as fp:
				parser = PDFParser(fp)
				doc = PDFDocument(caching=caching)
				parser.set_document(doc)
				doc.set_parser(parser)
				doc.initialize(password)
				if not doc.is_extractable:
					raise Exception('Text extraction is not allowed: %s' % filepath)

				interpreter = PDFPageInterpreter(rsrcmgr, device)
				# Remember the first page and the last page seen after it.
				for page in doc.get_pages():
					if not firstpage:
						firstpage = page
					else:
						lastpage = page

				if firstpage:
					interpreter.process_page(firstpage)
					with open(firstout, 'w', encoding=codec) as f:
						outfp.seek(0)
						f.write(outfp.read())
					html_textparser(firstout, first)
				if lastpage:
					# truncate() does not move the stream position. Without
					# rewinding first, the next write lands at the old end
					# and the gap is filled with NUL characters.
					outfp.seek(0)
					outfp.truncate(0)
					interpreter.process_page(lastpage)
					with open(lastout, 'w', encoding=codec) as f:
						outfp.seek(0)
						f.write(outfp.read())
					html_textparser(lastout, last)
		finally:
			device.close()
	finally:
		outfp.close()

	return first, last