示例#1
0
class Miner:
    """Convert a PDF file to text, HTML or XML with PDFMiner.

    The constructor opens the input and output files and builds the
    converter device; ``extract`` runs the conversion and closes them.
    """

    # Values accepted for the ``file_format`` constructor argument.
    _FORMATS = ('txt', 'html', 'xml')

    def __init__(self, pdf_file, txt_file, file_format='txt', layout_analysis=True):
        """Open the files and create the converter for ``file_format``.

        Args:
            pdf_file: Path of the PDF file to read.
            txt_file: Path of the output file to write.
            file_format: 'txt', 'html' or 'xml'; selects the converter class.
            layout_analysis: If true, a default ``LAParams()`` is handed to
                the converter; otherwise ``laparams`` is ``None``.

        Raises:
            ValueError: If ``file_format`` is not one of the supported values.
        """
        # Reject an unknown format before touching the filesystem, so a bad
        # value neither truncates the output file nor produces an instance
        # that lacks the ``device`` attribute ``extract`` relies on.
        if file_format not in self._FORMATS:
            raise ValueError('unsupported file_format %r; expected one of: %s'
                             % (file_format, ', '.join(self._FORMATS)))

        # ``open`` is used instead of ``file``, which exists only on Python 2.
        self.pdf_file = open(pdf_file, 'rb')
        try:
            self.outfp = open(txt_file, 'w')
        except Exception:
            # Do not leak the input handle when the output cannot be opened.
            self.pdf_file.close()
            raise

        laparams = LAParams() if layout_analysis else None
        self.rsrcmgr = PDFResourceManager(caching=True)

        if file_format == 'txt':
            converter_cls = TextConverter
        elif file_format == 'html':
            converter_cls = HTMLConverter
        else:
            converter_cls = XMLConverter
        self.device = converter_cls(self.rsrcmgr, self.outfp, codec='utf-8',
                                    laparams=laparams, imagewriter=None)

    def extract(self):
        """Process every page of the PDF and close all handles.

        The input file, the converter device and the output file are closed
        even when processing a page raises.
        """
        interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
        pagenos = set()
        try:
            for page in PDFPage.get_pages(self.pdf_file, pagenos, maxpages=0,
                                          password=None, caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        finally:
            self.pdf_file.close()
            try:
                # The device is closed while the output file is still open.
                self.device.close()
            finally:
                self.outfp.close()
示例#2
0
    def __init__(self,
                 pdf_file,
                 txt_file,
                 file_format='txt',
                 layout_analysis=True):
        """Open the files and create the converter for ``file_format``.

        Args:
            pdf_file: Path of the PDF file to read.
            txt_file: Path of the output file to write.
            file_format: 'txt', 'html' or 'xml'; selects the converter class.
            layout_analysis: If true, a default ``LAParams()`` is handed to
                the converter; otherwise ``laparams`` is ``None``.

        Raises:
            ValueError: If ``file_format`` is not one of the supported values.
        """
        # Reject an unknown format before touching the filesystem, so a bad
        # value neither truncates the output file nor produces an instance
        # without a ``device`` attribute.
        if file_format not in ('txt', 'html', 'xml'):
            raise ValueError('unsupported file_format %r; expected one of: '
                             'txt, html, xml' % (file_format,))

        # ``open`` is used instead of ``file``, which exists only on Python 2.
        self.pdf_file = open(pdf_file, 'rb')
        try:
            self.outfp = open(txt_file, 'w')
        except Exception:
            # Do not leak the input handle when the output cannot be opened.
            self.pdf_file.close()
            raise

        laparams = LAParams() if layout_analysis else None
        self.rsrcmgr = PDFResourceManager(caching=True)

        if file_format == 'txt':
            converter_cls = TextConverter
        elif file_format == 'html':
            converter_cls = HTMLConverter
        else:
            converter_cls = XMLConverter
        self.device = converter_cls(self.rsrcmgr,
                                    self.outfp,
                                    codec='utf-8',
                                    laparams=laparams,
                                    imagewriter=None)
示例#3
0
def convert(infile, outfile, rotation=0):
    """Convert the PDF at ``infile`` to XML written to ``outfile``.

    Args:
        infile: Path of the PDF file to read.
        outfile: Path of the XML file to create (opened in binary mode).
        rotation: Degrees added to every page's ``rotate`` value (modulo 360)
            before the page is processed.

    Note:
        As a side effect the class-level ``debug`` attribute of several
        PDFMiner classes is set to 0.
    """
    debug = 0
    password = ''
    pagenos = set()
    maxpages = 0
    codec = 'utf-8'
    caching = True
    laparams = LAParams()

    for component in (PDFDocument, PDFParser, CMapDB, PDFResourceManager,
                      PDFPageInterpreter, PDFDevice):
        component.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    # The input is opened first so a missing PDF does not leave an empty
    # output file behind; ``with`` closes both files even on error.
    with open(infile, 'rb') as fp, open(outfile, 'wb') as outfp:
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos,
                                          maxpages=maxpages, password=password,
                                          caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        finally:
            # Close the device while the output file is still open.
            device.close()
示例#4
0
def pdf2xml(path, codec='utf-8', password = "", maxpages = 0, caching = True):
    """Extract the pages of a PDF with PDFMiner and return them as XML text.

    Args:
        path: Path of the PDF file.
        codec: Encoding handed to ``XMLConverter`` and used to decode its
            output (previously ignored in favour of a hard-coded 'utf-8').
        password: Password passed to ``PDFPage.get_pages``.
        maxpages: Page limit passed to ``PDFPage.get_pages``.
        caching: Caching flag passed to ``PDFPage.get_pages``.

    Returns:
        The decoded XML string, ending with a closing ``</pages>`` tag.
    """
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = XMLConverter(rsrcmgr, retstr, codec=codec, laparams=LAParams())
    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, set(), maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
        # The buffer is read before the device is closed, so the closing
        # tag is appended below when it is not already present.
        xml = retstr.getvalue().decode(codec)
    finally:
        device.close()
        retstr.close()

    # The previous ``startswith`` test could never match a closing tag at
    # the end of the document; test the end of the text instead.
    if not xml.rstrip().endswith('</pages>'):
        xml += '\n</pages>'

    return xml
示例#5
0
def pdf2xml(filename):
    """Return the XML that ``XMLConverter`` produces for the PDF ``filename``.

    Args:
        filename: Path of the PDF file to convert.

    Returns:
        The contents of the in-memory output buffer after the converter has
        been closed.
    """
    rsrcmgr = PDFResourceManager(caching=True)
    outfp = StringIO.StringIO()
    try:
        device = XMLConverter(rsrcmgr,
                              outfp,
                              codec='utf-8',
                              laparams=LAParams(),
                              imagewriter=None)
        try:
            # ``open`` replaces ``file``, which exists only on Python 2, and
            # the ``with`` block closes the PDF even if a page fails.
            with open(filename, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp,
                                              None,
                                              maxpages=0,
                                              password='',
                                              caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            device.close()
        return outfp.getvalue()
    finally:
        outfp.close()
示例#6
0
def parse_pdf_to_txt(pdf_handle, write_file):
    """Convert a PDF to XML and write the result to ``write_file``.

    Despite the function name, the output is produced by ``XMLConverter``.

    Args:
        pdf_handle: PDF input handed unchanged to ``process_pdf``
            (presumably an open binary file object -- confirm with callers).
        write_file: Path of the output file. It is opened as UTF-8 text with
            ``errors='ignore'``.
    """
    pagenos = set()
    maxpages = 0
    codec = 'utf-8'
    caching = True
    laparams = LAParams()
    laparams.detect_vertical = True

    # Create the PDF resource manager that holds the shared resources.
    rsrcmgr = PDFResourceManager(caching=caching)

    print("ready to open out file ........")
    with open(write_file, "wt", encoding=codec, errors='ignore') as outfp:
        device = XMLConverter(rsrcmgr, outfp, laparams=laparams)
        print("ready to converte pdf to xml ........")
        try:
            process_pdf(rsrcmgr,
                        device,
                        pdf_handle,
                        pagenos,
                        maxpages=maxpages,
                        password='',
                        caching=caching,
                        check_extractable=True)
        finally:
            # Close the device even when the conversion fails, and do so
            # while the output file is still open.
            device.close()
def parse_pdfs(pdf_filenames):
    """Convert every PDF in ``pdf_filenames`` to an XML file next to it.

    For each input path the extension is replaced by ``.xml`` (via
    ``os.path.splitext``) to form the output path.

    Args:
        pdf_filenames: Iterable of PDF file paths.
    """
    # Set parameters
    pagenos = set()
    maxpages = 0
    password = ''
    imagewriter = None
    codec = 'utf-8'
    caching = True
    laparams = LAParams()

    rsrcmgr = PDFResourceManager(caching=caching)

    # Convert to XML as it retains the most information about text position
    # (compared to text, html, etc).
    for pdf_file in pdf_filenames:

        # Parenthesised single-argument ``print`` produces the same output
        # as a statement on Python 2 and as a function call on Python 3.
        print("Converting %s to xml." % pdf_file)

        outfile = os.path.splitext(pdf_file)[0] + '.xml'
        with open(pdf_file, 'rb') as fp, open(outfile, 'w') as outfp:

            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, imagewriter=imagewriter)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)
            finally:
                # Close the device even if a page fails to process.
                device.close()

        print("Conversion complete.")
示例#8
0
 def _get_xml_data(self, sourcefile):
     """Return the XML representation of the PDF ``sourcefile`` as bytes.

     Args:
         sourcefile: Path of the PDF file to convert.

     Returns:
         The content of the in-memory ``io.BytesIO`` buffer the converter
         wrote to.
     """
     rm = PDFResourceManager(caching=True,
                             font_correctors=self.font_correctors)
     pagenos = set()
     maxpages = 0
     password = ""
     outfp = io.BytesIO()
     try:
         device = XMLConverter(rm,
                               outfp,
                               codec="UTF-8",
                               laparams=LAParams(),
                               imagewriter=None)
         try:
             interpreter = PDFPageInterpreter(rm, device)
             # ``with`` closes the PDF even when a page fails to process.
             with open(sourcefile, "rb") as infile:
                 for page in PDFPage.get_pages(infile,
                                               pagenos,
                                               maxpages=maxpages,
                                               password=password,
                                               caching=True,
                                               check_extractable=True):
                     interpreter.process_page(page)
         finally:
             device.close()
         return outfp.getvalue()
     finally:
         outfp.close()
示例#9
0
def extract_pdf_page(filename, page_number_or_numbers):
    """Extract selected pages of a PDF with PDFMiner and return them as XML.

    Adapted from pdf2txt.py, which ships with PDFMiner; the equivalent
    command line is ``pdf2txt.py -p 1 -o expected.xml sample.pdf``.

    Args:
        filename: Path of the PDF file.
        page_number_or_numbers: A single page number or an iterable of page
            numbers; a single value is wrapped in a one-element list.

    Returns:
        The content of the output buffer after the converter is closed.
    """
    wanted = page_number_or_numbers
    if not is_iterable(wanted):
        wanted = [wanted]

    buf = StringIO.StringIO()
    layout = LAParams()
    resources = PDFResourceManager()
    converter = XMLConverter(resources, buf, codec='utf-8', laparams=layout)

    with open(filename, 'rb') as pdf:
        renderer = PDFPageInterpreter(resources, converter)
        for pdf_page in PDFPage.get_pages(pdf, wanted):
            renderer.process_page(pdf_page)

    converter.close()

    result = buf.getvalue()
    buf.close()
    return result
示例#10
0
文件: converter.py 项目: 2mv/seuraaja
    def to_xml(infile):
        """Return the XML PDFMiner produces for the PDF input ``infile``.

        Args:
            infile: PDF input handed unchanged to ``PDFPage.get_pages``.

        Returns:
            The content of the in-memory buffer after the converter has
            been closed.
        """
        output = StringIO()
        try:
            manager = PDFResourceManager()
            converter = XMLConverter(manager, output, laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(manager, converter)
                for page in PDFPage.get_pages(infile):
                    interpreter.process_page(page)
            finally:
                converter.close()
            return output.getvalue()
        finally:
            # ``close`` must be called; a bare ``output.close`` attribute
            # reference does nothing and leaves the buffer open.
            output.close()
def lerPDF(arquivo):
    """Return the XML that PDFMiner produces for the PDF input ``arquivo``.

    ``arquivo`` is handed unchanged to ``process_pdf``; the converter's
    output is collected in an in-memory buffer and returned after the
    converter has been closed.
    """
    resources = PDFResourceManager()
    sink = StringIO()
    converter = XMLConverter(resources, sink, laparams=LAParams())

    process_pdf(resources, converter, arquivo)
    converter.close()

    xml = sink.getvalue()
    sink.close()
    return xml
示例#12
0
 def getTitle(self, stream):
     """Return the title of the PDF readable from ``stream``.

     The title stored in the document information is used when present.
     When it is missing, empty or the placeholder 'untitled', the first
     page is converted to XML with PDFMiner and the result is handed to
     ``self._getTitleFromXmlStr``.

     Args:
         stream: Seekable file object containing the PDF.

     Returns:
         The metadata title, or whatever ``self._getTitleFromXmlStr``
         returns for the XML of the first page.
     """
     stream.seek(0)
     info = PdfFileReader(stream).getDocumentInfo()
     # The document information or its title may be None when the PDF
     # carries no metadata; treat that like an empty title so the fallback
     # below runs instead of None being returned.
     title = info.title if info is not None else None
     if title is not None and title not in ('untitled', ''):
         return title

     # Imported lazily: PDFMiner is only needed for the fallback path.
     from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
     from pdfminer.pdfpage import PDFPage
     from pdfminer.converter import XMLConverter
     from pdfminer.layout import LAParams
     try:
         from cStringIO import StringIO
     except ImportError:
         from StringIO import StringIO

     caching = True
     codec = 'utf-8'
     rotation = 0
     pagenos = set([0])  # page "1" converted to a zero-based index
     rsrcmgr = PDFResourceManager(caching=caching)
     outfp = StringIO()
     # Convert the page to XML, using the StringIO buffer to hold it.
     device = XMLConverter(rsrcmgr,
                           outfp,
                           codec=codec,
                           laparams=LAParams(),
                           imagewriter=None,
                           stripcontrol=False)
     interpreter = PDFPageInterpreter(rsrcmgr, device)
     stream.seek(0)
     try:
         for page in PDFPage.get_pages(stream,
                                       pagenos,
                                       maxpages=0,
                                       password='',
                                       caching=caching,
                                       check_extractable=True):
             page.rotate = (page.rotate + rotation) % 360
             interpreter.process_page(page)
     finally:
         device.close()
     outfp.seek(0)
     # Parse the XML to get the title.
     return self._getTitleFromXmlStr(outfp.read().encode(codec))
示例#13
0
def extract_pdf_page(filename):
    """Dump the images of a PDF as PNG files and its layout as XML.

    Images are written to ``XML_PATH/<stem>/images`` and the XML to
    ``XML_PATH/<stem>/<stem>-<TIME_NOW>.xml``, where ``<stem>`` is the file
    name of ``filename`` without its extension.

    Args:
        filename: Path of the PDF file.

    Returns:
        The XML text produced by ``XMLConverter`` (the same text that is
        written to the XML file).
    """
    # Paths for creating folder and file
    input_file_name = Path(filename).stem
    output_file_folder = Path(XML_PATH, input_file_name)
    output_file_folder.mkdir(parents=True, exist_ok=True)
    output_file_path = Path(output_file_folder,
                            input_file_name + "-" + TIME_NOW + ".xml")
    output_images_path = Path(XML_PATH, input_file_name, "images")
    output_images_path.mkdir(parents=True, exist_ok=True)

    doc = fitz.open(filename)
    try:
        for i in range(len(doc)):
            for img in doc.getPageImageList(i):
                xref = img[0]
                # Both branches use the same three-field name; the CMYK
                # branch used to supply three values to a two-field format
                # string, which raises TypeError.
                png_path = str(output_images_path /
                               ("%s-%s-%s.png" % (input_file_name, i, xref)))
                pix = fitz.Pixmap(doc, xref)
                if pix.n < 5:  # this is GRAY or RGB
                    pix.writePNG(png_path)
                else:  # CMYK: convert to RGB first
                    pix1 = fitz.Pixmap(fitz.csRGB, pix)
                    pix1.writePNG(png_path)
                    pix1 = None
                pix = None
    finally:
        # Release the document once the images have been extracted.
        doc.close()

    output_file = io.StringIO()
    try:
        rsrcmgr = PDFResourceManager()
        device = XMLConverter(rsrcmgr, output_file, laparams=LAParams())
        try:
            with open(filename, 'rb') as fh:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fh, caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            device.close()

        xml = output_file.getvalue()
    finally:
        output_file.close()

    with open(output_file_path, 'w', encoding="utf-8") as fd:
        fd.write(xml)

    return xml
示例#14
0
 def __init__(self, src, limit = None):
     """Read the PDF at ``src`` and store its cleaned XML in ``self.text``.

     Args:
         src: Path of the PDF; must be at least five characters long and
             end in ".pdf".
         limit: Passed to ``process_pdf`` as ``maxpages``.

     Raises:
         Exception: If ``src`` fails the name check.
     """
     has_valid_name = len(src) >= 5 and src[-4:] == ".pdf"
     if not has_valid_name:
         raise Exception("PDF file has to end in .pdf and has to have a name!")

     pdf = open(src, "rb")
     buf = StringIO()
     resources = PDFResourceManager()
     converter = XMLConverter(resources, buf, codec='UTF-8', laparams=None)
     try:
         process_pdf(resources,
                     converter,
                     pdf,
                     pagenos=None,
                     maxpages=limit,
                     password='',
                     check_extractable=True)
     finally:
         # Runs whether or not the conversion succeeded.
         converter.close()
         pdf.close()

     raw = buf.getvalue()
     buf.close()
     self.text = self.cleanText(raw)
示例#15
0
class Miner:
    """Convert a PDF file to text, HTML or XML with PDFMiner.

    The constructor opens the input and output files and builds the
    converter device; ``extract`` runs the conversion and closes them.
    """

    # Values accepted for the ``file_format`` constructor argument.
    _FORMATS = ('txt', 'html', 'xml')

    def __init__(self,
                 pdf_file,
                 txt_file,
                 file_format='txt',
                 layout_analysis=True):
        """Open the files and create the converter for ``file_format``.

        Args:
            pdf_file: Path of the PDF file to read.
            txt_file: Path of the output file to write.
            file_format: 'txt', 'html' or 'xml'; selects the converter class.
            layout_analysis: If true, a default ``LAParams()`` is handed to
                the converter; otherwise ``laparams`` is ``None``.

        Raises:
            ValueError: If ``file_format`` is not one of the supported values.
        """
        # Validate first: an unknown format used to create the output file
        # and leave the instance without the ``device`` that ``extract``
        # needs.
        if file_format not in self._FORMATS:
            raise ValueError('unsupported file_format %r; expected one of: %s'
                             % (file_format, ', '.join(self._FORMATS)))

        # ``open`` is used instead of ``file``, which exists only on Python 2.
        self.pdf_file = open(pdf_file, 'rb')
        try:
            self.outfp = open(txt_file, 'w')
        except Exception:
            # Do not leak the input handle when the output cannot be opened.
            self.pdf_file.close()
            raise

        laparams = LAParams() if layout_analysis else None
        self.rsrcmgr = PDFResourceManager(caching=True)

        if file_format == 'txt':
            converter_cls = TextConverter
        elif file_format == 'html':
            converter_cls = HTMLConverter
        else:
            converter_cls = XMLConverter
        self.device = converter_cls(self.rsrcmgr,
                                    self.outfp,
                                    codec='utf-8',
                                    laparams=laparams,
                                    imagewriter=None)

    def extract(self):
        """Process every page of the PDF and close all handles.

        The input file, the converter device and the output file are closed
        even when processing a page raises.
        """
        interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
        pagenos = set()
        try:
            for page in PDFPage.get_pages(self.pdf_file,
                                          pagenos,
                                          maxpages=0,
                                          password=None,
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        finally:
            self.pdf_file.close()
            try:
                # The device is closed while the output file is still open.
                self.device.close()
            finally:
                self.outfp.close()
示例#16
0
def convert_pdf(path, format='text', codec='utf-8', password=''):
    """Convert the PDF at ``path`` and return the converter output as text.

    The output is additionally parsed with ``bs`` and its prettified form
    is printed.

    Args:
        path: Path of the PDF file.
        format: 'text', 'html' or 'xml'; selects the converter class.
        codec: Encoding handed to the converter and used to decode its
            output.
        password: Password passed to ``PDFPage.get_pages``.

    Returns:
        The decoded converter output.

    Raises:
        ValueError: If ``format`` is not one of the supported values.
    """
    # Fail fast, before any buffer or file is created.
    if format not in ('text', 'html', 'xml'):
        raise ValueError('provide format, either text, html or xml!')
    if format == 'text':
        converter_cls = TextConverter
    elif format == 'html':
        converter_cls = HTMLConverter
    else:
        converter_cls = XMLConverter

    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    try:
        device = converter_cls(rsrcmgr, retstr, codec=codec,
                               laparams=LAParams())
        try:
            with open(path, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                              password=password, caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            # Close the device before the buffer is read so that anything
            # it emits on close is part of the returned text.
            device.close()
        # Decode with the codec the converter was given.
        text = retstr.getvalue().decode(codec)
    finally:
        retstr.close()

    soup = bs(text)
    prettyHTML = soup.prettify()
    print(prettyHTML)
    return text
示例#17
0
def convert_pdf(path, format='text', codec='utf-8', password=''):
    """Download a PDF and return the converter output as text.

    Args:
        path: URL of the PDF; fetched with ``requests.get``.
        format: 'text', 'html' or 'xml'; selects the converter class.
        codec: Encoding handed to the converter and used to decode its
            output.
        password: Password passed to ``PDFPage.get_pages``.

    Returns:
        The decoded converter output.

    Raises:
        ValueError: If ``format`` is not one of the supported values.
    """
    # Validate before downloading so a bad format costs no network request.
    if format not in ('text', 'html', 'xml'):
        raise ValueError('provide format, either text, html or xml!')
    if format == 'text':
        converter_cls = TextConverter
    elif format == 'html':
        converter_cls = HTMLConverter
    else:
        converter_cls = XMLConverter

    r = requests.get(path)
    # One in-memory copy of the payload is enough for the parser.
    fp = io.BytesIO(r.content)

    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    try:
        device = converter_cls(rsrcmgr, retstr, codec=codec,
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp,
                                          set(),
                                          maxpages=0,
                                          password=password,
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        finally:
            # Close the device before the buffer is read so that anything
            # it emits on close is part of the returned text.
            device.close()
        # Decode with the codec the converter was given.
        return retstr.getvalue().decode(codec)
    finally:
        fp.close()
        retstr.close()
示例#18
0
def pdf_to_string(path, format='xml', password=''):
    """Convert the PDF at ``path`` and return the converter output as text.

    Args:
        path: Path of the PDF file.
        format: 'text', 'html' or 'xml'; selects the converter class.
        password: Password passed to ``PDFPage.get_pages``.

    Returns:
        The converter output decoded as UTF-8.

    Raises:
        ValueError: If ``format`` is not one of the supported values.
    """
    # Fail fast, before any buffer or file is created.
    if format not in ('text', 'html', 'xml'):
        raise ValueError('provide format, either text, html or xml!')
    if format == 'text':
        converter_cls = TextConverter
    elif format == 'html':
        converter_cls = HTMLConverter
    else:
        converter_cls = XMLConverter

    rsrcmgr = PDFResourceManager()
    out_stream = BytesIO()
    try:
        device = converter_cls(rsrcmgr, out_stream, laparams=LAParams())
        try:
            # ``with`` closes the PDF even when a page fails to process.
            with open(path, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp,
                                              set(),
                                              maxpages=0,
                                              password=password,
                                              caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            device.close()
        return out_stream.getvalue().decode("utf-8")
    finally:
        out_stream.close()
示例#19
0
def convert_pdf(path, outp, format='txt', codec='utf-8', password=''):
    """Convert the PDF at ``path`` and write the result to ``outp.<format>``.

    Args:
        path: Path of the PDF file.
        outp: Output path without extension; ``'.' + format`` is appended.
        format: 'txt', 'html' or 'xml'; selects the converter class and the
            output file extension.
        codec: Encoding handed to the converter.
        password: Password passed to ``PDFPage.get_pages``.

    Raises:
        ValueError: If ``format`` is not one of the supported values.
    """
    # Validate before opening the output: an unknown format used to create
    # (and never close) a stray ``outp.<format>`` file. The message names
    # the accepted value 'txt' rather than 'text'.
    if format not in ('txt', 'html', 'xml'):
        raise ValueError('provide format, either txt, html or xml!')
    if format == 'txt':
        converter_cls = TextConverter
    elif format == 'html':
        converter_cls = HTMLConverter
    else:
        converter_cls = XMLConverter

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # The input is opened first so a missing PDF leaves no empty output.
    with open(path, 'rb') as fp, open(outp + '.' + format, 'wb') as outf:
        device = converter_cls(rsrcmgr, outf, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp,
                                          set(),
                                          maxpages=0,
                                          password=password,
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        finally:
            # Close the device while the output file is still open.
            device.close()
def pdf2xml(filename):
    """Return the XML that ``XMLConverter`` produces for the PDF ``filename``.

    Args:
        filename: Path of the PDF file to convert.

    Returns:
        The contents of the in-memory output buffer after the converter has
        been closed.
    """
    rsrcmgr = PDFResourceManager(caching=True)
    outfp = StringIO.StringIO()
    try:
        device = XMLConverter(rsrcmgr, outfp, codec='utf-8',
                              laparams=LAParams(), imagewriter=None)
        try:
            # ``open`` replaces ``file``, which exists only on Python 2, and
            # the ``with`` block closes the PDF even if a page fails.
            with open(filename, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, None, maxpages=0,
                                              password='', caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            device.close()
        return outfp.getvalue()
    finally:
        outfp.close()
示例#21
0
 def getTitle(self,stream):
     """Return the title of the PDF readable from ``stream``.

     The metadata title is returned when it is usable. If it is missing,
     empty or the placeholder 'untitled', the first page is rendered to XML
     with PDFMiner and ``self._getTitleFromXmlStr`` derives the title.

     Args:
         stream: Seekable file object containing the PDF.

     Returns:
         The metadata title, or the result of ``self._getTitleFromXmlStr``
         for the XML of the first page.
     """
     stream.seek(0)
     info = PdfFileReader(stream).getDocumentInfo()
     # The document information or its title may be None when the PDF has
     # no metadata; handle that like an empty title so the fallback runs
     # rather than None being returned.
     title = info.title if info is not None else None
     needs_fallback = title is None or title in ('untitled', '')
     if not needs_fallback:
         return title

     # Imported lazily: PDFMiner is only needed for the fallback path.
     from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
     from pdfminer.pdfpage import PDFPage
     from pdfminer.converter import XMLConverter
     from pdfminer.layout import LAParams
     try:
         from cStringIO import StringIO
     except ImportError:
         from StringIO import StringIO

     # init parameters
     caching = True
     codec = 'utf-8'
     rotation = 0
     first_page_only = set([0])  # page "1" as a zero-based index
     rsrcmgr = PDFResourceManager(caching=caching)
     outfp = StringIO()
     # convert pdf to xml, using StringIO to store XML
     device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=LAParams(),
                           imagewriter=None, stripcontrol=False)
     interpreter = PDFPageInterpreter(rsrcmgr, device)
     stream.seek(0)
     try:
         for page in PDFPage.get_pages(stream, first_page_only,
                                       maxpages=0, password='',
                                       caching=caching,
                                       check_extractable=True):
             page.rotate = (page.rotate + rotation) % 360
             interpreter.process_page(page)
     finally:
         device.close()
     outfp.seek(0)
     # parse the xml to get title
     return self._getTitleFromXmlStr(outfp.read().encode(codec))
示例#22
0
def pdf_to_xml(pdfpath):
    """Return the XML that ``XMLConverter`` produces for the PDF ``pdfpath``.

    Args:
        pdfpath: Path of the PDF file.

    Returns:
        The content of the in-memory buffer, read after the converter has
        been closed.
    """
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    try:
        device = XMLConverter(rsrcmgr, sio, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # ``with`` closes the PDF even when a page fails to process.
            with open(pdfpath, 'rb') as fp:
                for page in PDFPage.get_pages(fp):
                    interpreter.process_page(page)
        finally:
            # Close the device *before* reading the buffer: reading first
            # drops whatever the converter writes on close (the document
            # footer), as the other helpers in this file already account for.
            device.close()
        return sio.getvalue()
    finally:
        sio.close()
示例#23
0
def convert(fname, pages=None):
    """Convert the PDF ``fname`` to XML and return the buffer content.

    Args:
        fname: Path of the PDF file.
        pages: Optional iterable of page numbers handed to
            ``PDFPage.get_pages``; a falsy value selects an empty set.

    Returns:
        The content of the ``BytesIO`` buffer the converter wrote to.
    """
    pagenums = set(pages) if pages else set()

    output = BytesIO()
    try:
        manager = PDFResourceManager()
        converter = XMLConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # ``with`` closes the PDF even when a page fails to process.
            with open(fname, "rb") as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        return output.getvalue()
    finally:
        output.close()
def trasformaPDFinXML(doc, directoryOutfile):
    """Convert the PDF ``doc`` to XML inside ``directoryOutfile``.

    Only the part of PDFMiner needed for the PDF-to-XML conversion is used.

    Args:
        doc: Path of the PDF. All spaces are removed from it before it is
            opened. NOTE(review): a path that really contains spaces is
            therefore not opened as given -- confirm this is intended.
        directoryOutfile: Directory that receives the output file.

    Returns:
        The output file name: "outfile " followed by the space-stripped
        path with "/" replaced by "_" and everything from the first ".pdf"
        onwards removed.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    password = ""
    codec = "utf-8"
    laparams = None
    imagewriter = None

    doc = doc.replace(" ", "")
    pdf_path = doc

    doc = doc.replace("/", "_")
    pos = doc.find(".pdf")
    # ``find`` returns -1 when ".pdf" is absent; slicing with -1 would
    # silently drop the last character of the name.
    if pos != -1:
        doc = doc[:pos]
    doc = "outfile " + doc

    # ``with`` closes the PDF on every exit path, including the
    # not-extractable error below.
    with open(pdf_path, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization.
        document = PDFDocument(parser, password)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # ``open`` replaces ``file``, which exists only on Python 2.
        with open(directoryOutfile + "/" + doc, "w") as outfp:
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, imagewriter=imagewriter)
            try:
                # Create a PDF interpreter object.
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # Process each page contained in the document.
                for page in PDFPage.create_pages(document):
                    interpreter.process_page(page)
            finally:
                # Close the device while the output file is still open.
                device.close()

    return doc
示例#25
0
    def __init__(self, pdf_file, txt_file, file_format='txt', layout_analysis=True):
        """Open the files and create the converter for ``file_format``.

        Args:
            pdf_file: Path of the PDF file to read.
            txt_file: Path of the output file to write.
            file_format: 'txt', 'html' or 'xml'; selects the converter class.
            layout_analysis: If true, a default ``LAParams()`` is handed to
                the converter; otherwise ``laparams`` is ``None``.

        Raises:
            ValueError: If ``file_format`` is not one of the supported values.
        """
        # Reject an unknown format before touching the filesystem, so a bad
        # value neither truncates the output file nor produces an instance
        # without a ``device`` attribute.
        if file_format not in ('txt', 'html', 'xml'):
            raise ValueError('unsupported file_format %r; expected one of: '
                             'txt, html, xml' % (file_format,))

        # ``open`` is used instead of ``file``, which exists only on Python 2.
        self.pdf_file = open(pdf_file, 'rb')
        try:
            self.outfp = open(txt_file, 'w')
        except Exception:
            # Do not leak the input handle when the output cannot be opened.
            self.pdf_file.close()
            raise

        laparams = LAParams() if layout_analysis else None
        self.rsrcmgr = PDFResourceManager(caching=True)

        if file_format == 'txt':
            converter_cls = TextConverter
        elif file_format == 'html':
            converter_cls = HTMLConverter
        else:
            converter_cls = XMLConverter
        self.device = converter_cls(self.rsrcmgr, self.outfp, codec='utf-8',
                                    laparams=laparams, imagewriter=None)
def parse_pdfs(pdf_filenames):
    """Convert every PDF in ``pdf_filenames`` to an XML file next to it.

    The output path is the input path with its extension replaced by
    ``.xml`` (via ``os.path.splitext``).

    Args:
        pdf_filenames: Iterable of PDF file paths.
    """
    # Set parameters
    pagenos = set()
    maxpages = 0
    password = ''
    imagewriter = None
    codec = 'utf-8'
    caching = True
    laparams = LAParams()

    rsrcmgr = PDFResourceManager(caching=caching)

    # Convert to XML as it retains the most information about text position
    # (compared to text, html, etc).
    for pdf_file in pdf_filenames:

        # Parenthesised single-argument ``print`` produces the same output
        # as a statement on Python 2 and as a function call on Python 3.
        print("Converting %s to xml." % pdf_file)

        outfile = os.path.splitext(pdf_file)[0] + '.xml'
        with open(pdf_file, 'rb') as fp, open(outfile, 'w') as outfp:

            device = XMLConverter(rsrcmgr,
                                  outfp,
                                  codec=codec,
                                  laparams=laparams,
                                  imagewriter=imagewriter)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp,
                                              pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)
            finally:
                # Close the device even if a page fails to process.
                device.close()

        print("Conversion complete.")
示例#27
0
def convert_pdf_to_xml(path):
    """Return the XML that ``XMLConverter`` produces for the PDF at ``path``.

    Args:
        path: Path of the PDF file.

    Returns:
        The content of the in-memory buffer, read after the converter has
        been closed.
    """
    from pdfminer.converter import XMLConverter
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    try:
        device = XMLConverter(rsrcmgr, retstr, codec=codec,
                              laparams=LAParams())
        try:
            # ``open`` replaces ``file``, which exists only on Python 2, and
            # the ``with`` block closes the PDF even if a page fails.
            with open(path, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            # Close the device before reading the buffer so that whatever
            # the converter writes on close is part of the result.
            device.close()
        return retstr.getvalue()
    finally:
        retstr.close()
示例#28
0
 def get_xml_data(self):
     """Write the XML representation of ``self.pdffile`` to ``self.xmlfile``.

     Afterwards ``self.font_metrics`` maps the name of every font cached by
     the resource manager to a dict with its ``bbox`` and ``descent``. Fonts
     lacking one of those attributes are skipped and their ``dir()`` is
     printed instead.
     """
     rm = PDFResourceManager(caching=True,
                             font_correctors=self.font_correctors)
     laparams = LAParams()
     pagenos = set()
     maxpages = 0
     rotation = 0
     password = ""
     # The input is opened first so a missing PDF does not truncate the XML
     # file; ``with`` closes both files even when processing fails.
     with open(self.pdffile, "rb") as infile, \
             open(self.xmlfile, "wb") as outfp:
         device = XMLConverter(rm,
                               outfp,
                               codec="UTF-8",
                               laparams=laparams,
                               imagewriter=None)
         try:
             interpreter = PDFPageInterpreter(rm, device)
             for page in PDFPage.get_pages(infile,
                                           pagenos,
                                           maxpages=maxpages,
                                           password=password,
                                           caching=True,
                                           check_extractable=True):
                 page.rotate = (page.rotate + rotation) % 360
                 interpreter.process_page(page)
             self.font_metrics = {}
             for font in list(rm._cached_fonts.values()):
                 try:
                     self.font_metrics[font.fontname] = {
                         "bbox": font.bbox,
                         "descent": font.descent
                     }
                 except AttributeError:
                     print((dir(font)))
         finally:
             # Close the device while the output file is still open.
             device.close()
示例#29
0
def convert_xml(inf,
                outf,
                page_numbers=None,
                output_type='xml',
                codec='utf-8',
                laparams=None,
                maxpages=0,
                scale=1.0,
                rotation=0,
                output_dir=None,
                strip_control=False,
                debug=False,
                disable_caching=False):
    """Convert the PDF read from ``inf`` into XML written to ``outf``.

    Args:
        inf: Open PDF file object handed to ``PDFPage.get_pages``.
        outf: Output stream handed to ``XMLConverter``.
        page_numbers: Page selection forwarded to ``PDFPage.get_pages``.
        output_type: Unused; kept for signature compatibility.
        codec: Output encoding passed to the converter.
        laparams: Layout parameters; a default ``LAParams()`` is created
            when omitted.
        maxpages: Page limit forwarded to ``PDFPage.get_pages``.
        scale: Unused; kept for signature compatibility.
        rotation: Degrees added to each page's own rotation (modulo 360).
        output_dir: When truthy, an ``ImageWriter`` for this directory is
            given to the converter.
        strip_control: Unused; kept for signature compatibility.
        debug: Unused; kept for signature compatibility.
        disable_caching: Turns off caching in both the resource manager and
            the page iterator.

    Returns:
        The last page processed, or ``None`` when no page was yielded.
    """
    # Honour caller-supplied layout parameters instead of overwriting them.
    if laparams is None:
        laparams = LAParams()
    imagewriter = ImageWriter(output_dir) if output_dir else None
    caching = not disable_caching

    rsrcmgr = PDFResourceManager(caching=caching)
    device = XMLConverter(rsrcmgr,
                          outf,
                          codec=codec,
                          laparams=laparams,
                          imagewriter=imagewriter)

    # Pre-bind so an empty page sequence returns None rather than raising
    # UnboundLocalError at the return statement.
    page = None
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(inf,
                                      page_numbers,
                                      maxpages=maxpages,
                                      caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
    finally:
        device.close()
    return page
示例#30
0
 def reinit(self):
     """Create a fresh output buffer, device and page interpreter.

     The device type is chosen by ``self.format``: ``'text'``, ``'html'``
     and ``'xml'`` build the matching converter writing into a new
     ``BytesIO`` with ``self.codec``; ``'filter'`` builds a
     ``PDFPageAggregator`` that does not use the buffer.

     Returns:
         dict with keys ``'retstr'`` (the buffer), ``'device'`` and
         ``'interpreter'``.

     Raises:
         ValueError: if ``self.format`` is none of the values above.
     """
     manager = PDFResourceManager()
     buffer = BytesIO()
     layout = LAParams()
     kind = self.format
     if kind == 'filter':
         device = PDFPageAggregator(manager, laparams=layout)
     else:
         if kind == 'text':
             converter_cls = TextConverter
         elif kind == 'html':
             converter_cls = HTMLConverter
         elif kind == 'xml':
             converter_cls = XMLConverter
         else:
             raise ValueError('provide format, either text, html or xml!')
         device = converter_cls(manager,
                                buffer,
                                codec=self.codec,
                                laparams=layout)
     return dict(retstr=buffer,
                 device=device,
                 interpreter=PDFPageInterpreter(manager, device))
示例#31
0
    def run(self):
        """Convert every file in ``self._args`` with the configured device.

        When ``self._outtype`` is unset it defaults to ``'text'``; if the
        module runs as a script and ``self._outfile`` is set, the type is
        instead inferred from that file's extension.

        Output goes to ``self._outfile`` or ``sys.stdout`` when the module
        runs as a script, otherwise to an in-memory buffer.

        Returns:
            The buffer contents when not run as a script; the result of
            ``usage()`` for an unknown output type; otherwise ``None``.
        """
        as_script = __name__ == '__main__'
        rsrcmgr = PDFResourceManager(caching=self._caching)
        if not self._outtype:
            self._outtype = 'text'
            if as_script and self._outfile:
                if self._outfile.endswith(('.htm', '.html')):
                    self._outtype = 'html'
                elif self._outfile.endswith('.xml'):
                    self._outtype = 'xml'
                elif self._outfile.endswith('.tag'):
                    self._outtype = 'tag'
        # Reject unknown types before the output file is created, so nothing
        # is opened (and leaked) on this path.
        if self._outtype not in ('text', 'xml', 'html', 'tag'):
            return usage()

        owns_outfp = False
        if as_script:
            if self._outfile:
                outfp = open(self._outfile, 'w')
                owns_outfp = True
            else:
                outfp = sys.stdout
        else:
            from cStringIO import StringIO
            outfp = StringIO()

        try:
            if self._outtype == 'text':
                device = TextConverter(rsrcmgr, outfp, codec=self._codec, laparams=self._laparams, imagewriter=self._imagewriter)
            elif self._outtype == 'xml':
                device = XMLConverter(rsrcmgr, outfp, codec=self._codec, laparams=self._laparams, imagewriter=self._imagewriter)
            elif self._outtype == 'html':
                device = HTMLConverter(rsrcmgr, outfp, codec=self._codec, scale=self._scale, layoutmode=self._layoutmode, laparams=self._laparams, imagewriter=self._imagewriter)
            else:
                device = TagExtractor(rsrcmgr, outfp, codec=self._codec)
            try:
                for fname in self._args:
                    with open(fname, 'rb') as fp:
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        for page in PDFPage.get_pages(fp, self._pagenos, maxpages=self._maxpages, password=self._password, caching=self._caching, check_extractable=True):
                            interpreter.process_page(page)
            finally:
                device.close()
            if not as_script:
                return outfp.getvalue()
        finally:
            if owns_outfp:
                outfp.close()
            elif as_script:
                # sys.stdout is shared with the rest of the process: flush
                # it rather than closing it.
                outfp.flush()
示例#32
0
def getPDFText(pdfFilenamePath):
    """Return the XML produced by pdfminer for the PDF at ``pdfFilenamePath``.

    Prints a message and returns ``''`` when the document cannot be parsed
    or reports itself as not extractable.

    The returned string is read from the buffer before the converter is
    closed, exactly as the original code did.
    """
    retstr = StringIO()
    # PDFs are binary data: open in 'rb', and let the context manager close
    # the handle on every exit path.
    with open(pdfFilenamePath, 'rb') as pdf_fp:
        parser = PDFParser(pdf_fp)
        try:
            document = PDFDocument(parser)
        except Exception:
            # Deliberately broad: any parse failure is reported as "unreadable".
            print(pdfFilenamePath, 'is not a readable pdf')
            return ''
        if not document.is_extractable:
            print(pdfFilenamePath,
                  "Warning: could not extract text from pdf file.")
            return ''
        rsrcmgr = PDFResourceManager()
        device = XMLConverter(rsrcmgr,
                              retstr,
                              codec='ascii',
                              laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
            return retstr.getvalue()
        finally:
            device.close()
示例#33
0
def convert_pdf(format='html', codec='utf-8', password='', file_name=None):
    """Convert a PDF, save the parsed markup next to it and return the soup.

    Args:
        format: ``'text'``, ``'html'`` or ``'xml'``; selects the converter.
        codec: Encoding used by the converter and to decode its output.
        password: Password forwarded to ``PDFPage.get_pages``.
        file_name: Path of the PDF. When omitted, the original hard-coded
            sample document is used.

    Returns:
        The ``BeautifulSoup`` object built from the converted output. The
        same markup is written to ``file_name + '.html'``.

    Raises:
        ValueError: if ``format`` is not one of the supported values.
    """
    # Fail fast, before any resources are created.
    if format not in ('text', 'html', 'xml'):
        raise ValueError('provide format, either text, html or xml!')
    if file_name is None:
        pdf_folder = '/home/bichitra/Desktop/project/pdf/'
        file_name = pdf_folder + '1c1edeee-a13e-4b2e-90be-eb1dd03c3384.pdf'

    converters = {
        'text': TextConverter,
        'html': HTMLConverter,
        'xml': XMLConverter,
    }
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = converters[format](rsrcmgr,
                                retstr,
                                codec=codec,
                                laparams=LAParams())
    try:
        with open(file_name, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp,
                                          set(),
                                          maxpages=0,
                                          password=password,
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        # Decode with the codec the converter encoded with, not the
        # interpreter default. The value is read before device.close(), as
        # in the original.
        text = retstr.getvalue().decode(codec)
    finally:
        device.close()
        retstr.close()

    soup = BeautifulSoup(text, 'html.parser')
    with open(file_name + '.html', 'w+') as htmlfile:
        htmlfile.write(str(soup))
    return soup
示例#34
0
def convert(fname,Converter='HTML',pages=None,write=False):
    '''
    Convert the PDF at ``fname`` with the selected pdfminer converter.

    Converter: 'HTML', 'Text' or 'XML'
    pages: optional iterable of 1-based page numbers; each is shifted to a
        0-based index and the resulting set is passed to
        ``PDFPage.get_pages``. Empty/None passes an empty set.
        NOTE(review): the original docstring described this as
        [beginPage, endPage], but the values are passed on as individual
        indices, not expanded into a range -- confirm the intended meaning.
    write: when true, the result is handed to ``writeFile`` with a suffix
        matching the converter and nothing is returned; otherwise the
        converted text is returned.

    Raises ValueError for an unknown ``Converter``.
    '''
    suffixes = {'HTML': '.html', 'Text': '.txt', 'XML': '.xml'}
    # Validate up front: the original fell through with no converter bound
    # and failed later with an UnboundLocalError.
    if Converter not in suffixes:
        raise ValueError('Converter must be one of HTML, Text or XML')

    pagenums = set(p - 1 for p in pages) if pages else set()
    output = StringIO()
    manager = PDFResourceManager()

    if Converter == 'HTML':
        converter = HTMLConverter(manager, output, laparams=LAParams())
    elif Converter == 'Text':
        converter = TextConverter(manager, output, laparams=LAParams())
    else:
        converter = XMLConverter(manager, output, laparams=LAParams())

    try:
        interpreter = PDFPageInterpreter(manager, converter)
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
    finally:
        converter.close()
    text = output.getvalue()
    # The original wrote `output.close` without parentheses, a no-op.
    output.close()

    if write:
        writeFile(fname, text, suffixes[Converter])
    else:
        return text
def convert_pdf(path, format='html', codec='utf-8', password=''):
    """Convert the PDF at ``path`` and return the result as a string.

    Args:
        path: Path of the PDF file to read.
        format: ``'text'``, ``'html'`` or ``'xml'``; selects the converter.
        codec: Encoding used by the converter and to decode its output.
        password: Password forwarded to ``PDFPage.get_pages``.

    Returns:
        The converter output decoded with ``codec``. The value is read
        before the converter is closed, as in the original.

    Raises:
        ValueError: if ``format`` is not one of the supported values.
    """
    # Fail fast, before any resources are created.
    if format not in ('text', 'html', 'xml'):
        raise ValueError('provide format, either text, html or xml!')

    converters = {
        'text': TextConverter,
        'html': HTMLConverter,
        'xml': XMLConverter,
    }
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = converters[format](rsrcmgr,
                                retstr,
                                codec=codec,
                                laparams=LAParams())
    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp,
                                          set(),
                                          maxpages=0,
                                          password=password,
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        # Decode with the same codec the converter encoded with; a bare
        # .decode() would ignore the `codec` argument.
        text = retstr.getvalue().decode(codec)
    finally:
        device.close()
        retstr.close()
    return text


#text = convert_pdf('python cv.pdf', 'text')
#
#with open('python cv.txt', 'w', encoding='utf-8') as f:
#    f.write(text)
示例#36
0
def main(argv):
    """Command-line entry point: convert PDF files to text, XML, HTML or tags.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name shown in
            the usage message and the rest is parsed with ``getopt``.

    Returns:
        100 (after printing the usage line) when the options cannot be
        parsed, no input file is given or the output type is unknown;
        otherwise ``None``.
    """
    def usage():
        print((
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
            '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
            '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...'
            % argv[0]))
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    debug = False
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # -n is recorded as a flag and applied after the loop, so that a layout
    # option following -n no longer hits an attribute assignment on None.
    use_layout = True
    for (k, v) in opts:
        if k == '-d':
            debug = True
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            use_layout = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not use_layout:
        laparams = None

    if debug:
        set_debug_logging()
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   outfp,
                                   scale=scale,
                                   layoutmode=layoutmode,
                                   laparams=laparams,
                                   outdir=outdir,
                                   debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp)
        else:
            return usage()
        try:
            for fname in args:
                with io.open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr,
                                device,
                                fp,
                                pagenos,
                                maxpages=maxpages,
                                password=password,
                                caching=caching,
                                check_extractable=True)
        finally:
            device.close()
    finally:
        # Runs on the unknown-type return as well, so the output file is
        # never left open.
        if close_outfp:
            outfp.close()
示例#37
0
def main(argv):
    """Convert the PDF files named in ``args`` using fixed default options.

    NOTE(review): ``argv`` is never read, and ``args`` and ``usage`` are not
    defined inside this function -- they must exist at module level for it
    to run. Option parsing appears to have been removed from this copy;
    confirm before relying on it.

    With the defaults below the output type resolves to ``'text'`` and the
    output goes to ``sys.stdout``.
    """
    import getopt

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr,
                                  outfp,
                                  codec=codec,
                                  laparams=laparams,
                                  imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   scale=scale,
                                   layoutmode=layoutmode,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp,
                                                  pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        if outfile:
            outfp.close()
        else:
            # sys.stdout belongs to the whole process: flush, don't close.
            outfp.flush()
    return
示例#38
0
def main(argv):
    """Command-line entry point: convert PDF files to text, XML, HTML or tags.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name shown in
            the usage message and the rest is parsed with ``getopt``.

    Returns:
        100 (after printing the usage text) when the options cannot be
        parsed, no input file is given or the output type is unknown;
        otherwise ``None``.
    """
    import getopt

    def usage():
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
            ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
            ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
            ' [-t text|html|xml|tag] [-c codec] [-s scale]'
            ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # -n is recorded as a flag and applied after the loop, so that a layout
    # option following -n no longer hits an attribute assignment on None.
    use_layout = True
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            use_layout = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not use_layout:
        laparams = None
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr,
                                  outfp,
                                  codec=codec,
                                  laparams=laparams,
                                  imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   scale=scale,
                                   layoutmode=layoutmode,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp,
                                                  pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        if outfile:
            outfp.close()
        else:
            # sys.stdout belongs to the whole process: flush, don't close.
            outfp.flush()
    return
示例#39
0
def main(argv=None):
    """Command-line entry point: parse options and convert the given PDFs.

    Args:
        argv: Argument list handed to ``argparse``; ``None`` lets argparse
            read ``sys.argv``.

    The output type comes from ``-t`` or, failing that, from the extension
    of the ``-o`` file name; anything else falls back to plain text. The
    converter is closed, and the output file too unless it is
    ``sys.stdout``, even when processing raises.
    """
    parser = argparse.ArgumentParser(description='Convert PDF into text.')
    parser.add_argument('file',
                        nargs='*',
                        type=argparse.FileType('rb'),
                        # nargs='*' produces a list, so the fallback must be
                        # a list as well; a bare sys.stdin would be iterated
                        # line by line by the processing loop below.
                        default=[getattr(sys.stdin, 'buffer', sys.stdin)],
                        help='file(s) to convert')
    parser.add_argument('-C',
                        '--nocache',
                        dest='cache',
                        action='store_false',
                        help='prevent object caching (slower)')
    parser.add_argument('-l',
                        metavar='level',
                        default='warn',
                        help='logging level (warn, info, debug)')
    parser.add_argument('-p',
                        metavar='page',
                        nargs='+',
                        default=[],
                        type=int,
                        help='page number(s) (space separated)')
    parser.add_argument('-m',
                        metavar='maxpages',
                        default=0,
                        type=int,
                        help='maximum number of pages to extract')
    parser.add_argument('-P',
                        metavar='password',
                        default='',
                        help='pdf password')
    parser.add_argument('-o',
                        metavar='outfile',
                        type=argparse.FileType('w'),
                        default=sys.stdout,
                        help='output file name (default: stdout)')
    parser.add_argument('-O',
                        metavar='directory',
                        type=ImageWriter,
                        help='extract images and save to directory')
    parser.add_argument('-t',
                        metavar='outtype',
                        help='output type (text, html, xml, tag)')
    parser.add_argument('-c',
                        metavar='codec',
                        default='utf-8',
                        help='output text encoding (default: %(default)s)')
    lagroup = parser.add_argument_group(title='layout analysis')
    lagroup.add_argument('-n',
                         action='store_true',
                         help='disable layout analysis')
    lagroup.add_argument('-A',
                         action='store_true',
                         help='force layout analysis on all text')
    lagroup.add_argument('-V',
                         action='store_true',
                         help='detect vertical text')
    lagroup.add_argument('-M',
                         metavar='char_margin',
                         type=float,
                         help='custom character margin')
    lagroup.add_argument('-L',
                         metavar='line_margin',
                         type=float,
                         help='custom line margin')
    lagroup.add_argument('-W',
                         metavar='word_margin',
                         type=float,
                         help='custom word margin')
    lagroup.add_argument('-F',
                         metavar='boxes_flow',
                         type=float,
                         help='custom boxes flow')
    lagroup.add_argument('-Y',
                         metavar='layout_mode',
                         default='normal',
                         help='layout mode for HTML (normal, exact, loose)')
    lagroup.add_argument('-s',
                         metavar='scale',
                         default=1,
                         type=float,
                         help='output scaling for HTML')
    args = parser.parse_args(argv)

    logging.basicConfig()
    logging.getLogger('pdfminer').setLevel(args.l.upper())

    if args.n:
        laparams = None
    else:
        laparams = LAParams()
        laparams.all_texts = args.A
        laparams.detect_vertical = args.V
        # Compare against None so an explicit 0 on the command line is
        # applied instead of being dropped by a truthiness test.
        overrides = (('char_margin', args.M),
                     ('line_margin', args.L),
                     ('word_margin', args.W),
                     ('boxes_flow', args.F))
        for attr, value in overrides:
            if value is not None:
                setattr(laparams, attr, value)

    rsrcmgr = PDFResourceManager(caching=args.cache)
    outtype = args.t
    if not outtype:
        if args.o:
            outname = args.o.name
            if outname.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outname.endswith('.xml'):
                outtype = 'xml'
            elif outname.endswith('.tag'):
                outtype = 'tag'
    if outtype == 'xml':
        device = XMLConverter(rsrcmgr,
                              args.o,
                              codec=args.c,
                              laparams=laparams,
                              imagewriter=args.O)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr,
                               args.o,
                               codec=args.c,
                               scale=args.s,
                               layoutmode=args.Y,
                               laparams=laparams,
                               imagewriter=args.O)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, args.o, codec=args.c)
    else:
        device = TextConverter(rsrcmgr,
                               args.o,
                               codec=args.c,
                               laparams=laparams,
                               imagewriter=args.O)
    try:
        try:
            for fp in args.file:
                try:
                    process_pdf(rsrcmgr,
                                device,
                                fp, [i - 1 for i in args.p],
                                maxpages=args.m,
                                password=args.P,
                                caching=args.cache,
                                check_extractable=True)
                finally:
                    fp.close()
        finally:
            device.close()
    finally:
        if args.o is not sys.stdout:
            args.o.close()
示例#40
0
    if fname[-3:] == "pdf":
        # Convert the PDF named by `fname` to XML, written to `fname + '.txt'`.

        # Set parameters
        pagenos = set()
        maxpages = 0
        password = ''
        imagewriter = None
        codec = 'utf-8'
        caching = True
        laparams = LAParams()
        outfile = fname + '.txt'

        rsrcmgr = PDFResourceManager(caching=caching)

        # open() replaces the Python-2-only file() builtin; the context
        # managers and the finally clause release both files and the device
        # even if a page fails to process.
        with open(outfile, 'w') as outfp:
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, imagewriter=imagewriter)
            try:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        interpreter.process_page(page)
            finally:
                device.close()
示例#41
0
def main(argv=None):
    """Command-line entry point: parse options and convert the given PDFs.

    Args:
        argv: Argument list handed to ``argparse``; ``None`` lets argparse
            read ``sys.argv``.

    The output type comes from ``-t`` or, failing that, from the extension
    of the ``-o`` file name; anything else falls back to plain text. The
    converter is closed, and the output file too unless it is
    ``sys.stdout``, even when processing raises.
    """
    parser = argparse.ArgumentParser(description='Convert PDF into text.')
    # nargs='*' produces a list, so the fallback must be a list as well; a
    # bare sys.stdin would be iterated line by line by the loop below.
    stdin_default = [getattr(sys.stdin, 'buffer', sys.stdin)]
    parser.add_argument('file', nargs='*', type=argparse.FileType('rb'), default=stdin_default, help='file(s) to convert')
    parser.add_argument('-C', '--nocache', dest='cache', action='store_false', help='prevent object caching (slower)')
    parser.add_argument('-l', metavar='level', default='warn', help='logging level (warn, info, debug)')
    parser.add_argument('-p', metavar='page', nargs='+', default=[], type=int, help='page number(s) (space separated)')
    parser.add_argument('-m', metavar='maxpages', default=0, type=int, help='maximum number of pages to extract')
    parser.add_argument('-P', metavar='password', default='', help='pdf password')
    parser.add_argument('-o', metavar='outfile', type=argparse.FileType('w'), default=sys.stdout,
                        help='output file name (default: stdout)')
    parser.add_argument('-O', metavar='directory', type=ImageWriter, help='extract images and save to directory')
    parser.add_argument('-t', metavar='outtype', help='output type (text, html, xml, tag)')
    parser.add_argument('-c', metavar='codec', default='utf-8', help='output text encoding (default: %(default)s)')
    lagroup = parser.add_argument_group(title='layout analysis')
    lagroup.add_argument('-n', action='store_true', help='disable layout analysis')
    lagroup.add_argument('-A', action='store_true', help='force layout analysis on all text')
    lagroup.add_argument('-V', action='store_true', help='detect vertical text')
    lagroup.add_argument('-M', metavar='char_margin', type=float, help='custom character margin')
    lagroup.add_argument('-L', metavar='line_margin', type=float, help='custom line margin')
    lagroup.add_argument('-W', metavar='word_margin', type=float, help='custom word margin')
    lagroup.add_argument('-F', metavar='boxes_flow', type=float, help='custom boxes flow')
    lagroup.add_argument('-Y', metavar='layout_mode', default='normal', help='layout mode for HTML (normal, exact, loose)')
    lagroup.add_argument('-s', metavar='scale', default=1, type=float, help='output scaling for HTML')
    args = parser.parse_args(argv)

    logging.basicConfig()
    logging.getLogger('pdfminer').setLevel(args.l.upper())

    if args.n:
        laparams = None
    else:
        laparams = LAParams()
        laparams.all_texts = args.A
        laparams.detect_vertical = args.V
        # Compare against None so an explicit 0 on the command line is
        # applied instead of being dropped by a truthiness test.
        if args.M is not None:
            laparams.char_margin = args.M
        if args.L is not None:
            laparams.line_margin = args.L
        if args.W is not None:
            laparams.word_margin = args.W
        if args.F is not None:
            laparams.boxes_flow = args.F

    rsrcmgr = PDFResourceManager(caching=args.cache)
    outtype = args.t
    if not outtype and args.o:
        outname = args.o.name
        if outname.endswith(('.htm', '.html')):
            outtype = 'html'
        elif outname.endswith('.xml'):
            outtype = 'xml'
        elif outname.endswith('.tag'):
            outtype = 'tag'
    if outtype == 'xml':
        device = XMLConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams, imagewriter=args.O)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, args.o, codec=args.c, scale=args.s, layoutmode=args.Y,
                               laparams=laparams, imagewriter=args.O)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, args.o, codec=args.c)
    else:
        device = TextConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams, imagewriter=args.O)
    try:
        try:
            for fp in args.file:
                try:
                    process_pdf(rsrcmgr, device, fp, [i-1 for i in args.p], maxpages=args.m, password=args.P,
                                caching=args.cache, check_extractable=True)
                finally:
                    fp.close()
        finally:
            device.close()
    finally:
        if args.o is not sys.stdout:
            args.o.close()