Exemplo n.º 1
0
def process_pdf(in_path, out_path):
    """
    Processes a PDF and extracts its contents to HTML.

    Every page of the source document is run through an ``HTMLConverter``
    that writes to the destination file.

    Args:
        in_path: The full path to the source PDF file.
        out_path: The full path to the destination HTML file.
    """
    # No page filter: the empty set is passed straight to get_pages().
    page_numbers = set()

    # Context managers guarantee both handles are closed even when parsing
    # raises part-way through the document.
    with open(in_path, 'rb') as in_file, open(out_path, 'w') as out_file:
        # Set up the resource manager, device, and interpreter
        res_mgr = PDFResourceManager()
        device = HTMLConverter(res_mgr, out_file, codec='utf-8',
                               laparams=LAParams(), imagewriter=None)
        try:
            interpreter = PDFPageInterpreter(res_mgr, device)
            for page in PDFPage.get_pages(in_file, page_numbers,
                                          maxpages=0, password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        finally:
            # The device holds out_file, so it is closed first, while the
            # output handle is still open.
            device.close()
Exemplo n.º 2
0
def pdf_to_html(scraped_pdf_data):
    """Convert in-memory PDF data to HTML and return the generated markup.

    ``scraped_pdf_data`` is written to an in-memory stream, handed to
    ``process_pdf`` together with an ``HTMLConverter``, and the converter's
    output buffer is returned.
    """
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    from pdfminer.pdfdevice import PDFDevice
    from pdfminer.converter import HTMLConverter
    from pdfminer.layout import LAParams

    import StringIO

    # Stage the PDF data in a stream rewound to its beginning.
    pdf_stream = StringIO.StringIO()
    pdf_stream.write(scraped_pdf_data)
    pdf_stream.seek(0)
    html_stream = StringIO.StringIO()

    manager = PDFResourceManager()
    layout = LAParams(char_margin=0.5, line_margin=0.5,
                      word_margin=0.3, boxes_flow=0)
    converter = HTMLConverter(manager, html_stream,
                              layoutmode='normal', scale=2, laparams=layout)
    process_pdf(manager, converter, pdf_stream)
    converter.close()

    # Read the result before releasing the buffers.
    html = html_stream.getvalue()
    html_stream.close()
    pdf_stream.close()
    return html
Exemplo n.º 3
0
    def pdf_para_html(self, path):
        """Convert the PDF file at ``path`` to HTML and return the markup.

        All pages are processed (no page filter, ``maxpages=0``) with an
        empty password. The input file is closed and the converter is shut
        down even if processing a page raises.
        """
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
        from pdfminer.converter import HTMLConverter
        from pdfminer.layout import LAParams
        from pdfminer.pdfpage import PDFPage
        from cStringIO import StringIO

        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                # maxpages=0 means "all pages".
                for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                              password="", caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            device.close()
        # Read the buffer only after the device is closed.
        html = retstr.getvalue()
        retstr.close()
        return html
Exemplo n.º 4
0
def convertPDF(fname, pages=None):
    """Convert the PDF file ``fname`` to HTML and return the markup.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers passed to
            ``PDFPage.get_pages`` as a set; when empty or ``None`` no page
            filter is applied.

    Returns:
        The contents of the converter's output buffer.
    """
    pagenos = set(pages) if pages else set()
    caching = True
    rotation = 0
    outfp = StringIO()

    rsrcmgr = PDFResourceManager(caching=caching)
    device = HTMLConverter(rsrcmgr, outfp, codec='utf-8', scale=1,
                           layoutmode='normal', laparams=LAParams(),
                           imagewriter=None)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The with-block closes the input file even if a page fails.
        with open(fname, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos,
                                          maxpages=0, password='',
                                          caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate+rotation) % 360
                interpreter.process_page(page)
    finally:
        device.close()

    text = outfp.getvalue()
    outfp.close()
    return text
Exemplo n.º 5
0
def parse_html(file_name):
    """Convert the PDF file ``file_name`` to HTML and return the text.

    The converter writes into a ``TextReciver`` instance; its ``text``
    attribute is returned once all pages have been processed.
    """
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    rotation = 0
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    outfp = TextReciver()
    device = HTMLConverter(rsrcmgr, outfp, codec='utf-8',
                           laparams=LAParams(), imagewriter=None)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The with-block closes the input file even if a page fails.
        with open(file_name, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos,
                                          maxpages=maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate+rotation) % 360
                interpreter.process_page(page)
    finally:
        device.close()
    return outfp.text
Exemplo n.º 6
0
 def to_html(self, fp):
     """Run ``fp`` through an HTML converter and return the generated markup.

     A fresh in-memory buffer receives the converter output; ``fp`` and the
     converter are handed to ``self._process``. Converter settings come from
     ``self.options``.
     """
     buf = StringIO.StringIO()
     opts = self.options
     converter = HTMLConverter(
         self.resmgr,
         buf,
         codec=opts.codec,
         scale=opts.scale,
         layoutmode=opts.layoutmode,
         laparams=opts.laparams,
         outdir=None,
     )
     self._process(fp, converter)
     converter.close()
     # Grab the output before the buffer is released.
     html = buf.getvalue()
     buf.close()
     return html
Exemplo n.º 7
0
    def transform_file(self, pdfpath):
        """Convert the PDF at ``pdfpath`` to HTML and wrap it in a stub dict.

        Returns:
            On success, a dict with the HTML and its text under ``"markup"``,
            the text length under ``"__text_size"`` and a ``"timestamp"``.
            On any failure, a dict with the error message under ``"error"``
            and ``"__text_size"`` set to -1. Both variants carry
            ``"workflow": {"is_stub": True}``.
        """
        try:
            self.LOGGER.debug(pdfpath)
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()

            device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                                   laparams=self.laparams)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # The with-block closes the file on the error path too; the
                # outer handler would otherwise leave it open.
                with open(pdfpath, 'rb') as fp:
                    # NOTE check_extractable seems to allow overriding text
                    # extraction locks
                    for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                                  password="", caching=True,
                                                  check_extractable=False):
                        interpreter.process_page(page)
            finally:
                device.close()
            html = retstr.getvalue()
            # otherwise html is str at this point, not unicode
            html = html.decode('utf8')
            retstr.close()
            soup = BeautifulSoup(html)
            text_size = len(soup.text)
            stub_data = {
                "markup": {
                    "innerHTML": unicode(html),
                    "innerText": unicode(soup.text)
                },
                "workflow": {
                    "is_stub": True
                },
                "__text_size": text_size,
                # __fields are ignored by kibana
                "timestamp": datetime.now()
            }
        except Exception as e:
            # Deliberate best-effort: any failure is reported in the stub
            # instead of being raised.
            stub_data = {
                "error": str(e),
                "workflow": {
                    "is_stub": True
                },
                "__text_size": -1
            }
        return stub_data
Exemplo n.º 8
0
	def __init__(self):
		'''Set the default options and build the text and HTML converters.'''
		# debug option
		self.setdebug(0)

		# input options; page index 0 only, i.e. only first page
		self.pagenos = set([0])
		self.password = ''
		self.maxpages = 0

		# output options
		self.pageno = 1
		self.showpageno = True
		self.outfp = stdmodel()
		self.codec = 'utf-8'
		self.scale = 1
		self.rotation = 0
		self.imagewriter = None
		self.laparams = LAParams()
		self.layoutmode = 'normal'

		# ResourceManager facilitates reuse of shared resources such as fonts
		# and images so that large objects are not allocated multiple times.
		# Caching stays off: the default True was noted to cause problems.
		self.caching = False
		self.rsrcmgr = PDFResourceManager(caching=self.caching)

		# Both converters share the resource manager, the output object and
		# these keyword settings.
		shared = dict(codec=self.codec, laparams=self.laparams,
			imagewriter=self.imagewriter)

		# Important Main converter for pdf file
		self.device = TextConverter(self.rsrcmgr, self.outfp, **shared)

		self.htmldevice = HTMLConverter(self.rsrcmgr, self.outfp,
			scale=self.scale, layoutmode=self.layoutmode, **shared)
Exemplo n.º 9
0
def convert_pdf(path):
    """Convert the PDF file at ``path`` to HTML and return the markup.

    Page processing is delegated to ``process_pdf``. The input file is
    closed and the converter shut down even if that call raises.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    try:
        with open(path, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp)
    finally:
        device.close()

    # Read the buffer only after the device is closed.
    html = retstr.getvalue()
    retstr.close()
    return html
Exemplo n.º 10
0
def extract_price_from_pdf(file_name):
    """Convert a PDF to HTML and collect the prices found in the markup.

    Args:
        file_name: Path of the PDF file to read.

    Returns:
        A list of ``(ypos, price)`` tuples in match order, where ``ypos`` is
        derived from the ``top:<n>px`` value of the first line of each match
        and ``price`` is a float. The original snippet built this list but
        ended without returning it.
    """
    rotation = 0
    rsrcmgr = PDFResourceManager(caching=True)
    outfp = TextReciver()
    device = HTMLConverter(rsrcmgr, outfp, codec='utf-8',
                           laparams=LAParams(), imagewriter=None)

    # Read the file; the with-block closes it even if a page fails.
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(file_name, 'rb') as fp:
            for page in PDFPage.get_pages(fp,
                                          set(),
                                          caching=True,
                                          check_extractable=True):
                page.rotate = (page.rotate+rotation) % 360
                interpreter.process_page(page)
    finally:
        device.close()

    # Find all lines that end with a price and include position
    # information. Also find all following lines that include prices
    # but no new location (shorter 100 characters)
    matches = re.finditer(r'(.*left.*[0-9]{1,2}\.[0-9]{1,2} )'
                          r'(\n<br>.{0,100}[0-9]{1,2}\.[0-9]{1,2} *)*',
                          outfp.text)

    pos_list = []
    for m in matches:
        line_group = m.group().split('\n')

        # Extract the position information from the string
        pos_string = re.findall(r'(.*top:)([0-9]+)(px)', line_group[0])[0]
        ypos = pos_string[1]

        # Iterate over all lines and extract the price. Increment the
        # position slightly for each new line
        for i, price_text in enumerate(line_group):
            # Searching the reversed line picks the last price on it.
            price = float(re.findall(r'[0-9]{1,2}\.[0-9]{1,2}',
                                     price_text[::-1])[0][::-1])
            ypos = int(ypos) + i
            pos_list.append((ypos, price))
    return pos_list
Exemplo n.º 11
0
def convert_pdf_to_html(path):
    """Convert the PDF file at ``path`` to HTML and return the markup.

    All pages are processed (no page filter, ``maxpages=0``) with an empty
    password. The input file is closed and the converter shut down even if
    processing a page raises.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          password="", caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
    finally:
        device.close()
    # Read the buffer only after the device is closed.
    html = retstr.getvalue()
    retstr.close()
    return html
Exemplo n.º 12
0
def extract_price_from_pdf(file_name):
    """Convert a PDF to HTML and return the prices found in the markup.

    Args:
        file_name: Path of the PDF file to read.

    Returns:
        A tuple of prices (floats) ordered by the vertical position derived
        from the ``top:<n>px`` value of each match. The tuple is empty when
        the markup contains no price lines (the original raised
        ``ValueError`` in that case).
    """
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    rotation = 0
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    outfp = TextReciver()
    device = HTMLConverter(rsrcmgr, outfp, codec='utf-8',
                           laparams=LAParams(), imagewriter=None)

    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The with-block closes the input file even if a page fails.
        with open(file_name, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos,
                                          maxpages=maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate+rotation) % 360
                interpreter.process_page(page)
    finally:
        device.close()

    # A positioned line ending in a price, optionally followed by short
    # continuation lines that also end in a price.
    matches = re.finditer(r'(.*left.*[0-9]{1,2}\.[0-9]{1,2} )'
                          r'(\n<br>.{0,100}[0-9]{1,2}\.[0-9]{1,2} *)*',
                          outfp.text)
    pos_list = []
    for m in matches:
        line_group = m.group().split('\n')
        # Greedy ".*" selects the last "top:<n>px" on the first line.
        ypos = re.findall(r'.*top:([0-9]+)px', line_group[0])[0]
        for i, price in enumerate(line_group):
            if len(price):
                # Searching the reversed line picks the last price on it.
                p = float(re.findall(r'[0-9]{1,2}\.[0-9]{1,2}',
                                     price[::-1])[0][::-1])
                ypos = int(ypos) + i
                pos_list.append((ypos, p))

    # Sort by (position, price) and keep only the prices; this also copes
    # with an empty list, which zip(*pos_list) could not unpack.
    return tuple(p for _, p in sorted(pos_list))
Exemplo n.º 13
0
 def get_html(self, path):  # Pulls html from PDF instead of plain text
     """Convert the PDF at ``path`` to HTML and return the markup.

     ``".pdf"`` is appended to ``path`` when it does not already end with
     it. The input file is closed and the converter shut down even if
     processing a page raises.
     """
     if not path.endswith(".pdf"):
         path = path + ".pdf"
     rsrcmgr = PDFResourceManager()
     retstr = StringIO()
     device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                            laparams=LAParams())
     try:
         interpreter = PDFPageInterpreter(rsrcmgr, device)
         with open(path, 'rb') as fp:
             for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                           password="", caching=True,
                                           check_extractable=True):
                 interpreter.process_page(page)
     finally:
         device.close()
     # Read the buffer only after the device is closed.
     result = retstr.getvalue()
     retstr.close()
     return result
Exemplo n.º 14
0
def convert_pdf_to_html(url):
    """Download ``url`` and, if it is a PDF, return it converted to HTML.

    A HEAD request checks the ``content-type`` header first; the body is
    only downloaded when that header contains ``application/pdf``.

    Returns:
        The HTML markup, or ``None`` when the response is not reported as a
        PDF (including when the header is missing).
    """
    head = requests.head(url)

    # .get() treats a missing header as "not a PDF" instead of raising
    # KeyError.
    if 'application/pdf' not in head.headers.get("content-type", ""):
        return None

    response = requests.get(url)

    # Cast to StringIO object
    from StringIO import StringIO
    memory_file = StringIO(response.content)

    # Create a PDF parser object associated with the StringIO object
    parser = PDFParser(memory_file)

    # Create a PDF document object that stores the document structure
    document = PDFDocument(parser)

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
    finally:
        device.close()

    # Read the buffer only after the device is closed.
    html = retstr.getvalue()
    retstr.close()
    return html
Exemplo n.º 15
0
class PDF2Txt:
	'''Convert one PDF file to a text, XML or HTML output file.'''

	def __init__(self,pdffile,outfile,output_type='text'):
		'''Store the paths and output type and silence pdfminer debugging.

		pdffile: path of the PDF to read.
		outfile: path of the file to write.
		output_type: 'text', 'xml' or 'html'.
		'''
		PDFDocument.debug = 0
		PDFParser.debug = 0
		CMapDB.debug = 0
		PDFResourceManager.debug = 0
		PDFPageInterpreter.debug = 0
		PDFDevice.debug = 0
		self.rsrcmgr = PDFResourceManager(caching=True)
		self.outtype = output_type
		self.outfile = outfile
		self.pdffile = pdffile

	def convert(self):
		'''Run the conversion and write the result to self.outfile.

		Exits the process with status -1 when the output type is not
		supported.
		'''
		# Validate before touching the filesystem so that an unsupported
		# type no longer creates or truncates the output file.
		if self.outtype not in ('text', 'xml', 'html'):
			print('Formato de salida no soportado')
			sys.exit(-1)
		# Both handles are closed by the with-block, also on errors.
		with open(self.pdffile,'rb') as fp, open(self.outfile,'w') as outfp:
			if self.outtype == 'text':
				self.device = TextConverter(self.rsrcmgr,outfp,codec='utf-8',
					laparams=LAParams(),imagewriter=None)
			elif self.outtype == 'xml':
				self.device = XMLConverter(self.rsrcmgr, outfp, codec='utf-8',
					laparams=LAParams(), imagewriter=None)
			else:
				self.device = HTMLConverter(self.rsrcmgr, outfp, codec='utf-8',
					scale=1, layoutmode='normal', laparams=LAParams(),
					imagewriter=None)
			try:
				interpreter = PDFPageInterpreter(self.rsrcmgr,self.device)
				pagenos = set()
				for page in PDFPage.get_pages(fp,pagenos,caching=True,check_extractable=True):
					page.rotate = (page.rotate) % 360
					interpreter.process_page(page)
			finally:
				# Closed while outfp is still open, since the device holds it.
				self.device.close()
		print("Archivo %s creado en base a %s" % (self.outfile,self.pdffile))
Exemplo n.º 16
0
	def reset(self,html=False):
		'''Build a new resource manager and replace one of the converters.

		With html true the HTML converter is closed and recreated; otherwise
		the text converter is.'''
		self.rsrcmgr = PDFResourceManager(caching=self.caching)

		# Important Main converter for pdf file
		if not html:
			self.device.close()
			self.device = TextConverter(
				self.rsrcmgr, self.outfp,
				codec=self.codec,
				laparams=self.laparams,
				imagewriter=self.imagewriter)
			return

		self.htmldevice.close()
		self.htmldevice = HTMLConverter(
			self.rsrcmgr, self.outfp,
			codec=self.codec,
			scale=self.scale,
			layoutmode=self.layoutmode,
			laparams=self.laparams,
			imagewriter=self.imagewriter)
Exemplo n.º 17
0
def translate(output, args):
    """Convert the PDF file ``args`` and write the result to ``output``.

    The output format follows the extension of ``output``: ``.htm``/``.html``
    gives HTML, ``.xml`` gives XML, ``.tag`` gives tag output, anything else
    plain text. When ``output`` is empty the result goes to ``sys.stdout``,
    which is left open.

    Args:
        output: Path of the file to write, or a false value for stdout.
        args: Path of the PDF file to read.
    """
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    outfile = output
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)

    # The type is always derived from the extension, so it is one of the
    # four values handled below.
    outtype = 'text'
    if outfile:
        if outfile.endswith(('.htm', '.html')):
            outtype = 'html'
        elif outfile.endswith('.xml'):
            outtype = 'xml'
        elif outfile.endswith('.tag'):
            outtype = 'tag'

    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(args, 'rb') as fp:
                for page in PDFPage.get_pages(fp, pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate+rotation) % 360
                    interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Only close what this function opened; closing sys.stdout would
        # break every later write by the caller.
        if outfp is not sys.stdout:
            outfp.close()
Exemplo n.º 18
0
class PDFHandler(object):
	'''Read the contents of PDF files as plain text or HTML.

	Every reader accepts either a file name or an already open file/StringIO
	object (fobj). A file opened by the handler is closed afterwards; a
	caller-supplied object is left open and rewound to position 0.'''

	def __init__(self):
		'''Set the default options and build the text and HTML converters.'''
		# debug option
		self.setdebug(0)
		#only first page
		self.pagenos=set([0])
		self.pageno = 1
		self.outfp = stdmodel()
		self.codec = 'utf-8'
		self.showpageno = True
		self.scale = 1
		self.password = ''
		self.maxpages = 0
		self.rotation = 0
		self.imagewriter = None
		self.laparams = LAParams()
		self.layoutmode = 'normal'
		# ResourceManager facilitates reuse of shared resources such as fonts
		# and images so that large objects are not allocated multiple times.
		# Caching stays off: the default True was noted to cause problems.
		self.caching = False
		self.rsrcmgr = PDFResourceManager(caching=self.caching)

		# Important Main converter for pdf file
		self.device = TextConverter(self.rsrcmgr, self.outfp, codec=self.codec,
			laparams=self.laparams, imagewriter=self.imagewriter)

		self.htmldevice = HTMLConverter(self.rsrcmgr, self.outfp, codec=self.codec, scale=self.scale,
			layoutmode=self.layoutmode, laparams=self.laparams,
			imagewriter=self.imagewriter)

	def reset(self,html=False):
		'''Build a new resource manager and replace one of the converters.

		With html true the HTML converter is closed and recreated; otherwise
		the text converter is.'''
		self.rsrcmgr = PDFResourceManager(caching=self.caching)

		# Important Main converter for pdf file
		if (html):
			self.htmldevice.close()
			self.htmldevice = HTMLConverter(self.rsrcmgr, self.outfp, codec=self.codec, scale=self.scale,
				layoutmode=self.layoutmode, laparams=self.laparams,
				imagewriter=self.imagewriter)
		else:
			self.device.close()
			self.device = TextConverter(self.rsrcmgr, self.outfp, codec=self.codec,
				laparams=self.laparams, imagewriter=self.imagewriter)

	def setdebug(self,value):
		'''Set the debug level on this handler and on the pdfminer classes.'''
		# The original stored a constant 0 and ignored ``value``.
		self.debug = value
		PDFResourceManager.debug = self.debug
		PDFPageInterpreter.debug = self.debug

	def _open(self,fname,fobj):
		'''Return fobj when given, otherwise open fname for binary reading.'''
		if fobj:
			return fobj
		return open(fname, 'rb')

	def _release(self,fp,fobj):
		'''Rewind a caller-supplied stream, or close a file opened here.'''
		if fobj:
			fp.seek(0)
		else:
			fp.close()

	def _extract(self,fname,pages,html,fobj):
		'''Process the selected pages and return the collected output.

		pages is an iterable of 1-based page numbers, or None for no page
		filter. Returns "" when anything goes wrong while processing.'''
		fp = self._open(fname,fobj)
		try:
			# Built inside the try so that bad page numbers yield "" like
			# any other processing failure.
			if pages is None:
				pagenos = set()
			else:
				pagenos = set([i-1 for i in pages])
			if (html):
				interpreter = PDFPageInterpreter(self.rsrcmgr, self.htmldevice)
			else:
				interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)

			for page in PDFPage.get_pages(fp, pagenos, maxpages=self.maxpages,
				password=self.password, caching=self.caching, check_extractable=False):

				page.rotate = (page.rotate+self.rotation) % 360
				interpreter.process_page(page)
			return self.outfp.get()
		except Exception:
			# Deliberate best-effort: callers get an empty string on failure.
			return ""
		finally:
			# The output object is cleared for the next call and the input
			# released on both the success and the failure path.
			self.outfp.reset()
			self._release(fp,fobj)

	def GetPageNumber(self,fname,fobj=None):
		'''Get total page number of PDF; returns 0 when reading fails.'''
		fp = self._open(fname,fobj)
		try:
			pageno=0
			for page in PDFPage.get_pages(fp, set(), maxpages=0,
				password=self.password, caching=self.caching, check_extractable=False):
				pageno+=1
			return pageno
		except Exception as e:
			print(e)
			print("Error Reading PDF page number..")
			return 0
		finally:
			self._release(fp,fobj)

	def FastCheck(self,fname,fobj=None):
		'''Fast check that the page iterator can be started without error.

		Returns True when iteration starts cleanly, False on any exception.'''
		fp = self._open(fname,fobj)
		try:
			for page in PDFPage.get_pages(fp, set([0]), maxpages=1,
				password=self.password, caching=self.caching, check_extractable=False):
				break
			return True
		except Exception:
			print("%s %s" % ("Error Reading PDF page number..", fname))
			return False
		finally:
			self._release(fp,fobj)

	def GetSinglePage(self,fname,pageno=1,html=False,fobj=None):
		'''Get Single Page contents of PDF, return string
		Default first page; pageno is 1-based.'''
		return self._extract(fname,[pageno],html,fobj)

	def GetPages(self,fname,pagenos=(1,),html=False,fobj=None):
		'''Get Several Page contents of PDF, return string
		Default first page; pagenos holds 1-based page numbers.'''
		return self._extract(fname,pagenos,html,fobj)

	def GetAllPages(self,fname,html=False,fobj=None):
		'''Get All Page contents of PDF, return string'''
		return self._extract(fname,None,html,fobj)
Exemplo n.º 19
0
def pdf_gettext(filepath, reserve):
	"""Render the first and last page of a PDF to HTML and parse each one.

	The first page is written to 'firstout.html' and the last page to
	'lastout.html' in the current directory; each file is then passed to
	``html_textparser`` together with a result list.

	Args:
		filepath: Path of the PDF file to read.
		reserve: Currently unused.

	Returns:
		A ``(first, last)`` pair of lists, as filled in by
		``html_textparser`` (``last`` stays empty for one-page documents).

	Raises:
		Exception: if the document does not allow text extraction.
	"""
	# input option
	password = ''
	# output option
	layoutmode = 'normal'
	codec = 'utf-8'
	scale = 1
	caching = True
	laparams = LAParams()
	firstout = 'firstout.html'
	lastout = 'lastout.html'
	firstpage = None
	lastpage = None
	first = []
	last = []

	rsrcmgr = PDFResourceManager(caching=caching)

	import tempfile
	outfp = tempfile.TemporaryFile(mode='w+t', encoding=codec)
	try:
		device = HTMLConverter(rsrcmgr, outfp, scale=scale, layoutmode=layoutmode, laparams=laparams)
		try:
			with open(filepath, 'rb') as fp:
				parser = PDFParser(fp)
				doc = PDFDocument(caching=caching)
				parser.set_document(doc)
				doc.set_parser(parser)
				doc.initialize(password)
				if not doc.is_extractable:
					raise Exception('Text extraction is not allowed: %s' % filepath)

				interpreter = PDFPageInterpreter(rsrcmgr, device)
				# Walk all pages once to remember the first and the last.
				for page in doc.get_pages():
					if not firstpage:
						firstpage = page
					else:
						lastpage = page

				if firstpage:
					interpreter.process_page(firstpage)
					with open(firstout, 'w', encoding=codec) as f:
						outfp.seek(0)
						f.write(outfp.read())
					html_textparser(firstout, first)
				if lastpage:
					# truncate() leaves the stream position untouched, so
					# rewind first; otherwise the next page would be written
					# after a gap the size of the first page.
					outfp.seek(0)
					outfp.truncate()
					interpreter.process_page(lastpage)
					with open(lastout, 'w', encoding=codec) as f:
						outfp.seek(0)
						f.write(outfp.read())
					html_textparser(lastout, last)
		finally:
			device.close()
	finally:
		# Released even when extraction is refused or a page fails.
		outfp.close()

	return first, last
Exemplo n.º 20
0
def readPDF2HTML(pdfFile, opts=None):
    """Convert an open PDF file object to HTML and return the markup.

    Args:
        pdfFile: Object with a ``read()`` method returning the PDF data.
        opts: Optional pdf2txt-style options, either a mapping or an
            iterable of ``(flag, value)`` pairs. Honoured flags:
            ``-p`` (comma-separated 1-based pages), ``-m`` (max pages),
            ``-P`` (password), ``-n`` (no layout analysis), ``-A``, ``-V``,
            ``-M``, ``-L``, ``-W``, ``-F`` (layout parameters), ``-Y``
            (layout mode) and ``-s`` (scale). Other flags are ignored.

    Returns:
        The HTML produced by the converter; the output codec is always
        'utf-8', as in the original code.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # open a PDF file
    fp = StringIO(pdfFile.read())
    retstr = StringIO()

    # Defaults are set before the option loop; the original read some of
    # these names before assigning them and discarded the rest.
    password = ''
    pagenos = set()
    maxpages = 0
    layoutmode = 'normal'
    scale = 1
    laparams = LAParams()

    # Iterating a mapping directly would unpack the characters of each key.
    if opts is None:
        pairs = ()
    elif hasattr(opts, 'items'):
        pairs = opts.items()
    else:
        pairs = opts
    for (k, v) in pairs:
        if k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-s':
            scale = float(v)

    # create a PDF parser object associated with the file object
    parser = PDFParser(fp)
    # create a PDF document that allows text extraction
    document = PDFDocument(parser, password)
    # check if document allows text extraction
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # create a PDF resource manager object that stores shared resources
    rsrcmgr = PDFResourceManager()
    # create a PDF device object
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8', scale=scale,
                           layoutmode=layoutmode, laparams=laparams)
    try:
        # create a PDF interpreter object
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # process each selected page contained in the document
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password):
            interpreter.process_page(page)
    finally:
        # Closed before the buffer is read, like the other converters in
        # this file, so anything emitted on close is part of the result.
        device.close()
    # close streams and return text content
    fp.close()
    content = retstr.getvalue()
    retstr.close()
    return content
def convertPDF(outfile,pdfFile):
    """Convert the PDF file ``pdfFile`` and write the result to ``outfile``.

    The output format follows the extension of ``outfile``: ``.htm``/``.html``
    gives HTML, ``.xml`` gives XML, ``.tag`` gives tag output, anything else
    plain text. When ``outfile`` is empty the result goes to ``sys.stdout``,
    which is left open.

    Args:
        outfile: Path of the file to write, or a false value for stdout.
        pdfFile: Path of the PDF file to read.
    """
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)

    # The type is always derived from the extension, so it is one of the
    # four values handled below.
    outtype = 'text'
    if outfile:
        if outfile.endswith(('.htm', '.html')):
            outtype = 'html'
        elif outfile.endswith('.xml'):
            outtype = 'xml'
        elif outfile.endswith('.tag'):
            outtype = 'tag'

    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   outdir=outdir)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            with open(pdfFile, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=password, caching=caching,
                            check_extractable=True)
        finally:
            device.close()
    finally:
        # Only close what this function opened; closing sys.stdout would
        # break every later write by the caller.
        if outfp is not sys.stdout:
            outfp.close()
Exemplo n.º 22
0
def main(argv):
    """Convert the PDF files named on the command line into one output stream.

    Options are parsed with getopt. Every input file is pushed through the
    same converter, so all results end up in a single output (the ``-o`` file,
    or standard output when ``-o`` is absent). When ``-t`` is not given the
    output type is inferred from the extension of the ``-o`` file and falls
    back to plain text.

    Args:
        argv: Argument vector including the program name (``sys.argv`` style).

    Returns:
        100 after printing the usage text when the options are invalid, no
        input file is given, or the output type is unknown; None otherwise.
    """
    import getopt

    def usage():
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
              '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
              '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        # Page numbers are 1-based on the command line, 0-based internally.
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before the output file is created/truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    rsrcmgr = PDFResourceManager(caching=caching)
    owns_output = bool(outfile)
    outfp = open(outfile, 'w') if owns_output else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams, outdir=outdir)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                                password=password, caching=caching,
                                check_extractable=True)
        finally:
            device.close()
    finally:
        # Only close what this function opened; leave sys.stdout usable.
        if owns_output:
            outfp.close()
        else:
            outfp.flush()
    return
import sys, getopt
 
#converts pdf, returns its text content as a string
def convert(case, fname, pages=None):
    """Convert a PDF file and return the converted content.

    Args:
        case: 'text' to convert with TextConverter into an ``io.StringIO``,
            or 'HTML' to convert with HTMLConverter into an ``io.BytesIO``.
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers to restrict the conversion
            to; a falsy value selects every page.

    Returns:
        The content of the output buffer after all pages were processed.

    Raises:
        ValueError: If ``case`` is neither 'text' nor 'HTML'.
    """
    # Fail early with a clear message instead of hitting an unbound
    # ``converter`` further down.
    if case not in ('text', 'HTML'):
        raise ValueError("unsupported case %r (expected 'text' or 'HTML')" % (case,))

    pagenums = set(pages) if pages else set()
    manager = PDFResourceManager()
    codec = 'utf-8'
    caching = True

    if case == 'text':
        output = io.StringIO()
        converter = TextConverter(manager, output, codec=codec, laparams=LAParams())
    else:
        output = io.BytesIO()
        converter = HTMLConverter(manager, output, codec=codec, laparams=LAParams())

    try:
        interpreter = PDFPageInterpreter(manager, converter)
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
        # The value is captured before the converter is closed, as before, so
        # anything the converter may emit on close is not part of the result.
        convertedPDF = output.getvalue()
    finally:
        converter.close()
        output.close()
    return convertedPDF
 
def convert_pdf_to_txt(path_to_file):
    """Set up a PDF-to-text conversion; only the setup is present here.

    NOTE(review): the body stops after creating the resource manager and the
    string buffer, so ``path_to_file`` is never read and nothing is returned.
    The remainder of the function appears to be missing.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
Exemplo n.º 24
0
def main(argv):
  """Convert the PDF files named on the command line into one output stream.

  All input files are processed through the same converter and written to the
  ``-o`` file, or to standard output when ``-o`` is absent. When ``-t`` is not
  given the output type is inferred from the extension of the ``-o`` file and
  falls back to plain text.

  Args:
    argv: Argument vector including the program name (``sys.argv`` style).

  Returns:
    100 after printing the usage text when the options are invalid, no input
    file is given, or the output type is unknown; None otherwise.
  """
  import getopt
  def usage():
    print ('usage: %s [-d] [-C cmapdir] [-p pagenos] [-m maxpages] [-P password] [-c codec] '
           '[-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
           '[-s scale] [-t text|html|sgml|tag] [-o output] file ...' % argv[0])
    return 100
  try:
    # 's:' was missing although '-s' is handled below; 'D:' was listed twice.
    (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:D:M:L:W:t:o:C:m:s:')
  except getopt.GetoptError:
    return usage()
  if not args: return usage()
  # debug option
  debug = 0
  # path option
  cmapdir = find_cmap_path()
  # input option
  password = ''
  pagenos = set()
  maxpages = 0
  # output option
  outfile = None
  outtype = None
  codec = 'utf-8'
  scale = 1
  laparams = LAParams()
  for (k, v) in opts:
    if k == '-d': debug += 1
    elif k == '-C': cmapdir = v
    elif k == '-P': password = v
    # Page numbers are 1-based on the command line, 0-based internally.
    elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
    elif k == '-m': maxpages = int(v)
    elif k == '-t': outtype = v
    elif k == '-c': codec = v
    elif k == '-o': outfile = v
    elif k == '-s': scale = float(v)
    elif k == '-D': laparams.direction = v
    elif k == '-M': laparams.char_margin = float(v)
    elif k == '-L': laparams.line_margin = float(v)
    elif k == '-W': laparams.word_margin = float(v)
  #
  CMapDB.debug = debug
  PDFResourceManager.debug = debug
  PDFDocument.debug = debug
  PDFParser.debug = debug
  PDFPageInterpreter.debug = debug
  PDFDevice.debug = debug
  #
  if not outtype:
    outtype = 'text'
    if outfile:
      if outfile.endswith('.htm') or outfile.endswith('.html'):
        outtype = 'html'
      elif outfile.endswith('.sgml'):
        outtype = 'sgml'
      elif outfile.endswith('.tag'):
        outtype = 'tag'
  # Reject an unknown type before the output file is created/truncated.
  if outtype not in ('text', 'sgml', 'html', 'tag'):
    return usage()
  CMapDB.initialize(cmapdir)
  rsrc = PDFResourceManager()
  owns_output = bool(outfile)
  if owns_output:
    outfp = open(outfile, 'w')
  else:
    outfp = sys.stdout
  try:
    if outtype == 'text':
      device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
    elif outtype == 'sgml':
      device = SGMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
    elif outtype == 'html':
      device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams)
    else:
      device = TagExtractor(rsrc, outfp, codec=codec)
    try:
      for fname in args:
        fp = open(fname, 'rb')
        try:
          process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages, password=password)
        finally:
          fp.close()
    finally:
      device.close()
  finally:
    # The output file used to be left open; close it, but never sys.stdout.
    if owns_output:
      outfp.close()
  return
Exemplo n.º 25
0
def ConvertPdf(pdfpath, outfp, opts=None):
    """Convert one PDF file and write the result to an already open stream.

    Args:
        pdfpath: Path of the PDF file to read.
        outfp: Writable file-like object that receives the converted output.
            It is not closed by this function.
        opts: Optional getopt-style options, either an iterable of
            ``(flag, value)`` pairs or a dict mapping flag to value. The
            recognised flags are -d -p -m -P -o -C -n -A -V -M -L -W -F -Y -O
            -R -t -c -s; ``-o`` is accepted but has no effect because output
            always goes to ``outfp``. ``-t`` selects 'txt' (default), 'xml',
            'html' or 'tag'.

    Returns:
        True once every selected page has been processed.

    Raises:
        ValueError: If the requested output type is not supported.
    """
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfdevice import PDFDevice, TagExtractor
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
    from pdfminer.cmapdb import CMapDB
    from pdfminer.layout import LAParams
    from pdfminer.image import ImageWriter

    if opts is None:
        opts = ()
    elif isinstance(opts, dict):
        # Iterating a dict directly would yield bare keys, not pairs.
        opts = opts.items()

    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        # Page numbers are 1-based in the options, 0-based internally.
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': pass  # output always goes to ``outfp``
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    if not outtype:
        outtype = 'txt'
    # Validate before touching the input file; an unknown type used to surface
    # as an UnboundLocalError on ``device`` after the PDF had been opened.
    if outtype not in ('txt', 'xml', 'html', 'tag'):
        raise ValueError('unsupported output type: %r' % (outtype,))

    rsrcmgr = PDFResourceManager()
    if outtype == 'txt':
        device = TextConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr,
                              outfp,
                              codec=codec,
                              laparams=laparams,
                              imagewriter=imagewriter)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               scale=scale,
                               layoutmode=layoutmode,
                               laparams=laparams,
                               imagewriter=imagewriter)
    else:
        device = TagExtractor(rsrcmgr, outfp, codec=codec)

    try:
        fp = open(pdfpath, 'rb')
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp,
                                          pagenos,
                                          maxpages=maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):
                # Apply the extra rotation on top of the page's own rotation.
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        finally:
            fp.close()
    finally:
        device.close()

    return True
Exemplo n.º 26
0
def decode_pdf(filename):
    """Extract title, authors, abstract and keywords from a paper's first page.

    The first page of the PDF is rendered to HTML, the HTML is split on
    ``<br>`` and every piece is handed to ``analyze``.
    NOTE(review): ``analyze`` is presumably what fills the module-level
    ``title``/``authors``/``abstract``/``keywords`` globals that are reset
    here; confirm against its definition.

    Args:
        filename: Name of the PDF inside ``static/demos/paperminer/papers``
            below ``basedir``.

    Returns:
        A list ``[title, authors, abstract, keywords]`` in which the three
        text fields are decoded from UTF-8 and ``authors`` is returned as
        stored.
    """
    global current_section
    global pre_section
    global pre_font_family
    global pre_font_size
    global title
    global authors
    global abstract
    global keywords

    # Reset the shared parsing state before handling a new document.
    current_section = ""
    pre_section = TAG_BEGIN
    pre_font_family = ""
    pre_font_size = ""
    title = ""
    authors = set()
    abstract = ""
    keywords = ""

    path = basedir + "/static/demos/paperminer/papers/" + filename
    # layout parameters
    laparams = LAParams()
    caching = True
    codec = 'utf-8'
    password = ""
    # only process the first page
    max_page = 1

    rsrcmgr = PDFResourceManager(caching=caching)
    out = StringIO()
    fp = open(path, 'rb')
    try:
        # The output is always HTML: the splitting on '<br>' below relies on it.
        device = HTMLConverter(rsrcmgr,
                               out,
                               codec=codec,
                               laparams=laparams,
                               imagewriter=None)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # ``maxpages`` already limits the iteration to the first page.
            for page in PDFPage.get_pages(fp,
                                          set(),
                                          maxpages=max_page,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
        finally:
            device.close()
    finally:
        fp.close()
    # str_value is the first PDF page in HTML
    str_value = out.getvalue()
    out.close()

    # loop through each line in HTML
    for line in str_value.split('<br>'):
        analyze(line)
    result = [
        title.decode('utf-8'), authors,
        abstract.decode('utf-8'),
        keywords.decode('utf-8')
    ]

    return result
Exemplo n.º 27
0
def main(argv):
    """Batch-convert PDFs matched by glob patterns, one output file per PDF.

    Each positional argument is expanded with ``glob.glob``. Every matching
    file is converted to the selected type (default 'tag') and written next
    to the source, with the extension replaced by the one belonging to that
    type (htm, tag, txt or xml).

    ``-o`` is accepted for compatibility. It is only consulted to infer the
    type when ``-t`` is given as an empty string; no file is opened for it.

    Args:
        argv: Argument vector including the program name (``sys.argv`` style).

    Returns:
        100 after printing the usage text when the options are invalid, no
        pattern is given, or the output type is unknown; None otherwise.
    """
    import getopt
    import os

    def usage():
        print('Syntax:\npdf2htm.exe SourcePDF\n where the parameter is either a file name or\na wildcard spec like\n*.pdf\nEnclose it with quotes if it contains a space\n\nAdditional options are supported with named command line parameters as follows:')
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
            ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
            ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
            ' [-t text|html|xml|tag] [-c codec] [-s scale]'
            ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = 'tag'
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        # Page numbers are 1-based on the command line, 0-based internally.
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    if not outtype:
        outtype = 'tag'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # Output type -> extension of the generated file.
    extensions = {'html': 'htm', 'tag': 'tag', 'text': 'txt', 'xml': 'xml'}
    # An unknown type used to raise KeyError on the extension lookup.
    if outtype not in extensions:
        return usage()

    rsrcmgr = PDFResourceManager(caching=caching)
    for fname in args:
        matches = glob.glob(fname)
        print('Converting ' + str(len(matches)) + ' from ' + fname +
              ' to ' + outtype + ' format')
        for pdf in matches:
            # splitext copes with extensions that are not exactly 4 characters.
            outfile = os.path.splitext(pdf)[0] + '.' + extensions[outtype]
            print(outfile)
            outfp = open(outfile, 'wb')
            try:
                if outtype == 'text':
                    device = TextConverter(rsrcmgr,
                                           outfp,
                                           codec=codec,
                                           laparams=laparams,
                                           imagewriter=imagewriter)
                elif outtype == 'xml':
                    device = XMLConverter(rsrcmgr,
                                          outfp,
                                          codec=codec,
                                          laparams=laparams,
                                          imagewriter=imagewriter)
                elif outtype == 'html':
                    device = HTMLConverter(rsrcmgr,
                                           outfp,
                                           codec=codec,
                                           scale=scale,
                                           layoutmode=layoutmode,
                                           laparams=laparams,
                                           imagewriter=imagewriter)
                else:
                    device = TagExtractor(rsrcmgr, outfp, codec=codec)
                device.showpageno = False
                try:
                    with open(pdf, 'rb') as fp:
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        for page in PDFPage.get_pages(fp,
                                                      pagenos,
                                                      maxpages=maxpages,
                                                      password=password,
                                                      caching=caching,
                                                      check_extractable=True):
                            page.rotate = (page.rotate + rotation) % 360
                            interpreter.process_page(page)
                finally:
                    device.close()
            finally:
                outfp.close()

        print('Done')
    return
Exemplo n.º 28
0
def main(argv):
    """Convert the PDF files named on the command line into one output stream.

    All input files are processed through the same converter and written to
    the ``-o`` file (opened with the ``-c`` encoding), or to standard output
    when ``-o`` is absent. When ``-t`` is not given the output type is
    inferred from the extension of the ``-o`` file and falls back to plain
    text.

    Args:
        argv: Argument vector including the program name (``sys.argv`` style).

    Returns:
        100 after printing the usage text when the options are invalid, no
        input file is given, or the output type is unknown; None otherwise.
    """
    import getopt

    def usage():
        print(
            f'usage: {argv[0]} [-P password] [-o output] [-t text|html|xml|tag]'
            ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
            ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
            ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
            ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = b''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-P': password = v.encode('ascii')
        elif k == '-o': outfile = v
        elif k == '-t': outtype = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-c': encoding = v
        elif k == '-s': scale = float(v)
        elif k == '-R': rotation = int(v)
        elif k == '-Y': layoutmode = v
        # Page numbers are 1-based on the command line, 0-based internally.
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-S': stripcontrol = True
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before the output file is created/truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    rsrcmgr = PDFResourceManager(caching=caching)
    owns_output = bool(outfile)
    if owns_output:
        outfp = open(outfile, 'w', encoding=encoding)
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr,
                                   outfp,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr,
                                  outfp,
                                  laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   outfp,
                                   scale=scale,
                                   layoutmode=layoutmode,
                                   laparams=laparams,
                                   imagewriter=imagewriter,
                                   debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp,
                                                  pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Only close what this function opened; leave sys.stdout usable.
        if owns_output:
            outfp.close()
        else:
            outfp.flush()
    return
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words


def convert_to_html(case, fname, pages=None):
    """Convert a PDF file to HTML and return the HTML as bytes.

    Args:
        case: Conversion selector; only 'HTML' is supported.
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers to restrict the conversion
            to; a falsy value selects every page.

    Returns:
        The content of the ``io.BytesIO`` output buffer after all pages were
        processed.

    Raises:
        ValueError: If ``case`` is not 'HTML'.
    """
    # Any other value used to fail later on an unbound ``converter``.
    if case != 'HTML':
        raise ValueError("unsupported case %r (expected 'HTML')" % (case,))

    pagenums = set(pages) if pages else set()
    manager = PDFResourceManager()
    caching = True

    output = io.BytesIO()
    converter = HTMLConverter(manager, output, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(manager, converter)
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile,
                                          pagenums,
                                          caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
        # Captured before the converter is closed, as in the original order.
        convertedPDF = output.getvalue()
    finally:
        converter.close()
        output.close()
    # The value was computed but never handed back to the caller.
    return convertedPDF
Exemplo n.º 30
0
def convert(argv):
    """Convert PDFs under ``pdfs/`` to text, then write a Braille rendering.

    Options are parsed from ``argv`` with getopt. Every positional argument
    is opened as ``'pdfs/' + name`` and run through the selected converter
    into the output file; the default output file is read back afterwards,
    passed through ``brl.translate`` and ``brl.toUnicodeSymbols`` and saved
    below ``results/``.

    Args:
        argv: Argument vector including the program name.

    Returns:
        100 (after printing usage) when option parsing fails, no file is
        given, or the output type is unknown.
    """
    def usage():
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
            ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
            ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
            ' [-t text|html|xml|tag] [-c codec] [-s scale]'
            ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    # NOTE(review): the default name is derived from sys.argv[1], not from the
    # ``argv`` parameter or the parsed ``args`` — confirm this is intended.
    # The [:-4] slice drops the last four characters (e.g. '.pdf').
    outfile = 'inputs/' + sys.argv[1].replace(' ', '')[:-4] + '.txt'
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        # Page numbers are given 1-based and stored 0-based.
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    # Propagate the debug level to the pdfminer classes.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    # Without -t, infer the type from the output file extension (default text).
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = file(outfile, 'w')
    else:
        outfp = sys.stdout
    if outtype == 'text':
        device = TextConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr,
                              outfp,
                              codec=codec,
                              laparams=laparams,
                              imagewriter=imagewriter)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               scale=scale,
                               layoutmode=layoutmode,
                               laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        # NOTE(review): at this point the output file has already been opened.
        return usage()
    for fname in args:
        # Input files are looked up relative to the 'pdfs/' directory.
        fp = file('pdfs/' + fname, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp,
                                      pagenos,
                                      maxpages=maxpages,
                                      password=password,
                                      caching=caching,
                                      check_extractable=True):
            # Apply the extra rotation on top of the page's own rotation.
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
        fp.close()
    device.close()
    outfp.close()

    #read converted file
    # NOTE(review): this always reads the default path built from sys.argv[1],
    # so a different ``-o`` target is not what gets translated; the handle
    # ``y`` is also never closed in the visible code.
    y = open("inputs/" + sys.argv[1].replace(' ', '')[:-4] + '.txt', "r")
    output = brl.translate(y.read())
    #convert into Grade 2 Braille unicode
    x = brl.toUnicodeSymbols(output, flatten=True)
    #save to results folder in .txt format
    text_file = open(
        "results/" + sys.argv[1].replace(' ', '')[:-4] + "-Braille.txt", "w")
    # NOTE(review): writes the encoded value to a text-mode file — presumably
    # relies on Python 2 str semantics; verify before porting.
    text_file.write(x.encode(codec))
    text_file.close()