def parse_pdf_to_txt(pdf_handle, write_file):
    """Convert an open PDF stream to pdfminer XML and write it to a file.

    Despite the function name, the output is XML: the device used is an
    ``XMLConverter``.

    Args:
        pdf_handle: Open file object for the PDF. It is handed to
            ``process_pdf`` as-is and is not closed here.
        write_file: Path of the output file. It is opened in text mode with
            UTF-8 encoding and ``errors='ignore'``.
    """
    pagenos = set()
    maxpages = 0
    codec = 'utf-8'
    caching = True
    laparams = LAParams()
    laparams.detect_vertical = True
    # Create the PDF resource manager, which manages shared resources.
    rsrcmgr = PDFResourceManager(caching=caching)
    print("ready to open out file ........")
    with open(write_file, "wt", encoding=codec, errors='ignore') as outfp:
        device = XMLConverter(rsrcmgr, outfp, laparams=laparams)
        try:
            print("ready to converte pdf to xml ........")
            process_pdf(rsrcmgr, device, pdf_handle, pagenos,
                        maxpages=maxpages, password='', caching=caching,
                        check_extractable=True)
        finally:
            # Close the device while outfp is still open, and also when
            # conversion fails, so the converter can finish its output.
            device.close()
def extract_pdf_page(filename, page_number_or_numbers):
    """Render the requested pages of a PDF file as XML using PDFMiner.

    ``page_number_or_numbers`` may be one page number or an iterable of page
    numbers. The XML is produced with the utf-8 codec and returned as the
    contents of an in-memory buffer.
    """
    # Adapted from pdf2txt.py, which ships with PDFMiner. The equivalent
    # command line is:
    #   pdf2txt.py -p 1 -o expected.xml sample.pdf
    wanted_pages = page_number_or_numbers
    if not is_iterable(wanted_pages):
        wanted_pages = [wanted_pages]

    sink = StringIO.StringIO()
    layout = LAParams()
    resources = PDFResourceManager()
    converter = XMLConverter(resources, sink, codec='utf-8', laparams=layout)

    with open(filename, 'rb') as pdf_stream:
        renderer = PDFPageInterpreter(resources, converter)
        for pdf_page in PDFPage.get_pages(pdf_stream, wanted_pages):
            renderer.process_page(pdf_page)

    converter.close()
    result = sink.getvalue()
    sink.close()
    return result
class Miner:
    """Convert one PDF file to text, HTML or XML with pdfminer.

    The constructor opens both files and builds the converter; ``extract``
    runs the conversion and closes everything.
    """

    def __init__(self, pdf_file, txt_file, file_format='txt', layout_analysis=True):
        """Open the input/output files and create the output converter.

        Args:
            pdf_file: Path of the PDF to read.
            txt_file: Path of the output file (opened with mode ``'w'``).
            file_format: One of ``'txt'``, ``'html'`` or ``'xml'``.
            layout_analysis: When true a default ``LAParams()`` is used,
                otherwise ``laparams=None`` is passed to the converter.

        Raises:
            ValueError: If ``file_format`` is not a supported format. This is
                checked before any file is opened, so the output file is not
                truncated for a request that could never succeed.
        """
        converters = {
            'txt': TextConverter,
            'html': HTMLConverter,
            'xml': XMLConverter,
        }
        if file_format not in converters:
            raise ValueError("unsupported file_format: %r" % (file_format,))

        laparams = LAParams() if layout_analysis else None
        # open() replaces the Python-2-only file() builtin; same behaviour.
        self.pdf_file = open(pdf_file, 'rb')
        try:
            self.outfp = open(txt_file, 'w')
        except Exception:
            self.pdf_file.close()
            raise
        self.rsrcmgr = PDFResourceManager(caching=True)
        self.device = converters[file_format](
            self.rsrcmgr, self.outfp, codec='utf-8', laparams=laparams,
            imagewriter=None)

    def extract(self):
        """Process every page of the PDF, then close the files and device."""
        try:
            interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
            # An empty string is pdfminer's own default password; the original
            # None is not a string.
            for page in PDFPage.get_pages(self.pdf_file, set(), maxpages=0,
                                          password='', caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        finally:
            # Release everything even when a page fails to process. The
            # device is closed before the file it writes to.
            self.pdf_file.close()
            try:
                self.device.close()
            finally:
                self.outfp.close()
def pdf2xml(filename):
    """Return the pdfminer XML rendering of the PDF at ``filename``.

    All pages are processed with default layout parameters and the utf-8
    codec. The result is the content of an in-memory ``StringIO`` buffer,
    read after the converter has been closed.
    """
    rsrcmgr = PDFResourceManager(caching=True)
    outfp = StringIO.StringIO()
    device = XMLConverter(rsrcmgr, outfp, codec='utf-8', laparams=LAParams(),
                          imagewriter=None)
    try:
        # open() replaces the Python-2-only file() builtin; the with-block
        # closes the PDF even if a page fails to process.
        with open(filename, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, None, maxpages=0, password='',
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
    finally:
        device.close()
    xml = outfp.getvalue()
    outfp.close()
    return xml
def convert(infile, outfile, rotation=0):
    """Convert the PDF at ``infile`` to pdfminer XML written to ``outfile``.

    Args:
        infile: Path of the PDF to read.
        outfile: Path of the XML file to write (opened in binary mode).
        rotation: Degrees added to each page's own rotation, modulo 360.
    """
    debug = 0
    password = ''
    pagenos = set()
    maxpages = 0
    codec = 'utf-8'
    caching = True
    laparams = LAParams()

    # Debug level is set on the pdfminer classes themselves (class
    # attributes), as in the original code.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    # Context managers guarantee both files are closed on error.
    with open(infile, 'rb') as fp, open(outfile, 'wb') as outfp:
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        finally:
            # Must run before outfp is closed by the with-block.
            device.close()
def parse_pdfs(pdf_filenames):
    """Convert each PDF in ``pdf_filenames`` to an XML file beside it.

    The output path is the input path with its extension replaced by
    ``.xml``. One resource manager is shared across all files.

    Args:
        pdf_filenames: Iterable of paths to PDF files.
    """
    # Set parameters
    pagenos = set()
    maxpages = 0
    password = ''
    imagewriter = None
    codec = 'utf-8'
    caching = True
    laparams = LAParams()
    rsrcmgr = PDFResourceManager(caching=caching)
    # Convert to XML as it retains the most information about text position
    # (compared to text, html, etc).
    for pdf_file in pdf_filenames:
        # Parenthesised single-argument print behaves the same on Python 2
        # and Python 3.
        print("Converting %s to xml." % pdf_file)
        outfile = os.path.splitext(pdf_file)[0] + '.xml'
        with open(pdf_file, 'rb') as fp, open(outfile, 'w') as outfp:
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password, caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)
            finally:
                # Close the device while outfp is still open, also on error.
                device.close()
    # NOTE(review): the source was flattened, so it is unclear whether this
    # message was printed once or once per file; once is assumed.
    print("Conversion complete.")
def pdf2xml(path, codec='utf-8', password="", maxpages=0, caching=True):
    """Return the pdfminer XML rendering of the PDF at ``path`` as text.

    Args:
        path: Path of the PDF file.
        codec: Accepted for interface compatibility; the conversion itself
            always uses utf-8, as it did before.
        password: Password passed to ``PDFPage.get_pages``.
        maxpages: Page limit passed to ``PDFPage.get_pages``.
        caching: Caching flag passed to ``PDFPage.get_pages``.

    Returns:
        The XML document decoded from utf-8. A closing ``</pages>`` tag is
        appended only if the converter output does not already end with one.
    """
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = XMLConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, set(), maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
    finally:
        device.close()
    # Read the buffer only after the device is closed, so anything the
    # converter writes on close is included.
    xml = retstr.getvalue().decode('utf-8')
    retstr.close()
    # The original tested startswith('</pages>'), which a document never
    # satisfies; the closing tag belongs at the end.
    if not xml.rstrip().endswith('</pages>'):
        xml += '\n</pages>'
    return xml
def _get_xml_data(self, sourcefile):
    """Return the pdfminer XML representation of ``sourcefile``.

    The resource manager is built with ``self.font_correctors``. All pages
    are converted into an in-memory ``io.BytesIO`` buffer whose contents are
    returned.

    Args:
        sourcefile: Path of the PDF file to convert.
    """
    rm = PDFResourceManager(caching=True, font_correctors=self.font_correctors)
    outfp = io.BytesIO()
    device = XMLConverter(rm, outfp, codec="UTF-8", laparams=LAParams(),
                          imagewriter=None)
    try:
        interpreter = PDFPageInterpreter(rm, device)
        # The with-block closes the PDF even when a page fails to process.
        with open(sourcefile, "rb") as infile:
            for page in PDFPage.get_pages(infile, set(), maxpages=0,
                                          password="", caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
    finally:
        device.close()
    retval = outfp.getvalue()
    outfp.close()
    return retval
def lerPDF(arquivo):
    """Convert the PDF ``arquivo`` to XML and return the buffer contents.

    ``arquivo`` is passed unchanged to ``process_pdf``. The XML is collected
    in an in-memory ``StringIO`` using default layout parameters.
    """
    manager = PDFResourceManager()
    sink = StringIO()
    converter = XMLConverter(manager, sink, laparams=LAParams())

    process_pdf(manager, converter, arquivo)
    converter.close()

    xml_text = sink.getvalue()
    sink.close()
    return xml_text
def to_xml(infile):
    """Return the pdfminer XML rendering of the open PDF stream ``infile``.

    Args:
        infile: Open file object for the PDF; it is not closed here.
    """
    output = StringIO()
    manager = PDFResourceManager()
    converter = XMLConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    try:
        for page in PDFPage.get_pages(infile):
            interpreter.process_page(page)
    finally:
        converter.close()
    xml = output.getvalue()
    # The original wrote `output.close` without parentheses, which never
    # closed the buffer.
    output.close()
    return xml
def getTitle(self, stream):
    """Return the document title for the PDF in ``stream``.

    The metadata title is returned when it is usable. When it is missing,
    empty or the literal ``'untitled'``, the first page is converted to XML
    with pdfminer and handed to ``self._getTitleFromXmlStr``.

    Args:
        stream: Seekable file object containing the PDF.
    """
    stream.seek(0)
    info = PdfFileReader(stream).getDocumentInfo()
    # Guard against a missing info dictionary or title, which the original
    # did not: a None info raised AttributeError and a None title skipped
    # the fallback.
    title = info.title if info is not None else None
    if title is not None and title not in ['untitled', '']:
        return title

    # The metadata gave no usable title, so fall back to analysing the
    # first page.
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import XMLConverter
    from pdfminer.layout import LAParams
    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO

    codec = 'utf-8'
    rsrcmgr = PDFResourceManager(caching=True)
    outfp = StringIO()
    # Convert the PDF to XML, using StringIO to hold the result.
    device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=LAParams(),
                          imagewriter=None, stripcontrol=False)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pagenos = set([0])  # only the first page (zero-based index)
    stream.seek(0)
    try:
        for page in PDFPage.get_pages(stream, pagenos, maxpages=0, password='',
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
    finally:
        device.close()

    xml = outfp.getvalue()
    # Encode only real text. Calling .encode() on an already-encoded Python 2
    # byte string triggers an implicit ASCII decode, which fails on
    # non-ASCII titles.
    if not isinstance(xml, bytes):
        xml = xml.encode(codec)
    # Parse the XML to get the title.
    return self._getTitleFromXmlStr(xml)
def extract_pdf_page(filename):
    """Extract the images and the pdfminer XML of the PDF at ``filename``.

    Images are written as PNG files into ``XML_PATH/<stem>/images`` and the
    XML into ``XML_PATH/<stem>/<stem>-<TIME_NOW>.xml``, where ``<stem>`` is
    the file name without its extension.

    Args:
        filename: Path of the PDF file.

    Returns:
        The XML text that was also written to disk.
    """
    # Paths for creating folder and file
    input_file_name = Path(filename).stem
    output_file_folder = Path(XML_PATH, input_file_name)
    output_file_folder.mkdir(parents=True, exist_ok=True)
    output_file_path = Path(output_file_folder,
                            input_file_name + "-" + TIME_NOW + ".xml")
    output_images_path = Path(XML_PATH, input_file_name, "images")
    output_images_path.mkdir(parents=True, exist_ok=True)

    doc = fitz.open(filename)
    for i in range(len(doc)):
        for img in doc.getPageImageList(i):
            xref = img[0]
            # Both colour-space branches use the same three-part name. The
            # original CMYK branch had only two placeholders for three
            # values and raised TypeError.
            png_path = (str(output_images_path) + "//" +
                        "%s-%s-%s.png" % (input_file_name, i, xref))
            pix = fitz.Pixmap(doc, xref)
            if pix.n < 5:  # this is GRAY or RGB
                pix.writePNG(png_path)
            else:  # CMYK: convert to RGB first
                pix1 = fitz.Pixmap(fitz.csRGB, pix)
                pix1.writePNG(png_path)
                pix1 = None
            pix = None  # drop the reference, as the original did

    output_file = io.StringIO()
    laparams = LAParams()
    rsrcmgr = PDFResourceManager()
    device = XMLConverter(rsrcmgr, output_file, laparams=laparams)
    with open(filename, 'rb') as fh:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            interpreter.process_page(page)
    device.close()

    xml = output_file.getvalue()
    with open(output_file_path, 'w', encoding="utf-8") as fd:
        fd.write(xml)
    output_file.close()
    return xml
def __init__(self, src, limit = None):
    """Load the PDF at ``src`` and store its cleaned XML on ``self.text``.

    ``src`` must end in ``.pdf`` and be at least five characters long,
    otherwise an ``Exception`` is raised. ``limit`` is forwarded to
    ``process_pdf`` as ``maxpages``. The raw converter output is passed
    through ``self.cleanText`` before being stored.
    """
    has_pdf_name = len(src) >= 5 and src[-4:] == ".pdf"
    if not has_pdf_name:
        raise Exception("PDF file has to end in .pdf and has to have a name!")

    pdf_stream = open(src, "rb")
    sink = StringIO()
    resources = PDFResourceManager()
    converter = XMLConverter(resources, sink, codec='UTF-8', laparams=None)
    try:
        process_pdf(resources, converter, pdf_stream, pagenos=None,
                    maxpages=limit, password='', check_extractable=True)
    finally:
        # Release the converter and the input file in every case.
        converter.close()
        pdf_stream.close()

    raw_xml = sink.getvalue()
    sink.close()
    self.text = self.cleanText(raw_xml)
def getTitle(self,stream):
    """Return the document title for the PDF in ``stream``.

    The metadata title is returned when it is usable. When it is missing,
    empty or the literal ``'untitled'``, the first page is converted to XML
    with pdfminer and handed to ``self._getTitleFromXmlStr``.

    Args:
        stream: Seekable file object containing the PDF.
    """
    stream.seek(0)
    info = PdfFileReader(stream).getDocumentInfo()
    # Guard against a missing info dictionary or title, which the original
    # did not: a None info raised AttributeError and a None title skipped
    # the fallback.
    title = info.title if info is not None else None
    if title is not None and title not in ['untitled', '']:
        return title

    # The metadata gave no usable title, so fall back to analysing the
    # first page.
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import XMLConverter
    from pdfminer.layout import LAParams
    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO

    codec = 'utf-8'
    rsrcmgr = PDFResourceManager(caching=True)
    outfp = StringIO()
    # Convert the PDF to XML, using StringIO to hold the result.
    device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=LAParams(),
                          imagewriter=None, stripcontrol=False)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pagenos = set([0])  # only the first page (zero-based index)
    stream.seek(0)
    try:
        for page in PDFPage.get_pages(stream, pagenos, maxpages=0, password='',
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
    finally:
        device.close()

    xml = outfp.getvalue()
    # Encode only real text. Calling .encode() on an already-encoded Python 2
    # byte string triggers an implicit ASCII decode, which fails on
    # non-ASCII titles.
    if not isinstance(xml, bytes):
        xml = xml.encode(codec)
    # Parse the XML to get the title.
    return self._getTitleFromXmlStr(xml)
def pdf_to_xml(pdfpath):
    """Return the pdfminer XML rendering of the PDF at ``pdfpath``.

    The buffer is read only after the converter has been closed, so the
    returned document includes whatever the converter writes on close (the
    closing ``</pages>`` tag). The original read the buffer first and
    returned a truncated document.
    """
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    device = XMLConverter(rsrcmgr, sio, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The with-block closes the PDF even when a page fails to process.
        with open(pdfpath, 'rb') as fp:
            for page in PDFPage.get_pages(fp):
                interpreter.process_page(page)
    finally:
        device.close()
    text = sio.getvalue()
    sio.close()
    return text
def convert(fname, pages=None):
    """Return the pdfminer XML rendering of the PDF at ``fname``.

    Args:
        fname: Path of the PDF file.
        pages: Optional iterable of page numbers handed to
            ``PDFPage.get_pages`` as a set. A falsy value results in an
            empty set.

    Returns:
        The contents of the in-memory ``BytesIO`` output buffer.
    """
    pagenums = set(pages) if pages else set()
    output = BytesIO()
    manager = PDFResourceManager()
    converter = XMLConverter(manager, output, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(manager, converter)
        # The with-block closes the PDF even when a page fails to process.
        with open(fname, "rb") as infile:
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
    finally:
        converter.close()
    text = output.getvalue()
    output.close()
    return text
def trasformaPDFinXML(doc, directoryOutfile):
    """Convert the PDF at ``doc`` to XML inside ``directoryOutfile``.

    Only the part of PDFMiner needed to turn a PDF into XML is used, with
    layout analysis disabled (``laparams=None``).

    Args:
        doc: Path of the PDF. Spaces are removed from it before it is opened.
        directoryOutfile: Directory that receives the output file.

    Returns:
        The output file name: ``"outfile "`` followed by the space-stripped
        path with ``/`` replaced by ``_`` and everything from the first
        ``.pdf`` onwards removed.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    password = ""
    codec = "utf-8"
    laparams = None
    imagewriter = None

    pdf_path = doc.replace(" ", "")
    out_name = pdf_path.replace("/", "_")
    pos = out_name.find(".pdf")
    # find() returns -1 when ".pdf" is absent; slicing with -1 would
    # silently drop the last character of the name.
    if pos != -1:
        out_name = out_name[:pos]
    out_name = "outfile " + out_name

    # The with-block closes the PDF on every path, including the
    # not-extractable error below.
    with open(pdf_path, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization.
        document = PDFDocument(parser, password)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # open() replaces the Python-2-only file() builtin.
        with open(directoryOutfile + "/" + out_name, "w") as outfp:
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, imagewriter=imagewriter)
            try:
                # Create a PDF interpreter object.
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # Process each page contained in the document.
                for page in PDFPage.create_pages(document):
                    interpreter.process_page(page)
            finally:
                device.close()
    return out_name
def parse_pdfs(pdf_filenames):
    """Convert each PDF in ``pdf_filenames`` to an XML file beside it.

    The output path is the input path with its extension replaced by
    ``.xml``. One resource manager is shared across all files.

    Args:
        pdf_filenames: Iterable of paths to PDF files.
    """
    # Set parameters
    pagenos = set()
    maxpages = 0
    password = ''
    imagewriter = None
    codec = 'utf-8'
    caching = True
    laparams = LAParams()
    rsrcmgr = PDFResourceManager(caching=caching)
    # Convert to XML as it retains the most information about text position
    # (compared to text, html, etc).
    for pdf_file in pdf_filenames:
        # Parenthesised single-argument print behaves the same on Python 2
        # and Python 3.
        print("Converting %s to xml." % pdf_file)
        outfile = os.path.splitext(pdf_file)[0] + '.xml'
        with open(pdf_file, 'rb') as fp, open(outfile, 'w') as outfp:
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password, caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)
            finally:
                # Close the device while outfp is still open, also on error.
                device.close()
    # NOTE(review): the source was flattened, so it is unclear whether this
    # message was printed once or once per file; once is assumed.
    print("Conversion complete.")
def convert_pdf_to_xml(path):
    """Return the pdfminer XML rendering of the PDF at ``path``.

    The buffer is read only after the converter has been closed, so the
    returned document includes whatever the converter writes on close (the
    closing ``</pages>`` tag). The original read the buffer first.
    """
    from pdfminer.converter import XMLConverter
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    device = XMLConverter(rsrcmgr, retstr, codec=codec, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # open() replaces the Python-2-only file() builtin; the with-block
        # closes the PDF even when a page fails to process.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
    finally:
        device.close()
    text = retstr.getvalue()
    retstr.close()
    return text
def get_xml_data(self):
    """Write the XML form of ``self.pdffile`` to ``self.xmlfile``.

    Also fills ``self.font_metrics`` with one entry per font cached by the
    resource manager, mapping the font name to its ``bbox`` and ``descent``.
    Fonts lacking one of those attributes are skipped and their attribute
    listing is printed instead.
    """
    rm = PDFResourceManager(caching=True, font_correctors=self.font_correctors)
    laparams = LAParams()
    # Context managers guarantee both files are closed on error.
    with open(self.xmlfile, "wb") as outfp, open(self.pdffile, "rb") as infile:
        device = XMLConverter(rm, outfp, codec="UTF-8", laparams=laparams,
                              imagewriter=None)
        try:
            interpreter = PDFPageInterpreter(rm, device)
            for page in PDFPage.get_pages(infile, set(), maxpages=0,
                                          password="", caching=True,
                                          check_extractable=True):
                # Kept from the original, where the added rotation was 0.
                page.rotate = page.rotate % 360
                interpreter.process_page(page)
            self.font_metrics = {}
            for font in list(rm._cached_fonts.values()):
                try:
                    self.font_metrics[font.fontname] = {
                        "bbox": font.bbox,
                        "descent": font.descent
                    }
                except AttributeError:
                    print((dir(font)))
        finally:
            # Must run before outfp is closed by the with-block.
            device.close()
def convert_xml(inf, outf, page_numbers=None, output_type='xml', codec='utf-8',
                laparams=None, maxpages=0, scale=1.0, rotation=0,
                output_dir=None, strip_control=False, debug=False,
                disable_caching=False):
    """Convert the PDF stream ``inf`` to pdfminer XML written to ``outf``.

    Args:
        inf: Open file object for the PDF.
        outf: Writable file object that receives the XML.
        page_numbers: Page selection passed to ``PDFPage.get_pages``.
        output_type: Unused; the output is always XML.
        codec: Output codec passed to ``XMLConverter``. It used to be
            ignored in favour of a hard-coded ``'utf-8'``, which is still
            the default.
        laparams: Layout parameters. ``None`` (the default) means a fresh
            ``LAParams()``. A caller-supplied value is now honoured instead
            of being overwritten.
        maxpages: Page limit passed to ``PDFPage.get_pages``.
        scale: Unused.
        rotation: Degrees added to each page's own rotation, modulo 360.
        output_dir: When given, images are extracted there through
            ``ImageWriter``.
        strip_control: Unused.
        debug: Unused.
        disable_caching: Turns off caching in the resource manager and in
            page iteration.

    Returns:
        The last page processed, or ``None`` when no page was yielded. The
        original raised ``UnboundLocalError`` in that case.
    """
    if laparams is None:
        laparams = LAParams()
    imagewriter = ImageWriter(output_dir) if output_dir else None
    caching = not disable_caching

    rsrcmgr = PDFResourceManager(caching=caching)
    device = XMLConverter(rsrcmgr, outf, codec=codec, laparams=laparams,
                          imagewriter=imagewriter)
    page = None
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(inf, page_numbers, maxpages=maxpages,
                                      caching=caching, check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
    finally:
        device.close()
    return page
def main(argv=None):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Argument list for ``argparse``. ``None`` lets argparse read
            ``sys.argv``.

    The output type comes from ``-t`` or, when that is absent, from the
    extension of the ``-o`` file name; plain text is the fallback.
    """
    parser = argparse.ArgumentParser(description='Convert PDF into text.')
    parser.add_argument('file', nargs='*', type=argparse.FileType('rb'),
                        default=sys.stdin, help='file(s) to convert')
    parser.add_argument('-C', '--nocache', dest='cache', action='store_false',
                        help='prevent object caching (slower)')
    parser.add_argument('-l', metavar='level', default='warn',
                        help='logging level (warn, info, debug)')
    parser.add_argument('-p', metavar='page', nargs='+', default=[], type=int,
                        help='page number(s) (space separated)')
    parser.add_argument('-m', metavar='maxpages', default=0, type=int,
                        help='maximum number of pages to extract')
    parser.add_argument('-P', metavar='password', default='',
                        help='pdf password')
    parser.add_argument('-o', metavar='outfile', type=argparse.FileType('w'),
                        default=sys.stdout,
                        help='output file name (default: stdout)')
    parser.add_argument('-O', metavar='directory', type=ImageWriter,
                        help='extract images and save to directory')
    parser.add_argument('-t', metavar='outtype',
                        help='output type (text, html, xml, tag)')
    parser.add_argument('-c', metavar='codec', default='utf-8',
                        help='output text encoding (default: %(default)s)')
    lagroup = parser.add_argument_group(title='layout analysis')
    lagroup.add_argument('-n', action='store_true',
                         help='disable layout analysis')
    lagroup.add_argument('-A', action='store_true',
                         help='force layout analysis on all text')
    lagroup.add_argument('-V', action='store_true',
                         help='detect vertical text')
    lagroup.add_argument('-M', metavar='char_margin', type=float,
                         help='custom character margin')
    lagroup.add_argument('-L', metavar='line_margin', type=float,
                         help='custom line margin')
    lagroup.add_argument('-W', metavar='word_margin', type=float,
                         help='custom word margin')
    lagroup.add_argument('-F', metavar='boxes_flow', type=float,
                         help='custom boxes flow')
    lagroup.add_argument('-Y', metavar='layout_mode', default='normal',
                         help='layout mode for HTML (normal, exact, loose)')
    lagroup.add_argument('-s', metavar='scale', default=1, type=float,
                         help='output scaling for HTML')
    args = parser.parse_args(argv)

    logging.basicConfig()
    logging.getLogger('pdfminer').setLevel(args.l.upper())

    if args.n:
        laparams = None
    else:
        laparams = LAParams()
        laparams.all_texts = args.A
        laparams.detect_vertical = args.V
        # Compare with None rather than truthiness so an explicit 0 on the
        # command line (e.g. `-F 0`) is applied instead of dropped.
        if args.M is not None:
            laparams.char_margin = args.M
        if args.L is not None:
            laparams.line_margin = args.L
        if args.W is not None:
            laparams.word_margin = args.W
        if args.F is not None:
            laparams.boxes_flow = args.F

    rsrcmgr = PDFResourceManager(caching=args.cache)

    outtype = args.t
    if not outtype:
        if args.o:
            if args.o.name.endswith('.htm') or args.o.name.endswith('.html'):
                outtype = 'html'
            elif args.o.name.endswith('.xml'):
                outtype = 'xml'
            elif args.o.name.endswith('.tag'):
                outtype = 'tag'

    if outtype == 'xml':
        device = XMLConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams,
                              imagewriter=args.O)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, args.o, codec=args.c, scale=args.s,
                               layoutmode=args.Y, laparams=laparams,
                               imagewriter=args.O)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, args.o, codec=args.c)
    else:
        device = TextConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams,
                               imagewriter=args.O)

    # With no positional argument argparse returns the bare default stream,
    # not a list. Iterating that would yield lines of stdin. Wrap it, and
    # prefer the binary buffer where one exists.
    if isinstance(args.file, list):
        files = args.file
    else:
        files = [getattr(args.file, 'buffer', args.file)]

    pagenos = [i - 1 for i in args.p]  # user-facing page numbers are 1-based
    try:
        for fp in files:
            try:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=args.m,
                            password=args.P, caching=args.cache,
                            check_extractable=True)
            finally:
                fp.close()
    finally:
        # Close the device before the output file it writes to.
        device.close()
        if args.o is not sys.stdout:
            args.o.close()
def lambda_handler(event, context):
    """Convert a newly uploaded S3 PDF to XML, upload it and notify SNS.

    Steps, all taken from the first record of ``event``:
      1. Download the object to ``/tmp/input.pdf``.
      2. Convert it to pdfminer XML and write ``/tmp/extract.xml``.
      3. Upload that file as ``xml/<basename>.xml`` in the same bucket.
      4. Publish the uploaded key to the ``stock_data_extracted`` SNS topic.

    Args:
        event: S3 event notification payload.
        context: Lambda context object (unused).
    """
    # Grab file that was just uploaded to S3 bucket's "pdf" directory
    s3_record = event['Records'][0]['s3']
    bucket = s3_record['bucket']['name']
    s3_new_arrived_filename = urllib.unquote_plus(
        s3_record['object']['key'].encode('utf8'))
    print('Reading file ' + s3_new_arrived_filename + ' from S3')

    extracted_results_from_pdf = '/tmp/extract.xml'
    downloaded_pdf_file = '/tmp/input.pdf'

    # download file into /tmp
    s3.meta.client.download_file(bucket, s3_new_arrived_filename,
                                 downloaded_pdf_file)
    print('Downloaded file ' + s3_new_arrived_filename + ' from S3')

    # extract pdf into xml and upload xml to S3 bucket's "xml" directory
    resource_mgr = PDFResourceManager()
    retstr = BytesIO()
    device = XMLConverter(resource_mgr, retstr, codec='utf-8',
                          laparams=LAParams())
    try:
        # The original never closed this handle. open() also replaces the
        # Python-2-only file() builtin.
        with open(downloaded_pdf_file, 'rb') as infile_pdf_fp:
            interpreter = PDFPageInterpreter(resource_mgr, device)
            for page in PDFPage.get_pages(infile_pdf_fp, set(), maxpages=0,
                                          password='', caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
    finally:
        device.close()
    # Read the buffer only after the device is closed, so anything the
    # converter writes on close is included.
    data = retstr.getvalue()  # xml data extracted from pdf
    retstr.close()

    # write xml (extracted from pdf) to a new file
    print('Opening file ' + extracted_results_from_pdf +
          ' to write extracted xml from ' + s3_new_arrived_filename)
    # Binary mode, because `data` comes from a BytesIO buffer.
    with open(extracted_results_from_pdf, 'wb') as outfile_xml_fp:
        print('Opened file ' + extracted_results_from_pdf)
        outfile_xml_fp.write(data)
        # Earlier code always appended </pages> because it read the buffer
        # before close() (see https://github.com/euske/pdfminer/issues/229).
        # Append it only when it is really missing, so the tag is never
        # duplicated.
        if not data.rstrip().endswith(b'</pages>'):
            outfile_xml_fp.write(b'</pages>')

    filename_without_folderprefix_and_ext = re.sub(
        r'.*/', '', os.path.splitext(s3_new_arrived_filename)[0])
    extracted_xml_filename_in_s3 = (
        'xml/' + filename_without_folderprefix_and_ext + '.xml')
    s3.meta.client.upload_file(extracted_results_from_pdf, bucket,
                               extracted_xml_filename_in_s3)

    # Publish to "StockDataExtracted" SNS topic. Send location of newly
    # extracted XML in S3 in the message to SNS topic. This topic triggers
    # the next lambda function - get_recommended_stocks
    message = {"topten_trader_xml_filepath": extracted_xml_filename_in_s3}
    sns_client = boto3.client('sns', region_name='us-east-1')
    sns_client.publish(
        TargetArn='arn:aws:sns:us-east-1:<aws_account_#>:stock_data_extracted',
        Message=json.dumps({'default': json.dumps(message)}),
        Subject='Stock Buy Recommendations ' + str(datetime.date.today()),
        MessageStructure='json')
# Convert `fname` to pdfminer XML when its name ends in "pdf". The output
# goes to `fname + '.txt'` even though the content is XML.
if fname[-3:] == "pdf":
    # Set parameters
    pagenos = set()
    maxpages = 0
    password = ''
    imagewriter = None
    codec = 'utf-8'
    caching = True
    laparams = LAParams()
    outfile = fname + '.txt'
    rsrcmgr = PDFResourceManager(caching=caching)
    # open() replaces the Python-2-only file() builtin. The with-block
    # closes both files even when a page fails to process.
    with open(outfile, 'w') as outfp, open(fname, 'rb') as fp:
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                              imagewriter=imagewriter)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
        finally:
            # Must run before outfp is closed by the with-block.
            device.close()
def get_placeholder_image_info(filename, xmlfile, outputdir):
    """Convert a PDF to XML and describe every image found in it.

    The PDF is rendered to ``outputdir/xmlfile`` with images extracted by
    ``MyImageWriter``. The XML is then parsed and each ``<image>`` element
    is reported.

    Args:
        filename: Path of the PDF file.
        xmlfile: File name of the XML output inside ``outputdir``.
        outputdir: Directory for the XML and extracted images; created when
            it does not exist.

    Returns:
        A dict with two keys. ``'image_info'`` is a list of dicts holding
        ``id``, ``src``, ``height``, ``width``, ``bbox`` and ``tag``, where
        ``tag`` is ``None`` if the image file is missing on disk.
        ``'placeholder_imgs'`` is the list of counter values for images
        whose tag is not ``None``.
    """
    if not os.path.isdir(outputdir):
        os.makedirs(outputdir)
    image_info = []
    placeholder_imgs = []
    password = ''
    caching = True
    rotation = 0
    maxpages = 0
    pagenos = set()
    codec = 'utf-8'
    outfile = os.path.join(outputdir, xmlfile)
    laparams = LAParams()
    imagewriter = MyImageWriter(outputdir)
    rsrcmgr = PDFResourceManager(caching=caching)

    # open() replaces the Python-2-only file() builtin. The with-block
    # closes both files even when a page fails to process.
    with open(outfile, 'w') as outfp, open(filename, 'rb') as fp:
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                              imagewriter=imagewriter)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        finally:
            # Must run before outfp is closed by the with-block.
            device.close()

    root = lxml.etree.parse(outfile)
    found_images = root.findall('.//image')
    found_image_boxes = root.xpath('.//figure[image]')
    jpg_count = 0
    for i, e in enumerate(found_images):
        imgpth = os.path.join(outputdir, e.attrib['src'])
        if not os.path.exists(imgpth):
            # Parenthesised single-argument print behaves the same on
            # Python 2 and Python 3.
            print("path doesnt exist - tag is none for " + imgpth)
            tag = None
        else:
            tag = get_image_tag(imgpth)
        image_info.append({
            "id": i,
            "src": imgpth,
            "height": e.attrib['height'],
            "width": e.attrib['width'],
            # assumes one <figure> per <image>, in the same order - TODO confirm
            "bbox": found_image_boxes[i].attrib['bbox'],
            "tag": tag
        })
        if tag is not None:
            placeholder_imgs.append(jpg_count)
        # NOTE(review): the source was flattened, so it is unclear whether
        # this increment belonged to the `if` above or to the loop body.
        # The loop body is assumed; verify against the original.
        jpg_count += 1
    return {'image_info': image_info, 'placeholder_imgs': placeholder_imgs}
def main(argv=None):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Argument list for ``argparse``. ``None`` lets argparse read
            ``sys.argv``.

    The output type comes from ``-t`` or, when that is absent, from the
    extension of the ``-o`` file name; plain text is the fallback.
    """
    parser = argparse.ArgumentParser(description='Convert PDF into text.')
    parser.add_argument('file', nargs='*', type=argparse.FileType('rb'),
                        default=sys.stdin, help='file(s) to convert')
    parser.add_argument('-C', '--nocache', dest='cache', action='store_false',
                        help='prevent object caching (slower)')
    parser.add_argument('-l', metavar='level', default='warn',
                        help='logging level (warn, info, debug)')
    parser.add_argument('-p', metavar='page', nargs='+', default=[], type=int,
                        help='page number(s) (space separated)')
    parser.add_argument('-m', metavar='maxpages', default=0, type=int,
                        help='maximum number of pages to extract')
    parser.add_argument('-P', metavar='password', default='',
                        help='pdf password')
    parser.add_argument('-o', metavar='outfile', type=argparse.FileType('w'),
                        default=sys.stdout,
                        help='output file name (default: stdout)')
    parser.add_argument('-O', metavar='directory', type=ImageWriter,
                        help='extract images and save to directory')
    parser.add_argument('-t', metavar='outtype',
                        help='output type (text, html, xml, tag)')
    parser.add_argument('-c', metavar='codec', default='utf-8',
                        help='output text encoding (default: %(default)s)')
    lagroup = parser.add_argument_group(title='layout analysis')
    lagroup.add_argument('-n', action='store_true',
                         help='disable layout analysis')
    lagroup.add_argument('-A', action='store_true',
                         help='force layout analysis on all text')
    lagroup.add_argument('-V', action='store_true',
                         help='detect vertical text')
    lagroup.add_argument('-M', metavar='char_margin', type=float,
                         help='custom character margin')
    lagroup.add_argument('-L', metavar='line_margin', type=float,
                         help='custom line margin')
    lagroup.add_argument('-W', metavar='word_margin', type=float,
                         help='custom word margin')
    lagroup.add_argument('-F', metavar='boxes_flow', type=float,
                         help='custom boxes flow')
    lagroup.add_argument('-Y', metavar='layout_mode', default='normal',
                         help='layout mode for HTML (normal, exact, loose)')
    lagroup.add_argument('-s', metavar='scale', default=1, type=float,
                         help='output scaling for HTML')
    args = parser.parse_args(argv)

    logging.basicConfig()
    logging.getLogger('pdfminer').setLevel(args.l.upper())

    if args.n:
        laparams = None
    else:
        laparams = LAParams()
        laparams.all_texts = args.A
        laparams.detect_vertical = args.V
        # Compare with None rather than truthiness so an explicit 0 on the
        # command line (e.g. `-F 0`) is applied instead of dropped.
        if args.M is not None:
            laparams.char_margin = args.M
        if args.L is not None:
            laparams.line_margin = args.L
        if args.W is not None:
            laparams.word_margin = args.W
        if args.F is not None:
            laparams.boxes_flow = args.F

    rsrcmgr = PDFResourceManager(caching=args.cache)

    outtype = args.t
    if not outtype:
        if args.o:
            if args.o.name.endswith('.htm') or args.o.name.endswith('.html'):
                outtype = 'html'
            elif args.o.name.endswith('.xml'):
                outtype = 'xml'
            elif args.o.name.endswith('.tag'):
                outtype = 'tag'

    if outtype == 'xml':
        device = XMLConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams,
                              imagewriter=args.O)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, args.o, codec=args.c, scale=args.s,
                               layoutmode=args.Y, laparams=laparams,
                               imagewriter=args.O)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, args.o, codec=args.c)
    else:
        device = TextConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams,
                               imagewriter=args.O)

    # With no positional argument argparse returns the bare default stream,
    # not a list. Iterating that would yield lines of stdin. Wrap it, and
    # prefer the binary buffer where one exists.
    if isinstance(args.file, list):
        files = args.file
    else:
        files = [getattr(args.file, 'buffer', args.file)]

    pagenos = [i - 1 for i in args.p]  # user-facing page numbers are 1-based
    try:
        for fp in files:
            try:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=args.m,
                            password=args.P, caching=args.cache,
                            check_extractable=True)
            finally:
                fp.close()
    finally:
        # Close the device before the output file it writes to.
        device.close()
        if args.o is not sys.stdout:
            args.o.close()
def get_text(url, parse=True, laparams=laparams):
    """Download a PDF, convert it to XML and optionally flatten it to text.

    ``'v1.full.pdf'`` is appended to ``url`` before downloading. HTTP 429
    responses are retried up to four times with exponential back-off plus
    jitter.

    Args:
        url: URL prefix of the document.
        parse: When false, the raw XML is returned without text extraction.
        laparams: Layout parameters for ``XMLConverter``. The default is
            the module-level ``laparams`` at definition time.

    Returns:
        A ``(suffix, payload)`` tuple:
          * ``('.raw.txt', downloaded bytes)`` if PDF conversion failed,
          * ``('.xml', xml bytes)`` if ``parse`` is false,
          * ``('.txt', text)`` with paragraphs joined by a separator line,
          * ``('.raw.xml', xml bytes)`` if text extraction failed.

    NOTE(review): the source was flattened and its whitespace collapsed.
    The nesting below was reconstructed from the logic, and string literals
    containing runs of spaces may have been altered. Verify against the
    original file.
    """
    url += 'v1.full.pdf'
    max_attempts = 4
    attempts = 0
    print(url)
    while attempts < max_attempts:
        r = requests.get(url)
        if r.status_code != 429:
            break
        # If rate limited, wait and try again (in seconds)
        time.sleep((2**attempts) + random.random())
        attempts = attempts + 1
    data = r.content

    try:
        f = io.BytesIO(data)
        rsrcmgr = PDFResourceManager()
        retstr = BytesIO()
        codec = 'utf-8'
        device = XMLConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0  # is for all
        caching = True
        pagenos = set()
        for page in PDFPage.get_pages(f, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)
        device.close()
        pdf_data = retstr.getvalue()
        retstr.close()
    except Exception:
        # Best effort: anything that is not parseable as a PDF is handed
        # back raw. `except Exception` keeps that behaviour without also
        # swallowing KeyboardInterrupt/SystemExit as the bare except did.
        return ('.raw.txt', data)

    try:
        if parse == False:
            return ('.xml', pdf_data)

        root = ET.fromstring(pdf_data)
        temp = root.find('.//text')
        curr_font = temp.get('font')
        curr_size = float(temp.get('size'))
        text = ''
        rmargin = 70
        for l in root.iterfind('.//textline'):
            for t in l.findall('./text'):
                if (t.get('font') or t.get('size')) is None:
                    # <text> elements without font/size are separators.
                    if t.text[0] == ' ':
                        text += ' '
                    else:
                        text += '<<NEWLINE>>'
                else:
                    x0, y0, x1, y1 = [
                        float(z) for z in t.get('bbox').split(',')
                    ]
                    char_size = float(t.get('size', 0))
                    char_font = t.get('font', '')
                    # Skip characters outside the vertical band 75..750.
                    if y0 > 750 or y0 < 75:
                        continue
                    if x0 < rmargin:
                        # Left of the margin only characters containing
                        # letters are kept, and they pull the margin in.
                        if re.search('[A-Za-z]+', t.text) is not None:
                            print('changing rmargin to ', str(x0 - 1))
                            rmargin = x0 - 1
                            text += t.text
                            continue
                    else:
                        if (char_size != curr_size) or (char_font != curr_font):
                            if (char_size) <= 8.:
                                continue
                            text += '<<NEWFONT>>' + t.text
                            curr_font = t.get('font')
                            curr_size = float(t.get('size'))
                        else:
                            text += t.text

        lines = text.split('<<NEWLINE>>')
        for l in lines[:5]:
            print(l)

        doc = lines[0]
        open_parens = False
        for i, t in enumerate(lines):
            if (i == 0):
                if re.search(r'^\s*[a-z(]', lines[1]) is None:
                    doc += '\n'
                continue
            if len(t) < 1:
                if not open_parens:
                    doc += '\n'
                continue
            # Track whether a parenthesis is still open across lines.
            o = len(re.findall(r'\(', t))
            if open_parens:
                o += 1
            c = len(re.findall(r'\)', t))
            open_parens = o > c
            if not open_parens:
                if t.startswith(' '):
                    t = re.sub(r'^ +', '<<PARAGRAPH>>', t)
                if t.lstrip(' ').startswith('<<NEWFONT>>') and lines[
                        i - 1].rstrip(' ').endswith('.'):
                    t = re.sub(r'^<<NEWFONT>>', '<<PARAGRAPH>>', t.lstrip(' '))
                if t.rstrip(' ').endswith('.'):
                    t += '<<PARAGRAPH>>'
                if re.match(r'^\d{1,3}\.<<NEWFONT>>', t):
                    t = '<<PARAGRAPH>>' + t
            doc += t

        doc = re.sub(r'(?<=[^.])\n+', '', doc)
        doc = re.sub(r' {3,}', '<<PARAGRAPH>>', doc)
        print(doc[:50])

        parsed = []
        for _text in doc.split(r'<<PARAGRAPH>>'):
            _text = re.sub('(<<NEWLINE>>)+', '\n', _text)
            _text = re.sub(r' ', r'\n', _text)
            _text = re.sub(
                r'<<NEWFONT>>(?P<url>http[a-zA-Z0-9./+?_=:-]+)( <<NEWFONT>>)?',
                r'\g<url>', _text)
            _text = re.sub(r'<<NEWFONT>> <<NEWFONT>>', r' ', _text)
            # re.M must be passed as flags=. Given positionally it was taken
            # as `count` and capped the substitution at 8 replacements.
            _text = re.sub(r'\(<<NEWFONT>>(.+)<<NEWFONT>>\)', r'(\g<1>)',
                           _text, flags=re.M)
            pattern = re.compile(
                r'<<NEWFONT>>(((\W|\d)+)|([A-Za-z_-]{1,2}\n?))<<NEWFONT>>')
            _text = pattern.sub(r'\g<1>', _text)
            pat2 = re.compile(
                r'<<NEWFONT>>([A-Za-z- :]+)<<NEWFONT>>([.:]?)')
            _text = pat2.sub(r'\g<1>\g<2>\n', _text)
            pat3 = re.compile(
                r'<<NEWFONT>>([A-Za-z_-]{1,3} *\n?)<<NEWFONT>>')
            _text = pat3.sub(r'\g<1>', _text)
            _text = re.sub(r'<<NEWFONT>>(.+)<<NEWFONT>>([a-z]+)',
                           r'\g<1> \g<2>', _text)
            _text = re.sub(r'<<NEWFONT>>(.+)<<NEWFONT>>(\W*)\.?',
                           r'\g<1> \g<2>', _text)
            _text = re.sub(r'-\n', r'-', _text)
            _text = re.sub(r'\((.+)(?:\n)(.+)\)', r'(\g<1>\g<2>))', _text)
            _text = re.sub(r'\((.+)<<NEWFONT>>(.+)\)', r'(\g<1>\g<2>)',
                           _text, flags=re.M)
            if len(_text.strip(' \n')) > 0:
                if len(re.findall(r'<<NEWFONT>>', _text)) == 1:
                    _text = re.sub(r'<<NEWFONT>>', '\n', _text)
                parsed.append(_text)

        # Glue a paragraph that starts in lower case onto its predecessor.
        parsed2 = [
            parsed[0],
        ]
        for i, p in enumerate(parsed):
            if i > 0:
                if re.search(r'^\s*\n*[a-z]', p) is not None:
                    parsed2[i - 1] += p
                    p = ''
                parsed2.append(p)
        parsed2 = '\n===================================\n'.join(
            [p for p in parsed2 if p != ''])
        print(parsed2[:50])
        return ('.txt', parsed2)
    except Exception:
        # Best effort: if flattening fails, hand back the XML instead.
        return ('.raw.xml', pdf_data)
# Script body: render the PDF named by the first command-line argument as
# pdfminer XML on standard output.
laparams = LAParams()
imagewriter = None
codec = 'utf-8'
outfp = sys.stdout
stripcontrol = True
pagenos = set()
fname = sys.argv[1]
rsrcmgr = PDFResourceManager(caching=True)
device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                      imagewriter=imagewriter, stripcontrol=stripcontrol)
try:
    # open() replaces the Python-2-only file() builtin. The with-block
    # closes the PDF even when a page fails to process.
    with open(fname, 'rb') as fp:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        interpreter.debug = 1
        for page in PDFPage.get_pages(fp, pagenos, maxpages=0, password='',
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
finally:
    device.close()
    # Flush rather than close: outfp is sys.stdout, and closing it would
    # break any later output from the interpreter.
    outfp.flush()