def get_tables(fh):
    """
    Return a list of 'tables' from the given file handle, where a table is a
    list of rows, and a row is a list of strings.
    """
    result = []
    doc, interpreter, device = initialize_pdf_miner(fh)
    doc_length = len(list(PDFPage.create_pages(doc)))
    for i, pdf_page in enumerate(PDFPage.create_pages(doc)):
        #print("Trying page {}".format(i + 1))
        if not page_contains_tables(pdf_page, interpreter, device):
            #print("Skipping page {}: no tables.".format(i + 1))
            continue
        # Receive the LTPage object for the page.
        interpreter.process_page(pdf_page)
        processed_page = device.get_result()
        (table, _) = page_to_tables(
            processed_page, extend_y=False, hints=[], atomise=True)
        crop_table(table)
        result.append(Table(table, i + 1, doc_length, 1, 1))
    return result
def GetScript(filename):
    global scriptName
    ResetGlobals()
    scriptName = filename
    password = ""
    # Open a PDF file.
    fp = open(filename, 'rb')
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    # Supply the password for initialization.
    document = PDFDocument(parser, password)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        print "---Not translatable---"
        return  # raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object.
    device = PDFDevice(rsrcmgr)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
    # Set parameters for analysis.
    laparams = LAParams()
    laparams.boxes_flow = 2
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for pgnum, page in enumerate(PDFPage.create_pages(document)):
        if pgnum == 0:
            continue
        interpreter.process_page(page)
        # Receive the LTPage object for the page.
        layout = device.get_result()
        text = []
        for obj in layout:
            try:
                if obj.get_text().strip():
                    text.append(TextBlock(obj.x0, obj.y1, obj.get_text().strip()))
            except AttributeError:
                # Not every layout object carries text; skip those that don't.
                pass
        print ".",
        text.sort(key=lambda row: (-row.y))
        # Parse all of the "line" objects in each page.
        for line in text:
            ParseLine(line.text, line.x)
def calculate_locations(filename, keywords):
    locations = []
    fp = open(filename, 'rb')
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    document = PDFDocument(parser)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object.
    device = PDFDevice(rsrcmgr)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.create_pages(document)
    pagenum = 0
    reader = PdfFileReader(file(filename, "rb"))
    for page in pages:
        interpreter.process_page(page)
        # Receive the LTPage object for the page.
        layout = device.get_result()
        page = reader.getPage(pagenum)
        x = page.trimBox[0].as_numeric()
        y = page.trimBox[1].as_numeric()
        # Handling the special case of a negative origin.
        if x > 0 and y < 0:
            x = 0
        # print "At page = %s X = %s , y = %s" % (pagenum, x, y)
        for keyword in keywords:
            print '********************************'
            co_ordinates = get_location(keyword, layout, x, y)
            print 'Keyword %s , location %s' % (keyword, co_ordinates)
            print '********************************'
            if co_ordinates is not None:
                for location in co_ordinates:
                    print "PageNum-->%s" % pagenum
                    l = LocationKeeper(keyword, location, pagenum)
                    locations.append(l)
        pagenum += 1
    return locations
def convert(url, pages=None):
    assert isinstance(url, basestring)
    assert pages is None or isinstance(pages, list)

    rscmng = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rscmng, retstr, codec='utf-8', laparams=LAParams())
    web_page = urllib2.urlopen(urllib2.Request(url))
    fp = StringIO(web_page.read())
    interpreter = PDFPageInterpreter(rscmng, device)
    pdf_pages = PDFPage.get_pages(
        fp,
        set(pages if pages is not None else []),
        maxpages=0,
        password='',
        caching=True,
        check_extractable=True
    )
    for page in pdf_pages:
        interpreter.process_page(page)
    result = retstr.getvalue()

    fp.close()
    web_page.close()
    device.close()
    retstr.close()
    return result
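# A minimal usage sketch for convert() above, in the snippet's own Python 2
# idiom; the URL is hypothetical.
if __name__ == '__main__':
    # Extract only the first two pages (page indices are 0-based).
    text = convert('http://example.com/sample.pdf', pages=[0, 1])
    print text[:200]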
def pdf_to_text(page_object):
    parser = PDFParser(page_object)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument(parser)
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.initialize('')
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    text_content = []
    # i is the page number.
    for i, page in enumerate(PDFPage.create_pages(doc)):
        interpreter.process_page(page)
        # Receive the LTPage object for the page.
        layout = device.get_result()
        for obj in layout:
            if isinstance(obj, (LTTextBox, LTTextLine)):
                text_content.append(obj.get_text())
    return text_content
def get_pdf_text(path):
    """
    Reads a pdf file and returns a dict of the text where the index
    represents the page number.

    http://stackoverflow.com/a/20905381
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    # Change to utf-8 if the text comes out garbled.
    codec = 'ascii'
    # codec = 'utf-8'
    laparams = LAParams()
    pages = {}
    # Note: the `pages` keyword is not part of the stock TextConverter; it
    # assumes the modified converter from the answer linked above.
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams,
                           showpageno=True, pages=pages)
    fp = file(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                  password=password, caching=caching,
                                  check_extractable=True):
        interpreter.process_page(page)
    fp.close()
    device.close()
    retstr.close()
    return pages
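# A minimal usage sketch for get_pdf_text(); 'report.pdf' is a hypothetical
# path, and keying the dict by page number assumes the modified TextConverter
# from the linked answer.
pages = get_pdf_text('report.pdf')
for pageno in sorted(pages):
    print "page %s: %d characters" % (pageno, len(pages[pageno]))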
def dumppdf(fname, objids, pagenos, password='', dumpall=False,
            codec=None, extractdir=None):
    fp = file(fname, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser, password)
    res = ""
    if objids:
        for objid in objids:
            obj = doc.getobj(objid)
            res += dumpxml(obj, codec=codec)
    if pagenos:
        for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
            if pageno in pagenos:
                if codec:
                    for obj in page.contents:
                        obj = stream_value(obj)
                        res += dumpxml(obj, codec=codec)
                else:
                    res += dumpxml(page.attrs)
    if dumpall:
        res += dumpallobjs(doc, codec=codec)
    if (not objids) and (not pagenos) and (not dumpall):
        res += dumptrailers(doc)
    fp.close()
    if codec not in ('raw', 'binary'):
        res += '\n'
    return res
def parse_pdf_pdfminer(self, f, fpath):
    try:
        laparams = LAParams()
        laparams.all_texts = True
        rsrcmgr = PDFResourceManager()
        pagenos = set()
        if self.dedup:
            self.dedup_store = set()
        self.handler.print_header(fpath)
        page_num = 0
        for page in PDFPage.get_pages(f, pagenos, check_extractable=True):
            page_num += 1
            retstr = StringIO()
            device = TextConverter(rsrcmgr, retstr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            interpreter.process_page(page)
            data = retstr.getvalue()
            retstr.close()
            self.parse_page(fpath, data, page_num)
        self.handler.print_footer(fpath)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        self.handler.print_error(fpath, e)
def read_fields(pdffile):
    outfields = list()
    fp = open(pdffile, 'rb')
    id_to_page = dict()
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    pageno = 1
    for page in PDFPage.create_pages(doc):
        id_to_page[page.pageid] = pageno
        pageno += 1
    fields = resolve1(doc.catalog['AcroForm'])['Fields']
    for i in fields:
        field = resolve1(i)
        name, value, rect, page, field_type = (
            field.get('T'), field.get('V'), field.get('Rect'),
            field.get('P'), field.get('FT'))
        logmessage("name is " + str(name) + " and FT is |" + str(field_type) + "|")
        if page is not None:
            pageno = id_to_page[page.objid]
        else:
            pageno = 1
        if str(field_type) == '/Btn':
            if value == '/Yes':
                default = "Yes"
            else:
                default = "No"
        elif str(field_type) == '/Sig':
            default = '${ user.signature }'
        else:
            if value is not None:
                default = value
            else:
                default = word("something")
        outfields.append((name, default, pageno, rect, field_type))
    return outfields
def convert_pdf_to_txt(path, output):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = file(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                  password=password, caching=caching,
                                  check_extractable=True):
        interpreter.process_page(page)
    text = retstr.getvalue()
    fp.close()
    device.close()
    retstr.close()
    f = open(output, 'wb')
    f.write(text)
    f.close()
    return text
def dumppdf(outfp, fname, objids, pagenos, password='', dumpall=False,
            codec=None, extractdir=None):
    fp = file(fname, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    doc.initialize(password)
    if objids:
        for objid in objids:
            obj = doc.getobj(objid)
            dumpxml(outfp, obj, codec=codec)
    if pagenos:
        for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
            if pageno in pagenos:
                if codec:
                    for obj in page.contents:
                        obj = stream_value(obj)
                        dumpxml(outfp, obj, codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
    if dumpall:
        dumpallobjs(outfp, doc, codec=codec)
    if (not objids) and (not pagenos) and (not dumpall):
        dumptrailers(outfp, doc)
    fp.close()
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
def convert_pdf_to_txt(self, path):
    """
    A very simple conversion function which returns text for parsing
    from PDF.

    path = The path to the file
    """
    try:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(
            rsrcmgr, retstr, codec=codec, laparams=laparams)
        fp = file(path, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0
        caching = True
        pagenos = set()
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)
        text = retstr.getvalue()
        fp.close()
        device.close()
        retstr.close()
        return text
    except Exception as e:
        self.logger.error("Failed to convert PDF to text: " + str(e))
        return ""
def pdf_to_txt(in_file):
    """ Turn a PDF file into a (roughly processed) TXT file. """
    # Open a PDF file.
    fp = open(in_file, 'rb')
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    document = PDFDocument(parser)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # Receive the LTPage object for the page.
        layout = device.get_result()
        for obj in layout:
            if isinstance(obj, LTTextBoxHorizontal):
                out_file = in_file[:-3] + 'txt'
                with open(out_file, 'a') as dst_file:
                    text = obj.get_text().encode('utf-8')
                    dst_file.write(text + '\n')
    return None
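# A minimal usage sketch for pdf_to_txt(); 'report.pdf' is a hypothetical
# path. The extracted text is appended to 'report.txt' next to the source.
pdf_to_txt('report.pdf')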
def convert_pdf_to_txt(path):
    ## TAKEN FROM STACK OVERFLOW
    ## See http://www.unixuser.org/~euske/python/pdfminer/programming.html for a tutorial.
    ## Also see https://github.com/dpapathanasiou/pdfminer-layout-scanner/blob/master/layout_scanner.py
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    fp = file(path, 'rb')
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    # Read text from pages.
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                  password=password, caching=caching,
                                  check_extractable=True):
        interpreter.process_page(page)
    text = retstr.getvalue()
    fp.close()
    device.close()
    retstr.close()
    return text
def get_layout(path):
    '''Return a list of LTPage layout objects, one per page, each containing
    every character in the document along with its location.'''
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    fp = file(path, 'rb')
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    layout = []
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                  password=password, caching=caching,
                                  check_extractable=True):
        interpreter.process_page(page)
        layout.append(device.get_result())
    fp.close()
    device.close()
    return layout
def pdf_read(pdf):
    """
    Use PDFMiner to extract text from pdf file.
    <PDFMiner, even though more low-level, is a pretty good tool to read pdfs>

    Args:
        *pdf* (str) -- path to pdf file

    Returns:
        *text* (str) -- a text extracted from pdf
    """
    # Initializing objects.
    res_manager = PDFResourceManager()
    strio = StringIO()
    lps = LAParams()
    device = TextConverter(res_manager, strio, codec='utf-8', laparams=lps)
    interpreter = PDFPageInterpreter(res_manager, device)
    # Opening the pdf file with 'rb' mode for reading binary files.
    pdf_file = file(pdf, 'rb')
    for page in PDFPage.get_pages(pdf_file, maxpages=0, password='',
                                  caching=True, check_extractable=True):
        interpreter.process_page(page)
    # Finishing up.
    pdf_file.close()
    device.close()
    text = strio.getvalue()
    strio.close()
    return text
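# A minimal usage sketch for pdf_read(); 'article.pdf' is a hypothetical path.
text = pdf_read('article.pdf')
print text[:500]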
def __init__(self, file, password='', just_text=1, check_extractable=True,
             char_margin=1.0, line_margin=0.1, word_margin=0.1):
    self.parser = PDFParser(file)
    self.laparams = LAParams(char_margin=char_margin,
                             line_margin=line_margin,
                             word_margin=word_margin)
    if PYTHON_3:
        self.doc = PDFDocument()
        self.parser.set_document(self.doc)
        self.doc.set_parser(self.parser)
        self.doc.initialize(password)
    else:
        self.doc = PDFDocument(self.parser, password)
    if not check_extractable or self.doc.is_extractable:
        self.resmgr = PDFResourceManager()
        self.device = TextConverter(self.resmgr, outfp=StringIO(),
                                    laparams=self.laparams)
        self.interpreter = PDFPageInterpreter(self.resmgr, self.device)
        if PYTHON_3:
            page_generator = self.doc.get_pages()
        else:
            page_generator = PDFPage.create_pages(self.doc)
        for page in page_generator:
            self.append(self.interpreter.process_page(page))
        self.metadata = self.doc.info
    if just_text:
        self._cleanup()
def convert_pdf_to_txt(path):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = file(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 120
    caching = True
    pagenos = set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                  password=password, caching=caching,
                                  check_extractable=True):
        interpreter.process_page(page)
    text = retstr.getvalue()
    try:
        fp.close()
        device.close()
        retstr.close()
    except Exception:
        pass
    return text
def parsePDF(pdf_file):
    pdf_file = open(pdf_file, "rb").read()
    # Cast to StringIO object
    from StringIO import StringIO
    memory_file = StringIO(pdf_file)
    # Create a PDF parser object associated with the StringIO object
    parser = PDFParser(memory_file)
    # Create a PDF document object that stores the document structure
    document = PDFDocument(parser)
    # Define parameters for the PDF device object
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    codec = "utf-8"
    # Create a PDF device object
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Create a PDF interpreter object
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process the document (only the first page: the loop breaks after it)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        data = retstr.getvalue()
        print data
        break
def run(path):
    print "Calling parser: %s" % path
    t0 = time.clock()
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = file(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    book = Book()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                  password=password, caching=caching,
                                  check_extractable=True):
        page_tmp = Page()
        begin_page = len(retstr.getvalue())
        interpreter.process_page(page)
        page_tmp.text = retstr.getvalue()[begin_page:-1]
        book.pages.append(page_tmp)
    fp.close()
    device.close()
    retstr.close()
    print "Parsing in:", time.clock() - t0
    return book
def convert_pdf_to_txt(path):
    """ Converts PDF to text using the pdfminer library """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = "utf-8"
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    file_handle = file(path, "rb")
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    for page in PDFPage.get_pages(
            file_handle, pagenos, maxpages=maxpages, password=password,
            caching=caching, check_extractable=True):
        interpreter.process_page(page)
    text = retstr.getvalue()
    file_handle.close()
    device.close()
    retstr.close()
    return text
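# A minimal usage sketch for convert_pdf_to_txt(); the path is hypothetical.
text = convert_pdf_to_txt("thesis.pdf")
print text.count("\n"), "lines extracted"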
def pdf2xml(infile):
    '''
    Return a string of XML representation for given PDF file handle.
    Uses pdfminer to do the conversion and does some final post-processing.
    '''
    outfile = StringIO()
    # Empirically determined... (see pdf2txt.py)
    laparams = LAParams()
    laparams.char_margin = 0.4
    rsrcmgr = PDFResourceManager(caching=False)
    device = XMLConverter(rsrcmgr, outfile, codec='utf-8', laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    if page_api:
        for page in PDFPage.get_pages(infile, set()):
            interpreter.process_page(page)
    else:
        process_pdf(rsrcmgr, device, infile, set())
    infile.close()
    return outfile.getvalue().replace("\n", "")
def Parse(self):
    # First check whether a cache exists and whether it is newer than the PDF.
    if not os.path.exists(parseCacheDir):
        os.makedirs(parseCacheDir)
    cacheFile = os.path.join(parseCacheDir,
                             os.path.basename(self.pdfFileName) + '.cache')
    foundCache = (os.path.isfile(cacheFile) and
                  os.path.getsize(cacheFile) > 0 and
                  os.path.getmtime(cacheFile) > os.path.getmtime(self.pdfFileName))
    if foundCache:
        fp = open(cacheFile, 'rb')
        self.RawData = pickle.load(fp)
        fp.close()
    else:
        fp = open(self.pdfFileName, 'rb')
        for page in PDFPage.get_pages(fp, None, maxpages=1):
            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            interpreter.process_page(page)
            layout = device.get_result()
            self.__readobj(layout._objs)
            for category in self.RawData.values():
                self.__reverseYaxis(category, layout.bbox[3])
        cacheFp = open(cacheFile, 'wb')
        pickle.dump(self.RawData, cacheFp)
        cacheFp.close()
        fp.close()
    self.__calculateBoundary()
    self.__assignCharsAndLinesToCell()
    self.__processCells()
    return (self.effectiveFrom, self.__getResult())
def extract_pdf(path, languages=None):
    """ Extract content from a PDF file.

    This will attempt to extract textual content first. If none is
    found, it'll send the file through OCR. """
    with open(path, 'rb') as fh:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        parser = PDFParser(fh)
        doc = PDFDocument(parser, '')
        result = {'pages': []}
        if len(doc.info):
            for k, v in doc.info[-1].items():
                k = k.lower().strip()
                if k != 'pages':
                    result[k] = safe_text(v)
        if not doc.is_extractable:
            log.warning("PDF not extractable: %s", path)
            return result
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            layout = device.get_result()
            text = _convert_page(layout, languages)
            result['pages'].append(text)
        device.close()
        return result
def extract_text(self):
    pdf_data = file(self.local_file, 'rb').read()
    pdf_stream = io.BytesIO(pdf_data)
    laparams = LAParams()
    resource_manager = PDFResourceManager(caching=True)
    codec = 'utf-8'
    output_stream = io.BytesIO()
    pagenos = set()
    device = TextConverter(
        resource_manager,
        output_stream,
        codec=codec,
        laparams=laparams,
    )
    interpreter = PDFPageInterpreter(
        resource_manager,
        device,
    )
    pages = PDFPage.get_pages(
        pdf_stream,
        pagenos,
        maxpages=0,
        caching=True,
        check_extractable=True,
    )
    for page in pages:
        interpreter.process_page(page)
    self.text = output_stream.getvalue().decode('utf8')
def convert_pdf_to_txt(path):
    temp = os.path.splitext(path)
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = "utf-8"
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = file(path, "rb")
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    for page in PDFPage.get_pages(
            fp, pagenos, maxpages=maxpages, password=password,
            caching=caching, check_extractable=True):
        interpreter.process_page(page)
    text = retstr.getvalue()
    fp.close()
    device.close()
    retstr.close()
    outputFile = temp[0] + ".txt"
    print outputFile
    ff = open(outputFile, "w")
    ff.write(text)
    ff.close()
def extract_text_from_pdf(pdf_filename):
    """
    Function to extract the text from pdf documents using pdfminer

    Parameters:
    -----------
    pdf_filename -- string
        File name of the pdf document as string

    Returns:
    --------
    extracted_text -- string
        Text extracted from pdf as string
    """
    resource_manager = PDFResourceManager()
    return_string = StringIO()
    la_params = LAParams()
    device = TextConverter(resource_manager, return_string, codec='utf-8',
                           laparams=la_params)
    fp = file(pdf_filename, 'rb')
    interpreter = PDFPageInterpreter(resource_manager, device)
    page_nos = set()
    for page in PDFPage.get_pages(fp, page_nos):
        interpreter.process_page(page)
    fp.close()
    device.close()
    extracted_text = return_string.getvalue()
    return_string.close()
    return extracted_text
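# A minimal usage sketch for extract_text_from_pdf(); 'paper.pdf' is a
# hypothetical filename.
extracted_text = extract_text_from_pdf('paper.pdf')
print extracted_text[:200]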
def pdf_from_url_to_txt(url, maxpages=0):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Open the url provided as an argument to the function and read the content.
    f = urllib2.urlopen(urllib2.Request(url)).read()
    # Cast to StringIO object.
    fp = StringIO(f)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    caching = True
    pagenos = set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                  password=password, caching=caching,
                                  check_extractable=True):
        interpreter.process_page(page)
    fp.close()
    device.close()
    string = retstr.getvalue()
    retstr.close()
    return string
def pdf_to_text(pdfname):
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from cStringIO import StringIO

    # PDFMiner boilerplate
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    # codec = 'utf-8'
    codec = 'ascii'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, sio, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Extract text
    fp = file(pdfname, 'rb')
    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)
    fp.close()

    # Get text from StringIO
    text = sio.getvalue()

    # Cleanup
    device.close()
    sio.close()

    return text
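# A minimal usage sketch for pdf_to_text(); 'sample.pdf' is a hypothetical
# filename.
if __name__ == '__main__':
    print pdf_to_text('sample.pdf')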
def fix_text(self, filename):
    # Open a PDF file.
    pdfText = StringIO()
    fp = open(filename, 'rb')
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    # Supply the password for initialization.
    if not self.password:
        document = PDFDocument(parser)
    else:
        document = PDFDocument(parser, self.password)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object.
    device = TextConverter(rsrcmgr, pdfText, codec=self.codec,
                           laparams=LAParams(), imagewriter=None)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
    txt = pdfText.getvalue()
    return txt
    # laparams = LAParams()
    laparams = None
    manager = PDFResourceManager(caching=caching)
    if case == 'txt':
        output = io.StringIO()
        converter = TextConverter(manager, output, codec=codec, laparams=laparams)
    if case == 'HTML':
        output = io.BytesIO()
        converter = HTMLConverter(manager, output, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(manager, converter)
    infile = open(fname, 'rb')
    for index, page in enumerate(PDFPage.get_pages(infile, pagenums,
                                                   caching=caching,
                                                   check_extractable=True)):
        interpreter.process_page(page)
    convertedPDF = output.getvalue()
    infile.close()
    converter.close()
    output.close()
    return convertedPDF


folder = 'd:/usr-profiles/chuang/Desktop/Dev/textmining/2_imf_docs/1_use_xmls/data/pdfs'
dest_folder = 'data/txt'
parser = argparse.ArgumentParser(description='Path of a document')
parser.add_argument('--path', dest='pathoffile')
args = parser.parse_args()
path = args.pathoffile

rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
fp = open(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos = set()
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                              password=password, caching=caching,
                              check_extractable=True):
    interpreter.process_page(page)
text = retstr.getvalue()
print(text)
fp.close()
device.close()
retstr.close()

'''
DOWNLOADING STOPWORDS.
'''
nltk.download('stopwords')

'''
GETTING RID OF UNWANTED PUNCTUATIONS.
'''
def anotate_pdf(file_path, sht, query_dict):
    # Prepare the output file name.
    path = pathlib.Path(file_path).parent
    extension = pathlib.Path(file_path).suffix
    name = pathlib.Path(file_path).name[:-len(extension)]
    result_file = str(path) + '\\' + name + '_highlighted' + extension
    # =========================================================
    # Create a parser object associated with the file object.
    parser = PDFParser(open(file_path, 'rb'))
    # Create a PDFDocument object that stores the document structure.
    doc = PDFDocument(parser)
    # Layout analysis: set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Create the pdf layout - a list with the layout of every page.
    layout = []
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        # Receive the LTPage object for the page.
        layout.append(device.get_result())
    # Add tooltip info; not sure how to use this option in the most useful way.
    m_meta = {"author": "AK", "contents": "HL text1"}
    outputStream = open(result_file, "wb")
    pdfInput = PdfFileReader(open(file_path, 'rb'), strict=True)
    pdfOutput = PdfFileWriter()
    npage = pdfInput.numPages
    for pgn in range(0, npage):
        for query in query_dict:
            all_coor = []
            for page in layout:
                result = get_page_coordinates(page, query)
                all_coor.append(result)
            page_hl = pdfInput.getPage(pgn)
            for item in all_coor[pgn]:
                highlight = create_highlight(item[0], item[1], item[2], item[3],
                                             m_meta, color=query_dict[query])
                highlight_ref = pdfOutput._addObject(highlight)
                if "/Annots" in page_hl:
                    page_hl[NameObject("/Annots")].append(highlight_ref)
                else:
                    page_hl[NameObject("/Annots")] = ArrayObject([highlight_ref])
        pdfOutput.addPage(page_hl)
    # Save the highlights to a new file.
    pdfOutput.write(outputStream)
    outputStream.close()
    sht.range('B2').value = f'File {name + extension} completed'
def convert(
    self,
    source_path: str = None,
    trim: bool = True,
    pagenum: Optional[Union[int, str]] = None,
):
    """Parse source PDF into entities.

    These entities can be used for text searches or XML dumping for
    example. The conversion will be done automatically when using the
    dependent keywords directly.

    :param source_path: source PDF filepath
    :param trim: trim whitespace from the text if set to True (default)
    :param pagenum: page number where search is performed on, defaults to
        `None` (meaning all pages get converted)

    **Examples**

    **Robot Framework**

    .. code-block:: robotframework

        ***Settings***
        Library    RPA.PDF

        ***Tasks***
        Example Keyword
            Convert    /tmp/sample.pdf

    **Python**

    .. code-block:: python

        from RPA.PDF import PDF

        pdf = PDF()

        def example_keyword():
            pdf.convert("/tmp/sample.pdf")
    """
    self.ctx.switch_to_pdf(source_path)
    converted_pages = self.active_pdf_document.has_converted_pages
    if pagenum is not None:
        pagenum = int(pagenum)
        if pagenum in converted_pages:
            return  # specific page already converted
    else:
        pages_count = self.active_pdf_document.reader.getNumPages()
        if len(converted_pages) >= pages_count:
            return  # all pages got converted already

    self.logger.debug(
        "Converting active PDF document page %s on: %s",
        pagenum if pagenum is not None else "<all>",
        self.active_pdf_document.path,
    )
    rsrcmgr = PDFResourceManager()
    if not self.ctx.convert_settings:
        self.set_convert_settings()
    laparams = pdfminer.layout.LAParams(**self.ctx.convert_settings)
    device = Converter(
        self.active_pdf_document,
        rsrcmgr,
        laparams=laparams,
        trim=trim,
        logger=self.logger,
        # Also explicitly set by us when iterating pages for processing.
        pageno=pagenum if pagenum is not None else 1,
    )
    interpreter = pdfminer.pdfinterp.PDFPageInterpreter(rsrcmgr, device)

    # Look at all (nested) objects on each page.
    source_parser = PDFParser(self.active_pdf_document.fileobject)
    source_document = PDFDocument(source_parser)
    source_pages = PDFPage.create_pages(source_document)
    for idx, page in enumerate(source_pages, start=1):
        # Process relevant pages only if instructed like so.
        # (`pagenum` starts from 1 as well)
        if pagenum is None or idx == pagenum:
            if idx not in converted_pages:
                # Skipping converted pages would leave this counter
                # un-incremented, so we set it explicitly.
                device.pageno = idx
                interpreter.process_page(page)
                converted_pages.add(idx)
    device.close()
                # print(o.get_font())
                for c in o._objs:
                    if isinstance(c, pdfminer.layout.LTChar):
                        # print(c, "fontname %s" % c.fontname)
                        break
                i += 1
            # If it's a container, recurse.
            elif isinstance(obj, pdfminer.layout.LTFigure):
                parse_obj(obj._objs)
            else:
                pass


path = "/home/pavan/Downloads/Test2.pdf"
document = createPDFDoc(path)
device, interpreter = createDeviceInterpreter()
pages = PDFPage.create_pages(document)
interpreter.process_page(next(pages))
# layout = device.get_result()
for page in PDFPage.get_pages(open(path, 'rb'), set(), maxpages=0, password="",
                              caching=True, check_extractable=True):
    interpreter.process_page(page)
    layout = device.get_result()
    parse_obj(layout._objs)
def extractHighlights2(filename, anno, verbose=True):
    '''Extract highlighted texts from a PDF

    Extract texts from PDF using pdftotext
    '''
    hlpages = anno.hlpages
    if len(hlpages) == 0:
        return []

    #--------------Get pdfminer instances--------------
    document, interpreter, device = init(filename)

    #----------------Loop through pages----------------
    hltexts = []
    for ii, page in enumerate(PDFPage.create_pages(document)):
        #------------Get highlights in page------------
        if len(hlpages) > 0 and ii + 1 in hlpages:
            annoii = anno.highlights[ii + 1]
            anno_total = len(annoii)
            anno_found = 0
            #------------Merge annos in single line------------
            annoii = mergeLine(annoii)
            #-----------Sort annotations vertically-----------
            annoii = sortAnnoY(annoii)
            interpreter.process_page(page)
            layout = device.get_result()
            page_height = layout.height
            #--------------Sort boxes diagonally--------------
            objs = sortDiag(layout)
            #-----------------Refine ordering-----------------
            objs = fineTuneOrder(objs)
            #----------------Loop through boxes----------------
            for jj, objj in enumerate(objs):
                if not isinstance(objj, (LTTextBox, LTTextBoxHorizontal)):
                    continue
                textjj, numjj = findStrFromBox2(annoii, objj, filename,
                                                page_height)
                if numjj > 0:
                    #--------------Attach text with meta--------------
                    authors = tools.getAuthorList(anno.meta)
                    textjj = Anno(textjj,
                                  ctime=getCtime(annoii),
                                  title=anno.meta['title'],
                                  page=ii + 1,
                                  citationkey=anno.meta['citationkey'],
                                  tags=anno.meta['tags'],
                                  bbox=objj.bbox,
                                  author=authors,
                                  note_author=anno.meta['user_name'])
                    hltexts.append(textjj)
                #----------------Break if all found----------------
                anno_found += numjj
                if anno_total == anno_found:
                    break

    #----------------Number highlights----------------
    for ii, hlii in enumerate(hltexts):
        hlii.num = ii + 1

    return hltexts
def main(argv):
    import getopt

    def usage():
        print(f'usage: {argv[0]} [-P password] [-o output] [-t text|html|xml|tag]'
              ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
              ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
              ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
              ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input options
    password = b''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-P':
            password = v.encode('ascii')
        elif k == '-o':
            outfile = v
        elif k == '-t':
            outtype = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-c':
            encoding = v
        elif k == '-s':
            scale = float(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-S':
            stripcontrol = True
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = open(outfile, 'w', encoding=encoding)
    else:
        outfp = sys.stdout
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                              imagewriter=imagewriter,
                              stripcontrol=stripcontrol)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter, debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp)
    else:
        return usage()
    for fname in args:
        with open(fname, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
    device.close()
    outfp.close()
    return
def extract_text_from_pdf(pdf_path):
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            page_interpreter.process_page(page)
        text = fake_file_handle.getvalue()
    converter.close()
    fake_file_handle.close()
    print(text)

    # Demand / fixed charge
    dem = re.search("Demand/fixed charge.{10}", text)
    dem = dem.group()
    dem = dem.replace("Demand/fixed charge", " ")
    dem = dem.split(".")
    dem = dem[0] + "." + dem[1][:2]
    print(dem)

    # Wheeling charges
    wheel = re.search("Wheeling Charges.{10}", text)
    wheel = wheel.group()
    wheel = wheel.replace("Wheeling Charges", " ")
    wheel = wheel.split(".")
    wheel = wheel[0] + "." + wheel[1][:2]
    print(wheel)

    # Energy charge
    ec = re.search("Energy charge.{10}", text)
    ec = ec.group()
    ec = ec.replace("Energy charge", " ")
    ec = ec.split(".")
    ec = ec[0] + "." + ec[1][:2]
    print(ec)

    # Government electricity duty
    ge = re.search("Government Electricity Duty.{50}", text)
    ge = ge.group()
    ge = ge.replace("Government Electricity Duty", " ")
    ge = ge.split("%")
    ge = ge[1].split(".")
    ge = ge[0] + "." + ge[1][:2]
    print(ge)

    # Maharashtra government tax on sale of electricity
    ma = re.search("Mah.Govt.Tax on sale of electricity.{50}", text)
    ma = ma.group()
    ma = ma.replace("Mah.Govt.Tax on sale of electricity", " ")
    ma = ma.split("unit")
    ma = ma[1].split(".")
    ma = ma[0] + "." + ma[1][:2]
    print(ma)

    # Current month's bill amount
    amt = re.search("Current month's bill amount.{50}", text)
    amt = amt.group()
    amt = amt.replace("Current month's bill amount(A)", " ")
    amt = amt.split(".")
    amt = amt[0] + "." + amt[1][:2]
    print(amt)

    # Digital payment discount
    dp = re.search("Digital Payment Discount.{50}", text)
    dp = dp.group()
    dp = dp.replace("Digital Payment Discount", " ")
    dp = dp.split(".")
    dp = dp[0] + "." + dp[1][:2]
    print(dp)

    # Date of last payment
    pd = re.search("Payment received upto.{30}", text)
    pd = pd.group()
    pd = pd.replace("Payment received upto", " ")
    if "-" in pd:
        pd = pd.split("-")
        pd = pd[0] + "-" + pd[1] + "-" + pd[2][:4]
    else:
        pd = pd.split(".")
        pd = pd[0] + "." + pd[1] + "." + pd[2][:4]
    print(pd)

    # Amount of last payment
    pr = re.search("Payment received upto.{30}", text)
    pr = pr.group()
    pr = pr.replace("Payment received upto", " ")
    pr = pr.replace(pd, " ")
    pr = pr.split(".")
    pr = pr[0] + "." + pr[1][:2]
    print(pr)

    # Current and previous meter reading dates
    md = re.search("Meter reading date.{100}", text)
    md = md.group().split("-")
    md = md[0][-2:] + "-" + md[1] + "-" + md[2][:4]
    print(md)

    pmd = re.search("Meter reading date.{100}", text)
    pmd = pmd.group().split("-")
    pmd = pmd[2][-2:] + "-" + pmd[3] + "-" + pmd[4][:4]
    print(pmd)

    # Security deposit
    sd = re.search("Your security deposit.{50}", text)
    sd = sd.group()
    sd = sd.replace("Your security deposit (SD) with us", " ")
    sd = sd.split(".")
    sd = sd[0] + "." + sd[1][:2]
    print(sd)

    # Total bill amount with DPC
    dpc = re.search("Total bill amount with DPC.{30}", text)
    dpc = dpc.group()
    dpc = dpc.replace("Total bill amount with DPC", " ")
    dpc = dpc.split(".")
    dpc = dpc[0] + "." + dpc[1][:2]
    print(dpc)

    # Contract demand
    CoD = re.search("Contract Demand.{20}", text)
    CoD = CoD.group()
    CoD = CoD.replace("Contract Demand", " ").split(".")
    CoD = CoD[0] + "." + CoD[1][:5]
    print(CoD)

    # Power factor penalty/incentive
    PF = re.search("Power Factor.{40}", text)
    PF = PF.group().replace("Power Factor (PF) penalty/incentive", " ").split(".")
    PF = PF[0] + "." + PF[1][:2]
    print(PF)

    # Meter number and multiplying factor
    number = re.search("Meter No..{7}", text)
    number = number.group().replace("Meter No.", " ")
    print(number)

    mf = re.search("Multiplying Factor.{1}", text)
    mf = mf.group().replace("Multiplying Factor", " ")
    print(mf)

    # Present and previous energy readings
    redpr = re.search("Energy consumptionReadingPresent.{50}", text)
    redprk = redpr.group().replace("Energy consumptionReadingPresent", " ").split(".")
    z = redprk
    z = z[0] + "." + z[1][:2]
    print(z)

    redprv = re.search("Energy consumptionReadingPresent.{50}", text)
    redprv = redprv.group().replace("Energy consumptionReadingPresent", " ")
    redprv = redprv.replace(z, " ")
    redprv = redprv.replace("Previous", " ").split(".")
    y = redprv
    y = y[0] + "." + y[1][:2]
    print(y)

    # Total consumption
    total = re.search(" Factor1Energy consumption.{100}", text)
    total = total.group().split("TOD")
    total = total[0].split("(kWh)")
    total = total[1]
    n = len(total)
    m = int(n / 2)
    total = total[-m:]
    # total = "".join(total).split("(kWh)")
    print(total)
def main(argv):
    import getopt

    def usage():
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] '
               '[-o output] [-C] [-n] [-A] [-V] [-M char_margin] '
               '[-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] '
               '[-c codec] [-s scale] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = file(outfile, 'w')
    else:
        outfp = sys.stdout
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                              imagewriter=imagewriter)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return usage()
    for fname in args:
        fp = file(fname, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)
        fp.close()
    device.close()
    outfp.close()
    return
def processor_pdfminersix(self, pathfile):
    '''Return the list of PDFPage objects in the given file.'''
    self._tmp_file = open(str(pathfile), 'rb')
    return [x for x in PDFPage.get_pages(self._tmp_file)]
listadoarchivos = listadoarchivos[::-1]
dfrow_list = []  # temporary storage for the data before adding it to the DataFrame
csv_file = "provincias.csv"

for pdf_file in listadoarchivos:
    fp = open(pdf_file, 'rb')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.get_pages(fp)

    fecha = re.split('_|-', pdf_file)
    if "matutino" in pdf_file:
        fecha[0] = str(int(fecha[0]) - 1)
        if len(fecha[0]) == 1:
            fecha[0] = "0" + str(fecha[0])
    fecha = fecha[0] + "/" + fecha[1] + "/" + fecha[2]

    for page in pages:
        interpreter.process_page(page)
        layout = device.get_result()
        for lobj in layout:
            if isinstance(lobj, LTTextBox):
                paragraph = lobj.get_text()
                paragraph = paragraph.split("\n")
def print_and_write(txt):
    print(txt)
    # outputfile.write(txt)
    # output_txt.write('\n')


outputfile.close()

with open(sys.argv[1], 'rb') as f:
    # Pass the file object to PDFPage.get_pages() to iterate over the PDFPage
    # objects in order. For slow files, pass the keyword argument `pagenos`
    # with a list of the (0-based) page numbers to process.
    rank = False
    lanes = []
    thisHeat = ""
    number = 1
    semi = False
    seminumber = 1
    for page in PDFPage.get_pages(f):
        # print_and_write('\n====== page break ======\n')
        interpreter.process_page(page)  # Process the page.
        layout = device.get_result()  # Get the LTPage object.
        # Get the list of text boxes in the page.
        boxes = find_textboxes_recursively(layout)
        # Sort the text boxes by the coordinates of their top-left corners.
        # y1 grows upward, so its sign is flipped.
        boxes.sort(key=lambda b: (-b.y1, b.x0))
        for box in boxes:
            text = box.get_text()
            if prefix != "":
                if "Heat" in text and "of" in text:
def get_text_from_pdf(pdfname, caption, skip_header, skip_footer):
    # Read the PDF.
    fp = open(pdfname, 'rb')
    texts = []
    for page in tqdm(PDFPage.get_pages(fp, pagenos=None, maxpages=0,
                                       password=None, caching=True,
                                       check_extractable=True)):
        rsrcmgr = PDFResourceManager()
        out_fp = StringIO()
        la_params = LAParams()
        la_params.detect_vertical = True
        device = TextConverter(rsrcmgr, out_fp, codec='utf-8',
                               laparams=la_params)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        interpreter.process_page(page)
        texts.append(out_fp.getvalue())
        device.close()
        out_fp.close()
    fp.close()

    output = ""
    # Clean up and reflow the text.
    for text in tqdm(texts):
        lines = text.splitlines()
        replace_strs = [b'\x00']  # UTF-8 characters to strip
        new_lines = []
        for line in lines:
            line_utf8 = line.encode('utf-8')
            for replace_str in replace_strs:
                line_utf8 = line_utf8.replace(replace_str, b'')
            line = line_utf8.decode()
            line = re.sub("[ ]+", " ", line)  # collapse runs of spaces
            line = line.strip()
            if len(line) == 0:
                continue  # skip empty lines
            if is_float(line):
                continue  # skip lines that are only numbers
            new_lines.append(line)
        for index in range(len(new_lines)):
            if index == 0 and skip_header:
                continue
            if index == len(new_lines) - 1 and skip_footer:
                continue
            line = new_lines[index]
            # Break after headings.
            if is_float(line.split(".")[0]) and \
                    len(line.split()) < caption and (not line.endswith(".")):
                output += str(line)
                output += "\r\n"
                continue
            if line.endswith("."):
                output += str(line)
                output += "\r\n"
            elif line.endswith("-"):
                # Continuation of the previous line.
                output += str(line[:-1])
            elif line.endswith(":"):
                # An equation follows.
                output += str(line)
                output += "\r\n"
            else:
                # Otherwise insert a space as a word boundary.
                output += str(line)
                output += " "
    return output
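# A minimal usage sketch for get_text_from_pdf(); the path and the caption
# word-count threshold are hypothetical.
text = get_text_from_pdf('paper.pdf', caption=10, skip_header=True,
                         skip_footer=True)
print(text[:500])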
def pdf2txt(self):
    '''
    =============================
    return : str, text file path
    '''
    # input
    password = ''
    pagenos = set()
    maxpages = 0
    # output
    imagewriter = None
    rotation = 0
    codec = 'UTF-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    infp = open(self.input_path, "rb")
    if self.output_path is None:
        self.output_path = self.input_path[:-4] + '_trans.txt'
    outfp = open(self.output_path, "w", encoding='UTF8')
    # Page total count.
    parser = PDFParser(infp)
    document = PDFDocument(parser)
    page_total_num = resolve1(document.catalog['Pages'])['Count']
    rsrcmgr = PDFResourceManager(caching=caching)
    # pdf -> text converter
    device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                           imagewriter=imagewriter)
    # pdf -> text interpreter
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # pdf -> text start
    with tqdm(total=page_total_num) as pbar:
        for page in PDFPage.get_pages(infp, pagenos, maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
            pbar.update(1)
    print('[INFO] pdf -> text')
    outfp.close()
    infp.close()
    return self.output_path
# Create a PDF document object that stores the document structure.
document = PDFDocument(parser)
# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()
# Create a PDF device object.
device = PDFDevice(rsrcmgr)
# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)
# Process each page contained in the document.
for page in PDFPage.create_pages(document):
    interpreter.process_page(page)

from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator

# Set parameters for analysis.
laparams = LAParams()
# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.create_pages(document):
    interpreter.process_page(page)
    # Receive the LTPage object for the page.
    layout = device.get_result()
def parse():
    fp = open(path, 'rb')  # open in binary read mode
    # Create a PDF parser from the file object.
    praser = PDFParser(fp)
    # Create a PDF document.
    doc = PDFDocument(praser)
    # Connect the parser and the document object.
    praser.set_document(doc)
    # Create a PDF resource manager to manage shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object.
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Loop over the pages, processing one page at a time.
    wb = Workbook()  # create a new Excel workbook
    ws = wb.active
    # Track the row count across pages.
    text_number = 0
    for page in PDFPage.create_pages(doc):  # doc.get_pages() gets the page list
        interpreter.process_page(page)
        # layout is an LTPage object holding the objects parsed from this
        # page: LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal and so on.
        # Use an object's text attribute to get its text.
        layout = device.get_result()
        page_container = []  # every string dict of this page
        page_rows = []  # row position data
        for text_box in layout:
            if isinstance(text_box, LTTextBox):
                # Get each line.
                for text_line in text_box:
                    if isinstance(text_line, LTTextLine):
                        # Get each character.
                        temp = []  # characters collected so far
                        temp_loc = []  # position of the string
                        isfirst = True  # first character of the string?
                        for text_index in text_line:
                            # If it is character data, keep updating temp and
                            # temp_loc.
                            if isinstance(text_index, LTChar):
                                temp.append(text_index.get_text())
                                if isfirst:
                                    temp_loc.append(round(text_index.bbox[0], 3))
                                    temp_loc.append(round(text_index.bbox[1], 3))
                                    temp_loc.append(round(text_index.bbox[2], 3))
                                    temp_loc.append(round(text_index.bbox[3], 3))
                                    isfirst = False
                                temp_loc[2] = round(text_index.bbox[2], 3)
                                temp_loc[3] = round(text_index.bbox[3], 3)
                            # If it is an LTText separator, insert the built
                            # string at the right place in page_container, then
                            # reset temp, temp_loc and isfirst.
                            elif isinstance(text_index, LTText):
                                # If page_rows has no position data for this
                                # row, insert the data into page_container and
                                # page_rows.
                                if is_not_in(page_rows, temp_loc[1]):
                                    insert_loc = insert_into_page_rows(
                                        page_rows, temp_loc[1])
                                    page_container.insert(insert_loc, [{
                                        'value': ''.join(temp),
                                        'location': temp_loc
                                    }])
                                # If the row already exists.
                                elif not is_not_in(page_rows, temp_loc[1]):
                                    loc = get_page_rows_loc(
                                        page_rows, temp_loc[1])
                                    temp_list = insert_into_page_container(
                                        page_container[loc], {
                                            'value': ''.join(temp),
                                            'location': temp_loc
                                        })
                                    page_container[loc] = temp_list[:]
                                temp = []
                                temp_loc = []
                                isfirst = True
        rows_num = len(page_container)
        # Realign the last row.
        if len(page_container[rows_num - 1]) != len(page_container[rows_num - 2]):
            loc_for_no2 = []
            loc_for_no1 = []
            adjust_for_no1 = []
            temp_array = page_container[rows_num - 1][:]
            for i in page_container[rows_num - 2]:
                loc_for_no2.append([i['location'][0], i['location'][2]])
            for i in page_container[rows_num - 1]:
                loc_for_no1.append([i['location'][0], i['location'][2]])
            for i in range(len(loc_for_no1)):
                for j in range(len(loc_for_no2)):
                    if not (loc_for_no1[i][0] > loc_for_no2[j][1]
                            or loc_for_no1[i][1] < loc_for_no2[j][0]):
                        adjust_for_no1.append(j)
                        break
            page_container[rows_num - 1] = []
            for i in range(len(page_container[rows_num - 2])):
                if i in adjust_for_no1:
                    page_container[rows_num - 1].append(
                        temp_array[adjust_for_no1.index(i)])
                else:
                    page_container[rows_num - 1].append(None)
        # Realign the first five rows.
        if len(page_container[0]) != len(page_container[1]) or \
                len(page_container[1]) != len(page_container[2]) or \
                len(page_container[2]) != len(page_container[3]) or \
                len(page_container[3]) != len(page_container[4]):
            rows_length = []
            the_max_row = []
            new_max_row = []
            for i in range(6):
                rows_length.append(len(page_container[i]))
            max_length = max(rows_length)
            the_max_row = page_container[rows_length.index(max_length)][:]
            for i in range(len(rows_length)):
                if rows_length[i] < max_length:
                    page_container[i] = align_row(the_max_row, page_container[i])
        # Detect the table header.
        # Print for verification.
        for i in range(len(page_container)):
            for j in range(len(page_container[i])):
                print(page_container[i][j])
        # After collecting this page's data, write it to Excel.
        for i in range(len(page_container)):
            for j in range(len(page_container[i])):
                cell_index = ws.cell(row=i + 1 + text_number, column=j + 1)
                if page_container[i][j] is None:
                    cell_index.value = ' '
                else:
                    cell_index.value = page_container[i][j]['value']
        # Update text_number so the data stays contiguous across pages.
        text_number += rows_num
    wb.save(r'C:\Users\15644\Desktop\pdf_file\test_pdf_list\test_1.xlsx')
def scientific_analysis(password, path, title, topn):
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    from io import StringIO

    print('Converting pdf to text ...')
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password_pdf = ""
    maxpages = 0
    caching = True
    pagenos = set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                  password=password_pdf, caching=caching,
                                  check_extractable=True):
        interpreter.process_page(page)
    text = retstr.getvalue()
    fp.close()
    device.close()
    retstr.close()

    # Normalize hyphenated line breaks, curly quotes and the 'fl' ligature.
    text = text.replace('-\n', '').replace('’', "'").replace('inﬂ', 'infl')
    lines = text.split('\n')
    lines_section_ids_dict = {}
    lines_section_ids = []
    for i, line in enumerate(lines[1:-2]):
        if len(lines[i - 1]) == 0 and len(lines[i + 1]) == 0 and \
                len(lines[i]) > 3 and not str(lines[i]).isdigit():
            lines_section_ids_dict[i] = lines[i]
            lines_section_ids.append(i)
    data = []
    for id in lines_section_ids_dict:
        data.append((lines_section_ids_dict[id], id))
    data = dict(data)
    final_data = {}
    new_txt = ''
    try:
        ref_id = data['References']
    except KeyError:
        ref_id = len(lines) - 1
    for i, id in enumerate(lines_section_ids):
        if i < len(lines_section_ids) - 1 and id < ref_id:
            start = lines_section_ids[i]
            end = lines_section_ids[i + 1]
            interval_lines = lines[start + 1:end]
            interval_lines_txt = ' '.join(interval_lines)
            if 'Abbreviations' not in lines_section_ids_dict[start] and \
                    '18 of 36' not in lines_section_ids_dict[start]:
                new_txt += interval_lines_txt
                if interval_lines and len(interval_lines_txt) > 100:
                    final_data[lines_section_ids_dict[start]] = ' '.join(
                        interval_lines)
    final_data['paper_title'] = title
    final_data['full_text'] = new_txt
    final_data['topn'] = topn
    print('Uploading text ...')
    response = requests.post(
        'http://tzagerlib1-env.eba-wjp8tqpj.eu-west-2.elasticbeanstalk.com/scientific_analysis/'
        + password,
        json=json.dumps(final_data))
    if response.status_code == 200:
        data = dict(response.json())
    else:
        data = {'error': response.status_code}
    data = dict(data)
    return data
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
import io
import os

data_dir = '/home/lsy2018/媛媛/data/人工智能教师/'

resource_manager = PDFResourceManager()
fake_file_handle = io.StringIO()
converter = TextConverter(resource_manager, fake_file_handle)
page_interpreter = PDFPageInterpreter(resource_manager, converter)

file_list = os.listdir(data_dir)
for each_file in file_list:
    file_name = os.path.join(data_dir, each_file)
    print(file_name)
    with open(file_name, 'rb') as fh:
        for page in PDFPage.get_pages(fh, check_extractable=True):
            page_interpreter.process_page(page)
        text = fake_file_handle.getvalue()
        print(text)
    exit()
def parse_obj(lt_objs, texts):
    # Loop over the object list.
    for obj in lt_objs:
        if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
            texts.append([obj.bbox[0], obj.bbox[1], obj.get_text()])
        # If it's a container, recurse.
        elif isinstance(obj, pdfminer.layout.LTFigure):
            parse_obj(obj._objs, texts)


i = 0
# Loop over all pages in the document.
for i, page in enumerate(PDFPage.create_pages(document)):
    p = PageState()
    # Read the page into a layout object.
    interpreter.process_page(page)
    layout = device.get_result()
    texts = []
    # Extract texts from this object.
    parse_obj(layout._objs, texts)
    # Sort text on first x, then y coordinate.
    texts.sort(key=lambda a: a[0], reverse=False)
    texts.sort(key=lambda a: a[1], reverse=True)
def pdf_to_csv(filename):
    regexMålestasjon = re.compile(r"(MSA|MSB)")
    regexLinje = re.compile("linje " + r"[0-9]{1}[0-1]?")
    regexOljetype = re.compile(r"[0-9]{2}:[0-9]{2}" + "(14|1)")
    with open(filename, 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            resource_manager = PDFResourceManager()
            fake_file_handler = io.StringIO()
            converter = TextConverter(resource_manager, fake_file_handler)
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            page_interpreter.process_page(page)
            text = fake_file_handler.getvalue()
            converter.close()
            fake_file_handler.close()
    Målestasjon = re.search(regexMålestasjon, text)
    if Målestasjon is None:
        return
    matchMålestasjon = Målestasjon.group(0)
    Linje = re.search(regexLinje, text)
    if Linje is None:
        return
    matchLinje = Linje.group(0)
    oljetype = re.search(regexOljetype, text)
    if oljetype is None:
        return
    matchOljetype_mau = oljetype.group(0)
    if matchOljetype_mau.endswith("4"):
        matchOljetype = "14"
    if matchOljetype_mau.endswith("1"):
        matchOljetype = "1"
    if matchOljetype_mau.endswith("0"):
        return
    newfilename = (str(matchMålestasjon) + "_" + str(matchLinje) + "_" +
                   matchOljetype + ".pdf")
    location = str(matchMålestasjon) + "_" + str(matchOljetype)
    try:
        os.rename(filename, newfilename)
        shutil.move(newfilename, location)
    except IOError:
        print(f"failed:\nOld filename: {filename}\nNew filename: {newfilename}")
from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

output_string = StringIO()
with open('simple1.pdf', 'rb') as in_file:
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)

print(output_string.getvalue())
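# For comparison: recent versions of pdfminer.six ship a high-level helper
# that performs the same parser/device/interpreter wiring internally. A
# minimal sketch, assuming a reasonably recent pdfminer.six is installed;
# the alias avoids clashing with the extract_text functions defined below.
from pdfminer.high_level import extract_text as pm_extract_text

print(pm_extract_text('simple1.pdf'))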
import io

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFSyntaxError


def extract_text_from_pdf(pdf_path):
    '''
    Helper function to extract the plain text from .pdf files

    :param pdf_path: path to PDF file to be extracted (remote or local)
    :return: iterator of strings of extracted text, one per page
    '''
    # https://www.blog.pythonlibrary.org/2018/05/03/exporting-data-from-pdfs-with-python/
    if not isinstance(pdf_path, io.BytesIO):
        # extract text from local pdf file
        with open(pdf_path, 'rb') as fh:
            try:
                for page in PDFPage.get_pages(fh, caching=True,
                                              check_extractable=True):
                    resource_manager = PDFResourceManager()
                    fake_file_handle = io.StringIO()
                    converter = TextConverter(resource_manager,
                                              fake_file_handle,
                                              codec='utf-8',
                                              laparams=LAParams())
                    page_interpreter = PDFPageInterpreter(resource_manager,
                                                          converter)
                    page_interpreter.process_page(page)
                    text = fake_file_handle.getvalue()
                    yield text
                    # close open handles
                    converter.close()
                    fake_file_handle.close()
            except PDFSyntaxError:
                return
    else:
        # extract text from remote (in-memory) pdf file
        try:
            for page in PDFPage.get_pages(pdf_path, caching=True,
                                          check_extractable=True):
                resource_manager = PDFResourceManager()
                fake_file_handle = io.StringIO()
                converter = TextConverter(resource_manager,
                                          fake_file_handle,
                                          codec='utf-8',
                                          laparams=LAParams())
                page_interpreter = PDFPageInterpreter(resource_manager,
                                                      converter)
                page_interpreter.process_page(page)
                text = fake_file_handle.getvalue()
                yield text
                # close open handles
                converter.close()
                fake_file_handle.close()
        except PDFSyntaxError:
            return
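# A minimal usage sketch for the generator above; 'resume.pdf' is an
# illustrative filename. Each iteration yields one page's text, so joining
# the iterator reassembles the whole document.
pages = extract_text_from_pdf('resume.pdf')
full_text = '\n'.join(pages)
print(full_text[:200])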
def parse():
    with open("schedule/{}".format(cfg.get("schedule_file")), "rb") as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        def parse_obj(lt_objs):
            for obj in lt_objs:
                if isinstance(obj, LTTextBoxHorizontal):
                    coor = getTextCoords(obj.bbox[0:2])
                    text = obj.get_text().replace('\n', ' ')
                    # check if content contains a date
                    match = re.search(r"\d{2}/\d{2}/\d{4}", text)
                    if match:
                        data["dates"].append({
                            "date": match.group(),
                            "coords": coor
                        })
                    match = re.findall(r"\d{1,2}:\d{2}", text)
                    if match:
                        data["hours"].append({
                            "hours": list(map(lambda x: "{0:0>5}".format(x), match)),
                            "coords": coor
                        })
                    data["textboxes"].append([coor, text, ""])
                if isinstance(obj, LTRect):
                    data["rects"].append(getRectCoords(obj.bbox[0:4]))
                if isinstance(obj, LTFigure):
                    parse_obj(obj._objs)

        if LOG_TEXTS:
            # truncate the log file; per-page entries are appended below
            with open("outputs/" + cfg.get("folder") + "/pdf_texts.txt",
                      "w", encoding="utf8") as log:
                log.write("")

        with open("outputs/" + cfg.get("folder") + "/pdf_svg.html",
                  "w", encoding="utf8") as svg:
            ''' SVG HEAD '''
            if CREATE_SVG:
                svg.write(
                    "<style type=\"text/css\">svg{stroke:#000;stroke-width:1;fill:none}</style>\n"
                )
            i = 0
            # loop over all pages in the document
            for page in PDFPage.create_pages(document):
                # read the page into a layout object
                interpreter.process_page(page)
                layout = device.get_result()
                ''' CREATE SVG '''
                if CREATE_SVG:
                    svg.write("<svg id=\"s{}\" width=\"1200\" height=\"600\">\n".format(i))
                data["rects"] = []
                data["textboxes"] = []
                data["dates"] = []
                data["datelines"] = []
                data["hours"] = []
                # extract info from this page
                parse_obj(layout._objs)
                lines = rectsToLines(data["rects"])
                lines = mergeLines(lines)
                lines.sort(key=lambda x: x[1][1])
                lines.sort(key=lambda x: x[0][1])
                grid = createGrid(lines)
                data["textboxes"] = mergeTexts(grid, data["textboxes"])
                data["textboxes"] = splitSimultaneousCourses(data["textboxes"])
                data["hours"].sort(key=lambda x: x["coords"][1])
                if data["hours"]:
                    calcHourBoundaries(grid)
                if data["dates"]:
                    calcDateBoundaries(grid)
                # keyword matching for each textbox
                for t in data["textboxes"]:
                    t[1] = " ".join(t[1].split())
                    res = keywords.match(format_text(t[1]))
                    if len(res["indexes"]) == 1:
                        data["courses"][res["indexes"][0]] = {
                            "coords": t[0],
                            "date": getDate(t[0])
                        }
                        t[2] = " (match: {})".format(res["titles"][0])
                ''' DRAW LINES '''
                if CREATE_SVG:
                    minX, maxX = 1e10, 0
                    for l in lines:
                        svg.write(
                            "<line x1=\"{}\" y1=\"{}\" x2=\"{}\" y2=\"{}\" stroke=\"#{}\"></line>\n"
                            .format(l[0][0], l[0][1], l[1][0], l[1][1], randomColor()))
                        if l[0][0] < minX:
                            minX = l[0][0]
                        if l[1][0] > maxX:
                            maxX = l[1][0]
                    if SHOW_DATELINES:
                        for h in data["hours"]:
                            svg.write(
                                "<circle cx=\"{}\" cy=\"{}\" r=\"1\" stroke=\"red\"></circle>\n"
                                .format(h["coords"][0], h["coords"][1]))
                        for d in data["dates"]:
                            if d["boundaries"][0] != 0 and d["boundaries"][1] != 0:
                                svg.write(
                                    "<line x1=\"{}\" y1=\"{}\" x2=\"{}\" y2=\"{}\" stroke=\"#111111\"></line>\n"
                                    .format(minX, d["boundaries"][0], maxX, d["boundaries"][0]))
                                svg.write(
                                    "<line x1=\"{}\" y1=\"{}\" x2=\"{}\" y2=\"{}\" stroke=\"#111111\"></line>\n"
                                    .format(minX, d["boundaries"][1], maxX, d["boundaries"][1]))
                    if SHOW_TEXTBOXES:
                        for t in data["textboxes"]:
                            svg.write(
                                "<text x=\"{}\" y=\"{}\" font-size=\"4\" font-weight=\"lighter\">{}</text>\n"
                                .format(t[0][0], t[0][1], t[1][:5]))
                if LOG_TEXTS:
                    with open("outputs/" + cfg.get("folder") + "/pdf_texts.txt",
                              "a", encoding="utf8") as log:
                        for t in data["textboxes"]:
                            log.write("{}, {}, {}{}\n".format(t[0][0], t[0][1], t[1], t[2]))
                ''' CLOSE SVG '''
                if CREATE_SVG:
                    svg.write('</svg>' + "\n")
                i += 1

    coursedates = {}
    for key, c in data["courses"].items():
        coursedates[key] = c["date"]
    write(coursedates)
import pandas as pd
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser


def extract_text(my_file):
    """Pulling text boxes out of PDFs.
    First half of this defn copies off the internet."""
    try:
        password = ""
        extracted_text_plus = []
        # Open and read the pdf file in binary mode
        fp = open(my_file, "rb")
        # Create parser object to parse the pdf content
        parser = PDFParser(fp)
        # Store the parsed content in PDFDocument object
        document = PDFDocument(parser, password)
        # Create PDFResourceManager object that stores shared resources such as fonts or images
        rsrcmgr = PDFResourceManager()
        # set parameters for analysis
        laparams = LAParams()
        # Use a page aggregator device to get LT object elements
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create interpreter object to process page content from PDFDocument;
        # it needs the resource manager and the device
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Now that we have everything to process a pdf document, process it page by page
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # The device renders the layout from the interpreter
            layout = device.get_result()
            # Of the many LT objects within the layout, keep LTTextBox and LTTextLine
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    extracted_text_plus.append(lt_obj)
        # close the pdf file
        fp.close()

        ### Finally getting to my contributions. ###
        # Headlines are assumed to be large text. By comparing the number of
        # lines of text in a textbox with the height of the textbox, the
        # average size of the text can be found. Text that's larger than
        # average is kept.
        rows = []
        for n, obj in enumerate(extracted_text_plus):
            # (The original recovered these values by splitting repr(obj);
            # reading obj.bbox and obj.get_text() directly is equivalent
            # and far less fragile.)
            a, b, c, d = obj.bbox
            text = obj.get_text()
            rows.append({
                'cords': ' '.join(str(v) for v in obj.bbox),
                'num': n,
                'height': d - b,
                'width': c - a,
                'TL_X': a,
                'TL_Y': b,
                'text': text,
                'newlines': text.count('\n'),
            })
        df = pd.DataFrame(rows)
        # Average line height within each textbox; larger values suggest headlines.
        df['text height'] = df['height'] / df['newlines']
        return df
    except Exception:
        pass
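# An illustrative follow-up (not in the original): given the frame returned
# above, keep only the boxes whose per-line text height is above the page
# average, which is exactly the headline heuristic the comments describe.
# 'frontpage.pdf' is a placeholder filename.
df = extract_text('frontpage.pdf')
if df is not None and len(df):
    headlines = df[df['text height'] > df['text height'].mean()]
    print(headlines['text'])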
def cas_pdf_to_text(filename: Union[str, io.IOBase], password) -> PartialCASData:
    """
    Parse CAS pdf and returns line data.

    :param filename: CAS pdf file (CAMS or Kfintech)
    :param password: CAS pdf password
    :return: array of lines from the CAS.
    """
    file_type: Optional[FileType] = None

    if isinstance(filename, str):
        fp = open(filename, "rb")
    elif isinstance(filename, io.IOBase):
        fp = filename
    elif hasattr(filename, "read"):  # compatibility for Django UploadedFile
        fp = filename
    else:
        raise CASParseError(
            "Invalid input. filename should be a string or a file like object")

    with fp:
        pdf_parser = PDFParser(fp)
        try:
            document = PDFDocument(pdf_parser, password=password)
        except PDFPasswordIncorrect:
            raise CASParseError("Incorrect PDF password!")
        except PDFSyntaxError:
            raise CASParseError("Unhandled error while opening file")

        line_margin = {
            FileType.KFINTECH: 0.1,
            FileType.CAMS: 0.2
        }.get(detect_pdf_source(document), 0.2)

        rsrc_mgr = PDFResourceManager()
        laparams = LAParams(line_margin=line_margin, detect_vertical=True)
        device = PDFPageAggregator(rsrc_mgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrc_mgr, device)

        pages: List[Iterator[LTTextBoxHorizontal]] = []
        investor_info = None

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            text_elements = filter(
                lambda x: isinstance(x, LTTextBoxHorizontal), layout)
            if file_type is None:
                for el in filter(lambda x: isinstance(x, LTTextBoxVertical), layout):
                    if re.search("CAMSCASWS", el.get_text()):
                        file_type = FileType.CAMS
                    if re.search("KFINCASWS", el.get_text()):
                        file_type = FileType.KFINTECH
            if investor_info is None:
                investor_info = parse_investor_info(layout, *page.mediabox[2:])
            pages.append(text_elements)

        lines = group_similar_rows(pages)
        return PartialCASData(file_type=file_type,
                              investor_info=investor_info,
                              lines=lines)
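# A hedged usage sketch: cas_pdf_to_text, FileType, and PartialCASData come
# from the source, while the filename and password here are placeholders.
cas_data = cas_pdf_to_text('cas_statement.pdf', password='STATEMENT_PASSWORD')
print(cas_data.file_type)            # FileType.CAMS or FileType.KFINTECH
print(len(cas_data.lines), 'lines')  # grouped text rows for downstream parsing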
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams, LTTextBoxHorizontal
from pdfminer.converter import PDFPageAggregator

document = open('../sample_surat2.pdf', 'rb')

# Create resource manager
rsrcmgr = PDFResourceManager()
# Set parameters for analysis.
laparams = LAParams()
# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.get_pages(document):
    interpreter.process_page(page)
    # receive the LTPage object for the page.
    layout = device.get_result()
    for element in layout:
        if isinstance(element, LTTextBoxHorizontal):  # was `instanceof`, which is not Python
            print(element.get_text())
def pdf_to_csv(filename, separator, threshold):
    import os
    from collections import defaultdict

    from pdfminer.converter import LTChar, TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage

    class CsvConverter(TextConverter):
        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)
            self.separator = separator
            self.threshold = threshold

        def end_page(self, i):
            lines = defaultdict(lambda: {})
            for child in self.cur_item._objs:  # <-- changed
                if isinstance(child, LTChar):
                    (_, _, x, y) = child.bbox
                    line = lines[int(-y)]
                    # <-- changed: keep the str; encoding to bytes here broke
                    # the str join in line_creator under Python 3
                    line[x] = child._text
            for y in sorted(lines.keys()):
                line = lines[y]
                self.outfp.write(self.line_creator(line))
                self.outfp.write("\n")

        def line_creator(self, line):
            keys = sorted(line.keys())
            # calculate the average distance between each character on this row
            average_distance = sum(
                keys[i] - keys[i - 1] for i in range(1, len(keys))) / len(keys)
            # append the first character to the result
            result = [line[keys[0]]]
            for i in range(1, len(keys)):
                # if the gap to the previous character is greater than
                # average*threshold, insert the separator at that position
                if (keys[i] - keys[i - 1]) > average_distance * self.threshold:
                    result.append(self.separator)
                # append the character
                result.append(line[keys[i]])
            return ''.join(result)

    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    ft = os.path.join('txt', filename + '.txt')
    outfp = open(ft, 'w')
    # because my test documents are utf-8 (note: utf-8 is the default codec)
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    fp = open(filename, 'rb')
    interpreter = PDFPageInterpreter(rsrc, device)
    for i, page in enumerate(PDFPage.get_pages(fp)):
        outfp.write("START PAGE %d\n" % i)
        if page is not None:
            interpreter.process_page(page)
        else:
            print('none')
        outfp.write("END PAGE %d\n" % i)
    device.close()
    fp.close()
    outfp.close()
    return 0
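# A minimal usage sketch (not from the source); it assumes a local 'txt'
# directory exists for the output. A threshold around 1.5-2 is a reasonable
# starting point: a gap must exceed that multiple of the row's average
# inter-character distance before a separator is inserted.
pdf_to_csv('table_report.pdf', separator=';', threshold=1.5)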
def extract_text(files=[], outfile='-',
                 _py2_no_more_posargs=None,  # Bloody Python2 needs a shim
                 no_laparams=False, all_texts=None, detect_vertical=None,  # LAParams
                 word_margin=None, char_margin=None, line_margin=None,
                 boxes_flow=None,  # LAParams
                 output_type='text', codec='utf-8', strip_control=False,
                 maxpages=0, page_numbers=None, password="", scale=1.0,
                 rotation=0, layoutmode='normal', output_dir=None,
                 debug=False, disable_caching=False, **other):
    if _py2_no_more_posargs is not None:
        raise ValueError("Too many positional arguments passed.")
    if not files:
        raise ValueError("Must provide files to work upon!")

    # If any LAParams group arguments were passed, create an LAParams object
    # and populate it with the given args. Otherwise, set it to None.
    if not no_laparams:
        laparams = pdfminer.layout.LAParams()
        for param in ("all_texts", "detect_vertical", "word_margin",
                      "char_margin", "line_margin", "boxes_flow"):
            paramv = locals().get(param, None)
            if paramv is not None:
                setattr(laparams, param, paramv)
    else:
        laparams = None

    imagewriter = None
    if output_dir:
        imagewriter = ImageWriter(output_dir)

    if output_type == "text" and outfile != "-":
        for override, alttype in ((".htm", "html"), (".html", "html"),
                                  (".xml", "xml"), (".tag", "tag")):
            if outfile.endswith(override):
                output_type = alttype

    if outfile == "-":
        outfp = sys.stdout
        if outfp.encoding is not None:
            codec = 'utf-8'
    else:
        outfp = open(outfile, "wb")

    for fname in files:
        with open(fname, "rb") as infp:
            # pdfminer.high_level.extract_text_to_fp(fp, **locals())
            rsrcmgr = PDFResourceManager_new(caching=not disable_caching)  # alias defined elsewhere in the source
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams, imagewriter=imagewriter)
            if outfp == sys.stdout:
                outfp = sys.stdout.buffer
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(infp, page_numbers,
                                          maxpages=maxpages,
                                          password=password,
                                          caching=not disable_caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)

            # Second pass: aggregate the layout and print horizontal text boxes.
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(infp):
                interpreter.process_page(page)
                # receive the LTPage object for the page.
                layout = device.get_result()
                for element in layout:
                    if isinstance(element, LTTextBoxHorizontal):
                        print(element.get_text())
    return outfp
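# Illustrative call (not from the source): write plain text for one PDF to a
# file; note that output_type is inferred from the outfile extension when it
# ends in .htm, .html, .xml, or .tag.
extract_text(files=['report.pdf'], outfile='report.txt')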
def collect_events(helper, ew):
    if helper.get_arg('endpoint') == 'worldwide':
        graph_url = 'https://graph.microsoft.com/v1.0'
    elif helper.get_arg('endpoint') == 'gcchigh':
        graph_url = 'https://graph.microsoft.us/v1.0'
    access_token = _get_access_token(helper)
    headers = {
        "Authorization": "Bearer " + access_token,
        "User-Agent": "MicrosoftGraphEmail-Splunk/" + _get_app_version(helper)
    }
    # "Prefer": "outlook.body-content-type=text"}

    # defining email account to retrieve messages from
    endpoint = "/users/" + helper.get_arg('audit_email_account')
    # defining inbox id to retrieve messages from
    endpoint += "/mailFolders/inbox/messages/"
    # expanding property id 0x0E08 to gather message size, and then expanding
    # attachments to get fileattachment type contentBytes
    endpoint += "?$expand=SingleValueExtendedProperties($filter=Id eq 'LONG 0x0E08'),attachments"
    # selecting which fields to retrieve from emails
    endpoint += "&$select=receivedDateTime,subject,sender,from,hasAttachments,internetMessageId,toRecipients,ccRecipients,bccRecipients,replyTo,internetMessageHeaders,body,bodyPreview,isReadReceiptRequested,isDeliveryReceiptRequested"
    # defining how many messages to retrieve from each page
    endpoint += "&$top=980"
    # getting the oldest messages first
    endpoint += "&$orderby=receivedDateTime"
    # getting the total count of messages in each round
    endpoint += "&$count=true"

    messages_response = helper.send_http_request(
        graph_url + endpoint, "GET", headers=headers, parameters=None,
        timeout=(15.0, 15.0)).json()
    helper.log_info("Retrieving " + str(messages_response['@odata.count']) + " messages")

    messages = []
    # Routine that iterates through the messages. Uses the @odata.nextLink
    # values to find the next endpoint to query.
    messages.append(messages_response['value'])
    # Calculate how many pages of 980 messages we'll attempt based on the
    # interval value. Helps to keep requests within API limits.
    interval_in_seconds = int(helper.get_arg('interval'))
    url_count_limit = (interval_in_seconds // 60) - 1
    if url_count_limit > 0:
        url_count = 0
        while ("@odata.nextLink" in messages_response) and (is_https(
                messages_response["@odata.nextLink"])):
            if url_count < url_count_limit:
                nextlinkurl = messages_response["@odata.nextLink"]
                messages_response = helper.send_http_request(
                    nextlinkurl, "GET", headers=headers, parameters=None,
                    timeout=(15.0, 15.0)).json()
                messages.append(messages_response['value'])
                url_count += 1
            else:
                helper.log_debug("Protecting API limits, breaking out")
                break

    # Routine to find attachments in messages. This caters for both standard
    # and inline attachments. MS Graph doesn't list inline attachments in the
    # "hasAttachments" value; this fixes that.
    message_data = []
    for message in messages:
        for item in message:
            message_items = {}
            # reset per message; the original initialised this list once, so
            # attachment records accumulated across all messages
            attach_data = []
            message_items['_time'] = item['receivedDateTime']
            message_items['to'] = item['toRecipients']
            message_items['from'] = item['from']
            message_items['sender'] = item['sender']
            message_items['subject'] = item['subject']
            message_items['id'] = item['id']
            message_items['internetMessageId'] = item['internetMessageId']
            message_items['ccRecipients'] = item['ccRecipients']
            message_items['bccRecipients'] = item['bccRecipients']
            message_items['replyTo'] = item['replyTo']
            message_items['hasAttachments'] = item['hasAttachments']
            message_body = item['body']['content']
            body_preview = item['bodyPreview']
            attachments = item['attachments']
            single_value_properties = item['singleValueExtendedProperties']

            if 'internetMessageHeaders' in item:
                internet_message_headers = item['internetMessageHeaders']
                if helper.get_arg('get_internet_headers'):
                    message_items['Internet-Headers'] = internet_message_headers
                # message path calculations (the inner loops no longer reuse
                # the name `item`, which shadowed the message being processed)
                message_path = []
                for header in internet_message_headers:
                    if header['name'] == "Received":
                        message_path.append(header)
                if message_path:
                    src_line = str(message_path[-1])
                    dest_line = str(message_path[0])
                    re_by = re.compile(r'(?<=\bby\s)(\S+)')
                    re_from = re.compile(r'(?<=\bfrom\s)(\S+)')
                    dest = re_by.search(dest_line)
                    # `src` was unbound when neither regex matched
                    src = re_from.search(src_line) or re_by.search(src_line)
                    if src:
                        message_items['src'] = str(src[0])
                    if dest:
                        message_items['dest'] = str(dest[0])
                if helper.get_arg('get_message_path'):
                    message_items['message_path'] = message_path
                if helper.get_arg('get_x_headers'):
                    x_headers = []
                    for header in internet_message_headers:
                        if "X-" in header['name']:
                            x_headers.append(header)
                    message_items['X-Headers'] = x_headers
                if helper.get_arg('get_auth_results'):
                    auth_results = []
                    for header in internet_message_headers:
                        if "Authentication-Results" in header['name']:
                            auth_results.append(header)
                    message_items['Authentication-Results'] = auth_results
                if helper.get_arg('get_spf_results'):
                    spf_results = []
                    for header in internet_message_headers:
                        if "Received-SPF" in header['name']:
                            spf_results.append(header)
                    message_items['Received-SPF'] = spf_results
                if helper.get_arg('get_dkim_signature'):
                    dkim_sig = []
                    for header in internet_message_headers:
                        if "DKIM-Signature" in header['name']:
                            dkim_sig.append(header)
                    message_items['DKIM-Signature'] = dkim_sig

            # tracking pixel detection
            if pixeltrack_re.search(message_body):
                pixel_data = pixeltrack_re.search(message_body)
                message_items['tracking_pixel'] = "true"
                message_items['tracking_pixel_data'] = pixel_data.group(0)
            else:
                message_items['tracking_pixel'] = "false"

            # size mapping
            for prop in single_value_properties:
                if prop['id'] == "Long 0xe08":
                    message_items['size'] = prop['value']

            if helper.get_arg('get_body'):
                message_items['body'] = message_body
            if helper.get_arg('get_body_preview'):
                message_items['bodyPreview'] = body_preview
            if helper.get_arg('get_attachment_info'):
                message_items['attachments'] = attachments

            if helper.get_arg('get_body'):
                if helper.get_arg('extract_iocs'):
                    iocs = extract_iocs(helper, message_items["body"])
                    email_iocs = []
                    for ioc in iocs:
                        if ioc not in email_iocs:
                            email_iocs.append(ioc)
                    if email_iocs:
                        message_items['iocs'] = email_iocs

            if helper.get_arg('get_attachment_info'):
                if message_items['attachments'] is not None:
                    for attachment in message_items["attachments"]:
                        # itemAttachment: a contact, event, or message that's attached.
                        if attachment["@odata.type"] == "#microsoft.graph.itemAttachment":
                            my_added_data = {}
                            my_added_data['name'] = attachment['name']
                            my_added_data['odata_type'] = attachment['@odata.type']
                            my_added_data['id'] = attachment['id']
                            my_added_data['contentType'] = attachment['contentType']
                            my_added_data['size'] = attachment['size']
                            attach_data.append(my_added_data)
                        # referenceAttachment: a link to a file on OneDrive or
                        # another supported storage location.
                        if attachment["@odata.type"] == "#microsoft.graph.referenceAttachment":
                            my_added_data = {}
                            my_added_data['name'] = attachment['name']
                            my_added_data['odata_type'] = attachment['@odata.type']
                            my_added_data['id'] = attachment['id']
                            my_added_data['contentType'] = attachment['contentType']
                            my_added_data['size'] = attachment['size']
                            attach_data.append(my_added_data)
                        # fileAttachment: a standard email attachment. The
                        # content-type routines below apply only here, since
                        # only fileAttachments carry contentBytes.
                        if attachment["@odata.type"] == "#microsoft.graph.fileAttachment":
                            my_added_data = {}
                            attach_b64decode = base64.b64decode(attachment['contentBytes'])
                            # Select the hashing algorithm (md5, sha1, sha256)
                            # to use on the attachment.
                            if helper.get_arg('file_hash_algorithm') == 'md5':
                                hash_object = hashlib.md5(attach_b64decode)
                            if helper.get_arg('file_hash_algorithm') == 'sha1':
                                hash_object = hashlib.sha1(attach_b64decode)
                            if helper.get_arg('file_hash_algorithm') == 'sha256':
                                hash_object = hashlib.sha256(attach_b64decode)
                            att_hash = hash_object.hexdigest()
                            my_added_data['name'] = attachment['name']
                            my_added_data['odata_type'] = attachment['@odata.type']
                            my_added_data['id'] = attachment['id']
                            my_added_data['contentType'] = attachment['contentType']
                            my_added_data['size'] = attachment['size']
                            my_added_data['file_hash'] = att_hash
                            # Attempt to open a zip file to list member names
                            # and hashes if the option is selected in the input.
                            if (helper.get_arg('read_zip_files')
                                    and attachment['@odata.mediaContentType'] == 'application/zip'):
                                filedata_encoded = attachment['contentBytes'].encode()
                                file_bytes = base64.b64decode(filedata_encoded)
                                zipbytes = io.BytesIO(file_bytes)
                                try:
                                    zip_archive = ZipFile(zipbytes)
                                    zipmembers = zip_archive.namelist()
                                    zip_files = []
                                    zip_hashes = []
                                    for file in zipmembers:
                                        zip_read = zip_archive.read(file)
                                        if helper.get_arg('file_hash_algorithm') == 'md5':
                                            hash_object = hashlib.md5(zip_read)
                                        if helper.get_arg('file_hash_algorithm') == 'sha1':
                                            hash_object = hashlib.sha1(zip_read)
                                        if helper.get_arg('file_hash_algorithm') == 'sha256':
                                            hash_object = hashlib.sha256(zip_read)
                                        zip_hash = hash_object.hexdigest()
                                        if file not in zip_files:
                                            zip_files.append(file)
                                            zip_hashes.append(zip_hash)
                                    if zip_files:
                                        my_added_data['zip_files'] = zip_files
                                        my_added_data['zip_hashes'] = zip_hashes
                                except Exception:
                                    my_added_data['attention'] = 'could not extract the zip file, may be encrypted'
                            # Routine to gather info on CSV file types.
                            if attachment['@odata.mediaContentType'] == 'text/csv':
                                filedata_encoded = attachment['contentBytes'].encode()
                                file_bytes = base64.b64decode(filedata_encoded)
                                csvbytes = io.BytesIO(file_bytes)
                                try:
                                    csvstring = csvbytes.read().decode('utf-8')
                                    if helper.get_arg('extract_iocs'):
                                        iocs = extract_iocs(helper, csvstring)
                                        csv_iocs = []
                                        for ioc in iocs:
                                            if ioc not in csv_iocs:
                                                csv_iocs.append(ioc)
                                        if csv_iocs:
                                            my_added_data['iocs'] = csv_iocs
                                    # Will ingest the actual contents of the CSV
                                    # file if this option is selected in the input.
                                    if 'csv' in helper.get_arg('attachment_data_ingest'):
                                        my_added_data['csv_data'] = csvstring
                                except Exception:
                                    my_added_data['attention'] = 'could not parse the csv document, may be encrypted'
                            # Routine to gather info on HTML file types.
                            if attachment['@odata.mediaContentType'] == 'text/html':
                                filedata_encoded = attachment['contentBytes'].encode()
                                file_bytes = base64.b64decode(filedata_encoded)
                                try:
                                    uncooked_soup = html.unescape(str(file_bytes))
                                    soup = BeautifulSoup(uncooked_soup)
                                    soup_data = str(soup)
                                    if helper.get_arg('extract_iocs'):
                                        iocs = extract_iocs(helper, soup_data)
                                        html_iocs = []
                                        for ioc in iocs:
                                            if ioc not in html_iocs:
                                                html_iocs.append(ioc)
                                        if html_iocs:
                                            my_added_data['iocs'] = html_iocs
                                    # Will ingest the actual contents of the HTML
                                    # file if this option is selected in the input.
                                    if 'html' in helper.get_arg('attachment_data_ingest'):
                                        my_added_data['html_data'] = soup_data
                                except Exception:
                                    my_added_data['attention'] = 'could not parse the html document, may be encrypted'
                            # Routine to gather info on PDF file types.
                            if attachment['@odata.mediaContentType'] == 'application/pdf':
                                filedata_encoded = attachment['contentBytes'].encode()
                                file_bytes = base64.b64decode(filedata_encoded)
                                pdf_content = io.BytesIO(file_bytes)
                                output_string = StringIO()
                                try:
                                    parser = PDFParser(pdf_content)
                                    doc = PDFDocument(parser)
                                    rsrcmgr = PDFResourceManager()
                                    device = TextConverter(rsrcmgr, output_string,
                                                           laparams=LAParams())
                                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                                    for page in PDFPage.create_pages(doc):
                                        interpreter.process_page(page)
                                    pdf_text = output_string.getvalue()
                                    if helper.get_arg('extract_iocs'):
                                        iocs = extract_iocs(helper, pdf_text)
                                        pdf_iocs = []
                                        for ioc in iocs:
                                            if ioc not in pdf_iocs:
                                                pdf_iocs.append(ioc)
                                        if pdf_iocs:
                                            my_added_data['iocs'] = pdf_iocs
                                    # Will ingest the actual contents of the PDF
                                    # file if this option is selected in the input.
                                    if 'pdf' in helper.get_arg('attachment_data_ingest'):
                                        my_added_data['pdf_data'] = pdf_text
                                except Exception:
                                    my_added_data['attention'] = 'could not parse the pdf document, may be encrypted'
                            # Routine to gather info on XML file types.
                            if attachment['@odata.mediaContentType'] == 'text/xml':
                                filedata_encoded = attachment['contentBytes'].encode()
                                file_bytes = base64.b64decode(filedata_encoded)
                                try:
                                    soup = BeautifulSoup(file_bytes, 'lxml')
                                    soup_data = str(soup)
                                    if helper.get_arg('extract_iocs'):
                                        iocs = extract_iocs(helper, soup_data)
                                        xml_iocs = []
                                        for ioc in iocs:
                                            if ioc not in xml_iocs:
                                                xml_iocs.append(ioc)
                                        if xml_iocs:
                                            my_added_data['iocs'] = xml_iocs
                                    # Will ingest the actual contents of the XML
                                    # file if this option is selected in the input.
                                    if 'xml' in helper.get_arg('attachment_data_ingest'):
                                        my_added_data['xml_data'] = soup_data
                                except Exception:
                                    my_added_data['attention'] = 'could not parse the xml document, may be encrypted'
                            # Macro analysis on files of the supported content
                            # types listed below, if selected in the input setup.
                            # Uses OLEVBA tools to detect macros in the
                            # attachment, then analyses the macros.
                            if helper.get_arg('macro_analysis'):
                                filename = attachment['name']
                                # Content types supported by OLEVBA.
                                supported_content = [
                                    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                                    'application/vnd.openxmlformats-officedocument.spreadsheetml.template',
                                    'application/vnd.ms-excel.sheet.macroenabled.12',
                                    'application/vnd.ms-excel.template.macroenabled.12',
                                    'application/vnd.ms-excel.addin.macroenabled.12',
                                    'application/vnd.ms-excel.sheet.binary.macroenabled.12',
                                    'application/vnd.ms-excel',
                                    'application/xml',
                                    'application/vnd.ms-powerpoint',
                                    'application/vnd.openxmlformats-officedocument.presentationml.presentation',
                                    'application/vnd.openxmlformats-officedocument.presentationml.template',
                                    'application/vnd.openxmlformats-officedocument.presentationml.slideshow',
                                    'application/vnd.ms-powerpoint.addin.macroenabled.12',
                                    'application/vnd.ms-powerpoint.presentation.macroenabled.12',
                                    'application/vnd.ms-powerpoint.template.macroenabled.12',
                                    'application/vnd.ms-powerpoint.slideshow.macroenabled.12',
                                    'application/msword',
                                    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                                    'application/vnd.openxmlformats-officedocument.wordprocessingml.template',
                                    'application/vnd.ms-word.document.macroenabled.12',
                                    'application/vnd.ms-word.template.macroenabled.12'
                                ]
                                if attachment['@odata.mediaContentType'] in supported_content:
                                    filedata_encoded = attachment['contentBytes'].encode()
                                    file_bytes = base64.b64decode(filedata_encoded)
                                    try:
                                        vbaparser = VBA_Parser(filename, data=file_bytes)
                                        if vbaparser.detect_vba_macros():
                                            my_added_data['macros_exist'] = "true"
                                            macro_analysis = VBA_Parser.analyze_macros(vbaparser)
                                            helper.log_debug("GET Response: "
                                                             + json.dumps(macro_analysis, indent=4))
                                            if macro_analysis == []:
                                                my_added_data['macro_analysis'] = "Macro doesn't look bad, but I never trust macros."
                                            else:
                                                # key was 'macros_analysis' in one branch; unified
                                                my_added_data['macro_analysis'] = macro_analysis
                                        else:
                                            my_added_data['macros_exist'] = "false"
                                    except Exception:
                                        my_added_data['attention'] = 'could not extract the office document, may be encrypted'
                            attach_data.append(my_added_data)
                    message_items['attachments'] = attach_data
            message_data.append(message_items)

    _write_events(helper, ew, messages=message_data)
    _purge_messages(helper, messages)
def _extract_pages_from_file(self, source_pdf: str):
    self.switch_to_pdf_document(source_pdf)
    pdf_pages = PDFPage.get_pages(self.active_fileobject)
    return PageGenerator(pdf_pages)
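# A hedged usage sketch: it assumes the surrounding class exposes this method
# and that PageGenerator (defined elsewhere in the source) is iterable over
# PDFPage objects. `library` is a hypothetical instance of that class.
for page in library._extract_pages_from_file('sample.pdf'):
    print(page.mediabox)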