def getPageLayouts(f1):
    """Extract the layout objects from every page of a PDF.

    :param f1: path to a PDF file on disk.
    :return: list of LTPage layout objects, one per page; empty if the
        document does not permit text extraction.
    :raises IOError: if the file cannot be read.
    """
    # BUG FIX: the original referenced two undefined names (`fpath` for the
    # path and `pss_wd` for the password), used Py2-only `raise IOError, msg`
    # syntax, and left `page_layouts` unbound when the doc was not extractable.
    page_layouts = []
    try:
        # The parser/doc pair form a "pipe" of sorts: pdfminer parses lazily,
        # page by page, rather than loading the whole PDF at once.
        with open(f1, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')  # empty string = no password
            # can we extract text?
            if doc.is_extractable:
                rsrcmgr = PDFResourceManager()
                laparams = LAParams()
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in doc.get_pages():
                    interpreter.process_page(page)
                    # receive the LTPage object for the page
                    page_layouts.append(device.get_result())
    except IOError:
        raise IOError("issue with loading file, please try again")
    return page_layouts
def pdfconvert(infullpath, file, infolder, pages=None):
    """Convert a PDF to a text file in the corpus folder and a JPG preview.

    :param infullpath: full path of the source PDF.
    :param file: base name (without extension) for the output files.
        NOTE: the parameter shadows the Py2 builtin `file`; kept for
        backward compatibility with keyword callers.
    :param infolder: directory that receives the JPG preview.
    :param pages: optional iterable of page numbers to extract (default: all).
    :return: path of the generated JPG file.
    """
    # Handle PDF
    pagenums = set(pages) if pages else set()
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    # BUG FIX: the pdf handle was opened bare and (on error) leaked
    with open(infullpath, 'rb') as pdffile:
        for page in PDFPage.get_pages(pdffile, pagenums):
            interpreter.process_page(page)
    converter.close()
    txtfilename = file
    jpgfile = infolder + str(txtfilename) + '.jpg'
    # NOTE(review): corpuspath / corpusfolder are module-level globals —
    # confirm they are defined before this is called
    txtfile = corpuspath + corpusfolder + '/' + txtfilename + '.txt'
    text = output.getvalue()
    output.close()  # BUG FIX: was `output.close` — attribute access, never called
    with open(txtfile, 'w') as temp:
        temp.write(text)
    # HACK: shell command built from paths; the quotes guard spaces but not
    # embedded quotes — consider subprocess.run([...], shell=False)
    imagemagick_string = 'convert ' + '"' + infullpath + '" "' + jpgfile + '"'
    os.system(imagemagick_string)
    return jpgfile
def ParseAllPages(self, filepath):
    """Run the PDF interpreter over every page of the document at *filepath*.

    Stores the path on ``self.filepath``; pages are processed for their side
    effects on the device only — nothing is returned.

    :raises PDFTextExtractionNotAllowed: if the PDF forbids extraction.
    """
    self.filepath = filepath
    # BUG FIX: the file handle was never closed; `with` guarantees cleanup
    # even if a page fails to parse.
    with open(filepath, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument()
        # Connect the parser and document objects.
        parser.set_document(doc)
        doc.set_parser(parser)
        # Supply the password for initialization.
        # (If no password is set, give an empty string.)
        password = ""
        doc.initialize(password)
        # Check if the document allows text extraction. If not, abort.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Shared-resource manager, device and interpreter.
        rsrcmgr = PDFResourceManager()
        device = PDFDevice(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
def fix_text(self, filename):
    """Extract and return the full text of the PDF at *filename*.

    Uses ``self.password`` (if truthy) to open the document and
    ``self.codec`` for the output encoding.

    :raises PDFTextExtractionNotAllowed: if the PDF forbids extraction.
    """
    pdfText = StringIO()
    fp = open(filename, 'rb')
    # BUG FIX: neither `fp` nor `device` was ever closed.
    try:
        parser = PDFParser(fp)
        # Supply the password for initialization only when one is set.
        if not self.password:
            document = PDFDocument(parser)
        else:
            document = PDFDocument(parser, self.password)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, pdfText, codec=self.codec,
                               laparams=LAParams(), imagewriter=None)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
        txt = pdfText.getvalue()
        device.close()
    finally:
        fp.close()
    return txt
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at *path* and write it next to the
    source as a ``.txt`` file (same base name).

    Returns nothing; the result is written to disk.
    """
    temp = os.path.splitext(path)
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = "utf-8"
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # BUG FIX: `file()` is Python-2-only; `open()` behaves identically in 2 and 3.
    fp = open(path, "rb")
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    try:
        for page in PDFPage.get_pages(
            fp, pagenos, maxpages=maxpages, password=password,
            caching=caching, check_extractable=True
        ):
            interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        # close everything even if a page fails to parse
        fp.close()
        device.close()
        retstr.close()
    outputFile = temp[0] + ".txt"
    print(outputFile)
    # BUG FIX: the output handle was left open on a write failure
    with open(outputFile, "w") as ff:
        ff.write(text)
def convert(url, pages=None):
    """Download the PDF at *url* and return its extracted text.

    :param url: address of the remote PDF (must be a string).
    :param pages: optional list of page numbers to extract; all pages
        when omitted.
    """
    assert isinstance(url, basestring)
    assert pages == None or isinstance(pages, list)

    manager = PDFResourceManager()
    buf = StringIO()
    converter = TextConverter(manager, buf, codec='utf-8', laparams=LAParams())

    # Fetch the remote document and wrap the raw bytes in a file-like object.
    response = urllib2.urlopen(urllib2.Request(url))
    stream = StringIO(response.read())

    interpreter = PDFPageInterpreter(manager, converter)
    wanted = set(pages) if pages is not None else set()
    for page in PDFPage.get_pages(stream, wanted, maxpages=0, password='',
                                  caching=True, check_extractable=True):
        interpreter.process_page(page)

    text = buf.getvalue()
    # release everything in the same order as before
    stream.close()
    response.close()
    converter.close()
    buf.close()
    return text
def pdf_to_text(page_object): parser = PDFParser(page_object) # Create a PDF document object that stores the document structure doc = PDFDocument(parser) # Connect the parser and document objects. parser.set_document(doc) doc.initialize('') # Create a PDF resource manager object that stores shared resources. rsrcmgr = PDFResourceManager() # Create a PDF page aggregator object device = PDFPageAggregator(rsrcmgr, laparams=LAParams()) interpreter = PDFPageInterpreter(rsrcmgr, device) text_content = [] # i = page number #without this it doesn't work # page are items in page for i, page in enumerate(PDFPage.create_pages(doc)): interpreter.process_page(page) # receive the LTPage object for the page. layout = device.get_result() for object in layout: if isinstance(object, LTTextBox) or isinstance(object, LTTextLine): trial = [] trial.append(object.get_text()) for word in trial: text_content.append(word) return text_content
def get_pdf_text(path):
    """
    Reads a pdf file and returns a dict of the text where the index
    represents the page number.

    http://stackoverflow.com/a/20905381
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    # change to to utf-8 if the text comes out garbled
    codec = 'ascii'
    #codec = 'utf-8'
    laparams = LAParams()
    pages = {}
    # NOTE(review): the stock pdfminer TextConverter takes no `pages` keyword
    # (and `showpageno` only on some versions) — this presumably relies on a
    # patched/subclassed TextConverter that fills `pages` in place; verify.
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams, showpageno=True, pages=pages)
    fp = file(path, 'rb')  # Py2-only `file` builtin
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0          # 0 = no page limit
    caching = True
    pagenos=set()         # empty set = all pages
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
        interpreter.process_page(page)
    fp.close()
    device.close()
    retstr.close()
    # `pages` was mutated by the converter while processing
    return pages
def parse(self, path):
    """Parse the PDF at *path* and return a Sample wrapping its text.

    Directories are not supported.

    :raises NotImplementedError: if *path* is a directory.
    """
    out = StringIO.StringIO()
    # Directory
    if os.path.isdir(path):
        raise NotImplementedError()
    # BUG FIX: PDFs are binary — the file was opened in text mode, which
    # corrupts the stream on platforms that translate line endings.
    fp = open(path, 'rb')
    rsrc = PDFResourceManager()
    codec = 'utf-8'
    # layout margins tuned for this corpus
    laparams = LAParams()
    laparams.char_margin = 2.0
    laparams.line_margin = 2.0
    laparams.word_margin = 0.0
    device = TextConverter(rsrc, out, codec=codec, laparams=laparams)
    doc = PDFDocument()
    parser = PDFParser(fp)
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize()
    interpreter = PDFPageInterpreter(rsrc, device)
    for page in doc.get_pages():
        interpreter.process_page(page)
    device.close()
    sample = Sample(path, None, out.getvalue())
    out.close()
    fp.close()  # BUG FIX: the handle was never closed
    return sample
def parse_pdf_pdfminer(self, f, fpath):
    """Parse the open PDF stream *f*, feeding each page's text to
    ``self.parse_page`` with its 1-based page number.

    Errors (other than interrupts) are reported via ``self.handler``.
    """
    try:
        layout_params = LAParams()
        layout_params.all_texts = True
        manager = PDFResourceManager()
        pagenos = set()
        # reset the de-duplication store for this document, if enabled
        if self.dedup:
            self.dedup_store = set()
        self.handler.print_header(fpath)
        for page_num, page in enumerate(
                PDFPage.get_pages(f, pagenos, check_extractable=True), start=1):
            # a fresh converter/buffer per page keeps pages independent
            buf = StringIO()
            converter = TextConverter(manager, buf, laparams=layout_params)
            PDFPageInterpreter(manager, converter).process_page(page)
            data = buf.getvalue()
            buf.close()
            self.parse_page(fpath, data, page_num)
        self.handler.print_footer(fpath)
    except (KeyboardInterrupt, SystemExit):
        raise  # never swallow interrupts
    except Exception as e:
        self.handler.print_error(fpath, e)
def convert_pdf_to_txt(path, output):
    """Extract the text of the PDF at *path*, write it to *output*
    (binary mode) and return it."""
    manager = PDFResourceManager()
    buf = StringIO()
    converter = TextConverter(manager, buf, codec='utf-8', laparams=LAParams())
    src = open(path, 'rb')
    interpreter = PDFPageInterpreter(manager, converter)
    # all pages, no password, with caching
    for page in PDFPage.get_pages(src, set(), maxpages=0, password="",
                                  caching=True, check_extractable=True):
        interpreter.process_page(page)
    text = buf.getvalue()
    src.close()
    converter.close()
    buf.close()
    with open(output, 'wb') as dst:
        dst.write(text)
    return text
def load( self, open_file ):
    # Parse the PDF in *open_file* (an open binary file object), storing the
    # per-page text in self.text (keyed by 1-based page number) and resetting
    # self.fields. Page annotations are handed to self._build_annotations.
    self.fields = {}
    self.text= {}
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(open_file)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize('')
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    for pgnum, page in enumerate( doc.get_pages() ):
        interpreter.process_page(page)
        # collect form/annotation data for pages that carry annotations
        if page.annots:
            self._build_annotations( page )
        # _get_text pulls the laid-out text for the page from the aggregator
        txt= self._get_text( device )
        # pages are keyed 1-based for human-facing page numbers
        self.text[pgnum+1]= txt
def convert_pdf_to_txt(self, path):
    """
    A very simple conversion function which returns text for parsing
    from PDF.

    path = The path to the file

    Returns the extracted text, or "" on any failure (the failure is
    logged via self.logger).
    """
    try:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(
            rsrcmgr, retstr, codec=codec, laparams=laparams)
        fp = open(path, 'rb')  # Py2 `file()` replaced by portable open()
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0
        caching = True
        pagenos = set()
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password,
                                      caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)
        text = retstr.getvalue()
        fp.close()
        device.close()
        retstr.close()
        return text
    except Exception as e:
        # BUG FIX: the log call was placed after `return` and never ran.
        self.logger.error("Failed to PDF to text: " + str(e))
        return ""
def pdf_to_txt(in_file):
    """ turn a PDF file to a TXT file (roughly processed)

    Appends the utf-8 text of every horizontal text box to
    ``<in_file minus extension>.txt`` (append mode, so repeat runs add on).

    :raises PDFTextExtractionNotAllowed: if the PDF forbids extraction.
    :return: None
    """
    # BUG FIX: the input handle was never closed.
    fp = open(in_file, 'rb')
    try:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        laparams = LAParams()
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # assumes a 3-character extension (".pdf" -> ".txt")
        out_file = in_file[:-3] + 'txt'
        # BUG FIX: the output file was reopened for every single text box;
        # open it once and keep append-mode semantics.
        with open(out_file, 'a') as dst_file:
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                # Receive the LTPage object for the page.
                layout = device.get_result()
                for klass in layout:
                    if isinstance(klass, LTTextBoxHorizontal):
                        text = klass.get_text().encode('utf-8')
                        dst_file.write(text + '\n')
    finally:
        fp.close()
    return None
def convert_pdf_to_txt(path):
    '''Extract and return the full text of the PDF at *path*.

    see... http://www.unixuser.org/~euske/python/pdfminer/programming.html for tutorial
    Also see... https://github.com/dpapathanasiou/pdfminer-layout-scanner/blob/master/layout_scanner.py
    '''
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    fp = open(path, 'rb')  # Py2 `file()` replaced by portable open()
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    # Read text from pages
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                  password=password, caching=caching,
                                  check_extractable=True):
        interpreter.process_page(page)
    # BUG FIX: the result was bound to a local named `str`, shadowing the builtin.
    text = retstr.getvalue()
    fp.close()
    device.close()
    retstr.close()
    return text
def convert_pdf_to_txt(path):
    """Extract and return the text of (at most) the first 120 pages of the
    PDF at *path*."""
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = open(path, 'rb')  # Py2 `file()` replaced by portable open()
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 120  # deliberate cap on very long documents
    caching = True
    pagenos = set()
    try:
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            interpreter.process_page(page)
        # BUG FIX: the original read the buffer inside a bare `except` after
        # closing handles, and bound the result to a local named `str`
        # (shadowing the builtin); read it first, then clean up.
        text = retstr.getvalue()
    finally:
        fp.close()
        device.close()
        retstr.close()
    return text
def get_layout(path):
    '''returns a list of every character in the document as well as its
    location (one LTPage aggregate per page)'''
    # BUG FIX (cleanup): the original created a StringIO (`retstr`) and a
    # `codec` variable that were never used by the aggregator — removed.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    fp = open(path, 'rb')  # Py2 `file()` replaced by portable open()
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    layout = []
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                  password=password, caching=caching,
                                  check_extractable=True):
        interpreter.process_page(page)
        layout.append(device.get_result())
    fp.close()
    device.close()
    return layout
def parsePDF(pdf_file):
    """Print the extracted text of the first page of the PDF at *pdf_file*.

    Processing deliberately stops after the first page (original behavior).
    Returns nothing.
    """
    # BUG FIX: the PDF was opened in text mode ("r"), which corrupts binary
    # data on platforms with newline translation, and the handle was leaked.
    with open(pdf_file, "rb") as src:
        pdf_bytes = src.read()
    # Cast to StringIO object
    from StringIO import StringIO
    memory_file = StringIO(pdf_bytes)
    # Create a PDF parser object associated with the StringIO object
    parser = PDFParser(memory_file)
    # Create a PDF document object that stores the document structure
    document = PDFDocument(parser)
    # Define parameters to the PDF device objet
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    codec = "utf-8"
    # Create a PDF device object
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Create a PDF interpreter object
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        data = retstr.getvalue()
        print(data)
        break  # NOTE: intentionally only the first page
def run(path):
    # Parse the PDF at *path* into a Book of Page objects, one per PDF page,
    # timing the whole run. Python 2 only (print statements, time.clock).
    print "Calling parser :%s" % path
    # NOTE(review): time.clock was removed in Python 3.8 — fine for Py2 code
    t0 = time.clock()
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = file(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    book = Book()
    i = 0
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
        page_tmp = Page()
        # `retstr` accumulates across pages, so remember where this page's
        # text starts before processing it...
        begin_page = len(retstr.getvalue())
        interpreter.process_page(page)
        # ...and slice it out afterwards. The `-1` drops the final character
        # (presumably the trailing form feed the converter emits — verify).
        page_tmp.text = retstr.getvalue()[begin_page:-1]
        book.pages.append(page_tmp)
    fp.close()
    device.close()
    retstr.close()
    print "Parsing in:", time.clock() - t0
    return book
def convert_pdf_to_txt(path):
    """
    Converts PDF to text using the pdfminer library
    """
    manager = PDFResourceManager()
    buf = StringIO()
    converter = TextConverter(manager, buf, codec="utf-8", laparams=LAParams())
    file_handle = file(path, "rb")
    interpreter = PDFPageInterpreter(manager, converter)
    # all pages, no password, caching enabled
    for page in PDFPage.get_pages(file_handle, set(), maxpages=0, password="",
                                  caching=True, check_extractable=True):
        interpreter.process_page(page)
    text = buf.getvalue()
    # release resources in the original order
    file_handle.close()
    converter.close()
    buf.close()
    return text
def pdf2xml(infile):
    '''
    Return a string of XML representation for given PDF file handle. Uses
    pdfminer to do the conversion and does some final post-processing.
    '''
    outfile = StringIO()
    # Empirically determined...
    laparams = LAParams()
    laparams.char_margin = 0.4
    # See pdf2txt.py
    rsrcmgr = PDFResourceManager(caching=False)
    device = XMLConverter(rsrcmgr, outfile, codec='utf-8', laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # `page_api` is a module-level flag selecting between the newer
    # PDFPage-based API and the legacy process_pdf helper.
    if page_api:
        for page in PDFPage.get_pages(infile, set()):
            interpreter.process_page(page)
    else:
        process_pdf(rsrcmgr, device, infile, set())
    # NOTE(review): this closes the caller's handle — callers must not reuse it
    infile.close()
    # strip newlines so the XML comes back as a single line
    return outfile.getvalue().replace("\n", "")
def pdf_to_text(pdfname):
    """Return the ASCII text content of the PDF at *pdfname*."""
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from cStringIO import StringIO

    # PDFMiner boilerplate — 'ascii' drops non-ASCII glyphs; use 'utf-8'
    # instead if they must be preserved.
    manager = PDFResourceManager()
    buf = StringIO()
    converter = TextConverter(manager, buf, codec='ascii', laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    # Walk every page, accumulating text into the buffer.
    with open(pdfname, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)

    text = buf.getvalue()
    converter.close()
    buf.close()
    return text
def Parse(self):
    # First check whether a cache file exists and whether it is fresh enough
    # (newer than the source PDF); otherwise parse page 1 of the PDF and
    # cache the raw layout data with pickle.
    if not os.path.exists(parseCacheDir):
        os.makedirs(parseCacheDir)
    cacheFile = os.path.join(parseCacheDir, os.path.basename(self.pdfFileName) + '.cache')
    # cache is valid only if it exists, is non-empty, and is newer than the PDF
    foundCache = (os.path.isfile(cacheFile) and \
        os.path.getsize(cacheFile) > 0 and \
        os.path.getmtime(cacheFile) > os.path.getmtime(self.pdfFileName))
    if (foundCache):
        fp = open(cacheFile, 'rb')
        self.RawData = pickle.load(fp)
        fp.close()
    else:
        fp = open(self.pdfFileName, 'rb')
        # maxpages=1: only the first page of the schedule is needed
        for page in PDFPage.get_pages(fp, None, maxpages=1):
            rsrcmgr = PDFResourceManager()
            # NOTE(review): `laparams` is not defined in this method —
            # presumably a module-level global; verify.
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            interpreter.process_page(page)
            layout = device.get_result()
            self.__readobj(layout._objs)
            # flip the Y axis (PDF origin is bottom-left) using the page height
            for category in self.RawData.values():
                self.__reverseYaxis(category, layout.bbox[3])
        # persist the parsed raw data for the next run
        cacheFp = open(cacheFile, 'wb')
        pickle.dump(self.RawData, cacheFp)
        cacheFp.close()
        fp.close()
    # post-processing on the (cached or fresh) raw data
    self.__calculateBoundary()
    self.__assignCharsAndLinesToCell()
    self.__processCells()
    return (self.effectiveFrom, self.__getResult())
def process_pdf(rsrcmgr, device, fp, pagenums=None, maxpages=100, password=''):
    # Run *device* over the pages of the PDF open on *fp*, optionally
    # restricted to *pagenums* and capped at *maxpages*. Returns a dict of
    # {page index: PDFPage} for ALL pages (not only the processed ones).
    # Python 2 only (dict.iteritems).
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the document password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize(password)
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    # NOTE: this materializes every page up front, so maxpages does not save
    # the parsing cost — it only limits interpretation. The early `break`
    # also presumably assumes iteration in key order; verify on the target
    # Python 2 build.
    pages = dict(enumerate(doc.get_pages()))
    for num, page in pages.iteritems():
        if pagenums and (num not in pagenums):
            continue
        interpreter.process_page(page)
        if maxpages and maxpages <= num + 1:
            break
    return pages
def pdf_read(pdf):
    """
    Use PDFMiner to extract text from pdf file.
    <PDFMiner even though more low-level but pretty good tool to read pdfs>

    Args:
        *pdf* (str) -- path to pdf file

    Returns:
        *text* (str) -- a text extracted from pdf
    """
    # set up the pdfminer pipeline: manager -> converter -> interpreter
    manager = PDFResourceManager()
    out = StringIO()
    converter = TextConverter(manager, out, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    # opening a pdf file with 'rb' mode for reading binary files
    handle = file(pdf, 'rb')
    for page in PDFPage.get_pages(handle, maxpages=0, password='',
                                  caching=True, check_extractable=True):
        interpreter.process_page(page)

    # finishing up
    handle.close()
    converter.close()
    text = out.getvalue()
    out.close()
    return text
def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    This will attempt to use PyPDF2 to extract textual content first.
    If none is found, it'll send the file through OCR.
    """
    result = {'pages': []}
    with open(path, 'rb') as fh:
        res_mgr = PDFResourceManager()
        aggregator = PDFPageAggregator(res_mgr, laparams=LAParams())
        interp = PDFPageInterpreter(res_mgr, aggregator)
        parser = PDFParser(fh)
        doc = PDFDocument(parser, '')
        # copy document metadata (everything except the page count) into
        # the result, sanitised through safe_text
        if len(doc.info):
            for key, value in doc.info[-1].items():
                key = key.lower().strip()
                if key != 'pages':
                    result[key] = safe_text(value)
        # bail out (metadata only) when the PDF forbids extraction
        if not doc.is_extractable:
            log.warning("PDF not extractable: %s", path)
            return result
        for page in PDFPage.create_pages(doc):
            interp.process_page(page)
            page_text = _convert_page(aggregator.get_result(), languages)
            result['pages'].append(page_text)
        aggregator.close()
        return result
def extract_text(self):
    """Extract the text of the PDF at ``self.local_file`` into ``self.text``
    (decoded as UTF-8). Returns nothing."""
    # BUG FIX: `file(...).read()` leaked the OS file handle; read via `with`.
    with open(self.local_file, 'rb') as local:
        pdf_data = local.read()
    pdf_stream = io.BytesIO(pdf_data)
    laparams = LAParams()
    resource_manager = PDFResourceManager(caching=True)
    codec = 'utf-8'
    output_stream = io.BytesIO()
    pagenos = set()
    device = TextConverter(
        resource_manager,
        output_stream,
        codec=codec,
        laparams=laparams,
    )
    interpreter = PDFPageInterpreter(
        resource_manager,
        device,
    )
    pages = PDFPage.get_pages(
        pdf_stream,
        pagenos,
        maxpages=0,
        caching=True,
        check_extractable=True,
    )
    for page in pages:
        interpreter.process_page(page)
    device.close()  # BUG FIX: the converter was never closed
    self.text = output_stream.getvalue().decode('utf8')
def parse_pdf(pdf_url):
    # Download the PDF at *pdf_url* and return a list (one entry per page)
    # of lists of line-split text box contents.
    remote_file = urllib.request.urlopen(pdf_url).read()
    memory_file = io.BytesIO(remote_file)
    parser = PDFParser(memory_file)
    # NOTE(review): the no-argument PDFDocument() + set_document/initialize
    # calls are the legacy pdfminer API, yet urllib.request is Python 3 —
    # confirm the installed pdfminer actually exposes this constructor.
    doc = PDFDocument()
    parser.set_document(doc)  # Warning sometimes, error in pdf?
    doc.set_parser(parser)
    doc.initialize('')  # empty password
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    ret = []
    # Process each page contained in the document.
    for pageIdx, page in enumerate(doc.get_pages()):
        ret.append([])  # one sub-list of text chunks per page
        interpreter.process_page(page)
        layout = device.get_result()
        for idx, lt_obj in enumerate(layout):
            if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                # skip whitespace-only boxes; store each box as its lines
                if len(lt_obj.get_text().strip()) > 0:
                    ret[pageIdx].append((lt_obj.get_text().splitlines()))
    return ret
def extract_text_from_pdf(pdf_filename):
    """
    Function to extract the text from pdf documents using pdfminer

    Parameters:
    -----------
    pdf_filename -- string
        File name of the pdf document as string

    Returns:
    --------
    extracted_text -- string
        Text extracted from pdf as string
    """
    manager = PDFResourceManager()
    out = StringIO()
    converter = TextConverter(manager, out, codec='utf-8', laparams=LAParams())
    handle = file(pdf_filename, 'rb')
    interpreter = PDFPageInterpreter(manager, converter)
    # all pages, default settings
    for page in PDFPage.get_pages(handle, set()):
        interpreter.process_page(page)
    handle.close()
    converter.close()
    extracted_text = out.getvalue()
    out.close()
    return extracted_text
def pdf_from_url_to_txt(url, maxpages=0):
    """Download the PDF at *url* and return its text; *maxpages* caps the
    number of pages processed (0 = no limit)."""
    manager = PDFResourceManager()
    buf = StringIO()
    converter = TextConverter(manager, buf, codec='utf-8', laparams=LAParams())
    # Open the url provided as an argument to the function and read the content
    raw = urllib2.urlopen(urllib2.Request(url)).read()
    # Cast to StringIO object so pdfminer can treat it as a file
    stream = StringIO(raw)
    interpreter = PDFPageInterpreter(manager, converter)
    for page in PDFPage.get_pages(stream, set(), maxpages=maxpages,
                                  password="", caching=True,
                                  check_extractable=True):
        interpreter.process_page(page)
    stream.close()
    converter.close()
    text = buf.getvalue()
    buf.close()
    return text
def _pdf_pages_to_text(fh):
    '''
    Helper: yield the extracted text of each page of the open PDF stream *fh*.
    Swallows PDFSyntaxError (yields nothing more) like the original branches.
    '''
    try:
        for page in PDFPage.get_pages(
                fh,
                caching=True,
                check_extractable=True
        ):
            # fresh converter/buffer per page so each yield is one page's text
            resource_manager = PDFResourceManager()
            fake_file_handle = io.StringIO()
            converter = TextConverter(
                resource_manager,
                fake_file_handle,
                codec='utf-8',
                laparams=LAParams()
            )
            page_interpreter = PDFPageInterpreter(
                resource_manager,
                converter
            )
            page_interpreter.process_page(page)
            yield fake_file_handle.getvalue()
            # close open handles (after the consumer resumes, as before)
            converter.close()
            fake_file_handle.close()
    except PDFSyntaxError:
        return


def extract_text_from_pdf(pdf_path):
    '''
    Helper function to extract the plain text from .pdf files

    :param pdf_path: path to PDF file to be extracted (remote or local)
    :return: iterator of string of extracted text
    '''
    # https://www.blog.pythonlibrary.org/2018/05/03/exporting-data-from-pdfs-with-python/
    # REFACTOR: the two branches were ~25 identical lines differing only in
    # how the file handle is obtained; the per-page work now lives in
    # _pdf_pages_to_text.
    if not isinstance(pdf_path, io.BytesIO):
        # extract text from local pdf file
        with open(pdf_path, 'rb') as fh:
            for text in _pdf_pages_to_text(fh):
                yield text
    else:
        # extract text from remote pdf file (already an in-memory stream)
        for text in _pdf_pages_to_text(pdf_path):
            yield text
def mine_area(filename):
    """
    use pdfminer to get the valid area of each page.
    all results are relative position!
    """
    pageboxlist = []
    # Open the pdf file
    with open(filename, 'rb') as fp:
        # Create a PDF parser object for the file
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure;
        # pass a password here if the document needs one
        #document = PDFDocument(parser, password)
        document = PDFDocument(parser)
        # Check whether the file allows text extraction
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Resource manager for shared resources; caching=False disables caching
        rsrcmgr = PDFResourceManager(caching=False)
        # Layout-analysis parameters
        laparams = LAParams()
        # PDF page aggregator (the "device")
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # PDF interpreter object
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page in the document.
        # doc.get_pages() is one way to get the page list:
        # for i, page in enumerate(document.get_pages()):
        # PDFPage.create_pages(document) is the other way; loop over it,
        # handling one page per iteration
        count = 0
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Receive the LTPage object for this page. `layout` holds the
            # parsed objects: LTTextBox, LTFigure, LTImage,
            # LTTextBoxHorizontal, etc.
            layout = device.get_result()
            boxlist = []
            for item in layout:
                # stop collecting after the first 3 text objects (debug cap)
                if count >= 3:
                    break
                box = item.bbox
                boxlist.append(box)
                if isinstance(item, LTTextBox) or isinstance(item, LTTextLine):
                    print('text:{}'.format(item))
                    print(item.height)
                    print(item.get_text())
                    count += 1
                elif isinstance(item, LTImage):
                    print('image:{}'.format(item))
                elif isinstance(item, LTFigure):
                    print('figure:{}'.format(item))
                elif isinstance(item, LTAnno):
                    print('anno:{}'.format(item))
                elif isinstance(item, LTChar):
                    print('char:{}'.format(item))
                elif isinstance(item, LTLine):
                    print('line:{}'.format(item))
                elif isinstance(item, LTRect):
                    print('rect:{}'.format(item))
                elif isinstance(item, LTCurve):
                    print('curve:{}'.format(item))
            pageboxlist.append(boxlist)
            # for x in layout:
            #     # if x is a horizontal text object
            #     if (isinstance(x, LTTextBoxHorizontal)):
            #         # text=re.sub(replace,'',x.get_text())
            #         text = x.get_text()
            #         if len(text) != 0:
            #             print text
            break  # NOTE: only the first page is processed
    # reduce each page's box list to its maximal box (relative positions)
    res = []
    for boxlist in pageboxlist:
        tmp = get_max_box(boxlist)
        res.append(tmp)
    return res
class PdfParser(object):
    '''
    basic CLI tool to extra info from a pdf, based on PDFMiner
    https://github.com/pdfminer/pdfminer.six
    '''

    ''' instantiate for given page or default for all page layouts '''
    def __init__(self, fp, pagenr=None):
        # fp: open binary file object; pagenr: 1-based page to keep, or None
        # to keep the layout of every page.
        parser = PDFParser(fp)
        self.doc = PDFDocument(parser)
        laparams = LAParams()
        rsrcmgr = PDFResourceManager()
        self.device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
        pages = PDFPage.create_pages(self.doc)
        self.nrpages = 0
        self.pagelayouts = []
        # count all pages, but only store requested pagelayout (or all if pagenr==None)
        for p in pages:
            self.nrpages += 1
            # pagenr starts at 1, index in islice at 0
            if not pagenr or (pagenr and (pagenr == self.nrpages)):
                self.interpreter.process_page(p)
                layout = self.device.get_result()
                self.pagelayouts.append(layout)

    def nrofpages(self):
        # total number of pages counted at construction time
        return self.nrpages

    def getdocinfo(self):
        # first info dictionary of the document (metadata)
        return self.doc.info[0]

    ''' GENERATOR for all LTTextLineHorizontal objects in all pagelayouts '''
    def txtlinegenerator(self):
        for pl in self.pagelayouts:
            for o in self.__txtlinegenerator_recursive(pl):
                yield o

    ''' actual recursive generator behind txtlinegenerator '''
    def __txtlinegenerator_recursive(self, obj):
        # depth-first walk: yield text lines, recurse into anything iterable
        for o in obj:
            if isinstance(o, LTTextLineHorizontal):
                yield o
            else:
                try:
                    iterator = iter(o)
                except TypeError:
                    # not iterable
                    pass
                else:
                    yield from self.__txtlinegenerator_recursive(o)
        return

    ''' return all text objects where given search string is found '''
    def searchstr(self, searchstring):
        searchresult = []
        gen = self.txtlinegenerator()
        for txtboxobject in gen:
            if searchstring in txtboxobject.get_text():
                searchresult.append(txtboxobject)
        return searchresult

    ''' search all text objects within y0 in maxerr range from given yval '''
    def searchy(self, yval, maxerr):
        # collect text lines whose bottom (y0) lies strictly inside
        # (yval - maxerr, yval + maxerr)
        miny = yval - maxerr
        maxy = yval + maxerr
        searchresult = []
        gen = self.txtlinegenerator()
        for txtboxobject in gen:
            object_y0 = txtboxobject.y0
            if object_y0 > miny and object_y0 < maxy:
                searchresult.append(txtboxobject)
        return searchresult
for bl in blacklisted: myRoutes[u"excluded_lines"].append(bl) for i in getLines(): pdf = download_pdf(i) if pdf == None: continue # Start pdfminer parser = PDFParser(io.BytesIO(pdf)) document = PDFDocument(parser) rsrcmgr = PDFResourceManager() laparams = LAParams() device = PDFPageAggregator(rsrcmgr, laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device) for page in PDFPage.create_pages(document): interpreter.process_page(page) layout = device.get_result() fieldNr = 0 ref = u"" name = u"" origin = u"" destination = u"" wd_ida = [] wd_volta = [] sa_ida = [] sa_volta = [] su_ida = [] su_volta = [] for object in layout:
def get_contents(filename):
    # found this online
    # outputs a single list of strings from a pdf
    # Extract specific invoice fields from the first page of *filename* by
    # matching text boxes against hard-coded coordinate windows.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pdf_file_instance = open(filename, 'rb')
    total_text = []
    # Boxes to look for info
    # xmin,xmax, ymin,ymax — each window captures one field on the page;
    # coordinates are in PDF points from the bottom-left corner
    xy = [
        (325,335, 750,805), # commande_n
        (190,210, 550,564), # reference_n
        (31,32, 300,506),   # tasks
        (500,515, 300,506), # prices
        (500,550, 145,216)  # total ht, total ht net, total tva, net a payer
    ]
    #xy = [
    #     (325,335, 800,805), # commande_n
    #     (190,210, 560,564), # reference_n
    #     (31,32, 300,506),   # tasks
    #     (509,515, 300,506), # prices
    #     (523,535, 145,216)  # total ht, total ht net, total tva, net a payer
    #     ]
    res = [' ']*len(xy)
    # only the first page carries the fields of interest
    for page in PDFPage.get_pages(pdf_file_instance, maxpages=1):
        interpreter.process_page(page)
        layout = device.get_result()
        for lobj in layout:
            if isinstance(lobj, LTTextBox):
                # anchor each box by its top-left corner (x0, y1)
                x, y, text = lobj.bbox[0], lobj.bbox[3], lobj.get_text()
                #print('At %r is text: %s' % ((x, y), text))
                if any(a <= x <= b and c <=y <=d for a,b,c,d in xy):
                    # accumulate all boxes landing in the same window
                    for idx, (a,b,c,d) in enumerate(xy):
                        if a <= x <= b and c <=y <=d:
                            res[idx] = '\n'.join([res[idx],text]).strip()
                            #print('%r text: %s' % ((x, y), text))
                            #print('At %r is text: %s' % ((x, y), text))
    pdf_file_instance.close()
    # Best-effort unpacking: any window that matched nothing (or an
    # unexpected number of lines) degrades to an empty string rather than
    # failing the whole extraction.
    try:
        commande_n = res[0].split('\n')[-1]
    except:
        commande_n = ''
    try:
        reference_n = res[1]
    except:
        reference_n = ''
    try:
        tasks = res[2]
    except:
        tasks = ''
    try:
        prices = res[3]
    except:
        prices = ''
    try:
        # the totals window yields four lines; the second is discarded
        total_ht,_,total_tva,total_ttc = res[4].split('\n')
        #note need to deal with total_ht.replace(u'\xa0', u'').
        #this should be done in the update_facture function
    except:
        total_ht,total_tva,total_ttc = '','',''
    return(commande_n, reference_n, total_ht, total_tva, total_ttc, tasks, prices)
def parse(page):
    """Parse one pdfminer *page* into ``{"items": [...], "nonitems": [...]}``.

    The layout tree is flattened into a list of text strings and
    ``("Image", bbox, rawdata)`` tuples; the first image becomes a
    "nonitem", all later images become "items", and a placeholder item
    (``itemonpage`` 1) is always appended.
    """
    rsrcmgr = PDFResourceManager()
    # margins tuned for this document family
    laparams = LAParams(char_margin=4, word_margin=6,
                        boxes_flow=1.5, line_margin=0.4)
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    def parseit(obj):
        # Recursively flatten the layout tree. Adjacent text fragments are
        # merged so `res` alternates between strings and image tuples.
        res = [""]
        if isinstance(obj, LTChar):
            if isinstance(res[-1], str):
                res[-1] += obj.get_text()
            else:
                res.append(obj.get_text())
        elif isinstance(obj, LTTextBox) or \
                isinstance(obj, LTTextLine):
            res.append(obj.get_text())
        elif isinstance(obj, LTFigure) or isinstance(obj, LTPage):
            for subobj in obj:
                subpar = parseit(subobj)
                # merge trailing text of `res` with leading text of the child
                if isinstance(res[-1], str) and isinstance(subpar[0], str):
                    res = res[:-1] + [res[-1] + subpar[0]] + subpar[1:]
                else:
                    res = res + subpar
        elif isinstance(obj, LTImage):
            rawdata = obj.stream.get_rawdata()
            res += [("Image", obj.bbox, rawdata)]
        return res

    interpreter.process_page(page)
    layout = device.get_result()
    parsed = parseit(layout)
    parsed = list(filter(None, parsed))
    #return parsed
    itemcounter = 2
    res = {"items": list(), "nonitems": list()}
    # BUG FIX: `item[0] is "Image"` compared object identity, which only
    # appeared to work because of CPython string interning; use equality.
    images = list(filter(lambda item: item[0] == "Image", parsed))
    # treat first image as non-item
    if len(images) > 0:
        for img in images[0:1]:
            res["nonitems"].append({"image": img[2],
                                    "name": itemcounter,
                                    "description": str(img[1])})
            itemcounter += 1
    # treat others as item-images
    if len(images) > 1:
        for img in images[1:]:
            res["items"].append({"image": img[2],
                                 "itemonpage": itemcounter,
                                 "other": str(img[1])})
            itemcounter += 1
    res["items"].append({"itemonpage": 1, "other": "something"})
    return res
def read_pdf(file, pages=None, laycntrl=None, codec='utf-8', strip_control=False,
             password='', caching=True, maxpages=0, rotation=0, image_dir=''):
    """
    Reads a file in pdf format.

    Use **pdfminer** to read a pdf-file into **Python**.

    Args:
        file (str): A string providing the location of the file.
        pages (list[int]): A list giving the numbers of the pages to be
            extracted, by default (default is `None`, treated like `[]`)
            all pages are extracted.
        laycntrl (dict): Keyword arguments forwarded to `LAParams` to
            control layout analysis (default is `None`, i.e. defaults).
        codec (str): A string giving the codec (default is 'utf-8').
        strip_control (bool): (default is `False`) not used in XML2Converter.
        password (str): A string giving the password (default is '').
        caching (bool): (default is `True`)
        maxpages (int): (default is `0`)
        rotation (int): (default is `0`)
        image_dir (str): (default is `''`)

    Returns:
        PdfDoc: An object of type `PdfDoc`.

    Raises:
        IOError: If `file` does not have a .pdf extension or does not exist.
    """
    # FIX: the defaults were the mutable literals `[]` and `{}`; `None`
    # sentinels avoid the shared-mutable-default pitfall while behaving
    # identically for all existing callers.
    if not (os.path.splitext(file)[1] == ".pdf"):
        raise IOError("PDF-file expected got '%s'!" % (os.path.splitext(file)[1], ))
    if not os.path.exists(file):
        raise IOError("Could not find PDF-file '%s'!" % (file, ))
    if len(image_dir) == 0:
        imagewriter = None
    else:
        if not os.path.exists(image_dir):
            os.mkdir(image_dir)
        imagewriter = ImageWriter(image_dir)
    rsrcmgr = PDFResourceManager(caching=caching)
    laparams = LAParams(**(laycntrl or {}))
    device = XML2Converter(rsrcmgr, codec=codec, laparams=laparams,
                           imagewriter=imagewriter, stripcontrol=strip_control)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    with open(file, 'rb') as con:
        # Empty/None page selection means "all pages".
        if (pages is None) or (len(pages) == 0):
            pages = [i[0] for i in enumerate(PDFPage.get_pages(con))]
        for page in PDFPage.get_pages(con, pages, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
    return PdfDoc(device.doc)
def main(argv):
    """Command-line entry point: convert PDF files to text/html/xml/tag.

    Mirrors the classic pdf2txt.py CLI.

    :param argv: full argument vector (argv[0] is the program name)
    :return: 100 on usage errors, None on success
    """
    import getopt

    def usage():
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
            ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
            ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]'
            ' [-t text|html|xml|tag] [-c codec] [-s scale]'
            ' file ...' % argv[0])
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:St:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # user page numbers are 1-based; pdfminer's are 0-based
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-S':
            stripcontrol = True
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    # propagate the debug level into the pdfminer machinery
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            # infer the output type from the output file extension
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        # FIX: the original called the Python 2-only builtin `file()`,
        # which raises NameError on Python 3 (the rest of this function
        # already uses Python 3 print()).
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                              imagewriter=imagewriter, stripcontrol=stripcontrol)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter, debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return usage()
    for fname in args:
        # FIX: `file(fname, 'rb')` -> `open(fname, 'rb')` (Python 3).
        fp = open(fname, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password, caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
        fp.close()
    device.close()
    outfp.close()
    return
def csa_pdf(document):
    """Parse a CSA speaking-time report PDF into intervention records.

    The channel is recognised by the byte length of the logo image on
    the page; table rows are assigned to the name / organisation /
    duration column by their x coordinate against per-layout thresholds.

    :param document: an open binary file object of the CSA report PDF
    :return: list of dicts with keys chaine, nom, org, duree, date,
             type — or the string "boom" when a page's columns do not
             line up
    """
    # Logo-image raw-data length -> channel name lookup.
    chaines = {
        2752: 'FRANCE INTER', 2746: 'FRANCE INTER', 3054: 'FRANCE INTER',
        2845: 'FRANCE INFO', 2314: 'FRANCE CULTURE', 3116: 'RADIO CLASSIQUE',
        2152: 'BFM', 3249: 'RMC', 2481: 'EUROPE 1', 2897: 'RTL',
        1575: 'TF1', 6084: 'FRANCE 2', 5694: 'FRANCE 3', 1563: 'FRANCE 3',
        8237: 'CANAL+', 18914: 'FRANCE 5', 7541: 'M6',
        1857: 'C8', 1818: 'C8', 1931: 'TMC', 2301: 'BFMTV', 1872: 'CNEWS',
        1833: 'LCI', 2329: 'FRANCEINFO', 5937: 'FRANCEINFO', 2460: 'FRANCEINFO'}
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.pdfpage import PDFPage
    from pdfminer.layout import LTTextBoxHorizontal, LTFigure, LTImage
    itvs = []
    # Create resource manager
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    chaine = ""
    info = False
    for page in PDFPage.get_pages(document):
        #print "page-----------"
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        elts = dict(noms=[], orgs=[], durees=[])
        go = False
        col = ""
        for element in layout:
            #print element
            if isinstance(element, LTFigure):
                # Identify the channel from the embedded logo image.
                for e in element:
                    if isinstance(e, LTImage):
                        if e.stream:
                            data = e.stream.get_rawdata()
                            chaine = chaines[len(data)]
                            #print len(data)
            if isinstance(element, LTTextBoxHorizontal):
                x = element.x1
                txt = element.get_text().strip()
                # Section headers end the current data table.
                if 'TELEVISION' in txt or 'INTERVENTION' in txt or 'PROGRAMMES' in txt:
                    go = False
                if 'TELEVISIONS (AUTRES' in txt:
                    info = False
                if 'TELEVISIONS -' in txt:
                    info = True
                #print txt
                if txt[0:3] == 'Du ':
                    # "Du dd/mm/yyyy" -> (year, month)
                    date = (int(txt[9:13]), int(txt[6:8]))
                if txt in ['MAG', 'JT', 'PROG']:
                    typ = txt
                if go:
                    # Column is decided by the x coordinate thresholds.
                    # NOTE(review): `seuils` / `date` / `typ` are only bound
                    # once the corresponding header cells have been seen —
                    # a malformed page could hit this path unbound; confirm.
                    if x < seuils[0]:
                        col = 'noms'
                    elif x < seuils[1]:
                        col = 'orgs'
                    else:
                        col = 'durees'
                        # "hh:mm:ss" -> seconds
                        txt = int(txt[0:2])*3600 + int(txt[3:5])*60 + int(txt[6:8])
                    #print "-->",txt
                    #print x,txt,col
                    elts[col].append((txt, date, typ))
                # The table header row fixes the column thresholds and
                # turns row collection on.
                if txt == u'Dur\xe9e':
                    seuils = [250 if info else 220, 500]
                    go = True
                elif txt == u'DUREE':
                    seuils = [260, 480]
                    go = True
        if (len(elts['noms']) == len(elts['orgs'])
                or len(elts['noms']) == len(elts['durees'])):
            for i in range(len(elts['noms'])):
                itvs.append(dict(chaine=chaine,
                                 nom=elts['noms'][i][0],
                                 org=elts['orgs'][i][0],
                                 duree=elts['durees'][i][0],
                                 date=elts['durees'][i][1],
                                 type=elts['durees'][i][2]))
        else:
            return "boom"
            # FIX: the line below was a Python 2 `print` statement, which
            # made the whole module a SyntaxError under Python 3. It is
            # unreachable (after return) and kept only for reference.
            print(len(elts['noms']), len(elts['orgs']), len(elts['durees']))
    return itvs
def read_IDMP(fpath, **kwargs):
    '''
    This function will read an IDMP and return the DIDs that are expected
    to be obtained for the event. There is no known way to obtain which TC
    these belong to, however, that will be addressed manually be some user.

    Input:
        fpath - The absolute file path to the IDMP pdf

    Kwargs:
        get_all - True: Returns all the DIDs
                  False: Returns the DIDs that follow our criteria
                    - no "pre" no "ee" TODO-UPDATE THESE BECAUSE I CANNOT RECALL ALL OF THEM
        get_mapping - True: Returns the **** mapping ONLY
                      False: Returns the list of DIDs ONLY

    Returns:
        Returns a list of DIDs expected for the event
    '''
    get_all = kwargs.get('get_all', True)
    get_mapping = kwargs.get('get_mapping', False)
    lines = []
    DIDs = []
    # Per-system DID-number -> parenthesised-value maps.
    mapdict = {'OSF': {}, 'TPY': {}, 'THAAD': {}}
    # Bound method cached once; hot loop below appends every text line.
    ladd = lines.append
    # Only accept an existing .pdf whose path mentions 'IDMP'.
    if os.path.isfile(fpath) and os.path.splitext(
            fpath)[-1] == '.pdf' and 'IDMP' in fpath:
        file_content = open(fpath, 'rb')
        # Legacy pdfminer (pre-pdfminer.six) parser/document handshake.
        parser = PDFParser(file_content)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        #I changed the following 2 parameters to get rid of white spaces inside words:
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Line indices (into `lines`) bracketing the "index of data items"
        # section; 0 doubles as "not found yet".
        first_data_items = 0
        last_data_items = 0
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, LTTextBox) or isinstance(
                        lt_obj, LTTextLine):
                    newlines = lt_obj.get_text().splitlines()
                    for i, l in enumerate(newlines):
                        # Start of the DID index section (first hit only).
                        if l.lower().startswith('(u) index of data items'
                                                ) and first_data_items == 0:
                            first_data_items = i + len(lines) - 1
                        # Introduction heading marks the end of the section.
                        if l.lower().startswith('1.0 (u) introduction') and first_data_items\
                                and len(lines) - 1 + i > first_data_items:
                            last_data_items = i + len(lines) - 1
                        ladd(l)
            # Stop paging once both section boundaries are known.
            if first_data_items and last_data_items:
                break
        if first_data_items and last_data_items:
            RELEVANT_TEXT = lines[first_data_items:last_data_items]
            #This part parses the DID names only
            DIDs = list(map(lambda x: x.split(' ')[0], RELEVANT_TEXT))
            if get_all:
                DIDs = [d for d in DIDs if '-' in d and 'gti' not in d.lower()]
            else:
                # Stricter filter: additionally drop "-pre-" and "ee*" DIDs.
                DIDs = [
                    d for d in DIDs
                    if '-' in d and 'gti' not in d.lower()
                    and not '-pre-' in d.lower() and not d.lower().startswith('ee')
                ]
            #This part maps the first two numbers after the '-' in the DID to the numbers in the ()
            for line in lines:
                DID = line.split(' ')[0]
                if '-' in DID and 'gti' not in DID.lower():
                    if '(' in line and ')...' in line and 'osf-' in DID.lower(
                    ) and '-pre-' not in DID.lower():
                        # Value is the text inside the last '( )' pair;
                        # trailing capital letter (revision tag) is stripped.
                        value = line.split('(')[-1].split(')')[0]
                        if value[-1] in CapAlphabet:
                            value = value[:-1]
                        key = DID.split('-')[-1][:-2]
                        mapdict['OSF'][key] = value
                    if '(' in line and ')...' in line and (
                            'tpy2-' in DID.lower() or 'typ2-' in DID.lower()) and '-pre-' not in DID.lower():
                        value = line.split('(')[-1].split(')')[0]
                        key = DID.split('-')[-1]
                        # NOTE(review): this keeps only the LAST character of
                        # the key when it is a capital letter (contrast with
                        # the OSF branch, which strips the value) — confirm
                        # this asymmetry is intentional.
                        if key[-1] in CapAlphabet:
                            key = key[-1]
                        mapdict['TPY'][key] = value
                    if '(' in line and ')...' in line and 'thaad' in DID.lower(
                    ) and '-pre-' not in DID.lower():
                        value = line.split('(')[-1].split(')')[0]
                        value = 'String' + value.split('String')[-1]
                        key = DID.split('-')[-1][:2]
                        mapdict['THAAD'][key] = value
        else:
            print(
                'Could not find the relevant information needed to parse.. Returning []'
            )
        file_content.close()
    else:
        print(''''The IDMP path given failed one or more of the following:\n 1: not a valid file\n 2: not a pdf\n 3: the file does not contain the string 'IDMP'. ''')
    # Caller chooses between the flat DID list and the per-system mapping.
    if not get_mapping:
        return DIDs
    else:
        return mapdict
class GetPic:
    # Extracts named figure crops from a PDF: pdfminer locates the figure
    # caption ("图/表 ...") and source ("来源 ...") text boxes, PyMuPDF (fitz)
    # renders each page to PNG, and PIL crops the regions between them.

    def __init__(self, filename, password=''):
        """
        Initialise the extractor.

        :param filename: path of the PDF file
        :param password: document password ('' when the PDF has none)
        """
        # NOTE(review): the parser keeps a reference to `file`, which is
        # closed when this `with` block exits — pdfminer parses pages
        # lazily, so page access after __init__ relies on `doc_pdfs`
        # having been materialised below; confirm.
        with open(filename, 'rb') as file:
            # document parser
            self.parser = PDFParser(file)
            # document object
            self.doc = PDFDocument()
            # connect document and parser
            self.parser.set_document(self.doc)
            self.doc.set_parser(self.parser)
            # initialise with the password (empty string when none)
            self.doc.initialize(password)
            # abort when the document does not allow text extraction
            if not self.doc.is_extractable:
                raise PDFTextExtractionNotAllowed
            else:
                # PDF resource manager for shared resources
                self.resource_manager = PDFResourceManager()
                # PDF device object
                self.laparams = LAParams()
                self.device = PDFPageAggregator(self.resource_manager,
                                                laparams=self.laparams)
                # PDF interpreter object
                self.interpreter = PDFPageInterpreter(self.resource_manager,
                                                      self.device)
                # list of pdfminer page objects
                self.doc_pdfs = list(self.doc.get_pages())
                # open the same PDF with fitz: iterable of renderable pages
                self.doc_pics = fitz.open(filename)

    def to_pic(self, doc, zoom, pg, pic_path):
        """
        Render a single PDF page to a PNG image.

        :param doc: fitz page object
        :param zoom: magnification factor (int); larger means higher resolution
        :param pg: index of the page in doc_pics (used as the file name)
        :param pic_path: directory to save the image in
        :return: path of the written PNG
        """
        rotate = int(0)
        trans = fitz.Matrix(zoom, zoom).preRotate(rotate)
        pm = doc.getPixmap(matrix=trans, alpha=False)
        path = os.path.join(pic_path, str(pg)) + '.png'
        pm.writePNG(path)
        return path

    def get_pic_loc(self, doc, tmp=''):
        """
        Locate captioned figure regions on a single page.

        :param doc: pdfminer page object
        :param tmp: caption text carried over from the previous page (used
            when a figure continues across a page break)
        :return: (loc_named_pic, canvas_size, tmp, topNumber, bottomNumber)
            where loc_named_pic is a list of [name, (x1, y1, x2, y2)]
            entries and canvas_size is the page bbox
        """
        self.interpreter.process_page(doc)
        layout = self.device.get_result()
        # page size: bbox tuple (x0, y0, x1, y1)
        canvas_size = layout.bbox
        # caption ("图/表") box coordinates
        loc_top = []
        # source ("来源") box coordinates
        loc_bottom = []
        # caption names with the y1/y2 region to crop
        loc_named_pic = []
        # concatenated text of the page (collected but not returned)
        text_export = ''
        topNumber = 0
        bottomNumber = 0
        # walk every layout object on the page
        for i in layout:
            # print('读取变量数据',i)
            if hasattr(i, 'get_text'):
                text = i.get_text().strip()
                text_export += text
                # caption keywords: lines starting with 图/图表/表
                if re.search(r'^(图表*|表)(\s|\s*\d|\s*[::])', text):
                    loc_top.append((i.bbox, text))
                    topNumber = topNumber + 1
                # source keywords: 来源/资料来源/数据来源
                elif re.search(r'^\n*((来源)|(资料来源)|(数据来源))(\s|[::])', text):
                    bottomNumber = bottomNumber + 1
                    loc_bottom.append((i.bbox, text))
        locname = []
        print('读取一页的结果:topNumber.', topNumber, ' bottomNumber.', bottomNumber)
        i0 = 0
        j0 = 0
        # size_increase = 10
        # name = ''
        print(loc_top)
        print(loc_bottom)
        print(len(loc_top), len(loc_bottom))
        # Pair loc_top and loc_bottom entries by y coordinate (original
        # author notes the logic here is messy).
        if len(loc_top) == 1 and len(loc_bottom) == 0:
            # NOTE(review): locname is still empty here, so the lookup
            # always raises and name becomes '' — confirm intended.
            try:
                name = locname[0][0][1]
            except:
                name = ''
        elif len(loc_top) > 0 and len(loc_bottom) > 0:
            while i0 <= len(loc_top) - 1 and j0 <= len(loc_bottom) - 1:
                # print (i0,j0)
                if loc_top[i0][0][1] < loc_bottom[j0][0][
                        1]:  # source sits above the caption (y grows upward from 0)
                    bottom = [(0, loc_bottom[j0][0][1], canvas_size[2],
                               loc_bottom[j0][0][3]), loc_bottom[j0][1]]
                    locname.append([bottom, 1])
                    j0 += 1
                    continue
                is_binglie = 0  # flag: two figures side by side on one row
                try:
                    if abs(loc_top[i0][0][1] -
                           loc_top[i0 + 1][0][1]) < 10:  # nearly equal y
                        is_binglie = 1
                except:
                    pass
                if is_binglie == 0:
                    if loc_top[i0][0][1] > loc_bottom[j0][0][
                            1]:  # normal stacked caption-over-source case
                        top = [(0, loc_top[i0][0][1], canvas_size[2],
                                loc_top[i0][0][3]), loc_top[i0][1]]  # (x1,y1,x2,y2)
                        locname.append([top, 0])
                        i0 += 1
                    else:
                        bottom = [(0, loc_bottom[j0][0][1], canvas_size[2],
                                   loc_bottom[j0][0][3]), loc_bottom[j0][1]]
                        locname.append([bottom, 1])
                        j0 += 1
                else:
                    # side-by-side captions: how many source lines pair up?
                    is_binglie_laiyuan = 0
                    try:
                        if abs(loc_bottom[j0][0][1] -
                               loc_bottom[j0 + 1][0][1]) < 10:
                            is_binglie_laiyuan = 2
                        else:
                            is_binglie_laiyuan = 1
                    except:
                        try:
                            loc_bottom[j0][0][1]
                            is_binglie_laiyuan = 1
                        except:
                            is_binglie_laiyuan = 0
                    if is_binglie_laiyuan == 2:
                        # two captions, two sources: split the row at the
                        # second caption's x coordinate
                        top1 = [(0, loc_top[i0][0][1], loc_top[i0 + 1][0][0],
                                 loc_top[i0][0][3]), loc_top[i0][1]]
                        locname.append([top1, 0])
                        bottom1 = [
                            (0, loc_bottom[j0][0][1], loc_top[i0 + 1][0][0],
                             loc_bottom[j0][0][3]), loc_bottom[j0][1]
                        ]
                        locname.append([bottom1, 1])
                        top2 = [(loc_top[i0 + 1][0][0], loc_top[i0 + 1][0][1],
                                 canvas_size[2], loc_top[i0 + 1][0][3]),
                                loc_top[i0 + 1][1]]
                        locname.append([top2, 0])
                        bottom2 = [(loc_top[i0 + 1][0][0],
                                    loc_bottom[j0 + 1][0][1], canvas_size[2],
                                    loc_bottom[j0 + 1][0][3]),
                                   loc_bottom[j0 + 1][1]]
                        locname.append([bottom2, 1])
                        i0 += 2
                        j0 += 2
                    elif is_binglie_laiyuan == 1:
                        # two captions sharing a single source line
                        top1 = [(0, loc_top[i0][0][1], loc_top[i0 + 1][0][0],
                                 loc_top[i0][0][3]), loc_top[i0][1]]
                        locname.append([top1, 0])
                        bottom1 = [
                            (0, loc_bottom[j0][0][1], loc_top[i0 + 1][0][0],
                             loc_bottom[j0][0][3]), loc_bottom[j0][1]
                        ]
                        locname.append([bottom1, 1])
                        top2 = [(loc_top[i0 + 1][0][0], loc_top[i0 + 1][0][1],
                                 canvas_size[2], loc_top[i0 + 1][0][3]),
                                loc_top[i0 + 1][1]]
                        locname.append([top2, 0])
                        bottom2 = [(loc_top[i0 + 1][0][0],
                                    loc_bottom[j0][0][1], canvas_size[2],
                                    loc_bottom[j0][0][3]), loc_bottom[j0][1]]
                        locname.append([bottom2, 1])
                        i0 += 2
                        j0 += 1
                    else:
                        # two captions without any source line
                        top1 = [(0, loc_top[i0][0][1], loc_top[i0 + 1][0][0],
                                 loc_top[i0][0][3]), loc_top[i0][1]]
                        top2 = [(loc_top[i0 + 1][0][0], loc_top[i0 + 1][0][1],
                                 canvas_size[2], loc_top[i0 + 1][0][3]),
                                loc_top[i0 + 1][1]]
                        locname.append([top1, 0])
                        locname.append([top2, 0])
                        i0 += 2
                # drain whichever list is exhausted first
                if i0 == len(loc_top):
                    while j0 <= len(loc_bottom) - 1:
                        locname.append([loc_bottom[j0], 1])
                        j0 += 1
                if j0 == len(loc_bottom):
                    while i0 <= len(loc_top) - 1:
                        locname.append([loc_top[i0], 0])
                        i0 += 1
            # final drain after the pairing loop
            if i0 == len(loc_top):
                while j0 <= len(loc_bottom) - 1:
                    locname.append([loc_bottom[j0], 1])
                    j0 += 1
            if j0 == len(loc_bottom):
                while i0 <= len(loc_top) - 1:
                    locname.append([loc_top[i0], 0])
                    i0 += 1
        k = 0
        loc_named_pic = []
        # print(locname)
        '''
        Convert locname into loc_named_pic.
        '''
        while k <= len(locname) - 1:
            # print(k)
            if locname[0][1] == 1:
                # first entry is a source line: the figure started on the
                # previous page — crop from the page top down to it, naming
                # it with the carried-over caption `tmp`
                x1 = canvas_size[0]
                x2 = canvas_size[2]
                y1 = canvas_size[3]
                y2 = locname[0][0][0][3]
                name = tmp
                loc_named_pic.append([name, (x1, y1, x2, y2)])
                name = ''
                k += 1
            elif locname[k][1] == 0:  # found a caption (figure head)
                name += locname[k][0][1]
                if k + 1 < len(locname):  # k is the caption row
                    ii = k + 1
                    while ii < len(locname):  # ii searches for the source row
                        if locname[ii][1] == 0:  # ii is another caption
                            name += ' ' + locname[ii][0][1]
                            ii += 1
                        else:  # ii is the source row: emit the crop region
                            x1 = locname[k][0][0][0]
                            x2 = locname[k][0][0][2]
                            y1 = locname[k][0][0][3]
                            y2 = locname[ii][0][0][1]
                            loc_named_pic.append([name, (x1, y1, x2, y2)])
                            name = ''
                            k = ii + 1
                            ii += 1
                            continue
                    k += 1
                else:
                    k += 1
            else:
                k += 1
        # carry any unterminated caption over to the next page
        tmp = name
        return loc_named_pic, canvas_size, tmp, topNumber, bottomNumber

    def get_crops(self, pic_path, canvas_size, position, cropped_pic_name,
                  cropped_pic_path):
        """
        Crop the given region out of a rendered page image.

        :param pic_path: path of the page PNG to crop from
        :param canvas_size: page bbox in PDF coordinates (0, 0, width, height)
        :param position: region to crop, in PDF coordinates
        :param cropped_pic_name: display name of the crop
        :param cropped_pic_path: directory to save the crop in
        :return: (text0, log0) — a '|'-joined metadata string on success,
            or a log string on failure
        """
        img = Image.open(pic_path)
        # rendered image size: tuple (width, height)
        pic_size = img.size
        count = 0
        # padding added around the crop region (original note: unfinished)
        size_increase = 10
        # scale PDF x coordinates to image pixels
        x1 = max(position[0] - size_increase, 0) * (pic_size[0] / canvas_size[2])
        x2 = min(position[2] + size_increase, canvas_size[2]) * (pic_size[0] /
                                                                 canvas_size[2])
        # y1 = pic_size[1] * (1 - (position[0] + size_increase)/canvas_size[3])
        # y2 = pic_size[1] * (1 - (position[1] - size_increase)/canvas_size[3])
        # PDF y grows upward, image y grows downward — hence the (1 - …) flip
        y1 = max(0, (1 - (position[1] + size_increase) / canvas_size[3]) *
                 pic_size[1])
        y2 = min(pic_size[1], (1 - (position[3] - size_increase) /
                               canvas_size[3]) * pic_size[1])
        # print(x1,x2,y1,y2)
        cropped_img = img.crop((x1, y1, x2, y2))
        cropped_pic_name = cropped_pic_name + str(count)
        # strip characters that are illegal or awkward in file names
        cropped_pic_name = cropped_pic_name.replace('/', '')
        cropped_pic_name = cropped_pic_name.replace(' ', '')
        cropped_pic_name = cropped_pic_name.replace('\\', '')
        cropped_pic_name = cropped_pic_name.replace(':', '')
        cropped_pic_name = cropped_pic_name.replace('*', '')
        cropped_pic_name = cropped_pic_name.replace('?', '')
        cropped_pic_name = cropped_pic_name.replace('"', '')
        cropped_pic_name = cropped_pic_name.replace('<', '')
        cropped_pic_name = cropped_pic_name.replace('>', '')
        cropped_pic_name = cropped_pic_name.replace('|', '')
        cropped_pic_name = cropped_pic_name.replace('\n', '')
        cropped_pic_name = cropped_pic_name.replace('\r', '')
        cropped_pic_name = cropped_pic_name.replace('\f', '')
        if len(cropped_pic_name) > 50:
            cropped_pic_name = cropped_pic_name[0:49]
        count += 1
        # random 8-digit id used as the actual file name
        rand0 = str(random.randint(10000000, 99999999))
        text0 = []
        log0 = []
        try:
            path = os.path.join(cropped_pic_path, rand0) + '.png'
            cropped_img.save(path)
            text0 = cropped_pic_name + '|' + rand0 + '|' + str(x1) + '|' + str(
                x2) + '|' + str(y1) + '|' + str(y2)
            # print(text0)
            return text0, log0
            # print('成功截取图片:', cropped_pic_name)
        except:
            log0 = cropped_pic_path + cropped_pic_name
            print('失败', cropped_pic_name)
            return text0, log0
        # pass

    def main(self, pic_path, cropped_pic_path, pgn=None, tmp=''):
        """
        Entry point: render one page, locate its figures and crop them.

        :param pic_path: directory for the rendered page image
        :param cropped_pic_path: directory for the figure crops
        :param pgn: index of the page to process (None does nothing)
        :param tmp: caption carried over from the previous page
        :return: (tmp, text_total, log_total, topNumber, bottomNumber)
        """
        text_total = []
        log_total = []
        topNumber = 0
        bottomNumber = 0
        if pgn is not None:
            # pdfminer page and fitz page for the same index
            doc_pdf = self.doc_pdfs[pgn]
            doc_pic = self.doc_pics[pgn]
            # render the page to PNG; returns the image path
            path = self.to_pic(doc_pic, 2, pgn, pic_path)
            loc_name_pic, canvas_size, tmp, topNumber, bottomNumber = self.get_pic_loc(
                doc_pdf, tmp=tmp)
            print(pgn)
            if loc_name_pic:
                for i in loc_name_pic:
                    position = i[1]
                    cropped_pic_name = re.sub('/', '_', i[0])
                    text1, log1 = self.get_crops(path, canvas_size, position,
                                                 cropped_pic_name,
                                                 cropped_pic_path)
                    if text1:
                        # append the page number to the metadata record
                        text1 = text1 + '|' + str(pgn)
                        text_total.append(text1)
                    if log1:
                        log1 = log1 + '|第' + str(pgn) + '页出错'
                        log_total.append(log1)
        return tmp, text_total, log_total, topNumber, bottomNumber
#链接解释器和文档对象 parser.set_document(doc) doc.set_parser(parser) #初始化文档 doc.initialize("") #创建PDF资源管理器 resoure = PDFResourceManager() #参数分析器 laparam = LAParams() #创建聚合器 device = PDFPageAggregator(resoure, laparams=laparam) #创建页面解释器 interpreter = PDFPageInterpreter(resoure, device) #使用文档对象得到页面的集合 for page in doc.get_pages(): #使用页面解释起来读取 interpreter.process_page(page) #使用聚合器获得内容 layout = device.get_result() for out in layout: if hasattr(out, "get_text"): print(out.get_text())
class SvSpecParser():
    # Extracts BNF grammar rules ("name ::= ...") from a SystemVerilog
    # specification PDF and writes them to `ofile` as lightly-tagged HTML,
    # wrapping runs of characters in <b>/<it> according to their PDF font.

    # Map of PDF font names to the HTML tag used for that font
    # (None means: emit the text untagged).
    FONT_TRANSLATION = {
        # "HEFBHG+TimesNewRomanPS-ItalicMT": "it",
        # "HEFBAE+TimesNewRomanPS-BoldMT": "b",
        'BVXWSQ+CourierNew,Bold': 'b',
        'BHDFJL+TimesNewRomanPSMT': None,
        'WTCCEL+TimesNewRoman,Italic': None,
        None: None
    }

    def __init__(self, ofile):
        # `ofile`: writable text stream receiving the HTML output.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        self.device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
        # font of the previously emitted character (None = outside any tag)
        self.last_font = None
        # True while the cursor is inside a grammar rule block
        self.in_rule = False
        # set when a font change must be flushed before the next visible char
        self.font_print_pending = False
        self.ofile = ofile
        # suppresses the rule separator before the very first rule
        self.first_rule = True

    def parse_page(self, page):
        # Process one PDF page and feed its layout objects to parse_obj.
        self.interpreter.process_page(page)
        layout = self.device.get_result()
        self.parse_obj(layout._objs)

    def collect_lines(self, o):
        # Yield the LTTextLine children of a text box, recursing into figures.
        if isinstance(o, LTTextBox):
            lines = [_o for _o in o._objs if isinstance(_o, LTTextLine)]
            yield from lines
        elif isinstance(o, LTFigure):
            yield from self.collect_lines(o._objs)
        return

    def parse_obj(self, objs):
        """Walk the page's text lines top-to-bottom and emit rule text."""
        font_translation = self.FONT_TRANSLATION
        # currently open HTML tag name (or None)
        f = None
        tmp_lines = []
        for o in objs:
            tmp_lines.extend(self.collect_lines(o))
        # sort visually: pdfminer's y0 grows upward, so reverse for top-first
        tmp_lines.sort(key=lambda o: o.y0, reverse=True)
        tmp_lines = tmp_lines[2:-3]  # cut off header and footer
        for o in tmp_lines:
            text = o.get_text()
            # print(text)
            # a line containing '::=' starts a new grammar rule
            is_rule_header = "::=" in text
            if is_rule_header or self.in_rule:
                if is_rule_header:
                    if not self.first_rule:
                        # separator between consecutive rules
                        self.ofile.write("\n</br>\n")
                    else:
                        self.first_rule = False
                # if is_rule_header and text.startswith("unary_module_path_operator"):
                #     print("----------")
                self.in_rule = True
                if not is_rule_header:
                    # a non-indented line (x0 < 85) ends the current rule
                    if text and o.x0 < 85:
                        self.in_rule = False
                        continue
                    # continuation lines are joined with a single space
                    self.ofile.write(" ")
                if text.strip():
                    for c in o._objs:
                        is_char = isinstance(c, LTChar)
                        if is_char:
                            if c.fontname == 'BHDEOM+Arial-BoldMT':
                                # section title font: rule block is over
                                self.in_rule = False
                                # title
                                break
                        if (is_char and
                                c.matrix[-1] - o._objs[0].matrix[-1] > 3.5):
                            # sys.stderr.write(c.get_text())
                            # skipping hrefs, which are upper indexes
                            continue
                        if is_char and self.last_font != c.fontname:
                            # this character has different font need to
                            # propagate it to output
                            self.font_print_pending = True
                        if c.get_text().isspace() and font_translation[
                                self.last_font] is not None:
                            # print the font enclosing string directly after
                            # this word (ignore whitespaces behind)
                            self.font_print_pending = True
                            self.ofile.write("</%s>" % f)
                            self.last_font = None
                        if self.font_print_pending and not (
                                c.get_text().isspace()):
                            # flush the pending font switch: close the old
                            # tag (if any) and open the new one
                            self.font_print_pending = False
                            f = font_translation[self.last_font]
                            if f:
                                self.ofile.write("</%s>" % f)
                            f = font_translation[c.fontname]
                            if f:
                                self.ofile.write("<%s>" % f)
                            self.last_font = c.fontname
                        # if text.startswith("list_of_port_declarations") and c.get_text() == "s":
                        #     print("----------")
                        self.ofile.write(c.get_text())
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure, LTImage, LTChar

# Ask for a file name; default to 'HB.pdf' on empty input.
# NOTE(review): raw_input is Python 2 only, yet PDFDocument(parser) below is
# the pdfminer.six (Python 3) constructor — confirm the target interpreter.
a = raw_input('enter the file name:\n')
if len(a) < 1:
    a = 'HB.pdf'
fp = open(a, 'rb')
# NOTE(review): PDFParser is not among the imports above (it lives in
# pdfminer.pdfparser) — presumably imported elsewhere in the file; verify.
parser = PDFParser(fp)
doc = PDFDocument(parser)
rs = PDFResourceManager()
lapara = LAParams()
device = PDFPageAggregator(rs, laparams=lapara)
inte = PDFPageInterpreter(rs, device)
# fields to be scraped from the document
name = None
ID = None
state = None
phone = None
a = []  # NOTE(review): silently reuses the filename variable `a` as a list
for page in PDFPage.create_pages(doc):
    inte.process_page(page)
    layout = device.get_result()
    count = 0
    # pdb.set_trace()
    state_index = None
    phone_index = None
    # NOTE(review): the body of this loop is missing from the visible chunk.
    for x in layout:
def parse_page_box(pdf_file_path, line_overlap=0.2, char_margin=0.1, line_margin=0.2,
                   word_margin=0.1, boxes_flow=0.5, detect_vertical=False,
                   all_texts=False):
    """Extract per-page horizontal text boxes from a PDF.

    Builds the pdfminer pipeline (parser -> document -> resource manager
    -> aggregator -> interpreter), then converts every page into a
    PDFTools page box holding one text box per non-empty
    LTTextBoxHorizontal.

    :param pdf_file_path: path of the PDF file to parse
    :param line_overlap: forwarded to LAParams (layout analysis)
    :param char_margin: forwarded to LAParams
    :param line_margin: forwarded to LAParams
    :param word_margin: forwarded to LAParams
    :param boxes_flow: forwarded to LAParams
    :param detect_vertical: forwarded to LAParams
    :param all_texts: forwarded to LAParams
    :return: list of page boxes, one per page
    :raises PDFTextExtractionNotAllowed: if the document forbids text
        extraction
    """
    # FIX: the original opened the file and never closed it; the context
    # manager releases the handle even when parsing raises.
    with open(pdf_file_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        # guard clause instead of the original if/else nesting
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        resources_manger = PDFResourceManager()
        la_params = LAParams(line_overlap=line_overlap,
                             detect_vertical=detect_vertical,
                             all_texts=all_texts,
                             word_margin=word_margin,
                             line_margin=line_margin,
                             char_margin=char_margin,
                             boxes_flow=boxes_flow)
        device = PDFPageAggregator(resources_manger, laparams=la_params)
        interpreter = PDFPageInterpreter(resources_manger, device)
        page_no = 0
        page_box_list = list()
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            page_box = PDFTools.create_page_box()
            page_box.reset_page_box(start_x=layout.x0,
                                    start_y=layout.y0,
                                    end_x=layout.x1,
                                    end_y=layout.y1,
                                    page_no=page_no)
            for box in layout:
                if isinstance(box, LTTextBoxHorizontal):
                    content = box.get_text().strip(u'\n ')
                    # skip boxes that are empty after stripping
                    if content == u'':
                        continue
                    text_box = PDFTools.create_text_box()
                    font_dict = PDFTools.get_font_dict(box=box)
                    text_box.reset_text_box(start_x=box.x0,
                                            start_y=box.y0,
                                            end_x=box.x1,
                                            end_y=box.y1,
                                            content=content,
                                            font_dict=font_dict)
                    page_box.add_text_box(text_box=text_box)
            page_box_list.append(page_box)
            page_no += 1
        return page_box_list
def main(argv):
    """Extract the pseudo-JSON block ("For DIVA" section) from a thesis PDF.

    Converts the whole PDF to text, locates the €€€€-delimited DIVA
    section, repairs the smart-quoted pseudo JSON into real JSON, pulls
    out per-language abstracts and keywords, optionally spells out
    acronyms, and writes the resulting dict to a JSON file.
    """
    global Verbose_Flag
    global Use_local_time_for_output_flag
    global testing

    argp = argparse.ArgumentParser(description="extract_pseudo_JSON-from_PDF.py: Extract the pseudo JSON from the end of the thesis PDF file")

    argp.add_argument('-v', '--verbose', required=False,
                      default=False,
                      action="store_true",
                      help="Print lots of output to stdout")

    argp.add_argument('-t', '--testing',
                      default=False,
                      action="store_true",
                      help="execute test code"
                      )

    argp.add_argument('-p', '--pdf',
                      type=str,
                      default="test.pdf",
                      help="read PDF file"
                      )

    argp.add_argument('-j', '--json',
                      type=str,
                      default="calendar_event.json",
                      help="JSON file for extracted calendar event"
                      )

    argp.add_argument('-a', '--acronyms',
                      type=str,
                      default="acronyms.tex",
                      help="acronyms filename"
                      )

    argp.add_argument('-l', '--ligatures',
                      default=False,
                      action="store_true",
                      help="leave ligatures rahter than replace them"
                      )

    args = vars(argp.parse_args(argv))

    Verbose_Flag = args["verbose"]

    filename = args["pdf"]
    if Verbose_Flag:
        print("filename={}".format(filename))

    #output_string = StringIO()
    output_string = BytesIO()
    # Convert the full PDF to plain text held in output_string.
    with open(filename, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        #device = HTMLConverter(rsrcmgr, output_string, laparams=LAParams(), layoutmode='normal', codec='utf-8')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)

    text = output_string.getvalue().decode('UTF-8')
    if Verbose_Flag:
        print("text: {}".format(text))

    # define the maker string
    quad__euro_marker = '€€€€'
    # look for the new start of the For DiVA information
    diva_start = text.find("{0} For DIVA {0}".format(quad__euro_marker))
    if diva_start < 0:
        # if not found, then try the older For DIVA string
        diva_start = text.find("For DIVA")
    if Verbose_Flag:
        print("For DIVA found at diva_start={}".format(diva_start))
    if diva_start >= 0:
        diva_data = text[:]
        diva_data = diva_data[diva_start:]
        # the pseudo JSON starts at the first opening brace
        diva_start = diva_data.find("{")
        if diva_start >= 0:
            diva_data = diva_data[diva_start:]
            end_block = diva_data.find('”Number of lang instances”:')  # note these are right double quote marks
            if end_block < 0:
                end_block = diva_data.find('"Number of lang instances":')  # note these are straight double quote marks
            if end_block > 0:
                end_block = diva_data.find(',', end_block)
                if end_block > 0:
                    dict_string = diva_data[:]
                    dict_string = dict_string[:end_block]+'}'
                    # NOTE(review): the first argument below appears to have
                    # been a form-feed (new page) character that was lost in
                    # extraction; as written the call is a no-op — confirm.
                    dict_string = dict_string.replace('', '')  # remove any new page characters
                    # normalise smart quotes and trailing commas so the
                    # pseudo JSON parses with json.loads
                    dict_string = dict_string.replace('”', '"')
                    dict_string = dict_string.replace('\n\n', '\n')
                    dict_string = dict_string.replace(' \n', '')
                    dict_string = dict_string.replace(',}', '}')
                    dict_string = dict_string.replace('”', '"')
                    #dict_string=dict_string.replace('"', '"')
                    #dict_string=dict_string.replace('<br>', '\n')
                    #dict_string=dict_string.replace('<br>"', '\n"')
                    #dict_string=dict_string.replace('<br>}', '\n}')
                    dict_string = dict_string.replace(',\n\n}', '\n}')
                    dict_string = dict_string.replace(',\n}', '\n}')
                    # fix an error in the early template
                    if dict_string.find(',Äddress": ') > 0:
                        print("fix an error in the early template")
                        dict_string = dict_string.replace(',Äddress": ', ',"Address": "')
                    dict_string = dict_string.replace('\"Lindstedtsvägen', 'Lindstedtsvägen')
                    dict_string = dict_string.replace('¨Lindstedtsvägen', 'Lindstedtsvägen')
                    dict_string = dict_string.replace('¨Isafjordsgatan', 'Isafjordsgatan')
                    if not args['ligatures']:
                        dict_string = replace_ligature(dict_string)
                        print("looking for and replacing ligatures")
                    if Verbose_Flag:
                        print("dict_string={}".format(dict_string))
                    print("dict_string={}".format(dict_string))
                    d = json.loads(dict_string)
                    if Verbose_Flag:
                        print("d={}".format(d))

                    # everything after the dict holds abstracts and keywords
                    abs_keywords = diva_data[(end_block+1):]
                    abs_keywords = abs_keywords.replace('', '')
                    if Verbose_Flag:
                        print("abs_keywords={}".format(abs_keywords))

                    number_of_quad_euros = abs_keywords.count(quad__euro_marker)
                    if Verbose_Flag:
                        print("number_of_quad_euros={}".format(number_of_quad_euros))
                    abstracts = dict()
                    keywords = dict()
                    # markers should come in pairs (open/close per language)
                    if (number_of_quad_euros % 2) == 1:
                        print("Odd number of markers")

                    save_abs_keywords = abs_keywords[:]

                    number_of_pairs_of_markers = int(number_of_quad_euros/2)
                    # first pass: collect the per-language abstracts
                    for i in range(0, number_of_pairs_of_markers):
                        abstract_key_prefix = '”Abstract['
                        key_offset = abs_keywords.find(abstract_key_prefix)
                        if key_offset > 0:
                            # found a key for an abstract
                            # get language code
                            lang_code_start = key_offset+len(abstract_key_prefix)
                            lang_code = abs_keywords[lang_code_start:lang_code_start+3]
                            quad__euro_marker_start = abs_keywords.find(quad__euro_marker, lang_code_start)
                            if quad__euro_marker_start >= 0:
                                quad__euro_marker_end = abs_keywords.find(quad__euro_marker, quad__euro_marker_start + 5)
                                abstracts[lang_code] = abs_keywords[quad__euro_marker_start+5:quad__euro_marker_end]
                                #br_offset=abstracts[lang_code].find('<br>')
                                #if br_offset >= 0:
                                #    abstracts[lang_code]=abstracts[lang_code][br_offset+4:]
                                abs_keywords = abs_keywords[quad__euro_marker_end+1:]

                    # second pass over the saved text: collect keywords
                    abs_keywords = save_abs_keywords[:]
                    for i in range(0, number_of_pairs_of_markers):
                        abstract_key_prefix = '”Keywords['
                        key_offset = abs_keywords.find(abstract_key_prefix)
                        if key_offset > 0:
                            # found a key for an abstract
                            # get language code
                            lang_code_start = key_offset+len(abstract_key_prefix)
                            lang_code = abs_keywords[lang_code_start:lang_code_start+3]
                            quad__euro_marker_start = abs_keywords.find(quad__euro_marker, lang_code_start)
                            if quad__euro_marker_start > 0:
                                quad__euro_marker_end = abs_keywords.find(quad__euro_marker, quad__euro_marker_start + 5)
                                keywords[lang_code] = abs_keywords[quad__euro_marker_start+5:quad__euro_marker_end]
                                keywords[lang_code] = keywords[lang_code].replace('\n', '')  # remove newlines from keywords
                                keywords[lang_code] = keywords[lang_code].strip()  # remove starting end ending white space
                                br_offset = keywords[lang_code].find('<br>')
                                if br_offset >= 0:
                                    keywords[lang_code] = keywords[lang_code][br_offset+4:]
                                abs_keywords = abs_keywords[quad__euro_marker_end+1:]

                    for a in abstracts:
                        print("a={0}, abstract={1}".format(a, abstracts[a]))
                        abstracts[a] = clean_up_abstract(abstracts[a])

                    # optionally expand acronyms found in the abstracts
                    any_acronyms_in_abstracts = False
                    for a in abstracts:
                        acronyms_present = check_for_acronyms(abstracts[a])
                        if acronyms_present:
                            any_acronyms_in_abstracts = True
                    if any_acronyms_in_abstracts:
                        acronyms_filename = args["acronyms"]
                        print("Acronyms found, getting acronyms from {}".format(acronyms_filename))
                        acronym_dict = get_acronyms(acronyms_filename)
                        if len(acronym_dict) == 0:
                            print("no acronyms found in {}".format(acronyms_filename))
                        else:
                            # entries of the form: acronym_dict[label]={'acronym': acronym, 'phrase': phrase}
                            for a in abstracts:
                                abstracts[a] = spellout_acronyms_in_abstract(acronym_dict, abstracts[a])

                    print("abstracts={}".format(abstracts))
                    print("keywords={}".format(keywords))
                    d['abstracts'] = abstracts
                    d['keywords'] = keywords
                    output_filename = args["json"]
                    if Verbose_Flag:
                        print("output_filename={}".format(output_filename))
                    with open(output_filename, 'w', encoding='utf-8') as output_FH:
                        j_as_string = json.dumps(d, ensure_ascii=False)
                        print(j_as_string, file=output_FH)
            else:
                # No language-instance count: repair and dump the whole
                # pseudo JSON block as-is.
                print('No "Number of lang instances" found')
                dict_string = diva_data[:]
                print("initial dict_string={}".format(dict_string))
                dict_string = dict_string.replace('', '')  # remove any new page characters
                dict_string = dict_string.replace('”', '"')
                dict_string = dict_string.replace('\n\n', '\n')
                dict_string = dict_string.replace(' \n', '')
                dict_string = dict_string.replace(',}', '}')
                #dict_string=dict_string.replace('"', '"')
                #dict_string=dict_string.replace('<br>', '\n')
                #dict_string=dict_string.replace('<br>"', '\n"')
                #dict_string=dict_string.replace('<br>}', '\n}')
                dict_string = dict_string.replace(',\n\n}', '\n}')
                dict_string = dict_string.replace(',\n}', '\n}')
                # fix an error in the early template
                if dict_string.find(',Äddress": ') > 0:
                    print("fix an error in the early template")
                    dict_string = dict_string.replace(',Äddress": ', ',"Address": "')
                dict_string = dict_string.replace('\"Lindstedtsvägen', 'Lindstedtsvägen')
                dict_string = dict_string.replace('¨Lindstedtsvägen', 'Lindstedtsvägen')
                dict_string = dict_string.replace('¨Isafjordsgatan', 'Isafjordsgatan')
                if not args['ligatures']:
                    dict_string = replace_ligature(dict_string)
                    print("looking for and replacing ligatures")
                print("dict_string={}".format(dict_string))
                d = json.loads(dict_string)
                print("d={}".format(d))
                output_filename = args["json"]
                if Verbose_Flag:
                    print("output_filename={}".format(output_filename))
                with open(output_filename, 'w', encoding='utf-8') as output_FH:
                    j_as_string = json.dumps(d, ensure_ascii=False)
                    print(j_as_string, file=output_FH)
def parse(file_name):
    """Extract per-student fields from a student-report PDF.

    The report alternates page roles: when ``page_number`` is even and the
    page has not been read yet, the first text box is taken as the course
    name; every other text box is scanned with the ``get_*`` helpers for
    student fields (name, LASID, DOB, ...).

    NOTE(review): ``page_number`` is initialised to 0 and never incremented
    (the increment is commented out upstream), so *every* page currently
    takes the even-page branch for its first text box. Preserved as-is.

    Args:
        file_name: path of the PDF file to read.

    Returns:
        useful: list of dicts, one per page, holding the detected fields
        (unmatched fields stay as empty strings).

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # Open in binary mode and wire parser <-> document (legacy pdfminer API).
    fp = open(file_name, 'rb')
    praser = PDFParser(fp)
    doc = PDFDocument()
    praser.set_document(doc)
    doc.set_parser(praser)

    useful = []  # one record per processed page

    # Initialize with no password.
    doc.initialize()
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Field name -> extractor helper; each returns a truthy value on match.
    extractors = {
        "name": get_name,
        "LASID": get_LASID,
        "DOB": get_DOB,
        "Grade": get_Grade,
        "RD": get_RD,
        "School": get_School,
        "District": get_District,
        "Score": get_Score,
        "Score_level": get_Score_level,
        "low_top": get_low_top,
    }

    def _empty_record():
        # Fresh record with a consistent shape. The original code rebuilt
        # the reset dict by hand and dropped the "course" key, so records
        # after the first one had a different shape — fixed here.
        record = {key: "" for key in extractors}
        record["course"] = ""
        return record

    page_number = 0  # never advanced; see NOTE(review) in the docstring
    temp_dict = _empty_record()
    for page in doc.get_pages():
        interpreter.process_page(page)
        layout = device.get_result()  # LTPage for this page
        read_flag = 0  # set once the course line of the page was consumed
        for x in layout:
            if isinstance(x, LTTextBoxHorizontal):
                results = x.get_text()
                print(results)
                if page_number % 2 == 0 and read_flag == 0:
                    # First unread text box of an "even" page: course name.
                    temp_dict["course"] = results.split("\n")[0]
                    read_flag = 1
                    continue
                # Scan the box for every known field, calling each helper
                # exactly once (the original called each twice per field).
                for key, extract in extractors.items():
                    value = extract(results)
                    if value:
                        temp_dict[key] = value
        # One record per page (the original guarded this with a dead `if 1:`).
        useful.append(temp_dict)
        temp_dict = _empty_record()
    return useful
if not file_name.endswith(".pdf"): continue f = open(os.path.join(os.getcwd(), file_name), "rb") parser = PDFParser(f) document = PDFDocument(parser, "") if not document.is_extractable: print("PERMISSION DENIED!!!!") continue rsmanager = PDFResourceManager() outfile = file_name[:len(file_name)-4]+".txt" outfp = open(outfile, 'w', encoding='utf-8') laparams = LAParams() device = TextConverter(rsmanager, outfp, laparams=laparams,imagewriter=None) intrptr = PDFPageInterpreter(rsmanager, device) for page in PDFPage.get_pages(f): intrptr.process_page(page) device.close() outfp.close() f.close() searching.find() #os.system('python file.py')
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter, HTMLConverter, XMLConverter
from pdfminer.layout import LAParams
import io

# Source PDF; the converted XML is written beside it as "<path>Converted.xml".
path = "B:\\Alien Brain\\Python Warm-Up\\Test\\2.pdf"

mem = io.StringIO()  # in-memory sink for the converter output
lp = LAParams()
rm = PDFResourceManager()
# Alternative converters, kept for reference:
# cnv = TextConverter(rm, mem, laparams=lp)
# cnv = HTMLConverter(rm, mem, laparams=lp)
cnv = XMLConverter(rm, mem, laparams=lp)
ip = PDFPageInterpreter(rm, cnv)

# Context managers guarantee the input PDF and the output file are closed
# (the original leaked both handles and never closed the converter device).
with open(path, 'rb') as pdf:
    for page in PDFPage.get_pages(pdf):
        ip.process_page(page)
cnv.close()

text = mem.getvalue()
# `out_file` instead of `file`, which shadowed the (py2) builtin.
with open(path + "Converted.xml", 'wb') as out_file:
    out_file.write(text.encode('utf-8'))
print("DONE")
def purgeextract(infilename):
    """Extract C-Trap entrance/exit lens calibration results from a report PDF.

    The whole document is flattened to text, split into lines, and scanned
    for the two calibration section headers; the first following line that
    contains 'result:' supplies the value for that section.

    Args:
        infilename: path of the calibration report PDF.

    Returns:
        outlist: list of ['Entrance'|'Exit', <setting>, <result>] entries
        (or the string 'indexissue' when a result line could not be split)
        when at least one section matched; 0 when the PDF held no purge
        values at all.
    """
    outlist = list()
    # `with` guarantees the report handle is closed on every path; the
    # original closed it by hand in both return branches.
    with open(infilename, 'rb') as report:
        # setup (legacy pdfminer parser/document pairing)
        parser = PDFParser(report)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        # Layout parameters tuned for this report format.
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Flatten every text object of every page into one string.
        extracted_text = ''
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    extracted_text += lt_obj.get_text()
    texttolist = extracted_text.split('\n')

    # Scan for the two calibration sections; `found` counts matched headers
    # so the scan can stop doing work once both were seen.
    found = 0
    for i in range(len(texttolist)):
        if found < 2:
            if texttolist[i] == 'Type of calibration: Ion transfer (pos): Optimize C-Trap Entrance Lens --- Inject # (V)':
                stop = 0
                # Distance to the result line varies between calibrations,
                # so scan forward until the first 'result:' line.
                for y in range(len(texttolist) - (i + 1)):
                    if 'result:' in texttolist[i + y].split() and stop == 0:
                        try:
                            outlist.append(['Entrance',
                                            texttolist[i + y].split('->')[0].split()[-1],
                                            texttolist[i + y].split('->')[1]])
                        except (IndexError):
                            outlist.append('indexissue')
                        # Stop after the first hit so the exit value further
                        # down is not captured as well.
                        stop = 1
                found += 1
            if texttolist[i] == 'Type of calibration: HCD Transfer: Optimize C-Trap Exit Lens --- Purge # (V)':
                stop = 0
                for y in range(len(texttolist) - (i + 1)):
                    if 'result:' in texttolist[i + y].split() and stop == 0:
                        try:
                            outlist.append(['Exit',
                                            texttolist[i + y].split('->')[0].split()[-1],
                                            texttolist[i + y].split('->')[1]])
                        except (IndexError):
                            outlist.append('indexissue')
                        stop = 1
                found += 1
    if found != 0:
        return outlist
    return 0
def decode_pdf(filename):
    """Render the first page of a paper PDF to HTML and mine its metadata.

    The page is converted with pdfminer's HTMLConverter, then fed
    line-by-line (split on '<br>') to analyze(), which fills the
    module-level globals title/authors/abstract/keywords as a side effect.

    NOTE(review): Python 2 code — uses the ``file()`` builtin and
    ``str.decode('utf-8')``; it will not run unchanged on Python 3.

    Returns:
        [title, authors, abstract, keywords] with the strings decoded
        to unicode.
    """
    # analyze() communicates through these module globals; reset them all
    # before processing a new document.
    global current_section
    global pre_section
    global pre_font_family
    global pre_font_size
    global title
    global authors
    global abstract
    global keywords
    current_section = ""
    pre_section = TAG_BEGIN
    pre_font_family = ""
    pre_font_size = ""
    title = ""
    authors = set()
    abstract = ""
    keywords = ""
    path = basedir + "/static/demos/paperminer/papers/" + filename
    # layout parameters
    laparams = LAParams()
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    outtype = 'html'  # only the 'html' branch is taken below
    out = StringIO()
    # Opens a file for reading only in binary format. The file pointer is
    # placed at the beginning of the file. This is the default mode.
    fp = file(path, 'rb')
    # parse PDF to HTML
    codec = 'utf-8'
    if outtype == 'text':
        device = TextConverter(rsrcmgr, out, codec=codec, laparams=laparams,
                               imagewriter=None)
    if outtype == 'xml':
        device = XMLConverter(rsrcmgr, out, codec=codec, laparams=laparams,
                              imagewriter=None)
    if outtype == 'html':
        device = HTMLConverter(rsrcmgr, out, codec=codec, laparams=laparams,
                               imagewriter=None)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0  # NOTE(review): unused — max_page is passed below instead
    pagenos = set()
    # only process the first page
    max_page = 1
    p = 0  # NOTE(review): never incremented, so the break never fires;
    # the page count is actually limited by maxpages=max_page.
    for page in PDFPage.get_pages(fp, pagenos, maxpages=max_page,
                                  password=password, caching=caching,
                                  check_extractable=True):
        if p >= max_page:
            break
        interpreter.process_page(page)
    fp.close()
    device.close()
    # str_value is the first PDF page in HTML
    str_value = out.getvalue()
    out.close()
    # loop through each line in HTML; analyze() updates the globals above.
    for line in str_value.split('<br>'):
        analyze(line)
    result = [
        title.decode('utf-8'), authors,
        abstract.decode('utf-8'),
        keywords.decode('utf-8')
    ]
    return result
def fetch(self) -> Iterator[Journal]:
    """Download the journal-list PDF at self.url and yield parsed Journals.

    The PDF is cached in memory, opened with pdfplumber, and split into
    text "entries" (one per bold heading) by the nested get_entries()
    generator. Entry parsing into Journal fields is still TODO, so in the
    current state at most the initial empty Journal could be yielded.
    """
    resp = requests.get(self.url)
    resp.raise_for_status()
    # Content-Length may be absent; cache_in_memory accepts size=None.
    content_length = resp.headers.get("Content-Length", None)
    content_length = int(
        content_length) if content_length is not None else None
    cached_file = cache_in_memory(resp, size=content_length)
    with pdfplumber.open(cached_file) as pdf:

        def get_entries() -> Iterator[str]:
            """Yield one normalized text blob per bold-headed entry."""
            from pdfplumber.utils import cluster_objects, extract_text, DEFAULT_X_TOLERANCE
            import unicodedata
            # fontname_regex = re.compile(r"([A-Z]{6})\+([A-Za-z]+)(\d+)?")
            small_font_size_threshold = Decimal("8.0")

            def is_font_bold(char: PDFChar) -> bool:
                # Font names look like "ABCDEF+Family..."; the subset tag
                # is split off and boldness inferred from the family part.
                tag, fontname = char["fontname"].split("+")
                return "BX" in fontname

            def is_font_small(char: PDFChar) -> bool:
                return char["size"] < small_font_size_threshold

            def normalize_char(
                    char: PDFChar,
                    interpreter: PDFPageInterpreter) -> Optional[PDFChar]:
                """Resolve (cid:N) escapes and fold chars to NFKC in place."""
                text = char["text"]
                if len(text) > 1 and (
                        cid_match := cid_regex.fullmatch(text)) is not None:
                    # Unmapped glyph: look the CID up in the font's cmap.
                    cid = int(cid_match.group(1))
                    text = cmap_char(cid, char["fontname"], interpreter)
                    if text is None:
                        char["text"] = None
                        return char
                ntext = unicodedata.normalize("NFKC", text)
                if len(ntext) == 2 and unicodedata.combining(ntext[1]):
                    # Decomposed base+accent: keep the combining mark only.
                    text = ntext[1]
                    text = make_combining_form(text) or text
                if is_font_small(char):
                    # Small "o" glyphs are degree signs in this layout —
                    # presumably; confirm against the source PDF.
                    if text == "o":
                        text = "°"
                char["text"] = text
                return char

            def sort_line_chars(
                    chars: Sequence[PDFChar],
                    interpreter: PDFPageInterpreter) -> Sequence[PDFChar]:
                """Yield a line's chars left-to-right, re-attaching combining
                marks directly after the base char they overlap."""
                chars = (normalize_char(char, interpreter) for char in chars)
                chars = sorted(chars, key=lambda char: char["x0"])
                main_chars, combining_chars = partition(
                    lambda char: char["text"] and unicodedata.combining(
                        char["text"]), chars)
                combining_chars_iter = peekable(iter(combining_chars))
                for main_char in main_chars:
                    yield main_char
                    while combining_chars_iter:
                        combining_char = combining_chars_iter.peek()
                        # Horizontal overlap decides whether the mark
                        # belongs to this base char.
                        overlap = max(
                            min(main_char["x1"], combining_char["x1"]) -
                            max(main_char["x0"], combining_char["x0"]), 0)
                        if overlap < main_char["width"] * Decimal("0.5"):
                            break
                        yield combining_char
                        next(combining_chars_iter, None)
                # Every combining mark must have been consumed.
                assert (next(combining_chars_iter, None) is None)
                return
                yield

            x_tolerance = Decimal("3.0")
            y_tolerance = Decimal("3.0")
            min_tab_width = Decimal("8.0")
            for page in pdf.pages:
                # A fresh aggregator/interpreter per page so cmap lookups in
                # normalize_char see this page's resources.
                device = PDFPageAggregator(
                    pdf.rsrcmgr,
                    pageno=page.page_number,
                    laparams=pdf.laparams,
                )
                interpreter = PDFPageInterpreter(pdf.rsrcmgr, device)
                interpreter.process_page(page.page_obj)
                # Trim page margins; page 1 has an extra 200pt header.
                contents = page.crop(
                    (
                        Decimal(100),
                        Decimal(70 + (200 if page.page_number == 1 else 0)),
                        page.width - Decimal(100),
                        page.height - Decimal(70),
                    ),
                    relative=False,
                )
                # Two-column layout: split the cropped area in half.
                left_column = contents.crop(
                    (
                        Decimal(0),
                        Decimal(0),
                        contents.width * Decimal(0.5),
                        contents.height,
                    ),
                    relative=True,
                )
                right_column = contents.crop(
                    (
                        contents.width * Decimal(0.5),
                        Decimal(0),
                        contents.width,
                        contents.height,
                    ),
                    relative=True,
                )
                for column in (left_column, right_column):
                    # Bold lines mark entry starts; their tops become the
                    # horizontal separators between entries.
                    bold_chars = filter(is_font_bold, column.chars)
                    bold_char_lines = cluster_objects(
                        bold_chars, "top", y_tolerance)
                    bold_line_y0s = (min(char["top"] for char in line)
                                     for line in bold_char_lines)
                    hsep_y0s = chain(bold_line_y0s, (column.bbox[3], ))
                    hsep_y0s = list(hsep_y0s)
                    for y0, y1 in windowed(hsep_y0s, 2):
                        if y1 is None:
                            break
                        entry = column.within_bbox(
                            (
                                column.bbox[0],
                                max(y0 - y_tolerance, column.bbox[1]),
                                column.bbox[2],
                                min(y1 + y_tolerance, column.bbox[3]),
                            ),
                            relative=False,
                        )
                        entry_lines = cluster_objects(
                            entry.chars, "top", y_tolerance)
                        entry_text = StringIO()
                        # TODO: refactor into separate top-level function, along with sort_line_chars, normalize_char.
                        for line_chars in entry_lines:
                            line_chars = list(line_chars)
                            last_char: Optional[PDFChar] = None
                            for char in sort_line_chars(
                                    line_chars, interpreter):
                                if last_char is not None and last_char[
                                        "text"] is not None:
                                    # Wide gaps become tabs, narrow gaps
                                    # become single spaces.
                                    if char["x0"] > last_char[
                                            "x1"] + min_tab_width:
                                        entry_text.write("\t")
                                    elif char["x0"] > last_char[
                                            "x1"] + x_tolerance:
                                        entry_text.write(" ")
                                if char["text"] is not None:
                                    entry_text.write(char["text"])
                                    if not unicodedata.combining(
                                            char["text"]):
                                        last_char = char
                            entry_text.write("\n")
                        yield unicodedata.normalize(
                            "NFKC", entry_text.getvalue())
            return
            yield

        journal = Journal()
        for entry in get_entries():
            print(f"ENTRY: {entry}")
            # TODO: handle `[name in other language]` bits.
            pass
        # NOTE(review): journal is never reassigned above, so names/iso4
        # stay empty until entry parsing is implemented.
        if journal is not None and journal.names and journal.iso4:
            yield journal
def parse_ndpdf(pdf_path):
    """Parse a Chinese corporate income-tax return PDF into a dict.

    Recognizes three forms by their exact title lines — the loss
    carry-forward detail (A106000), the annual return (A-type), and the
    basic company information form (A000000) — and scrapes fields from
    each by matching exact text-box contents and positions.

    Returns:
        pdf_dict: detected fields (industry detail, headcount, inventory
        costing method, accounting standard, shareholder info, adjusted
        income, loss detail); fields default to "" / {} when absent.
    """
    fp = open(pdf_path, "rb")
    # Create a PDF parser from the file object.
    parse_pdf = PDFParser(fp)
    # Create a PDF document and connect it to the parser.
    doc = PDFDocument()
    parse_pdf.set_document(doc)
    doc.set_parser(parse_pdf)
    doc.initialize()
    # Abort if the document does not allow text extraction.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        # Resource manager for shared resources.
        rsrcmgr = PDFResourceManager()
        # Layout analysis parameters.
        laparams = LAParams()
        # Aggregator device that collects the laid-out page.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Page interpreter.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process the document one page at a time.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            results_last = ""  # text of the previous box, used as an anchor
            # `layout` is an LTPage holding LTTextBox/LTFigure/... objects;
            # text is read from the horizontal text boxes.
            a = 0  # 1-based index of the current object on the page
            gd = []      # becomes a bool flag: inside shareholder section
            zj = []      # NOTE(review): unused accumulators below
            hm = []
            xingzhi = []
            bili = []
            guoji = []
            for out in layout:
                # if hasattr(out,"get_text"):
                a += 1
                if isinstance(out, LTTextBoxHorizontal):
                    results = out.get_text()
                    # The first box of a page identifies the form; skip
                    # pages that are none of the three known forms.
                    if a == 1:
                        if results != "A106000企业所得税弥补亏损明细表\n" and results != "中华人民共和国企业所得税年度纳税申报表(A类)\n" and results != "A000000企业基础信息表\n":
                            break
                        else:
                            biaoge = results  # current form title
                            gd = False
                            # print(results)
                            # results_last = results
                    # Loss form: the year row follows the fixed header row.
                    if biaoge == "A106000企业所得税弥补亏损明细表\n" and results_last == '前五年度\n前四年度\n前三年度\n前二年度\n前一年度\n本年度\n可结转以后年度弥补的亏损额合计\n':
                        nf = results.strip("").split("\n")
                        print(nf)
                    # Loss form: amounts row (7 values) after its anchor.
                    if biaoge == "A106000企业所得税弥补亏损明细表\n":
                        if results_last == '2\n' or results_last == "2011\n2012\n2013\n2014\n2015\n2016\n":
                            nstzhs = results.strip("").split("\n")
                            if len(nstzhs) == 7:
                                nstzhsd = nstzhs
                                print(nstzhsd)
                    # Annual return: amounts column at fixed positions.
                    if biaoge == "中华人民共和国企业所得税年度纳税申报表(A类)\n":
                        if results_last == '金额\n' and a == 11:
                            sz = results.strip("").split("\n")
                            print(sz)
                        elif a == 10 and "%" in results and "0.00" in results:
                            sz = results.strip("").split("\n")
                            print(sz)
                    # Basic info form: inventory costing method row.
                    if biaoge == "A000000企业基础信息表\n":
                        if "备抵法" in results or "直接核销法" in results:
                            cbjj = results.strip("").split("\n")
                            print(cbjj)
                    # Basic info form: accounting standard at position 8.
                    if biaoge == "A000000企业基础信息表\n" and a == 8:
                        kjzz = results.strip("").split("\n")
                        try:
                            # match = re.search(r'201适用的会计准则或会计制度 (.*?)', kjzz[0])
                            # print(match.group(1))
                            kjzzz = kjzz[0].split(" ")
                            kjzzzd = kjzzz[1]
                            print(kjzzzd)
                        except:
                            kjzzzd = ""
                            print(kjzzzd)
                    # Basic info form: 6-field yes/no row (industry etc.).
                    if biaoge == "A000000企业基础信息表\n" and "否" in results:
                        jcx = results.strip("").split("\n")
                        if len(jcx) == 6:
                            jcxx = jcx
                            print(jcxx)
                        else:
                            continue
                    # Shareholder section starts at the "301" heading.
                    if biaoge == "A000000企业基础信息表\n" and "301企业主要股东" in results:
                        gd = True
                        gdxx = []
                    # Inside the shareholder section, collect every box that
                    # is not one of the known header/label lines.
                    if biaoge == "A000000企业基础信息表\n" and gd:
                        if "证件" not in results and "主要股东" not in results and "经济性质" not in results and "投资比例" not in results and "国籍" not in results and "302中国境内" not in results and "公司财务室" not in results \
                                and "备抵法" not in results and "直接核销法" not in results and "人民币" not in results:
                            gdxx.append(results)
                    results_last = results
    pdf_dict = {}
    # Assemble the simple scalar fields; any missing source row resets
    # everything to "" (best-effort extraction).
    try:
        pdf_dict['所属行业明细'] = jcxx[2]
        pdf_dict['从业人数'] = jcxx[3]
        pdf_dict['存货计价方法'] = cbjj[1]
        pdf_dict['企业会计准则为'] = kjzzzd
        if "一般企业" in pdf_dict['企业会计准则为']:
            pdf_dict['企业会计准则为'] = "一般企业会计准则"
    except Exception as e:
        print(e)
        pdf_dict['所属行业明细'] = ""
        pdf_dict['从业人数'] = ""
        pdf_dict['存货计价方法'] = ""
        pdf_dict['企业会计准则为'] = ""
    # Reconstruct the shareholder table from the collected boxes: the box
    # containing ID-type markers splits the list into names / ID pairs /
    # investment pairs / nationalities.
    try:
        index = 0
        for gl in gdxx:
            index += 1
            if "居民身份证" in gl or "营业执照" in gl:
                zjhm = gl.replace("\n", "")
                zjhm = zjhm.split('居民身份证')[1:]
                clean = []  # alternating [ID-type, ID-number, ...]
                for g in zjhm:
                    if "营业执照" in g:
                        yy = g.split("营业执照")
                        if len(yy[0]) != 0:
                            clean.append("居民身份证")
                            clean.append(yy[0])
                        for zz in yy[1:]:
                            clean.append("营业执照")
                            clean.append(zz)
                    else:
                        clean.append("居民身份证")
                        clean.append(g)
                break
        # Boxes after the ID box: economic nature / investment ratio pairs.
        tzxx = []
        end = index + len(clean)
        for tz in gdxx[index:end]:
            tz = tz.replace("\n", "")
            tzxx.append(tz)
        # Then one nationality per shareholder.
        gj = []
        end2 = end + int(len(clean) / 2)
        for country in gdxx[end:end2]:
            country = country.replace("\n", "")
            gj.append(country)
        # Shareholder names come before the ID box (and, when the counts
        # do not line up, also after the nationality block).
        xm = []
        gs = int(len(clean) / 2)
        if index - 1 == gs:
            for mc in gdxx[:index - 1]:
                mc = mc.replace("\n", "")
                xm.append(mc)
        else:
            for mc in gdxx[:index - 1]:
                mc = mc.replace("\n", "")
                xm.append(mc)
            for mc in gdxx[end2:]:
                mc = mc.replace("\n", "")
                xm.append(mc)
        # Zip the parallel lists into one dict per shareholder.
        zhenghe = {}
        sb = 0
        for j in range(0, len(clean), 2):
            gdxxdict = {}
            if '其他单位证件' in clean[j]:
                gdxxdict["证件种类"] = "居民身份证"
            else:
                gdxxdict["证件种类"] = clean[j]
            gdxxdict["证件号码"] = clean[j + 1]
            gdxxdict["经济性质"] = tzxx[j]
            gdxxdict["投资比例"] = tzxx[j + 1]
            if "中华人民" in gj[sb] or "香港" in gj[sb]:
                gdxxdict["国籍"] = "中国"
            else:
                gdxxdict["国籍"] = gj[sb]
            gdxxdict["股东名称"] = xm[sb]
            wc = gdxxdict
            sb += 1
            zhenghe["{}".format(sb)] = wc
        pdf_dict['股东信息'] = zhenghe
        # Serialize slots 1..10 (only slot 1 is populated here); the rest
        # are empty placeholders for the stored procedure below.
        tzfxx2, tzfxx3, tzfxx4, tzfxx5, tzfxx6, tzfxx7, tzfxx8, tzfxx9, tzfxx10 = {}, {}, {}, {}, {}, {}, {}, {}, {}
        tzfxx1 = json.dumps(zhenghe, ensure_ascii=False)
        tzfxx2 = json.dumps(tzfxx2, ensure_ascii=False)
        tzfxx3 = json.dumps(tzfxx3, ensure_ascii=False)
        tzfxx4 = json.dumps(tzfxx4, ensure_ascii=False)
        tzfxx5 = json.dumps(tzfxx5, ensure_ascii=False)
        tzfxx6 = json.dumps(tzfxx6, ensure_ascii=False)
        tzfxx7 = json.dumps(tzfxx7, ensure_ascii=False)
        tzfxx8 = json.dumps(tzfxx8, ensure_ascii=False)
        tzfxx9 = json.dumps(tzfxx9, ensure_ascii=False)
        tzfxx10 = json.dumps(tzfxx10, ensure_ascii=False)
        # params = (
        #     self.batchid, "0", "0", self.companyid, self.customerid, tzfxx1, tzfxx2, tzfxx3, tzfxx4, tzfxx5,
        #     tzfxx6, tzfxx7, tzfxx8, tzfxx9, tzfxx10)
        # self.insert_db("[dbo].[Python_Serivce_GSTaxInfo_AddParent]", params)
    except:
        pass
    # Income after tax adjustment: fixed position 18 in the amounts row.
    pdf_dict['纳税调整后所得'] = sz[18]
    # Loss detail: pair each year with its amount; 2016 takes the adjusted
    # income instead.
    ksmx = {}
    try:
        for i in range(len(nf) - 1):
            try:
                if nf[i] == "2016":
                    ksmx[nf[i]] = sz[18]
                else:
                    ksmx[nf[i]] = nstzhsd[i]
            except:
                ksmx[nf[i]] = nstzhsd[i]
    except:
        print("ksmx")
    pdf_dict["亏损明细"] = ksmx
    print(pdf_dict)
    return pdf_dict
def parse():
    """Read the PDF table at module-level `path` and write it to an Excel file.

    Characters are grouped into positioned strings, strings into rows (via
    the is_not_in / insert_into_page_rows helpers keyed on the baseline y),
    rows are realigned against the widest row, and the result is written
    cell-by-cell with openpyxl (empty-value cells trigger a merged row).
    """
    fp = open(path, 'rb')  # open in binary read mode
    # Build a PDF parser from the file object.
    praser = PDFParser(fp)
    # Create the PDF document.
    doc = PDFDocument(praser)
    # Connect parser and document.
    praser.set_document(doc)
    # Resource manager for shared resources.
    rsrcmgr = PDFResourceManager()
    # Device object with layout parameters.
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # Page interpreter.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process one page at a time.
    wb = Workbook()  # new Excel workbook
    ws = wb.active
    # Running count of rows already written, to keep pages contiguous.
    text_number = 0
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        # LTPage holding this page's LTTextBox/LTFigure/... objects.
        layout = device.get_result()
        # All positioned strings of this page, grouped into rows.
        page_container = []
        # Baseline y of each known row, parallel to page_container.
        page_rows = []
        for text_box in layout:
            if (isinstance(text_box, LTTextBox)):
                # Walk the lines of the box.
                for text_line in text_box:
                    if (isinstance(text_line, LTTextLine)):
                        # Accumulate the characters of one string.
                        temp = []      # collected characters
                        temp_loc = []  # [x0, y0, x1, y1] of the string
                        isfirst = True  # first char of the current string?
                        for text_index in text_line:
                            # Character: append and extend the bbox.
                            if (isinstance(text_index, LTChar)):
                                temp.append(text_index.get_text())
                                if isfirst == True:
                                    temp_loc.append(
                                        round(text_index.bbox[0], 3))
                                    temp_loc.append(
                                        round(text_index.bbox[1], 3))
                                    temp_loc.append(
                                        round(text_index.bbox[2], 3))
                                    temp_loc.append(
                                        round(text_index.bbox[3], 3))
                                    isfirst = False
                                temp_loc[2] = round(text_index.bbox[2], 3)
                                temp_loc[3] = round(text_index.bbox[3], 3)
                            # Non-char LTText ends the string: file it into
                            # page_container, then reset the accumulators.
                            elif (isinstance(text_index, LTText)):
                                # New row (no entry at this baseline yet).
                                # if temp_loc[1] not in page_rows:
                                if is_not_in(page_rows, temp_loc[1]):
                                    insert_loc = insert_into_page_rows(
                                        page_rows, temp_loc[1])
                                    page_container.insert(
                                        insert_loc, [{
                                            'value': ''.join(temp),
                                            'location': temp_loc
                                        }])
                                    # page_rows.append(temp_loc[1])
                                    # page_container.append([{'value':''.join(temp),'location':temp_loc}])
                                # Existing row: insert in x order.
                                elif not is_not_in(page_rows,
                                                   temp_loc[1]):
                                    # loc = page_rows.index(temp_loc[1])
                                    loc = get_page_rows_loc(
                                        page_rows, temp_loc[1])
                                    temp_list = insert_into_page_container(
                                        page_container[loc], {
                                            'value': ''.join(temp),
                                            'location': temp_loc
                                        })
                                    page_container[loc] = temp_list[:]
                                temp = []
                                temp_loc = []
                                isfirst = True
        # Row count of this page.
        rows_num = len(page_container)
        # Realign the last row if it is shorter/longer than the one above.
        if len(page_container[rows_num - 1]) != len(
                page_container[rows_num - 2]):
            page_container[rows_num - 1], unused_flag = align_row(
                page_container[rows_num - 2], page_container[rows_num - 1])
        # Header detection over the first five rows; realign when their
        # column counts disagree.
        if len(page_container[0]) != len(page_container[1]) or len(
                page_container[1]) != len(page_container[2]) or len(
                    page_container[2]) != len(page_container[3]) or len(
                        page_container[3]) != len(page_container[4]):
            rows_length = []
            the_max_row = []
            rejust_rows = []       # header rows that failed realignment
            rejust_rows_num = []   # their indices in page_container
            for i in range(7):  # NOTE(review): scans 7 rows though the
                # trigger above only compares 5 — confirm intended.
                rows_length.append(len(page_container[i]))
            max_length = max(rows_length)
            the_max_row = page_container[rows_length.index(max_length)][:]
            for i in range(len(rows_length)):
                if rows_length[i] < max_length:
                    page_container[i], flag_for_title = align_row(
                        the_max_row, page_container[i])
                    if flag_for_title == False:
                        rejust_rows.append(page_container[i])
                        rejust_rows_num.append(i)
            # Merge header cells whose text wrapped onto multiple rows.
            if rejust_rows != []:
                compiled_row = compile_row(rejust_rows)
                page_container.insert(rejust_rows_num[0], compiled_row)
                for i in range(len(rejust_rows_num)):
                    del page_container[rejust_rows_num[0] + 1]
        # Recount rows after the header handling above.
        rows_num = len(page_container)
        # Debug output of the assembled rows.
        for i in range(len(page_container)):
            for j in range(len(page_container[i])):
                print(page_container[i][j])
        # print(page_container)
        # print(page_rows)
        # Write this page's rows into the worksheet.
        for i in range(len(page_container)):
            for j in range(len(page_container[i])):
                cell_index = ws.cell(row=i + 1 + text_number, column=j + 1)
                if page_container[i][j] == None:
                    cell_index.value = ' '
                elif page_container[i][j]['value'] == '':
                    # Empty value marks a row-spanning cell: merge the row.
                    ws.merge_cells(start_row=i + 1 + text_number,
                                   start_column=1,
                                   end_row=i + 1 + text_number,
                                   end_column=len(page_container[i]))
                    break
                else:
                    cell_index.value = page_container[i][j]['value']
        # Advance the row offset so pages stay contiguous.
        text_number += rows_num
    wb.save(r'C:\Users\15644\Desktop\pdf_file\test_pdf_list\test_10.xlsx')
def parse(pdf_file, code_source, code_new):
    """Locate every occurrence of `code_source` in a PDF and build per-page
    watermark PDFs that paint over it and write `code_new` in its place.

    Each page yields "<watermark_dir>/<stem>/<pageno>.pdf" containing white
    rectangles over the matched text (and over large logos) plus the
    replacement text, ready to be merged over the original page.

    Args:
        pdf_file: file name inside `watermark_dir`.
        code_source: text to find (matched case-insensitively).
        code_new: replacement text (written upper-cased).

    Returns:
        1 on page-processing failure; None on success.
    """
    pdf_path_ = "./" + watermark_dir + "/" + pdf_file
    with open(pdf_path_, 'rb') as pdf_io:
        # Wire parser <-> document (legacy pdfminer API).
        # parser = PDFParser(DataIO)
        parser = PDFParser(pdf_io)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialize with no password.
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        else:
            rsrcmagr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmagr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmagr, device)
            pg = 0  # output page counter
            for page in doc.get_pages():
                # Coordinates and font sizes of every match on this page.
                size_x = []
                size_y = []
                size_font = []
                # Fallback page size if layout extraction fails.
                layout_x = 2448
                layout_y = 1584
                try:
                    interpreter.process_page(page)
                    layout = device.get_result()
                    layout_x = int(layout.bbox[2])
                    layout_y = int(layout.bbox[3])
                    for x in layout:
                        if isinstance(x, LTTextBoxHorizontal):
                            for line in x:
                                if isinstance(line, LTTextLine):
                                    # Case-insensitive search in the line.
                                    result = line.get_text().lower()
                                    num_code = 0
                                    # One iteration per occurrence.
                                    for _ in range(result.count(code_source.lower())):
                                        # BUGFIX: search with the lowered
                                        # needle — `result` is lowercased,
                                        # so the original non-lowered find
                                        # missed mixed-case inputs.
                                        codetwo = result.find(code_source.lower(), num_code)
                                        num_code = codetwo + 1
                                        # Walk the chars to the match index
                                        # to recover its page coordinates.
                                        i = 0
                                        for char in line:
                                            if isinstance(char, LTChar):
                                                if i == codetwo:
                                                    size_x.append(float(char.bbox[0]))
                                                    size_y.append(float(char.bbox[1]))
                                                    size_font.append(float(char.size))
                                                i += 1
                # BUGFIX: `except e:` referenced an undefined name and
                # could never catch anything.
                except Exception as e:
                    print(e)
                    print("The watermark file Create Failed")
                    return 1
                # Build the replacement watermark for this page at the
                # matched coordinates. Default size comes from the layout.
                name_without_postfix = os.path.splitext(pdf_file)[0]
                path_tmp = watermark_dir + "/" + name_without_postfix
                check_dir(path_tmp)
                mark = canvas.Canvas(path_tmp + "/" + str(pg) + ".pdf",
                                     pagesize=(layout_x, layout_y))
                # Pass 1: white rectangles that blank out the old text
                # (canvas origin is bottom-left).
                j = 0
                for i in size_x:
                    x = float(i)
                    y = float(size_y[j])
                    mark.setStrokeColorRGB(1, 1, 1)
                    mark.setFillColorRGB(1, 1, 1)
                    mark.rect(x, y, size_font[j] * 2.8, size_font[j] * 0.8, fill=1)
                    j += 1
                # Pass 2: draw the replacement text over each blank.
                z = 0
                for i in size_x:
                    x = float(i)
                    y = float(size_y[z])
                    mark.setFont("Helvetica", size_font[z] * 0.8)
                    mark.setFillColorRGB(0, 0, 0)
                    mark.setFillAlpha(1)  # fully opaque
                    # Replacement is written in upper case.
                    mark.drawString(x + (size_font[z] * 0.1),
                                    y + (size_font[z] * 0.125),
                                    code_new.upper())
                    # Large matches carry a logo to the left — cover it too.
                    if size_font[z] > 20:
                        mark.setStrokeColorRGB(1, 1, 1)
                        mark.setFillColorRGB(1, 1, 1)
                        mark.rect(x - 35, y - 2, 35, size_font[z] * 0.9, fill=1)
                    z += 1
                # Empty rectangle keeps the page non-blank for the merge.
                mark.rect(0, 0, 0, 0, fill=1)
                # Close and persist this page's watermark PDF.
                mark.save()
                pg += 1
    print('The watermark file Create Successful.')
def get_paper_content(fname, pages=2, outdir="data"):
    """Convert the first `pages`+1 pages of a paper PDF into an HTML file.

    The output path is "<outdir>/<basename>.html"; the extension selects
    the converter (html here), mirroring pdfminer's pdf2txt tool.

    Args:
        fname: path of the input PDF.
        pages: last page index to process (note: `index > pages` means
            pages 0..pages inclusive are converted — pages+1 pages).
        outdir: directory receiving the .html file.

    Returns:
        None. On conversion failure the input file name is printed and the
        function returns early (best-effort, as before), but all handles
        are now closed on every path.
    """
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    basename = os.path.basename(fname)
    basename = basename.replace(".pdf", "")
    outfile = os.path.join(outdir, basename + ".html")
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    # Pick the converter from the output extension (always .html here).
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # open() instead of the Python-2-only file() builtin.
    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec,
                               laparams=laparams, imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec,
                              laparams=laparams, imagewriter=imagewriter,
                              stripcontrol=stripcontrol)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter, debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        if outfp is not sys.stdout:
            outfp.close()
        return
    fp = open(fname, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    interpreter.debug = True
    try:
        for index, page in enumerate(
                PDFPage.get_pages(fp,
                                  pagenos,
                                  maxpages=maxpages,
                                  password=password,
                                  caching=caching,
                                  check_extractable=True)):
            if index > pages:
                break
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
    except Exception:
        # Best-effort, as before — report the failing file and bail out —
        # but no longer a bare `except:` (which also caught KeyboardInterrupt)
        # and no longer leaking fp/device/outfp via the early return.
        print(fname)
        return
    finally:
        fp.close()
        device.close()
        if outfp is not sys.stdout:
            outfp.close()
    return
def main(argv):
    """Extract threat indicators (IPs, URLs, hashes, hostnames) from a PDF.

    Python 2 script: renders the PDF text twice into an anonymous mmap
    (first to learn the size, then into a right-sized map), flattens it to
    one string, and prints the regex matches for each indicator type.
    Hostnames are additionally validated against an offline TLD list.
    """
    #codec = 'utf-8'
    codec = 'ascii'
    laparams = LAParams()
    pagenos = set()
    maxpages = 0
    password = ''
    caching = True
    rotation = 0
    rsrcmgr = PDFResourceManager(caching=caching)
    # Do a double read thanks to:
    # https://mail.python.org/pipermail/python-list/2009-April/531944.html
    # First pass: 1 GiB anonymous map, only to discover the output length.
    mm = mmap.mmap(-1, 1024 * 1024 * 1024)
    device = TextConverter(rsrcmgr,
                           mm,
                           codec=codec,
                           laparams=laparams,
                           imagewriter=None)
    fname = argv[1]
    fp = file(fname, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(fp,
                                  pagenos,
                                  maxpages=maxpages,
                                  password=password,
                                  caching=caching,
                                  check_extractable=True):
        page.rotate = (page.rotate + rotation) % 360
        interpreter.process_page(page)
    fp.close()
    eof = mm.tell()  # bytes actually written by the converter
    device.close()
    mm.close()
    # Recreate the mmap area w/the correct size
    mm = mmap.mmap(-1, eof)
    device = TextConverter(rsrcmgr,
                           mm,
                           codec=codec,
                           laparams=laparams,
                           imagewriter=None)
    fname = argv[1]
    fp = file(fname, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(fp,
                                  pagenos,
                                  maxpages=maxpages,
                                  password=password,
                                  caching=caching,
                                  check_extractable=True):
        page.rotate = (page.rotate + rotation) % 360
        interpreter.process_page(page)
    fp.close()
    mm.seek(0)
    # Indicator patterns.
    ip_regex = re.compile(
        r'((?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))'
    )
    hash_regex = re.compile(
        r'(?:[A-Fa-f0-9]{32}|[A-Fa-f0-9]{40}|[A-Fa-f0-9]{64})'
    )  # md5, sha1, sha256
    url_regex = re.compile(
        r'\b((?:[\w-]+://?|www[.])[A-Za-z0-9-_\/.%?=&\[\]()@!$#,;]+)',
        re.MULTILINE)
    # Also matches hostnames "defanged" with spaced dots (" . ").
    hostname_regex = re.compile(
        r'([a-zA-Z\d-]{,63}(?:\.[a-zA-Z\d-]{,63}|\s\.\s[a-zA-Z\d-]{,63})+)',
        re.MULTILINE)
    single_line_hostname_regex = re.compile(
        r'([a-zA-Z\d-]{,63}(?:\.[a-zA-Z\d-]{,63}|\s\.\s[a-zA-Z\d-]{,63})+)')
    # Flatten the extracted text into one string (line breaks dropped).
    doc = ''
    while True:
        if mm.tell() >= eof:
            break
        doc += mm.readline().rstrip()
    # Print each indicator class (deduplicated via set()).
    m = re.findall(ip_regex, doc)
    if m != None and len(m) > 0:
        print set(m)
    m = re.findall(url_regex, doc)
    if m != None and len(m) > 0:
        print set(m)
    m = re.findall(hash_regex, doc)
    if m != None and len(m) > 0:
        print set(m)
    # Hostnames: merge both pattern variants before TLD validation.
    m = re.findall(hostname_regex, doc)
    hostname_candidates = []
    if m != None and len(m) > 0:
        hostname_candidates = list(set(m))
    m = re.findall(single_line_hostname_regex, doc)
    if m != None and len(m) > 0:
        hostname_candidates = list(set(m + hostname_candidates))
    if len(hostname_candidates) > 0:
        for h in hostname_candidates:
            # Re-fang: drop the spaces of " . " separators.
            domain = string.replace(h, ' ', '')
            #print h
            # Keep only names with a real TLD and no trailing dot.
            if uniaccept.verifytldoffline(
                    domain, "./tld-list.txt") and domain[-1] != '.':
                print h
    #print doc
    device.close()
    mm.close()
def identify_name_pdf(path_file, operator_code):
    """Derive a normalized file name for a telecom invoice PDF.

    Extracts the operator name, reference date, and account number from
    the PDF text using operator-specific regexes and builds
    "<YYYYMM>-<OPERATOR>-<account>.pdf" under the hardcoded split folder.

    Args:
        path_file: path of the invoice PDF to inspect.
        operator_code: 'TIM' or 'CLARO' (selects the extraction patterns).

    Returns:
        new_name_pdf: full path for the renamed PDF.

    Raises:
        ValueError: for an unsupported operator_code (the original fell
        through to a confusing NameError on `operator` instead).
        IndexError: if one of the patterns finds no match in the PDF text.
    """
    if operator_code == 'TIM':
        operator = 'TIM'
        reference = '(REF: [A-Z]{3}\/[0-9]{2})'
        account = '(CLIENTE: [0-9]{1}.[0-9]{7})'
    elif operator_code == 'CLARO':
        operator = 'Claro'
        # NOTE(review): year 2020 is hardcoded in the pattern — invoices
        # issued in other years will not match.
        reference = '(Data de Emissão: [0-9-]{2}\/[0-9]{2}\/2020)'
        account = '(Nº da Conta: [0-9]{9})'
    else:
        raise ValueError("unsupported operator_code: %r" % (operator_code,))
    with open(path_file, 'rb') as fp:
        rsrcmgr = PDFResourceManager()
        retstr = io.StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec,
                               laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pages = PDFPage.get_pages(fp)
        for page in pages:
            interpreter.process_page(page)
        data = retstr.getvalue()
        # Close the converter and buffer (the original leaked both).
        device.close()
        retstr.close()
    # First match of each pattern in the extracted text.
    operator_find = re.findall(operator, data)
    reference_find = re.findall(reference, data)
    account_find = re.findall(account, data)
    operator = operator_find[0]
    reference = reference_find[0]
    account = account_find[0]
    if operator_code == 'TIM':
        # "CLIENTE: 1.2345678" -> "12345678"
        account = account.replace('.', '')
        account = account.replace('CLIENTE: ', '')
        # "REF: MMM/YY" -> "YYYYMM"
        reference = reference.replace('REF: ', '')
        reference = dateparser.parse(
            reference, settings={'TIMEZONE': 'America/Sao_Paulo'})
        reference = datetime.strftime(reference, "%Y%m")
    if operator_code == 'CLARO':
        operator = operator.upper()
        # "Data de Emissão: DD/MM/2020" -> "MM/2020" -> "YYYYMM"
        reference = reference.replace('Data de Emissão: ', '')
        reference = reference[3:10]
        reference = dateparser.parse(
            reference, settings={'TIMEZONE': 'America/Sao_Paulo'})
        reference = datetime.strftime(reference, "%Y%m")
        account = account.replace('Nº da Conta: ', '')
    path_file = "C:\\repositorio\\teste_pdf\\pdf_split\\"
    # path_file = 'C:\\Users\\BRCAP-BI01\\Desktop\\Claro Car\\'
    new_name_pdf = path_file + str(
        reference) + '-' + operator + '-' + account + '.pdf'
    print(new_name_pdf)
    return new_name_pdf
# if __name__ == "__main__":
#     path = 'C:\\Users\\BRCAP-BI01\\Desktop\\Claro Car\\bf.blim_307024353_002_M1_PS.TAMB[1].pdf'
#     new_name_pdf = identify_name_pdf(path, 'CLARO')
#     os.rename(path, new_name_pdf)
def postlist_son(request):
    """Handle batch actions posted from the sub-folder file list.

    Dispatches on which submit key is present in ``request.POST``:
      - ``delete_list``:   delete the selected files/folders (disk + DB rows).
      - ``mpdf_list``:     merge the selected PDFs into one '整合-...' PDF.
      - ``download_list``: stream the first selected file back as attachment.
      - ``tj_list``:       convert selected PDFs to .doc via pdfminer text
                           extraction.
      - otherwise:         convert selected Word files to PDF via COM/Word.

    Returns a FileResponse for downloads, else a redirect to the file list.
    Relies on module-level ``B``, ``a``, ``FileInfo``, ``delete_all`` etc.
    """
    if 'delete_list' in request.POST:
        # Batch delete of sub-folders / files.
        pdf_id = request.POST.getlist("d2p_list")
        file_infos = FileInfo.objects.filter(id__in=pdf_id)
        for file_info in file_infos:
            if file_info.file_type == 'FOLDER':
                # Folders: recursively remove contents, then the DB row.
                delete_all(file_info.file_path)
                file_info.delete()
            else:
                os.remove(file_info.file_path)
                file_info.delete()
        file_infos = FileInfo.objects.all()
        messages.success(request, "文件删除成功!")
    elif 'mpdf_list' in request.POST:
        # Batch merge of the selected PDFs into a single output PDF.
        output = PdfFileWriter()
        outputPages = 0
        output_name = ''
        pdf_id = request.POST.getlist("d2p_list")
        file_infos = FileInfo.objects.filter(id__in=pdf_id)
        # BUGFIX: the source streams were opened inline and never closed.
        # PyPDF2 reads page data lazily, so they must stay open until
        # output.write() below — collect them and close afterwards.
        source_streams = []
        for file_info in file_infos:
            stream = open(file_info.file_path, "rb")
            source_streams.append(stream)
            reader = PdfFileReader(stream)  # renamed: 'input' shadowed a builtin
            pageCount = reader.getNumPages()
            outputPages += pageCount
            print("页数:%d" % pageCount)
            # Append every page of this source to the merged output.
            for iPage in range(pageCount):
                output.addPage(reader.getPage(iPage))
            # output_name=output_name+file_info.file_name.split('.')[0][0]+'-'
            output_name = '整合-' + file_info.folder_name.split('\\')[-1]
        # Write the merged document into the folder of the first selection.
        outputStream = open(
            file_infos[0].folder_name + '\\' + output_name + '.pdf', "wb")
        output.write(outputStream)
        outputStream.close()
        for stream in source_streams:
            stream.close()
        file_info1 = FileInfo(file_name=output_name + '.pdf',
                              file_path=file_infos[0].folder_name + '\\' +
                              output_name + '.pdf',
                              file_type='pdf',
                              load_user=get_user(request),
                              is_personal=int(B),
                              folder_name=file_infos[0].folder_name)
        file_info1.save()
        file_size1 = os.path.getsize(file_infos[0].folder_name + '\\' +
                                     output_name + '.pdf')
        # Store size in KB; anything under 1 KB is recorded as 1.
        FileInfo.objects.filter(
            file_path=file_infos[0].folder_name + '\\' + output_name +
            '.pdf').update(
                file_size=1 if 0 < file_size1 < 1024 else file_size1 / 1024)
        messages.success(request, "PDF合并成功!")
    elif 'download_list' in request.POST:
        # Batch download of sub-folder files.
        download_id = request.POST.getlist("d2p_list")
        file_infos = FileInfo.objects.filter(id__in=download_id)
        for file_info in file_infos:
            # NOTE(review): the return inside the loop means only the FIRST
            # selected file is ever sent — kept as-is to preserve behavior.
            # FileResponse takes ownership of the handle and closes it when
            # the response is consumed.
            handle = open(file_info.file_path, 'rb')  # renamed: 'file' shadowed a builtin
            response = FileResponse(handle)
            response[
                'Content-Disposition'] = 'attachment;filename="%s"' % urlquote(
                    file_info.file_name)
            return response
    elif 'tj_list' in request.POST:
        # Batch PDF -> .doc conversion via pdfminer text extraction.
        word_id = request.POST.getlist("d2p_list")
        file_infos = FileInfo.objects.filter(id__in=word_id)
        for file_info in file_infos:
            in_file = file_info.file_path
            out_file = file_info.file_path.split(".")[0] + ".doc"
            fp = open(in_file, 'rb')  # binary read mode
            # BUGFIX: fp was never closed (and leaked on the raise below);
            # guarantee cleanup with try/finally.
            try:
                # Build the parser/document pair for this PDF.
                parser = PDFParser(fp)
                doc = PDFDocument()
                parser.set_document(doc)
                doc.set_parser(parser)
                # Initialise with an empty password (none set).
                doc.initialize()
                if not doc.is_extractable:
                    raise PDFTextExtractionNotAllowed
                else:
                    # Resource manager / layout aggregator / interpreter.
                    rsrcmgr = PDFResourceManager()
                    laparams = LAParams()
                    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    # Counters for pages, images, curves, figures and
                    # horizontal text boxes (figures are counted but not
                    # printed, matching the original).
                    num_page, num_image, num_curve, num_figure, num_TextBoxHorizontal = 0, 0, 0, 0, 0
                    for page in doc.get_pages():
                        num_page += 1
                        interpreter.process_page(page)
                        layout = device.get_result()  # LTPage for this page
                        for x in layout:
                            if isinstance(x, LTImage):
                                num_image += 1
                            if isinstance(x, LTCurve):
                                num_curve += 1
                            if isinstance(x, LTFigure):
                                num_figure += 1
                            if isinstance(x, LTTextBoxHorizontal):
                                num_TextBoxHorizontal += 1
                                # Append the text content to the output file.
                                with open(out_file, 'a', encoding='utf-8') as f:
                                    results = x.get_text()
                                    f.write(results)
                                    f.write('\n')
                    print('对象数量:\n', '页面数:%s\n' % num_page,
                          '图片数:%s\n' % num_image, '曲线数:%s\n' % num_curve,
                          '水平文本框:%s\n' % num_TextBoxHorizontal)
                    file_info1 = FileInfo(
                        file_name=file_info.file_name.split('.')[0] + '.doc',
                        file_path=out_file,
                        file_type='doc',
                        load_user=get_user(request),
                        is_personal=int(B),
                        folder_name=file_info.folder_name)
                    file_info1.save()
                    file_size1 = os.path.getsize(out_file)
                    FileInfo.objects.filter(file_path=out_file).update(
                        file_size=1 if 0 < file_size1 < 1024 else file_size1 /
                        1024)
            finally:
                fp.close()
        messages.success(request, "DOC转换成功!")
    else:
        # Default action: Word -> PDF via the COM automation of MS Word.
        pdf_id = request.POST.getlist("d2p_list")
        file_infos = FileInfo.objects.filter(id__in=pdf_id)
        for file_info in file_infos:
            in_file = file_info.file_path
            out_file = file_info.file_path.split(".")[0] + ".pdf"
            pythoncom.CoInitialize()
            word = win32com.client.Dispatch('Word.Application')
            doc = word.Documents.Open(in_file)
            doc.SaveAs(out_file, FileFormat=17)  # 17 = wdFormatPDF
            doc.Close()
            # NOTE(review): the Word application instance is never Quit() —
            # possible process leak, left unchanged to avoid closing a
            # user-attached Word instance.
            time.sleep(1)
            file_info1 = FileInfo(file_name=file_info.file_name.split('.')[0] +
                                  '.pdf',
                                  file_path=out_file,
                                  file_type='pdf',
                                  load_user=get_user(request),
                                  is_personal=int(B),
                                  folder_name=file_info.folder_name)
            file_info1.save()
            file_size1 = os.path.getsize(out_file)
            FileInfo.objects.filter(file_path=out_file).update(
                file_size=1 if 0 < file_size1 < 1024 else file_size1 / 1024)
        messages.success(request, "PDF转换成功!")
    return HttpResponseRedirect(reverse('fileserver:list1', args=[a]))
def testOnePriceRu(self):
    """End-to-end test: fill in the price-tag form (Belarusian rouble),
    download the generated PDF and assert the entered values appear in
    the extracted page text."""
    elem = self.driver.find_element_by_id("comp_name")
    elem.send_keys("ООО Евроторг")
    elem = self.driver.find_element_by_id("date")
    elem.send_keys("13.01.2020")
    # Pick the Belarusian rouble in the currency drop-down.
    elem = self.driver.find_element_by_name("valyuta")
    elem.click()
    options = elem.find_elements_by_tag_name("option")
    for option in options:
        if option.text == "Белорусский рубль, 974":
            option.click()
            break
    elem = self.driver.find_element_by_id("tovar_ed_default")
    elem.send_keys("кг")
    elem = self.driver.find_element_by_id("tovar_country_default")
    elem.send_keys("РБ")
    # Goods table: first row.
    elem = self.driver.find_element_by_id("tab1")
    tbody = elem.find_element_by_tag_name("tbody")
    tr = tbody.find_element_by_tag_name("tr")
    # Product name.
    td = tr.find_element_by_class_name("tovar_name")
    field = td.find_element_by_tag_name("textarea")
    field.send_keys("Конфеты Южная ночь")
    # Product price.
    td = tr.find_element_by_class_name("tovar_cena")
    field = td.find_element_by_tag_name("input")
    field.send_keys("10,55")
    # Click the "Download" link.
    elem = self.driver.find_element_by_id("download")
    elem.click()
    # Wait 10 s, in case Firefox asks whether to save the file.
    time.sleep(10)
    # The saved file is named after today's date.
    today = datetime.date.today()
    fullpath = (self.savePath + "cenniki-new-" + today.strftime("%Y-%m-%d") +
                ".pdf")
    self.assertTrue(os.path.isfile(fullpath))
    # Extract the text of the first page. BUGFIX: the file handle was
    # opened without ever being closed — use a with-block instead.
    with open(fullpath, 'rb') as fh:
        page_obj = next(
            PDFPage.get_pages(fh, caching=True, check_extractable=True))
        resource_manager = PDFResourceManager()
        # In-memory sink for the extracted text.
        fake_file_handle = io.StringIO()
        converter = TextConverter(resource_manager, fake_file_handle)
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        page_interpreter.process_page(page_obj)
        # Grab the page text before tearing the objects down.
        page = fake_file_handle.getvalue()
        converter.close()
        fake_file_handle.close()
    # Every value typed into the form must show up in the PDF text.
    print(page)
    self.assertIn("ООО Евроторг", page)
    self.assertIn("13.01.2020", page)
    self.assertIn("кг", page)
    self.assertIn("РБ", page)
    self.assertIn("Конфеты Южная ночь", page)
    # Price (integer and fractional parts).
    self.assertIn("10", page)
    self.assertIn("55", page)