def __init__(self, filename=''):
    """Parse the PDF at *filename* and populate classification results.

    Side effects: sets self.all_classifications, self.problematicClassmarks,
    self.results and self.problems; calls the instance's processing hooks
    (process_page, process_classifications, remove_debug, seperate).

    Raises PDFTextExtractionNotAllowed if the document forbids extraction.

    Fix vs original: the file handle was never closed; a context manager
    now releases it even when parsing raises.
    """
    laparams = LAParams()
    with open(filename, 'rb') as fp:
        # Parser is bound to the open file object.
        parser = PDFParser(fp)
        # Empty-string password, as in the original.
        document = PDFDocument(parser, '')
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        self.all_classifications = {}
        self.problematicClassmarks = []
        # Process each page contained in the document.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            self.process_page(layout)
    # Post-processing works on accumulated state; no file access needed.
    self.process_classifications()
    self.remove_debug()
    self.seperate()
    self.results = self.all_classifications
    self.problems = self.problematicClassmarks
def generateFileContent(self):
    """Download the RAG abbreviations PDF, extract "abbr: meaning" lines
    and return the formatted dictionary text (unicode).

    NOTE: Python 2 code (`urllib.urlretrieve`).

    Fix vs original: the downloaded-file handle and the NamedTemporaryFile
    were both leaked; they are now closed deterministically.
    """
    import tempfile
    import urllib

    abbreviationsPdfUrl = u"http://www.realacademiagalega.org/c/document_library/get_file?uuid=f29e6ce1-9ac5-42e3-8c15-73c4b9b5f48b&groupId=10157"
    temporaryFile = tempfile.NamedTemporaryFile()
    try:
        urllib.urlretrieve(abbreviationsPdfUrl, temporaryFile.name)
        entries = set()
        with open(temporaryFile.name, "rb") as fileObject:
            parser = PDFParser(fileObject)
            document = PDFDocument(parser)
            resourceManager = PDFResourceManager()
            device = PDFPageAggregator(resourceManager)
            interpreter = PDFPageInterpreter(resourceManager, device)
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                layout = device.get_result()
                # Drop drawing primitives; keep only text-bearing objects.
                objects = [
                    object for object in layout
                    if not isinstance(object, LTRect)
                    and not isinstance(object, LTCurve)
                ]
                params = LAParams()
                for line in layout.group_objects(params, objects):
                    text = line.get_text()
                    if u":" in text:
                        # Abbreviation is the part before the colon.
                        entry = text.split(u":")[0]
                        entry = entry.strip()
                        entry = entry.replace(u"..", ".")
                        entries.add(entry)
    finally:
        # Deleting the NamedTemporaryFile removes the file on disk.
        temporaryFile.close()
    dictionary = u"# Abreviaturas empregadas no Dicionario da Real Academia Galega\n"
    dictionary += u"# http://www.realacademiagalega.org/abreviaturas\n"
    dictionary += u"\n"
    for entry in formatEntriesForDictionary(entries, u"abreviatura"):
        dictionary += entry
    return dictionary
def ParsePDF(): filename = open(pdfpath, 'rb') #以二进制读模式打开 #用文件对象来创建一个pdf文档分析器 parser = PDFParser(filename) # 创建一个PDF文档对象存储文档结构,提供密码初始化,没有就不用传该参数 doc = PDFDocument(parser, password='') #检查文件是否允许文本提取 if not doc.is_extractable: print("Not Allowd Extractable") raise PDFTextExtractionNotAllowed # 创建PDf 资源管理器来管理共享资源,#caching = False不缓存 rsrcmgr = PDFResourceManager(caching = False) # 创建一个PDF设备对象 laparams = LAParams() # 创建一个PDF页面聚合对象 device = PDFPageAggregator(rsrcmgr, laparams=laparams) #device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams) # 创建一个PDF解析器对象 interpreter = PDFPageInterpreter(rsrcmgr, device) # 获取page列表list对象, # print(PDFPage.get_pages(doc)) #获取page列表循环遍历列表,每次处理一个page的内容 for page in PDFPage.create_pages(doc): # 接受该页面的LTPage对象 interpreter.process_page(page) # 这里layout是一个LTPage对象 里面存放着 这个page解析出的各种对象 layout = device.get_result() for i in layout: if hasattr(i,"get_text") : content = i.get_text().replace(u'\xa0',u'').replace('\n','') document.add_paragraph(content , style=None) break document.save("a.docx") filename.close() return 1
def extract_citations(url):
    '''
    Extract the text following the "References" heading of a paper and
    write it (UTF-8) to the module-level `log_file`.

    Arguments:
        url (string): url of a pdf of a research paper

    NOTE: Python 2 code (urllib2 / StringIO over bytes).

    Fix vs original: removed the stray `fp = open(my_file, "rb")` —
    `my_file` is undefined here and the handle leaked, since `fp` was
    immediately rebound to the downloaded buffer.
    '''
    extracted_text = ""
    start_writing = False  # Don't want to start writing until we hit References
    # Download the PDF and wrap the bytes in a file-like object.
    f = urllib2.urlopen(urllib2.Request(url)).read()
    fp = StringIO(f)
    parser = PDFParser(fp)
    document = PDFDocument(parser, password="")
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.get_result()
        for lt_obj in layout:
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                # Capture text only once the References heading has passed;
                # the heading's own box flips the flag for subsequent boxes.
                if start_writing:
                    extracted_text += lt_obj.get_text()
                if "References\n" in lt_obj.get_text():
                    start_writing = True
    fp.close()
    with open(log_file, "w") as my_log:
        my_log.write(extracted_text.encode("utf-8"))
    print("Done !!")
def get_pdf_metadata(self, pdf):
    """Get PDF metadata with PDF content

    Args:
        pdf: PDF content (in bytes)

    Returns:
        metadata: dict with 'author', 'title' and 'year' keys; each falls
        back to an UNKNOWN_* placeholder when absent from the PDF info.

    Fixes vs original: the temp file is rewound (seek(0)) before parsing,
    closed even if parsing raises, and the repeated
    `d[k] if k in d else ''` lookups use dict.get.
    """
    metadata = {'author': 'UNKNOWN_AUTHOR',
                'title': 'UNKNOWN_TITLE',
                'year': 'UNKNOWN_YEAR'}
    temp_pdf_file = tempfile.TemporaryFile()
    try:
        temp_pdf_file.write(pdf)
        # Rewind so the parser reads from the start of the written bytes.
        temp_pdf_file.seek(0)
        pdf_parser = PDFParser(temp_pdf_file)
        pdf_doc = PDFDocument(pdf_parser)
        # First info dictionary of the document trailer.
        pdf_metadata = pdf_doc.info[0]
        author = make_pdf_metadata_str(pdf_metadata.get('Author', ''))
        if author:
            metadata['author'] = author
        title = make_pdf_metadata_str(pdf_metadata.get('Title', ''))
        if title:
            metadata['title'] = title
        year = pdf_metadata_moddate_to_year(
            make_pdf_metadata_str(pdf_metadata.get('ModDate', '')))
        if year:
            metadata['year'] = year
    finally:
        temp_pdf_file.close()
    return metadata
def pdfminer(f):
    """Render the FIRST page of the PDF at path *f* into an HTMLConverter
    (project class) and return it with features added.

    Raises PDFTextExtractionNotAllowed if extraction is forbidden.

    Fix vs original: the file handle was never closed.
    """
    with open(f, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        # all_texts=True also analyzes text inside figures.
        laparams = LAParams(all_texts=True)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        converter = HTMLConverter(os.path.basename(f))
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            converter.current_page = page
            converter.render(layout)
            break  # stop after first page.
    converter.add_features()
    return converter
def main():
    """Classify the PDF named on the command line.

    Step 1 extracts first-page text; step 2 decides SingleStock vs not;
    step 3 refines non-SingleStock into Economy/Industry. Returns the
    predicted label string.
    """
    # --- Step 1: extract text from the first page of the PDF ---
    pdf_path = sys.argv[1]
    infile = open(pdf_path, 'rb')
    text_buffer = io.StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, text_buffer, laparams=None)
    interpreter = PDFPageInterpreter(manager, converter)
    parser = PDFParser(infile)
    doc = PDFDocument(parser)
    for page in PDFPage.get_pages(infile, caching=False):
        interpreter.process_page(page)
        break  # first page only
    infile.close()
    converter.close()
    text = [text_buffer.getvalue()]
    text_buffer.close()

    # --- Step 2: SingleStock or not ---
    vectorizer_ss = joblib.load('my_2D_tfidf_full_model.pkl')
    features_ss = vectorizer_ss.transform(text)
    classifier_ss = joblib.load('my_2D_linear_reg_model.pkl')
    prediction_ss = classifier_ss.predict(features_ss)

    # --- Step 3: refine non-SingleStock into Economy / Industry ---
    if prediction_ss == 'SingleStock':
        return prediction_ss[0]
    vectorizer_topic = joblib.load('my_2D3D_tfidf_model.pkl')
    features_topic = vectorizer_topic.transform(text)
    classifier_topic = joblib.load('my_2D3D_linear_SVC_model.pkl')
    return classifier_topic.predict(features_topic)[0]
def character_extraction(self, address):
    """Walk every page of the PDF at *address*, feeding each text object
    (LTTextBox / LTTextLine) to self.fetch_chars and advancing
    self.page_num once per page."""
    # `with` replaces the original try/finally: the handle is closed even
    # when parsing raises.
    with open(address, 'rb') as pdf_file:
        # Parser reads the raw stream; the document stores the structure.
        pdf_parser = PDFParser(pdf_file)
        pdf_document = PDFDocument(pdf_parser, '')
        # Shared resources (fonts, images) live in the resource manager.
        resource_manager = PDFResourceManager()
        layout_params = LAParams()
        # The aggregator turns interpreted content into LT* layout objects.
        aggregator = PDFPageAggregator(resource_manager, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(resource_manager, aggregator)
        for pdf_page in PDFPage.create_pages(pdf_document):
            page_interpreter.process_page(pdf_page)
            page_layout = aggregator.get_result()
            # Only text-bearing layout objects are of interest here.
            for element in page_layout:
                if isinstance(element, (LTTextBox, LTTextLine)):
                    self.fetch_chars(element)
            self.page_num += 1
def parse_data(self, path, filetype, **kwargs):
    """Load PDF metadata from *path* into self.metadata and run _parse_data.

    Returns self on success, None on failure (errors recorded in
    self.errors). Non-PDF filetypes return None immediately.

    Fixes vs original:
    - an xref whose trailer had no Info entry left `info` as None and
      crashed on `info.items()`; such xrefs are now skipped;
    - `out` was unbound (NameError at `return out`) when `doc` was falsy;
    - self.metadata aliased the raw info dict; it now gets a copy before
      indirect objects are resolved into it.
    """
    self.filename = path
    self.metadata = {}
    if not filetype == FileTypes.PDF:
        return None
    with open(self.filename, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        if doc:
            try:
                for xref in doc.xrefs:
                    info_ref = xref.trailer.get('Info')
                    if not info_ref:
                        continue  # no document info in this xref
                    info = resolve1(info_ref)
                    if not info:
                        continue
                    # Copy so resolving below doesn't mutate the raw dict.
                    self.metadata = dict(info)
                    for k, v in info.items():
                        if isinstance(v, PDFObjRef):
                            self.metadata[k] = resolve1(v)
                    break  # first usable Info dictionary wins
                if not self.metadata:
                    self.errors.append('No metadata found')
                    out = None
                else:
                    self._parse_data()
                    out = self
            except Exception as e:
                self.logger.error(str(e))
                self.errors.append(str(e))
                out = None
        else:
            self.errors.append('Cannot parse document')
            out = None
        parser.close()
    return out
def extractembedded(outfp, fname, objids, pagenos, password='', dumpall=False,
                    codec=None, extractdir=None):
    # Extract embedded files (file attachments) from the PDF at *fname*
    # into *extractdir*, one file per /Filespec object found in the xrefs.
    # NOTE: Python 2 code (`print >>`, `file()`).
    # NOTE(review): outfp, objids, pagenos, dumpall and codec are accepted
    # but unused in this body — presumably kept for a shared CLI signature.
    def extract1(obj):
        # Write one Filespec's embedded stream to disk.
        # Prefer the Unicode file name (UF) over the plain one (F).
        filename = os.path.basename(obj['UF'] or obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile' %
                (filename))
        path = os.path.join(extractdir, filename)
        # Refuse to overwrite an existing file.
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        print >> sys.stderr, 'extracting: %r' % path
        out = file(path, 'wb')
        out.write(fileobj.get_data())
        out.close()
        return
    fp = file(fname, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser, password)
    # Scan every object in every xref table for Filespec dictionaries.
    for xref in doc.xrefs:
        for objid in xref.get_objids():
            obj = doc.getobj(objid)
            if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                extract1(obj)
    return
def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    This will attempt to use pdfminer to extract textual content from
    each page. If none is found, it'll send the images through OCR.
    """
    result = {'pages': []}
    with open(path, 'rb') as fh:
        manager = PDFResourceManager()
        layout_params = LAParams()
        aggregator = PDFPageAggregator(manager, laparams=layout_params)
        interpreter = PDFPageInterpreter(manager, aggregator)
        parser = PDFParser(fh)
        doc = PDFDocument(parser, '')
        # Copy document-info entries (author, title, ...) into the result,
        # skipping unresolved object references and the 'pages' key.
        if len(doc.info):
            for key, value in doc.info[-1].items():
                key = key.lower().strip()
                value = string_value(value)
                if key != 'pages' and value is not None and '<PDFObjRef:' not in value:
                    result[key] = string_value(value)
        for index, page in enumerate(PDFPage.create_pages(doc)):
            text = None
            try:
                interpreter.process_page(page)
                layout = aggregator.get_result()
                text = _convert_page(layout, path)
            except Exception as ex:
                log.warning("Failed to parse PDF page: %r", ex)
            # Fall back to OCR when extraction failed or yielded
            # next to nothing.
            if text is None or len(text) < 3:
                log.info("OCR: %r, pg. %s", path, index + 1)
                text = _extract_image_page(path, index + 1, languages)
            result['pages'].append(text)
        aggregator.close()
    return result
def parse_document(self):
    """Parse self.pdf page by page, collecting layout objects into self.res
    and per-page crop/media box coordinates into self.media_boxes.

    Returns (self.res, self.media_boxes). Documents that forbid extraction
    yield empty results.

    Fix vs original: the PDF file handle was never closed.
    """
    self.res = []                 # result set
    self.media_boxes = dict()     # page-index -> box-coordinate dict
    self.n = 0                    # page count
    # Layout parameters: defaults, or project-configured overrides.
    la_params = LAParams(detect_vertical=True)
    if constants.USE_CUSTOM_PDF_PARAMETERS:
        la_params = LAParams(detect_vertical=constants.DEFAULT_DETECT_VERTICAL,
                             line_overlap=constants.DEFAULT_LINE_OVERLAP,
                             line_margin=constants.DEFAULT_LINE_MARGIN,
                             word_margin=constants.DEFAULT_WORD_MARGIN,
                             char_margin=constants.DEFAULT_CHAR_MARGIN,
                             boxes_flow=constants.DEFAULT_BOXES_FLOW)
    with open(self.pdf, "rb") as pdf:
        pdf_parser = PDFParser(pdf)
        pdf_document = PDFDocument(pdf_parser)
        if pdf_document.is_extractable:
            resource_manager = PDFResourceManager()
            page_aggregator = PDFPageAggregator(resource_manager,
                                                laparams=la_params)
            page_interpreter = PDFPageInterpreter(resource_manager,
                                                  page_aggregator)
            pages = PDFPage.create_pages(pdf_document)
            for page in pages:
                page_interpreter.process_page(page)
                layout = page_aggregator.get_result()
                crop_box = page.cropbox
                page_box = page.mediabox
                # Record both crop box and full media box for this page.
                self.media_boxes[self.n] = {"x0": crop_box[0],
                                            "y0": crop_box[1],
                                            "x1": crop_box[2],
                                            "y1": crop_box[3],
                                            "x0page": page_box[0],
                                            "y0page": page_box[1],
                                            "x1page": page_box[2],
                                            "y1page": page_box[3]}
                self.box_id = -1
                self.res = self.get_objects(layout._objs, self.res,
                                            self.n, self.media_boxes)
                self.n += 1
    return self.res, self.media_boxes
def convert(input_file):
    """Extract resume text from the PDF at *input_file* and return a JSON
    string mapping technology keywords to their occurrence counts.

    Raises PDFTextExtractionNotAllowed if extraction is forbidden.

    Fixes vs original: the file handle was never closed, and the
    `sum(objs, [])` flatten was quadratic — replaced with a comprehension.
    """
    objs = []
    with open(input_file, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            # Sort each page's objects top-to-bottom (descending y0).
            objs.append(
                sorted(get_objects(layout), key=lambda o: o.y0, reverse=True))
    # Flatten the per-page lists into one list (linear time).
    objs = [obj for page_objs in objs for obj in page_objs]
    resume_as_text = extract_text(objs)
    counter = Counter()
    for word in technology_jargon.keywords:
        count = count_of_technology_words(word, resume_as_text)
        if count > 0:
            counter[word] += count
    return json.dumps(counter)
def get_total(filename):
    """Scan the PDF at *filename* for "Subtotal for all regions" markers and
    return the largest integer parsed from first lines of text boxes seen
    at or after the marker; -1 when none is found.

    Raises PDFTextExtractionNotAllowed if extraction is forbidden.

    Fix vs original: the file handle was never closed.
    """
    temp_total = -1
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        check_total = False  # becomes True once the marker has been seen
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    results = str(x.get_text())
                    if "Subtotal for all regions" in results:
                        check_total = True
                    if check_total:
                        # First line of the box, stripped of spaces and
                        # literal "\n" sequences, is tried as an integer.
                        temp_results = re.search(r'(.*)\n', results,
                                                 re.M | re.I).group(1)
                        temp_results = temp_results.replace(" ", "").replace(
                            "\\n", "")
                        try:
                            temp_num = int(temp_results)
                            if temp_num > temp_total:
                                temp_total = temp_num
                        except ValueError:
                            continue
    return temp_total
def extract_layout_by_page(pdf_path):
    """
    Extracts LTPage objects from a pdf file, one per page.

    Raises PDFTextExtractionNotAllowed if extraction is forbidden.

    Fix vs original: the file handle was never closed; the aggregated
    LTPage layouts remain valid after the file is closed.
    """
    laparams = LAParams()
    layouts = []
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layouts.append(device.get_result())
    return layouts
def pdf_to_string(pdf_file):
    """Print every layout object of every page of the PDF at *pdf_file*.

    NOTE: uses the legacy pdfminer API (set_document / initialize /
    doc.get_pages), preserved as-is.
    NOTE(review): despite the name, this prints objects and returns None —
    confirm whether callers expect a string.

    Fix vs original: the file handle was never closed.
    """
    fp = open(pdf_file, 'rb')
    try:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')  # empty password
        rsrcmgr = PDFResourceManager()
        # Margin configuration for layout analysis.
        laparams = LAParams()
        laparams.line_margin = 0.3
        laparams.word_margin = 0.3
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                print(lt_obj)
    finally:
        fp.close()
def get_problem_page(problem, pdf_path):
    """
    Returns the pdf object belonging to the page of a problem widget

    Parameters
    ----------
    problem : Problem
        Problem object in the database of the currently selected problem
    pdf_path : str
        Path to the PDF file of the exam for this problem

    Returns
    -------
    page : PDFPage
        PDFPage object with information about the current page
    """
    # NOTE(review): the handle is intentionally not closed here — the
    # returned PDFPage presumably reads lazily from the stream; confirm
    # the caller's lifetime expectations.
    parser = PDFParser(open(pdf_path, 'rb'))
    document = PDFDocument(parser)
    target = problem.widget.page
    # Slice the lazy page iterator down to exactly the wanted page.
    page_slice = itertools.islice(
        PDFPage.create_pages(document), target, target + 1)
    return next(page_slice)
def parse_question_file(question_file_path):
    """Return the non-empty text lines (each suffixed with '\\n') of every
    horizontal text box in the PDF at *question_file_path*.

    Raises PDFTextExtractionNotAllowed if extraction is forbidden.

    Fix vs original: `get_text()` already returns str on Python 3, so the
    original `.decode()` raised AttributeError; it has been removed.
    """
    text_content = []
    with open(question_file_path, 'rb') as question_file:
        parser = PDFParser(question_file)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    line = x.get_text().strip()
                    if line:
                        text_content.append(line + '\n')
    return text_content
def extract_text(in_path, out_path):
    """Convert every *.pdf under *in_path* to a same-named .txt in *out_path*,
    printing a progress percentage as it goes.

    #https://towardsdatascience.com/pdf-text-extraction-in-python-5b6ab9e92dd

    Fixes vs original:
    - glob.glob already returns paths prefixed with in_path, so the
      original `in_path + name` produced a doubled, nonexistent path;
    - `.replace("pdf", "txt")` could corrupt any filename containing the
      substring "pdf"; os.path.splitext swaps only the extension;
    - the input file handle is now closed via a context manager.
    """
    files = glob.glob(in_path + '*.pdf')
    for i, file_path in enumerate(files):
        print(str(i / len(files) * 100)[:4] + "%", end="\r")
        output_string = StringIO()
        with open(file_path, 'rb') as infile:
            parser = PDFParser(infile)
            doc = PDFDocument(parser)
            rsrcmgr = PDFResourceManager()
            device = TextConverter(rsrcmgr, output_string,
                                   laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
        base = os.path.splitext(os.path.basename(file_path))[0] + ".txt"
        out_filename = out_path + base
        with open(out_filename, 'w') as outfile:
            outfile.write(output_string.getvalue())
def load_data_from_pdf(pdf):
    """Read the AcroForm fields of the ICS PDF at *pdf* and normalize the
    extracted values: the reader-limit and product-configuration fields
    become booleans, and Yes/No/Off answers become True/False."""
    with open(pdf, 'rb') as file:
        parser = PDFParser(file)
        doc = PDFDocument(parser)
        parser.set_document(doc)
        form_fields = resolve1(doc.catalog['AcroForm'])['Fields']
        outcome = [load_fields_from_pdf(resolve1(field))
                   for field in form_fields]
    # format the outcome of data extract from ics pdf
    outcome = split_data(outcome)
    reader_limit = outcome['Max Dynamic Reader Limit sets supported']
    if reader_limit:
        # More than 4 sets counts as "supported".
        outcome['Max Dynamic Reader Limit sets supported'] = \
            int(reader_limit) > 4
    configuration = outcome['Product Configuration']
    if configuration:
        outcome['Product Configuration'] = \
            configuration == '(A) PCDA (IRWIN Reader) / S-ICR'
    # Map the remaining Yes/No/Off answers onto booleans.
    for key in outcome:
        if outcome[key] == 'Yes':
            outcome[key] = True
        elif outcome[key] in ['Off', 'No']:
            outcome[key] = False
    return outcome
def extract_pages(fp, start=None, end=None):
    """
    Yields LTPage objects from the open pdf file object *fp*.
    When both *start* and *end* are given, only pages with
    start <= index < end are processed.

    slightly modified from:
    https://euske.github.io/pdfminer/programming.html

    Fix vs original: the condition
    `start is not None and end is not None and i < start or i >= end`
    parsed as `(... and i < start) or (i >= end)`, so with end=None it
    evaluated `i >= None` — a TypeError on Python 3. Parentheses now group
    the range test correctly.
    """
    laparams = LAParams()
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    manager = PDFResourceManager()
    device = PDFPageAggregator(manager, laparams=laparams)
    interpreter = PDFPageInterpreter(manager, device)
    for i, page in enumerate(PDFPage.create_pages(document)):
        if start is not None and end is not None and (i < start or i >= end):
            continue
        interpreter.process_page(page)
        yield device.get_result()
def shan_convert(pdf_path):
    """Return all text of the PDF at *pdf_path* as one string.

    If the document forbids extraction, it is first re-saved through
    pikepdf (which drops the restriction flag) to a sibling
    "...shan_temp.pdf" file and that copy is read instead.

    Fixes vs original: on the re-save path both the original file handle
    and the pikepdf object were leaked; both are now closed.
    """
    fp = open(pdf_path, 'rb')
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    if not document.is_extractable:
        temp_file = pikepdf.open(pdf_path)
        temp_path = pdf_path[:-4] + "shan_temp" + ".pdf"
        temp_file.save(temp_path)
        temp_file.close()   # release the pikepdf handle
        fp.close()          # release the original handle before rebinding
        fp = open(temp_path, 'rb')
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pagenos = set()  # empty set = all pages
    for page in PDFPage.get_pages(fp, pagenos):
        interpreter.process_page(page)
    text = retstr.getvalue()
    fp.close()
    device.close()
    retstr.close()
    return text
def main(fname):
    # Extract all text from the PDF at *fname*, then search it for the
    # phrase "volunteer recycling" and print a ~200-character snippet.
    # NOTE: Python 2 code (`print snip` statement).
    with open(fname, 'rb') as fd:
        parser = PDFParser(fd)
        doc = PDFDocument(parser)
        # Check if document is extractable, if not abort
        if not doc.is_extractable:
            raise Exception
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        all_txt = ""
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            layout = device.get_result()
            # parse_layout is a project helper — presumably returns the
            # page's text; confirm its contract.
            txt = parse_layout(layout)
            all_txt += txt
        #print "Converted text\n", all_txt
        snip = find_pattern(all_txt, "volunteer recycling", 200)
        print snip
def parse_pdf(path, output_path):
    """Extract all text-box/text-line content from the PDF at *path* and
    write it UTF-8-encoded to *output_path*.

    NOTE: uses the legacy pdfminer API (set_document / initialize /
    doc.get_pages) and a fork-specific `heuristic_word_margin` parameter.
    """
    collected = []
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')  # empty password
        resource_manager = PDFResourceManager()
        layout_params = LAParams(all_texts=True, boxes_flow=2.0,
                                 heuristic_word_margin=True)
        aggregator = PDFPageAggregator(resource_manager,
                                       laparams=layout_params)
        interpreter = PDFPageInterpreter(resource_manager, aggregator)
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = aggregator.get_result()
            for element in layout:
                if isinstance(element, (LTTextBox, LTTextLine)):
                    collected.append(element.get_text())
    extracted_text = ''.join(collected)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(extracted_text)
def with_pdf(pdf_doc, fn, pdf_pwd, *args):
    """Open the pdf document, and apply the function, returning the results.

    Returns fn(doc, *args) when the document is extractable, else None.
    IOError (missing/unreadable file and similar) is swallowed, matching
    the original best-effort contract.

    Fix vs original: the file handle leaked whenever fn() or parsing
    raised; it is now closed in a finally block.
    """
    result = None
    fp = None
    try:
        # open the pdf file
        fp = open(pdf_doc, "rb")
        # create a parser object associated with the file object
        parser = PDFParser(fp)
        # create a PDFDocument object that stores the document structure
        doc = PDFDocument(parser, pdf_pwd)
        # connect the parser and document objects
        parser.set_document(doc)
        if doc.is_extractable:
            # apply the function and return the result
            result = fn(doc, *args)
    except IOError:
        # the file doesn't exist or similar problem
        pass
    finally:
        if fp is not None:
            fp.close()
    return result
def __init__(self, stream, pages=None, laparams=None, precision=0.001):
    """Wrap an open PDF *stream*: parse the document, decode its info
    metadata into self.metadata, and build the aggregator/interpreter.

    :param stream: open binary file object containing the PDF
    :param pages: optional page selection, stored as self.pages_to_parse
    :param laparams: optional dict of LAParams keyword arguments
    :param precision: numeric precision carried on the instance

    Fixes vs original idiom: `laparams == None` -> `is None`,
    `type(v) == list` -> `isinstance(v, list)`.
    """
    self.laparams = None if laparams is None else LAParams(**laparams)
    self.stream = stream
    self.pages_to_parse = pages
    self.precision = precision
    rsrcmgr = PDFResourceManager()
    self.doc = PDFDocument(PDFParser(stream))
    self.metadata = {}
    # Merge all info dictionaries; later ones override earlier keys.
    for info in self.doc.info:
        self.metadata.update(info)
    # Decode raw PDF values into plain Python strings/bools.
    for k, v in self.metadata.items():
        if hasattr(v, "resolve"):
            v = v.resolve()  # dereference indirect objects
        if isinstance(v, list):
            self.metadata[k] = list(map(decode_text, v))
        elif isinstance(v, PSLiteral):
            self.metadata[k] = decode_text(v.name)
        elif isinstance(v, bool):
            self.metadata[k] = v
        else:
            self.metadata[k] = decode_text(v)
    self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams)
    self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
def Pdf2Txt(DataIO,Save_path):
    # Extract every horizontal text box from the PDF stream *DataIO* and
    # append it (utf-8, one box per line) to the file at *Save_path*.
    # NOTE: Python 2 code (`print "Failed!"`, bytes + str concatenation).
    # Create a PDF parser for the given binary stream
    parser = PDFParser(DataIO)
    # Create a PDF document object that stores the document structure
    document = PDFDocument(parser)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        # Create a PDF resource manager to hold shared resources
        rsrcmgr=PDFResourceManager();
        # Layout-analysis parameters
        laparams=LAParams();
        # Create a PDF device (page aggregator) object
        #device=PDFDevice(rsrcmgr)
        device=PDFPageAggregator(rsrcmgr,laparams=laparams);# and an interpreter
        interpreter=PDFPageInterpreter(rsrcmgr,device)
        # Process each page
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page);
            # Receive the LTPage object for this page
            layout=device.get_result()
            for x in layout:
                try:
                    if(isinstance(x,LTTextBoxHorizontal)):
                        # Append this text box to the output file.
                        # NOTE(review): re-opened per box and the bare
                        # except hides all errors — confirm intent.
                        with open('%s'%(Save_path),'a') as f:
                            f.write(x.get_text().encode('utf-8')+'\n')
                except:
                    print "Failed!"
def parse(Path):
    """Parse the PDF in the open binary file object *Path*, print and
    return the list of horizontal text-box strings.

    Raises PDFTextExtractionNotAllowed if the document cannot be parsed
    for text.
    """
    document = PDFDocument(PDFParser(Path))
    extracted = []
    # Abort early when the PDF forbids text extraction.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        manager = PDFResourceManager()
        layout_params = LAParams()
        aggregator = PDFPageAggregator(manager, laparams=layout_params)
        interpreter = PDFPageInterpreter(manager, aggregator)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Collect the text of every horizontal text box on the page.
            for element in aggregator.get_result():
                if isinstance(element, LTTextBoxHorizontal):
                    extracted.append(element.get_text())
        print(extracted)
        return extracted
def parse_pdf(self, source_pdf: str = None) -> None:
    """Parse source PDF into entities which can be used for text searches
    for example.

    :param source_pdf: source path; when given, the active document is
        switched to it first
    """
    if source_pdf is not None:
        self.switch_to_pdf_document(source_pdf)
    parser = PDFParser(self.active_fileobject)
    document = PDFDocument(parser)
    pages = PDFPage.create_pages(document)
    resource_manager = PDFResourceManager()
    layout_params = LAParams(
        detect_vertical=True,
        all_texts=True,
    )
    converter = RPAConverter(resource_manager, laparams=layout_params)
    interpreter = PDFPageInterpreter(resource_manager, converter)
    # Feed every page through the converter, which accumulates all
    # (nested) objects; close() hands back the finished document model.
    for page in pages:
        interpreter.process_page(page)
    self.rpa_pdf_document = converter.close()
def get_blurb():
    """Pick a random PDF from /pdfs, render one random page to text and
    return (view-url-with-page-anchor, first 100 characters of the text).
    Returns ('', '') when no PDFs exist.

    NOTE: Python 2 code (`print >> sys.stderr`).

    Fixes vs original: `random.randint(0, len(pages))` has an INCLUSIVE
    upper bound and could index one past the end of `pages`
    (IndexError) — `randrange` excludes it; the file handle is also
    closed now via `with`.
    """
    pdfs = glob.glob('/pdfs/*')
    if not pdfs:
        print >> sys.stderr, 'NO PDFS'
        return '', ''
    pdf = random.choice(pdfs)
    print >> sys.stderr, 'pdf:', pdf
    with open(pdf, 'rb') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        assert document.is_extractable
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pages = list(PDFPage.get_pages(f))
        pnum = random.randrange(len(pages))
        interpreter.process_page(pages[pnum])
        txt = retstr.getvalue()
    return pdf.replace('pdfs', 'view') + '#page=' + str(pnum), txt[:100]