def __init__(self, pdf, codec='utf-8'):
    """
    Parse a PDF: extract its text lines and collect embedded images.

    Parameters
    --------------
    pdf: open (binary) handle to the pdf file
    codec: codec, default utf-8

    Attributes
    ---------------
    records: list of lines from the pdf file
    text: string of joined records, default ""
    didascalies: list of found didascalies with regexpr
    nimages: int, number of found images
    images: list of collected image objects
    """
    self.pdf = pdf
    self.text = ""
    self.records = []
    self.didascalies = []
    self.nimages = 0
    self.images = []
    self.codec = codec

    parser = PDFParser(pdf)
    document = PDFDocument(parser)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Shared resources (fonts, images) used by both interpreters below.
    rsrcmgr = PDFResourceManager()
    # Spacing parameters for parsing
    laparams = LAParams()

    # Pass 1: render the text of every page into a string buffer.
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)

        # Pass 2: walk the layout tree looking for images (LTImage) and
        # figures that may contain nested images (LTFigure).
        img_device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        img_interpreter = PDFPageInterpreter(rsrcmgr, img_device)
        for page in PDFPage.create_pages(document):
            img_interpreter.process_page(page)
            pdf_item = img_device.get_result()
            if pdf_item is not None:
                for thing in pdf_item:
                    if isinstance(thing, LTImage):
                        self.save_image(thing)
                    if isinstance(thing, LTFigure):
                        self.find_images_in_thing(thing)

        # Split the captured text into individual lines.
        self.records.extend(retstr.getvalue().splitlines())
    finally:
        # BUG FIX: the original never closed the converter or the buffer.
        device.close()
        retstr.close()
def convert(fname, pages=None):
    """Extract the text of every odd-indexed page of a PDF file.

    Parameters
    ----------
    fname : str
        Path to the PDF file.
    pages : iterable of int, optional
        Page numbers to consider; all pages when None/empty.
        (Generalized from the original's hard-coded ``pages = None``.)

    Returns
    -------
    str
        The extracted text.
    """
    pagenums = set(pages) if pages else set()
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    # `with` closes the input even on error (the original leaked it).
    with open(fname, 'rb') as infile:
        # if names and address on odd page; remove the check if names and
        # addresses are on even pages
        for pagenumber, page in enumerate(PDFPage.get_pages(infile, pagenums)):
            if pagenumber % 2:
                interpreter.process_page(page)
    converter.close()
    text = output.getvalue()
    output.close()
    return text
def __get_pdf_text__(self):
    """Extracts all the text from the pdf while removing
    superfluous/unmatched space characters.

    Newlines are stripped and carriage returns replaced with spaces, so the
    result is one continuous string.

    Returns:
        text (string): A string of all pdf text

    Code from:
        https://stackoverflow.com/questions/56494070/how-to-use-pdfminer-six-with-python-3
    """
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle,
                              laparams=LAParams())
    try:
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        with open(self.file_path, 'rb') as fh:
            for page in PDFPage.get_pages(fh, caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)
        text = fake_file_handle.getvalue()
    finally:
        # close open handles even when a page fails to parse
        # (the original leaked both on error)
        converter.close()
        fake_file_handle.close()
    return text.replace('\n', '').replace('\r', ' ')
def extract_pdf_content(pdf_path):
    """Extract all text from the PDF at ``pdf_path``.

    Parameters
    ----------
    pdf_path : str
        Path to the PDF file.

    Returns
    -------
    str
        The full extracted text.
    """
    rsrcmgr = PDFResourceManager()
    codec = 'utf-8'
    outfp = StringIO()  # buffer that captures the converted text
    laparams = LAParams()
    device = TextConverter(rsrcmgr=rsrcmgr, outfp=outfp, codec=codec,
                           laparams=laparams)
    try:
        # Open the PDF in binary mode and feed every page to the interpreter.
        with open(pdf_path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        mystr = outfp.getvalue()
    finally:
        # BUG FIX: release converter and buffer even when parsing fails.
        device.close()
        outfp.close()
    return mystr
def __extracttxt2(self):
    """Helper function to extract text by pdfminer, slower but handles
    formats not recognised by PyMuPDF.

    Returns:
        str: the full text of ``self.filepath``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
    try:
        # `with` closes the file even on error (the original leaked it).
        with open(str(self.filepath), 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    return text
def pdfreading():
    """Read every KIID PDF referenced by the global ``list1`` mapping.

    Returns:
        dict: keyed by ``(folder, filename)``; values are each document's
        extracted text.
    """
    tesi = {}
    for folder in list1:
        for filename in list1[folder]:
            # Fresh pdfminer machinery per document.
            manager = PDFResourceManager()
            buffer = io.StringIO()
            converter = TextConverter(manager, buffer)
            interpreter = PDFPageInterpreter(manager, converter)
            path = 'C:/Users/mpich/Desktop/MFI-Thesis/KIID/' + folder + '/' + filename
            with open(path, 'rb') as fh:
                for page in PDFPage.get_pages(fh, set(), caching=True,
                                              password="",
                                              check_extractable=False):
                    interpreter.process_page(page)
                extracted = buffer.getvalue()
            tesi[folder, filename] = extracted
            # close open handles
            converter.close()
            buffer.close()
    return tesi
def convert(fname, pages=None):
    """Convert the PDF at ``fname`` to plain text.

    Parameters
    ----------
    fname : str
        Path to the PDF file.
    pages : iterable of int, optional
        Page numbers to extract; all pages when None/empty.

    Returns
    -------
    str
        The extracted text.
    """
    pagenums = set(pages) if pages else set()
    manager = PDFResourceManager()
    output = io.StringIO()
    converter = TextConverter(manager, output, codec='utf-8',
                              laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    # `with` guarantees the input file is closed even if a page fails to
    # parse (the original leaked the handle on error).
    with open(fname, 'rb') as infile:
        for page in PDFPage.get_pages(infile, pagenums, caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)
    convertedPDF = output.getvalue()
    converter.close()
    output.close()
    return convertedPDF
def pdf_to_txt(fp):
    """Extract the text of an open PDF file object and split it into rough
    sentences.

    Parameters
    ----------
    fp : file object
        Binary file handle of the PDF; it is closed before returning.

    Returns
    -------
    list of str
        The text split on '.' (rough sentence boundaries).
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                  caching=True, check_extractable=True):
        interpreter.process_page(page)
    fp.close()
    device.close()
    # BUG FIX: the original shadowed the builtin `str` with a local.
    text = retstr.getvalue().replace('\n', ' ')
    retstr.close()
    return text.split('.')
def pdf_pages_to_list_of_strings(pdf_path):
    """Return a list with the text of each page of the PDF at ``pdf_path``.

    All pages accumulate in one StringIO buffer; the stream position
    ("cursor") is saved before each page is processed and restored
    afterwards so only the newly written text is read back per page.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
    try:
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pages_text = []
        # BUG FIX: the original never closed the file, device, or buffer.
        with open(pdf_path, 'rb') as pdf:
            # Process each page contained in the document.
            for page in PDFPage.get_pages(pdf):
                # Remember where the buffer ends before reading this page
                # (zero on the first page).
                read_position = retstr.tell()
                interpreter.process_page(page)
                # Rewind to the saved position and read this page's text.
                retstr.seek(read_position, 0)
                pages_text.append(retstr.read())
    finally:
        device.close()
        retstr.close()
    return pages_text
def to_html(self, pdf_path):
    """Render the PDF at ``pdf_path`` to HTML and parse it.

    Sets ``self.html`` to the raw decoded HTML and ``self.body`` to a
    BeautifulSoup tree built from the cleaned-up markup.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.BytesIO()
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        with open(pdf_path, 'rb') as fp:
            # maxpages=0 means all pages; empty pagenos means no filter.
            pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
    except Exception:
        # Best-effort: report the offending file, keep whatever rendered.
        print("[ERR] file path=", pdf_path)
    device.close()
    report = retstr.getvalue()
    retstr.close()
    self.html = report.decode('utf-8')
    chtml = self._replace(self.html)
    self.body = soup(chtml, 'html.parser')
def process_pdf(file):
    """Extract the text of each page of an open PDF file object.

    Images are skipped: only LTTextBox layout elements contribute text.

    Returns
    -------
    list of str
        One entry per page, containing that page's text.
    """
    # Creating the required objects
    resource_manager = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(resource_manager, laparams=laparams)
    page_interpreter = PDFPageInterpreter(resource_manager, device)
    # This list will contain the text at each page of the document.
    pdfText = []
    # Processing each page in the pdf.
    for page in PDFPage.get_pages(file):
        page_interpreter.process_page(page)
        layout = device.get_result()
        # Join once instead of repeated `+=` string concatenation.
        text = "".join(element.get_text()
                       for element in layout
                       if isinstance(element, LTTextBox))
        pdfText.append(text)
    return pdfText
def __call__(self, rev, contenttype=None, arguments=None):
    """Return the plain text extracted from the PDF readable via ``rev``."""
    manager = PDFResourceManager()
    # Both the buffer and the converter are context managers; they are
    # released automatically when the block exits.
    with io.StringIO() as buf, \
            TextConverter(manager, buf, laparams=LAPARAMS) as device:
        page_interpreter = PDFPageInterpreter(manager, device)
        for page in PDFPage.get_pages(rev):
            page_interpreter.process_page(page)
        return buf.getvalue()
def text_extractor(file_name):
    '''
    input: a file name of an earnings transcript
    output: extracted text from the transcript
    '''
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle,
                              laparams=LAParams())
    try:
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        with open(file_name, 'rb') as fh:
            for page in PDFPage.get_pages(fh, caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)
        text = fake_file_handle.getvalue()
    finally:
        # close open handles even if a page fails to parse
        # (the original leaked them on error)
        converter.close()
        fake_file_handle.close()
    return text
def newScan(parent):
    """Scan each PDF path in ``parent``: pull the text between 'SKUPrice1'
    and '$' from every file, logging the match to designs.txt and the file
    path to paths.txt.

    Returns an (always empty) list, kept for interface compatibility.
    """
    lista = []
    # `with` closes the two log files even if a PDF fails to parse
    # (the original leaked them on error).
    with open("designs.txt", "w+") as f, open("paths.txt", "w+") as g:
        # iterate over all the files in directory 'parent'
        for file_name in parent:
            resource_manager = PDFResourceManager()
            handle = io.StringIO()
            converter = TextConverter(resource_manager, handle)
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            with open(file_name, 'rb') as fh:
                for page in PDFPage.get_pages(fh, caching=True,
                                              check_extractable=True):
                    page_interpreter.process_page(page)
                text = handle.getvalue()
            word = find_between(text, "SKUPrice1", "$")
            print(word)
            f.write(word + "\n")
            g.write(file_name + "\n")
            converter.close()
            handle.close()
    return lista
def scan_folder(parent, keyword):
    """Recursively collect the paths of PDFs under ``parent`` whose text
    contains ``keyword``.

    Returns
    -------
    list of str
        Matching file paths, built as ``parent + "/" + name``.
    """
    lista = []
    # iterate over all the files in directory 'parent'
    for file_name in os.listdir(parent):
        if file_name.endswith(".pdf"):
            # Only build the per-file pdfminer machinery for PDFs.
            resource_manager = PDFResourceManager()
            handle = io.StringIO()
            converter = TextConverter(resource_manager, handle)
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            arquivo = open(parent + "/" + file_name, 'rb')
            with arquivo as fh:
                for page in PDFPage.get_pages(fh, caching=True,
                                              check_extractable=True):
                    page_interpreter.process_page(page)
                text = handle.getvalue()
                if (text.find(keyword) != -1):
                    lista.append(parent + "/" + file_name)
            converter.close()
            handle.close()
        else:
            current_path = "".join((parent, "/", file_name))
            if os.path.isdir(current_path):
                # BUG FIX: the original recursed without `keyword`
                # (TypeError at runtime) and dropped the recursive results.
                lista.extend(scan_folder(current_path, keyword))
    return lista
def convert_pdf_to_txt(path):
    """Return the full text of the PDF at ``path``."""
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # `with` closes the file even when a page fails to parse
        # (the original leaked the handle on error).
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    return text
def _pdf(self, path: str) -> str:
    """Load a PDF file and extract its text page by page.

    Args:
        path: path to the PDF file.

    Returns:
        The text of all pages joined with newlines — a single string, not
        a list (the previous docstring said "list of pages", which did not
        match the return type).
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec="utf-8", laparams=LAParams())
    list_of_pages = []
    with open(path, "rb") as pdf_file:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0  # 0 means no page limit
        caching = True
        pagenos = set()  # empty set -> process all pages
        for page in PDFPage.get_pages(
            pdf_file,
            pagenos,
            maxpages=maxpages,
            password=password,
            caching=caching,
            check_extractable=True,
        ):
            # All pages share one buffer: remember where it ended before
            # this page, then read back only the newly appended text.
            read_position = retstr.tell()
            interpreter.process_page(page)
            retstr.seek(read_position, 0)
            page_text = retstr.read()
            list_of_pages.append(page_text)
    device.close()
    retstr.close()
    return "\n".join(list_of_pages)
def handlefile(myfile):
    """Dispatch an uploaded file by detected type.

    PDFs are converted to text with pdfminer3; png/jpg/webp images are
    OCR'd with pytesseract.  The recognised text is passed to ``MakeForm``
    and the resulting dictionary returned.  Returns None when the file
    type cannot be guessed.
    """
    kind = filetype.guess('app/static/upload/' + myfile.name)
    if kind is None:
        print('Cannot guess file type!')
        # BUG FIX: the original fell through and crashed on kind.extension.
        return None
    print('File extension: %s' % kind.extension)
    print('File MIME type: %s' % kind.mime)
    if (kind.extension == "pdf"):
        # Deferred imports keep pdfminer3 off the image-only path
        # (duplicated import lines from the original removed).
        from pdfminer3.layout import LAParams, LTTextBox
        from pdfminer3.pdfpage import PDFPage
        from pdfminer3.pdfinterp import PDFResourceManager
        from pdfminer3.pdfinterp import PDFPageInterpreter
        from pdfminer3.converter import PDFPageAggregator
        from pdfminer3.converter import TextConverter
        import io
        resource_manager = PDFResourceManager()
        fake_file_handle = io.StringIO()
        converter = TextConverter(resource_manager, fake_file_handle,
                                  codec='utf-8', laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        with open('app/static/upload/' + myfile.name, 'rb') as fh:
            for page in PDFPage.get_pages(fh, caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)
            text = fake_file_handle.getvalue()
        converter.close()
        fake_file_handle.close()
        print(text)
    if (kind.extension == "png" or kind.extension == "jpg"
            or kind.extension == "webp"):
        from PIL import Image, ImageFilter, ImageChops
        import pytesseract
        from pytesseract import image_to_string
        import cv2
        filename = 'app/static/upload/' + myfile.name
        imgcv = cv2.imread(filename, 0)
        imp = Image.open(filename)
        text = image_to_string(imp)
        print(text)
    # NOTE(review): for any other extension `text` is never assigned and
    # this raises NameError, as in the original — confirm intended inputs.
    dictionary = MakeForm(text)
    return dictionary
def _extract_pdf_scores(stream):
    """Parse the first page of an ILIAS results PDF and return a mapping of
    question title -> score string, read from the results table.
    """
    # these laparams seem to work ok with the ILIAS default PDF
    # formatting as well as with UR custom styling.
    # see pdf/tests/default_style.pdf and pdf/tests.ur_style.pdf
    laparams = LAParams(line_overlap=0, char_margin=20, word_margin=0.1,
                        boxes_flow=0, detect_vertical=False)
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    parser = PDFParser(stream)
    document = PDFDocument(parser)
    # Only the first page is processed — it carries the results table.
    page = next(PDFPage.create_pages(document))
    interpreter.process_page(page)
    layout = device.get_result()
    boxes = []
    table_head_y = None  # y position of result table header
    order_name = "Reihenfolge"  # FIXME localize
    for element in layout:
        if isinstance(element, LTTextBoxHorizontal):
            boxes.append(element)
            if order_name in element.get_text().strip():
                table_head_y = element.y0
    # Keep only boxes on the same baseline as the table header.
    tboxes = list(filter(lambda box: box.y0 == table_head_y, boxes))
    # if LAParams is set correctly, head should extract the whole
    # results table's text now.
    table = tboxes[0].get_text().replace('\t', '')
    table = table[table.find(order_name):]
    # note: question titles might lack spaces; this is no problem
    # since we compare question names and scores only through
    # Result.normalize_question_title() later.
    scores = dict()
    cols = []
    # The table body is whitespace-separated, six columns per row:
    # column index 2 holds the question title, index 4 its score.
    for line in table.split("\n")[1:]:
        cols += re.split(r'\s+', line)
        if len(cols) >= 6:
            scores[cols[2]] = cols[4]
            cols = cols[6:]
    return scores
def __init__(self, ofile):
    """Set up the pdfminer page aggregator and interpreter, plus the
    bookkeeping flags used while rendering into ``ofile``."""
    manager = PDFResourceManager()
    self.device = PDFPageAggregator(manager, laparams=LAParams())
    self.interpreter = PDFPageInterpreter(manager, self.device)
    # Rendering state, reset per document.
    self.last_font = None
    self.in_rule = False
    self.font_print_pending = False
    self.header_footer_skipping = False
    self.ofile = ofile
def coverpageinfo(earnings_call_file):
    """
    takes in a pdf file and returns a dictionary with information regarding
    the cover page (company_name, symbol, date, year)

    :param earnings_call_file: File - representation of the earnings call
        file
    :return: dictionary with keys corresponding to the pdf basic info such
        as 'symbol', 'quarter_year', 'company_name' and 'published_date'
    """
    coverpage_info = {}

    # Only the cover (first) page is needed.
    resource_manager = PDFResourceManager()
    return_string = StringIO()
    device = TextConverter(resource_manager, return_string, codec='utf-8',
                           laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(resource_manager, device)
        for page in PDFPage.get_pages(earnings_call_file, set(), maxpages=0,
                                      password="", caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)
            break  # cover page only
        text = return_string.getvalue()
    finally:
        # BUG FIX: the original `break` skipped these close() calls,
        # leaking the converter and buffer.
        device.close()
        return_string.close()

    # removing footer from page
    cleansed_text = re.search(
        "(?<= FactSet CallStreet, LLC).*", text).group(0).strip()
    # use regex to match and extract company ticker, e.g. "(AAPL)"
    ticker = re.search("\((.*?)\)", cleansed_text).group(0).strip()
    # split the cover line around the ticker symbol
    splits = cleansed_text.split(ticker)
    splits = [i.strip() for i in splits]
    # quarter/year follows the ticker; fixed-width slicing assumes the
    # ingested data is structured (e.g. "Q1 2020")
    quarter_year = splits[1][:7].replace(" ", "_")
    # published date precedes the ticker
    published_date_time = splits[0][:11]
    # company name sits after the date
    company_name = splits[0][12:]
    # putting all the information of the cover page in a dictionary
    coverpage_info['symbol'] = ticker[1:-1]
    coverpage_info['quarter_year'] = quarter_year
    coverpage_info['company_name'] = company_name
    coverpage_info['published_date'] = published_date_time
    return coverpage_info
def convert_pdf_to_txt(filepath):
    """Return the full text of the PDF at ``filepath``."""
    rm = PDFResourceManager()
    sio = StringIO()
    device = TextConverter(rm, sio, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rm, device)
        with open(filepath, 'rb') as fp:
            for page in PDFPage.get_pages(fp=fp, pagenos=set(), maxpages=0,
                                          password='', caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        text = sio.getvalue()
    finally:
        # Release the converter and buffer even when parsing fails
        # (the original leaked both on error).
        device.close()
        sio.close()
    return text
def convert(pdffile):
    """Extract all text from ``pdffile`` and write it (UTF-8) to the
    module-level ``log_file``.

    NOTE(review): relies on module globals ``password`` and ``log_file`` —
    confirm both are defined before calling.
    """
    my_file = pdffile
    text_parts = []
    # Open and read the pdf file in binary mode
    with open(my_file, "rb") as fp:
        # Create parser object to parse the pdf content
        parser = PDFParser(fp)
        # Store the parsed content in PDFDocument object
        document = PDFDocument(parser, password)
        # Check if document is extractable, if not abort
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Shared resources (fonts, images) + layout analysis parameters
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        # Page aggregator yields LT* layout objects per page
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process the document page by page
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            # Only text containers (LTTextBox / LTTextLine) contribute
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    text_parts.append(lt_obj.get_text())
    # Join once instead of quadratic `+=` concatenation
    extracted_text = "".join(text_parts)
    with open(log_file, "wb") as my_log:
        my_log.write(extracted_text.encode("utf-8"))
    print("Done !!")
def __call__(self, rev, contenttype=None, arguments=None):
    """Extract the text of the PDF ``rev``, giving up after 15 seconds."""
    manager = PDFResourceManager()
    deadline = timedelta(seconds=15)
    started = datetime.now()
    with io.StringIO() as buf, \
            TextConverter(manager, buf, laparams=LAPARAMS) as device:
        page_interpreter = PDFPageInterpreter(manager, device)
        for page_idx, page in enumerate(PDFPage.get_pages(rev)):
            logging.debug("Processing PDF page %d", page_idx)
            page_interpreter.process_page(page)
            # Stop early rather than hang on pathological documents.
            if datetime.now() - started > deadline:
                logging.info("PDF parsing timed out after %d pages", page_idx)
                break
        logging.debug("PDF text extraction took: %s",
                      datetime.now() - started)
        return buf.getvalue()
def main(args):
    """Parse a PDF file and print pdfminer-specific layout-type stats."""
    msg(SCRIPT, args)
    if len(args) != 1:
        msg('Parse a PDF file and print some pdfminer-specific stats')
        msg('Usage:', SCRIPT, '<PDF-filename>')
        return 1
    infilename, = args
    lt_types = collections.Counter()
    with open(infilename, 'rb') as pdf_file:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(pdf_file)
        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization.
        password = ''
        document = PDFDocument(parser, password)
        # Check if the document allows text extraction.
        if not document.is_extractable:
            # BUG FIX: the original raised with the undefined name
            # `filename` (NameError); use the actual input path.
            raise PDFTextExtractionNotAllowed(infilename)
        # Make a page iterator
        pages = PDFPage.create_pages(document)
        # Set up for some analysis
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(
            detect_vertical=True,
            all_texts=True,
        )
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Look at all (nested) objects on each page
        for page_count, page in enumerate(pages, 1):
            # oh so stateful
            interpreter.process_page(page)
            layout = device.get_result()
            lt_types.update(type(item).__name__
                            for item in flat_iter(layout))
    msg('page_count', page_count)
    msg('lt_types:', ' '.join('{}:{}'.format(*tc) for tc in lt_types.items()))
def get_pdf_content(file_path):
    """Return the full text of the PDF at ``file_path``."""
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle,
                              laparams=LAParams())
    try:
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        with open(file_path, 'rb') as fh:
            for page in PDFPage.get_pages(fh, caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)
        text = fake_file_handle.getvalue()
    finally:
        # close open handles even if parsing raised
        # (the original leaked them on error)
        converter.close()
        fake_file_handle.close()
    return text
def get_text_from_pdf(in_file):
    """Return the text of the PDF at ``in_file`` with all newlines and
    spaces removed (whitespace-insensitive form, e.g. for matching CJK
    text)."""
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle,
                              laparams=LAParams())
    try:
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        with open(in_file, 'rb') as fh:
            for page in PDFPage.get_pages(fh, caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)
        pdftext = fake_file_handle.getvalue()
    finally:
        # close handles even if a page fails to parse
        # (the original leaked them on error)
        converter.close()
        fake_file_handle.close()
    content = pdftext.replace('\n', '').replace(" ", "")
    return content
def convert_pdf_to_txt(path_to_file):
    """Extract the text of ``os.getcwd() + path_to_file``; additionally
    re-convert the matching PDF under media/ via the module-level
    ``convert`` helper and write that result to media/textfiles/<name>.txt.

    Returns the text extracted directly from the input path.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
    path_ = os.getcwd() + path_to_file
    print(path_)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(path_, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()

    # Build media/textfiles/<name>.txt relative to the project BASE_DIR.
    BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    parentdir = os.path.abspath(os.path.join(BASE_DIR, os.pardir))
    mediadir = os.path.join(BASE_DIR, "media")
    txtdir = os.path.join(mediadir, "textfiles")
    base = os.path.basename(path_to_file)
    fileNAME = os.path.splitext(base)[0]
    fileTXT = fileNAME + '.txt'
    filePDF = fileNAME + '.pdf'
    filetxtpath = os.path.join(txtdir, fileTXT)
    filePDF = os.path.join(mediadir, filePDF)
    print(filePDF)
    # NOTE(review): this re-converts the media copy with `convert`,
    # duplicating the extraction above — confirm both passes are needed.
    convertedPDF = convert(filePDF, pages=None)
    # `with` ensures the output file is flushed/closed even on error
    # (the original left the handle open on write failure).
    with open(filetxtpath, 'w+', encoding="utf-8") as fileConverted:
        fileConverted.write(convertedPDF)
    return text
def get_cv_email(cv_path):
    """Extract the first e-mail address found in the CV PDF at ``cv_path``.

    Returns
    -------
    str or None
        The matched address, or None when no e-mail-like token occurs
        (the original crashed with AttributeError in that case).
    """
    pagenums = set()
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    # `with` closes the input even if a page fails to parse.
    with open(cv_path, 'rb') as infile:
        for page in PDFPage.get_pages(infile, pagenums):
            interpreter.process_page(page)
    converter.close()
    text = output.getvalue()
    output.close()
    match = re.search(r'[\w\.-]+@[a-z0-9\.-]+', text)
    return match.group(0) if match else None
def extractTextByPage(pdf_path):
    """Yield the text of each page of the PDF at ``pdf_path``, one page at
    a time (generator)."""
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, caching=True,
                                      check_extractable=True):
            # Fresh converter/buffer per page so each yield is isolated.
            resource_manager = PDFResourceManager()
            fake_file_handle = io.StringIO()
            converter = TextConverter(resource_manager, fake_file_handle)
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            try:
                page_interpreter.process_page(page)
                text = fake_file_handle.getvalue()
                yield text
            finally:
                # BUG FIX: close per-page handles even when the consumer
                # abandons the generator mid-iteration (the original only
                # closed them after the yield resumed).
                converter.close()
                fake_file_handle.close()