def convert_pdf_to_txt(path):
    """Render every page of a PDF through a ``TextConverter`` and return the text.

    Args:
        path: Despite the name, an already-open binary file object for the
            PDF; it is passed directly to ``PDFPage.get_pages`` and is closed
            by this function.

    Returns:
        The contents of the in-memory buffer after all pages were processed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    laparams.word_margin = float(0.15)
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
    fp = path
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pages = PDFPage.get_pages(
            fp,
            set(),
            maxpages=0,
            password="",
            caching=True,
            check_extractable=True,
        )
        for page in pages:
            interpreter.process_page(page)
        # The value is read before the ``finally`` clause closes the buffer.
        return retstr.getvalue()
    finally:
        # Release everything even when a page fails to parse.
        fp.close()
        device.close()
        retstr.close()
def read_pdf(path):
    """Extract the text of a PDF with both pdfminer and PyPDF2.

    Args:
        path: Filesystem path of the PDF.

    Returns:
        A tuple ``(extracted_text, pypdf_text)``: the pdfminer text
        (concatenated text boxes/lines) and the PyPDF2 text with
        non-breaking spaces replaced and whitespace runs collapsed to single
        spaces.
    """
    # --- PyPDF2 pass -------------------------------------------------------
    with open(path, 'rb') as pdf_file_obj:  # 'rb' for read binary mode
        pdf_reader = PyPDF2.PdfFileReader(pdf_file_obj)
        page_texts = [
            pdf_reader.getPage(page_number).extractText() + "\n"
            for page_number in range(0, pdf_reader.numPages)
        ]
    pypdf_text = "".join(page_texts)
    pypdf_text = " ".join(pypdf_text.replace(u"\xa0", " ").strip().split())

    # --- pdfminer pass -----------------------------------------------------
    chunks = []
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    chunks.append(lt_obj.get_text())
    extracted_text = ''.join(chunks)
    return extracted_text, pypdf_text
def parse_pdf(path, pdf):
    """Extract the text of ``pdf`` and write it to ``<path>/result.txt``.

    The extracted text is also printed.

    Args:
        path: Directory that receives ``result.txt``.
        pdf: Filesystem path of the PDF to read.

    Returns:
        The (already closed) file object that was used to write
        ``result.txt``, as the original did.
    """
    chunks = []
    with open(pdf, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    chunks.append(lt_obj.get_text())
    extracted_text = ''.join(chunks)
    print(extracted_text)
    with open(path + '/result.txt', "w") as text_file:
        text_file.write(extracted_text)
    # NOTE(review): the original had assignments to the globals ``file_one``
    # and ``file_two`` placed after the ``return``; they appear never to have
    # executed and are therefore not reproduced here -- confirm.
    return text_file
def get_pdf_text(pdf_file):
    """Return the text of a PDF, falling back to its images when it has none.

    The PDF is first read with pdfminer. When that yields an empty string,
    ``get_pdf_images`` is called on the same path and the result of
    ``get_text_from_image`` for each returned image is concatenated instead.

    Args:
        pdf_file: Filesystem path of the PDF.

    Returns:
        The extracted text.
    """
    pieces = []
    with open(pdf_file, 'rb') as file_hdl:
        parser = PDFParser(file_hdl)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        resources = PDFResourceManager()
        params = LAParams()
        params.char_margin = 1.0
        params.word_margin = 1.0
        aggregator = PDFPageAggregator(resources, laparams=params)
        interpreter = PDFPageInterpreter(resources, aggregator)
        for page in doc.get_pages():
            interpreter.process_page(page)
            pieces.extend(
                item.get_text()
                for item in aggregator.get_result()
                if isinstance(item, (LTTextBox, LTTextLine))
            )
    pdf_text = "".join(pieces)
    if not pdf_text:
        # No text layer found: fall back to the image-based helpers.
        for img_file in get_pdf_images(pdf_file):
            pdf_text += get_text_from_image(img_file)
    return pdf_text
def initialize_pdf_miner(fh, password = None):
    """Build the pdfminer objects needed to run layout analysis on a PDF.

    Args:
        fh: Open binary file object for the PDF.
        password: Document password; ``None`` is treated as the empty
            password.

    Returns:
        A tuple ``(doc, interpreter, device)`` where ``device`` is a
        ``PDFPageAggregator`` configured with ``word_margin = 0.0`` and
        ``interpreter`` is bound to that device.

    Raises:
        ValueError: If the document reports that it is not extractable.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fh)
    # Create a PDF document object that stores the document structure.
    # An absent password is normalised to '' rather than handing None on.
    doc = PDFDocument(parser, '' if password is None else password)
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise ValueError("PDFDocument is_extractable was False.")
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    laparams.word_margin = 0.0
    # Create the page aggregator and the interpreter that feeds it. (The
    # original first built a plain PDFDevice/interpreter pair that was
    # immediately replaced; that dead pair is gone.)
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
def initialize_pdf_miner(fh):
    """Build the pdfminer objects needed to run layout analysis on a PDF.

    Unlike a stricter variant, this one does not reject documents whose
    ``is_extractable`` flag is false.

    Args:
        fh: Open binary file object for the PDF.

    Returns:
        A tuple ``(doc, interpreter, device)`` where ``device`` is a
        ``PDFPageAggregator`` and ``interpreter`` is bound to it.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fh)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument(parser)
    # Connect the parser and document objects.
    parser.set_document(doc)
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis. The original passed word_margin=0.1 and
    # then overwrote it with 0.0; the effective value is given directly.
    laparams = LAParams(
        line_overlap=0.3,
        char_margin=1.0,
        line_margin=0.5,
        word_margin=0.0,
        boxes_flow=0.1,
        detect_vertical=False,
        all_texts=False,
    )
    # Create a PDF page aggregator object and the interpreter that feeds it.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
def extract_text_from_pdf(path):
    '''
    Extract the text of a PDF file.

    (The original note says the function also works for Asian-language
    documents; source: stackoverflow.)

    path            `str`   path to the pdf file, e.g. './folder_name/text.pdf'
    extracted_text  `str`   concatenated text of every text box / text line
    '''
    chunks = []
    # The context manager guarantees the handle is released even if parsing
    # fails part-way through.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    chunks.append(lt_obj.get_text())
    return ''.join(chunks)
def getTextPdf(self, filename):
    """Return the text of a short PDF, or '' when it is long or unreadable.

    The page count is read with PyPDF2 and printed. Documents with more than
    4 pages yield ''. Otherwise the text is extracted with pdfminer.

    Args:
        filename: Filesystem path of the PDF.

    Returns:
        The extracted text, or '' if the document has more than 4 pages or
        any error occurs while reading it.
    """
    try:
        with open(filename, 'rb') as pdfFileObj:
            numeroDePaginas = PyPDF2.PdfFileReader(pdfFileObj).numPages
        print(numeroDePaginas)
        if numeroDePaginas > 4:
            return ''

        chunks = []
        with open(filename, 'rb') as pdf_fh:
            parser = PDFParser(pdf_fh)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            laparams.char_margin = 1.0
            laparams.word_margin = 1.0
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in doc.get_pages():
                interpreter.process_page(page)
                for lt_obj in device.get_result():
                    if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                        chunks.append(lt_obj.get_text())
        return ''.join(chunks)
    except Exception:
        # Best-effort extraction: any failure is reported as "no text".
        # ``Exception`` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return ''
def convert_pdf_to_txt(path):
    """Return the concatenated text of every text box/line in a PDF.

    Args:
        path: Filesystem path of the PDF.

    Returns:
        The extracted text as a single string.
    """
    chunks = []
    # ``with`` closes the file on success and on failure alike; the original
    # only closed it when no exception occurred.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    chunks.append(lt_obj.get_text())
    return ''.join(chunks)
def extract_text_from_pdf(path_in, path_out, fichier, page_beg, page_end=0):
    """Extract the text of a page range from ``<path_in>/<fichier>``.

    Each page's text is preceded by the header
    ``"EXTRACTION DE LA PAGE <n>"`` followed by a blank line.

    Args:
        path_in: Directory containing the PDF.
        path_out: Unused by this function; kept for interface compatibility.
        fichier: File name of the PDF inside ``path_in``.
        page_beg: First page to extract (1-based).
        page_end: Last page to extract (1-based, inclusive). ``0`` means
            "only ``page_beg``".

    Returns:
        The extracted text as a single string.

    Raises:
        IndexError: If the requested range runs past the last page.
    """
    if page_end == 0:
        page_end = page_beg
    parts = []
    with open(path_in + '/' + fichier, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        # 2.0 by default: two chars closer than this are considered
        # contiguous and get grouped into one.
        laparams.char_margin = 4.0
        # 0.1 by default: a space is inserted when two words are further
        # apart than this.
        laparams.word_margin = 0.3
        # 0.5 by default: lines closer than this are grouped together.
        laparams.line_margin = 0.5
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pages = list(doc.get_pages())
        for i in range(page_beg - 1, page_end):
            page = pages[i]
            parts.append("EXTRACTION DE LA PAGE " + str(i + 1) + "\n\n")
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    parts.append(lt_obj.get_text())
                    # NOTE(review): extra newline after every text object;
                    # the flattened original does not show whether this was
                    # per object or per page -- confirm.
                    parts.append("\n")
    return ''.join(parts)
def convertPDF2txt(fname, pages=None):
    """Extract, print and return the text of a PDF.

    Args:
        fname: Filesystem path of the PDF.
        pages: Unused by this function; kept for interface compatibility.

    Returns:
        The concatenated text of every text box/line in the document.
    """
    chunks = []
    # The original passed an anonymous ``open()`` to the parser, so the
    # handle could never be closed explicitly.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        # These two parameters were changed to get rid of white spaces
        # inside words.
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    chunks.append(lt_obj.get_text())
    extracted_text = ''.join(chunks)
    print(extracted_text)
    return extracted_text
def get_text_rows(path):
    """Yield the horizontal text boxes of a PDF grouped into rows.

    Text boxes are keyed by ``(page index, integer bottom y-coordinate)``.
    ``LTFigure`` containers are searched recursively. Rows are yielded sorted
    by page and then by descending y; entries within a row are sorted.

    Args:
        path: Filesystem path of the PDF.

    Yields:
        Tuples ``(page, y, entries)`` where ``entries`` is a sorted list of
        ``(x, text)`` pairs, ``x`` being the integer left coordinate of the
        box and ``text`` the result of ``sanitize`` on its text.
    """
    rows = defaultdict(list)

    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for layout analysis.
    laparams = LAParams()
    laparams.line_overlap = 0.01
    laparams.line_margin = 0.01
    laparams.word_margin = 0.15
    # Create a PDF page aggregator object and its interpreter.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    def parse_obj(lt_objs, page):
        # Loop over the object list.
        for obj in lt_objs:
            if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
                # y is stored negated so that an ascending sort of the keys
                # walks each page from the highest y downwards.
                rows[(page, -int(obj.bbox[1]))].append(
                    (int(obj.bbox[0]), sanitize(obj.get_text())))
            elif isinstance(obj, pdfminer.layout.LTFigure):
                # A container: recurse into its children.
                parse_obj(obj._objs, page)

    # Every page is read before the first row is yielded, so the file can be
    # closed before handing control back to the consumer.
    with open(path, 'rb') as fp:
        for page_num, page in enumerate(PDFPage.get_pages(fp)):
            interpreter.process_page(page)
            layout = device.get_result()
            parse_obj(layout._objs, page_num)

    for key in sorted(rows):
        page, neg_y = key
        yield (page, -neg_y, sorted(rows[key]))
def to_txt(infile: str, outfile: str) -> str:
    """
    Convert a pdf file to txt.

    The output file is written as UTF-8 with encoding errors ignored.

    :param infile: pdf file path;
    :param outfile: txt file path;
    :return: txt file path (the ``outfile`` argument, unchanged);
    """
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    laparams = LAParams()
    laparams.word_margin = float(0)
    laparams.line_margin = float(1)
    with io.open(outfile, 'wt', encoding='utf-8', errors='ignore') as outfp:
        device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        try:
            with io.open(infile, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, set(), maxpages=0,
                            password='', caching=caching,
                            check_extractable=True)
        finally:
            # Closed after the input and before the output, as before, but
            # now also when the conversion raises.
            device.close()
    return outfile
def parse_qp(docket_number):
    """Download a "questions presented" PDF and return its text.

    The PDF is fetched from ``https://www.supremecourt.gov/qp/`` using a name
    derived from the docket number: dockets containing ``-Orig`` become
    ``"<prefix> orig"``, all others ``"<term>-<number zero-padded to 5>"``.

    Text objects containing "LOWER COURT CASE NUMBER:" or "DECISION BELOW:"
    are skipped, ``(cid:160)`` markers are replaced by spaces, and runs of
    spaces are collapsed.

    Args:
        docket_number: Docket number such as ``"18-280"``.

    Returns:
        The cleaned text of the document.
    """
    if "-Orig" in docket_number:
        docket = docket_number.split("-")[0] + ' orig'
    else:
        split_docket = docket_number.split("-")
        docket = '{term}-{num:05d}'.format(term=split_docket[0],
                                           num=int(split_docket[1]))
    # A timeout keeps an unresponsive server from hanging the caller forever.
    response = requests.get(
        "https://www.supremecourt.gov/qp/" + docket + "qp.pdf", timeout=30)
    fp = io.BytesIO(response.content)
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    laparams.char_margin = 1.0
    laparams.word_margin = 1.0
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    chunks = []
    for page in doc.get_pages():
        interpreter.process_page(page)
        for lt_obj in device.get_result():
            if not isinstance(lt_obj, (LTTextBox, LTTextLine)):
                continue
            text = lt_obj.get_text().replace("(cid:160)", " ")
            if ("LOWER COURT CASE NUMBER:" not in text
                    and "DECISION BELOW:" not in text):
                chunks.append(text)
    return re.sub(r' +', ' ', ''.join(chunks))
def parse(self, path):
    """Convert the PDF at ``path`` to text and wrap it in a ``Sample``.

    Args:
        path: Filesystem path of a PDF file. Directories are not supported.

    Returns:
        ``Sample(path, None, text)`` where ``text`` is the converter output.

    Raises:
        NotImplementedError: If ``path`` is a directory.
    """
    # Directory: rejected before any resource is allocated.
    if os.path.isdir(path):
        raise NotImplementedError()

    out = StringIO.StringIO()
    try:
        # Binary mode: a PDF is not text. ``open`` replaces the
        # Python-2-only ``file`` builtin.
        with open(path, 'rb') as fp:
            rsrc = PDFResourceManager()
            laparams = LAParams()
            laparams.char_margin = 2.0
            laparams.line_margin = 2.0
            laparams.word_margin = 0.0
            device = TextConverter(rsrc, out, codec='utf-8',
                                   laparams=laparams)
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()
            interpreter = PDFPageInterpreter(rsrc, device)
            for page in doc.get_pages():
                interpreter.process_page(page)
            device.close()
            sample = Sample(path, None, out.getvalue())
    finally:
        out.close()
    return sample
def extractText(file_name):
    """Extract the text in a PDF file.

    Args:
        file_name: Filesystem path of the PDF.

    Returns:
        The concatenated text of every text box/line in the document.
    """
    chunks = []
    # The handle is now closed deterministically; the original left it open.
    with open(file_name, 'rb') as connection:
        parser = PDFParser(connection)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    chunks.append(lt_obj.get_text())
    return ''.join(chunks)
def extract_layout_by_page(pdf_path, page_number):
    """
    Run pdfminer layout analysis on a single page of a PDF.

    :param pdf_path: pdf file path
    :param page_number: the specific page that you want to parse (start from 1)
    :return: the layout object produced by the aggregator for that page
        (an LTPage according to the original comments)
    :raises ValueError: if ``page_number`` is smaller than 1
    :raises IndexError: if ``page_number`` is past the last page
    :raises PDFTextExtractionNotAllowed: if the document is not extractable
    """
    # Without this guard 0 or a negative number would silently wrap around
    # and select a page counted from the end of the document.
    if page_number < 1:
        raise ValueError(
            "page_number starts from 1, got %r" % (page_number,))

    # Open in binary read mode.
    with open(pdf_path, 'rb') as fp:
        # Create a PDF parser from the file object.
        praser = PDFParser(fp)
        # Create a PDF document.
        doc = PDFDocument()
        # Connect the parser and the document object.
        praser.set_document(doc)
        doc.set_parser(praser)
        doc.initialize()
        # Abort when the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager for shared resources.
        rsrcmgr = PDFResourceManager()
        # Layout-analysis parameters and the aggregating device.
        laparams = LAParams()
        laparams.line_overlap = 0.3
        laparams.char_margin = 3
        laparams.word_margin = 0.3
        laparams.line_margin = 0.01
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pages = list(doc.get_pages())
        interpreter.process_page(pages[page_number - 1])
        # Fetch the layout object for the processed page.
        return device.get_result()
def pdf_to_text_file(file_path):
    """Extract the text of a PDF and return it UTF-8 encoded.

    Args:
        file_path: Full path of the PDF including its file name, for example
            ``C://UserName/Folder1/PdfFile.pdf``.

    Returns:
        The concatenated text of every text box/line, encoded as UTF-8.
    """
    from pdfminer.pdfparser import PDFParser, PDFDocument
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams, LTTextBox, LTTextLine

    chunks = []
    with open(file_path, 'rb') as file_content:
        parser = PDFParser(file_content)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        # These two parameters are changed to get rid of white spaces inside
        # words.
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the pdf document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    chunks.append(lt_obj.get_text())
    return ''.join(chunks).encode("utf-8")
def initialize_pdf_miner(fh):
    """Build the pdfminer objects needed to run layout analysis on a PDF.

    Args:
        fh: Open binary file object for the PDF.

    Returns:
        A tuple ``(doc, interpreter, device)`` where ``device`` is a
        ``PDFPageAggregator`` configured with ``word_margin = 0.0`` and
        ``interpreter`` is bound to that device.

    Raises:
        ValueError: If the document reports that it is not extractable.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fh)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the password for initialization (empty string: no password).
    doc.initialize("")
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise ValueError("PDFDocument is_extractable was False.")
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    laparams.word_margin = 0.0
    # Create the page aggregator and the interpreter that feeds it. The
    # plain PDFDevice/interpreter pair the original built first was
    # discarded immediately and has been dropped.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
def pdfToTextDict(filename):
    """Extract the text of every PDF matching ``filename + '*.pdf'``.

    Files are processed in sorted order. A file whose parsing fails is
    reported on stdout and skipped.

    Args:
        filename: Path prefix used to build the glob pattern.

    Returns:
        A list with one text string per successfully parsed PDF.
    """
    textDict = []
    for pdf in sorted(glob.glob(filename + '*.pdf')):
        # One handle per file, released before moving on to the next one.
        with open(pdf, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            try:
                doc.set_parser(parser)
                doc.initialize('')
                rsrcmgr = PDFResourceManager()
                laparams = LAParams()
                laparams.char_margin = 1.0
                laparams.word_margin = 1.0
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                chunks = []
                for page in doc.get_pages():
                    interpreter.process_page(page)
                    for lt_obj in device.get_result():
                        if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                            chunks.append(lt_obj.get_text())
                textDict.append(''.join(chunks))
            except Exception:
                # Best effort: skip this document but let KeyboardInterrupt
                # and SystemExit through (the original used a bare except).
                print("set parser error)")
    return textDict
def __extract_extra__(request, item_id=None):
    """Fill ``other_search_text`` of items from the text of their PDF.

    Items without a PDF, or that already have search text, are skipped. The
    view stops at the first item whose PDF fails with an ``AssertionError``.

    Args:
        request: The incoming request; its user must be authenticated.
        item_id: When given, only the item with this id is processed;
            otherwise all items are.

    Returns:
        An ``HttpResponse`` describing the outcome.
    """
    if not request.user.is_authenticated():
        return HttpResponse('Please sign in first')

    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    from pdfminer.converter import TextConverter
    from cStringIO import StringIO

    laparams = LAParams()
    laparams.char_margin = 1.0
    laparams.line_margin = 0.3
    laparams.word_margin = 0.2
    codec = 'utf-8'
    caching = True

    if item_id:
        all_items = Item.objects.filter(id=item_id)
    else:
        all_items = Item.objects.all()

    # Pre-bound so the final response cannot hit an unbound name when the
    # queryset is empty.
    item = None
    for item in all_items:
        # Don't extract if no PDF exists; or if we already have search text
        if not item.pdf_file or item.other_search_text:
            continue

        rsrcmgr = PDFResourceManager(caching=caching)
        outfp = StringIO()
        device = TextConverter(rsrcmgr, outfp, codec=codec,
                               laparams=laparams)
        fp = item.pdf_file.file
        try:
            process_pdf(rsrcmgr, device, fp, pagenos=set(), maxpages=0,
                        password='', caching=caching, check_extractable=True)
        except AssertionError:
            logger.warning('FAILED in completely PDF index "%s"', item.title)
            return HttpResponse('FAILED in completely PDF index "%s"'
                                % item.title)
        else:
            logger.debug('Full PDF index of item "%s"', item.title)
        finally:
            fp.close()
            device.close()

        outfp.seek(0)
        item.other_search_text = outfp.read()
        outfp.close()
        item.save()

    if item is None:
        return HttpResponse('No items found to index')
    return HttpResponse('Full PDF indexed for item "%s"' % item.title)
def parse_pdf(self, test_parse=False):
    """Convert ``self.source_file`` to text and load it into ``self.file_array``.

    The PDF is converted into ``self.text_file``; that file is then read
    back line by line into ``self.file_array`` and its last entry is dropped
    (the original comment notes it is always ``'\\x0c'``). Unless
    ``test_parse`` is true the intermediate text file is removed.

    Args:
        test_parse: When true, keep ``self.text_file`` on disk.

    Raises:
        DTPOFileError: If either file cannot be opened or the PDF cannot be
            converted.
    """
    dtpo_log("debug", "parsePDF sourceFile -> '%s'", self.source_file)

    caching = True
    laparams = LAParams()
    laparams.char_margin = 8.0
    laparams.word_margin = 2.0
    rsrcmgr = PDFResourceManager(caching=caching)

    # ``open`` replaces the Python-2-only ``file`` builtin.
    try:
        outfp = open(self.text_file, "w")
    except IOError as io_error:
        raise DTPOFileError(self.text_file, 0, str(io_error))

    # The ``with`` blocks release both handles on every exit path, including
    # the error paths that previously leaked them.
    with outfp:
        try:
            fp = open(self.source_file, "rb")
        except IOError as io_error:
            raise DTPOFileError(self.source_file, 0, str(io_error))

        with fp:
            device = None
            try:
                device = TextConverter(rsrcmgr, outfp, codec="utf-8",
                                       laparams=laparams)
                process_pdf(rsrcmgr, device, fp, set(), maxpages=0,
                            caching=caching, check_extractable=True)
            except PDFException as pdf_error:
                message = "Failed to parse file {0} -> {1}".format(
                    self.source_file, str(pdf_error))
                raise DTPOFileError(self.source_file, 0, message)
            except Exception as exception:
                message = "Failed to parse PDF file Unknown exception {0} - > {1}".format(
                    type(exception), str(exception))
                raise DTPOFileError(self.source_file, 0, message)
            finally:
                if device is not None:
                    device.close()

    # Got the PDF converted - now get it into an array.
    with open(self.text_file) as text_fh:
        self.file_array = list(text_fh)

    # Remove the last entry - it's always '\x0c'.
    if self.file_array:
        del self.file_array[-1]

    # Remove the outfile.
    if not test_parse:
        os.remove(self.text_file)
def readPDFs(folder):
    """Read every ``*.pdf`` in ``folder`` and return lemmatized word lists.

    For each PDF the text is extracted with pdfminer, cut at the last
    occurrence of ``'Reg AC'`` (when present) and split on whitespace. Words
    are lower-cased, restricted to purely ``a-z`` words longer than 3
    characters, and lemmatized with WordNet.

    Args:
        folder: Directory containing the PDFs.

    Returns:
        A list with one list of processed words per PDF.
    """
    import os

    lem = WordNetLemmatizer()
    # Raw string: ``\.`` is a regex escape, not a string escape.
    filename_pattern = r'.+\.pdf'
    my_corpus = PlaintextCorpusReader(folder, filename_pattern)
    alpha_only = re.compile(r'^[a-z]+$')

    corpus = []
    # Walk the list of file names.
    for i, pdf_name in enumerate(my_corpus.fileids()):
        # os.path.join also works when ``folder`` has no trailing separator.
        with open(os.path.join(folder, pdf_name), 'rb') as fp:
            print(i)
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            laparams.char_margin = 1.0
            laparams.word_margin = 1.0
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            chunks = []
            for page in doc.get_pages():
                interpreter.process_page(page)
                for lt_obj in device.get_result():
                    if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                        chunks.append(lt_obj.get_text())
        text = ''.join(chunks)
        last = text.rfind('Reg AC')
        if last != -1:
            text = text[0:last]
        corpus.append(text.split())

    # Alphabets, lower-case and lemmatize.
    processed_docs = []
    for sub_doc in corpus:
        lowered = (w.lower() for w in sub_doc)
        processed_docs.append([
            lem.lemmatize(w)
            for w in lowered
            if alpha_only.search(w) and len(w) > 3
        ])

    # Return list of lists.
    return processed_docs
def convert(fname, pages=None, M=1.0, L=0.3, W=0.2, F=0.5):
    """
    Converts a pdf filename into plain text.

    Each value is specified not as an actual length, but as a proportion of
    the length to the size of each character in question.

    Parameters define layout analysis. In a PDF text is in several chunks of
    various types. Text extraction needs to recover text chunks which are
    regarded as continuous if elements distance is closer than the
    char_margin (identified as M) and thus are grouped into one block. Two
    lines are part of the same text if they are closer than the line_margin
    (L). If the distance between two words is greater than the word_margin
    (W), blank characters (spaces) shall be inserted as necessary to keep
    format.

    Boxes flow (F) specifies how much a horizontal and vertical position of a
    text matters when determining text flow order. The value should be within
    the range from -1.0 (only horizontal position matters) to +1.0 (only
    vertical position matters).

    Keyword arguments:
    fname -- PDF file name (string)
    pages -- Set of pages to extract (set)
    M -- char_margin (float)
    L -- line_margin (float)
    W -- word_margin (float)
    F -- boxes_flow (float)

    Return:
    text: pdf contents, as read from the in-memory ``BytesIO`` buffer
    """
    pagenums = set(pages) if pages else set()

    output = BytesIO()
    try:
        manager = PDFResourceManager()
        laparams = LAParams()
        laparams.all_texts = True
        laparams.detect_vertical = False
        laparams.char_margin = M
        laparams.line_margin = L
        laparams.word_margin = W
        laparams.boxes_flow = F
        converter = TextConverter(manager, output, codec="utf-8",
                                  laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        return output.getvalue()
    finally:
        # The original wrote ``output.close`` without parentheses, so the
        # buffer was never actually closed.
        output.close()
def readText(self, path, outtype='text', opts=None):
    """Convert a PDF with ``HTMLConverter`` and print the result.

    The effective ``LAParams`` and the converter output are printed; nothing
    is returned.

    Args:
        path: Filesystem path of the PDF.
        outtype: Accepted for compatibility; the converter used is always
            ``HTMLConverter``.
        opts: Optional pdf2txt-style options, either an iterable of
            ``(flag, value)`` pairs or a mapping ``{flag: value}``.
            Honoured flags: ``-p`` (comma-separated 1-based page numbers),
            ``-m`` (max pages), ``-P`` (password), ``-c`` (codec), ``-n``
            (disable layout analysis), ``-A``, ``-V``, ``-M``, ``-L``,
            ``-W``, ``-F`` (layout parameters). Other flags are accepted and
            ignored.
    """
    pagenos = set()
    maxpages = 0
    # Defaults for values the original only bound when the matching flag was
    # present, which led to a NameError otherwise.
    password = ''
    codec = 'utf-8'
    laparams = LAParams()

    if opts is None:
        opts = ()
    elif hasattr(opts, 'items'):
        opts = opts.items()

    for (k, v) in opts:
        if k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-c':
            codec = v
        elif k == '-n':
            laparams = None
        elif laparams is not None:
            # Layout flags only make sense while layout analysis is enabled.
            if k == '-A':
                laparams.all_texts = True
            elif k == '-V':
                laparams.detect_vertical = True
            elif k == '-M':
                laparams.char_margin = float(v)
            elif k == '-L':
                laparams.line_margin = float(v)
            elif k == '-W':
                laparams.word_margin = float(v)
            elif k == '-F':
                laparams.boxes_flow = float(v)
    print(laparams)

    CMapDB.debug = self.debug
    PDFResourceManager.debug = self.debug
    PDFPageInterpreter.debug = self.debug
    PDFDevice.debug = self.debug

    rsrcmgr = PDFResourceManager()
    outfp = StringIO()
    device = HTMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
    try:
        # ``open`` replaces the Python-2-only ``file`` builtin.
        with open(path, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                        password=password, check_extractable=True)
    finally:
        device.close()
    print(outfp.getvalue())
    outfp.close()
    return
def convert_to_text_file(filename_in, filename_out, rewrite=False):
    """Convert a (BORME format) PDF file to a text file.

    Args:
        filename_in: Path of the PDF to read.
        filename_out: Path of the text file to write. When it is an existing
            directory, the output is placed inside it under the base name of
            ``filename_in``.
        rewrite: When false, an already existing output file is left
            untouched.

    Returns:
        ``False`` if the output already existed and ``rewrite`` is false,
        ``True`` after a conversion.
    """
    if os.path.isdir(filename_out):
        filename_out = os.path.join(filename_out,
                                    os.path.basename(filename_in))

    if os.path.exists(filename_out) and not rewrite:
        logging.info(
            'Skipping file %s already exists and rewriting is disabled!',
            filename_out)
        return False

    # conf
    codec = 'utf-8'
    imagewriter = None
    pagenos = set()
    maxpages = 0
    password = ''
    rotation = 0
    caching = True

    # <LAParams: char_margin=2.0, line_margin=0.5, word_margin=0.1 all_texts=False>
    laparams = LAParams()
    laparams.detect_vertical = True
    laparams.all_texts = False
    laparams.char_margin = 2.0
    laparams.line_margin = 0.5
    laparams.word_margin = 0.1

    rsrcmgr = PDFResourceManager(caching=caching)

    # The input is opened first: if it is missing, no empty output file is
    # left behind to be mistaken for a finished conversion on the next run.
    with open(filename_in, 'rb') as fp, open(filename_out, 'w') as outfp:
        device = TextConverter(rsrcmgr, outfp, codec=codec,
                               laparams=laparams, imagewriter=imagewriter)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # https://github.com/euske/pdfminer/issues/72
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        finally:
            device.close()
    return True
def to_text(self):
    """Render every page of ``self._doc`` as text.

    Layout analysis runs with vertical-text detection and ``all_texts``
    enabled and a word margin of 0.4.

    Returns:
        The buffer contents decoded as UTF-8, ignoring undecodable bytes.
    """
    resources = PDFResourceManager()
    buffer = StringIO()

    params = LAParams()
    params.detect_vertical = True
    params.all_texts = True
    params.word_margin = 0.4

    converter = TextConverter(resources, buffer, laparams=params)
    page_interpreter = PDFPageInterpreter(resources, converter)
    for pdf_page in self._doc.get_pages():
        page_interpreter.process_page(pdf_page)

    raw = buffer.getvalue()
    return raw.decode('utf-8', 'ignore')
def pdf2txt(pdfname, txtname):
    """Write the text of a PDF to a text file.

    Each non-empty text box/line is written stripped and followed by a single
    space, and echoed to stdout. ``LTFigure`` objects are only reported.
    Errors are printed, not raised.

    Args:
        pdfname: Path of the PDF to read.
        txtname: Path of the text file to write.

    Returns:
        ``True`` if at least one non-empty piece of text was written,
        ``False`` otherwise (including when an error occurred before any
        text was found).
    """
    btxt = False
    try:
        with open(pdfname, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            laparams.char_margin = 1.0
            laparams.word_margin = 1.0
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # Report the file name on the console.
            print("pdf2txt %s..." % pdfname)

            # Open the text file for output. The ``with`` blocks actually
            # close both files; the original only read the ``.closed``
            # attribute and never called ``close()``.
            with open(txtname, 'w') as fptxt:
                # Walk the document, processing each page.
                for page in doc.get_pages():
                    interpreter.process_page(page)
                    # Walk the page, processing each object.
                    for lt_obj in device.get_result():
                        if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                            stripped = lt_obj.get_text().strip()
                            # Emptiness is tested before the separator is
                            # appended; the original tested afterwards, so
                            # the check could never fail.
                            if stripped:
                                spagetxt = stripped + " "
                                btxt = True
                                fptxt.write(spagetxt)
                                print("Palabra", spagetxt)
                        elif isinstance(lt_obj, LTFigure):
                            print("LTFigure, pte implementar!")
            print("end")
    except Exception as e:
        print("Error: %s" % (e))
    return btxt
def initialize_pdf_interpreter():
    """Create a page interpreter wired to a layout-analysing aggregator.

    Returns:
        A tuple ``(interpreter, device)`` where ``device`` is a
        ``PDFPageAggregator`` configured with ``word_margin = 0.0`` and
        ``interpreter`` is a ``PDFPageInterpreter`` bound to it.
    """
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    laparams.word_margin = 0.0
    # Create the page aggregator and the interpreter that feeds it. The
    # plain PDFDevice/interpreter pair the original built first was
    # overwritten immediately and has been dropped.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return interpreter, device
def isSearchablePDF(self):
    """Tell whether named entities can be found in the text of the PDF.

    The text of ``self.filename`` is extracted with pdfminer, tabs are
    replaced by spaces, and carriage returns, newlines and non-breaking
    spaces are removed. The result is run through ``nlp``. Every entity
    whose text is not a newline and whose label is not one of the excluded
    labels is appended (followed by a comma) to ``self.listEntities``.

    Returns:
        ``True`` if at least one such entity was found, ``False`` otherwise.
    """
    print("PDF File")
    chunks = []
    with open(self.filename, 'rb') as fp:
        parser = PDFParser(fp)
        pdf_doc = PDFDocument()
        parser.set_document(pdf_doc)
        pdf_doc.set_parser(parser)
        pdf_doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in pdf_doc.get_pages():
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    chunks.append(lt_obj.get_text())

    finaltext = (
        ''.join(chunks)
        .replace("\t", " ")
        .replace("\r", "")
        .replace("\n", "")
        .replace("\u00a0", "")
    )

    excluded_labels = ('ORDINAL', 'CARDINAL', 'NORP', 'Non-‐binding')
    entityList_ = []
    # A separate name is used for the NLP result so the PDF document object
    # is not shadowed.
    for X in nlp(finaltext).ents:
        if X.text != ('\n') and X.label_ not in excluded_labels:
            self.listEntities.append(X.text + ",")
            entityList_.append((X.text, X.label_))
    return bool(entityList_)
def get_pdf(path):
    """Read a PDF and build one song record per page.

    On every page the first layout object is skipped. The first text
    box/line after it supplies the title (newlines removed, passed through
    ``validate_tittle_to_save``). A later text starting with ``'Page'``
    whose remainder (newlines removed) is all digits supplies the page
    number. Every other text is appended to the verses, followed by a
    newline.

    Args:
        path: Filesystem path of the PDF.

    Returns:
        A list of dicts with the keys ``"tittle"``, ``"nro_page"`` and
        ``"verses"``.
    """
    print("Begin: get_pdf")
    with open(path, 'rb') as handle:
        parser = PDFParser(handle)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        resources = PDFResourceManager()
        params = LAParams()
        params.char_margin = 1.0
        params.word_margin = 1.0
        aggregator = PDFPageAggregator(resources, laparams=params)
        interpreter = PDFPageInterpreter(resources, aggregator)

        songs = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            seen = 0
            verse_text = ''
            for element in aggregator.get_result():
                if seen == 0:
                    # The first layout object of a page is always skipped.
                    seen = 1
                    continue
                if not isinstance(element, (LTTextBox, LTTextLine)):
                    continue
                text = element.get_text()
                if seen == 1:
                    tittle = validate_tittle_to_save(text.replace('\n', ''))
                    seen = 2
                    continue
                page_suffix = text[4:].replace('\n', '')
                if text.startswith('Page') and page_suffix.isdigit():
                    nro_page = page_suffix
                else:
                    verse_text += text + '\n'
            # NOTE(review): ``tittle`` and ``nro_page`` are not reset per
            # page, so a page lacking them reuses the previous page's values
            # (or fails on the first page) -- confirm this is intended.
            song = {
                "tittle": tittle,
                "nro_page": nro_page,
                "verses": verse_text,
            }
            print(f"Read Page: {nro_page}")
            songs.append(song)
    print("Finish: get_pdf")
    return songs
def parsePDFFile(filePath, everyNPages=13):
    """Extract the text of a random sample of pages from a PDF.

    One remainder is drawn at random and every page whose 1-based number has
    that remainder modulo ``everyNPages`` is extracted.  In the extracted
    text, hyphen + newline pairs are removed and remaining newlines become
    spaces.

    Args:
        filePath: Path of the PDF file to read.
        everyNPages: Sampling period; ``1`` extracts every page.

    Returns:
        dict[int, str]: Extracted text keyed by 1-based page number.

    Raises:
        ValueError: If ``everyNPages`` is smaller than 1.
    """
    print('')
    print('About to parse file at path: {:s}'.format(filePath))
    if everyNPages < 1:
        raise ValueError('everyNPages must be at least 1, got {!r}'.format(everyNPages))
    with open(filePath, 'rb') as pdfFile:
        pdfParser = PDFParser(pdfFile)
        document = PDFDocument()
        pdfParser.set_document(document)
        document.set_parser(pdfParser)
        document.initialize('')
        pdfResourceManager = PDFResourceManager()
        laParams = LAParams()
        laParams.char_margin = 1.0
        laParams.word_margin = 1.0
        pdfPageAggregator = PDFPageAggregator(pdfResourceManager, laparams=laParams)
        pdfPageInterpreter = PDFPageInterpreter(pdfResourceManager, pdfPageAggregator)
        extracted_text = {}
        # randint(1, 0) would raise for a period of 1; every page number is
        # congruent to 0 modulo 1, so remainder 0 selects all pages.
        if everyNPages > 1:
            chosenStoppingPage = random.randint(1, everyNPages - 1)
        else:
            chosenStoppingPage = 0
        try:
            for pageNumber, page in enumerate(document.get_pages(), start=1):
                if pageNumber % everyNPages != chosenStoppingPage:
                    continue
                # Create the entry first so partial text survives a KeyError.
                extracted_text[pageNumber] = ''
                pdfPageInterpreter.process_page(page)
                for layoutObject in pdfPageAggregator.get_result():
                    if isinstance(layoutObject, LTTextContainer):
                        text = layoutObject.get_text()
                        text = text.replace('-\n', '')
                        text = text.replace('\n', ' ')
                        extracted_text[pageNumber] += text
        except KeyError:
            # Best effort: stop at the first KeyError and keep what was read.
            # NOTE(review): presumably raised by pdfminer on malformed page
            # dictionaries - confirm.
            pass
    print('URL parse complete')
    return extracted_text
def read_pdf_data(filename):
    """Collect the per-page ``group`` produced by the ``My`` device.

    Each page of ``filename`` is interpreted with a ``My`` device writing to
    ``sys.stdout``; after a page the device's ``group`` is stored and its
    ``word``, ``group`` and ``word_pos_info`` attributes are reset.

    Returns:
        list: One entry per page, in page order.
    """
    layout_params = LAParams()
    # laparams.line_margin = 0.005
    layout_params.word_margin = 0.05
    per_page_groups = []
    with open(filename, 'rb') as pdf_stream:
        resources = PDFResourceManager()
        device = My(resources, sys.stdout, laparams=layout_params)
        device.reset()
        interpreter = PDFPageInterpreter(resources, device)
        for page in PDFPage.get_pages(pdf_stream, set()):
            interpreter.process_page(page)
            per_page_groups.append(device.group)
            # Fresh accumulators for the next page.
            device.word = ""
            device.group = []
            device.word_pos_info = {}
    return per_page_groups
def pdf_woorden(filename):
    """Return the lower-cased words of a PDF with Dutch stop words removed.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        list[str]: Whitespace-separated words of the extracted text, in
        document order, excluding ``stopwords.words("dutch")``.
    """
    text_parts = []
    # Open the file in binary mode; the context manager closes it again.
    with open(filename, 'rb') as fp:
        # Standard pdfminer setup for reading the text on each page.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Iterate over every page of the PDF.
        for page in doc.get_pages():
            interpreter.process_page(page)
            # Iterate over the layout objects on the page; keep text objects.
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    text_parts.append(lt_obj.get_text())

    # Lower-case everything and split the character stream into words,
    # e.g. "dit zijn woorden" becomes ['dit', 'zijn', 'woorden'].
    uit_elkaar = ''.join(text_parts).lower().split()
    # A set gives constant-time membership tests for the filter below.
    stop_words = set(stopwords.words("dutch"))
    return [word for word in uit_elkaar if word not in stop_words]
def output_pdf_to_table(self, path, config):
    """Run layout analysis on a PDF and pass each page layout to ``getRows``.

    Layout margins come from the module-level ``line_margin_threshold`` and
    ``word_margin_threshold``; the number of pages read is capped by the
    module-level ``pages_to_view``.

    Args:
        path: Path of the PDF file to read.
        config: Passed through unchanged to ``self.getRows``.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    laparams.line_margin = line_margin_threshold
    laparams.word_margin = word_margin_threshold
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # The context manager closes the PDF even if a page fails to process.
    with open(path, "rb") as fp:
        pages = PDFPage.get_pages(
            fp,
            set(),
            maxpages=pages_to_view,
            password="",
            caching=True,
            check_extractable=False,
        )
        for page in pages:
            interpreter.process_page(page)
            self.getRows(device.get_result(), config)
def extract_pdf():
    """Extract the text of ``timetable.pdf`` as one comma-delimited string.

    Returns:
        str: The text of every ``LTTextBox`` / ``LTTextLine`` layout object,
        each followed by a ``","``, concatenated in page order.
    """
    pieces = []
    # The context manager closes the PDF even when parsing raises.
    with open("timetable.pdf", 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        parser.set_document(doc)
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    pieces.append(lt_obj.get_text() + ",")
    return ''.join(pieces)
def extract_text(self):
    """Convert the PDF at ``self.path`` to text.

    The full text is stored in ``self._text`` and its newline-split lines in
    ``self.rows``.
    """
    def _char_from_code(font, cid):
        # Substitute for undefined characters: use the raw code point.
        return chr(cid)

    layout = LAParams()
    layout.char_margin = 1000
    layout.word_margin = 0.01
    layout.line_margin = 0.01

    manager = PDFResourceManager()
    buffer = io.StringIO()
    converter = TextConverter(manager, buffer, laparams=layout)
    page_interpreter = PDFPageInterpreter(manager, converter)
    page_interpreter.device.handle_undefined_char = _char_from_code

    with open(self.path, 'rb') as pdf_stream:
        pages = PDFPage.get_pages(pdf_stream, caching=True, check_extractable=True)
        for page in pages:
            page_interpreter.process_page(page)

    self._text = buffer.getvalue()
    self.rows = self._text.split('\n')
    converter.close()
    buffer.close()
def read_file(self):
    """Read the PDF at ``self.path`` and store its text in ``self.content``.

    The text of every ``LTTextBox`` / ``LTTextLine`` layout object is
    collected in page order and joined with single spaces.
    """
    chunks = []
    with open(self.path, 'rb') as stream:
        pdf_parser = PDFParser(stream)
        document = PDFDocument()
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')

        layout_params = LAParams()
        layout_params.char_margin = 0.1
        layout_params.word_margin = 1.0

        resources = PDFResourceManager()
        aggregator = PDFPageAggregator(resources, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(resources, aggregator)

        for page in document.get_pages():
            page_interpreter.process_page(page)
            chunks.extend(
                item.get_text()
                for item in aggregator.get_result()
                if isinstance(item, (LTTextBox, LTTextLine))
            )
    self.content = ' '.join(chunks)
def initialize_pdf_miner(fh):
    """Build the pdfminer objects needed to run layout analysis on ``fh``.

    Args:
        fh: Open file object of the PDF to parse.

    Returns:
        tuple: ``(doc, interpreter, device)`` - the ``PDFDocument``, a
        ``PDFPageInterpreter`` bound to ``device``, and the
        ``PDFPageAggregator`` that collects each page's layout.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fh)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument(parser)
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    laparams.word_margin = 0.0
    # Create a PDF page aggregator object and the interpreter that feeds it.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
def main(argv):
    """Command-line entry point: run ``CourseRegisterParser`` over PDF files.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name.

    Returns:
        ``100`` after printing the usage text (bad option or no input file),
        otherwise ``None``.
    """
    import getopt

    def usage():
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
              '[-n] [-A] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] '
              '[-c codec] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAD:M:L:W:c:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    codec = 'utf-8'
    # Layout flags are collected first and applied afterwards, so "-n"
    # combined with e.g. "-A" never touches attributes of None.
    layout_enabled = True
    layout_settings = {}
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-n':
            layout_enabled = False
        elif k == '-A':
            layout_settings['all_texts'] = True
        elif k == '-D':
            layout_settings['writing_mode'] = v
        elif k == '-M':
            layout_settings['char_margin'] = float(v)
        elif k == '-L':
            layout_settings['line_margin'] = float(v)
        elif k == '-W':
            layout_settings['word_margin'] = float(v)
        elif k == '-c':
            codec = v
    laparams = None
    if layout_enabled:
        laparams = LAParams()
        for attr, value in layout_settings.items():
            setattr(laparams, attr, value)
    #
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager()
    # open() exists on both Python 2 and 3, unlike the file() builtin.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        device = CourseRegisterParser(rsrcmgr, outfp, codec=codec, laparams=laparams)
        for fname in args:
            with open(fname, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password)
        device.close()
    finally:
        # Only close what this function opened; stdout belongs to the caller.
        if outfp is not sys.stdout:
            outfp.close()
    return
def convert_pdf(path, outtype='txt', opts=None):
    """Convert the PDF at ``path`` to text, XML, HTML or tag output.

    The output file defaults to ``path`` with its last three characters
    replaced by ``outtype``; an output file name ending in ``.htm``/``.html``,
    ``.xml`` or ``.tag`` overrides ``outtype``.

    Args:
        path: Path of the PDF file to convert.
        outtype: One of ``'txt'``, ``'xml'``, ``'html'`` or ``'tag'``.
        opts: Optional pdf2txt-style options, either a mapping such as
            ``{'-m': '3'}`` or an iterable of ``(flag, value)`` pairs.

    Raises:
        ValueError: If the resolved output type is not supported.
    """
    outfile = path[:-3] + outtype
    outdir = '/'.join(path.split('/')[:-1])
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    codec = 'utf-8'
    scale = 1
    # Iterating a dict yields only its keys, so expand mappings into pairs.
    if opts is None:
        option_pairs = ()
    elif hasattr(opts, 'items'):
        option_pairs = opts.items()
    else:
        option_pairs = opts
    # Layout flags are applied after the loop so "-n" combined with another
    # layout flag never touches attributes of None.
    layout_enabled = True
    layout_settings = {}
    for (k, v) in option_pairs:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-n':
            layout_enabled = False
        elif k == '-A':
            layout_settings['all_texts'] = True
        elif k == '-D':
            layout_settings['writing_mode'] = v
        elif k == '-M':
            layout_settings['char_margin'] = float(v)
        elif k == '-L':
            layout_settings['line_margin'] = float(v)
        elif k == '-W':
            layout_settings['word_margin'] = float(v)
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    laparams = None
    if layout_enabled:
        laparams = LAParams()
        for attr, value in layout_settings.items():
            setattr(laparams, attr, value)
    #
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager()
    if not outtype:
        outtype = 'txt'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outtype not in ('txt', 'xml', 'html', 'tag'):
        # Checked before any file is opened so nothing is left dangling.
        raise ValueError('unsupported output type: %r' % (outtype,))
    # open() exists on both Python 2 and 3, unlike the file() builtin.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'txt':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   laparams=laparams, outdir=outdir)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        with open(path, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password)
        device.close()
    finally:
        # Only close what this function opened; stdout belongs to the caller.
        if outfp is not sys.stdout:
            outfp.close()
    return
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name.

    Returns:
        ``100`` after printing the usage text (bad option, no input file or
        unknown output type), otherwise ``None``.
    """
    import getopt

    def usage():
        print(
            "usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]"
            " [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]"
            " [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]"
            " [-t text|html|xml|tag] [-c codec] [-s scale]"
            " file ..." % argv[0]
        )
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], "dp:m:P:o:CnAVM:L:W:F:Y:O:R:St:c:s:")
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # input option
    password = b""
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = "normal"
    codec = "utf-8"
    scale = 1
    caching = True
    # Layout flags are collected first and applied afterwards, so "-n"
    # combined with e.g. "-A" never touches attributes of None.
    layout_enabled = True
    layout_settings = {}
    for (k, v) in opts:
        if k == "-d":
            logging.getLogger().setLevel(logging.DEBUG)
        elif k == "-p":
            pagenos.update(int(x) - 1 for x in v.split(","))
        elif k == "-m":
            maxpages = int(v)
        elif k == "-P":
            password = v
        elif k == "-o":
            outfile = v
        elif k == "-C":
            caching = False
        elif k == "-n":
            layout_enabled = False
        elif k == "-A":
            layout_settings["all_texts"] = True
        elif k == "-V":
            layout_settings["detect_vertical"] = True
        elif k == "-M":
            layout_settings["char_margin"] = float(v)
        elif k == "-L":
            layout_settings["line_margin"] = float(v)
        elif k == "-W":
            layout_settings["word_margin"] = float(v)
        elif k == "-F":
            layout_settings["boxes_flow"] = float(v)
        elif k == "-Y":
            layoutmode = v
        elif k == "-O":
            imagewriter = ImageWriter(v)
        elif k == "-R":
            rotation = int(v)
        elif k == "-S":
            stripcontrol = True
        elif k == "-t":
            outtype = v
        elif k == "-c":
            codec = v
        elif k == "-s":
            scale = float(v)
    laparams = None
    if layout_enabled:
        laparams = LAParams()
        for attr, value in layout_settings.items():
            setattr(laparams, attr, value)
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = "text"
        if outfile:
            if outfile.endswith(".htm") or outfile.endswith(".html"):
                outtype = "html"
            elif outfile.endswith(".xml"):
                outtype = "xml"
            elif outfile.endswith(".tag"):
                outtype = "tag"
    if outtype not in ("text", "xml", "html", "tag"):
        # Checked before the output file is opened so nothing is left open.
        return usage()
    if outfile:
        outfp = open(outfile, "wb")
    else:
        outfp = sys.stdout
        # A stream with its own encoding handles encoding itself.
        if outfp.encoding is not None:
            codec = None
    try:
        if outtype == "text":
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == "xml":
            device = XMLConverter(
                rsrcmgr, outfp, codec=codec, laparams=laparams,
                imagewriter=imagewriter, stripcontrol=stripcontrol
            )
        elif outtype == "html":
            device = HTMLConverter(
                rsrcmgr, outfp, codec=codec, scale=scale, layoutmode=layoutmode,
                laparams=laparams, imagewriter=imagewriter
            )
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        for fname in args:
            with open(fname, "rb") as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(
                    fp, pagenos, maxpages=maxpages, password=password,
                    caching=caching, check_extractable=True
                ):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        device.close()
    finally:
        # Only close what this function opened; stdout belongs to the caller.
        if outfp is not sys.stdout:
            outfp.close()
    return
def readPDF2HTML(pdfFile, opts={}): # open a PDF file fp = StringIO(pdfFile.read()) retstr = StringIO() # create a PDF parser object associated with the file object parser = PDFParser(fp) # create a PDF document allows text extraction document = PDFDocument(parser) # password if needed # check if document allows text extraction without password if not document.is_extractable: raise PDFTextExtractionNotAllowed # create a PDF resource manager object that sotres shared resources rsrcmgr = PDFResourceManager() # create a PDF device object laparams = LAParams() for (k, v) in opts: if k == '-d': debug += 1 elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(',')) elif k == '-m': maxpages = int(v) elif k == '-P': password = v elif k == '-o': outfile = v elif k == '-n': laparams = None elif k == '-A': laparams.all_texts = True elif k == '-V': laparams.detect_vertical = True elif k == '-M': laparams.char_margin = float(v) elif k == '-L': laparams.line_margin = float(v) elif k == '-W': laparams.word_margin = float(v) elif k == '-F': laparams.boxes_flow = float(v) elif k == '-Y': layoutmode = v elif k == '-O': outdir = v elif k == '-t': outtype = v elif k == '-c': codec = v elif k == '-s': scale = float(v) codec = 'utf-8' device = HTMLConverter(rsrcmgr, retstr, codec=codec, laparams=laparams) # create a PDF interpreter object interpreter = PDFPageInterpreter(rsrcmgr, device) pagenos = set() # process each page contained in the document for page in PDFPage.get_pages(fp, pagenos): interpreter.process_page(page) # close streams and return text content fp.close() content = retstr.getvalue() device.close() retstr.close() return content
def main(argv=None):
    """Command-line entry point: convert PDF input to text, HTML, XML or tags.

    Args:
        argv: Argument list without the program name; ``None`` makes argparse
            read ``sys.argv``.  Without file arguments the PDF is read from
            standard input.
    """
    parser = argparse.ArgumentParser(description='Convert PDF into text.')
    # Default None (argparse then yields an empty list); the stdin fallback
    # is applied after parsing so the result is always a list of files.
    parser.add_argument('file', nargs='*', type=argparse.FileType('rb'), default=None,
                        help='file(s) to convert')
    parser.add_argument('-C', '--nocache', dest='cache', action='store_false',
                        help='prevent object caching (slower)')
    parser.add_argument('-l', metavar='level', default='warn',
                        help='logging level (warn, info, debug)')
    parser.add_argument('-p', metavar='page', nargs='+', default=[], type=int,
                        help='page number(s) (space separated)')
    parser.add_argument('-m', metavar='maxpages', default=0, type=int,
                        help='maximum number of pages to extract')
    parser.add_argument('-P', metavar='password', default='', help='pdf password')
    parser.add_argument('-o', metavar='outfile', type=argparse.FileType('w'),
                        default=sys.stdout,
                        help='output file name (default: stdout)')
    parser.add_argument('-O', metavar='directory', type=ImageWriter,
                        help='extract images and save to directory')
    parser.add_argument('-t', metavar='outtype',
                        help='output type (text, html, xml, tag)')
    parser.add_argument('-c', metavar='codec', default='utf-8',
                        help='output text encoding (default: %(default)s)')
    lagroup = parser.add_argument_group(title='layout analysis')
    lagroup.add_argument('-n', action='store_true', help='disable layout analysis')
    lagroup.add_argument('-A', action='store_true',
                         help='force layout analysis on all text')
    lagroup.add_argument('-V', action='store_true', help='detect vertical text')
    lagroup.add_argument('-M', metavar='char_margin', type=float,
                         help='custom character margin')
    lagroup.add_argument('-L', metavar='line_margin', type=float,
                         help='custom line margin')
    lagroup.add_argument('-W', metavar='word_margin', type=float,
                         help='custom word margin')
    lagroup.add_argument('-F', metavar='boxes_flow', type=float,
                         help='custom boxes flow')
    lagroup.add_argument('-Y', metavar='layout_mode', default='normal',
                         help='layout mode for HTML (normal, exact, loose)')
    lagroup.add_argument('-s', metavar='scale', default=1, type=float,
                         help='output scaling for HTML')
    args = parser.parse_args(argv)

    logging.basicConfig()
    logging.getLogger('pdfminer').setLevel(args.l.upper())

    if args.n:
        laparams = None
    else:
        laparams = LAParams()
        laparams.all_texts = args.A
        laparams.detect_vertical = args.V
        # Compare against None so an explicit 0.0 is honoured too.
        if args.M is not None:
            laparams.char_margin = args.M
        if args.L is not None:
            laparams.line_margin = args.L
        if args.W is not None:
            laparams.word_margin = args.W
        if args.F is not None:
            laparams.boxes_flow = args.F

    rsrcmgr = PDFResourceManager(caching=args.cache)
    outtype = args.t
    if not outtype:
        if args.o:
            if args.o.name.endswith('.htm') or args.o.name.endswith('.html'):
                outtype = 'html'
            elif args.o.name.endswith('.xml'):
                outtype = 'xml'
            elif args.o.name.endswith('.tag'):
                outtype = 'tag'
    if outtype == 'xml':
        device = XMLConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams,
                              imagewriter=args.O)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, args.o, codec=args.c, scale=args.s,
                               layoutmode=args.Y, laparams=laparams,
                               imagewriter=args.O)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, args.o, codec=args.c)
    else:
        device = TextConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams,
                               imagewriter=args.O)

    # PDF data is binary: prefer the byte-oriented stdin buffer where it
    # exists (Python 3), otherwise fall back to sys.stdin itself.
    stdin_stream = getattr(sys.stdin, 'buffer', sys.stdin)
    input_files = args.file or [stdin_stream]
    for fp in input_files:
        process_pdf(rsrcmgr, device, fp, [i - 1 for i in args.p], maxpages=args.m,
                    password=args.P, caching=args.cache, check_extractable=True)
        if fp is not stdin_stream:
            fp.close()
    device.close()
    if args.o is not sys.stdout:
        args.o.close()
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name.

    Returns:
        ``100`` after printing the usage text (bad option, no input file or
        unknown output type), otherwise ``None``.
    """
    # getopt retrieves the options supplied on the command line; for details
    # on the module see http://www.16kan.com/post/207647.html
    import getopt

    def usage():
        # Prints an example invocation; called when arguments are missing or
        # the command line is malformed.
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
              '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
              '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100

    try:
        # getopt.getopt(argument list, "short options", [long options]).
        # A colon after a short option means it requires a value; p, m, P, o,
        # M, L, W, F, Y, O, t, c and s all take one.  An equals sign after a
        # long option name means the same.  Returns opts and args.
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''          # option -P
    pagenos = set()        # option -p
    maxpages = 0           # option -m
    # output option
    outfile = None         # option -o, output file
    outtype = None         # option -t, output type
    outdir = None          # option -O, output directory
    layoutmode = 'normal'  # option -Y
    codec = 'utf-8'        # option -c
    scale = 1              # option -s
    caching = True
    # Layout flags are collected first and applied afterwards, so "-n"
    # combined with e.g. "-A" never touches attributes of None.
    layout_enabled = True
    layout_settings = {}
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            layout_enabled = False
        elif k == '-A':
            layout_settings['all_texts'] = True
        elif k == '-V':
            layout_settings['detect_vertical'] = True
        elif k == '-M':
            layout_settings['char_margin'] = float(v)
        elif k == '-L':
            layout_settings['line_margin'] = float(v)
        elif k == '-W':
            layout_settings['word_margin'] = float(v)
        elif k == '-F':
            layout_settings['boxes_flow'] = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    laparams = None
    if layout_enabled:
        laparams = LAParams()
        for attr, value in layout_settings.items():
            setattr(laparams, attr, value)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # Determine the output format.
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outtype not in ('text', 'xml', 'html', 'tag'):
        # Checked before the output file is opened so nothing is left open.
        return usage()
    # open() exists on both Python 2 and 3, unlike the file() builtin.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            # TextConverter apparently does not accept an outdir argument.
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams, outdir=outdir)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        for fname in args:
            with open(fname, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=password, caching=caching, check_extractable=True)
        device.close()
    finally:
        # Only close what this function opened; stdout belongs to the caller.
        if outfp is not sys.stdout:
            outfp.close()
    return
import StringIO as StringIO
import xlwt
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams

# Layout-analysis parameters used for the text conversion below.
laparams = LAParams()
laparams.word_margin = float(1.0)
laparams.char_margin = float(2.0)
#laparams.line_margin = float(0.55)
#laparams.boxes_flow = float(0.7)
#laparams.detect_vertical = True
#laparams.all_texts = True
caching = True

# Raw string: the backslashes of the Windows path must stay literal ("\U"
# would otherwise start a unicode escape on Python 3).
fp = open(r'C:\Users\daniel.betteridge\Downloads\Aeopi 9 Sep to 23 Sept page set up.pdf', 'rb')
#outfp = open('C:\Users\daniel.betteridge\Documents\pdfextract\Aeopi.csv', 'wb')
rsrc = PDFResourceManager()
restr = StringIO.StringIO()
device = TextConverter(rsrc, restr, laparams=laparams)  #replace restr with outfp for file output
interpreter = PDFPageInterpreter(rsrc, device)
book = xlwt.Workbook(encoding="utf-8")

# Every third page is given 8 columns, all other pages 15.
for pageNumber, page in enumerate(PDFPage.get_pages(fp, [1200], password=None, caching=caching, check_extractable=True)):
    if (pageNumber+1)%3 == 0:
        numcolumns = 8
    else:
        numcolumns = 15
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name.

    Returns:
        ``100`` after printing the usage text (bad option, no input file or
        unknown output type), otherwise ``None``.
    """
    import getopt

    def usage():
        print(
            "usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] "
            "[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] "
            "[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ..." % argv[0]
        )
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], "dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:")
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ""
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = "normal"
    codec = "utf-8"
    scale = 1
    caching = True
    # Layout flags are collected first and applied afterwards, so "-n"
    # combined with e.g. "-A" never touches attributes of None.
    layout_enabled = True
    layout_settings = {}
    for (k, v) in opts:
        if k == "-d":
            debug += 1
        elif k == "-p":
            pagenos.update(int(x) - 1 for x in v.split(","))
        elif k == "-m":
            maxpages = int(v)
        elif k == "-P":
            password = v
        elif k == "-o":
            outfile = v
        elif k == "-C":
            caching = False
        elif k == "-n":
            layout_enabled = False
        elif k == "-A":
            layout_settings["all_texts"] = True
        elif k == "-V":
            layout_settings["detect_vertical"] = True
        elif k == "-M":
            layout_settings["char_margin"] = float(v)
        elif k == "-L":
            layout_settings["line_margin"] = float(v)
        elif k == "-W":
            layout_settings["word_margin"] = float(v)
        elif k == "-F":
            layout_settings["boxes_flow"] = float(v)
        elif k == "-Y":
            layoutmode = v
        elif k == "-O":
            outdir = v
        elif k == "-t":
            outtype = v
        elif k == "-c":
            codec = v
        elif k == "-s":
            scale = float(v)
    laparams = None
    if layout_enabled:
        laparams = LAParams()
        for attr, value in layout_settings.items():
            setattr(laparams, attr, value)
    #
    # PDFDocument.debug = debug
    # PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = "text"
        if outfile:
            if outfile.endswith(".htm") or outfile.endswith(".html"):
                outtype = "html"
            elif outfile.endswith(".xml"):
                outtype = "xml"
            elif outfile.endswith(".tag"):
                outtype = "tag"
    if outtype not in ("text", "xml", "html", "tag"):
        # Checked before the output file is opened so nothing is left open.
        return usage()
    # open() exists on both Python 2 and 3, unlike the file() builtin.
    outfp = open(outfile, "w") if outfile else sys.stdout
    try:
        if outtype == "text":
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == "xml":
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == "html":
            device = HTMLConverter(
                rsrcmgr, outfp, codec=codec, scale=scale, layoutmode=layoutmode,
                laparams=laparams, outdir=outdir
            )
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        for fname in args:
            with open(fname, "rb") as fp:
                process_pdf(
                    rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
                    caching=caching, check_extractable=True
                )
        device.close()
    finally:
        # Only close what this function opened; stdout belongs to the caller.
        if outfp is not sys.stdout:
            outfp.close()
    return
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name.

    Returns:
        ``100`` after printing the usage text (bad option, no input file or
        unknown output type), otherwise ``None``.
    """
    def usage():
        print(('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0]))
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    debug = False
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    # Layout flags are collected first and applied afterwards, so "-n"
    # combined with e.g. "-A" never touches attributes of None.
    layout_enabled = True
    layout_settings = {}
    for (k, v) in opts:
        if k == '-d':
            debug = True
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            layout_enabled = False
        elif k == '-A':
            layout_settings['all_texts'] = True
        elif k == '-V':
            layout_settings['detect_vertical'] = True
        elif k == '-M':
            layout_settings['char_margin'] = float(v)
        elif k == '-L':
            layout_settings['line_margin'] = float(v)
        elif k == '-W':
            layout_settings['word_margin'] = float(v)
        elif k == '-F':
            layout_settings['boxes_flow'] = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    laparams = None
    if layout_enabled:
        laparams = LAParams()
        for attr, value in layout_settings.items():
            setattr(laparams, attr, value)

    if debug:
        set_debug_logging()
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outtype not in ('text', 'xml', 'html', 'tag'):
        # Checked before the output file is opened so nothing is left open.
        return usage()
    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale, layoutmode=layoutmode,
                                   laparams=laparams, outdir=outdir, debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp)
        for fname in args:
            # The context manager closes each input even if conversion fails.
            with io.open(fname, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=password, caching=caching, check_extractable=True)
        device.close()
    finally:
        if close_outfp:
            outfp.close()
# Create a PDF parser object associated with the file object.
#parser = PDFParser(open_file)
# Create a PDF document object that stores the document structure.
#doc = PDFDocument(parser)
# Connect the parser and document objects.
#print parser.nextline()
#print parser.nextline()
#print parser.nextline()

##ATTEMPT 2
#Code from pdf2txt.py
laparams = LAParams()
laparams.char_margin = 2.0
laparams.line_margin=0.5
laparams.word_margin=0.1
laparams.all_texts=False
rsrcmgr = PDFResourceManager()
device = TextConverter(rsrcmgr, fp_out, codec='utf-8', laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
pdf_pages = PDFPage.get_pages(fp_in, set())
pagenum = 0
pagelim = 3
# Only the first `pagelim` pages are transcribed; later pages are still
# counted in `pagenum` but skipped.
for page in pdf_pages:
    pagenum += 1
    if pagenum > pagelim:
        continue
    # Single-argument call form prints identically on Python 2 and Python 3.
    print("Transcribing page " + str(pagenum) + " from PDF to text")
    interpreter.process_page(page)
fp_in.close()
def main(argv):
    """Command-line entry point of the argparse-based pdf2txt variant.

    ``argv`` is a full argument vector (``argv[0]`` is the program name).
    When ``argv`` is empty or None, argparse falls back to ``sys.argv``.
    The parsed namespace is published as the module-level ``options`` and
    the debug level as the module-level ``debuglevel``.

    Every file listed in ``pdffile`` is rendered through one converter
    chosen by ``-t`` (default ``shape``) into ``-o`` or standard output.
    Returns None; argparse itself exits on invalid arguments.
    """
    global debuglevel
    global options

    # input option
    pagenos = set()
    # output option
    imagewriter = None
    using_optparse = False
    parser = ArgumentParser(prog='pdf2txt.py',
                            description='Convert pdf to txt',
                            formatter_class=ArgumentDefaultsHelpFormatter)

    parser.add_argument('-d', dest='debuglevel', action='count', default=0,
                        help='Debug (repeat for more verbose debugging)')
    parser.add_argument('-p', '--pages', dest='pagenos', action='store', type=str, default='',
                        help='Specifies the comma-separated list of the page numbers to be extracted. Page numbers start at one. By default, it extracts text from all the pages.')
    parser.add_argument('-c', '--codec', dest='codec', action='store', type=str, default='utf-8',
                        help='Specifies the output codec.')
    parser.add_argument('-t', '--type', dest='outtype', action='store', type=str, default='shape',
                        choices=['text', 'html', 'xml', 'tag', 'shape'],
                        help='Specifies the output format, one of: shape, text, html, xml, tag')
    parser.add_argument('-m', dest='maxpages', action='store', type=int, default=0,
                        help='Specifies the maximum number of pages to extract. By default (0), it extracts all the pages in a document.')
    parser.add_argument('-P', '--password', dest='password', action='store', type=str, default='',
                        help='Provides the user password to access PDF contents.')
    parser.add_argument('-o', '--output', dest='outfile', action='store', type=str, default=None,
                        help='Specifies the output file name. By default, it prints the extracted contents to stdout in text format.')
    parser.add_argument('-C', '--no-caching', dest='caching', action='store_false', default=True,
                        help='Suppress object caching. This will reduce the memory consumption but also slows down the process.')
    parser.add_argument('-n', '--no-layout', dest='layout', action='store_false', default=True,
                        help='Suppress layout analysis.')
    parser.add_argument('--show-pageno', dest='show_pageno', action='store_true', default=False,
                        help='Show page numbers.')
    parser.add_argument('-A', '--analyze-all', dest='all_texts', action='store_true', default=False,
                        help='Forces to perform layout analysis for all the text strings, including text contained in figures.')
    parser.add_argument('-V', '--detect-vertical', dest='detect_vertical', action='store_true', default=False,
                        help='Allows vertical writing detection.')
    parser.add_argument('-M', dest='char_margin', action='store', type=float, default=2.0,
                        help='Two text chunks whose distance is closer than the char_margin (shown as M) is considered continuous and get grouped into one.')
    parser.add_argument('-L', dest='line_margin', action='store', type=float, default=0.5,
                        help='Two lines whose distance is closer than the line_margin (L) is grouped as a text box, which is a rectangular area that contains a "cluster" of text portions.')
    parser.add_argument('-W', dest='word_margin', action='store', type=float, default=0.1,
                        help='It may be required to insert blank characters (spaces) as necessary if the distance between two words is greater than the word_margin (W), as a blank between words might not be represented as a space, but indicated by the positioning of each word.')
    parser.add_argument('-F', dest='boxes_flow', action='store', type=float, default=0.5,
                        help='Specifies how much a horizontal and vertical position of a text matters when determining a text order. The value should be within the range of -1.0 (only horizontal position matters) to +1.0 (only vertical position matters).')
    parser.add_argument('-Y', '--layout-mode', dest='layoutmode', action='store', type=str, default='normal',
                        choices=['exact', 'normal', 'loose'],
                        help='Specifies how the page layout should be preserved. (Currently only applies to HTML format.) One of: exact, normal, loose.')
    parser.add_argument('-O', '--image-writer', dest='imagewriter', action='store', type=str, default=None,
                        help='imagewriter')
    parser.add_argument('-R', '--rotation', dest='rotation', action='store', type=int, default=0,
                        help='rotation')
    parser.add_argument('-S', '--strip-control', dest='stripcontrol', action='store_true', default=False,
                        help='stripcontrol')
    parser.add_argument('-s', dest='scale', action='store', type=float, default=1,
                        help='Specifies the output scale. Can be used in HTML format only.')
    parser.add_argument('--draw-lines', dest='draw_lines', action='store_true',
                        help="Draw crude page representation, coloured TextLines (= short pieces of text). Valid only for the `shape' output.")
    parser.add_argument('--draw-boxes', dest='draw_boxes', action='store_true',
                        help="Draw crude page representation, coloured TextBoxes (= grouped text lines). Valid only for the `shape' output.")
    parser.add_argument('--draw-blocks', dest='draw_blocks', action='store_true',
                        help="Draw crude page representation, coloured TextBlocks (= grouped TextBoxes). Valid only for the `shape' output.")
    parser.add_argument('--shear-limit', dest='shear_limit', action='store', default=0.1, type=float,
                        help="If the text is sheared above this limit, reject it. Valid only for the `shape' output.")
    parser.add_argument('--rotation-limit', dest='rotation_limit', action='store', default=2, type=float,
                        help="If the text is rotated above this angle (in degrees), reject it. Valid only for the `shape' output.")
    parser.add_argument('--line-height-diff', dest='line_height_diff', action='store', type=float, default=0.1,
                        help='Two lines whose vertical sizes differ more than this ratio are not to be considered of the same paragraph (but e.g. one of them is a heading).')
    parser.add_argument('--heading-before', dest='heading_before', action='store', type=str, default='',
                        help='String to put before each heading, e.g. <h1>')
    parser.add_argument('--heading-after', dest='heading_after', action='store', type=str, default='',
                        help='String to put after each heading, e.g. </h1>')
    parser.add_argument('--box-separator', dest='box_separator', action='store', type=str, default=r'\n\n',
                        help=r'Separate boxes with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument('--block-separator', dest='block_separator', action='store', type=str, default=r'\n\n',
                        help=r'Separate blocks with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument('--indent-separator', dest='indent_separator', action='store', type=str, default=r'\n\n',
                        help=r'Separate indented lines with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument('--indent-string', dest='indent_string', action='store', type=str, default=r'\t',
                        help=r'Put this string in front of indented lines. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument('--indent-limit', dest='indent_limit', action='store', type=float, default=3,
                        help='If the line is indented more then this (approximately characters), it will separated by --indent-separator from the previous one.')
    parser.add_argument('--page-separator', dest='page_separator', action='store', type=str, default=r'\n\n',
                        help=r'Separate pages with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument('--norm-whitespace', dest='norm_whitespace', action='store_true', default=False,
                        help='Normalize whitespace (remove duplicate spaces, replace end of lines with spaces).')
    parser.add_argument('--print-stats', dest='print_stats', action='store_true', default=False,
                        help='Instead of the text, output some simple statistics about the file.')
    parser.add_argument('--max-blocks', dest='max_blocks', action='store', default=0, type=int,
                        help='If there is more than this blocks per page, do not return any text. Use to discriminate abnormal files (run --print-stats first to find out the number of boxes per "normal" file). 0 means no limit. 50 is maybe a good value.')
    parser.add_argument('--max-textlines', dest='max_textlines', action='store', default=0, type=int,
                        help='If there is more than this textlines per any block, do not return any text. Use to discriminate abnormal files (run --print-stats first to find out the number of boxes per "normal" page). 0 means no limit. 18 is maybe a good value.')
    parser.add_argument('--line-height-method', dest='line_height_method', action='store', type=str, default='bbox',
                        choices=['bbox', 'mean', 'median'],
                        help='Method to calculate height of line (relevant if there are characters with uneven height). bbox takes the bounding box (rectangle encompassing the line), mean the arithmetic mean of the height of all the characters, median is the median of the height of all the characters. Use mean or median if there are outlier characters, e.g. one big character at the beginning of line.')
    parser.add_argument(dest='pdffile', help='List of PDF files to go through', default=None, nargs='+')

    # Honour the argv that was passed in; the original ignored it and always
    # parsed sys.argv. A falsy argv keeps the old sys.argv fallback.
    args, rest = parser.parse_known_args(argv[1:] if argv else None)

    debuglevel = debug = args.debuglevel

    DEBUG(3, 'args:', str(args))
    DEBUG(3, 'rest:', str(rest))
    DEBUG(3, 'optparse:', using_optparse)

    if args.pagenos:
        # Page numbers are 1-based on the command line, 0-based internally.
        pagenos.update(int(x) - 1 for x in args.pagenos.split(','))
    maxpages = args.maxpages
    outfile = args.outfile
    password = args.password
    caching = args.caching
    showpageno = args.show_pageno
    if args.layout:
        laparams = LAParams()
        if args.all_texts:
            laparams.all_texts = True
        if args.detect_vertical:
            laparams.detect_vertical = True
        laparams.char_margin = args.char_margin
        laparams.line_margin = args.line_margin
        laparams.word_margin = args.word_margin
        laparams.boxes_flow = args.boxes_flow
    else:
        # -n: layout analysis suppressed.
        laparams = None
    layoutmode = args.layoutmode

    if args.imagewriter:
        imagewriter = ImageWriter(args.imagewriter)

    rotation = args.rotation
    stripcontrol = args.stripcontrol
    outtype = args.outtype
    codec = args.codec
    scale = args.scale

    # Separator options arrive with literal backslash escapes.
    args.box_separator = unescape_string(args.box_separator)
    args.block_separator = unescape_string(args.block_separator)
    args.indent_separator = unescape_string(args.indent_separator)
    args.indent_string = unescape_string(args.indent_string)
    args.page_separator = unescape_string(args.page_separator)

    options = args

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = open(outfile, 'w')
        DEBUG(2, 'output goes to', outfile)
    else:
        outfp = sys.stdout
        DEBUG(2, 'output goes to stdout')
    try:
        if outtype == 'shape':
            device = ShapeTextConverter(rsrcmgr, outfp, codec=codec,
                                        laparams=laparams,
                                        showpageno=showpageno,
                                        imagewriter=imagewriter)
        elif outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams, imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            # The original called an undefined usage() here.
            parser.print_usage()
            return 100
        try:
            for fname in options.pdffile:
                DEBUG(2, 'processing', fname)
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Never close sys.stdout; only close what this function opened.
        if outfile:
            outfp.close()
    DEBUG(2, 'finished.')
    return
def convert_pdf_To_Txt(path, opts=None):
    """Extract the text of the PDF at ``path`` and return it as a string.

    ``opts`` carries pdf2txt-style options, either as a mapping
    (``{'-p': '1,2'}``) or as an iterable of ``(flag, value)`` pairs as
    produced by ``getopt``. Only text output is implemented.

    Raises ValueError when the options select a non-text output type
    (via ``-t``, or via the extension of ``-o``). The ``-o`` value is used
    only for that type detection; no output file is written.
    """
    # Accept mappings as well as pair lists: iterating a dict directly
    # would unpack each key string into characters.
    if hasattr(opts, 'items'):
        option_pairs = opts.items()
    else:
        option_pairs = opts or ()

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    codec = 'utf-8'
    caching = True
    laparams = LAParams()
    # '-n' is applied after the loop so a later layout option cannot hit None.
    layout_enabled = True
    for (k, v) in option_pairs:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # Page numbers are 1-based in the option, 0-based internally.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            layout_enabled = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            pass  # layout mode is accepted but unused by text output
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            float(v)  # scale is validated as before but unused by text output
    if not layout_enabled:
        laparams = None

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outtype != 'text':
        # The original fell through to an undefined `device` (NameError).
        raise ValueError('only text output is supported, got %r' % (outtype,))

    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams,
                           imagewriter=imagewriter)
    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
        return retstr.getvalue()
    finally:
        device.close()
def pdfminerr(argv):
    """Convert the PDF files named in ``argv`` to text, XML, HTML or tags.

    ``argv`` is a full argument vector (``argv[0]`` is skipped); options
    follow the pdf2txt.py conventions and are parsed with ``getopt``.
    Output goes to the ``-o`` file, or to standard output when ``-o`` is
    not given.

    Returns 100 after printing the usage message when the options are
    invalid, no input file is given, or the output type is unknown;
    otherwise returns None.
    """
    import getopt

    def usage():
        print ("usage: just put the path to the pdf file in pdf.txt, and make sure you create a seprate folder and put nothing there except for this repository.")
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # '-n' is applied after the loop so that a later layout option
    # (e.g. "-n -A") no longer dereferences None.
    layout_enabled = True
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # Page numbers are 1-based on the command line, 0-based internally.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            layout_enabled = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not layout_enabled:
        laparams = None

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # No explicit -t: guess the type from the output file extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before the output file is created/truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams, imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos,
                                maxpages=maxpages, password=password,
                                caching=caching, check_extractable=True)
        finally:
            device.close()
    finally:
        # Never close sys.stdout; only close what this function opened.
        if outfile:
            outfp.close()
    return
def main(argv):
    """Batch-convert PDF files matched by wildcard arguments.

    ``argv`` is a full argument vector (``argv[0]`` is the program name).
    Each non-option argument is expanded with ``glob.glob``; every match
    is converted into a sibling file whose extension reflects the output
    type (``tag`` by default, selectable with ``-t``).

    Returns 100 after printing the usage text when the options are
    invalid, no file argument is given, or the output type is unknown;
    otherwise returns None.
    """
    import getopt
    import os.path

    def usage():
        print('Syntax:\npdf2htm.exe SourcePDF\n where the parameter is either a file name or\na wildcard spec like\n*.pdf\nEnclose it with quotes if it contains a space\n\nAdditional options are supported with named command line parameters as follows:')
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
              ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
              ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
              ' [-t text|html|xml|tag] [-c codec] [-s scale]'
              ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = 'tag'
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # '-n' is applied after the loop so that a later layout option
    # (e.g. "-n -A") no longer dereferences None.
    layout_enabled = True
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # Page numbers are 1-based on the command line, 0-based internally.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            layout_enabled = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not layout_enabled:
        laparams = None

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # Only reachable with an explicit empty -t; then -o's extension
        # selects the type. The -o file itself is never written: every
        # input gets its own output file below.
        outtype = 'tag'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    extensions = {'html': 'htm', 'tag': 'tag', 'text': 'txt', 'xml': 'xml'}
    # Validate up front; the original raised KeyError on the extension
    # lookup before it could reach its usage() branch.
    if outtype not in extensions:
        return usage()

    for fname in args:
        matches = glob.glob(fname)
        print('Converting ' + str(len(matches)) + ' from ' + fname + ' to ' + outtype + ' format')
        for pdf in matches:
            # splitext instead of pdf[0:-4]: correct for any extension length.
            outfile = os.path.splitext(pdf)[0] + '.' + extensions[outtype]
            print(outfile)
            with open(outfile, 'wb') as outfp:
                if outtype == 'text':
                    device = TextConverter(rsrcmgr, outfp, codec=codec,
                                           laparams=laparams,
                                           imagewriter=imagewriter)
                elif outtype == 'xml':
                    device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                          laparams=laparams,
                                          imagewriter=imagewriter)
                elif outtype == 'html':
                    device = HTMLConverter(rsrcmgr, outfp, codec=codec,
                                           scale=scale, layoutmode=layoutmode,
                                           laparams=laparams,
                                           imagewriter=imagewriter)
                else:
                    device = TagExtractor(rsrcmgr, outfp, codec=codec)
                device.showpageno = False
                try:
                    with open(pdf, 'rb') as fp:
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        for page in PDFPage.get_pages(fp, pagenos,
                                                      maxpages=maxpages,
                                                      password=password,
                                                      caching=caching,
                                                      check_extractable=True):
                            page.rotate = (page.rotate + rotation) % 360
                            interpreter.process_page(page)
                finally:
                    device.close()
    print('Done')
    return
doc = open_pdf(sys.argv[1]) Point = Route = False pages = page_count(doc) if pages == 68: Point = True elif pages == 1143: Route = True else: sys.stderr.write("PDF file not of recognised (NRG) format\n") sys.exit(1) rsrcmgr = PDFResourceManager() laparams = LAParams() laparams.line_margin = 0 # Forces every line to be absolutely positioned laparams.word_margin = 0.2 # Prevents space before narrow characters device = PDFPageAggregator(rsrcmgr, laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device) writer = BufferedWriter(sys.stdout) for (pageno, page) in enumerate(doc.get_pages()): interpreter.process_page(page) layout = device.get_result() # returns LTPage (text, other) = fsplit(lambda obj: isinstance(obj, LTText), layout) header_y = 0 if Point: # Locates bottom of header separator (lowest non-text < 10px height) header_y = reduce(lambda x, o: min(x, o.y0),
def main(argv):
    """Command-line entry point: convert PDF files to text, XML, HTML or tags.

    ``argv`` is a full argument vector (``argv[0]`` is the program name).
    Options are parsed with ``getopt``; every remaining argument is a PDF
    file that is passed through ``process_pdf`` into a single converter
    writing to the ``-o`` file or to standard output.

    Returns 100 after printing the usage line when the options are invalid,
    no input file is given, or the output type is unknown; otherwise
    returns None.
    """
    import getopt

    def usage():
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
               '[-n] [-A] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] '
               '[-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAD:M:L:W:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    codec = 'utf-8'
    scale = 1
    laparams = LAParams()
    # '-n' is applied after the loop so that a later layout option
    # (e.g. "-n -A") no longer dereferences None.
    layout_enabled = True
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # Page numbers are 1-based on the command line, 0-based internally.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-n':
            layout_enabled = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-D':
            laparams.writing_mode = v
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not layout_enabled:
        laparams = None

    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrc = PDFResourceManager()
    if not outtype:
        # No explicit -t: guess the type from the output file extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before the output file is created/truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams,
                                  outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale,
                                   laparams=laparams, outdir=outdir)
        else:
            device = TagExtractor(rsrc, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrc, device, fp, pagenos,
                                maxpages=maxpages, password=password)
        finally:
            device.close()
    finally:
        # Never close sys.stdout; only close what this function opened.
        if outfile:
            outfp.close()
    return
def main(fname, k, v):
    """Convert the PDF file ``fname`` applying one pdf2txt-style option.

    ``k`` is a single option flag (e.g. ``'-o'``, ``'-t'``, ``'-p'``) and
    ``v`` its value; a flag that is not recognised leaves all defaults in
    place. Output goes to the ``-o`` file when ``k == '-o'``, otherwise to
    standard output.

    Raises ValueError when the resulting output type is not one of
    ``text``, ``xml``, ``html`` or ``tag``. Returns None.
    """
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    if k == '-d':
        debug += 1
    elif k == '-p':
        # Page numbers are 1-based in the option, 0-based internally.
        pagenos.update(int(x) - 1 for x in v.split(','))
    elif k == '-m':
        maxpages = int(v)
    elif k == '-P':
        password = v
    elif k == '-o':
        outfile = v
    elif k == '-C':
        caching = False
    elif k == '-n':
        laparams = None
    elif k == '-A':
        laparams.all_texts = True
    elif k == '-V':
        laparams.detect_vertical = True
    elif k == '-M':
        laparams.char_margin = float(v)
    elif k == '-L':
        laparams.line_margin = float(v)
    elif k == '-W':
        laparams.word_margin = float(v)
    elif k == '-F':
        laparams.boxes_flow = float(v)
    elif k == '-Y':
        layoutmode = v
    elif k == '-O':
        imagewriter = ImageWriter(v)
    elif k == '-R':
        rotation = int(v)
    elif k == '-S':
        stripcontrol = True
    elif k == '-t':
        outtype = v
    elif k == '-c':
        codec = v
    elif k == '-s':
        scale = float(v)

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # No explicit -t: guess the type from the output file extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outtype not in ('text', 'xml', 'html', 'tag'):
        # The original called an undefined usage() here (NameError), and
        # only after the output file had already been created.
        raise ValueError('unsupported output type: %r' % (outtype,))

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams, imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Never close sys.stdout; only close what this function opened.
        if outfile:
            outfp.close()
    return
def pdf2txt(argv):
    """Convert PDF files using pdf2txt.py-style arguments.

    Unlike the ``main`` variants, ``argv`` contains options and file names
    only: its first element is parsed, not skipped as a program name.
    Output goes to the ``-o`` file, or to standard output when ``-o`` is
    not given.

    Raises ``getopt.GetoptError`` on an unknown option and ValueError on
    an unsupported output type. Returns None.
    """
    import getopt
    (opts, args) = getopt.getopt(argv[0:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # '-n' is applied after the loop so that a later layout option
    # (e.g. "-n -A") no longer dereferences None.
    layout_enabled = True
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # Page numbers are 1-based in the option, 0-based internally.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            layout_enabled = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not layout_enabled:
        laparams = None

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # No explicit -t: guess the type from the output file extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outtype not in ('text', 'xml', 'html', 'tag'):
        # The original called an undefined usage() here (NameError), and
        # only after the output file had already been created.
        raise ValueError('unsupported output type: %r' % (outtype,))

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   outdir=outdir)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos,
                                maxpages=maxpages, password=password,
                                caching=caching, check_extractable=True)
        finally:
            device.close()
    finally:
        # Never close sys.stdout; only close what this function opened.
        if outfile:
            outfp.close()
    return
def main(argv):
    """Command-line entry point: convert PDF files to text, XML, HTML or tags.

    ``argv`` is a full argument vector (``argv[0]`` is the program name).
    Options are parsed with ``getopt``; every remaining argument is a PDF
    file whose pages are rotated by the ``-R`` angle and rendered through
    a single converter into the ``-o`` file or standard output.

    Returns 100 after printing the usage line when the options are invalid,
    no input file is given, or the output type is unknown; otherwise
    returns None.
    """
    import getopt

    def usage():
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
               ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
               ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
               ' [-t text|html|xml|tag] [-c codec] [-s scale]'
               ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # '-n' is applied after the loop so that a later layout option
    # (e.g. "-n -A") no longer dereferences None.
    layout_enabled = True
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # Page numbers are 1-based on the command line, 0-based internally.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            layout_enabled = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not layout_enabled:
        laparams = None

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # No explicit -t: guess the type from the output file extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before the output file is created/truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams, imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Never close sys.stdout; only close what this function opened.
        if outfile:
            outfp.close()
    return
def main(argv):
    """Convert the PDF files named on the command line to text/HTML/XML/tags.

    ``argv`` is a complete argument vector: ``argv[0]`` is the program name
    (only used in the usage message) and ``argv[1:]`` is parsed with
    ``getopt``.

    Options:
        -d               increase the debug level (may be repeated)
        -p pagenos       comma-separated page numbers; one is subtracted
                         from each before it is passed on
        -m maxpages      maximum number of pages (integer)
        -P password      document password
        -o output        output name, handed to ``getRealOutput``
        -C               disable resource caching
        -n               pass ``laparams=None`` to the converter
        -A / -V          set ``all_texts`` / ``detect_vertical`` on LAParams
        -M / -L / -W / -F  set char_margin / line_margin / word_margin /
                         boxes_flow on LAParams (floats)
        -Y layout_mode   layout mode for the HTML and XML converters
        -O output_dir    argument for ``ImageWriter``
        -t type          one of text, html, xml, tag
        -c codec         output codec
        -s scale         scale for the HTML and XML converters (float)
        -r               pass ``roundCoords=True`` to the XML converter
        -S               pass ``simplifyOutput=True`` to the XML converter
        -f               for output types ending in "ml", buffer the result
                         and write it through BeautifulSoup's ``prettify``
                         (written unformatted if bs4 cannot be imported)

    When ``-t`` is not given the type defaults to ``text``, unless the output
    file name ends in .htm/.html/.xml/.tag.

    Returns:
        100 after printing the usage message (bad option, no input file, or
        unknown output type); ``None`` after a normal run.
    """
    import getopt

    def usage():
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
              '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
              '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] [-r] '
              '[-S] [-f] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'fSrdp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    roundCoords = False
    simplifyOutput = False
    formatOutput = False
    laparams = LAParams()
    # -n is recorded as a flag and applied after the loop; dropping laparams
    # immediately would make a later -A/-V/-M/-L/-W/-F fail on None.
    no_layout = False

    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            no_layout = True
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
        elif k == '-r':
            roundCoords = True
        elif k == '-S':
            simplifyOutput = True
        elif k == '-f':
            formatOutput = True
    if no_layout:
        laparams = None

    # Propagate the debug level to the pdfminer classes.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before any output object is created.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    # Pretty-printing only applies to markup output. The same flag decides
    # both whether we buffer and whether we post-process, so getvalue() is
    # never called on the real output object.
    buffer_output = formatOutput and outtype.endswith('ml')
    if buffer_output:
        # cStringIO / StringIO are the Python 2 module names.
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        outfp = StringIO()
    else:
        outfp = getRealOutput(outfile)

    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter, layoutmode=layoutmode,
                                  scale=scale, roundCoords=roundCoords,
                                  simplifyOutput=simplifyOutput)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        else:  # 'tag'
            device = TagExtractor(rsrcmgr, outfp, codec=codec)

        for fname in args:
            # The with-block closes the input even if processing fails.
            with open(fname, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=password, caching=caching,
                            check_extractable=True)
        device.close()

        if buffer_output:
            root = outfp.getvalue()
            with getRealOutput(outfile) as realOutput:
                try:
                    from bs4 import BeautifulSoup as bs
                except ImportError:
                    # Best effort: without bs4, emit the markup unformatted.
                    sys.stderr.write('Could not import BeautifulSoup, skipping output formatting')
                    realOutput.write(root)
                else:
                    realOutput.write(bs(root).prettify())
    finally:
        outfp.close()
    return