def convert(self, data):
    """Extract text from a PDF supplied as an in-memory byte string.

    data -- the raw contents of a PDF file.

    Returns whatever ``converter.get_text()`` yields after every page has
    been processed, or the empty string when the document cannot be
    initialized with an empty password or is not marked as extractable.
    """
    # Wrap the raw bytes in a file-like object so the parser can read them.
    pdfdata = StringIO(data)
    try:
        # NOTE(review): main() in this file makes the same call, with the
        # same default directory names, before creating a resource manager.
        # Presumably it tells CMapDB where its CMap data lives -- confirm.
        CMapDB.initialize('CMap', 'CDBCMap')

        # Create the resource manager and the device that collects text.
        rsrc = PDFResourceManager()
        converter = TextConverter(rsrc)

        # Set up the parser.  The name is otherwise unused, but the
        # constructor call is kept exactly as the original had it.
        doc = PDFDocument()
        parser = PDFParser(doc, pdfdata)

        # Initialize the document using an empty password.
        try:
            doc.initialize('')
        except PDFPasswordIncorrect:
            return ''

        # Give up if the document does not allow its contents to be extracted.
        if not doc.is_extractable:
            return ''

        # Do the conversion, one page at a time.
        interpreter = PDFPageInterpreter(rsrc, converter)
        for page in doc.get_pages():
            interpreter.process_page(page)
        converter.close()
    finally:
        # Release the buffer on every exit path, including the early
        # returns above and any exception raised while parsing.
        pdfdata.close()
    return converter.get_text()
def main(argv):
    """Command-line entry point: run convert() on every file named in argv.

    argv -- full argument vector; argv[0] is used as the program name in
            the usage message and argv[1:] is parsed with getopt.

    Options:
      -d            increase the debug level (may be repeated)
      -p pagenos    comma-separated integers; each is decremented by one
                    before being passed to convert()
      -P password   password passed to convert() (default '')
      -c codec      codec handed to the output converter (default 'ascii')
      -m maxpages   integer passed to convert() as maxpages (default 0)
      -C cmapdir    first argument to CMapDB.initialize (default 'CMap')
      -D cdbcmapdir second argument to CMapDB.initialize (default 'CDBCMap')
      -t type       one of html, sgml, tag (default 'html')
      -o output     file to write to, opened in 'wb' mode (default stdout)

    Returns 100 after printing the usage line when the arguments are
    unusable (unknown option, no input files, non-integer -p/-m value,
    unknown -t type); otherwise returns None.

    NOTE(review): main() is defined a second time further down this file;
    at import time the later definition replaces this one.
    """
    import getopt

    def usage():
        print('usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-t html|sgml|tag] [-o output] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:t:o:C:D:m:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    debug = 0
    cmapdir = 'CMap'
    cdbcmapdir = 'CDBCMap'
    codec = 'ascii'
    pagenos = set()
    maxpages = 0
    outtype = 'html'
    password = ''
    # Only remember the output path here; the file is opened once all
    # options have been validated so a usage error never creates or leaks it.
    outpath = None
    try:
        for (k, v) in opts:
            if k == '-d':
                debug += 1
            elif k == '-p':
                pagenos.update(int(x) - 1 for x in v.split(','))
            elif k == '-P':
                password = v
            elif k == '-c':
                codec = v
            elif k == '-m':
                maxpages = int(v)
            elif k == '-C':
                cmapdir = v
            elif k == '-D':
                cdbcmapdir = v
            elif k == '-t':
                outtype = v
            elif k == '-o':
                outpath = v
    except ValueError:
        # A non-integer -p or -m value is a usage error, not a crash.
        return usage()

    # Reject an unknown output type before touching any global state.
    if outtype not in ('sgml', 'html', 'tag'):
        return usage()

    if outpath is None:
        outfp = stdout
    else:
        outfp = open(outpath, 'wb')
    try:
        # Propagate the debug level to the classes that expose one.
        CMapDB.debug = debug
        PDFResourceManager.debug = debug
        PDFDocument.debug = debug
        PDFParser.debug = debug
        PDFPageInterpreter.debug = debug

        CMapDB.initialize(cmapdir, cdbcmapdir)
        rsrc = PDFResourceManager()
        if outtype == 'sgml':
            device = SGMLConverter(rsrc, outfp, codec)
        elif outtype == 'html':
            device = HTMLConverter(rsrc, outfp, codec)
        else:
            device = TagExtractor(rsrc, outfp, codec)

        for fname in args:
            convert(rsrc, device, fname, pagenos,
                    maxpages=maxpages, password=password)
    finally:
        # Close the file we opened ourselves; never close stdout.
        if outpath is not None:
            outfp.close()
    return
def main(argv):
    """Parse command-line options and convert each listed PDF file.

    argv -- full argument vector; argv[0] appears in the usage message and
            argv[1:] is handed to getopt.

    Recognized options:
      -d            bump the debug level by one per occurrence
      -p pagenos    comma-separated integers, each stored minus one
      -P password   forwarded to convert() as password (default '')
      -c codec      passed to the output converter (default 'ascii')
      -m maxpages   integer forwarded to convert() (default 0)
      -C cmapdir    first argument to CMapDB.initialize (default 'CMap')
      -D cdbcmapdir second argument to CMapDB.initialize (default 'CDBCMap')
      -t type       html, sgml or tag (default 'html')
      -o output     output file, opened with mode 'wb' (default stdout)

    Returns 100 after printing usage for an unknown option, a missing file
    list, a non-integer -p/-m value or an unknown -t type.  Returns None
    once every file has been passed to convert().

    NOTE(review): an earlier definition of main() with the same logic also
    exists in this file; this later one is the one bound at import time.
    """
    import getopt

    def usage():
        print('usage: %s [-d] [-p pagenos] [-P password] [-c codec] [-t html|sgml|tag] [-o output] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:t:o:C:D:m:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # Defaults, overridden by the options below.
    debug = 0
    cmapdir = 'CMap'
    cdbcmapdir = 'CDBCMap'
    codec = 'ascii'
    pagenos = set()
    maxpages = 0
    outtype = 'html'
    password = ''
    outpath = None  # opened later, after validation, so errors cannot leak it

    try:
        for (k, v) in opts:
            if k == '-d':
                debug += 1
            elif k == '-p':
                pagenos.update(int(x) - 1 for x in v.split(','))
            elif k == '-P':
                password = v
            elif k == '-c':
                codec = v
            elif k == '-m':
                maxpages = int(v)
            elif k == '-C':
                cmapdir = v
            elif k == '-D':
                cdbcmapdir = v
            elif k == '-t':
                outtype = v
            elif k == '-o':
                outpath = v
    except ValueError:
        # int() rejected a -p or -m value: report usage instead of a traceback.
        return usage()

    # Validate the output type up front, before any class-level debug flags
    # are changed or the output file is created.
    if outtype not in ('sgml', 'html', 'tag'):
        return usage()

    if outpath is None:
        outfp = stdout
    else:
        outfp = open(outpath, 'wb')
    try:
        # Share the debug level with every class that has a debug attribute.
        CMapDB.debug = debug
        PDFResourceManager.debug = debug
        PDFDocument.debug = debug
        PDFParser.debug = debug
        PDFPageInterpreter.debug = debug

        CMapDB.initialize(cmapdir, cdbcmapdir)
        rsrc = PDFResourceManager()
        if outtype == 'sgml':
            device = SGMLConverter(rsrc, outfp, codec)
        elif outtype == 'html':
            device = HTMLConverter(rsrc, outfp, codec)
        else:
            device = TagExtractor(rsrc, outfp, codec)

        for fname in args:
            convert(rsrc, device, fname, pagenos,
                    maxpages=maxpages, password=password)
    finally:
        # Only close a file this function opened; stdout is left alone.
        if outpath is not None:
            outfp.close()
    return