示例#1
0
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Dump selected objects of the PDF *fname* to *outfp* as XML.

    outfp      -- writable stream the XML is emitted to
    fname      -- path of the PDF file to read
    objids     -- iterable of object ids to dump (may be empty/None)
    pagenos    -- container of 0-based page numbers whose contents to dump
    password   -- document password ('' if none)
    dumpall    -- when true, dump every object in the document
    codec      -- stream decoding mode; 'raw'/'binary' suppress the
                  trailing newline
    extractdir -- unused here; kept for signature compatibility with the
                  companion extract/dump commands
    When none of objids/pagenos/dumpall are given, only the trailers are
    dumped.
    """
    # open() instead of the Python-2-only file() builtin, and a context
    # manager so the handle is closed even if a dump helper raises
    # (the original leaked it on any exception).
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
                if pageno in pagenos:
                    if codec:
                        for obj in page.contents:
                            obj = stream_value(obj)
                            dumpxml(outfp, obj, codec=codec)
                    else:
                        # no codec: dump the page attributes instead of
                        # the decoded content streams
                        dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
def pdf_to_text(page_object):
    """Extract text from a PDF, one string per text box / text line.

    page_object -- binary file-like object containing the PDF data
    Returns a list with the text of every LTTextBox/LTTextLine found, in
    page order.
    """
    parser = PDFParser(page_object)
    # Create a PDF document object that stores the document structure
    doc = PDFDocument(parser)
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.initialize('')
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Aggregate layout objects so per-page text geometry is available.
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    text_content = []
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        # receive the LTPage object for the page just processed
        layout = device.get_result()
        for item in layout:
            # 'item' instead of 'object' (avoid shadowing the builtin);
            # the original's single-element 'trial' list was redundant.
            if isinstance(item, (LTTextBox, LTTextLine)):
                text_content.append(item.get_text())
    return text_content
示例#3
0
def main():
    """Print per-page processing results and the outline of a hard-coded PDF.

    NOTE(review): Python 2 only (print statements); the input path is
    hard-coded.  Raises PDFTextExtractionNotAllowed when extraction is
    forbidden, and get_outlines() may raise for documents without
    bookmarks -- TODO confirm that is acceptable here.
    Returns 0 on success.
    """
    # Open a PDF file.
    with open('/home/chris/Documents/Literature/DFT Primer.pdf', 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization.
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        print rsrcmgr
        # Create a PDF device object.
        device = PDFDevice(rsrcmgr)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in PDFPage.create_pages(document):
            print interpreter.process_page(page)
        # Walk the outline (bookmark) tree and print (level, title) pairs.
        outlines = document.get_outlines()
        for (level,title,dest,a,se) in outlines:
            print (level, title)
    return 0
def with_pdf(pdf_doc, fn, pdf_pwd, *args):
    """Open the pdf document, and apply the function, returning the results.

    pdf_doc -- path of the PDF file
    fn      -- callable invoked as fn(doc, *args) when the document allows
               text extraction
    pdf_pwd -- document password ('' if none)
    Returns fn's result, or None when the file cannot be opened, an
    IOError occurs anywhere during processing, or the document is not
    extractable (preserves the original best-effort contract).
    """
    result = None
    try:
        # the context manager closes the handle even when fn() raises --
        # the original only closed it on the success path
        with open(pdf_doc, "rb") as fp:
            # create a parser object associated with the file object
            parser = PDFParser(fp)
            # create a PDFDocument object that stores the document structure
            doc = PDFDocument(parser)
            # connect the parser and document objects
            parser.set_document(doc)
            # supply the password for initialization
            doc.initialize(pdf_pwd)

            if doc.is_extractable:
                # apply the function and return the result
                result = fn(doc, *args)
    except IOError:
        # the file doesn't exist or similar problem
        pass
    return result
示例#5
0
def extractembedded(outfp, fname, objids, pagenos, password='',
                    dumpall=False, codec=None, extractdir=None):
    """Extract every embedded file of the PDF *fname* into *extractdir*.

    NOTE(review): Python 2 only (file() builtin, print >> syntax).
    outfp/objids/pagenos/dumpall/codec are unused; the signature mirrors
    the sibling dump commands so callers can dispatch uniformly.
    Raises PDFValueError for malformed filespecs and IOError when a
    target file already exists.
    """
    def extract1(obj):
        # obj is a /Filespec dictionary; prefer the Unicode file name
        # ('UF') over the plain one ('F')
        filename = os.path.basename(obj['UF'] or obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile' %
                (filename))
        path = os.path.join(extractdir, filename)
        # refuse to overwrite a previous extraction
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        print >>sys.stderr, 'extracting: %r' % path
        out = file(path, 'wb')
        out.write(fileobj.get_data())
        out.close()
        return

    fp = file(fname, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser, password)
    # scan every object in every xref table for /Filespec dictionaries
    for xref in doc.xrefs:
        for objid in xref.get_objids():
            obj = doc.getobj(objid)
            if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                extract1(obj)
    return
示例#6
0
    def parse_paragraphs(self, text):
        """Collect top-level headlines from *text* and, when any are
        found, delegate to self.count_paragraphs().

        Will only work for markdown elements divided by '##' markers,
        or for pdf like chapters, e.g. \\n\\n 2 Conclusion \\n\\n.
        For PDFs the built-in outline (level-1 entries) of
        self.paper_filename is used instead of scanning the text.
        """
        lines = text.split('\n')
        headlines = []

        if self.is_pdf:
            with open(self.paper_filename, 'rb') as pdf:
                parser = PDFParser(pdf)
                document = PDFDocument(parser)

                try:
                    outlines = document.get_outlines()
                    # only chapter-level (level 1) entries count
                    for (level, title, _, _, _) in outlines:
                        if level == 1:
                            headlines.append(title)
                except PDFNoOutlines:
                    logging.info(
                        "No outline found -> skipping paragraph search..."
                    )
        else:  # check markdown headlines
            # the original enumerate() index was never used
            headlines.extend(
                line for line in lines if line.startswith('## ')
            )

        if headlines:
            self.count_paragraphs(text, lines, headlines)
示例#7
0
def extract_pdf(file):
    """
    extract the string content of a pdf

    file -- binary file-like object with the PDF data
    Returns the extracted text, or -1 when the document forbids text
    extraction (kept for backward compatibility with existing callers).
    """
    parser = PDFParser(file)
    document = PDFDocument(parser)
    document.initialize("")
    if not document.is_extractable:
        return -1

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                           showpageno=False, laparams=laparams)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # empty pagenos set -> no page filtering; process every page
        for page in PDFPage.get_pages(file, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
        content = retstr.getvalue()
    finally:
        # the original leaked both the converter and its buffer
        device.close()
        retstr.close()
    return content
 def parse (self):
     """Serialize every object of the PDF at self.pdf into an XML string.

     Side effects: sets self.xml (the serialized document), self.bin_blob
     (bytes found after %%EOF when more than 3 bytes away), self.errors
     (parser errors) and self.bytes_read.  Malformed or unparseable
     objects are dumped as raw bytes / exception text instead of being
     fatal.  NOTE(review): Python 2 only (print statement, e.message).
     """
     fp = file(self.pdf, 'rb')
     parser = PDFParser(fp, dbg=self.debug)
     doc = PDFDocument(parser, dbg=self.debug)
     #extract blob of data after EOF (if it exists)
     if doc.found_eof and doc.eof_distance > 3:
         self.bin_blob = parser.read_from_end(doc.eof_distance)
     res = '<pdf>'
     visited = set() #keep track of the objects already visited
     for xref in doc.xrefs:
         for objid in xref.get_objids():
             if objid in visited:
                 continue
             # leftover debugging output for two specific object ids
             if objid == 21 or objid == 67:
                 print objid
             visited.add(objid)
             try:
                 obj = doc.getobj(objid)
                 res += '<object id="' + str(objid) + '">\n'
                 res += self.dump(obj)
                 res += '\n</object>\n\n'
             except PDFObjectNotFound as e:
                 # the xref lists the object but it cannot be parsed:
                 # dump 4 KiB of raw bytes from its offset, with '<'
                 # escaped so the XML stays well-formed
                 mal_obj = parser.read_n_from(xref.get_pos(objid)[1], 4096)
                 mal_obj = mal_obj.replace('<', '0x3C')
                 res += '<object id="%d" type="malformed">\n%s\n</object>\n\n' % (objid, mal_obj)
                 self.takenote(self.malformed, 'objects', objid)
             except Exception as e:
                 res += '<object id="%d" type="exception">\n%s\n</object>\n\n' % (objid, e.message)
     fp.close()
     res += self.dumptrailers(doc)
     res += '</pdf>'
     self.xml=res
     self.errors = doc.errors
     self.bytes_read = parser.BYTES
     return
示例#9
0
def dumppdf(fname, objids, pagenos, password='',
            dumpall=False, codec=None, extractdir=None):
    """Return selected objects of the PDF *fname* as one XML string.

    fname    -- path of the PDF file to read
    objids   -- iterable of object ids to dump (may be empty/None)
    pagenos  -- container of 0-based page numbers whose contents to dump
    password -- document password ('' if none)
    dumpall  -- when true, dump every object in the document
    codec    -- stream decoding mode; 'raw'/'binary' suppress the
                trailing newline
    extractdir -- unused; kept for signature compatibility
    When none of objids/pagenos/dumpall are given, only the trailers are
    dumped.
    """
    res = ""
    # open() instead of the Python-2-only file() builtin; the context
    # manager closes the handle even if a dump helper raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                res += dumpxml(obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
                if pageno in pagenos:
                    if codec:
                        for obj in page.contents:
                            obj = stream_value(obj)
                            res += dumpxml(obj, codec=codec)
                    else:
                        # no codec: dump the page attributes instead
                        res += dumpxml(page.attrs)
        if dumpall:
            res += dumpallobjs(doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            res += dumptrailers(doc)
    if codec not in ('raw', 'binary'):
        res += '\n'
    return res
示例#10
0
def get_toc(pdf_path):
    """Return the table of contents of the PDF at *pdf_path*.

    Returns a list of (level, title) tuples, one per outline entry;
    destinations and actions are discarded.
    Raises whatever document.get_outlines() raises for documents without
    an outline (unchanged from the original).
    """
    # context manager: the original never closed the file handle
    with open(pdf_path, "rb") as infile:
        parser = PDFParser(infile)
        document = PDFDocument(parser)
        return [(level, title)
                for (level, title, dest, a, structelem)
                in document.get_outlines()]
示例#11
0
def pdf2metadata(fp):
    """Return the document-information ("Info") metadata of the open PDF
    file object *fp*.

    The raw XMP metadata stream, when the catalog carries one, is fetched
    but not returned (matching the original behaviour).
    """
    pdf_parser = PDFParser(fp)
    document = PDFDocument(pdf_parser)
    pdf_parser.set_document(document)
    document.initialize()

    if 'Metadata' in document.catalog:
        # The raw XMP metadata -- resolved but intentionally unused here.
        metadata = resolve1(document.catalog['Metadata']).get_data()
    return document.info  # The "Info" metadata
示例#12
0
def loadPDF(library, file_name):
	"""adds a paper to the library

	NOTE(review): Python 2 only (print statements).  Only the first page
	is examined (the outer loop breaks after page 0), and the authors /
	citations lists are never populated, so the paper is registered with
	empty author and citation id lists -- confirm this is intentional.
	"""
	fp = open(file_name, 'rb')
	# Create a PDF parser object associated with the file object.
	parser = PDFParser(fp)
	# Create a PDF document object that stores the document structure.
	document = PDFDocument(parser)
	# Supply the password for initialization.
	# (If no password is set, give an empty string.)
	password = ""
	document.initialize(password)
	# Check if the document allows text extraction. If not, abort.
	if not document.is_extractable:
		print "CANT"
	#	raise PDFTextExtractionNotAllowed
	# Create a PDF resource manager object that stores shared resources.
	rsrcmgr = PDFResourceManager()
	# Set parameters for analysis.
	laparams = LAParams()
	# Create a PDF page aggregator object.
	device = PDFPageAggregator(rsrcmgr, laparams=laparams)
	interpreter = PDFPageInterpreter(rsrcmgr, device)
	
	text_content = []
	authors = []       #list of authors
	citations = []     #list of authors that have been cited

	#pages_length = sum(1 for page in document.get_pages())

	for ii, page in enumerate(PDFPage.create_pages(document)):
		print '---------------------------------------------------------------------------------------------------'
		print "page number {}".format(ii)
		interpreter.process_page(page)
		# receive the LTPage object for the page.
		layout = device.get_result()
		for jj, lt_obj in enumerate(layout._objs):
			# only the first few layout objects per page are inspected
			if jj>3:
				break
			if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
				cur_line = lt_obj.get_text().encode('ascii', 'ignore')
				match = pattern_ignore.match(cur_line)
				# short, non-ignored lines print highlighted (green);
				# everything else prints truncated to 150 chars (red)
				if match is None and len(cur_line)<200:
					print bcolors.OKGREEN +" "+cur_line+bcolors.ENDC
				else:
					print bcolors.FAIL+" "+cur_line[0:150]+bcolors.ENDC
				
			else:
				print "PICTURE"
		break


	paper_title = file_name
	paper = library.getPaper(paper_title)
	paper.addAuthorIds(authors)
	paper.addCitationIds(citations)
示例#13
0
def print_all_obj(filename):
    """Print the id and type of every object in the PDF at *filename*.

    Each object id is visited only once across all xref tables.
    NOTE(review): Python 2 only (file() builtin, print statement);
    get_obj_type() is defined elsewhere in this file.
    """
    with file(filename, 'rb') as f:
        parser = PDFParser(f)
        doc = PDFDocument(parser, None)
        visited_objids = set()
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                if objid in visited_objids:
                    continue
                visited_objids.add(objid)
                print objid, get_obj_type(doc.getobj(objid))
示例#14
0
    def proc(self, pdfFp):
        """Get meta-data as available from a PDF document.

        Populates self.info (the document "Info" dictionary),
        self.metadata (parsed XMP, only when the catalog carries a
        Metadata stream) and self.raw_doc (the buffer's raw contents).
        """
        pdf_parser = PDFParser(pdfFp)
        document = PDFDocument(pdf_parser)
        pdf_parser.set_document(document)
        document.initialize()

        self.info = document.info
        catalog = document.catalog
        if 'Metadata' in catalog:
            raw_xmp = resolve1(catalog['Metadata']).get_data()
            self.metadata = xmp_to_dict(raw_xmp)
        self.raw_doc = pdfFp.getvalue()
示例#15
0
def getDocumentInfoAndAnnotations(pdfFile):
   """Collect the 'Sticky Note' document-info annotation and every
   'Comment on Text' annotation from the PDF at path *pdfFile*.

   NOTE(review): docInfo and docAnnotations are built but never returned
   or stored anywhere visible -- as written the function only logs them;
   confirm whether a return statement is missing.
   NOTE(review): the error handler calls os.rename(file, ...) where
   'file' is the Python 2 builtin type, not a path -- that call would
   itself fail; presumably pdfFile was intended.
   Python 2 only (except-comma syntax, has_key).
   """
   logger.info("Parsing pdf file " + pdfFile);
   # Open PDF file.
   fp = open(pdfFile, 'rb');
   docInfo = None;
   docAnnotations = [];
   # Create a PDF parser object associated with the file object.
   parser = PDFParser(fp);
   # Create a PDF document object that stores the document structure.
   document = PDFDocument(parser);
   # Supply the password for initialization.
   # (If no password is set, give an empty string.)
   document.initialize('');
   # Create a PDF resource manager object that stores shared resources.
   rsrcmgr = PDFResourceManager();
   # Create a PDF device object.
   device = PDFDevice(rsrcmgr);
   # Create a PDF interpreter object.
   interpreter = PDFPageInterpreter(rsrcmgr, device);
   # Process each page contained in the document.
   pageNum = 0;
   for page in PDFPage.create_pages(document):
      pageNum += 1;
      interpreter.process_page(page);
      if(page.annots):
         try:
            # annots may be stored inline or as an indirect reference
            if isinstance( page.annots, list ):
               annots = page.annots;
            else:
               annots = page.annots.resolve();

            for annot in annots:
               if isinstance( annot, PDFObjRef ):
                  annot = annot.resolve();
   
                  if(annot.has_key('Subj')):
                     # first sticky note seen becomes the document info
                     if(annot['Subj'] == 'Sticky Note' and docInfo == None):
                        logger.debug('DOC INFO ' + annot['Subj'] + ' Contents=' + annot['Contents']);
                        docInfo = annot['Contents'];
                     elif(annot['Subj'] == 'Comment on Text'):
                        logger.debug('COMMENT ON TEXT ' + annot['Subj'] + ' Contents=' + annot['Contents']);
                        contents = annot['Contents'];
                        docAnnotations.append(str(pageNum) + ':' + contents);
                     else:
                        logger.debug('UNKNOWN ANNOTATION: ' + annot['Subj'] + ' Contents=' + annot['Contents']);

         except Exception, e:
            logger.error("error getting annotation");
            logger.exception(e);
            # move file to error
            os.rename(file, "/home1/northbr6/batch/apps/catalogue/output/error/" + os.path.basename(file));
示例#16
0
def pdf_from_resource(resource):
    """
    Builds PDF mining objects from input data.

    This function attempts to open a PDF file for processing.
    """
    document = PDFDocument()
    parser = PDFParser(resource)
    # cross-link parser and document (the old pdfminer API needs both
    # directions wired before initialization)
    parser.set_document(document)
    document.set_parser(parser)
    document.initialize()
    return document
示例#17
0
def parse(filename, maxlevel):
    """Print the PDF outline of *filename* as HTML heading tags.

    filename -- path of the PDF file
    maxlevel -- deepest outline level to print; deeper entries are skipped
    Titles have newlines removed and whitespace collapsed to single
    spaces before printing.
    """
    # context manager: the original never closed the file handle
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)

        outlines = doc.get_outlines()
        for (level, title, dest, a, se) in outlines:
            if level <= maxlevel:
                # NOTE(review): under Python 3 encode() yields bytes and
                # .replace('\n', '') would raise -- this assumes Python 2.
                title_words = title.encode('utf8') \
                                   .replace('\n', '') \
                                   .split()
                title = ' '.join(title_words)
                print('<h{level}>{title}</h{level}>'
                      .format(level=level, title=title))
示例#18
0
File: pdf.py  Project: CJStuart/amcat
    def load_document(self, _file, password=""):
        """turn the file into a PDFMiner document"""
        log.info("loading document...")
        doc = PDFDocument()
        parser = module_parser(_file)
        # wire parser and document together (both links are required by
        # the old pdfminer API) before initializing with the password
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        if not doc.is_extractable:
            raise ValueError("PDF text extraction not allowed")
        return doc
示例#19
0
	def Parse_PDF(self):
		"""Parse the PDF at self.filePath, appending one stripped text
		string per page onto self.text_content, and return that list."""

		def parse_lt_objs(lt_objs, page_number, text=None):
			"""Iterate through the list of LT* objects and capture the text contained in each."""
			# 'text' is passed by the recursive call but never read; its
			# default was a mutable [] (classic pitfall) -- now None.
			text_content = []
			for lt_obj in lt_objs:
				if isinstance(lt_obj, (LTTextBox, LTTextLine)):
					# text, so capture it directly
					text_content.append(lt_obj.get_text())
				elif isinstance(lt_obj, LTFigure):
					# LTFigure objects are containers for other LT* objects,
					# so recurse through the children
					text_content.append(parse_lt_objs(lt_obj, page_number, text_content))
			return '\n'.join(text_content)

		# context manager: the original never closed the handle
		with open(self.filePath, 'rb') as fp:
			parser = PDFParser(fp)
			document = PDFDocument(parser)
			try:
				document.initialize('')
			except Exception:
				# best-effort: some documents reject initialize(); ignore
				pass
			rsrcmgr = PDFResourceManager()
			device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
			interpreter = PDFPageInterpreter(rsrcmgr, device)

			# (the original also built a local text_content list and a
			# page_text dict that were never used -- both removed)
			for i, page in enumerate(PDFPage.create_pages(document)):
				interpreter.process_page(page)
				layout = device.get_result()
				self.text_content.append(parse_lt_objs(layout, (i + 1)).strip())

		return self.text_content
示例#20
0
def check_pdf_password(pdf, password):
        """Return True when *password* opens the PDF at path *pdf* with
        extraction allowed, False otherwise.

        NOTE(review): Python 2 only (print statements).  The bare except
        treats any initialization failure as a wrong password, and the
        file handle is never closed.
        """
        fp = open(pdf, 'rb')
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        try:
                doc.initialize(password)
                if doc.is_extractable:
                        print ''
                        print 'The PDF Password Is:' + password
                        return True
                else:
                        print 'exception'
                        return False
        except:
                # wrong password (or any other failure): rewind the
                # progress line and report no match
                print '\r',
                return False
def convert_file(pdf_file, file_name):
    """Return the full text of the PDF in *pdf_file* as a string.

    pdf_file  -- binary file-like object with the PDF data
    file_name -- name used only in the error message
    Raises PDFPage.PDFTextExtractionNotAllowed when the document forbids
    text extraction.
    """
    parser = PDFParser(pdf_file)
    pdf = PDFDocument(parser)
    pdf.initialize("")
    if not pdf.is_extractable:
        raise PDFPage.PDFTextExtractionNotAllowed("Document does not allow text extraction: " + file_name)

    resource = PDFResourceManager()
    laparams = LAParams()
    output = StringIO.StringIO()
    device = TextConverter(resource, output, codec="utf-8", laparams=laparams)
    try:
        interpreter = PDFPageInterpreter(resource, device)
        for page in PDFPage.create_pages(pdf):
            interpreter.process_page(page)
        return output.getvalue()
    finally:
        # the original leaked the converter's resources
        device.close()
示例#22
0
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None, extractdir=None):
    """Write the outline (bookmark) tree of the PDF *fname* to *outfp* as XML.

    objids/pagenos/dumpall/codec/extractdir are unused; the signature
    mirrors the sibling dump commands.  Outline destinations are resolved
    to 0-based page numbers where possible.  Documents without an outline
    produce no output (PDFNoOutlines is swallowed).
    NOTE(review): Python 2 leaning (file() builtin, bytes written via %s).
    """
    fp = file(fname, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    doc.initialize(password)
    # map page object id -> 0-based page number, for destination lookups
    pages = dict( (page.pageid, pageno) for (pageno,page)
                  in enumerate(PDFPage.create_pages(doc)) )
    def resolve_dest(dest):
        # a destination may be a named string, a PSLiteral name, or a
        # dict carrying the target under 'D'; normalize to the array form
        if isinstance(dest, str):
            dest = resolve1(doc.get_dest(dest))
        elif isinstance(dest, PSLiteral):
            dest = resolve1(doc.get_dest(dest.name))
        if isinstance(dest, dict):
            dest = dest['D']
        return dest
    try:
        outlines = doc.get_outlines()
        outfp.write('<outlines>\n')
        for (level,title,dest,a,se) in outlines:
            pageno = None
            if dest:
                dest = resolve_dest(dest)
                pageno = pages[dest[0].objid]
            elif a:
                # no direct destination: follow a /GoTo action instead
                action = a.resolve()
                if isinstance(action, dict):
                    subtype = action.get('S')
                    if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                        dest = resolve_dest(action['D'])
                        pageno = pages[dest[0].objid]
            # NOTE(review): e() is a helper defined elsewhere in this file
            # -- presumably XML-escapes the title; confirm.
            s = e(title).encode('utf-8', 'xmlcharrefreplace')
            outfp.write('<outline level="%r" title="%s">\n' % (level, s))
            if dest is not None:
                outfp.write('<dest>')
                dumpxml(outfp, dest)
                outfp.write('</dest>\n')
            if pageno is not None:
                outfp.write('<pageno>%r</pageno>\n' % pageno)
            outfp.write('</outline>\n')
        outfp.write('</outlines>\n')
    except PDFNoOutlines:
        # documents without bookmarks: emit nothing at all
        pass
    parser.close()
    fp.close()
    return
示例#23
0
File: slate.py  Project: areyesnav/slate
class PDF(list):
    """A PDF document represented as a list of per-page extraction results.

    file      -- binary file-like object with the PDF data
    password  -- document password ('' if none)
    just_text -- when truthy, drop the parsing machinery after extraction
    """

    def __init__(self, file, password='', just_text=1):
        self.parser = PDFParser(file)
        self.doc = PDFDocument(self.parser)
        self.parser.set_document(self.doc)
        self.doc.initialize(password)
        if self.doc.is_extractable:
            self.resmgr = PDFResourceManager()
            self.device = TextConverter(self.resmgr, outfp=StringIO())
            self.interpreter = PDFPageInterpreter(
               self.resmgr, self.device)
            for page in PDFPage.create_pages(self.doc):
                self.append(self.interpreter.process_page(page))
            self.metadata = self.doc.info
        if just_text:
            self._cleanup()

    def _cleanup(self):
        """ 
        Frees lots of non-textual information, such as the fonts
        and images and the objects that were needed to parse the
        PDF.
        """
        # Bug fix: when the document was not extractable, resmgr/device/
        # interpreter were never created and the original bare `del`
        # raised AttributeError; delete only attributes that exist.
        for attr in ('device', 'doc', 'parser', 'resmgr', 'interpreter'):
            if hasattr(self, attr):
                delattr(self, attr)

    def text(self, clean=True):
        """ 
        Returns the text of the PDF as a single string.
        Options:

          :clean:
            Removes misc cruft, like lots of whitespace.
        """
        if clean:
            return utils.normalise_whitespace(''.join(self))
        else:
            return ''.join(self) 
示例#24
0
def extractComments(fp):
    """Collect annotation comments from the open PDF file object *fp*.

    Each collected entry is (file name, objid, 1-based page index or -1,
    annotation subtype, subject, author, contents).
    NOTE(review): results accumulate in the local resultList but are
    never returned as far as this view shows -- confirm whether a return
    statement is missing.  Python 2 only (has_key, except-comma syntax,
    print >>).
    """
    parser = PDFParser(fp)
    doc = PDFDocument(parser, "")

    visited = set()
    pages = []
    resultList = []

    def extract(objid, obj):
        # classify one object: remember page object ids in order, and
        # pull the interesting fields out of annotation dicts (those
        # carrying a 'C' color entry)
        result = None
        if isinstance(obj, dict):
            # 'Type' is PDFObjRef type
            if obj.has_key('Type') and obj['Type'].name == 'Page':
                pages.append(objid)
            elif obj.has_key('C'):
                try:
                    # map the parent page reference to a 1-based index
                    pr = obj['P']
                    pi = pages.index(pr.objid)+1
                except:
                    pi = -1
                try:
                    result = (fp.name, objid, pi, obj['Subtype'].name, obj['Subj'],obj['T'],obj['Contents'])
                except:
                    # if any of the listed entries do not exist, ignore
                    #print(objid, pi, obj['Subtype'].name)
                    result = ()

        return result

    for xref in doc.xrefs:
        for objid in xref.get_objids():
            if objid in visited: continue
            visited.add(objid)
            try:
                obj = doc.getobj(objid)
                if obj is None: continue
                r= extract(objid,obj)
                if r:
                    resultList.append(r)
            except PDFObjectNotFound, e:
                print >>sys.stderr, 'not found: %r' % e
 def __init__(self, pdf):
     """Wire up pdfminer parsing state for *pdf* (an object exposing a
     .name path attribute).

     NOTE(review): the file handle opened here stays on
     self.file_pointer and is not closed anywhere in this view.
     """
     self.document = pdf
     #initialize parsing parameters
     self.file_pointer = open(self.document.name, 'rb')
     self.parser = PDFParser(self.file_pointer)
     self.pdf_document = PDFDocument(self.parser)
     self.pdf_document.initialize()
     # set resource management
     self.resource_manager = PDFResourceManager()
     self.pdf_device = PDFPageAggregator(self.resource_manager, laparams=LAParams())
     #set interpreter
     self.interpreter = PDFPageInterpreter(self.resource_manager, self.pdf_device)
示例#26
0
 def __init__(self, file, password="", just_text=1):
     """Parse *file* into per-page results appended onto this object.

     file      -- binary file-like object with the PDF data
     password  -- document password ("" if none)
     just_text -- when truthy, call self._cleanup() afterwards (defined
                  elsewhere) to drop the parsing machinery
     NOTE(review): when the document is not extractable, resmgr/device/
     interpreter are never created -- confirm _cleanup() tolerates that.
     """
     self.parser = PDFParser(file)
     self.doc = PDFDocument(self.parser)
     self.parser.set_document(self.doc)
     self.doc.initialize(password)
     if self.doc.is_extractable:
         self.resmgr = PDFResourceManager()
         self.device = TextConverter(self.resmgr, outfp=StringIO())
         self.interpreter = PDFPageInterpreter(self.resmgr, self.device)
         for page in PDFPage.create_pages(self.doc):
             self.append(self.interpreter.process_page(page))
         self.metadata = self.doc.info
     if just_text:
         self._cleanup()
示例#27
0
def get_headings(filename):
    """Find the PDF matching *filename* and return its outline and size.

    filename -- name whose last 14 characters are stripped before
                matching against the files in the /PDF directory
    Returns (list of (level, title), page_count) when the matching PDF
    has an outline, (None, page_count) when it does not, and falls off
    the end (None) when no file matches -- unchanged from the original.
    """
    os.chdir('..')
    rd.open_location("/PDF", True)
    filename_ = filename[:-14]

    for compare_filename in os.listdir(os.getcwd()):
        if filename_ == compare_filename[:-4]:
            # renamed locals 'file'/'in_file': don't shadow the builtin,
            # and close the handle (the original leaked it)
            with open(compare_filename, 'rb') as in_file:
                parse_file = PDFParser(in_file)
                pdf_doc = PDFDocument(parse_file)
                pages = 0
                for page in PDFPage.get_pages(in_file):
                    pages += 1
                headings_list = []
                try:
                    for (level, title, dest, a, structelem) in pdf_doc.get_outlines():
                        headings_list.append((level, title))
                    rd.open_location("/program", True)
                    return headings_list, pages
                except Exception:
                    # typically PDFNoOutlines; keep the best-effort
                    # behaviour of the original bare except
                    rd.open_location("/program", True)
                    return None, pages
示例#28
0
 def valid_toc(self, toc):
     """Check *toc* against the outline stored in the PDF at self._doc.

     toc entries appear to be (level, page, title) with levels one lower
     than the PDF's and pages 1-based -- inferred from the comparisons
     below; confirm against the callers.  Real outline entries are
     (level, title, dest, ...).  Returns True only when lengths, levels,
     destinations and titles all match.
     """
     with open(str(self._doc), "rb") as pdffile:
         parser = PDFParser(pdffile)
         document = PDFDocument(parser)
         try:
             real_toc = list(document.get_outlines())
         except PDFNoOutlines:
             # a PDF without an outline is valid only against an empty toc
             return len(toc) == 0
         print("TOC from PDF file:", real_toc)
         if len(real_toc) != len(toc):
             print("Incorrect TOC length")
             return False
         for ref, real in zip(toc, real_toc):
             print("Checking", ref)
             if not ref[0] + 1 == real[0]:
                 # level
                 return False
             if not self._is_reference_to_ith_page(real[2][0], ref[1] - 1):
                 # destination
                 return False
             if not ref[2] == real[1]:
                 # title
                 return False
     return True
示例#29
0
    def extract_contents(self):
        """Populate self.outlines with (title, pagenum) pairs taken from
        the document's built-in outline.

        Returns None when the document has no outline.  Uses self.fd (the
        open PDF file object) and self.get_pages_total().
        """
        parser = PDFParser(self.fd)
        doc = PDFDocument(parser)
        self.total_pages = self.get_pages_total()
        # Materialize as a list: under Python 3 zip() is a one-shot
        # iterator, so repeated search_page_toc() calls would otherwise
        # see an exhausted sequence after the first lookup.
        # NOTE(review): range(1, total_pages) stops at total_pages - 1,
        # so the last page can never be matched -- confirm intended.
        self.pages = list(zip(PDFPage.get_pages(self.fd),
                              range(1, self.total_pages)))

        try:
            outlines = doc.get_outlines()
        except PDFNoOutlines:
            # No built-in outlines
            return None
        else:
            # built-in outlines exist
            def search_page_toc(objid):
                # map a page object id to its page number; 0 = not found
                for page, pagenum in self.pages:
                    if page.pageid == objid:
                        return pagenum
                return 0

            for (level, title, dest, a, se) in outlines:
                if dest is not None:
                    pn = search_page_toc(dest[0].objid)
                    if pn > 0:
                        self.outlines.append((title, pn))
示例#30
0
    def __init__(self, *args, **kwargs):
        """Extract the text of the RIB document (self.doc, raw PDF bytes)
        into self.parsed_text using pdfminer.

        Supports both the old and the new pdfminer APIs; leaves
        parsed_text as b'' when pdfminer is not installed (logged) or
        the PDF is malformed (silent early return).
        """
        super(AccountRIB, self).__init__(*args, **kwargs)

        self.parsed_text = b''

        try:
            try:
                # new-style pdfminer: document and page live in their
                # own modules
                from pdfminer.pdfdocument import PDFDocument
                from pdfminer.pdfpage import PDFPage
                newapi = True
            except ImportError:
                # old-style pdfminer: PDFDocument comes from pdfparser
                from pdfminer.pdfparser import PDFDocument
                newapi = False
            from pdfminer.pdfparser import PDFParser, PDFSyntaxError
            from pdfminer.converter import TextConverter
            from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
        except ImportError:
            self.logger.warning('Please install python-pdfminer to get IBANs')
        else:
            parser = PDFParser(BytesIO(self.doc))
            try:
                if newapi:
                    doc = PDFDocument(parser)
                else:
                    doc = PDFDocument()
                    parser.set_document(doc)
                    doc.set_parser(parser)
            except PDFSyntaxError:
                return

            rsrcmgr = PDFResourceManager()
            out = BytesIO()
            device = TextConverter(rsrcmgr, out)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            if newapi:
                pages = PDFPage.create_pages(doc)
            else:
                doc.initialize()
                pages = doc.get_pages()
            for page in pages:
                interpreter.process_page(page)

            self.parsed_text = out.getvalue()
示例#31
0
def get_pdf_totalpage(file):
    """Return the total page count of the PDF at path *file*.

    Reads the /Count entry of the document catalog's /Pages node.
    """
    # context manager instead of rebinding the (builtin-shadowing)
    # parameter to an open handle that was never closed
    with open(file, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        return resolve1(document.catalog['Pages'])['Count']
示例#32
0
def main():
    """Command-line driver: extract links/emails/usernames/IPs/paths/
    software (and optionally image metadata) from one PDF file or every
    .pdf in a folder, then print the requested reports.

    Relies on module-level helpers defined elsewhere in this file
    (parse_args, printout, get_metadata, get_xml, retrieve_all, rex
    patterns, print_results, ...).  Exits with -1 when the path argument
    is neither a file nor a folder.
    """
    global OUTFILE, VERBOSE, ENCODING

    printout(BANNER)

    args = parse_args()

    # accumulators for everything found across all input files
    links = set()
    emails = set()
    usernames = set()
    ips = set()
    paths = set()
    softwares = set()
    locations = set()
    img_users = set()
    img_software = set()
    img_locations = set()
    img_serials = set()
    pdf_metadata = []
    img_metadata = []

    # get all input files
    if os.path.isfile(args.path):
        files = [args.path]
    elif os.path.isdir(args.path):
        files = [os.path.join(args.path, f) for f in os.listdir(args.path) if
                 os.path.isfile(os.path.join(args.path, f)) and f.endswith('.pdf')]
        printout('Files to be processed:', False)
        for h in files:
            printout(' %s' % os.path.join(args.path, h), False)
    else:
        printout('[!] Error: provided path %s is not a valid file or folder' % args.path)
        sys.exit(-1)

    # extract data from all files
    for f in files:
        with open(f, 'rb') as fp:

            try:

                if VERBOSE:
                    printout('* Processing file %s...' % f)
                else:
                    print(' ' * 200, end='\r')
                    print('* Processing file %s...' % f, end='\r')

                parser = PDFParser(fp)
                doc = PDFDocument(parser)
                if not doc.is_extractable:
                    printout('[!] Document %s is set not to be extractable. Trying anyway...' % f)
                    doc.is_extractable = True
                metadata = get_metadata(doc)
                metadata['_filename'] = f
                pdf_metadata.append(metadata)
                # decode the XML dump only when a text-based extraction
                # was requested
                if args.email or args.links or args.ips or args.paths or args.usernames or args.software:
                    xml = get_xml(f)
                    decoded = html.unescape(xml)
                if args.email:
                    emails |= set(retrieve_all(decoded, rex.RE_EMAIL))
                if args.links:
                    links |= set(retrieve_all(decoded, rex.RE_WWW))
                    links |= set(urls_in_tags(decoded.splitlines()))
                if args.ips:
                    ips |= set(retrieve_all(decoded, rex.RE_IP))
                # NOTE(review): this branch reads 'decoded', but the
                # guard above tests args.paths, not args.extract_paths --
                # possible NameError when only --extract-paths is set;
                # confirm the argparse wiring.
                if args.extract_paths:
                    paths |= set(paths_in_tooltips(decoded.splitlines()))
                if args.usernames or args.software:
                    [u, s] = get_users_sw_from_meta(metadata)
                    usernames |= set(u)
                    softwares |= set(s)
                if args.images:
                    image_meta = extract_images(doc, store_path=args.store_images, filename=f)
                    img_metadata.append(image_meta)
                    [img_u, img_sw, img_ser, img_loc] = get_users_sw_from_img_meta(image_meta)
                    img_users |= set(img_u)
                    img_software |= set(img_sw)
                    img_locations |= set(img_loc)
                    img_serials |= set(img_ser)
            except Exception as ex:
                printout('[!] Error while processing file %s: %s' % (f, ex))
                printout()
                printout(ex, False)

    # now we also retrieve info from the paths structure found
    [u_linux, u_mac, u_windows] = get_info_from_paths(paths)
    usernames |= set(u_linux)
    usernames |= set(u_mac)
    usernames |= set(u_windows)

    # if images were extracted and metadata to be shown, first show img metadata
    if args.metadata and args.images:
        printout('%s %s %s' % ('.' * 31, 'image metadata', '.' * 31))
        printout()
        print_image_metadata(img_metadata)
    # show pdf metadata
    if args.metadata:
        printout('%s %s %s' % ('.' * 32, 'PDF metadata', '.' * 32))
        printout()
        print_metadata(pdf_metadata)

    # print the summary of results
    if args.summary: printout('.' * 78 + '\n')
    if args.usernames: print_results('* Usernames found', usernames)
    if args.paths: print_results('* Paths found', paths)
    if args.ips: print_results('* IPs found', ips)
    if args.email: print_results('* Emails found', emails)
    if args.links: print_results('* Links found', links)
    if args.software: print_results('* Software found', softwares)
    if args.images:
        if img_users and args.usernames: print_results('* Users in images', img_users)
        if img_software and args.software: print_results('* Software in images', img_software)
        if img_locations: print_results('* GPS Locations', img_locations)
        if img_serials: print_results('* Serial # in images', img_serials)
示例#33
0
def main(argv):
    """Extract the pseudo-JSON "For DIVA" block appended to a thesis PDF.

    The thesis template ends with a marker ("€€€€ For DIVA €€€€", or just
    "For DIVA" in older templates) followed by a JSON-like dictionary and
    then per-language abstracts and keyword lists, each wrapped in the
    four-euro-sign marker.  This function renders the PDF to text, carves
    out that block, repairs artifacts the PDF-to-text rendering introduces
    (curly quotes, page breaks, stray newlines), parses the dictionary,
    attaches the abstracts/keywords, and writes the result to the JSON
    file named by --json.
    """
    global Verbose_Flag
    global Use_local_time_for_output_flag
    global testing

    argp = argparse.ArgumentParser(description="extract_pseudo_JSON-from_PDF.py: Extract the pseudo JSON from the end of the thesis PDF file")

    argp.add_argument('-v', '--verbose', required=False,
                      default=False,
                      action="store_true",
                      help="Print lots of output to stdout")

    argp.add_argument('-t', '--testing',
                      default=False,
                      action="store_true",
                      help="execute test code"
                      )

    argp.add_argument('-p', '--pdf',
                      type=str,
                      default="test.pdf",
                      help="read PDF file"
                      )

    argp.add_argument('-j', '--json',
                      type=str,
                      default="calendar_event.json",
                      help="JSON file for extracted calendar event"
                      )

    argp.add_argument('-a', '--acronyms',
                      type=str,
                      default="acronyms.tex",
                      help="acronyms filename"
                      )

    argp.add_argument('-l', '--ligatures',
                      default=False,
                      action="store_true",
                      help="leave ligatures rahter than replace them"
                      )



    args = vars(argp.parse_args(argv))

    Verbose_Flag=args["verbose"]

    filename=args["pdf"]
    if Verbose_Flag:
        print("filename={}".format(filename))

    # Render the whole PDF to UTF-8 text via pdfminer's TextConverter.
    #output_string = StringIO()
    output_string = BytesIO()
    with open(filename, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        #device = HTMLConverter(rsrcmgr, output_string, laparams=LAParams(), layoutmode='normal', codec='utf-8')

        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)

        text=output_string.getvalue().decode('UTF-8')
        if Verbose_Flag:
            print("text: {}".format(text))

    # define the maker string
    quad__euro_marker='€€€€'

    # look for the new start of the For DiVA information
    diva_start=text.find("{0} For DIVA {0}".format(quad__euro_marker))
    if diva_start < 0:
        # if not found, then try the older For DIVA string
        diva_start=text.find("For DIVA")

    if Verbose_Flag:
        print("For DIVA found at diva_start={}".format(diva_start))
    if diva_start >= 0:
        # Keep only the text from the marker onward, then from the first
        # '{' — the start of the pseudo-JSON dictionary.
        diva_data=text[:]
        diva_data=diva_data[diva_start:]
        diva_start=diva_data.find("{")
        if diva_start >= 0:
            diva_data=diva_data[diva_start:]
            # The "Number of lang instances" key terminates the dictionary
            # portion; everything after it is abstracts/keywords.
            end_block=diva_data.find('”Number of lang instances”:') # note these are right double quote marks
            if end_block < 0:
                end_block=diva_data.find('"Number of lang instances":') # note these are straight double quote marks
            if end_block > 0:
                end_block=diva_data.find(',', end_block)
                if end_block > 0:
                    # Truncate after the key's value and close the brace so
                    # the fragment becomes parseable JSON.
                    dict_string=diva_data[:]
                    dict_string=dict_string[:end_block]+'}'

                    # NOTE(review): the first argument below appears empty but
                    # presumably originally contained a form-feed (new page)
                    # character — TODO confirm against the repository.
                    dict_string=dict_string.replace('', '') #  remove any new page characters
                    dict_string=dict_string.replace('”', '"')
                    dict_string=dict_string.replace('\n\n', '\n')
                    dict_string=dict_string.replace(' \n', '')
                    dict_string=dict_string.replace(',}', '}')

                    dict_string=dict_string.replace('”', '"')
                    #dict_string=dict_string.replace('&quot;', '"')
                    #dict_string=dict_string.replace('<br>', '\n')
                    #dict_string=dict_string.replace('<br>"', '\n"')
                    #dict_string=dict_string.replace('<br>}', '\n}')
                    dict_string=dict_string.replace(',\n\n}', '\n}')
                    dict_string=dict_string.replace(',\n}', '\n}')

                    # fix an error in the early template
                    if dict_string.find(',Äddress": ') > 0:
                        print("fix an error in the early template")
                        dict_string=dict_string.replace(',Äddress": ', ',"Address": "')
                        dict_string=dict_string.replace('\"Lindstedtsvägen', 'Lindstedtsvägen')
                        dict_string=dict_string.replace('¨Lindstedtsvägen', 'Lindstedtsvägen')
                        dict_string=dict_string.replace('¨Isafjordsgatan', 'Isafjordsgatan')



                    if not args['ligatures']:
                        dict_string=replace_ligature(dict_string)
                        print("looking for and replacing ligatures")

                    if Verbose_Flag:
                        print("dict_string={}".format(dict_string))
                    # NOTE(review): unconditional print duplicates the
                    # Verbose_Flag-gated one above — likely debug leftovers.
                    print("dict_string={}".format(dict_string))
                    d=json.loads(dict_string)
                    if Verbose_Flag:
                        print("d={}".format(d))

                    # Everything after the dictionary holds the per-language
                    # abstracts and keywords, delimited by the €€€€ marker.
                    abs_keywords=diva_data[(end_block+1):]
                    abs_keywords=abs_keywords.replace('', '')
                    if Verbose_Flag:
                        print("abs_keywords={}".format(abs_keywords))
                    number_of_quad_euros=abs_keywords.count(quad__euro_marker)
                    if Verbose_Flag:
                        print("number_of_quad_euros={}".format(number_of_quad_euros))
                    abstracts=dict()
                    keywords=dict()
                    # Markers should come in open/close pairs per section.
                    if (number_of_quad_euros % 2) == 1:
                        print("Odd number of markers")

                    save_abs_keywords=abs_keywords[:]

                    # First pass: collect abstracts, keyed by the 3-letter
                    # language code taken from ”Abstract[<lang>]”.
                    number_of_pairs_of_markers=int(number_of_quad_euros/2)
                    for i in range(0, number_of_pairs_of_markers):
                        abstract_key_prefix='”Abstract['
                        key_offset=abs_keywords.find(abstract_key_prefix)
                        if key_offset > 0:
                            # found a key for an abstract
                            # get language code
                            lang_code_start=key_offset+len(abstract_key_prefix)
                            lang_code=abs_keywords[lang_code_start:lang_code_start+3]
                            quad__euro_marker_start=abs_keywords.find(quad__euro_marker, lang_code_start)
                            if quad__euro_marker_start >= 0:
                                quad__euro_marker_end=abs_keywords.find(quad__euro_marker, quad__euro_marker_start + 5)
                                abstracts[lang_code]=abs_keywords[quad__euro_marker_start+5:quad__euro_marker_end]
                                #br_offset=abstracts[lang_code].find('<br>')
                                #if br_offset >= 0:
                                #    abstracts[lang_code]=abstracts[lang_code][br_offset+4:]

                                # advance past this section before the next pass
                                abs_keywords=abs_keywords[quad__euro_marker_end+1:]


                    # Second pass over a fresh copy: collect keywords the
                    # same way, from ”Keywords[<lang>]” sections.
                    abs_keywords=save_abs_keywords[:]

                    for i in range(0, number_of_pairs_of_markers):
                        abstract_key_prefix='”Keywords['
                        key_offset=abs_keywords.find(abstract_key_prefix)
                        if key_offset > 0:
                            # found a key for an abstract
                            # get language code
                            lang_code_start=key_offset+len(abstract_key_prefix)
                            lang_code=abs_keywords[lang_code_start:lang_code_start+3]
                            quad__euro_marker_start=abs_keywords.find(quad__euro_marker, lang_code_start)
                            if quad__euro_marker_start > 0:
                                quad__euro_marker_end=abs_keywords.find(quad__euro_marker, quad__euro_marker_start + 5)
                                keywords[lang_code]=abs_keywords[quad__euro_marker_start+5:quad__euro_marker_end]
                                keywords[lang_code]=keywords[lang_code].replace('\n', '') # remove newlines from keywords
                                keywords[lang_code]=keywords[lang_code].strip() # remove starting end ending white space
                                br_offset=keywords[lang_code].find('<br>')
                                if br_offset >= 0:
                                    keywords[lang_code]=keywords[lang_code][br_offset+4:]
                                abs_keywords=abs_keywords[quad__euro_marker_end+1:]


                    for a in abstracts:
                        print("a={0}, abstract={1}".format(a, abstracts[a]))
                        abstracts[a]=clean_up_abstract(abstracts[a])

                    # Optionally expand acronyms found in any abstract using
                    # the LaTeX acronyms file given by --acronyms.
                    any_acronyms_in_abstracts=False
                    for a in abstracts:
                        acronyms_present=check_for_acronyms(abstracts[a])
                        if acronyms_present:
                            any_acronyms_in_abstracts=True

                    if any_acronyms_in_abstracts:
                        acronyms_filename=args["acronyms"]
                        print("Acronyms found, getting acronyms from {}".format(acronyms_filename))
                        acronym_dict=get_acronyms(acronyms_filename)
                        if len(acronym_dict) == 0:
                            print("no acronyms found in {}".format(acronyms_filename))
                        else:
                            # entries of the form: acronym_dict[label]={'acronym': acronym, 'phrase': phrase}
                            for a in abstracts:
                                abstracts[a]=spellout_acronyms_in_abstract(acronym_dict, abstracts[a])


                    print("abstracts={}".format(abstracts))
                    print("keywords={}".format(keywords))

                    # Merge everything and write the final JSON file.
                    d['abstracts']=abstracts
                    d['keywords']=keywords
                    output_filename=args["json"]
                    if Verbose_Flag:
                        print("output_filename={}".format(output_filename))
                    with open(output_filename, 'w', encoding='utf-8') as output_FH:
                        j_as_string = json.dumps(d, ensure_ascii=False)
                        print(j_as_string, file=output_FH)

            else:
                # Fallback for old documents without the
                # "Number of lang instances" key: clean and parse the whole
                # remaining block as one dictionary (no abstracts/keywords).
                print('No "Number of lang instances" found')
                dict_string=diva_data[:]
                print("initial dict_string={}".format(dict_string))

                # NOTE(review): see note above — argument presumably held a
                # form-feed character originally.
                dict_string=dict_string.replace('', '') #  remove any new page characters

                dict_string=dict_string.replace('”', '"')
                dict_string=dict_string.replace('\n\n', '\n')
                dict_string=dict_string.replace(' \n', '')
                dict_string=dict_string.replace(',}', '}')

                #dict_string=dict_string.replace('&quot;', '"')
                #dict_string=dict_string.replace('<br>', '\n')
                #dict_string=dict_string.replace('<br>"', '\n"')
                #dict_string=dict_string.replace('<br>}', '\n}')
                dict_string=dict_string.replace(',\n\n}', '\n}')
                dict_string=dict_string.replace(',\n}', '\n}')
                # fix an error in the early template
                if dict_string.find(',Äddress": ') > 0:
                    print("fix an error in the early template")
                    dict_string=dict_string.replace(',Äddress": ', ',"Address": "')
                    dict_string=dict_string.replace('\"Lindstedtsvägen', 'Lindstedtsvägen')
                    dict_string=dict_string.replace('¨Lindstedtsvägen', 'Lindstedtsvägen')
                    dict_string=dict_string.replace('¨Isafjordsgatan', 'Isafjordsgatan')

                if not args['ligatures']:
                    dict_string=replace_ligature(dict_string)
                    print("looking for and replacing ligatures")

                print("dict_string={}".format(dict_string))
                d=json.loads(dict_string)
                print("d={}".format(d))

                output_filename=args["json"]
                if Verbose_Flag:
                    print("output_filename={}".format(output_filename))
                with open(output_filename, 'w', encoding='utf-8') as output_FH:
                    j_as_string = json.dumps(d, ensure_ascii=False)
                    print(j_as_string, file=output_FH)
示例#34
0
def parse_assessment_to_excel(assessment_path, database_path):

    utc_now = datetime.utcnow()

    data_dictionary = OrderedDict(
        {"Processed_UTC": utc_now.isoformat()}
    )  # Lets make a dictionary where all the parsed values are kept, lets add time when parsing was started
    # TODO add also processed file name

    assessment_file = open(assessment_path, 'rb')

    parser = PDFParser(assessment_file)
    doc = PDFDocument(parser)
    fields = resolve1(doc.catalog['AcroForm'])['Fields']
    for i in fields:
        field = resolve1(i)
        key, value = field.get('T'), field.get('V')

        if debug:
            print '{}: {} -> {}'.format(key, value, type(value))  # DEBUG

        if type(value) == str:

            unicode_value = unicode(
                value.decode("iso-8859-1").replace(
                    u"\xfe\xff\x00",
                    u"").replace(u"\x00", u"").replace(u'\xfe\xff', u"")
            )  # Lets convert the string to unicode and replace is needed to remove some funny characters
            data_dictionary[key] = [unicode_value]

        elif value == None:
            data_dictionary[key] = [u"ei"]

        else:
            data_dictionary[key] = [value.name]

            if value.name == "Off":
                data_dictionary[key] = [u"ei"]

            if value.name == "Yes":
                data_dictionary[key] = [u"jah"]

    assessment_file.close()

    # Create pandas dataframe for exporting data
    data_frame = pandas.DataFrame(data_dictionary)

    if debug:
        print list(data_frame.columns)  # DEBUG

    if os.path.exists(database_path) == True:

        print "Info  - Database file {} already exists, loading previous records".format(
            database_path)
        existing_data = pandas.read_excel(
            database_path, index_col=0)  # TODO set first column as index

        if debug:
            print existing_data

        # Add to exsiting data
        data_frame = existing_data.append(data_frame, sort=False)

        # Fix index numbering
        data_frame = data_frame.reset_index(drop=True)  # Fix index numbering

        # Create backup of current database
        move_file(database_path,
                  "database_backup", "{:%Y%m%dT%H%M%S}_{}".format(
                      utc_now,
                      uuid.uuid4()))  # Create unique filename for each bacup

    # Export to excel and add formatting

    sheet_name = "Hindamised"

    writer = pandas.ExcelWriter(database_path, engine='xlsxwriter')
    data_frame.to_excel(writer, sheet_name, encoding='utf8')

    # Get sheet to do some formatting
    sheet = writer.sheets[sheet_name]

    # Set default column size, if this does not work you are missing XslxWriter module
    first_col = 1
    last_col = len(data_frame.columns)
    width = 25
    sheet.set_column(first_col, last_col, width)

    # freeze column names and ID column
    sheet.freeze_panes(1, 1)

    # Apply filter to excel
    first_row = 0
    last_row = len(data_frame)
    sheet.autofilter(first_row, first_col, last_row, last_col)

    # Save the file
    writer.save()

    return data_dictionary
示例#35
0
def run_convert_code():
    """Extract daily COVID-19 figures from scanned PDF bulletins.

    OCRs every PDF found in ``../files/pdf`` with Tesseract (first two
    pages at most), pulls the bulletin date and the national and regional
    case counts out of the recognised text with regular expressions,
    groups the per-day records by month, writes one ``<dd-mm-yyy>.json``
    file per month, and moves the produced JSON files to ``../files/jsons``.

    Bug fixes versus the previous revision:
    * ``nb_cas_thies`` was *compared* (``==``) instead of assigned (``=``),
      leaving the name undefined and crashing when building the record.
    * the empty-match fallback for "contacts suivis" assigned to a
      misspelled variable (``cas_ccontacts_suivis_nums``).
    * ``cas_gueris_nums`` had no ``[0]`` fallback when the sentence matched
      but contained no digit (IndexError).
    * the PDF file handle opened for page counting was never closed.
    """
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

    records = []
    entries = os.listdir('../files/pdf')
    source_file = ''
    text = ''
    date = ''

    # month name (lower case) -> zero-padded month number; a lookup miss
    # raises KeyError, handled by the per-file except like the original's
    # undefined-name path.
    month_numbers = {
        'janvier': '01', 'février': '02', 'mars': '03', 'avril': '04',
        'mai': '05', 'juin': '06', 'juillet': '07', 'août': '08',
        'septembre': '09', 'octobre': '10', 'novembre': '11', 'décembre': '12',
    }

    def _count_near(pattern, haystack):
        """First integer inside the first match of *pattern*, or 0."""
        match = re.search(pattern, haystack, re.IGNORECASE)
        if not match:
            return 0
        digits = [int(tok) for tok in str(match.group(0)).split() if tok.isdigit()]
        return digits[0] if digits else 0

    def _region_count(region, haystack):
        """Count reported within 20 non-digit chars of *region*'s name, or 0."""
        pattern = (r"(?i)(?:\b" + region + r"\D{0,20})([0-9][0-9,]*)[^.,]"
                   r"|([0-9][0-9,]*)[^.,](?:\D{0,20}" + region + r")")
        found = re.findall(pattern, haystack)
        if not found:
            return 0
        digits = re.findall(r'\d+', str(found))
        return int(digits[0]) if digits else 0

    # walk every PDF bulletin in the input folder
    for pdf_path in entries:
        source_file = pdf_path
        try:
            # OCR page 1 (and page 2 when the document has more than one).
            images = convert_from_path(
                '../files/pdf/' + pdf_path,
                poppler_path=r'C:\poppler-21.03.0\Library\bin')
            ocr_first = pytesseract.image_to_data(images[0],
                                                  lang='eng',
                                                  output_type=Output.DICT)
            text1 = " ".join(ocr_first['text'])

            with open('../files/pdf/' + pdf_path, 'rb') as pdf_file:
                parser = PDFParser(pdf_file)
                document = PDFDocument(parser)
                page_count = resolve1(document.catalog['Pages'])['Count']
            if page_count > 1:
                ocr_second = pytesseract.image_to_data(images[1],
                                                       lang='eng',
                                                       output_type=Output.DICT)
                text2 = " ".join(ocr_second['text'])
            else:
                text2 = ''
            text = text1 + text2

            # The bulletin date follows a weekday name:
            # "<jour> <dd> <mois> <yyyy>".
            jour = re.search(
                "(lundi|mardi|Mercredi|mércredi|jeudi|vendredi|samedi|dimanche)",
                text, re.IGNORECASE)

            pat = r"\W*([\w]+)"
            n = 3
            groups = re.search(
                r'{}\W*{}{}'.format(pat * n, str(jour.group(0)), pat * n),
                text, re.IGNORECASE).groups()
            day_token = str(groups[n:][0])
            journ = day_token if len(day_token) >= 2 else '0' + day_token
            mois = month_numbers[str(groups[n:][1]).lower()]
            date = journ + '/' + mois + '/' + str(groups[n:][2])

        except Exception:
            # Best effort: on any OCR/date failure keep the previous
            # `text`/`date` values, as the original code did.
            print('petit probleme')

        # national figures, each "0" when the sentence or number is absent
        nouveaux_cas = _count_near(
            r'(\w+\s+){0,3}sont revenus positifs(\w+\s+){0,3}', text)
        cas_importes = _count_near(
            r'(\w+\s+){0,3}cas importés(\w+\s+){0,3}', text)
        cas_contacts = _count_near(
            r'(\w+\s+){0,3}cas contacts(\w+\s+){0,3}', text)
        tests_realises = _count_near(
            r'(\w+\s+){0,3}tests réalisés(\w+\s+){0,3}', text)
        sous_traitement = _count_near(
            r'(\w+\s+){0,3}sous traitement(\w+\s+){0,3}', text)
        # computed but (as before) not exported in the JSON record
        contacts_suivis = _count_near(
            r'(\w+\s+){0,3}contacts suivis(\w+\s+){0,3}', text)
        cas_communautaires = _count_near(
            r'(\w+\s+){0,12} communautaire(\w+\s+){0,3}', text)
        nombre_gueris = _count_near(
            r'(\w+\s+){0,6}négatifs et déclarés guéris(\w+\s+){0,3}', text)
        nombre_deces = _count_near(
            r'(\w+\s+){0,10}décès(\w+\s+){0,10}', text)

        # per-region figures (JSON key -> name searched in the OCR text)
        localites = {
            'Dakar': _region_count(r'Dakar', text),
            'Thies': _region_count(r'Thiès', text),
            'Diourbel': _region_count(r'Diourbel', text),
            'Fatick': _region_count(r'Fatick', text),
            'Kaolack': _region_count(r'Kaolack', text),
            'Kaffrine': _region_count(r'Kaffrine', text),
            'Louga': _region_count(r'Louga', text),
            'Kolda': _region_count(r'Kolda', text),
            'Tambacounda': _region_count(r'Tamba', text),
            'Ziguinchor': _region_count(r'Ziguinchor', text),
            'Saint-Louis': _region_count(r'Saint-Louis', text),
            'Matam': _region_count(r'Matam', text),
            'Sedhiou': _region_count(r'Sédhiou', text),
            'Kedougou': _region_count(r'Kédougou', text),
        }

        records.append({
            'date': date,
            'nouveaux_cas': nouveaux_cas,
            'cas_importes': cas_importes,
            'cas_contacts': cas_contacts,
            'test_realise': tests_realises,
            'personne_sous_traitement': sous_traitement,
            'cas_communautaires': cas_communautaires,
            'nombre_gueris': nombre_gueris,
            'nombre_deces': nombre_deces,
            'date_heure_extraction': str(datetime.now()),
            'nom_fichier_source': source_file,
            'localites': localites,
        })

    # Group records by month+year: characters 3:12 of 'dd/mm/yyyy'.
    month_keys = set(record['date'][3:12] for record in records)
    for key in month_keys:
        monthly = [record for record in records if record['date'][3:12] == key]
        out_name = key.replace('/', '-')
        if out_name:
            with open(str(out_name) + '.json', 'w', encoding='utf-8') as handle:
                json.dump(monthly, handle, ensure_ascii=False, indent=4)
    # collect the produced month files into the output folder
    for produced in glob.glob('*.json'):
        shutil.move(produced, '../files/jsons')
示例#36
0
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

# Top-level script: extract lower-cased, punctuation-free text from every
# PDF in the ACL2020 folder.  Depends on names imported elsewhere in the
# file (os, string, PDFDocument, PDFResourceManager, PDFPageAggregator,
# PDFPageInterpreter, LAParams) — TODO confirm they are all in scope.
filepath='C:/Users/lenovo/Desktop/ACL2020'
list1=os.listdir(filepath)
# NOTE(review): list_words and corpus are initialised but never filled here
list_words=[]
corpus=[]
for i in range(len(list1)):
    outs=""
    fp = open(filepath+'/'+list1[i], 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser=parser)
    parser.set_document(doc=doc)
    resource = PDFResourceManager()
    laparam = LAParams()
    device = PDFPageAggregator(resource, laparams=laparam)
    interpreter = PDFPageInterpreter(resource, device)
    # aggregate the layout of every page; text objects are prepended,
    # so the document's text ends up in reverse page/object order
    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)
        layout = device.get_result()
        for out in layout:
            if hasattr(out, 'get_text'):
                outs=out.get_text()+outs
    outs=outs.lower().replace('\n','')
    # NOTE(review): unused here, and the last two entries are duplicates
    english_pu=['’','“','“']
    punctuation_map = dict((ord(char), None) for char in string.punctuation)
    without_punctuation = outs.translate(punctuation_map)  # strip ASCII punctuation from the article text
def PDF_to_TXT_regex(title):
    """Scan one PDF for incubation-period mentions and plot a histogram.

    Renders *title* into the module-level ``output_string`` buffer, then
    splits the accumulated text into sentences and, for each sentence
    containing "incubation", extracts "<number> day(s)" values.  Each
    value found is appended to the module-level ``incubations`` list,
    after which a histogram of all values gathered so far is displayed.
    """
    print("Title: {}".format(title))

    # Render every page of the PDF into the shared text buffer.
    with open(title, 'rb') as pdf_file:
        pdf_parser = PDFParser(pdf_file)
        document = PDFDocument(pdf_parser)
        manager = PDFResourceManager()
        converter = TextConverter(manager, output_string, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(manager, converter)
        for pdf_page in PDFPage.create_pages(document):
            page_interpreter.process_page(pdf_page)

    import re

    # Harvest day counts sentence by sentence; the pattern accepts an
    # optional decimal part, e.g. " 5 days" or " 2.5 day".
    for sentence in output_string.getvalue().split(". "):
        if "incubation" not in sentence:
            continue
        day_matches = re.findall(r" ((\d{1,2}\.)?\d{1,2}) day[s]?", sentence)
        print(day_matches)
        if len(day_matches) > 0:
            incubations.append(float(day_matches[0][0]))

    print(incubations)
    print(sorted(incubations))

    import matplotlib.pyplot as plt

    # Fixed 2-day-wide bins covering 0-28 days.
    bins = [0., 2., 4., 6., 8., 10., 12., 14., 16., 18., 20., 22., 24., 26., 28.]

    figure = plt.figure(figsize=[14, 16])
    plt.rc('font', size=18)
    plt.hist((incubations), bins=bins)
    plt.xlabel('Incubation period')
    plt.ylabel('Frequency/Probability')
    plt.title('Histogram of Coronavirus incubation periods')
    plt.show()
    plt.close()
示例#38
0
文件: pdf.py 项目: antibios/weboob
def get_pdf_rows(data, miner_layout=True):
    """
    Takes PDF file content as string and yield table row data for each page.

    For each page in the PDF, the function yields a list of rows.
    Each row is a list of cells. Each cell is a list of strings present in the cell.
    Note that the rows may belong to different tables.

    There are no logic tables in PDF format, so this parses PDF drawing instructions
    and tries to find rectangles and arrange them in rows, then arrange text in
    the rectangles.

    External dependencies:
    PDFMiner (http://www.unixuser.org/~euske/python/pdfminer/index.html).

    :param data: PDF file content as bytes.
    :param miner_layout: when True, let PDFMiner run its layout analysis
        (LAParams) while aggregating each page; when False, aggregate without
        it and sort text objects by coordinates instead.
    """

    # pdfminer is imported lazily so this module can be loaded without it.
    try:
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
    except ImportError:
        raise ImportError('Please install python-pdfminer')

    # pdfminer moved PDFDocument/PDFPage between releases; probe for the
    # newer module layout ("newapi") and fall back to the legacy one.
    try:
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfpage import PDFPage
        newapi = True
    except ImportError:
        from pdfminer.pdfparser import PDFDocument
        newapi = False
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.layout import LAParams, LTRect, LTTextBox, LTTextLine, LTLine, LTChar, LTCurve

    parser = PDFParser(BytesIO(data))
    try:
        if newapi:
            doc = PDFDocument(parser)
        else:
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
    except PDFSyntaxError:
        # Not parsable as a PDF: yield no pages at all.
        return

    rsrcmgr = PDFResourceManager()
    if miner_layout:
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    else:
        device = PDFPageAggregator(rsrcmgr)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    if newapi:
        pages = PDFPage.get_pages(BytesIO(data), check_extractable=True)
    else:
        doc.initialize()
        pages = doc.get_pages()

    # When DEBUGFILES logging is enabled, every stage of the analysis is
    # rendered to PNG files in a fresh temporary directory for inspection.
    if LOGGER.isEnabledFor(DEBUGFILES):
        import tempfile
        import PIL.Image as Image
        import PIL.ImageDraw as ImageDraw
        import random

        path = tempfile.mkdtemp(prefix='pdf')

    for npage, page in enumerate(pages):
        LOGGER.debug('processing page %s', npage)
        interpreter.process_page(page)
        page_layout = device.get_result()

        # Collect every text fragment of the page as multiline text objects.
        texts = sum([list(lttext_to_multilines(obj, page_layout)) for obj in page_layout._objs if isinstance(obj, (LTTextBox, LTTextLine, LTChar))], [])
        LOGGER.debug('found %d text objects', len(texts))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for t in texts:
                color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                draw.rectangle((t.x0, t.y0, t.x1, t.y1), outline=color)
                draw.text((t.x0, t.y0), t.text.encode('utf-8'), color)
            fpath = '%s/1text-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        # Without miner layout analysis the text order is arbitrary; sort by
        # position instead.
        if not miner_layout:
            texts.sort(key=lambda t: (t.y0, t.x0))

        # TODO filter ltcurves that are not lines?
        # TODO convert rects to 4 lines?
        lines = [lt_to_coords(obj, page_layout) for obj in page_layout._objs if isinstance(obj, (LTRect, LTLine, LTCurve))]
        LOGGER.debug('found %d lines', len(lines))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for l in lines:
                color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                draw.rectangle((l.x0, l.y0, l.x1, l.y1), outline=color)
            fpath = '%s/2lines-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        # Drop duplicated drawing instructions before building the grid.
        lines = list(uniq_lines(lines))
        LOGGER.debug('found %d unique lines', len(lines))

        # Group line segments into rows of rectangular boxes.
        rows = build_rows(lines)
        LOGGER.debug('built %d rows (%d boxes)', len(rows), sum(len(row) for row in rows))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for r in rows:
                for b in r:
                    color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                    draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1), outline=color)
            fpath = '%s/3rows-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        # Assign each text fragment to the box that contains it.
        textrows = arrange_texts_in_rows(rows, texts)
        LOGGER.debug('assigned %d strings', sum(sum(len(c) for c in r) for r in textrows))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for row, trow in zip(rows, textrows):
                for b, tlines in zip(row, trow):
                    color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                    draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1), outline=color)
                    draw.text((b.x0 + 1, b.y0 + 1), '\n'.join(tlines).encode('utf-8'), color)
            fpath = '%s/4cells-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        yield textrows
    device.close()
示例#39
0
    def get_pdf_details(self):
        """Extract title and main content from the PDF in ``self.response``.

        The PDF body is written to a temporary file under a random name,
        rendered with pdfminer (once as text, once as HTML), cleaned up with
        jusText, and the temporary files are removed afterwards.

        Side effects: sets ``self.title``, ``self.content`` and
        ``self.raw_content`` when extraction succeeds.
        """
        # Save the pdf locally under a random name; deleted after processing.
        random_string = str(uuid.uuid4())[0:10]
        file_path = os.path.join(BASE_DIR, 'pdf_files',
                                 "{}.pdf".format(random_string))
        html_file_path = os.path.join(BASE_DIR, 'pdf_files',
                                      "{}.html".format(random_string))
        with open(file_path, 'wb') as f:
            f.write(self.response.content)

        text = ""

        # Usage Type 1:
        # Rendering pdf as text. Best way to get PDF content, but got problems
        # with jusText, not getting article as expected.
        with open(file_path, 'rb') as f:
            parser = PDFParser(f)
            document = PDFDocument(parser)
            manager = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(manager, laparams=laparams)
            interpreter = PDFPageInterpreter(manager, device)
            for page in PDFPage.get_pages(f):
                interpreter.process_page(page)
                # BUG FIX: get_result() returns the layout of the page that
                # was just processed, so it must be read inside the loop.
                # Reading it once after the loop extracted only the last page.
                layout = device.get_result()
                for element in layout:
                    if isinstance(element, LTTextBoxHorizontal):
                        # Wrap each text box as an HTML paragraph so jusText
                        # can find relative texts.
                        text += "<p>{}</p>".format(element.get_text())
            # End of usage type 1

            # Usage Type 2:
            # Rendering pdf as html. Not a great way to get PDF content.
            # Font sizes, html elements etc. not rendering as expected.
            # If fixed, would work with jusText as expected.
            with open(html_file_path, 'wb') as outf:
                extract_text_to_fp(f, outf, output_type='html')
        with open(html_file_path, 'rb') as f:
            # NOTE(review): this overwrites the text built by usage type 1,
            # exactly as the original flow did — confirm this is intended.
            text = " ".join(
                [x.decode().replace('\n', '') for x in f.readlines()])

        # End of usage type 2

        if document.info:
            self.title = document.info[0].get('Title', None)
            if self.title:
                self.title = self.title.decode()

        # jusText raises exception if text variable is empty.
        if text:
            parapraphs = justext.justext(
                text, justext.get_stoplist(language='English'))

            # Keep only paragraphs jusText classifies as good body text.
            content = " ".join([
                parapraph.text for parapraph in parapraphs
                if not parapraph.is_boilerplate and not parapraph.is_heading
                and parapraph.class_type == 'good'
            ])

            self.content = content
            self.raw_content = content

        # Remove redundant temporary files.
        os.unlink(file_path)
        os.unlink(html_file_path)
示例#40
0
文件: PDF.py 项目: mcspx/rpaframework
    def get_input_fields(self,
                         source_pdf: str = None,
                         replace_none_value: bool = False) -> dict:
        """Get input fields in the PDF.

        Stores input fields internally so that they can be used without
        parsing PDF again.

        Parameter `replace_none_value` is for convenience to visualize fields.

        :param source_pdf: source filepath, defaults to None
        :param replace_none_value: if value is None replace it with key name,
            defaults to False
        :return: dictionary of input key values or `None`
        """
        record_fields = {}
        # Reuse the cached fields when no new source document was given.
        if source_pdf is None and self.active_fields:
            return self.active_fields
        self.switch_to_pdf_document(source_pdf)
        source_parser = PDFParser(self.active_fileobject)
        source_document = PDFDocument(source_parser)
        try:
            fields = resolve1(source_document.catalog["AcroForm"])["Fields"]
        except KeyError:
            self.logger.info('PDF "%s" does not have any input fields.',
                             self.active_pdf)
            return None

        for field_ref in fields:
            field = resolve1(field_ref)
            if field is None:
                continue
            name = field.get("T")
            if name is None:
                # Robustness fix: an unnamed field used to crash on
                # name.decode() with AttributeError; skip it instead.
                continue
            value = field.get("V")
            rect = field.get("Rect")
            label = field.get("TU")
            key = name.decode("iso-8859-1")
            decoded_label = label.decode("iso-8859-1") if label else None
            if value is None and replace_none_value:
                # Visualization aid: show the field name in place of a value.
                field_value = key
            else:
                try:
                    field_value = value.decode("iso-8859-1") if value else ""
                except AttributeError:
                    # Value is not bytes-like (e.g. list/PSLiteral); keep raw.
                    self.logger.debug("Attribute error")
                    field_value = value
            record_fields[key] = {
                "value": field_value,
                "rect": iterable_items_to_int(rect),
                "label": decoded_label,
            }

        self.active_fields = record_fields if record_fields else None
        return record_fields
示例#41
0
def cate(load_path, keywords, mode, save_path, category, win):
    """Scan PDFs under *load_path* and copy those whose text matches keywords.

    Every PDF in the directory is rendered page by page with pdfminer; as
    soon as a page's text satisfies ``is_in(keywords, text, mode)`` the file
    is copied via ``copy(load_path, pdf, save_path, category)`` and scanning
    moves to the next file.

    :param load_path: directory containing the PDF files (with trailing sep,
        since the filename is appended by string concatenation).
    :param keywords: keywords forwarded to is_in() for matching.
    :param mode: matching mode forwarded to is_in().
    :param save_path: destination directory forwarded to copy().
    :param category: category name, used in the progress-window title.
    :param win: parent Tk window; when not None a progress bar is displayed.
    :raises PDFTextExtractionNotAllowed: when a PDF forbids text extraction.
    """
    new_win = None
    canvas = None
    fill_line = None
    if win is not None:
        # Build a minimal 300x20 progress-bar window.
        new_win = Toplevel(win)
        new_win.title(category + '分类进度')
        new_win.geometry('300x20')
        canvas = Canvas(new_win, width=300, height=20, bg="white")
        canvas.place(x=0, y=0)
        fill_line = canvas.create_rectangle(0,
                                            0,
                                            300,
                                            20,
                                            width=0,
                                            fill="green")

    pdfs = os.listdir(load_path)
    if not pdfs:
        # Robustness fix: an empty directory used to raise ZeroDivisionError.
        if new_win is not None:
            new_win.destroy()
        return
    # One progress-bar step (in pixels) per file.
    step = 300 / len(pdfs)
    n = step
    for pdf in tqdm(pdfs):
        with open(load_path + pdf, 'rb') as fp:
            parser = PDFParser(fp)
            document = PDFDocument(parser)
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed
            # Shared rendering resources for this document.
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            flag = False
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                # The LTPage holds text, image, line and curve objects.
                layout = device.get_result()
                for item in layout:
                    if isinstance(item, LTText):
                        text = item.get_text()
                        if is_in(keywords, text, mode):
                            copy(load_path, pdf, save_path, category)
                            flag = True
                            break
                if flag:
                    # File already matched and copied; skip remaining pages.
                    break

        if new_win is not None and canvas is not None and fill_line is not None:
            n = n + step
            canvas.coords(fill_line, (0, 0, n, 20))
            new_win.update()
    if new_win is not None:
        new_win.destroy()
示例#42
0
def get_title_from_io(pdf_io, min_ch, min_wd):
    """Guess the title of a PDF from the text blocks of its first page.

    The candidate title is text with the largest font size near the top of
    the first page; the exact strategy depends on the module-level ALGO
    setting ("original", "max2" or "max_position").  If the candidate is
    shorter than *min_ch* characters or has fewer than *min_wd* words, the
    search is retried with the font-size cutoff lowered, up to 4 times.

    :param pdf_io: binary file-like object containing the PDF.
    :param min_ch: minimum title length in characters (checked when > 1).
    :param min_wd: minimum number of words (checked when > 1).
    :return: the title string, or None when the PDF forbids extraction.
    """
    parser = PDFParser(pdf_io)
    # if pdf is protected with a pwd, 2nd param here is password
    doc = PDFDocument(parser)

    # pdf may not allow extraction
    # pylint: disable=no-else-return
    if doc.is_extractable:
        rm = PDFResourceManager()
        dev = TextOnlyDevice(rm)
        interpreter = TextOnlyInterpreter(rm, dev)

        # A plain-text rendering of the first page, used later to recover
        # missing spaces in the candidate title.
        first_page = StringIO()
        converter = TextConverter(rm, first_page, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(rm, converter)

        # Only the first page is processed (break after one iteration).
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            page_interpreter.process_page(page)
            break

        converter.close()
        first_page_text = first_page.getvalue()
        first_page.close()
        dev.recover_last_paragraph()
        verbose('all blocks')

        for b in dev.blocks:
            verbose(b)

        # Block tuples: index 1 is the font size, index 3 the y coordinate,
        # index 4 the characters (per their use below).
        title = None
        max_tfs_cutoff = None
        tfs_tol = 1
        y_tol = 1
        max_num_iter = 4  # number of times to lower max_tfs_cutoff if title too short or too few words.
        tfs_iter = 0
        while tfs_iter < max_num_iter and (
                not title or (min_ch > 1 and len(title) < min_ch) or
            (min_wd > 1 and len(title.split(' ')) > 1
             and len(title.split(' ')) < min_wd)):
            tfs_iter += 1
            # pylint: disable=W0603
            # global ALGO # don't need 'global ALGO' as it's not being modified, can still read it.
            if ALGO == "original":
                # find max font size
                max_tfs = max([
                    b for b in dev.blocks
                    if (not max_tfs_cutoff or b[1] < max_tfs_cutoff)
                ],
                              key=lambda x: x[1])[1]
                verbose('max_tfs: ', max_tfs)
                # find max blocks with max font size
                max_blocks = list(
                    filter(lambda x: abs(x[1] - max_tfs) < tfs_tol,
                           dev.blocks))
                # find the one with the highest y coordinate
                # this is the most close to top
                max_y = max(max_blocks, key=lambda x: x[3])[3]
                verbose('max_y: ', max_y)
                found_blocks = list(
                    filter(lambda x: abs(x[3] - max_y) < y_tol, max_blocks))
                verbose('found blocks')

                for b in found_blocks:
                    verbose(b)

                title = ''
                for b in found_blocks:
                    title += ''.join(b[4])
            elif ALGO == "max2":
                # find max font size
                all_tfs = sorted(list(
                    map(lambda x: x[1], [
                        b for b in dev.blocks
                        if (not max_tfs_cutoff or b[1] < max_tfs_cutoff)
                    ])),
                                 reverse=True)
                max_tfs = all_tfs[0]
                verbose('max_tfs: ', max_tfs)
                # Take the run of blocks at max font size plus the blocks at
                # the next (second-largest) font size that follow them.
                selected_blocks = []
                max2_tfs = -1
                for b in dev.blocks:
                    if max2_tfs == -1:
                        if abs(b[1] - max_tfs) < tfs_tol:
                            selected_blocks.append(b)
                        elif len(selected_blocks) > 0:  # max is added
                            selected_blocks.append(b)
                            max2_tfs = b[1]
                    else:
                        if abs(b[1] -
                               max_tfs) < tfs_tol or abs(b[1] -
                                                         max2_tfs) < tfs_tol:
                            selected_blocks.append(b)
                        else:
                            break

                for b in selected_blocks:
                    verbose(b)

                title = ''
                for b in selected_blocks:
                    title += ''.join(b[4])
            elif ALGO == "max_position":
                # find max font size
                max_tfs = max([
                    b for b in dev.blocks
                    if (not max_tfs_cutoff or b[1] < max_tfs_cutoff)
                ],
                              key=lambda x: x[1])[1]
                verbose('max_tfs: ', max_tfs)
                # find max blocks with max font size
                tfs_tol = 1
                max_blocks = [
                    b for b in dev.blocks if abs(b[1] - max_tfs) < tfs_tol
                ]
                for b in max_blocks:
                    verbose(b)
                # Now use the y-range of max_blocks as the check
                # for all blocks, with a much higher tolerance for
                # tfs to account for sub/superscript characters which
                # can vary by +/- 10pts.
                y_max = max(max_blocks, key=lambda x: x[3])[3]
                y_min = min(max_blocks, key=lambda x: x[3])[3]
                y_range = y_max - y_min
                y_mid = (y_max + y_min) * 0.5
                verbose(f"{y_range = }, {y_mid = }")
                # find the one with the highest y coordinate
                # this is the most close to top
                y_tol = 2
                tfs_tol = 8
                found_blocks = [
                    b for b in dev.blocks if b in max_blocks or (
                        b[3] <= y_max + y_tol and b[3] >= y_min -
                        y_tol and abs(b[1] - max_tfs) < tfs_tol)
                ]
                verbose('found blocks')

                for b in found_blocks:
                    verbose(b)

                title = ''
                for b in found_blocks:
                    title += ''.join(b[4])
            else:
                raise Exception("unsupported ALGO")

            # Next iteration (if any) only considers strictly smaller fonts.
            max_tfs_cutoff = max_tfs

            verbose(f"before retrieving spaces, {title = }")

            # Retrieve missing spaces if needed
            # if " " not in title:
            #     title = retrieve_spaces(first_page_text, title)
            new_title = retrieve_spaces_word_based(first_page_text,
                                                   title.replace(' ', ''))
            if len(new_title) > len(title):
                title = new_title

            # Remove duplicate spaces if any are present
            if "  " in title:
                title = " ".join(title.split())

        return title
    else:
        return None
示例#43
0
    def makepdf(self, pdfdata1, udct, zeros):
        """Build the incremental-update bytes that append a signature to a PDF.

        :param pdfdata1: original PDF file content as bytes.
        :param udct: signature options keyed by bytes (b'sigpage',
            b'sigflags', b'contact', b'location', b'signingdate', b'reason').
        :param zeros: placeholder bytes reserved for the /Contents signature.
        :return: the bytes to append to ``pdfdata1`` — new objects, an xref
            section and a trailer forming a PDF incremental update.
        """
        parser = PDFParser(BytesIO(pdfdata1))
        document = PDFDocument(parser, fallback=False)

        # Previous xref offset and the Info/Root object ids of the document.
        prev = document.find_xref(parser)
        info = document.xrefs[0].trailer['Info'].objid
        root = document.xrefs[0].trailer['Root'].objid
        size = 1
        # calculate last object id, size is only xref size but not count of object in xref
        for ref in document.xrefs:
            if isinstance(ref, PDFXRefStream):
                no = max(ref.ranges, key=operator.itemgetter(1))[1]
            else:
                if len(ref.offsets) == 0:
                    no = 0
                else:
                    no = max(ref.offsets.keys())
            size = max(size, no)
        # Clamp the requested signature page into the valid range, then map
        # the page index to the page object id.
        pages = len(document.getobj(document.catalog['Pages'].objid)['Kids'])
        page = udct.get(b'sigpage',
                        0) if 0 <= udct.get(b'sigpage', 0) <= pages - 1 else 0
        page = document.getobj(
            document.catalog['Pages'].objid)['Kids'][page].objid

        nsig, fields = self.getfields(root, document)
        annots = self.getannots(page, document)

        # Raw dictionary bodies of the objects to re-emit, with the listed
        # keys (AcroForm, Annots) stripped so they can be re-added below.
        infodata = self.getdata(pdfdata1, info, prev, document)
        rootdata = self.getdata(pdfdata1, root, prev, document, ('AcroForm', ))
        pagedata = self.getdata(pdfdata1, page, prev, document, ('Annots', ))

        # New object ids start right after the last existing one.
        no = size + 1
        visualization, nav = self.makevisualization(no, udct, nsig, page)
        objs = [
            self.makeobj(page,
                         (b'/Annots[%s%d 0 R]' % (annots, no + 3) + pagedata)),
            self.makeobj(no + 0, infodata),
            self.makeobj(no + 1, (b'/AcroForm %d 0 R' % (no + 2)) + rootdata),
            self.makeobj(
                no + 2, b'/Fields[%s%d 0 R]/SigFlags %d' %
                (fields, no + 3, udct[b'sigflags'])),
            visualization,
            self.makeobj(nav + 1, (
                b'/ByteRange [0000000000 0000000000 0000000000 0000000000]/ContactInfo(%s)\
/Filter/Adobe.PPKLite/Location(%s)/M(D:%s)/Prop_Build<</App<</Name/>>>>/Reason(%s)/SubFilter/adbe.pkcs7.detached/Type/Sig\
/Contents <' % (udct[b'contact'], udct[b'location'], udct[b'signingdate'],
                udct[b'reason'])) + zeros + b'>'),
            #            self.makeobj(nav + 1, (b'/ByteRange [0000000000 0000000000 0000000000 0000000000]/ContactInfo(%s)\
            #/Filter/Adobe.PPKMS/SubFilter/ETSI.CAdES.detached/Location(%s)/M(D:%s)/Prop_Build<</App<</Name/>>>>/Reason(%s)/SubFilter/adbe.pkcs7.detached/Type/Sig\
            #/Contents <' % (udct[b'contact'], udct[b'location'], udct[b'signingdate'], udct[b'reason'])) + zeros + b'>'),
        ]

        # Build the cross-reference table: byte offsets of each new object
        # relative to the end of the original file (startxref).
        size = nav - no + 2
        pdfdata2 = b''.join(objs)
        startxref = len(pdfdata1)
        xref = b'xref\n%d 1\n%010d 00000 n \n%d %d\n' % (
            page, startxref + pdfdata2.find(b'\n%d 0 obj\n' % page) + 1, no,
            size)
        xref += b''.join([
            b'%010d 00000 n \n' % (startxref + pdfdata2.find(b'\n%d 0 obj\n' %
                                                             (no + i)) + 1)
            for i in range(size)
        ])

        trailer = b'''\
trailer
<</ID [<%(h1)s><%(h2)s>]/Info %(info)d 0 R/Prev %(prev)d/Root %(root)d 0 R/Size %(size)d>>\n\
startxref\n\
%(startxref)d\n\
%%%%EOF\n\
'''
        # The /ID pair is derived from MD5 digests of the old and new data.
        trailer = trailer % {
            b'page': page,
            b'no': no,
            b'startxref': startxref + len(pdfdata2),
            b'prev': prev,
            b'info': no + 0,
            b'root': no + 1,
            b'size': size,
            b'h1': hashlib.md5(pdfdata1).hexdigest().upper().encode('ascii'),
            b'h2': hashlib.md5(pdfdata2).hexdigest().upper().encode('ascii'),
        }

        pdfdata2 = pdfdata2 + xref + trailer

        return pdfdata2
示例#44
0
from pdfminer.pdfinterp import PDFPageInterpreter

from pdfminer.pdfdevice import PDFDevice

fp = open('./test.pdf', 'rb')

# Create a PDF document parser object.

parser = PDFParser(fp)

# Create a PDF document object that stores the document structure.

# Initialize with the document password; empty string when there is none.

password = ''  # BUG FIX: `password` was referenced without being defined.

document = PDFDocument(parser, password)

# Check that the document allows text extraction.

if not document.is_extractable:

    raise PDFTextExtractionNotAllowed

# Create a PDF resource manager object to store shared resources.

rsrcmgr = PDFResourceManager()

# Create a PDF device object.

device = PDFDevice(rsrcmgr)
示例#45
0
    def __init__(self,
                 pdf_stream,
                 password="",
                 pagenos=None,
                 maxpages=0):  # noqa: C901
        """Read a PDF stream with pdfminer: metadata, text and references.

        :param pdf_stream: binary file-like object with the PDF content.
        :param password: password for encrypted documents (empty if none).
        :param pagenos: page numbers to restrict extraction to; None or an
            empty list means all pages.
        :param maxpages: maximum number of pages to process (0 = no limit).
        """
        ReaderBackend.__init__(self)
        # BUG FIX: the mutable default argument `pagenos=[]` was shared
        # across calls; use None as the sentinel instead.
        if pagenos is None:
            pagenos = []
        self.pdf_stream = pdf_stream

        # Extract Metadata
        parser = PDFParser(pdf_stream)
        doc = PDFDocument(parser, password=password, caching=True)
        if doc.info:
            for k in doc.info[0]:
                v = doc.info[0][k]
                # NOTE(review): `unicode` relies on a py2/3 compat shim
                # defined elsewhere in this module — confirm it exists.
                if isinstance(v, (bytes, str, unicode)):
                    self.metadata[k] = make_compat_str(v)
                elif isinstance(v, (psparser.PSLiteral, psparser.PSKeyword)):
                    self.metadata[k] = make_compat_str(v.name)

        # XMP metadata embedded in the document catalog, if any.
        if "Metadata" in doc.catalog:
            metadata = resolve1(doc.catalog["Metadata"]).get_data()
            self.metadata.update(xmp_to_dict(metadata))

        # Extract Content
        text_io = BytesIO()
        rsrcmgr = PDFResourceManager(caching=True)
        converter = TextConverter(rsrcmgr,
                                  text_io,
                                  codec="utf-8",
                                  laparams=LAParams(),
                                  imagewriter=None)
        interpreter = PDFPageInterpreter(rsrcmgr, converter)

        self.metadata["Pages"] = 0
        self.curpage = 0
        for page in PDFPage.get_pages(
                self.pdf_stream,
                pagenos=pagenos,
                maxpages=maxpages,
                password=password,
                caching=True,
                check_extractable=False,
        ):
            # Read page contents
            interpreter.process_page(page)
            self.metadata["Pages"] += 1
            self.curpage += 1

            # Collect URL annotations from the page, if present.
            if page.annots:
                refs = self.resolve_PDFObjRef(page.annots)
                if refs:
                    if isinstance(refs, list):
                        for ref in refs:
                            if ref:
                                self.references.add(ref)
                    elif isinstance(refs, Reference):
                        self.references.add(refs)

        # Remove empty metadata entries
        self.metadata_cleanup()

        # Get text from stream
        self.text = text_io.getvalue().decode("utf-8")
        text_io.close()
        converter.close()

        # Extract URL / arXiv / DOI references from the extracted text.
        for url in extractor.extract_urls(self.text):
            self.references.add(Reference(url, self.curpage))

        for ref in extractor.extract_arxiv(self.text):
            self.references.add(Reference(ref, self.curpage))

        for ref in extractor.extract_doi(self.text):
            self.references.add(Reference(ref, self.curpage))
示例#46
0
    def text_to_lda(self, fp=None):
        """Run LDA topic modeling over the PDF's AcroForm field values.

        Reads the form fields of the PDF at ``self.a``, fits a 5-topic
        Latent Dirichlet Allocation model over the "name: value" strings and
        prints the top sentences of each topic.

        :param fp: unused; kept for backward compatibility.
        """
        # Reading the PDF document (kept for its side effects).
        lone = self.convert_pdf_to_text()

        # Collect "name: value" strings for every AcroForm field.
        # BUG FIX: the file handle is now closed via a context manager (it
        # used to leak), and the second, unused pass over the fields
        # (building `inf`/`ini`) was removed.
        with open(self.a, 'rb') as pdf_fp:
            parser = PDFParser(pdf_fp)
            doc = PDFDocument(parser)
            fields = resolve1(doc.catalog['AcroForm'])['Fields']
            inps = []
            for field_ref in fields:
                field = resolve1(field_ref)
                inps.append('{0}: {1}'.format(field.get('T'), field.get('V')))

        # Topic Modeling: fit a count vectorizer with English stop words.
        vect = CountVectorizer(ngram_range=(1, 1), stop_words='english')
        dtm = vect.fit_transform(inps)

        # Document-term matrix as a DataFrame (kept from the original flow).
        dfm = pd.DataFrame(dtm.toarray(), columns=vect.get_feature_names())

        # Fit the Latent Dirichlet Allocation model on the DTM.
        lda = LatentDirichletAllocation(n_components=5)
        lda_dtf = lda.fit_transform(dtm)

        # Most common words per topic (kept for parity with the original).
        sorting = np.argsort(lda.components_)[:, ::-1]
        features = np.array(vect.get_feature_names())

        def _top_sentences(topic_idx):
            # First two sentences of the five inputs loading highest on the topic.
            order = np.argsort(lda_dtf[:, topic_idx])[::-1]
            return [".".join(inps[i].split(".")[:2]) + ".\n" for i in order[:5]]

        def _clean(sentences, strip_quote_b):
            # Strip bytes-repr artefacts from the printable string.
            out = str(sentences).replace("b'", " ").replace("\\n", " ") \
                .replace("\\", " ").replace("b\\", " ")
            if strip_quote_b:
                out = out.replace("'b", " ")
            return out

        for topic_idx in range(5):
            # The original did not strip "'b" for topic 0; preserved here.
            cleaned = _clean(_top_sentences(topic_idx), topic_idx != 0)
            print("Topic {}: \n".format(topic_idx) + cleaned + "\n")
示例#47
0
    def pdf2txt(self):
        '''
        Convert the PDF at self.input_path to a UTF-8 text file.

        =============================

        return : str, text File path
        '''

        # input
        password = ''
        pagenos = set()
        maxpages = 0

        # output
        imagewriter = None
        rotation = 0
        codec = 'UTF-8'
        caching = True
        laparams = LAParams()

        # BUG FIX: the original if/else had two identical branches; derive
        # the default output path only when none was given.
        if self.output_path is None:
            self.output_path = self.input_path[:-4] + '_trans.txt'

        # BUG FIX: both files are now closed even when conversion raises.
        with open(self.input_path, "rb") as infp, \
                open(self.output_path, "w", encoding='UTF8') as outfp:

            # Total page count, used to size the progress bar.
            parser = PDFParser(infp)
            document = PDFDocument(parser)
            page_total_num = resolve1(document.catalog['Pages'])['Count']

            rsrcmgr = PDFResourceManager(caching=caching)

            # pdf -> text converter
            device = TextConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   laparams=laparams,
                                   imagewriter=imagewriter)

            # pdf -> text interpreter
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # pdf -> text start
            with tqdm(total=page_total_num) as pbar:
                for page in PDFPage.get_pages(infp,
                                              pagenos,
                                              maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):

                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)

                    pbar.update(1)

        print('[INFO] pdf -> text')

        return self.output_path
    def parse(self, full_path: str):
        """Parse a PDF and collect up to ``self.content_length`` characters
        of its text (title first, then page text) into ``info["content"]``.

        :param full_path: path of the PDF file to parse.
        :return: the ``info`` dict from ``super().parse``, possibly
            augmented with a ``"content"`` key.
        """
        info = super().parse(full_path)

        if self.content_length > 0:
            with open(full_path, "rb") as f:

                try:
                    parser = PDFParser(f)
                    document = PDFDocument(parser)
                except PDFSyntaxError:
                    # Malformed PDF: report it and return the base info.
                    print("couldn't parse PDF " + full_path)
                    return info

                info["content"] = ""
                # Prepend the document title when present and non-empty.
                # The title is either raw bytes or a PDF object whose
                # resolve() yields bytes; both are decoded leniently.
                if len(document.info) > 0 and "Title" in document.info[
                        0] and document.info[0]["Title"] != b"":
                    if isinstance(document.info[0]["Title"], bytes):
                        info["content"] += document.info[0]["Title"].decode(
                            "utf-8", "replace") + "\n"
                    else:
                        info["content"] += document.info[0]["Title"].resolve(
                        ).decode("utf-8", "replace") + "\n"

                try:
                    if document.is_extractable:
                        resource_manager = PDFResourceManager()
                        la_params = LAParams()

                        device = PDFPageAggregator(resource_manager,
                                                   laparams=la_params)
                        interpreter = PDFPageInterpreter(
                            resource_manager, device)

                        for page in PDFPage.create_pages(document):

                            interpreter.process_page(page)
                            layout = device.get_result()

                            for lt_obj in layout:
                                if isinstance(lt_obj, LTTextBox) or isinstance(
                                        lt_obj, LTTextLine):

                                    text = lt_obj.get_text()

                                    if len(info["content"]) + len(
                                            text) <= self.content_length:
                                        info["content"] += text
                                    else:
                                        # Truncate to the character budget and
                                        # stop scanning this page's objects.
                                        info["content"] += text[
                                            0:self.content_length -
                                            len(info["content"])]
                                        break
                            else:
                                # Inner loop finished without hitting the
                                # budget: go on to the next page.
                                continue
                            # Inner loop broke (budget reached): stop paging.
                            break
                    else:
                        print("PDF is not extractable: " + full_path)
                except ValueError:
                    print("Couldn't parse page for " + full_path)

        return info
示例#49
0
文件: cms.py 项目: shiayx/endesive
    def makepdf(self, pdfdata1, udct, zeros):
        """Build the incremental-update bytes that append a visible
        signature annotation and signature field to an existing PDF.

        :param pdfdata1: the original PDF document as bytes.
        :param udct: signing options keyed by bytes: b'signature',
            b'signaturebox', b'sigflags', b'contact', b'location',
            b'signingdate', b'reason'.
        :param zeros: placeholder bytes for the /Contents value, to be
            overwritten once the actual signature is computed.
        :return: bytes to append after ``pdfdata1`` (new objects, an xref
            section, and a trailer referencing the previous xref).
        """
        parser = PDFParser(BytesIO(pdfdata1))
        document = PDFDocument(parser, fallback=False)

        # Offset of the existing xref section; emitted as /Prev in the new
        # trailer so readers can walk the xref chain.
        prev = document.find_xref(parser)
        info = document.xrefs[0].trailer['Info'].objid
        root = document.xrefs[0].trailer['Root'].objid
        size = 1
        # calculate last object id, size is only xref size but not count of object in xref
        for ref in document.xrefs:
            if isinstance(ref, PDFXRefStream):
                no = max(ref.ranges, key=operator.itemgetter(1))[1]
            else:
                if len(ref.offsets) == 0:
                    no = 0
                else:
                    no = max(ref.offsets.keys())
            size = max(size, no)
        # Object id of the first page (the annotation is placed on page 0).
        page = document.getobj(
            document.catalog['Pages'].objid)['Kids'][0].objid

        nsig, fields = self.getfields(root, document)
        annots = self.getannots(page, document)

        # Re-serialized bodies of the Info/Root/Page objects, with the
        # listed keys stripped so they can be re-emitted below.
        infodata = self.getdata(pdfdata1, info, prev, document)
        rootdata = self.getdata(pdfdata1, root, prev, document, ('AcroForm', ))
        pagedata = self.getdata(pdfdata1, page, prev, document, ('Annots', ))

        # Build the visible FreeText appearance for the signature box.
        annotation = udct.get(b'signature', b'').decode('utf8')
        x1, y1, x2, y2 = udct.get(b'signaturebox', (0, 0, 0, 0))
        annotation = FreeText(
            Location(x1=x1, y1=y1, x2=x2, y2=y2, page=0),
            Appearance(
                fill=[0, 0, 0],
                stroke_width=1,
                wrap_text=True,
                font_size=12,
                content=annotation,
            ),
        )
        pdfa = annotation.as_pdf_object(identity(), page=None)
        pdfar = b'[%d %d %d %d]' % tuple(pdfa.Rect)
        pdfas = pdfa.AP.N.stream.encode('latin1')

        # Six new objects are appended, numbered no .. no+5:
        #   page (rewritten), no+0 Info, no+1 Root, no+2 AcroForm,
        #   no+3 signature annotation/field, no+4 appearance XObject,
        #   no+5 signature value dictionary.
        no = size + 1
        objs = [
            self.makeobj(page,
                         (b'/Annots[%s%d 0 R]' % (annots, no + 3) + pagedata)),
            self.makeobj(no + 0, infodata),
            self.makeobj(no + 1, (b'/AcroForm %d 0 R' % (no + 2)) + rootdata),
            self.makeobj(
                no + 2, b'/Fields[%s%d 0 R]/SigFlags %d' %
                (fields, no + 3, udct[b'sigflags'])),
            # no+3: the FreeText widget doubling as the /Sig form field.
            self.makeobj(
                no + 3, b'''
/Type
/Annot
/Subtype
/FreeText
/AP <</N %d 0 R>>
/BS <</S /S /Type /Border /W 0>>
/C []
/Contents (%s)
/DA (0 0 0 rg /%s 12 Tf)
/Rect %s
/F 704
/P %d 0 R
/FT
/Sig
/T(Signature%d)
/V %d 0 R
''' % (no + 4, pdfa.Contents.encode('latin1'),
        pdfa.AP.N.Resources.Font.keys()[0].encode('latin1'), pdfar, page, nsig,
        no + 5)),
            # no+4: the appearance stream (form XObject) drawn in the box.
            self.makeobj(
                no + 4, b'''
/BBox %s
/FormType 1
/Length %d
/Matrix [1 0 0 1 0 0]
/Resources <</Font <<%s <</BaseFont /Helvetica /Encoding /WinAnsiEncoding /Subtype /Type1 /Type /Font>>>> /ProcSet /PDF>>
/Subtype
/Form
/Type
/XObject
''' % (
                    pdfar,
                    len(pdfas),
                    pdfa.AP.N.Resources.Font.keys()[0].encode('latin1'),
                ), b'stream\n' + pdfas + b'\nendstream\n'),
            # no+5: the signature dictionary; /ByteRange and /Contents hold
            # placeholders that are patched after signing.
            self.makeobj(no + 5, (
                b'/ByteRange [0000000000 0000000000 0000000000 0000000000]/ContactInfo(%s)\
/Filter/Adobe.PPKLite/Location(%s)/M(D:%s)/Prop_Build<</App<</Name/>>>>/Reason(%s)/SubFilter/adbe.pkcs7.detached/Type/Sig\
/Contents <' % (udct[b'contact'], udct[b'location'], udct[b'signingdate'],
                udct[b'reason'])) + zeros + b'>'),
        ]

        pdfdata2 = b''.join(objs)
        # Classic xref section: one entry for the rewritten page object,
        # then a run of six entries for objects no..no+5.
        xref = b'''\
xref\n\
%(page)d 1\n\
%(p0)010d 00000 n \n\
%(no)d 6\n\
%(n0)010d 00000 n \n\
%(n1)010d 00000 n \n\
%(n2)010d 00000 n \n\
%(n3)010d 00000 n \n\
%(n4)010d 00000 n \n\
%(n5)010d 00000 n \n\
'''
        startxref = len(pdfdata1)
        # Byte offsets of each appended object, relative to the whole file;
        # "+ 1" skips the leading '\n' written before each "N 0 obj".
        dct = {
            b'page': page,
            b'no': no,
            b'startxref': startxref + len(pdfdata2),
            b'prev': prev,
            b'info': no + 0,
            b'root': no + 1,
            b'size': 6,
            b'p0': startxref + pdfdata2.find(b'\n%d 0 obj\n' % page) + 1,
            b'n0': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 0)) + 1,
            b'n1': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 1)) + 1,
            b'n2': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 2)) + 1,
            b'n3': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 3)) + 1,
            b'n4': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 4)) + 1,
            b'n5': startxref + pdfdata2.find(b'\n%d 0 obj\n' % (no + 5)) + 1,
            # MD5 digests of the old and new data form the file /ID pair.
            b'h1': hashlib.md5(pdfdata1).hexdigest().upper().encode('ascii'),
            b'h2': hashlib.md5(pdfdata2).hexdigest().upper().encode('ascii'),
        }

        trailer = b'''\
trailer
<</ID [<%(h1)s><%(h2)s>]/Info %(info)d 0 R/Prev %(prev)d/Root %(root)d 0 R/Size %(size)d>>\n\
startxref\n\
%(startxref)d\n\
%%%%EOF\n\
'''

        xref = xref % dct
        trailer = trailer % dct

        pdfdata2 = pdfdata2 + xref + trailer

        return pdfdata2
示例#50
0
		headers = '\t'.join([x.text for x in headerline])
		tbrows = [x for x in tls if x.y1 < hd1tl.y0]
	else:
		tbrows = [x for x in tls]
	ry0s = sorted([x.y0 for x in tbrows], reverse=True)
	rows = clustrows(ry0s)
	tbrows2 = [sorted([x for x in tbrows if x.y0 in ri], key=lambda x: x.x0) for ri in rows]
	tbrows3 = ['\t'.join([x.text for x in r]) for r in tbrows2]
	if header1 != '':
		return([headers] + tbrows3)
	else:
		return(tbrows3)


infile = open(inpdf, 'rb')
document = PDFDocument(PDFParser(infile))
page_it = PageIterator(document, LAParams(char_margin=0.2))

tbs5=[]
tbs6=[]
tbs7=[]
tbs8=[]
pg = 1
while pg < 264:
	if pg == 153:
		tbs5 = tbs5 + parsetable(page_it, header1='SAMPLE')
	elif pg > 153 and pg < 224:
		tbs5 = tbs5 + parsetable(page_it)
	elif pg == 224:
		tbs6 = tbs6 + parsetable(page_it, header1='PDID')
	elif pg > 224 and pg < 248:
示例#51
0
    
    __author__: "Sushovan Mandal"
    __license__: "GPLv2"
    __email__: "*****@*****.**"
'''

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed

# Open a PDF file.
fp = open('extras/sample.pdf', 'rb')
# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)
# Create a PDF document object that stores the document structure.
# Supply the password for initialization.
document = PDFDocument(parser)
# Check if the document allows text extraction. If not, abort.
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed
#parser.set_document(doc)
#doc.set_parser(parser)
#document.initialize()

metadata = document.info  # The "Info" metadata
#print document.catalog
# Print every key/value of each Info dictionary. pdfminer returns values
# as bytes in Python 3, so decode leniently before printing (the original
# used Python 2 only iteritems()/unicode()).
for d in metadata:
    if isinstance(d, dict):
        for key, value in d.items():
            if isinstance(value, bytes):
                value = value.decode('utf-8', errors='ignore')
            print("%s, %s" % (key, value))
示例#52
0
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice

path = '/Users/mattstringer/Dropbox/ProyectoLaCumbre/DataClean/pdfs/example_finca.pdf'
# Open a PDF file.  (The Python 2 file() builtin does not exist in
# Python 3; open() works in both.)
fp = open(path, 'rb')

# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)
# Create a PDF document object that stores the document structure.
# Supply the password at construction time.
# (If no password is set, give an empty string; the original referenced
# an undefined `password` name.)
password = ''
document = PDFDocument(parser, password)
# Check if the document allows text extraction. If not, abort.
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed
# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()
# Create a PDF device object.
device = PDFDevice(rsrcmgr)
# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)
# Process each page contained in the document.
for page in PDFPage.create_pages(document):
    interpreter.process_page(page)
示例#53
0
def mine_area(filename):
    """
    use pdfminer to get the valid area of each page.
    all results are relative position!

    Note: only the first page is processed (the page loop breaks at the
    end), and item scanning stops after three text items have been
    printed.

    :param filename: path of the PDF file.
    :return: list with one entry per processed page, each the result of
        ``get_max_box`` over that page's item bounding boxes.
    """

    pageboxlist = []

    # Open the PDF file.
    with open(filename, 'rb') as fp:
        # Create a parser for the document.
        parser = PDFParser(fp)
        # Create the document object that stores the document structure;
        # pass a password if the PDF is encrypted, e.g.:
        #document = PDFDocument(parser, password)
        document = PDFDocument(parser)
        # Abort if the document forbids text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Resource manager for shared resources (caching disabled).
        rsrcmgr = PDFResourceManager(caching=False)
        # Layout-analysis parameters.
        laparams = LAParams()
        # Page aggregator that collects layout objects.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Interpreter that renders page content through the device.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process the pages of the document.

        # doc.get_pages() also yields the page list:
        # for i, page in enumerate(document.get_pages()):
        # PDFPage.create_pages(document) is another way to get the pages.
        # Each loop iteration handles one page.
        count = 0
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Receive the LTPage object for this page.
            layout = device.get_result()
            # layout is an LTPage holding the objects parsed from the page:
            # typically LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal, ...
            boxlist = []
            for item in layout:
                # Stop scanning once three text items have been printed;
                # boxes collected before that are still recorded.
                if count >= 3:
                    break
                box = item.bbox
                boxlist.append(box)

                if isinstance(item, LTTextBox) or isinstance(item, LTTextLine):
                    print('text:{}'.format(item))
                    print(item.height)
                    print(item.get_text())
                    count += 1
                elif isinstance(item, LTImage):
                    print('image:{}'.format(item))
                elif isinstance(item, LTFigure):
                    print('figure:{}'.format(item))
                elif isinstance(item, LTAnno):
                    print('anno:{}'.format(item))
                elif isinstance(item, LTChar):
                    print('char:{}'.format(item))
                elif isinstance(item, LTLine):
                    print('line:{}'.format(item))
                elif isinstance(item, LTRect):
                    print('rect:{}'.format(item))
                elif isinstance(item, LTCurve):
                    print('curve:{}'.format(item))

            pageboxlist.append(boxlist)
            # for x in layout:
            #     # if x is a horizontal text object
            #     if (isinstance(x, LTTextBoxHorizontal)):
            #         # text=re.sub(replace,'',x.get_text())
            #         text = x.get_text()
            #         if len(text) != 0:
            #             print text

            # Only the first page is analysed.
            break

    res = []
    for boxlist in pageboxlist:
        tmp = get_max_box(boxlist)
        res.append(tmp)
    return res
示例#54
0
def parse_pdf(path=None,
              data=None,
              savePath=None,
              y_tolerance=1.5,
              char_tolerance=0.5):
    '''Convert a tabular PDF into an .xlsx workbook.

    :param path: input PDF path (used when ``data`` is None, and to derive
        the default output path).
    :param data: optional already-open binary file object.
    :param savePath: output .xlsx path; defaults to ``path`` with the
        extension replaced.
    :param y_tolerance: maximum vertical gap within one text line.
    :param char_tolerance: maximum horizontal gap within one word.
    :return: None (the workbook is saved to disk).
    '''
    # Running row count so consecutive pages are appended contiguously.
    pdfRowNumber = 0

    # Widest cell text length seen per column, across all pages.
    theMaxColSize = []

    wb = Workbook()
    ws = wb.active

    if data is None:
        data = open(path, 'rb')

    parser = PDFParser(data)
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=None)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        height = page.mediabox[3] - page.mediabox[1]
        layout = device.get_result()
        pageContainer, theMaxColNum = get_line_word(
            layout,
            height,
            y_tolerance=y_tolerance,
            char_tolerance=char_tolerance)
        # Sort words left-to-right within each line, lines top-to-bottom.
        for line in pageContainer:
            line.sort(key=itemgetter('x0'))
        pageContainer.sort(key=lambda line: line[0]['top'])

        # Repair leading rows that have fewer cells than the widest row.
        if len(pageContainer[0]) < theMaxColNum:
            for i in range(len(pageContainer)):
                if len(pageContainer[i]) == theMaxColNum:
                    repairList = align_front_row(pageContainer[0:i],
                                                 theMaxColNum)
                    del pageContainer[0:i]
                    pageContainer.insert(0, repairList)
                    break
        # Repair the last row the same way.
        if len(pageContainer[-1]) < theMaxColNum:
            pageContainer[-1] = align_last_row(pageContainer[-2:],
                                               theMaxColNum)
        # Write the page into the worksheet.
        alignment = Alignment(horizontal='center', vertical='center')
        for idx, line in enumerate(pageContainer):
            for idy, item in enumerate(line):
                cellIndex = ws.cell(row=idx + 1 + pdfRowNumber, column=idy + 1)
                if item['text'] == '':
                    pass
                elif item['text'] is None:
                    # A None cell marks a full-width row: merge it across
                    # every column and stop processing this line.
                    ws.merge_cells(start_row=idx + 1 + pdfRowNumber,
                                   start_column=1,
                                   end_row=idx + 1 + pdfRowNumber,
                                   end_column=theMaxColNum)
                    ws.cell(idx + 1 + pdfRowNumber, 1).alignment = alignment
                    break
                else:
                    if idx == 0 and len(line) == 2:
                        pass
                    else:
                        cellIndex.alignment = alignment

                    # Store numbers as numbers so Excel can compute on them.
                    if item['text'].isdigit():
                        cellIndex.value = int(item['text'])
                        cellIndex.number_format = '0'
                    elif is_float(item['text']):
                        cellIndex.value = float(item['text'])
                    else:
                        cellIndex.value = item['text']

        # Track the widest text per column on this page.
        thePageMaxColSize = [0 for i in range(theMaxColNum)]
        for line in pageContainer:
            if len(line) == 2:
                continue
            for col, item in enumerate(line):
                if len(item['text']) > thePageMaxColSize[col]:
                    thePageMaxColSize[col] = len(item['text'])

        if theMaxColSize == []:
            theMaxColSize = thePageMaxColSize[:]
        else:
            for i in range(theMaxColNum):
                if theMaxColSize[i] < thePageMaxColSize[i]:
                    theMaxColSize[i] = thePageMaxColSize[i]
        # Accumulate this page's row count so the next page continues below.
        pdfRowNumber += len(pageContainer)

    # Size each column to roughly twice its widest cell, then save.
    letter = [
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
        'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
    ]
    for col, theSize in enumerate(theMaxColSize):
        # Bijective base-26 conversion of the 1-based column number
        # (1 -> 'A', 26 -> 'Z', 27 -> 'AA', ...).  The original modular
        # arithmetic produced wrong letters for columns past 26
        # (e.g. 'BA' instead of 'AA' for column 27).
        n = col + 1
        colLetter = ''
        while n > 0:
            n, rem = divmod(n - 1, 26)
            colLetter = letter[rem] + colLetter
        ws.column_dimensions[colLetter].width = theSize * 2

    if savePath is not None:
        wb.save(savePath)
    else:
        wb.save(path.replace('.pdf', '.xlsx'))
示例#55
0
    def get_input_fields(self,
                         source_path: str = None,
                         replace_none_value: bool = False) -> dict:
        """Get input fields in the PDF.

        Stores input fields internally so that they can be used without
        parsing the PDF again.

        Parameter ``replace_none_value`` is for convenience to visualize
        fields.

        If no source path given, assumes a PDF is already opened.

        **Examples**

        **Robot Framework**

        .. code-block:: robotframework

            ***Settings***
            Library    RPA.PDF

            ***Tasks***
            Example Keyword
                ${fields}=  Get Input Fields    /tmp/sample.pdf

        **Python**

        .. code-block:: python

            from RPA.PDF import PDF

            pdf = PDF()

            def example_keyword():
                fields = pdf.get_input_fields("/tmp/sample.pdf")

        :param source_path: source filepath, defaults to None.
        :param replace_none_value: if value is None replace it with key name,
            defaults to False.
        :return: dictionary of input key values or `None`.
        """
        record_fields = {}
        # Reuse the cached fields when no new source path is given.
        if not source_path and self.ctx.active_pdf_document.fields:
            return self.ctx.active_pdf_document.fields
        self.ctx.switch_to_pdf(source_path)
        source_parser = PDFParser(self.ctx.active_pdf_document.fileobject)
        source_document = PDFDocument(source_parser)

        try:
            # The AcroForm dictionary holds the interactive form; its
            # "Fields" array lists the input fields as indirect objects.
            fields = pdfminer.pdftypes.resolve1(
                source_document.catalog["AcroForm"])["Fields"]
        except KeyError as err:
            raise KeyError('PDF "%s" does not have any input fields.' %
                           self.ctx.active_pdf_document.path) from err

        for i in fields:
            field = pdfminer.pdftypes.resolve1(i)
            if field is None:
                continue
            # T = field name, V = current value, Rect = widget rectangle,
            # TU = user-facing label.
            name, value, rect, label = (
                field.get("T"),
                field.get("V"),
                field.get("Rect"),
                field.get("TU"),
            )
            if value is None and replace_none_value:
                # Visualization aid: echo the field name in place of a value.
                record_fields[name.decode("iso-8859-1")] = {
                    "value": name.decode("iso-8859-1"),
                    "rect": iterable_items_to_int(rect),
                    "label": label.decode("iso-8859-1") if label else None,
                }
            else:
                try:
                    record_fields[name.decode("iso-8859-1")] = {
                        "value": value.decode("iso-8859-1") if value else "",
                        "rect": iterable_items_to_int(rect),
                        "label": label.decode("iso-8859-1") if label else None,
                    }
                except AttributeError:
                    # value was truthy but not bytes (no .decode); store the
                    # raw object instead.
                    self.logger.debug("Attribute error")
                    record_fields[name.decode("iso-8859-1")] = {
                        "value": value,
                        "rect": iterable_items_to_int(rect),
                        "label": label.decode("iso-8859-1") if label else None,
                    }

        # Cache for later calls; None when no fields were collected.
        self.ctx.active_pdf_document.fields = record_fields or None
        return record_fields
示例#56
0
def get_pages_in_pdf(file):
    """Return the total number of pages in an open PDF file object."""
    parsed = PDFDocument(PDFParser(file))
    pages_node = resolve1(parsed.catalog['Pages'])
    return pages_node['Count']
示例#57
0
#coding=utf-8
'''
Created on 2017��1��12��

@author: feifei
'''
import os
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument

path_project = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
path_book = path_project + os.sep + "input" + os.sep + "McCrackens Removable Partial Prosthodontics_nodrm.pdf"
path_out = path_project + os.sep + "output" + os.sep + "Contemporary Fixed Prosthodontics, 5ed index.txt"

# Open a PDF document.
fp = open(path_book, 'rb')
parser = PDFParser(fp)
document = PDFDocument(parser)

# Get the outlines (bookmarks) of the document and write one title per
# line.  The output file is text-mode, so titles are written as str with
# an explicit UTF-8 encoding on the file (the original wrote
# bytes + str, a TypeError under Python 3, and used the Python 2
# print statement).
outlines = document.get_outlines()
with open(path_out, "w", encoding="utf-8") as f:
    for (level, title, dest, a, se) in outlines:
        f.write(title + "\n")
print("over")
示例#58
0
class PdfFileParser(object):
    """Parses a PDF with pdfminer, caching per-page coordinates in an
    ``<infile>.pickle`` side file so a rerun with identical arguments can
    skip layout analysis."""

    def __init__(self,
                 infile,
                 outfile=None,
                 password=None,
                 selectedpages=None,
                 maxSplit=3,
                 W=1440.0,
                 H=1080.0,
                 outputJson=False,
                 trimbox=None,
                 trimboxes=None,
                 exclude=False,
                 debug=0):
        # Snapshot of the constructor arguments (minus self/outputJson);
        # compared against the pickled snapshot to validate the cache.
        self.args = {
            a[0]: a[1]
            for a in locals().items() if a[0] not in ['self', 'outputJson']
        }
        self.outputJson = outputJson
        self.DEBUG = debug
        self.picklefile = infile + '.pickle'
        self.selectedpages = selectedpages
        self.pickleLoaded = False
        self.savedconfig = None
        self.coords = []
        self.pagesCoords = []
        self.trimbox = trimbox
        self.trimboxes = trimboxes
        self.exclude = exclude

        self.pageRanges = SelectedPages(selectedpages)

        if ENABLE_PICKLE and os.path.isfile(self.picklefile):
            try:
                # NOTE(review): pickle.load on a file the user controls is
                # unsafe for untrusted input; kept for cache compatibility.
                with open(self.picklefile, 'rb') as f:
                    self.savedconfig = pickle.load(f)
                    savedargs = self.savedconfig['args']
                    # The cache is valid only if every argument matches,
                    # except selectedpages, which may be a subset of the
                    # cached range.
                    equal = True
                    for k, v in self.args.items():
                        if k == 'selectedpages':
                            if v not in SelectedPages(savedargs[k]):
                                equal = False
                        elif k not in savedargs:
                            equal = False
                        elif v != savedargs[k]:
                            equal = False
                        if not equal:
                            break

                    if equal:
                        self.pickleLoaded = True
                        self.pagesCoords = self.savedconfig['pagesCoords']
            except Exception as e:
                # Python 3 syntax (was the Python 2 "except Exception, e").
                print(e)

        self.fname = infile
        self.W = float(W)
        self.H = float(H)
        self.maxSplit = maxSplit
        self.outfile = outfile
        # Derive "<infile>-out.<ext>" (plus the page selection, if any)
        # when no explicit output file is given.
        if self.outfile is None:
            outFilename, outExt = os.path.splitext(infile)
            self.outfile = outFilename + '-out' + outExt
            if not (self.selectedpages is None or self.selectedpages == ''):
                outFilename, outExt = os.path.splitext(self.outfile)
                self.outfile = '%s(%s)%s' % (outFilename, self.selectedpages,
                                             outExt)
        # Avoid clobbering an existing file by appending "(N)".  The name
        # now uses the same "(N)" pattern probed in the loop; the original
        # assigned "%s%d%s" and could still collide.
        if os.path.isfile(self.outfile):
            i = 1
            outfile, outExt = os.path.splitext(self.outfile)
            while os.path.isfile("%s(%d)%s" % (outfile, i, outExt)):
                i += 1
            self.outfile = "%s(%d)%s" % (outfile, i, outExt)

        self.password = password
        self.endPage = self.pageRanges.getEndPage(
            30000) - 1  # 1 base vs 2 base

        self.inFile = open(self.fname, 'rb')
        self.parser = PDFParser(self.inFile)
        self.document = PDFDocument(self.parser)
        self.rsrcmgr = PDFResourceManager()
        self.laparams = LAParams()
        if not self.pickleLoaded:
            self.device = PDFPageAggregator(self.rsrcmgr,
                                            laparams=self.laparams)
            self.interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
            self.pagesEnumerator = enumerate(
                PDFPage.create_pages(self.document))
示例#59
0
    def getText(self):
        """Extract the text of ``self.pdf_file_path`` using pdfminer.

        The PDF is parsed into a PDFDocument (with ``self.password``),
        each page is rendered through a PDFPageAggregator, and the text
        of every LTTextBox/LTTextLine is concatenated.  The accumulated
        text is printed (UTF-8 encoded) after each of the first four
        pages, and the full text is returned UTF-8 encoded.

        :raises PDFTextExtractionNotAllowed: if the document forbids
            text extraction.
        :return: bytes, the UTF-8 encoded extracted text.
        """
        # Open the PDF in binary mode for the whole extraction.
        with open(self.pdf_file_path, "rb") as stream:
            # Parser feeds the raw bytes; the document holds the structure.
            pdf_parser = PDFParser(stream)
            document = PDFDocument(pdf_parser, self.password)

            # Abort early when extraction is not permitted.
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed

            # Shared resources (fonts, images) + layout analysis settings.
            manager = PDFResourceManager()
            aggregator = PDFPageAggregator(manager, laparams=LAParams())
            engine = PDFPageInterpreter(manager, aggregator)

            collected = ""

            # Walk the document page by page, harvesting text objects.
            for page_index, page in enumerate(PDFPage.create_pages(document)):
                engine.process_page(page)
                for element in aggregator.get_result():
                    if isinstance(element, (LTTextBox, LTTextLine)):
                        collected += element.get_text()

                # Echo the (cumulative) text for the first four pages.
                if page_index + 1 <= 4:
                    print(collected.encode("utf-8"))
        return collected.encode("utf-8")
示例#60
0
def parse(DataIO, save_path, start=None, end=None):
    """Dump each PDF page's text and figures to './text/<page>.txt'.

    :param DataIO: binary file-like object holding the PDF data.
    :param save_path: currently unused (see the commented-out open below);
        output paths are hard-coded as './text/<page_num>.txt'.
    :param start: first 1-based page to process (inclusive), or None.
    :param end: last 1-based page to process (inclusive), or None.
    """
    # Create a PDF parser from the file object
    parser = PDFParser(DataIO)
    # Create the PDF document
    doc = PDFDocument(parser)
    # Connect the parser and the document
    parser.set_document(doc)
    #doc.set_parser(parser)
    # Supply the password for initialization (empty when there is none)
    #doc.initialize()
    # Skip documents that do not allow conversion to TXT
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        # Resource manager for shared resources
        rsrcmagr = PDFResourceManager()
        # Layout-analysis parameters
        laparams = LAParams()
        # Aggregate the resource manager and parameters into a device
        device = PDFPageAggregator(rsrcmagr, laparams=laparams)
        # Interpreter that processes each page
        interpreter = PDFPageInterpreter(rsrcmagr, device)
 
        # Iterate over the page list, one page per loop
        #pages = PDFPage.get_pages(doc)
        # doc.get_pages() also yields the page list
        #for page in pages:
        page_num = 0
        for page in PDFPage.create_pages(doc):
            page_num = page_num + 1
            # Honor the optional [start, end] page window
            if start is not None and end is not None:
                if page_num < start:
                    continue
                if page_num > end:
                    break
            interpreter.process_page(page)
            # Receive the LTPage object for this page
            layout = device.get_result()
            # NOTE(review): opened without a context manager; f leaks if an
            # exception occurs before f.close() below.
            f = open('./text/'+str(page_num)+'.txt', 'w')
            #with open('%s' % (save_path), 'a') as f:

            # layout is an LTPage containing the objects parsed from this
            # page: typically LTTextBox, LTFigure, LTImage,
            # LTTextBoxHorizontal and so on.
            # Use an object's text attribute to obtain its text.
            for x in layout:
                #try:
                if isinstance(x, LTTextBoxHorizontal):
                    # Get the text
                    result = x.get_text()
                    try:
                        print("***************** LTTextBoxHorizontal  ************")
                        print(result)
                        #if len(result) >= 15:
                        # Write it into the page's file
                        f.write(result + "\n")
                    except:
                        # NOTE(review): bare except silently swallows any
                        # write error; narrow it if possible.
                        print('写入文件错误', result)
                        pass
                if isinstance(x, LTTextBox):
                    print("***************** LTTextBox  ************")
                    print(x.get_text())
                if isinstance(x, LTFigure):
                    print("***************** LTFigure  ************")
                    parse_lt_figure(x, page_num, f)
                if isinstance(x, LTImage):
                    print("***************** LTImage  ************")
                    saved_file = save_image(x, page_num)
                    print('save image ' + x.name)
                if isinstance(x, LTChar):
                    print('ppppppppppppppp')
                    print(x.get_text())
                    f.write(x.get_text())
                if isinstance(x, LTCurve):
                    print("***************** LTCurve  ************")
            f.close()