Exemplo n.º 1
0
def load_field(field):
    """Load one form field as a ``(name, value)`` tuple.

    ``field`` is a resolved field dictionary. Only text (``Tx``) and button
    (``Btn``) fields are handled.

    Returns ``None`` when the field has no name (``T``) or its value (``V``)
    resolves to ``None``; otherwise ``(name, value)`` where the value has been
    decoded to unicode.

    Raises FormParseException for any other field type.
    """
    def uni8(stringish):
        # Prefer UTF-8 and fall back to ISO-8859-1 when the bytes are not
        # valid UTF-8.
        try:
            return unicode(stringish, encoding='utf8')
        except UnicodeDecodeError:
            return unicode(stringish, encoding='iso-8859-1')

    def uniflail(stringish):
        if stringish is None:
            return None
        if len(stringish) < 2:
            return uni8(stringish)
        b0 = ord(stringish[0])
        b1 = ord(stringish[1])
        # A leading FF FE / FE FF pair is treated as a UTF-16 byte order mark.
        if (b0 == 0xff and b1 == 0xfe) or (b0 == 0xfe and b1 == 0xff):
            return unicode(stringish, encoding='utf16')
        return uni8(stringish)

    typ = field.get('FT').name
    t = None
    if typ:
        t = field.get('T')
        if not t:
            return None
    if typ in ("Tx", "Btn"):
        val = resolve1(field.get('V'))
        if val is None:
            return None
        # Text fields carry the string itself; button values expose it as .name.
        return (t, uniflail(val if typ == "Tx" else val.name))
    raise FormParseException("unknown field type " + typ)
Exemplo n.º 2
0
def load_form(filename):
    """Load PDF form contents into a dictionary.

    Returns a dict mapping each field name to its value, built from the
    ``(name, value)`` pairs returned by ``load_field``; fields for which
    ``load_field`` returns ``None`` are left out.

    Returns ``None`` when the document has no ``AcroForm``, the AcroForm has
    no ``Fields`` entry, or any field reference resolves to ``None``.

    Raises FormParseException when a UnicodeDecodeError occurs while loading.
    """
    with open(filename, 'rb') as pdf_file:
        try:
            parser = PDFParser(pdf_file)
            doc = PDFDocument(parser)
            parser.set_document(doc)
            if 'AcroForm' not in doc.catalog:
                return None
            acroform = resolve1(doc.catalog['AcroForm'])
            if acroform is None or 'Fields' not in acroform:
                return None
            fieldset = {}
            for ref in acroform['Fields']:
                field = resolve1(ref)
                if field is None:
                    return None
                loaded = load_field(field)
                if loaded is None:
                    continue
                name, value = loaded
                fieldset[name] = value
            return fieldset
        except UnicodeDecodeError as e:
            raise FormParseException(filename + ": unicode error: " + str(e))
Exemplo n.º 3
0
def read_fields(pdffile):
    """Read the top-level AcroForm fields of a PDF file.

    Returns a list of ``(name, default, pageno, rect, field_type)`` tuples,
    one per entry of the AcroForm ``Fields`` array, or ``None`` when the
    document has no AcroForm.

    ``default`` is "Yes"/"No" for button fields, a signature placeholder for
    signature fields, and otherwise the field value (or a placeholder word
    when the field has no value). ``pageno`` is 1-based and falls back to 1
    when the field has no page reference.
    """
    outfields = []
    # NOTE(review): the handle is left open on the normal path, as before;
    # the returned tuples may still hold lazy pdfminer references - confirm
    # against callers before closing it here.
    fp = open(pdffile, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    id_to_page = {}
    for number, page in enumerate(PDFPage.create_pages(doc), 1):
        id_to_page[page.pageid] = number
    if 'AcroForm' not in doc.catalog:
        # Nothing is returned that could reference the file, so release it.
        fp.close()
        return None
    fields = resolve1(doc.catalog['AcroForm'])['Fields']
    for ref in fields:
        field = resolve1(ref)
        name, value, rect, page, field_type = field.get('T'), field.get('V'), field.get('Rect'), field.get('P'), field.get('FT')
        logmessage("name is " + str(name) + " and FT is |" + str(field_type) + "|")
        if page is not None:
            pageno = id_to_page[page.objid]
        else:
            pageno = 1
        type_text = str(field_type)
        if type_text == '/Btn':
            default = "Yes" if value == '/Yes' else "No"
        elif type_text == '/Sig':
            default = '${ user.signature }'
        elif value is not None:
            default = value
        else:
            default = word("something")
        outfields.append((name, default, pageno, rect, field_type))
    return outfields
Exemplo n.º 4
0
def load_form(filename):
    """Read the AcroForm of a PDF into a nested list of field tuples.

    Each entry of the AcroForm ``Fields`` array is resolved and handed to
    ``load_fields``; the results are returned in document order.
    """
    with open(filename, 'rb') as pdf:
        document = PDFDocument(PDFParser(pdf))
        acroform = resolve1(document.catalog['AcroForm'])
        loaded = []
        for ref in acroform['Fields']:
            loaded.append(load_fields(resolve1(ref)))
        return loaded
Exemplo n.º 5
0
def load_fields(field):
    """Walk a field dictionary and collect its terminal fields.

    A field with ``Kids`` yields a list with one entry per resolved kid
    (recursively). A terminal field yields the tuple
    ``(name, field, value)`` where the name is ``T`` decoded as UTF-16 and
    the value is the resolved ``V`` entry.
    """
    kids = field.get('Kids', None)
    if not kids:
        name = field.get('T').decode('utf-16')
        # Some field types, like signatures, need extra resolving
        value = resolve1(field.get('V'))
        return (name, field, value)
    return [load_fields(resolve1(kid)) for kid in kids]
Exemplo n.º 6
0
 def resolve_dest(dest):
     """Resolve a destination to its target.

     String and PSLiteral destinations are looked up through
     ``doc.get_dest``; when the result is a dictionary its ``D`` entry is
     returned, otherwise the (possibly looked-up) value itself.
     """
     if isinstance(dest, str):
         target = resolve1(doc.get_dest(dest))
     elif isinstance(dest, PSLiteral):
         target = resolve1(doc.get_dest(dest.name))
     else:
         target = dest
     if isinstance(target, dict):
         return target['D']
     return target
Exemplo n.º 7
0
def load_form(filename):
    """Load PDF form contents into a nested list of name/value tuples.

    Each entry of the AcroForm ``Fields`` array is resolved and passed to
    ``load_fields``.
    """
    with open(filename, 'rb') as pdf_file:
        parser = PDFParser(pdf_file)
        doc = PDFDocument(parser)
        return [load_fields(resolve1(f)) for f in
                resolve1(doc.catalog['AcroForm'])['Fields']]
Exemplo n.º 8
0
	def resolve_dest(dest):
		"""Resolve a destination, returning None when it cannot be found.

		String and PSLiteral destinations are looked up through
		``doc.get_dest``; a dictionary result is reduced to its ``D`` entry.
		"""
		try:
			if isinstance(dest, str):
				target = resolve1(doc.get_dest(dest))
			elif isinstance(dest, PSLiteral):
				target = resolve1(doc.get_dest(dest.name))
			else:
				target = dest
			if isinstance(target, dict):
				return target['D']
			return target
		except PDFDestinationNotFound:
			return None
Exemplo n.º 9
0
 def _get_xmp_metadata(self):
     """Extract a title and an author string from the XMP metadata.

     Returns a ``(title, author)`` tuple. The title is the ``x-default``
     entry of ``dc:title``. The author is built from ``dc:creator``: with
     several creators it is the last names of the first and the last one
     joined by a space, with one creator it is that creator's last name.
     Either element is ``None`` when the information is missing or the XMP
     packet cannot be parsed.
     """
     t = a = None
     metadata = resolve1(self.doc.catalog['Metadata']).get_data()
     try:
         md = xmp_to_dict(metadata)
     except Exception:
         # Unparseable XMP: report "nothing found" instead of failing.
         return t, a
     try:
         t = md['dc']['title']['x-default']
     except KeyError:
         pass
     try:
         a = md['dc']['creator']
     except KeyError:
         return t, a
     if isinstance(a, str):
         a = [a]
     # Remove None, empty strings, ... A list (not a lazy filter object) is
     # needed because the result is measured and indexed below.
     a = [creator for creator in a if creator]
     if len(a) > 1:
         a = '%s %s' % (self._au_last_name(a[0]),
                        self._au_last_name(a[-1]))
     elif len(a) == 1:
         a = self._au_last_name(a[0])
     else:
         a = None
     return t, a
Exemplo n.º 10
0
    def get_metadata(self):
        """Return metadata from both the info field and XMP.

        The info field is used by older PDFs and XMP by newer ones. From the
        XMP packet only the ``xap``, ``pdf`` and ``dc`` namespaces are added.

        The result is a ``Metadata`` object; it is also stored on
        ``self.metadata``.
        """
        metadata = Metadata()
        # The context manager closes the file even when parsing raises.
        with open(self.path, 'rb') as file_pointer:
            parser = PDFParser(file_pointer)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()
            for i in doc.info:
                metadata.add(i)
            if 'Metadata' in doc.catalog:
                xmp_metadata = resolve1(doc.catalog['Metadata']).get_data()
                xmp_dict = xmp_to_dict(xmp_metadata)
                # Let's add only the most useful ones
                if "xap" in xmp_dict:
                    metadata.add(xmp_dict["xap"])
                if "pdf" in xmp_dict:
                    metadata.add(xmp_dict["pdf"])
                if "dc" in xmp_dict:
                    metadata.add(xmp_dict["dc"], metadataType="dc")

        self.metadata = metadata
        return metadata
Exemplo n.º 11
0
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    """Write the outline (bookmarks) of a PDF to ``outfp``.

    One line is written per outline entry, containing
    ``repr((level, title, dest, pageno))``. ``pageno`` is the zero-based
    index of the target page, or ``None`` when the entry has neither a
    destination nor a usable ``GoTo`` action.

    ``objids``, ``pagenos``, ``dumpall`` and ``codec`` are accepted but not
    used by this function.
    """
    doc = PDFDocument()
    fp = open(fname, 'rb')
    try:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)
            pages = dict((page.pageid, pageno)
                         for (pageno, page) in enumerate(doc.get_pages()))
            for (level, title, dest, a, se) in doc.get_outlines():
                pageno = None
                if dest:
                    dest = resolve1(doc.lookup_name('Dests', dest))
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                elif a:
                    action = a.resolve()
                    if isinstance(action, dict):
                        subtype = action.get('S')
                        if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                            dest = action['D']
                            pageno = pages[dest[0].objid]
                outfp.write(repr((level, title, dest, pageno)) + '\n')
        finally:
            parser.close()
    finally:
        # Release the file even when parsing or writing fails.
        fp.close()
    return
def getData(fileName):
    """Print selected entries of a PDF's Info dictionary.

    For each cross-reference section the trailer's ``Info`` entry is
    resolved and its Author, Company, Producer and Creator values are
    printed when present.

    Returns "error" when the parser cannot be attached to the document,
    "Empty metadata" as soon as no Info dictionary is available, the caught
    exception when reading the metadata fails, and ``None`` otherwise.
    """
    doc = PDFDocument()
    fp = open(fileName, 'rb')
    parser = PDFParser(fp)
    try:
        parser.set_document(doc)
        doc.set_parser(parser)
    except Exception:
        return "error"
    finally:
        # Runs on both the success and the error path, so nothing leaks.
        parser.close()
        fp.close()

    # Start unset so a first trailer without Info is reported as
    # "Empty metadata" instead of failing on an unbound name.
    info = None
    try:
        for xref in doc.xrefs:
            info_ref = xref.trailer.get('Info')
            if info_ref:
                info = resolve1(info_ref)
            if info is None:
                return "Empty metadata"
            for key in ('Author', 'Company', 'Producer', 'Creator'):
                if key in info:
                    print(key + " " + info[key])
    except Exception as e:
        print("\t [x] Error in PDF extractor")
        return e
Exemplo n.º 13
0
    def get_metadata(self):
        """Collect metadata from the info field and from the XMP packet.

        The info field is used by older PDFs, XMP by newer ones. Of the XMP
        namespaces only ``xap``, ``pdf`` and ``dc`` are added. The resulting
        ``Metadata`` object is stored on ``self.metadata`` and returned.
        """
        with PdfMinerWrapper(self.path) as pdf_miner:
            document = pdf_miner.document
            collected = Metadata()

            for entry in document.info:
                collected.add(entry)

            if 'Metadata' in document.catalog:
                stream = resolve1(document.catalog['Metadata'])
                xmp_dict = xmp_to_dict(stream.get_data())
                # Only the most useful namespaces are kept.
                for namespace in ("xap", "pdf"):
                    if namespace in xmp_dict:
                        collected.add(xmp_dict[namespace])
                if "dc" in xmp_dict:
                    collected.add(xmp_dict["dc"], metadataType="dc")

            self.metadata = collected
            return collected
Exemplo n.º 14
0
	def getData(self):
		"""Load the Info dictionary of ``self.fname``.

		The trailer of every cross-reference section is inspected; the last
		``Info`` entry found is resolved and stored on both ``self.metadata``
		and ``self.raw``.

		Returns "error" when the document cannot be initialised, "Empty
		metadata" when no Info dictionary was found, "ok" on success, and the
		caught exception when reading the trailers fails.
		"""
		doc = PDFDocument()
		fp = open(self.fname, 'rb')
		parser = PDFParser(fp)
		try:
			parser.set_document(doc)
			doc.set_parser(parser)
			doc.initialize(self.password)
		except Exception:
			return "error"
		finally:
			# Close on every path; the early "error" return used to leak both.
			parser.close()
			fp.close()

		# Start unset so documents without any Info entry are reported as
		# "Empty metadata" instead of failing on an unbound name.
		info = None
		try:
			for xref in doc.xrefs:
				info_ref = xref.trailer.get('Info')
				if info_ref:
					info = resolve1(info_ref)
			self.metadata = info
			self.raw = info
		except Exception as e:
			return e
		if info is None:
			return "Empty metadata"
		return "ok"
Exemplo n.º 15
0
def getPDFMetadata(path):
    """Return the Info dictionary of a PDF merged with its parsed XMP data.

    The result is a new dict seeded from the first Info dictionary of the
    document (empty when the document has none). When the catalog has a
    ``Metadata`` stream, the dictionary produced by ``xmp_to_dict`` is
    merged on top; a stream that cannot be parsed is ignored.
    """
    # NOTE(review): the handle is left open, as before; values of the Info
    # dictionary may be lazy pdfminer references - confirm against callers
    # before closing it here.
    fp = open(path, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize()

    # doc.info is a list of dictionaries; merging has to happen on a dict.
    result = dict(doc.info[0]) if doc.info else {}

    if 'Metadata' in doc.catalog:
        metadata = resolve1(doc.catalog['Metadata']).get_data()
        try:
            result.update(xmp_to_dict(metadata))
        except Exception:
            # Best effort: a malformed XMP packet must not hide the Info data.
            pass

    return result
	def getAllAnnots(self):
		"""Gather the annotations of every page and decode them in place.

		The resolved ``annots`` of each page that has any are concatenated,
		passed through ``__decodeAnnots`` and returned.
		"""
		collected = []
		for page in self.__getPages():
			if page.annots:
				collected += resolve1(page.annots)

		self.__decodeAnnots(collected)
		return collected
Exemplo n.º 17
0
def recursively_add_fields(fields, id_to_page, outfields, prefix=''):
    """Append one tuple per terminal form field to ``outfields``.

    Each tuple is ``(name, default, pageno, rect, field_type)``. Fields with
    ``Kids`` are descended into; the names along the way are joined with
    ``.`` and passed down as ``prefix``. ``id_to_page`` maps a page object id
    to its page number; fields without a page reference get page 1.
    """
    def _scrub(raw):
        # Same clean-up for names and values: text on Python 2, bytes
        # otherwise.
        if PY2:
            return remove_nonprintable_limited(str(raw))
        if not isinstance(raw, bytes):
            raw = bytes(str(raw), encoding='utf-8')
        return remove_nonprintable_bytes_limited(raw)

    button_types = ('/Btn', "/u'Btn'", "/'Btn'")
    signature_types = ('/Sig', "/u'Sig'", "/'Sig'")

    for ref in fields:
        field = resolve1(ref)
        name = field.get('T')
        value = field.get('V')
        rect = field.get('Rect')
        page = field.get('P')
        field_type = field.get('FT')
        if name is not None:
            name = _scrub(name)
        if value is not None:
            value = _scrub(value)

        pageno = 1 if page is None else id_to_page[page.objid]

        type_text = str(field_type)
        if type_text in button_types:
            default = "Yes" if value == '/Yes' else "No"
        elif type_text in signature_types:
            default = '${ user.signature }'
        else:
            # Missing and empty values both fall back to the placeholder.
            default = value if value else word("something")

        kids = field.get('Kids')
        # Dotted path of this field: unnamed fields inherit the prefix.
        if name is None:
            qualified = prefix
        elif prefix == '':
            qualified = name
        else:
            qualified = prefix + '.' + name

        if kids:
            recursively_add_fields(kids, id_to_page, outfields, prefix=qualified)
        else:
            # Without a prefix the bare name is recorded, even when it is None.
            leaf_name = name if prefix == '' else qualified
            outfields.append((leaf_name, default, pageno, rect, field_type))
Exemplo n.º 18
0
def pdf2metadata(fp):
    """Return the "Info" metadata of the PDF read from ``fp``.

    When the catalog has a ``Metadata`` entry its raw XMP data is read as
    well, but it is not part of the return value.
    """
    pdf_parser = PDFParser(fp)
    document = PDFDocument(pdf_parser)
    pdf_parser.set_document(document)
    document.initialize()

    if 'Metadata' in document.catalog:
        xmp_stream = resolve1(document.catalog['Metadata'])
        # The raw XMP metadata; read but not returned.
        metadata = xmp_stream.get_data()
    return document.info
Exemplo n.º 19
0
 def _add_annots(self, layout, annots):
     """Add an ``Annot`` element to ``layout`` for every annotation.

     ``Rect`` is renamed to ``bbox``, the link target of the ``A`` entry (if
     any) is copied to ``URI``, and all non-string values are converted with
     ``obj_to_string`` before the element is created. Returns ``layout``.
     """
     if not annots:
         return layout
     for ref in resolve1(annots):
         annot = resolve1(ref)
         if annot.get('Rect') is not None:
             # Rename key
             annot['bbox'] = annot.pop('Rect')
             annot = self._set_hwxy_attrs(annot)
         try:
             annot['URI'] = resolve1(annot['A'])['URI']
         except KeyError:
             pass
         for key, val in six.iteritems(annot):
             if not isinstance(val, six.string_types):
                 annot[key] = obj_to_string(val)
         layout.add(parser.makeelement('Annot', annot))
     return layout
Exemplo n.º 20
0
	def getData(self):
		"""Load the Info dictionary of ``self.fname``.

		Returns "ok" after storing the last resolved trailer ``Info`` entry
		on ``self.metadata`` and ``self.raw``, or "error" when anything at
		all goes wrong along the way.
		"""
		try:
			document = PDFDocument()
			handle = file(self.fname, 'rb')
			pdf_parser = PDFParser(handle)
			pdf_parser.set_document(document)
			document.set_parser(pdf_parser)
			document.initialize(self.password)
			# The result is unused; a missing catalog entry raises here and
			# therefore ends in "error".
			metadata = resolve1(document.catalog['Metadata'])
			pdf_parser.close()
			handle.close()
			for xref in document.xrefs:
				info_ref = xref.trailer.get('Info')
				if info_ref:
					info = resolve1(info_ref)
				# NOTE(review): info is only bound once a trailer has an Info
				# entry; before that this line raises and ends in "error".
				self.metadata = self.raw = info
			return "ok"
		except:
			return "error"
Exemplo n.º 21
0
def load_fields(field):
    """Recursively yield the names of all terminal form fields.

    Fields with ``Kids`` are descended into. For a terminal field the ``T``
    entry is yielded, decoded as UTF-16 when possible and unchanged
    otherwise (including ``None`` when the field has no name).
    """
    kids = field.get('Kids', None)
    if kids:
        for kid in kids:
            for field_name in load_fields(resolve1(kid)):
                yield field_name
        return
    raw_name = field.get('T')
    # Decode outside of the yield: a yield inside a catch-all handler would
    # swallow GeneratorExit and make closing the generator fail.
    try:
        name = raw_name.decode('utf-16')
    except (AttributeError, UnicodeError):
        name = raw_name
    yield name
Exemplo n.º 22
0
 def get_page_number(self, index):
     """
     Given an index, return page label as specified by catalog['PageLabels']['Nums']
     Nums == [   0 << /S /r >>
                 4 << /S /D >>
                 7 << /S /D /P (A−) /St 8>>
             ]
                 /S = [
                         D Decimal arabic numerals
                         R Uppercase roman numerals
                         r Lowercase roman numerals
                         A Uppercase letters (A to Z for the first 26 pages, AA to ZZ for the next 26, and so on)
                         a Lowercase letters (a to z for the first 26 pages, aa to zz for the next 26, and so on)
                     ] (if no /S, just use prefix ...)
                 /P = text string label
                 /St = integer start value

     Returns an empty string when the document has no usable page labels.
     """
     try:
         # e.g. [ 0 {settings} 2 {settings} 20 {settings} ...]
         nums = resolve1(self.catalog['PageLabels'])['Nums']
         if len(nums) < 2 or len(nums) % 2 != 0:
             return ""
     except Exception:
         return ""
     # Find the highest starting index that is not above the requested page.
     for i in range(len(nums) - 2, -1, -2):
         if nums[i] <= index:
             break
     # resolve1 accepts both indirect references and inline dictionaries.
     settings = resolve1(nums[i + 1])
     page_num = ""
     if 'S' in settings:  # show a digit
         page_num = index - nums[i]
         if 'St' in settings:  # alternate start value
             page_num += settings['St']
         else:
             page_num += 1
         num_type = settings['S'].name
         if num_type.lower() == 'r':  # roman (upper or lower)
             import roman
             page_num = roman.toRoman(page_num)
             if num_type == 'r':
                 page_num = page_num.lower()
         elif num_type.lower() == 'a':  # letters
             # 1 -> A ... 26 -> Z, 27 -> AA ... 52 -> ZZ, and so on
             repeat, offset = divmod(page_num - 1, 26)
             letter = chr(offset + 65) * (repeat + 1)
             if num_type == 'a':
                 letter = letter.lower()
             page_num = letter
         else:  # if num_type == 'D': # decimal arabic
             page_num = u'%d' % page_num
     if 'P' in settings:  # page prefix
         page_num = settings['P'] + page_num
     return page_num
Exemplo n.º 23
0
 def __init__(self, doc, pageid, attrs):
   """Populate the page from its attribute dictionary.

   ``Resources`` and ``MediaBox`` are required; ``CropBox`` falls back to
   the media box, ``Rotate`` to 0, and ``Contents`` is always stored as a
   list (empty when the page has none).
   """
   self.doc = doc
   self.pageid = pageid
   self.attrs = dict_value(attrs)
   page_attrs = self.attrs
   self.lastmod = resolve1(page_attrs.get('LastModified'))
   self.resources = resolve1(page_attrs['Resources'])
   self.mediabox = resolve1(page_attrs['MediaBox'])
   self.cropbox = (resolve1(page_attrs['CropBox'])
                   if 'CropBox' in page_attrs else self.mediabox)
   self.rotate = page_attrs.get('Rotate', 0)
   self.annots = page_attrs.get('Annots')
   self.beads = page_attrs.get('B')
   streams = resolve1(page_attrs['Contents']) if 'Contents' in page_attrs else []
   # A single content stream is wrapped so callers always see a list.
   self.contents = streams if isinstance(streams, list) else [streams]
   return
Exemplo n.º 24
0
    def proc(self, pdfFp):
        """Collect the metadata available from a PDF document.

        Stores the Info entries on ``self.info``, the parsed XMP packet on
        ``self.metadata`` (only when the catalog has a ``Metadata`` entry)
        and the complete buffer content on ``self.raw_doc``.
        """
        pdf_parser = PDFParser(pdfFp)
        document = PDFDocument(pdf_parser)
        pdf_parser.set_document(document)
        document.initialize()
        self.info = document.info
        if 'Metadata' in document.catalog:
            xmp_stream = resolve1(document.catalog['Metadata'])
            self.metadata = xmp_to_dict(xmp_stream.get_data())
        self.raw_doc = pdfFp.getvalue()
Exemplo n.º 25
0
def load_fields(field):
    """Recursively walk form fields, printing an HTML box per terminal field.

    A field with ``Kids`` is descended into unless its text contains
    'Page2', in which case nothing is returned. A terminal field prints an
    absolutely positioned ``<div>`` derived from its ``Rect`` and returns
    ``None``.
    """
    form = field.get('Kids', None)
    if not form:
        name, value = field.get('T'), field.get('V')
        arect = field.get('Rect')
        # Coordinates are doubled and the vertical axis is flipped
        # against 1200.
        left = (arect[0]) * 2
        top = (1200 - arect[1]) * 2
        width = (arect[2] - arect[0]) * 2 - 3
        height = (arect[3] - arect[1]) * 2 - 3
        print("<div style='background-color:green;position:absolute;left:%spx;top:%spx;width:%spx;height:%spx;'>%s</div>" % (left, top, width, height, gettext(field)))
        return
    label = gettext(field)
    if 'Page2' in label:
        return
    return [load_fields(resolve1(kid)) for kid in form]
Exemplo n.º 26
0
def do(filename=''):
	"""Print the metadata of a PDF and return ``(doc, first info dict)``.

	The "Info" metadata is always printed; when the catalog has a
	``Metadata`` entry the raw XMP data and its parsed form are printed too.
	"""
	handle = open(filename, 'rb')
	pdf_parser = PDFParser(handle)
	doc = PDFDocument(pdf_parser)
	pdf_parser.set_document(doc)

	# The "Info" metadata
	print(doc.info)

	if 'Metadata' in doc.catalog:
		xmp_stream = resolve1(doc.catalog['Metadata'])
		metadata = xmp_stream.get_data()
		# The raw XMP metadata
		print(metadata)
		print(xmp_to_dict(metadata))
	return doc, doc.info[0]
Exemplo n.º 27
0
    def get_tree(self, *page_numbers):
        """
            Return lxml.etree.ElementTree for entire document, or page numbers
            given if any.

            Trees are cached per page selection. Passing a single explicit
            ``None`` builds only the root element without any pages.
        """
        cache_key = "_".join(map(str, _flatten(page_numbers)))
        tree = self._parse_tree_cacher.get(cache_key)
        if tree is not None:
            return tree

        # set up root
        root = parser.makeelement("pdfxml")
        if self.doc.info:
            for k, v in list(self.doc.info[0].items()):
                k = unicode_decode_object(k)
                v = unicode_decode_object(resolve1(v))
                try:
                    root.set(k, v)
                except ValueError as e:
                    # Sometimes keys have a character in them, like ':',
                    # that isn't allowed in XML attribute names.
                    # If that happens we just replace non-word characters
                    # with '_'. str(e) is used because exceptions have no
                    # ``message`` attribute on Python 3.
                    if "Invalid attribute name" in str(e):
                        k = re.sub(r'\W', '_', k)
                        root.set(k, v)

        # Parse pages and append to root.
        # If nothing was passed in for page_numbers, we do this for all
        # pages, but if None was explicitly passed in, we skip it.
        if not (len(page_numbers) == 1 and page_numbers[0] is None):
            if page_numbers:
                pages = [[n, self.get_layout(self.get_page(n))] for n in
                         _flatten(page_numbers)]
            else:
                pages = enumerate(self.get_layouts())
            for n, page in pages:
                page = self._xmlize(page)
                page.set('page_index', unicode_decode_object(n))
                page.set('page_label', self.doc.get_page_number(n))
                root.append(page)
            self._clean_text(root)

        # wrap root in ElementTree
        tree = etree.ElementTree(root)
        self._parse_tree_cacher.set(cache_key, tree)
        return tree
Exemplo n.º 28
0
def read_fields(pdffile):
    """Read all AcroForm fields of a PDF file, including nested ones.

    Returns the tuples collected by ``recursively_add_fields`` sorted with
    ``fieldsorter``, or ``None`` when the document has no AcroForm.
    """
    outfields = []
    # NOTE(review): the handle is left open on the normal path, as before;
    # the returned tuples may still hold lazy pdfminer references - confirm
    # against callers before closing it here.
    fp = open(pdffile, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    # Page numbers are 1-based, in document order.
    id_to_page = {}
    for pageno, page in enumerate(PDFPage.create_pages(doc), 1):
        id_to_page[page.pageid] = pageno
    if 'AcroForm' not in doc.catalog:
        # Nothing is returned that could reference the file, so release it.
        fp.close()
        return None
    fields = resolve1(doc.catalog['AcroForm'])['Fields']
    recursively_add_fields(fields, id_to_page, outfields)
    return sorted(outfields, key=fieldsorter)
Exemplo n.º 29
0
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
  """Write the outline (bookmarks) of a PDF to ``outfp``.

  One line is written per outline entry, containing
  ``repr((level, title, dest, pageno))``. ``pageno`` is the zero-based index
  of the destination page, or ``None`` when the entry has no destination.

  ``objids``, ``pagenos``, ``dumpall`` and ``codec`` are accepted but not
  used by this function.
  """
  doc = PDFDocument()
  fp = open(fname, 'rb')
  try:
    parser = PDFParser(doc, fp)
    try:
      doc.initialize(password)
      pages = dict((page.pageid, pageno)
                   for (pageno, page) in enumerate(doc.get_pages()))
      for (level, title, dest, a, se) in doc.get_outlines():
        pageno = None
        if dest:
          dest = resolve1(doc.lookup_name('Dests', dest))
          if isinstance(dest, dict):
            dest = dest['D']
          pageno = pages[dest[0].objid]
        outfp.write(repr((level, title, dest, pageno)) + '\n')
    finally:
      parser.close()
  finally:
    # Release the file even when parsing or writing fails.
    fp.close()
  return
	def __decodeAnnots(self, annots):
		"""Replace the annotation references in ``annots`` by analysed data.

		The list is modified in place. Entries that are not ``PDFObjRef``
		instances are kept as they are. Every reference is resolved and passed
		to ``__analyseAnnot``; a result is stored together with an ``id``
		equal to its 1-based position in the final list, and references for
		which the analysis returns ``None`` are dropped.
		"""
		self._unknownTitle = 0
		kept = []
		# Build the result separately: deleting from the list while
		# enumerating it skipped the entry after every removed one.
		for a in list(annots):
			if type(a).__name__ != 'PDFObjRef':
				kept.append(a)
				continue
			a = resolve1(a)

			if 'Compression' in a:
				# NOTE(review): these codec names are passed to str.encode
				# as-is; confirm that such codecs are registered.
				if a['Compression'] == 12:
					a = str(a).encode("x/1244")
				elif a['Compression'] == 17:
					a = str(a).encode("x/1211")
				elif a['Compression'] == 10:
					a = str(a).encode("x/101")

			transformedAnnot = self.__analyseAnnot(a)
			if transformedAnnot is not None:
				transformedAnnot['id'] = len(kept) + 1
				kept.append(transformedAnnot)
		annots[:] = kept
Exemplo n.º 31
0
    if 'Keywords' in pdfdoc:
        return True
    return False


# loop through directories
for subdir, dirs, files in os.walk(arg_path):
    for file in files:
        file_count += 1
        filepath = subdir + os.sep + file
        if filepath.endswith(".pdf"):
            pdffilecount += 1
        try:
            pdfdoc = parsePDFfile(filepath)
            if checkMetadata(pdfdoc):
                metadata = resolve1(pdfdoc.catalog['Metadata']).get_data()
                dirname = subdir.split(os.path.sep)[-1]
                pdfdict = xmp_to_dict(metadata)
                dict1 = pdfdoc.info[0]
                xkeywords = None
                xdesc = None
                xcreator = None
                xtitle = None
                xfolder = None
                try:
                    xkeywords = str(pdfdict['pdf']['Keywords']).replace(
                        '\r\n', ', ')
                except:
                    xkeywords = ''
                    pass
                try:
Exemplo n.º 32
0
    def get_input_fields(self,
                         source_pdf: str = None,
                         replace_none_value: bool = False) -> dict:
        """Get input fields in the PDF.

        Stores input fields internally so that they can be used without
        parsing PDF again.

        Parameter `replace_none_value` is for convenience to visualize fields.

        :param source_pdf: source filepath, defaults to None
        :param replace_none_value: if value is None replace it with key name,
            defaults to False
        :return: dictionary of input key values, or `None` when the PDF
            has no input fields
        """
        if source_pdf is None and self.active_fields:
            return self.active_fields
        self.switch_to_pdf_document(source_pdf)
        source_parser = PDFParser(self.active_fileobject)
        source_document = PDFDocument(source_parser)
        try:
            fields = resolve1(source_document.catalog["AcroForm"])["Fields"]
        except KeyError:
            self.logger.info('PDF "%s" does not have any input fields.',
                             self.active_pdf)
            return None

        encoding = "iso-8859-1"
        record_fields = {}
        for ref in fields:
            field = resolve1(ref)
            if field is None:
                continue
            name, value, rect, label = (
                field.get("T"),
                field.get("V"),
                field.get("Rect"),
                field.get("TU"),
            )
            key = name.decode(encoding)
            if value is None and replace_none_value:
                shown = key
            elif not value:
                shown = ""
            else:
                # Only the value may legitimately lack ``decode``; in that
                # case it is stored as it is.
                try:
                    shown = value.decode(encoding)
                except AttributeError:
                    self.logger.debug("Attribute error")
                    shown = value
            record_fields[key] = {
                "value": shown,
                "rect": iterable_items_to_int(rect),
                "label": label.decode(encoding) if label else None,
            }

        self.active_fields = record_fields if record_fields else None
        return record_fields
Exemplo n.º 33
0
# Open the PDF named on the command line and parse its document structure.
fp = open(args.file, 'rb')

parser = PDFParser(fp)
doc = PDFDocument(parser)
# Captures the first run of digits after a leading "/" and non-digit
# characters. Raw string: the backslashes belong to the regex, not to
# Python string escapes.
font_regex = re.compile(r'\/\D+(\d+)')


def convert(name):
    """Return ``name`` as a lower-case, underscore-separated identifier.

    Underscores are inserted in front of capitalised words and between a
    lower-case letter or digit and a following capital; spaces become
    underscores and a doubled underscore is collapsed once.
    """
    with_word_breaks = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    normalized = with_word_breaks.replace(' ', '_').replace('__', '_')
    snake = re.sub('([a-z0-9])([A-Z])', r'\1_\2', normalized)
    return snake.lower()


fields = resolve1(resolve1(doc.catalog['AcroForm'])['Fields'])
for i in fields:
    field = resolve1(i)
    if field.get('Rect') == None:
        continue
    name, position = field.get('T'), field.get('Rect')
    name = convert(name.decode('UTF-8'))
    width = int(round(position[2] - position[0]))
    height = int(round(position[3] - position[1]))
    x = int(round(position[0]))
    font_size = None
    font_size = int(font_regex.match(field.get('DA').decode('UTF-8')).group(1))
    if height < 1:
        height = height * -1
    y = int(round(position[1]) + height)
    if font_size > 0 and font_size != 10:
Exemplo n.º 34
0
def basic_usage():
    """Walk through a PDF: print its metadata and page count, process pages.

    Every problem is reported on stdout rather than raised: a missing file,
    a document that cannot be parsed (encryption, syntax or other PDF
    errors), a missing page count, and any other exception. Returns
    ``None``.
    """
    pdf_filepath = '/path/to/sample.pdf'

    try:
        # Open a PDF file; the context manager closes it on every path.
        with open(pdf_filepath, 'rb') as fp:
            # Create a PDF resource manager object that stores shared resources.
            rsrcmgr = PDFResourceManager()
            # Create a PDF device object.
            device = PDFDevice(rsrcmgr)
            # Create a PDF interpreter object.
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # Create a PDF parser object associated with the file object.
            parser = PDFParser(fp)
            try:
                # Create a PDF document object that stores the document structure.
                document = PDFDocument(parser, password=b'')
            except PDFEncryptionError as ex:
                print('PDFEncryptionError raised: {}.'.format(ex))
                return
            except PDFSyntaxError as ex:
                print('PDFSyntaxError raised: {}.'.format(ex))
                return
            except PDFException as ex:
                print('PDFException raised: {}.'.format(ex))
                return
            # Without a document there is nothing left to do; returning above
            # avoids a misleading NameError on the unbound ``document``.

            # Check if the document allows text extraction. If not, abort.
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed

            # Metadata.
            print('Metadata: {}.'.format(document.info))
            for info in document.info:
                if 'CreationDate' in info:
                    print('\tCreation date = {}.'.format(info['CreationDate']))

            # Page count.
            try:
                pages = resolve1(document.catalog['Pages'])
                print('#pages = {}.'.format(pages['Count']))
            except KeyError as ex:
                print('KeyError raised: {}.'.format(ex))

            # Process each page contained in the document.
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                print('Page ID {} processed.'.format(page.pageid))
    except FileNotFoundError as ex:
        print('File not found, {}: {}.'.format(pdf_filepath, ex))
    except Exception as ex:
        print('Unknown exception raised in {}: {}.'.format(pdf_filepath, ex))
Exemplo n.º 35
0
                y_limit = 786
                x_limit = 86
                # ONLY FOR EXCEPTION:
            if filename.__contains__("w04") or filename.__contains__("w05") or filename.__contains__("w06") or filename.__contains__("s05") or filename.__contains__("s17_ms_21"):
                x_limit = 103
        elif filename.__contains__("qp"):
            x_limit = 60
            y_limit = 786

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        pagesCount = resolve1(document.catalog['Pages'])['Count']
        pages = PDFPage.get_pages(fp)
        if filename.__contains__("qp"):
            pagesCount = pagesCount - 1
        for index, page in enumerate(pages):
            pageNo = index + 1
            # print('Processing next page...')
            if pageNo > 1 and pageNo <= pagesCount:
                interpreter.process_page(page)
                layout = device.get_result()
                for lobj in layout:
                    if isinstance(lobj, LTTextBox):
                        x, y, ydown, text = lobj.bbox[0], lobj.bbox[3], lobj.bbox[1], lobj.get_text().lstrip()
                        pos = textText(text)
                        storeIfLesser(filename, ydown, pageNo, text)
                        print('At %r is text:%s' % ((x, y), text))
Exemplo n.º 36
0
def create_csv(folder_path):
    """Export the AcroForm field values of every PDF in a folder to one CSV.

    Every ``*.pdf`` file directly inside ``folder_path`` becomes one row. The
    columns are the form field names plus a leading ``file`` column holding
    the PDF's file name. The table is written to ``folder_path + '.csv'``.

    After each file the module-level ``text_count`` is updated with the
    progress text and ``root.update()`` is called; ``root.destroy()`` is
    called once the CSV is written (presumably a Tk variable and window --
    they are defined outside this function).

    Args:
        folder_path: Directory that contains the PDF forms.
    """
    pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
    encoding = 'iso-8859-1'
    files_count = len(pdf_files)
    frames = []
    for k, pdf_file_name in enumerate(pdf_files, start=1):
        columns_pdf = []
        values = []
        file = folder_path + '/' + pdf_file_name
        with open(file, 'rb') as pdf_file:
            parser = PDFParser(pdf_file)
            doc = PDFDocument(parser)
            fields = resolve1(doc.catalog['AcroForm'])['Fields']
            for i in fields:
                field = resolve1(i)
                raw_name = field.get('T')
                # Fallback kept from the original author ("because of
                # diameter sign"): retry without the last 8 bytes.
                try:
                    name = str(raw_name, encoding)
                except (TypeError, UnicodeDecodeError):
                    name = str(raw_name[:-8], encoding)
                opt = field.get('Opt')
                sel = field.get('V')
                # Start from the raw selection so that a field whose option
                # list does not contain the selection can never pick up the
                # value left over from the previous field.
                value = sel
                if opt is not None:
                    # resolve1() returns non-reference objects unchanged, so
                    # it is safe to call on an inline list as well.
                    for e in resolve1(opt):
                        # 'Beobachter' stores a flat list of options instead
                        # of [export value, display value] pairs.
                        if name == 'Beobachter':
                            if e == sel:
                                value = e
                        elif e[0] == sel:
                            value = e[1]

                if isinstance(value, bytes):
                    # iso-8859-1 maps every byte, so this cannot fail.
                    value = str(value, encoding)
                else:
                    text = str(value)
                    # Name objects print as /'Name'; strip the wrapper.
                    value = text[2:-1] if text.startswith('/') else text

                columns_pdf.append(name)
                values.append(value)

        df_pdf = pd.DataFrame([values], columns=columns_pdf)
        df_pdf['file'] = [pdf_file_name]
        frames.append(df_pdf)
        text_count.set(str(k) + ' von ' + str(files_count))
        root.update()

    if frames:
        # DataFrame.append was removed in pandas 2.0; concat keeps the
        # columns in order of first appearance, like the old append chain.
        df = pd.concat(frames, sort=False)
    else:
        df = pd.DataFrame(columns=['file'])
    df = df.replace({'None': '-'})
    df = df.fillna('-')
    first_col = df.pop('file')
    df.insert(0, 'file', first_col)
    df.to_csv(folder_path + '.csv', index=False)
    root.destroy()
Exemplo n.º 37
0
    def _test_pdfminer(self):
        """
        Test 6 - Using PDFMiner.

        For every path in ``self.pdfs`` the page tree's ``Count`` entry is
        read and the elapsed time is recorded through
        ``self._save_mining_time``. Files raising one of the handled parsing
        errors are recorded with ``self.default_time`` instead. The totals
        end up in ``self.final_stats_dict``.
        """

        separator = '________________________________________________\n'
        print(Colors.UNDERLINE + separator + Colors.ENDC)

        page_counts = []
        failures = []
        durations = []

        for position, pdf_path in enumerate(self.pdfs, start=1):
            filename = os.path.basename(pdf_path)
            file_size = self.convert_size(self.get_file_size(pdf_path))

            try:
                started = time.time()

                with open(pdf_path, 'rb') as handle:
                    parser = PDFParser(handle)
                    document = PDFDocument(parser)
                    parser.set_document(document)

                    page_tree = resolve1(document.catalog['Pages'])
                    pages_count = page_tree.get('Count', 0)

                    # Timing stops once the page count is known.
                    finished = time.time()
                    elapsed = self.decimal_round.format(finished - started)
                    durations.append(elapsed)

                    self._save_mining_time(item=(filename, elapsed),
                                           test_type='pdfminer')
                    page_counts.append(pages_count)

                    summary = (
                        '[PDFMINER] File {i}/{index}. Total pages: {pages_count} --> "{filename}" - {file_size}'
                        .format(i=position,
                                index=len(self.pdfs),
                                pages_count=pages_count,
                                filename=filename,
                                file_size=file_size))
                    print(Colors.CYAN + summary + Colors.ENDC)
            except (KeyError, AttributeError, PDFSyntaxError,
                    PDFEncryptionError) as error:
                # Failed files still get a timing entry, using the default.
                self._save_mining_time(item=(filename, self.default_time),
                                       test_type='pdfminer')
                failures.append(error)

        page_counts = [int(count) for count in page_counts]
        failure_count = len(failures)
        distinct_failures = list(set(failures))
        total_parsing_time = sum([float(elapsed) for elapsed in durations])

        pdfminer_total_pages = sum(page_counts)

        totals_line = '[PDFMINER] Total pages count: {pdfminer_total_pages}'.format(
            pdfminer_total_pages=pdfminer_total_pages)
        print(Colors.CYAN + totals_line + Colors.ENDC)

        self.final_stats_dict.update(
            pdfminer_total_pages=pdfminer_total_pages,
            pdfminer_total_parsing_time=total_parsing_time,
            pdfminer_errors={
                'count': failure_count,
                'errors': distinct_failures,
            },
        )
Exemplo n.º 38
0
    def pdffill(self):
        """Fill the blank table form with the values returned by the server.

        ``self.start_server()`` supplies the values keyed by attribute name
        and ``self.dic()`` supplies, per raw field name (the ``str()`` of the
        bytes name, e.g. ``"b'Text11'"``), the attribute names that belong
        to that field. A field is filled with ``x[p]`` when either the
        mapping links it to ``p`` or its normalised label equals ``p``.

        Returns:
            The page ``self.page`` of the template, after the field updates
            were applied through the writer.
        """
        x = self.start_server()
        mapping = self.dic()
        myfile = PdfFileReader("./routes/up/blank_table.pdf")
        writer = PdfFileWriter()
        writer, myfile = self.set_need_appearances_writer(myfile, writer)
        if "/AcroForm" in writer._root_object:
            writer._root_object["/AcroForm"].update(
                {NameObject("/NeedAppearances"): BooleanObject(True)})
        print(1)

        # Resolve every field name once while the file is open; the handle
        # is closed as soon as the names are read instead of being leaked.
        with open("./routes/up/blank_table.pdf", 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument(parser)
            fields = resolve1(doc.catalog['AcroForm'])['Fields']
            names = [resolve1(i).get('T') for i in fields]

        first_page = myfile.getPage(self.page)

        for p in x.keys():
            for name in names:
                # Label = field name with tabs / b'\x90s' separators removed,
                # lower-cased.
                label = "".join(
                    part.decode('utf-8').lower()
                    for part in re.split(b'\t|\x90s', name))
                key = str(name)
                if (key in mapping.keys() and p in mapping[key]) or p == label:
                    # str(bytes) looks like "b'...'": strip the wrapper to
                    # get the plain field name expected by the writer.
                    target = key[2:-1]
                    if p == "birthdate":
                        # Keep only the leading date part.
                        text = str(x[p])[:10]
                    else:
                        print(target)
                        print(str(x[p]))
                        text = str(x[p])
                    writer.updatePageFormFieldValues(first_page,
                                                     fields={target: text})

        writer.updatePageFormFieldValues(first_page,
                                         fields={'parent1name': "123"})
        return first_page
Exemplo n.º 39
0
    def __init__(self,
                 pdf_stream,
                 password="",
                 pagenos=[],
                 maxpages=0):  # noqa: C901
        """Read metadata, text and references from an open PDF stream.

        Fills ``self.metadata`` from the document info dictionary and the
        XMP ``Metadata`` stream, extracts the text of the selected pages
        into ``self.text`` and collects references from page annotations
        and from URLs, arXiv ids and DOIs found in the text.

        Args:
            pdf_stream: Binary stream of the PDF.
            password: Password handed to pdfminer.
            pagenos: Page numbers forwarded to ``PDFPage.get_pages``.
            maxpages: Page limit forwarded to ``PDFPage.get_pages``.
        """
        ReaderBackend.__init__(self)
        self.pdf_stream = pdf_stream

        # Document info dictionary
        document = PDFDocument(PDFParser(pdf_stream),
                               password=password,
                               caching=True)
        if document.info:
            for key in document.info[0]:
                entry = document.info[0][key]
                if isinstance(entry, (bytes, str, unicode)):
                    self.metadata[key] = make_compat_str(entry)
                elif isinstance(entry,
                                (psparser.PSLiteral, psparser.PSKeyword)):
                    self.metadata[key] = make_compat_str(entry.name)

        # XMP metadata stream
        if "Metadata" in document.catalog:
            raw_xmp = resolve1(document.catalog["Metadata"]).get_data()
            self.metadata.update(xmp_to_dict(raw_xmp))

        # Page content
        buffer = BytesIO()
        resources = PDFResourceManager(caching=True)
        text_converter = TextConverter(resources,
                                       buffer,
                                       codec="utf-8",
                                       laparams=LAParams(),
                                       imagewriter=None)
        page_interpreter = PDFPageInterpreter(resources, text_converter)

        self.metadata["Pages"] = 0
        self.curpage = 0
        selected_pages = PDFPage.get_pages(
            self.pdf_stream,
            pagenos=pagenos,
            maxpages=maxpages,
            password=password,
            caching=True,
            check_extractable=False,
        )
        for page in selected_pages:
            page_interpreter.process_page(page)
            self.metadata["Pages"] += 1
            self.curpage += 1

            # References stored in the page's annotations
            if not page.annots:
                continue
            found = self.resolve_PDFObjRef(page.annots)
            if not found:
                continue
            if isinstance(found, list):
                for item in found:
                    if item:
                        self.references.add(item)
            elif isinstance(found, Reference):
                self.references.add(found)

        # Drop metadata entries that ended up empty
        self.metadata_cleanup()

        # Decode the collected text, then release the buffers
        self.text = buffer.getvalue().decode("utf-8")
        buffer.close()
        text_converter.close()

        # References that only appear in the text are attributed to the
        # last processed page number.
        for url in extractor.extract_urls(self.text):
            self.references.add(Reference(url, self.curpage))

        for arxiv_id in extractor.extract_arxiv(self.text):
            self.references.add(Reference(arxiv_id, self.curpage))

        for doi in extractor.extract_doi(self.text):
            self.references.add(Reference(doi, self.curpage))
Exemplo n.º 40
0
def process_form_field(field, output_file_code, pdf_processed_pages,
                       make_crops):
    """Match one form field with the page text lying next to it.

    Args:
        field: Reference to the field object; must expose ``objid`` and be
            resolvable with ``resolve1``.
        output_file_code: Prefix used for the name of the cropped image.
        pdf_processed_pages: Maps a page object id to a pair whose first
            item is the page image and whose second item is the spatial
            index of the page's text (queried with ``intersect``).
        make_crops: When true, save a PNG that highlights the field under
            ``mturk_images/`` unless that file already exists.

    Returns:
        ``(field_id, field_type, field_description, cropped_file_name,
        match_str)``, or ``None`` when the field has no page or no text was
        found near it. ``cropped_file_name`` is ``None`` when ``make_crops``
        is false.
    """
    resolved_field = resolve1(field)

    # gets the details of the form field from the PDF File
    rect = resolved_field.get('Rect')
    page_id = resolved_field.get('P')

    field_id = str(field.objid)

    # Only buttons ('Btn') are treated as non-text; a missing or unknown
    # field type is assumed to be a text field.
    is_textfield = not ('FT' in resolved_field
                        and resolved_field['FT'].name == 'Btn')

    if page_id is None:
        return

    page_resolved = resolve1(page_id)

    quadtree_index = pdf_processed_pages[page_id.objid][1]

    # Previously unbound when make_crops was false, which made the final
    # return statement raise UnboundLocalError.
    cropped_file_name = None
    if make_crops:
        cropped_file_name = 'mturk_images/' + output_file_code + "_" + str(
            field_id) + ".png"
        if not os.path.isfile(cropped_file_name):
            # work on a copy since we want a fresh crop for each field
            draw_image = pdf_processed_pages[page_id.objid][0].copy()

            image_height = draw_image.height

            page_width = page_resolved['MediaBox'][2]
            page_height = page_resolved['MediaBox'][3]

            scale = image_height / page_height

            # The rect's y values are measured from the bottom edge, the
            # image's from the top, hence the flip.
            x0 = rect[0] * scale
            y0 = image_height - rect[1] * scale
            x1 = rect[2] * scale
            y1 = image_height - rect[3] * scale

            draw = ImageDraw.Draw(draw_image, 'RGBA')
            draw.rectangle([x0, y0, x1, y1], fill=(0, 0, 250, 100))

            # full page width, 200 px of context above and below the field
            crop_area = (0, y1 - 200, page_width * scale, y0 + 200)
            cropped_example = draw_image.crop(crop_area)

            cropped_example.save(cropped_file_name, "PNG")

    spacer = 0
    while True:
        # keep increasing the area until more than one text match is found
        # or the area gets unreasonably big
        # NOTE(review): the original comment said "at least one", but the
        # loop only stops early with two or more matches -- confirm intent.
        if spacer > 20:
            break
        if is_textfield:
            # textfields have their labels to the left or up
            quadrect = [rect[0] - spacer, rect[1] - spacer, rect[2], rect[3]]
        else:
            # checkboxes have their labels to the right
            quadrect = [rect[0], rect[1], rect[2] + spacer, rect[3] + spacer]

        matches = quadtree_index.intersect(quadrect)

        if len(matches) > 1:
            break
        spacer += 5

    # handle unfound case
    if len(matches) == 0:
        return

    match = matches[0]
    field_description = match[0]
    # the best match is consumed so another field cannot claim it
    quadtree_index.remove(match, match[1])

    field_type = 'string' if is_textfield else 'boolean'

    match_str = ' '.join([m[0] for m in matches])

    return field_id, field_type, field_description, cropped_file_name, match_str
"""Code snippets vol-57
   283-Number of pages in a PDF file.

   Download all snippets so far:
   https://wp.me/Pa5TU8-1yg
   Blog: stevepython.wordpress.com

Requirements:
pip3 install pdfminer

Origin:
https://gist.github.com/miodeqqq/0a06c395b21cec60a7e0d8abe7a0793f
"""

from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdftypes import resolve1

# Print the number of pages of 'test.pdf' as recorded in the document's
# page tree.
with open('test.pdf', 'rb') as f:
    parser = PDFParser(f)
    doc = PDFDocument(parser)
    parser.set_document(doc)
    # The catalog entry may be an indirect reference; resolve1 follows it.
    pages = resolve1(doc.catalog['Pages'])
    # Fall back to 0 when the page tree has no 'Count' entry.
    pages_count = pages.get('Count', 0)
    print(pages_count, 'pages')
def _info_value(info, key, convert, default):
    """Return ``convert(info[key])`` with the fallbacks used by pdf_metadata.

    If the direct conversion raises AttributeError the entry is resolved
    with ``resolve1`` and converted again; an empty result then yields
    ``default``. A missing key yields ``default`` as well.
    """
    try:
        return convert(info[key])
    except AttributeError:
        # Not recognised as text: resolve the entry first, then convert.
        value = convert(resolve1(info[key]))
        return value if value else default
    except KeyError:
        return default


def pdf_metadata(path):
    """Retrieve the metadata of a PDF file when available.

    Args:
        path: Path of the PDF file.

    Returns:
        A dict with the keys ``'Title'``, ``'Author(s)'``,
        ``'Last Modified By'``, ``'Created Date'`` and ``'Modified Date'``.
        A missing title falls back to the file's base name, every other
        missing entry to ``"Unknown"``. Dates are passed through
        ``posix_from_s``.
    """
    def to_date(raw):
        return posix_from_s(decoder(raw))

    # The context manager closes the file, which was previously leaked.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        # A document without an info dictionary yields only the defaults
        # instead of failing on doc.info[0].
        info = doc.info[0] if doc.info else {}

        metadata = {}
        metadata['Title'] = _info_value(
            info, "Title", decoder, os.path.basename(path))
        metadata['Author(s)'] = _info_value(
            info, "Author", decoder, "Unknown")
        # NOTE(review): this reads the "Author" entry, exactly like
        # 'Author(s)' above -- confirm that this is intended.
        metadata['Last Modified By'] = _info_value(
            info, "Author", decoder, "Unknown")
        metadata['Created Date'] = _info_value(
            info, "CreationDate", to_date, "Unknown")
        metadata['Modified Date'] = _info_value(
            info, "ModDate", to_date, "Unknown")

    return metadata
Exemplo n.º 43
0
    def get_page_number(self, index):
        """
        Given an index, return page label as specified by
        catalog['PageLabels']['Nums']

        In a PDF, page labels are stored as a list of pairs, like
        [starting_index, label_format, starting_index, label_format ...]

        For example:
        [0, {'S': 'D', 'St': 151}, 4, {'S':'R', 'P':'Foo'}]

        So we have to first find the correct label_format based on the closest
        starting_index lower than the requested index, then use the
        label_format to convert the index to a page label.

        Label format meaning:
            /S = [
                    D Decimal arabic numerals
                    R Uppercase roman numerals
                    r Lowercase roman numerals
                    A Uppercase letters (A to Z for the first 26 pages, AA to ZZ
                      for the next 26, and so on)
                    a Lowercase letters (a to z for the first 26 pages, aa to zz
                      for the next 26, and so on)
                ] (if no /S, just use prefix ...)
            /P = text string label
            /St = integer start value

        Returns the label as a string, or "" when the document has no usable
        page labels.
        """

        # get and cache page ranges
        if not hasattr(self, 'page_range_pairs'):
            pairs = []
            try:
                page_ranges = resolve1(self.catalog['PageLabels'])['Nums']
                # only accept a non-empty list of complete (index, format)
                # pairs; anything else means "no labels"
                if len(page_ranges) > 1 and len(page_ranges) % 2 == 0:
                    # highest starting index first, so the first pair that
                    # starts at or before the index is the closest one
                    pairs = list(
                        reversed(list(zip(page_ranges[::2],
                                          page_ranges[1::2]))))
            except Exception:
                # missing or malformed PageLabels: best effort, no labels
                pairs = []
            self.page_range_pairs = pairs

        if not self.page_range_pairs:
            return ""

        # find page range containing index
        for starting_index, label_format in self.page_range_pairs:
            if starting_index <= index:
                break  # we found correct label_format
        label_format = resolve1(label_format)

        page_label = ""

        # handle numeric part of label
        if 'S' in label_format:

            # first find number for this page ...
            page_label = index - starting_index
            if 'St' in label_format:  # alternate start value
                page_label += label_format['St']
            else:
                page_label += 1

            # ... then convert to correct format
            num_type = label_format['S'].name

            # roman (upper or lower)
            if num_type.lower() == 'r':
                import roman
                page_label = roman.toRoman(page_label)
                if num_type == 'r':
                    page_label = page_label.lower()

            # letters
            elif num_type.lower() == 'a':
                # a to z for the first 26 pages, aa to zz for the next 26, and
                # so on. The number is 1-based, so shift it to 0-based before
                # picking the letter (1 -> 'A', 26 -> 'Z', 27 -> 'AA'); floor
                # division keeps the repeat count an integer.
                offset = page_label - 1
                letter = chr(offset % 26 + 65) * (offset // 26 + 1)
                if num_type == 'a':
                    letter = letter.lower()
                page_label = letter

            # decimal arabic
            else:  # if num_type == 'D':
                page_label = obj_to_string(page_label)

        # handle string prefix
        if 'P' in label_format:
            page_label = smart_unicode_decode(label_format['P']) + page_label

        return page_label
Exemplo n.º 44
0
 def pdfInfo(self):
     """Collect metadata from the PDF at document and page levels.

     Returns:
         A ``(docinfo, pageinfo)`` pair. ``docinfo`` is filled from the XMP
         metadata stream when the catalog has one (keys ``titl``, ``desc``,
         ``isfillable``, ``taxyr``, ``form``, ``sched``, ``formName``,
         ``fpath``) and is empty otherwise. ``pageinfo`` maps the 1-based
         page number to a ``PageInfo`` built from the page's crop box size
         and its rendering.

     Raises:
         Exception: when the title matches neither title pattern, or when
             the document does not allow text extraction.
     """
     # collect metadata from pdf file at document and page levels
     with open(self.fpath, 'rb') as fp:
         parser = PDFParser(fp)
         doc = PDFDocument(parser)
         docinfo = {}
         if 'Metadata' in doc.catalog:
             metadata = resolve1(doc.catalog['Metadata']).get_data()
             xmpdict = xmp_to_dict(metadata)
             docinfo['titl'] = xmpdict['dc']['title']['x-default']
             docinfo['desc'] = xmpdict['dc']['description']['x-default']
             # fillable forms are flagged through the pdf 'Keywords' entry
             docinfo['isfillable'] = (
                 xmpdict['pdf'].get('Keywords', '').lower() == 'fillable')
             anyMonth = 'Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec'
             # First title layout: "[year] Form X [or Y] [(Schedule Z)]".
             # The pattern is written in verbose style and passed through
             # ut.compactify before compiling (presumably that strips the
             # layout whitespace and inline comments -- verify in ut).
             titlePttn1 = re.compile(ut.compactify(
                 r'''(?:(\d\d\d\d) )?   # 2016
                 Form ([\w-]+           # Form 1040
                 (?: \w\w?)?)           # AS
                 (?: or ([\w-]+))?      # or 1040A
                 (?:  ?\(?(?:Schedule ([\w-]+))\)?)?  # (Schedule B)
                 (?:  ?\((?:Rev|'''+anyMonth+''').+?\))?\s*$'''
                 ))
             # eg 2016 Form W-2 AS
             # eg 2015 Form 1120 S (Schedule D)
             # eg 2015 Form 990 or 990-EZ (Schedule E)
             # eg Form 8818  (Rev. December 2007)
             # eg Form 8849  (Schedule 2)  (Rev. January 2009)
             # eg Form 1066 (Schedule Q) (Rev. December 2013)
             # eg Form 1120S Schedule B-1 (December 2013)
             # 'Rev' means 'revised'
             m = re.search(titlePttn1, docinfo['titl'])
             if m:
                 # group order of pattern 1: year, form, alternate form,
                 # schedule
                 taxyr, form1, form2, sched = m.groups()
             else:
                 # Second title layout: "[year] Schedule Z (Form X [or Y])".
                 titlePttn2 = re.compile(ut.compactify(
                     r'''(?:(\d\d\d\d) )?   # 2016
                     Schedule ([\w-]+)[ ]   # Schedule B
                     \(Form ([\w-]+)        # (Form 1040
                     (?: or ([\w-]+))? ?\)  # or 1040A)
                     (?: \((?:Rev|'''+anyMonth+''').+?\))?\s*$''',
                     ))
                 # eg 2015 Schedule M-3 (Form 1065)
                 # eg 2015 Schedule O (Form 990 or 990-EZ)
                 # eg Schedule O (Form 1120) (Rev. December 2012)
                 # eg Schedule C (Form 1065 ) (Rev. December 2014)
                 m = re.search(titlePttn2, docinfo['titl'])
                 if m:
                     # group order of pattern 2: year, schedule, form,
                     # alternate form
                     taxyr, sched, form1, form2 = m.groups()
                 else:
                     msg = docinfo['titl'] + ' dont fit'
                     log.error(msg)
                     raise Exception(msg)
             docinfo['taxyr'] = taxyr
             # when two form names are given, keep the shorter one
             # (form1 wins ties)
             form = form1 if not form2 or len(form1) < len(form2) else form2
             docinfo['form'] = form
             docinfo['sched'] = sched
             # a schedule is identified by the (form, schedule) pair
             docinfo['formName'] = form if not sched else (form, sched)
             docinfo['fpath'] = self.fpath
         # Check if the document allows text extraction. If not, abort.
         if not doc.is_extractable:
             raise Exception('PDFTextExtractionNotAllowed')
         pageinfo = {}
         rr = Renderer()
         # for ipage,page in enumerate(doc.get_pages()):
         for ipage, page in enumerate(PDFPage.create_pages(doc)):
             pagenum = 1 + ipage
             # a mismatch is only logged; the crop box is used for the size
             if page.cropbox != page.mediabox:
                 log.warn(
                     'boxesDontMatch: cropbox!=mediabox on page %d:'
                     ' cropbox=%s; mediabox=%s',
                     pagenum, page.cropbox, page.mediabox)
             # boxes are [x0, y0, x1, y1]-style sequences: width and height
             # are the differences of opposite coordinates
             pagewidth = Qnty(
                 page.cropbox[2] - page.cropbox[0], 'printers_point')
             pageheight = Qnty(
                 page.cropbox[3] - page.cropbox[1], 'printers_point')
             pageinfo[pagenum] = PageInfo(
                 pagenum, pagewidth, pageheight, rr.renderPage(page))
     return docinfo, pageinfo
Exemplo n.º 45
0
def recursively_add_fields(fields, id_to_page, outfields, prefix=''):
    """Flatten a tree of form fields into ``outfields``.

    Every field without ``Kids`` is appended to ``outfields`` as
    ``(full_name, default, pageno, rect, field_type)``. ``full_name`` is the
    dot-joined chain of the names of the field and its named ancestors.

    Args:
        fields: Iterable of field objects or references to them.
        id_to_page: Maps a page object id to its page number.
        outfields: List that receives the flattened entries (mutated).
        prefix: Dotted name of the ancestors; '' at the top level.
    """
    def _printable(raw):
        # Normalise a name or value with the cleaner for the running Python.
        if PY2:
            return remove_nonprintable_limited(str(raw))
        if not isinstance(raw, bytes):
            raw = bytes(str(raw), encoding='utf-8')
        return remove_nonprintable_bytes_limited(raw)

    for entry in fields:
        field = resolve1(entry)
        name = field.get('T')
        value = field.get('V')
        rect = field.get('Rect')
        page = field.get('P')
        field_type = field.get('FT')

        if name is not None:
            name = _printable(name)
        if value is not None:
            value = _printable(value)

        # fields that do not name their page are put on page 1
        pageno = 1 if page is None else id_to_page[page.objid]

        type_text = str(field_type)
        if type_text in ('/Btn', "/u'Btn'", "/'Btn'"):
            default = "Yes" if value == '/Yes' else "No"
        elif type_text in ('/Sig', "/u'Sig'", "/'Sig'"):
            default = '${ user.signature }'
        else:
            # missing or empty values get a placeholder word
            default = value if value else word("something")

        kids = field.get('Kids')
        if kids:
            # unnamed containers pass the current prefix through unchanged
            if name is None:
                child_prefix = prefix
            elif prefix == '':
                child_prefix = name
            else:
                child_prefix = prefix + '.' + name
            recursively_add_fields(kids,
                                   id_to_page,
                                   outfields,
                                   prefix=child_prefix)
            continue

        if prefix == '':
            full_name = name
        elif name is not None:
            full_name = prefix + '.' + name
        else:
            full_name = prefix
        outfields.append((full_name, default, pageno, rect, field_type))
Exemplo n.º 46
0
    def text_to_lda(self, fp=None):
        """Print five LDA topics built from the PDF's form fields.

        Every form field of the file ``self.a`` is turned into a
        ``"name: value"`` string. These strings are vectorised with a
        ``CountVectorizer`` (English stop words), a 5-component
        ``LatentDirichletAllocation`` model is fitted on them and, per
        topic, the five highest scoring strings are printed.

        Args:
            fp: Ignored; the file is always opened from ``self.a``. Kept
                for backward compatibility.
        """
        topic_count = 5
        sentences_per_topic = 5

        # The returned text is not used here; the call is kept in case the
        # conversion has side effects callers rely on.
        self.convert_pdf_to_text()

        # Read the form fields; the file is closed once they are collected.
        with open(self.a, 'rb') as pdf_file:
            parser = PDFParser(pdf_file)
            doc = PDFDocument(parser)
            fields = resolve1(doc.catalog['AcroForm'])['Fields']

            inps = []
            for i in fields:
                field = resolve1(i)
                name, value = field.get('T'), field.get('V')
                inps.append('{0}: {1}'.format(name, value))

        # Topic Modeling
        # Fitting Count Vectorizer on the fields with Stop Words
        vect = CountVectorizer(ngram_range=(1, 1), stop_words='english')
        dtm = vect.fit_transform(inps)

        # Fitting the Latent Dirichlet Allocation Model on the Document
        # Term Matrix
        lda = LatentDirichletAllocation(n_components=topic_count)
        lda_dtf = lda.fit_transform(dtm)

        for topic in range(topic_count):
            # field indices ordered by descending weight for this topic
            ranked = np.argsort(lda_dtf[:, topic])[::-1]
            # keep at most the first two '.'-separated parts of each entry
            sentences = [
                ".".join(inps[i].split(".")[:2]) + ".\n"
                for i in ranked[:sentences_per_topic]
            ]
            # strip the bytes-repr noise (b'...', escaped newlines,
            # backslashes); every topic now gets the same clean-up
            cleaned = (str(sentences)
                       .replace("b'", " ")
                       .replace("\\n", " ")
                       .replace("\\", " ")
                       .replace("'b", " "))
            print("Topic {}: \n".format(topic) + cleaned + "\n")
Exemplo n.º 47
0
    def get_multimedia(klass, document):
        """Yield the JPEG images and rich-media attachments of a document.

        Walks the page tree starting at ``document.catalog['Pages']`` and,
        for every page, yields

        * ``('', stream)`` for each XObject whose filter is ``DCTDecode``
          (once per matching filter entry), and
        * ``(filename, stream)`` for each embedded file found under a
          ``RichMediaContent`` annotation's ``Assets``.

        Args:
            document: Parsed document exposing ``catalog`` and ``getobj``.
        """
        def search(obj, parent):
            # Depth-first walk of the page tree yielding (objid, page dict).
            if isinstance(obj, int):
                objid = obj
                tree = dict_value(document.getobj(objid)).copy()
            else:
                objid = obj.objid
                tree = dict_value(obj).copy()

            # Pages inherit 'Resources' from their ancestors. This must be
            # an equality test: `k in 'Resources'` is a substring check and
            # also matched keys such as 'R' or 'es'.
            for (k, v) in parent.items():
                if k == 'Resources' and k not in tree:
                    tree[k] = v

            tree_type = tree.get('Type')
            if tree_type is LITERAL_PAGES and 'Kids' in tree:
                for c in list_value(tree['Kids']):
                    for x in search(c, tree):
                        yield x

            elif tree_type is LITERAL_PAGE:
                yield (objid, tree)

        if 'Pages' in document.catalog:
            for (objid, tree) in search(document.catalog['Pages'],
                                        document.catalog):
                attrs = dict_value(tree)
                resources = resolve1(attrs.get('Resources', dict()))

                if 'XObject' in resources:  # Image
                    # resolve1 accepts references and direct objects alike
                    xobjects = resolve1(resources['XObject'])
                    for (im_ref, xobj) in xobjects.items():
                        image_stream = resolve1(xobj)
                        if 'Filter' in image_stream:
                            stream_filter = image_stream['Filter']
                            if isinstance(stream_filter, list):
                                for entry in stream_filter:
                                    if entry.name == 'DCTDecode':
                                        yield ('', image_stream)
                            elif stream_filter.name == 'DCTDecode':
                                yield ('', image_stream)

                if 'Annots' in attrs:  # Multimedia (Video, Audio, SWF)
                    annots = resolve1(attrs.get('Annots', dict()))
                    for annot_obj in annots:
                        annot = resolve1(annot_obj)

                        if 'RichMediaContent' not in annot:
                            continue
                        rich_media_content = resolve1(
                            annot.get('RichMediaContent', dict()))

                        if 'Assets' not in rich_media_content:
                            continue
                        assets = resolve1(
                            rich_media_content.get('Assets', dict()))

                        # 'Names' alternates [name, file spec, name, ...];
                        # only the file specifications are needed here.
                        for i in range(0, len(assets['Names']), 2):
                            media_data_obj = resolve1(assets['Names'][i + 1])

                            if 'EF' in media_data_obj:
                                for media_ref, media_obj in media_data_obj[
                                        'EF'].items():
                                    # the file spec stores the file name
                                    # under the same key as the 'EF' entry
                                    filename = media_data_obj[
                                        media_ref].decode('ascii')
                                    media_stream = resolve1(media_obj)
                                    yield (filename, media_stream)
Exemplo n.º 48
0
def getMetadataPDF():
    """Return the data of the catalog's ``Metadata`` entry, or ``None``.

    Reads the module-level ``doc``.  ``None`` is returned when ``doc`` or its
    catalog is falsy, or when the catalog has no ``'Metadata'`` key; otherwise
    the entry is resolved and its ``get_data()`` result is returned.
    """
    if not (doc and doc.catalog and 'Metadata' in doc.catalog):
        return None
    metadata_obj = resolve1(doc.catalog['Metadata'])
    return metadata_obj.get_data()
Exemplo n.º 49
0
def num_value2(x):
    """Resolve *x* and return it when it is an int or a float, else ``None``."""
    resolved = resolve1(x)
    return resolved if isinstance(resolved, (int, float)) else None
Exemplo n.º 50
0
    def __analyseAnnot(self, annot):
        """Summarise one form-field annotation as a plain dictionary.

        Args:
            annot: mapping-like annotation; the keys read here are ``FT``,
                ``Parent``, ``T``, ``AS``, ``Q``, ``AA``, ``Opt``, ``MaxLen``
                and ``Rect``.

        Returns:
            ``None`` when no field type could be determined.  Otherwise a dict
            with ``type``, ``title`` and ``rectangle`` plus, depending on the
            field, ``group``, ``align``, ``format``, ``decimal``, ``choices``
            and ``maxchar``.

        Side effects:
            Increments ``self._unknownTitle`` when neither the annotation nor
            its parent carries a title.
        """
        title = None
        parent_id = None
        extra = {}

        if 'FT' in annot:
            kind = annot['FT'].name
        else:
            kind = "unknown"
            if 'Parent' in annot:
                # The group id is whatever digits appear in the parent's
                # string form.
                parent_id = int(''.join(
                    ch for ch in str(annot['Parent']) if ch.isdigit()))
                parent = resolve1(annot['Parent'])
                if 'FT' in parent:
                    kind = parent['FT'].name
                if 'T' in parent:
                    title = parent['T'].decode(encoding='UTF-8')

        if 'T' in annot:
            title = annot['T'].decode(encoding='UTF-8')
        elif title is None:
            self._unknownTitle += 1
            title = "unknown title " + str(self._unknownTitle)

        if kind == 'Btn':
            if parent_id is not None:
                kind = "radio"
                extra['group'] = parent_id
            elif 'AS' in annot:
                kind = "checkbox"
            else:
                kind = "button"

        if kind == 'Tx':
            kind = "text"
            if 'Q' in annot:
                quadding = int(annot['Q'])
                if quadding == 1:
                    extra['align'] = 'center'
                elif quadding == 2:
                    extra['align'] = 'right'

        if 'AA' in annot:
            actions = annot['AA']
            if 'F' in actions:
                format_action = resolve1(actions['F'])
                if 'JS' in format_action:
                    script = format_action['JS']
                    if isinstance(script, bytes):
                        # Byte strings cannot be searched with a text marker.
                        script = script.decode('latin-1')
                    marker = 'AFNumber_Format('
                    if marker in script:
                        extra['format'] = 'numberonly'
                        # The first argument of the call is the number of
                        # decimals.  Locate it relative to the marker instead
                        # of assuming the script starts with the call.
                        start = script.index(marker) + len(marker)
                        end = script.find(',', start)
                        if end == -1:
                            end = script.find(')', start)
                        if end == -1:
                            end = len(script)
                        try:
                            extra['decimal'] = int(script[start:end])
                        except ValueError:
                            # Argument is not a plain integer: leave
                            # 'decimal' unset rather than abort the analysis.
                            pass

            # TODO: handle numeric validations (from 5 to 10? max/min etc.),
            # e.g. resolve1(annot['AA']['V']) ->
            # {'JS': 'AFRange_Validate(true, 5, true, 10);'}

        if kind == 'Ch' and 'Opt' in annot:
            kind = 'multichoice'
            extra['choices'] = [
                self.__decode_text(option) for option in annot['Opt']
            ]

        if 'MaxLen' in annot:
            extra['maxchar'] = int(annot['MaxLen'])

        summary = {'type': kind, 'title': title, 'rectangle': annot['Rect']}
        summary.update(extra)

        if summary['type'] == 'unknown':
            return None
        return summary
Exemplo n.º 51
0
def resource_example():
    """Print details of a TrueType font file and of a PDF's first-page resources.

    Both input paths are hard-coded placeholders inside the function.

    The first part opens the font file with ``TrueTypeFont`` and prints some of
    its attributes; errors there propagate to the caller.

    The second part takes the first page of the PDF, builds font, colour-space
    and XObject maps from the page's resource dictionary and prints them.  A
    missing PDF, or any other exception raised in this part, is reported with
    ``print`` instead of being propagated.  Nothing is returned.
    """
    from pdfminer.pdffont import CFFFont, TrueTypeFont
    from pdfminer.pdffont import PDFFont, PDFSimpleFont, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
    from pdfminer.psparser import literal_name
    from pdfminer.pdftypes import PDFObjRef
    from pdfminer.pdftypes import list_value, dict_value, stream_value
    from pdfminer.pdfcolor import PDFColorSpace
    from pdfminer.pdfcolor import PREDEFINED_COLORSPACE

    font_filepath = '/path/to/font.ttf'
    with open(font_filepath, 'rb') as fp:
        # CFFFont(font_filepath, fp) can be used here in the same way.
        font = TrueTypeFont(font_filepath, fp)
        print('Font type = {}.'.format(font.fonttype))
        print('Font fp = {}.'.format(font.fp))
        print('Font name = {}.'.format(font.name))
        print('Font tables = {}.'.format(font.tables))

    #--------------------
    pdf_filepath = '/path/to/sample.pdf'

    # REF [function] >> pdfminer.pdfinterp.PDFPageInterpreter.init_resources()
    def get_colorspace(spec):
        """Map a colour-space spec (a name, or a list led by a name) to a PDFColorSpace."""
        if isinstance(spec, list):
            name = literal_name(spec[0])
        else:
            name = literal_name(spec)
        if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec):
            return PDFColorSpace(name, stream_value(spec[1])['N'])
        if name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec):
            return PDFColorSpace(name, len(list_value(spec[1])))
        # .get() so that one unrecognised name does not abort the listing.
        return PREDEFINED_COLORSPACE.get(name)

    try:
        # The file must stay open while resources are resolved, so all the
        # work happens inside the ``with`` block.
        with open(pdf_filepath, 'rb') as fp:
            # Create a PDF resource manager object that stores shared resources.
            rsrcmgr = PDFResourceManager()

            # pagenos uses zero-based indices. pagenos is sorted inside the function.
            pages = PDFPage.get_pages(
                fp, pagenos=None, maxpages=0, password=b'')
            # A default avoids StopIteration on a document without pages.
            page = next(pages, None)
            if not page:
                print('No page.')
                return

            resources, contents = page.resources, page.contents
            if not resources:
                print('No resource.')
                return

            if contents:
                print('Contents: {}.'.format(contents))

            fontmap, xobjmap = dict(), dict()
            csmap = PREDEFINED_COLORSPACE.copy()
            for (k, v) in dict_value(resources).items():
                if k == 'Font':
                    for (font_id, spec) in dict_value(v).items():
                        obj_id = None
                        if isinstance(spec, PDFObjRef):
                            obj_id = spec.objid
                        spec = dict_value(spec)
                        fontmap[font_id] = rsrcmgr.get_font(obj_id, spec)
                elif k == 'ColorSpace':
                    for (cs_id, spec) in dict_value(v).items():
                        csmap[cs_id] = get_colorspace(resolve1(spec))
                elif k == 'ProcSet':
                    rsrcmgr.get_procset(list_value(v))
                elif k == 'XObject':
                    for (xobj_id, xobjstrm) in dict_value(v).items():
                        xobjmap[xobj_id] = xobjstrm

            # Font objects can also be constructed directly from a font spec
            # dictionary of the matching kind.  These calls are left commented
            # out because no such spec is available at this point: the loop
            # variable ``spec`` above may be unset or refer to a colour space.
            #font = PDFType1Font(rsrcmgr, spec)
            #font = PDFTrueTypeFont(rsrcmgr, spec)
            #font = PDFType3Font(rsrcmgr, spec)
            #font = PDFCIDFont(rsrcmgr, spec)

            for font_id, page_font in fontmap.items():
                print(
                    '------------------------------------------------------------'
                )
                print('Descriptor: {}.'.format(page_font.descriptor))
                print('\tFont name: {}, Font type: {}.'.format(
                    page_font.fontname,
                    type(page_font).__name__))
                # Not every font class exposes every attribute, hence the
                # hasattr() checks.
                if hasattr(page_font, 'basefont'):
                    print('\tBase font: {}.'.format(page_font.basefont))
                if hasattr(page_font, 'flags'):
                    print('\tFlags = {}.'.format(page_font.flags))
                if hasattr(page_font, 'default_width') and hasattr(
                        page_font, 'widths'):
                    print('\tDefault width = {}, Widths = {}.'.format(
                        page_font.default_width, page_font.widths))
                print('\tAscent: {}, {}.'.format(page_font.ascent,
                                                 page_font.get_ascent()))
                print('\tDescent: {}, {}.'.format(page_font.descent,
                                                  page_font.get_descent()))
                if hasattr(page_font, 'hscale') and hasattr(
                        page_font, 'vscale'):
                    print('\tScale: {}, {}.'.format(page_font.hscale,
                                                    page_font.vscale))
                if hasattr(page_font, 'leading') and hasattr(
                        page_font, 'italic_angle'):
                    print('\tLeading = {}, Italic angle = {}.'.format(
                        page_font.leading, page_font.italic_angle))
                print('\tBbox = {}.'.format(page_font.bbox))
                if hasattr(page_font, 'get_width') and hasattr(
                        page_font, 'get_height'):
                    print('\t(width, height) = ({}, {}).'.format(
                        page_font.get_width(), page_font.get_height()))
                if hasattr(page_font, 'is_multibyte') and hasattr(
                        page_font, 'is_vertical'):
                    print('\tis_multibyte = {}, is_vertical = {}.'.format(
                        page_font.is_multibyte(), page_font.is_vertical()))
                if hasattr(page_font, 'cid2unicode') and hasattr(
                        page_font, 'unicode_map'):
                    print('\tcid2unicode = {}, unicode_map = {}.'.format(
                        page_font.cid2unicode, page_font.unicode_map))
            for cs_id, cs in csmap.items():
                print('CS ID: {}.'.format(cs_id))
                print('\t{}.'.format(cs))
            for xobj_id, xobj in xobjmap.items():
                print('XObj ID: {}.'.format(xobj_id))
                print('\t{}.'.format(xobj))
    except FileNotFoundError as ex:
        print('File not found, {}: {}.'.format(pdf_filepath, ex))
    except Exception as ex:
        print('Unknown exception raised in {}: {}.'.format(pdf_filepath, ex))
Exemplo n.º 52
0
 def getMetadataPDF(self):
     """Return the data of the document catalog's ``Metadata`` entry, or ``None``.

     ``None`` is returned when ``self.pdfDoc`` or its catalog is falsy, or when
     the catalog has no ``'Metadata'`` key.
     """
     pdf_doc = self.pdfDoc
     if not (pdf_doc and pdf_doc.catalog and 'Metadata' in pdf_doc.catalog):
         return None
     metadata_obj = resolve1(pdf_doc.catalog['Metadata'])
     return metadata_obj.get_data()
Exemplo n.º 53
0
def extract_form_values():
    """Dump the AcroForm fields of every ``*.pdf`` under ``p`` to text and CSV.

    Relies on module-level names defined outside this function: ``dir``,
    ``file_name``, ``header``, ``p`` and ``output_dir``, plus the helpers
    ``convert_pdf_to_img``, ``convert_jpg_to_text``, ``decode_text``,
    ``decode_value``, ``process_big_text`` and ``TextToCSV``.

    For each PDF the function converts it to an image, runs OCR on the image,
    writes one line per form field (name immediately followed by its value)
    to ``<pdf name>.txt`` in ``output_dir``, lets ``process_big_text`` add to
    that file, and finally hands the text file to ``TextToCSV.write_CSV``.

    Raises:
        ValueError: if a PDF's catalog has no ``AcroForm`` entry.
    """
    # Context managers close both output files even when a PDF raises.
    with open(os.path.join(dir, file_name), "w") as csv_file:
        for entry in header:
            csv_file.write("%s , " % entry)
        csv_file.write("\n")

        for pdf_path in p.glob('*.pdf'):
            filename = os.path.basename(pdf_path)

            # HACK: in files converted on the web, the "x" marks on fields do
            # get extracted, but they are not when converting programmatically.
            # If you face the same issue, replace the convert_pdf_to_img()
            # call below with the two lines that follow, convert the PDF to an
            # image by hand, name it per the convention (filename-2.jpg) and
            # place it in the image directory:
            #   image_filename = filename[0:-4] + "-2.jpg"
            #   image_file = os.path.join(img_dir, image_filename)

            # PDF -> image
            image_file = convert_pdf_to_img(filename, pdf_path)

            # Image -> OCR text
            img_ocr_txt_file = convert_jpg_to_text(image_file)

            # Text file that receives the field values extracted from the PDF
            # and from the image.
            form_to_txt_file_name = filename[0:-3] + "txt"
            output_path = os.path.join(output_dir, form_to_txt_file_name)

            with open(output_path, "w") as output_file:
                with open(pdf_path, 'rb') as fp:
                    parser = PDFParser(fp)
                    doc = PDFDocument(parser)
                    res = resolve1(doc.catalog)
                    if 'AcroForm' not in res:
                        raise ValueError("No AcroForm Found")

                    # resolve1() leaves a direct list unchanged and follows an
                    # indirect reference to the field list.
                    fields = resolve1(
                        resolve1(doc.catalog['AcroForm'])['Fields'])
                    for field_ref in fields:
                        field = resolve1(field_ref)
                        name = decode_text(field.get('T'))
                        values = resolve1(field.get('V'))
                        if isinstance(values, list):
                            # write() only accepts a string, so a multi-valued
                            # field is flattened to one comma-separated string.
                            values = ", ".join(
                                str(decode_value(v)) for v in values)
                        else:
                            values = decode_value(values)

                        output_file.write(name)
                        if values is not None:
                            output_file.write(values)
                        output_file.write("\n")

                process_big_text(output_file, img_ocr_txt_file)

            # The text file is closed (and flushed) before it is read back.
            TextToCSV.write_CSV(csv_file, output_dir, form_to_txt_file_name)
Exemplo n.º 54
0
def parse_assessment_to_excel(assessment_path, database_path):
    """Append the form values of one assessment PDF to an Excel "database".

    Args:
        assessment_path: path of the PDF whose AcroForm fields are read.
        database_path: path of the Excel file to create or extend.  When it
            already exists its rows are loaded, the new row is appended, and
            the old file is passed to ``move_file`` as a backup before the
            workbook is rewritten.

    Returns:
        The ordered dictionary of parsed values: ``"Processed_UTC"`` plus one
        single-element list per form field.

    Byte-string values are decoded as ISO-8859-1 with BOM/NUL characters
    stripped; a missing value or the name ``Off`` becomes ``u"ei"``, the name
    ``Yes`` becomes ``u"jah"``, and any other name is stored as is.  Reads the
    module-level ``debug`` flag to decide whether to print diagnostics.
    """
    utc_now = datetime.utcnow()

    # All parsed values are kept here; the time parsing started goes in first.
    # TODO: also add the processed file name
    data_dictionary = OrderedDict({"Processed_UTC": utc_now.isoformat()})

    # The context manager closes the PDF even if parsing raises.
    with open(assessment_path, 'rb') as assessment_file:
        parser = PDFParser(assessment_file)
        doc = PDFDocument(parser)
        fields = resolve1(doc.catalog['AcroForm'])['Fields']
        for field_ref in fields:
            field = resolve1(field_ref)
            key = field.get('T')
            # resolve1() returns direct values unchanged and follows indirect
            # references, so ``value`` is never a bare reference below.
            value = resolve1(field.get('V'))

            if debug:
                print('{}: {} -> {}'.format(key, value, type(value)))  # DEBUG

            # ``bytes`` is ``str`` on Python 2 and also matches Python 3 byte
            # strings.
            if isinstance(value, bytes):
                # Convert to text and drop the BOM / NUL characters left over
                # from UTF-16 encoded values.
                text_value = value.decode("iso-8859-1").replace(
                    u"\xfe\xff\x00",
                    u"").replace(u"\x00", u"").replace(u'\xfe\xff', u"")
                data_dictionary[key] = [text_value]

            elif value is None:
                data_dictionary[key] = [u"ei"]

            else:
                data_dictionary[key] = [value.name]

                if value.name == "Off":
                    data_dictionary[key] = [u"ei"]

                if value.name == "Yes":
                    data_dictionary[key] = [u"jah"]

    # Create pandas dataframe for exporting data
    data_frame = pandas.DataFrame(data_dictionary)

    if debug:
        print(list(data_frame.columns))  # DEBUG

    if os.path.exists(database_path):

        print("Info  - Database file {} already exists, loading previous records".format(
            database_path))
        existing_data = pandas.read_excel(
            database_path, index_col=0)  # TODO set first column as index

        if debug:
            print(existing_data)

        # Add to existing data
        data_frame = existing_data.append(data_frame, sort=False)

        # Fix index numbering
        data_frame = data_frame.reset_index(drop=True)

        # Back up the current database under a unique file name
        move_file(database_path,
                  "database_backup", "{:%Y%m%dT%H%M%S}_{}".format(
                      utc_now,
                      uuid.uuid4()))

    # Export to excel and add formatting

    sheet_name = "Hindamised"

    writer = pandas.ExcelWriter(database_path, engine='xlsxwriter')
    data_frame.to_excel(writer, sheet_name, encoding='utf8')

    # Get sheet to do some formatting
    sheet = writer.sheets[sheet_name]

    # Set default column size; if this does not work the XlsxWriter module is
    # missing
    first_col = 1
    last_col = len(data_frame.columns)
    width = 25
    sheet.set_column(first_col, last_col, width)

    # Freeze column names and ID column
    sheet.freeze_panes(1, 1)

    # Apply filter to excel
    first_row = 0
    last_row = len(data_frame)
    sheet.autofilter(first_row, first_col, last_row, last_col)

    # Save the file
    writer.save()

    return data_dictionary
def pdf_metadata(path):
    """Collect a few entries of a PDF's info dictionary.

    Args:
        path: path of the PDF file to read.

    Returns:
        A dict with the keys ``'Title'``, ``'Author(s)'``,
        ``'Last Modified By'``, ``'Created Date'`` and ``'Modified Date'``.
        Text entries go through ``decoder``; date entries additionally go
        through ``posix_from_s``.  A missing title falls back to the file's
        base name, every other missing entry to ``"Unknown"``.  A document
        with no info dictionary at all yields only these fallbacks.
    """
    # The file must stay open while entries are resolved, so the lookups
    # happen inside the ``with`` block, which also guarantees it is closed.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        info = doc.info[0] if doc.info else {}

        def read_entry(key, default, convert=lambda text: text):
            """Return the converted info entry *key*, or *default*."""
            try:
                return convert(decoder(info[key]))
            except AttributeError:
                # The direct value could not be decoded; retry on the resolved
                # object (presumably an indirect reference) and fall back to
                # the default when the result is empty.
                return convert(decoder(resolve1(info[key]))) or default
            except KeyError:
                return default

        metadata = {}
        metadata['Title'] = read_entry("Title", os.path.basename(path))
        metadata['Author(s)'] = read_entry("Author", "Unknown")
        # NOTE(review): this reads the same "Author" entry as 'Author(s)';
        # confirm that this is intended.
        metadata['Last Modified By'] = read_entry("Author", "Unknown")
        metadata['Created Date'] = read_entry(
            "CreationDate", "Unknown", posix_from_s)
        metadata['Modified Date'] = read_entry(
            "ModDate", "Unknown", posix_from_s)

    return metadata
Exemplo n.º 56
0
def recursively_add_fields(fields, id_to_page, outfields, prefix=''):
    """Flatten a tree of form fields into ``outfields``.

    Args:
        fields: iterable of field objects, or a ``PDFObjRef`` resolving to one.
        id_to_page: mapping from a page object's ``objid`` to a page number.
        outfields: list that receives one tuple per field without ``Kids``:
            ``(name, default, pageno, rect, field_type, export_value)``.
        prefix: dotted name of the enclosing fields; ``''`` at the top level.

    Fields that have ``Kids`` are not emitted themselves; their children are
    visited with the parent's name appended to ``prefix``.  A field without a
    ``P`` entry is assigned page 1.  Nothing is returned.
    """
    if isinstance(fields, PDFObjRef):
        fields = resolve1(fields)
    for field_ref in fields:
        field = resolve1(field_ref)
        try:
            name = field.get('T')
            value = field.get('V')
            rect = field.get('Rect')
            page = field.get('P')
            field_type = field.get('FT')
        except Exception:
            # Best effort: an object that is not dictionary-like is skipped.
            logmessage("Skipping field " + repr(field))
            continue
        if name is not None:
            if not isinstance(name, bytes):
                name = bytes(str(name), encoding='utf-8')
            name = remove_nonprintable_bytes_limited(name)
        if value is not None:
            if not isinstance(value, bytes):
                value = bytes(str(value), encoding='utf-8')
            value = remove_nonprintable_bytes_limited(value)
        if page is not None:
            pageno = id_to_page[page.objid]
        else:
            pageno = 1
        export_value = None
        type_text = str(field_type)
        if type_text in ('/Btn', "/'Btn'"):
            # The export value is the first appearance state that is not an
            # "off" state; 'Yes' is used when none can be read.
            export_value = 'Yes'
            try:
                for key in list(field['AP']['N'].keys()):
                    if key in ('Off', 'off', 'No', 'no'):
                        continue
                    export_value = key
                    break
            except Exception:
                # Best effort: keep 'Yes' when the appearance dictionary is
                # missing or not directly readable.
                pass
            # A name literal stringifies with or without quotes (both forms
            # are already accepted for the field type above), so accept both
            # for the checked value as well.
            # NOTE(review): assumes remove_nonprintable_bytes_limited returns
            # text; confirm.
            if value in ('/Yes', "/'Yes'"):
                default = export_value
            else:
                default = "No"
        elif type_text in ('/Sig', "/'Sig'"):
            default = '${ user.signature }'
        else:
            # An empty or missing value gets a placeholder word.
            default = value if value else word("something")
        kids = field.get('Kids')
        if kids:
            if name is None:
                child_prefix = prefix
            elif prefix == '':
                child_prefix = name
            else:
                child_prefix = prefix + '.' + name
            recursively_add_fields(kids, id_to_page, outfields, prefix=child_prefix)
        else:
            if prefix != '' and name is not None:
                full_name = prefix + '.' + name
            elif prefix == '':
                full_name = name
            else:
                full_name = prefix
            outfields.append((full_name, default, pageno, rect, field_type, export_value))