def load_field(field):
    """Load one form field as a ``(name, value)`` tuple.

    ``field`` is a dict-like field dictionary.  For a text field (``Tx``)
    the value is the decoded ``V`` entry; for a button field (``Btn``) it is
    the decoded ``name`` of the ``V`` entry.

    Returns ``None`` when the field has no ``FT`` entry, an empty type name,
    no ``T`` entry, or a ``V`` entry that resolves to ``None``.

    Raises FormParseException for any other field type.
    """

    def uniflail(stringish):
        """Decode a byte string, honouring a leading UTF-16 byte-order mark."""

        def uni8(raw):
            # Try UTF-8 first and fall back to ISO-8859-1 on a decode error.
            try:
                return unicode(raw, encoding='utf8')
            except UnicodeDecodeError:
                return unicode(raw, encoding='iso-8859-1')

        if stringish is None:
            return None
        if len(stringish) < 2:
            return uni8(stringish)
        b0 = ord(stringish[0])
        b1 = ord(stringish[1])
        # FF FE / FE FF is a UTF-16 byte-order mark.
        if (b0 == 0xff and b1 == 0xfe) or (b0 == 0xfe and b1 == 0xff):
            return unicode(stringish, encoding='utf16')
        return uni8(stringish)

    field_type = field.get('FT')
    if field_type is None:
        # Without this guard a field lacking 'FT' raised AttributeError.
        return None
    typ = field_type.name
    if not typ:
        return None

    t = field.get('T')
    if not t:
        return None

    if typ == "Tx":
        val = resolve1(field.get('V'))
        if val is None:
            return None
        return (t, uniflail(val))
    if typ == "Btn":
        val = resolve1(field.get('V'))
        if val is None:
            return None
        return (t, uniflail(val.name))
    raise FormParseException("unknown field type " + typ)
def load_form(filename):
    """Load pdf form contents into a dictionary.

    Returns a dict built from the ``(key, value)`` pairs that ``load_field``
    produces for each entry of the AcroForm ``Fields`` list; fields for which
    ``load_field`` returns ``None`` are left out.

    Returns ``None`` when the catalog has no ``AcroForm``, the resolved
    AcroForm is ``None`` or has no ``Fields``, or any field reference
    resolves to ``None``.

    Raises FormParseException when a UnicodeDecodeError occurs while loading.
    """
    with open(filename, 'rb') as pdf_file:
        try:
            parser = PDFParser(pdf_file)
            doc = PDFDocument(parser)
            parser.set_document(doc)
            if 'AcroForm' not in doc.catalog:
                return None
            form = resolve1(doc.catalog['AcroForm'])
            if form is None or 'Fields' not in form:
                return None
            fieldset = {}
            for ref in form['Fields']:
                field = resolve1(ref)
                if field is None:
                    # One unresolvable field aborts the whole load.
                    return None
                loaded = load_field(field)
                if loaded is None:
                    continue
                key, value = loaded
                fieldset[key] = value
            return fieldset
        except UnicodeDecodeError as e:
            raise FormParseException(filename + ": unicode error: " + str(e))
def read_fields(pdffile):
    """Read the top-level AcroForm fields of a PDF.

    Returns a list of ``(name, default, pageno, rect, field_type)`` tuples,
    one per entry in the AcroForm ``Fields`` list.  ``pageno`` is the 1-based
    position of the page referenced by the field's ``P`` entry, or 1 when the
    field has no ``P`` entry.

    The default is "Yes"/"No" for ``/Btn`` fields, ``'${ user.signature }'``
    for ``/Sig`` fields, and otherwise the field's ``V`` entry or
    ``word("something")`` when there is none.
    """
    outfields = list()
    # NOTE(review): the file is left open, as in the original; the returned
    # tuples may still hold unresolved references into it - confirm before
    # closing here.
    fp = open(pdffile, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)

    id_to_page = dict()
    for pageno, page in enumerate(PDFPage.create_pages(doc), 1):
        id_to_page[page.pageid] = pageno

    fields = resolve1(doc.catalog['AcroForm'])['Fields']
    for i in fields:
        field = resolve1(i)
        name = field.get('T')
        value = field.get('V')
        rect = field.get('Rect')
        page = field.get('P')
        field_type = field.get('FT')
        logmessage("name is " + str(name) + " and FT is |" + str(field_type) + "|")

        pageno = id_to_page[page.objid] if page is not None else 1

        type_name = str(field_type)
        if type_name == '/Btn':
            # Compare the string form, exactly as is done for the field type;
            # a raw comparison never matched a non-string value.
            default = "Yes" if str(value) == '/Yes' else "No"
        elif type_name == '/Sig':
            default = '${ user.signature }'
        elif value is not None:
            default = value
        else:
            default = word("something")
        outfields.append((name, default, pageno, rect, field_type))
    return outfields
def load_form(filename):
    """Return one ``load_fields`` result per top-level AcroForm field.

    The PDF at ``filename`` is opened in binary mode and closed again once
    the list has been built.
    """
    with open(filename, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        acro_form = resolve1(document.catalog['AcroForm'])
        loaded = []
        for field_ref in acro_form['Fields']:
            loaded.append(load_fields(resolve1(field_ref)))
        return loaded
def load_fields(field):
    """Recursively load a form field.

    A field with a non-empty ``Kids`` entry yields a list holding the result
    for each resolved kid.  Any other field yields the tuple
    ``(name, field, value)`` where ``name`` is the ``T`` entry decoded as
    UTF-16 and ``value`` is the resolved ``V`` entry.
    """
    kids = field.get('Kids', None)
    if not kids:
        # Some field types, like signatures, need extra resolving
        name = field.get('T').decode('utf-16')
        value = resolve1(field.get('V'))
        return (name, field, value)
    return [load_fields(resolve1(kid)) for kid in kids]
def resolve_dest(dest):
    """Resolve a destination through ``doc`` from the enclosing scope.

    A ``str`` is looked up with ``doc.get_dest``; a ``PSLiteral`` is looked
    up by its ``name``.  If the outcome is a dict, its ``'D'`` entry is
    returned; anything else is returned as is.
    """
    if isinstance(dest, str):
        target = resolve1(doc.get_dest(dest))
    elif isinstance(dest, PSLiteral):
        target = resolve1(doc.get_dest(dest.name))
    else:
        target = dest
    if isinstance(target, dict):
        return target['D']
    return target
def load_form(filename):
    """Return one ``load_fields`` result per top-level AcroForm field.

    The PDF at ``filename`` is opened in binary mode for the duration of the
    call.
    """
    with open(filename, 'rb') as pdf_file:
        parser = PDFParser(pdf_file)
        doc = PDFDocument(parser)
        # The leftover interactive breakpoint (ipdb.set_trace) was removed:
        # it halted every call and required ipdb to be installed.
        return [load_fields(resolve1(f))
                for f in resolve1(doc.catalog['AcroForm'])['Fields']]
def resolve_dest(dest):
    """Resolve a destination through ``doc`` from the enclosing scope.

    Strings and ``PSLiteral`` names are looked up with ``doc.get_dest``; a
    dict result is reduced to its ``'D'`` entry.  Returns ``None`` when the
    lookup raises PDFDestinationNotFound.
    """
    try:
        if isinstance(dest, str):
            resolved = resolve1(doc.get_dest(dest))
        elif isinstance(dest, PSLiteral):
            resolved = resolve1(doc.get_dest(dest.name))
        else:
            resolved = dest
        return resolved['D'] if isinstance(resolved, dict) else resolved
    except PDFDestinationNotFound:
        return None
def _get_xmp_metadata(self):
    """Return ``(title, author)`` taken from the document's XMP metadata.

    The title is ``dc/title/x-default``.  The author is derived from
    ``dc/creator``: empty entries are dropped, then one remaining entry gives
    its last name and several give the last names of the first and last
    entry joined by a space (via ``self._au_last_name``).

    Either element is ``None`` when it is unavailable, and both are ``None``
    when the XMP packet cannot be converted by ``xmp_to_dict``.
    """
    t = a = None
    metadata = resolve1(self.doc.catalog['Metadata']).get_data()
    try:
        md = xmp_to_dict(metadata)
    except Exception:
        # Best effort: an unparsable XMP packet means no XMP metadata.
        return t, a

    try:
        t = md['dc']['title']['x-default']
    except KeyError:
        pass

    try:
        a = md['dc']['creator']
    except KeyError:
        pass
    else:
        if isinstance(a, str):
            a = [a]
        # Build a real list: len() on a filter object fails on Python 3.
        a = [entry for entry in a if entry]  # remove None, empty strings, ...
        if len(a) > 1:
            a = '%s %s' % (self._au_last_name(a[0]),
                           self._au_last_name(a[-1]))
        elif len(a) == 1:
            a = self._au_last_name(a[0])
        else:
            a = None
    return t, a
def get_metadata(self):
    """Returns metadata from both the info field (older PDFs) and XMP (newer PDFs).

    Every entry of ``doc.info`` is added, followed by the ``xap``, ``pdf``
    and ``dc`` sections of the XMP packet when present.  The result is stored
    on ``self.metadata`` and returned.

    Return format is a .modules.metadata.Metadata object
    """
    # The context manager closes the file even when parsing raises; before,
    # the handle leaked on any exception.
    with open(self.path, 'rb') as file_pointer:
        parser = PDFParser(file_pointer)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        metadata = Metadata()
        for i in doc.info:
            metadata.add(i)

        if 'Metadata' in doc.catalog:
            xmp_metadata = resolve1(doc.catalog['Metadata']).get_data()
            xmp_dict = xmp_to_dict(xmp_metadata)
            # Let's add only the most useful one
            if "xap" in xmp_dict:
                metadata.add(xmp_dict["xap"])
            if "pdf" in xmp_dict:
                metadata.add(xmp_dict["pdf"])
            if "dc" in xmp_dict:
                metadata.add(xmp_dict["dc"], metadataType="dc")

    self.metadata = metadata
    return metadata
def dumpoutline(outfp, fname, objids, pagenos, password='', dumpall=False, codec=None):
    """Write one ``repr((level, title, dest, pageno))`` line per outline entry.

    ``pageno`` is the zero-based position of the target page in
    ``doc.get_pages()``, found through the entry's destination or, failing
    that, through a ``/GoTo`` action; it stays ``None`` when neither applies.

    ``objids``, ``pagenos``, ``dumpall`` and ``codec`` are accepted but not
    used by this function.
    """
    doc = PDFDocument()
    fp = open(fname, 'rb')  # open() replaces the Python-2-only file() builtin
    try:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        pages = dict((page.pageid, pageno)
                     for (pageno, page) in enumerate(doc.get_pages()))
        for (level, title, dest, a, se) in doc.get_outlines():
            pageno = None
            if dest:
                dest = resolve1(doc.lookup_name('Dests', dest))
                if isinstance(dest, dict):
                    dest = dest['D']
                pageno = pages[dest[0].objid]
            elif a:
                action = a.resolve()
                if isinstance(action, dict):
                    subtype = action.get('S')
                    if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                        dest = action['D']
                        pageno = pages[dest[0].objid]
            outfp.write(repr((level, title, dest, pageno)) + '\n')
        parser.close()
    finally:
        # Close the file even when parsing or a lookup raises.
        fp.close()
    return
def getData(fileName):
    """Print selected entries of a PDF's trailer ``Info`` dictionary.

    Prints the ``Author``, ``Company``, ``Producer`` and ``Creator`` entries
    that are present, each prefixed with its name.

    Returns:
        ``"error"`` when attaching the parser to the document fails,
        ``"Empty metadata"`` when no trailer supplies an Info dictionary,
        the caught exception when reading the metadata fails,
        ``None`` otherwise.
    """
    doc = PDFDocument()
    fp = open(fileName, 'rb')  # open() replaces the Python-2-only file()
    try:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
        except Exception:
            return "error"

        try:
            # Start from None so a document without any Info entry reports
            # "Empty metadata" instead of failing on an unbound name.
            metadata = None
            for xref in doc.xrefs:
                info_ref = xref.trailer.get('Info')
                if info_ref:
                    metadata = resolve1(info_ref)
            if metadata is None:
                return "Empty metadata"
            for key in ('Author', 'Company', 'Producer', 'Creator'):
                if key in metadata:
                    print(key + " " + metadata[key])
        except Exception as e:
            print("\t [x] Error in PDF extractor")
            return e
        finally:
            parser.close()
    finally:
        # Closed only after the Info dictionary has been resolved; the file
        # used to be closed before that read, and leaked on a setup error.
        fp.close()
def get_metadata(self):
    """Collect Info-dictionary and XMP metadata for ``self.path``.

    Each entry of the document's ``info`` list is added to a fresh
    ``Metadata`` object; when the catalog has a ``Metadata`` stream, its
    ``xap``, ``pdf`` and ``dc`` sections are added too (``dc`` with
    ``metadataType="dc"``).  The object is stored on ``self.metadata`` and
    returned.
    """
    with PdfMinerWrapper(self.path) as pdf_miner:
        collected = Metadata()
        for info_entry in pdf_miner.document.info:
            collected.add(info_entry)

        if 'Metadata' in pdf_miner.document.catalog:
            xmp_stream = resolve1(pdf_miner.document.catalog['Metadata'])
            xmp_sections = xmp_to_dict(xmp_stream.get_data())
            # Only the most useful sections are kept.
            for section in ("xap", "pdf"):
                if section in xmp_sections:
                    collected.add(xmp_sections[section])
            if "dc" in xmp_sections:
                collected.add(xmp_sections["dc"], metadataType="dc")

    self.metadata = collected
    return collected
def getData(self):
    """Load the trailer ``Info`` dictionary into ``self.metadata`` / ``self.raw``.

    Returns:
        ``"error"`` when the document cannot be set up or initialized,
        ``"Empty metadata"`` when no trailer supplies an Info dictionary,
        the caught exception when reading the trailer fails,
        ``"ok"`` otherwise.
    """
    doc = PDFDocument()
    fp = open(self.fname, 'rb')  # open() replaces the Python-2-only file()
    try:
        parser = PDFParser(fp)
        try:
            try:
                parser.set_document(doc)
                doc.set_parser(parser)
                doc.initialize(self.password)
            except Exception:
                return "error"

            try:
                # Start from None so a document without an Info entry yields
                # "Empty metadata" rather than an unbound-name failure.
                info = None
                for xref in doc.xrefs:
                    info_ref = xref.trailer.get('Info')
                    if info_ref:
                        info = resolve1(info_ref)
                self.metadata = info
                self.raw = info
                if self.raw is None:
                    return "Empty metadata"
                return "ok"
            except Exception as e:
                return e
        finally:
            parser.close()
    finally:
        # Closed only after the Info dictionary has been resolved; it used to
        # be closed before that read and leaked when setup failed.
        fp.close()
def getPDFMetadata(path):
    """Return a dict of PDF metadata for the file at ``path``.

    The dict starts as a copy of the first entry of ``doc.info`` (empty when
    there is none) and is updated with ``xmp_to_dict`` of the catalog's
    ``Metadata`` stream when one exists and can be converted.

    The earlier code called ``update`` on the ``doc.info`` list itself, which
    always failed silently, so the XMP data never reached the result.  The
    raw XMP payload is not merged because it is not a mapping.
    """
    # NOTE(review): the file is left open, as in the original; values in the
    # returned dict may still reference it - confirm before closing here.
    fp = open(path, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize()

    result = dict(doc.info[0]) if doc.info else {}
    if 'Metadata' in doc.catalog:
        metadata = resolve1(doc.catalog['Metadata']).get_data()
        try:
            result.update(xmp_to_dict(metadata))
        except Exception:
            # Best effort: keep the Info entries when XMP cannot be parsed.
            pass
    return result
def getAllAnnots(self):
    """Gather the resolved annotations of every page and decode them.

    The combined list is passed to ``__decodeAnnots`` and then returned.
    """
    collected = []
    for page in self.__getPages():
        if page.annots:
            collected.extend(resolve1(page.annots))
    self.__decodeAnnots(collected)
    return collected
def recursively_add_fields(fields, id_to_page, outfields, prefix=''):
    """Append a tuple to ``outfields`` for every terminal field under ``fields``.

    Each tuple is ``(name, default, pageno, rect, field_type)``.  Fields that
    have ``Kids`` are descended into, and their names become dot-separated
    prefixes of their descendants' names.  ``pageno`` comes from
    ``id_to_page`` via the field's ``P`` entry, or is 1 without one.
    """

    def sanitize(raw):
        # Strip non-printable characters with the helper that matches the
        # running Python major version.
        if PY2:
            return remove_nonprintable_limited(str(raw))
        if not isinstance(raw, bytes):
            raw = bytes(str(raw), encoding='utf-8')
        return remove_nonprintable_bytes_limited(raw)

    for ref in fields:
        field = resolve1(ref)
        name = field.get('T')
        value = field.get('V')
        rect = field.get('Rect')
        page = field.get('P')
        field_type = field.get('FT')

        if name is not None:
            name = sanitize(name)
        if value is not None:
            value = sanitize(value)

        pageno = 1 if page is None else id_to_page[page.objid]

        type_repr = str(field_type)
        if type_repr in ('/Btn', "/u'Btn'", "/'Btn'"):
            default = "Yes" if value == '/Yes' else "No"
        elif type_repr in ('/Sig', "/u'Sig'", "/'Sig'"):
            default = '${ user.signature }'
        else:
            # A missing or empty value falls back to the placeholder word.
            default = value or word("something")

        kids = field.get('Kids')
        if kids:
            if name is None:
                child_prefix = prefix
            elif prefix == '':
                child_prefix = name
            else:
                child_prefix = prefix + '.' + name
            recursively_add_fields(kids, id_to_page, outfields, prefix=child_prefix)
            continue

        if prefix != '' and name is not None:
            full_name = prefix + '.' + name
        elif prefix == '':
            full_name = name
        else:
            full_name = prefix
        outfields.append((full_name, default, pageno, rect, field_type))
def pdf2metadata(fp):
    """Return the "Info" metadata (``doc.info``) of the PDF read from ``fp``.

    When the catalog has a ``Metadata`` stream its data is read as well, but
    the result of that read is not used.
    """
    pdf_parser = PDFParser(fp)
    document = PDFDocument(pdf_parser)
    pdf_parser.set_document(document)
    document.initialize()

    catalog = document.catalog
    if 'Metadata' in catalog:
        # The raw XMP metadata; read but otherwise unused.
        raw_xmp = resolve1(catalog['Metadata']).get_data()
    return document.info
def _add_annots(self, layout, annots):
    """Add an ``Annot`` element to ``layout`` for each annotation.

    For every resolved annotation: ``Rect`` (when not None) is renamed to
    ``bbox`` and passed through ``_set_hwxy_attrs``; the ``URI`` of its ``A``
    entry is copied to the top level when available; every non-string value
    is converted with ``obj_to_string``.  Returns ``layout``.
    """
    if not annots:
        return layout

    for annot_ref in resolve1(annots):
        annot = resolve1(annot_ref)
        if annot.get('Rect') is not None:
            annot['bbox'] = annot.pop('Rect')  # Rename key
            annot = self._set_hwxy_attrs(annot)
        try:
            annot['URI'] = resolve1(annot['A'])['URI']
        except KeyError:
            pass

        stringified = {}
        for key, val in six.iteritems(annot):
            if not isinstance(val, six.string_types):
                stringified[key] = obj_to_string(val)
        annot.update(stringified)

        layout.add(parser.makeelement('Annot', annot))
    return layout
def getData(self):
    """Load the trailer ``Info`` dictionary into ``self.metadata`` / ``self.raw``.

    Returns ``"ok"`` on success and ``"error"`` when any step raises.  That
    includes a catalog without a ``Metadata`` entry, which was already
    reported as an error before this change.
    """
    fp = None
    try:
        doc = PDFDocument()
        fp = open(self.fname, 'rb')  # open() replaces the Python-2-only file()
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(self.password)
        # Result unused; the lookup raises when there is no 'Metadata' entry.
        resolve1(doc.catalog['Metadata'])
        for xref in doc.xrefs:
            info_ref = xref.trailer.get('Info')
            if info_ref:
                info = resolve1(info_ref)
                self.metadata = info
                self.raw = info
        parser.close()
        return "ok"
    except Exception:
        return "error"
    finally:
        # Close the file on every path; it used to leak whenever an error
        # occurred before the explicit close.
        if fp is not None:
            fp.close()
def load_fields(field):
    """Recursively yield the names of terminal form fields.

    A field with a non-empty ``Kids`` entry yields whatever its resolved
    kids yield.  Any other field yields its ``T`` entry decoded as UTF-16,
    or the raw ``T`` entry when it has no ``decode`` method or cannot be
    decoded.
    """
    form = field.get('Kids', None)
    if form:
        for f in form:
            for field_name in load_fields(resolve1(f)):
                yield field_name
    else:
        name = field.get('T')
        # Decode before yielding: a bare ``except`` wrapped around the
        # ``yield`` also caught GeneratorExit, so closing the generator
        # raised RuntimeError.
        try:
            decoded = name.decode('utf-16')
        except (AttributeError, UnicodeError):
            decoded = name
        yield decoded
def get_page_number(self, index):
    """
    Given an index, return page label as specified by
    catalog['PageLabels']['Nums']

    Nums == [ 0 << /S /r >>
              4 << /S /D >>
              7 << /S /D /P (A-) /St 8 >> ]

    /S = [
        D  Decimal arabic numerals
        R  Uppercase roman numerals
        r  Lowercase roman numerals
        A  Uppercase letters (A to Z for the first 26 pages,
           AA to ZZ for the next 26, and so on)
        a  Lowercase letters (a to z for the first 26 pages,
           aa to zz for the next 26, and so on)
    ] (if no /S, just use prefix ...)
    /P = text string label
    /St = integer start value

    Returns "" when the catalog has no usable page-label list.
    """
    try:
        # e.g. [ 0 {settings} 2 {settings} 20 {settings} ...]
        nums = resolve1(self.catalog['PageLabels'])['Nums']
        if len(nums) < 2 or len(nums) % 2 != 0:
            return ""
    except Exception:
        # Best effort: any failure to read the label table means no label.
        return ""

    # Find the highest range start that does not exceed the requested page.
    for i in range(len(nums) - 2, -1, -2):
        if nums[i] <= index:
            break
    # resolve1 accepts both an indirect reference and an inline dictionary.
    settings = resolve1(nums[i + 1])

    page_num = ""
    if 'S' in settings:  # show a digit
        page_num = index - nums[i]
        if 'St' in settings:  # alternate start value
            page_num += settings['St']
        else:
            page_num += 1

        num_type = settings['S'].name
        if num_type.lower() == 'r':  # roman (upper or lower)
            import roman
            page_num = roman.toRoman(page_num)
            if num_type == 'r':
                page_num = page_num.lower()
        elif num_type.lower() == 'a':  # letters
            # page_num is 1-based here, so shift to 0-based first: page 1
            # is 'A', page 26 is 'Z', page 27 is 'AA'.  Floor division keeps
            # the repeat count an integer.
            zero_based = page_num - 1
            letter = chr(zero_based % 26 + 65) * (zero_based // 26 + 1)
            if num_type == 'a':
                letter = letter.lower()
            page_num = letter
        else:  # if num_type == 'D': decimal arabic
            page_num = u'{0}'.format(page_num)

    if 'P' in settings:  # page prefix
        page_num = settings['P'] + page_num
    return page_num
def __init__(self, doc, pageid, attrs):
    """Initialize the page from its attribute dictionary.

    ``Resources`` and ``MediaBox`` must be present in ``attrs``.  ``CropBox``
    falls back to the media box, ``Rotate`` to 0, and ``Contents`` to an
    empty list; ``contents`` is always stored as a list.
    """
    self.doc = doc
    self.pageid = pageid
    self.attrs = dict_value(attrs)
    page_attrs = self.attrs

    self.lastmod = resolve1(page_attrs.get('LastModified'))
    self.resources = resolve1(page_attrs['Resources'])
    self.mediabox = resolve1(page_attrs['MediaBox'])
    self.cropbox = (resolve1(page_attrs['CropBox'])
                    if 'CropBox' in page_attrs else self.mediabox)
    self.rotate = page_attrs.get('Rotate', 0)
    self.annots = page_attrs.get('Annots')
    self.beads = page_attrs.get('B')

    streams = resolve1(page_attrs['Contents']) if 'Contents' in page_attrs else []
    # A single content stream is wrapped so callers always get a list.
    self.contents = streams if isinstance(streams, list) else [streams]
    return
def proc(self, pdfFp):
    """Read the metadata available from a PDF document.

    Sets ``self.info`` to the document's Info list, ``self.metadata`` to the
    converted XMP packet when the catalog has a ``Metadata`` stream, and
    ``self.raw_doc`` to ``pdfFp.getvalue()``.
    """
    pdf_parser = PDFParser(pdfFp)
    document = PDFDocument(pdf_parser)
    pdf_parser.set_document(document)
    document.initialize()

    self.info = document.info
    if 'Metadata' in document.catalog:
        xmp_stream = resolve1(document.catalog['Metadata'])
        self.metadata = xmp_to_dict(xmp_stream.get_data())
    self.raw_doc = pdfFp.getvalue()
def load_fields(field):
    """Recursively walk form fields, printing an HTML box for each leaf.

    A field with ``Kids`` returns the list of results for its resolved kids,
    or ``None`` when ``gettext(field)`` contains ``'Page2'``.  A leaf prints
    an absolutely positioned green ``<div>`` computed from its ``Rect`` and
    labelled with ``gettext(field)``, and returns ``None``.
    """
    kids = field.get('Kids', None)
    if kids:
        group_text = gettext(field)
        if 'Page2' in group_text:
            return
        return [load_fields(resolve1(kid)) for kid in kids]

    arect = field.get('Rect')
    left = (arect[0])*2
    top = (1200-arect[1])*2
    width = (arect[2] - arect[0])*2-3
    height = (arect[3] - arect[1])*2-3
    print("<div style='background-color:green;position:absolute;left:%spx;top:%spx;width:%spx;height:%spx;'>%s</div>" % (
        left, top, width, height, gettext(field)))
def do(filename=''):
    """Print a PDF's Info and XMP metadata; return ``(doc, doc.info[0])``.

    The XMP packet is printed both raw and converted by ``xmp_to_dict`` when
    the catalog has a ``Metadata`` stream.  The file is left open.
    """
    pdf_file = open(filename, 'rb')
    pdf_parser = PDFParser(pdf_file)
    doc = PDFDocument(pdf_parser)
    pdf_parser.set_document(doc)

    print(doc.info)  # The "Info" metadata
    if 'Metadata' in doc.catalog:
        raw_xmp = resolve1(doc.catalog['Metadata']).get_data()
        print(raw_xmp)  # The raw XMP metadata
        print(xmp_to_dict(raw_xmp))
    return doc, doc.info[0]
def get_tree(self, *page_numbers):
    """
    Return lxml.etree.ElementTree for entire document, or page numbers
    given if any.

    Trees are cached per set of page numbers in ``self._parse_tree_cacher``.
    Passing a single ``None`` builds a tree holding only the document-level
    attributes, without any pages.
    """
    cache_key = "_".join(map(str, _flatten(page_numbers)))
    tree = self._parse_tree_cacher.get(cache_key)
    if tree is None:
        # set up root
        root = parser.makeelement("pdfxml")
        if self.doc.info:
            for k, v in list(self.doc.info[0].items()):
                k = unicode_decode_object(k)
                v = unicode_decode_object(resolve1(v))
                try:
                    root.set(k, v)
                except ValueError as e:
                    # Sometimes keys have a character in them, like ':',
                    # that isn't allowed in XML attribute names.
                    # If that happens we just replace non-word characters
                    # with '_'.
                    # str(e) replaces e.message, which Python 3 exceptions
                    # do not have.
                    if "Invalid attribute name" in str(e):
                        k = re.sub(r'\W', '_', k)
                        root.set(k, v)

        # Parse pages and append to root.
        # If nothing was passed in for page_numbers, we do this for all
        # pages, but if None was explicitly passed in, we skip it.
        if not (len(page_numbers) == 1 and page_numbers[0] is None):
            if page_numbers:
                pages = [[n, self.get_layout(self.get_page(n))]
                         for n in _flatten(page_numbers)]
            else:
                pages = enumerate(self.get_layouts())
            for n, page in pages:
                page = self._xmlize(page)
                page.set('page_index', unicode_decode_object(n))
                page.set('page_label', self.doc.get_page_number(n))
                root.append(page)
            self._clean_text(root)

        # wrap root in ElementTree
        tree = etree.ElementTree(root)
        self._parse_tree_cacher.set(cache_key, tree)
    return tree
def read_fields(pdffile):
    """Read every AcroForm field of a PDF, including nested ones.

    Returns the tuples collected by ``recursively_add_fields``, sorted with
    ``fieldsorter``, or ``None`` when the catalog has no ``AcroForm``.
    """
    outfields = list()
    # NOTE(review): the file is left open, as in the original; the collected
    # tuples may still hold unresolved references into it - confirm before
    # closing here.
    fp = open(pdffile, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)

    # Map each page's object id to its 1-based position in the document.
    id_to_page = dict()
    for pageno, page in enumerate(PDFPage.create_pages(doc), 1):
        id_to_page[page.pageid] = pageno

    if 'AcroForm' not in doc.catalog:
        return None
    fields = resolve1(doc.catalog['AcroForm'])['Fields']
    recursively_add_fields(fields, id_to_page, outfields)
    return sorted(outfields, key=fieldsorter)
def dumpoutline(outfp, fname, objids, pagenos, password='', dumpall=False, codec=None):
    """Write one ``repr((level, title, dest, pageno))`` line per outline entry.

    ``pageno`` is the zero-based position of the destination page in
    ``doc.get_pages()``, or ``None`` for an entry without a destination.

    ``objids``, ``pagenos``, ``dumpall`` and ``codec`` are accepted but not
    used by this function.
    """
    doc = PDFDocument()
    fp = open(fname, 'rb')  # open() replaces the Python-2-only file() builtin
    try:
        parser = PDFParser(doc, fp)
        doc.initialize(password)
        pages = dict((page.pageid, pageno)
                     for (pageno, page) in enumerate(doc.get_pages()))
        for (level, title, dest, a, se) in doc.get_outlines():
            pageno = None
            if dest:
                dest = resolve1(doc.lookup_name('Dests', dest))
                if isinstance(dest, dict):
                    dest = dest['D']
                pageno = pages[dest[0].objid]
            outfp.write(repr((level, title, dest, pageno)) + '\n')
        parser.close()
    finally:
        # Close the file even when parsing or a lookup raises.
        fp.close()
    return
def __decodeAnnots(self, annots):
    """Resolve and transform the annotation references in ``annots`` in place.

    Items whose type is not named ``PDFObjRef`` are kept unchanged.  Each
    reference is resolved and passed to ``__analyseAnnot``; a non-None result
    replaces the reference and gets an ``'id'`` equal to its 1-based position
    in the resulting list, while a ``None`` result drops the item.

    Resets ``self._unknownTitle`` to 0 before processing.
    """
    self._unknownTitle = 0
    # Build the result separately and splice it back at the end: deleting
    # from ``annots`` while enumerating it skipped the element that followed
    # every removed one.
    decoded = []
    for a in annots:
        if type(a).__name__ != 'PDFObjRef':
            decoded.append(a)
            continue
        a = resolve1(a)
        if 'Compression' in a:
            # NOTE(review): these codec names look non-standard - confirm
            # that they are registered somewhere.
            if a['Compression'] == 12:
                a = str(a).encode("x/1244")
            elif a['Compression'] == 17:
                a = str(a).encode("x/1211")
            elif a['Compression'] == 10:
                a = str(a).encode("x/101")
        transformedAnnot = self.__analyseAnnot(a)
        if transformedAnnot is not None:
            transformedAnnot['id'] = len(decoded) + 1
            decoded.append(transformedAnnot)
    # Slice assignment keeps the caller's list object.
    annots[:] = decoded
if 'Keywords' in pdfdoc: return True return False # loop through directories for subdir, dirs, files in os.walk(arg_path): for file in files: file_count += 1 filepath = subdir + os.sep + file if filepath.endswith(".pdf"): pdffilecount += 1 try: pdfdoc = parsePDFfile(filepath) if checkMetadata(pdfdoc): metadata = resolve1(pdfdoc.catalog['Metadata']).get_data() dirname = subdir.split(os.path.sep)[-1] pdfdict = xmp_to_dict(metadata) dict1 = pdfdoc.info[0] xkeywords = None xdesc = None xcreator = None xtitle = None xfolder = None try: xkeywords = str(pdfdict['pdf']['Keywords']).replace( '\r\n', ', ') except: xkeywords = '' pass try:
def get_input_fields(self, source_pdf: str = None, replace_none_value: bool = False) -> dict:
    """Get input fields in the PDF.

    The parsed fields are stored on ``self.active_fields`` so that later
    calls without ``source_pdf`` can return them without parsing again.

    Each field maps its decoded name to a dict with the keys ``value``,
    ``rect`` and ``label``.  ``replace_none_value`` is a convenience for
    visualizing fields: a field without a value then gets its own name as
    the value.

    :param source_pdf: source filepath, defaults to None
    :param replace_none_value: if value is None replace it with key name,
        defaults to False
    :return: dictionary of input key values or `None`
    """
    record_fields = {}
    if source_pdf is None and self.active_fields:
        return self.active_fields

    self.switch_to_pdf_document(source_pdf)
    source_parser = PDFParser(self.active_fileobject)
    source_document = PDFDocument(source_parser)
    try:
        fields = resolve1(source_document.catalog["AcroForm"])["Fields"]
    except KeyError:
        self.logger.info('PDF "%s" does not have any input fields.', self.active_pdf)
        return None

    def store(name, rect, label, value_of):
        # The entry is built before the key is decoded, keeping the
        # evaluation order of a plain ``d[key] = {...}`` assignment.
        entry = {
            "value": value_of(),
            "rect": iterable_items_to_int(rect),
            "label": label.decode("iso-8859-1") if label else None,
        }
        record_fields[name.decode("iso-8859-1")] = entry

    for field_ref in fields:
        field = resolve1(field_ref)
        if field is None:
            continue
        name = field.get("T")
        value = field.get("V")
        rect = field.get("Rect")
        label = field.get("TU")

        if value is None and replace_none_value:
            store(name, rect, label, lambda: name.decode("iso-8859-1"))
            continue
        try:
            store(name, rect, label,
                  lambda: value.decode("iso-8859-1") if value else "")
        except AttributeError:
            # Second attempt with the value left undecoded.
            self.logger.debug("Attribute error")
            store(name, rect, label, lambda: value)

    self.active_fields = record_fields if record_fields else None
    return record_fields
fp = open(args.file, 'rb') parser = PDFParser(fp) doc = PDFDocument(parser) font_regex = re.compile('\/\D+(\d+)') def convert(name): s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) s1 = s1.replace(' ', '_') s1 = s1.replace('__', '_') return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() fields = resolve1(resolve1(doc.catalog['AcroForm'])['Fields']) for i in fields: field = resolve1(i) if field.get('Rect') == None: continue name, position = field.get('T'), field.get('Rect') name = convert(name.decode('UTF-8')) width = int(round(position[2] - position[0])) height = int(round(position[3] - position[1])) x = int(round(position[0])) font_size = None font_size = int(font_regex.match(field.get('DA').decode('UTF-8')).group(1)) if height < 1: height = height * -1 y = int(round(position[1]) + height) if font_size > 0 and font_size != 10:
def basic_usage():
    """Demonstrate opening a PDF, printing its metadata and processing pages.

    Problems are reported with ``print`` rather than raised, and the file is
    closed on every path.  When the document itself cannot be created the
    function stops after reporting that error.
    """
    pdf_filepath = '/path/to/sample.pdf'

    # Selects which of the two demonstrated approaches runs.
    use_document_api = True

    fp = None
    try:
        # Open a PDF file.
        fp = open(pdf_filepath, 'rb')

        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Create a PDF device object.
        device = PDFDevice(rsrcmgr)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        if use_document_api:
            # Create a PDF parser object associated with the file object.
            parser = PDFParser(fp)
            try:
                # Create a PDF document object that stores the document structure.
                document = PDFDocument(parser, password=b'')
            except PDFEncryptionError as ex:
                print('PDFEncryptionError raised: {}.'.format(ex))
                # Stop here: 'document' was never bound, and continuing used
                # to end in a misleading "Unknown exception" report.
                return
            except PDFSyntaxError as ex:
                print('PDFSyntaxError raised: {}.'.format(ex))
                return
            except PDFException as ex:
                print('PDFException raised: {}.'.format(ex))
                return

            # Check if the document allows text extraction. If not, abort.
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed

            # Metadata.
            print('Metadata: {}.'.format(document.info))
            for info in document.info:
                if 'CreationDate' in info:
                    print('\tCreation date = {}.'.format(info['CreationDate']))

            # Page count.
            try:
                pages = resolve1(document.catalog['Pages'])
                print('#pages = {}.'.format(pages['Count']))
            except KeyError as ex:
                print('KeyError raised: {}.'.format(ex))

            # Process each page contained in the document.
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                print('Page ID {} processed.'.format(page.pageid))
        else:
            # pagenos uses zero-based indices. pagenos is sorted inside the function.
            for page in PDFPage.get_pages(fp, pagenos=None, maxpages=0, password=b''):
                interpreter.process_page(page)
                print('Page ID {} processed.'.format(page.pageid))
    except FileNotFoundError as ex:
        print('File not found, {}: {}.'.format(pdf_filepath, ex))
    except Exception as ex:
        print('Unknown exception raised in {}: {}.'.format(pdf_filepath, ex))
    finally:
        if fp:
            fp.close()
# Default limits; some file-name patterns override x_limit below.
y_limit = 786
x_limit = 86
# ONLY FOR EXCEPTION:
if any(tag in filename for tag in ("w04", "w05", "w06", "s05", "s17_ms_21")):
    x_limit = 103
elif "qp" in filename:
    x_limit = 60
    y_limit = 786

rsrcmgr = PDFResourceManager()
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)

parser = PDFParser(fp)
document = PDFDocument(parser)
pagesCount = resolve1(document.catalog['Pages'])['Count']
pages = PDFPage.get_pages(fp)
if "qp" in filename:
    pagesCount = pagesCount - 1

# Only pages 2 .. pagesCount are laid out; the first page is always skipped.
for index, page in enumerate(pages):
    pageNo = index + 1
    if not (pageNo > 1 and pageNo <= pagesCount):
        continue
    interpreter.process_page(page)
    layout = device.get_result()
    for lobj in layout:
        if not isinstance(lobj, LTTextBox):
            continue
        x, y, ydown, text = lobj.bbox[0], lobj.bbox[3], lobj.bbox[1], lobj.get_text().lstrip()
        pos = textText(text)
        storeIfLesser(filename, ydown, pageNo, text)
        print('At %r is text:%s' % ((x, y), text))
def create_csv(folder_path):
    """Export the form fields of every PDF in ``folder_path`` to one CSV.

    Each ``*.pdf`` file becomes one row whose columns are its AcroForm field
    names plus a leading ``file`` column.  The CSV is written to
    ``folder_path + '.csv'``.  Missing values and the string ``'None'`` are
    written as ``'-'``.

    Progress is reported through the module-level ``text_count`` and
    ``root`` objects, and ``root`` is destroyed once the CSV is written.
    """
    pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
    encoding = 'iso-8859-1'
    files_count = len(pdf_files)
    frames = []

    for k, pdf_file_name in enumerate(pdf_files, 1):
        columns_pdf = []
        values = []
        file_path = folder_path + '/' + pdf_file_name
        with open(file_path, 'rb') as pdf_file:
            parser = PDFParser(pdf_file)
            doc = PDFDocument(parser)
            fields = resolve1(doc.catalog['AcroForm'])['Fields']
            for i in fields:
                field = resolve1(i)
                # try because of diameter sign
                try:
                    name = str(field.get('T'), encoding)
                except Exception:
                    name = str(field.get('T')[:-8], encoding)

                opt = field.get('Opt')
                sel = field.get('V')

                # Start from the raw selection so a field whose options do
                # not match never inherits the previous field's value.
                value = sel
                if opt is not None:
                    opt = resolve1(opt)
                    for e in opt:
                        # Field has no 2 array list
                        if name == 'Beobachter':
                            if e == sel:
                                value = e
                        elif e[0] == sel:
                            value = e[1]

                # just bytes can be decoded
                if isinstance(value, bytes):
                    try:
                        value = str(value, encoding)
                    except Exception:
                        pass
                elif str(value).startswith("/"):
                    value = str(value)[2:-1]
                else:
                    value = str(value)

                columns_pdf.append(name)
                values.append(value)

        df_pdf = pd.DataFrame([values], columns=columns_pdf)
        df_pdf['file'] = [pdf_file_name]
        frames.append(df_pdf)

        text_count.set(str(k) + ' von ' + str(files_count))
        root.update()

    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.  An
    # empty folder yields a header-only CSV instead of an unbound name.
    df = pd.concat(frames) if frames else pd.DataFrame(columns=['file'])
    df = df.replace({'None': '-'})
    df = df.fillna('-')
    first_col = df.pop('file')
    df.insert(0, 'file', first_col)
    df.to_csv(folder_path + '.csv', index=False)
    root.destroy()
def _test_pdfminer(self):
    """
    Test 6 - Using PDFMiner.

    Reads the page count of every file in ``self.pdfs``, records the time
    each file took through ``_save_mining_time``, and stores the totals and
    the collected errors in ``self.final_stats_dict``.
    """
    print(Colors.UNDERLINE + '________________________________________________\n' + Colors.ENDC)
    total_pages, errors, total_mining_time = [], [], []

    for index, pdf_file in enumerate(self.pdfs, 1):
        filename = os.path.basename(pdf_file)
        file_size = self.convert_size(self.get_file_size(pdf_file))
        try:
            start_time = time.time()
            with open(pdf_file, 'rb') as f:
                parser = PDFParser(f)
                doc = PDFDocument(parser)
                parser.set_document(doc)
                pages = resolve1(doc.catalog['Pages'])
                pages_count = pages.get('Count', 0)
            end_time = time.time()

            single_file_time = self.decimal_round.format(end_time - start_time)
            total_mining_time.append(single_file_time)
            self._save_mining_time(item=(filename, single_file_time),
                                   test_type='pdfminer')
            total_pages.append(pages_count)

            message = '[PDFMINER] File {i}/{index}. Total pages: {pages_count} --> "{filename}" - {file_size}'.format(
                i=index,
                index=len(self.pdfs),
                pages_count=pages_count,
                filename=filename,
                file_size=file_size)
            print(Colors.CYAN + message + Colors.ENDC)
        except (KeyError, AttributeError, PDFSyntaxError, PDFEncryptionError) as error:
            # A failed file is recorded with the default time and its error.
            self._save_mining_time(item=(filename, self.default_time),
                                   test_type='pdfminer')
            errors.append(error)

    total_pages = [int(count) for count in total_pages]
    total_errors = len(errors)
    list_set_errors = list(set(errors))
    total_parsing_time = sum([float(elapsed) for elapsed in total_mining_time])
    pdfminer_total_pages = sum(total_pages)

    print(Colors.CYAN + '[PDFMINER] Total pages count: {pdfminer_total_pages}'.format(
        pdfminer_total_pages=pdfminer_total_pages) + Colors.ENDC)

    self.final_stats_dict.update(
        pdfminer_total_pages=pdfminer_total_pages,
        pdfminer_total_parsing_time=total_parsing_time,
        pdfminer_errors={
            'count': total_errors,
            'errors': list_set_errors,
        },
    )
def pdffill(self):
    """Fill the blank form PDF with values from ``self.start_server()``.

    For every key of the returned data and every AcroForm field, the field
    is updated on the page ``self.page`` when the key is listed for that
    field in ``self.dic()`` or equals the field's normalized name.
    ``birthdate`` values are cut to their first 10 characters.  Finally the
    field ``parent1name`` is set to ``"123"``.

    Returns the filled page object.
    """
    x = self.start_server()
    mapping = self.dic()
    myfile = PdfFileReader("./routes/up/blank_table.pdf")
    writer = PdfFileWriter()
    writer, myfile = self.set_need_appearances_writer(myfile, writer)
    if "/AcroForm" in writer._root_object:
        writer._root_object["/AcroForm"].update(
            {NameObject("/NeedAppearances"): BooleanObject(True)})
    print(1)
    # NOTE(review): this handle is never closed in this function - confirm
    # whether that is intended.
    fp = open("./routes/up/blank_table.pdf", 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    fields = resolve1(doc.catalog['AcroForm'])['Fields']
    first_page = myfile.getPage(self.page)
    for p in x.keys():
        for i in fields:
            temp = []
            field = resolve1(i)
            name, value = field.get('T'), field.get('V')
            # Split the raw field name on the separator pattern, then rejoin
            # the decoded, lower-cased parts into one comparison string.
            label = (re.split(b'\t|\x90s', name))
            q = ""
            for j in label:
                q = q + (j.decode('utf-8').lower())
            temp.append(q)
            if (str(name) in mapping.keys() and p in mapping[str(name)]) or p in temp:
                if p == "birthdate":
                    # str(name)[2:len - 1] drops the first two characters and
                    # the last one of the name's string form.
                    writer.updatePageFormFieldValues(
                        first_page,
                        fields={
                            str(name)[2:len(str(name)) - 1]: str(x[p])[:10]
                        })
                else:
                    print(str(name)[2:len(str(name)) - 1])
                    print(str(x[p]))
                    writer.updatePageFormFieldValues(
                        first_page,
                        fields={
                            str(name)[2:len(str(name)) - 1]:
                            str(x[p])
                        })
    writer.updatePageFormFieldValues(first_page, fields={'parent1name': "123"})
    return first_page
def __init__(self, pdf_stream, password="", pagenos=[], maxpages=0):  # noqa: C901
    """Parse *pdf_stream* and collect its metadata, text and references.

    Args:
        pdf_stream: binary stream of the PDF document.
        password: password forwarded to pdfminer.
        pagenos: page numbers forwarded to ``PDFPage.get_pages``.
        maxpages: page limit forwarded to ``PDFPage.get_pages``.
    """
    ReaderBackend.__init__(self)
    self.pdf_stream = pdf_stream

    # --- metadata from the document-information dictionary ---
    document = PDFDocument(PDFParser(pdf_stream), password=password,
                           caching=True)
    if document.info:
        info = document.info[0]
        for key in info:
            entry = info[key]
            if isinstance(entry, (bytes, str, unicode)):
                self.metadata[key] = make_compat_str(entry)
            elif isinstance(entry, (psparser.PSLiteral, psparser.PSKeyword)):
                self.metadata[key] = make_compat_str(entry.name)

    # --- metadata from the XMP stream, when the catalog has one ---
    if "Metadata" in document.catalog:
        xmp_raw = resolve1(document.catalog["Metadata"]).get_data()
        self.metadata.update(xmp_to_dict(xmp_raw))

    # --- page text and annotation references ---
    text_io = BytesIO()
    rsrcmgr = PDFResourceManager(caching=True)
    converter = TextConverter(rsrcmgr, text_io, codec="utf-8",
                              laparams=LAParams(), imagewriter=None)
    interpreter = PDFPageInterpreter(rsrcmgr, converter)

    self.metadata["Pages"] = 0
    self.curpage = 0
    page_iter = PDFPage.get_pages(
        self.pdf_stream,
        pagenos=pagenos,
        maxpages=maxpages,
        password=password,
        caching=True,
        check_extractable=False,
    )
    for page in page_iter:
        interpreter.process_page(page)
        self.metadata["Pages"] += 1
        self.curpage += 1

        if not page.annots:
            continue
        refs = self.resolve_PDFObjRef(page.annots)
        if not refs:
            continue
        if isinstance(refs, list):
            for ref in refs:
                if ref:
                    self.references.add(ref)
        elif isinstance(refs, Reference):
            self.references.add(refs)

    # Remove empty metadata entries
    self.metadata_cleanup()

    # Decode the collected text before the buffers are closed
    self.text = text_io.getvalue().decode("utf-8")
    text_io.close()
    converter.close()

    # References found in the text itself (URLs, arXiv ids, DOIs) are
    # recorded against the last processed page number.
    extractors = (extractor.extract_urls,
                  extractor.extract_arxiv,
                  extractor.extract_doi)
    for extract in extractors:
        for found in extract(self.text):
            self.references.add(Reference(found, self.curpage))
def process_form_field(field, output_file_code, pdf_processed_pages,
                       make_crops):
    """Describe one form field and optionally save a highlighted crop of it.

    Args:
        field: indirect reference to the form field (must expose ``objid``).
        output_file_code: prefix used in the crop file name.
        pdf_processed_pages: mapping of page object id to a pair
            ``(page image, quadtree index of the page's text)``.
        make_crops: when true, write ``mturk_images/<code>_<id>.png`` unless
            that file already exists.

    Returns:
        ``None`` when the field has no page or no text was found near it,
        otherwise the tuple ``(field_id, field_type, field_description,
        cropped_file_name, match_str)``.  ``cropped_file_name`` is ``None``
        when ``make_crops`` is false.
    """
    resolved_field = resolve1(field)
    rect = resolved_field.get('Rect')
    page_id = resolved_field.get('P')
    field_id = str(field.objid)

    # Only buttons are treated as check boxes; a missing or unusual field
    # type is assumed to be a text field.
    is_textfield = not ('FT' in resolved_field
                        and resolved_field['FT'].name == 'Btn')

    if page_id is None:
        return

    page_resolved = resolve1(page_id)
    page_image, quadtree_index = pdf_processed_pages[page_id.objid][:2]

    # Bound up front so the return statement below is valid even when no
    # crop is requested (this used to raise UnboundLocalError).
    cropped_file_name = None
    if make_crops:
        cropped_file_name = ('mturk_images/' + output_file_code + "_"
                             + str(field_id) + ".png")
        if not os.path.isfile(cropped_file_name):
            # Work on a copy so every crop starts from a clean page image.
            draw_image = page_image.copy()
            image_height = draw_image.height
            page_width = page_resolved['MediaBox'][2]
            page_height = page_resolved['MediaBox'][3]
            scale = image_height / page_height
            # PDF y coordinates grow upwards, image y coordinates downwards.
            x0 = rect[0] * scale
            y0 = image_height - rect[1] * scale
            x1 = rect[2] * scale
            y1 = image_height - rect[3] * scale
            draw = ImageDraw.Draw(draw_image, 'RGBA')
            draw.rectangle([x0, y0, x1, y1], fill=(0, 0, 250, 100))
            crop_area = (0, y1 - 200, page_width * scale, y0 + 200)
            draw_image.crop(crop_area).save(cropped_file_name, "PNG")

    # Grow the search rectangle in steps of 5 (up to 20) until more than one
    # piece of text intersects it.
    for spacer in range(0, 25, 5):
        if is_textfield:
            # text fields have their labels to the left or up
            quadrect = [rect[0] - spacer, rect[1] - spacer, rect[2], rect[3]]
        else:
            # check boxes have their labels to the right
            quadrect = [rect[0], rect[1], rect[2] + spacer, rect[3] + spacer]
        matches = quadtree_index.intersect(quadrect)
        if len(matches) > 1:
            break

    # handle unfound case
    if len(matches) == 0:
        return

    match = matches[0]
    field_description = match[0]
    # Remove the matched text from the index after it has been used.
    quadtree_index.remove(match, match[1])

    field_type = 'string' if is_textfield else 'boolean'
    match_str = ' '.join(item[0] for item in matches)
    return (field_id, field_type, field_description, cropped_file_name,
            match_str)
"""Code snippets vol-57
283-Number of pages in a PDF file.

Download all snippets so far:
https://wp.me/Pa5TU8-1yg

Blog: stevepython.wordpress.com

Requirements:
pip3 install pdfminer

Origin:
https://gist.github.com/miodeqqq/0a06c395b21cec60a7e0d8abe7a0793f
"""
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdftypes import resolve1

# The page count is read from the /Count entry of the catalog's page tree
# root; 0 is reported when that entry is absent.
with open('test.pdf', 'rb') as pdf_file:
    pdf_parser = PDFParser(pdf_file)
    document = PDFDocument(pdf_parser)
    pdf_parser.set_document(document)
    page_tree = resolve1(document.catalog['Pages'])
    print(page_tree.get('Count', 0), 'pages')
def pdf_metadata(path):
    """Retrieve PDF metadata when available.

    Reads the document-information dictionary of the PDF at *path* and
    returns a dict with the keys ``'Title'``, ``'Author(s)'``,
    ``'Last Modified By'``, ``'Created Date'`` and ``'Modified Date'``.

    A missing title falls back to the file's base name; every other missing
    entry becomes ``"Unknown"``.  ``'Last Modified By'`` is read from the
    ``Author`` entry, exactly like ``'Author(s)'``.  A document without any
    information dictionary yields only fallback values.

    Args:
        path: path of the PDF file.

    Returns:
        dict: the metadata described above.
    """

    def read_entry(info, key, convert, fallback):
        """Return ``convert(info[key])`` or *fallback*."""
        try:
            return convert(info[key])
        except AttributeError:
            # The entry was not recognised as text (e.g. it is an indirect
            # reference): resolve it first, then convert.
            converted = convert(resolve1(info[key]))
            return converted if converted else fallback
        except KeyError:
            # No such entry in the information dictionary.
            return fallback

    def to_posix(raw):
        """Decode a PDF date string and apply the POSIX correction."""
        return posix_from_s(decoder(raw))

    basename = os.path.basename(path)
    metadata = {}
    # Indirect entries are resolved lazily, so the file has to stay open
    # until every entry has been read.
    with open(path, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        info = doc.info[0] if doc.info else {}
        metadata['Title'] = read_entry(info, "Title", decoder, basename)
        metadata['Author(s)'] = read_entry(info, "Author", decoder,
                                           "Unknown")
        metadata['Last Modified By'] = read_entry(info, "Author", decoder,
                                                  "Unknown")
        metadata['Created Date'] = read_entry(info, "CreationDate",
                                              to_posix, "Unknown")
        metadata['Modified Date'] = read_entry(info, "ModDate", to_posix,
                                               "Unknown")
    return metadata
def get_page_number(self, index):
    """Return the page label for *index* as given by the catalog.

    Labels come from ``catalog['PageLabels']['Nums']``, a flat list of pairs
    ``[starting_index, label_format, starting_index, label_format, ...]``,
    for example::

        [0, {'S': 'D', 'St': 151}, 4, {'S': 'R', 'P': 'Foo'}]

    The label format whose starting index is the closest one not greater
    than *index* is used to convert the index into a label.

    Label format entries:
        /S   numbering style (when absent only the prefix is used)
               D  decimal arabic numerals
               R  uppercase roman numerals
               r  lowercase roman numerals
               A  uppercase letters (A to Z for the first 26 pages,
                  AA to ZZ for the next 26, and so on)
               a  lowercase letters (a to z, aa to zz, ...)
        /P   text string prefix
        /St  integer start value (defaults to 1)

    Args:
        index: zero-based page index.

    Returns:
        The page label, or ``""`` when the document has no usable page
        labels.
    """
    # get and cache page ranges
    if not hasattr(self, 'page_range_pairs'):
        try:
            page_ranges = resolve1(self.catalog['PageLabels'])['Nums']
            if len(page_ranges) < 2 or len(page_ranges) % 2:
                raise ValueError("malformed /Nums array")
            pairs = list(zip(page_ranges[::2], page_ranges[1::2]))
            # Highest starting index first, so the first pair that is
            # <= index is the closest one.
            pairs.reverse()
            self.page_range_pairs = pairs
        except Exception:
            # Missing or malformed page labels: behave as if there are none.
            self.page_range_pairs = []

    if not self.page_range_pairs:
        return ""

    # find page range containing index
    for starting_index, label_format in self.page_range_pairs:
        if starting_index <= index:
            break

    label_format = resolve1(label_format)
    page_label = ""

    # handle numeric part of label
    if 'S' in label_format:
        # Number of this page inside its range; ranges start at /St or 1.
        page_label = index - starting_index
        page_label += label_format['St'] if 'St' in label_format else 1

        num_type = label_format['S'].name
        style = num_type.lower()
        if style == 'r':
            import roman
            page_label = roman.toRoman(page_label)
            if num_type == 'r':
                page_label = page_label.lower()
        elif style == 'a':
            # page_label is 1-based: 1 -> A, 26 -> Z, 27 -> AA, 52 -> ZZ.
            # Integer division keeps the repeat count an int on Python 3.
            offset = page_label - 1
            letter = chr(offset % 26 + 65) * (offset // 26 + 1)
            if num_type == 'a':
                letter = letter.lower()
            page_label = letter
        else:
            # decimal arabic ('D')
            page_label = obj_to_string(page_label)

    # handle string prefix
    if 'P' in label_format:
        page_label = smart_unicode_decode(label_format['P']) + page_label

    return page_label
def pdfInfo(self):
    """Collect metadata from the PDF at document and page levels.

    Returns:
        A pair ``(docinfo, pageinfo)``.  ``docinfo`` holds the XMP-derived
        entries ('titl', 'desc', 'isfillable', 'taxyr', 'form', 'sched',
        'formName') when the catalog has a Metadata stream, plus 'fpath'.
        ``pageinfo`` maps 1-based page numbers to ``PageInfo`` objects.

    Raises:
        Exception: when the title matches neither title pattern, or when
            the document does not allow text extraction.
    """
    with open(self.fpath, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        docinfo = {}
        if 'Metadata' in doc.catalog:
            xmp_raw = resolve1(doc.catalog['Metadata']).get_data()
            xmpdict = xmp_to_dict(xmp_raw)
            title = xmpdict['dc']['title']['x-default']
            docinfo['titl'] = title
            docinfo['desc'] = xmpdict['dc']['description']['x-default']
            docinfo['isfillable'] = (
                xmpdict['pdf'].get('Keywords', '').lower() == 'fillable')

            anyMonth = 'Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec'
            # Titles that lead with the form, e.g.
            #   2016 Form W-2 AS
            #   2015 Form 1120 S (Schedule D)
            #   2015 Form 990 or 990-EZ (Schedule E)
            #   Form 8818 (Rev. December 2007)
            #   Form 8849 (Schedule 2) (Rev. January 2009)
            #   Form 1066 (Schedule Q) (Rev. December 2013)
            #   Form 1120S Schedule B-1 (December 2013)
            # 'Rev' means 'revised'.
            form_first = re.compile(ut.compactify(
                r'''(?:(\d\d\d\d) )? # 2016
                Form ([\w-]+ # Form 1040
                (?: \w\w?)?) # AS
                (?: or ([\w-]+))? # or 1040A
                (?: ?\(?(?:Schedule ([\w-]+))\)?)? # (Schedule B)
                (?: ?\((?:Rev|'''+anyMonth+''').+?\))?\s*$'''
            ))
            found = re.search(form_first, title)
            if found:
                taxyr, form1, form2, sched = found.groups()
            else:
                # Titles that lead with the schedule, e.g.
                #   2015 Schedule M-3 (Form 1065)
                #   2015 Schedule O (Form 990 or 990-EZ)
                #   Schedule O (Form 1120) (Rev. December 2012)
                #   Schedule C (Form 1065 ) (Rev. December 2014)
                schedule_first = re.compile(ut.compactify(
                    r'''(?:(\d\d\d\d) )? # 2016
                    Schedule ([\w-]+)[ ] # Schedule B
                    \(Form ([\w-]+) # (Form 1040
                    (?: or ([\w-]+))? ?\) # or 1040A)
                    (?: \((?:Rev|'''+anyMonth+''').+?\))?\s*$''',
                ))
                found = re.search(schedule_first, title)
                if not found:
                    msg = title + ' dont fit'
                    log.error(msg)
                    raise Exception(msg)
                taxyr, sched, form1, form2 = found.groups()

            docinfo['taxyr'] = taxyr
            # When the title names two forms, keep the shorter name.
            if not form2 or len(form1) < len(form2):
                form = form1
            else:
                form = form2
            docinfo['form'] = form
            docinfo['sched'] = sched
            docinfo['formName'] = (form, sched) if sched else form
        docinfo['fpath'] = self.fpath

        # Check if the document allows text extraction. If not, abort.
        if not doc.is_extractable:
            raise Exception('PDFTextExtractionNotAllowed')

        pageinfo = {}
        renderer = Renderer()
        for pagenum, page in enumerate(PDFPage.create_pages(doc), 1):
            crop, media = page.cropbox, page.mediabox
            if crop != media:
                log.warn(
                    'boxesDontMatch: cropbox!=mediabox on page %d:'
                    ' cropbox=%s; mediabox=%s',
                    pagenum, crop, media)
            pagewidth = Qnty(crop[2] - crop[0], 'printers_point')
            pageheight = Qnty(crop[3] - crop[1], 'printers_point')
            pageinfo[pagenum] = PageInfo(
                pagenum, pagewidth, pageheight, renderer.renderPage(page))
    return docinfo, pageinfo
def recursively_add_fields(fields, id_to_page, outfields, prefix=''):
    """Walk a form-field tree and append one tuple per terminal field.

    Args:
        fields: iterable of field references, or an indirect reference to
            such an array (it is resolved before iteration).
        id_to_page: mapping of page object id to page number.
        outfields: list that receives
            ``(name, default, pageno, rect, field_type)`` tuples.
        prefix: dotted name of the parent field, ``''`` at the top level.
    """

    def printable(raw):
        """Strip non-printable characters from a field name or value."""
        if raw is None:
            return None
        if PY2:
            return remove_nonprintable_limited(str(raw))
        if not isinstance(raw, bytes):
            raw = bytes(str(raw), encoding='utf-8')
        return remove_nonprintable_bytes_limited(raw)

    # /Kids may be stored as an indirect reference to the array; resolve1
    # returns objects that are not references unchanged.
    fields = resolve1(fields)
    if not fields:
        return

    for i in fields:
        field = resolve1(i)
        name = printable(field.get('T'))
        value = printable(field.get('V'))
        rect = field.get('Rect')
        page = field.get('P')
        field_type = field.get('FT')

        # Fields that do not name their page are placed on page 1.
        pageno = id_to_page[page.objid] if page is not None else 1

        type_repr = str(field_type)
        if type_repr in ('/Btn', "/u'Btn'", "/'Btn'"):
            default = "Yes" if value == '/Yes' else "No"
        elif type_repr in ('/Sig', "/u'Sig'", "/'Sig'"):
            default = '${ user.signature }'
        else:
            # Missing or empty values get a placeholder word.
            default = value if value else word("something")

        kids = field.get('Kids')
        if kids:
            # Non-terminal field: descend, extending the dotted prefix when
            # this level has a name of its own.
            if name is None:
                child_prefix = prefix
            elif prefix == '':
                child_prefix = name
            else:
                child_prefix = prefix + '.' + name
            recursively_add_fields(kids, id_to_page, outfields,
                                   prefix=child_prefix)
        elif prefix != '' and name is not None:
            outfields.append(
                (prefix + '.' + name, default, pageno, rect, field_type))
        elif prefix == '':
            outfields.append((name, default, pageno, rect, field_type))
        else:
            outfields.append((prefix, default, pageno, rect, field_type))
def text_to_lda(self, fp=None):
    """Fit a 5-topic LDA model on the PDF's form fields and print topics.

    Each AcroForm field of the PDF at ``self.a`` becomes one document of the
    form ``"<name>: <value>"``.  For every topic, the five fields with the
    highest topic weight are printed, each cut after its second ``"."``.

    Args:
        fp: unused; kept for backward compatibility.  The file at ``self.a``
            is always opened.
    """
    n_topics = 5
    n_top_fields = 5

    # Called as in the original; its result is not needed below.
    self.convert_pdf_to_text()

    # Field objects are resolved lazily, so read them while the file is open.
    with open(self.a, 'rb') as pdf_file:
        doc = PDFDocument(PDFParser(pdf_file))
        inps = []
        for ref in resolve1(doc.catalog['AcroForm'])['Fields']:
            field = resolve1(ref)
            inps.append('{0}: {1}'.format(field.get('T'), field.get('V')))

    # Topic modelling: bag of words (English stop words removed) -> LDA.
    vect = CountVectorizer(ngram_range=(1, 1), stop_words='english')
    dtm = vect.fit_transform(inps)
    lda = LatentDirichletAllocation(n_components=n_topics)
    lda_dtf = lda.fit_transform(dtm)

    for topic in range(n_topics):
        # Fields ordered by decreasing weight for this topic.
        ranked = np.argsort(lda_dtf[:, topic])[::-1]
        sentences = [
            ".".join(inps[i].split(".")[:2]) + ".\n"
            for i in ranked[:n_top_fields]
        ]
        # Strip the bytes-literal markers and escape sequences that str()
        # of the list introduces.  All topics are cleaned the same way.
        cleaned = (str(sentences)
                   .replace("b'", " ")
                   .replace("\\n", " ")
                   .replace("\\", " ")
                   .replace("'b", " "))
        print("Topic {}: \n".format(topic) + cleaned + "\n")
def get_multimedia(klass, document):
    """Yield ``(filename, stream)`` pairs for media embedded in *document*.

    Two kinds of objects are reported for every page:

    * image XObjects whose filter (or one of whose filters) is
      ``DCTDecode``, yielded with an empty filename, once per matching
      filter;
    * files embedded through RichMedia annotations (their ``Assets`` name
      tree), yielded with the file name decoded as ASCII.

    Args:
        klass: unused.
        document: parsed PDF document providing ``catalog`` and ``getobj``.
    """

    def search(obj, parent):
        """Yield ``(objid, attributes)`` for every page below *obj*."""
        if isinstance(obj, int):
            objid = obj
            tree = dict_value(document.getobj(objid)).copy()
        else:
            objid = obj.objid
            tree = dict_value(obj).copy()
        # Resources are inherited from the parent node.  This must be an
        # exact key comparison: a substring test would also match keys such
        # as 'R' or 'Res'.
        if 'Resources' in parent and 'Resources' not in tree:
            tree['Resources'] = parent['Resources']
        tree_type = tree.get('Type')
        if tree_type is LITERAL_PAGES and 'Kids' in tree:
            for kid in list_value(tree['Kids']):
                for found in search(kid, tree):
                    yield found
        elif tree_type is LITERAL_PAGE:
            yield (objid, tree)

    if 'Pages' not in document.catalog:
        return

    for (objid, tree) in search(document.catalog['Pages'], document.catalog):
        attrs = dict_value(tree)

        # Images.  resolve1 accepts direct objects as well as references.
        resources = resolve1(attrs.get('Resources', dict()))
        if 'XObject' in resources:
            for xobj in resolve1(resources['XObject']).values():
                image_stream = resolve1(xobj)
                if 'Filter' not in image_stream:
                    continue
                filters = image_stream['Filter']
                if not isinstance(filters, list):
                    filters = [filters]
                for stream_filter in filters:
                    if stream_filter.name == 'DCTDecode':
                        yield ('', image_stream)

        # Multimedia (video, audio, SWF)
        if 'Annots' not in attrs:
            continue
        for annot_obj in resolve1(attrs.get('Annots', dict())):
            annot = resolve1(annot_obj)
            if 'RichMediaContent' not in annot:
                continue
            rich_media_content = resolve1(
                annot.get('RichMediaContent', dict()))
            if 'Assets' not in rich_media_content:
                continue
            assets = resolve1(rich_media_content.get('Assets', dict()))
            # /Names alternates name strings and file specifications; only
            # the file specifications are needed.
            for file_spec in resolve1(assets['Names'])[1::2]:
                media_data_obj = resolve1(file_spec)
                if 'EF' not in media_data_obj:
                    continue
                embedded = resolve1(media_data_obj['EF'])
                for media_ref, media_obj in embedded.items():
                    # The file specification stores the file name under the
                    # same key as the embedded-file entry.
                    filename = media_data_obj[media_ref].decode('ascii')
                    yield (filename, resolve1(media_obj))
def getMetadataPDF():
    """Return the raw data of the catalog's Metadata stream, or ``None``.

    ``None`` is returned when ``doc`` or its catalog is falsy, or when the
    catalog has no 'Metadata' entry.
    """
    if not doc or not doc.catalog or 'Metadata' not in doc.catalog:
        return None
    metadata_stream = resolve1(doc.catalog['Metadata'])
    return metadata_stream.get_data()
def num_value2(x):
    """Resolve *x* and return it when it is an int or float, else ``None``."""
    resolved = resolve1(x)
    return resolved if isinstance(resolved, (int, float)) else None
def __analyseAnnot(self, annot):
    """Translate a form annotation into a description dictionary.

    Returns:
        ``None`` when the field type stays unknown, otherwise a dict with
        the keys 'type', 'title' and 'rectangle' plus optional extras
        ('group', 'align', 'format', 'decimal', 'choices', 'maxchar').
    """
    title = None
    parent_id = None
    extras = {}

    kind = annot['FT'].name if 'FT' in annot else "unknown"

    if 'Parent' in annot:
        # The parent's id is rebuilt from the digits in the printed form of
        # the reference.
        parent_id = int(''.join(
            ch for ch in str(annot['Parent']) if ch.isdigit()))
        parent = resolve1(annot['Parent'])
        if 'FT' in parent:
            kind = parent['FT'].name
        if 'T' in parent:
            title = parent['T'].decode(encoding='UTF-8')

    # The annotation's own title wins over the parent's.
    if 'T' in annot:
        title = annot['T'].decode(encoding='UTF-8')
    elif title is None:
        self._unknownTitle += 1
        title = "unknown title " + str(self._unknownTitle)

    if kind == 'Btn':
        if parent_id is not None:
            kind = "radio"
            extras['group'] = parent_id
        elif 'AS' in annot:
            kind = "checkbox"
        else:
            kind = "button"

    if kind == 'Tx':
        kind = "text"

    if 'Q' in annot:
        quadding = int(annot['Q'])
        if quadding == 1:
            extras['align'] = 'center'
        elif quadding == 2:
            extras['align'] = 'right'

    if 'AA' in annot:
        actions = annot['AA']
        if 'F' in actions:
            format_action = resolve1(annot['AA']['F'])
            if 'JS' in format_action:
                script = format_action['JS']
                if 'AFNumber_Format(' in script:
                    extras['format'] = 'numberonly'
                    # The first argument sits between the 16-character
                    # 'AFNumber_Format(' prefix and the first comma.
                    extras['decimal'] = int(script[16:script.index(',')])
        # TODO: handle numeric validations (from 5 to 10? max/min etc.),
        # e.g. AA['V'] == {'JS': 'AFRange_Validate(true, 5, true, 10);'}

    if kind == 'Ch' and 'Opt' in annot:
        kind = 'multichoice'
        extras['choices'] = [
            self.__decode_text(option) for option in annot['Opt']
        ]

    if 'MaxLen' in annot:
        extras['maxchar'] = int(annot['MaxLen'])

    result = {'type': kind, 'title': title, 'rectangle': annot['Rect']}
    result.update(extras.copy())

    if result['type'] == 'unknown':
        return None
    return result
def resource_example():
    """Print the fonts, colour spaces and XObjects of a PDF's first page.

    The function first opens a TrueType font file and prints its basic
    properties, then opens a PDF, reads the resources of its first page
    and prints what it finds.  Both paths are placeholders defined inside
    the function.
    """
    from pdfminer.pdffont import CFFFont, TrueTypeFont
    from pdfminer.pdffont import PDFFont, PDFSimpleFont, PDFType1Font, PDFTrueTypeFont, PDFType3Font, PDFCIDFont
    from pdfminer.psparser import literal_name
    from pdfminer.pdftypes import PDFObjRef
    from pdfminer.pdftypes import list_value, dict_value, stream_value
    from pdfminer.pdfcolor import PDFColorSpace
    from pdfminer.pdfcolor import PREDEFINED_COLORSPACE

    # REF [function] >> pdfminer.pdfinterp.PDFPageInterpreter.init_resources()
    def get_colorspace(spec):
        """Build the colour space described by a resolved specification."""
        if isinstance(spec, list):
            name = literal_name(spec[0])
        else:
            name = literal_name(spec)
        if name == 'ICCBased' and isinstance(spec, list) and 2 <= len(spec):
            return PDFColorSpace(name, stream_value(spec[1])['N'])
        elif name == 'DeviceN' and isinstance(spec, list) and 2 <= len(spec):
            return PDFColorSpace(name, len(list_value(spec[1])))
        else:
            return PREDEFINED_COLORSPACE[name]

    font_filepath = '/path/to/font.ttf'
    with open(font_filepath, 'rb') as fp:
        font = TrueTypeFont(font_filepath, fp)
        print('Font type = {}.'.format(font.fonttype))
        print('Font fp = {}.'.format(font.fp))
        print('Font name = {}.'.format(font.name))
        print('Font tables = {}.'.format(font.tables))

    #--------------------
    pdf_filepath = '/path/to/sample.pdf'
    try:
        # The file stays open for the whole dump because resources are
        # resolved lazily.
        with open(pdf_filepath, 'rb') as fp:
            # Resource manager that stores shared resources.
            rsrcmgr = PDFResourceManager()

            # pagenos uses zero-based indices and is sorted inside the
            # function.
            pages = PDFPage.get_pages(fp, pagenos=None, maxpages=0,
                                      password=b'')
            page = next(pages)
            if page:
                resources, contents = page.resources, page.contents
                if not resources:
                    print('No resource.')
                    return
                if contents:
                    print('Contents: {}.'.format(contents))

                fontmap, xobjmap = dict(), dict()
                csmap = PREDEFINED_COLORSPACE.copy()
                for (k, v) in dict_value(resources).items():
                    if k == 'Font':
                        for (font_id, spec) in dict_value(v).items():
                            obj_id = None
                            if isinstance(spec, PDFObjRef):
                                obj_id = spec.objid
                            spec = dict_value(spec)
                            fontmap[font_id] = rsrcmgr.get_font(obj_id, spec)
                    elif k == 'ColorSpace':
                        for (cs_id, spec) in dict_value(v).items():
                            csmap[cs_id] = get_colorspace(resolve1(spec))
                    elif k == 'ProcSet':
                        rsrcmgr.get_procset(list_value(v))
                    elif k == 'XObject':
                        for (xobj_id, xobjstrm) in dict_value(v).items():
                            xobjmap[xobj_id] = xobjstrm

                # The fonts are printed from fontmap only.  No fonts are
                # constructed from the loop variable `spec` here: after the
                # loop it may hold a colour-space spec or be unbound.
                for font in fontmap.values():
                    print(
                        '------------------------------------------------------------'
                    )
                    print('Descriptor: {}.'.format(font.descriptor))
                    print('\tFont name: {}, Font type: {}.'.format(
                        font.fontname, type(font).__name__))
                    if hasattr(font, 'basefont'):
                        print('\tBase font: {}.'.format(font.basefont))
                    if hasattr(font, 'flags'):
                        print('\tFlags = {}.'.format(font.flags))
                    if hasattr(font, 'default_width') and hasattr(font, 'widths'):
                        print('\tDefault width = {}, Widths = {}.'.format(
                            font.default_width, font.widths))
                    print('\tAscent: {}, {}.'.format(font.ascent,
                                                     font.get_ascent()))
                    print('\tDescent: {}, {}.'.format(font.descent,
                                                      font.get_descent()))
                    if hasattr(font, 'hscale') and hasattr(font, 'vscale'):
                        print('\tScale: {}, {}.'.format(font.hscale,
                                                        font.vscale))
                    if hasattr(font, 'leading') and hasattr(font, 'italic_angle'):
                        print('\tLeading = {}, Italic angle = {}.'.format(
                            font.leading, font.italic_angle))
                    print('\tBbox = {}.'.format(font.bbox))
                    if hasattr(font, 'get_width') and hasattr(font, 'get_height'):
                        print('\t(width, height) = ({}, {}).'.format(
                            font.get_width(), font.get_height()))
                    if hasattr(font, 'is_multibyte') and hasattr(
                            font, 'is_vertical'):
                        print('\tis_multibyte = {}, is_vertical = {}.'.format(
                            font.is_multibyte(), font.is_vertical()))
                    if hasattr(font, 'cid2unicode') and hasattr(
                            font, 'unicode_map'):
                        print('\tcid2unicode = {}, unicode_map = {}.'.format(
                            font.cid2unicode, font.unicode_map))

                for cs_id, cs in csmap.items():
                    print('CS ID: {}.'.format(cs_id))
                    print('\t{}.'.format(cs))

                for xobj_id, xobj in xobjmap.items():
                    print('XObj ID: {}.'.format(xobj_id))
                    print('\t{}.'.format(xobj))
    except FileNotFoundError as ex:
        print('File not found, {}: {}.'.format(pdf_filepath, ex))
    except Exception as ex:
        print('Unknown exception raised in {}: {}.'.format(pdf_filepath, ex))
def getMetadataPDF(self):
    """Return the raw data of the catalog's Metadata stream, or ``None``.

    ``None`` is returned when ``self.pdfDoc`` or its catalog is falsy, or
    when the catalog has no 'Metadata' entry.
    """
    pdf_doc = self.pdfDoc
    if not pdf_doc or not pdf_doc.catalog:
        return None
    if 'Metadata' not in pdf_doc.catalog:
        return None
    metadata_stream = resolve1(pdf_doc.catalog['Metadata'])
    return metadata_stream.get_data()
def extract_form_values():
    """Extract the form values of every ``*.pdf`` under ``p`` into a CSV.

    For each PDF the function converts it to an image, runs OCR on the
    image, writes every AcroForm field (name immediately followed by its
    value, one field per line) to ``<output_dir>/<pdf name>.txt``, lets
    ``process_big_text`` add the OCR-derived text, and finally appends the
    text file's content to the CSV through ``TextToCSV.write_CSV``.

    Raises:
        ValueError: when a PDF has no AcroForm.
    """
    # `with` closes both output files even when a PDF cannot be processed.
    with open(os.path.join(dir, file_name), "w") as csv_file:
        for entry in header:
            csv_file.write("%s , " % entry)
        csv_file.write("\n")

        for pdf_path in p.glob('*.pdf'):
            filename = os.path.basename(pdf_path)

            # HACK: in files converted on the web, "x" marks on a field get
            # extracted as text, and the programmatic conversion does not
            # reproduce them.  If you hit this, convert the PDF to an image
            # by hand, name it "<pdf name>-2.jpg", place it in the image
            # directory, and use that path instead of calling
            # convert_pdf_to_img.
            image_file = convert_pdf_to_img(filename, pdf_path)
            img_ocr_txt_file = convert_jpg_to_text(image_file)

            # Text file holding the values extracted from the PDF and from
            # the OCR output.
            form_to_txt_file_name = filename[0:-3] + "txt"
            txt_path = os.path.join(output_dir, form_to_txt_file_name)
            with open(txt_path, "w") as output_file:
                with open(pdf_path, 'rb') as fp:
                    doc = PDFDocument(PDFParser(fp))
                    if 'AcroForm' not in resolve1(doc.catalog):
                        raise ValueError("No AcroForm Found")
                    fields = resolve1(doc.catalog['AcroForm'])['Fields']

                    # Extract field names and values using pdfminer
                    for field_ref in fields:
                        field = resolve1(field_ref)
                        name = decode_text(field.get('T'))
                        # the value may be an indirect object
                        values = resolve1(field.get('V'))
                        if isinstance(values, list):
                            # A list cannot be passed to write(); join the
                            # decoded entries into one string.
                            values = ", ".join(
                                str(decode_value(v)) for v in values)
                        else:
                            values = decode_value(values)
                        output_file.write(name)
                        if values is not None:
                            output_file.write(values)
                        output_file.write("\n")

                process_big_text(output_file, img_ocr_txt_file)

            TextToCSV.write_CSV(csv_file, output_dir, form_to_txt_file_name)
def parse_assessment_to_excel(assessment_path, database_path):
    """Append the form values of one assessment PDF to an Excel database.

    Every AcroForm field becomes a column.  Byte-string values are decoded
    as ISO-8859-1 with UTF-16 BOM and NUL characters removed, missing
    values and the name ``Off`` become ``"ei"``, and the name ``Yes``
    becomes ``"jah"``.  If *database_path* already exists, its records are
    loaded, the new record is appended, and the old file is moved to
    ``database_backup`` before the database is rewritten.

    Args:
        assessment_path: path of the filled-in PDF form.
        database_path: path of the Excel workbook to create or extend.

    Returns:
        OrderedDict: the parsed values, including ``"Processed_UTC"``.
    """
    utc_now = datetime.utcnow()
    # All parsed values are kept here; the time at which parsing started is
    # recorded as well.
    # TODO add also processed file name
    data_dictionary = OrderedDict({"Processed_UTC": utc_now.isoformat()})

    # `with` closes the PDF even if a field cannot be parsed; the fields
    # are resolved lazily, so they are read while the file is open.
    with open(assessment_path, 'rb') as assessment_file:
        doc = PDFDocument(PDFParser(assessment_file))
        fields = resolve1(doc.catalog['AcroForm'])['Fields']
        for i in fields:
            field = resolve1(i)
            key, value = field.get('T'), field.get('V')

            if debug:
                print('{}: {} -> {}'.format(key, value, type(value)))  # DEBUG

            if isinstance(value, bytes):
                # Decode the byte string and drop BOM / NUL characters.
                text = (value.decode("iso-8859-1")
                        .replace(u"\xfe\xff\x00", u"")
                        .replace(u"\x00", u"")
                        .replace(u'\xfe\xff', u""))
                data_dictionary[key] = [text]
            elif value is None:
                data_dictionary[key] = [u"ei"]
            else:
                data_dictionary[key] = [value.name]
                if value.name == "Off":
                    data_dictionary[key] = [u"ei"]
                if value.name == "Yes":
                    data_dictionary[key] = [u"jah"]

    # Create pandas dataframe for exporting data
    data_frame = pandas.DataFrame(data_dictionary)

    if debug:
        print(list(data_frame.columns))  # DEBUG

    if os.path.exists(database_path):
        print("Info - Database file {} already exists, loading previous records".format(
            database_path))
        existing_data = pandas.read_excel(
            database_path, index_col=0)  # TODO set first column as index

        if debug:
            print(existing_data)

        # Add to existing data and renumber the index
        data_frame = existing_data.append(data_frame, sort=False)
        data_frame = data_frame.reset_index(drop=True)

        # Back up the current database under a unique file name
        move_file(database_path, "database_backup",
                  "{:%Y%m%dT%H%M%S}_{}".format(utc_now, uuid.uuid4()))

    # Export to excel and add formatting
    sheet_name = "Hindamised"
    writer = pandas.ExcelWriter(database_path, engine='xlsxwriter')
    data_frame.to_excel(writer, sheet_name, encoding='utf8')

    # Get sheet to do some formatting
    sheet = writer.sheets[sheet_name]

    # Set default column size; if this does not work the XlsxWriter module
    # is missing
    first_col = 1
    last_col = len(data_frame.columns)
    width = 25
    sheet.set_column(first_col, last_col, width)

    # Freeze column names and ID column
    sheet.freeze_panes(1, 1)

    # Apply filter to excel
    first_row = 0
    last_row = len(data_frame)
    sheet.autofilter(first_row, first_col, last_row, last_col)

    # Save the file
    writer.save()

    return data_dictionary
def pdf_metadata(path):
    """Return the document-information metadata of the PDF at *path*.

    The result has the keys ``'Title'``, ``'Author(s)'``,
    ``'Last Modified By'``, ``'Created Date'`` and ``'Modified Date'``.
    A missing title is replaced by the file's base name and any other
    missing entry by ``"Unknown"``.  ``'Last Modified By'`` is taken from
    the ``Author`` entry, like ``'Author(s)'``.  A document without an
    information dictionary yields only these fallback values.

    Args:
        path: path of the PDF file.

    Returns:
        dict: the metadata described above.
    """

    def lookup(info, key, convert, fallback):
        """Convert ``info[key]``, resolving indirect values on demand."""
        try:
            return convert(info[key])
        except AttributeError:
            # Not directly convertible: resolve the object and retry.
            converted = convert(resolve1(info[key]))
            return converted if converted else fallback
        except KeyError:
            return fallback

    def as_posix(raw):
        """Decode a PDF date string and convert it with posix_from_s."""
        return posix_from_s(decoder(raw))

    default_title = os.path.basename(path)
    # (result key, information-dictionary key, converter, fallback)
    entries = (
        ('Title', "Title", decoder, default_title),
        ('Author(s)', "Author", decoder, "Unknown"),
        ('Last Modified By', "Author", decoder, "Unknown"),
        ('Created Date', "CreationDate", as_posix, "Unknown"),
        ('Modified Date', "ModDate", as_posix, "Unknown"),
    )

    metadata = {}
    # The file is closed on exit; it must stay open while entries are read
    # because indirect values are resolved lazily.
    with open(path, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        info = doc.info[0] if doc.info else {}
        for result_key, info_key, convert, fallback in entries:
            metadata[result_key] = lookup(info, info_key, convert, fallback)
    return metadata
def recursively_add_fields(fields, id_to_page, outfields, prefix=''):
    """Walk a tree of PDF form fields and record every terminal field.

    Arguments:
    fields     -- iterable of field objects (or references to them), or a
                  ``PDFObjRef`` that resolves to such an iterable
    id_to_page -- mapping from a page object's ``objid`` to a page number
    outfields  -- list that is extended in place
    prefix     -- dotted name of the enclosing parent fields ('' at the top)

    For each field without ``Kids`` a tuple
    ``(name, default, pageno, rect, field_type, export_value)`` is appended
    to *outfields*; fields with ``Kids`` are descended into, with their own
    name appended to *prefix*.  Nothing is returned.
    """
    if isinstance(fields, PDFObjRef):
        fields = resolve1(fields)
    if fields is None:
        # A reference that resolves to nothing has no fields to add.
        return
    for i in fields:
        field = resolve1(i)
        try:
            name = field.get('T')
            value = field.get('V')
            rect = field.get('Rect')
            page = field.get('P')
            field_type = field.get('FT')
        except Exception:
            # Best effort: anything that is not dictionary-like is skipped.
            logmessage("Skipping field " + repr(field))
            continue
        if name is not None:
            if not isinstance(name, bytes):
                name = bytes(str(name), encoding='utf-8')
            name = remove_nonprintable_bytes_limited(name)
        if value is not None:
            if not isinstance(value, bytes):
                value = bytes(str(value), encoding='utf-8')
            value = remove_nonprintable_bytes_limited(value)

        # Fall back to page 1 when the field has no page reference, or when
        # the reference cannot be found in id_to_page.
        pageno = 1
        if page is not None:
            pageno = id_to_page.get(getattr(page, 'objid', None), 1)

        export_value = None
        type_name = str(field_type)
        if type_name in ('/Btn', "/'Btn'"):
            export_value = 'Yes'
            try:
                # The first appearance state that is not an "off" state is
                # taken as the value the button exports when checked.
                for key in list(field['AP']['N'].keys()):
                    if key in ('Off', 'off', 'No', 'no'):
                        continue
                    export_value = key
                    break
            except (KeyError, TypeError, AttributeError):
                # No usable appearance dictionary: keep the 'Yes' default.
                pass
            # NOTE(review): only the '/Yes' form is recognised here, while
            # the field-type checks accept two literal forms -- confirm.
            if value == '/Yes':
                default = export_value
            else:
                default = "No"
        elif type_name in ('/Sig', "/'Sig'"):
            default = '${ user.signature }'
        else:
            # A missing or empty value gets the placeholder text.
            default = value if value else word("something")

        kids = field.get('Kids')
        if kids:
            if name is None:
                child_prefix = prefix
            elif prefix == '':
                child_prefix = name
            else:
                child_prefix = prefix + '.' + name
            recursively_add_fields(kids, id_to_page, outfields,
                                   prefix=child_prefix)
            continue

        if prefix == '':
            # An unnamed top-level field keeps name None.
            full_name = name
        elif name is None:
            full_name = prefix
        else:
            full_name = prefix + '.' + name
        outfields.append(
            (full_name, default, pageno, rect, field_type, export_value))