def _parse(self, filepath: str) -> Dict[str, Any]:
    """Parse the PDF with PDFiD and peepdf for static information.

    @param filepath: Path to file to be analyzed.
    @return: results dict; empty dict when PDFiD output cannot be parsed.
    """
    # Load the PDF with PDFiD and convert it to JSON for processing.
    pdf_data = PDFiD(filepath, False, True)
    try:
        pdf_json = PDFiD2JSON(pdf_data, True)
        pdfid_data = json.loads(pdf_json)[0]
    except (IndexError, ValueError) as e:
        # IndexError: PDFiD produced an empty record list.
        # ValueError also covers json.JSONDecodeError (its subclass) when the
        # converter emits malformed JSON — previously this escaped uncaught.
        log.error("parse_pdf: %s", str(e))
        return {}

    pdfresult = {
        "Info": {
            "PDF Header": pdfid_data["pdfid"]["header"],
            "Total Entropy": pdfid_data["pdfid"]["totalEntropy"],
            "Entropy In Streams": pdfid_data["pdfid"]["streamEntropy"],
            "Entropy Out Streams": pdfid_data["pdfid"]["nonStreamEntropy"],
            "Count %% EOF": pdfid_data["pdfid"]["countEof"],
            "Data After EOF": pdfid_data["pdfid"]["countChatAfterLastEof"],
        },
        # Note, PDFiD doesn't interpret some dates properly, specifically it doesn't
        # seem to be able to properly represent time zones that involve fractions of
        # an hour
        "Dates": pdfid_data["pdfid"]["dates"]["date"],
        # Get keywords, counts and format.
        "Keywords": {
            str(keyword["name"]): keyword["count"] for keyword in pdfid_data["pdfid"]["keywords"]["keyword"]
        },
    }
    # NOTE(review): this passes self.file_path rather than the filepath
    # argument — presumably the same path, but confirm against callers.
    pdfresult = peepdf_parse(self.file_path, pdfresult)
    return pdfresult
def _parse(self, filepath):
    """Parses the PDF for static information.

    Uses PyV8 from peepdf to extract JavaScript from PDF objects, and also
    collects URLs found in JavaScript and in /Link annotation dictionaries.
    @param filepath: Path to file to be analyzed.
    @return: results dict or None.
    """
    # Load the PDF with PDFiD and convert it to JSON for processing
    pdf_data = PDFiD(filepath, False, True)
    pdf_json = PDFiD2JSON(pdf_data, True)
    pdfid_data = json.loads(pdf_json)[0]

    # Summary statistics reported by PDFiD.
    info = {}
    info["PDF Header"] = pdfid_data['pdfid']['header']
    info["Total Entropy"] = pdfid_data['pdfid']['totalEntropy']
    info['Entropy In Streams'] = pdfid_data['pdfid']['streamEntropy']
    info['Entropy Out Streams'] = pdfid_data['pdfid']['nonStreamEntropy']
    info['Count %% EOF'] = pdfid_data['pdfid']['countEof']
    info['Data After EOF'] = pdfid_data['pdfid']['countChatAfterLastEof']
    # Note, PDFiD doesn't interpret some dates properly, specifically it doesn't
    # seem to be able to properly represent time zones that involve fractions of
    # an hour
    dates = pdfid_data['pdfid']['dates']['date']

    # Get keywords, counts and format.
    keywords = {}
    for keyword in pdfid_data['pdfid']['keywords']['keyword']:
        keywords[str(keyword['name'])] = keyword['count']

    result = {}
    result["Info"] = info
    result["Dates"] = dates
    result["Keywords"] = keywords

    # Second pass: walk the object tree with peepdf's PDFParser.
    log.debug("About to parse with PDFParser")
    parser = PDFParser()
    ret, self.pdf = parser.parse(filepath, forceMode=True, looseMode=True, manualAnalysis=False)
    urlset = set()       # URLs discovered inside extracted JavaScript
    annoturiset = set()  # URIs discovered in /Link annotations
    objects = []
    retobjects = []
    metadata = dict()

    self._set_base_uri()

    # One body per PDF revision; keep the metadata of the last revision
    # that reports any.
    for i in range(len(self.pdf.body)):
        body = self.pdf.body[i]
        metatmp = self.pdf.getBasicMetadata(i)
        if metatmp:
            metadata = metatmp
        objects = body.objects
        for index in objects:
            oid = objects[index].id
            offset = objects[index].offset
            size = objects[index].size
            details = objects[index].object
            obj_data = {}
            obj_data["Object ID"] = oid
            obj_data["Offset"] = offset
            obj_data["Size"] = size
            if details.type == 'stream':
                encoded_stream = details.encodedStream
                decoded_stream = details.decodedStream
                if HAVE_PYV8:
                    jsdata = None
                    try:
                        jslist, unescapedbytes, urlsfound, errors, ctxdummy = analyseJS(
                            decoded_stream.strip())
                        jsdata = jslist[0]
                    except Exception, e:
                        # Best-effort: a stream that PyV8 cannot analyse is skipped.
                        continue
                    if len(errors):
                        continue
                    if jsdata == None:
                        continue
                    for url in urlsfound:
                        urlset.add(url)
                    # The following loop is required to "JSONify" the strings returned from PyV8.
                    # As PyV8 returns byte strings, we must parse out bytecode and
                    # replace it with an escape '\'. We can't use encode("string_escape")
                    # as this would mess up the new line representation which is used for
                    # beautifying the javascript code for Django's web interface.
                    ret_data = ""
                    for x in xrange(len(jsdata)):
                        if ord(jsdata[x]) > 127:
                            tmp = "\\x" + str(jsdata[x].encode("hex"))
                        else:
                            tmp = jsdata[x]
                        ret_data += tmp
                else:
                    # Without PyV8 there is nothing to extract from streams.
                    continue
                obj_data["Data"] = ret_data
                retobjects.append(obj_data)
            elif details.type == "dictionary" and details.hasElement("/A"):
                # verify it to be a link type annotation
                subtype_elem = details.getElementByName("/Subtype")
                type_elem = details.getElementByName("/Type")
                if not subtype_elem or not type_elem:
                    continue
                # Elements may be indirect references; resolve them first.
                subtype_elem = self._get_obj_val(i, subtype_elem)
                type_elem = self._get_obj_val(i, type_elem)
                if subtype_elem.getValue(
                ) != "/Link" or type_elem.getValue() != "/Annot":
                    continue
                a_elem = details.getElementByName("/A")
                a_elem = self._get_obj_val(i, a_elem)
                if a_elem.type == "dictionary" and a_elem.hasElement(
                        "/URI"):
                    uri_elem = a_elem.getElementByName("/URI")
                    uri_elem = self._get_obj_val(i, uri_elem)
                    # Relative URIs are resolved against the document base URI.
                    annoturiset.add(self.base_uri + uri_elem.getValue())
            else:
                # can be dictionaries, arrays, etc, don't bother displaying them
                # all for now
                pass
    # NOTE(review): no return statement is visible in this chunk — the tail of
    # the function (presumably assembling urlset/annoturiset/retobjects into
    # `result` and returning it) appears to lie outside this view; confirm.
def _parse(self, filepath):
    """Parses the PDF for static information.

    Uses PyV8 from peepdf to extract JavaScript from PDF objects.
    @param filepath: Path to file to be analyzed.
    @return: results dict or None.
    """
    # Load the PDF with PDFiD and convert it to JSON for processing
    pdf_data = PDFiD(filepath, False, True)
    pdf_json = PDFiD2JSON(pdf_data, True)
    pdfid_data = json.loads(pdf_json)[0]

    # Summary statistics reported by PDFiD.
    info = {}
    info["PDF Header"] = pdfid_data['pdfid']['header']
    info["Total Entropy"] = pdfid_data['pdfid']['totalEntropy']
    info['Entropy In Streams'] = pdfid_data['pdfid']['streamEntropy']
    info['Entropy Out Streams'] = pdfid_data['pdfid']['nonStreamEntropy']
    info['Count %% EOF'] = pdfid_data['pdfid']['countEof']
    info['Data After EOF'] = pdfid_data['pdfid']['countChatAfterLastEof']
    dates = pdfid_data['pdfid']['dates']['date']

    # Get streams, counts and format.
    streams = {}
    for stream in pdfid_data['pdfid']['keywords']['keyword']:
        streams[str(stream['name'])] = stream['count']

    result = {}
    result["Info"] = info
    result["Dates"] = dates
    result["Streams"] = streams

    # Second pass: walk the object tree with peepdf's PDFParser.
    log.debug("About to parse with PDFParser")
    parser = PDFParser()
    ret, pdf = parser.parse(filepath, True, False)
    objects = []
    retobjects = []
    count = 0
    object_counter = 1

    # NOTE(review): `count` mirrors the loop index `i` (both advance once per
    # iteration), so pdf.body[count] == pdf.body[i]; confirm before simplifying.
    for i in range(len(pdf.body)):
        body = pdf.body[count]
        objects = body.objects
        for index in objects:
            oid = objects[index].id
            offset = objects[index].offset
            size = objects[index].size
            details = objects[index].object
            obj_data = {}
            obj_data["Object ID"] = oid
            obj_data["Offset"] = offset
            obj_data["Size"] = size
            if details.type == 'stream':
                encoded_stream = details.encodedStream
                decoded_stream = details.decodedStream
                # Truncate the detected file type to keep the report compact.
                obj_data["File Type"] = _get_filetype(decoded_stream)[:100]
                if HAVE_PYV8:
                    try:
                        jsdata = analyseJS(decoded_stream.strip())[0][0]
                    except Exception, e:
                        jsdata = "PyV8 failed to parse the stream."
                    if jsdata == None:
                        jsdata = "PyV8 did not detect JavaScript in the stream. (Possibly encrypted)"

                    # The following loop is required to "JSONify" the strings returned from PyV8.
                    # As PyV8 returns byte strings, we must parse out bytecode and
                    # replace it with an escape '\'. We can't use encode("string_escape")
                    # as this would mess up the new line representation which is used for
                    # beautifying the javascript code for Django's web interface.
                    # NOTE(review): this inner loop reuses `i`, shadowing the
                    # outer loop index; harmless only because the outer loop
                    # indexes via `count` — fragile, flag for cleanup.
                    ret_data = ""
                    for i in xrange(len(jsdata)):
                        if ord(jsdata[i]) > 127:
                            tmp = "\\x" + str(jsdata[i].encode("hex"))
                        else:
                            tmp = jsdata[i]
                        ret_data += tmp
                else:
                    ret_data = "PyV8 not installed, unable to extract JavaScript."
                obj_data["Data"] = ret_data
                retobjects.append(obj_data)
                object_counter += 1
            else:
                # Non-stream objects are reported as opaque/encoded entries.
                obj_data["File Type"] = "Encoded"
                obj_data["Data"] = "Encoded"
                retobjects.append(obj_data)
        count += 1
    result["Objects"] = retobjects
    # NOTE(review): no return statement is visible in this chunk — the
    # function tail (presumably `return result`) appears to lie outside this
    # view; confirm.