Example #1
    def _parse(self, filepath: str) -> Dict[str, Any]:
        """Parses the PDF for static information.
        @param filepath: Path to file to be analyzed.
        @return: results dict or None.
        """
        # Load the PDF with PDFiD and convert it to JSON for processing
        pdf_data = PDFiD(filepath, False, True)
        try:
            pdf_json = PDFiD2JSON(pdf_data, True)
            pdfid_data = json.loads(pdf_json)[0]
        except IndexError as e:
            log.error("parse_pdf: %s", str(e))
            return {}

        pdfresult = {
            "Info": {
                "PDF Header": pdfid_data["pdfid"]["header"],
                "Total Entropy": pdfid_data["pdfid"]["totalEntropy"],
                "Entropy In Streams": pdfid_data["pdfid"]["streamEntropy"],
                "Entropy Out Streams": pdfid_data["pdfid"]["nonStreamEntropy"],
                "Count %% EOF": pdfid_data["pdfid"]["countEof"],
                "Data After EOF": pdfid_data["pdfid"]["countChatAfterLastEof"],
            },
            # Note: PDFiD doesn't interpret some dates properly; in particular it
            # can't represent time zones that are offset by a fraction of an hour.
            "Dates": pdfid_data["pdfid"]["dates"]["date"],
            # Get keywords, counts and format.
            "Keywords": {
                str(keyword["name"]): keyword["count"]
                for keyword in pdfid_data["pdfid"]["keywords"]["keyword"]
            },
        }
        pdfresult = peepdf_parse(self.file_path, pdfresult)

        return pdfresult
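
All three examples begin with the same step: run PDFiD over the file, convert its XML output to JSON with PDFiD2JSON, and take the first entry. Below is a minimal standalone sketch of that step, assuming Didier Stevens' pdfid module is importable as `pdfid` (the actual import path varies between projects):

import json

from pdfid import PDFiD, PDFiD2JSON  # assumed import path; adjust to your copy of pdfid


def pdfid_keyword_counts(filepath):
    # Same call pattern as the examples: allNames=False, extraData=True.
    xml_doc = PDFiD(filepath, False, True)
    pdfid_data = json.loads(PDFiD2JSON(xml_doc, True))[0]["pdfid"]
    # Mirror the "Keywords" mapping built in Example #1.
    return {str(kw["name"]): kw["count"] for kw in pdfid_data["keywords"]["keyword"]}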
Example #2
    def _parse(self, filepath):
        """Parses the PDF for static information. Uses PyV8 from peepdf to
        extract JavaScript from PDF objects.
        @param filepath: Path to file to be analyzed.
        @return: results dict or None.
        """
        # Load the PDF with PDFiD and convert it to JSON for processing
        pdf_data = PDFiD(filepath, False, True)
        pdf_json = PDFiD2JSON(pdf_data, True)
        pdfid_data = json.loads(pdf_json)[0]

        info = {}
        info["PDF Header"] = pdfid_data['pdfid']['header']
        info["Total Entropy"] = pdfid_data['pdfid']['totalEntropy']
        info['Entropy In Streams'] = pdfid_data['pdfid']['streamEntropy']
        info['Entropy Out Streams'] = pdfid_data['pdfid']['nonStreamEntropy']
        info['Count %% EOF'] = pdfid_data['pdfid']['countEof']
        info['Data After EOF'] = pdfid_data['pdfid']['countChatAfterLastEof']
        # Note: PDFiD doesn't interpret some dates properly; in particular it
        # can't represent time zones that are offset by a fraction of an hour.
        dates = pdfid_data['pdfid']['dates']['date']

        # Get keywords, counts and format.
        keywords = {}
        for keyword in pdfid_data['pdfid']['keywords']['keyword']:
            keywords[str(keyword['name'])] = keyword['count']

        result = {}
        result["Info"] = info
        result["Dates"] = dates
        result["Keywords"] = keywords

        log.debug("About to parse with PDFParser")
        parser = PDFParser()
        ret, self.pdf = parser.parse(filepath,
                                     forceMode=True,
                                     looseMode=True,
                                     manualAnalysis=False)
        urlset = set()
        annoturiset = set()
        objects = []
        retobjects = []
        metadata = dict()

        self._set_base_uri()

        for i in range(len(self.pdf.body)):
            body = self.pdf.body[i]
            metatmp = self.pdf.getBasicMetadata(i)
            if metatmp:
                metadata = metatmp

            objects = body.objects

            for index in objects:
                oid = objects[index].id
                offset = objects[index].offset
                size = objects[index].size
                details = objects[index].object

                obj_data = {}
                obj_data["Object ID"] = oid
                obj_data["Offset"] = offset
                obj_data["Size"] = size
                if details.type == 'stream':
                    encoded_stream = details.encodedStream
                    decoded_stream = details.decodedStream
                    if HAVE_PYV8:
                        jsdata = None
                        try:
                            jslist, unescapedbytes, urlsfound, errors, ctxdummy = analyseJS(
                                decoded_stream.strip())
                            jsdata = jslist[0]
                        except Exception:
                            continue
                        if errors:
                            continue
                        if jsdata is None:
                            continue

                        for url in urlsfound:
                            urlset.add(url)

                        # The following loop "JSONifies" the string returned from PyV8.
                        # PyV8 returns byte strings, so non-ASCII bytes are replaced with
                        # "\xNN" escapes. We can't use encode("string_escape") because it
                        # would also mangle the newlines used to beautify the JavaScript
                        # for Django's web interface.
                        ret_data = ""
                        for x in xrange(len(jsdata)):
                            if ord(jsdata[x]) > 127:
                                tmp = "\\x" + str(jsdata[x].encode("hex"))
                            else:
                                tmp = jsdata[x]
                            ret_data += tmp
                    else:
                        continue

                    obj_data["Data"] = ret_data
                    retobjects.append(obj_data)
                elif details.type == "dictionary" and details.hasElement("/A"):
                    # verify it to be a link type annotation
                    subtype_elem = details.getElementByName("/Subtype")
                    type_elem = details.getElementByName("/Type")
                    if not subtype_elem or not type_elem:
                        continue
                    subtype_elem = self._get_obj_val(i, subtype_elem)
                    type_elem = self._get_obj_val(i, type_elem)
                    if (subtype_elem.getValue() != "/Link" or
                            type_elem.getValue() != "/Annot"):
                        continue
                    a_elem = details.getElementByName("/A")
                    a_elem = self._get_obj_val(i, a_elem)
                    if a_elem.type == "dictionary" and a_elem.hasElement(
                            "/URI"):
                        uri_elem = a_elem.getElementByName("/URI")
                        uri_elem = self._get_obj_val(i, uri_elem)
                        annoturiset.add(self.base_uri + uri_elem.getValue())
                else:
                    # can be dictionaries, arrays, etc, don't bother displaying them
                    # all for now
                    pass
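
The escaping loop in the stream branch above (and again in Example #3) replaces every byte above 0x7F with a literal "\xNN" escape while leaving newlines untouched, so the beautified JavaScript still renders properly in the web interface. A standalone sketch of that loop (assuming jsdata is a Python 2 byte string, as returned by PyV8):

def escape_non_ascii(jsdata):
    # Bytes outside the ASCII range become "\xNN" escapes; everything else,
    # including the newlines used for beautification, passes through as-is.
    out = []
    for ch in jsdata:
        if ord(ch) > 127:
            out.append("\\x%02x" % ord(ch))
        else:
            out.append(ch)
    return "".join(out)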
Example #3
    def _parse(self, filepath):
        """Parses the PDF for static information. Uses PyV8 from peepdf to
        extract JavaScript from PDF objects.
        @param filepath: Path to file to be analyzed.
        @return: results dict or None.
        """
        # Load the PDF with PDFiD and convert it to JSON for processing
        pdf_data = PDFiD(filepath, False, True)
        pdf_json = PDFiD2JSON(pdf_data, True)
        pdfid_data = json.loads(pdf_json)[0]

        info = {}
        info["PDF Header"] = pdfid_data['pdfid']['header']
        info["Total Entropy"] = pdfid_data['pdfid']['totalEntropy']
        info['Entropy In Streams'] = pdfid_data['pdfid']['streamEntropy']
        info['Entropy Out Streams'] = pdfid_data['pdfid']['nonStreamEntropy']
        info['Count %% EOF'] = pdfid_data['pdfid']['countEof']
        info['Data After EOF'] = pdfid_data['pdfid']['countChatAfterLastEof']
        dates = pdfid_data['pdfid']['dates']['date']

        # Get keyword names and counts (reported under "Streams").
        streams = {}
        for stream in pdfid_data['pdfid']['keywords']['keyword']:
            streams[str(stream['name'])] = stream['count']

        result = {}
        result["Info"] = info
        result["Dates"] = dates
        result["Streams"] = streams

        log.debug("About to parse with PDFParser")
        parser = PDFParser()
        ret, pdf = parser.parse(filepath, True, False)
        objects = []
        retobjects = []
        object_counter = 1

        for i in range(len(pdf.body)):
            body = pdf.body[i]
            objects = body.objects

            for index in objects:
                oid = objects[index].id
                offset = objects[index].offset
                size = objects[index].size
                details = objects[index].object

                obj_data = {}
                obj_data["Object ID"] = oid
                obj_data["Offset"] = offset
                obj_data["Size"] = size
                if details.type == 'stream':
                    encoded_stream = details.encodedStream
                    decoded_stream = details.decodedStream
                    obj_data["File Type"] = _get_filetype(decoded_stream)[:100]
                    if HAVE_PYV8:
                        try:
                            jsdata = analyseJS(decoded_stream.strip())[0][0]
                        except Exception:
                            jsdata = "PyV8 failed to parse the stream."
                        if jsdata is None:
                            jsdata = "PyV8 did not detect JavaScript in the stream. (Possibly encrypted)"

                        # The following loop "JSONifies" the string returned from PyV8.
                        # PyV8 returns byte strings, so non-ASCII bytes are replaced with
                        # "\xNN" escapes. We can't use encode("string_escape") because it
                        # would also mangle the newlines used to beautify the JavaScript
                        # for Django's web interface.
                        ret_data = ""
                        for x in xrange(len(jsdata)):
                            if ord(jsdata[x]) > 127:
                                tmp = "\\x" + str(jsdata[x].encode("hex"))
                            else:
                                tmp = jsdata[x]
                            ret_data += tmp
                    else:
                        ret_data = "PyV8 not installed, unable to extract JavaScript."

                    obj_data["Data"] = ret_data
                    retobjects.append(obj_data)
                    object_counter += 1

                else:
                    obj_data["File Type"] = "Encoded"
                    obj_data["Data"] = "Encoded"
                    retobjects.append(obj_data)

            result["Objects"] = retobjects

        return result