Exemplos de HTML.HTMLParser em Python, exemplos de FileFormats.HTML.HTMLParser em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: LiveCom.py Projeto: ntvis/pyflag

    def stats(self, query, result):
        """Start a statistics table for the webmail message behind this object.

        Looks up the ``webmail_messages`` row whose ``inode_id`` equals
        ``self.lookup_id()``, emits the ``mtime`` of the matching ``inode``
        row as a "Timestamp" row on ``result``, and replaces the ``message``
        column of the fetched row with a wrapped, re-rendered version.

        NOTE(review): this excerpt appears to stop inside the column loop;
        whatever emits the remaining columns is not visible here.
        NOTE(review): neither fetch() result is checked before it is
        indexed -- confirm a matching row always exists.
        """
        result.start_table(**{'class': 'GeneralTable'})
        dbh = DB.DBO(self.case)
        ## Columns of the webmail_messages row that are of interest:
        columns = [
            "service", "type", "From", "To", "CC", "BCC", "sent", "subject",
            "message"
        ]
        dbh.execute("select * from webmail_messages where `inode_id`=%r",
                    self.lookup_id())
        row = dbh.fetch()

        ## A second handle is used to read the inode record of the message:
        dbh2 = DB.DBO(self.case)
        dbh2.execute("select * from inode where inode_id = %r",
                     row['inode_id'])
        row2 = dbh2.fetch()
        result.row("Timestamp", row2['mtime'])

        for c in columns:
            if c == 'message':
                ## Filter the message out here: the stored body is decoded
                ## and re-parsed with tags curried with this case and the
                ## parent inode id.
                parser = HTML.HTMLParser(tag_class = \
                                         FlagFramework.Curry(HTML.ResolvingHTMLTag,
                                                             case = self.case,
                                                             inode_id = row['parent_inode_id']))
                #parser = HTML.HTMLParser(tag_class = HTML.TextTag)
                ## A missing/empty body is parsed as the empty string:
                parser.feed(HTML.decode(row[c] or ""))
                parser.close()
                #tmp = result.__class__(result)
                #tmp.text(parser.root.innerHTML(), font='typewriter', wrap='full')
                #row[c] = tmp
                ## Store the string form of the parse tree, re-wrapped by
                ## textwrap.fill, back into the row:
                r = parser.root.__str__()
                r = textwrap.fill(r)
                row[c] = r

Exemplo n.º 2

0

Exibir arquivo

Arquivo: LiveCom.py Projeto: ntvis/pyflag

    def sanitize_page(self, tag_class):
        """Return the parent page re-rendered through ``tag_class``.

        The parent inode is read in full, parsed, handed to
        ``self.fixup_page`` and, when the page has a <title> tag, its title
        is extended with the ``mtime`` and ``inode`` values from ``istat``.
        """
        ## Fetch the raw (undecoded) page from the case filesystem:
        filesystem = FileSystem.DBFS(self.case)
        page_fd = filesystem.open(inode_id=self.parent_inode_id)
        raw_page = page_fd.read()

        ## FIXME - hack: this relies on tag_class being a curried class
        ## that exposes a kwargs dict; anything else is left untouched.
        try:
            tag_class.kwargs['inode_id'] = self.parent_inode_id
        except AttributeError:
            pass

        ## Build the parse tree:
        page_parser = HTML.HTMLParser(tag_class=tag_class)
        page_parser.feed(raw_page)
        page_parser.close()

        ## Give the subclass hook a chance to repair the page:
        tree = page_parser.root
        self.fixup_page(tree, tag_class)

        ## Stamp the title so a printout can be identified later:
        stat = filesystem.istat(inode_id=self.parent_inode_id)
        title = tree.find("title")
        if title:
            stamped = "%s %s %s" % (title.innerHTML(), stat['mtime'],
                                    stat['inode'])
            title.children = [stamped]

        return tree.innerHTML()

Exemplo n.º 3

0

Exibir arquivo

Arquivo: Gmail.py Projeto: johnmccabe/pyflag

    def scan(self, fd, scanners, type, mime, cookie, scores=None, **args):
        """Feed Gmail HTML or javascript content to ``self.process_js``.

        Nothing is done unless ``scores`` carries a non-zero
        ``GmailStreamMagic`` entry.  For a mime type containing "html"
        every <script> element is parsed as javascript and processed; for
        a mime type containing "javascript" the whole file is parsed and
        processed.

        ``scores`` may be omitted (or None), in which case the file is
        skipped instead of raising AttributeError on ``None.get``.
        """
        if scores is None:
            scores = {}

        if scores.get('GmailStreamMagic', 0) == 0:
            return

        pyflaglog.log(pyflaglog.DEBUG,
                      "Opening %s for Gmail processing" % fd.inode_id)
        self.current_time = None
        self.current_box = 'Unknown'

        if "html" in mime:
            html_parser = HTML.HTMLParser()
            html_parser.parse_fd(fd)
            html_parser.close()

            ## Process all script segments
            for script_tag in html_parser.root.search("script"):
                script = script_tag.innerHTML()
                try:
                    j = Javascript.JSParser()
                    j.feed(script)
                    j.close()
                except Exception:
                    ## Best effort: a script that cannot be parsed is
                    ## skipped, but KeyboardInterrupt/SystemExit are no
                    ## longer swallowed.
                    continue

                self.process_js(j.root, fd)

        elif "javascript" in mime:
            ## Make a new parser
            j = Javascript.JSParser()
            j.parse_fd(fd)
            j.close()

            self.process_js(j.root, fd)

Exemplo n.º 4

0

Exibir arquivo

    def process_string(self, fd, string):
        """Parse ``string`` as HTML and run both message handlers on it.

        The closed parser is passed, together with ``fd``, first to
        ``process_readmessage`` and then to ``process_listing``.
        """
        html_parser = HTML.HTMLParser(verbose=0)
        html_parser.feed(string)
        html_parser.close()

        ## Both handlers work on the same, fully parsed document.
        self.process_readmessage(fd, html_parser)
        self.process_listing(fd, html_parser)

Exemplo n.º 5

0

Exibir arquivo

Arquivo: LiveCom.py Projeto: ntvis/pyflag

    def display(self, value, row, result):
        """Write ``value``, re-rendered through ``HTML.TextTag``, to ``result``.

        A false ``value`` (None, empty string) is parsed as the empty string.
        """
        text_parser = HTML.HTMLParser(tag_class=HTML.TextTag)
        text_parser.feed(value or '')
        text_parser.close()

        rendered = text_parser.root.innerHTML()
        result.text(rendered, wrap='full', font='typewriter')

Exemplo n.º 6

0

Exibir arquivo

Arquivo: ViewFile.py Projeto: olivierh59500/pyflag

        def generator():
            """Yield, as a single chunk, the re-rendered HTML read from ``fd``.

            At most ``fd.read(1000000)`` worth of data is parsed; tags are
            built by ``HTML.ResolvingHTMLTag`` curried with the inode id of
            ``fd`` and the current case.
            """
            resolving_tag = Curry(HTML.ResolvingHTMLTag,
                                  inode_id=fd.lookup_id(),
                                  case=self.case)
            html_parser = HTML.HTMLParser(tag_class=resolving_tag)
            #html_parser = HTML.HTMLParser(tag_class = HTML.Tag)
            html_parser.feed(fd.read(1000000))
            html_parser.close()

            yield html_parser.root.innerHTML()

Exemplo n.º 7

0

Exibir arquivo

Arquivo: LiveCom.py Projeto: ntvis/pyflag

        def boring(self, metadata, data=''):
            """Return False only for data that looks like a Windows Live page.

            The data is interesting when the base class does not consider
            it boring and it matches ``<title>\\s+Windows Live``.  In that
            case ``self.parser`` is created if it is not set yet.
            """
            ## The base class gets the first say:
            if Scanner.StoreAndScanType.boring(self, metadata, data):
                return True

            ## Raw string: "\s" in a plain literal is an invalid escape
            ## sequence that newer Python versions warn about.
            if not re.search(r"<title>\s+Windows Live", data):
                return True

            ## Make a new parser:
            if not self.parser:
                self.parser = HTML.HTMLParser(verbose=0)
            return False

Exemplo n.º 8

0

Exibir arquivo

    def render_html(self, inode_id, table_renderer):
        """Return the file stored under ``inode_id`` rendered through
        ``HTML.SanitizingTag``.

        The file is opened from the case of ``table_renderer`` and read in
        one go (``fd.size`` bytes).
        """
        import plugins.TableRenderers.HTMLBundle as HTMLBundle

        filesystem = FileSystem.DBFS(table_renderer.case)
        source = filesystem.open(inode_id=inode_id)

        sanitizer = HTML.HTMLParser(tag_class=HTML.SanitizingTag)
        sanitizer.feed(source.read(source.size))
        sanitizer.close()

        return sanitizer.root.innerHTML()

Exemplo n.º 9

0

Exibir arquivo

        def boring(self, metadata, data=''):
            """Return False only for data that looks like a Yahoo! Mail page.

            The base class is consulted with empty data; when it does not
            find the file boring and ``data`` matches
            ``<title>[^<]+Yahoo! Mail``, ``self.username`` is reset and
            ``self.parser`` is created if it is not set yet.
            """
            if Scanner.StoreAndScanType.boring(self, metadata, data=''):
                return True

            ## The title must mention Yahoo! Mail:
            if not re.search("<title>[^<]+Yahoo! Mail", data):
                return True

            self.username = None
            ## Make a new parser:
            if not self.parser:
                self.parser = HTML.HTMLParser(verbose=0)
            return False

Exemplo n.º 10

0

Exibir arquivo

Arquivo: LiveCom.py Projeto: ntvis/pyflag

    def render_html(self, value, table_renderer):
        """Return ``value`` rendered as wrapped typewriter text.

        ``value`` (or the empty string when it is false) is parsed with
        ``HTML.TextTag`` and the result is written into a fresh
        ``HTMLUI`` whose string form is returned.
        """
        import plugins.TableRenderers.HTMLBundle as HTMLBundle

        text_parser = HTML.HTMLParser(tag_class=HTML.TextTag)
        text_parser.feed(value or '')
        text_parser.close()
        plain = text_parser.root.innerHTML()

        ## Make sure its wrapped:
        wrapper = HTMLUI.HTMLUI(initial=True)
        wrapper.text(plain, wrap='full', font='typewriter')
        return wrapper.__str__()

Exemplo n.º 11

0

Exibir arquivo

Arquivo: LiveCom.py Projeto: ntvis/pyflag

 def fixup_page(self, root, tag_class):
     """Inject ``self.message`` into the edit area of the page in ``root``.

     The first match among the known edit-area tags is emptied, given the
     decoded, re-parsed and re-wrapped message as its only child and
     renamed to ``div``.  Pages without an edit area are left unchanged.
     """
     ## Candidate locations of the edit area, in order of preference:
     candidates = (
         ("div", {"class":"EditArea"}),
         ("div", {"id":"MsgContainer"}),
         ("textarea", {"id":"fMessageBody"}),
     )
     edit_area = None
     for tag_name, attributes in candidates:
         edit_area = root.find(tag_name, attributes)
         if edit_area:
             break

     if not edit_area:
         return

     message_parser = HTML.HTMLParser(tag_class=tag_class)
     message_parser.feed(HTML.decode(self.message))
     message_parser.close()
     rendered = textwrap.fill(message_parser.root.__str__())

     ## Replace whatever was in the edit area with the message:
     edit_area.prune()
     edit_area.add_child(rendered)
     edit_area.name = 'div'

Exemplo n.º 12

0

Exibir arquivo

        def boring(self, metadata, data=''):
            """Return False only for Yahoo web 2.0 XML responses.

            The base class is consulted with empty data; when it does not
            find the file boring and ``data`` contains one of the known
            response tags, the tag name is stored in ``self.context`` and
            ``self.parser`` is created if it is not set yet.
            """
            if Scanner.StoreAndScanType.boring(self, metadata, data=''):
                return True

            ## Yahoo web 2.0 is nice to work with - responses are XML, so
            ## the opening tag of the response tells us what we have:
            response = re.search(
                "<(GetDisplayMessageResponse|ListMessagesResponse|SendMessageResponse)",
                data)
            if not response:
                return True

            self.context = response.group(1)
            ## Make a new parser:
            if not self.parser:
                self.parser = HTML.HTMLParser(verbose=0)
            return False

Exemplo n.º 13

0

Exibir arquivo

Arquivo: yahoo_mail_versions.py Projeto: olivierh59500/pyflag

def insert_message(self, result, inode_template="l%s"):
    """Print every item of ``result`` and return True.

    No database is touched here.  When ``result`` has a ``Message`` entry
    it is first replaced by the string form of its ``HTML.TextTag`` parse
    tree; a missing entry is silently ignored.
    """
    try:
        ## Try to render the html as text:
        body = unicode(result['Message'])
        text_parser = HTML.HTMLParser(tag_class=HTML.TextTag)
        text_parser.feed(body)
        text_parser.close()
        result['Message'] = text_parser.root.__str__()
    except KeyError:
        pass

    ## A single parenthesised argument prints identically as a print
    ## statement and as a print() call.
    for key, val in result.items():
        print("   %s: %r" % (key, val))

    return True

Exemplo n.º 14

0

Exibir arquivo

Arquivo: YahooMail.py Projeto: johnmccabe/pyflag

    def scan(self, fd, scanners, type, mime, cookie, **args):
        """Parse a "Yahoo Mail AJAX" file and process read-message responses.

        The file is fed to a fresh ``self.parser`` in 1MB chunks; the first
        non-empty chunk is kept in ``self.context``.  When that chunk
        contains ``GetDisplayMessageResponse`` the file is handed to
        ``self.process_readmessage``.  Files of any other type are ignored.
        """
        if "Yahoo Mail AJAX" not in type:
            return

        self.parser = HTML.HTMLParser(verbose=0)
        pyflaglog.log(
            pyflaglog.DEBUG,
            "Opening %s for YahooMail2.0 processing" % fd.inode_id)

        ## Read all the data into the parser
        self.context = None
        while 1:
            data = fd.read(1024 * 1024)
            if not data:
                break

            if not self.context:
                self.context = data
            self.parser.feed(data)

        self.parser.close()

        ## An empty file leaves the context unset; testing "in None" would
        ## raise TypeError, so there is nothing to process in that case.
        if self.context and 'GetDisplayMessageResponse' in self.context:
            self.process_readmessage(fd)

Exemplo n.º 15

0

Exibir arquivo

    def fixup_page(self, result, message, tag_class):
        """Inject ``message`` into the edit area of the page in ``self.parser``.

        Returns None when ``message`` is empty.  Otherwise the first match
        among the known edit-area tags (if any) is emptied, given the
        decoded, re-parsed and re-wrapped message as its only child and
        renamed to ``div``; the HTML of the whole page is returned.
        """
        if not message:
            return

        ## Candidate locations of the edit area, in order of preference:
        candidates = (
            ("div", {"class":"EditArea"}),
            ("div", {"id":"MsgContainer"}),
            ("textarea", {"id":"fMessageBody"}),
        )
        edit_area = None
        for tag_name, attributes in candidates:
            edit_area = self.parser.root.find(tag_name, attributes)
            if edit_area:
                break

        if edit_area:
            message_parser = HTML.HTMLParser(tag_class=tag_class)
            message_parser.feed(HTML.decode(message))
            message_parser.close()
            rendered = textwrap.fill(message_parser.root.__str__())

            ## Replace whatever was in the edit area with the message:
            edit_area.prune()
            edit_area.add_child(rendered)
            edit_area.name = 'div'

        return self.parser.root.innerHTML()

Exemplo n.º 16

0

Exibir arquivo

Arquivo: LiveCom.py Projeto: ntvis/pyflag

        def process_readmessage(self, message):
            """Extract the fields of a rendered "read message" page.

            ``message`` is parsed as HTML and the following entries are
            collected into a result dict that is passed to
            ``self.insert_message`` (whose return value is returned):

            - ``Subject``: content of the ``td`` with class ReadMsgSubject
            - ``From`` / ``To`` / ``Sent``: content of the ``td`` following
              a ``td`` whose text starts with the respective label
            - ``Message``: content of the ``div`` with class ReadMsgContainer
            - ``message_id``: the ``mid`` attribute of the first ``div``
              found by ``find('div', {'mid': '.'})``
            """
            parser = HTML.HTMLParser(verbose=0)
            parser.feed(message)
            parser.close()

            result = {'type': 'Read', 'Message': ''}

            ## Find the subject
            sbj = parser.root.find('td', {'class': 'ReadMsgSubject'})
            if sbj:
                result['Subject'] = HTML.decode_entity(sbj.innerHTML())

            ## A label cell announces which field the next cell holds:
            context = None
            for td in parser.root.search('td'):
                data = td.innerHTML()
                if context:
                    result[context] = HTML.decode_entity(data)
                    context = None

                label = data.lower()
                if label.startswith('from:'):
                    context = 'From'
                elif label.startswith('to:'):
                    context = 'To'
                elif label.startswith('sent:'):
                    context = 'Sent'

            msg = parser.root.find('div', {'class': 'ReadMsgContainer'})
            if msg:
                result['Message'] = msg.innerHTML()

            ## Try to detect the message ID
            tag = parser.root.find('div', {'mid': '.'})
            if tag:
                result['message_id'] = tag['mid']

            ## Convert the timestamp.  The loop variable cannot be used for
            ## this: it is reset as soon as a value is consumed, so it is
            ## normally None here and the conversion never happened.
            if 'Sent' in result:
                try:
                    result['Sent'] = Time.parse(result['Sent'])
                except Exception:
                    ## Best effort: keep the raw string if it cannot be
                    ## parsed.
                    pass

            return self.insert_message(result, inode_template='l%s')

Exemplo n.º 17

0

Exibir arquivo

Arquivo: YahooMail.py Projeto: johnmccabe/pyflag

    def process_send_message(self, fd):
        """Record a sent Yahoo message found via the ``body`` HTTP parameter.

        The ``http_parameters`` row with key ``body`` for ``self.fd`` is
        looked up; its ``indirect`` inode is parsed and the sender,
        recipient, text and subject found in it are passed to
        ``self.insert_message``.  Nothing happens when the row or its
        ``indirect`` value is missing.

        NOTE(review): the query uses ``self.fd.inode_id`` rather than the
        ``fd`` argument -- confirm this is intended.
        """
        dbh = DB.DBO(self.case)
        dbh.execute(
            "select `key`,`value`,`indirect` from http_parameters where `key`='body' and inode_id = %r limit 1",
            self.fd.inode_id)
        row = dbh.fetch()
        if not row:
            return

        body_inode_id = row['indirect']
        if not body_inode_id:
            return

        ## Need to parse the sent message
        filesystem = FileSystem.DBFS(self.case)
        body_fd = filesystem.open(inode_id=body_inode_id)
        self.parser = HTML.HTMLParser(verbose=0)
        self.parser.feed(body_fd.read())
        self.parser.close()
        root = self.parser.root

        result = {'type': 'Edit Sent'}
        result['From'] = self.parse_email_address(root, 'from')
        result['To'] = self.parse_email_address(root, 'to')

        ## Optional fields: a field is left out when its tag cannot be
        ## read.  (Sometimes an "html" version is supplied as well; it is
        ## not used.)
        for field, tag_name in (('message', "text"), ('subject', "subject")):
            try:
                result[field] = root.find(tag_name).innerHTML()
            except:
                pass

        self.insert_message(result, "webmail")

Exemplo n.º 18

0

Exibir arquivo

    def scan(self, fd, scanners, type, mime, cookie, **args):
        """Parse a Windows Live HTML page and run the Hotmail processors.

        Only files whose ``type`` contains "HTML" and whose first 1024
        bytes match ``<title>\\s+Windows Live`` are handled.  The file is
        fed to a fresh ``self.parser`` in 1024 byte chunks (decoded as
        utf8, ignoring errors), draining the tokens after each chunk, and
        then passed to the send/edit/read/listing processors.
        """
        if "HTML" in type:
            data = fd.read(1024)
            ## Raw string: "\s" in a plain literal is an invalid escape
            ## sequence that newer Python versions warn about.
            if not re.search(r"<title>\s+Windows Live", data):
                return

            ## Ok - we know its a Live page
            pyflaglog.log(
                pyflaglog.DEBUG, "Opening (%s) %s for Hotmail processing" %
                (fd.inode_id, fd.urn))
            self.parser = HTML.HTMLParser(verbose=0)
            self.parser.feed(data.decode("utf8", "ignore"))

            ## The loop ends after the first empty read (which is still
            ## fed to the parser as an empty string).
            while len(data) > 0:
                data = fd.read(1024)
                self.parser.feed(data.decode("utf8", "ignore"))
                ## Get all the tokens
                while self.parser.next_token(True):
                    pass

            ## Now we should be able to parse the data out:
            self.process_send_message(fd)
            self.process_editread(fd)
            self.process_readmessage(fd)
            self.process_mail_listing(fd)

Exemplo n.º 19

0

Exibir arquivo

 def sanitize_data(self, data, value, result):
     """Create an HTML parser whose tags are ``MessageTags`` curried with
     this case and ``value`` as the inode id.

     NOTE(review): this excerpt appears to be truncated -- the parser is
     created but nothing visible here uses it.
     """
     message_tag = FlagFramework.Curry(MessageTags,
                                       case=self.case,
                                       inode_id=value)
     parser = HTML.HTMLParser(tag_class=message_tag)

Exemplo n.º 20

0

Exibir arquivo

Arquivo: LiveCom.py Projeto: ntvis/pyflag

    def xxxdisplay(self, value, row, result):
        """Return ``value`` re-rendered through ``HTML.SanitizingTag``."""
        sanitizer = HTML.HTMLParser(tag_class=HTML.SanitizingTag)
        sanitizer.feed(value)
        sanitizer.close()

        sanitized = sanitizer.root.innerHTML()
        return sanitized

Exemplo n.º 21

0

Exibir arquivo

    def scan(self, fd, scanners, type, mime, cookie, **args):
        """Rebuild the result table of a "Google Image Search" page.

        The page is parsed, every ``dyn.Img(...)`` call found in its
        <script> elements is turned into an image cell and a text cell,
        and the cells are grouped five to a row pair.  When anything was
        found, the table is appended to the ``ImgContent`` div (if the
        page has one) and the resulting page is stored as cache data under
        ``<fd.urn>/Gimage``.  Files of any other type are ignored.
        """
        if "Google Image Search" in type:
            pyflaglog.log(pyflaglog.DEBUG,"Opening %s for Google image search processing" % fd.inode_id)
            ## Parse the file
            self.parser = HTML.HTMLParser()
            self.parser.feed(fd.read())
            self.parser.close()

            ## Pull out all the scripts and match the regex:
            result = ''
            image_text = ''
            text_text = ''
            count = 0
            total_count = 0
            ## Raw string: "\(" in a plain literal is an invalid escape
            ## sequence that newer Python versions warn about.
            regex = re.compile(r'dyn.Img(\(.+?\));')
            for script in self.parser.root.search("script"):
                data = script.innerHTML()
                for m in regex.finditer(data):
                    ## SECURITY(review): this evaluates text taken from the
                    ## scanned page.  Passing empty dicts does not remove
                    ## the builtins, so a crafted page can run arbitrary
                    ## code.  Consider ast.literal_eval if the arguments
                    ## are always plain literals.
                    row = eval(m.group(1),{},{})
                    image_text += '''\n<td id="tDataImage%s" nowrap="" width="16%%" valign="bottom" align="center" style="padding-top: 0px;">
                    <a href="%s">
                    <img height="%s" width="%s" src="%s?q=tbn:%s%s" style="border: 1px solid ;"/>
                    </a>
                    </td>\n''' % (total_count, row[0], row[5], row[4], row[14], row[2], row[3])

                    text_text += '''<td id="tDataText%s" width="16%%" valign="top" align="center">
                    <font face="arial,sans-serif" size="-1">
                    %s
                    <br/>
                    %s - %s
                    <br/>
                    <font color="#008000">%s</font>
                    </font>
                    </td>''' % (total_count, row[6], row[9], row[10], row[11])

                    count += 1
                    total_count += 1

                    ## Five cells make a full row pair (images above,
                    ## captions below):
                    if count >= 5:
                        result += "<tr>%s</tr>\n<tr>%s</tr>\n" % (image_text, text_text)
                        image_text = ''
                        text_text = ''
                        count = 0

            ## Flush a trailing, incomplete row pair:
            if image_text:
                result += "<tr>%s</tr>\n<tr>%s</tr>\n" % (image_text, text_text)

            if result:
                ## Prepare the new page
                tag = self.parser.root.find("div", {"id":"ImgContent"})
                if tag:
                    result = "<table>%s</table>" % result
                    tag.add_child(result)

                page = self.parser.root.innerHTML()
                page = page.encode("utf8","ignore")

                new_fd = CacheManager.AFF4_MANAGER.create_cache_data(
                    fd.case,
                    "%s/Gimage" % fd.urn,
                    page, inherited = fd.urn)

                new_fd.close()