示例#1
0
    def unknown_starttag(self, tag, attrs):
        """Create a DOM node for an opening tag and push it on the DOM stack.

        While ``self.in_Script`` is set, the raw start-tag text is forwarded
        to ``handle_data`` and no node is created. Nothing is created either
        once ``self.endearly`` is set.

        tag   -- tag name; it is compared against lower-case literals and
                 searched for in the lower-cased page source.
        attrs -- iterable of (name, value) attribute pairs.
        """
        if self.in_Script:
            # Inside a script body, markup is treated as script text.
            self.handle_data(self.get_starttag_text())
            return

        config.VERBOSE(config.VERBOSE_DEBUG,
                       "[DEBUG] [PageParser.py] Tag: " + tag)
        if self.endearly:
            return

        domobj = DOMObject(self.__dict__['__window'], tag, self)
        # The attribute name seen in the tag is not always the real attribute
        # name, so it is translated first. This is the IE way; for Firefox the
        # translation is done in DOMObject.setAttribute().
        for name, value in attrs:
            domobj.setAttribute(dataetc.attrTrans(name, tag), value)
            if dataetc.isevent(name.lower(), tag):
                self.emulate_timeout(name, value)

        if tag == 'script':
            domobj.__dict__['script'] = ''

        if config.retrieval_all and 'src' in domobj.__dict__:
            # The response is not used in this method; only the request
            # itself is issued.
            src = self.__dict__['__window'].document.location.fix_url(
                domobj.src)
            hc.get(src, self.__dict__['__window'].document.location.href)

        try:
            # Lower-case the page once and search from an offset, instead of
            # lower-casing and slicing the whole source for every lookup.
            lowered = self.html.lower()
            tag_at = lowered.index('<' + tag, self.current)
            # Advance the cursor to just past the '>' ending this start tag.
            self.current = lowered.index('>', tag_at) + 1
            domobj.__dict__['begin'] = self.current
            domobj.__dict__['end'] = lowered.index('</' + tag, self.current)

            if (tag == 'div' and attrs) or tag == 'body':
                domobj.innerHTML = self.html[domobj.__dict__['begin']:
                                             domobj.__dict__['end']]
        except Exception:
            # Best effort: a tag that cannot be located in the source (or has
            # no closing tag) simply gets no begin/end/innerHTML bookkeeping.
            pass

        self.DOM_stack[-1].appendChild(domobj)
        self.DOM_stack.append(domobj)

        if tag == 'form':
            self.__dict__['__window'].__dict__['__fl'].append(domobj)
        if tag == 'br' or tag == 'meta':
            self.unknown_endtag(tag)  # <br> and <meta> have no end tag.
        if tag == 'select':
            self.lastselect = domobj
        if tag == 'option':
            try:
                self.lastselect.options.append(domobj)
            except Exception:
                # Best effort: e.g. no usable ``lastselect`` to attach to.
                pass
示例#2
0
    def start_script(self, attrs):
        """Handle an opening <script> tag.

        A tag whose ``language`` attribute does not start with "javascript"
        is skipped: ``self.ignoreScript`` is set and nothing else happens.
        Otherwise the tag is opened through ``unknown_starttag``, the parser
        is switched into script mode, and the element on top of the DOM stack
        is appended to the window's ``__sl`` list. When that element has a
        ``src``, the referenced script is fetched, added to its ``script``
        text, and the element is closed right away with ``end_script``.

        attrs -- iterable of (name, value) attribute pairs.
        """
        foreign_language = any(
            key.lower() == 'language'
            and not val.lower().startswith('javascript')
            for key, val in attrs)
        if foreign_language:
            config.VERBOSE(
                config.VERBOSE_DEBUG,
                "[DEBUG] in PageParser.py: Ignoring(ignoreScript) start_object attrs: "
                + str(attrs))
            self.ignoreScript = True
            return

        self.unknown_starttag('script', attrs)
        self.in_Script = True
        self.literal = 1

        window = self.__dict__['__window']
        node = self.DOM_stack[-1]
        external = 'src' in node.__dict__

        if external:
            src = window.document.location.fix_url(node.src)
            body, _headers = hc.get(src, window.document.location.href)
            if config.replace_nonascii:
                body = re.sub('[\x80-\xff]', ' ', body)
            node.__dict__['script'] += body

        window.__dict__['__sl'].append(node)

        if external:
            # An external script has no inline body to wait for.
            self.end_script()
示例#3
0
 def start_frame(self, attrs):
     """Open a <frame> element and fetch its ``src`` target, if it has one.

     The fetched body is stored as ``__dict__['frame']`` on the element at
     the top of the DOM stack.

     attrs -- attribute pairs, passed through to ``unknown_starttag``.
     """
     self.unknown_starttag('frame', attrs)

     node = self.DOM_stack[-1]
     if 'src' not in node.__dict__:
         return

     window = self.__dict__['__window']
     target = window.document.location.fix_url(node.src)
     # The current page's href is passed as the second argument of hc.get.
     content, _headers = hc.get(target, window.document.location.href)
     node.__dict__['frame'] = content
示例#4
0
 def start_script(self, attrs):
     """Begin processing of a <script> element.

     If any ``language`` attribute names something other than JavaScript,
     the script is flagged via ``self.ignoreScript`` and skipped. Otherwise
     the element is opened, the parser enters script mode, and the element
     on top of the DOM stack is added to the window's ``__sl`` list. An
     element with ``src`` has the referenced script fetched and appended to
     its ``script`` text, and is then closed with ``end_script``.

     attrs -- iterable of (name, value) attribute pairs.
     """
     for attr_name, attr_value in attrs:
         if attr_name.lower() != 'language':
             continue
         if attr_value.lower().startswith('javascript'):
             continue
         config.VERBOSE(config.VERBOSE_DEBUG, "[DEBUG] in PageParser.py: Ignoring(ignoreScript) start_object attrs: " +str(attrs))
         self.ignoreScript = True
         return

     self.unknown_starttag('script', attrs)
     self.in_Script = True
     self.literal = 1

     element = self.DOM_stack[-1]
     if 'src' not in element.__dict__:
         # Inline script: only register the element.
         self.__dict__['__window'].__dict__['__sl'].append(element)
         return

     # External script: fetch it and append it to the element's script text.
     owner = self.__dict__['__window']
     resolved = owner.document.location.fix_url(element.src)
     source, _ = hc.get(resolved, owner.document.location.href)
     if config.replace_nonascii:
         source = re.sub('[\x80-\xff]', ' ', source)
     element.__dict__['script'] += source
     owner.__dict__['__sl'].append(element)
     self.end_script()
示例#5
0
 def start_embed(self, attrs):
     """Open an <embed> element and fetch its ``src`` target, if it has one.

     The fetched body is stored as ``__dict__['embed']`` on the element at
     the top of the DOM stack.

     attrs -- attribute pairs, passed through to ``unknown_starttag``.
     """
     self.unknown_starttag('embed', attrs)

     node = self.DOM_stack[-1]
     if 'src' not in node.__dict__:
         return

     window = self.__dict__['__window']
     target = window.document.location.fix_url(node.src)
     payload, _headers = hc.get(target, window.document.location.href)
     node.__dict__['embed'] = payload
示例#6
0
 def start_frame(self, attrs):
     """Handle an opening <frame> tag.

     The tag is opened via ``unknown_starttag``. If the element on top of
     the DOM stack then has a ``src`` entry, that URL is resolved against
     the document location and fetched, and the response body is kept in
     the element's ``__dict__`` under ``'frame'``.
     """
     self.unknown_starttag('frame', attrs)

     top = self.DOM_stack[-1]
     has_source = 'src' in top.__dict__
     if has_source:
         win = self.__dict__['__window']
         frame_url = win.document.location.fix_url(top.src)
         response = hc.get(frame_url, win.document.location.href)
         # hc.get is unpacked as a (body, headers) pair; headers are unused.
         frame_body, _unused = response
         top.__dict__['frame'] = frame_body
示例#7
0
    def unknown_starttag(self, tag, attrs):
        """Build a DOM object for a start tag and make it the current node.

        Does nothing but forward the raw tag text to ``handle_data`` while
        ``self.in_Script`` is set, and returns early when ``self.endearly``
        is set. Otherwise the new node gets its attributes, is appended to
        the node on top of ``self.DOM_stack`` and pushed onto that stack.

        tag   -- tag name; it is compared against lower-case literals and
                 searched for in the lower-cased page source.
        attrs -- iterable of (name, value) attribute pairs.
        """
        if self.in_Script:
            self.handle_data(self.get_starttag_text())
            return

        config.VERBOSE(config.VERBOSE_DEBUG, "[DEBUG] [PageParser.py] Tag: " + tag)
        if self.endearly:
            return

        domobj = DOMObject(self.__dict__['__window'], tag, self)
        # The attribute name seen in the tag is not always the real attribute
        # name, so it is translated first. This is the IE way; for Firefox the
        # translation is done in DOMObject.setAttribute().
        for name, value in attrs:
            domobj.setAttribute(dataetc.attrTrans(name, tag), value)
            if dataetc.isevent(name.lower(), tag):
                self.emulate_timeout(name, value)

        if tag == 'script':
            domobj.__dict__['script'] = ''

        if config.retrieval_all:
            if 'src' in domobj.__dict__:
                # Only the request is issued; its result is not used here.
                src = self.__dict__['__window'].document.location.fix_url(domobj.src)
                hc.get(src, self.__dict__['__window'].document.location.href)

        try:
            # One lower-cased copy of the page serves all three searches;
            # str.index() with a start offset replaces slicing the source.
            page = self.html.lower()
            open_at = page.index('<' + tag, self.current)
            close_at = page.index('>', open_at)

            # The cursor now points just past this start tag.
            self.current = close_at + 1
            domobj.__dict__['begin'] = self.current
            domobj.__dict__['end'] = page.index('</' + tag, self.current)

            if (tag == 'div' and attrs) or tag == 'body':
                domobj.innerHTML = self.html[domobj.__dict__['begin']:domobj.__dict__['end']]
        except Exception:
            # Best effort: when the tag or its end tag is not found the node
            # is kept, just without the begin/end/innerHTML information.
            pass

        self.DOM_stack[-1].appendChild(domobj)
        self.DOM_stack.append(domobj)

        if tag == 'form':
            self.__dict__['__window'].__dict__['__fl'].append(domobj)
        if tag == 'br' or tag == 'meta':
            self.unknown_endtag(tag) # <br> and <meta> have no end tag.
        if tag == 'select':
            self.lastselect = domobj
        if tag == 'option':
            try:
                self.lastselect.options.append(domobj)
            except Exception:
                # Best effort: e.g. no usable ``lastselect`` to attach to.
                pass
示例#8
0
    def handle_src(self, name, val):
        """Process a URL-valued attribute of this element.

        The value is resolved with the document location's ``fix_url``. When
        ``config.retrieval_all`` is set the resolved URL is fetched. A URL
        whose scheme is not http, file, https or ftp is logged as a warning
        and, if this element has an ``onerror`` entry, that handler is
        called. For an ``iframe`` element a new Window is created for the
        URL and its HTML is run through a new PageParser.

        name -- attribute name; only used in the warning message.
        val  -- attribute value (the unresolved URL).
        """
        owner = self.__dict__['__window']
        url = owner.document.location.fix_url(val)

        if config.retrieval_all:
            hc.get(url, owner.document.location.href)

        # Only the scheme component of the split URL is needed.
        scheme = urlparse.urlsplit(url)[0]
        if scheme not in ('http','file','https','ftp'):
            config.VERBOSE(config.VERBOSE_WARNING, "[WARNING] Got unknown scheme: %s in %s.%s ."%(url,self.tagName, name))
            if 'onerror' in self.__dict__:
                config.VERBOSE(config.VERBOSE_DEBUG, "[DEBUG] Calling onerror of %s."%(self.tagName))
                self.onerror()

        if self.tagName != "iframe":
            return

        # Imported here rather than at module level, as in the original.
        from Window import Window
        from PageParser import PageParser

        parent = self.__dict__['__window']
        frame_window = Window(
            parent.__dict__['__root'],
            parent.document.location.fix_url(val),
            parent.document.location.href)
        frame_parser = PageParser(
            frame_window, frame_window.document,
            frame_window.__dict__['__html'])
        frame_parser.close()
示例#9
0
    def __init_html(self):
        """Fetch this object's URL and store the body and header lines.

        The body goes to ``self.__dict__['__html']`` (with bytes \\x80-\\xff
        replaced by spaces when ``config.replace_nonascii`` is set) and each
        line of the returned headers is appended to
        ``self.__dict__['__headers']``. The referrer sent is the ``__url`` of
        ``self.__dict__['__referrer']`` when available, otherwise ''.

        Any exception raised while fetching or storing is printed with
        ``traceback.print_exc()`` and not propagated.
        """
        url = self.__dict__['__url']

        referrer_obj = self.__dict__['__referrer']
        if referrer_obj and '__url' in referrer_obj.__dict__:
            referrer = referrer_obj.__dict__['__url']
        else:
            referrer = ''

        try:
            self.__dict__['__html'], headers = hc.get(url, referrer)
            if config.replace_nonascii:
                self.__dict__['__html'] = re.sub('[\x80-\xff]', ' ', self.__dict__['__html'])
            self.__dict__['__headers'].extend(headers.splitlines())
        except Exception:
            # Best effort: report the failure and carry on.
            traceback.print_exc()
示例#10
0
    def __init_html(self):
        """Download the page for ``self.__dict__['__url']``.

        On success the response body is stored under ``'__html'`` and the
        header lines are added to the ``'__headers'`` list, both in
        ``self.__dict__``. If ``config.replace_nonascii`` is set, every
        character in the range \\x80-\\xff of the body is replaced by a space.
        The referrer passed to ``hc.get`` is the referrer object's ``__url``
        when there is one, otherwise an empty string.

        Errors are reported through ``traceback.print_exc()`` and swallowed.
        """
        state = self.__dict__
        url = state['__url']

        referrer = ''
        if state['__referrer'] and '__url' in state['__referrer'].__dict__:
            referrer = state['__referrer'].__dict__['__url']

        try:
            state['__html'], headers = hc.get(url, referrer)
            if config.replace_nonascii:
                state['__html'] = re.sub('[\x80-\xff]', ' ', state['__html'])
            state['__headers'].extend(headers.splitlines())
        except Exception:
            # Best effort: report the failure and carry on.
            traceback.print_exc()
示例#11
0
 def start_embed(self, attrs):
     """Handle an opening <embed> tag.

     The tag is opened via ``unknown_starttag``. If the element on top of
     the DOM stack then has a ``src`` entry, that URL is resolved against
     the document location and fetched, and the response body is kept in
     the element's ``__dict__`` under ``'embed'``.
     """
     self.unknown_starttag('embed', attrs)

     top = self.DOM_stack[-1]
     has_source = 'src' in top.__dict__
     if has_source:
         win = self.__dict__['__window']
         embed_url = win.document.location.fix_url(top.src)
         response = hc.get(embed_url, win.document.location.href)
         # hc.get is unpacked as a (body, headers) pair; headers are unused.
         embed_body, _unused = response
         top.__dict__['embed'] = embed_body