def unknown_starttag(self, tag, attrs):
    """Create a DOM node for an opening tag and push it onto ``DOM_stack``.

    While ``in_Script`` is set, the raw text of the tag is forwarded to
    ``handle_data`` and nothing else happens.  Once ``endearly`` is set the
    tag is ignored (after the debug message has been emitted).

    Otherwise a ``DOMObject`` is created for ``tag`` and every
    ``(name, value)`` pair in ``attrs`` is applied to it; attributes that
    ``dataetc.isevent`` recognises are additionally passed to
    ``emulate_timeout``.  The node's ``begin``/``end`` offsets into
    ``self.html`` are recorded on a best-effort basis, the node is appended
    to the element on top of ``DOM_stack`` and then pushed itself.  Forms
    are registered in the window's ``__fl`` list, ``<br>`` and ``<meta>``
    are closed immediately, and ``<option>`` nodes are added to the most
    recently seen ``<select>``.

    :param tag: tag name.  It is compared with lower-case literals and
        searched for in the lower-cased page source, so it is presumably
        already lower case.
    :param attrs: iterable of ``(name, value)`` attribute pairs.
    """
    if self.in_Script:
        self.handle_data(self.get_starttag_text())
        return

    config.VERBOSE(config.VERBOSE_DEBUG, "[DEBUG] [PageParser.py] Tag: " + tag)
    if self.endearly:
        return

    window = self.__dict__['__window']
    domobj = DOMObject(window, tag, self)

    # Sometimes the attribute name as written in the tag is not the real
    # attribute name, so a transform is needed.  Note that this is the IE
    # way; for Firefox the transform is done in DOMObject.setAttribute().
    for name, value in attrs:
        domobj.setAttribute(dataetc.attrTrans(name, tag), value)
        if dataetc.isevent(name.lower(), tag):
            self.emulate_timeout(name, value)

    if tag == 'script':
        domobj.__dict__['script'] = ''

    if config.retrieval_all and 'src' in domobj.__dict__:
        src = window.document.location.fix_url(domobj.src)
        # The response is not used in this method; only the request is made.
        hc.get(src, window.document.location.href)

    try:
        # Lower-case the page once and search from an offset instead of
        # lower-casing and slicing the whole source for every lookup.
        lowered = self.html.lower()
        tag_start = lowered.index('<' + tag, self.current)
        # Advance the cursor to just past the '>' that ends this start tag.
        self.current = lowered.index('>', tag_start) + 1
        domobj.__dict__['begin'] = self.current
        domobj.__dict__['end'] = lowered.index('</' + tag, self.current)
        if (tag == 'div' and attrs) or tag == 'body':
            domobj.innerHTML = self.html[domobj.__dict__['begin']:
                                         domobj.__dict__['end']]
    except Exception:
        # Best effort: the offsets stay (partly) unset when the tag or its
        # end tag cannot be located.  KeyboardInterrupt and SystemExit are
        # deliberately no longer swallowed.
        pass

    self.DOM_stack[-1].appendChild(domobj)
    self.DOM_stack.append(domobj)

    if tag == 'form':
        window.__dict__['__fl'].append(domobj)

    if tag == 'br' or tag == 'meta':
        # <br> and <meta> have no end tag.
        self.unknown_endtag(tag)

    if tag == 'select':
        self.lastselect = domobj

    if tag == 'option':
        try:
            self.lastselect.options.append(domobj)
        except Exception:
            # Best effort: presumably an <option> seen before any <select>.
            pass
def start_script(self, attrs):
    """Handle an opening ``<script>`` tag.

    A ``language`` attribute whose value does not start with ``javascript``
    (both compared case-insensitively) causes the script to be skipped:
    ``ignoreScript`` is set and the tag is not processed any further.

    Otherwise the tag goes through ``unknown_starttag``, and ``in_Script``
    and ``literal`` are switched on.  If the element on top of ``DOM_stack``
    has a ``src`` entry, the referenced script is downloaded and appended to
    the element's ``script`` text.  The element is then registered in the
    window's ``__sl`` list; for an external script ``end_script()`` is
    invoked right away.

    :param attrs: iterable of ``(name, value)`` attribute pairs.
    """
    for attr_name, attr_value in attrs:
        if attr_name.lower() != 'language':
            continue
        if attr_value.lower().startswith('javascript'):
            continue
        config.VERBOSE(
            config.VERBOSE_DEBUG,
            "[DEBUG] in PageParser.py: Ignoring(ignoreScript) start_object attrs: "
            + str(attrs))
        self.ignoreScript = True
        return

    self.unknown_starttag('script', attrs)
    self.in_Script = True
    self.literal = 1

    node = self.DOM_stack[-1]
    window = self.__dict__['__window']
    is_external = 'src' in node.__dict__

    if is_external:
        source_url = window.document.location.fix_url(node.src)
        # Only the body is needed; the response headers are ignored.
        code, _headers = hc.get(source_url, window.document.location.href)
        if config.replace_nonascii:
            code = re.sub('[\x80-\xff]', ' ', code)
        node.__dict__['script'] += code

    window.__dict__['__sl'].append(node)

    if is_external:
        self.end_script()
def start_frame(self, attrs):
    """Handle an opening ``<frame>`` tag and download its source document.

    The tag is first processed by ``unknown_starttag``.  If the element on
    top of ``DOM_stack`` then has a ``src`` entry, that URL is resolved with
    the current document location's ``fix_url``, fetched with the document
    href as the second argument to ``hc.get``, and the returned body is
    stored on the element under ``frame``.

    :param attrs: iterable of ``(name, value)`` attribute pairs.
    """
    self.unknown_starttag('frame', attrs)

    node = self.DOM_stack[-1]
    if 'src' not in node.__dict__:
        return

    window = self.__dict__['__window']
    target = window.document.location.fix_url(node.src)
    # Only the body is kept; the response headers are ignored.
    body, _headers = hc.get(target, window.document.location.href)
    node.__dict__['frame'] = body
def start_script(self, attrs):
    """Handle an opening ``<script>`` tag.

    A ``language`` attribute whose value does not start with ``javascript``
    (both compared case-insensitively) causes the script to be skipped:
    ``ignoreScript`` is set and the tag is not processed any further.

    Otherwise the tag goes through ``unknown_starttag``, and ``in_Script``
    and ``literal`` are switched on.  If the element on top of ``DOM_stack``
    has a ``src`` entry, the referenced script is downloaded and appended to
    the element's ``script`` text.  The element is then registered in the
    window's ``__sl`` list; for an external script ``end_script()`` is
    invoked right away.

    :param attrs: iterable of ``(name, value)`` attribute pairs.
    """
    for attr_name, attr_value in attrs:
        if attr_name.lower() != 'language':
            continue
        if attr_value.lower().startswith('javascript'):
            continue
        config.VERBOSE(
            config.VERBOSE_DEBUG,
            "[DEBUG] in PageParser.py: Ignoring(ignoreScript) start_object attrs: "
            + str(attrs))
        self.ignoreScript = True
        return

    self.unknown_starttag('script', attrs)
    self.in_Script = True
    self.literal = 1

    node = self.DOM_stack[-1]
    window = self.__dict__['__window']
    is_external = 'src' in node.__dict__

    if is_external:
        source_url = window.document.location.fix_url(node.src)
        # Only the body is needed; the response headers are ignored.
        code, _headers = hc.get(source_url, window.document.location.href)
        if config.replace_nonascii:
            code = re.sub('[\x80-\xff]', ' ', code)
        node.__dict__['script'] += code

    window.__dict__['__sl'].append(node)

    if is_external:
        self.end_script()
def start_embed(self, attrs):
    """Handle an opening ``<embed>`` tag and download the embedded resource.

    The tag is first processed by ``unknown_starttag``.  If the element on
    top of ``DOM_stack`` then has a ``src`` entry, that URL is resolved with
    the current document location's ``fix_url``, fetched with the document
    href as the second argument to ``hc.get``, and the returned body is
    stored on the element under ``embed``.

    :param attrs: iterable of ``(name, value)`` attribute pairs.
    """
    self.unknown_starttag('embed', attrs)

    node = self.DOM_stack[-1]
    if 'src' not in node.__dict__:
        return

    window = self.__dict__['__window']
    target = window.document.location.fix_url(node.src)
    # Only the body is kept; the response headers are ignored.
    payload, _headers = hc.get(target, window.document.location.href)
    node.__dict__['embed'] = payload
def start_frame(self, attrs):
    """Handle an opening ``<frame>`` tag and download its source document.

    The tag is first processed by ``unknown_starttag``.  If the element on
    top of ``DOM_stack`` then has a ``src`` entry, that URL is resolved with
    the current document location's ``fix_url``, fetched with the document
    href as the second argument to ``hc.get``, and the returned body is
    stored on the element under ``frame``.

    :param attrs: iterable of ``(name, value)`` attribute pairs.
    """
    self.unknown_starttag('frame', attrs)

    node = self.DOM_stack[-1]
    if 'src' not in node.__dict__:
        return

    window = self.__dict__['__window']
    target = window.document.location.fix_url(node.src)
    # Only the body is kept; the response headers are ignored.
    body, _headers = hc.get(target, window.document.location.href)
    node.__dict__['frame'] = body
def unknown_starttag(self, tag, attrs):
    """Create a DOM node for an opening tag and push it onto ``DOM_stack``.

    While ``in_Script`` is set, the raw text of the tag is forwarded to
    ``handle_data`` and nothing else happens.  Once ``endearly`` is set the
    tag is ignored (after the debug message has been emitted).

    Otherwise a ``DOMObject`` is created for ``tag`` and every
    ``(name, value)`` pair in ``attrs`` is applied to it; attributes that
    ``dataetc.isevent`` recognises are additionally passed to
    ``emulate_timeout``.  The node's ``begin``/``end`` offsets into
    ``self.html`` are recorded on a best-effort basis, the node is appended
    to the element on top of ``DOM_stack`` and then pushed itself.  Forms
    are registered in the window's ``__fl`` list, ``<br>`` and ``<meta>``
    are closed immediately, and ``<option>`` nodes are added to the most
    recently seen ``<select>``.

    :param tag: tag name.  It is compared with lower-case literals and
        searched for in the lower-cased page source, so it is presumably
        already lower case.
    :param attrs: iterable of ``(name, value)`` attribute pairs.
    """
    if self.in_Script:
        self.handle_data(self.get_starttag_text())
        return

    config.VERBOSE(config.VERBOSE_DEBUG, "[DEBUG] [PageParser.py] Tag: " + tag)
    if self.endearly:
        return

    window = self.__dict__['__window']
    domobj = DOMObject(window, tag, self)

    # Sometimes the attribute name as written in the tag is not the real
    # attribute name, so a transform is needed.  Note that this is the IE
    # way; for Firefox the transform is done in DOMObject.setAttribute().
    for name, value in attrs:
        domobj.setAttribute(dataetc.attrTrans(name, tag), value)
        if dataetc.isevent(name.lower(), tag):
            self.emulate_timeout(name, value)

    if tag == 'script':
        domobj.__dict__['script'] = ''

    if config.retrieval_all and 'src' in domobj.__dict__:
        src = window.document.location.fix_url(domobj.src)
        # The response is not used in this method; only the request is made.
        hc.get(src, window.document.location.href)

    try:
        # Lower-case the page once and search from an offset instead of
        # lower-casing and slicing the whole source for every lookup.
        lowered = self.html.lower()
        tag_start = lowered.index('<' + tag, self.current)
        # Advance the cursor to just past the '>' that ends this start tag.
        self.current = lowered.index('>', tag_start) + 1
        domobj.__dict__['begin'] = self.current
        domobj.__dict__['end'] = lowered.index('</' + tag, self.current)
        if (tag == 'div' and attrs) or tag == 'body':
            domobj.innerHTML = self.html[domobj.__dict__['begin']:
                                         domobj.__dict__['end']]
    except Exception:
        # Best effort: the offsets stay (partly) unset when the tag or its
        # end tag cannot be located.  KeyboardInterrupt and SystemExit are
        # deliberately no longer swallowed.
        pass

    self.DOM_stack[-1].appendChild(domobj)
    self.DOM_stack.append(domobj)

    if tag == 'form':
        window.__dict__['__fl'].append(domobj)

    if tag == 'br' or tag == 'meta':
        # <br> and <meta> have no end tag.
        self.unknown_endtag(tag)

    if tag == 'select':
        self.lastselect = domobj

    if tag == 'option':
        try:
            self.lastselect.options.append(domobj)
        except Exception:
            # Best effort: presumably an <option> seen before any <select>.
            pass
def handle_src(self, name, val):
    """React to a source URL being assigned to attribute ``name``.

    ``val`` is resolved with the owning window's document location.  When
    ``config.retrieval_all`` is set the resolved URL is fetched (the
    response is discarded).  A URL whose scheme is not one of ``http``,
    ``file``, ``https`` or ``ftp`` produces a warning and, if this element
    has an ``onerror`` entry, a call to it.  For an ``iframe`` element a
    new ``Window`` is created for the URL and its HTML is run through a
    ``PageParser``.

    :param name: attribute name; only used in the warning message.
    :param val: attribute value, i.e. the URL as written in the page.
    """
    owner = self.__dict__['__window']
    url = owner.document.location.fix_url(val)

    if config.retrieval_all:
        hc.get(url, owner.document.location.href)

    # Only the scheme component of the split URL is of interest here.
    scheme = urlparse.urlsplit(url)[0]
    if scheme not in ('http', 'file', 'https', 'ftp'):
        config.VERBOSE(
            config.VERBOSE_WARNING,
            "[WARNING] Got unknown scheme: %s in %s.%s ." % (url, self.tagName, name))
        if 'onerror' in self.__dict__:
            config.VERBOSE(
                config.VERBOSE_DEBUG,
                "[DEBUG] Calling onerror of %s." % (self.tagName))
            self.onerror()

    if self.tagName != "iframe":
        return

    # Imported here rather than at module level, as in the original code
    # (presumably to avoid a circular import -- TODO confirm).
    from Window import Window
    from PageParser import PageParser

    root = owner.__dict__['__root']
    frame_url = owner.document.location.fix_url(val)
    frame_window = Window(root, frame_url, owner.document.location.href)
    frame_parser = PageParser(frame_window, frame_window.document,
                              frame_window.__dict__['__html'])
    frame_parser.close()
def __init_html(self):
    """Download this object's URL and store the page body and headers.

    The URL is read from ``__url`` and the referrer from the ``__url`` entry
    of the ``__referrer`` object (empty string when there is no usable
    referrer).  On success the body is stored under ``__html`` -- with bytes
    in the range 0x80-0xff replaced by spaces when
    ``config.replace_nonascii`` is set -- and each header line is appended
    to the ``__headers`` list.  Any exception raised while doing so is
    printed with ``traceback.print_exc()`` and otherwise ignored.
    """
    state = self.__dict__

    # Unused below; the lookup is retained so a missing '__scheme' entry
    # still raises KeyError exactly as before.
    scheme = state['__scheme']
    url = state['__url']

    referrer = ''
    referring_obj = state['__referrer']
    if referring_obj and '__url' in referring_obj.__dict__:
        referrer = referring_obj.__dict__['__url']

    try:
        page, raw_headers = hc.get(url, referrer)
        state['__html'] = page
        if config.replace_nonascii:
            state['__html'] = re.sub('[\x80-\xff]', ' ', page)
        for header_line in raw_headers.splitlines():
            state['__headers'].append(header_line)
    except Exception:
        traceback.print_exc()
def start_embed(self, attrs):
    """Handle an opening ``<embed>`` tag and download the embedded resource.

    The tag is first processed by ``unknown_starttag``.  If the element on
    top of ``DOM_stack`` then has a ``src`` entry, that URL is resolved with
    the current document location's ``fix_url``, fetched with the document
    href as the second argument to ``hc.get``, and the returned body is
    stored on the element under ``embed``.

    :param attrs: iterable of ``(name, value)`` attribute pairs.
    """
    self.unknown_starttag('embed', attrs)

    node = self.DOM_stack[-1]
    if 'src' not in node.__dict__:
        return

    window = self.__dict__['__window']
    target = window.document.location.fix_url(node.src)
    # Only the body is kept; the response headers are ignored.
    payload, _headers = hc.get(target, window.document.location.href)
    node.__dict__['embed'] = payload