def _findReferences(self, tag, attrs):
    '''
    Find references (URLs) inside a parsed document tag.

    @parameter tag: The name of the tag being parsed.
    @parameter attrs: A list of (attribute_name, attribute_value) tuples for the tag.
    @return: None. New URLs are stored in self._parsed_URLs and (tag, url)
             pairs in self._tag_and_url.
    '''
    # Tags that can't carry URLs are skipped entirely.
    if tag.lower() not in self._tagsContainingURLs:
        return

    for name, value in attrs:
        if name.lower() not in self._urlAttrs:
            continue

        # Only add it to the result if the current URL is not a fragment
        # (empty values and pure '#...' anchors are ignored).
        if not value or value.startswith('#'):
            continue

        reference = urlParser.urlJoin(self._baseUrl, value)
        reference = self._decode_URL(reference, self._encoding)
        reference = urlParser.normalizeURL(reference)

        if reference not in self._parsed_URLs:
            self._parsed_URLs.append(reference)
            self._tag_and_url.append((tag.lower(), reference))
            # One new reference per tag is enough; stop scanning attributes.
            break
def _regex_url_parse(self, httpResponse):
    '''
    Use regular expressions to find new URLs.

    @parameter httpResponse: The http response object that stores the
        response body and the URL.
    @return: None. The findings are stored in self._re_URLs.
    '''
    # Absolute URLs: scheme://host/path, stopping at whitespace, quotes
    # and tag delimiters. (Raw string: avoids invalid-escape warnings in
    # modern Python; the regex itself is unchanged.)
    #url_regex = '((http|https):[A-Za-z0-9/](([A-Za-z0-9$_.+!*(),;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*(),;/?:@&~=%-]*))?)'
    url_regex = r'((http|https)://([a-zA-Z0-9_:@\-\./]*?)/[^ \n\r\t"\'<>]*)'
    for url in re.findall(url_regex, httpResponse.getBody()):
        # This try is here because the _decode_URL method raises an exception
        # whenever it fails to decode a url. One bad URL must not abort the scan.
        try:
            decoded_url = self._decode_URL(url[0], self._encoding)
        except w3afException:
            pass
        else:
            self._re_URLs.append(decoded_url)

    #
    # Now detect some relative URL's ( also using regexs )
    #
    def find_relative(doc):
        '''Return decoded absolute URLs built from relative paths found in doc.'''
        res = []

        # TODO: Also matches //foo/bar.txt and http://host.tld/foo/bar.txt
        # I'm removing those matches manually below
        # NOTE(review): the leading '(:?' looks like a typo for the
        # non-capturing group '(?:' — left as-is because the '://' filter
        # below compensates for the stray-colon matches; confirm before changing.
        regex = r'((:?[/]{1,2}[A-Z0-9a-z%_\-~\.]+)+\.[A-Za-z0-9]{2,4}(((\?)([a-zA-Z0-9]*=\w*)){1}((&)([a-zA-Z0-9]*=\w*))*)?)'
        relative_regex = re.compile(regex)

        # The base path never changes inside the loop; compute it once.
        domainPath = urlParser.getDomainPath(httpResponse.getURL())

        for match_tuple in relative_regex.findall(doc):
            match_string = match_tuple[0]
            #
            # And now I filter out some of the common false positives
            #
            if match_string.startswith('//'):
                continue
            if match_string.startswith('://'):
                continue
            if re.match(r'HTTP/\d\.\d', match_string):
                continue
            # Matches "PHP/5.2.4-2ubuntu5.7" , "Apache/2.2.8", and "mod_python/3.3.1"
            if re.match(r'.*?/\d\.\d\.\d', match_string):
                continue
            #
            # Filter finished.
            #
            url = urlParser.urlJoin(domainPath, match_string)
            # Bugfix: mirror the absolute-URL loop above. Previously a
            # w3afException raised by _decode_URL here propagated and
            # aborted the whole method on the first undecodable match.
            try:
                url = self._decode_URL(url, self._encoding)
            except w3afException:
                continue
            res.append(url)
        return res

    relative_URLs = find_relative(httpResponse.getBody())
    self._re_URLs.extend(relative_URLs)
    # Normalize everything, then deduplicate.
    self._re_URLs = [urlParser.normalizeURL(i) for i in self._re_URLs]
    self._re_URLs = list(set(self._re_URLs))