def checkValidity(self, junkUrl):
    # Try and fetch an obviously missing version of the junk file
    fetch = junkUrl + str(random.SystemRandom().randint(0, 99999999))
    res = self.sf.fetchUrl(
        fetch,
        headOnly=True,
        timeout=self.opts['_fetchtimeout'],
        useragent=self.opts['_useragent'],
        verify=False
    )

    if res['code'] != "404":
        host = SpiderFootHelpers.urlBaseUrl(junkUrl)
        self.skiphosts[host] = True
        return False

    return True
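
# Illustrative sketch only (not part of SpiderFoot): the same soft-404 check as
# checkValidity() above, expressed with just the standard library. It appends a
# random suffix to a URL that should not exist and trusts the host only if the
# server answers 404. The helper name and the use of urllib are assumptions made
# for this example.
import random
import urllib.error
import urllib.request


def host_returns_real_404s(junk_url, timeout=5):
    probe = junk_url + str(random.SystemRandom().randint(0, 99999999))
    req = urllib.request.Request(probe, method="HEAD")
    try:
        urllib.request.urlopen(req, timeout=timeout)
    except urllib.error.HTTPError as e:
        return e.code == 404  # a genuine 404 means probe results for this host are trustworthy
    except urllib.error.URLError:
        return False  # network failure: treat the host as unverifiable
    return False  # a 2xx/3xx for a random path means the host wildcards responses
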
def cleanLinks(self, links):
    returnLinks = dict()

    for link in links:
        linkBase = SpiderFootHelpers.urlBaseUrl(link)
        linkFQDN = self.sf.urlFQDN(link)

        # Skip external sites (typical behaviour..)
        if not self.getTarget().matches(linkFQDN):
            # self.debug('Ignoring external site: ' + link)
            continue

        # Optionally skip sub-domain sites
        if self.opts['nosubs'] and not \
                self.getTarget().matches(linkFQDN, includeChildren=False):
            # self.debug("Ignoring subdomain: " + link)
            continue

        # Skip parent domain sites
        if not self.getTarget().matches(linkFQDN, includeParents=False):
            # self.debug("Ignoring parent domain: " + link)
            continue

        # Optionally skip user directories
        if self.opts['filterusers'] and '/~' in link:
            # self.debug("Ignoring user folder: " + link)
            continue

        # If we are respecting robots.txt, filter those out too
        if linkBase in self.robotsRules and self.opts['robotsonly']:
            if list(filter(lambda blocked: blocked.lower() in link.lower() or blocked == '*', self.robotsRules[linkBase])):
                # self.debug("Ignoring page found in robots.txt: " + link)
                continue

        # All tests passed, add link to be spidered
        self.debug("Adding URL for spidering: " + link)
        returnLinks[link] = links[link]

    return returnLinks
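
# Minimal sketch (an assumption, not SpiderFoot code) of how the robots.txt filter
# in cleanLinks() decides to drop a link: a link is skipped if any disallowed path
# fragment appears in it, or if everything is disallowed via '*'. The function and
# variable names here are hypothetical.
def blocked_by_robots(link, disallowed):
    return any(rule == '*' or rule.lower() in link.lower() for rule in disallowed)


if __name__ == "__main__":
    rules = ['/admin', '/tmp']
    print(blocked_by_robots('http://example.local/admin/index.php', rules))  # True
    print(blocked_by_robots('http://example.local/about.html', rules))       # False
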
def test_url_base_url_should_return_a_string(self):
    base_url = SpiderFootHelpers.urlBaseUrl('http://localhost.local/path?param=value#fragment')
    self.assertIsInstance(base_url, str)
    self.assertEqual('http://localhost.local', base_url)
def handleEvent(self, event):
    eventName = event.eventType
    srcModuleName = event.module
    eventData = event.data

    self.debug(f"Received event, {eventName}, from {srcModuleName}")

    # SIMILARDOMAIN and CO_HOSTED_SITE events are domains, not URLs.
    # Assume HTTP.
    if eventName in ['SIMILARDOMAIN', 'CO_HOSTED_SITE']:
        url = 'http://' + eventData.lower()
    elif 'URL' in eventName:
        url = eventData
    else:
        return

    fqdn = self.sf.urlFQDN(url)

    # We are only interested in external sites for the crossref
    if self.getTarget().matches(fqdn):
        self.debug(f"Ignoring {url} as not external")
        return

    if eventData in self.fetched:
        self.debug(f"Ignoring {url} as already tested")
        return

    if not self.sf.resolveHost(fqdn) and not self.sf.resolveHost6(fqdn):
        self.debug(f"Ignoring {url} as {fqdn} does not resolve")
        return

    self.fetched[url] = True

    self.debug(f"Testing URL for affiliation: {url}")

    res = self.sf.fetchUrl(
        url,
        timeout=self.opts['_fetchtimeout'],
        useragent=self.opts['_useragent'],
        sizeLimit=10000000,
        verify=False
    )

    if res['content'] is None:
        self.debug(f"Ignoring {url} as no data returned")
        return

    matched = False
    for name in self.getTarget().getNames():
        # Search for mentions of our host/domain in the external site's data
        pat = re.compile(
            r"([\.\'\/\"\ ]" + re.escape(name) + r"[\.\'\/\"\ ])",
            re.IGNORECASE
        )
        matches = re.findall(pat, str(res['content']))

        if len(matches) > 0:
            matched = True
            break

    if not matched:
        # If the name wasn't found in the affiliate, and checkbase is set,
        # fetch the base URL of the affiliate to check for a crossref.
        if eventName == "LINKED_URL_EXTERNAL" and self.opts['checkbase']:
            # Check the base url to see if there is an affiliation
            url = SpiderFootHelpers.urlBaseUrl(eventData)
            if url in self.fetched:
                return

            self.fetched[url] = True

            res = self.sf.fetchUrl(
                url,
                timeout=self.opts['_fetchtimeout'],
                useragent=self.opts['_useragent'],
                sizeLimit=10000000,
                verify=False
            )

            if res['content'] is not None:
                for name in self.getTarget().getNames():
                    pat = re.compile(
                        r"([\.\'\/\"\ ]" + re.escape(name) + r"[\'\/\"\ ])",
                        re.IGNORECASE
                    )
                    matches = re.findall(pat, str(res['content']))

                    if len(matches) > 0:
                        matched = True
                        break

    if not matched:
        return

    if not event.moduleDataSource:
        event.moduleDataSource = "Unknown"

    self.info(f"Found link to target from affiliate: {url}")

    evt1 = SpiderFootEvent("AFFILIATE_INTERNET_NAME", self.sf.urlFQDN(url), self.__name__, event)
    evt1.moduleDataSource = event.moduleDataSource
    self.notifyListeners(evt1)

    evt2 = SpiderFootEvent("AFFILIATE_WEB_CONTENT", res['content'], self.__name__, evt1)
    evt2.moduleDataSource = event.moduleDataSource
    self.notifyListeners(evt2)
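
# Illustrative only: the kind of delimiter-bounded pattern handleEvent() builds above
# to spot mentions of a target name inside fetched page content. The helper name and
# the sample strings are made up for this example.
import re


def mentions_target(content, name):
    pat = re.compile(r"([\.\'\/\"\ ]" + re.escape(name) + r"[\.\'\/\"\ ])", re.IGNORECASE)
    return bool(pat.findall(str(content)))


if __name__ == "__main__":
    print(mentions_target('Visit "example.com" for details.', 'example.com'))  # True
    print(mentions_target('Nothing about examplexcom here.', 'example.com'))   # False
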
def spiderFrom(self, startingPoint):
    keepSpidering = True
    totalFetched = 0
    levelsTraversed = 0
    nextLinks = dict()
    targetBase = SpiderFootHelpers.urlBaseUrl(startingPoint)

    # Are we respecting robots.txt?
    if self.opts['robotsonly'] and targetBase not in self.robotsRules:
        robotsTxt = self.sf.fetchUrl(
            targetBase + '/robots.txt',
            timeout=self.opts['_fetchtimeout'],
            useragent=self.opts['_useragent'],
            verify=False
        )
        if robotsTxt['content'] is not None:
            self.debug('robots.txt contents: ' + robotsTxt['content'])
            self.robotsRules[targetBase] = SpiderFootHelpers.extractUrlsFromRobotsTxt(robotsTxt['content'])

    if self.checkForStop():
        return

    # First iteration we are starting with links found on the start page
    # Iterations after that are based on links found on those pages,
    # and so on..
    links = self.processUrl(startingPoint)  # fetch first page

    if links is None:
        self.debug("No links found on the first fetch!")
        return

    while keepSpidering:
        # Gets hit in the second and subsequent iterations when more links
        # are found
        if len(nextLinks) > 0:
            links = dict()

            # Fetch content from the new links
            for link in nextLinks:
                # Always skip links we've already fetched
                if link in self.fetchedPages:
                    self.debug("Already fetched " + link + ", skipping.")
                    continue

                # Check if we've been asked to stop
                if self.checkForStop():
                    return

                self.debug("Fetching fresh content from: " + link)
                time.sleep(self.opts['pausesec'])

                freshLinks = self.processUrl(link)
                if freshLinks is not None:
                    links.update(freshLinks)

                totalFetched += 1
                if totalFetched >= self.opts['maxpages']:
                    self.info("Maximum number of pages (" + str(self.opts['maxpages']) + ") reached.")
                    keepSpidering = False
                    break

        nextLinks = self.cleanLinks(links)
        self.debug(f"Found links: {nextLinks}")

        # We've scanned through another layer of the site
        levelsTraversed += 1
        self.debug(f"At level: {levelsTraversed}, Pages: {totalFetched}")
        if levelsTraversed >= self.opts['maxlevels']:
            self.info(f"Maximum number of levels ({self.opts['maxlevels']}) reached.")
            keepSpidering = False

        # We've reached the end of our journey..
        if len(nextLinks) == 0:
            self.debug("No more links found to spider, finishing..")
            keepSpidering = False

        # We've been asked to stop scanning
        if self.checkForStop():
            keepSpidering = False

    return
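
# Stand-alone sketch (an assumption, not the module's code) of the control flow in
# spiderFrom(): breadth-first expansion with page and depth caps. fetch_links() is a
# hypothetical stand-in for the processUrl()/cleanLinks() pair.
def crawl(start, fetch_links, max_pages=100, max_levels=3):
    fetched = set()
    frontier = {start}
    pages = 0
    for level in range(max_levels):
        next_frontier = set()
        for url in frontier:
            if url in fetched or pages >= max_pages:
                continue
            fetched.add(url)
            pages += 1
            next_frontier.update(fetch_links(url))
        frontier = next_frontier - fetched
        if not frontier:
            break
    return fetched


if __name__ == "__main__":
    site = {
        'http://example.local/': ['http://example.local/a', 'http://example.local/b'],
        'http://example.local/a': ['http://example.local/c'],
    }
    print(crawl('http://example.local/', lambda u: site.get(u, [])))
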
def handleEvent(self, event):
    eventName = event.eventType
    srcModuleName = event.module
    eventData = event.data

    self.debug(f"Received event, {eventName}, from {srcModuleName}")

    if eventData in self.results:
        return

    self.results[eventData] = True

    host = SpiderFootHelpers.urlBaseUrl(eventData)

    if host in self.skiphosts:
        self.debug("Skipping " + host + " because it doesn't return 404s.")
        return

    # http://www/blah/abc.php -> try http://www/blah/abc.php.[fileexts]
    for ext in self.opts['urlextstry']:
        if host in self.skiphosts:
            self.debug("Skipping " + host + " because it doesn't return 404s.")
            return

        if "." + ext + "?" in eventData or "." + ext + "#" in eventData or \
                eventData.endswith("." + ext):
            bits = eventData.split("?")

            for x in self.opts['fileexts']:
                if self.checkForStop():
                    return

                self.debug("Trying " + x + " against " + eventData)
                fetch = bits[0] + "." + x
                if fetch in self.results:
                    self.debug("Skipping, already fetched.")
                    continue

                self.results[fetch] = True

                res = self.sf.fetchUrl(
                    fetch,
                    headOnly=True,
                    timeout=self.opts['_fetchtimeout'],
                    useragent=self.opts['_useragent'],
                    sizeLimit=10000000,
                    verify=False
                )
                if res['realurl'] != fetch:
                    self.debug("Skipping because " + res['realurl'] + " isn't the fetched URL of " + fetch)
                    continue

                if res['code'] == "200":
                    if not self.checkValidity(fetch):
                        continue

                    evt = SpiderFootEvent("JUNK_FILE", fetch, self.__name__, event)
                    self.notifyListeners(evt)

    base = SpiderFootHelpers.urlBaseDir(eventData)
    if not base or base in self.bases:
        return

    self.bases[base] = True

    # http://www/blah/abc.html -> try http://www/blah/[files]
    for f in self.opts['files']:
        if self.checkForStop():
            return

        if host in self.skiphosts:
            self.debug("Skipping " + host + " because it doesn't return 404s.")
            return

        self.debug("Trying " + f + " against " + eventData)
        fetch = base + f
        if fetch in self.results:
            self.debug("Skipping, already fetched.")
            continue

        self.results[fetch] = True

        res = self.sf.fetchUrl(
            fetch,
            headOnly=True,
            timeout=self.opts['_fetchtimeout'],
            useragent=self.opts['_useragent'],
            verify=False
        )
        if res['realurl'] != fetch:
            self.debug("Skipping because " + res['realurl'] + " isn't the fetched URL of " + fetch)
            continue

        if res['code'] == "200":
            if not self.checkValidity(fetch):
                continue

            evt = SpiderFootEvent("JUNK_FILE", fetch, self.__name__, event)
            self.notifyListeners(evt)

    # don't do anything with the root directory of a site
    self.debug(f"Base: {base}, event: {eventData}")
    if base in [eventData, eventData + "/"]:
        return

    # http://www/blah/abc.html -> try http://www/blah.[dirs]
    for dirfile in self.opts['dirs']:
        if self.checkForStop():
            return

        if host in self.skiphosts:
            self.debug("Skipping " + host + " because it doesn't return 404s.")
            return

        if base.count('/') == 3:
            self.debug("Skipping base url.")
            continue

        self.debug("Trying " + dirfile + " against " + eventData)
        fetch = base[0:len(base) - 1] + "." + dirfile
        if fetch in self.results:
            self.debug("Skipping, already fetched.")
            continue

        self.results[fetch] = True

        res = self.sf.fetchUrl(
            fetch,
            headOnly=True,
            timeout=self.opts['_fetchtimeout'],
            useragent=self.opts['_useragent'],
            verify=False
        )
        if res['realurl'] != fetch:
            self.debug("Skipping because " + res['realurl'] + " isn't the fetched URL of " + fetch)
            continue

        if res['code'] == "200":
            if not self.checkValidity(fetch):
                continue

            evt = SpiderFootEvent("JUNK_FILE", fetch, self.__name__, event)
            self.notifyListeners(evt)
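
# Hypothetical sketch of the URL permutations the handler above probes for, kept
# separate from any fetching. The extension/file/dir lists mirror the spirit of the
# module's 'fileexts', 'files' and 'dirs' options, but the exact values and the
# helper name are assumptions for this illustration.
def junk_candidates(url, fileexts=('bak', 'old', 'tmp'), files=('.htaccess',), dirs=('zip', 'tar.gz')):
    candidates = []

    # http://host/dir/page.php -> http://host/dir/page.php.bak, .old, ...
    path = url.split('?')[0]
    candidates += [path + '.' + ext for ext in fileexts]

    # http://host/dir/page.php -> http://host/dir/.htaccess, ...
    base = path.rsplit('/', 1)[0] + '/'
    candidates += [base + f for f in files]

    # http://host/dir/page.php -> http://host/dir.zip, http://host/dir.tar.gz, ...
    candidates += [base.rstrip('/') + '.' + d for d in dirs]

    return candidates


if __name__ == "__main__":
    for c in junk_candidates('http://example.local/app/login.php?next=/'):
        print(c)
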
def handleEvent(self, event):
    eventData = event.data

    if self.errorState:
        return

    if self.opts['api_key'] == "":
        self.error(
            f"You enabled {self.__class__.__name__} but did not set a Google API key!"
        )
        self.errorState = True
        return

    if eventData in self.results:
        return

    self.results[eventData] = True

    for dom in list(self.domains.keys()):
        target = self.domains[dom]
        res = self.sf.googleIterate(
            searchString=f'+site:{target} "{eventData}"',
            opts={
                "timeout": self.opts["_fetchtimeout"],
                "useragent": self.opts["_useragent"],
                "api_key": self.opts["api_key"],
                "cse_id": self.opts["cse_id"],
            },
        )

        if res is None:
            # Failed to talk to the Google API or no results returned
            return

        urls = res["urls"]
        new_links = list(set(urls) - set(self.results.keys()))

        # Add new links to results
        for link in new_links:
            self.results[link] = True

        relevant_links = [
            link for link in new_links
            if SpiderFootHelpers.urlBaseUrl(link).endswith(target)
        ]

        for link in relevant_links:
            self.debug("Found a link: " + link)

            if self.checkForStop():
                return

            res = self.sf.fetchUrl(
                link,
                timeout=self.opts['_fetchtimeout'],
                useragent=self.opts['_useragent']
            )

            if res['content'] is None:
                self.debug(f"Ignoring {link} as no data returned")
                continue

            if re.search(
                r"[^a-zA-Z\-\_0-9]" + re.escape(eventData) + r"[^a-zA-Z\-\_0-9]",
                res['content'],
                re.IGNORECASE
            ) is None:
                continue

            evt1 = SpiderFootEvent("LEAKSITE_URL", link, self.__name__, event)
            self.notifyListeners(evt1)

            evt2 = SpiderFootEvent("LEAKSITE_CONTENT", res['content'], self.__name__, evt1)
            self.notifyListeners(evt2)
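
# Small illustration (not the module itself) of the relevance filter above: keep only
# result URLs whose base URL ends with the searched leak-site domain. The urlparse-based
# base_url() helper is an assumption standing in for SpiderFootHelpers.urlBaseUrl.
from urllib.parse import urlparse


def base_url(link):
    parts = urlparse(link)
    return f"{parts.scheme}://{parts.netloc}"


def relevant(links, target_domain):
    return [link for link in links if base_url(link).endswith(target_domain)]


if __name__ == "__main__":
    found = ['https://pastebin.com/raw/abc123', 'https://example.org/mirror/abc123']
    print(relevant(found, 'pastebin.com'))  # ['https://pastebin.com/raw/abc123']
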