def test_url_base_dir_should_return_a_string(self):
    """urlBaseDir() returns a str holding the URL's base directory.

    The input URL carries a path, a query string and a fragment; the
    expected result is the scheme and host followed by a trailing slash.
    """
    # Arrange
    url = 'http://localhost.local/path?param=value#fragment'
    expected = 'http://localhost.local/'

    # Act
    result = SpiderFootHelpers.urlBaseDir(url)

    # Assert
    self.assertIsInstance(result, str)
    self.assertEqual(expected, result)
def handleEvent(self, event):
    """Probe for junk/backup files derived from the URL carried by an event.

    Three families of candidate URLs are built from ``event.data`` and
    requested with HEAD:

    1. ``<url-without-query-or-fragment>.<fileext>`` when the URL ends in
       (or is followed by ``?``/``#`` after) one of ``opts['urlextstry']``.
    2. ``<base directory><file>`` for every entry of ``opts['files']``.
    3. ``<base directory without trailing slash>.<dirfile>`` for every
       entry of ``opts['dirs']`` (skipped for the site root).

    A ``JUNK_FILE`` event is emitted for each candidate that is fetched
    without being redirected, answers with code ``"200"`` and passes
    ``self.checkValidity``.

    Args:
        event: source event; ``eventType``, ``module`` and ``data`` are
            read, and ``data`` is treated as a URL.

    Returns:
        None
    """
    eventName = event.eventType
    srcModuleName = event.module
    eventData = event.data

    self.debug(f"Received event, {eventName}, from {srcModuleName}")

    # Each URL is processed only once.
    if eventData in self.results:
        return

    self.results[eventData] = True

    host = SpiderFootHelpers.urlBaseUrl(eventData)

    def host_is_skipped():
        """Log and return True when the host is listed in self.skiphosts."""
        # NOTE(review): skiphosts is presumably updated by checkValidity
        # while we probe, hence the re-check inside every loop -- confirm.
        if host in self.skiphosts:
            self.debug(f"Skipping {host} because it doesn't return 404s.")
            return True
        return False

    def probe(fetch, **fetch_opts):
        """HEAD-request one candidate URL and emit JUNK_FILE on a valid hit.

        Extra keyword arguments are forwarded to ``self.sf.fetchUrl``.
        """
        if fetch in self.results:
            self.debug("Skipping, already fetched.")
            return

        self.results[fetch] = True

        res = self.sf.fetchUrl(
            fetch,
            headOnly=True,
            timeout=self.opts['_fetchtimeout'],
            useragent=self.opts['_useragent'],
            verify=False,
            **fetch_opts
        )

        # Defensive: a missing result would otherwise raise TypeError on
        # subscripting. NOTE(review): confirm fetchUrl can return None.
        if not res:
            self.debug(f"Skipping {fetch}, no response received.")
            return

        # A redirect means the candidate itself does not exist.
        if res['realurl'] != fetch:
            self.debug(f"Skipping because {res['realurl']} isn't the fetched URL of {fetch}")
            return

        if res['code'] != "200":
            return

        if not self.checkValidity(fetch):
            return

        evt = SpiderFootEvent("JUNK_FILE", fetch, self.__name__, event)
        self.notifyListeners(evt)

    if host_is_skipped():
        return

    # Strip the query string AND the fragment before appending an extension.
    # Splitting on "?" alone turned "abc.php#top" into "abc.php#top.bak",
    # which is a fragment suffix rather than a backup-file path.
    url_without_suffix = eventData.split("?")[0].split("#")[0]

    # http://www/blah/abc.php -> try http://www/blah/abc.php.[fileexts]
    for ext in self.opts['urlextstry']:
        if host_is_skipped():
            return

        dotted = "." + ext
        has_ext = (
            dotted + "?" in eventData
            or dotted + "#" in eventData
            or eventData.endswith(dotted)
        )
        if not has_ext:
            continue

        for x in self.opts['fileexts']:
            if self.checkForStop():
                return

            self.debug("Trying " + x + " against " + eventData)
            probe(url_without_suffix + "." + x, sizeLimit=10000000)

    base = SpiderFootHelpers.urlBaseDir(eventData)
    if not base or base in self.bases:
        return

    # Each base directory is probed only once.
    self.bases[base] = True

    # http://www/blah/abc.html -> try http://www/blah/[files]
    for f in self.opts['files']:
        if self.checkForStop():
            return

        if host_is_skipped():
            return

        self.debug("Trying " + f + " against " + eventData)
        probe(base + f)

    # don't do anything with the root directory of a site
    self.debug(f"Base: {base}, event: {eventData}")
    if base in [eventData, eventData + "/"]:
        return

    # A base with exactly three slashes ("scheme://host/") is the site root;
    # there is no directory name to append an extension to. The check does
    # not depend on the loop variable, so it is done once up front.
    if base.count('/') == 3:
        self.debug("Skipping base url.")
        return

    # Drop the trailing character of the base (the slash) once.
    dir_url = base[0:len(base) - 1]

    # http://www/blah/abc.html -> try http://www/blah.[dirs]
    for dirfile in self.opts['dirs']:
        if self.checkForStop():
            return

        if host_is_skipped():
            return

        self.debug("Trying " + dirfile + " against " + eventData)
        probe(dir_url + "." + dirfile)