def test_url_base_dir_should_return_a_string(self):
    """Check that urlBaseDir yields a str holding the URL's base directory.

    The sample URL carries a path, a query string and a fragment; the
    expected result keeps only the scheme, host and trailing slash.
    """
    sample_url = 'http://localhost.local/path?param=value#fragment'
    expected = 'http://localhost.local/'

    result = SpiderFootHelpers.urlBaseDir(sample_url)

    self.assertIsInstance(result, str)
    self.assertEqual(expected, result)
Пример #2
0
    def handleEvent(self, event):
        """Probe for junk files related to the URL carried by an event.

        Three families of candidate URLs are derived from ``event.data``
        and requested with ``headOnly=True``:

        1. If the URL has one of the extensions in ``opts['urlextstry']``
           (at the end of the URL, or directly followed by ``?`` or ``#``),
           the URL with its query string and fragment removed is tried
           with each of ``opts['fileexts']`` appended.
        2. Each of ``opts['files']`` appended to the URL's base directory
           (as returned by ``SpiderFootHelpers.urlBaseDir``).
        3. The base directory minus its last character, with each of
           ``opts['dirs']`` appended as an extension. This step is skipped
           for the site root.

        Each URL and each base directory is processed at most once
        (tracked in ``self.results`` and ``self.bases``). Processing stops
        early when ``checkForStop()`` is true or the URL's host is listed
        in ``self.skiphosts``.

        Args:
            event: event whose ``data`` is the URL to examine; its
                ``eventType`` and ``module`` are only used for logging.

        Returns:
            None. Findings are reported as ``JUNK_FILE`` events via
            ``notifyListeners``.
        """
        eventName = event.eventType
        srcModuleName = event.module
        eventData = event.data

        self.debug(f"Received event, {eventName}, from {srcModuleName}")

        if eventData in self.results:
            return

        self.results[eventData] = True

        host = SpiderFootHelpers.urlBaseUrl(eventData)

        # NOTE(review): skiphosts is re-checked on every loop iteration
        # below; presumably checkValidity() can add hosts to it while
        # probing -- confirm against its implementation.
        if host in self.skiphosts:
            self.debug(f"Skipping {host} because it doesn't return 404s.")
            return

        # The URL without fragment and query string. Both must go: the
        # match below accepts ".ext#..." as well as ".ext?...", and leaving
        # the fragment in place would build candidates such as
        # "http://www/abc.php#top.bak" instead of "http://www/abc.php.bak".
        target = eventData.split("#", 1)[0].split("?", 1)[0]

        # http://www/blah/abc.php -> try http://www/blah/abc.php.[fileexts]
        for ext in self.opts['urlextstry']:
            if host in self.skiphosts:
                self.debug(f"Skipping {host} because it doesn't return 404s.")
                return

            suffix = "." + ext
            hasExt = (suffix + "?" in eventData
                      or suffix + "#" in eventData
                      or eventData.endswith(suffix))
            if not hasExt:
                continue

            for x in self.opts['fileexts']:
                if self.checkForStop():
                    return

                self.debug(f"Trying {x} against {eventData}")
                self._probeJunkUrl(target + "." + x, event,
                                   sizeLimit=10000000)

        base = SpiderFootHelpers.urlBaseDir(eventData)
        if not base or base in self.bases:
            return

        self.bases[base] = True

        # http://www/blah/abc.html -> try http://www/blah/[files]
        for f in self.opts['files']:
            if self.checkForStop():
                return

            if host in self.skiphosts:
                self.debug(f"Skipping {host} because it doesn't return 404s.")
                return

            self.debug(f"Trying {f} against {eventData}")
            self._probeJunkUrl(base + f, event)

        # don't do anything with the root directory of a site
        self.debug(f"Base: {base}, event: {eventData}")
        if base in [eventData, eventData + "/"]:
            return

        # A base with exactly three slashes has no path component beyond
        # the root (e.g. "http://www/"); dropping its last character and
        # appending ".<ext>" would change the host name rather than name a
        # file. The check does not depend on the loop variable, so it is
        # done once up front.
        if base.count('/') == 3:
            self.debug("Skipping base url.")
            return

        # http://www/blah/abc.html -> try http://www/blah.[dirs]
        for dirfile in self.opts['dirs']:
            if self.checkForStop():
                return

            if host in self.skiphosts:
                self.debug(f"Skipping {host} because it doesn't return 404s.")
                return

            self.debug(f"Trying {dirfile} against {eventData}")
            self._probeJunkUrl(base[0:len(base) - 1] + "." + dirfile, event)

    def _probeJunkUrl(self, fetch, event, **fetchOpts):
        """Request one candidate URL and emit JUNK_FILE if it is found.

        The candidate is skipped if it is already recorded in
        ``self.results``; otherwise it is recorded and fetched. A
        ``JUNK_FILE`` event is emitted only when the reported ``realurl``
        equals the requested URL, the response code is ``"200"`` and
        ``checkValidity(fetch)`` is truthy.

        Args:
            fetch: candidate URL to request.
            event: source event, used as the parent of any emitted event.
            **fetchOpts: extra keyword arguments forwarded to
                ``self.sf.fetchUrl`` (e.g. ``sizeLimit``).
        """
        if fetch in self.results:
            self.debug("Skipping, already fetched.")
            return

        self.results[fetch] = True

        res = self.sf.fetchUrl(fetch,
                               headOnly=True,
                               timeout=self.opts['_fetchtimeout'],
                               useragent=self.opts['_useragent'],
                               verify=False,
                               **fetchOpts)

        # NOTE(review): fetchUrl's failure contract is not visible here;
        # treat a missing result as "not found" instead of failing on the
        # subscripts below.
        if not res:
            return

        if res['realurl'] != fetch:
            self.debug(f"Skipping because {res['realurl']}"
                       f" isn't the fetched URL of {fetch}")
            return

        if res['code'] != "200":
            return

        if not self.checkValidity(fetch):
            return

        evt = SpiderFootEvent("JUNK_FILE", fetch, self.__name__, event)
        self.notifyListeners(evt)