Example #1
    def checkValidity(self, junkUrl):
        # Try to fetch an obviously non-existent variant of the junk file to
        # confirm the host really returns 404s for missing URLs.
        fetch = junkUrl + str(random.SystemRandom().randint(0, 99999999))
        res = self.sf.fetchUrl(fetch,
                               headOnly=True,
                               timeout=self.opts['_fetchtimeout'],
                               useragent=self.opts['_useragent'],
                               verify=False)
        if res['code'] != "404":
            host = SpiderFootHelpers.urlBaseUrl(junkUrl)
            self.skiphosts[host] = True
            return False
        return True
Example #2
    def cleanLinks(self, links):
        returnLinks = dict()

        for link in links:
            linkBase = SpiderFootHelpers.urlBaseUrl(link)
            linkFQDN = self.sf.urlFQDN(link)

            # Skip external sites (typical behaviour..)
            if not self.getTarget().matches(linkFQDN):
                # self.debug('Ignoring external site: ' + link)
                continue

            # Optionally skip sub-domain sites
            if self.opts['nosubs'] and not \
                    self.getTarget().matches(linkFQDN, includeChildren=False):
                # self.debug("Ignoring subdomain: " + link)
                continue

            # Skip parent domain sites
            if not self.getTarget().matches(linkFQDN, includeParents=False):
                # self.debug("Ignoring parent domain: " + link)
                continue

            # Optionally skip user directories
            if self.opts['filterusers'] and '/~' in link:
                # self.debug("Ignoring user folder: " + link)
                continue

            # If we are respecting robots.txt, filter those out too
            if linkBase in self.robotsRules and self.opts['robotsonly']:
                if any(blocked == '*' or blocked.lower() in link.lower()
                       for blocked in self.robotsRules[linkBase]):
                    # self.debug("Ignoring page found in robots.txt: " + link)
                    continue

            # All tests passed, add link to be spidered
            self.debug("Adding URL for spidering: " + link)
            returnLinks[link] = links[link]

        return returnLinks
Example #3
    def test_url_base_url_should_return_a_string(self):
        base_url = SpiderFootHelpers.urlBaseUrl(
            'http://localhost.local/path?param=value#fragment')
        self.assertIsInstance(base_url, str)
        self.assertEqual('http://localhost.local', base_url)
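Judging from this test, urlBaseUrl keeps only the scheme and host of a URL and drops the path, query string and fragment. The standalone sketch below approximates that behaviour with the standard library; it is an illustration based on the test above, not the SpiderFoot implementation, and edge cases (ports, scheme-less input, case handling) may be handled differently.

from urllib.parse import urlparse

def base_url(url):
    # Rough approximation of SpiderFootHelpers.urlBaseUrl(): keep the scheme
    # and host, drop the path, query string and fragment.
    parsed = urlparse(url)
    if not parsed.scheme or not parsed.hostname:
        return url
    return f"{parsed.scheme}://{parsed.hostname}"

print(base_url('http://localhost.local/path?param=value#fragment'))
# -> http://localhost.local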
Example #4
    def handleEvent(self, event):
        eventName = event.eventType
        srcModuleName = event.module
        eventData = event.data

        self.debug(f"Received event, {eventName}, from {srcModuleName}")

        # SIMILARDOMAIN and CO_HOSTED_SITE events are domains, not URLs.
        # Assume HTTP.
        if eventName in ['SIMILARDOMAIN', 'CO_HOSTED_SITE']:
            url = 'http://' + eventData.lower()
        elif 'URL' in eventName:
            url = eventData
        else:
            return

        fqdn = self.sf.urlFQDN(url)

        # We are only interested in external sites for the crossref
        if self.getTarget().matches(fqdn):
            self.debug(f"Ignoring {url} as not external")
            return

        if url in self.fetched:
            self.debug(f"Ignoring {url} as already tested")
            return

        if not self.sf.resolveHost(fqdn) and not self.sf.resolveHost6(fqdn):
            self.debug(f"Ignoring {url} as {fqdn} does not resolve")
            return

        self.fetched[url] = True

        self.debug(f"Testing URL for affiliation: {url}")

        res = self.sf.fetchUrl(url,
                               timeout=self.opts['_fetchtimeout'],
                               useragent=self.opts['_useragent'],
                               sizeLimit=10000000,
                               verify=False)

        if res['content'] is None:
            self.debug(f"Ignoring {url} as no data returned")
            return

        matched = False
        for name in self.getTarget().getNames():
            # Search for mentions of our host/domain in the external site's data
            pat = re.compile(
                r"([\.\'\/\"\ ]" + re.escape(name) + r"[\.\'\/\"\ ])",
                re.IGNORECASE)
            matches = re.findall(pat, str(res['content']))

            if len(matches) > 0:
                matched = True
                break

        if not matched:
            # If the name wasn't found in the affiliate, and checkbase is set,
            # fetch the base URL of the affiliate to check for a crossref.
            if eventName == "LINKED_URL_EXTERNAL" and self.opts['checkbase']:
                # Check the base url to see if there is an affiliation
                url = SpiderFootHelpers.urlBaseUrl(eventData)
                if url in self.fetched:
                    return

                self.fetched[url] = True

                res = self.sf.fetchUrl(url,
                                       timeout=self.opts['_fetchtimeout'],
                                       useragent=self.opts['_useragent'],
                                       sizeLimit=10000000,
                                       verify=False)

                if res['content'] is not None:
                    for name in self.getTarget().getNames():
                        pat = re.compile(
                            r"([\.\'\/\"\ ]" + re.escape(name) +
                            r"[\.\'\/\"\ ])", re.IGNORECASE)
                        matches = re.findall(pat, str(res['content']))

                        if len(matches) > 0:
                            matched = True
                            break

        if not matched:
            return

        if not event.moduleDataSource:
            event.moduleDataSource = "Unknown"

        self.info(f"Found link to target from affiliate: {url}")

        evt1 = SpiderFootEvent("AFFILIATE_INTERNET_NAME", self.sf.urlFQDN(url),
                               self.__name__, event)
        evt1.moduleDataSource = event.moduleDataSource
        self.notifyListeners(evt1)

        evt2 = SpiderFootEvent("AFFILIATE_WEB_CONTENT", res['content'],
                               self.__name__, evt1)
        evt2.moduleDataSource = event.moduleDataSource
        self.notifyListeners(evt2)
Example #5
    def spiderFrom(self, startingPoint):
        keepSpidering = True
        totalFetched = 0
        levelsTraversed = 0
        nextLinks = dict()
        targetBase = SpiderFootHelpers.urlBaseUrl(startingPoint)

        # Are we respecting robots.txt?
        if self.opts['robotsonly'] and targetBase not in self.robotsRules:
            robotsTxt = self.sf.fetchUrl(targetBase + '/robots.txt',
                                         timeout=self.opts['_fetchtimeout'],
                                         useragent=self.opts['_useragent'],
                                         verify=False)
            if robotsTxt['content'] is not None:
                self.debug('robots.txt contents: ' + robotsTxt['content'])
                self.robotsRules[targetBase] = SpiderFootHelpers.extractUrlsFromRobotsTxt(robotsTxt['content'])

        if self.checkForStop():
            return

        # First iteration we are starting with links found on the start page
        # Iterations after that are based on links found on those pages,
        # and so on..
        links = self.processUrl(startingPoint)  # fetch first page

        if links is None:
            self.debug("No links found on the first fetch!")
            return

        while keepSpidering:
            # Gets hit in the second and subsequent iterations when more links
            # are found
            if len(nextLinks) > 0:
                links = dict()

                # Fetch content from the new links
                for link in nextLinks:
                    # Always skip links we've already fetched
                    if link in self.fetchedPages:
                        self.debug("Already fetched " + link + ", skipping.")
                        continue

                    # Check if we've been asked to stop
                    if self.checkForStop():
                        return

                    self.debug("Fetching fresh content from: " + link)
                    time.sleep(self.opts['pausesec'])
                    freshLinks = self.processUrl(link)
                    if freshLinks is not None:
                        links.update(freshLinks)

                    totalFetched += 1
                    if totalFetched >= self.opts['maxpages']:
                        self.info("Maximum number of pages (" + str(self.opts['maxpages'])
                                  + ") reached.")
                        keepSpidering = False
                        break

            nextLinks = self.cleanLinks(links)
            self.debug(f"Found links: {nextLinks}")

            # We've scanned through another layer of the site
            levelsTraversed += 1
            self.debug(f"At level: {levelsTraversed}, Pages: {totalFetched}")
            if levelsTraversed >= self.opts['maxlevels']:
                self.info(f"Maximum number of levels ({self.opts['maxlevels']}) reached.")
                keepSpidering = False

            # We've reached the end of our journey..
            if len(nextLinks) == 0:
                self.debug("No more links found to spider, finishing..")
                keepSpidering = False

            # We've been asked to stop scanning
            if self.checkForStop():
                keepSpidering = False

        return
Example #6
    def handleEvent(self, event):
        eventName = event.eventType
        srcModuleName = event.module
        eventData = event.data

        self.debug(f"Received event, {eventName}, from {srcModuleName}")

        if eventData in self.results:
            return

        self.results[eventData] = True

        host = SpiderFootHelpers.urlBaseUrl(eventData)

        if host in self.skiphosts:
            self.debug("Skipping " + host + " because it doesn't return 404s.")
            return

        # http://www/blah/abc.php -> try http://www/blah/abc.php.[fileexts]
        for ext in self.opts['urlextstry']:
            if host in self.skiphosts:
                self.debug("Skipping " + host +
                           " because it doesn't return 404s.")
                return

            if "." + ext + "?" in eventData or "." + ext + "#" in eventData or \
                    eventData.endswith("." + ext):
                bits = eventData.split("?")
                for x in self.opts['fileexts']:
                    if self.checkForStop():
                        return

                    self.debug("Trying " + x + " against " + eventData)
                    fetch = bits[0] + "." + x
                    if fetch in self.results:
                        self.debug("Skipping, already fetched.")
                        continue

                    self.results[fetch] = True

                    res = self.sf.fetchUrl(fetch,
                                           headOnly=True,
                                           timeout=self.opts['_fetchtimeout'],
                                           useragent=self.opts['_useragent'],
                                           sizeLimit=10000000,
                                           verify=False)
                    if res['realurl'] != fetch:
                        self.debug("Skipping because " + res['realurl'] +
                                   " isn't the fetched URL of " + fetch)
                        continue
                    if res['code'] == "200":
                        if not self.checkValidity(fetch):
                            continue

                        evt = SpiderFootEvent("JUNK_FILE", fetch,
                                              self.__name__, event)
                        self.notifyListeners(evt)

        base = SpiderFootHelpers.urlBaseDir(eventData)
        if not base or base in self.bases:
            return

        self.bases[base] = True

        # http://www/blah/abc.html -> try http://www/blah/[files]
        for f in self.opts['files']:
            if self.checkForStop():
                return

            if host in self.skiphosts:
                self.debug("Skipping " + host +
                           " because it doesn't return 404s.")
                return

            self.debug("Trying " + f + " against " + eventData)
            fetch = base + f
            if fetch in self.results:
                self.debug("Skipping, already fetched.")
                continue

            self.results[fetch] = True

            res = self.sf.fetchUrl(fetch,
                                   headOnly=True,
                                   timeout=self.opts['_fetchtimeout'],
                                   useragent=self.opts['_useragent'],
                                   verify=False)
            if res['realurl'] != fetch:
                self.debug("Skipping because " + res['realurl'] +
                           " isn't the fetched URL of " + fetch)
                continue
            if res['code'] == "200":
                if not self.checkValidity(fetch):
                    continue

                evt = SpiderFootEvent("JUNK_FILE", fetch, self.__name__, event)
                self.notifyListeners(evt)

        # don't do anything with the root directory of a site
        self.debug(f"Base: {base}, event: {eventData}")
        if base in [eventData, eventData + "/"]:
            return

        # http://www/blah/abc.html -> try http://www/blah.[dirs]
        for dirfile in self.opts['dirs']:
            if self.checkForStop():
                return

            if host in self.skiphosts:
                self.debug("Skipping " + host +
                           " because it doesn't return 404s.")
                return

            if base.count('/') == 3:
                self.debug("Skipping base url.")
                continue

            self.debug("Trying " + dirfile + " against " + eventData)
            fetch = base[:-1] + "." + dirfile
            if fetch in self.results:
                self.debug("Skipping, already fetched.")
                continue

            self.results[fetch] = True

            res = self.sf.fetchUrl(fetch,
                                   headOnly=True,
                                   timeout=self.opts['_fetchtimeout'],
                                   useragent=self.opts['_useragent'],
                                   verify=False)
            if res['realurl'] != fetch:
                self.debug("Skipping because " + res['realurl'] +
                           " isn't the fetched URL of " + fetch)
                continue
            if res['code'] == "200":
                if not self.checkValidity(fetch):
                    continue

                evt = SpiderFootEvent("JUNK_FILE", fetch, self.__name__, event)
                self.notifyListeners(evt)
Example #7
    def handleEvent(self, event):
        eventData = event.data

        if self.errorState:
            return

        if self.opts['api_key'] == "":
            self.error(
                f"You enabled {self.__class__.__name__} but did not set a Google API key!"
            )
            self.errorState = True
            return

        if eventData in self.results:
            return

        self.results[eventData] = True

        for dom in list(self.domains.keys()):
            target = self.domains[dom]
            res = self.sf.googleIterate(
                searchString=f'+site:{target} "{eventData}"',
                opts={
                    "timeout": self.opts["_fetchtimeout"],
                    "useragent": self.opts["_useragent"],
                    "api_key": self.opts["api_key"],
                    "cse_id": self.opts["cse_id"],
                },
            )

            if res is None:
                # Failed to talk to the Google API or no results returned
                return

            urls = res["urls"]
            new_links = list(set(urls) - set(self.results.keys()))

            # Add new links to results
            for link in new_links:
                self.results[link] = True

            relevant_links = [
                link for link in new_links
                if SpiderFootHelpers.urlBaseUrl(link).endswith(target)
            ]

            for link in relevant_links:
                self.debug("Found a link: " + link)

                if self.checkForStop():
                    return

                res = self.sf.fetchUrl(link,
                                       timeout=self.opts['_fetchtimeout'],
                                       useragent=self.opts['_useragent'])

                if res['content'] is None:
                    self.debug(f"Ignoring {link} as no data returned")
                    continue

                if re.search(
                        r"[^a-zA-Z\-\_0-9]" + re.escape(eventData) +
                        r"[^a-zA-Z\-\_0-9]", res['content'],
                        re.IGNORECASE) is None:
                    continue

                evt1 = SpiderFootEvent("LEAKSITE_URL", link, self.__name__,
                                       event)
                self.notifyListeners(evt1)

                evt2 = SpiderFootEvent("LEAKSITE_CONTENT", res['content'],
                                       self.__name__, evt1)
                self.notifyListeners(evt2)