def checkValidity(self, junkUrl):
    # Try and fetch an obviously missing version of the junk file
    fetch = junkUrl + str(random.SystemRandom().randint(0, 99999999))
    res = self.sf.fetchUrl(
        fetch,
        headOnly=True,
        timeout=self.opts['_fetchtimeout'],
        useragent=self.opts['_useragent'],
        verify=False
    )

    if res['code'] != "404":
        host = SpiderFootHelpers.urlBaseUrl(junkUrl)
        self.skiphosts[host] = True
        return False

    return True
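
# Illustrative sketch only (not part of SpiderFoot): the same soft-404 check as
# checkValidity() above, expressed with just the standard library. It appends a
# random suffix to a URL that should not exist and trusts the host only if the
# server answers 404. The helper name and the use of urllib are assumptions made
# for this example.
import random
import urllib.error
import urllib.request


def host_returns_real_404s(junk_url, timeout=5):
    probe = junk_url + str(random.SystemRandom().randint(0, 99999999))
    req = urllib.request.Request(probe, method="HEAD")
    try:
        urllib.request.urlopen(req, timeout=timeout)
    except urllib.error.HTTPError as e:
        return e.code == 404  # a genuine 404 means probe results for this host are trustworthy
    except urllib.error.URLError:
        return False  # network failure: treat the host as unverifiable
    return False  # a 2xx/3xx for a random path means the host wildcards responses
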
def cleanLinks(self, links):
    returnLinks = dict()

    for link in links:
        linkBase = SpiderFootHelpers.urlBaseUrl(link)
        linkFQDN = self.sf.urlFQDN(link)

        # Skip external sites (typical behaviour..)
        if not self.getTarget().matches(linkFQDN):
            # self.debug('Ignoring external site: ' + link)
            continue

        # Optionally skip sub-domain sites
        if self.opts['nosubs'] and not \
                self.getTarget().matches(linkFQDN, includeChildren=False):
            # self.debug("Ignoring subdomain: " + link)
            continue

        # Skip parent domain sites
        if not self.getTarget().matches(linkFQDN, includeParents=False):
            # self.debug("Ignoring parent domain: " + link)
            continue

        # Optionally skip user directories
        if self.opts['filterusers'] and '/~' in link:
            # self.debug("Ignoring user folder: " + link)
            continue

        # If we are respecting robots.txt, filter those out too
        if linkBase in self.robotsRules and self.opts['robotsonly']:
            if list(filter(lambda blocked: blocked.lower() in link.lower() or blocked == '*', self.robotsRules[linkBase])):
                # self.debug("Ignoring page found in robots.txt: " + link)
                continue

        # All tests passed, add link to be spidered
        self.debug("Adding URL for spidering: " + link)
        returnLinks[link] = links[link]

    return returnLinks
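
# Minimal sketch (an assumption, not SpiderFoot code) of how the robots.txt filter
# in cleanLinks() decides to drop a link: a link is skipped if any disallowed path
# fragment appears in it, or if everything is disallowed via '*'. The function and
# variable names here are hypothetical.
def blocked_by_robots(link, disallowed):
    return any(rule == '*' or rule.lower() in link.lower() for rule in disallowed)


if __name__ == "__main__":
    rules = ['/admin', '/tmp']
    print(blocked_by_robots('http://example.local/admin/index.php', rules))  # True
    print(blocked_by_robots('http://example.local/about.html', rules))       # False
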
def test_url_base_url_should_return_a_string(self):
    base_url = SpiderFootHelpers.urlBaseUrl('http://localhost.local/path?param=value#fragment')
    self.assertIsInstance(base_url, str)
    self.assertEqual('http://localhost.local', base_url)
def handleEvent(self, event):
    eventName = event.eventType
    srcModuleName = event.module
    eventData = event.data

    self.debug(f"Received event, {eventName}, from {srcModuleName}")

    # SIMILARDOMAIN and CO_HOSTED_SITE events are domains, not URLs.
    # Assume HTTP.
    if eventName in ['SIMILARDOMAIN', 'CO_HOSTED_SITE']:
        url = 'http://' + eventData.lower()
    elif 'URL' in eventName:
        url = eventData
    else:
        return

    fqdn = self.sf.urlFQDN(url)

    # We are only interested in external sites for the crossref
    if self.getTarget().matches(fqdn):
        self.debug(f"Ignoring {url} as not external")
        return

    if eventData in self.fetched:
        self.debug(f"Ignoring {url} as already tested")
        return

    if not self.sf.resolveHost(fqdn) and not self.sf.resolveHost6(fqdn):
        self.debug(f"Ignoring {url} as {fqdn} does not resolve")
        return

    self.fetched[url] = True

    self.debug(f"Testing URL for affiliation: {url}")

    res = self.sf.fetchUrl(
        url,
        timeout=self.opts['_fetchtimeout'],
        useragent=self.opts['_useragent'],
        sizeLimit=10000000,
        verify=False
    )

    if res['content'] is None:
        self.debug(f"Ignoring {url} as no data returned")
        return

    matched = False
    for name in self.getTarget().getNames():
        # Search for mentions of our host/domain in the external site's data
        pat = re.compile(
            r"([\.\'\/\"\ ]" + re.escape(name) + r"[\.\'\/\"\ ])",
            re.IGNORECASE
        )
        matches = re.findall(pat, str(res['content']))

        if len(matches) > 0:
            matched = True
            break

    if not matched:
        # If the name wasn't found in the affiliate, and checkbase is set,
        # fetch the base URL of the affiliate to check for a crossref.
        if eventName == "LINKED_URL_EXTERNAL" and self.opts['checkbase']:
            # Check the base url to see if there is an affiliation
            url = SpiderFootHelpers.urlBaseUrl(eventData)
            if url in self.fetched:
                return

            self.fetched[url] = True

            res = self.sf.fetchUrl(
                url,
                timeout=self.opts['_fetchtimeout'],
                useragent=self.opts['_useragent'],
                sizeLimit=10000000,
                verify=False
            )

            if res['content'] is not None:
                for name in self.getTarget().getNames():
                    pat = re.compile(
                        r"([\.\'\/\"\ ]" + re.escape(name) + r"[\'\/\"\ ])",
                        re.IGNORECASE
                    )
                    matches = re.findall(pat, str(res['content']))

                    if len(matches) > 0:
                        matched = True
                        break

    if not matched:
        return

    if not event.moduleDataSource:
        event.moduleDataSource = "Unknown"

    self.info(f"Found link to target from affiliate: {url}")

    evt1 = SpiderFootEvent("AFFILIATE_INTERNET_NAME", self.sf.urlFQDN(url), self.__name__, event)
    evt1.moduleDataSource = event.moduleDataSource
    self.notifyListeners(evt1)

    evt2 = SpiderFootEvent("AFFILIATE_WEB_CONTENT", res['content'], self.__name__, evt1)
    evt2.moduleDataSource = event.moduleDataSource
    self.notifyListeners(evt2)
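
# Illustrative only: the kind of delimiter-bounded pattern handleEvent() builds above
# to spot mentions of a target name inside fetched page content. The helper name and
# the sample strings are made up for this example.
import re


def mentions_target(content, name):
    pat = re.compile(r"([\.\'\/\"\ ]" + re.escape(name) + r"[\.\'\/\"\ ])", re.IGNORECASE)
    return bool(pat.findall(str(content)))


if __name__ == "__main__":
    print(mentions_target('Visit "example.com" for details.', 'example.com'))  # True
    print(mentions_target('Nothing about examplexcom here.', 'example.com'))   # False
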
def spiderFrom(self, startingPoint):
    keepSpidering = True
    totalFetched = 0
    levelsTraversed = 0
    nextLinks = dict()
    targetBase = SpiderFootHelpers.urlBaseUrl(startingPoint)

    # Are we respecting robots.txt?
    if self.opts['robotsonly'] and targetBase not in self.robotsRules:
        robotsTxt = self.sf.fetchUrl(
            targetBase + '/robots.txt',
            timeout=self.opts['_fetchtimeout'],
            useragent=self.opts['_useragent'],
            verify=False
        )
        if robotsTxt['content'] is not None:
            self.debug('robots.txt contents: ' + robotsTxt['content'])
            self.robotsRules[targetBase] = SpiderFootHelpers.extractUrlsFromRobotsTxt(robotsTxt['content'])

    if self.checkForStop():
        return

    # First iteration we are starting with links found on the start page
    # Iterations after that are based on links found on those pages,
    # and so on..
    links = self.processUrl(startingPoint)  # fetch first page

    if links is None:
        self.debug("No links found on the first fetch!")
        return

    while keepSpidering:
        # Gets hit in the second and subsequent iterations when more links
        # are found
        if len(nextLinks) > 0:
            links = dict()

            # Fetch content from the new links
            for link in nextLinks:
                # Always skip links we've already fetched
                if link in self.fetchedPages:
                    self.debug("Already fetched " + link + ", skipping.")
                    continue

                # Check if we've been asked to stop
                if self.checkForStop():
                    return

                self.debug("Fetching fresh content from: " + link)
                time.sleep(self.opts['pausesec'])

                freshLinks = self.processUrl(link)
                if freshLinks is not None:
                    links.update(freshLinks)

                totalFetched += 1
                if totalFetched >= self.opts['maxpages']:
                    self.info("Maximum number of pages (" + str(self.opts['maxpages']) + ") reached.")
                    keepSpidering = False
                    break

        nextLinks = self.cleanLinks(links)
        self.debug(f"Found links: {nextLinks}")

        # We've scanned through another layer of the site
        levelsTraversed += 1
        self.debug(f"At level: {levelsTraversed}, Pages: {totalFetched}")
        if levelsTraversed >= self.opts['maxlevels']:
            self.info(f"Maximum number of levels ({self.opts['maxlevels']}) reached.")
            keepSpidering = False

        # We've reached the end of our journey..
        if len(nextLinks) == 0:
            self.debug("No more links found to spider, finishing..")
            keepSpidering = False

        # We've been asked to stop scanning
        if self.checkForStop():
            keepSpidering = False

    return
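
# Stand-alone sketch (an assumption, not the module's code) of the control flow in
# spiderFrom(): breadth-first expansion with page and depth caps. fetch_links() is a
# hypothetical stand-in for the processUrl()/cleanLinks() pair.
def crawl(start, fetch_links, max_pages=100, max_levels=3):
    fetched = set()
    frontier = {start}
    pages = 0
    for level in range(max_levels):
        next_frontier = set()
        for url in frontier:
            if url in fetched or pages >= max_pages:
                continue
            fetched.add(url)
            pages += 1
            next_frontier.update(fetch_links(url))
        frontier = next_frontier - fetched
        if not frontier:
            break
    return fetched


if __name__ == "__main__":
    site = {
        'http://example.local/': ['http://example.local/a', 'http://example.local/b'],
        'http://example.local/a': ['http://example.local/c'],
    }
    print(crawl('http://example.local/', lambda u: site.get(u, [])))
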
def handleEvent(self, event):
    eventName = event.eventType
    srcModuleName = event.module
    eventData = event.data

    self.debug(f"Received event, {eventName}, from {srcModuleName}")

    if eventData in self.results:
        return

    self.results[eventData] = True

    host = SpiderFootHelpers.urlBaseUrl(eventData)

    if host in self.skiphosts:
        self.debug("Skipping " + host + " because it doesn't return 404s.")
        return

    # http://www/blah/abc.php -> try http://www/blah/abc.php.[fileexts]
    for ext in self.opts['urlextstry']:
        if host in self.skiphosts:
            self.debug("Skipping " + host + " because it doesn't return 404s.")
            return

        if "." + ext + "?" in eventData or "." + ext + "#" in eventData or \
                eventData.endswith("." + ext):
            bits = eventData.split("?")

            for x in self.opts['fileexts']:
                if self.checkForStop():
                    return

                self.debug("Trying " + x + " against " + eventData)
                fetch = bits[0] + "." + x
                if fetch in self.results:
                    self.debug("Skipping, already fetched.")
                    continue

                self.results[fetch] = True

                res = self.sf.fetchUrl(
                    fetch,
                    headOnly=True,
                    timeout=self.opts['_fetchtimeout'],
                    useragent=self.opts['_useragent'],
                    sizeLimit=10000000,
                    verify=False
                )
                if res['realurl'] != fetch:
                    self.debug("Skipping because " + res['realurl'] + " isn't the fetched URL of " + fetch)
                    continue

                if res['code'] == "200":
                    if not self.checkValidity(fetch):
                        continue

                    evt = SpiderFootEvent("JUNK_FILE", fetch, self.__name__, event)
                    self.notifyListeners(evt)

    base = SpiderFootHelpers.urlBaseDir(eventData)
    if not base or base in self.bases:
        return

    self.bases[base] = True

    # http://www/blah/abc.html -> try http://www/blah/[files]
    for f in self.opts['files']:
        if self.checkForStop():
            return

        if host in self.skiphosts:
            self.debug("Skipping " + host + " because it doesn't return 404s.")
            return

        self.debug("Trying " + f + " against " + eventData)
        fetch = base + f
        if fetch in self.results:
            self.debug("Skipping, already fetched.")
            continue

        self.results[fetch] = True

        res = self.sf.fetchUrl(
            fetch,
            headOnly=True,
            timeout=self.opts['_fetchtimeout'],
            useragent=self.opts['_useragent'],
            verify=False
        )
        if res['realurl'] != fetch:
            self.debug("Skipping because " + res['realurl'] + " isn't the fetched URL of " + fetch)
            continue

        if res['code'] == "200":
            if not self.checkValidity(fetch):
                continue

            evt = SpiderFootEvent("JUNK_FILE", fetch, self.__name__, event)
            self.notifyListeners(evt)

    # don't do anything with the root directory of a site
    self.debug(f"Base: {base}, event: {eventData}")
    if base in [eventData, eventData + "/"]:
        return

    # http://www/blah/abc.html -> try http://www/blah.[dirs]
    for dirfile in self.opts['dirs']:
        if self.checkForStop():
            return

        if host in self.skiphosts:
            self.debug("Skipping " + host + " because it doesn't return 404s.")
            return

        if base.count('/') == 3:
            self.debug("Skipping base url.")
            continue

        self.debug("Trying " + dirfile + " against " + eventData)
        fetch = base[0:len(base) - 1] + "." + dirfile
        if fetch in self.results:
            self.debug("Skipping, already fetched.")
            continue

        self.results[fetch] = True

        res = self.sf.fetchUrl(
            fetch,
            headOnly=True,
            timeout=self.opts['_fetchtimeout'],
            useragent=self.opts['_useragent'],
            verify=False
        )
        if res['realurl'] != fetch:
            self.debug("Skipping because " + res['realurl'] + " isn't the fetched URL of " + fetch)
            continue

        if res['code'] == "200":
            if not self.checkValidity(fetch):
                continue

            evt = SpiderFootEvent("JUNK_FILE", fetch, self.__name__, event)
            self.notifyListeners(evt)
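
# Hypothetical sketch of the URL permutations the handler above probes for, kept
# separate from any fetching. The extension/file/dir lists mirror the spirit of the
# module's 'fileexts', 'files' and 'dirs' options, but the exact values and the
# helper name are assumptions for this illustration.
def junk_candidates(url, fileexts=('bak', 'old', 'tmp'), files=('.htaccess',), dirs=('zip', 'tar.gz')):
    candidates = []

    # http://host/dir/page.php -> http://host/dir/page.php.bak, .old, ...
    path = url.split('?')[0]
    candidates += [path + '.' + ext for ext in fileexts]

    # http://host/dir/page.php -> http://host/dir/.htaccess, ...
    base = path.rsplit('/', 1)[0] + '/'
    candidates += [base + f for f in files]

    # http://host/dir/page.php -> http://host/dir.zip, http://host/dir.tar.gz, ...
    candidates += [base.rstrip('/') + '.' + d for d in dirs]

    return candidates


if __name__ == "__main__":
    for c in junk_candidates('http://example.local/app/login.php?next=/'):
        print(c)
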
def handleEvent(self, event):
    eventData = event.data

    if self.errorState:
        return

    if self.opts['api_key'] == "":
        self.error(
            f"You enabled {self.__class__.__name__} but did not set a Google API key!"
        )
        self.errorState = True
        return

    if eventData in self.results:
        return

    self.results[eventData] = True

    for dom in list(self.domains.keys()):
        target = self.domains[dom]
        res = self.sf.googleIterate(
            searchString=f'+site:{target} "{eventData}"',
            opts={
                "timeout": self.opts["_fetchtimeout"],
                "useragent": self.opts["_useragent"],
                "api_key": self.opts["api_key"],
                "cse_id": self.opts["cse_id"],
            },
        )

        if res is None:
            # Failed to talk to the Google API or no results returned
            return

        urls = res["urls"]
        new_links = list(set(urls) - set(self.results.keys()))

        # Add new links to results
        for link in new_links:
            self.results[link] = True

        relevant_links = [
            link for link in new_links
            if SpiderFootHelpers.urlBaseUrl(link).endswith(target)
        ]

        for link in relevant_links:
            self.debug("Found a link: " + link)

            if self.checkForStop():
                return

            res = self.sf.fetchUrl(
                link,
                timeout=self.opts['_fetchtimeout'],
                useragent=self.opts['_useragent']
            )

            if res['content'] is None:
                self.debug(f"Ignoring {link} as no data returned")
                continue

            if re.search(
                r"[^a-zA-Z\-\_0-9]" + re.escape(eventData) + r"[^a-zA-Z\-\_0-9]",
                res['content'],
                re.IGNORECASE
            ) is None:
                continue

            evt1 = SpiderFootEvent("LEAKSITE_URL", link, self.__name__, event)
            self.notifyListeners(evt1)

            evt2 = SpiderFootEvent("LEAKSITE_CONTENT", res['content'], self.__name__, evt1)
            self.notifyListeners(evt2)
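
# Small illustration (not the module itself) of the relevance filter above: keep only
# result URLs whose base URL ends with the searched leak-site domain. The urlparse-based
# base_url() helper is an assumption standing in for SpiderFootHelpers.urlBaseUrl.
from urllib.parse import urlparse


def base_url(link):
    parts = urlparse(link)
    return f"{parts.scheme}://{parts.netloc}"


def relevant(links, target_domain):
    return [link for link in links if base_url(link).endswith(target_domain)]


if __name__ == "__main__":
    found = ['https://pastebin.com/raw/abc123', 'https://example.org/mirror/abc123']
    print(relevant(found, 'pastebin.com'))  # ['https://pastebin.com/raw/abc123']
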