def test_extractUrlsFromText_should_extract_urls_from_string(self):
        """Verify extractUrlsFromText returns a list and finds URLs embedded in text.

        Two things are checked:
        * for None, an empty string, an empty list and an empty dict the
          return value is still a list;
        * both an https URL and an http URL with an explicit port are
          found when they are glued to surrounding characters and
          separated by carriage returns.
        """
        # Inputs without usable text: only the return type is asserted.
        for bad_input in (None, "", list(), dict()):
            with self.subTest(invalid_type=bad_input):
                extracted = SpiderFootHelpers.extractUrlsFromText(bad_input)
                self.assertIsInstance(extracted, list)

        sample_text = (
            "abchttps://example.spiderfoot.net/path\r"
            "abchttp://example.spiderfoot.net:1337/path\r"
            "abc"
        )
        found_urls = SpiderFootHelpers.extractUrlsFromText(sample_text)
        self.assertIsInstance(found_urls, list)

        expected_urls = (
            "https://example.spiderfoot.net/path",
            "http://example.spiderfoot.net:1337/path",
        )
        for expected in expected_urls:
            self.assertIn(expected, found_urls)
예제 #2
0
    def handleEvent(self, event):
        """Query for the event's data page by page and report what the results contain.

        For every page returned by ``self.query`` the method emits:

        * ``EMAILADDR`` / ``EMAILADDR_GENERIC`` for e-mail addresses whose
          domain matches the scan target;
        * ``LINKED_URL_INTERNAL`` for URLs whose host matches the target
          (children and parents included), followed by ``INTERNET_NAME``
          or ``INTERNET_NAME_UNRESOLVED`` for hosts not seen before;
        * ``PUBLIC_CODE_REPO`` and ``RAW_RIR_DATA`` for results that
          mention the event data and carry both a ``repo`` and a ``url``.

        ``self.results`` is used as the de-duplication record for event
        data, e-mail addresses, links, hosts and repositories.

        Args:
            event: triggering event; its ``eventType``, ``module`` and
                ``data`` attributes are read.
        """
        event_type = event.eventType
        source_module = event.module
        term = event.data

        if self.errorState:
            return

        self.debug(f"Received event, {event_type}, from {source_module}")

        if term in self.results:
            self.debug(f"Skipping {term}, already checked.")
            return

        if event_type not in self.watchedEvents():
            return

        self.results[term] = True

        # Pages are requested with a zero-based index, up to the configured maximum.
        for page in range(int(self.opts['max_pages'])):
            if self.checkForStop():
                return

            data = self.query(term, page)
            if not data:
                # A falsy response flags the module as being in an error state.
                self.errorState = True
                return

            results = data.get('results')
            if not results:
                return

            # --- e-mail addresses found anywhere in the stringified results ---
            for address in SpiderFootHelpers.extractEmailsFromText(str(results)):
                if address in self.results:
                    continue

                if not self.getTarget().matches(address.lower().split('@')[1]):
                    self.debug(f"Skipped email address: {address}")
                    continue

                self.info(f"Found e-mail address: {address}")

                local_part = address.split("@")[0]
                is_generic = local_part in self.opts['_genericusers'].split(",")
                address_evt = SpiderFootEvent(
                    "EMAILADDR_GENERIC" if is_generic else "EMAILADDR",
                    address, self.__name__, event)
                self.notifyListeners(address_evt)
                self.results[address] = True

            # --- URLs found in each result's 'lines' entries ---
            found_links = set()
            for entry in results:
                entry_lines = entry.get('lines')
                if not entry_lines:
                    continue
                # 'lines' is indexed by the keys it yields when iterated.
                for key in entry_lines:
                    found_links.update(
                        SpiderFootHelpers.extractUrlsFromText(entry_lines[key]))

            for link in found_links:
                if link in self.results:
                    continue

                host = self.sf.urlFQDN(link)
                related = self.getTarget().matches(
                    host, includeChildren=True, includeParents=True)
                if not related:
                    self.debug(f"Skipped unrelated URL: {link}")
                    continue

                self.debug(f"Found a URL: {link}")
                link_evt = SpiderFootEvent('LINKED_URL_INTERNAL', link,
                                           self.__name__, event)
                self.notifyListeners(link_evt)
                self.results[link] = True

                if host in self.results:
                    continue

                # Resolution is only attempted when the dns_resolve option is on.
                unresolved = (self.opts['dns_resolve']
                              and not self.sf.resolveHost(host)
                              and not self.sf.resolveHost6(host))
                if unresolved:
                    self.debug(f"Host {host} could not be resolved")
                    host_evt_type = "INTERNET_NAME_UNRESOLVED"
                else:
                    host_evt_type = "INTERNET_NAME"

                host_evt = SpiderFootEvent(host_evt_type, host, self.__name__,
                                           event)
                self.notifyListeners(host_evt)
                self.results[host] = True

            # --- repositories whose result text mentions the event data ---
            for entry in results:
                if term not in str(entry):
                    continue

                repo = entry.get('repo')
                if not repo or repo in self.results:
                    continue

                url = entry.get('url')
                if not url:
                    continue

                repo_evt = SpiderFootEvent('PUBLIC_CODE_REPO',
                                           f"{repo}\n<SFURL>{url}</SFURL>",
                                           self.__name__, event)
                self.notifyListeners(repo_evt)

                raw_evt = SpiderFootEvent('RAW_RIR_DATA', json.dumps(entry),
                                          self.__name__, event)
                self.notifyListeners(raw_evt)

                self.results[repo] = True

            # Stop paging once the response no longer advertises a next page.
            if not data.get('nextpage'):
                break
예제 #3
0
    def handleEvent(self, event):
        """Search for the event's data and report related e-mail addresses, URLs and hosts.

        Events that were already processed, or that originate from
        ``sfp_flickr`` itself, are ignored. Otherwise an API key is
        obtained via ``self.retrieveApiKey`` and ``self.query`` is called
        page by page. The page limit starts at the ``maxpages`` option and
        is lowered to the ``pages`` / ``max_allowed_pages`` values reported
        in the response.

        Emitted events:

        * ``EMAILADDR`` / ``EMAILADDR_GENERIC`` for addresses whose domain
          matches the target (children and parents included);
        * ``LINKED_URL_INTERNAL`` for URLs whose host matches the target;
        * once paging is finished, ``INTERNET_NAME`` (plus ``DOMAIN_NAME``
          when ``self.sf.isDomain`` agrees) or ``INTERNET_NAME_UNRESOLVED``
          for every distinct host collected from those URLs.

        Args:
            event: triggering event; its ``eventType``, ``module`` and
                ``data`` attributes are read.
        """
        event_type = event.eventType
        source_module = event.module
        term = event.data

        if term in self.results:
            self.debug(f"Skipping {term}, already checked")
            return

        self.results[term] = True

        self.debug(f"Received event, {event_type}, from {source_module}")

        if source_module == 'sfp_flickr':
            self.debug(f"Ignoring {term}, from self.")
            return

        # An API key is required before any query can be made.
        api_key = self.retrieveApiKey()
        if not api_key:
            self.error("Failed to obtain API key")
            return

        self.debug(f"Retrieved API key: {api_key}")

        # Hosts of matching URLs; turned into events after paging is done.
        matched_hosts = set()
        page_limit = self.opts['maxpages']
        page_size = self.opts['per_page']
        page = 1
        while page <= page_limit:
            if self.checkForStop() or self.errorState:
                return

            data = self.query(term, api_key, page=page, per_page=page_size)
            if data is None:
                return

            # Anything other than an "ok" status ends processing.
            if data.get('stat') != "ok":
                self.debug("Error retrieving search results.")
                return

            photos = data.get('photos')
            if not photos:
                self.debug("No search results.")
                return

            # Never request more pages than the response says exist or are allowed.
            page_limit = min(page_limit, int(photos.get('pages', 0)))
            if 'max_allowed_pages' in photos:
                page_limit = min(page_limit,
                                 int(photos.get('max_allowed_pages', 0)))

            self.info(f"Parsing page {page} of {page_limit}")

            for photo in photos.get('photo', list()):
                photo_text = str(photo)

                for address in SpiderFootHelpers.extractEmailsFromText(photo_text):
                    if address in self.results:
                        continue

                    mail_domain = address.lower().split('@')[1]
                    if not self.getTarget().matches(
                            mail_domain, includeChildren=True, includeParents=True):
                        self.debug(f"Skipped unrelated address: {address}")
                        continue

                    self.info(f"Found e-mail address: {address}")

                    local_part = address.split("@")[0]
                    is_generic = local_part in self.opts['_genericusers'].split(",")
                    address_evt = SpiderFootEvent(
                        "EMAILADDR_GENERIC" if is_generic else "EMAILADDR",
                        address, self.__name__, event)
                    self.notifyListeners(address_evt)
                    self.results[address] = True

                for link in SpiderFootHelpers.extractUrlsFromText(photo_text):
                    if link in self.results:
                        continue

                    host = self.sf.urlFQDN(link)
                    if not self.getTarget().matches(
                            host, includeChildren=True, includeParents=True):
                        self.debug(f"Skipped unrelated URL: {link}")
                        continue

                    matched_hosts.add(host)

                    self.debug(f"Found a URL: {link}")
                    link_evt = SpiderFootEvent('LINKED_URL_INTERNAL', link,
                                               self.__name__, event)
                    self.notifyListeners(link_evt)
                    self.results[link] = True

            page += 1

        for host in matched_hosts:
            if self.checkForStop() or self.errorState:
                return

            # Resolution is only attempted when the dns_resolve option is on.
            unresolved = (self.opts['dns_resolve']
                          and not self.sf.resolveHost(host)
                          and not self.sf.resolveHost6(host))
            if unresolved:
                self.debug(f"Host {host} could not be resolved")
                self.notifyListeners(SpiderFootEvent(
                    "INTERNET_NAME_UNRESOLVED", host, self.__name__, event))
            else:
                self.notifyListeners(SpiderFootEvent(
                    "INTERNET_NAME", host, self.__name__, event))
                if self.sf.isDomain(host, self.opts["_internettlds"]):
                    self.notifyListeners(SpiderFootEvent(
                        "DOMAIN_NAME", host, self.__name__, event))