def test_extractUrlsFromText_should_extract_urls_from_string(self):
    """Check extractUrlsFromText on unusable inputs and on text with URLs.

    None, an empty string, an empty list and an empty dict must each
    produce a list; a string containing two URLs must produce a list that
    includes both of them.
    """
    # Inputs that carry no usable text still have to come back as a list.
    for bad_input in (None, "", [], {}):
        with self.subTest(invalid_type=bad_input):
            extracted = SpiderFootHelpers.extractUrlsFromText(bad_input)
            self.assertIsInstance(extracted, list)

    # Both URLs are glued to surrounding "abc" text and split by "\r".
    found = SpiderFootHelpers.extractUrlsFromText(
        "abchttps://example.spiderfoot.net/path\rabchttp://example.spiderfoot.net:1337/path\rabc"
    )
    self.assertIsInstance(found, list)
    for expected_url in (
        "https://example.spiderfoot.net/path",
        "http://example.spiderfoot.net:1337/path",
    ):
        self.assertIn(expected_url, found)
def handleEvent(self, event):
    """Query for the event's data and emit events for what the results hold.

    Pages are requested through ``self.query`` until the ``max_pages``
    option is reached, a page has no ``nextpage`` value, or a page comes
    back empty. From each page the method emits e-mail addresses, URLs that
    match the target, the host names of those URLs, and code repositories
    whose result mentions the event data.

    Args:
        event: the received event; its ``eventType``, ``module`` and
            ``data`` attributes are read.
    """
    event_type = event.eventType
    source_module = event.module
    search_term = event.data

    if self.errorState:
        return

    self.debug(f"Received event, {event_type}, from {source_module}")

    if search_term in self.results:
        self.debug(f"Skipping {search_term}, already checked.")
        return

    if event_type not in self.watchedEvents():
        return

    self.results[search_term] = True

    page_limit = int(self.opts['max_pages'])
    for page_no in range(page_limit):
        if self.checkForStop():
            return

        data = self.query(search_term, page_no)

        # A falsy response is treated as a module error.
        if not data:
            self.errorState = True
            return

        hits = data.get('results')
        if not hits:
            return

        # --- e-mail addresses found anywhere in the page of results ---
        for address in SpiderFootHelpers.extractEmailsFromText(str(hits)):
            if address in self.results:
                continue

            address_domain = address.lower().split('@')[1]
            if not self.getTarget().matches(address_domain):
                self.debug(f"Skipped email address: {address}")
                continue

            self.info(f"Found e-mail address: {address}")

            generic_names = self.opts['_genericusers'].split(",")
            is_generic = address.split("@")[0] in generic_names
            address_event_type = "EMAILADDR_GENERIC" if is_generic else "EMAILADDR"

            self.notifyListeners(
                SpiderFootEvent(address_event_type, address, self.__name__, event))
            self.results[address] = True

        # --- URLs found in the 'lines' mapping of each result ---
        found_urls = set()
        for hit in hits:
            snippet = hit.get('lines')
            if not snippet:
                continue
            for key in snippet:
                found_urls.update(
                    SpiderFootHelpers.extractUrlsFromText(snippet[key]))

        for found_url in found_urls:
            if found_url in self.results:
                continue

            fqdn = self.sf.urlFQDN(found_url)

            on_target = self.getTarget().matches(
                fqdn, includeChildren=True, includeParents=True)
            if not on_target:
                self.debug(f"Skipped unrelated URL: {found_url}")
                continue

            self.debug(f"Found a URL: {found_url}")
            self.notifyListeners(
                SpiderFootEvent('LINKED_URL_INTERNAL', found_url, self.__name__, event))
            self.results[found_url] = True

            # Each host name is reported only once.
            if fqdn in self.results:
                continue

            unresolved = (
                self.opts['dns_resolve']
                and not self.sf.resolveHost(fqdn)
                and not self.sf.resolveHost6(fqdn)
            )
            if unresolved:
                self.debug(f"Host {fqdn} could not be resolved")
                host_event_type = "INTERNET_NAME_UNRESOLVED"
            else:
                host_event_type = "INTERNET_NAME"

            self.notifyListeners(
                SpiderFootEvent(host_event_type, fqdn, self.__name__, event))
            self.results[fqdn] = True

        # --- repositories whose result text mentions the event data ---
        for hit in hits:
            if search_term not in str(hit):
                continue

            repo_name = hit.get('repo')
            if not repo_name or repo_name in self.results:
                continue

            repo_url = hit.get('url')
            if not repo_url:
                continue

            repo_summary = f"{repo_name}\n<SFURL>{repo_url}</SFURL>"
            self.notifyListeners(
                SpiderFootEvent('PUBLIC_CODE_REPO', repo_summary, self.__name__, event))
            self.notifyListeners(
                SpiderFootEvent('RAW_RIR_DATA', json.dumps(hit), self.__name__, event))

            self.results[repo_name] = True

        # Stop paging once the response no longer advertises a next page.
        if not data.get('nextpage'):
            break
def handleEvent(self, event):
    """Search for the event's data page by page and emit what is found.

    For every photo entry in each result page the method emits e-mail
    addresses and URLs that match the target. After paging ends it emits
    one host-name event per distinct host of those URLs (unresolved,
    resolved, and additionally a domain-name event when the host is a
    domain).

    Paging ends early when a page cannot be retrieved, reports a status
    other than "ok", or carries no photos. Hosts gathered from earlier
    pages are still reported in that case. Only a stop request or the
    module error state aborts without reporting.

    Args:
        event: the received event; its ``eventType``, ``module`` and
            ``data`` attributes are read.
    """
    eventName = event.eventType
    srcModuleName = event.module
    eventData = event.data

    if eventData in self.results:
        self.debug(f"Skipping {eventData}, already checked")
        return

    self.results[eventData] = True

    self.debug(f"Received event, {eventName}, from {srcModuleName}")

    if srcModuleName == 'sfp_flickr':
        self.debug(f"Ignoring {eventData}, from self.")
        return

    # Retrieve API key
    api_key = self.retrieveApiKey()

    if not api_key:
        self.error("Failed to obtain API key")
        return

    self.debug(f"Retrieved API key: {api_key}")

    # Query API for event data
    hosts = set()
    page = 1
    # Coerce to int so a string-valued option cannot break the comparison.
    pages = int(self.opts['maxpages'])
    per_page = self.opts['per_page']
    while page <= pages:
        if self.checkForStop():
            return

        if self.errorState:
            return

        data = self.query(eventData, api_key, page=page, per_page=per_page)

        # The three exits below use break rather than return: URLs from
        # earlier pages have already been emitted, so their hosts must
        # still be reported after the loop.
        if data is None:
            break

        # Check the response is ok
        if data.get('stat') != "ok":
            self.debug("Error retrieving search results.")
            break

        photos = data.get('photos')

        if not photos:
            self.debug("No search results.")
            break

        # Calculate number of pages to retrieve: never more than the
        # response reports, nor more than it says is allowed.
        pages = min(pages, int(photos.get('pages', 0)))

        if 'max_allowed_pages' in photos:
            pages = min(pages, int(photos.get('max_allowed_pages', 0)))

        self.info(f"Parsing page {page} of {pages}")

        # Extract data
        for photo in photos.get('photo', []):
            photo_text = str(photo)

            for email in SpiderFootHelpers.extractEmailsFromText(photo_text):
                if email in self.results:
                    continue

                mail_domain = email.lower().split('@')[1]

                if not self.getTarget().matches(mail_domain, includeChildren=True, includeParents=True):
                    self.debug(f"Skipped unrelated address: {email}")
                    continue

                self.info("Found e-mail address: " + email)
                if email.split("@")[0] in self.opts['_genericusers'].split(","):
                    evttype = "EMAILADDR_GENERIC"
                else:
                    evttype = "EMAILADDR"

                evt = SpiderFootEvent(evttype, email, self.__name__, event)
                self.notifyListeners(evt)
                self.results[email] = True

            for link in SpiderFootHelpers.extractUrlsFromText(photo_text):
                if link in self.results:
                    continue

                host = self.sf.urlFQDN(link)

                if not self.getTarget().matches(host, includeChildren=True, includeParents=True):
                    self.debug(f"Skipped unrelated URL: {link}")
                    continue

                hosts.add(host)

                self.debug(f"Found a URL: {link}")
                evt = SpiderFootEvent('LINKED_URL_INTERNAL', link, self.__name__, event)
                self.notifyListeners(evt)
                self.results[link] = True

        page += 1

    # Report each distinct host once, after all pages have been processed.
    for host in hosts:
        if self.checkForStop():
            return

        if self.errorState:
            return

        if self.opts['dns_resolve'] and not self.sf.resolveHost(host) and not self.sf.resolveHost6(host):
            self.debug(f"Host {host} could not be resolved")
            evt = SpiderFootEvent("INTERNET_NAME_UNRESOLVED", host, self.__name__, event)
            self.notifyListeners(evt)
            continue

        evt = SpiderFootEvent("INTERNET_NAME", host, self.__name__, event)
        self.notifyListeners(evt)

        if self.sf.isDomain(host, self.opts["_internettlds"]):
            evt = SpiderFootEvent("DOMAIN_NAME", host, self.__name__, event)
            self.notifyListeners(evt)