def test_extractEmailsFromText_should_return_list_of_emails_from_string(self):
    """extractEmailsFromText() should return every address embedded in an HTML string."""
    sample_html = "<html><body><p>From:[email protected]</p><p>Subject:Hello [email protected], here's some text</p></body></html>"
    extracted = SpiderFootHelpers.extractEmailsFromText(sample_html)
    self.assertIsInstance(extracted, list)
    self.assertIn('*****@*****.**', extracted)
    self.assertIn('*****@*****.**', extracted)
def handleEvent(self, event):
    """Look up e-mail addresses on the event's domain via email-format.com
    and emit EMAILADDR / EMAILADDR_GENERIC events for related addresses.
    """
    domain = event.data

    # Process each domain only once.
    if domain in self.results:
        return
    self.results[domain] = True

    self.debug(f"Received event, {event.eventType}, from {event.module}")

    # Get e-mail addresses on this domain
    res = self.sf.fetchUrl(
        f"https://www.email-format.com/d/{domain}/",
        timeout=self.opts['_fetchtimeout'],
        useragent=self.opts['_useragent'],
    )
    if res['content'] is None:
        return

    html = BeautifulSoup(res["content"], features="lxml")
    if not html:
        return

    # Prefer the results table when present; otherwise fall back to the raw page.
    tbody = html.find('tbody')
    data = str(tbody.contents) if tbody else res["content"]

    for addr in SpiderFootHelpers.extractEmailsFromText(data):
        # Skip addresses whose domain is unrelated to the target.
        if not self.getTarget().matches(addr.lower().split('@')[1]):
            self.debug(f"Skipped address: {addr}")
            continue

        # Skip masked emails
        if re.match(r"^[0-9a-f]{8}\.[0-9]{7}@", addr):
            self.debug(f"Skipped address: {addr}")
            continue

        self.info(f"Found e-mail address: {addr}")

        if addr.split("@")[0] in self.opts['_genericusers'].split(","):
            evttype = "EMAILADDR_GENERIC"
        else:
            evttype = "EMAILADDR"
        self.notifyListeners(SpiderFootEvent(evttype, addr, self.__name__, event))
def handleEvent(self, event):
    """Extract e-mail addresses from incoming event data and emit an
    EMAILADDR / EMAILADDR_GENERIC / AFFILIATE_EMAILADDR event for each.
    """
    self.debug(f"Received event, {event.eventType}, from {event.module}")

    for candidate in set(SpiderFootHelpers.extractEmailsFromText(event.data)):
        addr = candidate.lower()
        # Get the domain and strip potential ending .
        domain = addr.split('@')[1].strip('.')

        if not self.sf.validHost(domain, self.opts['_internettlds']):
            self.debug(f"Skipping {addr} as not a valid e-mail.")
            continue

        target = self.getTarget()
        on_target = (
            target.matches(domain, includeChildren=True, includeParents=True)
            or target.matches(addr)
        )
        if not on_target:
            self.debug("External domain, so possible affiliate e-mail")

        # Affiliate status wins over the generic-user classification.
        if not on_target or event.eventType.startswith("AFFILIATE_"):
            evttype = "AFFILIATE_EMAILADDR"
        elif addr.split("@")[0] in self.opts['_genericusers'].split(","):
            evttype = "EMAILADDR_GENERIC"
        else:
            evttype = "EMAILADDR"

        self.info(f"Found e-mail address: {addr}")

        evt = SpiderFootEvent(evttype, addr.strip('.'), self.__name__, event)
        evt.moduleDataSource = event.moduleDataSource if event.moduleDataSource else "Unknown"
        self.notifyListeners(evt)
def handleEvent(self, event):
    """Search a code-search API for the event data and emit e-mail, URL,
    hostname and public-repository events from the paginated results.

    Args:
        event (SpiderFootEvent): the event being processed
    """
    eventName = event.eventType
    srcModuleName = event.module
    eventData = event.data

    if self.errorState:
        return

    self.debug(f"Received event, {eventName}, from {srcModuleName}")

    # De-duplicate: each query value is only ever searched once.
    if eventData in self.results:
        self.debug(f"Skipping {eventData}, already checked.")
        return

    if eventName not in self.watchedEvents():
        return

    self.results[eventData] = True

    max_pages = int(self.opts['max_pages'])
    page = 0
    while page < max_pages:
        if self.checkForStop():
            return

        data = self.query(eventData, page)
        page += 1
        # A failed query aborts the module for the rest of the scan.
        if not data:
            self.errorState = True
            return

        results = data.get('results')
        if not results:
            return

        # E-mail addresses: scan the stringified result set in one pass.
        emails = SpiderFootHelpers.extractEmailsFromText(str(results))
        for email in emails:
            if email in self.results:
                continue

            # Only report addresses on the target's domain.
            mail_domain = email.lower().split('@')[1]
            if not self.getTarget().matches(mail_domain):
                self.debug(f"Skipped email address: {email}")
                continue

            self.info(f"Found e-mail address: {email}")

            evt_type = "EMAILADDR"
            if email.split("@")[0] in self.opts['_genericusers'].split(","):
                evt_type = "EMAILADDR_GENERIC"

            evt = SpiderFootEvent(evt_type, email, self.__name__, event)
            self.notifyListeners(evt)
            self.results[email] = True

        # URLs: collect from each result's per-line snippets
        # (assumes 'lines' maps line numbers to text — TODO confirm against query()).
        links = set()
        for result in results:
            lines = result.get('lines')
            if lines:
                for line in lines:
                    links.update(SpiderFootHelpers.extractUrlsFromText(lines[line]))

        for link in links:
            if link in self.results:
                continue

            host = self.sf.urlFQDN(link)
            if not self.getTarget().matches(host, includeChildren=True, includeParents=True):
                self.debug(f"Skipped unrelated URL: {link}")
                continue

            self.debug(f"Found a URL: {link}")
            evt = SpiderFootEvent('LINKED_URL_INTERNAL', link, self.__name__, event)
            self.notifyListeners(evt)
            self.results[link] = True

            # Report each host at most once; optionally require DNS resolution.
            if host in self.results:
                continue
            if self.opts['dns_resolve'] and not self.sf.resolveHost(host) and not self.sf.resolveHost6(host):
                self.debug(f"Host {host} could not be resolved")
                evt = SpiderFootEvent("INTERNET_NAME_UNRESOLVED", host, self.__name__, event)
                self.notifyListeners(evt)
            else:
                evt = SpiderFootEvent("INTERNET_NAME", host, self.__name__, event)
                self.notifyListeners(evt)
            self.results[host] = True

        # Public repositories: only report repos whose result mentions the query.
        for result in results:
            if eventData not in str(result):
                continue

            repo = result.get('repo')
            if not repo:
                continue
            if repo in self.results:
                continue

            url = result.get('url')
            if not url:
                continue

            repo_data = f"{repo}\n<SFURL>{url}</SFURL>"
            evt = SpiderFootEvent('PUBLIC_CODE_REPO', repo_data, self.__name__, event)
            self.notifyListeners(evt)

            evt = SpiderFootEvent('RAW_RIR_DATA', json.dumps(result), self.__name__, event)
            self.notifyListeners(evt)
            self.results[repo] = True

        # Stop paginating when the API reports no further page.
        if not data.get('nextpage'):
            break
def handleEvent(self, event):
    """Search the Flickr API for the event data, emitting e-mail address
    and URL events from photo metadata, then host/domain events for any
    hostnames discovered in those URLs.

    Args:
        event (SpiderFootEvent): the event being processed
    """
    eventName = event.eventType
    srcModuleName = event.module
    eventData = event.data

    # De-duplicate: each query value is only ever searched once.
    if eventData in self.results:
        self.debug(f"Skipping {eventData}, already checked")
        return
    self.results[eventData] = True

    self.debug(f"Received event, {eventName}, from {srcModuleName}")

    # Avoid feedback loops on our own events.
    if srcModuleName == 'sfp_flickr':
        self.debug(f"Ignoring {eventData}, from self.")
        return

    # Retrieve API key
    api_key = self.retrieveApiKey()
    if not api_key:
        self.error("Failed to obtain API key")
        return

    self.debug(f"Retrieved API key: {api_key}")

    # Query API for event data
    hosts = list()
    page = 1
    pages = self.opts['maxpages']
    per_page = self.opts['per_page']
    while page <= pages:
        if self.checkForStop():
            return
        if self.errorState:
            return

        data = self.query(eventData, api_key, page=page, per_page=per_page)
        if data is None:
            return

        # Check the response is ok
        if data.get('stat') != "ok":
            self.debug("Error retrieving search results.")
            return

        photos = data.get('photos')
        if not photos:
            self.debug("No search results.")
            return

        # Calculate number of pages to retrieve: clamp the configured page
        # budget to what the API says exists (and to any API-imposed cap).
        result_pages = int(photos.get('pages', 0))
        if result_pages < pages:
            pages = result_pages
        if 'max_allowed_pages' in photos:
            allowed_pages = int(photos.get('max_allowed_pages', 0))
            if pages > allowed_pages:
                pages = allowed_pages

        self.info(f"Parsing page {page} of {pages}")

        # Extract data from each photo record's stringified metadata.
        for photo in photos.get('photo', list()):
            emails = SpiderFootHelpers.extractEmailsFromText(str(photo))
            for email in emails:
                if email in self.results:
                    continue

                # Only report addresses related to the target's domain.
                mail_domain = email.lower().split('@')[1]
                if not self.getTarget().matches(mail_domain, includeChildren=True, includeParents=True):
                    self.debug(f"Skipped unrelated address: {email}")
                    continue

                self.info("Found e-mail address: " + email)
                if email.split("@")[0] in self.opts['_genericusers'].split(","):
                    evttype = "EMAILADDR_GENERIC"
                else:
                    evttype = "EMAILADDR"

                evt = SpiderFootEvent(evttype, email, self.__name__, event)
                self.notifyListeners(evt)
                self.results[email] = True

            links = SpiderFootHelpers.extractUrlsFromText(str(photo))
            for link in links:
                if link in self.results:
                    continue

                host = self.sf.urlFQDN(link)
                if not self.getTarget().matches(host, includeChildren=True, includeParents=True):
                    self.debug(f"Skipped unrelated URL: {link}")
                    continue

                # Remember the host for DNS checks after pagination finishes.
                hosts.append(host)

                self.debug(f"Found a URL: {link}")
                evt = SpiderFootEvent('LINKED_URL_INTERNAL', link, self.__name__, event)
                self.notifyListeners(evt)
                self.results[link] = True

        page += 1

    # Report each discovered host once, optionally requiring DNS resolution.
    for host in set(hosts):
        if self.checkForStop():
            return
        if self.errorState:
            return

        if self.opts['dns_resolve'] and not self.sf.resolveHost(host) and not self.sf.resolveHost6(host):
            self.debug(f"Host {host} could not be resolved")
            evt = SpiderFootEvent("INTERNET_NAME_UNRESOLVED", host, self.__name__, event)
            self.notifyListeners(evt)
            continue

        evt = SpiderFootEvent("INTERNET_NAME", host, self.__name__, event)
        self.notifyListeners(evt)

        if self.sf.isDomain(host, self.opts["_internettlds"]):
            evt = SpiderFootEvent("DOMAIN_NAME", host, self.__name__, event)
            self.notifyListeners(evt)
def test_extractEmailsFromText_invalid_data_should_return_list(self):
    """extractEmailsFromText() must return a list even for non-string input."""
    for bad_input in (None, "", list(), dict()):
        with self.subTest(invalid_type=bad_input):
            result = SpiderFootHelpers.extractEmailsFromText(bad_input)
            self.assertIsInstance(result, list)
def handleEvent(self, event):
    """Search StackOverflow for the event data and emit events for any
    e-mail addresses, usernames and IP addresses found in the results.

    Args:
        event (SpiderFootEvent): the event being processed
    """
    eventData = event.data

    if self.errorState:
        return

    if eventData in self.results:
        self.debug(f"Skipping {eventData}, already checked.")
        return
    self.results[eventData] = True

    query_results = self.query(eventData, "excerpts")
    # Guard against a failed query: a None result would otherwise raise
    # AttributeError on the .get() call below.
    if not query_results:
        return

    items = query_results.get('items')
    if not items:
        return

    allEmails = []
    allUsernames = []
    allIP4s = []
    allIP6s = []

    # Iterate through all results from query, creating raw_rir_data events and extracting emails
    for item in items:
        if self.checkForStop():
            return

        body = item["body"]
        excerpt = item["excerpt"]
        question = item["question_id"]
        text = body + excerpt

        # create raw_rir_data event
        e = SpiderFootEvent(
            'RAW_RIR_DATA',
            f"<SFURL>https://stackoverflow.com/questions/{question}</SFURL>\n{item}",
            self.__name__,
            event
        )
        self.notifyListeners(e)

        emails = SpiderFootHelpers.extractEmailsFromText(text)
        if emails:
            for email in emails:
                allEmails.append(str(email))

        questionId = item["question_id"]
        username = self.extractUsername(questionId)
        if username:
            allUsernames.append(username)

        # Flatten the extracted addresses: appending the whole collection
        # (as the previous code did) made set(allIP4s) below raise
        # "TypeError: unhashable type: 'list'". A bare string result is
        # wrapped so extend() doesn't iterate its characters.
        ip4s = self.extractIP4s(text)
        if ip4s:
            allIP4s.extend(ip4s if isinstance(ip4s, (list, set, tuple)) else [ip4s])

        ip6s = self.extractIP6s(text)
        if ip6s:
            allIP6s.extend(ip6s if isinstance(ip6s, (list, set, tuple)) else [ip6s])

    # create events for emails, username and IPs
    for email in set(allEmails):
        email = str(email).lower()
        if self.getTarget().matches(email):
            e = SpiderFootEvent('EMAILADDR', email, self.__name__, event)
        else:
            e = SpiderFootEvent('AFFILIATE_EMAILADDR', email, self.__name__, event)
        self.notifyListeners(e)

    for username in set(allUsernames):
        # A value containing a space is more likely a display name than a handle.
        if " " in username:
            e = SpiderFootEvent('RAW_RIR_DATA', 'Possible full name: ' + username, self.__name__, event)
        else:
            e = SpiderFootEvent('USERNAME', username, self.__name__, event)
        self.notifyListeners(e)

    for ip in set(allIP4s):
        ip = str(ip)
        e = SpiderFootEvent('AFFILIATE_IP_ADDRESS', ip, self.__name__, event)
        self.notifyListeners(e)

    for ip in set(allIP6s):
        ip = str(ip)
        e = SpiderFootEvent('AFFILIATE_IPV6_ADDRESS', ip, self.__name__, event)
        self.notifyListeners(e)
def handleEvent(self, event):
    """Harvest e-mail addresses for a domain from skymem.info.

    Performs an initial search, then walks up to 20 pages of per-domain
    results, emitting EMAILADDR / EMAILADDR_GENERIC events for addresses
    on the target's domain. The per-page e-mail handling was previously
    duplicated verbatim; it is factored into one nested helper.

    Args:
        event (SpiderFootEvent): the event being processed
    """
    eventName = event.eventType
    srcModuleName = event.module
    eventData = event.data

    if eventData in self.results:
        return
    self.results[eventData] = True

    self.debug(f"Received event, {eventName}, from {srcModuleName}")

    def _process_emails(content):
        # Shared handling for the search page and each paginated domain
        # page: extract addresses, drop unrelated domains, and emit one
        # event per previously-unseen address.
        for email in SpiderFootHelpers.extractEmailsFromText(content):
            # Skip unrelated emails
            mailDom = email.lower().split('@')[1]
            if not self.getTarget().matches(mailDom):
                self.debug("Skipped address: " + email)
                continue

            self.info("Found e-mail address: " + email)
            if email not in self.results:
                if email.split("@")[0] in self.opts['_genericusers'].split(","):
                    evttype = "EMAILADDR_GENERIC"
                else:
                    evttype = "EMAILADDR"
                evt = SpiderFootEvent(evttype, email, self.__name__, event)
                self.notifyListeners(evt)
                self.results[email] = True

    # Get e-mail addresses on this domain
    res = self.sf.fetchUrl("http://www.skymem.info/srch?q=" + eventData,
                           timeout=self.opts['_fetchtimeout'],
                           useragent=self.opts['_useragent'])
    if res['content'] is None:
        return

    _process_emails(res['content'])

    # Loop through first 20 pages of results
    domain_ids = re.findall(r'<a href="/domain/([a-z0-9]+)\?p=', str(res['content']))
    if not domain_ids:
        return

    domain_id = domain_ids[0]
    for page in range(1, 21):
        res = self.sf.fetchUrl(
            f"http://www.skymem.info/domain/{domain_id}?p={page}",
            timeout=self.opts['_fetchtimeout'],
            useragent=self.opts['_useragent'])
        if res['content'] is None:
            break

        _process_emails(res['content'])

        # Check if we're on the last page of results
        max_page = 0
        pages = re.findall(r'/domain/' + domain_id + r'\?p=(\d+)', str(res['content']))
        for p in pages:
            if int(p) >= max_page:
                max_page = int(p)
        if page >= max_page:
            break
def handleEvent(self, event):
    """Query PGP key servers: find addresses for a domain, or fetch public
    keys for an e-mail address, emitting the corresponding events.
    """
    eventName = event.eventType
    eventData = event.data

    if self.errorState:
        return
    if eventData in self.results:
        return
    self.results[eventData] = True

    self.debug(f"Received event, {eventName}, from {event.module}")

    # Both key-server search URLs unset is a configuration error.
    if not self.opts['keyserver_search1'] and not self.opts['keyserver_search2']:
        self.error(
            f"You enabled {self.__class__.__name__} but did not set key server URLs"
        )
        self.errorState = True
        return

    # Get e-mail addresses on this domain
    if eventName in ["DOMAIN_NAME", "INTERNET_NAME"]:
        # Fall back to the second key server only if the first yields nothing.
        res = (self.queryDomain(self.opts['keyserver_search1'], eventData)
               or self.queryDomain(self.opts['keyserver_search2'], eventData))
        if not res:
            return

        emails = SpiderFootHelpers.extractEmailsFromText(res['content'])
        self.info(f"Found {len(emails)} email addresses")

        for addr in emails:
            if addr.split("@")[0] in self.opts['_genericusers'].split(","):
                evttype = "EMAILADDR_GENERIC"
            else:
                evttype = "EMAILADDR"
            # Off-target domains are reported as affiliates instead.
            if not self.getTarget().matches(addr.lower().split('@')[1]):
                evttype = "AFFILIATE_EMAILADDR"

            self.debug(f"Found e-mail address: {addr}")
            self.notifyListeners(SpiderFootEvent(evttype, addr, self.__name__, event))

    if eventName == "EMAILADDR" and self.opts['retrieve_keys']:
        res = (self.queryEmail(self.opts['keyserver_fetch1'], eventData)
               or self.queryEmail(self.opts['keyserver_fetch2'], eventData))
        if not res:
            return

        keys = SpiderFootHelpers.extractPgpKeysFromText(res['content'])
        self.info(f"Found {len(keys)} public PGP keys")

        for key in keys:
            self.debug(f"Found public key: {key}")
            self.notifyListeners(SpiderFootEvent("PGP_KEY", key, self.__name__, event))
def handleEvent(self, event):
    """Search grep.app for the event data and emit link, e-mail and host
    events from the result snippets, then resolve discovered hostnames.

    Args:
        event (SpiderFootEvent): the event being processed
    """
    eventName = event.eventType
    srcModuleName = event.module
    eventData = event.data

    if eventData in self.results:
        return
    self.results[eventData] = True

    self.debug(f"Received event, {eventName}, from {srcModuleName}")

    # Avoid feedback loops on our own events.
    if srcModuleName == 'sfp_grep_app':
        self.debug("Ignoring " + eventData + ", from self.")
        return

    hosts = list()

    page = 1
    per_page = 10
    pages = self.opts['max_pages']
    while page <= pages:
        if self.checkForStop():
            return
        if self.errorState:
            return

        res = self.query(eventData, page)
        if res is None:
            return

        facets = res.get('facets')
        if facets is None:
            return

        count = facets.get('count')
        if count is None:
            return

        # Clamp the configured page budget to what actually exists.
        # (The former "if last_page is None: pages = 0" branch was dead
        # code: math.ceil() never returns None.)
        last_page = math.ceil(count / per_page)
        if last_page < pages:
            pages = last_page

        self.info("Parsing page " + str(page) + " of " + str(pages))
        page += 1

        hits = res.get('hits')
        if hits is None:
            return

        data = hits.get('hits')
        if data is None:
            return

        for result in data:
            if result is None:
                continue

            evt = SpiderFootEvent("RAW_RIR_DATA", str(result), self.__name__, event)
            self.notifyListeners(evt)

            content = result.get('content')
            if content is None:
                continue

            snippet = content.get('snippet')
            if snippet is None:
                continue

            # grep.app highlights matches with <mark> tags; strip them
            # before extracting URLs and e-mail addresses.
            plain = snippet.replace('<mark>', '').replace('</mark>', '')

            links = self.sf.extractUrls(plain)
            if links:
                for link in links:
                    if link in self.results:
                        continue

                    # Single relatedness check per link. (A second,
                    # identical check used to follow hosts.append() and
                    # was unreachable; its debug message is kept here.)
                    host = self.sf.urlFQDN(link)
                    if not self.getTarget().matches(host, includeChildren=True, includeParents=True):
                        self.debug("Skipped unrelated link: " + link)
                        continue
                    hosts.append(host)

                    self.debug('Found a link: ' + link)
                    evt = SpiderFootEvent('LINKED_URL_INTERNAL', link, self.__name__, event)
                    self.notifyListeners(evt)
                    self.results[link] = True

            emails = SpiderFootHelpers.extractEmailsFromText(plain)
            if emails:
                for email in emails:
                    if email in self.results:
                        continue

                    mail_domain = email.lower().split('@')[1]
                    if not self.getTarget().matches(mail_domain, includeChildren=True, includeParents=True):
                        self.debug("Skipped unrelated email address: " + email)
                        continue

                    self.info("Found e-mail address: " + email)
                    if email.split("@")[0] in self.opts['_genericusers'].split(","):
                        evttype = "EMAILADDR_GENERIC"
                    else:
                        evttype = "EMAILADDR"

                    evt = SpiderFootEvent(evttype, email, self.__name__, event)
                    self.notifyListeners(evt)
                    self.results[email] = True

    # Report each discovered host once, optionally requiring DNS resolution.
    for host in set(hosts):
        if self.checkForStop():
            return
        if self.errorState:
            return

        if self.opts['dns_resolve'] and not self.sf.resolveHost(host) and not self.sf.resolveHost6(host):
            self.debug(f"Host {host} could not be resolved")
            evt = SpiderFootEvent("INTERNET_NAME_UNRESOLVED", host, self.__name__, event)
            self.notifyListeners(evt)
            continue

        evt = SpiderFootEvent("INTERNET_NAME", host, self.__name__, event)
        self.notifyListeners(evt)

        if self.sf.isDomain(host, self.opts["_internettlds"]):
            evt = SpiderFootEvent("DOMAIN_NAME", host, self.__name__, event)
            self.notifyListeners(evt)