import logging
from collections import deque

from bs4 import BeautifulSoup


def crawl(links):
    blacklist = Blacklist.factory("url", list(links))
    links_to_process = deque(blacklist.remove_blacklisted())
    email_blacklist = Blacklist(scrub_words=[
        'example', 'email', 'support', 'domain', 'orders', 'info',
        'github', 'registration', 'mozilla', 'donate', 'feedback',
        'newsletter', 'name'
    ])
    email_writer = EmailWriter(email_blacklist)
    processed_urls = set()
    emails = set()
    logger = logging.getLogger()

    while links_to_process:
        url = links_to_process.pop()
        # add to processed immediately, so a failure below cannot re-enqueue it
        processed_urls.add(url)
        url_extras = get_url_extras(url)
        response = get_url_response(url)
        if not response.ok:
            continue
        try:
            new_emails = get_email_set_from_response(response)
        except TimeoutError:
            continue
        # track the emails so crawl() actually returns everything it found
        emails.update(new_emails)
        email_writer.add_emails(new_emails)

        # create a BeautifulSoup tree for the HTML document
        soup = BeautifulSoup(response.text, "html.parser")

        # find and process all the anchors in the document
        for anchor in soup.find_all("a"):
            # extract the link URL from the anchor, if it has one
            link = anchor.attrs.get("href", '')
            # resolve relative links against the host base or the current path
            if link.startswith('/'):
                link = url_extras[1] + link
            elif not link.startswith('http'):
                link = url_extras[2] + link
            # enqueue the new URL only if it is neither queued nor processed yet
            if link not in links_to_process and link not in processed_urls:
                if not blacklist.is_blacklisted(link):
                    links_to_process.appendleft(link)

        # scrub the link set to ensure the crawler doesn't waste time on one site
        scrubbed = scrub(list(links_to_process), 4)
        logger.debug(scrubbed)
        links_to_process = deque(scrubbed)

    return emails
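# The relative-link resolution above assumes get_url_extras(url) returns a
# tuple whose element [1] is the scheme-plus-host base (used for root-relative
# links) and element [2] is the current page's directory URL (used for
# path-relative links). The real helper is not shown in this section; this is
# only a minimal sketch under that assumption, built on urllib.parse.
from urllib.parse import urlparse


def get_url_extras(url):
    """Hypothetical helper: (full url, scheme://host, current directory URL)."""
    parts = urlparse(url)
    base = f"{parts.scheme}://{parts.netloc}"
    # directory of the current page: everything up to the last '/' in the path
    directory = parts.path.rsplit('/', 1)[0] + '/'
    return url, base, base + directory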
def test_email_blacklist(self):
    blacklist = Blacklist.factory("emails")
    # an address containing a blacklisted word should be rejected
    self.assertTrue(blacklist.is_blacklisted("*****@*****.**"))
    # a clean address should pass through
    self.assertFalse(blacklist.is_blacklisted("*****@*****.**"))
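# The test above exercises Blacklist.factory and is_blacklisted, which are
# defined elsewhere in the project. A minimal sketch consistent with how
# crawl() uses the class: factory() loads a default scrub-word list for the
# given kind, is_blacklisted() does a substring check, and
# remove_blacklisted() filters the items handed to the factory. The class
# body and the default word lists here are illustrative assumptions, not the
# project's actual implementation.
class Blacklist:
    DEFAULT_SCRUB_WORDS = {
        "url": ["facebook", "twitter", "linkedin"],    # assumed defaults
        "emails": ["example", "support", "noreply"],   # assumed defaults
    }

    def __init__(self, scrub_words=None, items=None):
        self.scrub_words = scrub_words or []
        self.items = items or []

    @classmethod
    def factory(cls, kind, items=None):
        return cls(scrub_words=cls.DEFAULT_SCRUB_WORDS.get(kind, []),
                   items=items)

    def is_blacklisted(self, value):
        # a value is blacklisted if any scrub word appears anywhere in it
        return any(word in value.lower() for word in self.scrub_words)

    def remove_blacklisted(self):
        return [item for item in self.items if not self.is_blacklisted(item)]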