def _on_link(self, page, domain):
    """Follow keyword-matching links on *page* and collect emails found there.

    Gathers all ``<a>`` elements, keeps the ones whose visible text or URL
    contains a keyword, repairs invalid URLs against *domain*, then visits
    each surviving link and scrapes it with ``self._on_page``.

    :param page: parsed page (BeautifulSoup-style, must support ``findAll``)
    :param domain: base domain used to normalize relative/invalid URLs
    :return: flat list of emails found across all visited links
             (empty list when nothing was found)
    """
    self.logger.info("Search email address on link to another page")
    _email_founds = []
    # Find all possible link elements on the page.
    links = page.findAll('a')
    # Candidate links whose HTML text contains a keyword.
    keyword_html_link = self._find_keyword_in_html_text(links)
    # Candidate links whose URL contains a keyword.
    keyword_url_link = self._find_keyword_in_url(links, domain)
    # Merge both result sets, removing duplicate URLs.
    candidate_links = Utility.uniquify(keyword_html_link + keyword_url_link)
    # Collect URLs that fail validation and try to repair them against
    # the domain (e.g. relative paths).
    invalid_url = [
        uri for uri in candidate_links if not cfg.url_regex.match(uri)
    ]
    # Comprehension instead of map(): on Python 3, map() returns an
    # iterator that cannot be concatenated with a list below.
    try_fix_invalid_url = [
        Utility.normalize_invalid_url(_uri, domain) for _uri in invalid_url
    ]
    # Keep only URLs that validate after the repair attempt.
    candidate_links = candidate_links + try_fix_invalid_url
    candidate_links = Utility.uniquify(
        [_uri for _uri in candidate_links if cfg.url_regex.match(_uri)])
    for link in candidate_links:
        self.logger.info("Go to next link: " + link)
        try:
            self._go_to_page(link)
        except Exception as err:
            # Log and skip links that fail to load; one bad link must not
            # abort the whole crawl. (Replaces a bare print of the error.)
            self.logger.warning("Failed to open link %s: %s", link, err)
            continue
        soup = BS(self.driver.page_source, "lxml")
        # _on_page returns the emails found on a single page.
        email = self._on_page(soup, domain)
        _email_founds.append(email)
    # Each entry is a per-page list; flatten into a single list.
    return _email_founds if not _email_founds else Utility.flatten_list(
        _email_founds)
def _filter_email_candidates(candidates):
    """Normalize and filter a raw list of email-address candidates.

    Lowercases and strips each candidate, removes duplicates, then drops
    candidates that match the configured blacklist, are too short, or
    contain internal whitespace.

    :param candidates: iterable of raw candidate strings (may be None/empty)
    :return: list of cleaned, de-duplicated, filtered candidates
    """
    # Normalize: lowercase + strip surrounding whitespace, then de-dupe.
    # A comprehension (not map()) keeps this valid on Python 3, where
    # map() returns an iterator.
    normalized = Utility.uniquify(
        [str(email).strip().lower() for email in (candidates or [])])
    # Hoist the blacklist pattern out of the loop — the original rebuilt
    # it once per candidate inside a filter() lambda.
    blacklist_regex = cfg.get_blacklist_regex()
    return [
        candidate for candidate in normalized
        # Drop blacklisted candidates.
        # NOTE(review): re.match anchors at the start of the string; if the
        # blacklist is meant to match *anywhere* (as the original "contain
        # blacklist word" comment suggests), re.search may be intended —
        # confirm against cfg.get_blacklist_regex().
        if not re.match(blacklist_regex, candidate)
        # Drop implausibly short candidates.
        and len(candidate) > 5
        # Drop candidates with any whitespace; the original checked only
        # '\n', ' ' and '\t' and missed '\r' and other whitespace chars.
        and not any(ch.isspace() for ch in candidate)
    ]