예제 #1
0
 def _on_link(self, page, domain):
     """Follow candidate links found on ``page`` and collect e-mail results.

     Candidate URLs are gathered from the anchor elements of ``page`` by two
     keyword-based helpers, merged, repaired where they do not match
     ``cfg.url_regex``, and then visited one by one. Each visited page is
     parsed and handed to ``self._on_page``.

     :param page: parsed document exposing ``findAll`` (presumably a
         BeautifulSoup object -- confirm against the caller).
     :param domain: passed through to the URL keyword search, the URL
         normalizer and ``self._on_page``.
     :return: the (empty) result list unchanged when nothing was collected,
         otherwise ``Utility.flatten_list`` applied to the collected results.

     NOTE(review): this excerpt ends before the ``except``/``finally`` clause
     of the outer ``try`` block, so its error handling is not visible here.
     """
     self.logger.info("Search email address on link to another page")
     _email_founds = []
     # Collect every anchor (<a>) element on the page.
     links = page.findAll('a')
     # Candidate links selected by the HTML-text keyword helper
     # (presumably matches keywords in the link text -- confirm in helper).
     keyword_html_link = self._find_keyword_in_html_text(links)
     # Candidate links selected by the URL keyword helper for this domain.
     keyword_url_link = self._find_keyword_in_url(links, domain)
     # Merge both candidate lists and drop duplicate URLs.
     candidate_links = Utility.uniquify(keyword_html_link +
                                        keyword_url_link)
     # URLs that do not match cfg.url_regex are treated as invalid; try to
     # repair them with Utility.normalize_invalid_url.
     invalid_url = [
         uri for uri in candidate_links if not cfg.url_regex.match(uri)
     ]
     # NOTE: relies on Python 2 ``map`` returning a list, because the result
     # is concatenated to a list below.
     try_fix_invalid_url = map(
         lambda _uri: Utility.normalize_invalid_url(_uri, domain),
         invalid_url)
     # Add the repaired URLs, then keep only URLs that match cfg.url_regex
     # (this discards the original invalid ones and any failed repairs).
     candidate_links = candidate_links + try_fix_invalid_url
     candidate_links = Utility.uniquify(
         [_uri for _uri in candidate_links if cfg.url_regex.match(_uri)])
     try:
         for link in candidate_links:
             self.logger.info("Go to next link: " + link)
             try:
                 self._go_to_page(link)
             except Exception, err:
                 # Best effort: print the navigation error and move on to
                 # the next candidate link.
                 print str(err)
                 continue
             # Parse the page currently held by the driver and search it.
             soup = BS(self.driver.page_source, "lxml")
             email = self._on_page(soup, domain)
             _email_founds.append(email)
         # An empty result list is returned as-is; otherwise the per-page
         # results are flattened with Utility.flatten_list.
         return _email_founds if not _email_founds else Utility.flatten_list(
             _email_founds)
예제 #2
0
 def _filter_email_candidates(candidates):
     """Normalize, de-duplicate and filter e-mail candidates.

     Every candidate is converted with ``str()``, stripped of surrounding
     whitespace and lower-cased, after which duplicates are removed with
     ``Utility.uniquify``. A normalized candidate is kept only if it

     * does not match the pattern returned by ``cfg.get_blacklist_regex()``
       (``match`` semantics, i.e. anchored at the start of the string),
     * is longer than 5 characters, and
     * contains no newline, space or tab character.

     :param candidates: iterable of candidate values; ``None`` or any other
         falsy value is treated as an empty collection.
     :return: list of the surviving normalized candidates, in the order
         produced by ``Utility.uniquify``.
     """
     # Build a real list so the result is a sequence on Python 2 and 3 alike.
     normalized = [
         str(email).strip().lower() for email in (candidates or [])
     ]
     unique = Utility.uniquify(normalized)
     # Resolve the blacklist pattern once instead of once per candidate.
     is_blacklisted = re.compile(cfg.get_blacklist_regex()).match
     return [
         email for email in unique
         if not is_blacklisted(email)
         and len(email) > 5
         and '\n' not in email
         and ' ' not in email
         and '\t' not in email
     ]