def search_email_in_domain(self, domain):
    """Search email addresses for *domain*.

    Loads the domain's landing page, collects candidates both from the
    page body and from linked pages, applies the Indonesian-domain
    special case, then filters and ranks the results.

    :param domain: URL/domain to crawl.
    :return: ranked list of email addresses (at most ``cfg.max_email``),
             or an empty list when nothing was found or an error occurred.
    """
    self.check_driver()
    try:
        self._go_to_page(domain)
        soup = BS(self.driver.page_source, "lxml")
        # Find email on page and on link
        _email_founds = []
        for pos in POSSIBLE_POSITION:
            action_list = {
                'on_page': self._on_page,
                'on_link': self._on_link
            }
            email = action_list.get(pos)(soup, domain)
            _email_founds.append(email)
        email_candidates = Utility.flatten_list(_email_founds)

        # Indonesian domains get an extra dedicated lookup.
        if str(domain).endswith('.id') or str(domain).endswith('.id/'):
            emails = self.search_id_domain(domain)
            # BUGFIX: the original tested ``emails not in email_candidates``,
            # i.e. whether the whole *list* was an element of the candidate
            # list -- effectively always true, so duplicates crept in.
            # Merge address-by-address instead.
            email_candidates += [
                email for email in emails if email not in email_candidates
            ]

        if not email_candidates:
            # If email not found
            self.logger.info('Email not found on domain %s', domain)
            # Find it using whois
            return []
        # If email found, filter it
        final_candidates = self._filter_email_candidates(email_candidates)
        return self.sort_email(final_candidates, domain)
    except Exception as exc:
        # BUGFIX: route the error through the instance logger instead of
        # the Python 2 ``print`` statement (a syntax error on Python 3
        # that also bypassed the logging configuration).
        self.logger.error("Error on domain %s %s", domain, str(exc))
        return []
def _on_link(self, page, domain): self.logger.info("Search email address on link to another page") _email_founds = [] # Find all possible link element links = page.findAll('a') # Find all candidate link with keyword on html page keyword_html_link = self._find_keyword_in_html_text(links) # Find all candidate link with keyword on url keyword_url_link = self._find_keyword_in_url(links, domain) # Merge the url result, remove duplicate url candidate_links = Utility.uniquify(keyword_html_link + keyword_url_link) # Check for invalid url and try to fix it invalid_url = [ uri for uri in candidate_links if not cfg.url_regex.match(uri) ] try_fix_invalid_url = map( lambda _uri: Utility.normalize_invalid_url(_uri, domain), invalid_url) # Filter invalid url candidate_links = candidate_links + try_fix_invalid_url candidate_links = Utility.uniquify( [_uri for _uri in candidate_links if cfg.url_regex.match(_uri)]) try: for link in candidate_links: self.logger.info("Go to next link: " + link) try: self._go_to_page(link) except Exception, err: print str(err) continue soup = BS(self.driver.page_source, "lxml") email = self._on_page(soup, domain) _email_founds.append(email) return _email_founds if not _email_founds else Utility.flatten_list( _email_founds)
try: for link in candidate_links: self.logger.info("Go to next link: " + link) try: self._go_to_page(link) except Exception, err: print str(err) continue soup = BS(self.driver.page_source, "lxml") email = self._on_page(soup, domain) _email_founds.append(email) return _email_founds if not _email_founds else Utility.flatten_list( _email_founds) except Exception, e: logging.error(str(e)) return _email_founds if not _email_founds else Utility.flatten_list( _email_founds) def sort_email(self, emails, domain): # If this is not governor's domain, do not get any email candidate with .go.id domain name if '.go.id' not in domain: emails = [email for email in emails if '.go.id' not in emails] domain_name = Utility.find_domain_name(domain) emails = map(lambda email: (email, domain_name), emails) # Sort based on score descending emails.sort(cmp=lambda a, b: -1 if self.email_scoring(a) > self.email_scoring(b) else 0) emails = [x for x, y in emails] return emails[:cfg.max_email] @staticmethod def email_scoring(email_payload):