def find_emails(self, url, emails_dict, deep=2, link_filter=None):
    if not url:
        return []
    if not common.subreg(url, '^(http)'):
        url = 'http://' + url
    if '@' in url:
        return common.get_emails(url)
    if url not in emails_dict:
        emails_dict[url] = []

    if not link_filter:
        def link_filter(link_url):
            # follow only links that look like contact/about/imprint pages
            keywords = [
                "contact", "about", "agent", "info", "imprint", "kontakt",
                "uber", "wir", "impressum", "contacter", "representatives",
            ]
            link_url = link_url.lower()
            return any(kw in link_url for kw in keywords)

    def parse(doc):
        for email in common.get_emails(doc.html()):
            if email not in emails_dict[url]:
                emails_dict[url].append(email)

    self.loop(url=url, next="//a/@href | //iframe/@src", deep=deep,
              link_filter=link_filter, cb=parse, cc=10, start_now=False)
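
# Usage sketch (hypothetical): `scraper` is assumed to be an instance of the
# class that defines find_emails; how its queued loop() crawls are eventually
# started (start_now=False) is not shown in these snippets.
emails_dict = {}
for site in ['example.com', 'example.org']:
    scraper.find_emails(site, emails_dict, deep=2)
# Once the queued crawls run, emails_dict maps each normalized start URL to
# the emails collected under it, e.g. {'http://example.com': ['info@example.com']}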
def _parse_emails(doc):
    emails = []
    # first try to get emails from the link texts only, because they are more reliable
    link_texts = doc.q("//a").join(' | ')
    for email in common.get_emails(link_texts):
        if '@' in email and email not in emails:
            emails.append(email)
    if not emails:
        # fall back to the full page text, with scripts stripped out
        html = doc.remove("//script").html()
        for email in common.get_emails(html):
            if '@' in email and email not in emails:
                emails.append(email)
    return emails
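
# common.get_emails is used throughout these snippets but never defined here.
# The following is a minimal regex-based stand-in, an assumption about its
# behavior rather than the actual implementation, which may normalize or
# filter more aggressively.
import re

_EMAIL_RE = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}')

def get_emails(text):
    """Return unique email-like strings found in text, in order of appearance."""
    seen = []
    for match in _EMAIL_RE.findall(text or ''):
        if match not in seen:
            seen.append(match)
    return seen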
def mine_emails(self, url):
    """Look for emails on the key pages of a website: homepage and contact page."""
    if not url:
        return []
    if not common.subreg(url, '^(http)'):
        url = 'http://' + url
    if '@' in url:
        return common.get_emails(url)

    emails = []

    def _parse_emails(doc):
        # first try the link texts only, because they are more reliable
        link_texts = doc.q("//a").join(' | ')
        for email in common.get_emails(link_texts):
            if '@' in email and email not in emails:
                emails.append(email)
        if not emails:
            # fall back to the full page text, with scripts stripped out
            html = doc.remove("//script").html()
            for email in common.get_emails(html):
                if '@' in email and email not in emails:
                    emails.append(email)

    homepage = self.load(url)
    _parse_emails(homepage)
    if emails:
        # found on the homepage, no need to check the contact page
        return emails

    contact_url = homepage.x(
        "//a[contains(@href,'contact') or contains(@href,'Contact')]/@href"
    )
    if contact_url:
        contactpage = self.load(contact_url)
        _parse_emails(contactpage)
    return emails
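
# Usage sketch (hypothetical): mine_emails is a method, so `scraper` below is
# an assumed instance of the class that defines it and load(); neither is
# shown in these snippets.
emails = scraper.mine_emails('example.com')
# returns e.g. ['info@example.com'], taken from the homepage or its contact page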
def mine_emails(url, br=None, deep_level=1):
    """deep_level = 1: scrape the home page and contact page only."""
    if not url:
        return []
    if not common.subreg(url, '^(http)'):
        url = 'http://' + url
    if '@' in url:
        return common.get_emails(url)

    domain = common.get_domain(url).lower()
    history = {}  # page_url -> (level of its sub-links, list of sub-links)

    def _load_page(page_url, current_level):
        """Load a page; the caller must make sure page_url has not been loaded yet."""
        logger.debug('mine_emails page %s, level %s', page_url, current_level)
        html = ''
        if br:
            try:
                br.get(page_url)
                html = br.page_source
            except Exception as e:
                logger.warning('failed to _load_page: %s', page_url)
                raise e  # re-raise so the caller can recreate the browser
        else:
            # `s` is a module-level loader defined elsewhere, not shown here
            html = s.load_html(page_url)
        doc = Doc(url=page_url, html=html)
        # record this page's internal links for the next crawl level
        links = doc.q("//a")
        sub_urls = []
        for link in links:
            _url = link.href()
            if domain not in _url.lower():
                continue  # skip external links
            if _url in history:
                continue  # already visited
            if _url not in sub_urls:
                sub_urls.append(_url)
        history[page_url] = (current_level + 1, sub_urls)
        return doc

    def _parse_emails(doc):
        emails = []
        # first try the link texts only, because they are more reliable
        link_texts = doc.q("//a").join(' | ')
        for email in common.get_emails(link_texts):
            if '@' in email and email not in emails:
                emails.append(email)
        if not emails:
            # fall back to the full page text, with scripts stripped out
            html = doc.remove("//script").html()
            for email in common.get_emails(html):
                if '@' in email and email not in emails:
                    emails.append(email)
        return emails

    def _load_subpages(level):
        # first, collect all the not-yet-visited URLs recorded at this level
        urls = []
        for page_url in history:
            _level, suburls = history[page_url]
            if _level != level:
                continue
            for suburl in suburls:
                if suburl in history:
                    continue
                if suburl not in urls:
                    urls.append(suburl)
        logger.debug('mine emails in level %s, with %s urls to process',
                     level, len(urls))
        for suburl in urls:
            doc = _load_page(suburl, level)
            emails = _parse_emails(doc)
            if emails:
                # found emails on this page, that's enough
                return emails
        # not found
        return []

    doc = _load_page(url, current_level=1)
    emails = _parse_emails(doc)
    if emails:
        return emails

    contact_url = doc.x(
        "//a[contains(@href,'contact') or contains(@href,'Contact')]/@href")
    if contact_url:
        doc = _load_page(contact_url, current_level=2)
        emails = _parse_emails(doc)
        # once a contact page is found, no need to dig further even if it has no emails
        return emails

    # try with level 2
    if deep_level >= 2:
        emails = _load_subpages(level=2)
        if emails:
            return emails
    # try with level 3
    if deep_level >= 3:
        emails = _load_subpages(level=3)
        if emails:
            return emails
    # not found
    return []
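
# Usage sketch for the standalone variant. The optional `br` argument appears
# to be a Selenium-style WebDriver (it exposes get() and page_source), but that
# is an inference from the calls in _load_page, not confirmed by the source.
emails = mine_emails('example.com', deep_level=1)  # plain HTTP loading via s.load_html

# With a browser (hypothetical, under the Selenium assumption above):
# from selenium import webdriver
# br = webdriver.Chrome()
# emails = mine_emails('example.com', br=br, deep_level=2)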