def __init__(self):
    """Store the e-mail patterns and create the page requestor.

    Both patterns are plain (uncompiled) strings with a single capture
    group holding the address. They only list upper-case letters, so they
    must be compiled or searched with ``re.I``.
    """
    # The top-level domain is "two or more letters": a fixed 2-4 letter
    # bound rejects addresses under longer TLDs such as .fitness or .email.
    self.email_regex = r'\b([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,})\b'
    # Same address pattern, anchored behind a "mailto:" scheme prefix.
    self.email_link_regex = r'mailto:\b([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,})\b'
    self.sreq = SoupRequestor()
class EmailScraper(object):
    """Look for a contact e-mail address on a web site.

    Pages are fetched with ``self.sreq.get(url)``, which is unpacked here as
    a ``(response, soup)`` pair; a ``response`` of ``None`` is treated as a
    failed fetch. ``soup`` is only used through ``find`` and ``meta``.
    """

    # Address pattern with one capture group. Only upper-case letters are
    # listed, so it must be used with re.I. The top-level domain is "two or
    # more letters": a fixed 2-4 letter bound rejects addresses under longer
    # TLDs such as .fitness or .email.
    EMAIL_REGEX = r'\b([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,})\b'
    # Same address pattern, anchored behind a "mailto:" scheme prefix.
    EMAIL_LINK_REGEX = r'mailto:' + EMAIL_REGEX

    def __init__(self):
        """Store the address patterns and create the page requestor."""
        self.email_regex = self.EMAIL_REGEX
        self.email_link_regex = self.EMAIL_LINK_REGEX
        self.sreq = SoupRequestor()

    def follow_link(self, base_url, pattern):
        """Placeholder: not implemented, always returns None."""
        pass

    @staticmethod
    def _search_address(regex, text):
        """Return what ``regex`` captures first in ``text``, or None.

        ``regex`` is an uncompiled pattern string with one capture group; it
        is searched case-insensitively. A missing/empty ``text`` or a failed
        search both yield None instead of raising.
        """
        if not text:
            return None
        match = re.search(regex, text, re.I)
        return match.group(1) if match else None

    def get_email_link_from_page(self, soup):
        """Return the address from an ``<a href="mailto:...">`` link, or None.

        Only the anchor returned by ``soup.find`` is inspected.
        """
        pattern = re.compile(self.email_link_regex, re.I)
        anchor = soup.find('a', href=pattern)
        if not anchor:
            return None
        return self._search_address(self.email_link_regex, anchor.get('href'))

    def get_email_text_from_page(self, soup):
        """Return an address found in the page's text, or None.

        Only the text node returned by ``soup.find(text=...)`` is inspected.
        """
        pattern = re.compile(self.email_regex, re.I)
        text = soup.find(text=pattern)
        if not text:
            return None
        return self._search_address(self.email_regex, text)

    def _email_from_page(self, soup):
        """Try the mailto link first, then visible text; None if neither hits."""
        return (self.get_email_link_from_page(soup)
                or self.get_email_text_from_page(soup))

    def _email_from_url(self, url):
        """Fetch ``url`` and return an address found on it, or None."""
        response, soup = self.sreq.get(url)
        if response is None:
            return None
        return self._email_from_page(soup)

    def scrape_email(self, base_url):
        """Return an e-mail address for the site at ``base_url``, or None.

        Pages are tried in order and the first address found wins:
        the landing page, a linked "contact" page, then the landing page
        requested with ``?_escaped_fragment_=``. Returns None when the
        landing page cannot be fetched or no page yields an address.
        """
        (response, soup) = self.sreq.get(base_url)
        if response is None:
            return None

        # NOTE(review): presumably an ISP "search assist" page served for
        # domains that do not resolve - confirm. Such pages are skipped.
        if soup.meta and 'searchassist.verizon.com' in soup.meta.get('content', ''):
            return None

        # First try landing page
        email = self._email_from_page(soup)
        if email:
            return email

        # See if there's a "contact us" page
        contact = soup.find('a', text=re.compile(r'contact', re.I))
        if contact:
            contact_url = urlparse.urljoin(base_url, contact.get('href'))
            email = self._email_from_url(contact_url)
            if email:
                return email

        # Try escaped fragment version of landing page
        fragment_url = urlparse.urljoin(base_url, '?_escaped_fragment_=')
        email = self._email_from_url(fragment_url)
        if email:
            return email
        return None
def __init__(self):
    """Set up the affiliate-info URL template and the scraping helpers."""
    # The '{}' placeholder is filled in later with str.format.
    self.url = "http://map.crossfit.com/affinfo.php?a={}&t=0"

    # Helper objects: one searches sites for e-mail addresses, the other
    # fetches pages.
    self.email_scraper = EmailScraper()
    self.sreq = SoupRequestor()
class CrossfitScraper(object):
    """Collect CrossFit affiliate records and their contact e-mail addresses.

    Affiliate info pages are requested by numeric id and stored as
    ``CrossfitGym`` rows; each stored gym's own link is then searched for an
    e-mail address with ``EmailScraper``.
    """

    def __init__(self):
        """Set up the affiliate-info URL template and the scraping helpers."""
        # The '{}' placeholder is replaced with the affiliate id in get_gym().
        self.url = "http://map.crossfit.com/affinfo.php?a={}&t=0"
        self.email_scraper = EmailScraper()
        self.sreq = SoupRequestor()

    def get_gym(self, affid):
        """Fetch the info page for affiliate ``affid``.

        Returns the two values produced by ``self.sreq.get`` as a
        ``(response, soup)`` tuple.
        """
        response, soup = self.sreq.get(self.url.format(affid))
        return (response, soup)

    @staticmethod
    def _clean_address(strings):
        """Join text fragments and collapse each whitespace run to one space."""
        joined = ' '.join(['%s' % piece for piece in strings])
        return ' '.join(joined.split())

    def set_gym_from_response(self, affid, r, s):
        """Create and save a ``CrossfitGym`` from an affiliate info page.

        ``s`` is the parsed page; ``r`` is accepted but not used. The first
        ``<b>`` element supplies the name and link (through its ``<a>``), the
        last remaining node is stored as the phone, and whatever text is left
        becomes the address. Returns None, and saves nothing when the
        expected markup is missing.
        """
        if s.b is None:
            return
        heading = s.b.extract()
        # Nothing left once the heading is removed: there is no phone node to
        # take, so skip the page instead of raising IndexError.
        if not s.contents:
            return
        phone = s.contents[-1].extract()
        if heading.a is None:
            return

        gym = CrossfitGym()
        gym.name = heading.a.text
        gym.link = heading.a['href']
        # Heading and phone were extracted above, so only address text remains.
        gym.addr = self._clean_address(s.findAll(text=True))
        gym.affid = affid
        gym.phone = phone
        gym.save()

    def get_gym_list(self, start_id=1, stop_id=3500, delay=0.75):
        """Fetch and store every affiliate in ``[start_id, stop_id)``.

        Ids that already have a ``CrossfitGym`` row are skipped without a
        request. ``delay`` is the pause in seconds after each page that was
        fetched successfully. The defaults reproduce the original fixed
        range and pause.
        """
        for affid in xrange(start_id, stop_id):
            if CrossfitGym.objects.filter(affid=affid).exists():
                continue
            print('Getting info for %d' % affid)
            response, soup = self.get_gym(affid)
            if response is None:
                continue
            self.set_gym_from_response(affid, response, soup)
            sleep(delay)

    def get_gym_email(self, gym):
        """Look up and store the e-mail address for ``gym`` if not yet tried.

        Does nothing when the gym already has an address or
        ``checked_email`` is anything other than ``False``. Otherwise the
        gym is marked as checked whether or not an address was found, and
        saved once.
        """
        if gym.email or gym.checked_email is not False:
            return
        print('Getting email for %s' % gym)
        found = self.email_scraper.scrape_email(gym.link)
        if found:
            gym.email = found
        gym.checked_email = True
        # One save persists both fields.
        gym.save()

    def get_gym_emails(self):
        """Run get_gym_email() over every stored ``CrossfitGym``."""
        for gym in CrossfitGym.objects.all():
            self.get_gym_email(gym)

    def scrape(self):
        """Fetch the affiliate list, then look up e-mail addresses."""
        self.get_gym_list()
        self.get_gym_emails()
class CrossfitScraper(object):
    """Collect CrossFit affiliate records and their contact e-mail addresses.

    Affiliate info pages are requested by numeric id and stored as
    ``CrossfitGym`` rows; each stored gym's own link is then searched for an
    e-mail address with ``EmailScraper``.
    """

    def __init__(self):
        """Set up the affiliate-info URL template and the scraping helpers."""
        # The '{}' placeholder is replaced with the affiliate id in get_gym().
        self.url = "http://map.crossfit.com/affinfo.php?a={}&t=0"
        self.email_scraper = EmailScraper()
        self.sreq = SoupRequestor()

    def get_gym(self, affid):
        """Fetch the info page for affiliate ``affid``.

        Returns the two values produced by ``self.sreq.get`` as a
        ``(response, soup)`` tuple.
        """
        response, soup = self.sreq.get(self.url.format(affid))
        return (response, soup)

    @staticmethod
    def _clean_address(strings):
        """Join text fragments and collapse each whitespace run to one space."""
        joined = ' '.join(['%s' % piece for piece in strings])
        return ' '.join(joined.split())

    def set_gym_from_response(self, affid, r, s):
        """Create and save a ``CrossfitGym`` from an affiliate info page.

        ``s`` is the parsed page; ``r`` is accepted but not used. The first
        ``<b>`` element supplies the name and link (through its ``<a>``), the
        last remaining node is stored as the phone, and whatever text is left
        becomes the address. Returns None, and saves nothing when the
        expected markup is missing.
        """
        if s.b is None:
            return
        heading = s.b.extract()
        # Nothing left once the heading is removed: there is no phone node to
        # take, so skip the page instead of raising IndexError.
        if not s.contents:
            return
        phone = s.contents[-1].extract()
        if heading.a is None:
            return

        gym = CrossfitGym()
        gym.name = heading.a.text
        gym.link = heading.a['href']
        # Heading and phone were extracted above, so only address text remains.
        gym.addr = self._clean_address(s.findAll(text=True))
        gym.affid = affid
        gym.phone = phone
        gym.save()

    def get_gym_list(self, start_id=1, stop_id=3500, delay=0.75):
        """Fetch and store every affiliate in ``[start_id, stop_id)``.

        Ids that already have a ``CrossfitGym`` row are skipped without a
        request. ``delay`` is the pause in seconds after each page that was
        fetched successfully. The defaults reproduce the original fixed
        range and pause.
        """
        for affid in xrange(start_id, stop_id):
            if CrossfitGym.objects.filter(affid=affid).exists():
                continue
            print('Getting info for %d' % affid)
            response, soup = self.get_gym(affid)
            if response is None:
                continue
            self.set_gym_from_response(affid, response, soup)
            sleep(delay)

    def get_gym_email(self, gym):
        """Look up and store the e-mail address for ``gym`` if not yet tried.

        Does nothing when the gym already has an address or
        ``checked_email`` is anything other than ``False``. Otherwise the
        gym is marked as checked whether or not an address was found, and
        saved once.
        """
        if gym.email or gym.checked_email is not False:
            return
        print('Getting email for %s' % gym)
        found = self.email_scraper.scrape_email(gym.link)
        if found:
            gym.email = found
        gym.checked_email = True
        # One save persists both fields.
        gym.save()

    def get_gym_emails(self):
        """Run get_gym_email() over every stored ``CrossfitGym``."""
        for gym in CrossfitGym.objects.all():
            self.get_gym_email(gym)

    def scrape(self):
        """Fetch the affiliate list, then look up e-mail addresses."""
        self.get_gym_list()
        self.get_gym_emails()