def __init__(self): self.url = "http://map.crossfit.com/affinfo.php?a={}&t=0" self.email_scraper = EmailScraper() self.sreq = SoupRequestor()
class CrossfitScraper(object): def __init__(self): self.url = "http://map.crossfit.com/affinfo.php?a={}&t=0" self.email_scraper = EmailScraper() self.sreq = SoupRequestor() def get_gym(self, affid): u = self.url.format(affid) r, s = self.sreq.get(u) return (r, s) def set_gym_from_response(self, affid, r, s): if s.b is None: return b = s.b.extract() p = s.contents[-1].extract() if b.a is None: return addr = ' '.join(['%s' % x for x in s.findAll(text=True)]) addr = ' '.join(addr.split()) gym = CrossfitGym() gym.name = b.a.text gym.link = b.a['href'] gym.addr = addr gym.affid = affid gym.phone = p gym.save() def get_gym_list(self): for i in xrange(1, 3500): if CrossfitGym.objects.filter(affid=i).exists(): continue print 'Getting info for %d' % i r, s = self.get_gym(i) if r is None: continue self.set_gym_from_response(i, r, s) sleep(0.75) def get_gym_email(self, gym): if not gym.email and gym.checked_email is False: print 'Getting email for %s' % gym e = self.email_scraper.scrape_email(gym.link) if e: gym.email = e gym.save() gym.checked_email = True gym.save() def get_gym_emails(self): for gym in CrossfitGym.objects.all(): self.get_gym_email(gym) def scrape(self): self.get_gym_list() self.get_gym_emails()
class CrossfitScraper(object): def __init__(self): self.url = "http://map.crossfit.com/affinfo.php?a={}&t=0" self.email_scraper = EmailScraper() self.sreq = SoupRequestor() def get_gym(self, affid): u = self.url.format(affid) r,s = self.sreq.get(u) return (r,s) def set_gym_from_response(self, affid, r, s): if s.b is None: return b = s.b.extract() p = s.contents[-1].extract() if b.a is None: return addr = ' '.join(['%s' % x for x in s.findAll(text=True)]) addr = ' '.join(addr.split()) gym = CrossfitGym() gym.name = b.a.text gym.link = b.a['href'] gym.addr = addr gym.affid = affid gym.phone = p gym.save() def get_gym_list(self): for i in xrange(1, 3500): if CrossfitGym.objects.filter(affid=i).exists(): continue print 'Getting info for %d' % i r,s = self.get_gym(i) if r is None: continue self.set_gym_from_response(i, r, s) sleep(0.75) def get_gym_email(self, gym): if not gym.email and gym.checked_email is False: print 'Getting email for %s' % gym e = self.email_scraper.scrape_email(gym.link) if e: gym.email = e gym.save() gym.checked_email = True gym.save() def get_gym_emails(self): for gym in CrossfitGym.objects.all(): self.get_gym_email(gym) def scrape(self): self.get_gym_list() self.get_gym_emails()