def mine(self, date: datetime):
    """Scrape all job listings for *date*, unless it was mined before.

    Progress and warnings are recorded via ``self._rec``. Scraped job
    data accumulates on the Scraper; nothing is pickled here — multiple
    days are pickled at once before exit.

    :param date: the day whose summary page should be mined.
    """
    date_string = date.strftime('%d-%m-%Y')

    # Switch on the engine
    m = Scraper(date=date, session=self._session, server=self._server)

    # Been there, done that
    if date in self._miners:
        self._rec('{} has already been mined', date_string)
        m.close()
        return

    # Go browse the web summary page for that day
    # and scrape off the job uuid request parameters.
    jobs = m.scrape_uuids()

    # I don't work on weekends
    if not jobs:
        self._rec('No jobs found for {}', date_string)
        return

    # One printer serves every job — hoisted out of the loop.
    pp = PrettyPrinter()
    for job_uuid in jobs:
        # Grab the job's web page, regex it and store
        # the collected fields in a sensible manner.
        # We don't pickle the data yet: instead, we
        # pickle multiple days at once before exit.
        soup = m._get_job(job_uuid)
        raw_data = m._scrape_job(soup)
        m.process_job(raw_data)

        # So wanna see results?
        pp.pprint(m.raw_data[0])  # Job details
        pp.pprint(m.raw_data[1])  # Price table
        # Plain loop instead of a side-effect list comprehension.
        for address in m.raw_data[2]:  # Addresses
            pp.pprint(address)

    # We're never gonna scrape with a 100% success
    # rate, but let's do better next time!
    # TODO Hopefully remove this debug message later
    self._rec('Mined: {} successfully!', date_string)
    for message in m._warnings:
        self._rec(message)
    # NOTE(review): m.close() is only reached on the already-mined
    # path; if Scraper holds a live connection, the success path
    # leaks it — confirm whether close() belongs here as well.