def scrape(self):
    """Scrape every result page for this scraper's date filter.

    Creates a ``Session`` stamped with the current time, fetches page 1 to
    discover the total page count, then walks pages 2..n, passing each HTTP
    response to ``populate_db``. Per-page summaries (dicts of ``success`` /
    ``failed`` / ``retry`` lists) are merged into one total summary, which
    is logged at the end. The session is saved once all pages are processed.
    """
    logger.info('Beginning Scraper:')
    session = Session(datetime=timezone.now())

    # First request doubles as the page-count probe.
    url = get_url(date_filter=self.date_filter, page=1)
    logger.info('URL: {}'.format(url))
    response = requests.get(url)
    n_pages = get_n_pages(response)
    logger.info('n_pages: {}'.format(n_pages))

    logger.info('page 1/{}'.format(n_pages))
    summary = populate_db(response)
    logger.info('{} successfully read, {} failed, {} to retry'.format(
        len(summary['success']), len(summary['failed']), len(summary['retry'])))

    for page in range(2, n_pages + 1):
        logger.info('page {}/{}'.format(page, n_pages))
        url = get_url(date_filter=self.date_filter, page=page)
        response = requests.get(url)
        summary_ = populate_db(response)
        logger.info('{} successfully read, {} failed, {} to retry'.format(
            len(summary_['success']), len(summary_['failed']), len(summary_['retry'])))
        # BUG FIX: dict.iteritems() is Python 2 only and raises
        # AttributeError on Python 3; items() behaves the same here.
        for key, val in summary_.items():
            summary[key].extend(val)

    session.save()
    logger.info('----------' * 5)
    logger.info('Total Summary')
    logger.info('{} successfully read, {} failed, {} to retry'.format(
        len(summary['success']), len(summary['failed']), len(summary['retry'])))
    logger.info('----------' * 5)
def update_daily_summaries():
    """Create and save a ``DailySummary`` for each day not yet summarised.

    Uses the last scraping session's date as the upper bound. The lower
    bound is the date of the last existing daily summary, or — when none
    exists — one day before the earliest job listing, so that the earliest
    listing's day is itself summarised. Does nothing if no scraping session
    has ever run.
    """
    logger.info('Update Daily Summaries')
    previous_session = ScrapingSession.get_previous_session()
    logger.info('Previous Scraping Session: {}'.format(
        previous_session.datetime if previous_session else None))
    # PEP 8: identity comparison for None (was `== None`).
    if previous_session is None:
        return

    last_summary = DailySummary.get_last_summary()
    logger.info('Last Daily Summary: {}'.format(
        last_summary.date if last_summary else None))
    if last_summary is None:
        # Back up one day so the earliest listing's own day is included.
        start_date = (JobListing.get_earliest_job_listing().added
                      - datetime.timedelta(days=1))
    else:
        start_date = last_summary.date

    dates_between = data_science_jobs.get_days_between(
        start_date, previous_session.datetime.date())
    if dates_between:
        logger.info('Getting Daily Summaries Between: {} - {}'.format(
            dates_between[0], dates_between[-1]))
    else:
        logger.info('Daily Summaries Up-To-Date')

    for date in dates_between:
        daily_summary = DailySummary.create(date=date)
        logger.info('{}: n_posts: {}'.format(date, daily_summary.n_posts))
        daily_summary.save()
def update_monthly_summaries():
    """Create and save a ``MonthlySummary`` for each month not yet summarised.

    Uses the last scraping session's date as the upper bound. The lower
    bound is the date of the last existing monthly summary, or — when none
    exists — a first-of-month date derived from the earliest job listing.
    Does nothing if no scraping session has ever run.
    """
    logger.info('Update Monthly Summaries')
    previous_session = ScrapingSession.get_previous_session()
    logger.info('Previous Scraping Session: {}'.format(
        previous_session.datetime if previous_session else None))
    # PEP 8: identity comparison for None (was `== None`).
    if previous_session is None:
        return

    last_summary = MonthlySummary.get_last_summary()
    # Log expression made consistent with update_daily_summaries.
    logger.info('Last Monthly Summary: {}'.format(
        last_summary.date if last_summary else None))
    if last_summary is None:
        start_date = JobListing.get_earliest_job_listing().added
        # NOTE(review): subtracting `days=start_date.day` (not `day - 1`)
        # lands in the *previous* month, so this yields the first of the
        # month before the earliest listing — looks like a deliberate
        # back-off but could be an off-by-one; behaviour preserved. TODO confirm.
        start_date = start_date - datetime.timedelta(days=start_date.day)
        start_date = datetime.datetime(
            year=start_date.year, month=start_date.month, day=1).date()
    else:
        start_date = last_summary.date

    # NOTE(review): the original also computed the first day of the previous
    # session's month (`previous_session_month`) but never used it — possibly
    # intended as the end bound below. Dead code removed; the end bound is
    # unchanged (the session's date).
    months_between = data_science_jobs.get_months_between(
        start_date, previous_session.datetime.date())
    logger.info('Months between: {}'.format(months_between))

    for date in months_between:
        monthly_summary = MonthlySummary.create(date=date)
        logger.info('{}: n_posts: {}'.format(date, monthly_summary.n_posts))
        monthly_summary.save()
def test_returns_previous_session(self):
    """A stored session should be returned by get_previous_session()."""
    stored = Session.objects.create(
        datetime=datetime.datetime(year=2015, month=1, day=1))
    self.assertEqual(Session.get_previous_session(), stored)
def test_no_previous_session_returns_none(self):
    """With no sessions stored, get_previous_session() should return None."""
    # assertIsNone gives a clearer failure message than assertEqual(..., None)
    # and checks identity, which is the correct test for None.
    self.assertIsNone(Session.get_previous_session())