Exemplo n.º 1
0
 def scrape(self):
     """Scrape every result page for ``self.date_filter`` and log a summary.

     The first page is requested up front because its response is what
     ``get_n_pages`` reads the page count from; pages 2..n_pages are then
     fetched one by one. Each response is handed to ``populate_db``, whose
     per-page result is indexed by the keys ``'success'``, ``'failed'`` and
     ``'retry'`` and merged into a running total. The ``Session`` created
     at the start is saved only after all pages have been processed.
     """
     def log_counts(counts):
         # Shared wording for the per-page lines and the grand total.
         logger.info('{} successfully read, {} failed, {} to retry'.format(
             len(counts['success']), len(counts['failed']), len(counts['retry'])))

     logger.info('Beginning Scraper:')
     session = Session(datetime=timezone.now())

     url = get_url(date_filter=self.date_filter, page=1)
     logger.info('URL: {}'.format(url))
     response = requests.get(url)
     n_pages = get_n_pages(response)
     logger.info('n_pages: {}'.format(n_pages))

     logger.info('page 1/{}'.format(n_pages))
     summary = populate_db(response)
     log_counts(summary)

     for page in range(2, n_pages + 1):
         logger.info('page {}/{}'.format(page, n_pages))
         url = get_url(date_filter=self.date_filter, page=page)
         response = requests.get(url)
         page_summary = populate_db(response)
         log_counts(page_summary)
         # .items() is available on both Python 2 and Python 3 mappings,
         # whereas .iteritems() only exists on Python 2 dicts.
         for key, val in page_summary.items():
             summary[key].extend(val)

     session.save()

     separator = '----------' * 5
     logger.info(separator)
     logger.info('Total Summary')
     log_counts(summary)
     logger.info(separator)
Exemplo n.º 2
0
def update_daily_summaries():
    """Create and save a DailySummary for each day not yet summarised.

    Does nothing when there is no previous scraping session. The range
    starts at the date of the last existing daily summary or, when there is
    none, one day before the ``added`` value of the earliest job listing;
    it ends at the date of the previous scraping session. Which days fall
    in that range is decided by ``data_science_jobs.get_days_between``.
    """
    logger.info('Update Daily Summaries')
    previous_session = ScrapingSession.get_previous_session()
    logger.info('Previous Scraping Session: {}'.format(previous_session.datetime if previous_session else None))
    if previous_session is None:
        return

    last_summary = DailySummary.get_last_summary()
    logger.info('Last Daily Summary: {}'.format(last_summary.date if last_summary else None))
    if last_summary is None:
        # No summaries yet: start one day before the earliest listing.
        # NOTE(review): assumes get_earliest_job_listing() returns a listing
        # here (not None) -- confirm against JobListing.
        start_date = JobListing.get_earliest_job_listing().added - datetime.timedelta(days=1)
    else:
        start_date = last_summary.date

    dates_between = data_science_jobs.get_days_between(
        start_date,
        previous_session.datetime.date())
    if len(dates_between):
        logger.info('Getting Daily Summaries Between: {} - {}'.format(dates_between[0], dates_between[-1]))
    else:
        logger.info('Daily Summaries Up-To-Date')

    for date in dates_between:
        daily_summary = DailySummary.create(date=date)
        logger.info('{}: n_posts: {}'.format(date, daily_summary.n_posts))
        daily_summary.save()
Exemplo n.º 3
0
def update_monthly_summaries():
    """Create and save a MonthlySummary for each month not yet summarised.

    Does nothing when there is no previous scraping session. The range
    starts at the date of the last existing monthly summary or, when there
    is none, at the first day of the month preceding the earliest job
    listing's ``added`` value; it ends at the date of the previous scraping
    session. Which months fall in that range is decided by
    ``data_science_jobs.get_months_between``.
    """
    logger.info('Update Monthly Summaries')
    previous_session = ScrapingSession.get_previous_session()
    logger.info('Previous Scraping Session: {}'.format(previous_session.datetime if previous_session else None))
    if previous_session is None:
        return

    last_summary = MonthlySummary.get_last_summary()
    logger.info('Last Monthly Summary: {}'.format(last_summary.date if last_summary else None))
    if last_summary is None:
        earliest_added = JobListing.get_earliest_job_listing().added
        # Subtracting the day-of-month lands on the last day of the previous
        # month; the start is then the first day of that month.
        previous_month = earliest_added - datetime.timedelta(days=earliest_added.day)
        start_date = datetime.date(previous_month.year, previous_month.month, 1)
    else:
        start_date = last_summary.date

    # NOTE(review): an unused local holding the first day of the previous
    # session's month was dropped here; the range end has always been the
    # session's actual date. Confirm that is what get_months_between expects.
    months_between = data_science_jobs.get_months_between(
        start_date,
        previous_session.datetime.date())
    logger.info('Months between: {}'.format(months_between))

    for date in months_between:
        monthly_summary = MonthlySummary.create(date=date)
        logger.info('{}: n_posts: {}'.format(date, monthly_summary.n_posts))
        monthly_summary.save()
Exemplo n.º 4
0
 def test_returns_previous_session(self):
     """get_previous_session returns the session that was just created."""
     new_year_2015 = datetime.datetime(year=2015, month=1, day=1)
     created = Session.objects.create(datetime=new_year_2015)
     found = Session.get_previous_session()
     self.assertEqual(found, created)
Exemplo n.º 5
0
 def test_no_previous_session_returns_none(self):
     """get_previous_session returns None when no session has been created."""
     # assertIsNone checks identity with None and reports a clearer failure
     # message than an equality comparison against None.
     self.assertIsNone(Session.get_previous_session())