def test_is_before(d1, d2, expected):
    """Ensure d1 is before d2."""
    assert date_utils.is_before(d1, d2) == expected
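
# Illustrative only: a hedged sketch of how the test above might be driven with
# pytest parametrization. The concrete cases and the datetime type are assumptions,
# not taken from the real test suite; date_utils is the same module the test file
# above already imports.
import datetime

import pytest


@pytest.mark.parametrize(
    'd1,d2,expected',
    [
        (datetime.datetime(2019, 1, 1), datetime.datetime(2019, 1, 2), True),
        (datetime.datetime(2019, 1, 2), datetime.datetime(2019, 1, 1), False),
    ],
)
def test_is_before_sketch(d1, d2, expected):
    """Ensure d1 is before d2 (illustrative cases only)."""
    assert date_utils.is_before(d1, d2) == expected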
async def async_retrieve(pages=-1, from_=None, to=None, attempts=1, backoff=1):
    """
    Retrieve fatality data.

    :param int pages: number of pages to retrieve, or -1 for all
    :param str from_: the start date
    :param str to: the end date
    :param int attempts: maximum number of attempts per detail page
    :param int backoff: exponential backoff multiplier, in seconds
    :return: the list of fatalities and the number of pages that were read.
    :rtype: tuple
    """
    res = {}
    page = 1
    has_entries = False
    no_date_within_range_count = 0
    from_date = date_utils.from_date(from_)
    to_date = date_utils.to_date(to)

    logger.debug(f'Retrieving fatalities from {from_date} to {to_date}.')

    async with aiohttp.ClientSession() as session:
        while True:
            # Fetch the news page.
            logger.info(f'Fetching page {page}...')
            try:
                news_page = await fetch_news_page(session, page)
            except Exception as exc:
                raise ValueError(f'Cannot retrieve news page #{page}.') from exc

            # Look for traffic fatality links.
            page_details_links = extract_traffic_fatalities_page_details_link(news_page)

            # Generate the full URL for the links.
            links = generate_detail_page_urls(page_details_links)
            logger.debug(f'{len(links)} fatality page(s) to process.')

            # Fetch and parse each link, retrying failed requests with exponential backoff.
            tasks = [
                fetch_and_parse.retry_with(
                    stop=stop_after_attempt(attempts),
                    wait=wait_exponential(multiplier=backoff),
                    reraise=True,
                )(session, link) for link in links
            ]
            page_res = await asyncio.gather(*tasks)
            if page_res:
                page_res = [person for item in page_res for person in item]

            # If the page contains fatalities, ensure all of them happened within the specified time range.
            entries_in_time_range = [
                entry for entry in page_res if date_utils.is_between(entry[Fields.DATE], from_date, to_date)
            ]

            # If 2 pages in a row:
            #   1) contain results,
            #   2) but none of them contain dates within the time range,
            #   3) and we did not collect any valid entries,
            # then we can stop the operation.
            past_entries = all(date_utils.is_before(entry[Fields.DATE], from_date) for entry in page_res)
            if from_ and past_entries and not has_entries:
                no_date_within_range_count += 1
                if no_date_within_range_count > 1:
                    logger.debug(f'{len(entries_in_time_range)} fatality page(s) within the specified time range.')
                    break

            # Check whether we found entries in the previous pages.
            if not has_entries:
                has_entries = bool(entries_in_time_range)
            logger.debug(f'{len(entries_in_time_range)} fatality page(s) within the specified time range.')

            # If there are none in range, we do not need to search further, and we can discard the results.
            if has_entries and not entries_in_time_range:
                logger.debug(f'There is no data within the specified time range on page {page}.')
                break

            # Store the results if the ID number is new.
            res.update(
                {entry.get(Fields.ID): entry for entry in entries_in_time_range if entry.get(Fields.ID) not in res})

            # Stop if there are no further pages.
            if not has_next(news_page) or page >= pages > 0:
                break

            page += 1

    return list(res.values()), page
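
# Note on the retry logic above: `fetch_and_parse.retry_with(...)` relies on the
# tenacity library, which attaches a `retry_with` attribute to functions decorated
# with @retry. Below is a minimal sketch of what that decoration might look like.
# The default retry values and the fetch_detail_page/parse_detail_page helpers are
# assumptions for illustration, not the project's actual implementation.
from tenacity import retry, stop_after_attempt, wait_exponential


@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1), reraise=True)
async def fetch_and_parse(session, link):
    """Fetch a detail page and parse it into a list of fatality entries."""
    detail_page = await fetch_detail_page(session, link)  # hypothetical helper
    return parse_detail_page(detail_page)  # hypothetical helper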
async def async_retrieve(pages=-1, from_=None, to=None):
    """Retrieve fatality data."""
    res = {}
    page = 1
    has_entries = False
    no_date_within_range_count = 0

    logger.debug(f'Retrieving fatalities from {date_utils.from_date(from_)} to {date_utils.to_date(to)}.')

    async with aiohttp.ClientSession() as session:
        while True:
            # Fetch the news page.
            logger.info(f'Fetching page {page}...')
            try:
                news_page = await fetch_news_page(session, page)
            except Exception as exc:
                raise ValueError(f'Cannot retrieve news page #{page}.') from exc

            # Look for traffic fatality links.
            page_details_links = extract_traffic_fatalities_page_details_link(news_page)

            # Generate the full URL for the links.
            links = generate_detail_page_urls(page_details_links)
            logger.debug(f'{len(links)} fatality page(s) to process.')

            # Fetch and parse each link.
            tasks = [fetch_and_parse(session, link) for link in links]
            page_res = await asyncio.gather(*tasks)

            # If the page contains fatalities, ensure all of them happened within the specified time range.
            if page_res:
                entries_in_time_range = [
                    entry for entry in page_res if date_utils.is_between(entry[Fields.DATE], from_, to)
                ]

                # If 2 pages in a row:
                #   1) contain results,
                #   2) but none of them contain dates within the time range,
                #   3) and we did not collect any valid entries,
                # then we can stop the operation.
                if from_ and all(
                        date_utils.is_before(entry[Fields.DATE], from_)
                        for entry in page_res) and not has_entries:
                    no_date_within_range_count += 1
                    if no_date_within_range_count > 1:
                        logger.debug(
                            f'{len(entries_in_time_range)} fatality page(s) within the specified time range.')
                        break

                # Check whether we found entries in the previous pages.
                if not has_entries:
                    has_entries = bool(entries_in_time_range)
                logger.debug(f'{len(entries_in_time_range)} fatality page(s) within the specified time range.')

                # If there are none in range, we do not need to search further, and we can discard the results.
                if has_entries and not entries_in_time_range:
                    logger.debug(f'There is no data within the specified time range on page {page}.')
                    break

                # Store the results if the case number is new.
                res.update({
                    entry.get(Fields.CASE): entry
                    for entry in entries_in_time_range if entry.get(Fields.CASE) not in res
                })

            # Stop if there are no further pages.
            if not has_next(news_page) or page >= pages > 0:
                break

            page += 1

    return list(res.values()), page
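
# A hedged usage sketch: driving async_retrieve from synchronous code. The date
# strings and page count below are illustrative values, not defaults taken from
# the project.
import asyncio


def main():
    # Read at most 2 news pages and keep only fatalities reported in 2019.
    fatalities, pages_read = asyncio.run(async_retrieve(pages=2, from_='Jan 1 2019', to='Dec 31 2019'))
    print(f'Collected {len(fatalities)} fatalities from {pages_read} page(s).')


if __name__ == '__main__':
    main()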