import datetime
import re
from bs4 import BeautifulSoup
from psycopg2.extras import execute_values

# Assumed defined elsewhere in the module: PUBLISHER (publisher name constant)
# and db_connection_cursor() (a psycopg2 cursor context manager).
def load_from_cache():
    # -> db as a cache
    with db_connection_cursor() as cur_r:
        cur_r.execute("SELECT key FROM temp WHERE publisher = %s",
                      [PUBLISHER])
        for record in cur_r:
            yield record[0]
def get_urls_and_then_cache(response_chunk):
    urls = []
    soup = BeautifulSoup(response_chunk[0].text, "xml")
    for el in soup.find_all("loc"):
        urls.append((PUBLISHER, el.string))
    with db_connection_cursor() as cur_w:
        execute_values(
            cur_w, "INSERT INTO temp (publisher, key) "
            "VALUES %s ON CONFLICT DO NOTHING", urls)
def archive_urls():
    for year in range(2015, 2021):
        for month in range(1, 13):
            yield f"https://www.emol.com/sitemap/noticias/{year}/emol_noticias_{year}_{month:0>2}_0000001.html"
    # load cached pagination pages
    with db_connection_cursor() as cur:
        cur.execute("SELECT key FROM temp WHERE publisher = %s", [PUBLISHER])
        for record in cur:
            yield record[0]
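
# abc.net.au: follow the archive listing's "next" link, cache it so
# archive_urls() below can replay it on a later run, and return it.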
def pagination_handler(resp):
    soup = BeautifulSoup(resp.text, "lxml")
    next_el = soup.find("a", class_="next icon-")
    if not next_el:
        return
    url = "https://www.abc.net.au/news/archive/" + next_el.attrs["href"]
    with db_connection_cursor() as cur:  # cache
        execute_values(
            cur, "INSERT INTO temp (PUBLISHER, key) "
            "VALUES %s ON CONFLICT DO NOTHING", [(PUBLISHER, url)])
    return url
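
# abc.net.au: yield one archive page per day from 2015-01-01 to 2020-12-31,
# then replay cached pagination pages.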
def archive_urls():
    date_start = datetime.datetime(2015, 1, 1)
    date_end = datetime.datetime(2020, 12, 31)
    for day in range((date_end - date_start).days + 1):
        date = date_start + datetime.timedelta(days=day)
        yield "https://www.abc.net.au/news/archive/?date=" + date.strftime(
            "%Y-%m-%d")
    # load cached pagination pages
    with db_connection_cursor() as cur:
        cur.execute("SELECT key FROM temp WHERE publisher = %s", [PUBLISHER])
        for record in cur:
            yield record[0]
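
# Expand cached (url, last_page_num) pairs into per-page URLs; raise if the
# cache has not yet been populated for this publisher.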
def generate_from_cache():
    # -> db as a cache
    cached = False
    with db_connection_cursor() as cur_r:
        cur_r.execute("SELECT key, val FROM temp WHERE publisher = %s",
                      [PUBLISHER])
        for record in cur_r:
            cached = True
            if int(record[1]) == 1:
                yield record[0]
                continue
            for num in range(1, int(record[1]) + 1):
                yield f"{record[0]}/page/{num}"
    if not cached:
        raise LookupError
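
# Read each category/month listing's last page number from its pagination bar
# and cache (url, last_page_num); pages without a pagination bar count as 1.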
def get_page_num_for_category_months_and_then_cache(response_chunk):
    urls_and_last_page_nums = []
    for resp in response_chunk:
        try:
            soup = BeautifulSoup(resp.text, "lxml")
            navigation_el = soup.find("li", class_="navigation")
            last_page_num = navigation_el.find_all("a")[-1].string
            urls_and_last_page_nums.append(
                (PUBLISHER, resp.url, last_page_num))
        except (AttributeError, IndexError):
            # no pagination bar found: treat the listing as a single page
            urls_and_last_page_nums.append((PUBLISHER, resp.url, 1))

    with db_connection_cursor() as cur_w:
        execute_values(
            cur_w, "INSERT INTO temp (publisher, key, val) "
            "VALUES %s ON CONFLICT DO NOTHING", urls_and_last_page_nums)
def pagination_handler(resp):
    resp.encoding = resp.apparent_encoding
    soup = BeautifulSoup(resp.text, "lxml")
    nav_links = soup.select("#mainContent > a")
    if not nav_links:
        return
    if len(nav_links) == 2:
        url = re.search("(.*)emol_noticias_.*",
                        resp.url).group(1) + nav_links[1].attrs.get("href")
    elif nav_links[0].string == "Siguiente >>":
        url = re.search("(.*)emol_noticias_.*",
                        resp.url).group(1) + nav_links[0].attrs.get("href")
    else:
        return
    with db_connection_cursor() as cur:  # cache
        execute_values(
            cur, "INSERT INTO temp (PUBLISHER, key) "
            "VALUES %s ON CONFLICT DO NOTHING", [(PUBLISHER, url)])
    return url
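
# The snippets above share a small Postgres scratch table used as a crawl
# cache. Its DDL is not shown here; the helper below is a hypothetical sketch
# inferred from the queries (columns publisher/key/val, with ON CONFLICT DO
# NOTHING relying on a unique constraint to deduplicate). Column types are
# guesses: key holds URLs, val holds a page count read back via int().
def create_cache_table():
    # Hypothetical helper, not part of the original snippets.
    with db_connection_cursor() as cur:
        cur.execute(
            "CREATE TABLE IF NOT EXISTS temp ("
            "  publisher TEXT NOT NULL,"
            "  key TEXT NOT NULL,"
            "  val INTEGER,"
            "  UNIQUE (publisher, key))")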