# Shared imports for the publisher-crawler listings below. PUBLISHER (a
# publisher identifier string) and db_connection_cursor() (a context manager
# yielding a database cursor) are assumed to be defined per publisher module.
import datetime
import re

from bs4 import BeautifulSoup
from psycopg2.extras import execute_values


def load_from_cache():  # -> db as a cache
    # yield every URL previously cached for this publisher
    with db_connection_cursor() as cur_r:
        cur_r.execute("SELECT key FROM temp WHERE publisher = %s",
                      [PUBLISHER])
        for record in cur_r:
            yield record[0]
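# Neither db_connection_cursor() nor the temp table is defined in this
# section. The following is a minimal sketch of what they might look like,
# assuming PostgreSQL via psycopg2 and a hypothetical DSN; the table layout
# (publisher, key, val) is inferred from the queries in the listings.
import contextlib

import psycopg2


@contextlib.contextmanager
def db_connection_cursor():
    # open a connection per use; commit on success, always close
    conn = psycopg2.connect("dbname=crawl_cache")  # assumed DSN
    try:
        with conn.cursor() as cur:
            yield cur
        conn.commit()
    finally:
        conn.close()


# inferred schema for the cache table:
#   CREATE TABLE temp (
#       publisher TEXT NOT NULL,
#       key       TEXT NOT NULL,
#       val       TEXT,
#       UNIQUE (publisher, key)
#   );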
def get_urls_and_then_cache(response_chunk):
    # collect every <loc> entry from the sitemap response, then cache them
    urls = []
    soup = BeautifulSoup(response_chunk[0].text, "xml")
    for el in soup.find_all("loc"):
        urls.append((PUBLISHER, el.string))
    with db_connection_cursor() as cur_w:
        execute_values(
            cur_w, "INSERT INTO temp (publisher, key) "
            "VALUES %s ON CONFLICT DO NOTHING", urls)
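# A sketch of how get_urls_and_then_cache() might be fed, assuming the
# requests library; note that the function only reads the first response of
# a chunk. The sitemap URL here is hypothetical, not taken from the listings.
import requests

resp = requests.get("https://example.com/sitemap.xml")  # hypothetical URL
get_urls_and_then_cache([resp])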
def archive_urls():
    # first page of the EMOL news sitemap for every month from 2015 to 2020
    for year in range(2015, 2021):
        for month in range(1, 13):
            yield (f"https://www.emol.com/sitemap/noticias/{year}/"
                   f"emol_noticias_{year}_{month:0>2}_0000001.html")
    # load cached pagination pages
    with db_connection_cursor() as cur:
        cur.execute("SELECT key FROM temp WHERE publisher = %s",
                    [PUBLISHER])
        for record in cur:
            yield record[0]
def pagination_handler(resp):
    soup = BeautifulSoup(resp.text, "lxml")
    # the "next page" link in the ABC News archive listing
    next_el = soup.find("a", class_="next icon-")
    if not next_el:
        return
    url = "https://www.abc.net.au/news/archive/" + next_el.attrs["href"]
    with db_connection_cursor() as cur:  # cache
        execute_values(
            cur, "INSERT INTO temp (publisher, key) "
            "VALUES %s ON CONFLICT DO NOTHING", [(PUBLISHER, url)])
    return url
def archive_urls():
    # one ABC News archive page per day from 2015-01-01 to 2020-12-31
    date_start = datetime.datetime(2015, 1, 1)
    date_end = datetime.datetime(2020, 12, 31)
    for day in range((date_end - date_start).days + 1):
        date = date_start + datetime.timedelta(days=day)
        yield ("https://www.abc.net.au/news/archive/?date="
               + date.strftime("%Y-%m-%d"))
    # load cached pagination pages
    with db_connection_cursor() as cur:
        cur.execute("SELECT key FROM temp WHERE publisher = %s",
                    [PUBLISHER])
        for record in cur:
            yield record[0]
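# A sketch of how archive_urls() and pagination_handler() could be driven
# together, assuming the requests library; the driver itself is an
# assumption and not part of the listings above. Each archive day is
# fetched, then the cached "next" links are followed until
# pagination_handler() returns None.
import requests

session = requests.Session()
for url in archive_urls():
    while url:
        response = session.get(url)
        # the handler caches the next page URL and returns it,
        # or None on the last page
        url = pagination_handler(response)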
def generate_from_cache():  # -> db as a cache
    # expand cached (url, last_page_num) rows into per-page URLs;
    # raise LookupError if nothing has been cached yet
    cached = False
    with db_connection_cursor() as cur_r:
        cur_r.execute("SELECT key, val FROM temp WHERE publisher = %s",
                      [PUBLISHER])
        for record in cur_r:
            cached = True
            if int(record[1]) == 1:
                # single-page listing: the base URL is the only page
                yield record[0]
                continue
            for num in range(1, int(record[1]) + 1):
                yield f"{record[0]}/page/{num}"
    if not cached:
        raise LookupError
def get_page_num_for_category_months_and_then_cache(response_chunk):
    urls_and_last_page_nums = []
    for resp in response_chunk:
        try:
            soup = BeautifulSoup(resp.text, "lxml")
            navigation_el = soup.find("li", class_="navigation")
            # the last link in the pagination bar carries the page count;
            # catch AttributeError as well, since find() returns None when
            # the page has no pagination bar at all
            last_page_num = navigation_el.find_all("a")[-1].string
            urls_and_last_page_nums.append(
                (PUBLISHER, resp.url, last_page_num))
        except (AttributeError, IndexError):
            # no pagination bar, or one without links: a single-page listing
            urls_and_last_page_nums.append((PUBLISHER, resp.url, 1))
    with db_connection_cursor() as cur_w:
        execute_values(
            cur_w, "INSERT INTO temp (publisher, key, val) "
            "VALUES %s ON CONFLICT DO NOTHING", urls_and_last_page_nums)
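# The two functions above form a producer/consumer pair around the temp
# table: get_page_num_for_category_months_and_then_cache() stores the last
# page number per listing URL, and generate_from_cache() expands the cached
# rows back into page URLs. A sketch of the intended round trip, assuming
# the requests library and a hypothetical category-month URL:
import requests

try:
    page_urls = list(generate_from_cache())
except LookupError:
    # nothing cached yet: fetch the listing pages first, cache their
    # last page numbers, then read the cache again
    resp = requests.get("https://example.com/category/2020/01")  # hypothetical
    get_page_num_for_category_months_and_then_cache([resp])
    page_urls = list(generate_from_cache())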
def pagination_handler(resp):
    resp.encoding = resp.apparent_encoding
    soup = BeautifulSoup(resp.text, "lxml")
    nav_links = soup.select("#mainContent > a")
    if not nav_links:
        return
    if len(nav_links) == 2:
        # middle pages carry two links; the second one, "Siguiente >>"
        # (Spanish for "Next >>"), points to the next page
        url = (re.search("(.*)emol_noticias_.*", resp.url).group(1)
               + nav_links[1].attrs.get("href"))
    elif nav_links[0].string == "Siguiente >>":
        # first page: only a "next" link is present
        url = (re.search("(.*)emol_noticias_.*", resp.url).group(1)
               + nav_links[0].attrs.get("href"))
    else:
        # last page: only a "previous" link remains
        return
    with db_connection_cursor() as cur:  # cache
        execute_values(
            cur, "INSERT INTO temp (publisher, key) "
            "VALUES %s ON CONFLICT DO NOTHING", [(PUBLISHER, url)])
    return url