def run_job(job):
    connection_id = job["connectionId"]
    connection_url = job["connectionUrl"]
    scrape_url = job["scrapeUrl"]
    max_depth = job["maxDepth"]
    referer = job["referer"]

    if job_invalid(connection_id):
        return

    # Scrape and handle result
    if MOCK:
        n_images, n_links, links, title = mock_scrape(scrape_url)
    else:
        result = scraper.scrape_url(scrape_url)
        title = result.title
        n_images = result.n_images
        n_links = result.n_links
        links = result.local_links

    # Grab the most direct referer if there is one
    top_referer = referer[-1] if len(referer) > 0 else ""

    write_scrape_segment(connection_id, scrape_url, top_referer, title,
                         n_links, n_images)
    send_to_connection(
        connection_id, connection_url, {
            "url": scrape_url,
            "n_images": n_images,
            "n_links": n_links,
            "title": title
        })

    continue_recursion = max_depth > len(referer)

    # Ask dynamodb for the links that we have not already scraped
    unvisited_links = filter_visited_urls(connection_id, links)

    # Queue up jobs for links if we have not reached the end of the line
    if continue_recursion:
        new_referer = referer + [scrape_url]
        for link in unvisited_links:
            # Increase the remaining job counter since we queued a new job
            update_job_counter(connection_id, 1)
            queue_link_job(connection_id, connection_url, link, new_referer,
                           max_depth)

    # This job is done: reduce the remaining job counter.
    update_job_counter(connection_id, -1)
    # FIXME: Race condition here, should use the value read from the
    # atomic update. Can lead to missing that this was the last job
    # and thus missing to print structure.
    jobs_left = get_jobs_left(connection_id)
    if jobs_left <= 0:
        summary_and_exit(connection_id, connection_url)
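# A hedged sketch of the job payload that run_job() appears to expect, based
# solely on the keys it reads. The example values are made up for illustration
# and are not taken from the original code.
example_job = {
    "connectionId": "abc123",                   # connection to report results back to
    "connectionUrl": "https://example.com/@connections",
    "scrapeUrl": "https://example.com/start",   # page to scrape in this job
    "maxDepth": 2,                              # recurse while maxDepth > len(referer)
    "referer": [],                              # chain of URLs that led to scrapeUrl
}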
def index(url):
    global original_shifts_list
    global lowercase_shifts_list
    global shift_to_url
    global url_to_title

    if url in url_to_title:
        print("'{}' has already been indexed.".format(url))
        return 1

    with database_lock.gen_rlock():
        osl = copy.deepcopy(original_shifts_list)
        lsl = copy.deepcopy(lowercase_shifts_list)
        stu = copy.deepcopy(shift_to_url)
        utt = copy.deepcopy(url_to_title)

    print("Indexing " + url)

    # Get website text
    try:
        scraped_text, title = scrape_url(url)
    except Exception as e:
        print(e)
        return None

    # Circular shift it, get resulting associations
    shift_url_map, url_title_map = \
        circular_shift(scraped_text, url, osl, lsl, title)

    # Now need to resort the main lists
    osl.sort()
    lsl.sort()

    # Merge new shift/url map with existing map
    for shift in shift_url_map:
        if shift in stu:
            stu[shift] = stu[shift].union(shift_url_map[shift])
        else:
            stu[shift] = shift_url_map[shift]

    # Merge new url/title map with existing map
    utt.update(url_title_map)

    with database_lock.gen_wlock():
        original_shifts_list[:] = osl
        lowercase_shifts_list[:] = lsl
        shift_to_url.update(stu)
        url_to_title.update(utt)

    print("Index creation for " + url + " complete")
    return True
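# A hedged sketch of the module-level state that index() appears to rely on.
# The data structures are inferred from how index() uses them; the lock type
# is an assumption (the gen_rlock()/gen_wlock() calls match the API of the
# third-party "readerwriterlock" package, but the original declaration is not
# shown in this snippet).
import copy

from readerwriterlock import rwlock

original_shifts_list = []   # circular shifts, original casing, kept sorted
lowercase_shifts_list = []  # circular shifts lowercased, kept sorted
shift_to_url = {}           # shift -> set of URLs containing that shift
url_to_title = {}           # URL -> page title
database_lock = rwlock.RWLockFair()  # many concurrent readers, exclusive writer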
def run(driver, opposing, latest):
    try:
        current_url = driver.current_url
    except selenium.common.exceptions.NoSuchWindowException:
        driver.switch_to.window(driver.window_handles[-1])
        current_url = driver.current_url
    split_url = urlsplit(current_url)

    # check if the site is partisan and if we're not on the home page
    view = get_view(split_url.netloc)
    if view and len(split_url.path) > 1:
        print("Partisanship detected. Finding new article...")
        if opposing:
            if view == "left":
                view = "right"
            elif view == "right":
                view = "left"
        else:
            view = "centrist"
        title = get_title(current_url)
        url_to_scrape = get_url_to_scrape(title, view=view, latest=latest)
        centrist_url = scrape_url(url_to_scrape)
        driver.execute_script("window.open('');")
        driver.switch_to.window(driver.window_handles[-1])
        driver.get(centrist_url)
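# A hedged usage sketch for run(). The imports shown are the ones run() itself
# appears to need (urlsplit and the Selenium exception type); the browser
# choice and the single invocation below are illustrative assumptions, not part
# of the original code.
from urllib.parse import urlsplit

import selenium.common.exceptions
from selenium import webdriver

driver = webdriver.Firefox()
driver.get("https://example.com/some-article")  # hypothetical partisan article
run(driver, opposing=True, latest=False)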
def scrape_url(ticker):
    text = scraper.scrape_url(ticker, 'http://wilsoninformatics.com')
    return 'Paragraph: %s' % text
import argparse

from scraper import get_url_to_scrape, scrape_url
from title import get_title

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", type=str, default="",
                        help="URL of partisan article")
    args = parser.parse_args()

    current_url = args.url
    print("URL scraped:")
    print(current_url)

    article_title = get_title(current_url)
    print("\nTitle:")
    print(article_title)

    url_to_scrape = get_url_to_scrape(article_title)
    print("\nAllSides URL:")
    print(url_to_scrape)

    centrist_url = scrape_url(url_to_scrape)
    print("\nCentrist URL:")
    print(centrist_url)
from contextlib import closing
from json import dump
from urllib.parse import quote_plus

from scraper import scrape_url

base_url = 'https://za.pycon.org/talks/'

talks = scrape_url(base_url)
with closing(open('talk_details.json', 'w')) as f:
    dump(talks, f, indent=4)
def goto_scrape():
    scraper.scrape_url()