async def _hydrate_codex(clean_file="research/data/cleaned.json",
                         url_timeout=10,
                         max_consecutive_exceptions=10):
    prush("Loading cleaned data...")
    with open(os.path.join(os.getcwd(), clean_file), 'r') as f:
        reports = json.load(f)
    prush("Counting distinct URLs to process...")
    sem = asyncio.Semaphore(CODEX_SEMAPHORE_LIMIT)
    urlset = set()
    pending = []
    for report in reports:
        for url_info in report["urls"]:
            urlset.add(url_info["hash"])
            pending.append(_fetch(url_info, sem))
    url_count_distinct = len(urlset)
    url_count = len(pending)
    prush("Distinct URL Count:", url_count_distinct)
    urlset = None  # Free the set; only the count is needed from here on.
    # Shuffle the pending list to try to increase the distance between calls
    # to each domain. This helps avoid having a target think this is a DoS
    # attack. Even though we process the URLs asynchronously, shuffling helps
    # further distribute the domains.
    random.shuffle(pending)
    url_count_processed = 0
    success_count = 0
    while len(pending) > 0:
        done, pending = await asyncio.wait(pending,
                                           return_when=asyncio.FIRST_COMPLETED)
        for d in done:
            # Exceptions should all be caught within the _fetch function. If
            # not, we unwind and get the hell out of dodge.
            if d.exception() is not None:
                for p in pending:
                    p.cancel()
                raise d.exception()
            result = d.result()
            url_count_processed = url_count_processed + 1
            pct = round(url_count_processed / url_count * 100, 2)
            prush("{} - {}/{} ({}%): {} {} {}".format(
                datetime.now(), url_count_processed, url_count, pct,
                result["hash"], result["msg"], result["url"]))
            if result["successful"]:
                success_count = success_count + 1
    prush("Done. {}/{} successfully processed.".format(success_count, url_count))

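# _fetch is defined elsewhere in this module; _hydrate_codex only relies on it
# honoring the semaphore, catching its own errors, and returning a dict with
# "hash", "url", "msg", and "successful" keys. The sketch below illustrates
# that assumed contract only (the name _fetch_sketch is hypothetical, as is
# the assumption that url_info carries a "url" field alongside "hash"); it is
# not the project's actual implementation.
async def _fetch_sketch(url_info, sem):
    import urllib.request
    async with sem:  # Bound concurrency to CODEX_SEMAPHORE_LIMIT.
        loop = asyncio.get_event_loop()
        try:
            # Run the blocking fetch in the default thread pool so the event
            # loop stays responsive.
            body = await loop.run_in_executor(
                None,
                lambda: urllib.request.urlopen(url_info["url"], timeout=10).read())
            return {"hash": url_info["hash"], "url": url_info["url"],
                    "msg": "fetched {} bytes".format(len(body)),
                    "successful": True}
        except Exception as e:
            # Never let an exception escape; _hydrate_codex treats any raised
            # exception as fatal and cancels the remaining work.
            return {"hash": url_info["hash"], "url": url_info["url"],
                    "msg": str(e), "successful": False}
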
def build_codex(doc_count=1000,
                subjects_dir="research/data/arbitrary",
                destination_dir="research/data/arbitrary/codex"):
    """Builds a set of documents by collecting arbitrary content from the web
    based on a provided list of subjects.
    """
    # Number of documents to try to retrieve for each subject-batch.
    SUBJECT_BATCH_SIZE = 10
    # Number of pages of results to request per engine per batch.
    SEARCH_PAGES = 5
    # The minimum length (characters) for a document to be eligible for the codex.
    MIN_DOC_LENGTH = 200
    # Minimum time (seconds) before the same engine may be used again.
    ENGINE_COOLDOWN_TIME = 5

    subjects_file = os.path.join(os.getcwd(), subjects_dir,
                                 "usnews_subjects.json")
    with open(subjects_file, 'r') as f:
        subjects = json.load(f)
    engine_times = dict()

    def _search():
        time_since_last_use = 0
        prush("Selecting an engine...")
        engine_name = ""
        while True:
            engine = random.choice(search_engines)()
            engine_name = engine.__class__.__name__
            if engine_name not in engine_times:
                break
            time_since_last_use = (datetime.now() -
                                   engine_times[engine_name]).total_seconds()
            if time_since_last_use < ENGINE_COOLDOWN_TIME:
                prush("Engine '{}' used too recently. Trying another...".format(
                    engine_name))
            else:
                break
        engine.set_headers({'User-Agent': get_random_user_agent()})
        # Internally interpreted as sleep(random_uniform(*self._delay)).
        # This value is set low (or zero) since we pause between uses of each
        # engine (above).
        engine._delay = (0, 0)
        subject = random.choice(subjects) + " news"
        prush("Searching for subject '{}'...".format(subject))
        search_results = engine.search(subject, pages=SEARCH_PAGES).links()
        engine_times[engine_name] = datetime.now()
        prush("Found {} results for subject '{}'.".format(
            len(search_results), subject))
        return search_results

    success_count = 0
    search_results = _search()
    while success_count < doc_count:
        if success_count % 10 == 0:
            prush("\n{}: {} docs processed. {}% complete.\n".format(
                datetime.now(), success_count,
                100 * round(success_count / doc_count, 2)))
        if success_count % SUBJECT_BATCH_SIZE == 0 and success_count != 0:
            search_results = _search()
        # We try to maintain a buffer above the minimum number of results
        # required so we 1) can choose some results at random (not just take
        # all results) and 2) can account for the fact that some of the links
        # will not return 200.
        if len(search_results) < SUBJECT_BATCH_SIZE * 2:
            prush("Not enough results for subject. Trying another...")
            search_results = _search()
            continue
        success = False
        while not success:
            if len(search_results) == 0:
                prush("Exhausted search results for this subject. Trying another...")
                break
            random.seed()
            search_result = random.choice(search_results)
            search_results.remove(search_result)
            prush("Accessing {}...".format(search_result))
            if "youtube.com" in search_result:
                prush(" Appears to be a YouTube result. Trying another...")
                continue
            if search_result[-3:] == "pdf":
                prush(" Appears to be a PDF. Trying another...")
                continue
            file_name = os.path.join(os.getcwd(), destination_dir,
                                     hash(search_result) + ".txt")
            if os.path.exists(file_name):
                prush(" URL previously ingested. Trying another...")
                continue
            try:
                with time_limit(REQUEST_TIMEOUT):
                    response = urllib.request.urlopen(search_result)
                    raw_document = bytes(doc.Doc(response.read()).clean, 'utf-8')
                    document = raw_document.decode("utf-8", "strict")
            except Exception as e:
                prush(" Error. {}\n Trying another...".format(e))
                continue
            if len(document) < MIN_DOC_LENGTH:
                prush(" Document too short. Trying another...")
                continue
            with open(file_name, 'w') as f:
                f.write(document)
            prush(" Success! Written to {}".format(hash(search_result) + ".txt"))
            success = True
            success_count = success_count + 1
    prush(datetime.now(), "Done")

def scrape(raw_directory="research/data/raw_html", num_pages=165):
    """Scrapes raw HTML BLM data for each page at
    https://elephrame.com/textbook/BLM/chart and saves the HTML for each page
    to a local directory.

    Parameters
    ----------
    raw_directory : str
        The directory into which the raw html files will be saved.
    num_pages : int
        The number of pages to scrape.

    Raises
    ------
    Exception
        The max number of consecutive failures (MAX_FAILURES) has been
        reached.

    Notes
    -----
    Since the main table in the scraped page loads asynchronously, it can be
    difficult to know that the content has been fully rendered prior to
    automating the click event to load the next page of the table. The
    function tries to cope with this by checking that the page content has
    changed since the last time it loaded, and by dynamically adjusting a wait
    timer to give the components sufficient time to load before processing.
    This approach is imperfect, but seems to be minimally sufficient to meet
    current needs.

    Since the waiting mechanisms currently employed here are fairly crude, the
    function may accidentally skip some pages as it moves along. This usually
    occurs if the function tries to submit multiple click events too quickly,
    in which case the function thinks it had time to process a page that it
    never actually processed. The function will recover from this and move on
    to the next page cleanly, but nothing is done to try to recover the
    skipped page.

    This function currently runs Chrome as the driver, and does not run
    headless. The function is fragile to the size of the browser window that
    is opened. Changing the browser window can cause the DOM to restructure,
    which can cause the XPATH assumptions to become invalid. Not worth fixing
    at the moment, but be warned.
    """
    page_num = 1
    previous_num = 0
    wait = MINIMUM_LOAD_WAIT_SECS
    consecutive_successes = 0
    consecutive_failures = 0
    previous_hash = utils.hash("")
    cwd = os.getcwd()
    driver = webdriver.Chrome()
    driver.get(BASE_URL)
    while page_num < num_pages:
        try:
            time.sleep(wait)
            page_input = driver.find_elements_by_xpath(PAGE_XPATH)[0]
            page_num = int(page_input.get_attribute("value"))
            if page_num == previous_num:
                raise Exception("Page number hasn't changed yet.")
            html_source = driver.page_source
            current_hash = utils.hash(html_source)
            if current_hash == previous_hash:
                raise Exception("Page content hasn't changed yet.")
            previous_hash = current_hash
            filename = "{}/{}/page_{}.html".format(cwd, raw_directory, page_num)
            prush("writing", filename)
            with open(filename, "w") as f:
                f.write(html_source)
            driver.find_element_by_xpath(NEXT_XPATH).click()
            consecutive_successes = consecutive_successes + 1
            consecutive_failures = 0
            if (consecutive_successes >= SUCCESS_DOWNGRADE_INTERVAL
                    and wait - WAIT_DECREMENT_INTEVAL >= MINIMUM_LOAD_WAIT_SECS):
                consecutive_successes = 0
                wait = wait - WAIT_DECREMENT_INTEVAL
                prush("Decrementing wait to", wait)
        except Exception as e:
            prush(page_num, e)
            consecutive_failures = consecutive_failures + 1
            consecutive_successes = 0
            if consecutive_failures >= MAX_FAILURES:
                raise Exception("Reached max number of consecutive failures.")
            wait = wait + WAIT_INCREMENT_INTEVAL
            prush("Incrementing wait to", wait)
    driver.quit()

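# utils.hash is assumed to be a deterministic content hash that returns a hex
# string, so identical page sources (and, in build_codex, identical URLs) map
# to the same value across runs. Something along these lines would satisfy
# that use; an illustrative sketch only (hypothetical name), not the project's
# actual helper.
def _hash_sketch(text):
    import hashlib
    return hashlib.sha1(text.encode("utf-8")).hexdigest()
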
def hydrate_codex():
    prush("Initializing...")
    loop = asyncio.get_event_loop()
    loop.run_until_complete(_hydrate_codex())
    loop.close()

def extract(raw_directory="research/data/raw_html"):
    """Extract the BLM data from raw HTML files in a directory and save the
    resulting data to disk.

    Parameters
    ----------
    raw_directory : str
        The directory to find raw html files to be extracted.
    """
    results = []
    raw_dir = os.path.join(os.getcwd(), raw_directory)
    prush("Getting list of raw html files...")
    file_names = [
        f for f in os.listdir(raw_dir)
        if os.path.isfile(os.path.join(raw_dir, f)) and f.endswith(".html")
    ]
    prush("Extracting data from raw html...")
    for file_name in file_names:
        with open(os.path.join(raw_dir, file_name), "r") as f:
            tree = html.fromstring(f.read())
        items_list_div = tree.xpath('//div[@class="item chart"]')
        for item_div in items_list_div:
            results.append({
                "location":
                utils.first(
                    item_div.xpath(
                        'div/div[@class="item-protest-location"]/text()')),
                "start":
                utils.first(
                    item_div.xpath(
                        'div/div/div[@class="protest-start"]/text()')),
                "end":
                utils.first(
                    item_div.xpath(
                        'div/div/div[@class="protest-end"]/text()')),
                "subject":
                utils.first(
                    item_div.xpath(
                        'div/ul/li[@class="item-protest-subject"]/text()')),
                "participants":
                utils.first(
                    item_div.xpath(
                        'div/ul/li[@class="item-protest-participants"]/text()')),
                "time":
                utils.first(
                    item_div.xpath(
                        'div/ul/li[@class="item-protest-time"]/text()')),
                "description":
                utils.first(
                    item_div.xpath(
                        'div/ul/li[@class="item-protest-description"]/text()')),
                "urls":
                item_div.xpath(
                    'div/ul/li[@class="item-protest-url"]/p/a/text()'),
            })
    prush("Writing to {}...".format(EXTRACT_FILE_NAME))
    with open(os.path.join(raw_dir, EXTRACT_FILE_NAME), "w") as f:
        f.write(json.dumps(results, indent=2))
    prush("Done.")

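# utils.first is assumed to return the first element of an XPath result list,
# or None when the list is empty (not every item div contains every field).
# An illustrative sketch of that assumed behavior (hypothetical name):
def _first_sketch(items):
    return items[0] if items else None
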