Example #1
    def _search():
        time_since_last_use = 0
        prush("Selecting an engine...")
        engine_name = ""
        while True:
            engine = random.choice(search_engines)()
            engine_name = engine.__class__.__name__
            if engine_name not in engine_times:
                break
            time_since_last_use = (datetime.now() -
                                   engine_times[engine_name]).total_seconds()
            if time_since_last_use < ENGINE_COOLDOWN_TIME:
                prush(
                    "Engine '{}' used too recently. Trying another...".format(
                        engine_name))
            else:
                break

        engine.set_headers({'User-Agent': get_random_user_agent()})
        # Internally interpreted as sleep(random_uniform(*self._delay)).
        # This value is set low (or zero) because we already pause between
        # uses of each engine (above).
        engine._delay = (0, 0)
        subject = random.choice(subjects) + " news"
        prush("Searching for subject '{}'...".format(subject))
        search_results = engine.search(subject, pages=SEARCH_PAGES).links()
        engine_times[engine_name] = datetime.now()
        prush("Found {} results for subject '{}'.".format(
            len(search_results), subject))
        return search_results
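
All of the examples on this page report progress through a prush helper that is not shown. The name and usage suggest a simple print-and-flush wrapper; a minimal sketch under that assumption (not the project's actual implementation):

import sys

def prush(*args):
    # Assumed helper: print and flush stdout so progress messages appear
    # immediately during long-running scrapes.
    print(*args)
    sys.stdout.flush()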
Example #2
async def _hydrate_codex(clean_file="research/data/cleaned.json",
                         url_timeout=10,
                         max_consecutive_exceptions=10):
    prush("Loading cleaned data...")
    with open(os.path.join(os.getcwd(), clean_file), 'r') as f:
        reports = json.load(f)

    prush("Counting distinct URLs to process...")
    sem = asyncio.Semaphore(CODEX_SEMAPHORE_LIMIT)
    urlset = set()
    pending = []
    for report in reports:
        for url_info in report["urls"]:
            urlset.add(url_info["hash"])
            pending.append(_fetch(url_info, sem))
    url_count_distinct = len(urlset)
    url_count = len(pending)
    prush("Distinct URL Count:", url_count_distinct)
    urlset = None

    # Shuffle the pending list to try to increase the distance between calls
    # to each domain. This helps avoid having a target think this is a DoS
    # attack. Even though we process the URLs asynchronously, shuffling helps
    # further distribute the domains.
    random.shuffle(pending)
    url_count_processed = 0
    success_count = 0
    while len(pending) > 0:
        done, pending = await asyncio.wait(pending,
                                           return_when=asyncio.FIRST_COMPLETED)
        for d in done:
            # Exceptions should all be caught within the _fetch function. If
            # not, we unwind and get the hell out of dodge.
            if d.exception() is not None:
                for p in pending:
                    p.cancel()
                raise d.exception()

            result = d.result()
            url_count_processed = url_count_processed + 1
            pct = round(url_count_processed / url_count * 100, 2)
            prush("{} - {}/{} ({}%): {} {} {}".format(
                datetime.now(), url_count_processed, url_count, pct,
                result["hash"], result["msg"], result["url"]))
            if result["successful"]:
                success_count = success_count + 1
    prush("Done. {}/{} successully processed.".format(success_count,
                                                      url_count))
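
_hydrate_codex awaits a _fetch coroutine that the example does not include. Judging from how its results are consumed above, it takes a URL record plus the shared semaphore and always returns a result dict instead of raising; a rough sketch of that shape, with aiohttp as an assumed HTTP client:

import aiohttp

async def _fetch(url_info, sem):
    # Hypothetical sketch: limit concurrency with the shared semaphore, fetch
    # the URL, and fold any failure into the returned dict so the caller's
    # unexpected-exception path stays rare.
    async with sem:
        try:
            timeout = aiohttp.ClientTimeout(total=10)
            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.get(url_info["url"]) as resp:
                    body = await resp.text()
                    # ... persist `body` keyed by url_info["hash"] here ...
                    return {"hash": url_info["hash"],
                            "url": url_info["url"],
                            "msg": "HTTP {}".format(resp.status),
                            "successful": resp.status == 200}
        except Exception as e:
            return {"hash": url_info["hash"],
                    "url": url_info["url"],
                    "msg": str(e),
                    "successful": False}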
Example #3
def build_codex(doc_count=1000,
                subjects_dir="research/data/arbitrary",
                destination_dir="research/data/arbitrary/codex"):
    """Builds a set of documents by collecting arbitrary content from the web
    based on a provided list of subjects.
    """
    # Number of documents to try to retrieve for each subject batch.
    SUBJECT_BATCH_SIZE = 10
    # Number of pages of results to request per engine per batch.
    SEARCH_PAGES = 5
    # The minimum length (in characters) for a document to be eligible for the codex.
    MIN_DOC_LENGTH = 200
    # Minimum number of seconds to wait before reusing the same search engine.
    ENGINE_COOLDOWN_TIME = 5

    subjects_file = os.path.join(os.getcwd(), subjects_dir,
                                 "usnews_subjects.json")
    with open(subjects_file, 'r') as f:
        subjects = json.load(f)

    engine_times = dict()

    def _search():
        time_since_last_use = 0
        prush("Selecting an engine...")
        engine_name = ""
        while True:
            engine = random.choice(search_engines)()
            engine_name = engine.__class__.__name__
            if engine_name not in engine_times:
                break
            time_since_last_use = (datetime.now() -
                                   engine_times[engine_name]).total_seconds()
            if time_since_last_use < ENGINE_COOLDOWN_TIME:
                prush(
                    "Engine '{}' used too recently. Trying another...".format(
                        engine_name))
            else:
                break

        engine.set_headers({'User-Agent': get_random_user_agent()})
        # Internally interpreted as sleep(random_uniform(*self._delay)).
        # This value is set low (or zero) because we already pause between
        # uses of each engine (above).
        engine._delay = (0, 0)
        subject = random.choice(subjects) + " news"
        prush("Searching for subject '{}'...".format(subject))
        search_results = engine.search(subject, pages=SEARCH_PAGES).links()
        engine_times[engine_name] = datetime.now()
        prush("Found {} results for subject '{}'.".format(
            len(search_results), subject))
        return search_results

    success_count = 0
    search_results = _search()
    while success_count < doc_count:
        if success_count % 10 == 0:
            prush("\n{}: {} docs processed. {}% complete.\n".format(
                datetime.now(), success_count,
                100 * round(success_count / doc_count, 2)))
        if success_count % SUBJECT_BATCH_SIZE == 0 and success_count != 0:
            search_results = _search()
        # We try to maintain a buffer above the minimum number of results required
        # so we 1) can choose some results at random (not just take all results) and
        # 2) can account for the fact that some of the links will not return 200.
        if len(search_results) < SUBJECT_BATCH_SIZE * 2:
            prush("Not enough results for subject. Trying another...")
            search_results = _search()
            continue
        success = False
        while not success:
            if len(search_results) == 0:
                prush(
                    "Exhausted search results for this subject. Trying another..."
                )
                break
            random.seed()
            search_result = random.choice(search_results)
            search_results.remove(search_result)
            prush("Accessing {}...".format(search_result))
            if "youtube.com" in search_result:
                prush("  Appears to be a YouTube result. Trying another...")
                continue
            if search_result.endswith(".pdf"):
                prush("  Appears to be a PDF. Trying another...")
                continue
            file_name = os.path.join(os.getcwd(), destination_dir,
                                     str(hash(search_result)) + ".txt")
            if os.path.exists(file_name):
                prush("  URL previously ingested. Trying another...")
                continue
            try:
                with time_limit(REQUEST_TIMEOUT):
                    response = urllib.request.urlopen(search_result)
                raw_document = bytes(doc.Doc(response.read()).clean, 'utf-8')
                document = raw_document.decode("utf-8", "strict")
            except Exception as e:
                prush("  Error. {}\n  Trying another...".format(e))
                continue
            if len(document) < MIN_DOC_LENGTH:
                prush(" Document too short. Trying another...")
                continue
            with open(file_name, 'w') as f:
                f.write(document)
            prush("  Success! Written to {}".format(
                hash(search_result) + ".txt"))
            success = True
            success_count = success_count + 1
    prush(datetime.now(), "Done")
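
build_codex wraps urlopen in a time_limit context manager that is not shown. A common way to implement one is with SIGALRM (Unix only); a sketch under that assumption, not necessarily the project's version:

import signal
from contextlib import contextmanager

@contextmanager
def time_limit(seconds):
    # Hypothetical sketch (Unix only): raise if the wrapped block runs longer
    # than `seconds`, so a hung urlopen cannot stall the whole loop.
    def _handler(signum, frame):
        raise TimeoutError("Timed out after {} seconds".format(seconds))
    previous = signal.signal(signal.SIGALRM, _handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)
        signal.signal(signal.SIGALRM, previous)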
Example #4
def scrape(raw_directory="research/data/raw_html", num_pages=165):
    """Scrapes raw HTML BLM data for each page at https://elephrame.com/textbook/BLM/chart
    and saves the HTML for each page to a local directory.

    Parameters
    ----------
    raw_directory : str
        The directory into which the raw html files will be saved.

    num_pages : int
        The number of pages to scrape.

    Raises
    ------
    Exception
        The max number of consecutive failures (MAX_FAILURES) has been reached.

    Notes
    -----
    Since the main table in the scraped page loads asynchronously, it can be
    difficult to know that the content has been fully rendered prior to automating
    the click event that loads the next page of the table. The function tries
    to cope with this by checking that the page content has changed since the
    last time it loaded, and by dynamically adjusting a wait timer to give the
    components sufficient time to load before processing. This approach
    is imperfect, but seems to be minimally sufficient to meet current needs.

    Since the waiting mechanisms currently employed here are fairly crude, the
    function may accidentally skip some pages as it moves along. This usually
    occurs when the function submits multiple click events too quickly, in
    which case it thinks it has had time to process a page that it never
    actually processed. The function will recover from this and move on to the
    next page cleanly, but nothing is done to try to recover the skipped page.

    This function currently uses Chrome as the driver and does not run
    headless. The function is fragile to the size of the browser window that
    is opened: resizing the window can cause the DOM to restructure, which can
    invalidate the XPath assumptions. Not worth fixing at the moment, but be
    warned.
    """

    page_num = 1
    previous_num = 0
    wait = MINIMUM_LOAD_WAIT_SECS
    consecutive_successes = 0
    consecutive_failures = 0
    previous_hash = hash("")
    cwd = os.getcwd()

    driver = webdriver.Chrome()
    driver.get(BASE_URL)
    while page_num < num_pages:
        try:
            time.sleep(wait)

            page_input = driver.find_elements_by_xpath(PAGE_XPATH)[0]
            page_num = int(page_input.get_attribute("value"))
            if page_num == previous_num:
                raise Exception("Page number hasn't changed yet.")

            html_source = driver.page_source
            current_hash = utils.hash(html_source)
            if current_hash == previous_hash:
                raise Exception("Page content hasn't changed yet.")
            previous_hash = current_hash
            previous_num = page_num

            filename = "{}/{}/page_{}.html".format(cwd, raw_directory,
                                                   page_num)
            prush("writing", filename)
            with open(filename, "w") as f:
                f.write(html_source)

            driver.find_element_by_xpath(NEXT_XPATH).click()

            consecutive_successes = consecutive_successes + 1
            consecutive_failures = 0
            if (consecutive_successes >= SUCCESS_DOWNGRADE_INTERVAL
                    and wait - WAIT_DECREMENT_INTEVAL >= MINIMUM_LOAD_WAIT_SECS):
                consecutive_successes = 0
                wait = wait - WAIT_DECREMENT_INTEVAL
                prush("Decrementing wait to", wait)
        except Exception as e:
            prush(page_num, e)
            consecutive_failures = consecutive_failures + 1
            consecutive_successes = 0
            if consecutive_failures >= MAX_FAILURES:
                raise Exception("Reached max number of consecutive failures.")
            wait = wait + WAIT_INCREMENT_INTEVAL
            prush("Incrementing wait to", wait)
    driver.quit()
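
scrape relies on several module-level constants that the example omits. The values below are only placeholders to make the tuning knobs concrete: BASE_URL comes from the docstring, the numeric values are illustrative guesses, and the two XPaths are site-specific and left elided.

BASE_URL = "https://elephrame.com/textbook/BLM/chart"
MINIMUM_LOAD_WAIT_SECS = 3       # illustrative: never wait less than this between pages
WAIT_INCREMENT_INTEVAL = 2       # illustrative: back off by this much after a failure
WAIT_DECREMENT_INTEVAL = 1       # illustrative: speed up by this much after a success streak
SUCCESS_DOWNGRADE_INTERVAL = 5   # illustrative: successes in a row before speeding up
MAX_FAILURES = 10                # illustrative: consecutive failures before giving up
PAGE_XPATH = "..."               # site-specific XPath of the page-number input (left elided)
NEXT_XPATH = "..."               # site-specific XPath of the "next page" button (left elided)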
Example #5
def hydrate_codex():
    prush("Initializing...")
    loop = asyncio.get_event_loop()
    loop.run_until_complete(_hydrate_codex())
    loop.close()
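
On Python 3.7+ the same wrapper can be written with asyncio.run, which creates, runs, and closes the event loop in one call:

def hydrate_codex():
    prush("Initializing...")
    asyncio.run(_hydrate_codex())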
Example #6
def extract(raw_directory="research/data/raw_html"):
    """Extract the BLM data from raw HTML files in a directory and save the
    resulting data to disk.

    Parameters
    ----------
    raw_directory : str
        The directory to find raw html files to be extracted.
    """

    results = []
    raw_dir = os.path.join(os.getcwd(), raw_directory)
    prush("Getting list of raw html files...")
    file_names = [
        f for f in os.listdir(raw_dir)
        if os.path.isfile(os.path.join(raw_dir, f)) and f.endswith(".html")
    ]
    prush("Extracting data from raw html...")
    for file_name in file_names:
        with open(os.path.join(raw_dir, file_name), "r") as f:
            tree = html.fromstring(f.read())
        items_list_div = tree.xpath('//div[@class="item chart"]')
        for item_div in items_list_div:
            results.append({
                "location":
                utils.first(
                    item_div.xpath(
                        'div/div[@class="item-protest-location"]/text()')),
                "start":
                utils.first(
                    item_div.xpath(
                        'div/div/div[@class="protest-start"]/text()')),
                "end":
                utils.first(
                    item_div.xpath(
                        'div/div/div[@class="protest-end"]/text()')),
                "subject":
                utils.first(
                    item_div.xpath(
                        'div/ul/li[@class="item-protest-subject"]/text()')),
                "participants":
                utils.first(
                    item_div.xpath(
                        'div/ul/li[@class="item-protest-participants"]/text()')
                ),
                "time":
                utils.first(
                    item_div.xpath(
                        'div/ul/li[@class="item-protest-time"]/text()')),
                "description":
                utils.first(
                    item_div.xpath(
                        'div/ul/li[@class="item-protest-description"]/text()')
                ),
                "urls":
                item_div.xpath(
                    'div/ul/li[@class="item-protest-url"]/p/a/text()'),
            })
    prush("Writing to {}...".format(EXTRACT_FILE_NAME))
    with open(os.path.join(raw_dir, EXTRACT_FILE_NAME), "w") as f:
        f.write(json.dumps(results, indent=2))
    prush("Done.")