import os
import urllib.request

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

from util import folder_check

# `driver` is a Selenium WebDriver (Selenium 3 API) assumed to be created elsewhere
# and already pointed at the Beige Book archive page; `beige` collects scraped
# page text keyed by year.
beige = {}


def getLinksForYear(year, data_dir):
    url = driver.find_element_by_link_text(str(year)).get_attribute('href')
    #go to url for that year
    driver.get(url)

    #some years (2003-2008) have no PDF for their Beige Books, so scrape the HTML pages instead
    if 2002 < year < 2009:
        b = driver.find_elements_by_partial_link_text('Beige Book')
        beige_books = [i.get_attribute('href') for i in b]

        for u in beige_books:
            res = requests.get(u)
            html_page = res.content

            soup = BeautifulSoup(html_page, 'html.parser')

            text = soup.find_all(text=True)

            for t in text:
                beige[t] = year

    #find all the PDF links by looking for anchors whose link text contains 'PDF'
    d = driver.find_elements_by_partial_link_text('PDF')
    docuList = [i.get_attribute('href') for i in d]

    #for each PDF link found, download it into this year's folder
    for link in docuList:
        ind = link.rfind('/')
        foldername = os.path.join(data_dir, str(year))
        folder_check(foldername)
        filename = os.path.join(foldername, link[ind + 1:])
        download_file(link, filename)


#download a pdf file from its url and save it to filename
def download_file(download_url, filename):
    print(filename)
    response = urllib.request.urlopen(download_url)
    with open(filename, 'wb') as file:
        file.write(response.read())


if __name__ == '__main__':
    #take user input for which years they want to scrape from
    val = input(
        "Enter how many years you want to go back from 2013, with a minimum of 1 and a maximum of 27:\t"
    )
    years = int(val)

    load_dotenv('.env')
    data_dir, pdf_dir = os.environ.get("data_dir", "data"), os.environ.get(
        "pdf_dir", "pdfs")
    data_dir = os.path.join(data_dir, pdf_dir)  # Directory for all documents
    folder_check(data_dir)

    for y in range((2014 - years), 2014)[::-1]:
        getLinksForYear(y, data_dir)
        driver.back()
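

# The scraper above leans on a small folder_check helper imported from the
# project's util module. Its implementation is not shown here; a minimal sketch
# of what it presumably does (create the directory if it does not already exist):
import os

def folder_check(path):
    # make the directory (and any missing parents), doing nothing if it already exists
    os.makedirs(path, exist_ok=True)

# Example: folder_check(os.path.join("data", "pdfs", "2013")) ensures the
# per-year download folder exists before download_file writes into it.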
Example #3
import os
import pickle
import re
import string

from dotenv import load_dotenv

from util import folder_check

# stop_words, split_dir and pdfToList are project helpers/globals that are not
# part of this excerpt (stop_words is presumably an nltk stopword set). The
# original header of the function below was also cut off, so its name here is a
# placeholder.
def clean_tokens(tokens):
    tokens = [w.lower() for w in tokens]  # lower case
    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    words = [w for w in words if w not in stop_words]
    return words

def text_cleaner_basic(filename, text_list):
    # keep only pages that contain a reasonable amount of real text and skip table-of-contents pages
    return [
        page for page in text_list
        if len(re.split(r'\W+', page)) >= 50 and "TABLE OF CONTENTS" not in page
    ]

if __name__ == "__main__":
    load_dotenv(".env")
    data_base_dir = os.environ.get("data_dir", "data")
    pdf_dir = os.environ.get("pdf_dir", "pdfs")
    pkl_dir = os.environ.get("pkl_dir", "pkl")
    ld, lp = len(split_dir(data_base_dir)), len(split_dir(pkl_dir))
    data_dir = os.path.join(data_base_dir, pdf_dir)
    pkl_dir = os.path.join(data_base_dir, pkl_dir)
    folder_check(pkl_dir)
    for file_tuple in os.walk(data_dir):
        if not file_tuple[2] or file_tuple[0] == data_dir:
            continue
        curr_dir = file_tuple[0]
        dir_parts = split_dir(curr_dir)
        pkl_file_base = os.path.join(pkl_dir, os.path.join(*dir_parts[ld+lp:]))
        folder_check(pkl_file_base)
        for filename in file_tuple[2]:
            print(f"Parsing {filename}")
            text = pdfToList(os.path.join(curr_dir, filename))
            pkl_name = os.path.join(pkl_file_base, filename.rsplit(".", 1)[0] + ".pkl")
            pickle.dump(text, open(pkl_name, "wb+"))
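

# pdfToList and split_dir are project helpers whose implementations are not part
# of this excerpt. A minimal sketch of plausible versions, assuming PyPDF2 for
# the PDF text extraction (the real helper may use a different PDF library):
import os
from PyPDF2 import PdfReader

def pdfToList(path):
    # return the text of each page of the PDF as one string per page
    reader = PdfReader(path)
    return [page.extract_text() or "" for page in reader.pages]

def split_dir(path):
    # split a path like "data/pdfs/2013" into its components ["data", "pdfs", "2013"]
    return [part for part in os.path.normpath(path).split(os.sep) if part]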
from nltk import bigrams, trigrams

import pickle
import os
from dotenv import load_dotenv
load_dotenv(".env")
data_dir = os.environ.get("data_dir", "data")
pkl_dir = os.environ.get("pkl_dir", "pkl")

from util import folder_check

if __name__ == "__main__":
    for path, folders, files in os.walk(os.path.join(data_dir, pkl_dir)):
        if not files or (len(files) < 2 and files[0][0] == "."):
            continue
        print(path)
        folder_check(os.path.join(path, "bigram"))
        folder_check(os.path.join(path, "trigram"))
        for file in files:
            fn = file.split(".")[0]
            pages = pickle.load(open(os.path.join(path, file), "rb+"))
            # bigrams
            fn_bigrams = [[g for g in bigrams(page)] for page in pages]
            bigrams_file = os.path.join(path, "bigram", f"{fn}_bigrams.pkl")
            pickle.dump(fn_bigrams, open(bigrams_file, "wb+"))
            # trigrams
            fn_trigrams = [[g for g in trigrams(page)] for page in pages]
            trigrams_file = os.path.join(path, "trigram", f"{fn}_trigrams.pkl")
            pickle.dump(fn_trigrams, open(trigrams_file, "wb+"))
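

# For reference, a quick sanity check of what nltk's bigrams/trigrams produce for
# a tokenized page (hypothetical tokens, just to illustrate the shapes pickled above):
from nltk import bigrams, trigrams

sample = ["rates", "remained", "largely", "unchanged"]
print(list(bigrams(sample)))   # [('rates', 'remained'), ('remained', 'largely'), ('largely', 'unchanged')]
print(list(trigrams(sample)))  # [('rates', 'remained', 'largely'), ('remained', 'largely', 'unchanged')]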
        
# Word-frequency analysis (the top of this script is not shown in the excerpt;
# word_freq_pdf is a project helper that returns word frequencies for a list of
# documents, and the main-block header below is assumed).
import os
import pickle

from dotenv import load_dotenv

from util import folder_check

if __name__ == "__main__":
    load_dotenv('.env')
    data_dir, pkl_dir = os.environ.get("data_dir", "data"), os.environ.get(
        "pkl_dir", "pkl")
    pkl_dir = os.path.join(data_dir, pkl_dir)
    big_word_bank = []
    year_data = {}
    for file_tuple in os.walk(pkl_dir):
        if not file_tuple[2] or file_tuple[0] == pkl_dir:
            continue
        year = int(os.path.split(file_tuple[0])[-1])
        print(f"Getting frequencies of words for {year}")
        all_docs = [
            pickle.load(open(os.path.join(file_tuple[0], fn), 'rb+'))
            for fn in file_tuple[2]
        ]
        # flatten the array
        all_docs = [arr for array2d in all_docs for arr in array2d]
        big_word_bank.extend(all_docs)
        year_data[year] = word_freq_pdf(all_docs)
    print("Analyzing overall word frequencies")
    overall_freq = word_freq_pdf(big_word_bank)

    # Store data
    results_dir = os.environ.get("results_dir", "results")
    folder_check(results_dir)
    pickle.dump(year_data,
                open(os.path.join(results_dir, 'year_freq.pkl'), 'wb+'))
    pickle.dump(overall_freq,
                open(os.path.join(results_dir, 'overall_freq.pkl'), 'wb+'))
    print("Done!")