def getLinksForYear(year, data_dir):
    # go to the archive page for that year
    url = driver.find_element_by_link_text(str(year)).get_attribute('href')
    driver.get(url)

    # some years (2003-2008) have no PDF for their Beige Books,
    # so pull the text from the HTML pages instead
    if year < 2009 and year > 2002:
        b = driver.find_elements_by_partial_link_text('Beige Book')
        beige_books = [i.get_attribute('href') for i in b]
        for u in beige_books:
            res = requests.get(u)
            html_page = res.content
            soup = BeautifulSoup(html_page, 'html.parser')
            text = soup.find_all(text=True)
            for t in text:
                beige[t] = year  # beige is a module-level dict (defined elsewhere)

    # find all the PDF links on the page
    d = driver.find_elements_by_partial_link_text('PDF')
    docuList = [i.get_attribute('href') for i in d]

    # download each PDF into a per-year folder under data_dir
    for link in docuList:
        ind = link.rfind('/')
        foldername = os.path.join(data_dir, str(year))
        folder_check(foldername)
        filename = os.path.join(foldername, link[ind + 1:])
        download_file(link, filename)
# download a PDF file given its url
def download_file(download_url, filename):
    print(filename)
    response = urllib.request.urlopen(download_url)
    with open(filename, 'wb+') as f:
        f.write(response.read())


if __name__ == '__main__':
    # ask the user how many years to scrape, counting back from 2013
    val = input(
        "Enter how many years you want to go back from 2013, with a minimum of 1 and a maximum of 27:\t"
    )
    years = int(val)

    load_dotenv('.env')
    data_dir, pdf_dir = os.environ.get("data_dir", "data"), os.environ.get("pdf_dir", "pdfs")
    data_dir = os.path.join(data_dir, pdf_dir)  # directory for all downloaded documents
    folder_check(data_dir)

    # walk backwards through the requested years
    for y in range((2014 - years), 2014)[::-1]:
        getLinksForYear(y, data_dir)
        driver.back()
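The find_element_by_link_text and find_elements_by_partial_link_text helpers used above are the Selenium 3 API; they were removed in Selenium 4, where lookups go through By locators instead. A rough equivalent on a current install looks like the sketch below; the Chrome setup is an assumption, since the original driver creation is not shown here.

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()  # assumes a working Chrome/chromedriver setup
# after driver.get(...) on the Beige Book archive page:
url = driver.find_element(By.LINK_TEXT, str(2013)).get_attribute('href')
pdf_links = [e.get_attribute('href')
             for e in driver.find_elements(By.PARTIAL_LINK_TEXT, 'PDF')]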
    tokens = [w.lower() for w in tokens]  # lower-case every token
    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # keep alphabetic tokens only, then drop stop words
    words = [word for word in stripped if word.isalpha()]
    words = [w for w in words if w not in stop_words]
    return words


def text_cleaner_basic(filename, text_list):
    # keep pages that contain a meaningful amount of text and are not
    # just a table of contents
    return [page for page in text_list
            if len(re.split(r'\W+', page)) >= 50 and "TABLE OF CONTENTS" not in page]


if __name__ == "__main__":
    data_base_dir, pdf_dir, pkl_dir = (os.environ.get("data_dir", "data"),
                                       os.environ.get("pdf_dir", "pdfs"),
                                       os.environ.get("pkl_dir", "pkl"))
    ld, lp = len(split_dir(data_base_dir)), len(split_dir(pkl_dir))
    data_dir = os.path.join(data_base_dir, pdf_dir)
    pkl_dir = os.path.join(data_base_dir, pkl_dir)
    folder_check(pkl_dir)

    for file_tuple in os.walk(data_dir):
        if not file_tuple[2] or file_tuple[0] == data_dir:
            continue
        curr_dir = file_tuple[0]
        dir_parts = split_dir(curr_dir)
        pkl_file_base = os.path.join(pkl_dir, os.path.join(*dir_parts[ld + lp:]))
        folder_check(pkl_file_base)
        for filename in file_tuple[2]:
            print(f"Parsing {filename}")
            text = pdfToList(os.path.join(curr_dir, filename))
            pkl_name = os.path.join(pkl_file_base, filename.rsplit(".", 1)[0] + ".pkl")
            pickle.dump(text, open(pkl_name, "wb+"))
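Two helpers in this script, split_dir and pdfToList, are imported from elsewhere and not shown. The sketch below is only a guess at what they do, assuming pypdf for text extraction and NLTK's word_tokenize; clean_text here stands in for the cleaning function whose tail appears at the top of this script.

import os
from pypdf import PdfReader
from nltk.tokenize import word_tokenize

def split_dir(path):
    """Split a path such as data/pdfs/2013 into its components."""
    return [part for part in os.path.normpath(path).split(os.sep) if part]

def pdfToList(path):
    """Return one cleaned token list per usable page of a PDF (hypothetical)."""
    reader = PdfReader(path)
    raw_pages = [page.extract_text() or "" for page in reader.pages]
    kept = text_cleaner_basic(path, raw_pages)  # drop near-empty pages and the TOC
    return [clean_text(word_tokenize(page)) for page in kept]  # clean_text is assumed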
import nltk
from nltk import bigrams, trigrams, ngrams
import pickle
import os
from dotenv import load_dotenv

from util import folder_check

load_dotenv(".env")
data_dir = os.environ.get("data_dir", "data")
pkl_dir = os.environ.get("pkl_dir", "pkl")

if __name__ == "__main__":
    for path, folders, files in os.walk(os.path.join(data_dir, pkl_dir)):
        if not files or (len(files) < 2 and files[0][0] == "."):
            continue
        print(path)
        folder_check(os.path.join(path, "bigram"))
        folder_check(os.path.join(path, "trigram"))
        for file in files:
            fn = file.split(".")[0]
            pages = pickle.load(open(os.path.join(path, file), "rb+"))

            # bigrams
            fn_bigrams = [[g for g in bigrams(page)] for page in pages]
            bigrams_file = os.path.join(path, "bigram", f"{fn}_bigrams.pkl")
            pickle.dump(fn_bigrams, open(bigrams_file, "wb+"))

            # trigrams
            fn_trigrams = [[g for g in trigrams(page)] for page in pages]
            trigrams_file = os.path.join(path, "trigram", f"{fn}_trigrams.pkl")
            pickle.dump(fn_trigrams, open(trigrams_file, "wb+"))
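To spot-check the output, one of the generated pickles can be reloaded and tallied; the path below is only illustrative and should be replaced with any file the script above actually wrote.

import pickle
from collections import Counter

trigram_file = "data/pkl/2013/trigram/example_trigrams.pkl"  # illustrative path
pages = pickle.load(open(trigram_file, "rb"))
counts = Counter(g for page in pages for g in page)
print(counts.most_common(10))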
load_dotenv('.env')
data_dir, pkl_dir = os.environ.get("data_dir", "data"), os.environ.get("pkl_dir", "pkl")
pkl_dir = os.path.join(data_dir, pkl_dir)

big_word_bank = []
year_data = {}
for file_tuple in os.walk(pkl_dir):
    if not file_tuple[2] or file_tuple[0] == pkl_dir:
        continue
    year = int(os.path.split(file_tuple[0])[-1])
    print(f"Getting frequencies of words for {year}")
    all_docs = [
        pickle.load(open(os.path.join(file_tuple[0], fn), 'rb+'))
        for fn in file_tuple[2]
    ]
    # flatten the array
    all_docs = [arr for array2d in all_docs for arr in array2d]
    big_word_bank.extend(all_docs)
    year_data[year] = word_freq_pdf(all_docs)

print("Analyzing overall word frequencies")
overall_freq = word_freq_pdf(big_word_bank)

# Store data
results_dir = os.environ.get("results_dir", "results")
folder_check(results_dir)
pickle.dump(year_data, open(os.path.join(results_dir, 'year_freq.pkl'), 'wb+'))
pickle.dump(overall_freq,
            open(os.path.join(results_dir, 'overall_freq.pkl'), 'wb+'))
print("Done!")
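word_freq_pdf and folder_check are imported from util and never shown in this section. The sketches below are assumptions about what they presumably do, not the actual implementations.

import os
from collections import Counter

def folder_check(path):
    """Create the directory (and any parents) if it does not already exist."""
    if not os.path.exists(path):
        os.makedirs(path)

def word_freq_pdf(pages):
    """Aggregate word counts over a list of per-page token lists."""
    counts = Counter()
    for page in pages:
        counts.update(page)
    return counts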