def init_program():
    # Load env vars
    load_dotenv()

    # Init logging
    program_start_time = utc_now()
    timezone = program_start_time.tzinfo
    logs_config()
    logging.info(f"Begin program run: {program_start_time} ({timezone} time)")

    # Create or clean download dir
    if DIR_OUTPUT.is_dir():
        # Delete files from previous run
        delete_dir_contents(DIR_OUTPUT)
    else:
        logging.info("Data directory doesn't exist - building")
        DIR_OUTPUT.mkdir()

    # Set pandas options
    pandas_opts()

    # This fixes a strange bug with botocore/moto not recognizing AWS
    # credentials: https://github.com/spulec/moto/issues/1941
    os.environ["AWS_ACCESS_KEY_ID"] = os.environ.get("KEY_ID")
    os.environ["AWS_SECRET_ACCESS_KEY"] = os.environ.get("SECRET_KEY_ID")

    # Set Altair themes
    alt.themes.register("spotlight", spotlight)
    alt.themes.enable("spotlight")

    return program_start_time
def setUp(self) -> None:
    # Start logging
    logs_config(paths["logs_config_test"])

    # Delete previous test output files
    logging.info(f"Deleting {mock_dirs['payload_email']} if it exists")
    if mock_dirs["payload_email"].is_dir():
        rmtree(mock_dirs["payload_email"])

    # Rebuild directory
    mock_dirs["payload_email"].mkdir(parents=True, exist_ok=True)

    # Set pandas options for print display
    pd.set_option("display.max_columns", 20)
    pd.set_option("display.width", 2000)
    pd.set_option("display.max_rows", 700)
def main():
    # Init logging
    scrape_start_time = datetime.now()
    logs_config()
    logging.info('Beginning scrape')

    # SET VARS
    # List of target filing IDs
    list_of_filing_ids = [
        "336344",  # Samuel Doctor
        "331887",  # Phil Heasley
        "332791",  # Andre Del Valle
        "209028",  # Thomas Murt
    ]
    sleep_time = 5

    # Create or clean up PDF download dir
    if DIR_DATA.is_dir():
        # Delete files from previous run
        delete_dir_contents(DIR_DATA)
    else:
        DIR_DATA.mkdir()

    # Get PDF for each filing
    for filing_id in list_of_filing_ids:
        get_pdf(filing_id)

        # Sleep in order to avoid overloading the Ethics Commission website
        logging.info(f'Sleeping for {sleep_time} seconds...')
        time.sleep(sleep_time)
        logging.info('...Finished sleeping!')

    # Complete
    scrape_end_time = datetime.now()
    scrape_duration = (scrape_end_time - scrape_start_time).total_seconds()
    logging.info(f'Total scrape time: {round(scrape_duration / 60, 2)} minutes')
    logging.info(
        f'Time per item scraped: {round(scrape_duration / len(list_of_filing_ids), 2)} sec'
    )
    logging.info("Scrape complete")
def main():
    # Load env vars
    load_dotenv()

    # Init logging
    logs_config()
    logging.info("Begin program")

    # Delete data from previous runs
    clean_data()

    # Get google sheet config
    with open(PATH_CONFIG_GSHEETS) as f:
        config_sheets = json.load(f)

    for document in config_sheets:
        sheets = document.get("sheets", [])
        document_name = document.get("document_name")
        document_id = document.get("document_id")
        move_s3 = document.get("move_s3", False)
        bucket_name = document.get("bucket_name", None)
        bucket_dest_dir = document.get("bucket_dest_dir", None)
        move_local = document.get("move_local", False)

        logging.info(f"Extracting files from document: {document_name}")
        for sheet in sheets:
            sheet_name = sheet["name"]
            output_filename = f"{sheet_name}.csv"
            download_sheet(sheet_name, document_id, sheet["gid"], output_filename)

            if move_s3:
                if bucket_name and bucket_dest_dir:
                    copy_to_s3(output_filename, bucket_name, bucket_dest_dir)
                else:
                    raise ValueError('Missing bucket_name or bucket_dest_dir')
            if move_local:
                copy_to_local(output_filename)

            logging.info("Sleeping to avoid google rate limiting issues...")
            sleep(10)
            logging.info("Waking!")
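
# Illustrative sketch (assumption): the JSON file at PATH_CONFIG_GSHEETS is
# expected to look roughly like the structure below. Only the keys mirror the
# .get() calls in main(); every value here is a made-up placeholder, not taken
# from the real config.
EXAMPLE_CONFIG_SHEETS = [
    {
        "document_name": "Example document",
        "document_id": "1AbC-ExampleDocumentId",
        "move_s3": True,
        "bucket_name": "example-bucket",
        "bucket_dest_dir": "gsheets",
        "move_local": False,
        "sheets": [
            {"name": "sheet_one", "gid": "0"},
            {"name": "sheet_two", "gid": "123456789"},
        ],
    }
]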
import unittest
from unittest import mock
from pathlib import Path
from shutil import rmtree

from dotenv import load_dotenv

# project modules
from modules.download import download_pdf
from modules.initialize import initialize_driver
from locations import dirs, paths, root_dir, test_dir
from logs.config.logging import logs_config

# LOGGING
logs_config(paths["logs_config_test"])

# ENV
load_dotenv(root_dir / ".dev.env")

# MOCK VARS
# NOTE: Must call .resolve() here, otherwise you'll have problems
mock_dirs = {"pdfs": (test_dir / "output/pdfs").resolve()}


# MOCK FUNCS
@mock.patch.dict(dirs, mock_dirs, clear=True)
def initialize_test_driver():
    """
    By mocking the directory paths we force webdriver to set
    test/output/pdfs as the default download directory.
    """
    return initialize_driver()
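
# Illustrative sketch (assumption): a test case in this module might use the
# patched helper like this. TestDownload, the URL, and the docket number are
# hypothetical; download_pdf's (driver, url, docketnum) signature is inferred
# from how it is called elsewhere in the project.
class TestDownload(unittest.TestCase):
    def test_pdf_lands_in_test_dir(self):
        driver = initialize_test_driver()  # download dir -> test/output/pdfs
        try:
            download_pdf(driver, "https://example.com/docket.pdf", "CP-00000000")
            self.assertTrue(any(mock_dirs["pdfs"].iterdir()))
        finally:
            driver.quit()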
def main():
    # Load environ vars
    load_dotenv()

    # Init logging
    logs_config()

    # Create or clean temp dirs
    if DIR_DATA.is_dir():
        # Delete files from previous run
        delete_dir_contents(DIR_DATA)
    else:
        DIR_DATA.mkdir()

    # Init driver
    logging.info("Begin scrape")
    driver = initialize_driver()

    search_terms = [
        {
            "search_name": "House - paper forms",
            "input_fields": [
                {"field": "Template", "value": "SFI Template"},
                {"field": "BatesBatch", "value": "*HR*"},
                {"field": "Year", "value": "2019"},
            ],
        },
        {
            "search_name": "Senate - paper forms",
            "input_fields": [
                {"field": "Template", "value": "SFI Template"},
                {"field": "BatesBatch", "value": "*SN*"},
                {"field": "Year", "value": "2019"},
            ],
        },
        {
            "search_name": "House - web forms",
            "input_fields": [
                {"field": "Template", "value": "Web State of Financial Interests Form"},
                {"field": "03-05 State Entity", "value": "*rep*"},
                {"field": "07 Year", "value": "2019"},
            ],
        },
        {
            "search_name": "Senate - web forms",
            "input_fields": [
                {"field": "Template", "value": "Web State of Financial Interests Form"},
                {"field": "03-05 State Entity", "value": "*sen*"},
                {"field": "07 Year", "value": "2019"},
            ],
        },
    ]

    """
    TODO:
    > loop over each page of results:
        > loop over each row on page:
            > scrape page

    /// pseudo code ///

    driver open "https://www.ethicsrulings.pa.gov/WebLink/Search.aspx?dbid=0&repo=EthicsLF8"
    for search in search_terms:
        input search_terms
        click submit

        # page search loop
        page_count = 1
        while True:
            wait for page results to load

            # row search loop
            row_count = 1
            while True:
                try:
                    click filer's name based on row_count
                    wait for filer page to load
                    scrape_page()
                    row_count += 1
                except NoSuchElementException:
                    logging.info("No more rows found")
                    break
            # end row search loop

            try:
                page_count += 1
                click page element based on page_count
            except NoSuchElementException:
                logging.info("no more pages found")
                break
        # end page search loop
    """

    # Set vars - just using these numbers for testing, delete this
    start_id = 3
    end_id = 5
    for page_id in range(start_id, end_id):
        scrape_page(driver, page_id)
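
# Illustrative sketch (assumption), not the project's implementation: one way to
# realize the TODO pseudocode above with Selenium. The locators (ROW_LINK_XPATH,
# PAGE_LINK_XPATH, the "submit" id and "results" id) and the fill_search_fields
# helper are hypothetical placeholders; scrape_page exists in this module but its
# signature here is assumed. Only the page-loop / row-loop / NoSuchElementException
# pattern from the pseudocode is being shown.
import logging

from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

SEARCH_URL = "https://www.ethicsrulings.pa.gov/WebLink/Search.aspx?dbid=0&repo=EthicsLF8"
ROW_LINK_XPATH = "//table[@id='results']//tr[{row}]//a"     # hypothetical locator
PAGE_LINK_XPATH = "//div[@id='pager']//a[text()='{page}']"  # hypothetical locator


def run_searches(driver, search_terms):
    for search in search_terms:
        driver.get(SEARCH_URL)
        fill_search_fields(driver, search["input_fields"])  # hypothetical helper
        driver.find_element(By.ID, "submit").click()        # hypothetical locator

        page_count = 1
        while True:  # page search loop
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.ID, "results"))  # hypothetical
            )

            row_count = 1
            while True:  # row search loop
                try:
                    driver.find_element(
                        By.XPATH, ROW_LINK_XPATH.format(row=row_count)
                    ).click()
                    scrape_page(driver)  # signature assumed
                    driver.back()
                    row_count += 1
                except NoSuchElementException:
                    logging.info("No more rows found")
                    break

            try:
                page_count += 1
                driver.find_element(
                    By.XPATH, PAGE_LINK_XPATH.format(page=page_count)
                ).click()
            except NoSuchElementException:
                logging.info("No more pages found")
                break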
def main():
    ########################################################################
    # SETUP
    ########################################################################

    # INIT LOGGING
    logs_config()

    # START TIME
    scrape_start_time = datetime.now()

    # GET ENV VARS
    county_list = json.loads(os.environ.get("COUNTY_LIST"))
    target_scrape_day = os.environ.get("TARGET_SCRAPE_DATE", "yesterday").lower()
    target_scrape_date = (
        misc.today_date() if target_scrape_day == "today" else misc.yesterday_date()
    )  # convert to date
    scrape_name = os.getenv("SCRAPE_NAME", "Cases Scrape")
    run_env = os.environ.get("ENV_FILE", "DEV")  # defaults to 'DEV'
    rest_api_enabled = os.getenv("REST_API_ENABLED") == "TRUE"
    move_to_s3_enabled = os.getenv("MOVE_TO_S3_ENABLED") == "TRUE"

    # REFORMAT COUNTY LIST
    # Counties are transformed into title case, otherwise we'll get errors
    # during the scrape.
    county_list = [x.title() for x in county_list]

    ########################################################################
    # START PROGRAM
    ########################################################################

    misc.print_title("pa court report")
    logging.info("##### PROGRAM START #####")
    logging.info(f"Scrape: {scrape_name}")
    logging.info(f"Running in {run_env} environment\n")

    ########################################################################
    # DELETE + CREATE
    ########################################################################

    # DELETE OLD FILES
    # If a temp folder was left over from a previous scrape, delete it so it
    # doesn't cause complications.
    misc.delete_folders_and_contents(temp_dir)

    # CREATE TEMP DIRECTORIES
    temp_subdirs = [
        dirs[directory]
        for directory in dirs
        if "/" + str(temp_dir.name) + "/" in str(dirs[directory])
    ]
    misc.create_folders(temp_subdirs)

    ########################################################################
    # SCRAPE
    ########################################################################

    for count, county in enumerate(county_list):

        # SCRAPE UJS SEARCH RESULTS
        # We first get basic docket data from search results, like docket
        # numbers, filing dates, etc., and turn it into a list of dicts.
        docket_list = scrape(county, target_scrape_date)

        if docket_list:

            # DOWNLOAD PDF OF EACH DOCKET
            # Each case is associated with a PDF that has more data. We
            # extract info from those PDFs and add it to our dicts.
            driver = initialize.initialize_driver()
            for docket in docket_list:
                pdf_path = download.download_pdf(
                    driver, docket["url"], docket["docketnum"]
                )
                text = convert.convert_pdf_to_text(pdf_path, docket["docketnum"])

                # PARSE PDF TEXT
                parsed_data = parse_main(text)
                docket.update(parsed_data)
            driver.quit()

            # CONVERT DICT LIST INTO PANDAS DF
            df = export.convert_dict_into_df(docket_list, county)

            # SAVE BACKUP OF DF FOR DEBUGGING
            df.to_pickle(dirs["df_pkl"] / "df.pkl")

            # CONVERT DF TO CSV
            export.convert_df_to_csv(df)

            # CONVERT DF INTO HTML FOR EMAIL PAYLOAD
            county_intro = "{} in {} County:".format(df.shape[0], county)  # count of cases
            html_df = export.convert_df_to_html(df)
            export.save_html_county_payload(county_intro, html_df)

            if count != (len(county_list) - 1):
                sleep_after_scrape = 65
                logging.info(
                    f"Sleeping for {sleep_after_scrape} seconds after scrape in "
                    f"order to prevent overloading UJS server"
                )
                time.sleep(sleep_after_scrape)

        else:
            logging.info(f"No cases were found for {county} County")
            county_intro = f"No cases found for {county} County."
            export.save_html_county_payload(county_intro)

    ########################################################################
    # EXPORT & EMAIL FINAL PAYLOAD
    ########################################################################

    # END TIME
    scrape_end_time = datetime.now()

    # OPTIONAL: MOVE JSON FILE TO S3
    if move_to_s3_enabled:
        export.convert_csv_to_json(scrape_end_time, county_list)
        copy_file_to_s3_bucket()

    # OPTIONAL: UPLOAD DATA TO DATABASE
    if rest_api_enabled and paths["payload_csv"].is_file():
        upload.upload_to_rest_api()

    # SEND EMAIL WITH DOCKET DATA
    email.email_notification(
        scrape_start_time, scrape_end_time, target_scrape_day, county_list
    )

    # CLOSE PROGRAM
    logging.info("Scrape completed at: {}".format(get_datetime_now_formatted()))
    logging.info("Closing program")
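
# Illustrative sketch (assumption): the environment variables read in main()
# might be set roughly like this, e.g. in a .env file or the shell. All values
# are placeholders. Note that COUNTY_LIST is parsed with json.loads, so it must
# be a JSON array, and the two feature flags are compared against the literal
# string "TRUE".
#
#   COUNTY_LIST=["Dauphin", "Cumberland"]
#   TARGET_SCRAPE_DATE=yesterday
#   SCRAPE_NAME=Cases Scrape
#   ENV_FILE=DEV
#   REST_API_ENABLED=TRUE
#   MOVE_TO_S3_ENABLED=FALSE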