def test_convert_empty_dict_into_df(self):
    """
    Test that result is a dataframe
    """
    empty_dict = {}
    df = convert_dict_into_df(empty_dict, "Dauphin")
    self.assertIsInstance(df, pd.core.frame.DataFrame)

def test_convert_dict_into_df(self):
    """
    Test that result is a dataframe
    """
    df = convert_dict_into_df(docket_list, "Dauphin")
    self.assertIsInstance(df, pd.core.frame.DataFrame)
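Note: these tests lean on a docket_list fixture and a convert_dict_into_df helper that the snippet doesn't show. A minimal sketch of what such a helper might look like, assuming it simply builds a DataFrame from a list of docket dicts and tags it with the county (the "county" column is an assumption, not the project's actual schema):

import pandas as pd

def convert_dict_into_df(docket_list, county):
    # Hypothetical sketch: turn a list of docket dicts into a DataFrame.
    # An empty dict/list still yields a DataFrame, which is exactly what
    # test_convert_empty_dict_into_df asserts.
    df = pd.DataFrame(docket_list if docket_list else [])
    df["county"] = county  # assumed column name
    return df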
Example #3
def test_html_file_is_created(self):
    """
    Test that an HTML file is generated
    """
    # create df
    df = convert_dict_into_df(docket_list, "Dauphin")
    # create styled df
    styled_df = convert_df_to_html(df)
    # wrap styled df with more html
    save_html_county_payload("This is an introduction for the email",
                             styled_df)
    # check that the html file has been created
    self.assertTrue(mock_paths["payload_email"].is_file())
def setUp(self) -> None:
    mock_dirs["payload_email"].mkdir(parents=True, exist_ok=True)
    # create testing df
    df = convert_dict_into_df(docket_list, "Dauphin")
    self.styled_df = convert_df_to_html(df)

def setUp(self) -> None:
    mock_dirs["payload_csv"].mkdir(parents=True, exist_ok=True)  # make directory
    self.df = convert_dict_into_df(docket_list, "Dauphin")  # make testing df
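Both setUp variants depend on mock_dirs / mock_paths dictionaries and a docket_list fixture that aren't part of the snippet. A rough, assumed wiring of those fixtures into a unittest.TestCase (all names and values here are illustrative; the import path "export" is taken from the main() example below):

import shutil
import unittest
from pathlib import Path

from export import convert_dict_into_df  # module name as used in main() below

# assumed fixtures: a throwaway temp tree and a tiny stand-in docket list
mock_root = Path("mock_temp")
mock_dirs = {"payload_csv": mock_root / "payload_csv"}
docket_list = [{"docketnum": "MJ-00000-CR-0000000-2020", "url": "https://example.org"}]

class TestCsvExport(unittest.TestCase):
    def setUp(self) -> None:
        mock_dirs["payload_csv"].mkdir(parents=True, exist_ok=True)  # make directory
        self.df = convert_dict_into_df(docket_list, "Dauphin")  # make testing df

    def tearDown(self) -> None:
        shutil.rmtree(mock_root, ignore_errors=True)  # keep test runs isolated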
Example #6
def main():

    ########################################################################
    #                                 SETUP
    ########################################################################

    # INIT LOGGING
    logs_config()

    # START TIME
    scrape_start_time = datetime.now()

    # GET ENV VARS
    county_list = json.loads(os.environ.get("COUNTY_LIST"))
    target_scrape_day = os.environ.get("TARGET_SCRAPE_DATE",
                                       "yesterday").lower()
    target_scrape_date = (misc.today_date() if target_scrape_day == "today"
                          else misc.yesterday_date())  # convert to date
    scrape_name = os.getenv("SCRAPE_NAME", "Cases Scrape")
    run_env = os.environ.get("ENV_FILE", "DEV")  # defaults to 'DEV'
    rest_api_enabled = os.getenv("REST_API_ENABLED") == "TRUE"
    move_to_s3_enabled = os.getenv("MOVE_TO_S3_ENABLED") == "TRUE"

    # REFORMAT COUNTY LIST
    county_list = [
        x.title() for x in county_list
    ]  # Counties are converted to title case, otherwise we'll get errors during scrape

    ########################################################################
    #                          START PROGRAM
    ########################################################################

    misc.print_title("pa court report")
    logging.info("##### PROGRAM START #####")
    logging.info(f"Scrape: {scrape_name}")
    logging.info(f"Running in {run_env} environment\n")

    ########################################################################
    #                          DELETE + CREATE
    ########################################################################

    # DELETE OLD FILES
    # If a temp folder was left over from a previous scrape, we delete it so it doesn't cause complications.
    misc.delete_folders_and_contents(temp_dir)

    # CREATE TEMP DIRECTORIES
    temp_subdirs = [
        dirs[directory] for directory in dirs
        if "/" + str(temp_dir.name) + "/" in str(dirs[directory])
    ]
    misc.create_folders(temp_subdirs)

    ########################################################################
    #                                 SCRAPE
    ########################################################################

    for count, county in enumerate(county_list):

        # SCRAPE UJS SEARCH RESULTS
        # We first get basic docket data from search results, like docket
        # numbers, filing dates, etc. and turn it into a list of dicts.
        docket_list = scrape(county, target_scrape_date)
        if docket_list:

            # DOWNLOAD PDF OF EACH DOCKET
            # Each case is associated with a PDF that has more data. We
            # extract info from those pdfs and add them to our dicts.
            driver = initialize.initialize_driver()
            for docket in docket_list:
                pdf_path = download.download_pdf(driver, docket["url"],
                                                 docket["docketnum"])
                text = convert.convert_pdf_to_text(pdf_path,
                                                   docket["docketnum"])

                # PARSE PDF TEXT
                parsed_data = parse_main(text)
                docket.update(parsed_data)
            driver.quit()

            # CONVERT DICT LIST INTO PANDAS DF
            df = export.convert_dict_into_df(docket_list, county)

            # SAVE BACKUP OF DF FOR DEBUGGING
            df.to_pickle(dirs["df_pkl"] / "df.pkl")

            # CONVERT DF TO CSV
            export.convert_df_to_csv(df)

            # CONVERT DF INTO HTML FOR EMAIL PAYLOAD
            county_intro = "{} in {} County:".format(df.shape[0],
                                                     county)  # count of cases
            html_df = export.convert_df_to_html(df)
            export.save_html_county_payload(county_intro, html_df)

            if count != len(county_list) - 1:
                sleep_after_scrape = 65
                logging.info(
                    f"Sleeping for {sleep_after_scrape} seconds after scrape in order to prevent overloading "
                    f"UJS server")
                time.sleep(sleep_after_scrape)

        else:
            logging.info(f"No cases were found for {county} County")
            county_intro = f"No cases found for {county} County."
            export.save_html_county_payload(county_intro)

    ########################################################################
    #                        EXPORT & EMAIL FINAL PAYLOAD
    ########################################################################

    # END TIME
    scrape_end_time = datetime.now()

    # OPTIONAL: MOVE JSON FILE TO S3
    if move_to_s3_enabled:
        export.convert_csv_to_json(scrape_end_time, county_list)
        copy_file_to_s3_bucket()

    # OPTIONAL: UPLOAD DATA TO DATABASE
    if rest_api_enabled and paths["payload_csv"].is_file():
        upload.upload_to_rest_api()

    # SEND EMAIL WITH DOCKET DATA
    email.email_notification(scrape_start_time, scrape_end_time,
                             target_scrape_day, county_list)

    # CLOSE PROGRAM
    logging.info("Scrape completed at: {}".format(
        get_datetime_now_formatted()))
    logging.info("Closing program")