"""Read the metal price table from the configured URL and append it to SQL."""
import pandas as pd
from datetime import datetime

from config import Configuration
from helpers import get_metals_and_prices_from_table, soup_url, get_database_engine


def _record_metal_prices():
    """Fetch the price table, build a metal/price/date frame, and append it.

    The page at ``config.url`` is parsed, the ``div`` with class
    ``c_mp_price_table`` is handed to ``get_metals_and_prices_from_table``,
    and the resulting rows are appended to ``config.sql_table`` in the
    ``projects`` schema.
    """
    settings = Configuration()
    page = soup_url(settings.url)
    price_table = page.find("div", {"class": "c_mp_price_table"})
    metal_names, metal_prices = get_metals_and_prices_from_table(price_table)

    # Columns are added in this order: metal, price, date. Every row gets
    # the same run date, formatted as YYYY-MM-DD.
    price_rows = pd.DataFrame().assign(
        metal=metal_names,
        price=metal_prices,
        date=datetime.today().strftime("%Y-%m-%d"),
    )

    db_engine = get_database_engine()
    price_rows.to_sql(
        name=settings.sql_table,
        con=db_engine,
        schema="projects",
        if_exists="append",
        index=False,
    )


if __name__ == '__main__':
    _record_metal_prices()
"""Script to add additional H-2A worksites DOL data to our database.""" import helpers from helpers import make_query, get_database_engine, myprint import pandas as pd from dotenv import load_dotenv load_dotenv() engine = get_database_engine(force_cloud=True) # worksites is a DataFrame # year, quarter should be strings - ex: 2020, 4 def manage_worksites(worksites, year, quarter): worksites = worksites.rename( columns={ "PLACE_OF_EMPLOYMENT_ADDRESS1": "WORKSITE_ADDRESS", "PLACE_OF_EMPLOYMENT_ADDRESS2": "WORKSITE_ADDRESS2", "PLACE_OF_EMPLOYMENT_CITY": "WORKSITE_CITY", "PLACE_OF_EMPLOYMENT_STATE": "WORKSITE_STATE", "PLACE_OF_EMPLOYMENT_POSTAL_CODE": "WORKSITE_POSTAL_CODE", "PLACE_OF_EMPLOYMENT_POST_CODE": "WORKSITE_POSTAL_CODE", "ADDITONAL_PLACE_OF_EMPLOYMENT_INFO": "ADDITIONAL_PLACE_OF_EMPLOYMENT_INFORMATION", "JO_ORDER_NUMBER": "JOB_ORDER_NUMBER" }) worksites = helpers.fix_zip_code_columns(worksites, ["WORKSITE_POSTAL_CODE"]) worksites["table"], worksites["Source"], worksites[
"""This script is used for implementing the most recent address fixes from the PostgreSQL low_accuracies table.""" import os import helpers from helpers import myprint, print_red_and_email, make_query, get_database_engine, handle_null import pandas as pd from geocodio import GeocodioClient from dotenv import load_dotenv load_dotenv() geocodio_api_key = os.getenv("GEOCODIO_API_KEY") engine, client = get_database_engine( force_cloud=True), GeocodioClient(geocodio_api_key) # fixed is a DataFrame of low_accuracies table rows that have been fixed def implement_fixes(fixed, fix_worksites=False): # worksite_or_housing is a string - either "worksite" or "housing" # sets the "{worksite_or_housing}_fixed_by", "fixed" columns to failed, False in the i-th row of df def mark_as_failed(i, worksite_or_housing, df): df.at[i, f"{worksite_or_housing}_fixed_by"] = "failed" df.at[i, "fixed"] = False # worksite_or_housing is a string - either "worksite" or "housing" # overwrites the geocoding results columns of the i-th row in df based on its worksite_or_housing address columns # if the geocoding of the new address columns results in accuracy too low or an accuracy type in helpers.bad_accuracy_types, marks the row as failed def fix_by_address(i, row, worksite_or_housing, df): if worksite_or_housing == "worksite": full_address = helpers.create_address_from(
import helpers

# Safety guard: this check runs before the other project modules are imported,
# so nothing below executes when helpers.force_cloud is set.
if helpers.force_cloud:
    print(
        "Refusing to run tests. Never run tests on the actual database!!! "
        "You are seeing this alert because the force_cloud variable is set to True "
        "(or you told it to be set it to true with your answer to the last question) "
        "in helpers.py. Set it to False to run tests on the local database."
    )
    exit()

import unittest
from add_housing import geocode_manage_split_housing
from merge_dol import geocode_manage_split_merge
from implement_fixes import implement_fixes
from helpers import merge_all_data, get_value, myprint, make_query, geocode_table
from colorama import Fore, Style
import os
import pandas as pd

bad_accuracy_types = helpers.bad_accuracy_types
engine = helpers.get_database_engine()


def set_test_database_state(accurates, inaccurates):
    """Replace the contents of job_central and low_accuracies.

    Both tables are emptied first, then ``accurates`` is appended to
    job_central and ``inaccurates`` to low_accuracies, and finally the
    previously_geocoded materialized view is refreshed.
    """
    for statement in ("DELETE FROM job_central", "DELETE FROM low_accuracies"):
        make_query(statement)

    table_contents = (
        ("job_central", accurates),
        ("low_accuracies", inaccurates),
    )
    for table_name, frame in table_contents:
        frame.to_sql(
            table_name,
            engine,
            if_exists='append',
            index=False,
            dtype=helpers.column_types,
        )

    make_query("REFRESH MATERIALIZED VIEW previously_geocoded")


def merge_all_and_get_new_state(accurate_new_jobs, inaccurate_new_jobs):
    """Run merge_all_data on the two inputs and read back both tables.

    Returns a ``(job_central, low_accuracies)`` pair of DataFrames read from
    the database after the merge.
    """
    merge_all_data(accurate_new_jobs, inaccurate_new_jobs)
    return tuple(
        pd.read_sql(table_name, con=engine)
        for table_name in ("job_central", "low_accuracies")
    )
import os import helpers from helpers import make_query, get_database_engine import pandas as pd from dotenv import load_dotenv load_dotenv() engine = get_database_engine(force_cloud=False) # geocodes, splits by accuracy, renames columns, and adds necessary columns for the DataFrame 'housing'. # year, quarter should be strings - ex: 2020, 4 def geocode_manage_split_housing(housing, year, quarter): housing = housing.rename( columns={ "PHYSICAL_LOCATION_ADDRESS_2": "HOUSING_ADDRESS2", "JOB_ORDER_NUMBER": "JO_ORDER_NUMBER", "PHYSICAL_LOCATION_STATE": "HOUSING_STATE", "PHYSICAL_LOCATION_POSTAL_CODE": "HOUSING_POSTAL_CODE", "PHYSICAL_LOCATION_CITY": "HOUSING_CITY", "HOUSING_STANDARD_STATE": "HOUSING_STANDARDS_STATE", "HOUSING_STANDARD_LOCAL": "HOUSING_STANDARDS_LOCAL", "HOUSING_STANDARD_FEDERAL": "HOUSING_STANDARDS_FEDERAL", "PHYSICAL_LOCATION_COUNTY": "HOUSING_COUNTY", "PHYSICAL_LOCATION_ADDRESS_1": "HOUSING_ADDRESS_LOCATION" }) housing = helpers.fix_zip_code_columns(housing, ["HOUSING_POSTAL_CODE"]) housing["table"], housing["Source"], housing["fixed"], housing[ "housing_fixed_by"], housing[