import pandas as pd
import requests
from bs4 import BeautifulSoup

import mysqlConnection as md


def import_oil_reserve(name):
    # Scrape the reserves table from the page at `name`
    page = requests.get(name)
    soup = BeautifulSoup(page.content, 'lxml')
    table = soup.find('table', attrs={'class': 'data1'})
    rows = table.findAll('tr', attrs={'class': 'DataRow'})
    values = []
    for tr in rows:
        state = tr.find('td', attrs={'class': 'DataStub1'}).get_text()
        # Five historical-year cells followed by the current-year cell
        otherYear = tr.findAll('td', attrs={'class': 'DataB'})
        y11 = otherYear[0].get_text()
        y12 = otherYear[1].get_text()
        y13 = otherYear[2].get_text()
        y14 = otherYear[3].get_text()
        y15 = otherYear[4].get_text()
        current = tr.find('td', attrs={'class': 'Current2'}).get_text()
        # OilReserveData is a small record class (with a to_dict() method)
        # defined elsewhere in the project
        values.append(OilReserveData(state, y11, y12, y13, y14, y15, current))
    df = pd.DataFrame.from_records([s.to_dict() for s in values])
    md.create_table(md.connect(), df, 'oil_reserve')
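# Example usage (a sketch): import_oil_reserve expects the URL of a page whose
# reserves table carries the 'data1'/'DataRow' CSS classes scraped above. The
# EIA URL below is illustrative only, not a confirmed endpoint:
#
#     import_oil_reserve('https://www.eia.gov/dnav/pet/pet_crd_pres_dcu_NUS_a.htm')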
def main(file_name):
    initial = pd.read_csv('resources/' + file_name)
    initial['city'] = initial['city'].str.replace(' ', '').str.upper()
    # Each row may list several space-separated zips; split them into columns
    separatedZips = initial['zip'].str.strip().str.split(expand=True)
    fullData = pd.concat([initial, separatedZips], axis=1)
    fullData = fullData.drop(['zip'], axis=1)
    idvars = [
        'city', 'state_id', 'state_name', 'county_name', 'lat', 'lng',
        'population'
    ]
    # Unpivot so each row holds exactly one (city, zip) pair
    allZips = pd.melt(fullData, id_vars=idvars, value_name='zip')
    allZips = allZips.drop(['variable'], axis=1)
    # Drop rows with missing zip code values
    allZips = allZips[pd.notnull(allZips.zip)]
    # Zip code lookup table complete; ready to be joined
    md.create_table(md.connect(), allZips, 'zip_lookup')
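# Example usage (a sketch): the CSV is expected to carry the columns referenced
# above (city, state_id, state_name, county_name, lat, lng, population, zip).
# The file name below is illustrative only:
#
#     main('uszips.csv')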
def import_weather(file_name):
    df_weather = pd.read_csv('resources/' + file_name, low_memory=False)
    df_weather = df_weather[[
        'StationName', 'Date', 'ObsType', 'Value', 'S-Flag', 'City', 'State'
    ]]
    md.create_table(md.connect(), df_weather, 'weather_observations')
def import_water_data(file_name):
    data = pd.read_csv('resources/' + file_name)
    data = data[[
        'MonitoringLocationTypeName', 'LatitudeMeasure', 'LongitudeMeasure'
    ]]
    md.create_table(md.connect(), data, 'water_locations')
def import_land_prices(file_name):
    # parse_cols has been removed from pandas; usecols is the current keyword
    df_landprices = pd.read_excel('resources/' + file_name, skiprows=[0],
                                  usecols='A,B,C,D,E,H,I')
    # Keep only the 2015Q4 rows
    df_landprices = df_landprices.loc[df_landprices['Date'] == '2015Q4']
    df_landprices['MSA'] = df_landprices.MSA.str.replace(' ', '')
    md.create_table(md.connect(), df_landprices, 'land_prices')
def import_seaports(file_name):
    df_ports = pd.read_csv('resources/' + file_name, low_memory=False)
    df_ports = df_ports[[
        'LATITUDE1', 'LONGITUDE1', 'CITY_OR_TO', 'STATE_POST', 'ZIPCODE',
        'PORT_NAME'
    ]]
    md.create_table(md.connect(), df_ports, 'seaports')
def import_existing_plants(file_name):
    plant_locations = pd.read_csv('resources/' + file_name)
    plant_locations = plant_locations[[
        'Facility Name', 'Deregistered (Yes/No)', 'City', 'State', 'Zip Code',
        'Parent Company', 'Latitude', 'Longitude',
        'Number of RMP Submissions'
    ]]
    # Drop rows with no state recorded
    plant_locations = plant_locations[plant_locations.State.notnull()]
    md.create_table(md.connect(), plant_locations, 'plant_locations')
def import_earthquakes():
    df_earthquakes = pd.read_csv('resources/USEarthquakes.csv',
                                 low_memory=False)
    df_earthquakes = df_earthquakes[[
        'time', 'latitude', 'longitude', 'mag', 'magType', 'place'
    ]]
    df_earthquakes2 = pd.read_csv('resources/AKEarthquakes.csv',
                                  low_memory=False)
    df_earthquakes2 = df_earthquakes2[[
        'time', 'latitude', 'longitude', 'mag', 'magType', 'place'
    ]]
    # DataFrame.append returns a new frame (and is deprecated), so the result
    # must be reassigned or the Alaska rows are silently lost; use concat
    df_earthquakes = pd.concat([df_earthquakes, df_earthquakes2],
                               ignore_index=True)
    md.create_table(md.connect(), df_earthquakes, 'earthquake_data')
def normalize_all():
    engine = md.connect()

    # Normalize land prices
    df = pd.read_sql_table('land_prices_final', engine)
    df['home_value_norm'] = (df['Home Value'] - df['Home Value'].min()) / (
        df['Home Value'].max() - df['Home Value'].min())
    df['structure_cost_norm'] = (
        df['Structure Cost'] - df['Structure Cost'].min()) / (
            df['Structure Cost'].max() - df['Structure Cost'].min())
    md.create_table(engine, df, 'land_prices_final')

    # Normalize oil reserve data (strip thousands separators before casting)
    df = pd.read_sql_table('oil_reserve_final', engine)
    df['year16'] = df['year16'].str.replace(',', '').astype(float)
    df['year16_norm'] = (df['year16'] - df['year16'].min()) / (
        df['year16'].max() - df['year16'].min())
    md.create_table(engine, df, 'oil_reserve_final')

    # Normalize disaster data
    df = pd.read_sql_table('disaster_data_final', engine)
    df['NumFireReferences_norm'] = (
        df['NumFireReferences'] - df['NumFireReferences'].min()) / (
            df['NumFireReferences'].max() - df['NumFireReferences'].min())
    df['NumFloodReferences_norm'] = (
        df['NumFloodReferences'] - df['NumFloodReferences'].min()) / (
            df['NumFloodReferences'].max() - df['NumFloodReferences'].min())
    df['NumHurricaneReferences_norm'] = (
        df['NumHurricaneReferences'] - df['NumHurricaneReferences'].min()) / (
            df['NumHurricaneReferences'].max() -
            df['NumHurricaneReferences'].min())
    md.create_table(engine, df, 'disaster_data_final')

    # Normalize railroad data
    df = pd.read_sql_table('railroad_data_final', engine)
    df['Tons_norm'] = (df['Tons'] - df['Tons'].min()) / (
        df['Tons'].max() - df['Tons'].min())
    md.create_table(engine, df, 'railroad_data_final')

    # Normalize population density data
    df = pd.read_sql_table('population_density', engine)
    df['density_norm'] = (df['density'] - df['density'].min()) / (
        df['density'].max() - df['density'].min())
    md.create_table(engine, df, 'population_density_final')
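# Every block above repeats the same (x - min) / (max - min) scaling. A small
# helper like this sketch (hypothetical; not part of the original pipeline)
# would collapse each normalization to one call:
#
#     def min_max_normalize(df, column):
#         """Append a `<column>_norm` column rescaled to [0, 1]."""
#         col = df[column]
#         df[column + '_norm'] = (col - col.min()) / (col.max() - col.min())
#         return df
#
#     df = min_max_normalize(df, 'Tons')  # replaces the inline expression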
def read_state_codes(file_name):
    # The state code file is pipe-delimited
    state_code = pd.read_csv('resources/' + file_name, sep='|')
    print(state_code.head(5))  # sanity check
    md.create_table(md.connect(), state_code, 'state_codes')
southdakota = len(lines)
hursd = len(hurlines)
floodsd = len(floodlines)
states = [
    'California', 'Louisiana', 'New York', 'Alaska', 'Texas',
    'North Carolina', 'Ohio', 'Massachusetts', 'Utah', 'South Dakota'
]
firelengths = [
    california, louisiana, newyork, alaska, texas, northcarolina, ohio,
    massachusetts, utah, southdakota
]
hurlengths = [
    hurca, hurla, hurny, hurak, hurtx, hurnc, huroh, hurma, hurut, hursd
]
floodlengths = [
    floodca, floodla, floodny, floodak, floodtx, floodnc, floodoh, floodma,
    floodut, floodsd
]
statecodes = ['CA', 'LA', 'NY', 'AK', 'TX', 'NC', 'OH', 'MA', 'UT', 'SD']
# One row per state: how often each disaster type is referenced
df_disaster = pd.DataFrame({
    'State': states,
    'StateCode': statecodes,
    'NumFireReferences': firelengths,
    'NumHurricaneReferences': hurlengths,
    'NumFloodReferences': floodlengths
})
md.create_table(md.connect(), df_disaster, 'disaster_data')
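# The per-state counts used above (california, hurca, floodca, ...) are
# computed earlier in this script and are not shown here. A minimal sketch of
# the likely pattern, assuming one text file of matching references per state
# and disaster type (file names hypothetical):
#
#     with open('resources/fire_SD.txt') as f:
#         lines = f.readlines()
#     with open('resources/hurricane_SD.txt') as f:
#         hurlines = f.readlines()
#     with open('resources/flood_SD.txt') as f:
#         floodlines = f.readlines()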
def import_railroad_data(file_name):
    # Loads the railroad spreadsheet into the railroad_data table; the name
    # import_land_prices would shadow the land-price importer above
    df_railroad = pd.read_excel('resources/' + file_name)
    md.create_table(md.connect(), df_railroad, 'railroad_data')
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 12 13:29:55 2018

@author: Beth
"""
import pandas as pd

import mysqlConnection as md

# Load the list of unfavorable zip codes into its own lookup table
df = pd.read_csv('resources/BadZipCodes.csv', sep=',')
md.create_table(md.connect(), df, 'unfavorable_zipcodes')
def import_smaller_zips(file_name):
    data = pd.read_csv('resources/' + file_name)
    data = data[['City', 'State', 'Zip']]
    md.create_table(md.connect(), data, 'test_zips')
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 8 16:21:48 2018

@author: Cameron
"""
import pandas as pd

import mysqlConnection as md

df = pd.read_csv('resources/population_density.csv', sep=',')
md.create_table(md.connect(), df, 'population_density')