Example #1
def populateData(radius, actualValue):

    #select all zipcodes from table
    query = "select distinct(zip_code) FROM dddm.plant_locations order by zip_code asc"
    data = pd.read_sql(query, md.connect())

    allzipList = data['zip_code'].tolist()
    print(len(allzipList))

    #select inserted zipcodes
    query2 = "SELECT distinct(zip) FROM dddm.model_data"
    q2data = pd.read_sql(query2, md.connect())

    storedZip = q2data['zip'].tolist()
    print(len(storedZip))

    for zipcode in allzipList:
        if zipcode not in storedZip:
            # pad numeric zip codes back out to five digits
            updatedZip = str(int(zipcode)).zfill(5)
            model.addToTable(updatedZip, radius, actualValue)
            print("Processed " + updatedZip)


#populateData(50, 'Y')
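The zero-padding restores leading zeros that were lost when the ZIP codes were read in as numbers; str.zfill covers the 3- and 4-digit cases in one call. A quick check:

# zfill(5) left-pads a numeric zip string back out to five characters
assert str(int(501.0)).zfill(5) == '00501'    # 3-digit zip
assert str(int(7001.0)).zfill(5) == '07001'   # 4-digit zip
assert str(int(90210.0)).zfill(5) == '90210'  # already 5 digits, unchanged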
Example #2
def test():
    #fetch all the zipcodes for the project
    query = "SELECT Zip FROM dddm.test_zips"
    zip_df = pd.read_sql(query, md.connect())
    zip_list = zip_df['Zip'].tolist()

    columns = ['zip', 'seaport', 'landprice', 'oilreserve',
               'existingplants', 'disasters', 'railroad',
               'populationdensity', 'elevation', 'water',
               'weather', 'rules', 'earthquake', 'prediction']

    #build one row per zipcode, then push everything to the table at once
    rows = []
    for zipcode in zip_list:
        result = app.app(zipcode, 50)
        rows.append(pd.DataFrame([[zipcode,
                                   result.prediction_df['seaport'],
                                   result.prediction_df['landprice'],
                                   result.prediction_df['oilreserve'],
                                   result.prediction_df['existingplants'],
                                   result.prediction_df['disasters'],
                                   result.prediction_df['railroad'],
                                   result.prediction_df['populationdensity'],
                                   result.elevation_data,
                                   result.water_data,
                                   result.weather_data,
                                   result.rules,
                                   result.earthquake_data,
                                   result.prediction]],
                                 columns=columns))

    df = pd.concat(rows, ignore_index=True)

    #write once, after the loop, so earlier rows are not re-appended each pass
    df.to_sql(name='test_data', con=md.connect(), if_exists='append', index=False)
Example #3
def import_water_data(file_name):
    data = pd.read_csv('resources/' + file_name)
    data = data[[
        'MonitoringLocationTypeName', 'LatitudeMeasure', 'LongitudeMeasure'
    ]]
    #data.to_sql(name='water_locations', con=dbEngine, if_exists = 'replace')
    md.create_table(md.connect(), data, 'water_locations')
Example #4
def import_weather(file_name):
    df_weather = pd.read_csv("resources/" + file_name, low_memory=False)
    df_weather = df_weather[[
        'StationName', 'Date', 'ObsType', 'Value', 'S-Flag', 'City', 'State'
    ]]

    md.create_table(md.connect(), df_weather, 'weather_observations')
Example #5
def populateData(radius, actualValue):

    #select all zipcodes from table
    query = "select Zip FROM dddm.test_zips"
    data = pd.read_sql(query, md.connect())

    allzipList = data['Zip'].tolist()
    print(len(allzipList))

    #select inserted zipcodes
    #query2 = "SELECT distinct(zip) FROM dddm.test_zip_data"
    #q2data = pd.read_sql(query2, md.connect())

    #storedZip = q2data['zip'].tolist()
    #print(len(storedZip))

    for zipcode in allzipList:
        # pad numeric zip codes back out to five digits
        updatedZip = str(int(zipcode)).zfill(5)
        model.addToTestTable(updatedZip, radius, actualValue)
        print("Processed " + updatedZip)


#populateData(50, 'None')
Example #6
def main(file_name):
    initial = pd.read_csv('resources/' + file_name)
    initial['city'] = initial['city'].str.replace(' ', '').str.upper()

    separatedZips = (initial['zip'].str.strip()).str.split(expand=True)

    fullData = pd.concat([initial, separatedZips], axis=1)
    fullData = fullData.drop(['zip'], axis=1)

    idvars = [
        'city', 'state_id', 'state_name', 'county_name', 'lat', 'lng',
        'population'
    ]

    allZips = pd.melt(fullData, id_vars=idvars, value_name='zip')
    allZips = allZips.drop(['variable'], axis=1)

    # Drop rows with missing zip code values
    allZips = allZips[pd.notnull(allZips.zip)]

    #engine = create_engine('mysql+pymysql://pythonUser:abc@localhost:3306/dddm?charset=utf8', encoding='utf-8')
    #allZips.to_sql(name='zip_lookup', con=engine, if_exists = 'replace')

    md.create_table(md.connect(), allZips, 'zip_lookup')
    """ Zip Code lookup table complete, Ready to be joined """
    """
Example #7
def import_oil_reserve(name):
    page = requests.get(name)

    soup = BeautifulSoup(page.content, 'lxml')

    table = soup.find('table', attrs={'class': 'data1'})

    rows = table.find_all('tr', attrs={'class': 'DataRow'})

    values = []

    for tr in rows:
        state = tr.find('td', attrs={'class': 'DataStub1'}).get_text()

        otherYear = tr.find_all('td', attrs={'class': 'DataB'})
        y11 = otherYear[0].get_text()
        y12 = otherYear[1].get_text()
        y13 = otherYear[2].get_text()
        y14 = otherYear[3].get_text()
        y15 = otherYear[4].get_text()

        current = tr.find('td', attrs={'class': 'Current2'}).get_text()

        values.append(OilReserveData(state, y11, y12, y13, y14, y15, current))

    df = pd.DataFrame.from_records([s.to_dict() for s in values])

    md.create_table(md.connect(), df, 'oil_reserve')
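OilReserveData is used above but never defined in these snippets; it is presumably a small record class with a to_dict method. A minimal sketch of what it could look like, where every dict key is a guess except year16, which normalize_all in Example #17 reads back from the oil reserve table:

class OilReserveData:
    # Hypothetical sketch of the record type assumed by import_oil_reserve.
    def __init__(self, state, y11, y12, y13, y14, y15, current):
        self.state = state
        self.y11, self.y12, self.y13, self.y14, self.y15 = y11, y12, y13, y14, y15
        self.current = current

    def to_dict(self):
        # Column names other than 'year16' are assumptions.
        return {'state': self.state, 'year11': self.y11, 'year12': self.y12,
                'year13': self.y13, 'year14': self.y14, 'year15': self.y15,
                'year16': self.current}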
Example #8
def import_land_prices(file_name):
    df_landprices = pd.read_excel('resources/' + file_name,
                                  skiprows=[0],
                                  usecols="A,B,C,D,E,H,I")
    df_landprices = df_landprices.loc[df_landprices['Date'] == '2015Q4']
    df_landprices['MSA'] = df_landprices.MSA.str.replace(' ', '')
    #df_landprices.to_sql(name='land_prices', con=dbEngine, index=False, if_exists = 'replace')
    md.create_table(md.connect(), df_landprices, 'land_prices')
Example #9
def import_seaports(file_name):
    df_ports = pd.read_csv("resources/" + file_name, low_memory=False)
    df_ports = df_ports[[
        'LATITUDE1', 'LONGITUDE1', 'CITY_OR_TO', 'STATE_POST', 'ZIPCODE',
        'PORT_NAME'
    ]]
    #df_ports.to_sql(name='seaports', con=dbEngine, if_exists = 'replace')

    md.create_table(md.connect(), df_ports, 'seaports')
Example #10
def build_adaboost_model():
    datafull = pd.read_sql_table('model_data', md.connect())
    data = datafull[[
        'seaport', 'landprice', 'oilreserve', 'existingplants', 'disasters',
        'railroad', 'populationdensity', 'elevation'
    ]]
    target = datafull[['actual']]

    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        target,
                                                        test_size=0.2,
                                                        random_state=3)
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=3)

    AB_clf = AdaBoostClassifier(n_estimators=500, random_state=3)
    AB_clf.fit(x_train, y_train['actual'])

    AB_predicted = cross_val_predict(AB_clf, data, target['actual'])
    AB_scores = cross_val_score(AB_clf, data, target.values.ravel(), cv=cv)

    AB_confusion = confusion_matrix(target, AB_predicted)
    model_names.append('AdaBoost')

    train_accuracy = accuracy_score(y_train, AB_clf.predict(x_train))
    train_accuracies.append(train_accuracy)

    test_accuracy = accuracy_score(y_test, AB_clf.predict(x_test))
    test_accuracies.append(test_accuracy)

    cv_score = accuracy_score(target, AB_predicted)
    cv_scores.append(cv_score)

    print("AdaBoost Classifier Model: ")
    print("  Score of {} for training set: {:.4f}.".format(
        AB_clf.__class__.__name__, train_accuracy))
    print("  Score of {} for test set: {:.4f}.".format(
        AB_clf.__class__.__name__, test_accuracy))
    print("  Cross validation score: %0.2f (+/- %0.2f)" %
          (AB_scores.mean(), AB_scores.std() * 2))
    print("  Predicted values accuracy: %0.2f" %
          (accuracy_score(target, AB_predicted)))

    plot_confusion_matrix(AB_confusion, class_names)
    plt.show()

    print(classification_report(target, AB_predicted))
    print()

    importances = AB_clf.feature_importances_
    indices = np.argsort(importances)[::-1]
    print('Feature Ranking: ')
    for i in range(0, 6):
        print("{} feature no.{} ({})".format(i + 1, indices[i],
                                             importances[indices[i]]))

    return AB_clf
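The model builders in Examples #10, #12, and #15 rely on class_names, the shared score lists, and plot_confusion_matrix being defined at module level, but none of those appear in these snippets. A minimal sketch of those definitions, taking the labels from the commented-out hint in Example #12:

import numpy as np
import matplotlib.pyplot as plt

class_names = ['No', 'Yes']
model_names, train_accuracies, test_accuracies, cv_scores = [], [], [], []

def plot_confusion_matrix(cm, classes):
    # Render the confusion matrix as a labeled heatmap (a sketch; the
    # project's actual plotting helper may differ).
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion matrix')
    plt.colorbar()
    ticks = np.arange(len(classes))
    plt.xticks(ticks, classes)
    plt.yticks(ticks, classes)
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, format(cm[i, j], 'd'), ha='center',
                     color='white' if cm[i, j] > cm.max() / 2 else 'black')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()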
Example #11
def import_existing_plants(file_name):
    plant_locations = pd.read_csv('resources/' + file_name)
    plant_locations = plant_locations[[
        'Facility Name', 'Deregistered (Yes/No)', 'City', 'State', 'Zip Code',
        'Parent Company', 'Latitude', 'Longitude', 'Number of RMP Submissions'
    ]]
    plant_locations = plant_locations[plant_locations.State.notnull()]
    #plant_locations.to_sql(name='plant_locations', con=dbEngine, if_exists = 'replace')

    md.create_table(md.connect(), plant_locations, 'plant_locations')
Example #12
def build_gaussian_model():
    #class_names = ['No', 'Yes']

    datafull = pd.read_sql_table('model_data', md.connect())
    data = datafull[[
        'seaport', 'landprice', 'oilreserve', 'existingplants', 'disasters',
        'railroad', 'populationdensity', 'elevation'
    ]]
    target = datafull[['actual']]

    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        target,
                                                        test_size=0.2,
                                                        random_state=3)
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=3)

    GNB_clf = GaussianNB()

    GNB_clf.fit(x_train, y_train['actual'])

    GNB_predicted = cross_val_predict(GNB_clf, data, target['actual'])
    GNB_scores = cross_val_score(GNB_clf, data, target.values.ravel(), cv=cv)

    GNB_confusion = confusion_matrix(target, GNB_predicted)
    model_names.append('GaussianNB')

    train_accuracy = accuracy_score(y_train, GNB_clf.predict(x_train))
    train_accuracies.append(train_accuracy)

    test_accuracy = accuracy_score(y_test, GNB_clf.predict(x_test))
    test_accuracies.append(test_accuracy)

    cv_score = accuracy_score(target, GNB_predicted)
    cv_scores.append(cv_score)

    print("GaussianNB Model: ")
    print("  Score of {} for training set: {:.4f}.".format(
        GNB_clf.__class__.__name__, train_accuracy))
    print("  Score of {} for test set: {:.4f}.".format(
        GNB_clf.__class__.__name__, test_accuracy))
    print("  Cross validation score: %0.2f (+/- %0.2f)" %
          (GNB_scores.mean(), GNB_scores.std() * 2))
    print("  Predicted values accuracy: %0.2f" % (cv_score))

    #use in slideshow, not in demo
    plot_confusion_matrix(GNB_confusion, class_names)
    plt.show()

    print(classification_report(target, GNB_predicted))
    print()

    return GNB_clf
Example #13
def fetch_weather_data(zipcode):
    engine = md.connect()
    zipcode = str(int(zipcode))
    #print(zipcode)
    query1 = "SELECT * FROM dddm.zip_lookup where zip = '" + zipcode + "' OR zip = '0" + zipcode + "' OR zip = '00" + zipcode + "'"
    zip_data = pd.read_sql(query1, engine)

    # take the scalar value; str() on the whole Series would embed the index
    # and dtype in the query string
    state = str(zip_data['state_id'].iloc[0])

    query2 = "SELECT * FROM dddm.weather_observations where State = '" + state + "'"
    weather_data = pd.read_sql(query2, engine)

    return weather_data
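The fetch functions build SQL by string concatenation, which breaks on quoting and is open to injection. Assuming md.connect() returns a SQLAlchemy engine and zip_lookup stores five-character zips, the same lookup can use bound parameters; fetch_weather_data_safe below is a sketch, not part of the module:

from sqlalchemy import text

def fetch_weather_data_safe(zipcode):
    engine = md.connect()  # assumed to be a SQLAlchemy engine
    zip5 = str(int(zipcode)).zfill(5)  # replaces the three OR branches above
    zip_data = pd.read_sql(text("SELECT * FROM dddm.zip_lookup WHERE zip = :zip"),
                           engine, params={'zip': zip5})
    state = zip_data['state_id'].iloc[0]
    return pd.read_sql(
        text("SELECT * FROM dddm.weather_observations WHERE State = :state"),
        engine, params={'state': state})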
Example #14
def import_earthquakes():
    df_earthquakes = pd.read_csv("resources/USEarthquakes.csv",
                                 low_memory=False)
    df_earthquakes = df_earthquakes[[
        'time', 'latitude', 'longitude', 'mag', 'magType', 'place'
    ]]
    df_earthquakes2 = pd.read_csv("resources/AKEarthquakes.csv",
                                  low_memory=False)
    df_earthquakes2 = df_earthquakes2[[
        'time', 'latitude', 'longitude', 'mag', 'magType', 'place'
    ]]
    df_earthquakes = pd.concat([df_earthquakes, df_earthquakes2],
                               ignore_index=True)
    md.create_table(md.connect(), df_earthquakes, 'earthquake_data')
Example #15
def build_random_forest_model():
    datafull = pd.read_sql_table('model_data', md.connect())
    data = datafull[[
        'seaport', 'landprice', 'oilreserve', 'existingplants', 'disasters',
        'railroad', 'populationdensity', 'elevation'
    ]]
    target = datafull[['actual']]

    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        target,
                                                        test_size=0.2,
                                                        random_state=3)
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=3)

    RF_clf = RandomForestClassifier(n_estimators=200, random_state=3)

    RF_clf.fit(x_train, y_train['actual'].values.ravel())

    RF_predicted = cross_val_predict(RF_clf, data, target['actual'])
    RF_scores = cross_val_score(RF_clf, data, target.values.ravel(), cv=cv)

    RF_confusion = confusion_matrix(target, RF_predicted)

    model_names.append('RandomForest')

    train_accuracy = accuracy_score(y_train, RF_clf.predict(x_train))
    train_accuracies.append(train_accuracy)

    test_accuracy = accuracy_score(y_test, RF_clf.predict(x_test))
    test_accuracies.append(test_accuracy)

    cv_score = accuracy_score(target, RF_predicted)
    cv_scores.append(cv_score)

    print("Random Forest Classifier Model: ")
    print("  Score of {} for training set: {:.4f}.".format(
        RF_clf.__class__.__name__, train_accuracy))
    print("  Score of {} for test set: {:.4f}.".format(
        RF_clf.__class__.__name__, test_accuracy))
    print("  Cross validation score: %0.2f (+/- %0.2f)" %
          (RF_scores.mean(), RF_scores.std() * 2))
    print("  Predicted values accuracy: %0.2f" %
          (accuracy_score(target, RF_predicted)))

    plot_confusion_matrix(RF_confusion, class_names)
    plt.show()

    print(classification_report(target, RF_predicted))
    print()

    return RF_clf
Example #16
def import_rules(file_name):
    with open('resources/' + file_name) as f:
        data = f.read()
    sent = sent_tokenize(data)
    #print(sent)

    #clean sentence and filter based on keywords
    keywords = ['well', 'wells', 'oil', 'drill', 'drilled']
    sentence_list = []
    for sentence in sent:
        sentence = sentence.replace("\n", "")
        for word in keywords:
            if word in sentence:
                sentence_list.append(sentence)
                break

    d = {'rule': sentence_list}
    df = pd.DataFrame(data=d)
    df.to_sql('rules', md.connect(), schema='dddm')
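sent_tokenize is presumably NLTK's sentence tokenizer, which needs its models downloaded once before first use:

import nltk
nltk.download('punkt')  # one-time download of the sentence tokenizer models
from nltk.tokenize import sent_tokenize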
Example #17
def normalize_all():
    engine = md.connect()
    #normalize land prices
    df = pd.read_sql_table('land_prices_final', engine)
    df['home_value_norm'] = (df['Home Value'] - df['Home Value'].min()) / (
        df['Home Value'].max() - df['Home Value'].min())
    df['structure_cost_norm'] = (
        df['Structure Cost'] - df['Structure Cost'].min()) / (
            df['Structure Cost'].max() - df['Structure Cost'].min())
    md.create_table(engine, df, 'land_prices_final')

    #normalize oil reserve data
    df = pd.read_sql_table('oil_reserve_final', engine)
    df['year16'] = df['year16'].str.replace(',', '').astype(float)
    df['year16_norm'] = (df['year16'] - df['year16'].min()) / (
        df['year16'].max() - df['year16'].min())
    md.create_table(engine, df, 'oil_reserve_final')

    #normalize disaster data
    df = pd.read_sql_table('disaster_data_final', engine)
    df['NumFireReferences_norm'] = (
        df['NumFireReferences'] - df['NumFireReferences'].min()) / (
            df['NumFireReferences'].max() - df['NumFireReferences'].min())
    df['NumFloodReferences_norm'] = (
        df['NumFloodReferences'] - df['NumFloodReferences'].min()) / (
            df['NumFloodReferences'].max() - df['NumFloodReferences'].min())
    df['NumHurricaneReferences_norm'] = (
        df['NumHurricaneReferences'] - df['NumHurricaneReferences'].min()) / (
            df['NumHurricaneReferences'].max() -
            df['NumHurricaneReferences'].min())
    md.create_table(engine, df, 'disaster_data_final')

    #normalize railroad data
    df = pd.read_sql_table('railroad_data_final', engine)
    df['Tons_norm'] = (df['Tons'] - df['Tons'].min()) / (df['Tons'].max() -
                                                         df['Tons'].min())
    md.create_table(engine, df, 'railroad_data_final')

    #normalize population density data
    df = pd.read_sql_table('population_density', engine)
    df['density_norm'] = (df['density'] - df['density'].min()) / (
        df['density'].max() - df['density'].min())
    md.create_table(engine, df, 'population_density_final')
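normalize_all repeats the same min-max formula for every column; a small helper (a sketch, not part of the original module) would shorten it:

def min_max(series):
    # Scale a numeric Series linearly onto [0, 1].
    return (series - series.min()) / (series.max() - series.min())

# e.g. df['Tons_norm'] = min_max(df['Tons'])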
Example #18
def fetch_earthquake_data(zipcode):
    engine = md.connect()
    #fetch latitude and longitude for zipcode
    zipcode = str(int(zipcode))
    #print(zipcode)
    query1 = "SELECT * FROM dddm.zip_lookup where zip = '" + zipcode + "' OR zip = '0" + zipcode + "' OR zip = '00" + zipcode + "' LIMIT 0,1"
    zip_data = pd.read_sql(query1, engine)

    # use float, not int: truncating the coordinates would shift the search box
    coord = 1.0
    lat = float(zip_data['lat'].iloc[0])
    lng = float(zip_data['lng'].iloc[0])
    lat_range1 = str(lat + coord)
    lat_range2 = str(lat - coord)

    lng_range1 = str(lng + coord)
    lng_range2 = str(lng - coord)

    query2 = "SELECT * from dddm.earthquake_data where latitude BETWEEN '" + lat_range2 + "' and '" + lat_range1 + "' AND longitude BETWEEN '" \
        + lng_range2 + "' and '" + lng_range1 + "'"
    earthquake_data = pd.read_sql(query2, engine)

    return earthquake_data
Example #19
def fetch_water_data(zipcode):
    engine = md.connect()
    zipcode = str(int(zipcode))
    #print(zipcode)
    query1 = "SELECT * FROM dddm.zip_lookup where zip = '" + zipcode + "' OR zip = '0" + zipcode + "' OR zip = '00" + zipcode + "' LIMIT 0,1"
    zip_data = pd.read_sql(query1, engine)

    # use float, not int: truncating the coordinates would shift the search box
    coord = 0.5
    lat = float(zip_data['lat'].iloc[0])
    lng = float(zip_data['lng'].iloc[0])
    lat_range1 = str(lat + coord)
    lat_range2 = str(lat - coord)

    lng_range1 = str(lng + coord)
    lng_range2 = str(lng - coord)

    query2 = "SELECT * FROM dddm.water_locations where LatitudeMeasure BETWEEN '" \
        + lat_range2 + "' and '" + lat_range1 + "' AND LongitudeMeasure BETWEEN '" \
        + lng_range2 + "' and '" + lng_range1 + "'"

    water_data = pd.read_sql(query2, engine)

    return water_data
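fetch_earthquake_data and fetch_water_data duplicate the zip lookup and the bounding-box arithmetic; a shared helper (hypothetical, not in the original module) could factor that out:

def fetch_zip_bounds(engine, zipcode, delta):
    # Look up a zip's coordinates and return a (lat_min, lat_max, lng_min, lng_max) box.
    zip5 = str(int(zipcode)).zfill(5)
    zip_data = pd.read_sql(
        "SELECT lat, lng FROM dddm.zip_lookup WHERE zip = '" + zip5 + "' LIMIT 0,1",
        engine)
    lat = float(zip_data['lat'].iloc[0])
    lng = float(zip_data['lng'].iloc[0])
    return lat - delta, lat + delta, lng - delta, lng + delta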
Example #20
def fetch_all_zip():
    query = "SELECT zip FROM dddm.test_zip_data_all"
    zip_df = pd.read_sql(query, md.connect())
    zip_list = zip_df['zip'].tolist()
    
    return zip_list
Example #21
# -*- coding: utf-8 -*-
"""
Created on Sun Apr  8 16:21:48 2018

@author: Cameron
"""

import pandas as pd
import mysqlConnection as md

df = pd.read_csv('resources/population_density.csv', sep=",")
md.create_table(md.connect(), df, 'population_density')
Example #22
def read_state_codes(file_name):
    state_code = pd.read_csv('resources/' + file_name, sep='|')
    print(state_code.head(5))
    md.create_table(md.connect(), state_code, 'state_codes')
Example #23
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 12 17:29:35 2018

@author: Cameron
"""

import mysqlConnection as md
import pandas as pd


def add_all_data(engine):
    df = pd.read_csv('exported_data/model_data.csv')
    df.to_sql(name='model_data', con=engine, if_exists='append', index=False)


def add_test_data(engine):
    df = pd.read_csv('exported_data/test_data.csv')
    df.to_sql(name='test_zip_data',
              con=engine,
              if_exists='append',
              index=False)


""" Comment out the data already imported """
engine = md.connect()
add_all_data(engine)
add_test_data(engine)
Example #24
# This snippet starts mid-script: lines, hurlines, floodlines and the other
# per-state reference counts are computed earlier in the file (not shown).
southdakota = len(lines)
hursd = len(hurlines)
floodsd = len(floodlines)

states = [
    'California', 'Louisiana', 'New York', 'Alaska', 'Texas', 'North Carolina',
    'Ohio', 'Massachusetts', 'Utah', 'South Dakota'
]
firelengths = [
    california, louisiana, newyork, alaska, texas, northcarolina, ohio,
    massachusetts, utah, southdakota
]
hurlengths = [
    hurca, hurla, hurny, hurak, hurtx, hurnc, huroh, hurma, hurut, hursd
]
floodlengths = [
    floodca, floodla, floodny, floodak, floodtx, floodnc, floodoh, floodma,
    floodut, floodsd
]
statecodes = ['CA', 'LA', 'NY', 'AK', 'TX', 'NC', 'OH', 'MA', 'UT', 'SD']

df_disaster = pd.DataFrame({
    'State': states,
    'StateCode': statecodes,
    'NumFireReferences': firelengths,
    'NumHurricaneReferences': hurlengths,
    'NumFloodReferences': floodlengths
})

md.create_table(md.connect(), df_disaster, 'disaster_data')
Example #25
def fetch_rules():
    engine = md.connect()
    query = "SELECT * FROM dddm.rules where rule like '%%General%%'"
    rules_data = pd.read_sql(query, engine)
    return rules_data['rule'].iloc[0]
Example #26
def import_smaller_zips(file_name):
    data = pd.read_csv('resources/' + file_name)
    data = data[['City', 'State', 'Zip']]
    md.create_table(md.connect(), data, 'test_zips')
Example #27
def connect():
    return md.connect()
Example #28
def import_railroad_data(file_name):
    # loads the railroad spreadsheet into the railroad_data table
    df_railroad = pd.read_excel('resources/' + file_name)
    md.create_table(md.connect(), df_railroad, 'railroad_data')
Example #29
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 12 13:29:55 2018

@author: Beth
"""

import pandas as pd
import mysqlConnection as md

df = pd.read_csv('resources/BadZipCodes.csv', sep=",")
md.create_table(md.connect(), df, 'unfavorable_zipcodes')