def mergeNGAdata(
        nametrain='/Users/aklimasewski/Documents/data/cybertrainyeti10_residfeb.csv',
        nametest='/Users/aklimasewski/Documents/data/cybertestyeti10_residfeb.csv',
        filenamenga='/Users/aklimasewski/Documents/data/NGA_mag2_9.csv',
        n=13):
    '''
    Read in the NGA data file, train test split, and merge with the CyberShake data

    Parameters
    ----------
    nametrain: path for CyberShake training data csv
    nametest: path for CyberShake testing data csv
    filenamenga: path for NGA data csv
    n: number of model input features

    Returns
    -------
    train_data1: numpy array of training features
    test_data1: numpy array of testing features
    train_targets1: numpy array of training targets
    test_targets1: numpy array of testing targets
    feature_names: numpy array of feature names
    '''
    from sklearn.model_selection import train_test_split

    # read in the CyberShake training and testing data and add the azimuth feature
    train_data1, test_data1, train_targets1, test_targets1, feature_names = readindata(
        nametrain=nametrain, nametest=nametest, n=n)
    train_data1, test_data1, feature_names = add_az(train_data1, test_data1,
                                                    feature_names)

    # read in the NGA data and add the azimuth feature
    nga_data1, nga_targets1, feature_names = readindataNGA(filenamenga, n)
    nga_data1, feature_names = add_azNGA(filenamenga, nga_data1, feature_names)

    # split the NGA records 80/20 and append them to the CyberShake sets
    ngatrain, ngatest, ngatrain_targets, ngatest_targets = train_test_split(
        nga_data1, nga_targets1, test_size=0.2, random_state=1)

    train_data1 = np.concatenate([train_data1, ngatrain], axis=0)
    test_data1 = np.concatenate([test_data1, ngatest], axis=0)
    train_targets1 = np.concatenate([train_targets1, ngatrain_targets], axis=0)
    test_targets1 = np.concatenate([test_targets1, ngatest_targets], axis=0)

    return train_data1, test_data1, train_targets1, test_targets1, feature_names
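# Example usage (illustrative sketch, not part of the original module): merge
# the two data sets before transforming and training. The paths below are
# placeholders for the CyberShake and NGA csv files.
# merged_train, merged_test, merged_train_targets, merged_test_targets, names = mergeNGAdata(
#     nametrain='/path/to/cybertrainyeti10_residfeb.csv',
#     nametest='/path/to/cybertestyeti10_residfeb.csv',
#     filenamenga='/path/to/NGA_mag2_9.csv',
#     n=13)
# print(merged_train.shape, merged_test.shape)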
transform_method = 'Norm'

# compare to NGA data
filenamenga = '/Users/aklimasewski/Documents/data/NGA_mag2_9.csv'
nga_data1, nga_targets1, feature_names = readindataNGA(filenamenga, n)
# nga_data1, feature_names = add_locfeatNGA(filenamenga, nga_data1, feature_names)
if az:
    nga_data1, feature_names = add_azNGA(filenamenga, nga_data1, feature_names)

# read in CyberShake training and testing data
train_data1, test_data1, train_targets1, test_targets1, feature_names = readindata(
    nametrain='/Users/aklimasewski/Documents/data/cybertrainyeti10_residfeb.csv',
    nametest='/Users/aklimasewski/Documents/data/cybertestyeti10_residfeb.csv',
    n=n)
train_data1, test_data1, feature_names = add_az(train_data1, test_data1,
                                                feature_names)

x_train, y_train, x_nga, y_nga, x_range, x_train_raw, x_nga_raw = transform_data(
    transform_method, train_data1, nga_data1, train_targets1, nga_targets1,
    feature_names, folder_pathNGA)

# load the saved model and compute residuals for the NGA records
loadedmodel = keras.models.load_model(folder_path + 'model/')
pre_nga = loadedmodel.predict(x_nga)
resid_nga = np.asarray(nga_targets1) - pre_nga
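# Illustrative sketch (not in the original script): summarize the NGA
# residuals per period. Assumes resid_nga has one column per period, in the
# same order as the period list used elsewhere in this repo.
period = [10, 7.5, 5, 4, 3, 2, 1, 0.5, 0.2, 0.1]
for p, mean_r, std_r in zip(period, np.mean(resid_nga, axis=0),
                            np.std(resid_nga, axis=0)):
    print('T' + str(p) + ': mean resid = ' + str(round(mean_r, 3)) +
          ', std = ' + str(round(std_r, 3)))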
def ANN_2step(folder_pathmod1, folder_pathmod2, epochs1=50, epochs2=50,
              numlayers1=1, numlayers2=1, units1=[20], units2=[20]):
    '''
    2 ANNs: the 1st is the base ANN; the 2nd ANN uses the 1st model's
    residuals as targets and cell location features as inputs

    Parameters
    ----------
    folder_pathmod1: path for saving png files for the first ANN
    folder_pathmod2: path for saving png files for the second ANN
    epochs1: number of training epochs for the first ANN
    epochs2: number of training epochs for the second ANN
    numlayers1: integer number of hidden layers for the first ANN
    numlayers2: integer number of hidden layers for the second ANN
    units1: array of number of units for hidden layers for the first ANN
    units2: array of number of units for hidden layers for the second ANN

    Returns
    -------
    None. Creates two ANNs and saves model files and figures
    '''
    from sklearn.preprocessing import PowerTransformer

    if not os.path.exists(folder_pathmod1):
        os.makedirs(folder_pathmod1)

    # read in training, testing, and cell data
    train_data1, test_data1, train_targets1, test_targets1, feature_names = readindata(
        nametrain='/Users/aklimasewski/Documents/data/cybertrainyeti10_residfeb.csv',
        nametest='/Users/aklimasewski/Documents/data/cybertestyeti10_residfeb.csv',
        n=n)
    train_data1, test_data1, feature_names = add_az(train_data1, test_data1,
                                                    feature_names)

    cells = pd.read_csv(folder_path + 'gridpointslatlon_train.csv',
                        header=0, index_col=0)
    cells_test = pd.read_csv(folder_path + 'gridpointslatlon_test.csv',
                             header=0, index_col=0)

    # first ANN: base model features
    x_train, y_train, x_test, y_test, x_range, x_train_raw, x_test_raw = transform_data(
        transform_method, train_data1, test_data1, train_targets1,
        test_targets1, feature_names, folder_pathmod1)
    resid, resid_test, pre_train, pre_test = create_ANN(
        x_train, y_train, x_test, y_test, feature_names, numlayers1, units1,
        epochs1, transform_method, folder_pathmod1)
    period = [10, 7.5, 5, 4, 3, 2, 1, 0.5, 0.2, 0.1]
    plot_resid(resid, resid_test, folder_pathmod1)

    # second ANN: first-model residuals as targets, cell locations as features
    if not os.path.exists(folder_pathmod2):
        os.makedirs(folder_pathmod2)

    train_targets1 = resid
    test_targets1 = resid_test
    train_data1 = np.asarray(cells)
    test_data1 = np.asarray(cells_test)
    # use a separate name for the second transform so the first transform_data
    # call above still resolves to the module-level transform_method
    transform_method2 = PowerTransformer()
    feature_names = np.asarray(['eventlat', 'eventlon', 'midlat', 'midlon',
                                'sitelat', 'sitelon'])

    x_train, y_train, x_test, y_test, x_range, x_train_raw, x_test_raw = transform_data(
        transform_method2, train_data1, test_data1, train_targets1,
        test_targets1, feature_names, folder_pathmod2)
    resid, resid_test, pre_train, pre_test = create_ANN(
        x_train, y_train, x_test, y_test, feature_names, numlayers2, units2,
        epochs2, transform_method2, folder_pathmod2)
    period = [10, 7.5, 5, 4, 3, 2, 1, 0.5, 0.2, 0.1]
    plot_resid(resid, resid_test, folder_pathmod2)
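# Example usage (illustrative sketch, not part of the original module): the
# subdirectory names are placeholders, and the call relies on the module-level
# globals n, folder_path, and transform_method being set first, as in the
# scripts below.
# ANN_2step(folder_pathmod1=folder_path + 'ANN1/',
#           folder_pathmod2=folder_path + 'ANN2/',
#           epochs1=15, epochs2=15,
#           numlayers1=1, numlayers2=1,
#           units1=[20], units2=[20])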
def ANN_gridpoints(folder_pathmod, epochs=50, numlayers=1, units=[20]):
    '''
    ANN with cell locations as additional features

    Parameters
    ----------
    folder_pathmod: path for saving png files
    epochs: number of training epochs
    numlayers: integer number of hidden layers
    units: array of number of units for hidden layers

    Returns
    -------
    None. Creates an ANN and saves model files and figures
    '''
    cells = pd.read_csv(folder_path + 'gridpointslatlon_train.csv',
                        header=0, index_col=0)
    cells_test = pd.read_csv(folder_path + 'gridpointslatlon_test.csv',
                             header=0, index_col=0)

    if not os.path.exists(folder_pathmod):
        os.makedirs(folder_pathmod)

    transform_method = 'Norm'  # function or text
    n = 13
    train_data1, test_data1, train_targets1, test_targets1, feature_names = readindata(
        nametrain='/Users/aklimasewski/Documents/data/cybertrainyeti10_residfeb.csv',
        nametest='/Users/aklimasewski/Documents/data/cybertestyeti10_residfeb.csv',
        n=n)
    train_data1, test_data1, feature_names = add_az(train_data1, test_data1,
                                                    feature_names)

    # add the cell features
    train_data1 = np.concatenate([train_data1, cells], axis=1)
    test_data1 = np.concatenate([test_data1, cells_test], axis=1)
    feature_names = np.concatenate([feature_names,
                                    ['eventlat', 'eventlon', 'midlat',
                                     'midlon', 'sitelat', 'sitelon']], axis=0)

    x_train, y_train, x_test, y_test, x_range, x_train_raw, x_test_raw = transform_data(
        transform_method, train_data1, test_data1, train_targets1,
        test_targets1, feature_names, folder_pathmod)
    resid, resid_test, pre_train, pre_test = create_ANN(
        x_train, y_train, x_test, y_test, feature_names, numlayers, units,
        epochs, transform_method, folder_pathmod)
    plot_resid(resid, resid_test, folder_pathmod)
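# Example usage (illustrative sketch, not part of the original module): the
# output directory name is a placeholder; folder_path must point at the
# directory holding the gridpointslatlon csvs.
# ANN_gridpoints(folder_pathmod=folder_path + 'ANN_gridpoints/',
#                epochs=15, numlayers=1, units=[20])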
def avgpath_resid(df, folder_path, savename):
    '''
    loops through each record and multiplies the average normalized target per
    cell by the path length in each cell

    Parameters
    ----------
    df: pandas dataframe of shapely polygons and midpoint of each grid cell in lat, lon
    folder_path: path for saving csv files
    savename: string for name of csv file (ie training or testing)

    Returns
    -------
    path_target_sum: list of sum of path length per cell * average normalized target per cell
    saves path target sum per record in a csv
    '''
    import shapely
    import shapely.geometry
    import shapely.wkt
    from preprocessing import readindata
    import numpy as np
    import geopy
    import geopy.distance
    import pandas as pd

    # the two branches differ only in the target column names
    if savename == 'train':
        targets = df[['T10', 'T7.5', 'T5', 'T4', 'T3', 'T2', 'T1', 'T0.5',
                      'T0.2', 'T0.1']]
    else:
        targets = df[['T10test', 'T7.5test', 'T5test', 'T4test', 'T3test',
                      'T2test', 'T1test', 'T0.5test', 'T0.2test', 'T0.1test']]
    list_wkt = df['polygon']
    list_polygons = [shapely.wkt.loads(poly) for poly in list_wkt]

    n = 6
    train_data1, test_data1, train_targets1, test_targets1, feature_names = readindata(
        nametrain='/Users/aklimase/Documents/USGS/data/cybertrainyeti10_residfeb.csv',
        nametest='/Users/aklimase/Documents/USGS/data/cybertestyeti10_residfeb.csv',
        n=n)

    hypoR = train_data1[:, 0]
    sitelat = train_data1[:, 1]
    sitelon = train_data1[:, 2]
    evlat = train_data1[:, 3]
    evlon = train_data1[:, 4]
    target = train_targets1[:]

    path_target_sum = np.zeros((len(hypoR), 10))  # length of number of records
    # loop through each record
    for i in range(len(sitelat)):
        line = [(evlon[i], evlat[i]), (sitelon[i], sitelat[i])]
        path = shapely.geometry.LineString(line)
        if (i % 1000) == 0:
            print('record: ', str(i))
        pathsum = 0
        # loop through each grid cell
        for j in range(len(list_polygons)):
            shapely_poly = shapely.geometry.Polygon(list_polygons[j])
            if path.intersects(shapely_poly):
                intersection_line = list(
                    shapely_poly.intersection(path).coords)
                if len(intersection_line) == 2:
                    coords_1 = (intersection_line[0][1],
                                intersection_line[0][0])
                    coords_2 = (intersection_line[1][1],
                                intersection_line[1][0])
                    length = geopy.distance.distance(coords_1, coords_2).km
                    # weight the cell's average normalized target by the
                    # path length within the cell
                    pathsum += length * np.asarray(targets.iloc[j])
        path_target_sum[i] = pathsum

    df_out = pd.DataFrame(path_target_sum,
                          columns=['T10', 'T7.5', 'T5', 'T4', 'T3', 'T2',
                                   'T1', 'T0.5', 'T0.2', 'T0.1'])
    df_out.to_csv(folder_path + 'avgrecord_targets_' + savename + '.csv')
    return path_target_sum
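# Example usage (illustrative sketch, not part of the original module):
# assumes a dataframe with a 'polygon' column and per-period target columns
# was saved earlier (e.g. by the gridding step); the csv name is a placeholder.
# df_grid = pd.read_csv(folder_path + 'griddata_train.csv', header=0, index_col=0)
# path_sums = avgpath_resid(df_grid, folder_path, savename='train')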
sns.reset_defaults()
# sns.set_style('whitegrid')
# sns.set_context('talk')
sns.set_context(context='talk', font_scale=0.7)

tfd = tfp.distributions

###############
# recreate the demo
folder_path = '/Users/aklimase/Documents/USGS/models/VGPlayer/'
nametrain = '/Users/aklimase/Documents/USGS/data/cybertrainyeti10_residfeb.csv'
nametest = '/Users/aklimase/Documents/USGS/data/cybertestyeti10_residfeb.csv'

train_data1, test_data1, train_targets1, test_targets1, feature_names = readindata(
    nametrain, nametest, n=12)

#%%
transform_method = Normalizer()
x_train, y_train, x_test, y_test, x_range, x_train_raw, x_test_raw = transform_data(
    transform_method, train_data1, test_data1, train_targets1, test_targets1,
    feature_names, folder_path)

# keep only the first two periods as targets for the demo
y_test = y_test[:, 0:2]
y = y_train[:, 0:2]

# per-feature min and max of the transformed training data
x_range = [[min(x_train.T[i]) for i in range(len(x_train[0]))],
           [max(x_train.T[i]) for i in range(len(x_train[0]))]]
sns.set_context(context='talk', font_scale=0.7)

'''
first ANN with the base model features
'''
topdir = '/Users/aklimasewski/Documents/'
folder_path = topdir + 'model_results/2step_ANN/model13/'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

transform_method = 'Norm'
epochs = 15

train_data1, test_data1, train_targets1, test_targets1, feature_names = readindata(
    nametrain=topdir + 'data/cybertrainyeti10_residfeb.csv',
    nametest=topdir + 'data/cybertestyeti10_residfeb.csv',
    n=13)

x_train, y_train, x_test, y_test, x_range, x_train_raw, x_test_raw = transform_data(
    transform_method, train_data1, test_data1, train_targets1, test_targets1,
    feature_names, folder_path)

numlayers = 1
units = [20]
resid_train, resid_test, pre_train, pre_test = create_ANN(
    x_train, y_train, x_test, y_test, feature_names, numlayers, units, epochs,
    transform_method, folder_path)

period = [10, 7.5, 5, 4, 3, 2, 1, 0.5, 0.2, 0.1]
plot_resid(resid_train, resid_test, folder_path)

'''
def mergeNGAdata_cells(
        nametrain='/Users/aklimasewski/Documents/data/cybertrainyeti10_residfeb.csv',
        nametest='/Users/aklimasewski/Documents/data/cybertestyeti10_residfeb.csv',
        filenamenga='/Users/aklimasewski/Documents/data/NGA_mag2_9.csv',
        n=13):
    '''
    Read in the NGA data file, add grid cell location features, train test
    split, and merge with the CyberShake data

    Parameters
    ----------
    nametrain: path for CyberShake training data csv
    nametest: path for CyberShake testing data csv
    filenamenga: path for NGA data csv
    n: number of model input features

    Returns
    -------
    train_data1: numpy array of training features
    test_data1: numpy array of testing features
    train_targets1: numpy array of training targets
    test_targets1: numpy array of testing targets
    feature_names: numpy array of feature names
    '''
    from sklearn.model_selection import train_test_split

    # cell location features per record
    cells = pd.read_csv(folder_path + 'gridpointslatlon_train.csv',
                        header=0, index_col=0)
    cells_test = pd.read_csv(folder_path + 'gridpointslatlon_test.csv',
                             header=0, index_col=0)
    cells_nga = pd.read_csv(folder_path + 'gridpointslatlon_nga.csv',
                            header=0, index_col=0)

    train_data1, test_data1, train_targets1, test_targets1, feature_names = readindata(
        nametrain=nametrain, nametest=nametest, n=n)
    train_data1, test_data1, feature_names = add_az(train_data1, test_data1,
                                                    feature_names)

    nga_data1, nga_targets1, feature_names = readindataNGA(filenamenga, n)
    nga_data1, feature_names = add_azNGA(filenamenga, nga_data1, feature_names)

    # append the cell features as columns, then split the NGA records 80/20
    nga_data1 = np.concatenate([nga_data1, cells_nga], axis=1)
    ngatrain, ngatest, ngatrain_targets, ngatest_targets = train_test_split(
        nga_data1, nga_targets1, test_size=0.2, random_state=1)

    feature_names = np.concatenate([feature_names,
                                    ['eventlat', 'eventlon', 'midlat',
                                     'midlon', 'sitelat', 'sitelon']], axis=0)

    train_data1 = np.concatenate([train_data1, cells], axis=1)
    test_data1 = np.concatenate([test_data1, cells_test], axis=1)

    train_data1 = np.concatenate([train_data1, ngatrain], axis=0)
    test_data1 = np.concatenate([test_data1, ngatest], axis=0)
    train_targets1 = np.concatenate([train_targets1, ngatrain_targets], axis=0)
    test_targets1 = np.concatenate([test_targets1, ngatest_targets], axis=0)

    return train_data1, test_data1, train_targets1, test_targets1, feature_names
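# Example usage (illustrative sketch, not part of the original module): relies
# on the module-level folder_path pointing at the directory holding the
# gridpointslatlon csvs; the data paths are placeholders.
# train_all, test_all, train_targets_all, test_targets_all, names = mergeNGAdata_cells(
#     nametrain='/path/to/cybertrainyeti10_residfeb.csv',
#     nametest='/path/to/cybertestyeti10_residfeb.csv',
#     filenamenga='/path/to/NGA_mag2_9.csv',
#     n=13)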
os.makedirs(folder_path)

'''
grids data targets per period, normalizes by path length, and finds the
average normalized residual per grid cell

creates an ANN model including the average path term per period as an input feature

comment out the gridding if the csv files are already saved
'''
period = [10, 7.5, 5, 4, 3, 2, 1, 0.5, 0.2, 0.1]

df, lon, lat = create_grid(latmin=30, latmax=38, lonmin=-121, lonmax=-115,
                           dx=1.0)

train_data1, test_data1, train_targets1, test_targets1, feature_names = readindata(
    nametrain=topdir + 'data/cybertrainyeti10_residfeb.csv',
    nametest=topdir + 'data/cybertestyeti10_residfeb.csv',
    n=6)

hypoR, sitelat, sitelon, evlat, evlon, target, gridded_targetsnorm_list, gridded_counts = grid_data(
    train_data1, train_targets1, df=df)
hypoR_test, sitelat_test, sitelon_test, evlat_test, evlon_test, target_test, gridded_targetsnorm_list_test, gridded_counts_test = grid_data(
    test_data1, test_targets1, df=df)

gridded_mean, gridded_mean_test = mean_grid_save(
    gridded_targetsnorm_list, gridded_targetsnorm_list_test, gridded_counts,
    gridded_counts_test, df, folder_path)

gridded_plots(gridded_mean, gridded_counts, period, lat, lon, evlon, evlat,
              sitelon, sitelat, folder_path + 'traingrid/')
gridded_plots(gridded_mean_test, gridded_counts_test, period, lat, lon,
              evlon_test, evlat_test, sitelon_test, sitelat_test,
              folder_path + 'testgrid/')