def main():
    # 1. Load data
    fname2 = "../data/stendalmark_20181122_RAW_export.xyz"
    fname1 = "../data/stendalmark_20181121_RAW_export.xyz"
    fname0 = "../data/stendalmark_20181120_RAW_export.xyz"
    fname = "../data/vildbjerg_20171101_RAW_export.xyz"
    _, dbdt, lbl, timestamp, _ = load_data2(fname1, 8, 24)
    _, dbdt1, lbl1, timestamp1, _ = load_data2(fname0, 8, 24)
    _, dbdt2, lbl2, timestamp2, _ = load_data2(fname, 8, 24)
    dbdt, lbl, timestamp = remove_edge(timestamp, dbdt, lbl, 20)
    dbdt1, lbl1, timestamp1 = remove_edge(timestamp1, dbdt1, lbl1, 20)
    dbdt2, lbl2, timestamp2 = remove_edge(timestamp2, dbdt2, lbl2, 20)
    timestamp = timestampToTime(timestamp)

    # Normalize by sign and log
    # dbdt_norm = np.sign(dbdt) * np.log(np.abs(dbdt))

    # 2. Split into training and test sets (80/20, no shuffling)
    r = int(np.ceil(0.8 * dbdt.shape[0]))
    dbdt_train = dbdt[0:r, :]
    lbl_train = lbl[0:r]
    # Train only on soundings labelled as non-coupled (lbl == 1)
    dbdt_train = dbdt_train[lbl_train == 1, :]
    # dbdt_train = np.concatenate((dbdt1[lbl1 == 1, :], dbdt2[lbl2 == 1, :]), axis=0)
    dbdt_testOG = dbdt[r:, :]
    dbdt_test = dbdt[r:, :]
    lbl_test = lbl[r:]

    # 3. Normalise to zero mean, unit standard deviation
    sc = StandardScaler()
    dbdt_train = sc.fit_transform(dbdt_train)
    dbdt_test = sc.transform(dbdt_test)

    # Build input array, where a sounding and its neighbours are stacked
    autoencoder(dbdt_train, dbdt_test, lbl_test)
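# The autoencoder() helper above is defined elsewhere in this repo. Below is a
# minimal sketch of what such a function could look like, assuming a Keras
# dense autoencoder trained only on non-coupled soundings and scoring test
# soundings by reconstruction error. Layer sizes, epochs, and the scoring are
# illustrative assumptions, not the repo's actual implementation.
def autoencoder_sketch(X_train, X_test, y_test):
    from keras.models import Sequential
    from keras.layers import Dense

    n_gates = X_train.shape[1]
    model = Sequential([
        Dense(8, activation='relu', input_shape=(n_gates,)),
        Dense(3, activation='relu'),  # bottleneck
        Dense(8, activation='relu'),
        Dense(n_gates, activation='linear'),
    ])
    model.compile(optimizer='adam', loss='mse')
    # Learn to reconstruct non-coupled soundings only
    model.fit(X_train, X_train, epochs=50, batch_size=32, verbose=0)

    # Soundings the model reconstructs poorly are candidate couplings
    recon = model.predict(X_test)
    err = np.mean((X_test - recon) ** 2, axis=1)
    return err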
def main():
    # Data preprocessing: load two Stendalmark flights and stitch them together
    fname0 = "../data/stendalmark_20181120_RAW_export.xyz"
    fname1 = "../data/stendalmark_20181121_RAW_export.xyz"
    df, dbdt, lbl, timestamp, gtimes = load_data2(fname0, 8, 24)
    dbdt, lbl, timestamp = remove_edge(timestamp, dbdt, lbl, 20)
    timestamp = (timestamp - timestamp[0]) * 10**5

    df0, dbdt0, lbl0, timestamp0, gtimes0 = load_data2(fname1, 8, 24)
    dbdt0, lbl0, timestamp0 = remove_edge(timestamp0, dbdt0, lbl0, 20)
    # Offset the second flight so the stitched time axis stays monotonic
    timestamp0 = (timestamp0 - timestamp0[0]) * 10**5 + timestamp[-1] + 0.7

    dbdt = np.concatenate((dbdt, dbdt0))
    lbl = np.concatenate((lbl, lbl0))
    timestamp = np.concatenate((timestamp, timestamp0))

    # Keep the first 80 % of the stitched data
    r = int(np.ceil(0.8 * dbdt.shape[0]))
    dbdt = dbdt[0:r, :]
    lbl = lbl[0:r]
    timestamp = timestamp[0:r]

    # Use the last of the five unshuffled folds as the test set
    splits = 5
    i = 0
    kf = KFold(n_splits=splits, shuffle=False)
    for trainidx, testidx in kf.split(dbdt):
        X_train, X_test = dbdt[trainidx], dbdt[testidx]
        y_train, y_test = lbl[trainidx], lbl[testidx]
        time_test = timestamp[testidx]
        X_testOG = X_test
        i += 1
        if i == splits:
            break

    # Train only on non-coupled soundings, then standardise
    X_train = X_train[y_train == 1, :]
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # Learning curve: train on growing fractions of the training set
    fig, axROC = plt.subplots()
    ndata = np.linspace(0.01, 1, 50)
    AUCarray = np.zeros((len(ndata),))
    for i, frac in enumerate(ndata):
        n = int(np.ceil(frac * X_train.shape[0]))
        y_score, _, _, _, CM = autoencoder2(X_train[0:n, :], X_test, y_test,
                                            X_testOG, time_test, True)
        AUC, _ = data_visualize.plot_roc(y_test, y_score, axROC, i, pos=0)
        AUCarray[i] = CM[1, 0] + CM[0, 1]  # total misclassifications, not AUC
        plt.close('all')
        print(i)

    plt.figure("Learn data")
    plt.plot(ndata, AUCarray)
    plt.ylabel("Total errors")
    plt.xlabel("Training set in use [Fraction]")
    plt.show()
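# CM[1, 0] + CM[0, 1] above sums the off-diagonal entries of the confusion
# matrix, i.e. false negatives plus false positives. A self-contained check
# with made-up labels, using sklearn's confusion_matrix:
import numpy as np
from sklearn.metrics import confusion_matrix

y_true = np.array([0, 0, 1, 1, 1, 0])
y_pred = np.array([0, 1, 1, 0, 1, 0])
CM = confusion_matrix(y_true, y_pred)
# CM[i, j] counts samples of true class i predicted as class j,
# so the off-diagonal sum is the total number of errors.
print(CM[1, 0] + CM[0, 1])  # -> 2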
def main():
    ## Data preprocessing
    fname0 = "../data/stendalmark_20181120_RAW_export.xyz"
    fname1 = "../data/stendalmark_20181121_RAW_export.xyz"
    df, dbdt, lbl, timestamp, gtimes = load_data2(fname0, 8, 24)
    dbdt, lbl, timestamp = remove_edge(timestamp, dbdt, lbl, 20)
    timestamp = (timestamp - timestamp[0]) * 10**5

    df0, dbdt0, lbl0, timestamp0, gtimes0 = load_data2(fname1, 8, 24)
    dbdt0, lbl0, timestamp0 = remove_edge(timestamp0, dbdt0, lbl0, 20)
    # Offset the second flight so the stitched time axis stays monotonic
    timestamp0 = (timestamp0 - timestamp0[0]) * 10**5 + timestamp[-1] + 0.7

    dbdt = np.concatenate((dbdt, dbdt0))
    lbl = np.concatenate((lbl, lbl0))
    timestamp = np.concatenate((timestamp, timestamp0))

    sc = StandardScaler()
    # Keep the first 80 % of the stitched data
    r = int(np.ceil(0.8 * dbdt.shape[0]))
    dbdt = dbdt[0:r, :]
    lbl = lbl[0:r]
    timestamp = timestamp[0:r]

    # Split the remainder 80/20 into training and test sets
    idx = range(0, dbdt.shape[0])
    v = int(np.ceil(0.8 * dbdt.shape[0]))
    X_test = dbdt[v:, :]
    X_train = dbdt[0:v, :]
    y_train = lbl[0:v]
    y_test = lbl[v:]
    idx_train = list(idx[0:v])

    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    # X_train = build_array_skytem(41, X_train)
    # X_test = build_array_skytem(41, X_test)
    # y_score, _, report, acc = rnd_forest2(timestamp, dbdt, lbl, idx[r:],
    #                                       X_train.T, X_test.T,
    #                                       y_train[20:-20], y_test[20:-20], n_trees=100)

    # Learning curve: train a 100-tree forest on growing fractions of the
    # training set (an earlier sweep over tree counts is kept commented out)
    # trees = range(1, 100)
    fracs = np.linspace(0.01, 1.0, 50)
    metric = np.zeros((len(fracs),))
    for i, frac in enumerate(fracs):
        # random.shuffle(idx_train)
        n = int(np.ceil(frac * X_train.shape[0]))
        _, _, report, acc, auc, CM = rnd_forest2(timestamp[v:], dbdt, lbl, X_test,
                                                 X_train[idx_train[0:n], :], X_test,
                                                 y_train[idx_train[0:n]], y_test,
                                                 n_trees=100)
        metric[i] = CM[1, 0] + CM[0, 1]  # total misclassifications
        plt.close('all')
        print("Iteration: ", i)

    plt.figure()
    plt.plot(fracs, metric)
    plt.ylabel("Total errors")
    plt.xlabel("Training set in use [Fraction]")
    plt.show()
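# rnd_forest2 lives elsewhere in this repo; a minimal sketch of the fit/score
# step it presumably wraps, using sklearn's RandomForestClassifier. The
# signature and return values here are simplified assumptions.
def rnd_forest_sketch(X_train, y_train, X_test, y_test, n_trees=100):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import confusion_matrix

    clf = RandomForestClassifier(n_estimators=n_trees)
    clf.fit(X_train, y_train)
    # Probability of the positive (non-coupled) class as a score
    y_score = clf.predict_proba(X_test)[:, 1]
    CM = confusion_matrix(y_test, clf.predict(X_test))
    return y_score, CM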
def plotNormalnratio():
    fname = "../data/20171101_RAW_export.xyz"
    _, dbdt, lbl, timestamp = load_data2(fname, 8, 20)
    ratio = difference.row_ratio(timestamp, dbdt)
    data_visualize.plotDat(timestamp, ratio, lbl)
    data_visualize.plotDat(timestamp, dbdt, lbl)
    plt.yscale('log')
    plt.show()
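# difference.row_ratio is a project helper; a plausible sketch under the
# assumption that it returns the gate-wise ratio between consecutive
# soundings. That interpretation is an assumption, not confirmed by the repo.
def row_ratio_sketch(timestamp, dbdt):
    # Ratio of each sounding to its predecessor, gate by gate; values far
    # from 1 flag abrupt changes between neighbouring soundings.
    return dbdt[1:, :] / dbdt[:-1, :]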
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MinMaxScaler, Normalizer, PolynomialFeatures
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
# tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
# features = tpot_data.drop('target', axis=1).values
# training_features, testing_features, training_target, testing_target = \
#     train_test_split(features, tpot_data['target'].values, random_state=42)

from utilities.data_reader import load_data2

fname = "../data/20171101_RAW_export.xyz"
# fname = "../data/stendalmark_20181120_RAW_export.xyz"
df, dbdt, lbl, timestamp = load_data2(fname, 8, 23)
training_features, testing_features, training_target, testing_target = \
    train_test_split(dbdt, lbl, random_state=42)

# Rebalance the training set. Despite the variable name, this uses SMOTEENN
# (SMOTE over-sampling combined with edited-nearest-neighbour cleaning),
# not random under-sampling.
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, CondensedNearestNeighbour
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTEENN

rus = SMOTEENN()
training_features, training_target = rus.fit_sample(training_features, training_target)

from SVM.tpot_exported_pipeline_4_bernoulliEXTT import pipe4bernoulliET
from SVM.tpot_exported_pipeline_5_ET import pipe5ET

pipe5ET(training_features, testing_features, training_target, testing_target)
exit()
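# To see what the resampler actually does to the class balance, a toy check
# with collections.Counter (the data and labels below are made up; fit_sample
# matches the older imblearn API used above):
from collections import Counter
from imblearn.combine import SMOTEENN
import numpy as np

X_demo = np.random.randn(100, 5)
y_demo = np.array([0] * 10 + [1] * 90)  # imbalanced toy labels
X_res, y_res = SMOTEENN().fit_sample(X_demo, y_demo)
# SMOTE inflates the minority class; ENN then removes noisy samples.
print(Counter(y_demo), "->", Counter(y_res))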
    # Length of the original data: sequence length, plus step size for each
    # row other than the first, i.e. the total number of soundings,
    # accounting for overlapping windows.
    nsounding = lblA.shape[1] + (lblA.shape[0] - 1) * stepzise

    # Create an empty array. Each row holds predictions for the corresponding
    # window's columns; the remaining columns are NaN. A sounding's score is
    # then averaged over every window's prediction for it.
    nanA = np.empty((lblA.shape[0], nsounding))
    nanA[:] = np.nan
    for i in range(0, lblA.shape[0]):
        idx = i * stepzise
        nanA[i, idx:idx + lblA.shape[1]] = lblA[i, :, 0]
    assert not np.isnan(nanA[-1, -1]), "last element did not get a score"
    return np.nanmean(nanA, axis=0)


fname = "../data/stendalmark_20181120_RAW_export.xyz"
df, dbdt, lbl, timestamp, gtimes = load_data2(fname, 8, 24)

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
r = int(np.ceil(0.8 * dbdt.shape[0]))
X_train = dbdt[0:r, :]
X_test = dbdt[r:, :]
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
lbl_train2 = lbl[0:r]
lbl_test = lbl[r:]
w = 30  # window length
s = 1   # step size
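# A tiny worked example of the NaN-padded averaging above, with made-up
# window scores: three windows of length 3 at step size 1 cover five
# soundings, and each sounding's score is the mean over the windows
# covering it.
import numpy as np

scores = np.array([[0.1, 0.2, 0.3],
                   [0.4, 0.5, 0.6],
                   [0.7, 0.8, 0.9]])
step, nsound = 1, 3 + (3 - 1) * 1
A = np.full((3, nsound), np.nan)
for i in range(3):
    A[i, i * step:i * step + 3] = scores[i]
print(np.nanmean(A, axis=0))  # -> [0.1, 0.3, 0.5, 0.7, 0.9]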
# from SVM.SVM_1 import SVM_classify

# fig, ax = plt.subplots()
# ax.plot([0, 0, 1], [0, 1, 1], "r-", label="Perfect classifier")
# ax.set_ylim([0, 1])
# ax.set_xlim([0, 1])
# ax.plot([0, 1], [0, 1], color="tab:gray", linestyle="-.", label="50% line")
# ax.set_ylabel('True positive rate'); ax.set_xlabel('False positive rate')
# ax.legend()
# plt.show()
# exit()

fname = "../data/vildbjerg_20171101_modified_RAW_export.xyz"
fname1 = "../data/stendalmark_20181121_RAW_export.xyz"
fname0 = "../data/stendalmark_20181120_RAW_export.xyz"

df, dbdt, lbl, timestamp, gtimes = load_data2(fname0, 8, 24)
dbdt, lbl, timestamp = remove_edge(timestamp, dbdt, lbl, 20)
timestamp = (timestamp - timestamp[0]) * 10**5

df0, dbdt0, lbl0, timestamp0, gtimes0 = load_data2(fname1, 8, 24)
dbdt0, lbl0, timestamp0 = remove_edge(timestamp0, dbdt0, lbl0, 20)
timestamp0 = (timestamp0 - timestamp0[0]) * 10**5 + timestamp[-1] + 0.7

dbdt = np.concatenate((dbdt, dbdt0))
lbl = np.concatenate((lbl, lbl0))
timestamp = np.concatenate((timestamp, timestamp0))

# Class balance: lbl == 0 marks coupled soundings
print("coupled fraction ", sum(lbl == 0) / dbdt.shape[0])
print("total coupled ", dbdt[lbl == 0, :].shape[0])
print("total ", dbdt.shape[0])

data_visualize.plotDat(range(0, len(timestamp)), dbdt, lbl)
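# remove_edge is a project helper used throughout these scripts; a plausible
# sketch, assuming it simply drops a fixed number of soundings at each end of
# a flight line, where edge effects distort the data. The exact behaviour is
# an assumption.
def remove_edge_sketch(timestamp, dbdt, lbl, n):
    # Trim n soundings from both ends, mirroring the call signature above.
    return dbdt[n:-n, :], lbl[n:-n], timestamp[n:-n]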
from keras.layers import Dense
from sklearn.preprocessing import StandardScaler

from NN.window import build_array_skytem
# from neupy.layers import *
# from neupy import algorithms

## Data preprocessing
from utilities.data_reader import load_data2, remove_edge
from utilities.data_visualize import plot_training, plot_misclassified, dist_score

fname2 = "../data/stendalmark_20181122_RAW_export.xyz"
fname1 = "../data/stendalmark_20181121_RAW_export.xyz"
fname0 = "../data/stendalmark_20181120_RAW_export.xyz"
fname = "../data/vildbjerg_20171101_RAW_export.xyz"

df, dbdt, lbl, timestamp, _ = load_data2(fname1, 8, 24)
dbdt, lbl, timestamp = remove_edge(timestamp, dbdt, lbl, 20)

if 1:
    # Normalize by sign and log
    dbdt_norm = np.sign(dbdt) * np.log(np.abs(dbdt))

    # Split into training and test sets. The one-sounding offsets on the
    # labels appear to account for the trimming done when neighbouring
    # soundings are stacked (see build_array_skytem).
    r = int(np.ceil(0.8 * dbdt.shape[0]))
    dbdt_train = dbdt_norm[0:r, :]
    lbl_train = lbl[1:r - 1]
    dbdt_testOG = dbdt[r + 1:-1]
    dbdt_test = dbdt_norm[r:]
    lbl_test = lbl[r + 1:-1]

    # Normalise to zero mean, unit standard deviation
    sc = StandardScaler()
    dbdt_train = sc.fit_transform(dbdt_train)
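# The sign-log transform above keeps the sign of each dB/dt gate while
# compressing its several-decade dynamic range. A quick numeric illustration
# with made-up values; note that for |x| < 1 the log is negative, so the sign
# convention of the output flips relative to the input.
import numpy as np

x = np.array([-1e-3, -1e-7, 1e-7, 1e-3])
print(np.sign(x) * np.log(np.abs(x)))
# -> [  6.9078  16.1181 -16.1181  -6.9078]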