def main():
    configs = json.load(open('Configuration.json', 'r'))
    epochs = configs['training']['epochs']
    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']
    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']
    timeseries_path = configs['paths']['data_path']

    ## read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(timeseries_path + "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(non_smotedtime_series, dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    ## start working per outcome
    for outcome in outcomes:
        decision_maker = DecisionMaker()
        X_train_y0, y_train1, X_valid_y0, X_valid_y1, X_valid, y_val1, X_test, y_test1, timesteps, n_features = \
            process_data(normalized_timeseries, non_smotedtime_series, outcome,
                         grouping, non_smotedtime_series[grouping], lookback)
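# ---------------------------------------------------------------------------
# The impute() and scale() helpers used above are defined elsewhere in the
# repo. The sketch below shows one plausible implementation, assuming mean
# imputation and min-max scaling of the dynamic columns; it is an
# illustration under those assumptions, not the project's actual code.
# ---------------------------------------------------------------------------
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler


def impute(df, columns):
    """Replace missing values in the given columns with the column mean."""
    cols = list(columns)
    imputer = SimpleImputer(strategy='mean')
    return pd.DataFrame(imputer.fit_transform(df[cols]),
                        columns=cols, index=df.index)


def scale(df, columns):
    """Min-max scale the given columns into [0, 1]."""
    cols = list(columns)
    scaler = MinMaxScaler()
    return pd.DataFrame(scaler.fit_transform(df[cols]),
                        columns=cols, index=df.index)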
def main():
    configs = json.load(open('Configuration.json', 'r'))
    epochs = configs['training']['epochs']
    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']
    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']
    timeseries_path = configs['paths']['data_path']
    saved_models_path = configs['paths']['saved_models_path']

    ## read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(timeseries_path + "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(non_smotedtime_series, dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    ## start working per outcome
    for outcome in outcomes:
        decision_maker = DecisionMaker()
        fold_ind, train_ind, test_ind = get_train_test_split(
            non_smotedtime_series[outcome].astype(int),
            non_smotedtime_series[grouping])

        X_train_y0, X_valid_y0, X_valid, y_valid, X_test, y_test, timesteps, n_features = \
            process_data(normalized_timeseries, non_smotedtime_series, outcome,
                         grouping, lookback, train_ind, test_ind)

        autoencoder = LSTMAutoEncoder(configs['model']['name'] + outcome,
                                      outcome, timesteps, n_features)
        autoencoder.summary()
        autoencoder.fit(X_train_y0, X_train_y0, epochs, lookback,
                        X_valid_y0, X_valid_y0, 2)

        ### save model
        filename = saved_models_path + configs['model']['name'] + outcome + '.h5'
        autoencoder.save_model(filename)

        #### LSTM autoencoder diagnostics
        autoencoder.plot_history()

        test_x_predictions = autoencoder.predict(X_test)
        mse = np.mean(np.power(flatten(X_test) - flatten(test_x_predictions), 2), axis=1)
        test_error_df = pd.DataFrame({'Reconstruction_error': mse,
                                      'True_class': y_test.tolist()})

        pred_y, best_threshold, precision_rt, recall_rt = \
            autoencoder.predict_binary(test_error_df.True_class,
                                       test_error_df.Reconstruction_error)
        autoencoder.output_performance(test_error_df.True_class,
                                       test_error_df.Reconstruction_error, pred_y)
        autoencoder.plot_reconstruction_error(test_error_df, best_threshold)
        autoencoder.plot_roc(test_error_df)
        autoencoder.plot_pr(precision_rt, recall_rt)
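# ---------------------------------------------------------------------------
# flatten() above collapses a 3D LSTM array (samples, timesteps, features)
# into 2D so the reconstruction MSE can be computed row-wise. A minimal
# sketch, assuming the common convention of keeping the last timestep of
# each window; later scripts call the same operation lstm_flatten. The
# repo's own helper may be implemented differently.
# ---------------------------------------------------------------------------
import numpy as np


def flatten(X):
    """Return a 2D array (samples, features) from a 3D LSTM input,
    keeping the feature vector of the last timestep in each window."""
    flattened = np.empty((X.shape[0], X.shape[2]))
    for i in range(X.shape[0]):
        flattened[i] = X[i, X.shape[1] - 1, :]
    return flattened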
def main():
    configs = json.load(open('Configuration.json', 'r'))
    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']
    static_features = configs['data']['static_columns']
    targets = configs['data']['classification_target']
    timeseries_path = configs['paths']['data_path']

    ## read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(timeseries_path + "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(non_smotedtime_series, dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    risk_score_visualiser = Visualiser(normalized_timeseries, non_smotedtime_series,
                                       dynamic_features, static_features)
    for target in targets:
        risk_score_visualiser.plot_risk_scores(target)
def main():
    configs = json.load(open('Configuration.json', 'r'))
    grouping = configs['data']['grouping']
    static_features = configs['data']['static_columns']
    dynamic_features = configs['data']['dynamic_columns']
    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']
    timeseries_path = configs['paths']['data_path']

    ## read, impute and scale dataset, then work per outcome
    for outcome in outcomes:
        time_series = pd.read_csv(timeseries_path + "SMOTEDTimeSeries/" +
                                  outcome + "StackedTimeSeries1Day.csv")
        time_series[dynamic_features] = impute(time_series, dynamic_features)
        normalised_series = scale(time_series, dynamic_features)
        normalised_series.insert(0, grouping, time_series[grouping])
        normalised_series.insert(len(normalised_series.columns), outcome,
                                 time_series[outcome])
        normalised_series = curve_shift(normalised_series, grouping, outcome,
                                        shift_by=lookback - 1)

        decision_maker = DecisionMaker()

        # train/test and validation sets
        X_cols = (normalised_series.columns).tolist()
        X_cols.remove(outcome)
        X_cols.remove(grouping)

        # converts the df to a numpy array
        input_X = normalised_series.loc[:, normalised_series.columns.isin(X_cols)].values
        input_y = normalised_series[outcome].values
        n_features = input_X.shape[1]  # number of features

        X, y = temporalize(X=input_X, y=input_y, lookback=lookback)
        X_train, X_test, y_train, y_test = train_test_split(
            np.array(X), np.array(y), test_size=0.33,
            random_state=SEED, stratify=y)
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train, y_train, test_size=0.33,
            random_state=SEED, stratify=y_train)

        X_train = X_train.reshape(X_train.shape[0], lookback, n_features)
        X_valid = X_valid.reshape(X_valid.shape[0], lookback, n_features)
        X_test = X_test.reshape(X_test.shape[0], lookback, n_features)

        distrs_percents = [get_distribution_percentages(
            (normalised_series[outcome]).astype(int))]

        scaler = StandardScaler().fit(flatten(X_train))
        a = flatten(X_train)
        print('colwise mean', np.mean(a, axis=0).round(6))
        print('colwise variance', np.var(a, axis=0))

        # scale the training set too, so training and evaluation are
        # consistent (the original only scaled the validation/test sets)
        X_train_scaled = Models.LSTMAutoEncoder.Utils.scale(X_train, scaler)
        X_valid_scaled = Models.LSTMAutoEncoder.Utils.scale(X_valid, scaler)
        X_test_scaled = Models.LSTMAutoEncoder.Utils.scale(X_test, scaler)

        timesteps = X_train.shape[1]  # equal to the lookback
        n_features = X_train.shape[2]  # 59

        epochs = 100
        lr = 0.0001

        lstm_autoencoder = Sequential()
        # Encoder
        lstm_autoencoder.add(LSTM(32, activation='relu',
                                  input_shape=(timesteps, n_features),
                                  return_sequences=True))
        lstm_autoencoder.add(LSTM(16, activation='relu', return_sequences=False))
        lstm_autoencoder.add(RepeatVector(timesteps))
        # Decoder
        lstm_autoencoder.add(LSTM(16, activation='relu', return_sequences=True))
        lstm_autoencoder.add(LSTM(32, activation='relu', return_sequences=True))
        lstm_autoencoder.add(TimeDistributed(Dense(n_features)))

        lstm_autoencoder.summary()

        adam = optimizers.Adam(lr)
        lstm_autoencoder.compile(loss='mse', optimizer=adam)

        cp = ModelCheckpoint(filepath="lstm_autoencoder_classifier.h5",
                             save_best_only=True, verbose=0)
        tb = TensorBoard(log_dir='./logs', histogram_freq=0,
                         write_graph=True, write_images=True)

        # An autoencoder reconstructs its input, so the validation target is
        # the validation input; the original passed (X_valid, X_train), which
        # would fail on a shape mismatch. The checkpoint/TensorBoard callbacks
        # are also now actually passed to fit().
        lstm_autoencoder_history = lstm_autoencoder.fit(
            X_train_scaled, X_train_scaled,
            epochs=epochs, batch_size=lookback,
            validation_data=(X_valid_scaled, X_valid_scaled),
            callbacks=[cp, tb], verbose=2).history
        # print(distrs_percents)

        #### LSTM autoencoder loss curves
        plt.figure(figsize=(10, 10))
        plt.plot(lstm_autoencoder_history['loss'], linewidth=2, label='Train')
        plt.plot(lstm_autoencoder_history['val_loss'], linewidth=2, label='Valid')
        plt.legend(loc='upper right')
        plt.title('Model loss')
        plt.ylabel('Loss')
        plt.xlabel('Epoch')
plt.savefig("LossOverEpochsSMOTE.pdf", bbox_inches='tight') plt.figure(figsize=(10, 10)) valid_x_predictions = lstm_autoencoder.predict(X_valid_scaled) mse = np.mean(np.power( flatten(X_valid_scaled) - flatten(valid_x_predictions), 2), axis=1) error_df = pd.DataFrame({ 'Reconstruction_error': mse, 'True_class': y_valid.tolist() }) precision_rt, recall_rt, threshold_rt = precision_recall_curve( error_df.True_class, error_df.Reconstruction_error) plt.plot(threshold_rt, precision_rt[1:], label="Precision", linewidth=5) plt.plot(threshold_rt, recall_rt[1:], label="Recall", linewidth=5) plt.title('Precision and recall for different threshold values') plt.xlabel('Threshold') plt.ylabel('Precision/Recall') plt.legend() plt.savefig(outcome + "ThresholdSMOTE.pdf", bbox_inches='tight') test_x_predictions = lstm_autoencoder.predict(X_test_scaled) mse = np.mean(np.power( flatten(X_test_scaled) - flatten(test_x_predictions), 2), axis=1) error_df = pd.DataFrame({ 'Reconstruction_error': mse, 'True_class': y_test.tolist() }) plt.figure(figsize=(10, 10)) threshold_fixed = 0.3 groups = error_df.groupby('True_class') fig, ax = plt.subplots() for name, group in groups: ax.plot(group.index, group.Reconstruction_error, marker='o', ms=3.5, linestyle='', label="Break" if name == 1 else "Normal") ax.hlines(threshold_fixed, ax.get_xlim()[0], ax.get_xlim()[1], colors="r", zorder=100, label='Threshold') ax.legend() plt.title("Reconstruction error for different classes") plt.ylabel("Reconstruction error") plt.xlabel("Data point index") plt.savefig(outcome + "ReconstructionerrorSMOTE.pdf", bbox_inches='tight') pred_y = [ 1 if e > threshold_fixed else 0 for e in error_df.Reconstruction_error.values ] conf_matrix = confusion_matrix(error_df.True_class, pred_y) plt.figure(figsize=(6, 6)) sns.heatmap(conf_matrix, xticklabels=LABELS, yticklabels=LABELS, annot=True, fmt="d") plt.title("Confusion matrix") plt.ylabel('True class') plt.xlabel('Predicted class') plt.savefig(outcome + "ConfusionMatrixSMOTE.pdf", bbox_inches='tight') false_pos_rate, true_pos_rate, thresholds = roc_curve( error_df.True_class, error_df.Reconstruction_error) roc_auc = auc( false_pos_rate, true_pos_rate, ) plt.figure(figsize=(10, 10)) plt.plot(false_pos_rate, true_pos_rate, linewidth=5, label='AUC = %0.3f' % roc_auc) plt.plot([0, 1], [0, 1], linewidth=5) plt.xlim([-0.01, 1]) plt.ylim([0, 1.01]) plt.legend(loc='lower right') plt.title('Receiver operating characteristic curve (ROC)') plt.ylabel('True Positive Rate') plt.xlabel('False Positive Rate') plt.savefig(outcome + "rocSMOTE.pdf", bbox_inches='tight') precision, recall, thresholds = precision_recall_curve( error_df.True_class, error_df.Reconstruction_error) pr_auc = auc(recall, precision) plt.figure(figsize=(10, 10)) plt.plot(recall, precision, linewidth=5, label='AUC = %0.3f' % pr_auc) plt.plot([0, 1], [0, 1], linewidth=5) plt.xlim([-0.01, 1]) plt.ylim([0, 1.01]) plt.legend(loc='lower right') plt.title('Receiver operating characteristic curve (ROC)') plt.ylabel('Precision') plt.xlabel('Recall') plt.savefig(outcome + "precision_recall_aucSMOTE.pdf", bbox_inches='tight')
def main():
    configs = json.load(open('Configuration.json', 'r'))
    epochs = configs['training']['epochs']
    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']
    static_features = configs['data']['static_columns']
    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']
    timeseries_path = configs['paths']['data_path']
    autoencoder_models_path = configs['paths']['autoencoder_models_path']
    test_data_path = configs['paths']['test_data_path']

    ## read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(timeseries_path + "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(non_smotedtime_series, dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    # initialise classification report which will house results of all outcomes
    classification_report = ClassificationReport()

    # save lstm performance for comparison with final outcome
    lstm_praucs = []

    ## start working per outcome
    for outcome in outcomes:
        fold_ind, train_ind, test_ind = get_train_test_split(
            non_smotedtime_series[outcome].astype(int),
            non_smotedtime_series[grouping])

        ## Load LSTM models if they exist, otherwise train new models and save them
        autoencoder_filename = autoencoder_models_path + configs['model']['name'] + outcome + '.h5'

        X_train, X_train_y0, X_valid_y0, X_valid, y_valid, X_test, y_test, timesteps, n_features = \
            process_data(normalized_timeseries, non_smotedtime_series, outcome,
                         grouping, lookback, train_ind, test_ind)

        if "3D" not in outcome:
            if os.path.isfile(autoencoder_filename):
                print("Autoencoder trained model exists for outcome", outcome,
                      "file:", autoencoder_filename)
                autoencoder = LSTMAutoEncoder(configs['model']['name'] + outcome,
                                              outcome, timesteps, n_features,
                                              saved_model=autoencoder_filename)
                autoencoder.summary()
            else:
                print("Autoencoder trained model does not exist for outcome",
                      outcome, "file:", autoencoder_filename)
                autoencoder = LSTMAutoEncoder(configs['model']['name'] + outcome,
                                              outcome, timesteps, n_features)
                autoencoder.summary()
                autoencoder.fit(X_train_y0, X_train_y0, epochs, lookback,
                                X_valid_y0, X_valid_y0, 2)
                autoencoder.plot_history()

            train_x_predictions = autoencoder.predict(X_train)
            mse_train = np.mean(np.power(
                lstm_flatten(X_train) - lstm_flatten(train_x_predictions), 2), axis=1)

            test_x_predictions = autoencoder.predict(X_test)
            mse_test = np.mean(np.power(
                lstm_flatten(X_test) - lstm_flatten(test_x_predictions), 2), axis=1)
            test_error_df = pd.DataFrame({'Reconstruction_error': mse_test,
                                          'True_class': y_test.tolist()})

            pred_y, best_threshold, precision_rt, recall_rt = \
                autoencoder.predict_binary(test_error_df.True_class,
                                           test_error_df.Reconstruction_error)
            autoencoder.output_performance(test_error_df.True_class, pred_y)
            autoencoder.plot_reconstruction_error(test_error_df, best_threshold)
            autoencoder.plot_roc(test_error_df)
            autoencoder.plot_pr(precision_rt, recall_rt)

            lstm_prauc = auc(recall_rt, precision_rt)
            lstm_praucs.append(lstm_prauc)

            # Feature Selector
            training_loc = train_ind[0]  # +train_ind[1]
            training_ids = non_smotedtime_series.iloc[training_loc]
            training_ids = training_ids[grouping]
            testing_ids = non_smotedtime_series.iloc[test_ind[1]]
            testing_ids = testing_ids[grouping]

            flat_df, timesteps = flatten(non_smotedtime_series, dynamic_features,
                                         grouping, static_features, outcome)
            temporal_features = set(flat_df.columns) - set(static_features)
            temporal_features = set(temporal_features) - set([outcome, grouping])

            X_train = flat_df.loc[flat_df[grouping].isin(training_ids)]
            y_train = X_train[outcome].astype(int)
            training_groups = X_train[grouping]
            X_train_static = X_train[static_features].copy()
            X_train_static[grouping] = training_groups
            X_train = X_train[list(temporal_features)]

            X_test = flat_df.loc[flat_df[grouping].isin(testing_ids)]
            y_test = X_test[outcome].astype(int)
            testing_groups = X_test[grouping]
            X_test_static = X_test[static_features].copy()
            # column assignment (was .loc[grouping], which adds a row labelled
            # `grouping` instead of a column)
            X_test_static[grouping] = testing_groups
            X_test = X_test[list(temporal_features)]

            ########
            aggregate_df = generate_aggregates(X_train, temporal_features,
                                               grouping, training_groups)
            static_aggregate_train_df = pd.concat([aggregate_df, X_train_static],
                                                  axis=1, join='inner')
            static_aggregate_train_df = static_aggregate_train_df.loc[
                :, ~static_aggregate_train_df.columns.duplicated()]
            static_aggregate_train_df.drop(columns=[grouping], inplace=True, axis=1)
            static_aggregate_train_df['mse'] = mse_train

            aggregate_df_test = generate_aggregates(X_test, temporal_features,
                                                    grouping, testing_groups)
            static_aggregate_test_df = pd.concat([aggregate_df_test, X_test_static],
                                                 axis=1, join='inner')
            static_aggregate_test_df = static_aggregate_test_df.loc[
                :, ~static_aggregate_test_df.columns.duplicated()]
            static_aggregate_test_df.drop(columns=[grouping], inplace=True, axis=1)
            static_aggregate_test_df['mse'] = mse_test
            static_aggregate_test_df.to_csv("static_aggretate.csv", index=False)

            static_baseline_classifier = XGBoostClassifier(
                static_aggregate_train_df, y_train, outcome, grouping)
            static_baseline_classifier.fit("aggregate_static", mse_train * 100)

            y_pred_binary, best_threshold, precision_rt, recall_rt, yhat = \
                static_baseline_classifier.predict(static_aggregate_test_df, y_test)

            print("CLASS WEIGHTS FOR Y ACTUAL: ", class_counts(y_test))
            print("CLASS WEIGHTS FOR Y PREDICTED: ", class_counts(y_pred_binary))

            static_baseline_classifier.output_performance(y_test, y_pred_binary)
            static_baseline_classifier.plot_pr(precision_rt, recall_rt, "XGBoost Static")
            static_baseline_classifier.plot_feature_importance(
                static_aggregate_test_df.columns)

            to_write_for_plotting = static_aggregate_test_df
            to_write_for_plotting['outcome'] = y_test
            to_write_for_plotting.to_csv(test_data_path + outcome + ".csv", index=False)

            # add to classification report
            classification_report.add_model_result(outcome, y_test, y_pred_binary,
                                                   best_threshold, precision_rt,
                                                   recall_rt, yhat)

            # delete variables
            del static_aggregate_train_df
            del static_aggregate_test_df
            del X_train
            del X_train_y0
            del X_valid_y0
            del X_valid
            del y_valid
            del X_test
            del y_test
            del timesteps
            del train_x_predictions
            del test_x_predictions
            del test_error_df

    # risk_score_visualiser = Visualiser(normalized_timeseries, non_smotedtime_series,
    #                                    dynamic_features, static_features)

    # After fitting model to all outcomes, plot and get summary statistics
    classification_report.plot_distributions_vs_aucs()
    classification_report.plot_pr_auc()
    classification_report.plot_auc()
    classification_report.compare_lstim_xgboost(lstm_praucs)
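# ---------------------------------------------------------------------------
# generate_aggregates() above is a repo utility. A minimal sketch, assuming
# it summarises each temporal feature per group with simple statistics
# (min / max / mean / std) so the flattened time series can feed a tabular
# classifier; the actual helper may compute a different set of aggregates.
# ---------------------------------------------------------------------------
import pandas as pd


def generate_aggregates(X, temporal_features, grouping, groups):
    """Per-group summary statistics over the temporal columns."""
    df = X[list(temporal_features)].copy()
    df[grouping] = groups.values
    agg = df.groupby(grouping).agg(['min', 'max', 'mean', 'std'])
    # flatten the (feature, statistic) MultiIndex into single column names
    agg.columns = ['_'.join(col) for col in agg.columns]
    return agg.reset_index()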
def main():
    configs = json.load(open('Configuration.json', 'r'))
    epochs = configs['training']['epochs']
    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']
    static_features = configs['data']['static_columns']
    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']
    timeseries_path = configs['paths']['data_path']
    saved_models_path = configs['paths']['saved_models_path']

    ## read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(timeseries_path + "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(non_smotedtime_series, dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    ## start working per outcome
    for outcome in outcomes:
        decision_maker = DecisionMaker()
        fold_ind, train_ind, test_ind = get_train_test_split(
            non_smotedtime_series[outcome].astype(int),
            non_smotedtime_series[grouping])

        ## Load LSTM models if they exist, otherwise train new models and save them
        filename = saved_models_path + configs['model']['name'] + outcome + '.h5'

        X_train, X_train_y0, X_valid_y0, X_valid, y_valid, X_test, y_test, timesteps, n_features = \
            process_data(normalized_timeseries, non_smotedtime_series, outcome,
                         grouping, lookback, train_ind, test_ind)

        if os.path.isfile(filename):
            autoencoder = LSTMAutoEncoder(configs['model']['name'] + outcome,
                                          outcome, timesteps, n_features,
                                          saved_model=filename)
            autoencoder.summary()
        else:
            autoencoder = LSTMAutoEncoder(configs['model']['name'] + outcome,
                                          outcome, timesteps, n_features)
            autoencoder.summary()
            autoencoder.fit(X_train_y0, X_train_y0, epochs, lookback,
                            X_valid_y0, X_valid_y0, 2)
            autoencoder.plot_history()
            ### save model
            filename = saved_models_path + configs['model']['name'] + outcome + '.h5'
            autoencoder.save_model(filename)

        #### Predicting using the fitted model (loaded or trained)
        train_x_predictions = autoencoder.predict(X_train)
        mse_train = np.mean(np.power(
            lstm_flatten(X_train) - lstm_flatten(train_x_predictions), 2), axis=1)

        test_x_predictions = autoencoder.predict(X_test)
        mse_test = np.mean(np.power(
            lstm_flatten(X_test) - lstm_flatten(test_x_predictions), 2), axis=1)
        test_error_df = pd.DataFrame({'Reconstruction_error': mse_test,
                                      'True_class': y_test.tolist()})

        pred_y, best_threshold, precision_rt, recall_rt = \
            autoencoder.predict_binary(test_error_df.True_class,
                                       test_error_df.Reconstruction_error)
        autoencoder.output_performance(test_error_df.True_class,
                                       test_error_df.Reconstruction_error, pred_y)
        autoencoder.plot_reconstruction_error(test_error_df, best_threshold)
        autoencoder.plot_roc(test_error_df)
        autoencoder.plot_pr(precision_rt, recall_rt)

        # Feature Selector
        training_loc = train_ind[0]  # +train_ind[1]
        training_ids = non_smotedtime_series.iloc[training_loc]
        training_ids = training_ids[grouping]
        testing_ids = non_smotedtime_series.iloc[test_ind[1]]
        testing_ids = testing_ids[grouping]

        flat_df, timesteps = flatten(non_smotedtime_series, dynamic_features,
                                     grouping, static_features, outcome)
        temporal_features = set(flat_df.columns) - set(static_features)
        temporal_features = set(temporal_features) - set([outcome, grouping])

        X_train = flat_df.loc[flat_df[grouping].isin(training_ids)]
        y_train = X_train[outcome].astype(int)
        training_groups = X_train[grouping]
        X_train_static = X_train[static_features].copy()
        # column assignment (was .loc[grouping], which adds a row labelled
        # `grouping` instead of a column)
        X_train_static[grouping] = training_groups
        X_train = X_train[list(temporal_features)]
        X_train = scale(X_train, temporal_features)
        X_train['mse'] = mse_train
        # X_train, y_train = smote(X_train, y_train)

        X_test = flat_df.loc[flat_df[grouping].isin(testing_ids)]
        y_test = X_test[outcome].astype(int)
        testing_groups = X_test[grouping]
        X_test_static = X_test[static_features].copy()
        X_test_static[grouping] = testing_groups  # column assignment (was .loc[grouping])
        X_test = X_test[list(temporal_features)]
        X_test = scale(X_test, temporal_features)
        X_test['mse'] = mse_test

        feature_selector = XGBoostClassifier(X_train, y_train, outcome, grouping)
        # feature_selector.fit("temporal", training_groups)
        y_pred_binary, best_threshold, precision_rt, recall_rt = \
            feature_selector.predict(X_test, y_test)
        feature_selector.plot_pr(precision_rt, recall_rt, "XGBoost Temporal")

        featuredf = pd.DataFrame()
        temporal_features = set(temporal_features) - set([outcome])
        featuredf['features'] = list(temporal_features)
        # featuredf['imp'] = fs_fi
        # featuredf = featuredf[featuredf['imp'] > 0]

        ########
        baseline_features = featuredf['features']
        baseline_features = set([x.partition('_')[0] for x in list(baseline_features)])
        baseline_features = [x + "_0" for x in list(baseline_features)]
        baseline_features.insert(0, grouping)
        baseline_static_features = baseline_features + static_features

        slopes_df = generate_slopes(X_train, temporal_features, static_features,
                                    grouping, training_groups)
        aggregate_df = generate_aggregates(X_train, temporal_features,
                                           grouping, training_groups)

        slopes_static_baseline_train_df = pd.concat([slopes_df, X_train_static],
                                                    axis=1, join='inner')
        slopes_static_baseline_train_df = slopes_static_baseline_train_df.loc[
            :, ~slopes_static_baseline_train_df.columns.duplicated()]
        slopes_static_baseline_train_groups = slopes_static_baseline_train_df[grouping]
        slopes_static_baseline_train_df.drop(columns=[grouping], inplace=True, axis=1)
        slopes_static_baseline_train_df['mse'] = mse_train

        slopes_df_test = generate_slopes(X_test, temporal_features, static_features,
                                         grouping, testing_groups)
        slopes_static_baseline_test_df = pd.concat([slopes_df_test, X_test_static],
                                                   axis=1, join='inner')
        slopes_static_baseline_test_df = slopes_static_baseline_test_df.loc[
            :, ~slopes_static_baseline_test_df.columns.duplicated()]
        slopes_static_baseline_test_groups = slopes_static_baseline_test_df[grouping]
        slopes_static_baseline_test_df.drop(columns=[grouping], inplace=True, axis=1)
        slopes_static_baseline_test_df['mse'] = mse_test

        slopes_static_baseline_classifier = XGBoostClassifier(
            slopes_static_baseline_train_df, y_train, outcome, grouping)
        # bs_y, bs_ths, bs_id, bs_fi = slopes_static_baseline_classifier.fit(
        #     "baseline_static_slope", slopes_static_baseline_train_groups)
        slopes_static_baseline_classifier.fit("baseline_static_slope",
                                              slopes_static_baseline_train_groups)
        y_pred_binary, best_threshold, precision_rt, recall_rt = \
            slopes_static_baseline_classifier.predict(
                slopes_static_baseline_test_df, y_test)
        slopes_static_baseline_classifier.plot_pr(precision_rt, recall_rt,
                                                  "XGBoost Static")
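# ---------------------------------------------------------------------------
# generate_slopes() above is a repo utility. A minimal sketch, assuming it
# fits a least-squares line per group to each temporal feature and keeps the
# slope as a trend summary (np.polyfit, degree 1); the real helper may
# differ. static_features is accepted only to mirror the caller's signature.
# ---------------------------------------------------------------------------
import numpy as np
import pandas as pd


def generate_slopes(X, temporal_features, static_features, grouping, groups):
    """Per-group linear trend (slope) of each temporal column."""
    df = X[list(temporal_features)].copy()
    df[grouping] = groups.values

    def _slopes(g):
        t = np.arange(len(g))
        return pd.Series({
            col + '_slope': np.polyfit(t, g[col].values, 1)[0] if len(g) > 1 else 0.0
            for col in temporal_features})

    return df.groupby(grouping).apply(_slopes).reset_index()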
def main():
    configs = json.load(open('Configuration.json', 'r'))
    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']
    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']
    timeseries_path = configs['paths']['data_path']
    autoencoder_path = configs['paths']['autoencoder_path']

    ## read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(timeseries_path + "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(non_smotedtime_series, dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    ## start working per outcome
    for outcome in outcomes:
        decision_maker = DecisionMaker()
        X_train_y0, X_valid_y0, X_valid, y_valid, X_test, y_test, timesteps, n_features = \
            process_data(normalized_timeseries, non_smotedtime_series, outcome,
                         grouping, non_smotedtime_series[grouping], lookback)

        epochs = 100
        autoencoder = LSTMAutoEncoder(configs['model']['name'] + outcome,
                                      outcome, timesteps, n_features)
        autoencoder.summary()

        # NOTE: these callbacks are created but never passed to the wrapper's
        # fit() below, so they currently have no effect.
        cp = ModelCheckpoint(filepath="lstm_autoencoder_classifier.h5",
                             save_best_only=True, verbose=0)
        tb = TensorBoard(log_dir='./logs', histogram_freq=0,
                         write_graph=True, write_images=True)

        autoencoder.fit(X_train_y0, X_train_y0, epochs, lookback,
                        X_valid_y0, X_valid_y0, 2)

        #### LSTM autoencoder diagnostics
        autoencoder.plot_history()

        valid_x_predictions = autoencoder.predict(X_valid)
        mse = np.mean(np.power(flatten(X_valid) - flatten(valid_x_predictions), 2), axis=1)
        error_df = pd.DataFrame({'Reconstruction_error': mse,
                                 'True_class': y_valid.tolist()})

        precision_rt, recall_rt, threshold_rt = precision_recall_curve(
            error_df.True_class, error_df.Reconstruction_error)
        # pick the threshold that maximises F1 on the validation set
        fscore = (2 * precision_rt * recall_rt) / (precision_rt + recall_rt)
        ix = np.argmax(fscore)
        best_threshold = threshold_rt[ix]
        # print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], fscore[ix]))

        pred_y = (error_df.Reconstruction_error > best_threshold).astype('int32')

        perf_df = pd.DataFrame()
        perf_dict = performance_metrics(error_df.True_class, pred_y,
                                        error_df.Reconstruction_error)
        perf_df = perf_df.append(perf_dict, ignore_index=True)
        perf_df.to_csv(autoencoder_path + "performancemetrics" + outcome + ".csv",
                       index=False)

        test_x_predictions = autoencoder.predict(X_test)
        mse = np.mean(np.power(flatten(X_test) - flatten(test_x_predictions), 2), axis=1)
        error_df = pd.DataFrame({'Reconstruction_error': mse,
                                 'True_class': y_test.tolist()})

        plt.figure(figsize=(10, 10))
        groups = error_df.groupby('True_class')
        fig, ax = plt.subplots()
        for name, group in groups:
            ax.plot(group.index, group.Reconstruction_error, marker='o',
                    ms=3.5, linestyle='', label="1" if name == 1 else "0")
        ax.hlines(threshold_rt[ix], ax.get_xlim()[0], ax.get_xlim()[1],
                  colors="r", zorder=100, label='Threshold')
        ax.legend()
        plt.title("Reconstruction error for different classes")
        plt.ylabel("Reconstruction error")
        plt.xlabel("Data point index")
        plt.savefig(autoencoder_path + outcome + "Reconstructionerror.pdf",
                    bbox_inches='tight')

        false_pos_rate, true_pos_rate, thresholds = roc_curve(
            error_df.True_class, error_df.Reconstruction_error)
        roc_auc = auc(false_pos_rate, true_pos_rate)
        plt.figure(figsize=(10, 10))
        plt.plot(false_pos_rate, true_pos_rate, linewidth=5,
                 label='AUC = %0.3f' % roc_auc)
        plt.plot([0, 1], [0, 1], linewidth=5)
        plt.xlim([-0.01, 1])
        plt.ylim([0, 1.01])
        plt.legend(loc='lower right')
        plt.title('Receiver operating characteristic curve (ROC)')
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.savefig(autoencoder_path + outcome + "roc.pdf", bbox_inches='tight')

        pr_auc = auc(recall_rt, precision_rt)
        plt.figure(figsize=(10, 10))
        plt.plot(recall_rt, precision_rt, linewidth=5,
                 label='PR-AUC = %0.3f' % pr_auc)
        plt.plot([0, 1], [1, 0], linewidth=5)
        plt.xlim([-0.01, 1])
        plt.ylim([0, 1.01])
        plt.legend(loc='lower right')
        plt.title('Precision-Recall Curve')
        plt.ylabel('Precision')
        plt.xlabel('Recall')
        plt.savefig(autoencoder_path + outcome + "precision_recall_auc.pdf",
                    bbox_inches='tight')
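# ---------------------------------------------------------------------------
# performance_metrics() above returns a dict of summary scores for one
# outcome. A minimal sketch, assuming it reports the usual binary metrics
# (accuracy, precision, recall, F1, ROC-AUC); the repo's version may add or
# omit some of these.
# ---------------------------------------------------------------------------
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score)


def performance_metrics(y_true, y_pred, y_score):
    """Standard binary-classification metrics; y_score is the continuous
    reconstruction error used for the ROC-AUC."""
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'roc_auc': roc_auc_score(y_true, y_score),
    }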