def main():
    """Train one LSTM autoencoder per classification outcome on the
    non-SMOTEd aggregated time series, save each trained model, and report
    reconstruction-error-based classification results on the test split.

    Reads all settings from ``Configuration.json``; writes model files to
    the configured ``saved_models_path``.
    """
    # BUG FIX: use a context manager so the config file handle is closed.
    with open('Configuration.json', 'r') as config_file:
        configs = json.load(config_file)

    epochs = configs['training']['epochs']
    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']
    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']  # NOTE(review): config key says batch_size but it is used as the lookback window — confirm
    timeseries_path = configs['paths']['data_path']
    saved_models_path = configs['paths']['saved_models_path']

    # Read, impute and scale dataset.
    non_smotedtime_series = pd.read_csv(timeseries_path + "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(non_smotedtime_series,
                                                     dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    # Train and evaluate one autoencoder per outcome.
    for outcome in outcomes:
        decision_maker = DecisionMaker()  # NOTE(review): unused here; kept in case construction has side effects — verify

        fold_ind, train_ind, test_ind = get_train_test_split(
            non_smotedtime_series[outcome].astype(int),
            non_smotedtime_series[grouping])
        (X_train_y0, X_valid_y0, X_valid, y_valid, X_test, y_test,
         timesteps, n_features) = process_data(
            normalized_timeseries, non_smotedtime_series, outcome,
            grouping, lookback, train_ind, test_ind)

        autoencoder = LSTMAutoEncoder(configs['model']['name'] + outcome,
                                      outcome, timesteps, n_features)
        autoencoder.summary()
        autoencoder.fit(X_train_y0, X_train_y0, epochs, lookback,
                        X_valid_y0, X_valid_y0, 2)

        # Save the trained model.
        filename = saved_models_path + configs['model']['name'] + outcome + '.h5'
        autoencoder.save_model(filename)

        autoencoder.plot_history()

        # Score the test split by per-window reconstruction error (MSE).
        test_x_predictions = autoencoder.predict(X_test)
        mse = np.mean(np.power(flatten(X_test) - flatten(test_x_predictions), 2),
                      axis=1)
        test_error_df = pd.DataFrame({'Reconstruction_error': mse,
                                      'True_class': y_test.tolist()})

        pred_y, best_threshold, precision_rt, recall_rt = \
            autoencoder.predict_binary(test_error_df.True_class,
                                       test_error_df.Reconstruction_error)
        autoencoder.output_performance(test_error_df.True_class,
                                       test_error_df.Reconstruction_error,
                                       pred_y)
        autoencoder.plot_reconstruction_error(test_error_df, best_threshold)
        autoencoder.plot_roc(test_error_df)
        autoencoder.plot_pr(precision_rt, recall_rt)
def main():
    """Train one inline Keras LSTM autoencoder per outcome on the SMOTEd
    stacked time series and evaluate it as a rare-event classifier via
    reconstruction error: loss curves, threshold sweep, confusion matrix,
    ROC and precision-recall plots are saved as PDFs.
    """
    # BUG FIX: use a context manager so the config file handle is closed.
    with open('Configuration.json', 'r') as config_file:
        configs = json.load(config_file)

    grouping = configs['data']['grouping']
    static_features = configs['data']['static_columns']
    dynamic_features = configs['data']['dynamic_columns']
    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']  # NOTE(review): config key says batch_size but it is used as the lookback window — confirm
    timeseries_path = configs['paths']['data_path']

    for outcome in outcomes:
        # Read, impute and scale this outcome's SMOTEd series.
        time_series = pd.read_csv(timeseries_path + "SMOTEDTimeSeries/"
                                  + outcome + "StackedTimeSeries1Day.csv")
        time_series[dynamic_features] = impute(time_series, dynamic_features)
        normalised_series = scale(time_series, dynamic_features)
        normalised_series.insert(0, grouping, time_series[grouping])
        normalised_series.insert(len(normalised_series.columns), outcome,
                                 time_series[outcome])
        # Shift the label so the model sees it `lookback - 1` steps early.
        normalised_series = curve_shift(normalised_series, grouping, outcome,
                                        shift_by=lookback - 1)

        decision_maker = DecisionMaker()  # NOTE(review): unused here; kept in case construction has side effects — verify

        # Build the feature matrix (all columns except label and grouping).
        X_cols = normalised_series.columns.tolist()
        X_cols.remove(outcome)
        X_cols.remove(grouping)
        input_X = normalised_series.loc[:, normalised_series.columns.isin(X_cols)].values
        input_y = normalised_series[outcome].values
        n_features = input_X.shape[1]

        # Temporalize into lookback windows, then stratified train/valid/test.
        X, y = temporalize(X=input_X, y=input_y, lookback=lookback)
        X_train, X_test, y_train, y_test = train_test_split(
            np.array(X), np.array(y), test_size=0.33, random_state=SEED,
            stratify=y)
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train, y_train, test_size=0.33, random_state=SEED,
            stratify=y_train)
        X_train = X_train.reshape(X_train.shape[0], lookback, n_features)
        X_valid = X_valid.reshape(X_valid.shape[0], lookback, n_features)
        X_test = X_test.reshape(X_test.shape[0], lookback, n_features)

        distrs_percents = [get_distribution_percentages(
            normalised_series[outcome].astype(int))]

        # Standardise features; the scaler is fit on training data only.
        scaler = StandardScaler().fit(flatten(X_train))
        a = flatten(X_train)
        print('colwise mean', np.mean(a, axis=0).round(6))
        print('colwise variance', np.var(a, axis=0))
        # BUG FIX: the model was trained on the UNSCALED X_train but later
        # evaluated on the scaled valid/test sets; scale the training data
        # too so train and evaluation distributions match.
        X_train_scaled = Models.LSTMAutoEncoder.Utils.scale(X_train, scaler)
        X_valid_scaled = Models.LSTMAutoEncoder.Utils.scale(X_valid, scaler)
        X_test_scaled = Models.LSTMAutoEncoder.Utils.scale(X_test, scaler)

        timesteps = X_train.shape[1]  # equal to the lookback
        n_features = X_train.shape[2]
        epochs = 100
        lr = 0.0001

        # Encoder-decoder LSTM: 32 -> 16 -> repeat -> 16 -> 32 -> dense.
        lstm_autoencoder = Sequential()
        # Encoder
        lstm_autoencoder.add(LSTM(32, activation='relu',
                                  input_shape=(timesteps, n_features),
                                  return_sequences=True))
        lstm_autoencoder.add(LSTM(16, activation='relu',
                                  return_sequences=False))
        lstm_autoencoder.add(RepeatVector(timesteps))
        # Decoder
        lstm_autoencoder.add(LSTM(16, activation='relu', return_sequences=True))
        lstm_autoencoder.add(LSTM(32, activation='relu', return_sequences=True))
        lstm_autoencoder.add(TimeDistributed(Dense(n_features)))
        lstm_autoencoder.summary()

        adam = optimizers.Adam(lr)
        lstm_autoencoder.compile(loss='mse', optimizer=adam)

        cp = ModelCheckpoint(filepath="lstm_autoencoder_classifier.h5",
                             save_best_only=True, verbose=0)
        tb = TensorBoard(log_dir='./logs', histogram_freq=0,
                         write_graph=True, write_images=True)
        # BUG FIX: validation_data was (X_valid, X_train) — mismatched input
        # and target (an autoencoder validates input against itself, and the
        # two splits have different sample counts). Also actually pass the
        # checkpoint/TensorBoard callbacks that were constructed above.
        lstm_autoencoder_history = lstm_autoencoder.fit(
            X_train_scaled, X_train_scaled,
            epochs=epochs,
            batch_size=lookback,
            validation_data=(X_valid_scaled, X_valid_scaled),
            callbacks=[cp, tb],
            verbose=2).history

        # Training/validation loss curves.
        plt.figure(figsize=(10, 10))
        plt.plot(lstm_autoencoder_history['loss'], linewidth=2, label='Train')
        plt.plot(lstm_autoencoder_history['val_loss'], linewidth=2, label='Valid')
        plt.legend(loc='upper right')
        plt.title('Model loss')
        plt.ylabel('Loss')
        plt.xlabel('Epoch')
        plt.savefig("LossOverEpochsSMOTE.pdf", bbox_inches='tight')

        # Threshold sweep on the validation set.
        plt.figure(figsize=(10, 10))
        valid_x_predictions = lstm_autoencoder.predict(X_valid_scaled)
        mse = np.mean(np.power(flatten(X_valid_scaled) - flatten(valid_x_predictions), 2),
                      axis=1)
        error_df = pd.DataFrame({'Reconstruction_error': mse,
                                 'True_class': y_valid.tolist()})
        precision_rt, recall_rt, threshold_rt = precision_recall_curve(
            error_df.True_class, error_df.Reconstruction_error)
        # threshold_rt is one element shorter than precision/recall arrays.
        plt.plot(threshold_rt, precision_rt[1:], label="Precision", linewidth=5)
        plt.plot(threshold_rt, recall_rt[1:], label="Recall", linewidth=5)
        plt.title('Precision and recall for different threshold values')
        plt.xlabel('Threshold')
        plt.ylabel('Precision/Recall')
        plt.legend()
        plt.savefig(outcome + "ThresholdSMOTE.pdf", bbox_inches='tight')

        # Test-set reconstruction error, plotted per class.
        test_x_predictions = lstm_autoencoder.predict(X_test_scaled)
        mse = np.mean(np.power(flatten(X_test_scaled) - flatten(test_x_predictions), 2),
                      axis=1)
        error_df = pd.DataFrame({'Reconstruction_error': mse,
                                 'True_class': y_test.tolist()})

        plt.figure(figsize=(10, 10))
        threshold_fixed = 0.3  # NOTE(review): hard-coded decision threshold — confirm it suits every outcome
        groups = error_df.groupby('True_class')
        fig, ax = plt.subplots()
        for name, group in groups:
            ax.plot(group.index, group.Reconstruction_error, marker='o',
                    ms=3.5, linestyle='',
                    label="Break" if name == 1 else "Normal")
        ax.hlines(threshold_fixed, ax.get_xlim()[0], ax.get_xlim()[1],
                  colors="r", zorder=100, label='Threshold')
        ax.legend()
        plt.title("Reconstruction error for different classes")
        plt.ylabel("Reconstruction error")
        plt.xlabel("Data point index")
        plt.savefig(outcome + "ReconstructionerrorSMOTE.pdf", bbox_inches='tight')

        # Confusion matrix at the fixed threshold.
        pred_y = [1 if e > threshold_fixed else 0
                  for e in error_df.Reconstruction_error.values]
        conf_matrix = confusion_matrix(error_df.True_class, pred_y)
        plt.figure(figsize=(6, 6))
        sns.heatmap(conf_matrix, xticklabels=LABELS, yticklabels=LABELS,
                    annot=True, fmt="d")
        plt.title("Confusion matrix")
        plt.ylabel('True class')
        plt.xlabel('Predicted class')
        plt.savefig(outcome + "ConfusionMatrixSMOTE.pdf", bbox_inches='tight')

        # ROC curve on the test set.
        false_pos_rate, true_pos_rate, thresholds = roc_curve(
            error_df.True_class, error_df.Reconstruction_error)
        roc_auc = auc(false_pos_rate, true_pos_rate)
        plt.figure(figsize=(10, 10))
        plt.plot(false_pos_rate, true_pos_rate, linewidth=5,
                 label='AUC = %0.3f' % roc_auc)
        plt.plot([0, 1], [0, 1], linewidth=5)
        plt.xlim([-0.01, 1])
        plt.ylim([0, 1.01])
        plt.legend(loc='lower right')
        plt.title('Receiver operating characteristic curve (ROC)')
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.savefig(outcome + "rocSMOTE.pdf", bbox_inches='tight')

        # Precision-recall curve on the test set.
        precision, recall, thresholds = precision_recall_curve(
            error_df.True_class, error_df.Reconstruction_error)
        pr_auc = auc(recall, precision)
        plt.figure(figsize=(10, 10))
        plt.plot(recall, precision, linewidth=5, label='AUC = %0.3f' % pr_auc)
        plt.plot([0, 1], [0, 1], linewidth=5)
        plt.xlim([-0.01, 1])
        plt.ylim([0, 1.01])
        plt.legend(loc='lower right')
        # BUG FIX: this plot was mistitled as a ROC curve.
        plt.title('Precision-Recall curve')
        plt.ylabel('Precision')
        plt.xlabel('Recall')
        plt.savefig(outcome + "precision_recall_aucSMOTE.pdf", bbox_inches='tight')
def main():
    """Train an LSTM autoencoder per outcome on the non-SMOTEd aggregated
    time series, choose the reconstruction-error threshold that maximises
    F1 on the validation set, write performance metrics to CSV, and save
    reconstruction-error, ROC and precision-recall plots.
    """
    # BUG FIX: use a context manager so the config file handle is closed.
    with open('Configuration.json', 'r') as config_file:
        configs = json.load(config_file)

    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']
    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']  # NOTE(review): config key says batch_size but it is used as the lookback window — confirm
    timeseries_path = configs['paths']['data_path']
    autoencoder_path = configs['paths']['autoencoder_path']

    # Read, impute and scale dataset.
    non_smotedtime_series = pd.read_csv(timeseries_path + "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(non_smotedtime_series,
                                                     dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    for outcome in outcomes:
        decision_maker = DecisionMaker()  # NOTE(review): unused here; kept in case construction has side effects — verify

        (X_train_y0, X_valid_y0, X_valid, y_valid, X_test, y_test,
         timesteps, n_features) = process_data(
            normalized_timeseries, non_smotedtime_series, outcome, grouping,
            non_smotedtime_series[grouping], lookback)

        epochs = 100
        autoencoder = LSTMAutoEncoder(configs['model']['name'] + outcome,
                                      outcome, timesteps, n_features)
        autoencoder.summary()
        # (Removed dead ModelCheckpoint/TensorBoard callbacks that were
        # constructed here but never passed to fit.)
        autoencoder.fit(X_train_y0, X_train_y0, epochs, lookback,
                        X_valid_y0, X_valid_y0, 2)
        autoencoder.plot_history()

        # Choose the threshold maximising F1 on the validation set.
        valid_x_predictions = autoencoder.predict(X_valid)
        mse = np.mean(np.power(flatten(X_valid) - flatten(valid_x_predictions), 2),
                      axis=1)
        error_df = pd.DataFrame({'Reconstruction_error': mse,
                                 'True_class': y_valid.tolist()})
        precision_rt, recall_rt, threshold_rt = precision_recall_curve(
            error_df.True_class, error_df.Reconstruction_error)
        # BUG FIX: (2*p*r)/(p+r) is NaN at the p == r == 0 point, which
        # corrupts argmax; treat that point as F1 = 0 instead.
        with np.errstate(divide='ignore', invalid='ignore'):
            fscore = np.nan_to_num(
                (2 * precision_rt * recall_rt) / (precision_rt + recall_rt))
        ix = np.argmax(fscore)
        # BUG FIX: threshold_rt is one shorter than precision/recall arrays;
        # clamp so the final (p=1, r=0) point cannot index out of range.
        ix = min(ix, len(threshold_rt) - 1)
        best_threshold = threshold_rt[ix]

        # Validation-set performance at the chosen threshold.
        pred_y = (error_df.Reconstruction_error > best_threshold).astype('int32')
        perf_dict = performance_metrics(error_df.True_class, pred_y,
                                        error_df.Reconstruction_error)
        # BUG FIX: DataFrame.append was removed in pandas 2.0; build the
        # single-row frame directly instead.
        perf_df = pd.DataFrame([perf_dict])
        perf_df.to_csv(autoencoder_path + "performancemetrics" + outcome + ".csv",
                       index=False)

        # Test-set reconstruction error, plotted per class.
        test_x_predictions = autoencoder.predict(X_test)
        mse = np.mean(np.power(flatten(X_test) - flatten(test_x_predictions), 2),
                      axis=1)
        error_df = pd.DataFrame({'Reconstruction_error': mse,
                                 'True_class': y_test.tolist()})

        plt.figure(figsize=(10, 10))
        groups = error_df.groupby('True_class')
        fig, ax = plt.subplots()
        for name, group in groups:
            ax.plot(group.index, group.Reconstruction_error, marker='o',
                    ms=3.5, linestyle='', label="1" if name == 1 else "0")
        ax.hlines(threshold_rt[ix], ax.get_xlim()[0], ax.get_xlim()[1],
                  colors="r", zorder=100, label='Threshold')
        ax.legend()
        plt.title("Reconstruction error for different classes")
        plt.ylabel("Reconstruction error")
        plt.xlabel("Data point index")
        plt.savefig(autoencoder_path + outcome + "Reconstructionerror.pdf",
                    bbox_inches='tight')

        # ROC curve on the test set.
        false_pos_rate, true_pos_rate, thresholds = roc_curve(
            error_df.True_class, error_df.Reconstruction_error)
        roc_auc = auc(false_pos_rate, true_pos_rate)
        plt.figure(figsize=(10, 10))
        plt.plot(false_pos_rate, true_pos_rate, linewidth=5,
                 label='AUC = %0.3f' % roc_auc)
        plt.plot([0, 1], [0, 1], linewidth=5)
        plt.xlim([-0.01, 1])
        plt.ylim([0, 1.01])
        plt.legend(loc='lower right')
        plt.title('Receiver operating characteristic curve (ROC)')
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.savefig(autoencoder_path + outcome + "roc.pdf", bbox_inches='tight')

        # Precision-recall curve (computed on the validation set above).
        pr_auc = auc(recall_rt, precision_rt)
        plt.figure(figsize=(10, 10))
        plt.plot(recall_rt, precision_rt, linewidth=5,
                 label='PR-AUC = %0.3f' % pr_auc)
        plt.plot([0, 1], [1, 0], linewidth=5)
        plt.xlim([-0.01, 1])
        plt.ylim([0, 1.01])
        plt.legend(loc='lower right')
        # BUG FIX: typo 'Curive' -> 'Curve' in the user-facing title.
        plt.title('Precision Recall Curve')
        plt.ylabel('Precision')
        plt.xlabel('Recall')
        plt.savefig(autoencoder_path + outcome + "precision_recall_auc.pdf",
                    bbox_inches='tight')