def main() -> None:
    """Train an LSTM auto-encoder on ``ground_anomaly.csv`` and report anomalies.

    Reads ``./data/ground_anomaly.csv``, discards the first data row and the
    columns listed in the ``drop`` call, min-max scales the remaining columns,
    fits the model on the first 10,000 rows (saving it under ``./models``),
    reloads the saved model and scores the same 10,000 rows.  The index of
    every row flagged as abnormal is printed, followed by the abnormal count
    and the full index list; finally the per-row distances are plotted
    against the model threshold.
    """
    data_dir_path = './data'
    model_dir_path = './models'

    ecg_data = pd.read_csv(data_dir_path + '/ground_anomaly.csv')
    ecg_data = ecg_data[1:]  # positional slice: skip the first data row
    ecg_data = ecg_data.drop(
        ['TIMESTAMP', 'RECORD', 'AmbTemp_C_Avg', 'InvPAC_kW_Avg', 'PwrMtrP_kW_Avg'],
        axis=1,
    )

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # equivalent that works on both old and new pandas releases.
    ecg_np_data = ecg_data.values
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)

    ae = LstmAutoEncoder()

    print(ecg_data.shape)
    row_count = ecg_data.shape[0]  # shape[0] is the number of rows
    print(row_count)

    # fit the data and save model into model_dir_path
    ae.fit(ecg_np_data[:10000, :], model_dir_path=model_dir_path,
           estimated_negative_sample_ratio=0.95)

    # load back the model saved in model_dir_path, then detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(ecg_np_data[:10000, :])

    reconstruction_error = []
    idx_list = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        if is_anomaly:
            print(idx)
            idx_list.append(idx)
            print('# ' + str(idx) + ' is abnormal.')
        reconstruction_error.append(dist)

    # Every abnormal index was appended exactly once, so the list length is
    # the abnormal count.
    print(len(idx_list))
    print(idx_list)

    visualize_reconstruction_error(reconstruction_error, ae.threshold)
def main() -> None:
    """Score the first 23 rows of ``ecg_discord_test.csv`` with an LSTM auto-encoder.

    The CSV is read without a header row and min-max scaled.  When the
    module-level ``DO_TRAINING`` flag is truthy the model is first fitted on
    those 23 rows and saved under ``./models``; in every case the model is
    then loaded from ``./models`` and used to label each row as normal or
    abnormal.  One line per row is printed and the distances are plotted
    against the model threshold.
    """
    data_dir_path = './data'
    model_dir_path = './models'

    ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
    print(ecg_data.head())

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # equivalent that works on both old and new pandas releases.
    ecg_np_data = ecg_data.values
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)
    print(ecg_np_data.shape)

    ae = LstmAutoEncoder()

    # fit the data and save model into model_dir_path
    if DO_TRAINING:
        ae.fit(ecg_np_data[:23, :], model_dir_path=model_dir_path,
               estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path, then detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(ecg_np_data[:23, :])

    reconstruction_error = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal')
              + ' (dist: ' + str(dist) + ')')
        reconstruction_error.append(dist)

    visualize_reconstruction_error(reconstruction_error, ae.threshold)
def main() -> None:
    """Fit an LSTM auto-encoder on ADFA-LD feature rows and score all of them.

    ``train_normal.csv`` and ``test_attack.csv`` are read from the hard-coded
    data directory (first line skipped, no header), their last column is
    dropped, and the two frames are stacked in that order.  The stacked
    matrix is min-max scaled, the model is fitted on rows 0..831 and saved,
    then reloaded and applied to every row with an explicit threshold of
    1.75.  One line per row is printed and the distances are plotted.
    """
    data_dir_path = '/Users/Shariful/Documents/DataCamp/ADFA-LD(tf-idf)'
    model_dir_path = '/Users/Shariful/Documents/GitHubRepo/Datasets/adfa_demo/models'

    train_normal = pd.read_csv(data_dir_path + '/train_normal.csv', skiprows=1,
                               index_col=None, header=None)
    test_attack = pd.read_csv(data_dir_path + '/test_attack.csv', skiprows=1,
                              index_col=None, header=None)

    # Keep every column except the last one.
    # NOTE(review): presumably the last column is a label -- confirm against the CSVs.
    train_normal = train_normal.iloc[:, 0:-1]
    test_attack = test_attack.iloc[:, 0:-1]

    features = pd.concat([train_normal, test_attack], ignore_index=True)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # equivalent that works on both old and new pandas releases.
    features_np = features.values
    scaler = MinMaxScaler()
    features_np = scaler.fit_transform(features_np)
    print(features_np.shape)

    ae = LstmAutoEncoder()

    # fit the data and save model into model_dir_path
    # NOTE(review): 832 is presumably the size of the train_normal portion of
    # the stacked matrix -- confirm against the CSV row count.
    ae.fit(features_np[0:832, :], model_dir_path=model_dir_path,
           estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path, then detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(features_np, threshold=1.75)

    reconstruction_error = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal')
              + ' (dist: ' + str(dist) + ')')
        reconstruction_error.append(dist)

    visualize_reconstruction_error(reconstruction_error, ae.threshold)
def lstmnn(inputfile, weekday, lanedirection, hourfrom, hourto):
    """Detect traffic-volume outliers with an LSTM auto-encoder and render them.

    Loads ``./datalake/<inputfile>``, keeps the rows whose ``Week`` equals
    *weekday*, whose ``LaneDirection`` equals *lanedirection* and whose
    ``Hour`` lies in ``[hourfrom, hourto]``, then fits the auto-encoder on the
    min-max scaled ``Volume`` column (saving it under ``./models``), reloads
    it and scores the same rows.  Predictions are compared with the
    ``Outlier`` column to compute balanced accuracy, the per-row distances
    are plotted to an in-memory PNG, and everything is passed to the
    ``home.html`` template.

    Args:
        inputfile: File name of the CSV inside ``./datalake``.
        weekday: Value matched against the ``Week`` column.
        lanedirection: Value matched against the ``LaneDirection`` column.
        hourfrom: Inclusive lower bound on the ``Hour`` column.
        hourto: Inclusive upper bound on the ``Hour`` column.

    Returns:
        The result of ``render_template("home.html", ...)`` with ``graph``
        (inline base64 ``<img>`` tag), ``data`` (balanced-accuracy text),
        ``data3`` (HTML table of rows predicted abnormal) and ``data2``
        (seconds elapsed before plotting, rounded to two decimals).
    """
    # NOTE(review): a commented-out decorator in the original source suggests
    # this was served at
    # '/lstmrnn/<inputfile>/<day>/<int:lanedirection>/<int:hourfrom>/<int:hourto>'.
    begin = time.perf_counter()
    data_dir_path = './datalake'
    model_dir_path = './models'

    # NOTE(review): inputfile is concatenated into the path unchecked; if it
    # can come from a URL segment, validate it against path traversal.
    df = pd.read_csv(data_dir_path + '/' + inputfile)

    selected = (
        (df['Week'] == weekday)
        & (df['LaneDirection'] == lanedirection)
        & (df['Hour'] >= hourfrom)
        & (df['Hour'] <= hourto)
    )
    dat = df.loc[selected]
    dat.insert(0, 'Row', range(len(dat)))
    # Explicit copy: a column is added to this frame further down, and writing
    # to a frame derived by indexing triggers SettingWithCopyWarning.
    dat = dat[[
        'Row', 'Sdate', 'DayName', 'LaneNumber', 'DirectionDescription',
        'Volume', 'AvgSpeed', 'Outlier'
    ]].copy()
    print(dat)

    traffic_data = dat[['Volume']]
    print(traffic_data.head())
    traffic_np_data = traffic_data.values
    scaler = MinMaxScaler()
    traffic_np_data = scaler.fit_transform(traffic_np_data)
    print(traffic_np_data.shape)

    ae = LstmAutoEncoder()

    # fit the data and save model into model_dir_path
    ae.fit(traffic_np_data[:, :], model_dir_path=model_dir_path,
           estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path, then detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(traffic_np_data)

    # Collect flags in a plain list; growing a DataFrame one row at a time
    # re-allocates on every append and is quadratic in the number of rows.
    reconstruction_error = []
    predictions = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal')
              + ' (dist: ' + str(dist) + ')')
        predictions.append(1 if is_anomaly else 0)
        reconstruction_error.append(dist)

    dat['OutlierPrediction'] = np.array(predictions, dtype=int)
    df3 = dat.loc[dat['OutlierPrediction'] == 1]
    print(df3)

    # labels=[0, 1] forces a 2x2 matrix even when only one class occurs, so
    # the four-way unpacking below cannot fail on a single-class selection.
    # NOTE(review): assumes the 'Outlier' column is coded 0/1 -- confirm.
    tn, fp, fn, tp = confusion_matrix(
        dat['Outlier'].values, dat['OutlierPrediction'].values, labels=[0, 1]
    ).ravel()

    # Balanced accuracy is the mean of the per-class recalls.  A class with no
    # true samples has an undefined recall (0/0), so it is left out instead of
    # turning the whole score into NaN.
    recalls = []
    if tp + fn:
        recalls.append(tp / (tp + fn))  # sensitivity
    if tn + fp:
        recalls.append(tn / (tn + fp))  # specificity
    balanced_accuracy = sum(recalls) / len(recalls) if recalls else float('nan')

    elapsed = time.perf_counter() - begin
    print('Balanced Accuracy=%.2f' % (balanced_accuracy))

    # Draw on a dedicated figure and always close it: pyplot keeps global
    # state, so without this repeated calls would overlay earlier points on
    # the same axes and leak figures.
    img = io.BytesIO()
    fig = pyplot.figure()
    try:
        pyplot.plot(reconstruction_error, marker='o', ms=3.5, linestyle='',
                    label='Point')
        pyplot.hlines(ae.threshold, xmin=0, xmax=len(reconstruction_error) - 1,
                      colors="r", zorder=100, label='Threshold')
        pyplot.legend()
        pyplot.ylabel("Dist")
        pyplot.xlabel("Data point index")
        pyplot.savefig(img, format='png')
    finally:
        pyplot.close(fig)

    img.seek(0)
    plot_url = base64.b64encode(img.getvalue()).decode()
    rsp = '<img src="data:image/png;base64,{}">'.format(plot_url)

    return render_template(
        "home.html",
        graph=rsp,
        data="Balanced Accuracy = " + str(round(balanced_accuracy, 2)),
        data3=df3.to_html(),
        data2=round(elapsed, 2),
    )
def main() -> None:
    """Train on the Canali sliding-window set and score each test segment.

    Fits the LSTM auto-encoder on ``train_set.csv`` (batch size 1000, 20
    epochs), saves and reloads it, then scores every row of ``test_set.csv``
    with an explicit threshold of 150.  Row scores are reduced to one value
    per row of ``test_set_index_range_label.csv`` by taking the maximum
    distance among the test rows assigned to it.  The resulting scores are
    plotted, passed to ``plot_ROC`` together with the labels taken from the
    last column of the index file, and written to ``lstm_128_units.csv`` in
    the score directory.
    """
    data_dir_path = (r'/Users/Shariful/Documents/SysCallDataset/PreparedData'
                     r'/Canali_dataset/sliding_window_5')
    model_dir_path = (r'/Users/Shariful/Documents/GitHubRepo/deeplearning/'
                      r'syscall_anomaly/Canali/trained_models')
    score_dir_path = (r'/Users/Shariful/Documents/GitHubRepo/deeplearning/'
                      r'syscall_anomaly/Canali/scores')

    # ---- read the training set and fit the LSTM model ----
    canali_data = pd.read_csv(data_dir_path + '/train_set.csv', header=None)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # equivalent that works on both old and new pandas releases.
    canali_np_data = canali_data.values

    ae = LstmAutoEncoder()
    # fit the data and save model into model_dir_path
    ae.fit(canali_np_data, model_dir_path=model_dir_path, batch_size=1000,
           epochs=20, estimated_negative_sample_ratio=None)

    # ---- load the saved model ----
    ae.load_model(model_dir_path)

    # ---- read the test set ----
    test_idx_path = data_dir_path + '/test_set_index_range_label.csv'
    df_test_idx = pd.read_csv(test_idx_path, header=None)
    test_path = data_dir_path + '/test_set.csv'
    df_test = pd.read_csv(test_path, header=None)
    df_test_np = df_test.values
    test_labels = np.array(df_test_idx.iloc[:, -1])

    # ---- predict scores on the test set ----
    anomaly_information = ae.anomaly(df_test_np, threshold=150)

    # Column 1 of the index file is read once up front instead of going
    # through a pandas .loc lookup on every iteration.
    # NOTE(review): presumably this column holds the last test-row index of
    # each segment -- confirm against the file layout.
    segment_last_idx = df_test_idx[1].values

    idx_out = 0
    max_scores = np.zeros((df_test_idx.shape[0]))
    for idx_in, (is_anomaly, dist) in enumerate(anomaly_information):
        # keep the maximum score among the rows of the current segment
        if idx_in <= segment_last_idx[idx_out]:
            if max_scores[idx_out] < dist:
                max_scores[idx_out] = dist
        else:
            # first row past the current segment: move on to the next one
            idx_out += 1
            max_scores[idx_out] = dist

    visualize_reconstruction_error(max_scores, ae.threshold)

    # draw the roc curve
    plot_ROC(test_labels, max_scores)

    # save the computed scores
    np.savetxt(score_dir_path + '/lstm_128_units.csv', max_scores, delimiter=",")