import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# FeedForwardAutoEncoder, LstmAutoEncoder and the visualize_/plot_/report_
# helpers used below are project-specific and assumed to be importable from
# the surrounding anomaly-detection package; their imports were omitted in
# the original snippets.

DO_TRAINING = True  # module-level flag assumed by the first example


def main():
    data_dir_path = './data'
    model_dir_path = './models'

    # ECG data in which each row is a temporal sequence of continuous values
    ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
    print(ecg_data.head())
    ecg_np_data = ecg_data.values
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)

    print(ecg_np_data.shape)

    ae = FeedForwardAutoEncoder()

    # fit the data and save model into model_dir_path
    if DO_TRAINING:
        ae.fit(ecg_np_data[:23, :], model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path and detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(ecg_np_data[:23, :])
    reconstruction_error = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')')
        reconstruction_error.append(dist)

    visualize_reconstruction_error(reconstruction_error, ae.threshold)
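
# visualize_reconstruction_error is a project plotting helper that is not shown
# in these snippets. A minimal sketch of the two-argument form used above,
# assuming it simply plots each sample's reconstruction error against the
# autoencoder's threshold (the name and styling below are illustrative):
import matplotlib.pyplot as plt

def visualize_reconstruction_error_sketch(reconstruction_error, threshold):
    plt.plot(reconstruction_error, marker='o', linestyle='', markersize=3,
             label='reconstruction error')
    plt.axhline(y=threshold, color='r', label='threshold')
    plt.legend(loc='best')
    plt.show()
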
def main():
    data_dir_path = './data'
    model_dir_path = './models'
    ecg_data = pd.read_csv(data_dir_path + '/ground_anomaly.csv')
    ecg_data = ecg_data[1:]
    # print([name for name in ecg_data.columns])
    ecg_data = ecg_data.drop(['TIMESTAMP', 'RECORD', 'AmbTemp_C_Avg', 'InvPAC_kW_Avg', 'PwrMtrP_kW_Avg'], axis=1)
    ecg_np_data = ecg_data.values
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)

    ae = LstmAutoEncoder()
    print(ecg_data.shape)
    row_count = ecg_data.shape[0]
    print(row_count)

    # fit the data and save model into model_dir_path
    ae.fit(ecg_np_data[:10000, :], model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.95)

    # load back the model saved in model_dir_path and detect anomalies
    ae.load_model(model_dir_path)
    anomaly_information = ae.anomaly(ecg_np_data[:10000, :])
    reconstruction_error = []
    abnormal_number = 0
    idx_list = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        if is_anomaly:
            abnormal_number = abnormal_number + 1
            print(idx)
            idx_list.append(idx)
            print('# ' + str(idx) + ' is abnormal.')
        reconstruction_error.append(dist)
    print(abnormal_number)
    print(idx_list)
    visualize_reconstruction_error(reconstruction_error, ae.threshold)
def main():

#    data_dir_path = '/Users/Shariful/Documents/GitHubRepo/Datasets/ecg_demo/data'
    data_dir_path = '/Users/Shariful/Documents/DataCamp/ADFA-LD(tf-idf)'
#    model_dir_path = '/Users/Shariful/Documents/GitHubRepo/Datasets/ecg_demo/models'
    model_dir_path = '/Users/Shariful/Documents/GitHubRepo/Datasets/adfa_demo/models'

#    ecg_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
#    ecg_data1 = pd.read_csv(data_dir_path + '/test_normal.csv', skiprows=1, \
#                           index_col=None, header=None)
    ecg_data2 = pd.read_csv(data_dir_path + '/train_normal.csv', skiprows=1, \
                           index_col=None, header=None)
    ecg_data3 = pd.read_csv(data_dir_path + '/test_attack.csv', skiprows=1, \
                           index_col=None, header=None)
#    ecg_data1 = ecg_data1.iloc[:, 0:-1]
    ecg_data2 = ecg_data2.iloc[:, 0:-1]
    ecg_data3 = ecg_data3.iloc[:, 0:-1]
    
    ecg_data = pd.concat([ecg_data2, ecg_data3], ignore_index=True)
    
#    print(ecg_data.head())
    ecg_np_data = ecg_data.values
    scaler = MinMaxScaler()
    ecg_np_data = scaler.fit_transform(ecg_np_data)
    print(ecg_np_data.shape)

    ae = LstmAutoEncoder()

    # fit the data and save model into model_dir_path
    ae.fit(ecg_np_data[0:832, :], model_dir_path=model_dir_path, estimated_negative_sample_ratio=0.9)

    # load back the model saved in model_dir_path and detect anomalies
    ae.load_model(model_dir_path)
#    anomaly_information = ae.anomaly(ecg_np_data[:23, :])
    anomaly_information = ae.anomaly(ecg_np_data, threshold=1.75)
    reconstruction_error = []
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')')
        reconstruction_error.append(dist)

    visualize_reconstruction_error(reconstruction_error, ae.threshold)
def main():
#================read training dataset====================

    #    train_path = '/Users/Shariful/Documents/GitHubRepo/Datasets/ADFA-LD/n-gram/5-gram/train/5_gram.csv'
#    attack test path
#    test_path = '/Users/Shariful/Documents/GitHubRepo/Datasets/ADFA-LD/n-gram/5-gram/5_gram_attack_2.csv'
#    test_data = pd.read_csv(test_path, index_col=0, usecols=[0,1,2,3,4,5])
#    test_data_np = test_data.as_matrix()
#    normal test path
    
#    data_dir_path = '/Users/Shariful/Documents/GitHubRepo/Datasets/ecg_demo/data'
    data_dir_path = '/Users/Shariful/Documents/GitHubRepo/Datasets/ADFA-LD/n-gram/5-gram/train'
#    model_dir_path = '/Users/Shariful/Documents/GitHubRepo/Datasets/ecg_demo/models'
    model_dir_path = '/Users/Shariful/Documents/GitHubRepo/Datasets/ADFA-LD/models_5_gram'
    
    score_dir_path = '/Users/Shariful/Documents/GitHubRepo/deeplearning/syscall_anomaly/scores_on_testset'

#    adfa_data = pd.read_csv(data_dir_path + '/ecg_discord_test.csv', header=None)
    adfa_data = pd.read_csv(data_dir_path + '/5_gram.csv', \
                           index_col=0, usecols=[0,1,2,3,4,5])

##==================Fit the LSTM model=====================
##    ['0','1','2','3','4']
##    adfa_data = adfa_data.iloc[:, 0:-1]
##    print(adfa_data.head())
#    adfa_np_data = adfa_data.as_matrix()
##    scaler = MinMaxScaler()
##    adfa_np_data = scaler.fit_transform(adfa_np_data)
##    print(adfa_np_data.shape)
#
#    ae = LstmAutoEncoder()
#
#    # fit the data and save model into model_dir_path
#    ae.fit(adfa_np_data, model_dir_path=model_dir_path, batch_size=100, \
#           epochs=20, estimated_negative_sample_ratio=None)

##==========Load the saved model===========
#    
#    # load back the model saved in model_dir_path detect anomaly
#    ae.load_model(model_dir_path)

#=============read test dataset===============
    
#    test data set
    test_idx_path = '/Users/Shariful/Documents/GitHubRepo/Datasets/ADFA-LD/n-gram/5-gram/5_gram_test_idx.csv'
    df_test_idx = pd.read_csv(test_idx_path, header=None, skiprows=1)
    test_path = '/Users/Shariful/Documents/GitHubRepo/Datasets/ADFA-LD/n-gram/5-gram/5_gram_test.csv'
    df_test = pd.read_csv(test_path, header=None, skiprows=1)
    df_test_np = df_test.values
#    df_test_np = df_test_np[0:123649,:]

    # the first 60 test traces are attacks (label 1), the rest are normal (label 0)
    test_labels = np.hstack((np.ones(60, dtype=int),
                             np.zeros(df_test_idx.shape[0] - 60, dtype=int)))
        
#    ecg_np_test_data = adfa_np_data[0:43559, :]
#    test_data_np = np.vstack((ecg_np_test_data, test_data_np))

##================predict scores on testing set============
#
#    
##    anomaly_information = ae.anomaly(adfa_np_data[:23, :])
#    anomaly_information = ae.anomaly(df_test_np, threshold=150)
##    reconstruction_error = []
#    idx_out = 0
#    max_scores = np.zeros((df_test_idx.shape[0]))
#    for idx_in, (is_anomaly, dist) in enumerate(anomaly_information):
##        print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')')
##        reconstruction_error.append(dist)
#
#        #finding the maximum score out of all subsequences' scores
#        if idx_in <= df_test_idx.loc[idx_out][:][1]:
#            if max_scores[idx_out] < dist:
#                max_scores[idx_out] = dist
#        else:
#            idx_out += 1
#            max_scores[idx_out] = dist
#
##    visualize_reconstruction_error(reconstruction_error, ae.threshold)
#    visualize_reconstruction_error(max_scores, ae.threshold)
    
    
#=============load and plot the computed scores on testing set==============  
    
    max_scores = pd.read_csv('/Users/Shariful/Documents/GitHubRepo/deeplearning/syscall_anomaly/scores_on_testset/lstm_128_units.csv',
                             header=None)
    visualize_reconstruction_error(max_scores, 150)
    
#    draw the roc curve
    plot_ROC(test_labels, max_scores)
def AutoEncoder_test(X_data, Y_data, sub_output_dir, num, model_name, ae, error_list):

    model_dir_path = sub_output_dir + model_name + '/'

    anomaly_dir = model_dir_path + 'anomaly/'
    png_dir_1 = sub_output_dir + '1_png/'
    png_dir_2 = model_dir_path + 'png/'
    metrics_dir_1 = sub_output_dir + '2_metrics/'
    metrics_dir_2 = model_dir_path + 'metrics/'
    confusion_dir_1 = sub_output_dir + '3_confusion/'
    confusion_dir_2 = model_dir_path + 'confusion/'

    create_directory(model_dir_path)
    create_directory(anomaly_dir)
    create_directory(png_dir_1)
    create_directory(png_dir_2)
    create_directory(metrics_dir_1)
    create_directory(metrics_dir_2)
    create_directory(confusion_dir_1)
    create_directory(confusion_dir_2)

    # estimated_negative_sample_ratio: fraction of normal (label 0) samples,
    # presumably used by ae.fit() to place the anomaly threshold
    total_count = len(X_data)
    normal_count = sum(1 for label in Y_data if label == 0)
    estimated_negative_sample_ratio = normal_count / total_count

    # fit the data and save model into model_dir_path
    history = ae.fit(X_data, model_dir_path=model_dir_path, estimated_negative_sample_ratio=estimated_negative_sample_ratio)

    # load back the model saved in model_dir_path detect anomaly
    #ae.load_model(model_dir_path)

    # flip this to 1 to evaluate on a held-out split instead of the full data
    if 0:
        _, Xtest, _, Ytest = train_test_split(X_data, Y_data, test_size=0.5, random_state=1004)
    else:
        Xtest = X_data
        Ytest = Y_data

    adjusted_threshold = ae.threshold
    anomaly_information = ae.anomaly(Xtest, adjusted_threshold)
    reconstruction_error = []
    Ypred = []

    file_name_info = anomaly_dir + str(num) + '_anomaly.txt'
    f1 = open(file_name_info, mode='at')
    f2 = open(model_dir_path + 'dist.csv', mode='at')

    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        temp_str = '# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')'
        #print(temp_str)
        f1.write(temp_str + '\n')
        true_label = Y_data[idx]
        predicted_label = 1 if is_anomaly else 0
        Ypred.append(predicted_label)
        reconstruction_error.append(dist)

        anomal_str = str(idx) + ',' + str(true_label) + ',' + str(dist)
        f2.write(anomal_str+'\n')

    f1.close()
    f2.close()

    png_name_info_1 = png_dir_1 + str(num) + '_' + model_name + '_anomaly.png'
    png_name_info_2 = png_dir_2 + str(num) + '_' + model_name + '_anomaly.png'
    png_title = str(num) + '_'  + model_name + '_' + str(len(X_data))
    visualize_reconstruction_error(reconstruction_error, ae.threshold, Y_data, png_name_info_1, png_name_info_2, png_title, WINDOW_SIZE, error_list)
    plot_training_history_file(history, model_dir_path, num)

    #visualize_anomaly(Ytest, reconstruction_error, adjusted_threshold)
    visualize_anomaly_errors(Ytest, reconstruction_error, adjusted_threshold, error_list, png_title, model_dir_path, num)
    report_evaluation_metrics_file(Ytest, Ypred, metrics_dir_1, metrics_dir_2, num, model_name)
    plot_confusion_matrix_file(Ytest, Ypred, confusion_dir_1, confusion_dir_2, num, model_name)
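
# A hypothetical driver for AutoEncoder_test, assuming X_data holds one feature
# vector per row and Y_data the matching 0/1 labels; every name below is
# illustrative and not part of the original code:
def run_autoencoder_test_demo():
    X_demo = np.random.rand(200, 30)   # 200 samples, 30 features each
    Y_demo = np.zeros(200, dtype=int)
    Y_demo[-10:] = 1                   # treat the last 10 samples as anomalies
    AutoEncoder_test(X_demo, Y_demo, './demo_output/', 0, 'demo_model',
                     LstmAutoEncoder(), error_list=[])
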
def main():
    #================read training dataset====================

    #    train_path = '/Users/Shariful/Documents/GitHubRepo/Datasets/ADFA-LD/n-gram/5-gram/train/5_gram.csv'
    #    attack test path
    #    test_path = '/Users/Shariful/Documents/GitHubRepo/Datasets/ADFA-LD/n-gram/5-gram/5_gram_attack_2.csv'
    #    test_data = pd.read_csv(test_path, index_col=0, usecols=[0,1,2,3,4,5])
    #    test_data_np = test_data.as_matrix()
    #    normal test path

    #    data_dir_path = '/Users/Shariful/Documents/GitHubRepo/Datasets/ecg_demo/data'
    data_dir_path = (r'/Users/Shariful/Documents/SysCallDataset/PreparedData'
                     r'/Canali_dataset/sliding_window_5')
    #    model_dir_path = '/Users/Shariful/Documents/GitHubRepo/Datasets/ecg_demo/models'
    model_dir_path = (r'/Users/Shariful/Documents/GitHubRepo/deeplearning/'
                      r'syscall_anomaly/Canali/trained_models')

    score_dir_path = (r'/Users/Shariful/Documents/GitHubRepo/deeplearning/'
                      r'syscall_anomaly/Canali/scores')

    canali_data = pd.read_csv(data_dir_path + '/train_set.csv', header=None)
    #    canali_data = pd.read_csv(data_dir_path + '/train_set.csv', \
    #                           index_col=0, usecols=[0,1,2,3,4,5])

    #==================Fit the LSTM model=====================
    #    ['0','1','2','3','4']
    #    canali_data = canali_data.iloc[:, 0:-1]
    #    print(canali_data.head())
    canali_np_data = canali_data.values
    #    scaler = MinMaxScaler()
    #    canali_np_data = scaler.fit_transform(canali_np_data)
    #    print(canali_np_data.shape)

    ae = LstmAutoEncoder()

    # fit the data and save model into model_dir_path
    ae.fit(canali_np_data, model_dir_path=model_dir_path, batch_size=1000, \
           epochs=20, estimated_negative_sample_ratio=None)

    #==========Load the saved model===========

    # load back the model saved in model_dir_path and detect anomalies
    ae.load_model(model_dir_path)

    #=============read test dataset===============

    #    test data set
    test_idx_path = data_dir_path + '/test_set_index_range_label.csv'
    df_test_idx = pd.read_csv(test_idx_path, header=None)

    test_path = data_dir_path + '/test_set.csv'
    df_test = pd.read_csv(test_path, header=None)

    df_test_np = df_test.values
    #    df_test_np = df_test_np[0:123649,:]

    test_labels = np.array(df_test_idx.iloc[:, -1])

    #    ecg_np_test_data = canali_np_data[0:43559, :]
    #    test_data_np = np.vstack((ecg_np_test_data, test_data_np))

    #================predict scores on testing set============

    #    anomaly_information = ae.anomaly(canali_np_data[:23, :])
    anomaly_information = ae.anomaly(df_test_np, threshold=150)
    #    reconstruction_error = []
    idx_out = 0
    max_scores = np.zeros((df_test_idx.shape[0]))
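    # df_test_idx is assumed to map each original test trace to the index range of
    # its subsequences in df_test (range end in column 1) and its label (last
    # column); each trace is scored by its maximum subsequence reconstruction error.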
    for idx_in, (is_anomaly, dist) in enumerate(anomaly_information):
        #        print('# ' + str(idx) + ' is ' + ('abnormal' if is_anomaly else 'normal') + ' (dist: ' + str(dist) + ')')
        #        reconstruction_error.append(dist)

        #finding the maximum score out of all subsequences' scores
        if idx_in <= df_test_idx.iloc[idx_out, 1]:
            if max_scores[idx_out] < dist:
                max_scores[idx_out] = dist
        else:
            idx_out += 1
            max_scores[idx_out] = dist

#    visualize_reconstruction_error(reconstruction_error, ae.threshold)
    visualize_reconstruction_error(max_scores, ae.threshold)

    #=============load and plot the computed scores on testing set==============

    #    max_scores = pd.read_csv('/Users/Shariful/Documents/GitHubRepo/deeplearning/syscall_anomaly/scores_on_testset/lstm_128_units.csv', \
    #                            header = None)
    #    visualize_reconstruction_error(max_scores, 150)

    #    draw the roc curve
    plot_ROC(test_labels, max_scores)

    #    save the computed scores
    np.savetxt(score_dir_path + '/lstm_128_units.csv',
               max_scores,
               delimiter=",")