def main():
    """Create, evaluate and plot a 12-feature LSTM model for one scenario.

    Command line (all four positional arguments are required):
        argv[1]  file holding the saved x array
        argv[2]  file holding the saved y array
        argv[3]  integer model scenario (used in the output file names)
        argv[4]  integer data scenario (used for the scenario lookups,
                 the time windowing and the output file names)

    The saved arrays are loaded, trimmed to the first 12 entries along
    axis 2, balanced, broken into time windows and separated into
    training and testing sets. create_model is then asked to build and
    save the model, which is evaluated and has its ROC curve written to
    a .png file.

    Exits with a usage message when arguments are missing.
    """
    if len(sys.argv) < 5:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit('usage: <script> <x_file> <y_file> '
                 '<model_scenario> <data_scenario>')

    step_length = 150
    interval_length = 300

    savefile_x = sys.argv[1]
    savefile_y = sys.argv[2]
    model_scenario = int(sys.argv[3])
    data_scenario = int(sys.argv[4])

    # Dictionary of malicious IP addresses with start timestamp as its value.
    # NOTE(review): only the disabled "regenerate the arrays from a pcap
    # file" path consumed this; the lookup is kept because any side effects
    # of get_botnet_nodes are not visible from here.
    botnet_nodes = scenario_info.get_botnet_nodes(data_scenario)
    pcap_duration = scenario_info.get_pcap_duration(data_scenario)

    name_suffix = ('_interval_' + str(interval_length)
                   + '_step_' + str(step_length))
    model_savefile = ('stateless_lstm_12_features_model_scenario_'
                      + str(model_scenario) + name_suffix + '.h5')
    roc_savefile = ('stateless_lstm_12_features_model_scenario_'
                    + str(model_scenario) + '_data_scenario_'
                    + str(data_scenario) + name_suffix + '.png')

    # The arrays are read back from disk; generating them from a pcap file
    # with prep_time_series_input.generate_input_arrays is a separate step.
    x, y = prep_time_series_input.load_input_arrays(
        filename_x=savefile_x, filename_y=savefile_y)

    # Drop everything from index 12 onward along axis 2
    # (presumably the feature axis, given the "12_features" file names).
    x = np.delete(x, np.s_[12:], 2)

    balanced_x, balanced_y = prep_time_series_input.balance_data(
        x, y, ratio=10)
    # The unbalanced arrays are not needed any more.
    del x
    del y

    windowed_x, windowed_y, num_samples, windows_per_sample = (
        prep_time_series_input.time_window_data(
            balanced_x, balanced_y, 5, 2, interval_length, step_length,
            data_scenario))

    # NOTE(review): an older comment here said the test set contains all the
    # data and therefore overlaps the training data; that seems to describe
    # a disabled variant of this split -- confirm against separate_into_sets.
    x_train, y_train, x_test, y_test = (
        prep_time_series_input.separate_into_sets(
            windowed_x, windowed_y, positive_proportion=0.7))

    # Single-argument print calls give the same text on Python 2 and 3
    # (the doubled space matches the old "label: ", value statements).
    print('Number of samples (training and testing):  ' + str(num_samples))
    print('Number of windows per sample (training and testing):  '
          + str(windows_per_sample))
    print('x_train, y_train shapes:  %s %s' % (x_train.shape, y_train.shape))
    print('x_test, y_test shapes:  %s %s' % (x_test.shape, y_test.shape))

    # TEMPORARY: the weights are applied to the labels here instead of in a
    # custom loss function (which is probably more correct); change later.
    weighted_y_train = np.copy(y_train)
    weighted_y_train[weighted_y_train == 1] = 6

    # TODO: add k-fold cross validation for the final evaluation of the model:
    # https://machinelearningmastery.com/evaluate-performance-deep-learning-models-keras/
    # http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html
    model = create_model(x_train, weighted_y_train, pcap_duration,
                         step_length, save_model=True,
                         savefile=model_savefile)

    evaluate_model(model, x_test, y_test, pcap_duration, step_length)
    generate_roc_curve(model, x_test, y_test, data_scenario, model_scenario,
                       savefile=roc_savefile)
def main():
    """Create, evaluate and plot one LSTM model on several scenarios combined.

    For every scenario in scenario_lst the saved (normalized) x array and
    the saved y array are loaded, balanced and broken into time windows;
    the windows of all scenarios are then joined along axis 0. The joined
    data is separated into training and testing sets, create_model is
    asked to build and save the model, and the model is evaluated and has
    its ROC curve written to a .png file.

    Takes no command-line arguments; input and output file names are
    derived from scenario_lst, interval_length and step_length.
    """
    step_length = 150
    interval_length = 300
    # Arbitrarily chosen value (used to calculate batch size).
    pcap_duration = 150000

    # Single source of truth: every file name below is derived from it.
    scenario_lst = [6, 7, 10, 11, 12]
    scenario_tag = '_'.join(str(scenario) for scenario in scenario_lst)
    name_suffix = ('_interval_' + str(interval_length)
                   + '_step_' + str(step_length))
    model_savefile = ('stateless_lstm_combined_model_scenario_'
                      + scenario_tag + name_suffix + '.h5')
    roc_savefile = ('stateless_lstm_combined_model_' + scenario_tag
                    + '_combined_data_' + scenario_tag
                    + name_suffix + '.png')

    # Seed each list with the same empty arrays the joined result used to
    # start from, so the (0, 5, 28) shape check and the float promotion of
    # the final arrays stay as they were. Joining once at the end avoids
    # recopying the growing array on every scenario.
    windowed_x_parts = [np.array([]).reshape(0, 5, 28)]
    windowed_y_parts = [np.array([])]
    # assumes time_window_data returns a numeric sample count -- TODO confirm
    total_samples = 0
    windows_per_sample = None

    for data_scenario in scenario_lst:
        savefile_x = ('x_scenario_' + str(data_scenario)
                      + '_lstm_normalized.txt')
        savefile_y = 'y_scenario_' + str(data_scenario) + '_lstm.txt'

        x, y = prep_time_series_input.load_input_arrays(
            filename_x=savefile_x, filename_y=savefile_y)
        balanced_x, balanced_y = prep_time_series_input.balance_data(
            x, y, ratio=10)
        # The unbalanced arrays are not needed any more.
        del x
        del y

        (current_windowed_x, current_windowed_y,
         num_samples, windows_per_sample) = (
            prep_time_series_input.time_window_data(
                balanced_x, balanced_y, 5, 2, interval_length, step_length,
                data_scenario))

        print(str(current_windowed_x.shape))
        print(str(current_windowed_y.shape))

        windowed_x_parts.append(current_windowed_x)
        windowed_y_parts.append(current_windowed_y)
        # Previously only the last scenario's count survived the loop even
        # though it was reported as the overall number of samples.
        total_samples += num_samples

    windowed_x = np.concatenate(windowed_x_parts, axis=0)
    windowed_y = np.concatenate(windowed_y_parts, axis=0)

    # NOTE(review): an older comment here said the test set contains all the
    # data and therefore overlaps the training data; that seems to describe
    # a disabled variant of this split -- confirm against separate_into_sets.
    x_train, y_train, x_test, y_test = (
        prep_time_series_input.separate_into_sets(
            windowed_x, windowed_y, positive_proportion=0.7))

    # Single-argument print calls give the same text on Python 2 and 3
    # (the doubled space matches the old "label: ", value statements).
    print('Number of samples (training and testing):  ' + str(total_samples))
    # windows_per_sample is the value reported for the last scenario;
    # presumably it is the same for every scenario -- verify.
    print('Number of windows per sample (training and testing):  '
          + str(windows_per_sample))
    print('x_train, y_train shapes:  %s %s' % (x_train.shape, y_train.shape))
    print('x_test, y_test shapes:  %s %s' % (x_test.shape, y_test.shape))

    # TEMPORARY: the weights are applied to the labels here instead of in a
    # custom loss function (which is probably more correct); change later.
    weighted_y_train = np.copy(y_train)
    weighted_y_train[weighted_y_train == 1] = 6

    # TODO: add k-fold cross validation for the final evaluation of the model:
    # https://machinelearningmastery.com/evaluate-performance-deep-learning-models-keras/
    # http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html
    model = create_model(x_train, weighted_y_train, pcap_duration,
                         step_length, save_model=True,
                         savefile=model_savefile)

    evaluate_model(model, x_test, y_test, pcap_duration, step_length)
    generate_roc_curve(model, x_test, y_test, savefile=roc_savefile)
def main():
    """Create, evaluate and plot a stateful LSTM model for scenario 11.

    Command line:
        argv[1]  path of the pcap file the input arrays are generated from

    Processing order matters: the data is first balanced (keeping a fixed
    ratio between positive and negative samples), then separated into
    training and testing sets, and only then broken into time windows.
    Balancing and separating first keeps each chosen sample's entire time
    interval together, so every set holds whole samples that are windowed
    afterwards, rather than windows picked haphazardly from across the
    entire time interval.

    Exits with a usage message when the pcap file argument is missing.
    """
    if len(sys.argv) < 2:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit('usage: <script> <pcap_file>')

    step_length = 15
    interval_length = 60

    model_scenario = 11
    # Scenario 9's data has good results for several models.
    data_scenario = 11

    pcap_file = sys.argv[1]
    # Dictionary of malicious IP addresses with start timestamp as its value
    botnet_nodes = scenario_info.get_botnet_nodes(data_scenario)
    pcap_duration = scenario_info.get_pcap_duration(data_scenario)

    savefile_x = 'x_scenario_' + str(data_scenario) + '_lstm.txt'
    savefile_y = 'y_scenario_' + str(data_scenario) + '_lstm.txt'
    model_savefile = ('stateful_lstm_model_scenario_'
                      + str(model_scenario) + '.h5')

    # Generate the arrays from the pcap file and save them (do_save=True).
    # The return value is not kept because the arrays are re-read from the
    # saved files right below.
    # NOTE(review): the reload looks redundant directly after generation;
    # both steps were active, so both are kept.
    prep_time_series_input.generate_input_arrays(
        pcap_file, botnet_nodes, pcap_duration,
        step_length=step_length, interval_length=interval_length,
        do_save=True, savefile_x=savefile_x, savefile_y=savefile_y,
        verbose=True)

    # x and y contain the entire dataset in these NumPy arrays.
    x, y = prep_time_series_input.load_input_arrays(
        filename_x=savefile_x, filename_y=savefile_y)

    # Balanced x and y arrays maintain a certain ratio; each sample
    # contains its entire time interval.
    balanced_x, balanced_y = prep_time_series_input.balance_data(
        x, y, ratio=10)

    # Pre-(x/y)-(train/test): each sample still contains its entire time
    # interval. The training set is taken from the balanced arrays
    # (training_proportion=1) and the test set from the full arrays
    # (training_proportion=0). A split of the balanced arrays with
    # positive_proportion=0.5 is the disabled alternative.
    pre_x_train, pre_y_train, _, _ = (
        prep_time_series_input.separate_into_sets(
            balanced_x, balanced_y, training_proportion=1))
    _, _, pre_x_test, pre_y_test = (
        prep_time_series_input.separate_into_sets(
            x, y, training_proportion=0))

    # (x, y)_(train/test) contain the chosen samples, broken into time
    # windows.
    x_train, y_train, num_training_samples, windows_per_training_sample = (
        prep_time_series_input.time_window_data(
            pre_x_train, pre_y_train, 5, 2, interval_length, step_length,
            data_scenario))
    x_test, y_test, num_testing_samples, windows_per_testing_sample = (
        prep_time_series_input.time_window_data(
            pre_x_test, pre_y_test, 5, 2, interval_length, step_length,
            data_scenario))

    # Single-argument print calls give the same text on Python 2 and 3
    # (the doubled space matches the old "label: ", value statements).
    print('Original x, y shapes:  %s %s' % (x.shape, y.shape))
    print('Number of training samples:  ' + str(num_training_samples))
    print('Number of windows per training sample:  '
          + str(windows_per_training_sample))
    print('Number of testing samples:  ' + str(num_testing_samples))
    print('Number of windows per testing sample:  '
          + str(windows_per_testing_sample))
    print('x_train, y_train shapes:  %s %s' % (x_train.shape, y_train.shape))
    print('x_test, y_test shapes:  %s %s' % (x_test.shape, y_test.shape))

    # TEMPORARY: the weights are applied to the labels here instead of in a
    # custom loss function (which is probably more correct); change later.
    weighted_y_train = np.copy(y_train)
    weighted_y_train[weighted_y_train == 1] = 6

    # save_model=False: the model is not written to model_savefile here.
    # Loading a saved model with load_model(model_savefile, ...) is the
    # disabled alternative to creating it.
    model = create_model(x_train, weighted_y_train, num_training_samples,
                         windows_per_training_sample, save_model=False,
                         savefile=model_savefile)

    evaluate_model(model, x_test, y_test, windows_per_testing_sample)
    generate_roc_curve(model, x_test, y_test, windows_per_testing_sample,
                       data_scenario, model_scenario)
def main():
    """Plot the ROC curve of a previously saved LSTM model for scenario 11.

    Command line:
        argv[1]  path of the pcap file the input arrays are generated from

    The input arrays are generated from the pcap file and saved, re-read
    from the saved files, broken into time windows, balanced and separated
    into training and testing sets. The model is loaded from
    model_savefile (it is not created here) and its ROC curve is generated
    on the testing set.

    Exits with a usage message when the pcap file argument is missing.
    """
    if len(sys.argv) < 2:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit('usage: <script> <pcap_file>')

    step_length = 15
    interval_length = 60

    model_scenario = 11
    # Scenario 9's data has good results for several models.
    data_scenario = 11

    pcap_file = sys.argv[1]
    # Dictionary of malicious IP addresses with start timestamp as its value
    botnet_nodes = scenario_info.get_botnet_nodes(data_scenario)
    pcap_duration = scenario_info.get_pcap_duration(data_scenario)

    savefile_x = 'x_scenario_' + str(data_scenario) + '_lstm.txt'
    savefile_y = 'y_scenario_' + str(data_scenario) + '_lstm.txt'
    model_savefile = ('stateless_lstm_model_scenario_'
                      + str(model_scenario) + '.h5')

    # Generate the arrays from the pcap file and save them (do_save=True).
    # The return value is not kept because the arrays are re-read from the
    # saved files right below.
    # NOTE(review): the reload looks redundant directly after generation;
    # both steps were active, so both are kept.
    prep_time_series_input.generate_input_arrays(
        pcap_file, botnet_nodes, pcap_duration,
        step_length=step_length, interval_length=interval_length,
        do_save=True, savefile_x=savefile_x, savefile_y=savefile_y,
        verbose=True)

    x, y = prep_time_series_input.load_input_arrays(
        filename_x=savefile_x, filename_y=savefile_y)

    # From here on x and y hold the time-windowed data.
    x, y, _, _ = prep_time_series_input.time_window_data(
        x, y, 5, 2, interval_length, step_length, data_scenario)

    balanced_x, balanced_y = prep_time_series_input.balance_data(
        x, y, ratio=10)

    # NOTE(review): an older comment here said the test set contains all the
    # data and therefore overlaps the training data; that seems to describe
    # a disabled variant of this split -- confirm against separate_into_sets.
    x_train, y_train, x_test, y_test = (
        prep_time_series_input.separate_into_sets(
            balanced_x, balanced_y, positive_proportion=0.5))

    # Single-argument print calls give the same text on Python 2 and 3.
    print('%s %s' % (x.shape, y.shape))
    print('%s %s' % (x_test.shape, y_test.shape))
    print('%s %s' % (x_train.shape, y_train.shape))

    # Model creation is disabled in this script: the saved model is loaded
    # instead. When creation was enabled, a copy of y_train with every 1
    # replaced by 6 was passed to create_model as temporary label weighting
    # (in place of a custom loss function); those copies were otherwise
    # unused and are no longer built.
    model = load_model(
        model_savefile,
        custom_objects={
            'true_positives': true_positives,
            'false_positives': false_positives,
            'true_negatives': true_negatives,
            'false_negatives': false_negatives,
            'true_positive_rate': true_positive_rate,
            'false_positive_rate': false_positive_rate,
            'true_negative_rate': true_negative_rate,
            'false_negative_rate': false_negative_rate,
        })

    # evaluate_model(model, x_test, y_test, pcap_duration, step_length) is
    # currently disabled; only the ROC curve is produced.
    generate_roc_curve(model, x_test, y_test, data_scenario, model_scenario)