Example #1
def calculate_y(sample_y, scenario, window_start_time, window_end_time):
    # If the node is never infected, it is always non-malicious
    if sample_y == 0:
        return 0
    # assert window_start_time < window_end_time
    infection_time = min(scenario_info.get_botnet_nodes(scenario).values())
    if window_start_time < infection_time and window_end_time < infection_time:
        return 0
    elif window_start_time < infection_time and window_end_time > infection_time:
        return 1  # I can experiment with this value
    else:  # if window_start_time > infection_time and window_end_time > infection_time
        return 1
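# Example (hypothetical infection time of 100): a window covering [50, 90] returns 0,
# a window covering [90, 110] that straddles the infection time returns 1, and a
# window entirely after the infection (e.g. [110, 150]) returns 1.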
def main():
    step_length = 150
    interval_length = 300

    model_scenario = int(sys.argv[3])
    data_scenario = int(sys.argv[4])

    # pcap_file = sys.argv[1]
    # Dictionary of malicious IP addresses with start timestamp as its value
    botnet_nodes = scenario_info.get_botnet_nodes(data_scenario)
    pcap_duration = scenario_info.get_pcap_duration(data_scenario)  # * 0.1

    savefile_x = sys.argv[1]  # 'x_scenario_' + str(data_scenario) + '_lstm.txt'
    savefile_y = sys.argv[2]  # 'y_scenario_' + str(data_scenario) + '_lstm.txt'
    model_savefile = 'stateless_lstm_12_features_model_scenario_' + str(model_scenario) \
        + '_interval_' + str(interval_length) + '_step_' + str(step_length) + '.h5'
    '''
    x, y = prep_time_series_input.generate_input_arrays(pcap_file, \
        botnet_nodes, pcap_duration, step_length = step_length, \
        interval_length = interval_length, do_save=True, \
        savefile_x=savefile_x, savefile_y=savefile_y, verbose = True)
    '''
    x, y = prep_time_series_input.load_input_arrays(filename_x=savefile_x, \
        filename_y=savefile_y)
    x = np.delete(x, np.s_[12:], 2)  # keep only the first 12 features (drop feature indices 12 and up)

    balanced_x, balanced_y = prep_time_series_input.balance_data(x,
                                                                 y,
                                                                 ratio=10)
    del x
    del y
    windowed_x, windowed_y, num_samples, windows_per_sample = prep_time_series_input.time_window_data \
        (balanced_x, balanced_y, 5, 2, interval_length, step_length, data_scenario)
    # Note that the test set contains all the data so obviously it includes the
    # training data...since the training data is so limited, it likely will have
    # little effect on the outcome though
    '''
    _, _, x_test, y_test = separate_into_sets(x, y, training_proportion = 0)
    x_train, y_train, _, _ = \
        separate_into_sets(balanced_x, balanced_y, training_proportion = 0.7)
    '''
    x_train, y_train, x_test, y_test = prep_time_series_input. \
        separate_into_sets(windowed_x, windowed_y, positive_proportion = 0.7)

    print "Number of samples (training and testing): ", str(num_samples)
    print "Number of windows per sample (training and testing): ", str(
        windows_per_sample)
    print "x_train, y_train shapes: ", x_train.shape, y_train.shape
    print "x_test, y_test shapes: ", x_test.shape, y_test.shape

    weighted_y_train = np.copy(y_train)
    weighted_y_train[weighted_y_train == 1] = 6
    weighted_y_test = np.copy(y_test)
    weighted_y_test[weighted_y_test == 1] = 6
    # TEMPORARY: I AM APPLYING MY WEIGHTS HERE INSTEAD OF IN A CUSTOM LOSS FUNCTION
    # (WHICH IS PROBABLY MORE CORRECT); CHANGE THIS LATER
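    # A sketch of the "more correct" alternative mentioned above (assumption: the
    # weighting would be passed to the model.fit call inside create_model). Keras can
    # weight the positive class in the loss itself via class_weight, so the labels
    # could stay as plain 0/1:
    '''
    model.fit(x_train, y_train, epochs=10, batch_size=32, class_weight={0: 1., 1: 6.})
    '''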
    '''
    ADD K-FOLD CROSS VALIDATION SOON...NOT NECESSARY RIGHT NOW FOR TESTING PURPOSES
    BUT DEFINITELY SHOULD DO IT FOR THE FINAL EVALUATION OF THE MODEL.
    https://machinelearningmastery.com/evaluate-performance-deep-learning-models-keras/
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html
    '''
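    # A minimal k-fold sketch (assumptions: scikit-learn is available and
    # create_model/evaluate_model accept the fold arrays directly; this is only the
    # shape of the loop the links above describe, not the final evaluation):
    '''
    from sklearn.model_selection import StratifiedKFold
    for fold_train, fold_test in StratifiedKFold(n_splits=5, shuffle=True) \
            .split(windowed_x, windowed_y):
        fold_model = create_model(windowed_x[fold_train], windowed_y[fold_train], \
            pcap_duration, step_length, save_model=False, savefile=model_savefile)
        evaluate_model(fold_model, windowed_x[fold_test], windowed_y[fold_test], \
            pcap_duration, step_length)
    '''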

    model = create_model(x_train, weighted_y_train, pcap_duration, step_length, \
        save_model=True, savefile=model_savefile)
    """
    model = load_model(model_savefile, custom_objects = \
        {'true_positives': true_positives, 'false_positives': false_positives, \
         'true_negatives': true_negatives, 'false_negatives': false_negatives, \
         'true_positive_rate': true_positive_rate, \
         'false_positive_rate': false_positive_rate, \
         'true_negative_rate': true_negative_rate, \
         'false_negative_rate': false_negative_rate})
    """
    evaluate_model(model, x_test, y_test, pcap_duration, step_length)
    generate_roc_curve(model, x_test, y_test, data_scenario, model_scenario, \
        savefile = 'stateless_lstm_12_features_model_scenario_' + str(model_scenario) \
        + '_data_scenario_' + str(data_scenario) + '_interval_' \
        + str(interval_length) + '_step_' + str(step_length) + '.png')
def main():
	step_length = 60
	interval_length = 120
	
	data_scenario = 11
	#pcap_file = sys.argv[1]
	# Dictionary of malicious IP addresses with start timestamp as its value
	botnet_nodes = scenario_info.get_botnet_nodes(data_scenario)
	pcap_duration = scenario_info.get_pcap_duration(data_scenario) # * 0.1

	'''
	savefile_x = 'x_scenario_' + str(data_scenario) + '_one_graph.txt'
	savefile_y = 'y_scenario_' + str(data_scenario) + '_one_graph.txt'
	
	x, y = prep_time_series_input.generate_input_arrays(pcap_file, \
		botnet_nodes, pcap_duration, step_length = step_length, \
		interval_length = interval_length, do_save=True, \
		savefile_x=savefile_x, savefile_y=savefile_y, verbose = True)
	
	x_total, y_total = prep_time_series_input. \
		load_input_arrays(filename_x=savefile_x, filename_y=savefile_y)
	'''
	# The below must be a regular SOM input file
	x, y = load_input_arrays(filename_x=sys.argv[1], filename_y=sys.argv[2])
	length, height = 5, 5 # SOM dimensions
	# Train a 5x5 SOM with 100 iterations
	
	print "Training the SOM"
	som = SOM(length, height, VECTOR_SIZE, 100)
	som.train(x, verbose = True)
	 
	# Get output grid
	clusters = np.array(som.get_centroids())
	clusters = np.reshape(clusters, (length * height, VECTOR_SIZE))
	
	print clusters

	# The below must be an LSTM input file
	filename_x = sys.argv[3]
	filename_y = sys.argv[4]
	mat_filename = 'output_mat_file.mat'

	x_total, y = prep_time_series_input.load_input_arrays(filename_x \
		= filename_x, filename_y = filename_y)
	num_samples = len(x_total)

	"""
	The x and y are now in the shape [[[feature size] * time_stamps] * samples]
	but we need it in the shape [[[feature size] * samples] * time_stamps] to
	represent all samples over time intervals - hence we transpose the matrix.
	The y is in the proper shape and can be re-used for every time interval in
	x_total
	"""
	x_total = np.transpose(x_total, axes=[1,0,2])
	num_time_intervals = len(x_total)
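	# A tiny illustration of the transpose above (hypothetical sizes): an array of
	# shape (samples, time_steps, features) such as (3, 4, 2) becomes
	# (time_steps, samples, features), i.e. np.transpose(a, axes=[1, 0, 2]).shape == (4, 3, 2)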

	# Object array following the layout of the .mat file's cell array
	arr = np.ndarray(shape=(num_time_intervals, 3), dtype=object)
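	# (Assumption about scipy behaviour: scipy.io.savemat writes a NumPy object array
	# as a MATLAB cell array, so the 'data' variable saved below becomes a
	# num_time_intervals x 3 cell in MATLAB, with data{t, 2} holding the node indices
	# and data{t, 3} the cluster labels for time step t.)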
	# nodes_array = np.array(range(len(y)))
	# the above gives a flat array; the line below reshapes it into a column vector
	# NOTE: I'M ADDING DUMMY NODES TO MAKE THE FIRST TIME STEP HAVE ALL THE GROUPS
	# (NEEDED FOR THE VISUALIZATION TO SHOW THESE GROUPS)
	nodes_array = np.array(range(len(y) + (length * height + 1)))[np.newaxis].transpose()
	# nodes_array = np.array(range(len(y)))[np.newaxis].transpose()
	nodes_array += 1 # MATLAB INDICES START AT 1
	nodes_array = np.ndarray.astype(nodes_array, dtype=np.uint64)
	time_step = 0
	for x in x_total: # Iterate over time stamps
		i = 0
		print "Time interval " + str(time_step + 1) + ' / ' \
			+ str(num_time_intervals)
		cluster_array = np.array([])
		for sample in x:
			cluster_number = most_similar_group(sample, clusters)
			cluster_array = np.append(cluster_array, cluster_number)
			if y[i] != 0:
				print "Botnet node: cluster " + str(cluster_number + 1)
			i += 1

		'''
		if time_step == 0: # account for the dummy nodes
			cluster_array = np.append(cluster_array, range(length * height + 1))
		else:
			cluster_array = np.append(cluster_array, [0] * (length * height + 1))
		'''
		print "Actual (not dummy) clusters: ", str(np.unique(cluster_array + 1))
		cluster_array = np.append(cluster_array, range(length * height + 1))

		cluster_array = cluster_array[np.newaxis].transpose()
		cluster_array += 1 # MATLAB INDICES START AT 1
		cluster_array = np.ndarray.astype(cluster_array, dtype=np.uint64)

		arr[time_step][0] = np.array([0])
		# ^ dummy variable - eventually I can remove this and modify the MATLAB code
		arr[time_step][1] = nodes_array
		arr[time_step][2] = cluster_array
		time_step += 1
		# m x n matrix with each cell containing lists of indices of input vectors
		# mapped to it
		'''
		# Now I obviously can't use grid_map but I should eventually implement its 
		# equivalent so that I can identify the botnet nodes in the visualization
		grid_map = som.get_grid_mapping(mapped)
		# print 'grid_map: ', str(grid_map)
		# m x n matrix with each cell containing the number of input vectors 
		# mapped to it
		num_grid = [[len(grid_map[i][j]) for j in range(len(grid_map[i]))] \
		    for i in range(len(grid_map))]
		num_botnet_grid = [[count_botnet_nodes(grid_map[i][j], y) for j in \
			range(len(grid_map[i]))] for i in range(len(grid_map))]
		print num_grid
		print num_botnet_grid
		'''
		# if time_step == 10:
		# 	break

	output_dict = {'data': arr}
	# print arr
	# print output_dict
	print "Saving mat file"
	scipy.io.savemat(mat_filename, output_dict)
	# For some reason the .mat file is created and saved just fine but the MATLAB
	# visualization portion crashes; for now I will just use the generated .mat file
	# and make do, but eventually this needs to be fixed.
Example #4
def main():
    step_length = 15
    interval_length = 60
    
    model_scenario = 11
    data_scenario = 11 # scenario 9's data has good results for several models

    pcap_file = sys.argv[1]
    # Dictionary of malicious IP addresses with start timestamp as its value
    botnet_nodes = scenario_info.get_botnet_nodes(data_scenario)
    pcap_duration = scenario_info.get_pcap_duration(data_scenario) # * 0.1

    savefile_x = 'x_scenario_' + str(data_scenario) + '_lstm.txt'
    savefile_y = 'y_scenario_' + str(data_scenario) + '_lstm.txt'
    model_savefile = 'stateful_lstm_model_scenario_' + str(model_scenario) + '.h5'

    '''
    Note that it's important that the original x and y are processed in the
    following order: balanced (maintaining a certain ratio between positive and
    negative samples), separated into training and testing sets, and then
    broken into time windows (for the stateful LSTM). Balancing the data first
    keeps the entire initial time interval for the chosen samples, so the
    training and testing sets each consist of fewer samples with their entire
    time intervals. Finally we break each set's samples' entire time interval
    into time windows (as opposed to breaking into time windows first and
    haphazardly choosing time windows from the entire time interval).
    '''

    # x and y contain the entire dataset in these NumPy arrays
    x, y = prep_time_series_input.generate_input_arrays(pcap_file, botnet_nodes, pcap_duration, \
        step_length = step_length, interval_length = interval_length, \
        do_save=True, savefile_x=savefile_x, savefile_y=savefile_y, \
        verbose = True)
    '''
    '''
    x, y = prep_time_series_input.load_input_arrays(filename_x=savefile_x, filename_y=savefile_y)

    # Balanced x and y arrays maintain a certain ratio; each sample contains
    # its entire time interval
    balanced_x, balanced_y = prep_time_series_input.balance_data(x, y, ratio = 10)

    # Pre-(x/y)-(train/test) separate the balanced x and y arrays based on a
    # certain ratio -> each sample still contains its entire time interval
    '''
    # Note that the test set contains all the data so obviously it includes the
    # training data
    _, _, pre_x_test, pre_y_test = \
        separate_into_sets(x, y, training_proportion = 0)
    '''
    pre_x_train, pre_y_train, _, _ = prep_time_series_input. \
        separate_into_sets(balanced_x, balanced_y, training_proportion = 1)
    _, _, pre_x_test, pre_y_test = prep_time_series_input. \
        separate_into_sets(x, y, training_proportion = 0)
    '''
    pre_x_train, pre_y_train, pre_x_test, pre_y_test = prep_time_series_input. \
        separate_into_sets(balanced_x, balanced_y, positive_proportion = 0.5)
    '''

    # (x,y)_(train/test) contains the chosen samples (balanced and broken into
    # time windows)
    x_train, y_train, num_training_samples, windows_per_training_sample \
        = prep_time_series_input.time_window_data(pre_x_train, pre_y_train, 5, 2, \
        interval_length, step_length, data_scenario)
    x_test, y_test, num_testing_samples, windows_per_testing_sample \
        = prep_time_series_input.time_window_data(pre_x_test, pre_y_test, 5, 2, \
        interval_length, step_length, data_scenario)

    print "Original x, y shapes: ", x.shape, y.shape
    print "Number of training samples: ", str(num_training_samples)
    print "Number of windows per training sample: ", str(windows_per_training_sample)
    print "Number of testing samples: ", str(num_testing_samples)
    print "Number of windows per testing sample: ", str(windows_per_testing_sample)
    print "x_train, y_train shapes: ", x_train.shape, y_train.shape
    print "x_test, y_test shapes: ", x_test.shape, y_test.shape

    weighted_y_train = np.copy(y_train)
    weighted_y_train[weighted_y_train == 1] = 6
    weighted_y_test = np.copy(y_test)
    weighted_y_test[weighted_y_test == 1] = 6
    # TEMPORARY: I AM APPLYING MY WEIGHTS HERE INSTEAD OF IN A CUSTOM LOSS FUNCTION
    # (WHICH IS PROBABLY MORE CORRECT); CHANGE THIS LATER
    model = create_model(x_train, weighted_y_train, num_training_samples, \
        windows_per_training_sample, save_model=False, savefile=model_savefile)
    """
    model = load_model(model_savefile, custom_objects = \
        {'true_positives': true_positives, 'false_positives': false_positives, \
         'true_negatives': true_negatives, 'false_negatives': false_negatives, \
         'true_positive_rate': true_positive_rate, \
         'false_positive_rate': false_positive_rate, \
         'true_negative_rate': true_negative_rate, \
         'false_negative_rate': false_negative_rate})
    """
    
    evaluate_model(model, x_test, y_test, windows_per_testing_sample)
    generate_roc_curve(model, x_test, y_test, windows_per_testing_sample, \
        data_scenario, model_scenario)
def main():
    step_length = 15
    interval_length = 60

    model_scenario = 11
    data_scenario = 11  # scenario 9's data has good results for several models

    # pcap_file = sys.argv[1]
    # Dictionary of malicious IP addresses with start timestamp as its value
    botnet_nodes = scenario_info.get_botnet_nodes(data_scenario)
    pcap_duration = scenario_info.get_pcap_duration(data_scenario)  # * 0.1

    savefile_x = 'lstm_inputs/12_characteristics/x_scenario_' + str(
        data_scenario) + '_lstm.txt'  # sys.argv[1]
    savefile_y = 'lstm_inputs/12_characteristics/y_scenario_' + str(
        data_scenario) + '_lstm.txt'  # sys.argv[2]
    model_savefile = 'stateless_lstm_som_training_model_scenario_' + str(
        model_scenario) + '.h5'
    '''
	x, y = prep_time_series_input.generate_input_arrays(pcap_file, \
		botnet_nodes, pcap_duration, step_length = step_length, \
		interval_length = interval_length, do_save=True, \
		savefile_x=savefile_x, savefile_y=savefile_y, verbose = True)
	'''
    x, y = prep_time_series_input.load_input_arrays(filename_x=savefile_x, \
     filename_y=savefile_y)
    x, y, num_samples, windows_per_sample = prep_time_series_input.time_window_data(x, y, 5, 2, \
     interval_length, step_length, data_scenario)
    '''
	x_train, y_train, x_test, y_test = separate_into_sets(x, y, \
		training_proportion = 0.7)
	'''
    training_indices = []  # list of indices of training samples
    x_som = np.loadtxt(sys.argv[3])
    # Eventually replace the below block with an import statement to clean it up
    # Train a 5x5 SOM with 100 iterations
    print "Training the SOM"
    som = SOM(5, 5, VECTOR_SIZE, 100)
    som.train(x_som, verbose=True)
    mapped = som.map_vects(x)
    # print "mapped", str(mapped)
    # m x n matrix with each cell containing lists of indices of input vectors
    # mapped to it
    grid_map = som.get_grid_mapping(mapped)
    num_grid = [[len(grid_map[i][j]) for j in range(len(grid_map[i]))] \
        for i in range(len(grid_map))]
    print num_grid
    with open('grid_map.txt', 'w') as f:
        f.write(str(grid_map))
    '''
	
	with open('grid_map.txt', 'r') as f:
		grid_map = eval(f.readline())

	'''
    num_samples_per_cluster = 3  # number of samples to be chosen from each SOM cluster
    for i in range(len(y)):  # Add all malicious nodes to the training set
        if y[i] != 0:
            print "Added a positive sample"
            training_indices.append(i)
    print "Positive sample indices: ", str(training_indices)

    largest_cluster = (0, 0)
    max_size = 0
    for i in range(len(grid_map)):  # Get largest cluster's indices
        for j in range(len(grid_map[i])):
            if len(grid_map[i][j]) > max_size:
                max_size = len(grid_map[i][j])
                largest_cluster = (i, j)
    # Add a certain number of nodes from each SOM cluster to the training set
    for i in range(len(grid_map)):
        for j in range(len(grid_map[i])):
            if i == largest_cluster[0] and j == largest_cluster[1]:
                training_indices += random.sample(grid_map[i][j], \
                 min(len(grid_map[i][j]), num_samples_per_cluster + 17))
            else:
                training_indices += random.sample(grid_map[i][j], \
                 min(len(grid_map[i][j]), num_samples_per_cluster - 2))

    training_indices = list(set(training_indices))  # remove duplicates
    training_indices.sort()
    balanced_x = np.array([])
    balanced_y = np.array([])
    _, num_time_steps, feature_size = x.shape
    for element in training_indices:
        balanced_x = np.append(balanced_x, x[element])
        balanced_y = np.append(balanced_y, y[element])
    balanced_x = balanced_x.reshape(len(training_indices), num_time_steps,
                                    feature_size)

    # REMOVED THE LINE BELOW
    # balanced_x, balanced_y = prep_time_series_input.balance_data(x, y, ratio = 10)
    # Note that the test set contains all the data so obviously it includes the
    # training data...since the training data is so limited, it likely will have
    # little effect on the outcome though
    x_train, y_train, _, _ = prep_time_series_input. \
     separate_into_sets(balanced_x, balanced_y, training_proportion = 1)
    _, _, x_test, y_test = prep_time_series_input. \
     separate_into_sets(x, y, training_proportion=0)
    '''
	x_train, y_train, x_test, y_test \
		= prep_time_series_input.separate_into_sets(balanced_x, balanced_y, positive_proportion = 0.5)
	'''

    positive_training_windows = 0  # number of training time windows that are malicious
    positive_testing_windows = 0  # number of testing time windows that are malicious
    for i in range(len(y_train)):
        if y_train[i] != 0:
            positive_training_windows += 1
    for i in range(len(y_test)):
        if y_test[i] != 0:
            positive_testing_windows += 1
    print "Malicious training windows: ", str(positive_training_windows)
    print "Malicious testing windows: ", str(positive_testing_windows)

    print "Original x, y shapes: ", x.shape, y.shape
    print "Number of training samples: ", str(num_training_samples)
    print "Number of windows per training sample: ", str(
        windows_per_training_sample)
    print "Number of testing samples: ", str(num_testing_samples)
    print "Number of windows per testing sample: ", str(
        windows_per_testing_sample)
    print "x_train, y_train shapes: ", x_train.shape, y_train.shape
    print "x_test, y_test shapes: ", x_test.shape, y_test.shape

    weighted_y_train = np.copy(y_train)
    weighted_y_train[weighted_y_train == 1] = len(
        training_indices) / 2  # hard-coded by experimentation
    weighted_y_test = np.copy(y_test)
    weighted_y_test[weighted_y_test == 1] = len(
        training_indices) / 2  # hard-coded by experimentation
    # TEMPORARY: I AM APPLYING MY WEIGHTS HERE INSTEAD OF IN A CUSTOM LOSS FUNCTION
    # (WHICH IS PROBABLY MORE CORRECT); CHANGE THIS LATER
    model = create_model(x_train, weighted_y_train, pcap_duration, step_length, \
      save_model=True, savefile=model_savefile)
    """
	model = load_model(model_savefile, custom_objects = \
		{'true_positives': true_positives, 'false_positives': false_positives, \
		 'true_negatives': true_negatives, 'false_negatives': false_negatives, \
		 'true_positive_rate': true_positive_rate, \
		 'false_positive_rate': false_positive_rate, \
		 'true_negative_rate': true_negative_rate, \
		 'false_negative_rate': false_negative_rate})
	"""
    evaluate_model(model, x_test, y_test, pcap_duration, step_length)
    generate_roc_curve(model, x_test, y_test, data_scenario, model_scenario)
Example #6
def main():
    step_length = 60
    interval_length = 120

    data_scenario = 11
    #pcap_file = sys.argv[1]
    # Dictionary of malicious IP addresses with start timestamp as its value
    botnet_nodes = scenario_info.get_botnet_nodes(data_scenario)
    pcap_duration = scenario_info.get_pcap_duration(data_scenario)  # * 0.1
    '''
	savefile_x = 'x_scenario_' + str(data_scenario) + '_one_graph.txt'
	savefile_y = 'y_scenario_' + str(data_scenario) + '_one_graph.txt'
	
	x, y = prep_time_series_input.generate_input_arrays(pcap_file, \
		botnet_nodes, pcap_duration, step_length = step_length, \
		interval_length = interval_length, do_save=True, \
		savefile_x=savefile_x, savefile_y=savefile_y, verbose = True)
	
	x_total, y_total = prep_time_series_input. \
		load_input_arrays(filename_x=savefile_x, filename_y=savefile_y)
	'''
    filename_x = sys.argv[1]
    filename_y = sys.argv[2]
    mat_filename = 'output_mat_file.mat'

    x_total, y = prep_time_series_input.load_input_arrays(filename_x \
     = filename_x, filename_y = filename_y)
    num_samples = len(x_total)
    """
	The x and y are now in the shape [[[feature size] * time_stamps] * samples]
	but we need it in the shape [[[feature size] * samples] * time_stamps] to
	represent all samples over time intervals - hence we transpose the matrix.
	The y is in the proper shape and can be re-used for every time interval in
	x_total
	"""
    x_total = np.transpose(x_total, axes=[1, 0, 2])
    num_time_intervals = len(x_total)

    # Object array following the layout of the .mat file's cell array
    # (only 3 time steps are stored, matching the early break below)
    arr = np.ndarray(shape=(3, 3), dtype=object)
    # nodes_array = np.array(range(len(y)))
    # the above gives a flat array; the line below reshapes it into a column vector
    nodes_array = np.array(range(len(y)))[np.newaxis].transpose()
    nodes_array += 1  # MATLAB INDICES START AT 1
    nodes_array = np.ndarray.astype(nodes_array, dtype=np.uint64)
    time_step = 0
    for x in x_total:  # Iterate over time stamps
        # Train a 5x5 SOM with 1 iteration
        print "Time interval " + str(time_step + 1) + ' / ' \
         + str(num_time_intervals)
        length, height = 5, 5
        som = SOM(length, height, VECTOR_SIZE, 1)
        som.train(x, verbose=True)

        # Get output grid
        #image_grid = som.get_centroids()
        #print image_grid

        # Map colours to their closest neurons
        mapped = som.map_vects(x)
        #print 'mapped: ', str(mapped)

        cluster_array = np.array([])
        for item in mapped:
            cluster_array = np.append(cluster_array,
                                      item[0] * length + item[1])
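        # (e.g. a vector mapped to grid cell (row 2, col 3) of the 5x5 SOM gets
        # cluster number 2 * 5 + 3 = 13 here, before the +1 MATLAB offset below)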
        cluster_array = cluster_array[np.newaxis].transpose()
        cluster_array += 1  # MATLAB INDICES START AT 1
        cluster_array = np.ndarray.astype(cluster_array, dtype=np.uint64)

        # Dummy variable - eventually I can remove this and modify the MATLAB code
        arr[time_step][0] = np.array([0])
        arr[time_step][1] = nodes_array
        arr[time_step][2] = cluster_array
        time_step += 1
        # m x n matrix with each cell containing lists of indices of input vectors
        # mapped to it
        grid_map = som.get_grid_mapping(mapped)
        # print 'grid_map: ', str(grid_map)
        # m x n matrix with each cell containing the number of input vectors
        # mapped to it
        num_grid = [[len(grid_map[i][j]) for j in range(len(grid_map[i]))] \
            for i in range(len(grid_map))]
        num_botnet_grid = [[count_botnet_nodes(grid_map[i][j], y) for j in \
         range(len(grid_map[i]))] for i in range(len(grid_map))]
        print num_grid
        print num_botnet_grid
        if time_step == 3:
            break

    output_dict = {'data': arr}
    print arr
    print output_dict
    print "Saving mat file"
    scipy.io.savemat(mat_filename, output_dict)
    # For some reason the mat file is created and saved just fine but the below
    # portion crashes...for now I will just use this generated .mat file and
    # make do, but eventually this needs to be fixed
    print "Running MATLAB visualization"
    a = NetgramCommunityEvolutionVisualization.initialize()
    a.run_script(mat_filename)
    # Prevent the program from exiting as soon as the figure is created
    raw_input("\nClick enter to exit the program...")
    sys.exit(0)
Example #7
def main():
    step_length = 60
    interval_length = 120

    model_scenario = 11
    data_scenario = 11

    # pcap_file = sys.argv[1]
    # Dictionary of malicious IP addresses with start timestamp as its value
    botnet_nodes = scenario_info.get_botnet_nodes(data_scenario)
    pcap_duration = scenario_info.get_pcap_duration(data_scenario)  # * 0.1

    savefile_x = 'neural_net_inputs/Scenario_' + str(data_scenario) + '_model/' + \
     'x_scenario_' + str(data_scenario) + '.txt'
    savefile_y = 'neural_net_inputs/Scenario_' + str(data_scenario) + '_model/' + \
     'y_scenario_' + str(data_scenario) + '.txt'
    '''
	x, y = prep_input.generate_input_arrays(pcap_file, botnet_nodes, pcap_duration, \
		step_length = step_length, interval_length = interval_length, \
		do_save=True, savefile_x=savefile_x, savefile_y=savefile_y, verbose = True)
	'''
    '''
	'''
    x, y = prep_input.load_input_arrays(filename_x=savefile_x,
                                        filename_y=savefile_y)
    balanced_savefile_x, balanced_savefile_y = \
     prep_input.balance_data(savefile_x, savefile_y, ratio=10)

    balanced_x, balanced_y = prep_input.load_input_arrays(filename_x=balanced_savefile_x, \
     filename_y=balanced_savefile_y)
    # Note that the test set contains all the data so obviously it includes the
    # training data...since the training data is so limited, it likely will have
    # little effect on the outcome though
    clf = RandomForestClassifier(
        n_estimators=100,  # number of trees
        criterion="gini",  # Gini impurity ("gini") or information gain ("entropy")
        max_features="sqrt",  # integer, percentage, log2, or None (= n_features)
        n_jobs=-1,  # use all cores available (parallelize the task)
        bootstrap=True,
        class_weight="balanced_subsample",  # {0: 1, 1: 10} would weight malicious nodes by 10
        # Default options:
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        oob_score=True,
        random_state=None,
        verbose=0,
        warm_start=False,
        max_leaf_nodes=None,
        min_impurity_split=0)
    # I NEED TO FIGURE OUT HOW TO BOOTSTRAP WITH ALL THE SAMPLES (NOT REMOVE MAJORITY CLASS MEMBERS
    # AND INSTEAD MAKE SURE THAT EACH BOOTSTRAP SAMPLE CONTAINS POSITIVE SAMPLES)
    # WHAT IS THE BOOTSTRAP SIZE ANYWAY??
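    # (For reference, and as far as I know: scikit-learn's bootstrap samples draw
    # n_samples rows with replacement by default, and class_weight="balanced_subsample"
    # recomputes the class weights per bootstrap sample, which partly addresses the
    # concern above.)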
    print "Fitting the data..."
    clf.fit(x, y)
    joblib.dump(clf, 'scenario_' + str(model_scenario) + '_random_forest.pkl')
    # clf = joblib.load('scenario_' + str(model_scenario) + '_random_forest.pkl')

    print "Evaluating the model..."
    scores = cross_val_score(clf, x, y)
    print "Scores: ", str(scores)
    print "Mean score: ", str(scores.mean())
    # print "Estimators: ", str(clf.estimators_)
    print "Classes: ", str(clf.classes_)
    print "Number of classes: ", str(clf.n_classes_)
    print "Number of features: ", str(clf.n_features_)
    print "Number of outputs: ", str(clf.n_outputs_)
    print "Feature importances: ", str(clf.feature_importances_)
    print "Oob score: ", str(clf.oob_score_)
    print "Oob decision function: ", str(clf.oob_decision_function_)

    pred = np.array([])  # contains predicted y values
    # If decision function gives >= 0.5 probability that it is 1, predicted value is a 1
    last_printed_percent = 0
    for i in range(len(clf.oob_decision_function_)):
        if float(i) / len(
                clf.oob_decision_function_) * 100 > last_printed_percent + 1:
            print str(float(i) / len(clf.oob_decision_function_) * 100) + "%"
            last_printed_percent = float(i) / len(
                clf.oob_decision_function_) * 100
        if clf.oob_decision_function_[i][1] >= 0.5:
            pred = np.append(pred, 1)
        else:
            pred = np.append(pred, 0)
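    # A shorter equivalent of the loop above (sketch): oob_decision_function_ has one
    # column of probabilities per class, so the positive-class column can be
    # thresholded in one step.
    '''
    pred = (clf.oob_decision_function_[:, 1] >= 0.5).astype(int)
    '''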

    true_positives, false_positives, true_negatives, false_negatives = 0, 0, 0, 0
    for i in range(len(y)):
        if y[i] == 0:
            if pred[i] == 0:
                true_negatives += 1
            else:
                false_positives += 1
        else:  # if y[i] == 1
            if pred[i] == 1:
                true_positives += 1
            else:
                false_negatives += 1

    true_positive_rate = float(true_positives) / (true_positives +
                                                  false_negatives)
    false_positive_rate = float(false_positives) / (true_negatives +
                                                    false_positives)
    true_negative_rate = float(true_negatives) / (true_negatives +
                                                  false_positives)
    false_negative_rate = float(false_negatives) / (true_positives +
                                                    false_negatives)
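    # The same counts could be obtained via scikit-learn (sketch, assuming binary 0/1 labels):
    '''
    from sklearn.metrics import confusion_matrix
    true_negatives, false_positives, false_negatives, true_positives = \
        confusion_matrix(y, pred).ravel()
    '''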
    print "True positives: ", str(true_positives)
    print "True positive rate: ", str(true_positive_rate)
    print "True negatives: ", str(true_negatives)
    print "True negative rate: ", str(true_negative_rate)
    print "False positives: ", str(false_positives)
    print "False positive rate: ", str(false_positive_rate)
    print "False negatives: ", str(false_negatives)
    print "False negative rate: ", str(false_negative_rate)
def main():
    step_length = 15
    interval_length = 60

    model_scenario = 11
    data_scenario = 11  # scenario 9's data has good results for several models

    pcap_file = sys.argv[1]
    # Dictionary of malicious IP addresses with start timestamp as its value
    botnet_nodes = scenario_info.get_botnet_nodes(data_scenario)
    pcap_duration = scenario_info.get_pcap_duration(data_scenario)  # * 0.1

    savefile_x = 'x_scenario_' + str(data_scenario) + '_lstm.txt'
    savefile_y = 'y_scenario_' + str(data_scenario) + '_lstm.txt'
    model_savefile = 'stateless_lstm_model_scenario_' + str(
        model_scenario) + '.h5'

    x, y = prep_time_series_input.generate_input_arrays(pcap_file, \
        botnet_nodes, pcap_duration, step_length = step_length, \
        interval_length = interval_length, do_save=True, \
        savefile_x=savefile_x, savefile_y=savefile_y, verbose = True)
    '''
    '''
    x, y = prep_time_series_input.load_input_arrays(filename_x=savefile_x, \
        filename_y=savefile_y)
    x, y, _, _ = prep_time_series_input.time_window_data(x, y, 5, 2, \
        interval_length, step_length, data_scenario)
    '''
    x_train, y_train, x_test, y_test = separate_into_sets(x, y, \
        training_proportion = 0.7)
    '''
    balanced_x, balanced_y = prep_time_series_input.balance_data(x,
                                                                 y,
                                                                 ratio=10)
    # Note that the test set contains all the data so obviously it includes the
    # training data...since the training data is so limited, it likely will have
    # little effect on the outcome though
    '''
    _, _, x_test, y_test = separate_into_sets(x, y, training_proportion = 0)
    x_train, y_train, _, _ = \
        separate_into_sets(balanced_x, balanced_y, training_proportion = 0.7)
    '''
    x_train, y_train, x_test, y_test \
        = prep_time_series_input.separate_into_sets(balanced_x, balanced_y, positive_proportion = 0.5)
    print x.shape, y.shape
    print x_test.shape, y_test.shape
    print x_train.shape, y_train.shape

    weighted_y_train = np.copy(y_train)
    weighted_y_train[weighted_y_train == 1] = 6
    weighted_y_test = np.copy(y_test)
    weighted_y_test[weighted_y_test == 1] = 6
    # TEMPORARY: I AM APPLYING MY WEIGHTS HERE INSTEAD OF IN A CUSTOM LOSS FUNCTION
    # (WHICH IS PROBABLY MORE CORRECT); CHANGE THIS LATER
    """
    model = create_model(x_train, weighted_y_train, pcap_duration, step_length, \
        save_model=True, savefile=model_savefile)
    
    """
    model = load_model(model_savefile, custom_objects = \
        {'true_positives': true_positives, 'false_positives': false_positives, \
         'true_negatives': true_negatives, 'false_negatives': false_negatives, \
         'true_positive_rate': true_positive_rate, \
         'false_positive_rate': false_positive_rate, \
         'true_negative_rate': true_negative_rate, \
         'false_negative_rate': false_negative_rate})
    #evaluate_model(model, x_test, y_test, pcap_duration, step_length)
    generate_roc_curve(model, x_test, y_test, data_scenario, model_scenario)
def main():
	step_length = 60
	interval_length = 120
	
	data_scenario = 11
	#pcap_file = sys.argv[1]
	# Dictionary of malicious IP addresses with start timestamp as its value
	botnet_nodes = scenario_info.get_botnet_nodes(data_scenario)
	pcap_duration = scenario_info.get_pcap_duration(data_scenario) # * 0.1

	'''
	savefile_x = 'x_scenario_' + str(data_scenario) + '_one_graph.txt'
	savefile_y = 'y_scenario_' + str(data_scenario) + '_one_graph.txt'
	
	x, y = prep_time_series_input.generate_input_arrays(pcap_file, \
		botnet_nodes, pcap_duration, step_length = step_length, \
		interval_length = interval_length, do_save=True, \
		savefile_x=savefile_x, savefile_y=savefile_y, verbose = True)
	
	x_total, y_total = prep_time_series_input. \
		load_input_arrays(filename_x=savefile_x, filename_y=savefile_y)
	'''
	filename_x = sys.argv[1]
	filename_y = sys.argv[2]
	mat_filename = 'output_mat_file.mat'

	x_total, y = prep_time_series_input.load_input_arrays(filename_x \
		= filename_x, filename_y = filename_y)
	num_samples = len(x_total)

	"""
	The x and y are now in the shape [[[feature size] * time_stamps] * samples]
	but we need it in the shape [[[feature size] * samples] * time_stamps] to
	represent all samples over time intervals - hence we transpose the matrix.
	The y is in the proper shape and can be re-used for every time interval in
	x_total
	"""
	x_total = np.transpose(x_total, axes=[1,0,2])
	num_time_intervals = len(x_total)

	# Object array following the layout of the .mat file's cell array
	# (only 3 time steps are stored, matching the early break below)
	arr = np.ndarray(shape=(3, 3), dtype=object)
	# nodes_array = np.array(range(len(y)))
	# the above gives a flat array; the line below reshapes it into a column vector
	nodes_array = np.array(range(len(y)))[np.newaxis].transpose()
	nodes_array += 1 # MATLAB INDICES START AT 1
	nodes_array = np.ndarray.astype(nodes_array, dtype=np.uint64)
	time_step = 0
	length, height = 5, 5 # SOM dimensions
	# To keep the visualization clean, the clusters are relabelled by size:
	# the lowest cluster label corresponds to the largest cluster, the next
	# label to the next largest, and so on.
	for x in x_total: # Iterate over time stamps
		alive_nodes = np.array([]) # contains indices of nodes present in graph (non-zero arrays)
		alive_x = np.array([]) # contains alive nodes in x
		vector_size = len(x[0])
		zero_array = np.array([0] * vector_size)
		for i in range(len(x)):
			if not np.array_equal(x[i], zero_array):
				alive_nodes = np.append(alive_nodes, i)
				alive_x = np.append(alive_x, x[i])
		# np.append flattens the array, so reshape it back to (number of samples, vector size)
		alive_x = np.reshape(alive_x, (len(alive_x)/vector_size, vector_size))
		# Train a 5x5 SOM with 1 iteration
		print "Time interval " + str(time_step + 1) + ' / ' \
			+ str(num_time_intervals)
		som = SOM(length, height, VECTOR_SIZE, 1)
		som.train(alive_x, verbose = True)
		 
		# Get output grid
		#image_grid = som.get_centroids()
		#print image_grid
		 
		# Map vectors to their closest neurons
		mapped = som.map_vects(x)
		#print 'mapped: ', str(mapped)

		cluster_array = np.array([])
		j = 0
		k = 0
		for i in range(len(x)):
			if i == alive_nodes[j]:
				cluster_array = np.append(cluster_array, mapped[k][0] * length \
					+ mapped[k][1] + 1)
				j = min(j + 1, len(alive_nodes) - 1)
				k += 1
			else:
				# The lowest indexed cluster consists of dead nodes
				cluster_array = np.append(cluster_array, 0)

		
		# Relabel the clusters in descending order of size (number of nodes) so that
		# the lowest cluster label corresponds to the largest cluster
		count_dict = {}
		for i in range(length * height + 1): # number of clusters (including the dead-node cluster)
			count_dict[i] = 0
		for element in cluster_array:
			count_dict[element] += 1
		# Map each original label to its rank when sorted by descending cluster size
		labels_by_size = sorted(count_dict.keys(), \
			key=lambda label: count_dict[label], reverse=True)
		relabel_dict = dict((old, new) for new, old in enumerate(labels_by_size))
		cluster_array = np.array([relabel_dict[element] for element in cluster_array])


		cluster_array = cluster_array[np.newaxis].transpose()
		cluster_array += 1 # MATLAB INDICES START AT 1
		cluster_array = np.ndarray.astype(cluster_array, dtype=np.uint64)

		arr[time_step][0] = np.array([0])
		# ^ dummy variable - eventually I can remove this and modify the MATLAB code
		arr[time_step][1] = nodes_array
		arr[time_step][2] = cluster_array
		time_step += 1
		# m x n matrix with each cell containing lists of indices of input vectors
		# mapped to it
		grid_map = som.get_grid_mapping(mapped)
		# print 'grid_map: ', str(grid_map)
		# m x n matrix with each cell containing the number of input vectors 
		# mapped to it
		num_grid = [[len(grid_map[i][j]) for j in range(len(grid_map[i]))] \
		    for i in range(len(grid_map))]
		num_botnet_grid = [[count_botnet_nodes(grid_map[i][j], y) for j in \
			range(len(grid_map[i]))] for i in range(len(grid_map))]
		print num_grid
		print num_botnet_grid
		if time_step == 3:
			break

	output_dict = {'data': arr}
	print arr
	print output_dict
	print "Saving mat file"
	scipy.io.savemat(mat_filename, output_dict)
	# For some reason the .mat file is created and saved just fine but the MATLAB
	# visualization portion crashes; for now I will just use the generated .mat file
	# and make do, but eventually this needs to be fixed.
def main():
	step_length = 60
	interval_length = 120
	
	data_scenario = int(sys.argv[3])
	#pcap_file = sys.argv[1]
	# Dictionary of malicious IP addresses with start timestamp as its value
	botnet_nodes = scenario_info.get_botnet_nodes(data_scenario)
	pcap_duration = scenario_info.get_pcap_duration(data_scenario) # * 0.1

	'''
	savefile_x = 'x_scenario_' + str(data_scenario) + '_one_graph.txt'
	savefile_y = 'y_scenario_' + str(data_scenario) + '_one_graph.txt'
	
	x, y = prep_time_series_input.generate_input_arrays(pcap_file, \
		botnet_nodes, pcap_duration, step_length = step_length, \
		interval_length = interval_length, do_save=True, \
		savefile_x=savefile_x, savefile_y=savefile_y, verbose = True)
	
	x_total, y_total = prep_time_series_input. \
		load_input_arrays(filename_x=savefile_x, filename_y=savefile_y)
	'''
	# The below must be an LSTM input file (NOT NORMALIZED!)
	filename_x = sys.argv[1]
	filename_y = sys.argv[2]

	x_total, y = prep_time_series_input.load_input_arrays(filename_x \
		= filename_x, filename_y = filename_y)
	print "Loaded LSTM x shape: ", str(x_total.shape)
	print "Loaded LSTM y shape: ", str(y.shape)
	# Keep only the 10 characteristics for the SOM
	x_total = np.delete(x_total, np.s_[12:], 2) # remove non-graph features
	x_total = np.delete(x_total, np.s_[6, 8], 2) # remove closeness (feature 7) and Katz (feature 9)
	print "Removed the extraneous features " \
		"(keeping just the 10 graph-based features excluding closeness and Katz)"
	print "Trimmed LSTM x shape: ", str(x_total.shape)
	print "Trimmed LSTM y shape: ", str(y.shape)
	num_samples = len(x_total)

	# The below must be a regular SOM input file
	x, _ = convert_lstm_to_regular_input(x_total, y)
	print "Converted SOM x shape: ", str(x.shape)
	height, length = 5, 5 # SOM dimensions
	# Train a 5x5 SOM with 1 iteration
	
	print "Training the SOM"
	som = SOM(length, height, VECTOR_SIZE, 1)
	som.train(x, verbose = True)
	 
	# Get output grid
	clusters = np.array(som.get_centroids())
	# clusters = np.reshape(clusters, (length * height, VECTOR_SIZE))
	mapped = som.map_vects(x)
	grid_map = som.get_grid_mapping(mapped)
	num_grid = [[len(grid_map[i][j]) for j in range(len(grid_map[i]))] \
	    for i in range(len(grid_map))]
	print num_grid
	# Sort clusters by distance (in their vectors) from the largest cluster
	max_cluster_row = 0
	max_cluster_col = 0
	max_cluster_vector = None
	max_cluster_size = 0
	for row in range(len(num_grid)):
		for col in range(len(num_grid[row])):
			if num_grid[row][col] > max_cluster_size:
				max_cluster_row = row
				max_cluster_col = col
				max_cluster_vector = clusters[row][col]
				max_cluster_size = num_grid[row][col]
	print max_cluster_row
	print max_cluster_col
	print max_cluster_vector
	print max_cluster_size
	cluster_distances = []
	for row in range(len(clusters)):
		for col in range(len(clusters[row])):
			cluster_distances.append(np.linalg.norm(max_cluster_vector - clusters[row][col]))
	clusters = np.reshape(clusters, (length * height, VECTOR_SIZE))
	print clusters
	print cluster_distances
	clusters = list(clusters)
	for i in range(len(clusters)):
		clusters[i] = list(clusters[i])
	print clusters
	clusters = [clusters for _,clusters in sorted(zip(cluster_distances,clusters))]
	print clusters
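	# A shorter equivalent of the sorted(zip(...)) line above (sketch): order the
	# centroid list by its distance from the largest cluster using argsort.
	'''
	clusters = [clusters[i] for i in np.argsort(cluster_distances)]
	'''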
	with open("scenario_" + str(data_scenario) + "_som_" + str(height) + "_" \
		+ str(length) + "_clusters.txt", "w") as f:
		f.write(str(clusters))
	'''
	# Output from above (to avoid running it again)
	with open("scenario_" + str(data_scenario) + "_som_" + str(height) + "_" + str(length) + \
		"_clusters.txt", "w") as f:
		clusters = eval(f.readline())
	# print clusters
	'''

	"""
	The x and y are now in the shape [[[feature size] * time_stamps] * samples]
	but we need it in the shape [[[feature size] * samples] * time_stamps] to
	represent all samples over time intervals - hence we transpose the matrix.
	The y is in the proper shape and can be re-used for every time interval in
	x_total
	"""
	x_total = np.transpose(x_total, axes=[1,0,2])
	num_time_intervals = len(x_total)

	arr = np.ndarray(shape=(num_time_intervals + 1), dtype=object)
	nodes_array = np.array(range(len(y))) #[np.newaxis].transpose()
	arr[0] = nodes_array
	time_step = 1
	botnet_node_clusters = [] # list of the form [[clusters containing botnets] * time intervals]
	for x in x_total: # Iterate over time stamps
		i = 0
		print "Time interval " + str(time_step) + ' / ' \
			+ str(num_time_intervals)
		cluster_array = np.array([])
		current_botnet_clusters = []
		for sample in x:
			cluster_number = most_similar_group(sample, clusters)
			cluster_array = np.append(cluster_array, cluster_number)
			if y[i] != 0:
				print "Botnet node: cluster " + str(cluster_number + 1)
				current_botnet_clusters.append(cluster_number + 1)
			i += 1
		botnet_node_clusters.append(list(set(current_botnet_clusters)))
		print "Unique clusters: ", str(np.unique(cluster_array + 1))
		arr[time_step] = cluster_array
		time_step += 1

	# contains lists of tuples where each list represents time step A to B
	# and tuples are in the form (prev_cluster, next_cluster, number of this transition)
	transitions_arr = []
	for i in range(1, num_time_intervals):
		all_time_interval_transitions = []
		for j in arr[0]:
			all_time_interval_transitions.append((int(arr[i][j] + 1), int(arr[i + 1][j] + 1)))
		time_interval_transitions = list(set(all_time_interval_transitions)) # unique cluster transitions
		for k in range(len(time_interval_transitions)):
			time_interval_transitions[k] = (time_interval_transitions[k][0], \
				time_interval_transitions[k][1], \
				all_time_interval_transitions.count(time_interval_transitions[k]))
		time_interval_transitions.sort()
		transitions_arr.append(time_interval_transitions)
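	# A sketch of an equivalent way to build time_interval_transitions inside the loop
	# above using collections.Counter (counts each unique (prev, next) pair once
	# instead of calling list.count per transition):
	'''
	from collections import Counter
	counts = Counter(all_time_interval_transitions)
	time_interval_transitions = sorted([(a, b, n) for (a, b), n in counts.items()])
	'''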

	with open("botnet_clusters_" + str(height) + "_" + str(length) \
		+ ".txt", "w") as output:
		output.write(str(botnet_node_clusters))

	with open("cluster_transitions_" + str(height) + "_" + str(length) \
		+ ".txt", "w") as output:
	    output.write(str(transitions_arr))
def main():
    step_length = 15
    interval_length = 60

    model_scenario = 11
    data_scenario = 11  # scenario 9's data has good results for several models

    # pcap_file = sys.argv[1]
    # Dictionary of malicious IP addresses with start timestamp as its value
    botnet_nodes = scenario_info.get_botnet_nodes(data_scenario)
    pcap_duration = scenario_info.get_pcap_duration(data_scenario)  # * 0.1

    savefile_x = 'lstm_inputs/12_characteristics/x_scenario_' + str(
        data_scenario) + '_lstm.txt'  # sys.argv[1]
    savefile_y = 'lstm_inputs/12_characteristics/y_scenario_' + str(
        data_scenario) + '_lstm.txt'  # sys.argv[2]
    model_savefile = '17_2_weighted_cluster_stateful_lstm_som_training_model_scenario_' + str(
        model_scenario) + '.h5'
    '''
	Note that it's important that the original x and y are processed in the
	following order: balanced (maintaining a certain ratio between positive and
	negative samples), separated into training and testing sets, and then
	broken into time windows (for the stateful LSTM). Balancing the data first
	keeps the entire initial time interval for the chosen samples, so the
	training and testing sets each consist of fewer samples with their entire
	time intervals. Finally we break each set's samples' entire time interval
	into time windows (as opposed to breaking into time windows first and
	haphazardly choosing time windows from the entire time interval).
	'''

    # x and y contain the entire dataset in these NumPy arrays
    '''
	x, y = prep_time_series_input.generate_input_arrays(pcap_file, botnet_nodes, pcap_duration, \
		step_length = step_length, interval_length = interval_length, \
		do_save=True, savefile_x=savefile_x, savefile_y=savefile_y, \
		verbose = True)
	'''
    x, y = prep_time_series_input.load_input_arrays(filename_x=savefile_x,
                                                    filename_y=savefile_y)
    training_indices = []  # list of indices of training samples
    x_som = np.loadtxt(sys.argv[3])
    # Eventually replace the below block with an import statement to clean it up
    # Train a 5x5 SOM with 100 iterations
    print "Training the SOM"
    som = SOM(5, 5, VECTOR_SIZE, 100)
    som.train(x_som, verbose=True)
    mapped = som.map_vects(x)
    # print "mapped", str(mapped)
    # m x n matrix with each cell containing lists of indices of input vectors
    # mapped to it
    grid_map = som.get_grid_mapping(mapped)
    num_grid = [[len(grid_map[i][j]) for j in range(len(grid_map[i]))] \
        for i in range(len(grid_map))]
    print num_grid
    with open('grid_map.txt', 'w') as f:
        f.write(str(grid_map))
    '''
	
	with open('grid_map.txt', 'r') as f:
		grid_map = eval(f.readline())

	'''
    num_samples_per_cluster = 3  # number of samples to be chosen from each SOM cluster
    for i in range(len(y)):  # Add all malicious nodes to the training set
        if y[i] != 0:
            print "Added a positive sample"
            training_indices.append(i)
    print "Positive sample indices: ", str(training_indices)

    largest_cluster = (0, 0)
    max_size = 0
    for i in range(len(grid_map)):  # Get largest cluster's indices
        for j in range(len(grid_map[i])):
            if len(grid_map[i][j]) > max_size:
                max_size = len(grid_map[i][j])
                largest_cluster = (i, j)
    # Add a certain number of nodes from each SOM cluster to the training set
    for i in range(len(grid_map)):
        for j in range(len(grid_map[i])):
            if i == largest_cluster[0] and j == largest_cluster[1]:
                training_indices += random.sample(grid_map[i][j], \
                 min(len(grid_map[i][j]), num_samples_per_cluster + 17))
            else:
                training_indices += random.sample(grid_map[i][j], \
                 min(len(grid_map[i][j]), num_samples_per_cluster - 2))

    training_indices = list(set(training_indices))  # remove duplicates
    training_indices.sort()
    balanced_x = np.array([])
    balanced_y = np.array([])
    _, num_time_steps, feature_size = x.shape
    for element in training_indices:
        balanced_x = np.append(balanced_x, x[element])
        balanced_y = np.append(balanced_y, y[element])
    balanced_x = balanced_x.reshape(len(training_indices), num_time_steps,
                                    feature_size)

    # REMOVED THE BELOW
    # Balanced x and y arrays maintain a certain ratio; each sample contains
    # its entire time interval
    # balanced_x, balanced_y = prep_time_series_input.balance_data(x, y, ratio = 10)

    # Pre-(x/y)-(train/test) separate the balanced x and y arrays based on a
    # certain ratio -> each sample still contains its entire time interval
    '''
	# Note that the test set contains all the data so obviously it includes the
	# training data
	'''
    pre_x_train, pre_y_train, _, _ = \
     prep_time_series_input.separate_into_sets(balanced_x, balanced_y, training_proportion = 1)
    _, _, pre_x_test, pre_y_test = \
     prep_time_series_input.separate_into_sets(x, y, training_proportion = 0)
    '''
	pre_x_train, pre_y_train, pre_x_test, pre_y_test = prep_time_series_input. \
		separate_into_sets(balanced_x, balanced_y, positive_proportion = 0.5)
	'''

    # (x,y)_(train/test) contains the chosen samples (balanced and broken into
    # time windows)
    x_train, y_train, num_training_samples, windows_per_training_sample \
     = prep_time_series_input.time_window_data(pre_x_train, pre_y_train, 5, 2, \
     interval_length, step_length, data_scenario) #######
    x_test, y_test, num_testing_samples, windows_per_testing_sample \
     = prep_time_series_input.time_window_data(pre_x_test, pre_y_test, 5, 2, \
     interval_length, step_length, data_scenario) #######

    positive_training_windows = 0  # number of training time windows that are malicious
    positive_testing_windows = 0  # number of testing time windows that are malicious
    for i in range(len(y_train)):
        if y_train[i] != 0:
            positive_training_windows += 1
    for i in range(len(y_test)):
        if y_test[i] != 0:
            positive_testing_windows += 1
    print "Malicious training windows: ", str(positive_training_windows)
    print "Malicious testing windows: ", str(positive_testing_windows)

    print "Original x, y shapes: ", x.shape, y.shape
    print "Number of training samples: ", str(num_training_samples)
    print "Number of windows per training sample: ", str(
        windows_per_training_sample)
    print "Number of testing samples: ", str(num_testing_samples)
    print "Number of windows per testing sample: ", str(
        windows_per_testing_sample)
    print "x_train, y_train shapes: ", x_train.shape, y_train.shape
    print "x_test, y_test shapes: ", x_test.shape, y_test.shape

    weighted_y_train = np.copy(y_train)
    weighted_y_train[weighted_y_train == 1] = len(
        training_indices) / 2  # hard-coded by experimentation
    weighted_y_test = np.copy(y_test)
    weighted_y_test[weighted_y_test == 1] = len(
        training_indices) / 2  # hard-coded by experimentation
    # TEMPORARY: I AM APPLYING MY WEIGHTS HERE INSTEAD OF IN A CUSTOM LOSS FUNCTION
    # (WHICH IS PROBABLY MORE CORRECT); CHANGE THIS LATER
    model = create_model(x_train, weighted_y_train, num_training_samples, \
     windows_per_training_sample, save_model=True, savefile=model_savefile)
    """
	model = load_model(model_savefile, custom_objects = \
		{'true_positives': true_positives, 'false_positives': false_positives, \
		 'true_negatives': true_negatives, 'false_negatives': false_negatives, \
		 'true_positive_rate': true_positive_rate, \
		 'false_positive_rate': false_positive_rate, \
		 'true_negative_rate': true_negative_rate, \
		 'false_negative_rate': false_negative_rate})
	"""

    evaluate_model(model, x_test, y_test, windows_per_testing_sample)
    generate_roc_curve(model, x_test, y_test, windows_per_testing_sample, \
     data_scenario, model_scenario)