Example #1
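# Variant 1 of main(): stateful LSTM trained and evaluated on a single scenario.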
def main():
    step_length = 15
    interval_length = 60
    
    model_scenario = 11
    data_scenario = 11 # scenario 9's data has good results for several models

    pcap_file = sys.argv[1]
    # Dictionary of malicious IP addresses with their start timestamps as values
    botnet_nodes = scenario_info.get_botnet_nodes(data_scenario)
    pcap_duration = scenario_info.get_pcap_duration(data_scenario) # * 0.1

    savefile_x = 'x_scenario_' + str(data_scenario) + '_lstm.txt'
    savefile_y = 'y_scenario_' + str(data_scenario) + '_lstm.txt'
    model_savefile = 'stateful_lstm_model_scenario_' + str(model_scenario) + '.h5'

    '''
    Note that it's important that the original x and y are processed in the
    following order: balanced (maintaining a certain ratio between positive
    and negative samples), separated into training and testing sets, and then
    broken into time windows (for the stateful LSTM). Balancing the data
    first keeps the entire initial time interval for the chosen samples, so
    the training and testing sets each contain fewer samples but with their
    entire time intervals. Finally we break each set's samples' entire time
    interval into time windows (as opposed to breaking into time windows
    first and haphazardly choosing windows from the entire time interval).
    '''
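    # Minimal sketch (kept in quotes so it is not executed) of the windowing
    # step described above, assuming the 5 and 2 passed to time_window_data
    # below are the window size and stride; the real implementation lives in
    # prep_time_series_input.
    '''
    def sketch_window_sample(sample, window_size=5, stride=2):
        # sample: (num_time_steps, num_features) array for a single node
        return np.array([sample[start:start + window_size]
                         for start in range(0, len(sample) - window_size + 1, stride)])
    '''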

    # x and y contain the entire dataset in these NumPy arrays
    x, y = prep_time_series_input.generate_input_arrays(pcap_file, botnet_nodes, pcap_duration, \
        step_length = step_length, interval_length = interval_length, \
        do_save=True, savefile_x=savefile_x, savefile_y=savefile_y, \
        verbose = True)
    x, y = prep_time_series_input.load_input_arrays(filename_x=savefile_x, filename_y=savefile_y)

    # Balanced x and y arrays maintain a certain ratio; each sample contains
    # its entire time interval
    balanced_x, balanced_y = prep_time_series_input.balance_data(x, y, ratio = 10)
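    # Minimal sketch (kept in quotes, not executed) of what balance_data is
    # assumed to do: keep every positive sample and downsample negatives to at
    # most ratio times as many; the real implementation lives in
    # prep_time_series_input.
    '''
    def sketch_balance(x, y, ratio=10):
        pos = np.where(y != 0)[0]  # indices of positive samples
        neg = np.where(y == 0)[0]  # indices of negative samples
        keep_neg = np.random.choice(neg, min(len(neg), ratio * len(pos)), \
            replace=False)
        keep = np.sort(np.concatenate([pos, keep_neg]))
        return x[keep], y[keep]
    '''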

    # Pre-(x/y)-(train/test) separate the balanced x and y arrays based on a
    # certain ratio -> each sample still contains its entire time interval
    '''
    # Note that the test set contains all the data so obviously it includes the
    # training data
    _, _, pre_x_test, pre_y_test = \
        separate_into_sets(x, y, training_proportion = 0)
    '''
    pre_x_train, pre_y_train, _, _ = prep_time_series_input. \
        separate_into_sets(balanced_x, balanced_y, training_proportion = 1)
    _, _, pre_x_test, pre_y_test = prep_time_series_input. \
        separate_into_sets(x, y, training_proportion = 0)
    '''
    pre_x_train, pre_y_train, pre_x_test, pre_y_test = prep_time_series_input. \
        separate_into_sets(balanced_x, balanced_y, positive_proportion = 0.5)
    '''

    # (x,y)_(train/test) contains the chosen samples (balanced and broken into
    # time windows)
    x_train, y_train, num_training_samples, windows_per_training_sample \
        = prep_time_series_input.time_window_data(pre_x_train, pre_y_train, 5, 2, \
        interval_length, step_length, data_scenario)
    x_test, y_test, num_testing_samples, windows_per_testing_sample \
        = prep_time_series_input.time_window_data(pre_x_test, pre_y_test, 5, 2, \
        interval_length, step_length, data_scenario)

    print "Original x, y shapes: ", x.shape, y.shape
    print "Number of training samples: ", str(num_training_samples)
    print "Number of windows per training sample: ", str(windows_per_training_sample)
    print "Number of testing samples: ", str(num_testing_samples)
    print "Number of windows per testing sample: ", str(windows_per_testing_sample)
    print "x_train, y_train shapes: ", x_train.shape, y_train.shape
    print "x_test, y_test shapes: ", x_test.shape, y_test.shape

    weighted_y_train = np.copy(y_train)
    weighted_y_train[weighted_y_train == 1] = 6
    weighted_y_test = np.copy(y_test)
    weighted_y_test[weighted_y_test == 1] = 6
    # TEMPORARY: I AM APPLYING MY WEIGHTS HERE INSTEAD OF IN A CUSTOM LOSS FUNCTION
    # (WHICH IS PROBABLY MORE CORRECT); CHANGE THIS LATER
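    # Sketch (kept in quotes, not executed) of the custom weighted loss
    # alluded to above; it assumes Keras 2's backend imported as K and binary
    # 0/1 labels, neither of which is shown in this file.
    '''
    def weighted_binary_crossentropy(pos_weight):
        # scale the per-element crossentropy by pos_weight wherever y_true == 1
        def loss(y_true, y_pred):
            return K.mean(K.binary_crossentropy(y_true, y_pred) *
                          (1. + (pos_weight - 1.) * y_true), axis=-1)
        return loss
    '''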
    model = create_model(x_train, weighted_y_train, num_training_samples, \
        windows_per_training_sample, save_model=False, savefile=model_savefile)
    """
    model = load_model(model_savefile, custom_objects = \
        {'true_positives': true_positives, 'false_positives': false_positives, \
         'true_negatives': true_negatives, 'false_negatives': false_negatives, \
         'true_positive_rate': true_positive_rate, \
         'false_positive_rate': false_positive_rate, \
         'true_negative_rate': true_negative_rate, \
         'false_negative_rate': false_negative_rate})
    """
    
    evaluate_model(model, x_test, y_test, windows_per_testing_sample)
    generate_roc_curve(model, x_test, y_test, windows_per_testing_sample, \
        data_scenario, model_scenario)
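
# Variant 2 of main(): stateless LSTM trained on the combined data of
# scenarios 6, 7, 10, 11 and 12.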
def main():
    step_length = 150
    interval_length = 300

    # model_scenario = int(sys.argv[3])
    # data_scenario = int(sys.argv[4])

    # pcap_file = sys.argv[1]
    # Dictionary of malicious IP addresses with their start timestamps as values
    # botnet_nodes = scenario_info.get_botnet_nodes(data_scenario)
    pcap_duration = 150000  # arbitrarily chosen value (used to calculate batch size)
    # scenario_info.get_pcap_duration(data_scenario) # * 0.1

    #savefile_x = sys.argv[1] # 'x_scenario_' + str(data_scenario) + '_lstm.txt'
    #savefile_y = sys.argv[2] # 'y_scenario_' + str(data_scenario) + '_lstm.txt'
    savefile_x_list = [
        'x_scenario_6_lstm_normalized.txt', 'x_scenario_7_lstm_normalized.txt',
        'x_scenario_10_lstm_normalized.txt',
        'x_scenario_11_lstm_normalized.txt',
        'x_scenario_12_lstm_normalized.txt'
    ]
    savefile_y_list = [
        'y_scenario_6_lstm.txt', 'y_scenario_7_lstm.txt',
        'y_scenario_10_lstm.txt', 'y_scenario_11_lstm.txt',
        'y_scenario_12_lstm.txt'
    ]
    scenario_lst = [6, 7, 10, 11, 12]
    model_savefile = 'stateless_lstm_combined_model_scenario_6_7_10_11_12' \
        + '_interval_' + str(interval_length) + '_step_' + str(step_length) + '.h5'
    '''
    x, y = prep_time_series_input.generate_input_arrays(pcap_file, \
        botnet_nodes, pcap_duration, step_length = step_length, \
        interval_length = interval_length, do_save=True, \
        savefile_x=savefile_x, savefile_y=savefile_y, verbose = True)
    '''
    windowed_x = np.array([]).reshape(0, 5, 28)  # empty (num windows, window size, num features)
    windowed_y = np.array([])
    for i in range(len(savefile_x_list)):
        data_scenario = scenario_lst[i]
        savefile_x = savefile_x_list[i]
        savefile_y = savefile_y_list[i]
        x, y = prep_time_series_input.load_input_arrays(filename_x=savefile_x, \
            filename_y=savefile_y)
        balanced_x, balanced_y = \
            prep_time_series_input.balance_data(x, y, ratio = 10)
        del x
        del y
        current_windowed_x, current_windowed_y, num_samples, windows_per_sample = \
            prep_time_series_input.time_window_data(balanced_x, balanced_y, \
            5, 2, interval_length, step_length, data_scenario)
        print current_windowed_x.shape
        print current_windowed_y.shape
        windowed_x = np.append(windowed_x, current_windowed_x, axis=0)
        windowed_y = np.append(windowed_y, current_windowed_y, axis=0)
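    # Performance note: np.append copies the accumulated array on every
    # iteration. A sketch (kept in quotes, not executed) of the cheaper
    # pattern, using hypothetical per_scenario_x / per_scenario_y lists that
    # would be filled inside the loop above:
    '''
    windowed_x = np.concatenate(per_scenario_x, axis=0)
    windowed_y = np.concatenate(per_scenario_y, axis=0)
    '''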
    # Note (for the commented-out split below) that the test set contains all
    # the data, so it includes the training data...since the training data is
    # so limited, it likely will have little effect on the outcome though
    '''
    _, _, x_test, y_test = separate_into_sets(x, y, training_proportion = 0)
    x_train, y_train, _, _ = \
        separate_into_sets(balanced_x, balanced_y, training_proportion = 0.7)
    '''

    x_train, y_train, x_test, y_test = prep_time_series_input. \
        separate_into_sets(windowed_x, windowed_y, positive_proportion = 0.7)
    '''
    x_train, y_train, x_test, y_test = prep_time_series_input. \
        separate_into_sets(windowed_x, windowed_y, training_proportion = 0)
    '''

    print "Number of samples (training and testing): ", str(num_samples)
    print "Number of windows per sample (training and testing): ", str(
        windows_per_sample)
    print "x_train, y_train shapes: ", x_train.shape, y_train.shape
    print "x_test, y_test shapes: ", x_test.shape, y_test.shape

    weighted_y_train = np.copy(y_train)
    weighted_y_train[weighted_y_train == 1] = 6
    weighted_y_test = np.copy(y_test)
    weighted_y_test[weighted_y_test == 1] = 6
    # TEMPORARY: I AM APPLYING MY WEIGHTS HERE INSTEAD OF IN A CUSTOM LOSS FUNCTION
    # (WHICH IS PROBABLY MORE CORRECT); CHANGE THIS LATER
    '''
    ADD K-FOLD CROSS VALIDATION SOON...NOT NECESSARY RIGHT NOW FOR TESTING PURPOSES
    BUT DEFINITELY SHOULD DO IT FOR THE FINAL EVALUATION OF THE MODEL.
    https://machinelearningmastery.com/evaluate-performance-deep-learning-models-keras/
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html
    '''
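    # Sketch (kept in quotes, not executed) of the k-fold evaluation mentioned
    # above; assumes scikit-learn is installed and that create_model and
    # evaluate_model accept per-fold slices exactly like the full arrays.
    '''
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=5)
    for train_idx, test_idx in skf.split(windowed_x, windowed_y):
        fold_model = create_model(windowed_x[train_idx], windowed_y[train_idx], \
            pcap_duration, step_length, save_model=False, savefile=model_savefile)
        evaluate_model(fold_model, windowed_x[test_idx], windowed_y[test_idx], \
            pcap_duration, step_length)
    '''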

    model = create_model(x_train, weighted_y_train, pcap_duration, step_length, \
        save_model=True, savefile=model_savefile)
    """
    model = load_model(model_savefile, custom_objects = \
        {'true_positives': true_positives, 'false_positives': false_positives, \
         'true_negatives': true_negatives, 'false_negatives': false_negatives, \
         'true_positive_rate': true_positive_rate, \
         'false_positive_rate': false_positive_rate, \
         'true_negative_rate': true_negative_rate, \
         'false_negative_rate': false_negative_rate})
    """
    evaluate_model(model, x_test, y_test, pcap_duration, step_length)
    generate_roc_curve(model, x_test, y_test, \
        savefile = 'stateless_lstm_combined_model_6_7_10_11_12' \
        + '_combined_data_6_7_10_11_12' + '_interval_' \
        + str(interval_length) + '_step_' + str(step_length) + '.png')
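
# Variant 3 of main(): stateless LSTM on a single scenario; loads a previously
# trained model and only generates the ROC curve.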
def main():
    step_length = 15
    interval_length = 60

    model_scenario = 11
    data_scenario = 11  # scenario 9's data has good results for several models

    pcap_file = sys.argv[1]
    # Dictionary of malicious IP addresses with their start timestamps as values
    botnet_nodes = scenario_info.get_botnet_nodes(data_scenario)
    pcap_duration = scenario_info.get_pcap_duration(data_scenario)  # * 0.1

    savefile_x = 'x_scenario_' + str(data_scenario) + '_lstm.txt'
    savefile_y = 'y_scenario_' + str(data_scenario) + '_lstm.txt'
    model_savefile = 'stateless_lstm_model_scenario_' + str(
        model_scenario) + '.h5'

    x, y = prep_time_series_input.generate_input_arrays(pcap_file, \
        botnet_nodes, pcap_duration, step_length = step_length, \
        interval_length = interval_length, do_save=True, \
        savefile_x=savefile_x, savefile_y=savefile_y, verbose = True)
    x, y = prep_time_series_input.load_input_arrays(filename_x=savefile_x, \
        filename_y=savefile_y)
    x, y, _, _ = prep_time_series_input.time_window_data(x, y, 5, 2, \
        interval_length, step_length, data_scenario)
    '''
    x_train, y_train, x_test, y_test = separate_into_sets(x, y, \
        training_proportion = 0.7)
    '''
    balanced_x, balanced_y = \
        prep_time_series_input.balance_data(x, y, ratio = 10)
    # Note (for the commented-out split below) that the test set contains all
    # the data, so it includes the training data...since the training data is
    # so limited, it likely will have little effect on the outcome though
    '''
    _, _, x_test, y_test = separate_into_sets(x, y, training_proportion = 0)
    x_train, y_train, _, _ = \
        separate_into_sets(balanced_x, balanced_y, training_proportion = 0.7)
    '''
    x_train, y_train, x_test, y_test \
        = prep_time_series_input.separate_into_sets(balanced_x, balanced_y, positive_proportion = 0.5)
    print x.shape, y.shape
    print x_test.shape, y_test.shape
    print x_train.shape, y_train.shape

    weighted_y_train = np.copy(y_train)
    weighted_y_train[weighted_y_train == 1] = 6
    weighted_y_test = np.copy(y_test)
    weighted_y_test[weighted_y_test == 1] = 6
    # TEMPORARY: I AM APPLYING MY WEIGHTS HERE INSTEAD OF IN A CUSTOM LOSS FUNCTION
    # (WHICH IS PROBABLY MORE CORRECT); CHANGE THIS LATER
    """
    model = create_model(x_train, weighted_y_train, pcap_duration, step_length, \
        save_model=True, savefile=model_savefile)
    
    """
    model = load_model(model_savefile, custom_objects = \
        {'true_positives': true_positives, 'false_positives': false_positives, \
         'true_negatives': true_negatives, 'false_negatives': false_negatives, \
         'true_positive_rate': true_positive_rate, \
         'false_positive_rate': false_positive_rate, \
         'true_negative_rate': true_negative_rate, \
         'false_negative_rate': false_negative_rate})
    #evaluate_model(model, x_test, y_test, pcap_duration, step_length)
    generate_roc_curve(model, x_test, y_test, data_scenario, model_scenario)
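
# Variant 4 of main(): stateless LSTM whose training samples are chosen with a
# self-organizing map (SOM) instead of ratio-based balancing.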
def main():
    step_length = 15
    interval_length = 60

    model_scenario = 11
    data_scenario = 11  # scenario 9's data has good results for several models

    # pcap_file = sys.argv[1]
    # Dictionary of malicious IP addresses with their start timestamps as values
    botnet_nodes = scenario_info.get_botnet_nodes(data_scenario)
    pcap_duration = scenario_info.get_pcap_duration(data_scenario)  # * 0.1

    savefile_x = 'lstm_inputs/12_characteristics/x_scenario_' + str(
        data_scenario) + '_lstm.txt'  # sys.argv[1]
    savefile_y = 'lstm_inputs/12_characteristics/y_scenario_' + str(
        data_scenario) + '_lstm.txt'  # sys.argv[2]
    model_savefile = 'stateless_lstm_som_training_model_scenario_' + str(
        model_scenario) + '.h5'
    '''
    x, y = prep_time_series_input.generate_input_arrays(pcap_file, \
        botnet_nodes, pcap_duration, step_length = step_length, \
        interval_length = interval_length, do_save=True, \
        savefile_x=savefile_x, savefile_y=savefile_y, verbose = True)
    '''
    x, y = prep_time_series_input.load_input_arrays(filename_x=savefile_x, \
        filename_y=savefile_y)
    x, y, num_samples, windows_per_sample = prep_time_series_input. \
        time_window_data(x, y, 5, 2, interval_length, step_length, data_scenario)
    '''
    x_train, y_train, x_test, y_test = separate_into_sets(x, y, \
        training_proportion = 0.7)
    '''
    training_indices = []  # list of indices of training samples
    x_som = np.loadtxt(sys.argv[3])
    # Eventually replace the below block with an import statement to clean it up
    # Train a 5x5 SOM with 100 iterations
    print "Training the SOM"
    som = SOM(5, 5, VECTOR_SIZE, 100)
    som.train(x_som, verbose=True)
    mapped = som.map_vects(x)
    # print "mapped", str(mapped)
    # m x n matrix with each cell containing lists of indices of input vectors
    # mapped to it
    grid_map = som.get_grid_mapping(mapped)
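    # e.g. grid_map[1][3] == [0, 42, 97] (hypothetical values) would mean that
    # input vectors 0, 42 and 97 were mapped to the SOM cell at row 1, column 3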
    num_grid = [[len(grid_map[i][j]) for j in range(len(grid_map[i]))] \
        for i in range(len(grid_map))]
    print num_grid
    with open('grid_map.txt', 'w') as f:
        f.write(str(grid_map))
    '''
    with open('grid_map.txt', 'r') as f:
        grid_map = eval(f.readline())
    '''
    num_samples_per_cluster = 3  # number of samples to be chosen from each SOM cluster
    for i in range(len(y)):  # Add all malicious nodes to the training set
        if y[i] != 0:
            print "Added a positive sample"
            training_indices.append(i)
    print "Positive sample indices: ", str(training_indices)

    largest_cluster = (0, 0)
    max_size = 0
    for i in range(len(grid_map)):  # Get largest cluster's indices
        for j in range(len(grid_map[i])):
            if len(grid_map[i][j]) > max_size:
                max_size = len(grid_map[i][j])
                largest_cluster = (i, j)
    # Add a certain number of nodes from each SOM cluster to the training set
    # (the largest cluster contributes extra samples)
    for i in range(len(grid_map)):
        for j in range(len(grid_map[i])):
            if i == largest_cluster[0] and j == largest_cluster[1]:
                training_indices += random.sample(grid_map[i][j], \
                    min(len(grid_map[i][j]), num_samples_per_cluster + 17))
            else:
                training_indices += random.sample(grid_map[i][j], \
                    min(len(grid_map[i][j]), num_samples_per_cluster - 2))

    training_indices = list(set(training_indices))  # remove duplicates
    training_indices.sort()
    balanced_x = np.array([])
    balanced_y = np.array([])
    _, num_time_steps, feature_size = x.shape
    for element in training_indices:
        balanced_x = np.append(balanced_x, x[element])
        balanced_y = np.append(balanced_y, y[element])
    balanced_x = balanced_x.reshape(len(training_indices), num_time_steps,
                                    feature_size)

    # REMOVED THE LINE BELOW
    # balanced_x, balanced_y = prep_time_series_input.balance_data(x, y, ratio = 10)
    # Note that the test set contains all the data so obviously it includes the
    # training data...since the training data is so limited, it likely will have
    # little effect on the outcome though
    x_train, y_train, _, _ = prep_time_series_input. \
        separate_into_sets(balanced_x, balanced_y, training_proportion = 1)
    _, _, x_test, y_test = prep_time_series_input. \
        separate_into_sets(x, y, training_proportion = 0)
    '''
	x_train, y_train, x_test, y_test \
		= prep_time_series_input.separate_into_sets(balanced_x, balanced_y, positive_proportion = 0.5)
	'''

    positive_training_windows = 0  # number of training time windows that are malicious
    positive_testing_windows = 0  # number of testing time windows that are malicious
    for i in range(len(y_train)):
        if y_train[i] != 0:
            positive_training_windows += 1
    for i in range(len(y_test)):
        if y_test[i] != 0:
            positive_testing_windows += 1
    print "Malicious training windows: ", str(positive_training_windows)
    print "Malicious testing windows: ", str(positive_testing_windows)

    print "Original x, y shapes: ", x.shape, y.shape
    print "Number of training samples: ", str(num_training_samples)
    print "Number of windows per training sample: ", str(
        windows_per_training_sample)
    print "Number of testing samples: ", str(num_testing_samples)
    print "Number of windows per testing sample: ", str(
        windows_per_testing_sample)
    print "x_train, y_train shapes: ", x_train.shape, y_train.shape
    print "x_test, y_test shapes: ", x_test.shape, y_test.shape

    weighted_y_train = np.copy(y_train)
    weighted_y_train[weighted_y_train == 1] = len(
        training_indices) / 2  # hard-coded by experimentation
    weighted_y_test = np.copy(y_test)
    weighted_y_test[weighted_y_test == 1] = len(
        training_indices) / 2  # hard-coded by experimentation
    # TEMPORARY: I AM APPLYING MY WEIGHTS HERE INSTEAD OF IN A CUSTOM LOSS FUNCTION
    # (WHICH IS PROBABLY MORE CORRECT); CHANGE THIS LATER
    model = create_model(x_train, weighted_y_train, pcap_duration, step_length, \
      save_model=True, savefile=model_savefile)
    """
	model = load_model(model_savefile, custom_objects = \
		{'true_positives': true_positives, 'false_positives': false_positives, \
		 'true_negatives': true_negatives, 'false_negatives': false_negatives, \
		 'true_positive_rate': true_positive_rate, \
		 'false_positive_rate': false_positive_rate, \
		 'true_negative_rate': true_negative_rate, \
		 'false_negative_rate': false_negative_rate})
	"""
    evaluate_model(model, x_test, y_test, pcap_duration, step_length)
    generate_roc_curve(model, x_test, y_test, data_scenario, model_scenario)
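
# Variant 5 of main(): stateful LSTM with SOM-chosen training samples;
# positive labels are weighted by half the size of the training set.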
def main():
    step_length = 15
    interval_length = 60

    model_scenario = 11
    data_scenario = 11  # scenario 9's data has good results for several models

    # pcap_file = sys.argv[1]
    # Dictionary of malicious IP addresses with their start timestamps as values
    botnet_nodes = scenario_info.get_botnet_nodes(data_scenario)
    pcap_duration = scenario_info.get_pcap_duration(data_scenario)  # * 0.1

    savefile_x = 'lstm_inputs/12_characteristics/x_scenario_' + str(
        data_scenario) + '_lstm.txt'  # sys.argv[1]
    savefile_y = 'lstm_inputs/12_characteristics/y_scenario_' + str(
        data_scenario) + '_lstm.txt'  # sys.argv[2]
    model_savefile = '17_2_weighted_cluster_stateful_lstm_som_training_model_scenario_' + str(
        model_scenario) + '.h5'
    '''
    Note that it's important that the original x and y are processed in the
    following order: balanced (maintaining a certain ratio between positive
    and negative samples), separated into training and testing sets, and then
    broken into time windows (for the stateful LSTM). Balancing the data
    first keeps the entire initial time interval for the chosen samples, so
    the training and testing sets each contain fewer samples but with their
    entire time intervals. Finally we break each set's samples' entire time
    interval into time windows (as opposed to breaking into time windows
    first and haphazardly choosing windows from the entire time interval).
    '''

    # x and y contain the entire dataset in these NumPy arrays
    '''
    x, y = prep_time_series_input.generate_input_arrays(pcap_file, botnet_nodes, pcap_duration, \
        step_length = step_length, interval_length = interval_length, \
        do_save=True, savefile_x=savefile_x, savefile_y=savefile_y, \
        verbose = True)
    '''
    x, y = prep_time_series_input.load_input_arrays(filename_x=savefile_x,
                                                    filename_y=savefile_y)
    training_indices = []  # list of indices of training samples
    x_som = np.loadtxt(sys.argv[3])
    # Eventually replace the below block with an import statement to clean it up
    # Train a 5x5 SOM with 100 iterations
    print "Training the SOM"
    som = SOM(5, 5, VECTOR_SIZE, 100)
    som.train(x_som, verbose=True)
    mapped = som.map_vects(x)
    # print "mapped", str(mapped)
    # m x n matrix with each cell containing lists of indices of input vectors
    # mapped to it
    grid_map = som.get_grid_mapping(mapped)
    num_grid = [[len(grid_map[i][j]) for j in range(len(grid_map[i]))] \
        for i in range(len(grid_map))]
    print num_grid
    with open('grid_map.txt', 'w') as f:
        f.write(str(grid_map))
    '''
    with open('grid_map.txt', 'r') as f:
        grid_map = eval(f.readline())
    '''
    num_samples_per_cluster = 3  # number of samples to be chosen from each SOM cluster
    for i in range(len(y)):  # Add all malicious nodes to the training set
        if y[i] != 0:
            print "Added a positive sample"
            training_indices.append(i)
    print "Positive sample indices: ", str(training_indices)

    largest_cluster = (0, 0)
    max_size = 0
    for i in range(len(grid_map)):  # Get largest cluster's indices
        for j in range(len(grid_map[i])):
            if len(grid_map[i][j]) > max_size:
                max_size = len(grid_map[i][j])
                largest_cluster = (i, j)
    # Add a certain number of nodes from each SOM cluster to the training set
    # (the largest cluster contributes extra samples)
    for i in range(len(grid_map)):
        for j in range(len(grid_map[i])):
            if i == largest_cluster[0] and j == largest_cluster[1]:
                training_indices += random.sample(grid_map[i][j], \
                    min(len(grid_map[i][j]), num_samples_per_cluster + 17))
            else:
                training_indices += random.sample(grid_map[i][j], \
                    min(len(grid_map[i][j]), num_samples_per_cluster - 2))

    training_indices = list(set(training_indices))  # remove duplicates
    training_indices.sort()
    balanced_x = np.array([])
    balanced_y = np.array([])
    _, num_time_steps, feature_size = x.shape
    for element in training_indices:
        balanced_x = np.append(balanced_x, x[element])
        balanced_y = np.append(balanced_y, y[element])
    balanced_x = balanced_x.reshape(len(training_indices), num_time_steps,
                                    feature_size)

    # REMOVED: ratio-based balancing is superseded by the SOM-based sample
    # selection above
    # balanced_x, balanced_y = prep_time_series_input.balance_data(x, y, ratio = 10)

    # Pre-(x/y)-(train/test) separate the balanced x and y arrays based on a
    # certain ratio -> each sample still contains its entire time interval
    # Note that the test set contains all the data, so it includes the
    # training data
    pre_x_train, pre_y_train, _, _ = prep_time_series_input. \
        separate_into_sets(balanced_x, balanced_y, training_proportion = 1)
    _, _, pre_x_test, pre_y_test = prep_time_series_input. \
        separate_into_sets(x, y, training_proportion = 0)
    '''
	pre_x_train, pre_y_train, pre_x_test, pre_y_test = prep_time_series_input. \
		separate_into_sets(balanced_x, balanced_y, positive_proportion = 0.5)
	'''

    # (x,y)_(train/test) contains the chosen samples (balanced and broken into
    # time windows)
    x_train, y_train, num_training_samples, windows_per_training_sample \
        = prep_time_series_input.time_window_data(pre_x_train, pre_y_train, 5, 2, \
        interval_length, step_length, data_scenario)
    x_test, y_test, num_testing_samples, windows_per_testing_sample \
        = prep_time_series_input.time_window_data(pre_x_test, pre_y_test, 5, 2, \
        interval_length, step_length, data_scenario)

    positive_training_windows = 0  # number of training time windows that are malicious
    positive_testing_windows = 0  # number of testing time windows that are malicious
    for i in range(len(y_train)):
        if y_train[i] != 0:
            positive_training_windows += 1
    for i in range(len(y_test)):
        if y_test[i] != 0:
            positive_testing_windows += 1
    print "Malicious training windows: ", str(positive_training_windows)
    print "Malicious testing windows: ", str(positive_testing_windows)

    print "Original x, y shapes: ", x.shape, y.shape
    print "Number of training samples: ", str(num_training_samples)
    print "Number of windows per training sample: ", str(
        windows_per_training_sample)
    print "Number of testing samples: ", str(num_testing_samples)
    print "Number of windows per testing sample: ", str(
        windows_per_testing_sample)
    print "x_train, y_train shapes: ", x_train.shape, y_train.shape
    print "x_test, y_test shapes: ", x_test.shape, y_test.shape

    weighted_y_train = np.copy(y_train)
    weighted_y_train[weighted_y_train == 1] = len(
        training_indices) / 2  # hard-coded by experimentation
    weighted_y_test = np.copy(y_test)
    weighted_y_test[weighted_y_test == 1] = len(
        training_indices) / 2  # hard-coded by experimentation
    # TEMPORARY: I AM APPLYING MY WEIGHTS HERE INSTEAD OF IN A CUSTOM LOSS FUNCTION
    # (WHICH IS PROBABLY MORE CORRECT); CHANGE THIS LATER
    model = create_model(x_train, weighted_y_train, num_training_samples, \
        windows_per_training_sample, save_model=True, savefile=model_savefile)
    """
	model = load_model(model_savefile, custom_objects = \
		{'true_positives': true_positives, 'false_positives': false_positives, \
		 'true_negatives': true_negatives, 'false_negatives': false_negatives, \
		 'true_positive_rate': true_positive_rate, \
		 'false_positive_rate': false_positive_rate, \
		 'true_negative_rate': true_negative_rate, \
		 'false_negative_rate': false_negative_rate})
	"""

    evaluate_model(model, x_test, y_test, windows_per_testing_sample)
    generate_roc_curve(model, x_test, y_test, windows_per_testing_sample, \
        data_scenario, model_scenario)