# NOTE: the functions below assume the usual imports of their enclosing
# modules (numpy as np, time, logging, tensorflow as tf, keras.backend as K,
# sklearn.linear_model.SGDClassifier, sklearn.model_selection.GridSearchCV,
# keras.wrappers.scikit_learn.KerasClassifier) plus the arhuaco classes
# DataHelpers, W2V, Plot, SVM, CnnW2v, svmW2v, RnnGen and Configuration.

def analyze_syscalls():
    # Parameters
    min_word_count = 5   # Minimum word count
    context = 10         # Context window size
    paths = ["/var/lib/arhuaco/data/normal_clean.csv",
             "/var/lib/arhuaco/data/malicious_clean.csv"]
    labels = [0, 1]
    number_samples = 2
    number_samples_w2v = 10000
    num_epochs = 10
    embedding_dim = 10
    # Model Hyperparameters
    max_length = 7
    n_gram = 6

    # Create objects
    data_helpers = DataHelpers(paths, labels, max_length, n_gram,
                               number_samples)
    w2v = W2V()
    sentence_stream = data_helpers.sentence_stream(number_samples_w2v)
    params = w2v.train_word2vec_stream(sentence_stream,
                                       num_features=embedding_dim,
                                       min_word_count=min_word_count,
                                       context=context,
                                       num_epochs=num_epochs)

    # Create the model
    classes = np.array([0, 1])
    clf = SGDClassifier(loss='hinge', penalty="l2", eta0=0.01,
                        learning_rate='constant')

    # Data load
    data_generator = data_helpers.get_data_chunk(params[1])

    # Training the model
    train_accuracy = []
    test_accuracy = []
    x_train, y_train = next(data_generator)
    clf.partial_fit(x_train, y_train, classes=classes)
    for batch in range(num_epochs):
        for sample in range(1000):
            x_train_aux, y_train_aux = next(data_generator)
            # x_test_aux, y_test_aux = next(data_generator)
            x_train = np.concatenate([x_train, x_train_aux], 0)
            y_train = np.concatenate([y_train, y_train_aux], 0)
            # x_test = np.concatenate([x_test, x_test_aux], 0)
            # y_test = np.concatenate([y_test, y_test_aux], 0)
        clf.partial_fit(x_train, y_train)
        print("Batch: %d" % batch)
        print('Train Accuracy: %.3f' % clf.score(x_train, y_train))
        # print('Test Accuracy: %.3f' % clf.score(x_test, y_test))
        train_accuracy.append(clf.score(x_train, y_train))
        # test_accuracy.append(clf.score(x_test, y_test))

    # Plot the results
    plot = Plot()
    plot.history2plot([train_accuracy], "Model accuracy",
                      "Epoch", "Accuracy")
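# Not part of the original function: a minimal sketch of the held-out
# evaluation that the commented-out test code above hints at. It reuses the
# fitted classifier and the same chunk generator; the function name and the
# number of chunks are illustrative only.
def evaluate_on_held_out(clf, data_generator, n_chunks=10):
    scores = []
    for _ in range(n_chunks):
        x_test, y_test = next(data_generator)
        scores.append(clf.score(x_test, y_test))
    # Mean accuracy over the freshly drawn chunks
    return sum(scores) / len(scores)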
def analyze_network():
    # Parameters
    seed = 5
    model_variation = 'svm-non-static'

    # Model Hyperparameters
    # Max length of one sentence
    max_length = 5
    # Number of lines included in the series
    n_gram = 1
    # Total length of the classification object
    sequence_length = max_length * n_gram
    # Size of the vector representing each word
    embedding_dim = 10
    dropout_prob = (0.25, 0.5)
    # Number of neurons in the hidden layer
    hidden_dims = 10

    # Training parameters
    number_samples = 5
    number_samples_w2v = 10000
    num_epochs = 100
    val_split = 0.1

    # Word2Vec parameters, see train_word2vec
    # Minimum word count
    min_word_count = 1
    # Number of words that make sense in the context
    context = 4

    action = "predict"
    weights_file = "/var/lib/arhuaco/data/models/weights_file"
    sentence = "process ptrace request=11(PTRACE_ATTACH)"

    # Training dataset
    paths = ["/var/lib/arhuaco/data/dns_normal.log",
             "/var/lib/arhuaco/data/dns_malicious.log"]
    # Training labels
    labels = [0, 1]

    # NOTE: filter_sizes and num_filters are passed to svmW2v below but were
    # not defined in the original function; placeholder values are assumed
    # here (taken from the CNN variant of this pipeline).
    filter_sizes = (2, 3)
    num_filters = 3

    # Create objects
    data_helpers = DataHelpers(paths, labels, max_length, n_gram,
                               number_samples)
    w2v = W2V()
    sentence_stream = data_helpers.sentence_stream(number_samples_w2v)
    params = w2v.train_word2vec_stream(sentence_stream,
                                       num_features=embedding_dim,
                                       min_word_count=min_word_count,
                                       context=context,
                                       num_epochs=num_epochs)
    svm_w2v = svmW2v(seed, model_variation, sequence_length, embedding_dim,
                     filter_sizes, num_filters, dropout_prob, hidden_dims,
                     number_samples, num_epochs, val_split, min_word_count,
                     context, action, weights_file, sentence, paths, labels,
                     data_helpers)
    svm_w2v.get_data(params[0], params[1], params[2])
    svm_w2v.build_model()
    svm_w2v.train_model()
def analyze_stream(self, data_source, max_length, n_gram, output_queue):
    self.output_queue = output_queue
    data_helpers = DataHelpers(data_source, None, max_length, n_gram,
                               samples_per_batch=None, seed=20)
    data_generator = data_helpers.get_data_stream(self.vocabulary,
                                                  data_source)
    while True:
        data = next(data_generator)
        result = self.model.predict(data)
        self.output_queue.put(result)
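# Usage sketch (not part of the original code): wiring analyze_stream to a
# consumer thread. It assumes `analyzer` is an already built instance of the
# enclosing class (with self.model and self.vocabulary set) and that `source`
# is something DataHelpers can read; the names used here are illustrative.
import queue
import threading

def run_stream_analysis(analyzer, source, max_length=7, n_gram=6):
    results = queue.Queue()
    worker = threading.Thread(target=analyzer.analyze_stream,
                              args=(source, max_length, n_gram, results),
                              daemon=True)
    worker.start()
    while True:
        # Block until the analysis loop pushes the next prediction.
        prediction = results.get()
        print("prediction: %s" % prediction)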
def analyze_syscalls():
    # Parameters
    max_length = 10
    n_gram = 5
    sequence_length = max_length * n_gram
    maxlen = 40
    step = 3
    num_epochs = 100
    num_chars = 100
    number_samples = 1000
    seed = 5
    paths = ["/var/lib/arhuaco/data/normal_clean.csv",
             "/var/lib/arhuaco/data/malicious_clean.csv"]
    labels = [0, 1]

    # Create objects
    data_helpers = DataHelpers(paths, labels, max_length, n_gram,
                               number_samples, seed)
    rnn_gen = RnnGen(data_helpers, maxlen, step, num_epochs, num_chars)
    rnn_gen.get_data()
    rnn_gen.build_model()
    rnn_gen.train_model()
def do_analyze(self, type=None):
    # Create objects
    if K.backend() == "tensorflow":
        with tf.Session(graph=tf.Graph()) as sess:
            model, configuration = self.build_model(type=type)
            # First create the sources of data
            data_helpers = DataHelpers(
                data_source=configuration['paths'],
                label=None,
                tokens_per_line=configuration['tokens_per_line'],
                number_lines=configuration['number_lines'],
                samples_per_batch=configuration['samples_per_batch'],
                seed=configuration['seed'])
            # Get the data sources
            online_generator = data_helpers.get_data_stream(
                configuration['vocabulary'], configuration['input_queue'])
            logging.info("Convolutional intrusion detection: %s" % type)
            result = model.analyze_stream(online_generator, self.output_queue)
    else:
        # Same pipeline, but without an explicit TensorFlow session
        model, configuration = self.build_model(type=type)
        # First create the sources of data
        data_helpers = DataHelpers(
            data_source=configuration['paths'],
            label=None,
            tokens_per_line=configuration['tokens_per_line'],
            number_lines=configuration['number_lines'],
            samples_per_batch=configuration['samples_per_batch'],
            seed=configuration['seed'])
        # Get the data sources
        online_generator = data_helpers.get_data_stream(
            configuration['vocabulary'], configuration['input_queue'])
        logging.info("Convolutional intrusion detection: %s" % type)
        result = model.analyze_stream(online_generator, self.output_queue)
def analyze_network():
    # Parameters
    # Max words per line
    max_number_words = 5
    # Number of lines per object
    n_gram = 1
    # Number of characters for data input
    max_chars_len = 40
    # Number of steps ahead to learn
    step = 1
    # Number of iterations over the data
    num_epochs = 2
    # Size of the vocabulary
    num_chars = 100
    # Consecutive samples per data extraction
    number_samples = 100000
    # Number of extractions
    samples_per_epoch = 10
    # Number of characters to generate
    number_generated = 20000
    # Seed for random operations
    seed = 4
    paths = ["/var/lib/arhuaco/data/dns_normal.log",
             "/var/lib/arhuaco/data/dns_malicious.log"]
    labels = [-1, 1]
    weights_file = "/var/lib/arhuaco/data/models/gen_model.weights"
    model_file = "/var/lib/arhuaco/data/models/gen_model.json"
    generated_file = "/var/lib/arhuaco/data/models/generated"

    # Create objects
    data_helpers = DataHelpers(paths, labels, max_number_words, n_gram,
                               number_samples, seed)
    rnn_gen = RnnGen(data_helpers=data_helpers,
                     maxlen=max_chars_len,
                     step=step,
                     num_epochs=num_epochs,
                     num_chars=num_chars,
                     samples_per_epoch=samples_per_epoch,
                     weights_file=weights_file,
                     model_file=model_file,
                     generated_file=generated_file,
                     number_generated=number_generated)
    rnn_gen.get_data()
    rnn_gen.build_model()
    rnn_gen.train_model()
def analyze_network():
    clf = SGDClassifier(loss='hinge', penalty="l2")
    # Word2Vec parameters, see train_word2vec
    min_word_count = 1   # Minimum word count
    context = 4          # Context window size
    paths = ["/var/lib/arhuaco/data/dns_normal.log",
             "/var/lib/arhuaco/data/dns_malicious.log"]
    labels = [0, 1]
    number_samples = 10
    num_epochs = 100
    embedding_dim = 5
    # Model Hyperparameters
    max_length = 5
    n_gram = 1

    # Create objects
    data_helpers = DataHelpers(paths, labels, max_length, n_gram,
                               number_samples)
    w2v = W2V()

    # Load data
    print("Loading data...")
    x, y, vocabulary, vocabulary_inv = data_helpers.load_data()
    embedding_weights, vocabulary = w2v.train_word2vec(x, embedding_dim,
                                                       min_word_count,
                                                       context)
    classes = np.array([0, 1])

    # Data load
    data_generator = data_helpers.get_data_chunk(vocabulary)

    # Training the model
    train_accuracy = []
    train_loss = []
    test_accuracy = []
    test_loss = []
    for batch in range(num_epochs):
        x_train, y_train = next(data_generator)
        x_test, y_test = next(data_generator)
        clf.partial_fit(x_train, y_train, classes=classes)
        print("Batch: %d" % batch)
        print('Train Accuracy: %.3f' % clf.score(x_train, y_train))
        print('Test Accuracy: %.3f' % clf.score(x_test, y_test))
        train_accuracy.append(clf.score(x_train, y_train))
        test_accuracy.append(clf.score(x_test, y_test))

    # Plot the results
    plot = Plot()
    plot.history2plot([train_accuracy, test_accuracy], "Model accuracy",
                      "Epoch", "Accuracy")
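# Not part of the original function: a minimal sketch of persisting the
# incrementally trained classifier with joblib so it can be reused without
# retraining. The output path is illustrative only.
from joblib import dump, load

def save_sgd_classifier(clf, path="/var/lib/arhuaco/data/models/net_sgd.joblib"):
    dump(clf, path)    # serialize the fitted SGDClassifier to disk
    return load(path)  # reload immediately to verify the artifact is usable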
def optimize_cnn_hyperparameters(self, tokens_per_line, number_lines,
                                 type="syscall"):
    # Load configuration
    config_object = Configuration()
    if type == "syscall":
        config_object.load_configuration("host")
        configuration = config_object.default_config
        # Training parameters
        configuration['verbose'] = 2
        configuration['samples_per_batch'] = 5
        configuration['samples_per_epoch'] = 10000
        configuration['num_epochs'] = 100
        configuration['val_split'] = 0.1
        configuration['weights_file_conv'] = "/var/lib/arhuaco/data/models/sys_W_conv-%s"\
                                             % time.strftime("%Y%m%d-%H%M%S")
        configuration['model_file_conv'] = "/var/lib/arhuaco/data/models/sys_model_conv-%s.json"\
                                           % time.strftime("%Y%m%d-%H%M%S")
        # Training dataset
        configuration['paths'] = ["/var/lib/arhuaco/data/normal_clean_filtered.csv",
                                  "/var/lib/arhuaco/data/malicious_clean_filtered.csv"]
    elif type == "network":
        # Load configuration
        config_object = Configuration()
        config_object.load_configuration("network")
        configuration = config_object.default_config
        # Training parameters
        configuration['verbose'] = 2
        configuration['samples_per_batch'] = 5
        configuration['samples_per_epoch'] = 1000
        configuration['num_epochs'] = 100
        configuration['val_split'] = 0.1
        configuration['weights_file_conv'] = "/var/lib/arhuaco/data/models/net_W_conv-%s"\
                                             % time.strftime("%Y%m%d-%H%M%S")
        configuration['model_file_conv'] = "/var/lib/arhuaco/data/models/net_model_conv-%s.json"\
                                           % time.strftime("%Y%m%d-%H%M%S")
        # Training dataset (assigned to the configuration so DataHelpers
        # below picks it up; the original stored it in an unused local)
        configuration['paths'] = ["/var/lib/arhuaco/data/dns_normal.log",
                                  "/var/lib/arhuaco/data/dns_malicious.log"]

    # Create objects
    # First create the sources of data
    data_helpers = DataHelpers(
        data_source=configuration['paths'],
        label=None,
        tokens_per_line=tokens_per_line,
        number_lines=number_lines,
        samples_per_batch=configuration['samples_per_batch'],
        seed=configuration['seed'])
    # Apply the word2vec processing
    w2v = W2V()
    sentence_stream = data_helpers.sentence_stream(
        configuration['samples_per_epoch'])
    params = w2v.train_word2vec_stream(
        sentence_stream,
        num_features=configuration['embedding_dim'],
        min_word_count=configuration['min_word_count'],
        context=configuration['context'],
        num_epochs=configuration['num_epochs'])
    embedding_weights = params[0]
    vocabulary = params[1]
    vocabulary_index = params[2]

    # Create the Convolutional network object
    cnn_w2v = CnnW2v(seed=configuration['seed'],
                     samples_per_batch=configuration['samples_per_batch'],
                     min_word_count=configuration['min_word_count'],
                     context=configuration['context'],
                     weights_file=configuration['weights_file_conv'],
                     model_file=configuration['model_file_conv'],
                     labels=None,
                     verbose=configuration['verbose'])
    cnn_w2v.set_w2v_params(embedding_weights=params[0],
                           vocabulary=params[1],
                           vocabulary_index=params[2])
    print("Convolutional optimization")

    # Get the data sources
    training_generator = data_helpers.get_data_chunk(
        vocabulary, configuration['labels_conv'])
    validation_generator = data_helpers.get_data_chunk(
        vocabulary, configuration['labels_conv'])
    test_generator = data_helpers.get_data_chunk(
        vocabulary, configuration['labels_conv'])

    # Create model for grid search
    model = KerasClassifier(build_fn=cnn_w2v.build_model,
                            epochs=8, batch_size=10, verbose=3)
    # Define the grid search parameters
    learn_rate = [0.001, 0.01, 0.1]
    momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]
    decay = [0.0, 1e-5, 1e-6, 1e-7]
    nesterov = [True, False]
    regularizer_param = [0.1, 0.01, 0.001]
    hidden_neurons = [5, 10, 20, 30]
    num_filters = [5, 10, 20, 30]
    filter_sizes = [(1, 2, 3, 4), (3, 4, 5), (5, 6)]
    dropout_rate = [0.0, 0.5, 0.1, 0.01]
    embedding_dim = [configuration['embedding_dim']]
    pool_size = [2, 3, 4]
    sequence_length = [tokens_per_line * number_lines]
    print("Starting grid search for %d tokens per line and %d lines"
          % (tokens_per_line, number_lines))
    param_grid = dict(learn_rate=learn_rate,
                      momentum=momentum,
                      decay=decay,
                      nesterov=nesterov,
                      regularizer_param=regularizer_param,
                      hidden_neurons=hidden_neurons,
                      num_filters=num_filters,
                      filter_sizes=filter_sizes,
                      dropout_rate=dropout_rate,
                      embedding_dim=embedding_dim,
                      pool_size=pool_size,
                      sequence_length=sequence_length)
    grid = GridSearchCV(estimator=model, param_grid=param_grid,
                        n_jobs=-1, verbose=3)
    print("Extracting data from source...")
    X, Y = next(training_generator)
    # TODO: fix this! It is too slow...
    for i in range(configuration['samples_per_epoch']):
        X_i, Y_i = next(training_generator)
        X = np.append(X, X_i, axis=0)
        Y = np.append(Y, Y_i, axis=0)
    print("Starting grid search trainings...")
    grid_result = grid.fit(X, Y, verbose=3)

    # Summarize results
    print("Best: %f using %s" % (grid_result.best_score_,
                                 grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
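# Not part of the original method: a minimal sketch of persisting the
# grid-search outcome so the best hyperparameters can later be copied into a
# Configuration. The output path is illustrative only.
import json

def save_grid_result(grid_result, path="/var/lib/arhuaco/data/models/grid_best_params.json"):
    with open(path, "w") as handle:
        # default=str guards against values that json cannot serialize directly
        json.dump({"best_score": grid_result.best_score_,
                   "best_params": grid_result.best_params_},
                  handle, indent=2, default=str)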
def analyze_syscalls():
    # Parameters
    seed = 5
    verbose = 2

    # Model Hyperparameters
    # Max length of one sentence
    max_length = 7
    # Number of lines included in the series
    n_gram = 6
    # Total length of the classification object
    sequence_length = max_length * n_gram
    # Size of the vector representing each word
    embedding_dim = 20
    dropout_prob = (0.0, 0.0)
    # Number of neurons in the hidden layer
    hidden_dims = 20

    # Training parameters
    number_samples = 5
    samples_per_epoch = 10000
    num_epochs = 100
    val_split = 0.1

    # Word2Vec parameters, see train_word2vec
    # Minimum word count
    min_word_count = 6
    # Number of words that make sense in the context
    context = 10

    weights_file_svm = "/var/lib/arhuaco/data/models/sys_W_svm-%s"\
                       % time.strftime("%Y%m%d-%H%M%S")
    model_file_svm = "/var/lib/arhuaco/data/models/sys_model_svm-%s.json"\
                     % time.strftime("%Y%m%d-%H%M%S")

    # Training dataset
    paths = ["/var/lib/arhuaco/data/normal_clean.csv",
             "/var/lib/arhuaco/data/malicious_clean.csv"]
    # Training labels
    labels_svm = [-1, 1]

    # Create objects
    data_helpers = DataHelpers(paths, None, max_length, n_gram,
                               number_samples, seed)
    w2v = W2V()
    sentence_stream = data_helpers.sentence_stream(samples_per_epoch)
    params = w2v.train_word2vec_stream(sentence_stream,
                                       num_features=embedding_dim,
                                       min_word_count=min_word_count,
                                       context=context,
                                       num_epochs=num_epochs)
    svm = SVM(seed, sequence_length, embedding_dim, dropout_prob,
              hidden_dims, number_samples, num_epochs, val_split,
              min_word_count, context, weights_file_svm, model_file_svm,
              paths, None, data_helpers, verbose)
    svm.get_data(params[0], params[1], params[2])
    svm.build_model()
    print("SVM syscall training")
    history_svm = svm.train_model(samples_per_epoch, labels_svm)
    result = svm.test_model(10000, labels_svm, max_length, n_gram)

    # Graphically plot the results
    plot = Plot()
    # Training vs validation accuracy
    plot.history2plot([history_svm.history['real_accuracy'],
                       history_svm.history['val_real_accuracy']],
                      ['Training', 'Validation'],
                      "SVM accuracy", "Epoch", "Accuracy",
                      "/var/lib/arhuaco/data/models/sys_svm_accuracy-%s.pdf"
                      % time.strftime("%Y%m%d-%H%M%S"),
                      location='lower right')
    # Training vs validation false positive rate
    plot.history2plot([history_svm.history['false_pos_rate'],
                       history_svm.history['val_false_pos_rate']],
                      ['Training', 'Validation'],
                      "SVM false positive rate", "Epoch",
                      "False positive rate",
                      "/var/lib/arhuaco/data/models/sys_svm_fpr-%s.pdf"
                      % time.strftime("%Y%m%d-%H%M%S"),
                      location='upper right')
def train(self, type="syscall"):
    # Load configuration
    config_object = Configuration()
    if type == "syscall":
        config_object.load_configuration("host")
        configuration = config_object.default_config
        # Training parameters
        configuration['verbose'] = 2
        configuration['samples_per_batch'] = 5
        configuration['samples_per_epoch'] = 100000
        configuration['num_epochs'] = 10
        configuration['val_split'] = 0.1
        configuration['weights_file_svm'] = "/var/lib/arhuaco/data/models/sys_W_svm-%s"\
                                            % time.strftime("%Y%m%d-%H%M%S")
        configuration['model_file_svm'] = "/var/lib/arhuaco/data/models/sys_model_svm-%s.json"\
                                          % time.strftime("%Y%m%d-%H%M%S")
        # Training dataset
        configuration['paths'] = ["/var/lib/arhuaco/data/normal_clean_filtered.csv",
                                  "/var/lib/arhuaco/data/malicious_clean_filtered.csv"]
        configuration['pdf_paths'] = ["/var/lib/arhuaco/data/models/sys_svm_accuracy-%s.pdf"
                                      % time.strftime("%Y%m%d-%H%M%S"),
                                      "/var/lib/arhuaco/data/models/sys_svm_fpr-%s.pdf"
                                      % time.strftime("%Y%m%d-%H%M%S")]
    elif type == "network":
        # Load configuration
        config_object = Configuration()
        config_object.load_configuration("network")
        configuration = config_object.default_config
        # Training parameters
        configuration['verbose'] = 2
        configuration['samples_per_batch'] = 5
        configuration['samples_per_epoch'] = 1000
        configuration['num_epochs'] = 10
        configuration['val_split'] = 0.1
        configuration['weights_file_svm'] = "/var/lib/arhuaco/data/models/net_W_svm-%s"\
                                            % time.strftime("%Y%m%d-%H%M%S")
        configuration['model_file_svm'] = "/var/lib/arhuaco/data/models/net_model_svm-%s.json"\
                                          % time.strftime("%Y%m%d-%H%M%S")
        # Training dataset
        configuration['paths'] = ["/var/lib/arhuaco/data/dns_normal.log",
                                  "/var/lib/arhuaco/data/dns_malicious.log"]
        # "/var/lib/arhuaco/data/dns_malicious_generated.log"]
        configuration['pdf_paths'] = ["/var/lib/arhuaco/data/models/net_svm_accuracy-%s.pdf"
                                      % time.strftime("%Y%m%d-%H%M%S"),
                                      "/var/lib/arhuaco/data/models/net_svm_fpr-%s.pdf"
                                      % time.strftime("%Y%m%d-%H%M%S")]

    # Create objects
    # First create the sources of data
    data_helper = DataHelpers(
        data_source=configuration['paths'],
        label=None,
        tokens_per_line=configuration['tokens_per_line'],
        number_lines=configuration['number_lines'],
        samples_per_batch=configuration['samples_per_batch'],
        seed=configuration['seed'])
    # Apply the word2vec processing
    w2v = W2V()
    sentence_stream = data_helper.sentence_stream(
        configuration['samples_per_epoch'])
    params = w2v.train_word2vec_stream(
        sentence_stream,
        num_features=configuration['embedding_dim'],
        min_word_count=configuration['min_word_count'],
        context=configuration['context'],
        num_epochs=configuration['num_epochs'])
    embedding_weights = params[0]
    vocabulary = params[1]
    vocabulary_index = params[2]

    # Create the SVM network object
    svm_bow = SVM(seed=configuration['seed'],
                  samples_per_batch=configuration['samples_per_batch'],
                  min_word_count=configuration['min_word_count'],
                  context=configuration['context'],
                  weights_file=configuration['weights_file_svm'],
                  model_file=configuration['model_file_svm'],
                  labels=None,
                  verbose=configuration['verbose'])
    svm_bow.set_bow_params(embedding_weights=params[0],
                           vocabulary=params[1],
                           vocabulary_index=params[2])
    # Build the model
    svm_bow.build_model(
        learn_rate=configuration['learn_rate'],
        momentum=configuration['momentum'],
        decay=configuration['decay'],
        nesterov=configuration['nesterov'],
        regularizer_param=configuration['regularizer_param'],
        dropout_rate=configuration['dropout_prob'],
        embedding_dim=configuration['embedding_dim'])

    print("SVM training")
    # Get the data sources
    training_generator = data_helper.get_data_BoW_chunk(
        vocabulary, configuration['labels_svm'])
    validation_generator = data_helper.get_data_BoW_chunk(
        vocabulary, configuration['labels_svm'])

    # Train and validate the model
    history_object = svm_bow.train_model(
        training_source=training_generator,
        validation_source=validation_generator,
        samples_per_epoch=configuration['samples_per_epoch'],
        number_epochs=configuration['num_epochs'],
        val_split=configuration['val_split'])

    # Test the model with new data
    # Create a new data source for validation with generated data
    configuration['paths'][1] = '/var/lib/arhuaco/data/dns_malicious.log'
    configuration['samples_per_epoch'] = 1000
    validation_data_helper = DataHelpers(
        data_source=configuration['paths'],
        label=None,
        tokens_per_line=configuration['tokens_per_line'],
        number_lines=configuration['number_lines'],
        samples_per_batch=configuration['samples_per_batch'],
        seed=configuration['seed'] + 3)
    test_generator = validation_data_helper.get_data_BoW_chunk(
        vocabulary, configuration['labels_svm'])
    result = svm_bow.test_model(
        test_data_source=test_generator,
        samples_to_test=configuration['samples_per_epoch'])

    # Graphically plot the results
    plot = Plot()
    # Training vs validation accuracy
    plot.history2plot([history_object.history['real_accuracy'],
                       history_object.history['val_real_accuracy']],
                      ['Training', 'Validation'],
                      "SVM accuracy", "Epoch", "Accuracy",
                      configuration['pdf_paths'][0],
                      'lower right', [0, 9], [0.8, 1.0])
    # Training vs validation false positive rate
    plot.history2plot([history_object.history['false_pos_rate'],
                       history_object.history['val_false_pos_rate']],
                      ['Training', 'Validation'],
                      "SVM false positive rate", "Epoch",
                      "False positive rate",
                      configuration['pdf_paths'][1],
                      'upper right', [0, 9], [0, 0.2])
def analyze_network():
    # Parameters
    seed = 5
    model_variation = 'CNN-non-static'

    # Model Hyperparameters
    # Max length of one sentence
    max_length = 5
    # Number of lines included in the series
    n_gram = 1
    # Total length of the classification object
    sequence_length = max_length * n_gram
    # Size of the vector representing each word
    embedding_dim = 10
    # Conv. filters applied to the text
    filter_sizes = (2, 3)
    # Total filters used
    num_filters = 3
    dropout_prob = (0.0, 0.0)
    # Number of neurons in the hidden layer
    hidden_dims = 10

    # Training parameters
    number_samples = 5
    samples_per_epoch = 1000
    num_epochs = 100
    val_split = 0.1
    verbose = 2

    # Word2Vec parameters, see train_word2vec
    # Minimum word count
    min_word_count = 1
    # Number of words that make sense in the context
    context = 4

    weights_file_conv = "/var/lib/arhuaco/data/models/net_W_conv-%s"\
                        % time.strftime("%Y%m%d-%H%M%S")
    model_file_conv = "/var/lib/arhuaco/data/models/net_model_conv-%s.json"\
                      % time.strftime("%Y%m%d-%H%M%S")

    # Training dataset
    paths = ["/var/lib/arhuaco/data/dns_normal.log",
             # "/var/lib/arhuaco/data/dns_malicious.log"]
             "/var/lib/arhuaco/data/dns_malicious_generated.log"]
    # Training labels
    labels_conv = [0, 1]

    # Create objects
    data_helpers = DataHelpers(paths, None, max_length, n_gram,
                               number_samples, seed)
    w2v = W2V()
    sentence_stream = data_helpers.sentence_stream(samples_per_epoch)
    params = w2v.train_word2vec_stream(sentence_stream,
                                       num_features=embedding_dim,
                                       min_word_count=min_word_count,
                                       context=context,
                                       num_epochs=num_epochs)
    cnn_w2v = CnnW2v(seed, model_variation, sequence_length, embedding_dim,
                     filter_sizes, num_filters, dropout_prob, hidden_dims,
                     number_samples, num_epochs, val_split, min_word_count,
                     context, weights_file_conv, model_file_conv, paths,
                     None, data_helpers, verbose)
    cnn_w2v.get_data(params[0], params[1], params[2])
    cnn_w2v.build_model()
    print("Convolutional network training")
    history_conv = cnn_w2v.train_model(samples_per_epoch, labels_conv)
    # Test on the real malicious traffic after training on generated data
    cnn_w2v.paths[1] = "/var/lib/arhuaco/data/dns_malicious.log"
    result = cnn_w2v.test_model(1000, labels_conv, max_length, n_gram)

    # Graphically plot the results
    plot = Plot()
    # Training vs validation accuracy
    plot.history2plot([history_conv.history['real_accuracy'],
                       history_conv.history['val_real_accuracy']],
                      ['Training', 'Validation'],
                      "CNN accuracy", "Epoch", "Accuracy",
                      "/var/lib/arhuaco/data/models/net_cnn_accuracy-%s.pdf"
                      % time.strftime("%Y%m%d-%H%M%S"),
                      location='lower right')
    # Training vs validation false positive rate
    plot.history2plot([history_conv.history['false_pos_rate'],
                       history_conv.history['val_false_pos_rate']],
                      ['Training', 'Validation'],
                      "CNN false positive rate", "Epoch",
                      "False positive rate",
                      "/var/lib/arhuaco/data/models/net_cnn_fpr-%s.pdf"
                      % time.strftime("%Y%m%d-%H%M%S"),
                      location='upper right')
def train(self, type="syscall"):
    # Load configuration
    config_object = Configuration()
    if type == "syscall":
        config_object.load_configuration("host")
        configuration = config_object.default_config
        # Training parameters
        configuration['verbose'] = 1
        configuration['samples_per_batch'] = 5
        # Thesis configuration
        configuration['samples_per_epoch'] = 100000
        # Test configuration to show concept
        # configuration['samples_per_epoch'] = 1000
        configuration['num_epochs'] = 10
        configuration['val_split'] = 0.1
        configuration['weights_file_conv'] = "/var/lib/arhuaco/data/models/sys_W_conv-%s"\
                                             % time.strftime("%Y%m%d-%H%M%S")
        configuration['model_file_conv'] = "/var/lib/arhuaco/data/models/sys_model_conv-%s.json"\
                                           % time.strftime("%Y%m%d-%H%M%S")
        # Training dataset
        configuration['paths'] = ["/var/lib/arhuaco/data/normal_clean_filtered.csv",
                                  "/var/lib/arhuaco/data/malicious_clean_filtered.csv"]
        configuration['pdf_paths'] = ["/var/lib/arhuaco/data/models/sys_cnn_accuracy-%s.pdf"
                                      % time.strftime("%Y%m%d-%H%M%S"),
                                      "/var/lib/arhuaco/data/models/sys_cnn_fpr-%s.pdf"
                                      % time.strftime("%Y%m%d-%H%M%S")]
    elif type == "network":
        # Load configuration
        config_object = Configuration()
        config_object.load_configuration("network")
        configuration = config_object.default_config
        # Training parameters
        configuration['verbose'] = 1
        configuration['samples_per_batch'] = 5
        # Thesis configuration
        # configuration['samples_per_epoch'] = 10000
        # Test configuration to show concept
        configuration['samples_per_epoch'] = 1000
        configuration['num_epochs'] = 10
        configuration['val_split'] = 0.1
        configuration['weights_file_conv'] = "/var/lib/arhuaco/data/models/net_W_conv-%s"\
                                             % time.strftime("%Y%m%d-%H%M%S")
        configuration['model_file_conv'] = "/var/lib/arhuaco/data/models/net_model_conv-%s.json"\
                                           % time.strftime("%Y%m%d-%H%M%S")
        # Training dataset (assigned to the configuration so DataHelpers
        # below picks it up; the original stored it in an unused local)
        configuration['paths'] = ["/var/lib/arhuaco/data/dns_normal.log",
                                  "/var/lib/arhuaco/data/dns_malicious.log"]
        configuration['pdf_paths'] = ["/var/lib/arhuaco/data/models/net_cnn_accuracy-%s.pdf"
                                      % time.strftime("%Y%m%d-%H%M%S"),
                                      "/var/lib/arhuaco/data/models/net_cnn_fpr-%s.pdf"
                                      % time.strftime("%Y%m%d-%H%M%S")]

    # Create objects
    # First create the sources of data
    data_helpers = DataHelpers(
        data_source=configuration['paths'],
        label=None,
        tokens_per_line=configuration['tokens_per_line'],
        number_lines=configuration['number_lines'],
        samples_per_batch=configuration['samples_per_batch'],
        seed=configuration['seed'])
    # Apply the word2vec processing
    w2v = W2V()
    sentence_stream = data_helpers.sentence_stream(
        configuration['samples_per_epoch'])
    params = w2v.train_word2vec_stream(
        sentence_stream,
        num_features=configuration['embedding_dim'],
        min_word_count=configuration['min_word_count'],
        context=configuration['context'],
        num_epochs=configuration['num_epochs'])
    embedding_weights = params[0]
    vocabulary = params[1]
    vocabulary_index = params[2]

    # Create the Convolutional network object
    cnn_w2v = CnnW2v(seed=configuration['seed'],
                     samples_per_batch=configuration['samples_per_batch'],
                     min_word_count=configuration['min_word_count'],
                     context=configuration['context'],
                     weights_file=configuration['weights_file_conv'],
                     model_file=configuration['model_file_conv'],
                     labels=None,
                     verbose=configuration['verbose'])
    cnn_w2v.set_w2v_params(embedding_weights=params[0],
                           vocabulary=params[1],
                           vocabulary_index=params[2])
    # Build the model
    cnn_w2v.build_model(
        learn_rate=configuration['learn_rate'],
        momentum=configuration['momentum'],
        decay=configuration['decay'],
        nesterov=configuration['nesterov'],
        regularizer_param=configuration['regularizer_param'],
        hidden_neurons=configuration['hidden_dims'],
        num_filters=configuration['num_filters'],
        filter_sizes=configuration['filter_sizes'],
        dropout_rate=configuration['dropout_prob'],
        embedding_dim=configuration['embedding_dim'],
        pool_size=configuration['pool_size'],
        sequence_length=configuration['sequence_length'])

    print("Convolutional training")
    # Get the data sources
    training_generator = data_helpers.get_data_chunk(
        vocabulary, configuration['labels_conv'])
    validation_generator = data_helpers.get_data_chunk(
        vocabulary, configuration['labels_conv'])
    test_generator = data_helpers.get_data_chunk(
        vocabulary, configuration['labels_conv'])

    # Train and validate the model
    history_object = cnn_w2v.train_model(
        training_source=training_generator,
        validation_source=validation_generator,
        samples_per_epoch=configuration['samples_per_epoch'],
        number_epochs=configuration['num_epochs'],
        val_split=configuration['val_split'])

    # Test the model with new data
    result = cnn_w2v.test_model(
        test_data_source=test_generator,
        samples_to_test=configuration['samples_per_epoch'])

    # Graphically plot the results (skip for the time being)
    '''
    plot = Plot()