Example #1
def analyze_syscalls():
    # Parameters
    min_word_count = 5  # Minimum word count
    context = 10  # Context window size
    paths = [
        "/var/lib/arhuaco/data/normal_clean.csv",
        "/var/lib/arhuaco/data/malicious_clean.csv"
    ]
    labels = [0, 1]
    number_samples = 2
    number_samples_w2v = 10000
    num_epochs = 10
    embedding_dim = 10
    # Model Hyperparameters
    max_length = 7
    n_gram = 6
    # Create objects
    data_helpers = DataHelpers(paths, labels, max_length, n_gram,
                               number_samples)
    w2v = W2V()
    sentence_stream = data_helpers.sentence_stream(number_samples_w2v)
    params = w2v.train_word2vec_stream(sentence_stream,
                                       num_features=embedding_dim,
                                       min_word_count=min_word_count,
                                       context=context,
                                       num_epochs=num_epochs)
    # Create the model
    classes = np.array([0, 1])
    clf = SGDClassifier(loss='hinge',
                        penalty="l2",
                        eta0=0.01,
                        learning_rate='constant')
    # Data load
    data_generator = data_helpers.get_data_chunk(params[1])
    # Training the model
    train_accuracy = []
    test_accuracy = []
    x_train, y_train = next(data_generator)
    clf.partial_fit(x_train, y_train, classes=classes)
    for batch in range(num_epochs):
        for sample in range(1000):
            x_train_aux, y_train_aux = next(data_generator)
            # x_test_aux, y_test_aux = next(data_generator)
            x_train = np.concatenate([x_train, x_train_aux], 0)
            y_train = np.concatenate([y_train, y_train_aux], 0)
            # x_test  = np.concatenate([x_test, x_test_aux], 0)
            # y_test  = np.concatenate([y_test, y_test_aux], 0)
        clf.partial_fit(x_train, y_train)
        print("Batch: %d" % batch)
        print('Train Accuracy: %.3f' % clf.score(x_train, y_train))
        # print('Test Accuracy: %.3f' % clf.score(x_test, y_test))
        train_accuracy.append(clf.score(x_train, y_train))
        # test_accuracy.append(clf.score(x_test, y_test))
    # Plot the results
    plot = Plot()
    plot.history2plot([train_accuracy], "Model accuracy", "Epoch", "Accuracy")
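
The example above streams batches from DataHelpers.get_data_chunk() into SGDClassifier.partial_fit(). A minimal, self-contained sketch of that incremental-learning pattern, with a synthetic batch generator standing in for the arhuaco data helpers (the generator, batch size, and feature dimension are assumptions):

import numpy as np
from sklearn.linear_model import SGDClassifier

def synthetic_batches(n_batches=10, batch_size=32, dim=70):
    # Stand-in for data_helpers.get_data_chunk(): yields (features, labels).
    rng = np.random.RandomState(0)
    for _ in range(n_batches):
        x = rng.normal(size=(batch_size, dim))
        y = rng.randint(0, 2, size=batch_size)
        yield x, y

clf = SGDClassifier(loss='hinge', penalty="l2", eta0=0.01,
                    learning_rate='constant')
classes = np.array([0, 1])
for i, (x_batch, y_batch) in enumerate(synthetic_batches()):
    # The full class list is only required on the first partial_fit call.
    clf.partial_fit(x_batch, y_batch, classes=classes)
    print("Batch %d train accuracy: %.3f" % (i, clf.score(x_batch, y_batch)))

Later partial_fit calls keep updating the same linear model batch by batch, which is what lets the example train without holding the whole dataset in memory.
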
Example #2
def analyze_network():
    # Parameters
    seed = 5
    model_variation = 'svm-non-static'

    # Model Hyperparameters
    # Max length of one sentence
    max_length = 5
    # Number of lines included in the
    # series
    n_gram = 1
    # Total length of the classification
    # object
    sequence_length = max_length * n_gram
    # Size of the vector representing each word
    embedding_dim = 10
    # Conv. filters passed to svmW2v below
    # (values assumed, mirroring the CNN example)
    filter_sizes = (2, 3)
    num_filters = 3
    dropout_prob = (0.25, 0.5)
    # Number of neurons in the hidden layer
    hidden_dims = 10

    # Training parameters
    number_samples = 5
    number_samples_w2v = 10000
    num_epochs = 100
    val_split = 0.1

    # Word2Vec parameters, see train_word2vec
    # Minimum word count
    min_word_count = 1
    # Number of words that make sense in the context
    context = 4
    action = "predict"
    weights_file = "/var/lib/arhuaco/data/models/weights_file"
    sentence = "process ptrace request=11(PTRACE_ATTACH)"
    # Training dataset
    paths = [ "/var/lib/arhuaco/data/dns_normal.log", "/var/lib/arhuaco/data/dns_malicious.log"]
    # Training labels
    labels = [ 0, 1 ]

    # Create objects
    data_helpers = DataHelpers( paths, labels, max_length, n_gram, number_samples)
    w2v = W2V()
    sentence_stream = data_helpers.sentence_stream(number_samples_w2v)
    params = w2v.train_word2vec_stream(sentence_stream,
                              num_features=embedding_dim,
                              min_word_count=min_word_count, context=context,
                              num_epochs=num_epochs)
    svm_w2v = svmW2v(seed, model_variation, sequence_length, embedding_dim,
                     filter_sizes, num_filters, dropout_prob, hidden_dims, number_samples,
                     num_epochs, val_split, min_word_count, context, action, weights_file,
                     sentence, paths, labels, data_helpers)
    svm_w2v.get_data(params[0], params[1], params[2])
    svm_w2v.build_model()
    svm_w2v.train_model()
Example #3
def analyze_network():
    clf = SGDClassifier(loss='hinge', penalty="l2")
    # Word2Vec parameters, see train_word2vec
    min_word_count = 1  # Minimum word count
    context = 4  # Context window size
    paths = [
        "/var/lib/arhuaco/data/dns_normal.log",
        "/var/lib/arhuaco/data/dns_malicious.log"
    ]
    labels = [0, 1]
    number_samples = 10
    num_epochs = 100
    embedding_dim = 5
    # Model Hyperparameters
    max_length = 5
    n_gram = 1

    # Create objects
    data_helpers = DataHelpers(paths, labels, max_length, n_gram,
                               number_samples)
    w2v = W2V()
    # Load data
    print("Loading data...")
    x, y, vocabulary, vocabulary_inv = data_helpers.load_data()
    embedding_weights, vocabulary = w2v.train_word2vec(x, embedding_dim,
                                                       min_word_count, context)

    classes = np.array([0, 1])
    # Data load
    data_generator = data_helpers.get_data_chunk(vocabulary)
    # Training the model
    train_accuracy = []
    train_loss = []
    test_accuracy = []
    test_loss = []
    for batch in range(num_epochs):
        x_train, y_train = next(data_generator)
        x_test, y_test = next(data_generator)
        clf.partial_fit(x_train, y_train, classes=classes)
        print("Batch: %d" % batch)
        print('Train Accuracy: %.3f' % clf.score(x_train, y_train))
        print('Test Accuracy: %.3f' % clf.score(x_test, y_test))
        train_accuracy.append(clf.score(x_train, y_train))
        test_accuracy.append(clf.score(x_test, y_test))
    # Plot the results
    plot = Plot()
    plot.history2plot([train_accuracy, test_accuracy], "Model accuracy",
                      "Epoch", "Accuracy")
Example #4
    def optimize_cnn_hyperparameters(self,
                                     tokens_per_line,
                                     number_lines,
                                     type="syscall"):
        # Load configuration
        config_object = Configuration()
        if type == "syscall":
            config_object.load_configuration("host")
            configuration = config_object.default_config

            # Training parameters
            configuration['verbose'] = 2
            configuration['samples_per_batch'] = 5
            configuration['samples_per_epoch'] = 10000
            configuration['num_epochs'] = 100
            configuration['val_split'] = 0.1

            configuration['weights_file_conv'] = "/var/lib/arhuaco/data/models/sys_W_conv-%s"\
                                                 % time.strftime("%Y%m%d-%H%M%S")
            configuration['model_file_conv'] = "/var/lib/arhuaco/data/models/sys_model_conv-%s.json"\
                                               % time.strftime("%Y%m%d-%H%M%S")
            # Training dataset
            configuration['paths'] = [
                "/var/lib/arhuaco/data/normal_clean_filtered.csv",
                "/var/lib/arhuaco/data/malicious_clean_filtered.csv"
            ]
        elif type == "network":
            # Load configuration
            config_object = Configuration()
            config_object.load_configuration("network")
            configuration = config_object.default_config

            # Training parameters
            configuration['verbose'] = 2
            configuration['samples_per_batch'] = 5
            configuration['samples_per_epoch'] = 1000
            configuration['num_epochs'] = 100
            configuration['val_split'] = 0.1

            configuration['weights_file_conv'] = "/var/lib/arhuaco/data/models/net_W_conv-%s"\
                                                 % time.strftime("%Y%m%d-%H%M%S")
            configuration['model_file_conv'] = "/var/lib/arhuaco/data/models/net_model_conv-%s.json"\
                                                % time.strftime("%Y%m%d-%H%M%S")
            # Training dataset
            configuration['paths'] = [
                "/var/lib/arhuaco/data/dns_normal.log",
                "/var/lib/arhuaco/data/dns_malicious.log"
            ]

        # Create objects
        # First create the sources of data
        data_helpers = DataHelpers(
            data_source=configuration['paths'],
            label=None,
            tokens_per_line=tokens_per_line,
            number_lines=number_lines,
            samples_per_batch=configuration['samples_per_batch'],
            seed=configuration['seed'])

        # Apply the word2vec processing
        w2v = W2V()
        sentence_stream = data_helpers.sentence_stream(
            configuration['samples_per_epoch'])
        params = w2v.train_word2vec_stream(
            sentence_stream,
            num_features=configuration['embedding_dim'],
            min_word_count=configuration['min_word_count'],
            context=configuration['context'],
            num_epochs=configuration['num_epochs'])
        embedding_weights = params[0]
        vocabulary = params[1]
        vocabulary_index = params[2]

        # Create the Convolutional network object
        cnn_w2v = CnnW2v(seed=configuration['seed'],
                         samples_per_batch=configuration['samples_per_batch'],
                         min_word_count=configuration['min_word_count'],
                         context=configuration['context'],
                         weights_file=configuration['weights_file_conv'],
                         model_file=configuration['model_file_conv'],
                         labels=None,
                         verbose=configuration['verbose'])
        cnn_w2v.set_w2v_params(embedding_weights=params[0],
                               vocabulary=params[1],
                               vocabulary_index=params[2])

        print("Convolutional optimization")
        # Get the data sources
        training_generator = data_helpers.get_data_chunk(
            vocabulary, configuration['labels_conv'])
        validation_generator = data_helpers.get_data_chunk(
            vocabulary, configuration['labels_conv'])
        test_generator = data_helpers.get_data_chunk(
            vocabulary, configuration['labels_conv'])

        # Create model for grid search
        model = KerasClassifier(build_fn=cnn_w2v.build_model,
                                epochs=8,
                                batch_size=10,
                                verbose=3)

        # Define the grid search parameters
        learn_rate = [0.001, 0.01, 0.1]
        momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]
        decay = [0.0, 1e-5, 1e-6, 1e-7]
        nesterov = [True, False]
        regularizer_param = [0.1, 0.01, 0.001]
        hidden_neurons = [5, 10, 20, 30]
        num_filters = [5, 10, 20, 30]
        filter_sizes = [(1, 2, 3, 4), (3, 4, 5), (5, 6)]
        dropout_rate = [0.0, 0.5, 0.1, 0.01]
        embedding_dim = [configuration['embedding_dim']]
        pool_size = [2, 3, 4]
        sequence_length = [tokens_per_line * number_lines]

        print("Starting grid search for %d tokens per line and %d lines" %
              (tokens_per_line, number_lines))
        param_grid = dict(learn_rate=learn_rate,
                          momentum=momentum,
                          decay=decay,
                          nesterov=nesterov,
                          regularizer_param=regularizer_param,
                          hidden_neurons=hidden_neurons,
                          num_filters=num_filters,
                          filter_sizes=filter_sizes,
                          dropout_rate=dropout_rate,
                          embedding_dim=embedding_dim,
                          pool_size=pool_size,
                          sequence_length=sequence_length)

        grid = GridSearchCV(estimator=model,
                            param_grid=param_grid,
                            n_jobs=-1,
                            verbose=3)

        print("Extracting data from source...")
        X, Y = next(training_generator)
        # TODO: fix this! It is too slow...
        for i in range(configuration['samples_per_epoch']):
            X_i, Y_i = next(training_generator)
            X = np.append(X, X_i, axis=0)
            Y = np.append(Y, Y_i, axis=0)
        print("Starting grid search trainings...")
        grid_result = grid.fit(X, Y, verbose=3)
        # summarize results
        print("Best: %f using %s" %
              (grid_result.best_score_, grid_result.best_params_))
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, param))
Example #5
def analyze_syscalls():
    # Parameters
    seed = 5
    verbose = 2

    # Model Hyperparameters
    # Max length of one sentence
    max_length = 7
    # Number of lines included in the
    # series
    n_gram = 6
    # Total length of the classification
    # object
    sequence_length = max_length * n_gram
    # Size of the vector representing each word
    embedding_dim = 20
    dropout_prob = (0.0, 0.0)
    # Number of neurons in the hidden layer
    hidden_dims = 20

    # Training parameters
    number_samples = 5
    samples_per_epoch = 10000
    num_epochs = 100
    val_split = 0.1

    # Word2Vec parameters, see train_word2vec
    # Minimum word count
    min_word_count = 6
    # Number of words that make sense in the context
    context = 10
    weights_file_svm = "/var/lib/arhuaco/data/models/sys_W_svm-%s"\
                       % time.strftime("%Y%m%d-%H%M%S")
    model_file_svm = "/var/lib/arhuaco/data/models/sys_model_svm-%s.json"\
                     % time.strftime("%Y%m%d-%H%M%S")
    # Training dataset
    paths = [
        "/var/lib/arhuaco/data/normal_clean.csv",
        "/var/lib/arhuaco/data/malicious_clean.csv"
    ]
    # Training labels
    labels_svm = [-1, 1]

    # Create objects
    data_helpers = DataHelpers(paths, None, max_length, n_gram, number_samples,
                               seed)
    w2v = W2V()
    sentence_stream = data_helpers.sentence_stream(samples_per_epoch)
    params = w2v.train_word2vec_stream(sentence_stream,
                                       num_features=embedding_dim,
                                       min_word_count=min_word_count,
                                       context=context,
                                       num_epochs=num_epochs)
    svm = SVM(seed, sequence_length, embedding_dim, dropout_prob, hidden_dims,
              number_samples, num_epochs, val_split, min_word_count, context,
              weights_file_svm, model_file_svm, paths, None, data_helpers,
              verbose)
    svm.get_data(params[0], params[1], params[2])
    svm.build_model()
    print("SVM syscall training")
    history_svm = svm.train_model(samples_per_epoch, labels_svm)
    result = svm.test_model(10000, labels_svm, max_length, n_gram)
    # Graphically plot the results
    plot = Plot()
    # Training vs validation
    plot.history2plot([
        history_svm.history['real_accuracy'],
        history_svm.history['val_real_accuracy']
    ], ['Training', 'Validation'],
                      "SVM accuracy",
                      "Epoch",
                      "Accuracy",
                      "/var/lib/arhuaco/data/models/sys_svm_accuracy-%s.pdf" %
                      time.strftime("%Y%m%d-%H%M%S"),
                      location='lower right')
    # Training vs validation fpr
    plot.history2plot([history_svm.history['false_pos_rate'],
                       history_svm.history['val_false_pos_rate']],
                       ['Training', 'Validation'],
                       "SVM false positive rate", "Epoch",
                       "False positive rate",
                       "/var/lib/arhuaco/data/models/sys_svm_fpr-%s.pdf"\
                       % time.strftime("%Y%m%d-%H%M%S"),
                       location='upper right')
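
Plot.history2plot() is not shown in these examples. A minimal matplotlib sketch of what such a helper could look like, saving one figure with a curve per history series (the signature, output path, and toy numbers are assumptions, not the arhuaco implementation):

import matplotlib
matplotlib.use("Agg")  # headless rendering, no display needed
import matplotlib.pyplot as plt

def history2plot(series, legend, title, xlabel, ylabel, path,
                 location='lower right'):
    # Draw one curve per history series and save the figure to disk.
    fig, ax = plt.subplots()
    for curve in series:
        ax.plot(curve)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.legend(legend, loc=location)
    fig.savefig(path)
    plt.close(fig)

history2plot([[0.6, 0.8, 0.9], [0.55, 0.75, 0.85]],
             ['Training', 'Validation'],
             "SVM accuracy", "Epoch", "Accuracy",
             "/tmp/svm_accuracy.pdf")
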
Example #6
    def build_model(self, type="syscall"):
        # Load configuration
        config_object = Configuration()
        if type == "syscall":
            # Load configuration
            config_object.load_configuration("host")
            configuration = config_object.default_config
            configuration['input_queue'] = self.input_queue_dict["syscall_sensor"]

            # Training parameters
            configuration['verbose'] = 2
            configuration['samples_per_batch'] = 5
            configuration['samples_per_epoch'] = 100000
            configuration['num_epochs'] = 10
            configuration['val_split'] = 0.1

            configuration['weights_file_conv'] = "/var/lib/arhuaco/data/models/sys_W_conv.bin"
            configuration['model_file_conv'] = "/var/lib/arhuaco/data/models/sys_model_conv.json"
        elif type == "network":
            # Load configuration
            config_object = Configuration()
            config_object.load_configuration("network")
            configuration = config_object.default_config
            configuration['input_queue'] = self.input_queue_dict["network_sensor"]

            # Training parameters
            configuration['verbose'] = 2
            configuration['samples_per_batch'] = 5
            configuration['samples_per_epoch'] = 10000
            configuration['num_epochs'] = 10
            configuration['val_split'] = 0.1

            configuration['weights_file_conv'] = "/var/lib/arhuaco/data/models/net_W_conv.bin"
            configuration['model_file_conv'] = "/var/lib/arhuaco/data/models/net_model_conv.json"

        w2v_model_name = "{:d}features_{:d}minwords_{:d}context".format(
                      configuration['num_features'],
                      configuration['min_word_count'],
                      configuration['context'])

        # Create objects
        # Apply the word2vec processing
        w2v = W2V()
        params = w2v.load_word2vec_model(w2v_model_name)
        embedding_weights = params[0]
        configuration['vocabulary'] = params[1]
        configuration['vocabulary_index'] = params[2]

        # Create the Convolutional network object
        cnn_w2v = CnnW2v(seed=configuration['seed'],
                  samples_per_batch=configuration['samples_per_batch'],
                  min_word_count=configuration['min_word_count'],
                  context=configuration['context'],
                  weights_file=configuration['weights_file_conv'],
                  model_file=configuration['model_file_conv'],
                  labels=None,
                  verbose=configuration['verbose'])
        cnn_w2v.set_w2v_params(embedding_weights=embedding_weights,
                               vocabulary=configuration['vocabulary'],
                               vocabulary_index=configuration['vocabulary_index'])
        # Build the model
        # Do I really need to build the model again?
        # What is in the model.json file then?
        cnn_w2v.build_model(learn_rate=configuration['learn_rate'],
                            momentum=configuration['momentum'],
                            decay=configuration['decay'],
                            nesterov=configuration['nesterov'],
                            regularizer_param=configuration['regularizer_param'],
                            hidden_neurons=configuration['hidden_dims'],
                            num_filters=configuration['num_filters'],
                            filter_sizes=configuration['filter_sizes'],
                            dropout_rate=configuration['dropout_prob'],
                            embedding_dim=configuration['embedding_dim'],
                            pool_size=configuration['pool_size'],
                            sequence_length=configuration['sequence_length']
                           )
        cnn_w2v.load_model_weights(configuration['weights_file_conv'])
        return cnn_w2v, configuration
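
On the question raised in the comments above: if model_file_conv stores the serialized Keras architecture, the model can be restored from the JSON file and the weights loaded afterwards, instead of rebuilding it from hyperparameters. A sketch under that assumption (whether CnnW2v serializes its model this way, and the compile settings, are assumptions):

from keras.models import model_from_json

# Restore the architecture from the stored JSON description...
with open("/var/lib/arhuaco/data/models/sys_model_conv.json") as f:
    model = model_from_json(f.read())
# ...then load the trained parameters saved during training.
model.load_weights("/var/lib/arhuaco/data/models/sys_W_conv.bin")
# Compile settings are assumed; they are only needed for further
# training or evaluation, not for producing predictions.
model.compile(loss="binary_crossentropy", optimizer="sgd", metrics=["accuracy"])
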
Example #7
    def train(self, type="syscall"):
        # Load configuration
        config_object = Configuration()
        if type == "syscall":
            config_object.load_configuration("host")
            configuration = config_object.default_config

            # Training parameters
            configuration['verbose'] = 2
            configuration['samples_per_batch'] = 5
            configuration['samples_per_epoch'] = 100000
            configuration['num_epochs'] = 10
            configuration['val_split'] = 0.1

            configuration['weights_file_svm'] = "/var/lib/arhuaco/data/models/sys_W_svm-%s"\
                                                 % time.strftime("%Y%m%d-%H%M%S")
            configuration['model_file_svm'] = "/var/lib/arhuaco/data/models/sys_model_svm-%s.json"\
                                               % time.strftime("%Y%m%d-%H%M%S")
            # Training dataset
            configuration['paths'] = [
                "/var/lib/arhuaco/data/normal_clean_filtered.csv",
                "/var/lib/arhuaco/data/malicious_clean_filtered.csv"
            ]

            configuration['pdf_paths'] = ["/var/lib/arhuaco/data/models/sys_svm_accuracy-%s.pdf"
                                          % time.strftime("%Y%m%d-%H%M%S"),
                                          "/var/lib/arhuaco/data/models/sys_svm_fpr-%s.pdf"\
                                          % time.strftime("%Y%m%d-%H%M%S")]

        elif type == "network":
            # Load configuration
            config_object = Configuration()
            config_object.load_configuration("network")
            configuration = config_object.default_config

            # Training parameters
            configuration['verbose'] = 2
            configuration['samples_per_batch'] = 5
            configuration['samples_per_epoch'] = 1000
            configuration['num_epochs'] = 10
            configuration['val_split'] = 0.1

            configuration['weights_file_svm'] = "/var/lib/arhuaco/data/models/net_W_svm-%s"\
                                                 % time.strftime("%Y%m%d-%H%M%S")
            configuration['model_file_svm'] = "/var/lib/arhuaco/data/models/net_model_svm-%s.json"\
                                                % time.strftime("%Y%m%d-%H%M%S")
            # Training dataset
            configuration['paths'] = [
                "/var/lib/arhuaco/data/dns_normal.log",
                "/var/lib/arhuaco/data/dns_malicious.log"
            ]
            # "/var/lib/arhuaco/data/dns_malicious_generated.log"]

            configuration['pdf_paths'] = ["/var/lib/arhuaco/data/models/net_svm_accuracy-%s.pdf"
                                          % time.strftime("%Y%m%d-%H%M%S"),
                                          "/var/lib/arhuaco/data/models/net_svm_fpr-%s.pdf"\
                                          % time.strftime("%Y%m%d-%H%M%S")]

        # Create objects
        # First create the sources of data
        data_helper = DataHelpers(
            data_source=configuration['paths'],
            label=None,
            tokens_per_line=configuration['tokens_per_line'],
            number_lines=configuration['number_lines'],
            samples_per_batch=configuration['samples_per_batch'],
            seed=configuration['seed'])

        # Apply the word2vec processing
        w2v = W2V()
        sentence_stream = data_helper.sentence_stream(
            configuration['samples_per_epoch'])
        params = w2v.train_word2vec_stream(
            sentence_stream,
            num_features=configuration['embedding_dim'],
            min_word_count=configuration['min_word_count'],
            context=configuration['context'],
            num_epochs=configuration['num_epochs'])
        embedding_weights = params[0]
        vocabulary = params[1]
        vocabulary_index = params[2]

        # Create the svm network object
        svm_bow = SVM(seed=configuration['seed'],
                      samples_per_batch=configuration['samples_per_batch'],
                      min_word_count=configuration['min_word_count'],
                      context=configuration['context'],
                      weights_file=configuration['weights_file_svm'],
                      model_file=configuration['model_file_svm'],
                      labels=None,
                      verbose=configuration['verbose'])
        svm_bow.set_bow_params(embedding_weights=params[0],
                               vocabulary=params[1],
                               vocabulary_index=params[2])

        # Build the model
        svm_bow.build_model(
            learn_rate=configuration['learn_rate'],
            momentum=configuration['momentum'],
            decay=configuration['decay'],
            nesterov=configuration['nesterov'],
            regularizer_param=configuration['regularizer_param'],
            dropout_rate=configuration['dropout_prob'],
            embedding_dim=configuration['embedding_dim'],
        )
        print("svm training")
        # Get the data sources
        training_generator = data_helper.get_data_BoW_chunk(
            vocabulary, configuration['labels_svm'])
        validation_generator = data_helper.get_data_BoW_chunk(
            vocabulary, configuration['labels_svm'])

        # Train and validate the model
        history_object = svm_bow.train_model(
            training_source=training_generator,
            validation_source=validation_generator,
            samples_per_epoch=configuration['samples_per_epoch'],
            number_epochs=configuration['num_epochs'],
            val_split=configuration['val_split'])
        # Test the model with new data
        # Create a new data source for validation with generated data
        configuration['paths'][1] = '/var/lib/arhuaco/data/dns_malicious.log'
        configuration['samples_per_epoch'] = 1000

        validation_data_helper = DataHelpers(
            data_source=configuration['paths'],
            label=None,
            tokens_per_line=configuration['tokens_per_line'],
            number_lines=configuration['number_lines'],
            samples_per_batch=configuration['samples_per_batch'],
            seed=configuration['seed'] + 3)

        test_generator = validation_data_helper.get_data_BoW_chunk(
            vocabulary, configuration['labels_svm'])

        result = svm_bow.test_model(
            test_data_source=test_generator,
            samples_to_test=configuration['samples_per_epoch'])
        # Graphically plot the results
        plot = Plot()
        # Training vs validation accuracy
        plot.history2plot([
            history_object.history['real_accuracy'],
            history_object.history['val_real_accuracy']
        ], ['Training', 'Validation'], "svm accuracy", "Epoch", "Accuracy",
                          configuration['pdf_paths'][0], 'lower right', [0, 9],
                          [0.8, 1.0])
        # Training vs validation fpr
        plot.history2plot([
            history_object.history['false_pos_rate'],
            history_object.history['val_false_pos_rate']
        ], ['Training', 'Validation'], "svm false positive rate", "Epoch",
                          "False positive rate", configuration['pdf_paths'][1],
                          'upper right', [0, 9], [0, 0.2])
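
get_data_BoW_chunk() suggests the SVM consumes bag-of-words count vectors built over the word2vec vocabulary rather than embedding sequences. A minimal sketch of that transformation for a single tokenized line (the helper name, vocabulary, and tokens are assumptions):

import numpy as np

def to_bow(tokens, vocabulary):
    # vocabulary maps word -> integer index; unknown words are dropped.
    vec = np.zeros(len(vocabulary), dtype=np.float32)
    for token in tokens:
        idx = vocabulary.get(token)
        if idx is not None:
            vec[idx] += 1.0
    return vec

vocabulary = {"dns": 0, "query": 1, "response": 2, "example.com": 3}
print(to_bow(["dns", "query", "example.com", "unknown"], vocabulary))
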
Example #8
def analyze_network():
    # Parameters
    seed = 5
    model_variation = 'CNN-non-static'

    # Model Hyperparameters
    # Max length of one sentence
    max_length = 5
    # Number of lines included in the
    # series
    n_gram = 1
    # Total length of the classification
    # object
    sequence_length = max_length * n_gram
    # Size of the vector representing each word
    embedding_dim = 10
    # Conv. Filters applied to the text
    filter_sizes = (2, 3)
    # Total filters used
    num_filters = 3
    dropout_prob = (0.0, 0.0)
    # Number of neurons in the hidden layer
    hidden_dims = 10

    # Training parameters
    number_samples = 5
    samples_per_epoch = 1000
    num_epochs = 100
    val_split = 0.1
    verbose = 2

    # Word2Vec parameters, see train_word2vec
    # Minimum word count
    min_word_count = 1
    # Number of words that make sense in the context
    context = 4
    weights_file_conv = "/var/lib/arhuaco/data/models/net_W_conv-%s"\
                         % time.strftime("%Y%m%d-%H%M%S")
    model_file_conv = "/var/lib/arhuaco/data/models/net_model_conv-%s.json"\
                      % time.strftime("%Y%m%d-%H%M%S")
    # Training dataset
    paths = [
        "/var/lib/arhuaco/data/dns_normal.log",
        #"/var/lib/arhuaco/data/dns_malicious.log"]
        "/var/lib/arhuaco/data/dns_malicious_generated.log"
    ]
    # Training labels
    labels_conv = [0, 1]

    # Create objects
    data_helpers = DataHelpers(paths, None, max_length, n_gram, number_samples,
                               seed)
    w2v = W2V()
    sentence_stream = data_helpers.sentence_stream(samples_per_epoch)
    params = w2v.train_word2vec_stream(sentence_stream,
                                       num_features=embedding_dim,
                                       min_word_count=min_word_count,
                                       context=context,
                                       num_epochs=num_epochs)
    cnn_w2v = CnnW2v(seed, model_variation, sequence_length, embedding_dim,
                     filter_sizes, num_filters, dropout_prob, hidden_dims,
                     number_samples, num_epochs, val_split, min_word_count,
                     context, weights_file_conv, model_file_conv, paths, None,
                     data_helpers, verbose)
    cnn_w2v.get_data(params[0], params[1], params[2])
    cnn_w2v.build_model()
    print("Convolutional network training")
    history_conv = cnn_w2v.train_model(samples_per_epoch, labels_conv)
    cnn_w2v.paths[1] = "/var/lib/arhuaco/data/dns_malicious.log"
    result = cnn_w2v.test_model(1000, labels_conv, max_length, n_gram)
    # Graphically plot the results
    plot = Plot()
    # Training vs validation
    plot.history2plot([history_conv.history['real_accuracy'],
                       history_conv.history['val_real_accuracy']],
                       ['Training', 'Validation'],
                       "CNN accuracy", "Epoch", "Accuracy",
                       "/var/lib/arhuaco/data/models/net_cnn_accuracy-%s.pdf"\
                       % time.strftime("%Y%m%d-%H%M%S"),
                       location='lower right')
    # Training vs validation fpr
    plot.history2plot([history_conv.history['false_pos_rate'],
                       history_conv.history['val_false_pos_rate']],
                       ['Training', 'Validation'],
                       "CNN false positive rate", "Epoch",
                       "False positive rate",
                       "/var/lib/arhuaco/data/models/net_cnn_fpr-%s.pdf"\
                       % time.strftime("%Y%m%d-%H%M%S"),
                       location='upper right')
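
CnnW2v.build_model() is defined elsewhere. For orientation, a generic CNN-over-word-embeddings text classifier in Keras looks roughly like the following; the layer sizes loosely mirror the hyperparameters above, but this is a sketch, not the arhuaco architecture:

import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

vocab_size, embedding_dim = 1000, 10

model = Sequential([
    # Word indices -> embedding vectors (weights could be seeded from word2vec).
    Embedding(vocab_size, embedding_dim),
    Conv1D(filters=3, kernel_size=2, activation="relu"),
    GlobalMaxPooling1D(),
    Dense(10, activation="relu"),
    Dropout(0.5),
    Dense(1, activation="sigmoid"),
])
model.compile(loss="binary_crossentropy", optimizer="sgd", metrics=["accuracy"])

# Toy batch: five sequences of word indices with binary labels.
x = np.random.randint(0, vocab_size, size=(5, 5))   # (samples, sequence_length)
y = np.random.randint(0, 2, size=(5, 1))
model.fit(x, y, epochs=1, verbose=0)
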
Example #9
    def train(self,
              type="syscall"
              ):
        # Load configuration
        config_object = Configuration()
        if type == "syscall":
            config_object.load_configuration("host")
            configuration = config_object.default_config

            # Training parameters
            configuration['verbose'] = 1
            configuration['samples_per_batch'] = 5
            # Thesis configuration
            configuration['samples_per_epoch'] = 100000
            # Test configuration to show concept
            # configuration['samples_per_epoch'] = 1000
            configuration['num_epochs'] = 10
            configuration['val_split'] = 0.1

            configuration['weights_file_conv'] = "/var/lib/arhuaco/data/models/sys_W_conv-%s"\
                                                 % time.strftime("%Y%m%d-%H%M%S")
            configuration['model_file_conv'] = "/var/lib/arhuaco/data/models/sys_model_conv-%s.json"\
                                               % time.strftime("%Y%m%d-%H%M%S")
            # Training dataset
            configuration['paths'] = [ "/var/lib/arhuaco/data/normal_clean_filtered.csv",
                                       "/var/lib/arhuaco/data/malicious_clean_filtered.csv"]

            configuration['pdf_paths'] = ["/var/lib/arhuaco/data/models/sys_cnn_accuracy-%s.pdf"
                                          % time.strftime("%Y%m%d-%H%M%S"),
                                          "/var/lib/arhuaco/data/models/sys_cnn_fpr-%s.pdf"\
                                          % time.strftime("%Y%m%d-%H%M%S")]

        elif type == "network":
            # Load configuration
            config_object = Configuration()
            config_object.load_configuration("network")
            configuration = config_object.default_config

            # Training parameters
            configuration['verbose'] = 1
            configuration['samples_per_batch'] = 5
            # Thesis configuration
            # configuration['samples_per_epoch'] = 10000
            # Test configuration to show concept
            configuration['samples_per_epoch'] = 1000
            configuration['num_epochs'] = 10
            configuration['val_split'] = 0.1

            configuration['weights_file_conv'] = "/var/lib/arhuaco/data/models/net_W_conv-%s"\
                                                 % time.strftime("%Y%m%d-%H%M%S")
            configuration['model_file_conv'] = "/var/lib/arhuaco/data/models/net_model_conv-%s.json"\
                                                % time.strftime("%Y%m%d-%H%M%S")
            # Training dataset
            configuration['paths'] = [ "/var/lib/arhuaco/data/dns_normal.log",
                                       "/var/lib/arhuaco/data/dns_malicious.log"]

            configuration['pdf_paths'] = ["/var/lib/arhuaco/data/models/net_cnn_accuracy-%s.pdf"
                                          % time.strftime("%Y%m%d-%H%M%S"),
                                          "/var/lib/arhuaco/data/models/net_cnn_fpr-%s.pdf"\
                                          % time.strftime("%Y%m%d-%H%M%S")]

        # Create objects
        # First create the sources of data
        data_helpers = DataHelpers(data_source=configuration['paths'],
                                   label=None,
                                   tokens_per_line=configuration['tokens_per_line'],
                                   number_lines=configuration['number_lines'],
                                   samples_per_batch=configuration['samples_per_batch'],
                                   seed=configuration['seed'])

        # Apply the word2vec processing
        w2v = W2V()
        sentence_stream = data_helpers.sentence_stream(
                                       configuration['samples_per_epoch'])
        params = w2v.train_word2vec_stream(sentence_stream,
                                    num_features=configuration['embedding_dim'],
                                    min_word_count=configuration['min_word_count'],
                                    context=configuration['context'],
                                    num_epochs=configuration['num_epochs'])
        embedding_weights = params[0]
        vocabulary = params[1]
        vocabulary_index = params[2]

        # Create the Convolutional network object
        cnn_w2v = CnnW2v(seed=configuration['seed'],
                  samples_per_batch=configuration['samples_per_batch'],
                  min_word_count=configuration['min_word_count'],
                  context=configuration['context'],
                  weights_file=configuration['weights_file_conv'],
                  model_file=configuration['model_file_conv'],
                  labels=None,
                  verbose=configuration['verbose'])
        cnn_w2v.set_w2v_params(embedding_weights=params[0],
                               vocabulary=params[1],
                               vocabulary_index=params[2])

        # Build the model
        cnn_w2v.build_model(learn_rate=configuration['learn_rate'],
                            momentum=configuration['momentum'],
                            decay=configuration['decay'],
                            nesterov=configuration['nesterov'],
                            regularizer_param=configuration['regularizer_param'],
                            hidden_neurons=configuration['hidden_dims'],
                            num_filters=configuration['num_filters'],
                            filter_sizes=configuration['filter_sizes'],
                            dropout_rate=configuration['dropout_prob'],
                            embedding_dim=configuration['embedding_dim'],
                            pool_size=configuration['pool_size'],
                            sequence_length=configuration['sequence_length']
                           )
        print("Convolutional training")
        # Get the data sources
        training_generator = data_helpers.get_data_chunk(vocabulary,
                                                   configuration['labels_conv'])
        validation_generator = data_helpers.get_data_chunk(vocabulary,
                                                configuration['labels_conv'])
        test_generator = data_helpers.get_data_chunk(vocabulary,
                                               configuration['labels_conv'])

        # Train and validate the model
        history_object = cnn_w2v.train_model(
            training_source=training_generator,
            validation_source=validation_generator,
            samples_per_epoch=configuration['samples_per_epoch'],
            number_epochs=configuration['num_epochs'],
            val_split=configuration['val_split'])
        # Test the model with new data
        result = cnn_w2v.test_model(test_data_source=test_generator,
                                    samples_to_test=configuration['samples_per_epoch'])
        # Graphically plot the results (skip for the time being)
        ''' plot = Plot()