def main():
    """Train and evaluate four classifiers on handwritten-digit data.

    Fits logistic regression, a neural network, an SVM and a random
    forest on the training split, then evaluates each on the MNIST,
    USPS and combined test sets, printing accuracies and plotting
    confusion matrices via ``print_and_plot``.
    """
    data = Data()
    logistic_regression = models.LogisticRegression()
    neural_network = models.NeuralNet()
    svm = models.SupportVectorMachine(C=1.0, kernel='rbf', gamma='scale')
    random_forest = models.RandomForest(n_estimators=100,
                                        max_depth=None,
                                        random_state=None)

    # Process dataset
    training_data_features, training_data_labels, mnist_test_data_features, mnist_test_data_labels, \
        usps_test_data_features, usps_test_data_labels, combined_test_data_features, combined_test_data_labels = \
        data.pre_process()

    def evaluate_and_report(model, title):
        # Shared evaluate-on-three-test-sets + report step; this was
        # previously copy-pasted once per classifier.
        accuracy_mnist, confusion_mnist = model.predict(
            mnist_test_data_features, mnist_test_data_labels)
        accuracy_usps, confusion_usps = model.predict(
            usps_test_data_features, usps_test_data_labels)
        accuracy_combined, confusion_combined = model.predict(
            combined_test_data_features, combined_test_data_labels)
        print_and_plot(title, accuracy_mnist, accuracy_usps,
                       accuracy_combined, confusion_mnist, confusion_usps,
                       confusion_combined)

    # Logistic Regression
    logistic_regression.fit(training_data_features, training_data_labels,
                            learning_rate=0.01, epochs=500)
    evaluate_and_report(logistic_regression, 'Logistic Regression')

    # Neural Network
    neural_network.fit(training_data_features, training_data_labels, epochs=10)
    evaluate_and_report(neural_network, 'Neural Network')

    # Support Vector Machine
    svm.fit(training_data_features, training_data_labels)
    evaluate_and_report(svm, 'SVM')

    # Random Forest
    random_forest.fit(training_data_features, training_data_labels)
    evaluate_and_report(random_forest, 'Random Forest')
#loaded_model = load("model_SVC.joblib")
#SVM.test_svm_classifier(loaded_model, val_data, val_labels)
# Load a previously trained Keras DNN and evaluate it on the validation set.
# NOTE(review): `val_data` / `val_labels` are not defined in this view —
# presumably bound in an enclosing scope or earlier in the file; confirm.
loaded_model = load_model("models/best_model_DNN_Adam.h5")
NN.test_neural_network(loaded_model, val_data, val_labels)


if __name__ == "__main__":
    total_features = 545333  # total unique features
    testing_set_size = 1500  # set size that will be used to create random test set
    malware_ratio = 0.3  # malware ratio in the set size

    print("Creating data-labels...")
    onehot.create_list_of_apps()  # function from set_one_encoding.py

    # initialize sklearn models
    GNB = models.GaussianNaiveBayes()
    MNB = models.MultinomialNaiveBayes()
    CNB = models.ComplementNaiveBayes()
    BNB = models.BernoulliNaiveBayes()
    DT = models.DecisionTree()
    RF = models.RandomForest()
    KNN = models.KNearestNeighbors()
    LR = models.LogRegression()
    SVM = models.SupportVectorMachine()

    val_runs = 8
    #evaluate_models(val_runs)
    evaluate_on_test_set()
def train_external_detector():
    """Train an "external detector" that separates legitimate input
    (class 0) from adversarially-crafted input (class 1).

    Pipeline:
      1. Load a pre-trained Keras malware classifier and print its
         baseline confusion-matrix statistics on the training split.
      2. Craft an adversarial variant of every malware sample
         (label == 1) with ``craft_adversarial_samples``, accumulating
         the crafted rows and the average number of feature changes
         ("distortion").
      3. Print the classifier's statistics on a fresh training split and
         on the crafted adversarial set.
      4. Train a fresh neural network plus a battery of sklearn models on
         the original data relabelled 0 concatenated with the adversarial
         data labelled 1, printing each model's score.

    NOTE(review): most sklearn models below are scored on their own
    training data (``final_train_data``), so the printed scores measure
    fit, not generalization; only BNB is scored on ``test_data`` —
    confirm whether that asymmetry is intentional.
    """
    train_data, train_labels, test_data, test_labels = create_sets()
    trained_model = tf.keras.models.load_model('best_model_Adam.h5')

    # --- Baseline statistics of the pre-trained model on train_data ---
    predict_original = trained_model.predict(train_data)
    confusion = confusion_matrix(train_labels,
                                 np.argmax(predict_original, axis=1))
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    FNR_original = FN / float(FN + TP) * 100  # baseline false-negative rate (%)
    FPR = FP / float(FP + TN) * 100
    accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
    print(confusion)
    print("Original FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
    print("Original Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR_original)

    # --- Craft adversarial samples from every malware (label 1) row ---
    average_changes = 0
    amount_malwares = 0
    averageChanges = 0
    # the numpy array will be filled dynamically
    # NOTE(review): 3880 must match the model's feature width — confirm.
    adversarial_data = np.zeros((0, 3880), dtype=float)
    for i in range(len(train_data)):
        if train_labels[i] == 1:
            x = train_data[i:i + 1]  # slice keeps the 2-D (1, n_features) shape
            # print("x: ", x)
            # print(x.shape)
            try:
                # target class 0, i.e. make malware look legitimate
                adv_x, changes = craft_adversarial_samples(
                    x, 0, trained_model, 1)
                # print(adv_x)
                # append the adversarial data to the numpy array
                adversarial_data = np.concatenate((adversarial_data, adv_x))
                if changes >= 0:
                    average_changes += changes
                    amount_malwares += 1
            except NameError:
                # samples the crafting helper cannot handle are skipped
                pass
            except ValueError:
                pass
    if amount_malwares > 0:
        averageChanges += (average_changes / float(amount_malwares))

    # --- Statistics on a fresh training split ---
    # NOTE(review): this re-evaluates the ORIGINAL model on freshly drawn
    # (non-adversarial) data, yet the output is labelled "Adversarial" —
    # confirm the intent.
    train_data, train_labels, test_data, test_labels = create_sets()
    predictions = trained_model.predict(train_data)
    confusion = confusion_matrix(train_labels, np.argmax(predictions, axis=1))
    print(confusion)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    FNR = FN / float(FN + TP) * 100
    FPR = FP / float(FP + TN) * 100
    accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
    print("Adversarial FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
    print("Adversarial Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR)
    print("Misclassification Rate:", FNR - FNR_original)
    print("Distortion:", averageChanges)

    # --- Statistics on the crafted adversarial samples (all true label 1) ---
    predictions = trained_model.predict(adversarial_data)
    adversarial_labels = np.ones((len(adversarial_data), ), dtype=int)
    confusion = confusion_matrix(adversarial_labels,
                                 np.argmax(predictions, axis=1))
    print(confusion)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    FNR = FN / float(FN + TP) * 100
    FPR = FP / float(FP + TN) * 100
    accuracy = ((TP + TN) / float(TP + TN + FP + FN)) * 100
    print("Adversarial FP:", FP, "- FN:", FN, "- TP:", TP, "- TN", TN)
    print("Adversarial Accuracy:", accuracy, "- FPR:", FPR, "- FNR:", FNR)
    print("Misclassification Rate:", FNR - FNR_original)
    print("Distortion:", averageChanges)
    # NOTE(review): `changes_dict` is not defined in this function —
    # presumably a module-level global from the crafting module; confirm.
    print(changes_dict)
    del predict_original, FNR_original, predictions, confusion, TP, TN, FP, FN, FNR, FPR, accuracy

    # concatenate legit with produced adversarial input
    final_train_data = np.concatenate((train_data, adversarial_data))
    print("final train data shape:", final_train_data.shape)
    train_labels = np.zeros((len(train_labels), ),
                            dtype=int)  # fill with 0 (the original class)
    print("train labels shape:", train_labels.shape)
    adverarial_labels = np.ones(
        (len(adversarial_data), ),
        dtype=int)  # fill with 1 (the adversarial class)
    print("adversarial labels:", adverarial_labels.shape)
    final_train_labels = np.concatenate((train_labels, adverarial_labels))
    print("final labels shape:", final_train_labels.shape)
    print("Unique classes:", np.unique(final_train_labels))
    del train_data, train_labels, adversarial_data, adverarial_labels

    #shuffle the set
    # NOTE(review): if this is sklearn.utils.shuffle, it returns shuffled
    # COPIES and does not modify its arguments in place, so discarding the
    # return value makes this a no-op — confirm which `shuffle` is imported.
    shuffle(final_train_data, final_train_labels, random_state=123)

    # train with the augmented dataset (with adverarial examples belong to class '1')
    # NOTE(review): validation uses the training set itself — confirm.
    model = generate_neural_network(total_features, [200, 200], 0.2, 0.001,
                                    "glorot_uniform", "zeros", "relu", 2)
    train_neural_network(model,
                         epochs=30,
                         batch_size=150,
                         features=final_train_data,
                         labels=final_train_labels,
                         verbose=2,
                         validation=True,
                         val_data=final_train_data,
                         val_labels=final_train_labels,
                         callbacks=True,
                         path=dir_path + "logs/fit/",
                         model_name="external_detector_2")

    # --- Train/score the sklearn model zoo on the augmented dataset ---
    GNB = models.GaussianNaiveBayes()
    MNB = models.MultinomialNaiveBayes()
    CNB = models.ComplementNaiveBayes()
    BNB = models.BernoulliNaiveBayes()
    DT = models.DecisionTree()
    RF = models.RandomForest()
    KNN = models.KNearestNeighbors()
    LR = models.LogRegression()
    SVM = models.SupportVectorMachine()

    model = GNB.train_gaussian_naive_bayes_classifier(
        final_train_data, final_train_labels)  # train Naive Bayes
    score_GNB = GNB.evaluate_gaussian_naive_bayes_classifier(
        model, final_train_data, final_train_labels)  # test performance
    print("GNB", score_GNB)

    model = MNB.train_multi_naive_bayes_classifier(final_train_data,
                                                   final_train_labels)
    score_MNB = MNB.evaluate_multi_naive_bayes_classifier(
        model, final_train_data, final_train_labels)
    print("MNB", score_MNB)

    model = CNB.train_complement_naive_bayes_classifier(
        final_train_data, final_train_labels)
    score_CNB = CNB.evaluate_complement_naive_bayes_classifier(
        model, final_train_data, final_train_labels)
    print("CNB", score_CNB)

    model = BNB.train_bernoulli_naive_bayes_classifier(final_train_data,
                                                       final_train_labels)
    # NOTE(review): BNB alone is scored on the held-out test set, unlike
    # the other models above/below — confirm this is deliberate.
    score_BNB = BNB.evaluate_bernoulli_naive_bayes_classifier(
        model, test_data, test_labels)
    print("BNB", score_BNB)

    model = DT.train_decision_tree_classifier(
        final_train_data, final_train_labels)  # train Decision Tree Classifier
    score_dt = DT.evaluate_decision_tree_classifier(model, final_train_data,
                                                    final_train_labels)
    print("DT:", score_dt)

    model = LR.train_logistic_regression_classifier(
        final_train_data, final_train_labels)  # train logistic Regression
    score_lr = LR.evaluate_logistic_regression_classifier(
        model, final_train_data, final_train_labels)
    print("LR", score_lr)

    model = KNN.train_knn_classifier(
        final_train_data,
        final_train_labels)  # train k-Nearest Neighbors Classifier
    score_knn = KNN.evaluate_knn_classifier(model, final_train_data,
                                            final_train_labels)
    print("KNN", score_knn)

    model = SVM.train_svm_classifier(
        final_train_data, final_train_labels)  # train Support Vector Machines
    score_svm = SVM.evaluate_svm_classifier(model, final_train_data,
                                            final_train_labels)
    print("SVM", score_svm)

    model = RF.train_random_forest_classifier(
        final_train_data, final_train_labels)  # train Random Forest
    score_rf = RF.evaluate_random_forest_classifier(model, final_train_data,
                                                    final_train_labels)
    print("RF:", score_rf)
def main():
    """Run the full experiment suite on the MNIST/USPS digit data.

    Trains and evaluates Bayesian discriminant analysis, logistic
    regression, a neural network, an SVM and a random forest on the
    MNIST, USPS and combined test sets, then runs the generative-model
    experiments: RBMs with several hidden-layer sizes, VAEs with several
    latent-code sizes, and a convolutional VAE.
    """
    data = Data()
    logistic_regression = models.LogisticRegression()
    neural_network = models.NeuralNet()
    svm = models.SupportVectorMachine(C=1.0, kernel='rbf', gamma='scale')
    random_forest = models.RandomForest(n_estimators=100,
                                        max_depth=None,
                                        random_state=None)
    discriminant_analysis = DiscriminantAnalysis()
    vaecnn = deep_learning_models.VAEConvolutionNeuralNet(
        input_data.read_data_sets("data", one_hot=True), (28, 28), (28, 28))

    # Process dataset
    training_data_features, training_data_labels, mnist_test_data_features, mnist_test_data_labels, \
        usps_test_data_features, usps_test_data_labels, combined_test_data_features, combined_test_data_labels = \
        data.pre_process()

    def evaluate_and_report(model, title):
        # Shared evaluate-on-three-test-sets + report step; this was
        # previously copy-pasted once per classifier.
        accuracy_mnist, confusion_mnist = model.predict(
            mnist_test_data_features, mnist_test_data_labels)
        accuracy_usps, confusion_usps = model.predict(
            usps_test_data_features, usps_test_data_labels)
        accuracy_combined, confusion_combined = model.predict(
            combined_test_data_features, combined_test_data_labels)
        print_and_plot(title, accuracy_mnist, accuracy_usps,
                       accuracy_combined, confusion_mnist, confusion_usps,
                       confusion_combined)

    # Discriminant Analysis — expects square images, so reshape the flat
    # feature vectors; side length is inferred from the feature count.
    IMAGE_SIZE = int(training_data_features.shape[-1]**0.5)
    discriminant_analysis.fit(
        training_data_features.reshape((-1, IMAGE_SIZE, IMAGE_SIZE)),
        training_data_labels)
    accuracy_mnist, confusion_mnist = discriminant_analysis.predict(
        'MNIST dataset',
        mnist_test_data_features.reshape((-1, IMAGE_SIZE, IMAGE_SIZE)),
        mnist_test_data_labels)
    accuracy_usps, confusion_usps = discriminant_analysis.predict(
        'USPS dataset',
        usps_test_data_features.reshape((-1, IMAGE_SIZE, IMAGE_SIZE)),
        usps_test_data_labels)
    accuracy_combined, confusion_combined = discriminant_analysis.predict(
        'Combined dataset',
        combined_test_data_features.reshape((-1, IMAGE_SIZE, IMAGE_SIZE)),
        combined_test_data_labels)
    print_and_plot('Bayesian Discriminant Analysis', accuracy_mnist,
                   accuracy_usps, accuracy_combined, confusion_mnist,
                   confusion_usps, confusion_combined)

    # Logistic Regression
    logistic_regression.fit(training_data_features, training_data_labels,
                            learning_rate=0.01, epochs=500)
    evaluate_and_report(logistic_regression, 'Logistic Regression')

    # Neural Network
    neural_network.fit(training_data_features, training_data_labels, epochs=10)
    evaluate_and_report(neural_network, 'Neural Network')

    # Support Vector Machine
    svm.fit(training_data_features, training_data_labels)
    evaluate_and_report(svm, 'SVM')

    # Random Forest
    random_forest.fit(training_data_features, training_data_labels)
    evaluate_and_report(random_forest, 'Random Forest')

    # Restricted Boltzmann Machine: fit, sample and visualize for
    # several hidden-layer sizes
    num_hidden_nodes_list = [20, 100, 500]
    for num_hidden_nodes in num_hidden_nodes_list:
        rbm = deep_learning_models.RBM(
            images=input_data.read_data_sets("data", one_hot=True),
            n_components=num_hidden_nodes,
            learning_rate=0.02,
            batch_size=100,
            n_iter=1000,
            random_state=0)
        rbm.fit()
        rbm.gibbs_sampling(1000)
        rbm.generate_images(num_hidden_nodes)

    # Variational Auto Encoders with increasing latent-code sizes
    code_unit_list = [2, 8, 16]
    for code_unit in code_unit_list:
        vae = deep_learning_models.VAE(
            input_data.read_data_sets("data", one_hot=True), code_unit)
        vae.generate_images(epochs=20)

    # Variational Auto Encoders with Convolutional Neural Networks
    vaecnn.encode()
    vaecnn.decode()
    vaecnn.compile_()
    vaecnn.train(epochs=10, batch_size=100)
def main():
    """Train and evaluate five classifiers on handwritten-digit data.

    Fits Bayesian discriminant analysis, logistic regression, a neural
    network, an SVM and a random forest on the training split, then
    evaluates each on the MNIST, USPS and combined test sets, printing
    accuracies and plotting confusion matrices via ``print_and_plot``.
    """
    data = Data()
    logistic_regression = models.LogisticRegression()
    neural_network = models.NeuralNet()
    svm = models.SupportVectorMachine(C=1.0, kernel='rbf', gamma='scale')
    random_forest = models.RandomForest(n_estimators=100,
                                        max_depth=None,
                                        random_state=None)
    discriminant_analysis = DiscriminantAnalysis()

    # Process dataset
    training_data_features, training_data_labels, mnist_test_data_features, mnist_test_data_labels, \
        usps_test_data_features, usps_test_data_labels, combined_test_data_features, combined_test_data_labels = \
        data.pre_process()

    def evaluate_and_report(model, title):
        # Shared evaluate-on-three-test-sets + report step; this was
        # previously copy-pasted once per classifier.
        accuracy_mnist, confusion_mnist = model.predict(
            mnist_test_data_features, mnist_test_data_labels)
        accuracy_usps, confusion_usps = model.predict(
            usps_test_data_features, usps_test_data_labels)
        accuracy_combined, confusion_combined = model.predict(
            combined_test_data_features, combined_test_data_labels)
        print_and_plot(title, accuracy_mnist, accuracy_usps,
                       accuracy_combined, confusion_mnist, confusion_usps,
                       confusion_combined)

    # Discriminant Analysis — expects square images, so reshape the flat
    # feature vectors; side length is inferred from the feature count.
    IMAGE_SIZE = int(training_data_features.shape[-1]**0.5)
    discriminant_analysis.fit(
        training_data_features.reshape((-1, IMAGE_SIZE, IMAGE_SIZE)),
        training_data_labels)
    accuracy_mnist, confusion_mnist = discriminant_analysis.predict(
        'MNIST dataset',
        mnist_test_data_features.reshape((-1, IMAGE_SIZE, IMAGE_SIZE)),
        mnist_test_data_labels)
    accuracy_usps, confusion_usps = discriminant_analysis.predict(
        'USPS dataset',
        usps_test_data_features.reshape((-1, IMAGE_SIZE, IMAGE_SIZE)),
        usps_test_data_labels)
    accuracy_combined, confusion_combined = discriminant_analysis.predict(
        'Combined dataset',
        combined_test_data_features.reshape((-1, IMAGE_SIZE, IMAGE_SIZE)),
        combined_test_data_labels)
    print_and_plot('Bayesian Discriminant Analysis', accuracy_mnist,
                   accuracy_usps, accuracy_combined, confusion_mnist,
                   confusion_usps, confusion_combined)

    # Logistic Regression
    logistic_regression.fit(training_data_features, training_data_labels,
                            learning_rate=0.01, epochs=500)
    evaluate_and_report(logistic_regression, 'Logistic Regression')

    # Neural Network
    neural_network.fit(training_data_features, training_data_labels, epochs=10)
    evaluate_and_report(neural_network, 'Neural Network')

    # Support Vector Machine
    svm.fit(training_data_features, training_data_labels)
    evaluate_and_report(svm, 'SVM')

    # Random Forest
    random_forest.fit(training_data_features, training_data_labels)
    evaluate_and_report(random_forest, 'Random Forest')