"""Set-up and helper functions for a small sigmoid-based model.

Loads the dataset, initialises the parameters randomly and defines the
numeric helpers used with them.
"""
import numpy as np
from read_dataset import read_data

# define input symbols
X, y_array = read_data('./dataset/dataset.csv')

# define variable symbols
W = [np.random.rand(), np.random.rand()]  # two weights drawn from [0, 1)
b = np.random.rand()                      # bias drawn from [0, 1)
lose_rate = 0.01  # NOTE(review): presumably the learning rate -- confirm
min_error_threshold = 0.5

# Sample counts: the first 3/4 of the data for training, the rest for testing.
train = int(3 / 4 * len(X))
test = len(X) - train


def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) of *x*.

    Works element-wise on NumPy arrays as well as on scalars.
    """
    return 1 / (1 + np.exp(-x))


def derivative_sigmoid(x, w, b):
    """Return the derivative of ``sigmoid(w * x + b)`` with respect to *w*.

    By the chain rule this is ``x * s * (1 - s)`` with
    ``s = sigmoid(w * x + b)``.
    """
    # Evaluate the sigmoid once and reuse it for both factors.
    s = sigmoid(w * x + b)
    return x * s * (1 - s)


def calculate_y(W, X, b):
    """Return the linear response ``matmul(X, W) + b``.

    Parameters
    ----------
    W : array-like
        Weights. The module-level ``W`` has two entries, so ``X`` is
        presumably of shape (n_samples, 2) -- confirm against ``read_data``.
    X : array-like
        Input samples.
    b : float
        Bias added to every output.
    """
    # The previous version also built reshaped copies of W and X and computed
    # a second matmul whose result was discarded (ndarray.reshape returns a
    # new array; it does not modify in place). That dead work is removed.
    return np.matmul(X, W) + b
'''
Malware analysis tool built on automatic feature selection.

Automatic feature selection: features are weighted and selected
automatically from statistical properties of the training set, and are
ranked according to their significance.
'''
import features_selection
import read_dataset
from sklearn.model_selection import train_test_split
import train
import time

# Load the dataset with its content classified as {malware|not malware}:
# x = {file|features' occurrences}, y = {labels}.
x, y = read_dataset.read_data()

# Run every feature-selection strategy on the already classified dataset;
# the selected features are what the model will be trained on.
for heading, selector in (
    ('\nFeatures Selection based on KBest: ',
     features_selection.select_features_k_best),
    ('\nFeatures Selection based on Recursive Features Elimination: ',
     features_selection.select_features_recursive_feature_elimination),
    ('\nFeatures Selection based on Extra trees classifier: ',
     features_selection.select_features_extra_trees),
    ('\nFeatures Selection based on Random Forest classifier: ',
     features_selection.select_features_random_forest),
):
    print(heading)
    selector(x, y)

# Split data into training and testing sets of 80% - 20%
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# NOTE(review): `common` and `read_dataset` are used below but are not
# imported in this part of the file -- presumably imported earlier; confirm.

# Default params
batch_size = 128
epochs = 100
training_dir = "training"
# Filename template with {epoch} and {val_loss} placeholders.
# NOTE(review): presumably consumed by a checkpoint callback later -- confirm.
checkpoint_format = "weights.{epoch:04d}-{val_loss:.2f}.h5"
period = 5  # NOTE(review): presumably the checkpoint interval in epochs -- confirm

# Create training dir
common.create_dir_if_not_exists(training_dir)

# Read data
X_train, y_train = read_dataset.read_data('train_new/')
print(f'X_train.shape = {X_train.shape}, y_train.shape = {y_train.shape}')

# Create model (exactly one of the constructors below should be active)
model = common.create_baseline_model()  # Baseline model
# model = common.create_lenet5_model()  # LeNet5
# model = common.create_alexnet_model()  # AlexNet
# model = common.create_vgg_model()
# model = common.create_cnn_model()

# Print the model summary between two separator lines and wait for the user
# to confirm before continuing.
print("=" * 80)
model.summary()
input("Press Enter to continue...")
print("=" * 80)
"""Convert the raw training data and its labels into ``.npy`` files."""
from read_dataset import read_data, read_labels, convert_to_ascii
import os
import numpy as np

# Only the training split is converted; the other splits are left disabled.
path_list = ["/Train/"]  # ,"/Test/","/Validate/"]
label_list = ["Train_labels.txt"]  # ,"Test_labels.txt","Validate_labels.txt"]

for path in path_list:
    # read_data returns a mapping; stack the transposed values in key order.
    x = read_data(os.getcwd() + path)
    ordered_keys = sorted(x)
    x = np.asarray([x[key].T for key in ordered_keys])

for label in label_list:
    # Labels are taken in the mapping's own iteration order.
    # NOTE(review): the data above is ordered by sorted key while labels are
    # not -- confirm that both orders line up.
    y = read_labels(label)
    encoded = [np.array(convert_to_ascii(text)) for text in y.values()]
    y = np.array(encoded)

np.save("labels.npy", y)
np.save("train_data.npy", x)
"""Evaluate a saved model checkpoint on the test data.

Usage: <this script> <checkpoint_filepath>
"""
import sys
import os
import numpy as np
import common
import read_dataset

# Read data
X_test, y_test = read_dataset.read_data('test_new/')
print(f'X_test.shape = {X_test.shape}, y_test.shape = {y_test.shape}')

# Create model (must match the architecture the checkpoint was saved from)
model = common.create_baseline_model()  # Baseline model
# model = common.create_lenet5_model()  # LeNet5
# model = common.create_alexnet_model()  # AlexNet model
# model = common.create_vgg_model()  # VGG model
# model = common.create_cnn_model()

# Load weights
# Only the argv lookup is guarded: an IndexError raised while loading the
# weights must surface as a real error instead of being reported as a
# missing command-line argument.
try:
    checkpoint_filepath = sys.argv[1]
except IndexError:
    print("Usage: " + os.path.basename(__file__) + " <checkpoint_filepath>")
    sys.exit(1)
model.load_weights(checkpoint_filepath)

# Evaluate
print("=" * 80)
print('Evaluate on test data')
# NOTE(review): results[0] is reported as the loss and results[1] as the
# accuracy -- this assumes the model was compiled with exactly one accuracy
# metric; confirm in `common`.
results = model.evaluate(X_test, y_test)
print(f'Test loss = {results[0]:.2f}')
print(f'Test acc = {results[1]*100:.2f}%')