from keras.models import Model
from keras.optimizers import Adam
from keras.utils import np_utils
from keras.layers import Input, LSTM, Bidirectional, Conv2D, Reshape

from malware_classification import common_process_data as read_data

batch_size = 64
TIME_STEPS = 25
INPUT_DIM = 25
lstm_units = 128
num_classes = 15
epochs = 40

# Data pre-processing.
# (X_train, y_train), (X_test, y_test) = mnist.load_data('mnist.npz')
(X_train, y_train), (X_test, y_test) = read_data.load_npz_data(
    "F:/数据集/Kim2016/malware_dataset/malware_dataset/attention_train_test_data.npz"
)
X_train = X_train.reshape(-1, 25, 25, 1) / 255.  # scale pixel values to [0, 1]
X_test = X_test.reshape(-1, 25, 25, 1) / 255.
# Convert labels to one-hot encoding.
y_train = np_utils.to_categorical(y_train, num_classes=num_classes)
y_test = np_utils.to_categorical(y_test, num_classes=num_classes)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

# Build the RNN model with attention.
inputs = Input(shape=(25, 25, 1))

# Build the CNN front end whose feature maps feed the RNN.
x = Conv2D(filters=128, kernel_size=(5, 5), padding='same',
           activation='relu')(inputs)  # call truncated in the source; activation assumed
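# --- Hedged sketch: the rest of this model is cut off in the excerpt. A minimal
# continuation consistent with the imports above (Reshape, Bidirectional, LSTM)
# might look like the following; the attention block, layer sizes and training
# settings are assumptions, not the author's exact configuration.
from keras import backend as K
from keras.layers import Dense, Lambda

x = Reshape((TIME_STEPS, -1))(x)  # (batch, 25, 25*128): one feature vector per image row
x = Bidirectional(LSTM(lstm_units, return_sequences=True))(x)
# Simple additive attention over the 25 time steps (assumed).
score = Dense(1, activation='tanh')(x)                          # (batch, 25, 1)
weights = Lambda(lambda s: K.softmax(s, axis=1))(score)         # attention weights per step
x = Lambda(lambda t: K.sum(t[0] * t[1], axis=1))([x, weights])  # attention-weighted sum
outputs = Dense(num_classes, activation='softmax')(x)

model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer=Adam(),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
          validation_data=(X_test, y_test))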
import time

from keras.models import Model
from keras.layers import *
from keras.utils import np_utils

from malware_classification.Self_Attention import Self_Attention_Layer
from malware_classification import common_process_data as read_data
from malware_classification import global_var as GLVAR

max_features = GLVAR.TOTAL_OPERATIONS_COUNT + 1  # must be 1 larger than the number of operations
epochs = 25
batch_size = 32

print('Loading data...')
(x_train, y_train), (x_test, y_test) = read_data.load_npz_data(GLVAR.TRAIN_AND_TEST_DATA)
x_train = x_train.reshape(-1, GLVAR.pic_pow_size * GLVAR.pic_pow_size)  # why / 255?
x_test = x_test.reshape(-1, GLVAR.pic_pow_size * GLVAR.pic_pow_size)
# Convert labels to one-hot encoding.
y_train = np_utils.to_categorical(y_train, num_classes=GLVAR.NUM_CLASSES)
y_test = np_utils.to_categorical(y_test, num_classes=GLVAR.NUM_CLASSES)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

# %% Data normalization.
maxlen = GLVAR.pic_pow_size * GLVAR.pic_pow_size
print('x_train shape:', x_train.shape)
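# --- Hedged sketch: `common_process_data.load_npz_data` is not shown in this
# excerpt. Judging from how it is called, a minimal implementation could look
# like this (the archive key names x_train/x_test/y_train/y_test are inferred
# from the np.savez call elsewhere in the repo, not confirmed for every file):
import numpy as np

def load_npz_data(path):
    """Load an np.savez archive and return Keras-style (train, test) splits."""
    data = np.load(path)
    return (data['x_train'], data['y_train']), (data['x_test'], data['y_test'])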
print("cnn_x_test length is:", len(cnn_x_test.shape)) cnn_x_train = cnn_x_train.reshape(-1,64,32,1) cnn_x_test = cnn_x_test.reshape(-1,64,32,1) print(cnn_x_train.shape) print(cnn_x_test.shape) np.savez(GLVAR.MULTY_BINARY_CNN_TRAIN_TEST_DATA, x_train=cnn_x_train, x_test=cnn_x_test, y_train=y_train,y_test=y_test) ''' max_features = 1000 # 该数要比operation的个数大1 maxlen = 2048 (x_train, y_train), (x_test, y_test) = read_data.load_npz_data( GLVAR.MULTY_BINARY_CNN_TRAIN_TEST_DATA) x_train = x_train.reshape(-1, 2048) x_test = x_test.reshape(-1, 2048) S_inputs = Input(shape=(maxlen, ), dtype='float32') embeddings = Embedding(max_features, 256)(S_inputs) O_seq = Self_Attention_Layer(256)(embeddings) O_seq = GlobalAveragePooling1D()(O_seq) # O_seq = Dropout(0.5)(O_seq) outputs = Dense(GLVAR.NUM_CLASSES, activation='softmax')(O_seq)
import os

import keras
import keras_metrics
import tensorflow as tf
from keras.models import Model
from keras.layers import Input, Embedding, Dense, Dropout, GlobalAveragePooling1D
from keras.utils import np_utils
from matplotlib import pyplot as plt

from malware_classification.Self_Attention import Self_Attention_Layer
from malware_classification import common_process_data as read_data
from malware_classification import global_var as GLVAR


def train_self_attention(label_name, epochs, batch_size, score_filename):
    print("================training ", label_name, ".......================")
    # Pin the job to GPU 0 and let TensorFlow grow GPU memory on demand.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    keras.backend.set_session(sess)

    max_features = GLVAR.TOTAL_OPERATIONS_COUNT + 1  # must be 1 larger than the number of operations

    print('Loading data...')
    current_label_data = GLVAR.MULTY_BINARY_TRAIN_AND_TEST_DATA_DIR + label_name + '.npz'
    print("Current label train and test data path is: %s" % current_label_data)
    (x_train, y_train), (x_test, y_test) = read_data.load_npz_data(current_label_data)
    x_train = x_train.reshape(-1, GLVAR.pic_pow_size * GLVAR.pic_pow_size)  # why / 255?
    x_test = x_test.reshape(-1, GLVAR.pic_pow_size * GLVAR.pic_pow_size)
    # Convert labels to one-hot encoding.
    y_train = np_utils.to_categorical(
        y_train, num_classes=GLVAR.NUM_CLASSES_OF_MULTY_BINARY)
    y_test = np_utils.to_categorical(
        y_test, num_classes=GLVAR.NUM_CLASSES_OF_MULTY_BINARY)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')

    # %% Data normalization.
    maxlen = GLVAR.pic_pow_size * GLVAR.pic_pow_size
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)

    S_inputs = Input(shape=(maxlen,), dtype='int32')
    # The embedding vocabulary must cover every operation id, so index it with
    # max_features (the original passed maxlen, the sequence length, here).
    embeddings = Embedding(max_features, 256)(S_inputs)
    O_seq = Self_Attention_Layer(256)(embeddings)
    O_seq = GlobalAveragePooling1D()(O_seq)
    O_seq = Dropout(0.5)(O_seq)
    O_seq = Dense(16, activation='softmax')(O_seq)  # note: 'relu' is the usual choice for a hidden layer
    outputs = Dense(GLVAR.NUM_CLASSES_OF_MULTY_BINARY, activation='softmax')(O_seq)
    model = Model(inputs=S_inputs, outputs=outputs)
    model.summary()

    # Try using different optimizers and different optimizer configs.
    recall = keras_metrics.binary_recall(label=0)
    # binary_crossentropy suits this one-vs-rest binary task.
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy', recall])

    # %%
    print('Training')
    h = model.fit(x_train, y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=(x_test, y_test))
    model_filename = GLVAR.MULTY_BINARY_SELF_ATTENTION_MODEL_DIR + label_name + ".h5"
    model.save(model_filename)

    plt.plot(h.history["loss"], label="train_loss")
    plt.plot(h.history["val_loss"], label="val_loss")
    plt.plot(h.history["acc"], label="train_acc")
    plt.plot(h.history["val_acc"], label="val_acc")
    plt.legend()
    plt.show()

    print("-----------------------DY Add------------------------")
    show_train_history(h, 'acc', 'val_acc', epochs)
    show_train_history(h, 'loss', 'val_loss', epochs)

    print('Testing--------------')
    loss, accuracy, recall = model.evaluate(x_test, y_test, batch_size=batch_size)
    print('test loss:', loss)
    print('test accuracy:', accuracy)
    print('test recall:', recall)
    print("\t[Info] Accuracy of testing data = {:2.1f}%".format(accuracy * 100.0))
    score = ("----------" + label_name + "----------\n"
             + "test loss:" + format(loss, '.2f') + "\n"
             + "test accuracy:" + format(accuracy, '.2f') + "\n"
             + "test recall:" + format(recall, '.2f') + "\n")
    with open(score_filename, "a") as f:
        f.write(score)
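# --- Hedged sketch: `show_train_history` is called above but not defined in this
# excerpt. Based on its call sites (a History object, a train/validation metric
# key pair, and the epoch count), a plausible helper is:
def show_train_history(history, train_metric, val_metric, epochs):
    """Plot one training metric against its validation counterpart."""
    plt.plot(range(1, epochs + 1), history.history[train_metric], label='train')
    plt.plot(range(1, epochs + 1), history.history[val_metric], label='validation')
    plt.title('Train History')
    plt.xlabel('Epoch')
    plt.ylabel(train_metric)
    plt.legend(loc='upper left')
    plt.show()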
from matplotlib import pyplot as plt
import time

from keras.models import Model
from keras.layers import *
from keras.utils import np_utils

from malware_classification.Self_Attention import Self_Attention_Layer
from malware_classification import common_process_data as read_data
from malware_classification import global_var as GLVAR

max_features = GLVAR.TOTAL_OPERATIONS_COUNT + 1  # must be 1 larger than the number of operations
epochs = 30
batch_size = 32

print('Loading data...')
(x_train, y_train), (x_test, y_test) = read_data.load_npz_data(GLVAR.TRAIN_AND_TEST_DATA)
x_train = x_train.reshape(-1, GLVAR.pic_pow_size * GLVAR.pic_pow_size)  # why / 255?
x_test = x_test.reshape(-1, GLVAR.pic_pow_size * GLVAR.pic_pow_size)
# Convert labels to one-hot encoding.
y_train = np_utils.to_categorical(y_train, num_classes=GLVAR.NUM_CLASSES)
y_test = np_utils.to_categorical(y_test, num_classes=GLVAR.NUM_CLASSES)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

# %% Data normalization.
maxlen = GLVAR.pic_pow_size * GLVAR.pic_pow_size
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
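# --- Hedged sketch: the model definition and training loop are cut off in this
# excerpt. A continuation consistent with the commented-out block in the CNN
# preprocessing script and with train_self_attention would plausibly be:
S_inputs = Input(shape=(maxlen,), dtype='int32')
embeddings = Embedding(max_features, 256)(S_inputs)  # one 256-d vector per operation id
O_seq = Self_Attention_Layer(256)(embeddings)        # self-attention over the sequence
O_seq = GlobalAveragePooling1D()(O_seq)              # pool time steps to one vector
outputs = Dense(GLVAR.NUM_CLASSES, activation='softmax')(O_seq)

model = Model(inputs=S_inputs, outputs=outputs)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',       # multi-class labels are one-hot
              metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
          validation_data=(x_test, y_test))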