import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard
from keras.optimizers import Adam
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from data import read_data_file, read_label_file
from model import MLP  # assumed local module providing the MLP builder


def truncNgramMLP():
    data_file = "./Data/train/real_train_data.csv"
    label_file = "./Data/train/real_train_label.csv"
    # data_file = "./Data/train/train.csv"
    # label_file = "./Data/train/train_label.csv"
    X = read_data_file(data_file)
    # Pad/truncate every byte sequence to a fixed length of 328
    X = pad_sequences(X, maxlen=328, dtype='int32', padding='post', truncating='post')
    y = read_label_file(label_file)
    # print("Shape of train data (m):\n", X.shape)
    # print("Data:\n", X[0:5], "\n")
    # print("Shape of train label:", y.shape)
    # print("Label:\n", y[0:5], "\n")

    # Join each byte sequence into a comma-separated string so the
    # TF-IDF vectorizer can treat the bytes as word tokens
    str_X = []
    for i in range(X.shape[0]):
        str_X.append(','.join([str(k) for k in X[i]]))
    df = pd.DataFrame(str_X, index=range(X.shape[0]), columns=['data'])

    # Apply word-level n-gram TF-IDF (2- to 4-grams) to the byte strings
    tfidf = TfidfVectorizer(analyzer="word", max_features=5000, ngram_range=(2, 4))
    # print(tfidf)
    X_transformed = tfidf.fit_transform(df.data)
    # NOTE: unseen test data should be vectorized with tfidf.transform(),
    # not fit_transform(), so it reuses the vocabulary fitted above

    X_train, X_test, y_train, y_test = train_test_split(
        X_transformed, y, test_size=0.2, random_state=42)
    # Success
    print("Training and testing split was successful.")
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

    mlp_model = MLP(X_train.shape[1])
    mlp_model.summary()

    tensorBoardCallback = TensorBoard(log_dir='./logs/trunc_ngram_mlp', write_graph=True)
    optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=1e-4, amsgrad=False)
    # optimizer = SGD(lr=0.01, momentum=0.9, decay=1e-6, nesterov=False)
    mlp_model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    mlp_model.fit(X_train, y_train, callbacks=[tensorBoardCallback], epochs=20, batch_size=128)

    score, acc = mlp_model.evaluate(X_test, y_test, verbose=2, batch_size=128)
    print("score: %.2f" % score)
    print("acc: %.2f" % acc)
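# The function above calls MLP(X_train.shape[1]), whose definition is not in
# this file. A minimal sketch of what such a builder could look like, assuming
# a Keras Sequential binary classifier; the layer sizes and names here are
# illustrative, not the repository's actual architecture.
from keras.models import Sequential
from keras.layers import Dense, Dropout


def MLP(input_dim):
    """Small feed-forward binary classifier over TF-IDF features (sketch)."""
    model = Sequential()
    model.add(Dense(256, activation='relu', input_dim=input_dim))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    # Sigmoid output matches the binary_crossentropy loss used above
    model.add(Dense(1, activation='sigmoid'))
    return model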
from evaluate import evaluate_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from data import read_data_file
from reduce_skewness import ReduceSkewness
from Encoder import One_Hot_Encoder
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Read files
train_df = read_data_file('adult.data')
test_df = read_data_file('adult.test')

# Drop the fnlwgt column, which is useless for the later analysis
train_df = train_df.drop('fnlwgt', axis=1)
test_df = test_df.drop('fnlwgt', axis=1)

# Get the list of categorical variables
object_col = train_df.select_dtypes(include=object).columns.tolist()
for col in object_col:
    print(train_df[col].value_counts(dropna=False) / train_df.shape[0], '\n')

# Convert '?' to NaNs
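# The file cuts off at the comment above. A minimal sketch of the usual pandas
# idiom for that step, assuming the raw UCI Adult values carry leading spaces
# (as they do in the original adult.data/adult.test files):
import numpy as np

for df in (train_df, test_df):
    for col in object_col:
        df[col] = df[col].str.strip()      # strip the padding around values
    df.replace('?', np.nan, inplace=True)  # treat '?' as missing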
def extract_header(in_file_path, out_file_path, skip_flag=True):
    BASE = 0
    DOS_HEADER_LEN = 64
    DOS_STUB_LEN = 14
    DOS_header = []
    PE_header = []
    Sec_header = []
    PE_header_len_list = []
    Sec_header_len_list = []
    raw_data = read_data_file(in_file_path)
    short_index = []
    index = -1
    print('data processing started ...')
    for i in raw_data:
        index += 1
        # Skip data shorter than 97 bytes: too small to be a valid PE file
        if skip_flag and len(i) < 97:
            print('Too short data with length:', len(i),
                  'index is:', index, '.... skipping......')
            short_index.append(index)
            continue

        # Bytes 0-77: fixed-length DOS header (64 bytes) plus DOS stub (14 bytes)
        temp_DOS = i[BASE:DOS_HEADER_LEN + DOS_STUB_LEN]
        DOS_header.append(temp_DOS)

        # Locate the PE header pointer (e_lfanew) in the DOS header
        # (bytes 60-63, little-endian)
        PE_pointer = temp_DOS[DOS_HEADER_LEN - 4:DOS_HEADER_LEN]
        # print('PE_pointer ', PE_pointer)
        PE_header_offset = (PE_pointer[0] + PE_pointer[1] * 256 +
                            PE_pointer[2] * (256 ** 2) + PE_pointer[3] * (256 ** 3))
        # print('PE_header_offset ', PE_header_offset)

        # Locate the PE signature and COFF file header via the PE header offset
        PE_signature = i[PE_header_offset:PE_header_offset + 4]
        PE_file_header = i[PE_header_offset + 4:PE_header_offset + 24]

        # Read the section count from the COFF file header (little-endian
        # uint16 at offset 2) and compute the total section header length
        Sec_num_pointer = PE_file_header[2:4]
        # print(Sec_num_pointer)
        Sec_num = Sec_num_pointer[0] + Sec_num_pointer[1] * 256
        Sec_header_len = Sec_num * 40

        # Read the optional header length from the COFF file header (little-endian)
        Opt_header_hex = PE_file_header[16:18]
        Opt_header_len = Opt_header_hex[0] + Opt_header_hex[1] * 256

        # Compute the section header offset from the PE header length
        Sec_header_offset = PE_header_offset + 24 + Opt_header_len
        # Get the PE optional header via the PE header offset
        PE_opt_header = i[PE_header_offset + 24:Sec_header_offset]

        # Concatenate the PE signature, COFF file header, and optional header
        temp_PE = list()
        temp_PE.extend(PE_signature)
        temp_PE.extend(PE_file_header)
        temp_PE.extend(PE_opt_header)
        # print('temp_PE ', temp_PE)
        PE_header.append(temp_PE)
        PE_header_len_list.append(len(temp_PE))

        temp_Sec_header = i[Sec_header_offset:Sec_header_offset + Sec_header_len]
        # Truncate each 40-byte section header, keeping only its first 12 bytes
        trunc_sec_header = []
        for j in range(Sec_num):
            trunc_sec_header.extend(temp_Sec_header[j * 40:j * 40 + 12])
        Sec_header.append(trunc_sec_header)
        Sec_header_len_list.append(len(trunc_sec_header))

    print('PE_header_max_len: ', max(PE_header_len_list))
    print('Sec_header_max_len: ', max(Sec_header_len_list))
    return PE_header_len_list, Sec_header_len_list
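# The manual byte arithmetic above is little-endian integer decoding; a small
# illustration (not part of the original module) of the same fields expressed
# with int.from_bytes:
def decode_le(byte_seq):
    """Decode a little-endian unsigned integer from a sequence of byte values."""
    return int.from_bytes(bytes(byte_seq), byteorder='little')

# e.g., with the names used in extract_header:
#   PE_header_offset = decode_le(temp_DOS[60:64])       # e_lfanew
#   Sec_num          = decode_le(PE_file_header[2:4])   # NumberOfSections
#   Opt_header_len   = decode_le(PE_file_header[16:18]) # SizeOfOptionalHeader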
# %% [markdown]
# ## Plot PE Header length distribution - Training data
# Credit: [@MengdanCode](https://github.com/MengdanCode)

# %%
from data import read_data_file, read_label_file
import numpy as np
from matplotlib import pyplot as plt

data_file = "./Data/train/train.csv"
label_file = "./Data/train/train_label.csv"
X = read_data_file(data_file)
y = read_label_file(label_file)

# %%
X_len = []
for i in X:
    X_len.append(len(i))
print('X_len generated')

X_len = np.array(X_len)
print(X_len.min())
print(X_len.max())

fig_per_hour = plt.figure()
per_hour = fig_per_hour.add_subplot(111)
# `normed` has been removed from matplotlib; `density` is the current keyword
counts, bins, patches = per_hour.hist(X_len, bins=100, density=False)
plt.show()
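# %%
# A possible follow-up (not in the original notebook): summarize the length
# distribution numerically, e.g. to justify a fixed truncation length such as
# the maxlen=328 used when padding sequences elsewhere in this repo.
for q in (50, 90, 95, 99):
    print('%dth percentile length: %d' % (q, np.percentile(X_len, q)))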