        # loop body: load each .h5 file and collect its dataframe
        filename = os.path.basename(fullname)
        df = utils.load_h5(dir, filename)
        dataframes.append(df)
        num_examples = len(df.values)

# create one large dataframe
data = pd.concat(dataframes)
# sample() returns a copy, so the shuffled frame must be assigned back
data = data.sample(frac=1, random_state=seed).reset_index(drop=True)
num_rows = data.shape[0]
columns = data.columns
print(columns)

# step 3: get the features (x) and scale them
# get x and convert it to a numpy array
# x = da.getbytes(data, 1460)
standard_scaler = StandardScaler()
x = da.getbytes(data, num_headers * 54)  # 54 bytes per packet header (assumed Ethernet + IPv4 + TCP)
x_std = standard_scaler.fit_transform(x)

# step 4: get the class labels (y) and encode them as numbers
y = data['label'].values
# encode the class labels
class_labels = np.unique(y)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# step 5: split the data into a training set and a test set
test_percentage = 0.1
x_tests = []
y_tests = []
x_train, x_test, y_train, y_test = train_test_split(
    x_std, y, test_size=test_percentage, random_state=seed)

# t-distributed Stochastic Neighbor Embedding (t-SNE) visualization
plot_savename = "t-sne_16headers_windows_linux_perplexity"
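
# The plot_savename above points at a t-SNE perplexity sweep. What follows is
# a minimal sketch of that step, not the original code: the perplexity values
# (5, 30, 50), the choice to embed only the test split, and the
# "<plot_savename><perplexity>.png" output pattern are all assumptions.
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

for perplexity in (5, 30, 50):
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=seed)
    embedded = tsne.fit_transform(x_test)  # embed the smaller split for speed

    plt.figure(figsize=(8, 6))
    for idx, label in enumerate(class_labels):
        # LabelEncoder assigns codes in sorted order, matching np.unique
        mask = y_test == idx
        plt.scatter(embedded[mask, 0], embedded[mask, 1], s=5, label=str(label))
    plt.legend()
    plt.title("t-SNE, perplexity={}".format(perplexity))
    plt.savefig("{}{}.png".format(plot_savename, perplexity), bbox_inches="tight")
    plt.close()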

import glob
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# `utils` (loads .h5 files) and `da` (exposes getbytes()) are project-local
# modules; the import paths below are assumptions
import utils
import da

# dirs, seed and data_len are assumed to be configured earlier
dataframes = []
num_examples = 0
for dir in dirs:
    for fullname in glob.iglob(dir + '*.h5'):
        filename = os.path.basename(fullname)
        df = utils.load_h5(dir, filename)
        dataframes.append(df)
        num_examples = len(df.values)

# create one large dataframe
data = pd.concat(dataframes)
# sample() returns a copy, so the shuffled frame must be assigned back
data = data.sample(frac=1, random_state=seed).reset_index(drop=True)
num_rows = data.shape[0]
columns = data.columns
print(columns)

# step 2: get the features (x) and convert them to a numpy array
x = da.getbytes(data, data_len)

# step 3: get the class labels (y) and encode them as numbers
y = data['label'].values
# encode the class labels
class_labels = np.unique(y)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# step 4: split the data into a training set and a test set
test_percentage = 0.5
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=test_percentage, random_state=seed)

plot_savename = "histogram_payload"
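
# plot_savename suggests a histogram of payload byte values. A minimal sketch
# of that plot, assuming x holds raw byte values in 0..255 and a ".png"
# suffix for the saved figure (both assumptions, not in the original):
import matplotlib.pyplot as plt

byte_values = np.asarray(x).ravel()  # flatten all payload bytes into one array

plt.figure(figsize=(8, 6))
plt.hist(byte_values, bins=256, range=(0, 256))
plt.xlabel("byte value")
plt.ylabel("count")
plt.title("Payload byte value distribution")
plt.savefig(plot_savename + ".png", bbox_inches="tight")
plt.close()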