###############################################################################
### Drop redundant columns, useless columns and unused targets.
### K: the *_number columns are numerical representations of existing columns.
### K: category and subcategory are other labels, not features.
### K: saddr and daddr may specialize the model to a single network.
redundant_columns = ['state_number', 'proto_number', 'flgs_number']
other_targets = ['category', 'subcategory']
misc_columns = ['saddr', 'daddr']

# Announce each group before it is dropped (same messages, same order).
for _label, _group in (
    ('Removing redundant columns:', redundant_columns),
    ('Removing useless targets:', other_targets),
    ('Removing misc columns:', misc_columns),
):
    print(_label, _group)

columns_to_remove = [*redundant_columns, *other_targets, *misc_columns]
df.drop(columns=columns_to_remove, inplace=True)

###############################################################################
### Remove NaN columns (with a lot of NaN values).
# NOTE(review): 0.5 is presumably the NaN-ratio threshold used by the helper;
# confirm against remove_nan_columns.
df, log = remove_nan_columns(df, 0.5, verbose=False)
print(log)

###############################################################################
### Encode categorical features.
print('Encoding categorical features (ordinal encoding).')
my_encoder = OrdinalEncoder()

# (column name, cast to str first?) -- the port columns are converted to
# strings before encoding; the other columns are encoded as they are.
# The single encoder is re-fitted for every column, so after this loop it
# holds the fit for 'state' only.
for _name, _as_text in (
    ('flgs', False),
    ('proto', False),
    ('sport', True),
    ('dport', True),
    ('state', False),
):
    _series = df[_name].astype(str) if _as_text else df[_name]
    # The encoder expects a 2-D input, hence the single-column reshape.
    df[_name] = my_encoder.fit_transform(_series.values.reshape(-1, 1))

# List whatever columns still have the object dtype after encoding.
print('Objects:', df.select_dtypes(include=['object']).columns.tolist())
# The optional first command-line argument selects the state; the default of 0
# is kept when the argument is absent or is not a valid integer.
state = 0
try:
    state = int(sys.argv[1])
except (IndexError, ValueError):
    # IndexError: no argument given; ValueError: argument is not an integer.
    # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
    pass
print("STATE = ", state)
STATES = [0, 10, 100, 1000, 10000]

# Show every row but at most 5 columns when printing DataFrames.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 5)

df = load_dataset()
print("Data Loaded")

# NOTE(review): the return values of both helpers are discarded here, so this
# only has an effect if they modify `df` in place -- confirm against the
# helper implementations.
remove_columns_with_one_value(df, verbose=False)
remove_nan_columns(df, 0.6, verbose=False)

# Making the final DataFrame.
# Drop the first column (the row-number column).
df = df.drop(df.columns[0], axis=1)

# Drop unrelated columns.
df.drop(columns=['ts', 'te', 'sa', 'da'], inplace=True)

# Sample the DataFrame with a fixed seed.
# NOTE(review): frac=1 with replace=True draws a same-sized sample *with*
# replacement, so rows can repeat and others can be left out; if a plain
# shuffle was intended, replace=False would be needed -- confirm the intent.
df = df.sample(frac=1, replace=True, random_state=0)

#################################
## Encoding the data           ##
#################################
# Split the column names into categorical (object dtype) and everything else.
is_object = df.dtypes == 'O'
cat_cols, num_cols = df.columns[is_object], df.columns[~is_object]