def load_data():
    loaded_dataset = datasets.load_all(os.path.join("datasets"))  # load dataset from csv
    tor = loaded_dataset[loaded_dataset.class1 == "TOR"]
    nontor = loaded_dataset[loaded_dataset.class1 == "NONTOR"]
    print(tor, nontor)
    tor_upsampled = resample(tor,
                             replace=True,               # sample with replacement
                             n_samples=nontor.shape[0],  # to match majority class
                             random_state=42)
    return pd.concat([tor_upsampled, nontor], ignore_index=True)
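# `datasets.load_all` is a project helper that is not shown in this section.
# Below is a minimal sketch of what the loaders above appear to assume it does
# (read every CSV under a folder and concatenate the frames); the function name
# `load_all_sketch` and the exact behaviour are assumptions, not the real code.

import glob
import os

import pandas as pd


def load_all_sketch(folder):
    """Assumed behaviour: read all CSV files in `folder` into one DataFrame."""
    frames = [pd.read_csv(path) for path in glob.glob(os.path.join(folder, "*.csv"))]
    return pd.concat(frames, ignore_index=True)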
def load_data():
    # data = datasets.load_all(os.path.join("datasets"))  # load dataset from csv
    # ddos = data[data.Label != "BENIGN"]
    # benign = data[data.Label == "BENIGN"]
    # ddos['Label'] = "DDoS"
    #
    # subsample = resample(benign,
    #                      replace=True,
    #                      n_samples=ddos.shape[0],
    #                      random_state=42)
    #
    # return pd.concat([ddos, subsample], ignore_index=True)
    return datasets.load_all(os.path.join("datasets"))  # load dataset from csv
def load_data(logger):
    benign = datasets.load_all(os.path.join(
        "datasets", DATASET_NAME_2))    # load benign dataset from csv
    ransomware = datasets.load_all(os.path.join(
        "datasets", DATASET_NAME))      # load ransomware dataset from csv
    logger.info("{} {}".format("benign shape", benign.shape))
    logger.info("{} {}".format("ransomware shape", ransomware.shape))

    benign = datasets.prepare_dataset(benign, shuffle=True)
    ransomware = datasets.prepare_dataset(ransomware, shuffle=True)

    # balance the two classes by truncating both to the same number of rows
    n_elements = min(benign.shape[0], ransomware.shape[0], 150000)
    benign = benign.head(n_elements)
    ransomware = ransomware.head(n_elements)
    logger.info("{} {}".format("benign shape after balancing", benign.shape))
    logger.info("{} {}".format("ransomware shape after balancing", ransomware.shape))

    ransomware["Label"] = DATASET_NAME.upper()
    return pd.concat([benign, ransomware], ignore_index=True)  # union dataset
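# `datasets.prepare_dataset` is another project helper that is not shown here.
# The calls in this section pass `shuffle`, `drop_columns` and `dropna_axis`,
# so a hedged sketch under those assumptions follows; the name
# `prepare_dataset_sketch`, the defaults and the NaN handling are guesses, not
# the actual implementation.

import pandas as pd


def prepare_dataset_sketch(df, drop_columns=None, shuffle=False, dropna_axis=False):
    """Assumed behaviour: drop unwanted columns, clean NaNs, optionally shuffle."""
    if drop_columns:
        df = df.drop(columns=drop_columns, errors="ignore")
    # dropna_axis=True is read here as "drop columns containing NaNs";
    # the real helper may instead drop rows.
    df = df.dropna(axis=1 if dropna_axis else 0)
    if shuffle:
        df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    return df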
def load_data():
    data = datasets.load_all(os.path.join("datasets"))  # load dataset from csv
    asware = data[data.calss == "asware"]
    benign = data[data.calss == "benign"]

    # under-sample both classes (without replacement) to a common size
    n_samples = min(asware.shape[0], benign.shape[0], 100000)
    benign = resample(benign, replace=False, n_samples=n_samples, random_state=42)
    asware = resample(asware, replace=False, n_samples=n_samples, random_state=42)
    return pd.concat([asware, benign], ignore_index=True)
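# The loaders in this section use sklearn.utils.resample in two opposite ways:
# replace=True over-samples the minority class up to the majority size (the TOR
# loader above), while replace=False under-samples both classes down to a common
# size (this loader). A tiny self-contained illustration on made-up toy data:

import pandas as pd
from sklearn.utils import resample

toy = pd.DataFrame({"x": range(10), "label": ["a"] * 8 + ["b"] * 2})
minority = toy[toy.label == "b"]
majority = toy[toy.label == "a"]

upsampled = resample(minority, replace=True, n_samples=majority.shape[0], random_state=42)
downsampled = resample(majority, replace=False, n_samples=minority.shape[0], random_state=42)
print(upsampled.shape, downsampled.shape)  # (8, 2) (2, 2)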
def load_data():
    return datasets.load_all(os.path.join("datasets"))  # load dataset from csv
def calc():
    if not os.path.exists(RESULTS_FOLDER_PATH):
        os.makedirs(RESULTS_FOLDER_PATH)
    logfile = os.path.join(RESULTS_FOLDER_PATH, "log.log")
    if os.path.exists(logfile):
        os.remove(logfile)

    # logging setup: log to the console (grey text) and to a plain-text file
    level = logging.INFO
    formats = {
        "console": '\u001b[37m %(message)s\033[0m',
        "file": '%(message)s'
    }
    file_handler, console_handler = logging.FileHandler(
        logfile, "x"), logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter(formats["console"]))
    file_handler.setFormatter(logging.Formatter(formats["file"]))
    logger = logging.getLogger(__name__)
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    logger.setLevel(level)

    # begin calc
    benign = datasets.load_all(os.path.join(
        "datasets", DATASET_NAME_2))    # load benign dataset from csv
    scareware = datasets.load_all(os.path.join(
        "datasets", DATASET_NAME))      # load scareware dataset from csv
    logger.info("{} {}".format("benign shape", benign.shape))
    logger.info("{} {}".format("scareware shape", scareware.shape))

    benign = datasets.prepare_dataset(benign, shuffle=True)
    scareware = datasets.prepare_dataset(scareware, shuffle=True)

    # balance the two classes by truncating both to the same number of rows
    n_elements = min(benign.shape[0], scareware.shape[0], 150000)
    benign = benign.head(n_elements)
    scareware = scareware.head(n_elements)
    logger.info("{} {}".format("benign shape after balancing", benign.shape))
    logger.info("{} {}".format("scareware shape after balancing", scareware.shape))

    scareware["Label"] = DATASET_NAME.upper()
    loaded_dataset = pd.concat([benign, scareware], ignore_index=True)  # union dataset
    logger.info(loaded_dataset.head())
    loaded_dataset.info()
    benign = None
    scareware = None

    logger.info("{} {}".format("Dataset shape BEFORE preparation", loaded_dataset.shape))
    dataset = datasets.prepare_dataset(loaded_dataset, drop_columns=[
        "Flow Bytes/s",
        "Flow Packets/s",
        "Flow ID",
        "Source IP",
        "Destination IP",
        "Timestamp",
        "Fwd Header Length.1"
    ], shuffle=True, dropna_axis=True)
    loaded_dataset = None
    logger.info("{} {}".format("Dataset shape AFTER preparation", dataset.shape))

    xTest, yTest = datasets.separate_labels(dataset, encode=True)
    dataset = None
    xTest = datasets.drop_variance(xTest)

    roc_auc_scores = []
    roc_fpr_tpr_thres = []

    # Estimators number test
    logger.info("Estimators number test")
    for i in range(4, 30, 4):
        n_estimators = i**2
        logger.info("Training random forest with {} estimators ({})".format(
            n_estimators, i))
        clf = RandomForestClassifier(
            n_estimators=n_estimators, n_jobs=-1,
            random_state=42)  # Random Forest Classifier
        roc, auc_score = random_forest.fit_and_roc(clf, xTest, yTest)
        save_result(roc, auc_score, "estimators", n_estimators,
                    roc_fpr_tpr_thres, roc_auc_scores)

    # Max depth number test
    roc_auc_scores = []
    roc_fpr_tpr_thres = []
    logger.info("Max depth number test")
    for i in range(1, 11):
        max_depth = 2**i
        logger.info("Training random forest with {} max depth ({})".format(
            max_depth, i))
        rnd_forest = RandomForestClassifier(
            n_estimators=144, max_depth=max_depth, n_jobs=-1,
            random_state=42)  # Random Forest Classifier
        roc, auc_score = random_forest.fit_and_roc(rnd_forest, xTest, yTest)
        save_result(roc, auc_score, "max_depth", max_depth,
                    roc_fpr_tpr_thres, roc_auc_scores)

    # Min Sample Leaf number test
    roc_auc_scores = []
    roc_fpr_tpr_thres = []
    logger.info("Min Sample Leaf number test")
    for i in range(1, 11):
        min_sample_leaf = i
        logger.info(
            "Training random forest with {} min sample leaf ({})".format(
                min_sample_leaf, i))
        rnd_forest = RandomForestClassifier(
            n_estimators=144, max_depth=32, min_samples_leaf=min_sample_leaf,
            n_jobs=-1, random_state=42)  # Random Forest Classifier
        roc, auc_score = random_forest.fit_and_roc(rnd_forest, xTest, yTest)
        save_result(roc, auc_score, "min_sample_leaf", min_sample_leaf,
                    roc_fpr_tpr_thres, roc_auc_scores)

    roc_auc_scores, roc_fpr_tpr_thres = [], []
    xTest = None
    yTest = None
    file_handler.close()
    console_handler.close()
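# `random_forest.fit_and_roc` and `save_result` are project helpers that are
# not shown in this section. From the way calc() calls them, fit_and_roc is
# assumed to hold out part of the data, fit the classifier and return the ROC
# curve plus its AUC, while save_result accumulates the curves/scores and
# persists a summary under the results folder. A hedged sketch under those
# assumptions follows; the names, split ratio and file format are illustrative
# only and may differ from the real helpers.

import json
import os

from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split


def fit_and_roc_sketch(clf, X, y, test_size=0.2, random_state=42):
    """Assumed behaviour: train/test split, fit, return ((fpr, tpr, thr), auc)."""
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    clf.fit(X_train, y_train)
    scores = clf.predict_proba(X_test)[:, 1]  # probability of the positive class
    fpr, tpr, thresholds = roc_curve(y_test, scores)
    return (fpr, tpr, thresholds), roc_auc_score(y_test, scores)


def save_result_sketch(roc, auc_score, test_name, value, roc_list, auc_list,
                       folder="results"):
    """Assumed behaviour: append the curve/score and dump a small summary to disk."""
    roc_list.append(roc)
    auc_list.append((value, auc_score))
    with open(os.path.join(folder, "{}_{}.json".format(test_name, value)), "w") as fp:
        json.dump({"test": test_name, "value": value, "auc": auc_score}, fp)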