Example #1
def evaluator(args):
    is_flow_cache_experiment = True
    K = 10
    samplerdir, classifier_name, benign_threshold = args
    print('threshold at ', benign_threshold)

    clf_dir = get_classifier_dir(samplerdir,
                                 classifier_name,
                                 class_weight=None)
    gt_classes_str = pd.read_csv(join(samplerdir, '{}fold_0.csv'.format(K)),
                                 usecols=['Label'])['Label'].unique()
    gt_classes = sorted(encode_label(gt_classes_str))

    C = len(gt_classes)
    cm_any_sum = np.zeros((C, C), dtype=float)
    cm_majority_sum = np.zeros((C, C), dtype=float)
    cm_all_sum = np.zeros((C, C), dtype=float)

    col_names = ['Timestamp'] + get_cols4eval()
    for test_index in range(K):
        runs_dir = join(clf_dir, 'K_{}/log/{}'.format(K, test_index))
        if is_flow_cache_experiment:
            runs_dir = replace_w_unlimited_FC(runs_dir)
        clf = load_classifier(classifier_name, runs_dir)

        test_csv_file = join(samplerdir, '{}fold_{}.csv'.format(K, test_index))
        df = pd.read_csv(test_csv_file,
                         usecols=col_names,
                         dtype=get_dtype4normalized())  #,skiprows=skip_idx)
        df['Day'] = df['Timestamp'].map(lambda x: x[:2]).astype(
            str)  # type string
        df = df.sort_values(by=['Flow ID', 'Day', 'Label'
                                ])  #used when deriving flow level metric

        pred_per_record = predict_proba_per_record(df, clf, benign_threshold)
        flowids, flowlabels_str, grouped = group_data(df)

        y = encode_label(flowlabels_str)
        pred_any, pred_maj, pred_all = evaluate_per_flow(
            grouped, y, pred_per_record)

        any_cm = confusion_matrix(y, pred_any)
        majority_cm = confusion_matrix(y, pred_maj)
        all_cm = confusion_matrix(y, pred_all)

        cm_any_sum += any_cm
        cm_majority_sum += majority_cm
        cm_all_sum += all_cm
        #gt_classes = np.unique(y)
        result_logger_ids18(
            join(clf_dir,
                 'K_{}_benign_threshold_{}'.format(K, benign_threshold)),
            gt_classes, (any_cm, majority_cm, all_cm),
            'fold_{}_'.format(test_index))
    result_logger_ids18(
        join(clf_dir, 'K_{}_benign_threshold_{}'.format(K, benign_threshold)),
        gt_classes, (cm_any_sum, cm_majority_sum, cm_all_sum),
        'fold_avg_')
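The flow-level predictions above combine per-record outputs in three ways. Below is a minimal sketch of the voting rules suggested by the names pred_any, pred_maj and pred_all; the combine_votes helper is hypothetical and is not part of evaluate_per_flow.

import numpy as np

def combine_votes(record_preds, benign_id=0):
    # record_preds: predicted class ids for all records of one flow
    record_preds = np.asarray(record_preds)
    attacks = record_preds[record_preds != benign_id]
    # "any": flag the flow as soon as a single record is predicted malicious
    pred_any = attacks[0] if attacks.size > 0 else benign_id
    # "majority": majority vote over all records of the flow
    values, counts = np.unique(record_preds, return_counts=True)
    pred_maj = values[np.argmax(counts)]
    # "all": flag the flow only if every record is predicted malicious
    pred_all = attacks[0] if attacks.size == record_preds.size else benign_id
    return int(pred_any), int(pred_maj), int(pred_all)

# example: a flow with 5 records, 2 of them predicted as class 3
print(combine_votes([0, 3, 0, 3, 0]))  # -> (3, 0, 0)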
Example #2
def load_folds(dataroot, fold_prefix, K):
    df_list = [
        pd.read_csv(join(dataroot, fold_prefix.format(i)),
                    usecols=get_cols4ml(),
                    dtype=get_dtype4normalized()) for i in range(K)
    ]

    fold_data = [ (df.drop(columns=['Label']).values, encode_label(df.Label.values)) \
    for df in df_list]
    return fold_data
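load_folds returns one (X, y) pair per fold. A minimal sketch of how such a list is typically turned into a single train/test split for cross-validation; the helper name below is illustrative only.

import numpy as np

def train_test_split_from_folds(fold_data, test_index):
    # hold out one fold for testing, concatenate the rest for training
    X_test, y_test = fold_data[test_index]
    train_folds = [fold_data[i] for i in range(len(fold_data)) if i != test_index]
    X_train = np.concatenate([x for x, _ in train_folds], axis=0)
    y_train = np.concatenate([y for _, y in train_folds], axis=0)
    return X_train, y_train, X_test, y_test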
Example #3
    def __init__(self, csv_file):
        self.csv_file = csv_file
        self.num_records = self.get_num_records(csv_file)

        df = pd.read_csv(csv_file,
                         engine='c',
                         usecols=get_cols4ml(),
                         dtype=get_dtype4normalized())
        self.x = torch.FloatTensor(df.drop(columns=['Label']).values)
        self.y = torch.LongTensor(self.encode_label(df.Label.values))
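This __init__ looks like part of a PyTorch Dataset. A minimal sketch of how the remaining methods and a DataLoader might be wired up; the class name, batch size, and dummy tensors are assumptions, not from the original code.

import torch
from torch.utils.data import Dataset, DataLoader

class FlowRecordDataset(Dataset):
    def __init__(self, x, y):
        # x: FloatTensor of features, y: LongTensor of encoded labels
        self.x = x
        self.y = y

    def __len__(self):
        return self.y.shape[0]

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

# usage sketch with dummy tensors
ds = FlowRecordDataset(torch.randn(8, 4), torch.zeros(8, dtype=torch.long))
loader = DataLoader(ds, batch_size=4, shuffle=True)
for xb, yb in loader:
    print(xb.shape, yb.shape)  # torch.Size([4, 4]) torch.Size([4])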
Example #4
def shuffle(fn):
    print(fn)
    df = pd.read_csv(fn, dtype=get_dtype4normalized(), engine='c') # 1min

    df = df.sample(frac=1, random_state = getSeed(), replace=False)# 20 sec

    fn_o = fn.replace('fold','shuffled_fold')
    tick = time.time()
    df.to_csv(fn_o,index=False, chunksize=10**4) # 7min
    print("Wrote in {:.2f}".format(time.time()-tick))
Example #5
    def __init__(self, csv_file, chunksize=10**4):
        self.csv_file = csv_file
        self.chunksize = chunksize
        self.seen_so_far = 0  # number of flow records seen so far
        self.seen_chunks = 0
        self.iterableReader = pd.read_csv(csv_file,
                                          engine='c',
                                          usecols=get_cols4ml(),
                                          dtype=get_dtype4normalized(),
                                          chunksize=chunksize)

        label_to_id, id_to_label, _ = get_ids18_mappers()
        self.label_to_id = label_to_id
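A minimal sketch of how an iterable chunked reader like self.iterableReader is usually consumed, keeping a running count as in __init__; the function name and chunk size are placeholders.

import pandas as pd

def iterate_chunks(csv_file, chunksize=10**4):
    reader = pd.read_csv(csv_file, engine='c', chunksize=chunksize)
    seen_so_far = 0
    for chunk_id, chunk in enumerate(reader):
        seen_so_far += len(chunk)
        # process one chunk at a time without loading the whole file
        yield chunk_id, chunk
    print('records seen:', seen_so_far)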
Example #6
def evaluator(dataroot, classifier_name):
    print('evaluating ', ntpath.basename(dataroot))

    test_csv_file = join(dataroot, 'fold_0.csv')
    result_test = subprocess.run(['wc', '-l', test_csv_file],
                                 stdout=subprocess.PIPE)
    test_records = int(result_test.stdout.split()[0])

    # load Classifier
    class_weight = get_class_weights(
        dataroot)  # required by get_args but does not affect evaluation
    num_class = 14  # number of classes; Label, Flow ID, Timestamp are dropped from X
    classifier_args, config = get_args(classifier_name, num_class,
                                       class_weight)

    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = pre_fingerprint + config
    print('clf fingerprint', ntpath.basename(fingerprint))
    classifier_args['runs_dir'] = join(fingerprint, 'log')
    clf = ClassifierLoader().load(classifier_args)
    # classifier loaded

    # load data
    col_names = get_cols4eval()
    col_names.append('Timestamp')
    df = pd.read_csv(test_csv_file,
                     usecols=col_names,
                     dtype=get_dtype4normalized())
    print("Record distribution:")
    print(df.Label.value_counts())
    df['Day'] = df['Timestamp'].map(lambda x: x[:2]).astype(str)  # type string

    #group data
    df = df.sort_values(by=['Flow ID', 'Label'
                            ])  # replaces ordering task in per_flow_eval
    flowids, flowlabels, grouped = group_data(df)
    y = encode_label(flowlabels)
    print("data is grouped and labels are encoded")
    pred_any, pred_maj, pred_all, _ = evaluate_per_flow(clf, y, grouped, df)

    any_cm = confusion_matrix(y, pred_any)
    maj_cm = confusion_matrix(y, pred_maj)
    all_cm = confusion_matrix(y, pred_all)

    any_acc = metrics.balanced_accuracy_score(y, pred_any)
    maj_acc = metrics.balanced_accuracy_score(y, pred_maj)
    all_acc = metrics.balanced_accuracy_score(y, pred_all)
    print(any_acc, maj_acc, all_acc)

    result_logger_ids18(fingerprint, np.unique(y), (any_cm, maj_cm, all_cm),
                        'test')
Example #7
def evaluator(dataroot, classifier_name):
    K = 10

    print("\nevaling ", dataroot)
    gt_num_class = pd.read_csv(join(dataroot, '{}fold_0.csv'.format(K)),
                               usecols=['Label'])['Label'].nunique()

    # load Classifier
    classifier_args, config = get_args(classifier_name, class_weight=None)
    print("Balancing technique: ", classifier_args['balance'])
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = join(pre_fingerprint + config, 'K_{}'.format(K))
    logdir = join(fingerprint, 'log')

    gt_classes = None
    for test_index in range(K):
        print("*************  Testing holdout ", test_index, '************')
        runs_dir = join(logdir, '{}'.format(test_index))
        print(runs_dir)

        print("with model: ", runs_dir)
        classifier_args['runs_dir'] = runs_dir

        loader = ClassifierLoader()
        clf = loader.load(classifier_args)
        # classifier loaded

        # load data
        col_names = get_cols4eval()
        col_names.append('Timestamp')
        test_csv_file = join(dataroot, '{}fold_{}.csv'.format(K, test_index))
        df = pd.read_csv(test_csv_file,
                         usecols=col_names,
                         dtype=get_dtype4normalized(),
                         nrows=4096)  #,skiprows=skip_idx)
        df['Day'] = df['Timestamp'].map(lambda x: x[:2]).astype(
            str)  # type string
        df = df.sort_values(by=['Flow ID', 'Day', 'Label'])
        # Done

        inference_times = predict(df, clf)
        for key, val in inference_times.items():
            print(key, val)
        break
Example #8
def worker(fn, cnt):
    reader = pd.read_csv(fn,
                         usecols=get_cols4eval(),
                         engine='c',
                         dtype=get_dtype4normalized(),
                         chunksize=10**6)  # 1.5 min

    df = pd.concat([df for df in reader], sort=False)
    print(ntpath.basename(fn), df.Label.value_counts())
    g = df.groupby(['Label'], sort=False)  # 0.00 sec
    new_df = pd.DataFrame(
        g.apply(lambda x: x.sample(cnt, random_state=getSeed(), replace=True).
                reset_index(drop=True)))  # 33 sec
    outfile = fn.replace(foldname_prefix, '{}bal_fold_'.format(K))
    new_df = new_df.sample(frac=1, random_state=getSeed(),
                           replace=False)  # shuffling, 1min
    tick = time.time()
    new_df.to_csv(outfile, chunksize=10**5, index=False)
    print("Written in {:.2f} ".format(time.time() - tick))  # 3.5 mins for SFS
Example #9
def evaluator(dataroot, classifier_name):
    K = 10

    print("\nevaluating ", dataroot)
    gt_num_class = pd.read_csv(join(dataroot, '{}fold_0.csv'.format(K)),
                               usecols=['Label'])['Label'].nunique()

    # load Classifier
    classifier_args, config = get_args(classifier_name, class_weight=None)
    print("Balancing technique: ", classifier_args['balance'])
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = join(pre_fingerprint + config, 'K_{}'.format(K))
    logdir = join(fingerprint, 'log')

    gt_classes = None
    for test_index in range(K):
        print("*************  Testing holdout ", test_index, '************')
        runs_dir = join(logdir, '{}'.format(test_index))
        print(runs_dir)

        print("with model: ", runs_dir)
        classifier_args['runs_dir'] = runs_dir

        loader = ClassifierLoader()
        clf = loader.load(classifier_args)
        # classifier loaded

        # load data
        col_names = get_cols4eval()
        col_names.append('Timestamp')
        test_csv_file = join(dataroot, '{}fold_{}.csv'.format(K, test_index))
        df = pd.read_csv(test_csv_file,
                         usecols=col_names,
                         dtype=get_dtype4normalized(),
                         nrows=10000)  #,skiprows=skip_idx)
        x = df.drop(columns=['Flow ID', 'Label', 'Timestamp']).values
        # Done
        for i in range(10):
            inference_time = inference(clf, x)
            print(inference_time)
        break
Example #10
def fold_worker(fn, b,m,d):
    #Assumptions:
    # 1. Benign records are in the beginning of each fold
    # 2. num of benign records is `b`
    # 3. num of duplicates made in over/under sampling is `d`
    # 4. num of malicious duplicated records  `m`

    # then, using `m+d` benign records ensures we have equal #records against balancing case
    # which means we should remove last `b-(m+d)` benign records from fold
    K = 5  # number of folds
    num_to_exclude = (b - (m + d)) // K

    tick = time.time()    
    df = pd.read_csv(fn, engine='c', dtype=get_dtype4normalized()) # 4~5 min
    print("Read fold in {:.2f} min".format((time.time()-tick)/60))
    N = df.shape[0]
    r_df = df[:N-num_to_exclude]
    
    sh_df = r_df.sample(frac=1,random_state=getSeed(), replace=False)

    outfile = fn.replace(in_fold, out_fold)
    assert fn!=outfile, "outfile is same as input file {}".format(ntpath.basename(fn))
    sh_df.to_csv(outfile,chunksize=10**5, index=False)
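A worked example of the arithmetic described in the comments above, using made-up counts: keeping m + d benign records matches the record count of the balanced case, so (b - (m + d)) // K benign records are dropped from each of the K folds.

# toy numbers, not taken from the dataset
b = 1_000_000   # benign records
m = 200_000     # malicious records
d = 300_000     # duplicates created by over/under sampling
K = 5           # number of folds

num_to_exclude = (b - (m + d)) // K
print(num_to_exclude)  # 100000 benign records removed per fold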
Example #11
def evaluator(dataroot, classifier_name):
    print(ntpath.basename(dataroot))
    test_csv_file = join(dataroot, 'fold_0.csv')

    # load Classifier
    classifier_args, config = get_args(classifier_name,
                                       num_class='dummy',
                                       class_weight=None)
    print("Balancing technique: ", classifier_args['balance'])
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = pre_fingerprint + config
    logdir = join(fingerprint, 'log')

    if 'mem_const_exp' == 'mem_const_exp':  # hard-coded string comparison: always-True toggle
        # for mem constraint exp
        start = logdir.find('CSVs_r')
        end = logdir.find('_m_')
        CSV_dirname = logdir[start:end]
        logdir = logdir.replace(CSV_dirname, 'CSVs_r_1.0')
        # end

    print(logdir)
    classifier_args['runs_dir'] = logdir

    loader = ClassifierLoader()
    clf = loader.load(classifier_args)

    if 'noprint_clf_attr' == 'print_clf_attr' and 'tree' in classifier_name:  # always-False toggle; flip the first string to enable
        print("maximum depth of the tree ", clf.tree_.max_depth)
        import matplotlib.pyplot as plt
        from sklearn.tree import plot_tree
        plt.figure()
        plot_tree(clf, filled=True)
        plt.savefig(join(logdir, 'tree_plot.png'), dpi=1000)
        return
    if 'norf_attr' == 'rf_attr' and 'forest' in classifier_name:  # always-False toggle for forest attribute inspection
        depth = [est.tree_.max_depth for est in clf.estimators_]
        print(depth)
        depth = np.array(depth)
        print("forest depth", depth.mean(), depth.max(), depth.min())
        print("maximum depth of the tree ", clf.base_estimator_.max_depth)
        return
        # note: the block below is unreachable because of the return above
        import matplotlib.pyplot as plt
        from sklearn.tree import plot_tree
        plt.figure()
        plot_tree(clf, filled=True)
        plt.savefig(join(logdir, 'tree_plot.png'), dpi=1000)
        return

    print("Classifier Loaded!")
    # classifier loaded

    # load data
    col_names = get_cols4eval()
    col_names.append('Timestamp')
    df = pd.read_csv(test_csv_file,
                     usecols=col_names,
                     dtype=get_dtype4normalized())  #,skiprows=skip_idx)
    df['Day'] = df['Timestamp'].map(lambda x: x[:2]).astype(str)  # type string
    df = df.sort_values(by=['Flow ID', 'Day', 'Label'])
    print(df.Label.value_counts())

    # Done
    pred_per_record = predict_per_record(df, clf)
    per_record_evaluation(df, pred_per_record)

    tick = time.time()
    flowids, flowlabels_str, grouped = group_data(df)
    print("Grouped in {:.2f} min".format((time.time() - tick) / 60))
    y = encode_label(flowlabels_str)
    print("data is grouped and labels are encoded")

    pred_any, pred_maj, pred_all, y = evaluate_per_flow(
        clf, y, grouped, df, pred_per_record)

    gt_classes = np.unique(y)
    pred_classes = np.unique(pred_any)
    nunique_gt = len(gt_classes)
    nunique_pred = len(pred_classes)

    assert nunique_gt >= nunique_pred, "should not predict non existing class(es), but \n{} < \n{}".format(
        gt_classes, pred_classes)
    any_cm = confusion_matrix(y, pred_any)
    majority_cm = confusion_matrix(y, pred_maj)
    all_cm = confusion_matrix(y, pred_all)

    any_acc = metrics.balanced_accuracy_score(y, pred_any)
    maj_acc = metrics.balanced_accuracy_score(y, pred_maj)
    all_acc = metrics.balanced_accuracy_score(y, pred_all)
    print(any_acc, maj_acc, all_acc)
    result_logger_ids18(fingerprint, gt_classes, (any_cm, majority_cm, all_cm),
                        'test')
Example #12
def classify(dataroot, classifier_name):
    K = 5
    balance = get_balancing_technique()
    train_data = []
    # single fold 29M records
    # 4 folds 120M records
    # if 20M records require 5% RAM
    # then 120M records require 30% memory
    print("Reading the data...")
    tick = time.time()
    label_to_id, id_to_label, _ = get_ids18_mappers()
    num_train_records = 0
    print("Reading 4 folds ")

    if balance == 'with_loss' or balance == 'no' or balance == 'with_loss_sub':
        regex = 'r_fold_{}.csv'
    elif balance == 'explicit':
        regex = 'bal_fold_{}.csv'

    for fold_index in tqdm(range(K)):
        if fold_index == 0:
            continue
        reader = pd.read_csv(join(dataroot, regex.format(fold_index)),
                             chunksize=10**6,
                             usecols=get_cols4ml(),
                             dtype=get_dtype4normalized())  # 10**6 rows read in 9min, total is 29*10**6
        # remove the extra header row
        for df in tqdm(reader):
            y_str = df.Label.values
            x = df.drop(columns=['Label']).values
            train_data.append((x, encode_label(y_str)))
            num_train_records += df.shape[0]
            print(df.memory_usage(deep=True).sum() * (799902) / (1024 * 1024 * 1024))
    tock = time.time()
    print("read data in {:.2f}".format(tock - tick))  # 24min

    classifier_args, config = get_args(classifier_name, num_class='dummy', class_weight=None)
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))

    fingerprint = pre_fingerprint + config
    logdir = join(fingerprint, 'log')
    ensure_dir(logdir)

    X_train = np.concatenate([fold[0] for fold in train_data], axis=0)
    y_train = np.concatenate([fold[1] for fold in train_data], axis=0)
    classifier_args['runs_dir'] = logdir

    print("Start training")
    tick = time.time()
    clf = get_classifier(classifier_args)
    print("classes")
    print(np.unique(y_train))
    clf.fit(X_train, y_train)
    fn = classifier_args['runs_dir'] + '.pkl'
    pickle.dump(clf, open(fn, 'wb'))
    print("Done training {} flow records in {:.2f} sec".format(y_train.shape[0], time.time() - tick))