Exemplo n.º 1
0
def result_logger_ids18(fingerprint, cm_ids, cm_tuple,fold_index):
    """Persist confusion-matrix plots, evaluation reports and absolute-recall
    tables for the three flow-aggregation flavours (any / majority / all).

    Files are written under ``<fingerprint>/cm``, ``<fingerprint>/eval`` and
    ``<fingerprint>/recall``, suffixed with *fold_index*.
    """
    cm_dir = join(fingerprint, 'cm')
    recall_dir = join(fingerprint, 'recall')
    eval_dir = join(fingerprint, 'eval')
    ensure_dirs([cm_dir, recall_dir, eval_dir])

    _, id_to_label, _ = get_ids18_mappers()
    # Human-readable class names in the same order as the matrix rows.
    cm_labels = np.array([id_to_label[cm_id] for cm_id in cm_ids])

    # The three matrices paired with their aggregation-flavour name.
    flavours = list(zip(('any', 'majority', 'all'), cm_tuple))

    # Raw confusion-matrix plots, then normalized ones (same call order as before).
    for name, cm in flavours:
        plot_confusion_matrix(join(cm_dir, '{}_{}.jpg'.format(name, fold_index)),
                              [], [], cm=cm, classes=cm_labels,
                              id_to_label=id_to_label)
    for name, cm in flavours:
        plot_confusion_matrix(join(cm_dir, '{}_norm_{}.jpg'.format(name, fold_index)),
                              [], [], cm=cm, classes=cm_labels,
                              id_to_label=id_to_label, normalize=True)

    for name, cm in flavours:
        print_evaluation(cm, cm_labels, eval_dir, fold_index, name)

    for name, cm in flavours:
        print_absolute_recall(cm, cm_labels, recall_dir, fold_index, name)
Exemplo n.º 2
0
def get_class_weights(dataroot, p=1):
    """Compute per-class loss weights from ``label_dist.csv`` under *dataroot*.

    Each class id (ordering taken from ``get_ids18_mappers``) gets weight
    ``1 - count/total`` so frequent classes receive smaller weights.  Classes
    absent from the distribution file get count 0 (weight 1).

    Args:
        dataroot: directory containing ``label_dist.csv`` (columns Label, Count).
        p: unused; kept for backward compatibility with existing callers.

    Returns:
        np.ndarray of float weights indexed by class id.
    """
    df = pd.read_csv(join(dataroot, 'label_dist.csv'),
                     names=['Label', 'Count'])
    label_to_id, id_to_label, _ = get_ids18_mappers()
    # Order of weights must match the label-id ordering used on the train data.
    counts = []
    print(id_to_label)
    for i in range(len(id_to_label)):
        label = id_to_label[i]
        if label in df['Label'].values:
            c = df[df['Label'] == label]['Count'].iloc[0]
            counts.append(c)
        else:
            print('not found', label)
            counts.append(0)

    counts = np.array(counts)
    # Hoisted total: the old code recomputed sum(counts) once per class, and
    # carried an unreachable alternative weighting scheme after the return —
    # both removed.
    total = counts.sum()
    return 1 - counts / total
Exemplo n.º 3
0
def classify(dataroot,classifier_name):
    """Train *classifier_name* on folds 1..K-1 under *dataroot* and pickle it.

    Fold 0 is skipped (held out for evaluation).  The fold filename pattern
    depends on the configured balancing technique.

    Raises:
        ValueError: if the balancing technique is not one of the known values
            (previously this surfaced later as a confusing NameError on `regex`).
    """
    K = 5
    balance = get_balancing_technique()
    train_data = []
    # single fold ~29M records; 4 folds ~120M records.
    # If 20M records require ~5% RAM then 120M records require ~30%.
    print("Reading the data...")
    tick = time.time()
    label_to_id, id_to_label, _ = get_ids18_mappers()
    num_train_records = 0
    print("Reading 4 folds ")

    if balance == 'with_loss' or balance == 'no' or balance == 'with_loss_sub':
        regex = 'r_fold_{}.csv'
    elif balance == 'explicit':
        regex = 'bal_fold_{}.csv'
    else:
        # Fail fast: otherwise `regex` is unbound and the loop below crashes
        # with an unrelated NameError.
        raise ValueError('Unknown balancing technique: {}'.format(balance))

    for fold_index in tqdm(range(K)):
        if fold_index == 0:
            continue  # fold 0 is held out
        # 10**6 rows read in ~9min, total is ~29*10**6
        reader = pd.read_csv(join(dataroot, regex.format(fold_index)),
                             chunksize=10**6, usecols=get_cols4ml(),
                             dtype=get_dtype4normalized())
        # remove the extra header row
        for df in tqdm(reader):
            y_str = df.Label.values
            x = df.drop(columns=['Label']).values
            train_data.append((x, encode_label(y_str)))
            num_train_records += df.shape[0]
            print(df.memory_usage(deep=True).sum()*(799902)/(1024*1024*1024))
    tock = time.time()
    print("read data in {:.2f}".format(tock-tick))  # ~24min

    classifier_args, config = get_args(classifier_name, num_class='dummy', class_weight=None)
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))

    fingerprint = pre_fingerprint + config
    logdir = join(fingerprint, 'log')
    ensure_dir(logdir)

    X_train = np.concatenate([fold[0] for fold in train_data], axis=0)
    y_train = np.concatenate([fold[1] for fold in train_data], axis=0)
    classifier_args['runs_dir'] = logdir

    print("Start training")
    tick = time.time()
    clf = get_classifier(classifier_args)
    print("classes")
    print(np.unique(y_train))
    clf.fit(X_train, y_train)
    # Persist the fitted model next to the run logs.
    fn = classifier_args['runs_dir'] + '.pkl'
    pickle.dump(clf, open(fn, 'wb'))
    print("Done training {} flow records in {:.2f} sec".format(y_train.shape[0], time.time()-tick))
Exemplo n.º 4
0
    def __init__(self, csv_file, chunksize=10**4):
        """Set up a chunked, lazy reader over *csv_file*.

        Only the ML-relevant columns (``get_cols4ml``) are loaded, with the
        normalized dtypes from ``get_dtype4normalized``.
        """
        self.csv_file = csv_file
        self.chunksize = chunksize
        self.seen_so_far = 0   # number of flow records seen so far
        self.seen_chunks = 0   # number of chunks consumed so far
        self.iterableReader = pd.read_csv(csv_file,
                                          engine='c',
                                          usecols=get_cols4ml(),
                                          dtype=get_dtype4normalized(),
                                          chunksize=chunksize)
        # Only the label->id direction of the mapping is needed here.
        self.label_to_id = get_ids18_mappers()[0]
Exemplo n.º 5
0
def get_class_weights(dataroot, p=1):
    """Compute per-class loss weights from ``label_dist.csv`` under *dataroot*.

    Each class id (ordering taken from ``get_ids18_mappers``) gets weight
    ``1 - count/total``, so frequent classes receive smaller weights; classes
    missing from the distribution file get count 0 (weight 1).

    Args:
        dataroot: directory containing ``label_dist.csv`` (columns Label, Count).
        p: unused; kept for backward compatibility with existing callers.

    Returns:
        np.ndarray of float weights indexed by class id.
    """
    df = pd.read_csv(join(dataroot, 'label_dist.csv'),
                     names=['Label', 'Count'])
    label_to_id, id_to_label, _ = get_ids18_mappers()
    # Order of weights must match the label-id ordering of the train data.
    counts = []
    for i in range(len(id_to_label)):
        label = id_to_label[i]
        if label in df['Label'].values:
            c = df[df['Label'] == label]['Count'].iloc[0]
            counts.append(c)
        else:
            print('not found', label)
            counts.append(0)

    counts = np.array(counts)
    # Hoisted total: previously sum(counts) was recomputed for every class
    # inside the comprehension (accidental O(n^2)).
    total = counts.sum()
    return 1 - counts / total
def classify(dataroot, classifier_name):
    """Load all K folds, rebuild the classifier config, and time inference."""
    K = 5
    fraction = 1

    #total_records = 6907705; # in fold fraction after removin small classes <K
    fold_root = join(dataroot, 'folds_fraction_{}'.format(fraction))
    print("Reading the data...")
    fold_frames = []
    label_series = []
    for k in range(K):
        frame = pd.read_csv(join(fold_root, 'fold_{}.csv'.format(k)))
        fold_frames.append(frame)
        label_series.append(frame.Label)
    total_df = pd.concat(fold_frames)
    total_label_df = pd.concat(label_series)
    labels = total_label_df.sort_values().unique()
    total_records = total_label_df.shape[0]
    #labels,labels_d = get_labels(total_label_df.unique())
    label_to_id, id_to_label, _ = get_ids18_mappers()
    class_weight = get_class_weights(
        encode_label(total_label_df.values, label_to_id))

    balance = get_balancing_technique()
    # Label and FlowID columns are dropped from X, hence the -2.
    input_dim = fold_frames[0].shape[1] - 2
    gt_num_class = len(label_to_id)
    num_class = len(labels)
    assert gt_num_class == num_class, 'all classess should be observed gt_classes!=observed_classes {}!={}'.format(
        gt_num_class, num_class)

    classifier_args, config = get_args(classifier_name, total_records,
                                       gt_num_class, input_dim, class_weight,
                                       balance)
    pre_fingerprint = join(
        dataroot, 'r_{}_c_{}_k_{}'.format(fraction, classifier_name, str(K)))

    fingerprint = pre_fingerprint + '_mem_constrained' + config
    logdir = join(pre_fingerprint + config, 'log')
    runs_dir = get_runs_dir(logdir)
    classifier_args['runs_dir'] = runs_dir
    clf = get_classifier(classifier_args)
    time_inference(classifier_name, clf, total_df, dataroot)
Exemplo n.º 7
0
def classify(dataroot, classifier_name='cnn'):
    """Train a neural classifier on the (optionally balanced) train/val CSVs.

    The class weights are computed once; the original code called
    ``get_class_weights(dataroot)`` twice and discarded the first result,
    wasting a full read of ``label_dist.csv``.
    """
    balance = get_balancing_technique()
    print('balancing technique ', balance)
    if balance == 'explicit':
        train_csv = join(dataroot, 'bal_train.csv')
        val_csv = join(dataroot, 'bal_fold_1.csv'
                       )  # no need to use bal__fold because it is shuffled
    else:
        train_csv = join(dataroot, 'r_train.csv')
        val_csv = join(dataroot, 'r_fold_1.csv')

    # Count records with `wc -l` (fast for multi-GB CSVs); -1 for the header.
    result_val = subprocess.run(['wc', '-l', val_csv], stdout=subprocess.PIPE)
    result_train = subprocess.run(['wc', '-l', train_csv],
                                  stdout=subprocess.PIPE)
    train_records = int(result_train.stdout.split()[0]) - 1  # for the header
    val_records = int(result_val.stdout.split()[0]) - 1
    print("Number of train and val records ({},{})".format(
        train_records, val_records))

    num_epochs = 40
    label_to_id, id_to_label, _ = get_ids18_mappers()
    class_weight = get_class_weights(dataroot)
    if balance == 'with_loss_inverse':
        class_weight = 1. / class_weight

    num_class = len(label_to_id)  # we assume all the categories are observed

    classifier_args, config = get_args(classifier_name, num_class,
                                       class_weight)
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = pre_fingerprint + config
    logdir = join(fingerprint, 'log')
    ensure_dir(logdir)
    classifier_args['runs_dir'] = logdir
    clf = get_classifier(classifier_args)
    clf.fit(train_csv, val_csv, num_epochs, train_records, val_records)
Exemplo n.º 8
0
def classify(dataroot, classifier_name):
    """Evaluate a pre-trained classifier over K folds, accumulating per-fold
    confusion matrices (any / majority / all flow aggregation) and logging
    both per-fold and averaged results via ``result_logger_ids18``.
    """
    K = 5
    fraction = 1
    label_to_id, id_to_label, _ = get_ids18_mappers()
    #class_weight = get_class_weights(encode_label(total_label_df.values,label_to_id))
    class_weight = None

    balance = get_balancing_technique()
    input_dim = 78  # because we remove Label and FlowID columns from X
    gt_num_class = len(label_to_id)

    classifier_args, config = get_args(classifier_name, gt_num_class,
                                       input_dim, class_weight, balance)
    pre_fingerprint = join(
        dataroot, 'r_{}_c_{}_k_{}'.format(fraction, classifier_name, str(K)))

    fingerprint = pre_fingerprint + '_mem_constrained' + config
    logdir = join(pre_fingerprint + config, 'log')

    # Running totals over all folds.
    cm_any = np.zeros((gt_num_class, gt_num_class), dtype=float)
    cm_majority = np.zeros((gt_num_class, gt_num_class), dtype=float)
    cm_all = np.zeros((gt_num_class, gt_num_class), dtype=float)

    # np.float was removed in NumPy 1.20+; the builtin float is the documented
    # replacement.
    kfold_feature_importance = np.zeros(input_dim, dtype=float)
    for fold_index in range(K):
        print('###################################')
        print("Fold ", fold_index)
        test_df = pd.read_csv(join(dataroot, 'fold_{}.csv'.format(fold_index)))
        runs_dir = join(logdir, 'fold_{}'.format(fold_index))
        # for mem constrained experiemnt II, we need same classifier CSVs_r_1 for all memories
        start = runs_dir.find('CSVs_r_')
        end = runs_dir.find('SR_10')
        CSV_dirname = runs_dir[start:end - 1]
        #runs_dir = runs_dir.replace(CSV_dirname,'CSVs_r_1.0')
        classifier_args['runs_dir'] = runs_dir
        #----------------
        loader = ClassifierLoader()
        clf = loader.load(classifier_args)
        print("Loaded Classifier!")
        if classifier_name == 'forest':
            kfold_feature_importance += clf.feature_importances_

        # Group per-record predictions by flow id and evaluate the three
        # aggregation flavours.
        flowids_test, y_flowid_test, grouped = group_data(test_df)
        y_flowid_test = encode_label(y_flowid_test, label_to_id)
        pred_any, pred_majority, pred_all, duration = predict_fold(
            classifier_name, clf, test_df, y_flowid_test, grouped, dataroot)
        assert pred_any.shape == pred_majority.shape, "any and majority shapes should be same {},{}".format(
            pred_any.shape, pred_majority.shape)

        acc_pred_any = 100 * metrics.balanced_accuracy_score(
            y_flowid_test, pred_any)
        acc_pred_majority = 100 * metrics.balanced_accuracy_score(
            y_flowid_test, pred_majority)
        acc_pred_all = 100 * metrics.balanced_accuracy_score(
            y_flowid_test, pred_all)
        print(
            "Fold Local Balanced accuracy(any,majority,all): ({:.2f},{:.2f},{:.2f})"
            .format(acc_pred_any, acc_pred_majority, acc_pred_all))

        any_cm_i = confusion_matrix(y_flowid_test, pred_any)
        majority_cm_i = confusion_matrix(y_flowid_test, pred_majority)
        all_cm_i = confusion_matrix(y_flowid_test, pred_all)

        result_logger_ids18(fingerprint, y_flowid_test,
                            (any_cm_i, majority_cm_i, all_cm_i), id_to_label,
                            str(fold_index) + '_')

        cm_any += any_cm_i
        cm_majority += majority_cm_i
        cm_all += all_cm_i
    if classifier_name == 'forest':
        print_feature_importance(
            kfold_feature_importance,
            join(dataroot, 'folds_fraction_{}'.format(fraction),
                 'feature_selection.csv'))

    print(dataroot, classifier_name)
    # Averaged (summed) confusion matrices over all folds.
    result_logger_ids18(fingerprint, y_flowid_test,
                        (cm_any, cm_majority, cm_all), id_to_label, 'avg_')
Exemplo n.º 9
0
 def encode_label(self, str_labels):
     """Translate each string label in *str_labels* to its integer id."""
     mapping, _, _ = get_ids18_mappers()
     encoded = []
     for raw in str_labels:
         encoded.append(mapping[raw])
     return encoded
Exemplo n.º 10
0
def get_num_ws_classes():
    """Return how many classes the ids18 label mapping defines."""
    mapping = get_ids18_mappers()[0]
    return len(mapping)