Example #1
    def test_compute_recall(self):
        file_path = os.path.join(ROOT_DIR, 'datasets/tests/example_keys')
        eval_tool = EvaluationTool(legit=0)
        load_tool = LoadingTool()
        stats = defaultdict(lambda: defaultdict(int))
        trues = pd.Series()
        preds = pd.Series()
        for chunk in load_tool.load_classifications(file_path, ';', True):
            chunk_stats = eval_tool.compute_stats(chunk)
            trues = trues.append(chunk[0])
            preds = preds.append(chunk[1])
            for label in chunk_stats:
                stats[label]['FP'] += chunk_stats[label]['FP']
                stats[label]['FN'] += chunk_stats[label]['FN']
                stats[label]['TP'] += chunk_stats[label]['TP']

        labels = [1, 2]
        rec = [eval_tool.compute_recall(x, stats) for x in labels]
        rec_sklearn = list(
            recall_score(y_true=trues,
                         y_pred=preds,
                         labels=labels,
                         average=None))

        assert rec == rec_sklearn
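
For reference, the values compared against sklearn's recall_score(..., average=None) reduce to TP / (TP + FN) over the accumulated counts. A minimal sketch, assuming compute_recall follows the standard definition (the helper name below is hypothetical):

    def recall_from_counts(label, stats):
        # Standard recall: true positives over all actual positives of the label.
        tp = stats[label]['TP']
        fn = stats[label]['FN']
        return tp / (tp + fn) if (tp + fn) > 0 else float('nan')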
Example #2
    def test_read_keys(self):
        file_path = os.path.join(ROOT_DIR, 'datasets/tests/example_keys')
        load_tool = LoadingTool()
        metadata = pd.DataFrame()
        for chunk in load_tool.load_classifications(file_path, ';', True):
            metadata = metadata.append(chunk[2])

        expected = pd.DataFrame(np.array(
            [[1, 2, 1, 2, 1, 1, 2, 0, 1, 0, 1, 2, 0, 1, 1],
             [1, 2, 1, 2, 1, 1, 2, 0, 1, 0, 1, 2, 0, 1, 1],
             [1, 2, 1, 2, 1, 1, 2, 0, 1, 0, 1, 2, 0, 1, 1]]).transpose(),
                                columns=['timestamp', 'host', 'user'])
        assert np.allclose(expected, metadata)
Example #3
    def test_train_classifier(self):
        tr_path = os.path.join(DATA_DIR, 'test_tr')
        rfc = RFC(n_estimators=100, criterion="entropy", n_jobs=-1)
        sampling_settings = {
            'bin_count': 16,
            'neg_samples': 7,
            'bin_samples': 20,
            'seed': 0,
            'nan_value': -1000000
        }
        loading_tool = LoadingTool(sampling_settings)
        clas_tool = ClassificationTool(rfc)
        tr_data = loading_tool.load_training_data(tr_path)
        tr_data = loading_tool.quantize_data(tr_data)
        clas_tool.train_classifier(tr_data)
        tr_data = None

        assert list(clas_tool.classifier.classes_) == [0, 1, 2, 3]
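
The assertion relies on standard scikit-learn behaviour: after fit, a classifier exposes the sorted unique training labels via classes_. A self-contained toy illustration (the data and parameters below are made up, not taken from the project's datasets):

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier

    X = np.array([[0.0], [1.0], [2.0], [3.0]])
    y = np.array([3, 1, 0, 2])
    clf = RandomForestClassifier(n_estimators=5, random_state=0).fit(X, y)
    # classes_ is sorted regardless of the order the labels appear in y.
    assert list(clf.classes_) == [0, 1, 2, 3]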
Example #4
    def test_compute_recall_unbalanced(self):
        file_path = os.path.join(ROOT_DIR, 'datasets/tests/example_unbalanced')
        eval_tool = EvaluationTool()
        load_tool = LoadingTool()
        stats = defaultdict(lambda: defaultdict(int))
        trues = pd.Series()
        preds = pd.Series()
        for chunk in load_tool.load_classifications(file_path, ';'):
            chunk_stats = eval_tool.compute_stats(chunk)
            trues = trues.append(chunk[0])
            preds = preds.append(chunk[1])
            for label in chunk_stats:
                stats[label]['FP'] += chunk_stats[label]['FP']
                stats[label]['FN'] += chunk_stats[label]['FN']
                stats[label]['TP'] += chunk_stats[label]['TP']

        rec = [eval_tool.compute_recall(x, stats) for x in eval_tool.labels]

        assert np.isnan(rec[3])
Example #5
    def test_compute_relaxed_stats(self):
        file_path = os.path.join(ROOT_DIR, 'datasets/tests/example_relax')
        eval_tool = EvaluationTool(legit=0)
        load_tool = LoadingTool()
        stats = defaultdict(lambda: defaultdict(set))
        trues = pd.Series()
        preds = pd.Series()
        metadata = pd.DataFrame()
        for chunk in load_tool.load_classifications(file_path, ';', True):
            chunk_stats = eval_tool.compute_stats_for_agg('user', chunk, True)
            trues = trues.append(chunk[0])
            preds = preds.append(chunk[1])
            metadata = metadata.append(chunk[2])
            for k, v in chunk_stats.items():
                stats[k]['FP'] = stats[k]['FP'] | v['FP']
                stats[k]['FN'] = stats[k]['FN'] | v['FN']
                stats[k]['TP'] = stats[k]['TP'] | v['TP']
        stats = eval_tool.aggregate_stats(stats)

        expected_stats = {
            0: {
                'TP': 1,
                'FP': 1,
                'FN': 1
            },
            1: {
                'TP': 7,
                'FP': 0,
                'FN': 2
            },
            2: {
                'TP': 1,
                'FP': 0,
                'FN': 0
            },
            3: {
                'TP': 2,
                'FP': 1,
                'FN': 3
            }
        }
        assert stats == expected_stats
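
Unlike the count-based tests, the relaxed variant accumulates sets of aggregation keys (here users), so repeated hits within a chunk collapse into one. A plausible reading of the final step, assuming aggregate_stats reduces each set to its size (the helper below is illustrative, not the library's implementation):

    def aggregate_sets_to_counts(stats):
        # Turn each per-label set of keys into a plain count.
        return {label: {kind: len(keys) for kind, keys in kinds.items()}
                for label, kinds in stats.items()}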
Example #6
    def test_get_avg_rec_nans_true(self):
        file_path = os.path.join(ROOT_DIR, 'datasets/tests/example_unbalanced')
        eval_tool = EvaluationTool()
        load_tool = LoadingTool()
        stats = defaultdict(lambda: defaultdict(int))
        trues = pd.Series()
        preds = pd.Series()
        for chunk in load_tool.load_classifications(file_path, ';'):
            chunk_stats = eval_tool.compute_stats(chunk)
            trues = trues.append(chunk[0])
            preds = preds.append(chunk[1])
            for label in chunk_stats:
                stats[label]['FP'] += chunk_stats[label]['FP']
                stats[label]['FN'] += chunk_stats[label]['FN']
                stats[label]['TP'] += chunk_stats[label]['TP']

        rec = eval_tool.get_avg_recall(stats=stats, nan=True)

        # TODO: Think of a better assert
        assert np.allclose(rec, 0.242857)
Example #7
    def test_save_predictions(self):
        tr_path = os.path.join(DATA_DIR, 'test_tr')
        t_path = os.path.join(DATA_DIR, 'test_t')
        rfc = RFC(n_estimators=100, criterion="entropy", n_jobs=-1)
        sampling_settings = {
            'bin_count': 16,
            'neg_samples': 7,
            'bin_samples': 20,
            'seed': 0,
            'nan_value': -1000000
        }
        loading_tool = LoadingTool(sampling_settings)
        clas_tool = ClassificationTool(rfc)
        tr_data = loading_tool.load_training_data(tr_path)
        tr_data = loading_tool.quantize_data(tr_data)
        clas_tool.train_classifier(tr_data)
        tr_data = None

        output_file = os.path.join(ROOT_DIR, 'outputs/rfc.test')

        for t_data in loading_tool.load_testing_data(t_path):
            t_data = loading_tool.quantize_data(t_data)
            clas_tool.save_predictions(t_data, output_file)

        t_data = None

        assert os.path.isfile(output_file)
        os.remove(output_file)
Example #8
    def test_get_labels_with_prec_above(self):
        file_path = os.path.join(ROOT_DIR, 'datasets/tests/example_keys')
        e_tool = EvaluationTool()
        l_tool = LoadingTool()
        stats = defaultdict(lambda: defaultdict(int))
        trues = pd.Series()
        preds = pd.Series()

        for chunk in l_tool.load_classifications(file_path, ';', True):
            chunk_stats = e_tool.compute_stats(chunk)
            for label in chunk_stats:
                stats[label]['FP'] += chunk_stats[label]['FP']
                stats[label]['FN'] += chunk_stats[label]['FN']
                stats[label]['TP'] += chunk_stats[label]['TP']

        prec = [e_tool.compute_precision(x, stats) for x in e_tool.labels]

        threshold = 0.3
        precs_above_threshold = e_tool.get_labels_with_prec_above_thres(
            threshold, e_tool.labels, stats)
        expected = [0, 1]
        assert expected == precs_above_threshold
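
The per-label precisions in prec are what the selection is based on. A minimal sketch of the expected filtering, assuming labels are kept when their precision exceeds the threshold (whether the comparison is strict is not visible from this test; the helper name is hypothetical):

    def labels_above_precision(threshold, labels, precisions):
        return [label for label, p in zip(labels, precisions) if p > threshold]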
Example #9
    def test_get_stats_counts_one_label(self):
        file_path = os.path.join(ROOT_DIR, 'datasets/tests/example_one_label')
        eval_tool = EvaluationTool()
        load_tool = LoadingTool()
        stats = defaultdict(lambda: defaultdict(set))
        trues = pd.Series()
        preds = pd.Series()
        metadata = pd.DataFrame()
        for chunk in load_tool.load_classifications(file_path, ';', True):
            chunk_stats = eval_tool.compute_stats_for_agg('user', chunk)
            trues = trues.append(chunk[0])
            preds = preds.append(chunk[1])
            metadata = metadata.append(chunk[2])
            for k, v in chunk_stats.items():
                stats[k]['FP'] = stats[k]['FP'] | v['FP']
                stats[k]['FN'] = stats[k]['FN'] | v['FN']
                stats[k]['TP'] = stats[k]['TP'] | v['TP']
        stats = eval_tool.aggregate_stats(stats)

        expected_counts = {'TP': 1, 'FP': 0, 'FN': 0}
        counts = eval_tool.get_stats_counts(1, stats)
        assert expected_counts == counts
Example #10
    def test_get_avg_recall(self):
        file_path = os.path.join(ROOT_DIR, 'datasets/tests/example_strings')
        eval_tool = EvaluationTool()
        load_tool = LoadingTool()
        stats = defaultdict(lambda: defaultdict(int))
        trues = pd.Series()
        preds = pd.Series()
        for chunk in load_tool.load_classifications(file_path, ';'):
            chunk_stats = eval_tool.compute_stats(chunk)
            trues = trues.append(chunk[0])
            preds = preds.append(chunk[1])
            for label in chunk_stats:
                stats[label]['FP'] += chunk_stats[label]['FP']
                stats[label]['FN'] += chunk_stats[label]['FN']
                stats[label]['TP'] += chunk_stats[label]['TP']

        rec = eval_tool.get_avg_recall(stats=stats)
        rec_avg_sklearn = recall_score(y_true=trues,
                                       y_pred=preds,
                                       labels=eval_tool.labels,
                                       average='macro')
        assert np.allclose(rec, rec_avg_sklearn)
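
The value checked against sklearn's average='macro' is an unweighted mean over labels. A minimal sketch, assuming get_avg_recall averages the per-label recalls derived from the accumulated counts (the helper name is hypothetical):

    import numpy as np

    def macro_recall(stats, labels):
        recalls = [stats[l]['TP'] / (stats[l]['TP'] + stats[l]['FN'])
                   for l in labels]
        return np.mean(recalls)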
Example #11
    def test_compute_stats(self):
        file_path = os.path.join(ROOT_DIR, 'datasets/tests/example_keys')
        eval_tool = EvaluationTool(legit=0)
        load_tool = LoadingTool()
        result = defaultdict(lambda: defaultdict(int))
        for chunk in load_tool.load_classifications(file_path, ';', True):
            chunk_stats = eval_tool.compute_stats(chunk)
            for label in chunk_stats:
                result[label]['FP'] += chunk_stats[label]['FP']
                result[label]['FN'] += chunk_stats[label]['FN']
                result[label]['TP'] += chunk_stats[label]['TP']

        expected = defaultdict(lambda: defaultdict(int))
        expected[0]['TP'] = 1
        expected[0]['FP'] = 2
        expected[0]['FN'] = 4
        expected[1]['TP'] = 5
        expected[1]['FP'] = 3
        expected[1]['FN'] = 2
        expected[2]['TP'] = 1
        expected[2]['FP'] = 3
        expected[2]['FN'] = 2

        assert result == expected
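
The expected counts follow the usual multi-class confusion bookkeeping: a correct prediction is a TP for that label, while a wrong one adds an FP for the predicted label and an FN for the true label. A rough stand-alone sketch of that counting (it ignores whatever special handling legit=0 triggers in EvaluationTool):

    from collections import defaultdict

    def count_stats(trues, preds):
        counts = defaultdict(lambda: defaultdict(int))
        for t, p in zip(trues, preds):
            if t == p:
                counts[t]['TP'] += 1
            else:
                counts[p]['FP'] += 1
                counts[t]['FN'] += 1
        return counts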
Example #12
    def test_decision_tree(self):
        tree = DecisionTree(max_features='sqrt',
                            min_samples_split=2,
                            random_state=0)
        sktree = DecisionTreeClassifier(criterion='entropy',
                                        min_samples_split=2,
                                        max_features='sqrt',
                                        random_state=0)
        data = pd.read_csv(os.path.join(ROOT_DIR, 'datasets', 'letter'),
                           header=None)
        X = data[data.columns[1:]]
        y = data[data.columns[0]]
        data = None
        X.rename(columns=lambda x: x - 1, inplace=True)
        y = y.apply(lambda x: ord(x))
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.98,
                                                            random_state=0)
        tree.fit(X_train, y_train)
        sktree.fit(X_train, y_train)

        tree_output_file = os.path.join(ROOT_DIR, 'outputs/tree.test')
        sktree_output_file = os.path.join(ROOT_DIR, 'outputs/sktree.test')
        if os.path.isfile(tree_output_file):
            os.remove(tree_output_file)
        if os.path.isfile(sktree_output_file):
            os.remove(sktree_output_file)

        tree_clas_tool = ClassificationTool(tree)
        sktree_clas_tool = ClassificationTool(sktree)

        tree_clas_tool.save_predictions((X_test, y_test),
                                        tree_output_file,
                                        None,
                                        False,
                                        legit=None)
        sktree_clas_tool.save_predictions((X_test, y_test),
                                          sktree_output_file,
                                          None,
                                          False,
                                          legit=None)

        loading_tool = LoadingTool()
        eval_tool = EvaluationTool()
        stats = defaultdict(lambda: defaultdict(int))
        trues = pd.Series()
        preds = pd.Series()
        for chunk in loading_tool.load_classifications(tree_output_file, ';'):
            chunk_stats = eval_tool.compute_stats(chunk)
            trues = trues.append(chunk[0])
            preds = preds.append(chunk[1])
            for label in chunk_stats:
                stats[label]['FP'] += chunk_stats[label]['FP']
                stats[label]['FN'] += chunk_stats[label]['FN']
                stats[label]['TP'] += chunk_stats[label]['TP']

        prec = eval_tool.get_avg_precision(stats=stats)

        skstats = defaultdict(lambda: defaultdict(int))
        sktrues = pd.Series()
        skpreds = pd.Series()
        for chunk in loading_tool.load_classifications(sktree_output_file,
                                                       ';'):
            chunk_stats = eval_tool.compute_stats(chunk)
            sktrues = sktrues.append(chunk[0])
            skpreds = skpreds.append(chunk[1])
            for label in chunk_stats:
                skstats[label]['FP'] += chunk_stats[label]['FP']
                skstats[label]['FN'] += chunk_stats[label]['FN']
                skstats[label]['TP'] += chunk_stats[label]['TP']

        skprec = eval_tool.get_avg_precision(stats=skstats)

        assert math.isclose(prec, skprec, abs_tol=0.02)