Пример #1
0
def _misclassification_rates(train, test, clf=GOOD[0]):
    '''@return (mis-)classification rates per class in test'''
    # Fit on the training traces after outlier removal.
    cleaned_train = counter.outlier_removal(train)
    (X_train, y_train, _) = counter.to_features_cumul(cleaned_train)
    clf.fit(fit.scale(X_train, clf), y_train)
    # Score the (unfiltered) test traces with the same scaling.
    (X_eval, y_eval, y_eval_domains) = counter.to_features_cumul(test)
    scaled_eval = fit.scale(X_eval, clf)
    predictions = _class_predictions(y_eval, clf.predict(scaled_eval))
    urls = _gen_url_list(y_eval, y_eval_domains)
    return _predict_percentages(predictions, urls)
Пример #2
0
 def test__binarized_fake_vs_fit(self):
     '''Binarized scenario features must equal counter features with binarized labels.'''
     traces = [counter._test(size) for size in [1, 2, 2, 2, 2, 3, 4]]
     fake_traces = {name: traces[:] for name in ('background', 'a', 'b')}
     scen = scenario.Scenario('asdf/2015-12-12--3@7')
     scen.traces = fake_traces
     X_bin, y_bin, _ = scen.binarized().get_features_cumul(current_sites=False)
     X_raw, y_raw, _ = counter.to_features_cumul(fake_traces)
     y_raw = list(mymetrics.binarized(y_raw, transform_to=1))
     self.assertTrue(np.array_equal(y_bin, y_raw),
                     "ya:{}\nyc:{}".format(y_bin, y_raw))
     self.assertTrue(np.array_equal(X_bin, X_raw))
Пример #3
0
def my_grid_helper(trace_dict,
                   outlier_removal=True,
                   cumul=True,
                   folds=config.FOLDS):
    '''@return grid-search on trace_dict result (clf, results)'''
    data = counter.outlier_removal(trace_dict) if outlier_removal else trace_dict
    if not cumul:  # panchenko 1
        (X, y, _) = counter.to_features(data)
        return my_grid(X, y, C=2**17, gamma=2**-19, folds=folds)
    (X, y, _) = counter.to_features_cumul(data)
    return my_grid(X, y, folds=folds)
Пример #4
0
def closed_world(scenarios, def0, cumul=True, with_svm=True, common=False):
    '''cross test on dirs: 1st has training data, rest have test

    :param: scenarios: dict mapping scenario_name to its traces dict

        If scenarios has only one set, it is cross-validated etc.  If
        there are more than one, the first is taken as baseline and
        training, while the others are tested against this.

    :param: cumul triggers CUMUL, else version 1,
    :param: common determines whether to reduce the test data to
            common keys.

    :param: def0: key into scenarios naming the baseline/training scenario.
    :param: with_svm: also evaluate a grid-searched SVC (cached in
            SVC_TTS_MAP per scenario when cumul is True).

    Prints results to stdout; returns None.  NOTE: uses
    dict.iteritems(), so this code targets Python 2.
    '''
    # stats = {k: scenario._mean_std(v, "total_bytes_in") for (k, v) in scenarios.iteritems()}
    # durations = {k: _average_duration(v) for (k,v) in scenarios.iteritems()}

    # no-split, best result of 10-fold tts
    simulated_original(scenarios[def0], def0)

    # training set
    # Train/test split of the baseline scenario's traces.
    (train, test) = _tts(scenarios[def0])
    clfs = GOOD[:]  # copy so appending does not mutate the module-level list
    if with_svm:
        if def0 in SVC_TTS_MAP and cumul:
            # Reuse a previously grid-searched SVC for this scenario.
            logging.info('reused svc: %s for scenario: %s',
                         SVC_TTS_MAP[def0],
                         def0)
            clfs.append(SVC_TTS_MAP[def0])
        else:
            now = time.time()
            # NOTE(review): `cumul` is passed as the SECOND positional
            # argument; if fit.my_grid_helper's signature is
            # (trace_dict, outlier_removal=True, cumul=True, ...) this
            # binds cumul to outlier_removal instead — verify.
            (clf, _, _) = fit.my_grid_helper(
                counter.outlier_removal(train, 2), cumul)
            logging.debug('parameter search took: %s', time.time() - now)
            if cumul:
                # Cache the searched classifier for future calls.
                SVC_TTS_MAP[def0] = clf
                clfs.append(SVC_TTS_MAP[def0])
            else:
                clfs.append(clf)

    # X,y for eval
    # Feature-extract the held-out test split (CUMUL or Panchenko v1).
    if cumul:
        (X, y, _) = counter.to_features_cumul(counter.outlier_removal(test, 1))
    else:
        (X, y, _) = counter.to_features(counter.outlier_removal(test, 1))
    # evaluate accuracy on all of unaddoned
    print('cross-validation on X,y')
    for clf in clfs:
        _verbose_test_11(X, y, clf)

    # vs test sets
    # Train on the baseline scenario, test against every other scenario.
    its_traces0 = scenarios[def0]
    for (scenario_path, its_traces) in scenarios.iteritems():
        if scenario_path == def0:
            continue
        print('\ntrain: {} VS {} (overhead {}%)'.format(
            def0, scenario_path,
            scenario.size_increase(its_traces0, its_traces)))
        # Optionally restrict both sides to their common site keys.
        # NOTE(review): in Python 2, .keys() returns lists, so `!=` here
        # is order-sensitive — presumably set comparison was intended.
        if common and its_traces.keys() != its_traces0.keys():
            # td: refactor code duplication with above (search for keys = ...)
            keys = set(its_traces0.keys())
            keys = keys.intersection(its_traces.keys())
            tmp = {}
            tmp0 = {}
            for key in keys:
                tmp0[key] = its_traces0[key]
                tmp[key] = its_traces[key]
            its_traces0 = tmp0
            its_traces = tmp
        if cumul:
            (X2, y2, _) = counter.to_features_cumul(its_traces)
        else:
            # Panchenko v1 needs both sides padded to a shared max length.
            max_len = _dict_elementwise(
                max,
                counter._find_max_lengths(its_traces0),
                counter._find_max_lengths(its_traces))
            # NOTE(review): this rebinds the X, y used for the
            # cross-validation above and leaks into later loop
            # iterations — confirm this reuse is intended.
            (X, y, _) = counter.to_features(
                counter.outlier_removal(its_traces0, 2), max_len)
            (X2, y2, _) = counter.to_features(
                counter.outlier_removal(its_traces, 1), max_len)
        for clf in clfs:
            now = time.time()
            print('{}: {}'.format(_clf_name(clf), _xtest(X, y, X2, y2, clf)),
                  end='')
            print('({} seconds)'.format(time.time() - now))
Пример #5
0
 def test_tf_cumul_foreground(self):
     '''Foreground-keyed traces must not yield the -1 background label.'''
     _, labels, _ = counter.to_features_cumul({'a': self.c_list})
     self.assertFalse(-1 in labels)
Пример #6
0
 def test_tf_cumul_background(self):
     '''Traces keyed as 'background' must yield the -1 label.'''
     _, labels, _ = counter.to_features_cumul({'background': self.c_list})
     self.assertTrue(-1 in labels, '-1 not in {}'.format(set(labels)))