def _misclassification_rates(train, test, clf=GOOD[0]):
    '''Fit clf on train and summarise its predictions on test.

    :param train: traces dict; passed through counter.outlier_removal
        before CUMUL features are extracted.
    :param test: traces dict; features are extracted without outlier
        removal.
    :param clf: classifier exposing fit() and predict(). NOTE: the default
        is the shared GOOD[0] object, which is (re-)fit in place here.

    @return (mis-)classification rates per class in test, as computed by
        _predict_percentages
    '''
    # training: clean, extract features, scale, fit
    cleaned_train = counter.outlier_removal(train)
    train_features, train_labels, _ = counter.to_features_cumul(cleaned_train)
    clf.fit(fit.scale(train_features, clf), train_labels)

    # testing: third element is only forwarded to _gen_url_list
    test_features, test_labels, test_labels_d = counter.to_features_cumul(test)
    scaled_test = fit.scale(test_features, clf)
    predicted = clf.predict(scaled_test)

    per_class = _class_predictions(test_labels, predicted)
    url_list = _gen_url_list(test_labels, test_labels_d)
    return _predict_percentages(per_class, url_list)
def test__binarized_fake_vs_fit(self):
    # Features/labels obtained through Scenario.binarized() must equal the
    # ones obtained by extracting CUMUL features directly and binarizing
    # the labels afterwards.
    sample = [counter._test(size) for size in [1, 2, 2, 2, 2, 3, 4]]
    # every class gets its own copy of the sample list
    bg_mock = {
        'background': list(sample),
        'a': list(sample),
        'b': list(sample),
    }

    scen = scenario.Scenario('asdf/2015-12-12--3@7')
    scen.traces = bg_mock
    binarized_scenario = scen.binarized()
    X_scen, y_scen, _ = binarized_scenario.get_features_cumul(
        current_sites=False)

    X_direct, y_raw, _ = counter.to_features_cumul(bg_mock)
    y_direct = list(mymetrics.binarized(y_raw, transform_to=1))

    labels_match = np.array_equal(y_scen, y_direct)
    self.assertTrue(labels_match, "ya:{}\nyc:{}".format(y_scen, y_direct))
    features_match = np.array_equal(X_scen, X_direct)
    self.assertTrue(features_match)
def my_grid_helper(trace_dict, outlier_removal=True, cumul=True, folds=config.FOLDS):
    '''Extract features from trace_dict and run my_grid on them.

    :param trace_dict: traces dict to search on
    :param outlier_removal: if true, run counter.outlier_removal first
    :param cumul: if true use CUMUL features with my_grid's defaults,
        else version-1 ("panchenko 1") features with fixed C and gamma
    :param folds: forwarded to my_grid

    @return grid-search on trace_dict result (clf, results), i.e. whatever
        my_grid returns
    '''
    traces = counter.outlier_removal(trace_dict) if outlier_removal else trace_dict

    if not cumul:
        # panchenko 1
        features, labels, _ = counter.to_features(traces)
        return my_grid(features, labels, C=2**17, gamma=2**-19, folds=folds)

    features, labels, _ = counter.to_features_cumul(traces)
    return my_grid(features, labels, folds=folds)
def closed_world(scenarios, def0, cumul=True, with_svm=True, common=False):
    '''cross test on dirs: 1st has training data, rest have test

    Results are printed; nothing is returned.

    :param: scenarios: dict mapping scenario_name to its traces dict

            If scenarios has only one set, it is cross-validated etc.
            If there are more than one, the first is taken as baseline
            and training, while the others are tested against this.
    :param: def0: key in scenarios of the baseline/training scenario
    :param: cumul triggers CUMUL, else version 1,
    :param: with_svm: if true, additionally evaluate a grid-searched
            classifier next to the ones in GOOD. For CUMUL the result is
            cached per scenario in SVC_TTS_MAP and reused on later calls.
    :param: common determines whether to reduce the test data to common keys.
    '''
    # no-split, best result of 10-fold tts
    simulated_original(scenarios[def0], def0)
    # training set
    (train, test) = _tts(scenarios[def0])
    clfs = GOOD[:]
    if with_svm:
        if def0 in SVC_TTS_MAP and cumul:
            logging.info('reused svc: %s for scenario: %s',
                         SVC_TTS_MAP[def0], def0)
            clfs.append(SVC_TTS_MAP[def0])
        else:
            now = time.time()
            # cumul is passed by keyword: positionally it would be bound to
            # the helper's second parameter (the outlier-removal flag), and
            # the search would always run on CUMUL features.
            (clf, _, _) = fit.my_grid_helper(
                counter.outlier_removal(train, 2), cumul=cumul)
            logging.debug('parameter search took: %s', time.time() - now)
            if cumul:
                SVC_TTS_MAP[def0] = clf
                clfs.append(SVC_TTS_MAP[def0])
            else:
                clfs.append(clf)
    # X,y for eval
    if cumul:
        (X, y, _) = counter.to_features_cumul(counter.outlier_removal(test, 1))
    else:
        (X, y, _) = counter.to_features(counter.outlier_removal(test, 1))
    # evaluate accuracy on all of unaddoned
    print('cross-validation on X,y')
    for clf in clfs:
        _verbose_test_11(X, y, clf)
    # vs test sets
    baseline_traces = scenarios[def0]
    for (scenario_path, its_traces) in scenarios.iteritems():
        if scenario_path == def0:
            continue
        # start every comparison from the full baseline; otherwise a key
        # reduction made for one scenario would leak into all later ones
        its_traces0 = baseline_traces
        print('\ntrain: {} VS {} (overhead {}%)'.format(
            def0, scenario_path,
            scenario.size_increase(its_traces0, its_traces)))
        if common and its_traces.keys() != its_traces0.keys():
            # td: refactor code duplication with above (search for keys = ...)
            keys = set(its_traces0.keys()).intersection(its_traces.keys())
            its_traces0 = {key: its_traces0[key] for key in keys}
            its_traces = {key: its_traces[key] for key in keys}
        if cumul:
            # X, y stay the evaluation features computed above
            (X2, y2, _) = counter.to_features_cumul(its_traces)
        else:
            # version 1 needs both sides padded to a shared maximum length
            max_len = _dict_elementwise(
                max,
                counter._find_max_lengths(its_traces0),
                counter._find_max_lengths(its_traces))
            (X, y, _) = counter.to_features(
                counter.outlier_removal(its_traces0, 2), max_len)
            (X2, y2, _) = counter.to_features(
                counter.outlier_removal(its_traces, 1), max_len)
        for clf in clfs:
            now = time.time()
            print('{}: {}'.format(_clf_name(clf), _xtest(X, y, X2, y2, clf)),
                  end='')
            print('({} seconds)'.format(time.time() - now))
def test_tf_cumul_foreground(self):
    # a non-'background' class must not yield the label -1
    traces = {'a': self.c_list}
    _, labels, _ = counter.to_features_cumul(traces)
    has_background_label = -1 in labels
    self.assertFalse(has_background_label)
def test_tf_cumul_background(self):
    # the 'background' class must be labelled -1
    traces = {'background': self.c_list}
    _, labels, _ = counter.to_features_cumul(traces)
    has_background_label = -1 in labels
    failure_msg = '-1 not in {}'.format(set(labels))
    self.assertTrue(has_background_label, failure_msg)