def test_all_chains():
    """
    Run every chaining strategy over its parameter grid and persist the results.

    The strategies evaluated are predict_threshold, predict_duo_threshold,
    predict_duo, predict_exact_or_threshold and predict_threshold restricted
    to mixed classifiers (only_mix=True). The collected results are written
    to settings.ANALYSIS_FOLDER + '/chaining.pkl' via data.serialize_keep_copy.

    :return: dict mapping a key tuple (strategy name followed by the
             parameters used) to the value returned by the predict_* call
    """
    nomix = [True, False]
    clf_type = ['svm', 'rf', None]
    exact = [1, 2, 3, 4]
    threshold = [1, 2, 3, 4, 5, 6]
    only_mix_threshold = [1, 2]

    # NOTE(review): predict_duo_threshold / predict_duo take no clf_type, yet
    # their result keys carry a clf_type slot. It used to be filled by the loop
    # variable leaking out of the 'threshold' loop, i.e. always the last entry
    # of clf_type (None). Bound explicitly here so the keys stay exactly the
    # same without depending on a leaked loop variable. Confirm with consumers
    # of chaining.pkl before dropping the slot.
    duo_clf_type = clf_type[-1]

    results = {}

    for c in clf_type:
        for m in nomix:
            for t in threshold:
                results[('threshold', c, m, t)] = predict_threshold(
                    clf_type=c, nomix=m, threshold=t)

    for m in nomix:
        for t in threshold:
            results[('duo_threshold', duo_clf_type, m, t)] = predict_duo_threshold(
                nomix=m, threshold=t)

    for m in nomix:
        results[('duo', duo_clf_type, m)] = predict_duo(nomix=m)

    for c in clf_type:
        for m in nomix:
            for t in threshold:
                for e in exact:
                    results[('exact_threshold', c, m, t, e)] = predict_exact_or_threshold(
                        nomix=m, clf_type=c, threshold=t, exact=e)

    # The only-mix run uses its own, smaller threshold grid.
    for t in only_mix_threshold:
        results[('onlymix_threshold', t)] = predict_threshold(
            nomix=False, only_mix=True, threshold=t)

    data.serialize_keep_copy(settings.ANALYSIS_FOLDER + '/chaining.pkl', results)
    return results
def _serialize_cv_results(stats_list, persist, where): res_stats_map = defaultdict(list) for s, data_set_id in stats_list: res_stats_map[data_set_id].append(s) if persist: data.serialize_keep_copy(where, res_stats_map, keep_copy=True) return res_stats_map
def serialize(self, where=None, keep_copy=True):
    """
    Write this object to disk through data.serialize_keep_copy.

    :param where: path prefix that self.id is appended to; when falsy,
                  settings.EVAL_FOLDER is used as the prefix instead
    :param keep_copy: forwarded unchanged to data.serialize_keep_copy
    :return: None
    """
    prefix = where if where else settings.EVAL_FOLDER
    data.serialize_keep_copy(prefix + self.id, self, keep_copy=keep_copy)
def serialize(self, to=settings.CLF_FOLDER, id=''):
    """
    Write classifier to disk.

    The file name is built from self.dga, self.clf_type and id, joined by
    underscores and suffixed with '.pkl'.

    :param to: folder where to save
    :param id: extra identifier used as the last part of the file name
    :return: None
    """
    file_name = '/{!s}_{!s}_{!s}.pkl'.format(self.dga, self.clf_type, id)
    target = to + file_name
    data.serialize_keep_copy(target, self)
def test_rf_mix_chains():
    """
    Evaluate predict_threshold on random subsets of the mixed 'rf' classifiers.

    For every subset size from 3 to 18 and every threshold in (1, 2) a fresh
    random sample of that size is drawn from the full classifier list, wrapped
    in a ClassificationEnsemble and handed to predict_threshold. The collected
    results are written to settings.ANALYSIS_FOLDER + '/chaining.pkl'.

    :return: dict mapping ('mix_rf', threshold, subset_size) to the value
             returned by predict_threshold
    """
    ensemble_args = dict(nomix=False, only_type='rf', only_mix=True)
    outcome = {}
    pool = classifiers.ClassificationEnsemble(**ensemble_args).clfs

    for size in range(3, 19):
        for limit in (1, 2):
            # One new sample per (size, threshold) pair, drawn before the
            # ensemble is built.
            subset = random.sample(pool, size)
            ensemble = classifiers.ClassificationEnsemble(clf_list=subset, **ensemble_args)
            outcome[('mix_rf', limit, size)] = predict_threshold(
                nomix=False, only_mix=True, threshold=limit, clfs=ensemble)

    target = settings.ANALYSIS_FOLDER + '/chaining.pkl'
    data.serialize_keep_copy(target, outcome)
    return outcome
def predict_all_mixed_sets_on_x(n_jobs=8):
    """
    Run every non-mixed classifier on one fixed mixed data set, in parallel.

    Loads the mixed_dga_grouped_family sets into a Workspace, picks the set
    'mixed_dga_grouped_family_50000_59_2.pkl', and calls
    eval_train_test.predict_all_on_x once per classifier whose dga_type does
    not contain 'mix'. The tuple (results, domains) is written to
    settings.ANALYSIS_FOLDER + '/x_vs_all_results.pkl'. Nothing is returned.

    :param n_jobs: number of jobs passed to Parallel
    :return: None
    """
    workspace = Workspace(days=1, empty=True)
    workspace.load_all(settings.SetTypes.mixed_dga_grouped_family.value)

    selected = [
        clf for clf in classifiers.ClassificationEnsemble().clfs
        if 'mix' not in clf.dga_type
    ]

    runner = Parallel(n_jobs=n_jobs, verbose=1)

    data_set = workspace.data_sets_loaded['mixed_dga_grouped_family_50000_59_2.pkl']
    domains, labels, groups = data_set.expand()

    jobs = (
        delayed(eval_train_test.predict_all_on_x)(clf, data_set, domains, labels, groups)
        for clf in selected
    )
    # res is list of tuples: clf.clf_type, clf.dga_type, lbls, pred_lbl
    res = runner(jobs)

    target = settings.ANALYSIS_FOLDER + '/x_vs_all_results.pkl'
    data.serialize_keep_copy(target, (res, domains))