def time_feature_extraction(rep=1):
    """
    Time the feature extraction on the domains of one mixed DGA data set.

    Loads the set 'mixed_dga_grouped_family_50000_59_0.pkl', collects the
    ``domain`` attribute of every entry in the set's ``full`` collection and
    prints the time ``Timer.timeit`` reports for ``rep`` executions of
    ``feature_extraction.extract_all_features`` on those domains.

    :param rep: number of executions handed to ``Timer.timeit``
    :return: None, the measured time is printed
    """
    set_name = 'mixed_dga_grouped_family_50000_59_0.pkl'

    workspace = Workspace(days=1, empty=True)
    workspace.load(set_name, settings.SetTypes.mixed_dga_grouped_family.value)

    domains = []
    for labeled_domain in workspace.data_sets_loaded[set_name].full:
        domains.append(labeled_domain.domain)

    def extract():
        return feature_extraction.extract_all_features(domains)

    elapsed = Timer(extract).timeit(number=rep)
    print(elapsed)
def train_mixed_dga(clf_types=('svm', 'rf'), n_jobs=-1):
    """
    Train classifiers on all loaded mixed DGA data sets in parallel.

    One ``_training(clf_type, data_set)`` job is scheduled for every
    combination of loaded data set and classifier type; data sets form the
    outer loop, classifier types the inner one.

    :param clf_types: iterable of classifier type names to train. The default
        is an immutable tuple so that it cannot be shared and mutated across
        calls.
    :param n_jobs: forwarded to ``Parallel``
    :return: None
    """
    # Materialise once: a one-shot iterator would otherwise be exhausted
    # after the first data set of the nested loop below.
    clf_types = tuple(clf_types)

    w = Workspace(days=1, empty=True)
    w.load_all(settings.SetTypes.mixed_dga_grouped_family.value)

    parallel = Parallel(n_jobs=n_jobs, verbose=1)
    parallel(
        delayed(_training)(clf_type, s)
        for s in w.data_sets_loaded.values()
        for clf_type in clf_types
    )
def start_single_grouped_day_logo(n_jobs=-1, clf_type=None):
    """
    Run ``eval_train_test.logo_cv`` on all loaded single-DGA grouped-day sets.

    :param n_jobs: forwarded to ``eval_train_test.logo_cv``
    :param clf_type: 'svm', 'rf' or a falsy value. A falsy value runs 'svm'
        first and 'rf' afterwards; any other value runs nothing (the data
        sets are still loaded).
    :return: None
    """
    workspace = Workspace(days=1, empty=True)
    workspace.load_all(settings.SetTypes.single_dga_grouped_day.value)

    for name in ('svm', 'rf'):
        # A falsy clf_type selects every classifier, otherwise only the match.
        if clf_type and clf_type != name:
            continue
        eval_train_test.logo_cv(name, workspace.data_sets_loaded.values(), n_jobs=n_jobs)
def start_mix_dga_kfold(repetitions=5, n_jobs=-1, clf_type=None):
    """
    Run ``eval_train_test.kfold_cv`` on all loaded mixed DGA data sets.

    :param repetitions: forwarded to ``eval_train_test.kfold_cv``
    :param n_jobs: forwarded to ``eval_train_test.kfold_cv``
    :param clf_type: 'svm', 'rf' or a falsy value. A falsy value runs
        'svm_mix' first and 'rf_mix' afterwards; any other value runs nothing
        (the data sets are still loaded).
    :return: None
    """
    workspace = Workspace(days=1, empty=True)
    workspace.load_all(settings.SetTypes.mixed_dga_grouped_family.value)

    # (accepted clf_type, name handed to kfold_cv), in execution order.
    mix_names = (('svm', 'svm_mix'), ('rf', 'rf_mix'))
    for base_name, mix_name in mix_names:
        if not clf_type or clf_type == base_name:
            eval_train_test.kfold_cv(
                mix_name,
                workspace.data_sets_loaded.values(),
                repetitions=repetitions,
                n_jobs=n_jobs,
            )
def predict_exact_or_threshold(clf_type=None, nomix=True, threshold=1, exact=1):
    """
    Predict one mixed DGA set with the ensemble and collect the statistics.

    Expands the set 'mixed_dga_grouped_family_50000_59_2.pkl', calls
    ``predict_exactly_one_or_threshold`` of a ``ClassificationEnsemble`` on
    it and records the outcome as a single run of a ``Statistic``.

    :param clf_type: passed to the ensemble as ``only_type``
    :param nomix: passed to the ensemble as ``nomix``
    :param threshold: forwarded to ``predict_exactly_one_or_threshold``
    :param exact: forwarded to ``predict_exactly_one_or_threshold``
    :return: the ``Statistic`` object holding the single run
    """
    set_name = 'mixed_dga_grouped_family_50000_59_2.pkl'

    workspace = Workspace(days=1, empty=True)
    workspace.load_all(settings.SetTypes.mixed_dga_grouped_family.value)

    ensemble = classifiers.ClassificationEnsemble(nomix=nomix, only_type=clf_type, only_mix=False)

    # expand() yields three values; the third one is not needed here.
    domains, labels, _ = workspace.data_sets_loaded[set_name].expand()
    labels, predicted = ensemble.predict_exactly_one_or_threshold(
        domains, labels, threshold=threshold, exact=exact)

    result = Statistic(set_id=set_name, id='threshold_test')
    result.add_run(labels, predicted, domains)
    return result
def train_all_available_dga(clf_type=None):
    """
    Trains all available single DGAs.

    Calls ``_training`` sequentially for every loaded single-DGA data set.
    :param clf_type: 'svm', 'rf', None (None means both, 'svm' first); any
        other value trains nothing
    :return: None
    """
    workspace = Workspace(days=1, empty=True)
    workspace.load_all(settings.SetTypes.single_dga.value)

    if not clf_type:
        selected = ['svm', 'rf']
    elif clf_type in ('rf', 'svm'):
        selected = [clf_type]
    else:
        selected = []

    for current_type in selected:
        for data_set in workspace.data_sets_loaded.values():
            _training(current_type, data_set)
def predict_all_mixed_sets_on_x(n_jobs=8):
    """
    Predict one mixed DGA set with every non-mix classifier of the ensemble.

    Every classifier whose ``dga_type`` does not contain 'mix' is run through
    ``eval_train_test.predict_all_on_x`` on the expanded set
    'mixed_dga_grouped_family_50000_59_2.pkl'. The collected results are
    serialized together with the domains to
    ``settings.ANALYSIS_FOLDER + '/x_vs_all_results.pkl'``.

    :param n_jobs: forwarded to ``Parallel``
    :return: None
    """
    set_name = 'mixed_dga_grouped_family_50000_59_2.pkl'

    workspace = Workspace(days=1, empty=True)
    workspace.load_all(settings.SetTypes.mixed_dga_grouped_family.value)

    candidates = []
    for clf in classifiers.ClassificationEnsemble().clfs:
        if 'mix' in clf.dga_type:
            continue
        candidates.append(clf)

    runner = Parallel(n_jobs=n_jobs, verbose=1)

    data_set = workspace.data_sets_loaded[set_name]
    domains, labels, groups = data_set.expand()

    jobs = (
        delayed(eval_train_test.predict_all_on_x)(clf, data_set, domains, labels, groups)
        for clf in candidates
    )
    # Per the original note, each entry is expected to be a tuple of
    # (clf.clf_type, clf.dga_type, lbls, pred_lbl).
    results = runner(jobs)

    data.serialize_keep_copy(settings.ANALYSIS_FOLDER + '/x_vs_all_results.pkl', (results, domains))
def start_x_trained_y_test(n_jobs=-1, clf_type=None, x='siemens'):
    """
    Evaluate classifiers trained on source ``x`` against the local mixed sets.

    The mix classifiers are loaded from the classifier folder that belongs to
    ``x`` and handed, together with all loaded mixed DGA data sets, to
    ``eval_train_test.trained_clfs_vs_sets``.

    :param n_jobs: forwarded to ``eval_train_test.trained_clfs_vs_sets``
    :param clf_type: 'svm', 'rf' or a falsy value. A falsy value loads the
        ensemble without an ``only_type`` restriction; any other value runs
        nothing (the data sets are still loaded).
    :param x: origin of the trained classifiers, 'rwth' or 'siemens'
    :return: None
    :raises ValueError: if ``x`` is not one of the known origins. Previously
        this surfaced late as an unbound local variable, or not at all.
    """
    clf_folders = {
        'rwth': '/work/ss930620/dga_detection_workspace/clfs/',
        'siemens': '/work/ss930620/dga_detection_workspace_siemens/clfs/',
    }
    # Fail fast, before any data set is loaded, instead of hitting an
    # unbound ``load_clfs_from`` further down.
    if x not in clf_folders:
        raise ValueError(
            "unknown classifier origin x=%r, expected one of %s"
            % (x, sorted(clf_folders)))
    load_clfs_from = clf_folders[x]

    w = Workspace(days=1, empty=True)
    w.load_all(settings.SetTypes.mixed_dga_grouped_family.value)
    test_data_sets = w.data_sets_loaded.values()

    if clf_type and clf_type not in ('rf', 'svm'):
        # Unrecognised classifier types were always a silent no-op.
        return

    ensemble_kwargs = {'only_mix': True, 'clfs_from_path': load_clfs_from}
    if clf_type:
        # Only restrict the ensemble when a specific type was requested.
        ensemble_kwargs['only_type'] = clf_type

    clfs = classifiers.ClassificationEnsemble(**ensemble_kwargs).clfs
    eval_train_test.trained_clfs_vs_sets(clfs, test_data_sets, n_jobs=n_jobs)
def time_training(rep=1):
    """
    Time training and prediction of the SVM and RF mix classifiers.

    Both classifiers are trained on the expanded set
    'mixed_dga_grouped_family_50000_59_0.pkl' and afterwards predict the
    expanded set 'mixed_dga_grouped_family_50000_59_1.pkl'. Each of the four
    measurements is printed with its own label.

    :param rep: number of executions handed to ``Timer.timeit``
    :return: None, the measured times are printed
    """
    workspace = Workspace(days=1, empty=True)

    def load_expanded(set_name):
        # Load one set and return its domains and labels; the third value
        # returned by expand() is not used.
        workspace.load(set_name, settings.SetTypes.mixed_dga_grouped_family.value)
        domains, labels, _ = workspace.data_sets_loaded[set_name].expand()
        return domains, labels

    def report(label, action):
        timer = Timer(action)
        print(label + str(timer.timeit(number=rep)))

    train_domains, train_labels = load_expanded('mixed_dga_grouped_family_50000_59_0.pkl')

    svm = SVMClassifier(dga='mix')
    report('SVM Training: ', lambda: svm.training(train_domains, train_labels))

    rf = RFClassifier(dga='mix')
    report('RF Training: ', lambda: rf.training(train_domains, train_labels))

    test_domains, test_labels = load_expanded('mixed_dga_grouped_family_50000_59_1.pkl')

    report('SVM Classify: ', lambda: svm.predict(test_domains, test_labels))
    report('RF Classify: ', lambda: rf.predict(test_domains, test_labels))