def time_feature_extraction(rep=1):
    """Benchmark feature extraction over a fixed mixed-DGA grouped data set.

    Loads the pickled data set into a fresh, empty Workspace, extracts the
    raw domain strings, and times `extract_all_features` over them.

    :param rep: number of repetitions passed to Timer.timeit
    :return: void (prints the measured wall-clock time)
    """
    set_name = 'mixed_dga_grouped_family_50000_59_0.pkl'
    w = Workspace(days=1, empty=True)
    w.load(set_name, settings.SetTypes.mixed_dga_grouped_family.value)
    # reuse set_name instead of repeating the file-name literal (the original
    # duplicated the string, risking silent divergence on rename)
    domains = [ld.domain for ld in w.data_sets_loaded[set_name].full]
    t = Timer(lambda: feature_extraction.extract_all_features(domains))
    print(t.timeit(number=rep))
def training(self, train, labels):
    """Fit the wrapped RF classifier on the given training data.

    :param train: array-like containing domain name strings
    :param labels: array-like containing labels
    :return: void
    """
    log.info('Starting training of RF classifier with training set of cardinality: {!s}'.format(len(train)))
    features = extract_all_features(train)
    self.clf.fit(features, labels)
def predict(self, test, labels=None):
    """Predict labels for the given test samples.

    :param test: array of samples to predict
    :param labels: optional array of true labels, passed through unchanged
    :return: (labels, prediction) when labels are given, else prediction only
    """
    log.info('Starting prediction of {!s} samples'.format(len(test)))
    prediction = self.clf.predict(extract_all_features(test))
    # guard clause instead of if/else: plain prediction when no labels given
    if labels is None:
        return prediction
    return labels, prediction
def predict_all_plain(self, domains, n_jobs=-1):
    """Predicts the label of each domain using all classifiers present.

    :param domains: iterable containing domains as str
    :param n_jobs: number of joblib workers (-1 = all cores)
    :return: dictionary containing results (dga, svm/rf):label, with values
             of duplicate keys across classifiers accumulated via +=
    """
    feature_matrix = extract_all_features(domains)
    parallel = Parallel(n_jobs=n_jobs, verbose=1)
    res = parallel(
        delayed(_predict)(c, feature_matrix) for c in self.clfs
    )
    # Merge the per-classifier result dicts, accumulating values that share
    # a key. BUG FIX: the original tested `keys[0] in res` — membership in
    # the *list of dicts* — which is never true, so the accumulation branch
    # was dead and duplicate keys were silently overwritten by update().
    merged = {}
    for d in res:
        for key, value in d.items():
            if key in merged:
                merged[key] += value
            else:
                merged[key] = value
    return merged
def leave_one_group_out_deprecated(clf, data_set: GroupedDataSet, n_jobs=8):
    """Run leave-one-group-out cross-validation for clf over data_set.

    :param clf: estimator to evaluate; SVC inputs are standardized first
    :param data_set: GroupedDataSet expanding to (domains, labels, groups)
    :param n_jobs: parallel jobs for cross_val_score
    :return: per-fold scores from cross_val_score
    """
    log.info('Starting leave on group out cv.')
    logo = LeaveOneGroupOut()
    domains, labels, groups = data_set.expand()
    log.info('Set dimensions: {!s} x {!s} x {!s}'.format(
        len(domains), len(labels), len(groups)))
    log.info('Starting feature extraction.')
    feature_matrix = extract_all_features(domains)
    if isinstance(clf, SVC):
        # SVMs are scale-sensitive; standardize features before CV
        std_scale = preprocessing.StandardScaler()
        feature_matrix = std_scale.fit_transform(feature_matrix)
    log.info('Feature extraction finished.')
    # FIX: pass groups/cv by keyword — in scikit-learn >= 1.0 the arguments
    # after `y` in cross_val_score are keyword-only, so the original
    # positional `groups` argument raises a TypeError.
    scores = cross_val_score(clf, feature_matrix, labels, groups=groups,
                             cv=logo,
                             scoring=stats_metrics.multi_scorer_gridsearch,
                             n_jobs=n_jobs, verbose=2)
    return scores
def grid_search(clf, param_grid, data_set: DataSet, persist=True, n_splits=5, n_jobs=8):
    """Run a stratified k-fold grid search for clf over param_grid.

    :param clf: estimator to tune; SVC inputs get standardized first
    :param param_grid: parameter ranges — a list of dicts for SVC (one per
        kernel), a single dict otherwise (see the persist branch below)
    :param data_set: DataSet providing (domains, labels) via expand()
    :param persist: if True, pickle the fitted grid and score dump to disk
    :param n_splits: number of StratifiedKFold splits
    :param n_jobs: parallel jobs for GridSearchCV
    :return: tuple (grid, all_scores)
    """
    log.info('Starting automated grid search. With {!s} jobs. Set {!s}'.format(
        n_jobs, data_set.id))
    domains, labels = data_set.expand()
    log.info('Set size is: {!s}. Parameter ranges are: {!s}'.format(
        len(data_set.full), param_grid))
    feature_matrix = extract_all_features(domains)
    if isinstance(clf, SVC):
        # SVMs are scale-sensitive; standardize features before the search
        feature_matrix = preprocessing.StandardScaler().fit_transform(
            feature_matrix)
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True)
    grid = GridSearchCV(
        clf,
        param_grid=param_grid,
        cv=cv,
        n_jobs=n_jobs,
        scoring=stats_metrics.multi_scorer_gridsearch,
        refit=False,
        verbose=2,
        return_train_score=False
    )  # turn refit to True to be able to reuse the best estimator
    grid.fit(feature_matrix, labels)
    log.info('Grid search finished.')
    log.info('Full statistics: \n {!s}'.format(grid.cv_results_))
    log.info('Best parameters choice: {!s} with score: {!s}'.format(
        grid.best_params_, grid.best_score_))
    # NOTE(review): this reads a temp file whose name is built from a *fresh*
    # get_rand_id() call — presumably the custom scorer dumped per-fold scores
    # there, but nothing here ties this id to the one the scorer used.
    # Verify the id actually matches the written file, else this read can hit
    # a non-existent path.
    with open(
            settings.GRID_SEARCH_FOLDER_TMP
            + ''.join(stats_metrics.get_rand_id()), 'rb') as f:
        all_scores = pickle.load(f)
    if persist:
        # Encode the searched parameter names and range sizes into the
        # output file name so runs are distinguishable on disk.
        grid_str = '__'
        if isinstance(clf, SVC):
            # SVC param_grid is a list of dicts (one per kernel)
            for g in param_grid:
                for k in g.keys():
                    grid_str += '{!s}{!s}_'.format(k, len(g[k]))
        else:
            for k in param_grid.keys():
                grid_str += '{!s}{!s}_'.format(k, len(param_grid[k]))
        grid_str += 'params{!s}__'.format(len(param_grid))
        grid_file = GRID_SEARCH_FOLDER + '{!s}.pkl'.format(
            data_set.id + grid_str + NOW_STR)
        # avoid clobbering an existing result file by appending a random id
        if os.path.isfile(grid_file):
            grid_file += COPY_SUFFIX + ''.join(stats_metrics.get_rand_id())
        data_set.serialize(keep_copy=False)
        with open(grid_file, 'wb+') as f:
            pickle.dump(grid, f)
        with open(grid_file + SCORE_SUFFIX, 'wb+') as f:
            pickle.dump(all_scores, f)
    return grid, all_scores