Example #1
def time_feature_extraction(rep=1):
    w = Workspace(days=1, empty=True)
    set_name = 'mixed_dga_grouped_family_50000_59_0.pkl'
    w.load(set_name, settings.SetTypes.mixed_dga_grouped_family.value)
    domains = [ld.domain for ld in w.data_sets_loaded[set_name].full]

    # Time `rep` full feature-extraction passes over the loaded domains.
    t = Timer(lambda: feature_extraction.extract_all_features(domains))
    print(t.timeit(number=rep))
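
A minimal, self-contained sketch of the same timing pattern; `extract` is a hypothetical stand-in for `feature_extraction.extract_all_features`, since the Workspace plumbing is project-specific:

from timeit import Timer

def extract(items):
    # Hypothetical stand-in for the real feature extractor.
    return [len(s) for s in items]

domains = ['example.com'] * 10000
t = Timer(lambda: extract(domains))  # the lambda closes over the prepared input
print(t.timeit(number=1))            # seconds for one full extraction pass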
Example #2
    def training(self, train, labels):
        """
        Train the classifier on the given data.
        :param train: array-like containing domain name strings
        :param labels: array-like containing labels
        :return: None
        """
        log.info('Starting training of RF classifier with training set of cardinality: {!s}'.format(len(train)))

        feature_matrix = extract_all_features(train)
        self.clf.fit(feature_matrix, labels)
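
For context, a minimal sketch of the underlying fit call, assuming `self.clf` is a scikit-learn RandomForestClassifier; the toy numeric matrix stands in for the output of `extract_all_features`:

import numpy as np
from sklearn.ensemble import RandomForestClassifier

X = np.random.rand(100, 5)             # toy feature matrix, one row per domain
y = np.random.randint(0, 2, size=100)  # toy binary labels (benign/DGA)

clf = RandomForestClassifier(n_estimators=100)
clf.fit(X, y)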
Example #3
    def predict(self, test, labels=None):
        """
        Predict labels for the given test data.
        :param test: array of samples to predict
        :param labels: optional array of true labels
        :return: (true labels, predicted labels) if labels are given,
                 otherwise the predicted labels only
        """

        log.info('Starting prediction of {!s} samples'.format(len(test)))

        feature_matrix = extract_all_features(test)
        prediction = self.clf.predict(feature_matrix)

        if labels is not None:
            return labels, prediction
        return prediction
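
A self-contained sketch of why the method pairs true and predicted labels, using a toy scikit-learn classifier (names here are illustrative, not from the project):

import numpy as np
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=10).fit(
    np.random.rand(100, 5), np.random.randint(0, 2, size=100))

X_test = np.random.rand(20, 5)
y_test = np.random.randint(0, 2, size=20)
y_pred = clf.predict(X_test)

# With true labels at hand, the (true, predicted) pair feeds
# directly into metrics such as accuracy.
print(np.mean(y_test == y_pred))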
Example #4
    def predict_all_plain(self, domains, n_jobs=-1):
        """
        Predicts the labels of the given domains using all classifiers present.
        :param domains: iterable containing domains as str
        :return: dictionary mapping (dga, svm/rf) keys to predicted labels
        """
        feature_matrix = extract_all_features(domains)

        parallel = Parallel(n_jobs=n_jobs, verbose=1)

        res = parallel(
            delayed(_predict)(c, feature_matrix)
            for c in self.clfs
        )
        # Merge the per-classifier result dicts, accumulating values
        # that share a key.
        merged = {}
        for d in res:
            for k, v in d.items():
                if k in merged:
                    merged[k] += v
                else:
                    merged[k] = v
        return merged
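
The joblib fan-out/merge pattern in isolation, with a hypothetical toy task `work` in place of `_predict`:

from joblib import Parallel, delayed

def work(i):
    # Toy per-task result: a single-key dict, like _predict above.
    return {i % 2: [i]}

res = Parallel(n_jobs=2)(delayed(work)(i) for i in range(6))

merged = {}
for d in res:
    for k, v in d.items():
        merged[k] = merged.get(k, []) + v
print(merged)   # {0: [0, 2, 4], 1: [1, 3, 5]}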
Example #5
def leave_one_group_out_deprecated(clf, data_set: GroupedDataSet, n_jobs=8):
    log.info('Starting leave-one-group-out cv.')
    logo = LeaveOneGroupOut()
    domains, labels, groups = data_set.expand()
    log.info('Set dimensions: {!s} x {!s} x {!s}'.format(
        len(domains), len(labels), len(groups)))
    log.info('Starting feature extraction.')
    feature_matrix = extract_all_features(domains)
    if isinstance(clf, SVC):
        # SVMs are sensitive to feature scale, so standardize first.
        std_scale = preprocessing.StandardScaler()
        feature_matrix = std_scale.fit_transform(feature_matrix)

    log.info('Feature extraction finished.')

    scores = cross_val_score(clf,
                             feature_matrix,
                             labels,
                             groups=groups,
                             cv=logo,
                             scoring=stats_metrics.multi_scorer_gridsearch,
                             n_jobs=n_jobs,
                             verbose=2)
    return scores
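
A self-contained sketch of leave-one-group-out scoring on toy data; the groups stand in for DGA families:

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score

X = np.random.rand(60, 4)
y = np.random.randint(0, 2, size=60)
groups = np.repeat(np.arange(6), 10)   # six "families", ten samples each

scores = cross_val_score(RandomForestClassifier(n_estimators=10),
                         X, y, groups=groups, cv=LeaveOneGroupOut())
print(scores)   # one score per held-out group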
Example #6
def grid_search(clf,
                param_grid,
                data_set: DataSet,
                persist=True,
                n_splits=5,
                n_jobs=8):

    log.info('Starting automated grid search with {!s} jobs. Set: {!s}'.format(
        n_jobs, data_set.id))
    domains, labels = data_set.expand()

    log.info('Set size is: {!s}. Parameter ranges are: {!s}'.format(
        len(data_set.full), param_grid))

    feature_matrix = extract_all_features(domains)
    if isinstance(clf, SVC):
        feature_matrix = preprocessing.StandardScaler().fit_transform(
            feature_matrix)

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True)

    grid = GridSearchCV(
        clf,
        param_grid=param_grid,
        cv=cv,
        n_jobs=n_jobs,
        scoring=stats_metrics.multi_scorer_gridsearch,
        refit=False,
        verbose=2,
        return_train_score=False
    )  # set refit=True to be able to reuse the best estimator
    grid.fit(feature_matrix, labels)

    log.info('Grid search finished.')
    log.info('Full statistics: \n {!s}'.format(grid.cv_results_))
    log.info('Best parameters choice: {!s} with score: {!s}'.format(
        grid.best_params_, grid.best_score_))

    # Load the detailed scores persisted to a temp file during scoring.
    with open(
            settings.GRID_SEARCH_FOLDER_TMP +
            ''.join(stats_metrics.get_rand_id()), 'rb') as f:
        all_scores = pickle.load(f)

    if persist:
        # Encode the searched parameter names and range sizes into the file name.
        grid_str = '__'

        if isinstance(clf, SVC):
            for g in param_grid:
                for k in g.keys():
                    grid_str += '{!s}{!s}_'.format(k, len(g[k]))
        else:
            for k in param_grid.keys():
                grid_str += '{!s}{!s}_'.format(k, len(param_grid[k]))

        grid_str += 'params{!s}__'.format(len(param_grid))
        grid_file = GRID_SEARCH_FOLDER + '{!s}.pkl'.format(data_set.id +
                                                           grid_str + NOW_STR)
        if os.path.isfile(grid_file):
            grid_file += COPY_SUFFIX + ''.join(stats_metrics.get_rand_id())
        data_set.serialize(keep_copy=False)
        with open(grid_file, 'wb+') as f:
            pickle.dump(grid, f)
        with open(grid_file + SCORE_SUFFIX, 'wb+') as f:
            pickle.dump(all_scores, f)

    return grid, all_scores
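
A minimal grid-search sketch on toy data, with refit=True so the best estimator is reusable; the custom scorer and persistence logic above are project-specific and omitted:

import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVC

X = preprocessing.StandardScaler().fit_transform(np.random.rand(80, 4))
y = np.random.randint(0, 2, size=80)

grid = GridSearchCV(SVC(),
                    param_grid={'C': [0.1, 1, 10], 'kernel': ['rbf']},
                    cv=StratifiedKFold(n_splits=5, shuffle=True),
                    refit=True)
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)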