Example #1
class UnivariateSelectChiFPRPrim(primitive):
    def __init__(self, random_state=0):
        super(UnivariateSelectChiFPRPrim, self).__init__(name='UnivariateSelectChiFPR')
        self.id = 27
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = "Filter: Select the pvalues below alpha based on a FPR test with Chi-square. FPR test stands for False Positive Rate test. It controls the total amount of false detections."
        self.hyperparams_run = {'default': True}
        self.selector = None
        self.accept_type = 'd'

    def can_accept(self, data):
        return self.can_accept_d(data, 'Classification')

    def is_needed(self, data):
        if data['X'].shape[1] < 3:
            return False
        return True

    def fit(self, data):
        data = handle_data(data)
        self.selector = SelectFpr(chi2, alpha=0.05)
        self.selector.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        try:
            mask = self.selector.get_support(indices=False)
            final_cols = list(compress(cols, mask))
            output['X'] = pd.DataFrame(self.selector.transform(output['X']), columns=final_cols)
        except Exception as e:
            print(e)
        final_output = {0: output}
        return final_output
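The class above wraps scikit-learn's SelectFpr; here is a minimal standalone sketch of the same chi2 + alpha=0.05 pattern (handle_data and the primitive base class are project-specific and omitted; the toy DataFrame and labels are illustrative assumptions):

# Minimal sketch of the chi2/SelectFpr pattern used by the primitive above;
# the toy DataFrame and labels are made up for illustration.
from itertools import compress

import pandas as pd
from sklearn.feature_selection import SelectFpr, chi2

X = pd.DataFrame({'a': [10, 20, 20], 'b': [20, 20, 30], 'c': [1, 2, 3]})
y = [1, 0, 0]

selector = SelectFpr(chi2, alpha=0.05)   # keep features with p-value < 0.05
selector.fit(X, y)

# Map the boolean support mask back to column names, as produce() does.
mask = selector.get_support(indices=False)
kept = list(compress(X.columns, mask))
X_reduced = pd.DataFrame(selector.transform(X), columns=kept)
print(kept)  # only column 'a' passes the FPR test on this toy data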
Example #2
class f_regressionFPRPrim(primitive):
    def __init__(self, random_state=0):
        super(f_regressionFPRPrim, self).__init__(name='f_regressionFPR')
        self.id = 29
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = "Filter: Select the pvalues below alpha based on a FPR test with F-value between label/feature for regression tasks. FPR test stands for False Positive Rate test. It controls the total amount of false detections."
        self.hyperparams_run = {'default': True}
        self.selector = None
        self.accept_type = 'c_r'

    def can_accept(self, data):
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        if data['X'].shape[1] < 3:
            return False
        return True

    def fit(self, data):
        data = handle_data(data)
        self.selector = SelectFpr(f_regression)
        self.selector.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        mask = self.selector.get_support(indices=False)
        final_cols = list(compress(cols, mask))
        output['X'] = pd.DataFrame(self.selector.transform(output['X']), columns=final_cols)
        final_output = {0: output}
        return final_output
Example #3
def test_boundary_case_ch2():
    # Test boundary case, and always aim to select 1 feature.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])
    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    filter_fdr = SelectFdr(chi2, alpha=0.1)
    filter_fdr.fit(X, y)
    support_fdr = filter_fdr.get_support()
    assert_array_equal(support_fdr, np.array([True, False]))

    filter_kbest = SelectKBest(chi2, k=1)
    filter_kbest.fit(X, y)
    support_kbest = filter_kbest.get_support()
    assert_array_equal(support_kbest, np.array([True, False]))

    filter_percentile = SelectPercentile(chi2, percentile=50)
    filter_percentile.fit(X, y)
    support_percentile = filter_percentile.get_support()
    assert_array_equal(support_percentile, np.array([True, False]))

    filter_fpr = SelectFpr(chi2, alpha=0.1)
    filter_fpr.fit(X, y)
    support_fpr = filter_fpr.get_support()
    assert_array_equal(support_fpr, np.array([True, False]))

    filter_fwe = SelectFwe(chi2, alpha=0.1)
    filter_fwe.fit(X, y)
    support_fwe = filter_fwe.get_support()
    assert_array_equal(support_fwe, np.array([True, False]))
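The asserted score of 4.0 can be reproduced by hand: sklearn's chi2 uses each feature's per-class value sums as observed counts against class-prior expectations. A small verification sketch (scipy assumed available):

import numpy as np
from scipy.stats import chisquare

# Feature 0 has values [10, 20, 20] and y = [1, 0, 0], so the class
# priors are 1/3 and 2/3 and the feature total is 50.
observed = np.array([10.0, 40.0])            # per-class sums of feature 0
expected = np.array([1 / 3, 2 / 3]) * 50.0   # class prior x feature total
stat, p = chisquare(observed, f_exp=expected)
print(stat, p)  # ~4.0 and ~0.0455, matching the assertions above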
Example #4
def evaluate_model(classifier, data_records, class_labels, labels):

    attribute_values = []
    accuracy_values = []

    # Scoring the attributes using F_test and false positive rate
    clf = SelectFpr(f_classif, alpha=0.9)
    clf.fit(data_records, class_labels)
    print(clf.scores_)
    print('\n')

    # Attribute indices ordered by score, best first, so the top-scoring
    # attributes are evaluated first in the loop below
    ranked_attr_indices = sorted(range(len(clf.scores_)),
                                 key=lambda i: clf.scores_[i], reverse=True)

    # Performing a 5-fold cross validation against a varying number of attributes.
    # The attributes are chosen on the basis of their scores
    for idx in range(2, len(ranked_attr_indices)):
        filtered_records = data_records[:, ranked_attr_indices[:idx]]
        for idx2 in ranked_attr_indices[:idx]:
            print(labels[idx2])
        # cross_val_score now lives in sklearn.model_selection
        # (the old sklearn.cross_validation module was removed)
        validation_score = cross_val_score(classifier, filtered_records, class_labels, cv=5)
        accuracy = max(validation_score) * 100
        attribute_values.append(idx)
        accuracy_values.append(accuracy)
        print('Cross validation score - ' + str(idx) + ' attributes :' + str(validation_score) + '\n')

    return (attribute_values, accuracy_values)
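Note that evaluate_model keeps max(validation_score), the best single fold, which is an optimistic estimate; the usual practice is the fold mean. A sketch of the same loop with the current sklearn API and mean scores (the dataset and classifier here are illustrative assumptions):

# Sketch with the modern sklearn API, reporting mean CV accuracy
# instead of the best fold; the data and classifier are made up.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFpr, f_classif
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, n_features=10, random_state=0)

scorer = SelectFpr(f_classif, alpha=0.9).fit(X, y)
ranked = np.argsort(scorer.scores_)[::-1]    # best-scoring columns first

for k in range(2, X.shape[1]):
    scores = cross_val_score(DecisionTreeClassifier(random_state=0),
                             X[:, ranked[:k]], y, cv=5)
    print(k, 'features: mean accuracy', round(scores.mean(), 3))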
Example #6
 def test_select_fpr_int(self):
     model = SelectFpr()
     X = np.array(
         [[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]],
         dtype=np.int64)
     y = np.array([0, 1, 0, 1])
     model.fit(X, y)
     model_onnx = convert_sklearn(
         model, "select fpr",
         [("input", Int64TensorType([None, X.shape[1]]))],
         target_opset=TARGET_OPSET)
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(
         X, model, model_onnx,
         basename="SklearnSelectFpr")
Example #7
 def test_select_fpr_int(self):
     model = SelectFpr()
     X = np.array([[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]])
     y = np.array([0, 1, 0, 1])
     model.fit(X, y)
     model_onnx = convert_sklearn(
         model, 'select fpr', [('input', Int64TensorType([1, X.shape[1]]))])
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(
         X,
         model,
         model_onnx,
         basename="SklearnSelectFpr",
         allow_failure=
         "StrictVersion(onnxruntime.__version__) <= StrictVersion('0.1.4')")
Example #8
def test_select_fpr_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the fpr heuristic
    """
    X, Y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectFpr(f_classif, alpha=0.0001)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode="fpr", param=0.0001).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
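As the assertion shows, SelectFpr(f_classif, alpha=a) and GenericUnivariateSelect(f_classif, mode='fpr', param=a) are interchangeable; the same wrapper also covers the other univariate heuristics. A quick sketch over all five modes (toy data via make_classification):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import GenericUnivariateSelect, f_classif

X, y = make_classification(n_samples=200, n_features=20, n_informative=3,
                           shuffle=False, random_state=0)

# One wrapper, five selection heuristics.
for mode, param in [('percentile', 10), ('k_best', 3),
                    ('fpr', 0.05), ('fdr', 0.05), ('fwe', 0.05)]:
    sel = GenericUnivariateSelect(f_classif, mode=mode, param=param).fit(X, y)
    print(mode, np.flatnonzero(sel.get_support()))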
Example #9
def test_select_fpr_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the fpr heuristic
    """
    X, y = make_classification(n_samples=200,
                               n_features=20,
                               n_informative=3,
                               n_redundant=2,
                               n_repeated=0,
                               n_classes=8,
                               n_clusters_per_class=1,
                               flip_y=0.0,
                               class_sep=10,
                               shuffle=False,
                               random_state=0)

    univariate_filter = SelectFpr(f_classif, alpha=0.0001)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode='fpr',
                                   param=0.0001).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
Example #10
def feature_SelectFpr(x_data, y_data):
    # print(x_data)
    # print(y_data)
    bestfeatures = SelectFpr(f_classif, alpha=0.01)
    fit = bestfeatures.fit(x_data, y_data)
    dfscores = pd.DataFrame(fit.scores_)
    dfcolumns = pd.DataFrame(x_data.columns)
    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Specs', 'Score']  # naming the dataframe columns
    # note: nlargest ranks every feature by score; the alpha-based FPR
    # mask from SelectFpr is not applied here
    top_20_features = featureScores.nlargest(20, 'Score')
    return top_20_features
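A hypothetical call, with a labeled toy DataFrame standing in for x_data/y_data (assumes the imports feature_SelectFpr relies on, pandas as pd plus SelectFpr and f_classif, are in scope):

import pandas as pd
from sklearn.datasets import make_classification

X_arr, y_data = make_classification(n_samples=100, n_features=25,
                                    random_state=0)
x_data = pd.DataFrame(X_arr, columns=[f'f{i}' for i in range(25)])

top_20 = feature_SelectFpr(x_data, y_data)  # 20 highest-scoring features
print(top_20.head())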
Example #11
 def test_select_fpr_float(self):
     model = SelectFpr()
     X = np.array(
         [[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]],
         dtype=np.float32,
     )
     y = np.array([0, 1, 0, 1])
     model.fit(X, y)
     model_onnx = convert_sklearn(
         model, "select fpr", [("input", FloatTensorType([1, X.shape[1]]))])
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(
         X,
         model,
         model_onnx,
         basename="SklearnSelectFpr",
         allow_failure="StrictVersion(onnx.__version__)"
         " < StrictVersion('1.2') or "
         "StrictVersion(onnxruntime.__version__)"
         " <= StrictVersion('0.2.1')",
     )
Example #12
def train_decisiontree_FPR(configurationname,
                           train_data,
                           score_function,
                           undersam=False,
                           oversam=False,
                           export=False):
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data
    dtc = DecisionTreeClassifier(random_state=0)

    print("Feature Selection")
    # selector = SelectFpr(score_function)
    selector = SelectFpr(score_function)
    result = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)

    fitted_ids = [i for i in result.get_support(indices=True)]

    print("Apply Resampling")
    print(Counter(y_train))
    if undersam and not oversam:
        renn = RepeatedEditedNearestNeighbours()
        X_train, y_train = renn.fit_resample(X_train, y_train)
    if oversam and not undersam:
        # feature_indices_array = list(range(len(f_to_id)))
        # smote_nc = SMOTENC(categorical_features=feature_indices_array, random_state=0)
        # X_train, y_train = smote_nc.fit_resample(X_train, y_train)
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    if oversam and undersam:
        smote_enn = SMOTEENN(random_state=0)
        X_train, y_train = smote_enn.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)

    if export:
        print("Exporting decision tree image...")
        export_graphviz(dtc,
                        out_file=DATAP + "/temp/trees/sltree_" +
                        configurationname + ".dot",
                        filled=True)
        transform(fitted_ids)

    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))

    return selector, dtc
Example #13
def test_select_heuristics_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the fpr, fdr or fwe heuristics
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0, noise=10)

    univariate_filter = SelectFpr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    for mode in ["fdr", "fpr", "fwe"]:
        X_r2 = GenericUnivariateSelect(f_regression, mode=mode, param=0.01).fit(X, y).transform(X)
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        assert_array_equal(support[:5], np.ones((5,), dtype=bool))
        assert np.sum(support[5:] == 1) < 3
Example #14
def test_select_fpr_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the fpr heuristic
    """
    X, Y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectFpr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode="fpr", param=0.01).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert (support[:5] == 1).all()
    assert np.sum(support[5:] == 1) < 3
Example #15
def test_select_heuristics_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the fpr, fdr or fwe heuristics
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0, noise=10)

    univariate_filter = SelectFpr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    for mode in ['fdr', 'fpr', 'fwe']:
        X_r2 = GenericUnivariateSelect(
            f_regression, mode=mode, param=0.01).fit(X, y).transform(X)
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        assert_array_equal(support[:5], np.ones((5,), dtype=bool))
        assert np.sum(support[5:] == 1) < 3
Example #16
def test_select_fpr_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the fpr heuristic
    """
    X, y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectFpr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode='fpr',
                                   param=0.01).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert (support[:5] == 1).all()
    assert np.sum(support[5:] == 1) < 3
Example #17
def train_decisiontree_FPR(configurationname, train_data, score_function, undersam=False, oversam=False, export=False):
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data
    dtc = DecisionTreeClassifier(random_state=0)

    print("Feature Selection")
    # selector = SelectFpr(score_function)
    selector = SelectFpr(score_function)
    result = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)

    fitted_ids = [i for i in result.get_support(indices=True)]

    print("Apply Resampling")
    print(Counter(y_train))
    if undersam and not oversam:
        renn = RepeatedEditedNearestNeighbours()
        X_train, y_train = renn.fit_resample(X_train, y_train)
    if oversam and not undersam:
        # feature_indices_array = list(range(len(f_to_id)))
        # smote_nc = SMOTENC(categorical_features=feature_indices_array, random_state=0)
        # X_train, y_train = smote_nc.fit_resample(X_train, y_train)
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    if oversam and undersam:
        smote_enn = SMOTEENN(random_state=0)
        X_train, y_train = smote_enn.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)

    if export:
        export_graphviz(dtc, out_file=DATAP + "/temp/trees/sltree_" + configurationname + ".dot", filled=True)
        transform(fitted_ids)

    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))

    return selector, dtc
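The resampling branches above can be written more compactly with an imbalanced-learn Pipeline, which applies samplers during fit only. A sketch under that assumption (imbalanced-learn installed; the imbalanced toy data is made up):

from collections import Counter

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFpr, f_classif
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=300, weights=[0.9, 0.1], random_state=0)
print(Counter(y))

pipe = Pipeline([
    ('select', SelectFpr(f_classif)),          # FPR-based feature selection
    ('smote', SMOTE(random_state=42)),         # oversamples during fit only
    ('tree', DecisionTreeClassifier(random_state=0)),
])
pipe.fit(X, y)
print('Self Accuracy:', pipe.score(X, y))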
Example #18
# select k-best - f classif
k_best_f = SelectKBest(score_func=f_classif, k='all')
k_best_f = k_best_f.fit(x_train_labeled_robust_scaled, y_train_labeled)
print("K-Best F-Classif Scores:", sorted(k_best_f.scores_, reverse=True))
print()

# select k-best - mutual info classif
k_best_m = SelectKBest(score_func=mutual_info_classif, k='all')
k_best_m = k_best_m.fit(x_train_labeled_robust_scaled, y_train_labeled)
print("K-Best Mutual-Info Scores:", sorted(k_best_m.scores_, reverse=True))
print()

# select fpr - f classif
fpr_f = SelectFpr(score_func=f_classif)
fpr_f = fpr_f.fit(x_train_labeled_robust_scaled, y_train_labeled)
print("Select FPR: F-Classif Scores:", sorted(fpr_f.scores_, reverse=True))
print()

# select fpr - mutual info classif
# note: mutual_info_classif returns scores without p-values, so this
# selector can report scores_ but cannot apply the FPR cut-off
fpr_m = SelectFpr(score_func=mutual_info_classif)
fpr_m = fpr_m.fit(x_train_labeled_robust_scaled, y_train_labeled)
print("Select FPR: Mutual-Info Scores:", sorted(fpr_m.scores_, reverse=True))
print()

# tree feature selection
model = ExtraTreesClassifier()
model.fit(x_train_labeled_robust_scaled, y_train_labeled)
print("Tree Features")
print(sorted(model.feature_importances_, reverse=True))
print()
Example #19
    else:
        print("Loading tfidf model...")
        model_tfidf = TfidfModel.load(FLAGS.tfidfFile)

    print("Converting to tfidf vectors...")
    comments_tfidf = model_tfidf[comments_corpus]
    comments_vecs = np.vstack(
        [sparse2full(c, len(comments_dictionary)) for c in comments_tfidf])

    chi2_features = None
    if doTrain:
        # Find the most discriminative words for any of the labels
        print("Finding discriminative features...")
        labels = np.array(data['any'])
        model_fpr = SelectFpr(chi2, alpha=0.025)
        model_fpr.fit(comments_vecs, labels)
        chi2_features = model_fpr.get_support(indices=True)
        np.save(FLAGS.chi2File, chi2_features)

    else:
        print("Loading discrimitive features data...")
        chi2_features = np.load(FLAGS.chi2File)

    print("Calculating tfidf weighted word2vec vectors...")
    chi2_tfidf_vecs = comments_vecs[:, chi2_features]
    fpr_embeddings = None
    if doTrain:
        print('Fitting FastText embedding model...')
        ft_model = FastText(sentences=docs, size=300, workers=8)
        fpr_embeddings = [
            ft_model.wv[t]
Example #20
from sklearn.feature_selection import SelectFpr
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from data_extractor import TrainingDataExtractor

datasource = TrainingDataExtractor()
features, labels = datasource.all_data()

# p-value thresholds (alpha) for SelectFpr, not feature counts
alpha_thresholds = [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5]

for alpha in alpha_thresholds:
    print('selecting features at alpha = ' + str(alpha))
    estimator = LogisticRegression()
    selector = SelectFpr(alpha=alpha)

    selector.fit(features, labels)

    print('performing 6-fold cross-validation')
    kf = KFold(n_splits=6, shuffle=False)
    roc_scores = []
    for train_indices, test_indices in kf.split(features):
        X_train, X_test = [
            features[train_index] for train_index in train_indices
        ], [features[test_index] for test_index in test_indices]
        y_train, y_test = [
            labels[train_index] for train_index in train_indices
        ], [labels[test_index] for test_index in test_indices]

        test_model = LogisticRegression()

        X_train = selector.transform(X_train)
Example #21
# import all Count and Position features (training and test sets together)
dfCountfeatures = pd.read_csv('data/CountingAndPositionFeatures_TrainAndTestData.csv')
dfTrainRaw = pd.read_csv('data/train.csv')

# get only training data  
TrainQueryIDs = dfTrainRaw["id"]
relevance = dfTrainRaw["relevance"]
dfCountfeatures_TrainSet = dfCountfeatures[dfCountfeatures["id"].isin(TrainQueryIDs)]
# select only the features with non-zero variance
selector = VarianceThreshold()
selector.fit_transform(dfCountfeatures_TrainSet).shape # only one feature with zero variance - shape (74067L, 262L)

# select feature based on p-values from univariate regression with target feature (relevance)
selector2 = SelectFpr(f_regression, alpha=0.01)
selector2.fit(dfCountfeatures_TrainSet.drop("id", axis = 1), relevance)
selector2.get_support(indices=True).size # left 226 features out of 262 with p-value <=1%
# get titles of features which were selected
selectedCountfeatures = dfCountfeatures.columns[selector2.get_support(indices=True)]

# check correlation amongst features
corrReduced = dfCountfeatures_TrainSet[selectedCountfeatures].corr()
corrReduced.iloc[:, :] = np.tril(corrReduced.values, k=-1)
corrReduced = corrReduced.stack()
# get pairs of features which are highly correlated
corrReduced[corrReduced.abs() > 0.8].size  # 578 pairs correlated more than 80% out of 25,425
len(set(corrReduced[corrReduced.abs() > 0.8].index.codes[0]))  # 172 features to be removed due to high correlation with other features
# get feature titles which will be used in training the model after removing highly correlated features
indices = set(corrReduced[corrReduced.abs() > 0.8].index.codes[0])
selectedCountfeatures2 = [i for j, i in enumerate(selectedCountfeatures.tolist()) if j not in indices]
selectedCountfeatures2.append("id")
Example #22
################################################################################
pl.figure(1)
pl.clf()

x_indices = np.arange(x.shape[-1])

################################################################################
# Univariate feature selection
from sklearn.feature_selection import SelectFpr, f_classif
# As a scoring function, we use an F test for classification,
# keeping only the features with a p-value below alpha = 0.1

selector = SelectFpr(f_classif, alpha=0.1)
selector.fit(x, y)
scores = -np.log10(selector.pvalues_)  # pvalues_ is the public attribute
scores /= scores.max()
pl.bar(x_indices-.45, scores, width=.3,
        label=r'Univariate score ($-Log(p_{value})$)',
        color='g')

################################################################################
# Compare to the weights of an SVM
clf = svm.SVC(kernel='linear')
clf.fit(x, y)

svm_weights = (clf.coef_**2).sum(axis=0)
svm_weights /= svm_weights.max()
pl.bar(x_indices-.15, svm_weights, width=.3, label='SVM weight',
        color='r')
Example #23
def find_statistical_saboteurs(
    groups_data, pvalue_threshold=0.1, effect_threshold=0, max_significant_members=10
):
    """Return statistics on possible bad elements in the data.

    Parameters
    ----------
    groups_data
      Result of ``csv_to_groups_data()``.

    pvalue_threshold
      Only failure-associated elements with a p-value below this threshold
      will be included in the final statistics.

    effect_threshold
      Significant members whose fitted effect falls below this value are
      dropped from the final statistics.

    max_significant_members
      Currently unused (see the commented-out truncation below).
    """
    groups_data = deepcopy(groups_data)
    twins, almost_tweens, has_twins = _find_twins(groups_data)
    members_sets = [set(group["members"]) for group in groups_data.values()]
    all_members = set().union(*members_sets)
    conserved_members = members_sets[0].intersection(*members_sets)
    members_with_twins = set().union(*twins.values())
    varying_members = sorted(
        all_members.difference(conserved_members).difference(members_with_twins)
    )

    # Build the data

    def build_data_and_observed(selected_members, by_group=False):
        data = []
        observed = []
        for group_name, group_data in groups_data.items():
            attempts = int(group_data["attempts"])
            failures = int(group_data["failures"])
            vector = [[(mb in group_data["members"]) for mb in selected_members]]
            if by_group:
                data += vector
                observed.append(1.0 * failures / attempts)
            else:
                data += attempts * vector
                observed += (attempts - failures) * [0] + failures * [1]
        return np.array(data), np.array(observed)

    # Ridge regression (gives positive / negative impact)
    data, observed = build_data_and_observed(varying_members)
    regression = linear_model.RidgeCV()
    regression.fit(data, observed)

    # ANOVA analysis (for p-values)
    selector = SelectFpr(f_classif, alpha=pvalue_threshold)
    selector.fit(data, observed)

    # select the most interesting parts
    data_ = zip(selector.pvalues_, regression.coef_, varying_members)
    significant_members = OrderedDict(
        [
            (name, {"pvalue": pvalue, "twins": twins.get(name, [])})
            for pvalue, coef, name in sorted(data_)
            if (pvalue < pvalue_threshold) and (coef > 0)
        ]
    )

    if len(significant_members) == 0:
        return {
            "groups_data": groups_data,
            "conserved_members": conserved_members,
            "varying_members": varying_members,
            "significant_members": significant_members,
        }
    # Ridge regression (significant parts only)
    data, observed = build_data_and_observed(significant_members)
    regression.fit(data, observed)
    zipped = zip(regression.coef_, significant_members.items())
    for coef, (name, data_) in zipped:
        data_["effect"] = coef
    for member in list(significant_members.keys()):
        if significant_members[member]["effect"] < effect_threshold:
            significant_members.pop(member)

    # print (significant_members)
    # significant_members = significant_members[:max_significant_members]

    # Build a classifier to compute an F1 score
    classifier = linear_model.LogisticRegressionCV(penalty="l2")
    classifier.fit(data, observed)
    f1_score = metrics.f1_score(observed, classifier.predict(data))

    # Find constructs which are less explained by the parts:
    data, observed = build_data_and_observed(significant_members, by_group=True)
    regression.fit(data, observed)
    predictions = regression.predict(data)
    zipped = zip(groups_data.values(), observed, predictions)
    intercept = min(0.9, max(0.1, regression.intercept_))
    for group_data, obs, pred in zipped:
        std = binom.std(group_data["attempts"], intercept) / group_data["attempts"]
        group_data["failure_rate"] = obs
        group_data["deviation"] = np.round((obs - pred) / std, decimals=1)

    return {
        "groups_data": groups_data,
        "conserved_members": conserved_members,
        "varying_members": varying_members,
        "significant_members": significant_members,
        "f1_score": f1_score,
    }
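find_statistical_saboteurs uses SelectFpr mostly as a convenient source of per-member ANOVA p-values (it reads selector.pvalues_ rather than the support mask). The same trick in isolation, on made-up membership data:

import numpy as np
from sklearn.feature_selection import SelectFpr, f_classif

rng = np.random.default_rng(0)
data = rng.integers(0, 2, size=(200, 3))    # membership matrix, 3 members
# failures occur mostly when member 0 is present
observed = data[:, 0] & (rng.random(200) < 0.8).astype(int)

selector = SelectFpr(f_classif, alpha=0.1).fit(data, observed)
print(selector.pvalues_)  # member 0 should get by far the smallest p-value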
Example #24
from sklearn.feature_selection import RFE, SelectKBest, chi2, SelectFpr
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

df = pd.read_csv('Train_CV_Data.csv')
X_train = np.asarray(df.loc[:2000000, 'srcPort':'HTTPM4'])
Y_train = np.asarray(df.loc[:2000000, 'malicious'], dtype=np.int32)
print(np.sum(Y_train == 1))

kBest = SelectKBest(chi2, k=12)
kBest.fit(X_train, Y_train)
mask1 = kBest.get_support(indices=True)

fpr = SelectFpr(chi2, alpha=0.0001)
fpr.fit(X_train, Y_train)
mask2 = fpr.get_support(indices=True)

rf = RandomForestClassifier(n_estimators=50)

rfe = RFE(rf, n_features_to_select=12, step=1)
rfe.fit(X_train, Y_train)
mask3 = rfe.get_support(indices=True)

print('K-Best Feat :', mask1)
print('False Positive based :', mask2)
print('RFE based :', mask3)
Example #25
data2 = pdc.objFeatures[tr2_mask][:, featureIds]
data = np.vstack([data1, data2])
labels1 = np.zeros((data1.shape[0],))
labels2 = np.ones((data2.shape[0],))
labels = np.hstack([labels1, labels2])
X1 = data1[:1000]
X2 = data2[-1000:]
X = np.vstack([X1, X2])
Y1 = labels1[:X1.shape[0]]
Y2 = labels2[:X2.shape[0]]
Y = np.hstack([Y1, Y2])

from sklearn.feature_selection import SelectFpr, f_classif

selector = SelectFpr(f_classif, alpha=0.1)
selector.fit(X, Y)
scores = -np.log10(selector.pvalues_)  # pvalues_ is the public attribute
scores /= scores.max()

from sklearn import svm
# Compare to the weights of an SVM
clf = svm.SVC(kernel='linear')
clf.fit(X, Y)
print('SVM accuracy:', clf.score(data, labels))
pred = clf.predict(data)
match = np.sum(pred == labels)
print(match, labels.shape[0])
print(match / float(labels.shape[0]))

svm_weights = (clf.coef_**2).sum(axis=0)
svm_weights /= svm_weights.max()