class UnivariateSelectChiFPRPrim(primitive):
    def __init__(self, random_state=0):
        super(UnivariateSelectChiFPRPrim, self).__init__(name='UnivariateSelectChiFPR')
        self.id = 27
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = "Filter: select features whose p-values fall below alpha, based on an FPR (False Positive Rate) test with chi-square. The FPR test controls the total number of false detections."
        self.hyperparams_run = {'default': True}
        self.selector = None
        self.accept_type = 'd'

    def can_accept(self, data):
        return self.can_accept_d(data, 'Classification')

    def is_needed(self, data):
        # Feature selection is pointless with fewer than 3 features.
        if data['X'].shape[1] < 3:
            return False
        return True

    def fit(self, data):
        data = handle_data(data)
        self.selector = SelectFpr(chi2, alpha=0.05)
        self.selector.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        try:
            # Keep only the column names that survive the selection mask.
            mask = self.selector.get_support(indices=False)
            final_cols = list(compress(cols, mask))
            output['X'] = pd.DataFrame(self.selector.transform(output['X']), columns=final_cols)
        except Exception as e:
            print(e)
        final_output = {0: output}
        return final_output
class f_regressionFPRPrim(primitive):
    def __init__(self, random_state=0):
        super(f_regressionFPRPrim, self).__init__(name='f_regressionFPR')
        self.id = 29
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = "Filter: select features whose p-values fall below alpha, based on an FPR (False Positive Rate) test using the F-value between label and feature for regression tasks. The FPR test controls the total number of false detections."
        self.hyperparams_run = {'default': True}
        self.selector = None
        self.accept_type = 'c_r'

    def can_accept(self, data):
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        # Feature selection is pointless with fewer than 3 features.
        if data['X'].shape[1] < 3:
            return False
        return True

    def fit(self, data):
        data = handle_data(data)
        self.selector = SelectFpr(f_regression)
        self.selector.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        mask = self.selector.get_support(indices=False)
        final_cols = list(compress(cols, mask))
        output['X'] = pd.DataFrame(self.selector.transform(output['X']), columns=final_cols)
        final_output = {0: output}
        return final_output
def test_boundary_case_ch2():
    # Test boundary case, and always aim to select 1 feature.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])

    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    filter_fdr = SelectFdr(chi2, alpha=0.1)
    filter_fdr.fit(X, y)
    support_fdr = filter_fdr.get_support()
    assert_array_equal(support_fdr, np.array([True, False]))

    filter_kbest = SelectKBest(chi2, k=1)
    filter_kbest.fit(X, y)
    support_kbest = filter_kbest.get_support()
    assert_array_equal(support_kbest, np.array([True, False]))

    filter_percentile = SelectPercentile(chi2, percentile=50)
    filter_percentile.fit(X, y)
    support_percentile = filter_percentile.get_support()
    assert_array_equal(support_percentile, np.array([True, False]))

    filter_fpr = SelectFpr(chi2, alpha=0.1)
    filter_fpr.fit(X, y)
    support_fpr = filter_fpr.get_support()
    assert_array_equal(support_fpr, np.array([True, False]))

    filter_fwe = SelectFwe(chi2, alpha=0.1)
    filter_fwe.fit(X, y)
    support_fwe = filter_fwe.get_support()
    assert_array_equal(support_fwe, np.array([True, False]))
def evaluate_model(classifier, data_records, class_labels, labels):
    attribute_values = []
    accuracy_values = []

    # Score the attributes using an F-test with false-positive-rate control.
    clf = SelectFpr(f_classif, alpha=0.9)
    clf.fit(data_records, class_labels)
    print(clf.scores_)
    print('\n')
    ranked_attr_indices = [0] * len(clf.scores_)
    for i, x in enumerate(sorted(range(len(clf.scores_)), key=lambda y: clf.scores_[y])):
        ranked_attr_indices[x] = i

    # Perform a 5-fold cross-validation against a varying number of
    # attributes, chosen on the basis of their scores.
    for idx in range(2, len(ranked_attr_indices)):
        filtered_records = data_records[:, ranked_attr_indices[:idx]]
        for idx2 in ranked_attr_indices[:idx]:
            print(labels[idx2])
        # cross_val_score lives in sklearn.model_selection; the old
        # sklearn.cross_validation module has been removed.
        validation_score = cross_val_score(classifier, filtered_records, class_labels, cv=5)
        accuracy = max(validation_score) * 100
        attribute_values.append(idx)
        accuracy_values.append(accuracy)
        print('Cross validation score - ' + str(idx) + ' attributes :' + str(validation_score) + '\n')
    return (attribute_values, accuracy_values)
def test_select_fpr_int(self):
    model = SelectFpr()
    X = np.array(
        [[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]],
        dtype=np.int64)
    y = np.array([0, 1, 0, 1])
    model.fit(X, y)
    model_onnx = convert_sklearn(
        model, "select fpr",
        [("input", Int64TensorType([None, X.shape[1]]))],
        target_opset=TARGET_OPSET)
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X, model, model_onnx,
        basename="SklearnSelectFpr")
def test_select_fpr_int(self):
    model = SelectFpr()
    X = np.array([[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]])
    y = np.array([0, 1, 0, 1])
    model.fit(X, y)
    model_onnx = convert_sklearn(
        model, 'select fpr',
        [('input', Int64TensorType([1, X.shape[1]]))])
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X, model, model_onnx,
        basename="SklearnSelectFpr",
        allow_failure="StrictVersion(onnxruntime.__version__) <= StrictVersion('0.1.4')")
def test_select_fpr_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the fpr heuristic
    """
    X, Y = make_classification(n_samples=200, n_features=20,
                               n_informative=3, n_redundant=2,
                               n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)

    univariate_filter = SelectFpr(f_classif, alpha=0.0001)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_classif, mode="fpr", param=0.0001).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
def feature_SelectFpr(x_data, y_data):
    bestfeatures = SelectFpr(f_classif, alpha=0.01)
    fit = bestfeatures.fit(x_data, y_data)
    dfscores = pd.DataFrame(fit.scores_)
    dfcolumns = pd.DataFrame(x_data.columns)
    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Specs', 'Score']  # naming the dataframe columns
    top_20_features = featureScores.nlargest(20, 'Score')
    return top_20_features
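# A minimal usage sketch for feature_SelectFpr above -- not part of the
# original source. The dataset choice is an assumption, made purely for
# illustration; any DataFrame of numeric features with a matching label
# vector would do.
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectFpr, f_classif

dataset = load_breast_cancer(as_frame=True)  # X is a DataFrame, y a Series
top_features = feature_SelectFpr(dataset.data, dataset.target)
print(top_features)  # the 20 highest-scoring features by ANOVA F-value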
def test_select_fpr_float(self):
    model = SelectFpr()
    X = np.array(
        [[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]],
        dtype=np.float32,
    )
    y = np.array([0, 1, 0, 1])
    model.fit(X, y)
    model_onnx = convert_sklearn(
        model, "select fpr",
        [("input", FloatTensorType([1, X.shape[1]]))])
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X, model, model_onnx,
        basename="SklearnSelectFpr",
        allow_failure="StrictVersion(onnx.__version__)"
        " < StrictVersion('1.2') or "
        "StrictVersion(onnxruntime.__version__)"
        " <= StrictVersion('0.2.1')",
    )
def train_decisiontree_FPR(configurationname, train_data, score_function,
                           undersam=False, oversam=False, export=False):
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data
    dtc = DecisionTreeClassifier(random_state=0)

    print("Feature Selection")
    selector = SelectFpr(score_function)
    result = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)
    fitted_ids = [i for i in result.get_support(indices=True)]

    print("Apply Resampling")
    print(Counter(y_train))
    if undersam and not oversam:
        renn = RepeatedEditedNearestNeighbours()
        X_train, y_train = renn.fit_resample(X_train, y_train)
    if oversam and not undersam:
        # feature_indices_array = list(range(len(f_to_id)))
        # smote_nc = SMOTENC(categorical_features=feature_indices_array, random_state=0)
        # X_train, y_train = smote_nc.fit_resample(X_train, y_train)
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    if oversam and undersam:
        smote_enn = SMOTEENN(random_state=0)
        X_train, y_train = smote_enn.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)

    if export:
        print("Exporting decision tree image...")
        export_graphviz(dtc, out_file=DATAP + "/temp/trees/sltree_"
                        + configurationname + ".dot", filled=True)

    transform(fitted_ids)
    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))
    return selector, dtc
def test_select_heuristics_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the fpr, fdr or fwe heuristics.
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0, noise=10)

    univariate_filter = SelectFpr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    for mode in ["fdr", "fpr", "fwe"]:
        X_r2 = GenericUnivariateSelect(
            f_regression, mode=mode, param=0.01).fit(X, y).transform(X)
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        # np.bool was removed from recent numpy; plain bool is equivalent.
        assert_array_equal(support[:5], np.ones((5,), dtype=bool))
        assert_less(np.sum(support[5:] == 1), 3)
def test_select_fpr_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the fpr heuristic
    """
    X, Y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)
    univariate_filter = SelectFpr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode="fpr", param=0.01).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert (support[:5] == 1).all()
    assert np.sum(support[5:] == 1) < 3
# select k-best - f_classif
k_best_f = SelectKBest(score_func=f_classif, k='all')
k_best_f = k_best_f.fit(x_train_labeled_robust_scaled, y_train_labeled)
print("K-Best F-Classif Scores:", sorted(k_best_f.scores_, reverse=True))
print()

# select k-best - mutual_info_classif
k_best_m = SelectKBest(score_func=mutual_info_classif, k='all')
k_best_m = k_best_m.fit(x_train_labeled_robust_scaled, y_train_labeled)
print("K-Best Mutual-Info Scores:", sorted(k_best_m.scores_, reverse=True))
print()

# select fpr - f_classif
fpr_f = SelectFpr(score_func=f_classif)
fpr_f = fpr_f.fit(x_train_labeled_robust_scaled, y_train_labeled)
print("Select FPR: F-Classif Scores:", sorted(fpr_f.scores_, reverse=True))
print()

# select fpr - mutual_info_classif (note: mutual information yields scores
# but no p-values, so only the scores are meaningful here)
fpr_m = SelectFpr(score_func=mutual_info_classif)
fpr_m = fpr_m.fit(x_train_labeled_robust_scaled, y_train_labeled)
print("Select FPR: Mutual-Info Scores:", sorted(fpr_m.scores_, reverse=True))
print()

# tree-based feature importances
model = ExtraTreesClassifier()
model.fit(x_train_labeled_robust_scaled, y_train_labeled)
print("Tree Features")
print(sorted(model.feature_importances_, reverse=True))
print()
else:
    print("Loading tfidf model...")
    model_tfidf = TfidfModel.load(FLAGS.tfidfFile)

print("Converting to tfidf vectors...")
comments_tfidf = model_tfidf[comments_corpus]
comments_vecs = np.vstack(
    [sparse2full(c, len(comments_dictionary)) for c in comments_tfidf])

chi2_features = None
if doTrain:
    # Find the most discriminative words for any of the labels.
    print("Finding discriminative features...")
    labels = np.array(data['any'])
    model_fpr = SelectFpr(chi2, alpha=0.025)
    model_fpr.fit(comments_vecs, labels)
    chi2_features = model_fpr.get_support(indices=True)
    np.save(FLAGS.chi2File, chi2_features)
else:
    print("Loading discriminative features data...")
    chi2_features = np.load(FLAGS.chi2File)

print("Calculating tfidf weighted word2vec vectors...")
chi2_tfidf_vecs = comments_vecs[:, chi2_features]

fpr_embeddings = None
if doTrain:
    print('Fitting FastText embedding model...')
    ft_model = FastText(sentences=docs, size=300, workers=8)
    fpr_embeddings = [
        ft_model.wv[t]
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFpr
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from data_extractor import TrainingDataExtractor

datasource = TrainingDataExtractor()
features, labels = datasource.all_data()

# These are alpha thresholds for the FPR test, not feature counts.
alpha_values = [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5]
for alpha in alpha_values:
    print('reducing down to alpha = ' + str(alpha))
    estimator = LogisticRegression()
    selector = SelectFpr(alpha=alpha)
    selector.fit(features, labels)

    print('performing 6-fold cross-validation')
    kf = KFold(n_splits=6, shuffle=False)
    roc_scores = []
    for train_indices, test_indices in kf.split(features):
        X_train = [features[i] for i in train_indices]
        X_test = [features[i] for i in test_indices]
        y_train = [labels[i] for i in train_indices]
        y_test = [labels[i] for i in test_indices]
        test_model = LogisticRegression()
        X_train = selector.transform(X_train)
# import data of all Count and Position features, training and test sets altogether
dfCountfeatures = pd.read_csv('data/CountingAndPositionFeatures_TrainAndTestData.csv')
dfTrainRaw = pd.read_csv('data/train.csv')

# get only training data
TrainQueryIDs = dfTrainRaw["id"]
relevance = dfTrainRaw["relevance"]
dfCountfeatures_TrainSet = dfCountfeatures[dfCountfeatures["id"].isin(TrainQueryIDs)]

# select only those features which have non-zero variance
selector = VarianceThreshold()
selector.fit_transform(dfCountfeatures_TrainSet).shape
# only one feature with zero variance - shape (74067, 262)

# select features based on p-values from univariate regression against the target (relevance)
selector2 = SelectFpr(f_regression, alpha=0.01)
selector2.fit(dfCountfeatures_TrainSet.drop("id", axis=1), relevance)
selector2.get_support(indices=True).size  # 226 features out of 262 kept with p-value <= 1%

# get titles of the selected features
selectedCountfeatures = dfCountfeatures.columns[selector2.get_support(indices=True)]

# check correlation amongst features
corrReduced = dfCountfeatures_TrainSet[selectedCountfeatures].corr()
corrReduced.iloc[:, :] = np.tril(corrReduced.values, k=-1)
corrReduced = corrReduced.stack()

# get pairs of features which are highly correlated
# (MultiIndex.labels was removed in recent pandas; .codes is the replacement)
corrReduced[corrReduced.abs() > 0.8].size  # 578 pairs correlated above 80% out of 25,425
len(set(corrReduced[corrReduced.abs() > 0.8].index.codes[0]))  # 172 features to be removed due to high correlation

# get feature titles to be used in training the model after removing highly correlated features
indices = set(corrReduced[corrReduced.abs() > 0.8].index.codes[0])
selectedCountfeatures2 = [i for j, i in enumerate(selectedCountfeatures.tolist()) if j not in indices]
selectedCountfeatures2.append("id")
################################################################################
pl.figure(1)
pl.clf()
x_indices = np.arange(x.shape[-1])

################################################################################
# Univariate feature selection
from sklearn.feature_selection import SelectFpr, f_classif

# As a scoring function we use an F-test for classification, keeping the
# features whose p-value falls below alpha = 0.1.
selector = SelectFpr(f_classif, alpha=0.1)
selector.fit(x, y)
scores = -np.log10(selector.pvalues_)  # pvalues_ is the public attribute
scores /= scores.max()
pl.bar(x_indices - .45, scores, width=.3,
       label=r'Univariate score ($-Log(p_{value})$)', color='g')

################################################################################
# Compare to the weights of an SVM
clf = svm.SVC(kernel='linear')
clf.fit(x, y)
svm_weights = (clf.coef_ ** 2).sum(axis=0)
svm_weights /= svm_weights.max()
pl.bar(x_indices - .15, svm_weights, width=.3, label='SVM weight', color='r')
def find_statistical_saboteurs(groups_data, pvalue_threshold=0.1,
                               effect_threshold=0,
                               max_significant_members=10):
    """Return statistics on possible bad elements in the data.

    Parameters
    ----------
    groups_data
      Result of ``csv_to_groups_data()``.

    pvalue_threshold
      Only failure-associated elements with a p-value below this threshold
      will be included in the final statistics.

    effect_threshold
      Elements whose fitted effect falls below this value are dropped from
      the final statistics.
    """
    groups_data = deepcopy(groups_data)
    twins, almost_twins, has_twins = _find_twins(groups_data)
    members_sets = [set(group["members"]) for group in groups_data.values()]
    all_members = set().union(*members_sets)
    conserved_members = members_sets[0].intersection(*members_sets)
    members_with_twins = set().union(*twins.values())
    varying_members = sorted(
        all_members.difference(conserved_members).difference(members_with_twins)
    )

    # Build the data
    def build_data_and_observed(selected_members, by_group=False):
        data = []
        observed = []
        for group_name, group_data in groups_data.items():
            attempts = int(group_data["attempts"])
            failures = int(group_data["failures"])
            vector = [[(mb in group_data["members"]) for mb in selected_members]]
            if by_group:
                data += vector
                observed.append(1.0 * failures / attempts)
            else:
                data += attempts * vector
                observed += (attempts - failures) * [0] + failures * [1]
        return np.array(data), np.array(observed)

    # Ridge regression (gives positive / negative impact)
    data, observed = build_data_and_observed(varying_members)
    regression = linear_model.RidgeCV()
    regression.fit(data, observed)

    # ANOVA analysis (for p-values)
    selector = SelectFpr(f_classif, alpha=pvalue_threshold)
    selector.fit(data, observed)

    # Select the most interesting parts
    data_ = zip(selector.pvalues_, regression.coef_, varying_members)
    significant_members = OrderedDict(
        [
            (name, {"pvalue": pvalue, "twins": twins.get(name, [])})
            for pvalue, coef, name in sorted(data_)
            if (pvalue < pvalue_threshold) and (coef > 0)
        ]
    )
    if len(significant_members) == 0:
        return {
            "groups_data": groups_data,
            "conserved_members": conserved_members,
            "varying_members": varying_members,
            "significant_members": significant_members,
        }

    # Ridge regression on the significant parts only
    data, observed = build_data_and_observed(significant_members)
    regression.fit(data, observed)
    zipped = zip(regression.coef_, significant_members.items())
    for coef, (name, data_) in zipped:
        data_["effect"] = coef
    for member in list(significant_members.keys()):
        if significant_members[member]["effect"] < effect_threshold:
            significant_members.pop(member)
    # significant_members = significant_members[:max_significant_members]

    # Build a classifier to compute an F1 score
    classifier = linear_model.LogisticRegressionCV(penalty="l2")
    classifier.fit(data, observed)
    f1_score = metrics.f1_score(observed, classifier.predict(data))

    # Find constructs which are less explained by the parts
    data, observed = build_data_and_observed(significant_members, by_group=True)
    regression.fit(data, observed)
    predictions = regression.predict(data)
    zipped = zip(groups_data.values(), observed, predictions)
    intercept = min(0.9, max(0.1, regression.intercept_))
    for group_data, obs, pred in zipped:
        std = binom.std(group_data["attempts"], intercept) / group_data["attempts"]
        group_data["failure_rate"] = obs
        group_data["deviation"] = np.round((obs - pred) / std, decimals=1)

    return {
        "groups_data": groups_data,
        "conserved_members": conserved_members,
        "varying_members": varying_members,
        "significant_members": significant_members,
        "f1_score": f1_score,
    }
from sklearn.feature_selection import RFE, SelectKBest, chi2, SelectFpr
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

df = pd.read_csv('Train_CV_Data.csv')
X_train = np.asarray(df.loc[:2000000, 'srcPort':'HTTPM4'])
Y_train = np.asarray(df.loc[:2000000, 'malicious'], dtype=np.int32)
print(np.sum(Y_train == 1))

kBest = SelectKBest(chi2, k=12)
kBest.fit(X_train, Y_train)
mask1 = kBest.get_support(indices=True)

fpr = SelectFpr(chi2, alpha=0.0001)
fpr.fit(X_train, Y_train)
mask2 = fpr.get_support(indices=True)

rf = RandomForestClassifier(n_estimators=50)
rfe = RFE(rf, n_features_to_select=12, step=1)
rfe.fit(X_train, Y_train)
mask3 = rfe.get_support(indices=True)

print('K-Best Feat          :', mask1)
print('False Positive based :', mask2)
print('RFE based            :', mask3)
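# A small follow-up sketch (an addition, not in the original source):
# intersecting the index arrays above shows which features all three
# selection methods agree on.
common = np.intersect1d(mask1, np.intersect1d(mask2, mask3))
print('Selected by all three:', common)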
data2 = pdc.objFeatures[tr2_mask][:, featureIds]
data = np.vstack([data1, data2])
labels1 = np.zeros((data1.shape[0],))
labels2 = np.ones((data2.shape[0],))
labels = np.hstack([labels1, labels2])

X1 = data1[:1000]
X2 = data2[-1000:]
X = np.vstack([X1, X2])
Y1 = labels1[:X1.shape[0]]
Y2 = labels2[:X2.shape[0]]
Y = np.hstack([Y1, Y2])

from sklearn.feature_selection import SelectFpr, f_classif

selector = SelectFpr(f_classif, alpha=0.1)
selector.fit(X, Y)
scores = -np.log10(selector.pvalues_)  # pvalues_ is the public attribute
scores /= scores.max()

from sklearn import svm

# Compare to the weights of an SVM. Note that clf.score returns accuracy.
clf = svm.SVC(kernel='linear')
clf.fit(X, Y)
print('SVM accuracy:', clf.score(data, labels))
pred = clf.predict(data)
match = np.sum(pred == labels)
print(match, labels.shape[0])
print(match / float(labels.shape[0]))

svm_weights = (clf.coef_ ** 2).sum(axis=0)
svm_weights /= svm_weights.max()