Example #1
def recursiveFeatureSelectorCV(classifier_model, train_data, train_labels, test_data, number_of_features):
    # RFECV chooses the feature count itself via cross-validation;
    # number_of_features only sets the floor (passed positionally, it would
    # have been interpreted as `step`, the number of features removed per iteration).
    rfe = RFECV(classifier_model, min_features_to_select=number_of_features)
    transformed_train_data = rfe.fit_transform(train_data, train_labels)
    transformed_test_data = rfe.transform(test_data)

    return transformed_train_data, transformed_test_data
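A quick way to exercise this helper, as a hedged sketch: the synthetic dataset and the LogisticRegression estimator below are illustrative stand-ins, not part of the original example.

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, n_features=20, n_informative=5, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
train_sel, test_sel = recursiveFeatureSelectorCV(LogisticRegression(max_iter=1000), X_tr, y_tr, X_te, 5)
print(train_sel.shape, test_sel.shape)  # same rows, fewer columns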
Example #2
def lr_with_fs():
    """
    Submission: lr_with_fs_0703_01.csv
    E_val:
    E_in:
    E_out:
    """
    from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    # Pre-0.18 scikit-learn: `cross_validation` is the old name of
    # `model_selection`, and StratifiedKFold still took y directly.
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV
    import pylab as pl

    # `dataset`, `Path`, `IO`, and `Util` are project-local helpers
    # (data loading, cache paths, pickling, AUC scoring).
    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    pkl_path = Path.of_cache('lr_with_fs.RFECV.pkl')
    rfe = IO.fetch_cache(pkl_path)
    if rfe is None:
        rfe = RFECV(estimator=LogisticRegression(class_weight='auto'),
                    cv=StratifiedKFold(y, 5), scoring='roc_auc')
        rfe.fit(X_scaled, y)
        IO.cache(rfe, pkl_path)

        print("Optimal number of features : %d" % rfe.n_features_)

        # Plot number of features VS. cross-validation scores
        pl.figure()
        pl.xlabel("Number of features selected")
        pl.ylabel("Cross validation score (AUC)")
        pl.plot(range(1, len(rfe.grid_scores_) + 1), rfe.grid_scores_)
        pl.savefig('lr_with_fs.rfecv')

    X_pruned = rfe.transform(X_scaled)

    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(X_new, y)

    print('CV scores: %s' % clf.scores_)
    print('Ein: %f' % Util.auc_score(clf, X_new, y))

    IO.dump_submission(Pipeline([('scale_raw', raw_scaler),
                                 ('rfe', rfe),
                                 ('scale_new', new_scaler),
                                 ('lr', clf)]), 'lr_with_fs_0703_01')
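The `IO` cache helpers above are not shown; a minimal stand-in for the fetch-or-fit pattern, assuming joblib is available (the names here are illustrative, not the repo's API):

import joblib
from pathlib import Path

def fetch_or_fit(pkl_path, build_and_fit):
    # Load a fitted object from disk if cached; otherwise build, fit, and cache it.
    path = Path(pkl_path)
    if path.exists():
        return joblib.load(path)
    obj = build_and_fit()
    joblib.dump(obj, path)
    return obj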
Example #3
    def rfecv_selection(self, x_train, y_train, cvfolds=5, feature_names=None, svm_kernel='linear',
                        rf_n_estimators=20, estimator='svm', log_root=None, log_file=None):
        # Recursive feature elimination with SVM, Random Forest, or Multinomial NB.
        # Input: training/target nparrays and estimator params.
        # Output: RFECV object; transformed training data; number of selected
        # features; (optionally, lists of selected/dropped feature names).
        if estimator == 'svm':
            clf = SVC(kernel=svm_kernel)
        elif estimator == 'rf':
            clf = RandomForestClassifierWithCoef(n_estimators=rf_n_estimators)
        elif estimator == 'mnb':
            clf = MultinomialNB()

        # Pre-0.18 scikit-learn API: StratifiedKFold took y directly.
        rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(y_train, cvfolds),
                      scoring='accuracy')

        rfecv.fit(x_train, y_train)

        if log_root and log_file:
            with open(log_file, 'a') as log:
                log.write('\nOptimal number of features: ' + str(rfecv.n_features_))
        else:
            print("\nOptimal number of features: %d" % rfecv.n_features_)

        x_train_rfecv = rfecv.transform(x_train)

        if feature_names is not None:
            # ranking_ == 1 marks the features RFECV kept.
            ranked_feats = rfecv.ranking_
            selected_feats = [feature_names[ix] for ix in range(len(ranked_feats)) if ranked_feats[ix] == 1]
            dropped_feats = [feature_names[ix] for ix in range(len(ranked_feats)) if ranked_feats[ix] != 1]
            if log_root and log_file:
                with open(log_file, 'a') as log:
                    log.write('\n\nSelected Features: \n')
                    log.write(', '.join(selected_feats))
                    log.write('\n\nDropped Features: \n')
                    log.write(', '.join(dropped_feats))
            else:
                print('\nSelected Features: {0}\n'.format(selected_feats))
                print('\nDropped Features: {0}\n'.format(dropped_feats))
            return rfecv, x_train_rfecv, rfecv.n_features_, selected_feats, dropped_feats
        else:
            return rfecv, x_train_rfecv, rfecv.n_features_
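`RandomForestClassifierWithCoef` is not defined in this snippet; Example #14 below contains a Python 2-style definition of the same wrapper. For reference, a standalone sketch:

from sklearn.ensemble import RandomForestClassifier

class RandomForestClassifierWithCoef(RandomForestClassifier):
    # Expose feature_importances_ as coef_ so RFECV can rank features.
    def fit(self, *args, **kwargs):
        super().fit(*args, **kwargs)
        self.coef_ = self.feature_importances_
        return self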
Example #4
    def recursive_feature_elimination(self, x: np.ndarray, y: np.ndarray, clf=None) -> np.ndarray:
        # NOTE: clf must be an estimator exposing coef_ or feature_importances_;
        # the default None would make RFECV fail.
        # Pre-0.18 scikit-learn API: StratifiedKFold took y directly.
        selector = RFECV(estimator=clf, step=1, cv=StratifiedKFold(y), scoring='accuracy', verbose=True)
        print("beginning feature elimination")
        selector.fit(x, y)

        print("Optimal number of features : %d" % selector.n_features_)

        # Plot number of features VS. cross-validation scores
        plt.figure()
        plt.xlabel("Number of features selected")
        plt.ylabel("Cross validation score (nb of correct classifications)")
        plt.plot(range(1, len(selector.grid_scores_) + 1), selector.grid_scores_)
        plt.show()

        selected_features = self.features[selector.get_support()]
        print(selected_features)
        x = selector.transform(x)
        return x
Example #5
full_df = pd.concat([train_df.drop(['SalePrice'], axis=1), test_df])
idx_split = train_df.shape[0]

full_df = preprocessing(full_df)
full_df = StandardScaler().fit_transform(full_df)

#print(len(categorical_features)+len(ordinal_features)+ len(numerical_features) + len(bin_features))

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline

estimator = GradientBoostingRegressor()
model = RFECV(estimator, step=1, cv=5)
model.fit(full_df[:idx_split, :], y_train)
full_df = model.transform(full_df)

# clf = Pipeline([
#    ('feature_selection', RFECV(ExtraTreesRegressor())),
#    ('classification', ExtraTreesRegressor())
#  ])

X_train = full_df[:idx_split, :]
print(X_train.shape)
# `find_seed` and `score` are helper functions defined elsewhere in this notebook.
s = find_seed(X_train, y_train, 64)
print(score(X_train, y_train, seed=s, estim=64))

clf = GradientBoostingRegressor(n_estimators=64, random_state=s)
clf.fit(X_train, y_train)
y_test = clf.predict(full_df[idx_split:, :])
print(y_test)
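The commented-out pipeline above hints at a cleaner structure; a hedged sketch that keeps selection and regression fitted together, so the selector is only ever fit on training rows:

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline

reg = Pipeline([
    ('feature_selection', RFECV(GradientBoostingRegressor(), step=1, cv=5)),
    ('regression', GradientBoostingRegressor(n_estimators=64)),
])
# reg.fit(full_df[:idx_split, :], y_train); reg.predict(full_df[idx_split:, :])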
Example #6
def main():
	X = select_features(raw_data, features)
	y = raw_data['diagnosis'].copy()

	if features != 'all':
		print('')
		print("--------------------------------------------------------")
		print("Displaying correlation matrix for the selected features")
		print("--------------------------------------------------------")
		corr_mtrx(X)
		print('')
		print("-------------------------------------------------------")
		print("Displaying dispersion matrix for the selected features")
		print("-------------------------------------------------------")
		X_plot = X.copy()
		X_plot['diagnosis'] = raw_data['diagnosis'].copy()
		corr_plot(X_plot)
		print('')

	# Scale the X features to mean 0 and unit variance
	# NOTE: applying StandardScaler() turns the pandas dataframe into a numpy array,
	# which is problematic when we want to use pandas-specific attributes like .columns
	# from sklearn.preprocessing import StandardScaler
	# sc = StandardScaler()
	# X_scaled = sc.fit_transform(X)
	X_scaled = (X - X.mean()) / (X.std())
	# Transform the 'diagnosis' column so the values are numerical (1=Malignant, 0=Benign)
	y = y.map({'M': 1, 'B': 0})


	## ==================== 4. RANDOMLY SPLITTING THE DATA INTO TRAINING AND TEST SETS ==================== ##
	from sklearn.model_selection import train_test_split
	X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=0)


	## ==================== 5. CONSTRUCTING THE MODELS ==================== ##
	print("==============================================================")
	print("= STEP 1: CONSTRUCTING MODELS WITH ALL THE SELECTED FEATURES =")
	print("==============================================================")
	from sklearn import linear_model	# Logistic regression
	logreg = linear_model.LogisticRegression(random_state=0)
	logreg_params = {'C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 300]}

	from sklearn import tree	# Decision tree
	dtree = tree.DecisionTreeClassifier(criterion='entropy', max_features='sqrt', random_state=0)
	dtree_params = {'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
					'min_samples_leaf': [2, 3, 4, 5, 6, 7, 8, 9, 10]}

	from sklearn.model_selection import GridSearchCV	# Grid search (parameters optimization)
	from sklearn import metrics
	models_dict = {logreg: logreg_params, dtree: dtree_params}

	for model, params in models_dict.items():	# Loop over the classifiers
		clf = GridSearchCV(model, params, cv=5, scoring='precision')
		clf.fit(X_train, y_train)
		print("Classifier:")
		print(clf.best_estimator_)
		print('')
		print("Cross-validation: searching for the best parameters...\n")
		print("Best fit parameters: ", clf.best_params_)
		print("Precision score obtained on the training set: %.2f" % clf.best_score_)
		prediction = clf.predict(X_test)
		print("Precision score obtained on the test set: %.2f" % metrics.precision_score(y_test, prediction))
		print('')
		print("--------------------------------------------------------")
	input("Program paused, press Enter to continue...\n")


	## ==================== 6. FEATURES SELECTION ==================== ##
	print("=================================================================")
	print("= STEP 2: CONSTRUCTING MODELS WITH NON-CORRELATED FEATURES ONLY =")
	print("=================================================================")
	# Here, we perform the same computations as in Step 1 with a reduced number
	# of features, eliminating features that are correlated with other ones.
	X_filt = X_scaled.filter(regex=r'^(radius_|concave points_|texture_|smoothness_|symmetry_)\D*$', axis=1)
	X_train2, X_test2, y_train2, y_test2 = train_test_split(X_filt, y, test_size=0.30, random_state=0)
	print("The features retained for this step are: ")
	print(X_filt.columns)
	print('')
	print("--------------------------------------------------------")
	for model, params in models_dict.items():	# Loop over the classifiers
		clf = GridSearchCV(model, params, cv=5, scoring='precision')
		clf.fit(X_train2, y_train2)
		print("Classifier: ")
		print(clf.best_estimator_)
		print('')
		print("Cross-validation: searching for the best parameters...\n")
		print("Best fit parameters: ", clf.best_params_)
		print("Precision score obtained on the training set: %.2f" % clf.best_score_)
		prediction = clf.predict(X_test2)
		print("Precision score obtained on the test set: %.2f" % metrics.precision_score(y_test2, prediction))
		print('')
		print("--------------------------------------------------------")
	input("Program paused, press Enter to continue...\n")

	# Step 3 only works on the whole dataset for now
	if features == 'all':
		print("====================================================================")
		print("= STEP 3: CONSTRUCTING MODELS WITH THE MOST 'SIGNIFICANT' FEATURES =")
		print("====================================================================")
		# Applying RFE method with cross validation to i) classify features from best to worst and
		# ii) find the optimal number of features to use
		from sklearn.feature_selection import RFECV
		clf1 = linear_model.LogisticRegression(random_state=0)	# Creating a new classifier with default parameters
		rfecv = RFECV(estimator=clf1, step=1, cv=5, scoring='precision')
		rfecv = rfecv.fit(X_train, y_train)
		print("Classifier:")
		print(rfecv.estimator_)
		print('')
		print("Optimal number of features : %d" % rfecv.n_features_)	# With C=1.0, the optimal number of features is 20
		
		# NOTE: grid_scores_ holds one CV score per feature-subset size, not one
		# score per feature, so pairing them with the selected columns is only indicative.
		top_feat1 = pd.Series(rfecv.grid_scores_[:rfecv.n_features_], index=X_train.columns[rfecv.support_]).sort_values(ascending=False)
		print("Best feature rankings and precision scores:")
		print(top_feat1)
		print('')

		X_train_best = rfecv.transform(X_train)	# Reshaping X_train in order to keep the top 20 features
		X_test_best = rfecv.transform(X_test)	# Reshaping X_test in order to keep the top 20 features
		clf1.fit(X_train_best, y_train)
		predictrain1 = clf1.predict(X_train_best)	# Computing predicted values on the training set
		predictest1 = clf1.predict(X_test_best)		# Computing predicted values on the test set
		print("Precision score obtained on the training set: %.2f" % metrics.precision_score(y_train, predictrain1))
		print("Precision score obtained on the test set: %.2f" % metrics.precision_score(y_test, predictest1))
		print('')
		print("--------------------------------------------------------")

		# The decision tree classifier inherently has an attribute estimating feature importances; we shall try it
		clf2 = tree.DecisionTreeClassifier(max_features='sqrt', random_state=0)	# Creating a new tree with default parameters
		clf2.fit(X_train, y_train)
		top_feat2 = pd.Series(clf2.feature_importances_, index=X_train.columns).sort_values(ascending=False)
		print("Classifier:")
		print(clf2)
		print('')
		print("Feature rankings and Gini scores:")
		print(top_feat2)
		# List of all the features having a Gini score of 0.0
		droplist = ['area_mean', 'smoothness_mean', 'compactness_mean', 'symmetry_mean', 'fractal_dimension_worst',
					'fractal_dimension_mean', 'radius_se', 'perimeter_se', 'symmetry_worst', 'compactness_se',
					'concave points_se', 'smoothness_worst', 'smoothness_se']

		print('')
		print("Computing a new decision tree without the following features:")
		print(droplist)
		print('')
		X_filtrain = X_train.drop(droplist, axis=1)	# Removing all the features in droplist from the training set
		X_filtest = X_test.drop(droplist, axis=1)	# Removing all the features in droplist from the test set
		clf2.fit(X_filtrain, y_train)
		predictrain2 = clf2.predict(X_filtrain)	# Computing predicted values on the training set
		predictest2 = clf2.predict(X_filtest)	# Computing predicted values on the test set
		print("Precision score obtained on the training set: %.2f" % metrics.precision_score(y_train, predictrain2))
		print("Precision score obtained on the test set: %.2f" % metrics.precision_score(y_test, predictest2))
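Given the grid_scores_ caveat in Step 3, a more direct way to report what RFECV kept is to read ranking_ and support_; a small sketch, assuming a fitted rfecv and the training columns:

import pandas as pd

def rfecv_report(rfecv, columns):
    # One row per feature: RFECV rank (1 = kept) and a keep/drop flag.
    return pd.DataFrame({'rank': rfecv.ranking_, 'kept': rfecv.support_},
                        index=columns).sort_values('rank')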
Example #7
from __future__ import division
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV

X = np.load("../feats/train_formatted.npy")
y = np.load("../feats/train_y.npy")
X_test = np.load("../feats/test_formatted.npy")
y_test = np.load("../feats/test_y.npy")

clf = LogisticRegression()
selector = RFECV(clf)
selector.fit(X, y)
X = selector.transform(X)
X_test = selector.transform(X_test)

# ranking_ gives 1 for selected features; higher values were eliminated earlier.
scores = selector.ranking_
print('Index    :   rank')
sortedIdx = [i[0] for i in sorted(enumerate(scores), key=lambda x: x[1])]
top = 384
for i in range(top):
    print(str(sortedIdx[i]) + ' :   ' + str(scores[sortedIdx[i]]))

clf.fit(X, y)
pred = clf.predict(X_test)
accuracy = sum(pred == y_test) / y_test.size
print('Logistic Regression Accuracy: ' + str(accuracy))
Example #8
else:
    classifier = RandomForestClassifier(n_estimators=200)

if FEATURE_SELECTION:
    print("Before FS:", X.shape[1])

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

i = 0
for train, test in cv.split(X, y):
    if FEATURE_SELECTION:
        # `estimator` is presumably defined in the elided setup above
        selector = RFECV(estimator, step=1, cv=3, scoring='roc_auc')
        selector = selector.fit(X[train], y[train])
        X_r = selector.transform(X)
        print("After FS" + str(i + 1) + ":", X_r.shape[1])
    else:
        X_r = X

    # Fit classifier
    classifier.fit(X_r[train], y[train])

    # Grid search output
    if GRID_SEARCH:
        print("Grid scores on development set:")
        means = classifier.cv_results_['mean_test_score']
        stds = classifier.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds,
                                     classifier.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
Example #9
__author__ = 'jeronicarandellsaladich'
# Recursive Feature Elimination
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR


X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
print(X)
estimator = SVR(kernel="linear")
selector = RFECV(estimator, step=5)  # eliminate 5 features per iteration
selector = selector.fit(X, y)
print(selector.support_)
print(selector.ranking_)
print(selector.transform(X))
Example #10
import itertools
from sklearn.feature_selection import RFECV
# The star import presumably supplies the data (X, y), feature_names, PARAM_RANGE,
# the helper functions, and the sklearn/numpy names used below.
from contemppoetry import *


print('******RFECV-LogisticRegression')
for penalty, C in itertools.product(['l1', 'l2'], PARAM_RANGE):
    rfe = RFECV(estimator=LogisticRegression(penalty=penalty, C=C),
                scoring='accuracy', cv=5)
    rfe.fit(X, y)

    # list selected features by rank
    print(
        [feature_names[i] for i in np.argsort(rfe.ranking_) if rfe.support_[i]]
    )

    pipe = Pipeline([('scl', StandardScaler()), ('clf', LogisticRegression())])
    param_grid = {'clf__penalty': ['l1', 'l2'],
                  'clf__C': PARAM_RANGE}
    gs = GridSearchCV(estimator=pipe,
                      param_grid=param_grid,
                      scoring='accuracy',
                      cv=5)
    my_print(*my_cross_val_score(gs, X=rfe.transform(X), y=y, gs=False))
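Note that rfe here is fit on all of X before the grid-searched model is cross-validated on the same rows, which can leak selection information. A hedged alternative sketch nests RFECV inside the pipeline so elimination is redone within each fold:

from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([('scl', StandardScaler()),
                 ('fs', RFECV(LogisticRegression(max_iter=1000), cv=5)),
                 ('clf', LogisticRegression(max_iter=1000))])
# cross_val_score(pipe, X, y, cv=5, scoring='accuracy')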
Example #11
def lin_model(labelled_data, unlabelled_data):
    """ Parameters: training dataframe, unknown dataframe
        Returns: results dataframe (Instance, Income)

        Drops NaN from training data,
        replaces NaN in test data with ffill,
        target-encodes non-numeric fields,
        scales values,
        80/20 splits data to help verify the model,
        selects features using RFECV with a Lasso estimator and cv=5,
        uses KNeighborsRegressor with 11 nearest neighbours weighted by distance
    """
    print("cleaning data...")
    clean_labelled = labelled_data.dropna()
    clean_unlabelled = unlabelled_data[all_columns]
    # not ideal but fillna the mean freezes for some reason
    clean_unlabelled = clean_unlabelled.fillna(method="ffill") 
    # clean_unlabelled = clean_unlabelled.fillna("None")

    # remove some columns
    # clean_labelled = drop_columns(clean_labelled)
    # clean_unlabelled = drop_columns(clean_unlabelled)

    # print("one hot encoding data...")
    # One hot encoding
    # ohe = OneHotEncoder(
    #     categories="auto", 
    #     handle_unknown="ignore",
    #     sparse=False
    # )
    # clean_labelled = encode_training(ohe, clean_labelled)
    # clean_unlabelled = encode_testing(ohe, clean_unlabelled)

    clean_labelled = constrain_col_vals(clean_labelled)
    clean_unlabelled = constrain_col_vals(clean_unlabelled)
    unknown_data = clean_unlabelled.drop(["Instance"], axis=1)

    print("splitting data into train and test...")
    # 80/20 split
    split = split_data(clean_labelled)
    train_data, train_target, test_data, test_target = split

    print("target encoding data...")
    # Target encoding
    tar_encode = TargetEncoder()
    train_data = tar_encode.fit_transform(train_data, train_target)
    test_data = tar_encode.transform(test_data)
    unknown_data = tar_encode.transform(unknown_data)

    print("scaling values...")
    # scaling values
    scaler = StandardScaler()
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)
    unknown_data = scaler.transform(unknown_data)

    print("selecting features...")
    # feature selection
    lasso = lm.Lasso()
    selector = RFECV(lasso, cv=5)
    train_data = selector.fit_transform(train_data, train_target)
    test_data = selector.transform(test_data)
    unknown_data = selector.transform(unknown_data)

    print("fitting model...")
    # fit model
    # lasso = lm.LassoCV(cv=5)
    # lasso.fit(train_data, train_target)
    neigh = KNeighborsRegressor(
        n_neighbors=11,
        weights="distance"
    )
    neigh.fit(train_data, train_target) 

    print("analysing test results...")
    # validate test
    test_result = neigh.predict(test_data)
    error = np.sqrt(mean_squared_error(test_target, test_result))
    variance = explained_variance_score(test_target, test_result)
    print("Root mean squared error of test data: ", error)
    print("Variance: ", variance)

    print("predicting unknown data...")
    # predict and format
    values = neigh.predict(unknown_data)
    results = pandas.DataFrame({
        "Instance": clean_unlabelled["Instance"].values,
        "Income": values.flatten()
    })
    print("Finished.")
    return results
Example #12
print("Feature selection started")
clf = svm.SVC(kernel='linear')
rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(8), scoring='accuracy')
rfecv.fit(train_data, train_label)
print("Optimal number of features: %d" % rfecv.n_features_)
x_label = range(1, len(rfecv.grid_scores_) + 1)
y_label = rfecv.grid_scores_
support = rfecv.support_
plt.figure()
plt.xlabel(u"Number of features selected")
plt.ylabel(u"Cross-validation score (classification accuracy)")
plt.plot(x_label, y_label)
plt.show()

# Extract the selected features
train_data1 = rfecv.transform(train_data)

# Prepare the classifiers to be evaluated
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
classifiers = {
    'logistic re ':
    LogisticRegression(C=1.1, penalty='l1', tol=0.01),  #
    'SVC        ':
Example #13
def select_features_rfecv(X, y):
    """Return X reduced to the features selected by RFECV."""
    estimator = LinearSVC()
    selector = RFECV(estimator, step=10)
    selector.fit(X, y)
    return selector.transform(X)
Example #14
def stratShuffleSplitRFECVRandomForestClassification(
    nEstimators,
    iterator1,
    minSamplesSplit,
    maxFeatures,
    maxDepth,
    nFolds,
    targetDataMatrix,
    trainingData,
    trainingDataMatrix,
    SEED,
):
    """

    :param nEstimators: This is the number of trees in the forest (typically 500-1000 or so)
    :param iterator1: This is the number of model iterations. For a breakdown of model structure, see the wiki
                      (it's clearly marked...somewhere)
    :param minSamplesSplit: this is the minimum number of samples to split. 2 is a bit small...less is typically more.
    :param maxFeatures:
    :param nFolds:
    :param targetDataMatrix:
    :param trainingData:
    :param trainingDataMatrix:
    :param SEED:
    :return:
    """
    import multiprocessing
    import numpy as np

    multiprocessing.cpu_count()
    # from helperFunctions import *
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn import metrics
    # Pre-0.18 scikit-learn: cross_validation is the old name of model_selection,
    # and its splitters took y / n_iter / n_folds directly.
    from sklearn.feature_selection import RFECV
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.cross_validation import StratifiedShuffleSplit

    # rfecv pre-allocation tables, seeding
    X_train = []
    X_holdout = []
    y_train = []
    y_holdout = []
    # feature_names = []
    a = []
    rfc_all_f1 = []
    nameListAll = pd.DataFrame()
    optimumLengthAll = pd.DataFrame()
    classScoreAll = pd.DataFrame()
    classScoreAll2 = pd.DataFrame()
    classScoreAll3 = pd.DataFrame()
    featureImportancesAll = pd.DataFrame()
    rfecvGridScoresAll = pd.DataFrame()

    # Re-definition of the RFC to expose feature_importances_ as coef_, the
    # ranking proxy RFECV needs for weighting.
    class RandomForestClassifierWithCoef(RandomForestClassifier):
        def fit(self, *args, **kwargs):
            super(RandomForestClassifierWithCoef, self).fit(*args, **kwargs)
            self.coef_ = self.feature_importances_
            return self

    ## Re-creation of the RFC object with ranking proxy coefficients
    rfc = RandomForestClassifierWithCoef(
        n_estimators=nEstimators,
        min_samples_split=minSamplesSplit,
        bootstrap=True,
        n_jobs=-1,
        max_features=maxFeatures,
        oob_score=True,
        max_depth=maxDepth,
    )

    ## Employ Recursive feature elimination with automatic tuning of the number of features selected with CV (RFECV)
    #
    for kk in range(0, iterator1):
        print("iteration no: ", kk + 1)
        # Shuffle and split the dataset using a stratified approach to minimize the influence of class imbalance.
        SSS = StratifiedShuffleSplit(targetDataMatrix, n_iter=1, test_size=0.10, random_state=SEED * kk)
        for train_index, test_index in SSS:
            X_train, X_holdout = trainingDataMatrix[train_index], trainingDataMatrix[test_index]
            y_train, y_holdout = targetDataMatrix[train_index], targetDataMatrix[test_index]

        # Call the RFECV function. Additional splitting is done by stratification shuffling and splitting. 5 folds. 5 times,
        # with a random seed controlling the split.

        rfecv = RFECV(
            estimator=rfc,
            step=1,
            cv=StratifiedKFold(y_train, n_folds=nFolds, shuffle=True, random_state=SEED * kk),
            scoring="accuracy",
        )  # Can  use 'accuracy' or 'f1' f1_weighted, f1_macro, f1_samples

        # First, the recursive feature elimination model is trained. This fits to the optimum model and begins recursion.
        rfecv = rfecv.fit(X_train, y_train)

        # Second, the cross-validation scores are calculated such that grid_scores_[i] corresponds to the CV score
        # of the i-th subset of features. In other words, from all the features to a single feature, the cross validation
        # score is recorded.
        rfecvGridScoresAll = rfecvGridScoresAll.append([rfecv.grid_scores_])

        # Third, the .support_ attribute reports whether each feature survives RFECV.
        # Features are inspected by their ranking; low-ranked features are removed.
        supPort = rfecv.support_  # True/False mask, True marking features kept by the recursive alg.
        possParams = rfecv.ranking_
        min_feature_params = rfecv.get_params(deep=True)
        optimumLengthAll = optimumLengthAll.append([rfecv.n_features_])
        featureSetIDs = list(supPort)
        feature_names = list(trainingData.columns.values)
        namedFeatures = np.array(trainingData.columns.values)

        # Pull out the name of each feature flagged True; the per-iteration list is
        # rewritten every pass, but nameListAll retains it across iterations.
        nameList = [feature_names[i] for i in range(len(featureSetIDs)) if featureSetIDs[i]]
        nameList = pd.DataFrame(nameList)
        nameListAll = nameListAll.append(nameList)  # append the name list

        # Fourth, the training process begins anew, with the objective to trim to the optimum feature and retrain the model
        # without cross validation i.e., test the holdout set. The new training test set size for the holdout validation
        # should be the entire 90% of the training set (X_trimTrainSet). The holdout test set also needs to be
        # trimmed. The same transformation is performed on the holdout set (X_trimHoldoutSet).
        X_trimTrainSet = rfecv.transform(X_train)
        X_trimHoldoutSet = rfecv.transform(X_holdout)

        # Fifth, no recursive feature elimination is needed (it has already been done and the poor features removed).
        # Here the model is trained against the trimmed training set X's and corresponding Y's.
        rfc.fit(X_trimTrainSet, y_train)

        # Holdout test results are generated here.
        preds = rfc.predict(X_trimHoldoutSet)  # Predict the class from the holdout dataset. Previous call: rfecv.predict(X_holdout)
        print(preds)
        print(y_holdout)
        rfc_all_f1 = metrics.f1_score(y_holdout, preds, average="weighted")  # determine the F1
        rfc_all_f2 = metrics.r2_score(y_holdout, preds)  # determine the R^2 score
        rfc_all_f3 = metrics.mean_absolute_error(y_holdout, preds)  # determine the MAE - we also want the sign

        # append the previous scores for aggregated analysis
        classScoreAll = classScoreAll.append([rfc_all_f1])  # append the previous scores for aggregated analysis.
        classScoreAll2 = classScoreAll2.append([rfc_all_f2])
        classScoreAll3 = classScoreAll3.append([rfc_all_f3])
        refinedFeatureImportances = rfc.feature_importances_  # feature importances for aggregated analysis
        featureImportancesAll = featureImportancesAll.append([refinedFeatureImportances])

    # Output file creation
    print ("List of Important Features Identified by Recursive Selection Method:")
    print (nameListAll)
    nameListAll.to_csv("./outputFiles/class_IFIRS.csv")
    nameListAll.count()

    print ("f1 weighted score for all runs:")
    print (classScoreAll)
    classScoreAll.to_csv("./outputFiles/f1_score_all.csv")

    print ("R^2 score for all runs:")
    print (classScoreAll2)
    classScoreAll2.to_csv("./outputFiles/class_Rsq_score_all.csv")

    print ("MAE score for all runs:")
    print (classScoreAll3)
    classScoreAll3.to_csv("./outputFiles/class_MAE_score_all.csv")

    print ("Optimal number of features:")
    print (optimumLengthAll)
    optimumLengthAll.to_csv("./outputFiles/class_optimum_length.csv")

    print ("Selected Feature Importances:")
    print (featureImportancesAll)
    featureImportancesAll.to_csv("./outputFiles/class_sel_feature_importances.csv")

    print ("Accuracy grid score for increasing numbers of features:")
    print (rfecvGridScoresAll)
    rfecvGridScoresAll.to_csv("./outputFiles/class_rfecv_grid_scores.csv")
Example #15
    """
    Y = df_temp["cascadeSize"].between(step[i+1],10e6).tolist()
    #Y = df_temp["step"+str(step[i+1])].tolist()
    X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size = 0.2)
    logreg = linear_model.LogisticRegression(C=1e5, max_iter=1000)
    logreg.fit(X_train, y_train)
    print("step" + str(step[i+1]))
    print(X.shape)
    print(logreg.score(X_test, y_test))

    # perform recursive feature selection (backward selection)
    rfecv = RFECV(estimator=logreg, step=1, cv=StratifiedKFold(y_train, 4), scoring='accuracy')
    rfecv.fit(X_train, y_train)

    print("Optimal number of features : %d" % rfecv.n_features_)
    X_train_new = rfecv.transform(X_train)
    print("best features: ")
    print(X_train_new)

    # Plot number of features VS. cross-validation scores
    #plt.figure()
    #plt.xlabel("Number of features selected")
    #plt.ylabel("Cross validation score (nb of correct classifications)")
    #plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    #plt.show()





Example #16
selector = SelectPercentile(f_classif, percentile=10)
selector.fit(train_feature, train_target)
scores = -np.log10(selector.pvalues_)
scores /= scores.max()


clf = svm.SVC(kernel='linear')
clf.fit(train_feature, train_target)

svm_weights = (clf.coef_ ** 2).sum(axis=0)
svm_weights /= svm_weights.max()


clf_selected = svm.SVC(kernel='linear')
clf_selected.fit(selector.transform(train_feature), train_target)

svm_weights_selected = (clf_selected.coef_ ** 2).sum(axis=0)
svm_weights_selected /= svm_weights_selected.max()



''' tree method'''
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import ExtraTreesClassifier


forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=0)
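The snippet breaks off after constructing the forest. It appears to follow scikit-learn's classic importance-plot example; a hedged continuation under that assumption:

forest.fit(train_feature, train_target)
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure()
plt.title("Feature importances")
plt.bar(range(train_feature.shape[1]), importances[indices])
plt.xticks(range(train_feature.shape[1]), indices, rotation='vertical')
plt.show()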
Example #17
def predictAndPlot(data, header, features, name):
    print("\n%s" % name)

    # First reduce the data to relevant features.
    features_plus_date = np.hstack((0, features))
    analyzed_data = data[:, features_plus_date]

    # Remove rows with missing data.
    for i in range(len(analyzed_data[0])):
        analyzed_data = analyzed_data[analyzed_data[:, i] != '']

    # If it is a retention feature, skip the last X entries.
    if "retention" in name:
        if "1d" in name:
            retention_feature_linesSkipped = 3
        elif "3d" in name:
            retention_feature_linesSkipped = 7
        elif "7d" in name:
            retention_feature_linesSkipped = 15
        elif "14d" in name:
            retention_feature_linesSkipped = 29
        elif "28d" in name:
            retention_feature_linesSkipped = 57
        else:
            retention_feature_linesSkipped = 0
        analyzed_data = analyzed_data[:-retention_feature_linesSkipped, :]

        # The second-last line is # votes. If smaller than 50, skip this entry.
        # analyzed_data = analyzed_data[analyzed_data[:, -2].astype(float) >= min_daily_regs]

    # I added the date to simply for plotting reasons. Just in case. Could be removed if not needed.
    dates = analyzed_data[:, 0]

    # Set best model and best score default values.
    best_model = ""
    best_score = -100

    # Iterate through all models to obtain the best parameters and features via cross validation
    for model_type in list_of_models:
        # Get training data X and y.
        X = analyzed_data[:, 1:-1].astype(float) # Ignore dates (first column) and "y" (last column)
        y = analyzed_data[:, -1].astype(float)

        model = define_model(model_type) # Set model parameters based on model_type

        # Perform differently depending on which model is used.
        # Random Forest has to be treated differently because it doesn't support RFECV.
        if model_type == "RF":
            to_be_used_threshold = "median"  # Default value. Will be overwritten.
            score = -100.

            # Loop through different thresholds. Use the one with the highest score.
            for model_threshold in ("10.*median", "3.*median", "1*median", "0.3*median", "0.1*median", "0.03*median"):
                try:
                    # Use only the "model_threshold" best features.
                    # (estimator.transform(X, threshold=...) is the old pre-0.17
                    # scikit-learn API; SelectFromModel replaces it in later versions.)
                    model.fit(X, y)
                    X_new = model.transform(X, threshold=model_threshold)
                    header_new = model.transform(header[features][:-1], threshold=model_threshold)

                    # Fit the model again with reduced features X_new and return out of bag score.
                    model.fit(X_new, y)
                    rf_score = model.oob_score_

                    # I try to keep the amount of features as small as possible.
                    # The rf_score of a model with more features needs to be 2% better to justify more params.
                    # In some cases the score is negative so it also needs to be better overall.
                    if (rf_score > score * 1.02) and (rf_score > score):
                        score = rf_score
                        to_be_used_threshold = model_threshold
                except Exception:
                    # Just a debug output.
                    print("There was an error at model threshold: %s" % model_threshold)

            print("Score is %2.3f with threshold: %s" % (score, to_be_used_threshold))
        elif model_type in ("ElasticCV", "Elastic", "linear", "LassoCV"):
            selector = RFECV(model)
            selector = selector.fit(X, y)
            header_new = header[features][:-1]
            score = selector.score(X, y)
            print("Score is %2.3f with model: %s" % (score, model_type))
        else:
            print("Something went wrong!")

        if score > best_score:
            best_score = score
            best_model = model_type

    print("Best score is %2.3f with model: %s" % (best_score, best_model))


    # Predict using the best model, parameters and features, obtained before.
    model_type = best_model
    model = define_model(model_type)

    if model_type == "RF":
        # In some rare cases the model does not work, because all features were discarded.
        # Therefore try to do it again without a threshold, that should always work (model_threshold).
        try:
            model.fit(X, y)
            X_new = model.transform(X, threshold = to_be_used_threshold)
            header_new = model.transform(header[features][:-1], threshold=to_be_used_threshold)

            model.fit(X_new, y)
            prediction = model.predict(X_new)
            score = model.oob_score_
        except Exception:
            print("Fitting the model didn't work! The prediction might be sub-optimal. \nThreshold: %s" % model_threshold)
            model.fit(X, y)
            prediction = model.predict(X)
            #score = model.oob_score_
            score = 0
    elif model_type in ("ElasticCV", "Elastic", "linear", "LassoCV"):
        selector = RFECV(model)
        selector = selector.fit(X, y)
        header_new = header[features][:-1]
        prediction = selector.predict(X)
        score = selector.score(X, y)
    else:
        print("lol!")

    # Now derive the importances respectively feature coefficients.
    try:
        # This only works with "RF"
        importances = model.feature_importances_
        importances_list = np.vstack((importances, header_new))
        importances_list = np.transpose(importances_list)
        importances_list = importances_list[importances_list[:, 0].astype(float).argsort()][::-1]
    except Exception:
        # This should work with all other models.
        try:
            X_new = selector.transform(X)
            header_new = selector.transform(header_new)
            model.fit(X_new, y)
            # Scale coefficients by each feature's (zero-safe) median value.
            med_value = np.median(X_new, axis=0)
            med_value[med_value == 0] = np.mean(X_new, axis=0)[med_value == 0]
            importances = model.coef_ * med_value
            importances_list = np.vstack((importances, header_new))
            importances_list = np.transpose(importances_list)
            importances_list = importances_list[importances_list[:, 0].astype(float).argsort()][::-1]
        except Exception:
            # If the above doesn't work, just give a blank output.
            importances_list = np.zeros((10, 2))

    score = "%s, %s\nOOB Score = %2.2f" % (name, model_type, score)

    plot_predictionVsActual(prediction, y, score)
    return prediction, y, dates, importances_list
Example #18
    'Male'
]]
y = ad_data['Clicked on Ad']
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)
l1 = LogisticRegression()
l1.fit(X_train, y_train)
p1 = l1.predict(X_test)
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
l2 = LogisticRegression()
rfecv = RFECV(estimator=l2, step=1, cv=StratifiedKFold(2), scoring='accuracy')
rfecv.fit(X_train, y_train)
print(rfecv.transform(X_train)[:1, :])
print(X_train.head(1))
print('By comparing the two we find the feature that was not selected')
print('Number of best-suited features using RFECV')
print(rfecv.n_features_)
p2 = rfecv.predict(X_test)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
scaled_data = scaler.transform(X_train)
from sklearn.decomposition import PCA
pca = PCA(n_components=1)
pca.fit(scaled_data)
xtrain_pca = pca.transform(scaled_data)
xtest_pca = pca.transform(scaler.transform(X_test))
l3 = LogisticRegression()
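The example stops after defining l3; presumably it is fit on the single PCA component like the earlier models. A hedged sketch:

from sklearn.metrics import accuracy_score
l3.fit(xtrain_pca, y_train)
p3 = l3.predict(xtest_pca)
print('Accuracy with 1 PCA component:', accuracy_score(y_test, p3))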
Example #19
for clf_label, clf in classifiers.items():
    # Print message to user
    print(f"Now working on {clf_label}.")
    
    #Define cross validation split method, scoring metric, total variance to keep for PCA, and parameter grid for optimization
    split = TimeSeriesSplit(n_splits=10)
    score = 'roc_auc'
    totalVariance = 0.99
    param_grid = parameters[clf_label]

    
    # 1. Feature Selection: RFECV with clf as the base estimator
    selector = RFECV(estimator=clf, step=1, cv=split, scoring=score, n_jobs=-1)
    selector.fit(X_train_corr,y_train.values.ravel())
    X_train_RFECV = selector.transform(X_train_corr)
    X_test_RFECV = selector.transform(X_test_corr)
    
    # 2. Dimension Reduction: PCA
    pca = PCA(totalVariance, svd_solver = 'full').fit(X_train_RFECV)
    X_train_PCA = pca.transform(X_train_RFECV)
    X_test_PCA = pca.transform(X_test_RFECV)
    
    df_results.loc[clf_label, 'Num_Features'] = pca.n_components_
    # 3. Hyper-parameter Optimization
    GSCV = GridSearchCV(clf,
                        param_grid,
                        cv = split,  
                        n_jobs= -1,
                        scoring = score)
    # 4. Fit Model
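The loop is cut off at step 4; a minimal sketch of how the fit step would presumably continue, reusing the names defined above:

    # 4. Fit Model on the RFECV- and PCA-reduced training data
    GSCV.fit(X_train_PCA, y_train.values.ravel())
    print(clf_label, GSCV.best_params_, GSCV.best_score_)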
Example #21
def Randomforest(features, classes):
    #import libraries
    import sys
    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from sklearn import preprocessing
    from sklearn.feature_selection import RFECV
    from sklearn.metrics import roc_auc_score
    from imblearn.over_sampling import SMOTE
    from scipy import stats
    from sklearn.svm import SVC
    from sklearn.model_selection import StratifiedKFold
   
    #define empty arrays for results
    acc= []
    AUC = []
    
    # Optional: if using RFE for feature selection
    #define empty arrays for number of features and column numbers of selected features
    n_features=[]
    selected_features = np.zeros(features.shape[1])

    
    #define k for the number of iterations (cross-validation)
    k = 50
    #start K-fold loop
    for i in range(0,k):
        print([i],) # print each iteration number to make progress during the run visible
        sys.stdout.flush() # flush so the line appears immediately instead of sitting in a buffer
        
        # Train-test split, percentage of test group tuned by 'test_size'
        # random-state=i makes sure every iteration used a unique subset as testing group
        X_train, X_test, y_train, y_test = train_test_split(features, classes, test_size=0.1, random_state=i)
        
        # Optional for unbalanced classes: resample the training set with SMOTE so
        # all classes have equal counts (adds synthetic subjects). NOTE: the
        # resampled X_train_r/y_train_r are not actually used below.
        X_train_r, y_train_r = SMOTE().fit_sample(X_train, y_train)  # fit_resample in newer imblearn

        

        # Feature selection: Do t test for p < 0.05

        #name the 2 classes in the training set
        class_1 = X_train[y_train == 1]
        class_2 = X_train[y_train == 0]
        
        h,p = stats.ttest_ind(class_1, class_2, equal_var=False, nan_policy='omit')
        threshold = p < 0.05   # set threshold for P < 0.05
        p[threshold] = 0  # all significant p-values set to 0
        mask = p == 0    # define mask
        X_train = X_train[:,mask]
        X_test = X_test[:,mask]
              
        #standarization of training and testing data
        X_train_scaled = preprocessing.scale(X_train)
        X_test_scaled = preprocessing.scale(X_test)
                 
        # Optional:
        # Feature selection using recursive feature elimination (RFE)
        svc = SVC(kernel="linear") # classifier that provides information about feature importance

        selector = RFECV(estimator=svc, step=1, cv=StratifiedKFold(10)) # step = number of features removed per iteration; 10-fold cross-validation
        
        selector = selector.fit(X_train_scaled, y_train) #fit the RFE to the training set
        
        X_train_FS = selector.transform(X_train_scaled) #extract selected features from the training set
        X_test_FS = X_test_scaled[:,selector.support_]  #extract selected features from the testing set, according to outcome of RFE executed in the training set
        n_features.append(selector.n_features_)       #fill in number of selected features per iteration 
        selected_features[mask] = selected_features[mask]+selector.support_ # fill in which features are selected per iteration
    
        # RandomForest classification
        # NOTE: fits on the t-test-filtered features, not the RFE-reduced set
        t=100 # define number of trees
        clf = RandomForestClassifier(n_estimators=t) # define classifier
        clf = clf.fit(X_train_scaled, y_train) # fit classifier to the training data
        score = clf.score(X_test_scaled, y_test) # define accuracy score
        acc.append(score)  # fill in accuracy to accuracy-array 
        score_AUC = clf.predict_proba(X_test_scaled) # define Area under the ROC curve (AUC) score
        score_AUC = score_AUC[:,1]
        ROC_AUC = roc_auc_score(y_test, score_AUC)
        AUC.append(ROC_AUC)  #  fill in AUC to AUC-array
        
    # Print statements    

    print('accuracies: \n',acc)
    print('accuracies by a k fold CV Random Forest: '+ str(np.mean(acc)) + ' ( std : ' + str(np.std(acc)) + ' )' )
    print('AUCs: \n',AUC)
    print('AUC by a k fold CV Random Forest: '+ str(np.mean(AUC)) + ' ( std : ' + str(np.std(AUC)) + ' )' )
    
    # Define dataframe with performance scores
    df = pd.DataFrame({'acc_coarse': acc, 'AUC_coarse': AUC})
    
    # export outcomes to a csv_file on local map               
    df.to_csv('name_file.csv', encoding='utf-8', index=False)
Example #22
proba1 = [i for index, i in enumerate(knn.predict_proba(testX))]
print (pd.Series(proba1))'''


#Testing

test = pd.read_csv('data/test.csv', encoding='utf-8')

'''sk = SelectKBest(f_regression, k=60)
sk.fit(X,y)
X = sk.transform(X)
test = sk.transform(test)'''

rfe = RFECV(LinearSVC(), step=1)
rfe.fit(X, y)
X = rfe.transform(X)
test = rfe.transform(test)  

knn = KNeighborsClassifier(n_neighbors=90, leaf_size=10, p=2)
knn.fit(X, y)
pred = np.array(knn.predict(test))
proba = [i for index, i in enumerate(knn.predict_proba(test))]
print (pd.Series(proba))
probadf = pd.DataFrame(proba, columns=['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6', 'Class_7', 'Class_8', 'Class_9'])
# Build a 1-based id column aligned with probadf's rows (the original
# Series.drop(0) trick misaligned the indexes in the concat).
ids = pd.Series(np.arange(1, probadf.shape[0] + 1), name='id')
result = pd.concat([ids, probadf], axis=1)
result.to_csv('Submission.csv', header=True, index=None)

Example #23
import scipy as sp
  
  #select features using rfecv only on train data
  #rfe = RFE(estimator=classifier, cv=5,n_features_to_select=10,step=2)
  rfe = RFECV(estimator=classifier, cv=5,step=2, scoring='f1')
  print("going to select optimal features")
  rfe.fit(normalized_matrix_train, y_all[train])
  ranked_features=(rfe.ranking_).tolist()
  #print("shape of train matrix after rfe.fit is: " +str(normalized_matrix_train.shape))
  index=[]
  for i in range(0,len(ranked_features)):
      if ranked_features[i] == 1:
          index.append(i)
      
  print("index is"+str(index))
  
  # transform() returns the reduced matrix; the original discarded the result
  normalized_matrix_train = rfe.transform(normalized_matrix_train)
  #print("shape of transformed train matrix is: " +str(normalized_matrix_train.shape))
  classifier.fit(normalized_matrix_train,y_all[train])
  normalised_matrix_test = rfe.transform(normalised_matrix_test)
  #print("shape of transformed test matrix is: " +str(normalised_matrix_test.shape))
  probas_ = classifier.predict_proba(normalised_matrix_test)
  ##########  ADDING VARIABLES FOR CLASSIFICATION REPORT HERE ####################
 
  
  
  y_proba_report.extend(probas_)
  y_predicted2=(classifier.predict(normalised_matrix_test))     
  print("f1-score for this set of features is:  "+ str(f1_score(y_all[test],y_predicted2)))
  clf_score=classifier.score(normalised_matrix_test, y_all[test])
  print("score for this set of features is:  "+ str(clf_score))
  y_predicted_report.extend(y_predicted2)
Example #24
class trainModel(object):
    ''' model training - a-site prediction
    '''
    def __init__(self,
                 asiteFn=None,
                 cdsFn=None,
                 cdsIdxFn=None,
                 classifier="rf",
                 RelE=None):
        self.asiteFn = asiteFn
        self.cdsFn = cdsFn
        self.cdsIdxFn = cdsIdxFn
        self.classifier = classifier
        self.RelE = RelE

    def rfFit(self):
        self.training = pd.read_table(self.asiteFn + ".txt", header=0)
        # column names
        self.colNames = list(self.training.columns.values)
        self.colNames.remove("asite")
        self.X = np.array(pd.get_dummies(self.training[self.colNames]))
        self.y = np.array(self.training["asite"])
        ## feature selection
        self.clf = RandomForestClassifier(max_features=None, n_jobs=-1)
        self.clf = self.clf.fit(self.X, self.y)
        self.importances = self.clf.feature_importances_
        self.selector = RFECV(self.clf, step=1, cv=5)
        self.selector = self.selector.fit(self.X, self.y)
        self.sltX = self.selector.transform(self.X)
        print(
            "[result]\tOptimal number of features by recursive selection: %d" %
            self.selector.n_features_,
            flush=True)
        ## define a new classifier for reduced features
        self.reducedClf = RandomForestClassifier(max_features=None, n_jobs=-1)
        self.reducedClf = self.reducedClf.fit(self.sltX, self.y)
        ## cross validation
        scores = cross_val_score(self.reducedClf, self.sltX, self.y, cv=10)
        print("[result]\tAccuracy: %0.3f (+/- %0.3f)" %
              (scores.mean(), scores.std() * 2),
              flush=True)

    def rfImportance(self):
        ## compute the std and index for the feature importance
        std = np.std(
            [tree.feature_importances_ for tree in self.clf.estimators_],
            axis=0)
        idx = np.argsort(self.importances)[::-1]
        featureNames = (pd.get_dummies(
            self.training[self.colNames]).columns.values)
        importantFeatures = featureNames[idx]
        ## Plot the feature importances of the classifier
        plt.figure()
        plt.title("Feature importances")
        plt.bar(range(self.X.shape[1]),
                self.importances[idx],
                color=sns.xkcd_rgb["denim blue"],
                yerr=std[idx],
                align="center")
        plt.xticks(range(self.X.shape[1]),
                   importantFeatures,
                   rotation='vertical')
        plt.xlim([-1, 10])
        plt.ylim([0, 1])
        #plt.gca().tight_layout()
        plt.gcf()
        plt.savefig(self.asiteFn + ".feature_importances.pdf",
                    facecolor="white")

    def rfPredict(self):
        ## create df for cds
        self.cds = pd.read_table(self.cdsFn + ".txt", header=0)
        cdsX = np.array(pd.get_dummies(self.cds[self.colNames]))
        ## selected a subset of features and predict a-site
        sltcdsX = self.selector.transform(cdsX)
        self.cds["asite"] = self.reducedClf.predict(sltcdsX)

    def svmFit(self):
        ## grid search
        self.clf = svm.SVC()
        paramGrid = [{'C': [0.01, 0.1, 1, 10, 100, 1000, 10000]}]
        self.clfGs = GridSearchCV(estimator=self.clf,
                                  param_grid=paramGrid,
                                  n_jobs=-1)
        self.clfGs.fit(self.X, self.y)
        print("[result]\t best estimator parameters: c=",
              self.clfGs.best_estimator_.C,
              flush=True)
        ## model fitting and cross validation
        self.clf = svm.SVC(C=self.clfGs.best_estimator_.C)
        scores = cross_val_score(self.clf, self.X, self.y, cv=10)
        print("[result]\tAccuracy: %0.3f (+/- %0.3f)" %
              (scores.mean(), scores.std() * 2),
              flush=True)

    def rocCurve(self):
        ''' plotting multi-class roc curve
        '''
        # shuffle and split training and test sets
        clf = self.reducedClf if self.classifier == "rf" else self.clf
        self.OvrClf = OneVsRestClassifier(clf)
        classes = list(range(9, 19)) if not self.RelE else list(range(1, 9))
        self.y = label_binarize(self.y, classes=classes)
        nClasses = self.y.shape[1]
        X_train, X_test, y_train, y_test = train_test_split(self.X,
                                                            self.y,
                                                            test_size=.5,
                                                            random_state=0)
        if self.classifier == "rf":
            y_score = self.OvrClf.fit(X_train, y_train).predict_proba(X_test)
        else:
            y_score = self.OvrClf.fit(X_train,
                                      y_train).decision_function(X_test)
        # Compute ROC curve and ROC area for each class
        fpr, tpr, roc_auc = {}, {}, {}
        for i in range(nClasses):
            fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])
        # Compute micro-average ROC curve and ROC area
        fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(),
                                                  y_score.ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
        # Plot ROC curve
        sns.reset_orig()
        plt.clf()
        plt.figure()
        plt.plot(fpr["micro"],
                 tpr["micro"],
                 '--',
                 linewidth=3,
                 label='micro-average (area = {0:0.2f})'
                 ''.format(roc_auc["micro"]))
        for i in range(nClasses):
            pos = classes[i]
            plt.plot(fpr[i],
                     tpr[i],
                     label='A-site @ {0} (area = {1:0.2f})'
                     ''.format(pos, roc_auc[i]))
        #
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate', fontsize=18)
        plt.ylabel('True Positive Rate', fontsize=18)
        plt.tick_params(axis='both', which='major', labelsize=18)
        plt.legend(loc="lower right", fontsize=12)
        plt.gcf()
        plt.savefig(self.asiteFn + ".roc.pdf")

    def recoverAsite(self):
        ## adjust by the a-site location and calculate the a-site location in nt space, -1 is the missing value
        if not self.RelE:
            self.cds['a_start'] = np.where(
                self.cds['gene_strand'] == '+',
                (self.cds['start'] + self.cds['asite']), (-1)).astype(int)
            self.cds['a_end'] = np.where(
                self.cds['gene_strand'] == '+', (self.cds['a_start'] + 3),
                (self.cds['end'] - self.cds['asite'])).astype(int)
            self.cds['a_start'] = np.where(self.cds['gene_strand'] == '-',
                                           (self.cds['a_end'] - 3),
                                           (self.cds['a_start'])).astype(int)
        else:
            self.cds['a_start'] = np.where(
                self.cds['gene_strand'] == '+',
                (self.cds['end'] - self.cds['asite']), (-1)).astype(int)
            self.cds['a_end'] = np.where(
                self.cds['gene_strand'] == '+', (self.cds['a_start'] + 3),
                (self.cds['start'] + self.cds['asite'])).astype(int)
            self.cds['a_start'] = np.where(self.cds['gene_strand'] == '-',
                                           (self.cds['a_end'] - 3),
                                           (self.cds['a_start'])).astype(int)
        # remove start/end for reads
        self.cds.drop(['start', 'end'], axis=1, inplace=True)
        ## use a groupby to retrieve ribosome coverage at each A-site
        cnt = self.cds.groupby(["chrom", "a_start", "a_end", "strand"])
        cnt = cnt.size().reset_index(name="ribosome_count")
        ## left outer join the CDS index and the grouped counts to get ribosome counts at each position
        cdsIdx = pd.read_table(self.cdsIdxFn, header=0)
        riboCnt = pd.merge(cdsIdx,
                           cnt,
                           how="left",
                           left_on=["chrom", "start", "end", "gene_strand"],
                           right_on=["chrom", "a_start", "a_end", "strand"])
        riboCnt.drop(['a_start', 'a_end', 'strand'], axis=1, inplace=True)
        riboCnt["ribosome_count"].fillna(value=0, inplace=True)
        riboCnt["ribosome_count"] = riboCnt["ribosome_count"].astype(int)
        riboCnt = riboCnt.sort_values(by=["chrom", "start", "end"])
        riboCnt.to_csv(path_or_buf=self.cdsFn + '.model_input.txt',
                       sep='\t',
                       header=True,
                       index=False)
Exemplo n.º 25
0
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()
    print(confusion_matrix(y_true, y_pred))

    print(best_score, clf.best_score_)
    if i == 1:
        break
    else:
        best_score = clf.best_score_
        # prune features with RFECV before the next training round
        rfecv = RFECV(estimator=clf.best_estimator_, step=1, cv=2,
                      scoring='accuracy')
        rfecv.fit(X_train, y_train)
        print("Optimal number of features : %d" % rfecv.n_features_)
        X_train = rfecv.transform(X_train)
        X_test = rfecv.transform(X_test)

for j in range(5):
    print(j)
    magDict = {}
    with hdf.File('./truth/truth'+str(j).zfill(2)+'_Oii.hdf5', 'r') as f:
        dset = f['truth%s_Oii' % (str(j).zfill(2))]
        magDict['u'] = dset['OMAG'][:,0] # u band
        magDict['g'] = dset['OMAG'][:,1] # g band
        magDict['r'] = dset['OMAG'][:,2] # r band
        magDict['i'] = dset['OMAG'][:,3] # i band
        magDict['z'] = dset['OMAG'][:,4] # z band

    # we only want the g mag < 22 galaxies
    mask = np.where(magDict['g'] < 22)[0]
print("Reduced number of features:", features_kbest.shape[1])

#For the top n percent of features

fvalue_selector = SelectPercentile(f_classif, percentile=75)
features_kbest = fvalue_selector.fit_transform(features, target)

print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

#Recursively Eliminating Features

import warnings
from sklearn.datasets import make_regression
from sklearn import linear_model
from sklearn.feature_selection import RFECV

warnings.filterwarnings(action="ignore", module="scipy",
                        message="^internal gelsd")

features, target = make_regression(n_samples = 10000,
                                   n_features = 100,
                                   n_informative = 2,
                                   random_state = 1)

ols = linear_model.LinearRegression()

rfecv = RFECV(estimator=ols, step=1, scoring="neg_mean_squared_error")
rfecv.fit(features, target)
print(rfecv.transform(features))

print(rfecv.n_features_)

print(rfecv.support_)

print(rfecv.ranking_)
    elif classifier == "RandomForest":
        estimator = RandomForestClassifier(max_depth=2)
        #clf1 = RandomForestClassifier()
    elif classifier == "Adaboost":
        estimator = AdaBoostClassifier()
        #clf1 = AdaBoostClassifier()
    else:
        estimator = LogisticRegression()
        #clf1 = LogisticRegression()

    rfecv = RFECV(estimator, step=1, cv=StratifiedKFold(2))

    rfecv.fit(x_data, y_data)
    print('number of features selected:', rfecv.n_features_)

    x_new = rfecv.transform(x_data)

    #### Extract the important features ###########
    #selected_inds = rfecv.get_support(indices=True)
    #feat_coefs = rfecv.estimator_.coef_

    #print(feat_coefs)

    #selected_feats = [training_head[ind] for ind in selected_inds]
    #selected_vals = list(zip(selected_feats, feat_coefs[0]))

    #feature_frame_selected = pd.DataFrame(selected_vals, columns=['selected_features','Coefficients'])
    #feature_frame_selected = feature_frame_selected.sort_values(["ranking"],ascending=False)
    #feat_file = 'selected_features_rfecv_finalresults_1_19_19/withmetrics/final_feature_set_'+filename+"_"+cur_model+"_"+device+".csv"
    #feature_frame_selected.to_csv(feat_file)
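    # A minimal working version of the commented-out extraction above (a
    # sketch, not the original author's code). It assumes `training_head`
    # holds the original column names and that the wrapped estimator
    # exposes `coef_` (true for LogisticRegression; tree ensembles do not).
    # Note the commented version sorts on a nonexistent "ranking" column;
    # sorting on the coefficients themselves works.
    import pandas as pd

    selected_inds = rfecv.get_support(indices=True)
    feat_coefs = rfecv.estimator_.coef_[0]
    selected_feats = [training_head[ind] for ind in selected_inds]
    feature_frame_selected = pd.DataFrame(
        {'selected_features': selected_feats, 'Coefficients': feat_coefs})
    feature_frame_selected = feature_frame_selected.sort_values(
        'Coefficients', ascending=False)
    print(feature_frame_selected)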
Exemplo n.º 28
0
class FeatureVectorsClassifier(Model):
    """
        This class represents a task 1, subtask A model that trains pattern recognition classifiers
        on feature vectors extracted from the individual images.
    """
    def __init__(self, classifier, kwargs):
        """Constructs a supervised task1, subtask A model based on pattern recognition classifiers.

        Parameters:
            classifier      The provided supervised classifier.

            kwargs          The parameters for feature generation.
        """
        assert classifier in CLASSIFIERS
        self.classifier = classifier
        self.features = Features(**kwargs)
        self.scaler = StandardScaler()
        self.feature_preselector = SelectKBest(chi2, k=NUM_FEATURES)
        self.feature_selector = RFECV(
            self.classifier,
            scoring=SCORING,
            cv=StratifiedKFold(NUM_FOLDS, random_state=RANDOM_STATE),
            n_jobs=-1)

    def fit(self, videos):
        LOGGER.debug("Preparing training samples for %s ...", self)
        X = []
        y = []
        for video_num, video in enumerate(videos):
            LOGGER.debug("Processing video number %d / %d ...", video_num + 1,
                         len(videos))
            for screen in video.screens:
                for page in video.pages:
                    LOGGER.debug("Processing (%s, %s) ...", page, screen)
                    X.append(self.features.get_pairwise_features(page, screen))
                    y.append(1 if page in screen.matching_pages else 0)
                    LOGGER.debug("Done processing (%s, %s).", page, screen)
            LOGGER.debug("Done processing video number %d / %d.",
                         video_num + 1, len(videos))
        LOGGER.debug("Done preparing training samples for %s.", self)

        LOGGER.debug(
            "Fitting the feature preselector (%d samples, %d features) ...",
            len(X), len(X[0]))
        self.feature_preselector.fit(X, y)
        X = self.feature_preselector.transform(X)
        LOGGER.debug("Done fitting the feature preselector (%d features).",
                     X.shape[1])

        LOGGER.debug("Fitting the feature scaler ...")
        self.scaler.fit(X)
        X = self.scaler.transform(X)
        LOGGER.debug("Done fitting the feature scaler.")

        if self.classifier.__class__ != SVC:
            LOGGER.debug(
                "Fitting the feature selector (%d samples, %d features) ...",
                *X.shape)
            self.feature_selector.fit(X, y)
            X = self.feature_selector.transform(X)
            LOGGER.debug("Done fitting the feature selector. (%d features)",
                         X.shape[1])

        if self.classifier.__class__ in PARAM_GRIDS and self.classifier.__class__ != SVC:
            LOGGER.debug(
                "Optimizing the classifier parameters and fitting the classifier ..."
            )
            param_grid = PARAM_GRIDS[self.classifier.__class__]
            optimizer = GridSearchCV(self.classifier,
                                     param_grid,
                                     scoring=SCORING,
                                     refit=True,
                                     cv=StratifiedKFold(
                                         NUM_FOLDS, random_state=RANDOM_STATE))
            optimizer.fit(X, y)
            self.classifier = optimizer.best_estimator_
            LOGGER.debug(
                "Done optimizing the classifier parameters and fitting the classifier."
            )
        else:
            LOGGER.debug("Fitting the classifier ...")
            self.classifier.fit(X, y)
            LOGGER.debug("Done fitting the classifier.")

    def predict(self, observations):
        rankings = []
        for observation_num, (screen_video,
                              page_video) in enumerate(observations):
            LOGGER.debug("Processing observation number %d / %d ...",
                         observation_num + 1, len(observations))
            screens = screen_video.screens
            pages = page_video.pages
            for screen in screens:
                LOGGER.debug("Processing %s ...", screen)
                X = []
                for page in pages:
                    LOGGER.debug("Processing %s ...", page)
                    X.append(self.features.get_pairwise_features(page, screen))
                    LOGGER.debug("Done processing %s.", page)
                X = self.feature_preselector.transform(X)
                X = self.scaler.transform(X)
                if self.classifier.__class__ != SVC:
                    X = self.feature_selector.transform(X)
                ranking = self._predict_confidence(X)
                rankings.append(ranking)
                LOGGER.debug("Done processing %s.", screen)
            LOGGER.debug("Done processing observation number %d / %d.",
                         observation_num + 1, len(observations))
        return rankings

    def _predict_confidence(self, X):
        """Produces confidence scores for class 1 for each of the provided feature vectors.

        Parameters:
            X   The list of provided feature vectors."""
        assert "decision_function" in dir(self.classifier) \
            or "predict_proba" in dir(self.classifier)
        if "decision_function" in dir(self.classifier):
            confidence = self.classifier.decision_function(X)
        else:
            confidence = self.classifier.predict_proba(X)[:, 1]
        return confidence

    def _filename(self):
        return "%s.%s-%s-%s" % (__name__, self.__class__.__name__,
                                self.features.__repr__(),
                                self.classifier.__class__.__name__)

    def __repr__(self):
        return "Feature vectors classifier (%s, %s)" % (
            self.features, self.classifier.__class__.__name__)
Exemplo n.º 29
0
plt.show()

print(
    "\n--------------------------Recursive feature elimination-------------------------\n"
)

from sklearn.feature_selection import RFECV

rfe = RFECV(estimator=LogisticRegression(random_state=rs), cv=10)
rfe.fit(X_train_log, y_train_log)  # run the RFECV

# compare the number of variables before and after elimination
print("Original feature set", X_train_log.shape[1])
print("Number of features after elimination", rfe.n_features_)

X_train_sel_log = rfe.transform(X_train_log)
X_test_sel_log = rfe.transform(X_test_log)
print("Features sorted by their rank:")
print(sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), feature_names)))

# init grid search CV on transformed dataset
cv.fit(X_train_sel_log, y_train_log)

print(
    "\n--------------------------Test the best model-------------------------\n"
)

# test the best model
print("Train accuracy:", cv.score(X_train_sel_log, y_train_log))
print("Test accuracy:", cv.score(X_test_sel_log, y_test_log))
Exemplo n.º 30
0
import warnings
warnings.filterwarnings("ignore")

rfe2 = RFECV(estimator = DecisionTreeClassifier(random_state=rs, max_depth=8, min_samples_leaf=30), cv=10)
rfe2.fit(X_train, y_train)

print("Original feature set", X_train.shape[1])
print("number of features after elimination", rfe2.n_features_)
#Before: 100, after: 12


# In[790]:


#the dataset selected by RFE with the logistic regression model
X_train_sel = rfe.transform(X_train)
X_test_sel = rfe.transform(X_test)


# In[791]:


#the dataset selected by RFE with the decision tree model
X_train_nn = rfe2.transform(X_train)
X_test_nn = rfe2.transform(X_test)


# In[843]:


print(rfe2.support_)
Exemplo n.º 31
0
print(mean_squared_error(y_Train2, y_hat))      # train MSE
print(mean_squared_error(y_Test2, y_hatTest))   # test MSE
print(mean_absolute_error(y_Train2, y_hat))     # train MAE
print(mean_absolute_error(y_Test2, y_hatTest))  # test MAE
print(r2_score(y_Train2, y_hat))                # train R^2
print(r2_score(y_Test2, y_hatTest))             # test R^2

# In[98]:

# RandomForest Regressor y1
rfrmodel = RFECV(RandomForestRegressor(),
                 cv=3,
                 scoring='neg_mean_squared_error',
                 step=1)
rfrmodel.fit(x_Train, y_Train1)
x_rfr = rfrmodel.transform(x_Train)
print(x_Train.shape)
print(x_rfr.shape)
print(rfrmodel.support_)
y_hatRFR = rfrmodel.predict(x_Train)
y_hatRFRTest = rfrmodel.predict(x_Test)
print('Random Forest Regression Results (Y1)')
print(mean_squared_error(y_Train1, y_hatRFR))
print(mean_squared_error(y_Test1, y_hatRFRTest))
print(mean_absolute_error(y_Train1, y_hatRFR))
print(mean_absolute_error(y_Test1, y_hatRFRTest))
print(r2_score(y_Train1, y_hatRFR))
print(r2_score(y_Test1, y_hatRFRTest))

# In[99]:
Exemplo n.º 32
0
#%% ============================= Feature engineering ================================
# Dimensionality reduction
st = time.time()
pca = PCA(n_components=0.95, random_state=666)
feature_train_ = pca.fit_transform(feature_train_)
feature_validation_ = pca.transform(feature_validation_)
feature_test_ = pca.transform(feature_test_)
et = time.time()
print(f"Running time of pca is {et-st:.3f}")

# Select features with recursive feature elimination
st = time.time()
selector = RFECV(LinearSVC(random_state=666), step=0.2, cv=5, n_jobs=3)
selector = selector.fit(feature_train_, label_train)
feature_train_ = selector.transform(feature_train_)
feature_validation_ = selector.transform(feature_validation_)
feature_test_ = selector.transform(feature_test_)
et = time.time()
print(f"Running time of RFECV is {et-st:.3f}")

#%% ============================= Model training ================================
# Train a single model
model = LinearSVC(C=1, random_state=666)
model.fit(feature_train_, label_train)

# # Model fusion
# clf1 = LogisticRegression(random_state=666)
# clf2 = RidgeClassifier(random_state=666)
# clf3 = LinearSVC(C=1, random_state=666)
# clf4 = SVC(C=1, kernel="sigmoid")
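
# A minimal sketch of the model fusion the commented-out block suggests
# (an assumption, not the original author's code). Hard voting is used
# because RidgeClassifier and LinearSVC do not implement predict_proba.
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import LinearSVC, SVC

clf_ensemble = VotingClassifier(
    estimators=[("lr", LogisticRegression(random_state=666)),
                ("ridge", RidgeClassifier(random_state=666)),
                ("lsvc", LinearSVC(C=1, random_state=666)),
                ("svc", SVC(C=1, kernel="sigmoid"))],
    voting="hard")
clf_ensemble.fit(feature_train_, label_train)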
Exemplo n.º 33
0
def test_rfecv():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)   # regression test: list should be supported

    # Test using the score function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    assert len(rfecv.grid_scores_) == X.shape[1]
    assert len(rfecv.ranking_) == X.shape[1]
    X_r = rfecv.transform(X)

    # All the noisy variables were filtered out
    assert_array_equal(X_r, iris.data)

    # same in sparse
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Test using a customized loss function
    scoring = make_scorer(zero_one_loss, greater_is_better=False)
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, scoring=scoring)
    ignore_warnings(rfecv.fit)(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test using a scorer
    scorer = get_scorer('accuracy')
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, scoring=scorer)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test fix on grid_scores
    def test_scorer(estimator, X, y):
        return 1.0
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, scoring=test_scorer)
    rfecv.fit(X, y)
    assert_array_equal(rfecv.grid_scores_, np.ones(len(rfecv.grid_scores_)))
    # In the event of cross validation score ties, the expected behavior of
    # RFECV is to return the FEWEST features that maximize the CV score.
    # Because test_scorer always returns 1.0 in this example, RFECV should
    # reduce the dimensionality to a single feature (i.e. n_features_ = 1)
    assert rfecv.n_features_ == 1

    # Same as the first two tests, but with step=2
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=2)
    rfecv.fit(X, y)
    assert len(rfecv.grid_scores_) == 6
    assert len(rfecv.ranking_) == X.shape[1]
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=2)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Verifying that steps < 1 don't blow up.
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=.2)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)
Exemplo n.º 34
0
# plt.figure()
# plt.plot(lr_coef_mean.T, 'b', linewidth=1)
# plt.plot(lr_coef_mean.T + lr_coef_sem.T, 'b--', linewidth=1)
# plt.plot(lr_coef_mean.T - lr_coef_sem.T, 'b--', linewidth=1)
# plt.xticks(np.arange(0, 168, 1), labels, rotation='vertical')

# plt.margins(0.4)
# # Tweak spacing to prevent clipping of tick-labels
# plt.subplots_adjust(bottom=0.15)

rfecv = RFECV(estimator=lr_mean,
              step=1,
              cv=StratifiedKFold(9),
              scoring='roc_auc')
rfecv.fit(X, y)
X_rfecv = rfecv.transform(X)
rfecv_scores = cross_val_score(lr_mean,
                               X_rfecv,
                               y,
                               scoring="roc_auc",
                               cv=StratifiedKFold(9))

score_rfecv, perm_scores_rfecv, pvalue_rfecv = permutation_test_score(
    lr_mean,
    X_rfecv,
    y,
    scoring="roc_auc",
    cv=StratifiedKFold(9),
    n_permutations=2000,
    n_jobs=2)
Exemplo n.º 35
0
    def rfe(self,n=None,rfe_model=None):
        if isinstance(n, int):
            if rfe_model is None:
                from sklearn.linear_model import LogisticRegression
                rfe_model=LogisticRegression(solver='lbfgs')
                kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=self.random_state)
                selector = RFECV(estimator=rfe_model, min_features_to_select = n, cv=kfold, n_jobs=-1).fit(self.X,self.y)
                keep = [i for i in range(len(selector.support_)) if selector.support_[i]]
                self.X=self.X.iloc[:,keep]
            else:
                kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=self.random_state)
                selector = RFECV(estimator=rfe_model, min_features_to_select = n, cv=kfold, n_jobs=-1).fit(self.X,self.y)
                keep = [i for i in range(len(selector.support_)) if selector.support_[i]]
                self.X=self.X.iloc[:,keep]
        else:
            n = ''.join(n.split()).lower()
            n = n[:3]
            if n == 'opt':
                if rfe_model is None:
                    if self.X.shape[1]>1000:
                        steps=int((round(floor(self.X.shape[1]),-3)/1000)*8)
                    else:
                        steps=1
                    nof_list=np.arange(self.X.shape[1]-1,1,step=-steps)
                    #print(nof_list)
                    check_point=np.arange(1,self.X.shape[1]-1,step=floor(0.1*self.X.shape[1]))
                    #print(check_point)
                    high_score=0
                    #Variable to store the optimum features
                    n_best=0
                    score_list =[]
                    X_train, X_test, y_train, y_test = train_test_split(self.X,self.y, test_size = 0.2, random_state = self.random_state)
                    print("Optimizing...")
                    for i in nof_list:
                        #print("Testing n = ",i)
                        num_col=X_train.shape[1]
                        from sklearn.linear_model import LogisticRegression
                        rfe_model=LogisticRegression(solver='lbfgs')
                        kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=self.random_state)
                        selector = RFECV(estimator=rfe_model, min_features_to_select = i, cv=kfold, n_jobs=-1).fit(X_train,y_train)
                        X_train_rfe=selector.transform(X_train)
                        cols_kept = selector.get_support(indices=True)
                        drop_cols=set(np.arange(0,num_col))-set(cols_kept)
                        drop_cols=list(drop_cols)
                        X_test_rfe=selector.transform(X_test)
                        model=LogisticRegression(solver='lbfgs')
                        model.fit(X_train_rfe,y_train)
                        preds = model.predict(X_test_rfe)
                        score = roc_auc_score(y_test, preds)
                        score_list.append(score)
                        if score > high_score:
                            high_score = score
                            n_best = i
                        if i in check_point:
                            print(" Best n so far: {} \n Score: {} \n".format(n_best,high_score))
                        X_train=X_train.drop(X_train.columns[drop_cols],axis=1)
                        X_test=X_test.drop(X_test.columns[drop_cols],axis=1)
                    print("Optimal n: {} \n Score: {} \n".format(n_best,high_score))
                    rfe_model=LogisticRegression(solver='lbfgs')
                    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=self.random_state)
                    selector = RFECV(estimator=rfe_model, min_features_to_select = n_best, cv=kfold, n_jobs=-1).fit(self.X,self.y)
                    keep = [i for i in range(len(selector.support_)) if selector.support_[i]]
                    self.X=self.X.iloc[:,keep]


                else:
                    #allow for other model support
                    pass
Exemplo n.º 36
0
def main():
    # Adjust the file names before use.
    print('Place the training data under [./data]')
    print('Enter the training data file name (csv)')
    train_file = input('>> ')
    print('Place the validation data under [./data]')
    print('Enter the validation data file name (csv)')
    test_file = input('>> ')
    drop_columns = list()

    df = pd.read_csv('./data/' + train_file, ',')
    df_test = pd.read_csv('./data/' + test_file, ',')

    print(df.nunique(dropna=False))

    print('Enter the name of the target column')
    print(df.columns.values)
    ans_col = list()
    ans_col_name = input('>> ')
    ans_col.append(ans_col_name)

    # select the target column
    y_train = df.loc[:, ans_col]

    print('Enter the id column name of the validation data')
    print(df_test.columns.values)
    test_id = input('>> ')
    y_id = pd.DataFrame(df_test.loc[:, test_id])

    print(df.dtypes)
    print('Enter the names of columns to drop (separate multiple names with ",")')
    print(df.columns.values)
    drop_col = input('>> ')

    if drop_col != '':
        drop_index = drop_col.find(',')
        if drop_index != -1:
            drop_columns = drop_col.split(',')
        else:
            drop_columns = drop_col

    # drop unneeded columns
    if drop_columns != '':
        df = df.drop(drop_columns, axis=1)
        df_test = df_test.drop(drop_columns, axis=1)

    # remove the target column from the training data
    df = df.drop(ans_col, axis=1)

    # categorical variables
    list_category = list()
    for category_columns in df.columns:
        if df[category_columns].dtypes == object:
            list_category.append(category_columns)

    print('Categorical variables:')
    print(list_category)

    #####################
    #----- For the model -----#
    ####################
    print('Starting preprocessing for the model')
    df_ohe = one_hot_encoding(df, list_category)
    print('Size after one-hot encoding: ' + str(df_ohe.shape))
    imp = SimpleImputer()
    imp.fit(df_ohe)
    df_ohe = pd.DataFrame(imp.transform(df_ohe), columns=df_ohe.columns.values)

    rf = RandomForestClassifier(random_state=1)
    rf.fit(df_ohe, y_train)

    # feature selection
    #select = RFECV(RandomForestClassifier(n_estimators=100, random_state=1), min_features_to_select=10,step=0.05)
    select = RFECV(estimator=rf)
    select.fit(df_ohe, y_train)

    # size after feature selection
    X_train = select.transform(df_ohe)
    X_train = pd.DataFrame(X_train,
                           columns=df_ohe.columns.values[select.support_])
    print('Size after preprocessing: ' + str(X_train.shape))

    # feature importances
    importances = pd.DataFrame({
        "features": df_ohe.columns,
        "importances": rf.feature_importances_,
        "select": select.support_
    })
    print(importances)

    #####################
    #----- For scoring -----#
    ####################
    print('Starting preprocessing for scoring')
    df_test_ohe = one_hot_encoding(df_test, list_category)

    print('Size after one-hot encoding: ' + str(df_test_ohe.shape))

    # align columns with the model's feature set
    X_test = check_columns(X_train, df_test_ohe)

    imp.fit(X_test)
    X_test = pd.DataFrame(imp.transform(X_test), columns=X_test.columns.values)

    print('Size after preprocessing: ' + str(X_test.shape))

    #select_score = 'f1'
    select_score = 'roc_auc'

    scores = learn_and_score(X_train, y_train, select_score, train_file[:-4])

    print('Selected evaluation metric: ' + select_score)
    print('######## Evaluation results ########')
    print(pd.Series(scores).sort_values(ascending=False))

    print('Enter the abbreviation of the model to use for prediction')

    for key in scores.keys():
        print(key)

    modelname = ''
    model_num = input('>>')
    if model_num != '':
        modelname = model_num

    model = load_model('./model/' + train_file[:-4] + '_' + modelname +
                       '_learned.pkl')
    # predicted probabilities
    #pre = pd.DataFrame(model.predict_proba(X_test), columns=ans_col)
    # predicted labels
    pre = pd.DataFrame(model.predict(X_test), columns=ans_col)
    score = y_id.join(pre)
    if os.path.isdir('./pred') == False:
        os.mkdir('./pred')

    pred_name = './pred/' + test_file[:-4] + '_' + modelname + '_pred.csv'
    score.to_csv(pred_name, index=False)

    print('Saved the predictions to ' + pred_name)
Exemplo n.º 37
0
# Load libraries
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import datasets, linear_model
import warnings

# Suppress an annoying but harmless warning
warnings.filterwarnings(action="ignore",
                        module="scipy",
                        message="^internal gelsd")

# Generate features matrix, target vector, and the true coefficients
X, y = make_regression(n_samples=10000,
                       n_features=100,
                       n_informative=2,
                       random_state=1)

# Create a linear regression
ols = linear_model.LinearRegression()

# Create recursive feature eliminator that scores features by mean squared errors
rfecv = RFECV(estimator=ols, step=1, scoring='neg_mean_squared_error')

# Fit recursive feature eliminator
rfecv.fit(X, y)

# Recursive feature elimination
rfecv.transform(X)

# Number of best features
rfecv.n_features_
Exemplo n.º 38
0
    #select features using rfecv only on train data
    #rfe = RFE(estimator=classifier, cv=5,n_features_to_select=10,step=2)
    rfe = RFECV(estimator=classifier, cv=5, step=2, scoring='f1')
    print("going to select optimal features")
    rfe.fit(normalized_matrix_train, y_all[train])
    ranked_features = (rfe.ranking_).tolist()
    #print("shape of train matrix after rfe.fit is: " +str(normalized_matrix_train.shape))
    index = []
    for i in range(0, len(ranked_features)):
        if ranked_features[i] == 1:
            index.append(i)

    print("index is" + str(index))

    normalized_matrix_train = rfe.transform(normalized_matrix_train)
    #print("shape of transformed train matrix is: " +str(normalized_matrix_train.shape))
    classifier.fit(normalized_matrix_train, y_all[train])
    normalised_matrix_test = rfe.transform(normalised_matrix_test)
    #print("shape of transformed test matrix is: " +str(normalised_matrix_test.shape))
    probas_ = classifier.predict_proba(normalised_matrix_test)
    ##########  ADDING VARIABLES FOR CLASSIFICATION REPORT HERE ####################

    y_proba_report.extend(probas_)
    y_predicted2 = (classifier.predict(normalised_matrix_test))
    print("f1-score for this set of features is:  " +
          str(f1_score(y_all[test], y_predicted2)))
    clf_score = classifier.score(normalised_matrix_test, y_all[test])
    print("score for this set of features is:  " + str(clf_score))
    y_predicted_report.extend(y_predicted2)
    y_test_report.extend(y_all[test])
Exemplo n.º 39
0
def XGB_ModelBuilder(X_train, y_train, X_test, y_test, X_unknown=None):

    # XGB_ModelBuilder.py
    # Created by KAC on 02/12/2020
    """ This function takes in data and completes a grid search to tune parameters automatically. It then makes predictions
    and calculates an MAE score for those predictions."""

    import numpy as np
    import pandas as pd
    from sklearn.feature_selection import RFECV
    from sklearn.metrics import log_loss
    from xgboost import XGBClassifier as XGB
    from sklearn.model_selection import cross_val_score, RandomizedSearchCV
    from sklearn.metrics import make_scorer

    # scorer = make_scorer(log_loss, greater_is_better=False)
    XGB_model = XGB()
    selector = RFECV(estimator=XGB_model, scoring='neg_log_loss', cv=5)
    selector.fit(X_train, y_train)
    CV_score = cross_val_score(selector,
                               X_train,
                               y_train,
                               scoring='neg_log_loss',
                               cv=5)
    scr = np.mean(CV_score)
    print(
        pd.DataFrame({
            'Variable': X_train.columns,
            'Importance': selector.ranking_
        }).sort_values('Importance', ascending=True).head(50))
    print("Optimal number of features: ", selector.n_features_)
    print("Log Loss for All Features: ", scr)

    if selector.n_features_ < len(X_train.columns):
        X_train_transformed = selector.transform(X_train)
        X_test_transformed = selector.transform(X_test)

        CV_score = cross_val_score(selector,
                                   X_train_transformed,
                                   y_train,
                                   scoring='neg_log_loss',
                                   cv=5)
        scr = np.mean(CV_score)
        print("Log Loss for Selected Features on Training Data: ", scr)
    else:
        X_train_transformed = X_train
        X_test_transformed = X_test
        print(
            "Not optimal to remove features. Proceeding to parameter tuning.")

    # Current Best: {'subsample': 0.9, 'n_estimators': 250, 'min_child_weight': 2, 'max_depth': 8, 'learning_rate': 0.02, 'colsample_bytree': 0.85}
    parameters = {
        "learning_rate": [0.01, 0.015, 0.02, 0.025, 0.03],  #[0.01, 0.05, 0.1],
        "n_estimators": [250, 500, 600],  #[500, 750, 1000],
        "max_depth": [8, 9, 10, 12],  #[3, 6, 9],
        "min_child_weight": [2, 5, 8],  #[1, 2],
        "colsample_bytree": [0.7, 0.75, 0.8, 0.85],  #[0.5, 0.75, 1],
        "subsample": [0.9, 1]  #[0.5, 0., 1]
    }
    rsearch = RandomizedSearchCV(estimator=XGB_model,
                                 param_distributions=parameters,
                                 scoring='neg_log_loss',
                                 n_iter=250,
                                 cv=5)  #XGB_model
    rsearch.fit(X_train_transformed, y_train)
    print(rsearch.best_params_)

    CV_score = cross_val_score(rsearch,
                               X_train_transformed,
                               y_train,
                               scoring='neg_log_loss',
                               cv=5)
    scr = np.mean(CV_score)
    print(
        "Log Loss for Selected Features and Parameter Tuning on Training Data: ",
        scr)

    predictions = rsearch.predict_proba(X_test_transformed)

    pred_scr = round(log_loss(y_test, predictions), 5)
    print("2019 Score: ", pred_scr)

    if X_unknown is not None:
        X_final = pd.concat([X_train, X_test])
        X_final = selector.transform(X_final)
        y_final = pd.concat([y_train, y_test])

        X_unknown = selector.transform(X_unknown)

        rsearch.fit(X_final, y_final)
        predictions_final = rsearch.predict(X_unknown)

    else:
        predictions_final = []

    return predictions, predictions_final
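
# Hypothetical usage sketch (names assumed for illustration, not from the
# original source); X_unknown holds unlabeled rows to predict after the
# model is refit on all labeled data.
# predictions, predictions_final = XGB_ModelBuilder(
#     X_train, y_train, X_test, y_test, X_unknown=X_new)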
Exemplo n.º 40
0
# Before feature selection
#Logistic Regression
model = LogisticRegression(random_state=0)
df = df.append(performanceEvaluation(model, X, y, cv, len(X[0][:])))

#Random Forest
model = RandomForestClassifier(n_estimators = 100, random_state=0)
df = df.append(performanceEvaluation(model, X, y, cv, len(X[0][:])))


## Feature selection: Recursive Feature Elimination
#Logistic Regression
model = LogisticRegression(random_state=0)
rfecv = RFECV(estimator=model, step=1, cv=cv,scoring='accuracy')
rfecv.fit(X, y)
X_new = rfecv.transform(X)
df_rfe = df_rfe.append(performanceEvaluation(model, X_new, y, cv, len(X_new[0][:])))

# =============================================================================
#Comment out to get more insights

# #opt = rfecv.n_features_
# #num = rfecv.support_
# #sc = rfecv.grid_scores_
# #est = rfecv.estimator_
# =============================================================================
 

#Random Forest
model = RandomForestClassifier(n_estimators = 100, random_state=0)
rfecv = RFECV(estimator=model, step=1, cv=cv,scoring='accuracy')
Exemplo n.º 41
0
X = np.array([exposure.equalize_adapthist(item[0].reshape(lats.shape).T,
                                          clip_limit=0.03).ravel() for item\
              in sat_data]).T
Y = mask.ravel()
print("Train dataset is formed.")

# if no layers are defined, apply the recursive feature elimination procedure to the full set of features/layers
features_mask = None
if not any(SAT_LAYERS):
    print("Performing recursive feature ellimination...")
    rfecv_clf = RFECV(rf_clf,
                      step=1,
                      min_features_to_select=2,
                      scoring='f1',
                      n_jobs=3,
                      verbose=1)
    rfecv_clf.fit(X, Y)
    rf_clf = rfecv_clf.estimator_
    print("Selected features are: ", rfecv_clf.support_)
    features_mask = rfecv_clf.support_
else:
    print("Training the classifier... ")
    rf_clf.fit(X, Y)

if features_mask is None:
    scores = cross_val_score(rf_clf, X, Y, cv=5)
else:
    scores = cross_val_score(rf_clf, rfecv_clf.transform(X), Y, cv=5)

print("Score estimations are: ", scores)
Exemplo n.º 42
0
warnings.filterwarnings(action="ignore", module="scipy",
                        message="^internal gelsd")

# Generate the feature matrix, target vector, and true coefficients
features, target = make_regression(n_samples = 10000,
                                   n_features = 100,
                                   n_informative = 2,
                                   random_state = 1)

# Create a linear regression object
ols = linear_model.LinearRegression()

# Recursively eliminate features
rfecv = RFECV(estimator=ols, step=1, scoring="neg_mean_squared_error")
rfecv.fit(features, target)
rfecv.transform(features)


# In[33]:


# Number of best features
rfecv.n_features_


# In[34]:


# Which features are the best
rfecv.support_
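
# Hypothetical follow-up (assumes numpy; not part of the original recipe):
# convert the boolean support mask into the indices of the kept features.
import numpy as np
print(np.where(rfecv.support_)[0])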
Exemplo n.º 43
0
selector.fit(train_set, train_labels)

plt.plot(selector.grid_scores_)
plt.xlabel("Number of Feature")
plt.ylabel("Macro F1 Score")
plt.title("Feature Selection Scores")

print(selector.n_features_)

rankings = pd.DataFrame({
    "feature": list(train_set.columns),
    "rank": list(selector.ranking_)
}).sort_values("rank")
rankings.head(10)

train_selected = selector.transform(train_set)
test_selected = selector.transform(test_set)

selected_features = train_set.columns[np.where(selector.ranking_ == 1)]
train_selected = pd.DataFrame(train_selected, columns=selected_features)
test_selected = pd.DataFrame(test_selected, columns=selected_features)

model_results = cv_model(train_selected, train_labels, LinearSVC(), "LSVC-SEL",
                         model_results)
model_results = cv_model(train_selected, train_labels, GaussianNB(), "GNB-SEL",
                         model_results)
model_results = cv_model(
    train_selected, train_labels,
    MLPClassifier(hidden_layer_sizes=(32, 64, 128, 64, 32)), "MLP-SEL",
    model_results)
model_results = cv_model(train_selected, train_labels,
Exemplo n.º 44
0
from sklearn.metrics import accuracy_score, roc_auc_score

df1 = pd.read_csv('EventDetectionData.csv')
scores = []
for i in range(150, 200):
    score = []
    X_train, X_test, y_train, y_test = train_test_split(
        df1.iloc[:, 1:i], df1['target'], test_size=0.3,
        random_state=69)  # 70% training and 30% test

    log = LogisticRegression()

    rfecv = RFECV(estimator=log, step=1, cv=5, scoring='roc_auc')

    X_train_new = rfecv.fit_transform(X_train, y_train)
    X_test_new = rfecv.transform(X_test)

    j = rfecv.n_features_

    C_range = 10.**np.arange(-5, 1)

    penalty_options = ['l1', 'l2']

    param_grid = dict(C=C_range, penalty=penalty_options)

    grid = GridSearchCV(log, param_grid, cv=5, scoring='roc_auc')

    grid.fit(X_train_new, y_train)

    y_train_pred = grid.predict(X_train_new)
Exemplo n.º 45
0
labels = np.array(labels)
features = np.array(features)


### RFECV method

### RFECV method, trying 4 different classifiers: Logistic Regression, Decision Tree, Random Forest and Adaptive Boosting

### logistic regression

clf_Log = LogisticRegression(random_state = 14, C= 5, class_weight='balanced')
selectorCV_Log = RFECV(clf_Log, step=1, cv=5, scoring = 'f1')
selectorCV_Log.fit(features, labels)
refcv_figure(selectorCV_Log)
clf = selectorCV_Log.estimator_
features_new = selectorCV_Log.transform(features)
test_clf(clf, labels, features_new, folds = 1000)

param_grid = {"C": [0.01, 0.1, 1, 5, 10, 100, 1000],
              "penalty" : ['l1', 'l2']
        }

clf_Log_searchCV = GridSearchCV(clf, param_grid, scoring ='f1', cv=10)
clf_Log_searchCV.fit(features, labels)
print(clf_Log_searchCV.best_estimator_)
test_clf(clf_Log_searchCV.best_estimator_, labels, features_new, folds = 1000)


### Decision Tree

clf_DT = DecisionTreeClassifier(random_state= 32, class_weight='balanced')