Example #1
# Assumed imports for this snippet; load_grid and print_grid are helper functions from the original repo.
import numpy as np
from scipy.stats import pearsonr
from sklearn.feature_selection import RFECV
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error as mse


def train_test(X_train, Y_train, X_test, Y_test, cv_params, custom_grid=False):

    if custom_grid:
        random_grid = load_grid(custom_grid)
    else:
        alpha = np.linspace(30000, 20000, 500)
        #solver = ['svd', 'cholesky', 'lsqr']

        # Create the random grid
        random_grid = {'alpha': alpha}
        #'solver' : solver}
    print_grid(random_grid)
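    # NOTE: with RandomizedSearchCV commented out below, random_grid is only printed;
    # the active path wraps a fixed-alpha Ridge in RFECV for recursive feature elimination.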
    estimator = Ridge(alpha=90000)
    ridge_random = RFECV(estimator, step=500, cv=5, verbose=10)
    # Random search of parameters, using 3 fold cross validation,
    # search across 100 different combinations, and use all available cores
    #ridge_random = RandomizedSearchCV(selector, param_distributions = random_grid, n_iter = cv_params["n_iter"],
    #                                      cv = cv_params["cv"], verbose=10, random_state=42, n_jobs = cv_params["n_jobs"],
    #                                      pre_dispatch='2*n_jobs')
    ridge_random.fit(X_train, Y_train)

    best_grid_params = {'alpha': 30000}
    best_random = ridge_random.get_support()
    best_model_params = ridge_random.get_params()
    train_predictions = ridge_random.predict(X_train)
    test_predictions = ridge_random.predict(X_test)
    #metrics
    r_train = pearsonr(Y_train, train_predictions)
    r_test = pearsonr(Y_test, test_predictions)
    mse_train = mse(Y_train, train_predictions)
    mse_test = mse(Y_test, test_predictions)
    metrics = {
        "r_train": r_train,
        "r_test": r_test,
        "mse_train": mse_train,
        "mse_test": mse_test
    }
    print(f"pearsonr train: {r_train}")
    print(f"pearsonr test: {r_test}")
    print(f"mse train: {mse_train}")
    print(f"mse test: {mse_test}")
    print(best_model_params)
    return best_grid_params, best_model_params, train_predictions, test_predictions, metrics, {}
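A hypothetical call of train_test on synthetic data (the cv_params keys follow the commented-out RandomizedSearchCV; print_grid and load_grid must be available from the original repo's helpers):

import numpy as np
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(42)
X = rng.rand(200, 1000)
y = X[:, :5].sum(axis=1) + 0.1 * rng.randn(200)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)
cv_params = {"n_iter": 50, "cv": 5, "n_jobs": -1}
best_grid, best_model, train_preds, test_preds, metrics, _ = train_test(X_tr, y_tr, X_te, y_te, cv_params)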
Example #2
def stratShuffleSplitRFECVRandomForestClassification(
        nEstimators, iterator1, minSamplesSplit, maxFeatures, maxDepth, nFolds,
        targetDataMatrix, trainingData, trainingDataMatrix, SEED):
    '''

    :param nEstimators: This is the number of trees in the forest (typically 500-1000 or so)
    :param iterator1: This is the number of model iterations. For a breakdown of model structure, see the wiki
                      (it's clearly marked...somewhere)
    :param minSamplesSplit: the minimum number of samples required to split an internal node; the
                            default of 2 is quite small, and larger values usually generalize better.
    :param maxFeatures: max_features passed to the RandomForestClassifier
    :param maxDepth: max_depth passed to the RandomForestClassifier
    :param nFolds: number of folds for the StratifiedKFold used inside RFECV
    :param targetDataMatrix: array of class labels
    :param trainingData: DataFrame of the training features (used for its column names)
    :param trainingDataMatrix: array of training feature values, row-aligned with targetDataMatrix
    :param SEED: base random seed; multiplied by the iteration index to vary each split
    :return: None; results are printed and written to CSV files under ./outputFiles/
    '''
    import multiprocessing
    import numpy as np
    multiprocessing.cpu_count()
    # from helperFunctions import *
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn import metrics
    from sklearn import cross_validation
    from sklearn.feature_selection import RFECV
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.cross_validation import StratifiedShuffleSplit
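    # NOTE: sklearn.cross_validation was removed in scikit-learn 0.20; on current versions these
    # classes live in sklearn.model_selection and take the data in split() (see the sketch after this example).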

    # rfecv pre-allocation tables, seeding
    X_train = []
    X_holdout = []
    y_train = []
    y_holdout = []
    rfecvGridScoresAll = []
    optimumLengthAll = []
    # feature_names = []
    a = []
    rfc_all_f1 = []
    nameListAll = pd.DataFrame()
    optimumLengthAll = pd.DataFrame()
    classScoreAll = pd.DataFrame()
    classScoreAll2 = pd.DataFrame()
    classScoreAll3 = pd.DataFrame()
    featureImportancesAll = pd.DataFrame()
    rfecvGridScoresAll = pd.DataFrame()

    # Re-definition of the RFC to employ feature importance as a proxy for weighting to employ RFECV.
    class RandomForestClassifierWithCoef(RandomForestClassifier):
        def fit(self, *args, **kwargs):
            super(RandomForestClassifierWithCoef, self).fit(*args, **kwargs)
            self.coef_ = self.feature_importances_
            return self

    ## Re-creation of the RFC object with ranking proxy coefficients
    rfc = RandomForestClassifierWithCoef(n_estimators=nEstimators,
                                         min_samples_split=minSamplesSplit,
                                         bootstrap=True,
                                         n_jobs=-1,
                                         max_features=maxFeatures,
                                         oob_score=True,
                                         max_depth=maxDepth)

    ## Employ Recursive feature elimination with automatic tuning of the number of features selected with CV (RFECV)
    #
    for kk in range(0, iterator1):
        print "iteration no: ", kk + 1
        # Shuffle and split the dataset using a stratified approach to minimize the influence of class imbalance.
        SSS = StratifiedShuffleSplit(targetDataMatrix,
                                     n_iter=1,
                                     test_size=0.10,
                                     random_state=SEED * kk)
        for train_index, test_index in SSS:
            X_train, X_holdout = trainingDataMatrix[
                train_index], trainingDataMatrix[test_index]
            y_train, y_holdout = targetDataMatrix[
                train_index], targetDataMatrix[test_index]

        # Call the RFECV function. Additional splitting is done by stratification shuffling and splitting. 5 folds. 5 times,
        # with a random seed controlling the split.

        rfecv = RFECV(
            estimator=rfc,
            step=1,
            cv=StratifiedKFold(y_train,
                               n_folds=nFolds,
                               shuffle=True,
                               random_state=SEED * kk),
            scoring='accuracy'
        )  # Can  use 'accuracy' or 'f1' f1_weighted, f1_macro, f1_samples

        # First, the recursive feature elimination model is trained. This fits to the optimum model and begins recursion.
        rfecv = rfecv.fit(X_train, y_train)

        # Second, the cross-validation scores are calculated such that grid_scores_[i] corresponds to the CV score
        # of the i-th subset of features. In other words, from all the features to a single feature, the cross validation
        # score is recorded.
        rfecvGridScoresAll = rfecvGridScoresAll.append([rfecv.grid_scores_])

        # Third, the .support_ attribute reports whether the feature remains after RFECV or not. The possible parameters are
        # inspected by their ranking. Low ranking features are removed.
        supPort = rfecv.support_  # True/False values, where true is a parameter of importance identified by recursive alg.
        possParams = rfecv.ranking_
        min_feature_params = rfecv.get_params(deep=True)
        optimumLengthAll = optimumLengthAll.append([rfecv.n_features_])
        featureSetIDs = list(supPort)
        # print feature_names
        feature_names = list(trainingData.columns.values)
        namedFeatures = np.array(feature_names)

        # Loop over each item in the list of true/false values, if true, pull out the corresponding feature name and store
        # it in the appended namelist. This namelist is rewritten each time, but the information is retained.
        nameList = []  # Initialize a blank list to accept the names of features identified as 'True', or important.
        # print featureSetIDs
        # print len(featureSetIDs)
        for i in range(0, len(featureSetIDs)):
            if featureSetIDs[i]:
                nameList.append(feature_names[i])
            # features flagged False were eliminated by RFECV and are skipped
        nameList = pd.DataFrame(nameList)
        nameListAll = nameListAll.append(nameList)  # append the name list

        # Fourth, the training process begins anew, with the objective to trim to the optimum feature and retrain the model
        # without cross validation i.e., test the holdout set. The new training test set size for the holdout validation
        # should be the entire 90% of the training set (X_trimTrainSet). The holdout test set also needs to be
        # trimmed. The same transformation is performed on the holdout set (X_trimHoldoutSet).
        X_trimTrainSet = rfecv.transform(X_train)
        X_trimHoldoutSet = rfecv.transform(X_holdout)

        # Fifth, no recursive feature elimination is needed (it has already been done and the poor features removed).
        # Here the model is trained against the trimmed training set X's and corresponding Y's.
        rfc.fit(X_trimTrainSet, y_train)

        # Holdout test results are generated here.
        preds = rfc.predict(
            X_trimHoldoutSet
        )  # Predict the class from the holdout dataset. Previous call: rfecv.predict(X_holdout)
        print(preds)
        print(y_holdout)
        rfc_all_f1 = metrics.f1_score(y_holdout, preds,
                                      average='weighted')  # determine the F1
        rfc_all_f2 = metrics.r2_score(y_holdout,
                                      preds)  # determine the R^2 Score
        rfc_all_f3 = metrics.mean_absolute_error(
            y_holdout, preds
        )  # determine the MAE - Do this because we want to determine sign.

        # append the previous scores for aggregated analysis
        classScoreAll = classScoreAll.append([
            rfc_all_f1
        ])  # append the previous scores for aggregated analysis.
        classScoreAll2 = classScoreAll2.append([rfc_all_f2])
        classScoreAll3 = classScoreAll3.append([rfc_all_f3])
        refinedFeatureImportances = rfc.feature_importances_  # determine the feature importances for aggregated analysis.
        featureImportancesAll = featureImportancesAll.append(
            [refinedFeatureImportances])

    # Output file creation
    print(
        "List of Important Features Identified by Recursive Selection Method:")
    print(nameListAll)
    nameListAll.to_csv('./outputFiles/class_IFIRS.csv')
    nameListAll.count()

    print("f1 weighted score for all runs:")
    print(classScoreAll)
    classScoreAll.to_csv('./outputFiles/f1_score_all.csv')

    print("R^2 score for all runs:")
    print(classScoreAll2)
    classScoreAll2.to_csv('./outputFiles/class_Rsq_score_all.csv')

    print("MAE score for all runs:")
    print(classScoreAll3)
    classScoreAll3.to_csv('./outputFiles/class_MAE_score_all.csv')

    print("Optimal number of features:")
    print(optimumLengthAll)
    optimumLengthAll.to_csv('./outputFiles/class_optimum_length.csv')

    print("Selected Feature Importances:")
    print(featureImportancesAll)
    featureImportancesAll.to_csv(
        './outputFiles/class_sel_feature_importances.csv')

    print("mean_squared_error Grid Score for Increasing Features")
    print(rfecvGridScoresAll)
    rfecvGridScoresAll.to_csv('./outputFiles/class_rfecv_grid_scores.csv')
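Example #2 above targets the pre-0.20 scikit-learn API (sklearn.cross_validation). A rough sketch of the equivalent split and RFECV setup on current scikit-learn, reusing the names from the function above (not the original author's code):

from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.feature_selection import RFECV

# the splitter no longer takes y in the constructor; pass the data to split()
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.10, random_state=SEED * kk)
for train_index, test_index in sss.split(trainingDataMatrix, targetDataMatrix):
    X_train, X_holdout = trainingDataMatrix[train_index], trainingDataMatrix[test_index]
    y_train, y_holdout = targetDataMatrix[train_index], targetDataMatrix[test_index]

# StratifiedKFold is now parameterised by n_splits only; labels are supplied at fit time
rfecv = RFECV(
    estimator=rfc,
    step=1,
    cv=StratifiedKFold(n_splits=nFolds, shuffle=True, random_state=SEED * kk),
    scoring="accuracy",
)
rfecv.fit(X_train, y_train)
# rfecv.grid_scores_ was removed in scikit-learn 1.2; use rfecv.cv_results_["mean_test_score"] instead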
Example #4
model.fit(X, y)

#inspecting the fitted selector
# support_ = boolean mask of the selected features
support = model.support_
# ranking_ = feature ranking; selected features have rank 1
ranking = model.ranking_
# cross-validation scores, one score per number of selected features
# (grid_scores_ was removed in scikit-learn 1.2; use cv_results_["mean_test_score"] there)
grid_scores = model.grid_scores_
# number of selected features
selected_features = model.n_features_
# same as support_, a mask of the selected features
model.get_support()

#information about the model
model.get_params()
model.set_params()
model.estimator_.coef_

# Plot number of features VS. cross-validation scores
import matplotlib.pyplot as plt  # assumed import, not shown in the original snippet

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(model.grid_scores_) + 1), model.grid_scores_)
plt.show()

new_data = X.iloc[:, [
    1, 2, 3, 4, 7, 8, 9, 10, 12, 13, 15, 17, 18, 19, 21, 26, 27, 28, 29, 32,
    33, 34
]]
new_data['labels'] = df['label']
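Rather than hardcoding column positions, the same subset can be recovered from the selector's boolean mask; a small sketch assuming X is a DataFrame and model is the fitted RFECV from the snippet above:

selected_columns = X.columns[model.support_]   # names of the features RFECV kept
new_data = X.loc[:, selected_columns].copy()   # same rows/columns as model.transform(X), but keeps names
new_data['labels'] = df['label']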
Example #5
# Assumed import; in the original repo this is presumably a @classmethod on a helper class.
from sklearn.feature_selection import RFECV


def train_feature_reducer(cls, model, x_train, y_train):
    rfecv = RFECV(estimator=model, step=0.05, scoring='f1_macro', n_jobs=-1)  # , cv=StratifiedKFold)
    rfecv.fit(x_train, y_train)
    print("Optimal number of features : %d" % rfecv.n_features_)
    print("Params: {}".format(rfecv.get_params()))
    return rfecv
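A minimal, hypothetical call of the snippet above (the classifier and synthetic data are illustrative, not from the original repo; cls is unused, so None is passed in its place):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=500, n_features=40, n_informative=8, random_state=0)
clf = RandomForestClassifier(n_estimators=200, random_state=0)
reducer = train_feature_reducer(None, clf, X, y)
X_reduced = reducer.transform(X)  # keep only the columns RFECV selected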