Example #1
X.info()
#get all the labels
y1 = data[['effect']]

########################################################################################################################
#    prediction with FeatureBoost
########################################################################################################################

#Using XGBoost, each feature gets an importance score.
#Using SelectFromModel, we can keep the features whose score is higher than the average.
model = XGBClassifier()
model.fit(X, y1.values.ravel())
print('feature_importances of all')
print(model.feature_importances_)
selection = SelectFromModel(model, prefit=True)
select_X = selection.transform(X)
select_X = pd.DataFrame(select_X)
print(select_X)
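
#SelectFromModel.transform returns a bare ndarray, so the DataFrame above has
#lost the original column names. A minimal sketch to restore them, assuming X
#is a pandas DataFrame:
select_X.columns = X.columns[selection.get_support()]
print(select_X.columns.tolist())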

y1.info()
print(y1)
#Split the data into training and test sets; the ratio is 4:1.
X_train, X_test, y_train, y_test = train_test_split(select_X,
                                                    y1,
                                                    test_size=0.2,
                                                    random_state=0)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
#Feature scaling (standardization); decision-tree models do not depend on feature scaling
#stdsc=StandardScaler()
#X_train_std=stdsc.fit_transform(X_train)
#X_test_std=stdsc.fit_transform(X_test)
#Assess feature importance with a random forest
feat_labels = df_all.columns[1:-1]
forest = RandomForestClassifier(n_estimators=4,
                                max_depth=10,
                                n_jobs=-1,
                                random_state=0)
forest.fit(X_train, y_train.values.ravel())
score = forest.score(X_test, y_test)
forest_y_score = forest.predict_proba(X_test)
importances = forest.feature_importances_

model = SelectFromModel(forest, threshold=0.004, prefit=True)
X_new = model.transform(X_train)
forest.fit(X_new, y_train.values.ravel())

indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    #Assess feature importance via the mean impurity decrease averaged over the forest's trees
    print("%2d) %-*s %f" %
          (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))
#Visualize feature importance based on mean impurity decrease
plt.title('Feature Importance-RandomForest')
plt.bar(range(X_train.shape[1]),
        importances[indices],
        color='lightblue',
        align='center')
plt.xticks(range(X_train.shape[1]), feat_labels[indices], rotation=90)
Example #3
madelon_booster = AdaBoostClassifier(algorithm='SAMME',
                                     learning_rate=1,
                                     base_estimator=madelon_base,
                                     random_state=55)
adult_booster = AdaBoostClassifier(algorithm='SAMME',
                                   learning_rate=1,
                                   base_estimator=adult_base,
                                   random_state=55)
OF_booster = AdaBoostClassifier(algorithm='SAMME',
                                learning_rate=1,
                                base_estimator=OF_base,
                                random_state=55)

pipeM = Pipeline([('Scale', StandardScaler()),
                  ('Cull1',
                   SelectFromModel(RandomForestClassifier(random_state=1),
                                   threshold='median')),
                  ('Cull2',
                   SelectFromModel(RandomForestClassifier(random_state=2),
                                   threshold='median')),
                  ('Cull3',
                   SelectFromModel(RandomForestClassifier(random_state=3),
                                   threshold='median')),
                  ('Cull4',
                   SelectFromModel(RandomForestClassifier(random_state=4),
                                   threshold='median')),
                  ('Boost', madelon_booster)])

pipeA = Pipeline([('Scale', StandardScaler()), ('Boost', adult_booster)])

#
madelon_clf = basicResults(pipeM, madelon_trgX, madelon_trgY, madelon_tstX,
Example #4
from sklearn.datasets import load_iris
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

X, Y = load_iris(return_X_y=True)
print(X.shape)

model = ExtraTreesClassifier(n_estimators=50)
model.fit(X, Y)
print(model.feature_importances_)

sfModel = SelectFromModel(model, prefit=True)
X1 = sfModel.transform(X)
print(X1.shape)
print(X1)
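
# By default SelectFromModel keeps features whose importance is above the mean.
# A small sketch of tightening the cut-off via the threshold argument (both
# string forms below are standard options):
sfStrict = SelectFromModel(model, prefit=True, threshold='1.25*mean')
print(sfStrict.transform(X).shape)
sfMedian = SelectFromModel(model, prefit=True, threshold='median')
print(sfMedian.transform(X).shape)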
Example #5
lr.fit(X_train_selected, y_train)
print("Score with only selected features: %f" %
      lr.score(X_test_selected, y_test))

# %% [markdown] {"deletable": true, "editable": true}
# ### Model-based Feature Selection
# A somewhat more sophisticated method for feature selection is to use a supervised machine learning model and select features based on how important the model deems them. This requires the model to provide some way to rank the features by importance. This works for all tree-based models (which expose ``feature_importances_``) and for all linear models, where the coefficients can be used to measure how much influence a feature has on the outcome.
#
# Any of these models can be made into a transformer that does feature selection by wrapping it with the ``SelectFromModel`` class:

# %% {"deletable": true, "editable": true}
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

select = SelectFromModel(RandomForestClassifier(n_estimators=100,
                                                random_state=42),
                         threshold="median")

# %% {"deletable": true, "editable": true}
select.fit(X_train, y_train)
X_train_rf = select.transform(X_train)
print(X_train.shape)
print(X_train_rf.shape)

# %% {"deletable": true, "editable": true}
mask = select.get_support()
# visualize the mask. black is True, white is False
plt.matshow(mask.reshape(1, -1), cmap='gray_r')

# %% {"deletable": true, "editable": true}
X_test_rf = select.transform(X_test)
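
# %% {"deletable": true, "editable": true}
# A sketch of the natural follow-up, assuming ``lr`` is the LogisticRegression
# scored above:
score = lr.fit(X_train_rf, y_train).score(X_test_rf, y_test)
print("Score with features selected by the random forest: %f" % score)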
# "PRI.ACTIVE.ACCTS", "PRI.OVERDUE.ACCTS", "PRI.CURRENT.BALANCE", "PRI.SANCTIONED.AMOUNT",
# "PRI.DISBURSED.AMOUNT", "SEC.NO.OF.ACCTS", "SEC.ACTIVE.ACCTS", "SEC.OVERDUE.ACCTS", "SEC.CURRENT.BALANCE",
# "SEC.SANCTIONED.AMOUNT", "SEC.DISBURSED.AMOUNT", "PRIMARY.INSTAL.AMT", "SEC.INSTAL.AMT",
# "NEW.ACCTS.IN.LAST.SIX.MONTHS", "DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS"]
# model.get_booster().feature_names = x_FN
# plot_importance(model.get_booster())
# plt.show()

# print(thresholds)

n = 0
b_acc = 0

thresholds = np.sort(model.feature_importances_)
for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)

    select_x_train = selection.transform(x_train)
    selection_model = XGBClassifier()
    selection_model.fit(select_x_train, y_train)

    select_x_test = selection.transform(x_test)
    y_predict = selection_model.predict(select_x_test)

    acc = selection_model.score(select_x_test, y_test)
    acc_score = accuracy_score(y_test, y_predict)
    if acc > b_acc:
        n = select_x_train.shape[1]
        b_acc = acc
        L_selection = selection
        print("Thresh=%.3f, n=%d, acc: %.15f%%, acc_score: %.15f%%" %
Example #7
        trainRows = []
        for idx in trainidx:
            trainRows.append(orgMatrix[(mc + nmc)[idx]])

        trainingMat = np.array(trainRows)  # rows collected in the loop above

        # Build a forest and compute the feature importances
        forest = ExtraTreesClassifier(n_estimators=5,
                                      max_features=expectedFeatureCount)

        #forest = LinearSVC()

        forest.fit(trainingMat, trainRes)

        model = SelectFromModel(forest, prefit=True)
        X_new = model.transform(Dall)

        print(X_new.shape)

        print("Training")
        for org in mc + nmc:

            if org not in trainOrgs:
                continue

            orgidx = (mc + nmc).index(org)

            predClass = forest.predict(Dall[orgidx, :].reshape(1, -1))  # predict expects a 2-D row
            predProbs = 0  #forest.predict_proba(Dall[orgidx,])
Example #8
#Tree-based models
from sklearn.datasets import load_iris

iris = load_iris()
ix, iy = iris.data, iris.target
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel

model1 = ExtraTreesClassifier()
model2 = GradientBoostingClassifier()
model1.fit(ix, iy)
model2.fit(ix, iy)
model1.feature_importances_
model2.feature_importances_
clf1 = SelectFromModel(model1, prefit=True)
clf2 = SelectFromModel(model2, prefit=True)
clf1.get_support()
clf2.get_support()
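
# Mapping the boolean masks back to feature names (a small sketch using the
# iris feature names loaded above):
for name, keep in zip(iris.feature_names, clf1.get_support()):
    print(name, keep)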

#---
# sklearn cross-validation (sklearn.cross_validation has been removed;
# the current module is sklearn.model_selection)
from sklearn.model_selection import cross_val_score
#cross_val_score(model, X, y, cv=10)
from sklearn.model_selection import cross_val_predict
#cross_val_predict(model, X, y, cv=10)
from sklearn.model_selection import LeaveOneOut
#scores = cross_val_score(model, X, y, cv=LeaveOneOut())
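
# A minimal runnable sketch, reusing the iris data (ix, iy) and the
# ExtraTreesClassifier fitted above:
scores = cross_val_score(model1, ix, iy, cv=10)
print('mean CV accuracy:', scores.mean())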

# ---
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import roc_auc_score

import h5py
file = h5py.File('/Users/Tina/Documents/ml_proj1_data.h5', 'r')
file.keys()
xtest = file['xtest']
xtrain = file['xtrain']
xval = file['xval']
ytrain = file['ytrain']
yval = file['yval']

#______________________________________Feature selection
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
lsvc = LinearSVC(C=0.05, penalty="l1", dual=False).fit(xtrain, ytrain)
model = SelectFromModel(lsvc, prefit=True)
X_train_new = model.transform(xtrain)
X_val_new = model.transform(xval)
X_test_new = model.transform(xtest)

#_______________________________________scale all inputs : train,val & test in range [-1,1] for svm
from sklearn import preprocessing
import numpy as np
# Fit the scaler on the training set only, then apply the same scaling to the
# validation and test sets so all splits share a consistent scale.
max_abs_scaler = preprocessing.MaxAbsScaler()
xtrainScaled = max_abs_scaler.fit_transform(X_train_new)
xvalScaled = max_abs_scaler.transform(X_val_new)
xtestScaled = max_abs_scaler.transform(X_test_new)
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.ensemble import RandomForestClassifier
import numpy as np

rng = np.random.RandomState(1)
X = rng.randint(0, 2, (200, 20))
y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)

fs_univariate = SelectKBest(k=10)
fs_modelbased = SelectFromModel(RandomForestClassifier(n_estimators=100),
                                threshold='median')

fs_univariate.fit(X, y)
print('Features selected by univariate selection:')
print(fs_univariate.get_support())
print('')

fs_modelbased.fit(X, y)
print('Features selected by model-based selection:')
print(fs_modelbased.get_support())
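
# A sketch of why the two selectors can differ here: y is the XOR of features
# 0 and 1, so neither feature carries univariate signal on its own, while the
# random forest can credit the interaction.
print('Model-based mask for the two informative features (0 and 1):')
print(fs_modelbased.get_support()[[0, 1]])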
Example #11
        if key == 'xgboost':
            model = XGBClassifier(**params_XGboost[str(i)])
        elif key == 'catboost':
            model = CatBoostClassifier(**paramsCatBoost[str(i)])

        y = target.iloc[:, i]
        train_X, val_X, train_y, val_y = \
                     train_test_split(X_train, y, random_state=SEED,
                                      shuffle=True)
        model.fit(train_X.values, train_y.values)
        perm = PermutationImportance(model,
                                     cv=5,
                                     scoring='roc_auc',
                                     random_state=SEED)
        perm.fit(val_X.values, val_y.values)
        sel = SelectFromModel(perm, threshold=value['threshold'], prefit=True)
        X_train_transformed = sel.transform(X_train)
        X_test_transformed = sel.transform(X_test)

        prediction, cv_scores_mean = train_and_predict(model,
                                                       X_train_transformed,
                                                       y.values,
                                                       X_test_transformed, cv)

        cv_scores.append(cv_scores_mean)
        predictions.append(prediction)

    print(round(np.array(cv_scores).mean(), 5))
    write_to_submission_file(predictions, ID, value['filename'])

#simple blending
def main():

    data = pd.read_csv(
        'selfie_dataset.txt',
        sep=" ",
        header=None,
        names=[
            "Nome", "Rate", "partial_faces", "is_female", "baby", "child",
            "teenager", "youth", "middle_age", "senior", "white", "black",
            "asian", "oval_face", "round_face", "heart_face", "smiling",
            "mouth_open", "frowning", "wearing_glasses", "wearing_sunglasses",
            "wearing_lipstick", "2tongue_out0", "duck_face", "black_hair",
            "blond_hair", "brown_hair", "red_hair", "curly_hair",
            "straight_hair", "braid_hair", "showing_cellphone",
            "using_earphone", "using_mirror", "wearing_hat", "braces",
            "harsh_lighting", "dim_lighting"
        ])

    labels1 = np.array(data['Rate'])

    mx = max(labels1)
    mn = min(labels1)

    labels = []
    for i in labels1:
        if ((i >= 0) and (i < (mx + mn) / 5)):
            labels.append(1)
        elif ((i >= (mx + mn) / 5) and (i < 2 * (mx + mn) / 5)):
            labels.append(2)
        elif ((i >= 2 * (mx + mn) / 5) and (i < 3 * (mx + mn) / 5)):
            labels.append(3)
        elif ((i >= 3 * (mx + mn) / 5) and (i < 4 * (mx + mn) / 5)):
            labels.append(4)
        elif ((i >= 4 * (mx + mn) / 5) and (i < 5 * (mx + mn) / 5)):
            labels.append(5)

    features1 = data.drop("Rate", axis=1)
    features2 = features1.drop("Nome", axis=1)

    feature_list = list(features2.columns)
    features = np.array(features2)

    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.1, random_state=0)

    print('The shape of our train_features is:', train_features.shape)
    print('The shape of our test_features is:', test_features.shape)

    isTrained = False
    min_importance = 0.04
    n_estimators = 200
    retrain = True

    if isTrained:

        if retrain:
            crf = joblib.load("classifier.pkl")

            rf = SelectFromModel(crf, threshold=min_importance)
            rf.fit(train_features, train_labels)

            train_features = rf.transform(train_features)
            test_features = rf.transform(test_features)

            print('The shape of our important_train_features is:',
                  train_features.shape)
            print('The shape of our important_test_features is:',
                  test_features.shape)

            rf_important = RandomForestClassifier(n_estimators=n_estimators,
                                                  random_state=1)

            rf_important.fit(train_features, train_labels)

            rf = rf_important

            print(rf_important)
            print("\n\n")
            predictions = rf_important.predict(test_features)
            importances = list(rf_important.feature_importances_)
        else:
            rf = joblib.load("classifier.pkl")
            print(rf)
            print("\n\n")
            predictions = rf.predict(test_features)
            importances = list(rf.feature_importances_)

    else:

        rf = RandomForestClassifier(n_estimators=n_estimators,
                                    criterion="entropy",
                                    random_state=2)
        rf.fit(train_features, train_labels)
        joblib.dump(rf, 'classifier.pkl')

        print(rf)
        print("\n\n")
        predictions = rf.predict(test_features)
        importances = list(rf.feature_importances_)

    print('Mean Absolute Error:', mean_absolute_error(test_labels,
                                                      predictions))

    print('Train Accuracy:', rf.score(train_features, train_labels) * 100, '%')
    print('Test Accuracy:', rf.score(test_features, test_labels) * 100, '%')

    print("\n\n")

    print("Importances: \n")
    feature_importances = [
        (feature, round(importance, 4))
        for feature, importance in zip(feature_list, importances)
    ]
    feature_importances = sorted(feature_importances,
                                 key=lambda x: x[1],
                                 reverse=True)
    for pair in feature_importances:
        print('{} : {}'.format(*pair))

    print()
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

# Path names
main_path = root_path
byte1g_matrix_path = os.path.join(main_path, byte1g_matrix_path)
byte2g_matrix_path = os.path.join(main_path, byte2g_matrix_path)
byte3g_matrix_path = os.path.join(main_path, byte3g_matrix_path)

rfc = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)
sfm = SelectFromModel(rfc, threshold=2e-5)
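
# Typical usage of the selector defined above (a sketch; the feature matrices
# are built later in the original script, so these names are placeholders):
# sfm.fit(X_train, y_train)
# X_train_sel = sfm.transform(X_train)
# X_test_sel = sfm.transform(X_test)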


def roc_auc_score_multiclass(actual_class, pred_class, average="macro"):
    # creating a set of all the unique classes using the actual class list
    unique_class = set(actual_class)
    roc_auc_dict = {}
    for per_class in unique_class:
        # creating a list of all the classes except the current class
        other_class = [x for x in unique_class if x != per_class]

        # marking the current class as 1 and all other classes as 0
        new_actual_class = [0 if x in other_class else 1 for x in actual_class]
        new_pred_class = [0 if x in other_class else 1 for x in pred_class]

        # using the sklearn metrics method to calculate the roc_auc_score
Example #14
# print results
print('accuracy =', acc)
print(cr)

print('confusion matrix:')
print(cm)

## plot results
thresh = cm.max() / 2
cmdf = DataFrame(cm, index=['NoPain', 'Pain'], columns=['NoPain', 'Pain'])
sns.heatmap(cmdf, cmap='RdBu_r')
plt.xlabel('Predicted')
plt.ylabel('Observed')
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j + 0.5,
             i + 0.5,
             format(cm[i, j], 'd'),
             horizontalalignment="center",
             color="white")

model = SelectFromModel(logreg, prefit=True)
X_new = model.transform(X)
print(X_new.shape)

selector = RFE(logreg, n_features_to_select=1)
selector = selector.fit(X_train, y_train)
selector.support_
order = selector.ranking_
order
print(order)
Example #15
_, _, accuracy3 = model.evaluate(X_test, y_test)
print('Accuracy: %.2f' % (accuracy3 * 100))

# In[ ]:

plt.plot(history.history['mean_squared_error'])
plt.show()
plt.plot(history.history['accuracy'], color='red')
plt.show()

# ## Decision Tree

# In[ ]:

sel = SelectFromModel(RandomForestClassifier(n_estimators=100))
sel.fit(X, y)
sel.get_support()
selected_columns = X.columns[(sel.get_support())]
X_new = X[selected_columns]
length = len(selected_columns)
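
# In[ ]:

# The "Decision Tree" heading above has no tree fit in this excerpt; a minimal
# sketch on the selected features, assuming X_new and y from above:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

Xn_train, Xn_test, yn_train, yn_test = train_test_split(
    X_new, y, test_size=0.2, random_state=0)
tree = DecisionTreeClassifier(max_depth=5, random_state=0)
tree.fit(Xn_train, yn_train)
print('Decision tree accuracy: %.2f' % tree.score(Xn_test, yn_test))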

# In[ ]:

pd.Series(sel.estimator_.feature_importances_.ravel()).hist()

# In[ ]:

plt.scatter(x=X_new['MWG'], y=X_new['NWG'], c=y, cmap='rainbow')
plt.legend(y, prop={'size': 5})
Example #16
]

# For undersample
# Create a random forest classifier
clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)

# Train the classifier
clf.fit(X_train_undersample, y_train_undersample)

# Print the name and gini importance of each feature
for feature in zip(feat_labels, clf.feature_importances_):
    print(feature)

# Create a selector object that will use the random forest classifier to identify
# features that have an importance of more than 0.15
sfm = SelectFromModel(clf, threshold=0.15)

# Train the selector
sfm.fit(X_train_undersample, y_train_undersample)

# Print the names of the most important features
for feature_list_index in sfm.get_support(indices=True):
    print(feat_labels[feature_list_index])

# Transform the data to create a new dataset containing only the most important features
# Note: We have to apply the transform to both the training X and test X data.
X_important_train_undersample = sfm.transform(X_train_undersample)
X_important_test_undersample = sfm.transform(X_test_undersample)

# Create a new random forest classifier for the most important features
clf_important = RandomForestClassifier(n_estimators=10000,
Example #17
y_train = y_df[msk]
x_train = X_df[msk]
y_test = y_df[~msk]
x_test = X_df[~msk]
'''
train = df[msk]
test = df[~msk]
y_train = train.iloc[:, 0]
x_train = train.iloc[:, 1:]
y_test = test.iloc[:, 0]
x_test = test.iloc[:, 1:]
'''

# 101011110111 2048
rf = RandomForestClassifier(max_depth=100, n_estimators=1000)
embeded_rf_selector = SelectFromModel(rf, max_features=2048)

embeded_rf_selector.fit(X, y)

embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:, embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')
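
# A sketch of the usual follow-up, assuming X and y from above: refit the
# forest on just the selected columns.
X_selected = embeded_rf_selector.transform(X)
rf.fit(X_selected, y)
print('train accuracy on selected features:', rf.score(X_selected, y))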
'''
rf.fit(x_train, y_train.values.ravel())
y_pred_rf = rf.predict(x_test)
predictions_rf = y_pred_rf
accuracy_rf = accuracy_score(y_test, predictions_rf)
f1_rf = f1_score(y_test, predictions_rf)
precision_rf = precision_score(y_test, predictions_rf)
print(' Feature Set            | Accuracy                         | F1 measure                       |  Precision')
print('RandomForest            |', accuracy_rf, '|', f1_rf, '|', precision_rf, '|', )