# Collect the first character of each entry in column 'B' (assumed here to
# hold 'male'/'female'-style strings), then replace 'B' with a Gender column.
a = [s[0] for s in df['B']]
df.drop('B', axis=1, inplace=True)
df['Gender'] = a

# Encode Gender numerically: 'm' -> 1.0, anything else -> 0.0.
df['Gender'] = (df['Gender'] == 'm').astype(float)

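# Use the numeric Gender column as the target and drop it from the features.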
target = df['Gender']
target = list(target)
df.drop('Gender', axis=1, inplace=True)
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
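# Rank features with a random forest; SelectFromModel keeps those whose
# importance clears its default threshold (the mean importance).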
clf = RandomForestClassifier(n_estimators=50, max_features='sqrt')
clf.fit(df, target)
model = SelectFromModel(clf, prefit=True)
train_reduced = model.transform(df)
from sklearn.decomposition import PCA
# Guard: PCA cannot keep more components than the surviving features.
pca = PCA(n_components=min(50, train_reduced.shape[1]))
train_red = pca.fit_transform(train_reduced)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
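# LDA acts as the final classifier on the PCA-reduced features.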
model = LinearDiscriminantAnalysis()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_red,
                                                    target,
                                                    test_size=0.2)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))
Example #2
parameters = {
    'bootstrap': False,
    'min_samples_leaf': 4,
    'n_estimators': 50,
    'min_samples_split': 10,
    'max_features': 'sqrt',
    'max_depth': 5
}

model = RandomForestClassifier(**parameters)
model.fit(X_train_reduced, Y_train)

# In[133]:

output = model.predict(test1_reduced).astype(int)
model1 = round(model.score(X_train_reduced, Y_train) * 100, 2)
model1

# #### Applying the Random Forest Classifier. One can play with the parameters (hyperparameter tuning) to increase the score. I achieved 0.803 with less feature engineering; however, as I increased the number of dummies for age, it came down to 0.789.
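
# A minimal tuning sketch (my addition, not part of the original notebook);
# it assumes X_train_reduced / Y_train from the cells above and searches an
# illustrative grid around the hand-picked `parameters` dict.
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 8, None],
    'min_samples_leaf': [1, 4],
}
grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid.fit(X_train_reduced, Y_train)
print(grid.best_params_, round(grid.best_score_ * 100, 2))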

# In[131]:

output = model.predict(test1_reduced).astype(int)
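# Kaggle's Titanic competition expects a two-column CSV: PassengerId, Survived.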
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": output
})
submission.to_csv("titanic51_submission.csv", index=False)

#  I'll update as I improve; your guidance is appreciated. Also, thanks a lot for all the tutorials, from which I learned a lot.
Example #3
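# Plot each feature's importance from the fitted forest, then keep only
# features above the 0.005 importance threshold.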
features['importance'] = clf.feature_importances_
features.sort_values(by=['importance'], ascending=True, inplace=True)
features.set_index('feature', inplace=True)
features.plot(kind='barh', figsize=(20, 20))
#plt.show()

model = SelectFromModel(clf, threshold=0.005, prefit=True)
train_reduce = model.transform(train)
test_reduce = model.transform(test)
print(train_reduce.shape)
############################ Cross-validation (here: a fixed hold-out split at row 623)
train_x = train_reduce[:623]
train_cv = train_reduce[623:]
train_y = targets[:623].to_numpy()  # .as_matrix() was removed in pandas 1.0
train_cv_y = targets[623:].to_numpy()

from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble
from sklearn import svm
model = ensemble.GradientBoostingClassifier(n_estimators=50)
model.fit(train_x, train_y)
print(model.score(train_cv, train_cv_y))
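
# A k-fold alternative to the fixed split above (my sketch, not from the
# original); it reuses train_reduce and targets as defined earlier.
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(
    ensemble.GradientBoostingClassifier(n_estimators=50),
    train_reduce, targets, cv=5)
print("5-fold accuracy: {:.4f} +/- {:.4f}".format(cv_scores.mean(),
                                                  cv_scores.std()))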

output = model.predict(test_reduce).astype(int)
df_output = pd.DataFrame()
aux = pd.read_csv('test.csv')
df_output['PassengerId'] = aux['PassengerId']
df_output['Survived'] = output
df_output[['PassengerId', 'Survived']].to_csv('output.csv', index=False)
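
# Note: `acc`, `clfGB`, `x_test`, and `y_test` below come from cells that are
# not shown in this excerpt.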
print("Accuracy: {:.4f}".format(acc))
y_pred_ada = clfGB.predict(x_test)
cnf_matrix = confusion_matrix(y_test, y_pred_ada)
print(cnf_matrix)
# sklearn's binary confusion_matrix is [[TN, FP], [FN, TP]], so:
TPR = cnf_matrix[1][1] / (cnf_matrix[1][1] + cnf_matrix[1][0])          # TP / (TP + FN)
specificity = cnf_matrix[0][0] / (cnf_matrix[0][0] + cnf_matrix[0][1])  # TN / (TN + FP)
print("sensitivity (TPR):", TPR)
print("Accuracy (GradientBoostingClassifier):", acc)
print("specificity (1 - FPR):", specificity)

# Linear regression baseline

lm = linear_model.LinearRegression()
model = lm.fit(x_train, y_train.ravel())
predictions = lm.predict(x_test)
scoreLR = model.score(x_test, y_test)
print("linear regression:", scoreLR)
print(predictions[0:5])

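# The scores below (scoreAda, scoreRF, scoreKNC, scoreRFExtreme, scoreMLP)
# were computed in cells not shown in this excerpt.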
print("ADA boost: ", scoreAda)
print("RF: ", scoreRF)
#print("AdaBoost: ",scoreAB)
print("KNC: ", scoreKNC)
print("scoreRFExtreme: ", scoreRFExtreme)
print("MLP: ", scoreMLP)

#y_pred_ada=bdt.fit(x_train, y_train.ravel()).predict(x_test)
y_pred_ada = bdt.predict(x_test)

print(y_test[0:5], y_pred_ada[0:5])
cnf_matrix = confusion_matrix(y_test, y_pred_ada)
import numpy as np
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split


def main():

    data = pd.read_csv(
        'selfie_dataset.txt',
        sep=" ",
        header=None,
        names=[
            "Nome", "Rate", "partial_faces", "is_female", "baby", "child",
            "teenager", "youth", "middle_age", "senior", "white", "black",
            "asian", "oval_face", "round_face", "heart_face", "smiling",
            "mouth_open", "frowning", "wearing_glasses", "wearing_sunglasses",
            "wearing_lipstick", "2tongue_out0", "duck_face", "black_hair",
            "blond_hair", "brown_hair", "red_hair", "curly_hair",
            "straight_hair", "braid_hair", "showing_cellphone",
            "using_earphone", "using_mirror", "wearing_hat", "braces",
            "harsh_lighting", "dim_lighting"
        ])

    labels1 = np.array(data['Rate'])

    mx = max(labels1)
    mn = min(labels1)

    # Bucket the continuous Rate into five ordinal classes. The original
    # first bound used (mx + mn) / 3, which does not line up with the fifths
    # used below; (mx + mn) / 5 makes the bins contiguous, and the final else
    # catches the top edge so labels matches features in length.
    labels = []
    for i in labels1:
        if 0 <= i < (mx + mn) / 5:
            labels.append(1)
        elif i < 2 * (mx + mn) / 5:
            labels.append(2)
        elif i < 3 * (mx + mn) / 5:
            labels.append(3)
        elif i < 4 * (mx + mn) / 5:
            labels.append(4)
        else:
            labels.append(5)

    features1 = data.drop("Rate", axis=1)
    features2 = features1.drop("Nome", axis=1)

    feature_list = list(features2.columns)
    features = np.array(features2)

    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.1, random_state=0)

    print('The shape of our train_features is:', train_features.shape)
    print('The shape of our test_features is:', test_features.shape)

    isTrained = False
    min_importance = 0.04
    n_estimators = 200
    retrain = True

    if isTrained:

        if retrain:
            crf = joblib.load("classifier.pkl")

            rf = SelectFromModel(crf, threshold=min_importance)
            rf.fit(train_features, train_labels)

            train_features = rf.transform(train_features)
            test_features = rf.transform(test_features)

            print('The shape of our important_train_features is:',
                  train_features.shape)
            print('The shape of our important_test_features is:',
                  test_features.shape)

            rf_important = RandomForestClassifier(n_estimators=n_estimators,
                                                  random_state=1)

            rf_important.fit(train_features, train_labels)

            rf = rf_important

            print(rf_important)
            print("\n\n")
            predictions = rf_important.predict(test_features)
            importances = list(rf_important.feature_importances_)
        else:
            rf = joblib.load("classifier.pkl")
            print(rf)
            print("\n\n")
            predictions = rf.predict(test_features)
            importances = list(rf.feature_importances_)

    else:

        rf = RandomForestClassifier(n_estimators=n_estimators,
                                    criterion="entropy",
                                    random_state=2)
        rf.fit(train_features, train_labels)
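        # Persist the fitted model so later runs can set isTrained = True and
        # reload it instead of retraining.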
        joblib.dump(rf, 'classifier.pkl')

        print(rf)
        print("\n\n")
        predictions = rf.predict(test_features)
        importances = list(rf.feature_importances_)

    print('Mean Absolute Error:', mean_absolute_error(test_labels,
                                                      predictions))

    print('Train Accuracy:', rf.score(train_features, train_labels) * 100, '%')
    print('Test Accuracy:', rf.score(test_features, test_labels) * 100, '%')

    print("\n\n")

    print("Importances: \n")
    feature_importances = [
        (feature, round(importance, 4))
        for feature, importance in zip(feature_list, importances)
    ]
    feature_importances = sorted(feature_importances,
                                 key=lambda x: x[1],
                                 reverse=True)
    for pair in feature_importances:
        print('{} : {}'.format(*pair))

    print()