示例#1
0
        var = "test_" + s
        mean = scores[var].mean()
        std = scores[var].std()
        print("{:.<13s}{:>7.4f}{:>10.4f}".format(s, mean, std))

# In[30]:

# Fit a Gini decision tree capped at depth 8 on the training split; a node
# needs at least 5 samples to be split and every leaf keeps at least 5.
dtc = DecisionTreeClassifier(criterion='gini',
                             max_depth=8,
                             min_samples_split=5,
                             min_samples_leaf=5)
dtc = dtc.fit(X_train, y_train)

# Column names are read from X, not X_train -- assumes both share the same
# columns in the same order (TODO confirm against the code that builds them).
features = X.columns.values.tolist()
classes = [0, 1]
# An earlier manual loop that copied dtc.feature_importances_ into a list
# was superseded by the DataFrame built below.
FI_df = pd.DataFrame({
    'Feature': features,
    'Importance': dtc.feature_importances_
})
# Predicted labels for the validation split.
predictions = dtc.predict(X_validate)

print(
    '\n****************Decision Tree with Depth = 8 branches****************\n'
)
# Project helper (not shown in this view); presumably reports training vs.
# validation metrics for the fitted tree.
DecisionTree.display_binary_split_metrics(dtc, X_train, y_train, X_validate,
                                          y_validate)
示例#2
0
        print("{:.<13s}{:>7.4f}{:>10.4f}".format(s, mean, std))
        if mean > max_f1:
            max_f1 = mean
            best_depth = d

# Report the depth picked by the search loop above (the one whose mean score
# exceeded the running max_f1).
print("\nBest based on F1-Score")
print("Best Depth = ", best_depth)
# Evaluate the tree with the best depth: refit on the training split only,
# with leaf and split minimums of 5.
dtc = DecisionTreeClassifier(max_depth=best_depth,
                             min_samples_leaf=5,
                             min_samples_split=5)
dtc = dtc.fit(X_train_t, y_train_t)

print("\nDecision Tree")
print("\nDepth", best_depth)
# Project helper (not shown in this view); presumably reports training vs.
# validation metrics for the fitted tree.
DecisionTree.display_binary_split_metrics(dtc, X_train_t, y_train_t, \
                                     X_validate_t, y_validate_t)

# Huge false positive rate! We are missing most of the churn, so we need to
# move on to loss and retained functions to have any hope of modeling this data.
# Loss will be the monthly payments lost (i.e. churn not predicted) plus the
# cost of preventing churn (i.e. cost per true negative or false negative).
# Retained will be the monthly payments for true negatives (i.e. payments from
# churn prevented).

# Set up false-positive and false-negative costs for each transaction.
# A false positive gets a nominal cost (20% of the monthly charge): the
# customer goes through the churn-prevention program even though they
# won't churn.
fp_cost = np.array(df['MonthlyCharges'] * .2)
# A false negative costs the full monthly charge: the customer churns and
# we lose them.
fn_cost = np.array(df['MonthlyCharges'])
#Nominal true positive cost i.e. cost to run program to stop churn for
# Feature names and class labels reused below for the importance table and
# the exported tree diagram.
features = X.columns
classes = ['Good', 'bad']

# ### For decision tree of depth 5
#

# In[29]:

# Gini tree limited to depth 5; a node needs at least 5 samples to be split
# and every leaf keeps at least 5.
dtc_depth5 = DecisionTreeClassifier(criterion='gini',
                                    max_depth=5,
                                    min_samples_split=5,
                                    min_samples_leaf=5)
# Fitted on the full X, y (no hold-out split), and the metrics call below is
# given the same X, y, so it evaluates on the data the tree was trained on.
dtc_depth5 = dtc_depth5.fit(X, y)

DecisionTree.display_importance(dtc_depth5, features)
DecisionTree.display_binary_metrics(dtc_depth5, X, y)
# out_file=None makes export_graphviz return the DOT source as a string
# instead of writing it to a file.
dot_data_depth5 = export_graphviz(dtc_depth5,
                                  filled=True,
                                  rounded=True,
                                  class_names=classes,
                                  feature_names=features,
                                  out_file=None)

# Accuracy =  0.7780

# In[31]:

# Write tree to png file: this line only builds the graph object from the DOT
# text; the call that actually writes the PNG is not visible in this view.
graph_png = graph_from_dot_data(dot_data_depth5)
    for s in score_list:
        dtc_10 = cross_val_score(dtc, X, y, scoring=s, cv=10)
        mean = dtc_10.mean()
        std = dtc_10.std()
        mean_score.append(mean)
        std_score.append(std)
        print("{:.<13s}{:>7.4f}{:>10.4f}".format(s, mean, std))

print("Splitting the dataset and comparing Training and Validation:")
# 70/30 train/validation split with a fixed seed so the split is repeatable.
X_train, X_validate, y_train, y_validate = \
            train_test_split(X,y,test_size = 0.3, random_state=7)
# Fitting a tree with the best depth using the training data
# (depth 9 is hard-coded here; presumably it came from the search above).
dtc1 = DecisionTreeClassifier(criterion='entropy', max_depth=9)
dtc1 = dtc1.fit(X_train, y_train)
# list(X) yields X's column labels when X is a DataFrame -- TODO confirm type.
features = list(X)
DecisionTree.display_importance(dtc1, features)
# NOTE(review): only the validation split is passed here, although the
# banner above announces a training-vs-validation comparison.
DecisionTree.display_binary_metrics(dtc1, X_validate, y_validate)

# Disabled earlier variant that joined a separate sentiment-score frame:
#df_senti=pd.DataFrame(sentiment_score)
#df2=df.join(df_senti)
print("\nAverageSentiment by Cluster:")
#df2.columns=['url', 'Article', 'topic', 'T1', 'T2', 'T3', 'T4', 'T5', 'senti']
# Drop the identifier, the T1..T7 columns (presumably per-topic scores) and
# the other columns that should not appear in the per-topic averages.
df2=df.drop(['NhtsaID','T1','T2','T3','T4','T5','T6','mileage','T7','mph','Year',\
             'index'],axis=1)
# Mean of every remaining column within each 'topic' group.
# NOTE(review): if any remaining column is non-numeric, recent pandas
# releases raise here unless numeric_only=True is passed -- confirm dtypes.
df3 = df2.groupby(['topic']).mean()
print(df3)

## WORD CLOUDS
print("***************************************************************")
print("WordClouds........")
# misc_test:  0.124555555556
# FPR:  0.0383243028941
# =============================================================================

# 70/30 train/validation split with a fixed seed so the split is repeatable.
X_train, X_validate, y_train, y_validate = train_test_split(X,
                                                            np_y,
                                                            test_size=0.3,
                                                            random_state=12345)

# Decision Tree: depth capped at 25, leaves keep at least 3 samples, a node
# needs at least 5 samples to be split; seeded for reproducible fits.
dtc = DecisionTreeClassifier(max_depth=25,
                             min_samples_leaf=3,
                             min_samples_split=5,
                             random_state=12345)
dtc = dtc.fit(X_train, y_train)
# Project helper (not shown in this view); the commented output that follows
# this block suggests it prints a Training/Validation metrics table.
DecisionTree.display_binary_split_metrics(dtc, X_train, y_train, X_validate,
                                          y_validate)
# list(X) yields X's column labels when X is a DataFrame -- TODO confirm type.
features = list(X)
classes = ['No-Default', 'Default']

DecisionTree.display_importance(dtc, features)

# =============================================================================
# #Output
# Model Metrics..........       Training     Validation
# Observations...........          21000           9000
# Features...............             41             41
# Maximum Tree Depth.....             25             25
# Minimum Leaf Size......              3              3
# Minimum split Size.....              5              5
# Mean Absolute Error....         0.0552         0.1807
# Avg Squared Error......         0.0276         0.1502
示例#6
0
            best_max_features = f

# Report the forest configuration selected by the search loop above.
print("\nBest based on F1-Score")
print("Best Number of Estimators (trees) = ", best_estimator)
print("Best Maximum Features = ", best_max_features)

# Evaluate the random forest with the best configuration
# (70/30 split with a fixed seed so the split is repeatable).
X_train_t, X_validate_t, y_train_t, y_validate_t = \
            train_test_split(X_t, np_y_t,test_size = 0.3, random_state=12345)

# Refit the selected configuration on the training split only; the seed
# makes the bootstrap samples and feature draws reproducible.
rfc = RandomForestClassifier(n_estimators=best_estimator, criterion="gini", \
                    max_depth=100, min_samples_split=2, \
                    min_samples_leaf=1, max_features=best_max_features, \
                    n_jobs=1, bootstrap=True, random_state=12345)
rfc = rfc.fit(X_train_t, y_train_t)
# `col` is defined outside this view; assumes it lists the columns of X_t in
# order -- TODO confirm.
DecisionTree.display_importance(rfc, col)

#Decision Tree Models
# Cross Validation
depth_list = [6, 7, 8, 10, 12]
max_f1 = 0
for d in depth_list:
    print("\nMaximum Tree Depth: ", d)
    dtc = DecisionTreeClassifier(max_depth=d, min_samples_leaf=5, \
                                 min_samples_split=5)
    dtc = dtc.fit(X_t, y_t)
    scores = cross_validate(dtc, X_t, y_t, scoring=score_list, \
                            return_train_score=False, cv=10)

    print("{:.<13s}{:>6s}{:>13s}".format("Metric", "Mean", "Std. Dev."))
    for s in score_list:
            best_estimator    = e
            best_max_features = f

# Report the random-forest configuration selected by the search loop above.
print("\nBest based on F1-Score")
print("Best Number of Estimators (trees) = ", best_estimator)
print("Best Maximum Features = ", best_max_features)

# Refit the selected configuration on the training split only.
# class_weight='balanced' reweights classes inversely to their frequency,
# and the fixed seed keeps the bootstrap samples reproducible.
rfc_train = RandomForestClassifier(
    n_estimators=best_estimator,
    criterion="gini",
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=best_max_features,
    n_jobs=1,
    bootstrap=True,
    random_state=12345,
    class_weight='balanced',
).fit(X_train, y_train)

## Validating on the test data
print("\nTraining Data\nRandom Selection of 70% of Original Data")
DecisionTree.display_binary_split_metrics(rfc_train, X_train, y_train,
                                          X_validate, y_validate)
# Show importances for the forest that was just fitted and evaluated.
# The previous code passed `rfc`, which is not the estimator trained in
# this block, so the table could describe a different model.
DecisionTree.display_importance(rfc_train, encoded_df1.columns)


## Cross validation on Decision Trees
max_depth=[5,6,7,8,10,12,15,20,25]
for i in max_depth:
    dtc = DecisionTreeClassifier(criterion='gini', max_depth=i, \
    min_samples_split=5, min_samples_leaf=5)
    dtc = dtc.fit(X,y)
    score_list = ['accuracy', 'recall', 'precision', 'f1']
    mean_score = []
    std_score = []
    print("For max_depth=",i)
    print("{:.<13s}{:>6s}{:>13s}".format("Metric", "Mean", "Std. Dev."))
    for s in score_list:
        Matrix[a,2] = scores[Var_names[1]].mean()
        Matrix[a,3] = scores[Var_names[1]].std()
        Matrix[a,4] = scores[Var_names[2]].mean()
        Matrix[a,5] = scores[Var_names[2]].std()
        Matrix[a,6] = scores[Var_names[3]].mean()
        Matrix[a,7] = scores[Var_names[3]].std()
        a = a+1

# Row labels for the comparison table, one per evaluated model configuration
# (28 labels), so Matrix is expected to have 28 rows -- TODO confirm.
Index = ['Logistic regression','DT_Max_Depth_1','DT_Max_Depth_2','DT_Max_Depth_3','DT_Max_Depth_4',        'DT_Max_Depth_5','DT_Max_Depth_6','DT_Max_Depth_7','DT_Max_Depth_8','DT_Max_Depth_9',        'DT_Max_Depth_10','NN_1L_3P','NN_1L_11P','NN_2L_5P_4P','NN_2L_6P_5P','NN_2L_7P_6P',        'RF_10T_autoF','RF_10T_0.3F','RF_10T_0.5F','RF_10T_0.7F','RF_15T_autoF','RF_15T_0.3F',        'RF_15T_0.5F','RF_15T_0.7F','RF_20T_autoF','RF_20T_0.3F','RF_20T_0.5F','RF_20T_0.7F'] 
# Score_names (defined outside this view) supplies the column labels.
Model_Comparison = pd.DataFrame(Matrix, index = Index, columns = Score_names) 
print(Model_Comparison)



print('\n\n\n\n****Selecting Best Model****\n\n')
# 70/30 train/validation split with a fixed seed so the split is repeatable.
X_train, X_validate, y_train, y_validate = train_test_split(
    X, y, test_size=0.3, random_state=7)

# Selected model: 15-tree random forest where each split considers half of
# the features; unlimited depth, seeded for reproducible fits.
rfc = RandomForestClassifier(n_estimators=15, criterion="gini",
                             max_depth=None, min_samples_split=2,
                             min_samples_leaf=1, max_features=0.5,
                             n_jobs=1, bootstrap=True, random_state=12345)
rfc = rfc.fit(X_train, y_train)

# Feature labels for the importance table; assumes they match the column
# order of X -- TODO confirm.
col = ['Age', 'Credit_Limit',
       'Jun_Status', 'May_Status', 'Apr_Status', 'Mar_Status', 'Feb_Status',
       'Jan_Status',
       'Jun_Bill', 'May_Bill', 'Apr_Bill', 'Mar_Bill', 'Feb_Bill', 'Jan_Bill',
       'Jun_Payment', 'May_Payment', 'Apr_Payment', 'Mar_Payment',
       'Feb_Payment', 'Jan_Payment',
       'Jun_PayPercent', 'May_PayPercent', 'Apr_PayPercent', 'Mar_PayPercent',
       'Feb_PayPercent', 'Jan_PayPercent',
       'Gender',
       'Education0', 'Education1', 'Education2', 'Education3', 'Education4',
       'Education5',
       'Marital_Status0', 'Marital_Status1', 'Marital_Status2',
       'card_class0', 'card_class1']
DecisionTree.display_importance(rfc, col)

# Predictions on both splits, used for the report and confusion matrices.
y_tpredict = rfc.predict(X_train)
y_vpredict = rfc.predict(X_validate)
# Report on the validation predictions computed above. The previous code
# passed `y_predict`, a name this block never assigns.
print(classification_report(y_validate, y_vpredict))
CM_Train = pd.DataFrame(confusion_matrix(y_train, y_tpredict),
                        index=['Class 0 ', ' Class 1'],
                        columns=['Class 0 ', ' Class 1'])
CM_Test = pd.DataFrame(confusion_matrix(y_validate, y_vpredict),
                       index=['Class 0 ', ' Class 1'],
                       columns=['Class 0 ', ' Class 1'])
print('\n\nConfusion Matrix Training Set\n', CM_Train)
print('\n\nConfusion Matrix Validation Set\n', CM_Test)