# Tail of a per-metric reporting loop.  The enclosing `for` header lies before
# this excerpt, so the nesting of these four statements is assumed -- TODO
# confirm against the full script.
var = "test_" + s
mean = scores[var].mean()
std = scores[var].std()
print("{:.<13s}{:>7.4f}{:>10.4f}".format(s, mean, std))

# In[30]:

# Fit a Gini decision tree limited to depth 8 on the training partition.
dtc = DecisionTreeClassifier(
    criterion='gini',
    max_depth=8,
    min_samples_split=5,
    min_samples_leaf=5,
).fit(X_train, y_train)

features = X.columns.values.tolist()
classes = [0, 1]

# One row per feature: its name next to the importance reported by the
# fitted tree.
FI_df = pd.DataFrame(
    {
        'Feature': features,
        'Importance': dtc.feature_importances_,
    }
)

# Class predictions for the validation partition.
predictions = dtc.predict(X_validate)

print(
    '\n****************Decision Tree with Depth = 8 branches****************\n'
)
DecisionTree.display_binary_split_metrics(
    dtc, X_train, y_train, X_validate, y_validate
)
# Tail of the depth search.  The enclosing `for` headers lie before this
# excerpt, so the nesting of the first statements is assumed -- TODO confirm
# against the full script.
print("{:.<13s}{:>7.4f}{:>10.4f}".format(s, mean, std))
# NOTE(review): `mean` holds whichever metric was computed last; presumably
# that is F1 -- confirm the ordering of the score list.
if mean > max_f1:
    max_f1, best_depth = mean, d

print("\nBest based on F1-Score")
print("Best Depth = ", best_depth)

# Evaluate the tree with the best depth: refit on the training partition and
# report training vs. validation metrics.
dtc = DecisionTreeClassifier(
    max_depth=best_depth,
    min_samples_leaf=5,
    min_samples_split=5,
).fit(X_train_t, y_train_t)
print("\nDecision Tree")
print("\nDepth", best_depth)
DecisionTree.display_binary_split_metrics(
    dtc, X_train_t, y_train_t, X_validate_t, y_validate_t
)

# Huge false positive rate!  We are missing most of the churn, so we need to
# move on to loss and retained functions to have any hope of modeling this
# data.
# Loss will be the monthly payments lost, i.e. churn not predicted, plus the
# cost of preventing churn, i.e. cost per true negative or false negative.
# Retained will be the monthly payments for true negatives, i.e. payments
# from churn prevented.

# Set up false positive and false negative costs for each transaction.

# Nominal cost for a false positive (20% of the monthly charge), i.e. they
# get the thing even though they won't churn.
fp_cost = np.array(df['MonthlyCharges'] * .2)

# False negatives cost the full amount, i.e. they churn and we lose them.
fn_cost = np.array(df['MonthlyCharges'])

# Nominal true positive cost, i.e. cost to run program to stop churn for
# ... (the original comment is cut off at the end of this excerpt)
# Column labels of X double as feature names; the two class labels are used
# when rendering the tree.
features = X.columns
classes = ['Good', 'bad']

# ### For decision tree of depth 5
#
# In[29]:

# Fit a depth-5 Gini tree.  NOTE: the model is fitted and then scored on the
# same X, y, so the metrics printed below are in-sample.
dtc_depth5 = DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=5,
).fit(X, y)
DecisionTree.display_importance(dtc_depth5, features)
DecisionTree.display_binary_metrics(dtc_depth5, X, y)

# Render the fitted tree as DOT text (out_file=None returns it as a string).
dot_data_depth5 = export_graphviz(
    dtc_depth5,
    out_file=None,
    feature_names=features,
    class_names=classes,
    filled=True,
    rounded=True,
)
# Accuracy = 0.7780

# In[31]:

# Build the graph object from the DOT text, ready to be written to a PNG file.
graph_png = graph_from_dot_data(dot_data_depth5)
# 10-fold cross-validation of `dtc`, one scoring metric at a time.  The mean
# and standard deviation of each metric are collected and printed as a row.
# NOTE(review): `dtc`, `score_list`, `mean_score` and `std_score` are defined
# before this excerpt; the loop may itself be nested in an outer loop.
for s in score_list:
    dtc_10 = cross_val_score(dtc, X, y, scoring=s, cv=10)
    mean = dtc_10.mean()
    std = dtc_10.std()
    mean_score.append(mean)
    std_score.append(std)
    print("{:.<13s}{:>7.4f}{:>10.4f}".format(s, mean, std))

print("Splitting the dataset and comparing Training and Validation:")
# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_validate, y_train, y_validate = train_test_split(
    X, y, test_size=0.3, random_state=7
)

# Fitting a tree with the best depth using the training data
dtc1 = DecisionTreeClassifier(criterion='entropy', max_depth=9)
dtc1 = dtc1.fit(X_train, y_train)
features = list(X)
DecisionTree.display_importance(dtc1, features)
DecisionTree.display_binary_metrics(dtc1, X_validate, y_validate)

print("\nAverageSentiment by Cluster:")
# Drop identifier, topic-weight and vehicle columns, then average what is left
# within each topic.
df2 = df.drop(
    ['NhtsaID', 'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'mileage', 'T7', 'mph',
     'Year', 'index'],
    axis=1,
)
# numeric_only=True keeps the historical behaviour of averaging only numeric
# columns; since pandas 2.0 the default raises TypeError if any non-numeric
# column survives the drop above.
df3 = df2.groupby(['topic']).mean(numeric_only=True)
print(df3)

## WORD CLOUDS
print("***************************************************************")
print("WordClouds........")
# misc_test: 0.124555555556
# FPR: 0.0383243028941
# =============================================================================

# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_validate, y_train, y_validate = train_test_split(
    X,
    np_y,
    test_size=0.3,
    random_state=12345,
)

# Feature names are the column labels of X; class labels for the binary target.
features = list(X)
classes = ['No-Default', 'Default']

# Decision Tree: depth capped at 25, fitted on the training partition only.
dtc = DecisionTreeClassifier(
    max_depth=25,
    min_samples_leaf=3,
    min_samples_split=5,
    random_state=12345,
).fit(X_train, y_train)
DecisionTree.display_binary_split_metrics(
    dtc, X_train, y_train, X_validate, y_validate
)
DecisionTree.display_importance(dtc, features)

# =============================================================================
# #Output
# Model Metrics..........   Training   Validation
# Observations...........      21000         9000
# Features...............         41           41
# Maximum Tree Depth.....         25           25
# Minimum Leaf Size......          3            3
# Minimum split Size.....          5            5
# Mean Absolute Error....     0.0552       0.1807
# Avg Squared Error......     0.0276       0.1502
# Tail of the random-forest grid search.  The enclosing loop / `if` headers lie
# before this excerpt, so the nesting of this first statement is assumed --
# TODO confirm against the full script.
best_max_features = f

print("\nBest based on F1-Score")
print("Best Number of Estimators (trees) = ", best_estimator)
print("Best Maximum Features = ", best_max_features)

# Evaluate the random forest with the best configuration on a 70/30 split.
X_train_t, X_validate_t, y_train_t, y_validate_t = train_test_split(
    X_t, np_y_t, test_size=0.3, random_state=12345
)
rfc = RandomForestClassifier(
    n_estimators=best_estimator,
    criterion="gini",
    max_depth=100,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=best_max_features,
    n_jobs=1,
    bootstrap=True,
    random_state=12345,
).fit(X_train_t, y_train_t)
DecisionTree.display_importance(rfc, col)

# Decision Tree Models
# Cross Validation: 10-fold scores for each candidate depth.
depth_list = [6, 7, 8, 10, 12]
max_f1 = 0
for d in depth_list:
    print("\nMaximum Tree Depth: ", d)
    # NOTE(review): cross_validate is documented to fit clones of the
    # estimator, so this up-front fit on the full data probably does not
    # affect the scores -- confirm whether later code relies on it.
    dtc = DecisionTreeClassifier(
        max_depth=d,
        min_samples_leaf=5,
        min_samples_split=5,
    ).fit(X_t, y_t)
    scores = cross_validate(
        dtc, X_t, y_t, scoring=score_list, return_train_score=False, cv=10
    )
    print("{:.<13s}{:>6s}{:>13s}".format("Metric", "Mean", "Std. Dev."))
    for s in score_list:
        pass  # placeholder: the loop body lies beyond the end of this excerpt
# Tail of the random-forest grid search.  The enclosing loop / `if` headers lie
# before this excerpt, so the nesting of these two statements is assumed --
# TODO confirm against the full script.
best_estimator = e
best_max_features = f

print("\nBest based on F1-Score")
print("Best Number of Estimators (trees) = ", best_estimator)
print("Best Maximum Features = ", best_max_features)

# Refit the best configuration on the training partition, with balanced class
# weights.
rfc_train = RandomForestClassifier(
    n_estimators=best_estimator,
    criterion="gini",
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=best_max_features,
    n_jobs=1,
    bootstrap=True,
    random_state=12345,
    class_weight='balanced',
).fit(X_train, y_train)

## Validating on the test data
print("\nTraining Data\nRandom Selection of 70% of Original Data")
DecisionTree.display_binary_split_metrics(
    rfc_train, X_train, y_train, X_validate, y_validate
)
# Report importances for the model that was just fitted and evaluated
# (`rfc_train`); the original passed `rfc`, a different estimator object.
# NOTE(review): confirm that encoded_df1.columns matches the columns of
# X_train (same features, same order).
DecisionTree.display_importance(rfc_train, encoded_df1.columns)

## Cross validation on Decision Trees
# The metric list does not change between depths, so it is defined once here.
score_list = ['accuracy', 'recall', 'precision', 'f1']
max_depth = [5, 6, 7, 8, 10, 12, 15, 20, 25]
for i in max_depth:
    dtc = DecisionTreeClassifier(
        criterion='gini',
        max_depth=i,
        min_samples_split=5,
        min_samples_leaf=5,
    )
    dtc = dtc.fit(X, y)
    # Per-depth accumulators, reset on every iteration.
    mean_score = []
    std_score = []
    print("For max_depth=", i)
    print("{:.<13s}{:>6s}{:>13s}".format("Metric", "Mean", "Std. Dev."))
    for s in score_list:
        pass  # placeholder: the loop body lies beyond the end of this excerpt
# Tail of the model-comparison loop: store mean / std of three more metrics in
# row `a`, then advance to the next row.  The enclosing loop header lies
# before this excerpt, so the nesting of these statements is assumed -- TODO
# confirm against the full script.
Matrix[a,2] = scores[Var_names[1]].mean()
Matrix[a,3] = scores[Var_names[1]].std()
Matrix[a,4] = scores[Var_names[2]].mean()
Matrix[a,5] = scores[Var_names[2]].std()
Matrix[a,6] = scores[Var_names[3]].mean()
Matrix[a,7] = scores[Var_names[3]].std()
a = a+1

# Row labels for the comparison table, one per row of Matrix.
Index = [
    'Logistic regression',
    'DT_Max_Depth_1', 'DT_Max_Depth_2', 'DT_Max_Depth_3', 'DT_Max_Depth_4',
    'DT_Max_Depth_5', 'DT_Max_Depth_6', 'DT_Max_Depth_7', 'DT_Max_Depth_8',
    'DT_Max_Depth_9', 'DT_Max_Depth_10',
    'NN_1L_3P', 'NN_1L_11P', 'NN_2L_5P_4P', 'NN_2L_6P_5P', 'NN_2L_7P_6P',
    'RF_10T_autoF', 'RF_10T_0.3F', 'RF_10T_0.5F', 'RF_10T_0.7F',
    'RF_15T_autoF', 'RF_15T_0.3F', 'RF_15T_0.5F', 'RF_15T_0.7F',
    'RF_20T_autoF', 'RF_20T_0.3F', 'RF_20T_0.5F', 'RF_20T_0.7F',
]
Model_Comparison = pd.DataFrame(Matrix, index = Index, columns = Score_names)
print(Model_Comparison)

print('\n\n\n\n****Selecting Best Model****\n\n')
# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_validate, y_train, y_validate = train_test_split(
    X, y, test_size = 0.3, random_state=7
)

# Refit the selected configuration (15 trees, 50% of features per split) on
# the training partition.
rfc = RandomForestClassifier(
    n_estimators=15,
    criterion="gini",
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=0.5,
    n_jobs=1,
    bootstrap=True,
    random_state=12345,
)
rfc = rfc.fit(X_train, y_train)

# Feature names passed to the importance report.
# NOTE(review): presumably in the same order as the columns of X -- confirm.
col = [
    'Age', 'Credit_Limit',
    'Jun_Status', 'May_Status', 'Apr_Status', 'Mar_Status', 'Feb_Status',
    'Jan_Status',
    'Jun_Bill', 'May_Bill', 'Apr_Bill', 'Mar_Bill', 'Feb_Bill', 'Jan_Bill',
    'Jun_Payment', 'May_Payment', 'Apr_Payment', 'Mar_Payment', 'Feb_Payment',
    'Jan_Payment',
    'Jun_PayPercent', 'May_PayPercent', 'Apr_PayPercent', 'Mar_PayPercent',
    'Feb_PayPercent', 'Jan_PayPercent',
    'Gender',
    'Education0', 'Education1', 'Education2', 'Education3', 'Education4',
    'Education5',
    'Marital_Status0', 'Marital_Status1', 'Marital_Status2',
    'card_class0', 'card_class1',
]
DecisionTree.display_importance(rfc, col)

y_tpredict = rfc.predict(X_train)
y_vpredict = rfc.predict(X_validate)
# Report on the validation predictions made just above.  The original passed
# `y_predict`, which this excerpt never defines, so it would either raise
# NameError or score stale predictions from earlier in the script.
print(classification_report(y_validate, y_vpredict))

# Confusion matrices for both partitions, labelled by class.
CM_Train = pd.DataFrame(
    confusion_matrix(y_train, y_tpredict),
    index = ['Class 0 ', ' Class 1'],
    columns=['Class 0 ', ' Class 1' ],
)
CM_Test = pd.DataFrame(
    confusion_matrix(y_validate, y_vpredict),
    index = ['Class 0 ', ' Class 1'],
    columns=['Class 0 ', ' Class 1' ],
)
print('\n\nConfusion Matrix Training Set\n',CM_Train )
print('\n\nConfusion Matrix Validation Set\n',CM_Test )