# Fire dataset experiment: build features and two targets (one used by the
# classifiers, one by the regressor), split each into train/test sets, then
# fit and report a decision tree classifier, a decision tree regressor and a
# k-neighbors classifier.
features, target, target_reg = transformData(features_raw, target_raw)

# Shuffle and split the data to create train and test datasets.
# The same features are split twice, once per target; 0.3 is forwarded to
# splitData unchanged (presumably the test fraction -- confirm in the helper).
from projectFunctions import splitData
X_train, X_test, y_train, y_test = splitData(features, target, 0.3)
Xr_train, Xr_test, yr_train, yr_test = splitData(features, target_reg, 0.3)

from projectFunctions import (
    decTree,
    drawTree,
    kneighbors,
    decTreeReg,
    kneighbhorsReg,
)
sample_size = len(X_train)
feature_cols = features.columns

# Decision tree classifier, called with 'entropy' and 4 (criterion and depth);
# the fitted tree is drawn to fire_dt.png.
results, learner = decTree(
    sample_size, X_train, y_train, X_test, y_test, 'entropy', 4)
drawTree(learner, feature_cols, 'fire_dt.png')
print("Accuracy for Decision tree Classifier - Training, Test sets: %.5f, %.5f"
      % (results['acc_train'], results['acc_test']))
print("-----------------------------------------------------------------------")

# Decision tree regression on the regression target.
# NOTE(review): 'entropy' is passed to the regressor helper as well -- confirm
# that decTreeReg accepts (or ignores) this criterion.
# The scores are read from the 'acc_train'/'acc_test' keys and reported as R2.
results_dreg, learner_dreg = decTreeReg(
    Xr_train, yr_train, Xr_test, yr_test, 'entropy', 4)
print("R2 score for Decision tree regression -Training, Test sets: %.5f, %.5f"
      % (results_dreg['acc_train'], results_dreg['acc_test']))
print("-----------------------------------------------------------------------")

# K-neighbors classifier on the classification split.
resultsK = kneighbors(X_train, y_train, X_test, y_test)
print("Accuracy for K-Neighbors Classifier-Training, Test sets: %.5f, %.5f"
      % (resultsK['acc_train'], resultsK['acc_test']))
# Diabetes dataset experiment: name the columns, explore the data, then fit,
# draw and report a decision tree classifier.

# On a successful load, attach the column names.
# NOTE(review): only the column assignment is guarded here; the steps below
# still run (and would fail) when data is None -- confirm this is intended.
if data is not None:
    data.columns = col_names
    # Optional preview of the first record:
    # display(data.head(n=1))

# Explore the data.
from projectFunctions import exploreData
exploreData(data)

# Features are every column except 'skin' and 'label'; 'label' is the target.
drop_col = ['skin', 'label']
features = data.drop(drop_col, axis=1)
target = data['label']
# Optional preview of the first feature row:
# if features is not None:
#     display(features.head(n=1))

# Shuffle and split the data to create train and test datasets.
# 0.3 is forwarded to splitData unchanged (presumably the test fraction).
from projectFunctions import splitData
X_train, X_test, y_train, y_test = splitData(features, target, 0.3)

from projectFunctions import decTree, drawTree
sample_size = len(X_train)
feature_cols = features.columns

# Decision tree called with 'gini' and 3; the fitted tree is drawn to
# diabetes.png.
results, learner = decTree(
    sample_size, X_train, y_train, X_test, y_test, 'gini', 3)
drawTree(learner, feature_cols, 'diabetes.png')
print("Times for Training, Prediction: %.5f, %.5f"
      % (results['train_time'], results['pred_time']))
print("Accuracy for Training, Test sets: %.5f, %.5f"
      % (results['acc_train'], results['acc_test']))
print("-----------------------------------------------------------------------")
# Churn dataset experiment: fit and report a random forest, a decision tree,
# a k-neighbors classifier and an SVM classifier on the same train/test split.

# Random forest classifier; the helper also returns the features it ranked
# as important.
results, imp_features = randomForest(X_train, y_train, X_test, y_test)
print("Accuracy for Random forest Classifier - Training, Test sets: %.5f, %.5f"
      % (results['acc_train'], results['acc_test']))
print("-----------------------------------------------------------------------")

# Optional: restrict both sets to the important features from the random
# forest before fitting the remaining models.
# X_train = X_train[imp_features]
# X_test = X_test[imp_features]

# Decision tree classifier, called with 'entropy' and 4 (criterion and depth).
results, learner = decTree(
    sample_size, X_train, y_train, X_test, y_test, 'entropy', 4)
# Column names are UTF-8 encoded before being handed to drawTree.
feature_cols = [x.encode('utf-8') for x in X_train.columns]
drawTree(learner, feature_cols, 'churn.png')
print("Accuracy for Decision tree Classifier - Training, Test sets: %.5f, %.5f"
      % (results['acc_train'], results['acc_test']))
print("-----------------------------------------------------------------------")

# K-neighbors classifier.
resultsK = kneighbors(X_train, y_train, X_test, y_test)
print("Accuracy for K-Neighbors Classifier-Training, Test sets: %.5f, %.5f"
      % (resultsK['acc_train'], resultsK['acc_test']))
print("-----------------------------------------------------------------------")

# SVM classifier.
resultsS = svmClass(X_train, y_train, X_test, y_test)
print("Accuracy for SVM Classifier-Training, Test sets: %.5f, %.5f"
      % (resultsS['acc_train'], resultsS['acc_test']))
print("-----------------------------------------------------------------------")
# Fraud dataset experiment: transform the raw data, split it, then fit, draw
# and report one decision tree per split criterion.
from projectFunctions import transformData
features, target = transformData(features_raw, target_raw)
# Optional clean-up of NaNs in one column, currently disabled:
# features['NumSoftTissues'] = np.nan_to_num(features['NumSoftTissues'])

# Shuffle and split the data to create train and test datasets.
# 0.3 is forwarded to splitData unchanged (presumably the test fraction).
from projectFunctions import splitData
X_train, X_test, y_train, y_test = splitData(features, target, 0.3)

from projectFunctions import decTree, drawTree
sample_size = len(X_train)
feature_cols = features.columns

# Same pipeline for 'gini' and then 'entropy', both called with 20 (depth).
# Each fitted tree is drawn to its own image; the output file names are kept
# exactly as they were (including the 'etropy' spelling).
# After the loop, results and learner refer to the 'entropy' run.
for criterion, tree_png in (('gini', 'ifraud_gini.png'),
                            ('entropy', 'ifraud_etropy.png')):
    results, learner = decTree(
        sample_size, X_train, y_train, X_test, y_test, criterion, 20)
    drawTree(learner, feature_cols, tree_png)
    print("Times for Training, Prediction: %.5f, %.5f"
          % (results['train_time'], results['pred_time']))
    print("Accuracy for Training, Test sets: %.5f, %.5f"
          % (results['acc_train'], results['acc_test']))
    print("-----------------------------------------------------------------------")