def make_gb_report(path="images/classification_report.png"):
    """Draw a Gaussian naive Bayes classification report and hand it to ``poof``.

    The splits come from ``make_dataset()``. The visualizer is fitted on the
    training split, scored on the test split, and ``path`` is forwarded to
    ``poof`` as ``outpath``.
    """
    train_features, test_features, train_labels, test_labels = make_dataset()

    # Only the axes are needed; the figure handle is discarded.
    axes = plt.subplots()[1]

    report = ClassificationReport(
        GaussianNB(),
        ax=axes,
        classes=['unoccupied', 'occupied'],
    )
    report.fit(train_features, train_labels)
    report.score(test_features, test_labels)
    report.poof(outpath=path)
def make_gb_report(path="images/classification_report.png"):
    """Draw a Gaussian naive Bayes classification report and hand it to ``poof``.

    The splits come from ``make_dataset()``. The visualizer is fitted on the
    training split, scored on the test split, and ``path`` is forwarded to
    ``poof`` as ``outpath``.
    """
    splits = make_dataset()
    train_features, test_features, train_labels, test_labels = splits

    # Only the axes are needed; the figure handle is discarded.
    axes = plt.subplots()[1]
    estimator = GaussianNB()

    report = ClassificationReport(
        estimator, ax=axes, classes=['unoccupied', 'occupied']
    )
    report.fit(train_features, train_labels)
    report.score(test_features, test_labels)
    report.poof(outpath=path)
def log_classification_report_chart(classifier, X_train, X_test, y_train, y_test, experiment=None):
    """Log classification report chart.

    Make sure you created an experiment by using ``neptune.create_experiment()`` before you use this method.

    Tip:
        Check `Neptune documentation <https://docs.neptune.ai/integrations/scikit_learn.html>`_ for the full example.

    Args:
        classifier (:obj:`classifier`):
            | Fitted sklearn classifier object
        X_train (:obj:`ndarray`):
            | Training data matrix
        X_test (:obj:`ndarray`):
            | Testing data matrix
        y_train (:obj:`ndarray`):
            | The classification target for training
        y_test (:obj:`ndarray`):
            | The classification target for testing
        experiment (:obj:`neptune.experiments.Experiment`, optional, default is ``None``):
            | Neptune ``Experiment`` object to control to which experiment you log the data.
            | If ``None``, log to currently active, and most recent experiment.

    Returns:
        ``None``

    Examples:
        .. code:: python3

            rfc = RandomForestClassifier()
            rfc.fit(X_train, y_train)

            neptune.init('my_workspace/my_project')
            exp = neptune.create_experiment()

            log_classification_report_chart(rfc, X_train, X_test, y_train, y_test, experiment=exp)
    """
    assert is_classifier(classifier), 'classifier should be sklearn classifier.'
    exp = _validate_experiment(experiment)

    fig = None
    try:
        fig, ax = plt.subplots()
        visualizer = ClassificationReport(classifier, support=True, is_fitted=True, ax=ax)
        visualizer.fit(X_train, y_train)
        visualizer.score(X_test, y_test)
        visualizer.finalize()
        exp.log_image('charts_sklearn', fig, image_name='Classification Report')
    except Exception as e:
        # Logging the chart is best-effort: report the problem and carry on.
        print('Did not log Classification Report chart. Error: {}'.format(e))
    finally:
        # Close the figure on the failure path too, so a failed chart does
        # not leave an open matplotlib figure behind.
        if fig is not None:
            plt.close(fig)
def DTC(X_train, y_train, X_test, y_test):
    """Fit a decision tree and print/plot its evaluation.

    Steps, in order: fit ``DecisionTreeClassifier(random_state=2)`` on the
    training data, print confusion matrix and accuracy for the train and
    test sets, show a confusion-matrix heatmap, print the AUC and draw the
    ROC curve via ``plot_roc_curve``, print the text classification report,
    and finally draw a Yellowbrick ``ClassificationReport``.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        None.
    """
    dtc = DecisionTreeClassifier(random_state=2)
    dtc.fit(X_train, y_train)

    print("DecisionTreeClassifier:train set")
    train_pred = dtc.predict(X_train)
    print("DecisionTreeClassifier:Confusion Matrix: ", confusion_matrix(y_train, train_pred))
    print("DecisionTreeClassifier:Accuracy : ", accuracy_score(y_train, train_pred) * 100)

    print("DecisionTreeClassifier:Test set")
    test_pred = dtc.predict(X_test)
    # Computed once and reused for both the printout and the heatmap.
    test_matrix = confusion_matrix(y_test, test_pred)
    print("DecisionTreeClassifier:Confusion Matrix: ", test_matrix)
    print("DecisionTreeClassifier:Accuracy : ", accuracy_score(y_test, test_pred) * 100)

    # Confusion matrix heatmap
    class_names = [0, 1]
    fig, ax = plt.subplots()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names)
    plt.yticks(tick_marks, class_names)
    sns.heatmap(pd.DataFrame(test_matrix), annot=True, cmap="YlGnBu", fmt='g')
    ax.xaxis.set_label_position("top")
    plt.tight_layout()
    plt.title('Confusion matrix', y=1.1)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()

    # ROC / AUC, using the probability of the second class column.
    probs = dtc.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, probs)
    print('AUC: %.2f' % auc)
    # roc_curve is fed label-encoded targets.
    encoded_y_test = preprocessing.LabelEncoder().fit_transform(y_test)
    fpr, tpr, thresholds = roc_curve(encoded_y_test, probs)
    plot_roc_curve(fpr, tpr)

    # Classification report (text, then Yellowbrick chart).
    # NOTE(review): display names are given as Yes/No in this order; confirm
    # that this matches the sorted order of the actual target labels.
    class_labels = ['Yes', 'No']
    print(classification_report(y_test, test_pred, target_names=class_labels))
    visualizer = ClassificationReport(dtc, classes=class_labels, support=True)
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.poof()
def RF_Model(X, Y, X1, Y1):
    """Fit a random forest and print/plot its evaluation.

    Fits ``RandomForestClassifier()`` on ``(X, Y)``, evaluates on
    ``(X1, Y1)``, stores the value returned by ``cal_accuracy`` in the
    module-level ``acc1``, shows a confusion-matrix heatmap, prints the AUC,
    draws the ROC curve via ``plot_roc_curve`` and draws a Yellowbrick
    ``ClassificationReport``.

    Args:
        X: Training features.
        Y: Training labels.
        X1: Evaluation features.
        Y1: Evaluation labels.

    Returns:
        None. The result of ``cal_accuracy`` is published through the global
        ``acc1``.
    """
    global acc1
    print("___________________________Random Forest__________________________________________")
    model1 = RandomForestClassifier()
    model1.fit(X, Y)
    y_pred1 = model1.predict(X1)
    print("_____________Report___________________")
    acc1 = cal_accuracy(Y1, y_pred1)

    # Confusion matrix heatmap
    matrix = confusion_matrix(Y1, y_pred1)
    class_names = [0, 1]
    fig, ax = plt.subplots()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names)
    plt.yticks(tick_marks, class_names)
    sns.heatmap(pd.DataFrame(matrix), annot=True, cmap="YlGnBu", fmt='g')
    ax.xaxis.set_label_position("top")
    plt.tight_layout()
    plt.title('Confusion matrix', y=1.1)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    # The canvas-level set_window_title was deprecated and later removed from
    # Matplotlib; the figure manager owns the window title.
    fig.canvas.manager.set_window_title('RF')
    plt.show()

    # ROC / AUC, using the probability of the second class column.
    probs = model1.predict_proba(X1)[:, 1]
    auc = roc_auc_score(Y1, probs)
    print('AUC: %.2f' % auc)
    # roc_curve is fed label-encoded targets.
    y_test1 = preprocessing.LabelEncoder().fit_transform(Y1)
    fpr1, tpr1, thresholds = roc_curve(y_test1, probs)
    plot_roc_curve(fpr1, tpr1)

    # Classification report chart.
    # NOTE(review): display names are given as Yes/No in this order; confirm
    # that this matches the sorted order of the actual target labels.
    classes = ["Yes", "No"]
    visualizer1 = ClassificationReport(model1, classes=classes, support=True)
    visualizer1.fit(X, Y)
    visualizer1.score(X1, Y1)
    visualizer1.poof()
def classification_report(self) -> None:
    """Show precision, recall and F1 score by class.

    The report is drawn for ``self.trained_model`` (fitted on the training
    attributes, scored on the test attributes) and passed to ``show`` with a
    path under ``self.plots_dir``. When ``LOCAL`` is falsy the same file is
    also handed to ``upload_to_s3``.
    """
    report = ClassificationReport(
        self.trained_model,
        cmap="YlGn",
        size=(600, 360),
    )
    report.fit(self.X_train, self.y_train)
    report.score(self.X_test, self.y_test)

    local_png = f"{self.plots_dir}/classification_report_{self.model_id}.png"
    report.show(outpath=local_png)

    if not LOCAL:
        remote_key = f'plots/classification_report_{self.model_id}.png'
        upload_to_s3(local_png, remote_key, bucket=S3_BUCKET_NAME)

    # Reset the current figure so later plots start clean.
    plt.clf()
def naive_report(x_test, x_train, y_test, y_train, csv_str):
    """Draw classification reports for two ``MLPClassifier`` configurations.

    The first uses default settings, the second a fixed set of
    hyper-parameters. Each report gets a fresh figure and is passed to
    ``poof`` with an output path built from ``csv_str``.

    Note the parameter order: test features come before train features.
    """
    runs = (
        ("{}/naive-classification.png", {}),
        (
            "{}/worst-classification.png",
            dict(hidden_layer_sizes=1, solver='sgd', learning_rate='adaptive',
                 activation='logistic', alpha=1),
        ),
    )

    for path_template, mlp_params in runs:
        # A new figure per report; the visualizer is not given the axes
        # explicitly.
        plt.subplots()
        report = ClassificationReport(MLPClassifier(**mlp_params))
        report.fit(x_train, y_train)
        report.score(x_test, y_test)
        report.poof(outpath=path_template.format(csv_str))
def visual_model_selection(X, y, estimator, path):
    """
    Test various estimators.

    Wraps ``estimator`` in a pipeline (categorical encoding, one-hot
    encoding, estimator), then fits and scores a classification report on
    the same ``X`` and ``y`` and passes ``path`` to ``poof`` as ``outpath``.
    """
    steps = [
        ('label_encoding', EncodeCategorical(X.keys())),
        ('one_hot_encoder', OneHotEncoder()),
        ('estimator', estimator),
    ]
    pipeline = Pipeline(steps)

    axes = plt.subplots()[1]

    # Build the visualizer around the whole pipeline.
    report = ClassificationReport(
        pipeline, ax=axes, classes=['edible', 'poisonous']
    )
    report.fit(X, y)
    report.score(X, y)
    report.poof(outpath=path)
def classification_report(X_train, y_train, X_test, y_test):
    """Show classification reports for a gradient boosting and a random forest model.

    Each model is wrapped in a ``ClassificationReport`` visualizer, fitted on
    the training data, scored on the test data and displayed with ``show()``.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        None.
    """
    viz = ClassificationReport(
        GradientBoostingClassifier(n_estimators=120, learning_rate=0.1, max_depth=5),
        cmap='PuOr',
    )
    viz.fit(X_train, y_train)
    viz.score(X_test, y_test)
    viz.show()

    # 'auto' was an alias of 'sqrt' for forest classifiers and is rejected by
    # current scikit-learn releases; 'sqrt' selects the same behaviour.
    viz = ClassificationReport(
        RandomForestClassifier(n_estimators=150, max_features='sqrt', random_state=42),
        cmap='PuOr',
    )
    viz.fit(X_train, y_train)
    viz.score(X_test, y_test)
    viz.show()
    return
def ClassReport_Graph(Classif, Data_train, Target_train, Data_test, Target_test, Class, ModelName='Classifier', Accur=False, Predict=None):
    """Draw a Yellowbrick classification report for a scikit-learn model.

    Args:
        Classif: Classifier handed to ``ClassificationReport``.
        Data_train: Training examples' features.
        Target_train: Training examples' outputs.
        Data_test: Testing examples' features.
        Target_test: Testing examples' outputs.
        Class: List with the names of the classes.
        ModelName: Label used in the accuracy printout.
        Accur: When true, print the accuracy of ``Predict`` against
            ``Target_test`` before drawing.
        Predict: Predictions used for the accuracy printout; only read when
            ``Accur`` is true.

    Returns:
        Whatever ``poof()`` returns, or ``None`` when anything in the
        procedure fails (the failure is printed, not raised).
    """
    try:
        from yellowbrick.classifier import ClassificationReport

        if Accur:
            print((ModelName+" accuracy: %0.4f")%(metrics.accuracy_score(Target_test, Predict, normalize=True)))

        view_graph = ClassificationReport(Classif, classes=Class, size=(900, 720))
        view_graph.fit(Data_train, Target_train)   # Fit the training data to the visualizer
        view_graph.score(Data_test, Target_test)   # Evaluate the model on the test data
        graph = view_graph.poof()                  # Draw/show/poof the data
        return graph
    except Exception as error:
        # Still best-effort, but no longer traps KeyboardInterrupt/SystemExit,
        # and the cause is reported rather than discarded.
        print("CLASSIFICATION-REPORT_ERROR\n")
        print("cause: {!r}".format(error))
def visual_model_selection(X, y, estimator, path):
    """
    Test various estimators.

    Wraps ``estimator`` in a pipeline (categorical encoding, one-hot
    encoding, estimator), then fits and scores a classification report on
    the same ``X`` and ``y`` and passes ``path`` to ``poof`` as ``outpath``.
    """
    pipeline = Pipeline(
        [
            ('label_encoding', EncodeCategorical(X.keys())),
            ('one_hot_encoder', OneHotEncoder()),
            ('estimator', estimator),
        ]
    )

    _figure, axes = plt.subplots()

    # Build the visualizer around the whole pipeline.
    report = ClassificationReport(
        pipeline,
        ax=axes,
        classes=['edible', 'poisonous'],
    )
    report.fit(X, y)
    report.score(X, y)
    report.poof(outpath=path)
def visualize_model(X, y, estimator, **kwargs):
    """
    Test various estimators.

    Label-encodes ``y``, wraps ``estimator`` behind a one-hot encoder, and
    draws a classification report fitted and scored on the same data. Extra
    keyword arguments are forwarded to ``ClassificationReport``.
    """
    encoded_target = LabelEncoder().fit_transform(y)

    pipeline = Pipeline([
        ('one_hot_encoder', OneHotEncoder()),
        ('estimator', estimator),
    ])

    # Build the visualizer around the pipeline.
    report = ClassificationReport(
        pipeline,
        classes=['edible', 'poisonous'],
        cmap="YlGn",
        size=(600, 360),
        **kwargs
    )
    report.fit(X, encoded_target)
    report.score(X, encoded_target)
    report.poof()
def classificationreport(clf, classes, X_train, y_train, X_test, y_test):
    """Return a classification report chart as a base64 PNG data URI.

    The visualizer is fitted on the training data, scored on the test data
    and asked to write its output into an in-memory buffer; the buffer
    contents are base64-encoded and prefixed with ``data:image/png;base64,``.
    """
    buffer = io.BytesIO()

    report = ClassificationReport(clf, classes=classes, support=True)
    report.fit(X_train, y_train)        # Fit the visualizer and the model
    report.score(X_test, y_test)        # Evaluate the model on the test data
    report.show(outpath=buffer)         # Finalize and write into the buffer

    # A new 8x8 figure is opened after rendering.
    # NOTE(review): presumably so later plots start on a fresh figure - confirm.
    plt.figure(figsize=(8, 8))

    buffer.seek(0)
    encoded_png = base64.b64encode(buffer.getvalue()).decode()
    return 'data:image/png;base64,{}'.format(encoded_png)
def visual_model_selection(X_train, X_test, y_train, y_test, estimator, show_plot=True):
    """
    Takes train and test data sets for both features and target plus an estimator and
    returns a visual classification report.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training target.
        y_test: Test target.
        estimator: Estimator wrapped in a single-step pipeline.
        show_plot: When true (the default) the report is displayed with
            ``poof()``. When false the report is still fitted and scored, but
            not displayed. Previously this flag was accepted and ignored.

    Returns:
        The visualizer's ``scores`` attribute.
    """
    from sklearn.pipeline import Pipeline
    from yellowbrick.classifier import ClassificationReport

    model = Pipeline([('estimator', estimator)])

    # Instantiate the classification model and visualizer
    visualizer = ClassificationReport(model, classes=['on-time', 'delayed'])
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    if show_plot:
        visualizer.poof()

    # NOTE(review): recent Yellowbrick releases appear to expose this as
    # ``scores_``; confirm against the installed version.
    return visualizer.scores
def classifier_report(classifier, X_test, y_test):
    """Save a confusion matrix and a classification report as run artifacts.

    Both visualizers are fitted and scored on the data passed in, written to
    PNG files in the current working directory, and registered with
    ``ex.add_artifact``.

    Args:
        classifier: Estimator handed to both visualizers. Its class name is
            used in the confusion-matrix file name.
        X_test: Feature matrix used for both ``fit`` and ``score``.
        y_test: Labels used for both ``fit`` and ``score``.

    Returns:
        None.
    """
    classes = np.unique(y_test)

    # These are meant for the figure-saving call, so they must be forwarded
    # as real keyword arguments (previously they were wrapped in one keyword
    # literally named ``kwargs``), and the tight-bounding-box option is
    # spelled ``bbox_inches``.
    savefig_options = dict(transparent=False, dpi=80, bbox_inches='tight')

    # NOTE(review): the confusion matrix uses a fixed 0-9 label list while the
    # classification report uses the labels found in y_test - confirm intended.
    cm = ConfusionMatrix(classifier, classes=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    cm.fit(X_test, y_test)
    cm.score(X_test, y_test)
    filename = classifier.__class__.__name__ + '_confusion_matrix.png'
    cm.poof(outpath=filename, clear_figure=True, **savefig_options)
    ex.add_artifact(filename)

    visualizer = ClassificationReport(classifier, classes=classes, support=True)
    visualizer.fit(X_test, y_test)
    visualizer.score(X_test, y_test)
    visualizer.poof(outpath="classification_report.png", clear_figure=True, **savefig_options)
    ex.add_artifact('classification_report.png')
def naive_report(x_test, x_train, y_test, y_train, csv_str):
    """Draw classification reports for two decision tree configurations.

    The first tree only fixes ``random_state``; the second adds a fixed set
    of size and depth limits. Each report is drawn on its own new axes and
    passed to ``poof`` with an output path built from ``csv_str``.

    Note the parameter order: test features come before train features.
    """
    runs = (
        ("{}/naive-classification.png", dict(random_state=0)),
        (
            "{}/worst-classification.png",
            dict(random_state=0, min_samples_leaf=20, min_samples_split=20,
                 max_depth=5, max_leaf_nodes=5),
        ),
    )

    for path_template, tree_params in runs:
        axes = plt.subplots()[1]
        report = ClassificationReport(
            DecisionTreeClassifier(**tree_params), ax=axes
        )
        report.fit(x_train, y_train)
        report.score(x_test, y_test)
        report.poof(outpath=path_template.format(csv_str))
def classifier_evaluator(gnb, data_train, target_train, data_test, target_test):
    '''
    Evaluate a classifier with a Yellowbrick classification report.

    :param gnb: classifier handed to ``ClassificationReport``
    :param data_train: features used to fit the visualizer
    :param target_train: targets used to fit the visualizer
    :param data_test: features used to score the visualizer
    :param target_test: targets used to score the visualizer
    :return: Nothing
    '''
    report = ClassificationReport(gnb, classes=['Won', 'Loss'])

    # Fit on the training split, then evaluate on the held-out split.
    report.fit(data_train, target_train)
    report.score(data_test, target_test)

    # Display the report.
    report.show()
def visualize_model(X, y, estimator, path, **kwargs):
    """
    Test various estimators.

    Label-encodes ``y``, wraps ``estimator`` behind a one-hot encoder, and
    draws a classification report fitted and scored on the same data. The
    report is drawn on new axes and ``path`` is passed to ``poof`` as
    ``outpath``. Extra keyword arguments go to ``ClassificationReport``.
    """
    encoded_target = LabelEncoder().fit_transform(y)

    pipeline = Pipeline([
        ("one_hot_encoder", OneHotEncoder()),
        ("estimator", estimator),
    ])

    axes = plt.subplots()[1]

    # Build the visualizer around the pipeline.
    report = ClassificationReport(
        pipeline,
        classes=["edible", "poisonous"],
        cmap="YlGn",
        size=(600, 360),
        ax=axes,
        **kwargs
    )
    report.fit(X, encoded_target)
    report.score(X, encoded_target)
    report.poof(outpath=path)
def plot_precision_recall_f1(self, classes=None, display=False):
    """
    Plot Precision Recall F1

    Calls ``self.train()`` first, then draws a classification report for
    ``self.svc_model`` and passes an output path under
    ``self.cfg['plot_path']`` to ``poof``.

    # Arguments:
        - classes: A list of all labels; defaults to ``['Won', 'Loss']``
        - display: boolean value for showing plot or not; default is False
    """
    # A fresh list per call instead of a mutable default shared across calls.
    if classes is None:
        classes = ['Won', 'Loss']

    self.train()

    # Instantiate the classification model and visualizer
    visualizer = ClassificationReport(self.svc_model, classes=classes)
    visualizer.fit(self.data_train, self.label_train)   # Fit the training data to the visualizer
    visualizer.score(self.data_test, self.label_test)   # Evaluate the model on the test data
    visualizer.poof(outpath=self.cfg['plot_path'] + "linear-svc-report.png")   # save the data
    if display:
        visualizer.poof()   # show the data
def test_prepredict_classifier(self):
    """
    Test the prepredict estimator with classification report
    """
    features = self.multiclass.X
    targets = self.multiclass.y

    # Predictions produced up front by a separately fitted model.
    prior_predictions = (
        GaussianNB().fit(features.train, targets.train).predict(features.test)
    )

    # Wrap the prior predictions in a PrePredict estimator.
    prepredict = PrePredict(prior_predictions, CLASSIFIER)
    assert prepredict.fit(features.train, targets.train) is prepredict
    assert prepredict.predict(features.train) is prior_predictions
    assert prepredict.score(features.test, targets.test) == pytest.approx(
        0.41, rel=1e-3
    )

    # The visualizer must work with the pre-predictions; no feature matrix
    # is supplied to fit/score.
    report = ClassificationReport(prepredict)
    report.fit(None, targets.train)
    report.score(None, targets.test)
    report.finalize()

    self.assert_images_similar(report)
def showReport():
    """Draw a Gaussian naive Bayes classification report for the occupancy data.

    Loads the ``'occupancy'`` data set through ``load_data``, holds out 20%
    of the rows for testing, fits the visualizer on the rest and displays
    the report.

    Returns:
        None.
    """
    # Load the classification data set
    data = load_data('occupancy')

    # Specify the features of interest and the classes of the target
    features = ["temperature", "relative humidity", "light", "C02", "humidity"]
    classes = ['unoccupied', 'occupied']

    # Extract the numpy arrays from the data frame. ``as_matrix()`` was
    # removed in pandas 1.0; ``.values`` is available in old and new releases.
    X = data[features].values
    y = data.occupancy.values

    # Create the train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Instantiate the classification model and visualizer
    bayes = GaussianNB()
    visualizer = ClassificationReport(bayes, classes=classes)

    visualizer.fit(X_train, y_train)   # Fit the training data to the visualizer
    visualizer.score(X_test, y_test)   # Evaluate the model on the test data
    visualizer.poof()                  # Draw/show/poof the data
def visualize_model(X, y, estimators,pred=False,disc=False, conf=False, bal=False,**kwargs):
    """
    Visualize models using the yellowbrick plotting library.

    A classification report is always drawn (fitted and scored on the same
    ``X`` and ``y``). The boolean flags switch on further plots, in this
    order: ``pred``, ``disc``, ``conf``, ``bal``. ``kwargs`` is forwarded to
    every plotting call.
    """
    report = ClassificationReport(
        estimators,
        classes=['Reach, 1 Reach, or L/R Reach',
                 'Null, Multiple Reaches, Or Multiple Arms'],
        cmap="YlGn",
        size=(600, 360),
        **kwargs
    )
    report.fit(X, y)
    report.score(X, y)
    report.show()

    # Lambdas keep each helper lookup lazy, so a helper is only resolved
    # when its flag is set.
    optional_plots = (
        (pred, lambda: class_prediction_errors(X, y, estimators, **kwargs)),
        (disc, lambda: discrimination_thresholding(X, y, estimators, **kwargs)),
        (conf, lambda: confusion_matrix(X, y, estimators, **kwargs)),
        (bal, lambda: plot_class_balance(y, **kwargs)),
    )
    for enabled, draw in optional_plots:
        if enabled:
            draw()
def classification(fname="classification.png"):
    """Save a side-by-side figure of two classification visualizers.

    Left: a ``ClassificationReport`` for multinomial naive Bayes on the
    split spam data. Right: a ``DiscriminationThreshold`` for logistic
    regression on the unsplit spam data. The figure is saved as ``fname``
    inside the ``FIGURES`` directory.
    """
    # One row, two columns.
    left_ax, right_ax = plt.subplots(ncols=2, figsize=(18,6))[1]

    # ClassificationReport on the left
    spam_split = load_spam(split=True)
    report = ClassificationReport(
        MultinomialNB(), classes=["ham", "spam"], ax=left_ax
    )
    report.fit(spam_split.X.train, spam_split.y.train)
    report.score(spam_split.X.test, spam_split.y.test)
    report.finalize()

    # DiscriminationThreshold on the right
    spam_full = load_spam(split=False)
    threshold = DiscriminationThreshold(LogisticRegression(), ax=right_ax)
    threshold.fit(spam_full.X, spam_full.y)
    threshold.finalize()

    # Save figure
    target = os.path.join(FIGURES, fname)
    plt.tight_layout()
    plt.savefig(target)
def linear_svm_model(df):
    """Train a linear SVC on the churn data and report on it.

    After ``to_numeric``, ten categorical columns are label-encoded, the
    frame is split 70/30 with ``Churn`` as the target, an L1-penalised
    ``LinearSVC`` is fitted, its accuracy is printed, and a classification
    report plus a confusion matrix are drawn.
    """
    df = to_numeric(df)

    encoder = preprocessing.LabelEncoder()
    categorical_columns = (
        "MultipleLines",
        "InternetService",
        "OnlineSecurity",
        "OnlineBackup",
        "DeviceProtection",
        "TechSupport",
        "StreamingTV",
        "StreamingMovies",
        "Contract",
        "PaymentMethod",
    )
    # The same encoder instance is re-fitted for each column in turn.
    for column in categorical_columns:
        df[column] = encoder.fit_transform(df[column])

    feature_columns = [name for name in df.columns if name != 'Churn']
    data = df[feature_columns]
    target = df['Churn']

    data_train, data_test, target_train, target_test = train_test_split(
        data, target, test_size=0.30, random_state=10)

    # Fit on the training split, predict on the test split.
    svc_model = LinearSVC(penalty='l1', dual=False)
    svc_model.fit(data_train, target_train)
    pred = svc_model.predict(data_test)

    print("LinearSVC accuracy : ",
          accuracy_score(target_test, pred, normalize=True))

    # Classification report chart.
    report = ClassificationReport(svc_model, classes=[0, 1])
    report.fit(data_train, target_train)
    report.score(data_test, target_test)
    report.poof()

    plot_confusion_matrix(target_test, pred, ["No", "Yes"])
def classification(fname="classification.png"):
    """Save a side-by-side figure of two classification visualizers.

    Left: a ``ClassificationReport`` for multinomial naive Bayes on the
    split spam data. Right: a ``DiscriminationThreshold`` for logistic
    regression on the unsplit spam data. The figure is saved as ``fname``
    inside the ``FIGURES`` directory.
    """
    # One row, two columns.
    _figure, (left_ax, right_ax) = plt.subplots(ncols=2, figsize=(18, 6))

    # ClassificationReport on the left
    spam_split = load_spam(split=True)
    report = ClassificationReport(
        MultinomialNB(),
        classes=["ham", "spam"],
        ax=left_ax,
    )
    report.fit(spam_split.X.train, spam_split.y.train)
    report.score(spam_split.X.test, spam_split.y.test)
    report.finalize()

    # DiscriminationThreshold on the right
    spam_full = load_spam(split=False)
    threshold = DiscriminationThreshold(LogisticRegression(), ax=right_ax)
    threshold.fit(spam_full.X, spam_full.y)
    threshold.finalize()

    # Save figure
    target = os.path.join(FIGURES, fname)
    plt.tight_layout()
    plt.savefig(target)
def train_model(svm_model, healed_data, target_string):
    """Evaluate ``svm_model`` with a classification report and pickle it.

    The report is fitted on ``healed_data["train_features"]`` /
    ``["train_target"]`` and scored on ``["test_features"]`` /
    ``["test_target"]``. The chart is saved under ``models/`` and rendered in
    Streamlit, then the model is pickled under ``models/``.

    Args:
        svm_model: Estimator to evaluate and save.
        healed_data: Mapping holding the four keys named above.
        target_string: Suffix used in both output file names.

    Returns:
        None. If the visualizer cannot be constructed, the error is reported
        through ``st.error`` and nothing is saved.
    """
    # Sorted so the label order is deterministic; iteration order of a bare
    # set is arbitrary.
    # NOTE(review): assumes the labels are mutually orderable - confirm.
    label_classes = sorted(set(healed_data["train_target"]))

    try:
        visualizer = ClassificationReport(svm_model, classes=label_classes, cmap="YlGn")
    except Exception as e:
        st.error("Viz error: " + str(e))
        # Without a visualizer nothing below can run; previously this fell
        # through and failed with NameError.
        return

    try:
        visualizer.fit(healed_data["train_features"], healed_data["train_target"])
    except Exception as e:
        st.error("Fit error: " + str(e))

    try:
        visualizer.score(healed_data["test_features"], healed_data["test_target"])
    except Exception as e:
        st.error("Score error: " + str(e))

    visualizer.show()

    # savefig returns None, so its result is not a figure to pass on. Save
    # first, then render with no argument, exactly as passing None did.
    plt.savefig("models/svm_model_eval_" + target_string + ".png")
    st.pyplot()

    # save model output
    model_output_loc = "models/svm_model_" + target_string + ".pkl"
    with open(model_output_loc, "wb") as model_output:
        pickle.dump(svm_model, model_output)
    print("saving model to: " + model_output_loc)
    return
def logisticRegressionTest(X_train, X_test, y_train, y_test, X_1_df, Y_1_df):
    """Sweep the logistic regression ``C`` parameter and write report charts.

    For each ``C`` in a fixed grid a balanced ``LogisticRegression`` is
    fitted; fit-plus-predict time and train/test accuracy are recorded. Four
    PNG files are then written to ``output/logistic-regression`` next to this
    source file: accuracy vs ``C``, running time vs ``C``, a decision
    boundary on a 2-component PCA of ``X_1_df`` (using the ``C`` with the
    best test accuracy), and a classification report.

    Args:
        X_train: Training features for the sweep and the final report.
        X_test: Test features for the sweep and the final report.
        y_train: Training labels; flattened with ``np.ravel`` for fitting,
            and ``.values`` is read for the final report.
        y_test: Test labels.
        X_1_df: Feature data projected to two PCA components for the
            decision-boundary chart.
        Y_1_df: Labels matching ``X_1_df``; cast to ``int`` for plotting.

    Returns:
        None.
    """
    path = Path(__file__).parent.absolute()

    # Create the directory the files are actually written to. Previously a
    # CWD-relative directory was created, which only matched when the script
    # was started from its own folder.
    output_dir = path / "output" / "logistic-regression"
    output_dir.mkdir(parents=True, exist_ok=True)

    def _fresh_target(file_name):
        """Return the output path for *file_name*, deleting any stale copy."""
        target = str(output_dir / file_name)
        if os.path.isfile(target):
            os.remove(target)
        return target

    print('-----------------------------')
    print('Logistic Regression Test was Called. Wait...')

    c = [0.01, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 5, 10, 15, 20]
    runningTime = []
    trainAccuracy = []
    testAccuracy = []
    param = []

    for i in c:
        # The timed region covers fitting and both predictions.
        start = time.time()
        clf = LogisticRegression(random_state=1,
                                 C=i,
                                 class_weight='balanced',
                                 max_iter=10000).fit(X_train, np.ravel(y_train))
        y_pred_lr = clf.predict(X_test)
        y_train_pred_lr = clf.predict(X_train)
        end = time.time()

        runningTime.append(end - start)
        param.append(i)
        trainAccuracy.append(accuracy_score(y_train, y_train_pred_lr))
        testAccuracy.append(accuracy_score(y_test, y_pred_lr))

    # Printing the metrics/Generating visualization
    print(
        "Classification report, class prediction error, Test accuracy, Running time for LR is generated in the output folder"
    )

    plt.clf()

    # Accuracy plot
    plt.plot(c, trainAccuracy, 'ro-', c, testAccuracy, 'bv--')
    plt.legend(['Train Accuracy', 'Test Accuracy'])
    plt.xlabel('C Param value')
    plt.ylabel('Accuracy')
    plt.title("Logistic Regression-Accuracy")
    plt.savefig(_fresh_target("Accuracy.png"))
    plt.clf()

    # Running time plot
    plt.plot(c, runningTime, 'ro-')
    plt.legend(['Running time(s)'])
    plt.xlabel('C Param Value')
    plt.ylabel('Running time(seconds)')
    plt.title("Running time")
    plt.savefig(_fresh_target("Running Time.png"))
    plt.clf()

    # Best test accuracy; index() picks the first C on ties.
    maxValue = max(testAccuracy)
    optimum_param = param[testAccuracy.index(maxValue)]

    print("The average running time - %.3f seconds" % mean(runningTime))
    print("The maximum test accuracy - %.3f " % maxValue)
    print("Corresponding C param value for max test accuracy", optimum_param)

    # Train a 2-D classifier for the decision boundary chart.
    pca = PCA(n_components=2)
    X_transform = pca.fit_transform(X_1_df)
    X_train1, X_test1, y_train1, y_test1 = train_test_split(
        pd.DataFrame(X_transform), Y_1_df, random_state=1, test_size=0.2)
    clf = LogisticRegression(random_state=1,
                             C=optimum_param,
                             class_weight='balanced',
                             max_iter=10000).fit(X_train1, np.ravel(y_train1))

    # Decision boundary chart. ``np.int`` was only an alias of the builtin
    # ``int`` and no longer exists in current NumPy.
    y = pd.DataFrame(Y_1_df).to_numpy()
    y = y.astype(int).flatten()
    plot_decision_regions(X_transform, y, clf=clf, legend=2)
    plt.xlabel('x')
    plt.ylabel('y')
    plt.title('Decision Boundary')
    plt.savefig(_fresh_target("Decision Boundary.png"))
    plt.clf()

    # Classification report on the original (non-PCA) features.
    vizualizer = ClassificationReport(LogisticRegression(
        random_state=1,
        C=optimum_param,
        class_weight='balanced',
        max_iter=10000),
                                      classes=[0, 1, 2, 3, 4, 5],
                                      support=True,
                                      size=(1400, 1000))
    vizualizer.fit(X_train, y_train.values.ravel())
    vizualizer.score(X_test, y_test)
    vizualizer.show(outpath=_fresh_target("Classification Report.png"))
    plt.clf()
    return
'clf_gnb': 'GaussianNB', 'clf_svc': 'SVC', 'clf_dt': 'DecisionTreeClassifier', 'clf_knn_tuned': 'KNearestNeighbors', 'clf_rf_tuned': 'RandomForest' } best_classifier = max(f1_score_all.iteritems(), key=operator.itemgetter(1))[0] best_classifier = classifiers[best_classifier] #dump_classifier_and_data(clf, data_dict, features_list) ################################################################################ ###Scoring and Plotting and Displaying print "Best classifier:", best_classifier #GaussianNaiveBayes visualizer_gnb = ClassificationReport(clf_gnb, classes=classes) visualizer_gnb.fit(features_train_pca, labels_train) # Fit the visualizer and the model visualizer_gnb.score(features_test_pca, labels_test) # Evaluate the model on the test data g = visualizer_gnb.poof() #SVM visualizer_svc = ClassificationReport(clf_svc, classes=classes) visualizer_svc.fit(features_train_pca, labels_train) # Fit the visualizer and the model visualizer_svc.score(features_test_pca, labels_test) # Evaluate the model on the test data g = visualizer_svc.poof() #DecisionTree visualizer_dt = ClassificationReport(clf_dt, classes=classes) visualizer_dt.fit(features_train_pca, labels_train) # Fit the visualizer and the model visualizer_dt.score(features_test_pca,
# Use the explainer to explain the prediction for the first test row.
# NOTE(review): `explainer` and `predict_fn` are defined outside this excerpt;
# the method name was misspelled `explain_instence` - presumably this is a
# LIME explainer, whose method is `explain_instance`. Confirm.
exp = explainer.explain_instance(X_test.values[0], predict_fn, num_features=6)
exp.show_in_notebook(show_all=False)

"""## Yellowbrick"""

# Heat map of pairwise Pearson correlation between features
visualizer = Rank2D(algorithm="pearson", size=(1080, 720))
visualizer.fit_transform(X_train)
visualizer.poof()

# Evaluation metrics (fitted and scored on the training data)
visualizer = ClassificationReport(model, size=(1080, 720))
visualizer.fit(X_train, y_train)
visualizer.score(X_train, y_train)
# Was misspelled `poff()`; `poof()` is the call used for Rank2D above.
visualizer.poof()

"""# Using API in WebApp (Flask)"""

# Commented out IPython magic to ensure Python compatibility.
# %%writefile server.py
#
# #!/usr/bin/env python
# # -*- coding: utf-8 -*-
# from __future__ import unicode_literals
# from flask import Flask, request, jsonify
# from sklearn.externals import joblib
# import traceback
# import pandas as pd
# In[41]: target_names = crimes['primary_type'].unique() print(target_names) # In[42]: # Classification Report # Instantiate the classification model and visualizer from yellowbrick.classifier import ClassificationReport target_names = a visualizer = ClassificationReport(model_knn, classes=target_names, size=(1080, 720)) visualizer.fit(X=x_train, y=y_train) # Fit the training data to the visualizer print("Visualizer score is: ", visualizer.score(x_test, y_test)) # Evaluate the model on the test data print(classification_report(y_test, predicted_result, target_names=a)) g = visualizer.poof() # In[121]: #Model with Neural networks from sklearn.neural_network import MLPClassifier neural_model = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(100, 100, 100), activation='relu', random_state=1, max_iter=100)
# Ordered (substring, ratio) rules for under-sampling the non-target frames.
# The first substring found in the target name wins.  A ratio of ``None``
# means "keep every non-target frame"; any other value keeps
# ``ratio * number_of_target_frames`` randomly chosen non-target frames.
_UNDERSAMPLE_RULES = (
    ("attack_prediction", None),
    ("anogenital_prediction", 1.5),
    ("tail_rattle", 2.5),
    ("pursuit_prediction", 2.5),
    ("lateral_threat", 2.5),
)

# Body parts whose raw x / y / p columns are dropped before training.
_BODY_PARTS = (
    "Ear_left", "Ear_right", "Nose", "Center",
    "Lat_left", "Lat_right", "Tail_base", "Tail_end",
)

_SUPPORTED_ENSEMBLE_METHODS = ('RF', 'GBC', 'XGB')


def _coordinate_columns():
    """Return the columns dropped before training: "scorer" plus x/y/p per body part and animal."""
    columns = ["scorer"]
    for animal in (1, 2):
        for part in _BODY_PARTS:
            columns.extend(
                "{}_{}_{}".format(part, animal, axis) for axis in ("x", "y", "p"))
    return columns


def _undersample_ratio(target_name, default_ratio):
    """Return the under-sampling ratio for ``target_name`` (``None`` = keep all non-target rows)."""
    for marker, ratio in _UNDERSAMPLE_RULES:
        if marker in target_name:
            return ratio
    return default_ratio


def _split_for_target(frame, target_name, ratio, test_size):
    """Under-sample, shuffle and split ``frame`` for one target column.

    Returns ``(train_frame, data_train, data_test, target_train, target_test)``
    where ``train_frame`` is the shuffled frame with the target column removed.
    """
    target_rows = frame.loc[frame[target_name] == 1]
    non_target_rows = frame.loc[frame[target_name] == 0]
    if ratio is not None:
        wanted = int(len(target_rows) * ratio)
        # Sampling without replacement raises if more rows are requested than
        # exist, so fall back to using every available non-target row.
        wanted = min(wanted, len(non_target_rows))
        non_target_rows = non_target_rows.sample(wanted, replace=False)
    train_frame = pd.concat([target_rows, non_target_rows]).sample(frac=1)
    target = train_frame.pop(target_name).values
    data_train, data_test, target_train, target_test = train_test_split(
        train_frame, target, test_size=test_size)
    return train_frame, data_train, data_test, target_train, target_test


def RF_trainmodel(configini):
    """Train, save and evaluate one classifier per configured target.

    Settings are read from the ini file at ``configini``:

    * ``SML settings``: ``model_dir``, ``No_targets`` and ``target_name_<n>``
      for ``n`` in ``1..No_targets``.
    * ``General settings``: ``csv_path`` and ``project_path``.
    * ``RF settings``: ``Feature_importances_to_plot``,
      ``under_sample_correction_value``, ``n_estimators``, ``max_features``,
      ``max_depth``, ``ensemble_method`` (``'RF'``, ``'GBC'`` or ``'XGB'``),
      ``n_jobs``, ``criterion``, ``min_samples_leaf`` and ``test_size``.

    Every file in ``<csv_path>/targets_inserted`` whose name contains ".csv"
    is concatenated into one frame.  For each target the function
    under-samples the non-target rows, fits the selected classifier and
    writes:

    * ``<model_dir>/generated_models/<target>.sav`` -- the pickled model;
    * ``.../model_evaluations/<target>_tree.dot`` / ``_tree.pdf`` -- one
      tree of the forest (``'RF'`` only, rendered with the ``dot`` program);
    * ``.../model_evaluations/<target>_classificationReport.png``;
    * ``<project_path>/logs/<target>.csv`` -- feature importances;
    * ``.../model_evaluations/<target>_feature_bars.png``.

    Args:
        configini: Path to the project ini file.

    Returns:
        ``None``.

    Raises:
        ValueError: If ``ensemble_method`` is not one of the supported values.
    """
    warnings.simplefilter("ignore")
    config = ConfigParser()
    config.read(str(configini))

    modelDir = config.get('SML settings', 'model_dir')
    modelDir_out = os.path.join(modelDir, 'generated_models')
    os.makedirs(modelDir_out, exist_ok=True)
    tree_evaluations_out = os.path.join(modelDir_out, 'model_evaluations')
    os.makedirs(tree_evaluations_out, exist_ok=True)

    model_nos = config.getint('SML settings', 'No_targets')
    csv_dir_in = os.path.join(
        config.get('General settings', 'csv_path'), 'targets_inserted')
    N_feature_importance = config.getint('RF settings',
                                         'Feature_importances_to_plot')
    under_sample_correction_value = config.getfloat(
        'RF settings', 'under_sample_correction_value')
    n_estimators = config.getint('RF settings', 'n_estimators')
    max_features = config.get('RF settings', 'max_features')
    max_depth = config.getint('RF settings', 'max_depth')
    ensemble_method = config.get('RF settings', 'ensemble_method')
    n_jobs = config.getint('RF settings', 'n_jobs')
    criterion = config.get('RF settings', 'criterion')
    min_samples_leaf = config.getint('RF settings', 'min_samples_leaf')
    test_size = config.getfloat('RF settings', 'test_size')

    # Fail early and clearly instead of hitting an undefined ``clf`` later.
    if ensemble_method not in _SUPPORTED_ENSEMBLE_METHODS:
        raise ValueError(
            'Unsupported ensemble_method {!r}; expected one of {}'.format(
                ensemble_method, _SUPPORTED_ENSEMBLE_METHODS))

    log_path = os.path.join(
        config.get('General settings', 'project_path'), 'logs')
    os.makedirs(log_path, exist_ok=True)

    ########### GET TARGET NAMES ###########
    target_names = [
        config.get('SML settings', 'target_name_' + str(number))
        for number in range(1, model_nos + 1)
    ]

    ########### FIND CSV FILES AND CONSOLIDATE ###########
    print('consolidating csvs....')
    csv_frames = []
    for file_name in os.listdir(csv_dir_in):
        if ".csv" in file_name:
            currentFn = os.path.join(csv_dir_in, file_name)
            print(currentFn)
            csv_frames.append(pd.read_csv(currentFn))
    # A single concat replaces the repeated (quadratic, and since removed
    # from pandas) ``DataFrame.append`` calls.
    if csv_frames:
        features = pd.concat(csv_frames, ignore_index=True)
    else:
        features = pd.DataFrame()
    features = features.loc[:, ~features.columns.str.contains('^Unnamed')]

    ########### REMOVE TARGET VALUES, IF THEY EXIST ##########
    # Frame and video numbers are not features; popping removes the columns.
    features.pop('frames')
    features.pop('video_no')
    modelFrame = pd.DataFrame()
    try:
        for name in target_names:
            modelFrame[name] = features.pop(name).values
    except KeyError:
        print('No target data found in input data... ')
    features = features.fillna(0)

    ########## REMOVE COORDINATE DATA ##########
    featuresDf = features.drop(_coordinate_columns(), axis=1)
    feature_list = list(featuresDf.columns)

    ########## CREATE MODEL FOR EACH TARGET ##########
    for i, currTargetName in enumerate(target_names):
        featuresDf[currTargetName] = modelFrame[modelFrame.columns[i]]

        # SPLIT THE DATA UP IN TRAINING AND TESTING
        ratio = _undersample_ratio(currTargetName,
                                   under_sample_correction_value)
        (trainFrame, data_train, data_test,
         target_train, target_test) = _split_for_target(
             featuresDf, currTargetName, ratio, test_size)
        featuresDf = featuresDf.drop([currTargetName], axis=1)
        if ratio is None:
            print('SMOTE skipped')

        if ensemble_method == 'RF':
            clf = RandomForestClassifier(n_estimators=n_estimators,
                                         max_features=max_features,
                                         n_jobs=n_jobs,
                                         criterion=criterion,
                                         min_samples_leaf=min_samples_leaf,
                                         bootstrap=True,
                                         verbose=1)
            clf.fit(data_train, target_train)
            clf_pred = clf.predict(data_test)
            print("Accuracy " + str(currTargetName) + ' model:',
                  metrics.accuracy_score(target_test, clf_pred))
        elif ensemble_method == 'GBC':
            clf = GradientBoostingClassifier(max_depth=max_depth,
                                             n_estimators=n_estimators,
                                             learning_rate=0.1,
                                             max_features='sqrt',
                                             verbose=1)
            clf.fit(data_train, target_train)
            print(
                str(currTargetName) + str(" Accuracy train: ") +
                str(clf.score(data_train, target_train)))
            print(
                str(currTargetName) + str(" Accuracy test: ") +
                str(clf.score(data_test, target_test)))
        else:  # 'XGB'
            # The scikit-learn style wrapper takes the frames directly, so
            # the data is not wrapped in ``xgb.DMatrix`` (the old code also
            # paired the training rows with the un-split label vector).
            clf = xgb.XGBClassifier(max_depth=max_depth,
                                    min_child_weight=1,
                                    learning_rate=0.1,
                                    n_estimators=n_estimators,
                                    silent=0,
                                    objective='binary:logistic',
                                    max_delta_step=0,
                                    subsample=1,
                                    colsample_bytree=1,
                                    colsample_bylevel=1,
                                    reg_alpha=0,
                                    reg_lambda=0,
                                    scale_pos_weight=1,
                                    seed=1,
                                    missing=None,
                                    verbosity=3)
            clf.fit(data_train, target_train, verbose=True)
            print(
                str(currTargetName) + str(" Accuracy train: ") +
                str(clf.score(data_train, target_train)))
            print(
                str(currTargetName) + str(" Accuracy test: ") +
                str(clf.score(data_test, target_test)))

        # SAVE MODELS
        modelPath = os.path.join(modelDir_out, str(currTargetName) + '.sav')
        with open(modelPath, 'wb') as model_file:
            pickle.dump(clf, model_file)

        print('Generating model evaluations...')
        # Needed by the classification report for every ensemble method,
        # not only by the random-forest tree export.
        class_names = ['Not_' + currTargetName, currTargetName]

        # VISUALIZE A SINGLE TREE
        if ensemble_method == 'RF':
            # Use the fourth tree, or the last one in a smaller forest.
            estimator = clf.estimators_[min(3, len(clf.estimators_) - 1)]
            dot_name = os.path.join(tree_evaluations_out,
                                    str(currTargetName) + '_tree.dot')
            file_name = os.path.join(tree_evaluations_out,
                                     str(currTargetName) + '_tree.pdf')
            export_graphviz(estimator,
                            out_file=dot_name,
                            filled=True,
                            rounded=True,
                            special_characters=False,
                            impurity=False,
                            class_names=class_names,
                            feature_names=trainFrame.columns)
            # NOTE(review): the command is a shell string built from
            # config-derived paths; paths containing spaces will break it.
            commandStr = ('dot ' + str(dot_name) + ' -T pdf -o ' +
                          str(file_name) + ' -Gdpi=600')
            call(commandStr, shell=True)

        ################ VISUALIZE CLASSIFICATION REPORT ################
        try:
            visualizer = ClassificationReport(clf,
                                              classes=class_names,
                                              support=True)
            visualizer.fit(data_train, target_train)
            visualizer.score(data_test, target_test)
            visualizerPath = os.path.join(
                tree_evaluations_out,
                str(currTargetName) + '_classificationReport.png')
            visualizer.poof(outpath=visualizerPath)
        except KeyError:
            print(('Warning, not enough data for ') + str(currTargetName))

        ################ FEATURE IMPORTANCE LOG ################
        importances = list(clf.feature_importances_)
        feature_importances = sorted(
            ((feature, round(importance, 2))
             for feature, importance in zip(feature_list, importances)),
            key=lambda pair: pair[1],
            reverse=True)
        # NOTE: names and values are round-tripped through a formatted
        # string on purpose so the logged CSV stays exactly as before; the
        # name column therefore still carries the padded " Importance" tail.
        formatted = [
            'Variable: {:20} Importance: {}'.format(*pair)
            for pair in feature_importances
        ]
        pieces = [entry.split(': ', 3) for entry in formatted]
        log_df = pd.DataFrame()
        log_df['Feature_name'] = [piece[1] for piece in pieces]
        log_df['Feature_importance'] = [piece[2] for piece in pieces]
        log_df.to_csv(os.path.join(log_path, str(currTargetName) + '.csv'))

        ################ FEATURE IMPORTANCE BAR GRAPH ################
        log_df['Feature_importance'] = pd.to_numeric(
            log_df['Feature_importance'])
        # Strip leading '+'/'-' characters and the trailing "Importance" tail
        # from the names used as bar labels.
        log_df['Feature_name'] = log_df['Feature_name'].map(
            lambda x: x.lstrip('+-').rstrip('Importance'))
        log_df = log_df.head(N_feature_importance)
        log_df.plot.bar(x='Feature_name',
                        y='Feature_importance',
                        legend=False,
                        rot=90,
                        fontsize=6)
        figSavePath = os.path.join(tree_evaluations_out,
                                   str(currTargetName) + '_feature_bars.png')
        plt.tight_layout()
        plt.savefig(figSavePath, dpi=600)
        plt.close('all')

    print('Train model complete.')
# Label-encode the categorical columns in place.  The order is kept so that
# ``le`` ends up fitted on 'Supplies Group', exactly as before.
for column in ('Supplies Subgroup', 'Region', 'Route To Market',
               'Opportunity Result', 'Competitor Type', 'Supplies Group'):
    sales_data[column] = le.fit_transform(sales_data[column])
    if column == 'Opportunity Result':
        # LabelEncoder numbers classes in sorted order; keep the original
        # labels in that order so the report names each class correctly
        # (a hard-coded ['Won', 'Loss'] would swap them).
        result_classes = [str(label) for label in le.classes_]

print(sales_data.head())

# Every column except the identifier and the target is a feature.
cols = [
    col for col in sales_data.columns
    if col not in ['Opportunity Number', 'Opportunity Result']
]
data = sales_data[cols]
target = sales_data['Opportunity Result']
print(data.head(n=2))
print(target.head(n=2))

data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.30, random_state=10)

gnb = GaussianNB()
pred = gnb.fit(data_train, target_train).predict(data_test)
print("Naive-Bayes accuracy : ",
      accuracy_score(target_test, pred, normalize=True))

# Per-class report, labelled in the encoder's class order.
visualizer = ClassificationReport(gnb, classes=result_classes)
visualizer.fit(data_train, target_train)
visualizer.score(data_test, target_test)
g = visualizer.show()
# Sweep k = 1..30 and record the test-set misclassification rate for each k.
for i in range(1, 31):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != y_test))

# Plot error rate against k.  ``visible`` takes a bool; the former string
# 'True' only worked because any non-empty string is truthy.
plt.figure(figsize=(10, 6))
plt.plot(range(1, 31), error_rate, marker='o', color='red', visible=True)
plt.xlabel('k')
plt.ylabel('Error Rate')
plt.show()
print('\n')

# Classification report for k = 2.
viz = ClassificationReport(KNeighborsClassifier(n_neighbors=2))
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.show()

# Confusion matrix and plain-text report for k = 2.
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train, y_train)
pred1 = knn.predict(X_test)
cf1 = confusion_matrix(y_test, pred1)
# print(cf1_m)
# print('\n')
cr1 = classification_report(y_test, pred1)
print(cr1)

# Classification report for k = 31.
viz1 = ClassificationReport(KNeighborsClassifier(n_neighbors=31))
viz1.fit(X_train, y_train)
viz1.score(X_test, y_test)
viz1.show()

knn = KNeighborsClassifier(n_neighbors=31)
data=data, target=target, ) # Load the data and create document vectors corpus = load_corpus('hobbies') tfidf = TfidfVectorizer() docs = tfidf.fit_transform(corpus.data) labels = corpus.target X_train, X_test, y_train, y_test = train_test_split(docs.toarray(), labels, test_size=0.2, random_state=42) visualizer = ClassificationReport(GaussianNB(), classes=corpus.categories) visualizer.fit(X_train, y_train) # Fit the training data to the visualizer visualizer.score(X_test, y_test) # Evaluate the model on the test data visualizer.poof() visualizer = ClassificationReport(SGDClassifier(), classes=corpus.categories) visualizer.fit(X_train, y_train) # Fit the training data to the visualizer visualizer.score(X_test, y_test) # Evaluate the model on the test data visualizer.poof() visualizer = ConfusionMatrix(LogisticRegression(), classes=corpus.categories) visualizer.fit(X_train, y_train) # Fit the training data to the visualizer visualizer.score(X_test, y_test) # Evaluate the model on the test data visualizer.poof() visualizer = ConfusionMatrix(MultinomialNB(), classes=corpus.categories) visualizer.fit(X_train, y_train) # Fit the training data to the visualizer