def analyze_logistic(X, y, model, scale_columns, analyze_params=False, balance_outcomes=False):
    """
    Function for doing analysis of logistic regression.
    Plots cumulative gain, confusion matrix and, optionally, a grid search of
    the optimal learning rate/number of epochs in SGD with k-fold CV.
    Performs scaling of all continuous features in the data set.

    Inputs:
    - X: design matrix, shape (n, p)
    - y: targets, shape (n,)
    - model: logistic regression model instance (must implement set_eta,
      set_n_epochs, fit and predict)
    - scale_columns: list of indices of which columns to MinMax scale
    - analyze_params: boolean, option to perform grid search of learning rate
      and n_epochs in SGD
    - balance_outcomes: boolean, option to balance training data in case of
      skewed classes
    """

    # split data in train/validate and test
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.1)

    # balance training set such that outcomes are 50/50 in training data
    if balance_outcomes:
        non_default_inds = np.where(y_train_val == 0)[0]
        default_inds = np.where(y_train_val == 1)[0]

        remove_size = len(non_default_inds) - len(default_inds)
        remove_inds = np.random.choice(non_default_inds, size=remove_size, replace=False)

        # the indices refer to the train/validation split, so delete from
        # X_train_val/y_train_val (not from the full X/y)
        X_train_val = np.delete(X_train_val, remove_inds, axis=0)
        y_train_val = np.delete(y_train_val, remove_inds, axis=0)
    #end if

    # scale continuous features
    minmaxscaler = MinMaxScaler(feature_range=(-1, 1))
    scaler = ColumnTransformer(
        remainder='passthrough',
        transformers=[('minmaxscaler', minmaxscaler, scale_columns)])

    # scale only test data at this point (CV scales training/validation)
    scaler.fit(X_train_val)
    X_test = scaler.transform(X_test)

    if analyze_params:
        # initialize containers for saving results
        error_scores = pd.DataFrame(
            columns=['log eta', 'n_epochs', 'mse', 'r2', 'accuracy'])

        n_etas = 4
        eta_vals = np.linspace(-1, -4, n_etas)
        n_epoch_vals = np.array([10, 100, 500, 1000])
        n_epochs = len(n_epoch_vals)

        accuracy_scores = np.zeros((n_etas, n_epochs))

        max_accuracy = 0
        best_eta = 0
        best_n_epochs = 0

        # perform grid search of best learning rate
        # and number of epochs with k-fold cross-validation
        i = 0
        for eta in eta_vals:
            model.set_eta(10**eta)
            j = 0
            for epoch in n_epoch_vals:
                model.set_n_epochs(epoch)

                # perform cross validation
                mse, r2, accuracy = CV(X_train_val, y_train_val, model)
                accuracy_scores[i, j] = accuracy

                error_scores = error_scores.append(
                    {'log eta': eta,
                     'n_epochs': epoch,
                     'mse': mse,
                     'r2': r2,
                     'accuracy': accuracy},
                    ignore_index=True)

                # check if current configuration is better
                if accuracy > max_accuracy:
                    max_accuracy = accuracy
                    best_eta = eta
                    best_n_epochs = epoch

                j += 1
            #end for epoch
            i += 1
        #end for eta

        # set optimal model parameters
        model.set_eta(10**best_eta)
        model.set_n_epochs(best_n_epochs)

        # plot heatmap of grid search
        acc_table = pd.pivot_table(error_scores, values='accuracy',
                                   index=['log eta'], columns='n_epochs')

        idx_i = np.where(acc_table == max_accuracy)[0]
        idx_j = np.where(acc_table == max_accuracy)[1]

        fig = plt.figure()
        ax = sns.heatmap(acc_table, annot=True, fmt='.2g', cbar=True,
                         linewidths=1, linecolor='white',
                         cbar_kws={'label': 'Accuracy'})
        ax.add_patch(Rectangle((idx_j, idx_i), 1, 1,
                               fill=False, edgecolor='red', lw=2))
        ax.set_xlabel('Number of epochs')
        ax.set_ylabel(r'log$_{10}$ of Learning rate')

        bottom, top = ax.get_ylim()
        ax.set_ylim(bottom + 0.5, top - 0.5)
        plt.show()
    #end if

    # scale training data
    X_train_val = scaler.transform(X_train_val)

    # pylearn model
    model.fit(X_train_val, y_train_val)
    pred_train = model.predict(X_train_val)
    pred_test = model.predict(X_test)

    # sklearn model
    clf = linear_model.LogisticRegressionCV()
    clf.fit(X_train_val, y_train_val)
    pred_skl = clf.predict(X_test)

    # get accuracy scores
    accuracy_on_test = accuracy_score(y_test, pred_test)
    accuracy_on_train = accuracy_score(y_train_val, pred_train)
    accuracy_skl = accuracy_score(y_test, pred_skl)

    # predict probabilities
    pred_train_prob = model.predict(X_train_val, probability=True)
    pred_test_prob = model.predict(X_test, probability=True)

    # get area ratio and plot cumulative gain
    area_ratio_train = cumulative_gain_area_ratio(y_train_val, pred_train_prob,
                                                  title='Training results')
    area_ratio_test = cumulative_gain_area_ratio(y_test, pred_test_prob,
                                                 title=None)
    plt.show()

    # plot confusion matrix
    ax1 = plot_confusion_matrix(y_test, pred_test, normalize=True,
                                cmap='Blues', title=' ')
    ax2 = plot_confusion_matrix(y_train_val, pred_train, normalize=True,
                                cmap='Blues', title='Training data')

    bottom, top = ax1.get_ylim()
    ax1.set_ylim(bottom + 0.5, top - 0.5)
    ax2.set_ylim(bottom + 0.5, top - 0.5)
    plt.show()

    # print some stats
    print('===accuracy and area ratio stats===')
    print('accuracy on test:', accuracy_on_test)
    print('accuracy on train:', accuracy_on_train)
    print('accuracy skl:', accuracy_skl)
    print('area ratio train:', area_ratio_train)
    print('area ratio test:', area_ratio_test)

    if analyze_params:
        print('===grid search stats===')
        print('max accuracy:', max_accuracy)
        print('eta:', best_eta)
        print('n_epochs:', best_n_epochs)
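# A minimal usage sketch for analyze_logistic, assuming a credit-card-default
# style data set and a pylearn SGD logistic-regression model that exposes
# set_eta/set_n_epochs/fit/predict. The file name, column indices and the
# SGDClassification import path below are illustrative assumptions, not part
# of the function above:
#
#   import pandas as pd
#   from pylearn.logisticregression import SGDClassification   # hypothetical import
#
#   data = pd.read_csv('credit_card_data.csv')                 # hypothetical file
#   X = data.drop(columns=['default']).to_numpy()
#   y = data['default'].to_numpy()
#
#   continuous_cols = [0, 4, 11, 12]                           # columns to MinMax scale (example)
#   sgd_model = SGDClassification()
#
#   analyze_logistic(X, y, sgd_model, continuous_cols,
#                    analyze_params=True, balance_outcomes=True)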
m = model_info['model']
m.fit(X_train, y_train)
p_train = m.predict(X_train)
p_test = m.predict(X_test)
model_info['train'] = accuracy_score(y_train, p_train)
model_info['test'] = accuracy_score(y_test, p_test)
dump(m, model_info['nome'] + '.joblib')
del model_info['model']

# performance chart
df = pd.DataFrame(models_info)
#df.plot.bar(x='nome')
#plt.show()

# reload a saved model
model = load('tree.joblib')
new_example = [[0.0, 0.1, 2.4, 2.4]]
p = model.predict(new_example)
print('Prediction', dataset['target_names'][p])

# load kNN and produce the confusion matrix
knn = load('kNN.joblib')
p_test = knn.predict(X_test)
plot_confusion_matrix(y_test, p_test)
plt.show()
    def test_array_like(self):
        plot_confusion_matrix([0, 'a'], ['a', 0])
        plot_confusion_matrix([0, 1], [1, 0])
        plot_confusion_matrix(['b', 'a'], ['a', 'b'])
def models():
    data = request.get_json()

    # shared data loading: every model type works on the same CSV split
    training = pd.read_csv('training.csv')
    testing = pd.read_csv('testing.csv')
    x_train = training.drop(columns=['target'])
    y_train = training['target']
    x_test = testing.drop(columns=['target'])
    y_test_actual = testing['target']

    if data['model'] == 'svm':
        classifier = SVC(kernel=data['kernel'])
        classifier.fit(x_train, y_train)
        y_test_obtained = classifier.predict(x_test)
        testing['obtained'] = y_test_obtained
        testing.to_csv('result.csv', index=False)
        plot_confusion_matrix(y_test_actual, y_test_obtained)
        plt.savefig('figure.png')

    if data['model'] == 'knn':
        classifier = KNeighborsClassifier(n_neighbors=data['k'])
        classifier.fit(x_train, y_train)
        y_test_obtained = classifier.predict(x_test)
        testing['obtained'] = y_test_obtained
        testing.to_csv('result.csv', index=False)
        plot_confusion_matrix(y_test_actual, y_test_obtained)
        plt.savefig('figure.png')

    if data['model'] == 'logistic_regression':
        classifier = LogisticRegression(random_state=0, solver="liblinear",
                                        penalty=data['penalty'])
        classifier.fit(x_train, y_train)
        y_test_obtained = classifier.predict(x_test)
        testing['obtained'] = y_test_obtained
        testing.to_csv('result.csv', index=False)
        plot_confusion_matrix(y_test_actual, y_test_obtained)
        plt.savefig('figure.png')

    if data['model'] == 'linear_regression':
        classifier = LinearRegression()
        classifier.fit(x_train, y_train)
        y_test_obtained = classifier.predict(x_test)
        testing['obtained'] = y_test_obtained
        testing.to_csv('result.csv', index=False)
        plt.scatter(x_train, y_train, color='blue')
        plt.scatter(x_test, y_test_actual, color='green')
        plt.plot(x_test, y_test_obtained, color='red')
        plt.title("MSE: " + str(metrics.mean_squared_error(y_test_actual, y_test_obtained)))
        plt.legend(['Fitted Line', 'Training Data', 'Actual Testing Data'])
        plt.savefig('figure.png')

    if data['model'] == 'nn':
        model = Sequential()
        model.add(Dense(data['layers_dims'][0],
                        activation=data['layers_activation'][0],
                        input_shape=(x_train.shape[1],)))
        for i in range(1, len(data['layers_dims'])):
            model.add(Dense(data['layers_dims'][i],
                            activation=data['layers_activation'][i]))
        model.add(Dense(1))
        model.compile(optimizer='adam', loss='mean_squared_error')
        model.fit(x_train, y_train, validation_split=0.2, epochs=30)
        y_test_obtained = model.predict(x_test)
        testing['obtained'] = y_test_obtained
        testing.to_csv('result.csv', index=False)

    return jsonify([]), 200
predictions = keras_model.predict_classes(X_test)
for i in range(len(X_test)):
    print('(Prediction %d) => (expected %s)' % (predictions[i], y_test[i]))

print("Accuracy Train: %.2f%%" % (score1[1] * 100))
print("Accuracy Test: %.2f%%" % (score[1] * 100))

print(history.history.keys())

# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

#print(confusion_matrix(y_test, predictions))
plot_confusion_matrix(y_test2, y_pred, cmap=plt.cm.Oranges)
plt.show()
    def test_string_classes(self):
        np.random.seed(0)
        clf = LogisticRegression()
        clf.fit(self.X, convert_labels_into_string(self.y))
        preds = clf.predict(self.X)
        plot_confusion_matrix(convert_labels_into_string(self.y), preds)
def create_booster_summary( booster: Union[lgb.Booster, lgb.sklearn.LGBMModel], log_importances: bool = True, max_num_features: int = 10, list_trees: list = None, log_trees_as_dataframe: bool = True, log_pickled_booster: bool = True, log_trees: bool = False, tree_figsize: int = 30, log_confusion_matrix: bool = False, y_true: np.ndarray = None, y_pred: np.ndarray = None, ): """Create model summary after training that can be assigned to the run namespace. See guide with examples in the `Neptune-LightGBM docs`_. You can log multiple types of metadata: - pickled model - feature importance chart - visualized trees - trees represented as DataFrame - confusion matrix (only for classification problems) See Args section for more info how to parametrize behaviour of this function. Note: You can log summary to the new run, or to the same run that you used for logging model training. Second option can be very useful because you have all the information in the single run. Args: booster (:obj:`lgb.Booster` or :obj:`lgb.sklearn.LGBMModel`): Trained LightGBM model. log_importances (bool): Defaults to True. Log feature importance charts. max_num_features (int): Defaults to 10. Max number of top features on the importance charts. Works only if ``log_importances`` is set to ``True``. If None or <1, all features will be displayed. See `lightgbm.plot_importance`_ for details. list_trees (list): Defaults to None. Indices of the target tree to visualize. Works only if ``log_trees`` is set to ``True``. See `lightgbm.plot_tree`_ for details. log_trees_as_dataframe (bool): Defaults to True. Parse the model and log trees in the easy-to-read pandas DataFrame format. Works only for ``lgb.Booster``. See `lightgbm.Booster.trees_to_dataframe`_ for details. log_pickled_booster (bool): Defaults to True. Log model as pickled file. log_trees (bool): Defaults to False. Log visualized trees. This requires graphviz to work. Learn about setup in the `Neptune-LightGBM installation`_ docs. tree_figsize (int): Defaults to 30, Control size of the visualized tree image. Increase this in case you work with large trees. Works only if ``log_trees`` is set to ``True``. log_confusion_matrix (bool): Defaults to False. Log confusion matrix. If set to True, you need to pass ``y_true`` and ``y_pred``. y_true (:obj:`np.ndarray`): Defaults to None. True labels on the test set. Needed only if ``log_confusion_matrix`` is set to True. y_pred (:obj:`np.ndarray`): Defaults to None. Predictions on the test set. Needed only if ``log_confusion_matrix`` is set to True. Returns: dict: Python dictionary with all metadata, that can be assigned to the run namespace. ``run["booster_summary"] = create_booster_summary(...)`` Examples: For more examples visit `example scripts`_. 
Full script that does logging during model training and logs booster summary after training:: import lightgbm as lgb import neptune.new as neptune import numpy as np from neptune.new.integrations.lightgbm import NeptuneCallback, create_booster_summary from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split # Create run run = neptune.init( project="common/lightgbm-integration", api_token="ANONYMOUS", name="train-cls", tags=["lgbm-integration", "train", "cls"] ) # Create neptune callback neptune_callback = NeptuneCallback(run=run) # Prepare data X, y = load_digits(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123) lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) # Define parameters params = { "boosting_type": "gbdt", "objective": "multiclass", "num_class": 10, "metric": ["multi_logloss", "multi_error"], "num_leaves": 21, "learning_rate": 0.05, "feature_fraction": 0.9, "bagging_fraction": 0.8, "bagging_freq": 5, "max_depth": 12, } # Train the model and log metadata to the run in Neptune gbm = lgb.train( params, lgb_train, num_boost_round=200, valid_sets=[lgb_train, lgb_eval], valid_names=["training", "validation"], callbacks=[neptune_callback], ) y_pred = np.argmax(gbm.predict(X_test), axis=1) # Log summary metadata to the same run under the "lgbm_summary" namespace run["lgbm_summary"] = create_booster_summary( booster=gbm, log_trees=True, list_trees=[0, 1, 2, 3, 4], log_confusion_matrix=True, y_pred=y_pred, y_true=y_test ) .. _Neptune-LightGBM docs: https://docs.neptune.ai/integrations-and-supported-tools/model-training/lightgbm _lightgbm.plot_importance: https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.plot_importance.html#lightgbm-plot-importance _lightgbm.plot_tree: https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.plot_tree.html#lightgbm-plot-tree _lightgbm.Booster.trees_to_dataframe: https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html#lightgbm.Booster.trees_to_dataframe _Neptune-LightGBM installation: https://docs.neptune.ai/integrations-and-supported-tools/model-training/lightgbm#install-requirements _example scripts: https://github.com/neptune-ai/examples/tree/main/integrations-and-supported-tools/lightgbm/scripts """ results_dict = {} visuals_path = "visualizations/" if log_importances: split_plot = lgb.plot_importance(booster, importance_type="split", title="Feature importance (split)", max_num_features=max_num_features) gain_plot = lgb.plot_importance(booster, importance_type="gain", title="Feature importance (gain)", max_num_features=max_num_features) results_dict["{}feature_importances/split".format(visuals_path)] \ = neptune.types.File.as_image(split_plot.figure) results_dict["{}feature_importances/gain".format(visuals_path)] \ = neptune.types.File.as_image(gain_plot.figure) if log_trees: try: subprocess.call(["dot", "-V"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) except OSError: log_trees = False message = "Graphviz executables not found, so trees will not be logged. 
" \ "Make sure the Graphviz executables are on your systems' PATH" warnings.warn(message) if log_trees: trees_series = [] for i in list_trees: digraph = lgb.create_tree_digraph(booster, tree_index=i, show_info="data_percentage") _, ax = plt.subplots(1, 1, figsize=(tree_figsize, tree_figsize)) s = BytesIO() s.write(digraph.pipe(format="png")) s.seek(0) ax.imshow(image.imread(s)) ax.axis("off") trees_series.append(neptune.types.File.as_image(ax.figure)) results_dict["{}trees".format( visuals_path)] = neptune.types.FileSeries(trees_series) if log_trees_as_dataframe: if isinstance(booster, lgb.Booster): df = booster.trees_to_dataframe() html_df = neptune.types.File.as_html(df) results_dict["trees_as_dataframe"] = html_df if not df.empty and not html_df.content: warnings.warn( "'trees_as_dataframe' wasn't logged. Probably generated dataframe was to large." ) else: warnings.warn( "'trees_as_dataframe' won't be logged." " `booster` must be instance of `lightgbm.Booster` class.") if log_pickled_booster: results_dict["pickled_model"] = neptune.types.File.as_pickle(booster) if log_confusion_matrix: ax = plot_confusion_matrix(y_true=y_true, y_pred=y_pred) results_dict[ f"{visuals_path}confusion_matrix"] = neptune.types.File.as_image( ax.figure) return results_dict
# Avoid using 'accuracy' (*even if balanced classes*) - https://stats.stackexchange.com/q/312780
# Brier score (`brier_score_loss`)

# Confusion Matrix

### sklearn
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

### skplot
from scikitplot.metrics import plot_confusion_matrix
plot_confusion_matrix(y_test, y_pred, normalize=False)

### Yellowbrick
from yellowbrick.classifier import ConfusionMatrix
conf_matrix = ConfusionMatrix(rf,
                              classes=cancer.target_names,
                              label_encoder={0: 'benign', 1: 'malignant'})
conf_matrix.fit(X_train, y_train)
conf_matrix.score(X_test, y_test)
conf_matrix.poof()
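# The notes mention the Brier score without showing a call. A minimal sketch,
# assuming a fitted binary classifier `clf` with predict_proba and the same
# X_test/y_test split used above (these names are assumptions):
from sklearn.metrics import brier_score_loss

y_prob = clf.predict_proba(X_test)[:, 1]                   # probability of the positive class
print('Brier score:', brier_score_loss(y_test, y_prob))   # lower is better, 0 is perfect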
xAxisTitle = "Sig_Prob" yAxisTitle = "# Events (scaled to 1)" outPlotName = "Discriminator_Beautified.pdf" pB.plotBeautifier(histList, labelList, xAxisTitle, yAxisTitle, outPlotName) # In[27]: type((sigTrainPredict[:, 0]).astype(float)) # In[12]: # Confusion Matrix plt.clf() plt.figure(figsize=(10, 7)) plot_confusion_matrix( np.array(yTest)[:, 0], np.array(xTestPredict)[:, 0] > 0.5) plt.savefig("ConfusionMatrix.pdf") print("Confusion Matrix printed!!!") # In[15]: # For plots based on discriminator cut sigProb = np.arange(0, 1, 0.01) xTestPredictSigProb = np.array(xTestPredict)[:, 0] yTestClass = np.array(yTest)[:, 0] tpr = [] fpr = []
print(df['sport'].unique())
print(df['altezza'].describe())

df.plot.scatter(x='altezza', y='peso')
df.plot.bar()

# SCIKIT
dataset = load_wine()
X = dataset['data']
y = dataset['target']

X_train, X_test, y_train, y_test = train_test_split(X, y)

model = LogisticRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

print('accuracy', accuracy_score(y_test, predictions))

plot_confusion_matrix(y_test, predictions)
plt.show()
################################################################################
##
## Evaluate the better model on valid
EvaluateTheBetter = {}
EvaluateTheBetter["Model"] = TheBetterModel

##
## AUC score
EvaluateTheBetter["Probability"] = EvaluateTheBetter["Model"].predict([Valid["Image"], Valid["Variable"]])
EvaluateTheBetter["AUC"] = roc_auc_score(numpy.array(Valid["Label"]), EvaluateTheBetter["Probability"][:, 1])

##
## plot AUC
AUC = plot_roc(y_true=Valid["Label"], y_probas=EvaluateTheBetter["Probability"],
               plot_micro=False, plot_macro=False, classes_to_plot=[1]).get_figure()
AUC.savefig(ResultPath + Time + "/AUC.png")

EvaluateTheBetter["Prediction"] = numpy.argmax(EvaluateTheBetter["Probability"], axis=1)
EvaluateTheBetter["ConfuseTable"] = confusion_matrix(Valid["Label"], EvaluateTheBetter["Prediction"])
ConfusionTable = plot_confusion_matrix(y_true=Valid["Label"],
                                       y_pred=EvaluateTheBetter["Prediction"]).get_figure()
ConfusionTable.savefig(ResultPath + Time + "/ConfusionTable.png")

##
## Threshold table
_, _, Threshold = roc_curve(numpy.array(Valid["Label"]), EvaluateTheBetter["Probability"][:, 1])
ThresholdTable = {"Threshold": [], "Accuracy": [], "Sensitivity": [], "Specificity": [], "Precision": []}
for threshold in Threshold:
    prediction = EvaluateTheBetter["Probability"][:, 1] > threshold
    accuracy = accuracy_score(Valid["Label"], prediction)
    confuse = confusion_matrix(Valid["Label"], prediction)
    sensitivity = confuse[1, 1] / sum(confuse[1, :])
    specificity = confuse[0, 0] / sum(confuse[0, :])
    # precision = TP / (TP + FP): normalize over the predicted-positive column,
    # not the true-positive row (which would just repeat the sensitivity)
    precision = confuse[1, 1] / sum(confuse[:, 1])
    ThresholdTable["Threshold"].append(threshold)
    ThresholdTable["Accuracy"].append(accuracy)
    ThresholdTable["Sensitivity"].append(sensitivity)
    def test_hide_counts(self):
        np.random.seed(0)
        clf = LogisticRegression()
        clf.fit(self.X, self.y)
        preds = clf.predict(self.X)
        plot_confusion_matrix(self.y, preds, hide_counts=True)
print 'recall(macro): ' + str(
    recall_score(groundTruthList, predictionList, average='macro'))
print 'recall(micro): ' + str(
    recall_score(groundTruthList, predictionList, average='micro'))
fs.write('recall(macro): ' +
         str(recall_score(groundTruthList, predictionList, average='macro')) + '\n')
fs.write('recall(micro): ' +
         str(recall_score(groundTruthList, predictionList, average='micro')) + '\n')

print 'f1(macro): ' + str(
    f1_score(groundTruthList, predictionList, average='macro'))
print 'f1(micro): ' + str(
    f1_score(groundTruthList, predictionList, average='micro'))
fs.write('f1(macro): ' +
         str(f1_score(groundTruthList, predictionList, average='macro')) + '\n')
fs.write('f1(micro): ' +
         str(f1_score(groundTruthList, predictionList, average='micro')) + '\n')

print confusion_matrix(groundTruthList, predictionList)

plot_confusion_matrix(groundTruthList, predictionList)
matrixFileName = dirName + '/confusionMatrix.png'
plt.savefig(matrixFileName)
#plt.show()

fs.close()
         density=True, color=None, histtype='step', label='bkg')
plt.xlabel("Sig_Prob")
plt.ylabel("#Events (Area scaled to 1)")
plt.legend()
plt.savefig("Discriminator.pdf")
print("Discriminator plotted!!!")

# Confusion matrix
threshold = 0.5
pred_class = pred_one > threshold
plt.clf()
plt.figure(figsize=(10, 7))
plot_confusion_matrix(labl, pred_class)
plt.savefig("ConfusionMatrix.pdf")
print("Confusion Matrix printed!!!")

# F1 Score
nofvalues = 100
thresh = np.empty(nofvalues)
f1score = np.empty(nofvalues)
plt.clf()
for beta in [0.1, 0.25, 0.5, 1, 1.25, 1.5, 2, 5, 10]:
    for i in range(100):
        thresh[i] = i * 1.0 / 100
        pred_class = pred_one > thresh[i]
        f1score[i] = fbeta_score(labl, pred_class, beta)
    def classify(self, X, type: str, classifier: str, test_prop: float,
                 res=None, res_method=None):
        if type == 'binary':
            y = self.df['class'].replace(0, 1)
        elif type == 'multi':
            y = self.df['class']
        else:
            raise TypeError("Choose a proper type of classification")

        X_train, X_test, Y_train, Y_test = train_test_split(
            X, y, test_size=test_prop, stratify=y)

        if res == True:
            if res_method == 'down':
                nm = NearMiss()
                X_res, Y_res = nm.fit_resample(X_train, Y_train)
            elif res_method == 'up':
                sm = ADASYN()
                X_res, Y_res = sm.fit_resample(X_train, Y_train)
            else:
                raise TypeError(
                    "Resampling method not provided. Please use 'up' for oversampling "
                    "or 'down' for undersampling.")

        if classifier == 'lr':
            model = LogisticRegression(solver='liblinear', class_weight='balanced',
                                       C=0.04, penalty='l2')
        elif classifier == 'svc':
            model = LinearSVC(C=0.004, penalty='l2')
        elif classifier == 'rf':
            n_est = int(input("Type in number of trees to estimate from: ").strip())
            model = RandomForestClassifier(n_estimators=n_est, bootstrap=True, max_depth=5)
        elif classifier == 'xgb':
            n_est = int(input("Type in number of trees to estimate from: ").strip())
            model = XGBClassifier(n_estimators=n_est, bootstrap=True, max_depth=5,
                                  reg_lambda=0.4)
        elif classifier == 'ada':
            n_est = int(input("Type in number of trees to estimate from: ").strip())
            model = AdaBoostClassifier(n_estimators=n_est, learning_rate=0.005)
        else:
            raise TypeError(
                "Choose a proper classifier. Possible inputs: 'lr', 'svc', 'rf', 'xgb', 'ada'.")

        if res == True:
            model.fit(X_res, Y_res)
        else:
            model.fit(X_train, Y_train)

        Y_pred = model.predict(X_test)

        # Accuracy Percentage
        print(f"Accuracy is {round(accuracy_score(Y_test, Y_pred), 2) * 100}%")

        # Classification Report (true labels first, predictions second)
        print(classification_report(Y_test, Y_pred))

        # Matthew's Correlation Coefficient
        print(f"Matthew's Correlation Coefficient is {matthews_corrcoef(Y_test, Y_pred)}")

        # Plots of Confusion Matrix and ROC Curve
        plot_confusion_matrix(Y_test, Y_pred, figsize=(10, 10))

        return model
Y = df.iloc[:, 2]
#labelencoder_Y = LabelEncoder()
#Y = labelencoder_Y.fit_transform(y)
#Y = onehotencoder = pd.get_dummies(y)

#df = pd.read_csv('open_palm_simi_test1.csv')
#X_test = df.iloc[0:, :].values

from sklearn.metrics import confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42, shuffle=False)

# Run classifier, using a model that is too regularized (C too low) to see
# the impact on the results
model = SVC(kernel='linear', C=1).fit(X_train, y_train)
predictions = model.predict(X_test)

# creating a confusion matrix
cm = plot_confusion_matrix(y_test, predictions)

########################################
#file1 = open("result.txt","w+")
#file1.write(predictions[1])  #this will give the predicted gesture as output to the text file
#file1.close()

import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = [10, 10]
labels = ['fist_palm', 'fist_dorsal', 'open_palm', 'open_dorsal',
          'three_fingers_dorsal', 'three_fingers_palm']
sk.metrics.plot_confusion_matrix(model, X_test, y_test,
                                 normalize="pred", display_labels=labels)
plt.show()
dataset = pd.read_json("./data.json")  #520

x = np.array([])
y = np.array([])

for value in enumerate(dataset["LIGHT_VALUE"]):
    x = np.append(x, int(value[1]))

for value in enumerate(dataset["NEED_LIGHT"]):
    y = np.append(y, int(value[1]))

x = x.reshape(-1, 1)
y = y.reshape(-1, 1)

x_train, x_test, y_train, y_test = train_test_split(x, y)

model = DecisionTreeClassifier()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print("Test accuracy")
print(str(accuracy_score(y_test, y_pred) * 100) + "%")

plot_confusion_matrix(y_test, y_pred)
plt.show()

y_pred = model.predict([[400]])
print(bool(y_pred[0]))
test_y = y.values[test_ind]

train_X_2 = train_X[:, selector.support_]
test_X_2 = test_X[:, selector.support_]

clsf_1 = RandomForestClassifier(n_estimators=500, class_weight="balanced", n_jobs=-1)
clsf_1.fit(train_X, train_y)
RF_pred = clsf_1.predict(test_X)
pred_probability = clsf_1.predict_proba(test_X)

evaluation_metrics(test_y, RF_pred, pred_probability)

_now = dt.now().strftime("%Y%m%d-%H-%M-%S")
plot_confusion_matrix(test_y, RF_pred)
plt.savefig("{}_result_before_rfe.png".format(_now))
plot_roc(test_y, pred_probability)
plt.savefig("{}_roc_result_before_rfe.png".format(_now))

clsf_2 = RandomForestClassifier(n_estimators=500, class_weight="balanced", n_jobs=-1)
clsf_2.fit(train_X_2, train_y)
RF_pred = clsf_2.predict(test_X_2)
pred_probability = clsf_2.predict_proba(test_X_2)

evaluation_metrics(test_y, RF_pred, pred_probability)
#plot_confusion_matrix(test_y, RF_pred)
#plot_roc(test_y, pred_probability)
def evaluate_models(y, preds):
    print('Accuracy: {}'.format(accuracy_score(y, preds)))
    print('ROC-AUC: {}'.format(roc_auc_score(y, preds)))
    print('Recall: {}'.format(recall_score(y, preds)))
    skplt.plot_confusion_matrix(y, preds, figsize=(8, 6))
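# A minimal usage sketch for evaluate_models on a synthetic binary problem;
# the data set and classifier below are illustrative assumptions, not part of
# the original snippet (skplt and the metric imports are assumed in scope):
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_classification(n_samples=500, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, random_state=0)
clf_demo = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)

evaluate_models(y_te, clf_demo.predict(X_te))
plt.show()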
             round(metrics.mean_absolute_error(y_test, pred), 4))
    st.write('Mean Squared Error (MSE):',
             round(metrics.mean_squared_error(y_test, pred), 4))
    st.write('Root Mean Squared Error (RMSE):',
             round(np.sqrt(metrics.mean_squared_error(y_test, pred)), 4))
    st.write('Accuracy of Logistic Regression on training set: ',
             round(logReg.score(X_train, y_train), 4))
    st.write('Accuracy of Logistic Regression on test set: ',
             round(logReg.score(X_test, y_test), 4))

    st.subheader("Classification Report")
    st.text(classification_report(y_test, pred))

    try:
        # Confusion matrix
        plot_confusion_matrix(y_test, pred, figsize=(7, 5), cmap="PuBuGn")
        bottom, top = plt.ylim()
        plt.ylim(bottom + 0.5, top - 0.5)
        st.pyplot()
    except:
        st.write("Confusion matrix does not support multioutput.")
except:
    st.write("Fill all parameters.")

# plot_calibration_curve(y_test, [pred])
# st.pyplot()

########################################
# KNN CLASSIFIER
########################################
def plot_classification_metrics(y_true, y_pred_proba, classes_to_plot=None,
                                threshold=None, plot_micro=True):
    """
    Plot ROC Curve, Precision-Recall Curve and Confusion Matrix

    Parameters
    ----------
    y_true : array-like, shape (n_samples)
        Ground truth labels.
    y_pred_proba : array-like, shape (n_samples, n_classes)
        Prediction probabilities or decision scores for each class
        returned by a classifier.
    classes_to_plot : list-like, optional
        Classes for which the ROC curve should be plotted, e.g. [0, 'cold'].
        If the specified class does not exist, it will be ignored.
        If ``None``, all classes will be plotted.
    threshold : None or float
        If a float is set, it will be used as the decision threshold for
        binary classification.
    plot_micro : bool
        Whether to plot the averaged ROC curve and the precision-recall curve
        using average method 'micro'.

    Returns
    -------
    fig : :class:`matplotlib.figure.Figure` object
    axs : Axes object or array of Axes objects.
    """
    fig = plt.figure(dpi=100, figsize=(10.5, 8))
    ax1 = plt.subplot2grid((4, 4), (0, 0), rowspan=2, colspan=2)
    ax2 = plt.subplot2grid((4, 4), (0, 2), rowspan=2, colspan=2)
    ax3 = plt.subplot2grid((4, 4), (2, 0), rowspan=2, colspan=2)

    # region Plot ROC Curve
    plot_roc(y_true, y_pred_proba, plot_macro=False, plot_micro=plot_micro,
             classes_to_plot=classes_to_plot, ax=ax1)
    # endregion

    # region Plot Precision-Recall Curve
    plot_precision_recall(y_true, y_pred_proba, plot_micro=plot_micro,
                          classes_to_plot=classes_to_plot, ax=ax2)
    ax2.legend(loc='lower right')
    # endregion

    # region Plot Confusion Matrix
    y_pred_idx = np.argmax(y_pred_proba, axis=-1)
    labels = np.sort(np.unique(y_true))
    y_pred = labels[y_pred_idx]
    plot_confusion_matrix(y_true, y_pred, normalize=True, ax=ax3)
    # endregion

    axs = [ax1, ax2, ax3]

    if threshold:
        # region Plot Confusion Matrix with adjusted decision threshold
        labels = np.sort(np.unique(y_true))
        assert len(labels) == 2, """Problem is not binary classification
        but decision threshold is set"""
        ax4 = plt.subplot2grid((4, 4), (2, 2), rowspan=2, colspan=2)
        is_positive = y_pred_proba[:, 1] > threshold
        y_pred = labels[is_positive.astype('int')]
        plot_confusion_matrix(y_true, y_pred, normalize=True, ax=ax4)
        ax4.set_title('Confusion Matrix with adjusted '
                      'decision threshold: {:.2}'.format(threshold))

        # update color limit
        im3 = ax3.get_images()[0]
        clim = im3.get_clim()
        im4 = ax4.get_images()[0]
        im4.set_clim(clim)

        axs.append(ax4)
        # endregion

    fig.tight_layout()
    return fig, axs
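# A minimal usage sketch for plot_classification_metrics, assuming scikit-plot's
# plot_roc/plot_precision_recall/plot_confusion_matrix (plus numpy and pyplot)
# are imported as the function above expects; the data set and classifier are
# illustrative assumptions:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_bc, y_bc = load_breast_cancer(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X_bc, y_bc, random_state=0)
clf_bc = LogisticRegression(max_iter=5000).fit(X_tr, y_tr)

# probabilities for both classes, plus a lowered decision threshold of 0.3
fig, axs = plot_classification_metrics(y_te, clf_bc.predict_proba(X_te),
                                       threshold=0.3)
fig.savefig('classification_metrics.png')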
    def test_labels(self):
        np.random.seed(0)
        clf = LogisticRegression()
        clf.fit(self.X, self.y)
        preds = clf.predict(self.X)
        plot_confusion_matrix(self.y, preds, labels=[0, 1, 2])
    def plot_image(self):
        plot_confusion_matrix(self.__train_label, self.__val_preds, normalize=True)
        plt.show()
from scikitplot.metrics import plot_confusion_matrix

model.freeze()
test_data = dm.test_dataloader()
y_true = np.array([])
y_pred = np.array([])

for i, (x, y) in enumerate(test_data):
    y = y.cpu().detach().numpy()
    y_hat = model.forward(x).argmax(axis=1).cpu().detach().numpy()

    y_true = np.append(y_true, y)
    y_pred = np.append(y_pred, y_hat)

fig, ax = plt.subplots(figsize=(16, 12))
plot_confusion_matrix(y_true, y_pred, ax=ax)
neptune_logger.experiment.log_image('confusion_matrix', fig)

## 2: Log model checkpoints to Neptune
for k in model_checkpoint.best_k_models.keys():
    model_name = 'checkpoints/' + k.split('/')[-1]
    neptune_logger.experiment.log_artifact(k, model_name)

## 3: Log best model checkpoint score to Neptune
neptune_logger.experiment.set_property(
    'best_model_score', model_checkpoint.best_model_score.tolist())

## 4: Log model summary
    fp -= tp
    result = round((tp + tn) / float(tp + tn + fn + fp), 4)
    accuracys.append(result)

#print accuracys
fs.write('class accuracys: ' + str(accuracys).strip('[]'))
fs.write('\n')
fs.write('\n')
fs.write(
    toLatexTab(accuracys,
               precision_score(groundTruthList, predictionList, average=None),
               recall_score(groundTruthList, predictionList, average=None),
               f1_score(groundTruthList, predictionList, average=None), 4))
fs.write('\n')
fs.write(
    toLatexTab(accuracys,
               precision_score(groundTruthList, predictionList, average=None),
               recall_score(groundTruthList, predictionList, average=None),
               f1_score(groundTruthList, predictionList, average=None), 2))

plot_confusion_matrix(groundTruthList, predictionList,
                      x_tick_rotation=90, figsize=(14, 13), title=' ',
                      text_fontsize='large', cmap='Reds')
matrixFileName = dirName + '/confusionMatrix.png'
plt.savefig(matrixFileName)
#plt.show()

fs.close()
# fit the model
xgb.fit(X_train, y_train)

# make prediction
y_pred = (xgb.predict_proba(X_val)[:, 1] >= 0.21).astype('int')  # setting a threshold

!pip install scikit-plot

# evaluation
from sklearn.metrics import f1_score, classification_report
from scikitplot.metrics import plot_confusion_matrix
from scikitplot.classifiers import plot_feature_importances

print('F1 Score: {}'.format(f1_score(y_pred, y_val)))
print(classification_report(y_pred, y_val))

# plot confusion matrix
plot_confusion_matrix(y_pred, y_val)

# plot importance
features = train.drop(columns=['user_id', 'product_id', 'reordered'])
plot_feature_importances(xgb, feature_names=features.columns,
                         x_tick_rotation=90, max_num_features=20, figsize=(10, 8))

# delete X_train, X_test, y_train, y_test
del [X_train, X_val, y_train, y_val, means]
gc.collect()

"""## Fit entire training set"""

# fit on entire dataset
xgb.fit(train.drop(columns=['user_id', 'product_id', 'reordered',
                            'user_avg_order_dow', 'user_avg_order_hour']),
        train['reordered'])

# make prediction on test set
# Make the predictions
previsoes = model_dt.predict(X_valid)

# Make the predictions and build the report
report = classification_report(y_valid, previsoes)

# Print the report
print(report)

# Allows checking the accuracy in a table format
matrix = confusion_matrix(y_valid, previsoes)

# Call the function to visualize the confusion matrix
plot_confusion_matrix(matrix, target_names=['0', '1'], normalize=False,
                      title="Confusion Matrix")

"""### Training with **LOGISTIC REGRESSION**"""

# Train the Logistic Regression model
model_lr = LogisticRegression(random_state=42)
model_lr.fit(X_train, y_train)

# Make the predictions
previsoes = model_lr.predict(X_valid)

# Make the predictions and build the report
report = classification_report(y_valid, previsoes)

# Print the report
import numpy as np

y_test_pred = np.asarray(model.predict(x_test))
y_test_pred_class = np.argmax(y_test_pred, axis=1)

from sklearn.metrics import f1_score

f1 = f1_score(y_test, y_test_pred_class, average='micro')
neptune.log_metric('test_f1', f1)

import matplotlib.pyplot as plt
from scikitplot.metrics import plot_confusion_matrix, plot_roc

fig, ax = plt.subplots(figsize=(16, 12))
plot_confusion_matrix(y_test, y_test_pred_class, ax=ax)
neptune.log_image('diagnostic_charts', fig)

fig, ax = plt.subplots(figsize=(16, 12))
plot_roc(y_test, y_test_pred, ax=ax)
neptune.log_image('diagnostic_charts', fig)

model.save('my_model.h5')
neptune.log_artifact('my_model.h5')

# tests
current_exp = neptune.get_experiment()

correct_logs = [
    'batch_loss', 'batch_accuracy', 'epoch_loss', 'epoch_accuracy',
    'epoch_val_loss', 'epoch_val_accuracy', 'test_f1', 'diagnostic_charts'
def main():
    neptune.init(api_token=os.getenv('NEPTUNE_API_TOKEN'),
                 project_qualified_name=os.getenv('NEPTUNE_PROJECT'))

    train_idx = pd.read_csv(TRAIN_IDX_PATH, nrows=NROWS)
    valid_idx = pd.read_csv(VALID_IDX_PATH, nrows=NROWS)
    features = pd.read_csv(FEATURES_PATH, nrows=NROWS)

    train = pd.merge(train_idx, features, on='SK_ID_CURR')
    valid = pd.merge(valid_idx, features, on='SK_ID_CURR')

    all_params = {'num_boost_round': NUM_BOOST_ROUND,
                  'early_stopping_rounds': EARLY_STOPPING_ROUNDS,
                  **LGBM_PARAMS}

    with neptune.create_experiment(name='model training',
                                   params=all_params,
                                   tags=['lgbm'],
                                   upload_source_files=get_filepaths(),
                                   properties={'features_path': FEATURES_PATH,
                                               'features_version': md5_hash(FEATURES_PATH),
                                               'train_split_version': md5_hash(TRAIN_IDX_PATH),
                                               'valid_split_version': md5_hash(VALID_IDX_PATH)}):
        results = train_evaluate(train, valid, LGBM_PARAMS,
                                 callbacks=[neptune_monitor()])
        train_score, valid_score = results['train_score'], results['valid_score']
        train_preds, valid_preds = results['train_preds'], results['valid_preds']

        neptune.send_metric('train_auc', train_score)
        neptune.send_metric('valid_auc', valid_score)

        train_pred_path = os.path.join(PREDICTION_DIRPATH, 'train_preds.csv')
        train_preds.to_csv(train_pred_path, index=None)
        neptune.send_artifact(train_pred_path)

        valid_pred_path = os.path.join(PREDICTION_DIRPATH, 'valid_preds.csv')
        valid_preds.to_csv(valid_pred_path, index=None)
        neptune.send_artifact(valid_pred_path)

        model_path = os.path.join(MODEL_DIRPATH, 'model.pkl')
        joblib.dump(results['model'], model_path)
        neptune.set_property('model_path', model_path)
        neptune.set_property('model_version', md5_hash(model_path))
        neptune.send_artifact(model_path)

        if PACKAGE_TO_PROD:
            saved_path = CreditDefaultClassifier.pack(
                model=results['model']).save(PRODUCTION_DIRPATH)
            neptune.set_property('production_model_path', saved_path)

        fig, ax = plt.subplots(figsize=(16, 12))
        sk_metrics.plot_confusion_matrix(valid_preds['TARGET'],
                                         valid_preds['preds_pos'] > 0.5, ax=ax)
        plot_path = os.path.join(REPORTS_DIRPATH, 'conf_matrix.png')
        fig.savefig(plot_path)
        neptune.send_image('diagnostics', plot_path)

        fig, ax = plt.subplots(figsize=(16, 12))
        sk_metrics.plot_roc(valid_preds['TARGET'],
                            valid_preds[['preds_neg', 'preds_pos']], ax=ax)
        plot_path = os.path.join(REPORTS_DIRPATH, 'roc_auc.png')
        fig.savefig(plot_path)
        neptune.send_image('diagnostics', plot_path)

        fig, ax = plt.subplots(figsize=(16, 12))
        sk_metrics.plot_precision_recall(valid_preds['TARGET'],
                                         valid_preds[['preds_neg', 'preds_pos']], ax=ax)
        plot_path = os.path.join(REPORTS_DIRPATH, 'prec_recall.png')
        fig.savefig(plot_path)
        neptune.send_image('diagnostics', plot_path)

        fig, ax = plt.subplots(figsize=(16, 12))
        plot_prediction_distribution(valid_preds['TARGET'],
                                     valid_preds['preds_pos'], ax=ax)
        plot_path = os.path.join(REPORTS_DIRPATH, 'preds_dist.png')
        fig.savefig(plot_path)
        neptune.send_image('diagnostics', plot_path)