Example No. 1
# Scale the input values
scaledXMM = scalerMM.fit_transform(x)
scaledXDFMM[x.columns] = scaledXMM
scaledXDFMM.hist(column=['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave_points_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se'])

scaledXS = scalerS.fit_transform(x)
scaledXDFS[x.columns] = scaledXS
scaledXDFS.hist(column=['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave_points_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se'])

# Split into train and test sets
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3)
# Create the multilayer perceptron model
model = MLPClassifier(alpha=1, max_iter=1000)
# Train it
model.fit(xtrain, ytrain)
# Score the model

print('Train: ', model.score(xtrain, ytrain))
print('Test: ', model.score(xtest, ytest))
# Get predictions on the test set
ytestpred = model.predict(xtest)

# Print the classification report
print('Classification report: \n', classification_report(ytest, ytestpred))
class_names = [0, 1]
disp = plot_confusion_matrix(model, xtest, ytest, display_labels=class_names, cmap=plt.cm.Blues)
disp.ax_.set_title("Confusion matrix, without normalization")
plt.show()
print(confusion_matrix(ytest, ytestpred))
Example No. 2
        CV_err_arr = np.append(CV_err_arr, np.mean(err_arr))
        CV_var_arr = np.append(CV_var_arr, np.var(err_arr))

    print(np.round(CV_err_arr, 2))
    print(np.round(np.sqrt(CV_var_arr), 2))

# apply model to test data using hyperparameter k=1 (which was found to be the
# best; this is probably because images are "far" away from each other in
# space and thus there's no noise to be reduced by increasing k):

# create and train the kNN Classifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_nontest, d_nontest.ravel())

# plot the confusion matrix
matrix = plot_confusion_matrix(knn,
                               X_test,
                               d_test,
                               cmap=plt.cm.Blues,
                               normalize='true')
plt.title('Confusion matrix for OvR classifier')
plt.show()

# test model on the test data
d_hat = knn.predict(X_test)
err = 100 * (1 - metrics.accuracy_score(d_test.ravel(), d_hat))

print(np.round(err, 2))
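# A minimal sketch (an assumption, not the original code) of a cross-validation
# loop that could have produced CV_err_arr above while choosing k for the kNN:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

CV_err_arr = np.array([])
for k in range(1, 11):
    acc_arr = cross_val_score(KNeighborsClassifier(n_neighbors=k),
                              X_nontest, d_nontest.ravel(), cv=5)
    # store the mean percent error for each k, matching the printout above
    CV_err_arr = np.append(CV_err_arr, 100 * (1 - np.mean(acc_arr)))
print(np.round(CV_err_arr, 2))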
Example No. 3
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
y_true = np.array([1] * 194 + [0] * 194)
x_pred = probabilities > 0.5
x_pred = x_pred.astype(int)
print(y_test.shape)
print(x_pred.shape)
x_pred = x_pred.reshape(-1, 1)
y_test = y_true.reshape(-1, 1)
print(y_test.shape)
print(x_pred.shape)
clf = svm.SVC(kernel='linear', C=1.0)
clf.fit(x_pred, y_test.ravel())
plot_confusion_matrix(clf,
                      x_pred,
                      y_test,
                      normalize='all')
plt2.show()

# F-score is printed here, along with precision and recall
fScore = f1_score(y_test, x_pred, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division='warn')
precision = precision_score(y_test, x_pred, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division='warn')
recall = recall_score(y_test, x_pred, labels=None, pos_label=1, average='binary', sample_weight=None, zero_division='warn')
print("fScore =", fScore)
print("Precision =", precision)
print("Recall =", recall)

#metrics.accuracy_score(y_true,y_pred)
#metrics.multilabel_confusion_matrix(y_true,y_pred) 

y_test = test_df["Class"].values
Example No. 4
    loaded_model = pickle.load(open('svm_color_classifier_poly.pkl', 'rb'))
    result = loaded_model.score(X_test, y_test)
    print('X_test index 0 is {}'.format(X_test[0]))
    print('result is {}'.format(result))

    # pred = loaded_model.predict(np.array([80945, 115532, 228628, 284049, 246331, 234232, 193999, 149803, 176310]).reshape(1, -1))
    #     # ([3, 0, 0, 0, 1, 0, 0, 2, 0])
    # print('pred is {}'.format(pred))

    # Plot non-normalized confusion matrix
    titles_options = [("Confusion matrix, without normalization", None),
                      ("Normalized confusion matrix", 'true')]
    for title, normalize in titles_options:
        disp = plot_confusion_matrix(loaded_model,
                                     X_test,
                                     y_test,
                                     display_labels=labels,
                                     cmap=plt.cm.Blues,
                                     normalize=normalize)
        disp.ax_.set_title(title)

        print(title)
        print(disp.confusion_matrix)

    plt.show()

# svm_color_classifier_poly.pkl Running to save the best model for bin = 24 score=62.5%
# svm_color_classifier_sigmoid.pkl Running to save the best model for bin = 7 score = 40%
# svm_color_classifier_rbf.pkl Running to save the best model for bin = 24 score = 50%
# svm_color_classifier_poly_gamma01.pkl[{'bins': 14, 'score': 0.55}] Running to save the best model for bin = 14 g=0.1
# C=100, gamma = 1, poly score = 50% Running to save the best model for bin = 9
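# A minimal sketch (an assumption, not part of the original script) of how a
# model such as svm_color_classifier_poly.pkl could have been trained and
# pickled; the SVC settings and the X_train/y_train names are illustrative only:
import pickle
from sklearn.svm import SVC

clf = SVC(kernel='poly', gamma=0.1)
clf.fit(X_train, y_train)
with open('svm_color_classifier_poly.pkl', 'wb') as f:
    pickle.dump(clf, f)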
Example No. 5
# followed from https://levelup.gitconnected.com/scikit-learn-python-6-useful-tricks-for-data-scientists-1a0a502a6aa3

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

X, y = make_classification(n_samples=1000, n_features=4, n_classes=2, random_state=123)
y.shape
X.shape
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
clf = LogisticRegression()
clf.fit(X_train, y_train)
confmat = plot_confusion_matrix(clf, X_test, y_test, cmap="Blues")
plt.show()

# True positive: predicted 1 (positive), correct (true)
# True negative: predicted 0 (negative), correct (true)
# False positive: predicted 1 (positive), wrong (false); actually negative
# False negative: predicted 0 (negative), wrong (false); actually positive => critical in medical settings
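# A minimal sketch relating the four cells above to sklearn: for binary labels,
# confusion_matrix returns [[TN, FP], [FN, TP]], so ravel() unpacks in that order
# (clf, X_test, y_test come from the example above):
from sklearn.metrics import confusion_matrix

y_pred = clf.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f"TN={tn}  FP={fp}  FN={fn}  TP={tp}")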
Example No. 6
def test_error_on_invalid_option(pyplot, fitted_clf, data):
    X, y = data
    msg = r"normalize must be one of \{'true', 'pred', 'all', " r"None\}"

    with pytest.raises(ValueError, match=msg):
        plot_confusion_matrix(fitted_clf, X, y, normalize="invalid")
Example No. 7
def prediction():

    # Manage the user connection
    if 'user' in session:

        user = session['user']
        # idToken expires after 1 hour, so we refresh it to avoid a stale token.
        user = auth.refresh(user['refreshToken'])
        session['user'] = user

        try:

            filename = session['filename']
            # Dictionary of columns for the form select
            cols = df.columns
            df_col_dic = [{'name': col} for col in cols]

            y_predict = best_model.predict(X_test)

            if request.method == 'POST':

                if request.form['pred_btn'] == 'conf_matrix':

                    # Confusion Matrix Plot
                    cm_plot = plot_confusion_matrix(best_model,
                                                    X_test,
                                                    y_test,
                                                    display_labels=y,
                                                    cmap=plt.cm.Blues)

                    # Save as an image
                    cm_buff = io.BytesIO()
                    plt.savefig(cm_buff, format='png')
                    cm_buff.seek(0)
                    cm_buffer = cm_buff.getvalue()
                    cm_encoded = base64.b64encode(cm_buffer)
                    cm = cm_encoded.decode('utf-8')

                    return render_template('prediction.html',
                                           df_name=filename,
                                           best_model=best_model_name,
                                           cm_plot=cm)

                elif request.form['pred_btn'] == 'pred_table':

                    # We display a prediction table to compare results and predictions
                    df_predictions = pd.DataFrame({
                        "target": y_test,
                        "prediction": y_predict
                    })

                    return render_template(
                        'prediction.html',
                        df_name=filename,
                        best_model=best_model_name,
                        df_prediction=[df_predictions.to_html(classes='data')])

            return render_template('prediction.html',
                                   df_name=filename,
                                   best_model=best_model_name)
        except Exception:

            flash(
                'There is no dataframe uploaded. Please visit the DATASET page first',
                'warning')
            return render_template('prediction.html')

        return render_template('prediction.html')

    return redirect(url_for('login'))
Example No. 8
def generate_models(X_train, X_test, y_train, y_test, show_graphs=True):
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC

    ###########################
    # Logistic Regression
    ###########################

    reg = LogisticRegression(penalty='none', tol=0.5, random_state=1)

    reg.fit(X_train, y_train)
    reg_pred = reg.predict(X_test)

    if show_graphs:
        # print the initial results
        print("Logistic Regression")
        print("The accuracy of the model on the test set is: %4.2f " %
              accuracy_score(y_test, reg_pred))
        print("The Kappa of your model is: ",
              round(cohen_kappa_score(y_test, reg_pred), 3))

        # plot confusion matrix
        print(confusion_matrix(y_test, reg_pred))
        plot_confusion_matrix(reg, X_test, y_test)
        plt.show()
        # print classification report
        print(classification_report(y_test, reg_pred))
    # Save model
    with open('lin_reg.pkl', 'wb') as file:
        pickle.dump(reg, file)

    ###########################
    #Support Vector Classifier
    ###########################

    svc = SVC(random_state=1, probability=True)
    svc.fit(X_train, y_train)
    svc_pred = svc.predict(X_test)

    if show_graphs:

        # print the initial results
        print("Support Vector Classifier")
        print("The accuracy of the model on the test set is: %4.2f " %
              accuracy_score(y_test, svc_pred))
        print("The Kappa of your model is: ",
              round(cohen_kappa_score(y_test, svc_pred), 3))
        # plot confusion matrix
        print(confusion_matrix(y_test, svc_pred))
        plot_confusion_matrix(svc, X_test, y_test)
        plt.show()
        # print classification report
        print(classification_report(y_test, svc_pred))
    # Save model
    with open('svc.pkl', 'wb') as file:
        pickle.dump(svc, file)

    ###########################
    #Random Forest
    ###########################
    RanFor = RandomForestClassifier(max_depth=25,
                                    n_estimators=1200,
                                    min_samples_split=2,
                                    min_samples_leaf=1)

    RanFor.fit(X_train, y_train)
    RanFor_pred = RanFor.predict(X_test)

    if show_graphs:
        # print the initial results
        print("Random Forest")
        print("The accuracy of the model on the test set is: %4.2f " %
              accuracy_score(y_test, RanFor_pred))
        print("The Kappa of your model is: ",
              round(cohen_kappa_score(y_test, RanFor_pred), 3))

        # plot confusion matrix
        print(confusion_matrix(y_test, RanFor_pred))
        plot_confusion_matrix(RanFor, X_test, y_test)
        plt.show()

        # print classification report
        print(classification_report(y_test, RanFor_pred))

    # Save model
    with open('forest.pkl', 'wb') as file:
        pickle.dump(RanFor, file)

    return reg_pred, svc_pred, RanFor_pred
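# Hypothetical usage sketch, assuming X_train, X_test, y_train, y_test were
# already prepared (e.g. with train_test_split):
reg_pred, svc_pred, ranfor_pred = generate_models(
    X_train, X_test, y_train, y_test, show_graphs=False)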
Example No. 9
# Build a numeric pipeline
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler())

# Build a categorical pipeline
categorical_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))

# Build a column transformer
col_transformer = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features))

# Build a main pipeline
lr_pipe = make_pipeline(
    col_transformer,
    LogisticRegression())

# Fit your pipeline on the training set
lr_pipe.fit(X_train, y_train)

# Plot your confusion matrix on your test set 
plot_confusion_matrix(lr_pipe, X_test, y_test,
                      cmap="PuRd");
Example No. 10
        baseline_pipeline = make_pipeline(
            StandardScaler(), DummyClassifier(strategy="most_frequent"))
        baseline_pipeline.fit(x_train, np.array(y_train).ravel())
        fpr, tpr, _ = roc_curve(y_test,
                                baseline_pipeline.predict_proba(x_test)[:, 1])
        roc_auc = auc(fpr, tpr)
        pyplot.plot(fpr,
                    tpr,
                    color="red",
                    label='Baseline AUC = %0.8f' % roc_auc)

        pyplot.legend(loc='lower right')
        pyplot.show()

        best_pipeline = logistic_pipeline  # make confusion matrix for logistic regression model
        plot_confusion_matrix(best_pipeline, test_x_input_features,
                              test_y_output_data)
        pyplot.title("Logistic Regression")
        pyplot.show()

        baseline_pipeline.fit(
            x_input_features,
            y_output_data)  # make confusion matrix for most_frequent model
        plot_confusion_matrix(baseline_pipeline, test_x_input_features,
                              test_y_output_data)
        pyplot.title("Most Frequent Baseline")
        pyplot.show()

        baseline_accuracy = accuracy_score(
            y_pred=baseline_pipeline.predict(test_x_input_features),
            y_true=test_y_output_data)
        print(f"Baseline Accuracy: {baseline_accuracy}")
Example No. 11
# In[20]:

# KNN classifier with a for loop over n in [1, 20] to find the best accuracy
for n in range(1, 21):
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    print('KNeighborsClassifier: n = {} , Accuracy is: {}'.format(
        n, knn.score(X_test, y_test)))

# In[21]:

#plot_confusion_matrix of knn
plot_confusion_matrix(knn,
                      X_test,
                      y_test,
                      display_labels=['Edible', 'Poisonous'],
                      cmap="summer",
                      normalize=None)
plt.title('Confusion Matrix KNN')
plt.show()

# In[22]:

# Print the accuracy of KNN on the test set
print('Confusion matrix Accuracy is: {}'.format(
    metrics.accuracy_score(y_test, y_pred)))

# In[23]:

#classification_report of KNN
KNN_REPORT = classification_report(y_test, knn.predict(X_test))
print(KNN_REPORT)
    print("\nAccuracy:", np.mean(scores))

plt.figure()
plt.bar(range(len(clf.feature_importances_)), clf.feature_importances_)
plt.title('Feature importance, sums to 1')
labels = [
    'Median right pupil diameter', 'RCPD right', 'Median left pupil diameter',
    'RCPD left', 'median Light Intensity', 'RCLI', 'PERCLOS',
    'Median gaze x coordinate', 'Median gaze y coordinate'
]
plt.xticks(range(len(RFmatrix_X.columns)), labels)
plt.xticks(fontsize=8, rotation=90)

if Confusion:
    # Plot non-normalized confusion matrix
    plt.figure()
    titles_options = [("Confusion matrix, without normalization", None),
                      ("Normalized confusion matrix", 'true')]
    for title, normalize in titles_options:
        disp = plot_confusion_matrix(
            clf,
            X_test,
            y_test,
            display_labels=['Baseline', '1-back', '2-back'],
            cmap=plt.cm.Blues,
            normalize=normalize)
        plt.xticks(rotation=90)
        disp.ax_.set_title(title)
        plt.savefig('graph.png', dpi=300, bbox_inches='tight')
    plt.show()
Example No. 13
print(f"Labels size: {len(labels)}")

#Split data into training and testing
train_x, test_x, train_y, test_y = train_test_split(data,
                                                    labels,
                                                    test_size=0.2,
                                                    shuffle=True)

#Train the model
nb = Classifier.fit(train_x, train_y)

#Export the vectorizer and the model for use in other programs.
export(vec, nb)

#Display a % prediction for each of the first 10 test samples.
for index in range(10):
    res = nb.predict_proba(test_x[index])
    res = int(res[0][1] * 100)
    print(f"{res}%")

#Display the accuracy of the model
print(f"Accuracy is: {nb.score(test_x,test_y)*100}%")

#Predict the test set again so it can be used in the confusion matrix.

#%%
pred_y = nb.predict(test_x)

plot_confusion_matrix(nb, test_x, test_y)

plt.show()
Example No. 14
def test_model(final_search, X_test_df, out_folder, file_h, scaler, threshold,
               display_labels):
    fig, ax = plt.subplots()

    X_test = X_test_df.iloc[:, 5:].values
    y_test = X_test_df.true_label.values
    #X_test_scaled = scaler.fit_transform(X_test)
    y_pred = final_search.predict_proba(X_test)[:, 1]
    y_pred_default = final_search.predict(X_test)
    y_pred_decision = final_search.decision_function(X_test)

    y_pred_fixed = rescore(y_pred, threshold)
    #y_pred_fixed=final_search.predict(X_test)
    y_pred_log = final_search.predict_log_proba(X_test)[:, 1]

    average_precision = average_precision_score(y_test, y_pred)
    print('Average precision-recall score: {0:0.2f}'.format(average_precision))
    file_h = write_line(
        file_h,
        'Average precision-recall score: {0:0.2f}'.format(average_precision))

    ###Plotting results
    log_fpr, log_tpr, log_threshold = roc_curve(y_test, y_pred)
    #    sample_names = [label_dict[val] for val in y_test_labels]
    #sample_names = [label_dict[val] for val in X_test_df.index]
    #print(sample_names)

    pred_result = X_test_df.iloc[:, :5]

    this_result_default = pd.Series(y_pred_default, index=pred_result.index)
    this_result_default.name = "predicted_label"

    this_result_dec = pd.Series(y_pred_decision, index=pred_result.index)
    this_result_dec.name = "decision_function"

    this_result = pd.Series(y_pred, index=pred_result.index)
    this_result.name = "prediction_proba"

    this_result_fixed = pd.Series(y_pred_fixed, index=pred_result.index)
    this_result_fixed.name = "predicted_label_rescored"

    this_result_log = pd.Series(y_pred_log, index=pred_result.index)
    this_result_log.name = "prediction_log_proba"

    #zipped = zip(y_test_labels,y_test,y_pred,sample_names)
    pd.concat([
        pred_result, this_result, this_result_log, this_result_default,
        this_result_fixed, this_result_dec
    ],
              axis=1).to_csv(os.path.join(out_folder, "pred_result.csv"))

    file_h = write_line(
        file_h, '-----------------------------------------------------')
    file_h = write_line(file_h, 'Prediction Result...')

    #    file_h =write_line(file_h, '\tSample\tTest_Label\tPredicted_Label\tSample_names\t')

    #for val in zipped:
    #    file_h = write_line(file_h, "\t%s\t%s\t%0.3f\t%s\t" %(val[0],val[1],val[2],val[3]))
    log_roc_auc = auc(log_fpr, log_tpr)
    file_h = write_line(
        file_h, '-----------------------------------------------------')
    file_h = write_line(file_h, 'Test Results.......')
    file_h = write_line(file_h, 'Log_Thres\tLog_TPR\tLog_FPR\tLog_TPR-Log_FPR')
    for ii in range(len(log_tpr)):
        print(log_threshold[ii], log_tpr[ii], log_fpr[ii],
              log_tpr[ii] - log_fpr[ii])
        file_h = write_line(
            file_h, "%0.5f\t%0.5f\t%0.5f\t%0.5f" %
            (log_threshold[ii], log_tpr[ii], log_fpr[ii],
             log_tpr[ii] - log_fpr[ii]))

    plt.plot(log_fpr,
             log_tpr,
             color='orangered',
             linestyle='--',
             label='ROC curve (area = %0.3f)' % log_roc_auc,
             lw=3)
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k')

    plt.xlim([-0.03, 1.03])
    plt.ylim([-0.03, 1.03])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Performance on validation set (%s)' % y_test.shape[0])
    plt.legend(loc="lower right")
    plt.savefig(os.path.join(out_folder, "roc_curve.pdf"))

    #label_dict = expdesign.set_index("condition_rep")["label"].to_dict()
    plt.close()

    disp = plot_precision_recall_curve(final_search, X_test, y_test)
    disp.ax_.set_title('2-class Precision-Recall curve: '
                       'AP={0:0.2f}'.format(average_precision))
    plt.savefig(os.path.join(out_folder, "precision_recall_curve.pdf"))
    plt.close()

    cm = confusion_matrix(y_test, y_pred_fixed)
    tn, fp, fn, tp = cm.ravel()
    print("tn\tfp\tfn\ttp")
    print(tn, "\t", fp, "\t", fn, "\t", tp)

    file_h = write_line(file_h, "---------Confusion_matrix-------")

    file_h = write_line(file_h, "tn\tfp\tfn\ttp")

    file_h = write_line(
        file_h,
        str(tn) + "\t" + str(fp) + "\t" + str(fn) + "\t" + str(tp))
    file_h = write_line(file_h, "----------------")
    #    display_labels=["Healthy","Tumor"]
    #    display_labels=["No Relapse","Relapse"]

    disp = plot_confusion_matrix(final_search,
                                 X_test,
                                 y_test,
                                 display_labels=display_labels,
                                 cmap=plt.cm.Blues)

    disp.ax_.set_title("Confusion Matrix")
    plt.savefig(os.path.join(out_folder, "default_confusion_matrix.pdf"))
    plt.close()

    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                  display_labels=display_labels)
    disp.plot(cmap=plt.cm.Reds)

    disp.ax_.set_title("Confusion Matrix")
    plt.savefig(os.path.join(out_folder, "rescored_confusion_matrix.pdf"))
    plt.close()
    print(
        classification_report(y_test,
                              y_pred_fixed,
                              target_names=display_labels))
    file_h = write_line(file_h, "----Precision Recall F1-score Support------")

    file_h = write_line(
        file_h,
        classification_report(y_test,
                              y_pred_fixed,
                              target_names=display_labels))
    file_h = write_line(file_h, "----------------")

    return file_h
Example No. 15
# # Gradient Boosting algorithms

# # 1. XGBoost

# In[29]:


from xgboost import XGBClassifier

xgb = XGBClassifier(n_jobs=-1, random_state=42, n_estimators=120, max_depth=5, min_samples_leaf=5)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
print(classification_report(y_test, y_pred))
from sklearn.metrics import plot_confusion_matrix, accuracy_score
plot_confusion_matrix(xgb, X_test, y_test, cmap=plt.cm.Blues)
print(confusion_matrix(y_test, y_pred))


# In[36]:



print("Train accuracy",xgb.score(X_train, y_train))
print("Test accuracy",xgb.score(X_test, y_test))


# # 2. LightGBM

# In[ ]:
Example No. 16
    #cm = np.around(cm.astype('float') / cm.sum(axis=1)[:, np.newaxis], decimals=2)

    # Use white text if squares are dark; otherwise black.
    threshold = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        color = "white" if cm[i, j] > threshold else "black"
        plt.text(j, i, cm[i, j], horizontalalignment="center", color=color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
    return figure


plot_confusion_matrix(cm, CLASS_NAMES)


def plot_image(i, predictions_array, true_label, img):
    true_label, img = true_label[i], img[i]
    plt.grid(False)
    plt.xticks([])
    plt.yticks([])

    plt.imshow(img, cmap=plt.cm.binary)

    predicted_label = np.argmax(predictions_array)
    if predicted_label == true_label:
        color = 'blue'
    else:
        color = 'red'
Example No. 17
def test_plot_confusion_matrix_deprecation_warning(pyplot, fitted_clf, data):
    with pytest.warns(FutureWarning):
        plot_confusion_matrix(fitted_clf, *data)
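# Note: plot_confusion_matrix was deprecated in scikit-learn 1.0 (hence the
# FutureWarning asserted above) and removed in 1.2. A minimal sketch of the
# replacement API, mirroring the test's arguments:
from sklearn.metrics import ConfusionMatrixDisplay

disp = ConfusionMatrixDisplay.from_estimator(fitted_clf, *data)
disp.ax_.set_title("Confusion matrix")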
Example No. 18
classifier = LinearSVC()
classifier.fit(train_text, train_df["Sentiment"])

test_df = pd.read_csv("reviews_test.csv", header=None, skiprows=[0],
                      names=["text", "Sentiment"], dtype=type_dict)

# test_data = clean_data(test_df.data)
print(test_df.head())
test_df.dropna(inplace=True)

# eval data
eval_df = pd.read_csv("reviews_eval.csv", header=None, skiprows=[0],
                      names=["text", "Sentiment"], dtype=type_dict)
# eval_df.data = clean_data(eval_df.data)
print(eval_df.head())
eval_df.dropna(inplace=True)

print("transforming test")
test_text = vectorizer.transform(test_df["text"].values)

print("transforming eval")
eval_text = vectorizer.transform(eval_df["text"].values)
print("scoring test")
print(classifier.score(test_text, test_df["Sentiment"]))
print("scoring eval")
print(classifier.score(eval_text, eval_df["Sentiment"]))

plot_confusion_matrix(classifier, eval_text, eval_df["Sentiment"], normalize="true")
plot_roc_curve(classifier, eval_text, eval_df["Sentiment"])
plt.show()
Example No. 19
train_acc = svm_clf.score(X_train, y_train)
test_acc = svm_clf.score(X_test, y_test)
train_uar = recall_score(y_train, pred_train, average='macro')
test_uar = recall_score(y_test, pred_dev, average='macro')

print(f"train_acc = {train_acc:.2f}, test_acc = {test_acc:.2f}")
print(f"train_uar = {train_uar:.2f}, test_uar = {test_uar:.3f}")

"""
train_acc = 0.91, test_acc = 0.51
train_uar = 0.91, test_uar = 0.51
"""

plt.figure()
disp = plot_confusion_matrix(svm_clf, X_test, y_test,
                             display_labels=encoder.classes_,
                             cmap=plt.cm.Blues)
disp.ax_.set_title('Confusion matrix')
plt.savefig('confusion_mat_test_SVM.png', dpi=300)

# %% Fit MLP with best hyperparameters
device = torch.device("cuda:0")

X_train, y_train = torch.Tensor(X_train), torch.from_numpy(y_train)
X_test, y_test = torch.Tensor(X_test), torch.from_numpy(y_test)

model, train_uar, test_uar = train_model(X_train, y_train,
                                 X_test, y_test,
                                 l2_lambda=0.001,
                                 lr=0.001)
Example No. 20
x_test = train_data[38000:, 1:]
y_test_digit = train_data[38000:, 0]

rfc.fit(x_train, y_train_digit)

predicted = rfc.predict(x_test)

print("Accuracy: ", accuracy_score(y_test_digit, predicted))
''' Error matrices '''
np.set_printoptions(precision=2)
titles_options = [("Confusion matrix, without normalization", None),
                  ("Normalized confusion matrix", 'true')]
for title, normalize in titles_options:
    disp = plot_confusion_matrix(rfc,
                                 x_test,
                                 y_test_digit,
                                 cmap=plt.cm.Blues,
                                 normalize=normalize)
    disp.ax_.set_title(title)

plt.show()
''' Kaggle results '''
test_data = pd.read_csv("digit-recognizer/test.csv").to_numpy()
x_test = test_data[0:, 0:]

x_train = train_data[0:, 1:]
y_train_digit = train_data[0:, 0]

start = time.time()
rfc.fit(x_train, y_train_digit)
predicted = rfc.predict(x_test)
Example No. 21
# https://docs.scipy.org/doc/numpy/reference/generated/numpy.reshape.html
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))

# Split our data into train and test: the first half will be the training data, the second the test data
X_train, X_test, y_train, y_test = train_test_split(data, digits.target, test_size=0.5, shuffle=False)

classifier = svm.SVC()
classifier.fit(X_train, y_train)

# Predict the values of the second half (the test data)
predicted = classifier.predict(X_test)

# Let's display the tested images along with our classifier's predictions
fig2, axes2 = plt.subplots(1, 8)
images_and_predictions = list(zip(digits.images[n_samples // 2:], predicted))

# Plot the first 8 images as predicted by the classifier.
for ax, (image, prediction) in zip(axes2, images_and_predictions[:8]):
    ax.set_axis_off()
    ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    ax.set_title('Prediction: %i' % prediction)

# Print our classifier's results
print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(y_test, predicted)))
disp = metrics.plot_confusion_matrix(classifier, X_test, y_test)
disp.figure_.suptitle("Confusion Matrix")
print("Confusion matrix:\n%s" % disp.confusion_matrix)

plt.show()
Example No. 22
# Fit the logistic regression model with the training data
logreg.fit(x_train, y_train)

# Testing
# predicted probabilities
y_predicted = logreg.predict_proba(x_test)

# predicted class labels
k_predicted = logreg.predict(x_test)

# Display 10 selected images from the test set, as gray-scale images, each with a different class label.
import matplotlib.pyplot as plt
for i in range(0, 10):
    for j in range(len(y_test)):
        if i == y_test[j]:
            plt.imshow(x_testo[j, :, :], cmap='gray', vmin=0, vmax=255)
            plt.show()
            break

# Give the recognition accuracy rate for the whole test set, and show the confusion matrix.
numCorrect = 0
lenset = len(y_test)
for i in range(lenset):
    if y_test[i] == k_predicted[i]:
        numCorrect += 1
print("Accuracy rate: ", (numCorrect/lenset))

from sklearn.metrics import plot_confusion_matrix
plot_confusion_matrix(logreg, x_test, y_test, values_format='.5g')
plt.show()
Example No. 23
X_train, y_train = vectorize_data(train_data, train_labels, channels_to_select)

# Model estimation
lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
model = lda.fit(X_train[:,:2], y_train)

# Predict the class assignment from the training data and the model
y_pred = model.predict(X_train[:,:2])

# Visualize the correctly and incorrectly classified epochs and the linear
# decision boundary (function adapted from https://scikit-learn.org/stable/auto_examples/classification/plot_lda_qda.html#sphx-glr-auto-examples-classification-plot-lda-qda-py)
plot_model_fit(model, X_train[:,:2], y_train, y_pred)
plt.title('LDA training data')

# Show the confusion matrix
conf_mat = plot_confusion_matrix(model, X_train[:,:2], y_train, cmap=plt.cm.Blues) #, normalize = 'all')
plt.title('Performance on training data')

# Compute scores
def print_scores(y_true, y_pred):
    TP = sum(np.logical_and(y_true == 1, y_pred == 1))  # True positives
    FP = sum(np.logical_and(y_true == 0, y_pred == 1))  # False positives
    TN = sum(np.logical_and(y_true == 0, y_pred == 0))  # True negatives
    FN = sum(np.logical_and(y_true == 1, y_pred == 0))  # False negatives
    sens = TP / (TP + FN)  # Sensitivity (recall)
    spec = TN / (TN + FP)  # Specificity
    prec = TP / (TP + FP)  # Precision
    acc = (TP + TN) / (TP + TN + FP + FN)  # Accuracy
    print('Sensitivity:', sens)
    print('Specificity:', spec)
    print('Precision:', prec)
    print('Accuracy:', acc)
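# Hypothetical call of the completed helper on the training predictions above
# (assumes y_train and y_pred hold 0/1 labels; np.asarray makes the
# elementwise comparisons safe if they are plain lists):
print_scores(np.asarray(y_train), np.asarray(y_pred))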
Example No. 24
DecisionTreeClassifier(criterion='entropy')

print(dtree.get_n_leaves())
print(dtree.get_depth())

#%% 

#Evaluate Model Performance

pred_labels = dtree.predict(test.loc[:, pred_vars])
pred_labels[0:4]

#Confusion Matrix

metrics.plot_confusion_matrix(dtree, test.loc[:, pred_vars], test['Class'])

#Classification report

print(metrics.classification_report(test['Class'], pred_labels, digits=5))

#Probabilistic Evaluation

pred_probs = dtree.predict_proba(test.loc[:, pred_vars])
pred_probs[0:5, :]

#%%

#Area Under The Curve

metrics.roc_auc_score(test['Class'], pred_probs[:,1])
Example No. 25
plt.xlabel('False Positive Rate')
#plt.savefig('AUC-ROC.png')

# Precision-Recall Curve

average_precision = average_precision_score(y_test_, y_score)

print('Average precision-recall score: {0:0.2f}'.format(average_precision))
disp = plot_precision_recall_curve(ClassifierSVM, X_test, y_test)
disp.ax_.set_title('2-class Precision-Recall curve: '
                   'AP={0:0.2f}'.format(average_precision))
plt.savefig('../DeNovo/ClassifiersFiles/AUC-PR.png')

matrix = plot_confusion_matrix(ClassifierSVM,
                               X_test,
                               y_test,
                               cmap=plt.cm.Blues,
                               normalize='true')
plt.title('Confusion matrix for RBF SVM')
#plt.savefig('CM_RBF_SVM.png')

## Path to the trained classifier for latter use.
print(
    "\n\n#############################################################################################################"
)
print('\nPaths to models:\n\n')
print(f"Classifier = joblib.load('{filename_svm}')")
print(f"Vectorizer = joblib.load('{filename_vec}')")
print(f"Variance Treshold = joblib.load('{filename_variance}')")
print(f"Percentile Best Features= joblib.load('{filename_percentile}')")
print('\n\n\n')
Example No. 26
def confusion(modelo, nombre):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.title(nombre + " confusion matrix")
    plot_confusion_matrix(modelo, X_test, y_test, normalize='true', ax=ax, cmap="Reds")
    plt.savefig("/home/bleon/Documents/TESIS_FILES/Codigos/DATOS/ALL_STARS_ALL/conf_matrix_All_"+nombre+"_2.jpg") 



Example No. 27

network, acc = train_and_evaluate(input_data_train, input_data_test, labels_train, labels_test)

"""## Wat is een confusion matrix? (zoek op)

## Hoe ziet een confusion matrix eruit wanneer de predicties 100% accuraat zijn?

We gebruiken de `plot_confusion_matrix` functie uit [sklearn](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.plot_confusion_matrix.html) om de confusion matrix te plotten
"""

from sklearn.metrics import plot_confusion_matrix
plot_confusion_matrix(network, input_data_test, labels_test,
                      display_labels=['wit', 'rood'],
                      cmap=plt.cm.Blues, values_format='.0f')

"""## Class imbalance

Neurale netwerken werken het best wanneer de input data 'gebalanceerd' is. Dit betekend dat er per klasse evenveel data beschikbaar is. Bijvoorbeeld 500 witte en 500 rode wijnen. Je gaat nu experimenteren wat er gebeurt als we de data ongebalanceerd aanleveren.

Nu koppelen we de functies die je hierboven hebt gemaakt aan elkaar.

De enige input is nu nog:
 * Het totaal aantal rode wijnen
 * De ratio rode tot witte wijnen (0.8 zou dan betekenen dat 80% van de wijnen rood zijn)

Deze functie maakt dan een dataset aan zoals hierboven, daarna wordt het genormaliseerd, gesplitst, getrained en geëvalueerd eveneens zoals je hierboven hebt gedaan.
Ook wordt de accuracy en confusion matrix geplot
"""
Example No. 28

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_confusion_matrix, confusion_matrix, accuracy_score

dataset = pd.read_csv("data.csv")
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
y = y.reshape(-1, 1)

x_scaler = StandardScaler()
X = x_scaler.fit_transform(X)

x_train, x_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

classifier = LogisticRegression(random_state=0)
classifier.fit(x_train, y_train.flatten())
print(classifier.predict(x_scaler.transform([[30, 87000]])))

y_hat = classifier.predict(x_test)

tn, fp, fn, tp = confusion_matrix(y_test, y_hat).ravel()
print(f"tn: {tn}, fp: {fp}, fn: {fn}, tp: {tp}")

accuracy = accuracy_score(y_test, y_hat)
print("accuracy = ", accuracy)

plot_confusion_matrix(classifier, x_test, y_test)
plt.show()
Example No. 29
x.head()
y = df_no_missing['hd'].copy()
x['cp'].unique()
pd.get_dummies(x, columns=['cp']).head()
x_encoded = pd.get_dummies(x, columns=['cp', 'restecg', 'slope', 'thal'])
x_encoded.head()
y.unique()
y[y > 0] = 1
y.unique()
x_train, x_test, y_train, y_test = train_test_split(x_encoded, y, random_state=42)
x_test_scaled = scale(x_test)
x_train_scaled = scale(x_train)
clf_svm = SVC(random_state=42)
clf_svm.fit(x_train_scaled, y_train)
plot_confusion_matrix(clf_svm,
                      x_test_scaled,
                      y_test,
                      display_labels=["No", "Has HD"])
#plt.show()
param_grid = [
    {
        'C': [1, 10, 100, 1000],
        'gamma': [0.001, 0.0001],
        'kernel': ['rbf']
    },
]
optimal_params = GridSearchCV(SVC(), param_grid, cv=5, verbose=0)
optimal_params.fit(x_train_scaled, y_train)
#plt.show()
optimal_params.best_params_
clf_svm = SVC(random_state=42, C=10, gamma=0.001)
clf_svm.fit(x_train_scaled, y_train)
Example No. 30
    def process_classifier(self, model,
                           df: pd.DataFrame,
                           features: list,
                           label: str,
                           pred_col: str,
                           verbose: bool = False,
                           conf_matrix: bool = False,
                           test_recs: float = 0.25,
                           split_mode: str = 'pctg',
                           metric_round_dig: int = 2,
                           **params):
        """
        Run a full cycle of split-train-predict for any classifier model that respects sklearn interface.


        :param model: Regressor class
        :param ds: Data set with data and metadata
        :param verbose: True for printing actions to console
        :param conf_matrix: True to plot confusion matrix
        :param train_recs: Training records proportion: 1 - test_rec if 0:
        :param test_recs: Testing records proportion: 1 - train_recs if 0:
        :param split_mode(not implemented): * If 'pctg': test_recs and train_recs are considered as percentage.
                            * If 'records': test_recs and train recs are considered as number of records, taking the
                            last test_recs for prediction and the last train_recs before the last test_recs for training
        :param metric_round_dig: number of digits to rounds metrics to
        :param params: ML model hyper-parameters

        :return:
        """

        # VALIDATE AND PROCESS PARAMETERS

        assert (test_recs > 0) and (test_recs < 1), 'test_recs must be in the open interval (0, 1)'

        # PREPARE DATA
        _label_col = label
        _df_train, _df_pred = DataSetSplit.sep_predict_percentage(df, test_recs)
        _X_t = _df_train[features]
        _y_t = _df_train[label]

        # TRAIN
        _clf = model(**params)
        _clf.fit(_X_t, _y_t)

        # calculate train metrics
        _train_pred_col = "y_hat_train"
        _df_train[_train_pred_col] = _clf.predict(_X_t)

        _acc_train = accuracy_score(_df_train[label], _df_train[_train_pred_col]).round(metric_round_dig)
        _prec_train = precision_score(_df_train[label], _df_train[_train_pred_col]).round(metric_round_dig)
        _rec_train = recall_score(_df_train[label], _df_train[_train_pred_col]).round(metric_round_dig)

        if verbose:
            print('')
            print('Train accuracy ', _acc_train)
            print('Train precision ', _prec_train)
            print('Train recall ', _rec_train)

        # PREDICT

        _X_p = _df_pred[features]
        _y_true = _df_pred[label]

        _df_pred[pred_col] = _clf.predict(_X_p)

        _acc = accuracy_score(_df_pred[label], _df_pred[pred_col]).round(metric_round_dig)
        _prec = precision_score(_df_pred[label], _df_pred[pred_col]).round(metric_round_dig)
        _rec = recall_score(_df_pred[label], _df_pred[pred_col]).round(metric_round_dig)

        # if verbose:
        #     print('')
        #     print('Prediction accuracy ', _acc)
        #     print('Prediction precision ', _prec)
        #     print('Prediction recall ', _rec)

        # if conf_matrix:
        #     _cfm = confusion_matrix(np.array(_df_pred[label]), np.array(_df_pred[pred_col]))
        #     _cfm = (_cfm / len(_df_pred)).round(3)
        #     ax = sns.heatmap(_cfm, annot=True, fmt="0.2f")
        #     plt.show()

        if conf_matrix:

            plot_confusion_matrix(_clf, _X_p, _y_true, normalize='pred', cmap='plasma')
            plt.grid(False)
            plt.show()


        _metrics = dict()
        _metrics['acc_train'] = _acc_train
        _metrics['prec_train'] = _prec_train
        _metrics['rec_train'] = _rec_train
        _metrics['acc_pred'] = _acc
        _metrics['prec_pred'] = _prec
        _metrics['rec_pred'] = _rec

        return _df_pred, _metrics
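    # Hypothetical usage sketch; the enclosing class is not shown above, so the
    # "Processor" name, the DataFrame and the column names are illustrative:
    #
    #   from sklearn.ensemble import RandomForestClassifier
    #   proc = Processor()
    #   df_pred, metrics = proc.process_classifier(
    #       RandomForestClassifier, df, features=['f1', 'f2'], label='target',
    #       pred_col='y_hat', verbose=True, conf_matrix=True, test_recs=0.25,
    #       n_estimators=200)  # extra **params are forwarded to the model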