示例#1
0
    def main(self):
        """Run the SAT-score regression pipeline end to end.

        Loads the GPA/SAT data, imputes missing values, fits the model,
        plots its behaviour on both splits, round-trips the fitted model
        through pickle, and finally draws a learning curve.
        """
        print(__doc__)

        # Pipeline driver object.
        obj = sat()

        # Features and target from the CSV file.
        x, y = obj.load_data("gpa.csv")

        # Train/test split.
        x_train, x_test, y_train, y_test = obj.split(x, y)

        # Scaling is currently disabled:
        #x_train,x_test,y_train,y_test = obj.scale(x_train,x_test,y_train,y_test)

        # Impute missing values in every partition.
        x_train, x_test, y_train, y_test = (
            obj.missing_val(part)
            for part in (x_train, x_test, y_train, y_test)
        )

        # Build and fit the model.
        clf = obj.classifier()
        clf.fit(x_train, y_train)

        # Visualise fit quality on the training split ...
        obj.plot(clf, x_train, y_train, "orange", "blue",
                 "sat score (Training set)", "GPA", "SAT SCORE")
        # ... and on the held-out split.
        obj.plot(clf, x_test, y_test, "orange", "blue",
                 "sat score (Testing set)", "GPA", "SAT SCORE")

        # Persist the fitted model, then reload it.
        obj.save_classifier(clf, "sat_score.pkl", "wb")
        clf = obj.load_classifier("sat_score.pkl", "rb")

        # Shuffle the full data set and draw the learning curve.
        x, y = shuffle_arrays_unison(arrays=[x, y], random_seed=5)

        plot_learning_curves(x_train, y_train, x, y, clf)
        plt.show()
示例#2
0
    def plot_ml_training_curves(self, train_x, train_y, test_x, test_y,
                                model_object):
        """Draw accuracy learning curves for *model_object*.

        Renders mlxtend-style learning curves for the given train/test
        arrays using the dark-background theme, then shows the figure.
        """
        curve_data = (train_x, train_y, test_x, test_y, model_object)
        plot_learning_curves(*curve_data,
                             scoring='accuracy',
                             style='dark_background')
        plt.show()
示例#3
0
 def PlotLearningCurve(self, X, sclf, y):
     """Plot learning curves for the stacked classifier *sclf*.

     Splits X/y 70/30 with a fixed seed and renders mlxtend learning
     curves in the ggplot style.
     """
     split = train_test_split(X, y, test_size=0.3, random_state=42)
     X_train, X_test, y_train, y_test = split
     plt.figure()
     plot_learning_curves(X_train, y_train, X_test, y_test, sclf,
                          print_model=False, style='ggplot')
     plt.show()
示例#4
0
def stackingDetection():
    """Cross-validate three base classifiers and plot learning curves
    for a bagged MLP ensemble.

    Relies on module-level ``Xtrain``/``ytrain``/``Xtest``/``ytest``
    arrays. Prints 3-fold CV accuracy for each base model, then shows
    the learning curve of the bagged MLP.
    """
    nb_clf = GaussianNB()
    # NOTE: named svm_clf in the original but it is a random forest.
    rf_clf = RandomForestClassifier(n_estimators=100, max_depth=400, random_state=5)
    mlp_clf = MLPClassifier(hidden_layer_sizes=(500, 500))
    labels = ["NB", "RF", "MLP"]

    # Meta classifier kept for parity with the stacking setup (unused below).
    metaclassifier = RandomForestClassifier(n_estimators=100, max_depth=400, random_state=5)
    clf_list = [nb_clf, rf_clf, mlp_clf]

    # Report 3-fold CV accuracy for every base model.
    # (The original shadowed the label list with its loop variable.)
    for clf, name in zip(clf_list, labels):
        scores = cross_val_score(clf, Xtrain, ytrain, cv=3, scoring='accuracy')
        print(f"Accuracy: {round(scores.mean(), 4)} Std: {round(scores.std(), 4)} Label: {name}")

    # Bagged MLP: 10 estimators, each on 80% of samples and features.
    bagging1 = BaggingClassifier(base_estimator=mlp_clf, n_estimators=10, max_samples=0.8, max_features=0.8)
    plt.figure()
    plot_learning_curves(Xtrain, ytrain, Xtest, ytest, bagging1, print_model=False, style='ggplot')
    plt.show()
示例#5
0
def classify_smile(test_samples):
    """Train an SVM smile detector, report metrics, and write CSV output.

    Flattens the image data, predicts on both splits, prints accuracies
    and the confusion matrix, saves predictions plus a size-weighted
    accuracy figure, merges the per-task CSVs, and finally plots a
    learning curve.
    """
    Xtrain, Xtest = divide_x(test_samples)
    # Flatten each image into a single feature row.
    Xtrain = Xtrain.reshape(Xtrain.shape[0], Xtrain.shape[1] * Xtrain.shape[2])
    Xtest = Xtest.reshape(Xtest.shape[0], Xtest.shape[1] * Xtest.shape[2])
    Ytrain, Ytest = get_labels(3, test_samples)

    # Model predictions on the test split, then the training split.
    Y_pred_te = SVM_smile(Xtrain, Ytrain, Xtest)
    size_te = len(Y_pred_te)
    Y_pred_tr = SVM_smile(Xtrain, Ytrain, Xtrain)
    size_tr = len(Y_pred_tr)

    # Percent accuracies on both splits.
    test_accuracy = accuracy_score(Ytest, Y_pred_te) * 100
    train_accuracy = accuracy_score(Ytrain, Y_pred_tr) * 100
    print("Accuracy obtained on test data for smile detection:")
    print(test_accuracy, '%')
    print("Accuracy obtained on train data for smile detection:")
    print(train_accuracy, '%')
    print("The confusion matrix is:")
    print(confusion_matrix(Ytest, Y_pred_te))

    # Combined prediction vector (train first, then test).
    Y_pred = np.concatenate((Y_pred_tr, Y_pred_te), axis=0)
    size_pred = len(Y_pred)
    np.savetxt("task_2labels.csv", Y_pred, delimiter=',')

    # Size-weighted mean of the two accuracies.
    precision = (test_accuracy * size_te) + (train_accuracy * size_tr)
    print(precision / size_pred)
    percentage = [precision / size_pred]
    np.savetxt("task_1precision.csv", percentage, delimiter=',')

    # NOTE(review): predictions were written to task_2labels.csv but
    # task_1labels.csv is read back here — confirm this is intended.
    a = pd.read_csv("noise_classified.csv")
    b = pd.read_csv("task_1labels.csv")
    c = pd.read_csv("task_1precision.csv")
    # Final CSV is built by concatenating the intermediate ones.
    merged = pd.concat([a, b], axis=1)
    merged.to_csv("task_1.csv", index=False)
    labelled = pd.read_csv("task_1.csv")
    final = pd.concat([labelled, c], axis=1)
    final.to_csv("task_1.csv", index=False)

    # Learning curve for a fresh polynomial SVM.
    clf = svm.SVC(C=3, degree=2, gamma='scale', kernel='poly')
    plot_learning_curves(Xtrain, Ytrain, Xtest, Ytest, clf)
    plt.show()
def test_training_size():
    """Learning-curve error rates match known values on iris (60/40 split)."""
    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.6, random_state=2)

    clf = DecisionTreeClassifier(max_depth=1, random_state=1)
    training_errors, test_errors = plot_learning_curves(
        X_train, y_train, X_test, y_test, clf, suppress_plot=True)

    expected_train = [0.22, 0.22, 0.22, 0.31, 0.31, 0.3, 0.33, 0.32, 0.33, 0.32]
    expected_test = [0.45, 0.45, 0.35, 0.35, 0.45, 0.43, 0.35, 0.35, 0.35, 0.35]

    np.testing.assert_almost_equal(training_errors, expected_train, decimal=2)
    np.testing.assert_almost_equal(test_errors, expected_test, decimal=2)
def test_training_size():
    """Same check as above, but specifying the split via test_size=0.4."""
    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=2)

    stump = DecisionTreeClassifier(max_depth=1, random_state=1)
    train_err, test_err = plot_learning_curves(
        X_train, y_train, X_test, y_test, stump, suppress_plot=True)

    want_train = [0.22, 0.22, 0.22, 0.31, 0.31, 0.3, 0.33, 0.32, 0.33, 0.32]
    want_test = [0.45, 0.45, 0.35, 0.35, 0.45, 0.43, 0.35, 0.35, 0.35, 0.35]

    np.testing.assert_almost_equal(train_err, want_train, decimal=2)
    np.testing.assert_almost_equal(test_err, want_test, decimal=2)
def test_scikit_metrics():
    """With scoring='accuracy' the curves equal one minus the error rates."""
    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=2)

    stump = DecisionTreeClassifier(max_depth=1, random_state=1)
    training_acc, test_acc = plot_learning_curves(
        X_train, y_train, X_test, y_test, stump,
        scoring='accuracy', suppress_plot=True)

    # Known misclassification rates for this split and model.
    err_train = np.array([0.22, 0.22, 0.22, 0.31, 0.31,
                          0.3, 0.33, 0.32, 0.33, 0.32])
    err_test = np.array([0.45, 0.45, 0.35, 0.35, 0.45,
                         0.43, 0.35, 0.35, 0.35, 0.35])
    np.testing.assert_almost_equal(training_acc, 1 - err_train, decimal=2)
    np.testing.assert_almost_equal(test_acc, 1 - err_test, decimal=2)
def test_scikit_metrics():
    """Accuracy curves (train_size=0.6 split) are one minus the error rates."""
    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.6, random_state=2)

    clf = DecisionTreeClassifier(max_depth=1, random_state=1)
    acc_train, acc_test = plot_learning_curves(
        X_train, y_train, X_test, y_test, clf,
        scoring='accuracy', suppress_plot=True)

    # Known misclassification rates for this split and model.
    err_train = np.array([0.22, 0.22, 0.22, 0.31, 0.31,
                          0.3, 0.33, 0.32, 0.33, 0.32])
    err_test = np.array([0.45, 0.45, 0.35, 0.35, 0.45,
                         0.43, 0.35, 0.35, 0.35, 0.35])
    np.testing.assert_almost_equal(acc_train, 1 - err_train, decimal=2)
    np.testing.assert_almost_equal(acc_test, 1 - err_test, decimal=2)
示例#10
0
def test_scikit_metrics():
    """Accuracy learning curves match the expected reference values."""
    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.6, random_state=2)

    stump = DecisionTreeClassifier(max_depth=1, random_state=1)
    train_scores, test_scores = plot_learning_curves(
        X_train, y_train, X_test, y_test, stump,
        suppress_plot=True, scoring='accuracy')

    # Expected accuracies (not error rates) for this split and model.
    want_train = [0.68, 0.67, 0.68, 0.67, 0.7, 0.69, 0.69, 0.78, 0.78, 0.78]
    want_test = [0.65, 0.65, 0.65, 0.65, 0.57, 0.55, 0.65, 0.65, 0.55, 0.55]

    np.testing.assert_almost_equal(train_scores, want_train, decimal=2)
    np.testing.assert_almost_equal(test_scores, want_test, decimal=2)
示例#11
0
def test_scikit_metrics():
    """Duplicate of the accuracy-curve check against reference values."""
    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.6, random_state=2)

    clf = DecisionTreeClassifier(max_depth=1, random_state=1)
    acc_train, acc_test = plot_learning_curves(
        X_train, y_train, X_test, y_test, clf,
        suppress_plot=True, scoring='accuracy')

    # Expected accuracies (not error rates) for this split and model.
    expected_train = [0.68, 0.67, 0.68, 0.67, 0.7, 0.69, 0.69, 0.78, 0.78, 0.78]
    expected_test = [0.65, 0.65, 0.65, 0.65, 0.57, 0.55, 0.65, 0.65, 0.55, 0.55]

    np.testing.assert_almost_equal(acc_train, expected_train, decimal=2)
    np.testing.assert_almost_equal(acc_test, expected_test, decimal=2)
def kNN(of):
    """Train and evaluate a 3-NN voice-gender classifier.

    Parameters
    ----------
    of : bool
        When true, drop a fixed set of lower-value features before
        training ("optimised features" mode).

    Reads ../input/voice.csv, prints validation and test metrics, and
    shows learning-curve plots.
    """
    all_data = pd.read_csv("../input/voice.csv")

    # Encode the male/female label column as integers.
    label_encoder = LabelEncoder()
    all_data["label"] = label_encoder.fit_transform(all_data["label"])

    rand_indices = np.random.permutation(len(all_data))
    features = [feat for feat in all_data.columns if feat != "label"]

    if of:
        # Drop each unwanted feature if present.  (The original chained
        # remove() calls inside one bare try block, so a single missing
        # column silently aborted the rest of the pruning.)
        for feat in ('modindx', 'dfrange', 'maxdom', 'mindom', 'meandom',
                     'maxfun', 'minfun', 'mode', 'kurt', 'skew', 'Q75'):
            if feat in features:
                features.remove(feat)

    print(features)
    output = "label"
    num_datapoints = len(all_data)
    test_total = int(num_datapoints * 0.3)

    # First half of the held-out indices -> test, second half -> validation.
    test_set_indices = get_test_indices(num_datapoints)
    test_indices = []
    valid_indices = []
    for i in test_set_indices:
        if len(test_indices) < len(test_set_indices) * 0.5:
            test_indices.append(i)
        else:
            valid_indices.append(i)
    train_indices = get_train_indices(num_datapoints)

    test_data = all_data[features].iloc[test_indices]
    valid_data = all_data[features].iloc[valid_indices]
    train_data = all_data[features].iloc[train_indices]

    test_labels = all_data[output].iloc[test_indices]
    valid_labels = all_data[output].iloc[valid_indices]
    train_labels = all_data[output].iloc[train_indices]

    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(train_data, train_labels)

    # --- validation metrics ---
    y_pred = knn.predict(valid_data)
    print('------Validation Data-------')
    print('Accuracy Score:')
    print(accuracy_score(valid_labels, y_pred))
    print('Precision Score:')
    print(precision_score(valid_labels, y_pred))
    print('Recall Score:')
    print(recall_score(valid_labels, y_pred))
    print('Confusion Matrix:')
    print(confusion_matrix(valid_labels, y_pred))

    # NOTE(review): sibling functions pass train_data first here;
    # confirm valid_data as the "training" curve input is intended.
    plot_learning_curves(valid_data, valid_labels, test_data, test_labels, knn)
    plot_1 = plt

    # --- test metrics ---
    knn.fit(train_data, train_labels)
    y_pred = knn.predict(test_data)
    print('------Test Data-------')
    print('Accuracy Score:')
    print(accuracy_score(test_labels, y_pred))
    print('Precision Score:')
    print(precision_score(test_labels, y_pred))
    print('Recall Score:')
    print(recall_score(test_labels, y_pred))
    print('Confusion Matrix:')
    print(confusion_matrix(test_labels, y_pred))

    plot_learning_curves(train_data, train_labels, test_data, test_labels, knn)
    plot_1.show()
    plt.show()
def svcc(c, of, svm_kernel):
    """Train and evaluate an SVM voice-gender classifier.

    Parameters
    ----------
    c : float
        SVM regularisation strength (``C``).
    of : bool
        When true, drop a fixed set of lower-value features.
    svm_kernel : str
        Kernel name passed to ``svm.SVC``.

    Uses the module-level ``all_data`` frame; prints validation and
    test metrics and shows learning-curve plots.
    """
    features = [feat for feat in all_data.columns if feat != "label"]
    if of:
        # Drop each unwanted feature if present.  (The original chained
        # remove() calls inside one bare try block, so a single missing
        # column silently aborted the rest of the pruning.)
        for feat in ('modindx', 'dfrange', 'maxdom', 'mindom', 'meandom',
                     'maxfun', 'minfun', 'mode', 'kurt', 'skew', 'Q75'):
            if feat in features:
                features.remove(feat)

    print(features)
    output = "label"
    num_datapoints = len(all_data)
    test_total = int(num_datapoints * 0.3)

    # First half of the held-out indices -> test, second half -> validation.
    test_set_indices = get_test_indices(num_datapoints)
    test_indices = []
    valid_indices = []
    for i in test_set_indices:
        if len(test_indices) < len(test_set_indices) * 0.5:
            test_indices.append(i)
        else:
            valid_indices.append(i)
    train_indices = get_train_indices(num_datapoints)

    test_data = all_data[features].iloc[test_indices]
    valid_data = all_data[features].iloc[valid_indices]
    train_data = all_data[features].iloc[train_indices]

    test_labels = all_data[output].iloc[test_indices]
    valid_labels = all_data[output].iloc[valid_indices]
    train_labels = all_data[output].iloc[train_indices]

    print(num_datapoints, len(train_data), len(test_data))
    print(features)

    svc = svm.SVC(kernel=svm_kernel, C=c, gamma='auto')
    svc.fit(train_data, train_labels)

    # --- validation metrics ---
    predictions = svc.predict(valid_data)
    print('----------Validation Data-----------')
    print('C: ' + str(c) + '\t Kernel: ' + svm_kernel)
    print('Accuracy Score:')
    print(accuracy_score(valid_labels, predictions))
    print('Precision Score:')
    print(precision_score(valid_labels, predictions))
    print('Recall Score:')
    print(recall_score(valid_labels, predictions))
    print('Confusion Matrix:')
    print(confusion_matrix(valid_labels, predictions))

    plot_learning_curves(train_data, train_labels, valid_data, valid_labels, svc)
    plot_1 = plt

    # --- test metrics ---
    predictions = svc.predict(test_data)
    print('----------Test Data-----------')
    print('C: ' + str(c) + '\t Kernel: ' + svm_kernel)
    print('Accuracy Score:')
    print(accuracy_score(test_labels, predictions))
    print('Precision Score:')
    print(precision_score(test_labels, predictions))
    print('Recall Score:')
    print(recall_score(test_labels, predictions))
    print('Confusion Matrix:')
    print(confusion_matrix(test_labels, predictions))

    plot_learning_curves(train_data, train_labels, test_data, test_labels, svc)
    plot_1.show()
    plt.show()
示例#14
0
                             scoring=scoring,
                             return_train_score=False,
                             n_jobs=-1)
    for key in metrics.keys():
        for fold_index, score in enumerate(metrics[key]):
            cv_result_entries.append(
                (model_namelist[i], fold_index, key, score))
    i += 1
# Collect the per-fold CV scores gathered above into a flat frame:
# one row per (model name, fold index, metric key, score).
cv_results_df = pd.DataFrame(cv_result_entries)

# %% [markdown]
# ### Misclassification Errors
# Plot a learning curve for each fitted model in turn.
i = 0
for model in models:

    plot_learning_curves(X_train, y_train, X_test, y_test, model)
    plt.title('Learning Curve for ' + model_namelist[i], fontsize=14)
    plt.xlabel('Training Set Size (%)', fontsize=12)
    plt.ylabel('Misclassification Error', fontsize=12)
    plt.show()
    i += 1

# %% [markdown]
# ### Get predictions: prep for Confusion Matrix
# Test-set predictions from every model, in the same order as `models`.
y_test_pred = []
for model in models:
    y_test_pred.append(model.predict(X_test))

# %% [markdown]
# ### Graph metrics
# Shared figure size for the metric plots that follow.
fig_size_tuple = (15, 7)
示例#15
0
# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
# NOTE(review): predictions on the validation set are stored in a name
# that suggests test labels — confirm this rebinding is intended.
Y_test = logreg.predict(X_val)
# Training accuracy, as a percentage rounded to 2 decimals.
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)

# Determine correlation of each feature to the result
coeff_df = pd.DataFrame(train_df.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])
print(coeff_df.sort_values(by='Correlation', ascending=False))
# print(train_df.head(20))
# MLP classifier used only for the learning-curve plot below.
clf = MLPClassifier(max_iter=2000, alpha=0.001)
# print(X_train.shape,Y_train.shape,X_val.shape,Y_val.shape)
# clf = SVC()
plot_learning_curves(X_train, Y_train, X_val, Y_val, clf)
plt.show()
# print(xval_df['APURCH'].size)
# print(Y_val.shape)

# plt.plot(train_sizes,train_scores,'r')
# plt.plot(train_scores,valid_scores,'b')
# plt.show()

#Train various model using data and return it's accuraries
svc = SVC()
svc.fit(X_train, Y_train)
# NOTE(review): this overwrites the Y_val labels used in the learning
# curve above with SVC predictions — verify that is deliberate.
Y_val = svc.predict(X_val)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
print("acc svc = ", acc_svc)
示例#16
0
# Per-classifier 3-fold CV summary statistics.
clf_cv_mean = []
clf_cv_std = []
# NOTE(review): the loop variable `label` shadows (and clobbers) the
# `label` list being zipped — works once, but fragile if re-run.
for clf, label, grd in zip(clf_list, label, grid):
    scores = cross_val_score(clf, X, y, cv=3, scoring='accuracy')
    print("Accuracy: %.2f (+/- %.2f) [%s]" % (scores.mean(), scores.std(), label))
    clf_cv_mean.append(scores.mean())
    clf_cv_std.append(scores.std())

    # Fit on the full data and draw this classifier's decision regions
    # into its grid cell.
    clf.fit(X, y)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(X=X, y=y, clf=clf)
    plt.title(label)

plt.show()

#plot classifier accuracy
# Error bars are the CV standard deviations; assumes exactly 4 models.
plt.figure()
(_, caps, _) = plt.errorbar(range(4), clf_cv_mean, yerr=clf_cv_std, c='blue', fmt='-o', capsize=5)
for cap in caps:
    cap.set_markeredgewidth(1)
plt.xticks(range(4), ['KNN', 'RF', 'NB', 'Stacking'])
plt.ylabel('Accuracy'); plt.xlabel('Classifier'); plt.title('Stacking Ensemble');
plt.show()

# plot learning curves
# 70/30 split with a fixed seed for the stacked classifier `sclf`.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

plt.figure()
plot_learning_curves(X_train, y_train, X_test, y_test, sclf, print_model=False, style='ggplot')
plt.show()
示例#17
0
def main_fun():

    test = pd.read_csv("test.csv")

    train = pd.read_csv("train.csv")

    train['Comment'] = train['Comment'].fillna("")
    test['Comment'] = test['Comment'].fillna("")

    print(train.head())

    combi = train.append(test, ignore_index=True)

    combi['Comment'] = combi['Comment'].str.replace("[^a-zA-Z#]", " ")

    combi['Comment'] = combi['Comment'].apply(
        lambda x: ' '.join([w for w in x.split() if len(w) > 3]))

    combi.head()

    tokenized_comment = combi['Comment'].apply(lambda x: x.split())

    tokenized_comment.head()

    stemmer = PorterStemmer()

    tokenized_comment = tokenized_comment.apply(
        lambda x: [stemmer.stem(i) for i in x])  # stemming
    tokenized_comment.head()

    for i in range(len(tokenized_comment)):
        tokenized_comment[i] = ' '.join(tokenized_comment[i])

    combi['Comment'] = tokenized_comment

    print(combi['Comment'])

    all_words = ' '.join([text for text in combi['Comment']])
    from wordcloud import WordCloud
    wordcloud = WordCloud(width=800,
                          height=500,
                          random_state=21,
                          max_font_size=110).generate(all_words)

    plt.figure(figsize=(10, 7))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis('off')
    plt.show()

    normal_words = ' '.join(
        [text for text in combi['Comment'][combi['polarity'] == 1]])

    wordcloud = WordCloud(width=800,
                          height=500,
                          random_state=21,
                          max_font_size=110).generate(normal_words)
    plt.figure(figsize=(10, 7))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis('off')
    plt.show()

    negative_words = ' '.join(
        [text for text in combi['Comment'][combi['polarity'] == 0]])
    wordcloud = WordCloud(width=800,
                          height=500,
                          random_state=21,
                          max_font_size=110).generate(negative_words)
    plt.figure(figsize=(10, 7))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis('off')
    plt.show()

    bow_vectorizer = CountVectorizer(max_df=0.90,
                                     min_df=2,
                                     max_features=1000,
                                     stop_words='english')
    # bag-of-words feature matrix
    bow = bow_vectorizer.fit_transform(combi['Comment'])

    train_bow = bow[:17741, :]
    test_bow = bow[17742:, :]

    print("TEST BOW ", test_bow)

    #train_tfidf = tfidf[:17741,:]
    #test_tfidf = tfidf[17742:,:]

    # splitting data into training and validation set
    xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(
        train_bow, train['polarity'], random_state=42, test_size=0.3)
    #xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(train_tfidf, train['polarity'], random_state=42, test_size=0.3)

    #xtrain_tfidf = train_tfidf[ytrain.index]
    #xvalid_tfidf = train_tfidf[yvalid.index]

    print("X TYPE : ", type(xtrain_bow))
    print("Y TYPE : ", type(ytrain))

    print(xtrain_bow)
    print(ytrain)

    #----------------------------------------------------------------------------------------------------

    svc = svm.SVC(kernel='linear',
                  C=1,
                  probability=True,
                  decision_function_shape='ovo').fit(xtrain_bow, ytrain)

    plot_learning_curves(xtrain_bow, ytrain, xvalid_bow, yvalid, svc)
    plt.show()

    prediction = svc.predict_proba(xvalid_bow)
    prediction_int = prediction[:, 1] >= 0.3
    prediction_int = prediction_int.astype(np.int)

    positive_comments = []
    negative_comments = []

    for i in prediction_int:
        if i == 0:
            negative_comments.append(i)
        else:
            positive_comments.append(i)

    print("TOTAL POSITIVE COMMENTS : ", len(positive_comments))
    print("TOTAL NEGATIVE COMMENTS : ", len(negative_comments))

    plt.bar(["Positive"], [len(positive_comments)], label="Positive")
    plt.bar(["Negative"], [len(negative_comments)], label="Negative")
    plt.legend()
    plt.xlabel('Type of Comment')
    plt.ylabel('Count of Comment')
    plt.title('Sentiment Analysis')

    plt.show()

    #--------------------------------------------------------------------------------------------------------------
    '''from sklearn.linear_model import LogisticRegression


    lreg = LogisticRegression(solver='lbfgs',max_iter=200)

    #lreg.fit(xtrain_bow, ytrain) # training the model with bow

    lreg.fit(xtrain_tfidf, ytrain)

    prediction = lreg.predict_proba(xvalid_bow) # predicting on the validation set
    prediction_int = prediction[:,1] >= 0.3 # if prediction is greater than or equal to 0.3 than 1 else 0
    prediction_int = prediction_int.astype(np.int)'''
    #-----------------------------------------------------------------------------------------------------------------
    '''from sklearn.ensemble import RandomForestClassifier

    rf = RandomForestClassifier(n_estimators=400, random_state=11).fit(xtrain_bow, ytrain)
    prediction_int = rf.predict(xvalid_bow)'''

    #-----------------------------------------------------------------------------------------------------------------

    print(
        "--------------------------------------------Results--------------------------------------------"
    )

    print()

    print("     F1 Score = ", f1_score(yvalid,
                                       prediction_int))  # calculating f1 score
    print()
    print("     Confusion Matrix of Model")
    print()
    print(confusion_matrix(yvalid, prediction_int))
    print()

    confusion_mat = confusion_matrix(yvalid, prediction_int)

    class_names = ['Positive', 'Negative']

    fig, ax = plot_confusion_matrix(conf_mat=confusion_mat,
                                    class_names=class_names)
    plt.show()

    print("--------Classification Report--------------")
    print()

    print(classification_report(yvalid, prediction_int))

    y_pred_prob = svc.predict_proba(xvalid_bow)[:, 1]

    fpr, tpr, thresholds = roc_curve(yvalid, y_pred_prob)
    # create plot
    plt.plot(fpr, tpr, label='ROC curve')
    plt.plot([0, 1], [0, 1], 'k--', label='Random guess')
    _ = plt.xlabel('False Positive Rate')
    _ = plt.ylabel('True Positive Rate')
    _ = plt.title('ROC Curve')
    _ = plt.xlim([-0.02, 1])
    _ = plt.ylim([0, 1.02])
    _ = plt.legend(loc="lower right")

    plt.show()

    print()

    print("     ROC_AUC_SCORE = ", roc_auc_score(yvalid, y_pred_prob))

    precision, recall, thresholds = precision_recall_curve(yvalid, y_pred_prob)
    # create plot
    plt.plot(precision, recall, label='Precision-recall cuisnull()rve')
    _ = plt.xlabel('Precision')
    _ = plt.ylabel('Recall')
    _ = plt.title('Precision-recall curve')
    _ = plt.legend(loc="lower left")

    plt.show()

    print()
    print("     Average_Precision_Score = ",
          average_precision_score(yvalid, y_pred_prob))

    acc_score = accuracy_score(yvalid, prediction_int)

    print()
    print("     Accuracy score = ", acc_score)
    '''from mlxtend.plotting import plot_decision_regions

    print("X TYPE : ",type(xtrain_bow))
    print("Y TYPE : ",type(ytrain))

    arr_x = xtrain_bow.toarray()

    arr_x 
    arr_y = ytrain.to_numpy()

    plot_decision_regions(arr_x, arr_y, clf=svc, legend=2)

    plt.xlabel('Positive')
    plt.ylabel('Negative')
    plt.title('SVM on Iris')
    plt.show()'''
    '''import eli5
示例#18
0
# 3-fold CV accuracy for every classifier.
# NOTE(review): the loop variable `label` shadows the zipped list.
for clf, label in zip(classifiers, label):
    scores = cross_val_score(clf, x_pca, train_label, cv=3, scoring='accuracy')
    print("Accuracy: %.2f (+/- %.2f) [%s]" %(scores.mean(), scores.std(), label))

"""###Random Forest Classification

Above we have seen all the cross validation scores for single classifiers and using bagging as an ensemble method
"""

#Now lets see the effect of max_samples meaning the effect of subsampling the data
bags = [bagging_dt, bagging_knn, bagging_lr]
x_train0, x_test0, y_train0, y_test0 = train_test_split(x_pca, train_label, test_size=0.3, random_state=7)
for b in bags:
    plt.figure()
    # BUG FIX: the original passed X_test0 (undefined; NameError) —
    # the split above binds x_test0.
    plot_learning_curves(x_train0, y_train0, x_test0, y_test0, b, print_model=False, style='ggplot')
    plt.show()

"""Tables are for  'Bagging Tree', 'Bagging K-NN' and 'Bagging Logistic Regression' respectively. 
As we can see in all tables choosing ~80% data as training data we achieve the best ensemble models.
"""

from sklearn.model_selection import GridSearchCV

# Search space for tuning the random forest below.
param_grid = {
    'bootstrap': [True],
    'max_depth': [10,20,30,50],
    'max_features': [0.8, 0.9],
    'n_estimators': [10,20,50,100]
}
rf = RandomForestClassifier()
示例#19
0
                             cv=5,
                             scoring=scoring,
                             return_train_score=False,
                             n_jobs=2)
    for key in metrics.keys():
        for fold_index, score in enumerate(metrics[key]):
            cv_result_entries.append(
                (model_namelist[i], fold_index, key, score))
    i += 1
# Collect the per-fold CV scores gathered above into a flat frame:
# one row per (model name, fold index, metric key, score).
cv_results_df = pandas.DataFrame(cv_result_entries)

# %% [markdown]
# ### Misclassification Errors
# Plot a learning curve for each fitted model in turn.
i = 0
for model in models:
    plot_learning_curves(x_train_val, y_train, x_test, y_test, model)
    plt.title('Learning Curve for ' + model_namelist[i], fontsize=14)
    plt.xlabel('Training Set Size (%)', fontsize=12)
    plt.ylabel('Misclassification Error', fontsize=12)
    plt.show()
    i += 1

# %% [markdown]
# ### Get predictions: prep for Confusion Matrix
# Test-set predictions from every model, in the same order as `models`.
y_test_pred = []
for model in models:
    y_test_pred.append(model.predict(x_test))

# %% [markdown]
# ### Confusion Matrix
from sklearn.metrics import confusion_matrix
示例#20
0
}
# Grid-search an RBF SVM (class-balanced) over param_grid with 5-fold CV.
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'),
                   param_grid,
                   cv=5)
clf = clf.fit(X_train_pca, Y_train)

y_pred = clf.predict(X_test_pca)

print(classification_report(Y_test, y_pred))
# NOTE(review): labels are the strings '0'/'1' — confirm Y_test holds
# string labels, not ints, or the matrix will be empty.
print(confusion_matrix(Y_test, y_pred, labels=['0', '1']))

# Plot the learning curves
plt.figure(figsize=(20, 10))
plot_learning_curves(X_train_pca,
                     Y_train,
                     X_test_pca,
                     Y_test,
                     clf,
                     scoring="accuracy")
plt.title("Learning Curves")

plt.show()

# plot the result of the prediction on a portion of the test set
def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    """Helper function to plot a gallery of portraits"""
    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    for i in range(n_row * n_col):
        plt.subplot(n_row, n_col, i + 1)
        plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
示例#21
0
def rff(ne, md, of):
    """Train and evaluate a random-forest voice-gender classifier.

    Parameters
    ----------
    ne : int
        Number of estimators in the forest.
    md : int
        Maximum tree depth.
    of : bool
        When true, drop a fixed set of lower-value features.

    Reads ../input/voice.csv, prints validation and test metrics, and
    shows learning-curve plots.
    """
    all_data = pd.read_csv("../input/voice.csv")

    # Encode the male/female label column as integers.
    label_encoder = LabelEncoder()
    all_data["label"] = label_encoder.fit_transform(all_data["label"])

    rand_indices = np.random.permutation(len(all_data))
    features = [feat for feat in all_data.columns if feat != "label"]
    if of:
        # Drop each unwanted feature if present.  (The original chained
        # remove() calls inside one bare try block, so a single missing
        # column silently aborted the rest of the pruning.)
        for feat in ('modindx', 'dfrange', 'maxdom', 'mindom', 'meandom',
                     'maxfun', 'minfun', 'mode', 'kurt', 'skew', 'Q75'):
            if feat in features:
                features.remove(feat)

    print(features)
    output = "label"
    num_datapoints = len(all_data)
    test_total = int(num_datapoints * 0.3)

    # First half of the held-out indices -> test, second half -> validation.
    test_set_indices = get_test_indices(num_datapoints)
    test_indices = []
    valid_indices = []
    for i in test_set_indices:
        if len(test_indices) < len(test_set_indices) * 0.5:
            test_indices.append(i)
        else:
            valid_indices.append(i)
    train_indices = get_train_indices(num_datapoints)

    test_data = all_data[features].iloc[test_indices]
    valid_data = all_data[features].iloc[valid_indices]
    train_data = all_data[features].iloc[train_indices]

    test_labels = all_data[output].iloc[test_indices]
    valid_labels = all_data[output].iloc[valid_indices]
    train_labels = all_data[output].iloc[train_indices]

    print(num_datapoints, len(train_data), len(test_data))
    print(features)

    rf = RandomForestClassifier(n_estimators=ne, max_depth=md)
    rf.fit(train_data, train_labels)

    # --- validation metrics ---
    print('Number of Estimators: ' + str(ne) + '\t Max. Depth: ' + str(md))
    predictions = rf.predict(valid_data)
    print('---------Validation Data-----------')
    print('Accuracy Score:')
    print(accuracy_score(valid_labels, predictions))
    print('Precision Score:')
    print(precision_score(valid_labels, predictions))
    print('Recall Score:')
    print(recall_score(valid_labels, predictions))
    print('Confusion Matrix:')
    print(confusion_matrix(valid_labels, predictions))

    plot_learning_curves(train_data, train_labels, valid_data, valid_labels,
                         rf)
    plot_1 = plt

    # --- test metrics ---
    predictions = rf.predict(test_data)
    print('---------Test Data-----------')
    print('Accuracy Score:')
    print(accuracy_score(test_labels, predictions))
    print('Precision Score:')
    print(precision_score(test_labels, predictions))
    print('Recall Score:')
    print(recall_score(test_labels, predictions))
    print('Confusion Matrix:')
    print(confusion_matrix(test_labels, predictions))

    plot_learning_curves(train_data, train_labels, test_data, test_labels, rf)
    plot_1.show()
    plt.show()
示例#22
0
# Impute missing values with each column's mean: Age in the training
# frame, Fare in the test frame.
new_data_train['Age'].fillna(new_data_train['Age'].mean(), inplace=True)
new_data_test['Fare'].fillna(new_data_test['Fare'].mean(), inplace=True)

# Target is the 'Survived' column; every other column is a feature.
y = new_data_train['Survived']
X = new_data_train.drop('Survived', axis=1)

# Hold out 20% of the rows for evaluation (split is not seeded, so it
# varies between runs).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print(y_test)

# Fit a shallow (depth-3) decision tree on the training split.
tree = DecisionTreeClassifier(max_depth=3, random_state=0)
tree.fit(X_train, y_train)

# Learning curve of the fitted tree; the title shows its held-out accuracy.
plot_learning_curves(X_train, y_train, X_test, y_test, clf=tree)
plt.title("DecisionTreeClassifier %.2f " % (float(tree.score(X_test, y_test))))
plt.show()

# Build the Kaggle submission file: one predicted label per test passenger.
# NOTE(review): PassengerId is read from X_test, so it must be among the
# feature columns -- confirm upstream.
submission = pd.DataFrame()
submission['PassengerId'] = X_test['PassengerId']
submission['Survived'] = tree.predict(X_test)
submission.to_csv('prediction/Titanic/submission.csv', index=False)
示例#23
0
def lr(c, of):
    """Train a logistic-regression classifier on ``all_data`` and report metrics.

    Splits the held-out indices from ``get_test_indices`` in half (first half
    becomes the test set, the rest the validation set), fits
    ``LogisticRegression(C=c)`` on the training rows, then prints
    accuracy/precision/recall/confusion-matrix for both held-out splits and
    shows two learning-curve plots.

    Parameters
    ----------
    c : float
        Inverse regularization strength for LogisticRegression.
    of : bool
        When true, drop a fixed list of feature columns before training
        (presumably to reduce overfitting -- TODO confirm intent).
    """
    features = [feat for feat in all_data.columns if feat != "label"]
    if of:
        # Filter the unwanted columns in one pass. The original chained
        # features.remove(...) calls inside a single try/except, so one
        # missing column silently skipped every remaining removal.
        dropped = ('modindx', 'dfrange', 'maxdom', 'mindom', 'meandom',
                   'maxfun', 'minfun', 'mode', 'kurt', 'skew', 'Q75')
        features = [feat for feat in features if feat not in dropped]

    print(features)
    output = "label"
    num_datapoints = len(all_data)

    # First half of the held-out indices -> test, second half -> validation.
    test_set_indices = get_test_indices(num_datapoints)
    half = len(test_set_indices) * 0.5
    test_indices = []
    valid_indices = []
    for i in test_set_indices:
        if len(test_indices) < half:
            test_indices.append(i)
        else:
            valid_indices.append(i)
    train_indices = get_train_indices(num_datapoints)

    test_data = all_data[features].iloc[test_indices]
    valid_data = all_data[features].iloc[valid_indices]
    train_data = all_data[features].iloc[train_indices]

    test_labels = all_data[output].iloc[test_indices]
    valid_labels = all_data[output].iloc[valid_indices]
    train_labels = all_data[output].iloc[train_indices]

    logistic = linear_model.LogisticRegression(C=c)
    logistic.fit(train_data, train_labels)

    # Validation-split report and learning curve.
    predictions = logistic.predict(valid_data)
    print('--------Validation Data----------')
    print('-------' + str(c) + '-------')
    _print_classification_scores(valid_labels, predictions)

    plot_learning_curves(train_data, train_labels, valid_data, valid_labels,
                         logistic)
    plot_1 = plt

    # Test-split report and learning curve.
    predictions = logistic.predict(test_data)
    print('--------Testing Data----------')
    print('-------' + str(c) + '-------')
    _print_classification_scores(test_labels, predictions)

    plot_learning_curves(train_data, train_labels, test_data, test_labels,
                         logistic)
    plot_1.show()
    plt.show()


def _print_classification_scores(labels, predictions):
    """Print accuracy, precision, recall and the confusion matrix."""
    print('Accuracy Score:')
    print(accuracy_score(labels, predictions))
    print('Precision Score:')
    print(precision_score(labels, predictions))
    print('Recall Score:')
    print(recall_score(labels, predictions))
    print('Confusion Matrix:')
    print(confusion_matrix(labels, predictions))
                             scoring=scoring,
                             return_train_score=False,
                             n_jobs=-1)
    for key in metrics.keys():
        for fold_index, score in enumerate(metrics[key]):
            cv_result_entries.append(
                (model_namelist[i], fold_index, key, score))
    i += 1
# Collect the per-fold CV scores (built by the loop above, which starts
# outside this view) into a tidy DataFrame of
# (model name, fold index, metric key, score) rows -- TODO confirm shape.
cv_results_df = pd.DataFrame(cv_result_entries)

# %%
# ### Misclassification Errors
# Plot a learning curve for every fitted model. NOTE(review): train and
# test arguments are both (X, y), so the curve measures fit on the full
# data, not generalization -- confirm this is intended.
i = 0
for model in models:
    plt.figure()
    plot_learning_curves(X, y, X, y, model)
    plt.title('Learning Curve for ' + model_namelist[i], fontsize=14)
    plt.xlabel('Training Set Size (%)', fontsize=12)
    plt.ylabel('Misclassification Error', fontsize=12)
    plt.show()
    i += 1

# %% [markdown]
# ### Get predictions: prep for Confusion Matrix
# One prediction vector per model; predictions are made on X itself
# (the same data the models saw) -- verify against the notebook's intent.
y_test_pred = []
for model in models:
    y_test_pred.append(model.predict(X))

# %% [markdown]
# ### Confusion Matrix
from sklearn.metrics import confusion_matrix
示例#25
0
# Linear SVM with probability estimates enabled so predict_proba can be
# used for thresholding below; 'ovo' decision shape for multi-class input.
svc = svm.SVC(kernel='linear',
              C=1,
              probability=True,
              decision_function_shape='ovo').fit(xtrain_bow, ytrain)

print(svc)

print("\n\n")
text3 = colored("6. Displaying the learning Curve ", 'red', attrs=['bold'])
print(text3)

print("\n\n")

from mlxtend.plotting import plot_learning_curves

# Learning curve: training vs. validation error of the fitted SVM on the
# bag-of-words features.
plot_learning_curves(xtrain_bow, ytrain, xvalid_bow, yvalid, svc)
plt.show()

# Threshold the positive-class probability at 0.3 to get hard 0/1 labels.
prediction = svc.predict_proba(xvalid_bow)
prediction_int = prediction[:, 1] >= 0.3
# Fix: np.int was deprecated in NumPy 1.20 and removed in 1.24, so
# astype(np.int) raises AttributeError there; the builtin int is the
# documented equivalent and yields the same int64 array.
prediction_int = prediction_int.astype(int)

print("\n\n")
text3 = colored("7. Displaying the Predicted Sentiment Analysis ",
                'red',
                attrs=['bold'])
print(text3)
f = input()  # pause until the user presses Enter
print("\n\n")
positive_comments = []
negative_comments = []