def main(self):
    """Run the SAT-score pipeline end to end: load, split, impute, fit,
    plot, persist, and draw a learning curve.

    BUG FIX: in the mangled source everything after the first ``#`` had been
    swallowed into a single trailing comment, so none of the pipeline ran.
    The code below is recovered verbatim from that comment text.
    """
    print(__doc__)
    # creating object instances
    obj = sat()
    # extracting features and output
    x, y = obj.load_data("gpa.csv")
    # splitting the data
    x_train, x_test, y_train, y_test = obj.split(x, y)
    # scaling the data (intentionally disabled)
    # x_train, x_test, y_train, y_test = obj.scale(x_train, x_test, y_train, y_test)
    # missing value imputation
    x_train = obj.missing_val(x_train)
    x_test = obj.missing_val(x_test)
    y_train = obj.missing_val(y_train)
    y_test = obj.missing_val(y_test)
    # generating classifier
    clf = obj.classifier()
    # fitting the features into the model
    clf.fit(x_train, y_train)
    # plotting training set
    obj.plot(clf, x_train, y_train, "orange", "blue",
             "sat score (Training set)", "GPA", "SAT SCORE")
    # plotting the testing set
    obj.plot(clf, x_test, y_test, "orange", "blue",
             "sat score (Testing set)", "GPA", "SAT SCORE")
    # saving classifier
    obj.save_classifier(clf, "sat_score.pkl", "wb")
    # loading the classifier back to verify persistence
    clf = obj.load_classifier("sat_score.pkl", "rb")
    # shuffle the full dataset and plot a learning curve against it
    x, y = shuffle_arrays_unison(arrays=[x, y], random_seed=5)
    plot_learning_curves(x_train, y_train, x, y, clf)
    plt.show()
def plot_ml_training_curves(self, train_x, train_y, test_x, test_y, model_object):
    """Draw an accuracy learning curve for ``model_object`` over the given
    train/test split (mlxtend ``plot_learning_curves``) and display it."""
    curve_args = dict(scoring='accuracy', style='dark_background')
    plot_learning_curves(train_x, train_y, test_x, test_y,
                         model_object, **curve_args)
    plt.show()
def PlotLearningCurve(self, X, sclf, y):
    """Split (X, y) 70/30 and plot the learning curve of classifier ``sclf``."""
    x_tr, x_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.3, random_state=42)
    plt.figure()
    plot_learning_curves(x_tr, y_tr, x_te, y_te, sclf,
                         print_model=False, style='ggplot')
    plt.show()
def stackingDetection():
    """Cross-validate three base classifiers on the module-level
    ``Xtrain``/``ytrain``, then plot the learning curve of a bagged MLP.

    Fixes vs. original: the loop variable ``label`` shadowed the list it
    iterated (renamed list to ``labels``); ``svm_clf`` was a misleading name
    for a RandomForest (renamed ``rf_clf``); the never-used accumulators
    ``clf_cv_mean``/``clf_cv_std`` were dropped.
    """
    nb_clf = GaussianNB()
    rf_clf = RandomForestClassifier(n_estimators=100, max_depth=400,
                                    random_state=5)
    mlp_clf = MLPClassifier(hidden_layer_sizes=(500, 500))
    labels = ["NB", "RF", "MLP"]
    # NOTE(review): built but never used in the visible code — kept for parity
    metaclassifier = RandomForestClassifier(n_estimators=100, max_depth=400,
                                            random_state=5)
    clf_list = [nb_clf, rf_clf, mlp_clf]
    for clf, label in zip(clf_list, labels):
        # relies on module-level Xtrain / ytrain
        scores = cross_val_score(clf, Xtrain, ytrain, cv=3,
                                 scoring='accuracy')
        print(f"Accuracy: {round(scores.mean(), 4)} "
              f"Std: {round(scores.std(), 4)} Label: {label}")
    bagging1 = BaggingClassifier(base_estimator=mlp_clf, n_estimators=10,
                                 max_samples=0.8, max_features=0.8)
    plt.figure()
    plot_learning_curves(Xtrain, ytrain, Xtest, ytest, bagging1,
                         print_model=False, style='ggplot')
    plt.show()
def classify_smile(test_samples):
    """Train/evaluate an SVM smile detector, write prediction and precision
    CSVs, merge them into ``task_1.csv``, and plot a learning curve.

    BUG FIX: ``size_pred`` was used below but its assignment had been
    commented out, so the function raised ``NameError``; it is restored.
    The accuracy prints now reuse the already-computed values instead of
    recomputing ``accuracy_score`` (identical output).
    """
    Xtrain, Xtest = divide_x(test_samples)
    # flatten each image into a 1-D feature vector
    Xtrain = Xtrain.reshape(Xtrain.shape[0], Xtrain.shape[1] * Xtrain.shape[2])
    Xtest = Xtest.reshape(Xtest.shape[0], Xtest.shape[1] * Xtest.shape[2])
    Ytrain, Ytest = get_labels(3, test_samples)
    Y_pred_te = SVM_smile(Xtrain, Ytrain, Xtest)  # trains the model
    size_te = len(Y_pred_te)
    Y_pred_tr = SVM_smile(Xtrain, Ytrain, Xtrain)
    size_tr = len(Y_pred_tr)
    test_accuracy = accuracy_score(Ytest, Y_pred_te) * 100
    train_accuracy = accuracy_score(Ytrain, Y_pred_tr) * 100
    print("Accuracy obtained on test data for smile detection:")
    print(test_accuracy, '%')
    print("Accuracy obtained on train data for smile detection:")
    print(train_accuracy, '%')
    print("The confusion matrix is:")
    print(confusion_matrix(Ytest, Y_pred_te))
    Y_pred = np.concatenate((Y_pred_tr, Y_pred_te), axis=0)
    size_pred = len(Y_pred)  # BUG FIX: was commented out but used below
    np.savetxt("task_2labels.csv", Y_pred, delimiter=',')
    # weighted mean of precisions across training/testing data
    precision = (test_accuracy * size_te) + (train_accuracy * size_tr)
    print(precision / size_pred)
    percentage = [precision / size_pred]
    # creates csv files for predicted values & precision
    np.savetxt("task_1precision.csv", percentage, delimiter=',')
    a = pd.read_csv("noise_classified.csv")
    # NOTE(review): labels were written to task_2labels.csv above but
    # task_1labels.csv is read here — confirm the intended filenames
    b = pd.read_csv("task_1labels.csv")
    c = pd.read_csv("task_1precision.csv")
    # final csv is obtained by concatenating the others
    merged = pd.concat([a, b], axis=1)
    merged.to_csv("task_1.csv", index=False)
    labelled = pd.read_csv("task_1.csv")
    final = pd.concat([labelled, c], axis=1)
    final.to_csv("task_1.csv", index=False)
    # learning curve
    clf = svm.SVC(C=3, degree=2, gamma='scale', kernel='poly')
    plot_learning_curves(Xtrain, Ytrain, Xtest, Ytest, clf)
    plt.show()
def test_training_size():
    """Pin the error curves of a depth-1 tree on a fixed 60/40 iris split."""
    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.6, random_state=2)
    clf = DecisionTreeClassifier(max_depth=1, random_state=1)
    training_errors, test_errors = plot_learning_curves(
        X_train, y_train, X_test, y_test, clf, suppress_plot=True)
    expected_train = [0.22, 0.22, 0.22, 0.31, 0.31, 0.3, 0.33, 0.32, 0.33, 0.32]
    expected_test = [0.45, 0.45, 0.35, 0.35, 0.45, 0.43, 0.35, 0.35, 0.35, 0.35]
    np.testing.assert_almost_equal(training_errors, expected_train, decimal=2)
    np.testing.assert_almost_equal(test_errors, expected_test, decimal=2)
def test_training_size():
    """Pin the error curves of a depth-1 tree on a fixed iris split
    (expressed as test_size=0.4, i.e. the same 60/40 split)."""
    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=2)
    clf = DecisionTreeClassifier(max_depth=1, random_state=1)
    training_errors, test_errors = plot_learning_curves(
        X_train, y_train, X_test, y_test, clf, suppress_plot=True)
    expected_train = [0.22, 0.22, 0.22, 0.31, 0.31, 0.3, 0.33, 0.32, 0.33, 0.32]
    expected_test = [0.45, 0.45, 0.35, 0.35, 0.45, 0.43, 0.35, 0.35, 0.35, 0.35]
    np.testing.assert_almost_equal(training_errors, expected_train, decimal=2)
    np.testing.assert_almost_equal(test_errors, expected_test, decimal=2)
def test_scikit_metrics():
    """With scoring='accuracy' the curves equal one minus the error curves."""
    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=2)
    clf = DecisionTreeClassifier(max_depth=1, random_state=1)
    training_acc, test_acc = plot_learning_curves(
        X_train, y_train, X_test, y_test, clf,
        scoring='accuracy', suppress_plot=True)
    err_train = np.array([0.22, 0.22, 0.22, 0.31, 0.31, 0.3, 0.33, 0.32, 0.33, 0.32])
    err_test = np.array([0.45, 0.45, 0.35, 0.35, 0.45, 0.43, 0.35, 0.35, 0.35, 0.35])
    np.testing.assert_almost_equal(training_acc, 1 - err_train, decimal=2)
    np.testing.assert_almost_equal(test_acc, 1 - err_test, decimal=2)
def test_scikit_metrics():
    """Accuracy variant using train_size=0.6 (the same 60/40 split)."""
    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.6, random_state=2)
    clf = DecisionTreeClassifier(max_depth=1, random_state=1)
    training_acc, test_acc = plot_learning_curves(
        X_train, y_train, X_test, y_test, clf,
        scoring='accuracy', suppress_plot=True)
    err_train = np.array([0.22, 0.22, 0.22, 0.31, 0.31, 0.3, 0.33, 0.32, 0.33, 0.32])
    err_test = np.array([0.45, 0.45, 0.35, 0.35, 0.45, 0.43, 0.35, 0.35, 0.35, 0.35])
    np.testing.assert_almost_equal(training_acc, 1 - err_train, decimal=2)
    np.testing.assert_almost_equal(test_acc, 1 - err_test, decimal=2)
def test_scikit_metrics():
    """Pin absolute accuracy values for a depth-1 tree on the fixed split."""
    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.6, random_state=2)
    clf = DecisionTreeClassifier(max_depth=1, random_state=1)
    training_errors, test_errors = plot_learning_curves(
        X_train, y_train, X_test, y_test, clf,
        suppress_plot=True, scoring='accuracy')
    expected_train = [0.68, 0.67, 0.68, 0.67, 0.7, 0.69, 0.69, 0.78, 0.78, 0.78]
    expected_test = [0.65, 0.65, 0.65, 0.65, 0.57, 0.55, 0.65, 0.65, 0.55, 0.55]
    np.testing.assert_almost_equal(training_errors, expected_train, decimal=2)
    np.testing.assert_almost_equal(test_errors, expected_test, decimal=2)
def kNN(of):
    """Train and evaluate a 3-NN classifier on the Kaggle voice dataset.

    Parameters:
        of: truthy to drop a fixed list of weaker acoustic features.

    Fixes vs. original: targeted ``except ValueError`` instead of a bare
    ``except`` (``list.remove`` is the only raiser here; the original
    one-try semantics — abort removals at the first missing feature and
    print a blank line — are preserved); ``insert(len(...), i)`` replaced
    with ``append``; unused ``test_total`` removed; the second, redundant
    ``knn.fit`` on identical data removed (deterministic, same model).
    """
    all_data = pd.read_csv("../input/voice.csv")
    label_encoder = LabelEncoder()
    all_data["label"] = label_encoder.fit_transform(all_data["label"])
    # NOTE(review): result unused, but the call advances the global NumPy RNG
    # state, which get_test_indices/get_train_indices may depend on — kept.
    rand_indices = np.random.permutation(len(all_data))
    features = [feat for feat in all_data.columns if feat != "label"]
    if of:
        try:
            for feat in ('modindx', 'dfrange', 'maxdom', 'mindom', 'meandom',
                         'maxfun', 'minfun', 'mode', 'kurt', 'skew', 'Q75'):
                features.remove(feat)
        except ValueError:
            print()
    print(features)
    output = "label"
    num_datapoints = len(all_data)
    test_set_indices = get_test_indices(num_datapoints)
    test_indices = []
    valid_indices = []
    # first half of the held-out indices -> test, second half -> validation
    for i in test_set_indices:
        if len(test_indices) < len(test_set_indices) * 0.5:
            test_indices.append(i)
        else:
            valid_indices.append(i)
    train_indices = get_train_indices(num_datapoints)
    test_data = all_data[features].iloc[test_indices]
    valid_data = all_data[features].iloc[valid_indices]
    train_data = all_data[features].iloc[train_indices]
    test_labels = all_data[output].iloc[test_indices]
    valid_labels = all_data[output].iloc[valid_indices]
    train_labels = all_data[output].iloc[train_indices]
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(train_data, train_labels)
    y_pred = knn.predict(valid_data)
    print('------Validation Data-------')
    print('Accuracy Score:')
    print(accuracy_score(valid_labels, y_pred))
    print('Precision Score:')
    print(precision_score(valid_labels, y_pred))
    print('Recall Score:')
    print(recall_score(valid_labels, y_pred))
    print('Confusion Matrix:')
    print(confusion_matrix(valid_labels, y_pred))
    plot_learning_curves(valid_data, valid_labels, test_data, test_labels, knn)
    plot_1 = plt
    y_pred = knn.predict(test_data)
    print('------Test Data-------')
    print('Accuracy Score:')
    print(accuracy_score(test_labels, y_pred))
    print('Precision Score:')
    print(precision_score(test_labels, y_pred))
    print('Recall Score:')
    print(recall_score(test_labels, y_pred))
    print('Confusion Matrix:')
    print(confusion_matrix(test_labels, y_pred))
    plot_learning_curves(train_data, train_labels, test_data, test_labels, knn)
    plot_1.show()
    plt.show()
def svcc(c, of, svm_kernel):
    """Train an SVM (given kernel and C) on the module-level ``all_data``
    voice dataset and print validation/test metrics plus learning curves.

    Fixes vs. original: targeted ``except ValueError`` instead of bare
    ``except`` (original semantics kept: first missing feature aborts all
    removals and prints a blank line); ``insert(len(...), i)`` replaced with
    ``append``; unused ``test_total`` removed.
    """
    features = [feat for feat in all_data.columns if feat != "label"]
    if of:
        try:
            for feat in ('modindx', 'dfrange', 'maxdom', 'mindom', 'meandom',
                         'maxfun', 'minfun', 'mode', 'kurt', 'skew', 'Q75'):
                features.remove(feat)
        except ValueError:
            print()
    print(features)
    output = "label"
    num_datapoints = len(all_data)
    test_set_indices = get_test_indices(num_datapoints)
    test_indices = []
    valid_indices = []
    # first half of the held-out indices -> test, second half -> validation
    for i in test_set_indices:
        if len(test_indices) < len(test_set_indices) * 0.5:
            test_indices.append(i)
        else:
            valid_indices.append(i)
    train_indices = get_train_indices(num_datapoints)
    test_data = all_data[features].iloc[test_indices]
    valid_data = all_data[features].iloc[valid_indices]
    train_data = all_data[features].iloc[train_indices]
    test_labels = all_data[output].iloc[test_indices]
    valid_labels = all_data[output].iloc[valid_indices]
    train_labels = all_data[output].iloc[train_indices]
    print(num_datapoints, len(train_data), len(test_data))
    print(features)
    # print(test_labels)
    svc = svm.SVC(kernel=svm_kernel, C=c, gamma='auto')
    svc.fit(train_data, train_labels)
    predictions = svc.predict(valid_data)
    print('----------Validation Data-----------')
    print('C: ' + str(c) + '\t Kernel: ' + svm_kernel)
    print('Accuracy Score:')
    print(accuracy_score(valid_labels, predictions))
    print('Precision Score:')
    print(precision_score(valid_labels, predictions))
    print('Recall Score:')
    print(recall_score(valid_labels, predictions))
    print('Confusion Matrix:')
    print(confusion_matrix(valid_labels, predictions))
    plot_learning_curves(train_data, train_labels, valid_data, valid_labels, svc)
    plot_1 = plt
    predictions = svc.predict(test_data)
    print('----------Test Data-----------')
    print('C: ' + str(c) + '\t Kernel: ' + svm_kernel)
    print('Accuracy Score:')
    print(accuracy_score(test_labels, predictions))
    print('Precision Score:')
    print(precision_score(test_labels, predictions))
    print('Recall Score:')
    print(recall_score(test_labels, predictions))
    print('Confusion Matrix:')
    print(confusion_matrix(test_labels, predictions))
    plot_learning_curves(train_data, train_labels, test_data, test_labels, svc)
    plot_1.show()
    plt.show()
scoring=scoring, return_train_score=False, n_jobs=-1) for key in metrics.keys(): for fold_index, score in enumerate(metrics[key]): cv_result_entries.append( (model_namelist[i], fold_index, key, score)) i += 1 cv_results_df = pd.DataFrame(cv_result_entries) # %% [markdown] # ### Misclassification Errors i = 0 for model in models: plot_learning_curves(X_train, y_train, X_test, y_test, model) plt.title('Learning Curve for ' + model_namelist[i], fontsize=14) plt.xlabel('Training Set Size (%)', fontsize=12) plt.ylabel('Misclassification Error', fontsize=12) plt.show() i += 1 # %% [markdown] # ### Get predictions: prep for Confusion Matrix y_test_pred = [] for model in models: y_test_pred.append(model.predict(X_test)) # %% [markdown] # ### Graph metrics fig_size_tuple = (15, 7)
# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
# NOTE(review): this overwrites Y_test with predictions made on X_val —
# confirm the intended variable names against the surrounding script.
Y_test = logreg.predict(X_val)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)

# Determine correlation of each feature to the result
coeff_df = pd.DataFrame(train_df.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])
print(coeff_df.sort_values(by='Correlation', ascending=False))
# print(train_df.head(20))

clf = MLPClassifier(max_iter=2000, alpha=0.001)
# print(X_train.shape, Y_train.shape, X_val.shape, Y_val.shape)
# clf = SVC()
plot_learning_curves(X_train, Y_train, X_val, Y_val, clf)
plt.show()
# print(xval_df['APURCH'].size)
# print(Y_val.shape)
# plt.plot(train_sizes, train_scores, 'r')
# plt.plot(train_scores, valid_scores, 'b')
# plt.show()

# Train various models using data and return their accuracies
svc = SVC()
svc.fit(X_train, Y_train)
# NOTE(review): overwrites the validation labels Y_val with predictions —
# any later use of Y_val as ground truth would be wrong; verify downstream.
Y_val = svc.predict(X_val)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
print("acc svc = ", acc_svc)
# Cross-validate each classifier, record mean/std accuracy, and draw its
# decision regions in the shared gridspec (clf_list/label/grid/gs/X/y/sclf
# are defined earlier in the file).
clf_cv_mean = []
clf_cv_std = []
for clf, label, grd in zip(clf_list, label, grid):
    scores = cross_val_score(clf, X, y, cv=3, scoring='accuracy')
    print("Accuracy: %.2f (+/- %.2f) [%s]"
          % (scores.mean(), scores.std(), label))
    clf_cv_mean.append(scores.mean())
    clf_cv_std.append(scores.std())
    clf.fit(X, y)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(X=X, y=y, clf=clf)
    plt.title(label)
plt.show()

# plot classifier accuracy with error bars
plt.figure()
(_, caps, _) = plt.errorbar(range(4), clf_cv_mean, yerr=clf_cv_std,
                            c='blue', fmt='-o', capsize=5)
for cap in caps:
    cap.set_markeredgewidth(1)
plt.xticks(range(4), ['KNN', 'RF', 'NB', 'Stacking'])
plt.ylabel('Accuracy')
plt.xlabel('Classifier')
plt.title('Stacking Ensemble')
plt.show()

# plot learning curves of the stacked classifier
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
plt.figure()
plot_learning_curves(X_train, y_train, X_test, y_test, sclf,
                     print_model=False, style='ggplot')
plt.show()
def main_fun(): test = pd.read_csv("test.csv") train = pd.read_csv("train.csv") train['Comment'] = train['Comment'].fillna("") test['Comment'] = test['Comment'].fillna("") print(train.head()) combi = train.append(test, ignore_index=True) combi['Comment'] = combi['Comment'].str.replace("[^a-zA-Z#]", " ") combi['Comment'] = combi['Comment'].apply( lambda x: ' '.join([w for w in x.split() if len(w) > 3])) combi.head() tokenized_comment = combi['Comment'].apply(lambda x: x.split()) tokenized_comment.head() stemmer = PorterStemmer() tokenized_comment = tokenized_comment.apply( lambda x: [stemmer.stem(i) for i in x]) # stemming tokenized_comment.head() for i in range(len(tokenized_comment)): tokenized_comment[i] = ' '.join(tokenized_comment[i]) combi['Comment'] = tokenized_comment print(combi['Comment']) all_words = ' '.join([text for text in combi['Comment']]) from wordcloud import WordCloud wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(all_words) plt.figure(figsize=(10, 7)) plt.imshow(wordcloud, interpolation="bilinear") plt.axis('off') plt.show() normal_words = ' '.join( [text for text in combi['Comment'][combi['polarity'] == 1]]) wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(normal_words) plt.figure(figsize=(10, 7)) plt.imshow(wordcloud, interpolation="bilinear") plt.axis('off') plt.show() negative_words = ' '.join( [text for text in combi['Comment'][combi['polarity'] == 0]]) wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(negative_words) plt.figure(figsize=(10, 7)) plt.imshow(wordcloud, interpolation="bilinear") plt.axis('off') plt.show() bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english') # bag-of-words feature matrix bow = bow_vectorizer.fit_transform(combi['Comment']) train_bow = bow[:17741, :] test_bow = bow[17742:, :] print("TEST BOW ", test_bow) #train_tfidf = tfidf[:17741,:] #test_tfidf 
= tfidf[17742:,:] # splitting data into training and validation set xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split( train_bow, train['polarity'], random_state=42, test_size=0.3) #xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(train_tfidf, train['polarity'], random_state=42, test_size=0.3) #xtrain_tfidf = train_tfidf[ytrain.index] #xvalid_tfidf = train_tfidf[yvalid.index] print("X TYPE : ", type(xtrain_bow)) print("Y TYPE : ", type(ytrain)) print(xtrain_bow) print(ytrain) #---------------------------------------------------------------------------------------------------- svc = svm.SVC(kernel='linear', C=1, probability=True, decision_function_shape='ovo').fit(xtrain_bow, ytrain) plot_learning_curves(xtrain_bow, ytrain, xvalid_bow, yvalid, svc) plt.show() prediction = svc.predict_proba(xvalid_bow) prediction_int = prediction[:, 1] >= 0.3 prediction_int = prediction_int.astype(np.int) positive_comments = [] negative_comments = [] for i in prediction_int: if i == 0: negative_comments.append(i) else: positive_comments.append(i) print("TOTAL POSITIVE COMMENTS : ", len(positive_comments)) print("TOTAL NEGATIVE COMMENTS : ", len(negative_comments)) plt.bar(["Positive"], [len(positive_comments)], label="Positive") plt.bar(["Negative"], [len(negative_comments)], label="Negative") plt.legend() plt.xlabel('Type of Comment') plt.ylabel('Count of Comment') plt.title('Sentiment Analysis') plt.show() #-------------------------------------------------------------------------------------------------------------- '''from sklearn.linear_model import LogisticRegression lreg = LogisticRegression(solver='lbfgs',max_iter=200) #lreg.fit(xtrain_bow, ytrain) # training the model with bow lreg.fit(xtrain_tfidf, ytrain) prediction = lreg.predict_proba(xvalid_bow) # predicting on the validation set prediction_int = prediction[:,1] >= 0.3 # if prediction is greater than or equal to 0.3 than 1 else 0 prediction_int = prediction_int.astype(np.int)''' 
#----------------------------------------------------------------------------------------------------------------- '''from sklearn.ensemble import RandomForestClassifier rf = RandomForestClassifier(n_estimators=400, random_state=11).fit(xtrain_bow, ytrain) prediction_int = rf.predict(xvalid_bow)''' #----------------------------------------------------------------------------------------------------------------- print( "--------------------------------------------Results--------------------------------------------" ) print() print(" F1 Score = ", f1_score(yvalid, prediction_int)) # calculating f1 score print() print(" Confusion Matrix of Model") print() print(confusion_matrix(yvalid, prediction_int)) print() confusion_mat = confusion_matrix(yvalid, prediction_int) class_names = ['Positive', 'Negative'] fig, ax = plot_confusion_matrix(conf_mat=confusion_mat, class_names=class_names) plt.show() print("--------Classification Report--------------") print() print(classification_report(yvalid, prediction_int)) y_pred_prob = svc.predict_proba(xvalid_bow)[:, 1] fpr, tpr, thresholds = roc_curve(yvalid, y_pred_prob) # create plot plt.plot(fpr, tpr, label='ROC curve') plt.plot([0, 1], [0, 1], 'k--', label='Random guess') _ = plt.xlabel('False Positive Rate') _ = plt.ylabel('True Positive Rate') _ = plt.title('ROC Curve') _ = plt.xlim([-0.02, 1]) _ = plt.ylim([0, 1.02]) _ = plt.legend(loc="lower right") plt.show() print() print(" ROC_AUC_SCORE = ", roc_auc_score(yvalid, y_pred_prob)) precision, recall, thresholds = precision_recall_curve(yvalid, y_pred_prob) # create plot plt.plot(precision, recall, label='Precision-recall cuisnull()rve') _ = plt.xlabel('Precision') _ = plt.ylabel('Recall') _ = plt.title('Precision-recall curve') _ = plt.legend(loc="lower left") plt.show() print() print(" Average_Precision_Score = ", average_precision_score(yvalid, y_pred_prob)) acc_score = accuracy_score(yvalid, prediction_int) print() print(" Accuracy score = ", acc_score) '''from 
mlxtend.plotting import plot_decision_regions print("X TYPE : ",type(xtrain_bow)) print("Y TYPE : ",type(ytrain)) arr_x = xtrain_bow.toarray() arr_x arr_y = ytrain.to_numpy() plot_decision_regions(arr_x, arr_y, clf=svc, legend=2) plt.xlabel('Positive') plt.ylabel('Negative') plt.title('SVM on Iris') plt.show()''' '''import eli5
# Cross-validate each base classifier on the PCA-reduced features
# (classifiers/label/x_pca/train_label defined earlier in the file).
for clf, label in zip(classifiers, label):
    scores = cross_val_score(clf, x_pca, train_label, cv=3, scoring='accuracy')
    print("Accuracy: %.2f (+/- %.2f) [%s]" % (scores.mean(), scores.std(), label))

"""###Random Forest Classification

Above we have seen all the cross validation scores for single classifiers and using bagging as an ensemble method
"""

# Now lets see the effect of max_samples meaning the effect of subsampling the data
bags = [bagging_dt, bagging_knn, bagging_lr]
x_train0, x_test0, y_train0, y_test0 = train_test_split(
    x_pca, train_label, test_size=0.3, random_state=7)
for b in bags:
    plt.figure()
    # BUG FIX: original referenced the undefined name ``X_test0`` (capital X);
    # the split above binds ``x_test0``.
    plot_learning_curves(x_train0, y_train0, x_test0, y_test0, b,
                         print_model=False, style='ggplot')
    plt.show()

"""Tables are for 'Bagging Tree', 'Bagging K-NN' and 'Bagging Logistic Regression' respectively. As we can see in all tables choosing ~80% data as training data we achieve the best ensemble models.
"""

from sklearn.model_selection import GridSearchCV

param_grid = {
    'bootstrap': [True],
    'max_depth': [10, 20, 30, 50],
    'max_features': [0.8, 0.9],
    'n_estimators': [10, 20, 50, 100],
}
rf = RandomForestClassifier()
cv=5, scoring=scoring, return_train_score=False, n_jobs=2) for key in metrics.keys(): for fold_index, score in enumerate(metrics[key]): cv_result_entries.append( (model_namelist[i], fold_index, key, score)) i += 1 cv_results_df = pandas.DataFrame(cv_result_entries) # %% [markdown] # ### Misclassification Errors i = 0 for model in models: plot_learning_curves(x_train_val, y_train, x_test, y_test, model) plt.title('Learning Curve for ' + model_namelist[i], fontsize=14) plt.xlabel('Training Set Size (%)', fontsize=12) plt.ylabel('Misclassification Error', fontsize=12) plt.show() i += 1 # %% [markdown] # ### Get predictions: prep for Confusion Matrix y_test_pred = [] for model in models: y_test_pred.append(model.predict(x_test)) # %% [markdown] # ### Confusion Matrix from sklearn.metrics import confusion_matrix
} clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid, cv=5) clf = clf.fit(X_train_pca, Y_train) y_pred = clf.predict(X_test_pca) print(classification_report(Y_test, y_pred)) print(confusion_matrix(Y_test, y_pred, labels=['0', '1'])) # Plot the learning curves plt.figure(figsize=(20, 10)) plot_learning_curves(X_train_pca, Y_train, X_test_pca, Y_test, clf, scoring="accuracy") plt.title("Learning Curves") plt.show() # plot the result of the prediction on a portion of the test set def plot_gallery(images, titles, h, w, n_row=3, n_col=4): """Helper function to plot a gallery of portraits""" plt.figure(figsize=(1.8 * n_col, 2.4 * n_row)) plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35) for i in range(n_row * n_col): plt.subplot(n_row, n_col, i + 1) plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
def rff(ne, md, of):
    """Train and evaluate a RandomForest on the Kaggle voice dataset.

    Parameters:
        ne: number of estimators.
        md: maximum tree depth.
        of: truthy to drop a fixed list of weaker acoustic features.

    Fixes vs. original: targeted ``except ValueError`` instead of bare
    ``except`` (original semantics kept: first missing feature aborts all
    removals and prints a blank line); ``insert(len(...), i)`` replaced with
    ``append``; unused ``test_total`` removed; the string literal split by
    the paste mangling is rejoined.
    """
    all_data = pd.read_csv("../input/voice.csv")
    label_encoder = LabelEncoder()
    all_data["label"] = label_encoder.fit_transform(all_data["label"])
    # NOTE(review): result unused, but the call advances the global NumPy RNG
    # state, which get_test_indices/get_train_indices may depend on — kept.
    rand_indices = np.random.permutation(len(all_data))
    features = [feat for feat in all_data.columns if feat != "label"]
    if of:
        try:
            for feat in ('modindx', 'dfrange', 'maxdom', 'mindom', 'meandom',
                         'maxfun', 'minfun', 'mode', 'kurt', 'skew', 'Q75'):
                features.remove(feat)
        except ValueError:
            print()
    print(features)
    output = "label"
    num_datapoints = len(all_data)
    test_set_indices = get_test_indices(num_datapoints)
    test_indices = []
    valid_indices = []
    # first half of the held-out indices -> test, second half -> validation
    for i in test_set_indices:
        if len(test_indices) < len(test_set_indices) * 0.5:
            test_indices.append(i)
        else:
            valid_indices.append(i)
    train_indices = get_train_indices(num_datapoints)
    test_data = all_data[features].iloc[test_indices]
    valid_data = all_data[features].iloc[valid_indices]
    train_data = all_data[features].iloc[train_indices]
    test_labels = all_data[output].iloc[test_indices]
    valid_labels = all_data[output].iloc[valid_indices]
    train_labels = all_data[output].iloc[train_indices]
    print(num_datapoints, len(train_data), len(test_data))
    print(features)
    # print(test_labels)
    rf = RandomForestClassifier(n_estimators=ne, max_depth=md)
    rf.fit(train_data, train_labels)
    print('Number of Estimators: ' + str(ne) + '\t Max. Depth: ' + str(md))
    predictions = rf.predict(valid_data)
    print('---------Validation Data-----------')
    print('Accuracy Score:')
    print(accuracy_score(valid_labels, predictions))
    print('Precision Score:')
    print(precision_score(valid_labels, predictions))
    print('Recall Score:')
    print(recall_score(valid_labels, predictions))
    print('Confusion Matrix:')
    print(confusion_matrix(valid_labels, predictions))
    plot_learning_curves(train_data, train_labels, valid_data, valid_labels, rf)
    plot_1 = plt
    predictions = rf.predict(test_data)
    print('---------Test Data-----------')
    print('Accuracy Score:')
    print(accuracy_score(test_labels, predictions))
    print('Precision Score:')
    print(precision_score(test_labels, predictions))
    print('Recall Score:')
    print(recall_score(test_labels, predictions))
    print('Confusion Matrix:')
    print(confusion_matrix(test_labels, predictions))
    plot_learning_curves(train_data, train_labels, test_data, test_labels, rf)
    plot_1.show()
    plt.show()
# new_data_train.isnull().sum().sort_values(ascending=False).head(10)

# fill missing values with the column mean
new_data_train['Age'].fillna(new_data_train['Age'].mean(), inplace=True)
new_data_test['Fare'].fillna(new_data_test['Fare'].mean(), inplace=True)

# define the feature matrix and the label vector
X = new_data_train.drop('Survived', axis=1)
y = new_data_train['Survived']

# split the data into a training and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(y_test)

# build the model
tree = DecisionTreeClassifier(max_depth=3, random_state=0)
tree.fit(X_train, y_train)
plot_learning_curves(X_train, y_train, X_test, y_test, clf=tree)
plt.title("DecisionTreeClassifier %.2f " % (float(tree.score(X_test, y_test))))
plt.show()

# check the score on the training set
# print('Score : ', tree.score(X,y))
# print('Score : ', clf_rf.score(X_test,y_test))

# NOTE(review): the submission is built from the held-out X_test slice, and
# PassengerId is among the training features — confirm this is intended.
submission = pd.DataFrame()
submission['PassengerId'] = X_test['PassengerId']
submission['Survived'] = tree.predict(X_test)
submission.to_csv('prediction/Titanic/submission.csv', index=False)
def lr(c, of):
    """Train and evaluate a LogisticRegression (inverse regularization ``c``)
    on the module-level ``all_data`` voice dataset.

    Fixes vs. original: targeted ``except ValueError`` instead of bare
    ``except`` (original semantics kept: first missing feature aborts all
    removals and prints a blank line); ``insert(len(...), i)`` replaced with
    ``append``; unused ``test_total`` removed.
    """
    features = [feat for feat in all_data.columns if feat != "label"]
    if of:
        try:
            for feat in ('modindx', 'dfrange', 'maxdom', 'mindom', 'meandom',
                         'maxfun', 'minfun', 'mode', 'kurt', 'skew', 'Q75'):
                features.remove(feat)
        except ValueError:
            print()
    print(features)
    output = "label"
    num_datapoints = len(all_data)
    test_set_indices = get_test_indices(num_datapoints)
    test_indices = []
    valid_indices = []
    # first half of the held-out indices -> test, second half -> validation
    for i in test_set_indices:
        if len(test_indices) < len(test_set_indices) * 0.5:
            test_indices.append(i)
        else:
            valid_indices.append(i)
    train_indices = get_train_indices(num_datapoints)
    test_data = all_data[features].iloc[test_indices]
    valid_data = all_data[features].iloc[valid_indices]
    train_data = all_data[features].iloc[train_indices]
    test_labels = all_data[output].iloc[test_indices]
    valid_labels = all_data[output].iloc[valid_indices]
    train_labels = all_data[output].iloc[train_indices]
    logistic = linear_model.LogisticRegression(C=c)
    logistic.fit(train_data, train_labels)
    predictions = logistic.predict(valid_data)
    print('--------Validation Data----------')
    print('-------' + str(c) + '-------')
    print('Accuracy Score:')
    print(accuracy_score(valid_labels, predictions))
    print('Precision Score:')
    print(precision_score(valid_labels, predictions))
    print('Recall Score:')
    print(recall_score(valid_labels, predictions))
    print('Confusion Matrix:')
    print(confusion_matrix(valid_labels, predictions))
    plot_learning_curves(train_data, train_labels, valid_data, valid_labels, logistic)
    plot_1 = plt
    predictions = logistic.predict(test_data)
    print('--------Testing Data----------')
    print('-------' + str(c) + '-------')
    print('Accuracy Score:')
    print(accuracy_score(test_labels, predictions))
    print('Recall Score:')
    print(recall_score(test_labels, predictions))
    print('Precision Score:')
    print(precision_score(test_labels, predictions))
    print('Confusion Matrix:')
    print(confusion_matrix(test_labels, predictions))
    plot_learning_curves(train_data, train_labels, test_data, test_labels, logistic)
    plot_1.show()
    plt.show()
scoring=scoring, return_train_score=False, n_jobs=-1) for key in metrics.keys(): for fold_index, score in enumerate(metrics[key]): cv_result_entries.append( (model_namelist[i], fold_index, key, score)) i += 1 cv_results_df = pd.DataFrame(cv_result_entries) # %% # ### Misclassification Errors i = 0 for model in models: plt.figure() plot_learning_curves(X, y, X, y, model) plt.title('Learning Curve for ' + model_namelist[i], fontsize=14) plt.xlabel('Training Set Size (%)', fontsize=12) plt.ylabel('Misclassification Error', fontsize=12) plt.show() i += 1 # %% [markdown] # ### Get predictions: prep for Confusion Matrix y_test_pred = [] for model in models: y_test_pred.append(model.predict(X)) # %% [markdown] # ### Confusion Matrix from sklearn.metrics import confusion_matrix
# Fit a linear SVM with probability estimates on the bag-of-words features
# (xtrain_bow/ytrain/xvalid_bow/yvalid defined earlier in the file).
svc = svm.SVC(kernel='linear', C=1, probability=True,
              decision_function_shape='ovo').fit(xtrain_bow, ytrain)
print(svc)
print("\n\n")
text3 = colored("6. Displaying the learning Curve ", 'red', attrs=['bold'])
print(text3)
print("\n\n")
from mlxtend.plotting import plot_learning_curves
plot_learning_curves(xtrain_bow, ytrain, xvalid_bow, yvalid, svc)
plt.show()
prediction = svc.predict_proba(xvalid_bow)
# threshold the positive-class probability at 0.3
prediction_int = prediction[:, 1] >= 0.3
# BUG FIX: np.int was a deprecated alias for the builtin int and was removed
# in NumPy 1.24; using int directly is byte-for-byte equivalent.
prediction_int = prediction_int.astype(int)
print("\n\n")
text3 = colored("7. Displaying the Predicted Sentiment Analysis ", 'red', attrs=['bold'])
print(text3)
f = input()  # pause until the user presses Enter
print("\n\n")
positive_comments = []
negative_comments = []