def cross_validation_example():
    """
    Slightly more complex example: perform a grid search with cross-validation to find
    optimal parameters for MinCq using RBF kernels as voters.
    """
    # We load the iris dataset, convert the labels to -1 or 1, and split it into train and test sets.
    dataset = load_iris()
    dataset.target[dataset.target == 0] = -1
    dataset.target[dataset.target == 2] = -1
    X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, random_state=42)

    # The learning algorithm and its parameter grid.
    learner = MinCqLearner(mu=0.0001, voters_type='kernel', kernel='rbf', gamma=0.0)
    learner_params = {'mu': [0.0001, 0.001, 0.01], 'gamma': [0.0, 0.1, 1.0, 10]}

    cv_classifier = GridSearchCV(learner, learner_params, scoring=accuracy_scorer)
    cv_classifier = cv_classifier.fit(X_train, y_train)

    predictions_train = cv_classifier.predict(X_train)
    predictions_test = cv_classifier.predict(X_test)

    print_sklearn_grid_scores("Iris", "RbfMinCq", learner_params, cv_classifier.grid_scores_)
    print("Best parameters: {}".format(str(cv_classifier.best_params_)))
    print("Training set risk: {:.4f}".format(zero_one_loss(y_train, predictions_train)))
    print("Testing set risk: {:.4f}".format(zero_one_loss(y_test, predictions_test)))
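# GridSearchCV.grid_scores_ was removed from scikit-learn in 0.20; on a recent release the same
# search is read back through cv_results_ and best_params_. A minimal sketch of that variant,
# assuming the same MinCqLearner estimator and the built-in 'accuracy' scorer in place of the
# accuracy_scorer helper used above.
def cross_validation_example_recent_sklearn(X_train, y_train):
    from sklearn.model_selection import GridSearchCV

    learner = MinCqLearner(mu=0.0001, voters_type='kernel', kernel='rbf', gamma=0.0)
    learner_params = {'mu': [0.0001, 0.001, 0.01], 'gamma': [0.0, 0.1, 1.0, 10]}

    cv_classifier = GridSearchCV(learner, learner_params, scoring='accuracy', cv=5)
    cv_classifier.fit(X_train, y_train)

    # cv_results_ replaces the removed grid_scores_ attribute.
    for params, score in zip(cv_classifier.cv_results_['params'],
                             cv_classifier.cv_results_['mean_test_score']):
        print("{} -> mean CV accuracy {:.4f}".format(params, score))
    print("Best parameters: {}".format(cv_classifier.best_params_))
    return cv_classifier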
def drawLearningCurve(model, x_train, y_train, x_test, y_test, num_points=50):
    # adapted from http://sachithdhanushka.blogspot.de/2013/09/learning-curve-generator-for-learning.html
    train_error = np.zeros(num_points)
    crossval_error = np.zeros(num_points)

    sizes = np.linspace(2, len(x_train), num=num_points).astype(int)
    for i, size in enumerate(sizes):
        # fit the model on the first `size` training examples
        model.fit(x_train[:size], y_train[:size])
        # validation error, measured on the first `size` test examples
        y_pred = model.predict(x_test[:size])
        crossval_error[i] = zero_one_loss(y_test[:size], y_pred, normalize=True)
        # training error, measured on the same training examples
        y_pred = model.predict(x_train[:size])
        train_error[i] = zero_one_loss(y_train[:size], y_pred, normalize=True)

    # draw the plot
    print(crossval_error)
    print(train_error)
    fig, ax = plt.subplots()
    ax.plot(sizes, crossval_error, lw=2, label='cross validation error')
    ax.plot(sizes, train_error, lw=4, label='training error')
    ax.set_xlabel('training set size')
    ax.set_ylabel('zero-one error')
    ax.legend(loc=0)
    ax.set_title('Learning Curve')
    return fig
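# Note: scikit-learn ships a learning-curve utility that scores each training-size fit on full
# cross-validation folds rather than on a truncated slice of the test set. A minimal sketch,
# using a decision tree and the iris data purely for illustration:
def sketch_sklearn_learning_curve():
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.datasets import load_iris
    from sklearn.model_selection import learning_curve
    from sklearn.tree import DecisionTreeClassifier

    X, y = load_iris(return_X_y=True)
    sizes, train_scores, val_scores = learning_curve(
        DecisionTreeClassifier(random_state=0), X, y,
        train_sizes=np.linspace(0.1, 1.0, 10), cv=5)

    fig, ax = plt.subplots()
    ax.plot(sizes, 1 - train_scores.mean(axis=1), label='training error')
    ax.plot(sizes, 1 - val_scores.mean(axis=1), label='cross validation error')
    ax.set_xlabel('training set size')
    ax.set_ylabel('zero-one error')
    ax.legend(loc=0)
    return fig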
def plot_adaclassifier(classifier, n_estimators, X_train, X_test, y_train, y_test): fig = plt.figure() ax = fig.add_subplot(111) #ax.plot([1, n_estimators], [dt_stump_err] * 2, 'k-', # label='Decision Stump Error') #ax.plot([1, n_estimators], [dt_err] * 2, 'k--', # label='Decision Tree Error') ada_err_test = np.zeros((n_estimators,)) for i, y_pred in enumerate(classifier.staged_predict(X_test)): ada_err_test[i] = zero_one_loss(y_pred, y_test) ada_err_train = np.zeros((n_estimators,)) for i, y_pred in enumerate(classifier.staged_predict(X_train)): ada_err_train[i] = zero_one_loss(y_pred, y_train) ax.plot(np.arange(n_estimators) + 1, ada_err_test, label='AdaBoost Test Error', color='red') ax.plot(np.arange(n_estimators) + 1, ada_err_train, label='AdaBoost Train Error', color='blue') ax.set_ylim((0.0, 1.0)) ax.set_xlabel('n_estimators') ax.set_ylabel('error rate') leg = ax.legend(loc='upper right', fancybox=True) leg.get_frame().set_alpha(0.7) return fig
def test_grid(features, target):
    '''
    Given a list of models for each genre, run the features through the models to predict
    target labels, and compare the predictions to the true target labels.
    '''
    genre_list = ['animated', 'action', 'comedy', 'drama', 'family', 'fantasy',
                  'horror', 'musical', 'mystery', 'romance', 'sci-fi', 'thriller',
                  'war', 'western']
    ypred_mat = np.empty([target.shape[0], target.shape[1]])
    for i in range(target.shape[1]):
        # one pickled model per genre, each scored against its own target column
        filename = '../data/is_' + genre_list[i] + '.pkl'
        ypred_mat[:, i] = test_prediction(filename, features, target[:, i])

    with open('../data/grid_pkl_500.txt', 'w') as f:
        for threshold in (.25, .3, .2, .1):
            f.write("\nModel rounded by {}\n".format(threshold))
            yrd = round_by(ypred_mat, threshold)
            f.write(metrics.classification_report(target, yrd))
            f.write("Percent of misclassification: {}\n".format(metrics.zero_one_loss(target, yrd)))
def run_gamma(x, y): perc = 0.6 n = x.shape[0] gamma_list = (np.power(2.0, range(-4, 12))/(n*perc)).tolist() n_iter = 2 train_err_libsvm = np.zeros((len(gamma_list), n_iter)) test_err_libsvm = np.zeros((len(gamma_list), n_iter)) train_err_dsvm = np.zeros((len(gamma_list), n_iter)) test_err_dsvm = np.zeros((len(gamma_list), n_iter)) train_err_pegasos = np.zeros((len(gamma_list), n_iter)) test_err_pegasos = np.zeros((len(gamma_list), n_iter)) ss = cv.StratifiedShuffleSplit(y, n_iter=n_iter, test_size=1-perc, train_size=None, random_state=0) for k, (train, test) in enumerate(ss): ntr = len(train) lmda = 1.0 / ntr print "#iter: %d" % k x_train, x_test, y_train, y_test = x[train], x[test], y[train], y[test] mM_scale = preprocessing.MinMaxScaler(feature_range=(-1, 1)) x_train = mM_scale.fit_transform(x_train) x_test = mM_scale.transform(x_test) for j, gm in enumerate(gamma_list): print "check lamda %f, gamma %f" % (lmda, gm) clf = svm.SVC(C=lmda * ntr, kernel='rbf', gamma=gm, cache_size=600) clf.fit(x_train, y_train) pred = clf.predict(x_train) train_err_libsvm[j, k] = zero_one_loss(y_train, pred) pred = clf.predict(x_test) test_err_libsvm[j, k] = zero_one_loss(y_test, pred) dsvm = DualKSVM(lmda=lmda, gm=gm, kernelstr='rbf', nsweep=ntr/2, b=5, c=1) dsvm.fit(x_train, y_train, x_test, y_test, ) train_err_dsvm[j, k] = dsvm.err_tr[-1] test_err_dsvm[j, k] = dsvm.err_te[-1] kpega = Pegasos(ntr, lmda, gm, nsweep=2, batchsize=2) kpega.train_test(x_train, y_train, x_test, y_test) train_err_pegasos[j, k] = kpega.err_tr[-1] test_err_pegasos[j, k] = kpega.err_te[-1] avg_train_err_libsvm = np.mean(train_err_libsvm, axis=1) avg_test_err_libsvm = np.mean(test_err_libsvm, axis=1) avg_train_err_dsvm = np.mean(train_err_dsvm, axis=1) avg_test_err_dsvm = np.mean(test_err_dsvm, axis=1) avg_train_err_pegasos = np.mean(train_err_pegasos, axis=1) avg_test_err_pegasos = np.mean(test_err_pegasos, axis=1) plt.figure() # color_list = ['b', 'r', 'g', 'c', ] # marker_list = ['o', 'x', '>', 's'] plt.loglog(gamma_list, avg_train_err_libsvm, 'bo-', label='libsvm train') plt.loglog(gamma_list, avg_test_err_libsvm, 'ro-', label='libsvm test') plt.loglog(gamma_list, avg_train_err_dsvm, 'gx-', label='dsvm train') plt.loglog(gamma_list, avg_test_err_dsvm, 'cx-', label='dsvm test') plt.loglog(gamma_list, avg_train_err_pegasos, 'mD-', label='pegasos train') plt.loglog(gamma_list, avg_test_err_pegasos, 'kD-', label='pegasos test') plt.legend(bbox_to_anchor=(0, 1.17, 1, .1), loc=2, ncol=2, mode="expand", borderaxespad=0) plt.savefig('../output/usps_diff_gamma.pdf')
def build_tree(clf, type, i, X_train, X_test, y_train, y_test, attribute_names, class_names):
    print("------------Run " + type + "_" + str(i) + "----------")
    clf.fit(X_train, y_train)
    print("Training error =", zero_one_loss(y_train, clf.predict(X_train)))
    predicted_test = clf.predict(X_test)
    print("Test error =", zero_one_loss(y_test, predicted_test))
    figure_name = type + "_" + str(i)
    visualize_tree(clf, attribute_names, class_names, figure_name)
    print(classification_report(y_test, predicted_test))
    print(confusion_matrix(y_test, predicted_test))
    return zero_one_loss(y_test, predicted_test)
def simple_classification_example():
    """
    Simple example: with fixed hyperparameters, run four versions of MinCq on a single dataset.
    """
    # MinCq parameter, fixed to a given value as this is a simple example.
    mu = 0.001

    # We load the iris dataset, convert the labels to -1 or 1, and split it into train and test sets.
    dataset = load_iris()
    dataset.target[dataset.target == 0] = -1
    dataset.target[dataset.target == 2] = -1
    X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, random_state=42)

    # Four MinCq variants: decision stumps, then linear, polynomial and RBF kernel voters.
    learners = [("StumpsMinCq", MinCqLearner(mu, voters_type='stumps')),
                ("LinearMinCq", MinCqLearner(mu, voters_type='kernel', kernel='linear')),
                ("PolyMinCq", MinCqLearner(mu, voters_type='kernel', kernel='poly')),
                ("RbfMinCq", MinCqLearner(mu, voters_type='kernel', kernel='rbf', gamma=0.0))]

    for name, learner in learners:
        # We train on the training set, then predict and print the train and test risks.
        learner.fit(X_train, y_train)
        predictions_train = learner.predict(X_train)
        predictions_test = learner.predict(X_test)

        print("\n{}".format(name))
        print("-" * len(name))
        print("Training set risk: {:.4f}".format(zero_one_loss(y_train, predictions_train)))
        print("Testing set risk: {:.4f}\n".format(zero_one_loss(y_test, predictions_test)))
def test_losses():
    """Test loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)
    n_samples = y_true.shape[0]
    n_classes = np.size(unique_labels(y_true))

    # Classification
    # --------------
    with warnings.catch_warnings(record=True):
        # Throw deprecated warning
        assert_equal(zero_one(y_true, y_pred), 11)

    assert_almost_equal(zero_one_loss(y_true, y_pred),
                        11 / float(n_samples), 2)
    assert_equal(zero_one_loss(y_true, y_pred, normalize=False), 11)
    assert_almost_equal(zero_one_loss(y_true, y_true), 0.0, 2)
    assert_almost_equal(hamming_loss(y_true, y_pred),
                        2 * 11. / (n_samples * n_classes), 2)

    assert_equal(accuracy_score(y_true, y_pred),
                 1 - zero_one_loss(y_true, y_pred))

    with warnings.catch_warnings(record=True):
        # Throw deprecated warning
        assert_equal(zero_one_score(y_true, y_pred),
                     1 - zero_one_loss(y_true, y_pred))

    # Regression
    # ----------
    assert_almost_equal(mean_squared_error(y_true, y_pred),
                        10.999 / n_samples, 2)
    assert_almost_equal(mean_squared_error(y_true, y_true), 0.00, 2)

    # mean_absolute_error and mean_squared_error are equal because
    # it is a binary problem.
    assert_almost_equal(mean_absolute_error(y_true, y_pred),
                        10.999 / n_samples, 2)
    assert_almost_equal(mean_absolute_error(y_true, y_true), 0.00, 2)

    assert_almost_equal(explained_variance_score(y_true, y_pred), 0.16, 2)
    assert_almost_equal(explained_variance_score(y_true, y_true), 1.00, 2)
    assert_equal(explained_variance_score([0, 0, 0], [0, 1, 1]), 0.0)

    assert_almost_equal(r2_score(y_true, y_pred), 0.12, 2)
    assert_almost_equal(r2_score(y_true, y_true), 1.00, 2)
    assert_equal(r2_score([0, 0, 0], [0, 0, 0]), 1.0)
    assert_equal(r2_score([0, 0, 0], [0, 1, 1]), 0.0)
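# For reference, the normalize flag switches zero_one_loss between a fraction and a raw count,
# and the normalized value is exactly 1 - accuracy_score:
from sklearn.metrics import accuracy_score, zero_one_loss

y_true, y_pred = [1, 1, 0, 0], [1, 0, 0, 0]
assert zero_one_loss(y_true, y_pred) == 0.25                # fraction misclassified
assert zero_one_loss(y_true, y_pred, normalize=False) == 1  # count misclassified
assert accuracy_score(y_true, y_pred) == 1 - zero_one_loss(y_true, y_pred)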
def lr(X_train, y_train, X_test, y_test): # Tune the hyperparameter maxScore = float("-inf") maxC = 0 for c in np.arange(0.1, 1, 0.1): clf = LogisticRegression(penalty="l2", C=c).fit(X_train, y_train) scores = cross_val_score(clf, X_train, y_train, cv=5) mean = np.mean(scores) print("C: %f and Score: %f" % (c, mean)) if mean > maxScore: maxScore = mean maxC = c # Train the model print("MaxC: %f" % maxC) print("MaxScore: %f" % maxScore) clf = LogisticRegression(penalty="l2", C=maxC).fit(X_train, y_train) # Predict labels for the test data pred = clf.predict(X_test) pred_prob = clf.predict_proba(X_test) # Calculate the misclassification rate mc_rate = zero_one_loss(y_test, pred) print("MC rate: %f" % mc_rate) # Calculate the ROC curve prob = pred_prob[:, 1:] roc_score = roc_auc_score(y_test, prob) print("ROC score: %f" % roc_score) return (mc_rate, roc_score)
def train(min_samples_leaf, max_depth, dataset):
    mlflow.log_param("min_samples_leaf", min_samples_leaf)
    mlflow.log_param("max_depth", max_depth)
    clf = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf, max_depth=max_depth)
    print("Classifier:", clf)
    clf.fit(dataset.data, dataset.target)
    expected = dataset.target
    predicted = clf.predict(dataset.data)
    mlflow.sklearn.log_model(clf, "model")
    write_artifact('confusion_matrix.txt', str(metrics.confusion_matrix(expected, predicted)))
    write_artifact('classification_report.txt', metrics.classification_report(expected, predicted))
    # roc_auc_score works directly on labels/scores; metrics.auc expects curve coordinates such as (fpr, tpr)
    auc = metrics.roc_auc_score(expected, predicted)
    accuracy_score = metrics.accuracy_score(expected, predicted)
    zero_one_loss = metrics.zero_one_loss(expected, predicted)
    mlflow.log_metric("auc", auc)
    mlflow.log_metric("accuracy_score", accuracy_score)
    mlflow.log_metric("zero_one_loss", zero_one_loss)
    print("Params: min_samples_leaf={} max_depth={}".format(min_samples_leaf, max_depth))
    print("Metrics: auc={} accuracy_score={} zero_one_loss={}".format(auc, accuracy_score, zero_one_loss))
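# Hypothetical driver for the MLflow-instrumented train() above; the run context, dataset
# choice and hyperparameters are illustrative assumptions, not part of the original module
# (a binary dataset is used so the AUC metric is well defined).
import mlflow
from sklearn.datasets import load_breast_cancer

if __name__ == "__main__":
    with mlflow.start_run():
        train(min_samples_leaf=1, max_depth=3, dataset=load_breast_cancer())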
def experiment_neighbors_k_nearest_neighbors(): avgError = [] x_learners = [] for k_neighbors in range(1, 20, 1): k = 10 skf = StratifiedKFold(labels,n_folds=k) averageError = 0.0 for train_index, test_index in skf: X_train, X_test = mfcc[:,train_index], mfcc[:,test_index] y_train, y_test = labels[train_index], labels[test_index] knc = KNeighborsClassifier(n_neighbors=k_neighbors, weights='distance') knc.fit(X_train.T,y_train) y_pred = knc.predict(X_test.T) error = zero_one_loss(y_pred,y_test) print error averageError += (1./k) * error print "Average error: %4.2f%s" % (100 * averageError,'%') avgError.append(averageError) x_learners.append(k_neighbors) plt.plot(x_learners, avgError) plt.ylabel('Average Error (k=10)') plt.xlabel('Number of Neighbors') plt.title('Error as a function of the number of neighbors taken into consideration') plt.show()
def experiment_pca_n_components_random_forest(): pca = decomposition.PCA() rf = RandomForestClassifier(n_estimators=maxLearners, max_depth = maxDepth, warm_start = False) pipe = Pipeline(steps=[('pca', pca), ('rf', rf)]) avgError = [] x_learners = [] for k_components in range(10, 100, 10): k = 10 skf = StratifiedKFold(labels,n_folds=k) averageError = 0.0 for train_index, test_index in skf: X_train, X_test = mfcc[:,train_index], mfcc[:,test_index] y_train, y_test = labels[train_index], labels[test_index] estimator = GridSearchCV(pipe, dict(pca__n_components=[k_components])) estimator.fit(X_train.T,y_train) y_pred = estimator.predict(X_test.T) error = zero_one_loss(y_pred,y_test) print error averageError += (1./k) * error print "Average error: %4.2f%s" % (100 * averageError,'%') avgError.append(averageError) x_learners.append(k_components) plt.plot(x_learners, avgError) plt.ylabel('Average Error (k=10)') plt.xlabel('Number of Components') plt.title('Error as a function of the number of components') plt.show()
def classify_by_KNeighbors(train_x, train_Y, test_x, test_Y, colors, category): classifier = KNeighborsClassifier(n_neighbors=3, n_jobs=-1) classifier.fit(train_x, train_Y) pred_y = classifier.predict(test_x) results = confusion_matrix(test_Y, pred_y) error = zero_one_loss(test_Y, pred_y) accuracy = metrics.accuracy_score(test_Y, pred_y) classification = metrics.classification_report(test_Y, pred_y) pca = PCA(n_components=2) train_x_pca_cont = pca.fit_transform(test_x) plt.figure(figsize=(15, 10)) for color, cat in zip(colors, category.keys()): plt.scatter(train_x_pca_cont[pred_y == cat, 0], train_x_pca_cont[pred_y == cat, 1], color=color, alpha=.8, lw=2, label=cat) plt.legend(loc='best', shadow=False, scatterpoints=1) plt.title(" KNeibors result visualization") plt.show() plt.figure(figsize=(10, 7)) sn.heatmap(results, annot=True, fmt='d') plt.title("KNeighbors confusion matrix: \n") print("KNeighbors confusion matrix: ", results) print("Error: ", error * 100, '%') print("Accuracy: ", accuracy * 100, "%") print("Classification report:" "\n", classification) return pred_y
def experiment_estimators_AdaBoostRandomForest(): avgError = [] x_learners = [] rf = RandomForestClassifier(n_estimators=maxLearners, max_depth = maxDepth, warm_start = False) for lr in frange(0.01, 1., 0.25): k = 10 skf = StratifiedKFold(labels,n_folds=k) averageError = 0.0 for train_index, test_index in skf: X_train, X_test = mfcc[:,train_index], mfcc[:,test_index] y_train, y_test = labels[train_index], labels[test_index] adb = AdaBoostClassifier(base_estimator=rf, n_estimators=100, learning_rate=lr) adb.fit(X_train.T,y_train) y_pred = adb.predict(X_test.T) error = zero_one_loss(y_pred,y_test) print error averageError += (1./k) * error print "Average error: %4.2f%s" % (100 * averageError,'%') avgError.append(averageError) x_learners.append(lr) # graph the errors now. plt.plot(x_learners, avgError) plt.ylabel('Average Error (k=10)') plt.xlabel('Learning Rate') plt.title('Error as a function of the learning rate') plt.show()
def evaluate_naivebayes(classifier, test_reviews): # For computing metrics ref_set = collections.defaultdict(set) test_set = collections.defaultdict(set) ref_set_arr = [] test_set_arr = [] # Create gold standard and predicted labels for i, (feat, label) in enumerate(test_reviews): # Predict observed = classifier.classify(feat) ref_set[label].add(i) test_set[observed].add(i) label = 0 if label == "neg" else 1 observed = 0 if observed == "neg" else 1 ref_set_arr.append(label) test_set_arr.append(observed) print('pos precision:', precision(ref_set['pos'], test_set['pos'])) print('pos recall:', recall(ref_set['pos'], test_set['pos'])) print('neg precision:', precision(ref_set['neg'], test_set['neg'])) print('neg recall:', recall(ref_set['neg'], test_set['neg'])) print('misclassification rate', zero_one_loss(ref_set_arr, test_set_arr)) print('most informative features', classifier.show_most_informative_features(10))
def experiment_learners_random_forest(): avgError = [] x_learners = [] for maxLearners in range(10, 150, 20): k = 10 skf = StratifiedKFold(labels,n_folds=k) averageError = 0.0 for train_index, test_index in skf: X_train, X_test = mfcc[:,train_index], mfcc[:,test_index] y_train, y_test = labels[train_index], labels[test_index] rf = RandomForestClassifier(n_estimators=maxLearners, max_depth = maxDepth, warm_start = False) rf.fit(X_train.T,y_train) y_pred = rf.predict(X_test.T) error = zero_one_loss(y_pred,y_test) print error averageError += (1./k) * error print "Average error: %4.2f%s" % (100 * averageError,'%') avgError.append(averageError) x_learners.append(maxLearners) plt.plot(x_learners, avgError) plt.ylabel('Average Error (k=10)') plt.xlabel('Max Learners') plt.title('Error as a function of the number of learners') plt.show()
def perceptron_v0(max_iters, pred_every, X_train, X_test, Y_train, Y_test): acc = [] w = np.zeros([num_digits, X_train.shape[1]]) iters = 0 while iters < max_iters: # Train for i in range(len(Y_train)): iters += 1 for int_class in range(num_digits): Y_mult = 1 if Y_train[i] == int_class else -1 if Y_mult * np.dot(w[int_class], X_train[i]) <= 0: w[int_class] += Y_mult * X_train[i] if iters % pred_every == 0: # Predict Y_pred = np.zeros(Y_test.shape) for k in range(len(Y_test)): preds = np.zeros(num_digits) for int_class in range(num_digits): preds[int_class] = np.dot(w[int_class], X_test[k]) Y_pred[k] = np.argmax(preds) # Test acc.append(zero_one_loss(Y_test, Y_pred)) return acc
def train(self, train_data, tr_lab=None): """ Method that performs training. It compares the clustering labels on training set (i.e., A(X) computed by :class:`reval.relative_validation.RelativeValidation.clust_method`) against the labels obtained from the classification algorithm (i.e., f(X), computed by :class:`reval.relative_validation.RelativeValidation.class_method`). It returns the misclassification error, the supervised model fitted to the data, and both clustering and classification labels. :param train_data: training dataset. :type train_data: ndarray, (n_samples, n_features) :param tr_lab: cluster labels found during CV for clustering methods with no `n_clusters` parameter. If not None the clustering method is not performed on the whole test set. Default None. :type tr_lab: list :return: misclassification error, fitted supervised model object, clustering and classification labels. :rtype: float, object, ndarray (n_samples,) """ if tr_lab is None: clustlab_tr = self.clust_method.fit_predict(train_data) # A_k(X) else: clustlab_tr = tr_lab if len([cl for cl in clustlab_tr if cl >= 0]) == 0: logging.info( f"No clusters found during training with {self.clust_method}.") return None fitclass_tr = self.class_method.fit(train_data, clustlab_tr) classlab_tr = fitclass_tr.predict(train_data) misclass = zero_one_loss(clustlab_tr, classlab_tr) return misclass, fitclass_tr, clustlab_tr
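# Hypothetical usage sketch for the train() method above. The constructor arguments
# (a classifier plus a clustering object) are an assumption about the class interface;
# consult the reval documentation for the exact signature.
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.neighbors import KNeighborsClassifier

X_tr, _ = make_blobs(n_samples=200, centers=3, random_state=0)
relval = RelativeValidation(KNeighborsClassifier(), KMeans(n_clusters=3))
misclass, fitted_classifier, cluster_labels = relval.train(X_tr)
print(misclass)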
def inline(inputfile, outputfile): # data = np.loadtxt(sys.stdin) data = np.loadtxt(inputfile, delimiter=',') if np.ndim(data) == 1: data = np.array([data]) train_x = data[:, 1:] train_y = data[:, 0] candidate_size = 1000 evaluation_size = 1000 x, y = make_classification(n_samples=candidate_size + evaluation_size, n_features=2, n_informative=1, n_redundant=1, n_clusters_per_class=1, random_state=37) eval_x = x[candidate_size:] eval_y = y[candidate_size:] learner = KNeighborsClassifier(n_neighbors=1) learner = learner.fit(train_x, train_y) pred_y = learner.predict(eval_x) with open(outputfile, 'w') as f: l = zero_one_loss(eval_y, pred_y) f.write(str(l))
def __train_with_cross_validation(self): """ This method enables sk-learn algorithms to perform KFold-cross-validation. The method also initiates the cnvrg experiment with all its metrics. """ scores = cross_validate(estimator=self.__model, X=self.__x_train.values, y=self.__y_train.values, cv=self.__cross_val_folds, return_train_score=True, scoring=['neg_mean_squared_error', 'accuracy'], return_estimator=True) train_acc_cv = scores['train_accuracy'] train_err_cv = (-1) * scores['train_neg_mean_squared_error'] val_acc_cv = scores['test_accuracy'] val_err_cv = (-1) * scores['test_neg_mean_squared_error'] self.__model = scores['estimator'][-1] y_pred = self.__model.predict(self.__x_test.values) test_acc = accuracy_score(self.__y_test.values, y_pred) test_loss = zero_one_loss(self.__y_test.values, y_pred) self.__metrics.update({ 'train_acc': train_acc_cv, 'train_loss': train_err_cv, 'train_loss_type': 'MSE', 'validation_acc': val_acc_cv, 'validation_loss': val_err_cv, 'validation_loss_type': 'MSE', 'test_acc': test_acc, 'test_loss': test_loss, 'test_loss_type': 'zero_one_loss' }) self.__plot_all(y_pred)
def test(self, test_data, fit_model): """ Method that compares test set clustering labels (i.e., A(X'), computed by :class:`reval.relative_validation.RelativeValidation.clust_method`) against the (permuted) labels obtained through the classification algorithm fitted to the training set (i.e., f(X'), computed by :class:`reval.relative_validation.RelativeValidation.class_method`). It returns the misclassification error, together with both clustering and classification labels. :param test_data: test dataset. :type test_data: ndarray, (n_samples, n_features) :param fit_model: fitted supervised model. :type fit_model: class :return: misclassification error, clustering and classification labels. :rtype: float, dictionary of ndarrays (n_samples,) """ clustlab_ts = self.clust_method.fit_predict(test_data) # A_k(X') if len([cl for cl in clustlab_ts if cl >= 0]) == 0: logging.info( f"No clusters found during testing with {self.clust_method}") return None classlab_ts = fit_model.predict(test_data) bestperm = kuhn_munkres_algorithm(classlab_ts, clustlab_ts) # array of integers misclass = zero_one_loss(classlab_ts, bestperm) return misclass, bestperm
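# kuhn_munkres_algorithm above permutes the clustering labels so that they agree as much as
# possible with the classifier's labels before the zero-one loss is computed. A toy sketch of
# that idea using scipy's linear_sum_assignment (an illustration of the principle, not the
# package's actual implementation):
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix, zero_one_loss

class_lab = np.array([0, 0, 1, 1, 2, 2])
clust_lab = np.array([2, 2, 0, 0, 1, 1])          # same grouping, permuted label ids

cm = confusion_matrix(class_lab, clust_lab)
row_ind, col_ind = linear_sum_assignment(-cm)     # maximize agreement
mapping = {col: row for row, col in zip(row_ind, col_ind)}
aligned = np.array([mapping[c] for c in clust_lab])
print(zero_one_loss(class_lab, aligned))          # 0.0 once labels are aligned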
def nb(X_train, y_train, X_test, y_test): # Tune the hyperparameter maxScore = float("-inf") maxA = 0 for a in np.arange(0.1, 1, 0.1): clf = MultinomialNB(alpha=a).fit(X_train, y_train) scores = cross_val_score(clf, X_train, y_train, cv=5) mean = np.mean(scores) print("A: %f and Score: %f" % (a, mean)) if mean > maxScore: maxScore = mean maxA = a # Train the model print("MaxA: %f" % maxA) print("MaxScore: %f" % maxScore) clf = MultinomialNB(alpha=maxA).fit(X_train, y_train) # Predict labels for the test data pred = clf.predict(X_test) pred_prob = clf.predict_proba(X_test) # Calculate the misclassification rate mc_rate = zero_one_loss(y_test, pred) print("MC rate: %f" % mc_rate) # Calculate the ROC curve prob = pred_prob[:, 1:] roc_score = roc_auc_score(y_test, prob) print("ROC score: %f" % roc_score) return (mc_rate, roc_score)
def rf(X_train, y_train, X_test, y_test): # Tune the hyperparameter maxScore = float("-inf") maxN = 0 for n in np.arange(100, 600, 100): clf = RandomForestClassifier(n_estimators=n).fit(X_train, y_train) scores = cross_val_score(clf, X_train, y_train, cv=5) mean = np.mean(scores) print("N: %f and Score: %f" % (n, mean)) if mean > maxScore: maxScore = mean maxN = n # Train the model print("MaxN: %f" % maxN) print("MaxScore: %f" % maxScore) clf = RandomForestClassifier(n_estimators=maxN).fit(X_train, y_train) # Predict labels for the test data pred = clf.predict(X_test) pred_prob = clf.predict_proba(X_test) # Calculate the misclassification rate mc_rate = zero_one_loss(y_test, pred) print("MC rate: %f" % mc_rate) # Calculate the ROC curve prob = pred_prob[:, 1:] roc_score = roc_auc_score(y_test, prob) curve = roc_curve(y_test, prob) print("ROC score: %f" % roc_score) return (mc_rate, roc_score, curve)
def cross_valid(h, y, ratio_list):
    """
    Cross-validation to tune the best cap probability for soft-margin boosting.
    """
    print("find optimal ratio")
    n_samples = h.shape[0]
    n_folds = 4
    ntr = n_samples // n_folds
    ratio_list = ratio_list[ratio_list >= 1.0 / ntr]
    kf = cv.KFold(n=n_samples, n_folds=n_folds)
    err_tr = np.zeros((n_folds, len(ratio_list)))
    err_te = np.zeros((n_folds, len(ratio_list)))
    k = 0
    for tr_ind, te_ind in kf:
        print("nfold: %d" % k)
        xtr, ytr, xte, yte = h[tr_ind, :], y[tr_ind], h[te_ind, :], y[te_ind]
        for i, r in enumerate(ratio_list):
            pd = ParaBoost(epsi=0.005, has_dcap=True, ratio=r)
            pd.train(xtr, ytr)
            pred = pd.test_h(xte)
            err_te[k, i] = zero_one_loss(y_true=yte, y_pred=pred)
            err_tr[k, i] = pd.err_tr[-1]
        k += 1
    err_te_avg = np.mean(err_te, axis=0)
    err_tr_avg = np.mean(err_tr, axis=0)
    arg = np.argmin(err_te_avg)
    best_ratio = ratio_list[arg]
    err = err_te_avg[arg]
    return best_ratio
def clf_bias_var(clf, X, y, n_replicas): roc_auc_scorer = get_scorer("roc_auc") # roc_auc_scorer(clf, X_test, y_test) auc_scores = [] error_scores = [] counts = np.zeros(X.shape[0], dtype = np.float64) sum_preds = np.zeros(X.shape[0], dtype = np.float64) for it in xrange(n_replicas): # generate train sets and test sets train_indices = np.random.randint(X.shape[0], size = X.shape[0]) # get test sets in_train = np.unique(train_indices) mask = np.ones(X.shape[0], dtype = np.bool) mask[in_train] = False test_indices = np.arange(X.shape[0])[mask] clf.fit(X[train_indices], y[train_indices]) auc_scores.append(roc_auc_scorer(clf, X[test_indices], y[test_indices])) error_scores.append(zero_one_loss(y[test_indices], clf.predict(X[test_indices]))) preds = clf.predict(X) for index in test_indices: counts[index] += 1 sum_preds[index] += preds[index] test_mask = (counts > 0) # indices of samples that have been tested # print('counts mean: {}'.format(np.mean(counts))) # print('counts standard derivation: {}'.format(np.std(counts))) bias, var = bias_var(y[test_mask], sum_preds[test_mask], counts[test_mask], n_replicas) return auc_scores, error_scores, bias, var
def exercise_1():
    X, y = make_blobs(n_samples=1000, centers=50, n_features=2, random_state=0)
    n_samples = len(X)
    kf = cross_validation.KFold(n_samples, n_folds=10, shuffle=False, random_state=None)

    error_total = np.zeros([49, 1], dtype=float)
    for k in range(1, 50):
        error = []
        clf = KNeighborsClassifier(n_neighbors=k)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.fit(X_train, y_train)
            error.append(zero_one_loss(y_test, clf.predict(X_test)))
        error_total[k-1, 0] = np.array(error).mean()

    x = np.arange(1, 50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, error_total[:, 0], '#009999', marker='o')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K values')
    plt.ylabel('Misclassification Error')
    plt.show()
def exercise_2(): #connect to openml api apikey = 'ca2397ea8a2cdd9707ef39d76576e786' connector = APIConnector(apikey=apikey) dataset = connector.download_dataset(44) X, y, attribute_names = dataset.get_dataset(target=dataset.default_target_attribute, return_attribute_names=True) kf = cross_validation.KFold(len(X), n_folds=10, shuffle=False, random_state=0) error = [] error_mean = [] lst = [int(math.pow(2, i)) for i in range(0, 8)] clf = RandomForestClassifier(oob_score=True, max_features="auto", random_state=0) for i in lst: error_mean = [] for train_index, test_index in kf: X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] clf.set_params(n_estimators=i) clf.fit(X_train, y_train) error_mean.append( zero_one_loss(y_test, clf.predict(X_test)) ) error.append( np.array(error_mean).mean() ) #plot plt.style.use('ggplot') plt.plot(lst, error, '#009999', marker='o') plt.xticks(lst) plt.show()
def test_sample_order_invariance(): y_true, y_pred, _ = make_prediction(binary=True) y_true_shuffle, y_pred_shuffle = shuffle(y_true, y_pred, random_state=0) for metric in [accuracy_score, hamming_loss, zero_one_loss, lambda y1, y2: zero_one_loss(y1, y2, normalize=False), precision_score, recall_score, f1_score, lambda y1, y2: fbeta_score(y1, y2, beta=2), lambda y1, y2: fbeta_score(y1, y2, beta=0.5), matthews_corrcoef, mean_absolute_error, mean_squared_error, explained_variance_score, r2_score]: assert_almost_equal(metric(y_true, y_pred), metric(y_true_shuffle, y_pred_shuffle), err_msg="%s is not sample order invariant" % metric)
def fit(self, data, target):
    no_of_stages = self.no_of_stages
    # No. of samples
    m = data.shape[0]
    weight = numpy.ones(m)
    weight = numpy.float32(weight) / m
    Alpha = numpy.zeros(no_of_stages)
    classifiers = []
    for i in range(no_of_stages):
        # a fresh stump per stage, so earlier stages are not overwritten when refitting
        decision_stump = DecisionTreeClassifier(criterion='gini', splitter='best',
                                                max_depth=1, max_features=1)
        decision_stump = decision_stump.fit(data, target, sample_weight=weight)
        classifiers.append(decision_stump)
        pred = decision_stump.predict(data)
        error = zero_one_loss(target, pred, normalize=True, sample_weight=weight)
        if error > 0.5:
            print('error value is greater than 0.5!')
        beta = error / (1 - error)
        if beta != 0:
            # down-weight correctly classified samples and renormalize
            weight[pred == target] = weight[pred == target] * beta
            weight = weight / weight.sum()
        print(weight)
        if beta > 0:
            alpha = math.log(1 / beta)
        else:
            alpha = 10000  # make alpha extremely large if the decision stump is totally correct
        Alpha[i] = alpha
    self.Alpha = Alpha
    self.classifiers = classifiers
def run_ordinal_regression(X_train, y_train, X_test, y_test, ordinal_regression_model): print("Running ordinal regression with multiclass labels...") ordinal_regression_clf = ordinal_regression_model(alpha=ALPHA, max_iter=MAX_ITER) ordinal_regression_clf.fit(X_train, y_train) y_pred = ordinal_regression_clf.predict(X_train) training_err = metrics.zero_one_loss(y_train, y_pred, normalize=False) print("%.4f = Training accuracy for ordinal regression with multiclass labels" % (float(len(y_train) - training_err) / len(y_train))) y_pred = ordinal_regression_clf.predict(X_test) test_err = metrics.zero_one_loss(y_test, y_pred, normalize=False) print("%.4f = Test accuracy for ordinal regression with multiclass labels" % (float(len(y_test) - test_err) / len(y_test))) return float(len(y_test) - test_err) / len(y_test)
def apply_dbn(files, main_driver=1):
    """
    Applies a DBN for identifying trips which are not from the driver of interest.
    """
    (X_train, Y_train, weight, X, driver_trip_arr) = get_train_data(files, main_driver)
    a = np.empty(shape=[0, 2])

    net = DBN([len(COL), 10, 2],
              learn_rates=0.3,
              learn_rate_decays=0.9,
              epochs=10,
              verbose=0)
    net.fit(X_train, Y_train)
    Y_dbn = net.predict(X_train)
    # training accuracy for this driver
    print(main_driver, ':', 1 - zero_one_loss(Y_train, Y_dbn))

    Y = net.predict(X)
    for i, y in enumerate(Y):
        driver_trip = driver_trip_arr[i][0]
        prob = str(int(Y[i]))
        a = np.append(a, np.array([[driver_trip, prob]]), axis=0)
    print(main_driver, ': ', sum([1 for p in a if p[1] == '1']))
    return a
def compare_manual_vs_model():
    with open(DATA_FOLDER + "labels_int.p", "rb") as f:
        y_dict = pickle.load(f)

    print("Loading test data")
    X_test, y_test, filenames_test = dataset.load_test()
    y_pred = joblib.load("../models/pred_ml_improved.pkl")

    relevant = []
    for pred, correct, filename in zip(y_pred, y_test, filenames_test):
        if filename in FILES:
            relevant.append((pred, correct, filename, CLASSIFICATIONS[filename]))

    model_predictions, correct, filename, manual_predictions = zip(*relevant)
    manual_predictions = learn.multilabel_binary_y(manual_predictions)
    model_predictions = np.array(model_predictions)
    correct = learn.multilabel_binary_y(correct)

    rules = infer_topology.infer_topology_rules()
    improved_manual = infer_topology.apply_topology_rules(rules, manual_predictions)

    prediction_names = ["MODEL", "MANUAL", "IMPROVED_MANUAL"]
    predictions = [model_predictions, manual_predictions, improved_manual]
    for name, pred in zip(prediction_names, predictions):
        print("\n{}\n--".format(name))
        print("Zero-one classification loss", zero_one_loss(correct, pred))
        print("Hamming loss", hamming_loss(correct, pred))
        print("Precision:", precision_score(correct, pred, average="weighted", labels=label_list))
        print("Recall   :", recall_score(correct, pred, average="weighted", labels=label_list))
        print("F1 score :", f1_score(correct, pred, average="weighted", labels=label_list))
def classify(self, model, test_y, test_x): pred = model.predict(test_x) if not self.multi: rec, spec, acc = self.score(pred, test_y) return rec, spec, acc else: return 1 - zero_one_loss(test_y, pred)
def use_sklearn_ml_knn(): """ :return: """ base_path = os.getcwd() # train_x = np.load(os.path.join(base_path, 'dataset/train_x.npy'), allow_pickle=True) # train_y = np.load(os.path.join(base_path, 'dataset/train_y.npy'), allow_pickle=True) train_x = np.load(os.path.join(base_path, 'my_dataset/train_x.npy'), allow_pickle=True) train_y = np.load(os.path.join(base_path, 'my_dataset/train_y.npy'), allow_pickle=True) new_train_y = [] for tup in train_y: tmp = [] for label in tup: if label == 0: tmp.append(0) else: tmp.append(1) new_train_y.append(tmp) # test_x = np.load('dataset/test_x.npy', allow_pickle=True) # test_y = np.load('dataset/test_y.npy', allow_pickle=True) test_x = np.load('my_dataset/test_x.npy', allow_pickle=True) test_y = np.load('my_dataset/test_y.npy', allow_pickle=True) new_test_y = [] for tup in test_y: tmp = [] for label in tup: if label == 0: tmp.append(0) else: tmp.append(1) new_test_y.append(tmp) new_test_y = np.array(new_test_y) classifier = MLkNN2(train_x, np.array(new_train_y), k=10) # classifier.fit(train_x, np.array(new_train_y)) classifier.fit() predictions = classifier.predict(test_x) predictions = convert_prediction(predictions) # hamming_loss = HammingLoss(new_test_y, predictions) h_loss = hamming_loss(new_test_y, predictions) z = zero_one_loss(new_test_y, predictions) c = coverage_error(new_test_y, predictions) r = label_ranking_loss(new_test_y, predictions) a = average_precision_score(new_test_y, predictions) print('hamming_loss = ', h_loss) print('0-1_loss = ', z) print('cover_loss = ', c) print('rank_loss = ', r) print('average_loss = ', a)
def modelevaluation(self, y_test, y_pred, features, ml): ''' confusion = metrics.confusion_matrix(y_test, y_pred) print("Confussion matrix: \n", confusion) TP = confusion[1, 1] TN = confusion[0, 0] FP = confusion[0, 1] FN = confusion[1, 0] ''' data = {} data[features] = [] print('----------REPORT-----------') print("Accuracy: ", metrics.accuracy_score(y_test, y_pred)) print("Balanced Accuracy: ", metrics.balanced_accuracy_score( y_test, y_pred, sample_weight=None)) #Average of label accuracies print("Precision: ", metrics.precision_score(y_test, y_pred)) print("Recall: ", metrics.recall_score(y_test, y_pred)) print("F1 score macro: ", metrics.f1_score(y_test, y_pred, average='macro')) print("F1 score micro: ", metrics.f1_score(y_test, y_pred, average='micro')) print("F-Beta score: ", metrics.fbeta_score(y_test, y_pred, beta=10)) print("AUC Score: ", metrics.roc_auc_score(y_test, y_pred)) print("Zero_one_loss", metrics.zero_one_loss(y_test, y_pred)) print( "Matthews_corrcoef", metrics.matthews_corrcoef(y_test, y_pred) ) #Gives equal weight to all TP, TN, FP, FN (Better than F1-score) print( "Brier score: ", metrics.brier_score_loss(y_test, y_pred) ) #The Brier score is calculated as the mean squared error between the expected probabilities for the positive class (e.g. 1.0) and the predicted probabilities. (Better than log_loss) print( "Cohen keppa score: ", metrics.cohen_kappa_score(y_test, y_pred) ) #It basically tells you how much better your classifier is performing over the performance of a classifier that simply guesses at random according to the frequency of each class. print("Classification_report\n", metrics.classification_report(y_test, y_pred, output_dict=True)) print('----------REPORT-----------') with open('evaluations/model_evaluation.json', 'r+') as opened_file: current_json = json.load(opened_file) current_json[features] = { 'model': '' + ml, 'accuracy': metrics.accuracy_score(y_test, y_pred), 'fraud_precision': metrics.classification_report( y_test, y_pred, output_dict=True)['1']['precision'], 'fraud_recall': metrics.classification_report(y_test, y_pred, output_dict=True)['1']['recall'], 'fraud_f1_score': metrics.classification_report( y_test, y_pred, output_dict=True)['1']['f1-score'] } opened_file.seek(0) opened_file.truncate(0) json.dump(current_json, opened_file)
def main():
    test_label = [1]*100 + [2]*100 + [3]*100 + [4]*100 + [5]*100 + \
                 [6]*100 + [7]*100 + [8]*100
    filename = sys.argv[1]
    with open(filename, 'r') as f:
        pred = [rec for rec in csv.reader(f, delimiter=',')]
    pred = sum(pred, [])
    pred = [int(x) for x in pred]

    print(zero_one_loss(pred, test_label))

    cm = confusion_matrix(test_label, pred, labels=[1, 2, 3, 4, 5, 6, 7, 8])
    np.set_printoptions(precision=2)
    fig = plt.figure()
    fig.patch.set_facecolor('white')
    plot_confusion_matrix(cm)
    plt.show()
def compute_evaluation(true_matrix, predict_matrix): h = hamming_loss(true_matrix, predict_matrix) z = zero_one_loss(true_matrix, predict_matrix) c = coverage_error(true_matrix, predict_matrix) result = [h, z, c] return result
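# With label-indicator input, zero_one_loss counts a sample as wrong unless its entire label
# row matches, i.e. it is the complement of subset accuracy:
import numpy as np
from sklearn.metrics import zero_one_loss

Y_true = np.array([[1, 0, 1], [0, 1, 0]])
Y_pred = np.array([[1, 0, 0], [0, 1, 0]])   # first sample misses one of its labels
print(zero_one_loss(Y_true, Y_pred))        # 0.5: one of the two samples is fully correct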
def test_logitboost_hastie_fitting(): c = LogitBoostClassifier(base_estimator=DecisionTreeRegressor(max_depth=1), n_estimators=30, learning_rate=1.0) data = Hastie_10_2() c.fit(data.data, np.sign(data.labels)) assert_array_less(c.estimator_errors_, 0.5) assert zero_one_loss(np.sign(data.labels), c.predict(data.data)) < 0.2
def sklearnSvm(): svc = SVC() score = make_scorer(zero_one_loss,greater_is_better=False) clf = GridSearchCV(svc, tuned_parameters, scoring = score,cv = 5) clf.fit(data_Train_feature,data_Train_label) test_true, test_predict = data_Test_label, clf.predict(data_Test_feature) train_ture, train_predict = data_Train_label, clf.predict(data_Train_feature) err_Train = zero_one_loss(train_ture, train_predict) err_Test = zero_one_loss(test_true, test_predict) print(clf.best_params_) print(classification_report(test_true, test_predict)) print(err_Train) print(err_Test)
def kNN(pickle_file): fin = open(pickle_file, "rb") train, test = pickle.load(fin) X_tr, y_tr = train X_te, y_te = test n = [1,3,5,7,9] figure = plt.figure() for spot in range(len(n)): knc = KNeighborsClassifier(n_neighbors = n[spot]) knc.fit(X_tr, y_tr) predicted_tr = (knc.predict(X_tr)) predicted_te = (knc.predict(X_te)) axis = figure.add_subplot(1,5,spot+1) xtr_1 = [] xtr_2 = [] for pair in X_tr: xtr_1.append(pair[0]) xtr_2.append(pair[1]) xte_1 = [] xte_2 = [] for pair in X_te: xte_1.append(pair[0]) xte_2.append(pair[1]) colors = ListedColormap(['#FF0000', '#0000FF']) axis.scatter(xtr_1,xtr_2, c = y_tr, cmap = colors, edgecolors = 'k') axis.scatter(xte_1,xte_2, marker="*", c = y_te, cmap = colors, edgecolors = 'k') x1min, x1max, x2min, x2max = helpers.get_bounds(X_tr) helpers.plot_decision_boundary(axis, knc, x1min, x1max, x2min, x2max) axis.set_title("n_neighbors = " + str(n[spot])) tr_loss = round(zero_one_loss(y_tr,predicted_tr),2) te_loss = round(zero_one_loss(y_te,predicted_te),2) axis.set_xlabel("Tr loss: " + str(tr_loss)+"\n Te loss: " + str(te_loss)) plt.show()
def drawLearningCurve(model, x_train, y_train, x_test, y_test, num_points=50):
    # adapted from http://sachithdhanushka.blogspot.de/2013/09/learning-curve-generator-for-learning.html
    train_error = np.zeros(num_points)
    crossval_error = np.zeros(num_points)

    # Fix a base array with one example from each class, so every fit sees both classes.
    baseitem0 = list(y_train).index(0)
    xbase = x_train[baseitem0, :]
    ybase = [y_train[baseitem0]]
    baseitem1 = list(y_train).index(1)
    xbase = np.vstack((xbase, x_train[baseitem1, :]))
    ybase = np.append(ybase, y_train[baseitem1])

    # remove both base examples in a single call so the second index is not shifted
    x_train = np.delete(x_train, (baseitem0, baseitem1), axis=0)
    y_train = np.delete(y_train, (baseitem0, baseitem1), axis=0)

    sizes = np.linspace(1, len(x_train), num=num_points).astype(int)
    for i, size in enumerate(sizes):
        # fit the model on the base examples plus the first `size` training examples
        xvals = np.vstack((xbase, x_train[:size, :]))
        yvals = np.append(ybase, y_train[:size])
        model.fit(xvals, yvals)
        # validation error, measured on the first `size` test examples
        y_pred = model.predict(x_test[:size])
        crossval_error[i] = zero_one_loss(y_test[:size], y_pred, normalize=True)
        # training error, measured on the first `size` training examples
        y_pred = model.predict(x_train[:size])
        train_error[i] = zero_one_loss(y_train[:size], y_pred, normalize=True)

    # draw the plot
    print(crossval_error)
    print(train_error)
    fig, ax = plt.subplots()
    ax.plot(sizes + 1, crossval_error, lw=2, label='cross validation error')
    ax.plot(sizes + 1, train_error, lw=4, label='training error')
    ax.set_xlabel('training set size')
    ax.set_ylabel('zero-one error')
    ax.legend(loc=0)
    ax.set_title('Learning Curve')
    return fig
def train_svm(kernels=None, labels=None): if kernels is None: trn_k, trn_y = load_svmlight_file( 'dns_data_kernel/trn_kernel_mat.svmlight') val_k, val_y = load_svmlight_file( 'dns_data_kernel/val_kernel_mat.svmlight') tst_k, tst_y = load_svmlight_file( 'dns_data_kernel/tst_kernel_mat.svmlight') trn_k = trn_k.todense() val_k = val_k.todense() tst_k = tst_k.todense() else: trn_k, trn_y = kernels[0], labels[0] val_k, val_y = kernels[1], labels[1] tst_k, tst_y = kernels[2], labels[2] pred = dict() C = [0.01, 0.1, 1, 10, 100] val_errs = [] for c in C: m = svm.SVC(kernel='precomputed', C=c) m.fit(trn_k, trn_y) trn_label = m.predict(trn_k) val_label = m.predict(val_k) trn_err = zero_one_loss(trn_label, trn_y) val_err = zero_one_loss(val_label, val_y) pred[c] = [trn_err, val_err, sum(m.n_support_)] val_errs.append(val_err) opt_c = C[val_errs.index(min(val_errs))] m = svm.SVC(kernel='precomputed', C=opt_c) m.fit(trn_k, trn_y) tst_label = m.predict(tst_k) tst_err = zero_one_loss(tst_label, tst_y) print("Test Error: {0:.2%}".format(tst_err)) return pred
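# SVC(kernel='precomputed') expects Gram matrices rather than raw features: fit() takes the
# train-vs-train kernel and predict() the test-vs-train kernel, which is the convention the
# *_kernel_mat files above follow. A self-contained sketch with an RBF kernel on synthetic
# data, purely for illustration:
import numpy as np
from sklearn import svm
from sklearn.metrics import zero_one_loss
from sklearn.metrics.pairwise import rbf_kernel

rng = np.random.RandomState(0)
X_tr, y_tr = rng.randn(80, 5), rng.randint(0, 2, 80)
X_te, y_te = rng.randn(20, 5), rng.randint(0, 2, 20)

K_tr = rbf_kernel(X_tr, X_tr)   # (n_train, n_train)
K_te = rbf_kernel(X_te, X_tr)   # (n_test, n_train)

m = svm.SVC(kernel='precomputed', C=1.0)
m.fit(K_tr, y_tr)
print(zero_one_loss(y_te, m.predict(K_te)))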
def runTests(X_train, X_test, y_train, y_test): zeroSums = np.zeros((13)) count = 0 #Knn neighborList = [1, 3, 5, 7, 9] for value in neighborList: kNeighbors = KNeighborsClassifier(n_neighbors=value) kNeighbors.fit(X_train, y_train) predTest = kNeighbors.predict(X_test) zeroSums[count] = zero_one_loss(y_test, predTest) count += 1 #dTree depthList = [1, 2, 3, 4, None] for depth in depthList: dTree = DecisionTreeClassifier(max_depth=depth) dTree.fit(X_train, y_train) predTest = dTree.predict(X_test) zeroSums[count] = zero_one_loss(y_test, predTest) count += 1 #svms linSVM = SVC(kernel='linear') linSVM.fit(X_train, y_train) predTest = linSVM.predict(X_test) zeroSums[count] = zero_one_loss(y_test, predTest) count += 1 rbfSVM = SVC(kernel='rbf') rbfSVM.fit(X_train, y_train) predTest = rbfSVM.predict(X_test) zeroSums[count] = zero_one_loss(y_test, predTest) count += 1 polySVM = SVC(kernel='poly', degree=3) polySVM.fit(X_train, y_train) predTest = polySVM.predict(X_test) zeroSums[count] = zero_one_loss(y_test, predTest) count += 1 #print(zeroSums) return zeroSums
def benchmark(clf):
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0

    err = metrics.zero_one_loss(y_test, pred, normalize=True)
    return err, train_time, test_time
def test_group_zero_one_loss_unnormalized():
    result = metrics.group_zero_one_loss(Y_true, Y_pred, groups, normalize=False)

    expected_overall = skm.zero_one_loss(Y_true, Y_pred, normalize=False)

    assert result.overall == expected_overall
def basic(scheduler_address, backends): ESTIMATORS = { 'RandomForest': RandomForestClassifier(n_estimators=100), 'ExtraTreesClassifier': ExtraTreesClassifier(n_estimators=100) } X_train, X_test, y_train, y_test = load_data() print_data(X_train, y_train, X_test, y_test) BACKENDS = build_backends(backends, scheduler_address, X_train, y_train) print("Training Classifiers") print("====================") error, train_time, test_time = {}, {}, {} for est_name, estimator in sorted(ESTIMATORS.items()): for backend, backend_kwargs in BACKENDS: print("Training %s with %s backend... " % (est_name, backend), end="") estimator_params = estimator.get_params() estimator.set_params( **{ p: RANDOM_STATE for p in estimator_params if p.endswith("random_state") }) if "n_jobs" in estimator_params: estimator.set_params(n_jobs=-1) # Key for the results name = '%s, %s' % (est_name, backend) with parallel_backend(backend, **backend_kwargs): time_start = time() estimator.fit(X_train, y_train) train_time[name] = time() - time_start time_start = time() y_pred = estimator.predict(X_test) test_time[name] = time() - time_start error[name] = zero_one_loss(y_test, y_pred) print("done") print() print("Classification performance:") print("===========================") print("%s %s %s %s" % ("Classifier ", "train-time", "test-time", "error-rate")) print("-" * 44) for name in sorted(error, key=error.get): print("%s %s %s %s" % (name, ("%.4fs" % train_time[name]), ("%.4fs" % test_time[name]), ("%.4f" % error[name]))) print()
def test_prediction(filename, features, target):
    '''
    Given the filename of a pickled estimator, unpickle the estimator, run the features
    through it, compare the results with the target, and return the results.
    '''
    estimator = pickle.load(open(filename, "rb"))
    res = estimator.predict_proba(features)[:, 0]

    print("\nResults for {0}: \n".format(filename))
    for threshold in (.25, .1):
        print("\nRounded by {}\n".format(threshold))
        yrd = round_by(res, threshold)
        print(metrics.classification_report(target, yrd))
        print("Percent of misclassification: {}\n".format(metrics.zero_one_loss(target, yrd)))
    return res
def votingClassifier(): print(colored("------Voting Classification-------", 'red')) # models random_forest = RandomForestClassifier(criterion='entropy', max_depth=30, n_estimators=48, random_state=0) clf_lr = LogisticRegression() clf_knn = KNeighborsClassifier(n_neighbors=7) # build classifier model = VotingClassifier(estimators=[('rf', random_forest), ('knn', clf_knn)], voting='soft', n_jobs=-1, weights=[2, 1]) print("Training the Voting classification.......") # start timer starttime = timeit.default_timer() # start timer cnn = CondensedNearestNeighbour(random_state=42) # doctest: +SKIP # train model.fit(train_x, train_Y) print("The time difference is :", timeit.default_timer() - starttime) print("Predicting test data.......") # predict y_pred = model.predict(test_x) # results c_matrix = confusion_matrix(test_Y, y_pred) error = zero_one_loss(test_Y, y_pred) score = accuracy_score(test_Y, y_pred) # display results print('Confusion Matrix\n---------------------------\n', c_matrix) print('---------------------------') print("Error: {:.4f}%".format(error * 100)) print("Accuracy Score: {:.4f}%".format(score * 100)) print(classification_report(test_Y, y_pred)) print('accuracy: ', c_matrix.diagonal() / c_matrix.sum(axis=1)) # Plot non-normalized confusion matrix disp = plot_confusion_matrix(model, test_x, test_Y, cmap=plt.cm.Greens, values_format='.0f', xticks_rotation='horizontal') plt.title("Confusion Matrix for Voting Classifier") plt.show()
def benchmark(clf):
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0

    # zero_one_loss already returns the misclassification *fraction* by default,
    # so no further division by the number of samples is needed
    err = metrics.zero_one_loss(y_test, pred)
    return err, train_time, test_time
def compute_error(targets, predictions, binary): mse = mean_squared_error(targets, predictions) loss = 0 # fraction of misclassifications if binary: predictions = np.where(predictions >= 0, 1, -1) loss = zero_one_loss(targets, predictions, normalize=True) return loss, mse
def compute_evaluation(true_matrix, predict_matrix): h = hamming_loss(true_matrix, predict_matrix) z = zero_one_loss(true_matrix, predict_matrix) c = coverage_error(true_matrix, predict_matrix) - 1 r = label_ranking_loss(true_matrix, predict_matrix) a = average_precision_score(true_matrix, predict_matrix) result = [h, z, c, r, a] return result
def vde(y_true, y_pred): """ Voicing Decision Error ---------------------- Proportion of frames for which an incorrect voiced/unvoiced decision is made. """ return zero_one_loss(y_true, y_pred)
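# Tiny illustration of vde(): two of six frames get the wrong voiced/unvoiced decision,
# so the error is 1/3 (the values here are made up for the example).
ref_voicing = [1, 1, 0, 0, 1, 0]
est_voicing = [1, 0, 0, 0, 1, 1]
print(vde(ref_voicing, est_voicing))   # 0.333...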
def NB(trainvector, trainlabels, testvector, testlabels): from sklearn.naive_bayes import GaussianNB Multi = GaussianNB() Multi.fit(trainvector, trainlabels) error = zero_one_loss(trainlabels, Multi.predict(trainvector), normalize=False) errorrate = zero_one_loss(trainlabels, Multi.predict(trainvector)) accuracy = accuracy_score(testlabels, Multi.predict(testvector)) #print('No of errors = %d and error rate= %f of the training data' % (error, errorrate)) errort = zero_one_loss(testlabels, Multi.predict(testvector), normalize=False) errorratet = zero_one_loss(testlabels, Multi.predict(testvector)) #print('No of errors = %d and error rate= %f of the testing data' % (error, errorrate)) return error, errorrate, errort, errorratet, accuracy
def question2(Wp, Wnp): tssSet = [100, 250, 500, 1000, 2000] kMeanAvg = np.array([]) kMeanSD = np.array([]) sKMeanAvg = np.array([]) sKMeanSD = np.array([]) for skm in range(2): sphericalKMeans = bool(skm) title = "Spherical KMeans NBC Analysis" if sphericalKMeans else "KMeans NBC Analysis" Tp = KMeans(n_clusters=50, n_init=10) Tnp = KMeans(n_clusters=50, n_init=10) if sphericalKMeans: Tp = SphericalKMeans(n_clusters=50, n_init=10) Tnp = SphericalKMeans(n_clusters=50, n_init=10) print "TP" Tp.fit(Wp) print "TNP" Tnp.fit(Wnp) TList = TopicList(Wp, Tp, Wnp, Tnp) data = np.transpose(TList.X) classifiers = np.append(np.ones(len(data)/2,dtype=int), np.zeros(len(data)/2, dtype=int)) clf = GaussianNB() for tss in tssSet: print tss kf = KFold(len(data), n_folds=10, shuffle=True) zeroOneLoss = np.array([]) for train, test_indeces in kf: train_indeces = np.random.permutation(train)[:tss] test_set = data[test_indeces] train_set = data[train_indeces] clf.fit(train_set, classifiers[train_indeces]) y_pred = clf.predict(test_set)# y_pred = [1, 2, 3, 4] y_true = classifiers[test_indeces]# y_true = [2, 2, 3, 4] zeroOneLoss = np.append(zeroOneLoss, zero_one_loss(y_true, y_pred))#.25 <-returns zero one loss percentage if sphericalKMeans: sKMeanAvg = np.append(sKMeanAvg, np.average(zeroOneLoss)) sKMeanSD = np.append(sKMeanSD, np.std(zeroOneLoss) / np.sqrt(10)) else: kMeanAvg = np.append(kMeanAvg, np.average(zeroOneLoss)) kMeanSD = np.append(kMeanSD, np.std(zeroOneLoss) / np.sqrt(10)) np.savetxt(title + '.csv',np.row_stack((tssSet,kMeanAvg, kMeanSD,sKMeanAvg,sKMeanSD)),delimiter=',') fig = plt.figure() fig.suptitle(title, fontsize=14, fontweight='bold') ax = fig.add_subplot(111) ax.set_xlabel('training set sizes') ax.set_ylabel('Zero-One Loss') ax.plot(tssSet, kMeanAvg, 'r-', tssSet, sKMeanAvg, 'b-') ax.errorbar(tssSet, kMeanAvg, yerr=kMeanSD, fmt='ro') ax.errorbar(tssSet, sKMeanAvg, yerr=sKMeanSD, fmt='bo') ax.axis([50, 2050, 0, 1]) plt.savefig(title) plt.show()
def test_logitboost_musk_fitting(): c = LogitBoostClassifier( base_estimator=DecisionTreeRegressor(max_depth=1), n_estimators=30, learning_rate=1.0 ) data = MUSK1() c.fit(data.data, np.sign(data.labels)) assert_array_less(c.estimator_errors_, 0.6) assert zero_one_loss(np.sign(data.labels), c.predict(data.data)) < 0.05
def test_gentleboost_hastie_fitting(): c = GentleBoostClassifier( base_estimator=DecisionTreeRegressor(max_depth=1), n_estimators=30, learning_rate=1.0 ) data = Hastie_10_2() c.fit(data.data, np.sign(data.labels)) assert_array_less(c.estimator_errors_, 0.5) assert zero_one_loss(np.sign(data.labels), c.predict(data.data)) < 0.2
def ValAccuracyLoss():
    imagesVal = os.path.join(path_to_model, "macval.txt")
    y_true, \
    y_predict, \
    y_predict_float, \
    runtime_times = ForwardNet(imagesVal, False)
    report = classification_report(y_true, y_predict)
    print("Val Accuracy = ", accuracy_score(y_true, y_predict))
    print("Val Loss = ", zero_one_loss(y_true, y_predict))
    print("Val Loss Cross Entropy = ", CrossEntropyLoss(y_true, y_predict_float))
    print(report)