Example #1
def cross_validation_example():
    """ Slightly more complex example : Perform grid search cross-validation to find optimal parameters for MinCq using
    rbf kernels as voters.
    """
    # Load the iris dataset, convert the labels to -1 or 1, and split it into train and test parts.
    dataset = load_iris()
    dataset.target[dataset.target == 0] = -1
    dataset.target[dataset.target == 2] = -1
    X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, random_state=42)

    # The learning algorithm and its parameters.
    learner = MinCqLearner(mu=0.0001, voters_type='kernel', kernel='rbf', gamma=0.0)
    learner_params = {'mu': [0.0001, 0.001, 0.01],
                      'gamma': [0.0, 0.1, 1.0, 10]}

    cv_classifier = GridSearchCV(learner, learner_params, scoring=accuracy_scorer)
    cv_classifier = cv_classifier.fit(X_train, y_train)

    predictions_train = cv_classifier.predict(X_train)
    predictions_test = cv_classifier.predict(X_test)

    print_sklearn_grid_scores("Iris", "RbfMinCq", learner_params, cv_classifier.grid_scores_)

    print("Best parameters: {}".format(str(cv_classifier.best_params_)))
    print("Training set risk: {:.4f}".format(zero_one_loss(y_train, predictions_train)))
    print("Testing set risk: {:.4f}".format(zero_one_loss(y_test, predictions_test)))
def drawLearningCurve(model, x_train, y_train, x_test, y_test, num_points = 50):
    # adapted from http://sachithdhanushka.blogspot.de/2013/09/learning-curve-generator-for-learning.html
    
    train_error = np.zeros(num_points)
    crossval_error = np.zeros(num_points)
    
    sizes = np.linspace(2, len(x_train), num=num_points).astype(int)
    for i,size in enumerate(sizes):
         
        #getting the predicted results of the model
        model.fit(x_train[:size], y_train[:size])
         
        #compute the validation error
        y_pred = model.predict(x_test[:size])
        crossval_error[i] = zero_one_loss(y_test[:size], y_pred, normalize=True)
         
        #compute the training error
        y_pred = model.predict(x_train[:size])
        train_error[i] = zero_one_loss(y_train[:size], y_pred, normalize=True)

    #draw the plot
    print crossval_error
    print train_error
    fig,ax = plt.subplots()
    ax.plot(sizes,crossval_error,lw = 2, label='cross validation error')
    ax.plot(sizes,train_error, lw = 4, label='training error')
    ax.set_xlabel('training set size')
    ax.set_ylabel('zero-one error')
    ax.legend(loc = 0)
    ax.set_title('Learning Curve' )
    return fig
Example #3
def plot_adaclassifier(classifier, n_estimators, X_train, X_test, y_train, y_test):

    fig = plt.figure()
    ax = fig.add_subplot(111)

    #ax.plot([1, n_estimators], [dt_stump_err] * 2, 'k-',
    #        label='Decision Stump Error')
    #ax.plot([1, n_estimators], [dt_err] * 2, 'k--',
    #        label='Decision Tree Error')

    ada_err_test = np.zeros((n_estimators,))
    for i, y_pred in enumerate(classifier.staged_predict(X_test)):
        ada_err_test[i] = zero_one_loss(y_pred, y_test)

    ada_err_train = np.zeros((n_estimators,))
    for i, y_pred in enumerate(classifier.staged_predict(X_train)):
        ada_err_train[i] = zero_one_loss(y_pred, y_train)

    ax.plot(np.arange(n_estimators) + 1, ada_err_test,
            label='AdaBoost Test Error',
            color='red')
    ax.plot(np.arange(n_estimators) + 1, ada_err_train,
            label='AdaBoost Train Error',
            color='blue')

    ax.set_ylim((0.0, 1.0))
    ax.set_xlabel('n_estimators')
    ax.set_ylabel('error rate')

    leg = ax.legend(loc='upper right', fancybox=True)
    leg.get_frame().set_alpha(0.7)


    return fig
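
plot_adaclassifier expects an already-fitted boosting model that exposes staged_predict. A hedged usage sketch follows; the synthetic dataset and hyperparameters are assumptions for illustration only.

from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

n_estimators = 200
X, y = make_classification(n_samples=2000, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Boosted decision stumps; staged_predict yields predictions after each stage.
ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         n_estimators=n_estimators, learning_rate=1.0)
ada.fit(X_train, y_train)

fig = plot_adaclassifier(ada, n_estimators, X_train, X_test, y_train, y_test)
fig.savefig('adaboost_error.png')
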
def test_grid(features, target):
    '''
    Given a list of models for each genre, run the features through the models to
    predict target labels, and compare the predictions to the true target labels.
    '''
    genre_list = ['animated', 'action', 'comedy', 'drama', 'family', 'fantasy', \
    'horror', 'musical', 'mystery', 'romance', 'sci-fi', 'thriller', 'war', 'western']
    ypred_mat = np.empty([target.shape[0], target.shape[1]])
    for i in xrange(target.shape[1]):
        filename = '../data/is_' + genre_list[i] + '.pkl'
        ypred = test_prediction(filename, features, target[:,i])
        for j, prob in enumerate(ypred):
            ypred_mat[j,i] = prob
    with open('../data/grid_pkl_500.txt','w') as f:
        f.write("Model rounded by .25\n")
        yrd = round_by(ypred_mat, .25)
        f.write( metrics.classification_report(target, yrd) )
        f.write( "Percent of misclassification: {}\n".format(metrics.zero_one_loss(target, yrd)) )
        f.write("\nModel rounded by .3\n")
        yrd = round_by(ypred_mat, .3)
        f.write( metrics.classification_report(target, yrd) )
        f.write( "Percent of misclassification: {}\n".format(metrics.zero_one_loss(target, yrd)) )
        f.write("\nModel rounded by .2\n")
        yrd = round_by(ypred_mat, .2)
        f.write( metrics.classification_report(target, yrd) )
        f.write( "Percent of misclassification: {}\n".format(metrics.zero_one_loss(target, yrd)) )
        f.write("\nModel rounded by .1\n")
        yrd = round_by(ypred_mat, .1)
        f.write( metrics.classification_report(target, yrd) )
        f.write( "Percent of misclassification: {}\n".format(metrics.zero_one_loss(target, yrd)) )
Example #5
def run_gamma(x, y):
    perc = 0.6
    n = x.shape[0]
    gamma_list = (np.power(2.0, range(-4, 12))/(n*perc)).tolist()
    n_iter = 2
    train_err_libsvm = np.zeros((len(gamma_list), n_iter))
    test_err_libsvm = np.zeros((len(gamma_list), n_iter))
    train_err_dsvm = np.zeros((len(gamma_list), n_iter))
    test_err_dsvm = np.zeros((len(gamma_list), n_iter))
    train_err_pegasos = np.zeros((len(gamma_list), n_iter))
    test_err_pegasos = np.zeros((len(gamma_list), n_iter))
    ss = cv.StratifiedShuffleSplit(y, n_iter=n_iter, test_size=1-perc, train_size=None, random_state=0)
    for k, (train, test) in enumerate(ss):
        ntr = len(train)
        lmda = 1.0 / ntr
        print "#iter: %d" % k
        x_train, x_test, y_train, y_test = x[train], x[test], y[train], y[test]
        mM_scale = preprocessing.MinMaxScaler(feature_range=(-1, 1))
        x_train = mM_scale.fit_transform(x_train)
        x_test = mM_scale.transform(x_test)
        for j, gm in enumerate(gamma_list):
            print "check lamda %f, gamma %f" % (lmda, gm)
            clf = svm.SVC(C=lmda * ntr, kernel='rbf', gamma=gm, cache_size=600)
            clf.fit(x_train, y_train)

            pred = clf.predict(x_train)
            train_err_libsvm[j, k] = zero_one_loss(y_train, pred)
            pred = clf.predict(x_test)
            test_err_libsvm[j, k] = zero_one_loss(y_test, pred)
            dsvm = DualKSVM(lmda=lmda, gm=gm, kernelstr='rbf', nsweep=ntr/2, b=5, c=1)
            dsvm.fit(x_train, y_train, x_test, y_test, )
            train_err_dsvm[j, k] = dsvm.err_tr[-1]
            test_err_dsvm[j, k] = dsvm.err_te[-1]
            kpega = Pegasos(ntr, lmda, gm, nsweep=2, batchsize=2)
            kpega.train_test(x_train, y_train, x_test, y_test)
            train_err_pegasos[j, k] = kpega.err_tr[-1]
            test_err_pegasos[j, k] = kpega.err_te[-1]
    avg_train_err_libsvm = np.mean(train_err_libsvm, axis=1)
    avg_test_err_libsvm = np.mean(test_err_libsvm, axis=1)
    avg_train_err_dsvm = np.mean(train_err_dsvm, axis=1)
    avg_test_err_dsvm = np.mean(test_err_dsvm, axis=1)
    avg_train_err_pegasos = np.mean(train_err_pegasos, axis=1)
    avg_test_err_pegasos = np.mean(test_err_pegasos, axis=1)
    plt.figure()
    # color_list = ['b', 'r', 'g', 'c', ]
    # marker_list = ['o', 'x', '>', 's']

    plt.loglog(gamma_list, avg_train_err_libsvm, 'bo-', label='libsvm train')
    plt.loglog(gamma_list, avg_test_err_libsvm, 'ro-', label='libsvm test')
    plt.loglog(gamma_list, avg_train_err_dsvm, 'gx-', label='dsvm train')
    plt.loglog(gamma_list, avg_test_err_dsvm, 'cx-', label='dsvm test')
    plt.loglog(gamma_list, avg_train_err_pegasos, 'mD-', label='pegasos train')
    plt.loglog(gamma_list, avg_test_err_pegasos, 'kD-', label='pegasos test')
    plt.legend(bbox_to_anchor=(0, 1.17, 1, .1), loc=2, ncol=2, mode="expand", borderaxespad=0)
    plt.savefig('../output/usps_diff_gamma.pdf')
Example #6
def build_tree(clf,type,i,X_train, X_test, y_train, y_test,attribute_names,class_names):
    print("------------Run "+type+ "_"+str(i)+"----------")
    clf.fit(X_train, y_train)
    print("Training error =", zero_one_loss(y_train, clf.predict(X_train)))
    predicted_test = clf.predict(X_test)
    print("Test error =",zero_one_loss(y_test, predicted_test ) )
    figure_name = type+"_"+str(i)
    visualize_tree(clf,attribute_names,class_names,figure_name)
    print(classification_report(  y_test,predicted_test ))
    print(confusion_matrix(y_test,predicted_test))
    return zero_one_loss(y_test, predicted_test )
Example #7
def simple_classification_example():
    """ Simple example : with fixed hyperparameters, run four versions of MinCq on a single dataset.
    """
    # MinCq parameters, fixed to a given value as this is a simple example.
    mu = 0.001

    # Load the iris dataset, convert the labels to -1 or 1, and split it into train and test parts.
    dataset = load_iris()
    dataset.target[dataset.target == 0] = -1
    dataset.target[dataset.target == 2] = -1
    X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, random_state=42)

    # We train MinCq using decision stumps as voters, on the training set.
    learner = MinCqLearner(mu, voters_type='stumps')
    learner.fit(X_train, y_train)

    # We predict the train and test labels and print the risk.
    predictions_train = learner.predict(X_train)
    predictions_test = learner.predict(X_test)

    print("\nStumpsMinCq")
    print("-----------")
    print("Training set risk: {:.4f}".format(zero_one_loss(y_train, predictions_train)))
    print("Testing set risk: {:.4f}\n".format(zero_one_loss(y_test, predictions_test)))

    # We do the same again, now with a linear kernel.
    learner = MinCqLearner(mu, voters_type='kernel', kernel='linear')
    learner.fit(X_train, y_train)

    predictions_train = learner.predict(X_train)
    predictions_test = learner.predict(X_test)

    print("\nLinearMinCq")
    print("-----------")
    print("Training set risk: {:.4f}".format(zero_one_loss(y_train, predictions_train)))
    print("Testing set risk: {:.4f}\n".format(zero_one_loss(y_test, predictions_test)))

    # We do the same again, now with a polynomial kernel.
    learner = MinCqLearner(mu, voters_type='kernel', kernel='poly')
    learner.fit(X_train, y_train)

    predictions_train = learner.predict(X_train)
    predictions_test = learner.predict(X_test)

    print("\nPolyMinCq")
    print("-----------")
    print("Training set risk: {:.4f}".format(zero_one_loss(y_train, predictions_train)))
    print("Testing set risk: {:.4f}\n".format(zero_one_loss(y_test, predictions_test)))

    # We do the same again, now with an RBF kernel.
    learner = MinCqLearner(mu, voters_type='kernel', kernel='rbf', gamma=0.0)
    learner.fit(X_train, y_train)

    predictions_train = learner.predict(X_train)
    predictions_test = learner.predict(X_test)

    print("\nRbfMinCq")
    print("--------")
    print("Training set risk: {:.4f}".format(zero_one_loss(y_train, predictions_train)))
    print("Testing set risk: {:.4f}\n".format(zero_one_loss(y_test, predictions_test)))
def test_losses():
    """Test loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)
    n_samples = y_true.shape[0]
    n_classes = np.size(unique_labels(y_true))

    # Classification
    # --------------
    with warnings.catch_warnings(record=True):
    # Throw deprecated warning
        assert_equal(zero_one(y_true, y_pred), 11)

    assert_almost_equal(zero_one_loss(y_true, y_pred),
                        11 / float(n_samples), 2)
    assert_equal(zero_one_loss(y_true, y_pred, normalize=False), 11)

    assert_almost_equal(zero_one_loss(y_true, y_true), 0.0, 2)
    assert_almost_equal(hamming_loss(y_true, y_pred),
                        2 * 11. / (n_samples * n_classes), 2)

    assert_equal(accuracy_score(y_true, y_pred),
                 1 - zero_one_loss(y_true, y_pred))

    with warnings.catch_warnings(record=True):
    # Throw deprecated warning
        assert_equal(zero_one_score(y_true, y_pred),
                     1 - zero_one_loss(y_true, y_pred))

    # Regression
    # ----------
    assert_almost_equal(mean_squared_error(y_true, y_pred),
                        10.999 / n_samples, 2)
    assert_almost_equal(mean_squared_error(y_true, y_true),
                        0.00, 2)

    # mean_absolute_error and mean_squared_error are equal because
    # it is a binary problem.
    assert_almost_equal(mean_absolute_error(y_true, y_pred),
                        10.999 / n_samples, 2)
    assert_almost_equal(mean_absolute_error(y_true, y_true), 0.00, 2)

    assert_almost_equal(explained_variance_score(y_true, y_pred), 0.16, 2)
    assert_almost_equal(explained_variance_score(y_true, y_true), 1.00, 2)
    assert_equal(explained_variance_score([0, 0, 0], [0, 1, 1]), 0.0)

    assert_almost_equal(r2_score(y_true, y_pred), 0.12, 2)
    assert_almost_equal(r2_score(y_true, y_true), 1.00, 2)
    assert_equal(r2_score([0, 0, 0], [0, 0, 0]), 1.0)
    assert_equal(r2_score([0, 0, 0], [0, 1, 1]), 0.0)
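
For reference, a minimal standalone sketch of the zero_one_loss behaviour these assertions rely on:

import numpy as np
from sklearn.metrics import accuracy_score, zero_one_loss

y_true = np.array([0, 1, 1, 0, 1])
y_pred = np.array([0, 1, 0, 0, 0])

# Default normalize=True: fraction of misclassified samples, 2/5 = 0.4
print(zero_one_loss(y_true, y_pred))
# normalize=False: raw count of misclassified samples, 2
print(zero_one_loss(y_true, y_pred, normalize=False))
# Complement relationship checked above: accuracy_score + zero_one_loss == 1.0
print(accuracy_score(y_true, y_pred) + zero_one_loss(y_true, y_pred))
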
Example #9
def lr(X_train, y_train, X_test, y_test):

    # Tune the hyperparameter
    maxScore = float("-inf")
    maxC = 0
    for c in np.arange(0.1, 1, 0.1):
        clf = LogisticRegression(penalty="l2", C=c).fit(X_train, y_train)
        scores = cross_val_score(clf, X_train, y_train, cv=5)
        mean = np.mean(scores)
        print("C: %f and Score: %f" % (c, mean))
        if mean > maxScore:
            maxScore = mean
            maxC = c

    # Train the model
    print("MaxC: %f" % maxC)
    print("MaxScore: %f" % maxScore)
    clf = LogisticRegression(penalty="l2", C=maxC).fit(X_train, y_train)

    # Predict labels for the test data
    pred = clf.predict(X_test)
    pred_prob = clf.predict_proba(X_test)

    # Calculate the misclassification rate
    mc_rate = zero_one_loss(y_test, pred)
    print("MC rate: %f" % mc_rate)

    # Calculate the ROC curve
    prob = pred_prob[:, 1:]
    roc_score = roc_auc_score(y_test, prob)
    print("ROC score: %f" % roc_score)

    return (mc_rate, roc_score)
Example #10
def train(min_samples_leaf, max_depth, dataset):
    mlflow.log_param("min_samples_leaf", min_samples_leaf)
    mlflow.log_param("max_depth", max_depth)

    clf = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf, max_depth=max_depth)
    print("Classifier:",clf)
    clf.fit(dataset.data, dataset.target)
    expected = dataset.target
    predicted = clf.predict(dataset.data)

    mlflow.sklearn.log_model(clf, "model") 

    write_artifact('confusion_matrix.txt',str(metrics.confusion_matrix(expected, predicted)))
    write_artifact('classification_report.txt',metrics.classification_report(expected, predicted))

    auc = metrics.auc(expected, predicted)
    accuracy_score = metrics.accuracy_score(expected, predicted)
    zero_one_loss = metrics.zero_one_loss(expected, predicted)

    mlflow.log_metric("auc", auc)
    mlflow.log_metric("accuracy_score", accuracy_score)
    mlflow.log_metric("zero_one_loss", zero_one_loss)

    print("Params:  min_samples_leaf={} max_depth={}".format(min_samples_leaf,max_depth))
    print("Metrics: auc={} accuracy_score={} zero_one_loss={}".format(auc,accuracy_score,zero_one_loss))
def experiment_neighbors_k_nearest_neighbors():
    avgError = []
    x_learners = []
    for k_neighbors in range(1, 20, 1):
        k = 10
        skf = StratifiedKFold(labels,n_folds=k)
        averageError = 0.0
        for train_index, test_index in skf:
            X_train, X_test = mfcc[:,train_index], mfcc[:,test_index]
            y_train, y_test = labels[train_index], labels[test_index]
            knc = KNeighborsClassifier(n_neighbors=k_neighbors, weights='distance')
            knc.fit(X_train.T,y_train)
            y_pred = knc.predict(X_test.T)
            error = zero_one_loss(y_pred,y_test)
            print error
            averageError += (1./k) * error
        print "Average error: %4.2f%s" % (100 * averageError,'%')
        avgError.append(averageError)
        x_learners.append(k_neighbors)

    plt.plot(x_learners, avgError)
    plt.ylabel('Average Error (k=10)')
    plt.xlabel('Number of Neighbors')
    plt.title('Error as a function of the number of neighbors taken into consideration')
    plt.show()
def experiment_pca_n_components_random_forest():
    pca = decomposition.PCA()
    rf = RandomForestClassifier(n_estimators=maxLearners, max_depth = maxDepth, warm_start = False)
    pipe = Pipeline(steps=[('pca', pca), ('rf', rf)])
    avgError = []
    x_learners = []
    for k_components in range(10, 100, 10):
        k = 10
        skf = StratifiedKFold(labels,n_folds=k)
        averageError = 0.0
        for train_index, test_index in skf:
            X_train, X_test = mfcc[:,train_index], mfcc[:,test_index]
            y_train, y_test = labels[train_index], labels[test_index]
            estimator = GridSearchCV(pipe, dict(pca__n_components=[k_components]))
            estimator.fit(X_train.T,y_train)
            y_pred = estimator.predict(X_test.T)
            error = zero_one_loss(y_pred,y_test)
            print error
            averageError += (1./k) * error
        print "Average error: %4.2f%s" % (100 * averageError,'%')
        avgError.append(averageError)
        x_learners.append(k_components)

    plt.plot(x_learners, avgError)
    plt.ylabel('Average Error (k=10)')
    plt.xlabel('Number of Components')
    plt.title('Error as a function of the number of components')
    plt.show()
Example #13
def classify_by_KNeighbors(train_x, train_Y, test_x, test_Y, colors, category):

    classifier = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
    classifier.fit(train_x, train_Y)

    pred_y = classifier.predict(test_x)
    results = confusion_matrix(test_Y, pred_y)

    error = zero_one_loss(test_Y, pred_y)
    accuracy = metrics.accuracy_score(test_Y, pred_y)
    classification = metrics.classification_report(test_Y, pred_y)

    pca = PCA(n_components=2)
    train_x_pca_cont = pca.fit_transform(test_x)
    plt.figure(figsize=(15, 10))
    for color, cat in zip(colors, category.keys()):
        plt.scatter(train_x_pca_cont[pred_y == cat, 0],
                    train_x_pca_cont[pred_y == cat, 1],
                    color=color,
                    alpha=.8,
                    lw=2,
                    label=cat)
        plt.legend(loc='best', shadow=False, scatterpoints=1)
    plt.title(" KNeibors result visualization")
    plt.show()

    plt.figure(figsize=(10, 7))
    sn.heatmap(results, annot=True, fmt='d')
    plt.title("KNeighbors confusion matrix: \n")

    print("KNeighbors confusion matrix: ", results)
    print("Error: ", error * 100, '%')
    print("Accuracy: ", accuracy * 100, "%")
    print("Classification report:" "\n", classification)
    return pred_y
def experiment_estimators_AdaBoostRandomForest():
    avgError = []
    x_learners = []
    rf = RandomForestClassifier(n_estimators=maxLearners, max_depth = maxDepth, warm_start = False)
    for lr in frange(0.01, 1., 0.25):
        k = 10
        skf = StratifiedKFold(labels,n_folds=k)
        averageError = 0.0
        for train_index, test_index in skf:
            X_train, X_test = mfcc[:,train_index], mfcc[:,test_index]
            y_train, y_test = labels[train_index], labels[test_index]
            adb = AdaBoostClassifier(base_estimator=rf, n_estimators=100, learning_rate=lr)
            adb.fit(X_train.T,y_train)
            y_pred = adb.predict(X_test.T)
            error = zero_one_loss(y_pred,y_test)
            print error
            averageError += (1./k) * error
        print "Average error: %4.2f%s" % (100 * averageError,'%')
        avgError.append(averageError)
        x_learners.append(lr)
    # graph the errors now.
    plt.plot(x_learners, avgError)
    plt.ylabel('Average Error (k=10)')
    plt.xlabel('Learning Rate')
    plt.title('Error as a function of the learning rate')
    plt.show()
Example #15
def evaluate_naivebayes(classifier, test_reviews):
    # For computing metrics
    ref_set = collections.defaultdict(set)
    test_set = collections.defaultdict(set)
    ref_set_arr = []
    test_set_arr = []

    # Create gold standard and predicted labels
    for i, (feat, label) in enumerate(test_reviews):
        # Predict
        observed = classifier.classify(feat)

        ref_set[label].add(i)
        test_set[observed].add(i)

        label = 0 if label == "neg" else 1
        observed = 0 if observed == "neg" else 1
        ref_set_arr.append(label)
        test_set_arr.append(observed)

    print('pos precision:', precision(ref_set['pos'], test_set['pos']))
    print('pos recall:', recall(ref_set['pos'], test_set['pos']))
    print('neg precision:', precision(ref_set['neg'], test_set['neg']))
    print('neg recall:', recall(ref_set['neg'], test_set['neg']))
    print('misclassification rate', zero_one_loss(ref_set_arr, test_set_arr))
    print('most informative features',
          classifier.show_most_informative_features(10))
def experiment_learners_random_forest():
    avgError = []
    x_learners = []
    for maxLearners in range(10, 150, 20):
        k = 10
        skf = StratifiedKFold(labels,n_folds=k)
        averageError = 0.0
        for train_index, test_index in skf:
            X_train, X_test = mfcc[:,train_index], mfcc[:,test_index]
            y_train, y_test = labels[train_index], labels[test_index]
            rf = RandomForestClassifier(n_estimators=maxLearners, max_depth = maxDepth, warm_start = False)
            rf.fit(X_train.T,y_train)
            y_pred = rf.predict(X_test.T)
            error = zero_one_loss(y_pred,y_test)
            print error
            averageError += (1./k) * error
        print "Average error: %4.2f%s" % (100 * averageError,'%')
        avgError.append(averageError)
        x_learners.append(maxLearners)

    plt.plot(x_learners, avgError)
    plt.ylabel('Average Error (k=10)')
    plt.xlabel('Max Learners')
    plt.title('Error as a function of the number of learners')
    plt.show()
def perceptron_v0(max_iters, pred_every, X_train, X_test, Y_train, Y_test):
    acc = []
    w = np.zeros([num_digits, X_train.shape[1]])
    iters = 0
    while iters < max_iters:
        # Train
        for i in range(len(Y_train)):
            iters += 1
            for int_class in range(num_digits):
                Y_mult = 1 if Y_train[i] == int_class else -1
                if Y_mult * np.dot(w[int_class], X_train[i]) <= 0:
                    w[int_class] += Y_mult * X_train[i]

            if iters % pred_every == 0:
                # Predict
                Y_pred = np.zeros(Y_test.shape)
                for k in range(len(Y_test)):
                    preds = np.zeros(num_digits)
                    for int_class in range(num_digits):
                        preds[int_class] = np.dot(w[int_class], X_test[k])
                    Y_pred[k] = np.argmax(preds)

                # Test
                acc.append(zero_one_loss(Y_test, Y_pred))

    return acc
    def train(self, train_data, tr_lab=None):
        """
        Method that performs training. It compares the clustering labels on training set
        (i.e., A(X) computed by :class:`reval.relative_validation.RelativeValidation.clust_method`) against
        the labels obtained from the classification algorithm
        (i.e., f(X), computed by :class:`reval.relative_validation.RelativeValidation.class_method`).
        It returns the misclassification error, the supervised model fitted to the data,
        and both clustering and classification labels.

        :param train_data: training dataset.
        :type train_data: ndarray, (n_samples, n_features)
        :param tr_lab: cluster labels found during CV for clustering methods with no `n_clusters` parameter.
            If not None the clustering method is not performed on the whole test set. Default None.
        :type tr_lab: list
        :return: misclassification error, fitted supervised model object, clustering and classification labels.
        :rtype: float, object, ndarray (n_samples,)
        """
        if tr_lab is None:
            clustlab_tr = self.clust_method.fit_predict(train_data)  # A_k(X)
        else:
            clustlab_tr = tr_lab
        if len([cl for cl in clustlab_tr if cl >= 0]) == 0:
            logging.info(
                f"No clusters found during training with {self.clust_method}.")
            return None
        fitclass_tr = self.class_method.fit(train_data, clustlab_tr)
        classlab_tr = fitclass_tr.predict(train_data)
        misclass = zero_one_loss(clustlab_tr, classlab_tr)
        return misclass, fitclass_tr, clustlab_tr
Example #19
def inline(inputfile, outputfile):
    # data = np.loadtxt(sys.stdin)
    data = np.loadtxt(inputfile, delimiter=',')
    if np.ndim(data) == 1:
        data = np.array([data])
    train_x = data[:, 1:]
    train_y = data[:, 0]

    candidate_size = 1000
    evaluation_size = 1000
    x, y = make_classification(n_samples=candidate_size + evaluation_size,
                               n_features=2,
                               n_informative=1,
                               n_redundant=1,
                               n_clusters_per_class=1,
                               random_state=37)
    eval_x = x[candidate_size:]
    eval_y = y[candidate_size:]

    learner = KNeighborsClassifier(n_neighbors=1)
    learner = learner.fit(train_x, train_y)
    pred_y = learner.predict(eval_x)
    with open(outputfile, 'w') as f:
        l = zero_one_loss(eval_y, pred_y)
        f.write(str(l))
Example #20
	def __train_with_cross_validation(self):
		"""
		This method enables sk-learn algorithms to perform KFold-cross-validation.
		The method also initiates the cnvrg experiment with all its metrics.
		"""
		scores = cross_validate(estimator=self.__model,
								X=self.__x_train.values,
								y=self.__y_train.values,
								cv=self.__cross_val_folds,
								return_train_score=True,
								scoring=['neg_mean_squared_error', 'accuracy'],
								return_estimator=True)

		train_acc_cv = scores['train_accuracy']
		train_err_cv = (-1) * scores['train_neg_mean_squared_error']
		val_acc_cv = scores['test_accuracy']
		val_err_cv = (-1) * scores['test_neg_mean_squared_error']
		self.__model = scores['estimator'][-1]

		y_pred = self.__model.predict(self.__x_test.values)
		test_acc = accuracy_score(self.__y_test.values, y_pred)
		test_loss = zero_one_loss(self.__y_test.values, y_pred)
		self.__metrics.update({
			'train_acc': train_acc_cv,
			'train_loss': train_err_cv,
			'train_loss_type': 'MSE',
			'validation_acc': val_acc_cv,
			'validation_loss': val_err_cv,
			'validation_loss_type': 'MSE',
			'test_acc': test_acc,
			'test_loss': test_loss,
			'test_loss_type': 'zero_one_loss'
		})
		self.__plot_all(y_pred)
    def test(self, test_data, fit_model):
        """
        Method that compares test set clustering labels (i.e., A(X'), computed by
        :class:`reval.relative_validation.RelativeValidation.clust_method`) against
        the (permuted) labels obtained through the classification algorithm fitted to the training set
        (i.e., f(X'), computed by
        :class:`reval.relative_validation.RelativeValidation.class_method`).
        It returns the misclassification error, together with
        both clustering and classification labels.

        :param test_data: test dataset.
        :type test_data: ndarray, (n_samples, n_features)
        :param fit_model: fitted supervised model.
        :type fit_model: class
        :return: misclassification error, clustering and classification labels.
        :rtype: float, dictionary of ndarrays (n_samples,)
        """
        clustlab_ts = self.clust_method.fit_predict(test_data)  # A_k(X')
        if len([cl for cl in clustlab_ts if cl >= 0]) == 0:
            logging.info(
                f"No clusters found during testing with {self.clust_method}")
            return None
        classlab_ts = fit_model.predict(test_data)
        bestperm = kuhn_munkres_algorithm(classlab_ts,
                                          clustlab_ts)  # array of integers
        misclass = zero_one_loss(classlab_ts, bestperm)
        return misclass, bestperm
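
The test method permutes the cluster labels with kuhn_munkres_algorithm before scoring, because cluster IDs are arbitrary and have to be matched to the classifier's labels before zero_one_loss is meaningful. Below is a minimal sketch of that matching step using SciPy's Hungarian-algorithm routine; the helper and toy labels are illustrative, not the reval API.

import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import zero_one_loss

def best_permutation(reference, cluster_labels):
    """Relabel cluster_labels so that they agree with reference as much as possible."""
    classes = np.unique(np.concatenate([reference, cluster_labels]))
    # agreement[i, j]: how often cluster label classes[j] co-occurs with reference label classes[i]
    agreement = np.zeros((classes.size, classes.size), dtype=int)
    for i, ci in enumerate(classes):
        for j, cj in enumerate(classes):
            agreement[i, j] = np.sum((reference == ci) & (cluster_labels == cj))
    # Hungarian algorithm: maximise total agreement (minimise its negative).
    rows, cols = linear_sum_assignment(-agreement)
    mapping = {classes[j]: classes[i] for i, j in zip(rows, cols)}
    return np.array([mapping[c] for c in cluster_labels])

classlab = np.array([0, 0, 1, 1, 2, 2])   # f(X'): classifier predictions
clustlab = np.array([2, 2, 0, 0, 1, 1])   # A_k(X'): arbitrary cluster IDs
print(zero_one_loss(classlab, best_permutation(classlab, clustlab)))  # 0.0
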
Example #22
def nb(X_train, y_train, X_test, y_test):

    # Tune the hyperparameter
    maxScore = float("-inf")
    maxA = 0
    for a in np.arange(0.1, 1, 0.1):
        clf = MultinomialNB(alpha=a).fit(X_train, y_train)
        scores = cross_val_score(clf, X_train, y_train, cv=5)
        mean = np.mean(scores)
        print("A: %f and Score: %f" % (a, mean))
        if mean > maxScore:
            maxScore = mean
            maxA = a

    # Train the model
    print("MaxA: %f" % maxA)
    print("MaxScore: %f" % maxScore)
    clf = MultinomialNB(alpha=maxA).fit(X_train, y_train)

    # Predict labels for the test data
    pred = clf.predict(X_test)
    pred_prob = clf.predict_proba(X_test)

    # Calculate the misclassification rate
    mc_rate = zero_one_loss(y_test, pred)
    print("MC rate: %f" % mc_rate)

    # Calculate the ROC curve
    prob = pred_prob[:, 1:]
    roc_score = roc_auc_score(y_test, prob)
    print("ROC score: %f" % roc_score)

    return (mc_rate, roc_score)
Example #23
def rf(X_train, y_train, X_test, y_test):

    # Tune the hyperparameter
    maxScore = float("-inf")
    maxN = 0
    for n in np.arange(100, 600, 100):
        clf = RandomForestClassifier(n_estimators=n).fit(X_train, y_train)
        scores = cross_val_score(clf, X_train, y_train, cv=5)
        mean = np.mean(scores)
        print("N: %f and Score: %f" % (n, mean))
        if mean > maxScore:
            maxScore = mean
            maxN = n

    # Train the model
    print("MaxN: %f" % maxN)
    print("MaxScore: %f" % maxScore)
    clf = RandomForestClassifier(n_estimators=maxN).fit(X_train, y_train)

    # Predict labels for the test data
    pred = clf.predict(X_test)
    pred_prob = clf.predict_proba(X_test)

    # Calculate the misclassification rate
    mc_rate = zero_one_loss(y_test, pred)
    print("MC rate: %f" % mc_rate)

    # Calculate the ROC curve
    prob = pred_prob[:, 1:]
    roc_score = roc_auc_score(y_test, prob)
    curve = roc_curve(y_test, prob)
    print("ROC score: %f" % roc_score)

    return (mc_rate, roc_score, curve)
Example #24
def cross_valid(h, y, ratio_list):
    """
    cross validation to tune the best cap probability for soft-margin boosting
    """
    print " find optimal ratio"
    n_samples = h.shape[0]
    n_folds = 4
    ntr = n_samples/n_folds
    ratio_list = ratio_list[ratio_list >= 1.0/ntr]
    kf = cv.KFold(n=n_samples, n_folds=n_folds)
    err_tr = np.zeros((n_folds, len(ratio_list)))
    err_te = np.zeros((n_folds, len(ratio_list)))
    k = 0
    for tr_ind, te_ind in kf:
        print "nfold: %d" % (k)
        xtr, ytr, xte, yte = h[tr_ind, :], y[tr_ind], h[te_ind, :], y[te_ind]
        for i, r in enumerate(ratio_list):
            pd = ParaBoost(epsi=0.005, has_dcap=True, ratio=r)
            pd.train(xtr, ytr)
            pred = pd.test_h(xte)
            err_te[k, i] = zero_one_loss(y_true=yte, y_pred=pred)
            err_tr[k, i] = pd.err_tr[-1]
        k += 1
    err_te_avg = np.mean(err_te, axis=0)
    err_tr_avg = np.mean(err_tr, axis=0)
    arg = np.argmin(err_te_avg)
    best_ratio = ratio_list[arg]
    err = err_te_avg[arg]
    return best_ratio
Example #25
def clf_bias_var(clf, X, y, n_replicas):
        
    roc_auc_scorer = get_scorer("roc_auc")
    # roc_auc_scorer(clf, X_test, y_test)
    auc_scores = []
    error_scores = []
    counts = np.zeros(X.shape[0], dtype = np.float64)
    sum_preds = np.zeros(X.shape[0], dtype = np.float64)
    for it in xrange(n_replicas):
        # generate train sets and test sets
        train_indices = np.random.randint(X.shape[0], size = X.shape[0])
        # get test sets
        in_train = np.unique(train_indices)
        mask = np.ones(X.shape[0], dtype = np.bool)
        mask[in_train] = False
        test_indices = np.arange(X.shape[0])[mask]
        
        clf.fit(X[train_indices], y[train_indices])
        
        auc_scores.append(roc_auc_scorer(clf, X[test_indices], y[test_indices]))
        error_scores.append(zero_one_loss(y[test_indices], clf.predict(X[test_indices])))
        
        preds = clf.predict(X)
        for index in test_indices:
            counts[index] += 1
            sum_preds[index] += preds[index]
    
    test_mask = (counts > 0) # indices of samples that have been tested
    
    # print('counts mean: {}'.format(np.mean(counts)))
    # print('counts standard derivation: {}'.format(np.std(counts)))
    
    bias, var = bias_var(y[test_mask], sum_preds[test_mask], counts[test_mask], n_replicas)
    
    return auc_scores, error_scores, bias, var
def exercise_1():
    X, y = make_blobs(n_samples=1000,centers=50, n_features=2, random_state=0)
    n_samples = len(X)
    kf = cross_validation.KFold(n_samples, n_folds=10, shuffle=False, random_state=None)
    # kf = cross_validation.ShuffleSplit(1000,n_iter=25, test_size=0.1, train_size=0.9, random_state=None)

    error_total = np.zeros([49, 1], dtype=float)
    for k in range(1,50):
        error = []
        clf = KNeighborsClassifier(n_neighbors=k)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.fit(X_train, y_train)
            error.append( zero_one_loss(y_test, clf.predict(X_test)) )


            # error.append(clf.predict(X_test))
            # error.append( 1. - clf.score(X_test, y_test) ) #, accuracy_score(y_test, clf.predict(X_test))
            # error.append(mean_squared_error(y_test, clf.predict(X_test)))
            # error.append()
        # print error
        error_total[k-1, 0] = np.array(error).mean()
    # print error_total
    x = np.arange(1,50, dtype=int)
    plt.style.use('ggplot')
    plt.plot(x, error_total[:, 0], '#009999', marker='o')
    # plt.errorbar(x, accuracy_lst[:, 0], accuracy_lst[:, 1], linestyle='None', marker='^')
    plt.xticks(x, x)
    plt.margins(0.02)
    plt.xlabel('K values')
    plt.ylabel('Misclassification Error')
    plt.show()
def exercise_2():
    #connect to openml api
    apikey = 'ca2397ea8a2cdd9707ef39d76576e786'
    connector = APIConnector(apikey=apikey)
    dataset = connector.download_dataset(44)
    X, y, attribute_names = dataset.get_dataset(target=dataset.default_target_attribute, return_attribute_names=True)

    kf = cross_validation.KFold(len(X), n_folds=10, shuffle=False, random_state=0)
    error = []
    error_mean = []
    lst = [int(math.pow(2, i)) for i in range(0, 8)]
    clf = RandomForestClassifier(oob_score=True,
                                   max_features="auto",
                                   random_state=0)
    for i in lst:
        error_mean = []
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.set_params(n_estimators=i)
            clf.fit(X_train, y_train)
            error_mean.append( zero_one_loss(y_test, clf.predict(X_test)) )
        error.append( np.array(error_mean).mean() )
    #plot
    plt.style.use('ggplot')
    plt.plot(lst, error, '#009999', marker='o')
    plt.xticks(lst)
    plt.show()
Example #28
def test_sample_order_invariance():
    y_true, y_pred, _ = make_prediction(binary=True)

    y_true_shuffle, y_pred_shuffle = shuffle(y_true, y_pred,
                                             random_state=0)

    for metric in [accuracy_score,
                   hamming_loss,
                   zero_one_loss,
                   lambda y1, y2: zero_one_loss(y1, y2, normalize=False),
                   precision_score,
                   recall_score,
                   f1_score,
                   lambda y1, y2: fbeta_score(y1, y2, beta=2),
                   lambda y1, y2: fbeta_score(y1, y2, beta=0.5),
                   matthews_corrcoef,
                   mean_absolute_error,
                   mean_squared_error,
                   explained_variance_score,
                   r2_score]:

        assert_almost_equal(metric(y_true, y_pred),
                            metric(y_true_shuffle, y_pred_shuffle),
                            err_msg="%s is not sample order invariant"
                                    % metric)
	def fit(self, data, target):

		no_of_stages = self.no_of_stages	
		decision_stump = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=1, max_features=1)
		#No. of samples
		m = data.shape[0]
		weight = numpy.ones(m)
		weight = numpy.float32(weight)/m

		Alpha = numpy.zeros(no_of_stages)
		classifiers = []
		for i in range(no_of_stages):
			decision_stump = decision_stump.fit(data, target, sample_weight = weight)
			classifiers.append(decision_stump)
			pred = decision_stump.predict(data)
			error = zero_one_loss(target, pred, normalize=True, sample_weight = weight)

			if error > 0.5:
				print 'error value is greater than 0.5!'

			beta = error/(1-error)
			if beta != 0: 
				weight[pred == target] = weight[pred==target]*beta
				weight = weight / weight.sum()
			print weight
			# beta_mat = (pred==target)*beta
			# beta_mat[beta_mat==0] = 1
			# weight = numpy.multiply(weight, beta_mat)
			if beta > 0:
				alpha = math.log(1/beta) 
			else:
				alpha = 10000 # make alpha extremly large if decision stump is totally correct.
			Alpha[i] = alpha
		self.Alpha = Alpha
		self.classifiers = classifiers
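
In the loop above, the boosting weights are passed to zero_one_loss through sample_weight, so each stage's error is the weighted misclassification rate rather than the plain fraction of mistakes. A minimal sketch of that behaviour on toy data:

import numpy as np
from sklearn.metrics import zero_one_loss

y_true = np.array([1, 1, 0, 0])
y_pred = np.array([1, 0, 0, 1])          # samples 1 and 3 are misclassified
weight = np.array([0.7, 0.1, 0.1, 0.1])  # boosting weights, summing to 1

print(zero_one_loss(y_true, y_pred))                        # 0.5, unweighted
print(zero_one_loss(y_true, y_pred, sample_weight=weight))  # 0.2, the weight sitting on the mistakes
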
Example #30
File: imgur.py  Project: meganlshao/Imgur
def run_ordinal_regression(X_train, y_train, X_test, y_test, ordinal_regression_model):
    print("Running ordinal regression with multiclass labels...")
    ordinal_regression_clf = ordinal_regression_model(alpha=ALPHA, max_iter=MAX_ITER)
    ordinal_regression_clf.fit(X_train, y_train)

    y_pred = ordinal_regression_clf.predict(X_train)
    training_err = metrics.zero_one_loss(y_train, y_pred, normalize=False)
    print("%.4f = Training accuracy for ordinal regression with multiclass labels" % 
            (float(len(y_train) - training_err) / len(y_train)))

    y_pred = ordinal_regression_clf.predict(X_test)
    test_err = metrics.zero_one_loss(y_test, y_pred, normalize=False)
    print("%.4f = Test accuracy for ordinal regression with multiclass labels" % 
            (float(len(y_test) - test_err) / len(y_test)))

    return float(len(y_test) - test_err) / len(y_test)
Example #31
File: run.py  Project: Mbaroudi/junk
def apply_dbn(files, main_driver=1):
    """
    Applies DBN for identifying trips which are not from the driver of interest
    """
    (X_train, Y_train, weight, X, driver_trip_arr) = \
        get_train_data(files, main_driver)
    a = np.empty(shape=[0, 2])

    net = DBN([len(COL), 10, 2],
              learn_rates=0.3,
              learn_rate_decays=0.9,
              epochs=10,
              verbose=0)
    net.fit(X_train, Y_train)

    Y_dbn = net.predict(X_train)
    print main_driver, ':', 1 - zero_one_loss(Y_train, Y_dbn)
    # print "Classification report:"
    # print classification_report(Y_train, preds)

    i = 0
    Y = net.predict(X)
    for y in Y:
        driver_trip = driver_trip_arr[i][0]
        prob = str(int(Y[i]))
        a = np.append(a, np.array([[driver_trip, prob]]), axis=0)
        i = i + 1

    print main_driver, ': ', sum([1 for p in a if p[1] == '1'])

    return a
def compare_manual_vs_model():

    with open(DATA_FOLDER + "labels_int.p", "r") as f:
        y_dict = pickle.load(f)

    print "Loading test data"
    X_test, y_test, filenames_test = dataset.load_test()
    y_pred = joblib.load("../models/pred_ml_improved.pkl")

    relevant = []
    for pred, correct, filename in zip(y_pred, y_test, filenames_test):
        if filename in FILES:
            relevant.append((pred, correct, filename, CLASSIFICATIONS[filename]))

    model_predictions, correct, filename, manual_predictions = zip(*relevant)
    manual_predictions = learn.multilabel_binary_y(manual_predictions)
    model_predictions = np.array(model_predictions)
    correct = learn.multilabel_binary_y(correct)

    rules = infer_topology.infer_topology_rules()
    improved_manual = infer_topology.apply_topology_rules(rules, manual_predictions)

    prediction_names = ["MODEL", "MANUAL", "IMPROVED_MANUAL"]
    predictions = [model_predictions, manual_predictions, improved_manual]

    for name, pred in zip(prediction_names, predictions):

        print "\n{}\n--".format(name)
        print "Zero-one classification loss", zero_one_loss(correct, pred)
        print "Hamming loss", hamming_loss(correct, pred)
        print "Precision:", precision_score(correct, pred, average="weighted", labels=label_list)
        print "Recall   :", recall_score(correct, pred, average="weighted", labels=label_list)
        print "F1 score :", f1_score(correct, pred, average="weighted", labels=label_list)
Example #33
File: fraud.py  Project: ZeerakW/statsml
 def classify(self, model, test_y, test_x):
     pred = model.predict(test_x)
     if not self.multi:
         rec, spec, acc = self.score(pred, test_y)
         return rec, spec, acc
     else:
         return 1 - zero_one_loss(test_y, pred)
Example #34
def use_sklearn_ml_knn():
    """

    :return:
    """

    base_path = os.getcwd()
    # train_x = np.load(os.path.join(base_path, 'dataset/train_x.npy'), allow_pickle=True)
    # train_y = np.load(os.path.join(base_path, 'dataset/train_y.npy'), allow_pickle=True)

    train_x = np.load(os.path.join(base_path, 'my_dataset/train_x.npy'),
                      allow_pickle=True)
    train_y = np.load(os.path.join(base_path, 'my_dataset/train_y.npy'),
                      allow_pickle=True)

    new_train_y = []
    for tup in train_y:
        tmp = []
        for label in tup:
            if label == 0:
                tmp.append(0)
            else:
                tmp.append(1)
        new_train_y.append(tmp)

    # test_x = np.load('dataset/test_x.npy', allow_pickle=True)
    # test_y = np.load('dataset/test_y.npy', allow_pickle=True)

    test_x = np.load('my_dataset/test_x.npy', allow_pickle=True)
    test_y = np.load('my_dataset/test_y.npy', allow_pickle=True)
    new_test_y = []
    for tup in test_y:
        tmp = []
        for label in tup:
            if label == 0:
                tmp.append(0)
            else:
                tmp.append(1)
        new_test_y.append(tmp)

    new_test_y = np.array(new_test_y)

    classifier = MLkNN2(train_x, np.array(new_train_y), k=10)

    # classifier.fit(train_x, np.array(new_train_y))
    classifier.fit()
    predictions = classifier.predict(test_x)
    predictions = convert_prediction(predictions)

    # hamming_loss = HammingLoss(new_test_y, predictions)
    h_loss = hamming_loss(new_test_y, predictions)
    z = zero_one_loss(new_test_y, predictions)
    c = coverage_error(new_test_y, predictions)
    r = label_ranking_loss(new_test_y, predictions)
    a = average_precision_score(new_test_y, predictions)
    print('hamming_loss = ', h_loss)
    print('0-1_loss = ', z)
    print('cover_loss = ', c)
    print('rank_loss = ', r)
    print('average_loss = ', a)
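
Note that for multilabel indicator arrays like new_test_y above, zero_one_loss only credits a sample when its whole label vector matches exactly (it is the complement of subset accuracy), which is why it is usually much larger than hamming_loss. A minimal sketch:

import numpy as np
from sklearn.metrics import hamming_loss, zero_one_loss

y_true = np.array([[1, 0, 1],
                   [0, 1, 0]])
y_pred = np.array([[1, 0, 0],   # one label wrong -> the whole sample counts as an error
                   [0, 1, 0]])  # exact match

print(zero_one_loss(y_true, y_pred))  # 0.5: one of the two samples is not an exact match
print(hamming_loss(y_true, y_pred))   # ~0.167: one of the six individual labels is wrong
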
Example #35
    def modelevaluation(self, y_test, y_pred, features, ml):
        '''
        confusion = metrics.confusion_matrix(y_test, y_pred)
        print("Confussion matrix: \n", confusion)
        TP = confusion[1, 1]
        TN = confusion[0, 0]
        FP = confusion[0, 1]
        FN = confusion[1, 0]
        '''
        data = {}
        data[features] = []
        print('----------REPORT-----------')
        print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
        print("Balanced Accuracy: ",
              metrics.balanced_accuracy_score(
                  y_test, y_pred,
                  sample_weight=None))  #Average of label accuracies
        print("Precision: ", metrics.precision_score(y_test, y_pred))
        print("Recall: ", metrics.recall_score(y_test, y_pred))
        print("F1 score macro: ",
              metrics.f1_score(y_test, y_pred, average='macro'))
        print("F1 score micro: ",
              metrics.f1_score(y_test, y_pred, average='micro'))
        print("F-Beta score: ", metrics.fbeta_score(y_test, y_pred, beta=10))
        print("AUC Score: ", metrics.roc_auc_score(y_test, y_pred))
        print("Zero_one_loss", metrics.zero_one_loss(y_test, y_pred))
        print(
            "Matthews_corrcoef", metrics.matthews_corrcoef(y_test, y_pred)
        )  #Gives equal weight to all TP, TN, FP, FN (Better than F1-score)
        print(
            "Brier score: ", metrics.brier_score_loss(y_test, y_pred)
        )  #The Brier score is calculated as the mean squared error between the expected probabilities for the positive class (e.g. 1.0) and the predicted probabilities. (Better than log_loss)
        print(
            "Cohen keppa score: ", metrics.cohen_kappa_score(y_test, y_pred)
        )  #It basically tells you how much better your classifier is performing over the performance of a classifier that simply guesses at random according to the frequency of each class.
        print("Classification_report\n",
              metrics.classification_report(y_test, y_pred, output_dict=True))
        print('----------REPORT-----------')

        with open('evaluations/model_evaluation.json', 'r+') as opened_file:
            current_json = json.load(opened_file)
            current_json[features] = {
                'model':
                '' + ml,
                'accuracy':
                metrics.accuracy_score(y_test, y_pred),
                'fraud_precision':
                metrics.classification_report(
                    y_test, y_pred, output_dict=True)['1']['precision'],
                'fraud_recall':
                metrics.classification_report(y_test, y_pred,
                                              output_dict=True)['1']['recall'],
                'fraud_f1_score':
                metrics.classification_report(
                    y_test, y_pred, output_dict=True)['1']['f1-score']
            }
            opened_file.seek(0)
            opened_file.truncate(0)
            json.dump(current_json, opened_file)
Example #36
def main():
    test_label = [1]*100 + [2]*100 + [3]*100 + [4]*100 + [5]*100 + \
               [6]*100 + [7]*100 + [8]*100

    filename = sys.argv[1]

    with open(filename, 'rU') as f:
        pred = [rec for rec in csv.reader(f, delimiter=',')]
    pred = sum(pred, [])
    pred = [int(x) for x in pred]
    print zero_one_loss(pred, test_label)
    cm = confusion_matrix(test_label, pred, labels=[1, 2, 3, 4, 5, 6, 7, 8])
    np.set_printoptions(precision=2)
    fig = plt.figure()
    fig.patch.set_facecolor('white')
    plot_confusion_matrix(cm)
    plt.show()
Example #37
File: statics.py  Project: Jiang1Xue/LPFS
def compute_evaluation(true_matrix, predict_matrix):
    h = hamming_loss(true_matrix, predict_matrix)
    z = zero_one_loss(true_matrix, predict_matrix)
    c = coverage_error(true_matrix, predict_matrix)

    result = [h, z, c]

    return result
Example #38
def test_logitboost_hastie_fitting():
    c = LogitBoostClassifier(base_estimator=DecisionTreeRegressor(max_depth=1),
                             n_estimators=30,
                             learning_rate=1.0)
    data = Hastie_10_2()
    c.fit(data.data, np.sign(data.labels))
    assert_array_less(c.estimator_errors_, 0.5)
    assert zero_one_loss(np.sign(data.labels), c.predict(data.data)) < 0.2
Example #39
def sklearnSvm():
        svc = SVC()
        score = make_scorer(zero_one_loss,greater_is_better=False)
        clf = GridSearchCV(svc, tuned_parameters, scoring = score,cv = 5)

        clf.fit(data_Train_feature,data_Train_label)

        test_true, test_predict = data_Test_label, clf.predict(data_Test_feature)
        train_true, train_predict = data_Train_label, clf.predict(data_Train_feature)

        err_Train = zero_one_loss(train_true, train_predict)
        err_Test = zero_one_loss(test_true, test_predict)

        print(clf.best_params_)
        print(classification_report(test_true, test_predict))
        print(err_Train)
        print(err_Test)
Example #40
def kNN(pickle_file):

    fin = open(pickle_file, "rb")
    train, test = pickle.load(fin)
    X_tr, y_tr = train
    X_te, y_te = test

    n = [1,3,5,7,9]

    figure = plt.figure()
    for spot in range(len(n)):
        knc = KNeighborsClassifier(n_neighbors = n[spot])
        knc.fit(X_tr, y_tr)
        predicted_tr = (knc.predict(X_tr))
        predicted_te = (knc.predict(X_te))

        axis = figure.add_subplot(1,5,spot+1)

        xtr_1 = []
        xtr_2 = []

        for pair in X_tr:
            xtr_1.append(pair[0])
            xtr_2.append(pair[1])

        xte_1 = []
        xte_2 = []

        for pair in X_te:
            xte_1.append(pair[0])
            xte_2.append(pair[1])

        colors = ListedColormap(['#FF0000', '#0000FF'])

        axis.scatter(xtr_1,xtr_2, c = y_tr, cmap = colors, edgecolors = 'k')
        axis.scatter(xte_1,xte_2, marker="*", c = y_te, cmap = colors, edgecolors = 'k')
        x1min, x1max, x2min, x2max = helpers.get_bounds(X_tr)
        helpers.plot_decision_boundary(axis, knc, x1min, x1max, x2min, x2max)
        axis.set_title("n_neighbors = " + str(n[spot]))

        tr_loss = round(zero_one_loss(y_tr,predicted_tr),2)
        te_loss = round(zero_one_loss(y_te,predicted_te),2)

        axis.set_xlabel("Tr loss: " + str(tr_loss)+"\n Te loss: " + str(te_loss))

    plt.show()
Example #41
def drawLearningCurve(model, x_train, y_train, x_test, y_test, num_points = 50):
    # adapted from http://sachithdhanushka.blogspot.de/2013/09/learning-curve-generator-for-learning.html
    
    train_error = np.zeros(num_points)
    crossval_error = np.zeros(num_points)
    
    #Fix a based array that has an entry from both classes
    baseitem0 = list(y_train).index(0)
    xbase = x_train[baseitem0,:]
    ybase = [y_train[baseitem0]]
    baseitem1 = list(y_train).index(1)
    xbase = np.vstack((xbase, x_train[baseitem1,:]))
    ybase = np.append(ybase,y_train[baseitem1])
    #ybase = np.vstack((ybase, y_train[baseitem1]))
    
    x_train = np.delete(x_train, (baseitem0), axis=0)
    x_train = np.delete(x_train, (baseitem1), axis=0)
    y_train = np.delete(y_train, (baseitem0), axis=0)
    y_train = np.delete(y_train, (baseitem1), axis=0)
    
    sizes = np.linspace(1, len(x_train), num=num_points).astype(int)
    for i,size in enumerate(sizes):
        #getting the predicted results of the model
        xvals = np.vstack((xbase, x_train[:size,:]))
        yvals = np.append(ybase,y_train[:size])
        model.fit(xvals, yvals)
         
        #compute the validation error
        y_pred = model.predict(x_test[:size])
        crossval_error[i] = zero_one_loss(y_test[:size], y_pred, normalize=True)
         
        #compute the training error
        y_pred = model.predict(x_train[:size])
        train_error[i] = zero_one_loss(y_train[:size], y_pred, normalize=True)

    #draw the plot
    print crossval_error
    print train_error
    fig,ax = plt.subplots()
    ax.plot(sizes+1,crossval_error,lw = 2, label='cross validation error')
    ax.plot(sizes+1,train_error, lw = 4, label='training error')
    ax.set_xlabel('training set size')
    ax.set_ylabel('zero-one error')
    ax.legend(loc = 0)
    ax.set_title('Learning Curve' )
    return fig
def train_svm(kernels=None, labels=None):
    if kernels is None:
        trn_k, trn_y = load_svmlight_file(
            'dns_data_kernel/trn_kernel_mat.svmlight')
        val_k, val_y = load_svmlight_file(
            'dns_data_kernel/val_kernel_mat.svmlight')
        tst_k, tst_y = load_svmlight_file(
            'dns_data_kernel/tst_kernel_mat.svmlight')

        trn_k = trn_k.todense()
        val_k = val_k.todense()
        tst_k = tst_k.todense()
    else:
        trn_k, trn_y = kernels[0], labels[0]
        val_k, val_y = kernels[1], labels[1]
        tst_k, tst_y = kernels[2], labels[2]

    pred = dict()

    C = [0.01, 0.1, 1, 10, 100]

    val_errs = []
    for c in C:
        m = svm.SVC(kernel='precomputed', C=c)
        m.fit(trn_k, trn_y)

        trn_label = m.predict(trn_k)
        val_label = m.predict(val_k)

        trn_err = zero_one_loss(trn_label, trn_y)
        val_err = zero_one_loss(val_label, val_y)

        pred[c] = [trn_err, val_err, sum(m.n_support_)]
        val_errs.append(val_err)

    opt_c = C[val_errs.index(min(val_errs))]
    m = svm.SVC(kernel='precomputed', C=opt_c)
    m.fit(trn_k, trn_y)

    tst_label = m.predict(tst_k)

    tst_err = zero_one_loss(tst_label, tst_y)

    print("Test Error: {0:.2%}".format(tst_err))

    return pred
Example #43
def runTests(X_train, X_test, y_train, y_test):
    zeroSums = np.zeros((13))
    count = 0
    #Knn
    neighborList = [1, 3, 5, 7, 9]
    for value in neighborList:
        kNeighbors = KNeighborsClassifier(n_neighbors=value)
        kNeighbors.fit(X_train, y_train)
        predTest = kNeighbors.predict(X_test)

        zeroSums[count] = zero_one_loss(y_test, predTest)
        count += 1
    #dTree
    depthList = [1, 2, 3, 4, None]
    for depth in depthList:
        dTree = DecisionTreeClassifier(max_depth=depth)
        dTree.fit(X_train, y_train)
        predTest = dTree.predict(X_test)

        zeroSums[count] = zero_one_loss(y_test, predTest)
        count += 1
    #svms
    linSVM = SVC(kernel='linear')
    linSVM.fit(X_train, y_train)
    predTest = linSVM.predict(X_test)

    zeroSums[count] = zero_one_loss(y_test, predTest)
    count += 1

    rbfSVM = SVC(kernel='rbf')
    rbfSVM.fit(X_train, y_train)
    predTest = rbfSVM.predict(X_test)

    zeroSums[count] = zero_one_loss(y_test, predTest)
    count += 1

    polySVM = SVC(kernel='poly', degree=3)
    polySVM.fit(X_train, y_train)
    predTest = polySVM.predict(X_test)

    zeroSums[count] = zero_one_loss(y_test, predTest)
    count += 1

    #print(zeroSums)
    return zeroSums
Example #44
def benchmark(clf):
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    err = metrics.zero_one_loss(y_test, pred, normalize=True)
    return err, train_time, test_time
def test_group_zero_one_loss_unnormalized():
    result = metrics.group_zero_one_loss(Y_true,
                                         Y_pred,
                                         groups,
                                         normalize=False)

    expected_overall = skm.zero_one_loss(Y_true, Y_pred, False)

    assert result.overall == expected_overall
Example #46
def basic(scheduler_address, backends):
    ESTIMATORS = {
        'RandomForest': RandomForestClassifier(n_estimators=100),
        'ExtraTreesClassifier': ExtraTreesClassifier(n_estimators=100)
    }

    X_train, X_test, y_train, y_test = load_data()
    print_data(X_train, y_train, X_test, y_test)

    BACKENDS = build_backends(backends, scheduler_address, X_train, y_train)

    print("Training Classifiers")
    print("====================")
    error, train_time, test_time = {}, {}, {}
    for est_name, estimator in sorted(ESTIMATORS.items()):
        for backend, backend_kwargs in BACKENDS:
            print("Training %s with %s backend... " % (est_name, backend),
                  end="")
            estimator_params = estimator.get_params()

            estimator.set_params(
                **{
                    p: RANDOM_STATE
                    for p in estimator_params if p.endswith("random_state")
                })

            if "n_jobs" in estimator_params:
                estimator.set_params(n_jobs=-1)

            # Key for the results
            name = '%s, %s' % (est_name, backend)

            with parallel_backend(backend, **backend_kwargs):
                time_start = time()
                estimator.fit(X_train, y_train)
                train_time[name] = time() - time_start

            time_start = time()
            y_pred = estimator.predict(X_test)
            test_time[name] = time() - time_start

            error[name] = zero_one_loss(y_test, y_pred)

            print("done")

    print()
    print("Classification performance:")
    print("===========================")
    print("%s %s %s %s" %
          ("Classifier  ", "train-time", "test-time", "error-rate"))
    print("-" * 44)
    for name in sorted(error, key=error.get):
        print("%s %s %s %s" % (name, ("%.4fs" % train_time[name]),
                               ("%.4fs" % test_time[name]),
                               ("%.4f" % error[name])))

    print()
Example #47
0
def test_prediction(filename, features, target):
    '''
    Given the filename of a pickled estimator, unpickle the estimator, run the
    features through it, compare the results with the target, and return the
    results.
    '''
    with open(filename, "rb") as f:
        estimator = pickle.load(f)
    res = estimator.predict_proba(features)[:, 0]
    print("\nResults for {0}: \n".format(filename))
    print("\nRounded by .25\n")
    yrd = round_by(res, .25)
    print(metrics.classification_report(target, yrd))
    print("Misclassification rate: {}\n".format(metrics.zero_one_loss(target, yrd)))
    print("\nRounded by .1\n")
    yrd = round_by(res, .1)
    print(metrics.classification_report(target, yrd))
    print("Misclassification rate: {}\n".format(metrics.zero_one_loss(target, yrd)))
    return res
def votingClassifier():
    print(colored("------Voting Classification-------", 'red'))

    # models
    random_forest = RandomForestClassifier(criterion='entropy',
                                           max_depth=30,
                                           n_estimators=48,
                                           random_state=0)
    clf_lr = LogisticRegression()  # note: defined but not included in the ensemble below
    clf_knn = KNeighborsClassifier(n_neighbors=7)
    # build classifier
    model = VotingClassifier(estimators=[('rf', random_forest),
                                         ('knn', clf_knn)],
                             voting='soft',
                             n_jobs=-1,
                             weights=[2, 1])

    print("Training the Voting classification.......")

    # start timer
    starttime = timeit.default_timer()

    cnn = CondensedNearestNeighbour(random_state=42)  # note: instantiated but never used below

    # train
    model.fit(train_x, train_Y)

    print("The time difference is :", timeit.default_timer() - starttime)

    print("Predicting test data.......")

    # predict
    y_pred = model.predict(test_x)

    # results
    c_matrix = confusion_matrix(test_Y, y_pred)
    error = zero_one_loss(test_Y, y_pred)
    score = accuracy_score(test_Y, y_pred)

    # display results
    print('Confusion Matrix\n---------------------------\n', c_matrix)
    print('---------------------------')
    print("Error: {:.4f}%".format(error * 100))
    print("Accuracy Score: {:.4f}%".format(score * 100))
    print(classification_report(test_Y, y_pred))
    print('per-class accuracy: ', c_matrix.diagonal() / c_matrix.sum(axis=1))

    # Plot non-normalized confusion matrix
    disp = plot_confusion_matrix(model,
                                 test_x,
                                 test_Y,
                                 cmap=plt.cm.Greens,
                                 values_format='.0f',
                                 xticks_rotation='horizontal')
    plt.title("Confusion Matrix for Voting Classifier")

    plt.show()
def benchmark(clf):
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    # zero_one_loss already returns the misclassification rate by default;
    # dividing by the number of predictions again would understate the error
    err = metrics.zero_one_loss(y_test, pred)
    return err, train_time, test_time
Example #50
0
def benchmark(clf):
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    err = metrics.zero_one_loss(y_test, pred)  # already a misclassification rate; no extra division needed
    return err, train_time, test_time
Example #51
0
def compute_error(targets, predictions, binary):
    mse = mean_squared_error(targets, predictions)
    loss = 0
    # fraction of misclassifications
    if binary:
        predictions = np.where(predictions >= 0, 1, -1)
        loss = zero_one_loss(targets, predictions, normalize=True)

    return loss, mse
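# Brief usage sketch (an assumption, not part of the original source): raw scores
# are thresholded at zero into {-1, +1} before the zero-one loss is taken, so
# real-valued model outputs can be compared against true +/-1 labels.
targets_demo = np.array([1, -1, 1, -1])
scores_demo = np.array([0.8, 0.3, -0.2, -0.9])   # raw (unthresholded) model outputs
loss_demo, mse_demo = compute_error(targets_demo, scores_demo, binary=True)
print(loss_demo, mse_demo)  # loss_demo is 0.5: two of the four signs disagree with the targets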
Example #52
0
def compute_evaluation(true_matrix, predict_matrix):
    h = hamming_loss(true_matrix, predict_matrix)
    z = zero_one_loss(true_matrix, predict_matrix)
    c = coverage_error(true_matrix, predict_matrix) - 1
    r = label_ranking_loss(true_matrix, predict_matrix)
    a = average_precision_score(true_matrix, predict_matrix)

    result = [h, z, c, r, a]
    return result
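# Brief usage sketch (an assumption, not part of the original source): with
# multilabel indicator matrices, zero_one_loss here is the subset (exact-match)
# error, and coverage_error is shifted by -1 so a perfect ranking scores 0.
true_demo = [[1, 0, 1],
             [0, 1, 0]]
pred_demo = [[1, 0, 0],
             [0, 1, 0]]
print(compute_evaluation(true_demo, pred_demo))  # [hamming, zero-one, coverage, ranking loss, avg precision]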
Example #53
0
def benchmark(clf):
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    err = metrics.zero_one_loss(y_test, pred, normalize=True)
    return err, train_time, test_time
Example #54
0
def vde(y_true, y_pred):
    """
    Voicing Decision Error
    ----------------------

    Proportion of frames for which an incorrect voiced/unvoiced decision is
    made.
    """
    return zero_one_loss(y_true, y_pred)
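# Brief usage sketch (an assumption, not part of the original source): frame-level
# voiced (1) / unvoiced (0) decisions compared against a reference track.
reference_voicing = [1, 1, 0, 0, 1]
estimated_voicing = [1, 0, 0, 0, 1]
print(vde(reference_voicing, estimated_voicing))  # 0.2 -> one frame in five gets the wrong decision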
Example #55
0
def NB(trainvector, trainlabels, testvector, testlabels):
    from sklearn.naive_bayes import GaussianNB
    Multi = GaussianNB()
    Multi.fit(trainvector, trainlabels)

    error = zero_one_loss(trainlabels,
                          Multi.predict(trainvector),
                          normalize=False)
    errorrate = zero_one_loss(trainlabels, Multi.predict(trainvector))
    accuracy = accuracy_score(testlabels, Multi.predict(testvector))
    #print('No of errors = %d and error rate= %f of the training data' % (error, errorrate))

    errort = zero_one_loss(testlabels,
                           Multi.predict(testvector),
                           normalize=False)
    errorratet = zero_one_loss(testlabels, Multi.predict(testvector))
    #print('No of errors = %d and error rate= %f of the testing data' % (error, errorrate))
    return error, errorrate, errort, errorratet, accuracy
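# Brief usage sketch (an assumption, not part of the original source): a toy
# train/test split on iris shows the five values NB returns (train error count,
# train error rate, test error count, test error rate, test accuracy). It assumes
# zero_one_loss and accuracy_score are imported in this module, as NB itself requires.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
Xtr, Xte, ytr, yte = train_test_split(iris.data, iris.target, random_state=0)
print(NB(Xtr, ytr, Xte, yte))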
Example #56
0
def question2(Wp, Wnp):
    tssSet = [100, 250, 500, 1000, 2000]
    kMeanAvg = np.array([])
    kMeanSD = np.array([])
    sKMeanAvg = np.array([])
    sKMeanSD = np.array([])

    for skm in range(2):
        sphericalKMeans = bool(skm)
        title =  "Spherical KMeans NBC Analysis" if sphericalKMeans else "KMeans NBC Analysis"
        Tp = KMeans(n_clusters=50, n_init=10)
        Tnp = KMeans(n_clusters=50, n_init=10)

        if sphericalKMeans:
            Tp = SphericalKMeans(n_clusters=50, n_init=10)
            Tnp = SphericalKMeans(n_clusters=50, n_init=10)
        print "TP"
        Tp.fit(Wp)
        print "TNP"
        Tnp.fit(Wnp)
        TList = TopicList(Wp, Tp, Wnp, Tnp)
        data = np.transpose(TList.X)
        classifiers = np.append(np.ones(len(data) // 2, dtype=int), np.zeros(len(data) // 2, dtype=int))
        clf = GaussianNB()

        for tss in tssSet:
            print(tss)
            kf = KFold(n_splits=10, shuffle=True)
            zeroOneLoss = np.array([])
            for train, test_indeces in kf.split(data):
                train_indeces = np.random.permutation(train)[:tss]
                test_set = data[test_indeces]
                train_set = data[train_indeces]
                clf.fit(train_set, classifiers[train_indeces])
                y_pred = clf.predict(test_set)
                y_true = classifiers[test_indeces]
                # zero_one_loss returns the fraction of misclassified test points
                zeroOneLoss = np.append(zeroOneLoss, zero_one_loss(y_true, y_pred))
            if sphericalKMeans:
                sKMeanAvg = np.append(sKMeanAvg, np.average(zeroOneLoss))
                sKMeanSD = np.append(sKMeanSD, np.std(zeroOneLoss) / np.sqrt(10))
            else:
                kMeanAvg = np.append(kMeanAvg, np.average(zeroOneLoss))
                kMeanSD = np.append(kMeanSD, np.std(zeroOneLoss) / np.sqrt(10))

    np.savetxt(title + '.csv',
               np.vstack((tssSet, kMeanAvg, kMeanSD, sKMeanAvg, sKMeanSD)),
               delimiter=',')
    fig = plt.figure()
    fig.suptitle(title, fontsize=14, fontweight='bold')
    ax = fig.add_subplot(111)
    ax.set_xlabel('training set sizes')
    ax.set_ylabel('Zero-One Loss')
    ax.plot(tssSet, kMeanAvg, 'r-', tssSet, sKMeanAvg, 'b-')
    ax.errorbar(tssSet, kMeanAvg, yerr=kMeanSD, fmt='ro')
    ax.errorbar(tssSet, sKMeanAvg, yerr=sKMeanSD, fmt='bo')
    ax.axis([50, 2050, 0, 1])
    plt.savefig(title)
    plt.show()
Example #57
0
def test_logitboost_musk_fitting():
    c = LogitBoostClassifier(
            base_estimator=DecisionTreeRegressor(max_depth=1),
            n_estimators=30,
            learning_rate=1.0
    )
    data = MUSK1()
    c.fit(data.data, np.sign(data.labels))
    assert_array_less(c.estimator_errors_, 0.6)
    assert zero_one_loss(np.sign(data.labels), c.predict(data.data)) < 0.05
Example #58
0
def test_gentleboost_hastie_fitting():
    c = GentleBoostClassifier(
        base_estimator=DecisionTreeRegressor(max_depth=1),
        n_estimators=30,
        learning_rate=1.0
    )
    data = Hastie_10_2()
    c.fit(data.data, np.sign(data.labels))
    assert_array_less(c.estimator_errors_, 0.5)
    assert zero_one_loss(np.sign(data.labels), c.predict(data.data)) < 0.2
Example #59
0
def ValAccuracyLoss():
    imagesVal = os.path.join(path_to_model, "macval.txt")
    y_true, y_predict, y_predict_float, runtime_times = ForwardNet(imagesVal, False)
    report = classification_report(y_true, y_predict)
    print("Val Accuracy = ", accuracy_score(y_true, y_predict))
    print("Val Loss = ", zero_one_loss(y_true, y_predict))
    print("Val Loss Cross Entropy = ", CrossEntropyLoss(y_true, y_predict_float))
    print(report)