Example #1
def test_symmetry():
    """Test the symmetry of score and loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)

    # We shouldn't forget any metrics
    assert_equal(set(SYMETRIC_METRICS).union(set(NOT_SYMETRIC_METRICS)),
                 set(ALL_METRICS))

    assert_equal(set(SYMETRIC_METRICS).intersection(set(NOT_SYMETRIC_METRICS)),
                 set([]))

    # Symmetric metric
    for name, metric in SYMETRIC_METRICS.items():
        assert_equal(metric(y_true, y_pred),
                     metric(y_pred, y_true),
                     msg="%s is not symetric" % name)

    # Not symmetric metrics
    for name, metric in NOT_SYMETRIC_METRICS.items():
        assert_true(metric(y_true, y_pred) != metric(y_pred, y_true),
                    msg="%s seems to be symetric" % name)

    # Deprecated metrics
    with warnings.catch_warnings(record=True):
        # Throw deprecated warning
        assert_equal(zero_one(y_true, y_pred),
                     zero_one(y_pred, y_true))

        assert_equal(zero_one(y_true, y_pred, normalize=False),
                     zero_one(y_pred, y_true, normalize=False))

        assert_equal(zero_one_score(y_true, y_pred),
                     zero_one_score(y_pred, y_true))
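In later scikit-learn releases the deprecated zero_one_score exercised here was superseded by accuracy_score. A minimal, self-contained sketch of the property this test checks, namely that accuracy is symmetric in its arguments while precision is not (assumes any reasonably recent scikit-learn):

import numpy as np
from sklearn.metrics import accuracy_score, precision_score

y_true = np.array([0, 1, 1, 0, 1])
y_pred = np.array([0, 1, 0, 0, 1])

# Accuracy is unchanged when the argument order is swapped.
assert accuracy_score(y_true, y_pred) == accuracy_score(y_pred, y_true)

# Precision is not: swapping the arguments changes which entries count as
# predicted positives, so the two calls generally differ.
print(precision_score(y_true, y_pred))  # 1.0
print(precision_score(y_pred, y_true))  # 0.666...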
Example #2
def test_symmetry():
    """Test the symmetry of score and loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)

    # We shouldn't forget any metrics
    assert_equal(set(SYMETRIC_METRICS).union(NOT_SYMETRIC_METRICS,
                                             THRESHOLDED_METRICS),
                 set(ALL_METRICS))

    assert_equal(set(SYMETRIC_METRICS).intersection(set(NOT_SYMETRIC_METRICS)),
                 set([]))

    # Symmetric metric
    for name, metric in SYMETRIC_METRICS.items():
        assert_equal(metric(y_true, y_pred),
                     metric(y_pred, y_true),
                     msg="%s is not symetric" % name)

    # Not symmetric metrics
    for name, metric in NOT_SYMETRIC_METRICS.items():
        assert_true(metric(y_true, y_pred) != metric(y_pred, y_true),
                    msg="%s seems to be symetric" % name)

    # Deprecated metrics
    with warnings.catch_warnings(record=True):
        # Throw deprecated warning
        assert_equal(zero_one(y_true, y_pred),
                     zero_one(y_pred, y_true))

        assert_equal(zero_one(y_true, y_pred, normalize=False),
                     zero_one(y_pred, y_true, normalize=False))

        assert_equal(zero_one_score(y_true, y_pred),
                     zero_one_score(y_pred, y_true))
Example #3
def test_classifiers_classes():
    # test if classifiers can cope with non-consecutive classes
    estimators = all_estimators()
    classifiers = [(name, E) for name, E in estimators
                   if issubclass(E, ClassifierMixin)]
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    y = 2 * y + 1
    # TODO: make work with next line :)
    #y = y.astype(np.str)
    for name, Clf in classifiers:
        if Clf in dont_test or Clf in meta_estimators:
            continue
        if Clf in [MultinomialNB, BernoulliNB]:
            # TODO also test these!
            continue

        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            clf = Clf()
        # fit
        clf.fit(X, y)
        y_pred = clf.predict(X)
        # training set performance
        assert_array_equal(np.unique(y), np.unique(y_pred))
        assert_greater(zero_one_score(y, y_pred), 0.78)
Example #4
def test_classifiers_classes():
    # test if classifiers can cope with non-consecutive classes
    classifiers = all_estimators(type_filter='classifier')
    X, y = make_blobs(random_state=12345)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    y = 2 * y + 1
    # TODO: make work with next line :)
    #y = y.astype(np.str)
    for name, Clf in classifiers:
        if Clf in dont_test:
            continue
        if Clf in [MultinomialNB, BernoulliNB]:
            # TODO also test these!
            continue

        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            clf = Clf()
        # fit
        clf.fit(X, y)
        y_pred = clf.predict(X)
        # training set performance
        assert_array_equal(np.unique(y), np.unique(y_pred))
        assert_greater(zero_one_score(y, y_pred), 0.78,
                       "accuracy of %s not greater than 0.78" % str(Clf))
Example #5
def test_classifiers_classes():
    # test if classifiers can cope with non-consecutive classes
    classifiers = all_estimators(type_filter='classifier')
    X, y = make_blobs(random_state=12345)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    y = 2 * y + 1
    classes = np.unique(y)
    # TODO: make work with next line :)
    #y = y.astype(np.str)
    for name, Clf in classifiers:
        if Clf in dont_test:
            continue
        if Clf in [MultinomialNB, BernoulliNB]:
            # TODO also test these!
            continue

        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            clf = Clf()
        # fit
        clf.fit(X, y)
        y_pred = clf.predict(X)
        # training set performance
        assert_array_equal(np.unique(y), np.unique(y_pred))
        assert_greater(zero_one_score(y, y_pred), 0.78,
                       "accuracy of %s not greater than 0.78" % str(Clf))
        assert_array_equal(
            clf.classes_, classes,
            "Unexpected classes_ attribute for %r" % clf)
Example #6
def test_classifiers_classes():
    # test if classifiers can cope with non-consecutive classes
    estimators = all_estimators()
    classifiers = [(name, E) for name, E in estimators if issubclass(E,
        ClassifierMixin)]
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    y = 2 * y + 1
    # TODO: make work with next line :)
    #y = y.astype(np.str)
    for name, Clf in classifiers:
        if Clf in dont_test or Clf in meta_estimators:
            continue
        if Clf in [MultinomialNB, BernoulliNB]:
            # TODO also test these!
            continue

        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            clf = Clf()
        # fit
        clf.fit(X, y)
        y_pred = clf.predict(X)
        # training set performance
        assert_array_equal(np.unique(y), np.unique(y_pred))
        assert_greater(zero_one_score(y, y_pred), 0.78)
Example #7
def test_classifiers_train():
    # test if classifiers do something sensible on training set
    # also test all shapes / shape errors
    estimators = all_estimators()
    classifiers = [(name, E) for name, E in estimators if issubclass(E,
        ClassifierMixin)]
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=7)
    n_samples, n_features = X.shape
    n_labels = len(np.unique(y))
    X = Scaler().fit_transform(X)
    for name, Clf in classifiers:
        if Clf in dont_test or Clf in meta_estimators:
            continue
        if Clf in [MultinomialNB, BernoulliNB]:
            # TODO also test these!
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            clf = Clf()
        # raises error on malformed input for fit
        assert_raises(ValueError, clf.fit, X, y[:-1])

        # fit
        clf.fit(X, y)
        y_pred = clf.predict(X)
        assert_equal(y_pred.shape, (n_samples,))
        # training set performance
        assert_greater(zero_one_score(y, y_pred), 0.78)

        # raises error on malformed input for predict
        assert_raises(ValueError, clf.predict, X.T)
        if hasattr(clf, "decision_function"):
            try:
                # decision_function agrees with predict:
                decision = clf.decision_function(X)
                assert_equal(decision.shape, (n_samples, n_labels))
                # raises error on malformed input
                assert_raises(ValueError, clf.decision_function, X.T)
                if not isinstance(clf, BaseLibSVM):
                    # 1on1 of LibSVM works differently
                    assert_array_equal(np.argmax(decision, axis=1), y_pred)
                # raises error on malformed input for decision_function
                assert_raises(ValueError, clf.decision_function, X.T)
            except NotImplementedError:
                pass
        if hasattr(clf, "predict_proba"):
            try:
                # predict_proba agrees with predict:
                y_prob = clf.predict_proba(X)
                assert_equal(y_prob.shape, (n_samples, n_labels))
                # raises error on malformed input
                assert_raises(ValueError, clf.predict_proba, X.T)
                assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
                # raises error on malformed input for predict_proba
                assert_raises(ValueError, clf.predict_proba, X.T)
            except NotImplementedError:
                pass
Example #8
def test_classifier(classifier, trainData, trainLabel, testData, testLabel):
    classifier.fit(trainData, trainLabel)
    testPredicted = classifier.predict(testData)
    print 'Accuracy: ', metrics.zero_one_score(testLabel, testPredicted)
    print 'F1-score: ', metrics.f1_score(testLabel, testPredicted)
    print metrics.classification_report(testLabel, testPredicted)

    return classifier
Example #9
        def make_conf_mat(y_te, y_te_pr, type):
            conf_mat = metrics.confusion_matrix(y_te, y_te_pr)
            conf_mat_frac = conf_mat.astype(float) / np.sum(conf_mat, axis=0)
            print type, ' Accuracy: ', metrics.zero_one_score(y_te, y_te_pr)

            np.savetxt(os.path.join(class_dir, prefix+'_conf_'+type+'.csv'),
                       conf_mat, fmt='%i', delimiter=',')
            np.savetxt(os.path.join(class_dir, prefix+'_conffr_'+type+'.csv'), 
                       conf_mat_frac, fmt = '%.6f', delimiter=',')
Example #10
def main(argv):
    import scipy
    from sklearn import metrics
    from sklearn.multiclass import OneVsOneClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.cross_validation import cross_val_score
    from sklearn.svm import SVC
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn import preprocessing
    import similarity
    
    class ScaledSVC(SVC):
        def _scale(self, data):
            return preprocessing.scale(data)
        def fit(self, X, Y):
            return super(ScaledSVC, self).fit(self._scale(X), Y)
        def predict(self, X):
            return super(ScaledSVC, self).predict(self._scale(X))

    data, labels = scipy.loadtxt(argv[1]), scipy.loadtxt(argv[2])
    if len(argv) > 3:
        features = np.array([int(s) for s in argv[3].split(',')])
        data = data[:, features]
        
    def ovo(model, adj_strat):
        return OneVsOneClassifier(BinaryTiloClassifier(model, adj_strat))

    classifiers = [
        ('TILO/PRC/Gaussian',
         ovo(PinchRatioCutStrategy(),
             similarity.Gaussian())),
        ("TILO/Nearest/Gaussian",
         ovo(NearestCutStrategy(),
             similarity.Gaussian())),
        ("TILO/PRC/KNN",
         ovo(PinchRatioCutStrategy(),
             similarity.KNN())),
        ("TILO/Nearest/KNN",
         ovo(NearestCutStrategy(),
             similarity.KNN())),
        ("SVC", ScaledSVC()),
        ("Gaussian Naive Bayes", GaussianNB()),
        ("K Neighbors", KNeighborsClassifier()),
        ("Decision Tree", DecisionTreeClassifier())]
    format_str = '{:<30} {} {} {}'
    print '{:<30} {:<10}         RAND   Accuracy'.format('method', 'accuracy')
    for name, c in classifiers:
        scores = cross_val_score(c, data, labels, cv=5)
        #scores = np.array([1., 1.])
        model = c.fit(data, labels)
        guesses = model.predict(data)
        acc = metrics.zero_one_score(guesses, labels)
        rand = metrics.adjusted_rand_score(guesses, labels)
        print '{:<30} {:.4f} +/- {:.4f} {: .4f} {:.4f}'.format(name, scores.mean(),
                                                               scores.std() / 2,
                                                               rand, acc)
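The ScaledSVC wrapper above folds feature scaling into fit and predict, but it rescales every dataset by that dataset's own statistics. A related sketch (not part of the original script) using a Pipeline, where the scaler is fit once on the training data and those statistics are reused at predict time; assumes a reasonably recent scikit-learn:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Scale, then classify, as one estimator; cross_val_score can use it directly.
scaled_svc = make_pipeline(StandardScaler(), SVC())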
Example #11
def test_losses():
    """Test loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)
    n_samples = y_true.shape[0]
    n_classes = np.size(unique_labels(y_true))

    # Classification
    # --------------
    with warnings.catch_warnings(True):
        # Throw deprecated warning
        assert_equal(zero_one(y_true, y_pred), 13)
        assert_almost_equal(zero_one(y_true, y_pred, normalize=True),
                            13 / float(n_samples), 2)

    assert_almost_equal(zero_one_loss(y_true, y_pred),
                        13 / float(n_samples), 2)
    assert_equal(zero_one_loss(y_true, y_pred, normalize=False), 13)
    assert_almost_equal(zero_one_loss(y_true, y_true), 0.0, 2)
    assert_almost_equal(zero_one_loss(y_true, y_true, normalize=False), 0, 2)

    assert_almost_equal(hamming_loss(y_true, y_pred),
                        2 * 13. / (n_samples * n_classes), 2)

    assert_equal(accuracy_score(y_true, y_pred),
                 1 - zero_one_loss(y_true, y_pred))

    assert_equal(accuracy_score(y_true, y_pred, normalize=False),
                 n_samples - zero_one_loss(y_true, y_pred, normalize=False))

    with warnings.catch_warnings(True):
        # Throw deprecated warning
        assert_equal(zero_one_score(y_true, y_pred),
                     1 - zero_one_loss(y_true, y_pred))

    # Regression
    # ----------
    assert_almost_equal(mean_squared_error(y_true, y_pred),
                        12.999 / n_samples, 2)
    assert_almost_equal(mean_squared_error(y_true, y_true),
                        0.00, 2)

    # mean_absolute_error and mean_squared_error are equal because
    # it is a binary problem.
    assert_almost_equal(mean_absolute_error(y_true, y_pred),
                        12.999 / n_samples, 2)
    assert_almost_equal(mean_absolute_error(y_true, y_true), 0.00, 2)

    assert_almost_equal(explained_variance_score(y_true, y_pred), -0.04, 2)
    assert_almost_equal(explained_variance_score(y_true, y_true), 1.00, 2)
    assert_equal(explained_variance_score([0, 0, 0], [0, 1, 1]), 0.0)

    assert_almost_equal(r2_score(y_true, y_pred), -0.04, 2)
    assert_almost_equal(r2_score(y_true, y_true), 1.00, 2)
    assert_equal(r2_score([0, 0, 0], [0, 0, 0]), 1.0)
    assert_equal(r2_score([0, 0, 0], [0, 1, 1]), 0.0)
Example #12
def test_losses():
    """Test loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)
    n_samples = y_true.shape[0]
    n_classes = np.size(unique_labels(y_true))

    # Classification
    # --------------
    with warnings.catch_warnings(record=True):
        # Throw deprecated warning
        assert_equal(zero_one(y_true, y_pred), 11)
        assert_almost_equal(zero_one(y_true, y_pred, normalize=True),
                            11 / float(n_samples), 2)

    assert_almost_equal(zero_one_loss(y_true, y_pred), 11 / float(n_samples),
                        2)
    assert_equal(zero_one_loss(y_true, y_pred, normalize=False), 11)
    assert_almost_equal(zero_one_loss(y_true, y_true), 0.0, 2)
    assert_almost_equal(zero_one_loss(y_true, y_true, normalize=False), 0, 2)

    assert_almost_equal(hamming_loss(y_true, y_pred),
                        2 * 11. / (n_samples * n_classes), 2)

    assert_equal(accuracy_score(y_true, y_pred),
                 1 - zero_one_loss(y_true, y_pred))

    assert_equal(accuracy_score(y_true, y_pred, normalize=False),
                 n_samples - zero_one_loss(y_true, y_pred, normalize=False))

    with warnings.catch_warnings(record=True):
        # Throw deprecated warning
        assert_equal(zero_one_score(y_true, y_pred),
                     1 - zero_one_loss(y_true, y_pred))

    # Regression
    # ----------
    assert_almost_equal(mean_squared_error(y_true, y_pred), 10.999 / n_samples,
                        2)
    assert_almost_equal(mean_squared_error(y_true, y_true), 0.00, 2)

    # mean_absolute_error and mean_squared_error are equal because
    # it is a binary problem.
    assert_almost_equal(mean_absolute_error(y_true, y_pred),
                        10.999 / n_samples, 2)
    assert_almost_equal(mean_absolute_error(y_true, y_true), 0.00, 2)

    assert_almost_equal(explained_variance_score(y_true, y_pred), 0.16, 2)
    assert_almost_equal(explained_variance_score(y_true, y_true), 1.00, 2)
    assert_equal(explained_variance_score([0, 0, 0], [0, 1, 1]), 0.0)

    assert_almost_equal(r2_score(y_true, y_pred), 0.12, 2)
    assert_almost_equal(r2_score(y_true, y_true), 1.00, 2)
    assert_equal(r2_score([0, 0, 0], [0, 0, 0]), 1.0)
    assert_equal(r2_score([0, 0, 0], [0, 1, 1]), 0.0)
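The assertions above relate the deprecated names (zero_one, zero_one_score) to the current API. A small self-contained sketch of the same identities using only the current names:

import numpy as np
from sklearn.metrics import accuracy_score, zero_one_loss, hamming_loss

y_true = np.array([0, 1, 1, 0])
y_pred = np.array([0, 1, 0, 1])

# Accuracy and zero-one loss are complements.
assert accuracy_score(y_true, y_pred) == 1 - zero_one_loss(y_true, y_pred)
# With normalize=False both return counts: correct vs. misclassified samples.
assert accuracy_score(y_true, y_pred, normalize=False) == (
    len(y_true) - zero_one_loss(y_true, y_pred, normalize=False))
# For 1-d label vectors the Hamming loss coincides with the zero-one loss.
assert hamming_loss(y_true, y_pred) == zero_one_loss(y_true, y_pred)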
Example #13
def test_symmetry():
    """Test the symmetry of score and loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)

    # Symmetric metric
    for metric in [accuracy_score,
                   lambda y1, y2: accuracy_score(y1, y2, normalize=False),
                   zero_one_loss,
                   lambda y1, y2: zero_one_loss(y1, y2, normalize=False),
                   hamming_loss,
                   f1_score,
                   matthews_corrcoef,
                   mean_squared_error,
                   mean_absolute_error]:

        assert_equal(metric(y_true, y_pred),
                     metric(y_pred, y_true),
                     msg="%s is not symetric" % metric)

    # Not symmetric metrics
    for metric in [precision_score,
                   recall_score,
                   lambda y1, y2: fbeta_score(y1, y2, beta=0.5),
                   lambda y1, y2: fbeta_score(y1, y2, beta=2),
                   explained_variance_score,
                   r2_score]:

        assert_true(metric(y_true, y_pred) != metric(y_pred, y_true),
                    msg="%s seems to be symetric" % metric)

    # Deprecated metrics
    with warnings.catch_warnings(True):
        # Throw deprecated warning
        assert_equal(zero_one(y_true, y_pred),
                     zero_one(y_pred, y_true))

        assert_equal(zero_one(y_true, y_pred, normalize=False),
                     zero_one(y_pred, y_true, normalize=False))

        assert_equal(zero_one_score(y_true, y_pred),
                     zero_one_score(y_pred, y_true))
Example #14
def test_classifiers_train():
    # test if classifiers do something sensible on training set
    # also test all shapes / shape errors
    estimators = all_estimators()
    classifiers = [(name, E) for name, E in estimators
                   if issubclass(E, ClassifierMixin)]
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=7)
    n_samples, n_features = X.shape
    n_labels = len(np.unique(y))
    X = Scaler().fit_transform(X)
    for name, Clf in classifiers:
        if Clf in dont_test or Clf in meta_estimators:
            continue
        if Clf in [MultinomialNB, BernoulliNB]:
            # TODO also test these!
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True) as w:
            clf = Clf()
        # fit
        clf.fit(X, y)
        y_pred = clf.predict(X)
        assert_equal(y_pred.shape, (n_samples, ))
        # training set performance
        assert_greater(zero_one_score(y, y_pred), 0.78)

        # raises error on malformed input for predict
        assert_raises(ValueError, clf.predict, X.T)
        if hasattr(clf, "decision_function"):
            try:
                # decision_function agrees with predict:
                decision = clf.decision_function(X)
                assert_equal(decision.shape, (n_samples, n_labels))
                if not isinstance(clf, BaseLibSVM):
                    # 1on1 of LibSVM works differently
                    assert_array_equal(np.argmax(decision, axis=1), y_pred)
                # raises error on malformed input for decision_function
                assert_raises(ValueError, clf.decision_function, X.T)
            except NotImplementedError:
                pass
        if hasattr(clf, "predict_proba"):
            try:
                # predict_proba agrees with predict:
                y_prob = clf.predict_proba(X)
                assert_equal(y_prob.shape, (n_samples, n_labels))
                assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
                # raises error on malformed input for predict_proba
                assert_raises(ValueError, clf.predict_proba, X.T)
            except NotImplementedError:
                pass
Example #15
def test_symmetry():
    """Test the symmetry of score and loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)

    # Symmetric metric
    for metric in [
            accuracy_score,
            lambda y1, y2: accuracy_score(y1, y2, normalize=False),
            zero_one_loss,
            lambda y1, y2: zero_one_loss(y1, y2, normalize=False),
            hamming_loss, f1_score, matthews_corrcoef, mean_squared_error,
            mean_absolute_error
    ]:

        assert_equal(metric(y_true, y_pred),
                     metric(y_pred, y_true),
                     msg="%s is not symetric" % metric)

    # Not symmetric metrics
    for metric in [
            precision_score, recall_score,
            lambda y1, y2: fbeta_score(y1, y2, beta=0.5),
            lambda y1, y2: fbeta_score(y1, y2, beta=2),
            explained_variance_score, r2_score
    ]:

        assert_true(metric(y_true, y_pred) != metric(y_pred, y_true),
                    msg="%s seems to be symetric" % metric)

    # Deprecated metrics
    with warnings.catch_warnings(record=True):
        # Throw deprecated warning
        assert_equal(zero_one(y_true, y_pred), zero_one(y_pred, y_true))

        assert_equal(zero_one(y_true, y_pred, normalize=False),
                     zero_one(y_pred, y_true, normalize=False))

        assert_equal(zero_one_score(y_true, y_pred),
                     zero_one_score(y_pred, y_true))
Example #16
def test_symmetry():
    """Test the symmetry of score and loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)

    # symmetric
    assert_equal(accuracy_score(y_true, y_pred),
                 accuracy_score(y_pred, y_true))

    with warnings.catch_warnings(True):
        # Throw deprecated warning
        assert_equal(zero_one(y_true, y_pred),
                     zero_one(y_pred, y_true))

        assert_almost_equal(zero_one(y_true, y_pred, normalize=False),
                            zero_one(y_pred, y_true, normalize=False), 2)

    assert_equal(zero_one_loss(y_true, y_pred),
                 zero_one_loss(y_pred, y_true))

    assert_equal(zero_one_loss(y_true, y_pred, normalize=False),
                 zero_one_loss(y_pred, y_true, normalize=False))

    with warnings.catch_warnings(True):
        # Throw deprecated warning
        assert_equal(zero_one_score(y_true, y_pred),
                     zero_one_score(y_pred, y_true))

    assert_almost_equal(mean_squared_error(y_true, y_pred),
                        mean_squared_error(y_pred, y_true))

    assert_almost_equal(mean_absolute_error(y_true, y_pred),
                        mean_absolute_error(y_pred, y_true))

    # not symmetric
    assert_true(explained_variance_score(y_true, y_pred) !=
                explained_variance_score(y_pred, y_true))
    assert_true(r2_score(y_true, y_pred) !=
                r2_score(y_pred, y_true))
Example #17
def test_symmetry():
    """Test the symmetry of score and loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)

    # symmetric
    assert_equal(accuracy_score(y_true, y_pred),
                 accuracy_score(y_pred, y_true))

    with warnings.catch_warnings(True):
        # Throw deprecated warning
        assert_equal(zero_one(y_true, y_pred),
                     zero_one(y_pred, y_true))

        assert_almost_equal(zero_one(y_true, y_pred, normalize=False),
                            zero_one(y_pred, y_true, normalize=False), 2)

    assert_equal(zero_one_loss(y_true, y_pred),
                 zero_one_loss(y_pred, y_true))

    assert_equal(zero_one_loss(y_true, y_pred, normalize=False),
                 zero_one_loss(y_pred, y_true, normalize=False))

    with warnings.catch_warnings(True):
        # Throw deprecated warning
        assert_equal(zero_one_score(y_true, y_pred),
                     zero_one_score(y_pred, y_true))

    assert_almost_equal(mean_squared_error(y_true, y_pred),
                        mean_squared_error(y_pred, y_true))

    assert_almost_equal(mean_absolute_error(y_true, y_pred),
                        mean_absolute_error(y_pred, y_true))

    # not symmetric
    assert_true(explained_variance_score(y_true, y_pred) !=
                explained_variance_score(y_pred, y_true))
    assert_true(r2_score(y_true, y_pred) !=
                r2_score(y_pred, y_true))
Example #18
def train_model():

###   Steps to get and store the tfidf values
#    vectorizer = TfidfVectorizer(stop_words='english', min_n=1, max_n=2,
#                               smooth_idf=True, sublinear_tf=True, max_df=0.5)

#    train_data = vectorizer.fit_transform(generate_emails(training_filenames))
#    test_data = vectorizer.transform(generate_emails(test_filenames))
#    joblib.dump(train_data.tocsr(), 'train_data.joblib')
#    joblib.dump(test_data.tocsr(), 'test_data.joblib')
#    joblib.dump(self.train_target, 'train_target.joblib')
#    joblib.dump(self.test_target, 'test_target.joblib')
###

    train_data = joblib.load('train_data.joblib', mmap_mode='c')
    test_data = joblib.load('test_data.joblib', mmap_mode='c')
    train_target = joblib.load('train_target.joblib', mmap_mode='c')
    test_target = joblib.load('test_target.joblib', mmap_mode='c')

###   Steps to select best features
#    print "Selecting K-best features by chi squared test"
#    start_time = time()
#    ch2 = SelectKBest(chi2, k=100)
#    train_data = ch2.fit_transform(train_data, train_target)
#    test_data = ch2.transform(test_data)
#    print "[Train data] n_samples: %d, n_features: %d" % train_data.shape
#    print "[Test data] n_samples: %d, n_features: %d" % test_data.shape
#    print "Done in %0.3fs" % (time() - start_time)
###
    if train_data.shape[0] == 0:
        print "train_data is empty. No vectors to train on."
        return None

    clf = LinearSVC() #SGDClassifier(n_iter=10, loss='modified_huber')
    print "Training %s" % (clf),
    start_time=time()
    clf.fit(train_data, train_target)
    train_time = time() - start_time
    print "Done in %0.3fs" % train_time

    print "Testing..."
    test_start = time()
    predicted = clf.predict(test_data)
    accuracy = zero_one_score(test_target, predicted)
    error_rate = 1 - accuracy
    test_time = time() - test_start
    print "Done in %0.3fs" % test_time

    print "Accuracy: ", numpy.mean(predicted == self.test_target)
    print "Z1 Accuracy: ", accuracy
Example #19
 def evaluate(self, test_file, encoding='UTF-8', classif_file=None):
     """\
     Evaluate on the given test data file. Return accuracy.
     If classif_file is set, save the classification results to this file.
     """
     test = DataSet()
     test.load_from_arff(test_file, encoding)
     values = self.classify(test)
     golden = self.get_classes(test, dtype=None)
     if classif_file is not None:
         classif = DataSet()
         classif.load_from_vect(test.get_attrib(self.class_attr), values)
         classif.rename_attrib(self.class_attr, self.PREDICTED)
         test.merge(classif)
         test.save_to_arff(classif_file, encoding)
     return zero_one_score(golden, values)
Example #20
 def evaluate(self, test_file, encoding='UTF-8', classif_file=None):
     """\
     Evaluate on the given test data file. Return accuracy.
     If classif_file is set, save the classification results to this file.
     """
     test = DataSet()
     test.load_from_arff(test_file, encoding)
     values = self.classify(test)
     golden = self.get_classes(test, dtype=None)
     if classif_file is not None:
         classif = DataSet()
         classif.load_from_vect(test.get_attrib(self.class_attr), values)
         classif.rename_attrib(self.class_attr, self.PREDICTED)
         test.merge(classif)
         test.save_to_arff(classif_file, encoding)
     return zero_one_score(golden, values)
Example #21
def roi_svc_model_0(X_train, y_train, X_test, y_test):
    """
    An instance of multi-classes classifier -- model-0.
    Return the predict accuracy.

    """
    # data preprocessing
    y_train_bin = y_train.copy()
    y_train_bin[y_train_bin != 0] = 1
    scaler = preprocessing.Scaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # train the binary-classes classifier
    bin_svc = svm.SVC(C=1, kernel='rbf', cache_size=1000, class_weight='auto')
    bin_svc.fit(X_train, y_train_bin)

    # train the multi-classes classifier
    labeled_sample_idx = [idx for idx in range(y_train.shape[0])
                          if y_train[idx] != 0]
    X_train_mul = X_train[labeled_sample_idx, :]
    y_train_mul = y_train[labeled_sample_idx]
    mul_label_list = np.unique(y_train_mul)
    mul_label_list = mul_label_list.tolist()
    print mul_label_list
    mul_svc = svm.SVC(C=1, kernel='rbf', cache_size=1000, class_weight='auto')
    mul_svc.fit(X_train_mul, y_train_mul)

    # test the classifier using an independent dataset
    y_predict_bin = bin_svc.predict(X_test)
    selected_sample_idx = [idx for idx in range(y_predict_bin.shape[0])
                           if y_predict_bin[idx] != 0]
    y_predict_mul = mul_svc.predict(X_test[selected_sample_idx, :])

    # calculate the predict score
    y_predict = np.zeros((y_test.shape[0]))
    y_predict[selected_sample_idx] = y_predict_mul
    score = metrics.zero_one_score(y_test, y_predict)
    precision = metrics.precision_score(y_test, y_predict,
                                        labels=mul_label_list,
                                        pos_label=None)
    recall = metrics.recall_score(y_test, y_predict,
                                  labels=mul_label_list,
                                  pos_label=None)
    return score, precision, recall
Example #22
def C_and_gamma_evaluation(X_tr, y_tr, X_cv, y_cv, classifier_by_C_and_gamma_function, 
                          error_measure_function, C, idx_C, gamma, idx_gamma):
    
    classifier = classifier_by_C_and_gamma_function(X_tr, y_tr, C=C, gamma=gamma)
                      
    tr_err, cv_err = error_measure_function(classifier,X_tr,y_tr,X_cv,y_cv)
                
    y_pred=classifier.predict(X_cv)

    if hasattr(metrics,"accuracy_score"):
        acc = metrics.accuracy_score(y_cv,y_pred)
    else:
        assert hasattr(metrics,"zero_one_score")
        acc = metrics.zero_one_score(y_cv, y_pred)
    prec=metrics.precision_score(y_cv,y_pred)
    recall=metrics.recall_score(y_cv,y_pred)
    f1_score=metrics.f1_score(y_cv,y_pred)

    return idx_C, idx_gamma, tr_err, cv_err, acc, prec, recall, f1_score
Example #23
def C_evaluation(X_tr, y_tr, X_cv, y_cv, classifier_by_C_function, 
                          error_measure_function, C, idx_C):
    
    classifier = classifier_by_C_function(X_tr, y_tr,C=C)
                          
    tr_err, cv_err = error_measure_function(classifier,X_tr,y_tr,X_cv,y_cv)
    
    #it is assumed that we are dealing with a sklearn classifier...
    y_pred = classifier.predict(X_cv)

    if hasattr(metrics,"accuracy_score"):
        acc = metrics.accuracy_score(y_cv,y_pred)
    else:
        assert hasattr(metrics,"zero_one_score")
        acc = metrics.zero_one_score(y_cv, y_pred)
    prec=metrics.precision_score(y_cv,y_pred)
    recall=metrics.recall_score(y_cv,y_pred)
    f1_score=metrics.f1_score(y_cv,y_pred)
    
    return idx_C, tr_err, cv_err, acc, prec, recall, f1_score
Example #24
 def performance_estimation(self, X, y, kernel = SVM_RBF, C = 1.0, gamma = None, n_iterations = 20, test_size = 0.3):
     
     assert isinstance(C,(int,float))
     
     set_ripartitions = StratifiedShuffleSplit(y, n_iter = n_iterations, 
                                               test_size = test_size, indices = False)
     
     if kernel == SVM_linear:
         classifier = LinearSVC(C=C, class_weight = 'auto')
     elif kernel == SVM_RBF:
         assert isinstance(gamma,(int,float))
         classifier = SVC(kernel="rbf", C=C, gamma=gamma, class_weight = 'auto')
     elif kernel == SVM_RBF_Chi2_squared:
         classifier = SVC(kernel=chi2_kernel,C=C, class_weight = 'auto')
         
     accuracy_avg = 0.0
     precision_avg = 0.0
     recall_avg = 0.0
     f1_score_avg = 0.0
     
     for train,test in set_ripartitions:
         X_tr,X_cv,y_tr,y_cv =X[train],X[test],y[train],y[test]
         classifier.fit(X_tr, y_tr)
         y_pred=classifier.predict(X_cv)
         if hasattr(metrics,"accuracy_score"):
             acc = metrics.accuracy_score(y_cv,y_pred)
         else:
             assert hasattr(metrics,"zero_one_score")
             acc = metrics.zero_one_score(y_cv, y_pred)
         prec=metrics.precision_score(y_cv,y_pred)
         recall=metrics.recall_score(y_cv,y_pred)
         f1_score=metrics.f1_score(y_cv,y_pred)
         
         accuracy_avg = accuracy_avg + acc / n_iterations
         precision_avg = precision_avg + prec / n_iterations
         recall_avg = recall_avg + recall / n_iterations
         f1_score_avg = f1_score_avg + f1_score / n_iterations
         
     return accuracy_avg, precision_avg, recall_avg, f1_score_avg
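Examples #22 to #24 (and #28 below) probe sklearn.metrics with hasattr so the same code runs both before and after zero_one_score was renamed to accuracy_score. The same fallback, pulled out into a small helper (the helper name is ours; a sketch only):

from sklearn import metrics

def accuracy_compat(y_true, y_pred):
    # Prefer the modern name; fall back to the old one on very old releases.
    if hasattr(metrics, "accuracy_score"):
        return metrics.accuracy_score(y_true, y_pred)
    return metrics.zero_one_score(y_true, y_pred)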
Example #25
 def _evaluatePredictions(self,report,predictions,groundtruth):
     predicted_labels =  1*(predictions>.5)
     fpr, tpr, thresholds = roc_curve(groundtruth, predictions)
     roc_auc = auc(fpr, tpr)
     
     plt.clf()
     plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
     plt.plot([0, 1], [0, 1], 'k--')
     plt.xlim([0.0, 1.0])
     plt.ylim([0.0, 1.0])
     plt.xlabel('False Positive Rate')
     plt.ylabel('True Positive Rate')
     plt.title('Receiver operating characteristic example')
     plt.legend(loc="lower right")
     
     report.title('Summary',level=2)
     accuracy = zero_one_score(groundtruth,predicted_labels)
     report.text('Global accuracy = %.1f%%'%(100.0*accuracy))
     
     report.title('ROC Curve',level=2)
     report.plot()
     
     plt.close()
     
     report.title('Short report',level=2)
     report.pre(classification_report(groundtruth,predicted_labels))
     
     report.title('Confusion matrix',level=2)
     report.table(['Ground truth','0','1'])
     
     confusion = confusion_matrix(groundtruth,predicted_labels)
     
     for k in xrange(2):
         report.row([k,confusion[k][0],confusion[k][1]])
         
     report.close()
Example #26
def classifier(test='50_categories',target='validation',validate=False,quick=False):

    """classifier(test='test/',target='target/',validate=False,verbose=False): function that attempts to classify a set of images based on a set of learning images, using various image features. If validate=False, classifier uses half the testing set as a target set, and outputs statistics on its performance. If validate=True, classifier uses the entire test directory to train on, and outputs a list of the target files with predicted classification. If quick=True, classifier uses only the first 5 images in each category to train on. Note on paths: training path expects a directory with subdirectories named with the category label, and validation path expects a directory of images"""

    print '%CLASSIFIER: Reading Input Files' # status update
    test = test+'/'
    target = target+'/'

    # read in files, store data and categories
    dir=test
    subdir=os.listdir(dir)
    for element in subdir:
        if element[0]=='.':
            subdir.remove(element) # eliminate non-image  entries
    images=[] # store images
    categories=[] # store name of categories for each image
    catint=[] # store integer designation of category for each image
    filens=[] # store names of files
    i=-1
    for cat in subdir: # loop over directories
        i+=1
        files=os.listdir(dir+cat)
        if quick==True: # speed-learning: 5 images per category
            files=files[:5]
        for element in files:
            if element[0]=='.':
                files.remove(element) # eliminate non-image entries
        categories.append(cat)
        for item in files: # look over files
            catint.append(i)
            images.append(np.flipud(plt.imread(dir+cat+'/'+item)))
            filens.append(item)

    # repeat for testing set, if different from target set
    if validate == True:
        valdir=target
        valimages=[]
        valfilens = os.listdir(valdir)
        for item in valfilens:
            if item[0]=='.':
                valfilens.remove(item)
        for item in valfilens:
            valimages.append(np.flipud(plt.imread(valdir+item)))

    # names of features for class Featured
    featnames=['Total Number of Pixels','Aspect Ratio','Median Number of Edges',\
                'Fraction of Color in Red','Fraction of Color in Green',\
                'Fraction of Color in Blue','Fraction of Edges in Vertical Orientation',\
                'Skewness in Red Channel','Variation in Blue Channel','Brightness Centering',\
                'Correlation Between Red and Green Channels',\
                'Correlation Between Red and Blue Channels',\
                'Correlation Between Blue and Green Channels','Number of Connected Bright Objects',\
                'Number of Connected Dim Objects']

    nim = len(catint) # number of images
    nfeat = len(featnames) # number of features
    features = np.zeros((nim,nfeat)) # store feature values for each image
    statuspoints = np.linspace(0,nim,11) # for status update calculations
    statuspoints = [np.floor(pt) for pt in statuspoints]
    if validate == True:
        nval = len(valfilens) # number of validation images
        valfeatures = np.zeros((nval,nfeat)) # store feature values for validation images

    # calculate features for images in training set
    print '%CLASSIFIER: Calculating Features' # status update
    statusn = 0
    for i in range(nim):
        # STATUS UPDATE
        if i in statuspoints:
            # print status update about every 10% complete
            print '%CLASSIFIER: '+theraven(statusn)+' ['+str(i*100/nim)+'%]'
            statusn+=1
        im = Featured(images[i])
        features[i,0]=im.ncountpix()
        features[i,1]=im.aspect()
        features[i,2]=im.mededges()
        features[i,3]=im.redfrac()
        features[i,4]=im.greenfrac()
        features[i,5]=im.bluefrac()
        features[i,6]=im.vedges()
        features[i,7]=im.redskew()
        features[i,8]=im.bluevar()
        features[i,9]=im.centered()
        features[i,10]=im.rgcor()
        features[i,11]=im.rbcor()
        features[i,12]=im.bgcor()
        features[i,13]=im.nbright()
        features[i,14]=im.ndim()

    # calculate features for images in validation set
    if validate == True:
        print '%CLASSIFIER: Calculating Validation Set Features'
        for i in range(nval):
            im = Featured(valimages[i])
            valfeatures[i,0]=im.ncountpix()
            valfeatures[i,1]=im.aspect()
            valfeatures[i,2]=im.mededges()
            valfeatures[i,3]=im.redfrac()
            valfeatures[i,4]=im.greenfrac()
            valfeatures[i,5]=im.bluefrac()
            valfeatures[i,6]=im.vedges()
            valfeatures[i,7]=im.redskew()
            valfeatures[i,8]=im.bluevar()
            valfeatures[i,9]=im.centered()
            valfeatures[i,10]=im.rgcor()
            valfeatures[i,11]=im.rbcor()
            valfeatures[i,12]=im.bgcor()
            valfeatures[i,13]=im.nbright()
            valfeatures[i,14]=im.ndim()

    # building testing and target sets
    if validate==False:
        testim = images[::2]
        targim = images[1::2]
        testfeat = features[::2,:]
        targfeat = features[1::2,:]
        targfiles = filens[1::2]
        testcat = catint[::2]
        targcat = catint[1::2]
    else:
        testim = images
        targim = valimages
        testfeat = features
        targfeat = valfeatures
        targfiles = valfilens
        testcat = catint

    # build random forest 
    print '%CLASSIFIER: Building Random Forest' # status update
    rfc = RandomForestClassifier(compute_importances=True)
    rfc = rfc.fit(testfeat,testcat)
    impt = rfc.feature_importances_
    pred = rfc.predict(targfeat) # predicted categories for target images
    ncat=max(catint)

    if validate == False:
        randpred = [] # predictions for targcat based on random guessing
        for i in range(len(targim)):
            randpred.append(random.randint(0,ncat))
        score = metrics.zero_one_score(targcat, pred) # zero-one score
        randscore = metrics.zero_one_score(targcat, randpred) # zero-one score for random guessing
        # outputs
        print 'Three Most Important Features:'
        for ifeat in range(3):
            maxind = np.where(impt == np.max(impt))
            impt[maxind[0]]=0
            print str(ifeat+1)+'. '+featnames[maxind[0][0]]

        print str(int(score*100))+'% Good Predictions from Random Forest'
        print str(int(randscore*100))+'% Good Predictions from Random Guessing'
    else:
        print 'filename\t\tpredicted_class'
        print '-'*50
        for i in range(len(targfiles)):
            name = targfiles[i]+' '*(30-len(targfiles[i]))
            print name+'\t'+categories[pred[i]]
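A call matching the docstring and the defaults above might look like the following (a sketch; the directory names are just the signature defaults, not verified paths):

# Train on half of '50_categories', hold out the other half, and print
# accuracy statistics; quick mode uses only the first few images per category.
classifier(test='50_categories', target='validation', validate=False, quick=True)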
Example #27
    x = {"first": name[0],
         "first2": name[:2],
         "first3": name[:3],
         "last": name[-1],
         "last2": name[-2:],
         "last3": name[-3:]}
    for c in "abcdefghijklmnopqrstuvwzyx":
        x["count(%s)" % c] = name.count(c)
    return x


dv = DictVectorizer()
X = dv.fit_transform(gender_features(n) for n in names)
# TODO scale/center X
X = X.tocsr()
print("%d samples, %d features\n" % X.shape)

y = np.array([0] * len(female_names) + [1] * len(male_names))

# Instead of splitting our data into training and test sets,
# we perform 10-fold cross validation.

for clf in (BernoulliNB(), LinearSVC()):
    print("Training and testing %r" % clf)
    for i, (train, test) in enumerate(StratifiedKFold(y, k=10)):
        clf.fit(X[train], y[train])

        y_pred = clf.predict(X[test])
        acc = zero_one_score(y[test], y_pred)
        print("  Fold: %d  Accuracy: %.2f%%" % (i, acc * 100))
Example #28
 def rfe_curves(self, X, y):
     
     num_samples,num_features = X.shape
     
     tr_err_rfe = np.zeros(num_features)
     cv_err_rfe = np.zeros(num_features)
     accuracy_rfe = np.zeros(num_features)
     recall_rfe = np.zeros(num_features)
     precision_rfe = np.zeros(num_features)
     f1_score_rfe = np.zeros(num_features)
     
     for i in xrange(num_features):
         
         mask = np.zeros(num_features)
         mask[:i+1] = 1
         
         new_mask = np.tile(mask==1,(num_samples,1))
         
         extracted_X = X[new_mask]
         
         extracted_X = np.reshape(extracted_X,(num_samples,i+1))
         
         set_ripartitions = StratifiedShuffleSplit(y, n_iter = self.n_iterations, 
                                               test_size = self.test_size, indices=False)
 
         n_iter = len(set_ripartitions)
     
         for train,test in set_ripartitions:
             
             X_tr,X_cv,y_tr,y_cv =extracted_X[train],extracted_X[test],y[train],y[test]
             
             if self.kernel == SVM_RBF:
             
                 classifier = SVM_RBF_by_C_and_gamma_function(X_tr, y_tr, C=self.C, gamma=self.gamma)      
                 tr_err, cv_err = misclassification_errors(classifier,X_tr,y_tr,X_cv,y_cv)
                             
             elif self.kernel == SVM_linear:
                 
                 classifier = linear_SVM_by_C_function(X_tr, y_tr, C=self.C)      
                 tr_err, cv_err = misclassification_errors(classifier,X_tr,y_tr,X_cv,y_cv)
             
             elif self.kernel == SVM_RBF_Chi2_squared:
                    
                 classifier = SVM_RBF_Chi2_squared_by_C_function(X_tr, y_tr, C = self.C)
                 tr_err, cv_err = misclassification_errors(classifier,X_tr,y_tr,X_cv,y_cv)
                 
             y_pred=classifier.predict(X_cv)
             
             if hasattr(metrics,"accuracy_score"):
                 acc = metrics.accuracy_score(y_cv,y_pred)
             else:
                 assert hasattr(metrics,"zero_one_score")
                 acc = metrics.zero_one_score(y_cv, y_pred)
             prec=metrics.precision_score(y_cv,y_pred)
             recall=metrics.recall_score(y_cv,y_pred)
             f1_score=metrics.f1_score(y_cv,y_pred)
             
             tr_err_rfe[i] = tr_err_rfe[i] + tr_err / n_iter
             cv_err_rfe[i] = cv_err_rfe[i] + cv_err / n_iter
             accuracy_rfe[i] = accuracy_rfe[i] + acc / n_iter
             recall_rfe[i] = recall_rfe[i] + recall / n_iter
             precision_rfe[i] = precision_rfe[i] + prec / n_iter
             f1_score_rfe[i] = f1_score_rfe[i] + f1_score / n_iter
             
     return tr_err_rfe, cv_err_rfe,accuracy_rfe,recall_rfe, precision_rfe, f1_score_rfe
Example #29
def main():


    clf=joblib.load('svc_wordnet.pkl')

    feature_index={}
    with open('data.p', 'rb') as fp:
        feature_index = pickle.load(fp)

    y_true=[]
    y_pred=[]

    X=[]
    
    fpath=open(os.getcwd()+'/testing_set_path.txt')
    f = open('DMOZ_chi2_testing.txt','w')
    class_no=0
    
    for line in fpath.read().split('\n'):
        print line
        path = line
        if path == '':
            break
        for file in glob.glob(os.path.join(path, '*.txt')):
            #print file
            mapping = [0]*1408
            for word in open(file).read().split():
                if len(word) < 2:
                    continue
                index = feature_index.get(word)
                #print index
                if(index is not None):
                    mapping[index]=1
                else:
                    for ss in wn.synsets(word):
                        for l in ss.lemmas():
                            index = feature_index.get(l.name)
                            if(index is not None):
                                mapping[index]=1
                                break
                            
                    list = []
                    for syn_set in wn.synsets(word):
                        for syn in syn_set.lemmas():
                            list.append(syn.name)

                    for w in list:
                        index = feature_index.get(w)
                        if(index is not None):
                            mapping[index]=1
                            break
                        
            X.append(mapping)
            y_true.append(class_no)

            f.write(str(class_no))
            f.write(" ")

            for m in mapping:
                f.write(" ".join(str(m)) + " ")
            f.write('\n')



            y_pred.append(int(clf.predict(mapping)))
            
        class_no=class_no+1

    f.close()

    with open('testing_X.p', 'wb') as fp:
        pickle.dump(X, fp)

    with open('testing_y_true.p', 'wb') as fp:
        pickle.dump(y_true, fp)

    with open('testing_y_pred.p', 'wb') as fp:
        pickle.dump(y_pred, fp)

    #print y_true
    #print y_pred
    target_names = ['Arts', 'Business', 'Computers','Games','Health','Home','News','Recreation','Reference','Regional','Science','Shopping','Society','Sports']
    print(classification_report(y_true, y_pred, target_names=target_names))
    accuracy = zero_one_score(y_true, y_pred)
    print 'accuracy',accuracy
    print metrics.precision_score(y_true, y_pred, average='macro')
    print metrics.recall_score(y_true, y_pred, average='micro')
    print metrics.f1_score(y_true, y_pred, average='weighted')  

    f = open('Result_class_bns.txt', 'w')
    f.writelines((classification_report(y_true, y_pred, target_names=target_names)))
    f.close()
Example #30
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.utils import shuffle
from sklearn.metrics import zero_one_score
import numpy as np

digits = load_digits()
X, y = shuffle(digits.data, digits.target)
X_train, X_test = X[:1000, :], X[1000:, :]
y_train, y_test = y[:1000], y[1000:]

svc = SVC(kernel='precomputed')

kernel_train = np.dot(X_train, X_train.T)  # linear kernel

svc.fit(kernel_train, y_train)

#kernel_test = np.dot(X_test, X_train[svc.support_, :].T)
kernel_test = np.dot(X_test, X_train.T)
y_pred = svc.predict(kernel_test)
print(zero_one_score(y_test, y_pred))
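For the precomputed-kernel SVC above, the Gram matrix passed to fit must have shape (n_train, n_train) and the one passed to predict must have shape (n_test, n_train), with columns ordered like the training samples. A quick sanity check reusing the arrays from the snippet:

assert kernel_train.shape == (X_train.shape[0], X_train.shape[0])
assert kernel_test.shape == (X_test.shape[0], X_train.shape[0])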
Example #31
    std = X.std(axis=0)
    X = (X - mean) / std

    for clf, name in (
        (SGDClassifier(n_iter=100, alpha=0.01), "plain sgd"),
        (SGDClassifier(n_iter=100, alpha=0.01,
                       class_weight={1: 10}), "weighted sgd"),
        (SGDRanking(n_iter=1000, alpha=0.01,
                    loss='roc_pairwise_ranking'), "pairwise sgd"),
        (RankSVM(n_iter=100, alpha=0.01, loss='hinge'), 'RankSVM'),
    ):
        clf.fit(X, y)
        print clf
        pred = clf.predict(X)

        print "ACC: %.4f" % metrics.zero_one_score(y, pred)
        print "AUC: %.4f" % metrics.auc_score(y, pred)
        print "CONFUSION MATRIX: "
        print metrics.confusion_matrix(y, pred)
        print "Kendall Tau: %.4f" % kendalltau(clf, X, y)
        print 80 * '='

    clf = MinirankSVM(max_iter=100, alpha=0.01).fit(X, y)
    print clf
    scores = np.dot(X, clf.coef_)
    pred = (scores > 0).astype(np.int)
    print "ACC: %.4f" % metrics.zero_one_score(y, pred)
    print "AUC: %.4f" % metrics.auc_score(y, pred)
    print "CONFUSION MATRIX: "
    print metrics.confusion_matrix(y, pred)
    print "Kendall Tau: %.4f" % kendalltau(clf, X, y)
Example #32
try:
    Xte, Yte, class_map, feature_names, test_image_names = pickle.load(
        open('testing_set.p', 'r'))
    print 'opening saved test set features...'
except:
    print 'calculating test set features...'
    # need to pull class_map and feature_names from training set
    Xtr, Ytr, class_map, feature_names, ignore = pickle.load(
        open('training_set.p', 'r'))
    # now, calculate all of the features for the testing set
    from feature_calc import calculate_features
    Xte, Yte, ignore, ignore, test_image_names = calculate_features(
        'validation_images', 'testing_set.p', class_map=class_map)

print 'predicting the classes of verification images...'
pred = clf.predict(Xte)
rfor_01_score = metrics.zero_one_score(Yte, pred)  # zero-one score
print "Zero-One Score: " + str(rfor_01_score)

# create and save the confusion matrix
confmat = metrics.confusion_matrix(Yte, pred)
plt.close("all")
plt.imshow(confmat, interpolation="nearest", origin="upper")
plt.savefig("confusion_matrix.pdf")
plt.close("all")

# show the feature importances
print "Summary of feature importances"
for n in range(len(feature_names)):
    print "\t", round(clf.feature_importances_[n], 4), feature_names[n]

# reverse the class_map to get the names for each category
Example #33
def test_classifiers_train():
    # test if classifiers do something sensible on training set
    # also test all shapes / shape errors
    estimators = all_estimators()
    classifiers = [(name, E) for name, E in estimators if issubclass(E,
        ClassifierMixin)]
    iris = load_iris()
    X_m, y_m = iris.data, iris.target
    X_m, y_m = shuffle(X_m, y_m, random_state=7)
    X_m = StandardScaler().fit_transform(X_m)
    # generate binary problem from multi-class one
    y_b = y_m[y_m != 2]
    X_b = X_m[y_m != 2]
    for (X, y) in [(X_m, y_m), (X_b, y_b)]:
        # do it once with binary, once with multiclass
        classes = np.unique(y)
        n_classes = len(classes)
        n_samples, n_features = X.shape
        for name, Clf in classifiers:
            if Clf in dont_test or Clf in meta_estimators:
                continue
            if Clf in [MultinomialNB, BernoulliNB]:
                # TODO also test these!
                continue
            # catch deprecation warnings
            with warnings.catch_warnings(record=True):
                clf = Clf()
            # raises error on malformed input for fit
            assert_raises(ValueError, clf.fit, X, y[:-1])

            # fit
            clf.fit(X, y)
            y_pred = clf.predict(X)
            assert_equal(y_pred.shape, (n_samples,))
            # training set performance
            assert_greater(zero_one_score(y, y_pred), 0.78)

            # raises error on malformed input for predict
            assert_raises(ValueError, clf.predict, X.T)
            if hasattr(clf, "decision_function"):
                try:
                    # decision_function agrees with predict:
                    decision = clf.decision_function(X)
                    if n_classes == 2:
                        assert_equal(decision.ravel().shape, (n_samples,))
                        dec_pred = (decision.ravel() > 0).astype(np.int)
                        assert_array_equal(dec_pred, y_pred)
                    if n_classes == 3 and not isinstance(clf, BaseLibSVM):
                        # 1on1 of LibSVM works differently
                        assert_equal(decision.shape, (n_samples, n_classes))
                        assert_array_equal(np.argmax(decision, axis=1), y_pred)

                    # raises error on malformed input
                    assert_raises(ValueError, clf.decision_function, X.T)
                    # raises error on malformed input for decision_function
                    assert_raises(ValueError, clf.decision_function, X.T)
                except NotImplementedError:
                    pass
            if hasattr(clf, "predict_proba"):
                try:
                    # predict_proba agrees with predict:
                    y_prob = clf.predict_proba(X)
                    assert_equal(y_prob.shape, (n_samples, n_classes))
                    assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
                    # check that probas for all classes sum to one
                    assert_array_almost_equal(
                        np.sum(y_prob, axis=1), np.ones(n_samples))
                    # raises error on malformed input
                    assert_raises(ValueError, clf.predict_proba, X.T)
                    # raises error on malformed input for predict_proba
                    assert_raises(ValueError, clf.predict_proba, X.T)
                except NotImplementedError:
                    pass

            if hasattr(clf, "classes_"):
                if hasattr(clf, "n_outputs_"):
                    assert_equal(clf.n_outputs_, 1)
                    assert_array_equal(
                        clf.classes_, [classes],
                        "Unexpected classes_ attribute for %r" % clf)
                else:
                    # flat classes array: XXX inconsistent
                    assert_array_equal(
                        clf.classes_, classes,
                        "Unexpected classes_ attribute for %r" % clf)
Example #34
    def train(self,trainset,labels,config,val_set,val_labels):
        # TODO: adaptive learning rate
        n,dim = trainset.shape
        max_iter = config.iterations
        batchsize = config.batchsize
        learning_rate = config.learning_rate
        momentum = config.momentum
        weight_decay = config.weight_decay
        learning_rates = [0.1,0.2]
        numbatches = n / batchsize
        self.layerstates = [np.array([]) for layersize in self.layersizes]
        tiny = np.ones((batchsize,self.n_classes)) * 0.000001
        val_error = 1000
        best_val_error = 1000
        best_weights = []
        if val_set is not None:
            tiny_val = np.ones((val_labels.shape[0],self.n_classes)) * 0.000001
        for iteration in range(max_iter):
            print 'Iteration ' + str(iteration+1)
            iteration_error = 0
            t0 = time.clock()
            for j in range(numbatches):
                # get training batch and targets
                batch = trainset[j*batchsize:(j+1)*batchsize,:]
                target = labels[j*batchsize:(j+1)*batchsize,:]
                # forward propagation
                output = self.forward_pass(batch,config)
                batch_error = 0
                if config.error == 'cross-entropy':
                    self.w_delta[-1] = output - target
                    batch_error = -np.sum(np.sum(np.multiply(target,np.log(output+tiny)),1)) / batchsize
                else:
                    if config.nonlinearity == 'sigmoid':
                        deriv = np.multiply(output,np.ones(output.shape) - output)
                    elif config.nonlinearity == 'tanh':
                        deriv = np.ones(output.shape) - np.square(output)
                    self.w_delta[-1] = np.multiply(target - output,deriv)
                iteration_error += batch_error
                # backpropagation: compute deltas
                for i in range(self.n_layers-2,-1,-1):
                    out = self.layerstates[i]
                    if config.nonlinearity == 'sigmoid':
                        deriv = np.multiply(out,np.ones(out.shape) - out)
                    elif config.nonlinearity == 'tanh':
                        deriv = np.ones(out.shape) - np.square(out)
                    tmp = self.w_delta[i+1] * np.transpose(self.weights[i][:-1,:])
                    self.w_delta[i] = np.multiply(tmp,deriv)
                # compute derivatives from deltas
                # update weights
                for i in range(self.n_layers - 1):
                    activations = np.append(self.layerstates[i],np.ones((batchsize,1)),1)
                    w_deriv = np.transpose(activations) * self.w_delta[i+1] / batchsize \
                                                        + momentum * self.w_deriv_old[i]
                    regularizer = weight_decay * self.weights[i]
                    # no regularization on bias
                    regularizer[-1,:] = np.zeros((1,self.weights[i].shape[1]))
                    self.weights[i] = self.weights[i] - learning_rates[i] * w_deriv \
                                                      - regularizer
                    self.w_deriv_old[i] = w_deriv

            iteration_error /= numbatches
            if val_set is not None:
                res = self.predict(val_set,config)
                predicted = self.label_converter.transform(res)
                groundtruth = self.label_converter.transform(val_labels)
                val_error_old = val_error
                val_error = -np.sum(np.sum(np.multiply(groundtruth,np.log(predicted+tiny_val)),1)) \
                            / groundtruth.shape[0]
                if val_error < best_val_error:
                    best_val_error = val_error
                    best_weights = self.weights
                print 'Accuracy on val: ' + str(zero_one_score(val_labels,res))
                if val_error > val_error_old + 0.15:
                    self.weights = best_weights
                    print 'Early stop: ' + str(val_error)
                    break
            for i in range(self.n_layers - 1):
                learning_rates[i] = 0.9 * learning_rates[i]
            self.log_file.write('Iteration ' + str(iteration+1) + '\n')
            self.log_file.write('  Error train: ' + str(iteration_error) + '\n')
            self.log_file.write('  Error val:   ' + str(val_error) + '\n')
            self.log_file.write('  Time for iteration: ' + str(time.clock() - t0) + '\n')
            self.log_file.flush()
            print 'Errors: ' + str(iteration_error) + ',' + str(val_error)
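The update step above is plain mini-batch gradient descent with momentum and L2 weight decay, with the bias row excluded from the decay. A minimal NumPy sketch of that rule with hypothetical names (not the original class), assuming the batch gradient has already been computed:

# Sketch of the update rule used above (hypothetical helper, illustrative only):
# SGD with momentum and weight decay; the last row of W holds the biases.
import numpy as np

def sgd_momentum_step(W, grad, velocity, lr=0.1, momentum=0.7, weight_decay=1e-4):
    velocity = grad + momentum * velocity   # accumulate a momentum term
    decay = weight_decay * W
    decay[-1, :] = 0.0                      # do not regularize the bias row
    W = W - lr * velocity - decay
    return W, velocity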
Exemplo n.º 35
0
def main():


    clf=joblib.load('svc_wordnet.pkl')

    feature_index={}
    with open('data.p', 'rb') as fp:
        feature_index = pickle.load(fp)

    y_true=[]
    y_pred=[]

    X=[]
    
    fpath=open(os.getcwd()+'/testing_set_path.txt')
    f = open("20NG_ig_test.txt", "w")

    class_no=0
    
    for line in fpath.read().split('\n'):
        print line
        path = line
        if path == '':
            break
        for file in glob.glob(os.path.join(path, '*')):
            #print file
            # binary bag-of-words vector over the 2260 selected features
            mapping = [0]*2260
            for word in open(file).read().split():
                index = feature_index.get(word)
                #print index
                if index is not None:
                    mapping[index] = 1
                else:
                    # fall back to WordNet: try the lemmas of each synset of the word
                    for ss in wn.synsets(word):
                        for l in ss.lemmas():
                            index = feature_index.get(l.name)
                            if index is not None:
                                mapping[index] = 1
                                break

                    # collect all lemma names and look them up as well
                    lemma_names = []
                    for syn_set in wn.synsets(word):
                        for syn in syn_set.lemmas():
                            lemma_names.append(syn.name)

                    for w in lemma_names:
                        index = feature_index.get(w)
                        if index is not None:
                            mapping[index] = 1
                            break

            X.append(mapping)
            y_true.append(class_no)
            y_pred.append(int(clf.predict(mapping)))

            f.write(str(class_no))
            f.write(" ")
            for m in mapping:
                f.write(str(m))
                f.write(" ")
            f.write('\n')

        class_no = class_no + 1

    f.close()

    with open('testing_X.p', 'wb') as fp:
        pickle.dump(X, fp)

    with open('testing_y_true.p', 'wb') as fp:
        pickle.dump(y_true, fp)

    with open('testing_y_pred.p', 'wb') as fp:
        pickle.dump(y_pred, fp)

    #print y_true
    #print y_pred
    target_names = ['Alt', 'Computers', 'Miscellaneous','Rec','Science','Social','Talk']
    print(classification_report(y_true, y_pred, target_names=target_names))
    accuracy = zero_one_score(y_true, y_pred)
    print 'accuracy',accuracy
    print metrics.precision_score(y_true, y_pred, average='macro')
    print metrics.recall_score(y_true, y_pred, average='micro')
    print metrics.f1_score(y_true, y_pred, average='weighted')  
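The feature-mapping loop above hinges on the WordNet fallback: unknown words are replaced by the lemma names of their synsets before the lookup. A standalone sketch of that lookup, assuming NLTK 3 (where Lemma.name is a method; in the older NLTK used by this script it was an attribute):

# Illustrative helper only, not part of the original script.
from nltk.corpus import wordnet as wn

def mark_word(word, feature_index, mapping):
    # Set the feature bit for word, falling back to its WordNet lemma names.
    index = feature_index.get(word)
    if index is not None:
        mapping[index] = 1
        return
    for synset in wn.synsets(word):
        for lemma in synset.lemmas():
            index = feature_index.get(lemma.name())
            if index is not None:
                mapping[index] = 1
                break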
Exemplo n.º 36
0
    trainlabels = labels[:n_train]
    #valset = records[n_train:,:]
    #vallabels = labels[n_train:,:]
    valset = records[n_train:n_train+n_val,:]
    vallabels = labels[n_train:n_train+n_val]
    n,dim = trainset.shape

    # mean centering, stdev normalization and whitening
    scaler = Scaler()
    scaler.fit(trainset)
    trainset = scaler.transform(trainset)
    valset = scaler.transform(valset)
    pca = PCA(n_components=dim,whiten=True)
    pca.fit(trainset)
    trainset = pca.transform(trainset)
    valset = pca.transform(valset)

    config = Train_config()
    config.iterations = 10
    config.nonlinearity = 'tanh'
    config.batchsize = 50
    config.learning_rate = 0.2
    config.momentum = 0.7
    log = open('log.txt','w')
    nn = Net([dim,300,10],log_file=log)
    nn.fit(trainset,trainlabels,config,val_set=valset,val_labels=vallabels)
    nn_file = open('nn.obj','w')
    pickle.dump(nn.weights,nn_file)
    results = nn.predict(valset,config)
    print zero_one_score(vallabels,results)
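Example 36 standardizes the data and then whitens it with PCA before training the network; assuming a current scikit-learn, the same preprocessing can be written as a Pipeline (StandardScaler is the modern name for the Scaler used above). Illustrative sketch only:

# Illustrative sketch only: the preprocessing chain from the example.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

preprocess = Pipeline([
    ('scale', StandardScaler()),   # mean centering and unit variance
    ('whiten', PCA(whiten=True)),  # rotate to principal components and whiten
])
# trainset = preprocess.fit_transform(trainset)
# valset = preprocess.transform(valset)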
Exemplo n.º 37
0
 def calculate_testing_accuracy(self, Y, predict):
     redundancy = sum(self.Y) * 1.0 / len(self.Y)
     accuracy = zero_one_score(Y, predict)
     precision = precision_score(Y, predict)
     recall = recall_score(Y, predict)
     f1 = f1_score(Y, predict)
Exemplo n.º 38
0
#            flag = False
#    return flag
    if decadecount[1]<cap or decadecount[2]<cap or decadecount[3]<cap :
        flag = False 
    return flag
cap = 40
#sys.stdout = open("output.txt", "w")
traindir = r"C:\Users\gouthamdl\Desktop\data"
segments,csegments = getclusters(traindir)
#n_samples, n_features = segments.shape
#print segments
#print 'csegments'
#print csegments
print 'Performing Clustering'
estimator = KMeans(init='k-means++', n_clusters=50, n_init=1)
cestimator = KMeans(init='k-means++', n_clusters=50, n_init=1)
kmeans = estimator.fit(segments)
ckmeans = cestimator.fit(csegments)
features,labels = buildfeatures(traindir,kmeans,ckmeans,ext='.h5')
features = array(features)
labels = array(labels)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(features, labels, test_size=0.1, random_state=0)

print 'Performing Classification'
clf2 = LogisticRegression().fit(X_train,y_train)
dec_pred2 = clf2.predict(X_test)
accuracy = zero_one_score(y_test, dec_pred2)
print 'Accuracy with Logistic Regression : ' + str(accuracy)

#for x,y in zip(y_test,dec_pred2):
#    print 'Actual Decade : ' + str(x) + ' Predicted Decade : ' + str(y)
Exemplo n.º 39
0
 def train_classifier(self):
     self.classifer.fit(self.train_features, self.train_labels)
     self.predicted_train_labels = self.classifer.predict(self.train_features)
     self.train_accuracy = sklearn_metrics.zero_one_score(self.train_labels, self.predicted_train_labels)
Exemplo n.º 40
0
    X = Scaler().fit_transform(X)
    for name, Clf in classifiers:
        if Clf in dont_test or Clf in meta_estimators:
            continue
        if Clf in [MultinomialNB, BernoulliNB]:
            # TODO also test these!
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            clf = Clf()
        # fit
        clf.fit(X, y)
        y_pred = clf.predict(X)
        assert_equal(y_pred.shape[0], n_samples)
        # training set performance
        assert_greater(zero_one_score(y, y_pred), 0.78)

        # raises error on malformed input for predict
        assert_raises(ValueError, clf.predict, X.T)
        if hasattr(clf, "decision_function"):
            try:
                # decision_function agrees with predict:
                decision = clf.decision_function(X)
                assert_equal(decision.shape, (n_samples, n_labels))
                if not isinstance(clf, BaseLibSVM):
                    # the one-vs-one decision_function of LibSVM is shaped differently
                    assert_array_equal(np.argmax(decision, axis=1), y_pred)
                # raises error on malformed input for decision_function
                assert_raises(ValueError, clf.decision_function, X.T)
            except NotImplementedError:
                pass
Exemplo n.º 41
0
 def test_classifier(self):
     self.predicted_test_labels = self.classifer.predict(self.test_features)
     self.test_accuracy = sklearn_metrics.zero_one_score(self.test_labels, self.predicted_test_labels)
Exemplo n.º 42
0
    hits = [
        np.all(y_true[np.where(groups == this_group)] == y_pred[np.where(groups == this_group)])
        for this_group in np.unique(groups)
    ]
    return np.mean(hits)


def all_or_nothing_contig(y_true, y_pred, groups):
    matches = 0
    n_groups = 0
    is_good = False
    for k, (this_y_true, this_y_pred) in enumerate(zip(y_true, y_pred)):
        if groups[k] != groups[k - 1]:
            n_groups += 1
            matches += is_good
            is_good = True
        if this_y_true != this_y_pred:
            is_good = False

    matches += is_good
    return (matches * 1.0) / n_groups


# what proportion of candidate hyphens were predicted correctly?
print zero_one_score(y_true, y_pred)  # 0.973684210526

# what proportion of words did we get completely right?
print all_or_nothing_score(y_true, y_pred, groups)  # 0.75
print all_or_nothing_contig(y_true, y_pred, groups)
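Toy data makes the difference between the per-hyphen score and the per-word (group) scores above concrete; in this sketch group 0 is predicted perfectly and group 1 contains one error, so both group-level scores come out to 0.5:

# Illustrative usage only, with made-up arrays.
import numpy as np

toy_true = np.array([1, 0, 1, 1])
toy_pred = np.array([1, 0, 0, 1])
toy_groups = np.array([0, 0, 1, 1])

print(all_or_nothing_score(toy_true, toy_pred, toy_groups))   # 0.5: one of two groups fully correct
print(all_or_nothing_contig(toy_true, toy_pred, toy_groups))  # 0.5: same result, computed in one pass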
Exemplo n.º 43
0
    hits = [
        np.all(y_true[np.where(groups == this_group)] == y_pred[np.where(
            groups == this_group)]) for this_group in np.unique(groups)
    ]
    return np.mean(hits)


def all_or_nothing_contig(y_true, y_pred, groups):
    matches = 0
    n_groups = 0
    is_good = False
    for k, (this_y_true, this_y_pred) in enumerate(zip(y_true, y_pred)):
        if groups[k] != groups[k - 1]:
            n_groups += 1
            matches += is_good
            is_good = True
        if this_y_true != this_y_pred:
            is_good = False

    matches += is_good
    return (matches * 1.0) / n_groups


# what proportion of candidate hyphens were predicted correctly?
print zero_one_score(y_true, y_pred)  # 0.973684210526

# what proportion of words did we get completely right?
print all_or_nothing_score(y_true, y_pred, groups)  # 0.75
print all_or_nothing_contig(y_true, y_pred, groups)
Exemplo n.º 44
0
def test_classifiers_train():
    # test if classifiers do something sensible on training set
    # also test all shapes / shape errors
    estimators = all_estimators()
    classifiers = [(name, E) for name, E in estimators
                   if issubclass(E, ClassifierMixin)]
    iris = load_iris()
    X_m, y_m = iris.data, iris.target
    X_m, y_m = shuffle(X_m, y_m, random_state=7)
    X_m = StandardScaler().fit_transform(X_m)
    # generate binary problem from multi-class one
    y_b = y_m[y_m != 2]
    X_b = X_m[y_m != 2]
    for (X, y) in [(X_m, y_m), (X_b, y_b)]:
        # do it once with binary, once with multiclass
        classes = np.unique(y)
        n_classes = len(classes)
        n_samples, n_features = X.shape
        for name, Clf in classifiers:
            if Clf in dont_test or Clf in meta_estimators:
                continue
            if Clf in [MultinomialNB, BernoulliNB]:
                # TODO also test these!
                continue
            # catch deprecation warnings
            with warnings.catch_warnings(record=True):
                clf = Clf()
            # raises error on malformed input for fit
            assert_raises(ValueError, clf.fit, X, y[:-1])

            # fit
            clf.fit(X, y)
            y_pred = clf.predict(X)
            assert_equal(y_pred.shape, (n_samples, ))
            # training set performance
            assert_greater(zero_one_score(y, y_pred), 0.78)

            # raises error on malformed input for predict
            assert_raises(ValueError, clf.predict, X.T)
            if hasattr(clf, "decision_function"):
                try:
                    # decision_function agrees with predict:
                    decision = clf.decision_function(X)
                    if n_classes == 2:
                        assert_equal(decision.ravel().shape, (n_samples, ))
                        dec_pred = (decision.ravel() > 0).astype(np.int)
                        assert_array_equal(dec_pred, y_pred)
                    if n_classes == 3 and not isinstance(clf, BaseLibSVM):
                        # the one-vs-one decision_function of LibSVM is shaped differently
                        assert_equal(decision.shape, (n_samples, n_classes))
                        assert_array_equal(np.argmax(decision, axis=1), y_pred)

                    # raises error on malformed input for decision_function
                    assert_raises(ValueError, clf.decision_function, X.T)
                except NotImplementedError:
                    pass
            if hasattr(clf, "predict_proba"):
                try:
                    # predict_proba agrees with predict:
                    y_prob = clf.predict_proba(X)
                    assert_equal(y_prob.shape, (n_samples, n_classes))
                    assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
                    # check that probas for all classes sum to one
                    assert_array_almost_equal(np.sum(y_prob, axis=1),
                                              np.ones(n_samples))
                    # raises error on malformed input for predict_proba
                    assert_raises(ValueError, clf.predict_proba, X.T)
                except NotImplementedError:
                    pass

            if hasattr(clf, "classes_"):
                if hasattr(clf, "n_outputs_"):
                    assert_equal(clf.n_outputs_, 1)
                    assert_array_equal(
                        clf.classes_, [classes],
                        "Unexpected classes_ attribute for %r" % clf)
                else:
                    # flat classes array: XXX inconsistent
                    assert_array_equal(
                        clf.classes_, classes,
                        "Unexpected classes_ attribute for %r" % clf)
Exemplo n.º 45
0
 def train_classifier(self):
     self.classifer.fit(self.train_features, self.train_labels)
     self.predicted_train_labels = self.classifer.predict(
         self.train_features)
     self.train_accuracy = sklearn_metrics.zero_one_score(
         self.train_labels, self.predicted_train_labels)
Exemplo n.º 46
0
# Test for 10 rounds using the results from 10 fold cross validations
for train_index, test_index in kf:

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)
    train_time = time() - t0

    pred = clf.predict(X_test)
    test_time = time() - t0

    # metrics
    f1_score = metrics.f1_score(y_test, pred)
    acc_score = metrics.zero_one_score(y_test, pred)
    pre_score = metrics.precision_score(y_test, pred)
    rec_score = metrics.recall_score(y_test, pred)
    f1_all += f1_score
    acc_all += acc_score
    pre_all += pre_score
    rec_all += rec_score

f1_all = f1_all/num_fold
acc_all = acc_all/num_fold
pre_all = pre_all/num_fold
rec_all = rec_all/num_fold

print
print clf
print "average f1-score:   %0.5f" % f1_all
Exemplo n.º 47
0
 def test_classifier(self):
     self.predicted_test_labels = self.classifer.predict(self.test_features)
     self.test_accuracy = sklearn_metrics.zero_one_score(
         self.test_labels, self.predicted_test_labels)