Example #1
def test_class_labels():
    """Test if the correct class labels are returned."""
    X = DATASETS["iris_semisup"]["X"]
    y = np.copy(DATASETS["iris_semisup"]["y"])

    # default labels
    labels = np.unique(y)[1:]  # skip the first unique value, which marks unlabelled samples
    clf = SemiSupervisedDecisionTreeClassifier(random_state=2)
    clf.fit(X, y)

    _confirm_only_a_in_b(labels, clf.predict(X),
                         "Predicted default labels are wrong.")
    _confirm_only_a_in_b(labels, clf.transduced_labels_,
                         "Transduced default labels are wrong.")

    # far apart labels
    for label in labels:
        y[label == y] += label * 100
    labels = np.unique(y)[1:]
    clf = SemiSupervisedDecisionTreeClassifier(random_state=2)
    clf.fit(X, y)

    _confirm_only_a_in_b(labels, clf.predict(X),
                         "Predicted far-apart labels are wrong.")
    _confirm_only_a_in_b(labels, clf.transduced_labels_,
                         "Transduced far-apart labels are wrong.")
Example #2
def test_semisupervised_classes():
    """Test that lowest class never appears in results."""
    lowest_class_label = iris.target.min()
    clf = SemiSupervisedDecisionTreeClassifier(random_state=2)
    clf.fit(iris.data, iris.target)
    labels = clf.predict(iris.data)
    assert_false(lowest_class_label in labels)
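This test and Example #1 both rest on the labelling convention exercised throughout this listing: the smallest value in y marks unlabelled samples. A minimal usage sketch:

y = iris.target.copy()
y[::2] = -1  # every second sample is now treated as unlabelled
clf = SemiSupervisedDecisionTreeClassifier(random_state=2).fit(iris.data, y)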
Example #3
File: test_tree.py Project: loli/sklearnef
def test_semisupervised_tree_errors():
    """Check class argument errors for semi-supervised trees."""
    with assert_raises(ValueError):  # the criterion is fixed and cannot be set
        SemiSupervisedDecisionTreeClassifier(criterion='gini')
    with assert_raises(ValueError):  # the splitter is fixed and cannot be set
        SemiSupervisedDecisionTreeClassifier(splitter='best')
    with assert_raises(ValueError):  # supervised_weight must be strictly below 1
        SemiSupervisedDecisionTreeClassifier(supervised_weight=1.0)
    with assert_raises(ValueError):  # min_samples_leaf=1 is too small
        SemiSupervisedDecisionTreeClassifier(min_samples_leaf=1).fit(
            iris.data, iris.target)
    with assert_raises(ValueError):  # multi-output y is not supported
        SemiSupervisedDecisionTreeClassifier().fit(
            iris.data, np.squeeze(np.dstack((iris.target, iris.target))))
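For contrast, a parameter combination that passes these checks, assembled from the values used in Examples #1 and #5:

clf = SemiSupervisedDecisionTreeClassifier(
    random_state=2,
    supervised_weight=.9999999999,         # strictly below 1
    min_samples_leaf=iris.data.shape[-1])  # larger than 1
clf.fit(iris.data, iris.target)            # single-output y only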
Example #4
File: test_tree.py Project: loli/sklearnef
def test_labeled_only():
    """Test the labeled only entropy."""
    # Note: labeledonly can not be used directly, but the unsupervised part can
    # be effectively deactivated through setting the weight very height
    # Note that this will still affect the probability, but should lead to
    # the same prediction
    y = iris.target.copy()[:-10]
    y[-1:] = -1
    clf = SemiSupervisedDecisionTreeClassifier(random_state=0, supervised_weight=.9999999999999999, max_features=None).fit(iris.data[:-10], y)
    baseline_pred = clf.predict(iris.data)
    
    # adding new, unlabeled samples should not change the prediction outcome
    for i in range(2, 10):
        y = iris.target.copy()[:-(10 - i + 1)]
        y[-i:] = -1
        clf = SemiSupervisedDecisionTreeClassifier(
            random_state=0, supervised_weight=.9999999999999999,
            max_features=None).fit(iris.data[:-(10 - i + 1)], y)
        pred = clf.predict(iris.data)
        assert_array_equal(baseline_pred, pred)
Example #5
def test_semisupervised_as_supervised():
    """Test the semi-supervised tree as supervised tree."""
    ssy = iris.target.copy()
    ssy[-1] = -1  # mark the last sample as unlabelled; it is ignored by the semi-supervised approach

    ssclf = SemiSupervisedDecisionTreeClassifier(
        random_state=0,
        min_samples_leaf=iris.data.shape[-1],
        supervised_weight=.9999999999,  # close to 1; exactly 1 is not allowed
        unsupervised_transformation=None)
    ssclf.fit(iris.data, ssy)
    ssprob = ssclf.predict_proba(iris.data)
    sspredict = ssclf.predict(iris.data)

    sclf = tree.DecisionTreeClassifier(random_state=0,
                                       min_samples_leaf=iris.data.shape[-1])
    sclf.fit(iris.data[:-1], iris.target[:-1])
    sprob = sclf.predict_proba(iris.data)
    spredict = sclf.predict(iris.data)

    assert_array_equal(ssprob, sprob)
    assert_array_equal(sspredict, spredict)
Example #6
def main():
    args = getArguments(getParser())

    # initialize the random seed
    np.random.seed(args.seed)

    # ----- Data generation ----
    means, covs = generate_clusters(args.max_area, args.n_clusters, args.sigma)

    (X_train, X_train_unlabelled, X_train_labelled),\
    (y_train, y_train_unlabelled, y_train_labelled),\
    y_train_gt = sample_data(means, covs, args.n_samples)

    # make custom map
    cmap = plt.get_cmap('jet', len(np.unique(y_train)))

    # ----- Data scaling ----
    # Must be performed before to display final data in the right space
    if args.scaling:
        scale_data(X_train,
                   (X_train, X_train_unlabelled, X_train_labelled, means))

    # ----- Grid -----
    grid = generate_grid(X_train, args.sigma, args.resolution)

    # ----- Training -----
    clf = SemiSupervisedDecisionTreeClassifier(
        random_state=args.seed,
        max_depth=args.max_depth,
        max_features=args.max_features,
        supervised_weight=args.supervised_weight,
        min_improvement=args.min_improvement,
        transduction_method=args.transduction_method,
        unsupervised_transformation='scale' if args.scaling else None)
    clf.fit(X_train, y_train)

    # ----- plot tree into file -----
    # Convert with: dot -Tps tree.dot -o tree.ps
    export_graphviz(clf)

    # ----- Learned distribution -----
    X_test_pred = np.rollaxis(grid, 0, 3).reshape(
        (np.prod(grid.shape[1:]), grid.shape[0]))
    pdf = clf.pdf(X_test_pred)

    # ----- Ground truth distribution -----
    X_test_gt = np.rollaxis(grid, 0, 3)
    prob_gt = np.sum([
        scipy.stats.multivariate_normal.pdf(X_test_gt, mean, cov)
        for mean, cov in zip(means, covs)
    ], 0)
    prob_gt /= args.n_clusters  # normalize

    # ----- Transduction -----
    y_train_result = clf.transduced_labels_

    # ----- A-posteriori classification / induction -----
    y_train_prediction = clf.predict(X_train_unlabelled)

    # ----- Plotting -----
    x, y = grid
    x = np.unique(x)
    y = np.unique(y)

    # colour range: shared scale for both PDF plots
    pdf_vmin = min(prob_gt.min(), pdf.min())
    pdf_vmax = max(prob_gt.max(), pdf.max())

    # plot: gt - pdf
    plt.subplot(3, 1, 1)
    plt.scatter(X_train_unlabelled[:, 0],
                X_train_unlabelled[:, 1],
                c=cmap(y_train_gt.astype(np.uint8)),
                s=20,
                alpha=.5)
    plt.scatter(X_train_labelled[:, 0],
                X_train_labelled[:, 1],
                c=cmap(y_train_labelled.astype(np.uint8)),
                s=100)
    img = plt.imshow(prob_gt.T,
                     extent=[min(x), max(x), min(y),
                             max(y)],
                     interpolation='none',
                     cmap=plt.cm.afmhot,
                     aspect='auto',
                     origin='lower',
                     vmin=pdf_vmin,
                     vmax=pdf_vmax,
                     alpha=.5)
    plt.colorbar(img)

    plt.xlim(min(x), max(x))
    plt.ylim(min(y), max(y))
    plt.title('Ground-truth: PDF + samples')

    if not args.no_split_lines:
        draw_split_lines(clf, x, y)

    # plot: learned - pdf
    plt.subplot(3, 1, 2)
    plt.scatter(X_train_unlabelled[:, 0],
                X_train_unlabelled[:, 1],
                c=cmap(y_train_result.astype(np.uint8)),
                s=20,
                alpha=.5)
    plt.scatter(X_train_labelled[:, 0],
                X_train_labelled[:, 1],
                c=cmap(y_train_labelled.astype(np.uint8)),
                s=100)
    img = plt.imshow(pdf.reshape((x.size, y.size)).T,
                     extent=[min(x), max(x), min(y),
                             max(y)],
                     interpolation='none',
                     cmap=plt.cm.afmhot,
                     aspect='auto',
                     origin='lower',
                     vmin=pdf_vmin,
                     vmax=pdf_vmax,
                     alpha=.5)
    plt.colorbar(img)

    plt.xlim(min(x), max(x))
    plt.ylim(min(y), max(y))
    plt.title('Learned: PDF + samples labelled through transduction')

    if not args.no_split_lines:
        draw_split_lines(clf, x, y)

    # plot: learned - a-posteriori classification
    plt.subplot(3, 1, 3)
    plt.scatter(X_train_unlabelled[:, 0],
                X_train_unlabelled[:, 1],
                c=cmap(y_train_prediction.astype(np.uint8)),
                s=20,
                alpha=.5)
    plt.scatter(X_train_labelled[:, 0],
                X_train_labelled[:, 1],
                c=cmap(y_train_labelled.astype(np.uint8)),
                s=100)
    plt.colorbar(img)  # just for scale

    plt.xlim(min(x), max(x))
    plt.ylim(min(y), max(y))
    plt.title('Learned: a-posteriori classification / induction')

    # add split-lines
    if not args.no_split_lines:
        draw_split_lines(clf, x, y)

    plt.show()
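The script depends on getParser()/getArguments() helpers that are not shown. A minimal sketch of such a parser, reconstructed purely from the args.* attributes the script reads (option names and defaults are illustrative assumptions, not the project's actual values):

import argparse

def getParser():
    # Hypothetical reconstruction: one option per attribute used in main().
    parser = argparse.ArgumentParser(
        description='Semi-supervised decision tree demo.')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--max-area', dest='max_area', type=float, default=10.)
    parser.add_argument('--n-clusters', dest='n_clusters', type=int, default=4)
    parser.add_argument('--sigma', type=float, default=1.)
    parser.add_argument('--n-samples', dest='n_samples', type=int, default=200)
    parser.add_argument('--resolution', type=int, default=100)
    parser.add_argument('--max-depth', dest='max_depth', type=int, default=None)
    parser.add_argument('--max-features', dest='max_features',
                        default=None)  # int, float or string; left untyped here
    parser.add_argument('--supervised-weight', dest='supervised_weight',
                        type=float, default=.5)
    parser.add_argument('--min-improvement', dest='min_improvement',
                        type=float, default=0.)
    parser.add_argument('--transduction-method', dest='transduction_method',
                        default=None)  # project-specific choices
    parser.add_argument('--scaling', action='store_true')
    parser.add_argument('--no-split-lines', dest='no_split_lines',
                        action='store_true')
    return parser

def getArguments(parser):
    return parser.parse_args()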
Example #7
def test_semisupervised():
    """Test class working without checking results."""
    clf = SemiSupervisedDecisionTreeClassifier(random_state=2)
    clf.fit(iris.data, iris.target)
    clf.predict_proba(iris.data)
Example #8
def test_semisupervised_probas():
    """Test probability results to sum to one."""
    clf = SemiSupervisedDecisionTreeClassifier(random_state=2)
    clf.fit(iris.data, iris.target)
    proba = clf.predict_proba(iris.data)
    assert_true(np.all(1 == proba.sum(1)))
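Exact float equality holds only if the implementation normalizes each probability row explicitly; a tolerance-based variant is the safer general pattern:

    assert_true(np.allclose(1, proba.sum(axis=1)))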