def test_class_labels():
    """Test if the correct class labels are returned."""
    X = DATASETS["iris_semisup"]["X"]
    y = np.copy(DATASETS["iris_semisup"]["y"])

    # default labels
    labels = np.unique(y)[1:]
    clf = SemiSupervisedDecisionTreeClassifier(random_state=2)
    clf.fit(X, y)
    _confirm_only_a_in_b(labels, clf.predict(X),
                         "Predicted continuous labels are wrong.")
    _confirm_only_a_in_b(labels, clf.transduced_labels_,
                         "Transduced continuous labels are wrong.")

    # far-apart labels
    for l in labels:
        y[l == y] += l * 100
    labels = np.unique(y)[1:]
    clf = SemiSupervisedDecisionTreeClassifier(random_state=2)
    clf.fit(X, y)
    _confirm_only_a_in_b(labels, clf.predict(X),
                         "Predicted apart labels are wrong.")
    _confirm_only_a_in_b(labels, clf.transduced_labels_,
                         "Transduced apart labels are wrong.")
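# The _confirm_only_a_in_b helper used above is assumed to be defined
# elsewhere in this test module. A minimal sketch of the behaviour its name
# and call sites suggest (this implementation is an assumption, not the
# original): every predicted label must come from the known label set.
def _confirm_only_a_in_b(a, b, msg):
    """Assert that every element of b is drawn from the label set a."""
    assert_true(np.all(np.isin(b, a)), msg)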
def test_semisupervised_classes():
    """Test that the lowest class never appears in the results."""
    lowest_class_label = iris.target.min()
    clf = SemiSupervisedDecisionTreeClassifier(random_state=2)
    clf.fit(iris.data, iris.target)
    labels = clf.predict(iris.data)
    assert_false(lowest_class_label in labels)
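# Illustration of the labelling convention these tests rely on: the smallest
# class value marks the "unlabelled" samples, so it must never show up in
# predictions. The toy values below are made up for illustration only.
_y_demo = np.array([-1, -1, 0, 1, 1])  # -1 = unlabelled marker
_demo_labels = np.unique(_y_demo)[1:]  # -> array([0, 1]); marker dropped,
                                       # as in test_class_labels above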
def test_semisupervised_tree_errors():
    """Check class argument errors for semi-supervised trees."""
    with assert_raises(ValueError):
        SemiSupervisedDecisionTreeClassifier(criterion='gini')
    with assert_raises(ValueError):
        SemiSupervisedDecisionTreeClassifier(splitter='best')
    with assert_raises(ValueError):
        SemiSupervisedDecisionTreeClassifier(supervised_weight=1.0)
    with assert_raises(ValueError):
        SemiSupervisedDecisionTreeClassifier(min_samples_leaf=1).fit(
            iris.data, iris.target)
    with assert_raises(ValueError):
        SemiSupervisedDecisionTreeClassifier().fit(
            iris.data, np.squeeze(np.dstack((iris.target, iris.target))))
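# For contrast, a parameter combination that the surrounding tests treat as
# valid (values taken from those tests; this is a sketch, not an exhaustive
# specification of the accepted arguments):
_valid_clf = SemiSupervisedDecisionTreeClassifier(
    random_state=0,
    supervised_weight=.9999999999,         # exactly 1.0 raises above
    min_samples_leaf=iris.data.shape[-1])  # 1 raises above; the other tests
                                           # use the feature count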
def test_labeled_only():
    """Test the labeled-only entropy."""
    # Note: labeled-only cannot be used directly, but the unsupervised part
    # can be effectively deactivated by setting the supervised weight very
    # high. Note that this will still affect the probabilities, but should
    # lead to the same predictions.
    y = iris.target.copy()[:-10]
    y[-1:] = -1
    clf = SemiSupervisedDecisionTreeClassifier(
        random_state=0, supervised_weight=.9999999999999999,
        max_features=None).fit(iris.data[:-10], y)
    baseline_pred = clf.predict(iris.data)

    # adding new, unlabeled samples should not change the prediction outcome
    for i in range(2, 10):
        y = iris.target.copy()[:-(10 - i + 1)]
        y[-i:] = -1
        clf = SemiSupervisedDecisionTreeClassifier(
            random_state=0, supervised_weight=.9999999999999999,
            max_features=None).fit(iris.data[:-(10 - i + 1)], y)
        pred = clf.predict(iris.data)
        assert_array_equal(baseline_pred, pred)
def test_semisupervised_as_supervised():
    """Test the semi-supervised tree as supervised tree."""
    ssy = iris.target.copy()
    ssy[-1] = -1  # un-labelled class, ignored by the semi-supervised approach
    ssclf = SemiSupervisedDecisionTreeClassifier(
        random_state=0,
        min_samples_leaf=iris.data.shape[-1],
        supervised_weight=.9999999999,  # near 1; exactly 1 is not allowed
        unsupervised_transformation=None)
    ssclf.fit(iris.data, ssy)
    ssprob = ssclf.predict_proba(iris.data)
    sspredict = ssclf.predict(iris.data)

    sclf = tree.DecisionTreeClassifier(random_state=0,
                                       min_samples_leaf=iris.data.shape[-1])
    sclf.fit(iris.data[:-1], iris.target[:-1])
    sprob = sclf.predict_proba(iris.data)
    spredict = sclf.predict(iris.data)

    assert_array_equal(ssprob, sprob)
    assert_array_equal(sspredict, spredict)
def main():
    args = getArguments(getParser())

    # initialize the random seed
    np.random.seed(args.seed)

    # ----- Data generation -----
    means, covs = generate_clusters(args.max_area, args.n_clusters, args.sigma)
    (X_train, X_train_unlabelled, X_train_labelled),\
    (y_train, y_train_unlabelled, y_train_labelled),\
    y_train_gt = sample_data(means, covs, args.n_samples)

    # make custom colour map
    cmap = plt.get_cmap('jet', len(np.unique(y_train)))

    # ----- Data scaling -----
    # Must be performed beforehand so that the final data is displayed in
    # the right space
    if args.scaling:
        scale_data(X_train,
                   (X_train, X_train_unlabelled, X_train_labelled, means))

    # ----- Grid -----
    grid = generate_grid(X_train, args.sigma, args.resolution)

    # ----- Training -----
    clf = SemiSupervisedDecisionTreeClassifier(
        random_state=args.seed,
        max_depth=args.max_depth,
        max_features=args.max_features,
        supervised_weight=args.supervised_weight,
        min_improvement=args.min_improvement,
        transduction_method=args.transduction_method,
        unsupervised_transformation='scale' if args.scaling else None)
    clf.fit(X_train, y_train)

    # ----- Plot tree into file -----
    # Convert with: dot -Tps tree.dot -o tree.ps
    export_graphviz(clf)

    # ----- Learned distribution -----
    X_test_pred = np.rollaxis(grid, 0, 3).reshape(
        (np.prod(grid.shape[1:]), grid.shape[0]))
    pdf = clf.pdf(X_test_pred)

    # ----- Ground truth distribution -----
    X_test_gt = np.rollaxis(grid, 0, 3)
    prob_gt = np.sum([scipy.stats.multivariate_normal.pdf(X_test_gt, mean, cov)
                      for mean, cov in zip(means, covs)], 0)
    prob_gt /= args.n_clusters  # normalize

    # ----- Transduction -----
    y_train_result = clf.transduced_labels_

    # ----- A-posteriori classification / induction -----
    y_train_prediction = clf.predict(X_train_unlabelled)

    # ----- Plotting -----
    x, y = grid
    x = np.unique(x)
    y = np.unique(y)

    # colour range: pdf
    pdf_vmin = min(prob_gt.min(), pdf.min())
    pdf_vmax = min(prob_gt.max(), pdf.max())

    # plot: ground truth - pdf
    plt.subplot(3, 1, 1)
    plt.scatter(X_train_unlabelled[:, 0], X_train_unlabelled[:, 1],
                c=cmap(y_train_gt.astype(np.uint8)), s=20, alpha=.5)
    plt.scatter(X_train_labelled[:, 0], X_train_labelled[:, 1],
                c=cmap(y_train_labelled.astype(np.uint8)), s=100)
    img = plt.imshow(prob_gt.T, extent=[min(x), max(x), min(y), max(y)],
                     interpolation='none', cmap=plt.cm.afmhot, aspect='auto',
                     origin='lower', vmin=pdf_vmin, vmax=pdf_vmax, alpha=.5)
    plt.colorbar(img)
    plt.xlim(min(x), max(x))
    plt.ylim(min(y), max(y))
    plt.title('Ground-truth: PDF + samples')
    if not args.no_split_lines:
        draw_split_lines(clf, x, y)

    # plot: learned - pdf
    plt.subplot(3, 1, 2)
    plt.scatter(X_train_unlabelled[:, 0], X_train_unlabelled[:, 1],
                c=cmap(y_train_result.astype(np.uint8)), s=20, alpha=.5)
    plt.scatter(X_train_labelled[:, 0], X_train_labelled[:, 1],
                c=cmap(y_train_labelled.astype(np.uint8)), s=100)
    img = plt.imshow(pdf.reshape((x.size, y.size)).T,
                     extent=[min(x), max(x), min(y), max(y)],
                     interpolation='none', cmap=plt.cm.afmhot, aspect='auto',
                     origin='lower', vmin=pdf_vmin, vmax=pdf_vmax, alpha=.5)
    plt.colorbar(img)
    plt.xlim(min(x), max(x))
    plt.ylim(min(y), max(y))
    plt.title('Learned: PDF + samples labelled through transduction')
    if not args.no_split_lines:
        draw_split_lines(clf, x, y)

    # plot: learned - a-posteriori classification
    plt.subplot(3, 1, 3)
    plt.scatter(X_train_unlabelled[:, 0], X_train_unlabelled[:, 1],
                c=cmap(y_train_prediction.astype(np.int8)), s=20, alpha=.5)
    plt.scatter(X_train_labelled[:, 0], X_train_labelled[:, 1],
                c=cmap(y_train_labelled.astype(np.int8)), s=100)
    plt.colorbar(img)  # just for scale
    plt.xlim(min(x), max(x))
    plt.ylim(min(y), max(y))
    plt.title('Learned: a-posteriori classification / induction')
    if not args.no_split_lines:
        draw_split_lines(clf, x, y)

    plt.show()
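# getParser()/getArguments() are not shown in this excerpt. Below is a
# hypothetical sketch covering only the flags main() actually reads; the
# option names are inferred from the args.* accesses above and all defaults
# are assumptions, not the original configuration.
import argparse

def getParser():
    parser = argparse.ArgumentParser(
        description='Semi-supervised decision tree toy example.')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--max-area', dest='max_area', type=float, default=10.)
    parser.add_argument('--n-clusters', dest='n_clusters', type=int, default=3)
    parser.add_argument('--sigma', type=float, default=1.)
    parser.add_argument('--n-samples', dest='n_samples', type=int, default=200)
    parser.add_argument('--resolution', type=int, default=100)
    parser.add_argument('--max-depth', dest='max_depth', type=int, default=None)
    parser.add_argument('--max-features', dest='max_features', default=None)
    parser.add_argument('--supervised-weight', dest='supervised_weight',
                        type=float, default=.5)
    parser.add_argument('--min-improvement', dest='min_improvement',
                        type=float, default=0.)
    parser.add_argument('--transduction-method', dest='transduction_method',
                        default='fast')
    parser.add_argument('--scaling', action='store_true')
    parser.add_argument('--no-split-lines', dest='no_split_lines',
                        action='store_true')
    return parser

def getArguments(parser):
    return parser.parse_args()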
def test_semisupervised():
    """Smoke test that fitting and predicting run, without checking results."""
    clf = SemiSupervisedDecisionTreeClassifier(random_state=2)
    clf.fit(iris.data, iris.target)
    clf.predict_proba(iris.data)
def test_semisupervised_probas():
    """Test that the returned probabilities sum to one."""
    clf = SemiSupervisedDecisionTreeClassifier(random_state=2)
    clf.fit(iris.data, iris.target)
    proba = clf.predict_proba(iris.data)
    assert_true(np.all(1 == proba.sum(1)))
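# The exact float equality above can be brittle if the implementation ever
# accumulates rounding error when normalizing leaf frequencies. A
# tolerance-based variant (an alternative sketch, not part of the original
# test suite):
def test_semisupervised_probas_allclose():
    """Tolerance-based variant of the probability-sum check."""
    clf = SemiSupervisedDecisionTreeClassifier(random_state=2)
    clf.fit(iris.data, iris.target)
    proba = clf.predict_proba(iris.data)
    assert_true(np.allclose(1., proba.sum(axis=1)))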