def test_class_prediction_error_quickmethod_X_test_only(self):
    """
    Test that the ClassPredictionError quick method raises when X_test is
    passed without y_test
    """
    features, target = load_occupancy(return_dataset=True).to_numpy()
    X_train, X_test, y_train, y_test = tts(
        features, target, test_size=0.2, shuffle=True, random_state=42
    )

    axes = plt.figure().add_subplot()
    estimator = LinearSVC(random_state=42)

    # y_test is deliberately left out of the call below
    expected = "must specify both X_test and y_test or neither"
    with pytest.raises(YellowbrickValueError, match=expected):
        class_prediction_error(
            estimator,
            X_train=X_train,
            y_train=y_train,
            X_test=X_test,
            ax=axes,
            show=False,
        )
def test_integrated_radviz_pandas_classes_features(self):
    """
    Test RadViz on pandas data with explicit feature and class names
    """
    dataset = load_occupancy(return_dataset=True)
    X, y = dataset.to_pandas()

    features = ["temperature", "relative humidity", "light"]

    # Class names ordered by the value each label maps to in the metadata
    ordered_labels = sorted(
        dataset.meta["labels"].items(), key=lambda item: item[1]
    )
    classes = [name for name, _ in ordered_labels]

    assert isinstance(X, pd.DataFrame)
    assert isinstance(y, pd.Series)

    # Restrict to the selected columns and cast the target to integers
    X = X[features]
    y = y.astype(int)

    # Fit, finalize and compare against the baseline image
    radviz = RadViz(features=features, classes=classes)
    radviz.fit_transform(X, y)
    radviz.finalize()
    self.assert_images_similar(radviz, tol=0.1)
def test_integrated_radviz_numpy_classes_features(self):
    """
    Test RadViz on numpy data with explicit feature and class names
    """
    dataset = load_occupancy(return_dataset=True)
    X, y = dataset.to_numpy()

    # First three feature names listed in the dataset metadata
    features = dataset.meta["features"][0:3]

    # Class names ordered by the value each label maps to in the metadata
    ordered_labels = sorted(
        dataset.meta["labels"].items(), key=lambda item: item[1]
    )
    classes = [name for name, _ in ordered_labels]

    assert isinstance(X, np.ndarray)
    assert isinstance(y, np.ndarray)

    # Keep the first three columns and cast the target to integers
    X = X[:, :3]
    y = y.astype(int)

    # Fit, finalize and compare against the baseline image
    radviz = RadViz(features=features, classes=classes)
    radviz.fit_transform(X, y)
    radviz.finalize()
    self.assert_images_similar(radviz, tol=0.1)
def balance():
    """
    Fit a ClassBalance visualizer on a train/test split of the occupancy
    target and hand it to ``savefig`` under the name "class_balance".
    """
    X, y = load_occupancy()

    # Only the target halves of the 80/20 split are used
    _, _, y_train, y_test = tts(X, y, test_size=0.2)

    viz = ClassBalance(ax=newfig(), labels=["unoccupied", "occupied"])
    viz.fit(y_train, y_test)
    savefig(viz, "class_balance")
def classreport():
    """
    Fit and score a ClassificationReport (GaussianNB, with support) on an
    80/20 split of the occupancy data and hand it to ``savefig`` under the
    name "classification_report".
    """
    X, y = load_occupancy()
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)

    viz = ClassificationReport(GaussianNB(), support=True, ax=newfig())
    viz.fit(X_train, y_train)
    viz.score(X_test, y_test)
    savefig(viz, "classification_report")
def test_pandas_integration(self):
    """
    Test PrecisionRecallCurve on pandas data from the occupancy dataset
    """
    X, y = load_occupancy(return_dataset=True).to_pandas()

    estimator = DecisionTreeClassifier(random_state=14)

    X_train, X_test, y_train, y_test = tts(
        X, y, test_size=0.2, shuffle=True, random_state=555
    )

    # Visualizer options used for the baseline image
    options = dict(
        per_class=True,
        micro=False,
        fill_area=False,
        iso_f1_curves=True,
        ap_score=False,
        classes=["unoccupied", "occupied"],
    )
    viz = PrecisionRecallCurve(estimator, **options)

    viz.fit(X_train, y_train)
    viz.score(X_test, y_test)
    viz.finalize()

    self.assert_images_similar(viz, tol=5.0)
def test_class_prediction_error_quickmethod_X_test_and_y_test(self):
    """
    Test the ClassPredictionError quick method when both X_test and y_test
    are supplied
    """
    features, target = load_occupancy(return_dataset=True).to_numpy()
    X_train, X_test, y_train, y_test = tts(
        features, target, test_size=0.2, shuffle=True, random_state=42
    )

    axes = plt.figure().add_subplot()
    estimator = LinearSVC(random_state=42)

    viz = class_prediction_error(
        estimator,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        ax=axes,
        show=False,
    )

    # The tolerance is high for reasons that are not understood:
    #   - Travis fails with RMS 9.544
    #   - AppVeyor and Linux conda fail due to non-text-based
    #     differences: RMS 12.961
    self.assert_images_similar(viz, tol=13, windows_tol=13)
def test_missing_test_data_in_quick_method(self):
    """
    Test that the precision_recall_curve quick method raises when only one
    of X_test / y_test is given
    """
    features, target = load_occupancy(return_dataset=True).to_numpy()
    X_train, X_test, y_train, y_test = tts(
        features, target, test_size=0.2, shuffle=True, random_state=55555
    )

    expected = "both X_test and y_test are required if one is specified"

    # y_test supplied without X_test
    with pytest.raises(YellowbrickValueError, match=expected):
        precision_recall_curve(
            RandomForestClassifier(),
            X_train,
            y_train,
            y_test=y_test,
            show=False,
        )

    # X_test supplied (positionally) without y_test
    with pytest.raises(YellowbrickValueError, match=expected):
        precision_recall_curve(
            RandomForestClassifier(),
            X_train,
            y_train,
            X_test,
            show=False,
        )
def test_rocauc_quickmethod(self):
    """
    Run the ROCAUC quick method on the occupancy data; the result is not
    compared against a baseline image
    """
    features, target = load_occupancy(return_dataset=True).to_numpy()

    # TODO: image comparison of the quick method
    roc_auc(DecisionTreeClassifier(), features, target)
def test_rank2d_quick_method(self):
    """
    Test that the rank2d quick method returns a Rank2D visualizer matching
    the baseline image
    """
    features, target = load_occupancy()

    viz = rank2d(features, target, algorithm="spearman", colormap="RdYlGn_r")

    assert isinstance(viz, Rank2D)
    self.assert_images_similar(viz, tol=0.1)
def test_rocauc_quickmethod(self):
    """
    Test the ROCAUC quick method against the baseline image
    """
    features, target = load_occupancy(return_dataset=True).to_numpy()

    # show=False is passed so the returned visualizer can be image-compared
    viz = roc_auc(LogisticRegression(), features, target, show=False)
    self.assert_images_similar(viz)
def test_parallel_coordinates_quickmethod(self):
    """
    Test that the parallel_coordinates quick method produces the baseline
    visualization
    """
    features, target = load_occupancy(return_dataset=True).to_numpy()

    # sample=100 keeps the test fast
    viz = parallel_coordinates(features, target, sample=100, show=False)

    # Compare the images
    self.assert_images_similar(viz)
def test_quick_method(self):
    """
    Test that the balanced_binning_reference quick method returns a
    BalancedBinningReference matching the baseline image
    """
    # Only the target vector is needed
    _, target = load_occupancy(return_dataset=True).to_numpy()

    viz = balanced_binning_reference(target, show=False)

    assert isinstance(viz, BalancedBinningReference)
    self.assert_images_similar(viz, tol=0.5)
def manifold(dataset, manifold):
    """
    Fit a Manifold visualizer on the named dataset and hand it to
    ``savefig`` as "<dataset>_<manifold>_manifold".

    Parameters
    ----------
    dataset : str
        Either "concrete" or "occupancy".

    manifold : str
        Forwarded to ``Manifold`` as its ``manifold`` argument.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the two supported names.
    """
    # Pick the loader first so the data is loaded in a single place
    if dataset == "concrete":
        loader = load_concrete
    elif dataset == "occupancy":
        loader = load_occupancy
    else:
        raise ValueError("unknown dataset")

    X, y = loader()

    viz = Manifold(manifold=manifold, ax=newfig())
    viz.fit_transform(X, y)

    outname = "{}_{}_manifold".format(dataset, manifold)
    savefig(viz, outname)
def test_integrated_scatter(self):
    """
    Test ScatterViz on the first two columns of the occupancy data
    """
    features, target = load_occupancy(return_dataset=True).to_numpy()

    # Names passed for the two columns that are plotted
    viz = ScatterViz(features=["temperature", "relative humidity"])
    viz.fit_transform_show(features[:, :2], target)
def test_classes_greater_than_indices(self):
    """
    Test that a ModelError is raised when more class names are given than
    the fitted model has classes
    """
    features, target = load_occupancy(return_dataset=True).to_numpy()

    # One more class name than the occupancy target contains
    too_many = ["unoccupied", "occupied", "partytime"]

    estimator = LinearSVC(random_state=42)
    estimator.fit(features, target)

    with pytest.raises(ModelError):
        viz = ClassPredictionError(estimator, classes=too_many)
        viz.score(features, target)
def test_classes_less_than_indices(self):
    """
    Test that NotImplementedError is raised on an attempt to filter classes
    """
    features, target = load_occupancy(return_dataset=True).to_numpy()

    # A single class name is passed for the occupancy target
    too_few = ["unoccupied"]

    estimator = LinearSVC(random_state=42)
    estimator.fit(features, target)

    with pytest.raises(NotImplementedError):
        viz = ClassPredictionError(estimator, classes=too_few)
        viz.score(features, target)
def test_pandas_bins(self):
    """
    Test BalancedBinningReference on a pandas target from the occupancy
    dataset
    """
    # Only the target is fitted; the feature frame is discarded
    _, target = load_occupancy(return_dataset=True).to_pandas()

    viz = BalancedBinningReference()
    viz.fit(target)
    viz.finalize()

    self.assert_images_similar(viz, tol=0.5)
def test_integrated_scatter_with_pandas(self):
    """
    Test ScatterViz on the occupancy data set loaded as pandas objects
    """
    # Load the data from the fixture
    frame, target = load_occupancy(return_dataset=True).to_pandas()

    # Test the visualizer
    viz = ScatterViz(features=["temperature", "relative humidity"])
    viz.fit_transform_poof(frame, target)
def test_score_returns_score(self):
    """
    Test that ClassPredictionError.score() returns a value in [0, 1]
    """
    features, target = load_occupancy(return_dataset=True).to_numpy()

    # Fit the visualizer on the full dataset
    viz = ClassPredictionError(LinearSVC(random_state=42))
    viz.fit(features, target)

    # Score on the same data and check the bounds
    score = viz.score(features, target)
    assert 0 <= score <= 1
def test_scatter_quick_method(self):
    """
    Test the scatterviz quick method on the real, occupancy data set
    """
    # Load the data from the fixture
    data, target = load_occupancy(return_dataset=True).to_numpy()

    # Run the quick method on the first two columns
    names = ["temperature", "relative humidity"]
    result = scatterviz(data[:, :2], y=target, ax=None, features=names)

    # The quick method should return a ScatterVisualizer instance
    assert isinstance(result, ScatterVisualizer)
def test_numpy_occupancy_balance(self):
    """
    Test ClassBalance in balance mode on the occupancy target as a NumPy
    array
    """
    _, target = load_occupancy(return_dataset=True).to_numpy()

    # fit() must return the visualizer itself
    viz = ClassBalance()
    assert viz.fit(target) is viz

    # finalize() is not called before the image comparison
    self.assert_images_similar(viz)
def select_features_example(
    algorithm="isomap",
    path="images/occupancy_select_k_best_isomap_manifold.png",
    **kwargs
):
    """
    Fit a SelectKBest -> Manifold pipeline on the occupancy data and write
    the resulting figure via ``poof(outpath=path)``.

    Parameters
    ----------
    algorithm : str, default: "isomap"
        Passed to ``Manifold`` as its ``manifold`` argument.

    path : str
        Output path handed to the visualizer's ``poof`` method.

    kwargs : dict
        Extra keyword arguments forwarded to ``Manifold``.
    """
    _, ax = plt.subplots(figsize=(9, 6))

    # Build the two pipeline steps in the order they appear in the pipeline
    selector = SelectKBest(k=3, score_func=f_classif)
    viz = Manifold(ax=ax, manifold=algorithm, **kwargs)
    pipeline = Pipeline([("selectk", selector), ("viz", viz)])

    X, y = load_occupancy()
    pipeline.fit(X, y)

    # Look the visualizer up on the fitted pipeline before drawing
    pipeline.named_steps["viz"].poof(outpath=path)
def test_integrated_radviz_with_pandas(self):
    """
    Test RadViz with default arguments on the occupancy dataset loaded as
    pandas objects
    """
    frame, target = load_occupancy(return_dataset=True).to_pandas()

    # Check the loader returned pandas types
    assert isinstance(frame, pd.DataFrame)
    assert isinstance(target, pd.Series)

    # Draw and compare against the baseline image
    viz = RadViz()
    viz.fit_transform_poof(frame, target)
    self.assert_images_similar(viz, tol=0.1)
def test_integrated_radviz_with_numpy(self):
    """
    Test RadViz with default arguments on the occupancy dataset loaded as
    numpy arrays
    """
    data, target = load_occupancy(return_dataset=True).to_numpy()

    # Check the loader returned numpy types
    assert isinstance(data, np.ndarray)
    assert isinstance(target, np.ndarray)

    # Draw and compare against the baseline image
    viz = RadViz()
    viz.fit_transform_poof(data, target)
    self.assert_images_similar(viz, tol=0.1)
def test_quick_method_with_test_set(self):
    """
    Test the precision_recall_curve quick method with both train and test
    data supplied
    """
    features, target = load_occupancy(return_dataset=True).to_numpy()

    X_train, X_test, y_train, y_test = tts(
        features, target, test_size=0.2, shuffle=True, random_state=555
    )

    estimator = RandomForestClassifier(random_state=72)
    viz = precision_recall_curve(estimator, X_train, y_train, X_test, y_test)

    self.assert_images_similar(viz)
def test_pandas_occupancy_compare(self):
    """
    Test ClassBalance in compare mode on the occupancy data loaded as
    pandas objects
    """
    frame, target = load_occupancy(return_dataset=True).to_pandas()

    # Only the target halves of the split are compared
    _, _, y_train, y_test = tts(
        frame, target, test_size=0.4, random_state=2242
    )

    # fit() must return the visualizer itself
    viz = ClassBalance()
    assert viz.fit(y_train, y_test) is viz

    # finalize() is not called before the image comparison;
    # w/o tol fails with RMS 0.433
    self.assert_images_similar(viz, tol=0.5)
def test_numpy_occupancy_compare(self):
    """
    Test ClassBalance in compare mode on the occupancy data loaded as
    NumPy arrays
    """
    data, target = load_occupancy(return_dataset=True).to_numpy()

    # Only the target halves of the split are compared
    _, _, y_train, y_test = tts(
        data, target, test_size=0.4, random_state=2242
    )

    # fit() must return the visualizer itself
    viz = ClassBalance()
    assert viz.fit(y_train, y_test) is viz

    # finalize() is not called before the image comparison
    self.assert_images_similar(viz)
def test_stack_param_incorrectly_used_throws_error(self):
    """
    Test that stack=True raises on a two-class dataset, where the model
    does not return a coef_ array in the shape of (n_classes, n_features)
    """
    features, target = load_occupancy()

    estimator = LogisticRegression(solver="liblinear", random_state=222)
    viz = FeatureImportances(estimator, stack=True)

    message = "The model used does not return coef_ array"
    with pytest.raises(YellowbrickValueError, match=message):
        viz.fit(features, target)
def test_pandas_integration(self):
    """
    Test ClassPredictionError on pandas data from the occupancy dataset
    """
    frame, target = load_occupancy(return_dataset=True).to_pandas()

    # The estimator is fitted up front; the visualizer is only scored
    estimator = LinearSVC(random_state=42)
    estimator.fit(frame, target)

    viz = ClassPredictionError(estimator, classes=["unoccupied", "occupied"])
    viz.score(frame, target)
    viz.finalize()

    # AppVeyor and Linux conda fail due to non-text-based differences
    # AppVeyor fails with RMS 13.161 - 13.289 (python - miniconda)
    self.assert_images_similar(viz, tol=12.5, windows_tol=13.3)
def compare_class_balance(path="images/class_balance_compare.png"):
    """
    Fit a ClassBalance visualizer in compare mode on a train/test split of
    the occupancy target and return the result of ``poof(outpath=path)``.

    Parameters
    ----------
    path : str
        Output path handed to the visualizer's ``poof`` method.
    """
    dataset = load_occupancy()

    # Select the feature columns and the target column by name
    columns = ["temperature", "relative_humidity", "light", "C02", "humidity"]
    X = dataset[columns]
    y = dataset["occupancy"]

    # Only the target halves of the 80/20 split are compared
    _, _, y_train, y_test = train_test_split(X, y, test_size=0.2)

    viz = ClassBalance(labels=["unoccupied", "occupied"])
    viz.fit(y_train, y_test)
    return viz.poof(outpath=path)