def test_hdbscan_approximate_predict_score():
    """Check the error, warning and success paths of approximate_predict_scores."""
    clusterer = HDBSCAN(min_cluster_size=200).fit(X)

    # no prediction data error
    assert_raises(ValueError, approximate_predict_scores, clusterer, X)

    clusterer.generate_prediction_data()

    # wrong dimensions error
    assert_raises(
        ValueError,
        approximate_predict_scores,
        clusterer,
        np.array([[1, 2, 3]]),
    )

    with warnings.catch_warnings(record=True) as w:
        # Without an explicit "always" filter the warning may be swallowed by
        # the active filters / once-per-location registry, leaving `w` empty.
        warnings.simplefilter("always")
        approximate_predict_scores(clusterer, np.array([[1.5, -1.0]]))

    # no clusters warning -- look through every recorded warning rather than
    # only the last one, so unrelated warnings cannot mask the expected one.
    assert any(
        'Clusterer does not have any defined clusters' in str(item.message)
        for item in w
    )

    clusterer = HDBSCAN(prediction_data=True).fit(X)
    scores = approximate_predict_scores(clusterer, X)
    assert_array_almost_equal(scores, clusterer.outlier_scores_)
    assert scores.min() >= 0
    assert scores.max() <= 1
def predict(self, dim_reduced_vecs, outlier_labels, scores, contamination,
            min_cluster_size, allow_noise):
    """Cluster the vectors with HDBSCAN and record clustering/outlier scores.

    Args:
        dim_reduced_vecs: Data passed to ``HDBSCAN.fit``.
        outlier_labels: Reference labels that the cluster and outlier
            predictions are scored against.
        scores: Container indexed by string keys; the entries
            ``"cluster_n"``, ``"homogeneity"``, ``"completeness"`` and
            ``"v_measure"`` are written to it before it is handed to
            ``get_scores``.
        contamination: Currently unused by this method.
            NOTE(review): the outlier threshold below is a hard-coded 0.9
            quantile; confirm whether it should be derived from this value.
        min_cluster_size: Forwarded to ``HDBSCAN``.
        allow_noise: If true, use ``clusterer.labels_`` directly; otherwise
            assign each point via ``argmax`` over the membership vectors.

    Returns:
        A tuple ``(scores, outlier_scores)`` where ``scores`` is the value
        returned by ``get_scores`` and ``outlier_scores`` is
        ``clusterer.outlier_scores_``.
    """
    print("Clustering ...")
    clusterer = HDBSCAN(
        min_cluster_size=min_cluster_size,
        prediction_data=True,
        metric="euclidean",
    ).fit(dim_reduced_vecs)

    print("Get prediction data ...")
    clusterer.generate_prediction_data()

    if allow_noise:
        cluster_pred = clusterer.labels_
    else:
        try:
            # NOTE(review): the first membership column is dropped before the
            # argmax; confirm that this column is really meant to be excluded.
            cluster_pred = np.argmax(
                all_points_membership_vectors(clusterer)[:, 1:], axis=1)
        except IndexError:
            print(
                "Got IndexError and will not enforce cluster membership (allow noise) ..."
            )
            print(all_points_membership_vectors(clusterer))
            cluster_pred = clusterer.labels_

    # scoring
    print("Get scores ...")

    # GLOSH: flag points whose outlier score exceeds the 0.9 quantile as -1.
    threshold = pd.Series(clusterer.outlier_scores_).quantile(0.9)
    outlier_pred = np.where(clusterer.outlier_scores_ > threshold, -1, 1)

    # Compute once and reuse for both the stored scores and the summary line.
    cluster_n = len(np.unique(clusterer.labels_))
    homogeneity = homogeneity_score(outlier_labels, cluster_pred)

    scores["cluster_n"] = cluster_n
    scores["homogeneity"] = homogeneity
    scores["completeness"] = completeness_score(outlier_labels, cluster_pred)
    scores["v_measure"] = v_measure_score(outlier_labels, cluster_pred)
    scores = get_scores(scores, outlier_labels, outlier_pred)

    # Built as a single literal: a backslash continuation inside the string
    # would leak source indentation into the printed message.
    print(f"Homogeneity - {homogeneity * 100:.1f} cluster_n - {cluster_n}")
    return scores, clusterer.outlier_scores_