import time

from hdbscan import HDBSCAN


def hdbscan(self, args):
    """Cluster self.data_matrix with HDBSCAN; return (labels, elapsed seconds)."""
    start = time.time()
    model = HDBSCAN(
        min_cluster_size=args["min_cluster_size"],
        metric=args["metric"],
        leaf_size=args["leaf_size"],
        allow_single_cluster=args["allow_single_cluster"],
    ).fit(self.data_matrix)
    # HDBSCAN has no predict() method; labels for the fitted data are
    # exposed on the model as labels_.
    labels = model.labels_
    end = time.time()
    return labels, end - start
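# Usage sketch for the method above. The host object and the random sample
# data are assumptions; the class that owns data_matrix is not shown in the
# source, so any object exposing that attribute stands in for it here.
from types import SimpleNamespace

import numpy as np

obj = SimpleNamespace(data_matrix=np.random.rand(200, 4))
labels, elapsed = hdbscan(obj, {
    "min_cluster_size": 5,
    "metric": "euclidean",
    "leaf_size": 40,
    "allow_single_cluster": False,
})
print(f"{len(set(labels))} unique labels in {elapsed:.3f}s")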
import pandas as pd
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler

# NOTE: the statement that builds `scores` is truncated in the source;
# only its tail survives:
#     ..., axis=1).dropna().reset_index()

# scale the score columns, skipping the first two (non-score) columns
scaler = RobustScaler().fit(scores.iloc[:, 2:])
score_scaled = scaler.transform(scores.iloc[:, 2:])

# clusterer
clusterer = HDBSCAN(
    # min_cluster_size=min_cluster_size,
    min_cluster_size=30,
    gen_min_span_tree=True)
clusterer.fit(score_scaled)
labels_ = clusterer.labels_

# the KMeans run below overwrites the HDBSCAN labels above
clusterer = KMeans(n_clusters=5, random_state=0).fit(score_scaled)
labels_ = clusterer.predict(score_scaled)
scores["labels"] = labels_
scores.groupby("labels").agg({"file_name": len})  # inspect cluster sizes

# fit pca
pca = PCA(n_components=3)
pca.fit(score_scaled)
components = pca.transform(score_scaled)

# create df for visualization
pca_columns = [f"PC{i+1}" for i in range(3)]
components = pd.DataFrame(components, columns=pca_columns).reset_index()
# components = pd.concat(
#     [train_df.reset_index(), components], axis=1)
total_var = pca.explained_variance_ratio_.sum() * 100
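# `total_var` is presumably destined for a plot title. A sketch of the likely
# visualization step, assuming plotly express; the plotting call itself is
# not in the source.
import plotly.express as px

fig = px.scatter_3d(
    components, x="PC1", y="PC2", z="PC3",
    color=scores["labels"].astype(str),
    title=f"Total explained variance: {total_var:.2f}%",
)
fig.show()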