import time

from hdbscan import HDBSCAN


def hdbscan(self, args):
    """Fit HDBSCAN on self.data_matrix and return (labels, elapsed seconds)."""
    start = time.time()
    model = HDBSCAN(
        min_cluster_size=args["min_cluster_size"],
        metric=args["metric"],
        leaf_size=args["leaf_size"],
        allow_single_cluster=args["allow_single_cluster"],
    ).fit(self.data_matrix)
    # HDBSCAN has no predict() method; labels for the fitted data live in labels_
    labels = model.labels_
    end = time.time()
    return labels, (end - start)
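
# (not in the source) a minimal usage sketch; `ClusterBenchmark` is a
# hypothetical stand-in for the class that owns this method and data_matrix
if __name__ == "__main__":
    import numpy as np

    bench = ClusterBenchmark(np.random.rand(500, 8))
    labels, elapsed = bench.hdbscan({
        "min_cluster_size": 15,
        "metric": "euclidean",
        "leaf_size": 40,
        "allow_single_cluster": False,
    })
    print(f"{labels.max() + 1} clusters in {elapsed:.3f}s")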
Example #2
import pandas as pd
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler

# The statement that built the `scores` DataFrame is truncated in the source;
# only its tail survives:
#     ...            axis=1).dropna().reset_index()

# robust-scale the numeric columns (columns 0-1 are presumably index/file_name)
scaler = RobustScaler().fit(scores.iloc[:, 2:])
score_scaled = scaler.transform(scores.iloc[:, 2:])

# density-based clustering; gen_min_span_tree=True additionally lets hdbscan
# compute its DBCV-based relative_validity_ score (used below)
clusterer = HDBSCAN(
    min_cluster_size=30,
    gen_min_span_tree=True)
clusterer.fit(score_scaled)

labels_ = clusterer.labels_
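
# (not in the source) since gen_min_span_tree=True was set above, hdbscan
# exposes a fast approximation of the DBCV cluster-validity score:
print("relative validity (DBCV approx.):", clusterer.relative_validity_)
print("noise points:", int((labels_ == -1).sum()))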

# K-Means baseline with a fixed k; note these labels overwrite the HDBSCAN
# labels assigned above
clusterer = KMeans(n_clusters=5, random_state=0).fit(score_scaled)
labels_ = clusterer.predict(score_scaled)

scores["labels"] = labels_

# cluster sizes: number of files per label
scores.groupby("labels").agg({"file_name": len})
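
# (not in the source) a hedged sketch comparing the two clusterings with
# silhouette scores; HDBSCAN is refit here only to recover its labels, and
# its noise points (-1) are excluded, since silhouette is undefined for them
# (assumes each fit finds at least two clusters)
from sklearn.metrics import silhouette_score

hdb_labels = HDBSCAN(min_cluster_size=30).fit_predict(score_scaled)
mask = hdb_labels != -1
print("HDBSCAN silhouette:", silhouette_score(score_scaled[mask], hdb_labels[mask]))
print("KMeans silhouette: ", silhouette_score(score_scaled, labels_))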

# fit PCA and project the scaled scores onto the first three components
pca = PCA(n_components=3)
components = pca.fit_transform(score_scaled)
# create df for visualization
pca_columns = [f"PC{i+1}" for i in range(3)]
components = pd.DataFrame(components, columns=pca_columns).reset_index()
# components = pd.concat(
#     [train_df.reset_index(), components], axis=1)
# percentage of total variance explained by the three components
total_var = pca.explained_variance_ratio_.sum() * 100
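
# (not in the source) a sketch of the usual 3-D scatter of the components with
# plotly express, colored by cluster label and titled with the variance sum
import plotly.express as px

fig = px.scatter_3d(
    components, x="PC1", y="PC2", z="PC3",
    color=scores["labels"].astype(str),
    title=f"Total explained variance: {total_var:.2f}%",
)
fig.show()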