# Compute principal components for all the data, then attach the cluster
# label and a train/test marker so the projection can be exported and plotted.
components = pd.DataFrame(
    component.transform(X),
    columns=["PC" + str(i + 1) for i in range(n_comp)],
)
components["Cluster"] = labels
components["Data"] = "Train"
components.to_csv("hclust and pca.csv", index=False)

# Silhouette score tells how well separated the clusters are (closer to 1 is better).
train_score = str(
    np.round(silhouette_score(X, components.loc[:, "Cluster"]), 3))

# Plot the clusters in PCA space.
save_plot = False
pairs_plot(
    components,
    vars=components.columns[:n_comp],
    color="Cluster",
    title="Hierarchical Clustering & PCA - Silhouette: " + train_score,
    save=save_plot,
)

# Train a random forest to learn the clusters, so feature importance can
# explain which original columns drive the cluster assignments.
model = RandomForestClassifier(
    n_estimators=50,
    max_depth=10,
    min_samples_leaf=5,
    max_features="sqrt",
    class_weight="balanced_subsample",
    random_state=42,
    n_jobs=1,
)
model.fit(X, labels)

# Collect and sort feature importance.
# NOTE(review): the original chunk is truncated inside this literal — the
# DataFrame body is reconstructed from the standard feature-importance
# pattern; confirm against the full script.
importance = pd.DataFrame({
    "Feature": X.columns,
    "Importance": model.feature_importances_,
})
# Fit PCA so the inlier/outlier labelling can be visualized in a
# low-dimensional projection.
component = PCA(n_components=n_comp, random_state=42)
component.fit(X)

# Compute components for all the data, add inlier labels and a train/test marker.
components = pd.DataFrame(
    component.transform(X),
    columns=["PC" + str(i + 1) for i in range(n_comp)],
)
components["Inlier"] = labels
components["Data"] = "Train"
components.to_csv("inliers and pca.csv", index=False)

# Silhouette score tells how well separated the inlier/outlier groups are.
train_score = str(
    np.round(silhouette_score(X, components.loc[:, "Inlier"]), 3))

# Plot the inlier/outlier groups in PCA space.
save_plot = False
pairs_plot(
    components,
    vars=components.columns[:n_comp],
    color="Inlier",
    title="Local Outlier Factor & PCA - Silhouette: " + train_score,
    save=save_plot,
)

# Remove the outliers: keep only rows whose negative outlier factor is
# above the chosen cutoff (larger values = more inlier-like).
good_idx = np.where(model.negative_outlier_factor_ > cutoff)[0]
X = X.iloc[good_idx, :].reset_index(drop=True)
Y = Y.iloc[good_idx, :].reset_index(drop=True)

# Export the cleaned data.
X.to_csv("X ansur.csv", index=False)
Y.to_csv("Y ansur.csv", index=False)
# Plot the decoded HMM states on the training values.
# NOTE(review): the original chunk starts mid-call — the opening
# `scatter_plot(data=states_,` is reconstructed from the identical parallel
# calls below; confirm against the full script.
scatter_plot(data=states_, x="index", y="Actual", color="State",
             title="Hidden Markov Model - Train", legend=True, save=False)
scatter_plot(data=states_, x="index", y="Difference", color="State",
             title="Hidden Markov Model - Train", legend=True, save=False)
pairs_plot(data=states_, vars=["Actual", "Difference"], color="State",
           title="Hidden Markov Model - Train", save=False)

# Plot the states on the test values.
scatter_plot(data=states, x="index", y="Actual", color="State",
             title="Hidden Markov Model - Test", legend=True, save=False)
# NOTE(review): the original chunk truncates inside this call after
# `color="State",` — the remaining arguments are reconstructed to mirror the
# preceding Test plot; confirm against the full script.
scatter_plot(data=states, x="index", y="Difference", color="State",
             title="Hidden Markov Model - Test", legend=True, save=False)
# Silhouette scores tell how well separated the clusters are on each split.
train_score = str(
    np.round(
        silhouette_score(X.iloc[train_idx, :],
                         components.loc[train_idx, "Cluster"]), 3))
test_score = str(
    np.round(
        silhouette_score(X.iloc[test_idx, :],
                         components.loc[test_idx, "Cluster"]), 3))

# Plot the clusters for the train and test splits.
save_plot = True
pairs_plot(components.iloc[train_idx, :],
           vars=components.columns[:n_comp],
           color="Cluster",
           title="Birch Clustering - Train - Silhouette: " + train_score,
           save=save_plot)
pairs_plot(components.iloc[test_idx, :],
           vars=components.columns[:n_comp],
           color="Cluster",
           title="Birch Clustering - Test - Silhouette: " + test_score,
           save=save_plot)

# Train a random forest to learn the clusters.
# NOTE(review): the original chunk truncates inside this call after
# `random_state=42,` — the call is closed to match the identical
# RandomForestClassifier construction used elsewhere in the file; confirm.
model = RandomForestClassifier(n_estimators=50,
                               max_depth=10,
                               min_samples_leaf=5,
                               max_features="sqrt",
                               class_weight="balanced_subsample",
                               random_state=42,
                               n_jobs=1)