Example #1
# project all the data onto the principal components, then attach cluster and data-split labels
components = pd.DataFrame(component.transform(X),
                          columns=["PC" + str(i + 1) for i in range(n_comp)])
components["Cluster"] = labels
components["Data"] = "Train"
components.to_csv("hclust and pca.csv", index=False)

# the silhouette score measures how well separated the clusters are (closer to 1 is better)
train_score = str(
    np.round(silhouette_score(X, components.loc[:, "Cluster"]), 3))

# plot the clusters
save_plot = False
pairs_plot(components,
           vars=components.columns[:n_comp],
           color="Cluster",
           title="Hierarchical Clustering & PCA - Silhouette: " + train_score,
           save=save_plot)

# train a random forest to learn the clusters
model = RandomForestClassifier(n_estimators=50,
                               max_depth=10,
                               min_samples_leaf=5,
                               max_features="sqrt",
                               class_weight="balanced_subsample",
                               random_state=42,
                               n_jobs=1)
model.fit(X, labels)

# collect and sort feature importance
importance = pd.DataFrame({
    "Feature": X.columns,
    "Importance": model.feature_importances_,
}).sort_values(by="Importance", ascending=False)  # completion assumed from the comment above
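
Example #1 begins mid-script: X, labels, component, n_comp, and pairs_plot are defined earlier in the source. A minimal sketch of that assumed setup, using scikit-learn's AgglomerativeClustering for the hierarchical clustering named in the plot title (the cluster count, file name, and seaborn-based pairs_plot helper are all assumptions):

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import silhouette_score

# load the features (placeholder file name)
X = pd.read_csv("X ansur.csv")

# hierarchical (agglomerative) clustering; the cluster count is an assumption
labels = AgglomerativeClustering(n_clusters=3).fit_predict(X)

# project onto a few principal components for plotting
n_comp = 3  # an assumed choice
component = PCA(n_components=n_comp, random_state=42)
component.fit(X)

# hypothetical pairs_plot helper wrapping seaborn's pairplot
def pairs_plot(data, vars, color, title, save=False):
    sns.pairplot(data, vars=vars, hue=color)
    plt.suptitle(title, y=1.02)
    if save:
        plt.savefig(title + ".png")
    plt.show()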
Example #2
component = PCA(n_components=n_comp, random_state=42)
component.fit(X)

# project all the data onto the principal components, then attach inlier and data-split labels
components = pd.DataFrame(component.transform(X),
                          columns=["PC" + str(i + 1) for i in range(n_comp)])
components["Inlier"] = labels
components["Data"] = "Train"
components.to_csv("inliers and pca.csv", index=False)

# the silhouette score measures how well separated the inlier and outlier groups are
train_score = str(np.round(silhouette_score(X, components.loc[:, "Inlier"]),
                           3))

# plot the clusters
save_plot = False
pairs_plot(components,
           vars=components.columns[:n_comp],
           color="Inlier",
           title="Local Outlier Factor & PCA - Silhouette: " + train_score,
           save=save_plot)

# remove the outliers: negative_outlier_factor_ sits near -1 for inliers and
# falls well below it for outliers, so keep rows scoring above the cutoff
good_idx = np.where(model.negative_outlier_factor_ > cutoff)[0]
X = X.iloc[good_idx, :].reset_index(drop=True)
Y = Y.iloc[good_idx, :].reset_index(drop=True)

# export the data
X.to_csv("X ansur.csv", index=False)
Y.to_csv("Y ansur.csv", index=False)
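
Example #2 references labels, model, and cutoff without defining them; the plot title points to scikit-learn's LocalOutlierFactor. A minimal sketch of that assumed setup (the neighbor count, cutoff value, and file names are assumptions):

import numpy as np
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor

# load the features and targets (placeholder file names)
X = pd.read_csv("X.csv")
Y = pd.read_csv("Y.csv")

# fit LOF; fit_predict returns 1 for inliers and -1 for outliers
model = LocalOutlierFactor(n_neighbors=20)
labels = model.fit_predict(X)

# an assumed threshold on negative_outlier_factor_; scores near -1 are
# inlier-like, so -1.5 keeps the bulk of the data
cutoff = -1.5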
Example #3
             x="index",
             y="Actual",
             color="State",
             title="Hidden Markov Model - Train",
             legend=True,
             save=False)
scatter_plot(data=states_,
             x="index",
             y="Difference",
             color="State",
             title="Hidden Markov Model - Train",
             legend=True,
             save=False)
pairs_plot(data=states_,
           vars=["Actual", "Difference"],
           color="State",
           title="Hidden Markov Model - Train",
           save=False)

# plot the states on the test values
scatter_plot(data=states,
             x="index",
             y="Actual",
             color="State",
             title="Hidden Markov Model - Test",
             legend=True,
             save=False)
scatter_plot(data=states,
             x="index",
             y="Difference",
             color="State",
Example #4
# the silhouette score measures how well separated the clusters are (closer to 1 is better)
train_score = str(
    np.round(
        silhouette_score(X.iloc[train_idx, :], components.loc[train_idx,
                                                              "Cluster"]), 3))
test_score = str(
    np.round(
        silhouette_score(X.iloc[test_idx, :], components.loc[test_idx,
                                                             "Cluster"]), 3))

# plot the clusters
save_plot = True
pairs_plot(components.iloc[train_idx, :],
           vars=components.columns[:n_comp],
           color="Cluster",
           title="Birch Clustering - Train - Silhouette: " + train_score,
           save=save_plot)
pairs_plot(components.iloc[test_idx, :],
           vars=components.columns[:n_comp],
           color="Cluster",
           title="Birch Clustering - Test - Silhouette: " + test_score,
           save=save_plot)

# train a random forest to learn the clusters
model = RandomForestClassifier(n_estimators=50,
                               max_depth=10,
                               min_samples_leaf=5,
                               max_features="sqrt",
                               class_weight="balanced_subsample",
                               random_state=42,
                               n_jobs=1)  # closing argument assumed, mirroring Example #1
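
Example #4 assumes labels from Birch clustering plus positional train_idx/test_idx arrays. A minimal sketch of that assumed setup with scikit-learn (the cluster count, component count, split fraction, and file name are assumptions):

import numpy as np
import pandas as pd
from sklearn.cluster import Birch
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split

X = pd.read_csv("X ansur.csv")  # placeholder file name

# Birch clustering; the cluster count is an assumption
labels = Birch(n_clusters=3).fit_predict(X)

# project onto principal components, mirroring the earlier examples
n_comp = 3
component = PCA(n_components=n_comp, random_state=42)
components = pd.DataFrame(component.fit_transform(X),
                          columns=["PC" + str(i + 1) for i in range(n_comp)])
components["Cluster"] = labels

# positional train/test split to match the iloc/loc indexing above
train_idx, test_idx = train_test_split(np.arange(X.shape[0]),
                                       test_size=0.2,
                                       random_state=42)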