def pca_signatures(variance, return_pca=False):
    """
    Does a principal component analysis on the standardized signature
    dataframe. The number of principal components is chosen so that they
    account for at least the requested share of the total variance of the
    original variables.

    :param variance: share of the total variance that is to be explained
        (e.g. 0.8 for 80 %); handed to PCA as n_components
    :param return_pca: if True, the fitted pca object is returned instead of
        the dataframe of principal components
    :return: the fitted PCA object if return_pca is True, otherwise a
        dataframe with one column per component ("PC 1", "PC 2", ...) that
        uses the index of the signature dataframe
    """
    # Get the data (the attribute part of the split is not needed here)
    meta_df = read_attributes_signatures.read_meta()
    _, sig_df = read_attributes_signatures.seperate_attributes_signatures(
        meta_df)

    # Standardize the data, so the PCA makes more sense
    standardized = StandardScaler().fit_transform(sig_df)

    # Perform the pca. The "full" solver is required for n_components to be
    # interpreted as a share of explained variance.
    pca = PCA(n_components=variance, svd_solver="full")
    principal_components = pca.fit_transform(np.asarray(standardized))

    # The ratios follow the component order, i.e. the largest comes first
    print("Explained variance of the components "
          "(sorted in descending order):")
    print(pca.explained_variance_ratio_)

    if return_pca:
        return pca

    # Make it a dataframe again and give the columns more meaningful names
    component_names = [
        "PC " + str(i)
        for i in range(1, principal_components.shape[1] + 1)
    ]
    return pd.DataFrame(data=principal_components,
                        index=sig_df.index,
                        columns=component_names)
def read_data():
    """
    Builds the dataframe that combines the hydrological signatures with the
    catchment clusters and climate indices read from
    "catchment_clusters_with_continoues_climate.csv".

    :return: dataframe with the six signature columns, the catchment cluster
        and the three climate indices, all under human readable names
    """
    # Only the signature half of the meta data is used here
    meta = read_attributes_signatures.read_meta()
    _attributes, signatures = (
        read_attributes_signatures.seperate_attributes_signatures(meta))

    # The second column of the csv serves as index for the join
    climate = pd.read_csv("catchment_clusters_with_continoues_climate.csv",
                          index_col=1)
    combined = pd.merge(signatures, climate,
                        left_index=True, right_index=True)

    # Get rid of the columns that are not needed any further
    unused_columns = ["FID", "gauge_lat", "gauge_lon", "b1_", "b2_", "b3_"]
    combined = combined.drop(unused_columns, axis=1)

    # Names are assigned by position, so they rely on the column order that
    # results from the merge
    readable_names = [
        "Mean annual discharge",
        "Mean winter discharge",
        "Mean half-flow date",
        "Q95 (high flow)",
        "Runoff ratio",
        "Mean summer discharge",
        "Catchment Cluster",
        "Aridity",
        "Seasonality",
        "Precipitation falling as snow",
    ]
    combined.columns = readable_names

    # Rescale the data to the original values of Wouter.
    # The new min max values were found by looking at how far the current
    # min max values are away from the min max of the range.
    # NOTE(review): rescaler is defined elsewhere; its arguments are
    # presumably (values, old max, old min, new max, new min) - confirm.
    for column, new_max, new_min in (("Aridity", 1, -1),
                                     ("Seasonality", 2, 0)):
        combined[column] = rescaler(combined[column], 1, 0, new_max, new_min)
    return combined
frameon=True, fancybox=True) legend.get_frame().set_edgecolor("grey") legend.get_frame().set_facecolor("white") # Make plot nicer by removing the borders ax.set_facecolor("white") for spine in ax.spines.values(): spine.set_visible(False) # Add correct descriptions ax.set_title(describer, alpha=alpha) ax.set_ylabel("PC 2", alpha=alpha) ax.set_xlabel("PC 1", alpha=alpha) ax.grid(color="grey", alpha=alpha) plt.setp(ax.get_yticklabels(), alpha=alpha) plt.setp(ax.get_xticklabels(), alpha=alpha) ax.tick_params(axis=u'both', which=u'both',length=0) # Save the plot fig.tight_layout() plt.savefig(describer.replace("\n","") + ".png", bbox_inches="tight") plt.close() if __name__ == "__main__": variance = 0.8 pca_df = pca.pca_signatures(variance) meta_df = read_attributes_signatures.read_meta() att_df, sig_df = read_attributes_signatures.seperate_attributes_signatures(meta_df) plotting_df = pd.concat([pca_df, att_df], axis=1) plot_pca(plotting_df, att_df.columns)