Exemplo n.º 1
0
def pca_signatures(variance, return_pca=False):
    """
    Does a principal component analysis on the standardized signature
    dataframe.

    ``variance`` is handed to ``PCA`` as ``n_components`` together with
    ``svd_solver="full"``. For a float between 0 and 1 scikit-learn then keeps
    as many components as are needed to explain at least that share of the
    total variance of the original variables (e.g. 0.8 for 80 %).

    The explained variance ratio of every kept component is printed.

    :param variance: amount of variance that is to be explained
    :param return_pca: if True, the fitted pca object is returned instead of
        the dataframe of principal components
    :return: the fitted pca object if ``return_pca`` is true, otherwise a
        dataframe with the index of the signature dataframe and one column
        per principal component, named "PC 1", "PC 2", ...
    """
    # Get the data; only the signatures are needed for the PCA
    meta_df = read_attributes_signatures.read_meta()
    _, sig_df = read_attributes_signatures.seperate_attributes_signatures(
        meta_df)

    # Perform the pca
    pca = PCA(n_components=variance, svd_solver="full")
    # Standardize the data, so the PCA makes more sense
    standardized = StandardScaler().fit_transform(sig_df)
    # Calculate the components
    principal_components = pca.fit_transform(np.array(standardized))
    # scikit-learn orders the components by decreasing explained variance,
    # so the ratios printed here run from largest to smallest
    print("Explained variance of the components (sorted in descending order):")
    print(pca.explained_variance_ratio_)

    # The caller only wants the fitted model, so skip building the dataframe
    if return_pca:
        return pca

    # Make it a dataframe again and give the columns more meaningful names
    component_names = [
        "PC " + str(number)
        for number in range(1, principal_components.shape[1] + 1)
    ]
    return pd.DataFrame(data=principal_components,
                        index=sig_df.index,
                        columns=component_names)
Exemplo n.º 2
0
def read_data():
    """
    Combines the signatures with the catchment cluster / climate table.

    The signature dataframe is merged on its index with the table read from
    "catchment_clusters_with_continoues_climate.csv" (second csv column used
    as index). Six helper columns are dropped, the remaining columns are
    renamed by position and the "Aridity" and "Seasonality" columns are
    passed through ``rescaler``.

    :return: the merged, renamed and rescaled dataframe
    """
    # Only the signatures are used here, the attributes are discarded
    _, signatures = read_attributes_signatures.seperate_attributes_signatures(
        read_attributes_signatures.read_meta())
    climate = pd.read_csv("catchment_clusters_with_continoues_climate.csv",
                          index_col=1)
    combined = pd.merge(signatures, climate,
                        left_index=True, right_index=True)

    # Columns of the merged table that are not needed any further
    unused_columns = ["FID", "gauge_lat", "gauge_lon", "b1_", "b2_", "b3_"]
    combined.drop(unused_columns, axis=1, inplace=True)

    # NOTE(review): the new names are assigned purely by position, so they
    # rely on the column order of the merged table -- confirm against the csv
    combined.columns = [
        'Mean annual discharge',
        'Mean winter discharge',
        'Mean half-flow date',
        'Q95 (high flow)',
        'Runoff ratio',
        'Mean summer discharge',
        "Catchment Cluster",
        "Aridity",
        "Seasonality",
        "Precipitation falling as snow",
    ]

    # Rescale the data to the original values of Wouter.
    # The new min max values were found by looking how far the current min
    # max values are away from the min max of the range.
    # NOTE(review): the four numbers are presumably the old max, old min,
    # new max and new min -- verify against rescaler
    rescale_arguments = (
        ("Aridity", (1, 0, 1, -1)),
        ("Seasonality", (1, 0, 2, 0)),
    )
    for column, bounds in rescale_arguments:
        combined[column] = rescaler(combined[column], *bounds)

    return combined
Exemplo n.º 3
0
                       frameon=True, fancybox=True)
            legend.get_frame().set_edgecolor("grey")
            legend.get_frame().set_facecolor("white")

        # Make plot nicer by removing the borders
        ax.set_facecolor("white")
        for spine in ax.spines.values():
            spine.set_visible(False)

        # Add correct descriptions
        ax.set_title(describer, alpha=alpha)
        ax.set_ylabel("PC 2", alpha=alpha)
        ax.set_xlabel("PC 1", alpha=alpha)
        ax.grid(color="grey", alpha=alpha)
        plt.setp(ax.get_yticklabels(), alpha=alpha)
        plt.setp(ax.get_xticklabels(), alpha=alpha)
        ax.tick_params(axis=u'both', which=u'both',length=0)

        # Save the plot
        fig.tight_layout()
        plt.savefig(describer.replace("\n","") + ".png",  bbox_inches="tight")
        plt.close()


if __name__ == "__main__":
    # Share of the total variance the principal components have to explain
    variance = 0.8
    pca_df = pca.pca_signatures(variance)

    # Load the attributes that are used to describe the catchments in the plot
    meta_df = read_attributes_signatures.read_meta()
    att_df, sig_df = read_attributes_signatures.seperate_attributes_signatures(
        meta_df)

    # Put the principal components and the attributes side by side and hand
    # the attribute column names to the plotting routine
    plotting_df = pd.concat([pca_df, att_df], axis="columns")
    plot_pca(plotting_df, att_df.columns)