Example #1
    def correlation_distance(self, how: str = 'euclidean') -> float:
        """
        Calculate distance between correlation matrices with certain metric.

        :param how: metric to measure distance. Choose from [``euclidean``, ``mae``, ``rmse``].
        :return: distance between the association matrices in the chosen evaluation metric. Default: Euclidean
        """
        from scipy.spatial.distance import cosine
        if how == 'euclidean':
            distance_func = euclidean_distance
        elif how == 'mae':
            distance_func = mean_absolute_error
        elif how == 'rmse':
            distance_func = rmse
        elif how == 'cosine':

            def custom_cosine(a, b):
                return cosine(a.reshape(-1), b.reshape(-1))

            distance_func = custom_cosine
        else:
            raise ValueError(
                "`how` parameter must be one of ['euclidean', 'mae', 'rmse', 'cosine']")

        real_corr = compute_associations(
            self.real, nominal_columns=self.categorical_columns, theil_u=True)
        fake_corr = compute_associations(
            self.fake, nominal_columns=self.categorical_columns, theil_u=True)

        return distance_func(real_corr.values, fake_corr.values)
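
For context, here is a minimal standalone sketch of the same computation outside the class; `real`, `fake` and `categorical_columns` are placeholders for two DataFrames with identical columns plus the list of categorical column names, and the class's own metric helpers are replaced with plain NumPy.

# Illustrative sketch only: distance between the association matrices of two
# DataFrames, with the class helpers (euclidean_distance, rmse, ...) replaced
# by plain NumPy equivalents.
import numpy as np
from dython.nominal import compute_associations

def association_distance(real, fake, categorical_columns, how='euclidean'):
    real_corr = compute_associations(real, nominal_columns=categorical_columns, theil_u=True).values
    fake_corr = compute_associations(fake, nominal_columns=categorical_columns, theil_u=True).values
    diff = real_corr - fake_corr
    if how == 'euclidean':
        return np.sqrt(np.sum(diff ** 2))   # Frobenius norm of the difference
    if how == 'mae':
        return np.mean(np.abs(diff))        # mean absolute error
    if how == 'rmse':
        return np.sqrt(np.mean(diff ** 2))  # root mean squared error
    raise ValueError("`how` must be one of ['euclidean', 'mae', 'rmse']")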
Example #2
def test_associations():
    """
    Tests that check wether the dython associations are still computed as is expected.
    """
    # load test data
    real_assoc = pd.read_csv(test_data_folder / 'real_associations.csv',
                             index_col='Unnamed: 0')
    real_assoc_theil = pd.read_csv(test_data_folder /
                                   'real_associations_theil.csv',
                                   index_col='Unnamed: 0')
    fake_assoc = pd.read_csv(test_data_folder / 'fake_associations.csv',
                             index_col='Unnamed: 0')
    fake_assoc_theil = pd.read_csv(test_data_folder /
                                   'fake_associations_theil.csv',
                                   index_col='Unnamed: 0')

    # Assert equality with saved data
    pd.testing.assert_frame_equal(
        real_assoc, compute_associations(real, nominal_columns=cat_cols))
    pd.testing.assert_frame_equal(
        real_assoc_theil,
        compute_associations(real, nominal_columns=cat_cols, theil_u=True))
    pd.testing.assert_frame_equal(
        fake_assoc, compute_associations(fake, nominal_columns=cat_cols))
    pd.testing.assert_frame_equal(
        fake_assoc_theil,
        compute_associations(fake, nominal_columns=cat_cols, theil_u=True))
Example #3
def test_associations():
    # load test data
    real_assoc = pd.read_csv(test_data_folder/'real_associations.csv', index_col='Unnamed: 0')
    real_assoc_theil = pd.read_csv(test_data_folder/'real_associations_theil.csv', index_col='Unnamed: 0')
    fake_assoc = pd.read_csv(test_data_folder/'fake_associations.csv', index_col='Unnamed: 0')
    fake_assoc_theil = pd.read_csv(test_data_folder/'fake_associations_theil.csv', index_col='Unnamed: 0')

    # Assert equality with saved data
    pd.testing.assert_frame_equal(real_assoc, compute_associations(real, nominal_columns=cat_cols))
    pd.testing.assert_frame_equal(real_assoc_theil, compute_associations(real, nominal_columns=cat_cols, theil_u=True))
    pd.testing.assert_frame_equal(fake_assoc, compute_associations(fake, nominal_columns=cat_cols))
    pd.testing.assert_frame_equal(fake_assoc_theil, compute_associations(fake, nominal_columns=cat_cols, theil_u=True))
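
If the fixtures loaded above ever need to be refreshed, a sketch like the following could regenerate them; `real`, `fake`, `cat_cols` and `test_data_folder` are assumed to exist as in the test module, and this is not necessarily how the project actually produces its test data.

# Illustrative sketch for regenerating the association fixtures used above.
from dython.nominal import compute_associations

compute_associations(real, nominal_columns=cat_cols).to_csv(
    test_data_folder / 'real_associations.csv')
compute_associations(real, nominal_columns=cat_cols, theil_u=True).to_csv(
    test_data_folder / 'real_associations_theil.csv')
compute_associations(fake, nominal_columns=cat_cols).to_csv(
    test_data_folder / 'fake_associations.csv')
compute_associations(fake, nominal_columns=cat_cols, theil_u=True).to_csv(
    test_data_folder / 'fake_associations_theil.csv')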
Example #4
    def correlation_correlation(self) -> float:
        """
        Calculate the correlation coefficient between the association matrices of self.real and self.fake using self.comparison_metric

        :return: The correlation coefficient
        """
        total_metrics = pd.DataFrame()
        for ds_name in ['real', 'fake']:
            ds = getattr(self, ds_name)
            corr_df = compute_associations(ds, nominal_columns=self.categorical_columns, theil_u=True)
            values = corr_df.values
            # Drop the diagonal of self-associations before comparing the matrices
            values = values[~np.eye(values.shape[0], dtype=bool)].reshape(values.shape[0], -1)
            total_metrics[ds_name] = values.flatten()

        self.correlation_correlations = total_metrics
        corr, p = self.comparison_metric(total_metrics['real'], total_metrics['fake'])
        if self.verbose:
            print('\nColumn correlation between datasets:')
            print(total_metrics.to_string())
        return corr
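
A minimal sketch of the comparison step in isolation is shown below; `real`, `fake` and `categorical_columns` are placeholders, and `scipy.stats.pearsonr` stands in for one possible `comparison_metric` (the actual metric is whatever the class was configured with).

# Illustrative sketch: correlate the off-diagonal entries of the two
# association matrices, with pearsonr as an example comparison metric.
import numpy as np
from scipy import stats
from dython.nominal import compute_associations

real_corr = compute_associations(real, nominal_columns=categorical_columns, theil_u=True).values
fake_corr = compute_associations(fake, nominal_columns=categorical_columns, theil_u=True).values

mask = ~np.eye(real_corr.shape[0], dtype=bool)  # drop the diagonal of self-associations
corr, p = stats.pearsonr(real_corr[mask], fake_corr[mask])
print(f'Correlation between association structures: {corr:.3f} (p={p:.3g})')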
Example #5
idx = 0
fig, ax = plt.subplots(figsize = (4,4))
ax.scatter(s2[idx], s_full[idx])
plt.title(full_contra.columns[idx])

for i, txt in enumerate(full_contra.columns):
    ax.annotate(txt, (s2[idx][i], s_full[idx][i]))
ax.set_xlim([-1,1])
ax.set_ylim([-1,1])

# Compare the representation obtained from the completed-data cosine similarity with the original associations
associations(complete_y.astype(float), nominal_columns = cat_features)
associations(completed_y.astype(float), nominal_columns = cat_features)
associations(full_contra.astype(float), nominal_columns = cat_features)

assoc = compute_associations(full_contra.astype(float), nominal_columns = cat_features).values     


idx = 0
fig, ax = plt.subplots(figsize = (4,4))
ax.scatter(assoc[idx], s_full[idx])
plt.title(full_contra.columns[idx])

for i, txt in enumerate(full_contra.columns):
    ax.annotate(txt, (assoc[idx][i], s_full[idx][i]))
ax.set_xlim([-1,1])
ax.set_ylim([-1,1])
  
#======================================
# Ez.y
#======================================
Example #6
                if cat_features[col_idx]:
                    le = LabelEncoder()
                    data[dataset][colname] = le.fit_transform(
                        data[dataset][colname])

            # Delete the sex column
            data[dataset] = data[dataset][[
                name for name in varnames if name != 'sex'
            ]]
            cat_features_new = [
                cat_features[i] for i in range(len(varnames))
                if varnames[i] != 'sex'
            ]

            if i == len(plot_datasets) - 1:
                corr = compute_associations(data[dataset].astype(int),
                                            nominal_columns=cat_features_new)

                g1 = sns.heatmap(corr,
                                 cmap="YlGnBu",
                                 ax=axs[i],
                                 cbar_ax=axs[-1])

            else:
                corr = compute_associations(data[dataset].astype(int),
                                            nominal_columns=cat_features_new)
                g1 = sns.heatmap(corr, cmap="YlGnBu", cbar=False, ax=axs[i])

            g1.set_ylabel('')
            g1.set_xlabel('')
            g1.set_yticks([])
Example #7
X_train=X_preprocessing.fit_transform(X_train)
X_cols=X_preprocessing.transformers_[0][1].named_steps["onehot"].get_feature_names(categorical_cols).tolist()+numeric_cols
X_train=pd.DataFrame(X_train, columns=X_cols)

X_test=X_preprocessing.fit_transform(X_test)
X_cols=X_preprocessing.transformers_[0][1].named_steps["onehot"].get_feature_names(categorical_cols).tolist()+numeric_cols
X_test=pd.DataFrame(X_test, columns=X_cols)
# endregion

# region Correlation & clustering
#The random forest handles correlated features without problems, but to better assess the feature importances
#it is important to run a groupwise permutation on the features, which requires creating clusters of correlated
#features (a sketch of the groupwise permutation itself follows after this region).
#The dython library can compute association measures for a dataset made up of variables of different types:
#Pearson's r for pairs of quantitative variables, the correlation ratio for quantitative/nominal pairs, and
#Cramér's V (or optionally Theil's U) for pairs of nominal variables.
dissimilarity=1-abs(compute_associations(X_train))
diss_condensed =squareform(dissimilarity)
diss_linkage = linkage(diss_condensed, method='complete')

fig, (ax1) = plt.subplots()
ax1.set(xlabel="Features", ylabel="Dissimilarity [0,1]")
plt.axhline(y=0.4, color='r', linestyle='--')
dendro = hierarchy.dendrogram(diss_linkage, labels=list(X_train.columns.values), ax=ax1,leaf_rotation=90)
plt.tight_layout()
#plt.savefig('Clusters.png', bbox_inches='tight')
#pickle.dump(fig,open("Clusters.pickle","wb"))

# endregion

# region Creation clusters
clusters=fcluster(diss_linkage,0.4,criterion='distance')
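
An illustrative sketch of the groupwise permutation step described in the comments above: permute all features of a cluster together and record the drop in model score. `model` and `y_test` are assumed to come from the surrounding script (the comments mention a random forest); this is not necessarily the original implementation.

# Groupwise permutation importance sketch (illustrative only).
import numpy as np

baseline = model.score(X_test, y_test)
group_importance = {}
for cluster_id in np.unique(clusters):
    cols = X_train.columns[clusters == cluster_id]
    X_perm = X_test.copy()
    # Shuffle the rows of the whole cluster jointly so within-cluster
    # correlations are preserved while the link to the target is broken.
    X_perm[cols] = X_perm[cols].sample(frac=1, random_state=0).to_numpy()
    group_importance[cluster_id] = baseline - model.score(X_perm, y_test)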
Example #8
                    gow.append(np.mean(np.abs(true - imputed))/cont_range)
                else:
                    gow.append((true.astype(int) != imputed.astype(int)).mean())
            error.loc[method, 'gow'] = np.mean(gow)
        
        #error.T[['full']].T.to_csv(res + 'Run' + str(run_idx) + '/res' + dataset_name + '.csv', index = False)

#=============================
# Comparing association structures
#=============================

import seaborn as sns
from dython.nominal import compute_associations, associations
from sklearn.metrics.pairwise import cosine_similarity

original_assoc = compute_associations(full_pima, nominal_columns = cat_features)

associations(full_pima, nominal_columns = cat_features)

Ez = out2['Ez.y']
vc = vars_contributions(completed_y2, Ez, assoc_thr=0.0,
                        title='Contribution of the variables to the latent dimensions',
                        storage_path=None)

assoc = cosine_similarity(vc, dense_output=True)

labels = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'D.P. Function', 'Age', 'Outcome']

fig, axn = plt.subplots(1, 2, sharex=True, sharey=True, figsize = (12,10)) 
cbar_ax = fig.add_axes([.91, .3, .03, .4])
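
The snippet stops right after the figure set-up; a plausible continuation (an illustrative sketch, not necessarily the original code, and assuming matplotlib.pyplot is imported as plt) draws the two association structures side by side with the shared colorbar axis.

# Illustrative continuation: original associations vs. cosine similarity of
# the variable contributions, sharing one colorbar.
sns.heatmap(original_assoc, ax=axn[0], cbar=False, vmin=-1, vmax=1,
            cmap='coolwarm', xticklabels=labels, yticklabels=labels)
axn[0].set_title('Original associations')

sns.heatmap(assoc, ax=axn[1], cbar=True, cbar_ax=cbar_ax, vmin=-1, vmax=1,
            cmap='coolwarm', xticklabels=labels, yticklabels=labels)
axn[1].set_title('Cosine similarity of variable contributions')
plt.show()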
Example #9
def vars_contributions(df, latent_rpz, assoc_thr=0.0,
                       title='Contribution of the variables to the latent dimensions',
                       storage_path=None):
    '''
    Plot the contribution of the original variables to the latent dimensions
    constructed by the MDGMM 

    Parameters
    ----------
    df : pandas DataFrame
        The original variables.
    latent_rpz : pandas DataFrame
        The latent representation of the observations issued by the MDGMM.
    assoc_thr : float, optional
        The minimal association (in absolute value) with the latent
        dimensions for a variable to be displayed.
        The default is 0.0.
    title : str, optional
        The title of the plot to display. The default is
        'Contribution of the variables to the latent dimensions'.
    storage_path : str, optional
        The path where the plot is saved. The default is None (the plot
        is not saved).

    Returns
    -------
    corrs : numpy.ndarray
        The associations computed between the original variables and the
        latent dimensions.
    '''

    latent_dim = latent_rpz.shape[1]
    if latent_dim > 2:
        raise NotImplementedError(
            'This function is only implemented for a latent representation '
            'of dimension 2 for the moment.')

    if isinstance(latent_rpz, pd.DataFrame):
        latent_rpz.columns = ['Latent dimension 1', 'Latent dimension 2']
    else:
        # Format the latent representation into a pandas DataFrame
        latent_rpz = pd.DataFrame(
            latent_rpz, columns=['Latent dimension 1', 'Latent dimension 2'])

    # Latent representation of the variables
    corrs = np.zeros((df.shape[1], latent_rpz.shape[1]))

    for j1, original_col in enumerate(df.columns):
        for j2, latent_col in enumerate(latent_rpz.columns):
            old_new = pd.DataFrame(df[original_col]).join(
                pd.DataFrame(latent_rpz[latent_col]))
            assoc = compute_associations(old_new).iloc[1, 0]
            corrs[j1, j2] = assoc

    # Plot a variable factor map for the first two dimensions.
    (fig, ax) = plt.subplots(figsize=(8, 8))
    for i in range(df.shape[1]):

        if (np.abs(corrs[i]) > assoc_thr).all():
            ax.arrow(
                0,
                0,  # Start the arrow at the origin
                corrs[i, 0],  # component along latent dimension 1
                corrs[i, 1],  # component along latent dimension 2
                head_width=0.1,
                head_length=0.1)

            plt.text(corrs[i, 0] + 0.05,
                     corrs[i, 1] + 0.05,
                     s=df.columns.values[i])

    an = np.linspace(0, 2 * np.pi, 300)
    plt.plot(np.cos(an), np.sin(an))  # Add a unit circle for scale
    plt.axis('equal')
    plt.xlabel('Latent dimension 1', fontsize=16)
    plt.ylabel('Latent dimension 2', fontsize=16)
    ax.set_title(title)

    if storage_path:
        plt.savefig(storage_path)

    plt.show()

    return corrs
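
A minimal usage sketch with synthetic data; every name and value below is illustrative, and the function's own module is assumed to import numpy, pandas, matplotlib.pyplot and compute_associations as in the original project.

# Illustrative usage of vars_contributions with made-up data.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({
    'age': rng.integers(20, 70, size=100),
    'income': rng.normal(30000, 8000, size=100),
    'owns_car': rng.integers(0, 2, size=100),
})
latent = pd.DataFrame(rng.normal(size=(100, 2)))  # stand-in for the MDGMM latent representation

corrs = vars_contributions(df, latent, assoc_thr=0.1,
                           title='Variable contributions (synthetic example)')
print(corrs)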