def correlation_distance(self, how: str = 'euclidean') -> float:
    """
    Calculate distance between correlation matrices with certain metric.

    The association matrices of ``self.real`` and ``self.fake`` are computed with
    ``compute_associations`` (Theil's U enabled) and compared with the chosen metric.

    :param how: metric to measure distance. Choose from [``euclidean``, ``mae``, ``rmse``, ``cosine``].
    :return: distance between the association matrices in the chosen evaluation metric. Default: Euclidean
    :raises ValueError: if ``how`` is not one of the supported metric names.
    """
    from scipy.spatial.distance import cosine

    if how == 'euclidean':
        distance_func = euclidean_distance
    elif how == 'mae':
        distance_func = mean_absolute_error
    elif how == 'rmse':
        distance_func = rmse
    elif how == 'cosine':
        def custom_cosine(a, b):
            # scipy's cosine expects 1-D vectors, so flatten both matrices first.
            return cosine(a.reshape(-1), b.reshape(-1))
        distance_func = custom_cosine
    else:
        # The message lists every accepted value, including `cosine`.
        raise ValueError(
            '`how` parameter must be in [euclidean, mae, rmse, cosine]')

    real_corr = compute_associations(
        self.real, nominal_columns=self.categorical_columns, theil_u=True)
    fake_corr = compute_associations(
        self.fake, nominal_columns=self.categorical_columns, theil_u=True)
    return distance_func(real_corr.values, fake_corr.values)
def test_associations():
    """
    Regression test: the association matrices produced by ``compute_associations``
    must still match the reference CSV files stored in the test data folder.
    """
    def _load_reference(file_name):
        # Reference frames were saved with their index in the 'Unnamed: 0' column.
        return pd.read_csv(test_data_folder / file_name, index_col='Unnamed: 0')

    # load test data
    expected_real = _load_reference('real_associations.csv')
    expected_real_theil = _load_reference('real_associations_theil.csv')
    expected_fake = _load_reference('fake_associations.csv')
    expected_fake_theil = _load_reference('fake_associations_theil.csv')

    # (expected frame, input dataset, extra keyword arguments), in the original check order
    checks = [
        (expected_real, real, {}),
        (expected_real_theil, real, {'theil_u': True}),
        (expected_fake, fake, {}),
        (expected_fake_theil, fake, {'theil_u': True}),
    ]

    # Assert equality with saved data
    for expected, dataset, extra_kwargs in checks:
        pd.testing.assert_frame_equal(
            expected,
            compute_associations(dataset, nominal_columns=cat_cols, **extra_kwargs))
def test_associations():
    """Compare freshly computed association matrices with the saved reference CSVs."""
    csv_names = (
        'real_associations.csv',
        'real_associations_theil.csv',
        'fake_associations.csv',
        'fake_associations_theil.csv',
    )

    # load test data
    references = [
        pd.read_csv(test_data_folder/csv_name, index_col='Unnamed: 0')
        for csv_name in csv_names
    ]
    real_plain, real_theil, fake_plain, fake_theil = references

    # Assert equality with saved data
    frames_equal = pd.testing.assert_frame_equal
    frames_equal(real_plain,
                 compute_associations(real, nominal_columns=cat_cols))
    frames_equal(real_theil,
                 compute_associations(real, nominal_columns=cat_cols, theil_u=True))
    frames_equal(fake_plain,
                 compute_associations(fake, nominal_columns=cat_cols))
    frames_equal(fake_theil,
                 compute_associations(fake, nominal_columns=cat_cols, theil_u=True))
def correlation_correlation(self) -> float:
    """
    Compare the association structure of ``self.real`` and ``self.fake``.

    For each dataset the association matrix is computed (Theil's U enabled), its
    diagonal is dropped, and the remaining entries are flattened into one column.
    The two columns are stored on ``self.correlation_correlations`` and passed to
    ``self.comparison_metric``.

    :return: The first value returned by ``self.comparison_metric`` (the correlation coefficient)
    """
    total_metrics = pd.DataFrame()
    for ds_name in ('real', 'fake'):
        association_matrix = compute_associations(
            getattr(self, ds_name),
            nominal_columns=self.categorical_columns,
            theil_u=True,
        ).values
        n_rows = association_matrix.shape[0]
        # Boolean mask that is False on the diagonal and True everywhere else.
        off_diagonal = ~np.eye(n_rows, dtype=bool)
        without_diagonal = association_matrix[off_diagonal].reshape(n_rows, -1)
        total_metrics[ds_name] = without_diagonal.flatten()

    self.correlation_correlations = total_metrics
    # The metric returns a pair; only the first element is reported.
    corr, _ = self.comparison_metric(total_metrics['real'], total_metrics['fake'])

    if self.verbose:
        print('\nColumn correlation between datasets:')
        print(total_metrics.to_string())
    return corr
# Scatter s2[idx] against s_full[idx] and label every point with the
# corresponding column name of full_contra.
idx = 0
fig, ax = plt.subplots(figsize = (4,4))
ax.scatter(s2[idx], s_full[idx])
plt.title(full_contra.columns[idx])
for i, txt in enumerate(full_contra.columns):
    ax.annotate(txt, (s2[idx][i], s_full[idx][i]))
# Fix both axes to [-1, 1] so the two figures in this section share a scale.
ax.set_xlim([-1,1])
ax.set_ylim([-1,1])

# Compare the representation between completed cosine similarity and original associations
# NOTE(review): `complete_y` and `completed_y` are both used below - confirm
# both names exist and that this is not a typo for a single variable.
associations(complete_y.astype(float), nominal_columns = cat_features)
associations(completed_y.astype(float), nominal_columns = cat_features)
associations(full_contra.astype(float), nominal_columns = cat_features)
# Keep the raw association values of full_contra for the second scatter plot.
assoc = compute_associations(full_contra.astype(float), nominal_columns = cat_features).values

# Same figure as above, with assoc[idx] on the x axis instead of s2[idx].
idx = 0
fig, ax = plt.subplots(figsize = (4,4))
ax.scatter(assoc[idx], s_full[idx])
plt.title(full_contra.columns[idx])
for i, txt in enumerate(full_contra.columns):
    ax.annotate(txt, (assoc[idx][i], s_full[idx][i]))
ax.set_xlim([-1,1])
ax.set_ylim([-1,1])

#======================================
# Ez.y
#======================================
# NOTE(review): this fragment reads `col_idx`, `colname`, `dataset` and `i`,
# which are presumably bound by enclosing loops outside the visible source -
# confirm the nesting of the statements below against the full file.
if cat_features[col_idx]:
    # Column is flagged in cat_features: replace its values with integer
    # codes from a freshly fitted LabelEncoder.
    le = LabelEncoder()
    data[dataset][colname] = le.fit_transform(
        data[dataset][colname])
# Delete the sex column
data[dataset] = data[dataset][[
    name for name in varnames if name != 'sex'
]]
# Drop the matching entry of cat_features so it stays aligned with the
# remaining columns.
cat_features_new = [
    cat_features[i] for i in range(len(varnames))
    if varnames[i] != 'sex'
]
if i == len(plot_datasets) - 1:
    # Last dataset: draw the heatmap and put the colour bar in the last axis.
    corr = compute_associations(data[dataset].astype(int),
                                nominal_columns=cat_features_new)
    g1 = sns.heatmap(corr, cmap="YlGnBu", ax=axs[i], cbar_ax=axs[-1])
else:
    # Other datasets: same heatmap without a colour bar.
    corr = compute_associations(data[dataset].astype(int),
                                nominal_columns=cat_features_new)
    g1 = sns.heatmap(corr, cmap="YlGnBu", cbar=False, ax=axs[i])
# Remove axis labels and y ticks from the heatmap.
g1.set_ylabel('')
g1.set_xlabel('')
g1.set_yticks([])
# Fit the preprocessing pipeline on the training split only and rebuild a
# DataFrame whose columns are the one-hot feature names followed by the
# numeric column names.
X_train = X_preprocessing.fit_transform(X_train)
X_cols = X_preprocessing.transformers_[0][1].named_steps["onehot"].get_feature_names(categorical_cols).tolist() + numeric_cols
X_train = pd.DataFrame(X_train, columns=X_cols)
# Apply the transformation fitted on the training split to the test split.
# Refitting on X_test would encode/scale the test data differently from the
# training data and could yield a different set of one-hot columns.
X_test = X_preprocessing.transform(X_test)
X_test = pd.DataFrame(X_test, columns=X_cols)
# endregion

# region Correlation & clustering
# The random forest works without problems with correlated features, but in order to better assess feature importance
# it is important to run a groupwise permutation on the features, so one has to create clusters of correlated features.
# The Dython library allows computing association measures for a dataset made up of variables of different types:
# Pearson's r for quantitative variable pairs, correlation ratio for quantitative/nominal pairs and Cramer's V for
# nominal variables by default (or Theil's U).

# Turn absolute association into a dissimilarity, then into the condensed
# form expected by scipy's `linkage`.
dissimilarity = 1 - abs(compute_associations(X_train))
diss_condensed = squareform(dissimilarity)
diss_linkage = linkage(diss_condensed, method='complete')

fig, (ax1) = plt.subplots()
ax1.set(xlabel="Features", ylabel="Dissimilarity [0,1]")
# Dashed line marks the 0.4 cut used by `fcluster` below.
plt.axhline(y=0.4, color='r', linestyle='--')
dendro = hierarchy.dendrogram(diss_linkage, labels=list(X_train.columns.values), ax=ax1, leaf_rotation=90)
plt.tight_layout()
#plt.savefig('Clusters.png', bbox_inches='tight')
#pickle.dump(fig,open("Clusters.pickle","wb"))
# endregion

# region Creation clusters
# Cut the dendrogram at distance 0.4 to obtain flat feature clusters.
clusters = fcluster(diss_linkage, 0.4, criterion='distance')
gow.append(np.mean(np.abs(true - imputed))/cont_range) else: gow.append((true.astype(int) != imputed.astype(int)).mean()) error.loc[method, 'gow'] = np.mean(gow) #error.T[['full']].T.to_csv(res + 'Run' + str(run_idx) + '/res' + dataset_name + '.csv', index = False) #============================= # Comparing associations structure #============================= import seaborn as sns from dython.nominal import compute_associations, associations from sklearn.metrics.pairwise import cosine_similarity original_assoc = compute_associations(full_pima, nominal_columns = cat_features) associations(full_pima, nominal_columns = cat_features) Ez = out2['Ez.y'] vc = vars_contributions(completed_y2, Ez, assoc_thr = 0.0, \ title = 'Contribution of the variables to the latent dimensions',\ storage_path = None) assoc = cosine_similarity(vc, dense_output=True) labels = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'D.P. Function', 'Age', 'Outcome'] fig, axn = plt.subplots(1, 2, sharex=True, sharey=True, figsize = (12,10)) cbar_ax = fig.add_axes([.91, .3, .03, .4])
def vars_contributions(df, latent_rpz, assoc_thr=0.0,
                       title='Contribution of the variables to the latent dimensions',
                       storage_path=None):
    '''
    Plot the contribution of the original variables to the latent dimensions
    constructed by the MDGMM

    Parameters
    ----------
    df : pandas DataFrame
        The original variables.
    latent_rpz : pandas DataFrame or numpy array
        The latent representation of the observations issued by the MDGMM.
        Must have exactly two columns. The caller's object is not modified.
    assoc_thr : float, optional
        The minimal association (in absolute value) with the latent dimensions
        for a variable to be displayed: a variable is drawn only when its
        association with both dimensions exceeds it. The default is 0.0.
    title : str, optional
        The title of the plot to display. The default is
        'Contribution of the variables to the latent dimensions'.
    storage_path : str, optional
        The path to store the plot. Nothing is saved when it is None or empty.

    Returns
    -------
    corrs : numpy array of shape (number of columns of df, 2)
        The associations computed between each original variable and each
        latent dimension.

    Raises
    ------
    NotImplementedError
        If latent_rpz does not have exactly two columns.
    '''

    latent_cols = ['Latent dimension 1', 'Latent dimension 2']

    latent_dim = latent_rpz.shape[1]
    # Only a two-dimensional map can be drawn; reject every other width here
    # rather than failing later when the two column names are assigned.
    if latent_dim != 2:
        raise NotImplementedError(
            'This function is intended for latent '
            'representation of dimension 2 for the moment')

    if isinstance(latent_rpz, pd.DataFrame):
        # Work on a copy so the caller's column names are left untouched.
        latent_rpz = latent_rpz.copy()
        latent_rpz.columns = latent_cols
    else:
        # Format the latent representation into a pandas DataFrame
        latent_rpz = pd.DataFrame(latent_rpz, columns=latent_cols)

    # Latent representation of the variables
    corrs = np.zeros((df.shape[1], latent_rpz.shape[1]))
    for j1, original_col in enumerate(df.columns):
        for j2, latent_col in enumerate(latent_rpz.columns):
            # Two-column frame (original variable, latent dimension), joined on the index.
            old_new = pd.DataFrame(df[original_col]).join(
                pd.DataFrame(latent_rpz[latent_col]))
            # Off-diagonal entry of the 2x2 association matrix.
            assoc = compute_associations(old_new).iloc[1, 0]
            corrs[j1, j2] = assoc

    # Plot a variable factor map for the first two dimensions.
    (fig, ax) = plt.subplots(figsize=(8, 8))
    for i in range(df.shape[1]):
        if (np.abs(corrs[i]) > assoc_thr).all():
            ax.arrow(
                0,
                0,  # Start the arrow at the origin
                corrs[i, 0],  # association with latent dimension 1
                corrs[i, 1],  # association with latent dimension 2
                head_width=0.1,
                head_length=0.1)
            plt.text(corrs[i, 0] + 0.05,
                     corrs[i, 1] + 0.05,
                     s=df.columns.values[i])

    an = np.linspace(0, 2 * np.pi, 300)
    plt.plot(np.cos(an), np.sin(an))  # Add a unit circle for scale
    plt.axis('equal')
    plt.xlabel('Latent dimension 1', fontsize=16)
    plt.ylabel('Latent dimension 2', fontsize=16)
    ax.set_title(title)

    if storage_path:
        plt.savefig(storage_path)
    plt.show()

    return corrs