def distance(character1, character2, novelLength, t):
    """Compare two characters' appearance positions with a 1-D Wasserstein distance.

    The character with more appearances is reduced to a subset of the same
    size as the other character's appearances. The subset is chosen greedily:
    for each appearance of the shorter list, in order, the closest
    not-yet-used appearance of the longer list is taken. Both samples are then
    divided by ``novelLength``, raised to the power ``1 + t`` and handed to
    ``ot.wasserstein_1d`` with ``p=0.5``.

    Neither input sequence is modified.

    Parameters
    ----------
    character1, character2 : sequence of numbers
        Appearance positions of the two characters. The argument order does
        not matter; the longer one is always the one that is sub-sampled.
    novelLength : number
        Divisor used to normalize the appearance positions.
    t : number
        Offset of the exponent applied after normalization (``1 + t``).

    Returns
    -------
    The value returned by ``ot.wasserstein_1d`` for the two transformed
    samples.
    """
    # The first character must be the one with more appearances.
    if len(character1) < len(character2):
        character1, character2 = character2, character1

    # Greedy nearest-neighbour matching. A set gives O(1) "already used"
    # checks, and ``min`` keeps the first index on ties.
    used = set()
    matched = []
    for target in character2:
        nearest = min(
            (j for j in range(len(character1)) if j not in used),
            key=lambda j: abs(character1[j] - target),
        )
        used.add(nearest)
        matched.append(character1[nearest])
    matched.sort()

    # Normalize by novel length, then raise to the power 1 + t. New lists are
    # built so the caller's data is never touched.
    exponent = 1 + t
    longer_sample = [(value / novelLength) ** exponent for value in matched]
    shorter_sample = [(value / novelLength) ** exponent for value in character2]

    return ot.wasserstein_1d(longer_sample, shorter_sample, p=0.5)
def test_wass_1d():
    """Check that ``ot.wasserstein_1d`` gives a result similar to ``ot.emd``.

    Two 1-D Gaussian samples of different sizes are drawn from a seeded
    generator. The square root of the EMD cost under a squared-Euclidean
    ground metric is compared with the 1-D solver's result for ``p=2``.
    """
    n_source, n_target = 20, 30
    random_state = np.random.RandomState(0)
    source = random_state.randn(n_source, 1)
    target = random_state.randn(n_target, 1)

    # Reference value from the general solver; the empty weight lists are
    # passed through exactly as in the 1-D call below.
    cost_matrix = ot.dist(source, target, metric='sqeuclidean')
    _, emd_log = ot.emd([], [], cost_matrix, log=True)
    emd_cost = emd_log["cost"]

    one_d_cost = ot.wasserstein_1d(source, target, [], [], p=2.)

    # The two losses should agree up to numerical tolerance.
    np.testing.assert_allclose(np.sqrt(emd_cost), one_d_cost)
def predictive_distribution_wasserstein_distance(predictive_distribution1,
                                                 predictive_distribution2,
                                                 n_samples=1000,
                                                 seed=0):
    """Return the mean per-test-point 1-D Wasserstein distance of two predictives.

    ``n_samples`` draws are taken from each distribution (with seeds ``seed``
    and ``seed + 1`` respectively), converted with ``.numpy()`` and squeezed.
    For every index along axis 1 of the first sample array, the two sample
    columns are compared with ``ot.wasserstein_1d``; the mean of those
    distances is returned.
    """
    draws_first = np.squeeze(
        predictive_distribution1.sample(n_samples, seed=seed).numpy())
    draws_second = np.squeeze(
        predictive_distribution2.sample(n_samples, seed=seed + 1).numpy())

    # Original author's note: ot's Wasserstein distance is about 30% faster
    # than scipy's.
    n_test_points = draws_first.shape[1]
    per_point_distances = [
        ot.wasserstein_1d(draws_first[:, point], draws_second[:, point])
        for point in range(n_test_points)
    ]
    return np.mean(per_point_distances)
kdeplot(ref_sample, label="kde-ref", color="darkred") plt.grid(linestyle="-.", color="lightgrey") plt.legend() plt.show() # calculate average Wp distance with 100 points in test dataset from ot import wasserstein_1d predlist = [] predlistY = [] ideallist = [] for i in range(100): Y_c, W_c = reg.predict_distribution(X[N_train + i]) predlist += [ wasserstein_1d(p=2, x_a=np.random.choice(Y_c, p=W_c, size=N_train), x_b=np.random.normal(obj_func(X[N_train + i]), np.sqrt(obj_func2(X[N_train + i])), 100000)) ] predlistY += [ wasserstein_1d(p=2, x_a=Y[:N_train], x_b=np.random.normal(obj_func(X[N_train + i]), np.sqrt(obj_func2(X[N_train + i])), 100000)) ] ideallist += [ wasserstein_1d(p=2, x_a=np.random.normal(obj_func(X[N_train + i]), np.sqrt(obj_func2(X[N_train + i])), N_train),
def distance_stats(pre, post, downsample=False, verbose=True):
    """
    Tests for correlation between Euclidean cell-cell distances before and
    after transformation by a function or DR algorithm.

    Steps performed:

    1) flattens 2D distance matrices to vectors (upper triangle only when both
       matrices are square and symmetric, all entries otherwise), using the
       same entries for `pre` and `post` so that distances stay paired
    2) optionally downsamples both vectors with the same random indices
    3) performs Pearson correlation of the distance vectors
    4) normalizes distances using min-max standardization for each dataset
    5) calculates Wasserstein or Earth-Mover's Distance between the
       normalized distance distributions

    The input arrays are not modified.

    Parameters
    ----------

    pre : np.array
        vector of unique distances (pdist()) or distance matrix of shape
        (n_cells, m_cells), i.e. (cdist()) before transformation/projection
    post : np.array
        vector of unique distances (pdist()) or distance matrix of shape
        (n_cells, m_cells), i.e. (cdist()) after transformation/projection
    downsample : int, optional (default=False)
        number of distances to downsample to. maximum of 50M (~10k cells, if
        symmetrical) is recommended for performance.
    verbose : bool, optional (default=True)
        print progress statements to console

    Returns
    -------

    pre : np.array
        vector of min-max normalized distances before
        transformation/projection
    post : np.array
        vector of min-max normalized distances after
        transformation/projection
    corr_stats : list
        output of `pearsonr()` function correlating the two distance vectors
    EMD : float
        output of `wasserstein_1d()` function calculating the Earth Mover's
        Distance between the two normalized distance vectors
    """
    # make sure the number of cells in each matrix is the same
    assert (
        pre.shape == post.shape
    ), 'Matrices contain different number of distances.\n{} in "pre"\n{} in "post"\n'.format(
        pre.shape[0], post.shape[0])

    def _is_symmetric(matrix):
        # the shape check comes first: comparing a non-square matrix with its
        # transpose would raise a broadcasting error inside np.allclose
        return matrix.shape[0] == matrix.shape[1] and np.allclose(
            matrix, matrix.T, rtol=1e-05, atol=1e-08)

    # if distance matrix (mA x mB, result of cdist), flatten to unique
    # cell-cell distances
    if pre.ndim == 2:
        if verbose:
            print(
                "Flattening pre-transformation distance matrix into 1D array..."
            )
            print(
                "Flattening post-transformation distance matrix into 1D array..."
            )
        if _is_symmetric(pre) and _is_symmetric(post):
            # symmetric: only keep unique values (above diagonal)
            upper = np.triu_indices(n=pre.shape[0], k=1)
            pre, post = pre[upper], post[upper]
        else:
            # otherwise, flatten all distances
            pre, post = pre.flatten(), post.flatten()

    # if dataset is large, randomly downsample to reasonable number of
    # distances for calculation
    if downsample:
        assert downsample < len(
            pre
        ), "Must provide downsample value smaller than total number of cell-cell distances provided in pre and post"
        if verbose:
            print("Downsampling to {} total cell-cell distances...".format(
                downsample))
        idx = np.random.choice(len(pre), downsample, replace=False)
        pre, post = pre[idx], post[idx]

    # calculate correlation coefficient using Pearson correlation
    if verbose:
        print("Correlating distances")
    corr_stats = pearsonr(x=pre, y=post)

    # min-max normalization for fair comparison of probability distributions.
    # New arrays are created so the caller's data is left untouched and
    # integer inputs work; np.ptp() replaces ndarray.ptp(), which NumPy 2.0
    # removed.
    if verbose:
        print("Normalizing unique distances")
    pre = (pre - pre.min()) / np.ptp(pre)
    post = (post - post.min()) / np.ptp(post)

    # calculate EMD for the distance matrices
    # by default, downsample to 50M distances to speed processing time,
    # since this function often breaks with larger distributions
    if verbose:
        print("Calculating Earth-Mover's Distance between distributions")
    max_emd_distances = 50000000
    if len(pre) > max_emd_distances:
        idx = np.random.choice(len(pre), max_emd_distances, replace=False)
        EMD = wasserstein_1d(pre[idx], post[idx])
    else:
        EMD = wasserstein_1d(pre, post)

    return pre, post, corr_stats, EMD
def cluster_arrangement_sc(
    adata,
    pre,
    post,
    obs_col,
    IDs,
    ID_names=None,
    figsize=(4, 4),
    legend=True,
    ax_labels=("Native", "Latent"),
):
    """
    Determines pairwise distance preservation between 3 IDs from
    `adata.obs[obs_col]`

    Parameters
    ----------

    adata : anndata.AnnData
        anndata object to pull dimensionality reduction from
    pre : np.array
        matrix to subset as pre-transformation (i.e. `adata.X`)
    post : np.array
        matrix to subset as post-transformation (i.e. `adata.obsm["X_pca"]`)
    obs_col : str
        name of column in `adata.obs` to use as cell IDs (i.e. "louvain")
    IDs : list (len==3)
        list of THREE IDs to compare (i.e. [0,1,2])
    ID_names : list (len==3), optional (default=None)
        names used in the legend for the three IDs. if None, the values in
        `IDs` are used. names are converted to `str` for labeling.
    figsize : tuple of float, optional (default=(4,4))
        size of resulting figure (only the first value is used, as the
        `height` of the joint grid)
    legend : bool, optional (default=True)
        display legend on plot
    ax_labels : sequence of str (len==2), optional (default=("Native","Latent"))
        two strings for x and y axis labels, respectively. if False, exclude
        axis labels.

    Returns
    -------

    corr_stats : list
        list of Pearson correlation coefficients (first element of
        `pearsonr()` output) for the three cluster pairs (0-1, 0-2, 1-2)
    EMD : list
        list of outputs of `wasserstein_1d()` function calculating the Earth
        Mover's Distance between the normalized native and latent distance
        vectors for the three cluster pairs (0-1, 0-2, 1-2)

    Outputs jointplot with 2D histogram of pairwise distances, with marginal
    KDE plots showing density of each native and latent distance vector
    """
    pairs = ((0, 1), (0, 2), (1, 2))
    colors = ("darkorange", "darkgreen", "darkred")

    # one membership mask per ID, reused for both matrices
    cell_labels = adata.obs[obs_col]
    masks = [cell_labels == IDs[k] for k in range(3)]

    def _pair_distances(matrix):
        # flattened cell-cell distances for each of the three cluster pairs
        return [
            cdist(matrix[masks[a]], matrix[masks[b]]).flatten()
            for a, b in pairs
        ]

    def _joint_min_max(vectors):
        # min-max normalize all three vectors together, then split them back.
        # np.ptp() replaces ndarray.ptp(), which NumPy 2.0 removed.
        combined = np.concatenate(vectors)
        combined = (combined - combined.min()) / np.ptp(combined)
        n_first, n_second = vectors[0].shape[0], vectors[1].shape[0]
        parts = (
            combined[:n_first],
            combined[n_first:n_first + n_second],
            combined[n_first + n_second:],
        )
        return combined, parts

    # distance calculations and normalization for pre and post matrices
    pre_raw = _pair_distances(pre)
    post_raw = _pair_distances(post)
    dist, dist_norm = _joint_min_max(pre_raw)
    post_dist, post_norm = _joint_min_max(post_raw)

    # calculate EMD and Pearson correlation stats
    EMD = [wasserstein_1d(d, p) for d, p in zip(dist_norm, post_norm)]
    corr_stats = [pearsonr(x=d, y=p)[0] for d, p in zip(pre_raw, post_raw)]

    # str() keeps labeling from failing when IDs are not strings (e.g. ints)
    names = [str(name) for name in (IDs if ID_names is None else ID_names)]
    pair_labels = ["{} - {}".format(names[a], names[b]) for a, b in pairs]

    # generate jointplot
    g = sns.JointGrid(x=dist, y=post_dist, space=0, height=figsize[0])
    g.plot_joint(plt.hist2d, bins=50, cmap=sns.cubehelix_palette(as_cmap=True))
    # `fill` is the current name of the deprecated `shade` argument
    for vector, color, label in zip(dist_norm, colors, pair_labels):
        sns.kdeplot(
            x=vector,
            fill=False,
            bw_method=0.01,
            ax=g.ax_marg_x,
            color=color,
            label=label,
            legend=legend,
        )
    if legend:
        g.ax_marg_x.legend(loc=(1.01, 0.1))
    for vector, color in zip(post_norm, colors):
        sns.kdeplot(
            y=vector,
            fill=False,
            bw_method=0.01,
            color=color,
            ax=g.ax_marg_y,
        )

    # plot identity line as reference for regression
    identity = np.linspace(max(dist.min(), post_dist.min()), 1, 100)
    g.ax_joint.plot(
        identity,
        identity,
        linestyle="dashed",
        color=sns.cubehelix_palette()[-1],
    )

    if ax_labels:
        plt.xlabel(ax_labels[0],
                   fontsize="xx-large",
                   color=sns.cubehelix_palette()[-1])
        plt.ylabel(ax_labels[1],
                   fontsize="xx-large",
                   color=sns.cubehelix_palette()[2])

    plt.tick_params(labelleft=False, labelbottom=False)

    return corr_stats, EMD
def cluster_arrangement_sc(
    adata,
    pre,
    post,
    obs_col,
    IDs,
    ID_names=None,
    figsize=(4, 4),
    legend=True,
    ax_labels=("Native", "Latent"),
):
    """
    determine pairwise distance preservation between 3 IDs from adata.obs[obs_col]
        adata = anndata object to pull dimensionality reduction from
        pre = matrix to subset as pre-transformation (i.e. adata.X)
        post = matrix to subset as post-transformation (i.e. adata.obsm['X_pca'])
        obs_col = name of column in adata.obs to use as cell IDs (i.e. 'louvain')
        IDs = list of THREE IDs to compare (i.e. [0,1,2])
        ID_names = list of three names for the legend; defaults to IDs.
            names are converted to str for labeling.
        figsize = size of resulting axes (only the first value is used, as
            the height of the joint grid)
        legend = display legend on plot
        ax_labels = two strings for x and y axis labels, respectively. if False, exclude axis labels.

    returns (corr_stats, EMD): two lists with one entry per cluster pair
    (0-1, 0-2, 1-2) holding the Pearson correlation coefficient and the
    wasserstein_1d() output, respectively.
    """
    # boolean membership of every cell in each of the three IDs
    obs_values = adata.obs[obs_col]
    in_0 = obs_values == IDs[0]
    in_1 = obs_values == IDs[1]
    in_2 = obs_values == IDs[2]

    # distance calculations for pre_obj
    dist_0_1 = cdist(pre[in_0], pre[in_1]).flatten()
    dist_0_2 = cdist(pre[in_0], pre[in_2]).flatten()
    dist_1_2 = cdist(pre[in_1], pre[in_2]).flatten()
    # combine and min-max normalize
    # (np.ptp() replaces ndarray.ptp(), which NumPy 2.0 removed)
    dist = np.concatenate([dist_0_1, dist_0_2, dist_1_2])
    dist = (dist - dist.min()) / np.ptp(dist)
    # split normalized distances by cluster pair
    cut_1 = dist_0_1.shape[0]
    cut_2 = cut_1 + dist_0_2.shape[0]
    dist_norm_0_1 = dist[:cut_1]
    dist_norm_0_2 = dist[cut_1:cut_2]
    dist_norm_1_2 = dist[cut_2:]

    # distance calculations for post_obj
    post_0_1 = cdist(post[in_0], post[in_1]).flatten()
    post_0_2 = cdist(post[in_0], post[in_2]).flatten()
    post_1_2 = cdist(post[in_1], post[in_2]).flatten()
    # combine and min-max normalize; a separate name keeps the `post`
    # argument from being shadowed
    post_dist = np.concatenate([post_0_1, post_0_2, post_1_2])
    post_dist = (post_dist - post_dist.min()) / np.ptp(post_dist)
    # split normalized distances by cluster pair
    cut_1 = post_0_1.shape[0]
    cut_2 = cut_1 + post_0_2.shape[0]
    post_norm_0_1 = post_dist[:cut_1]
    post_norm_0_2 = post_dist[cut_1:cut_2]
    post_norm_1_2 = post_dist[cut_2:]

    # calculate EMD and Pearson correlation stats
    EMD = [
        wasserstein_1d(dist_norm_0_1, post_norm_0_1),
        wasserstein_1d(dist_norm_0_2, post_norm_0_2),
        wasserstein_1d(dist_norm_1_2, post_norm_1_2),
    ]
    corr_stats = [
        pearsonr(x=dist_0_1, y=post_0_1)[0],
        pearsonr(x=dist_0_2, y=post_0_2)[0],
        pearsonr(x=dist_1_2, y=post_1_2)[0],
    ]

    # str() keeps labeling from failing when IDs are not strings (e.g. ints)
    if ID_names is None:
        ID_names = IDs
    name_0, name_1, name_2 = (str(ID_names[k]) for k in range(3))

    # generate jointplot
    g = sns.JointGrid(x=dist, y=post_dist, space=0, height=figsize[0])
    g.plot_joint(plt.hist2d, bins=50, cmap=sns.cubehelix_palette(as_cmap=True))

    # marginal densities of native distances; `fill` is the current name of
    # the deprecated `shade` argument
    x_marginals = (
        (dist_norm_0_1, "darkorange", name_0 + " - " + name_1),
        (dist_norm_0_2, "darkgreen", name_0 + " - " + name_2),
        (dist_norm_1_2, "darkred", name_1 + " - " + name_2),
    )
    for values, color, label in x_marginals:
        sns.kdeplot(
            x=values,
            fill=False,
            bw_method=0.01,
            ax=g.ax_marg_x,
            color=color,
            label=label,
            legend=legend,
        )
    if legend:
        g.ax_marg_x.legend(loc=(1.01, 0.1))

    # marginal densities of latent distances
    y_marginals = (
        (post_norm_0_1, "darkorange"),
        (post_norm_0_2, "darkgreen"),
        (post_norm_1_2, "darkred"),
    )
    for values, color in y_marginals:
        sns.kdeplot(
            y=values,
            fill=False,
            bw_method=0.01,
            color=color,
            ax=g.ax_marg_y,
        )

    # plot identity line as reference for regression
    identity = np.linspace(max(dist.min(), post_dist.min()), 1, 100)
    g.ax_joint.plot(
        identity,
        identity,
        linestyle="dashed",
        color=sns.cubehelix_palette()[-1],
    )

    if ax_labels:
        plt.xlabel(ax_labels[0], fontsize="xx-large", color=sns.cubehelix_palette()[-1])
        plt.ylabel(ax_labels[1], fontsize="xx-large", color=sns.cubehelix_palette()[2])

    plt.tick_params(labelleft=False, labelbottom=False)

    return corr_stats, EMD
def cluster_arrangement(
    pre_obj,
    post_obj,
    clusters,
    cluster_names=None,
    figsize=(6, 6),
    pre_transform="arcsinh",
    legend=True,
    ax_labels=("Native", "Latent"),
):
    """
    determine pairwise distance preservation between 3 clusters
        pre_obj = RNA_counts object
        post_obj = DR object
        clusters = list of barcode IDs i.e. ['0','1','2'] to calculate pairwise distances between clusters 0, 1 and 2
        cluster_names = list of cluster names for labeling i.e. ['Bipolar Cells','Rods','Amacrine Cells'] for clusters 0, 1 and 2, respectively.
            defaults to clusters. names are converted to str for labeling.
        figsize = size of output figure to plot (only the first value is
            used, as the height of the joint grid)
        pre_transform = apply transformation to pre_obj counts? (None, 'arcsinh', or 'log2')
        legend = display legend on plot
        ax_labels = two strings for x and y axis labels, respectively. if False, exclude axis labels.

    returns (corr_stats, EMD): two lists with one entry per cluster pair
    (0-1, 0-2, 1-2) holding the Pearson correlation coefficient and the
    wasserstein_1d() output, respectively.
    """
    pairs = ((0, 1), (0, 2), (1, 2))
    colors = ("darkorange", "darkgreen", "darkred")

    def _joint_min_max(vectors):
        # min-max normalize the three vectors together, then split them back.
        # A new float array is built so integer distances also work;
        # np.ptp() replaces ndarray.ptp(), which NumPy 2.0 removed.
        combined = np.concatenate(vectors)
        combined = (combined - combined.min()) / np.ptp(combined)
        n_first, n_second = vectors[0].shape[0], vectors[1].shape[0]
        parts = (
            combined[:n_first],
            combined[n_first:n_first + n_second],
            combined[n_first + n_second:],
        )
        return combined, parts

    # distance calculations for pre_obj (with optional count transformation)
    pre_raw = [
        pre_obj.barcode_distance_matrix(
            ranks=[clusters[a], clusters[b]], transform=pre_transform
        ).flatten()
        for a, b in pairs
    ]
    # distance calculations for post_obj
    post_raw = [
        post_obj.barcode_distance_matrix(
            ranks=[clusters[a], clusters[b]]
        ).flatten()
        for a, b in pairs
    ]

    # combine, min-max normalize and split by cluster pair
    dist, dist_norm = _joint_min_max(pre_raw)
    post, post_norm = _joint_min_max(post_raw)

    # calculate EMD and Pearson correlation stats
    EMD = [wasserstein_1d(d, p) for d, p in zip(dist_norm, post_norm)]
    corr_stats = [pearsonr(x=d, y=p)[0] for d, p in zip(pre_raw, post_raw)]

    # str() keeps labeling from failing when cluster ids are not strings
    names = [
        str(name)
        for name in (clusters if cluster_names is None else cluster_names)
    ]
    pair_labels = ["{} - {}".format(names[a], names[b]) for a, b in pairs]

    # generate jointplot
    g = sns.JointGrid(x=dist, y=post, space=0, height=figsize[0])
    g.plot_joint(plt.hist2d, bins=50, cmap=sns.cubehelix_palette(as_cmap=True))
    # `fill` is the current name of the deprecated `shade` argument
    for vector, color, label in zip(dist_norm, colors, pair_labels):
        sns.kdeplot(
            x=vector,
            fill=False,
            bw_method=0.01,
            ax=g.ax_marg_x,
            color=color,
            label=label,
            legend=legend,
        )
    if legend:
        g.ax_marg_x.legend(loc=(1.01, 0.1))
    for vector, color in zip(post_norm, colors):
        sns.kdeplot(
            y=vector,
            fill=False,
            bw_method=0.01,
            color=color,
            ax=g.ax_marg_y,
        )

    # plot identity line as reference for regression; the array .min() method
    # avoids the builtin min()'s Python-level loop over every distance
    identity = np.linspace(max(dist.min(), post.min()), 1, 100)
    g.ax_joint.plot(
        identity,
        identity,
        linestyle="dashed",
        color=sns.cubehelix_palette()[-1],
    )

    if ax_labels:
        plt.xlabel(ax_labels[0], fontsize="xx-large", color=sns.cubehelix_palette()[-1])
        plt.ylabel(ax_labels[1], fontsize="xx-large", color=sns.cubehelix_palette()[2])

    plt.tick_params(labelleft=False, labelbottom=False)

    return corr_stats, EMD