def score_divergence(codes, labels, sources, k=50, **kwargs):
    """Average symmetric divergence between every pair of sources.

    Measures how well the sources are mixed (smaller means better mixed),
    following the divergence score described in BERMUDA. For every pair of
    sources (p, q) both directed divergences D(p || q) and D(q || p) are
    obtained from ``estimate``, negative estimates are clipped to zero, and
    the result is ``.5 * mean(D(p || q)) + .5 * mean(D(q || p))`` taken over
    all scored pairs.

    Args:
        codes: merged data matrix, one row per item.
        labels: label of each item (e.g. cell type).
        sources: index of each item's source (e.g. tech; data or prior).
            Sources are addressed as ``0 .. n_sources - 1``, where
            ``n_sources`` is the number of distinct values in ``sources``.
        k: neighbour count handed to ``estimate``; a pair is skipped when
            either side selects fewer than ``k`` items.
        **kwargs: accepted for interface compatibility; not read here.

    Returns:
        The non-negative divergence score, or ``np.nan`` when no pair of
        sources could be scored.
    """
    n_sources = np.unique(sources).size
    forward = []   # clipped D(p || q), one entry per scored pair
    backward = []  # clipped D(q || p), one entry per scored pair

    source_pairs = (
        (first, second)
        for first in range(n_sources)
        for second in range(first + 1, n_sources)
    )
    for first, second in source_pairs:
        mask_p, mask_q, _ = separate_shared_idx(
            labels, sources, d1=first, d2=second)

        # Both sides need at least k selected items for the k-NN estimate.
        if not (sum(mask_p) >= k and sum(mask_q) >= k):
            continue

        items_p = codes[mask_p, :]
        items_q = codes[mask_q, :]
        forward.append(max(estimate(items_p, items_q, k), 0))
        backward.append(max(estimate(items_q, items_p, k), 0))

    # Nothing was scored: report NaN rather than dividing by zero.
    if not forward:
        return np.nan

    mean_forward = sum(forward) / len(forward)
    mean_backward = sum(backward) / len(backward)
    return (mean_forward + mean_backward) / 2
Xtrain[:, j].min() - 0.5, Xtrain[:, j].max() + 0.5, Xtrain[:, i].min() - 0.5, Xtrain[:, i].max() + 0.5 ]) if i > j: fig.axes[i, j].axis([ Xtrain[:, j].min() - 0.5, Xtrain[:, j].max() + 0.5, Xtrain[:, i].min() - 0.5, Xtrain[:, i].max() + 0.5 ]) # plot plt.close('all') fig1 = sns.PairGrid(pd.DataFrame(Xtrain)) fig1 = fig1.map_upper(plt.scatter, edgecolor="w") fig1 = fig1.map_lower(sns.kdeplot, cmap="Blues_d") fig1 = fig1.map_diag(sns.kdeplot, lw=3, legend=False) set_axes(fig1) savefig(1) x_mu = model.sample(1000) std = (x_mu[:, 4:8] - x_mu[:, 0:4]) x_samp = x_mu[:, 4:8] + (std**2) * torch.randn_like(x_mu[:, 4:8]) x_samp = x_samp[std.sum(dim=1) < 10] fig7 = sns.PairGrid(pd.DataFrame(x_samp.detach().numpy())) fig7 = fig7.map_upper(plt.scatter, edgecolor="w") fig7 = fig7.map_lower(sns.kdeplot, cmap="Blues_d") fig7 = fig7.map_diag(sns.kdeplot, lw=3, legend=False) set_axes(fig7) savefig(7) print(estimate(Xtrain, x_samp.detach().numpy()))
def evaluate_scores(div_ent_code, sil_code, cell_labels, dataset_labels,
                    num_datasets, div_ent_dim, sil_dim, sil_dist):
    """Calculate the divergence, entropy and silhouette evaluation metrics.

    Args:
        div_ent_code: num_cells * num_features embedding used for the
            divergence and entropy calculation, usually with dim of 2.
        sil_code: num_cells * num_features embedding used for the
            silhouette score calculation.
        cell_labels: array with the cell label of every cell.
        dataset_labels: array with the dataset id of every cell; ids are
            compared against ``1 .. num_datasets``.
        num_datasets: number of datasets.
        div_ent_dim: if dimension of div_ent_code > div_ent_dim, apply PCA
            first.
        sil_dim: if dimension of sil_code > sil_dim, apply PCA first.
        sil_dist: distance metric for the silhouette score calculation.

    Returns:
        div_score: divergence score averaged over dataset pairs, or
            ``np.nan`` when no pair has at least ``cal_min`` shared-type
            cells in both datasets.
        ent_score: entropy score averaged over dataset pairs, or -1 when
            no pair has dataset-specific cell types.
        sil_score: mean silhouette value over all cells.
    """
    # calculate divergence and entropy
    if div_ent_code.shape[1] > div_ent_dim:
        div_ent_code = PCA(
            n_components=div_ent_dim).fit_transform(div_ent_code)

    div_pq = []  # divergence dataset p, q
    div_qp = []  # divergence dataset q, p
    ent = []  # entropy

    # pairs of datasets
    for d1 in range(1, num_datasets + 1):
        for d2 in range(d1 + 1, num_datasets + 1):
            idx1 = dataset_labels == d1
            idx2 = dataset_labels == d2

            # Cell labels present in both datasets of the pair.
            labels = np.intersect1d(np.unique(cell_labels[idx1]),
                                    np.unique(cell_labels[idx2]))
            # Computed once per pair; reused by the three masks below.
            is_shared = np.isin(cell_labels, labels)
            idx1_mutual = np.logical_and(idx1, is_shared)
            idx2_mutual = np.logical_and(idx2, is_shared)
            idx_specific = np.logical_and(np.logical_or(idx1, idx2),
                                          np.logical_not(is_shared))

            # divergence: needs at least cal_min shared-type cells per side
            if (np.sum(idx1_mutual) >= cal_min
                    and np.sum(idx2_mutual) >= cal_min):
                code_p = div_ent_code[idx1_mutual, :]
                code_q = div_ent_code[idx2_mutual, :]
                div_pq.append(max(estimate(code_p, code_q, cal_min), 0))
                div_qp.append(max(estimate(code_q, code_p, cal_min), 0))

            # entropy: only for pairs with dataset-specific cell types
            if np.any(idx_specific):
                ent_tmp = cal_entropy(div_ent_code, idx_specific,
                                      dataset_labels)
                ent.append(sum(ent_tmp) / len(ent_tmp))

    if not ent:  # if no dataset specific cell types, store entropy as -1
        ent.append(-1)

    # calculate silhouette_score
    if sil_code.shape[1] > sil_dim:
        sil_code = PCA(n_components=sil_dim).fit_transform(sil_code)
    sil_scores = silhouette_samples(sil_code, cell_labels, metric=sil_dist)

    # average for scores
    if div_pq:
        div_score = (sum(div_pq) / len(div_pq)
                     + sum(div_qp) / len(div_qp)) / 2
    else:
        # No pair qualified for the divergence estimate; report NaN
        # instead of raising ZeroDivisionError.
        div_score = np.nan
    ent_score = sum(ent) / len(ent)
    sil_score = sum(sil_scores) / len(sil_scores)

    return div_score, ent_score, sil_score
def plot_posterior(samples, x_truth, epoch, idx, run='testing',
                   other_samples=None):
    """Plot posterior corner plots and return a KL estimate when possible.

    Without ``other_samples`` a single corner plot of ``samples`` is saved
    to ``<run>/full_posterior_epoch_<epoch>_event_<idx>.png``. With
    ``other_samples`` the parameters shared by both sample sets are
    overlaid in one corner plot, annotated with a KL estimate and saved to
    ``<run>/comp_posterior_epoch_<epoch>_event_<idx>.png``.

    Samples and truths are un-normalised with the externally defined
    ``bounds`` and passed through ``convert_hour_angle_to_ra`` before
    plotting. The function also reads the externally defined ``params``,
    ``inf_ol_idx``, ``bilby_ol_idx``, ``inf_ol_len`` and ``bilby_ol_len``.

    Args:
        samples: 2-D array of normalised samples, one column per entry of
            ``params['inf_pars']``.
        x_truth: normalised true values, indexed like the columns of
            ``samples``.
        epoch: epoch number, used in the output file name.
        idx: event index, used in the output file name.
        run: output directory.
        other_samples: optional 2-D array of normalised comparison
            samples, one column per entry of ``params['bilby_pars']``.

    Returns:
        The KL estimate between 1000 randomly drawn rows of each sample
        set, or -1.0 when ``other_samples`` is None or the estimate fails.
    """

    def _unnormalise(values, par):
        # Undo the min/max scaling of parameter `par`.
        lower = bounds[par + '_min']
        upper = bounds[par + '_max']
        return (values * (upper - lower)) + lower

    if other_samples is not None:
        true_post = np.zeros([other_samples.shape[0], bilby_ol_len])
        true_x = np.zeros(inf_ol_len)
        true_XS = np.zeros([samples.shape[0], inf_ol_len])
        ol_pars = []
        for cnt, (inf_idx, bilby_idx) in enumerate(
                zip(inf_ol_idx, bilby_ol_idx)):
            inf_par = params['inf_pars'][inf_idx]
            bilby_par = params['bilby_pars'][bilby_idx]
            true_XS[:, cnt] = _unnormalise(samples[:, inf_idx], inf_par)
            true_post[:, cnt] = _unnormalise(other_samples[:, bilby_idx],
                                             bilby_par)
            true_x[cnt] = _unnormalise(x_truth[inf_idx], inf_par)
            ol_pars.append(inf_par)

        # Corner labels of the overlapping parameters, in rand_pars order.
        parnames = [params['corner_labels'][k]
                    for k in params['rand_pars'] if k in ol_pars]

        # convert to RA
        true_XS = convert_hour_angle_to_ra(true_XS, params, ol_pars)
        true_x = convert_hour_angle_to_ra(
            np.reshape(true_x, [1, true_XS.shape[1]]), params,
            ol_pars).flatten()

        # compute KL estimate
        idx1 = np.random.randint(0, true_XS.shape[0], 1000)
        idx2 = np.random.randint(0, true_post.shape[0], 1000)
        try:
            KL_est = estimate(true_XS[idx1, :], true_post[idx2, :])
        except Exception:
            # Best effort: a failed estimate must not abort plotting.
            # Narrowed from a bare except so KeyboardInterrupt and
            # SystemExit still propagate.
            KL_est = -1.0
    else:
        # Get corner parnames to use in plotting labels
        parnames = [params['corner_labels'][k]
                    for k in params['rand_pars']
                    if k in params['inf_pars']]

        # un-normalise full inference parameters
        full_true_x = np.zeros(len(params['inf_pars']))
        new_samples = np.zeros([samples.shape[0], len(params['inf_pars'])])
        for inf_par_idx, inf_par in enumerate(params['inf_pars']):
            new_samples[:, inf_par_idx] = _unnormalise(
                samples[:, inf_par_idx], inf_par)
            full_true_x[inf_par_idx] = _unnormalise(
                x_truth[inf_par_idx], inf_par)

        new_samples = convert_hour_angle_to_ra(new_samples, params,
                                               params['inf_pars'])
        full_true_x = convert_hour_angle_to_ra(
            np.reshape(full_true_x, [1, samples.shape[1]]), params,
            params['inf_pars']).flatten()
        KL_est = -1.0

    # define general plotting arguments
    defaults_kwargs = dict(bins=50,
                           smooth=0.9,
                           label_kwargs=dict(fontsize=16),
                           title_kwargs=dict(fontsize=16),
                           truth_color='tab:orange',
                           quantiles=[0.16, 0.84],
                           levels=(0.68, 0.90, 0.95),
                           density=True,
                           plot_density=False,
                           plot_datapoints=True,
                           max_n_ticks=3)

    # 1-d hist kwargs for normalisation
    hist_kwargs = dict(density=True, color='tab:red')
    hist_kwargs_other = dict(density=True, color='tab:blue')

    if other_samples is None:
        # The truths must be in the same un-normalised, RA-converted units
        # as new_samples, hence full_true_x rather than the raw x_truth.
        corner.corner(new_samples,
                      **defaults_kwargs,
                      labels=parnames,
                      color='tab:red',
                      fill_contours=True,
                      truths=full_true_x,
                      show_titles=True,
                      hist_kwargs=hist_kwargs)
        plt.savefig('%s/full_posterior_epoch_%d_event_%d.png' %
                    (run, epoch, idx))
        plt.close()
    else:
        figure = corner.corner(true_post,
                               **defaults_kwargs,
                               labels=parnames,
                               color='tab:blue',
                               show_titles=True,
                               hist_kwargs=hist_kwargs_other)
        # Overlay the second sample set on the same figure.
        corner.corner(true_XS,
                      **defaults_kwargs,
                      color='tab:red',
                      fill_contours=True,
                      truths=true_x,
                      show_titles=True,
                      fig=figure,
                      hist_kwargs=hist_kwargs)
        plt.annotate('KL = {:.3f}'.format(KL_est), (0.2, 0.95),
                     xycoords='figure fraction',
                     fontsize=18)
        plt.savefig('%s/comp_posterior_epoch_%d_event_%d.png' %
                    (run, epoch, idx))
        plt.close()

    return KL_est
def evaluate_scores(code_arr, cell_labels, dataset_labels, num_datasets,
                    epoch):
    """Calculate and print the evaluation metrics for one epoch.

    ``code_arr`` is first embedded into 2 dimensions with UMAP; the
    divergence, entropy and silhouette scores are computed on that
    embedding. The alignment score and the batch mixing entropy are
    computed on ``code_arr`` itself and are only printed, not returned.

    Args:
        code_arr: num_cells * num_features embedding.
        cell_labels: true cell labels
        dataset_labels: index of different datasets; ids are compared
            against ``1 .. num_datasets``.
        num_datasets: number of datasets
        epoch: epoch number, used only in the printed summary.

    Returns:
        div_score: always 0 (the per-cluster divergence is not computed).
        div_score_all: divergence score over all shared-type cells,
            averaged over dataset pairs, or ``np.nan`` when no pair has at
            least ``cal_min`` shared-type cells in both datasets.
        ent_score: entropy score averaged over dataset pairs, or -1 when
            no pair has dataset-specific cell types.
        sil_score: mean silhouette value over all cells.
    """
    # calculate UMAP (imported locally, as the function has always done)
    import umap
    fit = umap.UMAP(n_neighbors=30,
                    min_dist=0.3,
                    n_components=2,
                    metric='cosine',
                    random_state=123)
    div_ent_code = fit.fit_transform(code_arr)

    # calculate divergence and entropy
    div_pq_all = []  # divergence dataset p, q
    div_qp_all = []  # divergence dataset q, p
    ent = []  # entropy

    # pairs of datasets
    for d1 in range(1, num_datasets + 1):
        for d2 in range(d1 + 1, num_datasets + 1):
            # which samples belong to each batch of the pair
            idx1 = dataset_labels == d1
            idx2 = dataset_labels == d2

            # shared clusters between the two datasets
            labels = np.intersect1d(np.unique(cell_labels[idx1]),
                                    np.unique(cell_labels[idx2]))
            # Computed once per pair; reused by the three masks below.
            is_shared = np.isin(cell_labels, labels)
            idx1_mutual = np.logical_and(idx1, is_shared)
            idx2_mutual = np.logical_and(idx2, is_shared)
            idx_specific = np.logical_and(np.logical_or(idx1, idx2),
                                          np.logical_not(is_shared))

            # Estimate universal k-NN divergence over all shared-type
            # cells; needs at least cal_min cells on each side.
            if (np.sum(idx1_mutual) >= cal_min
                    and np.sum(idx2_mutual) >= cal_min):
                code_p = div_ent_code[idx1_mutual, :]
                code_q = div_ent_code[idx2_mutual, :]
                div_pq_all.append(max(estimate(code_p, code_q, cal_min), 0))
                div_qp_all.append(max(estimate(code_q, code_p, cal_min), 0))

            # entropy: only for pairs with dataset-specific cell types
            if np.any(idx_specific):
                ent_tmp = cal_entropy(div_ent_code, idx_specific,
                                      dataset_labels)
                ent.append(sum(ent_tmp) / len(ent_tmp))

    if not ent:  # if no dataset specific cell types, store entropy as -1
        ent.append(-1)

    # calculate silhouette_score on the 2-D embedding
    sil_scores = silhouette_samples(div_ent_code,
                                    cell_labels,
                                    metric="euclidean")

    # average for scores
    div_score = 0
    if div_pq_all:
        div_score_all = (sum(div_pq_all) / len(div_pq_all)
                         + sum(div_qp_all) / len(div_qp_all)) / 2
    else:
        # No pair qualified for the divergence estimate; report NaN
        # instead of raising ZeroDivisionError.
        div_score_all = np.nan
    ent_score = sum(ent) / len(ent)
    sil_score = sum(sil_scores) / len(sil_scores)

    alignment_score = seurat_alignment_score(code_arr,
                                             dataset_labels,
                                             n=10,
                                             k=0.01)
    mixing_entropy = batch_mixing_entropy(code_arr, dataset_labels)
    print(
        "epoch: ", epoch,
        ' divergence_score: {:.3f}, {:.3f}, alignment_score, mixing_entropy: {:.3f},{:.3f} entropy_score: {:.3f}, silhouette_score: {:.3f}'
        .format(div_score, div_score_all, alignment_score, mixing_entropy,
                ent_score, sil_score))
    return div_score, div_score_all, ent_score, sil_score
def compute_kl(sampset_1, sampset_2, samplers, one_D=False):
    """Compute KL for one test case.

    NOTE(review): ``self`` is not a parameter; this function reads
    ``self.params`` from an enclosing scope, so it presumably lives inside
    a method -- confirm against the surrounding file.

    Args:
        sampset_1: 2-D sample array with one row per sample and one column
            per parameter (it is transposed on entry).
        sampset_2: second sample array, same layout as ``sampset_1``.
        samplers: pair of sampler names. When ``samplers[0] == 'vitamin1'``
            or ``samplers[1] == 'vitamin2'`` both sets are first filtered
            to the prior (see below) and truncated to a common length.
        one_D: if True, return per-parameter symmetric KL values from 1-D
            Gaussian KDEs; otherwise return a single k-NN based estimate.

    Returns:
        one_D=True: array of shape ``(1, len(self.params['inf_pars']))``
            with the symmetric KL of every parameter (all zeros unless
            ``self.params['gen_indi_KLs'] == True``).
        one_D=False: mean over 10 random draws of 100 samples of
            ``estimate(set1, set2) + estimate(set2, set1)``.

    Prior filter: a sample is dropped when any inferred parameter lies
    outside [0, 1], or when both 'mass_1' and 'mass_2' are inferred and
    its mass_1 value is smaller than its mass_2 value.
    """

    def _inside_prior(samples, par_names):
        # Boolean mask over the columns (samples) of a
        # (n_parameters, n_samples) array; True marks samples to keep.
        rows = samples[:len(par_names)]
        keep = ~np.any((rows < 0.0) | (rows > 1.0), axis=0)
        # check m1 > m2; looked up by name so it also works when the
        # parameter names are a plain list rather than an array
        if 'mass_1' in par_names and 'mass_2' in par_names:
            mass_1 = rows[par_names.index('mass_1')]
            mass_2 = rows[par_names.index('mass_2')]
            keep &= ~(mass_1 < mass_2)
        return keep

    # Work with one row per parameter and one column per sample.
    set1 = sampset_1.T
    set2 = sampset_2.T

    # Remove samples outside of the prior mass distribution
    if samplers[0] == 'vitamin1' or samplers[1] == 'vitamin2':
        params_to_infer = list(self.params['inf_pars'])
        set1 = set1[:, _inside_prior(set1, params_to_infer)]
        set2 = set2[:, _inside_prior(set2, params_to_infer)]
        # Truncate both sets to the same number of surviving samples.
        n_common = min(set1.shape[1], set2.shape[1])
        set1 = set1[:, :n_common]
        set2 = set2[:, :n_common]

    # Keeps the log finite where a KDE evaluates to exactly zero.
    SMALL_CONSTANT = 1e-162

    def my_kde_bandwidth(obj, fac=1.0):
        """We use Scott's Rule, multiplied by a constant factor."""
        return np.power(obj.n, -1. / (obj.d + 4)) * fac

    if one_D:
        n_pars = len(self.params['inf_pars'])
        # Entries stay zero for parameters whose KL is not generated.
        kl_result_all = np.zeros((1, n_pars))
        for r in range(n_pars):
            # Explicit comparison kept from the original flag check.
            if self.params['gen_indi_KLs'] == True:
                p = gaussian_kde(set1[r], bw_method=my_kde_bandwidth)
                q = gaussian_kde(set2[r], bw_method=my_kde_bandwidth)

                # Compute KL Divergence, averaged over the samples of set1
                log_diff = np.log((p(set1[r]) + SMALL_CONSTANT) /
                                  (q(set1[r]) + SMALL_CONSTANT))
                kl_result = (1.0 / float(set1.shape[1])) * np.sum(log_diff)

                # compute symmetric kl, averaged over the samples of set2
                # (the terms are evaluated at set2's samples)
                anti_log_diff = np.log((q(set2[r]) + SMALL_CONSTANT) /
                                       (p(set2[r]) + SMALL_CONSTANT))
                anti_kl_result = ((1.0 / float(set2.shape[1])) *
                                  np.sum(anti_log_diff))

                kl_result_all[:, r] = kl_result + anti_kl_result
        return kl_result_all

    # k-NN estimate on random subsets; back to one row per sample.
    set1 = set1.T
    set2 = set2.T
    # `int` replaces the removed `np.int` alias.
    # NOTE(review): np.linspace defaults to 50 points, so indices are drawn
    # from 50 evenly spaced candidates rather than from every row --
    # confirm this is intended.
    candidate_idx = np.linspace(0, set1.shape[0] - 1, dtype=int)
    kl_result = []
    # Iterate over number of randomized sample slices
    for _ in range(10):
        rand_idx_kl = np.random.choice(candidate_idx, size=100)
        kl_result.append(
            estimate(set1[rand_idx_kl, :], set2[rand_idx_kl, :]) +
            estimate(set2[rand_idx_kl, :], set1[rand_idx_kl, :]))
    kl_result = np.mean(kl_result)
    return kl_result