def find_alignments_table(datasets, knn=KNN, approx=APPROX, verbose=VERBOSE,
                          prenormalized=False, geosketch=False,
                          geosketch_max=20000):
    """Find mutual-nearest-neighbor (MNN) matches between all dataset pairs.

    Parameters
    ----------
    datasets : list of array-like
        One matrix of cells-by-features per dataset.
    knn : int
        Number of nearest neighbors used when filling the match table.
    approx : bool
        Use approximate nearest-neighbor search.
    verbose : int
        At > 1, also build and print a pairwise match-percentage matrix.
    prenormalized : bool
        Skip per-cell L2 normalization when the caller already did it.
    geosketch : bool
        Match only cells inside geometric sketches of each dataset, then
        translate match indices back to the original datasets.
    geosketch_max : int
        Downsample datasets larger than this before matching.

    Returns
    -------
    (table1, table_print, matches) where table1 maps (i, j) with i < j to the
    best match percentage between datasets i and j, table_print is the dense
    percentage matrix (None unless verbose > 1), and matches maps (i, j) to
    the set/list of matched cell-index pairs.
    """
    if not prenormalized:
        datasets = [ normalize(ds, axis=1) for ds in datasets ]

    if geosketch:
        # Only match cells in geometric sketches.
        from ample import gs, uniform
        global gs_idxs
        if gs_idxs is None:
            # Cache sketch indices at module level so repeated calls reuse
            # the same downsampling of each dataset.
            gs_idxs = [
                uniform(X, geosketch_max, replace=False)
                if X.shape[0] > geosketch_max else range(X.shape[0])
                for X in datasets
            ]
        datasets = [ datasets[i][gs_idx, :]
                     for i, gs_idx in enumerate(gs_idxs) ]

    n_datasets = len(datasets)

    # Fill the directed match table: each dataset against everything before
    # it and everything after it. (i > 0 / i + 1 < n replace the original
    # len(datasets[:i]) > 0 checks without building throwaway slices.)
    table = {}
    for i in range(n_datasets):
        if i > 0:
            fill_table(table, i, datasets[i], datasets[:i], knn=knn,
                       approx=approx)
        if i + 1 < n_datasets:
            fill_table(table, i, datasets[i], datasets[i+1:], knn=knn,
                       base_ds=i+1, approx=approx)

    # Count all mutual nearest neighbors between datasets.
    matches = {}
    table1 = {}
    if verbose > 1:
        table_print = np.zeros((n_datasets, n_datasets))

    for i in range(n_datasets):
        for j in range(i + 1, n_datasets):
            if (i, j) not in table or (j, i) not in table:
                continue

            # A pair is an MNN match only if it appears in both directions.
            match_ij = table[(i, j)]
            match_ji = { (b, a) for a, b in table[(j, i)] }
            mutual = match_ij & match_ji
            matches[(i, j)] = mutual

            # Match percentage: fraction of distinct matched cells, taken on
            # whichever side gives the larger ratio.
            table1[(i, j)] = max(
                float(len({ idx for idx, _ in mutual })) / datasets[i].shape[0],
                float(len({ idx for _, idx in mutual })) / datasets[j].shape[0]
            )
            if verbose > 1:
                table_print[i, j] += table1[(i, j)]

            if geosketch:
                # Translate matches within geometric sketches to original
                # indices.
                matches[(i, j)] = [
                    (gs_idxs[i][a], gs_idxs[j][b]) for a, b in mutual
                ]

    if verbose > 1:
        print(table_print)
        return table1, table_print, matches
    else:
        return table1, None, matches
def integrate_sketch(datasets_dimred, integration_fn, integration_fn_args=None,
                     sampling_fn=gs, N=2000, n_iter=1):
    """Integrate full datasets by integrating small sketches of them and
    propagating the learned correction to every cell.

    Parameters
    ----------
    datasets_dimred : list of ndarray
        Dimensionality-reduced datasets (cells x dims).
    integration_fn : callable
        Function that integrates a list of datasets and returns a list of
        corrected datasets of the same shapes.
    integration_fn_args : dict, optional
        Extra keyword arguments forwarded to ``integration_fn``.
        (Was a mutable ``{}`` default; now None to avoid cross-call sharing.)
    sampling_fn : callable
        Sketching function used alongside uniform sampling. NOTE: the
        original body ignored this parameter and always called ``gs``; it is
        now honored (identical behavior for the default ``sampling_fn=gs``).
    N : int
        Sketch size per sampler per dataset.
    n_iter : int
        Number of integrate-and-propagate rounds. Must be >= 1.

    Returns
    -------
    list of ndarray
        Integrated datasets, one per input dataset, full size.
    """
    if integration_fn_args is None:
        integration_fn_args = {}

    # Sketch each dataset: union of the requested sampler and a uniform
    # sample so both rare and common regions are represented.
    sketch_idxs = [
        sorted(set(sampling_fn(X, N, replace=False)) |
               set(uniform(X, N, replace=False)))
        for X in datasets_dimred
    ]
    datasets_sketch = [ X[idx] for X, idx in zip(datasets_dimred, sketch_idxs) ]

    for _ in range(n_iter):
        # Integrate the sketches only (pass a shallow copy so the
        # integration function cannot mutate our sketch list).
        datasets_int = integration_fn(datasets_sketch[:],
                                      **integration_fn_args)

        # (A dead per-sketch label vector computed here in the original was
        # never read and has been removed.)

        # Propagate the sketch-level correction to all cells: each full-data
        # cell borrows the correction of its nearest sketch cells.
        for i, (X_dimred, X_sketch) in enumerate(
                zip(datasets_dimred, datasets_sketch)):
            X_int = datasets_int[i]

            # For each sketch cell, find its 3 nearest full-data cells.
            neigh = NearestNeighbors(n_neighbors=3).fit(X_dimred)
            _, neigh_idx = neigh.kneighbors(X_sketch)

            ds_idxs, ref_idxs = [], []
            for ref_idx in range(neigh_idx.shape[0]):
                for k_idx in range(neigh_idx.shape[1]):
                    ds_idxs.append(neigh_idx[ref_idx, k_idx])
                    ref_idxs.append(ref_idx)

            # transform() estimates a per-cell bias from the matched pairs.
            bias = transform(X_dimred, X_int, ds_idxs, ref_idxs, 15,
                             batch_size=1000)

            datasets_int[i] = X_dimred + bias

    return datasets_int
# NOTE(review): this first statement appears to be the tail of an enclosing
# loop (over the datasets in `Xs`, binding `i` and `X_dimred`) whose header
# lies outside this chunk -- confirm its indentation against the full file.
labels += list(np.zeros(X_dimred.shape[0]) + i)

# Stack all per-dataset embeddings into one matrix; `labels` records which
# dataset each row came from.
X_dimred = np.concatenate(Xs)
cell_labels = np.array(labels, dtype=int)

# Expected cluster proportions: abundances 1x, 1/10x, 1/100x, normalized to
# sum to one.
expected = np.array([1., 1. / 10, 1. / 100])
expected = np.array(expected) / sum(expected)
print(expected)

from ample import gs, gs_gap, srs, uniform

# Compare how three sampling strategies (gs_gap, srs, uniform; 3000 cells
# each, with replacement) preserve cluster composition.
samp_idx = gs_gap(X_dimred, 3000, replace=True)
report_cluster_counts(cell_labels[samp_idx])
print('')

samp_idx = srs(X_dimred, 3000, replace=True)
report_cluster_counts(cell_labels[samp_idx])
print('')

samp_idx = uniform(X_dimred, 3000, replace=True)
report_cluster_counts(cell_labels[samp_idx])

# NOTE(review): exit() makes the experiments() call below unreachable --
# presumably a deliberate debugging short-circuit; confirm before removing.
exit()

experiments(
    X_dimred, NAMESPACE,
    cell_labels=cell_labels,
    #rare=True, rare_label=2,
    #entropy=True,
    kl_divergence=True, expected=expected,
    #max_min_dist=True
)
from ample import gs, gs_gap, uniform

# Geometric sketch of 110 cells: write the sampled expression submatrix and
# report the sample's cluster composition.
gs_idx = gs(X_dimred, 110, replace=False)
write_table(X[gs_idx, :].toarray(), genes,
            'data/pseudotime/' + NAMESPACE + '_gs')
report_cluster_counts(cell_labels[gs_idx])

# Metadata file mapping each sampled cell to its original cell name.
# NOTE(review): `idx not in gs_idx` is a linear scan per row (O(n*m) total);
# fine at this size, but a set would be cheaper -- confirm gs_idx's type.
with open('data/pseudotime/mono_macro_meta_gs.txt', 'w') as of:
    of.write('Label\n')
    i = 0
    for idx in range(X.shape[0]):
        if idx not in gs_idx:
            continue
        of.write('mono_macro_gs{}\t{}\n'.format(i, cell_names[idx]))
        i += 1

# Same export pipeline with a uniform random sample of the same size, for
# comparison against the geometric sketch.
uni_idx = uniform(X_dimred, 110, replace=False)
write_table(X[uni_idx, :].toarray(), genes,
            'data/pseudotime/' + NAMESPACE + '_uni')
report_cluster_counts(cell_labels[uni_idx])

with open('data/pseudotime/mono_macro_meta_uni.txt', 'w') as of:
    of.write('Label\n')
    i = 0
    for idx in range(X.shape[0]):
        if idx not in uni_idx:
            continue
        of.write('mono_macro_uni{}\t{}\n'.format(i, cell_names[idx]))
        i += 1

# Gene annotation file for the pseudotime tool; the loop body continues past
# the end of this chunk.
with open('data/pseudotime/mono_macro_genes.txt', 'w') as of:
    of.write('gene_short_name\n')
    for gene in genes: