Exemplo n.º 1
0
def find_alignments_table(datasets, knn=KNN, approx=APPROX, verbose=VERBOSE,
                          prenormalized=False, geosketch=False,
                          geosketch_max=20000):
    """Find mutual nearest neighbor (MNN) matches between all dataset pairs.

    Parameters
    ----------
    datasets : list of array-like
        One observations-by-features matrix per dataset.
    knn : int
        Number of nearest neighbors to query when matching.
    approx : bool
        Whether to use approximate nearest-neighbor search.
    verbose : int
        When > 1, also build and print a dense pairwise match-fraction table.
    prenormalized : bool
        When False, row-normalize each dataset before matching.
    geosketch : bool
        When True, match only cells inside geometric-sketch subsamples and
        translate the resulting matches back to original cell indices.
    geosketch_max : int
        Maximum number of cells kept per dataset when sketching.

    Returns
    -------
    tuple
        ``(table1, table_print, matches)`` where ``table1`` maps each pair
        ``(i, j)`` with ``i < j`` to the larger fraction of matched cells on
        either side, ``table_print`` is the dense version of ``table1`` when
        ``verbose > 1`` (otherwise ``None``), and ``matches`` maps ``(i, j)``
        to the collection of mutual match index pairs.
    """
    if not prenormalized:
        datasets = [ normalize(ds, axis=1) for ds in datasets ]

    if geosketch:
        # Only match cells in geometric sketches.
        from ample import gs, uniform
        global gs_idxs
        if gs_idxs is None:
            # Subsample datasets over the sketch budget; keep the rest whole.
            # Cached in the module-level gs_idxs so repeated calls reuse it.
            gs_idxs = [ uniform(X, geosketch_max, replace=False)
                        if X.shape[0] > geosketch_max else range(X.shape[0])
                        for X in datasets ]
        datasets = [ datasets[i][gs_idx, :] for i, gs_idx in enumerate(gs_idxs) ]

    # Query nearest neighbors from each dataset into all the others.
    table = {}
    n_datasets = len(datasets)
    for i in range(n_datasets):
        # Avoid slicing just to test emptiness: i > 0 means there are
        # earlier datasets, i + 1 < n_datasets means there are later ones.
        if i > 0:
            fill_table(table, i, datasets[i], datasets[:i], knn=knn,
                       approx=approx)
        if i + 1 < n_datasets:
            fill_table(table, i, datasets[i], datasets[i+1:],
                       knn=knn, base_ds=i+1, approx=approx)

    # Count all mutual nearest neighbors between datasets.
    matches = {}
    table1 = {}
    if verbose > 1:
        table_print = np.zeros((n_datasets, n_datasets))
    for i in range(n_datasets):
        for j in range(i + 1, n_datasets):
            if (i, j) not in table or (j, i) not in table:
                continue
            match_ij = table[(i, j)]
            # Reverse the (j -> i) matches so both directions are comparable.
            match_ji = { (b, a) for a, b in table[(j, i)] }
            # Keep only matches that are mutual in both directions.
            matches[(i, j)] = match_ij & match_ji

            # Fraction of distinct cells matched, taking the larger side.
            table1[(i, j)] = max(
                float(len({ idx for idx, _ in matches[(i, j)] })) /
                datasets[i].shape[0],
                float(len({ idx for _, idx in matches[(i, j)] })) /
                datasets[j].shape[0]
            )
            if verbose > 1:
                table_print[i, j] += table1[(i, j)]

            if geosketch:
                # Translate matches within geometric sketches to original indices.
                matches[(i, j)] = [
                    (gs_idxs[i][a], gs_idxs[j][b])
                    for a, b in matches[(i, j)]
                ]

    if verbose > 1:
        print(table_print)
        return table1, table_print, matches
    return table1, None, matches
Exemplo n.º 2
0
def integrate_sketch(datasets_dimred,
                     integration_fn,
                     integration_fn_args=None,
                     sampling_fn=gs,
                     N=2000,
                     n_iter=1):
    """Integrate full datasets by integrating their geometric sketches.

    A sketch of at most 2*N cells per dataset (the union of a
    ``sampling_fn`` sample and a uniform sample) is integrated with
    ``integration_fn``; the learned correction is then transferred to every
    cell by averaging the integration bias of each cell's nearest sketch
    neighbors.

    Parameters
    ----------
    datasets_dimred : list of np.ndarray
        Dimension-reduced datasets, one matrix per dataset.
    integration_fn : callable
        Takes a list of datasets (plus ``integration_fn_args``) and returns
        the integrated list.
    integration_fn_args : dict, optional
        Extra keyword arguments forwarded to ``integration_fn``.
    sampling_fn : callable
        Sketching function with signature ``(X, N, replace=...)``; its
        sample is unioned with a uniform sample to form each sketch.
    N : int
        Target sample size per sampling method.
    n_iter : int
        Number of times to re-run ``integration_fn`` on the sketches.

    Returns
    -------
    list of np.ndarray
        One integrated matrix per input dataset.
    """
    # Avoid the shared mutable-default-argument pitfall.
    if integration_fn_args is None:
        integration_fn_args = {}

    # Sketch each dataset: union of the requested sampler and a uniform
    # sample, so rare and common regions are both represented. (The
    # original hard-coded gs here, silently ignoring sampling_fn.)
    sketch_idxs = [
        sorted(set(sampling_fn(X, N, replace=False)) |
               set(uniform(X, N, replace=False)))
        for X in datasets_dimred
    ]
    datasets_sketch = [X[idx] for X, idx in zip(datasets_dimred, sketch_idxs)]

    # Integrate the (much smaller) sketches. The shallow copy protects the
    # sketch list itself from in-place modification by integration_fn.
    for _ in range(n_iter):
        datasets_int = integration_fn(datasets_sketch[:],
                                      **integration_fn_args)

    # Propagate the sketch-level correction to all cells.
    for i, (X_dimred,
            X_sketch) in enumerate(zip(datasets_dimred, datasets_sketch)):
        X_int = datasets_int[i]

        # For each sketch cell, find its 3 nearest full-dataset neighbors
        # so the integration bias can be spread to unsketched cells.
        neigh = NearestNeighbors(n_neighbors=3).fit(X_dimred)
        _, neigh_idx = neigh.kneighbors(X_sketch)

        # Flatten the neighbor matrix into parallel (dataset index,
        # sketch index) lists expected by transform().
        ds_idxs, ref_idxs = [], []
        for ref_idx in range(neigh_idx.shape[0]):
            for k_idx in range(neigh_idx.shape[1]):
                ds_idxs.append(neigh_idx[ref_idx, k_idx])
                ref_idxs.append(ref_idx)

        bias = transform(X_dimred,
                         X_int,
                         ds_idxs,
                         ref_idxs,
                         15,
                         batch_size=1000)

        datasets_int[i] = X_dimred + bias

    return datasets_int
Exemplo n.º 3
0
        # NOTE(review): fragment — the enclosing loop header (presumably
        # iterating over Xs with index i) lies above this excerpt.
        # Appends the dataset index i as the label for every row of X_dimred.
        labels += list(np.zeros(X_dimred.shape[0]) + i)

    # Stack all datasets into one matrix; labels become per-cell cluster ids.
    X_dimred = np.concatenate(Xs)
    cell_labels = np.array(labels, dtype=int)

    # Ground-truth cluster proportions: 1 : 1/10 : 1/100, normalized to sum
    # to one, for comparison against each sampler's observed counts.
    expected = np.array([1., 1. / 10, 1. / 100])
    expected = np.array(expected) / sum(expected)

    print(expected)

    # Compare how three samplers preserve rare clusters at 3000 cells
    # (sampled with replacement): geometric-sketch gap, SRS, and uniform.
    from ample import gs, gs_gap, srs, uniform
    samp_idx = gs_gap(X_dimred, 3000, replace=True)
    report_cluster_counts(cell_labels[samp_idx])
    print('')
    samp_idx = srs(X_dimred, 3000, replace=True)
    report_cluster_counts(cell_labels[samp_idx])
    print('')
    samp_idx = uniform(X_dimred, 3000, replace=True)
    report_cluster_counts(cell_labels[samp_idx])
    # NOTE(review): this exit() short-circuits the function, making the
    # experiments(...) call below unreachable — looks like a debugging
    # leftover; confirm before removing.
    exit()
    experiments(
        X_dimred,
        NAMESPACE,
        cell_labels=cell_labels,
        #rare=True, rare_label=2,
        #entropy=True,
        kl_divergence=True,
        expected=expected,
        #max_min_dist=True
    )
Exemplo n.º 4
0
    # NOTE(review): fragment — the enclosing function definition (and the
    # origins of X, X_dimred, genes, cell_names, cell_labels, NAMESPACE)
    # lie outside this excerpt.
    from ample import gs, gs_gap, uniform

    # Geometric sketch of 110 cells: write their expression table and
    # report how the sample distributes across clusters.
    gs_idx = gs(X_dimred, 110, replace=False)
    write_table(X[gs_idx, :].toarray(), genes, 'data/pseudotime/' + NAMESPACE + '_gs')
    report_cluster_counts(cell_labels[gs_idx])

    # Metadata file mapping each sketched cell to its original cell name.
    # NOTE(review): `idx not in gs_idx` scans the sample per cell (O(n)
    # membership test on, presumably, an array/list) — a set would be
    # faster; verify gs_idx's type before changing.
    with open('data/pseudotime/mono_macro_meta_gs.txt', 'w') as of:
        of.write('Label\n')
        i = 0
        for idx in range(X.shape[0]):
            if idx not in gs_idx:
                continue
            of.write('mono_macro_gs{}\t{}\n'.format(i, cell_names[idx]))
            i += 1
    
    # Same export for a uniform sample of 110 cells, for comparison.
    uni_idx = uniform(X_dimred, 110, replace=False)
    write_table(X[uni_idx, :].toarray(), genes, 'data/pseudotime/' + NAMESPACE + '_uni')
    report_cluster_counts(cell_labels[uni_idx])
    
    with open('data/pseudotime/mono_macro_meta_uni.txt', 'w') as of:
        of.write('Label\n')
        i = 0
        for idx in range(X.shape[0]):
            if idx not in uni_idx:
                continue
            of.write('mono_macro_uni{}\t{}\n'.format(i, cell_names[idx]))
            i += 1

    # Gene annotation file; loop body continues past this excerpt.
    with open('data/pseudotime/mono_macro_genes.txt', 'w') as of:
        of.write('gene_short_name\n')
        for gene in genes: