Exemplo n.º 1
0
def preproc_data(data):
    """
    Basic data preprocessing: gene selection, normalization, log transform
    and truncated SVD.

    Args:
        data: 2-D matrix whose rows are genes (rows are selected by the
            indices returned from ``uncurl.max_variance_genes``). Columns
            are presumably cells - confirm against the caller.

    Returns:
        Array with one row per column of ``data`` and up to 8 SVD
        components per row.
    """
    import uncurl
    from uncurl.preprocessing import log1p, cell_normalize
    from sklearn.decomposition import TruncatedSVD
    gene_subset = uncurl.max_variance_genes(data)
    data_subset = data[gene_subset, :]
    # The SVD is fit on the transposed matrix, so genes are its features.
    # TruncatedSVD cannot extract more components than there are features,
    # so clamp the usual 8 components for very small gene subsets.
    n_components = min(8, data_subset.shape[0])
    tsvd = TruncatedSVD(n_components)
    data_tsvd = tsvd.fit_transform(log1p(cell_normalize(data_subset)).T)
    return data_tsvd
Exemplo n.º 2
0
def one_vs_rest_t(data,
                  labels,
                  eps=1.0,
                  calc_pvals=True,
                  test='t',
                  normalize=False,
                  use_fdr=False):
    """
    Computes 1-vs-rest t-test for all clusters and genes.

    If calc_pvals is False, the pvals will be all 0.

    test is either 't' or 'u'.
    't' indicates that the 2-sample t-test will be used,
    and 'u' is the Mann-Whitney U test.

    Args:
        data: matrix of shape (genes, cells); anything accepted by
            ``scipy.sparse.csc_matrix``.
        labels: sequence with one (hashable, sortable) label per cell.
        eps: forwarded unchanged to the underlying test function
            (presumably a pseudocount for the ratio - confirm there).
        calc_pvals (bool): forwarded to the underlying test function.
        test (str): 't' or 'u', see above.
        normalize (bool): if True, apply uncurl's cell_normalize first.
        use_fdr (bool): if True, replace the p-values of every label with
            their statsmodels ``fdrcorrection`` adjusted values.

    Returns:
        scores (dict): map of labels to tuples (array of gene ids, array of ratios) sorted by descending ratio
        pvals (dict): map of labels to tuples (array of gene ids, array of pvals) sorted by ascending pval

    Raises:
        ValueError: if ``test`` is not 't' or 'u', or if the number of
            labels differs from the number of cells.
    """
    # Fail early with a clear message instead of an UnboundLocalError on
    # test_func further down.
    if test not in ('t', 'u'):
        raise ValueError("test must be 't' or 'u', got {0!r}".format(test))
    data_csc = sparse.csc_matrix(data)
    genes, cells = data_csc.shape
    if cells != len(labels):
        raise ValueError('length of data not equal to length of labels')
    if normalize:
        from uncurl.preprocessing import cell_normalize
        data_csc = cell_normalize(data_csc)
    # map from label names to indices (sorted so the numbering is stable)
    labels_map = {label: idx for idx, label in enumerate(sorted(set(labels)))}
    labels_array = np.array([labels_map[label] for label in labels], dtype=int)
    if test == 't':
        test_func = csc_unweighted_1_vs_rest_t_test
    else:
        test_func = csc_unweighted_1_vs_rest_rank_sum_test
    scores, pvals = test_func(data_csc.data, data_csc.indices, data_csc.indptr,
                              labels_array, cells, genes, eps, calc_pvals)
    # map the integer cluster indices back to the original label set
    new_scores = {}
    new_pvals = {}
    for label, idx in labels_map.items():
        new_scores[label] = scores[idx]
        new_pvals[label] = pvals[idx]
    if use_fdr:
        from statsmodels.stats.multitest import fdrcorrection
        for label, entry in list(new_pvals.items()):
            corrected = fdrcorrection(entry[1])[1]
            try:
                # mutable containers are updated in place, as before
                entry[1] = corrected
            except TypeError:
                # tuples (the documented return type) are immutable, so
                # build a replacement pair instead of crashing
                new_pvals[label] = (entry[0], corrected)
    return new_scores, new_pvals
Exemplo n.º 3
0
def load_data(paths,
              genes_path,
              genes_subset_path=None,
              normalize=True,
              log=True):
    """
    Load every ``.mtx.gz`` matrix found in the given directories and
    combine them into one matrix.

    Each file is read with ``scipy.io.mmread``; its columns are treated as
    cells and every cell is labelled with the file name up to the first
    '.'. Files are visited in sorted name order within each directory so
    the resulting cell order is reproducible.

    Args:
        paths: iterable of directories to scan (non-recursively).
        genes_path: text file with gene names, read via ``np.loadtxt``.
        genes_subset_path: optional path forwarded to ``load_genes_subset``
            together with the matrix and the gene names.
        normalize (bool): apply uncurl's ``cell_normalize``
            (with ``multiply_means=False``).
        log (bool): apply uncurl's ``log1p``.

    Returns:
        data - cells x genes csr matrix
        genes - array of gene names
        all_labels - all cell labels

    Raises:
        ValueError: if no ``.mtx.gz`` file is found in any of ``paths``.
    """
    all_matrices = []
    all_labels = []
    for data_path in paths:
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # order of cells (and labels) identical between runs and machines.
        for f in sorted(os.listdir(data_path)):
            if not f.endswith('.mtx.gz'):
                continue
            file_path = os.path.join(data_path, f)
            label = f.split('.')[0]
            print(label)
            data = scipy.io.mmread(file_path)
            data = sparse.csc_matrix(data)
            all_matrices.append(data)
            all_labels.extend([label] * data.shape[1])
            print('num cells: ', data.shape[1])
    if not all_matrices:
        # nothing to stack: report the real problem instead of failing
        # somewhere downstream on an empty matrix
        raise ValueError('no .mtx.gz files found in paths')
    all_matrices = sparse.hstack(all_matrices)
    all_labels = np.array(all_labels)

    print('data matrix shape:', all_matrices.shape)
    print('number of cells:', all_labels.shape)

    from uncurl import preprocessing
    if normalize:
        all_matrices = preprocessing.cell_normalize(all_matrices,
                                                    multiply_means=False)
    if log:
        all_matrices = preprocessing.log1p(all_matrices)
    # transpose from genes x cells to cells x genes
    all_matrices = sparse.csr_matrix(all_matrices.T)
    print('matrices normalized')

    genes = np.loadtxt(genes_path, dtype=str)
    if genes_subset_path is not None:
        all_matrices, genes = load_genes_subset(all_matrices, genes,
                                                genes_subset_path)
    return all_matrices, genes, all_labels
Exemplo n.º 4
0
def pairwise_t(data,
               w_or_labels,
               eps=1.0,
               calc_pvals=True,
               normalize=False,
               use_fdr=False):
    """
    Computes pairwise t-test between all pairs of clusters and all genes.

    If calc_pvals is False, the pvals will be all 0.

    Args:
        data: matrix of shape (genes, cells); anything accepted by
            ``scipy.sparse.csc_matrix``.
        w_or_labels: either a 2-D weight array (any object whose ``shape``
            has length 2), which selects the weighted test, or a 1-D
            sequence (array or plain list) with one label per cell, which
            selects the unweighted test.
        eps: forwarded to the unweighted test only.
        calc_pvals (bool): forwarded to the underlying test function.
        normalize (bool): if True, apply uncurl's cell_normalize first.
        use_fdr (bool): if True, apply statsmodels ``fdrcorrection`` to the
            p-values of every cluster pair separately.

    Returns:
        ratios, pvals - two arrays of shape (k, k, genes), where k is the number of clusters.

    Raises:
        ValueError: if labels are given and their number differs from the
            number of cells.
    """
    data_csc = sparse.csc_matrix(data)
    genes, cells = data_csc.shape
    # Plain lists have no .shape attribute; treat them as labels rather
    # than failing with an AttributeError.
    w_shape = getattr(w_or_labels, 'shape', None)
    is_weighted = w_shape is not None and len(w_shape) == 2
    if not is_weighted and cells != len(w_or_labels):
        raise ValueError('length of data not equal to length of labels')
    if normalize:
        from uncurl.preprocessing import cell_normalize
        data_csc = cell_normalize(data_csc)
    if is_weighted:
        scores, pvals = csc_weighted_t_test(data_csc.data, data_csc.indices,
                                            data_csc.indptr, w_or_labels,
                                            cells, genes, calc_pvals)
    else:
        # number the labels in sorted order so cluster indices are stable
        labels_map = {label: idx
                      for idx, label in enumerate(sorted(set(w_or_labels)))}
        labels_array = np.array([labels_map[label] for label in w_or_labels],
                                dtype=int)
        scores, pvals = csc_unweighted_t_test(data_csc.data, data_csc.indices,
                                              data_csc.indptr, labels_array,
                                              cells, genes, eps, calc_pvals)
        # TODO: re-map keys? or would that mess up the c-score calculation?
    if use_fdr:
        from statsmodels.stats.multitest import fdrcorrection
        new_pvals = np.zeros(pvals.shape)
        for k1 in range(pvals.shape[0]):
            for k2 in range(pvals.shape[1]):
                new_pvals[k1, k2, :] = fdrcorrection(pvals[k1, k2, :])[1]
        pvals = new_pvals
    return scores, pvals
Exemplo n.º 5
0
def preproc_data(data, gene_subset=False, **kwargs):
    """
    Reduce a (genes, cells) matrix to a small SVD embedding, as a
    preparation step for the gap score.

    When ``gene_subset`` is truthy, only the rows picked by
    ``uncurl.max_variance_genes`` are kept. The matrix is then
    cell-normalized, log1p-transformed, transposed and passed through a
    truncated SVD.

    The result has shape (cells, c) with c = min(8, number of genes - 1);
    the choice of 8 is arbitrary. Extra keyword arguments are accepted and
    ignored.
    """
    import uncurl
    from uncurl.preprocessing import log1p, cell_normalize
    from sklearn.decomposition import TruncatedSVD
    if gene_subset:
        selected_rows = uncurl.max_variance_genes(data)
        matrix = data[selected_rows, :]
    else:
        matrix = data
    n_components = min(8, matrix.shape[0] - 1)
    reducer = TruncatedSVD(n_components)
    log_normalized = log1p(cell_normalize(matrix))
    return reducer.fit_transform(log_normalized.T)
def identify_mnns(data1, data2, k=20, n_components=20, metric='cosine'):
    """
    Identify mutual nearest neighbors in the two datasets.

    Both datasets are stacked, cell-normalized, log-transformed and
    projected with a randomized SVD. In that shared space, cell i of data1
    and cell j of data2 are mutual nearest neighbors when j is among the k
    nearest data2 cells of i and i is among the k nearest data1 cells of j.

    Args:
        data1, data2: sparse (csc) matrices of shape genes x cells. data1 is the target matrix and data2 is the reference matrix.
        k (int): number of nearest neighbors
        n_components (int): number of components for tsvd
        metric (str): cosine or euclidean

    Returns:
        a list of lists: entry i holds the (sorted) indices of the data2
        cells that are MNNs of cell i in data1

    Raises:
        KeyError: if metric is neither 'cosine' nor 'euclidean'.
    """
    n_cells1 = data1.shape[1]
    # 1. compute tsvd on sparse matrices
    data_combined = scipy.sparse.hstack([data1, data2])
    U, Sigma, _ = randomized_svd(log1p(cell_normalize(data_combined)).T,
                                 n_components)
    # data_reduced is (cells, n_components): rows are cells, with the data1
    # cells first, so the two datasets are separated by slicing ROWS.
    data_reduced = U*Sigma
    reduced1 = data_reduced[:n_cells1, :]
    reduced2 = data_reduced[n_cells1:, :]
    # 2. build nearest-neighbor indices using nmslib
    # space can be cosinesimil or l2
    metric_to_space = {'cosine': 'cosinesimil', 'euclidean': 'l2'}
    space = metric_to_space[metric]
    index1 = nmslib.init(method='hnsw', space=space)
    index1.addDataPointBatch(reduced1)
    # the index has to be built before it can be queried
    index1.createIndex()
    index2 = nmslib.init(method='hnsw', space=space)
    index2.addDataPointBatch(reduced2)
    index2.createIndex()
    # 3. knnQueryBatch returns one (ids, distances) pair per query row;
    # keep all returned ids for each cell.
    # neighbors1[i]: data2 cells closest to data1 cell i
    neighbors1 = [set(ids.tolist())
                  for ids, _ in index2.knnQueryBatch(reduced1, k=k)]
    # neighbors2[j]: data1 cells closest to data2 cell j
    neighbors2 = [set(ids.tolist())
                  for ids, _ in index1.knnQueryBatch(reduced2, k=k)]
    # 4. keep only the pairs that are neighbors in both directions
    mnns = []
    for i, candidates in enumerate(neighbors1):
        mnns.append(sorted(p for p in candidates if i in neighbors2[p]))
    return mnns
def bnpy_select_clusters(data, max_cells=50000):
    """
    Choose a number of clusters by fitting a Gaussian DP mixture model
    with bnpy on an 8-component SVD embedding of the data.

    Args:
        data: matrix of shape genes x cells
        max_cells: if data has more cells than this, a random sample of
            max_cells cells (drawn with random.sample) is used instead.

    Returns:
        selected k (last entry of the run's 'K_history'), and the cluster
            label (argmax of the responsibilities) of every cell that was
            used for the fit.
    """
    n_cells = data.shape[1]
    selected_cell_ids = list(range(n_cells))
    if n_cells > max_cells:
        import random
        selected_cell_ids = random.sample(selected_cell_ids, max_cells)
    cells_used = data[:, selected_cell_ids]
    log_normalized = log1p(cell_normalize(cells_used))
    embedding = TruncatedSVD(8).fit_transform(log_normalized.T)
    bnpy_dataset = bnpy.data.XData(embedding)
    # keyword options for bnpy.run (doSaveToDisk is deliberately left unset)
    run_options = {
        'doWriteStdOut': False,
        'output_path': './temp',
        'nLap': 100,
        'nTask': 1,
        'nBatch': 1,
        'sF': 0.1,
        'ECovMat': 'eye',
        'K': 10,
        'initname': 'randexamples',
        'moves': 'birth,merge,shuffle',
        'm_startLap': 5,
        'b_startLap': 2,
        'b_Kfresh': 4,
    }
    trained_model, info_dict = bnpy.run(bnpy_dataset, 'DPMixtureModel',
                                        'Gauss', 'memoVB', **run_options)
    selected_k = info_dict['K_history'][-1]
    local_params = trained_model.calc_local_params(bnpy_dataset)
    return selected_k, local_params['resp'].argmax(1)
Exemplo n.º 8
0
if __name__ == '__main__':
    # Demo script: train an UncurlNet on a log-normalized, gene-filtered
    # dataset and compute the product of the learned M and W factors.
    # NOTE(review): objective, KMeans and nmi are imported but not used in
    # the part of the script visible here - presumably used further down.
    import uncurl
    from uncurl.state_estimation import objective
    from uncurl.preprocessing import cell_normalize, log1p
    import scipy.io
    from sklearn.cluster import KMeans
    from sklearn.metrics.cluster import normalized_mutual_info_score as nmi

    # The .mat file must provide a 'labels' entry and a sparse 'data' entry
    # (toarray() is called on it).
    mat = scipy.io.loadmat('data/10x_pooled_400.mat')
    actual_labels = mat['labels'].squeeze()
    X = mat['data'].toarray().astype(np.float32)
    # Rows of X are genes: the indices returned here select rows below.
    # presumably the extra arguments are nbins=5 and frac=0.2 - confirm
    # against the uncurl documentation.
    genes = uncurl.max_variance_genes(X, 5, 0.2)
    X_subset = X[genes, :]

    X_log_norm = log1p(cell_normalize(X_subset)).astype(np.float32)
    # presumably the second positional argument (8) is the number of
    # clusters/components - confirm against the UncurlNet constructor.
    uncurl_net = UncurlNet(X_log_norm,
                           8,
                           use_reparam=False,
                           use_decoder=False,
                           use_batch_norm=True,
                           hidden_layers=2,
                           hidden_units=200,
                           loss='mse')
    # Copy of M taken before any training is run.
    m_init = torch.tensor(uncurl_net.M)

    # NOTE(review): None is passed as the first argument of both training
    # calls - presumably the network then uses the data given at
    # construction time; verify in UncurlNet.
    uncurl_net.pre_train_encoder(None, lr=1e-3, n_epochs=20, log_interval=10)
    uncurl_net.train_model(None, lr=1e-3, n_epochs=50, log_interval=10)
    # Swap the two axes of the encoder output so that m @ w is defined.
    w = uncurl_net.w_net.get_w(X_log_norm).transpose(1, 0)
    m = uncurl_net.w_net.get_m()
    mw = torch.matmul(m, w)
Exemplo n.º 9
0
    )

    genes_seqwell = table_seqwell.index
    genes_10x = table_10x.index

    genes_set = set(genes_seqwell).intersection(genes_10x)

    genes_list = list(genes_set)
    data_seqwell = table_seqwell.loc[genes_list].values
    data_10x = table_10x.loc[genes_list].values

    batch_list = [0] * data_seqwell.shape[1]
    batch_list += [1] * data_10x.shape[1]

    data_total = np.hstack([data_seqwell, data_10x])
    X_log_norm = log1p(cell_normalize(data_total)).astype(np.float32)
    """
    # TODO: create networks
    net1 = UncurlNet(X_log_norm, 10,
            use_reparam=False, use_decoder=False,
            use_batch_norm=True,
            hidden_layers=2,
            hidden_units=400,
            loss='mse')

    net1.train_1(X_log_norm, log_interval=10)
    # TODO: test clustering?
    w = net1.w_net.get_w(X_log_norm).transpose(1, 0)
    print(w.argmax(0))
    w = w.numpy().T
    tsne = TSNE(2)
Exemplo n.º 10
0
 def testCellNormalize(self):
     """cell_normalize must give the same result for sparse and dense input."""
     from_sparse = cell_normalize(self.data_sparse)
     from_dense = cell_normalize(self.data_dense)
     # Frobenius norm of the element-wise difference between both results
     residual = from_dense - from_sparse.toarray()
     frobenius_gap = np.sqrt(np.sum(residual**2))
     self.assertTrue(frobenius_gap < 1e-6)