Example #1
 def test_run_se(self):
     w, h, cost = uncurl.run_state_estimation(self.data, 2, dist='log-norm')
     labs = h.argmax(0)
     self.assertTrue(uncurl.evaluation.purity(labs, self.labs) > 0.85)
     w1, h1, cost = uncurl.run_state_estimation(self.data,
                                                2,
                                                dist='gaussian')
     labs = h1.argmax(0)
     self.assertTrue(uncurl.evaluation.purity(labs, self.labs) > 0.8)
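
For context, a minimal setUp that could produce the self.data and self.labs fixtures this test relies on, sketched with uncurl's simulation helpers (the same ones used in Example #8); the shapes and class name are illustrative, not from the original test:

import unittest
import uncurl
from uncurl import simulation

class StateEstimationTest(unittest.TestCase):
    def setUp(self):
        # two simulated states over 200 genes and 20 cells
        sim_m, sim_w = simulation.generate_poisson_states(2, 200, 20)
        self.data = simulation.generate_state_data(sim_m, sim_w)
        # ground-truth label of each cell is its dominant state
        self.labs = sim_w.argmax(0)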
Example #2
def relabel(data, m_old, w_old, cell_ids, cell_labels, **uncurl_params):
    """
    Re-runs UNCURL on the dataset, after re-initializing W

    Args:
        data (array): genes x cells
        m_old (array): genes x k
        w_old (array): k x cells
        cell_ids (array): 1d array of cell ids
        cell_labels (array): 1d array of new cell labels
        **uncurl_params: optional kwargs to pass to uncurl

    Returns: M_new, W_new
    """
    k = m_old.shape[1]
    init_weights = w_old.argmax(0)
    for c in cell_ids:
        true_label = cell_labels[c]
        init_weights[c] = true_label
    m_new, w_new, ll_new = uncurl.run_state_estimation(
        data,
        clusters=k,
        #init_means=m_old,
        init_weights=init_weights,
        **uncurl_params)
    return m_new, w_new
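
A minimal usage sketch for relabel, assuming m and w come from an earlier uncurl run and that five cells should be pinned to cluster 1 before re-running (data and all indices here are illustrative):

import numpy as np
import uncurl

m, w, ll = uncurl.run_state_estimation(data, clusters=8)  # data: genes x cells
cell_ids = np.array([0, 1, 2, 3, 4])
cell_labels = np.full(w.shape[1], -1)  # only the entries at cell_ids are read
cell_labels[cell_ids] = 1
m_new, w_new = relabel(data, m, w, cell_ids, cell_labels, max_iters=20)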
Example #3
def run_uncurl(data_subset, uncurl_kwargs, **kwargs):
    if uncurl_kwargs is None:
        uncurl_kwargs = {}
    m, w, ll = uncurl.run_state_estimation(data_subset,
                                           clusters=kwargs['clusters'],
                                           **uncurl_kwargs)
    return m, w
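
A quick sketch of calling this wrapper (names assumed): clusters is read from **kwargs, while everything else travels in the uncurl_kwargs dict.

m, w = run_uncurl(data_subset, {'max_iters': 20, 'inner_max_iters': 50}, clusters=8)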
Example #4
 def test_leiden(self):
     m, w, ll = uncurl.run_state_estimation(self.data_subset,
                                            8,
                                            max_iters=20,
                                            inner_max_iters=50)
     print('nmi basic: ' + str(nmi(self.labels, w.argmax(0))))
     g = clustering_methods.create_graph(w.T, metric='cosine')
     leiden_clustering = clustering_methods.run_leiden(g)
     self.assertTrue(nmi(self.labels, leiden_clustering) >= 0.7)
     louvain_clustering = clustering_methods.run_louvain(g)
     self.assertTrue(nmi(self.labels, louvain_clustering) >= 0.7)
Example #5
def merge_clusters(data,
                   m_old,
                   w_old,
                   clusters_to_merge,
                   rerun_uncurl=True,
                   **uncurl_params):
    """
    Merges a given list of clusters, returning the results of an uncurl re-initialization.

    Merging is done by averaging m_old over the clusters to be merged,
    and summing over w_old (and re-normalizing).

    Args:
        data (array): genes x cells
        m_old (array): genes x k
        w_old (array): k x cells
        clusters_to_merge (list): list of cluster ids to merge
        rerun_uncurl (boolean): if True, re-runs uncurl with a new initialization.
            If false, this just returns the merged cluster matrix.
        **uncurl_params: optional kwargs to pass to uncurl

    Returns: M_new, W_new
    """
    # TODO: this doesn't work for merging more than 2 clusters
    k = m_old.shape[1] - len(clusters_to_merge) + 1
    m_init_new_col = np.zeros(m_old.shape[0])
    w_init_new_row = np.zeros(w_old.shape[1])
    clusters_to_keep = np.array([True for i in range(m_old.shape[1])])
    clusters_to_keep[list(clusters_to_merge)] = False
    m_init = m_old[:, clusters_to_keep]
    w_init = w_old[clusters_to_keep, :]
    for c in clusters_to_merge:
        m_init_new_col += m_old[:, c]
        w_init_new_row += w_old[c, :]
    m_init_new_col = m_init_new_col / len(clusters_to_merge)
    m_init_new_col = m_init_new_col.reshape((m_old.shape[0], 1))
    w_init_new_row = w_init_new_row.reshape((1, w_old.shape[1]))
    m_init = np.hstack([
        m_init[:, 0:clusters_to_merge[0]], m_init_new_col,
        m_init[:, clusters_to_merge[0]:]
    ])
    w_init = np.vstack([
        w_init[0:clusters_to_merge[0], :], w_init_new_row,
        w_init[clusters_to_merge[0]:, :]
    ])
    w_init = w_init / w_init.sum(0)
    if rerun_uncurl:
        m_new, w_new, ll_new = uncurl.run_state_estimation(data,
                                                           clusters=k,
                                                           init_means=m_init,
                                                           init_weights=w_init,
                                                           **uncurl_params)
        return m_new, w_new
    else:
        return m_init, w_init
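
A usage sketch for merge_clusters, assuming an existing 8-cluster factorization in which clusters 2 and 5 should be collapsed into one (all names illustrative):

import uncurl

m, w, ll = uncurl.run_state_estimation(data, clusters=8)  # data: genes x cells
# collapse clusters 2 and 5 into a single cluster, leaving k = 7
m_merged, w_merged = merge_clusters(data, m, w, [2, 5], max_iters=20)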
Example #6
 def test_real_data_pairwise(self):
     mat = scipy.io.loadmat('data/10x_pooled_400.mat')
     data = mat['data']
     # do uncurl, followed by update_m
     selected_genes = uncurl.max_variance_genes(data)
     data_subset = data[selected_genes, :]
     m, w, ll = uncurl.run_state_estimation(data_subset, 8, max_iters=20, inner_max_iters=50)
     m = uncurl.update_m(data, m, w, selected_genes)
     # test pairwise
     all_pvs, all_ratios = poisson_diffexp.uncurl_test_pairwise(m, w, mode='counts')
     self.assertEqual(all_pvs.shape, (data.shape[0], 8, 8))
     self.assertEqual(all_ratios.shape, (data.shape[0], 8, 8))
     self.assertTrue((all_pvs < 0.001).sum() < data.shape[0])
     self.assertTrue((all_pvs < 0.01).sum() > 100)
Example #7
def split_cluster(data, m_old, w_old, cluster_to_split, **uncurl_params):
    """
    Splits a given cluster, returning the results of an uncurl re-initialization.

    Args:
        data (array): genes x cells
        m_old (array): genes x k
        w_old (array): k x cells
        cluster_to_split (int): cluster id to split on
        **uncurl_params: optional kwargs to pass to uncurl

    Returns: M_new, W_new
    """
    k = m_old.shape[1]
    k += 1
    labels = w_old.argmax(0)
    cell_subset = (labels == cluster_to_split)
    # or (sorted_labels[1,:]==cluster_to_split)
    data_subset = data[:, cell_subset]
    mean_w = np.hstack([w_old[i, labels == i] for i in set(labels)]).mean()
    new_m, new_w = state_estimation.initialize_means_weights(
        data_subset, 2, max_assign_weight=mean_w)
    m_init = np.hstack([
        m_old[:, :cluster_to_split], new_m[:, 0:1],
        m_old[:, cluster_to_split + 1:], new_m[:, 1:]
    ])
    # extend new_w to encompass all cells
    # TODO: set new row as last cluster
    new_w_2 = np.zeros((2, w_old.shape[1]))
    new_w_2[0, :] = w_old[cluster_to_split] / 2
    new_w_2[1, :] = w_old[cluster_to_split] / 2
    new_w_2[:, cell_subset] = new_w
    w_init = np.vstack([
        w_old[:cluster_to_split, :], new_w_2[0:1, :],
        w_old[cluster_to_split + 1:, :], new_w_2[1:, :]
    ])
    w_init = w_init / w_init.sum(0)
    m_new, w_new, ll_new = uncurl.run_state_estimation(data,
                                                       clusters=k,
                                                       init_means=m_init,
                                                       init_weights=w_init,
                                                       **uncurl_params)
    return m_new, w_new
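
A usage sketch for split_cluster, assuming cluster 3 of an 8-cluster factorization looks heterogeneous and should be split in two (names illustrative):

import uncurl

m, w, ll = uncurl.run_state_estimation(data, clusters=8)  # data: genes x cells
# split cluster 3 into two new clusters, giving k = 9
m_split, w_split = split_cluster(data, m, w, 3, max_iters=20)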
Example #8
 def test_run_se(self):
     """
     test the run_state_estimation function
     """
     sim_m, sim_w = simulation.generate_poisson_states(2, 200, 20)
     sim_data = simulation.generate_state_data(sim_m, sim_w)
     m, w, ll = run_state_estimation(sim_data,
                                     2,
                                     dist='Poiss',
                                     max_iters=10,
                                     disp=False)
     means_good = False
     weights_good = False
     for p in itertools.permutations([0, 1]):
         means_good = means_good or (np.mean(np.abs(sim_m - m[:, p])) <
                                     20.0)
         weights_good = weights_good or (np.mean(np.abs(sim_w - w[p, :])) <
                                         0.3)
     self.assertTrue(means_good)
     self.assertTrue(weights_good)
Example #9
    def setUp(self):
        data = scipy.io.loadmat('data/10x_pooled_400.mat')

        data_csc = data['data']
        self.labels = data['labels'].flatten()
        #gene_names = data['gene_names']

        # 2. gene selection
        genes = uncurl.max_variance_genes(data_csc)
        self.data_subset = data_csc[genes, :]
        #gene_names_subset = gene_names[genes]

        # 3. run uncurl
        m, w, ll = uncurl.run_state_estimation(self.data_subset,
                                               8,
                                               max_iters=20,
                                               inner_max_iters=50)
        print('nmi basic: ' + str(nmi(self.labels, w.argmax(0))))
        self.m = m
        self.w = w
Example #10
def delete_cluster(data, m_old, w_old, cluster_to_delete, **uncurl_params):
    """
    Removes a cluster from the data.
    """
    k = m_old.shape[1] - 1
    m_init = np.hstack(
        [m_old[:, :cluster_to_delete], m_old[:, cluster_to_delete + 1:]])
    w_init = np.vstack(
        [w_old[:cluster_to_delete, :], w_old[cluster_to_delete + 1:, :]])
    w_init = w_init / w_init.sum(0)
    labels = w_old.argmax(0)
    cells_to_include = (labels != cluster_to_delete)
    data_subset = data[:, cells_to_include]
    w_init = w_init[:, cells_to_include]
    m_new, w_new, ll_new = uncurl.run_state_estimation(data_subset,
                                                       clusters=k,
                                                       init_means=m_init,
                                                       init_weights=w_init,
                                                       **uncurl_params)
    return m_new, w_new, cells_to_include
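
A usage sketch for delete_cluster; since the deleted cluster's cells are dropped, the returned boolean mask is needed to map the new W back onto the original cells (names illustrative):

import uncurl

m, w, ll = uncurl.run_state_estimation(data, clusters=8)  # data: genes x cells
# remove cluster 0 and its cells; k drops to 7
m_new, w_new, kept = delete_cluster(data, m, w, 0, max_iters=20)
data_kept = data[:, kept]  # the cells that the columns of w_new refer to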
Example #11
 def test_10x_update_m(self):
     """
     Test after updating M
     """
     from uncurl.state_estimation import update_m
     genes = uncurl.max_variance_genes(self.data)
     data_subset = self.data[genes, :]
     # smaller # of iterations than default so it finishes faster...
     M, W, ll = uncurl.run_state_estimation(data_subset,
                                            clusters=0,
                                            max_iters=10,
                                            inner_max_iters=50)
     new_M = update_m(self.data, M, W, genes)
     self.assertEqual(new_M.shape, (self.data.shape[0], W.shape[0]))
     self.assertFalse(np.isnan(new_M).any())
     # test RMSE
     test_data = np.dot(new_M, W)
     error = self.data.toarray() - test_data
     error = np.sqrt(np.mean(error**2))
     print('M update RMSE:', error)
     self.assertTrue(error < 2.0)
Example #12
def new_cluster(data,
                m_old,
                w_old,
                cells_to_add,
                rerun_uncurl=True,
                **uncurl_params):
    """
    Creates a new cluster by assigning all the selected cells to the new cluster.

    Args:
        data: array of shape genes, cells
        m_old: array of shape genes, k
        w_old: array of shape k, cells
        cells_to_add: array or list of ints representing indices to add
        rerun_uncurl (bool): if True, re-runs uncurl from the new initialization;
            if False, returns the initialization directly.

    Returns a new m and w.
    """
    labels = w_old.argmax(0)
    k = m_old.shape[1] + 1
    genes = m_old.shape[0]
    cells = w_old.shape[1]
    m_init = np.zeros((genes, k))
    w_init = np.zeros((k, cells))
    m_init[:, :k - 1] = m_old
    w_init[:k - 1, :] = w_old
    # .A1 flattens the sparse-matrix mean into a 1d array
    m_init[:, k - 1] = data[:, cells_to_add].mean(1).A1
    mean_w = np.hstack([w_old[i, labels == i] for i in set(labels)]).mean()
    w_init[k - 1, cells_to_add] = mean_w
    w_init[:k - 1, cells_to_add] = (1 - mean_w) / (k - 1)
    if rerun_uncurl:
        m_new, w_new, ll_new = uncurl.run_state_estimation(data,
                                                           clusters=k,
                                                           init_means=m_init,
                                                           init_weights=w_init,
                                                           **uncurl_params)
        return m_new, w_new
    else:
        return m_init, w_init
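
A usage sketch for new_cluster, pulling a handful of cells out into their own cluster; the indices are illustrative, and note that the .mean(1).A1 call above assumes data is a scipy sparse matrix:

import numpy as np
import uncurl

m, w, ll = uncurl.run_state_estimation(data, clusters=8)  # data: sparse, genes x cells
cells_to_add = np.array([10, 11, 12, 13])
# seed a 9th cluster from the selected cells, then re-run uncurl
m_new, w_new = new_cluster(data, m, w, cells_to_add, max_iters=20)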
Example #13
 def test_real_data_1_vs_rest(self):
     mat = scipy.io.loadmat('data/10x_pooled_400.mat')
     data = mat['data']
     # do uncurl, followed by update_m
     selected_genes = uncurl.max_variance_genes(data)
     data_subset = data[selected_genes, :]
     m, w, ll = uncurl.run_state_estimation(data_subset, 8, max_iters=20, inner_max_iters=50)
     m = uncurl.update_m(data, m, w, selected_genes)
     # TODO: how should the p-values be tested?
     all_pvs, all_ratios = poisson_diffexp.uncurl_test_1_vs_rest(m, w)
     all_pvs = np.array(all_pvs)
     all_ratios = np.array(all_ratios)
     self.assertTrue((all_pvs < 0.05).sum() > 100)
     self.assertTrue((all_ratios > 10).sum() > 100)
     self.assertEqual(all_pvs.shape, (data.shape[0], 8))
     all_pvs, all_ratios = poisson_diffexp.uncurl_test_1_vs_rest(m, w, mode='counts')
     all_pvs = np.array(all_pvs)
     all_ratios = np.array(all_ratios)
     self.assertEqual(all_pvs.shape, (data.shape[0], 8))
     self.assertTrue((all_pvs < 0.01).sum() > 100)
     self.assertTrue((all_pvs < 0.01).sum() < data.shape[0])
     self.assertTrue((all_ratios > 10).sum() > 100)
Example #14
 def test_10x_auto_cluster(self):
     """
     Test using automatic cluster size determination
     """
     from sklearn.metrics.cluster import normalized_mutual_info_score as nmi
     # gene selection
     genes = uncurl.max_variance_genes(self.data)
     data_subset = self.data[genes, :]
     # smaller # of iterations than default so it finishes faster...
     M, W, ll = uncurl.run_state_estimation(data_subset,
                                            clusters=0,
                                            max_iters=10,
                                            inner_max_iters=80)
     labels = W.argmax(0)
     # NMI should be > 0.75 on 10x_pure_pooled
     # (accounting for lower than default iter count)
     self.assertTrue(nmi(self.labs, labels) > 0.6)
     # test RMSE
     test_data = np.dot(M, W)
     error = data_subset.toarray() - test_data
     error = np.sqrt(np.mean(error**2))
     print('data subset RMSE:', error)
     self.assertTrue(error < 2.0)
Example #15
def generate_uncurl_analysis(data,
                             output_dir,
                             data_type='dense',
                             gene_names=None,
                             gene_sub=True,
                             **uncurl_kwargs):
    """
    Performs an uncurl analysis of the data, writing the results in the given
    directory.

    Outputs:
        output_dir/m.txt
        output_dir/w.txt
        output_dir/labels.txt (integer labels)
        output_dir/top_genes.txt (json of a dict mapping cluster ids to a list of (gene_id, c_score) pairs sorted by c_score)
        output_dir/mds_means.txt (mds of the means)
        output_dir/mds_data.txt (mds projection of data)
        output_dir/gene_subset.txt (gene subset selected by uncurl)
        output_dir/gene_names.txt (list of all gene names in data subset)

    Args:
        data (array or str): either a data array, or a string containing
            the path to a data array.
        output_dir (str): directory to write output to.
        data_type (str): if data is a path, this indicates whether the data is a dense or sparse array.
        gene_names (list or array): list of all gene names
        gene_sub (bool): whether or not to use gene subset selection (max_variance_genes)
        **uncurl_kwargs: arguments to pass to uncurl.run_state_estimation. Must include clusters=k.
    """
    try:
        os.makedirs(output_dir)
    except OSError:
        print('could not make output dir: {0}'.format(output_dir))
    if isinstance(data, str):
        if data_type == 'dense':
            data = np.loadtxt(data)
        elif data_type == 'sparse':
            data = scipy.io.mmread(data)
            data = sparse.csc_matrix(data)
    if isinstance(gene_names, str):
        gene_names = np.loadtxt(gene_names, dtype=str)
    # run uncurl
    if gene_sub:
        genes_subset = np.array(uncurl.max_variance_genes(data))
        np.savetxt(os.path.join(output_dir, 'gene_subset.txt'),
                   genes_subset,
                   fmt='%d')
        data = data[genes_subset, :]
        if gene_names is not None:
            gene_names = gene_names[genes_subset]
    print(uncurl_kwargs)
    m, w, ll = uncurl.run_state_estimation(data, **uncurl_kwargs)
    np.savetxt(os.path.join(output_dir, 'm.txt'), m)
    np.savetxt(os.path.join(output_dir, 'w.txt'), w)
    labels = w.argmax(0)
    np.savetxt(os.path.join(output_dir, 'labels.txt'), labels, fmt='%d')
    # find overexpressed genes for clusters
    top_genes = uncurl_analysis.find_overexpressed_genes(data, w.argmax(0))
    with open(os.path.join(output_dir, 'top_genes.txt'), 'w') as f:
        json.dump(top_genes, f)
    # run mds
    mds_output = uncurl.dim_reduce(m, w, 2)
    print(mds_output.shape)
    np.savetxt(os.path.join(output_dir, 'mds_means.txt'), mds_output.T)
    mds_data = uncurl.mds(m, w, 2)
    np.savetxt(os.path.join(output_dir, 'mds_data.txt'), mds_data)
    if gene_names is not None:
        np.savetxt(os.path.join(output_dir, 'gene_names.txt'),
                   gene_names,
                   fmt='%s')
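
A hedged end-to-end call; the paths and cluster count are illustrative, and clusters must be included since **uncurl_kwargs is forwarded straight to uncurl.run_state_estimation:

generate_uncurl_analysis('data/data.mtx',
                         'output/',
                         data_type='sparse',
                         gene_names='data/gene_names.txt',
                         clusters=8,
                         max_iters=20)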
Example #16
    return m_new, w_new


if __name__ == '__main__':
    from scipy import sparse
    from scipy.io import loadmat
    from sklearn.metrics.cluster import normalized_mutual_info_score as nmi
    # TODO: load dataset
    dat = loadmat('data/10x_pooled_400.mat')
    data = sparse.csc_matrix(dat['data'])
    labs = dat['labels'].flatten()
    genes = uncurl.max_variance_genes(data)
    data_subset = data[genes, :]
    data_subset = uncurl.preprocessing.cell_normalize(data_subset)
    m, w, ll = uncurl.run_state_estimation(data_subset,
                                           8,
                                           max_iters=0,
                                           inner_max_iters=1)
    m1, w1, ll1 = uncurl.run_state_estimation(data_subset,
                                              8,
                                              max_iters=10,
                                              inner_max_iters=25)
    b = data_subset[:, 0].toarray()
    results = irls(m, b.flatten(), Normal(), IDLink(), restrict=False)
    print(results)
    print(lstsq(m, b.flatten())[0])
    print('poisson error, id link')
    results = irls(m, b.flatten(), tol=1e-8)
    print(results)
    m_new, w_new = irls_uncurl(data_subset, 8, m, w)
    #print('poisson error, log link')
    #results = irls(m, b.flatten(), link=LogLink())
Example #17
import time

import numpy as np
from uncurl_analysis import poisson_diffexp
import scipy.io
import uncurl

mat = scipy.io.loadmat('data/10x_pooled_400.mat')
data = mat['data']
# do uncurl, followed by update_m
selected_genes = uncurl.max_variance_genes(data)
data_subset = data[selected_genes, :]
m, w, ll = uncurl.run_state_estimation(data_subset,
                                       8,
                                       max_iters=20,
                                       inner_max_iters=50)
m = uncurl.update_m(data, m, w, selected_genes)

t0 = time.time()
all_pvs, all_ratios = poisson_diffexp.uncurl_poisson_test_1_vs_rest(
    m, w, mode='counts')
print('diffexp time: ', time.time() - t0)

t0 = time.time()
all_pvs_2, all_ratios_2 = poisson_diffexp.uncurl_poisson_test_pairwise(
    m, w, mode='counts')
print('pairwise diffexp time: ', time.time() - t0)

# test on simulated data
# plotting mw
import matplotlib.pyplot as plt
Example #18
if os.path.exists("/ti/input/start_id.json"):
  start_id = json.load(open("/ti/input/start_id.json"))
else:
  start_id = None
if os.path.exists('/ti/input/groups_n.json'):
    groups_n = json.load(open('/ti/input/groups_n.json'))
    groups_n = groups_n[0]
else:
    groups_n = 0

checkpoints["method_afterpreproc"] = time.time()

## Trajectory inference -----------------------------------

count_data = expression.values
count_data = count_data.T
m, w, ll = uncurl.run_state_estimation(count_data, 2)

checkpoints["method_aftermethod"] = time.time()


# extract the component and use it as pseudotimes
cell_ids = pd.DataFrame({
  "cell_ids":expression.index
})
pseudotime = pd.DataFrame({
  "pseudotime":w[0, :],
  "cell_id":expression.index
})

# use w as dim_red
dimred = pd.DataFrame({
Example #19

# 1. load data
data = scipy.io.loadmat('data/10x_pooled_400.mat')

data_csc = data['data']
labels = data['labels'].flatten()
gene_names = data['gene_names']

# 2. gene selection
genes = uncurl.max_variance_genes(data_csc)
data_subset = data_csc[genes,:]
gene_names_subset = gene_names[genes]

# 3. run uncurl
m, w, ll = uncurl.run_state_estimation(data_subset, 8)
print('nmi basic: ' + str(nmi(labels, w.argmax(0))))


# 4. run clustering
for metric in ['euclidean', 'cosine']:
    for n_neighbors in [10, 15, 20]:
        print('n_neighbors: ', n_neighbors, ' metric: ', metric)
        w_graph = clustering_methods.create_graph(w.T, n_neighbors=n_neighbors, metric=metric)
        clusters = clustering_methods.run_leiden(w_graph)
        print('nmi leiden: ' + str(nmi(labels, clusters)))
        clusters_louvain = clustering_methods.run_louvain(w_graph)
        print('nmi louvain: ' + str(nmi(labels, clusters_louvain)))

# 5. try running clustering w/o uncurl
clustering_result = clustering_methods.baseline_cluster(data_subset)
Example #20
args = parse_args()
print("run with these parametres: %s" % str(args))

data = pd.read_csv(args.input, index_col=0)

if args.gene_subset == 'non_zero':
    genes_subset = np.sum(data.values, axis=1) != 0  # select nonzero genes
elif args.gene_subset == 'max_variance':
    genes_subset = max_variance_genes(data.values, nbins=5, frac=0.2) # select genes with max variance
else:
    raise NotImplementedError("optin `%s` for `gene_subset` not defined." % args.gene_subset)

data_subset = data.iloc[genes_subset,:]
M, W, ll = run_state_estimation(data_subset.values, clusters=args.clusters,
                                dist=args.dist, disp=True,
                                max_iters=args.max_iters,
                                inner_max_iters=args.inner_max_iters,
                                initialization=args.initialization,
                                threads=args.threads)

print("ll: %f" % ll)

data.iloc[genes_subset, :] = np.matmul(M, W) # imputation

make_sure_dir_exists(args.outputdir)

np.savetxt("genes_subset.csv", genes_subset, delimiter=",")
np.savetxt("M.csv", M, delimiter=",")
np.savetxt("W.csv", W, delimiter=",")

data.to_csv(os.path.join(args.outputdir, "uncurl_output.csv"))