Example #1
    def _set_data_to_clusters(self, data_idx):
        data_point = self.data[data_idx]

        num_clusters = len(self.cluster_params)

        log_p = np.zeros(num_clusters)

        cluster = self.clustering[data_idx]

        for c, block_params in self.cluster_params.items():
            if c == cluster:
                block_params.decrement(data_point)

            if block_params.N == 0:
                log_p[c] = float('-inf')

            else:
                log_p[c] = self.partition_prior.log_tau_2(block_params.N)

                log_p[c] += self.dist.log_predictive_likelihood(
                    data_point, block_params)

            if c == cluster:
                block_params.increment(data_point)

        log_p = log_normalize(log_p)

        self.data_to_clusters[data_idx] = []

        for c, log_p_c in enumerate(log_p):
            if log_p_c >= np.log(self.threshold):
                self.data_to_clusters[data_idx].append(c)
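
Every example on this page relies on log_normalize (and, in Example #3, log_sum_exp) to turn unnormalized log weights into log probabilities. The helpers below are a minimal sketch of what those functions presumably do, based on the standard log-sum-exp definition; they are not copied from the library itself.

import numpy as np


def log_sum_exp(log_x, axis=None):
    # Numerically stable log(sum(exp(log_x))): shift by the max before exponentiating.
    max_val = np.max(log_x, axis=axis, keepdims=True)

    return np.squeeze(max_val + np.log(np.sum(np.exp(log_x - max_val), axis=axis, keepdims=True)), axis=axis)


def log_normalize(log_x):
    # Subtract the normalizing constant so that np.exp(result) sums to one.
    return log_x - log_sum_exp(log_x)

For instance, np.exp(log_normalize(np.log([1.0, 1.0, 2.0]))) gives [0.25, 0.25, 0.5].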
Example #2
    def test_likelihood(self):
        dim = 10

        N = 100

        clustering = np.random.randint(0, 10, size=N)

        data = np.random.multivariate_normal(np.random.random(size=dim) * 100,
                                             np.eye(dim),
                                             size=N)

        dist = mvn.MultivariateNormalDistribution(dim)

        partition_prior = mocks.MockPartitionPrior()

        split_merge_setup_kernel = UniformSplitMergeSetupKernel(
            data, dist, partition_prior)

        sampler = SequentiallyAllocatedMergeSplitSampler(
            dist, partition_prior, split_merge_setup_kernel)

        anchors, sigma = split_merge_setup_kernel.setup_split_merge(
            clustering, 2)

        sampler.kernel.setup(anchors, clustering, data, sigma)

        clustering, mh_ratio = sampler._merge(data)

        log_p = dist.log_marginal_likelihood(
            dist.create_params_from_data(data))

        self.assertAlmostEqual(mh_ratio, log_p)

        clustering, mh_ratio = sampler._split(data)

        clustering = clustering.astype(int)

        log_q = 0

        params = [
            dist.create_params_from_data(data[0]),
            dist.create_params_from_data(data[1])
        ]

        for c, x in zip(clustering[2:], data[2:]):
            block_probs = np.zeros(2)

            for i in range(2):
                block_probs[i] = dist.log_predictive_likelihood(x, params[i])

            block_probs = log_normalize(block_probs)

            log_q += block_probs[c]

            params[c].increment(x)

        log_p = sum([dist.log_marginal_likelihood(x) for x in params])

        self.assertAlmostEqual(mh_ratio, log_p - log_q)
Example #3
def held_out_log_predicitive(clustering, dist, partition_prior, test_data, train_data, per_point=False):
    clustering = relabel_clustering(clustering)

    block_params = []

    log_cluster_prior = []

    block_ids = sorted(np.unique(clustering))

    for z in block_ids:
        params = dist.create_params_from_data(train_data[clustering == z])

        block_params.append(params)

        log_cluster_prior.append(partition_prior.log_tau_2_diff(params.N))

    num_blocks = len(block_ids)

    block_params.append(dist.create_params())

    log_cluster_prior.append(partition_prior.log_tau_1_diff(num_blocks))

    log_cluster_prior = np.array(log_cluster_prior)

    log_cluster_prior = log_normalize(log_cluster_prior)

    log_p = np.zeros((test_data.shape[0], len(log_cluster_prior)))

    for z, (w, params) in enumerate(zip(log_cluster_prior, block_params)):
        log_p[:, z] = w + dist.log_predictive_likelihood_bulk(test_data, params)

    if per_point:
        return log_sum_exp(log_p, axis=1)

    else:
        return np.sum(log_sum_exp(log_p, axis=1))
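
In Example #3 each held-out point is scored under a mixture whose components are the fitted blocks plus one empty block: the weighted per-cluster columns of log_p are reduced with log_sum_exp over the cluster axis. The snippet below illustrates that reduction with made-up log weights and log predictive likelihoods instead of the library's dist and partition_prior objects, and uses scipy.special.logsumexp in place of the library's log_sum_exp.

import numpy as np
from scipy.special import logsumexp

# Hypothetical normalized log mixture weights for two blocks.
log_w = np.log([0.7, 0.3])

# Hypothetical per-block log predictive likelihoods for three test points.
log_like = np.log([[0.10, 0.40],
                   [0.25, 0.05],
                   [0.30, 0.30]])

log_p = log_w[np.newaxis, :] + log_like  # shape (num_test_points, num_blocks)

per_point = logsumexp(log_p, axis=1)  # what per_point=True returns

total = np.sum(per_point)  # what per_point=False returns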
Example #4
def load_data_from_file(file_name,
                        error_rate=1e-3,
                        grid_size=1000,
                        perfect_prior=False,
                        tumour_content=None):
    '''
    Given a PyClone input tsv formatted file, load the discretized grid of likelihoods.

    See https://bitbucket.org/aroth85/pyclone/wiki/Usage for information about the input file format. For debugging
    purposes, this file can also include information about the mutational genotype for use with the perfect_prior
    argument.
    '''
    data = []

    df = pd.read_csv(file_name, sep='\t')

    if tumour_content is None:
        if 'tumour_content' in df.columns:
            assert len(df['tumour_content'].unique()) == 1

            tumour_content = df['tumour_content'].iloc[0]

            print('Tumour content of {} detected in file'.format(
                tumour_content))

        else:
            tumour_content = 1.0

    for _, row in df.iterrows():
        a = row['ref_counts']

        b = row['var_counts']

        cn_n = row['normal_cn']

        if 'major_cn' in row:
            major_cn = row['major_cn']

            total_cn = row['major_cn'] + row['minor_cn']

        else:
            total_cn = int(row['total_cn'])

            major_cn = total_cn

        # Use the true mutational genotype information
        if perfect_prior:
            cn = [[len(row['g_n']), len(row['g_r']), len(row['g_v'])]]

            mu = [[
                error_rate, error_rate,
                min(1 - error_rate, row['g_v'].count('B') / len(row['g_v']))
            ]]

            log_pi = [
                0,
            ]

        # Elicit mutational genotype prior based on major and minor copy number
        else:
            cn = []

            mu = []

            log_pi = []

            # Consider all possible mutational genotypes consistent with mutation before CN change
            for x in range(1, major_cn + 1):
                cn.append((cn_n, cn_n, total_cn))

                mu.append(
                    (error_rate, error_rate, min(1 - error_rate,
                                                 x / total_cn)))

                log_pi.append(0)

            # Consider mutational genotype of mutation after CN change if not already added
            mutation_after_cn = (cn_n, total_cn, total_cn)

            if mutation_after_cn not in cn:
                cn.append(mutation_after_cn)

                mu.append(
                    (error_rate, error_rate, min(1 - error_rate,
                                                 1 / total_cn)))

                log_pi.append(0)

                assert len(set(cn)) == 2

        cn = np.array(cn, dtype=int)

        mu = np.array(mu, dtype=float)

        log_pi = log_normalize(np.array(log_pi, dtype=float))

        data.append(DataPoint(a, b, cn, mu, log_pi))

    return convert_data_to_discrete_grid(data,
                                         grid_size=grid_size,
                                         tumour_content=tumour_content)
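
A hedged usage sketch for Example #4 follows; the file name and keyword values are invented, and the structure returned by convert_data_to_discrete_grid is not shown on this page, so treat it only as an illustration of the call.

# 'pyclone_input.tsv' is a hypothetical PyClone-style input file with the columns
# read above (ref_counts, var_counts, normal_cn, major_cn, minor_cn, ...).
grid_data = load_data_from_file('pyclone_input.tsv',
                                error_rate=1e-3,
                                grid_size=100,
                                tumour_content=0.75)  # overrides any tumour_content column in the file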
Example #5
    def normalized_log_pdf_grid(self):
        return log_normalize(self.log_pdf_grid)
Example #6
    def test_partition_prior(self):
        dim = 10

        N = 3

        clustering = np.array([0, 0, 1])

        data = np.random.multivariate_normal(np.random.random(size=dim) * 100,
                                             np.eye(dim),
                                             size=N)

        dist = mocks.MockDistribution()

        partition_prior = DirichletProcessPartitionPrior(0.1234)

        split_merge_setup_kernel = UniformSplitMergeSetupKernel(
            data, dist, partition_prior)

        sampler = SequentiallyAllocatedMergeSplitSampler(
            dist, partition_prior, split_merge_setup_kernel)

        anchors = [0, 2]

        sigma = [0, 2, 1]

        sampler.kernel.setup(anchors, clustering, data, sigma)

        clustering_sigma = clustering[sigma]

        data_sigma = data[sigma]

        clustering, mh_ratio = sampler._merge(data_sigma)

        log_p = partition_prior.log_likelihood([
            N,
        ])

        self.assertAlmostEqual(mh_ratio, log_p)

        clustering, mh_ratio = sampler._split(data_sigma)

        clustering = clustering.astype(int)

        log_q = 0

        params = [
            dist.create_params_from_data(data_sigma[0]),
            dist.create_params_from_data(data_sigma[1])
        ]

        for c, x in zip(clustering_sigma[2:], data_sigma[2:]):
            block_probs = np.zeros(2)

            for i in range(2):
                block_probs[i] = partition_prior.log_tau_2_diff(params[i].N)

            block_probs = log_normalize(block_probs)

            log_q += block_probs[c]

            params[c].increment(x)

        log_p = partition_prior.log_likelihood([x.N for x in params])

        self.assertAlmostEqual(mh_ratio, log_p - log_q)