def _set_data_to_clusters(self, data_idx):
    data_point = self.data[data_idx]

    num_clusters = len(self.cluster_params)

    log_p = np.zeros(num_clusters)

    cluster = self.clustering[data_idx]

    for c, block_params in self.cluster_params.items():
        # Temporarily remove the point from its current block so the predictive is
        # computed without it.
        if c == cluster:
            block_params.decrement(data_point)

        if block_params.N == 0:
            log_p[c] = float('-inf')

        else:
            log_p[c] = self.partition_prior.log_tau_2(block_params.N)

            log_p[c] += self.dist.log_predictive_likelihood(data_point, block_params)

        # Restore the block to its original state.
        if c == cluster:
            block_params.increment(data_point)

    log_p = log_normalize(log_p)

    self.data_to_clusters[data_idx] = []

    # Keep only clusters whose assignment probability exceeds the threshold.
    for c, log_p_c in enumerate(log_p):
        if log_p_c >= np.log(self.threshold):
            self.data_to_clusters[data_idx].append(c)
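# The helpers `log_normalize` and `log_sum_exp` used throughout these snippets are assumed
# to be the package's standard numerically stable log-space utilities. A minimal sketch of
# the behaviour the code relies on (not the package's actual implementation):
import numpy as np


def log_sum_exp(log_x, axis=None):
    # Stable log(sum(exp(log_x))): subtract the max before exponentiating.
    max_val = np.max(log_x, axis=axis, keepdims=True)

    out = max_val + np.log(np.sum(np.exp(log_x - max_val), axis=axis, keepdims=True))

    return out.item() if axis is None else np.squeeze(out, axis=axis)


def log_normalize(log_x):
    # Shift log values so that exp(log_x) sums to one.
    return log_x - log_sum_exp(log_x)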
def test_likelihood(self):
    dim = 10

    N = 100

    clustering = np.random.randint(0, 10, size=N)

    data = np.random.multivariate_normal(
        np.random.random(size=dim) * 100, np.eye(dim), size=N)

    dist = mvn.MultivariateNormalDistribution(dim)

    partition_prior = mocks.MockPartitionPrior()

    split_merge_setup_kernel = UniformSplitMergeSetupKernel(data, dist, partition_prior)

    sampler = SequentiallyAllocatedMergeSplitSampler(dist, partition_prior, split_merge_setup_kernel)

    anchors, sigma = split_merge_setup_kernel.setup_split_merge(clustering, 2)

    sampler.kernel.setup(anchors, clustering, data, sigma)

    clustering, mh_ratio = sampler._merge(data)

    log_p = dist.log_marginal_likelihood(dist.create_params_from_data(data))

    self.assertAlmostEqual(mh_ratio, log_p)

    clustering, mh_ratio = sampler._split(data)

    clustering = clustering.astype(int)

    log_q = 0

    params = [
        dist.create_params_from_data(data[0]),
        dist.create_params_from_data(data[1])
    ]

    for c, x in zip(clustering[2:], data[2:]):
        block_probs = np.zeros(2)

        for i in range(2):
            block_probs[i] = dist.log_predictive_likelihood(x, params[i])

        block_probs = log_normalize(block_probs)

        log_q += block_probs[c]

        params[c].increment(x)

    log_p = sum([dist.log_marginal_likelihood(x) for x in params])

    self.assertAlmostEqual(mh_ratio, log_p - log_q)
def held_out_log_predicitive(clustering, dist, partition_prior, test_data, train_data, per_point=False):
    clustering = relabel_clustering(clustering)

    block_params = []

    log_cluster_prior = []

    block_ids = sorted(np.unique(clustering))

    # Posterior parameters and prior mass for each existing block in the training clustering.
    for z in block_ids:
        params = dist.create_params_from_data(train_data[clustering == z])

        block_params.append(params)

        log_cluster_prior.append(partition_prior.log_tau_2_diff(params.N))

    num_blocks = len(block_ids)

    # Add an empty block to account for a test point starting a new cluster.
    block_params.append(dist.create_params())

    log_cluster_prior.append(partition_prior.log_tau_1_diff(num_blocks))

    log_cluster_prior = np.array(log_cluster_prior)

    log_cluster_prior = log_normalize(log_cluster_prior)

    log_p = np.zeros((test_data.shape[0], len(log_cluster_prior)))

    # Mixture of per-block predictive densities, weighted by the normalized cluster prior.
    for z, (w, params) in enumerate(zip(log_cluster_prior, block_params)):
        log_p[:, z] = w + dist.log_predictive_likelihood_bulk(test_data, params)

    if per_point:
        return log_sum_exp(log_p, axis=1)

    else:
        return np.sum(log_sum_exp(log_p, axis=1))
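# Usage sketch for held_out_log_predicitive: score a clustering of the training data by the
# predictive log probability it assigns to held-out points. This assumes
# `mvn.MultivariateNormalDistribution` and `DirichletProcessPartitionPrior` are available as in
# the tests in this section; the data, train/test split and concentration value are illustrative only.
import numpy as np

dim = 10

data = np.random.multivariate_normal(np.zeros(dim), np.eye(dim), size=100)

train_data, test_data = data[:80], data[80:]

# Any labelling of the training data can be scored; here a trivial two-block clustering.
clustering = np.repeat([0, 1], 40)

dist = mvn.MultivariateNormalDistribution(dim)

partition_prior = DirichletProcessPartitionPrior(1.0)

total_score = held_out_log_predicitive(clustering, dist, partition_prior, test_data, train_data)

per_point_scores = held_out_log_predicitive(
    clustering, dist, partition_prior, test_data, train_data, per_point=True)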
def load_data_from_file(file_name, error_rate=1e-3, grid_size=1000, perfect_prior=False, tumour_content=None):
    '''
    Given a PyClone formatted input tsv file, load the discretized grid of likelihoods.

    See https://bitbucket.org/aroth85/pyclone/wiki/Usage for information about the input file format.

    For debugging purposes, this file can also include information about the mutational genotype for use with the
    perfect_prior argument.
    '''
    data = []

    df = pd.read_csv(file_name, sep='\t')

    if tumour_content is None:
        if 'tumour_content' in df.columns:
            assert len(df['tumour_content'].unique()) == 1

            tumour_content = df['tumour_content'].iloc[0]

            print('Tumour content of {} detected in file'.format(tumour_content))

        else:
            tumour_content = 1.0

    for _, row in df.iterrows():
        a = row['ref_counts']

        b = row['var_counts']

        cn_n = row['normal_cn']

        if 'major_cn' in row:
            major_cn = row['major_cn']

            total_cn = row['major_cn'] + row['minor_cn']

        else:
            total_cn = int(row['total_cn'])

            major_cn = total_cn

        # Use the true mutational genotype information
        if perfect_prior:
            cn = [
                [len(row['g_n']), len(row['g_r']), len(row['g_v'])]
            ]

            mu = [
                [error_rate, error_rate, min(1 - error_rate, row['g_v'].count('B') / len(row['g_v']))]
            ]

            log_pi = [0, ]

        # Elicit mutational genotype prior based on major and minor copy number
        else:
            cn = []

            mu = []

            log_pi = []

            # Consider all possible mutational genotypes consistent with mutation before the CN change
            for x in range(1, major_cn + 1):
                cn.append((cn_n, cn_n, total_cn))

                mu.append((error_rate, error_rate, min(1 - error_rate, x / total_cn)))

                log_pi.append(0)

            # Consider mutational genotype of mutation after the CN change if not already added
            mutation_after_cn = (cn_n, total_cn, total_cn)

            if mutation_after_cn not in cn:
                cn.append(mutation_after_cn)

                mu.append((error_rate, error_rate, min(1 - error_rate, 1 / total_cn)))

                log_pi.append(0)

            # Sanity check: at most two distinct genotype classes (before / after the CN change)
            assert len(set(cn)) <= 2

        cn = np.array(cn, dtype=int)

        mu = np.array(mu, dtype=float)

        log_pi = log_normalize(np.array(log_pi, dtype=float))

        data.append(DataPoint(a, b, cn, mu, log_pi))

    return convert_data_to_discrete_grid(data, grid_size=grid_size, tumour_content=tumour_content)
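# Usage sketch for load_data_from_file: write a minimal PyClone-style tsv with the columns read
# above (ref_counts, var_counts, normal_cn, major_cn/minor_cn, optionally tumour_content) and
# load it onto a grid. The file name, counts and copy numbers below are illustrative only.
import pandas as pd

example = pd.DataFrame({
    'mutation_id': ['m1', 'm2'],
    'ref_counts': [90, 40],
    'var_counts': [10, 60],
    'normal_cn': [2, 2],
    'major_cn': [2, 3],
    'minor_cn': [1, 0],
    'tumour_content': [0.8, 0.8],
})

example.to_csv('example_input.tsv', sep='\t', index=False)

grid_data = load_data_from_file('example_input.tsv', grid_size=100)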
def normalized_log_pdf_grid(self):
    return log_normalize(self.log_pdf_grid)
def test_partition_prior(self):
    dim = 10

    N = 3

    clustering = np.array([0, 0, 1])

    data = np.random.multivariate_normal(
        np.random.random(size=dim) * 100, np.eye(dim), size=N)

    dist = mocks.MockDistribution()

    partition_prior = DirichletProcessPartitionPrior(0.1234)

    split_merge_setup_kernel = UniformSplitMergeSetupKernel(data, dist, partition_prior)

    sampler = SequentiallyAllocatedMergeSplitSampler(dist, partition_prior, split_merge_setup_kernel)

    anchors = [0, 2]

    sigma = [0, 2, 1]

    sampler.kernel.setup(anchors, clustering, data, sigma)

    clustering_sigma = clustering[sigma]

    data_sigma = data[sigma]

    clustering, mh_ratio = sampler._merge(data_sigma)

    log_p = partition_prior.log_likelihood([N, ])

    self.assertAlmostEqual(mh_ratio, log_p)

    clustering, mh_ratio = sampler._split(data_sigma)

    clustering = clustering.astype(int)

    log_q = 0

    params = [
        dist.create_params_from_data(data_sigma[0]),
        dist.create_params_from_data(data_sigma[1])
    ]

    for c, x in zip(clustering_sigma[2:], data_sigma[2:]):
        block_probs = np.zeros(2)

        for i in range(2):
            block_probs[i] = partition_prior.log_tau_2_diff(params[i].N)

        block_probs = log_normalize(block_probs)

        log_q += block_probs[c]

        params[c].increment(x)

    log_p = partition_prior.log_likelihood([x.N for x in params])

    self.assertAlmostEqual(mh_ratio, log_p - log_q)