def normalize_in_logspace(dist, in_log_space=True):
    """Rescale ``dist`` so that it sums to one, working in log space.

    Args:
        dist: array-like of values. When ``in_log_space`` is true the
            values are taken to be logarithms already; otherwise their
            natural log is taken first.
        in_log_space: whether ``dist`` already holds log values.

    Returns:
        numpy array ``exp(log_dist - logsumexp(log_dist))`` in linear space.
    """
    if in_log_space:
        # The debug dump is only emitted for input that is already in log space.
        logging.debug('Likelihood before normalization\n{}'.format(dist))
        log_values = np.array(dist, dtype=np.float64)
    else:
        log_values = np.log(dist, dtype=np.float64)

    log_total = logsumexp_scipy(log_values)
    return np.exp(log_values - log_total)
def one_iteration_fix_k(self): random.shuffle(self.mutations) for mut in self.mutations: loglik = np.ones((self.n_clusters, self.n_samples), dtype=np.float64) * -np.inf const_array = np.zeros((self.n_clusters,), dtype=np.float64) if len(mut.assigned_to) == 1: continue # don't reassign last mutation for cluster_idx, cluster in enumerate(self.clusterlist): # if the current point is the only thing in the cluster... # This seems to work empirically (as well as theoretically) if len(cluster) == 1 and mut.assigned_to == cluster: continue stay_in_clust_const = len(cluster) / float(self.n_muts - 1 + self.alpha) const_array[cluster_idx] = np.log(stay_in_clust_const) if mut not in cluster: # TODO: redefine in loglik[cluster_idx] = self.logsum_of_marginals_per_sample(cluster.normed_hist + mut.loghist) else: loglik[cluster_idx] = self.logsum_of_marginals_per_sample( self.normalize_loghist_with_prior(cluster - mut) + mut.loghist) loglik = np.sum(loglik, axis=1) # + const_array loglik = loglik - logsumexp_scipy(loglik) loglik = loglik + const_array c_lik = np.exp(loglik - logsumexp_scipy(loglik)) # if np.random.random() < 0.1: print sum(c_lik[:-1]) new_cluster_idx = np.nonzero(np.random.multinomial(1, c_lik) == 1)[0][0] if new_cluster_idx == self.n_clusters: # new cluster mut.assigned_to -= mut DP_cluster(self, mut) # create new cluster, lists updated automatically else: new_cluster = self.clusterlist[new_cluster_idx] mut.assigned_to -= mut new_cluster += mut cluster_counter = itertools.count() next(cluster_counter) real_index = dict([[x.id, next(cluster_counter)] for x in self.clusterlist]) self.results.assign.append([real_index[x.assigned_to.id] for x in self.mutations]) self.results.alpha.append(self.alpha) self.results.eta.append(self.eta) self.results.cluster_loghistograms.append([cluster.normed_hist for cluster in self.clusterlist]) self.results.cluster_positions.append( [[np.argmax(x) for x in cluster.normed_hist] for cluster in self.clusterlist]) 
self.results.clust_prop.append([len(cluster) / float(self.n_muts) for cluster in self.clusterlist]) self.results.clust_size.append([len(cluster) for cluster in self.clusterlist]) self.results.K.append(self.n_clusters) return [real_index[x.assigned_to.id] for x in self.mutations], [cluster.normed_hist for cluster in self.clusterlist]
def DP_prob_k_cond_alpha_N(N, alpha, log_stirling_coef):
    """Return the normalised probabilities of k = 1..N given ``alpha`` and ``N``.

    The unnormalised log-weight of k is
    ``log_stirling_coef[k - 1] + k * log(alpha)``. Terms that do not depend
    on k (``lgamma(N - 1)``, ``lgamma(alpha)``, ``-lgamma(alpha + N)``) cancel
    when the weights are normalised and are therefore left out; doing so
    also keeps ``N == 1`` well defined, where ``lgamma(N - 1)`` would be
    evaluated at a pole.

    Args:
        N: number of entries to compute (k runs from 1 to N).
        alpha: concentration parameter; its natural log is taken.
        log_stirling_coef: indexable; entry ``k - 1`` is the log coefficient
            used for k. Only the first ``N`` entries are read.

    Returns:
        numpy array of length ``N`` that sums to one.
    """
    log_alpha = np.log(alpha)
    loglik = np.array(
        [log_stirling_coef[k - 1] + k * log_alpha for k in range(1, N + 1)],
        dtype=np.float64)
    return np.exp(loglik - logsumexp_scipy(loglik))
def make_nd_histogram(hist_array):
    """Convert raw histograms into log histograms normalised along axis 2.

    A small constant is added to every entry, entry 0 along the last indexed
    axis is overwritten with that same constant, and the log of the result is
    shifted so that ``logsumexp`` over axis 2 is zero for every slice.

    Args:
        hist_array: array-like with at least three dimensions; it is copied,
            not modified. Assumed to be (mutation, sample, bin) -- TODO confirm.

    Returns:
        float32 numpy array of the same shape holding normalised log values.
    """
    conv = 1e-40
    hist = np.asarray(hist_array, dtype=np.float32) + conv
    # One assignment covers every index of axis 1; repeating it per sample
    # (as a loop over axis 1 would) changes nothing.
    hist[:, :, 0] = conv
    log_hist = np.log(hist)
    return log_hist - logsumexp_scipy(log_hist, axis=2, keepdims=True)
def log_conv(x, y):  ## y is len 2
    """Combine neighbouring entries of a padded ``x`` with a two-term ``y`` in log space.

    ``x`` is padded with two ``-inf`` values in front and one behind. Entry
    ``k`` of the result is
    ``logsumexp([padded[k] + y[0], padded[k + 1] + y[1]])``.

    Args:
        x: iterable of log values, or a single scalar log value.
        y: indexable with at least two log values; only ``y[0]`` and
            ``y[1]`` are read.

    Returns:
        list of ``len(x) + 2`` log values (3 for a scalar ``x``).
    """
    try:
        padded = [-np.inf, -np.inf] + list(x) + [-np.inf]
    except TypeError:
        # ``x`` is not iterable: treat it as a single value.
        padded = [-np.inf, -np.inf, x, -np.inf]

    return [
        logsumexp_scipy([lower + y[0], upper + y[1]])
        for lower, upper in zip(padded, padded[1:])
    ]
def _load_clusters(self, cluster_info_file): logging.debug( 'Loading clusters from {} file'.format(cluster_info_file)) cluster_ccf = {} means = {} ccf_headers = [ 'postDP_ccf_' + str(i / 100.0) for i in xrange(0, 101, 1) ] with open(cluster_info_file, 'r') as reader: for line in reader: values = line.strip().split('\t') if line.startswith('Patient_ID'): header = dict( (item, idx) for idx, item in enumerate(values)) else: sample_id = values[header['Sample_ID']] cluster_id = int(values[header['Cluster_ID']]) cluster_mean = float(values[header['postDP_ccf_mean']]) ccf = np.array( [float(values[header[i]]) for i in ccf_headers], dtype=np.float64) ccf = np.clip(ccf, a_min=1e-20, a_max=None) ccf = np.log(ccf, dtype=np.float64) ccf = np.exp(ccf - logsumexp_scipy(ccf)) if cluster_id not in cluster_ccf: cluster_ccf[cluster_id] = {} means[cluster_id] = [] means[cluster_id].append(cluster_mean) cluster_ccf[cluster_id][sample_id] = ccf for cluster_id in cluster_ccf: # decide whether cluster should be removed # if density < 0.1 across all samples add it to remove clusters, to be removed from BuildTree algorithm if self.low_ccf_check(means[cluster_id]): self._removed_clusters.append(cluster_id) logging.debug('Removed cluster {} '.format(cluster_id)) return cluster_ccf
def one_iteration(self, resample=True):
    """Resample each mutation's cluster once, allowing a new cluster to open.

    For every mutation a multinomial is drawn over the existing clusters plus
    one extra "new cluster" slot (the last index). Existing clusters are
    weighted by ``len(cluster) / (n_muts - 1 + alpha)`` and the new-cluster
    slot by ``alpha / (n_muts - 1 + alpha)``; both are combined with the
    per-sample log marginals summed over samples and normalised across
    slots. After the sweep a snapshot is appended to the lists on
    ``self.results`` and a progress token is printed.

    Args:
        resample: when true, ``self.eta`` and ``self.alpha`` are redrawn
            after the snapshot has been stored.

    Returns:
        None.
    """
    for mut in self.mutations:
        # One row per existing cluster plus a final row for a new cluster.
        loglik = np.full((self.n_clusters + 1, self.n_samples), -np.inf, dtype=np.float64)
        const_array = np.zeros((self.n_clusters + 1,), dtype=np.float64)

        for cluster_idx, cluster in enumerate(self.clusterlist):
            ## if the current point is the only thing in the cluster...
            # This seems to work empirically (as well as theoretically)
            if len(cluster) == 1 and mut.assigned_to == cluster:
                continue

            stay_in_clust_const = len(cluster) / float(self.n_muts - 1 + self.alpha)
            const_array[cluster_idx] = np.log(stay_in_clust_const)

            if mut not in cluster:  # TODO: redefine in
                cluster_loghist = cluster.normed_hist
            else:
                # presumably ``cluster - mut`` is the cluster histogram with
                # this mutation's contribution removed -- confirm in DP_cluster
                cluster_loghist = self.normalize_loghist_with_prior(cluster - mut)
            loglik[cluster_idx] = self.logsum_of_marginals_per_sample(
                cluster_loghist + mut.loghist)

        open_new_clust_const = self.alpha / float(self.n_muts - 1 + self.alpha)
        # Base distribution for a new cluster: the prior minus the
        # element-wise maximum of the current cluster histograms (both in
        # linear space), clipped into [1e-40, 1].
        occupied = functools.reduce(
            np.maximum, [z.normed_hist for z in self.clusterlist])
        prior = np.clip(np.exp(self.logprior) - np.exp(occupied),
                        a_min=1e-40, a_max=1.)
        loglik[-1] = self.logsum_of_marginals_per_sample(
            mut.loghist + self.normalize_loghist_with_prior(np.log(prior)))
        const_array[-1] = np.log(open_new_clust_const)

        # Collapse the sample axis, normalise, add the size weights and
        # normalise again to obtain the multinomial probabilities.
        loglik = np.sum(loglik, axis=1)
        loglik = loglik - logsumexp_scipy(loglik)
        loglik = loglik + const_array
        c_lik = np.exp(loglik - logsumexp_scipy(loglik))

        new_cluster_idx = np.nonzero(np.random.multinomial(1, c_lik) == 1)[0][0]
        if new_cluster_idx == self.n_clusters:  # new cluster
            mut.assigned_to -= mut
            DP_cluster(self, mut)  # create new cluster, lists updated automatically
        else:
            # Look the target up before the mutation leaves its old cluster.
            new_cluster = self.clusterlist[new_cluster_idx]
            mut.assigned_to -= mut
            new_cluster += mut

    # 1-based position of every cluster in the current cluster list.
    real_index = {cluster.id: pos for pos, cluster in enumerate(self.clusterlist, 1)}
    self.results.assign.append([real_index[x.assigned_to.id] for x in self.mutations])
    self.results.alpha.append(self.alpha)
    self.results.eta.append(self.eta)
    self.results.cluster_loghistograms.append(
        [cluster.normed_hist for cluster in self.clusterlist])
    self.results.cluster_positions.append(
        [[np.argmax(x) for x in cluster.normed_hist] for cluster in self.clusterlist])
    self.results.clust_prop.append(
        [len(cluster) / float(self.n_muts) for cluster in self.clusterlist])
    self.results.clust_size.append([len(cluster) for cluster in self.clusterlist])
    self.results.K.append(self.n_clusters)

    # NOTE(review): the trailing comma inside print(...) has no effect with
    # the print function; if the intent was to stay on one line, confirm and
    # switch to an explicit ``end`` argument.
    print("{}({});".format(self.n_clusters, round(self.alpha, 1)),)
    sys.stdout.flush()

    if resample:
        ##resample alpha
        self.eta = stats.beta.rvs(self.alpha + 1, self.n_muts)
        self.alpha = sample_gamma_cond_N_k(self.n_muts, self.n_clusters, self.eta,
                                           self.gamma_prior)  ## Escobar and West 1995
def _load_mutations(self, mut_info_file): logging.debug('Loading mutations from {} file'.format(mut_info_file)) ccf_headers = [ 'preDP_ccf_' + str(i / 100.0) for i in xrange(0, 101, 1) ] with open(mut_info_file, 'r') as reader: for line in reader: values = line.strip().split('\t') if line.startswith('Patient_ID'): header = dict( (item, idx) for idx, item in enumerate(values)) else: cluster_id = int(values[header['Cluster_Assignment']]) if cluster_id not in self._removed_clusters: chromosome = values[header['Chromosome']] position = values[header['Start_position']] ref = values[header['Reference_Allele']] alt = values[header['Tumor_Seq_Allele']] sample_id = values[header['Sample_ID']] ccf_1d = [ float(values[header[i]]) for i in ccf_headers ] ccf_1d = np.clip(np.array(ccf_1d, dtype=np.float64), a_min=1e-20, a_max=None) ccf_1d = np.log(ccf_1d, dtype=np.float64) ccf_1d = np.exp(ccf_1d - logsumexp_scipy(ccf_1d)) var_type = values[header['Variant_Type']] mutation_str = ':'.join( [chromosome, position, ref, alt]) if cluster_id not in self._cluster_mutations: self._cluster_mutations[cluster_id] = {} if mutation_str not in self._cluster_mutations[ cluster_id]: self._cluster_mutations[cluster_id][ mutation_str] = {} if sample_id not in self._samples_mutations: self._samples_mutations[sample_id] = [] mutation = SomaticEvents.SomMutation( chromosome, position, ref, alt, ccf_1d, ref_cnt=values[header['t_ref_count']], alt_cnt=values[header['t_alt_count']], gene=values[header['Hugo_Symbol']], prot_change=values[header['Protein_change']], mut_category=values[ header['Variant_Classification']], from_sample=sample_id, type_=var_type) self._cluster_mutations[cluster_id][mutation_str][ sample_id] = mutation self._samples_mutations[sample_id].append(mutation_str) self._clusters[cluster_id].add_mutation(mutation) logging.info( 'Mutation {} loaded from sample {}'.format( mutation_str, sample_id))
def logsum_of_marginals_per_sample(loghist):
    """Apply ``logsumexp`` to every 1-D slice of ``loghist`` taken along axis 1.

    Args:
        loghist: array-like of log values; it is converted to a float32
            array before the reduction.

    Returns:
        numpy array with axis 1 reduced away.
    """
    log_values = np.array(loghist, dtype=np.float32)
    reduce_axis = 1
    return np.apply_along_axis(logsumexp_scipy, reduce_axis, log_values)