def _score(self, alignment):
    if self.use_seq_weights:
        seq_weights = alignment.get_seq_weights()
    else:
        seq_weights = [1.0] * len(alignment.msa)

    if self.bg_distribution is None:
        # Estimate bg distribution from this alignment
        q = weighted_freq_count_pseudocount(
            (aa for seq in alignment.msa for aa in seq),
            seq_weights, PSEUDOCOUNT)
    else:
        q = self.bg_distribution

    scores = []
    for i in xrange(len(alignment.msa[0])):
        col = get_column(i, alignment.msa)
        n_gaps = col.count("-")
        assert n_gaps < len(col)
        # Use float division so the gap fraction isn't truncated to 0 under Python 2.
        if self.gap_cutoff != 1 and float(n_gaps) / len(col) > self.gap_cutoff:
            score = self.SCORE_OVER_GAP_CUTOFF
        else:
            score = self._score_col(col, seq_weights, q)
            if self.use_gap_penalty:
                # vn_entropy has this commented out for some reason
                score *= weighted_gap_penalty(col, seq_weights)
        scores.append(score)
    return scores
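# A minimal sketch (an assumption, not part of this module) of what the
# weighted_gap_penalty helper used above plausibly computes: the column score
# is scaled by the weighted fraction of non-gap residues.
def weighted_gap_penalty_sketch(col, seq_weights):
    # Fall back to uniform weights if the lengths don't match.
    if len(seq_weights) != len(col):
        seq_weights = [1.0] * len(col)
    gap_weight = sum(w for aa, w in zip(col, seq_weights) if aa == "-")
    return 1.0 - gap_weight / sum(seq_weights)

# Example: a column with one gap out of four equally weighted sequences gets
# a penalty factor of 0.75:
#   weighted_gap_penalty_sketch(["A", "A", "-", "C"], [1.0] * 4)  # == 0.75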
def _score_col(self, col, seq_weights):
    """
    Calculate the relative entropy of a column col relative to a partition
    of the amino acids. Similar to Williamson '95. See shannon_entropy() for
    more general info.
    """
    if len(self.bg_distribution) == len(self.property_partition):
        prop_bg_freq = self.bg_distribution
    else:
        # XXX: shouldn't we sum the bg distribution frequencies instead of
        # using some fixed prop bg freq?
        prop_bg_freq = self.prop_bg_freq

    fc = weighted_freq_count_pseudocount(col, seq_weights, PSEUDOCOUNT)

    # Sum the aa frequencies to get the property frequencies.
    prop_fc = [0.] * len(self.property_partition)
    for p in xrange(len(self.property_partition)):
        for aa in self.property_partition[p]:
            prop_fc[p] += fc[aa_to_index[aa]]

    d = 0.
    for i in xrange(len(prop_fc)):
        if prop_fc[i] and prop_bg_freq[i]:
            d += prop_fc[i] * math.log(prop_fc[i] / prop_bg_freq[i], 2)

    # Convert score so that it's between 0 and 1. Normalize in base 2 to
    # match the base-2 relative entropy above.
    # XXX: why is relative entropy assumed to be bounded?
    d /= math.log(len(prop_fc), 2)
    return d
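# Worked toy example (assumed numbers, for illustration only): with a 2-way
# property partition, column property frequencies [0.9, 0.1], and a uniform
# background [0.5, 0.5], the base-2 relative entropy is
#   0.9 * log2(0.9 / 0.5) + 0.1 * log2(0.1 / 0.5) ~= 0.531 bits,
# and dividing by log2(2) = 1 leaves a score of ~0.531.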
def _score_col(self, col, seq_weights):
    """
    Calculate the entropy of a column col relative to a partition of the
    amino acids. Similar to Mirny '99.
    """
    fc = weighted_freq_count_pseudocount(col, seq_weights, PSEUDOCOUNT)

    # Sum the aa frequencies to get the property frequencies.
    prop_fc = [0.] * len(self.property_partition)
    for p in range(len(self.property_partition)):
        for aa in self.property_partition[p]:
            prop_fc[p] += fc[aa_to_index[aa]]

    h = 0.
    for pfc_i in prop_fc:
        if pfc_i:
            h -= pfc_i * math.log(pfc_i)

    # Convert score so that it's between 0 and 1.
    # Recall that Shannon entropy is between 0 and log(number of values with nonzero freq).
    # XXX: Why involve len(col) if we have a pseudocount?
    h /= math.log(min(len(self.property_partition), len(col)))

    # Convert score so that 1 is conserved, and 0 is not.
    return 1 - h
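# Boundary cases (ignoring the pseudocount, for illustration only): a column
# whose residues all fall in one property gives h = 0 and a score of 1
# (conserved); a column spread evenly over 4 properties gives h = ln(4),
# which the min(4, len(col)) normalizer maps to 1, so the score is 0.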
def _score_col(self, col, seq_weights):
    """
    Calculate the relative entropy of the column distribution with a
    background distribution specified in bg_distr. This is similar to the
    approach proposed in Wang and Samudrala 06.
    """
    q = self.bg_distribution
    with_gap = (len(q) == 21)
    fc = weighted_freq_count_pseudocount(col, seq_weights, PSEUDOCOUNT, with_gap)
    assert len(fc) == len(q)

    d = np.sum(fc * np.log(fc / q))

    # Convert score so that it's between 0 and 1.
    # XXX: why is relative entropy assumed to be bounded?
    d /= np.log(len(fc))
    return d
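# Minimal standalone sketch (a hypothetical helper, assuming np is the numpy
# module imported elsewhere in this file): the same normalized relative
# entropy on plain arrays, without the class machinery.
def relative_entropy_sketch(fc, q):
    fc = np.asarray(fc, dtype=float)
    q = np.asarray(q, dtype=float)
    return np.sum(fc * np.log(fc / q)) / np.log(len(fc))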
def _score_col(self, col, seq_weights, q):
    """
    Return the Jensen-Shannon divergence for the column with the background
    distribution q.
    """
    lamb1 = self.lambda_prior
    lamb2 = 1 - self.lambda_prior

    # Get the frequency distribution for the column.
    with_gap = (len(q) == 21)
    pc = weighted_freq_count_pseudocount(col, seq_weights, PSEUDOCOUNT, with_gap)
    assert len(pc) == len(q)

    # Make the mixture distribution r.
    r = lamb1 * pc + lamb2 * q

    # Sum the relative entropies, guarding each term against a zero numerator.
    d1 = lamb1 * sum(pc[i] * math.log(pc[i] / r[i], 2)
                     for i in xrange(len(pc)) if pc[i])
    d2 = lamb2 * sum(q[i] * math.log(q[i] / r[i], 2)
                     for i in xrange(len(q)) if q[i])
    return d1 + d2
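# Worked toy example (assumed numbers, ignoring the pseudocount): with
# lambda_prior = 0.5 and completely disjoint distributions pc = [1, 0] and
# q = [0, 1], the mixture is r = [0.5, 0.5], so
#   d1 = 0.5 * log2(1 / 0.5) = 0.5 and d2 = 0.5 * log2(1 / 0.5) = 0.5,
# giving the maximum divergence of 1 bit; identical pc and q give 0.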
def _score_col(self, col, seq_weights):
    """
    Calculate the Shannon entropy of the column col. The entropy will be
    between zero and one because of its base. See p.13 of Valdar 02 for
    details. The information score 1 - h is returned for the sake of
    consistency with other scores.
    """
    fc = weighted_freq_count_pseudocount(col, seq_weights, PSEUDOCOUNT)

    h = 0.
    for fc_i in fc:
        if fc_i:
            h -= fc_i * math.log(fc_i)

    # Convert score so that it's between 0 and 1.
    # Recall that Shannon entropy is between 0 and log(number of values with nonzero freq).
    # XXX: Why involve len(col) if we have a pseudocount?
    h /= math.log(len(fc))  # math.log(min(len(fc), len(col)))

    # Convert score so that 1 is conserved, and 0 is not.
    return 1 - h
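# Worked toy example (assumed numbers, ignoring the pseudocount): over the
# 20-letter alphabet, a column split evenly between two amino acids gives
#   h = ln(2) ~= 0.693, normalized by ln(20) ~= 2.996 to ~0.231,
# so the returned conservation score is 1 - 0.231 ~= 0.77.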