def test_relative_entropy(self):
    """Check Dirichlet relative-entropy statistics against sampled values.

    Draws 2000 samples from ``Dirichlet((2.0, 10.0, 1.0, 1.0))`` and, for each
    draw ``post``, evaluates ``-entropy(post) - sum_k post[k] * log(pvec[k])``.
    The empirical mean, standard deviation and the 2.5% / 97.5% order
    statistics are then compared with ``mean_relative_entropy``,
    ``variance_relative_entropy`` and ``interval_relative_entropy(pvec, 0.95)``.
    """
    reference = (0.1, 0.2, 0.3, 0.4)
    dist = Dirichlet((2.0, 10.0, 1.0, 1.0))

    expected_mean = dist.mean_relative_entropy(reference)
    expected_var = dist.variance_relative_entropy(reference)
    lower, upper = dist.interval_relative_entropy(reference, 0.95)

    # This test can fail randomly, but the precision from a few
    # thousand samples is low. Increasing samples, 1000->2000
    n_draws = 2000

    # The reference logs do not depend on the draw, so compute them once.
    log_reference = [log(q) for q in reference]

    draws = zeros((n_draws,), float64)
    for i in range(n_draws):
        sample = dist.sample()
        value = -entropy(sample)
        for k, log_q in enumerate(log_reference):
            value -= sample[k] * log_q
        draws[i] = value
    # Sorted so that positional lookups below act as empirical quantiles.
    draws.sort()

    expected_std = sqrt(expected_var)
    self.assertTrue(abs(draws.mean() - expected_mean) < 4.0 * expected_std)
    self.assertAlmostEqual(draws.std(), expected_std, 1)
    self.assertTrue(abs(lower - draws[int(n_draws * 0.025)]) < 0.2)
    self.assertTrue(abs(upper - draws[int(n_draws * 0.975)]) < 0.2)
def test_relative_entropy(self):
    """Check Dirichlet relative-entropy statistics against sampled values.

    Draws 2000 samples from ``Dirichlet((2.0, 10.0, 1.0, 1.0))`` and, for each
    draw ``post``, evaluates ``-entropy(post) - sum_k post[k] * log(pvec[k])``.
    The empirical mean, standard deviation and the 2.5% / 97.5% order
    statistics are then compared with ``mean_relative_entropy``,
    ``variance_relative_entropy`` and ``interval_relative_entropy(pvec, 0.95)``.
    """
    reference = (0.1, 0.2, 0.3, 0.4)
    dist = Dirichlet((2.0, 10.0, 1.0, 1.0))

    expected_mean = dist.mean_relative_entropy(reference)
    expected_var = dist.variance_relative_entropy(reference)
    lower, upper = dist.interval_relative_entropy(reference, 0.95)

    # This test can fail randomly, but the precision from a few
    # thousand samples is low. Increasing samples, 1000->2000
    n_draws = 2000

    # The reference logs do not depend on the draw, so compute them once.
    log_reference = [log(q) for q in reference]

    draws = zeros((n_draws,), float64)
    for i in range(n_draws):
        sample = dist.sample()
        value = -entropy(sample)
        for k, log_q in enumerate(log_reference):
            value -= sample[k] * log_q
        draws[i] = value
    # Sorted so that positional lookups below act as empirical quantiles.
    draws.sort()

    expected_std = sqrt(expected_var)
    self.assertTrue(abs(draws.mean() - expected_mean) < 4.0 * expected_std)
    self.assertAlmostEqual(draws.std(), expected_std, 1)
    self.assertTrue(abs(lower - draws[int(n_draws * 0.025)]) < 0.2)
    self.assertTrue(abs(upper - draws[int(n_draws * 0.975)]) < 0.2)
def do_test(alpha, samples=1000):
    """Compare sampled entropy statistics of ``Dirichlet(alpha)`` with its own.

    Draws ``samples`` vectors from the distribution, evaluates ``entropy`` on
    each one, and asserts that the empirical mean and variance are within
    ``4 * sqrt(variance / samples)`` of ``mean_entropy()`` and
    ``variance_entropy()`` respectively.

    ``self`` is not a parameter; the assertions use the ``self`` of the
    enclosing scope.
    """
    observed = zeros((samples,), float64)
    dist = Dirichlet(alpha)
    for i in range(samples):
        observed[i] = entropy(dist.sample())

    sample_mean = mean(observed)
    sample_var = var(observed)
    expected_mean = dist.mean_entropy()
    expected_var = dist.variance_entropy()

    # Four standard errors of the sample mean.
    tolerance = 4.0 * sqrt(sample_var / samples)
    self.assertTrue(abs(sample_mean - expected_mean) < tolerance)
    # Reusing the mean's tolerance for the variance is a dodgy error estimate.
    self.assertTrue(abs(sample_var - expected_var) < tolerance)
def do_test(alpha, samples=1000):
    """Compare sampled entropy statistics of ``Dirichlet(alpha)`` with its own.

    Draws ``samples`` vectors from the distribution, evaluates ``entropy`` on
    each one, and asserts that the empirical mean and variance are within
    ``4 * sqrt(variance / samples)`` of ``mean_entropy()`` and
    ``variance_entropy()`` respectively.

    ``self`` is not a parameter; the assertions use the ``self`` of the
    enclosing scope.
    """
    observed = zeros((samples,), float64)
    dist = Dirichlet(alpha)
    for i in range(samples):
        observed[i] = entropy(dist.sample())

    sample_mean = mean(observed)
    sample_var = var(observed)
    expected_mean = dist.mean_entropy()
    expected_var = dist.variance_entropy()

    # Four standard errors of the sample mean.
    tolerance = 4.0 * sqrt(sample_var / samples)
    self.assertTrue(abs(sample_mean - expected_mean) < tolerance)
    # Reusing the mean's tolerance for the variance is a dodgy error estimate.
    self.assertTrue(abs(sample_var - expected_var) < tolerance)
def mask_low_complexity(seq, width=12, trigger=1.8, extension=2.0, mask='X'):
    """Mask low complexity regions in protein sequences.

    Uses the method of Seg [1] by Wootton & Federhen [2] to divide a sequence
    into regions of high and low complexity. The sequence is divided into
    overlapping windows. Low complexity windows either have a sequence entropy
    less than the trigger complexity, or have an entropy less than the
    extension complexity and neighbor other low-complexity windows. The
    sequence within a low complexity region is replaced with the mask
    character (default 'X'), and the masked alphabetic sequence is returned.

    The default parameters, width=12, trigger=1.8, extension=2.0, mask='X' are
    suitable for masking protein sequences before a database search. The
    standard default seg parameters are width=12, trigger=2.2, extension=2.5

    Arguments:
        Seq seq         -- An alphabetic sequence
        int width       -- Window width; must be at least 1
        float trigger   -- Entropy in bits between 0 and 4.3.. ( =log_2(20) )
        float extension -- Entropy in bits between 0 and 4.3.. ( =log_2(20) );
                           must not be smaller than trigger
        char mask       -- The mask character (default: 'X')
    Returns :
        Seq        -- A masked alphabetic sequence. If width exceeds the
                      sequence length, the input sequence itself is returned
                      unchanged.
    Raises :
        ValueError -- On invalid arguments
    Refs:
        [1] seg man page:
            http://bioportal.weizmann.ac.il/education/materials/gcg/seg.html
        [2] Wootton & Federhen (Computers and Chemistry 17; 149-163, (1993))
    Authors:
        GEC 2005
    Future :
        - Optional mask character.
        - Option to lower case masked symbols.
        - Remove arbitrary restriction to protein.
    """
    lg20 = log2(20)
    if trigger < 0 or trigger > lg20:
        raise ValueError("Invalid trigger complexity: %f" % trigger)
    if extension < 0 or extension > lg20 or extension < trigger:
        raise ValueError("Invalid extension complexity: %f" % extension)
    # A window needs at least one symbol: a zero-width window has an empty
    # composition, so there is nothing meaningful to compute an entropy from.
    if width <= 0:
        raise ValueError("Invalid width: %d" % width)

    # Not even one full window fits; nothing can be masked.
    if width > len(seq):
        return seq

    s = seq.ords()
    X = seq.alphabet.ord(mask)

    # Entropy of every window, maintained incrementally: sliding the window
    # one position drops the symbol on the left and adds one on the right.
    nwindows = len(seq) - width + 1
    count = [0] * len(seq.alphabet)
    for c in s[0:width]:
        count[c] += 1
    ent = [entropy(count, 2)]
    for i in range(1, nwindows):
        count[s[i - 1]] -= 1
        count[s[i + width - 1]] += 1
        ent.append(entropy(count, 2))

    # Sweep forwards, then backwards, so a triggered window can extend the
    # masked region to neighbours on either side. The decisions depend only
    # on the precomputed entropies, never on symbols that are already masked.
    for order in (range(nwindows), range(nwindows - 1, -1, -1)):
        prev_segged = False
        for i in order:
            if ent[i] < trigger or (prev_segged and ent[i] < extension):
                for j in range(i, i + width):
                    s[j] = X
                prev_segged = True
            else:
                prev_segged = False

    segged = seq.alphabet.chrs(s)
    segged.name = seq.name
    segged.description = seq.description
    return segged
def mask_low_complexity(seq, width=12, trigger=1.8, extension=2.0, mask='X'):
    """Mask low complexity regions in protein sequences.

    Uses the method of Seg [1] by Wootton & Federhen [2] to divide a sequence
    into regions of high and low complexity. The sequence is divided into
    overlapping windows. Low complexity windows either have a sequence entropy
    less than the trigger complexity, or have an entropy less than the
    extension complexity and neighbor other low-complexity windows. The
    sequence within a low complexity region is replaced with the mask
    character (default 'X'), and the masked alphabetic sequence is returned.

    The default parameters, width=12, trigger=1.8, extension=2.0, mask='X' are
    suitable for masking protein sequences before a database search. The
    standard default seg parameters are width=12, trigger=2.2, extension=2.5

    Arguments:
        Seq seq         -- An alphabetic sequence
        int width       -- Window width; must be at least 1
        float trigger   -- Entropy in bits between 0 and 4.3.. ( =log_2(20) )
        float extension -- Entropy in bits between 0 and 4.3.. ( =log_2(20) );
                           must not be smaller than trigger
        char mask       -- The mask character (default: 'X')
    Returns :
        Seq        -- A masked alphabetic sequence. If width exceeds the
                      sequence length, the input sequence itself is returned
                      unchanged.
    Raises :
        ValueError -- On invalid arguments
    Refs:
        [1] seg man page:
            http://bioportal.weizmann.ac.il/education/materials/gcg/seg.html
        [2] Wootton & Federhen (Computers and Chemistry 17; 149-163, (1993))
    Authors:
        GEC 2005
    Future :
        - Optional mask character.
        - Option to lower case masked symbols.
        - Remove arbitrary restriction to protein.
    """
    lg20 = log2(20)
    if trigger < 0 or trigger > lg20:
        raise ValueError("Invalid trigger complexity: %f" % trigger)
    if extension < 0 or extension > lg20 or extension < trigger:
        raise ValueError("Invalid extension complexity: %f" % extension)
    # A window needs at least one symbol: a zero-width window has an empty
    # composition, so there is nothing meaningful to compute an entropy from.
    if width <= 0:
        raise ValueError("Invalid width: %d" % width)

    # Not even one full window fits; nothing can be masked.
    if width > len(seq):
        return seq

    s = seq.ords()
    X = seq.alphabet.ord(mask)

    # Entropy of every window, maintained incrementally: sliding the window
    # one position drops the symbol on the left and adds one on the right.
    nwindows = len(seq) - width + 1
    count = [0] * len(seq.alphabet)
    for c in s[0:width]:
        count[c] += 1
    ent = [entropy(count, 2)]
    for i in range(1, nwindows):
        count[s[i - 1]] -= 1
        count[s[i + width - 1]] += 1
        ent.append(entropy(count, 2))

    # Sweep forwards, then backwards, so a triggered window can extend the
    # masked region to neighbours on either side. The decisions depend only
    # on the precomputed entropies, never on symbols that are already masked.
    for order in (range(nwindows), range(nwindows - 1, -1, -1)):
        prev_segged = False
        for i in order:
            if ent[i] < trigger or (prev_segged and ent[i] < extension):
                for j in range(i, i + width):
                    s[j] = X
                prev_segged = True
            else:
                prev_segged = False

    segged = seq.alphabet.chrs(s)
    segged.name = seq.name
    segged.description = seq.description
    return segged