예제 #1
0
    def test_relative_entropy(self):
        """Check Dirichlet relative-entropy statistics against sampled values.

        The mean, variance and 0.95 interval that the Dirichlet object reports
        for ``pvec`` are compared with the same quantities estimated from
        random draws of the distribution.
        """
        pvec = (0.1, 0.2, 0.3, 0.4)
        d = Dirichlet((2.0, 10.0, 1.0, 1.0))

        expected_mean = d.mean_relative_entropy(pvec)
        expected_var = d.variance_relative_entropy(pvec)
        low, high = d.interval_relative_entropy(pvec, 0.95)

        # The draws are random, so this test can occasionally fail; the
        # precision obtainable from a few thousand samples is low.
        n_draws = 2000
        draws = zeros((n_draws,), float64)

        for i in range(n_draws):
            post = d.sample()
            # Start from the negated entropy of the draw, then subtract
            # post[k] * log(pvec[k]) term by term, in index order.
            value = -entropy(post)
            for k, q in enumerate(pvec):
                value -= post[k] * log(q)
            draws[i] = value

        # Sorted so that the empirical quantiles can be read off by index.
        draws.sort()
        sigma = sqrt(expected_var)
        lower_idx = int(n_draws * 0.025)
        upper_idx = int(n_draws * 0.975)

        self.assertTrue(abs(draws.mean() - expected_mean) < 4.0 * sigma)
        self.assertAlmostEqual(draws.std(), sigma, 1)
        self.assertTrue(abs(low - draws[lower_idx]) < 0.2)
        self.assertTrue(abs(high - draws[upper_idx]) < 0.2)
예제 #2
0
    def test_relative_entropy(self):
        """Check Dirichlet relative-entropy statistics against sampled values.

        The mean, variance and 0.95 interval that the Dirichlet object reports
        for ``pvec`` are compared with the same quantities estimated from
        random draws of the distribution.
        """
        pvec = (0.1, 0.2, 0.3, 0.4)
        d = Dirichlet((2.0, 10.0, 1.0, 1.0))

        expected_mean = d.mean_relative_entropy(pvec)
        expected_var = d.variance_relative_entropy(pvec)
        low, high = d.interval_relative_entropy(pvec, 0.95)

        # The draws are random, so this test can occasionally fail; the
        # precision obtainable from a few thousand samples is low.
        n_draws = 2000
        draws = zeros((n_draws,), float64)

        for i in range(n_draws):
            post = d.sample()
            # Start from the negated entropy of the draw, then subtract
            # post[k] * log(pvec[k]) term by term, in index order.
            value = -entropy(post)
            for k, q in enumerate(pvec):
                value -= post[k] * log(q)
            draws[i] = value

        # Sorted so that the empirical quantiles can be read off by index.
        draws.sort()
        sigma = sqrt(expected_var)
        lower_idx = int(n_draws * 0.025)
        upper_idx = int(n_draws * 0.975)

        self.assertTrue(abs(draws.mean() - expected_mean) < 4.0 * sigma)
        self.assertAlmostEqual(draws.std(), sigma, 1)
        self.assertTrue(abs(low - draws[lower_idx]) < 0.2)
        self.assertTrue(abs(high - draws[upper_idx]) < 0.2)
예제 #3
0
        def do_test(alpha, samples=1000):
            """Compare sampled entropy moments of Dirichlet(alpha) with the
            values reported by the distribution object.

            Arguments:
                alpha   -- parameters handed to the Dirichlet constructor
                samples -- number of random draws to take (default 1000)
            """
            sampled = zeros((samples,), float64)
            dist = Dirichlet(alpha)
            for idx in range(samples):
                sampled[idx] = entropy(dist.sample())

            sample_mean = mean(sampled)
            sample_var = var(sampled)

            reported_mean = dist.mean_entropy()
            reported_var = dist.variance_entropy()

            # Tolerance: four times sqrt(sample variance / number of draws).
            tolerance = 4.0 * sqrt(sample_var / samples)
            self.assertTrue(abs(sample_mean - reported_mean) < tolerance)
            # The same tolerance is reused for the variance; this is only a
            # rough ("dodgy") error estimate.
            self.assertTrue(abs(sample_var - reported_var) < tolerance)
예제 #4
0
        def do_test(alpha, samples=1000):
            """Compare sampled entropy moments of Dirichlet(alpha) with the
            values reported by the distribution object.

            Arguments:
                alpha   -- parameters handed to the Dirichlet constructor
                samples -- number of random draws to take (default 1000)
            """
            sampled = zeros((samples,), float64)
            dist = Dirichlet(alpha)
            for idx in range(samples):
                sampled[idx] = entropy(dist.sample())

            sample_mean = mean(sampled)
            sample_var = var(sampled)

            reported_mean = dist.mean_entropy()
            reported_var = dist.variance_entropy()

            # Tolerance: four times sqrt(sample variance / number of draws).
            tolerance = 4.0 * sqrt(sample_var / samples)
            self.assertTrue(abs(sample_mean - reported_mean) < tolerance)
            # The same tolerance is reused for the variance; this is only a
            # rough ("dodgy") error estimate.
            self.assertTrue(abs(sample_var - reported_var) < tolerance)
예제 #5
0
def mask_low_complexity(seq, width=12, trigger=1.8, extension=2.0, mask='X'):
    """Mask low complexity regions in protein sequences.

    Follows the Seg method [1] of Wootton & Federhen [2]. The sequence is
    cut into overlapping windows of ``width`` symbols and an entropy is
    computed from the symbol counts of each window. A window counts as low
    complexity when its entropy is below ``trigger``, or when it is below
    ``extension`` and the window examined just before it was itself low
    complexity. The windows are scanned once left to right and once right
    to left; every position covered by a low complexity window is replaced
    by the mask character.

    The default parameters, width=12, trigger=1.8, extension=2.0, mask='X'
    are suitable for masking protein sequences before a database search.
    The standard default seg parameters are width=12, trigger=2.2,
    extension=2.5

    Arguments:
        Seq seq         -- An alphabetic sequence
        int width       -- Window width
        float trigger   -- Entropy in bits between 0 and 4.3.. ( =log_2(20) )
        float extension -- Entropy in bits between 0 and 4.3.. ( =log_2(20) ),
                           not smaller than trigger
        char mask       -- The mask character (default: 'X')
    Returns :
        Seq         -- A masked alphabetic sequence. When width exceeds the
                       sequence length, seq itself is returned unchanged.
    Raises :
        ValueError  -- On invalid arguments
    Refs:
        [1] seg man page:
            http://bioportal.weizmann.ac.il/education/materials/gcg/seg.html
        [2] Wootton & Federhen (Computers and Chemistry 17; 149-163, (1993))
    Authors:
        GEC 2005
    Future :
        - Option to lower case masked symbols.
        - Remove arbitrary restriction to protein.
    """
    # Largest entropy accepted for either threshold: log2 of 20.
    max_entropy = log2(20)
    if trigger > max_entropy or trigger < 0:
        raise ValueError("Invalid trigger complexity: %f" % trigger)
    if extension < trigger or extension > max_entropy or extension < 0:
        raise ValueError("Invalid extension complexity: %f" % extension)
    if width < 0:
        raise ValueError("Invalid width: %d" % width)

    seq_len = len(seq)
    if seq_len < width:
        # Not even one full window fits; hand the input back untouched.
        return seq

    ords = seq.ords()
    mask_ord = seq.alphabet.ord(mask)
    window_count = seq_len - width + 1

    # Symbol counts of the current window. They are updated incrementally
    # as the window slides one position to the right.
    counts = [0] * len(seq.alphabet)
    for symbol in ords[:width]:
        counts[symbol] += 1
    entropies = [entropy(counts, 2)]
    for start in range(1, window_count):
        counts[ords[start - 1]] -= 1
        counts[ords[start + width - 1]] += 1
        entropies.append(entropy(counts, 2))

    def sweep(window_starts):
        # Visit the windows in the given order and overwrite every position
        # of each low complexity window with the mask ordinal.
        previous_masked = False
        for start in window_starts:
            window_entropy = entropies[start]
            if window_entropy < trigger or (
                    previous_masked and window_entropy < extension):
                for pos in range(start, start + width):
                    ords[pos] = mask_ord
                previous_masked = True
            else:
                previous_masked = False

    # Forward pass, then the same pass over the windows in reverse order.
    sweep(range(window_count))
    sweep(range(window_count - 1, -1, -1))

    masked = seq.alphabet.chrs(ords)
    masked.name = seq.name
    masked.description = seq.description
    return masked
예제 #6
0
def mask_low_complexity(seq, width=12, trigger=1.8, extension=2.0, mask='X'):
    """Mask low complexity regions in protein sequences.

    Follows the Seg method [1] of Wootton & Federhen [2]. The sequence is
    cut into overlapping windows of ``width`` symbols and an entropy is
    computed from the symbol counts of each window. A window counts as low
    complexity when its entropy is below ``trigger``, or when it is below
    ``extension`` and the window examined just before it was itself low
    complexity. The windows are scanned once left to right and once right
    to left; every position covered by a low complexity window is replaced
    by the mask character.

    The default parameters, width=12, trigger=1.8, extension=2.0, mask='X'
    are suitable for masking protein sequences before a database search.
    The standard default seg parameters are width=12, trigger=2.2,
    extension=2.5

    Arguments:
        Seq seq         -- An alphabetic sequence
        int width       -- Window width
        float trigger   -- Entropy in bits between 0 and 4.3.. ( =log_2(20) )
        float extension -- Entropy in bits between 0 and 4.3.. ( =log_2(20) ),
                           not smaller than trigger
        char mask       -- The mask character (default: 'X')
    Returns :
        Seq         -- A masked alphabetic sequence. When width exceeds the
                       sequence length, seq itself is returned unchanged.
    Raises :
        ValueError  -- On invalid arguments
    Refs:
        [1] seg man page:
            http://bioportal.weizmann.ac.il/education/materials/gcg/seg.html
        [2] Wootton & Federhen (Computers and Chemistry 17; 149-163, (1993))
    Authors:
        GEC 2005
    Future :
        - Option to lower case masked symbols.
        - Remove arbitrary restriction to protein.
    """
    # Largest entropy accepted for either threshold: log2 of 20.
    max_entropy = log2(20)
    if trigger > max_entropy or trigger < 0:
        raise ValueError("Invalid trigger complexity: %f" % trigger)
    if extension < trigger or extension > max_entropy or extension < 0:
        raise ValueError("Invalid extension complexity: %f" % extension)
    if width < 0:
        raise ValueError("Invalid width: %d" % width)

    seq_len = len(seq)
    if seq_len < width:
        # Not even one full window fits; hand the input back untouched.
        return seq

    ords = seq.ords()
    mask_ord = seq.alphabet.ord(mask)
    window_count = seq_len - width + 1

    # Symbol counts of the current window. They are updated incrementally
    # as the window slides one position to the right.
    counts = [0] * len(seq.alphabet)
    for symbol in ords[:width]:
        counts[symbol] += 1
    entropies = [entropy(counts, 2)]
    for start in range(1, window_count):
        counts[ords[start - 1]] -= 1
        counts[ords[start + width - 1]] += 1
        entropies.append(entropy(counts, 2))

    def sweep(window_starts):
        # Visit the windows in the given order and overwrite every position
        # of each low complexity window with the mask ordinal.
        previous_masked = False
        for start in window_starts:
            window_entropy = entropies[start]
            if window_entropy < trigger or (
                    previous_masked and window_entropy < extension):
                for pos in range(start, start + width):
                    ords[pos] = mask_ord
                previous_masked = True
            else:
                previous_masked = False

    # Forward pass, then the same pass over the windows in reverse order.
    sweep(range(window_count))
    sweep(range(window_count - 1, -1, -1))

    masked = seq.alphabet.chrs(ords)
    masked.name = seq.name
    masked.description = seq.description
    return masked