示例#1
0
 def predict(self, inpseq, useCascade = True):
     """ Classify each symbol in a sequence.

         inpseq: the sequence to classify; it is cut into sliding windows
             with slidewin and each window is fed to a neural network.
         useCascade: if true and self.cascade is set, inpseq is first
             classified with the primary network (nn1) alone, and windows of
             self.cascade predicted symbols are then classified by the
             secondary network (nn2). Otherwise only nn1 is used.

         Return the predictions as a sequence.Sequence over self.outp_alpha.
         Window i decides the symbol at position i + (window width // 2);
         positions not decided by any window (the flanks) are reported
         as 'C' (coil). """
     # Window width of the primary NN. Integer division is required: the value
     # is used as a window size and as a list offset, and true division would
     # yield a float on Python 3.
     W = self.nn1.ninput // len(self.inp_alpha)
     if useCascade and self.cascade:
         # First pass with the primary NN only; its output is the input of nn2
         nn1seq = self.predict(inpseq, useCascade = False)
         windows = slidewin(nn1seq, self.cascade)
         net = self.nn2
         in_alpha = self.outp_alpha
         veclen = self.cascade * len(self.outp_alpha)
         half = self.cascade // 2
     else: # only predict using the first NN
         windows = slidewin(inpseq, W)
         net = self.nn1
         in_alpha = self.inp_alpha
         veclen = self.inp_len * len(self.inp_alpha)
         half = W // 2
     predsyms = ['C'] * len(inpseq) # use coil for positions in flanking regions
     for i, window in enumerate(windows):    # for each input sub-sequence of the NN
         invec = numpy.zeros(veclen)
         invec[_onehotIndex(in_alpha, window)] = 1
         outvec = net.feedforward(invec)
         # Collect the network outputs as weights of the output symbols
         d = prob.Distrib(self.outp_alpha)
         for k, activation in enumerate(outvec):
             d.observe(self.outp_alpha[k], activation)
         predsyms[i + half] = d.getmax()    # use the symbol with the highest probability
     return sequence.Sequence(predsyms, self.outp_alpha)
示例#2
0
def saveConsensus(aln, theta2=0.01, countgaps=False, filename=None):
    """ Write a summary of the variable alignment columns, showing
        symbols in order of decreasing probability.

        For every column in which more than one symbol reaches the threshold,
        an entry "<column>:<symbol><percent>..." is written (columns are
        numbered from 1), followed by a space. A single newline ends the output.

        theta2 is the percent threshold (0.01 is 1 percent) for inclusion (symbols below are ignored).
        countgaps, if true, count gaps (default false).
        filename is name of file to save the output to (default stdout).
        All characters other than letters, digits, '_' and '.' are removed
        from filename before the file is opened. """
    if filename is None:
        f = sys.stdout
    else:  # assume filename is a textstring, which is first cleaned from strange characters
        filename = ''.join(e for e in filename
                           if e.isalnum() or e == '_' or e == '.')
        f = open(filename, 'w')
    try:
        # the alphabet (with or without gap) is the same for every column
        myalpha = aln.alphabet
        if countgaps:
            alist = list(aln.alphabet)
            alist.append('-')
            myalpha = sequence.Alphabet(alist)
        for col in range(aln.alignlen):
            # collect probabilities for column
            d = prob.Distrib(myalpha)
            for seq in aln.seqs:
                if seq[col] in myalpha:
                    d.observe(seq[col])
            symprobs = d.getProbsort()  # the symbols sorted by probability
            # count the leading symbols that reach the threshold
            ninclusions = 0
            for (s, p) in symprobs:
                if p >= theta2:
                    ninclusions += 1
                else:
                    break
            if ninclusions > 1:  # only report columns that are not conserved
                f.write("%d:" % (col + 1))
                for (s, p) in symprobs:
                    if p >= theta2:
                        f.write("%c%02d" % (s, int((p * 100))))
                f.write(" ")
        f.write('\n')
    finally:
        # Close only a file opened here; closing sys.stdout would break
        # all later printing by the caller.
        if f is not sys.stdout:
            f.close()
示例#3
0
def scanMotifReport(seqs, motif, threshold=0, jaspar = 'JASPAR_matrices.txt'):
    """ Produce a plot for a scan of the specified motif.
        The plot has as its x-axis position of sequence (0 is the centre), and
        the y-axis the PWM score averaged over all sequences, where only
        scores above threshold contribute.

        seqs: the sequences to scan; all must have the same length.
        motif: key of the motif in the counts read from the jaspar file.
        threshold: passed to the PWM search, and the score a position must
            exceed to be added to the average.
        jaspar: name of the file read with prb.readMultiCounts.

        Problems with the input are reported through usage() and the function
        then returns without plotting. """
    if len(seqs) == 0:
        usage(sys.argv[0], "At least one sequence is required")
        return
    # check that all sequences are the same length and set sequence length
    seq_len = len(seqs[0])
    for seq in seqs:
        if len(seq) != seq_len:
            usage(sys.argv[0], "All sequences must have same length")
            return

    # create the motif and its reverse complement
    bg = prb.Distrib(sym.DNA_Alphabet, sequence.getCount(seqs))
    d = prb.readMultiCounts(jaspar)
    try:
        fg1 = d[motif]
        fg2 = getReverse(d[motif])
    except KeyError:
        usage(sys.argv[0], "Unknown motif %s" % motif)
        return
    print("Motif %s:" % motif)
    pwm1 = sequence.PWM(fg1, bg)
    pwm1.display(format='JASPAR')
    print("Motif %s (reverse complement):" % motif)
    pwm2 = sequence.PWM(fg2, bg)
    pwm2.display(format='JASPAR')

    # initialize things to zero
    avg_motif_score = np.zeros(seq_len)

    # Hits are marked at the *center* of the site. Integer division is
    # required because the result is used as a list offset.
    half_width = pwm1.length // 2

    # accumulate score at each position (on both strands) in sequences
    for seq in seqs:
        # positive strand
        pos_scores = seq_len * [0]
        for hit in pwm1.search(seq, threshold):
            pos_scores[hit[0] + half_width] = hit[2]

        # negative strand
        neg_scores = seq_len * [0]
        for hit in pwm2.search(seq, threshold):
            neg_scores[hit[0] + half_width] = hit[2]

        # use maximum score on two strands
        for i in range(seq_len):
            score = max(pos_scores[i], neg_scores[i])
            if score > threshold:
                avg_motif_score[i] += score

    # compute average score
    avg_motif_score /= len(seqs)

    # plot the average score curve
    # Call center of sequence X=0. The upper bound is chosen so that x has
    # exactly seq_len entries for odd lengths as well as even ones.
    x = list(range(-(seq_len // 2), seq_len - seq_len // 2))
    lbl = "%s" % (motif)
    plt.plot(x, avg_motif_score, label=lbl)
    plt.axhline(color='black', linestyle='dotted')
    plt.legend(loc='lower center')
    plt.xlabel('position')
    plt.ylabel('average motif score')
    plt.title(motif)
    plt.show()
示例#4
0
def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices.txt', seed=0):
    """ Produce a plot for a scan of the specified motif.
        The plot has as its x-axis position of sequence (0 is the centre), and
        the y-axis the smoothed fraction of sequences with a best hit at
        position x.
        Sequences with no hit above 'threshold' are ignored.
        Ties for best hit are broken randomly.

        seqs: the sequences to scan; all must have the same length.
        motif: key of the motif in the counts read from the jaspar file.
        threshold: passed to the PWM search on both strands.
        jaspar: name of the file read with prb.readMultiCounts.
        seed: seed for the random module, so that tie-breaking is repeatable.

        The p-value of the central region that is most "centrally enriched"
        and the width of the best central region are intended to be printed
        in the label of the plot; that statistic is not implemented yet (see
        the STATISTICS section below).

        Problems with the input are reported through usage() and the function
        then returns without plotting.
    """

    # set the random seed for repeatability
    random.seed(seed)

    if len(seqs) == 0:
        usage(sys.argv[0], "At least one sequence is required")
        return
    # check that all sequences are the same length and set sequence length
    seq_len = len(seqs[0])
    for seq in seqs:
        if len(seq) != seq_len:
            usage(sys.argv[0], "All sequences must have same length")
            return

    # create the motif and its reverse complement
    bg = prb.Distrib(sym.DNA_Alphabet, sequence.getCount(seqs))
    d = prb.readMultiCounts(jaspar)
    try:
        fg1 = d[motif]
        fg2 = getReverse(d[motif])
    except KeyError:
        usage(sys.argv[0], "Unknown motif %s" % motif)
        return
    print("Motif %s:" % motif)
    pwm1 = sequence.PWM(fg1, bg)
    pwm1.display(format='JASPAR')
    print("Motif %s (reverse complement):" % motif)
    pwm2 = sequence.PWM(fg2, bg)
    pwm2.display(format='JASPAR')

    # initialize things to zero
    hit_count = np.zeros(seq_len)
    n_seqs_with_hits = 0

    # Scan each sequence for all hits on both strands and record
    # the number of "best hits" at each sequence position.
    #
    motif_width = pwm1.length
    # Hits are marked at the *center* of the site. Integer division is
    # required because the result is used as an array offset.
    half_width = motif_width // 2
    for seq in seqs:
        # scan with both motifs
        hits = pwm1.search(seq, threshold) + pwm2.search(seq, threshold)
        # Record position of best hit
        if hits:
            n_seqs_with_hits += 1
            # find best hit score (the score is element 2 of a hit)
            best_score = max(hit[2] for hit in hits)
            # find ties
            best_hits = [ hit for hit in hits if hit[2] == best_score ]
            # break ties at random
            best_hit = random.choice(best_hits)
            hit_count[best_hit[0] + half_width] += 1
    # divide the number of best hits at each position by the number of
    # sequences with a hit; without any hit every probability is zero
    if n_seqs_with_hits:
        site_probability = hit_count / n_seqs_with_hits
    else:
        site_probability = np.zeros(seq_len)

    print("Number of sequences with hit (score >= %f): %d" % (threshold, n_seqs_with_hits), file=sys.stderr)

    # STATISTICS
    # Get the cumulative hit counts in concentric windows
    # and perform the Binomial Test.  Report best region and its p-value.
    # TODO: the test itself is still to be written; only the cumulative
    # counts are computed so far and they are not used yet.
    #
    best_r = 0
    best_log_pvalue = 1
    center = seq_len // 2               # center of sequence
    cum_hit_count = np.zeros(seq_len)   # total hits in window of width i
    for i in range(1, (seq_len - motif_width // 2 + 1) // 2):
        cum_hit_count[i] = cum_hit_count[i-1] + hit_count[center-i] + hit_count[center+i]
        # Compute probability of observed or more best hits in central window
        # assuming uniform probability distribution in each sequence.
    #   successes = cum_hit_count[i]
    #   trials = n_seqs_with_hits
    #    p_success = ?
    #    log_pvalue = ?
    #    if (log_pvalue < best_log_pvalue):
    #        best_log_pvalue = log_pvalue
    #        best_r = 2*i
    # End STATISTICS

    # smooth with a moving average over a window of width 2*hw + 1
    hw = 5
    smoothed_site_probability = np.zeros(seq_len)
    for i in range(hw, seq_len-motif_width+1-hw):
        smoothed_site_probability[i]=sum(site_probability[i-hw:i+hw+1])/(2*hw+1)

    # Call center of sequence X=0. The upper bound is chosen so that x has
    # exactly seq_len entries for odd lengths as well as even ones.
    x = list(range(-(seq_len // 2), seq_len - seq_len // 2))
    lbl = "%s, t=%.2f" % (motif, threshold)
    #lbl = "%s, t=%.2f, w=%d, p=%.2e" % (motif, threshold, best_r, math.exp(best_log_pvalue))
    plt.plot(x, smoothed_site_probability, label=lbl)
    plt.axhline(color='black', linestyle='dotted')
    plt.legend(loc='lower center')
    plt.xlabel('Position of best site')
    plt.ylabel('Smoothed probability')
    plt.title(motif)
    plt.show()
示例#5
0
    def discover(self, pseudocount=None, niter=None):
        """ Find the most probable common pattern represented by a
            position weight matrix (PWM), based on W+1 distributions
            (W foreground columns and one background).

            pseudocount: the distribution used for pseudo-counts (default is uniform)
            niter: number of iterations (if None, 100*N is used; where N is number of seqs).

            The start positions in self.alignment are re-sampled one sequence
            at a time. A log-likelihood score is printed every 100th iteration.
            On completion self.q, self.p and self.alignment are updated.

            Return q, the list of W foreground distributions.
        """
        # Initialise parameters necessary for the discovery run (below)
        N = len(self.seqs)  # number of sequences 1..N
        seqs = self.seqs
        W = self.length  # motif width
        # background that will be used as pseudo-counts
        pseudocount = pseudocount or prob.Distrib(self.alphabet, 1.0)
        # q: the foreground distribution (specifying the W distributions in aligned columns)
        # p: the background distribution (for non-aligned positions in all sequences)
        q = [prob.Distrib(self.alphabet, pseudocount) for _ in range(W)]
        p = prob.Distrib(self.alphabet, pseudocount)
        a = self.alignment

        new_z = random.randint(0, N - 1)  # pick a random sequence to withhold
        for k in range(N):
            if k != new_z:
                offset = 0
                for i in range(len(seqs[k])):
                    if a[k] <= i < a[k] + W:  # within pattern
                        q[offset].observe(seqs[k][i])
                        offset += 1
                    else:  # outside pattern
                        p.observe(seqs[k][i])
        # Main loop: predictive update step THEN sampling step, repeat...
        niter = niter or 100 * N  # use specified number of iterations or default
        for iteration in range(niter):
            # Predictive update step:
            # One of the N sequences are chosen at random: z.
            # We will not use it in the profile, nor background so we
            # exclude it from our counts.
            prev_z = new_z
            new_z = random.randint(0, N - 1)
            # q's and p's are updated from current a's and all sequences except z,
            # which is the same as use old q's and p's and subtract z's contribs...
            offset = 0
            for i in range(len(seqs[new_z])):
                if a[new_z] <= i < a[new_z] + W:  # within pattern
                    q[offset].observe(seqs[new_z][i], -1)  # subtract the count
                    offset += 1
                else:  # outside pattern
                    p.observe(seqs[new_z][i], -1)  # subtract the count
            # ... and add back the previous and now updated z
            offset = 0
            for i in range(len(seqs[prev_z])):
                if a[prev_z] <= i < a[prev_z] + W:  # within pattern
                    q[offset].observe(seqs[prev_z][i], +1)  # add the count
                    offset += 1
                else:  # outside pattern
                    p.observe(seqs[prev_z][i], +1)  # add the count
            # Sampling step:
            # Consider each position x in z as a match: find a weight Ax
            z_len = len(seqs[new_z])  # length of seq z
            nstarts = z_len - W + 1  # number of starts for a W-wide pattern
            A = [0.0 for _ in range(z_len)]
            Asum = 0.0
            for x in range(nstarts):  # look at all starts for a W-wide pattern
                Px = 1.0
                Qx = 1.0
                for w in range(W):
                    Px *= p[seqs[new_z][x + w]]
                    Qx *= q[w][seqs[new_z][x + w]]
                try:
                    A[x] = Qx / Px
                except ZeroDivisionError:
                    pass  # weight stays 0.0 when the background product is zero
                Asum += A[x]
            # Pick the next a[z], with a probability proportional to Ax
            pick = random.random()  # any value between 0 and 1
            # If every weight is zero there is nothing to normalise or sample
            # from; a[z] then keeps its current value.
            if Asum > 0:
                for x in range(nstarts):  # score all starts for a W-wide pattern
                    A[x] /= Asum  # normalise so that all Ax's sum to 1.0
                cumul = 0.0  # cumulative probability
                for x in range(nstarts):  # check starts for a W-wide pattern
                    cumul += A[x]
                    if pick <= cumul:  # check if our random pick is smaller than the cumulative prob
                        a[new_z] = x
                        break
            # Evaluate data log-likelihood
            if iteration % 100 == 0:  # but only every 100th round
                LL = 0.0
                for k in range(N):
                    Pk = 1.0
                    Qk = 1.0
                    for w in range(W):
                        Pk *= p[seqs[k][a[k] + w]]
                        Qk *= q[w][seqs[k][a[k] + w]]
                    try:
                        LL += math.log(Qk / Pk)
                    except (ZeroDivisionError, ValueError):
                        # skip sequences whose ratio is undefined (Pk is zero)
                        # or has no logarithm (Qk is zero)
                        pass
                print("LL @ %5d=\t%5.2f" % (iteration, LL))

        # end main for-loop
        self.q = q
        self.p = p
        self.alignment = a
        return q