def predict(self, inpseq, useCascade = True):
    """ Classify each symbol in a sequence.
        Return the predictions as a Sequence of symbols. """
    W = self.nn1.ninput // len(self.inp_alpha)
    if useCascade and self.cascade:
        nn1seq = self.predict(inpseq, useCascade = False)
        subseqs = slidewin(nn1seq, self.cascade)
        predsyms = ['C' for _ in range(len(inpseq))]  # use coil for positions in flanking regions
        for i in range(len(subseqs)):  # for each input sub-sequence of the secondary NN
            input = numpy.zeros(self.cascade * len(self.outp_alpha))
            input[_onehotIndex(self.outp_alpha, subseqs[i])] = 1
            outvec = self.nn2.feedforward(input)
            d = prob.Distrib(self.outp_alpha)
            for k in range(len(outvec)):
                d.observe(self.outp_alpha[k], outvec[k])
            predsyms[i + self.cascade // 2] = d.getmax()  # use the symbol with the highest probability
        return sequence.Sequence(predsyms, self.outp_alpha)
    else:  # only predict using the first NN
        subseqs = slidewin(inpseq, W)
        predsyms = ['C' for _ in range(len(inpseq))]  # use coil for positions in flanking regions
        for i in range(len(subseqs)):  # for each input sub-sequence of the primary NN
            input = numpy.zeros(self.inp_len * len(self.inp_alpha))
            input[_onehotIndex(self.inp_alpha, subseqs[i])] = 1
            outvec = self.nn1.feedforward(input)
            d = prob.Distrib(self.outp_alpha)
            for k in range(len(outvec)):
                d.observe(self.outp_alpha[k], outvec[k])
            predsyms[i + W // 2] = d.getmax()  # use the symbol with the highest probability
        return sequence.Sequence(predsyms, self.outp_alpha)
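# A minimal usage sketch (assumptions: the enclosing class has been trained
# elsewhere and instantiated as `ssnn`, and `myprot` is a sequence.Sequence
# over the protein alphabet; neither name is defined in this module):
#
#   pred = ssnn.predict(myprot, useCascade=True)
#   print(pred)  # e.g. a Sequence of secondary-structure symbols such as 'C'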
def saveConsensus(aln, theta2=0.01, countgaps=False, filename=None):
    """ Display a table with a row for each alignment column, showing symbols
        in order of decreasing probability.
        theta2 is the probability threshold for inclusion (0.01 means 1%;
        symbols with lower probability are ignored).
        countgaps, if True, includes gaps in the counts (default False).
        filename is the name of the file to save the output to (default stdout). """
    if filename is None:
        f = sys.stdout
    else:
        # assume filename is a string; clean it of unexpected characters first
        filename = ''.join(e for e in filename if e.isalnum() or e == '_' or e == '.')
        f = open(filename, 'w')
    for col in range(aln.alignlen):
        # collect probabilities for column, with or without gap
        myalpha = aln.alphabet
        if countgaps:
            alist = list(aln.alphabet)
            alist.append('-')
            myalpha = sequence.Alphabet(alist)
        d = prob.Distrib(myalpha)
        for seq in aln.seqs:
            if seq[col] in myalpha:
                d.observe(seq[col])
        symprobs = d.getProbsort()  # the symbols sorted by probability
        ninclusions = 0
        for (s, p) in symprobs:
            if p >= theta2:
                ninclusions += 1
            else:
                break
        if ninclusions > 1:
            f.write("%d:" % (col + 1))
            for (s, p) in symprobs:
                if p >= theta2:
                    f.write("%c%02d" % (s, int(p * 100)))
                    f.write(" ")
            f.write('\n')
    if f is not sys.stdout:
        f.close()  # only close files we opened; leave stdout alone
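# Usage sketch (assumptions: `aln` is a sequence.Alignment loaded elsewhere,
# and 'consensus.txt' is a hypothetical output filename):
#
#   saveConsensus(aln, theta2=0.05, countgaps=True, filename='consensus.txt')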
def scanMotifReport(seqs, motif, threshold=0, jaspar = 'JASPAR_matrices.txt'):
    """ Produce a plot for a scan of the specified motif.
        The x-axis is position in the sequence; the y-axis is the cumulative,
        non-negative PWM score over all sequences. """
    # check that all sequences are the same length and set sequence length
    seq_len = len(seqs[0])
    for seq in seqs:
        if len(seq) != seq_len:
            usage(sys.argv[0], "All sequences must have same length")
            return
    # create the motif and its reverse complement
    bg = prb.Distrib(sym.DNA_Alphabet, sequence.getCount(seqs))
    d = prb.readMultiCounts(jaspar)
    try:
        fg1 = d[motif]
        fg2 = getReverse(d[motif])
    except KeyError:
        usage(sys.argv[0], "Unknown motif %s" % motif)
        return
    print("Motif %s:" % motif)
    pwm1 = sequence.PWM(fg1, bg)
    pwm1.display(format='JASPAR')
    print("Motif %s (reverse complement):" % motif)
    pwm2 = sequence.PWM(fg2, bg)
    pwm2.display(format='JASPAR')
    # initialize things to zero
    avg_motif_score = np.zeros(seq_len)
    # compute average score at each position (on both strands) in sequences
    i_seq = 0
    motif_width = pwm1.length
    for seq in seqs:
        i_seq += 1
        # print("Scoring seq: %4d" % i_seq, file=sys.stderr)
        # positive strand
        hits = pwm1.search(seq, threshold)
        pos_scores = seq_len * [0]
        for hit in hits:
            # mark hit at *center* of site (hence motif_width // 2)
            pos_scores[hit[0] + motif_width // 2] = hit[2]
        # negative strand
        hits = pwm2.search(seq, threshold)
        neg_scores = seq_len * [0]
        for hit in hits:
            neg_scores[hit[0] + motif_width // 2] = hit[2]
        # use the maximum score on the two strands
        for i in range(seq_len):
            score = max(pos_scores[i], neg_scores[i])
            if score > threshold:
                avg_motif_score[i] += score
    # compute average score
    for i in range(seq_len):
        avg_motif_score[i] /= len(seqs)
    # hw = 5  # window width is 2*hw + 1
    # smoothed_avg_motif_score = np.zeros(seq_len)
    # for i in range(hw, seq_len - motif_width + 1 - hw):
    #     smoothed_avg_motif_score[i] = sum(avg_motif_score[i-hw:i+hw+1]) / (2*hw+1)
    # plot the average score curve
    x = list(range(-(seq_len // 2), seq_len - seq_len // 2))  # call center of sequence x=0; len(x) == seq_len
    lbl = "%s" % motif
    plt.plot(x, avg_motif_score, label=lbl)
    # plt.plot(x, smoothed_avg_motif_score, label=lbl)
    plt.axhline(color='black', linestyle='dotted')
    plt.legend(loc='lower center')
    plt.xlabel('position')
    plt.ylabel('average motif score')
    plt.title(motif)
    plt.show()
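# Usage sketch (assumptions: `seqs` is a list of equal-length DNA sequences
# read elsewhere, e.g. from a FASTA file; 'MA0003.1' is just an example
# JASPAR identifier and may not be present in your matrix file):
#
#   scanMotifReport(seqs, 'MA0003.1', threshold=0)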
def scanMotifReport_new(seqs, motif, threshold=3.4567, jaspar = 'JASPAR_matrices.txt', seed=0):
    """ Produce a plot for a scan of the specified motif.
        The x-axis is position in the sequence; the y-axis is the number of
        sequences with a best hit at position x. Sequences with no hit above
        'threshold' are ignored. Ties for best hit are broken randomly.
        The p-value of the region that is most "centrally enriched" and the
        width of the best central region are printed in the label of the plot. """
    # set the random seed for repeatability
    random.seed(seed)
    # Copy the code from your "improved" version of scanMotifReport()
    # to here, and follow the instructions in the Prac to develop this
    # new function.
    # check that all sequences are the same length and set sequence length
    seq_len = len(seqs[0])
    for seq in seqs:
        if len(seq) != seq_len:
            usage(sys.argv[0], "All sequences must have same length")
            return
    # create the motif and its reverse complement
    bg = prb.Distrib(sym.DNA_Alphabet, sequence.getCount(seqs))
    d = prb.readMultiCounts(jaspar)
    try:
        fg1 = d[motif]
        fg2 = getReverse(d[motif])
    except KeyError:
        usage(sys.argv[0], "Unknown motif %s" % motif)
        return
    print("Motif %s:" % motif)
    pwm1 = sequence.PWM(fg1, bg)
    pwm1.display(format='JASPAR')
    print("Motif %s (reverse complement):" % motif)
    pwm2 = sequence.PWM(fg2, bg)
    pwm2.display(format='JASPAR')
    # initialize things to zero
    hit_count = np.zeros(seq_len)
    n_seqs_with_hits = 0.0
    # Scan each sequence for all hits on both strands and record
    # the number of "best hits" at each sequence position.
    motif_width = pwm1.length
    i_seq = 0
    for seq in seqs:
        i_seq += 1
        # print("Scoring seq: %4d" % i_seq, file=sys.stderr)
        # scan with both motifs
        hits = pwm1.search(seq, threshold) + pwm2.search(seq, threshold)
        # record position of best hit
        if hits:
            n_seqs_with_hits += 1
            # find best hit score (the score is the third element of a hit tuple)
            best_score = max(hits, key=operator.itemgetter(2))[2]
            # find ties
            best_hits = [hit for hit in hits if hit[2] == best_score]
            # break ties at random
            best_hit = random.choice(best_hits)
            # mark hit at *center* of site (hence motif_width // 2)
            hit_count[best_hit[0] + motif_width // 2] += 1
    # divide the hit count at each position by the number of sequences with a hit
    site_probability = [(cnt / n_seqs_with_hits) for cnt in hit_count]
    print("Number of sequences with hit (score >= %f): %d" % (threshold, n_seqs_with_hits), file=sys.stderr)
    # STATISTICS
    # Get the cumulative hit counts in concentric windows
    # and perform the Binomial Test. Report best region and its p-value.
    best_r = 0
    best_log_pvalue = 1
    center = seq_len // 2  # center of sequence
    cum_hit_count = np.zeros(seq_len)  # total best hits in central window of half-width i
    for i in range(1, (seq_len - motif_width // 2 + 1) // 2):
        cum_hit_count[i] = cum_hit_count[i-1] + hit_count[center-i] + hit_count[center+i]
        # Compute probability of observed or more best hits in central window
        # assuming uniform probability distribution in each sequence.
        # successes = cum_hit_count[i]
        # trials = n_seqs_with_hits
        # p_success = ?
        # log_pvalue = ?
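        # One possible (commented-out) sketch of the step above, left hedged
        # since the Prac may intend a different formulation. It assumes scipy
        # is available and that, under a uniform null, a sequence's best hit
        # lands in the central window (2*i positions) with probability
        # 2*i / (seq_len - motif_width + 1):
        # from scipy.stats import binom
        # p_success = 2.0 * i / (seq_len - motif_width + 1)
        # log_pvalue = binom.logsf(successes - 1, trials, p_success)  # log P(X >= successes)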
        # if log_pvalue < best_log_pvalue:
        #     best_log_pvalue = log_pvalue
        #     best_r = 2*i
    # End STATISTICS
    hw = 5  # smoothing window half-width; full window is 2*hw + 1
    smoothed_site_probability = np.zeros(seq_len)
    for i in range(hw, seq_len - motif_width + 1 - hw):
        smoothed_site_probability[i] = sum(site_probability[i-hw:i+hw+1]) / (2*hw+1)
    x = list(range(-(seq_len // 2), seq_len - seq_len // 2))  # call center of sequence x=0; len(x) == seq_len
    lbl = "%s, t=%.2f" % (motif, threshold)
    # lbl = "%s, t=%.2f, w=%d, p=%.2e" % (motif, threshold, best_r, math.exp(best_log_pvalue))
    plt.plot(x, smoothed_site_probability, label=lbl)
    plt.axhline(color='black', linestyle='dotted')
    plt.legend(loc='lower center')
    plt.xlabel('Position of best site')
    plt.ylabel('Smoothed probability')
    plt.title(motif)
    plt.show()
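# Usage sketch (assumptions: `seqs` holds equal-length DNA sequences read
# elsewhere; the motif identifier and threshold below are illustrative only):
#
#   scanMotifReport_new(seqs, 'MA0112.2', threshold=3.4567, seed=0)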
def discover(self, pseudocount=None, niter=None):
    """ Find the most probable common pattern represented by a position
        weight matrix (PWM), based on W+1 distributions.
        pseudocount: the distribution used for pseudo-counts (default is uniform)
        niter: number of iterations (if None, 100*N is used, where N is the
        number of sequences). """
    """ Initialise parameters necessary for the discovery run (below) """
    N = len(self.seqs)  # number of sequences 1..N
    seqs = self.seqs
    W = self.length     # motif width
    """ background that will be used as pseudo-counts """
    pseudocount = pseudocount or prob.Distrib(self.alphabet, 1.0)
    """ q: the foreground distribution (specifying the W distributions in aligned columns)
        p: the background distribution (for non-aligned positions in all sequences) """
    q = [prob.Distrib(self.alphabet, pseudocount) for _ in range(W)]
    p = prob.Distrib(self.alphabet, pseudocount)
    a = self.alignment
    new_z = random.randint(0, N - 1)  # pick a random sequence to withhold
    for k in range(N):
        if k != new_z:
            k_len = len(seqs[k])  # length of current seq
            offset = 0
            for i in range(k_len):
                if i >= a[k] and i < a[k] + W:  # within pattern
                    q[offset].observe(seqs[k][i])
                    offset += 1
                else:  # outside pattern
                    p.observe(seqs[k][i])
    """ Main loop: predictive update step THEN sampling step, repeat... """
    niter = niter or 100 * N  # use specified number of iterations or default
    for round in range(niter):
        """ Predictive update step:
            One of the N sequences is chosen at random: z. We will not use it
            in the profile, nor in the background, so we exclude it from our counts. """
        prev_z = new_z
        new_z = random.randint(0, N - 1)
        # q's and p's are updated from current a's and all sequences except z,
        # which is the same as using the old q's and p's and subtracting z's contribs...
        offset = 0
        for i in range(len(seqs[new_z])):
            if i >= a[new_z] and i < a[new_z] + W:  # within pattern
                q[offset].observe(seqs[new_z][i], -1)  # subtract the count
                offset += 1
            else:  # outside pattern
                p.observe(seqs[new_z][i], -1)  # subtract the count
        # ... and adding back the previously withheld (and now updated) z
        offset = 0
        for i in range(len(seqs[prev_z])):
            if i >= a[prev_z] and i < a[prev_z] + W:  # within pattern
                q[offset].observe(seqs[prev_z][i], +1)  # add the count
                offset += 1
            else:  # outside pattern
                p.observe(seqs[prev_z][i], +1)  # add the count
        """ Sampling step:
            Consider each position x in z as a match: find a weight Ax """
        z_len = len(seqs[new_z])  # length of seq z
        A = [0.0 for _ in range(z_len)]
        Asum = 0.0
        for x in range(z_len - W + 1):  # look at all starts for a W-wide pattern
            Px = 1.0
            Qx = 1.0
            for w in range(W):
                Px *= p[seqs[new_z][x + w]]
                Qx *= q[w][seqs[new_z][x + w]]
            try:
                A[x] = Qx / Px
            except ZeroDivisionError:
                pass
            Asum += A[x]
        for x in range(z_len - W + 1):  # score all starts for a W-wide pattern
            A[x] /= Asum  # normalise so that all Ax's sum to 1.0
        # Pick the next a[z], with a probability proportional to Ax
        pick = random.random()  # any value between 0 and 1
        cumul = 0.0  # cumulative probability
        for x in range(z_len - W + 1):  # check starts for a W-wide pattern
            cumul += A[x]
            if pick <= cumul:  # check if our random pick falls below the cumulative prob
                a[new_z] = x
                break
        """ Evaluate data log-likelihood """
        if round % 100 == 0:  # but only every 100th round
            LL = 0.0
            for k in range(N):
                Pk = 1.0
                Qk = 1.0
                for w in range(W):
                    Pk *= p[seqs[k][a[k] + w]]
                    Qk *= q[w][seqs[k][a[k] + w]]
                try:
                    LL += math.log(Qk / Pk)
                except ZeroDivisionError:
                    pass
            print("LL @ %5d=\t%5.2f" % (round, LL))
    # end main for-loop
    self.q = q
    self.p = p
    self.alignment = a
    return q
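# Usage sketch (assumptions: the enclosing class is this module's Gibbs
# motif-discovery class, here guessed to be constructed from a list of
# sequences and a motif width W; the class and variable names are illustrative):
#
#   gibbs = GibbsMotif(seqs, 8)       # hypothetical constructor call, W = 8
#   q = gibbs.discover(niter=10000)   # returns the W column distributions
#   print(gibbs.alignment)            # best motif start found in each sequence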