def _main(args):
    """Cut fixed-width windows around peaks listed in a whitespace-separated table.

    Usage: ``xls_get_region_from_fasta.py <fasta> <xls> <window>``

    For every table row the peak position is column 2 plus column 5
    (presumably region start + summit offset -- TODO confirm against the
    table format).  ``window`` bases on either side of the peak are sliced
    from the sequence named in column 1.  The last column decides what
    happens to the slice: ``k`` keeps it, ``r`` reverse-complements it, and
    any other value discards the row.  (That decision used to be prompted
    interactively; it is now read from the table.)

    Kept windows are written as FASTA records (``>name:start..end``) to a
    file whose name is read from standard input.

    Args:
        args: ``[fasta_path, table_path, window]``, all strings.
    """
    if len(args) != 3:
        print("usage: xls_get_region_from_fasta.py <fasta> <xls> <window>")
        sys.exit(1)

    fasta_path, table_path = args[0], args[1]
    window = int(args[2])

    fasta = fasta_subseq_2.FastaDB()
    fasta.openFastaFile(fasta_path)

    records = []
    with open(table_path) as table:
        for row in table:
            # split() already drops the line terminator.  Slicing off the last
            # character first would eat the k/r flag on a final line that has
            # no trailing newline.
            fields = row.split()
            if not fields:
                continue  # tolerate blank lines
            print(fields)

            peak = int(fields[1]) + int(fields[4])
            # A negative start would be read by the slice as "count from the
            # end of the sequence", so clamp it to the first base.
            start = max(peak - window, 0)
            end = peak + window
            seq = fasta[fields[0]]['sequence'][start:end]

            action = fields[-1]
            if action == 'r':
                seq = fasta_subseq_2.revcomp(seq)
            elif action != 'k':
                continue
            records.append(">%s:%d..%d\n%s" % (fields[0], start, end, seq))

    outfile = raw_input("name of output file: ")
    with open(outfile, "w") as outfh:
        for record in records:
            outfh.write(record + "\n")
def get_enrich(xls_regs, sgr, winsize, fasta):
    """Annotate motif hits with signal values read from an SGR-style file.

    For every hit in each region's ``'mtx1_hits'`` and ``'mtx2_hits'`` lists
    an info dict is built and stored under ``'mtx1_hits_info'`` /
    ``'mtx2_hits_info'``.  The signal file is then scanned once and, per hit,
    the nearest data point on the same chromosome and all values within the
    window are recorded.  Finally window mean/median and the two enrichment
    ratios are computed.  Progress dicts are echoed to standard error.

    Args:
        xls_regs: list of region dicts; each must hold ``'mtx1_hits'`` and
            ``'mtx2_hits'`` lists of hit dicts with ``'chr'``, ``'start'``,
            ``'end'`` and ``'strand'`` keys.
        sgr: path to a whitespace-separated file with three columns per line:
            chromosome, integer position, integer value.
        winsize: window width; a value is collected when its position is
            strictly closer than ``winsize / 2`` to the hit location.
        fasta: path handed to ``FastaDB.openFastaFile``.

    Returns:
        ``xls_regs`` itself, mutated in place.  Each info dict carries the
        keys ``hit_obj``, ``loc``, ``seq``, ``nearest`` (``(position, value)``,
        ``(0, 0)`` when the chromosome had no data), ``vals``, ``win_mean``,
        ``win_median``, ``enrich_md`` and ``enrich_mn``.  Hits with an empty
        window get NumPy's NaN statistics.
    """
    fasta_db = fasta_subseq_2.FastaDB()
    fasta_db.openFastaFile(fasta)

    for reg in xls_regs:
        for key in ('mtx1_hits', 'mtx2_hits'):
            hit_info = []
            for h in reg[key]:
                chrom_seq = fasta_db[h['chr']]['sequence']
                width = abs(h['start'] - h['end'])
                if h['strand'] == "+":
                    loc = h['start']
                    seq = chrom_seq[loc:loc + width]
                else:
                    # Original author's warning: "CHANGE IF FIX HIT DATABASE".
                    # Minus-strand hits are sliced starting at 'end'.
                    loc = h['end']
                    seq = fasta_subseq_2.revcomp(chrom_seq[loc:loc + width])
                hit_info.append({
                    'hit_obj': h,
                    'loc': loc,
                    'nearest': (0, 0),
                    'vals': [],
                    'seq': seq,
                })
            reg[key + '_info'] = hit_info

    info_keys = ('mtx1_hits_info', 'mtx2_hits_info')

    # Index the info dicts by chromosome so each signal line only visits the
    # hits it can affect; the per-chromosome order matches a full scan.
    hits_by_chr = {}
    for reg in xls_regs:
        for key in info_keys:
            for info in reg[key]:
                hits_by_chr.setdefault(info['hit_obj']['chr'], []).append(info)

    half_window = winsize / 2
    # Distance of each hit's current nearest data point, keyed by id(info).
    # Tracking it separately keeps the (0, 0) placeholder from being mistaken
    # for a real data point at position 0.
    nearest_dist = {}
    with open(sgr) as sgr_in:
        for line in sgr_in:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines
            (chrom, loc, val) = fields
            loc = int(loc)
            val = int(val)
            for info in hits_by_chr.get(chrom, ()):
                dist = abs(loc - info['loc'])
                # Compare against how far the current best point is from the
                # hit.  The old test measured the gap between this point and
                # the previous best point, which does not find the nearest.
                best = nearest_dist.get(id(info))
                if best is None or dist < best:
                    nearest_dist[id(info)] = dist
                    info['nearest'] = (loc, val)
                if dist < half_window:
                    info['vals'].append(val)
                    sys.stderr.write(str(info) + "\n")

    for reg in xls_regs:
        for key in info_keys:
            for info in reg[key]:
                info['win_mean'] = np.mean(info['vals'])
                info['win_median'] = np.median(info['vals'])
                info['enrich_md'] = info['nearest'][1] / info['win_median']
                info['enrich_mn'] = info['nearest'][1] / info['win_mean']
                sys.stderr.write(str(info) + "\n")
    return xls_regs
def _main(args):
    """Print the reverse complement of every record in a FASTA file.

    Usage: ``revcomseq.py <fasta>``

    Each record is written to standard output as its header line with
    ``_rc`` appended, followed by the reverse-complemented sequence on a
    single line.  A header that is followed by no sequence is skipped unless
    it is the last record in the file.

    Args:
        args: ``[fasta_path]``.
    """
    if len(args) < 1:
        print("usage: revcomseq.py <fasta>")
        sys.exit(0)

    header = ""
    parts = []
    with open(args[0]) as fasta_in:
        for line in fasta_in:
            line = line.rstrip("\r\n")
            if line.startswith(">"):
                # Emit the record collected so far BEFORE the header and the
                # buffer are overwritten.  Resetting first made this flush
                # unreachable, so only the last record was ever printed.
                if parts:
                    print("%s\n%s" % (header, revcomp("".join(parts))))
                header = line + "_rc"
                parts = []
            elif line:
                parts.append(line)

    # Final record; print nothing at all for an empty file.
    if header or parts:
        print("%s\n%s" % (header, revcomp("".join(parts))))
def _main(args):
    """Print a window of sequence centred on the best motif hit of each region.

    Usage: ``xls_motif_window.py <xls> <fasta> <matrix_file> <window>``

    Regions are read from a whitespace-separated table (column 1 chromosome,
    columns 2/3 start/end, column 8 enrichment).  Each region's sequence is
    scanned with ``patser_tools.makePatserAnnotation`` and the feature
    returned by ``getMaxFeature("score")`` is used as the window centre.
    Regions without any feature are skipped; regions whose scan raises
    ``IOError`` are reported on standard error and skipped.

    Output is FASTA on standard output with the header
    ``>chr:start..end:strand enr=<enrichment> mtx=<score>``.  Minus-strand
    windows are reverse-complemented.

    Args:
        args: ``[xls_path, fasta_path, matrix_path, window]``, all strings.
    """
    if len(args) < 4:
        sys.stderr.write(
            "usage: xls_motif_window.py <xls> <fasta> <matrix_file> <window>\n")
        sys.exit(1)

    xls_path, fasta_path, matrix_path = args[0], args[1], args[2]
    # Parsed once up front, so a bad window fails before any work is done.
    half_window = int(args[3]) // 2

    fasta = fasta_subseq_2.FastaDB()
    fasta.openFastaFile(fasta_path)

    xls_regions = []
    with open(xls_path) as xls_in:
        for line in xls_in:
            # split() discards the newline itself.  Trimming one character
            # first would corrupt the last field of an unterminated last line.
            spl = line.split()
            if not spl:
                continue  # tolerate blank lines
            region = {
                'chr': spl[0],
                'start': int(spl[1]),
                'end': int(spl[2]),
                'enrich': spl[7],
            }
            region['seq'] = fasta[region['chr']]['sequence'][
                region['start']:region['end']]
            xls_regions.append(region)

    for r in xls_regions:
        try:
            annot = patser_tools.makePatserAnnotation(sequence=r['seq'],
                                                      matrix=matrix_path)
        except IOError:
            sys.stderr.write("Error in seq %s:%d..%d:\n" %
                             (r['chr'], r['start'], r['end']))
            continue
        if len(annot.getAllFeatures()) < 1:
            continue

        maxhit = annot.getMaxFeature("score")
        strand = maxhit.tags["strand"]
        if strand == '+':
            centre = maxhit.start
        else:
            # NOTE(review): the fixed offset of 3 is carried over unchanged;
            # its rationale is not visible here -- confirm.
            centre = maxhit.end - 3

        # Feature coordinates are added to the region start, so they are
        # presumably relative to the region sequence.  Clamp at 0 because a
        # negative slice start would count from the end of the chromosome.
        winstart = max(r['start'] + (centre - half_window), 0)
        winend = r['start'] + (centre + half_window)
        win_seq = fasta[r['chr']]['sequence'][winstart:winend]
        if strand != '+':
            win_seq = fasta_subseq_2.revcomp(win_seq)

        print(">%s:%d..%d:%s enr=%s mtx=%s" %
              (r['chr'], winstart, winend, strand, r['enrich'],
               maxhit.tags['score']))
        print(win_seq)
def run_aln_mapline(map_line):
    """Write one FASTA file per species for a map line and align them with Pecan.

    For every ``(species, fasta)`` pair in ``map_line['fastas']`` the interval
    described by ``map_line['map_dict'][species]`` is sliced out of ``fasta``
    (reverse-complemented for ``'-'`` strand) and written to
    ``<map_idx>_<species>.fa`` in the current directory.  Pecan is then run
    via ``java`` on those files with the tree ``map_line['tree']`` and writes
    ``<map_idx>.mfa``.  The per-species FASTA files are deleted once the
    alignment has succeeded; they are left in place if Pecan fails.

    Args:
        map_line: dict with the keys ``'fastas'``, ``'map_dict'``,
            ``'map_idx'`` (integer) and ``'tree'`` (string).  Each
            ``map_dict`` entry needs ``'chr'``, ``'start'``, ``'end'`` and
            ``'strand'``.

    Raises:
        Whatever ``sub.check_call`` raises when the command fails (``sub`` is
        presumably the ``subprocess`` module -- confirm at the import site).
    """
    map_idx = map_line['map_idx']
    files = []
    for (sp, fasta) in map_line['fastas']:
        sp_map = map_line['map_dict'][sp]
        outname = str(map_idx) + "_" + sp + ".fa"

        sys.stderr.write(str(sp_map) + "\n")
        sys.stderr.write(str(sp) + "\n")

        # NOTE(review): the slice stops at end - 1, carried over unchanged;
        # confirm the coordinate convention of the map file.
        seq = fasta[sp_map['chr']][sp_map['start']:sp_map['end'] - 1]
        if sp_map['strand'] == "-":
            print(sp + " revcomp")
            seq = fasta_subseq_2.revcomp(seq)

        # The context manager closes the handle even if a write fails.
        with open(outname, "w") as out:
            out.write(">%s %s-%d:%d\n" % (sp, sp_map['chr'], sp_map['start'],
                                          sp_map['end']))
            out.write(str(seq) + "\n")
        print("Wrote fasta %s" % (outname, ))
        files.append(outname)

    call_pecan = [
        "java", "-classpath", PECANPATH, "-Xmx2000m", "bp.pecan.Pecan", "-E",
        map_line['tree'] + ";", "-F"
    ]
    call_pecan.extend(files)
    call_pecan.extend(["-G", str(map_idx) + ".mfa"])

    print("Running pecan with command: %s" % (" ".join(call_pecan)))
    print("Starting alignment %d..." % (map_idx, ))
    sub.check_call(call_pecan)
    print("Alignment %d finished" % (map_idx, ))

    # Explicit loop: map() used only for its side effects does nothing when
    # map is lazy, which would leave the temporary files behind.
    for path in files:
        os.remove(path)
def _main(args):
    """Scan every BED interval with a motif matrix and list all hits.

    Usage: ``<bed_file> <seq_file> <matrix>``

    Each BED line supplies a sequence name (column 1), start/end (columns 2
    and 3) and strand (column 6).  The interval is sliced from the FASTA
    database, reverse-complemented for ``-`` strand and scanned with
    ``patser_tools.makePatserAnnotation``.  Intervals whose scan fails are
    reported on standard error and skipped.

    One tab-separated line per feature is printed to standard output:
    ``<name>_<start>_<end>``, feature start, feature end, score, p-value,
    strand.

    Args:
        args: ``[bed_path, fasta_path, matrix_path]``.
    """
    if len(args) != 3:
        print("usage: <bed_file> <seq_file> <matrix>")
        sys.exit(0)

    bed_path, fasta_path, matrix_path = args[0], args[1], args[2]

    fasta = fasta_subseq_2.FastaDB()
    fasta.openFastaFile(fasta_path)

    bed_annots = []
    with open(bed_path) as bed_in:
        for line in bed_in:
            # split() removes the newline itself.  Cutting one character off
            # first would erase a trailing "-" strand on an unterminated last
            # line and flip that interval to the plus strand.
            spl = line.split()
            if not spl:
                continue  # tolerate blank lines
            fseq = fasta[spl[0]]["sequence"][int(spl[1]):int(spl[2])]
            if spl[5] == "-":
                fseq = fasta_subseq_2.revcomp(fseq)
            try:
                patannot = patser_tools.makePatserAnnotation(
                    sequence=fseq, matrix=matrix_path)
            except Exception as err:
                # Still best-effort (skip the interval), but say so, and let
                # KeyboardInterrupt / SystemExit propagate.
                sys.stderr.write("Skipping %s:%s..%s: %s\n" %
                                 (spl[0], spl[1], spl[2], err))
                continue
            bed_annots.append({
                "seq": spl[0] + "_" + spl[1] + "_" + spl[2],
                "annotation": patannot
            })

    for ann in bed_annots:
        for feat in ann["annotation"].getAllFeatures():
            # NOTE(review): this reads feat.st / feat.en / feat.tagset, while
            # other callers of patser_tools read .start / .end / .tags --
            # confirm which feature API is current.
            print("%s\t%i\t%i\t%f\t%f\t%s" %
                  (ann["seq"], feat.st, feat.en, feat.tagset["score"],
                   feat.tagset["pval"], feat.tagset["strand"]))
def _main(args):
    """Score FASTA sequences from stdin against one or more motif matrices.

    Usage: ``patser_list.py [mtx1][mtx2] ... < seqs.fa``

    For every sequence and every matrix the highest-scoring feature returned
    by ``patser_tools.makePatserAnnotation`` (score cutoff -100) is kept; a
    missing hit is reported on standard error.  A tab-separated table is
    printed to standard output: a header row, then one row per sequence with
    the name, the sequence, and for each motif its score, p-value (``-`` if
    the feature has none) and the better of the forward / reverse-complement
    PSSM scores from ``scoreSeq``.

    Every row has exactly the header's columns, in the header's order.  A
    motif with no hit for a sequence is shown as three ``-`` cells.  Earlier
    output followed dictionary key order, dropped missing motifs and mixed
    spaces with tabs, so values could land under the wrong heading.

    Args:
        args: list of matrix file paths (at least one).
    """
    if len(args) < 1:
        print("usage: patser_list.py [mtx1][mtx2] ... < seqs.fa")
        sys.exit(1)

    matrices = args
    # presumably a mapping from motif name to PSSM -- it is indexed below by
    # the 'motif_name' tag of each hit; verify against convertFreqMtx.
    pssms = convertFreqMtx(matrices)

    # --- read FASTA records from stdin -----------------------------------
    hits = {}
    name = ""
    seq_parts = []
    for line in sys.stdin:
        nameres = re.search(r">(\S+)", line)
        if nameres:
            if name != "":
                hits[name] = {'seq': "".join(seq_parts)}
            name = nameres.group(1)
            seq_parts = []
        else:
            # Strip only the line terminator: chopping one character blindly
            # loses the last base when the input lacks a final newline.
            seq_parts.append(line.rstrip("\r\n"))
    if name != "":
        hits[name] = {'seq': "".join(seq_parts)}

    # --- best hit per (sequence, matrix) ---------------------------------
    mtx_names = []
    for (name, d) in hits.items():
        for mtx in matrices:
            hit_annot = patser_tools.makePatserAnnotation(sequence=d['seq'],
                                                          matrix=mtx,
                                                          seqname=name,
                                                          scorecut=-100)
            features = hit_annot.getAllFeatures()
            if len(features) < 1:
                sys.stderr.write(
                    "Sequence %s: No hit for matrix %s in %s\n" %
                    (name, mtx, d['seq']))
                continue
            # First feature wins ties, as in a manual strict-greater scan.
            best = features[0]
            for feat in features:
                if feat.tags['score'] > best.tags['score']:
                    best = feat
            motif = best.tags['motif_name']
            d[motif] = best
            if motif not in mtx_names:
                mtx_names.append(motif)

    # --- report ----------------------------------------------------------
    header = ["name", "sequence"]
    for motif in mtx_names:
        header.extend([
            "%s_score" % motif,
            "%s_pval" % motif,
            "%s_PSSM_score" % motif,
        ])
    print("\t".join(header))

    for (name, h) in hits.items():
        row = [name, h['seq']]
        for motif in mtx_names:
            if motif not in h:
                # Keep the columns aligned when this sequence had no hit.
                row.extend(["-", "-", "-"])
                continue
            tags = h[motif].tags
            row.append(str(tags['score']))
            if 'pval' in tags.keys():
                row.append(str(tags['pval']))
            else:
                row.append("-")
            forward = scoreSeq(pssms[motif], h['seq'])
            reverse = scoreSeq(pssms[motif],
                               fasta_subseq_2.revcomp(h['seq']))
            if reverse > forward:
                row.append(str(reverse))
            else:
                row.append(str(forward))
        print("\t".join(row))