def run_infernal(cmfile, rnd, seqs, outfolder, cpus=1, score=0.0, calibrate=False):
    """Search one round's sequences with a covariance model and save the hits.

    Nothing is done when ``<outfolder>/R<rnd>hits.fna`` already exists, so a
    round that has been searched before is not searched again.

    Args:
        cmfile: Path to the covariance model file; it must exist.
        rnd: Round number; formatted as an integer into the output file name
            and the log line.
        seqs: Sequences handed to ``cmsearch_from_file``; also queried with
            ``getSeq`` to fetch the sequence of every hit.
        outfolder: Folder that receives the hits file and may hold ``log.txt``.
        cpus: Value passed to the search as ``--cpu`` and to ``calibrate_file``.
        score: Cutoff handed to ``cmsearch_from_file``.
        calibrate: When true, ``calibrate_file`` is run on ``cmfile`` first.

    Raises:
        IOError: If ``cmfile`` does not exist.
    """
    hits_path = "%s/R%ihits.fna" % (outfolder, rnd)

    # An existing hits file marks this round as already processed.
    if exists(hits_path):
        return
    if not exists(cmfile):
        raise IOError("cmfile path provided does not exist: %s" % cmfile)

    if calibrate:
        calibrate_file(cmfile, cpus=cpus)

    # '-g': True is left disabled, exactly as in the previous parameter set.
    search_params = {
        '--mid': True,
        '--Fmid': 0.0002,
        '--notrunc': True,
        '--toponly': True,
        '--cpu': cpus,
    }
    hits = cmsearch_from_file(cmfile, seqs, RNA, cutoff=score, params=search_params)

    # One FASTA record per hit: the header carries the values found at
    # positions 14 and 15 of the hit row, labelled "score" and "e-val".
    with open(hits_path, 'w') as fasta_out:
        for hit in hits:
            record = ">%s score:%0.1f e-val:%f\n%s\n" % (
                hit[0], hit[14], hit[15], seqs.getSeq(hit[0]))
            fasta_out.write(record)

    # The log is only appended to when it already exists; it is never created here.
    log_path = "%s/log.txt" % outfolder
    if exists(log_path):
        with open(log_path, 'a') as log_out:
            log_out.write("Round %i: %i hits\n" % (rnd, len(hits)))
def run_infernal(lock, cmfile, rnd, basefolder, outfolder, cpus=1, score=0.0, mpi=False):
    """Run cmsearch over one round's unique sequences and record the hits.

    The sequences are loaded from ``<basefolder>R<rnd>/R<rnd>-Unique-Remaining.fasta``
    when that file exists, otherwise from ``<basefolder>R<rnd>/R<rnd>-Unique.fasta``.
    Hits are written to ``<outfolder>/R<rnd>hits.txt`` and a one-line summary
    is appended to ``<outfolder>/log.txt`` while ``lock`` is held.

    Any exception is printed rather than raised (best-effort behaviour kept
    from the original implementation). The lock is released only if this call
    actually acquired it, so a failure before the log is written no longer
    triggers a second error from releasing an unheld lock.

    Args:
        lock: Object with ``acquire()``/``release()`` guarding ``log.txt``.
        cmfile: Covariance model file passed to ``cmsearch_from_file``.
        rnd: Round number used to build the input and output file names.
        basefolder: Folder prefix for the round data. It is concatenated
            directly with ``"R<rnd>/..."``, so it must end with a separator.
        outfolder: Folder that receives the hits file and holds ``log.txt``.
        cpus: Value passed to the search as ``--cpu``.
        score: Cutoff handed to ``cmsearch_from_file``.
        mpi: When true, an ``'mpi'`` entry is added to the search parameters.
    """
    try:
        # Only search unique sequences to save time.
        round_prefix = basefolder + "R" + str(rnd) + "/R" + str(rnd)
        remaining_path = round_prefix + "-Unique-Remaining.fasta"

        # If a previous run has removed some sequences, load the reduced file.
        if exists(remaining_path):
            seqs = LoadSeqs(remaining_path, moltype=RNA, aligned=False)
        else:
            seqs = LoadSeqs(round_prefix + "-Unique.fasta", moltype=RNA, aligned=False)

        # '-g': True is left disabled, as before.
        params = {
            '--mid': True,
            '--Fmid': 0.0002,
            '--notrunc': True,
            '--toponly': True,
            '--cpu': cpus,
        }
        if mpi:
            # NOTE(review): this key has no "--" prefix, unlike the others;
            # confirm it is what the cmsearch wrapper expects.
            params['mpi'] = True

        result = cmsearch_from_file(cmfile, seqs, RNA, cutoff=score, params=params)

        # Hit count line, CSV header, then one CSV row per hit.
        with open(outfolder + "/R" + str(rnd) + "hits.txt", 'w') as fout:
            fout.write(str(len(result)) + " hits\nheader,bitscore,e-value\n")
            for hit in result:
                fout.write(hit[0] + "," + str(hit[14]) + "," + str(hit[15]) + "\n")

        # Hold the lock only while appending to the shared log, and release it
        # in ``finally`` so it is freed exactly once, even if the write fails.
        lock.acquire()
        try:
            with open(outfolder + "/log.txt", 'a') as logout:
                logout.write("Round " + str(rnd) + ": " + str(len(result)) + " hits\n")
        finally:
            lock.release()
    except Exception as e:
        # Best effort: report the failure and let the caller carry on.
        print(str(e))
# Save the alignment in Stockholm format with the structure attached as the
# "SS_cons" annotation. The file is opened before anything else (as before)
# and is closed even if building or writing the Stockholm text fails.
with open(otufolder + "/locarnap-aln.sto", "w") as alnout:
    struct_dict = {"SS_cons": struct}
    alnout.write(stockholm_from_alignment(aln, GC_annotation=struct_dict))
# print(x) with a single argument prints the same text under the Python 2
# print statement and the Python 3 print function.
print(struct)

print("Creating CM and running Infernal over all rounds")
# create the cm file. Could call cmsearch_from_alignment but dont want to build
# cm file multiple times since is time consuming and processor intensive
cmfile = cmbuild_from_alignment(aln, struct, calibrate=True)
# run cmsearch over every round of SELEX, from round 7 down to round 1
for i in range(7, 0, -1):
    # Only search unique sequences to save time
    # NOTE(review): the data location is hard-coded to one user's desktop.
    seqs = LoadSeqs(
        "/Users/Ely/Desktop/Ely_selection/R" + str(i) + "/R" + str(i) + "-Unique.fasta",
        moltype=RNA,
        aligned=False,
    )
    # The cutoff for this round is log2(getNumSeqs() * len(seqs)).
    score = log2(seqs.getNumSeqs() * len(seqs))
    print("R" + str(i) + " (" + str(score) + "):")
    args = {"--toponly": True}
    result = cmsearch_from_file(cmfile.name, seqs, RNA, cutoff=score, params=args)
    print(str(len(result)) + " hits")
    # One CSV row per hit under the header written first; the file is closed
    # even if a write raises.
    with open(otufolder + "/R" + str(i) + "hits.txt", "w") as fout:
        fout.write("header,bitscore,e-value\n")
        for hit in result:
            fout.write(hit[0] + "," + str(hit[14]) + "," + str(hit[15]) + "\n")
# remove found sequences from the round files
# cm file multiple times since is time consuming and processor intensive cmfile = cmbuild_from_alignment(aln, struct, calibrate=True) for i in range(7, 0, -1): # run cmsearch over every round of SELEX # Only search unique sequences to save time seqs = 0 # check if previous run has removed some sequences, load correct file if exists(args.f + "R" + str(i) + "/R" + str(i) + "-Unique-Remaining.fasta"): print "Previous round run detected, runnning over remaining seqs" seqs = LoadSeqs( args.f + "R" + str(i) + "/R" + str(i) + "-Unique-Remaining.fasta", moltype=RNA, aligned=False ) else: seqs = LoadSeqs(args.f + "R" + str(i) + "/R" + str(i) + "-Unique.fasta", moltype=RNA, aligned=False) result = cmsearch_from_file( cmfile.name, seqs, RNA, cutoff=infernalscore, params={"--toponly": True, "--cpu": args.c} ) print "R" + str(i) + ": " + str(len(result)) + " hits" fout = open(currotufolder + "/R" + str(i) + "hits.txt", "w") fout.write("header,bitscore,e-value\n") for hit in result: fout.write(hit[0] + "," + str(hit[14]) + "," + str(hit[15]) + "\n") fout.close() # clean up by removing cm file remove(cmfile.name) # remove found sequences from the round files print "Runtime: " + str(time() - secs) + "m" currgroup += 1 # skip group if less than 100 sequences else: print "Group has less than 100 sequences, skipping infernal run"