def fold_groups(seqs, struct, hold, numkept=None):
    '''Worker function for multithreading.
    Refolds the sequences of one group and files the group under the
    resulting structure.

    seqs: sequences handed straight to bayesfold
    struct: structure the group is currently keyed by; used as the fallback
        key when bayesfold hands back an empty structure
    hold: mapping that receives the result (mutated in place)
    numkept: accepted for interface compatibility, not used here

    Nothing is returned. Any exception is printed to stdout and swallowed.
    '''
    try:
        _, refolded = bayesfold(seqs)
        # an empty fold result means "keep the structure we already had"
        key = struct if refolded == "" else refolded
        # structgroups is not defined in this function; presumably a
        # module-level mapping of structure -> group (verify against caller)
        hold[key] = structgroups[struct]
    except Exception as err:
        print(str(err))
        stdout.flush()
def fold_clusters(lock, cluster, seqs, otufolder):
    '''Function for multithreading.
    Computes structure for a cluster and writes it to file

    lock: lock object (acquire/release) guarding the shared output file
    cluster: cluster name, written into the fasta header
    seqs: sequences handed to bayesfold
    otufolder: path prefix (with trailing separator) of the folder holding
        cluster_structs.fasta

    Exceptions from folding or from the file write propagate to the caller;
    the lock is always released and the file always closed.
    '''
    aln, struct = bayesfold(seqs, params={"-diags": True})
    gshape = get_shape(struct)
    # build the record before taking the lock so the lock is held only for I/O
    record = ">" + cluster + " " + gshape + "\n" + struct + "\n"
    # write structure out to file
    lock.acquire()
    try:
        with open(otufolder + "cluster_structs.fasta", 'a') as cfo:
            cfo.write(record)
    finally:
        # release even if the open/write fails, otherwise every other
        # worker waiting on this lock would block forever
        lock.release()
def fold_clusters(lock, cluster, seqs, otufile):
    '''Function for multithreading.
    Computes structure for a cluster and writes it to file

    lock: lock object (acquire/release) guarding the shared output file
    cluster: cluster name, written as the fasta header
    seqs: sequences handed to bayesfold
    otufile: path of the file the structure and alignment are appended to

    Exceptions from bayesfold propagate. The write itself is best-effort:
    a failure there is reported on stderr and does not raise.
    '''
    import sys

    aln, struct = bayesfold(seqs, params={"-diags": True})
    # write structure out to file
    # acquire outside the try so release() only runs for a lock we hold
    lock.acquire()
    try:
        with open(otufile, 'a') as cfo:
            cfo.write(">%s\n%s\n" % (cluster, struct))
            cfo.write(aln.toFasta() + "\n")
    except Exception as e:
        # keep the worker alive, but do not hide the failure completely
        sys.stderr.write("fold_clusters: could not write cluster %s to %s: %s\n"
                         % (cluster, otufile, e))
        sys.stderr.flush()
    finally:
        lock.release()
def run_fold_for_infernal(currgroup, groupfasta, basefolder, minseqs=1):
    '''Function for multithreading.
    Creates the final BayesFold alignment and writes to files, then r2r struct

    currgroup: group identifier; "group_<currgroup>" names the output folder
    groupfasta: path to the fasta file holding the group's sequences
    basefolder: path prefix (with trailing separator) for the output folder
    minseqs: minimum summed sequence count required to fold the group

    Returns "" when the group is skipped (output folder already exists or
    too few sequences) and None otherwise. Any exception is printed to
    stdout instead of being raised.
    '''
    try:
        # skip if already run and program just crashed or whatever
        currotufolder = basefolder + "group_" + str(currgroup)
        if exists(currotufolder):
            return ""
        seqs = []
        count = 0
        out = "group " + str(currgroup) + ": "
        # context manager so the input handle is closed, not leaked
        with open(groupfasta, 'rU') as fastain:
            for header, seq in MinimalFastaParser(fastain):
                # second "_"-separated header field is parsed as the
                # number of sequences this record stands for
                seqcount = header.split("_")[1]
                seqs.append((header.split()[0] + "_" + seqcount, seq))
                count += int(seqcount)
        out += "\n" + str(count) + " sequences\n"
        # make sure group has enough sequences before continuing
        if count < minseqs:
            return ""
        stdout.flush()
        # hard limit of 500 sequences to align and fold for memory reasons
        seqs = seqs[:500]
        # run BayesFold on sequences in the group
        # NOTE(review): an older comment mentioned "maxiters set to 5", but
        # only -diags is passed here - confirm which is intended
        aln, struct = bayesfold(seqs, params={"-diags": True})
        # create output folder for group
        mkdir(currotufolder)
        out += str(aln.getNumSeqs()) + " unique sequences\n"
        out += "Structure: " + struct + "\n"
        # write out log, then alignment and structure in fasta and
        # stockholm formats
        with open(currotufolder + "/log.txt", 'w') as logout:
            logout.write(out)
        with open(currotufolder + "/bayesfold-aln.fasta", 'w') as alnout:
            alnout.write(aln.toFasta() + "\n>SS_struct\n" + struct + "\n")
        struct_dict = {'SS_cons': struct}
        with open(currotufolder + "/bayesfold-aln.sto", 'w') as alnout:
            alnout.write(stockholm_from_alignment(aln,
                                                  GC_annotation=struct_dict))
        # make R2R secondary structure for alignment
        make_r2r(currotufolder + "/bayesfold-aln.sto", currotufolder,
                 "group_" + str(currgroup))
    except Exception as e:
        print(str(e))
        stdout.flush()
def fold_clusters(lock, cluster, seqs, otufolder):
    '''Function for multithreading.
    Computes structure for a cluster and writes it to file

    lock: lock object (acquire/release) guarding the shared output file
    cluster: cluster name, written as the fasta header
    seqs: sequences handed to bayesfold
    otufolder: path prefix (with trailing separator) of the folder holding
        cluster_structs.fasta

    Nothing is returned. Any exception is reported on stdout and swallowed.
    '''
    try:
        aln, struct = bayesfold(seqs)
        # write structure out to file
        lock.acquire()
        try:
            with open(otufolder + "cluster_structs.fasta", 'a') as cfo:
                cfo.write(">" + cluster + "\n" + struct + "\n")
        finally:
            # released exactly once, and only if it was actually acquired
            lock.release()
    except Exception as e:
        # report the failing cluster; struct may not exist yet at this
        # point (folding itself can be what failed), so it is not printed
        print(str(cluster) + "\nERROR! " + str(e))
        stdout.flush()
def run_fold_for_infernal(currgroup, groupfasta, basefolder, minseqs=1):
    '''Function for multithreading.
    Creates the final BayesFold alignment and writes to files, then r2r struct

    currgroup: group identifier; "group_<currgroup>" names the output folder
    groupfasta: path to the fasta file holding the group's sequences
    basefolder: path prefix (with trailing separator) for the output folder
    minseqs: minimum summed sequence count required to fold the group

    Returns "" when the group is skipped (output folder already exists or
    too few sequences) and None otherwise. Any exception is printed to
    stdout instead of being raised.
    '''
    try:
        # skip if already run and program just crashed or whatever
        currotufolder = basefolder + "group_" + str(currgroup)
        if exists(currotufolder):
            return ""
        seqs = []
        count = 0
        out = "group " + str(currgroup) + ": "
        # context manager so the input handle is closed, not leaked
        with open(groupfasta, 'rU') as fastain:
            for header, seq in MinimalFastaParser(fastain):
                # second "_"-separated header field is parsed as the
                # number of sequences this record stands for
                seqcount = header.split("_")[1]
                seqs.append((header.split()[0] + "_" + seqcount, seq))
                count += int(seqcount)
        out += "\n" + str(count) + " sequences\n"
        # make sure group has enough sequences before continuing
        if count < minseqs:
            return ""
        # run BayesFold to get the alignment and consensus structure
        # NOTE(review): an older comment said "at most 50 most abundant
        # sequences", but every parsed sequence is passed - confirm intent
        aln, struct = bayesfold(seqs)
        # create output folder for group
        mkdir(currotufolder)
        out += str(aln.getNumSeqs()) + " unique sequences\n"
        out += "Structure: " + struct + "\n"
        # write out log, then alignment and structure in fasta and
        # stockholm formats
        with open(currotufolder + "/log.txt", 'w') as logout:
            logout.write(out)
        with open(currotufolder + "/bayesfold-aln.fasta", 'w') as alnout:
            alnout.write(aln.toFasta() + "\n>SS_struct\n" + struct + "\n")
        struct_dict = {'SS_cons': struct}
        with open(currotufolder + "/bayesfold-aln.sto", 'w') as alnout:
            alnout.write(stockholm_from_alignment(aln,
                                                  GC_annotation=struct_dict))
        # make R2R secondary structure for alignment
        make_r2r(currotufolder + "/bayesfold-aln.sto", currotufolder,
                 "group_" + str(currgroup))
    except Exception as e:
        print(str(e))
        stdout.flush()
def fold_clusters(lock, cluster, seqs, otufolder):
    '''Function for multithreading.
    Computes structure for a cluster and writes it to file

    lock: lock object (acquire/release) guarding the shared output file
    cluster: cluster name, written as the fasta header
    seqs: sequences handed to bayesfold
    otufolder: path prefix (with trailing separator) of the folder holding
        cluster_structs.fasta

    Nothing is returned. Any exception is reported on stdout and swallowed.
    '''
    # assuming that the fasta has 10 or more sequences in it. Safe assumption
    # if this is a significant cluster
    # only using 10 because this is initial structure calc so needs to be fast
    # and this gives a very good approximation for the initial rnaforester
    # grouping
    # NOTE(review): nothing here trims seqs to 10; presumably the caller
    # does - confirm
    try:
        aln, struct = bayesfold(seqs)
        # write structure out to file
        lock.acquire()
        try:
            with open(otufolder + "cluster_structs.fasta", 'a') as cfo:
                cfo.write(">" + cluster + "\n" + struct + "\n")
        finally:
            # released exactly once, and only if it was actually acquired
            lock.release()
    except Exception as e:
        # report the failing cluster; struct may not exist yet at this
        # point (folding itself can be what failed), so it is not printed
        print(str(cluster) + "\nERROR! " + str(e))
        stdout.flush()
def create_final_output(groupfasta, basefolder, minseqs=1, cpus=1):
    '''Function for multithreading.
    Creates the final BayesFold alignment and writes to files, then r2r struct
    and infernal CM file

    groupfasta: path to the group's aligned fasta; the file name up to its
        first "." is used as the group name
    basefolder: path prefix (with trailing separator) for the output folder
    minseqs: minimum count_seqs total over all headers needed to continue
    cpus: passed through to calibrate_cmfile

    Returns None. Does nothing when the output folder already exists or the
    group holds fewer than minseqs sequences. Exceptions propagate.
    '''
    # skip if already run and program just crashed or whatever
    currgroup = groupfasta.split("/")[-1].split(".")[0]
    currotufolder = basefolder + currgroup
    if exists(currotufolder):
        return

    # load seqs and make sure we have enough
    aln = LoadSeqs(groupfasta, moltype=RNA, aligned=True)
    count = count_seqs(aln.Names)
    if count < minseqs:
        return

    # get weights for each sequence. weight==count
    # kept as (name, weight-as-text) pairs, in alignment order
    weights = []
    maxweight = 0
    for header in aln.Names:
        weight = count_seqs(header)
        if weight > maxweight:
            maxweight = weight
        weights.append((header.split()[0], str(weight)))

    # fold alignment with bayesfold
    aln, struct = bayesfold(aln, align=False)

    # write log information
    mkdir(currotufolder)
    with open(currotufolder + "/log.txt", 'w') as logout:
        logout.write(' '.join([
            currgroup, ":\n",
            str(count), "sequences\n",
            str(aln.getNumSeqs()), "unique sequences\nStructure: ",
            struct, "\n"
        ]))

    # write out alignment and structure in fasta format
    with open(currotufolder + "/bayesfold-aln.fasta", 'w') as alnout:
        alnout.write(">SS_cons\n%s\n%s" % (struct, aln.toFasta()))

    # shave off info in header for stockholm
    aln = LoadSeqs(data=aln, moltype=RNA,
                   label_to_name=lambda x: x.split()[0])
    # create stockholm formatted alignment
    sto = stockholm_from_alignment(aln, GC_annotation={'SS_cons': struct})
    del aln

    # create standard weights for infernal: each weight scaled by the
    # largest one; built with a single join instead of repeated re-joining
    infweights = ''.join([
        '# =GS %s WT %s\n' % (name, str(float(weight) / maxweight))
        for name, weight in weights
    ])
    # create weights for r2r: "name weight name weight ..."
    r2r_weights = "# =GF USE_THIS_WEIGHT_MAP " + ' '.join(
        ["%s %s" % pair for pair in weights])

    # create sto file with r2r and std weights
    # the last piece of the split stockholm text is overwritten with the
    # weight annotations and a fresh "//" terminator is appended
    # (presumably that last piece is the original terminator - verify
    # against stockholm_from_alignment output)
    stolines = sto.split("\n")
    stolines[-1] = infweights.strip()
    stolines.append(r2r_weights)
    stolines.append("//\n")
    stofile = currotufolder + "/bayesfold-aln.sto"
    with open(stofile, 'w') as alnout:
        alnout.write('\n'.join(stolines))

    # make R2R secondary structure for alignment
    make_r2r(stofile, currotufolder, currgroup)
    # create CM file for infernal from group
    cmfile = currotufolder + "/cmfile.cm"
    cmbuild_from_file(stofile, cmfile, params={'--wgiven': True})
    calibrate_cmfile(cmfile, cpus=cpus)