def construct_db_of_parts(infile, infileparts, outprefix):
    genes = {}  # name, [beg, end]
    genesn = []
    genesfn = {}  # key name, value temp name
    genesf = {}  # key name, value open file
    dbfn = outprefix + ".tempdbfa"
    tempoutf = open(dbfn, "w")
    for i in infileparts:
        spls = i.strip().split(" ")
        name = spls[1]
        shortname = name.split("/")[-1]
        rnges = spls[-1].split("-")
        beg = int(rnges[0])
        end = int(rnges[1])
        genes[name] = [beg, end]
        genesn.append(name)
        genesfn[name] = outprefix + "." + shortname  # open file should be the shortname
        genesf[name] = open(genesfn[name], "w")
    for i in seq.read_fasta_file_iter(infile):
        for j in genesn:
            b, e = genes[j]
            # only keep the part if the ungapped sub-sequence is longer than 100 bp
            if len(i.seq[b - 1:e].replace("-", "")) > 100:
                tempoutf.write(">" + j + "___" + i.name + "\n"
                               + i.seq[b - 1:e].replace("-", "") + "\n")
                genesf[j].write(">" + i.name + "\n" + i.seq[b - 1:e] + "\n")
    tempoutf.close()
    for i in genesn:
        genesf[i].close()  # was genesf[name], which only closed the last file opened
    cmd = ("makeblastdb -in " + dbfn + " -out " + dbfn
           + ".db -dbtype nucl > /dev/null 2>&1")
    os.system(cmd)
    os.remove(dbfn)
    return dbfn, genes, genesfn

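
# Note (inferred from the parsing above, not from separate documentation): each line of
# `infileparts` is expected to be whitespace separated, with the part name as the second
# field and a 1-based "start-end" range as the last field. A hypothetical example line:
#
#   part /path/to/ITS 1-700
#
# Only those two fields are read; parts whose ungapped sub-sequence is 100 bp or shorter
# are excluded from the temporary BLAST database.
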
def make_blast_db_from_cluster_samp(indir, tempdir="./"):
    outf = open(tempdir + tempname, "w")
    for i in os.listdir(indir):
        if i[-3:] != ".fa":
            continue
        fn = i
        # prefer the subsampled file (.samp) if one exists for this cluster
        if os.path.isfile(indir + "/" + i.replace(".fa", ".samp")):
            for j in seq.read_fasta_file_iter(indir + "/" + i.replace(".fa", ".samp")):
                j.name = fn + "___" + j.name
                outf.write(j.get_fasta())
        else:
            for j in seq.read_fasta_file_iter(indir + "/" + i):
                j.name = fn + "___" + j.name
                outf.write(j.get_fasta())
    outf.close()
    cmd = ("makeblastdb -in " + tempdir + tempname + " -out " + tempdir + tempname
           + ".db -dbtype nucl > /dev/null 2>&1")
    os.system(cmd)

def make_files(clus, infile, outfiledir):
    seqs = {}
    for i in seq.read_fasta_file_iter(infile):
        seqs[i.name] = i
    for i in clus:
        i = ".".join(i.split(".")[0:-1]) + ".fa"
        outf = open(outfiledir + "/" + i, "w")
        for j in clus[i]:
            outf.write(seqs[j].get_fasta())
        outf.close()

def make_blast_db_from_cluster(indir):
    outf = open(tempname, "w")
    for i in os.listdir(indir):
        if i[-3:] != ".fa":
            continue
        fn = i
        for j in seq.read_fasta_file_iter(indir + "/" + i):
            j.name = fn + "___" + j.name
            outf.write(j.get_fasta())
    outf.close()
    cmd = ("makeblastdb -in " + tempname + " -out " + tempname
           + ".db -dbtype nucl > /dev/null 2>&1")
    os.system(cmd)

def check_unaligned(infile):
    # returns True only if the file is non-empty and every sequence has the same length
    clen = None
    count = 0
    for i in seq.read_fasta_file_iter(infile):
        count += 1
        if clen is None:
            clen = len(i.seq)
        elif len(i.seq) != clen:
            return False
    if count == 0:
        return False
    return True

def write_merge_table_and_temp_aln_file(filelist, tempdir="./"):
    tf = open(tempdir + "subMSAtable", "w")
    tf2 = open(tempdir + "temp.mergealn", "w")
    count = 1
    addlater = []
    for i in filelist:
        flcount = 0
        for j in seq.read_fasta_file_iter(i):
            flcount += 1
        if flcount > 1:
            for j in seq.read_fasta_file_iter(i):
                tf.write(str(count) + " ")
                count += 1
                tf2.write(j.get_fasta())
            tf.write("# " + i)
            tf.write("\n")
        else:
            for j in seq.read_fasta_file_iter(i):
                addlater.append(j.get_fasta())
    for i in addlater:
        tf2.write(i)
    tf.close()
    tf2.close()

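
# Note: the files written above appear to follow the sub-MSA table format used by
# MAFFT's --merge option (one line of 1-based sequence indices per existing alignment,
# with everything after "#" treated as a comment), so merge_alignments() presumably
# runs something along the lines of:
#
#   mafft --merge subMSAtable temp.mergealn > merged.aln
#
# Single-sequence files get no table entry and are therefore aligned freely rather
# than being kept fixed as an existing sub-alignment.
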
def make_files_bait_cut(clus, bait_cuts, infile, outfiledir):
    seqs = {}
    for i in seq.read_fasta_file_iter(infile):
        seqs[i.name] = i
    for i in clus:
        i = ".".join(i.split(".")[0:-1]) + ".fa"
        outf = open(outfiledir + "/" + i, "w")
        for j in clus[i]:
            log.wac("CUTTING " + seqs[j].name + " "
                    + str(bait_cuts[seqs[j].name][0]) + "-"
                    + str(bait_cuts[seqs[j].name][1]))
            seqs[j].seq = seqs[j].seq[bait_cuts[seqs[j].name][0]:bait_cuts[seqs[j].name][1]]
            outf.write(seqs[j].get_fasta())
        outf.close()

def make_cluster_table(cld, idn, idd, outfile):
    cli = {}
    clns = []
    for i in os.listdir(cld):
        if ".fa" not in i:
            continue
        sps = set()
        firstdef = None
        num = 0
        avl = 0
        for j in seq.read_fasta_file_iter(cld + "/" + i):
            sps.add(idn[j.name])
            firstdef = idd[j.name]
            avl += len(j.seq)
            num += 1
        avl = avl / float(num)
        cli[i] = sps
        if len(sps) > 2:
            clns.append([hl.link(i, "clusters/" + i), len(sps), avl, firstdef])
    clns = sorted(clns, key=lambda x: x[1], reverse=True)
    clns.insert(0, ["<b>name</b>", "<b>num_species</b>",
                    "<b>avg unaln len</b>", "<b>defline</b>"])
    htmlf = open(outfile, "w")
    links = []
    if os.path.isfile(cld + "/../../info.html"):
        links.append([hl.link('back', '../info.html')])
    for i in os.listdir(cld + "/../"):
        if os.path.isdir(cld + "/../" + i) and "clusters" not in i:
            links.append([hl.link(" " + i + " ", i + "/info.html")])
    name = cld.split("/")[-2]
    htmlf.write("<h1>" + name + "</h1>")
    htmlf.write("<div style=\"float: left\">\n")
    htmlc = hl.table(links, style="border: 2px solid #000000; border-collapse: collapse;")
    htmlf.write(htmlc)
    htmlf.write("</div>\n<div style=\"float: left\">\n")
    htmlc = hl.table(clns, width=600,
                     style="border: 2px solid #000000; border-collapse: collapse;")
    htmlf.write(htmlc)
    htmlf.write("</div>\n")
    htmlf.close()

def add_ind_mafft(inseq, cl_file, merge, tempdir="./"):
    # append the new sequence to the cluster fasta
    tf = open(cl_file, "a")
    tf.write(inseq.get_fasta())
    tf.close()
    # make temp merge files so the existing alignment is treated as one sub-MSA
    if merge:
        tf = open(tempdir + "subMSAtable", "w")
        tf2 = open(tempdir + "temp.mergealn", "w")
        count = 1
        for i in seq.read_fasta_file_iter(cl_file.replace(".fa", ".aln")):
            tf2.write(i.get_fasta())
            tf.write(str(count) + " ")
            count += 1
        tf2.write(inseq.get_fasta())
        tf.close()
        tf2.close()
        merge_alignments(cl_file.replace(".fa", ".aln"), tempdir)

def make_info_table(clusterd, idn, outfile):
    cli = {}
    clns = []
    spls = {}
    for i in os.listdir(clusterd):
        if ".fa" not in i:
            continue
        sps = set()
        num = 0
        for j in seq.read_fasta_file_iter(clusterd + "/" + i):
            sps.add(idn[j.name])
            if idn[j.name] not in spls:
                spls[idn[j.name]] = []
            spls[idn[j.name]].append(i)
            num += 1
        cli[i] = sps
        clns.append([i, len(sps)])
    clns = sorted(clns, key=lambda x: x[1], reverse=True)
    keep = []
    if len(spls) > 20:
        for i in clns:
            if i[1] >= 4:
                keep.append(i)
    else:
        keep = clns
    outfile = open(outfile, "w")
    outfile.write("species")
    for i in keep:
        outfile.write("," + i[0])
    outfile.write("\n")
    for j in spls:
        outfile.write(j)
        for i in keep:
            if j in cli[i[0]]:
                outfile.write(",x")
            else:
                outfile.write(",")
        outfile.write("\n")
    outfile.close()

def make_table_from_fasta(DB, fastafile, outfilen):
    conn = sqlite3.connect(DB)
    c = conn.cursor()
    outfile = open(outfilen, "w")
    for x in seq.read_fasta_file_iter(fastafile):
        sid = x.name
        c.execute("select * from sequence where accession_id = ?", (sid, ))
        l = c.fetchall()
        j0, j1, j2, j3, j4, tname = "", "", "", "", "", ""
        for j in l:
            j0 = str(j[0])
            j1 = str(j[1])
            j2 = str(j[2])
            j3 = str(j[3])
            j4 = str(j[4])
        # look up the scientific name for the ncbi_id stored in the second column
        c.execute("select name from taxonomy where ncbi_id = ? and name_class = 'scientific name'",
                  (j1, ))
        l = c.fetchall()
        for j in l:
            tname = str(j[0])
        outfile.write(j0 + "\t" + j1 + "\t" + j2 + "\t" + j3 + "\t"
                      + str(tname) + "\t" + j4 + "\n")
    outfile.close()

    while going:
        found = False
        for i in rt.iternodes():
            # collapse unlabeled knuckles (internal nodes with a single child)
            if i.parent is not None and len(i.children) == 1 and i.label == "":
                par = i.parent
                ch = i.children[0]
                par.remove_child(i)
                par.add_child(ch)
                found = True
                break
        if not found:
            going = False
            break
    return rt


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("python " + sys.argv[0] + " dbname baseid alnfile")
        sys.exit(0)
    dbname = sys.argv[1]
    baseid = sys.argv[2]
    conn = sqlite3.connect(dbname)
    c = conn.cursor()
    ids = set()
    for i in seq.read_fasta_file_iter(sys.argv[3]):
        ids.add(get_taxid_for_name(c, i.name.replace("_", " ")))
    t = construct_tree_only_ids(baseid, c, ids)
    print(t.get_newick_repr(False) + ";")

import sys
import seq
import os

if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("python " + sys.argv[0] + " table infile...")
        sys.exit(0)
    tab = open(sys.argv[1], "r")
    idn = {}
    for i in tab:
        spls = i.strip().split("\t")
        idn[spls[3]] = spls[4]
    tab.close()
    for j in sys.argv[2:]:
        outf = open(j + ".rn", "w")
        for i in seq.read_fasta_file_iter(j):
            # prefix the taxon name (spaces replaced by underscores) to the sequence id
            i.name = idn[i.name].replace(" ", "_") + "_" + i.name
            outf.write(i.get_fasta())
        outf.close()

print("python " + sys.argv[0] + " tree files...") sys.exit(0) tree = next(tree_reader.read_tree_file_iter(sys.argv[1])) genes = {} seqfiles = [] badseqs = [] goodseqs = [] for i in sys.argv[2:]: if i[0] != "_": seqfiles.append(i) else: goodseqs.append(i[1:]) for i in seqfiles: genes[i] = [] for j in seq.read_fasta_file_iter(i): genes[i].append(j.name) print(seqfiles) print(goodseqs) filt = set() for i in tree.iternodes(order="POSTORDER"): if len(i.children) == 0: continue lvsnms = i.lvsnms() lvsnmsin = set() G = nx.MultiGraph() gene_per_taxon = {} for j in genes: ndgenes = []
        outf = open(outfiledir + "/" + i, "w")
        for j in clus[i]:
            outf.write(seqs[j].get_fasta())
        outf.close()


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("python " + sys.argv[0] + " indir logfile")
        sys.exit(0)
    if sys.argv[1][-1] == "/":
        sys.argv[1] = sys.argv[1][:-1]
    sp = sys.argv[1].split("/")[-1]
    INFILE = sys.argv[1] + "/" + sp
    count = 0
    for i in seq.read_fasta_file_iter(INFILE + ".fas"):
        count += 1
    if count == 0:
        sys.exit(0)
    LOGFILE = sys.argv[2]
    log = Logger(LOGFILE)
    cmd = ("blastn -db " + tempname + ".db -query " + INFILE + ".fas -perc_identity "
           + str(perc_identity) + " -evalue " + str(evalue_limit) + " -num_threads "
           + str(nthread) + " -max_target_seqs 10000000 -out " + INFILE
           + ".fasta.rawblastn -outfmt '6 qseqid qlen sseqid slen frames pident nident"
           " length mismatch gapopen qstart qend sstart send evalue bitscore'")
    log.wac("RUNNING " + cmd)
    os.system(cmd)
    # process the files
    inf = open(INFILE + ".fasta.rawblastn", "r")
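    # For reference, the "-outfmt 6" specifier above yields these tab-separated columns,
    # in order: qseqid qlen sseqid slen frames pident nident length mismatch gapopen
    # qstart qend sstart send evalue bitscore. Downstream parsing of the
    # .fasta.rawblastn file indexes columns on that basis.
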
LOGFILE = sys.argv[4]
log = Logger(LOGFILE)
log.a()
tab = open(sys.argv[1], "r")
idn = {}
for i in tab:
    spls = i.strip().split("\t")
    idn[spls[3]] = spls[4]
tab.close()
dirr = sys.argv[2]
for o in os.listdir(dirr):
    if fend is not None:
        if fend not in o:
            continue
    seqs = {}
    for i in seq.read_fasta_file_iter(dirr + "/" + o):
        if idn[i.name] not in seqs:
            seqs[idn[i.name]] = []
        seqs[idn[i.name]].append(i)
    # gets the longest sequence for each species
    for i in seqs:
        if len(seqs[i]) > 1:
            longest = None
            longestn = 0
            for j in seqs[i]:
                if len(j.seq) > longestn:
                    longest = j
                    longestn = len(j.seq)
            seqs[i] = [longest]
    fn = open(dirr + "/" + o, "w")
    keep = []

parser.add_argument("-i", "--infile", type=str, help="Input fasta alignment", required=True) parser.add_argument("-o", "--outfile", type=str, help="Output fasta alignment", required=True) return parser if __name__ == "__main__": parser = generate_argparser() if len(sys.argv[1:]) == 0: sys.argv.append("-h") args = parser.parse_args(sys.argv[1:]) tab = open(args.table, "r") idn = {} for i in tab: spls = i.strip().split("\t") idn[spls[1]] = spls[4] tab.close() outf = open(args.outfile, "w") for i in seq.read_fasta_file_iter(args.infile): i.name = idn[i.name].replace(" ", "_") outf.write(i.get_fasta()) outf.close()
def make_cluster_table(cld, idn, idd, outfile):
    cli = {}
    clns = []
    for i in os.listdir(cld):
        if ".fa" not in i:
            continue
        sps = set()
        firstdef = None
        num = 0
        avl = 0
        for j in seq.read_fasta_file_iter(cld + "/" + i):
            sps.add(idn[j.name])
            firstdef = idd[j.name]
            avl += len(j.seq)
            num += 1
        try:
            avl = avl / float(num)
        except ZeroDivisionError:
            avl = 0.0
        cli[i] = sps
        if len(sps) > 2:
            clns.append([hl.link(i, "clusters/" + i), len(sps),
                         format(avl, '.4f'), firstdef])
    clns = sorted(clns, key=lambda x: x[1], reverse=True)
    htmlf = open(outfile, "w")
    fhr = None
    if os.path.isfile(cld + "/../../info.html"):
        fhr = [hl.link('back', '../info.html')]
    else:
        fhr = [""]
    links = []
    for i in os.listdir(cld + "/../"):
        if os.path.isdir(cld + "/../" + i) and "clusters" not in i:
            links.append([hl.link(" " + i + " ", i + "/info.html")])
    name = cld.split("/")[-2]
    htmlf.write(htmlbegin)
    htmlf.write(
        '<div class="row"><div class="col">\n<pre>\n ___ ___ __ ____ ___ _ _____ \n / _ \__ __/ _ \/ // / / / _ | | /| / / _ \ \n / ___/ // / ___/ _ / /__/ __ | |/ |/ / // /\n /_/ \_, /_/ /_//_/____/_/ |_|__/|__/____/ \n /___/ results </pre></div>\n'
    )
    htmlf.write('<div class="col"><br><h1>' + name + '</h1></div>\n</div>\n')
    htmlf.write("<div class=\"row\">\n<div class=\"col-sm-3\">\n")
    htmlc = hl.table(links, style=None, border=None, cellpadding=None,
                     classs="table", header_row=fhr)
    htmlf.write(htmlc)
    htmlf.write("</div>\n<div class=\"col\">\n")
    htmlc = hl.table(clns, style=None, border=None, cellpadding=None,
                     classs="table",
                     header_row=['name', 'num_species', 'avg len', 'defline'])
    htmlf.write(htmlc)
    htmlf.write("</div>\n</div>\n")
    htmlf.write(htmlend)
    htmlf.close()

        else:
            blast_file_against_db(dir1, i, tempdir)
        dclus, clus = filter_blast.process_blast_out(tempdir + tempname + ".rawblastn")
        if len(clus) > 0:
            for j in clus:
                G.add_edge(dir1 + "/" + i, diro + "/" + j)
        else:
            G.add_node(dir1 + "/" + i)
    # need to log these operations
    origcurcount = curcount
    for i in nx.connected_components(G):
        tf = open(diro + "/" + "cluster" + str(curcount) + ".fa", "w")
        log.w(" ".join(["MERGING FASTA TO", diro + "/cluster" + str(curcount) + ".fa",
                        "FROM", " ".join(list(i))]))
        curcount += 1
        for j in i:
            for k in seq.read_fasta_file_iter(j):
                tf.write(k.get_fasta())
        tf.close()
    if use_merge:
        for i in nx.connected_components(G):
            if len(i) > 1:
                x = [j.replace(".fa", ".aln") for j in i]
                log.w(" ".join(["MERGING ALIGNMENTS FROM", " ".join(x)]))
                write_merge_table_and_temp_aln_file(x, tempdir)
                outfile = diro + "/" + "cluster" + str(origcurcount) + ".aln"
                merge_alignments(outfile, tempdir)
                log.w(" ".join(["CREATED FROM MERGE",
                                diro + "/cluster" + str(origcurcount) + ".aln"]))
                for j in i:
                    if diro in j:
                        log.w(" ".join(["REMOVING ALIGNMENTS", j, j.replace(".fa", ".aln")]))
                        os.remove(j)

if len(sys.argv) != 3:
    print("python " + sys.argv[0] + " curdir logfile")
    sys.exit(0)
curd = sys.argv[1]
LOGFILE = sys.argv[2]
ff = None
dirs = []
for i in os.listdir(curd):
    if ".fas" == i[-4:]:
        ff = i
    elif os.path.isdir(curd + "/" + i) and i != "clusters":
        dirs.append(curd + "/" + i + "/" + i + ".fas")
seqs = {}
for i in seq.read_fasta_file_iter(curd + "/" + ff):
    seqs[i.name] = i
for i in dirs:
    for j in seq.read_fasta_file_iter(i):
        # check against seqs so a name seen in more than one child directory
        # is not deleted twice
        if len(j.name) > 0 and j.name in seqs:
            del seqs[j.name]
if len(seqs) > 0:
    outfile = open(curd + "/notinchildren.fas", "w")
    for i in seqs:
        outfile.write(seqs[i].get_fasta())
    outfile.close()

LOGFILE = sys.argv[4]
log = Logger(LOGFILE)
log.a()
tab = open(sys.argv[1], "r")
idn = {}
for i in tab:
    spls = i.strip().split("\t")
    idn[spls[3]] = spls[4]
tab.close()
dirr = sys.argv[2]
for o in os.listdir(dirr):
    if fend is not None:
        if fend not in o:
            continue
    seqs = {}
    for i in seq.read_fasta_file_iter(dirr + "/" + o):
        if idn[i.name] not in seqs:
            seqs[idn[i.name]] = []
        seqs[idn[i.name]].append(i)
    for i in seqs:
        if len(seqs[i]) > 1:
            longest = None
            longestn = 0
            for j in seqs[i]:
                if len(j.seq) > longestn:
                    longest = j
                    longestn = len(j.seq)
            seqs[i] = [longest]
    fn = open(dirr + "/" + o, "w")
    for i in seqs:
        for j in seqs[i]:

import os
import sys
import trim_tips
import seq

if __name__ == "__main__":
    if len(sys.argv) != 7:
        print("python " + sys.argv[0]
              + " file.tre aln relative_cutoff absolute_cutoff outtre outaln")
        sys.exit(0)
    treefile = sys.argv[1]
    alnfile = sys.argv[2]
    rel_cut = sys.argv[3]
    abs_cut = sys.argv[4]
    outtree = open(sys.argv[5], "w")
    tree, removed = trim_tips.main(treefile, rel_cut, abs_cut)
    outtree.write(tree.get_newick_repr(True) + ";\n")
    outtree.close()
    outaln = open(sys.argv[6], "w")
    for i in seq.read_fasta_file_iter(alnfile):
        if i.name not in removed:
            outaln.write(i.get_fasta())
    outaln.close()

def write_fasta_file(files, outfilen):
    outfile = open(outfilen, "w")
    for i in files:
        for j in seq.read_fasta_file_iter(i):
            outfile.write(j.get_fasta())
    outfile.close()