def construct_db_of_parts(infile, infileparts, outprefix):
    """Split an alignment into per-gene FASTA files and build a BLAST db.

    Each entry of ``infileparts`` is stripped and split on single spaces; the
    second field is used as the gene name and the last field as a
    ``beg-end`` column range (1-based, inclusive, as implied by the
    ``[beg - 1:end]`` slice). For every sequence of ``infile`` and every
    gene, the column slice is kept only when its gap-free length is greater
    than 100, in which case it is written

    * with gaps removed to ``<outprefix>.tempdbfa`` under the header
      ``>gene___seqname``;
    * with gaps kept to ``<outprefix>.<last "/" component of the gene name>``
      under the header ``>seqname``.

    ``makeblastdb`` is then run on the combined file (output prefix
    ``<outprefix>.tempdbfa.db``) and the combined FASTA file is removed.

    Returns a tuple ``(dbfn, genes, genesfn)`` where ``dbfn`` is the (already
    deleted) combined FASTA path, ``genes`` maps gene name to ``[beg, end]``
    and ``genesfn`` maps gene name to its per-gene output filename.
    """
    genes = {}  # gene name -> [beg, end]
    genesn = []  # gene names in input order
    genesfn = {}  # gene name -> per-gene output filename
    genesf = {}  # gene name -> open per-gene file handle
    dbfn = outprefix + ".tempdbfa"
    tempoutf = open(dbfn, "w")
    try:
        for line in infileparts:
            spls = line.strip().split(" ")
            name = spls[1]
            shortname = name.split("/")[-1]
            rnges = spls[-1].split("-")
            genes[name] = [int(rnges[0]), int(rnges[1])]
            genesn.append(name)
            # The per-gene file is named after the short name only.
            genesfn[name] = outprefix + "." + shortname
            genesf[name] = open(genesfn[name], "w")
        for rec in seq.read_fasta_file_iter(infile):
            for gene in genesn:
                beg, end = genes[gene]
                aligned = rec.seq[beg - 1:end]
                ungapped = aligned.replace("-", "")
                if len(ungapped) > 100:
                    tempoutf.write(">" + gene + "___" + rec.name + "\n" +
                                   ungapped + "\n")
                    genesf[gene].write(">" + rec.name + "\n" + aligned + "\n")
    finally:
        tempoutf.close()
        # Close every per-gene handle; the previous code closed only the
        # handle of the last gene read (it indexed with a stale variable).
        for handle in genesf.values():
            handle.close()
    cmd = "makeblastdb -in " + dbfn + " -out " + dbfn + ".db -dbtype nucl > /dev/null 2>&1"
    os.system(cmd)
    os.remove(dbfn)
    return dbfn, genes, genesfn
def make_blast_db_from_cluster_samp(indir, tempdir="./"):
    """Build a nucleotide BLAST database from the cluster files in ``indir``.

    For every directory entry ending in ``.fa``, the sequences are read from
    the companion file whose name has ``.fa`` replaced by ``.samp`` when that
    file exists, otherwise from the ``.fa`` file itself. Each sequence is
    renamed to ``<fa filename>___<original name>`` (the ``.fa`` name is used
    as the prefix even when the ``.samp`` file is read) and written to
    ``tempdir + tempname``; ``makeblastdb`` is then run on that file with
    output prefix ``tempdir + tempname + ".db"``.

    ``tempname`` is a module-level name that is not visible in this excerpt.
    """
    with open(tempdir + tempname, "w") as outf:
        for fn in os.listdir(indir):
            if not fn.endswith(".fa"):
                continue
            source = indir + "/" + fn.replace(".fa", ".samp")
            if not os.path.isfile(source):
                # No sampled file available: fall back to the full cluster.
                source = indir + "/" + fn
            for rec in seq.read_fasta_file_iter(source):
                rec.name = fn + "___" + rec.name
                outf.write(rec.get_fasta())
    cmd = "makeblastdb -in "+tempdir+tempname+" -out "+tempdir+tempname+".db -dbtype nucl > /dev/null 2>&1"
    os.system(cmd)
示例#3
0
def make_files(clus, infile, outfiledir):
    """Write one FASTA file per cluster into ``outfiledir``.

    ``clus`` maps a cluster key to an iterable of sequence names; the
    sequences themselves are looked up by name among those read from
    ``infile``. The output filename is the cluster key with its last
    dot-separated component replaced by ``fa``.

    Raises ``KeyError`` if a cluster member is not present in ``infile``.
    """
    seqs = {}
    for rec in seq.read_fasta_file_iter(infile):
        seqs[rec.name] = rec
    for key in clus:
        # Keep the original key for the lookup: the previous code overwrote
        # the loop variable with the derived filename and then indexed
        # ``clus`` with it, which fails unless the key already ends in ".fa".
        outname = ".".join(key.split(".")[0:-1]) + ".fa"
        with open(outfiledir + "/" + outname, "w") as outf:
            for member in clus[key]:
                outf.write(seqs[member].get_fasta())
示例#4
0
def make_blast_db_from_cluster(indir):
    """Build a nucleotide BLAST database from the ``.fa`` files in ``indir``.

    Every sequence of every directory entry ending in ``.fa`` is renamed to
    ``<filename>___<original name>`` and written to the file ``tempname``;
    ``makeblastdb`` is then run on it with output prefix ``tempname + ".db"``.

    ``tempname`` is a module-level name that is not visible in this excerpt.
    """
    # The context manager guarantees the handle is closed (and flushed)
    # before makeblastdb runs, even if reading an input file fails.
    with open(tempname, "w") as outf:
        for fn in os.listdir(indir):
            if not fn.endswith(".fa"):
                continue
            for rec in seq.read_fasta_file_iter(indir + "/" + fn):
                rec.name = fn + "___" + rec.name
                outf.write(rec.get_fasta())
    cmd = "makeblastdb -in " + tempname + " -out " + tempname + ".db -dbtype nucl > /dev/null 2>&1"
    os.system(cmd)
def check_unaligned(infile):
    """Report whether all sequences in ``infile`` have the same length.

    Returns ``True`` when the file yields at least one sequence and every
    sequence is as long as the first one; returns ``False`` as soon as a
    length mismatch is seen, or when the file yields no sequences at all.
    """
    expected_len = None
    nseqs = 0
    for rec in seq.read_fasta_file_iter(infile):
        nseqs += 1
        cur_len = len(rec.seq)
        if expected_len is None:
            # First record fixes the reference length.
            expected_len = cur_len
        elif cur_len != expected_len:
            return False
    return nseqs != 0
def write_merge_table_and_temp_aln_file(filelist, tempdir="./"):
    """Write the ``subMSAtable`` and ``temp.mergealn`` files into ``tempdir``.

    For every file in ``filelist`` holding more than one sequence, its
    sequences are appended to ``temp.mergealn`` and one table line is written
    to ``subMSAtable``: the running 1-based index of each of those sequences
    separated by spaces, followed by ``# <filename>``. Sequences from files
    holding at most one sequence get no table line and are appended to
    ``temp.mergealn`` after all the multi-sequence files.
    """
    count = 1
    addlater = []
    with open(tempdir + "subMSAtable", "w") as tf:
        with open(tempdir + "temp.mergealn", "w") as tf2:
            for fn in filelist:
                # Read each file once instead of parsing it a second time
                # just to learn how many records it holds.
                recs = list(seq.read_fasta_file_iter(fn))
                if len(recs) > 1:
                    for rec in recs:
                        tf.write(str(count) + " ")
                        count += 1
                        tf2.write(rec.get_fasta())
                    tf.write("# " + fn)
                    tf.write("\n")
                else:
                    for rec in recs:
                        addlater.append(rec.get_fasta())
            for fasta in addlater:
                tf2.write(fasta)
示例#7
0
def make_files_bait_cut(clus, bait_cuts, infile, outfiledir):
    """Write one FASTA file per cluster, trimming each sequence first.

    ``clus`` maps a cluster key to an iterable of sequence names and
    ``bait_cuts`` maps a sequence name to a ``(start, stop)`` pair used
    directly as a Python slice on the sequence string. Each cut is reported
    through ``log.wac`` before it is applied. The output filename is the
    cluster key with its last dot-separated component replaced by ``fa``.

    Note that the trimming is stored back on the sequence object, so a name
    listed in more than one cluster is sliced again on each occurrence.

    ``log`` is a module-level name that is not visible in this excerpt.
    """
    seqs = {}
    for rec in seq.read_fasta_file_iter(infile):
        seqs[rec.name] = rec
    for key in clus:
        # Keep the original key for the lookup: the previous code overwrote
        # the loop variable with the derived filename and then indexed
        # ``clus`` with it, which fails unless the key already ends in ".fa".
        outname = ".".join(key.split(".")[0:-1]) + ".fa"
        with open(outfiledir + "/" + outname, "w") as outf:
            for member in clus[key]:
                rec = seqs[member]
                cut = bait_cuts[rec.name]
                log.wac("CUTTING " + rec.name + " " + str(cut[0]) + "-" +
                        str(cut[1]))
                rec.seq = rec.seq[cut[0]:cut[1]]
                outf.write(rec.get_fasta())
示例#8
0
def make_cluster_table(cld, idn, idd, outfile):
    """Write an HTML summary of the cluster files in ``cld`` to ``outfile``.

    For every entry of ``cld`` whose name contains ``.fa`` the sequences are
    read; ``idn`` maps a sequence name to its species and ``idd`` maps a
    sequence name to its defline. Clusters with more than two distinct
    species get a table row holding a link to the cluster, the species
    count, the mean sequence length, and the defline of the last sequence
    read. Rows are sorted by species count, largest first, under a bold
    header row.

    The page also contains a table of navigation links: a ``back`` link when
    ``cld/../../info.html`` exists, and one link per directory in ``cld/../``
    whose name does not contain ``clusters``.

    ``cld`` must contain at least one ``/``: the page title is its
    second-to-last ``/``-separated component.
    """
    clns = []
    for fname in os.listdir(cld):
        if ".fa" not in fname:
            continue
        sps = set()
        firstdef = None
        num = 0
        total_len = 0
        for rec in seq.read_fasta_file_iter(cld + "/" + fname):
            sps.add(idn[rec.name])
            firstdef = idd[rec.name]
            total_len += len(rec.seq)
            num += 1
        # An empty cluster file used to raise ZeroDivisionError here.
        avl = total_len / float(num) if num else 0.0
        if len(sps) > 2:
            clns.append([hl.link(fname, "clusters/" + fname), len(sps), avl,
                         firstdef])

    clns.sort(key=lambda row: row[1], reverse=True)
    clns.insert(0, ["<b>name</b>", "<b>num_species</b>",
                    "<b>avg unaln len</b>", "<b>defline</b>"])

    links = []
    if os.path.isfile(cld + "/../../info.html"):
        links.append([hl.link('back', '../info.html')])
    for entry in os.listdir(cld + "/../"):
        if os.path.isdir(cld + "/../" + entry) and "clusters" not in entry:
            links.append([hl.link("  " + entry + "  ", entry + "/info.html")])

    name = cld.split("/")[-2]
    table_style = "border: 2px solid #000000; border-collapse: collapse;"
    with open(outfile, "w") as htmlf:
        htmlf.write("<h1>" + name + "</h1>")
        htmlf.write("<div style=\"float: left\">\n")
        htmlf.write(hl.table(links, style=table_style))
        htmlf.write("</div>\n<div style=\"float: left\">\n")
        htmlf.write(hl.table(clns, width=600, style=table_style))
        htmlf.write("</div>\n")
def add_ind_mafft(inseq, cl_file, merge, tempdir="./"):
    """Append ``inseq`` to a cluster file and optionally merge it into its
    alignment.

    ``inseq`` is always appended to ``cl_file``. When ``merge`` is true, the
    alignment whose path is ``cl_file`` with ``.fa`` replaced by ``.aln`` is
    copied to ``tempdir + "temp.mergealn"`` followed by ``inseq``, the
    1-based indices of the already-aligned sequences are written
    space-separated to ``tempdir + "subMSAtable"``, and
    ``merge_alignments(<aln path>, tempdir)`` is called.

    ``merge_alignments`` is defined outside this excerpt.
    """
    with open(cl_file, "a") as clf:
        clf.write(inseq.get_fasta())
    if not merge:
        return
    alnfile = cl_file.replace(".fa", ".aln")
    with open(tempdir + "subMSAtable", "w") as tf:
        with open(tempdir + "temp.mergealn", "w") as tf2:
            count = 1
            for rec in seq.read_fasta_file_iter(alnfile):
                tf2.write(rec.get_fasta())
                tf.write(str(count) + " ")
                count += 1
            # The new sequence goes last and is not listed in the table.
            tf2.write(inseq.get_fasta())
    merge_alignments(alnfile, tempdir)
示例#10
0
def make_info_table(clusterd, idn, outfile):
    """Write a species-by-cluster presence table as CSV to ``outfile``.

    Every entry of ``clusterd`` whose name contains ``.fa`` is read and
    ``idn`` maps each sequence name to its species. Clusters are ordered by
    number of distinct species, largest first; when more than 20 species
    were seen overall, only clusters with at least 4 species are kept as
    columns. The first row is ``species`` followed by the cluster filenames;
    each following row is a species name followed by ``x`` (present) or an
    empty field (absent) per kept cluster.
    """
    cli = {}  # cluster filename -> set of species in it
    clns = []  # [cluster filename, number of species]
    spls = {}  # species -> cluster filenames, one entry per sequence
    for fname in os.listdir(clusterd):
        if ".fa" not in fname:
            continue
        sps = set()
        for rec in seq.read_fasta_file_iter(clusterd + "/" + fname):
            species = idn[rec.name]
            sps.add(species)
            spls.setdefault(species, []).append(fname)
        cli[fname] = sps
        clns.append([fname, len(sps)])
    clns.sort(key=lambda row: row[1], reverse=True)
    if len(spls) > 20:
        keep = [row for row in clns if row[1] >= 4]
    else:
        keep = clns
    # Use a separate name for the handle instead of rebinding the parameter.
    with open(outfile, "w") as outf:
        outf.write("species")
        for row in keep:
            outf.write("," + row[0])
        outf.write("\n")
        for species in spls:
            outf.write(species)
            for row in keep:
                if species in cli[row[0]]:
                    outf.write(",x")
                else:
                    outf.write(",")
            outf.write("\n")
示例#11
0
def make_table_from_fasta(DB, fastafile, outfilen):
    """Write a tab-separated table describing the sequences of ``fastafile``.

    For every sequence, the ``sequence`` table of the SQLite database ``DB``
    is queried by ``accession_id`` (the sequence name); if several rows
    match, the last one is used. The second column of that row is then
    looked up as ``ncbi_id`` in the ``taxonomy`` table to obtain the
    scientific name. Each output line holds columns 0-3 of the sequence row,
    the scientific name, then column 4. Fields are empty strings when no row
    was found.
    """
    conn = sqlite3.connect(DB)
    try:
        c = conn.cursor()
        with open(outfilen, "w") as outfile:
            for rec in seq.read_fasta_file_iter(fastafile):
                c.execute("select * from sequence where accession_id = ?",
                          (rec.name, ))
                j0, j1, j2, j3, j4, tname = "", "", "", "", "", ""
                rows = c.fetchall()
                if rows:
                    last = rows[-1]
                    j0, j1, j2, j3, j4 = (str(last[k]) for k in range(5))
                c.execute(
                    "select name from taxonomy where ncbi_id = ? and name_class = 'scientific name'",
                    (j1, ))
                for row in c.fetchall():
                    tname = str(row[0])
                outfile.write(j0 + "\t" + j1 + "\t" + j2 + "\t" + j3 + "\t" +
                              str(tname) + "\t" + j4 + "\n")
    finally:
        # The connection was previously never closed.
        conn.close()
示例#12
0
    while going:
        found = False
        for i in rt.iternodes():
            if i.parent != None and len(i.children) == 1 and i.label == "":
                par = i.parent
                ch = i.children[0]
                par.remove_child(i)
                par.add_child(ch)
                found = True
                break
        if found == False:
            going = False
            break
    return rt


if __name__ == "__main__":
    # Usage: dbname baseid alnfile
    # Resolves every sequence name of the alignment (underscores turned into
    # spaces) to a taxon id via get_taxid_for_name, builds a tree restricted
    # to those ids below baseid, and prints it in newick form.
    if len(sys.argv) != 4:
        # Parenthesised single-argument form prints the same text under both
        # Python 2 and Python 3.
        print("python " + sys.argv[0] + " dbname baseid alnfile")
        sys.exit(0)

    dbname = sys.argv[1]
    baseid = sys.argv[2]
    conn = sqlite3.connect(dbname)
    try:
        c = conn.cursor()
        ids = set()
        for i in seq.read_fasta_file_iter(sys.argv[3]):
            ids.add(get_taxid_for_name(c, i.name.replace("_", " ")))
        t = construct_tree_only_ids(baseid, c, ids)
        print(t.get_newick_repr(False) + ";")
    finally:
        # The connection was previously left open until interpreter exit.
        conn.close()
示例#13
0
import sys
import seq
import os

if __name__ == "__main__":
    # Usage: table infile...
    # Renames the sequences of each input FASTA file to
    # "<table name with spaces as underscores>_<original name>" and writes
    # the result next to the input as "<infile>.rn".
    if len(sys.argv) < 3:
        print("python " + sys.argv[0] + " table infile...")
        sys.exit(0)

    # Tab-separated table: the fourth column is the sequence id and the
    # fifth column the name to substitute.
    idn = {}
    with open(sys.argv[1], "r") as tab:
        for i in tab:
            spls = i.strip().split("\t")
            idn[spls[3]] = spls[4]
    for j in sys.argv[2:]:
        with open(j + ".rn", "w") as outf:
            for i in seq.read_fasta_file_iter(j):
                # A stray comma here used to build a tuple and apply unary
                # plus to a str, raising TypeError on the first sequence.
                i.name = idn[i.name].replace(" ", "_") + "_" + i.name
                outf.write(i.get_fasta())
        print("python " + sys.argv[0] + " tree files...")
        sys.exit(0)

    tree = next(tree_reader.read_tree_file_iter(sys.argv[1]))
    genes = {}
    seqfiles = []
    badseqs = []
    goodseqs = []
    for i in sys.argv[2:]:
        if i[0] != "_":
            seqfiles.append(i)
        else:
            goodseqs.append(i[1:])
    for i in seqfiles:
        genes[i] = []
        for j in seq.read_fasta_file_iter(i):
            genes[i].append(j.name)

    print(seqfiles)
    print(goodseqs)
    filt = set()

    for i in tree.iternodes(order="POSTORDER"):
        if len(i.children) == 0:
            continue
        lvsnms = i.lvsnms()
        lvsnmsin = set()
        G = nx.MultiGraph()
        gene_per_taxon = {}
        for j in genes:
            ndgenes = []
示例#15
0
        outf = open(outfiledir + "/" + i, "w")
        for j in clus[i]:
            outf.write(seqs[j].get_fasta())
        outf.close()


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print "python " + sys.argv[0] + " indir logfile"
        sys.exit(0)
    if sys.argv[1][-1] == "/":
        sys.argv[1] = sys.argv[1][:-1]
    sp = sys.argv[1].split("/")[-1]
    INFILE = sys.argv[1] + "/" + sp
    count = 0
    for i in seq.read_fasta_file_iter(INFILE + ".fas"):
        count += 1
    if count == 0:
        sys.exit(0)
    LOGFILE = sys.argv[2]
    log = Logger(LOGFILE)
    cmd = "blastn -db " + tempname + ".db -query " + INFILE + ".fas -perc_identity " + str(
        perc_identity
    ) + " -evalue " + str(evalue_limit) + " -num_threads " + str(
        nthread
    ) + " -max_target_seqs 10000000 -out " + INFILE + ".fasta.rawblastn -outfmt '6 qseqid qlen sseqid slen frames pident nident length mismatch gapopen qstart qend sstart send evalue bitscore'"
    log.wac("RUNNING " + cmd)
    os.system(cmd)

    #process the files
    inf = open(INFILE + ".fasta.rawblastn", "r")
     LOGFILE = sys.argv[4]
 log = Logger(LOGFILE)
 log.a()
 tab = open(sys.argv[1], "r")
 idn = {}
 for i in tab:
     spls = i.strip().split("\t")
     idn[spls[3]] = spls[4]
 tab.close()
 dirr = sys.argv[2]
 for o in os.listdir(dirr):
     if fend != None:
         if fend not in o:
             continue
     seqs = {}
     for i in seq.read_fasta_file_iter(dirr + "/" + o):
         if idn[i.name] not in seqs:
             seqs[idn[i.name]] = []
         seqs[idn[i.name]].append(i)
     #gets the longest
     for i in seqs:
         if len(seqs[i]) > 1:
             longest = None
             longestn = 0
             for j in seqs[i]:
                 if len(j.seq) > longestn:
                     longest = j
                     longestn = len(j.seq)
             seqs[i] = [longest]
     fn = open(dirr + "/" + o, "w")
     keep = []
示例#17
0
    parser.add_argument("-i",
                        "--infile",
                        type=str,
                        help="Input fasta alignment",
                        required=True)
    parser.add_argument("-o",
                        "--outfile",
                        type=str,
                        help="Output fasta alignment",
                        required=True)
    return parser


if __name__ == "__main__":
    # Renames every sequence of the input alignment using a tab-separated
    # lookup table and writes the renamed alignment to the output file.
    parser = generate_argparser()
    if len(sys.argv[1:]) == 0:
        # No arguments at all: show the help text instead of an error.
        sys.argv.append("-h")
    args = parser.parse_args(sys.argv[1:])

    # The second column of the table is the sequence id and the fifth column
    # the replacement name.
    idn = {}
    with open(args.table, "r") as tab:
        for i in tab:
            spls = i.strip().split("\t")
            idn[spls[1]] = spls[4]
    with open(args.outfile, "w") as outf:
        for i in seq.read_fasta_file_iter(args.infile):
            i.name = idn[i.name].replace(" ", "_")
            outf.write(i.get_fasta())
示例#18
0
def make_cluster_table(cld, idn, idd, outfile):
    """Write an HTML summary page of the cluster files in ``cld``.

    For every entry of ``cld`` whose name contains ``.fa`` the sequences are
    read; ``idn`` maps a sequence name to its species and ``idd`` maps a
    sequence name to its defline. Clusters with more than two distinct
    species get a table row holding a link to the cluster, the species
    count, the mean sequence length formatted to four decimals (``0.0`` for
    an empty file), and the defline of the last sequence read. Rows are
    sorted by species count, largest first.

    The page is wrapped in the module-level ``htmlbegin`` / ``htmlend``
    strings (not visible in this excerpt) and also contains a navigation
    table: one link per directory in ``cld/../`` whose name does not contain
    ``clusters``, with a ``back`` link as header when
    ``cld/../../info.html`` exists.

    ``cld`` must contain at least one ``/``: the page title is its
    second-to-last ``/``-separated component.
    """
    clns = []
    for fname in os.listdir(cld):
        if ".fa" not in fname:
            continue
        sps = set()
        firstdef = None
        num = 0
        total_len = 0
        for rec in seq.read_fasta_file_iter(cld + "/" + fname):
            sps.add(idn[rec.name])
            firstdef = idd[rec.name]
            total_len += len(rec.seq)
            num += 1
        # Explicit guard instead of a bare ``except`` around the division,
        # which would also have hidden unrelated errors.
        avl = total_len / float(num) if num else 0.0
        if len(sps) > 2:
            clns.append([
                hl.link(fname, "clusters/" + fname),
                len(sps),
                format(avl, '.4f'), firstdef
            ])

    clns.sort(key=lambda row: row[1], reverse=True)

    if os.path.isfile(cld + "/../../info.html"):
        fhr = [hl.link('back', '../info.html')]
    else:
        fhr = [""]
    links = []
    for entry in os.listdir(cld + "/../"):
        if os.path.isdir(cld + "/../" + entry) and "clusters" not in entry:
            links.append([hl.link("  " + entry + "  ", entry + "/info.html")])

    name = cld.split("/")[-2]
    with open(outfile, "w") as htmlf:
        htmlf.write(htmlbegin)
        # Backslashes of the ASCII-art banner are escaped explicitly; the
        # resulting string is unchanged but no longer relies on invalid
        # escape sequences being passed through.
        htmlf.write(
            '<div class="row"><div class="col">\n<pre>\n     ___       ___  __ ____   ___ _      _____ \n    / _ \\__ __/ _ \\/ // / /  / _ | | /| / / _ \\ \n   / ___/ // / ___/ _  / /__/ __ | |/ |/ / // /\n  /_/   \\_, /_/  /_//_/____/_/ |_|__/|__/____/ \n       /___/                results     </pre></div>\n'
        )
        htmlf.write('<div class="col"><br><h1>' + name +
                    '</h1></div>\n</div>\n')
        htmlf.write("<div class=\"row\">\n<div class=\"col-sm-3\">\n")
        htmlf.write(
            hl.table(links,
                     style=None,
                     border=None,
                     cellpadding=None,
                     classs="table",
                     header_row=fhr))
        htmlf.write("</div>\n<div class=\"col\">\n")
        htmlf.write(
            hl.table(clns,
                     style=None,
                     border=None,
                     cellpadding=None,
                     classs="table",
                     header_row=['name', 'num_species', 'avg len',
                                 'defline']))
        htmlf.write("</div>\n</div>\n")
        htmlf.write(htmlend)
示例#19
0
     else:
         blast_file_against_db(dir1,i,tempdir)
     dclus,clus = filter_blast.process_blast_out(tempdir+tempname+".rawblastn")
     if len(clus) > 0:
         for j in clus:
             G.add_edge(dir1+"/"+i,diro+"/"+j)
     else:
         G.add_node(dir1+"/"+i)
 # need to log these operations
 origcurcount = curcount
 for i in nx.connected_components(G):
     tf = open(diro+"/"+"cluster"+str(curcount)+".fa","w")
     log.w(" ".join(["MERGING FASTA TO",diro+"/cluster"+str(curcount)+".fa","FROM"," ".join(list(i))]))
     curcount += 1
     for j in i:
         for k in seq.read_fasta_file_iter(j):
             tf.write(k.get_fasta())
     tf.close()
 if use_merge == True:
     for i in nx.connected_components(G):
         if len(i) > 1:
             x = [j.replace(".fa",".aln") for j in i]
             log.w(" ".join(["MERGING ALIGNMENTS FROM"," ".join(x)]))
             write_merge_table_and_temp_aln_file(x,tempdir)
             outfile = diro+"/"+"cluster"+str(origcurcount)+".aln"
             merge_alignments(outfile,tempdir)
             log.w(" ".join(["CREATED FROM MERGE",diro+"/cluster"+str(origcurcount)+".aln"]))
             for j in i:
                 if diro in j:
                     log.w(" ".join(["REMOVING ALIGNMENTS",j,j.replace(".fa",".aln")]))
                     os.remove(j)
示例#20
0
    if len(sys.argv) != 3:
        print "python " + sys.argv[0] + " curdir logfile"
        sys.exit(0)
    curd = sys.argv[1]
    LOGFILE = sys.argv[2]

    ff = None
    dirs = []
    for i in os.listdir(curd):
        if ".fas" == i[-4:]:
            ff = i
        elif os.path.isdir(curd + "/" + i) and i != "clusters":
            dirs.append(curd + "/" + i + "/" + i + ".fas")

    seqids = []
    seqs = {}
    for i in seq.read_fasta_file_iter(curd + "/" + ff):
        seqids.append(i.name)
        seqs[i.name] = i

    for i in dirs:
        for j in seq.read_fasta_file_iter(i):
            if len(j.name) > 0 and j.name in seqids:
                del seqs[j.name]

    if len(seqs) > 0:
        outfile = open(curd + "/notinchildren.fas", "w")
        for i in seqs:
            outfile.write(seqs[i].get_fasta())
        outfile.close()
     LOGFILE = sys.argv[4]
 log = Logger(LOGFILE)
 log.a()
 tab = open(sys.argv[1], "r")
 idn = {}
 for i in tab:
     spls = i.strip().split("\t")
     idn[spls[3]] = spls[4]
 tab.close()
 dirr = sys.argv[2]
 for o in os.listdir(dirr):
     if fend != None:
         if fend not in o:
             continue
     seqs = {}
     for i in seq.read_fasta_file_iter(dirr + "/" + o):
         if idn[i.name] not in seqs:
             seqs[idn[i.name]] = []
         seqs[idn[i.name]].append(i)
     for i in seqs:
         if len(seqs[i]) > 1:
             longest = None
             longestn = 0
             for j in seqs[i]:
                 if len(j.seq) > longestn:
                     longest = j
                     longestn = len(j.seq)
             seqs[i] = [longest]
     fn = open(dirr + "/" + o, "w")
     for i in seqs:
         for j in seqs[i]:
示例#22
0
import os
import sys
import trim_tips
import seq

if __name__ == "__main__":
    # Usage: file.tre aln relative_cutoff absolute_cutoff outtre outaln
    # Runs trim_tips.main on the tree, writes the resulting tree in newick
    # form, and writes the alignment without the sequences whose names are
    # in the "removed" collection returned by trim_tips.main.
    if len(sys.argv) != 7:
        # Parenthesised single-argument form prints the same text under both
        # Python 2 and Python 3.
        print("python " + sys.argv[0] +
              " file.tre aln relative_cutoff absolute_cutoff outtre outaln")
        sys.exit(0)

    treefile = sys.argv[1]
    alnfile = sys.argv[2]
    # The cutoffs are handed over as the raw command-line strings.
    rel_cut = sys.argv[3]
    abs_cut = sys.argv[4]
    # Trim first so that the output tree file is not created or truncated
    # when trimming fails.
    tree, removed = trim_tips.main(treefile, rel_cut, abs_cut)
    with open(sys.argv[5], "w") as outtree:
        outtree.write(tree.get_newick_repr(True) + ";\n")
    with open(sys.argv[6], "w") as outaln:
        for i in seq.read_fasta_file_iter(alnfile):
            if i.name not in removed:
                outaln.write(i.get_fasta())
示例#23
0
def write_fasta_file(files, outfilen):
    """Concatenate the sequences of all FASTA ``files`` into ``outfilen``.

    Files are processed in the order given and every sequence is written in
    the form returned by its ``get_fasta()`` method.
    """
    # The context manager closes the output even if an input fails to parse.
    with open(outfilen, "w") as outfile:
        for fn in files:
            for rec in seq.read_fasta_file_iter(fn):
                outfile.write(rec.get_fasta())