Exemplo n.º 1
0
    # Build `clusters`: a dict mapping a cluster header to a list of
    # (header, seq) tuples, either reloaded from a cached "clusters.txt" or
    # computed from scratch by cluster_seqs().
    # NOTE(review): this is a fragment of a larger body; otufolder, args, time,
    # exists, MinimalFastaParser and cluster_seqs are defined outside the
    # visible code.
    clusters = {}
    # Timestamp taken before clustering; its use is not visible in this
    # fragment (presumably for reporting elapsed time -- TODO confirm).
    secs = time()
    if exists(otufolder + "clusters.txt"):
        print "sequences previously clustered"
        # NOTE(review): this handle is not closed anywhere in the visible code;
        # confirm it is closed further down.
        clustersin = open(otufolder + "clusters.txt")
        currclust = ""
        # A record whose header contains "cluster_" starts a new cluster; every
        # other record is appended to the most recently started cluster.
        for header, seq in MinimalFastaParser(clustersin):
            if "cluster_" in header:
                currclust = header
                clusters[currclust] = []
            else:
                # NOTE(review): if a non-cluster record precedes the first
                # "cluster_" header, currclust is still "" and this lookup
                # raises KeyError, because "" is never added to clusters.
                clusters[currclust].append((header,seq))
    else:
        print "Running uclust over sequences"
        #cluster the initial sequences by sequence similarity
        clusters = cluster_seqs(args.i, args.sim, folderout=args.o, gapopen='10.0/*TI', gapext='10.0')

        # NOTE(review): the size filter below is commented out, so no cluster
        # is removed in this branch within the visible code.
        #remove tiny clusters
        #topop = []
        #countarray = []
        #for cluster in clusters:
        #    totalseqs = 0
        #    for seq in clusters[cluster]:
        #        totalseqs += int(seq[0].split("_")[1])
        #    if totalseqs < args.minseqs:
        #        topop.append(cluster)
        #    else:
        #        countarray.append((cluster,totalseqs))
        #for remove in topop:
        #    clusters.pop(remove)
Exemplo n.º 2
0
from cluster_seqs import cluster_seqs
from sys import argv
from cogent import RNA, LoadSeqs
from cogent.app.muscle_v38 import align_unaligned_seqs
from weblogolib import *
from os.path import exists
# Usage: test.py /path/to/filein.fasta /path/to/folderout/ <similarity>

def filter_clusters_by_size(clusters, minseqs=100):
    """Drop small clusters in place and return the survivors ranked by size.

    The size of a cluster is the sum, over its entries, of the integer found
    in the second underscore-separated field of each entry's header
    (entry[0]), e.g. a header "seq_42" contributes 42.

    clusters -- dict mapping a cluster name to a list of (header, seq) tuples;
                clusters whose size is below minseqs are removed from it.
    minseqs  -- minimum total size a cluster needs in order to be kept.

    Returns a list of (cluster name, size) tuples for the kept clusters,
    sorted by size in descending order.

    Raises IndexError if a header has no underscore and ValueError if the
    field after the first underscore is not an integer.
    """
    kept = []
    too_small = []
    for name in clusters:
        size = sum(int(entry[0].split("_")[1]) for entry in clusters[name])
        if size < minseqs:
            too_small.append(name)
        else:
            kept.append((name, size))
    # Removal is deferred so the dict is never mutated while being iterated.
    for name in too_small:
        clusters.pop(name)
    # list.sort is stable, so equally sized clusters keep their dict order.
    kept.sort(reverse=True, key=lambda pair: pair[1])
    return kept


if __name__ == "__main__":
    # Fail with a usage hint instead of a bare IndexError when an argument is
    # missing or the output folder is an empty string.
    if len(argv) < 4 or not argv[2]:
        raise SystemExit("usage: %s /path/to/filein.fasta /path/to/folderout/ similarity" % argv[0])

    # The output folder is passed on with a trailing slash; argv itself is
    # updated, as before, so any later reader of argv[2] sees the same value.
    if not argv[2].endswith("/"):
        argv[2] += "/"

    clusters = cluster_seqs(argv[1], argv[3], folderout=argv[2], gapopen='10.0/*TI', gapext='10.0')

    #remove tiny clusters
    countarray = filter_clusters_by_size(clusters, minseqs=100)

    # One "name<TAB>size" line per surviving cluster, largest first. A single
    # parenthesised argument prints identically under Python 2 and Python 3.
    for c in countarray:
        print(c[0] + "\t" + str(c[1]))