# Build `clusters`: a dict mapping each cluster header to a list of
# (header, seq) tuples.  A previous run's result is reloaded from
# "clusters.txt" in `otufolder` when that file exists; otherwise the
# input sequences are clustered from scratch with cluster_seqs().
clusters = {}
secs = time()  # timestamp taken before clustering; presumably used for timing later -- confirm
clusters_path = otufolder + "clusters.txt"
if exists(clusters_path):
    print("sequences previously clustered")
    currclust = ""
    # Read the cached file inside a `with` block so the handle is closed
    # as soon as parsing finishes (it used to be left open).
    with open(clusters_path) as clustersin:
        for header, seq in MinimalFastaParser(clustersin):
            if "cluster_" in header:
                # A header containing "cluster_" starts a new cluster.
                currclust = header
                clusters[currclust] = []
            else:
                # Any other record belongs to the most recent cluster.
                # NOTE: a record that precedes the first "cluster_" header
                # raises KeyError, because currclust is still "".
                clusters[currclust].append((header, seq))
else:
    print("Running uclust over sequences")
    # cluster the initial sequences by sequence similarity
    clusters = cluster_seqs(args.i, args.sim, folderout=args.o,
                            gapopen='10.0/*TI', gapext='10.0')

# Disabled: removal of tiny clusters (kept for reference).
#remove tiny clusters
#topop = []
#countarray = []
#for cluster in clusters:
#    totalseqs = 0
#    for seq in clusters[cluster]:
#        totalseqs += int(seq[0].split("_")[1])
#    if totalseqs < args.minseqs:
#        topop.append(cluster)
#    else:
#        countarray.append((cluster,totalseqs))
#for remove in topop:
#    clusters.pop(remove)
# NOTE: import order is kept as-is on purpose -- the wildcard import from
# weblogolib may shadow names imported before it, so reordering could
# change which objects the names below refer to.
from cluster_seqs import cluster_seqs
from sys import argv
from cogent import RNA, LoadSeqs
from cogent.app.muscle_v38 import align_unaligned_seqs
from weblogolib import *
from os.path import exists

# Usage: test.py /path/to/filein.fasta /path/to/folderout/ similarity
USAGE = "usage: test.py /path/to/filein.fasta /path/to/folderout/ similarity"

# Clusters whose summed sequence count is below this value are discarded.
MIN_CLUSTER_SEQS = 100


def _cluster_total(members):
    """Return the summed sequence count of one cluster.

    Each member is indexed as member[0] to get its header, and the count
    is read as the integer in the second "_"-separated field of that
    header.  Raises IndexError/ValueError if a header lacks that field or
    the field is not an integer.
    """
    return sum(int(member[0].split("_")[1]) for member in members)


if __name__ == "__main__":
    # Fail with a readable message instead of a bare IndexError when
    # arguments are missing.
    if len(argv) < 4:
        raise SystemExit(USAGE)

    # Make sure the output folder ends with a slash.  An empty folder
    # argument is passed through unchanged (indexing [-1] on it used to
    # raise IndexError).
    if argv[2] and not argv[2].endswith("/"):
        argv[2] += "/"

    clusters = cluster_seqs(argv[1], argv[3], folderout=argv[2],
                            gapopen='10.0/*TI', gapext='10.0')

    # remove tiny clusters
    topop = []
    countarray = []
    for cluster in clusters:
        totalseqs = _cluster_total(clusters[cluster])
        if totalseqs < MIN_CLUSTER_SEQS:
            topop.append(cluster)
        else:
            countarray.append((cluster, totalseqs))
    # Pop after the loop: the mapping must not change size while it is
    # being iterated.
    for remove in topop:
        clusters.pop(remove)

    # Report the surviving clusters, largest first, as "<cluster>\t<count>".
    countarray.sort(reverse=True, key=lambda c: c[1])
    for c in countarray:
        print(c[0] + "\t" + str(c[1]))