secs = time() if exists(otufolder + "clusters.txt"): clustersin = open(otufolder + "clusters.txt") currclust = "" for header, seq in MinimalFastaParser(clustersin): if "cluster_" in header: currclust = header clusters[currclust] = [] else: clusters[currclust].append((header, seq)) clustersin.close() print "Sequences previously clustered,", len(clusters), "clusters" else: print "Running uclust over sequences" #cluster the initial sequences by sequence simmilarity clusters = cluster_seqs(args.i, args.sim, folderout=args.o, gapopen='10.0', gapext='10.0') #print that shit to file cout = open(otufolder + "clusters.txt", 'w') hold = clusters.keys() hold.sort() for cluster in hold: cout.write(">%s\n%s\n" % (cluster, cluster)) for seq in clusters[cluster]: cout.write(">%s\n%s\n" % seq) cout.close() print str(len(clusters)) + " clusters" print "Runtime: " + str((time() - secs)/60) + " min" if not exists(otufolder + "cluster_structs.fasta"): #create file to write to if not already there
# dont need to do anything since already folded by next step print "Sequences previously clustered" with open(clustfile) as fin: numclusts = int(fin.readline().strip()) elif exists(clustfile): # already clustered but not folded, so read in clusters with open(clustfile) as fin: clusters, numclusts = read_clusters(fin) print "Sequences previously clustered, %i clusters" % numclusts else: print "Running uclust over sequences" # cluster the initial sequences by sequence simmilarity clusters = cluster_seqs(args.i, args.sim, folderout=args.o, gapopen='1.0', gapext='1.0') with open(clustfile, 'w') as fout: write_clusters(clusters, fout) numclusts = len(clusters) clusters.clear() del clusters print "Runtime: %0.2f min" % ((time() - secs) / 60) if not exists(structfile): # create file to write to if not already there with open(structfile, 'w'): pass print "Running BayesFold over %i clusters" % numclusts secs = time()