secs = time()
    if exists(otufolder + "clusters.txt"):
        clustersin = open(otufolder + "clusters.txt")
        currclust = ""
        for header, seq in MinimalFastaParser(clustersin):
            if "cluster_" in header:
                currclust = header
                clusters[currclust] = []
            else:
                clusters[currclust].append((header, seq))
        clustersin.close()
        print "Sequences previously clustered,", len(clusters), "clusters"
    else:
        print "Running uclust over sequences"
        # cluster the initial sequences by sequence similarity
        clusters = cluster_seqs(args.i, args.sim, folderout=args.o,
            gapopen='10.0', gapext='10.0')

        # write each cluster name, followed by its sequences, to clusters.txt
        cout = open(otufolder + "clusters.txt", 'w')
        hold = clusters.keys()
        hold.sort()
        for cluster in hold:
            cout.write(">%s\n%s\n" % (cluster, cluster))
            for seq in clusters[cluster]:
                cout.write(">%s\n%s\n" % seq)
        cout.close()
        print str(len(clusters)) + " clusters"
        print "Runtime: " + str((time() - secs)/60) + " min"

    if not exists(otufolder + "cluster_structs.fasta"):
        #create file to write to if not already there
Exemplo n.º 2
0
        # don't need to do anything since already folded by next step
        print "Sequences previously clustered"
        with open(clustfile) as fin:
            numclusts = int(fin.readline().strip())
    elif exists(clustfile):
        # already clustered but not folded, so read in clusters
        with open(clustfile) as fin:
            clusters, numclusts = read_clusters(fin)

        print "Sequences previously clustered, %i clusters" % numclusts
    else:
        print "Running uclust over sequences"
        # cluster the initial sequences by sequence similarity
        clusters = cluster_seqs(args.i,
                                args.sim,
                                folderout=args.o,
                                gapopen='1.0',
                                gapext='1.0')
        with open(clustfile, 'w') as fout:
            write_clusters(clusters, fout)
        numclusts = len(clusters)
        clusters.clear()
        del clusters
        print "Runtime: %0.2f min" % ((time() - secs) / 60)

    if not exists(structfile):
        # create file to write to if not already there
        with open(structfile, 'w'):
            pass
        print "Running BayesFold over %i clusters" % numclusts
        secs = time()