Пример #1
0
def reassign_reads(readsf, priors, mates, constraints, soft_assign, initial_seed):
    """Move each read to its chosen cluster and rewrite the cluster FASTA files.

    Reads 'cluster-<c>.fa' from the current directory, writes every record
    to 'cluster-<i>.tmp' for the cluster i chosen for that read, and finally
    renames each '.tmp' file over the matching '.fa' file.  When
    soft_assign is true, each record is also copied to
    'cluster-<i>.build.fa' for every cluster i whose probability for the
    read exceeds the module-level soft_assign_t; in those files the header
    is rewritten as '>prob;name'.

    Args:
        readsf: passed through to update_priors; not used directly here.
        priors: sequence with one entry per cluster; its length defines k.
        mates: passed through to update_priors and get_read_probs.
        constraints: mapping of read name to the cluster index that read
            is pinned to.
        soft_assign: if true, also write the '.build.fa' files.
        initial_seed: if true, only 'cluster-0.fa' is read as input;
            otherwise all k cluster files are read.

    Returns:
        A tuple (rsments, likelihood, priors): the number of unconstrained
        reads whose best cluster differs from the file they were read
        from, the likelihood returned by get_read_probs, and the priors
        (re-estimated via update_priors when the module-level use_priors
        flag is true).

    Raises:
        SystemExit: with status 1 if a read has no entry in the read
            probabilities (after printing an error message).
    """
    k = len(priors)

    if use_priors:
        priors = update_priors(priors, readsf, mates, constraints, soft_assign)

    (likelihood, read_probs) = get_read_probs(priors, mates, constraints, soft_assign)

    if initial_seed:
        myk = 1
    else:
        myk = k

    rsments = 0
    read_files = []
    build_files = []
    try:
        # open files (inside the try so a failure part-way through still
        # closes the handles that were already opened)
        for i in range(k):
            read_files.append(open('cluster-%d.tmp' % i, 'w'))
            if soft_assign:
                build_files.append(open('cluster-%d.build.fa' % i, 'w'))

        for c in range(myk):
            # reassign
            # NOTE(review): sequence lines reuse r / max_icm from the
            # preceding header line, so every cluster file is assumed to
            # start with a '>' header.
            with open('cluster-%d.fa' % c) as cluster_in:
                for line in cluster_in:
                    is_header = line[0] == '>'
                    if is_header:
                        r = line[1:].strip()  # remove front spaces
                        if r not in read_probs:
                            print('ERROR: missing read %s scores' % r)
                            # exit() is only injected by the site module and
                            # reports success; signal failure explicitly.
                            raise SystemExit(1)

                        if r in constraints:
                            if constraints[r] != c:
                                print('Found a constrained read in the wrong cluster')
                            max_icm = constraints[r]
                        else:
                            (_max_prob, max_icm) = util.max_i(read_probs[r])

                            # count reassignments
                            if max_icm != c:
                                rsments += 1

                    # print line to files
                    read_files[max_icm].write(line)
                    if soft_assign:
                        for i in range(k):
                            if read_probs[r][i] > soft_assign_t:
                                if is_header:
                                    build_files[i].write('>%f;%s' % (read_probs[r][i], line[1:]))
                                else:
                                    build_files[i].write(line)
    finally:
        # close files, also when an error or SystemExit interrupts the pass
        for handle in read_files + build_files:
            handle.close()

    # move tmp (only reached when the whole pass succeeded)
    for i in range(k):
        os.rename('cluster-%d.tmp' % i, 'cluster-%d.fa' % i)

    return (rsments, likelihood, priors)
Пример #2
0
def init_clusters(readsf, soft_assign):
    """Create the initial cluster FASTA files from per-read cluster scores.

    Reads 'sample.fa.binning.allprobs' from the current directory; each
    line is tab-separated: a read name followed by one score per cluster.
    Every read is hard-assigned to the cluster picked by util.max_i and
    its FASTA record from readsf is written to 'cluster-<c>.fa'.  When
    soft_assign is true, the record is also written to
    'cluster-<c>.build.fa' for every cluster whose normalized probability
    exceeds imm_cluster.soft_assign_t, with the header rewritten as
    '>prob;name'.

    Output files are written in chunks of 50 clusters, re-reading readsf
    once per chunk, so at most 50 (100 with soft assignment) output files
    are open at a time.

    Args:
        readsf: path of the FASTA file holding all reads.
        soft_assign: if true, also write the '.build.fa' files.

    Returns:
        None.  Results are the files written to the current directory.
    """
    # load_mates
    mates = {}
    #if matesf:   ... just in case I need this later ...
    #    for line in open(options.mates_file):
    #        (lr,rr) = line.split()
    #        mates[lr] = rr
    #        mates[rr] = lr

    read_likes = {}
    k = 0  # stays 0 (no clusters, no output) when the file has no records
    with open('sample.fa.binning.allprobs') as probs_in:
        for line in probs_in:
            if not line.strip():
                # a blank (e.g. trailing) line would otherwise reset k to 0
                continue
            a = line.split('\t')
            r = a[0].strip()
            read_likes[r] = [float(x) for x in a[1:]]
            k = len(a) - 1

    # Note that I'm assuming here that the cluster priors
    # are incorporated into the printed likelihoods.  I can't
    # be sure but I'd rather be wrong and have used a uniform
    # prior than reestimate the priors myself and be wrong and
    # double count them.  However, this means I am double
    # counting the prior for mated reads.

    # assign to clusters
    hard_clusters = {}
    soft_clusters = {}
    for r in read_likes:
        if r in hard_clusters:
            continue  # mate may have been done

        # if mated, combine the scores of both reads
        if r in mates:
            m = mates[r]
            clust_likes = [read_likes[r][i] + read_likes[m][i] for i in range(k)]
        else:
            m = r   # unmated: the "mate" updates below just repeat r
            clust_likes = read_likes[r]

        # hard assignment
        (like_max, clust) = util.max_i(clust_likes)
        hard_clusters[r] = clust
        hard_clusters[m] = clust

        # soft assignment
        if soft_assign:
            # presumably the scores are log-likelihoods: they are summed
            # with log_add and normalized through exp() below
            sum_score = clust_likes[0]
            for i in range(1, k):
                sum_score = imm_cluster.log_add(sum_score, clust_likes[i])

            memberships = []
            for i in range(k):
                prob = math.exp(clust_likes[i] - sum_score)
                if prob > imm_cluster.soft_assign_t:
                    memberships.append((i, prob))

            soft_clusters[r] = memberships
            if r != m:
                # the mate gets its own copy of the complete list; it must
                # not be re-initialized inside the per-cluster loop
                soft_clusters[m] = list(memberships)

    chunk_size = 50
    for chunk_start in range(0, k, chunk_size):
        init_files = {}
        build_files = {}
        try:
            # open files
            for c in range(chunk_start, min(k, chunk_start + chunk_size)):
                init_files[c] = open('cluster-%d.fa' % c, 'w')
                if soft_assign:
                    build_files[c] = open('cluster-%d.build.fa' % c, 'w')

            # read fasta to cluster-*.fa
            hard_out = None      # hard-cluster file of the current read, if in this chunk
            soft_targets = []    # build files of the current read that are in this chunk
            with open(readsf) as reads_in:
                for line in reads_in:
                    if line[0] == '>':
                        r = line[1:].strip()  # front spaces are removed by LikelyBin
                        hc = hard_clusters.get(r, -1)
                        hard_out = init_files.get(hc)
                        soft_targets = []
                        if hard_out is not None:
                            hard_out.write(line)
                        if soft_assign and hc != -1:
                            # soft clusters are filtered against this chunk
                            # on their own: they need not share a chunk
                            # with the read's hard cluster
                            for (sc, p) in soft_clusters[r]:
                                if sc in build_files:
                                    build_files[sc].write('>%f;%s' % (p, line[1:]))
                                    soft_targets.append(build_files[sc])
                    else:
                        # sequence line: follows the routing of its header;
                        # lines before the first header go nowhere
                        if hard_out is not None:
                            hard_out.write(line)
                        for out in soft_targets:
                            out.write(line)
        finally:
            # close files, also when an error interrupts the pass
            for c in init_files:
                init_files[c].close()
            for c in build_files:
                build_files[c].close()
Пример #3
0
def init_clusters(readsf, soft_assign):
    """Create the initial cluster FASTA files from per-read cluster scores.

    Reads 'sample.fa.binning.allprobs' from the current directory; each
    line is tab-separated: a read name followed by one score per cluster.
    Every read is hard-assigned to the cluster picked by util.max_i and
    its FASTA record from readsf is written to 'cluster-<c>.fa'.  When
    soft_assign is true, the record is also written to
    'cluster-<c>.build.fa' for every cluster whose normalized probability
    exceeds imm_cluster.soft_assign_t, with the header rewritten as
    '>prob;name'.

    Output files are written in chunks of 50 clusters, re-reading readsf
    once per chunk, so at most 50 (100 with soft assignment) output files
    are open at a time.

    Args:
        readsf: path of the FASTA file holding all reads.
        soft_assign: if true, also write the '.build.fa' files.

    Returns:
        None.  Results are the files written to the current directory.
    """
    # load_mates
    mates = {}
    #if matesf:   ... just in case I need this later ...
    #    for line in open(options.mates_file):
    #        (lr,rr) = line.split()
    #        mates[lr] = rr
    #        mates[rr] = lr

    read_likes = {}
    k = 0  # stays 0 (no clusters, no output) when the file has no records
    with open('sample.fa.binning.allprobs') as probs_in:
        for line in probs_in:
            if not line.strip():
                # a blank (e.g. trailing) line would otherwise reset k to 0
                continue
            a = line.split('\t')
            r = a[0].strip()
            read_likes[r] = [float(x) for x in a[1:]]
            k = len(a) - 1

    # Note that I'm assuming here that the cluster priors
    # are incorporated into the printed likelihoods.  I can't
    # be sure but I'd rather be wrong and have used a uniform
    # prior than reestimate the priors myself and be wrong and
    # double count them.  However, this means I am double
    # counting the prior for mated reads.

    # assign to clusters
    hard_clusters = {}
    soft_clusters = {}
    for r in read_likes:
        if r in hard_clusters:
            continue  # mate may have been done

        # if mated, combine the scores of both reads
        if r in mates:
            m = mates[r]
            clust_likes = [read_likes[r][i] + read_likes[m][i] for i in range(k)]
        else:
            m = r   # unmated: the "mate" updates below just repeat r
            clust_likes = read_likes[r]

        # hard assignment
        (like_max, clust) = util.max_i(clust_likes)
        hard_clusters[r] = clust
        hard_clusters[m] = clust

        # soft assignment
        if soft_assign:
            # presumably the scores are log-likelihoods: they are summed
            # with log_add and normalized through exp() below
            sum_score = clust_likes[0]
            for i in range(1, k):
                sum_score = imm_cluster.log_add(sum_score, clust_likes[i])

            memberships = []
            for i in range(k):
                prob = math.exp(clust_likes[i] - sum_score)
                if prob > imm_cluster.soft_assign_t:
                    memberships.append((i, prob))

            soft_clusters[r] = memberships
            if r != m:
                # the mate gets its own copy of the complete list; it must
                # not be re-initialized inside the per-cluster loop
                soft_clusters[m] = list(memberships)

    chunk_size = 50
    for chunk_start in range(0, k, chunk_size):
        init_files = {}
        build_files = {}
        try:
            # open files
            for c in range(chunk_start, min(k, chunk_start + chunk_size)):
                init_files[c] = open('cluster-%d.fa' % c, 'w')
                if soft_assign:
                    build_files[c] = open('cluster-%d.build.fa' % c, 'w')

            # read fasta to cluster-*.fa
            hard_out = None      # hard-cluster file of the current read, if in this chunk
            soft_targets = []    # build files of the current read that are in this chunk
            with open(readsf) as reads_in:
                for line in reads_in:
                    if line[0] == '>':
                        r = line[1:].strip()  # front spaces are removed by LikelyBin
                        hc = hard_clusters.get(r, -1)
                        hard_out = init_files.get(hc)
                        soft_targets = []
                        if hard_out is not None:
                            hard_out.write(line)
                        if soft_assign and hc != -1:
                            # soft clusters are filtered against this chunk
                            # on their own: they need not share a chunk
                            # with the read's hard cluster
                            for (sc, p) in soft_clusters[r]:
                                if sc in build_files:
                                    build_files[sc].write('>%f;%s' % (p, line[1:]))
                                    soft_targets.append(build_files[sc])
                    else:
                        # sequence line: follows the routing of its header;
                        # lines before the first header go nowhere
                        if hard_out is not None:
                            hard_out.write(line)
                        for out in soft_targets:
                            out.write(line)
        finally:
            # close files, also when an error interrupts the pass
            for c in init_files:
                init_files[c].close()
            for c in build_files:
                build_files[c].close()